# ium_487197/data.py
# 2023-04-04 12:11:02 +02:00
#
# 24 lines
# 1.1 KiB
# Python

import pandas as pd
from sklearn.model_selection import train_test_split
# Load the raw crime records from the CSV in the current working directory.
baltimore = pd.read_csv('BPD_Part_1_Victim_Based_Crime_Data.csv')

# A missing weapon is replaced by the literal "None" *before* dropping
# incomplete rows, so rows without a weapon survive the dropna below.
# The result is assigned back to the column instead of calling
# fillna(..., inplace=True) on the selected Series: that chained in-place
# form does not update the frame when pandas Copy-on-Write is active.
baltimore["Weapon"] = baltimore["Weapon"].fillna("None")

# Discard every row that still has a missing value in any column.
baltimore = baltimore.dropna()

# Scale 'Post' by its largest absolute value.
# NOTE(review): the scale factor is computed over the full dataset, i.e.
# before the train/dev/test split performed later in this script - confirm
# this is intended.
baltimore['Post'] = baltimore['Post'] / baltimore['Post'].abs().max()
# Lower-case the text columns so that values differing only in letter case
# become equal. Columns are processed in a fixed order, one at a time.
for _column in (
    'Location',
    'Description',
    'Weapon',
    'Premise',
    'District',
    'CrimeCode',
    'Neighborhood',
    'Inside/Outside',
):
    baltimore[_column] = baltimore[_column].str.lower()
# Two-stage split with a fixed seed: first hold out 10% of the rows as the
# test set, then carve 25% of the remaining rows off as the dev set; what is
# left is the training set.
_remainder, baltimore_test = train_test_split(
    baltimore, test_size=0.1, random_state=1
)
baltimore_train, baltimore_dev = train_test_split(
    _remainder, test_size=0.25, random_state=1
)

# Write each subset to its own UTF-8 CSV (test, dev, train), omitting the
# DataFrame index from the output.
for _path, _subset in (
    ("baltimore_test.csv", baltimore_test),
    ("baltimore_dev.csv", baltimore_dev),
    ("baltimore_train.csv", baltimore_train),
):
    _subset.to_csv(_path, encoding="utf-8", index=False)