Added scripts for docker container
This commit is contained in:
parent
e0b89db85c
commit
4d028db0cf
12
calc_stats.py
Normal file
12
calc_stats.py
Normal file
@ -0,0 +1,12 @@
|
|||||||
|
"""Print summary statistics for the train/test/dev crime data splits."""
import pandas as pd

# Load the three pre-split datasets (produced by the clean-and-split script).
crime_train = pd.read_csv('crime_train.csv')
crime_test = pd.read_csv('crime_test.csv')
crime_dev = pd.read_csv('crime_dev.csv')

# Full descriptive statistics (numeric and categorical) for each split,
# in the same order as before: train, then test, then dev.
for split in (crime_train, crime_test, crime_dev):
    print(split.describe(include="all"))

# Frequency tables for selected categorical columns of the training split.
for column in ("OFFENSE_CODE", "DISTRICT", "YEAR"):
    print(crime_train[column].value_counts())
|
31
clean_and_split_data.py
Normal file
31
clean_and_split_data.py
Normal file
@ -0,0 +1,31 @@
|
|||||||
|
"""Clean the Boston crime dataset and split it into train/dev/test CSV files."""
import pandas as pd
from sklearn.model_selection import train_test_split

crime = pd.read_csv('crime_conv.csv')

# Most of the shooting data is missing, so assume those incidents involved
# no shooting. Assignment form instead of inplace=True on a column view:
# the chained-assignment form is deprecated in modern pandas.
crime["SHOOTING"] = crime["SHOOTING"].fillna("N")

# The Location column duplicates the values of Lat and Long.
crime.drop(columns=["Location"], inplace=True)

# Drop rows with wrong/missing coordinate values
# (Boston is roughly at 42 N, 71 W — values outside these bounds are garbage).
crime = crime[(crime["Lat"] > 35) & (crime["Long"] < -65)]

# Lowercase the free-text fields.
for col in ("OFFENSE_CODE_GROUP", "OFFENSE_DESCRIPTION",
            "DAY_OF_WEEK", "UCR_PART", "STREET"):
    crime[col] = crime[col].str.lower()

# Drop the remaining rows containing nulls.
# BUG FIX: dropna() returns a new DataFrame; the original bare
# `crime.dropna()` discarded the result, so nulls leaked into the splits.
crime = crime.dropna()

# The dataset is fairly large — ~300k rows after cleaning — so use an
# 8:1:1 split, i.e. roughly 30k rows each for dev and test.
crime_train, crime_test = train_test_split(crime, test_size=60000, random_state=1)
crime_test, crime_dev = train_test_split(crime_test, test_size=30000, random_state=1)

crime_test.to_csv("crime_test.csv", encoding="utf-8", index=False)
crime_dev.to_csv("crime_dev.csv", encoding="utf-8", index=False)
crime_train.to_csv("crime_train.csv", encoding="utf-8", index=False)
|
@ -2,7 +2,8 @@
|
|||||||
|
|
||||||
# Fetch the Boston crime dataset and re-encode it to UTF-8.
kaggle datasets download -d AnalyzeBoston/crimes-in-boston --force
unzip -o crimes-in-boston.zip
iconv -f "windows-1252" -t "UTF-8" crime.csv > crime_conv.csv
# Shuffle-based splitting was replaced by the Python clean-and-split script.
#shuf crime.csv | head -n $1 > crime.shuf
#head -n 30000 crime.shuf > crime.test
#head -n 60000 crime.shuf | tail -n 30000 > crime.dev
#tail -n +60001 crime.shuf > crime.train
|
||||||
|
Loading…
Reference in New Issue
Block a user