Add Dockerfile and Python script

This commit is contained in:
Wojciech Lidwin 2023-04-04 12:11:02 +02:00
parent 1fe09ed25d
commit 180431b160
3 changed files with 48 additions and 5 deletions

21
Dockerfile Normal file
View File

@ -0,0 +1,21 @@
# Environment for downloading and preprocessing the Kaggle "crime-in-baltimore" data.

# Pinned release rather than the moving `latest` tag: rebuilds stay reproducible,
# and newer Ubuntu releases mark the system Python as externally managed
# (PEP 668), which makes the global `pip3 install` step below fail.
FROM ubuntu:22.04

# Account name read by the Kaggle CLI. The API key is intentionally NOT set here:
# ENV values persist in the image and are visible in its history, so supply the
# key when the container is started, e.g.
#   docker run -e KAGGLE_KEY=<your-key> <image>
ENV KAGGLE_USERNAME="jaszwajcar"

# OS packages. `update` and `install` share one layer so a cached, stale package
# index is never reused; the index is removed in the same layer to keep it out of
# the image. DEBIAN_FRONTEND is set inline so it does not leak into the runtime env.
RUN apt-get update \
    && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
        ca-certificates \
        figlet \
        python3-pip \
        unzip \
    && rm -rf /var/lib/apt/lists/*

# Python dependencies in a single layer, without keeping pip's download cache.
# The `unzip` tool is provided by apt above; no PyPI package of that name is needed.
RUN pip3 install --no-cache-dir \
        kaggle \
        pandas \
        scikit-learn \
        seaborn

WORKDIR /app

# Project scripts are copied last so editing them does not invalidate the
# dependency layers above.
COPY ./download_dataset.sh ./stats.sh ./data.py ./

24
data.py Normal file
View File

@ -0,0 +1,24 @@
import pandas as pd
from sklearn.model_selection import train_test_split
# Load the raw victim-based crime records from the current working directory.
baltimore = pd.read_csv('BPD_Part_1_Victim_Based_Crime_Data.csv')

# A missing weapon is stored as the literal category "None" so those rows survive
# the dropna() below. The result is assigned back instead of calling
# fillna(inplace=True) on the column selection: that chained in-place form is
# deprecated in pandas and leaves the frame unchanged under copy-on-write.
baltimore["Weapon"] = baltimore["Weapon"].fillna("None")

# Discard every row that still has a missing value in any column.
baltimore.dropna(inplace=True)

# Scale Post by its largest absolute value.
baltimore['Post'] = baltimore['Post'] / baltimore['Post'].abs().max()

# Lower-case the text columns so values differing only in letter case match.
for column in (
    'Location',
    'Description',
    'Weapon',
    'Premise',
    'District',
    'CrimeCode',
    'Neighborhood',
    'Inside/Outside',
):
    baltimore[column] = baltimore[column].str.lower()

# Hold out 10% of the rows as the test set, then take 25% of the remainder as the
# dev set (22.5% of all rows), leaving 67.5% for training. The fixed random_state
# makes both splits repeatable.
baltimore_train, baltimore_test = train_test_split(baltimore, test_size=0.1, random_state=1)
baltimore_train, baltimore_dev = train_test_split(baltimore_train, test_size=0.25, random_state=1)

# Write each split to its own CSV file without the DataFrame index column.
baltimore_test.to_csv("baltimore_test.csv", encoding="utf-8", index=False)
baltimore_dev.to_csv("baltimore_dev.csv", encoding="utf-8", index=False)
baltimore_train.to_csv("baltimore_train.csv", encoding="utf-8", index=False)

View File

@ -1,6 +1,4 @@
# NOTE(review): these lines look like a diff hunk shown without +/- markers (the
# download/unzip pair occurs twice and the shebang is not the first line) —
# confirm against the actual script before relying on the order shown here.
# Download the sohier/crime-in-baltimore dataset archive with the Kaggle CLI.
kaggle datasets download -d sohier/crime-in-baltimore
# Unpack the downloaded archive into the current directory.
unzip crime-in-baltimore.zip
#!/bin/bash
# First 8000 lines of the raw CSV become the training file.
head -n 8000 BPD_Part_1_Victim_Based_Crime_Data.csv > baltimore_train.csv
# Last 2000 lines of the raw CSV become the test file.
tail -n 2000 BPD_Part_1_Victim_Based_Crime_Data.csv > baltimore_test.csv
# First 2000 lines of the training file become the dev file, so the dev rows
# are also contained in the training file.
head -n 2000 baltimore_train.csv > baltimore_dev.csv
# Download the sohier/crime-in-baltimore dataset archive with the Kaggle CLI.
kaggle datasets download -d sohier/crime-in-baltimore
# Unpack the downloaded archive into the current directory.
unzip crime-in-baltimore.zip