diff --git a/Dockerfile b/Dockerfile index 625ef4e..182e4c8 100644 --- a/Dockerfile +++ b/Dockerfile @@ -9,11 +9,12 @@ ENV IS_DOCKER=True WORKDIR /app COPY ./download_data.sh calc_stats.sh ./ COPY ./clean_and_split_data.py calc_stats.py ./ +COPY ./train_model.py ./ RUN apt-get update && apt-get install -y python3-pip unzip && rm -rf /var/lib/apt/lists/* RUN export PATH="$PATH:/root/.local/bin" -RUN pip3 install kaggle pandas scikit-learn +RUN pip3 install kaggle pandas scikit-learn tensorflow RUN mkdir /.kaggle && chmod o+w /.kaggle diff --git a/train_model.py b/train_model.py new file mode 100644 index 0000000..9b30e2a --- /dev/null +++ b/train_model.py @@ -0,0 +1,80 @@ +import pandas as pd +from sklearn.preprocessing import LabelEncoder +from keras.models import Sequential +from keras.layers import Dense +import tensorflow as tf +import numpy as np + + +tf.config.set_visible_devices([], 'GPU') + +# Read and split data +train_data = pd.read_csv("crime_train.csv") +val_data = pd.read_csv("crime_dev.csv") +test_data = pd.read_csv("crime_test.csv") + +x_columns = ["DISTRICT", "STREET", "YEAR", "MONTH", "DAY_OF_WEEK", "HOUR", "Lat", "Long"] +y_column = "OFFENSE_CODE_GROUP" + +x_train = train_data[x_columns] +y_train = train_data[y_column] +x_val = val_data[x_columns] +y_val = val_data[y_column] +x_test = test_data[x_columns] +y_test = test_data[y_column] + +num_categories = len(y_train.unique()) +num_features = len(x_columns) + +# Train label encoders for categorical data +encoder_y = LabelEncoder() +encoder_day = LabelEncoder() +encoder_dist = LabelEncoder() +encoder_street = LabelEncoder() +encoder_y.fit(y_train) +encoder_day.fit(x_train["DAY_OF_WEEK"]) +encoder_dist.fit(x_train["DISTRICT"]) +encoder_street.fit(pd.concat([x_val["STREET"], x_test["STREET"], x_train["STREET"]], axis=0)) + + +# Encode train categorical data +y_train = encoder_y.transform(y_train) +x_train["DAY_OF_WEEK"] = encoder_day.transform(x_train["DAY_OF_WEEK"]) +x_train["DISTRICT"] = encoder_dist.transform(x_train["DISTRICT"]) +x_train["STREET"] = encoder_street.transform(x_train["STREET"]) + +# Encode train categorical data +y_val = encoder_y.transform(y_val) +x_val["DAY_OF_WEEK"] = encoder_day.transform(x_val["DAY_OF_WEEK"]) +x_val["DISTRICT"] = encoder_dist.transform(x_val["DISTRICT"]) +x_val["STREET"] = encoder_street.transform(x_val["STREET"]) + +# Encode train categorical data +y_test = encoder_y.transform(y_test) +x_test["DAY_OF_WEEK"] = encoder_day.transform(x_test["DAY_OF_WEEK"]) +x_test["DISTRICT"] = encoder_dist.transform(x_test["DISTRICT"]) +x_test["STREET"] = encoder_street.transform(x_test["STREET"]) + +# Define model +model = Sequential() +model.add(Dense(32, activation='relu', input_dim=num_features)) +model.add(Dense(64, activation='relu')) +model.add(Dense(128, activation='relu')) +model.add(Dense(num_categories, activation='softmax')) +model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy', 'sparse_categorical_accuracy']) + +# Train model +model.fit(x_train, y_train, epochs=100, validation_data=(x_val, y_val)) + +# Make predictions +y_pred = model.predict(x_test) +output = [np.argmax(pred) for pred in y_pred] +output_text = encoder_y.inverse_transform(list(output)) + +# Save predictions +data_to_save = pd.concat([test_data[x_columns], test_data[y_column]], axis = 1) +data_to_save["PREDICTED"] = output_text +data_to_save.to_csv("out.csv") + +# Save model +model.save("model")