import sys

import mlflow
import numpy as np
import pandas as pd
import tensorflow as tf
from keras.layers import Dense
from keras.models import Sequential
from mlflow.models.signature import infer_signature
from sklearn.preprocessing import LabelEncoder

# Force CPU execution
tf.config.set_visible_devices([], 'GPU')

mlflow.set_experiment("s444380")

epochs = int(sys.argv[1])


def main(epochs):
    mlflow.log_param("epochs", epochs)
    mlflow.log_param("test", 100)

    # Read the pre-split data
    train_data = pd.read_csv("crime_train.csv")
    val_data = pd.read_csv("crime_dev.csv")
    test_data = pd.read_csv("crime_test.csv")

    x_columns = ["DISTRICT", "STREET", "YEAR", "MONTH", "DAY_OF_WEEK", "HOUR", "Lat", "Long"]
    y_column = "OFFENSE_CODE_GROUP"

    # Copy feature frames so the encoded columns can be assigned without
    # pandas chained-assignment warnings
    x_train = train_data[x_columns].copy()
    y_train = train_data[y_column]
    x_val = val_data[x_columns].copy()
    y_val = val_data[y_column]
    x_test = test_data[x_columns].copy()
    y_test = test_data[y_column]

    num_categories = len(y_train.unique())
    num_features = len(x_columns)

    # Fit label encoders for the categorical columns; the street encoder is fit
    # on all splits so transform() never hits an unseen street
    encoder_y = LabelEncoder()
    encoder_day = LabelEncoder()
    encoder_dist = LabelEncoder()
    encoder_street = LabelEncoder()
    encoder_y.fit(y_train)
    encoder_day.fit(x_train["DAY_OF_WEEK"])
    encoder_dist.fit(x_train["DISTRICT"])
    encoder_street.fit(pd.concat([x_val["STREET"], x_test["STREET"], x_train["STREET"]], axis=0))

    # Encode train categorical data
    y_train = encoder_y.transform(y_train)
    x_train["DAY_OF_WEEK"] = encoder_day.transform(x_train["DAY_OF_WEEK"])
    x_train["DISTRICT"] = encoder_dist.transform(x_train["DISTRICT"])
    x_train["STREET"] = encoder_street.transform(x_train["STREET"])

    # Encode validation categorical data
    y_val = encoder_y.transform(y_val)
    x_val["DAY_OF_WEEK"] = encoder_day.transform(x_val["DAY_OF_WEEK"])
    x_val["DISTRICT"] = encoder_dist.transform(x_val["DISTRICT"])
    x_val["STREET"] = encoder_street.transform(x_val["STREET"])

    # Encode test categorical data
    y_test = encoder_y.transform(y_test)
    x_test["DAY_OF_WEEK"] = encoder_day.transform(x_test["DAY_OF_WEEK"])
    x_test["DISTRICT"] = encoder_dist.transform(x_test["DISTRICT"])
    x_test["STREET"] = encoder_street.transform(x_test["STREET"])

    # Define model
    model = Sequential()
    model.add(Dense(32, activation='relu', input_dim=num_features))
    model.add(Dense(64, activation='relu'))
    model.add(Dense(128, activation='relu'))
    model.add(Dense(num_categories, activation='softmax'))
    model.compile(optimizer='adam',
                  loss='sparse_categorical_crossentropy',
                  metrics=['accuracy', 'sparse_categorical_accuracy'])

    # Train model
    history = model.fit(x_train, y_train, epochs=epochs, validation_data=(x_val, y_val))

    # Make predictions on the test set and map them back to class names
    y_pred = model.predict(x_test)
    output = np.argmax(y_pred, axis=1)
    output_text = encoder_y.inverse_transform(output)

    # Save predictions alongside the original test features and labels
    data_to_save = pd.concat([test_data[x_columns], test_data[y_column]], axis=1)
    data_to_save["PREDICTED"] = output_text
    data_to_save.to_csv("out.csv")

    # Save the model locally and log it to MLflow
    model.save("model")
    signature = infer_signature(x_train, y_train)
    input_example = np.array([x_test.values[0]])
    mlflow.keras.log_model(model, "model", signature=signature, input_example=input_example)

    # Log final-epoch metrics (log_metric expects a single value, not the full history list)
    mlflow.log_metric("loss", history.history["loss"][-1])
    mlflow.log_metric("accuracy", history.history["accuracy"][-1])
    mlflow.log_metric("val_loss", history.history["val_loss"][-1])
    mlflow.log_metric("val_accuracy", history.history["val_accuracy"][-1])


if __name__ == "__main__":
    main(epochs)
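
# ---------------------------------------------------------------------------
# Usage sketch. The file name and epoch count below are illustrative, not part
# of the original script:
#
#   python train.py 20
#
# The model logged above can later be reloaded from its MLflow run for
# inference; the run ID is a placeholder:
#
#   loaded = mlflow.keras.load_model("runs:/<run_id>/model")
#   predictions = loaded.predict(x_test)
# ---------------------------------------------------------------------------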