ium_444380/train_model_mlflow.py
2022-05-15 20:15:27 +02:00

102 lines
3.7 KiB
Python

import sys
import mlflow
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from keras.models import Sequential
from keras.layers import Dense
import numpy as np
from mlflow.models.signature import infer_signature
import tensorflow as tf
# Hide every GPU from TensorFlow (empty visible-device list for the 'GPU' type).
tf.config.set_visible_devices([], 'GPU')
# Select the MLflow experiment named "s444380" as the active one.
mlflow.set_experiment("s444380")
# Epoch count comes from the first command-line argument; a missing argument
# raises IndexError and a non-integer one raises ValueError.
epochs = int(sys.argv[1])
def main(epochs):
    """Train a dense classifier on the crime data and track the run in MLflow.

    Reads ``crime_train.csv``, ``crime_dev.csv`` and ``crime_test.csv`` from
    the working directory, label-encodes the categorical columns, fits a Keras
    ``Sequential`` network, writes the test-set predictions to ``out.csv``,
    saves the model under ``model`` and logs parameters, the model and the
    per-epoch training metrics to MLflow.

    Args:
        epochs: Number of training epochs; any value accepted by ``int()``.
    """
    mlflow.log_param("epochs", epochs)
    mlflow.log_param("test", 100)

    # Read and split data
    train_data = pd.read_csv("crime_train.csv")
    val_data = pd.read_csv("crime_dev.csv")
    test_data = pd.read_csv("crime_test.csv")

    x_columns = ["DISTRICT", "STREET", "YEAR", "MONTH", "DAY_OF_WEEK", "HOUR", "Lat", "Long"]
    y_column = "OFFENSE_CODE_GROUP"

    # Explicit copies: the feature frames are modified in place below, and
    # assigning into a selection of another frame triggers pandas'
    # SettingWithCopyWarning. The source frames stay untouched so the raw
    # test features can be written to out.csv later.
    x_train = train_data[x_columns].copy()
    y_train = train_data[y_column]
    x_val = val_data[x_columns].copy()
    y_val = val_data[y_column]
    x_test = test_data[x_columns].copy()
    y_test = test_data[y_column]

    num_categories = len(y_train.unique())
    num_features = len(x_columns)

    # Encode the target; the encoder is fitted on the training labels only,
    # matching num_categories above.
    encoder_y = LabelEncoder()
    encoder_y.fit(y_train)
    y_train = encoder_y.transform(y_train)
    y_val = encoder_y.transform(y_val)
    y_test = encoder_y.transform(y_test)

    # Encode the categorical features. Every encoder is fitted on the union of
    # all three splits so that a value present only in dev/test does not make
    # transform() fail. LabelEncoder sorts its classes, so the codes are
    # identical to a train-only fit whenever the training split already
    # contains every value.
    feature_frames = (x_train, x_val, x_test)
    for column in ("DAY_OF_WEEK", "DISTRICT", "STREET"):
        encoder = LabelEncoder()
        encoder.fit(pd.concat([frame[column] for frame in feature_frames], axis=0))
        for frame in feature_frames:
            frame[column] = encoder.transform(frame[column])

    # Define model
    model = Sequential()
    model.add(Dense(32, activation='relu', input_dim=num_features))
    model.add(Dense(64, activation='relu'))
    model.add(Dense(128, activation='relu'))
    model.add(Dense(num_categories, activation='softmax'))
    model.compile(
        optimizer='adam',
        loss='sparse_categorical_crossentropy',
        metrics=['accuracy', 'sparse_categorical_accuracy'],
    )

    # Train model
    history = model.fit(x_train, y_train, epochs=int(epochs), validation_data=(x_val, y_val))

    # Make predictions: pick the most probable class per row and map the
    # integer codes back to the original label text.
    y_pred = model.predict(x_test)
    output = np.argmax(y_pred, axis=1)
    output_text = encoder_y.inverse_transform(output)

    # Save predictions next to the raw (un-encoded) test features and labels
    data_to_save = pd.concat([test_data[x_columns], test_data[y_column]], axis=1)
    data_to_save["PREDICTED"] = output_text
    data_to_save.to_csv("out.csv")

    # Save model
    model.save("model")
    signature = infer_signature(x_train, y_train)
    input_example = np.array([x_test.values[0]])
    mlflow.keras.log_model(model, "model", signature=signature, input_example=input_example)

    # Log metrics. history.history maps each metric name to a list with one
    # value per epoch, while mlflow.log_metric takes a single number, so every
    # epoch is logged separately with its index as the step.
    # NOTE: the parameter "test" is intentionally not logged a second time
    # here; MLflow rejects changing the value of an already-logged parameter.
    for metric_name in ("loss", "accuracy", "val_loss", "val_accuracy"):
        for epoch_index, value in enumerate(history.history.get(metric_name, [])):
            mlflow.log_metric(metric_name, float(value), step=epoch_index)