From 798937cb8704f052c8342768a29b32de30c26b01 Mon Sep 17 00:00:00 2001 From: Kamil Guttmann Date: Sat, 7 May 2022 19:44:54 +0200 Subject: [PATCH] Added Sacred --- Dockerfile | 2 +- Jenkinsfile.train | 4 +- train_model.py | 156 +++++++++++++++++++++++++--------------------- 3 files changed, 89 insertions(+), 73 deletions(-) diff --git a/Dockerfile b/Dockerfile index ec10343..42bb45a 100644 --- a/Dockerfile +++ b/Dockerfile @@ -16,6 +16,6 @@ RUN apt-get update && apt-get install -y python3-pip unzip && rm -rf /var/lib/ap RUN export PATH="$PATH:/root/.local/bin" -RUN pip3 install kaggle pandas scikit-learn tensorflow keras matplotlib numpy +RUN pip3 install kaggle pandas scikit-learn tensorflow keras matplotlib numpy sacred RUN mkdir /.kaggle && chmod o+w /.kaggle diff --git a/Jenkinsfile.train b/Jenkinsfile.train index 1223670..d8778dd 100644 --- a/Jenkinsfile.train +++ b/Jenkinsfile.train @@ -28,8 +28,8 @@ pipeline { stage("Train model") { steps { sh "chmod u+x ./train_model.py" - sh "python3 ./train_model.py $EPOCHS" - archiveArtifacts artifacts: "model/*, out.csv", onlyIfSuccessful: true + sh "python3 ./train_model.py with 'epochs=$EPOCHS'" + archiveArtifacts artifacts: "model/*, out.csv, experiments/*/*", onlyIfSuccessful: true } } } diff --git a/train_model.py b/train_model.py index 70ddba4..96c7187 100644 --- a/train_model.py +++ b/train_model.py @@ -7,81 +7,97 @@ from keras.layers import Dense import tensorflow as tf import numpy as np +from sacred import Experiment +from sacred.observers import FileStorageObserver, MongoObserver -tf.config.set_visible_devices([], 'GPU') - -# Read and split data -train_data = pd.read_csv("crime_train.csv") -val_data = pd.read_csv("crime_dev.csv") -test_data = pd.read_csv("crime_test.csv") - -x_columns = ["DISTRICT", "STREET", "YEAR", "MONTH", "DAY_OF_WEEK", "HOUR", "Lat", "Long"] -y_column = "OFFENSE_CODE_GROUP" - -x_train = train_data[x_columns] -y_train = train_data[y_column] -x_val = val_data[x_columns] -y_val = 
val_data[y_column] -x_test = test_data[x_columns] -y_test = test_data[y_column] - -num_categories = len(y_train.unique()) -num_features = len(x_columns) - -# Train label encoders for categorical data -encoder_y = LabelEncoder() -encoder_day = LabelEncoder() -encoder_dist = LabelEncoder() -encoder_street = LabelEncoder() -encoder_y.fit(y_train) -encoder_day.fit(x_train["DAY_OF_WEEK"]) -encoder_dist.fit(x_train["DISTRICT"]) -encoder_street.fit(pd.concat([x_val["STREET"], x_test["STREET"], x_train["STREET"]], axis=0)) +ex = Experiment() +ex.observers.append(FileStorageObserver("experiments")) +#ex.observers.append(MongoObserver(url="mongodb://mongo_user:mongo_password_IUM_2021@localhost:27017", db_name="sacred")) -# Encode train categorical data -y_train = encoder_y.transform(y_train) -x_train["DAY_OF_WEEK"] = encoder_day.transform(x_train["DAY_OF_WEEK"]) -x_train["DISTRICT"] = encoder_dist.transform(x_train["DISTRICT"]) -x_train["STREET"] = encoder_street.transform(x_train["STREET"]) - -# Encode train categorical data -y_val = encoder_y.transform(y_val) -x_val["DAY_OF_WEEK"] = encoder_day.transform(x_val["DAY_OF_WEEK"]) -x_val["DISTRICT"] = encoder_dist.transform(x_val["DISTRICT"]) -x_val["STREET"] = encoder_street.transform(x_val["STREET"]) - -# Encode train categorical data -y_test = encoder_y.transform(y_test) -x_test["DAY_OF_WEEK"] = encoder_day.transform(x_test["DAY_OF_WEEK"]) -x_test["DISTRICT"] = encoder_dist.transform(x_test["DISTRICT"]) -x_test["STREET"] = encoder_street.transform(x_test["STREET"]) - -# Define model -model = Sequential() -model.add(Dense(32, activation='relu', input_dim=num_features)) -model.add(Dense(64, activation='relu')) -model.add(Dense(128, activation='relu')) -model.add(Dense(num_categories, activation='softmax')) -model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy', 'sparse_categorical_accuracy']) - -if len(sys.argv) > 1: - epochs = sys.argv[1] -else: +@ex.config +def config(): epochs = 10 -# 
Train model -model.fit(x_train, y_train, epochs=int(epochs), validation_data=(x_val, y_val)) -# Make predictions -y_pred = model.predict(x_test) -output = [np.argmax(pred) for pred in y_pred] -output_text = encoder_y.inverse_transform(list(output)) +@ex.automain +def main(epochs): + tf.config.set_visible_devices([], 'GPU') -# Save predictions -data_to_save = pd.concat([test_data[x_columns], test_data[y_column]], axis = 1) -data_to_save["PREDICTED"] = output_text -data_to_save.to_csv("out.csv") + # Read and split data + train_data = pd.read_csv("crime_train.csv") + val_data = pd.read_csv("crime_dev.csv") + test_data = pd.read_csv("crime_test.csv") -# Save model -model.save("model") + x_columns = ["DISTRICT", "STREET", "YEAR", "MONTH", "DAY_OF_WEEK", "HOUR", "Lat", "Long"] + y_column = "OFFENSE_CODE_GROUP" + + x_train = train_data[x_columns] + y_train = train_data[y_column] + x_val = val_data[x_columns] + y_val = val_data[y_column] + x_test = test_data[x_columns] + y_test = test_data[y_column] + + num_categories = len(y_train.unique()) + num_features = len(x_columns) + + # Train label encoders for categorical data + encoder_y = LabelEncoder() + encoder_day = LabelEncoder() + encoder_dist = LabelEncoder() + encoder_street = LabelEncoder() + encoder_y.fit(y_train) + encoder_day.fit(x_train["DAY_OF_WEEK"]) + encoder_dist.fit(x_train["DISTRICT"]) + encoder_street.fit(pd.concat([x_val["STREET"], x_test["STREET"], x_train["STREET"]], axis=0)) + + + # Encode train categorical data + y_train = encoder_y.transform(y_train) + x_train["DAY_OF_WEEK"] = encoder_day.transform(x_train["DAY_OF_WEEK"]) + x_train["DISTRICT"] = encoder_dist.transform(x_train["DISTRICT"]) + x_train["STREET"] = encoder_street.transform(x_train["STREET"]) + + # Encode train categorical data + y_val = encoder_y.transform(y_val) + x_val["DAY_OF_WEEK"] = encoder_day.transform(x_val["DAY_OF_WEEK"]) + x_val["DISTRICT"] = encoder_dist.transform(x_val["DISTRICT"]) + x_val["STREET"] = 
encoder_street.transform(x_val["STREET"]) + + # Encode train categorical data + y_test = encoder_y.transform(y_test) + x_test["DAY_OF_WEEK"] = encoder_day.transform(x_test["DAY_OF_WEEK"]) + x_test["DISTRICT"] = encoder_dist.transform(x_test["DISTRICT"]) + x_test["STREET"] = encoder_street.transform(x_test["STREET"]) + + # Define model + model = Sequential() + model.add(Dense(32, activation='relu', input_dim=num_features)) + model.add(Dense(64, activation='relu')) + model.add(Dense(128, activation='relu')) + model.add(Dense(num_categories, activation='softmax')) + model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy', 'sparse_categorical_accuracy']) + + # Train model + history = model.fit(x_train, y_train, epochs=int(epochs), validation_data=(x_val, y_val)) + + # Make predictions + y_pred = model.predict(x_test) + output = [np.argmax(pred) for pred in y_pred] + output_text = encoder_y.inverse_transform(list(output)) + + # Save predictions + data_to_save = pd.concat([test_data[x_columns], test_data[y_column]], axis = 1) + data_to_save["PREDICTED"] = output_text + data_to_save.to_csv("out.csv") + + # Save model + model.save("model") + ex.add_artifact("model/saved_model.pb") + + # Log metrics + ex.log_scalar("loss", history.history["loss"]) + ex.log_scalar("accuracy", history.history["accuracy"]) + ex.log_scalar("val_loss", history.history["val_loss"]) + ex.log_scalar("val_accuracy", history.history["val_accuracy"])