ium_444517/nn_train_mlflow.py
2022-05-15 11:24:22 +02:00

108 lines
4.0 KiB
Python

import pandas as pd
import numpy as np
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
from keras.utils import np_utils
from tensorflow import keras
import mlflow
import sys
from urllib.parse import urlparse
# Select the tracking server before the experiment: set_experiment() looks the
# experiment up (or creates it) in whichever tracking store is active at the
# time of the call, so the URI has to be configured first.
mlflow.set_tracking_uri("http://172.17.0.1:5000")
mlflow.set_experiment("s444517")
# reading data
def read_data():
    """Load the three data splits from CSV files in the working directory.

    Returns:
        A list of three DataFrames read from ``apps_train.csv``,
        ``apps_test.csv`` and ``apps_validate.csv``, in that order.
    """
    split_names = ('train', 'test', 'validate')
    # The first row of each file is used as the column header.
    return [pd.read_csv(f'apps_{name}.csv', header=0) for name in split_names]
def data_prep():
    """Build model inputs and targets from the train/validate/test splits.

    The category labels are encoded with a single ``LabelEncoder`` fitted on
    the training labels only, so a given category maps to the same one-hot
    column in both the training and the validation targets.

    Returns:
        A 7-tuple ``(x_train_set, dummy_y, x_validate_set, dummy_yv,
        x_test_set, y_test_set, y_class_names)`` where

        * ``x_*_set`` are DataFrames restricted to the numeric feature columns,
        * ``dummy_y`` / ``dummy_yv`` are one-hot encoded training / validation
          targets with one column per training category,
        * ``y_test_set`` is the raw (string) ``Category`` column of the test
          split,
        * ``y_class_names`` are the unique categories of the training split.

    Raises:
        KeyError: if a split has no ``"Unnamed: 0"`` column to drop.
        ValueError: if the validation split contains a category that does not
            occur in the training split.
    """
    # read_data() returns the splits in the order train, test, validate.
    train_set, test_set, validate_set = (
        frame.drop(columns=["Unnamed: 0"]) for frame in read_data()
    )
    numeric_columns = ["Rating", "Reviews", "Installs", "Price", "Genres_numeric_value"]

    # train set set-up
    x_train_set = train_set[numeric_columns]
    y_train_set = train_set["Category"]
    encoder = LabelEncoder()
    encoded_y = encoder.fit_transform(y_train_set)
    number_of_classes = len(encoder.classes_)
    dummy_y = np_utils.to_categorical(encoded_y, num_classes=number_of_classes)

    # validation set set-up: reuse the encoder fitted on the training labels
    # and pin the one-hot width so the targets line up with the training ones.
    x_validate_set = validate_set[numeric_columns]
    y_validate_set = validate_set["Category"]
    encoded_yv = encoder.transform(y_validate_set)
    dummy_yv = np_utils.to_categorical(encoded_yv, num_classes=number_of_classes)

    # test set set-up: labels stay as raw strings; the caller maps predictions
    # back to names through y_class_names.
    x_test_set = test_set[numeric_columns]
    y_test_set = test_set["Category"]
    y_class_names = train_set["Category"].unique()

    return x_train_set, dummy_y, x_validate_set, dummy_yv, x_test_set, y_test_set, y_class_names
with mlflow.start_run():
    # Optional command-line arguments:
    #   1: number of epochs (int), 2: hidden-layer activation, 3: output-layer activation.
    epoch = int(sys.argv[1]) if len(sys.argv) > 1 else 200
    # Activation functions are passed to Keras by name, so they stay strings.
    first_activation_funct = sys.argv[2] if len(sys.argv) > 2 else "relu"
    second_activation_funct = sys.argv[3] if len(sys.argv) > 3 else "softmax"

    x_train_set, dummy_y, x_validate_set, dummy_yv, x_test_set, y_test_set, y_class_names = data_prep()

    # Take the layer sizes from the prepared data instead of hard-coding them.
    number_of_classes = dummy_y.shape[1]
    number_of_features = x_train_set.shape[1]

    model = Sequential()
    # The input dimension belongs on the first layer of the model.
    model.add(Dense(number_of_classes, activation=first_activation_funct, input_dim=number_of_features))
    model.add(Dense(number_of_classes, activation=second_activation_funct))
    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy', 'categorical_accuracy'])
    model.fit(x_train_set, dummy_y, epochs=epoch, validation_data=(x_validate_set, dummy_yv))

    # model predictions: map each arg-max index back to a category name using
    # the sorted training categories.
    yhat = model.predict(x_test_set)
    class_names = sorted(y_class_names)
    y_pred = [class_names[index] for index in np.argmax(yhat, axis=1)]
    y_true = list(y_test_set)

    signature = mlflow.models.signature.infer_signature(x_train_set, model.predict(x_train_set))
    input_example = {
        "Rating": 4.100000,
        "Reviews": 0.000001,
        "Installs": 0.000005,
        "Price": 0.000000,
        "Genres_numeric_value": 57.000000
    }

    mlflow.log_param("epoch", epoch)
    mlflow.log_param("1st_activation_funct", first_activation_funct)
    mlflow.log_param("2nd_activation_funct", second_activation_funct)
    mlflow.log_metric("accuracy", accuracy_score(y_true, y_pred))

    # The model is a Keras model, so it is logged with the Keras flavor.
    tracking_url_type_store = urlparse(mlflow.get_tracking_uri()).scheme
    if tracking_url_type_store != "file":
        # Registering a model is only attempted when the store is not file-based.
        mlflow.keras.log_model(model, "my_model_mlflow", registered_model_name="s444517", signature=signature, input_example=input_example)
    else:
        mlflow.keras.log_model(model, "my_model_mlflow", signature=signature, input_example=input_example)
        # NOTE(review): the local export is kept in the file-store branch only -
        # confirm this is the intended placement.
        mlflow.keras.save_model(model, "my_model_mlflow", signature=signature, input_example=input_example)