"""Train a dense Keras classifier for app categories and track it in MLflow.

Usage:
    python <script> [epochs] [first_activation] [second_activation]

All three arguments are optional and default to ``200``, ``"relu"`` and
``"softmax"``.  The script expects ``apps_train.csv``, ``apps_test.csv`` and
``apps_validate.csv`` in the current working directory.
"""
import sys
from urllib.parse import urlparse

import pandas as pd
import numpy as np
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
from keras.utils import np_utils
from tensorflow import keras
import mlflow

# The tracking URI has to be configured first: set_experiment() looks the
# experiment up on whichever tracking store is active when it is called.
mlflow.set_tracking_uri("http://172.17.0.1:5000")
mlflow.set_experiment("s444517")

# Columns fed to the network and the column holding the class label.
FEATURE_COLUMNS = ["Rating", "Reviews", "Installs", "Price", "Genres_numeric_value"]
TARGET_COLUMN = "Category"


def read_data() -> list:
    """Load the data splits from ``apps_<split>.csv`` files.

    Returns:
        A list of three DataFrames in the order train, test, validate.
    """
    return [
        pd.read_csv(f'apps_{name}.csv', header=0)
        for name in ('train', 'test', 'validate')
    ]


def data_prep() -> tuple:
    """Read the splits and turn them into model-ready features and labels.

    The label encoder is fitted on the training labels only and reused for
    the validation labels, so that a given category maps to the same one-hot
    position in both splits and both label matrices have the same width.

    Returns:
        A 7-tuple ``(x_train, y_train_onehot, x_validate, y_validate_onehot,
        x_test, y_test, class_names)`` where the ``x_*`` entries are
        DataFrames restricted to the feature columns, the ``*_onehot``
        entries are one-hot encoded label arrays, ``y_test`` is the raw
        ``Category`` column of the test split and ``class_names`` holds the
        unique categories found in the training split.

    Raises:
        ValueError: If the validation split contains a category that does
            not occur in the training split.
    """
    train_set, test_set, validate_set = (
        frame.drop(columns=["Unnamed: 0"]) for frame in read_data()
    )

    x_train_set = train_set[FEATURE_COLUMNS]
    x_validate_set = validate_set[FEATURE_COLUMNS]
    x_test_set = test_set[FEATURE_COLUMNS]

    # One encoder for every split keeps the label -> index mapping stable.
    encoder = LabelEncoder()
    encoder.fit(train_set[TARGET_COLUMN])
    class_count = len(encoder.classes_)

    dummy_y = np_utils.to_categorical(
        encoder.transform(train_set[TARGET_COLUMN]), num_classes=class_count
    )
    dummy_yv = np_utils.to_categorical(
        encoder.transform(validate_set[TARGET_COLUMN]), num_classes=class_count
    )

    # Test labels stay as raw strings; accuracy is computed on class names.
    y_test_set = test_set[TARGET_COLUMN]
    y_class_names = train_set[TARGET_COLUMN].unique()

    return (
        x_train_set,
        dummy_y,
        x_validate_set,
        dummy_yv,
        x_test_set,
        y_test_set,
        y_class_names,
    )


with mlflow.start_run():
    # Command-line overrides; activation functions are Keras activation
    # *names* and therefore must stay strings.
    epoch = int(sys.argv[1]) if len(sys.argv) > 1 else 200
    first_activation_funct = sys.argv[2] if len(sys.argv) > 2 else "relu"
    second_activation_funct = sys.argv[3] if len(sys.argv) > 3 else "softmax"

    (
        x_train_set,
        dummy_y,
        x_validate_set,
        dummy_yv,
        x_test_set,
        y_test_set,
        y_class_names,
    ) = data_prep()

    # Derive the layer sizes from the prepared data instead of hard-coding.
    number_of_classes = dummy_y.shape[1]
    number_of_features = x_train_set.shape[1]

    model = Sequential()
    # The input dimension belongs on the first layer of the stack.
    model.add(Dense(number_of_classes,
                    activation=first_activation_funct,
                    input_dim=number_of_features))
    model.add(Dense(number_of_classes, activation=second_activation_funct))

    model.compile(optimizer='adam',
                  loss='categorical_crossentropy',
                  metrics=['accuracy', 'categorical_accuracy'])
    model.fit(x_train_set, dummy_y,
              epochs=epoch,
              validation_data=(x_validate_set, dummy_yv))

    # Model predictions: LabelEncoder orders classes by sorted value, so the
    # argmax index of each output row maps into the sorted class names.
    yhat = model.predict(x_test_set)
    class_names = sorted(y_class_names)
    y_pred = [class_names[index] for index in np.argmax(yhat, axis=1)]
    y_true = y_test_set.tolist()

    signature = mlflow.models.signature.infer_signature(
        x_train_set, model.predict(x_train_set)
    )
    input_example = {
        "Rating": 4.100000,
        "Reviews": 0.000001,
        "Installs": 0.000005,
        "Price": 0.000000,
        "Genres_numeric_value": 57.000000
    }

    mlflow.log_param("epoch", epoch)
    mlflow.log_param("1st_activation_funct", first_activation_funct)
    mlflow.log_param("2nd_activation_funct", second_activation_funct)
    mlflow.log_metric("accuracy", accuracy_score(y_true, y_pred))

    # The model registry is not available with a plain file store, so the
    # model is only registered when talking to a real tracking server.
    log_kwargs = {"signature": signature, "input_example": input_example}
    tracking_url_type_store = urlparse(mlflow.get_tracking_uri()).scheme
    if tracking_url_type_store != "file":
        log_kwargs["registered_model_name"] = "s444517"

    # Log with the Keras flavor: the model is a Keras model, not an sklearn
    # estimator.
    mlflow.keras.log_model(model, "my_model_mlflow", **log_kwargs)

    # Additionally keep a local copy of the model next to the script.
    mlflow.keras.save_model(model, "my_model_mlflow",
                            signature=signature,
                            input_example=input_example)