2022-05-11 14:31:24 +02:00
import pandas as pd
import numpy as np
from tensorflow . keras . models import Sequential
from tensorflow . keras . layers import Dense
from sklearn . preprocessing import LabelEncoder
from sklearn . metrics import accuracy_score
from keras . utils import np_utils
from tensorflow import keras
import mlflow
import sys
2022-05-15 11:05:21 +02:00
from urllib . parse import urlparse
2022-05-11 14:31:24 +02:00
2022-05-15 11:05:21 +02:00
# Point the MLflow client at the tracking server that receives the logged data.
mlflow . set_tracking_uri ( " http://172.17.0.1:5000 " )
2022-05-15 11:47:57 +02:00
# Select the experiment under which the run started later in this script is recorded.
mlflow . set_experiment ( " s_444517 " )
2022-05-15 12:04:04 +02:00
# An underscore was added to the name because I kept getting a very strange error about an existing experiment with the status "deleted", and I could not restore the original experiment.
2022-05-11 14:31:24 +02:00
# reading data
def read_data():
    """Load the train, test and validate CSV files from the working directory.

    Each file is parsed with its first row used as the header.

    Returns:
        list: three ``pandas.DataFrame`` objects, ordered train, test,
        validate.
    """
    split_names = (' train ', ' test ', ' validate ')
    return [pd.read_csv(f' apps_ { name } .csv ', header=0) for name in split_names]
def data_prep():
    """Load the dataset splits and build the model inputs and targets.

    The splits come from ``read_data()``. The ``" Unnamed: 0 "`` column is
    dropped from each split, the numeric feature columns are selected as
    inputs, and the ``" Category "`` column is used as the target.

    A single ``LabelEncoder`` is fitted on the training categories and reused
    for the validation categories, and both one-hot matrices are built with
    the same explicit number of classes. Fitting a separate encoder per split
    can assign different integer codes (and a different one-hot width) to the
    same category whenever a split does not contain every category.

    Returns:
        tuple: ``(x_train_set, dummy_y, x_validate_set, dummy_yv,
        x_test_set, y_test_set, y_class_names)`` where

        * ``x_*_set`` are the numeric feature frames of each split,
        * ``dummy_y`` / ``dummy_yv`` are the one-hot encoded training and
          validation targets,
        * ``y_test_set`` is the raw (not encoded) test category column,
        * ``y_class_names`` holds the unique training categories.

    Raises:
        ValueError: if the validation split contains a category that does
            not occur in the training split (raised by the encoder).
    """
    # read_data() returns the splits in the order train, test, validate.
    train_set, test_set, validate_set = (
        split.drop(columns=[" Unnamed: 0 "]) for split in read_data()
    )

    numeric_columns = [" Rating ", " Reviews ", " Installs ", " Price ", " Genres_numeric_value "]

    # Train split: features plus one-hot targets; the encoder is fitted here only.
    x_train_set = train_set[numeric_columns]
    y_train_set = train_set[" Category "]
    encoder = LabelEncoder()
    encoded_y = encoder.fit_transform(y_train_set)
    class_count = len(encoder.classes_)
    dummy_y = np_utils.to_categorical(encoded_y, num_classes=class_count)

    # Validation split: reuse the train-fitted encoder so the class indices
    # and the one-hot width agree with the training targets.
    x_validate_set = validate_set[numeric_columns]
    encoded_yv = encoder.transform(validate_set[" Category "])
    dummy_yv = np_utils.to_categorical(encoded_yv, num_classes=class_count)

    # Test split: the labels are returned as raw category names, so no
    # encoding is performed for them.
    x_test_set = test_set[numeric_columns]
    y_test_set = test_set[" Category "]
    y_class_names = y_train_set.unique()

    return x_train_set, dummy_y, x_validate_set, dummy_yv, x_test_set, y_test_set, y_class_names
# Train the classifier, evaluate it on the test split and log parameters,
# the accuracy metric and the model inside a single MLflow run.
with mlflow.start_run():
    # Optional command-line overrides: argv[1] is the number of epochs,
    # argv[2] and argv[3] are the activation names of the two Dense layers.
    epoch = int(sys.argv[1]) if len(sys.argv) > 1 else 200
    # Activation names are strings and are used as given; converting them
    # with int() raises ValueError for any real activation name.
    first_activation_funct = sys.argv[2] if len(sys.argv) > 2 else " relu "
    second_activation_funct = sys.argv[3] if len(sys.argv) > 3 else " softmax "

    x_train_set, dummy_y, x_validate_set, dummy_yv, x_test_set, y_test_set, y_class_names = data_prep()

    number_of_classes = 33
    number_of_features = 5

    model = Sequential()
    # The input size is declared on the first layer of the stack.
    model.add(Dense(number_of_classes, activation=first_activation_funct, input_dim=number_of_features))
    model.add(Dense(number_of_classes, activation=second_activation_funct))
    model.compile(optimizer=' adam ', loss=' categorical_crossentropy ', metrics=[' accuracy ', ' categorical_accuracy '])
    model.fit(x_train_set, dummy_y, epochs=epoch, validation_data=(x_validate_set, dummy_yv))

    # Model predictions: each output row is mapped to a class name through
    # the position of its largest score in the sorted class names.
    # NOTE(review): this assumes the sorted names line up with the label
    # indices used for the training targets - confirm against data_prep().
    yhat = model.predict(x_test_set)
    class_names = sorted(y_class_names)
    y_pred = [class_names[np.argmax(single_pred)] for single_pred in yhat]
    y_true = list(y_test_set)

    signature = mlflow.models.signature.infer_signature(x_train_set, model.predict(x_train_set))
    input_example = {
        " Rating ": 4.100000,
        " Reviews ": 0.000001,
        " Installs ": 0.000005,
        " Price ": 0.000000,
        " Genres_numeric_value ": 57.000000,
    }

    mlflow.log_param(" epoch ", epoch)
    mlflow.log_param(" 1st_activation_funct ", first_activation_funct)
    mlflow.log_param(" 2nd_activation_funct ", second_activation_funct)
    mlflow.log_metric(" accuracy ", accuracy_score(y_true, y_pred))

    # The model is registered under a name only when the tracking store is
    # not a local file store; the file-store branch logs it unregistered.
    log_model_kwargs = {"signature": signature, "input_example": input_example}
    tracking_url_type_store = urlparse(mlflow.get_tracking_uri()).scheme
    if tracking_url_type_store != " file ":
        log_model_kwargs["registered_model_name"] = " s_444517 "

    # NOTE(review): the model is a Keras Sequential but is logged with the
    # sklearn flavor; mlflow.keras.log_model looks like the matching flavor -
    # confirm before switching, since it changes the stored model format.
    mlflow.sklearn.log_model(model, " my_model_mlflow ", **log_model_kwargs)