#!/usr/bin/python
"""Train a small feed-forward car-make classifier and track the run in MLflow.

The script reads the pre-split ``Car_Prices_Poland_Kaggle_{dev,train}.csv``
files, keeps only the rows whose make is one of ``KEPT_MAKES``, trains
``Model`` on the training split, evaluates it on the dev split, logs the
parameters, metrics and model to MLflow, and writes the predictions to
``prediction_results.csv``.

Usage: ``python <script> [epochs]`` (``epochs`` defaults to 100).
"""
import logging
import sys
from urllib.parse import urlparse

import mlflow
import mlflow.pytorch
import numpy as np
import pandas as pd
import torch
import torch.nn.functional as F
from sklearn import preprocessing
from sklearn.metrics import accuracy_score, f1_score
from torch import nn
from torch.autograd import Variable  # noqa: F401 (no longer used below; import kept)

logging.basicConfig(level=logging.WARN)
logger = logging.getLogger(__name__)

# mlflow.set_tracking_uri("http://172.17.0.1:5000")
mlflow.set_experiment("s444507")

# Positional indices of the CSV columns that are read.
USED_COLUMNS = [1, 4, 5, 6, 10]
# The pre-split files are read with generated column names "0".."4".
COLUMN_NAMES = [str(i) for i in range(len(USED_COLUMNS))]
# Column "0" holds the make, which is the classification target.
MAKE_COLUMN = "0"
# Makes that are kept; their count must match the size of Model's output layer.
KEPT_MAKES = ("audi", "bmw", "ford", "opel", "volkswagen")


class Model(nn.Module):
    """Three-layer fully connected classifier with five output classes."""

    def __init__(self, input_dim):
        """Build the layers.

        Args:
            input_dim: Number of input features per sample.
        """
        super().__init__()
        self.layer1 = nn.Linear(input_dim, 100)
        self.layer2 = nn.Linear(100, 60)
        self.layer3 = nn.Linear(60, 5)

    def forward(self, x):
        """Return the raw class scores (logits) for ``x``.

        No softmax is applied here: ``nn.CrossEntropyLoss`` applies
        log-softmax itself, so feeding it probabilities would normalise
        twice and flatten the gradients. Callers that need probabilities
        apply ``F.softmax(..., dim=-1)`` to the result.
        """
        x = F.relu(self.layer1(x.float()))
        x = F.relu(self.layer2(x))
        return self.layer3(x)


def load_dataset_raw():
    """Load the selected columns of the full, unsplit .csv file."""
    cars = pd.read_csv(
        './Car_Prices_Poland_Kaggle.csv',
        usecols=USED_COLUMNS,
        sep=',',
    )
    return cars


def load_dataset_files():
    """Load the shuffled, pre-split dev and train .csv files.

    Returns:
        A ``(cars_dev, cars_train)`` tuple of DataFrames whose columns are
        named ``"0"`` .. ``"4"``.
    """
    cars_dev = pd.read_csv(
        './Car_Prices_Poland_Kaggle_dev.csv',
        usecols=USED_COLUMNS,
        sep=',',
        names=COLUMN_NAMES,
    )
    cars_train = pd.read_csv(
        './Car_Prices_Poland_Kaggle_train.csv',
        usecols=USED_COLUMNS,
        sep=',',
        names=COLUMN_NAMES,
    )
    return cars_dev, cars_train


def remove_rows(data_dev, data_train):
    """Keep only the rows whose make is one of ``KEPT_MAKES``.

    Returns:
        A ``(dev, train)`` tuple of filtered DataFrames.
    """
    dev_removed_rows = data_dev.loc[data_dev[MAKE_COLUMN].isin(KEPT_MAKES)]
    train_removed_rows = data_train.loc[data_train[MAKE_COLUMN].isin(KEPT_MAKES)]
    return dev_removed_rows, train_removed_rows


def _fit_preprocessors(dataset):
    """Fit a label encoder and a min-max scaler on ``dataset``."""
    label_encoder = preprocessing.LabelEncoder()
    label_encoder.fit(dataset[MAKE_COLUMN].to_numpy())
    scaler = preprocessing.MinMaxScaler()
    scaler.fit(dataset.drop(columns=[MAKE_COLUMN]).to_numpy())
    return label_encoder, scaler


def prepare_labels_features(dataset, label_encoder=None, scaler=None):
    """Encode the make column as integer labels and scale the features.

    Args:
        dataset: DataFrame with the make in column ``"0"`` and the features
            in the remaining columns.
        label_encoder: Optional already-fitted ``LabelEncoder``. When omitted,
            a new one is fitted on ``dataset``.
        scaler: Optional already-fitted ``MinMaxScaler``. When omitted, a new
            one is fitted on ``dataset``.

    Returns:
        A ``(labels, features)`` tuple of NumPy arrays.

    Pass the preprocessors fitted on the training split when preparing an
    evaluation split, so both splits share one label mapping and one feature
    scale.
    """
    make_column = dataset[MAKE_COLUMN].to_numpy()
    if label_encoder is None:
        label_encoder = preprocessing.LabelEncoder()
        label_encoder.fit(make_column)
    print(list(label_encoder.classes_))
    lab = label_encoder.transform(make_column)

    feat = dataset.drop(columns=[MAKE_COLUMN]).to_numpy()
    if scaler is None:
        scaler = preprocessing.MinMaxScaler()
        scaler.fit(feat)
    feat = scaler.transform(feat)
    return lab, feat


def my_main(epoch):
    """Train, evaluate, log and save the model.

    Must be called inside an active MLflow run, because it logs parameters,
    metrics and the model.

    Args:
        epoch: Number of training epochs; anything ``int()`` cannot convert
            falls back to 10.
    """
    print("Loading dataset...")
    dev, train = load_dataset_files()
    print("Dataset loaded")

    print("Preparing dataset...")
    dev, train = remove_rows(dev, train)
    # Fit the preprocessing on the training split only and reuse it for the
    # dev split; fitting separately would scale the two splits differently.
    label_encoder, scaler = _fit_preprocessors(train)
    labels_train, features_train = prepare_labels_features(train, label_encoder, scaler)
    labels_test, features_test = prepare_labels_features(dev, label_encoder, scaler)
    print("Dataset prepared")

    # Training
    model = Model(features_train.shape[1])
    optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
    loss_fn = nn.CrossEntropyLoss()  # expects raw logits, see Model.forward

    # The number of epochs is parametrized.
    try:
        epochs_n = int(epoch)
    except (TypeError, ValueError, OverflowError) as e:
        print(e)
        print("Setting default epochs value to 10.")
        epochs_n = 10
    print(f"Number of epochs: {epochs_n}")
    mlflow.log_param("epochs", epochs_n)

    print("Starting model training...")
    x_train = torch.from_numpy(features_train).float()
    y_train = torch.from_numpy(labels_train).long()
    # Full-batch training: every epoch is a single step over the whole set.
    for epoch_idx in range(1, epochs_n + 1):
        print("Epoch #", epoch_idx)
        logits = model(x_train)
        loss = loss_fn(logits, y_train)
        print(f"The loss calculated: {loss}")
        optimizer.zero_grad()  # Zero gradients
        loss.backward()  # Gradients
        optimizer.step()  # Update
    print("Model training finished")

    # Evaluation
    x_test = torch.from_numpy(features_test).float()
    with torch.no_grad():
        # Convert logits to per-class probabilities for the prediction file.
        pred = F.softmax(model(x_test), dim=1).numpy()
    predicted_labels = np.argmax(pred, axis=1)

    accuracy = accuracy_score(labels_test, predicted_labels)
    f1 = f1_score(labels_test, predicted_labels, average='weighted')
    print(f"The accuracy metric is: {accuracy}")
    mlflow.log_metric("accuracy", accuracy)
    mlflow.log_metric("f1", f1)

    # Infer model signature to log it
    signature = mlflow.models.signature.infer_signature(features_train, labels_train)

    # The model registry is not available with a file-based tracking store.
    tracking_url_type_store = urlparse(mlflow.get_tracking_uri()).scheme
    if tracking_url_type_store != "file":
        mlflow.pytorch.log_model(
            model,
            "model",
            registered_model_name="s444507",
            signature=signature,
            input_example=features_train,
        )
    else:
        mlflow.pytorch.log_model(
            model,
            "model",
            signature=signature,
            input_example=features_train,
        )
        mlflow.pytorch.save_model(
            model,
            "my_model",
            signature=signature,
            input_example=features_train,
        )

    print("Saving model to file...")
    torch.save(model, "CarPrices_pytorch_model.pkl")
    print("Model saved with name: CarPrices_pytorch_model.pkl")

    # Sanity check: reload the pickled model and classify the first dev sample.
    # NOTE(review): recent PyTorch releases reportedly default torch.load to
    # weights_only=True, which rejects fully pickled modules - confirm against
    # the installed version.
    saved_model = torch.load("CarPrices_pytorch_model.pkl")
    print(np.argmax(saved_model(x_test[0]).detach().numpy(), axis=0))

    pd_predictions = pd.DataFrame(pred)
    pd_predictions.to_csv("./prediction_results.csv")


try:
    epochs = int(sys.argv[1])
except (IndexError, ValueError) as e:
    print(e)
    print("Setting default epochs value to 100.")
    epochs = 100

with mlflow.start_run() as run:
    print("MLflow run experiment_id: {0}".format(run.info.experiment_id))
    print("MLflow run artifact_uri: {0}".format(run.info.artifact_uri))
    my_main(epochs)