# 165 lines, 5.5 KiB
from urllib.parse import urlparse
import mlflow
import numpy as np
import torch
from torch import nn
from torch.autograd import Variable
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score
import torch.nn.functional as F
import pandas as pd
from sklearn import preprocessing
import sys
import logging
import mlflow
import mlflow.pytorch
# Module-level logger named after this module; no call sites are visible in
# this excerpt (progress is reported with print() instead).
logger = logging.getLogger(__name__)
class Model(nn.Module):
    """Three-layer fully connected classifier.

    Inputs pass through two hidden layers (100 and 60 units, ReLU) and an
    output layer whose activations are normalised with softmax.

    Args:
        input_dim: Number of input features per sample.
        num_classes: Size of the output layer. Defaults to 5, the value that
            used to be hard-coded.
    """

    def __init__(self, input_dim: int, num_classes: int = 5):
        super().__init__()
        self.layer1 = nn.Linear(input_dim, 100)
        self.layer2 = nn.Linear(100, 60)
        self.layer3 = nn.Linear(60, num_classes)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Return softmax class probabilities for ``x``.

        Args:
            x: Tensor whose last dimension holds ``input_dim`` features. It
                is cast to float32 before entering the first layer.

        Returns:
            Tensor with ``num_classes`` probabilities along the last
            dimension (they sum to 1 for every sample).
        """
        # Only the input needs the cast; the layer outputs are already float.
        x = F.relu(self.layer1(x.float()))
        x = F.relu(self.layer2(x))
        # An explicit dim avoids the deprecated implicit-dimension choice and
        # works for both a batch (2-D) and a single sample (1-D).
        # NOTE(review): nn.CrossEntropyLoss expects raw logits; if this model
        # is trained with it, the softmax here is applied on top of the
        # loss's own log-softmax. Kept so the output contract is unchanged.
        return F.softmax(self.layer3(x), dim=-1)
def load_dataset_raw(path='./Car_Prices_Poland_Kaggle.csv'):
    """Load the selected columns of the raw car-price CSV file.

    Args:
        path: Location of the comma-separated file. Defaults to the path
            that used to be hard-coded, so existing callers are unaffected.

    Returns:
        DataFrame with the columns found at positions 1, 4, 5, 6 and 10 of
        the file, named after the file's header row.
    """
    return pd.read_csv(path, usecols=[1, 4, 5, 6, 10], sep=',')
def load_dataset_files(dev_path='./Car_Prices_Poland_Kaggle_dev.csv',
                       train_path='./Car_Prices_Poland_Kaggle_train.csv'):
    """Load the dev and train splits from their CSV files.

    Both files are read without a header row; the columns at positions
    1, 4, 5, 6 and 10 are kept and renamed to '0' .. '4'.

    Args:
        dev_path: Location of the dev split. Defaults to the previously
            hard-coded path.
        train_path: Location of the train split. Defaults to the previously
            hard-coded path.

    Returns:
        Tuple ``(cars_dev, cars_train)`` of DataFrames.
    """
    column_names = [str(i) for i in range(5)]

    def _read_split(path):
        # One shared reader keeps the two splits parsed identically.
        return pd.read_csv(path, usecols=[1, 4, 5, 6, 10], sep=',',
                           names=column_names)

    return _read_split(dev_path), _read_split(train_path)
def remove_rows(data_dev, data_train,
                makes=('audi', 'bmw', 'ford', 'opel', 'volkswagen')):
    """Keep only the rows whose column '0' is one of ``makes``.

    Args:
        data_dev: Dev DataFrame with a column named '0'.
        data_train: Train DataFrame with a column named '0'.
        makes: Values of column '0' to keep. Defaults to the five values
            that used to be hard-coded.

    Returns:
        Tuple ``(dev_removed_rows, train_removed_rows)`` with the filtered
        frames; the original row indices are preserved.
    """
    allowed = list(makes)

    def _keep_allowed(frame):
        # isin replaces the chained ``==``/``|`` comparisons one-for-one.
        return frame.loc[frame['0'].isin(allowed)]

    return _keep_allowed(data_dev), _keep_allowed(data_train)
def prepare_labels_features(dataset):
    """Split ``dataset`` into encoded labels and min-max scaled features.

    Args:
        dataset: DataFrame with the class values in column '0'; every other
            column is treated as a feature.

    Returns:
        Tuple ``(lab, feat)``: ``lab`` holds the integer codes produced by
        ``LabelEncoder`` for column '0', ``feat`` the remaining columns
        scaled by ``MinMaxScaler``.
    """
    label_encoder = preprocessing.LabelEncoder()
    # A fresh encoder knows no classes yet, so it has to be fitted here;
    # calling transform() alone raises NotFittedError.
    lab = label_encoder.fit_transform(np.array(dataset['0']))

    feat = dataset.drop(['0'], axis=1).to_numpy()
    # NOTE(review): encoder and scaler are fitted anew on every call, so the
    # dev and train sets passed in by my_main get independently derived
    # label codes and feature ranges.
    feat = preprocessing.MinMaxScaler().fit_transform(feat)
    return lab, feat
def my_main(epoch):
    """Train the classifier, evaluate it on the dev set and log to MLflow.

    The function loads and filters the dev/train files, trains ``Model`` with
    Adam and cross-entropy loss on the full training set each epoch, logs the
    epoch count, accuracy and weighted F1 to MLflow, logs/saves the model and
    finally writes it to ``CarPrices_pytorch_model.pkl``.

    Args:
        epoch: Number of training epochs. Any value ``int()`` cannot convert
            falls back to 10.
    """
    print("Loading dataset...")
    dev, train = load_dataset_files()
    print("Dataset loaded")

    print("Preparing dataset...")
    dev, train = remove_rows(dev, train)
    labels_train, features_train = prepare_labels_features(train)
    labels_test, features_test = prepare_labels_features(dev)
    print("Dataset prepared")

    # Training
    model = Model(features_train.shape[1])
    optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
    # NOTE(review): Model.forward already applies softmax, while
    # CrossEntropyLoss expects raw logits - confirm this is intended.
    loss_fn = nn.CrossEntropyLoss()

    # number of epochs is parametrized
    try:
        epochs_n = int(epoch)
    except (TypeError, ValueError, OverflowError):
        print("Setting default epochs value to 10.")
        epochs_n = 10
    print(f"Number of epochs: {epochs_n}")
    mlflow.log_param("epochs", epochs_n)

    print("Starting model training...")
    # Plain tensors are enough; the Variable wrapper is a deprecated no-op.
    x_train = torch.from_numpy(features_train).float()
    y_train = torch.from_numpy(labels_train).long()
    # A separate loop name keeps the ``epoch`` argument from being shadowed.
    for epoch_idx in range(1, epochs_n + 1):
        print("Epoch #", epoch_idx)
        y_pred = model(x_train)
        loss = loss_fn(y_pred, y_train)
        print(f"The loss calculated: {loss}")
        # Zero gradients - without this, gradients from earlier epochs
        # accumulate into every optimizer step.
        optimizer.zero_grad()
        loss.backward()  # Gradients
        optimizer.step()  # Update
    print("Model training finished")

    # Evaluation on the dev set; no gradients are needed for inference.
    x_test = torch.from_numpy(features_test).float()
    with torch.no_grad():
        pred = model(x_test).numpy()
    predicted_classes = np.argmax(pred, axis=1)
    accuracy = accuracy_score(labels_test, predicted_classes)
    f1 = f1_score(labels_test, predicted_classes, average='weighted')
    print(f"The accuracy metric is: {accuracy}")
    mlflow.log_metric("accuracy", accuracy)
    mlflow.log_metric("f1", f1)

    # Infer model signature to log it
    signature = mlflow.models.signature.infer_signature(features_train, labels_train)
    tracking_url_type_store = urlparse(mlflow.get_tracking_uri()).scheme
    # NOTE(review): the branch layout below was reconstructed from a
    # truncated call; it follows the usual MLflow pattern (register the
    # model only when the tracking store is not a plain file store).
    # Confirm that save_model belongs to the file-store branch.
    if tracking_url_type_store != "file":
        mlflow.pytorch.log_model(model, "model", registered_model_name="s444507",
                                 signature=signature, input_example=features_train)
    else:
        mlflow.pytorch.log_model(model, "model", signature=signature,
                                 input_example=features_train)
        mlflow.pytorch.save_model(model, "my_model", signature=signature,
                                  input_example=features_train)

    print("Saving model to file...")
    torch.save(model, "CarPrices_pytorch_model.pkl")
    print("Model saved with name: CarPrices_pytorch_model.pkl")

    # Sanity check: reload the pickled model and classify the first dev row.
    # NOTE(review): recent PyTorch releases default torch.load to
    # weights_only=True, which rejects a fully pickled module - verify
    # against the installed version.
    saved_model = torch.load("CarPrices_pytorch_model.pkl")
    print(np.argmax(saved_model(x_test[0]).detach().numpy(), axis=0))

    # NOTE(review): not used by any code visible in this function.
    pd_predictions = pd.DataFrame(pred)
# The number of epochs comes from the first command-line argument; fall back
# to 100 when it is missing or is not an integer.
try:
    epochs = int(sys.argv[1])
except (IndexError, ValueError):
    # The message now states the value that is actually assigned below.
    print("Setting default epochs value to 100.")
    epochs = 100

with mlflow.start_run() as run:
    print("MLflow run experiment_id: {0}".format(run.info.experiment_id))
    print("MLflow run artifact_uri: {0}".format(run.info.artifact_uri))
    # NOTE(review): no call to my_main(epochs) is visible in this excerpt;
    # confirm that training is started inside this run context.