save model
s444507-training/pipeline/head: There was a failure building this commit
s444507-evaluation/pipeline/head: This commit looks good

This commit is contained in:
Adam Wojdyla 2022-05-16 01:31:02 +02:00
parent 518f414734
commit 7a446f1753
6 changed files with 208 additions and 11 deletions

.gitignore: 2 changed lines

@@ -154,4 +154,4 @@ fabric.properties
kaggle.json
Car_Prices_Poland_Kaggle*
CarPrices*
IUM08/*


@@ -13,13 +13,18 @@ pipeline {
         }
         stage('Train model with sacred') {
             steps {
-                sh "python3 lab07_sacred.py with 'epochs=$epoch'"
+                sh "python3 lab08_deepLearining_mlflow.py $epoch"
+                archiveArtifacts artifacts: 'games_model.pkl'
+                archiveArtifacts artifacts: 'mlruns/**'
+                archiveArtifacts artifacts: 'my_model/**'
+                sh 'rm -r mlruns'
+                sh 'rm -r my_model'
             }
         }
     }
     post {
         success {
-            archiveArtifacts artifacts: 'prediction_results.csv, CarPrices_pytorch_model.pkl, s444507_sacred_FileObserver/**/*.*', followSymlinks: false
+            archiveArtifacts artifacts: 'CarPrices_pytorch_model.pkl, mlruns/**, my_model/**', followSymlinks: false
         }
         always {
             emailext body: "${currentBuild.currentResult}", subject: 's444507-training', to: 'e19191c5.uam.onmicrosoft.com@emea.teams.ms'

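The training job now archives the mlruns/** tracking store and the exported my_model/ directory alongside the pickled model, so downstream jobs can restore them with copyArtifacts. Once the archived mlruns store is unpacked locally, the logged parameters and metrics can be inspected with the MLflow client. A minimal sketch, assuming the archive has been unpacked to ./mlruns (the path is hypothetical):

from mlflow.tracking import MlflowClient

# Point the client at the restored local file store (hypothetical path)
client = MlflowClient(tracking_uri="file:./mlruns")
experiment = client.get_experiment_by_name("s444507")
for run in client.search_runs([experiment.experiment_id]):
    # Each run carries the logged "epochs" param and "accuracy"/"f1" metrics
    print(run.info.run_id, run.data.params, run.data.metrics)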

@@ -0,0 +1,28 @@
pipeline {
    agent {
        docker { image 's444507_create_dataset_image:latest' }
    }
    parameters {
        string(name: 'epoch', defaultValue: '1000', description: 'Number of epochs to train model.')
    }
    stages {
        stage('Get artifacts') {
            steps {
                copyArtifacts fingerprintArtifacts: true, projectName: 's444507-create-dataset', selector: lastSuccessful()
            }
        }
        stage('Train model with sacred') {
            steps {
                sh "python3 lab07_sacred.py with 'epochs=$epoch'"
            }
        }
    }
    post {
        success {
            archiveArtifacts artifacts: 'prediction_results.csv, CarPrices_pytorch_model.pkl, s444507_sacred_FileObserver/**/*.*', followSymlinks: false
        }
        always {
            emailext body: "${currentBuild.currentResult}", subject: 's444507-training', to: 'e19191c5.uam.onmicrosoft.com@emea.teams.ms'
        }
    }
}

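The with 'epochs=$epoch' part of the sh step uses Sacred's command-line config override, and the archived s444507_sacred_FileObserver/ directory is written by a FileStorageObserver. A minimal sketch of the experiment scaffolding lab07_sacred.py presumably follows (the actual training code lives in the repository; this only shows the Sacred wiring):

from sacred import Experiment
from sacred.observers import FileStorageObserver

ex = Experiment("s444507")
# Writes run config, metrics, and captured output under this directory,
# matching the path archived by the pipeline above
ex.observers.append(FileStorageObserver("s444507_sacred_FileObserver"))

@ex.config
def config():
    epochs = 100  # overridden on the CLI: python3 lab07_sacred.py with 'epochs=1000'

@ex.automain
def run(epochs):
    print(f"training for {epochs} epochs")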

@ -1,13 +1,11 @@
name: tutorial
name: s444507
conda_env: conda.yaml #ścieżka do pliku conda.yaml z definicją środowiska
#docker_env:
# image: mlflow-docker-example-environment
docker_env:
image: adamwojdyla
volumes: ["/mlflow/tmp/mlruns"]
entry_points:
main:
parameters:
epochs: {type: float, default: 0.5}
l1_ratio: {type: float, default: 0.1}
command: "python train.py {epochs}"
epochs: {type: float, default: 1}
command: "python3 lab08_deepLearining_mlflow.py {epochs}"

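With docker_env and the epochs entry-point parameter in place, the project can be launched from the CLI (mlflow run . -P epochs=500) or programmatically. A short sketch, assuming the adamwojdyla image is available locally:

import mlflow

# Equivalent to: mlflow run . -P epochs=500
submitted = mlflow.projects.run(uri=".", entry_point="main",
                                parameters={"epochs": 500})
print("run id:", submitted.run_id)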

@@ -14,6 +14,7 @@ import os
 import matplotlib.pyplot as plt
 import json
+
 class Model(nn.Module):
     def __init__(self, input_dim):
         super(Model, self).__init__()
@@ -27,6 +28,7 @@ class Model(nn.Module):
         x = F.softmax(self.layer3(x))  # to check against the loss function
         return x
+
 def prepare_labels_features(dataset):
     """ Label-encode the car make column and min-max scale the features. """
     le = preprocessing.LabelEncoder()

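For context, LabelEncoder assigns class indices in sorted order, so the five retained makes map to audi=0, bmw=1, ford=2, opel=3, volkswagen=4. A quick illustration:

from sklearn import preprocessing

le = preprocessing.LabelEncoder()
le.fit(['opel', 'bmw', 'audi', 'volkswagen', 'ford'])
print(list(le.classes_))               # ['audi', 'bmw', 'ford', 'opel', 'volkswagen']
print(le.transform(['ford', 'audi']))  # [2 0]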

@@ -0,0 +1,164 @@
#!/usr/bin/python
from urllib.parse import urlparse
import sys
import logging

import numpy as np
import pandas as pd
import torch
from torch import nn
from torch.autograd import Variable
import torch.nn.functional as F
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score
from sklearn import preprocessing
import mlflow
import mlflow.sklearn

logging.basicConfig(level=logging.WARN)
logger = logging.getLogger(__name__)

# mlflow.set_tracking_uri("http://localhost:5000/")
mlflow.set_experiment("s444507")
class Model(nn.Module):
    def __init__(self, input_dim):
        super(Model, self).__init__()
        self.layer1 = nn.Linear(input_dim, 100)
        self.layer2 = nn.Linear(100, 60)
        self.layer3 = nn.Linear(60, 5)

    def forward(self, x):
        x = F.relu(self.layer1(x))
        x = F.relu(self.layer2(x))
        x = F.softmax(self.layer3(x))  # to check against the loss function
        return x
def load_dataset_raw():
    """ Load data from .csv file. """
    cars = pd.read_csv('./Car_Prices_Poland_Kaggle.csv', usecols=[1, 4, 5, 6, 10], sep=',')
    return cars

def load_dataset_files():
    """ Load the shuffled, pre-split dev and train sets from .csv files. """
    cars_dev = pd.read_csv('./Car_Prices_Poland_Kaggle_dev.csv', usecols=[1, 4, 5, 6, 10], sep=',', names=[str(i) for i in range(5)])
    cars_train = pd.read_csv('./Car_Prices_Poland_Kaggle_train.csv', usecols=[1, 4, 5, 6, 10], sep=',', names=[str(i) for i in range(5)])
    return cars_dev, cars_train

def remove_rows(data_dev, data_train):
    """ Keep only the five retained car makes. """
    makes = ['audi', 'bmw', 'ford', 'opel', 'volkswagen']
    dev_removed_rows = data_dev.loc[data_dev['0'].isin(makes)]
    train_removed_rows = data_train.loc[data_train['0'].isin(makes)]
    return dev_removed_rows, train_removed_rows
def prepare_labels_features(dataset):
    """ Label-encode the car make column and min-max scale the features. """
    le = preprocessing.LabelEncoder()
    mark_column = np.array(dataset[:]['0'])
    le.fit(mark_column)
    print(list(le.classes_))
    lab = le.transform(mark_column)
    feat = dataset.drop(['0'], axis=1).to_numpy()
    mm_scaler = preprocessing.MinMaxScaler()
    feat = mm_scaler.fit_transform(feat)
    return lab, feat
def my_main(epoch):
    print("Loading dataset...")
    dev, train = load_dataset_files()
    print("Dataset loaded")

    print("Preparing dataset...")
    dev, train = remove_rows(dev, train)
    labels_train, features_train = prepare_labels_features(train)
    labels_test, features_test = prepare_labels_features(dev)
    print("Dataset prepared")

    # Training
    model = Model(features_train.shape[1])
    optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
    loss_fn = nn.CrossEntropyLoss()

    # The number of epochs is parametrized via the CLI argument
    try:
        epochs_n = int(epoch)
    except Exception as e:
        print(e)
        print("Setting default epochs value to 10.")
        epochs_n = 10
    print(f"Number of epochs: {epochs_n}")
    mlflow.log_param("epochs", epochs_n)

    print("Starting model training...")
    x_train, y_train = Variable(torch.from_numpy(features_train)).float(), Variable(torch.from_numpy(labels_train)).long()
    for epoch in range(1, epochs_n + 1):
        print("Epoch #", epoch)
        y_pred = model(x_train)
        loss = loss_fn(y_pred, y_train)
        print(f"The loss calculated: {loss}")

        optimizer.zero_grad()  # Zero gradients
        loss.backward()        # Gradients
        optimizer.step()       # Update
    print("Model training finished")

    x_test = Variable(torch.from_numpy(features_test)).float()
    pred = model(x_test)
    pred = pred.detach().numpy()
    accuracy = accuracy_score(labels_test, np.argmax(pred, axis=1))
    f1 = f1_score(labels_test, np.argmax(pred, axis=1), average='weighted')
    print(f"The accuracy metric is: {accuracy}")
    mlflow.log_metric("accuracy", accuracy)
    mlflow.log_metric("f1", f1)

    # Infer the model signature so it can be logged with the model
    signature = mlflow.models.signature.infer_signature(features_train, labels_train)

    tracking_url_type_store = urlparse(mlflow.get_tracking_uri()).scheme
    if tracking_url_type_store != "file":
        # A remote tracking server supports the model registry
        mlflow.pytorch.log_model(model, "model", registered_model_name="s444507", signature=signature,
                                 input_example=features_train)
    else:
        # A local file store does not, so log the model and export it to my_model/
        mlflow.pytorch.log_model(model, "model", signature=signature, input_example=features_train)
        mlflow.pytorch.save_model(model, "my_model", signature=signature, input_example=features_train)

    print("Saving model to file...")
    torch.save(model, "CarPrices_pytorch_model.pkl")
    print("Model saved with name: CarPrices_pytorch_model.pkl")
    saved_model = torch.load("CarPrices_pytorch_model.pkl")
    print(np.argmax(saved_model(x_test[0]).detach().numpy(), axis=0))

    pd_predictions = pd.DataFrame(pred)
    pd_predictions.to_csv("./prediction_results.csv")
if __name__ == "__main__":
    try:
        epochs = int(sys.argv[1])
    except Exception as e:
        print(e)
        print("Setting default epochs value to 1000.")
        epochs = 1000

    with mlflow.start_run() as run:
        print("MLflow run experiment_id: {0}".format(run.info.experiment_id))
        print("MLflow run artifact_uri: {0}".format(run.info.artifact_uri))
        my_main(epochs)
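For completeness, the my_model/ directory exported by mlflow.pytorch.save_model above can be reloaded outside the pipeline for inference. A brief usage sketch, assuming a single already min-max-scaled feature row (the four feature values below are made up):

import numpy as np
import torch
import mlflow.pytorch

# Load the model exported to my_model/ by the training script
model = mlflow.pytorch.load_model("my_model")
model.eval()

# One hypothetical scaled row of the four numeric features
x = torch.tensor([[0.5, 0.2, 0.3, 0.1]], dtype=torch.float)
with torch.no_grad():
    probs = model(x)
print("predicted make index:", int(np.argmax(probs.numpy(), axis=1)[0]))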