From 8c71cb721c377db36540b18574b44f3e5258d493 Mon Sep 17 00:00:00 2001 From: Adam Wojdyla Date: Sat, 4 Jun 2022 13:38:43 +0200 Subject: [PATCH 1/8] Dvc init --- .dvc/.gitignore | 3 +++ .dvc/config | 0 .dvcignore | 3 +++ 3 files changed, 6 insertions(+) create mode 100644 .dvc/.gitignore create mode 100644 .dvc/config create mode 100644 .dvcignore diff --git a/.dvc/.gitignore b/.dvc/.gitignore new file mode 100644 index 0000000..528f30c --- /dev/null +++ b/.dvc/.gitignore @@ -0,0 +1,3 @@ +/config.local +/tmp +/cache diff --git a/.dvc/config b/.dvc/config new file mode 100644 index 0000000..e69de29 diff --git a/.dvcignore b/.dvcignore new file mode 100644 index 0000000..5197305 --- /dev/null +++ b/.dvcignore @@ -0,0 +1,3 @@ +# Add patterns of files dvc should ignore, which could improve +# the performance. Learn more at +# https://dvc.org/doc/user-guide/dvcignore From 927e47d943f93274564da201e638dec67d9ecb9f Mon Sep 17 00:00:00 2001 From: Adam Wojdyla Date: Sat, 4 Jun 2022 13:40:00 +0200 Subject: [PATCH 2/8] dvc data added --- .gitignore | 2 +- data/Car_Prices_Poland_Kaggle.csv.dvc | 4 ++++ 2 files changed, 5 insertions(+), 1 deletion(-) create mode 100644 data/Car_Prices_Poland_Kaggle.csv.dvc diff --git a/.gitignore b/.gitignore index 6e381fa..6743dda 100644 --- a/.gitignore +++ b/.gitignore @@ -152,7 +152,7 @@ fabric.properties # kaggle kaggle.json -Car_Prices_Poland_Kaggle* +Car_Prices_Poland_Kaggle*.csv CarPrices* IUM08/* .DS_store diff --git a/data/Car_Prices_Poland_Kaggle.csv.dvc b/data/Car_Prices_Poland_Kaggle.csv.dvc new file mode 100644 index 0000000..e12d1da --- /dev/null +++ b/data/Car_Prices_Poland_Kaggle.csv.dvc @@ -0,0 +1,4 @@ +outs: +- md5: 9170e9b525149cb1f571f318cd604913 + size: 9894367 + path: Car_Prices_Poland_Kaggle.csv From 881a4229cddac57c7f82c3a765690352432b1ee2 Mon Sep 17 00:00:00 2001 From: Adam Wojdyla Date: Sat, 4 Jun 2022 14:39:00 +0200 Subject: [PATCH 3/8] local dvc remote --- .dvc/config | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/.dvc/config b/.dvc/config index e69de29..01f6c65 100644 --- a/.dvc/config +++ b/.dvc/config @@ -0,0 +1,6 @@ +[core] + remote = my_local_remote +['remote "ium_ssh_remote"'] + url = ssh://ium-sftp@tzietkiewicz.vm.wmi.amu.edu.pl +['remote "my_local_remote"'] + url = /dvcstore From c345b9a6579ff1e11177ffa3a20ddb1ffc2e2787 Mon Sep 17 00:00:00 2001 From: Adam Wojdyla Date: Sat, 4 Jun 2022 14:59:17 +0200 Subject: [PATCH 4/8] local remote added --- .dvc/config | 4 +--- .gitignore | 3 ++- data/Car_Prices_Poland_Kaggle.csv.dvc | 0 3 files changed, 3 insertions(+), 4 deletions(-) mode change 100644 => 100755 data/Car_Prices_Poland_Kaggle.csv.dvc diff --git a/.dvc/config b/.dvc/config index 01f6c65..ae782f7 100644 --- a/.dvc/config +++ b/.dvc/config @@ -1,6 +1,4 @@ [core] remote = my_local_remote -['remote "ium_ssh_remote"'] - url = ssh://ium-sftp@tzietkiewicz.vm.wmi.amu.edu.pl ['remote "my_local_remote"'] - url = /dvcstore + url = /Users/adamwojdyla/Documents/Studia/Magisterskie/1_sem/IUM/ium_444507/dvcstore diff --git a/.gitignore b/.gitignore index 6743dda..92d3292 100644 --- a/.gitignore +++ b/.gitignore @@ -158,4 +158,5 @@ IUM08/* .DS_store *.db mlruns -my_model \ No newline at end of file +my_model +dvcstore diff --git a/data/Car_Prices_Poland_Kaggle.csv.dvc b/data/Car_Prices_Poland_Kaggle.csv.dvc old mode 100644 new mode 100755 From a9ad0e2ee1b007ab1b7fa377c7c7247a0c55b1f6 Mon Sep 17 00:00:00 2001 From: Adam Wojdyla Date: Sat, 4 Jun 2022 15:26:11 +0200 Subject: [PATCH 5/8] remote 2 added --- .dvc/config | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/.dvc/config b/.dvc/config index ae782f7..8d28287 100644 --- a/.dvc/config +++ b/.dvc/config @@ -1,4 +1,6 @@ [core] - remote = my_local_remote + remote = ium_ssh_remote ['remote "my_local_remote"'] url = /Users/adamwojdyla/Documents/Studia/Magisterskie/1_sem/IUM/ium_444507/dvcstore +['remote "ium_ssh_remote"'] + url = ssh://ium-sftp@tzietkiewicz.vm.wmi.amu.edu.pl From 54ce588c87c15ed10b092f1f53f81bc8346988bc Mon Sep 17 00:00:00 2001 From: Adam Wojdyla Date: Sat, 4 Jun 2022 22:15:20 +0200 Subject: [PATCH 6/8] dvc repro --- .gitignore | 1 + data/.gitignore | 1 + dvc.lock | 46 +++++++++++++++++ dvc.yaml | 23 +++++++++ lab06_evaluation.py | 4 +- lab08_deepLearining_mlflow.py | 3 -- lab10_evaluate.py | 96 +++++++++++++++++++++++++++++++++++ script_prepare.py | 60 ++++++++++++++++++++++ 8 files changed, 229 insertions(+), 5 deletions(-) create mode 100644 data/.gitignore create mode 100644 dvc.lock create mode 100644 dvc.yaml create mode 100644 lab10_evaluate.py create mode 100755 script_prepare.py diff --git a/.gitignore b/.gitignore index 92d3292..3ff8f0c 100644 --- a/.gitignore +++ b/.gitignore @@ -160,3 +160,4 @@ IUM08/* mlruns my_model dvcstore +/prediction_results.csv diff --git a/data/.gitignore b/data/.gitignore new file mode 100644 index 0000000..348b87e --- /dev/null +++ b/data/.gitignore @@ -0,0 +1 @@ +/prepared diff --git a/dvc.lock b/dvc.lock new file mode 100644 index 0000000..905c8c1 --- /dev/null +++ b/dvc.lock @@ -0,0 +1,46 @@ +schema: '2.0' +stages: + prepare: + cmd: python3 script_prepare.py data/Car_Prices_Poland_Kaggle.csv + deps: + - path: data/Car_Prices_Poland_Kaggle.csv + md5: 9170e9b525149cb1f571f318cd604913 + size: 9894367 + - path: script_prepare.py + md5: f1dfe33a503f5acc687c53dee448f71b + size: 1899 + outs: + - path: data/Car_Prices_Poland_Kaggle_dev.csv + md5: cf9355749edc79f588e264de5b2bf1f0 + size: 1648309 + - path: data/Car_Prices_Poland_Kaggle_test.csv + md5: cf9355749edc79f588e264de5b2bf1f0 + size: 1648309 + - path: data/Car_Prices_Poland_Kaggle_train.csv + md5: 8818f758e2de344a4b9ad712379b81e1 + size: 6597472 + train: + cmd: python3 lab05_deepLearning.py 50 + deps: + - path: data/Car_Prices_Poland_Kaggle_dev.csv + md5: cf9355749edc79f588e264de5b2bf1f0 + size: 1648309 + - path: data/Car_Prices_Poland_Kaggle_test.csv + md5: cf9355749edc79f588e264de5b2bf1f0 + size: 1648309 + - path: data/Car_Prices_Poland_Kaggle_train.csv + md5: 8818f758e2de344a4b9ad712379b81e1 + size: 6597472 + outs: + - path: CarPrices_pytorch_model.pkl + md5: cff6a79945bbf839058a4fd1b2dcc98f + size: 30039 + - path: prediction_results.csv + md5: 62b9e54cdfebc7f1dfb060e18e9b8738 + size: 585197 + evaluate: + cmd: python3 lab10_evaluate.py + deps: + - path: CarPrices_pytorch_model.pkl + md5: cff6a79945bbf839058a4fd1b2dcc98f + size: 30039 diff --git a/dvc.yaml b/dvc.yaml new file mode 100644 index 0000000..6e03281 --- /dev/null +++ b/dvc.yaml @@ -0,0 +1,23 @@ +stages: + prepare: + cmd: python3 script_prepare.py data/Car_Prices_Poland_Kaggle.csv + deps: + - data/Car_Prices_Poland_Kaggle.csv + - script_prepare.py + outs: + - data/Car_Prices_Poland_Kaggle_dev.csv + - data/Car_Prices_Poland_Kaggle_train.csv + - data/Car_Prices_Poland_Kaggle_test.csv + train: + cmd: python3 lab05_deepLearning.py 50 + deps: + - data/Car_Prices_Poland_Kaggle_dev.csv + - data/Car_Prices_Poland_Kaggle_train.csv + - data/Car_Prices_Poland_Kaggle_test.csv + outs: + - CarPrices_pytorch_model.pkl + - prediction_results.csv + evaluate: + cmd: python3 lab10_evaluate.py + deps: + - CarPrices_pytorch_model.pkl diff --git a/lab06_evaluation.py b/lab06_evaluation.py index 3a1c334..d9d9721 100644 --- a/lab06_evaluation.py +++ b/lab06_evaluation.py @@ -90,9 +90,9 @@ labels_test, features_test = prepare_labels_features(cars_dev) x_test = Variable(torch.from_numpy(features_test)).float() pred = model(x_test) pred = pred.detach().numpy() -print_metrics(labels_test, pred) +# print_metrics(labels_test, pred) -draw_plot() +# draw_plot() diff --git a/lab08_deepLearining_mlflow.py b/lab08_deepLearining_mlflow.py index b16ffe7..cc11b83 100644 --- a/lab08_deepLearining_mlflow.py +++ b/lab08_deepLearining_mlflow.py @@ -1,13 +1,10 @@ #!/usr/bin/python from urllib.parse import urlparse -import mlflow import numpy as np import torch from torch import nn from torch.autograd import Variable -from sklearn.datasets import load_iris -from sklearn.model_selection import train_test_split from sklearn.metrics import accuracy_score, f1_score import torch.nn.functional as F import pandas as pd diff --git a/lab10_evaluate.py b/lab10_evaluate.py new file mode 100644 index 0000000..31d9109 --- /dev/null +++ b/lab10_evaluate.py @@ -0,0 +1,96 @@ +#!/usr/bin/python + +import torch +from torch import nn +import pandas as pd +from sklearn import preprocessing +import numpy as np +from torch.autograd import Variable +from sklearn.metrics import accuracy_score, f1_score +from csv import DictWriter +import torch.nn.functional as F +import sys +import os +import matplotlib.pyplot as plt + + +class Model(nn.Module): + def __init__(self, input_dim): + super(Model, self).__init__() + self.layer1 = nn.Linear(input_dim, 100) + self.layer2 = nn.Linear(100, 60) + self.layer3 = nn.Linear(60, 5) + + def forward(self, x): + x = F.relu(self.layer1(x)) + x = F.relu(self.layer2(x)) + x = F.softmax(self.layer3(x)) # To check with the loss function + return x + + +def prepare_labels_features(dataset): + """ Label make column""" + dataset = dataset.dropna() + le = preprocessing.LabelEncoder() + mark_column = np.array(dataset[:]['0']) + le.fit(mark_column) + + print(list(le.classes_)) + lab = le.transform(mark_column) + feat = dataset.drop(['0'], axis=1).to_numpy() + + mm_scaler = preprocessing.StandardScaler() + feat = mm_scaler.fit_transform(feat) + + return lab, feat + + +def print_metrics(test_labels, predictions): + # take column with max predicted score + f1 = f1_score(labels_test, np.argmax(predictions, axis=1), average='weighted') + accuracy = accuracy_score(test_labels, np.argmax(predictions, axis=1)) + print(f"The F1_score metric is: {f1}") + print(f"The accuracy metric is: {accuracy}") + + if len(sys.argv) != 2: + return + + + build_number = sys.argv[1] + print(f"Build number: {build_number}") + field_names = ['BUILD_NUMBER', 'F1', 'ACCURACY'] + dict = {'BUILD_NUMBER': build_number, 'F1': f1, 'ACCURACY': accuracy } + filename = "./metrics.csv" + file_exists = os.path.isfile(filename) + + with open(filename, 'a') as metrics_file: + dictwriter_object = DictWriter(metrics_file, fieldnames=field_names) + if not file_exists: + dictwriter_object.writeheader() + dictwriter_object.writerow(dict) + metrics_file.close() + + +""" +Load model and data +""" +model = torch.load("CarPrices_pytorch_model.pkl") +cars_dev = pd.read_csv('data/Car_Prices_Poland_Kaggle_dev.csv', usecols=[1, 4, 5, 6, 10], sep=',', names=[str(i) for i in range(5)]) + + +""" +Prepare data +""" +cars_dev = cars_dev.loc[(cars_dev['0'] == 'audi') | (cars_dev['0'] == 'bmw') | (cars_dev['0'] == 'ford') | (cars_dev['0'] == 'opel') | (cars_dev['0'] == 'volkswagen')] +labels_test, features_test = prepare_labels_features(cars_dev) +x_test = Variable(torch.from_numpy(features_test)).float() + +""" +Make predictions +""" +pred = model(x_test) +pred = pred.detach().numpy() +print_metrics(labels_test, pred) + + + diff --git a/script_prepare.py b/script_prepare.py new file mode 100755 index 0000000..b123619 --- /dev/null +++ b/script_prepare.py @@ -0,0 +1,60 @@ +import subprocess +import sys +import pandas as pd +import os +import numpy as np + + +try: + dataset_path = sys.argv[1] +except Exception as e: + print("Exception while retrieving dataset path") + print(e) + + +def divide_dataset(dataset, path): + """Split dataset to dev, train, test datasets. """ + + print('Shuffle dataset...') + shuf_path = 'data/Car_Prices_Poland_Kaggle_shuf.csv' + os.system(f'tail -n +2 {path} | shuf > {shuf_path}') + + len1 = len(dataset) // 6 + len2 = (len1 * 2) + 1 + + print('Dividing dataset...') + os.system(f'head -n {len1} {shuf_path} > data/Car_Prices_Poland_Kaggle_dev.csv') + os.system(f'head -n {len1} {shuf_path} | tail -n {len1} > data/Car_Prices_Poland_Kaggle_test.csv') + os.system(f'tail -n +{len2} {shuf_path} > data/Car_Prices_Poland_Kaggle_train.csv') + + os.system(f'rm {shuf_path}') + print("Len match: " + str(sum([len1 * 2, len2]) == len(dataset))) + os.system('cat Car_Prices_Poland_Kaggle_train.csv | wc -l') + os.system('cat Car_Prices_Poland_Kaggle_dev.csv | wc -l') + os.system('cat Car_Prices_Poland_Kaggle_test.csv | wc -l') + + print('Dataset devided') + + +def normalize_dataset(dataset): + """Drop unnecessary columns and set numeric values to [0,1] range""" + + print(f'--------------- Initial dataset length ---------------') + print(len(dataset)) + + # drop columns + dataset.drop(columns=["Unnamed: 0", "generation_name"], inplace=True) + dataset = dataset.dropna() + + # normalize numbers to [0, 1] + for column in dataset.columns: + if isinstance(dataset.iloc[1][column], np.int64) or isinstance(dataset.iloc[1][column], np.float64): + dataset[column] = (dataset[column] - dataset[column].min()) / (dataset[column].max() - dataset[column].min()) + return dataset + + +cars = pd.read_csv(dataset_path) +df = pd.DataFrame(cars) +df = normalize_dataset(df) +divide_dataset(df, dataset_path) + From 99f9a39d12158de5500fde7f1aef7df8a979c4d5 Mon Sep 17 00:00:00 2001 From: Adam Wojdyla Date: Sun, 5 Jun 2022 09:34:51 +0200 Subject: [PATCH 7/8] jenkinsfile dvc --- Jenkinsfile_dvc | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) create mode 100644 Jenkinsfile_dvc diff --git a/Jenkinsfile_dvc b/Jenkinsfile_dvc new file mode 100644 index 0000000..e8660f2 --- /dev/null +++ b/Jenkinsfile_dvc @@ -0,0 +1,28 @@ +pipeline { + agent { + docker { image 's444507_create_dataset_image' } + } + parameters { + buildSelector(defaultSelector: lastSuccessful(), description: 'Which build to use for copying artifacts.', name: 'BUILD_SELECTOR') + gitParameter branchFilter: 'origin/(.*)', defaultValue: 'master', name: 'BRANCH', type: 'PT_BRANCH' + + } + stages { + stage('DVC') { + steps { + withCredentials( + [sshUserPrivateKey(credentialsId: '48ac7004-216e-4260-abba-1fe5db753e18', keyFileVariable: 'IUM_SFTP_KEY', passphraseVariable: '', usernameVariable: '')]) { + sh 'dvc remote add -d ium_ssh_remote ssh://ium-sftp@tzietkiewicz.vm.wmi.amu.edu.pl/ium-sftp' + sh 'dvc remote modify --local ium_ssh_remote keyfile $IUM_SFTP_KEY' + sh 'dvc pull' + sh 'dvc repro' + } + } + } + } + post { + success { + archiveArtifacts artifacts: 'prediction_results.csv, *.pkl', followSymlinks: false + } + } +} \ No newline at end of file From 61cab884dc8c1aca784c499d74187a40d3c0d8f9 Mon Sep 17 00:00:00 2001 From: Adam Wojdyla Date: Sun, 5 Jun 2022 10:08:50 +0200 Subject: [PATCH 8/8] dockerfile update --- Dockerfile | 4 ++++ dvc.yaml | 2 +- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/Dockerfile b/Dockerfile index bc14a6b..45e5519 100644 --- a/Dockerfile +++ b/Dockerfile @@ -25,6 +25,9 @@ RUN python3 -m pip install matplotlib RUN python3 -m pip install sacred RUN python3 -m pip install pymongo RUN python3 -m pip install mlflow +RUN python3 -m pip install dvc +RUN python3 -m pip install dvc[ssh] paramiko + RUN python3 -m pip freeze ENV PATH="/root/.local/bin:${PATH}" @@ -35,6 +38,7 @@ ARG KAGGLE_KEY RUN chmod a+x ./stats-docker.sh RUN chmod a+x ./script-stats.py +RUN useradd -r -u 111 jenkins # RUN ./download.sh 117928 RUN python3 ./script-download.py diff --git a/dvc.yaml b/dvc.yaml index 6e03281..c2245fe 100644 --- a/dvc.yaml +++ b/dvc.yaml @@ -9,7 +9,7 @@ stages: - data/Car_Prices_Poland_Kaggle_train.csv - data/Car_Prices_Poland_Kaggle_test.csv train: - cmd: python3 lab05_deepLearning.py 50 + cmd: python3 lab05_deepLearning.py 70 deps: - data/Car_Prices_Poland_Kaggle_dev.csv - data/Car_Prices_Poland_Kaggle_train.csv