diff --git a/.dvc/.gitignore b/.dvc/.gitignore new file mode 100644 index 0000000..528f30c --- /dev/null +++ b/.dvc/.gitignore @@ -0,0 +1,3 @@ +/config.local +/tmp +/cache diff --git a/.dvc/config b/.dvc/config new file mode 100644 index 0000000..8d28287 --- /dev/null +++ b/.dvc/config @@ -0,0 +1,6 @@ +[core] + remote = ium_ssh_remote +['remote "my_local_remote"'] + url = /Users/adamwojdyla/Documents/Studia/Magisterskie/1_sem/IUM/ium_444507/dvcstore +['remote "ium_ssh_remote"'] + url = ssh://ium-sftp@tzietkiewicz.vm.wmi.amu.edu.pl diff --git a/.dvcignore b/.dvcignore new file mode 100644 index 0000000..5197305 --- /dev/null +++ b/.dvcignore @@ -0,0 +1,3 @@ +# Add patterns of files dvc should ignore, which could improve +# the performance. Learn more at +# https://dvc.org/doc/user-guide/dvcignore diff --git a/.gitignore b/.gitignore index 6e381fa..3ff8f0c 100644 --- a/.gitignore +++ b/.gitignore @@ -152,10 +152,12 @@ fabric.properties # kaggle kaggle.json -Car_Prices_Poland_Kaggle* +Car_Prices_Poland_Kaggle*.csv CarPrices* IUM08/* .DS_store *.db mlruns -my_model \ No newline at end of file +my_model +dvcstore +/prediction_results.csv diff --git a/Dockerfile b/Dockerfile index bc14a6b..45e5519 100644 --- a/Dockerfile +++ b/Dockerfile @@ -25,6 +25,9 @@ RUN python3 -m pip install matplotlib RUN python3 -m pip install sacred RUN python3 -m pip install pymongo RUN python3 -m pip install mlflow +RUN python3 -m pip install dvc +RUN python3 -m pip install dvc[ssh] paramiko + RUN python3 -m pip freeze ENV PATH="/root/.local/bin:${PATH}" @@ -35,6 +38,7 @@ ARG KAGGLE_KEY RUN chmod a+x ./stats-docker.sh RUN chmod a+x ./script-stats.py +RUN useradd -r -u 111 jenkins # RUN ./download.sh 117928 RUN python3 ./script-download.py diff --git a/Jenkinsfile_dvc b/Jenkinsfile_dvc new file mode 100644 index 0000000..e8660f2 --- /dev/null +++ b/Jenkinsfile_dvc @@ -0,0 +1,28 @@ +pipeline { + agent { + docker { image 's444507_create_dataset_image' } + } + parameters { + 
buildSelector(defaultSelector: lastSuccessful(), description: 'Which build to use for copying artifacts.', name: 'BUILD_SELECTOR') + gitParameter branchFilter: 'origin/(.*)', defaultValue: 'master', name: 'BRANCH', type: 'PT_BRANCH' + + } + stages { + stage('DVC') { + steps { + withCredentials( + [sshUserPrivateKey(credentialsId: '48ac7004-216e-4260-abba-1fe5db753e18', keyFileVariable: 'IUM_SFTP_KEY', passphraseVariable: '', usernameVariable: '')]) { + sh 'dvc remote add -d ium_ssh_remote ssh://ium-sftp@tzietkiewicz.vm.wmi.amu.edu.pl/ium-sftp' + sh 'dvc remote modify --local ium_ssh_remote keyfile $IUM_SFTP_KEY' + sh 'dvc pull' + sh 'dvc repro' + } + } + } + } + post { + success { + archiveArtifacts artifacts: 'prediction_results.csv, *.pkl', followSymlinks: false + } + } +} \ No newline at end of file diff --git a/data/.gitignore b/data/.gitignore new file mode 100644 index 0000000..348b87e --- /dev/null +++ b/data/.gitignore @@ -0,0 +1 @@ +/prepared diff --git a/data/Car_Prices_Poland_Kaggle.csv.dvc b/data/Car_Prices_Poland_Kaggle.csv.dvc new file mode 100755 index 0000000..e12d1da --- /dev/null +++ b/data/Car_Prices_Poland_Kaggle.csv.dvc @@ -0,0 +1,4 @@ +outs: +- md5: 9170e9b525149cb1f571f318cd604913 + size: 9894367 + path: Car_Prices_Poland_Kaggle.csv diff --git a/dvc.lock b/dvc.lock new file mode 100644 index 0000000..905c8c1 --- /dev/null +++ b/dvc.lock @@ -0,0 +1,46 @@ +schema: '2.0' +stages: + prepare: + cmd: python3 script_prepare.py data/Car_Prices_Poland_Kaggle.csv + deps: + - path: data/Car_Prices_Poland_Kaggle.csv + md5: 9170e9b525149cb1f571f318cd604913 + size: 9894367 + - path: script_prepare.py + md5: f1dfe33a503f5acc687c53dee448f71b + size: 1899 + outs: + - path: data/Car_Prices_Poland_Kaggle_dev.csv + md5: cf9355749edc79f588e264de5b2bf1f0 + size: 1648309 + - path: data/Car_Prices_Poland_Kaggle_test.csv + md5: cf9355749edc79f588e264de5b2bf1f0 + size: 1648309 + - path: data/Car_Prices_Poland_Kaggle_train.csv + md5: 8818f758e2de344a4b9ad712379b81e1 + 
size: 6597472 + train: + cmd: python3 lab05_deepLearning.py 50 + deps: + - path: data/Car_Prices_Poland_Kaggle_dev.csv + md5: cf9355749edc79f588e264de5b2bf1f0 + size: 1648309 + - path: data/Car_Prices_Poland_Kaggle_test.csv + md5: cf9355749edc79f588e264de5b2bf1f0 + size: 1648309 + - path: data/Car_Prices_Poland_Kaggle_train.csv + md5: 8818f758e2de344a4b9ad712379b81e1 + size: 6597472 + outs: + - path: CarPrices_pytorch_model.pkl + md5: cff6a79945bbf839058a4fd1b2dcc98f + size: 30039 + - path: prediction_results.csv + md5: 62b9e54cdfebc7f1dfb060e18e9b8738 + size: 585197 + evaluate: + cmd: python3 lab10_evaluate.py + deps: + - path: CarPrices_pytorch_model.pkl + md5: cff6a79945bbf839058a4fd1b2dcc98f + size: 30039 diff --git a/dvc.yaml b/dvc.yaml new file mode 100644 index 0000000..c2245fe --- /dev/null +++ b/dvc.yaml @@ -0,0 +1,23 @@ +stages: + prepare: + cmd: python3 script_prepare.py data/Car_Prices_Poland_Kaggle.csv + deps: + - data/Car_Prices_Poland_Kaggle.csv + - script_prepare.py + outs: + - data/Car_Prices_Poland_Kaggle_dev.csv + - data/Car_Prices_Poland_Kaggle_train.csv + - data/Car_Prices_Poland_Kaggle_test.csv + train: + cmd: python3 lab05_deepLearning.py 70 + deps: + - data/Car_Prices_Poland_Kaggle_dev.csv + - data/Car_Prices_Poland_Kaggle_train.csv + - data/Car_Prices_Poland_Kaggle_test.csv + outs: + - CarPrices_pytorch_model.pkl + - prediction_results.csv + evaluate: + cmd: python3 lab10_evaluate.py + deps: + - CarPrices_pytorch_model.pkl diff --git a/lab06_evaluation.py b/lab06_evaluation.py index 3a1c334..d9d9721 100644 --- a/lab06_evaluation.py +++ b/lab06_evaluation.py @@ -90,9 +90,9 @@ labels_test, features_test = prepare_labels_features(cars_dev) x_test = Variable(torch.from_numpy(features_test)).float() pred = model(x_test) pred = pred.detach().numpy() -print_metrics(labels_test, pred) +# print_metrics(labels_test, pred) -draw_plot() +# draw_plot() diff --git a/lab08_deepLearining_mlflow.py b/lab08_deepLearining_mlflow.py index b16ffe7..cc11b83 
class Model(nn.Module):
    """Fully connected classifier that outputs probabilities for 5 classes.

    Layout: ``input_dim -> 100 -> 60 -> 5``, with ReLU after the first two
    linear layers and a softmax over the output of the last one.
    """

    def __init__(self, input_dim):
        """Create the three linear layers.

        Args:
            input_dim: Number of input features per sample.
        """
        super().__init__()
        # Attribute names are unchanged so that models saved with the
        # original class definition keep loading.
        self.layer1 = nn.Linear(input_dim, 100)
        self.layer2 = nn.Linear(100, 60)
        self.layer3 = nn.Linear(60, 5)

    def forward(self, x):
        """Return class probabilities for ``x``.

        Args:
            x: Float tensor whose last dimension has ``input_dim`` entries,
                either a single sample or a batch of samples.

        Returns:
            Tensor with the same leading shape as ``x`` and a last dimension
            of size 5 that sums to 1.
        """
        x = F.relu(self.layer1(x))
        x = F.relu(self.layer2(x))
        # Softmax over the class dimension. The dimension is given explicitly
        # because the implicit choice is deprecated and emits a warning; -1
        # matches the implicit choice for both 1-D and 2-D inputs.
        # NOTE(review): the original author flagged that softmax here should
        # be checked against the loss function used for training.
        return F.softmax(self.layer3(x), dim=-1)
def divide_dataset(dataset, path):
    """Split the CSV file at ``path`` into shuffled dev, test and train files.

    The header row of ``path`` is dropped, the remaining rows are shuffled and
    written to three files under ``data/``:

    * ``Car_Prices_Poland_Kaggle_dev.csv``   - the first ``len(dataset) // 6`` rows
    * ``Car_Prices_Poland_Kaggle_test.csv``  - the next ``len(dataset) // 6`` rows
    * ``Car_Prices_Poland_Kaggle_train.csv`` - all remaining rows

    The previous shell pipeline built the test file with
    ``head -n N | tail -n N``, which yields exactly the dev rows again; the
    three splits are now disjoint.

    Args:
        dataset: Sized object (e.g. a DataFrame); only ``len(dataset)`` is
            used, to derive the dev/test split size.
        path: Path of the source CSV file whose rows are split.

    Returns:
        None. The three split files are written as a side effect.
    """
    # Local import so this block does not depend on the file's import header.
    import random

    out_dir = 'data'
    part_len = len(dataset) // 6

    print('Shuffle dataset...')
    # Work on raw bytes, like the original head/tail/shuf pipeline did, so no
    # assumption about the file's text encoding is needed.
    with open(path, 'rb') as src:
        rows = src.readlines()[1:]  # skip the header row
    # A final row without a trailing newline would otherwise be glued to the
    # following row once shuffled.
    rows = [row if row.endswith(b'\n') else row + b'\n' for row in rows]
    random.shuffle(rows)

    print('Dividing dataset...')
    parts = {
        'dev': rows[:part_len],
        'test': rows[part_len:part_len * 2],
        'train': rows[part_len * 2:],
    }
    os.makedirs(out_dir, exist_ok=True)
    for name, part in parts.items():
        target = os.path.join(out_dir, f'Car_Prices_Poland_Kaggle_{name}.csv')
        with open(target, 'wb') as dst:
            dst.writelines(part)

    # Sanity check: do the written rows add up to the size of ``dataset``?
    # They can differ when ``dataset`` was filtered after being read from
    # ``path``.
    written = sum(len(part) for part in parts.values())
    print("Len match: " + str(written == len(dataset)))
    for name in ('train', 'dev', 'test'):
        print(len(parts[name]))

    print('Dataset devided')