Merge branch 'feature/dvc'
Some checks failed
444507-training/pipeline/head This commit looks good
s444507-dvc/pipeline/head There was a failure building this commit
s444507-predict-s444356/pipeline/head This commit looks good
s444507-evaluation/pipeline/head This commit looks good

This commit is contained in:
Adam Wojdyla 2022-06-05 10:11:09 +02:00
commit 37127b1b05
14 changed files with 280 additions and 7 deletions

3
.dvc/.gitignore vendored Normal file
View File

@ -0,0 +1,3 @@
/config.local
/tmp
/cache

6
.dvc/config Normal file
View File

@ -0,0 +1,6 @@
# DVC repository configuration: default remote plus the remote definitions.
[core]
# Name of the remote section below that is used by default.
remote = ium_ssh_remote
['remote "my_local_remote"']
# NOTE(review): absolute path inside one developer's home directory; it will
# not resolve on other machines or on CI - confirm it should be committed here.
url = /Users/adamwojdyla/Documents/Studia/Magisterskie/1_sem/IUM/ium_444507/dvcstore
['remote "ium_ssh_remote"']
# NOTE(review): Jenkinsfile_dvc registers this same remote name with a trailing
# "/ium-sftp" path, while this URL has no path - confirm which one is intended.
url = ssh://ium-sftp@tzietkiewicz.vm.wmi.amu.edu.pl

3
.dvcignore Normal file
View File

@ -0,0 +1,3 @@
# Add patterns of files dvc should ignore, which could improve
# the performance. Learn more at
# https://dvc.org/doc/user-guide/dvcignore

4
.gitignore vendored
View File

@ -152,10 +152,12 @@ fabric.properties
# kaggle # kaggle
kaggle.json kaggle.json
Car_Prices_Poland_Kaggle* Car_Prices_Poland_Kaggle*.csv
CarPrices* CarPrices*
IUM08/* IUM08/*
.DS_store .DS_store
*.db *.db
mlruns mlruns
my_model my_model
dvcstore
/prediction_results.csv

View File

@ -25,6 +25,9 @@ RUN python3 -m pip install matplotlib
RUN python3 -m pip install sacred RUN python3 -m pip install sacred
RUN python3 -m pip install pymongo RUN python3 -m pip install pymongo
RUN python3 -m pip install mlflow RUN python3 -m pip install mlflow
RUN python3 -m pip install dvc
RUN python3 -m pip install dvc[ssh] paramiko
RUN python3 -m pip freeze RUN python3 -m pip freeze
ENV PATH="/root/.local/bin:${PATH}" ENV PATH="/root/.local/bin:${PATH}"
@ -35,6 +38,7 @@ ARG KAGGLE_KEY
RUN chmod a+x ./stats-docker.sh RUN chmod a+x ./stats-docker.sh
RUN chmod a+x ./script-stats.py RUN chmod a+x ./script-stats.py
RUN useradd -r -u 111 jenkins
# RUN ./download.sh 117928 # RUN ./download.sh 117928
RUN python3 ./script-download.py RUN python3 ./script-download.py

28
Jenkinsfile_dvc Normal file
View File

@ -0,0 +1,28 @@
// Declarative pipeline: pulls DVC-tracked data over SSH, reproduces the DVC
// pipeline inside the dataset image and archives the resulting artifacts.
pipeline {
    agent {
        docker { image 's444507_create_dataset_image' }
    }
    parameters {
        buildSelector(defaultSelector: lastSuccessful(), description: 'Which build to use for copying artifacts.', name: 'BUILD_SELECTOR')
        gitParameter branchFilter: 'origin/(.*)', defaultValue: 'master', name: 'BRANCH', type: 'PT_BRANCH'
    }
    stages {
        stage('DVC') {
            steps {
                // Exposes the SSH private key file path as $IUM_SFTP_KEY for the enclosed steps.
                withCredentials(
                    [sshUserPrivateKey(credentialsId: '48ac7004-216e-4260-abba-1fe5db753e18', keyFileVariable: 'IUM_SFTP_KEY', passphraseVariable: '', usernameVariable: '')]) {
                    // --force: the committed .dvc/config already defines "ium_ssh_remote";
                    // without it `dvc remote add` aborts because the remote exists.
                    sh 'dvc remote add -d -f ium_ssh_remote ssh://ium-sftp@tzietkiewicz.vm.wmi.amu.edu.pl/ium-sftp'
                    // Quote the variable so a key path containing spaces or special
                    // characters is passed to dvc as a single argument.
                    sh 'dvc remote modify --local ium_ssh_remote keyfile "$IUM_SFTP_KEY"'
                    sh 'dvc pull'
                    sh 'dvc repro'
                }
            }
        }
    }
    post {
        success {
            archiveArtifacts artifacts: 'prediction_results.csv, *.pkl', followSymlinks: false
        }
    }
}

1
data/.gitignore vendored Normal file
View File

@ -0,0 +1 @@
/prepared

View File

@ -0,0 +1,4 @@
outs:
- md5: 9170e9b525149cb1f571f318cd604913
size: 9894367
path: Car_Prices_Poland_Kaggle.csv

46
dvc.lock Normal file
View File

@ -0,0 +1,46 @@
schema: '2.0'
stages:
prepare:
cmd: python3 script_prepare.py data/Car_Prices_Poland_Kaggle.csv
deps:
- path: data/Car_Prices_Poland_Kaggle.csv
md5: 9170e9b525149cb1f571f318cd604913
size: 9894367
- path: script_prepare.py
md5: f1dfe33a503f5acc687c53dee448f71b
size: 1899
outs:
- path: data/Car_Prices_Poland_Kaggle_dev.csv
md5: cf9355749edc79f588e264de5b2bf1f0
size: 1648309
- path: data/Car_Prices_Poland_Kaggle_test.csv
md5: cf9355749edc79f588e264de5b2bf1f0
size: 1648309
- path: data/Car_Prices_Poland_Kaggle_train.csv
md5: 8818f758e2de344a4b9ad712379b81e1
size: 6597472
train:
cmd: python3 lab05_deepLearning.py 50
deps:
- path: data/Car_Prices_Poland_Kaggle_dev.csv
md5: cf9355749edc79f588e264de5b2bf1f0
size: 1648309
- path: data/Car_Prices_Poland_Kaggle_test.csv
md5: cf9355749edc79f588e264de5b2bf1f0
size: 1648309
- path: data/Car_Prices_Poland_Kaggle_train.csv
md5: 8818f758e2de344a4b9ad712379b81e1
size: 6597472
outs:
- path: CarPrices_pytorch_model.pkl
md5: cff6a79945bbf839058a4fd1b2dcc98f
size: 30039
- path: prediction_results.csv
md5: 62b9e54cdfebc7f1dfb060e18e9b8738
size: 585197
evaluate:
cmd: python3 lab10_evaluate.py
deps:
- path: CarPrices_pytorch_model.pkl
md5: cff6a79945bbf839058a4fd1b2dcc98f
size: 30039

23
dvc.yaml Normal file
View File

@ -0,0 +1,23 @@
# DVC pipeline definition: prepare -> train -> evaluate.
# Every stage lists the script it runs as a dependency so that editing the
# script invalidates the stage on the next `dvc repro`.
stages:
  prepare:
    cmd: python3 script_prepare.py data/Car_Prices_Poland_Kaggle.csv
    deps:
      - data/Car_Prices_Poland_Kaggle.csv
      - script_prepare.py
    outs:
      - data/Car_Prices_Poland_Kaggle_dev.csv
      - data/Car_Prices_Poland_Kaggle_train.csv
      - data/Car_Prices_Poland_Kaggle_test.csv
  train:
    cmd: python3 lab05_deepLearning.py 70
    deps:
      - lab05_deepLearning.py
      - data/Car_Prices_Poland_Kaggle_dev.csv
      - data/Car_Prices_Poland_Kaggle_train.csv
      - data/Car_Prices_Poland_Kaggle_test.csv
    outs:
      - CarPrices_pytorch_model.pkl
      - prediction_results.csv
  evaluate:
    cmd: python3 lab10_evaluate.py
    deps:
      - lab10_evaluate.py
      - CarPrices_pytorch_model.pkl
      # lab10_evaluate.py reads the dev split to score the model.
      - data/Car_Prices_Poland_Kaggle_dev.csv

View File

@ -90,9 +90,9 @@ labels_test, features_test = prepare_labels_features(cars_dev)
x_test = Variable(torch.from_numpy(features_test)).float() x_test = Variable(torch.from_numpy(features_test)).float()
pred = model(x_test) pred = model(x_test)
pred = pred.detach().numpy() pred = pred.detach().numpy()
print_metrics(labels_test, pred) # print_metrics(labels_test, pred)
draw_plot() # draw_plot()

View File

@ -1,13 +1,10 @@
#!/usr/bin/python #!/usr/bin/python
from urllib.parse import urlparse from urllib.parse import urlparse
import mlflow
import numpy as np import numpy as np
import torch import torch
from torch import nn from torch import nn
from torch.autograd import Variable from torch.autograd import Variable
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score from sklearn.metrics import accuracy_score, f1_score
import torch.nn.functional as F import torch.nn.functional as F
import pandas as pd import pandas as pd

96
lab10_evaluate.py Normal file
View File

@ -0,0 +1,96 @@
#!/usr/bin/python
import torch
from torch import nn
import pandas as pd
from sklearn import preprocessing
import numpy as np
from torch.autograd import Variable
from sklearn.metrics import accuracy_score, f1_score
from csv import DictWriter
import torch.nn.functional as F
import sys
import os
import matplotlib.pyplot as plt
class Model(nn.Module):
    """Fully connected classifier: input_dim -> 100 -> 60 -> 5 class probabilities.

    Args:
        input_dim: number of features in each input sample.
    """

    def __init__(self, input_dim):
        super(Model, self).__init__()
        self.layer1 = nn.Linear(input_dim, 100)
        self.layer2 = nn.Linear(100, 60)
        self.layer3 = nn.Linear(60, 5)

    def forward(self, x):
        """Return softmax probabilities over the 5 outputs for each sample in ``x``.

        Args:
            x: float tensor whose last dimension has ``input_dim`` entries.

        Returns:
            Tensor with the same leading dimensions as ``x`` and a last
            dimension of size 5 that sums to 1.
        """
        x = F.relu(self.layer1(x))
        x = F.relu(self.layer2(x))
        # Pass dim explicitly: the implicit-dim form is deprecated, emits a
        # warning, and picks a non-class axis for inputs that are not 2-D.
        # For the usual (batch, features) input dim=-1 is the same axis as before.
        x = F.softmax(self.layer3(x), dim=-1)  # To check with the loss function
        return x
def prepare_labels_features(dataset):
    """Turn column '0' into integer labels and standardise the other columns.

    Rows with missing values are discarded first. The classes found by the
    label encoder are printed.

    Args:
        dataset: DataFrame containing a column named '0' plus feature columns.

    Returns:
        Tuple ``(labels, features)``: the encoded labels and the features
        after ``StandardScaler`` has been fitted and applied to them.
    """
    complete_rows = dataset.dropna()

    # Labels: encode the values of column '0' as integers.
    label_values = np.array(complete_rows['0'])
    encoder = preprocessing.LabelEncoder()
    encoder.fit(label_values)
    print(list(encoder.classes_))
    labels = encoder.transform(label_values)

    # Features: everything except column '0', standardised.
    raw_features = complete_rows.drop(['0'], axis=1).to_numpy()
    features = preprocessing.StandardScaler().fit_transform(raw_features)

    return labels, features
def print_metrics(test_labels, predictions):
    """Print weighted F1 and accuracy and optionally append them to ./metrics.csv.

    Args:
        test_labels: true class indices, one per sample.
        predictions: 2-D array of per-class scores; the column with the
            highest score in each row is taken as the predicted class.

    When the script was started with exactly one command-line argument, that
    argument is used as the build number and a row with the build number, F1
    and accuracy is appended to ./metrics.csv (a header row is written first
    if the file does not exist yet). Otherwise nothing is written.
    """
    # take column with max predicted score
    predicted_classes = np.argmax(predictions, axis=1)

    # Score against the argument; the previous code read the module-level
    # ``labels_test`` here, so F1 ignored the labels passed by the caller.
    f1 = f1_score(test_labels, predicted_classes, average='weighted')
    accuracy = accuracy_score(test_labels, predicted_classes)
    print(f"The F1_score metric is: {f1}")
    print(f"The accuracy metric is: {accuracy}")

    if len(sys.argv) != 2:
        return

    build_number = sys.argv[1]
    print(f"Build number: {build_number}")
    field_names = ['BUILD_NUMBER', 'F1', 'ACCURACY']
    # Named ``row`` so the builtin ``dict`` is not shadowed.
    row = {'BUILD_NUMBER': build_number, 'F1': f1, 'ACCURACY': accuracy}
    filename = "./metrics.csv"
    file_exists = os.path.isfile(filename)

    # newline='' is what the csv module expects for files it writes; the
    # ``with`` block closes the file, so no explicit close() is needed.
    with open(filename, 'a', newline='') as metrics_file:
        dictwriter_object = DictWriter(metrics_file, fieldnames=field_names)
        if not file_exists:
            dictwriter_object.writeheader()
        dictwriter_object.writerow(row)
"""
Load model and data
"""
# NOTE: torch.load unpickles the file, so only load model files you trust.
model = torch.load("CarPrices_pytorch_model.pkl")
# Read five columns of the dev split and name them '0'..'4'.
column_names = [str(i) for i in range(5)]
cars_dev = pd.read_csv(
    'data/Car_Prices_Poland_Kaggle_dev.csv',
    usecols=[1, 4, 5, 6, 10],
    sep=',',
    names=column_names,
)
"""
Prepare data
"""
# Keep only the rows whose column '0' is one of the five selected values.
selected_values = ['audi', 'bmw', 'ford', 'opel', 'volkswagen']
cars_dev = cars_dev.loc[cars_dev['0'].isin(selected_values)]
labels_test, features_test = prepare_labels_features(cars_dev)
x_test = Variable(torch.from_numpy(features_test)).float()
"""
Make predictions
"""
# Run the model and convert its output to a NumPy array for the metrics.
pred = model(x_test).detach().numpy()
print_metrics(labels_test, pred)

60
script_prepare.py Executable file
View File

@ -0,0 +1,60 @@
import subprocess
import sys
import pandas as pd
import os
import numpy as np
# The dataset CSV path is taken from the first command-line argument.
try:
    dataset_path = sys.argv[1]
except IndexError as e:
    print("Exception while retrieving dataset path")
    print(e)
    # Nothing below can run without the path; stop with a non-zero status
    # instead of continuing and failing later with a NameError.
    sys.exit(1)
def divide_dataset(dataset, path):
    """Split dataset to dev, train, test datasets.

    The header line of the CSV at ``path`` is skipped, the remaining lines are
    shuffled and written to ``data/Car_Prices_Poland_Kaggle_dev.csv``,
    ``..._test.csv`` and ``..._train.csv``. Dev and test each receive
    ``len(dataset) // 6`` lines; train receives everything that is left.

    Args:
        dataset: sized object; only ``len(dataset)`` is used, to size the
            dev and test parts.
        path: path of the CSV file whose lines are split.
    """
    # Function-scope import keeps this block self-contained.
    import random

    print('Shuffle dataset...')
    # Work on raw bytes so lines are copied through unchanged.
    with open(path, 'rb') as source:
        rows = source.readlines()[1:]  # drop the header line
    # Make sure a final line without a newline cannot merge with its neighbour.
    rows = [row if row.endswith(b'\n') else row + b'\n' for row in rows]
    random.shuffle(rows)

    len1 = len(dataset) // 6
    len2 = len1 * 2

    print('Dividing dataset...')
    parts = {
        'dev': rows[:len1],
        # The earlier `head -n len1 | tail -n len1` selected the dev rows
        # again; test must be the next len1 rows after dev.
        'test': rows[len1:len2],
        'train': rows[len2:],
    }
    for name, part in parts.items():
        with open(f'data/Car_Prices_Poland_Kaggle_{name}.csv', 'wb') as target:
            target.writelines(part)

    # Reports whether the number of rows written equals the dataset length.
    written = sum(len(part) for part in parts.values())
    print("Len match: " + str(written == len(dataset)))

    # Line counts of the files just written (train, dev, test).
    for name in ('train', 'dev', 'test'):
        print(len(parts[name]))

    print('Dataset devided')
def normalize_dataset(dataset):
    """Drop unnecessary columns and set numeric values to [0,1] range

    The columns "Unnamed: 0" and "generation_name" are removed, rows with
    missing values are dropped, and every numeric column is min-max scaled.
    The input frame is left unmodified.

    Args:
        dataset: DataFrame containing at least the two columns to drop.

    Returns:
        A new DataFrame with the cleaned and scaled data.

    Raises:
        KeyError: if either of the columns to drop is missing.
    """
    print('--------------- Initial dataset length ---------------')
    print(len(dataset))

    # drop columns (non in-place, so the caller's frame is not mutated) and
    # copy so the column assignments below write to an independent frame
    cleaned = dataset.drop(columns=["Unnamed: 0", "generation_name"]).dropna().copy()

    # normalize numbers to [0, 1]
    # Columns are chosen by dtype rather than by inspecting one row, so this
    # also works for frames with fewer than two rows and any numeric width.
    for column in cleaned.select_dtypes(include='number').columns:
        lowest = cleaned[column].min()
        span = cleaned[column].max() - lowest
        if span == 0:
            # Constant column: avoid 0/0 (NaN) and map every value to 0.
            cleaned[column] = 0.0
        else:
            cleaned[column] = (cleaned[column] - lowest) / span

    return cleaned
# Script body: load the CSV given on the command line, clean and scale it,
# then write the dev/test/train split files.
cars = pd.read_csv(dataset_path)
df = normalize_dataset(pd.DataFrame(cars))
divide_dataset(df, dataset_path)