dvc repro
Some checks failed
s444507-predict-s444356/pipeline/head There was a failure building this commit
s444507-evaluation/pipeline/head This commit looks good
444507-training/pipeline/head This commit looks good

This commit is contained in:
Adam Wojdyla 2022-06-04 22:15:20 +02:00
parent a9ad0e2ee1
commit 54ce588c87
8 changed files with 229 additions and 5 deletions

1
.gitignore vendored
View File

@ -160,3 +160,4 @@ IUM08/*
mlruns mlruns
my_model my_model
dvcstore dvcstore
/prediction_results.csv

1
data/.gitignore vendored Normal file
View File

@ -0,0 +1 @@
/prepared

46
dvc.lock Normal file
View File

@ -0,0 +1,46 @@
schema: '2.0'
stages:
prepare:
cmd: python3 script_prepare.py data/Car_Prices_Poland_Kaggle.csv
deps:
- path: data/Car_Prices_Poland_Kaggle.csv
md5: 9170e9b525149cb1f571f318cd604913
size: 9894367
- path: script_prepare.py
md5: f1dfe33a503f5acc687c53dee448f71b
size: 1899
outs:
- path: data/Car_Prices_Poland_Kaggle_dev.csv
md5: cf9355749edc79f588e264de5b2bf1f0
size: 1648309
- path: data/Car_Prices_Poland_Kaggle_test.csv
md5: cf9355749edc79f588e264de5b2bf1f0
size: 1648309
- path: data/Car_Prices_Poland_Kaggle_train.csv
md5: 8818f758e2de344a4b9ad712379b81e1
size: 6597472
train:
cmd: python3 lab05_deepLearning.py 50
deps:
- path: data/Car_Prices_Poland_Kaggle_dev.csv
md5: cf9355749edc79f588e264de5b2bf1f0
size: 1648309
- path: data/Car_Prices_Poland_Kaggle_test.csv
md5: cf9355749edc79f588e264de5b2bf1f0
size: 1648309
- path: data/Car_Prices_Poland_Kaggle_train.csv
md5: 8818f758e2de344a4b9ad712379b81e1
size: 6597472
outs:
- path: CarPrices_pytorch_model.pkl
md5: cff6a79945bbf839058a4fd1b2dcc98f
size: 30039
- path: prediction_results.csv
md5: 62b9e54cdfebc7f1dfb060e18e9b8738
size: 585197
evaluate:
cmd: python3 lab10_evaluate.py
deps:
- path: CarPrices_pytorch_model.pkl
md5: cff6a79945bbf839058a4fd1b2dcc98f
size: 30039

23
dvc.yaml Normal file
View File

@ -0,0 +1,23 @@
# DVC pipeline: prepare -> train -> evaluate.
# Every stage lists the script it runs as a dependency so that `dvc repro`
# re-executes the stage when the code changes, not only when the data does.
stages:
  prepare:
    cmd: python3 script_prepare.py data/Car_Prices_Poland_Kaggle.csv
    deps:
      - data/Car_Prices_Poland_Kaggle.csv
      - script_prepare.py
    outs:
      - data/Car_Prices_Poland_Kaggle_dev.csv
      - data/Car_Prices_Poland_Kaggle_train.csv
      - data/Car_Prices_Poland_Kaggle_test.csv
  train:
    cmd: python3 lab05_deepLearning.py 50
    deps:
      - lab05_deepLearning.py
      - data/Car_Prices_Poland_Kaggle_dev.csv
      - data/Car_Prices_Poland_Kaggle_train.csv
      - data/Car_Prices_Poland_Kaggle_test.csv
    outs:
      - CarPrices_pytorch_model.pkl
      - prediction_results.csv
  evaluate:
    cmd: python3 lab10_evaluate.py
    deps:
      - lab10_evaluate.py
      - CarPrices_pytorch_model.pkl
      # lab10_evaluate.py reads the dev split directly, so it is a dependency too.
      - data/Car_Prices_Poland_Kaggle_dev.csv

View File

@ -90,9 +90,9 @@ labels_test, features_test = prepare_labels_features(cars_dev)
x_test = Variable(torch.from_numpy(features_test)).float() x_test = Variable(torch.from_numpy(features_test)).float()
pred = model(x_test) pred = model(x_test)
pred = pred.detach().numpy() pred = pred.detach().numpy()
print_metrics(labels_test, pred) # print_metrics(labels_test, pred)
draw_plot() # draw_plot()

View File

@ -1,13 +1,10 @@
#!/usr/bin/python #!/usr/bin/python
from urllib.parse import urlparse from urllib.parse import urlparse
import mlflow
import numpy as np import numpy as np
import torch import torch
from torch import nn from torch import nn
from torch.autograd import Variable from torch.autograd import Variable
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score from sklearn.metrics import accuracy_score, f1_score
import torch.nn.functional as F import torch.nn.functional as F
import pandas as pd import pandas as pd

96
lab10_evaluate.py Normal file
View File

@ -0,0 +1,96 @@
#!/usr/bin/python
import torch
from torch import nn
import pandas as pd
from sklearn import preprocessing
import numpy as np
from torch.autograd import Variable
from sklearn.metrics import accuracy_score, f1_score
from csv import DictWriter
import torch.nn.functional as F
import sys
import os
import matplotlib.pyplot as plt
class Model(nn.Module):
    """Fully connected classifier: input_dim -> 100 -> 60 -> 5 class scores.

    NOTE(review): this script loads a pickled model with ``torch.load``;
    presumably that pickle refers to this class by name, so the class name
    and layer attribute names should stay as they are - confirm against the
    training script.
    """

    def __init__(self, input_dim):
        """Create the three linear layers.

        Args:
            input_dim: Number of input features per sample.
        """
        super().__init__()
        self.layer1 = nn.Linear(input_dim, 100)
        self.layer2 = nn.Linear(100, 60)
        self.layer3 = nn.Linear(60, 5)

    def forward(self, x):
        """Return per-class probabilities for ``x``.

        Args:
            x: Float tensor whose last dimension has ``input_dim`` features.

        Returns:
            Tensor with 5 values along the last dimension, summing to 1.
        """
        x = F.relu(self.layer1(x))
        x = F.relu(self.layer2(x))
        # Explicit dim: the implicit choice is deprecated and emits a warning.
        # For (batch, features) input the last dimension is the class axis.
        # TODO: check against the loss function used in training - a loss that
        # applies its own (log-)softmax would make this one redundant.
        x = F.softmax(self.layer3(x), dim=-1)
        return x
def prepare_labels_features(dataset):
    """Split a frame into encoded labels and standardized features.

    Rows containing missing values are dropped first. Column ``'0'`` is
    label-encoded into integer class ids (the discovered classes are printed),
    and every remaining column is scaled with a ``StandardScaler`` fitted on
    this same data.

    Args:
        dataset: DataFrame with the label in column ``'0'`` and the feature
            values in the other columns.

    Returns:
        Tuple ``(labels, features)`` of numpy arrays.
    """
    cleaned = dataset.dropna()

    # Encode the label column as integer class ids.
    raw_labels = np.array(cleaned['0'])
    encoder = preprocessing.LabelEncoder()
    encoder.fit(raw_labels)
    print(list(encoder.classes_))
    labels = encoder.transform(raw_labels)

    # Everything except the label column is a feature.
    raw_features = cleaned.drop(columns=['0']).to_numpy()
    features = preprocessing.StandardScaler().fit_transform(raw_features)
    return labels, features
def print_metrics(test_labels, predictions):
    """Print weighted F1 and accuracy and optionally append them to a CSV.

    The predicted class of each sample is the column with the highest score.
    When the script is started with exactly one command-line argument, that
    argument is treated as a build number and a row is appended to
    ``./metrics.csv`` (the header is written only when the file is new).

    Args:
        test_labels: True class ids, one per sample.
        predictions: 2-D array of per-class scores, one row per sample.
    """
    # Take the column with the max predicted score as the predicted class.
    predicted_classes = np.argmax(predictions, axis=1)
    # Score against the argument, not a module-level global of similar name.
    f1 = f1_score(test_labels, predicted_classes, average='weighted')
    accuracy = accuracy_score(test_labels, predicted_classes)
    print(f"The F1_score metric is: {f1}")
    print(f"The accuracy metric is: {accuracy}")

    if len(sys.argv) != 2:
        return

    build_number = sys.argv[1]
    print(f"Build number: {build_number}")
    field_names = ['BUILD_NUMBER', 'F1', 'ACCURACY']
    row = {'BUILD_NUMBER': build_number, 'F1': f1, 'ACCURACY': accuracy}
    filename = "./metrics.csv"
    file_exists = os.path.isfile(filename)
    # newline='' lets the csv module control line endings itself; the with
    # block closes the file, so no explicit close() is needed.
    with open(filename, 'a', newline='') as metrics_file:
        writer = DictWriter(metrics_file, fieldnames=field_names)
        if not file_exists:
            writer.writeheader()
        writer.writerow(row)
# --- Load model and data ---
# NOTE: torch.load unpickles the file; only load model files you trust.
model = torch.load("CarPrices_pytorch_model.pkl")
cars_dev = pd.read_csv(
    'data/Car_Prices_Poland_Kaggle_dev.csv',
    usecols=[1, 4, 5, 6, 10],
    sep=',',
    names=['0', '1', '2', '3', '4'],
)

# --- Prepare data ---
# Keep only the five makes the classifier distinguishes (column '0').
cars_dev = cars_dev.loc[
    cars_dev['0'].isin(['audi', 'bmw', 'ford', 'opel', 'volkswagen'])
]
labels_test, features_test = prepare_labels_features(cars_dev)
x_test = Variable(torch.from_numpy(features_test)).float()

# --- Make predictions ---
pred = model(x_test).detach().numpy()
print_metrics(labels_test, pred)

60
script_prepare.py Executable file
View File

@ -0,0 +1,60 @@
import subprocess
import sys
import pandas as pd
import os
import numpy as np
# The path of the dataset CSV is the first (required) command-line argument.
try:
    dataset_path = sys.argv[1]
except IndexError:
    # Stop here with a clear message and a non-zero exit status; continuing
    # would only fail later with a NameError on the undefined dataset_path.
    sys.exit(
        "Exception while retrieving dataset path: "
        "expected the dataset CSV path as the first argument"
    )
def divide_dataset(dataset, path):
    """Shuffle the rows of the CSV at *path* and split them into three files.

    The header line of *path* is skipped, the remaining lines are shuffled
    and written (without a header) to ``data/Car_Prices_Poland_Kaggle_dev.csv``,
    ``..._test.csv`` and ``..._train.csv``. Dev and test each receive
    ``len(dataset) // 6`` lines; train receives everything that is left.
    The three parts are disjoint.

    Note that the rows come from the file at *path*; *dataset* is only used
    for its length.

    Args:
        dataset: Sized object whose length determines the dev/test size.
        path: Path of the source CSV file (first line is the header).
    """
    import random  # local import: only this function needs it

    print('Shuffle dataset...')
    # Work on raw bytes so the rows are copied through unchanged.
    with open(path, 'rb') as source:
        rows = source.readlines()[1:]  # drop the header line
    if rows and not rows[-1].endswith(b'\n'):
        # Terminate the last line so shuffling cannot glue two rows together.
        rows[-1] += b'\n'
    random.shuffle(rows)

    len1 = len(dataset) // 6
    print('Dividing dataset...')
    parts = {
        'dev': rows[:len1],
        # The test split is the *next* len1 rows, not the dev rows again.
        'test': rows[len1:2 * len1],
        'train': rows[2 * len1:],
    }
    for name, part in parts.items():
        with open(f'data/Car_Prices_Poland_Kaggle_{name}.csv', 'wb') as target:
            target.writelines(part)

    # True only when the file at *path* has as many data rows as *dataset*.
    written = sum(len(part) for part in parts.values())
    print("Len match: " + str(written == len(dataset)))
    print(len(parts['train']))
    print(len(parts['dev']))
    print(len(parts['test']))
    print('Dataset devided')
def normalize_dataset(dataset):
    """Drop unused columns and min-max scale numeric columns to [0, 1].

    The columns ``"Unnamed: 0"`` and ``"generation_name"`` are removed, rows
    with missing values are dropped, and every numeric column is rescaled as
    ``(value - min) / (max - min)``. A column whose values are all equal is
    set to ``0.0`` instead of dividing by zero. The input frame is left
    untouched; a new frame is returned.

    Args:
        dataset: DataFrame containing at least the two dropped columns.

    Returns:
        A new, normalized DataFrame.

    Raises:
        KeyError: If one of the columns to drop is missing.
    """
    print('--------------- Initial dataset length ---------------')
    print(len(dataset))

    # Drop columns without mutating the caller's frame, then discard
    # incomplete rows; copy() makes the column assignments below safe.
    cleaned = dataset.drop(columns=["Unnamed: 0", "generation_name"])
    cleaned = cleaned.dropna().copy()

    # Normalize numbers to [0, 1]; dtype-based selection also works for
    # frames with fewer than two rows.
    for column in cleaned.select_dtypes(include="number").columns:
        lowest = cleaned[column].min()
        span = cleaned[column].max() - lowest
        if span == 0:
            cleaned[column] = 0.0
        else:
            cleaned[column] = (cleaned[column] - lowest) / span
    return cleaned
# Script entry: load the raw CSV, normalize it, then write the splits.
cars = pd.read_csv(dataset_path)
df = normalize_dataset(pd.DataFrame(cars))
divide_dataset(df, dataset_path)