added evaluation
All checks were successful
444507-training/pipeline/head This commit looks good

This commit is contained in:
Adam Wojdyla 2022-05-02 00:10:35 +02:00
parent be740d8b71
commit 3d6749570d
2 changed files with 102 additions and 22 deletions

View File

@ -6,7 +6,7 @@ from torch import nn
from torch.autograd import Variable from torch.autograd import Variable
from sklearn.datasets import load_iris from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score from sklearn.metrics import accuracy_score, f1_score
import torch.nn.functional as F import torch.nn.functional as F
import pandas as pd import pandas as pd
from sklearn import preprocessing from sklearn import preprocessing
@ -33,28 +33,31 @@ def load_dataset_raw():
return cars return cars
def remove_rows(dataset): def load_dataset_files():
# dataset.drop(dataset[dataset['mark'] == 'alfa-romeo'].index, inplace=True) """ Load shuffled, split dev and train files from .csv files. """
# dataset.drop(dataset[dataset['mark'] == 'chevrolet'].index, inplace=True)
# dataset.drop(dataset[dataset['mark'] == 'mitsubishi'].index, inplace=True)
# dataset.drop(dataset[dataset['mark'] == 'mini'].index, inplace=True)
# audi bmw ford opel volkswagen
new_data = dataset.loc[(dataset['mark'] == 'audi') | (dataset['mark'] == 'bmw') | (dataset['mark'] == 'ford') | (dataset['mark'] == 'opel') | (dataset['mark'] == 'volkswagen')] cars_dev = pd.read_csv('./Car_Prices_Poland_Kaggle_dev.csv', usecols=[1, 4, 5, 6, 10], sep=',', names= [str(i) for i in range(5)])
return new_data cars_train = pd.read_csv('./Car_Prices_Poland_Kaggle_train.csv', usecols=[1, 4, 5, 6, 10], sep=',', names= [str(i) for i in range(5)])
# dataset = dataset.drop(dataset)
# return dataset return cars_dev, cars_train
def prepare_dataset_raw(dataset): def remove_rows(data_dev, data_train):
dev_removed_rows = data_dev.loc[(data_dev['0'] == 'audi') | (data_dev['0'] == 'bmw') | (data_dev['0'] == 'ford') | (data_dev['0'] == 'opel') | (data_dev['0'] == 'volkswagen')]
train_removed_rows = data_train.loc[(data_train['0'] == 'audi') | (data_train['0'] == 'bmw') | (data_train['0'] == 'ford') | (data_train['0'] == 'opel') | (data_train['0'] == 'volkswagen')]
return dev_removed_rows, train_removed_rows
def prepare_labels_features(dataset):
""" Label make column""" """ Label make column"""
le = preprocessing.LabelEncoder() le = preprocessing.LabelEncoder()
mark_column = np.array(dataset[:]['mark']) mark_column = np.array(dataset[:]['0'])
le.fit(mark_column) le.fit(mark_column)
print(list(le.classes_)) print(list(le.classes_))
lab = le.transform(mark_column) lab = le.transform(mark_column)
feat = dataset.drop(['mark'], axis=1).to_numpy() feat = dataset.drop(['0'], axis=1).to_numpy()
mm_scaler = preprocessing.MinMaxScaler() mm_scaler = preprocessing.MinMaxScaler()
feat = mm_scaler.fit_transform(feat) feat = mm_scaler.fit_transform(feat)
@ -62,6 +65,9 @@ def prepare_dataset_raw(dataset):
return lab, feat return lab, feat
# def draw_plot(lbl): # def draw_plot(lbl):
# need to import matplotlib to work # need to import matplotlib to work
# plt.hist(lbl, bins=[i for i in range(len(set(lbl)))], edgecolor="black") # plt.hist(lbl, bins=[i for i in range(len(set(lbl)))], edgecolor="black")
@ -70,17 +76,15 @@ def prepare_dataset_raw(dataset):
# Prepare dataset # Prepare dataset
print("Loading dataset...") print("Loading dataset...")
dataset = load_dataset_raw() dev, train = load_dataset_files()
print("Dataset loaded") print("Dataset loaded")
print("Preparing dataset...") print("Preparing dataset...")
dataset = remove_rows(dataset) dev, train = remove_rows(dev, train)
labels, features = prepare_dataset_raw(dataset) labels_train, features_train = prepare_labels_features(train)
labels_test, features_test = prepare_labels_features(dev)
print("Dataset prepared") print("Dataset prepared")
features_train, features_test, labels_train, labels_test = train_test_split(features, labels, random_state=42,
shuffle=True)
# Training # Training
model = Model(features_train.shape[1]) model = Model(features_train.shape[1])
optimizer = torch.optim.Adam(model.parameters(), lr=0.01) optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
@ -92,7 +96,7 @@ try:
except Exception as e: except Exception as e:
print(e) print(e)
print("Setting default epochs value to 1000.") print("Setting default epochs value to 1000.")
epochs = 1000 epochs = 100
print(f"Number of epochs: {epochs}") print(f"Number of epochs: {epochs}")
@ -113,7 +117,8 @@ print("Model training finished")
x_test = Variable(torch.from_numpy(features_test)).float() x_test = Variable(torch.from_numpy(features_test)).float()
pred = model(x_test) pred = model(x_test)
pred = pred.detach().numpy() pred = pred.detach().numpy()
print("The accuracy is", accuracy_score(labels_test, np.argmax(pred, axis=1))) print(f"The accuracy metric is: {accuracy_score(labels_test, np.argmax(pred, axis=1))}")
# Checking for first value # Checking for first value
# print(np.argmax(model(x_test[0]).detach().numpy(), axis=0)) # print(np.argmax(model(x_test[0]).detach().numpy(), axis=0))

75
lab06_evaluation.py Normal file
View File

@ -0,0 +1,75 @@
#!/usr/bin/python
import torch
from torch import nn
import pandas as pd
from sklearn import preprocessing
import numpy as np
from torch.autograd import Variable
from sklearn.metrics import accuracy_score, f1_score
from csv import DictWriter
import torch.nn.functional as F
import sys
class Model(nn.Module):
    """Feed-forward classifier: input -> 100 -> 60 -> 5 class scores.

    The 5 outputs correspond to the 5 car makes kept by the data
    preparation step (audi, bmw, ford, opel, volkswagen).
    """

    def __init__(self, input_dim):
        """input_dim: number of numeric features per sample."""
        super(Model, self).__init__()
        self.layer1 = nn.Linear(input_dim, 100)
        self.layer2 = nn.Linear(100, 60)
        self.layer3 = nn.Linear(60, 5)  # one logit per car make

    def forward(self, x):
        """Return per-class probabilities for a sample or batch."""
        x = F.relu(self.layer1(x))
        x = F.relu(self.layer2(x))
        # dim=-1 reproduces the dimension PyTorch previously inferred
        # implicitly (last dim for 2-D batches, dim 0 for a single 1-D
        # sample) and silences the implicit-dim deprecation warning.
        # NOTE(review): "To check with the loss function" — if training
        # uses nn.CrossEntropyLoss this softmax double-normalizes;
        # confirm the loss expects probabilities, not raw logits.
        x = F.softmax(self.layer3(x), dim=-1)
        return x
def prepare_labels_features(dataset):
    """Split a car dataframe into integer labels and a scaled feature matrix.

    Column '0' (the car make) is label-encoded; every remaining column is
    min-max scaled to [0, 1]. Prints the encoder's class list as a side
    effect. Returns (labels, features) as numpy arrays.
    """
    encoder = preprocessing.LabelEncoder()
    makes = np.array(dataset[:]['0'])
    labels = encoder.fit_transform(makes)
    print(list(encoder.classes_))

    raw_features = dataset.drop(['0'], axis=1).to_numpy()
    features = preprocessing.MinMaxScaler().fit_transform(raw_features)

    return labels, features
def print_metrics(test_labels, predictions):
    """Print weighted F1 and accuracy, then append them to metrics.csv.

    test_labels: iterable of true class indices.
    predictions: 2-D array of per-class scores; the predicted class is the
        argmax over axis 1.

    The CSV row is (BUILD_NUMBER, F1, ACCURACY); 'a' mode accumulates one
    row per evaluation run (no header row is written).
    """
    # take column with max predicted score
    predicted = np.argmax(predictions, axis=1)
    # Bug fix: f1 was previously computed against the module-level global
    # `labels_test` instead of the `test_labels` parameter.
    f1 = f1_score(test_labels, predicted, average='weighted')
    accuracy = accuracy_score(test_labels, predicted)
    print(f"The F1_score metric is: {f1}")
    print(f"The accuracy metric is: {accuracy}")
    try:
        # build_number = sys.argv[1]
        build_number = 1
        field_names = ['BUILD_NUMBER', 'F1', 'ACCURACY']
        # `row` instead of `dict` — avoid shadowing the builtin.
        row = {'BUILD_NUMBER': build_number, 'F1': f1, 'ACCURACY': accuracy}
        with open('metrics.csv', 'a') as metrics_file:
            # `with` closes the file; no explicit close() needed.
            DictWriter(metrics_file, fieldnames=field_names).writerow(row)
    except Exception as e:
        # Best-effort logging: metrics were already printed above.
        print(e)
# Evaluate the trained model on the dev split and record the metrics.
# NOTE(review): torch.load unpickles arbitrary objects — acceptable only
# because the file is produced by our own training pipeline.
model = torch.load("CarPrices_pytorch_model.pkl")

# Dev split: columns 1, 4, 5, 6, 10 of the Kaggle CSV, renamed '0'..'4'
# ('0' is the car make).
cars_dev = pd.read_csv('./Car_Prices_Poland_Kaggle_dev.csv',
                       usecols=[1, 4, 5, 6, 10], sep=',',
                       names=[str(i) for i in range(5)])

# Keep only the five makes the model was trained on.
kept_makes = ['audi', 'bmw', 'ford', 'opel', 'volkswagen']
cars_dev = cars_dev[cars_dev['0'].isin(kept_makes)]

labels_test, features_test = prepare_labels_features(cars_dev)

x_test = Variable(torch.from_numpy(features_test)).float()
pred = model(x_test).detach().numpy()
print_metrics(labels_test, pred)