diff --git a/lab05_deepLearning.py b/lab05_deepLearning.py
index 4f1a8d8..2f81e8c 100644
--- a/lab05_deepLearning.py
+++ b/lab05_deepLearning.py
@@ -6,7 +6,7 @@ from torch import nn
 from torch.autograd import Variable
 from sklearn.datasets import load_iris
 from sklearn.model_selection import train_test_split
-from sklearn.metrics import accuracy_score
+from sklearn.metrics import accuracy_score, f1_score
 import torch.nn.functional as F
 import pandas as pd
 from sklearn import preprocessing
@@ -33,28 +33,31 @@ def load_dataset_raw():
     return cars
 
 
-def remove_rows(dataset):
-    # dataset.drop(dataset[dataset['mark'] == 'alfa-romeo'].index, inplace=True)
-    # dataset.drop(dataset[dataset['mark'] == 'chevrolet'].index, inplace=True)
-    # dataset.drop(dataset[dataset['mark'] == 'mitsubishi'].index, inplace=True)
-    # dataset.drop(dataset[dataset['mark'] == 'mini'].index, inplace=True)
-    # audi bmw ford opel volkswagen
+def load_dataset_files():
+    """ Load shuffled, split dev and train files from .csv files. """
 
-    new_data = dataset.loc[(dataset['mark'] == 'audi') | (dataset['mark'] == 'bmw') | (dataset['mark'] == 'ford') | (dataset['mark'] == 'opel') | (dataset['mark'] == 'volkswagen')]
-    return new_data
-    # dataset = dataset.drop(dataset)
-    # return dataset
+    cars_dev = pd.read_csv('./Car_Prices_Poland_Kaggle_dev.csv', usecols=[1, 4, 5, 6, 10], sep=',', names=[str(i) for i in range(5)])
+    cars_train = pd.read_csv('./Car_Prices_Poland_Kaggle_train.csv', usecols=[1, 4, 5, 6, 10], sep=',', names=[str(i) for i in range(5)])
+
+    return cars_dev, cars_train
 
 
-def prepare_dataset_raw(dataset):
+def remove_rows(data_dev, data_train):
+    dev_removed_rows = data_dev.loc[(data_dev['0'] == 'audi') | (data_dev['0'] == 'bmw') | (data_dev['0'] == 'ford') | (data_dev['0'] == 'opel') | (data_dev['0'] == 'volkswagen')]
+    train_removed_rows = data_train.loc[(data_train['0'] == 'audi') | (data_train['0'] == 'bmw') | (data_train['0'] == 'ford') | (data_train['0'] == 'opel') | (data_train['0'] == 'volkswagen')]
+
+    return dev_removed_rows, train_removed_rows
+
+
+def prepare_labels_features(dataset):
     """ Label make column"""
     le = preprocessing.LabelEncoder()
-    mark_column = np.array(dataset[:]['mark'])
+    mark_column = np.array(dataset[:]['0'])
     le.fit(mark_column)
 
     print(list(le.classes_))
     lab = le.transform(mark_column)
-    feat = dataset.drop(['mark'], axis=1).to_numpy()
+    feat = dataset.drop(['0'], axis=1).to_numpy()
 
     mm_scaler = preprocessing.MinMaxScaler()
     feat = mm_scaler.fit_transform(feat)
@@ -62,6 +65,9 @@ def prepare_dataset_raw(dataset):
     return lab, feat
 
 
+
+
+
 # def draw_plot(lbl):
 #     need to import matplotlib to work
 #     plt.hist(lbl, bins=[i for i in range(len(set(lbl)))], edgecolor="black")
@@ -70,17 +76,15 @@ def prepare_dataset_raw(dataset):
 #     plt.show()
 
 
 # Prepare dataset
 print("Loading dataset...")
-dataset = load_dataset_raw()
+dev, train = load_dataset_files()
 print("Dataset loaded")
 print("Preparing dataset...")
-dataset = remove_rows(dataset)
-labels, features = prepare_dataset_raw(dataset)
+dev, train = remove_rows(dev, train)
+labels_train, features_train = prepare_labels_features(train)
+labels_test, features_test = prepare_labels_features(dev)
 print("Dataset prepared")
-
-features_train, features_test, labels_train, labels_test = train_test_split(features, labels, random_state=42,
-                                                                             shuffle=True)
 # Training
 model = Model(features_train.shape[1])
 optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
@@ -92,7 +96,7 @@ try:
 except Exception as e:
     print(e)
     print("Setting default epochs value to 1000.")
-    epochs = 1000
+    epochs = 100
 
 print(f"Number of epochs: {epochs}")
 
@@ -113,7 +117,8 @@ print("Model training finished")
 x_test = Variable(torch.from_numpy(features_test)).float()
 pred = model(x_test)
 pred = pred.detach().numpy()
-print("The accuracy is", accuracy_score(labels_test, np.argmax(pred, axis=1)))
+print(f"The accuracy metric is: {accuracy_score(labels_test, np.argmax(pred, axis=1))}")
+
 
 # Checking for first value
 # print(np.argmax(model(x_test[0]).detach().numpy(), axis=0))
diff --git a/lab06_evaluation.py b/lab06_evaluation.py
new file mode 100644
index 0000000..3560f89
--- /dev/null
+++ b/lab06_evaluation.py
@@ -0,0 +1,75 @@
+#!/usr/bin/python
+
+import torch
+from torch import nn
+import pandas as pd
+from sklearn import preprocessing
+import numpy as np
+from torch.autograd import Variable
+from sklearn.metrics import accuracy_score, f1_score
+from csv import DictWriter
+import torch.nn.functional as F
+import sys
+
+
+class Model(nn.Module):
+    def __init__(self, input_dim):
+        super(Model, self).__init__()
+        self.layer1 = nn.Linear(input_dim, 100)
+        self.layer2 = nn.Linear(100, 60)
+        self.layer3 = nn.Linear(60, 5)
+
+    def forward(self, x):
+        x = F.relu(self.layer1(x))
+        x = F.relu(self.layer2(x))
+        x = F.softmax(self.layer3(x))  # To check with the loss function
+        return x
+
+
+def prepare_labels_features(dataset):
+    """ Label make column"""
+    le = preprocessing.LabelEncoder()
+    mark_column = np.array(dataset[:]['0'])
+    le.fit(mark_column)
+
+    print(list(le.classes_))
+    lab = le.transform(mark_column)
+    feat = dataset.drop(['0'], axis=1).to_numpy()
+
+    mm_scaler = preprocessing.MinMaxScaler()
+    feat = mm_scaler.fit_transform(feat)
+
+    return lab, feat
+
+
+def print_metrics(test_labels, predictions):
+    # take column with max predicted score
+    f1 = f1_score(test_labels, np.argmax(predictions, axis=1), average='weighted')
+    accuracy = accuracy_score(test_labels, np.argmax(predictions, axis=1))
+    print(f"The F1_score metric is: {f1}")
+    print(f"The accuracy metric is: {accuracy}")
+
+    try:
+        # build_number = sys.argv[1]
+        build_number = 1
+        field_names = ['BUILD_NUMBER', 'F1', 'ACCURACY']
+        row = {'BUILD_NUMBER': build_number, 'F1': f1, 'ACCURACY': accuracy}
+
+        with open('metrics.csv', 'a') as metrics_file:
+            dictwriter_object = DictWriter(metrics_file, fieldnames=field_names)
+            dictwriter_object.writerow(row)
+            metrics_file.close()
+    except Exception as e:
+        print(e)
+
+
+model = torch.load("CarPrices_pytorch_model.pkl")
+cars_dev = pd.read_csv('./Car_Prices_Poland_Kaggle_dev.csv', usecols=[1, 4, 5, 6, 10], sep=',', names=[str(i) for i in range(5)])
+cars_dev = cars_dev.loc[(cars_dev['0'] == 'audi') | (cars_dev['0'] == 'bmw') | (cars_dev['0'] == 'ford') | (cars_dev['0'] == 'opel') | (cars_dev['0'] == 'volkswagen')]
+labels_test, features_test = prepare_labels_features(cars_dev)
+
+x_test = Variable(torch.from_numpy(features_test)).float()
+pred = model(x_test)
+pred = pred.detach().numpy()
+print_metrics(labels_test, pred)
+
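
Note on the F.softmax call marked "To check with the loss function": the training criterion is not visible in these hunks, so the snippet below is only a sketch under the assumption that nn.CrossEntropyLoss (or another logits-based loss) is used. That loss applies log-softmax internally, so the usual pattern is to return raw logits from forward() and apply softmax explicitly only when probabilities are needed. LogitsModel is a hypothetical variant for illustration, not the committed Model class.

from torch import nn
import torch.nn.functional as F

class LogitsModel(nn.Module):  # hypothetical; mirrors the layer sizes used in the diff
    def __init__(self, input_dim):
        super().__init__()
        self.layer1 = nn.Linear(input_dim, 100)
        self.layer2 = nn.Linear(100, 60)
        self.layer3 = nn.Linear(60, 5)

    def forward(self, x):
        x = F.relu(self.layer1(x))
        x = F.relu(self.layer2(x))
        return self.layer3(x)  # raw logits; nn.CrossEntropyLoss applies log-softmax itself

# At evaluation time, class probabilities (when needed) come from an explicit softmax:
# probs = F.softmax(logits, dim=1)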