Model evaluation.

This commit is contained in:
s487179 2023-06-13 19:53:46 +02:00
parent 048e489250
commit 451c85755d
5 changed files with 285 additions and 0 deletions

43
MLEvaluate/Jenkinsfile vendored Normal file
View File

@@ -0,0 +1,43 @@
pipeline {
    agent any
    parameters {
        // Which upstream training build to pull the trained model from.
        buildSelector(
            name: 'BUILD_SELECTOR',
            defaultSelector: lastSuccessful(),
            description: 'A build to take the artifacts from'
        )
        // NOTE(review): EPOCHS is never used by this job (model_test.py
        // takes no arguments) — kept so callers passing parameters keep
        // working; confirm whether it can be removed.
        string(
            name: 'EPOCHS',
            description: 'Number of epochs',
            defaultValue: '10'
        )
    }
    stages {
        stage('Copy artifacts') {
            steps {
                script {
                    copyArtifacts(
                        projectName: 'z-s487179-training',
                        selector: buildParameter('BUILD_SELECTOR'),
                        target: './MLEvaluate'
                    )
                }
            }
        }
        // Label fixed: this stage evaluates the trained model and archives
        // its metrics — it does not train anything.
        stage('Run evaluation and archive metrics') {
            steps {
                script {
                    sh 'ls -l'
                    docker.image('docker-image').inside {
                        dir('./MLEvaluate') {
                            sh 'ls -l'
                            sh 'python3 ./model_test.py'
                            archiveArtifacts 'plot.png'
                            archiveArtifacts 'metrics.csv'
                        }
                    }
                }
            }
        }
    }
}

82
MLEvaluate/model_test.py Normal file
View File

@@ -0,0 +1,82 @@
import torch
import sys
sys.path.append("../MLTrain/")
from model_train import MyNeuralNetwork, load_data
from torch.utils.data import DataLoader
import csv
import os
import matplotlib.pyplot as plt
from typing import Tuple, List
def evaluate_model() -> Tuple[List[float], float]:
    """Evaluate the trained model on the held-out test set.

    Loads weights from 'model.pt', runs batched inference over
    'home_loan_test.csv', and computes accuracy with a 0.5 decision
    threshold.

    Returns:
        A tuple ``(predictions, accuracy)`` where ``predictions`` holds the
        raw sigmoid outputs for every test sample and ``accuracy`` is the
        fraction of correctly classified samples.
    """
    model: MyNeuralNetwork = MyNeuralNetwork()
    model.load_state_dict(torch.load('model.pt'))
    model.eval()

    test_dataset = load_data("home_loan_test.csv")
    batch_size: int = 32
    test_dataloader: DataLoader = DataLoader(test_dataset, batch_size=batch_size)

    predictions: List[float] = []
    total = 0
    correct = 0
    with torch.no_grad():  # inference only — no gradient tracking needed
        for batch_data, batch_labels in test_dataloader:
            batch_predictions = model(batch_data)
            # Threshold the sigmoid outputs at 0.5 to obtain hard labels.
            predicted_batch_labels = [1 if p >= 0.5 else 0 for p in batch_predictions]
            total += len(predicted_batch_labels)
            batch_labels_list = list(map(int, batch_labels.tolist()))
            correct += sum(x == y for x, y in zip(predicted_batch_labels, batch_labels_list))
            predictions.extend(batch_predictions)
            # (The original also accumulated a `labels` list that was never
            # read or returned — that dead work has been removed.)
    accuracy = correct / total
    return predictions, accuracy
def save_predictions(predictions: list[float]) -> None:
    """Write hard loan decisions derived from raw model outputs.

    Each prediction is thresholded at 0.5 and written as a 0/1 row of
    'results.csv' under the header column 'predict'.

    Args:
        predictions: raw model scores; plain floats or 0-dim torch tensors.
    """
    filename = "results.csv"
    column_name = "predict"
    # newline='' stops the csv module from emitting blank rows on Windows.
    with open(filename, 'w', newline='') as file:
        writer = csv.writer(file)
        writer.writerow([column_name])
        for result in predictions:
            # Bug fix: the original unconditionally called result.item(),
            # which crashes on the plain floats the annotation promises.
            # Accept both tensors and floats.
            value = result.item() if hasattr(result, 'item') else result
            loan_decision = 1 if value > 0.5 else 0
            writer.writerow([loan_decision])
def save_accuracy(accuracy: float) -> None:
    """Append an accuracy value to 'metrics.csv', writing the header first
    if the file does not exist yet (one row is appended per CI build)."""
    filename = 'metrics.csv'
    # Opening with 'a' creates a missing file, so a single open replaces
    # the original duplicated 'w'/'a' branches with identical behavior.
    is_new = not os.path.exists(filename)
    # newline='' stops the csv module from emitting blank rows on Windows
    # (missing in the original).
    with open(filename, 'a', newline='') as file:
        writer = csv.writer(file)
        if is_new:
            writer.writerow(['accuracy'])
        writer.writerow([accuracy])
def plot_accuracy() -> None:
    """Plot the accuracy history from 'metrics.csv' and save it to 'plot.png'.

    If the metrics file is missing, an empty plot is still written.
    """
    filename = 'metrics.csv'
    accuracy_results = []
    if os.path.exists(filename):
        with open(filename, 'r') as file:
            reader = csv.reader(file)
            next(reader, None)  # skip the 'accuracy' header row
            for row in reader:
                accuracy_results.append(float(row[0]))
    # One x tick (as a string) per recorded build.
    iterations = [str(build) for build in range(1, len(accuracy_results) + 1)]
    plt.plot(iterations, accuracy_results)
    plt.xlabel('build')
    plt.ylabel('accuracy')
    plt.title("Accuracies over builds.")
    plt.savefig("plot.png")
def main() -> None:
    """Evaluate the model, persist predictions and accuracy, plot history."""
    predictions, accuracy = evaluate_model()
    save_predictions(predictions)
    save_accuracy(accuracy)
    plot_accuracy()


if __name__ == "__main__":
    main()

47
MLTrain/Jenkinsfile vendored Normal file
View File

@@ -0,0 +1,47 @@
pipeline {
    agent any
    parameters {
        // Which upstream dataset build to pull artifacts from.
        buildSelector(
            name: 'BUILD_SELECTOR',
            defaultSelector: lastSuccessful(),
            description: 'A build to take the artifacts from'
        )
        string(
            name: 'EPOCHS',
            description: 'Number of epochs',
            defaultValue: '10'
        )
    }
    stages {
        stage('Copy artifacts') {
            steps {
                script {
                    copyArtifacts(
                        projectName: 'z-s487179-create-dataset',
                        selector: buildParameter('BUILD_SELECTOR'),
                        target: './MLTrain'
                    )
                }
            }
        }
        stage('Run training and save model') {
            steps {
                script {
                    sh 'ls -l'
                    docker.image('docker-image').inside {
                        dir('./MLTrain') {
                            sh 'ls -l'
                            // Bug fix: EPOCHS was declared but never passed,
                            // so training always ran with the script default
                            // regardless of the chosen parameter value.
                            sh "python3 ./model_train.py --epochs ${params.EPOCHS}"
                            archiveArtifacts 'model.pt'
                        }
                    }
                }
            }
        }
    }
    post {
        success {
            // Kick off evaluation after a successful training build.
            build job: 'z-s487179-evaluation.eg/main', propagate: false, wait: false
        }
    }
}

Binary file not shown.

113
MLTrain/model_train.py Normal file
View File

@@ -0,0 +1,113 @@
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelBinarizer
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
import argparse
class MyNeuralNetwork(nn.Module):
    """Binary classifier: 12 input features -> 64 hidden units -> 1 sigmoid output."""

    def __init__(self, *args, **kwargs) -> None:
        super(MyNeuralNetwork, self).__init__(*args, **kwargs)
        # Bug fix: fc1 and relu were defined twice; the second assignment
        # simply replaced the first, so the duplicates were dead code.
        self.fc1 = nn.Linear(12, 64)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(64, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        """Forward pass: linear -> ReLU -> linear -> sigmoid; outputs in (0, 1)."""
        x = self.fc1(x)
        x = self.relu(x)
        x = self.fc2(x)
        x = self.sigmoid(x)
        return x
def prepare_df_for_nn(df: pd.DataFrame):
    """Prepare a raw dataframe for the network.

    Drops the first column whose name contains 'id' (if any), resets the
    index, and binarizes every object-dtype column in place.

    Note: mutates ``df`` and returns it.
    """
    # Drop the first ID-like column, if one exists.
    id_column_name_list: list[str] = [column for column in df.columns.to_list() if 'id' in column.lower()]
    if id_column_name_list:
        df.drop(id_column_name_list[0], inplace=True, axis=1)
    encoder: LabelBinarizer = LabelBinarizer()
    # NOTE(review): reset_index adds an 'index' column that then ends up in
    # the feature matrix — confirm this is intended given fc1 expects 12 inputs.
    df.reset_index(inplace=True)
    for column in df.columns:
        if str(df[column].dtype).lower() == 'object':
            encoded_column: np.ndarray = encoder.fit_transform(df[column])
            # Bug fix: the original passed the dtype *class* (pd.Int16Dtype)
            # instead of an instance, which pandas rejects; instantiate it.
            df[column] = pd.Series(encoded_column.flatten(), dtype=pd.Int16Dtype())
    return df
def load_data(path: str):
    """Read a CSV file and wrap it as a TensorDataset.

    The last column of the prepared frame is the target; every other
    column is a feature.
    """
    frame: pd.DataFrame = pd.read_csv(path)
    prepared: pd.DataFrame = prepare_df_for_nn(frame)
    features: np.ndarray = prepared.iloc[:, :-1].values.astype(float)
    targets: np.ndarray = prepared.iloc[:, -1].values.astype(float)
    feature_tensor: torch.Tensor = torch.tensor(features, dtype=torch.float32)
    target_tensor: torch.Tensor = torch.tensor(targets, dtype=torch.float32)
    return TensorDataset(feature_tensor, target_tensor)
def train(epochs: int, dataloader_train: DataLoader, dataloader_val: DataLoader):
    """Train a fresh MyNeuralNetwork with BCE loss and Adam (lr=0.001).

    Prints per-epoch loss (of the last batch) plus train/val accuracy,
    and returns the trained model.
    """
    model: MyNeuralNetwork = MyNeuralNetwork()
    criterion: nn.BCELoss = nn.BCELoss()
    optimizer = optim.Adam(model.parameters(), lr=0.001)
    for epoch in range(epochs):
        correct_train, seen_train = 0, 0
        correct_val, seen_val = 0, 0
        for inputs, labels in dataloader_train:
            outputs = model(inputs)
            # Targets arrive as (batch,); match the model's (batch, 1) shape.
            labels = labels.reshape((labels.shape[0], 1))
            loss = criterion(outputs, labels)
            # Hard 0/1 decisions at the 0.5 threshold for the accuracy counters.
            hard_labels = (outputs > 0.5).float()
            correct_train += (hard_labels == labels).sum().item()
            seen_train += labels.size(0)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
        with torch.no_grad():  # validation pass — no gradients
            for inputs, labels in dataloader_val:
                outputs_val = model(inputs)
                hard_labels_val = (outputs_val > 0.5).float()
                labels = labels.reshape((labels.shape[0], 1))
                correct_val += (hard_labels_val == labels).sum().item()
                seen_val += labels.size(0)
        accuracy_val = correct_val / seen_val
        accuracy_train = correct_train / seen_train
        print(f"Epoch {epoch+1}/{epochs}, Loss: {loss.item():.4f}, Accuracy train: {accuracy_train:.4f}, Accuracy val: {accuracy_val:.4f}")
    return model
def main() -> None:
    """Parse --epochs, train on the prepared CSV datasets, save the weights."""
    parser = argparse.ArgumentParser(description='A test program.')
    parser.add_argument("--epochs", help="Prints the supplied argument.", default='10')
    args = parser.parse_args()
    epochs = int(vars(args)["epochs"])

    train_dataset = load_data("home_loan_train.csv")
    val_dataset = load_data("home_loan_val.csv")
    batch_size: int = 32
    dataloader_train = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    dataloader_val = DataLoader(val_dataset, batch_size=batch_size)

    trained_model = train(epochs, dataloader_train, dataloader_val)
    torch.save(trained_model.state_dict(), 'model.pt')


if __name__ == "__main__":
    main()