add jnks,etc

This commit is contained in:
bartosz.maslanka.consultant 2023-06-28 22:39:02 +02:00
parent 493b2e6e37
commit 8c2f6e4e0f
6 changed files with 220 additions and 39 deletions

BIN
.DS_Store vendored Normal file

Binary file not shown.

View File

@ -1,14 +1,6 @@
FROM ubuntu:latest
RUN apt-get update \
&& apt-get install -y git python3 python3-pip curl \
&& curl -O https://bootstrap.pypa.io/get-pip.py \
&& python3 get-pip.py --user \
&& rm get-pip.py \
&& pip3 install --user kaggle \
&& pip3 install --user pandas \
&& pip3 install --user seaborn \
&& pip3 install --user scikit-learn
ENV PATH="/root/.local/bin:$PATH"
WORKDIR /app
COPY . /app
CMD ["python", "create_dataset.py"]
# Install the Python toolchain plus the libraries used by the dataset,
# training and evaluation scripts. The instruction must not end with a
# line-continuation backslash, otherwise it runs on into whatever follows.
RUN apt-get update && \
    apt-get install -y python3-pip python3-dev build-essential && \
    pip3 install pandas kaggle seaborn scikit-learn torch matplotlib

79
Jenkinsfile vendored
View File

@ -1,29 +1,56 @@
node {
stage('Preparation') {
properties([
parameters([
string(
defaultValue: 'bartekmalanka',
description: 'Kaggle username',
name: 'KAGGLE_USERNAME',
trim: false
),
password(
defaultValue: '',
description: 'Kaggle token taken from kaggle.json file, as described in https://github.com/Kaggle/kaggle-api#api-credentials',
name: 'KAGGLE_KEY'
)
])
])
}
stage('Build') {
// Run the maven build
withEnv(["KAGGLE_USERNAME=${params.KAGGLE_USERNAME}",
"KAGGLE_KEY=${params.KAGGLE_KEY}" ]) {
sh 'kaggle datasets download -d elakiricoder/gender-classification-dataset > output.txt'
// Declarative pipeline: downloads the Kaggle gender classification dataset,
// builds the project Docker image, creates the train/test/val splits inside
// it and, on success, triggers the downstream training job.
pipeline {
    agent any
    parameters {
        string(
            defaultValue: 'wojciechbatruszewicz',
            description: 'Kaggle username',
            name: 'KAGGLE_USERNAME',
            trim: false
        )
        password(
            defaultValue: '',
            description: 'Kaggle token taken from kaggle.json file, as described in https://github.com/Kaggle/kaggle-api#api-credentials',
            name: 'KAGGLE_KEY'
        )
        string(
            defaultValue: '30',
            description: 'dataset cutoff',
            name: 'CUTOFF',
            trim: false
        )
    }
    stages {
        stage('Download dataset') {
            steps {
                checkout scm
                sh 'ls -l'
                // The Kaggle CLI reads its credentials from these two
                // environment variables.
                withEnv(["KAGGLE_USERNAME=${params.KAGGLE_USERNAME}",
                         "KAGGLE_KEY=${params.KAGGLE_KEY}" ]) {
                    sh 'kaggle datasets download -d elakiricoder/gender-classification-dataset'
                    sh 'unzip -o gender-classification-dataset.zip'
                }
                // Archive only after the download and unzip steps, so the
                // freshly fetched files are the ones that get stored.
                // NOTE(review): gender_classification_v7.csv presumably comes
                // out of the unzipped archive, and no step in this pipeline
                // writes output.txt -- confirm both against the repository.
                archiveArtifacts artifacts: 'gender_classification_v7.csv, output.txt'
            }
        }
        stage('Docker') {
            steps {
                script {
                    // Build the image from the Dockerfile in the workspace
                    // root and run the dataset split inside the container.
                    def dockerImage = docker.build("docker-image", "./")
                    dockerImage.inside {
                        sh 'ls -l'
                        sh 'python3 createDataset.py'
                        archiveArtifacts 'gender_classification_train.csv'
                        archiveArtifacts 'gender_classification_test.csv'
                        archiveArtifacts 'gender_classification_val.csv'
                        sh 'ls -l'
                    }
                }
            }
        }
    }
    post {
        success {
            // Trigger the training job without blocking this build.
            build job: 'x1-training/main', wait: false
        }
    }
}
}

80
evaluate.py Normal file
View File

@ -0,0 +1,80 @@
import torch
from train import MyNeuralNetwork, load_data
from torch.utils.data import DataLoader
import csv
import os
import matplotlib.pyplot as plt
from typing import Tuple, List
def evaluate_model() -> Tuple[List[torch.Tensor], float]:
    """Score the saved model on the test split.

    Loads the weights stored in ``model.pt`` into a fresh
    ``MyNeuralNetwork``, runs it over ``gender_classification_test.csv`` in
    batches of 32 and compares the thresholded outputs (``>= 0.5`` means
    class 1) with the labels.

    Returns:
        A tuple ``(predictions, accuracy)``. ``predictions`` contains the
        elements obtained by iterating over every batch output, kept as
        tensors because existing callers call ``.item()`` on them.
        ``accuracy`` is the fraction of correctly classified rows, or
        ``0.0`` when the test set yields no rows.
    """
    model = MyNeuralNetwork()
    model.load_state_dict(torch.load('model.pt'))
    model.eval()

    test_dataset = load_data("gender_classification_test.csv")
    batch_size: int = 32
    test_dataloader: DataLoader = DataLoader(test_dataset, batch_size=batch_size)

    predictions: List[torch.Tensor] = []
    total = 0
    correct = 0
    with torch.no_grad():
        for batch_data, batch_labels in test_dataloader:
            batch_predictions = model(batch_data)
            # Threshold and compare the whole batch at once instead of
            # looping over single elements in Python.
            predicted_labels = (batch_predictions.reshape(-1) >= 0.5).long()
            expected_labels = batch_labels.reshape(-1).long()
            correct += int((predicted_labels == expected_labels).sum())
            total += predicted_labels.shape[0]
            predictions.extend(batch_predictions)

    # Guard against an empty test set instead of dividing by zero.
    accuracy = correct / total if total else 0.0
    return predictions, accuracy
def save_predictions(predictions: list, filename: str = "predictions.csv") -> None:
    """Write thresholded predictions to a one-column CSV file.

    The predictions get their own file: ``results.csv`` is the file that
    ``save_accuracy`` appends to and ``plot_accuracy`` reads, so overwriting
    it here would erase the accuracy history and make the plot show 0/1
    predictions instead of accuracies.

    Args:
        predictions: Model outputs; each element may be a plain number or a
            single-element tensor (anything ``float()`` accepts).
        filename: Destination CSV path; an existing file is overwritten.
    """
    column_name = "predict"
    with open(filename, 'w', newline='') as file:
        writer = csv.writer(file)
        writer.writerow([column_name])
        # Same decision rule as evaluate_model: an output of 0.5 or more is
        # class 1.
        writer.writerows(
            [1 if float(result) >= 0.5 else 0] for result in predictions
        )
def save_accuracy(accuracy, filename='results.csv'):
    """Append one accuracy value to the accuracy history CSV.

    A header row (``accuracy``) is written first when the file does not
    exist yet or is empty; otherwise the value is appended below the
    existing rows.

    Args:
        accuracy: The value to record as a single-cell row.
        filename: Path of the history file.
    """
    needs_header = not os.path.exists(filename) or os.path.getsize(filename) == 0
    # newline='' lets the csv module control line endings itself, which
    # avoids blank rows on platforms that translate newlines.
    with open(filename, 'a', newline='') as file:
        writer = csv.writer(file)
        if needs_header:
            writer.writerow(['accuracy'])
        writer.writerow([accuracy])
def plot_accuracy(filename='results.csv', output='plot.png'):
    """Plot the recorded accuracies and save the figure as an image.

    Reads ``filename`` (when it exists), skips the header row and any blank
    rows, and plots the first cell of every remaining row against its
    1-based position. When the file is missing the plot is empty.

    Args:
        filename: CSV file holding one value per row below a header.
        output: Path the figure is saved to.

    Raises:
        ValueError: If a non-blank row does not start with a number.
    """
    accuracy_results = []
    if os.path.exists(filename):
        with open(filename, 'r', newline='') as file:
            reader = csv.reader(file)
            next(reader, None)  # skip the header row
            for row in reader:
                # Blank lines are parsed as empty rows; ignore them rather
                # than failing on row[0].
                if not row or not row[0].strip():
                    continue
                accuracy_results.append(float(row[0]))

    iterations = list(map(str, range(1, len(accuracy_results) + 1)))

    # Draw on a dedicated figure and close it afterwards, so repeated calls
    # in one process neither stack lines on top of each other nor leak
    # figures.
    figure, axes = plt.subplots()
    try:
        axes.plot(iterations, accuracy_results)
        axes.set_xlabel('build')
        axes.set_ylabel('accuracy')
        axes.set_title("Accuracies over builds.")
        figure.savefig(output)
    finally:
        plt.close(figure)
def main():
    """Evaluate the model, store its predictions and accuracy, redraw the plot."""
    model_outputs, test_accuracy = evaluate_model()
    save_predictions(model_outputs)
    save_accuracy(test_accuracy)
    plot_accuracy()


if __name__ == "__main__":
    main()

BIN
model.pt Normal file

Binary file not shown.

82
train.py Normal file
View File

@ -0,0 +1,82 @@
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
import pandas as pd
from sklearn.preprocessing import LabelBinarizer
import numpy as np
import argparse
class MyNeuralNetwork(nn.Module):
    """Small feed-forward binary classifier.

    Maps 7 input features through a 12-unit hidden layer with ReLU to a
    single sigmoid output, so every output lies in the range [0, 1].
    """

    def __init__(self, *args, **kwargs) -> None:
        super().__init__(*args, **kwargs)
        # Each layer is created exactly once; parameter names (fc1.*, fc2.*)
        # stay the same, so saved state dicts remain loadable.
        self.fc1 = nn.Linear(7, 12)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(12, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        """Return the sigmoid output for a batch ``x`` whose last dimension is 7."""
        x = self.fc1(x)
        x = self.relu(x)
        x = self.fc2(x)
        x = self.sigmoid(x)
        return x
def prepare_df_for_nn(df):
    """Prepare a DataFrame for the network: drop id columns, encode text columns.

    The frame is modified in place and also returned.

    * Identifier columns are removed. A column counts as an identifier when
      ``id`` is one of the underscore/dash/space separated words of its
      name (``id``, ``user_id``, ``id_number``). A plain substring test
      would also hit names such as ``forehead_width_cm`` or ``nose_wide``
      and silently discard a real feature.
    * The index is reset with ``drop=True`` so the old index is not inserted
      as an extra ``index`` feature column.
    * Every object/string column is binarized to 0/1 with ``LabelBinarizer``.

    Args:
        df: Input frame; the caller's object is mutated.

    Returns:
        The same ``df`` object after cleaning.

    Raises:
        ValueError: If a text column has more than two distinct values and
            therefore cannot be represented as a single 0/1 column.
    """
    id_columns = [
        column
        for column in df.columns.to_list()
        if 'id' in str(column).lower().replace('-', '_').replace(' ', '_').split('_')
    ]
    if id_columns:
        df.drop(columns=id_columns, inplace=True)

    df.reset_index(drop=True, inplace=True)

    for column in df.columns:
        dtype = df[column].dtype
        if not (pd.api.types.is_object_dtype(dtype) or pd.api.types.is_string_dtype(dtype)):
            continue
        # A fresh encoder per column keeps the columns independent.
        encoded_column = LabelBinarizer().fit_transform(df[column])
        if encoded_column.shape[1] != 1:
            # More than two classes produce a one-hot matrix; flattening it
            # would misalign values with rows.
            raise ValueError(
                f"Column {column!r} is not binary and cannot be label-binarized "
                "into a single column"
            )
        df[column] = pd.Series(
            encoded_column.ravel(), index=df.index, dtype=pd.Int16Dtype()
        )
    return df
def load_data(path):
    """Read a CSV file and wrap it as a ``TensorDataset``.

    The file is loaded with pandas and passed through
    ``prepare_df_for_nn``. Every column except the last becomes the feature
    tensor and the last column becomes the label tensor, both as float32.
    """
    frame = prepare_df_for_nn(pd.read_csv(path))
    features = frame.iloc[:, :-1].values.astype(float)
    targets = frame.iloc[:, -1].values.astype(float)
    return TensorDataset(
        torch.tensor(features, dtype=torch.float32),
        torch.tensor(targets, dtype=torch.float32),
    )
def train(epochs, dataloader_train):
    """Train a new ``MyNeuralNetwork`` with binary cross-entropy and Adam.

    Args:
        epochs: Number of passes over ``dataloader_train``.
        dataloader_train: Iterable yielding ``(inputs, labels)`` batches.

    Returns:
        The trained model.

    Raises:
        ValueError: If an epoch receives no samples from the dataloader.
    """
    model: MyNeuralNetwork = MyNeuralNetwork()
    criterion: nn.BCELoss = nn.BCELoss()
    optimizer = optim.Adam(model.parameters(), lr=0.001)
    for epoch in range(epochs):
        loss_sum = 0.0
        sample_count = 0
        for inputs, labels in dataloader_train:
            outputs = model(inputs)
            # BCELoss needs targets shaped like the (batch, 1) model output.
            labels = labels.reshape((labels.shape[0], 1))
            loss = criterion(outputs, labels)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            # Weight each batch by its size so a smaller final batch does
            # not skew the epoch mean.
            loss_sum += loss.item() * labels.shape[0]
            sample_count += labels.shape[0]
        if sample_count == 0:
            raise ValueError("dataloader_train yielded no samples; nothing to train on")
        # Report the mean loss over the epoch rather than only the last
        # mini-batch's loss.
        print(f"Epoch {epoch+1}/{epochs}, Loss: {loss_sum / sample_count:.4f}")
    return model
def main():
    """Train on ``gender_classification_train.csv`` and save the weights.

    Command line:
        --epochs N   number of training epochs (default 10)

    The trained model's state dict is written to ``model.pt``.
    """
    parser = argparse.ArgumentParser(
        description='Train the gender classification network.'
    )
    # Let argparse validate and convert the value, so a non-integer is
    # rejected with a usage error instead of a traceback from int().
    parser.add_argument(
        "--epochs", type=int, default=10, help="Number of training epochs."
    )
    args = parser.parse_args()
    epochs = args.epochs

    train_dataset = load_data("gender_classification_train.csv")
    batch_size = 32
    dataloader_train = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    model = train(epochs, dataloader_train)
    torch.save(model.state_dict(), 'model.pt')


if __name__ == "__main__":
    main()