Compare commits

...

4 Commits

Author SHA1 Message Date
bartosz.maslanka.consultant
db75e5f227 add 2023-06-28 22:58:05 +02:00
bartosz.maslanka.consultant
30326bc23a add jenks files 2023-06-28 22:40:30 +02:00
bartosz.maslanka.consultant
c0ae2dd329 add script 2023-06-28 22:39:38 +02:00
bartosz.maslanka.consultant
8c2f6e4e0f add jnks,etc 2023-06-28 22:39:02 +02:00
11 changed files with 379 additions and 39 deletions

BIN
.DS_Store vendored Normal file

Binary file not shown.

View File

@ -1,14 +1,6 @@
FROM ubuntu:latest
RUN apt-get update \
&& apt-get install -y git python3 python3-pip curl \
&& curl -O https://bootstrap.pypa.io/get-pip.py \
&& python3 get-pip.py --user \
&& rm get-pip.py \
&& pip3 install --user kaggle \
&& pip3 install --user pandas \
&& pip3 install --user seaborn \
&& pip3 install --user scikit-learn
ENV PATH="/root/.local/bin:$PATH"
WORKDIR /app
COPY . /app
CMD ["python", "create_dataset.py"]
RUN apt-get update && \
apt-get install -y python3-pip python3-dev && \
apt-get install -y build-essential && \
pip3 install pandas kaggle seaborn scikit-learn torch matplotlib \

79
Jenkinsfile vendored
View File

@ -1,29 +1,56 @@
node {
stage('Preparation') {
properties([
parameters([
string(
defaultValue: 'bartekmalanka',
description: 'Kaggle username',
name: 'KAGGLE_USERNAME',
trim: false
),
password(
defaultValue: '',
description: 'Kaggle token taken from kaggle.json file, as described in https://github.com/Kaggle/kaggle-api#api-credentials',
name: 'KAGGLE_KEY'
)
])
])
}
stage('Build') {
// Run the maven build
withEnv(["KAGGLE_USERNAME=${params.KAGGLE_USERNAME}",
"KAGGLE_KEY=${params.KAGGLE_KEY}" ]) {
sh 'kaggle datasets download -d elakiricoder/gender-classification-dataset > output.txt'
pipeline {
agent any
parameters {
string(
defaultValue: 'wojciechbatruszewicz',
description: 'Kaggle username',
name: 'KAGGLE_USERNAME',
trim: false
)
password(
defaultValue: '',
description: 'Kaggle token taken from kaggle.json file, as described in https://github.com/Kaggle/kaggle-api#api-credentials',
name: 'KAGGLE_KEY'
)
string(
defaultValue: '30',
description: 'dataset cutoff',
name: 'CUTOFF',
trim: false
)
}
stages {
stage('Download dataset') {
steps {
checkout scm
sh 'ls -l'
archiveArtifacts artifacts: 'gender_classification_v7.csv, output.txt'
withEnv(["KAGGLE_USERNAME=${params.KAGGLE_USERNAME}",
"KAGGLE_KEY=${params.KAGGLE_KEY}" ]) {
sh 'kaggle datasets download -d elakiricoder/gender-classification-dataset'
sh 'unzip -o gender-classification-dataset.zip'
}
}
}
stage('Docker') {
steps {
script {
def dockerImage = docker.build("docker-image", "./")
dockerImage.inside {
sh 'ls -l'
sh 'ls -l'
sh 'python3 createDataset.py'
archiveArtifacts 'gender_classification_train.csv'
archiveArtifacts 'gender_classification_test.csv'
archiveArtifacts 'gender_classification_val.csv'
sh 'ls -l'
}
}
}
}
}
post {
success {
build job: 'x1-training/main', wait: false
}
}
}
}

34
JenkinsfileDatasetStats Normal file
View File

@ -0,0 +1,34 @@
// Declarative pipeline: copies artifacts from a chosen 'x1-create-dataset'
// build into the workspace and runs datasetStats.py inside a Docker container.
pipeline {
    agent any
    parameters {
        // Selects the upstream build whose artifacts are copied;
        // defaults to the last successful build.
        buildSelector(
            name: 'BUILD_SELECTOR',
            defaultSelector: lastSuccessful(),
            description: 'A build to take the artifacts from'
        )
    }
    stages {
        stage('Copy artifacts') {
            steps {
                script {
                    // Copy the selected build's artifacts into the workspace root.
                    copyArtifacts(
                        projectName: 'x1-create-dataset',
                        selector: buildParameter('BUILD_SELECTOR'),
                        target: './'
                    )
                }
            }
        }
        stage('Run python file') {
            steps {
                script {
                    // Workspace listing on the agent, before entering the container.
                    sh 'ls -l'
                    // NOTE(review): 'docker-image' is referenced by name only; presumably it
                    // was built earlier on this agent by the dataset job - confirm.
                    docker.image('docker-image').inside {
                        sh 'ls -l'
                        sh 'python3 ./datasetStats.py'
                    }
                }
            }
        }
    }
}

46
JenkinsfileEvaluate Normal file
View File

@ -0,0 +1,46 @@
// Declarative pipeline: copies artifacts from the training and dataset jobs,
// runs evaluate.py inside a Docker container and archives its outputs.
pipeline {
    agent any
    parameters {
        // Selects the upstream build whose artifacts are copied;
        // defaults to the last successful build.
        buildSelector(
            name: 'BUILD_SELECTOR',
            defaultSelector: lastSuccessful(),
            description: 'A build to take the artifacts from'
        )
        // NOTE(review): EPOCHS is declared but not referenced anywhere in this pipeline.
        string(
            name: 'EPOCHS',
            description: 'Number of epochs',
            defaultValue: '10'
        )
    }
    stages {
        stage('Copy artifacts') {
            steps {
                script {
                    // NOTE(review): the same BUILD_SELECTOR value is applied to two
                    // different upstream jobs - confirm that this is intended.
                    copyArtifacts(
                        projectName: 'x1-training/main',
                        selector: buildParameter('BUILD_SELECTOR'),
                        target: './'
                    )
                    copyArtifacts(
                        projectName: 'x1-create-dataset',
                        selector: buildParameter('BUILD_SELECTOR'),
                        target: './'
                    )
                }
            }
        }
        stage('Save evaluation') {
            steps {
                script {
                    // Workspace listing on the agent, before entering the container.
                    sh 'ls -l'
                    docker.image('docker-image').inside {
                        sh 'ls -l'
                        sh 'python3 ./evaluate.py'
                        // Archive the two files the evaluation step is expected to leave behind.
                        archiveArtifacts 'plot.png'
                        archiveArtifacts 'results.csv'
                    }
                }
            }
        }
    }
}

45
JenkinsfileTrain Normal file
View File

@ -0,0 +1,45 @@
// Declarative pipeline: copies the dataset artifacts, trains the model inside a
// Docker container, archives model.pt and then triggers the evaluation job.
pipeline {
    agent any
    parameters {
        // Selects the upstream build whose artifacts are copied;
        // defaults to the last successful build.
        buildSelector(
            name: 'BUILD_SELECTOR',
            defaultSelector: lastSuccessful(),
            description: 'A build to take the artifacts from'
        )
        // Forwarded to train.py as --epochs (see the training stage).
        string(
            name: 'EPOCHS',
            description: 'Number of epochs',
            defaultValue: '10'
        )
    }
    stages {
        stage('Copy artifacts') {
            steps {
                script {
                    // Copy the selected build's artifacts into the workspace root.
                    copyArtifacts(
                        projectName: 'x1-create-dataset',
                        selector: buildParameter('BUILD_SELECTOR'),
                        target: './'
                    )
                }
            }
        }
        stage('Run training and save model') {
            steps {
                script {
                    sh 'ls -l'
                    // Hand the parameter over through the environment and let the shell
                    // expand it: the value is never interpolated into the command text
                    // by Groovy, so it cannot alter the command line.
                    withEnv(["EPOCHS=${params.EPOCHS}"]) {
                        docker.image('docker-image').inside {
                            sh 'ls -l'
                            // Previously the EPOCHS parameter was ignored and train.py
                            // always ran with its built-in default.
                            sh 'python3 ./train.py --epochs "$EPOCHS"'
                            archiveArtifacts 'model.pt'
                        }
                    }
                }
            }
        }
    }
    post {
        success {
            // Fire-and-forget trigger of the downstream evaluation job.
            build job: 'x1-evaluation.eg/main', wait: false
        }
    }
}

25
createDataset.py Normal file
View File

@ -0,0 +1,25 @@
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
SOURCE_CSV = 'gender_classification_v7.csv'
TRAIN_CSV = 'gender_classification_train.csv'
TEST_CSV = 'gender_classification_test.csv'
VAL_CSV = 'gender_classification_val.csv'


def build_splits(dataset):
    """Split *dataset* 80/10/10 into train/test/validation and min-max scale it.

    The scaler is fitted on the training split only and then applied unchanged
    to the test and validation splits, so all three share one scale and no
    statistics of the held-out data leak into preprocessing. Scaled held-out
    values can therefore fall slightly outside [0, 1].

    Returns:
        A ``(train, test, val)`` tuple of DataFrames with rows containing
        missing values removed.
    """
    train, holdout = train_test_split(dataset, test_size=0.2, random_state=1)
    test, val = train_test_split(holdout, test_size=0.5, random_state=1)

    # Work on explicit copies so the column assignments below never write
    # into a view of the source frame.
    train, test, val = train.copy(), test.copy(), val.copy()

    numeric_cols = train.select_dtypes(include='number').columns
    if len(numeric_cols) > 0:
        scaler = MinMaxScaler()
        train[numeric_cols] = scaler.fit_transform(train[numeric_cols])
        # transform(), not fit_transform(): reuse the training min/max.
        test[numeric_cols] = scaler.transform(test[numeric_cols])
        val[numeric_cols] = scaler.transform(val[numeric_cols])

    return train.dropna(), test.dropna(), val.dropna()


def main():
    """Read the raw CSV and write the three prepared split files."""
    train, test, val = build_splits(pd.read_csv(SOURCE_CSV))
    train.to_csv(TRAIN_CSV, index=False)
    test.to_csv(TEST_CSV, index=False)
    val.to_csv(VAL_CSV, index=False)


if __name__ == "__main__":
    main()

9
datasetStats.py Normal file
View File

@ -0,0 +1,9 @@
import pandas as pd
# Split name -> CSV file holding that split.
DATASET_FILES = {
    'train': 'gender_classification_train.csv',
    'test': 'gender_classification_test.csv',
    'val': 'gender_classification_val.csv',
}


def main():
    """Print summary statistics for each dataset split.

    In a script the return value of ``describe()`` is discarded unless it is
    printed, so the statistics are written to stdout explicitly.
    """
    for split_name, csv_path in DATASET_FILES.items():
        stats = pd.read_csv(csv_path).describe()
        print(f"=== {split_name} ({csv_path}) ===")
        # to_string() keeps every column visible regardless of terminal width.
        print(stats.to_string())


if __name__ == "__main__":
    main()

80
evaluate.py Normal file
View File

@ -0,0 +1,80 @@
import torch
from train import MyNeuralNetwork, load_data
from torch.utils.data import DataLoader
import csv
import os
import matplotlib.pyplot as plt
from typing import Tuple, List
def evaluate_model() -> Tuple[List[torch.Tensor], float]:
    """Score the saved model on the test split.

    Loads the weights from ``model.pt``, runs the model over
    ``gender_classification_test.csv`` in batches of 32 and counts an output
    of at least 0.5 as class 1.

    Returns:
        ``(predictions, accuracy)`` where ``predictions`` holds one
        single-element tensor per test row (the raw model output) and
        ``accuracy`` is the fraction of rows classified correctly.

    Raises:
        ValueError: if the test set contains no rows.
    """
    model = MyNeuralNetwork()
    model.load_state_dict(torch.load('model.pt'))
    model.eval()

    test_dataset = load_data("gender_classification_test.csv")
    test_dataloader: DataLoader = DataLoader(test_dataset, batch_size=32)

    predictions: List[torch.Tensor] = []
    total = 0
    correct = 0
    with torch.no_grad():
        for batch_data, batch_labels in test_dataloader:
            batch_predictions = model(batch_data)
            # Model output is (batch, 1); flatten it to line up with the labels.
            predicted_labels = (batch_predictions.reshape(-1) >= 0.5).long()
            correct += int((predicted_labels == batch_labels.long()).sum().item())
            total += batch_labels.shape[0]
            # Iterating the (batch, 1) tensor yields one row tensor per sample.
            predictions.extend(batch_predictions)

    if total == 0:
        raise ValueError("test dataset is empty; accuracy is undefined")
    return predictions, correct / total
def _as_float(value) -> float:
    """Return *value* as a plain float, unwrapping single-element tensors."""
    return float(value.item()) if hasattr(value, "item") else float(value)


def save_predictions(predictions: list[float], filename: str = "results.csv") -> None:
    """Write one 0/1 class label per prediction to *filename*.

    The file is rewritten on every call and starts with a ``predict`` header.
    A score of at least 0.5 maps to 1, the same threshold used when the
    accuracy is computed.
    """
    with open(filename, 'w', newline='') as file:
        writer = csv.writer(file)
        writer.writerow(["predict"])
        writer.writerows(
            [1 if _as_float(score) >= 0.5 else 0] for score in predictions
        )


def save_accuracy(accuracy, filename: str = "accuracy.csv") -> None:
    """Append *accuracy* as a new row of the accuracy history in *filename*.

    The history lives in its own file. It used to be appended to the
    predictions file, which mixed 0/1 labels into the accuracy series and
    lost the history every time the predictions were rewritten. A header row
    is written when the file is created.
    """
    is_new_file = not os.path.exists(filename) or os.path.getsize(filename) == 0
    # newline='' lets the csv module control line endings on every platform.
    with open(filename, 'a', newline='') as file:
        writer = csv.writer(file)
        if is_new_file:
            writer.writerow(['accuracy'])
        writer.writerow([accuracy])


def plot_accuracy(filename: str = "accuracy.csv", output: str = "plot.png") -> None:
    """Plot the accuracy history stored in *filename* and save it to *output*.

    A missing history file results in an empty plot.
    """
    accuracy_results = []
    if os.path.exists(filename):
        with open(filename, 'r', newline='') as file:
            reader = csv.reader(file)
            next(reader, None)  # skip the header row
            # Blank lines come back as empty rows; ignore them.
            accuracy_results = [float(row[0]) for row in reader if row]

    iterations = [str(number) for number in range(1, len(accuracy_results) + 1)]
    plt.figure()
    plt.plot(iterations, accuracy_results)
    plt.xlabel('build')
    plt.ylabel('accuracy')
    plt.title("Accuracies over builds.")
    plt.savefig(output)
    plt.close()
def main():
    """Evaluate the saved model, then store predictions, accuracy and the plot."""
    model_outputs, test_accuracy = evaluate_model()
    save_predictions(model_outputs)
    save_accuracy(test_accuracy)
    plot_accuracy()


if __name__ == "__main__":
    main()

BIN
model.pt Normal file

Binary file not shown.

82
train.py Normal file
View File

@ -0,0 +1,82 @@
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
import pandas as pd
from sklearn.preprocessing import LabelBinarizer
import numpy as np
import argparse
class MyNeuralNetwork(nn.Module):
    """Binary classifier: Linear(7, 12) -> ReLU -> Linear(12, 1) -> Sigmoid."""

    def __init__(self, *args, **kwargs) -> None:
        super().__init__(*args, **kwargs)
        # Each layer is created exactly once; the earlier duplicate assignments
        # built a first Linear layer only to discard it.
        self.fc1 = nn.Linear(7, 12)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(12, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        """Map a ``(batch, 7)`` input to ``(batch, 1)`` outputs in [0, 1]."""
        hidden = self.relu(self.fc1(x))
        return self.sigmoid(self.fc2(hidden))
def prepare_df_for_nn(df):
    """Prepare *df* in place for conversion to tensors and return it.

    * Identifier columns are dropped. A column counts as an identifier when
      ``id`` is one of the underscore-separated words of its lower-cased name
      (``id``, ``Loan_ID``). A plain substring test would also hit ordinary
      features such as ``forehead_width_cm`` or ``nose_wide``.
    * The index is reset without keeping the old one, so no row-number column
      is added to the features.
    * Every ``object`` column is label-encoded to 0/1.

    Raises:
        ValueError: if an ``object`` column has more than two distinct values
            and therefore cannot be encoded as a single 0/1 column.
    """
    id_columns = [
        column for column in df.columns
        if 'id' in str(column).lower().split('_')
    ]
    if id_columns:
        df.drop(columns=id_columns, inplace=True)

    # drop=True: discard the old index instead of inserting it as a column.
    df.reset_index(drop=True, inplace=True)

    for column in df.columns:
        if str(df[column].dtype).lower() == 'object':
            encoded_column = LabelBinarizer().fit_transform(df[column])
            if encoded_column.shape[1] != 1:
                # One-hot output cannot be flattened into one column.
                raise ValueError(
                    f"column {column!r} has more than two classes and cannot "
                    "be encoded as a single binary column"
                )
            df[column] = pd.Series(
                encoded_column.ravel(), index=df.index, dtype=pd.Int16Dtype()
            )
    return df
def load_data(path):
    """Read the CSV at *path* and return it as a ``TensorDataset``.

    The frame is first passed through ``prepare_df_for_nn``. The last column
    becomes the label tensor and all preceding columns the feature tensor,
    both as float32.
    """
    frame = prepare_df_for_nn(pd.read_csv(path))
    feature_matrix = frame.iloc[:, :-1].values.astype(float)
    label_vector = frame.iloc[:, -1].values.astype(float)
    return TensorDataset(
        torch.tensor(feature_matrix, dtype=torch.float32),
        torch.tensor(label_vector, dtype=torch.float32),
    )
def train(epochs, dataloader_train):
    """Train a fresh ``MyNeuralNetwork`` and return it.

    Uses binary cross-entropy loss and Adam (lr=0.001). After every epoch the
    mean loss over all samples seen in that epoch is printed.

    Args:
        epochs: number of passes over ``dataloader_train``.
        dataloader_train: iterable yielding ``(inputs, labels)`` batches.

    Raises:
        ValueError: if ``dataloader_train`` yields no batches.
    """
    model: MyNeuralNetwork = MyNeuralNetwork()
    criterion: nn.BCELoss = nn.BCELoss()
    optimizer = optim.Adam(model.parameters(), lr=0.001)

    for epoch in range(epochs):
        loss_sum = 0.0
        samples_seen = 0
        for inputs, labels in dataloader_train:
            outputs = model(inputs)
            # BCELoss needs targets shaped like the (batch, 1) outputs.
            loss = criterion(outputs, labels.reshape(-1, 1))
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            # Weight by batch size so a short final batch does not skew the mean.
            batch_size = labels.shape[0]
            loss_sum += loss.item() * batch_size
            samples_seen += batch_size

        if samples_seen == 0:
            raise ValueError("dataloader_train yielded no batches; nothing to train on")
        print(f"Epoch {epoch+1}/{epochs}, Loss: {loss_sum / samples_seen:.4f}")
    return model
def main():
    """Parse the command line, train the model and save its weights to model.pt."""
    parser = argparse.ArgumentParser(
        description='Train the gender classification network.'
    )
    # type=int makes argparse reject non-numeric values with a usage error.
    parser.add_argument(
        "--epochs", type=int, default=10, help="Number of training epochs."
    )
    args = parser.parse_args()

    train_dataset = load_data("gender_classification_train.csv")
    dataloader_train = DataLoader(train_dataset, batch_size=32, shuffle=True)
    model = train(args.epochs, dataloader_train)
    torch.save(model.state_dict(), 'model.pt')


if __name__ == "__main__":
    main()