Compare commits

..

No commits in common. "db75e5f227f6056104e31d9a8b3817dccc5419c6" and "493b2e6e37e4e45f0dcdb8fc50f5fe03645bad8b" have entirely different histories.

11 changed files with 39 additions and 379 deletions

BIN
.DS_Store vendored

Binary file not shown.

View File

@ -1,6 +1,14 @@
# NOTE(review): this span looks like a rendered diff hunk whose +/- markers were
# stripped, so lines from two revisions of the Dockerfile appear interleaved
# (one RUN ends in a continuation and is followed directly by another RUN).
# Confirm against the repository before building from this text.
# Base image: Ubuntu, using the floating `latest` tag.
FROM ubuntu:latest
RUN apt-get update && \
apt-get install -y python3-pip python3-dev && \
apt-get install -y build-essential && \
pip3 install pandas kaggle seaborn scikit-learn torch matplotlib \
RUN apt-get update \
&& apt-get install -y git python3 python3-pip curl \
&& curl -O https://bootstrap.pypa.io/get-pip.py \
&& python3 get-pip.py --user \
&& rm get-pip.py \
&& pip3 install --user kaggle \
&& pip3 install --user pandas \
&& pip3 install --user seaborn \
&& pip3 install --user scikit-learn
# The `--user` installs above presumably land under /root/.local (build runs as
# root unless a USER is set elsewhere); this makes their scripts reachable.
ENV PATH="/root/.local/bin:$PATH"
# The whole build context is copied into /app, which is also the working directory.
WORKDIR /app
COPY . /app
# NOTE(review): only python3/pip3 are installed above - confirm a `python`
# executable exists in the image, and that `create_dataset.py` is the intended
# script name (the pipelines on this page invoke `createDataset.py`).
CMD ["python", "create_dataset.py"]

53
Jenkinsfile vendored
View File

@ -1,56 +1,29 @@
// NOTE(review): this span looks like a rendered diff hunk whose +/- markers were
// stripped: a declarative `pipeline { ... }` revision and a scripted
// `node { ... }` revision appear interleaved, so braces do not balance as shown.
// Confirm against the repository before using this text as a Jenkinsfile.
pipeline {
agent any
parameters {
node {
stage('Preparation') {
properties([
parameters([
// Kaggle account name; exported to the download step as KAGGLE_USERNAME.
string(
defaultValue: 'wojciechbatruszewicz',
defaultValue: 'bartekmalanka',
description: 'Kaggle username',
name: 'KAGGLE_USERNAME',
trim: false
)
),
// Kaggle API key; a password parameter with an empty default.
password(
defaultValue: '',
description: 'Kaggle token taken from kaggle.json file, as described in https://github.com/Kaggle/kaggle-api#api-credentials',
name: 'KAGGLE_KEY'
)
// NOTE(review): CUTOFF is declared here but not referenced by any step visible in this span.
string(
defaultValue: '30',
description: 'dataset cutoff',
name: 'CUTOFF',
trim: false
)
])
])
}
stages {
stage('Download dataset') {
steps {
checkout scm
sh 'ls -l'
stage('Build') {
// Download and unpack the Kaggle dataset, passing the credentials to the CLI through environment variables.
withEnv(["KAGGLE_USERNAME=${params.KAGGLE_USERNAME}",
"KAGGLE_KEY=${params.KAGGLE_KEY}" ]) {
sh 'kaggle datasets download -d elakiricoder/gender-classification-dataset'
sh 'unzip -o gender-classification-dataset.zip'
}
}
}
stage('Docker') {
steps {
script {
// Build the image from the repository root and run the remaining steps inside a container of it.
def dockerImage = docker.build("docker-image", "./")
dockerImage.inside {
sh 'kaggle datasets download -d elakiricoder/gender-classification-dataset > output.txt'
sh 'ls -l'
sh 'ls -l'
// Produce the train/test/val CSV splits and archive them for downstream jobs.
sh 'python3 createDataset.py'
archiveArtifacts 'gender_classification_train.csv'
archiveArtifacts 'gender_classification_test.csv'
archiveArtifacts 'gender_classification_val.csv'
sh 'ls -l'
}
}
}
}
}
post {
success {
// Trigger the training job without waiting for it to finish.
build job: 'x1-training/main', wait: false
archiveArtifacts artifacts: 'gender_classification_v7.csv, output.txt'
}
}
}

View File

@ -1,34 +0,0 @@
// Declarative pipeline: copies the artifacts of the 'x1-create-dataset' job into
// the workspace and runs datasetStats.py inside a 'docker-image' container.
pipeline {
agent any
parameters {
// Chooses which upstream build to take artifacts from; defaults to the last successful one.
buildSelector(
name: 'BUILD_SELECTOR',
defaultSelector: lastSuccessful(),
description: 'A build to take the artifacts from'
)
}
stages {
stage('Copy artifacts') {
steps {
script {
// Artifacts are placed in the workspace root.
copyArtifacts(
projectName: 'x1-create-dataset',
selector: buildParameter('BUILD_SELECTOR'),
target: './'
)
}
}
}
stage('Run python file') {
steps {
script {
sh 'ls -l'
// Uses an image named 'docker-image'; this pipeline does not build it itself.
docker.image('docker-image').inside {
sh 'ls -l'
sh 'python3 ./datasetStats.py'
}
}
}
}
}
}

View File

@ -1,46 +0,0 @@
// Declarative pipeline: copies artifacts from the training and dataset jobs,
// runs evaluate.py inside a 'docker-image' container and archives its outputs.
pipeline {
agent any
parameters {
// Chooses which upstream build to take artifacts from; defaults to the last successful one.
buildSelector(
name: 'BUILD_SELECTOR',
defaultSelector: lastSuccessful(),
description: 'A build to take the artifacts from'
)
// NOTE(review): EPOCHS is declared but not referenced by any stage in this pipeline.
string(
name: 'EPOCHS',
description: 'Number of epochs',
defaultValue: '10'
)
}
stages {
stage('Copy artifacts') {
steps {
script {
// NOTE(review): the same BUILD_SELECTOR value is applied to two different
// upstream projects - confirm that is intended when a specific build is chosen.
copyArtifacts(
projectName: 'x1-training/main',
selector: buildParameter('BUILD_SELECTOR'),
target: './'
)
copyArtifacts(
projectName: 'x1-create-dataset',
selector: buildParameter('BUILD_SELECTOR'),
target: './'
)
}
}
}
stage('Save evaluation') {
steps {
script {
sh 'ls -l'
// Uses an image named 'docker-image'; this pipeline does not build it itself.
docker.image('docker-image').inside {
sh 'ls -l'
sh 'python3 ./evaluate.py'
// Archive the two files the evaluation step is expected to leave in the workspace.
archiveArtifacts 'plot.png'
archiveArtifacts 'results.csv'
}
}
}
}
}
}

View File

@ -1,45 +0,0 @@
// Declarative pipeline: copies the dataset artifacts from 'x1-create-dataset',
// trains the model inside a 'docker-image' container, archives model.pt and,
// on success, triggers the evaluation job.
pipeline {
    agent any
    parameters {
        // Chooses which upstream build to take artifacts from; defaults to the last successful one.
        buildSelector(
            name: 'BUILD_SELECTOR',
            defaultSelector: lastSuccessful(),
            description: 'A build to take the artifacts from'
        )
        // Forwarded to train.py as --epochs.
        string(
            name: 'EPOCHS',
            description: 'Number of epochs',
            defaultValue: '10'
        )
    }
    stages {
        stage('Copy artifacts') {
            steps {
                script {
                    // Artifacts are placed in the workspace root.
                    copyArtifacts(
                        projectName: 'x1-create-dataset',
                        selector: buildParameter('BUILD_SELECTOR'),
                        target: './'
                    )
                }
            }
        }
        stage('Run training and save model') {
            steps {
                script {
                    sh 'ls -l'
                    // Uses an image named 'docker-image'; this pipeline does not build it itself.
                    docker.image('docker-image').inside {
                        sh 'ls -l'
                        // Previously EPOCHS was declared but never passed on, so the
                        // parameter had no effect. Build parameters are exposed to `sh`
                        // as environment variables; the single-quoted Groovy string lets
                        // the shell expand it rather than interpolating user input into
                        // the command text.
                        sh 'python3 ./train.py --epochs "$EPOCHS"'
                        archiveArtifacts 'model.pt'
                    }
                }
            }
        }
    }
    post {
        success {
            // Trigger the evaluation job without waiting for it to finish.
            build job: 'x1-evaluation.eg/main', wait: false
        }
    }
}

View File

@ -1,25 +0,0 @@
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
# Split the gender-classification CSV into train/test/validation sets, min-max
# scale the numeric columns, drop incomplete rows and write one CSV per split.
gender_classification = pd.read_csv('gender_classification_v7.csv')

# 80% train; the remaining 20% is halved into test and validation.
gender_classification_train_final, gender_classification_test = train_test_split(
    gender_classification, test_size=0.2, random_state=1
)
gender_classification_test_final, gender_classification_val_final = train_test_split(
    gender_classification_test, test_size=0.5, random_state=1
)

# Work on explicit copies: the splits are selections from the source frame and
# assigning columns into them may trigger pandas' SettingWithCopyWarning.
gender_classification_train_final = gender_classification_train_final.copy()
gender_classification_test_final = gender_classification_test_final.copy()
gender_classification_val_final = gender_classification_val_final.copy()

# All three splits come from the same frame, so they share the same numeric columns.
numeric_cols_train = gender_classification_train_final.select_dtypes(include='number').columns

# Fit the scaler on the training split only and reuse it for test/validation.
# Re-fitting per split (the previous behaviour) scales each split with its own
# min/max, so identical raw values map to different scaled values across splits.
scaler = MinMaxScaler()
gender_classification_train_final[numeric_cols_train] = scaler.fit_transform(
    gender_classification_train_final[numeric_cols_train]
)
gender_classification_test_final[numeric_cols_train] = scaler.transform(
    gender_classification_test_final[numeric_cols_train]
)
gender_classification_val_final[numeric_cols_train] = scaler.transform(
    gender_classification_val_final[numeric_cols_train]
)

# Drop rows with missing values before writing.
gender_classification_train_final = gender_classification_train_final.dropna()
gender_classification_test_final = gender_classification_test_final.dropna()
gender_classification_val_final = gender_classification_val_final.dropna()

gender_classification_train_final.to_csv('gender_classification_train.csv', index=False)
gender_classification_test_final.to_csv('gender_classification_test.csv', index=False)
gender_classification_val_final.to_csv('gender_classification_val.csv', index=False)

View File

@ -1,9 +0,0 @@
import pandas as pd
# Print summary statistics for each dataset split.
# The previous version called describe() and discarded the result, so running
# the file as a script produced no output at all.
for split_name in ('train', 'test', 'val'):
    split_frame = pd.read_csv(f'gender_classification_{split_name}.csv')
    print(f'--- {split_name} ---')
    print(split_frame.describe())

View File

@ -1,80 +0,0 @@
import torch
from train import MyNeuralNetwork, load_data
from torch.utils.data import DataLoader
import csv
import os
import matplotlib.pyplot as plt
from typing import Tuple, List
def evaluate_model() -> Tuple[List[torch.Tensor], float]:
    """Evaluate the saved model on the test split.

    Loads the weights from ``model.pt`` into a ``MyNeuralNetwork``, runs it over
    ``gender_classification_test.csv`` in batches of 32 and thresholds each
    output at 0.5 to obtain a predicted label.

    Returns:
        A tuple ``(predictions, accuracy)``: ``predictions`` holds one model
        output tensor per sample (as yielded by iterating each batch output);
        ``accuracy`` is the fraction of samples whose thresholded prediction
        equals the integer label, or ``0.0`` when the test set is empty.
    """
    model = MyNeuralNetwork()
    model.load_state_dict(torch.load('model.pt'))
    model.eval()

    test_dataset = load_data("gender_classification_test.csv")
    batch_size: int = 32
    test_dataloader: DataLoader = DataLoader(test_dataset, batch_size=batch_size)

    predictions: List[torch.Tensor] = []
    total = 0
    correct = 0
    with torch.no_grad():
        for batch_data, batch_labels in test_dataloader:
            batch_predictions = model(batch_data)
            # Threshold at 0.5 (inclusive) and compare with the labels truncated
            # to integers, as the per-element int() conversion did before.
            predicted_labels = (batch_predictions >= 0.5).flatten().long()
            expected_labels = batch_labels.flatten().long()
            total += predicted_labels.numel()
            correct += int((predicted_labels == expected_labels).sum())
            # Keep the raw per-sample outputs; save_predictions reads them as tensors.
            predictions.extend(batch_predictions)

    # Guard against an empty test set instead of raising ZeroDivisionError.
    accuracy = correct / total if total else 0.0
    return predictions, accuracy
def save_predictions(predictions: list) -> None:
    """Write thresholded predictions to ``results.csv``.

    The file is created or overwritten with a ``predict`` header followed by
    one row per prediction containing ``1`` if the value is at least 0.5 and
    ``0`` otherwise.

    Args:
        predictions: Model outputs, either plain numbers or single-element
            tensors.
    """
    filename = "results.csv"
    column_name = "predict"
    with open(filename, 'w', newline='') as file:
        writer = csv.writer(file)
        writer.writerow([column_name])
        for result in predictions:
            # float() accepts both plain numbers and single-element tensors.
            # The threshold is inclusive so it agrees with the `>= 0.5` rule
            # evaluate_model uses when computing accuracy.
            predicted_label = 1 if float(result) >= 0.5 else 0
            writer.writerow([predicted_label])
def save_accuracy(accuracy):
    """Append an accuracy value to ``results.csv``.

    If the file does not exist yet (or exists but is empty) an ``accuracy``
    header row is written first; otherwise the value is appended as a new row.

    Args:
        accuracy: The value to record; written as a single-column CSV row.
    """
    filename = 'results.csv'
    # NOTE(review): this is the same file save_predictions writes, so appended
    # accuracies end up below the prediction rows - confirm that is intended.
    needs_header = not os.path.exists(filename) or os.path.getsize(filename) == 0
    # newline='' is what the csv module expects for files it writes to; it
    # prevents extra blank lines on platforms that translate line endings.
    with open(filename, 'a', newline='') as file:
        writer = csv.writer(file)
        if needs_header:
            writer.writerow(['accuracy'])
        writer.writerow([accuracy])
def plot_accuracy():
    """Plot the values stored in ``results.csv`` and save the chart as ``plot.png``.

    The first row of the file is treated as a header and skipped; the first
    cell of every following row is parsed as a float. If the file is missing,
    an empty series is plotted. The x axis is labelled 1..N.
    """
    results_path = 'results.csv'
    accuracy_results = []
    if os.path.exists(results_path):
        with open(results_path, 'r') as handle:
            rows = csv.reader(handle)
            # Skip the header row, if any.
            next(rows, None)
            accuracy_results = [float(record[0]) for record in rows]

    # String labels make matplotlib treat the x axis as categorical.
    build_labels = [str(number) for number in range(1, len(accuracy_results) + 1)]
    plt.plot(build_labels, accuracy_results)
    plt.xlabel('build')
    plt.ylabel('accuracy')
    plt.title("Accuracies over builds.")
    plt.savefig("plot.png")
def main():
    """Evaluate the model, then persist predictions, accuracy and the plot."""
    model_outputs, test_accuracy = evaluate_model()
    # Order matters: predictions (re)create results.csv, the accuracy is
    # appended to it, and the plot is drawn from the resulting file.
    save_predictions(model_outputs)
    save_accuracy(test_accuracy)
    plot_accuracy()


if __name__ == "__main__":
    main()

BIN
model.pt

Binary file not shown.

View File

@ -1,82 +0,0 @@
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
import pandas as pd
from sklearn.preprocessing import LabelBinarizer
import numpy as np
import argparse
class MyNeuralNetwork(nn.Module):
    """Small feed-forward binary classifier.

    Architecture: Linear(7 -> 12) -> ReLU -> Linear(12 -> 1) -> Sigmoid, so the
    forward pass maps a ``(batch, 7)`` input to a ``(batch, 1)`` output in
    the range (0, 1).
    """

    def __init__(self, *args, **kwargs) -> None:
        super(MyNeuralNetwork, self).__init__(*args, **kwargs)
        # Each layer is created exactly once; the earlier version built fc1 and
        # relu twice, with the second pair silently replacing the first.
        self.fc1 = nn.Linear(7, 12)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(12, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        """Run the network on ``x`` and return the sigmoid-activated output."""
        x = self.fc1(x)
        x = self.relu(x)
        x = self.fc2(x)
        x = self.sigmoid(x)
        return x
def prepare_df_for_nn(df):
    """Prepare a DataFrame for conversion to tensors (modifies ``df`` in place).

    Steps:
      1. Drop the first identifier column, if any. A column counts as an
         identifier when ``id`` is one of the ``_``/``-``/space separated
         tokens of its lower-cased name (e.g. ``id``, ``Loan_ID``).
      2. Reset the index to 0..n-1 without adding it as a column.
      3. Binarize every ``object``-dtype column with ``LabelBinarizer``.

    Args:
        df: The frame to prepare.

    Returns:
        The same ``df`` object, after the in-place modifications.
    """
    def _is_id_column(name):
        # Token match instead of a substring match: `'id' in name` also hit
        # ordinary feature names such as 'forehead_width_cm' or 'nose_wide'.
        tokens = str(name).lower().replace('-', '_').replace(' ', '_').split('_')
        return 'id' in tokens

    id_column_name_list = [column for column in df.columns.to_list() if _is_id_column(column)]
    if id_column_name_list:
        df.drop(id_column_name_list[0], inplace=True, axis=1)

    # drop=True keeps the old index out of the data; without it the row number
    # became an extra first column and was fed to the network as a feature.
    df.reset_index(drop=True, inplace=True)

    for column in df.columns:
        if str(df[column].dtype).lower() == 'object':
            # A fresh binarizer per column; fit_transform refits it anyway.
            encoder = LabelBinarizer()
            encoded_column = encoder.fit_transform(df[column])
            # "Int16" names the nullable integer dtype; passing the dtype class
            # itself relies on pandas instantiating it implicitly.
            df[column] = pd.Series(encoded_column.flatten(), index=df.index, dtype="Int16")
    return df
def load_data(path):
    """Read a CSV file and wrap it in a ``TensorDataset``.

    The frame is passed through ``prepare_df_for_nn``; every column except the
    last becomes the feature tensor and the last column becomes the target
    tensor, both as ``float32``.

    Args:
        path: Location of the CSV file to read.

    Returns:
        A ``TensorDataset`` of ``(features, targets)``.
    """
    frame = prepare_df_for_nn(pd.read_csv(path))
    feature_values = frame.iloc[:, :-1].values.astype(float)
    target_values = frame.iloc[:, -1].values.astype(float)
    features = torch.tensor(feature_values, dtype=torch.float32)
    targets = torch.tensor(target_values, dtype=torch.float32)
    return TensorDataset(features, targets)
def train(epochs, dataloader_train):
    """Train a new ``MyNeuralNetwork`` with binary cross-entropy and Adam.

    Args:
        epochs: Number of passes over ``dataloader_train``.
        dataloader_train: Iterable yielding ``(inputs, labels)`` batches.

    Returns:
        The trained model. After each epoch the loss of that epoch's last
        batch is printed.
    """
    network: MyNeuralNetwork = MyNeuralNetwork()
    loss_fn: nn.BCELoss = nn.BCELoss()
    adam = optim.Adam(network.parameters(), lr=0.001)

    for epoch_number in range(1, epochs + 1):
        for batch_inputs, batch_targets in dataloader_train:
            adam.zero_grad()
            batch_outputs = network(batch_inputs)
            # Give the labels a trailing dimension so they match the model output.
            column_targets = batch_targets.reshape((batch_targets.shape[0], 1))
            loss = loss_fn(batch_outputs, column_targets)
            loss.backward()
            adam.step()
        print(f"Epoch {epoch_number}/{epochs}, Loss: {loss.item():.4f}")

    return network
def main():
    """Command-line entry point: train the model and save its weights.

    Reads ``--epochs`` (default 10), trains on
    ``gender_classification_train.csv`` with shuffled batches of 32 and writes
    the model's state dict to ``model.pt``.
    """
    # The description and help text previously read 'A test program.' and
    # 'Prints the supplied argument.', which did not describe this script.
    parser = argparse.ArgumentParser(
        description='Train the gender-classification network and save its weights.'
    )
    # type=int lets argparse reject non-numeric input with a usage error
    # instead of failing later in a manual int() conversion.
    parser.add_argument("--epochs", type=int, default=10, help="Number of training epochs.")
    args = parser.parse_args()
    epochs = args.epochs

    train_dataset = load_data("gender_classification_train.csv")
    batch_size = 32
    dataloader_train = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    model = train(epochs, dataloader_train)
    torch.save(model.state_dict(), 'model.pt')


if __name__ == "__main__":
    main()