Compare commits
No commits in common. "db75e5f227f6056104e31d9a8b3817dccc5419c6" and "493b2e6e37e4e45f0dcdb8fc50f5fe03645bad8b" have entirely different histories.
db75e5f227 ... 493b2e6e37
Dockerfile (18 changes)
@@ -1,6 +1,14 @@
 FROM ubuntu:latest
-RUN apt-get update && \
-    apt-get install -y python3-pip python3-dev && \
-    apt-get install -y build-essential && \
-    pip3 install pandas kaggle seaborn scikit-learn torch matplotlib \
+RUN apt-get update \
+    && apt-get install -y git python3 python3-pip curl \
+    && curl -O https://bootstrap.pypa.io/get-pip.py \
+    && python3 get-pip.py --user \
+    && rm get-pip.py \
+    && pip3 install --user kaggle \
+    && pip3 install --user pandas \
+    && pip3 install --user seaborn \
+    && pip3 install --user scikit-learn
+ENV PATH="/root/.local/bin:$PATH"
+WORKDIR /app
+COPY . /app
+CMD ["python", "create_dataset.py"]
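The rewritten image installs everything with `pip3 install --user`, so the ENV line putting /root/.local/bin on PATH is what makes the kaggle CLI callable in the pipeline below. Note also that the CMD invokes `python`, while the image only installs `python3`; ubuntu:latest ships neither by default. A minimal smoke test for the image (an editorial sketch; check_image.py is not part of the repo):

# check_image.py -- hypothetical smoke test; run inside the container:
#   docker run --rm docker-image python3 check_image.py
import importlib.util

# find_spec locates a package without importing it, which matters for
# kaggle: importing it triggers authentication and fails without credentials.
for name in ('pandas', 'kaggle', 'seaborn', 'sklearn'):
    found = importlib.util.find_spec(name) is not None
    print(f'{name}: {"ok" if found else "MISSING"}')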
Jenkinsfile (vendored, 53 changes)
@@ -1,56 +1,29 @@
-pipeline {
-    agent any
-    parameters {
-        string(
-            defaultValue: 'wojciechbatruszewicz',
-            description: 'Kaggle username',
-            name: 'KAGGLE_USERNAME',
-            trim: false
-        )
-        password(
-            defaultValue: '',
-            description: 'Kaggle token taken from kaggle.json file, as described in https://github.com/Kaggle/kaggle-api#api-credentials',
-            name: 'KAGGLE_KEY'
-        )
-        string(
-            defaultValue: '30',
-            description: 'dataset cutoff',
-            name: 'CUTOFF',
-            trim: false
-        )
-    }
-    stages {
-        stage('Download dataset') {
-            steps {
-                checkout scm
-                sh 'ls -l'
-                withEnv(["KAGGLE_USERNAME=${params.KAGGLE_USERNAME}",
-                         "KAGGLE_KEY=${params.KAGGLE_KEY}" ]) {
-                    sh 'kaggle datasets download -d elakiricoder/gender-classification-dataset'
-                    sh 'unzip -o gender-classification-dataset.zip'
-                }
-            }
-        }
-        stage('Docker') {
-            steps {
-                script {
-                    def dockerImage = docker.build("docker-image", "./")
-                    dockerImage.inside {
-                        sh 'ls -l'
-                        sh 'ls -l'
-                        sh 'python3 createDataset.py'
-                        archiveArtifacts 'gender_classification_train.csv'
-                        archiveArtifacts 'gender_classification_test.csv'
-                        archiveArtifacts 'gender_classification_val.csv'
-                        sh 'ls -l'
-                    }
-                }
-            }
-        }
-    }
-    post {
-        success {
-            build job: 'x1-training/main', wait: false
-        }
-    }
-}
+node {
+    stage('Preparation') {
+        properties([
+            parameters([
+                string(
+                    defaultValue: 'bartekmalanka',
+                    description: 'Kaggle username',
+                    name: 'KAGGLE_USERNAME',
+                    trim: false
+                ),
+                password(
+                    defaultValue: '',
+                    description: 'Kaggle token taken from kaggle.json file, as described in https://github.com/Kaggle/kaggle-api#api-credentials',
+                    name: 'KAGGLE_KEY'
+                )
+            ])
+        ])
+    }
+    stage('Build') {
+        // Run the maven build
+        withEnv(["KAGGLE_USERNAME=${params.KAGGLE_USERNAME}",
+                 "KAGGLE_KEY=${params.KAGGLE_KEY}" ]) {
+            sh 'kaggle datasets download -d elakiricoder/gender-classification-dataset > output.txt'
+            sh 'ls -l'
+            archiveArtifacts artifacts: 'gender_classification_v7.csv, output.txt'
+        }
+    }
+}
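Both versions hand the Kaggle credentials to the CLI through the KAGGLE_USERNAME and KAGGLE_KEY environment variables rather than a kaggle.json file. The same download can be scripted with the kaggle Python package (a sketch, assuming both variables are exported; download_dataset.py is hypothetical):

# download_dataset.py -- a hypothetical stand-in for the
# `sh 'kaggle datasets download ...'` step of the pipeline.
import os

# withEnv([...]) in the Jenkinsfile is what puts these into the environment.
assert 'KAGGLE_USERNAME' in os.environ and 'KAGGLE_KEY' in os.environ

# Importing kaggle authenticates immediately from those variables.
import kaggle

kaggle.api.dataset_download_files(
    'elakiricoder/gender-classification-dataset',
    path='.',
    unzip=True,  # folds in the old pipeline's separate `unzip -o` step
)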
@@ -1,34 +0,0 @@
-pipeline {
-    agent any
-    parameters {
-        buildSelector(
-            name: 'BUILD_SELECTOR',
-            defaultSelector: lastSuccessful(),
-            description: 'A build to take the artifacts from'
-        )
-    }
-    stages {
-        stage('Copy artifacts') {
-            steps {
-                script {
-                    copyArtifacts(
-                        projectName: 'x1-create-dataset',
-                        selector: buildParameter('BUILD_SELECTOR'),
-                        target: './'
-                    )
-                }
-            }
-        }
-        stage('Run python file') {
-            steps {
-                script {
-                    sh 'ls -l'
-                    docker.image('docker-image').inside {
-                        sh 'ls -l'
-                        sh 'python3 ./datasetStats.py'
-                    }
-                }
-            }
-        }
-    }
-}
@@ -1,46 +0,0 @@
-pipeline {
-    agent any
-    parameters {
-        buildSelector(
-            name: 'BUILD_SELECTOR',
-            defaultSelector: lastSuccessful(),
-            description: 'A build to take the artifacts from'
-        )
-        string(
-            name: 'EPOCHS',
-            description: 'Number of epochs',
-            defaultValue: '10'
-        )
-    }
-    stages {
-        stage('Copy artifacts') {
-            steps {
-                script {
-                    copyArtifacts(
-                        projectName: 'x1-training/main',
-                        selector: buildParameter('BUILD_SELECTOR'),
-                        target: './'
-                    )
-                    copyArtifacts(
-                        projectName: 'x1-create-dataset',
-                        selector: buildParameter('BUILD_SELECTOR'),
-                        target: './'
-                    )
-                }
-            }
-        }
-        stage('Save evaluation') {
-            steps {
-                script {
-                    sh 'ls -l'
-                    docker.image('docker-image').inside {
-                        sh 'ls -l'
-                        sh 'python3 ./evaluate.py'
-                        archiveArtifacts 'plot.png'
-                        archiveArtifacts 'results.csv'
-                    }
-                }
-            }
-        }
-    }
-}
@@ -1,45 +0,0 @@
-pipeline {
-    agent any
-    parameters {
-        buildSelector(
-            name: 'BUILD_SELECTOR',
-            defaultSelector: lastSuccessful(),
-            description: 'A build to take the artifacts from'
-        )
-        string(
-            name: 'EPOCHS',
-            description: 'Number of epochs',
-            defaultValue: '10'
-        )
-    }
-    stages {
-        stage('Copy artifacts') {
-            steps {
-                script {
-                    copyArtifacts(
-                        projectName: 'x1-create-dataset',
-                        selector: buildParameter('BUILD_SELECTOR'),
-                        target: './'
-                    )
-                }
-            }
-        }
-        stage('Run training and save model') {
-            steps {
-                script {
-                    sh 'ls -l'
-                    docker.image('docker-image').inside {
-                        sh 'ls -l'
-                        sh 'python3 ./train.py'
-                        archiveArtifacts 'model.pt'
-                    }
-                }
-            }
-        }
-    }
-    post {
-        success {
-            build job: 'x1-evaluation.eg/main', wait: false
-        }
-    }
-}
@@ -1,25 +0,0 @@
-import pandas as pd
-from sklearn.preprocessing import MinMaxScaler
-from sklearn.model_selection import train_test_split
-gender_classification = pd.read_csv('gender_classification_v7.csv')
-
-gender_classification_train_final, gender_classification_test = train_test_split(gender_classification, test_size=0.2, random_state=1)
-gender_classification_test_final, gender_classification_val_final = train_test_split(gender_classification_test, test_size=0.5, random_state=1)
-
-numeric_cols_train = gender_classification_train_final.select_dtypes(include='number').columns
-numeric_cols_test = gender_classification_test_final.select_dtypes(include='number').columns
-numeric_cols_val = gender_classification_val_final.select_dtypes(include='number').columns
-
-scaler = MinMaxScaler()
-
-gender_classification_train_final[numeric_cols_train] = scaler.fit_transform(gender_classification_train_final[numeric_cols_train])
-gender_classification_test_final[numeric_cols_test] = scaler.fit_transform(gender_classification_test_final[numeric_cols_test])
-gender_classification_val_final[numeric_cols_val] = scaler.fit_transform(gender_classification_val_final[numeric_cols_val])
-
-gender_classification_train_final = gender_classification_train_final.dropna()
-gender_classification_test_final = gender_classification_test_final.dropna()
-gender_classification_val_final = gender_classification_val_final.dropna()
-
-gender_classification_train_final.to_csv('gender_classification_train.csv', index=False)
-gender_classification_test_final.to_csv('gender_classification_test.csv', index=False)
-gender_classification_val_final.to_csv('gender_classification_val.csv', index=False)
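This deleted script splits the data 80/10/10 (test_size=0.2, then 0.5 of the held-out 20%), but it calls fit_transform on all three subsets, re-fitting the MinMaxScaler on the test and validation data. The conventional leakage-free variant fits the scaler on the training split only; a minimal sketch (editorial, not the repo's code):

# A leakage-free version of the scaling step: fit once on train, reuse.
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

df = pd.read_csv('gender_classification_v7.csv')
train_df, rest = train_test_split(df, test_size=0.2, random_state=1)     # 80% train
test_df, val_df = train_test_split(rest, test_size=0.5, random_state=1)  # 10% / 10%
train_df, test_df, val_df = train_df.copy(), test_df.copy(), val_df.copy()

numeric_cols = train_df.select_dtypes(include='number').columns
scaler = MinMaxScaler().fit(train_df[numeric_cols])  # fit on the training split only

train_df[numeric_cols] = scaler.transform(train_df[numeric_cols])
test_df[numeric_cols] = scaler.transform(test_df[numeric_cols])  # transform, no refit
val_df[numeric_cols] = scaler.transform(val_df[numeric_cols])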
@@ -1,9 +0,0 @@
-import pandas as pd
-
-home_loan_train = pd.read_csv('gender_classification_train.csv')
-home_loan_test = pd.read_csv('gender_classification_test.csv')
-home_loan_val = pd.read_csv('gender_classification_val.csv')
-
-home_loan_train.describe()
-home_loan_test.describe()
-home_loan_val.describe()
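Run as a script, this datasetStats.py computes each describe() frame and discards it; bare expressions are echoed only in an interactive session, so `python3 ./datasetStats.py` in the stats pipeline printed nothing. (The home_loan_* names also look like leftovers from another dataset.) A sketch of a version that actually emits the summaries:

# Bare .describe() returns a DataFrame that a plain `python3` run discards;
# printing it is what makes the stats show up in the Jenkins log.
import pandas as pd

for split in ('train', 'test', 'val'):
    df = pd.read_csv(f'gender_classification_{split}.csv')
    print(f'--- gender_classification_{split} ---')
    print(df.describe())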
evaluate.py (80 changes)
@@ -1,80 +0,0 @@
-import torch
-from train import MyNeuralNetwork, load_data
-from torch.utils.data import DataLoader
-import csv
-import os
-import matplotlib.pyplot as plt
-from typing import Tuple, List
-
-def evaluate_model() -> Tuple[List[float], float]:
-    model = MyNeuralNetwork()
-    model.load_state_dict(torch.load('model.pt'))
-    model.eval()
-    test_dataset = load_data("gender_classification_test.csv")
-    batch_size: int = 32
-    test_dataloader: DataLoader = DataLoader(test_dataset, batch_size=batch_size)
-    predictions = []
-    labels = []
-    get_label = lambda pred: 1 if pred >= 0.5 else 0
-    total = 0
-    correct = 0
-    with torch.no_grad():
-        for batch_data, batch_labels in test_dataloader:
-            batch_predictions = model(batch_data)
-            predicted_batch_labels = [get_label(prediction) for prediction in batch_predictions]
-            total += len(predicted_batch_labels)
-            batch_labels_list = list(map(int, batch_labels.tolist()))
-            correct += sum(x == y for x, y in zip(predicted_batch_labels, batch_labels_list))
-            predictions.extend(batch_predictions)
-            labels.extend(batch_labels)
-    accuracy = correct/total
-    return predictions, accuracy
-
-def save_predictions(predictions: list[float]) -> None:
-    filename = "results.csv"
-    column_name = "predict"
-    with open(filename, 'w', newline='') as file:
-        writer = csv.writer(file)
-        writer.writerow([column_name])
-        for result in predictions:
-            loan_decision = 1 if result.item() > 0.5 else 0
-            writer.writerow([loan_decision])
-
-def save_accuracy(accuracy):
-    filename = 'results.csv'
-    if os.path.exists(filename):
-        with open(filename, 'a') as file:
-            writer = csv.writer(file)
-            writer.writerow([accuracy])
-    else:
-        with open(filename, 'w') as file:
-            writer = csv.writer(file)
-            writer.writerow(['accuracy'])
-            writer.writerow([accuracy])
-
-def plot_accuracy():
-    filename = 'results.csv'
-    accuracy_results = []
-    if os.path.exists(filename):
-        with open(filename, 'r') as file:
-            reader = csv.reader(file)
-            for idx, row in enumerate(reader):
-                if idx == 0:
-                    continue
-                accuracy_results.append(float(row[0]))
-    iterations = list(map(str, range(1, len(accuracy_results)+1)))
-    plt.plot(iterations, accuracy_results)
-    plt.xlabel('build')
-    plt.ylabel('accuracy')
-    plt.title("Accuracies over builds.")
-    plt.savefig("plot.png")
-
-def main():
-    predictions, accuracy = evaluate_model()
-    save_predictions(predictions)
-    save_accuracy(accuracy)
-    plot_accuracy()
-
-
-if __name__ == "__main__":
-    main()
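A quirk worth tracing in this file: save_predictions() rewrites results.csv with a "predict" header and one 0/1 row per test example, then save_accuracy() sees the file already exists and appends the accuracy as one more row, so plot_accuracy() ends up plotting per-example predictions together with the accuracy as if each were a build's accuracy. A sketch that separates the two when reading the file back (inspect_results.py is hypothetical):

# inspect_results.py -- a sketch of what evaluate.py leaves in results.csv.
import csv

with open('results.csv') as file:
    rows = list(csv.reader(file))

header = rows[0]               # ['predict'], written by save_predictions()
predictions = rows[1:-1]       # one 0/1 row per test example
accuracy = float(rows[-1][0])  # appended afterwards by save_accuracy()
print(f'{len(predictions)} predictions, accuracy = {accuracy}')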
train.py (82 changes)
@@ -1,82 +0,0 @@
-import torch
-import torch.nn as nn
-import torch.optim as optim
-from torch.utils.data import DataLoader, TensorDataset
-import pandas as pd
-from sklearn.preprocessing import LabelBinarizer
-import numpy as np
-import argparse
-
-
-class MyNeuralNetwork(nn.Module):
-    def __init__(self, *args, **kwargs) -> None:
-        super(MyNeuralNetwork, self).__init__(*args, **kwargs)
-        self.fc1 = nn.Linear(7, 12)
-        self.relu = nn.ReLU()
-        self.fc1 = nn.Linear(7, 12)
-        self.relu = nn.ReLU()
-        self.fc2 = nn.Linear(12, 1)
-        self.sigmoid = nn.Sigmoid()
-
-    def forward(self, x):
-        x = self.fc1(x)
-        x = self.relu(x)
-        x = self.fc2(x)
-        x = self.sigmoid(x)
-        return x
-
-def prepare_df_for_nn(df):
-
-    id_column_name_list = [column for column in df.columns.to_list() if 'id' in column.lower()]
-    if len(id_column_name_list) == 0:
-        pass
-    else:
-        df.drop(id_column_name_list[0], inplace=True, axis=1)
-    encoder = LabelBinarizer()
-    df.reset_index(inplace=True)
-    for column in df.columns:
-        if str(df[column].dtype).lower() == 'object':
-            encoded_column = encoder.fit_transform(df[column])
-            df[column] = pd.Series(encoded_column.flatten(), dtype=pd.Int16Dtype)
-    return df
-
-def load_data(path):
-    df = pd.read_csv(path)
-    train_dataset = prepare_df_for_nn(df)
-    x = train_dataset.iloc[:, :-1].values.astype(float)
-    y = train_dataset.iloc[:, -1].values.astype(float)
-    x_tensor = torch.tensor(x, dtype=torch.float32)
-    y_tensor = torch.tensor(y, dtype=torch.float32)
-    dataset = TensorDataset(x_tensor, y_tensor)
-    return dataset
-
-def train(epochs, dataloader_train):
-    model: MyNeuralNetwork = MyNeuralNetwork()
-    criterion: nn.BCELoss = nn.BCELoss()
-    optimizer = optim.Adam(model.parameters(), lr=0.001)
-    for epoch in range(epochs):
-        for inputs, labels in dataloader_train:
-            outputs = model(inputs)
-            labels = labels.reshape((labels.shape[0], 1))
-            loss = criterion(outputs, labels)
-            optimizer.zero_grad()
-            loss.backward()
-            optimizer.step()
-        print(f"Epoch {epoch+1}/{epochs}, Loss: {loss.item():.4f}")
-
-    return model
-
-def main():
-    parser = argparse.ArgumentParser(description='A test program.')
-    parser.add_argument("--epochs", help="Prints the supplied argument.", default='10')
-    args = parser.parse_args()
-    config = vars(args)
-    epochs = int(config["epochs"])
-    train_dataset = load_data("gender_classification_train.csv")
-    batch_size = 32
-    dataloader_train = DataLoader(train_dataset, batch_size = batch_size, shuffle=True)
-    model = train(epochs, dataloader_train)
-    torch.save(model.state_dict(), 'model.pt')
-
-if __name__ == "__main__":
-    main()
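The deleted train.py fixes the architecture at 7 input features, 12 hidden units, and 1 sigmoid output; the duplicated fc1/relu assignments in __init__ are in the original and merely rebind the same attributes, so they do not change the network. A minimal sketch of that I/O contract (shape_check.py is not part of the repo):

# shape_check.py -- hypothetical; assumes the deleted train.py is importable.
import torch
from train import MyNeuralNetwork

model = MyNeuralNetwork()
batch = torch.randn(32, 7)   # 32 examples, 7 features
out = model(batch)           # forward: fc1 -> relu -> fc2 -> sigmoid
assert out.shape == (32, 1)
assert bool(((out >= 0) & (out <= 1)).all())  # probabilities, as BCELoss expects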