From 8c2f6e4e0fc033e466bdb5c82f61795121a871ef Mon Sep 17 00:00:00 2001 From: "bartosz.maslanka.consultant" Date: Wed, 28 Jun 2023 22:39:02 +0200 Subject: [PATCH] add jnks,etc --- .DS_Store | Bin 0 -> 6148 bytes Dockerfile | 18 ++++-------- Jenkinsfile | 79 +++++++++++++++++++++++++++++++++----------------- evaluate.py | 80 ++++++++++++++++++++++++++++++++++++++++++++++++++ model.pt | Bin 0 -> 1939 bytes train.py | 82 ++++++++++++++++++++++++++++++++++++++++++++++++++++ 6 files changed, 220 insertions(+), 39 deletions(-) create mode 100644 .DS_Store create mode 100644 evaluate.py create mode 100644 model.pt create mode 100644 train.py diff --git a/.DS_Store b/.DS_Store new file mode 100644 index 0000000000000000000000000000000000000000..19e6e6f8242d1f6daa3a726326a81637c7bdbc05 GIT binary patch literal 6148 zcmeHKOHKko5Pgj#fVg1k#$--F)Dw&Z7rONTDhv|B1PmyNu4eCHJdKBOrLU@50>ijf zW2%y>*PnV*^y|qq4IuO$vlE~Lph*|3wHPW)=EZlc7DpPypwAd$iPc3u9}e=JY#;cG z3dr6ykYR)sy4cvi`G7NgEy5A8`=O=_*he38W=_dj;0_by7}=hGacXZRP6uOV&$P}0 z*JS6o<-4-1u8G$Yzl}@fPg!k7R&~{JiPgjdSxd(1mA#bmTg+VWcS((E#(RlzfM@1s zT+uO3$(i6DPh=0a^}tz)$F7?z;0m|`zo-DuY_VqB(4#Bh3b+Dm1?2mX&;`?og`s{r z81xE2taDh6ZMjPbCyAIwEDSkA3q~axHN-1MFgnMRn3qN@42_NuFCQUR4)KN(v^t-k zXgWe_=+PB$1-2E~vu;Pu|MTzf|Jx+*xdN`hzfvI7`&a!QmlV&|oyEyno6v9QVv<)F lt|@HjQOsO9iVx{(98aV{Od}SC9HIFi0hPgnEAXQVd;%<|a9RKW literal 0 HcmV?d00001 diff --git a/Dockerfile b/Dockerfile index a207441..7d01f66 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,14 +1,6 @@ FROM ubuntu:latest -RUN apt-get update \ - && apt-get install -y git python3 python3-pip curl \ - && curl -O https://bootstrap.pypa.io/get-pip.py \ - && python3 get-pip.py --user \ - && rm get-pip.py \ - && pip3 install --user kaggle \ - && pip3 install --user pandas \ - && pip3 install --user seaborn \ - && pip3 install --user scikit-learn -ENV PATH="/root/.local/bin:$PATH" -WORKDIR /app -COPY . /app -CMD ["python", "create_dataset.py"] \ No newline at end of file + +RUN apt-get update && \ + apt-get install -y python3-pip python3-dev && \ + apt-get install -y build-essential && \ + pip3 install pandas kaggle seaborn scikit-learn torch matplotlib \ \ No newline at end of file diff --git a/Jenkinsfile b/Jenkinsfile index 20628d5..1ea80b8 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -1,29 +1,56 @@ -node { - stage('Preparation') { - properties([ - parameters([ - string( - defaultValue: 'bartekmalanka', - description: 'Kaggle username', - name: 'KAGGLE_USERNAME', - trim: false - ), - password( - defaultValue: '', - description: 'Kaggle token taken from kaggle.json file, as described in https://github.com/Kaggle/kaggle-api#api-credentials', - name: 'KAGGLE_KEY' - ) - ]) - ]) - } - stage('Build') { - // Run the maven build - withEnv(["KAGGLE_USERNAME=${params.KAGGLE_USERNAME}", - "KAGGLE_KEY=${params.KAGGLE_KEY}" ]) { - sh 'kaggle datasets download -d elakiricoder/gender-classification-dataset > output.txt' +pipeline { + agent any + parameters { + string( + defaultValue: 'wojciechbatruszewicz', + description: 'Kaggle username', + name: 'KAGGLE_USERNAME', + trim: false + ) + password( + defaultValue: '', + description: 'Kaggle token taken from kaggle.json file, as described in https://github.com/Kaggle/kaggle-api#api-credentials', + name: 'KAGGLE_KEY' + ) + string( + defaultValue: '30', + description: 'dataset cutoff', + name: 'CUTOFF', + trim: false + ) + } + stages { + stage('Download dataset') { + steps { + checkout scm sh 'ls -l' - archiveArtifacts artifacts: 'gender_classification_v7.csv, output.txt' - + withEnv(["KAGGLE_USERNAME=${params.KAGGLE_USERNAME}", + "KAGGLE_KEY=${params.KAGGLE_KEY}" ]) { + sh 'kaggle datasets download -d elakiricoder/gender-classification-dataset' + sh 'unzip -o gender-classification-dataset.zip' + } + } + } + stage('Docker') { + steps { + script { + def dockerImage = docker.build("docker-image", "./") + dockerImage.inside { + sh 'ls -l' + sh 'ls -l' + sh 'python3 createDataset.py' + archiveArtifacts 'gender_classification_train.csv' + archiveArtifacts 'gender_classification_test.csv' + archiveArtifacts 'gender_classification_val.csv' + sh 'ls -l' + } + } + } + } + } + post { + success { + build job: 'x1-training/main', wait: false } } -} \ No newline at end of file +} diff --git a/evaluate.py b/evaluate.py new file mode 100644 index 0000000..38c2fff --- /dev/null +++ b/evaluate.py @@ -0,0 +1,80 @@ +import torch +from train import MyNeuralNetwork, load_data +from torch.utils.data import DataLoader +import csv +import os +import matplotlib.pyplot as plt +from typing import Tuple, List + +def evaluate_model() -> Tuple[List[float], float]: + model = MyNeuralNetwork() + model.load_state_dict(torch.load('model.pt')) + model.eval() + test_dataset = load_data("gender_classification_test.csv") + batch_size: int = 32 + test_dataloader: DataLoader = DataLoader(test_dataset, batch_size=batch_size) + predictions = [] + labels = [] + get_label = lambda pred: 1 if pred >= 0.5 else 0 + total = 0 + correct = 0 + with torch.no_grad(): + for batch_data, batch_labels in test_dataloader: + batch_predictions = model(batch_data) + predicted_batch_labels = [get_label(prediction) for prediction in batch_predictions] + total += len(predicted_batch_labels) + batch_labels_list = list(map(int,batch_labels.tolist())) + correct += sum(x == y for x, y in zip(predicted_batch_labels, batch_labels_list)) + predictions.extend(batch_predictions) + labels.extend(batch_labels) + accuracy = correct/total + return predictions, accuracy + +def save_predictions(predictions: list[float]) -> None: + filename = "results.csv" + column_name = "predict" + with open(filename, 'w', newline='') as file: + writer = csv.writer(file) + writer.writerow([column_name]) + for result in predictions: + loan_decision = 1 if result.item() > 0.5 else 0 + writer.writerow([loan_decision]) + +def save_accuracy(accuracy): + filename = 'results.csv' + if os.path.exists(filename): + with open(filename, 'a') as file: + writer = csv.writer(file) + writer.writerow([accuracy]) + else: + with open(filename, 'w') as file: + writer = csv.writer(file) + writer.writerow(['accuracy']) + writer.writerow([accuracy]) + +def plot_accuracy(): + filename = 'results.csv' + accuracy_results = [] + if os.path.exists(filename): + with open(filename, 'r') as file: + reader = csv.reader(file) + for idx, row in enumerate(reader): + if idx == 0: + continue + accuracy_results.append(float(row[0])) + iterations = list(map(str,range(1, len(accuracy_results)+1))) + plt.plot(iterations, accuracy_results) + plt.xlabel('build') + plt.ylabel('accuracy') + plt.title("Accuracies over builds.") + plt.savefig("plot.png") + +def main(): + predictions, accuracy = evaluate_model() + save_predictions(predictions) + save_accuracy(accuracy) + plot_accuracy() + + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/model.pt b/model.pt new file mode 100644 index 0000000000000000000000000000000000000000..c17f2d355607f4647f8bdd99c0bab933a4167e08 GIT binary patch literal 1939 zcmb7Ee@t6d6n>@j2c@tLHgrShKyV;X+OiQM>wSTk;`Yes3X+0urL=EtrBH7BibgF$ zG8ki779|pcPBU>~fSSl~ThjJE{K2MVI++VY7Kk$@Mn&BIaEl2p-uK$!YX=%{a{FG- zJKuNDIo~x%B1hiTaq;5ofi!T`t#<)8$o|J#?eb>9Uy_ zy2b1EnA@}pD9*_-$uP=`JuP+`#)lyay~}N3s>4PFl$b==M-8PWF@9QGeNaZ#Fi^ga zA}A##>xK!Gj1qN2#SsS=Ng@N4Mwn#cQsu&FbXvSH*@4e;#5=-V8o16Oqm&F>&vSIa z4V0)0riAgS4BTjhnI##h`@`gc%;>7(|q^8*awI;3&3)S<)H!kP$u{i9#EW zVrvvf5qt!5h#0ty=jehNTojpM{G$xaGQw=1A3kPEz?qnvX~x0{1h?~9;>uww0dxKE zalVFaw8x93fq9h3%NGjg4)gu6z`OR@#1#2w77vCi|a%qwS&leZU8(r zQUg9osRYzuZ|LyIoUU$XJa|8p$JP$!p$eac{N?4FAk{DirdpT4LP9#YQlE}S_nrqA zM+0EEp%;{-WRmtXLY$tL+Go?lOS*4RkYt!656vv1~#AjE4W~v z3rtM1=+kvcY}xf?bo9V(w(>_dq`CG1@Z9b}eJ4gx@1HM#pU6>k>_P|9*S!fwKa&QY z>9>cfgO@@3#R4+eaUP6~Cz4djYk{3-Ea+0{EciJypd0!jnRKZ)khNQ-Hl?@BXOf|KED|<4lL{BWkVpSd<*RFKg8GPqt~at{om&ot$zNZWS-~z@Pc!Y zn{@%b)u0SxVQqOHWn$=AR0+XqL4+g$Z0|S zlA`jgSwTh=Gf+}#e#B*~xM0Ou8hN|-G~?mp))h>^EsW!?Q9{H25(M|zKuLIaDIp~| H|FG}h>q-w` literal 0 HcmV?d00001 diff --git a/train.py b/train.py new file mode 100644 index 0000000..ab2bbbd --- /dev/null +++ b/train.py @@ -0,0 +1,82 @@ +import torch +import torch.nn as nn +import torch.optim as optim +from torch.utils.data import DataLoader, TensorDataset +import pandas as pd +from sklearn.preprocessing import LabelBinarizer +import numpy as np +import argparse + + +class MyNeuralNetwork(nn.Module): + def __init__(self, *args, **kwargs) -> None: + super(MyNeuralNetwork, self).__init__(*args, **kwargs) + self.fc1 = nn.Linear(7, 12) + self.relu = nn.ReLU() + self.fc1 = nn.Linear(7, 12) + self.relu = nn.ReLU() + self.fc2 = nn.Linear(12, 1) + self.sigmoid = nn.Sigmoid() + + def forward(self, x): + x = self.fc1(x) + x = self.relu(x) + x = self.fc2(x) + x = self.sigmoid(x) + return x + +def prepare_df_for_nn(df): + + id_column_name_list = [column for column in df.columns.to_list() if 'id' in column.lower()] + if len(id_column_name_list) == 0: + pass + else: + df.drop(id_column_name_list[0], inplace=True, axis=1) + encoder = LabelBinarizer() + df.reset_index(inplace=True) + for column in df.columns: + if str(df[column].dtype).lower() == 'object': + encoded_column = encoder.fit_transform(df[column]) + df[column] = pd.Series(encoded_column.flatten(), dtype=pd.Int16Dtype) + return df + +def load_data(path): + df = pd.read_csv(path) + train_dataset = prepare_df_for_nn(df) + x = train_dataset.iloc[:, :-1].values.astype(float) + y = train_dataset.iloc[:, -1].values.astype(float) + x_tensor = torch.tensor(x, dtype=torch.float32) + y_tensor = torch.tensor(y, dtype=torch.float32) + dataset = TensorDataset(x_tensor, y_tensor) + return dataset + +def train(epochs, dataloader_train): + model: MyNeuralNetwork = MyNeuralNetwork() + criterion: nn.BCELoss = nn.BCELoss() + optimizer = optim.Adam(model.parameters(), lr=0.001) + for epoch in range(epochs): + for inputs, labels in dataloader_train: + outputs = model(inputs) + labels = labels.reshape((labels.shape[0], 1)) + loss = criterion(outputs, labels) + optimizer.zero_grad() + loss.backward() + optimizer.step() + print(f"Epoch {epoch+1}/{epochs}, Loss: {loss.item():.4f}") + + return model + +def main(): + parser = argparse.ArgumentParser(description='A test program.') + parser.add_argument("--epochs", help="Prints the supplied argument.", default='10') + args = parser.parse_args() + config = vars(args) + epochs = int(config["epochs"]) + train_dataset = load_data("gender_classification_train.csv") + batch_size = 32 + dataloader_train = DataLoader(train_dataset, batch_size = batch_size, shuffle=True) + model = train(epochs, dataloader_train) + torch.save(model.state_dict(), 'model.pt') + +if __name__ == "__main__": + main() \ No newline at end of file