diff --git a/.DS_Store b/.DS_Store
new file mode 100644
index 0000000..19e6e6f
Binary files /dev/null and b/.DS_Store differ
diff --git a/Dockerfile b/Dockerfile
index a207441..7d01f66 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -1,14 +1,6 @@
 FROM ubuntu:latest
-RUN apt-get update \
-    && apt-get install -y git python3 python3-pip curl \
-    && curl -O https://bootstrap.pypa.io/get-pip.py \
-    && python3 get-pip.py --user \
-    && rm get-pip.py \
-    && pip3 install --user kaggle \
-    && pip3 install --user pandas \
-    && pip3 install --user seaborn \
-    && pip3 install --user scikit-learn
-ENV PATH="/root/.local/bin:$PATH"
-WORKDIR /app
-COPY . /app
-CMD ["python", "create_dataset.py"]
\ No newline at end of file
+
+RUN apt-get update && \
+    apt-get install -y python3-pip python3-dev && \
+    apt-get install -y build-essential && \
+    pip3 install pandas kaggle seaborn scikit-learn torch matplotlib
\ No newline at end of file
diff --git a/Jenkinsfile b/Jenkinsfile
index 20628d5..1ea80b8 100644
--- a/Jenkinsfile
+++ b/Jenkinsfile
@@ -1,29 +1,55 @@
-node {
-    stage('Preparation') {
-        properties([
-            parameters([
-                string(
-                    defaultValue: 'bartekmalanka',
-                    description: 'Kaggle username',
-                    name: 'KAGGLE_USERNAME',
-                    trim: false
-                ),
-                password(
-                    defaultValue: '',
-                    description: 'Kaggle token taken from kaggle.json file, as described in https://github.com/Kaggle/kaggle-api#api-credentials',
-                    name: 'KAGGLE_KEY'
-                )
-            ])
-        ])
-    }
-    stage('Build') {
-        // Run the maven build
-        withEnv(["KAGGLE_USERNAME=${params.KAGGLE_USERNAME}",
-                 "KAGGLE_KEY=${params.KAGGLE_KEY}" ]) {
-            sh 'kaggle datasets download -d elakiricoder/gender-classification-dataset > output.txt'
+pipeline {
+    agent any
+    parameters {
+        string(
+            defaultValue: 'wojciechbatruszewicz',
+            description: 'Kaggle username',
+            name: 'KAGGLE_USERNAME',
+            trim: false
+        )
+        password(
+            defaultValue: '',
+            description: 'Kaggle token taken from kaggle.json file, as described in https://github.com/Kaggle/kaggle-api#api-credentials',
+            name: 'KAGGLE_KEY'
+        )
+        string(
+            defaultValue: '30',
+            description: 'dataset cutoff',
+            name: 'CUTOFF',
+            trim: false
+        )
+    }
+    stages {
+        stage('Download dataset') {
+            steps {
+                checkout scm
                 sh 'ls -l'
-            archiveArtifacts artifacts: 'gender_classification_v7.csv, output.txt'
-
+                withEnv(["KAGGLE_USERNAME=${params.KAGGLE_USERNAME}",
+                         "KAGGLE_KEY=${params.KAGGLE_KEY}" ]) {
+                    sh 'kaggle datasets download -d elakiricoder/gender-classification-dataset'
+                    sh 'unzip -o gender-classification-dataset.zip'
+                }
+            }
+        }
+        stage('Docker') {
+            steps {
+                script {
+                    def dockerImage = docker.build("docker-image", "./")
+                    dockerImage.inside {
+                        sh 'ls -l'
+                        sh 'python3 createDataset.py'
+                        archiveArtifacts 'gender_classification_train.csv'
+                        archiveArtifacts 'gender_classification_test.csv'
+                        archiveArtifacts 'gender_classification_val.csv'
+                        sh 'ls -l'
+                    }
+                }
+            }
+        }
+    }
+    post {
+        success {
+            build job: 'x1-training/main', wait: false
         }
     }
-}
\ No newline at end of file
+}
diff --git a/evaluate.py b/evaluate.py
new file mode 100644
index 0000000..38c2fff
--- /dev/null
+++ b/evaluate.py
@@ -0,0 +1,80 @@
+import torch
+from train import MyNeuralNetwork, load_data
+from torch.utils.data import DataLoader
+import csv
+import os
+import matplotlib.pyplot as plt
+from typing import Tuple, List
+
+def evaluate_model() -> Tuple[List[torch.Tensor], float]:
+    model = MyNeuralNetwork()
+    model.load_state_dict(torch.load('model.pt'))
+    model.eval()
+    test_dataset = load_data("gender_classification_test.csv")
+    batch_size: int = 32
+    test_dataloader: DataLoader = DataLoader(test_dataset, batch_size=batch_size)
+    predictions = []
+    labels = []
+    get_label = lambda pred: 1 if pred >= 0.5 else 0
+    total = 0
+    correct = 0
+    with torch.no_grad():
+        for batch_data, batch_labels in test_dataloader:
+            batch_predictions = model(batch_data)
+            predicted_batch_labels = [get_label(prediction) for prediction in batch_predictions]
+            total += len(predicted_batch_labels)
+            batch_labels_list = list(map(int, batch_labels.tolist()))
+            correct += sum(x == y for x, y in zip(predicted_batch_labels, batch_labels_list))
+            predictions.extend(batch_predictions)
+            labels.extend(batch_labels)
+    accuracy = correct / total
+    return predictions, accuracy
+
+def save_predictions(predictions: List[torch.Tensor]) -> None:
+    filename = "results.csv"
+    column_name = "predict"
+    with open(filename, 'w', newline='') as file:
+        writer = csv.writer(file)
+        writer.writerow([column_name])
+        for result in predictions:
+            predicted_label = 1 if result.item() >= 0.5 else 0
+            writer.writerow([predicted_label])
+
+def save_accuracy(accuracy):
+    filename = 'results.csv'
+    if os.path.exists(filename):
+        with open(filename, 'a', newline='') as file:
+            writer = csv.writer(file)
+            writer.writerow([accuracy])
+    else:
+        with open(filename, 'w', newline='') as file:
+            writer = csv.writer(file)
+            writer.writerow(['accuracy'])
+            writer.writerow([accuracy])
+
+def plot_accuracy():
+    filename = 'results.csv'
+    accuracy_results = []
+    if os.path.exists(filename):
+        with open(filename, 'r') as file:
+            reader = csv.reader(file)
+            for idx, row in enumerate(reader):
+                if idx == 0:
+                    continue
+                accuracy_results.append(float(row[0]))
+    iterations = list(map(str, range(1, len(accuracy_results) + 1)))
+    plt.plot(iterations, accuracy_results)
+    plt.xlabel('build')
+    plt.ylabel('accuracy')
+    plt.title("Accuracies over builds.")
+    plt.savefig("plot.png")
+
+def main():
+    predictions, accuracy = evaluate_model()
+    save_predictions(predictions)
+    save_accuracy(accuracy)
+    plot_accuracy()
+
+
+if __name__ == "__main__":
+    main()
\ No newline at end of file
diff --git a/model.pt b/model.pt
new file mode 100644
index 0000000..c17f2d3
Binary files /dev/null and b/model.pt differ
diff --git a/train.py b/train.py
new file mode 100644
index 0000000..ab2bbbd
--- /dev/null
+++ b/train.py
@@ -0,0 +1,80 @@
+import torch
+import torch.nn as nn
+import torch.optim as optim
+from torch.utils.data import DataLoader, TensorDataset
+import pandas as pd
+from sklearn.preprocessing import LabelBinarizer
+import numpy as np
+import argparse
+
+
+class MyNeuralNetwork(nn.Module):
+    def __init__(self, *args, **kwargs) -> None:
+        super(MyNeuralNetwork, self).__init__(*args, **kwargs)
+        self.fc1 = nn.Linear(7, 12)
+        self.relu = nn.ReLU()
+        self.fc2 = nn.Linear(12, 1)
+        self.sigmoid = nn.Sigmoid()
+
+    def forward(self, x):
+        x = self.fc1(x)
+        x = self.relu(x)
+        x = self.fc2(x)
+        x = self.sigmoid(x)
+        return x
+
+def prepare_df_for_nn(df):
+
+    id_column_name_list = [column for column in df.columns.to_list() if 'id' in column.lower()]
+    if len(id_column_name_list) == 0:
+        pass
+    else:
+        df.drop(id_column_name_list[0], inplace=True, axis=1)
+    encoder = LabelBinarizer()
+    df.reset_index(drop=True, inplace=True)
+    for column in df.columns:
+        if str(df[column].dtype).lower() == 'object':
+            encoded_column = encoder.fit_transform(df[column])
+            df[column] = pd.Series(encoded_column.flatten(), dtype=pd.Int16Dtype())
+    return df
+
+def load_data(path):
+    df = pd.read_csv(path)
+    train_dataset = prepare_df_for_nn(df)
+    x = train_dataset.iloc[:, :-1].values.astype(float)
+    y = train_dataset.iloc[:, -1].values.astype(float)
+    x_tensor = torch.tensor(x, dtype=torch.float32)
+    y_tensor = torch.tensor(y, dtype=torch.float32)
+    dataset = TensorDataset(x_tensor, y_tensor)
+    return dataset
+
+def train(epochs, dataloader_train):
+    model: MyNeuralNetwork = MyNeuralNetwork()
+    criterion: nn.BCELoss = nn.BCELoss()
+    optimizer = optim.Adam(model.parameters(), lr=0.001)
+    for epoch in range(epochs):
+        for inputs, labels in dataloader_train:
+            outputs = model(inputs)
+            labels = labels.reshape((labels.shape[0], 1))
+            loss = criterion(outputs, labels)
+            optimizer.zero_grad()
+            loss.backward()
+            optimizer.step()
+        print(f"Epoch {epoch+1}/{epochs}, Loss: {loss.item():.4f}")
+
+    return model
+
+def main():
+    parser = argparse.ArgumentParser(description='Train the gender classification model.')
+    parser.add_argument("--epochs", help="Number of training epochs.", default='10')
+    args = parser.parse_args()
+    config = vars(args)
+    epochs = int(config["epochs"])
+    train_dataset = load_data("gender_classification_train.csv")
+    batch_size = 32
+    dataloader_train = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
+    model = train(epochs, dataloader_train)
+    torch.save(model.state_dict(), 'model.pt')
+
+if __name__ == "__main__":
+    main()
\ No newline at end of file
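
Usage note (a minimal local sketch, not part of the diff above): the new scripts run in sequence, with train.py writing model.pt and evaluate.py reading it to produce results.csv and plot.png. The snippet assumes the gender_classification_*.csv splits produced by createDataset.py are already in the working directory, as they are inside the Docker stage of the Jenkins pipeline.

# Local smoke test of the new scripts; assumes createDataset.py has already
# written gender_classification_train.csv and gender_classification_test.csv.
import subprocess

subprocess.run(["python3", "train.py", "--epochs", "2"], check=True)  # trains and saves model.pt
subprocess.run(["python3", "evaluate.py"], check=True)                # writes results.csv and plot.png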