11 changed files with 39 additions and 379 deletions
--- a/.DS_Store
+++ b/.DS_Store
--- a/18
+++ b/18
@ -1,6 +1,14 @@
 FROM ubuntu:latest
-
-RUN apt-get update && \
-    apt-get install -y python3-pip python3-dev && \
-    apt-get install -y build-essential && \
-    pip3 install pandas kaggle seaborn scikit-learn torch matplotlib \
+# OS tooling, then pip bootstrapped with --user (root => /root/.local), then the data libraries.
+RUN apt-get update \
+    && apt-get install -y git python3 python3-pip curl \
+    && rm -rf /var/lib/apt/lists/* \
+    && curl -O https://bootstrap.pypa.io/get-pip.py \
+    && python3 get-pip.py --user \
+    && rm get-pip.py \
+    && pip3 install --user --no-cache-dir kaggle pandas seaborn scikit-learn
+ENV PATH="/root/.local/bin:$PATH"
+WORKDIR /app
+COPY . /app
+# Only python3 is installed above; there is no bare `python` executable to exec.
+CMD ["python3", "create_dataset.py"]
--- a/53
+++ b/53
@ -1,56 +1,29 @@
-pipeline {
-   agent any
-   parameters {
+node {
+    stage('Preparation') { 
+        properties([
+            parameters([
                string(
-            defaultValue: 'wojciechbatruszewicz',
+                    defaultValue: 'bartekmalanka',
                    description: 'Kaggle username',
                    name: 'KAGGLE_USERNAME',
                    trim: false
-         )
+                ),
                password(
                    defaultValue: '',
                    description: 'Kaggle token taken from kaggle.json file, as described in https://github.com/Kaggle/kaggle-api#api-credentials',
                    name: 'KAGGLE_KEY'
                )
-      string(
-            defaultValue: '30',
-            description: 'dataset cutoff',
-            name: 'CUTOFF',
-            trim: false
-         )
+            ])
+        ])
    }
-   stages {
-      stage('Download dataset') {
-         steps {
-            checkout scm
-            sh 'ls -l'
+    stage('Build') {
+        // Run the maven build
        withEnv(["KAGGLE_USERNAME=${params.KAGGLE_USERNAME}",
                 "KAGGLE_KEY=${params.KAGGLE_KEY}" ]) {
-                sh 'kaggle datasets download -d elakiricoder/gender-classification-dataset'
-                sh 'unzip -o gender-classification-dataset.zip'
-            }
-         }
-      }
-      stage('Docker') {
-        steps {
-               script {
-                  def dockerImage = docker.build("docker-image", "./")
-                  dockerImage.inside {
+             sh 'kaggle datasets download -d elakiricoder/gender-classification-dataset > output.txt'
            sh 'ls -l'
-                        sh 'ls -l'
-                        sh 'python3 createDataset.py'
-                        archiveArtifacts 'gender_classification_train.csv'
-                        archiveArtifacts 'gender_classification_test.csv'
-                        archiveArtifacts 'gender_classification_val.csv'
-                     sh 'ls -l'
-                  }  
-            }
-        }
-      }           
-   }
-   post {
-        success {
-            build job: 'x1-training/main', wait: false
+             sh 'unzip -o gender-classification-dataset.zip' // download leaves only the .zip; the CSV archived below is expected to come from it
+             archiveArtifacts artifacts: 'gender_classification_v7.csv, output.txt'
        }
    }
 }
--- a/34
+++ b/34
@ -1,34 +0,0 @@
-pipeline {
-    agent any
-    parameters {
-        buildSelector(
-            name: 'BUILD_SELECTOR',
-            defaultSelector: lastSuccessful(),
-            description: 'A build to take the artifacts from'
-        )
-    }
-    stages {
-        stage('Copy artifacts') {
-            steps {
-                script {
-                    copyArtifacts(
-                        projectName: 'x1-create-dataset',
-                        selector: buildParameter('BUILD_SELECTOR'),
-                        target: './'
-                    )
-                }
-            }
-        }
-        stage('Run python file') {
-            steps {
-                    script {
-                    sh 'ls -l'
-                    docker.image('docker-image').inside {
-                        sh 'ls -l'
-                        sh 'python3 ./datasetStats.py'
-                    }
-                }
-            }
-        }
-    }
-}
--- a/46
+++ b/46
@ -1,46 +0,0 @@
-pipeline {
-    agent any
-    parameters {
-        buildSelector(
-            name: 'BUILD_SELECTOR',
-            defaultSelector: lastSuccessful(),
-            description: 'A build to take the artifacts from'
-        )
-        string(
-            name: 'EPOCHS',
-            description: 'Number of epochs',
-            defaultValue: '10'
-        )
-    }
-    stages {
-        stage('Copy artifacts') {
-            steps {
-                script {
-                    copyArtifacts(
-                        projectName: 'x1-training/main',
-                        selector: buildParameter('BUILD_SELECTOR'),
-                        target: './'
-                    )
-                    copyArtifacts(
-                        projectName: 'x1-create-dataset',
-                        selector: buildParameter('BUILD_SELECTOR'),
-                        target: './'
-                    )
-                }
-            }
-        }
-        stage('Save evaluation') {
-            steps {
-                    script {
-                    sh 'ls -l'
-                    docker.image('docker-image').inside {
-                            sh 'ls -l'
-                            sh 'python3 ./evaluate.py'
-                            archiveArtifacts 'plot.png'
-                            archiveArtifacts 'results.csv'
-                    }
-                }
-            }
-        }
-    }
-}
--- a/45
+++ b/45
@ -1,45 +0,0 @@
-pipeline {
-    agent any
-    parameters {
-        buildSelector(
-            name: 'BUILD_SELECTOR',
-            defaultSelector: lastSuccessful(),
-            description: 'A build to take the artifacts from'
-        )
-        string(
-            name: 'EPOCHS',
-            description: 'Number of epochs',
-            defaultValue: '10'
-        )
-    }
-    stages {
-        stage('Copy artifacts') {
-            steps {
-                script {
-                    copyArtifacts(
-                        projectName: 'x1-create-dataset',
-                        selector: buildParameter('BUILD_SELECTOR'),
-                        target: './'
-                    )
-                }
-            }
-        }
-        stage('Run training and save model') {
-            steps {
-                    script {
-                    sh 'ls -l'
-                    docker.image('docker-image').inside {
-                            sh 'ls -l'
-                            sh 'python3 ./train.py'
-                            archiveArtifacts 'model.pt'
-                    }
-                }
-            }
-        }
-    }
-    post {
-        success {
-            build job: 'x1-evaluation.eg/main', wait: false
-        }
-    }
-}
--- a/createDataset.py
+++ b/createDataset.py
@ -1,25 +0,0 @@
-import pandas as pd
-from sklearn.preprocessing import MinMaxScaler
-from sklearn.model_selection import train_test_split
-gender_classification = pd.read_csv('gender_classification_v7.csv')
-
-gender_classification_train_final, gender_classification_test = train_test_split(gender_classification, test_size=0.2, random_state=1)
-gender_classification_test_final, gender_classification_val_final = train_test_split(gender_classification_test, test_size=0.5, random_state=1)
-
-numeric_cols_train = gender_classification_train_final.select_dtypes(include='number').columns
-numeric_cols_test = gender_classification_test_final.select_dtypes(include='number').columns
-numeric_cols_val = gender_classification_val_final.select_dtypes(include='number').columns
-
-scaler = MinMaxScaler()
-
-gender_classification_train_final[numeric_cols_train] = scaler.fit_transform(gender_classification_train_final[numeric_cols_train])
-gender_classification_test_final[numeric_cols_test] = scaler.fit_transform(gender_classification_test_final[numeric_cols_test])
-gender_classification_val_final[numeric_cols_val] = scaler.fit_transform(gender_classification_val_final[numeric_cols_val])
-
-gender_classification_train_final = gender_classification_train_final.dropna()
-gender_classification_test_final = gender_classification_test_final.dropna()
-gender_classification_val_final = gender_classification_val_final.dropna()
-
-gender_classification_train_final.to_csv('gender_classification_train.csv', index=False)
-gender_classification_test_final.to_csv('gender_classification_test.csv', index=False)
-gender_classification_val_final.to_csv('gender_classification_val.csv', index=False)
--- a/datasetStats.py
+++ b/datasetStats.py
@ -1,9 +0,0 @@
-import pandas as pd
-
-home_loan_train = pd.read_csv('gender_classification_train.csv')
-home_loan_test = pd.read_csv('gender_classification_test.csv')
-home_loan_val = pd.read_csv('gender_classification_val.csv')
-
-home_loan_train.describe()
-home_loan_test.describe()
-home_loan_val.describe()
--- a/evaluate.py
+++ b/evaluate.py
@ -1,80 +0,0 @@
-import torch
-from train import MyNeuralNetwork, load_data
-from torch.utils.data import DataLoader
-import csv
-import os
-import matplotlib.pyplot as plt
-from typing import Tuple, List
-
-def evaluate_model() -> Tuple[List[float], float]:
-    model = MyNeuralNetwork()
-    model.load_state_dict(torch.load('model.pt'))
-    model.eval()
-    test_dataset = load_data("gender_classification_test.csv")
-    batch_size: int = 32
-    test_dataloader: DataLoader = DataLoader(test_dataset, batch_size=batch_size)
-    predictions = []
-    labels = []
-    get_label = lambda pred: 1 if pred >= 0.5 else 0
-    total = 0
-    correct = 0
-    with torch.no_grad():
-        for batch_data, batch_labels in test_dataloader:
-            batch_predictions = model(batch_data)
-            predicted_batch_labels = [get_label(prediction) for prediction in batch_predictions]
-            total += len(predicted_batch_labels)
-            batch_labels_list = list(map(int,batch_labels.tolist()))
-            correct += sum(x == y for x, y in zip(predicted_batch_labels, batch_labels_list))
-            predictions.extend(batch_predictions)
-            labels.extend(batch_labels)
-    accuracy = correct/total
-    return predictions, accuracy
-
-def save_predictions(predictions: list[float]) -> None:
-    filename = "results.csv"
-    column_name = "predict"
-    with open(filename, 'w', newline='') as file:
-        writer = csv.writer(file)
-        writer.writerow([column_name])
-        for result in predictions:
-            loan_decision = 1 if result.item() > 0.5 else 0
-            writer.writerow([loan_decision])
-
-def save_accuracy(accuracy):
-    filename = 'results.csv'
-    if os.path.exists(filename):
-        with open(filename, 'a') as file:
-            writer = csv.writer(file)
-            writer.writerow([accuracy])
-    else:
-        with open(filename, 'w') as file:
-            writer = csv.writer(file)
-            writer.writerow(['accuracy'])
-            writer.writerow([accuracy])
-        
-def plot_accuracy():
-    filename = 'results.csv'
-    accuracy_results = []
-    if os.path.exists(filename):
-        with open(filename, 'r') as file:
-            reader =  csv.reader(file)
-            for idx, row in enumerate(reader):
-                if idx == 0:
-                    continue
-                accuracy_results.append(float(row[0]))
-    iterations = list(map(str,range(1, len(accuracy_results)+1)))
-    plt.plot(iterations, accuracy_results)
-    plt.xlabel('build')
-    plt.ylabel('accuracy')
-    plt.title("Accuracies over builds.")
-    plt.savefig("plot.png")
-                
-def main():
-    predictions, accuracy = evaluate_model()
-    save_predictions(predictions)
-    save_accuracy(accuracy)
-    plot_accuracy()
-    
-    
-if __name__ == "__main__":
-    main()
--- a/model.pt
+++ b/model.pt
--- a/train.py
+++ b/train.py
@ -1,82 +0,0 @@
-import torch
-import torch.nn as nn
-import torch.optim as optim
-from torch.utils.data import DataLoader, TensorDataset
-import pandas as pd
-from sklearn.preprocessing import LabelBinarizer
-import numpy as np
-import argparse
-
-
-class MyNeuralNetwork(nn.Module):
-    def __init__(self, *args, **kwargs) -> None:
-        super(MyNeuralNetwork, self).__init__(*args, **kwargs)
-        self.fc1 = nn.Linear(7, 12)
-        self.relu = nn.ReLU()
-        self.fc1 = nn.Linear(7, 12)
-        self.relu = nn.ReLU()
-        self.fc2 = nn.Linear(12, 1)
-        self.sigmoid = nn.Sigmoid()
-    
-    def forward(self, x):
-        x = self.fc1(x)
-        x = self.relu(x)
-        x = self.fc2(x)
-        x = self.sigmoid(x)
-        return x
-
-def prepare_df_for_nn(df):
-    
-    id_column_name_list = [column for column in df.columns.to_list() if 'id' in  column.lower()]
-    if len(id_column_name_list) == 0:
-        pass
-    else:
-        df.drop(id_column_name_list[0], inplace=True, axis=1)
-    encoder =  LabelBinarizer()
-    df.reset_index(inplace=True)
-    for column in df.columns:
-        if str(df[column].dtype).lower() == 'object':
-            encoded_column = encoder.fit_transform(df[column])
-            df[column] = pd.Series(encoded_column.flatten(), dtype=pd.Int16Dtype)
-    return df
-
-def load_data(path):
-    df = pd.read_csv(path) 
-    train_dataset = prepare_df_for_nn(df)
-    x = train_dataset.iloc[:, :-1].values.astype(float)
-    y = train_dataset.iloc[:, -1].values.astype(float)
-    x_tensor = torch.tensor(x, dtype=torch.float32)
-    y_tensor = torch.tensor(y, dtype=torch.float32)
-    dataset = TensorDataset(x_tensor, y_tensor)
-    return dataset
-
-def train(epochs, dataloader_train):
-    model: MyNeuralNetwork = MyNeuralNetwork()
-    criterion: nn.BCELoss = nn.BCELoss()
-    optimizer = optim.Adam(model.parameters(), lr=0.001)
-    for epoch in range(epochs):
-        for inputs, labels in dataloader_train:
-            outputs = model(inputs)
-            labels = labels.reshape((labels.shape[0], 1))
-            loss = criterion(outputs, labels)
-            optimizer.zero_grad()
-            loss.backward()
-            optimizer.step()
-        print(f"Epoch {epoch+1}/{epochs}, Loss: {loss.item():.4f}")
-        
-    return model
-
-def main():
-    parser = argparse.ArgumentParser(description='A test program.')
-    parser.add_argument("--epochs", help="Prints the supplied argument.", default='10')
-    args = parser.parse_args()
-    config = vars(args)
-    epochs =  int(config["epochs"])  
-    train_dataset = load_data("gender_classification_train.csv")
-    batch_size = 32
-    dataloader_train = DataLoader(train_dataset, batch_size = batch_size, shuffle=True)
-    model = train(epochs, dataloader_train)
-    torch.save(model.state_dict(), 'model.pt')
-    
-if __name__ == "__main__":
-    main()