From c97a233bd3529c9bc212e3ce83c152548472807d Mon Sep 17 00:00:00 2001
From: wojciechbatruszewicz
Date: Mon, 26 Jun 2023 18:49:28 +0200
Subject: [PATCH] Create dataset

---
Review note: the file contents below were amended after this commit was
exported, so the blob ids on the "index" lines refer to the original revision
and no longer match the amended contents.

 Dockerfile       |  8 ++++++
 Jenkinsfile      | 71 ++++++++++++++++++++++++++++++++++++++++++++++++
 createDataset.py | 37 ++++++++++++++++++++++++
 3 files changed, 116 insertions(+)
 create mode 100644 Dockerfile
 create mode 100644 Jenkinsfile
 create mode 100644 createDataset.py

diff --git a/Dockerfile b/Dockerfile
new file mode 100644
index 0000000..7d01f66
--- /dev/null
+++ b/Dockerfile
@@ -0,0 +1,8 @@
+# Base image is pinned: Ubuntu releases newer than 22.04 mark the system Python
+# as externally managed (PEP 668), so the system-wide pip3 install below fails there.
+FROM ubuntu:22.04
+
+RUN apt-get update && \
+    apt-get install -y python3-pip python3-dev && \
+    apt-get install -y build-essential && \
+    pip3 install pandas kaggle seaborn scikit-learn torch matplotlib
diff --git a/Jenkinsfile b/Jenkinsfile
new file mode 100644
index 0000000..99c5cac
--- /dev/null
+++ b/Jenkinsfile
@@ -0,0 +1,71 @@
+pipeline {
+    agent any
+    parameters {
+        string(
+            defaultValue: 'wojciechbatruszewicz',
+            description: 'Kaggle username',
+            name: 'KAGGLE_USERNAME',
+            trim: false
+        )
+        password(
+            defaultValue: '',
+            description: 'Kaggle token taken from kaggle.json file, as described in https://github.com/Kaggle/kaggle-api#api-credentials',
+            name: 'KAGGLE_KEY'
+        )
+        string(
+            defaultValue: '30',
+            description: 'dataset cutoff',
+            name: 'CUTOFF',
+            trim: false
+        )
+    }
+    stages {
+        stage('Download dataset') {
+            steps {
+                checkout scm
+                dir ('./createDataset') {
+                    sh 'ls -l'
+                    withEnv(["KAGGLE_USERNAME=${params.KAGGLE_USERNAME}",
+                            "KAGGLE_KEY=${params.KAGGLE_KEY}" ]) {
+                        sh 'kaggle datasets download -d elakiricoder/gender-classification-dataset'
+                        // The Kaggle CLI names the downloaded archive after the dataset slug.
+                        sh 'unzip -o gender-classification-dataset.zip'
+                    }
+                }
+            }
+        }
+        stage('Docker') {
+            steps {
+                script {
+                    // The Dockerfile is added in the repository root, so the root is the build context.
+                    def dockerImage = docker.build("docker-image", ".")
+                    dockerImage.inside {
+                        sh 'ls -l'
+                        dir ('./createDataset') {
+                            sh 'ls -l'
+                            // The script sits in the repository root; the unzipped data is in this directory.
+                            sh 'python3 ../createDataset.py'
+                            archiveArtifacts 'gender_classification_train.csv'
+                            archiveArtifacts 'gender_classification_test.csv'
+                            archiveArtifacts 'gender_classification_val.csv'
+                        }
+                        sh 'ls -l'
+                    }
+                }
+            }
+        }
+
+        // stage('Archive file') {
+        //     steps {
+        //         dir ('./createDataset') {
+        //             archiveArtifacts artifacts: 'loan_sanction_shuffled.csv', fingerprint: true\
+        //         }
+        //     }
+        // }
+    }
+//    post {
+//        success {
+//            build job: 'z-s487179-training/main', wait: false
+//        }
+//    }
+}
diff --git a/createDataset.py b/createDataset.py
new file mode 100644
index 0000000..92a3558
--- /dev/null
+++ b/createDataset.py
@@ -0,0 +1,37 @@
+"""Split the gender classification dataset into scaled train/test/validation CSV files."""
+import pandas as pd
+from sklearn.model_selection import train_test_split
+from sklearn.preprocessing import MinMaxScaler
+
+# CSV name as shipped in the Kaggle archive elakiricoder/gender-classification-dataset.
+gender_classification = pd.read_csv('gender_classification_v7.csv')
+
+# 80% train; the remaining 20% is halved into the test and validation sets.
+gender_classification_train_final, gender_classification_test = train_test_split(
+    gender_classification, test_size=0.2, random_state=1)
+gender_classification_test_final, gender_classification_val_final = train_test_split(
+    gender_classification_test, test_size=0.5, random_state=1)
+
+# Take explicit copies so the column assignments below modify independent frames.
+gender_classification_train_final = gender_classification_train_final.copy()
+gender_classification_test_final = gender_classification_test_final.copy()
+gender_classification_val_final = gender_classification_val_final.copy()
+
+# All splits are rows of the same frame, so they share the same numeric columns.
+numeric_cols = gender_classification.select_dtypes(include='number').columns
+
+# Fit the scaler on the training split only and reuse its min/max for the other
+# splits; refitting per split would map each split onto its own range, so the
+# same raw value would be scaled differently in train, test and validation.
+scaler = MinMaxScaler()
+gender_classification_train_final[numeric_cols] = scaler.fit_transform(gender_classification_train_final[numeric_cols])
+gender_classification_test_final[numeric_cols] = scaler.transform(gender_classification_test_final[numeric_cols])
+gender_classification_val_final[numeric_cols] = scaler.transform(gender_classification_val_final[numeric_cols])
+
+gender_classification_train_final = gender_classification_train_final.dropna()
+gender_classification_test_final = gender_classification_test_final.dropna()
+gender_classification_val_final = gender_classification_val_final.dropna()
+
+gender_classification_train_final.to_csv('gender_classification_train.csv', index=False)
+gender_classification_test_final.to_csv('gender_classification_test.csv', index=False)
+gender_classification_val_final.to_csv('gender_classification_val.csv', index=False)