Create dataset

This commit is contained in:
wojciechbatruszewicz 2023-06-26 18:49:28 +02:00
commit c97a233bd3
3 changed files with 99 additions and 0 deletions

6
Dockerfile Normal file
View File

@ -0,0 +1,6 @@
# Image in which the pipeline runs the dataset-preparation script.
FROM ubuntu:latest
# Install pip, the Python headers and a compiler toolchain, then the Python
# packages. The last line must NOT end with a backslash: a dangling line
# continuation would swallow whatever instruction is added after it.
RUN apt-get update && \
    apt-get install -y python3-pip python3-dev && \
    apt-get install -y build-essential && \
    pip3 install pandas kaggle seaborn scikit-learn torch matplotlib

68
Jenkinsfile vendored Normal file
View File

@ -0,0 +1,68 @@
// Declarative pipeline: downloads the Kaggle gender-classification dataset,
// then builds a Docker image and runs createDataset.py inside it, archiving
// the resulting train/test/val CSV files.
pipeline {
    agent any
    parameters {
        string(
            defaultValue: 'wojciechbatruszewicz',
            description: 'Kaggle username',
            name: 'KAGGLE_USERNAME',
            trim: false
        )
        password(
            defaultValue: '',
            description: 'Kaggle token taken from kaggle.json file, as described in https://github.com/Kaggle/kaggle-api#api-credentials',
            name: 'KAGGLE_KEY'
        )
        // NOTE(review): CUTOFF is not referenced by any stage in this file.
        string(
            defaultValue: '30',
            description: 'dataset cutoff',
            name: 'CUTOFF',
            trim: false
        )
    }
    stages {
        stage('Download dataset') {
            steps {
                checkout scm
                dir ('./createDataset') {
                    sh 'ls -l'
                    // Expose the credentials as environment variables for the kaggle CLI.
                    withEnv(["KAGGLE_USERNAME=${params.KAGGLE_USERNAME}",
                             "KAGGLE_KEY=${params.KAGGLE_KEY}" ]) {
                        // --unzip lets the CLI extract the archive itself, so the
                        // pipeline does not depend on the name of the downloaded
                        // zip (the CLI names it after the dataset slug, not
                        // 'gender_classification.zip').
                        sh 'kaggle datasets download -d elakiricoder/gender-classification-dataset --unzip'
                    }
                }
            }
        }
        stage('Docker') {
            steps {
                script {
                    // NOTE(review): the build context is ./docker -- confirm the
                    // Dockerfile actually lives in that directory.
                    def dockerImage = docker.build("docker-image", "./docker")
                    dockerImage.inside {
                        sh 'ls -l'
                        // NOTE(review): createDataset.py is expected inside
                        // ./createDataset -- confirm against the repository layout.
                        dir ('./createDataset') {
                            sh 'ls -l'
                            sh 'python3 ./createDataset.py'
                            archiveArtifacts 'gender_classification_train.csv'
                            archiveArtifacts 'gender_classification_test.csv'
                            archiveArtifacts 'gender_classification_val.csv'
                        }
                        sh 'ls -l'
                    }
                }
            }
        }
        // stage('Archive file') {
        //     steps {
        //         dir ('./createDataset') {
        //             archiveArtifacts artifacts: 'loan_sanction_shuffled.csv', fingerprint: true
        //         }
        //     }
        // }
    }
    // post {
    //     success {
    //         build job: 'z-s487179-training/main', wait: false
    //     }
    // }
}

25
createDataset.py Normal file
View File

@ -0,0 +1,25 @@
"""Split the gender-classification CSV into train/test/validation sets.

The data is split 80/10/10 (train/test/validation) with a fixed random state.
Numeric columns are min-max scaled using statistics learned from the training
split only, rows containing missing values are dropped, and each split is
written to its own CSV file in the current working directory.
"""
from pathlib import Path

import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split

# Candidate input names, tried in order. The doubled '.csv.csv' name is what
# this script originally read and looks like a typo; it is kept as a fallback
# so existing setups keep working.
INPUT_CANDIDATES = (
    'gender_classification_v7.csv',
    'gender_classification_v7.csv.csv',
)

input_path = next((name for name in INPUT_CANDIDATES if Path(name).exists()), None)
if input_path is None:
    raise FileNotFoundError(
        'None of the expected input files exist: ' + ', '.join(INPUT_CANDIDATES)
    )

gender_classification = pd.read_csv(input_path)

# 80% train; the remaining 20% is halved into test and validation.
gender_classification_train_final, gender_classification_test = train_test_split(
    gender_classification, test_size=0.2, random_state=1
)
gender_classification_test_final, gender_classification_val_final = train_test_split(
    gender_classification_test, test_size=0.5, random_state=1
)

# Work on explicit copies so the column assignments below modify independent
# frames rather than slices of the original DataFrame.
gender_classification_train_final = gender_classification_train_final.copy()
gender_classification_test_final = gender_classification_test_final.copy()
gender_classification_val_final = gender_classification_val_final.copy()

# All splits come from the same frame, so they share the same numeric columns.
numeric_cols = gender_classification_train_final.select_dtypes(include='number').columns

# Fit the scaler on the training split only and reuse it for the held-out
# splits: refitting per split would scale each split differently and leak
# test/validation statistics into preprocessing.
scaler = MinMaxScaler()
gender_classification_train_final[numeric_cols] = scaler.fit_transform(
    gender_classification_train_final[numeric_cols]
)
gender_classification_test_final[numeric_cols] = scaler.transform(
    gender_classification_test_final[numeric_cols]
)
gender_classification_val_final[numeric_cols] = scaler.transform(
    gender_classification_val_final[numeric_cols]
)

# Drop rows with missing values before writing the splits out.
gender_classification_train_final = gender_classification_train_final.dropna()
gender_classification_test_final = gender_classification_test_final.dropna()
gender_classification_val_final = gender_classification_val_final.dropna()

gender_classification_train_final.to_csv('gender_classification_train.csv', index=False)
gender_classification_test_final.to_csv('gender_classification_test.csv', index=False)
gender_classification_val_final.to_csv('gender_classification_val.csv', index=False)