Create dataset
This commit is contained in:
commit
c97a233bd3
6
Dockerfile
Normal file
6
Dockerfile
Normal file
@ -0,0 +1,6 @@
|
|||||||
|
# Image used by the Jenkins pipeline to run the dataset-preparation script.
#
# The base is pinned to an LTS release instead of `latest`: the build stays
# reproducible, and newer Ubuntu releases mark the system Python as
# "externally managed" (PEP 668), which makes a system-wide `pip3 install` fail.
FROM ubuntu:22.04

# OS packages and Python libraries are installed in a single layer so that the
# apt package lists are removed in the same layer that created them and the
# pip download cache never lands in the image.
RUN apt-get update && \
    apt-get install -y --no-install-recommends \
        build-essential \
        python3-dev \
        python3-pip && \
    rm -rf /var/lib/apt/lists/* && \
    pip3 install --no-cache-dir \
        kaggle \
        matplotlib \
        pandas \
        scikit-learn \
        seaborn \
        torch
|
68
Jenkinsfile
vendored
Normal file
68
Jenkinsfile
vendored
Normal file
@ -0,0 +1,68 @@
|
|||||||
|
// Pipeline that downloads the Kaggle gender-classification dataset, builds the
// project Docker image and runs createDataset.py inside it, archiving the
// resulting train/test/validation CSV files.
pipeline {
    agent any
    parameters {
        string(
            defaultValue: 'wojciechbatruszewicz',
            description: 'Kaggle username',
            name: 'KAGGLE_USERNAME',
            trim: false
        )
        password(
            defaultValue: '',
            description: 'Kaggle token taken from kaggle.json file, as described in https://github.com/Kaggle/kaggle-api#api-credentials',
            name: 'KAGGLE_KEY'
        )
        // NOTE(review): CUTOFF is declared but not read by any step in this file.
        string(
            defaultValue: '30',
            description: 'dataset cutoff',
            name: 'CUTOFF',
            trim: false
        )
    }
    stages {
        stage('Download dataset') {
            steps {
                checkout scm
                // NOTE(review): assumes the working files live in ./createDataset — confirm against the repository layout.
                dir ('./createDataset') {
                    sh 'ls -l'
                    // The Kaggle CLI reads its credentials from these two environment variables.
                    withEnv(["KAGGLE_USERNAME=${params.KAGGLE_USERNAME}",
                             "KAGGLE_KEY=${params.KAGGLE_KEY}" ]) {
                        sh 'kaggle datasets download -d elakiricoder/gender-classification-dataset'
                        // The CLI names the archive after the dataset slug, so that is the file to unpack.
                        sh 'unzip -o gender-classification-dataset.zip'
                    }
                }
            }
        }
        stage('Docker') {
            steps {
                script {
                    // NOTE(review): the build context is ./docker — confirm that the Dockerfile is located there.
                    def dockerImage = docker.build("docker-image", "./docker")
                    dockerImage.inside {
                        sh 'ls -l'
                        dir ('./createDataset') {
                            sh 'ls -l'
                            sh 'python3 ./createDataset.py'
                            // Keep the three generated splits as build artifacts.
                            archiveArtifacts 'gender_classification_train.csv'
                            archiveArtifacts 'gender_classification_test.csv'
                            archiveArtifacts 'gender_classification_val.csv'
                        }
                        sh 'ls -l'
                    }
                }
            }
        }

        // stage('Archive file') {
        //     steps {
        //         dir ('./createDataset') {
        //             archiveArtifacts artifacts: 'loan_sanction_shuffled.csv', fingerprint: true
        //         }
        //     }
        // }
    }
    // post {
    //     success {
    //         build job: 'z-s487179-training/main', wait: false
    //     }
    // }
}
|
25
createDataset.py
Normal file
25
createDataset.py
Normal file
@ -0,0 +1,25 @@
|
|||||||
|
import pandas as pd
|
||||||
|
from sklearn.preprocessing import MinMaxScaler
|
||||||
|
from sklearn.model_selection import train_test_split
|
||||||
|
# Load the raw dataset. The single-extension name is tried first (presumably
# the name of the file inside the Kaggle archive — verify); the doubled
# ".csv.csv" spelling originally used here is kept as a fallback so that any
# workspace that really contains such a file keeps working.
try:
    gender_classification = pd.read_csv('gender_classification_v7.csv')
except FileNotFoundError:
    gender_classification = pd.read_csv('gender_classification_v7.csv.csv')

# 80% of the rows go to training; the remaining 20% is split in half into the
# test and validation sets. random_state is fixed so the split is repeatable.
gender_classification_train_final, gender_classification_test = train_test_split(
    gender_classification, test_size=0.2, random_state=1)
gender_classification_test_final, gender_classification_val_final = train_test_split(
    gender_classification_test, test_size=0.5, random_state=1)

# Take independent copies so the column assignments below modify these frames
# themselves and do not trigger pandas' SettingWithCopyWarning.
gender_classification_train_final = gender_classification_train_final.copy()
gender_classification_test_final = gender_classification_test_final.copy()
gender_classification_val_final = gender_classification_val_final.copy()

# All three splits come from the same frame, so they share the same numeric columns.
numeric_cols_train = gender_classification_train_final.select_dtypes(include='number').columns

# Fit the scaler on the training split only and reuse those min/max values for
# the test and validation splits. Re-fitting on each split would scale the three
# files inconsistently and let evaluation data influence its own preprocessing.
# Test/validation values outside the training range can therefore fall outside [0, 1].
scaler = MinMaxScaler()

gender_classification_train_final[numeric_cols_train] = scaler.fit_transform(
    gender_classification_train_final[numeric_cols_train])
gender_classification_test_final[numeric_cols_train] = scaler.transform(
    gender_classification_test_final[numeric_cols_train])
gender_classification_val_final[numeric_cols_train] = scaler.transform(
    gender_classification_val_final[numeric_cols_train])

# Drop every row that still has a missing value in any column.
gender_classification_train_final = gender_classification_train_final.dropna()
gender_classification_test_final = gender_classification_test_final.dropna()
gender_classification_val_final = gender_classification_val_final.dropna()

# Write the three splits next to the script, without the pandas index column.
gender_classification_train_final.to_csv('gender_classification_train.csv', index=False)
gender_classification_test_final.to_csv('gender_classification_test.csv', index=False)
gender_classification_val_final.to_csv('gender_classification_val.csv', index=False)
|
Loading…
Reference in New Issue
Block a user