IUM_03 - Add simulation of data preprocessing - delete first column, split into train, test, dev sets
This commit is contained in:
parent
c1f418ce97
commit
93c4d885d8
4
Jenkinsfile
vendored
4
Jenkinsfile
vendored
@ -14,7 +14,7 @@ pipeline {
|
||||
)
|
||||
string (
|
||||
name: 'CUTOFF',
|
||||
defaultValue: '10',
|
||||
defaultValue: '500',
|
||||
description: 'Get only the first CUTOFF rows of the dataset'
|
||||
)
|
||||
}
|
||||
@ -31,7 +31,7 @@ pipeline {
|
||||
withEnv(["KAGGLE_USERNAME=${params.KAGGLE_USERNAME}", "KAGGLE_KEY=${params.KAGGLE_KEY}", "CUTOFF=${params.CUTOFF}"]) {
|
||||
sh "chmod +x download_dataset.sh"
|
||||
sh "./download_dataset.sh $CUTOFF"
|
||||
archiveArtifacts artifacts: 'data.csv', onlyIfSuccessful: true
|
||||
archiveArtifacts artifacts: 'data.csv,train.csv,dev.csv,test.csv', onlyIfSuccessful: true
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -1,4 +1,28 @@
|
||||
#!/bin/bash
|
||||
# Download dataset from kaggle
|
||||
kaggle datasets download -d uciml/breast-cancer-wisconsin-data
|
||||
|
||||
# Unzip dataset -> data.csv
|
||||
unzip -o breast-cancer-wisconsin-data.zip
|
||||
head -n "$1" data.csv > data.csv.tmp && mv data.csv.tmp data.csv
|
||||
|
||||
# Remove id column
|
||||
cut -d, -f2- data.csv > data.csv.tmp && mv data.csv.tmp data.csv
|
||||
|
||||
# Keep only the first CUTOFF rows
|
||||
head -n "$1" data.csv > data.csv.tmp && mv data.csv.tmp data.csv
|
||||
|
||||
# Use the smaller of CUTOFF and the actual number of rows in data.csv as the data size
|
||||
if [ $1 -gt $(wc -l < data.csv) ]; then
|
||||
data_size=$(wc -l < data.csv)
|
||||
else
|
||||
data_size=$1
|
||||
fi
|
||||
|
||||
# Split data into train (first ~80% of rows), dev and test (two halves of the last ~20%)
|
||||
head -n $(expr $data_size / 10 \* 8) data.csv > train.csv
|
||||
tail -n $(expr $data_size / 10 \* 2) data.csv > dev.csv.tmp
|
||||
|
||||
dev_size=$(wc -l < dev.csv.tmp)
|
||||
|
||||
head -n $(expr $dev_size / 10 \* 5) dev.csv.tmp > dev.csv
|
||||
tail -n $(expr $dev_size / 10 \* 5) dev.csv.tmp > test.csv && rm dev.csv.tmp
|
Loading…
Reference in New Issue
Block a user