IUM_03 - Add simulation of data preprocessing - delete first column, split into train, test, dev sets
This commit is contained in:
parent
c1f418ce97
commit
93c4d885d8
4
Jenkinsfile
vendored
4
Jenkinsfile
vendored
@ -14,7 +14,7 @@ pipeline {
|
|||||||
)
|
)
|
||||||
string (
|
string (
|
||||||
name: 'CUTOFF',
|
name: 'CUTOFF',
|
||||||
defaultValue: '10',
|
defaultValue: '500',
|
||||||
description: 'Get only the first CUTOFF rows of the dataset'
|
description: 'Get only the first CUTOFF rows of the dataset'
|
||||||
)
|
)
|
||||||
}
|
}
|
||||||
@ -31,7 +31,7 @@ pipeline {
|
|||||||
withEnv(["KAGGLE_USERNAME=${params.KAGGLE_USERNAME}", "KAGGLE_KEY=${params.KAGGLE_KEY}", "CUTOFF=${params.CUTOFF}"]) {
|
withEnv(["KAGGLE_USERNAME=${params.KAGGLE_USERNAME}", "KAGGLE_KEY=${params.KAGGLE_KEY}", "CUTOFF=${params.CUTOFF}"]) {
|
||||||
sh "chmod +x download_dataset.sh"
|
sh "chmod +x download_dataset.sh"
|
||||||
sh "./download_dataset.sh $CUTOFF"
|
sh "./download_dataset.sh $CUTOFF"
|
||||||
archiveArtifacts artifacts: 'data.csv', onlyIfSuccessful: true
|
archiveArtifacts artifacts: 'data.csv,train.csv,dev.csv,test.csv', onlyIfSuccessful: true
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -1,4 +1,28 @@
|
|||||||
#!/bin/bash
|
#!/bin/bash
|
||||||
|
# Download dataset from kaggle
|
||||||
kaggle datasets download -d uciml/breast-cancer-wisconsin-data
|
kaggle datasets download -d uciml/breast-cancer-wisconsin-data
|
||||||
|
|
||||||
|
# Unzip dataset -> data.csv
|
||||||
unzip -o breast-cancer-wisconsin-data.zip
|
unzip -o breast-cancer-wisconsin-data.zip
|
||||||
head -n "$1" data.csv > data.csv.tmp && mv data.csv.tmp data.csv
|
|
||||||
|
# Remove id column
|
||||||
|
cut -d, -f2- data.csv > data.csv.tmp && mv data.csv.tmp data.csv
|
||||||
|
|
||||||
|
# Keep only the first CUTOFF rows
|
||||||
|
head -n "$1" data.csv > data.csv.tmp && mv data.csv.tmp data.csv
|
||||||
|
|
||||||
|
# Get number of rows in data.csv
|
||||||
|
if [ $1 -gt $(wc -l < data.csv) ]; then
|
||||||
|
data_size=$(wc -l < data.csv)
|
||||||
|
else
|
||||||
|
data_size=$1
|
||||||
|
fi
|
||||||
|
|
||||||
|
# Split data into train, dev, test
|
||||||
|
head -n $(expr $data_size / 10 \* 8) data.csv > train.csv
|
||||||
|
tail -n $(expr $data_size / 10 \* 2) data.csv > dev.csv.tmp
|
||||||
|
|
||||||
|
dev_size=$(wc -l < dev.csv.tmp)
|
||||||
|
|
||||||
|
head -n $(expr $dev_size / 10 \* 5) dev.csv.tmp > dev.csv
|
||||||
|
tail -n $(expr $dev_size / 10 \* 5) dev.csv.tmp > test.csv && rm dev.csv.tmp
|
Loading…
Reference in New Issue
Block a user