IUM_03 - Add simulation of data preprocessing - delete first column, split into train, test, dev sets

This commit is contained in:
Paweł Łączkowski 2024-03-20 18:31:15 +01:00
parent c1f418ce97
commit 93c4d885d8
2 changed files with 27 additions and 3 deletions

4
Jenkinsfile vendored
View File

@ -14,7 +14,7 @@ pipeline {
) )
string ( string (
name: 'CUTOFF', name: 'CUTOFF',
defaultValue: '10', defaultValue: '500',
description: 'Get only the first CUTOFF rows of the dataset' description: 'Get only the first CUTOFF rows of the dataset'
) )
} }
@ -31,7 +31,7 @@ pipeline {
withEnv(["KAGGLE_USERNAME=${params.KAGGLE_USERNAME}", "KAGGLE_KEY=${params.KAGGLE_KEY}", "CUTOFF=${params.CUTOFF}"]) { withEnv(["KAGGLE_USERNAME=${params.KAGGLE_USERNAME}", "KAGGLE_KEY=${params.KAGGLE_KEY}", "CUTOFF=${params.CUTOFF}"]) {
sh "chmod +x download_dataset.sh" sh "chmod +x download_dataset.sh"
sh "./download_dataset.sh $CUTOFF" sh "./download_dataset.sh $CUTOFF"
archiveArtifacts artifacts: 'data.csv', onlyIfSuccessful: true archiveArtifacts artifacts: 'data.csv,train.csv,dev.csv,test.csv', onlyIfSuccessful: true
} }
} }
} }

View File

@ -1,4 +1,28 @@
#!/bin/bash
# Download the Breast Cancer Wisconsin dataset from Kaggle, drop its first
# column, keep only the first CUTOFF lines and split them 80/10/10 into
# train/dev/test files.
#
# Usage:   ./download_dataset.sh CUTOFF
#   CUTOFF   non-negative integer: number of leading lines of data.csv to keep
# Outputs: data.csv, train.csv, dev.csv, test.csv (in the current directory)

# Abort on the first failing command or unset variable so that a failed
# download/unzip does not silently produce splits from stale or missing data.
set -euo pipefail

# Validate the CUTOFF argument before touching the network or the filesystem.
cutoff="${1:-}"
case "$cutoff" in
    '' | *[!0-9]*)
        echo "Usage: $0 CUTOFF (non-negative integer)" >&2
        exit 1
        ;;
esac

# Download dataset from kaggle
kaggle datasets download -d uciml/breast-cancer-wisconsin-data
# Unzip dataset -> data.csv
unzip -o breast-cancer-wisconsin-data.zip
# Remove id column (the first comma-separated field)
cut -d, -f2- data.csv > data.csv.tmp && mv data.csv.tmp data.csv
# Keep only the first CUTOFF lines
head -n "$cutoff" data.csv > data.csv.tmp && mv data.csv.tmp data.csv

# Number of lines actually kept (may be smaller than CUTOFF for a short file)
data_size=$(wc -l < data.csv)

# Split sizes: multiply before dividing so integer division does not discard
# rows (e.g. 15 lines -> 12 train, not 8), and give the remainder to dev/test
# so the three parts are disjoint and together cover every kept line.
train_size=$(( data_size * 8 / 10 ))
dev_size=$(( (data_size - train_size) / 2 ))

# NOTE(review): the CSV header line is counted as an ordinary line here, so it
# ends up only in train.csv - confirm that downstream readers expect this.

# Create all outputs up front so they exist even when a part is empty.
: > train.csv
: > dev.csv
: > test.csv

# Split data into train, dev, test in a single pass over data.csv
awk -v train="$train_size" -v dev="$dev_size" '
    NR <= train       { print > "train.csv"; next }
    NR <= train + dev { print > "dev.csv";   next }
                      { print > "test.csv" }
' data.csv