From 93c4d885d87cc7ce8dca9d2bc2553a77df82a11f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Pawe=C5=82=20=C5=81=C4=85czkowski?=
Date: Wed, 20 Mar 2024 18:31:15 +0100
Subject: [PATCH] IUM_03 - Add simulation of data preprocessing - delete first
 column, split into train, test, dev sets

The split is 80/10/10. Sizes are computed by multiplying before dividing,
and test.csv takes the remainder, so no row is dropped for any CUTOFF.

---
Note: this patch was reconstructed from a whitespace-collapsed copy. The
indentation of the Jenkinsfile context lines is a best guess; if the patch
does not apply cleanly, use `git apply --ignore-whitespace`.

 Jenkinsfile         |  4 ++--
 download_dataset.sh | 37 ++++++++++++++++++++++++++++++++++++-
 2 files changed, 38 insertions(+), 3 deletions(-)

diff --git a/Jenkinsfile b/Jenkinsfile
index bf4ad2f..ace8b03 100644
--- a/Jenkinsfile
+++ b/Jenkinsfile
@@ -14,7 +14,7 @@ pipeline {
         )
         string (
             name: 'CUTOFF',
-            defaultValue: '10',
+            defaultValue: '500',
             description: 'Get only the first CUTOFF rows of the dataset'
         )
     }
@@ -31,7 +31,7 @@ pipeline {
                 withEnv(["KAGGLE_USERNAME=${params.KAGGLE_USERNAME}", "KAGGLE_KEY=${params.KAGGLE_KEY}", "CUTOFF=${params.CUTOFF}"]) {
                     sh "chmod +x download_dataset.sh"
                     sh "./download_dataset.sh $CUTOFF"
-                    archiveArtifacts artifacts: 'data.csv', onlyIfSuccessful: true
+                    archiveArtifacts artifacts: 'data.csv,train.csv,dev.csv,test.csv', onlyIfSuccessful: true
                 }
             }
         }
diff --git a/download_dataset.sh b/download_dataset.sh
index 6a35d3a..6cccb15 100644
--- a/download_dataset.sh
+++ b/download_dataset.sh
@@ -1,4 +1,39 @@
 #!/bin/bash
+# Download the Kaggle dataset, simulate preprocessing and split the result
+# into train/dev/test sets (80/10/10).
+# Usage: ./download_dataset.sh CUTOFF
+set -euo pipefail
+
+cutoff="${1:?usage: $0 CUTOFF}"
+case "$cutoff" in
+    *[!0-9]*) echo "CUTOFF must be a non-negative integer, got: $cutoff" >&2; exit 1 ;;
+esac
+
+# Download dataset from kaggle
 kaggle datasets download -d uciml/breast-cancer-wisconsin-data
+
+# Unzip dataset -> data.csv
 unzip -o breast-cancer-wisconsin-data.zip
-head -n "$1" data.csv > data.csv.tmp && mv data.csv.tmp data.csv
\ No newline at end of file
+
+# Remove id column
+cut -d, -f2- data.csv > data.csv.tmp && mv data.csv.tmp data.csv
+
+# Keep only the first CUTOFF rows
+head -n "$cutoff" data.csv > data.csv.tmp && mv data.csv.tmp data.csv
+
+# Number of rows actually kept (data.csv may be shorter than CUTOFF)
+data_size=$(wc -l < data.csv)
+
+# Multiply before dividing so small inputs are not truncated to zero;
+# test.csv takes whatever is left after train and dev, so no row is dropped.
+train_size=$(( data_size * 8 / 10 ))
+dev_size=$(( (data_size - train_size) / 2 ))
+
+# Split data into train, dev, test
+# NOTE(review): if data.csv starts with a header row it is counted as data
+# and ends up only in train.csv - confirm this is intended.
+head -n "$train_size" data.csv > train.csv
+tail -n +"$(( train_size + 1 ))" data.csv > dev.csv.tmp
+head -n "$dev_size" dev.csv.tmp > dev.csv
+tail -n +"$(( dev_size + 1 ))" dev.csv.tmp > test.csv
+rm dev.csv.tmp