From 80326968b192ba4c2b13ea043a190c66f3555f88 Mon Sep 17 00:00:00 2001
From: Adam Wojdyla
Date: Sun, 27 Mar 2022 14:03:36 +0200
Subject: [PATCH] cutoff

---
Notes (not part of the commit message):

* Jenkinsfile: adds a CUTOFF string parameter (default '1'), exports it to
  the shell steps via withEnv, passes it to download.sh as $1, redirects the
  script output to script_logs.txt and archives the three split files.
* Jenkinsfile2 and test.sh are deleted.
* download.sh: downloads and unzips the dataset, drops the header row,
  shuffles, keeps the first $1 rows, then splits them into
  test (rows 1..len1), dev (rows len1+1..2*len1) and train (the rest),
  where len1 = len/6.
* Review fix: the dev split used `head -n $len1 | tail -n $len1`, which
  yields the same rows as the test split. It now uses
  `head -n $(($len1*2)) | tail -n $len1`, matching len2 = 2*len1+1 used for
  the train split. The blob id on the download.sh index line was not
  recomputed for this change.
* Indentation of the Jenkinsfile/Jenkinsfile2 lines was reconstructed; if the
  context does not match, apply with `git apply --ignore-whitespace`.

 Jenkinsfile  | 12 +++++++++---
 Jenkinsfile2 | 53 ----------------------------------------------------
 download.sh  | 37 +++++++++++++++++++++++++++++++++++-
 test.sh      |  1 -
 4 files changed, 45 insertions(+), 58 deletions(-)
 delete mode 100644 Jenkinsfile2
 delete mode 100755 test.sh

diff --git a/Jenkinsfile b/Jenkinsfile
index dacb02c..59413f7 100644
--- a/Jenkinsfile
+++ b/Jenkinsfile
@@ -12,6 +12,12 @@ pipeline {
             description: 'Kaggle token',
             name: 'KAGGLE_KEY'
         )
+        string(
+            defaultValue: '1',
+            description: 'Cutoff',
+            name: 'CUTOFF',
+            trim: false
+        )
     }
     stages {
         stage('Checkout') {
@@ -26,11 +32,11 @@ pipeline {
         stage('Script') {
             steps {
                 script {
-                    withEnv(["KAGGLE_USERNAME=${params.KAGGLE_USERNAME}", "KAGGLE_KEY=${params.KAGGLE_KEY}"]) {
+                    withEnv(["KAGGLE_USERNAME=${params.KAGGLE_USERNAME}", "KAGGLE_KEY=${params.KAGGLE_KEY}", "CUTOFF=${params.CUTOFF}"]) {
                         sh 'echo KAGGLE_USERNAME: $KAGGLE_USERNAME'
                         sh 'kaggle datasets list'
-                        sh './download.sh'
-
+                        sh './download.sh $CUTOFF > ./script_logs.txt'
+                        archiveArtifacts artifacts: 'car_prices.csv.dev, car_prices.csv.test, car_prices.csv.train', followSymlinks: false
                     }
                 }
             }
diff --git a/Jenkinsfile2 b/Jenkinsfile2
deleted file mode 100644
index dbe16db..0000000
--- a/Jenkinsfile2
+++ /dev/null
@@ -1,53 +0,0 @@
-pipeline {
-    agent any
-
-    parameters {
-        string(
-            defaultValue: 'heatedboss2',
-            description: 'Kaggle username',
-            name: 'KAGGLE_USERNAME',
-            trim: false
-        )
-        password(
-            defaultValue: '',
-            description: 'Kaggle token taken from kaggle.json file, as described in https://github.com/Kaggle/kaggle-api#api-credentials',
-            name: 'KAGGLE_KEY'
-        )
-        string(
-            defaultValue: '1',
-            description: 'Cutoff lines',
-            name: 'CUTOFF'
-        )
-    }
-    environment {
-        KAGGLE_USERNAME="$params.KAGGLE_USERNAME"
-        KAGGLE_KEY="$params.KAGGLE_KEY"
-    }
-
-    stages {
-        stage('Checkout') {
-            steps {
-                checkout([$class: 'GitSCM', branches: [
-                    [name: '*/master']
-                ], extensions: [], userRemoteConfigs: [
-                    [credentialsId: '8b8d54ee-f03c-4980-90b1-959faa97082b', url: 'https://git.wmi.amu.edu.pl/s444507/ium_444507.git']
-                ]])
-            }
-        }
-
-        stage('Script'){
-            steps {
-                script {
-                    withEnv(["KAGGLE_USERNAME=${params.KAGGLE_USERNAME}",
-                            "KAGGLE_KEY=${params.KAGGLE_KEY}" ]) {
-                        sh 'export KAGGLE_USERNAME=${params.KAGGLE_USERNAME}'
-                        sh 'export KAGGLE_KEY=${params.KAGGLE_KEY}'
-                        sh 'echo KAGGLE_USERNAME: $KAGGLE_USERNAME'
-                        sh 'kaggle datasets list'
-                    }
-                }
-                sh './download.sh'
-            }
-        }
-    }
-}
\ No newline at end of file
diff --git a/download.sh b/download.sh
index 21cafea..06937f8 100755
--- a/download.sh
+++ b/download.sh
@@ -1,2 +1,37 @@
+#!/bin/bash
+
 echo 'Downloading Dataset'
-kaggle datasets download -d aleksandrglotov/car-prices-poland
\ No newline at end of file
+kaggle datasets download -d aleksandrglotov/car-prices-poland
+echo 'Dataset downloaded'
+
+echo 'Unzippig Dataset'
+unzip -o car-prices-poland.zip
+echo 'Dataset unzipped'
+
+len=$(cat ./Car_Prices_Poland_Kaggle.csv | wc -l)
+echo 'Initial dataset count:' $len
+
+echo 'CUTOFF VALUE: ' $1
+
+echo 'Skip first header row and shuffle'
+# the example from the materials (head -n -1) doesn't work, so tail -n +2 is used
+tail -n +2 Car_Prices_Poland_Kaggle.csv | shuf | head -n $1 > ./Car_Prices_Poland_Kaggle_shuf.csv
+echo 'Shuffled'
+
+len=$(cat ./Car_Prices_Poland_Kaggle_shuf.csv | wc -l)
+echo 'Dataset count after cutoff:' $len
+len1=$(($len/6))
+len2=$(($len1*2+1))
+echo 'len: '$len
+echo 'len1: '$len1
+echo 'len2: '$len2
+
+echo 'Divide and save to files'
+head -n $len1 Car_Prices_Poland_Kaggle_shuf.csv> car_prices.csv.test
+head -n $(($len1*2)) Car_Prices_Poland_Kaggle_shuf.csv| tail -n $len1 > car_prices.csv.dev
+tail -n +$len2 Car_Prices_Poland_Kaggle_shuf.csv> car_prices.csv.train
+rm ./Car_Prices_Poland_Kaggle_shuf.csv
+
+echo 'Divided datasets count'
+wc -l car_prices.csv.*
+
diff --git a/test.sh b/test.sh
deleted file mode 100755
index 624dabf..0000000
--- a/test.sh
+++ /dev/null
@@ -1 +0,0 @@
-echo 'Downloading Dataset'
\ No newline at end of file