diff --git a/Jenkinsfile b/Jenkinsfile
index a9e7d81..bb32d37 100644
--- a/Jenkinsfile
+++ b/Jenkinsfile
@@ -1,10 +1,38 @@
 pipeline {
     agent any
+
+    // Values supplied by the user when the build is started.
+    parameters {
+        string(name: 'CUTOFF', defaultValue: '100', description: 'Ilość wierszy do odcięcia')
+        string(name: 'KAGGLE_USERNAME', defaultValue: '', description: 'Kaggle username')
+        password(name: 'KAGGLE_KEY', defaultValue: '', description: 'Kaggle API key')
+    }
+
     stages {
-        stage('Stage 1') {
+        stage('Clone repo') {
             steps {
-                echo 'elo 420 v2'
+                git url: "https://git.wmi.amu.edu.pl/s464937/ium_464937"
+            }
+        }
+
+        stage('Pobierz i przeprocesuj zbiór') {
+            steps {
+                // CUTOFF travels through the environment and is expanded by the shell
+                // inside a single-quoted Groovy string, so its value reaches the script
+                // as exactly one argument and is never spliced into the command line.
+                withEnv([
+                    "CUTOFF=${params.CUTOFF}",
+                    "KAGGLE_USERNAME=${env.KAGGLE_USERNAME}",
+                    "KAGGLE_KEY=${env.KAGGLE_KEY}"
+                ]) {
+                    sh 'bash ./script1.sh "$CUTOFF"'
+                }
+            }
+        }
+        stage('Archive Results') {
+            steps {
+                archiveArtifacts artifacts: 'data/*', onlyIfSuccessful: true
             }
         }
     }
 }
\ No newline at end of file
diff --git a/script1.sh b/script1.sh
new file mode 100644
index 0000000..1a60f4c
--- /dev/null
+++ b/script1.sh
@@ -0,0 +1,49 @@
+#!/bin/bash
+# Download the OpenPowerlifting dataset from Kaggle, keep its first CUTOFF
+# lines and split the kept data rows into shuffled train/dev/test CSV files
+# under data/. Every split file starts with the CSV header line.
+#
+# Usage: script1.sh CUTOFF
+#   CUTOFF  number of leading lines of the CSV to keep, header line included
+set -euo pipefail
+
+cutoff="${1:-}"
+if ! [[ "$cutoff" =~ ^[0-9]+$ ]] || [ "$cutoff" -lt 2 ]; then
+    echo "Usage: $0 CUTOFF (integer >= 2: the header plus at least one row)" >&2
+    exit 1
+fi
+
+pip install kaggle
+kaggle datasets download -d open-powerlifting/powerlifting-database
+unzip -o powerlifting-database.zip
+
+DATASET_FILE="openpowerlifting.csv"
+echo "Obcięte wiersze: ${cutoff}"
+head -n "$cutoff" "$DATASET_FILE" > "cutoff_$DATASET_FILE"
+
+echo "Podział i wymieszanie"
+# Shuffle only the data rows; the header must not end up inside a split.
+tail -n +2 "cutoff_$DATASET_FILE" | shuf -o shuffled.csv
+total_lines=$(wc -l < shuffled.csv)
+# 80/10/10 split: with 90/10 the test set only received the rounding remainder.
+train_lines=$((total_lines * 80 / 100))
+dev_lines=$((total_lines * 10 / 100))
+test_lines=$((total_lines - train_lines - dev_lines))
+
+# write_split OUT FIRST COUNT: write the header followed by COUNT shuffled
+# rows, starting at 1-based row FIRST, to OUT.
+write_split() {
+    local out="$1" first="$2" count="$3"
+    head -n 1 "cutoff_$DATASET_FILE" > "$out"
+    # sed would still print one line for an empty range, so skip it entirely.
+    if [ "$count" -gt 0 ]; then
+        sed -n "${first},$((first + count - 1))p" shuffled.csv >> "$out"
+    fi
+}
+
+write_split train.csv 1 "$train_lines"
+write_split dev.csv "$((train_lines + 1))" "$dev_lines"
+write_split test.csv "$((train_lines + dev_lines + 1))" "$test_lines"
+
+mkdir -p data
+mv train.csv dev.csv test.csv data/