download dataset

2024-03-26 19:21:06 +01:00 · 2024-03-26 19:21:06 +01:00 · 060de23459
commit 060de23459
parent 424e4b2478
2 changed files with 43 additions and 3 deletions
--- a/28
+++ b/28
@@ -1,10 +1,32 @@
 // Declarative Jenkins pipeline: clones the project repository, runs script1.sh
 // to download and split the Kaggle dataset, then archives everything in data/.
 pipeline {
     agent any
     parameters {
         // Forwarded to script1.sh as its first argument (line count given to `head -n`).
         string(name: 'CUTOFF', defaultValue: '100', description: 'Ilość wierszy do odcięcia')
         // Kaggle credentials; exported to the environment of the download stage below.
         string(name: 'KAGGLE_USERNAME', defaultValue: '', description: 'Kaggle username')
         password(name: 'KAGGLE_KEY', defaultValue: '', description: 'Kaggle API key')
     }
     stages {
-        stage('Stage 1') {
+        stage('Clone repo') {
             steps {
-                echo 'elo 420 v2'
+                git url: "https://git.wmi.amu.edu.pl/s464937/ium_464937"
             }
         }
         stage('Pobierz i przeprocesuj zbiór') {
             steps {
                 // Every user-supplied value travels through the environment, never
                 // through the text of the shell command itself.
                 withEnv([
                     "KAGGLE_USERNAME=${env.KAGGLE_USERNAME}",
                     "KAGGLE_KEY=${env.KAGGLE_KEY}",
                     "CUTOFF=${params.CUTOFF}"
                 ]) {
                     // Single-quoted on purpose: Groovy must not interpolate the
                     // parameter into the command line (shell injection). The shell
                     // expands "$CUTOFF" itself, as one quoted argument.
                     sh 'bash ./script1.sh "$CUTOFF"'
                 }
             }
         }
         stage('Archive Results') {
             steps {
                 // Keep the generated files under data/ as build artifacts,
                 // but only when the build succeeded.
                 archiveArtifacts artifacts: 'data/*', onlyIfSuccessful: true
             }
         }
     }
 }
--- a/script1.sh
+++ b/script1.sh
@@ -0,0 +1,18 @@
 #!/bin/bash
 # Download the powerlifting dataset from Kaggle, keep only its first CUTOFF
 # lines, shuffle the data rows and split them into train/dev/test CSV files.
 #
 # Usage:   script1.sh <CUTOFF>
 #   CUTOFF  positive integer; number of leading lines of the CSV to keep
 #           (the header line counts towards this number).
 # Output:  data/train.csv, data/dev.csv, data/test.csv - each one starts with
 #          the CSV header line followed by its share of the shuffled rows.
 # Leaves cutoff_openpowerlifting.csv and shuffled.csv in the working directory.
 #
 # Abort on the first failing command, unset variable or failed pipeline stage,
 # so a failed download/unzip is not followed by a split of missing data.
 set -euo pipefail
 #
 if [[ ! "${1:-}" =~ ^[1-9][0-9]*$ ]]; then
     echo "Usage: $0 <CUTOFF>  (CUTOFF must be a positive integer)" >&2
     exit 1
 fi
 readonly CUTOFF="$1"
 readonly DATASET_FILE="openpowerlifting.csv"
 readonly CUTOFF_FILE="cutoff_${DATASET_FILE}"
 # Split ratios in percent; the test set receives whatever remains (~10%).
 # The previous 90/10 split left the test set with only the rounding remainder
 # (zero or one row).
 readonly TRAIN_PERCENT=80
 readonly DEV_PERCENT=10
 #
 pip install kaggle
 kaggle datasets download -d open-powerlifting/powerlifting-database
 unzip -o powerlifting-database.zip
 #
 echo "Obcięte wiersze: ${CUTOFF}"
 head -n "$CUTOFF" "$DATASET_FILE" > "$CUTOFF_FILE"
 #
 echo "Podział i wymieszanie"
 # Set the header aside so it is never shuffled into the data rows.
 header=$(head -n 1 "$CUTOFF_FILE")
 tail -n +2 "$CUTOFF_FILE" | shuf -o shuffled.csv
 total_lines=$(wc -l < shuffled.csv)
 train_lines=$((total_lines * TRAIN_PERCENT / 100))
 dev_lines=$((total_lines * DEV_PERCENT / 100))
 #
 # write_split OUT FIRST LAST
 # Writes the header plus rows FIRST..LAST (1-based, inclusive) of shuffled.csv
 # to OUT. An empty range (LAST < FIRST) yields a header-only file.
 # awk reads the whole file, so no reader closes a pipe early and trips pipefail.
 write_split() {
     local out_file="$1" first_row="$2" last_row="$3"
     {
         printf '%s\n' "$header"
         awk -v first="$first_row" -v last="$last_row" 'NR >= first && NR <= last' shuffled.csv
     } > "$out_file"
 }
 #
 mkdir -p data
 write_split data/train.csv 1 "$train_lines"
 write_split data/dev.csv "$((train_lines + 1))" "$((train_lines + dev_lines))"
 write_split data/test.csv "$((train_lines + dev_lines + 1))" "$total_lines"