download dataset

2024-03-26 19:21:06 +01:00 · 2024-03-26 19:21:06 +01:00 · 060de23459
commit 060de23459
parent 424e4b2478
2 changed files with 43 additions and 3 deletions
--- a/28
+++ b/28
@ -1,10 +1,39 @
 pipeline {
    agent any
+
+    // Build parameters. Jenkins also exports each one to the build steps as an
+    // environment variable with the same name.
+    parameters {
+        string(name: 'CUTOFF', defaultValue: '100', description: 'Ilość wierszy do odcięcia')
+        string(name: 'KAGGLE_USERNAME', defaultValue: '', description: 'Kaggle username')
+        password(name: 'KAGGLE_KEY', defaultValue: '', description: 'Kaggle API key')
+    }
+
    stages {
-        stage('Stage 1') {
+        stage('Clone repo') {
            steps {
-                echo 'elo 420 v2'
+                git url: "https://git.wmi.amu.edu.pl/s464937/ium_464937"
+            }
+        }
+
+        stage('Pobierz i przeprocesuj zbiór') {
+            steps {
+                // Hand the Kaggle credentials to the script through its environment.
+                withEnv([
+                    "KAGGLE_USERNAME=${env.KAGGLE_USERNAME}",
+                    "KAGGLE_KEY=${env.KAGGLE_KEY}"
+                ]) {
+                    // Single-quoted on purpose: the shell expands $CUTOFF from the
+                    // environment, so the parameter value is never spliced into the
+                    // command text by Groovy.
+                    sh 'bash ./script1.sh "$CUTOFF"'
+                }
+            }
+        }
+        stage('Archive Results') {
+            steps {
+                archiveArtifacts artifacts: 'data/*', onlyIfSuccessful: true
            }
        }
    }
 }
--- a/script1.sh
+++ b/script1.sh
@ -0,0 +1,18 @@
+#!/bin/bash
+pip install kaggle
+kaggle datasets download -d open-powerlifting/powerlifting-database
+unzip -o powerlifting-database.zip
+DATASET_FILE="openpowerlifting.csv"
+echo "Obcięte wiersze: ${1}"
+head -n $1 $DATASET_FILE > cutoff_$DATASET_FILE
+echo "Podział i wymieszanie"
+total_lines=$(tail -n +2 cutoff_$DATASET_FILE | wc -l)
+train_lines=$((total_lines * 90 / 100))
+dev_lines=$((total_lines * 10 / 100))
+test_lines=$((total_lines - train_lines - dev_lines))
+shuf cutoff_$DATASET_FILE -o shuffled.csv
+head -n $train_lines shuffled.csv > train.csv
+tail -n $((dev_lines + test_lines)) shuffled.csv | head -n $dev_lines > dev.csv
+tail -n $test_lines shuffled.csv > test.csv
+mkdir -p data
+mv train.csv dev.csv test.csv data/