download dataset

This commit is contained in:
Szymon Bartanowicz 2024-03-26 19:21:06 +01:00
parent 424e4b2478
commit 060de23459
2 changed files with 43 additions and 3 deletions

28
Jenkinsfile vendored
View File

@ -1,10 +1,32 @@
// Declarative pipeline: fetches the project repository, runs script1.sh to
// download and split the Kaggle dataset, then archives the files under data/.
pipeline {
agent any
// Build parameters requested when the job is started.
// The CUTOFF description is Polish for "Number of rows to cut off".
// KAGGLE_KEY is declared as a password parameter rather than a plain string.
parameters {
string(name: 'CUTOFF', defaultValue: '100', description: 'Ilość wierszy do odcięcia')
string(name: 'KAGGLE_USERNAME', defaultValue: '', description: 'Kaggle username')
password(name: 'KAGGLE_KEY', defaultValue: '', description: 'Kaggle API key')
}
stages {
// NOTE(review): 'Stage 1' and 'Clone repo' below both open a stage block, but
// only one matching close follows, so the braces in this listing do not
// balance. This looks like a pre-change line left over from a diff rendering
// (the same may apply to the echo step) - confirm against the real file.
stage('Stage 1') {
stage('Clone repo') {
steps {
echo 'elo 420 v2'
// Fetch the project repository from the given URL.
git url: "https://git.wmi.amu.edu.pl/s464937/ium_464937"
}
}
// Stage name is Polish for "Download and process the dataset".
stage('Pobierz i przeprocesuj zbiór') {
steps {
// Set KAGGLE_USERNAME / KAGGLE_KEY in the environment of the wrapped step.
// NOTE(review): the values are read from env.*, not params.* - presumably
// relying on Jenkins exposing build parameters as environment variables;
// confirm, since in that case this wrapper may be redundant.
withEnv([
"KAGGLE_USERNAME=${env.KAGGLE_USERNAME}",
"KAGGLE_KEY=${env.KAGGLE_KEY}"
]) {
// Run the script with CUTOFF as its first positional argument.
// NOTE(review): params.CUTOFF is interpolated by Groovy into the shell
// command line without quoting - confirm the parameter is trusted.
sh "bash ./script1.sh ${params.CUTOFF}"
}
}
}
stage('Archive Results') {
steps {
// Archive everything matching data/*; onlyIfSuccessful is set to true.
archiveArtifacts artifacts: 'data/*', onlyIfSuccessful: true
}
}
}
}

18
script1.sh Normal file
View File

@ -0,0 +1,18 @@
#!/bin/bash
# Downloads the OpenPowerlifting dataset from Kaggle, keeps the first CUTOFF
# lines of the CSV, and splits the data rows into shuffled train/dev/test files
# written to ./data. Every output file starts with the CSV header line.
#
# Usage: script1.sh CUTOFF
#   CUTOFF  positive integer; number of leading lines of the CSV to keep
#           (the header line counts as one of them).
#
# Optional environment variables:
#   TRAIN_PERCENT  percentage of data rows written to train.csv (default 90)
#   DEV_PERCENT    percentage of data rows written to dev.csv   (default 10)
# test.csv receives whatever rows remain. NOTE: with the defaults (90 + 10)
# that is only the integer-division remainder, so test.csv is nearly empty;
# lower TRAIN_PERCENT to get a real test split.
#
# The kaggle CLI is expected to find its credentials in the environment
# (KAGGLE_USERNAME / KAGGLE_KEY) or in its own config file.

# Abort on the first failing command, unset variable or failed pipeline stage,
# so a broken download cannot silently produce empty splits.
set -euo pipefail

cutoff="${1:-}"
TRAIN_PERCENT="${TRAIN_PERCENT:-90}"
DEV_PERCENT="${DEV_PERCENT:-10}"

# Validate arguments before doing any network work.
if ! [[ "$cutoff" =~ ^[1-9][0-9]*$ ]]; then
  echo "Usage: $0 CUTOFF (positive integer number of lines to keep)" >&2
  exit 2
fi
if ! [[ "$TRAIN_PERCENT" =~ ^[0-9]+$ && "$DEV_PERCENT" =~ ^[0-9]+$ ]] \
   || (( TRAIN_PERCENT + DEV_PERCENT > 100 )); then
  echo "TRAIN_PERCENT and DEV_PERCENT must be integers summing to at most 100" >&2
  exit 2
fi

pip install kaggle
kaggle datasets download -d open-powerlifting/powerlifting-database
unzip -o powerlifting-database.zip

DATASET_FILE="openpowerlifting.csv"
CUTOFF_FILE="cutoff_${DATASET_FILE}"
if [[ ! -f "$DATASET_FILE" ]]; then
  echo "Expected ${DATASET_FILE} after unzip, but it was not found" >&2
  exit 1
fi

echo "Obcięte wiersze: ${cutoff}"
head -n "$cutoff" "$DATASET_FILE" > "$CUTOFF_FILE"

echo "Podział i wymieszanie"
# Keep the header out of the shuffle so it is not scattered among data rows.
header=$(head -n 1 "$CUTOFF_FILE")
tail -n +2 "$CUTOFF_FILE" | shuf -o shuffled.csv

# Count rows in the same file that is sliced below, so the boundaries line up.
total_lines=$(wc -l < shuffled.csv)
train_lines=$((total_lines * TRAIN_PERCENT / 100))
dev_lines=$((total_lines * DEV_PERCENT / 100))

# Writes the header followed by rows FIRST..LAST (1-based, inclusive) of
# shuffled.csv to OUT. An empty range (LAST < FIRST) yields a header-only file.
# awk reads the whole input, so no SIGPIPE can trip pipefail here.
write_split() {
  local first=$1 last=$2 out=$3
  {
    printf '%s\n' "$header"
    awk -v first="$first" -v last="$last" 'NR >= first && NR <= last' shuffled.csv
  } > "$out"
}

mkdir -p data
# Contiguous, non-overlapping ranges: every shuffled row lands in exactly one split.
write_split 1 "$train_lines" data/train.csv
write_split "$((train_lines + 1))" "$((train_lines + dev_lines))" data/dev.csv
write_split "$((train_lines + dev_lines + 1))" "$total_lines" data/test.csv