diff --git a/Jenkinsfile b/Jenkinsfile
index 4d3f11f..53912c3 100644
--- a/Jenkinsfile
+++ b/Jenkinsfile
@@ -3,7 +3,8 @@ pipeline {
 
     parameters {
         string(name: 'KAGGLE_USERNAME', defaultValue: 'gulczas', description: 'Kaggle username')
-        password(name: 'KAGGLE_KEY', defaultValue: '', description: 'Kaggle API key')
+        password(name: 'KAGGLE_KEY', defaultValue: '', description: 'Kaggle API key')
+        string(name: 'CUTOFF', defaultValue: '10', description: 'Number of rows to keep from the dataset')
     }
 
     stages {
@@ -19,7 +20,7 @@ pipeline {
                 "KAGGLE_USERNAME=${env.KAGGLE_USERNAME}",
                 "KAGGLE_KEY=${env.KAGGLE_KEY}"]) {
-                    sh 'bash ./download_dataset.sh'
+                    sh "bash ./download_dataset.sh ${params.CUTOFF}"
                 }
             }
         }
 
diff --git a/download_dataset.sh b/download_dataset.sh
index f5d5a40..27110ae 100644
--- a/download_dataset.sh
+++ b/download_dataset.sh
@@ -4,36 +4,23 @@
 pip install kaggle --upgrade
 kaggle datasets download -d gulczas/spotify-dataset
 
-unzip spotify-dataset.zip
+unzip -o spotify-dataset.zip
 
+echo "------------------ Shuffle ------------------"
 shuf Spotify_Dataset.csv -o shuffled_spotify.csv
 
-head -n 100 shuffled_spotify.csv > subset1.csv
-tail -n 100 shuffled_spotify.csv > subset2.csv
+echo "------------------ Cut off: keep top ${1} rows ------------------"
+head -n $1 shuffled_spotify.csv > cutoff_spotify.csv
 
-cut -d ',' -f 1,2,3 shuffled_spotify.csv > trimmed_spotify.csv
+echo "------------------ Split ------------------"
+total_lines=$(wc -l < cutoff_spotify.csv)
+num_test=$((total_lines / 10))
+num_train=$((total_lines - (num_test * 2)))
+num_validation=$num_test
 
-cut -d ',' -f 1,2,4,5,6 shuffled_spotify.csv > processed_spotify.csv
-
-echo "Shuffled dataset:" > results.txt
-head shuffled_spotify.csv >> results.txt
-echo "" >> results.txt
-
-echo "Subset 1:" >> results.txt
-head subset1.csv >> results.txt
-echo "" >> results.txt
-
-echo "Subset 2:" >> results.txt
-head subset2.csv >> results.txt
-echo "" >> results.txt
-
-echo "Trimmed dataset:" >> results.txt
-head trimmed_spotify.csv >> results.txt
-echo "" >> results.txt
-
-echo "Processed dataset:" >> results.txt
-head processed_spotify.csv >> results.txt
-echo "" >> results.txt
+head -n $num_train cutoff_spotify.csv > train.csv
+tail -n $((num_validation + num_test)) cutoff_spotify.csv | head -n $num_validation > validation.csv
+tail -n $num_test cutoff_spotify.csv > test.csv
 
 mkdir -p artifacts
-mv shuffled_spotify.csv subset1.csv subset2.csv trimmed_spotify.csv processed_spotify.csv results.txt artifacts/
+mv Spotify_Dataset.csv cutoff_spotify.csv train.csv validation.csv test.csv artifacts/
\ No newline at end of file
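
For reference, a minimal local dry run of the updated script (a sketch, not part of the change: it assumes the commands are run from the repository root, that the Kaggle credentials are exported in the shell, and that the API key placeholder and the CUTOFF value of 1000 are purely illustrative):

    # Credentials read by the Kaggle CLI inside download_dataset.sh
    export KAGGLE_USERNAME=gulczas
    export KAGGLE_KEY=<your-kaggle-api-key>

    # With CUTOFF=1000 the split arithmetic gives 1000/10 = 100 test rows,
    # 100 validation rows and 1000 - 2*100 = 800 train rows
    bash ./download_dataset.sh 1000

    # Sanity-check the split sizes (expected: 800 / 100 / 100)
    wc -l artifacts/train.csv artifacts/validation.csv artifacts/test.csv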