2024-03-23 20:11:47 +01:00
|
|
|
#!/bin/bash
|
|
|
|
|
|
|
|
# Fetch the raw data: install/upgrade the Kaggle CLI, download the
# gulczas/spotify-dataset archive and unpack it into the current directory.

# Best effort: a failed upgrade is tolerated here; if the CLI is missing
# altogether the download below fails and aborts the script.
pip install kaggle --upgrade

# Abort when the download fails. Otherwise `unzip -o` below could silently
# unpack a spotify-dataset.zip left over from an earlier run.
if ! kaggle datasets download -d gulczas/spotify-dataset; then
  echo "error: could not download gulczas/spotify-dataset" >&2
  exit 1
fi

# -o overwrites files that already exist without prompting.
unzip -o spotify-dataset.zip
unzip_status=$?
# unzip exits with 1 for warnings where extraction still completed, so only
# higher codes are treated as fatal.
if (( unzip_status > 1 )); then
  echo "error: could not extract spotify-dataset.zip (unzip exit ${unzip_status})" >&2
  exit 1
fi
|
2024-03-25 20:38:27 +01:00
|
|
|
|
2024-03-25 21:06:06 +01:00
|
|
|
# Write a randomly permuted copy of Spotify_Dataset.csv to
# shuffled_spotify.csv.
# NOTE(review): every line is shuffled, so if the CSV starts with a header row
# it ends up at a random position in the output — confirm this is intended.
raw_csv=Spotify_Dataset.csv
shuffled_csv=shuffled_spotify.csv

printf '%s\n' "------------------ Shufle ------------------"
shuf --output="$shuffled_csv" -- "$raw_csv"
|
2024-03-23 20:11:47 +01:00
|
|
|
|
2024-03-25 21:06:06 +01:00
|
|
|
# Keep only the first N lines of the shuffled file, where N is the script's
# first positional argument.
#
# Without a count, `head -n` would take the file name as its line count and
# fail, while the redirection still creates an empty cutoff_spotify.csv for
# the following steps. Stop with a usage message instead.
if [[ -z "${1-}" ]]; then
  printf 'Usage: %s <rows>\n' "${0##*/}" >&2
  exit 2
fi

echo "------------------ Cut off top: ${1} rows ------------------"

# Quoted so the count is passed as exactly one word; head itself rejects a
# value that is not a valid line count, which is reported here.
if ! head -n "$1" shuffled_spotify.csv > cutoff_spotify.csv; then
  echo "error: could not take the first ${1} rows of shuffled_spotify.csv" >&2
  exit 1
fi
|
2024-03-23 20:11:47 +01:00
|
|
|
|
2024-03-25 21:06:06 +01:00
|
|
|
echo "------------------ Split ------------------"

#######################################
# Split a line-oriented file into train / validation / test parts
# (roughly 80% / 10% / 10%).
#
# The first num_train lines go to train.csv, the next num_validation lines to
# validation.csv and the final num_test lines to test.csv, so the three parts
# are disjoint and together contain every input line. (The earlier version
# took both validation and test from the last num_test lines only, which left
# test.csv empty and dropped the lines right after the train portion.)
#
# Globals:
#   total_lines, num_test, num_validation, num_train (written; kept global
#   because the script used to define them at top level)
# Arguments:
#   $1 - path of the file to split
# Outputs:
#   Writes train.csv, validation.csv and test.csv in the current directory;
#   diagnostics go to stderr.
# Returns:
#   0 on success, 1 if the input cannot be read or a part cannot be written.
#######################################
split_dataset() {
  local src=$1

  if [[ ! -r "$src" ]]; then
    printf 'split_dataset: cannot read input file: %s\n' "$src" >&2
    return 1
  fi

  total_lines=$(wc -l < "$src") || return 1
  num_test=$((total_lines / 10))
  num_validation=$num_test
  # Integer division rounds down, so train absorbs the remainder.
  num_train=$((total_lines - num_test - num_validation))

  head -n "$num_train" "$src" > train.csv || return 1

  # Validation is the first half of the trailing (validation + test) window.
  tail -n "$((num_validation + num_test))" "$src" \
    | head -n "$num_validation" > validation.csv || return 1

  # Test is exactly the last num_test lines.
  tail -n "$num_test" "$src" > test.csv || return 1
}

# No exit on failure: the script's control flow stays as it was, and
# split_dataset reports any problem on stderr.
split_dataset cutoff_spotify.csv
|
2024-03-23 20:11:47 +01:00
|
|
|
|
|
|
|
# Collect the raw dataset and every derived CSV in the artifacts/ directory,
# creating the directory first if it does not exist yet.
artifact_dir=artifacts
outputs=(
  Spotify_Dataset.csv
  cutoff_spotify.csv
  train.csv
  validation.csv
  test.csv
)

mkdir -p "$artifact_dir"
mv "${outputs[@]}" "$artifact_dir/"
|