ium_464953/download_dataset.sh

27 lines
848 B
Bash
Raw Normal View History

2024-03-23 20:11:47 +01:00
#!/bin/bash
# Download the gulczas/spotify-dataset Kaggle dataset, shuffle its lines, keep
# the first <num_rows> lines and split them into train/test/validation files
# that are moved into artifacts/.
#
# Usage: download_dataset.sh <num_rows>
#   <num_rows>  value handed to `head -n` when truncating the shuffled data.

# Fail fast when the row count is missing: without it the later `head -n $1`
# call errors out and every generated split ends up empty.
if [[ -z "${1:-}" ]]; then
  echo "Usage: $0 <num_rows>" >&2
  exit 2
fi

# Install the Kaggle command-line client, upgrading it if already present.
pip install --upgrade kaggle
2024-03-25 20:03:27 +01:00
# Fetch the dataset archive from Kaggle. Stop immediately if the download
# fails, so that the following steps never run against a missing or stale
# archive.
if ! kaggle datasets download -d gulczas/spotify-dataset; then
  echo "Error: could not download gulczas/spotify-dataset from Kaggle" >&2
  exit 1
fi
2024-03-23 20:11:47 +01:00
2024-03-25 21:06:06 +01:00
# Extract the archive, overwriting existing files without prompting (-o).
# Info-ZIP unzip exits with 1 when it only emitted warnings but still finished
# extracting, so only statuses above 1 are treated as fatal.
unzip -o spotify-dataset.zip
unzip_status=$?
if (( unzip_status > 1 )); then
  echo "Error: could not extract spotify-dataset.zip (unzip exit status ${unzip_status})" >&2
  exit 1
fi
2024-03-25 20:38:27 +01:00
2024-03-25 21:06:06 +01:00
# Progress banner for the shuffle step.
printf '%s\n' '------------------ Shufle ------------------'
2024-03-25 20:03:27 +01:00
# Randomly permute the lines of Spotify_Dataset.csv and write the result to
# shuffled_spotify.csv.
# NOTE(review): shuf treats every line alike, so if the CSV starts with a
# header row it is shuffled in with the data rows -- confirm this is intended.
shuf -o shuffled_spotify.csv Spotify_Dataset.csv
2024-03-23 20:11:47 +01:00
2024-03-25 21:19:43 +01:00
# Progress banner for the truncation step; reports the requested row count ($1).
printf '%s\n' "------------------ Cut off to top $1 rows ------------------"
2024-03-25 21:06:06 +01:00
# Keep only the first $1 lines of the shuffled file. The argument is quoted so
# it always reaches head as exactly one word (no word splitting or globbing).
head -n "$1" shuffled_spotify.csv > cutoff_spotify.csv
2024-03-23 20:11:47 +01:00
2024-03-25 21:06:06 +01:00
# Progress banner for the split step.
printf '%s\n' '------------------ Split ------------------'

# Number of lines in the truncated dataset.
total_lines=$(wc -l < cutoff_spotify.csv)

# Test and validation each get one tenth of the lines (integer division);
# the training set receives whatever is left over.
num_test=$(( total_lines / 10 ))
num_validation=$num_test
num_train=$(( total_lines - num_test - num_validation ))
2024-03-23 20:11:47 +01:00
2024-03-25 21:06:06 +01:00
# Training set: the leading num_train lines of the truncated dataset.
head -n "${num_train}" cutoff_spotify.csv > train.csv
2024-03-25 21:29:11 +01:00
# Validation set: the final num_validation lines of the truncated dataset.
tail -n "${num_validation}" cutoff_spotify.csv > validation.csv

# Test set: take the trailing (num_test + num_validation) lines and keep only
# the first num_test of them, i.e. the lines just before the validation set.
holdout_rows=$(( num_test + num_validation ))
tail -n "${holdout_rows}" cutoff_spotify.csv | head -n "${num_test}" > test.csv
2024-03-23 20:11:47 +01:00
# Ensure the output directory exists (-p: no error if it is already there).
mkdir -p ./artifacts
2024-03-25 21:14:59 +01:00
# Move the original CSV, the truncated dataset and the three splits into
# artifacts/. shuffled_spotify.csv and spotify-dataset.zip are not in this
# list, so they stay in the working directory.
mv Spotify_Dataset.csv cutoff_spotify.csv \
  train.csv validation.csv test.csv \
  artifacts/