#!/bin/bash
#
# Download the BeerAdvocate reviews dataset from Kaggle, keep its first N
# lines, shuffle the data rows and split them 80/10/10 into
# data/train.csv, data/dev.csv and data/test.csv.
#
# Usage: ./<script> N
#   N - number of leading lines of beer_reviews.csv to keep. Line 1 is
#       treated as the CSV header, so N lines yield N-1 data rows.
#
# Requires: pip, unzip, awk, shuf (GNU coreutils) and credentials that the
# `kaggle` CLI can use.
#
# NOTE: all processing is line-based, i.e. it assumes one CSV record per
# line (no quoted fields containing newlines).

set -euo pipefail

readonly DATASET_SLUG="thedevastator/1-5-million-beer-reviews-from-beer-advocate"
readonly DATASET_ARCHIVE="1-5-million-beer-reviews-from-beer-advocate.zip"
readonly DATASET_FILE="beer_reviews.csv"
readonly OUTPUT_DIR="data"

# Scratch directory for intermediate files; set in main, removed on exit.
work_dir=""

#######################################
# Print an error message to stderr and abort.
# Arguments: message words
#######################################
die() {
  printf 'error: %s\n' "$*" >&2
  exit 1
}

#######################################
# Print a usage message to stderr and abort with status 2.
#######################################
usage() {
  printf 'Usage: %s N\n' "${0##*/}" >&2
  printf '  N  number of leading lines of %s to keep (header included)\n' \
    "${DATASET_FILE}" >&2
  exit 2
}

#######################################
# Remove the scratch directory, if one was created.
# Globals: work_dir (read)
#######################################
cleanup() {
  if [[ -n "${work_dir}" ]]; then
    rm -rf -- "${work_dir}"
  fi
}

#######################################
# Install the Kaggle CLI, download the dataset archive into the current
# directory and unpack it there (overwriting existing files).
# Globals: DATASET_SLUG, DATASET_ARCHIVE, DATASET_FILE (read)
#######################################
download_dataset() {
  pip install kaggle
  kaggle datasets download -d "${DATASET_SLUG}"
  unzip -o "${DATASET_ARCHIVE}"
  [[ -f "${DATASET_FILE}" ]] \
    || die "${DATASET_FILE} not found after unpacking ${DATASET_ARCHIVE}"
}

#######################################
# Shuffle the data rows of a CSV file and split them 80/10/10.
# Writes train.csv, dev.csv and test.csv (each starting with the header
# line of the input) plus the intermediate shuffled.csv into the given
# directory.
# Arguments:
#   $1 - input CSV whose first line is the header
#   $2 - existing directory to write the split files into
#######################################
split_dataset() {
  local -r input=$1
  local -r out_dir=$2
  local total_lines train_lines dev_lines part

  # Shuffle only the data rows: shuffling the header along with them would
  # drop it at a random position inside one of the splits and make the file
  # one line longer than the counts computed below.
  tail -n +2 "${input}" | shuf -o "${out_dir}/shuffled.csv"

  total_lines=$(wc -l < "${out_dir}/shuffled.csv")
  train_lines=$(( total_lines * 80 / 100 ))
  dev_lines=$(( total_lines * 10 / 100 ))
  # The test split receives every remaining row, so no row is lost to the
  # integer division above.

  # Start every split with the header so each one is a self-contained CSV.
  for part in train dev test; do
    head -n 1 "${input}" > "${out_dir}/${part}.csv"
  done

  # Single pass over the shuffled rows; '>>' appends after the header.
  # Done with awk rather than 'tail | head', which would make the pipeline
  # fail under pipefail when head closes the pipe early.
  (
    cd "${out_dir}"
    awk -v train_end="${train_lines}" \
        -v dev_end="$(( train_lines + dev_lines ))" '
      NR <= train_end { print >> "train.csv"; next }
      NR <= dev_end   { print >> "dev.csv";   next }
                      { print >> "test.csv" }
    ' shuffled.csv
  )
}

#######################################
# Entry point.
# Arguments: $1 - number of leading lines of the dataset to keep
#######################################
main() {
  (( $# >= 1 )) || usage
  local -r row_limit=$1
  [[ "${row_limit}" =~ ^[0-9]+$ ]] || usage

  # Download and unpack
  download_dataset

  # Keep the scratch directory inside the current directory so the final
  # move into OUTPUT_DIR stays on the same filesystem.
  work_dir=$(mktemp -d "./beer_split.XXXXXX")
  local -r cutoff="${work_dir}/cutoff_${DATASET_FILE}"

  printf '%s\n' "------------------ Cut off top: ${row_limit} rows ------------------"
  head -n "${row_limit}" "${DATASET_FILE}" > "${cutoff}"
  [[ -s "${cutoff}" ]] \
    || die "no lines selected from ${DATASET_FILE}; N must be at least 1"

  # Shuffle and split
  printf '%s\n' "------------------ Split and shufle ------------------"
  split_dataset "${cutoff}" "${work_dir}"

  # Cleanup: publish the finished splits; the EXIT trap removes the
  # intermediate files together with the scratch directory.
  printf '%s\n' "------------------ Clean ------------------"
  mkdir -p "${OUTPUT_DIR}"
  mv "${work_dir}/train.csv" "${work_dir}/dev.csv" "${work_dir}/test.csv" \
    "${OUTPUT_DIR}/"
}

trap cleanup EXIT
main "$@"