2024-03-24 14:39:33 +01:00
|
|
|
#!/bin/bash
#
# Download the Beer Advocate reviews dataset from Kaggle, shuffle its rows,
# cut it into disjoint train/dev/test CSV files plus a few small samples of
# the training set, and pack the results into artifacts.tar.gz.
#
# Environment:
#   CUTOFF        Optional. When non-empty, the first CUTOFF lines of
#                 train.csv are also written to train_cutoff.csv and archived.
#   DATASET_FILE  Optional. Extra file to delete during cleanup.
#
# Output files (current directory): train.csv, dev.csv, test.csv,
# train_head.csv, train_tail.csv, [train_cutoff.csv], artifacts.tar.gz.
# The last line printed on stdout is the archive name.

# Stop at the first failing step instead of carrying on with missing files.
set -euo pipefail

readonly dataset_slug='thedevastator/1-5-million-beer-reviews-from-beer-advocate'
readonly dataset_zip='1-5-million-beer-reviews-from-beer-advocate.zip'
readonly source_csv='beer_reviews.csv'
readonly shuffled_csv='shuffled_dataset.csv'
readonly train_rows=80000
readonly dev_rows=10000
readonly sample_rows=1000

pip install kaggle

kaggle datasets download -d "$dataset_slug"
unzip -o "$dataset_zip"

# Shuffle the data rows only. The first line is assumed to be the CSV column
# header (TODO confirm against the dataset); it is kept aside so it does not
# end up at a random position, and is written at the top of every split.
header=$(head -n 1 "$source_csv")
tail -n +2 "$source_csv" | shuf > "$shuffled_csv"

# Diagnostic output: where we are and what the download/unzip produced.
pwd
ls -a

# Disjoint splits of the shuffled rows:
#   train = rows 1..train_rows
#   dev   = the next dev_rows rows
#   test  = everything that is left
# `split` is deliberately not used: its last argument is an output *prefix*
# (it would create train.csvaa, train.csvab, ...), not an output file name.
# sed is used for the middle slice instead of `tail | head`, because head
# closing the pipe early would trip pipefail via SIGPIPE.
dev_first=$((train_rows + 1))
dev_last=$((train_rows + dev_rows))
test_first=$((dev_last + 1))

{
  printf '%s\n' "$header"
  head -n "$train_rows" "$shuffled_csv"
} > train.csv

{
  printf '%s\n' "$header"
  sed -n "${dev_first},${dev_last}p" "$shuffled_csv"
} > dev.csv

{
  printf '%s\n' "$header"
  tail -n +"$test_first" "$shuffled_csv"
} > test.csv

# Small samples from both ends of the training file for quick inspection.
head -n "$sample_rows" train.csv > train_head.csv
tail -n "$sample_rows" train.csv > train_tail.csv

# Only files that were actually produced go into the archive; listing a
# missing member makes tar exit with an error.
artifacts=(train.csv dev.csv test.csv train_head.csv train_tail.csv)

if [[ -n "${CUTOFF:-}" ]]; then
  head -n "$CUTOFF" train.csv > train_cutoff.csv
  artifacts+=(train_cutoff.csv)
fi

tar -czf artifacts.tar.gz "${artifacts[@]}"

# Cleanup: drop the shuffled intermediate and, if the caller named one, the
# file in DATASET_FILE. -f keeps an already-missing file from being an error.
rm -f -- "$shuffled_csv" ${DATASET_FILE:+"$DATASET_FILE"}

echo "artifacts.tar.gz"
|
|
|
|
|