#!/bin/bash
#
# Download the Beer Advocate reviews dataset from Kaggle and build shuffled
# train/dev/test splits from the first N lines of beer_reviews.csv.
#
# Usage: kuggle_download.sh N
#   N - positive integer: how many leading lines of beer_reviews.csv to use.
#       The first line is treated as the CSV header, so N - 1 data rows are
#       shuffled and split.
#
# Outputs (written to the current directory):
#   train.csv, dev.csv, test.csv - 80% / 10% / remainder of the shuffled
#                                  rows; each file starts with the header line
#   artifacts.tar.gz             - gzip-compressed tar of the three splits
#
# Prints "artifacts.tar.gz" on stdout as its last line on success.

set -euo pipefail

readonly DATASET_SLUG="thedevastator/1-5-million-beer-reviews-from-beer-advocate"
readonly DATASET_ZIP="1-5-million-beer-reviews-from-beer-advocate.zip"
readonly DATASET_FILE="beer_reviews.csv"
readonly ARCHIVE="artifacts.tar.gz"

# Scratch directory for intermediate files; removed by cleanup() on exit.
workdir=""

# Print a message to stderr and abort with a non-zero status.
die() {
  printf '%s\n' "$*" >&2
  exit 1
}

# Remove the scratch directory on every exit path (success or failure).
cleanup() {
  if [[ -n "$workdir" ]]; then
    rm -rf -- "$workdir"
  fi
}

#######################################
# Write the header followed by rows first..last of the data file.
# Arguments:
#   $1 - file holding the header line
#   $2 - file holding the shuffled data rows (no header)
#   $3 - first row to emit (1-based, inclusive)
#   $4 - last row to emit (inclusive); a value below $3 yields no rows
#   $5 - output path
#######################################
write_split() {
  local header_file=$1 data_file=$2 first=$3 last=$4 out=$5
  {
    cat -- "$header_file"
    # awk (rather than `tail | head`) avoids a SIGPIPE failure under
    # pipefail and copes with empty ranges.
    awk -v first="$first" -v last="$last" \
      'NR >= first && NR <= last' "$data_file"
  } > "$out"
}

main() {
  # Validate before downloading anything so a bad call fails fast.
  (( $# >= 1 )) || die "Usage: ${0##*/} N"
  local cutoff_lines=$1
  [[ "$cutoff_lines" =~ ^[1-9][0-9]*$ ]] \
    || die "N must be a positive integer, got: ${cutoff_lines}"

  # Download and unpack.
  pip install kaggle
  kaggle datasets download -d "$DATASET_SLUG"
  unzip -o "$DATASET_ZIP"

  # Keep intermediates next to the outputs (same filesystem as the dataset).
  workdir=$(mktemp -d ./kuggle_download.XXXXXX) || die "mktemp failed"
  trap cleanup EXIT

  local header="${workdir}/header.csv"
  local shuffled="${workdir}/shuffled.csv"

  # Shuffle only the data rows: the header is set aside so it cannot end up
  # in the middle of a split, and the row count matches the shuffled file.
  head -n 1 -- "$DATASET_FILE" > "$header"
  head -n "$cutoff_lines" -- "$DATASET_FILE" \
    | tail -n +2 \
    | shuf -o "$shuffled"

  # Split sizes: 80% train, 10% dev, everything left over goes to test.
  local total_lines train_lines dev_lines
  total_lines=$(awk 'END { print NR }' "$shuffled")
  train_lines=$(( total_lines * 80 / 100 ))
  dev_lines=$(( total_lines * 10 / 100 ))
  local dev_end=$(( train_lines + dev_lines ))

  write_split "$header" "$shuffled" 1 "$train_lines" train.csv
  write_split "$header" "$shuffled" "$(( train_lines + 1 ))" "$dev_end" dev.csv
  write_split "$header" "$shuffled" "$(( dev_end + 1 ))" "$total_lines" test.csv

  # Archive the splits.
  tar -czf "$ARCHIVE" train.csv dev.csv test.csv

  echo "artifacts.tar.gz"
}

main "$@"