#!/bin/bash
#
# Download the Beer Advocate reviews dataset from Kaggle, keep only the first
# N lines of the CSV and split its data rows into train/dev/test sets
# (80% / 10% / remainder).
#
# Usage:   <script> N
#   N - passed to `head -n`; number of leading lines of beer_reviews.csv to
#       keep. The CSV header is one of those lines.
# Env:     KAGGLE_USERNAME, KAGGLE_KEY - Kaggle credentials (presumably picked
#          up by the kaggle CLI; never printed by this script).
# Outputs: data/train.csv, data/dev.csv, data/test.csv. Each file starts with
#          the CSV header followed by its share of the shuffled data rows.
# Note:    the split is line based, so it assumes one CSV record per line.

# Abort on the first failing command, on unset variables and on failures
# inside pipelines, so a broken download never yields a bogus split.
set -euo pipefail

readonly DATASET_SLUG="thedevastator/1-5-million-beer-reviews-from-beer-advocate"
readonly DATASET_ARCHIVE="1-5-million-beer-reviews-from-beer-advocate.zip"
readonly DATASET_FILE="beer_reviews.csv"
readonly OUTPUT_DIR="data"

if [[ $# -lt 1 || -z "$1" ]]; then
  printf 'Usage: %s ROW_COUNT\n' "${0##*/}" >&2
  exit 2
fi

# --- Download and unpack ---------------------------------------------------
# Report missing credentials without ever echoing their values: printing
# secrets would leak them into terminal scrollback and CI logs.
for credential in KAGGLE_USERNAME KAGGLE_KEY; do
  if [[ -z "${!credential:-}" ]]; then
    printf 'warning: %s is not set\n' "$credential" >&2
  fi
done

kaggle datasets download -d "$DATASET_SLUG"
unzip -o "$DATASET_ARCHIVE"

# Intermediate files live in a private temporary directory that is removed on
# every exit path, including failures.
work_dir=$(mktemp -d)
trap 'rm -rf -- "$work_dir"' EXIT

cutoff_file="$work_dir/cutoff_$DATASET_FILE"
header_file="$work_dir/header.csv"
shuffled_file="$work_dir/shuffled.csv"

echo "------------------ Cut off top: ${1} rows ------------------"
head -n "$1" "$DATASET_FILE" > "$cutoff_file"

# --- Shuffle and split -----------------------------------------------------
echo "------------------ Split and shufle ------------------"

# Keep the header out of the shuffle; otherwise it would land at a random
# position inside one of the splits and the row counts would be off by one.
head -n 1 "$cutoff_file" > "$header_file"
tail -n +2 "$cutoff_file" | shuf -o "$shuffled_file"

total_lines=$(wc -l < "$shuffled_file")
train_lines=$((total_lines * 80 / 100))
dev_lines=$((total_lines * 10 / 100))
# The test set takes whatever is left, so integer rounding loses no row.
test_lines=$((total_lines - train_lines - dev_lines))

#######################################
# Write the header plus a contiguous slice of the shuffled rows to a file.
# Globals:   header_file, shuffled_file (read)
# Arguments: $1 - 1-based number of the first row of the slice
#            $2 - number of rows in the slice (may be 0)
#            $3 - destination file
#######################################
write_split() {
  local first=$1 count=$2 dest=$3
  {
    cat -- "$header_file"
    # awk is used instead of `tail | head` so that no pipeline stage is killed
    # by SIGPIPE (which pipefail would report as a failure); it also handles
    # an empty slice correctly.
    awk -v first="$first" -v count="$count" \
      'NR >= first + count { exit } NR >= first' "$shuffled_file"
  } > "$dest"
}

mkdir -p "$OUTPUT_DIR"
write_split 1 "$train_lines" "$OUTPUT_DIR/train.csv"
write_split "$((train_lines + 1))" "$dev_lines" "$OUTPUT_DIR/dev.csv"
write_split "$((train_lines + dev_lines + 1))" "$test_lines" "$OUTPUT_DIR/test.csv"

# --- Cleanup ---------------------------------------------------------------
echo "------------------ Clean ------------------"
rm -rf -- "$work_dir"