#!/usr/bin/env bash
#
# Downloads the Kaggle "anikannal/solar-power-generation-data" dataset into
# ./data, unzips it, and splits every extracted file into three files next to
# it: <file>.train, <file>.dev and <file>.test. Each output starts with the
# original header line.
#
# Requires on PATH: kaggle, unzip, shuf.

# Abort on the first failing command or on use of an unset variable, so a
# failed download/unzip never leads to "splitting" stale or missing files.
set -eu

# ratios as percentages
train_percent=80
dev_percent=10
test_percent=10

# Name of the archive that `kaggle datasets download` is expected to leave in
# the current directory.
archive=solar-power-generation-data.zip

# Prints the given message to stderr and terminates the script.
die() {
  printf '%s\n' "$*" >&2
  exit 1
}

# Prints how many lines correspond to a percentage of a total.
# Arguments: $1 - total number of lines
#            $2 - percentage (integer)
# Outputs:   the line count on stdout (integer arithmetic, rounded down)
_line_count_for() {
  echo $(( $1 * $2 / 100 ))
}

# Splits a file with a one-line header into shuffled test/dev/train files.
# Globals:   dev_percent, test_percent (read)
# Arguments: $1 - path of the file to split
# Outputs:   creates/overwrites $1.test, $1.dev and $1.train; uses $1.shuf as
#            a scratch file and removes it when done
# Returns:   non-zero if $1 is not a regular file
split_dataset() {
  local file="$1"
  local shuffled_file="$file.shuf"
  local header lines dev_lines test_lines out

  if [ ! -f "$file" ]; then
    printf 'split_dataset: not a regular file: %s\n' "$file" >&2
    return 1
  fi

  # shuffling the dataset, excluding the first line which describes the columns
  header=$(head -n 1 -- "$file")
  tail -n +2 -- "$file" | shuf > "$shuffled_file"

  # getting line count so we can split the datasets
  lines=$(wc -l < "$shuffled_file")
  dev_lines=$(_line_count_for "$lines" "$dev_percent")
  test_lines=$(_line_count_for "$lines" "$test_percent")

  # first line of every output has to be the header; printf with a quoted
  # argument keeps the header byte-for-byte (no word splitting or globbing)
  for out in "$file.test" "$file.dev" "$file.train"; do
    printf '%s\n' "$header" > "$out"
  done

  # test: the first test_lines rows
  head -n "$test_lines" -- "$shuffled_file" >> "$file.test"

  # dev: the next dev_lines rows (skip the test rows, then take dev_lines)
  tail -n "+$(( test_lines + 1 ))" -- "$shuffled_file" \
    | head -n "$dev_lines" >> "$file.dev"

  # train: everything that is left, so rows lost to rounding down end up here
  tail -n "+$(( test_lines + dev_lines + 1 ))" -- "$shuffled_file" \
    >> "$file.train"

  rm -f -- "$shuffled_file"
}

command -v kaggle > /dev/null || die "kaggle CLI not found on PATH"
command -v unzip > /dev/null || die "unzip not found on PATH"
command -v shuf > /dev/null || die "shuf not found on PATH"

# -p: do not fail when ./data is left over from a previous run
mkdir -p data
cd data || die "cannot cd into ./data"

echo "Downloading dataset from Kaggle..."
kaggle datasets download -d anikannal/solar-power-generation-data
echo "Done."

echo "Unzipping archive"
# -o overwrites files from an earlier run instead of prompting; -q keeps the
# extraction quiet because the member list is read separately below.
unzip -o -q "$archive"
# -Z1 lists the archive members one per line, which is safer than scraping
# the human-readable extraction log.
files=$(unzip -Z1 "$archive")
echo "Done."

echo "Splitting datasets"
echo "Using ratio (train:dev:test) $train_percent:$dev_percent:$test_percent"

# Read the list line by line so names containing spaces stay intact.
while IFS= read -r file; do
  case "$file" in
    '' | */) continue ;; # skip blank lines and directory entries
  esac
  split_dataset "$file"
done <<EOF
$files
EOF

echo "Done! Files are inside ./data folder"