diff --git a/.gitignore b/.gitignore
index 207d123..69742a4 100644
--- a/.gitignore
+++ b/.gitignore
@@ -12,3 +12,5 @@ ipython_config.py
 
 # Remove previous ipynb_checkpoints
 # git rm -r .ipynb_checkpoints/
+data/
+
diff --git a/download_dataset.sh b/download_dataset.sh
new file mode 100644
index 0000000..6930433
--- /dev/null
+++ b/download_dataset.sh
@@ -0,0 +1,57 @@
+#!/usr/bin/env bash
+# Download the Kaggle solar power dataset and split every CSV in it
+# into train/dev/test files using the ratios configured below.
+set -euo pipefail
+
+# split ratios as percentages; train implicitly absorbs any rounding remainder
+train_percent=80
+dev_percent=10
+test_percent=10
+
+# split_dataset FILE
+# Shuffles FILE's data rows (keeping the header line first in every output)
+# and writes FILE.train, FILE.dev and FILE.test according to the ratios.
+split_dataset() {
+  local file=$1
+  local shuffled_file="$file.shuf"
+  local header
+  header=$(head -n 1 -- "$file")
+  # shuffle the dataset, excluding the header line describing the columns
+  tail -n +2 -- "$file" | shuf > "$shuffled_file"
+
+  # data-row count; reading from stdin keeps the filename out of wc's output
+  local lines
+  lines=$(wc -l < "$shuffled_file")
+
+  # integer percentages; train receives whatever integer division leaves over
+  local dev_lines=$(( lines * dev_percent / 100 ))
+  local test_lines=$(( lines * test_percent / 100 ))
+
+  # every output file starts with the original header line
+  printf '%s\n' "$header" | tee "$file.test" "$file.dev" > "$file.train"
+
+  head -n "$test_lines" -- "$shuffled_file" >> "$file.test"
+  head -n $(( test_lines + dev_lines )) -- "$shuffled_file" | tail -n "$dev_lines" >> "$file.dev"
+  tail -n +$(( test_lines + dev_lines + 1 )) -- "$shuffled_file" >> "$file.train"
+  rm -- "$shuffled_file"
+}
+
+mkdir -p data
+cd data || exit 1
+
+echo "Downloading dataset from Kaggle..."
+kaggle datasets download -d anikannal/solar-power-generation-data
+echo "Done."
+
+echo "Unzipping archive"
+# -o: overwrite without prompting so the script can be re-run
+unzip -o solar-power-generation-data.zip
+echo "Done."
+
+echo "Splitting datasets"
+echo "Using ratio (train:dev:test) $train_percent:$dev_percent:$test_percent"
+# glob the extracted CSVs directly instead of parsing unzip's textual output
+for file in *.csv; do
+  split_dataset "$file"
+done
+echo "Done! Files are inside ./data folder"