#!/usr/bin/env bash
#
# download_dataset.sh - download the IMDB top-1000 dataset from Kaggle and
# split every extracted file into shuffled train/dev/test subsets.
#
# Origin:   commit 063993c ("downloader script") by s444018, 2022-03-21.
# Usage:    ./download_dataset.sh
# Requires: bash, kaggle (CLI), unzip, shuf, awk
# Output:   ./data/<file>.train, ./data/<file>.dev and ./data/<file>.test,
#           each starting with the header (first) line of <file>.

set -euo pipefail

# ratios as percentages
readonly train_percent=80
readonly dev_percent=10
readonly test_percent=10

readonly dataset='harshitshankhdhar/imdb-dataset-of-top-1000-movies-and-tv-shows'
readonly data_dir='data'

# Prints an error message to stderr and aborts with status 1.
die() {
  printf 'error: %s\n' "$*" >&2
  exit 1
}

#######################################
# Write one subset file: the header of the source plus a slice of the
# shuffled body.
# Arguments: $1 - original file (its first line is the header)
#            $2 - file holding the shuffled body (no header)
#            $3 - first body line to copy (1-based)
#            $4 - last body line to copy (inclusive; < $3 copies nothing)
#            $5 - output file (truncated first)
#######################################
_write_split() {
  local src=$1 body=$2 first=$3 last=$4 out=$5

  # Redirections instead of file operands: a path containing '=' would be
  # taken by awk as a variable assignment, and one starting with '-' as an
  # option by head.
  head -n 1 < "$src" > "$out"
  awk -v first="$first" -v last="$last" 'NR >= first && NR <= last' \
    < "$body" >> "$out"
}

#######################################
# Split a line-oriented file with a header line into three shuffled subsets.
# Globals:   test_percent, dev_percent (read)
# Arguments: $1 - path of the file to split
# Outputs:   $1.test, $1.dev and $1.train. The test and dev sizes are the
#            respective percentage of the body rounded down; train receives
#            every remaining line, so no row is dropped by rounding.
# Returns:   non-zero if the file is missing or any step fails
#######################################
# The body is a subshell "( ... )" rather than "{ ... }": all variables stay
# private to the call, and the EXIT trap removes the temporary file on every
# exit path without touching the caller's traps.
split_dataset() (
  src=${1:?usage: split_dataset FILE}
  [[ -f "$src" ]] || die "not a regular file: $src"

  shuffled=$(mktemp "${src}.shuf.XXXXXX")
  trap 'rm -f -- "$shuffled"' EXIT

  # shuffling the dataset, excluding the first line which describes the columns
  tail -n +2 < "$src" | shuf > "$shuffled"

  # "wc -l < file" prints only the count; $(( )) strips any padding around it.
  lines=$(( $(wc -l < "$shuffled") ))
  test_lines=$(( lines * test_percent / 100 ))
  dev_lines=$(( lines * dev_percent / 100 ))
  dev_end=$(( test_lines + dev_lines ))

  _write_split "$src" "$shuffled" 1 "$test_lines" "$src.test"
  # dev is the dev_lines rows that follow the test rows
  _write_split "$src" "$shuffled" "$(( test_lines + 1 ))" "$dev_end" "$src.dev"
  _write_split "$src" "$shuffled" "$(( dev_end + 1 ))" "$lines" "$src.train"
)

#######################################
# Download the archive into ./data, extract it and split every extracted file.
# Globals:   dataset, data_dir, train_percent, dev_percent, test_percent (read)
# Outputs:   progress messages on stdout, errors on stderr
#######################################
main() {
  local tool archive listing entry

  for tool in kaggle unzip shuf awk; do
    command -v "$tool" > /dev/null || die "required tool not found: $tool"
  done

  (( train_percent + dev_percent + test_percent == 100 )) \
    || die "train/dev/test percentages must add up to 100"

  # -p: re-running the script with an existing data folder is not an error
  mkdir -p -- "$data_dir"
  cd -- "$data_dir" || die "cannot enter directory: $data_dir"

  echo "Downloading dataset from Kaggle..."
  kaggle datasets download -d "$dataset"
  echo "Done."

  # the archive name is the dataset slug without its owner prefix
  archive="${dataset##*/}.zip"
  [[ -f "$archive" ]] || die "expected archive not found: $archive"

  echo "Unzipping archive"
  # -Z1 (zipinfo mode) lists the member names only, one per line, so nothing
  # has to be scraped out of unzip's human-readable progress output.
  listing=$(unzip -Z1 "$archive")
  # -o: overwrite files from a previous run instead of prompting
  unzip -o -q "$archive"
  echo "Done."

  echo "Splitting datasets"
  echo "Using ratio (train:dev:test) $train_percent:$dev_percent:$test_percent"
  while IFS= read -r entry; do
    # skip the blank line produced by an empty listing, and directory members
    [[ -n "$entry" && "$entry" != */ ]] || continue
    split_dataset "$entry"
  done <<< "$listing"
  echo "Done! Files are inside ./data folder"
}

main "$@"