From 063993c91a3f18aac45b29e4cd67acb52668fc15 Mon Sep 17 00:00:00 2001
From: s444018 <szym99on@gmail.com>
Date: Mon, 21 Mar 2022 00:44:54 +0100
Subject: [PATCH] downloader script

---
 download_dataset.sh | 55 +++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 55 insertions(+)
 create mode 100644 download_dataset.sh

diff --git a/download_dataset.sh b/download_dataset.sh
new file mode 100644
index 0000000..190a43b
--- /dev/null
+++ b/download_dataset.sh
@@ -0,0 +1,55 @@
+# Split ratios (train / dev / test) as whole-number percentages of the data rows
+train_percent=80
+dev_percent=10
+test_percent=10
+
+split_dataset() {
+    # Split CSV file $1 into $1.train, $1.dev and $1.test, each starting with
+    # the header (first line) of $1. The remaining rows are shuffled; test and
+    # dev get test_percent / dev_percent of them, rounded down, and train gets
+    # everything left over. Reads the globals dev_percent and test_percent.
+    # Returns 1 if the input cannot be read or an output file cannot be created.
+    local src="$1" header rows dev_rows test_rows part
+    local shuffled="$src.shuf"
+
+    # Shuffle the rows, excluding the first line which describes the columns.
+    header=$(head -n 1 "$src") || return 1
+    tail -n +2 "$src" | shuf > "$shuffled" || return 1
+
+    # Feed wc through stdin so it prints just the count, without a file name.
+    rows=$(wc -l < "$shuffled") || return 1
+    dev_rows=$((rows * dev_percent / 100))
+    test_rows=$((rows * test_percent / 100))
+
+    # Each output file starts with the header, written verbatim (quoted, so
+    # its whitespace and any glob characters are preserved).
+    for part in test dev train; do
+        printf '%s\n' "$header" > "$src.$part" || return 1
+    done
+
+    # The shuffled rows are cut into consecutive slices: [test][dev][train].
+    head -n "$test_rows" "$shuffled" >> "$src.test"
+    # The dev slice is the last dev_rows lines of the first test_rows + dev_rows
+    # lines; its size must come from dev_percent, not from test_percent.
+    head -n "$((test_rows + dev_rows))" "$shuffled" | tail -n "$dev_rows" >> "$src.dev"
+    tail -n "+$((test_rows + dev_rows + 1))" "$shuffled" >> "$src.train"
+    rm "$shuffled"
+}
+
+mkdir -p data && cd data || exit 1  # -p tolerates re-runs; never continue outside ./data
+echo "Downloading dataset from Kaggle..."
+kaggle datasets download -d harshitshankhdhar/imdb-dataset-of-top-1000-movies-and-tv-shows || exit 1
+echo "Done."
+
+echo "Unzipping archive"
+# -o overwrites without prompting; awk keeps NAME from unzip's "inflating: NAME" / "extracting: NAME" lines.
+files=$(unzip -o imdb-dataset-of-top-1000-movies-and-tv-shows.zip | awk '$1 == "inflating:" || $1 == "extracting:" { print $2 }')
+[ -n "$files" ] || { echo "No files were extracted from the archive" >&2; exit 1; }
+echo "Done."
+
+echo "Splitting datasets"
+echo "Using ratio (train:dev:test) $train_percent:$dev_percent:$test_percent"
+for file in $files; do
+    split_dataset "$file"
+done
+echo "Done! Files are inside ./data folder"
\ No newline at end of file