cleaning

2022-03-21 02:06:14 +01:00 · 2022-03-21 02:06:14 +01:00 · 01e02a7cb0
commit 01e02a7cb0
parent 7178e8efb7
4 changed files with 0 additions and 1058 deletions
--- a/data/imdb-dataset-of-top-1000-movies-and-tv-shows.zip
+++ b/data/imdb-dataset-of-top-1000-movies-and-tv-shows.zip
--- a/data/imdb_top_1000.csv
+++ b/data/imdb_top_1000.csv
--- a/download_dataset.sh
+++ b/download_dataset.sh
@@ -1,55 +0,0 @@
-# split ratios as integer percentages; each can be overridden via the environment
-train_percent=${train_percent:-80}
-dev_percent=${dev_percent:-10}
-test_percent=${test_percent:-10}
-
-split_dataset() {
-    # Split CSV file $1 into $1.test, $1.dev and $1.train, each beginning with
-    # the header row of $1. Data rows are shuffled; test_percent and dev_percent
-    # (globals) of them, rounded down, go to test and dev, and every remaining
-    # row goes to train. Returns non-zero if $1 cannot be read or shuffled.
-    local src shuffled_file header lines dev_lines test_lines part
-    src=$1
-    shuffled_file=$src.shuf
-
-    # shuffling the dataset, excluding the first line which describes the columns
-    header=$(head -n 1 "$src") || return
-    tail -n +2 "$src" | shuf > "$shuffled_file" || return
-
-    # getting line count so we can split the datasets; reading from stdin keeps
-    # the file name out of wc's output, so no field cutting is needed
-    lines=$(wc -l < "$shuffled_file")
-    dev_lines=$(($lines * $dev_percent / 100))
-    test_lines=$(($lines * $test_percent / 100))
-
-    # first line has to be the header; printf writes it verbatim, where an
-    # unquoted echo would collapse whitespace and expand globs
-    for part in test dev train; do
-        printf '%s\n' "$header" > "$src.$part"
-    done
-
-    # rows 1..test -> test, the next dev_lines rows -> dev, the rest -> train
-    head -n "$test_lines" "$shuffled_file" >> "$src.test"
-    head -n "$(($test_lines + $dev_lines))" "$shuffled_file" |
-        tail -n "$dev_lines" >> "$src.dev"
-    tail -n "+$(($test_lines + $dev_lines + 1))" "$shuffled_file" >> "$src.train"
-    rm "$shuffled_file"
-}
-
-# work inside ./data (created if missing); abort rather than write elsewhere
-mkdir -p data && cd data || exit 1
-
-echo "Downloading dataset from Kaggle..."
-kaggle datasets download -d harshitshankhdhar/imdb-dataset-of-top-1000-movies-and-tv-shows || exit 1
-echo "Done."
-
-echo "Unzipping archive"
-# -o overwrites on re-runs without prompting; cut keeps field 4 of each report line, taken to be the file name
-files=$(unzip -o imdb-dataset-of-top-1000-movies-and-tv-shows.zip | tail -n +2 | cut -d ' ' -f 4)
-[ -n "$files" ] || { echo "No files were extracted" >&2; exit 1; }
-echo "Done."
-
-echo "Splitting datasets"
-echo "Using ratio (train:dev:test) $train_percent:$dev_percent:$test_percent"
-for file in $files; do split_dataset "$file" || exit 1; done  # $files is split on purpose
-echo "Done! Files are inside ./data folder"
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,2 +0,0 @@
-kaggle==1.5.12
-pandas==1.4.1