downloader script

This commit is contained in:
s444018 2022-03-21 00:44:54 +01:00
parent 07ee2d02b9
commit 063993c91a

55
download_dataset.sh Normal file
View File

@@ -0,0 +1,55 @@
# Split ratios (train : dev : test), written as integer percentages.
test_percent=10
dev_percent=10
train_percent=80
#######################################
# Split a delimited text file with a one-line header into three files:
# <file>.test, <file>.dev and <file>.train. The data rows are shuffled, and
# every output file starts with a copy of the original header line.
#
# Globals:
#   test_percent (read) - integer percentage of data rows written to .test
#   dev_percent  (read) - integer percentage of data rows written to .dev
#   The .train file receives every remaining row, so rows lost to integer
#   rounding of the two percentages above end up in the training split.
# Arguments:
#   $1 - path of the file to split
# Outputs:
#   Creates/overwrites $1.test, $1.dev and $1.train; uses $1.shuf as a
#   scratch file and removes it before returning.
# Returns:
#   0 on success, 1 if the input is missing or an output cannot be written.
#######################################
split_dataset() {
  # Quoted assignment after a bare `local` keeps this safe in shells that
  # word-split `local x=$y`.
  local src
  src="${1:-}"
  if [ -z "$src" ] || [ ! -f "$src" ]; then
    printf 'split_dataset: not a regular file: %s\n' "$src" >&2
    return 1
  fi

  local shuffled_file header lines test_lines dev_lines part
  shuffled_file="${src}.shuf"

  # Shuffle the dataset, excluding the first line which describes the columns.
  header=$(head -n 1 -- "$src") || return 1
  if ! tail -n +2 -- "$src" | shuf > "$shuffled_file"; then
    rm -f -- "$shuffled_file"
    return 1
  fi

  # Count the data rows. Reading from stdin makes wc print only the number;
  # the arithmetic expansion strips the padding some wc implementations add.
  lines=$(wc -l < "$shuffled_file")
  lines=$((lines))
  test_lines=$((lines * test_percent / 100))
  dev_lines=$((lines * dev_percent / 100))

  # The first line of every split has to be the header. printf with a quoted
  # argument keeps the header byte-for-byte (no word splitting, no globbing).
  for part in test dev train; do
    if ! printf '%s\n' "$header" > "${src}.${part}"; then
      rm -f -- "$shuffled_file"
      return 1
    fi
  done

  # Rows [1, test_lines] -> test.
  head -n "$test_lines" -- "$shuffled_file" >> "${src}.test"
  # Rows (test_lines, test_lines + dev_lines] -> dev. The tail count must be
  # dev_lines, otherwise dev is mis-sized whenever dev_percent != test_percent.
  head -n "$((test_lines + dev_lines))" -- "$shuffled_file" \
    | tail -n "$dev_lines" >> "${src}.dev"
  # Everything after the test and dev rows -> train.
  tail -n "+$((test_lines + dev_lines + 1))" -- "$shuffled_file" >> "${src}.train"

  rm -f -- "$shuffled_file"
}
# Main flow: download the Kaggle archive into ./data, extract it, and split
# every extracted file into train/dev/test parts with split_dataset.
# Each step is checked so a failure stops the script instead of printing
# "Done." and carrying on in the wrong state.
dataset_slug=harshitshankhdhar/imdb-dataset-of-top-1000-movies-and-tv-shows
# The downloaded archive is named after the last path component of the slug.
archive="${dataset_slug##*/}.zip"

# Print an error message to stderr and abort the script.
fail() {
  printf 'download_dataset: %s\n' "$*" >&2
  exit 1
}

command -v kaggle > /dev/null || fail "the 'kaggle' command is required but was not found"
command -v unzip > /dev/null || fail "the 'unzip' command is required but was not found"

# -p: re-running the script with an existing ./data directory is not an error.
mkdir -p data || fail "cannot create ./data"
cd data || fail "cannot enter ./data"

echo "Downloading dataset from Kaggle..."
kaggle datasets download -d "$dataset_slug" || fail "download failed"
echo "Done."

echo "Unzipping archive"
# -o overwrites files left over from a previous run without prompting.
# unzip exits with 1 for warnings after a successful extraction, so only
# statuses above 1 are treated as failures.
unzip -o -q "$archive"
unzip_status=$?
[ "$unzip_status" -le 1 ] || fail "could not extract $archive (unzip exit status $unzip_status)"
# Ask unzip (zipinfo mode) for the member names, one per line, instead of
# scraping the human-readable extraction log.
files=$(unzip -Z -1 "$archive")
[ -n "$files" ] || fail "$archive does not list any files"
echo "Done."

echo "Splitting datasets"
echo "Using ratio (train:dev:test) $train_percent:$dev_percent:$test_percent"
# Read the list line by line so names containing spaces stay intact.
while IFS= read -r file; do
  case "$file" in
    '' | */) continue ;; # skip blank lines and directory entries
  esac
  split_dataset "$file" || fail "could not split $file"
done << EOF
$files
EOF
echo "Done! Files are inside ./data folder"