2022-03-27 17:43:59 +02:00
|
|
|
#######################################
# Shuffle the data rows of imdb_top_1000.csv (header line dropped), keep the
# first CUTOFF shuffled rows and split them into three files:
#   imdb_top_1000_test2.csv  - first tenth of the kept rows
#   imdb_top_1000_dev2.csv   - second tenth of the kept rows
#   imdb_top_1000_train2.csv - every remaining row
# A "tenth" is the kept row count divided by 10, rounded down.
# Globals:
#   CUTOFF (read) - number of shuffled rows to keep; when unset or empty,
#                   all data rows are kept.
# Arguments:
#   None
# Outputs:
#   Writes the three split files into the current directory and prints their
#   line counts to stdout; diagnostics go to stderr.
# Returns:
#   0 on success, 1 if the source CSV is unreadable or any step failed.
#######################################
dataset_operation() {
  local src=imdb_top_1000.csv
  local shuffled_all="${src}.s"
  local shuffled="${src}.shuf"
  local cutoff=${CUTOFF:-}
  local total tenth dev_end train_start
  local rc=0

  if [ ! -r "$src" ]; then
    printf 'dataset_operation: cannot read %s\n' "$src" >&2
    return 1
  fi

  # Skip the CSV header (line 1) and randomise the order of the data rows.
  tail -n +2 "$src" | shuf > "$shuffled_all" || rc=1

  # Without an explicit CUTOFF keep every shuffled row. The arithmetic
  # expansion also strips any padding wc may put around the number.
  if [ -z "$cutoff" ]; then
    cutoff=$(( $(wc -l < "$shuffled_all") ))
  fi
  head -n "$cutoff" "$shuffled_all" > "$shuffled" || rc=1

  total=$(( $(wc -l < "$shuffled") ))
  tenth=$(( total / 10 ))
  dev_end=$(( tenth * 2 ))
  train_start=$(( dev_end + 1 ))

  # Rows 1..tenth -> test, tenth+1..2*tenth -> dev, the rest -> train.
  head -n "$tenth" "$shuffled" > imdb_top_1000_test2.csv || rc=1
  head -n "$dev_end" "$shuffled" | tail -n "$tenth" > imdb_top_1000_dev2.csv || rc=1
  tail -n "+${train_start}" "$shuffled" > imdb_top_1000_train2.csv || rc=1

  # Remove both intermediate files so nothing but the splits is left behind.
  rm -f -- "$shuffled_all" "$shuffled"

  # Report the size of each split.
  wc -l imdb_top_1000_test2.csv imdb_top_1000_dev2.csv imdb_top_1000_train2.csv

  return "$rc"
}
|
|
|
|
|
2022-03-27 22:49:35 +02:00
|
|
|
# Download the IMDB top-1000 archive from Kaggle, unpack it into the current
# directory (-o: overwrite files from an earlier run without prompting), then
# run dataset_operation on the result.
# Each step needs the output of the previous one, so stop at the first
# failure (propagating its exit status) rather than continuing with a missing
# or stale archive/CSV.
kaggle datasets download -d harshitshankhdhar/imdb-dataset-of-top-1000-movies-and-tv-shows || exit
unzip -o imdb-dataset-of-top-1000-movies-and-tv-shows.zip || exit
dataset_operation
|