ium_444507/download.sh

38 lines
1.0 KiB
Bash
Executable File

#!/bin/bash
#
# Download the Kaggle "car-prices-poland" dataset, draw a random sample of its
# rows, and split that sample into three disjoint files.
#
# Usage: download.sh CUTOFF
#   CUTOFF  maximum number of shuffled data rows to keep before splitting
#           (non-negative integer).
#
# Files written to the current directory:
#   car_prices.csv.test   rows 1 .. N/6 of the sample
#   car_prices.csv.dev    rows N/6+1 .. 2*(N/6) of the sample
#   car_prices.csv.train  all remaining rows
# where N is the number of sampled rows and N/6 is integer division.
# The CSV header row is dropped, so none of the output files has a header.
#
# Requires: kaggle CLI, unzip, shuf.
set -euo pipefail

readonly ARCHIVE='car-prices-poland.zip'
readonly SOURCE_CSV='./Car_Prices_Poland_Kaggle.csv'
readonly SHUFFLED_CSV='./Car_Prices_Poland_Kaggle_shuf.csv'
readonly OUT_PREFIX='car_prices.csv'

# Validate the cutoff before doing any network work so a bad call fails fast.
if (( $# < 1 )) || [[ ! "$1" =~ ^[0-9]+$ ]]; then
  printf 'Usage: %s CUTOFF\n' "${0##*/}" >&2
  printf '  CUTOFF must be a non-negative integer (rows to keep).\n' >&2
  exit 2
fi
readonly cutoff=$1

# The shuffled sample is only an intermediate; remove it on every exit path.
trap 'rm -f -- "$SHUFFLED_CSV"' EXIT

echo 'Downloading Dataset'
kaggle datasets download -d aleksandrglotov/car-prices-poland
echo 'Dataset downloaded'

echo 'Unzippig Dataset'
unzip -o "$ARCHIVE"
echo 'Dataset unzipped'

total=$(wc -l < "$SOURCE_CSV")
echo "Initial dataset count: ${total}"
echo "CUTOFF VALUE:  ${cutoff}"

echo 'Skip first header row and shuffle'
# `tail -n +2` starts output at line 2, i.e. drops the header. (`head -n -1`
# from the course materials drops the LAST line instead, so it is not usable.)
# `shuf -n` emits at most CUTOFF random lines; unlike `shuf | head -n`, it
# cannot be killed by SIGPIPE, which would abort the script under pipefail.
tail -n +2 "$SOURCE_CSV" | shuf -n "$cutoff" > "$SHUFFLED_CSV"
echo 'Shuffled'

sampled=$(wc -l < "$SHUFFLED_CSV")
echo "Dataset count after cutoff: ${sampled}"

part=$(( sampled / 6 ))          # size of the test split and of the dev split
dev_end=$(( part * 2 ))          # last line belonging to the dev split
train_start=$(( dev_end + 1 ))   # first line belonging to the train split
echo "len: ${sampled}"
echo "len1: ${part}"
echo "len2: ${train_start}"

echo 'Divide and save to files'
head -n "$part" "$SHUFFLED_CSV" > "${OUT_PREFIX}.test"
# Take the first 2*part lines and keep the last part of them, so dev is the
# SECOND sixth of the sample and does not overlap with test.
head -n "$dev_end" "$SHUFFLED_CSV" | tail -n "$part" > "${OUT_PREFIX}.dev"
tail -n "+${train_start}" "$SHUFFLED_CSV" > "${OUT_PREFIX}.train"

echo 'Divided datasets count'
wc -l "${OUT_PREFIX}.dev" "${OUT_PREFIX}.test" "${OUT_PREFIX}.train"