2021-03-28 12:23:39 +02:00
|
|
|
# Task 5
|
|
|
|
|
|
|
|
# Download the data set
|
|
|
|
# Fetch the timmate/avocado-prices-2020 dataset with the Kaggle CLI.
# The unzip step that follows expects the archive avocado-prices-2020.zip
# in the current directory.
kaggle datasets download -d timmate/avocado-prices-2020
|
|
|
|
# Unpack the downloaded archive; -o overwrites existing files without
# prompting, so a re-run does not stop to ask.
# Later steps read avocado-updated-2020.csv (presumably extracted here).
unzip -o avocado-prices-2020.zip
|
2021-03-28 16:55:29 +02:00
|
|
|
# Number of lines in the source CSV (its first line is the header).
# Feeding the file on stdin makes wc print only the count, so no awk
# post-processing is needed.
TOTAL_SIZE=$(wc -l < avocado-updated-2020.csv)
|
2021-03-28 12:23:39 +02:00
|
|
|
|
|
|
|
# File processing
|
|
|
|
# Diagnostic: list every completely empty line of the CSV, prefixed with
# its line number. Prints nothing when the file has no empty lines.
grep --line-number --perl-regexp '^$' avocado-updated-2020.csv
|
|
|
|
|
|
|
|
# Remove the unneeded "year" column (redundant data)
|
2021-03-28 16:22:31 +02:00
|
|
|
# Drop field 12 (the "year" column) from the CSV.
# cut only writes to stdout, so on its own it never changes the file that
# the following steps read. Write the result to a temporary file and move
# it over the original only if cut succeeded.
# NOTE(review): cut splits on every comma, so this assumes no quoted field
# contains a comma - confirm against the data.
cut -d ',' -f12 --complement avocado-updated-2020.csv > avocado-updated-2020.csv.tmp \
  && mv avocado-updated-2020.csv.tmp avocado-updated-2020.csv
|
2021-03-28 15:22:47 +02:00
|
|
|
|
2021-03-28 16:22:31 +02:00
|
|
|
# Save the first line of the CSV (the column header) to header.csv so it
# can be prepended to each split later. sed prints line 1 and quits.
sed 1q avocado-updated-2020.csv > header.csv
|
2021-03-28 15:24:06 +02:00
|
|
|
# Shuffle the data rows only.
# `tail -n +2` starts output at line 2, i.e. it skips the header line that
# is kept separately in header.csv. The earlier `head -n -1` did the
# opposite: it dropped the last data row and left the header in, so the
# header was shuffled into the data.
tail -n +2 avocado-updated-2020.csv | shuf > avocado-updated-2020-shuf.csv
|
2021-03-28 12:23:39 +02:00
|
|
|
|
2021-03-28 16:25:25 +02:00
|
|
|
# Keep only the first $1 rows of the shuffled data, where $1 is the
# script's first positional argument.
# NOTE(review): $1 is not validated - if it is missing or non-numeric, head
# fails and avocado-2020.csv is left empty; confirm callers always pass it.
head -n "$1" avocado-updated-2020-shuf.csv > avocado-2020.csv
|
2021-03-28 16:55:29 +02:00
|
|
|
# Number of rows that survived truncation; the split sizes are derived
# from this value. wc reads stdin so that it prints the bare count.
TRUNCATED_SIZE=$(wc -l < avocado-2020.csv)
|
2021-03-28 17:11:30 +02:00
|
|
|
# Boundaries of the 6/2/2 train/dev/test split.
#
# compute_split_sizes TOTAL
#   Sets three global integers from TOTAL (the number of data rows):
#     test_size  - rows in the test split: TOTAL / 5, rounded down
#     dev_size   - rows in test + dev together: 2 * test_size
#     train_size - 1-based line number where the train split starts:
#                  dev_size + 1 (used with `tail -n +N`, so it is an
#                  offset, not a row count)
#
# Shell arithmetic is integer-only, and "," is the comma operator, so the
# earlier `0,2*$TRUNCATED_SIZE` evaluated to 2*N instead of 0.2*N. The
# 20% share is therefore computed as an integer division by 5.
compute_split_sizes() {
  local -i total=$1
  test_size=$(( total / 5 ))
  dev_size=$(( 2 * test_size ))
  train_size=$(( dev_size + 1 ))
}

declare -i test_size dev_size train_size
compute_split_sizes "$TRUNCATED_SIZE"
|
2021-03-28 16:22:31 +02:00
|
|
|
|
2021-03-28 16:39:22 +02:00
|
|
|
# Split into train/dev/test in a 6/2/2 ratio
|
2021-03-28 17:11:30 +02:00
|
|
|
# Carve the three pieces out of avocado-2020.csv:
#   test  - the first $test_size lines
#   dev   - the last $test_size lines of the first $dev_size lines
#   train - everything from line $train_size to the end
# Variable names are case-sensitive: the dev step previously used
# $TEST_SIZE, which is never set, so tail received no line count.
# Expansions are quoted so an empty value fails loudly instead of shifting
# the file name into the option's argument slot.
head -n "$test_size" avocado-2020.csv > avocado-updated-2020-test.csv
head -n "$dev_size" avocado-2020.csv | tail -n "$test_size" > avocado-updated-2020-dev.csv
tail -n +"$train_size" avocado-2020.csv > avocado-updated-2020-train.csv
|
2021-03-28 12:23:39 +02:00
|
|
|
|
|
|
|
# Report line counts for every file matching avocado-updated-2020-*.csv.
# At this point the glob covers the test/dev/train pieces and also the
# shuffled intermediate avocado-updated-2020-shuf.csv.
wc -l avocado-updated-2020-*.csv
|
|
|
|
|
|
|
|
# Put the saved header row in front of each headerless piece, producing
# the final test.csv, dev.csv and train.csv (in that order).
for part in test dev train; do
  cat header.csv "avocado-updated-2020-${part}.csv" > "${part}.csv"
done
|
|
|
|
|
|
|
|
# Remove the temporary helper files
|
|
|
|
# Clean up: the three headerless split pieces, the shuffled copy and the
# downloaded archive are no longer needed.
rm \
  avocado-updated-2020-test.csv \
  avocado-updated-2020-dev.csv \
  avocado-updated-2020-train.csv \
  avocado-updated-2020-shuf.csv \
  avocado-prices-2020.zip
|
2021-03-28 12:35:37 +02:00
|
|
|
# The header has already been copied into test.csv, dev.csv and train.csv,
# so the standalone copy can go.
rm -- header.csv
|