ium_434742/avocado-preprocessing.sh

35 lines
1.1 KiB
Bash
Raw Normal View History

2021-03-28 12:23:39 +02:00
# Task 5
# Fetch the dataset archive from Kaggle, unpack it (overwriting any files
# that already exist) and print the line count of the extracted CSV.
dataset_slug='timmate/avocado-prices-2020'
dataset_archive="${dataset_slug##*/}.zip"
kaggle datasets download -d "${dataset_slug}"
unzip -o "${dataset_archive}"
wc -l avocado-updated-2020.csv
2021-03-28 16:10:32 +02:00
# Debug marker: print the script's first positional argument followed by an
# easy-to-spot tag. The argument must be expanded outside single quotes;
# inside single quotes the text `$1` is printed literally.
# `${1-}` expands to an empty string when no argument was passed.
printf '%s AAAAAAAAAAAAAAAAA\n' "${1-}"
2021-03-28 12:23:39 +02:00
# File processing.
# List empty lines (with their line numbers) found in the dataset, if any.
grep -n -P '^$' avocado-updated-2020.csv

#######################################
# Remove one comma-separated column from a CSV file, rewriting it in place.
# Arguments:
#   $1 - path of the CSV file to modify
#   $2 - 1-based index of the column to drop
# Returns:
#   non-zero if the temporary file cannot be created, `cut` fails, or the
#   result cannot be written back; the input is left untouched when `cut`
#   fails.
#######################################
drop_csv_column() {
  local file=$1 column=$2 tmp
  tmp=$(mktemp "${file}.XXXXXX") || return
  # The delimiter has to be passed explicitly: with a bare `-d`, cut consumes
  # the following `-f12` as the delimiter and aborts. The output is also
  # redirected so the column is really removed instead of the result only
  # being printed to stdout.
  if cut -d ',' -f "${column}" --complement -- "${file}" > "${tmp}"; then
    # Copy over the original (instead of mv) so the file keeps its own
    # permissions rather than inheriting the temp file's restrictive mode.
    cp -- "${tmp}" "${file}" || { rm -f -- "${tmp}"; return 1; }
    rm -f -- "${tmp}"
  else
    rm -f -- "${tmp}"
    return 1
  fi
}

# Remove the unnecessary `year` column (redundant data); it is column 12.
drop_csv_column avocado-updated-2020.csv 12
2021-03-28 15:22:47 +02:00
2021-03-28 15:24:06 +02:00
#######################################
# Save the CSV header line and write all remaining lines, shuffled, to a
# separate file.
# Arguments:
#   $1 - input CSV whose first line is the header
#   $2 - output file that receives only the header line
#   $3 - output file that receives every other line in random order
# Returns:
#   non-zero if the header cannot be extracted; otherwise the exit status of
#   the final `shuf`.
#######################################
split_header_and_shuffle() {
  local input=$1 header_out=$2 shuffled_out=$3
  head -n 1 -- "${input}" > "${header_out}" || return
  # `tail -n +2` starts output at the second line, i.e. it skips the header.
  # (`head -n -1` would instead drop the LAST line, leaving the header mixed
  # into the shuffled rows and losing one real data row.)
  tail -n +2 -- "${input}" | shuf > "${shuffled_out}"
}

split_header_and_shuffle avocado-updated-2020.csv header.csv \
  avocado-updated-2020-shuf.csv
2021-03-28 12:23:39 +02:00
# split into train/dev/test
2021-03-28 15:24:06 +02:00
# Carve the evaluation parts out of the shuffled rows: the first 6609 lines
# become the test part, and the last 6609 of the first 13218 lines become the
# dev part.
shuffled_rows=avocado-updated-2020-shuf.csv
head -n 13218 "${shuffled_rows}" \
  | tail -n 6609 > avocado-updated-2020-dev.csv
head -n 6609 "${shuffled_rows}" > avocado-updated-2020-test.csv
2021-03-28 12:23:39 +02:00
# Everything from line 13219 of the shuffled rows onward is the training part.
tail -n +13219 avocado-updated-2020-shuf.csv > avocado-updated-2020-train.csv
# Print the line count of each intermediate file.
wc -l avocado-updated-2020-*.csv
# Put the saved header in front of each part to produce the final files.
for part in test dev train; do
  cat header.csv "avocado-updated-2020-${part}.csv" > "${part}.csv"
done
# Remove the helper files.
for leftover in \
  avocado-updated-2020-test.csv \
  avocado-updated-2020-dev.csv \
  avocado-updated-2020-train.csv \
  avocado-updated-2020-shuf.csv \
  avocado-prices-2020.zip; do
  rm "${leftover}"
done
2021-03-28 12:35:37 +02:00
# Remove the temporary file holding the CSV header line.
rm -- header.csv