ium_434742/avocado-preprocessing.sh
patrycjalazna 473ddf5712 cutoff
2021-03-28 17:47:32 +02:00

41 lines
1.4 KiB
Bash
Executable File

#!/usr/bin/env bash
#
# Task 5: download the Kaggle "avocado-prices-2020" dataset, drop the
# redundant `year` column, shuffle the data rows, optionally truncate them,
# and split the result into train/dev/test sets in a 6/2/2 ratio.
#
# Usage: avocado-preprocessing.sh [CUTOFF]
#   CUTOFF  number of shuffled data rows to keep (non-negative integer).
#           When omitted, every data row is kept.
#
# Files left in the current directory:
#   avocado-updated-2020.csv   the unpacked dataset, untouched
#   avocado-2020.csv           shuffled (and truncated) rows, without header
#   train.csv dev.csv test.csv the splits, each starting with the header row
#
# Requires: kaggle CLI, unzip, GNU coreutils (cut --complement, shuf).

set -euo pipefail

readonly DATASET='timmate/avocado-prices-2020'
readonly ARCHIVE='avocado-prices-2020.zip'
readonly RAW_CSV='avocado-updated-2020.csv'
readonly SAMPLE_CSV='avocado-2020.csv'

# Intermediate files; removed on every exit path by the EXIT trap below.
readonly CLEAN_CSV='avocado-updated-2020-clean.csv'
readonly SHUF_CSV='avocado-updated-2020-shuf.csv'
readonly TEST_BODY='avocado-updated-2020-test.csv'
readonly DEV_BODY='avocado-updated-2020-dev.csv'
readonly TRAIN_BODY='avocado-updated-2020-train.csv'
readonly HEADER_CSV='header.csv'

# Remove helper files and the downloaded archive.
cleanup() {
  rm -f -- "$TEST_BODY" "$DEV_BODY" "$TRAIN_BODY" "$SHUF_CSV" \
    "$CLEAN_CSV" "$HEADER_CSV" "$ARCHIVE"
}
trap cleanup EXIT

# Validate the optional cutoff before doing any network work.
cutoff=${1:-}
if [[ -n "$cutoff" && ! "$cutoff" =~ ^[0-9]+$ ]]; then
  printf 'Usage: %s [CUTOFF]\n' "${0##*/}" >&2
  printf 'CUTOFF must be a non-negative integer, got: %s\n' "$cutoff" >&2
  exit 2
fi

# Download and unpack the dataset.
kaggle datasets download -d "$DATASET"
unzip -o "$ARCHIVE"

# Report empty lines (with their line numbers), if any. grep exits with 1
# when nothing matches, which is not an error here; real failures (>1) abort.
grep -n '^$' "$RAW_CSV" || [[ $? -eq 1 ]]

# Drop the redundant `year` column (field 12). The result has to be written
# to a file: cut does not edit its input in place.
cut -d ',' -f12 --complement "$RAW_CSV" > "$CLEAN_CSV"

# Separate the header from the data so that only data rows are shuffled.
# `tail -n +2` skips the first line; `head -n -1` would drop the last row.
head -n 1 "$CLEAN_CSV" > "$HEADER_CSV"
tail -n +2 "$CLEAN_CSV" | shuf > "$SHUF_CSV"

# Keep only the first CUTOFF shuffled rows (all of them when no cutoff given).
if [[ -n "$cutoff" ]]; then
  head -n "$cutoff" "$SHUF_CSV" > "$SAMPLE_CSV"
else
  cp -- "$SHUF_CSV" "$SAMPLE_CSV"
fi

# Split boundaries for train/dev/test = 6/2/2:
#   test  = rows 1 .. test_size
#   dev   = rows test_size+1 .. dev_end
#   train = rows dev_end+1 .. end
sample_size=$(wc -l < "$SAMPLE_CSV")
test_size=$(( sample_size * 2 / 10 ))
dev_end=$(( sample_size * 4 / 10 ))

head -n "$test_size" "$SAMPLE_CSV" > "$TEST_BODY"
# Select dev by its start offset rather than by `tail -n test_size`, so no row
# falls between test and dev when integer division rounds the two boundaries
# differently.
head -n "$dev_end" "$SAMPLE_CSV" | tail -n "+$(( test_size + 1 ))" > "$DEV_BODY"
tail -n "+$(( dev_end + 1 ))" "$SAMPLE_CSV" > "$TRAIN_BODY"

# Show the size of each split (data rows only, header not included).
wc -l "$DEV_BODY" "$TEST_BODY" "$TRAIN_BODY"

# Prepend the header to every split.
cat "$HEADER_CSV" "$TEST_BODY" > test.csv
cat "$HEADER_CSV" "$DEV_BODY" > dev.csv
cat "$HEADER_CSV" "$TRAIN_BODY" > train.csv