#!/usr/bin/env bash
#
# Task 5: download the Kaggle "avocado-prices-2020" dataset, drop the redundant
# `year` column, take a random sample of data rows and split that sample into
# test/dev/train parts in a 2/2/6 ratio.
#
# Usage:
#   <script> ROW_COUNT
#
# Arguments:
#   ROW_COUNT  number of shuffled data rows to keep (passed to `head -n`).
#
# Files written to the current directory:
#   avocado-2020.csv            sampled data rows (no header line)
#   test.csv, dev.csv, train.csv  the splits, each starting with the header line
#   avocado-updated-2020.csv    the unpacked raw dataset (left untouched)
#
# Requires: a configured `kaggle` CLI, `unzip`, `mktemp` and GNU coreutils
# (`cut --complement`, `shuf`).

set -euo pipefail

readonly ARCHIVE='avocado-prices-2020.zip'
readonly RAW_CSV='avocado-updated-2020.csv'
readonly SAMPLE_CSV='avocado-2020.csv'

if (( $# < 1 )); then
  printf 'Usage: %s ROW_COUNT\n' "${0##*/}" >&2
  exit 2
fi
readonly sample_rows=$1

# Intermediate files live in a private directory that is removed on every exit
# path, so a failed run leaves no helper files behind.
work_dir=$(mktemp -d)
readonly work_dir
cleanup() { rm -rf -- "${work_dir:?}"; }
trap cleanup EXIT

readonly clean_csv="${work_dir}/clean.csv"
readonly header_csv="${work_dir}/header.csv"
readonly shuffled_csv="${work_dir}/shuffled.csv"

# Download and unpack the dataset.
kaggle datasets download -d timmate/avocado-prices-2020
unzip -o "$ARCHIVE"

# Diagnostic: list empty lines (with line numbers), if any. grep exits with 1
# when nothing matches, which is the expected case and must not abort the run;
# any other non-zero status is a real error.
grep -n '^$' "$RAW_CSV" || [[ $? -eq 1 ]]

# Remove the redundant `year` column (field 12). cut writes to stdout, so the
# result has to be captured in a file to have any effect on later steps.
cut -d ',' -f12 --complement "$RAW_CSV" > "$clean_csv"

# Separate the header (first line) from the data rows before shuffling, so the
# header can never end up among the data rows.
head -n 1 "$clean_csv" > "$header_csv"
tail -n +2 "$clean_csv" | shuf > "$shuffled_csv"

# Keep only the requested number of shuffled rows. Reading from a file (not a
# pipe from shuf) avoids a SIGPIPE failure under `pipefail`.
head -n "$sample_rows" "$shuffled_csv" > "$SAMPLE_CSV"

# Split boundaries: test = first 20 %, dev = rows up to the 40 % mark,
# train = everything after it. dev_size is derived from the boundaries so that
# integer rounding cannot drop a row between test and dev.
sample_size=$(wc -l < "$SAMPLE_CSV")
test_size=$(( sample_size * 2 / 10 ))
dev_end=$(( sample_size * 4 / 10 ))
dev_size=$(( dev_end - test_size ))
train_start=$(( dev_end + 1 ))
train_size=$(( sample_size - dev_end ))

# Write each split with the header line prepended.
{
  cat "$header_csv"
  head -n "$test_size" "$SAMPLE_CSV"
} > test.csv

{
  cat "$header_csv"
  head -n "$dev_end" "$SAMPLE_CSV" | tail -n "$dev_size"
} > dev.csv

{
  cat "$header_csv"
  tail -n "+${train_start}" "$SAMPLE_CSV"
} > train.csv

# Report the number of data rows (header excluded) in each split.
printf '%8d test\n%8d dev\n%8d train\n%8d total\n' \
  "$test_size" "$dev_size" "$train_size" "$sample_size"

# Remove the downloaded archive; the temporary directory is removed by the
# EXIT trap.
rm -f -- "$ARCHIVE"