2021-03-29 01:58:55 +02:00
|
|
|
# Download the raw WHO suicide statistics CSV into the current directory.
# --fail makes curl exit non-zero on an HTTP error instead of saving the
# server's error page under the CSV's name; abort so that no later step
# processes a bogus file.
curl --fail -OL https://git.wmi.amu.edu.pl/s434784/ium_434784/raw/branch/master/who_suicide_statistics.csv || exit 1
|
2021-03-29 00:57:26 +02:00
|
|
|
|
2021-03-29 01:55:42 +02:00
|
|
|
# Split off the column names: write the CSV header row (line 1) to names.csv.
sed 1q who_suicide_statistics.csv > names.csv
|
|
|
|
|
|
|
|
# Shuffle the data set: skip the header row, then write the remaining rows
# in random order to data.shuf.
tail -n +2 who_suicide_statistics.csv | shuf > data.shuf
|
2021-03-29 02:30:55 +02:00
|
|
|
# Row count and cutoff.
#
# The script's first argument ($1) is the number of rows to drop from the
# shuffled data before it is split; it defaults to 0 (keep every row).

# rows_after_cutoff TOTAL CUTOFF
# Prints TOTAL - CUTOFF, the number of rows left after dropping CUTOFF rows.
# TOTAL must be a plain integer. Returns 1 with a message on stderr when
# CUTOFF is not a plain non-negative integer or is larger than TOTAL.
rows_after_cutoff() {
  case "$2" in
    # Empty, contains a non-digit (including a sign), or has a leading zero
    # (which arithmetic expansion would read as octal).
    '' | *[!0-9]* | 0?*)
      echo "cutoff must be a non-negative integer, got '$2'" >&2
      return 1
      ;;
  esac
  if [ "$2" -gt "$1" ]; then
    echo "cutoff $2 exceeds the $1 available rows" >&2
    return 1
  fi
  echo "$(( $1 - $2 ))"
}

# Number of rows. The "+ 0" turns wc's output into a plain integer: it drops
# any padding around the count and yields 0 when data.shuf cannot be read.
NUMROWS=$(( $(wc -l < data.shuf) + 0 ))
echo "Initial number of rows ${NUMROWS}"

CUTOFF=${1:-0}

# NOTE: despite its name, DELETE is the number of rows that are KEPT
# (NUMROWS - CUTOFF). Validating first matters because a negative count
# would not empty the output: GNU tail treats "-n -N" like "-n N".
DELETE=$(rows_after_cutoff "$NUMROWS" "$CUTOFF") || exit 1

echo "Cutting ${CUTOFF} rows"

# Keep the last DELETE rows of the shuffled data.
tail -n "$DELETE" data.shuf > data.shuf.cut
|
2021-03-29 01:55:42 +02:00
|
|
|
|
|
|
|
# Split into 3 subsets.
#
# Proportions train:dev:test = 6:2:2. data.shuf.cut is carved up front to
# back:
#   rows 1 .. TEST        -> data.test
#   rows TEST+1 .. DEV    -> data.dev
#   the last TRAIN rows   -> data.train

# compute_split_sizes TOTAL
# Sets three globals from the total row count (an empty TOTAL counts as 0):
#   TEST  - number of rows in the test set (the dev set gets the same number)
#   DEV   - row offset at which the dev set ends, i.e. test rows + dev rows;
#           it is NOT the size of the dev set
#   TRAIN - number of rows in the train set (everything after offset DEV)
compute_split_sizes() {
  # Multiply before dividing: TOTAL / 10 * 2 truncates first and pushes up to
  # 9 rows' worth of rounding into the train set (15 rows -> 13/1/1 instead
  # of 9/3/3).
  TEST=$(( ${1:-0} * 2 / 10 ))
  DEV=$(( TEST * 2 ))
  TRAIN=$(( ${1:-0} - DEV ))
}

# Recount after the cutoff. The "+ 0" turns wc's output into a plain integer
# (0 when the file cannot be read).
NUMROWS=$(( $(wc -l < data.shuf.cut) + 0 ))
compute_split_sizes "$NUMROWS"

head -n "$TEST" data.shuf.cut > data.test
# First DEV rows, then only the last TEST of them: rows TEST+1 .. DEV.
head -n "$DEV" data.shuf.cut | tail -n "$TEST" > data.dev
tail -n "$TRAIN" data.shuf.cut > data.train

# The intermediate cut file is not needed once the subsets are written.
rm data.shuf.cut
|
2021-03-29 01:55:42 +02:00
|
|
|
|
|
|
|
# Print a summary of the subset sizes, one per line. The dev set is cut to
# the same length as the test set (tail -n $TEST), which is why its line
# also reports TEST. NUMROWS is the total row count.
printf '%s\n' \
  "Test rows ${TEST}" \
  "Dev rows ${TEST}" \
  "Train rows ${TRAIN}" \
  "All number of rows ${NUMROWS}"
|
2021-03-29 02:30:55 +02:00
|
|
|
|