script can now create 8_1_1 datasets
This commit is contained in:
parent
5b20395ba3
commit
6de4db7a83
1586
TEST_winequality-red.csv
Normal file
1586
TEST_winequality-red.csv
Normal file
File diff suppressed because it is too large
Load Diff
12968
TRAIN_winequality-red.csv
Normal file
12968
TRAIN_winequality-red.csv
Normal file
File diff suppressed because it is too large
Load Diff
1696
VAL_winequality-red.csv
Normal file
1696
VAL_winequality-red.csv
Normal file
File diff suppressed because it is too large
Load Diff
58
script.sh
58
script.sh
@ -1,4 +1,56 @@
|
|||||||
#!/bin/sh
|
#!/bin/sh
|
||||||
kaggle datasets download -d uciml/red-wine-quality-cortez-et-al-2009
|
#kaggle datasets download -d uciml/red-wine-quality-cortez-et-al-2009
|
||||||
unzip -o red-wine-quality-cortez-et-al-2009.zip > data.txt
|
#unzip -o red-wine-quality-cortez-et-al-2009.zip > data.txt
|
||||||
head data.txt
|
|
||||||
|
|
||||||
|
# Input CSV and split ratios, in percent of each class. The validation share
# is whatever remains after the train and test shares.
FILE=winequality-red.csv

TRAIN_SET_PERCENT=80
TEST_SET_PERCENT=10
VAL_SET_PERCENT=$(( 100 - TRAIN_SET_PERCENT - TEST_SET_PERCENT ))

# Output files; split_dataset truncates them before writing.
TRAIN_DATA=TRAIN_$FILE
TEST_DATA=TEST_$FILE
VAL_DATA=VAL_$FILE

# Print lines $1..$2 (1-based, inclusive) of stdin; prints nothing if $2 < $1.
_line_range() {
    awk -v from="$1" -v to="$2" 'NR >= from && NR <= to'
}

#######################################
# Split a CSV file into train/test/validation files, class by class.
# The class of a row is the exact text of its first comma-separated field.
# Within each class the first TRAIN_SET_PERCENT% of rows (in file order) go
# to the train file, the next TEST_SET_PERCENT% to the test file and the
# remaining rows to the validation file, so every row is written exactly once.
# Globals:   TRAIN_SET_PERCENT, TEST_SET_PERCENT, VAL_SET_PERCENT (read)
#            TRAIN_DATA, TEST_DATA, VAL_DATA (read; the named files are
#            truncated and rewritten)
# Arguments: $1 - path of the CSV file to split
# Outputs:   a per-class summary on stdout, errors on stderr
# Returns:   non-zero if the input file cannot be read
#######################################
split_dataset() {
    input=$1

    # Check before truncating so a missing input does not wipe earlier output.
    if [ ! -r "$input" ]; then
        printf 'error: cannot read input file: %s\n' "$input" >&2
        return 1
    fi

    : > "$TRAIN_DATA"
    : > "$TEST_DATA"
    : > "$VAL_DATA"

    # NOTE(review): every line is treated as data. If the CSV starts with a
    # header line, that line forms its own one-row class and ends up in the
    # validation file - confirm whether it should be skipped instead.
    # NOTE(review): classes are keyed on the first column; confirm that this
    # is the column the split is meant to be stratified on.
    awk -F',' '{ print $1 }' "$input" | sort | uniq |
    while IFS= read -r state; do
        [ -n "$state" ] || continue

        # Exact match on field 1 only. Appending "" forces a string
        # comparison, so "7.4" and "7.40" stay distinct classes, and the key
        # is never interpreted as a regular expression.
        STATE_DATA=$(STATE_KEY=$state awk -F',' \
            '($1 "") == (ENVIRON["STATE_KEY"] "")' "$input")
        NUM_STATE_DATA=$(printf '%s\n' "$STATE_DATA" | awk 'END { print NR }')
        echo "$state: $NUM_STATE_DATA"

        # Integer division rounds down; the validation set absorbs the rest.
        TRAIN_NUM_DATA=$(( NUM_STATE_DATA * TRAIN_SET_PERCENT / 100 ))
        TEST_NUM_DATA=$(( NUM_STATE_DATA * TEST_SET_PERCENT / 100 ))
        VAL_NUM_DATA=$(( NUM_STATE_DATA - TRAIN_NUM_DATA - TEST_NUM_DATA ))

        # Train set
        num=$TRAIN_NUM_DATA; from=1; to=$(( from + num - 1 ))
        printf 'Train set: %s%% %s from=%s to=%s\n' \
            "$TRAIN_SET_PERCENT" "$num" "$from" "$to"
        printf '%s\n' "$STATE_DATA" | _line_range "$from" "$to" >> "$TRAIN_DATA"

        # Test set
        num=$TEST_NUM_DATA; from=$(( to + 1 )); to=$(( from + num - 1 ))
        printf 'Test set: %s%% %s from=%s to=%s\n' \
            "$TEST_SET_PERCENT" "$num" "$from" "$to"
        printf '%s\n' "$STATE_DATA" | _line_range "$from" "$to" >> "$TEST_DATA"

        # Validate set
        num=$VAL_NUM_DATA; from=$(( to + 1 )); to=$NUM_STATE_DATA
        printf 'Validate set: %s%% %s from=%s to=%s\n' \
            "$VAL_SET_PERCENT" "$num" "$from" "$to"
        printf '%s\n' "$STATE_DATA" | _line_range "$from" "$to" >> "$VAL_DATA"

        echo
    done
}

split_dataset "$FILE"
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
1600
winequality-red.csv
Normal file
1600
winequality-red.csv
Normal file
File diff suppressed because it is too large
Load Diff
Loading…
Reference in New Issue
Block a user