Script can now create 8:1:1 (train/test/validation) dataset splits

This commit is contained in:
Adrian Charkiewicz 2022-03-24 18:07:04 +01:00
parent 5b20395ba3
commit 6de4db7a83
6 changed files with 17907 additions and 3 deletions

1586
TEST_winequality-red.csv Normal file

File diff suppressed because it is too large Load Diff

12968
TRAIN_winequality-red.csv Normal file

File diff suppressed because it is too large Load Diff

1696
VAL_winequality-red.csv Normal file

File diff suppressed because it is too large Load Diff

2
data.txt Normal file
View File

@ -0,0 +1,2 @@
Archive: wine.zip
inflating: winequality-red.csv

View File

@ -1,4 +1,56 @@
#!/bin/sh #!/bin/sh
kaggle datasets download -d uciml/red-wine-quality-cortez-et-al-2009 #kaggle datasets download -d uciml/red-wine-quality-cortez-et-al-2009
unzip -o red-wine-quality-cortez-et-al-2009.zip > data.txt #unzip -o red-wine-quality-cortez-et-al-2009.zip > data.txt
head data.txt
# Split $FILE into train / test / validation CSV files, stratified by the
# value of the first comma-separated column.
#
# Configuration (globals):
#   FILE               input CSV; its first line is assumed to be a header
#   TRAIN_SET_PERCENT  share of each group written to $TRAIN_DATA
#   TEST_SET_PERCENT   share of each group written to $TEST_DATA
#   VAL_SET_PERCENT    remainder, written to $VAL_DATA
#
# Per-group sizes use truncating integer division, so rows that do not fill
# a whole train/test slot fall into the validation set (a group with a
# single row goes entirely to validation).
FILE=winequality-red.csv
TRAIN_SET_PERCENT=80
TEST_SET_PERCENT=10
VAL_SET_PERCENT=$(( 100 - TRAIN_SET_PERCENT - TEST_SET_PERCENT ))
TRAIN_DATA=TRAIN_$FILE
TEST_DATA=TEST_$FILE
VAL_DATA=VAL_$FILE

# Print lines $1..$2 (1-based, inclusive) of stdin; nothing when $2 < $1.
# Used instead of `head -N | tail -N`, whose zero-count forms are not
# portable across head/tail implementations.
_slice_rows() {
  awk -v from="$1" -v to="$2" 'NR >= from && NR <= to'
}

# Build $TRAIN_DATA, $TEST_DATA and $VAL_DATA from $FILE.
# Outputs: one progress paragraph per group on stdout; errors on stderr.
# Returns: 0 on success, 1 when $FILE is missing, unreadable or empty.
split_dataset() {
  if [ ! -r "$FILE" ] || [ ! -s "$FILE" ]; then
    printf 'error: input file missing, unreadable or empty: %s\n' "$FILE" >&2
    return 1
  fi

  # Start every output with the header so each split is a valid CSV on its
  # own; the header itself is never treated as a data row.
  header=$(head -n 1 "$FILE")
  for out in "$TRAIN_DATA" "$TEST_DATA" "$VAL_DATA"; do
    printf '%s\n' "$header" > "$out"
  done

  # Distinct first-column values of the data rows, one per line. Reading
  # them with `read -r` keeps values containing spaces intact; LC_ALL=C
  # makes the group order reproducible across locales.
  tail -n +2 "$FILE" | cut -d',' -f1 | LC_ALL=C sort -u |
  while IFS= read -r state; do
    # Select rows whose FIRST field equals the key exactly. Appending ""
    # forces a string comparison (awk would otherwise compare numeric-looking
    # values numerically, so "7.40" would equal "7.4"). A plain
    # `grep "$state"` would match the key as a regex anywhere in the line
    # and put the same row into several groups.
    STATE_DATA=$(awk -F',' -v key="$state" \
      'NR > 1 && ($1 "") == (key "")' "$FILE")
    NUM_STATE_DATA=$(( $(printf '%s\n' "$STATE_DATA" | wc -l) ))
    printf '%s: %s\n' "$state" "$NUM_STATE_DATA"

    TRAIN_NUM_DATA=$(( NUM_STATE_DATA * TRAIN_SET_PERCENT / 100 ))
    TEST_NUM_DATA=$(( NUM_STATE_DATA * TEST_SET_PERCENT / 100 ))
    VAL_NUM_DATA=$(( NUM_STATE_DATA - TRAIN_NUM_DATA - TEST_NUM_DATA ))

    # Train set: the first TRAIN_NUM_DATA rows of the group.
    from=1
    to=$(( from + TRAIN_NUM_DATA - 1 ))
    printf 'Train set: %s%% %s from=%s to=%s\n' \
      "$TRAIN_SET_PERCENT" "$TRAIN_NUM_DATA" "$from" "$to"
    printf '%s\n' "$STATE_DATA" | _slice_rows "$from" "$to" >> "$TRAIN_DATA"

    # Test set: the next TEST_NUM_DATA rows.
    from=$(( to + 1 ))
    to=$(( from + TEST_NUM_DATA - 1 ))
    printf 'Test set: %s%% %s from=%s to=%s\n' \
      "$TEST_SET_PERCENT" "$TEST_NUM_DATA" "$from" "$to"
    printf '%s\n' "$STATE_DATA" | _slice_rows "$from" "$to" >> "$TEST_DATA"

    # Validate set: everything that is left.
    from=$(( to + 1 ))
    to=$NUM_STATE_DATA
    printf 'Validate set: %s%% %s from=%s to=%s\n' \
      "$VAL_SET_PERCENT" "$VAL_NUM_DATA" "$from" "$to"
    printf '%s\n' "$STATE_DATA" | _slice_rows "$from" "$to" >> "$VAL_DATA"

    echo
  done
}

split_dataset

1600
winequality-red.csv Normal file

File diff suppressed because it is too large Load Diff