2018-05-28 15:10:31 +02:00
|
|
|
#!/usr/bin/env bash
|
|
|
|
|
|
|
|
#######################################
# Split a line-oriented file into a test set and a training set.
#
# The first fifth of the lines (integer division, but at least one line)
# goes to "<file>_test.txt"; every remaining line goes to
# "<file>_train.txt". The input file is removed once both outputs have
# been written. A separate dev split (formerly `mv part-01 "$1_dev.txt"`)
# is currently disabled.
#
# Arguments: $1 - path to the file to split
# Outputs:   "$1_test.txt" and "$1_train.txt"; diagnostics on stderr
# Returns:   0 on success, non-zero if the input is missing/empty or a
#            write fails (the input file is then left in place)
#######################################
split_data() {
  local src=${1-}
  local total chunk

  if [[ -z "$src" || ! -f "$src" ]]; then
    printf 'split_data: not a regular file: %s\n' "$src" >&2
    return 1
  fi

  # Read via stdin so wc prints only the number (no file name to strip).
  total=$(wc -l < "$src") || return
  if (( total == 0 )); then
    printf 'split_data: no lines to split in %s\n' "$src" >&2
    return 1
  fi

  # One fifth of the lines; clamp to 1 so inputs shorter than five lines
  # still yield a non-empty test set instead of an invalid chunk size.
  chunk=$(( total / 5 ))
  (( chunk > 0 )) || chunk=1

  # head/tail write the two sets directly, so no intermediate part-* files
  # are created and unrelated part-* files in the directory are untouched.
  head -n "$chunk" < "$src" > "${src}_test.txt" || return
  tail -n "+$(( chunk + 1 ))" < "$src" > "${src}_train.txt" || return

  rm -- "$src"
}
|
|
|
|
|
2018-06-18 14:56:41 +02:00
|
|
|
# grep '__label__yes' "$1" > yes.txt
|
|
|
|
# grep '__label__no' "$1" > no.txt
|
|
|
|
# split_data yes.txt
|
|
|
|
# split_data no.txt
|
|
|
|
# cat yes.txt_train.txt no.txt_train.txt | shuf > train.txt
|
|
|
|
# cat yes.txt_test.txt no.txt_test.txt | shuf > test.txt
|
|
|
|
# cat yes.txt_dev.txt no.txt_dev.txt | shuf > dev.txt
|
2018-05-28 15:10:31 +02:00
|
|
|
|
2018-06-18 14:56:41 +02:00
|
|
|
# diff <(sort "$1") <(cat train.txt test.txt dev.txt | sort)
|
|
|
|
# rm yes.txt* no.txt*
|
|
|
|
|
|
|
|
|
|
|
|
# ---
|
|
|
|
# Driver: keep only the lines of the input file ($1) that carry a
# __label__yes or __label__no tag, shuffle them, and split the result
# into train.txt and test.txt in the current directory.
if (( $# < 1 )); then
  printf 'usage: %s LABELLED_FILE\n' "${0##*/}" >&2
  exit 2
fi

# grep -E replaces the deprecated egrep alias; behaviour is identical.
grep -E '__label__(yes|no)' -- "$1" \
  | shuf --random-source=/dev/urandom > yesno.txt

# Stop here if the split failed, rather than moving files that do not exist.
split_data yesno.txt || exit 1

mv -- yesno.txt_train.txt train.txt
mv -- yesno.txt_test.txt test.txt
|
|
|
|
# mv yesno.txt_dev.txt dev.txt
|