#!/usr/bin/env bash

# ratios as percentages
train_percent=80
dev_percent=10
test_percent=10
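
# Optional sanity check, assuming the three ratios are meant to partition the
# dataset exactly: fail early if they do not add up to 100.
if [ $((train_percent + dev_percent + test_percent)) -ne 100 ]; then
    echo "train/dev/test percentages must sum to 100" >&2
    exit 1
fi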

# split_dataset <csv-file>
# Shuffles the data rows of <csv-file> (keeping its header line) and writes
# the shuffled rows out as <csv-file>.train, <csv-file>.dev and <csv-file>.test.
split_dataset() {
    # shuffle the dataset, excluding the first line which describes the columns
    shuffled_file="$1.shuf"
    header=$(head -n 1 "$1")
    tail -n +2 "$1" | shuf > "$shuffled_file"

    # get the line count so we can split the dataset
    lines=$(wc -l < "$shuffled_file")

    get_line_count_for() {
        echo $(($lines * $1 / 100))
    }

    train_lines=$( get_line_count_for $train_percent )
    dev_lines=$( get_line_count_for $dev_percent )
    test_lines=$( get_line_count_for $test_percent )
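    # For illustration: with 1000 data rows and the 80/10/10 split above this
    # gives 800/100/100. Integer division truncates, so a few leftover rows are
    # possible; they end up in the train split, which takes everything after dev.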

    # the first line of each split file has to be the header
    init_file() {
        echo "$header" > "$1"
    }

    init_file "$1.test"
    init_file "$1.dev"
    init_file "$1.train"

    # test: the first $test_lines rows, dev: the next $dev_lines rows,
    # train: everything that is left
    head -n $test_lines "$shuffled_file" >> "$1.test"
    head -n $(($test_lines + $dev_lines)) "$shuffled_file" | tail -n $dev_lines >> "$1.dev"
    tail -n +$(($test_lines + $dev_lines + 1)) "$shuffled_file" >> "$1.train"

    rm "$shuffled_file"
}
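
# usage sketch (hypothetical file name):
#   split_dataset readings.csv   # -> readings.csv.train, readings.csv.dev, readings.csv.test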

mkdir -p data
cd data || exit 1

echo "Downloading dataset from Kaggle..."
kaggle datasets download -d anikannal/solar-power-generation-data
echo "Done."
echo "Unzipping archive"
files=$(unzip -o solar-power-generation-data.zip | tail -n +2 | cut -d ' ' -f 4)
echo "Done."
echo "Splitting datasets"
echo "Using ratio (train:dev:test) $train_percent:$dev_percent:$test_percent"
for file in $files; do
    split_dataset "$file"
done
echo "Done! Files are inside ./data folder"