ium_444409/download_dataset.sh

# Dataset split ratios (train : dev : test), given as whole-number percentages.
# split_dataset reads them, and they are echoed before the split starts.
test_percent=10
dev_percent=10
train_percent=80

# Shuffle the data rows of a header-first text file and split them into
# test / dev / train files.
#
# Globals:
#   test_percent (read) - integer percentage of data rows written to "$1.test"
#   dev_percent  (read) - integer percentage of data rows written to "$1.dev"
# Arguments:
#   $1 - path of the file to split; its first line is treated as the header
# Outputs:
#   Creates (or overwrites) "$1.test", "$1.dev" and "$1.train". Each one starts
#   with the header line. "$1.train" receives every row that is not in the
#   test or dev slice, so integer rounding never drops a row.
#   "$1.shuf" is used as scratch space and removed at the end.
# Returns:
#   Non-zero if the header cannot be read or the rows cannot be shuffled or
#   counted; otherwise the exit status of the final cleanup.
split_dataset() {
    local src=$1
    local shuffled_file=$src.shuf
    local header lines test_lines dev_lines out

    # shuffling the dataset, excluding the first line which describes the columns
    header=$(head -n 1 -- "$src") || return
    tail -n +2 -- "$src" | shuf > "$shuffled_file" || return

    # Reading from stdin keeps the file name (and any padding-dependent field
    # position) out of wc's output, so the result is just the number.
    lines=$(wc -l < "$shuffled_file") || return

    test_lines=$(( lines * test_percent / 100 ))
    dev_lines=$(( lines * dev_percent / 100 ))

    # first line has to be the header; printf with a quoted argument writes it
    # verbatim (no whitespace collapsing, no glob expansion)
    for out in "$src.test" "$src.dev" "$src.train"; do
        printf '%s\n' "$header" > "$out"
    done

    # Rows [1, test_lines] -> test
    head -n "$test_lines" -- "$shuffled_file" >> "$src.test"
    # Rows (test_lines, test_lines + dev_lines] -> dev. The tail count must be
    # dev_lines, otherwise dev overlaps test or is truncated whenever the two
    # percentages differ.
    head -n "$(( test_lines + dev_lines ))" -- "$shuffled_file" \
        | tail -n "$dev_lines" >> "$src.dev"
    # Everything after the test and dev slices -> train
    tail -n "+$(( test_lines + dev_lines + 1 ))" -- "$shuffled_file" >> "$src.train"

    rm -- "$shuffled_file"
}

# Driver: download the Kaggle archive into ./data, unpack it, and split every
# extracted file with split_dataset.

# -p keeps a rerun from failing when ./data already exists; the explicit check
# on cd stops the download and split from running in the wrong directory.
mkdir -p data
cd data || { echo "Cannot enter ./data" >&2; exit 1; }

archive=solar-power-generation-data.zip

echo "Downloading dataset from Kaggle..."
kaggle datasets download -d anikannal/solar-power-generation-data \
    || { echo "Kaggle download failed" >&2; exit 1; }
echo "Done."

echo "Unzipping archive"
unzip -o -q "$archive" || { echo "Failed to unzip $archive" >&2; exit 1; }
echo "Done."

echo "Splitting datasets"
echo "Using ratio (train:dev:test) $train_percent:$dev_percent:$test_percent"
# zipinfo mode (-Z1) prints exactly one member name per line, so the list does
# not depend on the layout of unzip's progress messages and names containing
# spaces stay intact.
unzip -Z1 "$archive" | while IFS= read -r file; do
    case $file in
        */) continue ;;   # directory entry, nothing to split
    esac
    split_dataset "$file"
done
echo "Done! Files are inside ./data folder"
Added dataset download script 2022-03-20 17:37:26 +01:00			`# ratios as percentages`
			`train_percent=80`
			`dev_percent=10`
			`test_percent=10`

			`split_dataset() {`
			`# shuffling the dataset, excluding the first line which describes the columns`
			`shuffled_file=$1.shuf`
			`header=$(head -n 1 $1)`
			`tail -n +2 $1 \| shuf > $shuffled_file`

			`# getting line count so we can split the datasets:`


			`lines=$( wc -l $shuffled_file \| cut -d ' ' -f 1 )`


			`get_line_count_for() {`
			`echo $(($lines * $1 / 100))`
			`}`

			`train_lines=$( get_line_count_for $train_percent )`
			`dev_lines=$( get_line_count_for $dev_percent )`
			`test_lines=$( get_line_count_for $test_percent )`

			`# first line has to be the header`
			`init_file() {`
			`echo $header > $1`
			`}`

			`init_file $1.test`
			`init_file $1.dev`
			`init_file $1.train`

			`head -n $test_lines $shuffled_file >> $1.test`
			`head -n $(($test_lines + $dev_lines)) $shuffled_file \| tail -n $test_lines >> $1.dev`
			`tail -n +$(($test_lines + $dev_lines + 1)) $shuffled_file >> $1.train`
			`rm $shuffled_file`
			`}`

			`mkdir data`
			`cd data`

			`echo "Downloading dataset from Kaggle..."`
			`kaggle datasets download -d anikannal/solar-power-generation-data`
			`echo "Done."`

			`echo "Unzipping archive"`
Save artifacts 2022-03-27 23:25:26 +02:00			`files=$(unzip -o solar-power-generation-data.zip \| tail -n +2 \| cut -d ' ' -f 4)`
Added dataset download script 2022-03-20 17:37:26 +01:00			`echo "Done."`

			`echo "Splitting datasets"`
			`echo "Using ratio (train:dev:test) $train_percent:$dev_percent:$test_percent"`
			`for file in $files; do`
			`split_dataset $file`
			`done`
			`echo "Done! Files are inside ./data folder"`