ium_464914/get_dataset.sh
2024-04-13 18:55:49 +02:00

32 lines
925 B
Bash

#!/bin/bash
#
# Download the Kaggle forest-cover-type dataset, shuffle it, optionally cap
# the number of rows, and split the result into train/test/validation files.
#
# Usage: get_dataset.sh [MAX_ROWS]
#   MAX_ROWS  Non-negative integer: keep at most this many shuffled rows.
#             When omitted, every row is kept.
#
# Outputs (all moved into ./artifacts/):
#   covtype.csv, forest.csv, forest_train.csv, forest_test.csv,
#   forest_validation.csv
#
# Requires: kaggle, unzip, shuf, bc, awk.
#
# NOTE(review): every line of covtype.csv is shuffled, so a header row (if the
# file has one) ends up at a random position - confirm what consumers expect.
set -euo pipefail

### Variables ###
# Fraction of all rows that goes to the training set.
readonly train_ratio=0.8
# Fraction of the remaining rows that goes to the test set; the rest of the
# remainder becomes the validation set.
readonly test_val_ratio=0.5

# Print an error message to stderr and abort.
die() {
  printf 'error: %s\n' "$*" >&2
  exit 1
}

# Print the product of $1 and $2 rounded to the nearest integer.
# bc's default scale of 0 makes the final "/ 1" truncate, so adding 0.5
# beforehand turns the truncation into round-half-up.
round_product() {
  printf '(%s * %s + 0.5) / 1\n' "$1" "$2" | bc
}

main() {
  local max_rows=${1:-}
  if [[ -n "$max_rows" && ! "$max_rows" =~ ^[0-9]+$ ]]; then
    die "MAX_ROWS must be a non-negative integer, got '$max_rows'"
  fi

  kaggle datasets download -d uciml/forest-cover-type-dataset
  unzip -o forest-cover-type-dataset.zip

  ## Process the file ##
  # Shuffle and cap in a single step. Writing to forest.csv while reading
  # from forest.csv (as in "head forest.csv > forest.csv") would truncate the
  # input before it is read, so the sample is taken directly from covtype.csv.
  local -a shuf_args=()
  if [[ -n "$max_rows" ]]; then
    shuf_args+=(-n "$max_rows")
  fi
  shuf "${shuf_args[@]}" -- covtype.csv > forest.csv

  local total_lines train_lines test_lines validation_lines
  total_lines=$(wc -l < forest.csv)
  train_lines=$(round_product "$total_lines" "$train_ratio")
  test_lines=$(round_product "$((total_lines - train_lines))" "$test_val_ratio")
  validation_lines=$((total_lines - train_lines - test_lines))

  # Train: the first train_lines rows.
  head -n "$train_lines" forest.csv > forest_train.csv
  # Test: the next test_lines rows. A single awk avoids a "tail | head"
  # pipeline, whose early-exiting head could kill tail with SIGPIPE and fail
  # the script under pipefail.
  awk -v first="$((train_lines + 1))" -v last="$((train_lines + test_lines))" \
    'NR > last { exit } NR >= first' forest.csv > forest_test.csv
  # Validation: whatever rows are left at the end.
  tail -n "$validation_lines" forest.csv > forest_validation.csv

  mkdir -p artifacts
  mv -- covtype.csv forest.csv forest_test.csv forest_train.csv \
    forest_validation.csv artifacts/
}

main "$@"