ium_464914/get_dataset.sh

32 lines
925 B
Bash
Raw Normal View History

2024-03-26 18:56:58 +01:00
#!/bin/bash
2024-04-13 18:55:49 +02:00
# Fetch the Forest Cover Type dataset archive with the Kaggle CLI.
kaggle_dataset="uciml/forest-cover-type-dataset"
kaggle datasets download -d "$kaggle_dataset"
2024-03-26 18:56:58 +01:00
2024-04-13 18:55:49 +02:00
# Extract the downloaded archive; -o overwrites existing files without prompting.
dataset_archive="forest-cover-type-dataset.zip"
unzip -o "$dataset_archive"
2024-03-26 18:56:58 +01:00
### Variables ###
# Fraction of all rows that goes into the training split.
train_ratio="0.8"
# Fraction of the remaining (non-training) rows that goes into the test split;
# whatever is left after that becomes the validation split.
test_val_ratio="0.5"
## File processing ##
2024-04-13 18:55:49 +02:00
# Write a randomly permuted copy of covtype.csv to forest.csv so that the
# head/tail based splits further down are random samples.
# NOTE(review): every line is shuffled, so a header row (if covtype.csv has
# one) ends up at a random position - confirm this is intended.
shuf -o forest.csv covtype.csv
2024-03-26 18:56:58 +01:00
2024-03-26 19:41:42 +01:00
##Cut off $1 rows##
2024-04-13 18:55:49 +02:00
#######################################
# Truncate a file in place to its first COUNT lines.
# The result is written to a temporary sibling file and then renamed over the
# original: redirecting head's output straight back onto its own input makes
# the shell truncate the file before head reads it, leaving it empty.
# Arguments: $1 - number of lines to keep
#            $2 - path of the file to truncate
# Returns:   0 on success; non-zero if head fails (the file is left untouched)
#######################################
keep_first_rows() {
  local count=$1
  local file=$2
  local tmp="${file}.tmp"

  if head -n "$count" -- "$file" > "$tmp"; then
    mv -f -- "$tmp" "$file"
  else
    rm -f -- "$tmp"
    return 1
  fi
}

# Keep only the first $1 rows (first script argument) of the shuffled data.
keep_first_rows "$1" forest.csv
2024-03-26 19:41:42 +01:00
2024-04-13 18:55:49 +02:00
# Number of rows currently in forest.csv; every split size is derived from it.
total_lines="$(wc -l <forest.csv)"
2024-03-26 18:56:58 +01:00
train_lines=$(echo $total_lines*$train_ratio| bc)
train_lines=$(echo "($train_lines+0.5)/1" | bc )
test_lines=$(echo "($total_lines-$train_lines)*$test_val_ratio" | bc)
test_lines=$(echo "($test_lines+0.5)/1" | bc )
validation_lines=$(echo $total_lines-$train_lines-$test_lines | bc)
2024-04-13 18:55:49 +02:00
# Rows that do not belong to the training split (test followed by validation).
holdout_lines=$((test_lines + validation_lines))

# Training split: the first train_lines rows.
head -n "$train_lines" forest.csv >forest_train.csv
# Test split: the first test_lines rows of the trailing hold-out part.
tail -n "$holdout_lines" forest.csv | head -n "$test_lines" >forest_test.csv
# Validation split: the last validation_lines rows.
tail -n "$validation_lines" forest.csv >forest_validation.csv
2024-03-26 18:56:58 +01:00
# Make sure the output directory exists; -p keeps this quiet if it already does.
mkdir -p -- artifacts
2024-04-13 18:55:49 +02:00
# Move the raw data, the shuffled copy, and the three splits into artifacts/.
artifact_files=(
  covtype.csv
  forest.csv
  forest_test.csv
  forest_train.csv
  forest_validation.csv
)
mv "${artifact_files[@]}" artifacts/