2024-03-26 18:56:58 +01:00
|
|
|
#!/bin/bash
|
2024-04-13 18:55:49 +02:00
|
|
|
# Download the uciml/forest-cover-type-dataset dataset with the Kaggle CLI.
kaggle datasets download -d uciml/forest-cover-type-dataset
|
2024-03-26 18:56:58 +01:00
|
|
|
|
2024-04-13 18:55:49 +02:00
|
|
|
# Unpack the downloaded archive; -o overwrites existing files without prompting.
unzip -o forest-cover-type-dataset.zip
|
2024-03-26 18:56:58 +01:00
|
|
|
|
|
|
|
### Variables ###
|
|
|
|
|
|
|
|
# Fraction of all rows that goes to the training split.
train_ratio=0.8
|
|
|
|
# Fraction of the rows left after the training split that goes to the test
# split; whatever remains becomes the validation split.
test_val_ratio=0.5
|
|
|
|
|
|
|
|
## File processing ##
|
|
|
|
|
2024-04-13 18:55:49 +02:00
|
|
|
# Write a randomly permuted copy of covtype.csv to forest.csv.
# NOTE(review): every line is shuffled, so if covtype.csv starts with a header
# row it will land at a random position in forest.csv - confirm.
shuf -o forest.csv covtype.csv
|
2024-03-26 18:56:58 +01:00
|
|
|
|
2024-03-26 19:41:42 +01:00
|
|
|
## Keep only the first $1 rows ##
|
2024-04-13 18:55:49 +02:00
|
|
|
# Keep only the first $1 rows of the shuffled file. The result is written to a
# temporary file and then moved into place: redirecting straight back into
# forest.csv makes the shell truncate it before head reads it, which leaves
# the file empty. When no row count is passed, the file is left untouched.
if [ -n "${1:-}" ]; then
  if head -n "$1" forest.csv > forest.csv.tmp; then
    mv forest.csv.tmp forest.csv
  else
    # head rejected the count or could not read the input: keep the original.
    rm -f forest.csv.tmp
  fi
fi
|
2024-03-26 19:41:42 +01:00
|
|
|
|
2024-04-13 18:55:49 +02:00
|
|
|
# Count the rows that remain in forest.csv; reading via stdin keeps the file
# name out of wc's output.
total_lines="$(wc -l <forest.csv)"
|
2024-03-26 18:56:58 +01:00
|
|
|
# Size of the training split before rounding (bc may return a fraction).
# The expression is quoted so the shell hands it to bc verbatim; unquoted, the
# `*` is a glob character and could expand to matching file names.
train_lines=$(echo "$total_lines*$train_ratio" | bc)
|
|
|
|
# Round to the nearest integer: add 0.5, then divide by 1 so that bc (default
# scale 0) drops the fractional part.
train_lines=$(bc <<< "(${train_lines}+0.5)/1")
|
|
|
|
|
|
|
|
# Size of the test split before rounding: the rows not used for training,
# scaled by test_val_ratio.
test_lines=$(printf '%s\n' "(${total_lines}-${train_lines})*${test_val_ratio}" | bc)
|
|
|
|
# Round to the nearest integer: add 0.5, then divide by 1 so that bc (default
# scale 0) drops the fractional part.
test_lines=$(bc <<< "(${test_lines}+0.5)/1")
|
|
|
|
|
|
|
|
# The validation split takes every row not assigned to training or test.
validation_lines=$(bc <<< "${total_lines}-${train_lines}-${test_lines}")
|
|
|
|
|
2024-04-13 18:55:49 +02:00
|
|
|
# Training split: the first train_lines rows of the shuffled file.
head -n "${train_lines}" forest.csv >forest_train.csv
|
|
|
|
# Test split: take the last test_lines + validation_lines rows (everything
# after the training rows) and keep the first test_lines of them.
tail -n "$((test_lines + validation_lines))" forest.csv \
  | head -n "${test_lines}" >forest_test.csv
|
|
|
|
# Validation split: the last validation_lines rows of the shuffled file.
tail -n "${validation_lines}" forest.csv >forest_validation.csv
|
2024-03-26 18:56:58 +01:00
|
|
|
|
|
|
|
# Create the output directory; -p makes this a no-op when it already exists.
mkdir -p artifacts
|
2024-04-13 18:55:49 +02:00
|
|
|
# Move the original CSV, the shuffled file and the three splits into artifacts/.
mv covtype.csv forest.csv forest_test.csv forest_train.csv forest_validation.csv artifacts/
|