ium_464914/get_dataset.sh
Alicja Szulecka 90c0f9769c fix kaggle
2024-04-02 13:24:56 +02:00

32 lines
1.1 KiB
Bash

#!/bin/bash
#
# Download the NASA "Meteorite Landings" dataset from Kaggle, shuffle its
# lines, optionally keep only N of the shuffled lines, and split the result
# into train / test / validation CSV files that are moved into ./artifacts.
#
# Usage: get_dataset.sh [N]
#   N  optional non-negative integer: number of shuffled lines to keep.
#      When omitted, every line is kept.
#
# Requires: kaggle, unzip, shuf, bc, awk.
#
# NOTE(review): every line of the CSV is shuffled, so if the file starts with
# a header row, that row ends up at a random position in one of the splits.
# Confirm whether downstream consumers expect this before changing it.

set -euo pipefail

### Variables ###
readonly DATASET='nasa/meteorite-landings'
readonly ARCHIVE='meteorite-landings.zip'
readonly RAW_CSV='meteorite-landings.csv'
readonly SHUFFLED_CSV='shuffled-meteorite-landings.csv'
readonly TRAIN_CSV='meteorite_train.csv'
readonly TEST_CSV='meteorite_test.csv'
readonly VALIDATION_CSV='meteorite_validation.csv'
readonly ARTIFACTS_DIR='artifacts'

# Share of all lines that goes to the training split.
readonly TRAIN_RATIO=0.8
# Share of the remaining lines that goes to the test split; whatever is left
# after that becomes the validation split.
readonly TEST_VAL_RATIO=0.5

# Print a message to stderr and abort the script.
die() {
  printf '%s\n' "$*" >&2
  exit 1
}

# Print "$1 * $2" rounded to the nearest integer (halves round up).
round_product() {
  # bc's default scale is 0, so "/ 1" truncates; adding 0.5 first turns the
  # truncation into rounding.
  printf '(%s * %s + 0.5) / 1\n' "$1" "$2" | bc
}

main() {
  local cutoff=${1:-}
  if [[ -n "$cutoff" && ! "$cutoff" =~ ^[0-9]+$ ]]; then
    die "usage: ${0##*/} [N]  (N must be a non-negative integer, got '$cutoff')"
  fi

  local tool
  for tool in kaggle unzip shuf bc awk; do
    command -v "$tool" >/dev/null || die "required tool not found: $tool"
  done

  kaggle datasets download -d "$DATASET"
  unzip -o "$ARCHIVE"

  ## File processing ##
  # Shuffle and cut off to N lines in a single step. The previous
  # "head -n N file > file" truncated the file before head could read it,
  # which left the shuffled data (and therefore every split) empty.
  local -a shuf_args=(-o "$SHUFFLED_CSV")
  if [[ -n "$cutoff" ]]; then
    shuf_args+=(-n "$cutoff")
  fi
  shuf "${shuf_args[@]}" -- "$RAW_CSV"

  local total_lines train_lines test_lines
  total_lines=$(wc -l <"$SHUFFLED_CSV")
  train_lines=$(round_product "$total_lines" "$TRAIN_RATIO")
  test_lines=$(round_product "$((total_lines - train_lines))" "$TEST_VAL_RATIO")

  # Create every split up front so that an empty split still exists for mv.
  : >"$TRAIN_CSV"
  : >"$TEST_CSV"
  : >"$VALIDATION_CSV"

  # One pass over the shuffled file: the first train_lines lines go to train,
  # the next test_lines lines to test, and the remainder to validation.
  # A single awk avoids a "tail | head" pipeline, whose early-exiting reader
  # can kill the writer with SIGPIPE and trip pipefail.
  LC_ALL=C awk \
    -v train_end="$train_lines" \
    -v test_end="$((train_lines + test_lines))" \
    -v train_file="$TRAIN_CSV" \
    -v test_file="$TEST_CSV" \
    -v validation_file="$VALIDATION_CSV" '
      NR <= train_end { print > train_file; next }
      NR <= test_end  { print > test_file; next }
                      { print > validation_file }
    ' "$SHUFFLED_CSV"

  mkdir -p "$ARTIFACTS_DIR"
  mv -- "$RAW_CSV" "$SHUFFLED_CSV" "$TEST_CSV" "$TRAIN_CSV" "$VALIDATION_CSV" \
    "$ARTIFACTS_DIR/"
}

main "$@"