#!/bin/bash
#
# create-dataset.sh - download the Kaggle "vehicle-sales-data" dataset,
# shuffle it and split it into test, dev and train subsets.
#
# Introduced by commit 877ee10 "added create-dataset.sh"
# (s464962, Tue, 26 Mar 2024 22:22:08 +0100).
#
# Usage:
#   create-dataset.sh SUBSET_ROWS
#
# Arguments:
#   SUBSET_ROWS  Number of lines written to the test subset and, likewise,
#                to the dev subset. Every remaining line of the shuffled
#                file goes to the train subset.
#
# Outputs (all moved into ./data/ at the end):
#   car_prices.csv        file extracted from the downloaded archive
#   car_prices_shuf.csv   shuffled copy of car_prices.csv
#   car_prices_test.csv   lines 1 .. SUBSET_ROWS of the shuffled file
#   car_prices_dev.csv    lines SUBSET_ROWS+1 .. 2*SUBSET_ROWS
#   car_prices_train.csv  lines 2*SUBSET_ROWS+1 .. end
#
# Requires: kaggle CLI (installed through pip when missing), unzip, shuf.
# NOTE(review): the kaggle CLI presumably needs API credentials configured
# in the environment - confirm for the machine that runs this script.

# Abort on the first failing command, on unset variables and on a failure
# in any stage of a pipeline, so a broken download is never split.
set -euo pipefail

readonly DATASET_SLUG='syedanwarafridi/vehicle-sales-data'
readonly ARCHIVE='vehicle-sales-data.zip'
readonly RAW_CSV='car_prices.csv'
readonly SHUFFLED_CSV='car_prices_shuf.csv'
readonly TEST_CSV='car_prices_test.csv'
readonly DEV_CSV='car_prices_dev.csv'
readonly TRAIN_CSV='car_prices_train.csv'
readonly OUTPUT_DIR='data'

# Print an error message to stderr and terminate with status 1.
die() {
  printf '%s: %s\n' "${0##*/}" "$*" >&2
  exit 1
}

# Print the usage line to stderr and terminate with status 2.
usage() {
  printf 'Usage: %s SUBSET_ROWS\n' "${0##*/}" >&2
  exit 2
}

# Download the dataset archive and extract it into the current directory.
fetch_dataset() {
  # Install the CLI only when it is not already on PATH.
  command -v kaggle >/dev/null || pip install kaggle

  kaggle datasets download -d "$DATASET_SLUG"

  # -o: overwrite files left over from a previous run without prompting.
  unzip -o "$ARCHIVE"

  [[ -f "$RAW_CSV" ]] || die "$RAW_CSV not found after extracting $ARCHIVE"
}

# Split the shuffled file into test, dev and train subsets.
# Arguments: $1 - number of lines for the test subset and for the dev subset
split_dataset() {
  local -r subset_rows=$1
  local -r test_dev_rows=$(( subset_rows * 2 ))
  local total_rows
  total_rows=$(wc -l < "$SHUFFLED_CSV")

  # The files are still produced in this case, exactly as before; the
  # warning only makes an empty train subset visible.
  if (( test_dev_rows >= total_rows )); then
    printf '%s: warning: 2 x %d lines requested but %s has only %d lines; the train subset will be empty\n' \
      "${0##*/}" "$subset_rows" "$SHUFFLED_CSV" "$total_rows" >&2
  fi

  head -n "$subset_rows" "$SHUFFLED_CSV" > "$TEST_CSV"
  head -n "$test_dev_rows" "$SHUFFLED_CSV" \
    | tail -n "+$(( subset_rows + 1 ))" > "$DEV_CSV"
  tail -n "+$(( test_dev_rows + 1 ))" "$SHUFFLED_CSV" > "$TRAIN_CSV"
}

# Print the line count of every subset (messages intentionally unchanged).
report_sizes() {
  local test_size dev_size train_size
  test_size=$(wc -l < "$TEST_CSV")
  dev_size=$(wc -l < "$DEV_CSV")
  train_size=$(wc -l < "$TRAIN_CSV")

  echo "Rozmiar zbioru testowego: $test_size"
  echo "Rozmiar zbioru deweloperskiego: $dev_size"
  echo "Rozmiar zbioru treningowego: $train_size"
}

main() {
  # Validate the argument before any download or install work starts.
  (( $# >= 1 )) || usage
  [[ "$1" =~ ^[0-9]+$ ]] \
    || die "SUBSET_ROWS must be a non-negative integer, got '$1'"
  # 10# forces base 10 so that a value such as 010 is not read as octal.
  local -r subset_rows=$(( 10#$1 ))

  fetch_dataset

  # Shuffle.
  # NOTE(review): shuf treats every line alike, so if car_prices.csv starts
  # with a CSV header line, that line ends up at a random position in one
  # of the subsets - confirm whether downstream consumers expect this.
  shuf -o "$SHUFFLED_CSV" -- "$RAW_CSV"

  # Split the data into subsets.
  split_dataset "$subset_rows"
  report_sizes

  # Save the artifacts.
  mkdir -p -- "$OUTPUT_DIR"
  mv -- "$RAW_CSV" "$SHUFFLED_CSV" "$TEST_CSV" "$DEV_CSV" "$TRAIN_CSV" \
    "$OUTPUT_DIR/"
}

main "$@"