ium_464962/create-dataset.sh

36 lines
1.3 KiB
Bash
Raw Normal View History

2024-04-28 20:29:40 +02:00
#!/bin/bash
#
# Download the Kaggle "vehicle-sales-data" dataset, drop every row that
# contains an empty field (two adjacent commas), shuffle the remaining rows
# and split them into test / dev / train CSV files.
#
# Usage: create-dataset.sh ROWS
#   ROWS  number of data rows placed in the test split and, separately, in the
#         dev split; all remaining rows go to the train split.
#
# Results (moved into ./data): car_prices.csv (as extracted from the archive),
# car_prices_test.csv, car_prices_dev.csv, car_prices_train.csv. Every split
# file starts with the header line of car_prices.csv.
#
# Requires: pip, unzip, awk, shuf.
set -euo pipefail

readonly DATASET='syedanwarafridi/vehicle-sales-data'
readonly ARCHIVE='vehicle-sales-data.zip'
readonly SOURCE_CSV='car_prices.csv'

# Scratch directory for intermediate files; set in main, removed by cleanup.
work_dir=''

# Print a message to stderr and abort.
die() {
  printf '%s\n' "$*" >&2
  exit 1
}

# Remove the scratch directory on every exit path (success or failure).
cleanup() {
  if [[ -n "$work_dir" ]]; then
    rm -rf -- "$work_dir"
  fi
}

# Write the CSV header followed by the rows read from stdin to file $1.
write_split() {
  local target=$1
  {
    head -n 1 "$SOURCE_CSV"
    cat
  } > "$target"
}

main() {
  if [[ ! "${1:-}" =~ ^[0-9]+$ ]]; then
    printf 'Usage: %s ROWS\n' "${0##*/}" >&2
    printf '  ROWS  non-negative row count for each of the test and dev splits\n' >&2
    exit 2
  fi
  # Force base 10 so an argument such as "08" is not parsed as invalid octal.
  local split_rows=$(( 10#$1 ))
  local holdout_rows=$(( split_rows * 2 ))

  # NOTE(review): the original appended "/root/.local/bindock", which looks
  # like a typo for the pip user-install script directory - confirm.
  export PATH="$PATH:/root/.local/bin"

  pip install kaggle
  kaggle datasets download -d "$DATASET"
  unzip -o "$ARCHIVE"

  work_dir=$(mktemp -d) || die "cannot create temporary directory"
  trap cleanup EXIT

  # Skip the header, drop rows with an empty field in the middle of the line
  # (",,"), then randomise the row order. Empty first/last fields are not
  # detected by this filter.
  local shuffled="$work_dir/rows.csv"
  tail -n +2 "$SOURCE_CSV" | awk '!/,,/' | shuf > "$shuffled"

  local total_rows
  total_rows=$(wc -l < "$shuffled")
  if (( holdout_rows >= total_rows )); then
    die "ROWS=$split_rows needs $holdout_rows rows for test+dev, but only" \
      "$total_rows rows are available; the train split would be empty"
  fi

  # Rows 1..N -> test, N+1..2N -> dev, 2N+1..end -> train.
  head -n "$split_rows" "$shuffled" \
    | write_split "$work_dir/car_prices_test.csv"
  # head before tail: tail consumes all of head's output, so no stage is
  # killed by SIGPIPE (which would fail the pipeline under pipefail).
  head -n "$holdout_rows" "$shuffled" \
    | tail -n "+$(( split_rows + 1 ))" \
    | write_split "$work_dir/car_prices_dev.csv"
  tail -n "+$(( holdout_rows + 1 ))" "$shuffled" \
    | write_split "$work_dir/car_prices_train.csv"

  # Reported sizes are line counts of the final files, header line included.
  local test_size dev_size train_size
  test_size=$(wc -l < "$work_dir/car_prices_test.csv")
  dev_size=$(wc -l < "$work_dir/car_prices_dev.csv")
  train_size=$(wc -l < "$work_dir/car_prices_train.csv")
  printf '%s\n' "Rozmiar zbioru testowego: $test_size"
  printf '%s\n' "Rozmiar zbioru deweloperskiego: $dev_size"
  printf '%s\n' "Rozmiar zbioru treningowego: $train_size"

  mkdir -p data
  mv -- "$SOURCE_CSV" \
    "$work_dir/car_prices_test.csv" \
    "$work_dir/car_prices_dev.csv" \
    "$work_dir/car_prices_train.csv" \
    data/
}

main "$@"