ium_444507/script-download.py

75 lines
2.3 KiB
Python
Raw Normal View History

2022-03-19 19:54:20 +01:00
import subprocess
import sys
2022-03-31 22:55:56 +02:00
import pandas as pd
import os
import numpy as np
2022-03-19 19:54:20 +01:00
def unzip_package():
    """Extract ``./car-prices-poland.zip`` into the current directory, then delete it.

    Files that already exist are overwritten. The archive is removed only
    after extraction has succeeded, so a failed run leaves it in place for
    inspection or a retry.

    Raises:
        FileNotFoundError: If the archive is not in the current directory.
        zipfile.BadZipFile: If the file is not a valid zip archive.
    """
    # Function-scope import so the block does not depend on the file header.
    import zipfile

    archive_path = './car-prices-poland.zip'

    print('Unzipping dataset...')
    with zipfile.ZipFile(archive_path) as archive:
        archive.extractall()
    print('Dataset unzipped')

    print('Removing .zip file...')
    os.remove(archive_path)
    print('Zip file removed')
2022-03-19 19:54:20 +01:00
2022-04-24 14:52:07 +02:00
2022-03-19 19:54:20 +01:00
def download_dataset():
    """Fetch the car-prices-poland archive with the Kaggle CLI.

    Runs ``kaggle datasets download`` through the shell and then lists the
    working directory so the downloaded file shows up in the log. Exit
    statuses of the shell commands are not checked.
    """
    fetch_command = 'kaggle datasets download -d aleksandrglotov/car-prices-poland'
    listing_command = 'ls -la'

    print('Downloading dataset...')
    os.system(fetch_command)

    # Directory listing is diagnostic output only.
    print('Dir after downloading')
    os.system(listing_command)

    print('Dataset downloaded')
2022-03-19 19:54:20 +01:00
def _count_lines(path):
    """Return the number of lines in the file at *path*."""
    with open(path, 'rb') as handle:
        return sum(1 for _ in handle)


def divide_dataset(dataset):
    """Split the raw CSV into dev, test and train files using shell tools.

    The header line of ``Car_Prices_Poland_Kaggle.csv`` is dropped and the
    remaining rows are shuffled. With ``n = len(dataset) // 6``, shuffled
    rows ``1..n`` go to the dev file, rows ``n+1..2n`` to the test file and
    every row after that to the train file. The output files have no header
    row. The temporary shuffled file is removed afterwards.

    Args:
        dataset: Any sized object (the script passes the normalized
            DataFrame). Only ``len(dataset)`` is used, to size the dev and
            test splits; the rows written come from the CSV on disk.
    """
    print('Dividing dataset...')
    os.system('tail -n +2 Car_Prices_Poland_Kaggle.csv | shuf > ./Car_Prices_Poland_Kaggle_shuf.csv')

    len1 = len(dataset) // 6
    held_out = len1 * 2
    # 1-based line number of the first train row (used with `tail -n +K`).
    len2 = held_out + 1

    os.system(f'head -n {len1} Car_Prices_Poland_Kaggle_shuf.csv > Car_Prices_Poland_Kaggle_dev.csv')
    # Take the first 2*len1 rows and keep the last len1 of them, so the test
    # split is the block right after dev. Taking `head -n len1` here would
    # reproduce the dev split exactly.
    os.system(f'head -n {held_out} Car_Prices_Poland_Kaggle_shuf.csv | tail -n {len1} > Car_Prices_Poland_Kaggle_test.csv')
    os.system(f'tail -n +{len2} Car_Prices_Poland_Kaggle_shuf.csv > Car_Prices_Poland_Kaggle_train.csv')

    os.system('rm ./Car_Prices_Poland_Kaggle_shuf.csv')

    # Compare the rows actually written with the size of `dataset`; len2 is a
    # line offset, not a length, so it must not be part of this sum.
    written = sum(
        _count_lines(f'Car_Prices_Poland_Kaggle_{split}.csv')
        for split in ('dev', 'test', 'train')
    )
    print("Len match: " + str(written == len(dataset)))
    os.system('cat Car_Prices_Poland_Kaggle_train.csv | wc -l')
    os.system('cat Car_Prices_Poland_Kaggle_dev.csv | wc -l')
    os.system('cat Car_Prices_Poland_Kaggle_test.csv | wc -l')

    print('Dataset devided')
2022-03-19 19:54:20 +01:00
def normalize_dataset(dataset):
    """Drop unused columns and incomplete rows, then min-max scale numeric columns.

    The ``"Unnamed: 0"`` and ``"generation_name"`` columns are removed, rows
    containing missing values are dropped, and every numeric column is
    rescaled to the ``[0, 1]`` range. A numeric column whose values are all
    equal is set to ``0.0`` instead of dividing by zero. The input frame is
    left unmodified.

    Args:
        dataset: DataFrame containing at least the ``"Unnamed: 0"`` and
            ``"generation_name"`` columns.

    Returns:
        A new DataFrame with the cleaned and scaled data.

    Raises:
        KeyError: If either of the columns to drop is missing.
    """
    print('--------------- Initial dataset length ---------------')
    print(len(dataset))

    # Non-inplace drop plus an explicit copy: the caller's frame stays intact
    # and the column assignments below act on an independent object.
    dataset = dataset.drop(columns=["Unnamed: 0", "generation_name"]).dropna().copy()

    # Select by dtype rather than by inspecting one row, so frames with fewer
    # than two rows work and all numeric dtypes are covered.
    for column in dataset.select_dtypes(include=[np.number]).columns:
        lowest = dataset[column].min()
        span = dataset[column].max() - lowest
        if span == 0:
            # Constant column: 0/0 would produce NaN.
            dataset[column] = 0.0
        else:
            dataset[column] = (dataset[column] - lowest) / span

    return dataset
2022-03-31 23:23:30 +02:00
# Script entry: download the archive, extract it, load the CSV, normalize
# the data, then write the dev/test/train splits.
download_dataset()
unzip_package()

cars = pd.read_csv('./Car_Prices_Poland_Kaggle.csv')
df = normalize_dataset(pd.DataFrame(cars))

divide_dataset(df)
2022-04-02 20:28:16 +02:00