"""Download, normalise, split and summarise the Kaggle "Car Prices Poland" dataset.

Run as a script, the pipeline is:

1. install the required third-party packages with pip,
2. download the dataset archive with the ``kaggle`` command-line tool,
3. extract the archive,
4. load the CSV, drop unused columns and min-max scale the numeric columns,
5. write disjoint train/dev/test CSV files,
6. print summary statistics.
"""
import os
import subprocess
import sys
import zipfile

# Third-party packages (pandas, ...) are deliberately NOT imported at module
# level: they may only become available after install_dependencies() has run.

KAGGLE_DATASET = 'aleksandrglotov/car-prices-poland'
ARCHIVE_NAME = 'car-prices-poland.zip'
CSV_NAME = 'Car_Prices_Poland_Kaggle.csv'
REQUIRED_PACKAGES = ('kaggle', 'pandas', 'seaborn', 'scikit-learn')


def install_dependencies():
    """Upgrade pip, then install kaggle, pandas, seaborn and scikit-learn.

    Every pip invocation uses the interpreter that runs this script, so the
    packages land in the matching environment.

    Raises:
        subprocess.CalledProcessError: if any pip command exits non-zero.
    """
    subprocess.check_call([sys.executable, '-m', 'pip', 'install', '--upgrade', 'pip'])
    for package in REQUIRED_PACKAGES:
        subprocess.check_call([sys.executable, '-m', 'pip', 'install', package])


def unzip_package():
    """Extract the downloaded archive into the current working directory.

    Files that already exist are overwritten. The standard-library
    ``zipfile`` module is used so that no external ``unzip`` binary is
    required and a missing or corrupt archive raises instead of being
    silently ignored.

    Raises:
        FileNotFoundError: if the archive is not present.
        zipfile.BadZipFile: if the archive is not a valid zip file.
    """
    with zipfile.ZipFile(ARCHIVE_NAME) as archive:
        archive.extractall()


def download_dataset():
    """Download the dataset archive with the ``kaggle`` command-line tool.

    Raises:
        FileNotFoundError: if the ``kaggle`` executable is not on PATH.
        subprocess.CalledProcessError: if the download command exits non-zero.
    """
    subprocess.check_call(['kaggle', 'datasets', 'download', '-d', KAGGLE_DATASET])


def divide_dataset(dataset, *, random_state=None, output_dir='.'):
    """Shuffle ``dataset`` and write disjoint train/dev/test CSV files.

    The dev and test parts each receive ``len(dataset) // 10 * 2`` rows; the
    train part receives every remaining row (roughly 60 %, plus whatever is
    left over from the integer division). Rows are drawn from the DataFrame
    that is passed in, so each row ends up in exactly one of the three files.

    Args:
        dataset: pandas DataFrame to split.
        random_state: optional seed forwarded to ``DataFrame.sample`` to make
            the shuffle reproducible.
        output_dir: directory the three CSV files are written to.

    Returns:
        Tuple ``(train, dev, test)`` of the DataFrames that were written.
    """
    total = len(dataset)
    len_dev = total // 10 * 2
    len_test = total // 10 * 2
    # Train absorbs the rows left over by the integer division above.
    len_train = total - len_dev - len_test

    # Shuffle once, then cut consecutive slices so the parts cannot overlap.
    shuffled = dataset.sample(frac=1, random_state=random_state)
    train = shuffled.iloc[:len_train]
    dev = shuffled.iloc[len_train:len_train + len_dev]
    test = shuffled.iloc[len_train + len_dev:]

    for suffix, part in (('train', train), ('dev', dev), ('test', test)):
        target = os.path.join(output_dir, f'Car_Prices_Poland_Kaggle_{suffix}.csv')
        part.to_csv(target, index=False)

    print("Len match: " + str(len(train) + len(dev) + len(test) == total))
    return train, dev, test


def get_statistics(dataset):
    """Print the row count and ``describe(include='all')`` of ``dataset``.

    Note: this sets the pandas option ``display.max_columns`` to ``None`` for
    the rest of the process so that no column is elided from the output.
    """
    import pandas as pd  # deferred: see note at the top of the file

    print('--------------- Dataset length ---------------')
    print(len(dataset))
    print('---------------Describe dataset---------------')
    pd.set_option('display.max_columns', None)
    print(dataset.describe(include='all'))


def normalize_dataset(dataset):
    """Drop unused columns and min-max scale numeric columns to [0, 1].

    The DataFrame is modified in place and also returned.

    Args:
        dataset: pandas DataFrame containing the columns ``"Unnamed: 0"`` and
            ``"generation_name"`` (both are dropped).

    Returns:
        The same DataFrame object, after modification.

    Raises:
        KeyError: if either of the columns to drop is missing.
    """
    dataset.drop(columns=["Unnamed: 0", "generation_name"], inplace=True)

    # Select by dtype instead of probing a single row, so short frames and
    # every numeric width are handled.
    for column in dataset.select_dtypes(include='number').columns:
        lowest = dataset[column].min()
        span = dataset[column].max() - lowest
        if span == 0:
            # A constant column carries no information; map it to 0.0 rather
            # than dividing by zero and filling it with NaN.
            dataset[column] = 0.0
        else:
            dataset[column] = (dataset[column] - lowest) / span

    # NOTE: missing values are not handled here; the original author observed
    # no null rows in this dataset (check with dataset.isnull().sum()).
    return dataset


def main():
    """Run the full download / normalise / split / report pipeline."""
    install_dependencies()

    # pandas may only be importable once the installation step has finished.
    import pandas as pd

    download_dataset()
    unzip_package()

    cars = pd.read_csv('./' + CSV_NAME)
    normalize_dataset(cars)
    divide_dataset(cars)
    get_statistics(cars)


if __name__ == '__main__':
    main()