2022-03-19 19:54:20 +01:00
|
|
|
import subprocess
|
|
|
|
import sys
|
2022-03-31 22:55:56 +02:00
|
|
|
import pandas as pd
|
|
|
|
import os
|
|
|
|
import numpy as np
|
2022-03-19 19:54:20 +01:00
|
|
|
|
|
|
|
|
|
|
|
def install_dependencies():
    """Upgrade pip, then install kaggle, pandas, seaborn and scikit-learn.

    Every step invokes ``pip install`` as a module of the interpreter that is
    running this script (``sys.executable -m pip``), one package per call.
    ``subprocess.check_call`` raises ``CalledProcessError`` as soon as one of
    the pip invocations exits with a non-zero status.
    """
    pip_install = [sys.executable, '-m', 'pip', 'install']

    # Argument tails appended to ``pip install``; pip itself is upgraded first.
    targets = (
        ['--upgrade', 'pip'],
        ['kaggle'],
        ['pandas'],
        ['seaborn'],
        ['scikit-learn'],
    )
    for target in targets:
        subprocess.check_call(pip_install + target)
|
|
|
|
|
|
|
|
|
|
|
|
def unzip_package():
    """Extract ``car-prices-poland.zip`` into the current working directory.

    Files that already exist are overwritten. The archive is read with the
    standard-library ``zipfile`` module, so no external ``unzip`` binary is
    needed and a failure is reported instead of being silently ignored.

    Raises:
        FileNotFoundError: if ``car-prices-poland.zip`` is not present.
        zipfile.BadZipFile: if the file is not a valid zip archive.
    """
    # Local import: the module is only needed here.
    import zipfile

    with zipfile.ZipFile('car-prices-poland.zip') as archive:
        archive.extractall()
|
|
|
|
|
|
|
|
|
|
|
|
def download_dataset():
    """Download the ``aleksandrglotov/car-prices-poland`` dataset via the kaggle CLI.

    Runs ``kaggle datasets download -d aleksandrglotov/car-prices-poland`` in
    the current working directory. The command is passed as an argument list
    (no shell involved) and its exit status is checked, so a failed download
    stops the script instead of going unnoticed.

    Raises:
        FileNotFoundError: if the ``kaggle`` executable cannot be found.
        subprocess.CalledProcessError: if the CLI exits with a non-zero status.
    """
    subprocess.check_call(
        ['kaggle', 'datasets', 'download', '-d', 'aleksandrglotov/car-prices-poland']
    )
|
|
|
|
|
|
|
|
|
|
|
|
def divide_dataset(dataset):
    """Shuffle the raw CSV rows and split them into dev, test and train files.

    Reads ``Car_Prices_Poland_Kaggle.csv`` from the current working directory,
    skips its first (header) line, shuffles the remaining lines and writes
    three header-less files next to it:

    * ``Car_Prices_Poland_Kaggle_dev.csv``   - the first ``len(dataset) // 6`` lines,
    * ``Car_Prices_Poland_Kaggle_test.csv``  - the next ``len(dataset) // 6`` lines,
    * ``Car_Prices_Poland_Kaggle_train.csv`` - every remaining line.

    The three parts are disjoint and together cover every data line of the
    source file. Lines are handled as raw bytes, so the content is copied
    unchanged whatever its encoding.

    Afterwards it prints ``Len match: <bool>`` (whether the number of lines
    written equals ``len(dataset)``) followed by the line counts of the
    train, dev and test files, one per line.

    Args:
        dataset: Any sized object; only ``len(dataset)`` is used, to derive
            the size of the dev and test parts.

    Raises:
        OSError: if the source file cannot be read or an output file cannot
            be written.
    """
    # Local import: the module is only needed here.
    import random

    part_size = len(dataset) // 6

    with open('Car_Prices_Poland_Kaggle.csv', 'rb') as source:
        rows = source.readlines()[1:]  # drop the header line

    # A final line without a newline would fuse with its neighbour once the
    # rows are shuffled, so terminate it first.
    if rows and not rows[-1].endswith(b'\n'):
        rows[-1] += b'\n'
    random.shuffle(rows)

    # Consecutive, non-overlapping slices of the shuffled rows.
    parts = {
        'train': rows[2 * part_size:],
        'dev': rows[:part_size],
        'test': rows[part_size:2 * part_size],
    }
    for name, chunk in parts.items():
        with open(f'Car_Prices_Poland_Kaggle_{name}.csv', 'wb') as target:
            target.writelines(chunk)

    written = sum(len(chunk) for chunk in parts.values())
    print("Len match: " + str(written == len(dataset)))
    for name in ('train', 'dev', 'test'):
        print(len(parts[name]))
|
2022-03-19 19:54:20 +01:00
|
|
|
|
|
|
|
|
|
|
|
def get_statistics(dataset):
    """Print the length of ``dataset`` and its ``describe`` summary.

    Output, in order: a "Dataset length" banner, ``len(dataset)``, a
    "Describe dataset" banner and ``dataset.describe(include='all')``.

    Side effect: sets the pandas option ``display.max_columns`` to ``None``
    before the summary is printed; the option is not restored afterwards.
    """
    length_banner = '--------------- Dataset length ---------------'
    describe_banner = '---------------Describe dataset---------------'

    print(length_banner)
    print(len(dataset))

    print(describe_banner)
    pd.set_option('display.max_columns', None)
    summary = dataset.describe(include='all')
    print(summary)
|
|
|
|
|
|
|
|
|
|
|
|
def normalize_dataset(dataset):
    """Drop unneeded columns and incomplete rows, then min-max scale numbers.

    Steps, in order:

    1. remove the ``"Unnamed: 0"`` and ``"generation_name"`` columns,
    2. remove every row that still contains a missing value,
    3. rescale each integer or float column to the ``[0, 1]`` range with
       ``(value - min) / (max - min)``.

    A column whose values are all equal has ``max - min == 0``; it is set to
    ``0.0`` instead of being divided by zero. Non-numeric columns (and
    boolean ones) are left untouched.

    Args:
        dataset: pandas DataFrame containing at least the two columns that
            are dropped. It is not modified; the work happens on a copy.

    Returns:
        A new DataFrame with the cleaned and normalized data.

    Raises:
        KeyError: if ``"Unnamed: 0"`` or ``"generation_name"`` is missing.
    """
    # Non-inplace drop keeps the caller's frame intact; the explicit copy
    # makes the result safe to assign into without chained-assignment issues.
    cleaned = (
        dataset
        .drop(columns=["Unnamed: 0", "generation_name"])
        .dropna()
        .copy()
    )
    if cleaned.empty:
        return cleaned

    for column in cleaned.columns:
        values = cleaned[column]
        # Decide by column dtype rather than by sampling a single row.
        is_number = (pd.api.types.is_integer_dtype(values)
                     or pd.api.types.is_float_dtype(values))
        if not is_number:
            continue

        lowest = values.min()
        span = values.max() - lowest
        if span == 0:
            cleaned[column] = 0.0  # constant column: avoid 0/0 -> NaN
        else:
            cleaned[column] = (values - lowest) / span
    return cleaned
|
|
|
|
|
|
|
|
|
2022-03-31 23:23:30 +02:00
|
|
|
# Prepare the environment and fetch the data: packages first, then the
# archive, then its contents.
install_dependencies()
download_dataset()
unzip_package()

# Load the extracted CSV and normalize it.
cars = pd.read_csv('./Car_Prices_Poland_Kaggle.csv')
df = normalize_dataset(pd.DataFrame(cars))

# Write the dev/test/train split files, then print summary statistics.
divide_dataset(df)
get_statistics(df)
|
2022-03-19 19:54:20 +01:00
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|