import subprocess
import sys


def install_dependencies():
    """Upgrade pip, then install the third-party packages used by this script.

    Each package is installed with its own ``pip install`` call, run through
    the interpreter that is executing this script.

    Raises:
        subprocess.CalledProcessError: if any pip invocation exits non-zero.
    """
    pip_install = [sys.executable, '-m', 'pip', 'install']

    # Bring pip itself up to date before installing anything else.
    subprocess.check_call(pip_install + ['--upgrade', 'pip'])

    for package in ('kaggle', 'pandas', 'seaborn', 'scikit-learn'):
        subprocess.check_call(pip_install + [package])


def unzip_package():
    """Extract ``car-prices-poland.zip`` into the current working directory.

    Files that already exist are overwritten. Extraction uses the standard
    library, so no external ``unzip`` program is required and a failure is
    reported as an exception instead of being ignored.

    Raises:
        FileNotFoundError: if the archive is not in the current directory.
        zipfile.BadZipFile: if the file is not a valid zip archive.
    """
    # Imported locally so the function does not depend on module-level imports.
    import zipfile

    with zipfile.ZipFile('car-prices-poland.zip') as archive:
        archive.extractall()


def download_dataset():
    """Download the ``aleksandrglotov/car-prices-poland`` dataset with the kaggle CLI.

    The command is run without a shell and its exit status is checked, so a
    failed download stops the script instead of letting later steps run
    against a missing or stale archive.

    Raises:
        subprocess.CalledProcessError: if the kaggle command exits non-zero.
        FileNotFoundError: if the ``kaggle`` executable is not on PATH.
    """
    subprocess.check_call(
        ['kaggle', 'datasets', 'download', '-d', 'aleksandrglotov/car-prices-poland']
    )


def divide_dataset(dataset):
    """Split the CSV file into disjoint, shuffled train, dev and test files.

    Reads ``Car_Prices_Poland_Kaggle.csv`` from the current directory,
    shuffles its data rows once and writes three non-overlapping slices to
    ``Car_Prices_Poland_Kaggle_train.csv``, ``..._dev.csv`` and
    ``..._test.csv``. Every output file starts with the original header line.

    Split sizes are derived from ``len(dataset)``: dev and test each get
    ``len(dataset) // 10 * 2`` rows and train gets all remaining rows.

    Args:
        dataset: the loaded dataset; only its length is used. It is expected
            to have one entry per data row of the CSV file. If the file holds
            fewer rows than that, the later splits come out short.

    Raises:
        FileNotFoundError: if the source CSV file does not exist.
    """
    # Imported locally so the function does not depend on module-level imports.
    import random

    source_path = 'Car_Prices_Poland_Kaggle.csv'

    # Work on bytes so rows are copied verbatim, whatever the file's encoding.
    with open(source_path, 'rb') as source:
        header = source.readline()
        rows = source.readlines()

    # A last row without a line terminator would fuse with the row that
    # follows it once the order is shuffled.
    if rows and not rows[-1].endswith(b'\n'):
        rows[-1] += b'\n'
    random.shuffle(rows)

    total = len(dataset)
    len_dev = total // 10 * 2
    len_test = total // 10 * 2
    # Train absorbs the remainder so the three sizes always add up to total.
    len_train = total - len_dev - len_test

    start = 0
    for split_name, split_len in (('train', len_train), ('dev', len_dev), ('test', len_test)):
        with open(f'Car_Prices_Poland_Kaggle_{split_name}.csv', 'wb') as target:
            target.write(header)
            target.writelines(rows[start:start + split_len])
        start += split_len

    print("Len match: " + str(sum([len_test, len_dev, len_train]) == len(dataset)))


def get_statistics(dataset):
    """Print the dataset length followed by a describe() summary of all columns."""

    row_count = len(dataset)
    print('--------------- Dataset length ---------------')
    print(row_count)

    print('---------------Describe dataset---------------')
    # Lift pandas' column display limit so the summary is not truncated.
    pd.set_option('display.max_columns', None)
    summary = dataset.describe(include='all')
    print(summary)


def normalize_dataset(dataset):
    """Drop unused columns and min-max scale every numeric column to [0, 1].

    The frame is modified in place and also returned for convenience.

    Numeric columns are selected by dtype, so frames with fewer than two rows
    and numeric dtypes other than int64/float64 are handled as well. A column
    whose values are all equal has no range to scale by; it is set to 0.0
    rather than dividing by zero and filling the column with NaN.

    Args:
        dataset: pandas DataFrame containing the "Unnamed: 0" and
            "generation_name" columns.

    Returns:
        The same DataFrame object, after modification.

    Raises:
        KeyError: if either of the columns to drop is missing.
    """
    dataset.drop(columns=["Unnamed: 0", "generation_name"], inplace=True)

    for column in dataset.select_dtypes(include='number').columns:
        col_min = dataset[column].min()
        col_range = dataset[column].max() - col_min
        if col_range == 0:
            # Constant column: avoid 0/0 and map every value to the lower bound.
            dataset[column] = 0.0
        else:
            dataset[column] = (dataset[column] - col_min) / col_range

    return dataset


# Script entry point: the steps below run in order whenever this file is executed.

# Install the required packages first; pandas may not be importable before this.
install_dependencies()

# These imports are deliberately placed after install_dependencies() so that
# freshly installed packages can be found. The functions defined above only
# look these names up when they are called, which happens further down.
import pandas as pd
import os
import numpy as np

# Fetch the dataset archive and extract it.
download_dataset()
unzip_package()
# Load the CSV file into a DataFrame.
cars = pd.read_csv('./Car_Prices_Poland_Kaggle.csv')
# normalize_dataset modifies `cars` in place, so its return value is not needed.
normalize_dataset(cars)
# Write the train/dev/test CSV files.
divide_dataset(cars)
# Print the dataset length and summary statistics.
get_statistics(cars)