"""Fetch, normalise, split and summarise the Car Prices Poland dataset.

Running this file as a script installs its third-party dependencies,
downloads the Kaggle archive, extracts it, min-max normalises the numeric
columns, writes train/dev/test CSV splits and prints summary statistics.

Third-party packages (pandas) are imported inside the functions that need
them, so that ``install_dependencies()`` can run first on a machine where
they are not installed yet.
"""
import os
import subprocess
import sys
import zipfile

DATASET_SLUG = 'aleksandrglotov/car-prices-poland'
ARCHIVE_NAME = 'car-prices-poland.zip'
CSV_PATH = './Car_Prices_Poland_Kaggle.csv'
REQUIRED_PACKAGES = ('kaggle', 'pandas', 'seaborn', 'scikit-learn')
# Columns removed by normalize_dataset() before scaling.
DROPPED_COLUMNS = ('Unnamed: 0', 'generation_name')


def install_dependencies():
    """Upgrade pip, then install kaggle, pandas, seaborn and scikit-learn.

    Raises:
        subprocess.CalledProcessError: if any pip invocation fails.
    """
    pip_install = [sys.executable, '-m', 'pip', 'install']
    subprocess.check_call(pip_install + ['--upgrade', 'pip'])
    subprocess.check_call(pip_install + list(REQUIRED_PACKAGES))


def unzip_package():
    """Extract the downloaded archive into the current working directory.

    Existing files are overwritten. Uses the standard-library ``zipfile``
    module, so no external ``unzip`` binary is required.

    Raises:
        FileNotFoundError: if the archive has not been downloaded.
        zipfile.BadZipFile: if the archive is corrupt.
    """
    with zipfile.ZipFile(ARCHIVE_NAME) as archive:
        archive.extractall()


def download_dataset():
    """Download the Kaggle dataset archive via the ``kaggle`` CLI.

    Raises:
        subprocess.CalledProcessError: if the CLI exits with an error,
            instead of silently continuing without the archive.
    """
    subprocess.check_call(
        ['kaggle', 'datasets', 'download', '-d', DATASET_SLUG])


def divide_dataset(dataset, *, output_dir='.', seed=None):
    """Shuffle ``dataset`` and write disjoint train/dev/test CSV files.

    The split is 60% / 20% / 20%, computed in whole tenths of the row
    count; rows left over by the integer division go to the train split.
    Files are named ``Car_Prices_Poland_Kaggle_{train,dev,test}.csv``.

    The rows written are those of the ``dataset`` argument as passed in
    (so a normalised frame yields normalised splits), each file has a
    header row, and no row appears in more than one split.

    Args:
        dataset: pandas DataFrame to split. It is not modified.
        output_dir: directory that receives the three CSV files.
        seed: optional random seed for a reproducible shuffle.
    """
    total = len(dataset)
    tenth = total // 10
    len_dev = tenth * 2
    len_test = tenth * 2
    # Everything not claimed by dev/test goes to train: 6 tenths plus the
    # remainder of the integer division.
    len_train = total - len_dev - len_test

    shuffled = dataset.sample(frac=1, random_state=seed)

    start = 0
    for split_name, size in (('train', len_train),
                             ('dev', len_dev),
                             ('test', len_test)):
        target = os.path.join(
            output_dir, f'Car_Prices_Poland_Kaggle_{split_name}.csv')
        shuffled.iloc[start:start + size].to_csv(target, index=False)
        start += size

    print("Len match: " + str(sum([len_test, len_dev, len_train]) == total))


def get_statistics(dataset):
    """Print the row count and ``describe(include='all')`` of ``dataset``.

    The column-display limit is lifted only while printing, so the global
    pandas display options are left untouched afterwards.
    """
    import pandas as pd

    print('--------------- Dataset length ---------------')
    print(len(dataset))

    print('---------------Describe dataset---------------')
    with pd.option_context('display.max_columns', None):
        print(dataset.describe(include='all'))


def normalize_dataset(dataset):
    """Drop unused columns and min-max scale numeric columns to [0, 1].

    The frame is modified in place and also returned.

    Numeric columns are detected by dtype (booleans excluded), so this
    works for frames of any length. A column whose values are all equal is
    set to 0.0 instead of producing NaN from a division by zero. Columns
    listed in ``DROPPED_COLUMNS`` are removed if present.

    Args:
        dataset: pandas DataFrame to normalise.

    Returns:
        The same DataFrame object, normalised.
    """
    from pandas.api.types import is_bool_dtype, is_numeric_dtype

    dataset.drop(columns=list(DROPPED_COLUMNS), inplace=True,
                 errors='ignore')

    for column in dataset.columns:
        values = dataset[column]
        if not is_numeric_dtype(values) or is_bool_dtype(values):
            continue
        lowest = values.min()
        span = values.max() - lowest
        if span == 0:
            # Constant column: every value sits at the minimum.
            dataset[column] = 0.0
        else:
            dataset[column] = (values - lowest) / span

    return dataset


def main():
    """Run the full pipeline: install, download, normalise, split, report."""
    install_dependencies()

    # Imported only after the install step so a fresh environment works.
    import pandas as pd

    download_dataset()
    unzip_package()
    cars = pd.read_csv(CSV_PATH)
    normalize_dataset(cars)
    divide_dataset(cars)
    get_statistics(cars)


if __name__ == '__main__':
    main()