Merge branch 'feature/ium_02'

2022-03-19 19:54:49 +01:00 · 2022-03-19 19:54:49 +01:00 · bbbaa7d35a
commit bbbaa7d35a
parent 747933d63c 3225701a1b
2 changed files with 98 additions and 1 deletions
--- a/script.py
+++ b/script.py
@ -1 +1,88 @@
-print('c')
+import subprocess
+import sys
+
+
+def install_dependencies():
+    """Upgrade pip, then install the packages this script relies on.
+
+    Every step is run as ``<current interpreter> -m pip install ...`` through
+    ``subprocess.check_call``, which raises ``CalledProcessError`` as soon as
+    one of the pip invocations exits with a non-zero status.
+    """
+    pip_install = [sys.executable, '-m', 'pip', 'install']
+
+    # pip itself goes first, then the packages in their original order.
+    subprocess.check_call(pip_install + ['--upgrade', 'pip'])
+    for package in ('kaggle', 'pandas', 'seaborn', 'scikit-learn'):
+        subprocess.check_call(pip_install + [package])
+
+
+def unzip_package():
+    """Extract ``car-prices-poland.zip`` via the ``unzip`` command.
+
+    The ``-o`` flag makes ``unzip`` overwrite existing files without asking.
+    The command's exit status is not inspected.
+    """
+    extract_command = 'unzip -o car-prices-poland.zip'
+    os.system(extract_command)
+
+
+def download_dataset():
+    """Download the car-prices-poland dataset with the ``kaggle`` CLI.
+
+    The command's exit status is not inspected.
+    """
+    dataset_slug = 'aleksandrglotov/car-prices-poland'
+    os.system('kaggle datasets download -d ' + dataset_slug)
+
+
+def divide_dataset(dataset):
+    """Shuffle the raw CSV and split it into disjoint train/dev/test files.
+
+    The rows are split roughly 60/20/20; whatever the integer divisions
+    leave over is added to the train part. Each output file starts with
+    the CSV header line followed by its own slice of the shuffled data
+    rows, so the three files never share a row.
+
+    Only ``len(dataset)`` is read from the argument; the rows themselves
+    come from ``Car_Prices_Poland_Kaggle.csv`` in the working directory.
+    NOTE(review): assumes every data row occupies exactly one line of that
+    file, so ``len(dataset)`` equals the number of lines after the header
+    - confirm.
+    """
+    source = 'Car_Prices_Poland_Kaggle.csv'
+    shuffled = 'Car_Prices_Poland_Kaggle_shuf.csv'
+
+    total = len(dataset)
+    len_dev = total // 10 * 2
+    len_test = total // 10 * 2
+    # Train gets 60% plus the remainder of the integer divisions.
+    len_train = total - len_dev - len_test
+
+    # Shuffle the data rows once; the header line stays out of the shuffle.
+    os.system(f'tail -n +2 {source} | shuf > {shuffled}')
+
+    # Cut consecutive, non-overlapping slices out of the shuffled rows.
+    first_row = 1
+    for part, size in (('train', len_train), ('dev', len_dev), ('test', len_test)):
+        target = f'Car_Prices_Poland_Kaggle_{part}.csv'
+        os.system(f'head -n 1 {source} > {target}')
+        os.system(f'tail -n +{first_row} {shuffled} | head -n {size} >> {target}')
+        first_row += size
+
+    os.system(f'rm ./{shuffled}')
+    print("Len match: " + str(sum([len_test, len_dev, len_train]) == len(dataset)))
+
+
+def get_statistics(dataset):
+    """Print the number of rows and a ``describe`` summary of *dataset*."""
+    row_count = len(dataset)
+    print('--------------- Dataset length ---------------')
+    print(row_count)
+
+    print('---------------Describe dataset---------------')
+    # Remove the column display limit so no column is elided when printing.
+    pd.set_option('display.max_columns', None)
+    summary = dataset.describe(include='all')
+    print(summary)
+
+
+def normalize_dataset(dataset):
+    """Drop unneeded columns and min-max scale numeric columns to [0, 1].
+
+    The frame is modified in place and also returned. Columns
+    ``"Unnamed: 0"`` and ``"generation_name"`` are removed (``KeyError`` if
+    either is missing). Every remaining column with a numeric dtype is
+    rescaled as ``(value - min) / (max - min)``; a column whose values are
+    all equal becomes ``0.0`` instead of dividing by zero. Non-numeric
+    columns are left untouched.
+    """
+    # drop columns
+    dataset.drop(columns=["Unnamed: 0", "generation_name"], inplace=True)
+
+    # Pick numeric columns by dtype rather than by probing a single row,
+    # so frames with fewer than two rows and any numeric width work too.
+    for column in dataset.select_dtypes(include="number").columns:
+        lowest = dataset[column].min()
+        value_range = dataset[column].max() - lowest
+        if value_range == 0:
+            # Constant column: there is no spread to scale by.
+            dataset[column] = 0.0
+        else:
+            dataset[column] = (dataset[column] - lowest) / value_range
+
+    return dataset
+
+
+# Install the third-party packages before anything else. The imports below
+# are placed after this call, presumably so they succeed in an environment
+# where pandas was not installed yet - keep this ordering.
+install_dependencies()
+
+import pandas as pd
+import os
+import numpy as np
+
+# Pipeline: download the archive, extract it, load the CSV, then process it.
+download_dataset()
+unzip_package()
+cars = pd.read_csv('./Car_Prices_Poland_Kaggle.csv')
+# normalize_dataset modifies `cars` in place (its return value is unused
+# here), so the calls below receive the normalized frame.
+normalize_dataset(cars)
+divide_dataset(cars)
+get_statistics(cars)
+
+
+
+
--- a/tesy.py
+++ b/tesy.py
@ -0,0 +1,10 @@
+# cars = pd.read_csv('Car_Prices_Poland_Kaggle.csv')
+# cars_normalized = normalize_dataset(cars)
+#
+# # cars[["mark", "price"]].groupby("mark").mean().plot(kind="bar")
+# cars["mark"].value_counts().plot(kind="bar")
+#
+# print(cars.describe(include='all'))
+# print(cars["price"].value_counts())
+#
+# divide_dataset(cars)