Merge branch 'feature/ium_02'
This commit is contained in:
commit
bbbaa7d35a
89
script.py
89
script.py
@ -1 +1,88 @@
|
||||
# NOTE(review): the hunk header reports a single old line, so this statement is
# presumably the pre-change content of script.py rather than part of the new
# script — confirm before keeping it.
print('c')
|
||||
import subprocess
|
||||
import sys
|
||||
|
||||
|
||||
def install_dependencies():
    """Upgrade pip, then install kaggle, pandas, seaborn and scikit-learn.

    Every step is a separate ``python -m pip install ...`` run executed with
    the current interpreter. ``subprocess.check_call`` raises
    ``CalledProcessError`` as soon as one of the runs exits with a non-zero
    status, so later steps are not attempted after a failure.
    """
    pip_install = [sys.executable, '-m', 'pip', 'install']
    install_targets = (
        ['--upgrade', 'pip'],
        ['kaggle'],
        ['pandas'],
        ['seaborn'],
        ['scikit-learn'],
    )
    for target in install_targets:
        subprocess.check_call(pip_install + target)
|
||||
|
||||
|
||||
def unzip_package():
    """Extract ``car-prices-poland.zip`` into the current working directory.

    Existing files with the same names are overwritten. The standard-library
    ``zipfile`` module is used instead of shelling out to ``unzip`` so the
    step does not depend on an external binary and a missing or corrupt
    archive is reported instead of being silently ignored.

    Raises:
        FileNotFoundError: if the archive is not in the working directory.
        zipfile.BadZipFile: if the file is not a valid zip archive.
    """
    # Imported locally: this helper is the only place the module is needed.
    import zipfile

    with zipfile.ZipFile('car-prices-poland.zip') as archive:
        archive.extractall()
|
||||
|
||||
|
||||
def download_dataset():
    """Download the ``aleksandrglotov/car-prices-poland`` dataset via the kaggle CLI.

    The command is run as an argument list (no shell) and its exit status is
    checked, so a failed download stops the script here instead of surfacing
    later as a missing archive.

    Raises:
        subprocess.CalledProcessError: if the kaggle command exits non-zero.
        FileNotFoundError: if the ``kaggle`` executable cannot be found.
    """
    subprocess.check_call(
        ['kaggle', 'datasets', 'download', '-d', 'aleksandrglotov/car-prices-poland']
    )
|
||||
|
||||
|
||||
def divide_dataset(dataset, random_state=None):
    """Shuffle ``dataset`` and write disjoint train/dev/test CSV files.

    The split is roughly 60/20/20: dev and test each get ``len // 10 * 2``
    rows and train receives everything else (including the remainder of the
    integer division). The rows of ``dataset`` itself are written, so the
    three files never overlap and a header line is never mixed into the data.

    Output files (created in the current working directory, without the
    index column):
        Car_Prices_Poland_Kaggle_train.csv
        Car_Prices_Poland_Kaggle_dev.csv
        Car_Prices_Poland_Kaggle_test.csv

    Args:
        dataset: pandas DataFrame to split; it is not modified.
        random_state: optional seed forwarded to ``DataFrame.sample`` to make
            the shuffle reproducible.
    """
    total = len(dataset)
    tenth = total // 10
    len_dev = tenth * 2
    len_test = tenth * 2
    # Train absorbs the rows left over by the integer division.
    len_train = total - len_dev - len_test

    # frac=1 returns every row exactly once, in random order.
    shuffled = dataset.sample(frac=1, random_state=random_state)

    dev_end = len_train + len_dev
    train_part = shuffled.iloc[:len_train]
    dev_part = shuffled.iloc[len_train:dev_end]
    test_part = shuffled.iloc[dev_end:]

    train_part.to_csv('Car_Prices_Poland_Kaggle_train.csv', index=False)
    dev_part.to_csv('Car_Prices_Poland_Kaggle_dev.csv', index=False)
    test_part.to_csv('Car_Prices_Poland_Kaggle_test.csv', index=False)

    # Sanity check on the rows actually written, not just the planned sizes.
    written = len(train_part) + len(dev_part) + len(test_part)
    print("Len match: " + str(written == total))
|
||||
|
||||
|
||||
def get_statistics(dataset):
    """Print the row count and ``describe(include='all')`` summary of ``dataset``.

    The summary is printed with no limit on the number of displayed columns.
    The display option is changed only for the duration of the print, so the
    caller's global pandas settings are left untouched.

    Args:
        dataset: pandas DataFrame to summarise.
    """
    print('--------------- Dataset length ---------------')
    print(len(dataset))

    print('---------------Describe dataset---------------')
    # option_context restores the previous value on exit, unlike set_option.
    with pd.option_context('display.max_columns', None):
        print(dataset.describe(include='all'))
|
||||
|
||||
|
||||
def normalize_dataset(dataset):
    """Drop unneeded columns and min-max scale numeric columns to [0, 1].

    The frame is modified in place and also returned, so both
    ``normalize_dataset(df)`` and ``df = normalize_dataset(df)`` work.

    Numeric columns are selected by dtype rather than by inspecting a single
    row, so frames with fewer than two rows and numeric dtypes other than
    int64/float64 are handled. A column whose values are all equal has no
    range to scale by; it is set to 0.0 instead of becoming NaN through a
    division by zero.

    Args:
        dataset: pandas DataFrame to normalise.

    Returns:
        The same DataFrame object, after modification.
    """
    # errors="ignore" keeps the call safe when the columns are already gone,
    # e.g. when the function is applied twice to the same frame.
    dataset.drop(
        columns=["Unnamed: 0", "generation_name"],
        inplace=True,
        errors="ignore",
    )

    # Normalize numbers to [0, 1].
    for column in dataset.select_dtypes(include="number").columns:
        lowest = dataset[column].min()
        span = dataset[column].max() - lowest
        if span == 0:
            dataset[column] = 0.0
        else:
            dataset[column] = (dataset[column] - lowest) / span

    # The original author noted that the data has no null rows
    # (checked with dataset.isnull().sum()), so nulls are not handled here.
    return dataset
|
||||
|
||||
|
||||
# Third-party packages are installed first so that the imports below can
# succeed on a clean environment; this is why they are not at the top of the
# file.
install_dependencies()

import os

import numpy as np
import pandas as pd

# Fetch the archive and unpack the CSV into the working directory.
download_dataset()
unzip_package()

# normalize_dataset modifies the frame in place and returns that same object.
cars = normalize_dataset(pd.read_csv('./Car_Prices_Poland_Kaggle.csv'))

divide_dataset(cars)
get_statistics(cars)
|
||||
|
||||
|
||||
|
||||
|
||||
|
10
tesy.py
Normal file
10
tesy.py
Normal file
@ -0,0 +1,10 @@
|
||||
# cars = pd.read_csv('Car_Prices_Poland_Kaggle.csv')
|
||||
# cars_normalized = normalize_dataset(cars)
|
||||
#
|
||||
# # cars[["mark", "price"]].groupby("mark").mean().plot(kind="bar")
|
||||
# cars["mark"].value_counts().plot(kind="bar")
|
||||
#
|
||||
# print(cars.describe(include='all'))
|
||||
# print(cars["price"].value_counts())
|
||||
#
|
||||
# divide_dataset(cars)
|
Loading…
Reference in New Issue
Block a user