61 lines
1.9 KiB
Python
61 lines
1.9 KiB
Python
|
import os
import random
import subprocess
import sys

import numpy as np
import pandas as pd
|
||
|
|
||
|
|
||
|
# The path of the dataset CSV is taken from the first command-line argument.
try:
    dataset_path = sys.argv[1]
except IndexError:
    # Nothing below can run without a path; stop with a clear message rather
    # than continuing and failing later with a NameError on ``dataset_path``.
    print("Exception while retrieving dataset path", file=sys.stderr)
    print("Usage: <script> <path-to-dataset.csv>", file=sys.stderr)
    sys.exit(1)
|
||
|
|
||
|
|
||
|
def divide_dataset(dataset, path):
    """Split the CSV file at ``path`` into dev, test and train files.

    The header line of the source file is dropped, the remaining lines are
    shuffled, and the result is written to three header-less files in the
    ``data`` directory (created if missing):

    * ``Car_Prices_Poland_Kaggle_dev.csv``   - first ``len(dataset) // 6`` lines
    * ``Car_Prices_Poland_Kaggle_test.csv``  - next ``len(dataset) // 6`` lines
    * ``Car_Prices_Poland_Kaggle_train.csv`` - every remaining line

    The three splits are disjoint. Progress messages, a "Len match" check
    and the line count of each split (train, dev, test) are printed.

    Args:
        dataset: Any sized object; only ``len(dataset)`` is used, to derive
            the size of the dev and test splits.
        path: Path of the CSV file whose lines are split.

    NOTE(review): the rows written come from the file at ``path``, not from
    ``dataset`` itself, so preprocessing applied to ``dataset`` is not
    reflected in the output files - confirm this is intended.
    """
    out_dir = 'data'
    prefix = os.path.join(out_dir, 'Car_Prices_Poland_Kaggle')

    print('Shuffle dataset...')
    # Binary mode keeps every line byte-for-byte, whatever the file encoding.
    with open(path, 'rb') as source:
        rows = source.readlines()[1:]  # skip the header line
    # Terminate the last line so shuffling cannot glue two rows together.
    if rows and not rows[-1].endswith(b'\n'):
        rows[-1] += b'\n'
    random.shuffle(rows)

    len1 = len(dataset) // 6  # size of the dev split and of the test split
    len2 = len1 * 2           # index of the first train row

    print('Dividing dataset...')
    splits = {
        'train': rows[len2:],
        'dev': rows[:len1],
        'test': rows[len1:len2],
    }
    os.makedirs(out_dir, exist_ok=True)
    for name, part in splits.items():
        with open(f'{prefix}_{name}.csv', 'wb') as target:
            target.writelines(part)

    # True only when the file holds exactly as many data rows as ``dataset``.
    written = sum(len(part) for part in splits.values())
    print("Len match: " + str(written == len(dataset)))
    for part in splits.values():  # order: train, dev, test
        print(len(part))

    print('Dataset devided')
|
||
|
|
||
|
|
||
|
def normalize_dataset(dataset):
    """Drop unnecessary columns and scale numeric columns to the [0, 1] range.

    The ``"Unnamed: 0"`` and ``"generation_name"`` columns are removed when
    present, rows containing missing values are dropped, and every numeric
    column is min-max scaled. A numeric column holding a single distinct
    value is set to ``0.0`` instead of being divided by a zero range.

    The initial number of rows is printed. The input frame is left
    unmodified.

    Args:
        dataset: ``pandas.DataFrame`` to clean.

    Returns:
        A new ``pandas.DataFrame`` with the cleaned, scaled data.
    """
    print('--------------- Initial dataset length ---------------')
    print(len(dataset))

    # drop columns; work on a copy so the caller's frame is not mutated and
    # later column assignments do not trigger chained-assignment warnings
    cleaned = (
        dataset
        .drop(columns=["Unnamed: 0", "generation_name"], errors="ignore")
        .dropna()
        .copy()
    )

    # normalize numbers to [0, 1]; the dtype decides what is numeric, so
    # this also works for frames with fewer than two rows
    for column in cleaned.select_dtypes(include="number").columns:
        lowest = cleaned[column].min()
        span = cleaned[column].max() - lowest
        if span == 0:
            # constant column: avoid 0 / 0 producing NaN everywhere
            cleaned[column] = 0.0
        else:
            cleaned[column] = (cleaned[column] - lowest) / span
    return cleaned
|
||
|
|
||
|
|
||
|
# Pipeline: load the CSV named on the command line, clean and scale it,
# then write the dev/test/train split files.
cars = pd.read_csv(dataset_path)
df = normalize_dataset(pd.DataFrame(cars))
divide_dataset(df, dataset_path)
|
||
|
|