"""Normalize the car-prices CSV and split it into dev/test/train files.

Usage: python <this script> <path-to-dataset.csv>
"""
import os
import random
import subprocess
import sys

import numpy as np
import pandas as pd

# Common file-name stem of the three generated split files.
_OUTPUT_STEM = "Car_Prices_Poland_Kaggle"


def divide_dataset(dataset, path, out_dir="data"):
    """Shuffle the rows of the CSV at *path* and write dev/test/train splits.

    The header line of *path* is dropped, the remaining lines are shuffled
    and written (without a header) to ``<out_dir>/Car_Prices_Poland_Kaggle_
    {dev,test,train}.csv``.  Dev and test each receive ``len(dataset) // 6``
    rows; train receives every remaining row.

    Args:
        dataset: Sized object (e.g. a DataFrame); only ``len(dataset)`` is
            used, to derive the dev/test split size.
        path: Path of the source CSV whose raw lines are split.
        out_dir: Directory for the output files; created if missing.

    Raises:
        OSError: If *path* cannot be read or the outputs cannot be written.
    """
    print('Shuffle dataset...')
    # Binary mode keeps every row byte-for-byte, whatever its encoding.
    with open(path, "rb") as source:
        rows = source.readlines()[1:]  # skip the header line
    # A final row without a newline would fuse with its successor once
    # shuffled into the middle of a file.
    if rows and not rows[-1].endswith(b"\n"):
        rows[-1] += b"\n"
    random.shuffle(rows)

    split_size = len(dataset) // 6

    print('Dividing dataset...')
    # Consecutive, non-overlapping slices of the shuffled rows.
    splits = {
        "dev": rows[:split_size],
        "test": rows[split_size:2 * split_size],
        "train": rows[2 * split_size:],
    }
    os.makedirs(out_dir, exist_ok=True)
    for split_name, split_rows in splits.items():
        target = os.path.join(out_dir, f"{_OUTPUT_STEM}_{split_name}.csv")
        with open(target, "wb") as sink:
            sink.writelines(split_rows)

    # False here means the file holds a different number of data rows than
    # *dataset* (for instance because rows were dropped from the frame).
    written = sum(len(split_rows) for split_rows in splits.values())
    print("Len match: " + str(written == len(dataset)))
    for split_name in ("train", "dev", "test"):
        print(len(splits[split_name]))
    print('Dataset devided')


def normalize_dataset(dataset):
    """Return a cleaned copy of *dataset* with numeric columns in [0, 1].

    The ``"Unnamed: 0"`` and ``"generation_name"`` columns are removed when
    present, rows containing missing values are dropped, and every numeric
    (non-boolean) column is min-max scaled.  A column whose values are all
    equal is set to ``0.0`` rather than dividing by zero.  The input frame
    is left unmodified.

    Args:
        dataset: Source ``pandas.DataFrame``.

    Returns:
        A new, normalized ``pandas.DataFrame``.
    """
    print('--------------- Initial dataset length ---------------')
    print(len(dataset))

    cleaned = (
        dataset.drop(columns=["Unnamed: 0", "generation_name"], errors="ignore")
        .dropna()
        .copy()  # own the data so the column assignments below are safe
    )

    for column in cleaned.columns:
        values = cleaned[column]
        is_number = pd.api.types.is_numeric_dtype(values)
        if not is_number or pd.api.types.is_bool_dtype(values):
            continue
        lowest = values.min()
        span = values.max() - lowest
        if span == 0:
            # Constant column: min-max scaling is undefined (0 / 0).
            cleaned[column] = 0.0
        else:
            cleaned[column] = (values - lowest) / span
    return cleaned


def main(argv=None):
    """Run the normalize-and-split pipeline; return a process exit code.

    Args:
        argv: Command-line arguments without the program name; defaults to
            ``sys.argv[1:]``.
    """
    args = sys.argv[1:] if argv is None else list(argv)
    if not args:
        print("Exception while retrieving dataset path")
        print(f"usage: {os.path.basename(sys.argv[0])} <dataset.csv>")
        return 1

    dataset_path = args[0]
    cars = normalize_dataset(pd.read_csv(dataset_path))
    # NOTE(review): the normalized values are not written anywhere; the
    # frame only determines the split size while the raw rows of the file
    # are what gets split. Confirm this is intended.
    divide_dataset(cars, dataset_path)
    return 0


if __name__ == "__main__":
    sys.exit(main())