"""Prepare the Melbourne property sales data set for model training.

Steps, in order:

1. Load ``data/Property Sales of Melbourne City.csv``.
2. Drop the columns that are not used and every row containing a NaN.
3. Scale ``Price``, ``Landsize`` and ``Distance`` by their column maximum.
4. Randomly keep at most ``CUTOFF`` rows (``CUTOFF`` is a required
   environment variable holding an integer).
5. Split into train / validation / test subsets (70% / 15% / 15%).
6. Write the six subsets to CSV files in the current working directory.
"""

import os

import pandas as pd
from sklearn.model_selection import train_test_split

# One seed for every random step so repeated runs with the same input and
# the same CUTOFF produce identical output files.
RANDOM_STATE = 1

# get data
sells = pd.read_csv('data/Property Sales of Melbourne City.csv')

# delete unnecessary columns and drop rows with NaN values
# (column names are spelled exactly as they appear in the CSV header)
columns_to_drop = [
    'Lattitude',
    'Longtitude',
    'CouncilArea',
    'Propertycount',
    'Method',
    'SellerG',
    'Date',
    'Postcode',
    'Bedroom2',
    'Bathroom',
    'Car',
    'BuildingArea',
    'Address',
]
sells = sells.drop(columns=columns_to_drop).dropna()

# normalize values: divide each column by its maximum
# NOTE(review): the maxima are taken over the full cleaned data set, before
# sampling and splitting, so the test rows influence the scaling - confirm
# this is intended.
for column in ("Price", "Landsize", "Distance"):
    sells[column] = sells[column] / sells[column].max()

# cut off dataset to fixed number of values
# CUTOFF must be set; a missing variable raises KeyError and a non-integer
# value raises ValueError.
cutoff = int(os.environ['CUTOFF'])
# Clamp to the number of available rows: DataFrame.sample raises ValueError
# when asked for more rows than exist (sampling is without replacement).
sample_size = min(cutoff, len(sells))
# Seeded so that the seeded splits below actually yield reproducible files.
sells = sells.sample(sample_size, random_state=RANDOM_STATE)

# split to train/dev/test subsets
# pop() removes 'Price' from the frame in place, leaving only the features.
Y = sells.pop('Price')
X = sells
# 70% train, then the remaining 30% is halved into validation and test.
X_train, X_temp, Y_train, Y_temp = train_test_split(
    X, Y, test_size=0.3, random_state=RANDOM_STATE
)
X_val, X_test, Y_val, Y_test = train_test_split(
    X_temp, Y_temp, test_size=0.5, random_state=RANDOM_STATE
)

# save subsets to files
X_train.to_csv('X_train.csv', index=False)
X_val.to_csv('X_val.csv', index=False)
X_test.to_csv('X_test.csv', index=False)
Y_train.to_csv('Y_train.csv', index=False)
Y_val.to_csv('Y_val.csv', index=False)
Y_test.to_csv('Y_test.csv', index=False)