# Prepare train/dev/test subsets of the Melbourne property sales data.
#
# Steps: read the raw CSV, derive the binary 'PriceAboveMedian' target,
# keep a fixed set of numeric features, sample at most `cutoff` rows, split
# them into train/dev/test, min-max scale the features and write every
# subset to a CSV file in the current working directory.
import os

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

# One seed for sampling and both splits so repeated runs give the same files.
RANDOM_STATE = 42

features = ['Rooms', 'Distance', 'Bedroom2', 'Bathroom']
target = 'PriceAboveMedian'

# get data
sells = pd.read_csv('data/Property Sales of Melbourne City.csv')

# Rows without a price cannot be labelled: `NaN > median` is False, so they
# would silently get label 0 and survive the later dropna (which only looks
# at the kept columns). Drop them before building the target.
sells = sells.dropna(subset=['Price']).copy()

# prepare column, which will be predicted
price_median = sells['Price'].median()


def price_above_median(price):
    """Return 1 if `price` is strictly greater than the median price, else 0."""
    return int(price > price_median)


sells[target] = sells['Price'].apply(price_above_median)

# delete unnecessary columns and drop rows with NaN values
columns_to_take = features + [target]
sells = sells[columns_to_take].dropna()

# cut off dataset to a fixed number of values; never request more rows than
# are available, because DataFrame.sample raises ValueError in that case
cutoff = 1000
sells = sells.sample(n=min(cutoff, len(sells)), random_state=RANDOM_STATE)

# split dataset: 80/20 into train/test, then 80/20 of train into train/dev
train_data, test_data = train_test_split(
    sells, test_size=0.2, random_state=RANDOM_STATE
)
train_data, dev_data = train_test_split(
    train_data, test_size=0.2, random_state=RANDOM_STATE
)

# prepare dataset
X_train = train_data[features].values
y_train = train_data[target].values
X_dev = dev_data[features].values
y_dev = dev_data[target].values
X_test = test_data[features].values
y_test = test_data[target].values

# normalize values: fit the scaler on the training subset only and reuse its
# parameters for dev/test, so all subsets share one scale and no statistics
# leak from the evaluation data. Dev/test values may therefore fall slightly
# outside [0, 1].
scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train)
X_dev = scaler.transform(X_dev)
X_test = scaler.transform(X_test)

# save subsets to files
X_train = pd.DataFrame(X_train, columns=features)
y_train = pd.DataFrame(y_train, columns=[target])
X_dev = pd.DataFrame(X_dev, columns=features)
y_dev = pd.DataFrame(y_dev, columns=[target])
X_test = pd.DataFrame(X_test, columns=features)
y_test = pd.DataFrame(y_test, columns=[target])

X_train.to_csv('X_train.csv', index=False)
X_dev.to_csv('X_val.csv', index=False)
X_test.to_csv('X_test.csv', index=False)
y_train.to_csv('Y_train.csv', index=False)
y_dev.to_csv('Y_val.csv', index=False)
y_test.to_csv('Y_test.csv', index=False)