2023-04-21 09:37:13 +02:00
|
|
|
import os
|
2023-04-20 21:01:03 +02:00
|
|
|
import pandas as pd
|
|
|
|
from sklearn.model_selection import train_test_split
|
2023-04-21 14:05:54 +02:00
|
|
|
from sklearn.preprocessing import MinMaxScaler
|
2023-04-20 21:01:03 +02:00
|
|
|
|
|
|
|
# Load the raw property sales records from the CSV shipped in data/.
raw_csv_path = 'data/Property Sales of Melbourne City.csv'
sells = pd.read_csv(raw_csv_path)

# Median of the 'Price' column; this is the threshold that the binary
# target column is derived from.
price_median = sells['Price'].median()
|
2023-04-20 21:01:03 +02:00
|
|
|
|
2023-04-21 14:05:54 +02:00
|
|
|
def price_above_median(price):
    """Return 1 when *price* exceeds the module-level ``price_median``, else 0.

    Any comparison that evaluates falsy (a price equal to or below the
    median, or a NaN price) yields 0.
    """
    return 1 if price > price_median else 0
|
|
|
|
|
|
|
|
# Derive the binary target: 1 if the sale price is above the median, else 0.
sells['PriceAboveMedian'] = sells['Price'].apply(price_above_median)

# Keep only the model columns and drop rows with NaN values.
# 'Price' is included in the NaN check on purpose: a missing price compares
# as "not above the median" and would otherwise be kept with a bogus label 0.
columns_to_take = ['Rooms', 'Distance', 'Bedroom2', 'Bathroom', 'PriceAboveMedian']
sells = sells.dropna(subset=columns_to_take + ['Price'])[columns_to_take]

# Cut the dataset down to a fixed number of rows.
# - min(...) avoids the ValueError that sample() raises when fewer than
#   `cutoff` rows are left after dropping incomplete records.
# - random_state matches the seed used for the train/dev/test splits, so the
#   generated subsets are reproducible from run to run.
cutoff = 1000
sells = sells.sample(n=min(cutoff, len(sells)), random_state=42)
|
|
|
|
|
2023-04-21 14:05:54 +02:00
|
|
|
# Split the dataset in two steps with a fixed seed: first hold out 20% of the
# rows as the test set, then hold out 20% of the remaining rows as the dev set.
train_data, test_data = train_test_split(sells, test_size=0.2, random_state=42)
train_data, dev_data = train_test_split(train_data, test_size=0.2, random_state=42)

# Column names for the model inputs and for the label to predict.
features = ['Rooms', 'Distance', 'Bedroom2', 'Bathroom']
target = 'PriceAboveMedian'

# Pull the feature matrix and label vector out of each subset.
X_train, y_train = train_data[features].values, train_data[target].values
X_dev, y_dev = dev_data[features].values, dev_data[target].values
X_test, y_test = test_data[features].values, test_data[target].values
|
|
|
|
|
|
|
|
# Normalize feature values with min-max scaling.
# The scaler is fitted on the training split only and its learned per-column
# min/max are reused for dev and test. Fitting separately on each split would
# map the same raw value to different scaled values in different subsets and
# would leak evaluation-set statistics into preprocessing.
# Consequence: dev/test values outside the training range fall outside [0, 1].
# The scaler handles every column of the matrix in one call, so no per-feature
# loop is needed.
scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train)
X_dev = scaler.transform(X_dev)
X_test = scaler.transform(X_test)
|
2023-04-20 21:01:03 +02:00
|
|
|
|
|
|
|
# Wrap the arrays back into labelled DataFrames and save each subset to a CSV
# file in the current working directory.
feature_columns = ['Rooms', 'Distance', 'Bedroom2', 'Bathroom']
target_columns = ['PriceAboveMedian']

X_train = pd.DataFrame(X_train, columns=feature_columns)
y_train = pd.DataFrame(y_train, columns=target_columns)
X_dev = pd.DataFrame(X_dev, columns=feature_columns)
y_dev = pd.DataFrame(y_dev, columns=target_columns)
X_test = pd.DataFrame(X_test, columns=feature_columns)
y_test = pd.DataFrame(y_test, columns=target_columns)

# Note: the dev subset is stored under the "_val" file names.
outputs = (
    ('X_train.csv', X_train),
    ('X_val.csv', X_dev),
    ('X_test.csv', X_test),
    ('Y_train.csv', y_train),
    ('Y_val.csv', y_dev),
    ('Y_test.csv', y_test),
)
for filename, frame in outputs:
    # index=False: write only the data columns, without the row index.
    frame.to_csv(filename, index=False)
|