ium_z487183/prepare_dataset.py

68 lines
2.2 KiB
Python
Raw Permalink Normal View History

2023-04-21 09:37:13 +02:00
import os
2023-04-20 21:01:03 +02:00
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
2023-04-20 21:01:03 +02:00
# load the raw property-sales records
dataset_path = 'data/Property Sales of Melbourne City.csv'
sells = pd.read_csv(dataset_path)
# median sale price: the threshold the binary target column is derived from
price_median = sells.loc[:, 'Price'].median()
2023-04-20 21:01:03 +02:00
def price_above_median(price):
    """Return 1 when *price* is strictly above the dataset median, else 0.

    Relies on the module-level ``price_median``. A missing price (NaN)
    compares as False and therefore yields 0, so callers must exclude rows
    without a price instead of trusting that label.
    """
    return 1 if price > price_median else 0


# prepare column, which will be predicted
sells['PriceAboveMedian'] = sells['Price'].apply(price_above_median)
# delete unnecessary columns and drop rows with NaN values
columns_to_take = ['Rooms', 'Distance', 'Bedroom2', 'Bathroom', 'PriceAboveMedian']
# 'Price' is not among the kept columns, so a plain dropna() would keep rows
# whose price is missing -- and those rows carry a bogus 0 label. Filter them
# out explicitly before dropping the remaining incomplete rows.
has_price = sells['Price'].notna()
sells = sells.loc[has_price, columns_to_take].dropna()
2023-04-20 21:01:03 +02:00
2023-04-21 09:37:13 +02:00
# cut off dataset to fixed number of values
cutoff = 1000
# DataFrame.sample raises ValueError when asked for more rows than exist
# (sampling without replacement), so cap the request at the rows available.
# The seed matches the one used for the train/dev/test splits so the whole
# pipeline produces the same files on every run.
sells = sells.sample(n=min(cutoff, len(sells)), random_state=42)
# split dataset: hold out 20% for testing, then 20% of the remainder for dev
train_and_dev, test_data = train_test_split(
    sells, test_size=0.2, random_state=42
)
train_data, dev_data = train_test_split(
    train_and_dev, test_size=0.2, random_state=42
)
# prepare dataset: names of the input columns and of the label column
target = 'PriceAboveMedian'
features = ['Rooms', 'Distance', 'Bedroom2', 'Bathroom']
2023-04-20 21:01:03 +02:00
# extract feature matrices and label vectors as NumPy arrays
X_train = train_data[features].values
y_train = train_data[target].values
X_dev = dev_data[features].values
y_dev = dev_data[target].values
X_test = test_data[features].values
y_test = test_data[target].values
# normalize values
# The scaler learns each column's min/max from the training set only; the dev
# and test sets are transformed with those same parameters. Re-fitting on each
# subset would scale every subset differently and leak evaluation-set
# statistics into preprocessing. As a consequence, dev/test values may fall
# slightly outside [0, 1] when they exceed the range seen during training.
# MinMaxScaler already works column-wise, so no per-feature loop is needed.
scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train)
X_dev = scaler.transform(X_dev)
X_test = scaler.transform(X_test)
2023-04-20 21:01:03 +02:00
# save subsets to files: first give the arrays column labels
feature_columns = ['Rooms', 'Distance', 'Bedroom2', 'Bathroom']
target_columns = ['PriceAboveMedian']
X_train, X_dev, X_test = (
    pd.DataFrame(matrix, columns=feature_columns)
    for matrix in (X_train, X_dev, X_test)
)
y_train, y_dev, y_test = (
    pd.DataFrame(labels, columns=target_columns)
    for labels in (y_train, y_dev, y_test)
)
2023-04-20 21:01:03 +02:00
# write every subset to its own CSV file, omitting the index column
for frame, filename in (
    (X_train, 'X_train.csv'),
    (X_dev, 'X_val.csv'),
    (X_test, 'X_test.csv'),
    (y_train, 'Y_train.csv'),
    (y_dev, 'Y_val.csv'),
    (y_test, 'Y_test.csv'),
):
    frame.to_csv(filename, index=False)