54 lines
2.1 KiB
Python
54 lines
2.1 KiB
Python
from pandas.core.frame import DataFrame
|
|
|
|
from sklearn.preprocessing import StandardScaler
|
|
from sklearn.preprocessing import LabelEncoder
|
|
|
|
class DataPreprocessor:
    """Scale, label-encode and outlier-trim the columns of a DataFrame.

    The frame handed to the constructor is stored by reference (not copied),
    so column transformations applied before any row filtering are also
    visible through the caller's own reference. Every fitted scaler/encoder
    is remembered under its column name so the same mapping can later be
    applied to new values with ``get_value``.
    """

    def __init__(self, data: DataFrame) -> None:
        """Store ``data`` and start with an empty transformer registry."""
        self._data = data
        # Column name -> fitted transformer (StandardScaler or LabelEncoder).
        self._transformers: dict = {}

    def remove_outliers_by(self, property: str, lower: float, upper: float) -> None:
        """Keep only rows whose ``property`` value lies strictly between two quantiles.

        Args:
            property: Name of the column to filter on. (The name shadows the
                builtin but is kept so keyword callers continue to work.)
            lower: Quantile (as passed to ``Series.quantile``) whose value is
                the exclusive lower bound.
            upper: Quantile whose value is the exclusive upper bound.
        """
        values = self._data[property]
        q_low = values.quantile(lower)
        q_hi = values.quantile(upper)
        keep = (values > q_low) & (values < q_hi)
        # Take an explicit copy: later column assignments must write to an
        # independent frame, not to a slice of the previous one (which makes
        # pandas emit SettingWithCopyWarning).
        self._data = self._data[keep].copy()

    def transform_column_with_standard_scaler(self, column_name: str) -> None:
        """Standardize ``column_name`` in place and remember the fitted scaler."""
        scaler = StandardScaler()
        # Double brackets keep the selection two-dimensional, which is the
        # shape the scaler is fitted on here.
        self._data[[column_name]] = scaler.fit_transform(self._data[[column_name]])
        self._transformers[column_name] = scaler

    def encode_labels(self, column_name: str) -> None:
        """Replace the values of ``column_name`` with integer labels.

        The fitted ``LabelEncoder`` is stored under ``column_name``.
        """
        encoder = LabelEncoder()
        self._data[column_name] = encoder.fit_transform(self._data[column_name])
        self._transformers[column_name] = encoder

    def get_value(self, column_name, original_data):
        """Apply the transformer fitted for ``column_name`` to ``original_data``.

        Args:
            column_name: Column whose stored scaler/encoder should be used.
            original_data: Raw values, in whatever shape the stored
                transformer's ``transform`` accepts.

        Returns:
            The result of the stored transformer's ``transform``.

        Raises:
            KeyError: If no transformer has been fitted for ``column_name``.
        """
        return self._transformers[column_name].transform(original_data)

    def transorm_address_to_district(self, row) -> str:
        """Return the fourth comma-separated field from the end of an address.

        Leading whitespace of that field is stripped. An empty string is
        returned when the address has fewer than four fields or is not a
        string at all (e.g. a missing value), so such rows can be filtered
        out afterwards instead of raising inside ``apply``.
        """
        if not isinstance(row, str):
            return ""
        elements = row.split(',')
        if len(elements) < 4:
            return ""
        return elements[-4].lstrip()

    def trasform_column(self, column_name, function) -> None:
        """Replace ``column_name`` with ``function`` applied to each of its values."""
        self._data[column_name] = self._data[column_name].apply(function)

    # Correctly spelled aliases. The original (misspelled) names stay the
    # canonical definitions so existing callers and overrides keep working.
    transform_address_to_district = transorm_address_to_district
    transform_column = trasform_column

    def preprocess_data(self) -> None:
        """Run the full, fixed preprocessing pipeline on the stored frame.

        Steps, in order: standardize "Area" and "Construction year"; label
        encode "Property form" and "State"; reduce "Location" to the field
        extracted by ``transorm_address_to_district``, drop rows where that
        field is empty and label encode the rest; finally drop rows outside
        the 0.1-0.9 quantile range of "Price" and then of "Area".
        """
        self.transform_column_with_standard_scaler("Area")
        self.transform_column_with_standard_scaler("Construction year")
        self.encode_labels("Property form")
        self.encode_labels("State")
        self.trasform_column("Location", self.transorm_address_to_district)
        # Copy the filtered frame so the encoder below assigns into an
        # independent frame rather than a slice of the unfiltered one.
        self._data = self._data[self._data['Location'] != ""].copy()
        self.encode_labels("Location")

        # NOTE(review): the "Area" scaler above was fitted before these rows
        # are removed, so its statistics include the trimmed rows - confirm
        # this order is intended.
        self.remove_outliers_by("Price", 0.1, 0.9)
        self.remove_outliers_by("Area", 0.1, 0.9)

    def get_preprocessed_data(self) -> DataFrame:
        """Return the current (possibly transformed and filtered) frame."""
        return self._data
|