project_python_rynekNieruch.../home_pricing/DataPreprocessor/DataPreprocessor.py
2024-02-26 16:54:44 +01:00

54 lines
2.1 KiB
Python

from pandas.core.frame import DataFrame
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
class DataPreprocessor:
    """Clean, scale and encode a tabular data set held in a DataFrame.

    The instance works on a private copy of the frame passed to the
    constructor. Every transformer fitted along the way is remembered per
    column so that new raw values can later be mapped with ``get_value``.

    ``preprocess_data`` expects the columns ``"Area"``,
    ``"Construction year"``, ``"Property form"``, ``"State"``,
    ``"Location"`` and ``"Price"`` to be present.
    """

    def __init__(self, data: DataFrame) -> None:
        """Store a copy of *data* and start with no fitted transformers."""
        # Copy so the in-place column assignments below never leak into the
        # caller's frame (and never write into a slice of another frame).
        self._data = data.copy()
        # Maps column name -> transformer fitted on that column.
        self._transformers = {}

    def remove_outliers_by(self, property: str, lower: float, upper: float) -> None:
        """Keep only rows whose *property* value lies strictly between two quantiles.

        Args:
            property: Name of the column to filter on. (The name shadows the
                builtin but is kept because callers may pass it by keyword.)
            lower: Lower quantile, e.g. ``0.1``.
            upper: Upper quantile, e.g. ``0.9``.

        Rows equal to either quantile value are dropped as well, and so are
        rows where the value is missing, because both comparisons are strict.
        """
        column = self._data[property]
        q_low = column.quantile(lower)
        q_hi = column.quantile(upper)
        inside = (column < q_hi) & (column > q_low)
        # Explicit copy: later column assignments must not target a slice.
        self._data = self._data[inside].copy()

    def transform_column_with_standard_scaler(self, column_name: str) -> None:
        """Standardise *column_name* in place and remember the fitted scaler."""
        scaler = StandardScaler()
        # The scaler needs 2-D input, hence the double brackets.
        self._data[[column_name]] = scaler.fit_transform(self._data[[column_name]])
        self._transformers[column_name] = scaler

    def encode_labels(self, column_name: str) -> None:
        """Replace the labels in *column_name* with integer codes.

        The fitted ``LabelEncoder`` is stored under *column_name*.
        """
        encoder = LabelEncoder()
        self._data[column_name] = encoder.fit_transform(self._data[column_name])
        self._transformers[column_name] = encoder

    def get_value(self, column_name: str, original_data):
        """Apply the transformer fitted for *column_name* to *original_data*.

        Args:
            column_name: A column previously scaled or encoded by this object.
            original_data: Raw values in the shape the stored transformer
                accepts (2-D for a ``StandardScaler``, 1-D for a
                ``LabelEncoder``).

        Returns:
            Whatever the stored transformer's ``transform`` returns.

        Raises:
            KeyError: If no transformer has been fitted for *column_name*.
        """
        return self._transformers[column_name].transform(original_data)

    def transorm_address_to_district(self, row):
        """Return the fourth comma-separated element from the end of an address.

        Args:
            row: A single address value, normally a string.

        Returns:
            The selected element with surrounding whitespace removed, or
            ``""`` when the value is not a string (e.g. a missing value) or
            has fewer than four comma-separated parts.

        The misspelled method name is kept for backward compatibility.
        """
        # Missing cells arrive as NaN/None; treat them as "no district".
        if not isinstance(row, str):
            return ""
        elements = row.split(',')
        if len(elements) < 4:
            return ""
        # strip() rather than lstrip() so trailing blanks cannot create
        # spurious distinct labels for the same name.
        return elements[-4].strip()

    def trasform_column(self, column_name: str, function) -> None:
        """Replace *column_name* with *function* applied to each of its values.

        The misspelled method name is kept for backward compatibility.
        """
        self._data[column_name] = self._data[column_name].apply(function)

    def preprocess_data(self) -> None:
        """Run the full pipeline: scale, encode, derive the district, drop outliers."""
        # NOTE(review): the scalers are fitted before outliers are removed,
        # so their statistics include the rows dropped at the end.
        self.transform_column_with_standard_scaler("Area")
        self.transform_column_with_standard_scaler("Construction year")
        self.encode_labels("Property form")
        self.encode_labels("State")
        self.trasform_column("Location", self.transorm_address_to_district)
        # Drop rows for which no district could be extracted; copy so the
        # following encode step assigns into an independent frame.
        self._data = self._data[self._data['Location'] != ""].copy()
        self.encode_labels("Location")
        self.remove_outliers_by("Price", 0.1, 0.9)
        self.remove_outliers_by("Area", 0.1, 0.9)

    def get_preprocessed_data(self) -> DataFrame:
        """Return the current state of the working DataFrame."""
        return self._data