from typing import Any, Callable

from pandas.core.frame import DataFrame
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder


class DataPreprocessor:
    """Scale, encode and filter the columns of a DataFrame.

    Every fitted scikit-learn transformer is stored under the name of the
    column it was fitted on, so the same mapping can later be applied to new
    raw values through :meth:`get_value`.

    The frame handed to the constructor is not copied: the scaling and
    encoding methods write into that same object until a filtering step
    replaces the internal frame with an independent copy.
    """

    def __init__(self, data: DataFrame) -> None:
        """Store *data* and start with an empty transformer registry."""
        self._data = data
        # column name -> fitted StandardScaler / LabelEncoder
        self._transformers = {}

    def remove_outliers_by(self, property: str, lower: float, upper: float) -> None:
        """Keep only rows whose *property* value lies strictly between two quantiles.

        Args:
            property: Name of the column to filter on.
            lower: Lower quantile (between 0 and 1); rows at or below it are dropped.
            upper: Upper quantile (between 0 and 1); rows at or above it are dropped.
        """
        column = self._data[property]
        q_low = column.quantile(lower)
        q_hi = column.quantile(upper)
        keep = (column < q_hi) & (column > q_low)
        # Copy so that later column assignments write to an independent frame
        # rather than to a slice of the previous one (SettingWithCopyWarning).
        self._data = self._data[keep].copy()

    def transform_column_with_standard_scaler(self, column_name: str) -> None:
        """Standardize *column_name* in place and remember the fitted scaler."""
        scaler = StandardScaler()
        # Double brackets: the scaler is given a one-column frame, not a Series.
        self._data[[column_name]] = scaler.fit_transform(self._data[[column_name]])
        self._transformers[column_name] = scaler

    def encode_labels(self, column_name: str) -> None:
        """Replace the values of *column_name* with integer codes and remember the encoder."""
        encoder = LabelEncoder()
        self._data[column_name] = encoder.fit_transform(self._data[column_name])
        self._transformers[column_name] = encoder

    def get_value(self, column_name: str, original_data: Any) -> Any:
        """Apply the transformer fitted for *column_name* to *original_data*.

        The input must have whatever shape the stored transformer's
        ``transform`` accepts (per scikit-learn: 2-D for ``StandardScaler``,
        1-D for ``LabelEncoder``).

        Raises:
            KeyError: If no transformer has been fitted for *column_name*.
        """
        return self._transformers[column_name].transform(original_data)

    def transform_address_to_district(self, row: Any) -> str:
        """Return the fourth-from-last comma-separated element of an address.

        Leading whitespace is removed from the extracted element. An empty
        string is returned when *row* is not a string (for example a missing
        value) or has fewer than four comma-separated elements.
        """
        if not isinstance(row, str):
            return ""
        elements = row.split(',')
        if len(elements) < 4:
            return ""
        return elements[-4].lstrip()

    # Original (misspelled) name kept so existing callers continue to work.
    transorm_address_to_district = transform_address_to_district

    def transform_column(self, column_name: str, function: Callable[[Any], Any]) -> None:
        """Replace *column_name* with the result of applying *function* to each value."""
        self._data[column_name] = self._data[column_name].apply(function)

    # Original (misspelled) name kept so existing callers continue to work.
    trasform_column = transform_column

    def preprocess_data(self) -> None:
        """Run the full pipeline on the stored frame.

        Expects the columns "Area", "Construction year", "Property form",
        "State", "Location" and "Price" to be present.
        """
        self.transform_column_with_standard_scaler("Area")
        self.transform_column_with_standard_scaler("Construction year")
        self.encode_labels("Property form")
        self.encode_labels("State")
        self.transform_column("Location", self.transform_address_to_district)
        # Drop rows for which no district could be extracted; copy so that the
        # following encode step assigns into an independent frame.
        self._data = self._data[self._data['Location'] != ""].copy()
        self.encode_labels("Location")
        self.remove_outliers_by("Price", 0.1, 0.9)
        # NOTE: "Area" is already standardized here, and its scaler was fitted
        # before any outlier rows were removed.
        self.remove_outliers_by("Area", 0.1, 0.9)

    def get_preprocessed_data(self) -> DataFrame:
        """Return the frame in its current state of processing."""
        return self._data