add home_pricing application

This commit is contained in:
s1201695 2024-02-26 16:54:44 +01:00
parent fde621ab57
commit 762b979ca9
14 changed files with 2657 additions and 0 deletions

View File

@ -0,0 +1,39 @@
from typing import Any
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from DataCollectingScraper.DataScrapers.IDataScraper import IDataScraper, IDataScraperState
class DataCollectingScraper:
    """Run an ``IDataScraper`` implementation with a Chrome WebDriver.

    The implementation's ``scrap_data`` is called repeatedly, each time with
    the state object returned by the previous call, until that state reports
    ``finished()``. After every unfinished attempt the current browser is
    shut down and replaced with a fresh one.
    """

    def __init__(self, scraper_impl: "IDataScraper") -> None:
        """Store the scraper implementation and start the first browser.

        Args:
            scraper_impl: Object providing ``scrap_data(driver, state)`` and
                ``get_home_page()``.
        """
        self.scraper_impl: "IDataScraper" = scraper_impl
        self.driver: "webdriver.Chrome" = None
        self.instantiate_driver()

    @staticmethod
    def get_driver_options() -> list:
        """Return the command-line switches passed to Chrome."""
        return [
            # "--headless",
            "--no-sandbox",
            "--disable-dev-shm-usage",
        ]

    def instantiate_driver(self) -> None:
        """Create a new Chrome WebDriver and store it in ``self.driver``."""
        options = Options()
        for opt in self.get_driver_options():
            options.add_argument(opt)
        self.driver = webdriver.Chrome(
            service=Service(ChromeDriverManager().install()),
            options=options,
        )

    def _dispose_driver(self) -> None:
        """Shut down the current browser session, tolerating shutdown errors."""
        if self.driver is None:
            return
        try:
            self.driver.quit()
        except Exception as exc:
            # Best-effort cleanup: the session may already be dead, which is
            # exactly the situation in which a replacement is being created.
            print(f"Could not quit the previous driver: {exc}")
        self.driver = None

    def __call__(self, *args: Any, **kwds: Any) -> None:
        """Run the scraper until its state reports completion.

        ``args`` and ``kwds`` are accepted but not used. Nothing happens when
        no driver is available.
        """
        state: "IDataScraperState" = None
        if self.driver is None:
            return
        while True:
            print(f"Calling DataCollectingScraper for {str(self.scraper_impl)} and page {self.scraper_impl.get_home_page()}")
            state = self.scraper_impl.scrap_data(self.driver, state)
            if state.finished():
                break
            # Quit the old browser before replacing it; otherwise every retry
            # leaves an orphaned Chrome/chromedriver process behind.
            self._dispose_driver()
            self.instantiate_driver()

View File

@ -0,0 +1,20 @@
class IDataScraperState:
    """Base progress record passed between successive scraper runs."""

    def __init__(self) -> None:
        """Start in the unfinished state."""
        self._finished: bool = False

    def finished(self) -> bool:
        """Return the current value of the ``_finished`` flag."""
        is_done = self._finished
        return is_done
class IDataScraper:
    """Base class for site-specific scrapers.

    Subclasses set ``name``, override ``get_home_page`` and implement
    ``scrap_data``.
    """

    def __init__(self) -> None:
        """Initialise the scraper with an empty display name."""
        self.name = ""

    def __str__(self) -> str:
        """Return the scraper's display name."""
        return self.name

    @staticmethod
    def get_home_page() -> str:
        """Return the base URL of the scraped site (empty in the base class)."""
        return ""

    def scrap_data(self, driver=None, state=None) -> "IDataScraperState":
        """Scrape data with ``driver``, resuming from ``state`` when given.

        Args:
            driver: Browser driver used for scraping.
            state: State returned by a previous call, or ``None`` for a fresh
                run.

        Returns:
            The state describing how far the scraping got.

        Raises:
            NotImplementedError: Always, in this base class; subclasses must
                override the method.
        """
        raise NotImplementedError(
            f"{type(self).__name__} must implement scrap_data(driver, state)"
        )

View File

@ -0,0 +1,94 @@
from selenium import webdriver
from selenium.webdriver.common.by import By
from DataCollectingScraper.DataScrapers.IDataScraper import IDataScraper, IDataScraperState
from DataCollectingScraper.helpers.WebDriverWaiter import WebDriverWaiter
from DataCollectingScraper.helpers.SingleOfferCSVWriter import SingleOfferCSVWriter
from DataCollectingScraper.models.SingleOffer import SingleOffer
class OtoDomDataScraperState(IDataScraperState):
    """Resumable progress of one OtoDom scraping session."""

    def __init__(self) -> None:
        """Create the state of a session that has not collected anything yet."""
        super().__init__()

        # Stage 1: collecting offer links from the listing pages.
        self._offers_links: list = []
        self._offers_page_number: int = 1
        self._offers_pages_scrapped: bool = False

        # Stage 2: links of the offers that were already visited.
        self._scrapped_offers_links: list = []

        # Final state flag, reported by finished().
        self._finished: bool = False
class OtoDomDataScraperImpl(IDataScraper):
    """Scraper collecting flat offers from otodom.pl into a CSV file.

    Scraping runs in two stages that both record their progress in an
    ``OtoDomDataScraperState``: first the offer links are gathered from the
    listing pages, then every offer is opened and appended to
    ``output_csv_file``.
    """

    def __init__(self, offers_sublink="") -> None:
        """Configure the scraper.

        Args:
            offers_sublink: Path and query appended to the home page URL to
                reach the listing; ``&page=N`` is appended to it per page.
        """
        super().__init__()
        self.name = "OtoDomDataScraperImpl"
        self.output_csv_file = "output.csv"
        self.offers_sublink = offers_sublink
        # Number of listing pages to visit (pages 1..pages_amount).
        self.pages_amount = 275
        # XPath selector; not referenced by the methods of this class.
        self.offers_list_presence_selector = "//*[contains(text(), 'Wszystkie ogłoszenia')]"
        self.offers_list_selector = "a[data-cy='listing-item-link']"
        self.area_detail_selector = "div[aria-label='Powierzchnia'] > div:nth-child(3) > div"
        self.rooms_number_detail_selector = "div[aria-label='Liczba pokoi'] > div:nth-child(3) > div"
        self.floor_number_detail_selector = "div[aria-label='Piętro'] > div:nth-child(3) > div"
        self.property_form_detail_selector = "div[aria-label='Forma własności'] > div:nth-child(3) > div"
        self.state_detail_selector = "div[aria-label='Stan wykończenia'] > div:nth-child(3) > div"
        self.location_detail_selector = "a[aria-label='Adres']"
        # NOTE(review): uses nth-child(2) unlike the other detail selectors - confirm this is intentional.
        self.construction_year_detail_selector = "div[aria-label='Rok budowy'] > div:nth-child(2) > div"
        self.price_detail_selector = "strong[aria-label='Cena']"

    @staticmethod
    def get_home_page() -> str:
        """Return the base URL of the OtoDom site."""
        return "https://www.otodom.pl/pl/"

    def scrap_one_offer(self, driver: webdriver.Chrome, offer_link : str) -> SingleOffer:
        """Open ``offer_link`` and read its details into a ``SingleOffer``.

        Raises whatever the driver raises for a missing element and whatever
        the ``SingleOffer`` setters raise for text they cannot parse.
        """
        single_offer = SingleOffer()
        driver.get(offer_link)
        WebDriverWaiter.wait_for(driver, (By.CSS_SELECTOR, self.area_detail_selector))

        def text_of(selector: str) -> str:
            return driver.find_element(By.CSS_SELECTOR, selector).text

        single_offer.area = text_of(self.area_detail_selector)
        single_offer.rooms_number = text_of(self.rooms_number_detail_selector)
        single_offer.floor = text_of(self.floor_number_detail_selector)
        single_offer.property_form = text_of(self.property_form_detail_selector)
        single_offer.state = text_of(self.state_detail_selector)
        single_offer.location = text_of(self.location_detail_selector)
        single_offer.construction_year = text_of(self.construction_year_detail_selector)
        single_offer.price = text_of(self.price_detail_selector)
        return single_offer

    def _collect_offers_links(self, driver: webdriver.Chrome, state: OtoDomDataScraperState) -> None:
        """Append the offer links of every remaining listing page to ``state``."""
        # The upper bound is inclusive so that the last page is visited too.
        for page_number in range(state._offers_page_number, self.pages_amount + 1):
            # Remember the page in progress so that a retry resumes from it.
            state._offers_page_number = page_number
            print(f"Get page: {self.offers_sublink}&page={page_number}")
            driver.get(self.get_home_page() + self.offers_sublink + f"&page={page_number}")
            WebDriverWaiter.wait_for(driver, (By.CSS_SELECTOR, self.offers_list_selector))
            offers_list = driver.find_elements(By.CSS_SELECTOR, self.offers_list_selector)
            # Built as a whole before extending, so a failure while reading
            # the links does not leave half a page in the state.
            state._offers_links.extend([offer.get_attribute("href") for offer in offers_list])
        state._offers_pages_scrapped = True

    def _scrap_pending_offers(self, driver: webdriver.Chrome, state: OtoDomDataScraperState) -> None:
        """Scrape every collected offer that has not been attempted yet."""
        for offer_link in state._offers_links:
            if offer_link in state._scrapped_offers_links:
                continue
            print(f"Scrapping offer: {offer_link}")
            # Marked before the attempt: an offer that raises is skipped on
            # the next run instead of being retried.
            state._scrapped_offers_links.append(offer_link)
            single_offer = self.scrap_one_offer(driver, offer_link)
            SingleOfferCSVWriter.save_to_file(self.output_csv_file, single_offer)

    def scrap_data(self, driver: webdriver.Chrome, state: OtoDomDataScraperState = None) -> OtoDomDataScraperState:
        """Run or resume the scraping session.

        Args:
            driver: Browser used for all page loads; its window is closed
                once everything has been scraped.
            state: State returned by an earlier call, or ``None`` to start
                from scratch.

        Returns:
            The (possibly new) state. ``finished()`` is true only when both
            stages completed; after an error the partially filled state is
            returned so the caller can retry.
        """
        if state is None:
            state = OtoDomDataScraperState()
        try:
            if not state._offers_pages_scrapped:
                self._collect_offers_links(driver, state)
            self._scrap_pending_offers(driver, state)
            driver.close()
            state._finished = True
        except Exception as exc:
            # Errors stay non-fatal so the caller can retry with the saved
            # state, but they are reported, and KeyboardInterrupt/SystemExit
            # are no longer swallowed.
            print(f"Scraping interrupted, state kept for retry: {exc!r}")
        return state

View File

@ -0,0 +1,13 @@
from DataCollectingScraper.models.SingleOffer import SingleOffer
import csv
class SingleOfferCSVWriter:
    """Append offers to a CSV file, one row per call."""

    @staticmethod
    def save_to_file(file_name: str, offer: "SingleOffer"):
        """Append ``offer`` to ``file_name``, writing the header first if the file is empty.

        Args:
            file_name: Path of the CSV file; created when missing.
            offer: Object providing ``get_columns_names()`` and
                ``get_dict_repr()``.
        """
        # Explicit UTF-8 keeps Polish characters readable regardless of the
        # platform's locale encoding.
        with open(file_name, mode="a", newline="", encoding="utf-8") as file:
            writer = csv.DictWriter(file, fieldnames=offer.get_columns_names())
            # In append mode the position starts at the end of the file, so 0
            # means the file is empty and still needs its header row.
            if file.tell() == 0:
                writer.writeheader()
            writer.writerow(offer.get_dict_repr())

View File

@ -0,0 +1,16 @@
from typing import Tuple
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
class WebDriverWaiter:
    """Helper for waiting until an element appears on the page."""

    @staticmethod
    def wait_for(driver: webdriver.Chrome, locator: Tuple[str, str], delay: float = 10) -> bool:
        """Wait until the element described by ``locator`` is present.

        Args:
            driver: Browser whose current page is inspected.
            locator: ``(strategy, value)`` pair handed to
                ``presence_of_element_located``.
            delay: Maximum waiting time in seconds.

        Returns:
            ``True`` when the element appeared in time, ``False`` after a
            timeout. A timeout is reported on stdout and not raised.
        """
        try:
            WebDriverWait(driver, delay).until(EC.presence_of_element_located(locator))
        except TimeoutException:
            print("Timeout")
            return False
        return True

View File

@ -0,0 +1,118 @@
class SingleOffer:
    """One scraped property offer.

    Every field is exposed as a property whose setter accepts the raw text
    read from the offer page and stores the parsed value.
    """

    def __init__(self):
        """Create an offer with every field unset (``None``)."""
        self._area : float = None
        self._rooms_number : int = None
        self._floor : int = None
        self._property_form : str = None
        self._state : str = None
        self._location : str = None
        self._construction_year : int = None
        self._price : float = None

    def __str__(self) -> str:
        """Return all fields as one space-separated line."""
        parts = [
            f"Area: {self.area} ",
            f"rooms number: {self.rooms_number} ",
            f"floor: {self.floor} ",
            f"property form: {self.property_form} ",
            f"state: {self.state} ",
            f"location: {self.location} ",
            f"construction year: {self.construction_year} ",
            f"price: {self.price} ",
        ]
        return "".join(parts)

    @staticmethod
    def get_columns_names() -> list:
        """Return the CSV column names, in output order."""
        return ["Area", "Rooms", "Floor", "Property form", "State", "Location", "Construction year", "Price"]

    def get_dict_repr(self) -> dict:
        """Return the offer as a dict keyed by the CSV column names."""
        return {
            "Area": self.area,
            "Rooms": self.rooms_number,
            "Floor": self.floor,
            "Property form": self.property_form,
            "State": self.state,
            "Location": self.location,
            "Construction year": self.construction_year,
            "Price": self.price
        }

    @staticmethod
    def _parse_decimal(text: str) -> float:
        """Parse a number that may carry a unit, spaces or a decimal comma.

        Only ASCII digits, ``-``, ``,`` and ``.`` are kept, so unit suffixes
        and any kind of (non-breaking) space used as a thousands separator
        are dropped; ``,`` is then treated as the decimal point.

        Raises:
            ValueError: If what remains is not a valid number.
        """
        kept = "".join(ch for ch in text if ch in "0123456789-,.")
        return float(kept.replace(",", "."))

    # Area
    @property
    def area(self):
        """Area as a float, or ``None`` when unset."""
        return self._area

    @area.setter
    def area(self, area : str):
        self._area = self._parse_decimal(area)

    # Rooms number
    @property
    def rooms_number(self):
        """Number of rooms as an int, or ``None`` when unset."""
        return self._rooms_number

    @rooms_number.setter
    def rooms_number(self, rooms_number : str):
        self._rooms_number = int(rooms_number)

    # Floor
    @property
    def floor(self):
        """Floor number as an int (``parter`` maps to 0), or ``None`` when unset."""
        return self._floor

    @floor.setter
    def floor(self, floor : str):
        # Only the part before "/" is used; the rest of the text is ignored.
        floor_str = floor.partition("/")[0].strip()
        if floor_str == "parter":
            floor_str = "0"
        self._floor = int(floor_str)

    # Property form
    @property
    def property_form(self):
        """Form of ownership, stored as given."""
        return self._property_form

    @property_form.setter
    def property_form(self, property_form : str):
        self._property_form = property_form

    # State
    @property
    def state(self):
        """Finishing state, stored as given."""
        return self._state

    @state.setter
    def state(self, state : str):
        self._state = state

    # Location
    @property
    def location(self):
        """Address text, stored as given."""
        return self._location

    @location.setter
    def location(self, location : str):
        self._location = location

    # Construction year
    @property
    def construction_year(self):
        """Construction year as an int, or ``None`` when unset."""
        return self._construction_year

    @construction_year.setter
    def construction_year(self, construction_year : str):
        self._construction_year = int(construction_year)

    # Price
    @property
    def price(self):
        """Price as a float, or ``None`` when unset."""
        return self._price

    @price.setter
    def price(self, price : str):
        self._price = self._parse_decimal(price)

View File

@ -0,0 +1,53 @@
from pandas.core.frame import DataFrame
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
class DataPreprocessor:
    """Prepare the scraped offers table for model training.

    Fitted scalers and encoders are kept per column so that the same
    transformation can later be applied to new values through ``get_value``.
    """

    def __init__(self, data : DataFrame) -> None:
        """Take a private copy of ``data``.

        Working on a copy keeps the caller's frame untouched; previously it
        was modified in place up to the first filtering step only.
        """
        self._data = data.copy()
        self._transformers = {}

    def remove_outliers_by(self, property : str, lower : float, upper: float) -> None:
        """Keep the rows whose ``property`` value lies strictly between two quantiles.

        Args:
            property: Column name.
            lower: Lower quantile, e.g. ``0.1``.
            upper: Upper quantile, e.g. ``0.9``.
        """
        column = self._data[property]
        q_low = column.quantile(lower)
        q_hi = column.quantile(upper)
        # Copy so that later column assignments do not target a slice.
        self._data = self._data[(column < q_hi) & (column > q_low)].copy()

    def transform_column_with_standard_scaler(self, column_name : str):
        """Standardise ``column_name`` in place and remember the fitted scaler."""
        scaler = StandardScaler()
        scaler.fit(self._data[[column_name]])
        self._data[[column_name]] = scaler.transform(self._data[[column_name]])
        self._transformers[column_name] = scaler

    def encode_labels(self, column_name: str) -> None:
        """Replace the labels of ``column_name`` by integer codes and remember the encoder."""
        encoder = LabelEncoder()
        encoder.fit(self._data[column_name])
        self._data[column_name] = encoder.transform(self._data[column_name])
        self._transformers[column_name] = encoder

    def get_value(self, column_name, original_data):
        """Apply the transformer fitted for ``column_name`` to ``original_data``.

        Raises:
            KeyError: If no transformer was fitted for ``column_name``.
        """
        return self._transformers[column_name].transform(original_data)

    def transorm_address_to_district(self, row):
        """Return the fourth comma-separated element counted from the end of an address.

        Returns an empty string when ``row`` is not a string (for example a
        missing value) or has fewer than four elements.
        """
        if not isinstance(row, str):
            return ""
        elements = row.split(',')
        if len(elements) < 4:
            return ""
        return elements[-4].lstrip()

    def trasform_column(self, column_name, function):
        """Replace every value of ``column_name`` by ``function(value)``."""
        self._data[column_name] = self._data[column_name].apply(function)

    def preprocess_data(self) -> None:
        """Run the whole preprocessing pipeline on the stored frame.

        Scales "Area" and "Construction year", label-encodes "Property form",
        "State" and "Location" (after reducing the address to a district and
        dropping rows without one), then removes "Price" and "Area" outliers
        outside the 0.1-0.9 quantile range.
        """
        self.transform_column_with_standard_scaler("Area")
        self.transform_column_with_standard_scaler("Construction year")
        self.encode_labels("Property form")
        self.encode_labels("State")
        self.trasform_column("Location", self.transorm_address_to_district)
        # Copy so that the following encode step does not assign into a slice.
        self._data = self._data[self._data['Location'] != ""].copy()
        self.encode_labels("Location")
        self.remove_outliers_by("Price", 0.1, 0.9)
        self.remove_outliers_by("Area", 0.1, 0.9)

    def get_preprocessed_data(self) -> DataFrame:
        """Return the frame in its current state of preprocessing."""
        return self._data

View File

@ -0,0 +1,7 @@
import pandas as pd
from pandas.core.frame import DataFrame
class OffersCSVReader:
    """Load scraped offers from a CSV file."""

    @staticmethod
    def read_from_file(file_name: str) -> DataFrame:
        """Parse ``file_name`` with pandas' default CSV settings and return the frame."""
        offers = pd.read_csv(file_name)
        return offers

View File

@ -0,0 +1,35 @@
from pandas.core.frame import DataFrame
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import mean_squared_error
import math
class PredictionModelTrainer:
    """Fit an ``MLPRegressor`` that predicts the "Price" column."""

    def __init__(self, preprocessed_data : DataFrame) -> None:
        """Keep the training frame; no model exists until ``train`` is called."""
        self.data_ : DataFrame = preprocessed_data
        self.trained_model_ : MLPRegressor = None

    def train(self) -> None:
        """Fit the network on 80% of the rows and print the error on the other 20%."""
        print("Training home pricing model with MLPRegressor")

        # Every column except "Price" is a feature; "Price" is the target.
        features = self.data_.drop(columns=['Price'])
        target = self.data_['Price']
        features_train, features_test, target_train, target_test = train_test_split(
            features,
            target,
            test_size=0.2,
            random_state=5,
        )

        network_settings = {
            "activation": "relu",
            "hidden_layer_sizes": (10, 80, 200),
            "max_iter": 2000,
            "random_state": 5,
            "alpha": 0.01,
            "solver": "lbfgs",
        }
        self.trained_model_ = MLPRegressor(**network_settings)
        self.trained_model_.fit(features_train, target_train)

        # Evaluate on the held-out rows.
        predicted = self.trained_model_.predict(features_test)
        mse = mean_squared_error(target_test, predicted)
        print("Mean Squared Error:", mse)
        rmse = math.sqrt(mse)
        print(f"Root mean squared error: {rmse}")
        print(f"Model on avaerage is wrong by {round(rmse, 2)} PLN")

    def get_trained_model(self) -> MLPRegressor:
        """Return the fitted model, or ``None`` when ``train`` has not run."""
        return self.trained_model_

View File

@ -0,0 +1,216 @@
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>Home pricing tool</title>
<style>
body {
font-family: Arial, sans-serif;
background-color: #f2f2f2;
padding: 20px;
}
h2 {
color: #333;
}
form {
background-color: #fff;
padding: 20px;
border-radius: 10px;
box-shadow: 0 0 10px rgba(0, 0, 0, 0.1);
max-width: 400px;
margin: 0 auto;
}
label {
display: block;
margin-bottom: 5px;
color: #555;
}
input[type="text"],
select {
width: 100%;
padding: 10px;
margin-bottom: 15px;
border: 1px solid #ccc;
border-radius: 5px;
box-sizing: border-box; /* Ensure padding and border are included in element's total width and height */
}
input[type="button"] {
background-color: #4CAF50;
color: white;
padding: 15px 30px; /* Adjusted padding to make it larger */
border: none;
border-radius: 5px;
cursor: pointer;
transition: background-color 0.3s;
display: block; /* Centering the button */
margin: 0 auto; /* Centering the button */
}
input[type="button"]:hover {
background-color: #45a049;
}
#response {
margin-top: 20px;
border: 1px solid #ccc;
padding: 20px;
border-radius: 5px;
}
#response h3 {
color: #333;
margin-top: 0;
}
#response pre {
background-color: #f9f9f9;
padding: 10px;
border-radius: 5px;
border: 1px solid #ccc;
font-size: 14px;
}
</style>
</head>
<body>
<h2>Home pricing tool</h2>
<form id="propertyForm" action="#" method="post">
<label for="powierzchnia">Powierzchnia:</label>
<input type="text" id="powierzchnia" name="powierzchnia">
<label for="rok_budowy">Rok budowy:</label>
<input type="text" id="rok_budowy" name="rok_budowy">
<label for="ilosc_pokoi">Ilość pokoi:</label>
<input type="text" id="ilosc_pokoi" name="ilosc_pokoi">
<label for="numer_pietra">Numer piętra:</label>
<input type="text" id="numer_pietra" name="numer_pietra">
<label for="forma_wlasnosci">Forma własności:</label>
<select id="forma_wlasnosci" name="forma_wlasnosci">
<option value="pełna własność">pełna własność</option>
<option value="spółdzielcze wł. prawo do lokalu">spółdzielcze wł. prawo do lokalu</option>
</select>
<label for="dzielnica">Dzielnica:</label>
<select id="dzielnica" name="dzielnica">
<option value="Bartoszowice">Bartoszowice</option>
<option value="Bieńkowice">Bieńkowice</option>
<option value="Biskupin">Biskupin</option>
<option value="Borek">Borek</option>
<option value="Brochów">Brochów</option>
<option value="Dąbie">Dąbie</option>
<option value="Gaj">Gaj</option>
<option value="Gajowice">Gajowice</option>
<option value="Grabiszyn">Grabiszyn</option>
<option value="Grabiszynek">Grabiszynek</option>
<option value="Gądów">Gądów</option>
<option value="Huby">Huby</option>
<option value="Iwiny">Iwiny</option>
<option value="Jagodno">Jagodno</option>
<option value="Karłowice">Karłowice</option>
<option value="Klecina">Klecina</option>
<option value="Kleczków">Kleczków</option>
<option value="Kowale">Kowale</option>
<option value="Kozanów">Kozanów</option>
<option value="Krzyki">Krzyki</option>
<option value="Księże">Księże</option>
<option value="Kuźniki">Kuźniki</option>
<option value="Leśnica">Leśnica</option>
<option value="Ligota">Ligota</option>
<option value="Lipa Piotrowska">Lipa Piotrowska</option>
<option value="Maślice">Maślice</option>
<option value="Muchobór Mały">Muchobór Mały</option>
<option value="Muchobór Wielki">Muchobór Wielki</option>
<option value="Nadodrze">Nadodrze</option>
<option value="Nowy Dwór">Nowy Dwór</option>
<option value="Oporów">Oporów</option>
<option value="Osobowice">Osobowice</option>
<option value="Ołbin">Ołbin</option>
<option value="Ołtaszyn">Ołtaszyn</option>
<option value="Partynice">Partynice</option>
<option value="Pawłowice">Pawłowice</option>
<option value="Pilczyce">Pilczyce</option>
<option value="Plac Grunwaldzki">Plac Grunwaldzki</option>
<option value="Polanowice">Polanowice</option>
<option value="Popowice Północne">Popowice Północne</option>
<option value="Powstańców Śląskich">Powstańców Śląskich</option>
<option value="Poświętne">Poświętne</option>
<option value="Pracze Odrzańskie">Pracze Odrzańskie</option>
<option value="Przedmieście Oławskie">Przedmieście Oławskie</option>
<option value="Przedmieście Świdnickie">Przedmieście Świdnickie</option>
<option value="Psie Pole">Psie Pole</option>
<option value="Radomierzyce">Radomierzyce</option>
<option value="Radwanice">Radwanice</option>
<option value="Różanka">Różanka</option>
<option value="Sołtysowice">Sołtysowice</option>
<option value="Stare Miasto">Stare Miasto</option>
<option value="Strachocin">Strachocin</option>
<option value="Strachowice">Strachowice</option>
<option value="Swojczyce">Swojczyce</option>
<option value="Szczepin">Szczepin</option>
<option value="Szczytniki">Szczytniki</option>
<option value="Sępolno">Sępolno</option>
<option value="Tarnogaj">Tarnogaj</option>
<option value="Widawa">Widawa</option>
<option value="Wilczyce">Wilczyce</option>
<option value="Wojszyce">Wojszyce</option>
<option value="Wysoka">Wysoka</option>
<option value="Zawidawie">Zawidawie</option>
<option value="Żerniki">Żerniki</option>
</select>
<label for="stan_nieruchomosci">Stan nieruchomości:</label>
<select id="stan_nieruchomosci" name="stan_nieruchomosci">
<option value="do remontu">do remontu</option>
<option value="do wykończenia">do wykończenia</option>
<option value="do zamieszkania">do zamieszkania</option>
</select>
<input type="button" value="Wylicz cenę" onclick="submitForm()">
</form>
<div id="response">
<h3>Spodziewana cena: </h3>
<div id="response_value"></div>
</div>
<script>
/**
 * Read the form fields, POST them as JSON to the pricing service and show
 * the returned estimate in #response_value. When the request fails or the
 * service answers with an error status, a message is shown there instead.
 */
function submitForm() {
    // The ids of the form controls double as the keys of the JSON payload.
    const fieldIds = [
        "powierzchnia",
        "rok_budowy",
        "ilosc_pokoi",
        "numer_pietra",
        "forma_wlasnosci",
        "dzielnica",
        "stan_nieruchomosci"
    ];
    const formData = {};
    fieldIds.forEach((id) => {
        formData[id] = document.getElementById(id).value;
    });

    const output = document.getElementById("response_value");
    const url = 'http://localhost:8081/calculate_price';

    fetch(url, {
        method: 'POST',
        headers: {
            'Content-Type': 'application/json',
        },
        body: JSON.stringify(formData),
    })
        .then((response) => {
            // fetch() only rejects on network failures, so HTTP error
            // statuses have to be turned into errors explicitly.
            if (!response.ok) {
                throw new Error('HTTP ' + response.status);
            }
            return response.json();
        })
        .then((data) => {
            output.textContent = data["estimated_price"] + " PLN";
            console.log(data["estimated_price"]);
        })
        .catch((error) => {
            console.error('Error:', error);
            // Tell the user as well; previously a failure was only logged.
            output.textContent = "Nie udało się wyliczyć ceny";
        });
}
</script>
</body>
</html>

46
home_pricing/main.py Normal file
View File

@ -0,0 +1,46 @@
from DataCollectingScraper.DataScrapers.OtoDomDataScraperImpl import OtoDomDataScraperImpl
from DataCollectingScraper.DataCollectingScraper import DataCollectingScraper
from DataPreprocessor.helpers.OffersCSVReader import OffersCSVReader
from DataPreprocessor.DataPreprocessor import DataPreprocessor
from Prediction.Trainer.PredictionModelTrainer import PredictionModelTrainer
from sklearn.neural_network import MLPRegressor
from pandas.core.frame import DataFrame
import pandas as pd
import joblib
# Pipeline switches: enable to re-scrape the offers and/or re-train the model.
download_data = False
train_model = False

# Downloading raw data
if download_data:
    offers_sublink = "wyniki/sprzedaz/mieszkanie/dolnoslaskie/wroclaw/wroclaw/wroclaw?viewType=listing"
    scraper = DataCollectingScraper(OtoDomDataScraperImpl(offers_sublink))
    scraper()

# Reading downloaded data
data_frame : DataFrame = OffersCSVReader.read_from_file("output.csv")

# Prepare data for neural network (data preprocessing). This always runs,
# because the fitted scalers/encoders are needed below to transform the sample.
data_preprocessor = DataPreprocessor(data_frame)
data_preprocessor.preprocess_data()

if train_model:
    preprocessed_data : DataFrame = data_preprocessor.get_preprocessed_data()
    # Train neural network with preprocessed data
    trainer = PredictionModelTrainer(preprocessed_data)
    trainer.train()
    trained_model : MLPRegressor = trainer.get_trained_model()
    joblib.dump(trained_model, 'trained_model.pkl')

# NOTE: joblib.load unpickles the file; only load model files you trust.
trained_model = joblib.load('trained_model.pkl')

# get_value returns arrays (2-D for the scalers, 1-D for the label encoder).
# Unwrap them to plain scalars so every DataFrame cell holds a number rather
# than a one-element array.
scaled_area = float(data_preprocessor.get_value('Area', pd.DataFrame({'Area': [56.0]}))[0][0])
scaled_construction_year = float(
    data_preprocessor.get_value('Construction year', pd.DataFrame({'Construction year': [1980]}))[0][0]
)
encoded_location = int(data_preprocessor.get_value("Location", ['Krzyki'])[0])

sample_data = [[scaled_area, 3, 8, 0, 2, encoded_location, scaled_construction_year]]
sample = pd.DataFrame(sample_data, columns=['Area', 'Rooms', 'Floor', 'Property form' , 'State', 'Location', 'Construction year'])
prediction = trained_model.predict(sample)
# predict returns one value per row; take the single element explicitly
# instead of converting the whole array to a float.
print('Predicted price: ', round(float(prediction[0]), 0), 'PLN')

1954
home_pricing/output.csv Normal file

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,46 @@
from flask import Flask, request, jsonify
from flask_cors import CORS
import joblib
import pandas as pd
from DataPreprocessor.helpers.OffersCSVReader import OffersCSVReader
from DataPreprocessor.DataPreprocessor import DataPreprocessor
from pandas.core.frame import DataFrame
# Module-level setup, executed once when this file is imported or run.
app = Flask(__name__)
CORS(app) # This will enable CORS for all routes
# Reading downloaded data
data_frame : DataFrame = OffersCSVReader.read_from_file("output.csv")
# Prepare data for neural network (data preprocessing).
# The fitted transformers kept by the preprocessor are used by
# calculate_price, through get_value, to transform the request values.
data_preprocessor = DataPreprocessor(data_frame)
data_preprocessor.preprocess_data()
# NOTE(review): joblib.load unpickles the file - only load a model file you
# trust. Presumably this is the file written by main.py's training step.
trained_model = joblib.load('trained_model.pkl')
@app.route('/calculate_price', methods=['POST'])
def calculate_price():
    """Estimate a price for the property described in the JSON request body.

    Expected keys: ``powierzchnia`` (area), ``rok_budowy`` (construction
    year), ``ilosc_pokoi`` (rooms), ``numer_pietra`` (floor),
    ``forma_wlasnosci`` (property form), ``dzielnica`` (district) and
    ``stan_nieruchomosci`` (state).

    Returns:
        ``{"estimated_price": <float>}`` on success, or
        ``{"error": <message>}`` with status 400 when the body is not a JSON
        object, a key is missing, a number cannot be parsed or a label is
        unknown to the fitted encoders.
    """
    input_data = request.get_json(silent=True)
    if not isinstance(input_data, dict):
        return jsonify({'error': 'Request body must be a JSON object'}), 400

    try:
        # The form sends every field as text; parse the numbers here and
        # accept a decimal comma for the area.
        area = float(str(input_data["powierzchnia"]).replace(",", "."))
        construction_year = float(input_data["rok_budowy"])
        rooms = int(input_data['ilosc_pokoi'])
        floor = int(input_data['numer_pietra'])

        # get_value returns arrays (2-D for scalers, 1-D for label encoders);
        # unwrap them so each DataFrame cell holds a plain number.
        scaled_area = float(data_preprocessor.get_value('Area', pd.DataFrame({'Area': [area]}))[0][0])
        scaled_construction_year = float(
            data_preprocessor.get_value('Construction year', pd.DataFrame({'Construction year': [construction_year]}))[0][0]
        )
        encoded_location = int(data_preprocessor.get_value("Location", [input_data["dzielnica"]])[0])
        encoded_state = int(data_preprocessor.get_value("State", [input_data["stan_nieruchomosci"]])[0])
        encoded_property_form = int(data_preprocessor.get_value("Property form", [input_data["forma_wlasnosci"]])[0])
    except (KeyError, TypeError, ValueError) as error:
        return jsonify({'error': f'Invalid input: {error}'}), 400

    sample_data = [[scaled_area, rooms, floor, encoded_property_form, encoded_state, encoded_location, scaled_construction_year]]
    sample = pd.DataFrame(sample_data, columns=['Area', 'Rooms', 'Floor', 'Property form' , 'State', 'Location', 'Construction year'])
    prediction = trained_model.predict(sample)
    calculated_price = {
        # predict returns one value per row; take the single element.
        'estimated_price': round(float(prediction[0]), 0)
    }
    return jsonify(calculated_price)
if __name__ == '__main__':
    import os

    # NOTE(review): Flask's debug mode enables the interactive debugger and
    # must never be reachable from an untrusted network. It stays on by
    # default to preserve the existing behaviour; set HOME_PRICING_DEBUG=0
    # to switch it off.
    debug_enabled = os.environ.get("HOME_PRICING_DEBUG", "1") != "0"
    # The port is fixed because the HTML form posts to localhost:8081.
    app.run(debug=debug_enabled, port=8081)

Binary file not shown.