project_python_rynekNieruch.../home_pricing/DataCollectingScraper/DataScrapers/OtoDomDataScraperImpl.py
2024-02-26 16:54:44 +01:00

95 lines
4.7 KiB
Python

from selenium import webdriver
from selenium.webdriver.common.by import By
from DataCollectingScraper.DataScrapers.IDataScraper import IDataScraper, IDataScraperState
from DataCollectingScraper.helpers.WebDriverWaiter import WebDriverWaiter
from DataCollectingScraper.helpers.SingleOfferCSVWriter import SingleOfferCSVWriter
from DataCollectingScraper.models.SingleOffer import SingleOffer
class OtoDomDataScraperState(IDataScraperState):
    """Progress record for one OtoDom scraping run.

    The scraper mutates this object while it works and hands it back to the
    caller, which allows an interrupted run to be continued by passing the
    same object in again.
    """

    def __init__(self) -> None:
        super().__init__()

        # --- Stage 1: collecting offer links from the listing pages ---
        # Offer URLs gathered so far.
        self._offers_links: list[str] = []
        # Listing page that is being (or will next be) read.
        self._offers_page_number: int = 1
        # Becomes True once the listing pages have all been read.
        self._offers_pages_scrapped: bool = False

        # --- Stage 2: visiting the collected offers ---
        # Offer URLs that have already been attempted.
        self._scrapped_offers_links: list[str] = []

        # --- Final state ---
        # Becomes True once both stages ran to completion.
        self._finished: bool = False
class OtoDomDataScraperImpl(IDataScraper):
    """Selenium scraper for property offers listed on otodom.pl.

    Scraping runs in two stages: first the listing pages are walked to collect
    offer links, then every collected offer is opened and its details are
    written to a CSV file. Progress is kept in an ``OtoDomDataScraperState`` so
    that a run which stopped early can be resumed by passing the returned
    state back into ``scrap_data``.
    """

    def __init__(self, offers_sublink: str = "") -> None:
        """Set up the scraper configuration and page selectors.

        Args:
            offers_sublink: Part of the listing URL appended to the home page
                address (search path, optionally with a query string).
        """
        super().__init__()
        self.name = "OtoDomDataScraperImpl"
        self.output_csv_file = "output.csv"
        self.offers_sublink = offers_sublink
        # Used as the exclusive upper bound of the page range, so the highest
        # page requested is ``pages_amount - 1``.
        # NOTE(review): confirm whether page 275 itself should also be read.
        self.pages_amount = 275

        # XPath marker for the listing page; not referenced by the methods of
        # this class.
        self.offers_list_presence_selector = "//*[contains(text(), 'Wszystkie ogłoszenia')]"
        # CSS selector matching the link of every offer on a listing page.
        self.offers_list_selector = "a[data-cy='listing-item-link']"

        # CSS selectors for the individual fields of a single offer page.
        self.area_detail_selector = "div[aria-label='Powierzchnia'] > div:nth-child(3) > div"
        self.rooms_number_detail_selector = "div[aria-label='Liczba pokoi'] > div:nth-child(3) > div"
        self.floor_number_detail_selector = "div[aria-label='Piętro'] > div:nth-child(3) > div"
        self.property_form_detail_selector = "div[aria-label='Forma własności'] > div:nth-child(3) > div"
        self.state_detail_selector = "div[aria-label='Stan wykończenia'] > div:nth-child(3) > div"
        self.location_detail_selector = "a[aria-label='Adres']"
        # NOTE(review): this selector uses nth-child(2) while the other detail
        # rows use nth-child(3) - verify against the live page markup.
        self.construction_year_detail_selector = "div[aria-label='Rok budowy'] > div:nth-child(2) > div"
        self.price_detail_selector = "strong[aria-label='Cena']"

    @staticmethod
    def get_home_page() -> str:
        """Return the base URL every listing address is built from."""
        return "https://www.otodom.pl/pl/"

    @staticmethod
    def _read_text(driver: webdriver.Chrome, selector: str) -> str:
        """Return the text of the first element matching a CSS selector."""
        return driver.find_element(By.CSS_SELECTOR, selector).text

    def _build_page_url(self, page_number: int) -> str:
        """Return the listing URL for the given page number."""
        base_url = self.get_home_page() + self.offers_sublink
        # Start the query string with "?" when the sublink carries none yet;
        # a bare "&page=" would otherwise be glued onto the URL path.
        separator = "&" if "?" in self.offers_sublink else "?"
        return f"{base_url}{separator}page={page_number}"

    def scrap_one_offer(self, driver: webdriver.Chrome, offer_link: str) -> SingleOffer:
        """Open one offer page and read its details.

        Args:
            driver: Browser session used to load the page.
            offer_link: Address of the offer to open.

        Returns:
            A ``SingleOffer`` whose fields hold the raw element texts.

        Raises:
            Any exception raised by the driver or the waiter (for example when
            one of the expected fields is absent) propagates to the caller.
        """
        single_offer = SingleOffer()
        driver.get(offer_link)
        # The area field is waited for before any detail is read.
        WebDriverWaiter.wait_for(driver, (By.CSS_SELECTOR, self.area_detail_selector))

        single_offer.area = self._read_text(driver, self.area_detail_selector)
        single_offer.rooms_number = self._read_text(driver, self.rooms_number_detail_selector)
        single_offer.floor = self._read_text(driver, self.floor_number_detail_selector)
        single_offer.property_form = self._read_text(driver, self.property_form_detail_selector)
        single_offer.state = self._read_text(driver, self.state_detail_selector)
        single_offer.location = self._read_text(driver, self.location_detail_selector)
        single_offer.construction_year = self._read_text(driver, self.construction_year_detail_selector)
        single_offer.price = self._read_text(driver, self.price_detail_selector)
        return single_offer

    def _collect_offers_links(self, driver: webdriver.Chrome, state: OtoDomDataScraperState) -> None:
        """Walk the listing pages and store every offer link in ``state``."""
        for page_number in range(state._offers_page_number, self.pages_amount):
            # Recorded first so a resumed run restarts at the page that failed.
            state._offers_page_number = page_number
            print(f"Get page: {self.offers_sublink}&page={page_number}")
            driver.get(self._build_page_url(page_number))
            WebDriverWaiter.wait_for(driver, (By.CSS_SELECTOR, self.offers_list_selector))
            anchors = driver.find_elements(By.CSS_SELECTOR, self.offers_list_selector)
            # Gather the whole page before touching the state: if reading a
            # link fails midway, the re-read of this page on resume does not
            # add the same links a second time.
            page_links = [anchor.get_attribute("href") for anchor in anchors]
            state._offers_links.extend(page_links)
        state._offers_pages_scrapped = True

    def _scrap_collected_offers(self, driver: webdriver.Chrome, state: OtoDomDataScraperState) -> None:
        """Visit every not-yet-attempted offer link and save it to the CSV file."""
        # Set for constant-time membership checks; the list kept in the state
        # stays the record that is handed back to the caller.
        attempted = set(state._scrapped_offers_links)
        for offer_link in state._offers_links:
            if offer_link in attempted:
                continue
            print(f"Scrapping offer: {offer_link}")
            # The link is recorded before the offer is opened, so an offer
            # whose page fails is skipped rather than retried on resume.
            attempted.add(offer_link)
            state._scrapped_offers_links.append(offer_link)
            single_offer = self.scrap_one_offer(driver, offer_link)
            SingleOfferCSVWriter.save_to_file(self.output_csv_file, single_offer)

    def scrap_data(
        self,
        driver: webdriver.Chrome,
        state: OtoDomDataScraperState = None,
    ) -> OtoDomDataScraperState:
        """Run (or resume) the scraping process.

        Args:
            driver: Browser session used for all page loads. It is closed only
                when the run completes; after a failure it is left open.
            state: Progress of a previous run to continue from. A fresh state
                is created when none is given.

        Returns:
            The state object. ``state._finished`` is True only if both stages
            completed; otherwise the same state can be passed in again.
        """
        if not state:
            state = OtoDomDataScraperState()
        try:
            if not state._offers_pages_scrapped:
                self._collect_offers_links(driver, state)
            self._scrap_collected_offers(driver, state)
            # NOTE(review): close() closes the current window only; check
            # whether quit() is needed to end the browser session.
            driver.close()
            state._finished = True
        except Exception as error:
            # Best effort: keep the partial state so the caller can resume,
            # but report the cause instead of hiding it. KeyboardInterrupt and
            # SystemExit are deliberately not caught here.
            print(f"{self.name}: scraping stopped early: {error!r}")
        return state