project_python_rynekNieruch.../home_pricing/DataCollectingScraper/DataScrapers/OtoDomDataScraperImpl.py

95 lines
4.7 KiB
Python
Raw Normal View History

2024-02-26 16:54:44 +01:00
from selenium import webdriver
from selenium.webdriver.common.by import By
from DataCollectingScraper.DataScrapers.IDataScraper import IDataScraper, IDataScraperState
from DataCollectingScraper.helpers.WebDriverWaiter import WebDriverWaiter
from DataCollectingScraper.helpers.SingleOfferCSVWriter import SingleOfferCSVWriter
from DataCollectingScraper.models.SingleOffer import SingleOffer
class OtoDomDataScraperState(IDataScraperState):
    """Progress record for one OtoDom scraping run.

    ``OtoDomDataScraperImpl.scrap_data`` fills this object in while it works
    and returns it, so a run that stopped early can be continued by passing
    the same state back in.
    """

    def __init__(self) -> None:
        super().__init__()

        # Stage 1 - collecting offer links from the listing pages.
        # Offer URLs (the ``href`` of every listing item) gathered so far.
        self._offers_links: list = list()
        # Listing page most recently requested; numbering starts at 1.
        self._offers_page_number: int = 1
        # Set to True once the loop over listing pages has completed.
        self._offers_pages_scrapped: bool = False

        # Stage 2 - visiting the collected offers.
        # Offer URLs for which a scrape has already been started.
        self._scrapped_offers_links: list = list()

        # Final state: True only after both stages ran to the end.
        self._finished: bool = False
class OtoDomDataScraperImpl(IDataScraper):
    """Scraper that collects offers from otodom.pl and appends them to a CSV.

    The work is done in two stages, both tracked in an
    ``OtoDomDataScraperState`` so that an interrupted run can be resumed:

    1. walk the listing pages and collect the link of every offer,
    2. open every collected link, read the offer details and save them.
    """

    def __init__(self, offers_sublink: str = "") -> None:
        """Store the listing sub-link and the selectors used on the site.

        Args:
            offers_sublink: Part of the listing URL that is appended to the
                home page address; ``&page=<n>`` is appended after it.
        """
        super().__init__()
        self.name = "OtoDomDataScraperImpl"
        self.output_csv_file = "output.csv"
        self.offers_sublink = offers_sublink
        # Exclusive upper bound of the listing-page loop in scrap_data().
        # NOTE(review): range(start, pages_amount) never requests page
        # number `pages_amount` itself - confirm this is intended.
        self.pages_amount = 275

        # Listing page selectors.
        self.offers_list_presence_selector = "//*[contains(text(), 'Wszystkie ogłoszenia')]"
        self.offers_list_selector = "a[data-cy='listing-item-link']"

        # Offer detail page selectors.
        self.area_detail_selector = "div[aria-label='Powierzchnia'] > div:nth-child(3) > div"
        self.rooms_number_detail_selector = "div[aria-label='Liczba pokoi'] > div:nth-child(3) > div"
        self.floor_number_detail_selector = "div[aria-label='Piętro'] > div:nth-child(3) > div"
        self.property_form_detail_selector = "div[aria-label='Forma własności'] > div:nth-child(3) > div"
        self.state_detail_selector = "div[aria-label='Stan wykończenia'] > div:nth-child(3) > div"
        self.location_detail_selector = "a[aria-label='Adres']"
        # NOTE(review): this is the only detail selector using nth-child(2)
        # instead of nth-child(3) - verify against the live page markup.
        self.construction_year_detail_selector = "div[aria-label='Rok budowy'] > div:nth-child(2) > div"
        self.price_detail_selector = "strong[aria-label='Cena']"

    @staticmethod
    def get_home_page() -> str:
        """Return the base URL every listing address is built from."""
        return "https://www.otodom.pl/pl/"

    @staticmethod
    def _read_text(driver: webdriver.Chrome, css_selector: str) -> str:
        """Return the text of the first element matching ``css_selector``."""
        return driver.find_element(By.CSS_SELECTOR, css_selector).text

    def scrap_one_offer(self, driver: webdriver.Chrome, offer_link: str) -> SingleOffer:
        """Open one offer page and read its details into a ``SingleOffer``.

        Args:
            driver: Browser session used to load the page.
            offer_link: Absolute URL of the offer.

        Returns:
            A ``SingleOffer`` whose fields hold the raw text of the matching
            page elements.

        Raises:
            Whatever ``driver.get``, ``WebDriverWaiter.wait_for`` or
            ``find_element`` raise, e.g. when a detail is missing on the page.
        """
        single_offer = SingleOffer()
        driver.get(offer_link)
        # Presumably blocks until the area detail is present - see
        # WebDriverWaiter for the exact semantics.
        WebDriverWaiter.wait_for(driver, (By.CSS_SELECTOR, self.area_detail_selector))

        single_offer.area = self._read_text(driver, self.area_detail_selector)
        single_offer.rooms_number = self._read_text(driver, self.rooms_number_detail_selector)
        single_offer.floor = self._read_text(driver, self.floor_number_detail_selector)
        single_offer.property_form = self._read_text(driver, self.property_form_detail_selector)
        single_offer.state = self._read_text(driver, self.state_detail_selector)
        single_offer.location = self._read_text(driver, self.location_detail_selector)
        single_offer.construction_year = self._read_text(driver, self.construction_year_detail_selector)
        single_offer.price = self._read_text(driver, self.price_detail_selector)
        return single_offer

    def _collect_offers_links(self, driver: webdriver.Chrome, state: OtoDomDataScraperState) -> None:
        """Stage 1: walk the listing pages and store every offer link in ``state``."""
        for page_number in range(state._offers_page_number, self.pages_amount):
            # Remember the page first so a resumed run restarts from it.
            state._offers_page_number = page_number
            print(f"Get page: {self.offers_sublink}&page={page_number}")
            driver.get(self.get_home_page() + self.offers_sublink + f"&page={page_number}")
            WebDriverWaiter.wait_for(driver, (By.CSS_SELECTOR, self.offers_list_selector))
            offers_list = driver.find_elements(By.CSS_SELECTOR, self.offers_list_selector)
            # Gather the whole page before touching the state: if reading an
            # attribute fails midway, no partial page is left behind to be
            # appended a second time when the page is retried.
            page_links = [offer.get_attribute("href") for offer in offers_list]
            state._offers_links.extend(page_links)
        state._offers_pages_scrapped = True

    def _scrap_collected_offers(self, driver: webdriver.Chrome, state: OtoDomDataScraperState) -> None:
        """Stage 2: scrape and save every collected offer not yet attempted."""
        # Set mirror of the state list for O(1) membership checks.
        attempted = set(state._scrapped_offers_links)
        for offer_link in state._offers_links:
            if offer_link in attempted:
                continue
            print(f"Scrapping offer: {offer_link}")
            # The link is recorded before the scrape is attempted, so a
            # resumed run skips an offer that raised instead of retrying it.
            # NOTE(review): confirm skipping failed offers is intended.
            state._scrapped_offers_links.append(offer_link)
            attempted.add(offer_link)
            single_offer = self.scrap_one_offer(driver, offer_link)
            SingleOfferCSVWriter.save_to_file(self.output_csv_file, single_offer)

    def scrap_data(self, driver: webdriver.Chrome, state: OtoDomDataScraperState = None) -> OtoDomDataScraperState:
        """Run (or resume) the scraping and return the resulting state.

        Args:
            driver: Browser session used for all requests. It is closed only
                when the whole run completes; after a failure it is left open.
            state: State returned by an earlier, unfinished call, or ``None``
                to start from scratch.

        Returns:
            The state object. ``state._finished`` is True only if both stages
            completed; otherwise the state can be passed back in to continue.
        """
        if state is None:
            state = OtoDomDataScraperState()
        try:
            if not state._offers_pages_scrapped:
                self._collect_offers_links(driver, state)
            self._scrap_collected_offers(driver, state)
            driver.close()
            state._finished = True
        except Exception as error:
            # Best effort: keep the progress made so far and hand it back,
            # but report the failure instead of hiding it. KeyboardInterrupt
            # and SystemExit are deliberately not caught here.
            print(f"Scrapping interrupted: {error!r}")
        return state