from selenium import webdriver
from selenium.webdriver.common.by import By

from DataCollectingScraper.DataScrapers.IDataScraper import IDataScraper, IDataScraperState
from DataCollectingScraper.helpers.WebDriverWaiter import WebDriverWaiter
from DataCollectingScraper.helpers.SingleOfferCSVWriter import SingleOfferCSVWriter
from DataCollectingScraper.models.SingleOffer import SingleOffer


class OtoDomDataScraperState(IDataScraperState):
    """Resumable progress of one otodom.pl scraping run.

    ``OtoDomDataScraperImpl.scrap_data`` mutates this object in place, so a
    state returned by an interrupted run can be passed back in to resume
    from where the previous run stopped.
    """

    def __init__(self) -> None:
        super().__init__()
        # Stage 1: collecting offer links from the paginated results list.
        self._offers_links: list = []              # every offer URL discovered so far
        self._offers_page_number: int = 1          # next results page to fetch (1-based)
        self._offers_pages_scrapped: bool = False  # True once all listing pages were visited
        # Stage 2: scraping the individual offer pages.
        self._scrapped_offers_links: list = []     # offer URLs already written to CSV
        # Final state.
        self._finished: bool = False               # True when every collected offer was scraped


class OtoDomDataScraperImpl(IDataScraper):
    """Scrapes apartment offers from otodom.pl and appends each one to a CSV file."""

    def __init__(self, offers_sublink: str = "") -> None:
        """
        :param offers_sublink: search-results path (query string included) that is
            appended to the home page URL; pagination is added via ``&page=N``.
        """
        super().__init__()
        self.name = "OtoDomDataScraperImpl"
        self.output_csv_file = "output.csv"
        self.offers_sublink = offers_sublink
        # Number of listing pages to walk (pages 1 .. pages_amount-1 are fetched).
        self.pages_amount = 275
        # XPath marker that the results list has rendered (currently unused here).
        self.offers_list_presence_selector = "//*[contains(text(), 'Wszystkie ogłoszenia')]"
        self.offers_list_selector = "a[data-cy='listing-item-link']"
        # CSS selectors for the detail fields on a single offer page.
        self.area_detail_selector = "div[aria-label='Powierzchnia'] > div:nth-child(3) > div"
        self.rooms_number_detail_selector = "div[aria-label='Liczba pokoi'] > div:nth-child(3) > div"
        self.floor_number_detail_selector = "div[aria-label='Piętro'] > div:nth-child(3) > div"
        self.property_form_detail_selector = "div[aria-label='Forma własności'] > div:nth-child(3) > div"
        self.state_detail_selector = "div[aria-label='Stan wykończenia'] > div:nth-child(3) > div"
        self.location_detail_selector = "a[aria-label='Adres']"
        # NOTE(review): nth-child(2) differs from the other detail selectors — confirm intentional.
        self.construction_year_detail_selector = "div[aria-label='Rok budowy'] > div:nth-child(2) > div"
        self.price_detail_selector = "strong[aria-label='Cena']"

    @staticmethod
    def get_home_page() -> str:
        """Return the otodom.pl base URL that offer sublinks are appended to."""
        return "https://www.otodom.pl/pl/"

    def scrap_one_offer(self, driver: webdriver.Chrome, offer_link: str) -> SingleOffer:
        """Open one offer page and extract its detail fields into a SingleOffer."""
        single_offer = SingleOffer()
        driver.get(offer_link)
        # Wait for the first detail field to render before reading any of them.
        WebDriverWaiter.wait_for(driver, (By.CSS_SELECTOR, self.area_detail_selector))
        # (SingleOffer attribute, CSS selector) pairs, read in a single pass.
        field_selectors = (
            ("area", self.area_detail_selector),
            ("rooms_number", self.rooms_number_detail_selector),
            ("floor", self.floor_number_detail_selector),
            ("property_form", self.property_form_detail_selector),
            ("state", self.state_detail_selector),
            ("location", self.location_detail_selector),
            ("construction_year", self.construction_year_detail_selector),
            ("price", self.price_detail_selector),
        )
        for attribute, selector in field_selectors:
            setattr(single_offer, attribute, driver.find_element(By.CSS_SELECTOR, selector).text)
        return single_offer

    def _collect_offer_links(self, driver: webdriver.Chrome, state: OtoDomDataScraperState) -> None:
        """Stage 1: walk the paginated results list and accumulate offer URLs in *state*."""
        for page_number in range(state._offers_page_number, self.pages_amount):
            # Record progress first so an interrupted run resumes at this page.
            state._offers_page_number = page_number
            print(f"Get page: {self.offers_sublink}&page={page_number}")
            driver.get(self.get_home_page() + self.offers_sublink + f"&page={page_number}")
            WebDriverWaiter.wait_for(driver, (By.CSS_SELECTOR, self.offers_list_selector))
            offers_list = driver.find_elements(By.CSS_SELECTOR, self.offers_list_selector)
            if not offers_list:
                # Empty page: move on rather than abort the whole collection pass.
                continue
            for offer in offers_list:
                state._offers_links.append(offer.get_attribute("href"))
        state._offers_pages_scrapped = True

    def _scrap_collected_offers(self, state: OtoDomDataScraperState, driver: webdriver.Chrome) -> None:
        """Stage 2: scrape every not-yet-processed offer link and append it to the CSV."""
        # Set for O(1) membership tests; the state keeps the list as its storage format.
        already_scrapped = set(state._scrapped_offers_links)
        for offer_link in state._offers_links:
            if offer_link in already_scrapped:
                continue
            print(f"Scrapping offer: {offer_link}")
            single_offer = self.scrap_one_offer(driver, offer_link)
            SingleOfferCSVWriter.save_to_file(self.output_csv_file, single_offer)
            # BUG FIX: mark the offer as done only AFTER it was scraped and saved;
            # the old code marked it first, so a failed offer was skipped on resume.
            state._scrapped_offers_links.append(offer_link)
            already_scrapped.add(offer_link)

    def scrap_data(self, driver: webdriver.Chrome, state: OtoDomDataScraperState = None) -> OtoDomDataScraperState:
        """Run (or resume) a full scrape: collect offer links, then scrape each offer.

        :param driver: Chrome WebDriver used for all page loads; closed on success.
        :param state: progress from a previous run, or ``None`` to start fresh.
        :returns: the (possibly partially completed) state — always returned, even
            when an error interrupts the run, so the caller can resume later.
        """
        if state is None:
            state = OtoDomDataScraperState()
        try:
            if not state._offers_pages_scrapped:
                self._collect_offer_links(driver, state)
            self._scrap_collected_offers(state, driver)
            driver.close()
            state._finished = True
        except Exception as exc:
            # BUG FIX: was a bare `except: pass`, which silently swallowed every
            # error (including KeyboardInterrupt). Best-effort behavior is kept —
            # the partial state is still returned for resumption — but the failure
            # is now narrowed to Exception and reported instead of hidden.
            print(f"Scraping interrupted by error: {exc!r}")
        return state