"""Selenium-based scraper for apartment offers on otodom.pl.

Collects offer links from paginated listings, then visits each offer
page and writes its details to a CSV file via SingleOfferCSVWriter.
"""
from selenium import webdriver
|
|
from selenium.webdriver.common.by import By
|
|
|
|
from DataCollectingScraper.DataScrapers.IDataScraper import IDataScraper, IDataScraperState
|
|
from DataCollectingScraper.helpers.WebDriverWaiter import WebDriverWaiter
|
|
from DataCollectingScraper.helpers.SingleOfferCSVWriter import SingleOfferCSVWriter
|
|
|
|
from DataCollectingScraper.models.SingleOffer import SingleOffer
|
|
|
|
class OtoDomDataScraperState(IDataScraperState):
    """Resumable progress snapshot for an OtoDom scraping run."""

    def __init__(self) -> None:
        super().__init__()

        # --- Stage 1: harvesting offer links from listing pages ---
        self._offers_links = []          # all offer URLs discovered so far
        self._offers_page_number = 1     # next listing page to fetch (1-based)
        self._offers_pages_scrapped = False  # True once every listing page was visited

        # --- Stage 2: scraping individual offers ---
        self._scrapped_offers_links = []  # offer URLs already written to CSV

        # --- Terminal state ---
        self._finished = False           # True when the whole run completed
|
|
|
|
|
|
class OtoDomDataScraperImpl(IDataScraper):
    """Scrapes apartment offers from otodom.pl.

    The run is driven by a resumable :class:`OtoDomDataScraperState`:
    first every paginated listing page is visited to collect offer links,
    then each offer page is scraped and appended to ``output.csv``.
    """

    def __init__(self, offers_sublink = "") -> None:
        """
        Args:
            offers_sublink: Path (with query string) appended to the home
                page URL to reach the offers listing.
        """
        super().__init__()
        self.name = "OtoDomDataScraperImpl"
        self.output_csv_file = "output.csv"
        self.offers_sublink = offers_sublink
        # Exclusive upper bound for listing pagination; range(1, 275)
        # visits pages 1..274. NOTE(review): if page 275 exists it is
        # never fetched — confirm whether this off-by-one is intended.
        self.pages_amount = 275

        # Listing-page selectors (site labels are in Polish).
        self.offers_list_presence_selector = "//*[contains(text(), 'Wszystkie ogłoszenia')]"
        self.offers_list_selector = "a[data-cy='listing-item-link']"

        # Detail selectors on a single offer page.
        self.area_detail_selector = "div[aria-label='Powierzchnia'] > div:nth-child(3) > div"
        self.rooms_number_detail_selector = "div[aria-label='Liczba pokoi'] > div:nth-child(3) > div"
        self.floor_number_detail_selector = "div[aria-label='Piętro'] > div:nth-child(3) > div"
        self.property_form_detail_selector = "div[aria-label='Forma własności'] > div:nth-child(3) > div"
        self.state_detail_selector = "div[aria-label='Stan wykończenia'] > div:nth-child(3) > div"
        self.location_detail_selector = "a[aria-label='Adres']"
        self.construction_year_detail_selector = "div[aria-label='Rok budowy'] > div:nth-child(2) > div"
        self.price_detail_selector = "strong[aria-label='Cena']"

    @staticmethod
    def get_home_page() -> str:
        """Return the base URL all sublinks are appended to."""
        return "https://www.otodom.pl/pl/"

    def scrap_one_offer(self, driver: webdriver.Chrome, offer_link : str) -> SingleOffer:
        """Load a single offer page and extract its details.

        Args:
            driver: Active Chrome WebDriver session.
            offer_link: Absolute URL of the offer page.

        Returns:
            A populated :class:`SingleOffer` with raw text values.

        Raises:
            selenium.common.exceptions.WebDriverException: On navigation,
                wait, or element-lookup failure (propagated to the caller).
        """
        single_offer = SingleOffer()
        driver.get(offer_link)
        # Wait for the area field only; the remaining details are assumed
        # to render together with it on the same page.
        WebDriverWaiter.wait_for(driver, (By.CSS_SELECTOR, self.area_detail_selector))

        single_offer.area = driver.find_element(By.CSS_SELECTOR, self.area_detail_selector).text
        single_offer.rooms_number = driver.find_element(By.CSS_SELECTOR, self.rooms_number_detail_selector).text
        single_offer.floor = driver.find_element(By.CSS_SELECTOR, self.floor_number_detail_selector).text
        single_offer.property_form = driver.find_element(By.CSS_SELECTOR, self.property_form_detail_selector).text
        single_offer.state = driver.find_element(By.CSS_SELECTOR, self.state_detail_selector).text
        single_offer.location = driver.find_element(By.CSS_SELECTOR, self.location_detail_selector).text
        single_offer.construction_year = driver.find_element(By.CSS_SELECTOR, self.construction_year_detail_selector).text
        single_offer.price = driver.find_element(By.CSS_SELECTOR, self.price_detail_selector).text
        return single_offer

    def scrap_data(self, driver: webdriver.Chrome, state: OtoDomDataScraperState = None) -> OtoDomDataScraperState:
        """Run (or resume) the full scraping pipeline.

        Any failure mid-run is logged and the partially filled state is
        returned so the caller can resume from where it stopped.

        Args:
            driver: Active Chrome WebDriver session.
            state: Progress from a previous interrupted run; a fresh
                state is created when ``None``.

        Returns:
            The (possibly partial) scraping state; ``_finished`` is True
            only when every stage completed.
        """
        if state is None:
            state = OtoDomDataScraperState()

        try:
            # Stage 1: collect offer links from every listing page.
            if not state._offers_pages_scrapped:
                for page_number in range(state._offers_page_number, self.pages_amount):
                    # Record progress first so a crash resumes at this page.
                    state._offers_page_number = page_number
                    print(f"Get page: {self.offers_sublink}&page={page_number}")
                    driver.get(self.get_home_page() + self.offers_sublink + f"&page={page_number}")
                    WebDriverWaiter.wait_for(driver, (By.CSS_SELECTOR, self.offers_list_selector))
                    offers_list = driver.find_elements(By.CSS_SELECTOR, self.offers_list_selector)
                    if len(offers_list) == 0:
                        continue
                    for offer in offers_list:
                        offer_link = offer.get_attribute("href")
                        state._offers_links.append(offer_link)

                state._offers_pages_scrapped = True

            # Stage 2: scrape each offer not yet processed.
            for offer_link in state._offers_links:
                if offer_link not in state._scrapped_offers_links:
                    print(f"Scrapping offer: {offer_link}")
                    state._scrapped_offers_links.append(offer_link)
                    single_offer = self.scrap_one_offer(driver, offer_link)
                    SingleOfferCSVWriter.save_to_file(self.output_csv_file, single_offer)

            # Close the browser only on full success; on failure the caller
            # may resume the run with the still-open driver.
            driver.close()
            state._finished = True
        except Exception as exc:
            # Deliberate best-effort: keep the partial state for resuming,
            # but log the error instead of silently swallowing it.
            print(f"Scraping interrupted, returning partial state: {exc!r}")

        return state
|