2024-11-12 05:16:21 +01:00
|
|
|
import os
|
2024-11-13 23:56:43 +01:00
|
|
|
from selenium import webdriver
|
|
|
|
from selenium.webdriver.chrome.service import Service
|
|
|
|
from selenium.webdriver.common.by import By
|
|
|
|
from selenium.webdriver.support.ui import WebDriverWait
|
|
|
|
from selenium.webdriver.support import expected_conditions as EC
|
|
|
|
from webdriver_manager.chrome import ChromeDriverManager
|
|
|
|
import time
|
2024-11-12 05:16:21 +01:00
|
|
|
|
|
|
|
|
2024-11-13 23:56:43 +01:00
|
|
|
class Scraper:
|
|
|
|
def __init__(self, url):
|
|
|
|
self.url = url
|
|
|
|
self.hrefs = []
|
|
|
|
self.driver = self.load_driver()
|
2024-11-12 05:16:21 +01:00
|
|
|
|
2024-11-13 23:56:43 +01:00
|
|
|
def load_driver(self) -> webdriver.Chrome:
|
|
|
|
options = webdriver.ChromeOptions()
|
|
|
|
options.add_argument("--headless")
|
|
|
|
options.add_argument("--no-sandbox")
|
2024-11-12 05:16:21 +01:00
|
|
|
|
2024-11-13 23:56:43 +01:00
|
|
|
return webdriver.Chrome(
|
|
|
|
options=options,
|
|
|
|
service=(
|
|
|
|
Service(ChromeDriverManager().install())
|
|
|
|
if os.path.exists("/.dockerenv")
|
|
|
|
else None
|
|
|
|
),
|
|
|
|
)
|
|
|
|
|
|
|
|
def skip_cookies(self) -> None:
|
|
|
|
WebDriverWait(self.driver, 5).until(
|
|
|
|
EC.presence_of_element_located(
|
|
|
|
(By.CSS_SELECTOR, "[_ngcontent-ng-c745257238]")
|
|
|
|
)
|
|
|
|
)
|
|
|
|
|
|
|
|
button = self.driver.find_element(By.CSS_SELECTOR, "[_ngcontent-ng-c745257238]")
|
|
|
|
self.driver.execute_script(
|
|
|
|
"""
|
|
|
|
arguments[0].removeAttribute('disabled');
|
|
|
|
arguments[0].className = 'border-button';
|
|
|
|
""",
|
|
|
|
button,
|
|
|
|
)
|
|
|
|
button.click()
|
|
|
|
time.sleep(2)
|
|
|
|
|
|
|
|
def load_page(self) -> None:
|
|
|
|
self.driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
|
|
|
|
time.sleep(2)
|
|
|
|
|
|
|
|
def locate_valid_artworks(self) -> list[str]:
|
|
|
|
WebDriverWait(self.driver, 5).until(
|
|
|
|
EC.presence_of_element_located((By.CSS_SELECTOR, ".artwork-search-results"))
|
|
|
|
)
|
|
|
|
artworks = self.driver.find_elements(
|
|
|
|
By.CSS_SELECTOR, ".artwork-search-results article"
|
|
|
|
)
|
|
|
|
for artwork in artworks:
|
|
|
|
href = artwork.find_element(By.CSS_SELECTOR, "a").get_attribute("href")
|
|
|
|
self.hrefs.append(href)
|
|
|
|
return self.hrefs
|
|
|
|
|
|
|
|
|
|
|
|
def scrap(url: str):
|
|
|
|
instance = Scraper(url)
|
|
|
|
driver = instance.driver
|
|
|
|
driver.get(url)
|
|
|
|
|
|
|
|
instance.skip_cookies()
|
|
|
|
instance.load_page()
|
|
|
|
hrefs = instance.locate_valid_artworks()
|
|
|
|
|
|
|
|
print(hrefs)
|
|
|
|
html = driver.page_source
|
|
|
|
driver.quit()
|
|
|
|
return html
|