diff --git a/app/Dockerfile b/app/Dockerfile
index f1662d6..dbe91e4 100644
--- a/app/Dockerfile
+++ b/app/Dockerfile
@@ -3,24 +3,13 @@ FROM python:3.9-slim
 WORKDIR /usr/src/app
 
 COPY requirements.txt .
-RUN pip install --no-cache-dir -r requirements.txt
+RUN pip install --trusted-host pypi.python.org -r requirements.txt
 
 COPY . .
 
-RUN apt-get update && apt-get install -y \
-    wget \
-    unzip \
-    curl \
-    libx11-dev \
-    libgdk-pixbuf2.0-0 \
-    libcanberra-gtk-module \
-    && wget https://chromedriver.storage.googleapis.com/114.0.5735.90/chromedriver_linux64.zip \
-    && unzip chromedriver_linux64.zip \
-    && mv chromedriver /usr/local/bin/ \
-    && chmod +x /usr/local/bin/chromedriver
+RUN apt-get update && apt-get install -y wget unzip && \
+    wget https://dl.google.com/linux/direct/google-chrome-stable_current_amd64.deb && \
+    apt install -y ./google-chrome-stable_current_amd64.deb && \
+    rm ./google-chrome-stable_current_amd64.deb && \
+    apt-get clean
 
-RUN useradd python
-RUN chown -R python /usr/src/app
-USER python
-
-CMD ["python", "main.py"]
\ No newline at end of file
diff --git a/app/main.py b/app/main.py
index f87ceb4..94e0aa1 100644
--- a/app/main.py
+++ b/app/main.py
@@ -1,12 +1,26 @@
-from scraper import scraper
+from scraper import scrap
+import os
+import json
 import time
 
+urls = ["https://digitalprojects.wpi.art/monet/artworks"]
+hrefs = []
+
 
 def main():
-    print("Starting the application...\n\n")
-    scraper()
-    print("\n\nApplication finished!")
-    time.sleep(8)
+    directory = "dist"
+    file_path = os.path.join(directory, "data.json")
+    scrap(urls[0])
+
+    data = []
+
+    try:
+        os.mkdir("dist")
+    except FileExistsError:
+        pass
+    with open(file_path, "w", encoding="utf-8") as file:
+        json.dump(data, file)
+    print("Data has been scraped!")
 
 
 if __name__ == "__main__":
diff --git a/app/requirements.txt b/app/requirements.txt
index bada454..979dbab 100644
--- a/app/requirements.txt
+++ b/app/requirements.txt
@@ -1,8 +1,24 @@
+attrs==24.2.0
 beautifulsoup4==4.12.3
 bs4==0.0.2
 certifi==2024.8.30
 charset-normalizer==3.4.0
+h11==0.14.0
 idna==3.10
+lxml==5.3.0
+outcome==1.3.0.post0
+packaging==24.2
+PySocks==1.7.1
+python-dotenv==1.0.1
 requests==2.32.3
+selenium==4.26.1
+sniffio==1.3.1
+sortedcontainers==2.4.0
 soupsieve==2.6
+trio==0.27.0
+trio-websocket==0.11.1
+typing_extensions==4.12.2
 urllib3==2.2.3
+webdriver-manager==4.0.2
+websocket-client==1.8.0
+wsproto==1.2.0
diff --git a/app/scraper.py b/app/scraper.py
index d9c4dc0..64aa99a 100644
--- a/app/scraper.py
+++ b/app/scraper.py
@@ -1,17 +1,78 @@
 import os
-import json
+from selenium import webdriver
+from selenium.webdriver.chrome.service import Service
+from selenium.webdriver.common.by import By
+from selenium.webdriver.support.ui import WebDriverWait
+from selenium.webdriver.support import expected_conditions as EC
+from webdriver_manager.chrome import ChromeDriverManager
+import time
 
 
-def scraper():
-    directory = "dist"
-    file_path = os.path.join(directory, "data.json")
+class Scraper:
+    def __init__(self, url):
+        self.url = url
+        self.hrefs = []
+        self.driver = self.load_driver()
 
-    data = []
+    def load_driver(self) -> webdriver.Chrome:
+        options = webdriver.ChromeOptions()
+        options.add_argument("--headless")
+        options.add_argument("--no-sandbox")
 
-    try:
-        os.mkdir("dist")
-    except FileExistsError:
-        pass
-    with open(file_path, "w", encoding="utf-8") as file:
-        json.dump(data, file)
-    print("Data has been scraped!")
+        return webdriver.Chrome(
+            options=options,
+            service=(
+                Service(ChromeDriverManager().install())
+                if os.path.exists("/.dockerenv")
+                else None
+            ),
+        )
+
+    def skip_cookies(self) -> None:
+        WebDriverWait(self.driver, 5).until(
+            EC.presence_of_element_located(
+                (By.CSS_SELECTOR, "[_ngcontent-ng-c745257238]")
+            )
+        )
+
+        button = self.driver.find_element(By.CSS_SELECTOR, "[_ngcontent-ng-c745257238]")
+        self.driver.execute_script(
+            """
+            arguments[0].removeAttribute('disabled');
+            arguments[0].className = 'border-button';
+            """,
+            button,
+        )
+        button.click()
+        time.sleep(2)
+
+    def load_page(self) -> None:
+        self.driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
+        time.sleep(2)
+
+    def locate_valid_artworks(self) -> list[str]:
+        WebDriverWait(self.driver, 5).until(
+            EC.presence_of_element_located((By.CSS_SELECTOR, ".artwork-search-results"))
+        )
+        artworks = self.driver.find_elements(
+            By.CSS_SELECTOR, ".artwork-search-results article"
+        )
+        for artwork in artworks:
+            href = artwork.find_element(By.CSS_SELECTOR, "a").get_attribute("href")
+            self.hrefs.append(href)
+        return self.hrefs
+
+
+def scrap(url: str):
+    instance = Scraper(url)
+    driver = instance.driver
+    driver.get(url)
+
+    instance.skip_cookies()
+    instance.load_page()
+    hrefs = instance.locate_valid_artworks()
+
+    print(hrefs)
+    html = driver.page_source
+    driver.quit()
+    return html
diff --git a/docker-compose.yaml b/docker-compose.yaml
index 2a7fd63..ac3339d 100644
--- a/docker-compose.yaml
+++ b/docker-compose.yaml
@@ -3,8 +3,6 @@ services:
     build:
       context: ./app
     container_name: webscraper
-    depends_on:
-      - redis
     volumes:
       - ./app:/usr/src/app
     develop:
@@ -13,14 +11,20 @@
           action: rebuild
         - path: ./app
           target: /usr/src/app
-          action: sync
-  redis:
-    image: "redis:alpine"
-    volumes:
-      - redis_data:/data
+          action: sync+restart
+    command: tail -f /dev/null
+  selenium-hub:
+    image: "selenium/hub:3.141.59"
+    container_name: selenium-hub
     ports:
-      - "6379:6379"
+      - "4444:4444"
+  # redis:
+  #   image: "redis:alpine"
+  #   volumes:
+  #     - redis_data:/data
+  #   ports:
+  #     - "6379:6379"
 
 volumes:
-  redis_data:
+  # redis_data:
   app:
diff --git a/start.sh b/start.sh
index 1bcd812..635cdfc 100755
--- a/start.sh
+++ b/start.sh
@@ -4,4 +4,4 @@
 docker compose up -d
 
 docker compose wait webscraper > /dev/null
-docker compose down
\ No newline at end of file
+# docker compose down
\ No newline at end of file