feat: separate all logic into small methods

patilk 2024-11-13 23:56:43 +01:00
parent a451409fa6
commit f65292d891
Signed by: s500042
GPG Key ID: 1921AD722E7392EE
6 changed files with 128 additions and 44 deletions

View File: Dockerfile

@@ -3,24 +3,13 @@ FROM python:3.9-slim
WORKDIR /usr/src/app
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt
RUN pip install --trusted-host pypi.python.org -r requirements.txt
COPY . .
RUN apt-get update && apt-get install -y \
wget \
unzip \
curl \
libx11-dev \
libgdk-pixbuf2.0-0 \
libcanberra-gtk-module \
&& wget https://chromedriver.storage.googleapis.com/114.0.5735.90/chromedriver_linux64.zip \
&& unzip chromedriver_linux64.zip \
&& mv chromedriver /usr/local/bin/ \
&& chmod +x /usr/local/bin/chromedriver
RUN apt-get update && apt-get install -y wget unzip && \
wget https://dl.google.com/linux/direct/google-chrome-stable_current_amd64.deb && \
apt install -y ./google-chrome-stable_current_amd64.deb && \
rm ./google-chrome-stable_current_amd64.deb && \
apt-get clean
RUN useradd python
RUN chown -R python /usr/src/app
USER python
CMD ["python", "main.py"]

View File: main.py

@@ -1,12 +1,26 @@
from scraper import scraper
from scraper import scrap
import os
import json
import time
urls = ["https://digitalprojects.wpi.art/monet/artworks"]
hrefs = []
def main():
print("Starting the application...\n\n")
scraper()
print("\n\nApplication finished!")
time.sleep(8)
directory = "dist"
file_path = os.path.join(directory, "data.json")
scrap(urls[0])
data = []
try:
os.mkdir("dist")
except FileExistsError:
pass
with open(file_path, "w", encoding="utf-8") as file:
json.dump(data, file)
print("Data has been scraped!")
if __name__ == "__main__":
    main()
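
As committed, main() dumps the empty data list and discards scrap()'s return value. A minimal sketch of wiring the two together (assuming the intent is to persist what scrap() returns; the data shape here is illustrative, not part of the commit):

html = scrap(urls[0])
data = {"url": urls[0], "html": html}  # illustrative shape
os.makedirs(directory, exist_ok=True)  # equivalent to the try/except around os.mkdir
with open(file_path, "w", encoding="utf-8") as file:
    json.dump(data, file, ensure_ascii=False)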

View File: requirements.txt

@@ -1,8 +1,24 @@
attrs==24.2.0
beautifulsoup4==4.12.3
bs4==0.0.2
certifi==2024.8.30
charset-normalizer==3.4.0
h11==0.14.0
idna==3.10
lxml==5.3.0
outcome==1.3.0.post0
packaging==24.2
PySocks==1.7.1
python-dotenv==1.0.1
requests==2.32.3
selenium==4.26.1
sniffio==1.3.1
sortedcontainers==2.4.0
soupsieve==2.6
trio==0.27.0
trio-websocket==0.11.1
typing_extensions==4.12.2
urllib3==2.2.3
webdriver-manager==4.0.2
websocket-client==1.8.0
wsproto==1.2.0

View File: scraper.py

@@ -1,17 +1,78 @@
import os
import json
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager
import time
def scraper():
directory = "dist"
file_path = os.path.join(directory, "data.json")
class Scraper:
def __init__(self, url):
self.url = url
self.hrefs = []
self.driver = self.load_driver()
data = []
def load_driver(self) -> webdriver.Chrome:
options = webdriver.ChromeOptions()
options.add_argument("--headless")
options.add_argument("--no-sandbox")
try:
os.mkdir("dist")
except FileExistsError:
pass
with open(file_path, "w", encoding="utf-8") as file:
json.dump(data, file)
print("Data has been scraped!")
return webdriver.Chrome(
options=options,
service=(
Service(ChromeDriverManager().install())
if os.path.exists("/.dockerenv")
else None
),
)
def skip_cookies(self) -> None:
WebDriverWait(self.driver, 5).until(
EC.presence_of_element_located(
(By.CSS_SELECTOR, "[_ngcontent-ng-c745257238]")
)
)
button = self.driver.find_element(By.CSS_SELECTOR, "[_ngcontent-ng-c745257238]")
self.driver.execute_script(
"""
arguments[0].removeAttribute('disabled');
arguments[0].className = 'border-button';
""",
button,
)
button.click()
time.sleep(2)
def load_page(self) -> None:
self.driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
time.sleep(2)
def locate_valid_artworks(self) -> list[str]:
WebDriverWait(self.driver, 5).until(
EC.presence_of_element_located((By.CSS_SELECTOR, ".artwork-search-results"))
)
artworks = self.driver.find_elements(
By.CSS_SELECTOR, ".artwork-search-results article"
)
for artwork in artworks:
href = artwork.find_element(By.CSS_SELECTOR, "a").get_attribute("href")
self.hrefs.append(href)
return self.hrefs
def scrap(url: str):
instance = Scraper(url)
driver = instance.driver
driver.get(url)
instance.skip_cookies()
instance.load_page()
hrefs = instance.locate_valid_artworks()
print(hrefs)
html = driver.page_source
driver.quit()
return html
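
With the logic split into methods, typical usage is a single call, or the individual steps when debugging (the URL below is the one main.py already uses):

html = scrap("https://digitalprojects.wpi.art/monet/artworks")

# or step by step:
s = Scraper("https://digitalprojects.wpi.art/monet/artworks")
s.driver.get(s.url)
s.skip_cookies()          # dismisses the cookie banner via the _ngcontent selector
s.load_page()             # scrolls to the bottom to trigger lazy loading
print(s.locate_valid_artworks())  # list of artwork hrefs
s.driver.quit()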

View File: docker-compose.yml

@@ -3,8 +3,6 @@ services:
build:
context: ./app
container_name: webscraper
depends_on:
- redis
volumes:
- ./app:/usr/src/app
develop:
@@ -13,14 +11,20 @@ services:
action: rebuild
- path: ./app
target: /usr/src/app
action: sync
redis:
image: "redis:alpine"
volumes:
- redis_data:/data
action: sync+restart
command: tail -f /dev/null
selenium-hub:
image: "selenium/hub:3.141.59"
container_name: selenium-hub
ports:
- "6379:6379"
- "4444:4444"
# redis:
# image: "redis:alpine"
# volumes:
# - redis_data:/data
# ports:
# - "6379:6379"
volumes:
redis_data:
# redis_data:
app:
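
Note that the hub is wired in at the compose level only; scraper.py still starts a local Chrome. If the intent is to route sessions through selenium-hub, the driver setup would look roughly like this (a sketch, assuming the default Grid 3 endpoint and the service name above):

from selenium import webdriver

options = webdriver.ChromeOptions()
options.add_argument("--headless")
options.add_argument("--no-sandbox")

# "selenium-hub" resolves on the compose network; 4444 is the port published above
driver = webdriver.Remote(
    command_executor="http://selenium-hub:4444/wd/hub",
    options=options,
)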

View File

@@ -4,4 +4,4 @@ docker compose up -d
docker compose wait webscraper > /dev/null
docker compose down
# docker compose down