feat: scraper for monet arts

Signed-off-by: paprykdev <58005447+paprykdev@users.noreply.github.com>
2024-12-18 01:41:12 +01:00 · 2024-12-18 01:41:12 +01:00 · 388964d497
commit 388964d497
parent b560ff1c1c
14 changed files with 330 additions and 199 deletions
--- a/.gitignore
+++ b/.gitignore
@ -30,3 +30,9 @@ build/
 # IDE files
 .idea/
 .vscode/
+
+# Images
+images/
+
+# example
+example.py
--- a/app/.dockerignore
+++ b/app/.dockerignore
@ -23,4 +23,5 @@ docker-compose.yaml
 dist/
 build/

-# Ignore any other files or directories you want to exclude
+# Ignore any other files or directories you want to exclude
+.supertajnyplik.donotopen
--- a/app/.supertajnyplik.donotopen
+++ b/app/.supertajnyplik.donotopen
@ -0,0 +1,36 @@
+
+I USE VIM BTW!
+I USE VIM BTW!
+I USE VIM BTW!
+I USE VIM BTW!
+I USE VIM BTW!
+I USE VIM BTW!
+I USE VIM BTW!
+I USE VIM BTW!
+I USE VIM BTW!
+I USE VIM BTW!
+I USE VIM BTW!
+I USE VIM BTW!
+I USE VIM BTW!
+I USE VIM BTW!
+I USE VIM BTW!
+I USE VIM BTW!
+I USE VIM BTW!
+I USE VIM BTW!
+I USE VIM BTW!
+I USE VIM BTW!
+I USE VIM BTW!
+I USE VIM BTW!
+I USE VIM BTW!
+I USE VIM BTW!
+I USE VIM BTW!
+I USE VIM BTW!
+I USE VIM BTW!
+I USE VIM BTW!
+I USE VIM BTW!
+I USE VIM BTW!
+I USE ARCH BTW!
+I USE VIM BTW!
+I USE VIM BTW!
+I USE VIM BTW!
+
--- a/app/docker-compose.yaml
+++ b/app/docker-compose.yaml
@ -1,27 +1,42 @@
+# services:
+#   webscraper:
+#     build:
+#       context: .
+#       dockerfile: ./docker/scripts/Dockerfile
+#     container_name: webscraper
+#     volumes:
+#       - .:/usr/src/app
+#     command:
+#       - tail
+#       - -f
+#       - /dev/null
+#   selenium-hub:
+#     image: "selenium/hub:3.141.59"
+#     container_name: selenium-hub
+#     ports:
+#       - "4444:4444"
+#   # redis:
+#   #   image: "redis:alpine"
+#   #   volumes:
+#   #     - redis_data:/data
+#   #   ports:
+#   #     - "6379:6379"
+#
+# volumes:
+#   # redis_data:
+#   app:
+
 services:
-  webscraper:
+  scraper:
    build:
      context: .
      dockerfile: ./docker/scripts/Dockerfile
-    container_name: webscraper
+    container_name: scraper
    volumes:
      - .:/usr/src/app
    command:
      - tail
      - -f
      - /dev/null
-  selenium-hub:
-    image: "selenium/hub:3.141.59"
-    container_name: selenium-hub
-    ports:
-      - "4444:4444"
-  # redis:
-  #   image: "redis:alpine"
-  #   volumes:
-  #     - redis_data:/data
-  #   ports:
-  #     - "6379:6379"
-
 volumes:
-  # redis_data:
  app:
--- a/app/docker/scripts/Dockerfile
+++ b/app/docker/scripts/Dockerfile
@ -1,15 +1,29 @@
-FROM python:3.9-slim
+# FROM python:3.9-slim
+#
+# WORKDIR /usr/src/app
+#
+# COPY requirements.txt .
+# RUN pip install --trusted-host pypi.python.org -r requirements.txt
+#
+# COPY . .
+#
+# RUN apt-get update && apt-get install -y wget unzip && \
+#     wget https://dl.google.com/linux/direct/google-chrome-stable_current_amd64.deb && \
+#     apt install -y ./google-chrome-stable_current_amd64.deb && \
+#     rm ./google-chrome-stable_current_amd64.deb && \
+#     apt-get clean
+#

+# Use the official Playwright for Python image as the parent image
+FROM mcr.microsoft.com/playwright/python:v1.49.1-jammy
+
+# Set the working directory to /usr/src/app
 WORKDIR /usr/src/app

+# Copy only requirements.txt first so the dependency layer stays cached when the code changes
 COPY requirements.txt .
-RUN pip install --trusted-host pypi.python.org -r requirements.txt
+
+# Run the command to install any necessary dependencies
+RUN pip install --no-cache-dir -r requirements.txt

 COPY . .
-
-RUN apt-get update && apt-get install -y wget unzip && \
-    wget https://dl.google.com/linux/direct/google-chrome-stable_current_amd64.deb && \
-    apt install -y ./google-chrome-stable_current_amd64.deb && \
-    rm ./google-chrome-stable_current_amd64.deb && \
-    apt-get clean
-
--- a/app/main.py
+++ b/app/main.py
@ -1,26 +1,6 @@
-from scraper import scrap
-import os
-import json
-
-urls = ["https://digitalprojects.wpi.art/monet/artworks"]
-hrefs = []
-
-
-def main():
-    directory = os.path.dirname(os.path.realpath(__file__))
-    file_path = os.path.join(directory, "dist", "data.json")
-    scrap(urls[0])
-
-    data = []
-
-    try:
-        os.mkdir(os.path.join(directory, "dist"))
-    except FileExistsError:
-        pass
-    with open(file_path, "w", encoding="utf-8") as file:
-        json.dump(data, file)
-    print("Data has been scraped!")
-
+import asyncio
+from scripts.monet import MonetScraper

 if __name__ == "__main__":
-    main()
+    scraper = MonetScraper()
+    asyncio.run(scraper.scrape())
--- a/app/requirements.txt
+++ b/app/requirements.txt
@ -1,24 +1,2 @@
-attrs==24.2.0
-beautifulsoup4==4.12.3
-bs4==0.0.2
-certifi==2024.8.30
-charset-normalizer==3.4.0
-h11==0.14.0
-idna==3.10
-lxml==5.3.0
-outcome==1.3.0.post0
-packaging==24.2
-PySocks==1.7.1
-python-dotenv==1.0.1
+playwright==1.49.1
 requests==2.32.3
-selenium==4.26.1
-sniffio==1.3.1
-sortedcontainers==2.4.0
-soupsieve==2.6
-trio==0.27.0
-trio-websocket==0.11.1
-typing_extensions==4.12.2
-urllib3==2.2.3
-webdriver-manager==4.0.2
-websocket-client==1.8.0
-wsproto==1.2.0
--- a/app/scraper.py
+++ b/app/scraper.py
@ -1,78 +0,0 @@
-import os
-from selenium import webdriver
-from selenium.webdriver.chrome.service import Service
-from selenium.webdriver.common.by import By
-from selenium.webdriver.support.ui import WebDriverWait
-from selenium.webdriver.support import expected_conditions as EC
-from webdriver_manager.chrome import ChromeDriverManager
-import time
-
-
-class Scraper:
-    def __init__(self, url):
-        self.url = url
-        self.hrefs = []
-        self.driver = self.load_driver()
-
-    def load_driver(self) -> webdriver.Chrome:
-        options = webdriver.ChromeOptions()
-        options.add_argument("--headless")
-        options.add_argument("--no-sandbox")
-
-        return webdriver.Chrome(
-            options=options,
-            service=(
-                Service(ChromeDriverManager().install())
-                if os.path.exists("/.dockerenv")
-                else None
-            ),
-        )
-
-    def skip_cookies(self) -> None:
-        WebDriverWait(self.driver, 5).until(
-            EC.presence_of_element_located(
-                (By.CSS_SELECTOR, "[_ngcontent-ng-c745257238]")
-            )
-        )
-
-        button = self.driver.find_element(By.CSS_SELECTOR, "[_ngcontent-ng-c745257238]")
-        self.driver.execute_script(
-            """
-                arguments[0].removeAttribute('disabled');
-                arguments[0].className = 'border-button';
-            """,
-            button,
-        )
-        button.click()
-        time.sleep(2)
-
-    def load_page(self) -> None:
-        self.driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
-        time.sleep(2)
-
-    def locate_valid_artworks(self) -> list[str]:
-        WebDriverWait(self.driver, 5).until(
-            EC.presence_of_element_located((By.CSS_SELECTOR, ".artwork-search-results"))
-        )
-        artworks = self.driver.find_elements(
-            By.CSS_SELECTOR, ".artwork-search-results article"
-        )
-        for artwork in artworks:
-            href = artwork.find_element(By.CSS_SELECTOR, "a").get_attribute("href")
-            self.hrefs.append(href)
-        return self.hrefs
-
-
-def scrap(url: str):
-    instance = Scraper(url)
-    driver = instance.driver
-    driver.get(url)
-
-    instance.skip_cookies()
-    instance.load_page()
-    hrefs = instance.locate_valid_artworks()
-
-    print(hrefs)
-    html = driver.page_source
-    driver.quit()
-    return html
--- a/app/scripts/monet.py
+++ b/app/scripts/monet.py
@ -0,0 +1,176 @@
+import time
+import requests
+import os
+import json
+from playwright.async_api import async_playwright
+
+
+class MonetScraper:
+    """Scrape Claude Monet artwork records with Playwright.
+
+    The scraper walks the paginated search results to collect artwork links,
+    opens every artwork page to read its details, downloads the artwork image
+    into ``images/`` and finally writes all records to ``dist/data.json``.
+    """
+
+    def __init__(
+        self,
+        url="https://digitalprojects.wpi.art/monet/artworks?page=1",
+        base_url="https://digitalprojects.wpi.art",
+        pages=3,
+    ):
+        """Store the scraping configuration.
+
+        Args:
+            url: First page of the artwork search results.
+            base_url: Prefix prepended to every collected href.
+            pages: Number of result pages to walk through.
+        """
+        self.hrefs = []
+        self.base_url = base_url
+        self.url = url
+        self.data = []
+        self.pages = pages
+
+    async def scrape(self):
+        """Run the whole pipeline: open the site, collect links, scrape, save."""
+        async with async_playwright() as p:
+            # Headed browser: a display (for example xvfb) must be available.
+            self.browser = await p.chromium.launch(headless=False)
+            try:
+                self.context = await self.browser.new_context()
+                self.page = await self.context.new_page()
+                self.page.set_default_timeout(5000)
+                await self.go_to(self.url)
+                await self.skip_cookies()
+                await self.get_hrefs()
+                await self.get_data()
+                self.save_data()
+            finally:
+                # Close the browser even when one of the steps above raises.
+                await self.browser.close()
+
+    async def skip_cookies(self):
+        """Enable and click the ``.button-disabled`` element.
+
+        Presumably this is the cookie-consent button - verify against the site.
+        """
+        await self.wait_for_el('.button-disabled')
+        await self.page.eval_on_selector('.button-disabled', 'el => el.removeAttribute("disabled")')
+        await self.page.click('.button-disabled')
+
+    async def insert_value(self, selector, value):
+        """Fill the input matching ``selector`` with ``value``."""
+        await self.page.fill(selector, value)
+
+    async def find_el(self, selector: str):
+        """Wait for ``selector`` and return the first matching element."""
+        await self.wait_for_el(selector)
+        return await self.page.query_selector(selector)
+
+    async def find_els(self, selector: str):
+        """Wait for ``selector`` and return every matching element."""
+        await self.wait_for_el(selector)
+        return await self.page.query_selector_all(selector)
+
+    async def wait_for_el(self, selector: str):
+        """Block until ``selector`` appears (page default timeout applies)."""
+        await self.page.wait_for_selector(selector)
+
+    async def go_to(self, url, tabs=False, max_retries=None):
+        """Navigate the page to ``url``, retrying when navigation fails.
+
+        Args:
+            url: Address to open.
+            tabs: Unused; kept so existing callers keep working.
+            max_retries: Maximum number of attempts. ``None`` (the default)
+                keeps retrying until navigation succeeds.
+
+        Raises:
+            Exception: The last navigation error, once ``max_retries``
+                attempts have failed.
+        """
+        attempt = 0
+        while True:
+            attempt += 1
+            try:
+                await self.page.goto(url, timeout=60000)
+                return
+            except Exception as e:
+                print(e)
+                print(f'error go to {url}')
+                if max_retries is not None and attempt >= max_retries:
+                    raise
+
+    async def get_hrefs(self):
+        """Collect artwork hrefs from the first ``self.pages`` result pages."""
+        for i in range(self.pages):
+            if i > 0:
+                pagination = await self.find_el('cpd-controls-pagination > button:last-child')
+                await pagination.click()
+                # Give the next page time to render. Playwright's own timer is
+                # used because time.sleep() would block the event loop.
+                await self.page.wait_for_timeout(1000)
+            links = await self.find_els('.artwork-search-results > article:not(.not-included) > a')
+            for link in links:
+                self.hrefs.append(await link.get_attribute('href'))
+
+    async def _read_image_url(self):
+        """Return the first URL of the artwork image's ``srcset``.
+
+        ``"null"`` is returned when the attribute is missing or empty, which
+        is the same placeholder the callers already treat as "no image".
+        """
+        img = await self.find_el(".not-full-screen-image-container > img")
+        srcset = await img.get_attribute('srcset')
+        if not srcset:
+            return "null"
+        return srcset.split(",")[0].split(" ")[0]
+
+    async def get_image(self):
+        """Return the artwork image URL, polling while it is still ``"null"``.
+
+        The attribute is re-read up to 10 times, 0.5 s apart; ``"null"`` is
+        returned if no real URL shows up in that time.
+        """
+        image = await self._read_image_url()
+        attempts = 0
+        while image == "null" and attempts < 10:
+            await self.page.wait_for_timeout(500)
+            image = await self._read_image_url()
+            attempts += 1
+        return image
+
+    @staticmethod
+    def _image_filename(title):
+        """Build the ``.jpg`` file name used to store the image of ``title``."""
+        name = title.lower().replace(",", "").replace(" ", "_")
+        # A path separator in a title would otherwise be read as a directory.
+        for separator in ("/", "\\"):
+            name = name.replace(separator, "_")
+        return f"{name}.jpg"
+
+    def curl_image(self, image, title):
+        """Download ``image`` into ``images/`` under a name derived from ``title``.
+
+        Nothing is written when ``image`` is missing (``None``, empty or the
+        ``"null"`` placeholder), when the request fails, or when the server
+        does not answer with status 200. Artworks sharing a title overwrite
+        each other's file because the name is derived from the title only.
+        """
+        os.makedirs("images", exist_ok=True)
+
+        if not image or image == "null":
+            return
+        try:
+            # Without a timeout a stalled download would hang the whole run.
+            image_response = requests.get(image, timeout=30)
+        except requests.RequestException as e:
+            print(e)
+            return
+        if image_response.status_code == 200:
+            path = os.path.join("images", self._image_filename(title))
+            with open(path, 'wb') as img_file:
+                img_file.write(image_response.content)
+
+    async def get_title(self):
+        """Return the text of the artwork heading (``.details h1``)."""
+        heading = await self.find_el(".details h1")
+        return await heading.inner_text()
+
+    async def get_info(self):
+        """Return date, technique, dimensions and signature of the artwork.
+
+        The values are read positionally from the first four matching
+        paragraphs; a field whose paragraph is missing is returned as ``None``.
+        """
+        # NOTE(review): ``_ngcontent-ng-c2311764719`` looks like a generated
+        # attribute and may change when the site is rebuilt - confirm.
+        paragraphs = await self.find_els("article[_ngcontent-ng-c2311764719] p > p")
+        keys = ("date", "technique", "dimensions", "signature")
+        values = [await p.inner_text() for p in paragraphs[:len(keys)]]
+        # Pad so a page with fewer paragraphs yields None instead of IndexError.
+        values += [None] * (len(keys) - len(values))
+        return dict(zip(keys, values))
+
+    def save_data(self):
+        """Write the collected records to ``dist/data.json`` as indented JSON."""
+        os.makedirs("dist", exist_ok=True)
+        # The context manager guarantees the file is flushed and closed.
+        with open("dist/data.json", "w", encoding="utf-8") as file:
+            json.dump(self.data, file, indent=4)
+
+    async def _find_optional(self, selector):
+        """Return the elements matching ``selector``, or ``None`` if the lookup fails."""
+        try:
+            return await self.find_els(selector)
+        except Exception as e:
+            # The section is optional: report the failed lookup and carry on.
+            print(e)
+            return None
+
+    async def _joined_paragraphs(self, selector):
+        """Return one comma-joined string of paragraph texts per matching element.
+
+        Returns ``None`` when no element matching ``selector`` could be found.
+        """
+        entries = await self._find_optional(selector)
+        if entries is None:
+            return None
+        rows = []
+        for entry in entries:
+            await entry.wait_for_selector("p")
+            paragraphs = await entry.query_selector_all("p")
+            rows.append(", ".join([await p.inner_text() for p in paragraphs]))
+        return rows
+
+    async def get_provenance(self):
+        """Return the provenance paragraph texts, or ``None`` if there are none."""
+        provenances = await self._find_optional("#provenance p p")
+        if provenances is None:
+            return None
+        return [await p.inner_text() for p in provenances]
+
+    async def get_exhibitions(self):
+        """Return one joined string per exhibition entry, or ``None``."""
+        return await self._joined_paragraphs("#exhibition article")
+
+    async def get_bibliography(self):
+        """Return one joined string per bibliography entry, or ``None``."""
+        return await self._joined_paragraphs("#publication article")
+
+    async def get_data(self):
+        """Visit every collected href and append one record per artwork."""
+        for href in self.hrefs:
+            await self.go_to(f"{self.base_url}{href}")
+            image = await self.get_image()
+            title = await self.get_title()
+            info = await self.get_info()
+            provenance = await self.get_provenance()
+            exhibitions = await self.get_exhibitions()
+            bibliography = await self.get_bibliography()
+
+            self.curl_image(image, title)
+            self.data.append({
+                "title": title,
+                "date": info["date"],
+                "name_of_artist": "Claude Monet",
+                "technique": info["technique"],
+                "dimensions": info["dimensions"],
+                "signature": info["signature"],
+                "location": None,
+                "image": image,
+                "provenance": provenance,
+                "exhibitions": exhibitions,
+                "bibliography": bibliography,
+            })
--- a/scripts/run_command.py
+++ b/scripts/run_command.py
@ -1,13 +1,13 @@
 import subprocess
-import sys


-def run_command(command: str) -> str:
+def run_command(command: str, isPython: bool = False) -> str:
+    """Run ``command`` through the shell and return its decoded output.
+
+    Args:
+        command: Shell command line to execute.
+        isPython: When True, a non-zero exit status is not reported as a
+            command failure and stdout is returned as usual.
+
+    Returns:
+        The command's stderr when it exits with a non-zero status (and
+        ``isPython`` is False), otherwise its stdout.
+    """
    process = subprocess.run(
        command, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE
    )
-    if process.returncode != 0:
+    if process.returncode != 0 and not isPython:
        print(f"Error running command: {command}")
-        print(process.stderr.decode())
-        sys.exit(process.returncode)
+        # Hand the error text back to the caller; falling through to the
+        # stdout return below would discard it.
+        return process.stderr.decode()
    return process.stdout.decode()
--- a/scripts/start.py
+++ b/scripts/start.py
@ -5,22 +5,29 @@ from get_path import get_path


 def main():
+    """Start the Compose services, run the scraper script, then tear down.
+
+    The compose file, container name and script name can be overridden with
+    the DOCKER_COMPOSE_FILE, SERVICE_NAME and SCRIPT_NAME environment
+    variables.
+    """
    docker_compose_file = os.getenv(
        "DOCKER_COMPOSE_FILE", f"{get_path()}/app/docker-compose.yaml"
    )
-    service_name = os.getenv("SERVICE_NAME", "webscraper")
+    service_name = os.getenv("SERVICE_NAME", "scraper")
    script_name = os.getenv("SCRIPT_NAME", "main.py")
    try:
        print("Starting Docker Compose services...\n")
        run_command(f"docker compose -f {docker_compose_file} up -d")

-        print(run_command(f"docker exec {service_name} python {script_name}"))
-
-        print("Stopping and removing Docker Compose services...")
-        run_command(f"docker compose -f {docker_compose_file} down")
+        print(run_command(f"docker exec -it {service_name} xvfb-run --auto-servernum --server-num=1 --server-args='-screen 0, 1920x1080x24' python3 {script_name}"))
    except subprocess.CalledProcessError as e:
        print("An error occurred while running the script.")
        print(e)
+    except KeyboardInterrupt:
+        print("Keyboard interrupt detected. Exiting...")
+    finally:
+        # Single teardown path: runs whether the script finished, failed or
+        # was interrupted, so no container is left running.
+        print("Stopping and removing Docker Compose services...")
+        run_command(f"docker compose -f {docker_compose_file} down")


 if __name__ == "__main__":
--- a/scripts/threads/commands.py
+++ b/scripts/threads/commands.py
@ -26,7 +26,7 @@ def clearScreen():

 def systemCommand(command: str) -> str:
    words = command[1:].split()
-    if words[0] == "":
+    if not words:
        return "Command not found. Write 'h' for help."
    try:
        print(
@ -57,9 +57,5 @@ def runCondition(command: str) -> bool:

 def runService():
+    """Run main.py inside the scraper container and print its output."""
    print("Running main.py...")
-    print(
-        run_command(
-            "docker exec -it webscraper python main.py",
-        )
-    )
+    # The compose file names the container "scraper", and the scraper opens a
+    # headed browser, so it is started under xvfb-run like scripts/start.py.
+    print(run_command("docker exec -it scraper xvfb-run --auto-servernum --server-num=1 --server-args='-screen 0, 1920x1080x24' python3 main.py", True))
    return None
--- a/scripts/threads/prompt.py
+++ b/scripts/threads/prompt.py
@ -3,32 +3,32 @@ from threads.commands import *
 from run_command import run_command
 from get_path import get_path
 from threads.help_list import help_list
+import time


 def prompt():
+    """Read commands from stdin and dispatch them until the user quits."""
    while True:
-        command = input("> ")
-        if quitCondition(command):
+        try:
+            command = input("> ")
+            if quitCondition(command):
+                quitService(get_path())
+                break
+            elif helpCondition(command):
+                print(help_list())
+            elif clearCondition(command):
+                clearScreen()
+            elif command.startswith("$"):
+                systemCommand(command)
+            elif restartCondition(command):
+                restartService(get_path())
+            elif runCondition(command):
+                runService()
+            elif command == "":
+                pass
+            else:
+                print(f"Command: {command} not found. Write 'h' for help.")
+            time.sleep(0.1)
+        except (KeyboardInterrupt, EOFError):
+            # Ctrl+C / Ctrl+D: stop the services and leave the loop instead of
+            # prompting again after they have been shut down.
+            print("\nExiting...")
            quitService(get_path())
            break
-        if helpCondition(command):
-            print(help_list())
-            continue
-        if clearCondition(command):
-            clearScreen()
-            continue
-        if command.startswith("$"):
-            systemCommand(command)
-            continue
-        if restartCondition(command):
-            restartService(get_path())
-            continue
-        if runCondition(command):
-            runService()
-            continue
-        if command == "":
-            continue
-        else:
-            print("Command not found. Write 'h' for help.")
-            continue
    sys.exit(0)
--- a/scripts/watch.py
+++ b/scripts/watch.py
@ -17,7 +17,7 @@ def main():
    run_command(f"docker compose -f {docker_compose_file} up -d")
    print("Composed!\n")
    print("Running main.py...")
-    print(run_command("docker exec -it webscraper python main.py"))
+    print(run_command("docker exec -it webscraper python main.py", True))
    print(
        "\n\nWrite 'q' to stop program. Don't stop with 'Ctrl + C' otherwise docker container will be still on."
    )
@ -44,7 +44,7 @@ def main():
            if before[f] != after[f]:
                print(f"\nDetected change in {f}")
                print("Running main.py...")
-                print(run_command("docker exec -it webscraper python main.py"))
+                print(run_command("docker exec -it webscraper python main.py", True))
                before[f] = after[f]