From 388964d497c026348a84074d1061d74f99a63583 Mon Sep 17 00:00:00 2001 From: paprykdev <58005447+paprykdev@users.noreply.github.com> Date: Wed, 18 Dec 2024 01:41:12 +0100 Subject: [PATCH] feat: scraper for monet arts Signed-off-by: paprykdev <58005447+paprykdev@users.noreply.github.com> --- .gitignore | 6 ++ app/.dockerignore | 3 +- app/.supertajnyplik.donotopen | 36 +++++++ app/docker-compose.yaml | 45 ++++++--- app/docker/scripts/Dockerfile | 32 +++++-- app/main.py | 28 +----- app/requirements.txt | 24 +---- app/scraper.py | 78 --------------- app/scripts/monet.py | 176 ++++++++++++++++++++++++++++++++++ scripts/run_command.py | 12 +-- scripts/start.py | 31 +++--- scripts/threads/commands.py | 8 +- scripts/threads/prompt.py | 46 ++++----- scripts/watch.py | 4 +- 14 files changed, 330 insertions(+), 199 deletions(-) create mode 100644 app/.supertajnyplik.donotopen delete mode 100644 app/scraper.py create mode 100644 app/scripts/monet.py diff --git a/.gitignore b/.gitignore index d18c60c..4fde826 100644 --- a/.gitignore +++ b/.gitignore @@ -30,3 +30,9 @@ build/ # IDE files .idea/ .vscode/ + +# Images +images/ + +# example +example.py diff --git a/app/.dockerignore b/app/.dockerignore index b7f5930..273e0ad 100644 --- a/app/.dockerignore +++ b/app/.dockerignore @@ -23,4 +23,5 @@ docker-compose.yaml dist/ build/ -# Ignore any other files or directories you want to exclude \ No newline at end of file +# Ignore any other files or directories you want to exclude +.supertajnyplik.donotopen diff --git a/app/.supertajnyplik.donotopen b/app/.supertajnyplik.donotopen new file mode 100644 index 0000000..d840c15 --- /dev/null +++ b/app/.supertajnyplik.donotopen @@ -0,0 +1,36 @@ + +I USE VIM BTW! +I USE VIM BTW! +I USE VIM BTW! +I USE VIM BTW! +I USE VIM BTW! +I USE VIM BTW! +I USE VIM BTW! +I USE VIM BTW! +I USE VIM BTW! +I USE VIM BTW! +I USE VIM BTW! +I USE VIM BTW! +I USE VIM BTW! +I USE VIM BTW! +I USE VIM BTW! +I USE VIM BTW! +I USE VIM BTW! +I USE VIM BTW! +I USE VIM BTW! +I USE VIM BTW! +I USE VIM BTW! +I USE VIM BTW! +I USE VIM BTW! +I USE VIM BTW! +I USE VIM BTW! +I USE VIM BTW! +I USE VIM BTW! +I USE VIM BTW! +I USE VIM BTW! +I USE VIM BTW! +I USE ARCH BTW! +I USE VIM BTW! +I USE VIM BTW! +I USE VIM BTW! + diff --git a/app/docker-compose.yaml b/app/docker-compose.yaml index 6491ba7..6e6725b 100644 --- a/app/docker-compose.yaml +++ b/app/docker-compose.yaml @@ -1,27 +1,42 @@ +# services: +# webscraper: +# build: +# context: . +# dockerfile: ./docker/scripts/Dockerfile +# container_name: webscraper +# volumes: +# - .:/usr/src/app +# command: +# - tail +# - -f +# - /dev/null +# selenium-hub: +# image: "selenium/hub:3.141.59" +# container_name: selenium-hub +# ports: +# - "4444:4444" +# # redis: +# # image: "redis:alpine" +# # volumes: +# # - redis_data:/data +# # ports: +# # - "6379:6379" +# +# volumes: +# # redis_data: +# app: + services: - webscraper: + scraper: build: context: . dockerfile: ./docker/scripts/Dockerfile - container_name: webscraper + container_name: scraper volumes: - .:/usr/src/app command: - tail - -f - /dev/null - selenium-hub: - image: "selenium/hub:3.141.59" - container_name: selenium-hub - ports: - - "4444:4444" - # redis: - # image: "redis:alpine" - # volumes: - # - redis_data:/data - # ports: - # - "6379:6379" - volumes: - # redis_data: app: diff --git a/app/docker/scripts/Dockerfile b/app/docker/scripts/Dockerfile index dbe91e4..67655cd 100644 --- a/app/docker/scripts/Dockerfile +++ b/app/docker/scripts/Dockerfile @@ -1,15 +1,29 @@ -FROM python:3.9-slim +# FROM python:3.9-slim +# +# WORKDIR /usr/src/app +# +# COPY requirements.txt . +# RUN pip install --trusted-host pypi.python.org -r requirements.txt +# +# COPY . . +# +# RUN apt-get update && apt-get install -y wget unzip && \ +# wget https://dl.google.com/linux/direct/google-chrome-stable_current_amd64.deb && \ +# apt install -y ./google-chrome-stable_current_amd64.deb && \ +# rm ./google-chrome-stable_current_amd64.deb && \ +# apt-get clean +# +# Use an official Python runtime as a parent image +FROM mcr.microsoft.com/playwright/python:v1.49.1-jammy + +# Set the working directory to /app WORKDIR /usr/src/app +# Copy the current directory contents into the container at /app COPY requirements.txt . -RUN pip install --trusted-host pypi.python.org -r requirements.txt + +# Run the command to install any necessary dependencies +RUN pip install --no-cache-dir -r requirements.txt COPY . . - -RUN apt-get update && apt-get install -y wget unzip && \ - wget https://dl.google.com/linux/direct/google-chrome-stable_current_amd64.deb && \ - apt install -y ./google-chrome-stable_current_amd64.deb && \ - rm ./google-chrome-stable_current_amd64.deb && \ - apt-get clean - diff --git a/app/main.py b/app/main.py index 1292bc6..b0cc139 100644 --- a/app/main.py +++ b/app/main.py @@ -1,26 +1,6 @@ -from scraper import scrap -import os -import json - -urls = ["https://digitalprojects.wpi.art/monet/artworks"] -hrefs = [] - - -def main(): - directory = os.path.dirname(os.path.realpath(__file__)) - file_path = os.path.join(directory, "dist", "data.json") - scrap(urls[0]) - - data = [] - - try: - os.mkdir(os.path.join(directory, "dist")) - except FileExistsError: - pass - with open(file_path, "w", encoding="utf-8") as file: - json.dump(data, file) - print("Data has been scraped!") - +import asyncio +from scripts.monet import MonetScraper if __name__ == "__main__": - main() + scraper = MonetScraper() + asyncio.run(scraper.scrape()) diff --git a/app/requirements.txt b/app/requirements.txt index 979dbab..698e926 100644 --- a/app/requirements.txt +++ b/app/requirements.txt @@ -1,24 +1,2 @@ -attrs==24.2.0 -beautifulsoup4==4.12.3 -bs4==0.0.2 -certifi==2024.8.30 -charset-normalizer==3.4.0 -h11==0.14.0 -idna==3.10 -lxml==5.3.0 -outcome==1.3.0.post0 -packaging==24.2 -PySocks==1.7.1 -python-dotenv==1.0.1 +playwright==1.49.1 requests==2.32.3 -selenium==4.26.1 -sniffio==1.3.1 -sortedcontainers==2.4.0 -soupsieve==2.6 -trio==0.27.0 -trio-websocket==0.11.1 -typing_extensions==4.12.2 -urllib3==2.2.3 -webdriver-manager==4.0.2 -websocket-client==1.8.0 -wsproto==1.2.0 diff --git a/app/scraper.py b/app/scraper.py deleted file mode 100644 index 64aa99a..0000000 --- a/app/scraper.py +++ /dev/null @@ -1,78 +0,0 @@ -import os -from selenium import webdriver -from selenium.webdriver.chrome.service import Service -from selenium.webdriver.common.by import By -from selenium.webdriver.support.ui import WebDriverWait -from selenium.webdriver.support import expected_conditions as EC -from webdriver_manager.chrome import ChromeDriverManager -import time - - -class Scraper: - def __init__(self, url): - self.url = url - self.hrefs = [] - self.driver = self.load_driver() - - def load_driver(self) -> webdriver.Chrome: - options = webdriver.ChromeOptions() - options.add_argument("--headless") - options.add_argument("--no-sandbox") - - return webdriver.Chrome( - options=options, - service=( - Service(ChromeDriverManager().install()) - if os.path.exists("/.dockerenv") - else None - ), - ) - - def skip_cookies(self) -> None: - WebDriverWait(self.driver, 5).until( - EC.presence_of_element_located( - (By.CSS_SELECTOR, "[_ngcontent-ng-c745257238]") - ) - ) - - button = self.driver.find_element(By.CSS_SELECTOR, "[_ngcontent-ng-c745257238]") - self.driver.execute_script( - """ - arguments[0].removeAttribute('disabled'); - arguments[0].className = 'border-button'; - """, - button, - ) - button.click() - time.sleep(2) - - def load_page(self) -> None: - self.driver.execute_script("window.scrollTo(0, document.body.scrollHeight);") - time.sleep(2) - - def locate_valid_artworks(self) -> list[str]: - WebDriverWait(self.driver, 5).until( - EC.presence_of_element_located((By.CSS_SELECTOR, ".artwork-search-results")) - ) - artworks = self.driver.find_elements( - By.CSS_SELECTOR, ".artwork-search-results article" - ) - for artwork in artworks: - href = artwork.find_element(By.CSS_SELECTOR, "a").get_attribute("href") - self.hrefs.append(href) - return self.hrefs - - -def scrap(url: str): - instance = Scraper(url) - driver = instance.driver - driver.get(url) - - instance.skip_cookies() - instance.load_page() - hrefs = instance.locate_valid_artworks() - - print(hrefs) - html = driver.page_source - driver.quit() - return html diff --git a/app/scripts/monet.py b/app/scripts/monet.py new file mode 100644 index 0000000..2fdd0f7 --- /dev/null +++ b/app/scripts/monet.py @@ -0,0 +1,176 @@ +import time +import requests +import os +import json +from playwright.async_api import async_playwright + + +class MonetScraper: + def __init__(self, url="https://digitalprojects.wpi.art/monet/artworks?page=1", base_url="https://digitalprojects.wpi.art"): + self.hrefs = [] + self.base_url = base_url + self.url = url + self.data = [] + self.pages = 3 + + async def scrape(self): + async with async_playwright() as p: + self.browser = await p.chromium.launch(headless=False) + self.context = await self.browser.new_context() + self.page = await self.context.new_page() + self.page.set_default_timeout(5000) + await self.go_to(self.url) + await self.skip_cookies() + await self.get_hrefs() + await self.get_data() + self.save_data() + await self.browser.close() + + async def skip_cookies(self): + await self.wait_for_el('.button-disabled') + await self.page.eval_on_selector('.button-disabled', 'el => el.removeAttribute("disabled")') + await self.page.click('.button-disabled') + + async def insert_value(self, selector, value): + await self.page.fill(selector, value) + + async def find_el(self, selector: str): + await self.wait_for_el(selector) + return await self.page.query_selector(selector) + + async def find_els(self, selector: str): + await self.wait_for_el(selector) + return await self.page.query_selector_all(selector) + + async def wait_for_el(self, selector: str): + await self.page.wait_for_selector(selector) + + async def go_to(self, url, tabs=False): + hack = True + while hack: + try: + await self.page.goto(url, timeout=60000) + hack = False + except Exception as e: + print(e) + print(f'error go to {url}') + + async def get_hrefs(self): + for i in range(self.pages): + if i > 0: + pagination = await self.find_el('cpd-controls-pagination > button:last-child') + await pagination.click() + time.sleep(1) + el = await self.find_els('.artwork-search-results > article:not(.not-included) > a') + for e in el: + self.hrefs.append(await e.get_attribute('href')) + + async def get_image(self): + image = await self.find_el(".not-full-screen-image-container > img") + image = await image.get_attribute('srcset') + image = image.split(",")[0].split(" ")[0] + i = 0 + while image == "null" and i < 10: + image = await self.find_el(".not-full-screen-image-container > img") + image = await image.get_attribute('srcset') + image = image.split(",")[0].split(" ")[0] + time.sleep(0.5) + i += 1 + + return image + + def curl_image(self, image, title): + try: + os.mkdir("images") + except FileExistsError: + pass + + if image != "null": + image_response = requests.get(image) + if image_response.status_code == 200: + with open(f'images/{title.lower().replace(",", "").replace(" ", "_")}.jpg', 'wb') as img_file: + img_file.write(image_response.content) + + async def get_title(self): + title = await self.find_el(".details h1") + title = await title.inner_text() + return title + + async def get_info(self): + info = await self.find_els("article[_ngcontent-ng-c2311764719] p > p") + return { + "date": await info[0].inner_text(), + "technique": await info[1].inner_text(), + "dimensions": await info[2].inner_text(), + "signature": await info[3].inner_text(), + } + + def save_data(self): + try: + os.mkdir("dist") + except FileExistsError: + pass + open("dist/data.json", + "w").write(json.dumps([d for d in self.data], indent=4)) + + async def get_provenance(self): + provenances = None + try: + provenances = await self.find_els("#provenance p p") + except Exception as e: + print(e) + return None + return [await p.inner_text() for p in provenances] + + async def get_exhibitions(self): + exhibitions = None + try: + exhibitions = await self.find_els("#exhibition article") + except Exception as e: + print(e) + return None + arr = [] + for paragraph in exhibitions: + await paragraph.wait_for_selector("p") + ps = await paragraph.query_selector_all("p") + arr.append(", ".join([await p.inner_text() for p in ps])) + return arr + + async def get_bibliography(self): + bibliography = None + try: + bibliography = await self.find_els("#publication article") + except Exception as e: + print(e) + return None + arr = [] + for paragraph in bibliography: + await paragraph.wait_for_selector("p") + ps = await paragraph.query_selector_all("p") + arr.append(", ".join([await p.inner_text() for p in ps])) + return arr + + async def get_data(self): + for href in self.hrefs: + await self.go_to(f"{self.base_url}{href}") + image = await self.get_image() + title = await self.get_title() + get_info = await self.get_info() + provenance = await self.get_provenance() + exhibitions = await self.get_exhibitions() + bibliography = await self.get_bibliography() + + self.curl_image(image, title) + self.data.append({ + "title": title, + "date": get_info["date"], + "name_of_artist": "Claude Monet", + "technique": get_info["technique"], + "dimensions": get_info["dimensions"], + "signature": get_info["signature"], + "location": None, + "image": image, + "provenance": provenance, + "exhibitions": exhibitions, + "bibliography": bibliography, + }) diff --git a/scripts/run_command.py b/scripts/run_command.py index df64010..800c463 100644 --- a/scripts/run_command.py +++ b/scripts/run_command.py @@ -1,13 +1,13 @@ import subprocess -import sys -def run_command(command: str) -> str: +def run_command(command: str, isPython: bool = False) -> str: process = subprocess.run( command, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE ) - if process.returncode != 0: + return_massage = "" + if process.returncode != 0 and not isPython: print(f"Error running command: {command}") - print(process.stderr.decode()) - sys.exit(process.returncode) - return process.stdout.decode() + return_massage = process.stderr.decode() + return_massage = process.stdout.decode() + return return_massage diff --git a/scripts/start.py b/scripts/start.py index 25a0a37..029a74d 100644 --- a/scripts/start.py +++ b/scripts/start.py @@ -5,22 +5,29 @@ from get_path import get_path def main(): - docker_compose_file = os.getenv( - "DOCKER_COMPOSE_FILE", f"{get_path()}/app/docker-compose.yaml" - ) - service_name = os.getenv("SERVICE_NAME", "webscraper") - script_name = os.getenv("SCRIPT_NAME", "main.py") try: - print("Starting Docker Compose services...\n") - run_command(f"docker compose -f {docker_compose_file} up -d") + docker_compose_file = os.getenv( + "DOCKER_COMPOSE_FILE", f"{get_path()}/app/docker-compose.yaml" + ) + service_name = os.getenv("SERVICE_NAME", "scraper") + script_name = os.getenv("SCRIPT_NAME", "main.py") + try: + print("Starting Docker Compose services...\n") + run_command(f"docker compose -f {docker_compose_file} up -d") - print(run_command(f"docker exec {service_name} python {script_name}")) + print(run_command(f"docker exec -it {service_name} xvfb-run --auto-servernum --server-num=1 --server-args='-screen 0, 1920x1080x24' python3 {script_name}")) - print("Stopping and removing Docker Compose services...") + print("Stopping and removing Docker Compose services...") + run_command(f"docker compose -f {docker_compose_file} down") + except subprocess.CalledProcessError as e: + print("An error occurred while running the script.") + print(e) + except KeyboardInterrupt: + print("Keyboard interrupt detected. Exiting...") + run_command(f"docker compose -f {docker_compose_file} down") + except KeyboardInterrupt: + print("Keyboard interrupt detected. Exiting...") run_command(f"docker compose -f {docker_compose_file} down") - except subprocess.CalledProcessError as e: - print("An error occurred while running the script.") - print(e) if __name__ == "__main__": diff --git a/scripts/threads/commands.py b/scripts/threads/commands.py index e687e36..7df4150 100644 --- a/scripts/threads/commands.py +++ b/scripts/threads/commands.py @@ -26,7 +26,7 @@ def clearScreen(): def systemCommand(command: str) -> str: words = command[1:].split() - if words[0] == "": + if not words: return "Command not found. Write 'h' for help." try: print( @@ -57,9 +57,5 @@ def runCondition(command: str) -> bool: def runService(): print("Running main.py...") - print( - run_command( - "docker exec -it webscraper python main.py", - ) - ) + print(run_command("docker exec -it webscraper python main.py", True)) return None diff --git a/scripts/threads/prompt.py b/scripts/threads/prompt.py index c788b94..ab35a69 100644 --- a/scripts/threads/prompt.py +++ b/scripts/threads/prompt.py @@ -3,32 +3,32 @@ from threads.commands import * from run_command import run_command from get_path import get_path from threads.help_list import help_list +import time def prompt(): while True: - command = input("> ") - if quitCondition(command): + try: + command = input("> ") + if quitCondition(command): + quitService(get_path()) + break + elif helpCondition(command): + print(help_list()) + elif clearCondition(command): + clearScreen() + elif command.startswith("$"): + systemCommand(command) + elif restartCondition(command): + restartService(get_path()) + elif runCondition(command): + runService() + elif command == "": + pass + else: + print(f"Command: {command} not found. Write 'h' for help.") + time.sleep(0.1) + except KeyboardInterrupt: + print("\nExiting...") quitService(get_path()) - break - if helpCondition(command): - print(help_list()) - continue - if clearCondition(command): - clearScreen() - continue - if command.startswith("$"): - systemCommand(command) - continue - if restartCondition(command): - restartService(get_path()) - continue - if runCondition(command): - runService() - continue - if command == "": - continue - else: - print("Command not found. Write 'h' for help.") - continue sys.exit(0) diff --git a/scripts/watch.py b/scripts/watch.py index f13e166..2c9e16e 100644 --- a/scripts/watch.py +++ b/scripts/watch.py @@ -17,7 +17,7 @@ def main(): run_command(f"docker compose -f {docker_compose_file} up -d") print("Composed!\n") print("Running main.py...") - print(run_command("docker exec -it webscraper python main.py")) + print(run_command("docker exec -it webscraper python main.py", True)) print( "\n\nWrite 'q' to stop program. Don't stop with 'Ctrl + C' otherwise docker container will be still on." ) @@ -44,7 +44,7 @@ def main(): if before[f] != after[f]: print(f"\nDetected change in {f}") print("Running main.py...") - print(run_command("docker exec -it webscraper python main.py")) + print(run_command("docker exec -it webscraper python main.py", True)) before[f] = after[f]