feat: scraper for monet arts
Some checks are pending
Docker Image CI / build (push) Waiting to run

Signed-off-by: paprykdev <58005447+paprykdev@users.noreply.github.com>
This commit is contained in:
patilk 2024-12-18 01:41:12 +01:00
parent b560ff1c1c
commit 388964d497
Signed by: s500042
GPG Key ID: 1921AD722E7392EE
14 changed files with 330 additions and 199 deletions

6
.gitignore vendored
View File

@ -30,3 +30,9 @@ build/
# IDE files
.idea/
.vscode/
# Images
images/
# example
example.py

View File

@ -23,4 +23,5 @@ docker-compose.yaml
dist/
build/
# Ignore any other files or directories you want to exclude
# Ignore any other files or directories you want to exclude
.supertajnyplik.donotopen

View File

@ -0,0 +1,36 @@
I USE VIM BTW!
I USE VIM BTW!
I USE VIM BTW!
I USE VIM BTW!
I USE VIM BTW!
I USE VIM BTW!
I USE VIM BTW!
I USE VIM BTW!
I USE VIM BTW!
I USE VIM BTW!
I USE VIM BTW!
I USE VIM BTW!
I USE VIM BTW!
I USE VIM BTW!
I USE VIM BTW!
I USE VIM BTW!
I USE VIM BTW!
I USE VIM BTW!
I USE VIM BTW!
I USE VIM BTW!
I USE VIM BTW!
I USE VIM BTW!
I USE VIM BTW!
I USE VIM BTW!
I USE VIM BTW!
I USE VIM BTW!
I USE VIM BTW!
I USE VIM BTW!
I USE VIM BTW!
I USE VIM BTW!
I USE ARCH BTW!
I USE VIM BTW!
I USE VIM BTW!
I USE VIM BTW!

View File

@ -1,27 +1,42 @@
# services:
# webscraper:
# build:
# context: .
# dockerfile: ./docker/scripts/Dockerfile
# container_name: webscraper
# volumes:
# - .:/usr/src/app
# command:
# - tail
# - -f
# - /dev/null
# selenium-hub:
# image: "selenium/hub:3.141.59"
# container_name: selenium-hub
# ports:
# - "4444:4444"
# # redis:
# # image: "redis:alpine"
# # volumes:
# # - redis_data:/data
# # ports:
# # - "6379:6379"
#
# volumes:
# # redis_data:
# app:
services:
webscraper:
scraper:
build:
context: .
dockerfile: ./docker/scripts/Dockerfile
container_name: webscraper
container_name: scraper
volumes:
- .:/usr/src/app
command:
- tail
- -f
- /dev/null
selenium-hub:
image: "selenium/hub:3.141.59"
container_name: selenium-hub
ports:
- "4444:4444"
# redis:
# image: "redis:alpine"
# volumes:
# - redis_data:/data
# ports:
# - "6379:6379"
volumes:
# redis_data:
app:

View File

@ -1,15 +1,29 @@
FROM python:3.9-slim
# FROM python:3.9-slim
#
# WORKDIR /usr/src/app
#
# COPY requirements.txt .
# RUN pip install --trusted-host pypi.python.org -r requirements.txt
#
# COPY . .
#
# RUN apt-get update && apt-get install -y wget unzip && \
# wget https://dl.google.com/linux/direct/google-chrome-stable_current_amd64.deb && \
# apt install -y ./google-chrome-stable_current_amd64.deb && \
# rm ./google-chrome-stable_current_amd64.deb && \
# apt-get clean
#
# Use the official Playwright Python image as the parent image
FROM mcr.microsoft.com/playwright/python:v1.49.1-jammy
# Set the working directory to /usr/src/app
WORKDIR /usr/src/app
# Copy only requirements.txt first; the rest of the project is copied after the install step
COPY requirements.txt .
RUN pip install --trusted-host pypi.python.org -r requirements.txt
# Run the command to install any necessary dependencies
RUN pip install --no-cache-dir -r requirements.txt
COPY . .
RUN apt-get update && apt-get install -y wget unzip && \
wget https://dl.google.com/linux/direct/google-chrome-stable_current_amd64.deb && \
apt install -y ./google-chrome-stable_current_amd64.deb && \
rm ./google-chrome-stable_current_amd64.deb && \
apt-get clean

View File

@ -1,26 +1,6 @@
from scraper import scrap
import os
import json
urls = ["https://digitalprojects.wpi.art/monet/artworks"]
hrefs = []
def main():
directory = os.path.dirname(os.path.realpath(__file__))
file_path = os.path.join(directory, "dist", "data.json")
scrap(urls[0])
data = []
try:
os.mkdir(os.path.join(directory, "dist"))
except FileExistsError:
pass
with open(file_path, "w", encoding="utf-8") as file:
json.dump(data, file)
print("Data has been scraped!")
import asyncio
from scripts.monet import MonetScraper
if __name__ == "__main__":
main()
scraper = MonetScraper()
asyncio.run(scraper.scrape())

View File

@ -1,24 +1,2 @@
attrs==24.2.0
beautifulsoup4==4.12.3
bs4==0.0.2
certifi==2024.8.30
charset-normalizer==3.4.0
h11==0.14.0
idna==3.10
lxml==5.3.0
outcome==1.3.0.post0
packaging==24.2
PySocks==1.7.1
python-dotenv==1.0.1
playwright==1.49.1
requests==2.32.3
selenium==4.26.1
sniffio==1.3.1
sortedcontainers==2.4.0
soupsieve==2.6
trio==0.27.0
trio-websocket==0.11.1
typing_extensions==4.12.2
urllib3==2.2.3
webdriver-manager==4.0.2
websocket-client==1.8.0
wsproto==1.2.0

View File

@ -1,78 +0,0 @@
import os
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager
import time
class Scraper:
    """Selenium/Chrome helper that collects artwork links from a listing page.

    The instance keeps the target ``url``, the list of collected ``hrefs``
    and the Chrome ``driver`` created by :meth:`load_driver`.
    """

    def __init__(self, url):
        self.url = url
        self.hrefs = []
        self.driver = self.load_driver()

    def load_driver(self) -> webdriver.Chrome:
        """Create a headless, sandbox-less Chrome driver.

        When ``/.dockerenv`` exists the driver binary is installed through
        ``ChromeDriverManager``; otherwise no explicit service is passed.
        """
        chrome_options = webdriver.ChromeOptions()
        for flag in ("--headless", "--no-sandbox"):
            chrome_options.add_argument(flag)

        if os.path.exists("/.dockerenv"):
            chrome_service = Service(ChromeDriverManager().install())
        else:
            chrome_service = None

        return webdriver.Chrome(options=chrome_options, service=chrome_service)

    def skip_cookies(self) -> None:
        """Force-enable the consent button, click it and wait two seconds."""
        consent_locator = (By.CSS_SELECTOR, "[_ngcontent-ng-c745257238]")
        WebDriverWait(self.driver, 5).until(
            EC.presence_of_element_located(consent_locator)
        )
        consent_button = self.driver.find_element(*consent_locator)

        # The button is rendered disabled; strip the attribute and restyle it
        # so that the click below is accepted.
        enable_script = (
            "\n"
            "arguments[0].removeAttribute('disabled');\n"
            "arguments[0].className = 'border-button';\n"
        )
        self.driver.execute_script(enable_script, consent_button)
        consent_button.click()
        time.sleep(2)

    def load_page(self) -> None:
        """Scroll to the bottom of the document, then pause for two seconds."""
        scroll_script = "window.scrollTo(0, document.body.scrollHeight);"
        self.driver.execute_script(scroll_script)
        time.sleep(2)

    def locate_valid_artworks(self) -> list[str]:
        """Append the link of every listed artwork to ``self.hrefs`` and return it."""
        results_locator = (By.CSS_SELECTOR, ".artwork-search-results")
        WebDriverWait(self.driver, 5).until(
            EC.presence_of_element_located(results_locator)
        )
        cards = self.driver.find_elements(
            By.CSS_SELECTOR, ".artwork-search-results article"
        )
        self.hrefs.extend(
            card.find_element(By.CSS_SELECTOR, "a").get_attribute("href")
            for card in cards
        )
        return self.hrefs
def scrap(url: str):
    """Open ``url`` in a fresh ``Scraper``, print the artwork links and return the page HTML.

    The steps are: load the page, dismiss the cookie prompt, scroll to the
    bottom, collect the artwork links (printed to stdout) and read
    ``driver.page_source``.

    Args:
        url: Address of the listing page to open.

    Returns:
        The page source of the loaded page as reported by the driver.
    """
    instance = Scraper(url)
    driver = instance.driver
    try:
        driver.get(url)
        instance.skip_cookies()
        instance.load_page()
        hrefs = instance.locate_valid_artworks()
        print(hrefs)
        return driver.page_source
    finally:
        # Always shut the browser down, even when one of the steps above
        # raises; otherwise the Chrome process is left running.
        driver.quit()

176
app/scripts/monet.py Normal file
View File

@ -0,0 +1,176 @@
import asyncio
import json
import os
import time

import requests
from playwright.async_api import async_playwright
class MonetScraper:
    """Scrape Claude Monet artwork records with Playwright.

    The scraper walks ``pages`` result pages starting at ``url``, collects the
    link of every listed artwork, visits each one and stores a record in
    ``self.data``. Records are written to ``dist/data.json`` and images are
    downloaded to ``images/``.

    Args:
        url: First listing page to open.
        base_url: Prefix joined with each collected href to build the
            artwork page address.
        pages: Number of listing pages to walk (defaults to 3).
    """

    # Record keys filled by get_info, in the order the paragraphs are read.
    _INFO_FIELDS = ("date", "technique", "dimensions", "signature")

    def __init__(
        self,
        url="https://digitalprojects.wpi.art/monet/artworks?page=1",
        base_url="https://digitalprojects.wpi.art",
        pages=3,
    ):
        self.hrefs = []
        self.base_url = base_url
        self.url = url
        self.data = []
        self.pages = pages

    async def scrape(self):
        """Run the whole pipeline: open the listing, collect links, scrape, save."""
        async with async_playwright() as p:
            # Headed mode; presumably a display (e.g. xvfb) is expected to be
            # available where this runs - TODO confirm.
            self.browser = await p.chromium.launch(headless=False)
            try:
                self.context = await self.browser.new_context()
                self.page = await self.context.new_page()
                self.page.set_default_timeout(5000)
                await self.go_to(self.url)
                await self.skip_cookies()
                await self.get_hrefs()
                await self.get_data()
                self.save_data()
            finally:
                # Close the browser even when a step above fails.
                await self.browser.close()

    async def skip_cookies(self):
        """Enable and click the disabled consent button."""
        await self.wait_for_el('.button-disabled')
        await self.page.eval_on_selector('.button-disabled', 'el => el.removeAttribute("disabled")')
        await self.page.click('.button-disabled')

    async def insert_value(self, selector, value):
        """Fill the element matching ``selector`` with ``value``."""
        await self.page.fill(selector, value)

    async def find_el(self, selector: str):
        """Wait for ``selector`` and return the first matching element."""
        await self.wait_for_el(selector)
        return await self.page.query_selector(selector)

    async def find_els(self, selector: str):
        """Wait for ``selector`` and return every matching element."""
        await self.wait_for_el(selector)
        return await self.page.query_selector_all(selector)

    async def wait_for_el(self, selector: str):
        """Block until ``selector`` appears (subject to the page default timeout)."""
        await self.page.wait_for_selector(selector)

    async def go_to(self, url, tabs=False, max_attempts=5):
        """Navigate to ``url``, retrying on failure.

        Args:
            url: Address to open.
            tabs: Unused; kept so existing callers keep working.
            max_attempts: Maximum number of navigation attempts (at least 1).

        Raises:
            Exception: The error of the last attempt, once every attempt failed.
        """
        attempts = max(1, max_attempts)
        for attempt in range(1, attempts + 1):
            try:
                await self.page.goto(url, timeout=60000)
                return
            except Exception as e:
                print(e)
                print(f'error go to {url}')
                if attempt == attempts:
                    # Give up instead of looping forever on a dead address.
                    raise
                # Short pause so an immediately-failing address is not hammered.
                await asyncio.sleep(1)

    async def get_hrefs(self):
        """Collect artwork links from ``self.pages`` listing pages into ``self.hrefs``."""
        for i in range(self.pages):
            if i > 0:
                pagination = await self.find_el('cpd-controls-pagination > button:last-child')
                await pagination.click()
            # Non-blocking pause that gives the result list a moment to settle.
            await asyncio.sleep(1)
            el = await self.find_els('.artwork-search-results > article:not(.not-included) > a')
            for e in el:
                href = await e.get_attribute('href')
                # Skip anchors without a target and links already collected.
                if href and href not in self.hrefs:
                    self.hrefs.append(href)

    @staticmethod
    def _first_srcset_url(srcset):
        """Return the URL of the first ``srcset`` candidate, or ``"null"`` if there is none."""
        candidate = (srcset or "").split(",")[0].split()
        return candidate[0] if candidate else "null"

    async def get_image(self):
        """Return the first image URL of the artwork page, or ``"null"``.

        The ``srcset`` attribute is read up to 11 times, half a second apart,
        while its first candidate is still ``"null"`` or missing.
        """
        image = "null"
        for attempt in range(11):
            if attempt:
                await asyncio.sleep(0.5)
            img = await self.find_el(".not-full-screen-image-container > img")
            image = self._first_srcset_url(await img.get_attribute('srcset'))
            if image != "null":
                break
        return image

    @staticmethod
    def _image_filename(title):
        """Build the ``.jpg`` file name used for an artwork title."""
        stem = title.lower().replace(",", "").replace(" ", "_")
        # A path separator in the title would otherwise point into a
        # (possibly missing) sub-directory.
        for separator in ("/", "\\"):
            stem = stem.replace(separator, "_")
        return f"{stem}.jpg"

    def curl_image(self, image, title):
        """Download ``image`` into ``images/`` under a name derived from ``title``.

        Nothing is downloaded when ``image`` is ``"null"``. Request errors are
        printed and the download is skipped; only a 200 response is written.
        """
        os.makedirs("images", exist_ok=True)
        if image == "null":
            return
        try:
            # Without a timeout a stalled server would hang the whole run.
            image_response = requests.get(image, timeout=30)
        except requests.RequestException as e:
            print(e)
            print(f'error downloading {image}')
            return
        if image_response.status_code == 200:
            with open(os.path.join("images", self._image_filename(title)), 'wb') as img_file:
                img_file.write(image_response.content)

    async def get_title(self):
        """Return the text of the artwork heading."""
        title = await self.find_el(".details h1")
        return await title.inner_text()

    async def get_info(self):
        """Return date, technique, dimensions and signature of the artwork.

        Fields for which the page has no paragraph are ``None``.
        """
        # NOTE(review): the attribute in this selector looks auto-generated
        # and may change when the site is rebuilt - confirm.
        info = await self.find_els("article[_ngcontent-ng-c2311764719] p > p")
        texts = [await p.inner_text() for p in info[:len(self._INFO_FIELDS)]]
        texts += [None] * (len(self._INFO_FIELDS) - len(texts))
        return dict(zip(self._INFO_FIELDS, texts))

    def save_data(self):
        """Write ``self.data`` as indented JSON to ``dist/data.json``."""
        os.makedirs("dist", exist_ok=True)
        with open("dist/data.json", "w", encoding="utf-8") as file:
            json.dump(self.data, file, indent=4)

    async def _find_optional(self, selector):
        """Return the elements matching ``selector``, or ``None`` if the lookup fails."""
        try:
            return await self.find_els(selector)
        except Exception as e:
            # Best effort: the section may be absent for this artwork.
            print(e)
            return None

    async def _get_article_texts(self, selector):
        """Return one comma-joined string per matched article, or ``None``."""
        articles = await self._find_optional(selector)
        if articles is None:
            return None
        arr = []
        for article in articles:
            await article.wait_for_selector("p")
            ps = await article.query_selector_all("p")
            arr.append(", ".join([await p.inner_text() for p in ps]))
        return arr

    async def get_provenance(self):
        """Return the provenance paragraphs, or ``None`` when they cannot be found."""
        provenances = await self._find_optional("#provenance p p")
        if provenances is None:
            return None
        return [await p.inner_text() for p in provenances]

    async def get_exhibitions(self):
        """Return the exhibition entries, or ``None`` when they cannot be found."""
        return await self._get_article_texts("#exhibition article")

    async def get_bibliography(self):
        """Return the bibliography entries, or ``None`` when they cannot be found."""
        return await self._get_article_texts("#publication article")

    async def _scrape_artwork(self, href):
        """Visit one artwork page, download its image and append its record."""
        await self.go_to(f"{self.base_url}{href}")
        image = await self.get_image()
        title = await self.get_title()
        info = await self.get_info()
        provenance = await self.get_provenance()
        exhibitions = await self.get_exhibitions()
        bibliography = await self.get_bibliography()
        self.curl_image(image, title)
        self.data.append({
            "title": title,
            "date": info["date"],
            "name_of_artist": "Claude Monet",
            "technique": info["technique"],
            "dimensions": info["dimensions"],
            "signature": info["signature"],
            "location": None,
            "image": image,
            "provenance": provenance,
            "exhibitions": exhibitions,
            "bibliography": bibliography,
        })

    async def get_data(self):
        """Scrape every collected link; a failing artwork is logged and skipped."""
        for href in self.hrefs:
            try:
                await self._scrape_artwork(href)
            except Exception as e:
                # One broken page must not discard the records already collected.
                print(e)
                print(f'error scraping {href}')

View File

@ -1,13 +1,13 @@
import subprocess
import sys
def run_command(command: str) -> str:
def run_command(command: str, isPython: bool = False) -> str:
process = subprocess.run(
command, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE
)
if process.returncode != 0:
return_massage = ""
if process.returncode != 0 and not isPython:
print(f"Error running command: {command}")
print(process.stderr.decode())
sys.exit(process.returncode)
return process.stdout.decode()
return_massage = process.stderr.decode()
return_massage = process.stdout.decode()
return return_massage

View File

@ -5,22 +5,29 @@ from get_path import get_path
def main():
docker_compose_file = os.getenv(
"DOCKER_COMPOSE_FILE", f"{get_path()}/app/docker-compose.yaml"
)
service_name = os.getenv("SERVICE_NAME", "webscraper")
script_name = os.getenv("SCRIPT_NAME", "main.py")
try:
print("Starting Docker Compose services...\n")
run_command(f"docker compose -f {docker_compose_file} up -d")
docker_compose_file = os.getenv(
"DOCKER_COMPOSE_FILE", f"{get_path()}/app/docker-compose.yaml"
)
service_name = os.getenv("SERVICE_NAME", "scraper")
script_name = os.getenv("SCRIPT_NAME", "main.py")
try:
print("Starting Docker Compose services...\n")
run_command(f"docker compose -f {docker_compose_file} up -d")
print(run_command(f"docker exec {service_name} python {script_name}"))
print(run_command(f"docker exec -it {service_name} xvfb-run --auto-servernum --server-num=1 --server-args='-screen 0, 1920x1080x24' python3 {script_name}"))
print("Stopping and removing Docker Compose services...")
print("Stopping and removing Docker Compose services...")
run_command(f"docker compose -f {docker_compose_file} down")
except subprocess.CalledProcessError as e:
print("An error occurred while running the script.")
print(e)
except KeyboardInterrupt:
print("Keyboard interrupt detected. Exiting...")
run_command(f"docker compose -f {docker_compose_file} down")
except KeyboardInterrupt:
print("Keyboard interrupt detected. Exiting...")
run_command(f"docker compose -f {docker_compose_file} down")
except subprocess.CalledProcessError as e:
print("An error occurred while running the script.")
print(e)
if __name__ == "__main__":

View File

@ -26,7 +26,7 @@ def clearScreen():
def systemCommand(command: str) -> str:
words = command[1:].split()
if words[0] == "":
if not words:
return "Command not found. Write 'h' for help."
try:
print(
@ -57,9 +57,5 @@ def runCondition(command: str) -> bool:
def runService():
print("Running main.py...")
print(
run_command(
"docker exec -it webscraper python main.py",
)
)
print(run_command("docker exec -it webscraper python main.py", True))
return None

View File

@ -3,32 +3,32 @@ from threads.commands import *
from run_command import run_command
from get_path import get_path
from threads.help_list import help_list
import time
def prompt():
while True:
command = input("> ")
if quitCondition(command):
try:
command = input("> ")
if quitCondition(command):
quitService(get_path())
break
elif helpCondition(command):
print(help_list())
elif clearCondition(command):
clearScreen()
elif command.startswith("$"):
systemCommand(command)
elif restartCondition(command):
restartService(get_path())
elif runCondition(command):
runService()
elif command == "":
pass
else:
print(f"Command: {command} not found. Write 'h' for help.")
time.sleep(0.1)
except KeyboardInterrupt:
print("\nExiting...")
quitService(get_path())
break
if helpCondition(command):
print(help_list())
continue
if clearCondition(command):
clearScreen()
continue
if command.startswith("$"):
systemCommand(command)
continue
if restartCondition(command):
restartService(get_path())
continue
if runCondition(command):
runService()
continue
if command == "":
continue
else:
print("Command not found. Write 'h' for help.")
continue
sys.exit(0)

View File

@ -17,7 +17,7 @@ def main():
run_command(f"docker compose -f {docker_compose_file} up -d")
print("Composed!\n")
print("Running main.py...")
print(run_command("docker exec -it webscraper python main.py"))
print(run_command("docker exec -it webscraper python main.py", True))
print(
"\n\nWrite 'q' to stop program. Don't stop with 'Ctrl + C' otherwise docker container will be still on."
)
@ -44,7 +44,7 @@ def main():
if before[f] != after[f]:
print(f"\nDetected change in {f}")
print("Running main.py...")
print(run_command("docker exec -it webscraper python main.py"))
print(run_command("docker exec -it webscraper python main.py", True))
before[f] = after[f]