Signed-off-by: paprykdev <58005447+paprykdev@users.noreply.github.com>
This commit is contained in:
parent
b560ff1c1c
commit
388964d497
6
.gitignore
vendored
6
.gitignore
vendored
@ -30,3 +30,9 @@ build/
|
||||
# IDE files
|
||||
.idea/
|
||||
.vscode/
|
||||
|
||||
# Images
|
||||
images/
|
||||
|
||||
# example
|
||||
example.py
|
||||
|
@ -23,4 +23,5 @@ docker-compose.yaml
|
||||
dist/
|
||||
build/
|
||||
|
||||
# Ignore any other files or directories you want to exclude
|
||||
# Ignore any other files or directories you want to exclude
|
||||
.supertajnyplik.donotopen
|
||||
|
36
app/.supertajnyplik.donotopen
Normal file
36
app/.supertajnyplik.donotopen
Normal file
@ -0,0 +1,36 @@
|
||||
|
||||
I USE VIM BTW!
|
||||
I USE VIM BTW!
|
||||
I USE VIM BTW!
|
||||
I USE VIM BTW!
|
||||
I USE VIM BTW!
|
||||
I USE VIM BTW!
|
||||
I USE VIM BTW!
|
||||
I USE VIM BTW!
|
||||
I USE VIM BTW!
|
||||
I USE VIM BTW!
|
||||
I USE VIM BTW!
|
||||
I USE VIM BTW!
|
||||
I USE VIM BTW!
|
||||
I USE VIM BTW!
|
||||
I USE VIM BTW!
|
||||
I USE VIM BTW!
|
||||
I USE VIM BTW!
|
||||
I USE VIM BTW!
|
||||
I USE VIM BTW!
|
||||
I USE VIM BTW!
|
||||
I USE VIM BTW!
|
||||
I USE VIM BTW!
|
||||
I USE VIM BTW!
|
||||
I USE VIM BTW!
|
||||
I USE VIM BTW!
|
||||
I USE VIM BTW!
|
||||
I USE VIM BTW!
|
||||
I USE VIM BTW!
|
||||
I USE VIM BTW!
|
||||
I USE VIM BTW!
|
||||
I USE ARCH BTW!
|
||||
I USE VIM BTW!
|
||||
I USE VIM BTW!
|
||||
I USE VIM BTW!
|
||||
|
@ -1,27 +1,42 @@
|
||||
# services:
|
||||
# webscraper:
|
||||
# build:
|
||||
# context: .
|
||||
# dockerfile: ./docker/scripts/Dockerfile
|
||||
# container_name: webscraper
|
||||
# volumes:
|
||||
# - .:/usr/src/app
|
||||
# command:
|
||||
# - tail
|
||||
# - -f
|
||||
# - /dev/null
|
||||
# selenium-hub:
|
||||
# image: "selenium/hub:3.141.59"
|
||||
# container_name: selenium-hub
|
||||
# ports:
|
||||
# - "4444:4444"
|
||||
# # redis:
|
||||
# # image: "redis:alpine"
|
||||
# # volumes:
|
||||
# # - redis_data:/data
|
||||
# # ports:
|
||||
# # - "6379:6379"
|
||||
#
|
||||
# volumes:
|
||||
# # redis_data:
|
||||
# app:
|
||||
|
||||
services:
|
||||
webscraper:
|
||||
scraper:
|
||||
build:
|
||||
context: .
|
||||
dockerfile: ./docker/scripts/Dockerfile
|
||||
container_name: webscraper
|
||||
container_name: scraper
|
||||
volumes:
|
||||
- .:/usr/src/app
|
||||
command:
|
||||
- tail
|
||||
- -f
|
||||
- /dev/null
|
||||
selenium-hub:
|
||||
image: "selenium/hub:3.141.59"
|
||||
container_name: selenium-hub
|
||||
ports:
|
||||
- "4444:4444"
|
||||
# redis:
|
||||
# image: "redis:alpine"
|
||||
# volumes:
|
||||
# - redis_data:/data
|
||||
# ports:
|
||||
# - "6379:6379"
|
||||
|
||||
volumes:
|
||||
# redis_data:
|
||||
app:
|
||||
|
@ -1,15 +1,29 @@
|
||||
FROM python:3.9-slim
|
||||
# FROM python:3.9-slim
|
||||
#
|
||||
# WORKDIR /usr/src/app
|
||||
#
|
||||
# COPY requirements.txt .
|
||||
# RUN pip install --trusted-host pypi.python.org -r requirements.txt
|
||||
#
|
||||
# COPY . .
|
||||
#
|
||||
# RUN apt-get update && apt-get install -y wget unzip && \
|
||||
# wget https://dl.google.com/linux/direct/google-chrome-stable_current_amd64.deb && \
|
||||
# apt install -y ./google-chrome-stable_current_amd64.deb && \
|
||||
# rm ./google-chrome-stable_current_amd64.deb && \
|
||||
# apt-get clean
|
||||
#
|
||||
|
||||
# Use the official Playwright Python image (Ubuntu Jammy) as the parent image
|
||||
FROM mcr.microsoft.com/playwright/python:v1.49.1-jammy
|
||||
|
||||
# Set the working directory to /usr/src/app
|
||||
WORKDIR /usr/src/app
|
||||
|
||||
# Copy the dependency list into the working directory
|
||||
COPY requirements.txt .
|
||||
RUN pip install --trusted-host pypi.python.org -r requirements.txt
|
||||
|
||||
# Run the command to install any necessary dependencies
|
||||
RUN pip install --no-cache-dir -r requirements.txt
|
||||
|
||||
COPY . .
|
||||
|
||||
RUN apt-get update && apt-get install -y wget unzip && \
|
||||
wget https://dl.google.com/linux/direct/google-chrome-stable_current_amd64.deb && \
|
||||
apt install -y ./google-chrome-stable_current_amd64.deb && \
|
||||
rm ./google-chrome-stable_current_amd64.deb && \
|
||||
apt-get clean
|
||||
|
||||
|
28
app/main.py
28
app/main.py
@ -1,26 +1,6 @@
|
||||
from scraper import scrap
|
||||
import os
|
||||
import json
|
||||
|
||||
urls = ["https://digitalprojects.wpi.art/monet/artworks"]
|
||||
hrefs = []
|
||||
|
||||
|
||||
def main():
|
||||
directory = os.path.dirname(os.path.realpath(__file__))
|
||||
file_path = os.path.join(directory, "dist", "data.json")
|
||||
scrap(urls[0])
|
||||
|
||||
data = []
|
||||
|
||||
try:
|
||||
os.mkdir(os.path.join(directory, "dist"))
|
||||
except FileExistsError:
|
||||
pass
|
||||
with open(file_path, "w", encoding="utf-8") as file:
|
||||
json.dump(data, file)
|
||||
print("Data has been scraped!")
|
||||
|
||||
import asyncio
|
||||
from scripts.monet import MonetScraper
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
scraper = MonetScraper()
|
||||
asyncio.run(scraper.scrape())
|
||||
|
@ -1,24 +1,2 @@
|
||||
attrs==24.2.0
|
||||
beautifulsoup4==4.12.3
|
||||
bs4==0.0.2
|
||||
certifi==2024.8.30
|
||||
charset-normalizer==3.4.0
|
||||
h11==0.14.0
|
||||
idna==3.10
|
||||
lxml==5.3.0
|
||||
outcome==1.3.0.post0
|
||||
packaging==24.2
|
||||
PySocks==1.7.1
|
||||
python-dotenv==1.0.1
|
||||
playwright==1.49.1
|
||||
requests==2.32.3
|
||||
selenium==4.26.1
|
||||
sniffio==1.3.1
|
||||
sortedcontainers==2.4.0
|
||||
soupsieve==2.6
|
||||
trio==0.27.0
|
||||
trio-websocket==0.11.1
|
||||
typing_extensions==4.12.2
|
||||
urllib3==2.2.3
|
||||
webdriver-manager==4.0.2
|
||||
websocket-client==1.8.0
|
||||
wsproto==1.2.0
|
||||
|
@ -1,78 +0,0 @@
|
||||
import os
|
||||
from selenium import webdriver
|
||||
from selenium.webdriver.chrome.service import Service
|
||||
from selenium.webdriver.common.by import By
|
||||
from selenium.webdriver.support.ui import WebDriverWait
|
||||
from selenium.webdriver.support import expected_conditions as EC
|
||||
from webdriver_manager.chrome import ChromeDriverManager
|
||||
import time
|
||||
|
||||
|
||||
class Scraper:
    """Selenium/Chrome helper that gathers artwork links from a listing page."""

    def __init__(self, url):
        """Keep the target URL, start with no links and create the driver."""
        self.url = url
        self.hrefs = []
        self.driver = self.load_driver()

    def load_driver(self) -> webdriver.Chrome:
        """Build a headless, sandbox-less Chrome driver.

        When ``/.dockerenv`` exists the driver binary is installed through
        ``ChromeDriverManager``; otherwise no explicit service is passed.
        """
        chrome_options = webdriver.ChromeOptions()
        for flag in ("--headless", "--no-sandbox"):
            chrome_options.add_argument(flag)

        if os.path.exists("/.dockerenv"):
            chrome_service = Service(ChromeDriverManager().install())
        else:
            chrome_service = None
        return webdriver.Chrome(options=chrome_options, service=chrome_service)

    def skip_cookies(self) -> None:
        """Enable and click the element matched by the consent selector."""
        consent_locator = (By.CSS_SELECTOR, "[_ngcontent-ng-c745257238]")
        waiter = WebDriverWait(self.driver, 5)
        waiter.until(EC.presence_of_element_located(consent_locator))

        button = self.driver.find_element(*consent_locator)
        # The button is rendered disabled; strip the attribute and swap its
        # class before clicking it.
        enable_script = """
            arguments[0].removeAttribute('disabled');
            arguments[0].className = 'border-button';
            """
        self.driver.execute_script(enable_script, button)
        button.click()
        time.sleep(2)

    def load_page(self) -> None:
        """Scroll to the bottom of the document and pause for two seconds."""
        scroll_script = "window.scrollTo(0, document.body.scrollHeight);"
        self.driver.execute_script(scroll_script)
        time.sleep(2)

    def locate_valid_artworks(self) -> list[str]:
        """Append the link of every listed artwork to ``hrefs`` and return it."""
        results_locator = (By.CSS_SELECTOR, ".artwork-search-results")
        WebDriverWait(self.driver, 5).until(
            EC.presence_of_element_located(results_locator)
        )
        cards = self.driver.find_elements(
            By.CSS_SELECTOR, ".artwork-search-results article"
        )
        self.hrefs.extend(
            card.find_element(By.CSS_SELECTOR, "a").get_attribute("href")
            for card in cards
        )
        return self.hrefs
|
||||
|
||||
|
||||
def scrap(url: str):
    """Open ``url`` in a fresh ``Scraper`` and return the page's HTML source.

    The collected artwork links are printed as a side effect.

    Args:
        url: Address of the listing page to load.

    Returns:
        The driver's ``page_source`` after the page has been prepared.
    """
    instance = Scraper(url)
    driver = instance.driver
    try:
        driver.get(url)

        instance.skip_cookies()
        instance.load_page()
        hrefs = instance.locate_valid_artworks()

        print(hrefs)
        return driver.page_source
    finally:
        # Always release the browser, even when a wait times out or a click
        # fails, so no Chrome process is left running.
        driver.quit()
|
176
app/scripts/monet.py
Normal file
176
app/scripts/monet.py
Normal file
@ -0,0 +1,176 @@
|
||||
import time
|
||||
import requests
|
||||
import os
|
||||
import json
|
||||
from playwright.async_api import async_playwright
|
||||
|
||||
|
||||
class MonetScraper:
    """Scrape Claude Monet artwork records with Playwright.

    ``scrape`` walks ``pages`` listing pages starting at ``url``, collects the
    artwork links, visits each one, downloads its image into ``images/`` and
    finally writes every record to ``dist/data.json``.
    """

    def __init__(
        self,
        url="https://digitalprojects.wpi.art/monet/artworks?page=1",
        base_url="https://digitalprojects.wpi.art",
    ):
        """Store the start URL and site root and reset the collected state.

        Args:
            url: Listing page the scrape starts from.
            base_url: Prefix joined with every collected href in ``get_data``.
        """
        self.hrefs = []
        self.base_url = base_url
        self.url = url
        self.data = []
        # Number of listing pages walked by get_hrefs().
        self.pages = 3

    async def scrape(self):
        """Run the whole pipeline: open the browser, collect, scrape, save."""
        async with async_playwright() as p:
            # The browser is launched headed (headless=False).
            self.browser = await p.chromium.launch(headless=False)
            try:
                self.context = await self.browser.new_context()
                self.page = await self.context.new_page()
                self.page.set_default_timeout(5000)
                await self.go_to(self.url)
                await self.skip_cookies()
                await self.get_hrefs()
                await self.get_data()
                self.save_data()
            finally:
                # Close the browser even when a step above raises.
                await self.browser.close()

    async def skip_cookies(self):
        """Enable the ``.button-disabled`` element and click it."""
        await self.wait_for_el('.button-disabled')
        await self.page.eval_on_selector('.button-disabled', 'el => el.removeAttribute("disabled")')
        await self.page.click('.button-disabled')

    async def insert_value(self, selector, value):
        """Fill the element matched by ``selector`` with ``value``."""
        await self.page.fill(selector, value)

    async def find_el(self, selector: str):
        """Wait for ``selector`` and return the first matching element."""
        await self.wait_for_el(selector)
        return await self.page.query_selector(selector)

    async def find_els(self, selector: str):
        """Wait for ``selector`` and return every matching element."""
        await self.wait_for_el(selector)
        return await self.page.query_selector_all(selector)

    async def wait_for_el(self, selector: str):
        """Block until ``selector`` resolves or the page timeout expires."""
        await self.page.wait_for_selector(selector)

    async def go_to(self, url, tabs=False, max_attempts=5):
        """Navigate to ``url``, retrying a bounded number of times.

        Args:
            url: Address to open.
            tabs: Unused; kept so existing callers keep working.
            max_attempts: How many navigation attempts to make before giving
                up (values below 1 are treated as 1).

        Raises:
            Exception: The error of the final attempt, once every attempt
                has failed.
        """
        attempts = max(1, max_attempts)
        for attempt in range(1, attempts + 1):
            try:
                await self.page.goto(url, timeout=60000)
                return
            except Exception as e:
                print(e)
                print(f'error go to {url}')
                if attempt == attempts:
                    # Give up instead of looping forever on a dead URL.
                    raise

    async def get_hrefs(self):
        """Collect the artwork links of ``pages`` consecutive listing pages."""
        for i in range(self.pages):
            if i > 0:
                pagination = await self.find_el('cpd-controls-pagination > button:last-child')
                await pagination.click()
                # Non-blocking pause so the next page can render; time.sleep
                # would stall the event loop Playwright runs on.
                await self.page.wait_for_timeout(1000)
            el = await self.find_els('.artwork-search-results > article:not(.not-included) > a')
            for e in el:
                self.hrefs.append(await e.get_attribute('href'))

    @staticmethod
    def _first_srcset_url(srcset):
        """Return the first URL of a ``srcset`` value, or "null" when empty."""
        if not srcset:
            return "null"
        return srcset.split(",")[0].split(" ")[0]

    async def _read_image_url(self):
        """Read the current image URL from the artwork's ``img`` element."""
        element = await self.find_el(".not-full-screen-image-container > img")
        return self._first_srcset_url(await element.get_attribute('srcset'))

    async def get_image(self):
        """Return the artwork image URL, polling while it is still "null".

        Up to ten re-reads are made, half a second apart; "null" is returned
        if the URL never becomes available.
        """
        image = await self._read_image_url()
        retries = 0
        while image == "null" and retries < 10:
            await self.page.wait_for_timeout(500)
            image = await self._read_image_url()
            retries += 1

        return image

    @staticmethod
    def _image_filename(title):
        """Derive the ``.jpg`` file name used for an artwork title."""
        stem = title.lower().replace(",", "").replace(" ", "_")
        # A path separator inside a title must not escape the images folder.
        stem = stem.replace("/", "_").replace("\\", "_")
        return f"{stem}.jpg"

    def curl_image(self, image, title):
        """Download ``image`` into ``images/`` under a name built from ``title``.

        Nothing is downloaded when ``image`` is empty or "null", and nothing
        is written unless the server answers with status 200.
        """
        os.makedirs("images", exist_ok=True)

        if not image or image == "null":
            return
        # A timeout keeps one stalled download from hanging the whole run.
        image_response = requests.get(image, timeout=30)
        if image_response.status_code == 200:
            target = os.path.join("images", self._image_filename(title))
            with open(target, 'wb') as img_file:
                img_file.write(image_response.content)

    async def get_title(self):
        """Return the text of the artwork heading."""
        title = await self.find_el(".details h1")
        return await title.inner_text()

    async def get_info(self):
        """Return date, technique, dimensions and signature of the artwork.

        The values are taken, in that order, from the matched paragraphs;
        a key whose paragraph is missing is set to ``None``.
        """
        info = await self.find_els("article[_ngcontent-ng-c2311764719] p > p")
        keys = ("date", "technique", "dimensions", "signature")
        details = dict.fromkeys(keys)
        for key, element in zip(keys, info):
            details[key] = await element.inner_text()
        return details

    def save_data(self):
        """Write the collected records to ``dist/data.json``."""
        os.makedirs("dist", exist_ok=True)
        with open("dist/data.json", "w", encoding="utf-8") as file:
            json.dump(self.data, file, indent=4)

    async def get_provenance(self):
        """Return the provenance paragraphs, or ``None`` if they cannot be found."""
        try:
            provenances = await self.find_els("#provenance p p")
        except Exception as e:
            # Best effort: report the failed lookup and carry on without it.
            print(e)
            return None
        return [await p.inner_text() for p in provenances]

    async def _collect_article_texts(self, selector):
        """Join the paragraphs of each article matched by ``selector``.

        Returns one comma-separated string per article, or ``None`` when the
        lookup of ``selector`` fails.
        """
        try:
            articles = await self.find_els(selector)
        except Exception as e:
            # Best effort: report the failed lookup and carry on without it.
            print(e)
            return None
        texts = []
        for article in articles:
            await article.wait_for_selector("p")
            paragraphs = await article.query_selector_all("p")
            texts.append(", ".join([await p.inner_text() for p in paragraphs]))
        return texts

    async def get_exhibitions(self):
        """Return one string per exhibition entry, or ``None`` on lookup failure."""
        return await self._collect_article_texts("#exhibition article")

    async def get_bibliography(self):
        """Return one string per publication entry, or ``None`` on lookup failure."""
        return await self._collect_article_texts("#publication article")

    async def get_data(self):
        """Visit every collected link and append its record to ``data``."""
        for href in self.hrefs:
            await self.go_to(f"{self.base_url}{href}")
            image = await self.get_image()
            title = await self.get_title()
            info = await self.get_info()
            provenance = await self.get_provenance()
            exhibitions = await self.get_exhibitions()
            bibliography = await self.get_bibliography()

            self.curl_image(image, title)
            self.data.append({
                "title": title,
                "date": info["date"],
                "name_of_artist": "Claude Monet",
                "technique": info["technique"],
                "dimensions": info["dimensions"],
                "signature": info["signature"],
                "location": None,
                "image": image,
                "provenance": provenance,
                "exhibitions": exhibitions,
                "bibliography": bibliography,
            })
|
@ -1,13 +1,13 @@
|
||||
import subprocess
|
||||
import sys
|
||||
|
||||
|
||||
def run_command(command: str) -> str:
|
||||
def run_command(command: str, isPython: bool = False) -> str:
|
||||
process = subprocess.run(
|
||||
command, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE
|
||||
)
|
||||
if process.returncode != 0:
|
||||
return_massage = ""
|
||||
if process.returncode != 0 and not isPython:
|
||||
print(f"Error running command: {command}")
|
||||
print(process.stderr.decode())
|
||||
sys.exit(process.returncode)
|
||||
return process.stdout.decode()
|
||||
return_massage = process.stderr.decode()
|
||||
return_massage = process.stdout.decode()
|
||||
return return_massage
|
||||
|
@ -5,22 +5,29 @@ from get_path import get_path
|
||||
|
||||
|
||||
def main():
|
||||
docker_compose_file = os.getenv(
|
||||
"DOCKER_COMPOSE_FILE", f"{get_path()}/app/docker-compose.yaml"
|
||||
)
|
||||
service_name = os.getenv("SERVICE_NAME", "webscraper")
|
||||
script_name = os.getenv("SCRIPT_NAME", "main.py")
|
||||
try:
|
||||
print("Starting Docker Compose services...\n")
|
||||
run_command(f"docker compose -f {docker_compose_file} up -d")
|
||||
docker_compose_file = os.getenv(
|
||||
"DOCKER_COMPOSE_FILE", f"{get_path()}/app/docker-compose.yaml"
|
||||
)
|
||||
service_name = os.getenv("SERVICE_NAME", "scraper")
|
||||
script_name = os.getenv("SCRIPT_NAME", "main.py")
|
||||
try:
|
||||
print("Starting Docker Compose services...\n")
|
||||
run_command(f"docker compose -f {docker_compose_file} up -d")
|
||||
|
||||
print(run_command(f"docker exec {service_name} python {script_name}"))
|
||||
print(run_command(f"docker exec -it {service_name} xvfb-run --auto-servernum --server-num=1 --server-args='-screen 0, 1920x1080x24' python3 {script_name}"))
|
||||
|
||||
print("Stopping and removing Docker Compose services...")
|
||||
print("Stopping and removing Docker Compose services...")
|
||||
run_command(f"docker compose -f {docker_compose_file} down")
|
||||
except subprocess.CalledProcessError as e:
|
||||
print("An error occurred while running the script.")
|
||||
print(e)
|
||||
except KeyboardInterrupt:
|
||||
print("Keyboard interrupt detected. Exiting...")
|
||||
run_command(f"docker compose -f {docker_compose_file} down")
|
||||
except KeyboardInterrupt:
|
||||
print("Keyboard interrupt detected. Exiting...")
|
||||
run_command(f"docker compose -f {docker_compose_file} down")
|
||||
except subprocess.CalledProcessError as e:
|
||||
print("An error occurred while running the script.")
|
||||
print(e)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
@ -26,7 +26,7 @@ def clearScreen():
|
||||
|
||||
def systemCommand(command: str) -> str:
|
||||
words = command[1:].split()
|
||||
if words[0] == "":
|
||||
if not words:
|
||||
return "Command not found. Write 'h' for help."
|
||||
try:
|
||||
print(
|
||||
@ -57,9 +57,5 @@ def runCondition(command: str) -> bool:
|
||||
|
||||
def runService():
    """Run ``main.py`` inside the scraper container and print its output.

    The command is passed to ``run_command`` with its second argument set to
    ``True``.
    """
    print("Running main.py...")
    # The compose file in this change names the container "scraper"; the old
    # "webscraper" name no longer matches any container.
    print(run_command("docker exec -it scraper python main.py", True))
    return None
|
||||
|
@ -3,32 +3,32 @@ from threads.commands import *
|
||||
from run_command import run_command
|
||||
from get_path import get_path
|
||||
from threads.help_list import help_list
|
||||
import time
|
||||
|
||||
|
||||
def prompt():
|
||||
while True:
|
||||
command = input("> ")
|
||||
if quitCondition(command):
|
||||
try:
|
||||
command = input("> ")
|
||||
if quitCondition(command):
|
||||
quitService(get_path())
|
||||
break
|
||||
elif helpCondition(command):
|
||||
print(help_list())
|
||||
elif clearCondition(command):
|
||||
clearScreen()
|
||||
elif command.startswith("$"):
|
||||
systemCommand(command)
|
||||
elif restartCondition(command):
|
||||
restartService(get_path())
|
||||
elif runCondition(command):
|
||||
runService()
|
||||
elif command == "":
|
||||
pass
|
||||
else:
|
||||
print(f"Command: {command} not found. Write 'h' for help.")
|
||||
time.sleep(0.1)
|
||||
except KeyboardInterrupt:
|
||||
print("\nExiting...")
|
||||
quitService(get_path())
|
||||
break
|
||||
if helpCondition(command):
|
||||
print(help_list())
|
||||
continue
|
||||
if clearCondition(command):
|
||||
clearScreen()
|
||||
continue
|
||||
if command.startswith("$"):
|
||||
systemCommand(command)
|
||||
continue
|
||||
if restartCondition(command):
|
||||
restartService(get_path())
|
||||
continue
|
||||
if runCondition(command):
|
||||
runService()
|
||||
continue
|
||||
if command == "":
|
||||
continue
|
||||
else:
|
||||
print("Command not found. Write 'h' for help.")
|
||||
continue
|
||||
sys.exit(0)
|
||||
|
@ -17,7 +17,7 @@ def main():
|
||||
run_command(f"docker compose -f {docker_compose_file} up -d")
|
||||
print("Composed!\n")
|
||||
print("Running main.py...")
|
||||
print(run_command("docker exec -it webscraper python main.py"))
|
||||
print(run_command("docker exec -it webscraper python main.py", True))
|
||||
print(
|
||||
"\n\nWrite 'q' to stop program. Don't stop with 'Ctrl + C' otherwise docker container will be still on."
|
||||
)
|
||||
@ -44,7 +44,7 @@ def main():
|
||||
if before[f] != after[f]:
|
||||
print(f"\nDetected change in {f}")
|
||||
print("Running main.py...")
|
||||
print(run_command("docker exec -it webscraper python main.py"))
|
||||
print(run_command("docker exec -it webscraper python main.py", True))
|
||||
before[f] = after[f]
|
||||
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user