1
0

fix: provide proper id for image

Signed-off-by: paprykdev <58005447+paprykdev@users.noreply.github.com>
This commit is contained in:
patilk 2025-01-09 23:12:24 +01:00
parent deda6fc0a9
commit cab794cbef
Signed by: s500042
GPG Key ID: 1921AD722E7392EE
3 changed files with 30 additions and 42 deletions

View File

@@ -1,31 +1,3 @@
# services:
# webscraper:
# build:
# context: .
# dockerfile: ./docker/scripts/Dockerfile
# container_name: webscraper
# volumes:
# - .:/usr/src/app
# command:
# - tail
# - -f
# - /dev/null
# selenium-hub:
# image: "selenium/hub:3.141.59"
# container_name: selenium-hub
# ports:
# - "4444:4444"
# # redis:
# # image: "redis:alpine"
# # volumes:
# # - redis_data:/data
# # ports:
# # - "6379:6379"
#
# volumes:
# # redis_data:
# app:
services: services:
scraper: scraper:
build: build:

View File

@@ -3,4 +3,9 @@ from scripts.monet import MonetScraper
if __name__ == "__main__": if __name__ == "__main__":
scraper = MonetScraper() scraper = MonetScraper()
try:
asyncio.run(scraper.scrape()) asyncio.run(scraper.scrape())
except KeyboardInterrupt:
print('\nDUPA\n')
scraper.save_data()
asyncio.run(scraper.browser.close())

View File

@@ -6,7 +6,9 @@ from playwright.async_api import async_playwright
class MonetScraper: class MonetScraper:
def __init__(self, url="https://digitalprojects.wpi.art/monet/artworks?page=1", base_url="https://digitalprojects.wpi.art"): def __init__(self,
url="https://digitalprojects.wpi.art/monet/artworks?page=1",
base_url="https://digitalprojects.wpi.art"):
self.hrefs = [] self.hrefs = []
self.base_url = base_url self.base_url = base_url
self.url = url self.url = url
@@ -28,7 +30,8 @@ class MonetScraper:
async def skip_cookies(self): async def skip_cookies(self):
await self.wait_for_el('.button-disabled') await self.wait_for_el('.button-disabled')
await self.page.eval_on_selector('.button-disabled', 'el => el.removeAttribute("disabled")') await self.page.eval_on_selector(
'.button-disabled', 'el => el.removeAttribute("disabled")')
await self.page.click('.button-disabled') await self.page.click('.button-disabled')
async def find_el(self, selector: str): async def find_el(self, selector: str):
@@ -55,20 +58,21 @@ class MonetScraper:
async def get_hrefs(self): async def get_hrefs(self):
for i in range(self.pages): for i in range(self.pages):
if i > 0: if i > 0:
pagination = await self.find_el('cpd-controls-pagination > button:last-child') pagination = await self.find_el(
'cpd-controls-pagination > button:last-child')
await pagination.click() await pagination.click()
time.sleep(1) time.sleep(1)
el = await self.find_els('.artwork-search-results > article:not(.not-included) > a') el = await self.find_els(
'.artwork-search-results > article:not(.not-included) > a')
for e in el: for e in el:
self.hrefs.append(await e.get_attribute('href')) self.hrefs.append(await e.get_attribute('href'))
async def get_image(self): async def get_image(self):
image = await self.find_el(".not-full-screen-image-container > img") image = "null"
image = await image.get_attribute('srcset')
image = image.split(",")[0].split(" ")[0]
i = 0 i = 0
while image == "null" and i < 10: while image == "null" and i < 10:
image = await self.find_el(".not-full-screen-image-container > img") image = await self.find_el(
".not-full-screen-image-container > img")
image = await image.get_attribute('srcset') image = await image.get_attribute('srcset')
image = image.split(",")[0].split(" ")[0] image = image.split(",")[0].split(" ")[0]
time.sleep(0.5) time.sleep(0.5)
@@ -76,16 +80,22 @@ class MonetScraper:
return image return image
def curl_image(self, image, title): def curl_image(self, image, title, id):
try: try:
os.mkdir("images") os.mkdir("dist")
except FileExistsError:
pass
try:
os.mkdir("dist/images")
except FileExistsError: except FileExistsError:
pass pass
if image != "null": if image != "null":
image_response = requests.get(image) image_response = requests.get(image)
if image_response.status_code == 200: if image_response.status_code == 200:
with open(f'images/{title.lower().replace(",", "").replace(" ", "_")}.jpg', 'wb') as img_file: with open(f'dist/images/{id}.jpg', 'wb')\
as img_file:
img_file.write(image_response.content) img_file.write(image_response.content)
async def get_title(self): async def get_title(self):
@@ -148,7 +158,7 @@ class MonetScraper:
return arr return arr
async def get_data(self): async def get_data(self):
for href in self.hrefs: for index, href in enumerate(self.hrefs):
await self.go_to(f"{self.base_url}{href}") await self.go_to(f"{self.base_url}{href}")
image = await self.get_image() image = await self.get_image()
title = await self.get_title() title = await self.get_title()
@@ -157,8 +167,9 @@ class MonetScraper:
exhibitions = await self.get_exhibitions() exhibitions = await self.get_exhibitions()
bibliography = await self.get_bibliography() bibliography = await self.get_bibliography()
self.curl_image(image, title) self.curl_image(image, title, id=index)
self.data.append({ self.data.append({
"id": index,
"title": title, "title": title,
"date": get_info["date"], "date": get_info["date"],
"name_of_artist": "Claude Monet", "name_of_artist": "Claude Monet",