fix: provide proper id for image
Signed-off-by: paprykdev <58005447+paprykdev@users.noreply.github.com>
This commit is contained in: app
parent deda6fc0a9
commit cab794cbef
@@ -1,31 +1,3 @@
-# services:
-#   webscraper:
-#     build:
-#       context: .
-#       dockerfile: ./docker/scripts/Dockerfile
-#     container_name: webscraper
-#     volumes:
-#       - .:/usr/src/app
-#     command:
-#       - tail
-#       - -f
-#       - /dev/null
-#   selenium-hub:
-#     image: "selenium/hub:3.141.59"
-#     container_name: selenium-hub
-#     ports:
-#       - "4444:4444"
-#   # redis:
-#   #   image: "redis:alpine"
-#   #   volumes:
-#   #     - redis_data:/data
-#   #   ports:
-#   #     - "6379:6379"
-#
-# volumes:
-#   # redis_data:
-# app:
-
 services:
   scraper:
     build:
@@ -3,4 +3,9 @@ from scripts.monet import MonetScraper

 if __name__ == "__main__":
     scraper = MonetScraper()
-    asyncio.run(scraper.scrape())
+    try:
+        asyncio.run(scraper.scrape())
+    except KeyboardInterrupt:
+        print('\nDUPA\n')
+        scraper.save_data()
+        asyncio.run(scraper.browser.close())
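For readability, here is roughly what the entry-point script looks like once this hunk is applied, pieced together from the context lines of the diff; the leading `import asyncio` is an assumption inferred from the `asyncio.run(...)` calls, since it sits outside the hunk, and this is a sketch rather than the verbatim file contents.

import asyncio

from scripts.monet import MonetScraper

if __name__ == "__main__":
    scraper = MonetScraper()
    try:
        # run the scrape until it finishes or the user interrupts it
        asyncio.run(scraper.scrape())
    except KeyboardInterrupt:
        # on Ctrl+C, persist whatever was collected so far and shut the browser down
        print('\nDUPA\n')
        scraper.save_data()
        asyncio.run(scraper.browser.close())
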
@@ -6,7 +6,9 @@ from playwright.async_api import async_playwright


 class MonetScraper:
-    def __init__(self, url="https://digitalprojects.wpi.art/monet/artworks?page=1", base_url="https://digitalprojects.wpi.art"):
+    def __init__(self,
+                 url="https://digitalprojects.wpi.art/monet/artworks?page=1",
+                 base_url="https://digitalprojects.wpi.art"):
         self.hrefs = []
         self.base_url = base_url
         self.url = url
@@ -28,7 +30,8 @@ class MonetScraper:

     async def skip_cookies(self):
         await self.wait_for_el('.button-disabled')
-        await self.page.eval_on_selector('.button-disabled', 'el => el.removeAttribute("disabled")')
+        await self.page.eval_on_selector(
+            '.button-disabled', 'el => el.removeAttribute("disabled")')
         await self.page.click('.button-disabled')

     async def find_el(self, selector: str):
@@ -55,20 +58,21 @@ class MonetScraper:
     async def get_hrefs(self):
         for i in range(self.pages):
             if i > 0:
-                pagination = await self.find_el('cpd-controls-pagination > button:last-child')
+                pagination = await self.find_el(
+                    'cpd-controls-pagination > button:last-child')
                 await pagination.click()
                 time.sleep(1)
-            el = await self.find_els('.artwork-search-results > article:not(.not-included) > a')
+            el = await self.find_els(
+                '.artwork-search-results > article:not(.not-included) > a')
             for e in el:
                 self.hrefs.append(await e.get_attribute('href'))

     async def get_image(self):
-        image = await self.find_el(".not-full-screen-image-container > img")
-        image = await image.get_attribute('srcset')
-        image = image.split(",")[0].split(" ")[0]
+        image = "null"
         i = 0
         while image == "null" and i < 10:
-            image = await self.find_el(".not-full-screen-image-container > img")
+            image = await self.find_el(
+                ".not-full-screen-image-container > img")
             image = await image.get_attribute('srcset')
             image = image.split(",")[0].split(" ")[0]
             time.sleep(0.5)
@@ -76,16 +80,22 @@ class MonetScraper:

         return image

-    def curl_image(self, image, title):
+    def curl_image(self, image, title, id):
         try:
-            os.mkdir("images")
+            os.mkdir("dist")
+        except FileExistsError:
+            pass
+
+        try:
+            os.mkdir("dist/images")
         except FileExistsError:
             pass

         if image != "null":
             image_response = requests.get(image)
             if image_response.status_code == 200:
-                with open(f'images/{title.lower().replace(",", "").replace(" ", "_")}.jpg', 'wb') as img_file:
+                with open(f'dist/images/{id}.jpg', 'wb')\
+                        as img_file:
                     img_file.write(image_response.content)

     async def get_title(self):
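Pieced together from the context and added lines of this hunk, the updated curl_image reads roughly as follows; the os and requests imports are assumed from the calls shown, and the id parameter (which shadows the built-in) is kept exactly as in the diff. This is a sketch, not the verbatim file contents.

    def curl_image(self, image, title, id):
        # make sure dist/ and dist/images/ exist before writing
        try:
            os.mkdir("dist")
        except FileExistsError:
            pass

        try:
            os.mkdir("dist/images")
        except FileExistsError:
            pass

        if image != "null":
            image_response = requests.get(image)
            if image_response.status_code == 200:
                # the file is now keyed by the numeric id, no longer by the title
                with open(f'dist/images/{id}.jpg', 'wb')\
                        as img_file:
                    img_file.write(image_response.content)
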
@@ -148,7 +158,7 @@ class MonetScraper:
         return arr

     async def get_data(self):
-        for href in self.hrefs:
+        for index, href in enumerate(self.hrefs):
             await self.go_to(f"{self.base_url}{href}")
             image = await self.get_image()
             title = await self.get_title()
@@ -157,8 +167,9 @@ class MonetScraper:
             exhibitions = await self.get_exhibitions()
             bibliography = await self.get_bibliography()

-            self.curl_image(image, title)
+            self.curl_image(image, title, id=index)
             self.data.append({
+                "id": index,
                 "title": title,
                 "date": get_info["date"],
                 "name_of_artist": "Claude Monet",
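Taken together, the last two hunks make an artwork's position in self.hrefs serve as its id, used both for the saved file name and for the "id" field of its metadata entry. A hypothetical illustration of that correspondence (the variable names below are examples, not part of the commit):

# after scraper.get_data() has run
entry = scraper.data[0]                          # e.g. {"id": 0, "title": ..., "date": ...}
image_path = f"dist/images/{entry['id']}.jpg"    # the file written by curl_image(..., id=0)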