fix: provide proper id for image

Signed-off-by: paprykdev <58005447+paprykdev@users.noreply.github.com>
patilk 2025-01-09 23:12:24 +01:00
parent deda6fc0a9
commit cab794cbef
Signed by: s500042
GPG Key ID: 1921AD722E7392EE
3 changed files with 30 additions and 42 deletions

View File

@@ -1,31 +1,3 @@
-# services:
-#   webscraper:
-#     build:
-#       context: .
-#       dockerfile: ./docker/scripts/Dockerfile
-#     container_name: webscraper
-#     volumes:
-#       - .:/usr/src/app
-#     command:
-#       - tail
-#       - -f
-#       - /dev/null
-#   selenium-hub:
-#     image: "selenium/hub:3.141.59"
-#     container_name: selenium-hub
-#     ports:
-#       - "4444:4444"
-#   # redis:
-#   #   image: "redis:alpine"
-#   #   volumes:
-#   #     - redis_data:/data
-#   #   ports:
-#   #     - "6379:6379"
-#
-# volumes:
-#   # redis_data:
-#   app:
 services:
   scraper:
     build:

View File

@@ -3,4 +3,9 @@ from scripts.monet import MonetScraper
 
 if __name__ == "__main__":
     scraper = MonetScraper()
-    asyncio.run(scraper.scrape())
+    try:
+        asyncio.run(scraper.scrape())
+    except KeyboardInterrupt:
+        print('\nDUPA\n')
+        scraper.save_data()
+        asyncio.run(scraper.browser.close())

View File

@@ -6,7 +6,9 @@ from playwright.async_api import async_playwright
 
 
 class MonetScraper:
-    def __init__(self, url="https://digitalprojects.wpi.art/monet/artworks?page=1", base_url="https://digitalprojects.wpi.art"):
+    def __init__(self,
+                 url="https://digitalprojects.wpi.art/monet/artworks?page=1",
+                 base_url="https://digitalprojects.wpi.art"):
         self.hrefs = []
         self.base_url = base_url
         self.url = url
@@ -28,7 +30,8 @@ class MonetScraper:
 
     async def skip_cookies(self):
         await self.wait_for_el('.button-disabled')
-        await self.page.eval_on_selector('.button-disabled', 'el => el.removeAttribute("disabled")')
+        await self.page.eval_on_selector(
+            '.button-disabled', 'el => el.removeAttribute("disabled")')
         await self.page.click('.button-disabled')
 
     async def find_el(self, selector: str):
@@ -55,20 +58,21 @@ class MonetScraper:
     async def get_hrefs(self):
         for i in range(self.pages):
             if i > 0:
-                pagination = await self.find_el('cpd-controls-pagination > button:last-child')
+                pagination = await self.find_el(
+                    'cpd-controls-pagination > button:last-child')
                 await pagination.click()
                 time.sleep(1)
-            el = await self.find_els('.artwork-search-results > article:not(.not-included) > a')
+            el = await self.find_els(
+                '.artwork-search-results > article:not(.not-included) > a')
             for e in el:
                 self.hrefs.append(await e.get_attribute('href'))
 
     async def get_image(self):
-        image = await self.find_el(".not-full-screen-image-container > img")
-        image = await image.get_attribute('srcset')
-        image = image.split(",")[0].split(" ")[0]
         image = "null"
         i = 0
         while image == "null" and i < 10:
-            image = await self.find_el(".not-full-screen-image-container > img")
+            image = await self.find_el(
+                ".not-full-screen-image-container > img")
             image = await image.get_attribute('srcset')
             image = image.split(",")[0].split(" ")[0]
             time.sleep(0.5)
@@ -76,16 +80,22 @@ class MonetScraper:
         return image
 
-    def curl_image(self, image, title):
+    def curl_image(self, image, title, id):
         try:
-            os.mkdir("images")
+            os.mkdir("dist")
         except FileExistsError:
             pass
+        try:
+            os.mkdir("dist/images")
+        except FileExistsError:
+            pass
 
         if image != "null":
             image_response = requests.get(image)
             if image_response.status_code == 200:
-                with open(f'images/{title.lower().replace(",", "").replace(" ", "_")}.jpg', 'wb') as img_file:
+                with open(f'dist/images/{id}.jpg', 'wb')\
+                        as img_file:
                     img_file.write(image_response.content)
 
     async def get_title(self):
@@ -148,7 +158,7 @@ class MonetScraper:
         return arr
 
     async def get_data(self):
-        for href in self.hrefs:
+        for index, href in enumerate(self.hrefs):
             await self.go_to(f"{self.base_url}{href}")
             image = await self.get_image()
             title = await self.get_title()
@@ -157,8 +167,9 @@ class MonetScraper:
             exhibitions = await self.get_exhibitions()
             bibliography = await self.get_bibliography()
 
-            self.curl_image(image, title)
+            self.curl_image(image, title, id=index)
             self.data.append({
+                "id": index,
                 "title": title,
                 "date": get_info["date"],
                 "name_of_artist": "Claude Monet",