Signed-off-by: paprykdev <58005447+paprykdev@users.noreply.github.com>
This commit is contained in:
parent 1587f60c2b
commit d256ec82da
.gitignore (vendored): 2 additions
@@ -21,6 +21,8 @@ Thumbs.db
 # Ignore output files
 dist/
+torres/
+monet/
 build/
 *.egg-info/
@@ -4,4 +4,11 @@
 This project is a web scraper designed to extract data from websites.
 
+## How to use
+1. Clone the repository
+1. `cd webscraper`
+1. `cd app`
+1. `pip3 install -r requirements.txt`
+1. `python3 scripts/monet.py` for the monet scraper
+1. `python3 scripts/torres.py` for the torres scraper
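For context: the two scrapers write their results to monet/monet.json and torres/torres.json, with downloaded images stored in the same directories as `<id>.jpg`. A minimal sketch (not part of this commit) of loading that output afterwards, assuming a run has finished:

    import json

    # Read the records produced by scripts/monet.py; torres/torres.json has the same shape.
    with open("monet/monet.json", encoding="utf8") as f:
        records = json.load(f)

    print(len(records), "records scraped")
    if records:
        print(sorted(records[0].keys()))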
@@ -1,19 +1,3 @@
-# FROM python:3.9-slim
-#
-# WORKDIR /usr/src/app
-#
-# COPY requirements.txt .
-# RUN pip install --trusted-host pypi.python.org -r requirements.txt
-#
-# COPY . .
-#
-# RUN apt-get update && apt-get install -y wget unzip && \
-#     wget https://dl.google.com/linux/direct/google-chrome-stable_current_amd64.deb && \
-#     apt install -y ./google-chrome-stable_current_amd64.deb && \
-#     rm ./google-chrome-stable_current_amd64.deb && \
-#     apt-get clean
-#
 
 # Use an official Python runtime as a parent image
 FROM mcr.microsoft.com/playwright/python:v1.49.1-jammy
@@ -3,7 +3,7 @@ import requests
 import os
 import json
 from playwright.async_api import async_playwright
-from sys import exit
+import asyncio
 
 
 class MonetScraper:
@@ -52,8 +52,7 @@ class MonetScraper:
             try:
                 await self.page.goto(url, timeout=60000)
                 hack = False
-            except Exception as e:
-                print(e)
+            except Exception:
                 print(f'error go to {url}')
 
     async def get_hrefs(self):
@@ -78,9 +77,11 @@ class MonetScraper:
                     ".not-full-screen-image-container > img")
             except Exception as e:
                 print(f"Error: {e}\n\nOn page: {href}")
-                exit(1)
+                return None
             image = await image.get_attribute('srcset')
-            image = image.split(",")[0].split(" ")[0]
+            image = image.split(",")
+            if len(image) > 0:
+                image = image[len(image) - 1].strip().split(" ")[0]
             time.sleep(0.5)
             i += 1
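For context on the new parsing above: an img srcset attribute is a comma-separated list of "URL width" candidates, and the changed code keeps only the URL of the last (typically largest) candidate. A standalone sketch with a made-up srcset value, not taken from the site:

    # Hypothetical srcset string; the real one comes from the painting page's <img> element.
    srcset = "monet-480.jpg 480w, monet-960.jpg 960w, monet-1920.jpg 1920w"

    candidates = srcset.split(",")
    if len(candidates) > 0:
        # Last candidate, URL part only; mirrors the logic in get_image above.
        url = candidates[len(candidates) - 1].strip().split(" ")[0]
        print(url)  # monet-1920.jpg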
@@ -89,19 +90,14 @@ class MonetScraper:
 
     def curl_image(self, image, title, id):
-        try:
-            os.mkdir("dist")
-        except FileExistsError:
-            pass
-
         try:
-            os.mkdir("dist/images")
+            os.mkdir("monet")
         except FileExistsError:
             pass
 
         if image != "null":
             image_response = requests.get(image)
             if image_response.status_code == 200:
-                with open(f'dist/images/{id}.jpg', 'wb')\
+                with open(f'monet/{id}.jpg', 'wb')\
                         as img_file:
                     img_file.write(image_response.content)
@@ -111,7 +107,7 @@ class MonetScraper:
         return title
 
     async def get_info(self):
-        info = await self.find_els("article[_ngcontent-ng-c2311764719] p > p")
+        info = await self.find_els("article[_ngcontent-ng-c746531210] p > p")
         return {
             "date": await info[0].inner_text(),
             "technique": await info[1].inner_text(),
@@ -121,18 +117,17 @@ class MonetScraper:
 
     def save_data(self):
         try:
-            os.mkdir("dist")
+            os.mkdir("monet")
         except FileExistsError:
             pass
-        open("dist/data.json", "w", encoding="utf8").write(
+        open("monet/monet.json", "w", encoding="utf8").write(
             json.dumps([d for d in self.data], indent=4, ensure_ascii=False))
 
     async def get_provenance(self):
         provenances = None
         try:
             provenances = await self.find_els("#provenance p p")
-        except Exception as e:
-            print(e)
+        except Exception:
             return None
         return [await p.inner_text() for p in provenances]
 
@@ -140,8 +135,7 @@ class MonetScraper:
         exhibitions = None
         try:
             exhibitions = await self.find_els("#exhibition article")
-        except Exception as e:
-            print(e)
+        except Exception:
             return None
         arr = []
         for paragraph in exhibitions:
@@ -154,8 +148,7 @@ class MonetScraper:
         bibliography = None
         try:
             bibliography = await self.find_els("#publication article")
-        except Exception as e:
-            print(e)
+        except Exception:
             return None
         arr = []
         for paragraph in bibliography:
@@ -167,12 +160,17 @@ class MonetScraper:
     async def get_data(self):
         for index, href in enumerate(self.hrefs):
             await self.go_to(f"{self.base_url}{href}")
             print(f"{index + 1}/{len(self.hrefs)}")
+            image = await self.get_image(href)
+            if not image:
+                continue
+            self.page.set_default_timeout(200)
             title = await self.get_title()
             get_info = await self.get_info()
             provenance = await self.get_provenance()
             exhibitions = await self.get_exhibitions()
             bibliography = await self.get_bibliography()
+            self.page.set_default_timeout(5000)
 
             self.curl_image(image, title, id=index)
             self.data.append({
@@ -184,8 +182,18 @@ class MonetScraper:
                 "dimensions": get_info["dimensions"],
                 "signature": get_info["signature"],
                 "location": None,
-                "image": image,
-                "provenance": provenance,
-                "exhibitions": exhibitions,
-                "bibliography": bibliography,
+                "image_url": image,
+                "provenance": provenance if provenance else [],
+                "exhibitions": exhibitions if exhibitions else [],
+                "bibliography": bibliography if bibliography else [],
             })
+
+
+if __name__ == "__main__":
+    scraper = MonetScraper()
+    try:
+        asyncio.run(scraper.scrape())
+    except KeyboardInterrupt:
+        print('\nSaving data to json..\n')
+        scraper.save_data()
+        asyncio.run(scraper.browser.close())
@@ -1,75 +0,0 @@
import time
from playwright.async_api import async_playwright
import asyncio

# TODO: Scrape through all the pages
"""
NOTE:
Some pages don't have info about paintings, so we need to skip them
"""


class NoguchiScraper:
    def __init__(self, url="https://archive.noguchi.org/Browse/CR", base_url="https://archive.noguchi.org"):
        self.hrefs = []
        self.base_url = base_url
        self.url = url
        self.data = []
        self.pages = 3

    async def scrape(self):
        async with async_playwright() as p:
            self.browser = await p.chromium.launch(headless=False)
            self.context = await self.browser.new_context()
            self.page = await self.context.new_page()
            await self.go_to(self.url)
            await self.skip_cookies()
            await self.get_hrefs()
            self.page.set_default_timeout(10000)
            # await self.get_data()
            # self.save_data()
            await self.browser.close()

    async def skip_cookies(self):
        element = await self.find_el('a.acceptCookie')
        await element.click()

    async def find_el(self, selector: str):
        await self.wait_for_el(selector)
        return await self.page.query_selector(selector)

    async def find_els(self, selector: str):
        await self.wait_for_el(selector)
        return await self.page.query_selector_all(selector)

    async def wait_for_el(self, selector: str):
        await self.page.wait_for_selector(selector)

    async def go_to(self, url, tabs=False):
        hack = True
        while hack:
            try:
                await self.page.goto(url, timeout=60000)
                hack = False
            except Exception as e:
                print(e)
                print(f'error go to {url}')

    async def load_more(self):
        button = await self.find_el('.load-more-wrapper > a')
        await button.click()
        time.sleep(5)

    async def get_hrefs(self):
        [await self.load_more() for _ in range(2)]
        links = await self.find_els('div.grid-flex.grid-cr-browse div.item-grid a')
        arr = []
        for link in links:
            href = await link.get_attribute('href')
            arr.append(href)
        print(arr)


if __name__ == "__main__":
    scraper = NoguchiScraper()
    asyncio.run(scraper.scrape())
app/scripts/torres.py (new file, 232 lines)
@@ -0,0 +1,232 @@
import time
import requests
import os
import json
from playwright.async_api import async_playwright
import asyncio


# max pages = 41

class TorresScraper:
    def __init__(self,
                 url="https://www.torresgarcia.com"):
        self.hrefs = []
        self.url = url
        self.data = []
        self.pages = 41
        self.email = "jinaj73631@nalwan.com"

    async def scrape(self):
        async with async_playwright() as p:
            self.browser = await p.chromium.launch(headless=False)
            self.context = await self.browser.new_context()
            self.page = await self.context.new_page()
            await self.go_to(self.url)
            await self.login()
            await self.get_hrefs()
            self.page.set_default_timeout(5000)
            await self.get_data()
            self.save_data()
            await self.browser.close()

    async def login(self):
        enter_button = await self.find_el('#enterSiteLink')
        await enter_button.click()
        input = await self.find_el('#email')
        await input.fill(self.email)
        submit_button = await self.find_el('#loginReturn')
        await submit_button.click()

    async def find_el(self, selector: str):
        await self.wait_for_el(selector)
        return await self.page.query_selector(selector)

    async def find_els(self, selector: str):
        await self.wait_for_el(selector)
        return await self.page.query_selector_all(selector)

    async def wait_for_el(self, selector: str):
        await self.page.wait_for_selector(selector)

    async def go_to(self, url, tabs=False):
        hack = True
        while hack:
            try:
                await self.page.goto(url, timeout=60000)
                hack = False
            except Exception:
                print(f'error go to {url}')

    async def get_hrefs(self):
        for i in range(self.pages):
            if i > 0:
                pagination = await self.find_el(
                    '#next')
                await pagination.click()
                time.sleep(1)
            el = await self.find_els(
                '#catWorks > .item > .recordContainer')
            for e in el:
                self.hrefs.append(await e.get_attribute('href'))

    async def get_image(self, href):
        image = "null"
        i = 0
        self.page.set_default_timeout(10000)
        while image == "null" and i < 30:
            try:
                image = await self.find_el(
                    "#mainImage")
            except Exception as e:
                print(f"Error: {e}\n\nOn page: {href}")
                return None
            image = await image.get_attribute('src')
            time.sleep(0.5)
            i += 1
        self.page.set_default_timeout(5000)
        return image

    def curl_image(self, image, id):
        try:
            os.mkdir("torres")
        except FileExistsError:
            pass

        if image != "null":
            image_response = requests.get(image)
            if image_response.status_code == 200:
                with open(f'torres/{id}.jpg', 'wb')\
                        as img_file:
                    img_file.write(image_response.content)

    async def get_title(self):
        try:
            title = await self.find_el(".div_Title em")
            title = await title.inner_text()
            return title
        except Exception:
            return None

    async def get_date(self):
        try:
            date = await self.find_el(".div_fullDate")
            date = await date.inner_text()
            return date
        except Exception:
            return None

    async def get_technique(self):
        try:
            technique = await self.find_el(".div_fullMedium")
            technique = await technique.inner_text()
            return technique
        except Exception:
            return None

    async def get_dimensions(self):
        try:
            dimensions = await self.find_el(".div_fullDimension")
            dimensions = await dimensions.inner_text()
            return dimensions
        except Exception:
            return None

    async def get_signature(self):
        try:
            signature = await self.find_el(".div_fullInscription")
            signature = await signature.inner_text()
            return signature
        except Exception:
            return

    async def get_location(self):
        try:
            location = await self.find_el(".div_CreditLine")
            location = await location.inner_text()
            return location
        except Exception:
            return None

    async def get_provenance(self):
        try:
            provenance = await self.find_els("#sectionProvenance > .sectionContent > .item")
            arr = []
            for paragraph in provenance:
                arr.append(await paragraph.inner_text())
            return arr
        except Exception:
            return []

    async def get_exhibitions(self):
        try:
            exhibitions = await self.find_els("#sectionExhibitions > .sectionContent > .item")
            arr = []
            for paragraph in exhibitions:
                arr.append(await paragraph.inner_text())
            return arr
        except Exception:
            return []

    async def get_bibliography(self):
        try:
            bibl = await self.find_els("#sectionLiterature > .sectionContent > .item")
            arr = []
            for paragraph in bibl:
                arr.append(await paragraph.inner_text())
            return arr
        except Exception:
            return []

    def save_data(self):
        try:
            os.mkdir("torres")
        except FileExistsError:
            pass
        open("torres/torres.json", "w", encoding="utf8").write(
            json.dumps([d for d in self.data], indent=4, ensure_ascii=False))

    async def get_data(self):
        for index, href in enumerate(self.hrefs):
            await self.go_to(f"{self.url}{href}")
            print(f"{index + 1}/{len(self.hrefs)}")
            image = await self.get_image(href)
            if not image:
                continue
            self.page.set_default_timeout(200)
            title = await self.get_title()
            date = await self.get_date()
            technique = await self.get_technique()
            dimensions = await self.get_dimensions()
            signature = await self.get_signature()
            location = await self.get_location()
            provenance = await self.get_provenance()
            exhibitions = await self.get_exhibitions()
            bibliography = await self.get_bibliography()
            self.page.set_default_timeout(5000)
            self.curl_image(image, index)

            self.data.append({
                "id": index,
                "title": title,
                "date": date,
                "name_of_artist": "Joan Torres Garcia",
                "technique": technique,
                "dimensions": dimensions,
                "signature": signature,
                "location": location,
                "image_url": image,
                "provenance": provenance,
                "exhibitions": exhibitions,
                "bibliography": bibliography
            })


if __name__ == "__main__":
    scraper = TorresScraper()
    try:
        asyncio.run(scraper.scrape())
    except KeyboardInterrupt:
        print('\nSaving data to json..\n')
        scraper.save_data()
        asyncio.run(scraper.browser.close())
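For reference, each dict appended in get_data above becomes one object in torres/torres.json once save_data runs. A sketch of the shape of a single record; the values here are placeholders, not scraped data:

    # Hypothetical example record; field names match get_data(), values are illustrative only.
    example_record = {
        "id": 0,
        "title": "Untitled",                 # .div_Title em, None if missing
        "date": "1932",                      # .div_fullDate, None if missing
        "name_of_artist": "Joan Torres Garcia",
        "technique": "Oil on canvas",        # .div_fullMedium, None if missing
        "dimensions": "73 x 60 cm",          # .div_fullDimension, None if missing
        "signature": "Signed lower left",    # .div_fullInscription, None if missing
        "location": "Private collection",    # .div_CreditLine, None if missing
        "image_url": "https://www.torresgarcia.com/...",  # src of #mainImage
        "provenance": [],                    # #sectionProvenance items, [] on failure
        "exhibitions": [],                   # #sectionExhibitions items, [] on failure
        "bibliography": [],                  # #sectionLiterature items, [] on failure
    }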