webscraper/app/scripts/monet.py

import asyncio
import json
import os
import time

import requests
from playwright.async_api import async_playwright


class MonetScraper:
    """Scrape Claude Monet artwork records with Playwright.

    The scraper opens the paginated artwork listing at ``url``, collects the
    detail-page links from the first ``pages`` listing pages, visits each
    detail page, downloads its image into ``images/`` and finally writes all
    collected records to ``dist/data.json``.

    Attributes are plain public fields:
        hrefs:    relative links to artwork detail pages found so far.
        base_url: prefix joined with each href to build a detail-page URL.
        url:      listing URL opened first.
        data:     list of scraped record dicts, in visiting order.
        pages:    number of listing pages to walk through.
    """

    # Sentinel the page uses (and this class reuses) for "no image available".
    _NO_IMAGE = "null"
    _IMAGE_SELECTOR = ".not-full-screen-image-container > img"
    _INFO_KEYS = ("date", "technique", "dimensions", "signature")

    def __init__(self, url="https://digitalprojects.wpi.art/monet/artworks?page=1", base_url="https://digitalprojects.wpi.art", pages=3):
        """Store the scraping configuration; no browser is started here.

        Args:
            url: listing page to open first.
            base_url: prefix prepended to every collected href.
            pages: how many listing pages to collect links from.
        """
        self.hrefs = []
        self.base_url = base_url
        self.url = url
        self.data = []
        self.pages = pages

    async def scrape(self):
        """Run the whole pipeline: open browser, collect links, scrape, save.

        The browser is closed even when a step raises, so a failed run does
        not leave a Chromium process behind.
        """
        async with async_playwright() as p:
            self.browser = await p.chromium.launch(headless=False)
            try:
                self.context = await self.browser.new_context()
                self.page = await self.context.new_page()
                self.page.set_default_timeout(5000)
                await self.go_to(self.url)
                await self.skip_cookies()
                await self.get_hrefs()
                await self.get_data()
                self.save_data()
            finally:
                await self.browser.close()

    async def skip_cookies(self):
        """Dismiss the cookie dialog by enabling and clicking its button."""
        await self.wait_for_el('.button-disabled')
        # The button carries a "disabled" attribute; drop it so the click
        # is accepted.
        await self.page.eval_on_selector('.button-disabled', 'el => el.removeAttribute("disabled")')
        await self.page.click('.button-disabled')

    async def insert_value(self, selector, value):
        """Fill the input matched by ``selector`` with ``value``."""
        await self.page.fill(selector, value)

    async def find_el(self, selector: str):
        """Wait for ``selector`` to appear and return the first match."""
        await self.wait_for_el(selector)
        return await self.page.query_selector(selector)

    async def find_els(self, selector: str):
        """Wait for ``selector`` to appear and return all matches."""
        await self.wait_for_el(selector)
        return await self.page.query_selector_all(selector)

    async def wait_for_el(self, selector: str):
        """Block until ``selector`` matches; raises on the page timeout."""
        await self.page.wait_for_selector(selector)

    async def go_to(self, url, tabs=False, max_attempts=None):
        """Navigate to ``url``, retrying after every failure.

        Args:
            url: address to open.
            tabs: unused; kept so existing callers keep working.
            max_attempts: optional upper bound on navigation attempts. The
                default ``None`` keeps retrying until navigation succeeds.

        Raises:
            Exception: the last navigation error, only when ``max_attempts``
                is set and has been exhausted.
        """
        attempts = 0
        while True:
            try:
                await self.page.goto(url, timeout=60000)
                return
            except Exception as e:
                print(e)
                print(f'error go to {url}')
                attempts += 1
                if max_attempts is not None and attempts >= max_attempts:
                    raise
                # Pause briefly so an instantly-failing navigation does not
                # turn into a tight retry loop.
                await asyncio.sleep(1)

    async def get_hrefs(self):
        """Collect detail-page hrefs from the first ``self.pages`` pages."""
        for i in range(self.pages):
            if i > 0:
                pagination = await self.find_el('cpd-controls-pagination > button:last-child')
                await pagination.click()
                # Give the listing time to re-render; asyncio.sleep keeps the
                # event loop (and Playwright's I/O) running meanwhile.
                await asyncio.sleep(1)
            links = await self.find_els('.artwork-search-results > article:not(.not-included) > a')
            for link in links:
                href = await link.get_attribute('href')
                # An anchor without href would otherwise yield a bogus URL.
                if href:
                    self.hrefs.append(href)

    @staticmethod
    def _first_srcset_url(srcset):
        """Return the first URL of a ``srcset`` value, or ``"null"`` if none."""
        if not srcset:
            return MonetScraper._NO_IMAGE
        parts = srcset.split(",")[0].split()
        return parts[0] if parts else MonetScraper._NO_IMAGE

    async def _read_image_url(self):
        """Read the current first srcset URL of the artwork image element."""
        element = await self.find_el(self._IMAGE_SELECTOR)
        return self._first_srcset_url(await element.get_attribute('srcset'))

    async def get_image(self):
        """Return the artwork image URL, or ``"null"`` if it never loads.

        The attribute is re-read up to 10 more times, half a second apart,
        while its value is still the ``"null"`` placeholder.
        """
        image = await self._read_image_url()
        retries = 0
        while image == self._NO_IMAGE and retries < 10:
            await asyncio.sleep(0.5)
            image = await self._read_image_url()
            retries += 1
        return image

    @staticmethod
    def _image_filename(title):
        """Build the ``.jpg`` file name used for an artwork title."""
        stem = title.lower().replace(",", "").replace(" ", "_")
        # Path separators inside a title would point outside ``images/``
        # or at a directory that does not exist.
        stem = stem.replace("/", "_").replace("\\", "_")
        return f"{stem}.jpg"

    def curl_image(self, image, title):
        """Download ``image`` to ``images/<title>.jpg``.

        Nothing is downloaded when ``image`` is the ``"null"`` placeholder.
        Network errors and non-200 responses are reported/skipped so that one
        bad image does not abort the whole scrape.
        """
        os.makedirs("images", exist_ok=True)

        if image == self._NO_IMAGE:
            return

        try:
            image_response = requests.get(image, timeout=30)
        except requests.RequestException as e:
            print(e)
            print(f'error downloading {image}')
            return

        if image_response.status_code == 200:
            with open(f'images/{self._image_filename(title)}', 'wb') as img_file:
                img_file.write(image_response.content)

    async def get_title(self):
        """Return the text of the artwork heading."""
        title = await self.find_el(".details h1")
        return await title.inner_text()

    async def get_info(self):
        """Return date, technique, dimensions and signature as a dict.

        Values are taken positionally from the matched elements; a field
        whose element is missing is returned as ``None``.
        """
        # NOTE(review): the attribute in this selector looks auto-generated
        # and may change when the site is rebuilt - confirm it still matches.
        info = await self.find_els("article[_ngcontent-ng-c2311764719] p > p")
        texts = [await el.inner_text() for el in info[:len(self._INFO_KEYS)]]
        texts += [None] * (len(self._INFO_KEYS) - len(texts))
        return dict(zip(self._INFO_KEYS, texts))

    def save_data(self):
        """Write ``self.data`` to ``dist/data.json`` as UTF-8 JSON."""
        os.makedirs("dist", exist_ok=True)
        with open("dist/data.json", "w", encoding="utf8") as out:
            json.dump(self.data, out, indent=4, ensure_ascii=False)

    async def get_provenance(self):
        """Return the provenance paragraphs, or ``None`` if the lookup fails."""
        try:
            provenances = await self.find_els("#provenance p p")
        except Exception as e:
            # Best effort: a failed lookup yields None rather than an error.
            print(e)
            return None
        return [await p.inner_text() for p in provenances]

    async def _get_article_texts(self, selector):
        """Return one comma-joined string per article matched by ``selector``.

        Returns ``None`` when the articles cannot be found.
        """
        try:
            articles = await self.find_els(selector)
        except Exception as e:
            # Best effort: a failed lookup yields None rather than an error.
            print(e)
            return None
        entries = []
        for article in articles:
            await article.wait_for_selector("p")
            paragraphs = await article.query_selector_all("p")
            entries.append(", ".join([await p.inner_text() for p in paragraphs]))
        return entries

    async def get_exhibitions(self):
        """Return the exhibition entries, or ``None`` if the lookup fails."""
        return await self._get_article_texts("#exhibition article")

    async def get_bibliography(self):
        """Return the bibliography entries, or ``None`` if the lookup fails."""
        return await self._get_article_texts("#publication article")

    async def get_data(self):
        """Visit every collected href and append one record to ``self.data``."""
        for href in self.hrefs:
            await self.go_to(f"{self.base_url}{href}")
            image = await self.get_image()
            title = await self.get_title()
            info = await self.get_info()
            provenance = await self.get_provenance()
            exhibitions = await self.get_exhibitions()
            bibliography = await self.get_bibliography()

            self.curl_image(image, title)
            self.data.append({
                "title": title,
                "date": info["date"],
                "name_of_artist": "Claude Monet",
                "technique": info["technique"],
                "dimensions": info["dimensions"],
                "signature": info["signature"],
                "location": None,
                "image": image,
                "provenance": provenance,
                "exhibitions": exhibitions,
                "bibliography": bibliography,
            })
feat: scraper for monet arts Signed-off-by: paprykdev <58005447+paprykdev@users.noreply.github.com> 2024-12-18 01:41:12 +01:00			`import time`
			`import requests`
			`import os`
			`import json`
			`from playwright.async_api import async_playwright`


			`class MonetScraper:`
			`def __init__(self, url="https://digitalprojects.wpi.art/monet/artworks?page=1", base_url="https://digitalprojects.wpi.art"):`
			`self.hrefs = []`
			`self.base_url = base_url`
			`self.url = url`
			`self.data = []`
			`self.pages = 3`

			`async def scrape(self):`
			`async with async_playwright() as p:`
			`self.browser = await p.chromium.launch(headless=False)`
			`self.context = await self.browser.new_context()`
			`self.page = await self.context.new_page()`
			`self.page.set_default_timeout(5000)`
			`await self.go_to(self.url)`
			`await self.skip_cookies()`
			`await self.get_hrefs()`
			`await self.get_data()`
			`self.save_data()`
			`await self.browser.close()`

			`async def skip_cookies(self):`
			`await self.wait_for_el('.button-disabled')`
			`await self.page.eval_on_selector('.button-disabled', 'el => el.removeAttribute("disabled")')`
			`await self.page.click('.button-disabled')`

			`async def insert_value(self, selector, value):`
			`await self.page.fill(selector, value)`

			`async def find_el(self, selector: str):`
			`await self.wait_for_el(selector)`
			`return await self.page.query_selector(selector)`

			`async def find_els(self, selector: str):`
			`await self.wait_for_el(selector)`
			`return await self.page.query_selector_all(selector)`

			`async def wait_for_el(self, selector: str):`
			`await self.page.wait_for_selector(selector)`

			`async def go_to(self, url, tabs=False):`
			`hack = True`
			`while hack:`
			`try:`
			`await self.page.goto(url, timeout=60000)`
			`hack = False`
			`except Exception as e:`
			`print(e)`
			`print(f'error go to {url}')`

			`async def get_hrefs(self):`
			`for i in range(self.pages):`
			`if i > 0:`
			`pagination = await self.find_el('cpd-controls-pagination > button:last-child')`
			`await pagination.click()`
			`time.sleep(1)`
			`el = await self.find_els('.artwork-search-results > article:not(.not-included) > a')`
			`for e in el:`
			`self.hrefs.append(await e.get_attribute('href'))`

			`async def get_image(self):`
			`image = await self.find_el(".not-full-screen-image-container > img")`
			`image = await image.get_attribute('srcset')`
			`image = image.split(",")[0].split(" ")[0]`
			`i = 0`
			`while image == "null" and i < 10:`
			`image = await self.find_el(".not-full-screen-image-container > img")`
			`image = await image.get_attribute('srcset')`
			`image = image.split(",")[0].split(" ")[0]`
			`time.sleep(0.5)`
			`i += 1`

			`return image`

			`def curl_image(self, image, title):`
			`try:`
			`os.mkdir("images")`
			`except FileExistsError:`
			`pass`

			`if image != "null":`
			`image_response = requests.get(image)`
			`if image_response.status_code == 200:`
			`with open(f'images/{title.lower().replace(",", "").replace(" ", "_")}.jpg', 'wb') as img_file:`
			`img_file.write(image_response.content)`

			`async def get_title(self):`
			`title = await self.find_el(".details h1")`
			`title = await title.inner_text()`
			`return title`

			`async def get_info(self):`
			`info = await self.find_els("article[_ngcontent-ng-c2311764719] p > p")`
			`return {`
			`"date": await info[0].inner_text(),`
			`"technique": await info[1].inner_text(),`
			`"dimensions": await info[2].inner_text(),`
			`"signature": await info[3].inner_text(),`
			`}`

			`def save_data(self):`
			`try:`
			`os.mkdir("dist")`
			`except FileExistsError:`
			`pass`
fix(encoding): utf8 encoding support Signed-off-by: paprykdev <58005447+paprykdev@users.noreply.github.com> 2024-12-19 12:51:14 +01:00			`open("dist/data.json", "w", encoding="utf8").write(`
			`json.dumps([d for d in self.data], indent=4, ensure_ascii=False))`
feat: scraper for monet arts Signed-off-by: paprykdev <58005447+paprykdev@users.noreply.github.com> 2024-12-18 01:41:12 +01:00
			`async def get_provenance(self):`
			`provenances = None`
			`try:`
			`provenances = await self.find_els("#provenance p p")`
			`except Exception as e:`
			`print(e)`
			`return None`
			`return [await p.inner_text() for p in provenances]`

			`async def get_exhibitions(self):`
			`exhibitions = None`
			`try:`
			`exhibitions = await self.find_els("#exhibition article")`
			`except Exception as e:`
			`print(e)`
			`return None`
			`arr = []`
			`for paragraph in exhibitions:`
			`await paragraph.wait_for_selector("p")`
			`ps = await paragraph.query_selector_all("p")`
			`arr.append(", ".join([await p.inner_text() for p in ps]))`
			`return arr`

			`async def get_bibliography(self):`
			`bibliography = None`
			`try:`
			`bibliography = await self.find_els("#publication article")`
			`except Exception as e:`
			`print(e)`
			`return None`
			`arr = []`
			`for paragraph in bibliography:`
			`await paragraph.wait_for_selector("p")`
			`ps = await paragraph.query_selector_all("p")`
			`arr.append(", ".join([await p.inner_text() for p in ps]))`
			`return arr`

			`async def get_data(self):`
			`for href in self.hrefs:`
			`await self.go_to(f"{self.base_url}{href}")`
			`image = await self.get_image()`
			`title = await self.get_title()`
			`get_info = await self.get_info()`
			`provenance = await self.get_provenance()`
			`exhibitions = await self.get_exhibitions()`
			`bibliography = await self.get_bibliography()`

			`self.curl_image(image, title)`
			`self.data.append({`
			`"title": title,`
			`"date": get_info["date"],`
			`"name_of_artist": "Claude Monet",`
			`"technique": get_info["technique"],`
			`"dimensions": get_info["dimensions"],`
			`"signature": get_info["signature"],`
			`"location": None,`
			`"image": image,`
			`"provenance": provenance,`
			`"exhibitions": exhibitions,`
			`"bibliography": bibliography,`
			`})`