paprykdev
634d8ae7fa
Some checks failed
Docker Image CI / build (push) Has been cancelled
Signed-off-by: paprykdev <58005447+paprykdev@users.noreply.github.com>
82 lines
2.4 KiB
Python
82 lines
2.4 KiB
Python
import time
|
|
import requests
|
|
import os
|
|
import json
|
|
from playwright.async_api import async_playwright
|
|
import asyncio
|
|
|
|
# TODO: Scrape through all the pages
|
|
"""
|
|
NOTE:
|
|
Some pages don't have info about paintings, so we need to skip them
|
|
"""
|
|
|
|
class NoguchiScraper:
    """Scrape painting links from the Noguchi catalogue raisonné archive.

    Drives a (non-headless) Chromium instance via Playwright: accepts the
    cookie banner, clicks "load more" to expand the listing, and collects
    the item hrefs into ``self.hrefs``.
    """

    def __init__(self, url="https://archive.noguchi.org/Browse/CR", base_url="https://archive.noguchi.org"):
        # Collected item links; populated by get_hrefs().
        self.hrefs = []
        self.base_url = base_url
        self.url = url
        # Scraped records; would be filled by get_data() (currently disabled).
        self.data = []
        # Number of listing pages to walk. TODO: scrape through all the pages.
        self.pages = 3

    async def scrape(self):
        """Entry point: launch the browser, gather item hrefs, then close."""
        async with async_playwright() as p:
            self.browser = await p.chromium.launch(headless=False)
            self.context = await self.browser.new_context()
            self.page = await self.context.new_page()
            await self.go_to(self.url)
            await self.skip_cookies()
            await self.get_hrefs()
            self.page.set_default_timeout(10000)
            # await self.get_data()
            # self.save_data()
            await self.browser.close()

    async def skip_cookies(self):
        """Dismiss the cookie-consent banner."""
        element = await self.find_el('a.acceptCookie')
        await element.click()

    async def insert_value(self, selector, value):
        """Fill the form field matched by *selector* with *value*."""
        await self.page.fill(selector, value)

    async def find_el(self, selector: str):
        """Wait for *selector* to appear, then return the first match."""
        await self.wait_for_el(selector)
        return await self.page.query_selector(selector)

    async def find_els(self, selector: str):
        """Wait for *selector* to appear, then return all matches."""
        await self.wait_for_el(selector)
        return await self.page.query_selector_all(selector)

    async def wait_for_el(self, selector: str):
        """Block until an element matching *selector* is present."""
        await self.page.wait_for_selector(selector)

    async def go_to(self, url, tabs=False):
        """Navigate to *url*, retrying on navigation errors.

        NOTE(review): retries indefinitely if the page never loads —
        consider a bounded retry count.
        """
        while True:
            try:
                await self.page.goto(url, timeout=60000)
                return
            except Exception as e:
                print(e)
                print(f'error go to {url}')

    async def load_more(self):
        """Click the "load more" button and wait for new items to render."""
        button = await self.find_el('.load-more-wrapper > a')
        await button.click()
        # Was time.sleep(5): a blocking sleep stalls the whole event loop;
        # yield to it with asyncio.sleep instead.
        await asyncio.sleep(5)

    async def get_hrefs(self):
        """Expand the listing twice and collect all item link hrefs.

        Stores the links in ``self.hrefs`` (the original gathered them into
        a local list and discarded it) and returns the list.
        """
        await self.load_more()
        await self.load_more()
        links = await self.find_els('div.grid-flex.grid-cr-browse div.item-grid a')
        self.hrefs = [await link.get_attribute('href') for link in links]
        print(self.hrefs)
        return self.hrefs
|
|
|
|
|
|
# Run the scraper only when this file is executed as a script.
if __name__ == "__main__":
    asyncio.run(NoguchiScraper().scrape())
|