feat: init the script for noguchi arts
Signed-off-by: paprykdev <58005447+paprykdev@users.noreply.github.com>
parent 6fa3241eae
commit c7ddd2487f
app/scripts/noguchi.py	81	Normal file
@@ -0,0 +1,81 @@
import asyncio
import json
import os

import requests
from playwright.async_api import async_playwright

# json, os and requests are not used yet; they appear to be reserved for the
# get_data()/save_data() steps that are still commented out in scrape().

# TODO: Scrape through all the pages
"""
NOTE:
Some pages don't have info about paintings, so we need to skip them.
"""


class NoguchiScraper:
    def __init__(self, url="https://archive.noguchi.org/Browse/CR",
                 base_url="https://archive.noguchi.org"):
        self.hrefs = []
        self.base_url = base_url
        self.url = url
        self.data = []
        self.pages = 3

    async def scrape(self):
        async with async_playwright() as p:
            self.browser = await p.chromium.launch(headless=False)
            self.context = await self.browser.new_context()
            self.page = await self.context.new_page()
            self.page.set_default_timeout(10000)
            await self.go_to(self.url)
            await self.skip_cookies()
            await self.get_hrefs()
            # await self.get_data()
            # self.save_data()
            await self.browser.close()

    async def skip_cookies(self):
        # Dismiss the cookie banner so it doesn't block later clicks.
        element = await self.find_el('a.acceptCookie')
        await element.click()

    async def insert_value(self, selector, value):
        await self.page.fill(selector, value)

    async def find_el(self, selector: str):
        await self.wait_for_el(selector)
        return await self.page.query_selector(selector)

    async def find_els(self, selector: str):
        await self.wait_for_el(selector)
        return await self.page.query_selector_all(selector)

    async def wait_for_el(self, selector: str):
        await self.page.wait_for_selector(selector)

    async def go_to(self, url, tabs=False):
        # Retry navigation until it succeeds; the archive can be slow to respond.
        while True:
            try:
                await self.page.goto(url, timeout=60000)
                break
            except Exception as e:
                print(e)
                print(f'error going to {url}')

    async def load_more(self):
        # Click the "load more" button and give the grid time to render.
        button = await self.find_el('.load-more-wrapper > a')
        await button.click()
        await asyncio.sleep(5)

    async def get_hrefs(self):
        # Each "load more" click reveals one extra page, so click pages - 1 times.
        for _ in range(self.pages - 1):
            await self.load_more()
        links = await self.find_els('div.grid-flex.grid-cr-browse div.item-grid a')
        for link in links:
            href = await link.get_attribute('href')
            self.hrefs.append(href)
        print(self.hrefs)


if __name__ == "__main__":
    scraper = NoguchiScraper()
    asyncio.run(scraper.scrape())
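The get_data()/save_data() calls in scrape() are still commented out. As a minimal, hypothetical sketch (not part of this commit) of how save_data() could persist whatever get_data() eventually collects into self.data, reusing the json and os modules that are already imported; the output path app/data/noguchi.json is an assumed location, not something defined here:

    def save_data(self, path="app/data/noguchi.json"):
        # Hypothetical sketch: dump the scraped records as pretty-printed JSON.
        # The path and the structure of self.data are assumptions.
        os.makedirs(os.path.dirname(path), exist_ok=True)
        with open(path, "w", encoding="utf-8") as f:
            json.dump(self.data, f, ensure_ascii=False, indent=2)

Once get_data() fills self.data, uncommenting self.save_data() in scrape() would write the results to disk.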