Signed-off-by: paprykdev <58005447+paprykdev@users.noreply.github.com>
This commit is contained in:
parent 1587f60c2b
commit d256ec82da
.gitignore (vendored): 2 additions
@@ -21,6 +21,8 @@ Thumbs.db
 # Ignore output files
 dist/
+torres/
+monet/
 build/
 *.egg-info/
@@ -4,4 +4,11 @@
 This project is a web scraper designed to extract data from websites.
 
+## How to use
+1. Clone the repository
+1. `cd webscraper`
+1. `cd app`
+1. `pip3 install -r requirements.txt`
+1. `python3 scripts/monet.py` for the monet scraper
+1. `python3 scripts/torres.py` for the torres scraper
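For context: the two scrapers write their results to monet/monet.json and torres/torres.json, with downloaded images stored in the same directories as `<id>.jpg`. A minimal sketch (not part of this commit) of loading that output afterwards, assuming a run has finished:

    import json

    # Read the records produced by scripts/monet.py; torres/torres.json has the same shape.
    with open("monet/monet.json", encoding="utf8") as f:
        records = json.load(f)

    print(len(records), "records scraped")
    if records:
        print(sorted(records[0].keys()))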
@@ -1,19 +1,3 @@
-# FROM python:3.9-slim
-#
-# WORKDIR /usr/src/app
-#
-# COPY requirements.txt .
-# RUN pip install --trusted-host pypi.python.org -r requirements.txt
-#
-# COPY . .
-#
-# RUN apt-get update && apt-get install -y wget unzip && \
-#     wget https://dl.google.com/linux/direct/google-chrome-stable_current_amd64.deb && \
-#     apt install -y ./google-chrome-stable_current_amd64.deb && \
-#     rm ./google-chrome-stable_current_amd64.deb && \
-#     apt-get clean
-#
 
 # Use an official Python runtime as a parent image
 FROM mcr.microsoft.com/playwright/python:v1.49.1-jammy
@@ -3,7 +3,7 @@ import requests
 import os
 import json
 from playwright.async_api import async_playwright
-from sys import exit
+import asyncio
 
 
 class MonetScraper:
@@ -52,8 +52,7 @@ class MonetScraper:
             try:
                 await self.page.goto(url, timeout=60000)
                 hack = False
-            except Exception as e:
-                print(e)
+            except Exception:
                 print(f'error go to {url}')
 
     async def get_hrefs(self):
@@ -78,9 +77,11 @@ class MonetScraper:
                     ".not-full-screen-image-container > img")
             except Exception as e:
                 print(f"Error: {e}\n\nOn page: {href}")
-                exit(1)
+                return None
             image = await image.get_attribute('srcset')
-            image = image.split(",")[0].split(" ")[0]
+            image = image.split(",")
+            if len(image) > 0:
+                image = image[len(image) - 1].strip().split(" ")[0]
             time.sleep(0.5)
             i += 1
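For context on the new parsing above: an img srcset attribute is a comma-separated list of "URL width" candidates, and the changed code keeps only the URL of the last (typically largest) candidate. A standalone sketch with a made-up srcset value, not taken from the site:

    # Hypothetical srcset string; the real one comes from the painting page's <img> element.
    srcset = "monet-480.jpg 480w, monet-960.jpg 960w, monet-1920.jpg 1920w"

    candidates = srcset.split(",")
    if len(candidates) > 0:
        # Last candidate, URL part only; mirrors the logic in get_image above.
        url = candidates[len(candidates) - 1].strip().split(" ")[0]
        print(url)  # monet-1920.jpg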
@@ -89,19 +90,14 @@ class MonetScraper:
 
     def curl_image(self, image, title, id):
-        try:
-            os.mkdir("dist")
-        except FileExistsError:
-            pass
-
         try:
-            os.mkdir("dist/images")
+            os.mkdir("monet")
         except FileExistsError:
             pass
 
         if image != "null":
             image_response = requests.get(image)
             if image_response.status_code == 200:
-                with open(f'dist/images/{id}.jpg', 'wb')\
+                with open(f'monet/{id}.jpg', 'wb')\
                         as img_file:
                     img_file.write(image_response.content)
@@ -111,7 +107,7 @@ class MonetScraper:
         return title
 
     async def get_info(self):
-        info = await self.find_els("article[_ngcontent-ng-c2311764719] p > p")
+        info = await self.find_els("article[_ngcontent-ng-c746531210] p > p")
         return {
             "date": await info[0].inner_text(),
             "technique": await info[1].inner_text(),
@@ -121,18 +117,17 @@ class MonetScraper:
 
     def save_data(self):
         try:
-            os.mkdir("dist")
+            os.mkdir("monet")
         except FileExistsError:
             pass
-        open("dist/data.json", "w", encoding="utf8").write(
+        open("monet/monet.json", "w", encoding="utf8").write(
             json.dumps([d for d in self.data], indent=4, ensure_ascii=False))
 
     async def get_provenance(self):
         provenances = None
         try:
             provenances = await self.find_els("#provenance p p")
-        except Exception as e:
-            print(e)
+        except Exception:
             return None
         return [await p.inner_text() for p in provenances]
 
@@ -140,8 +135,7 @@ class MonetScraper:
         exhibitions = None
         try:
             exhibitions = await self.find_els("#exhibition article")
-        except Exception as e:
-            print(e)
+        except Exception:
             return None
         arr = []
         for paragraph in exhibitions:
@@ -154,8 +148,7 @@ class MonetScraper:
         bibliography = None
         try:
             bibliography = await self.find_els("#publication article")
-        except Exception as e:
-            print(e)
+        except Exception:
             return None
         arr = []
         for paragraph in bibliography:
@@ -167,12 +160,17 @@ class MonetScraper:
     async def get_data(self):
         for index, href in enumerate(self.hrefs):
             await self.go_to(f"{self.base_url}{href}")
             print(f"{index + 1}/{len(self.hrefs)}")
+            image = await self.get_image(href)
+            if not image:
+                continue
+            self.page.set_default_timeout(200)
             title = await self.get_title()
             get_info = await self.get_info()
             provenance = await self.get_provenance()
             exhibitions = await self.get_exhibitions()
             bibliography = await self.get_bibliography()
+            self.page.set_default_timeout(5000)
 
             self.curl_image(image, title, id=index)
             self.data.append({
@@ -184,8 +182,18 @@ class MonetScraper:
                 "dimensions": get_info["dimensions"],
                 "signature": get_info["signature"],
                 "location": None,
-                "image": image,
-                "provenance": provenance,
-                "exhibitions": exhibitions,
-                "bibliography": bibliography,
+                "image_url": image,
+                "provenance": provenance if provenance else [],
+                "exhibitions": exhibitions if exhibitions else [],
+                "bibliography": bibliography if bibliography else [],
             })
+
+
+if __name__ == "__main__":
+    scraper = MonetScraper()
+    try:
+        asyncio.run(scraper.scrape())
+    except KeyboardInterrupt:
+        print('\nSaving data to json..\n')
+        scraper.save_data()
+        asyncio.run(scraper.browser.close())
@@ -1,75 +0,0 @@
import time
from playwright.async_api import async_playwright
import asyncio

# TODO: Scrape through all the pages
"""
NOTE:
Some pages don't have info about paintings, so we need to skip them
"""


class NoguchiScraper:
    def __init__(self, url="https://archive.noguchi.org/Browse/CR", base_url="https://archive.noguchi.org"):
        self.hrefs = []
        self.base_url = base_url
        self.url = url
        self.data = []
        self.pages = 3

    async def scrape(self):
        async with async_playwright() as p:
            self.browser = await p.chromium.launch(headless=False)
            self.context = await self.browser.new_context()
            self.page = await self.context.new_page()
            await self.go_to(self.url)
            await self.skip_cookies()
            await self.get_hrefs()
            self.page.set_default_timeout(10000)
            # await self.get_data()
            # self.save_data()
            await self.browser.close()

    async def skip_cookies(self):
        element = await self.find_el('a.acceptCookie')
        await element.click()

    async def find_el(self, selector: str):
        await self.wait_for_el(selector)
        return await self.page.query_selector(selector)

    async def find_els(self, selector: str):
        await self.wait_for_el(selector)
        return await self.page.query_selector_all(selector)

    async def wait_for_el(self, selector: str):
        await self.page.wait_for_selector(selector)

    async def go_to(self, url, tabs=False):
        hack = True
        while hack:
            try:
                await self.page.goto(url, timeout=60000)
                hack = False
            except Exception as e:
                print(e)
                print(f'error go to {url}')

    async def load_more(self):
        button = await self.find_el('.load-more-wrapper > a')
        await button.click()
        time.sleep(5)

    async def get_hrefs(self):
        [await self.load_more() for _ in range(2)]
        links = await self.find_els('div.grid-flex.grid-cr-browse div.item-grid a')
        arr = []
        for link in links:
            href = await link.get_attribute('href')
            arr.append(href)
        print(arr)


if __name__ == "__main__":
    scraper = NoguchiScraper()
    asyncio.run(scraper.scrape())
app/scripts/torres.py (new file, 232 lines)
@@ -0,0 +1,232 @@
import time
import requests
import os
import json
from playwright.async_api import async_playwright
import asyncio


# max pages = 41

class TorresScraper:
    def __init__(self,
                 url="https://www.torresgarcia.com"):
        self.hrefs = []
        self.url = url
        self.data = []
        self.pages = 41
        self.email = "jinaj73631@nalwan.com"

    async def scrape(self):
        async with async_playwright() as p:
            self.browser = await p.chromium.launch(headless=False)
            self.context = await self.browser.new_context()
            self.page = await self.context.new_page()
            await self.go_to(self.url)
            await self.login()
            await self.get_hrefs()
            self.page.set_default_timeout(5000)
            await self.get_data()
            self.save_data()
            await self.browser.close()

    async def login(self):
        enter_button = await self.find_el('#enterSiteLink')
        await enter_button.click()
        input = await self.find_el('#email')
        await input.fill(self.email)
        submit_button = await self.find_el('#loginReturn')
        await submit_button.click()

    async def find_el(self, selector: str):
        await self.wait_for_el(selector)
        return await self.page.query_selector(selector)

    async def find_els(self, selector: str):
        await self.wait_for_el(selector)
        return await self.page.query_selector_all(selector)

    async def wait_for_el(self, selector: str):
        await self.page.wait_for_selector(selector)

    async def go_to(self, url, tabs=False):
        hack = True
        while hack:
            try:
                await self.page.goto(url, timeout=60000)
                hack = False
            except Exception:
                print(f'error go to {url}')

    async def get_hrefs(self):
        for i in range(self.pages):
            if i > 0:
                pagination = await self.find_el(
                    '#next')
                await pagination.click()
                time.sleep(1)
            el = await self.find_els(
                '#catWorks > .item > .recordContainer')
            for e in el:
                self.hrefs.append(await e.get_attribute('href'))

    async def get_image(self, href):
        image = "null"
        i = 0
        self.page.set_default_timeout(10000)
        while image == "null" and i < 30:
            try:
                image = await self.find_el(
                    "#mainImage")
            except Exception as e:
                print(f"Error: {e}\n\nOn page: {href}")
                return None
            image = await image.get_attribute('src')
            time.sleep(0.5)
            i += 1
        self.page.set_default_timeout(5000)
        return image

    def curl_image(self, image, id):
        try:
            os.mkdir("torres")
        except FileExistsError:
            pass

        if image != "null":
            image_response = requests.get(image)
            if image_response.status_code == 200:
                with open(f'torres/{id}.jpg', 'wb')\
                        as img_file:
                    img_file.write(image_response.content)

    async def get_title(self):
        try:
            title = await self.find_el(".div_Title em")
            title = await title.inner_text()
            return title
        except Exception:
            return None

    async def get_date(self):
        try:
            date = await self.find_el(".div_fullDate")
            date = await date.inner_text()
            return date
        except Exception:
            return None

    async def get_technique(self):
        try:
            technique = await self.find_el(".div_fullMedium")
            technique = await technique.inner_text()
            return technique
        except Exception:
            return None

    async def get_dimensions(self):
        try:
            dimensions = await self.find_el(".div_fullDimension")
            dimensions = await dimensions.inner_text()
            return dimensions
        except Exception:
            return None

    async def get_signature(self):
        try:
            signature = await self.find_el(".div_fullInscription")
            signature = await signature.inner_text()
            return signature
        except Exception:
            return

    async def get_location(self):
        try:
            location = await self.find_el(".div_CreditLine")
            location = await location.inner_text()
            return location
        except Exception:
            return None

    async def get_provenance(self):
        try:
            provenance = await self.find_els("#sectionProvenance > .sectionContent > .item")
            arr = []
            for paragraph in provenance:
                arr.append(await paragraph.inner_text())
            return arr
        except Exception:
            return []

    async def get_exhibitions(self):
        try:
            exhibitions = await self.find_els("#sectionExhibitions > .sectionContent > .item")
            arr = []
            for paragraph in exhibitions:
                arr.append(await paragraph.inner_text())
            return arr
        except Exception:
            return []

    async def get_bibliography(self):
        try:
            bibl = await self.find_els("#sectionLiterature > .sectionContent > .item")
            arr = []
            for paragraph in bibl:
                arr.append(await paragraph.inner_text())
            return arr
        except Exception:
            return []

    def save_data(self):
        try:
            os.mkdir("torres")
        except FileExistsError:
            pass
        open("torres/torres.json", "w", encoding="utf8").write(
            json.dumps([d for d in self.data], indent=4, ensure_ascii=False))

    async def get_data(self):
        for index, href in enumerate(self.hrefs):
            await self.go_to(f"{self.url}{href}")
            print(f"{index + 1}/{len(self.hrefs)}")
            image = await self.get_image(href)
            if not image:
                continue
            self.page.set_default_timeout(200)
            title = await self.get_title()
            date = await self.get_date()
            technique = await self.get_technique()
            dimensions = await self.get_dimensions()
            signature = await self.get_signature()
            location = await self.get_location()
            provenance = await self.get_provenance()
            exhibitions = await self.get_exhibitions()
            bibliography = await self.get_bibliography()
            self.page.set_default_timeout(5000)
            self.curl_image(image, index)

            self.data.append({
                "id": index,
                "title": title,
                "date": date,
                "name_of_artist": "Joan Torres Garcia",
                "technique": technique,
                "dimensions": dimensions,
                "signature": signature,
                "location": location,
                "image_url": image,
                "provenance": provenance,
                "exhibitions": exhibitions,
                "bibliography": bibliography
            })


if __name__ == "__main__":
    scraper = TorresScraper()
    try:
        asyncio.run(scraper.scrape())
    except KeyboardInterrupt:
        print('\nSaving data to json..\n')
        scraper.save_data()
        asyncio.run(scraper.browser.close())
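For reference, each dict appended in get_data above becomes one object in torres/torres.json once save_data runs. A sketch of the shape of a single record; the values here are placeholders, not scraped data:

    # Hypothetical example record; field names match get_data(), values are illustrative only.
    example_record = {
        "id": 0,
        "title": "Untitled",                 # .div_Title em, None if missing
        "date": "1932",                      # .div_fullDate, None if missing
        "name_of_artist": "Joan Torres Garcia",
        "technique": "Oil on canvas",        # .div_fullMedium, None if missing
        "dimensions": "73 x 60 cm",          # .div_fullDimension, None if missing
        "signature": "Signed lower left",    # .div_fullInscription, None if missing
        "location": "Private collection",    # .div_CreditLine, None if missing
        "image_url": "https://www.torresgarcia.com/...",  # src of #mainImage
        "provenance": [],                    # #sectionProvenance items, [] on failure
        "exhibitions": [],                   # #sectionExhibitions items, [] on failure
        "bibliography": [],                  # #sectionLiterature items, [] on failure
    }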