PROJEKT_ZALICZENIOWYY/twachtman.py
2025-01-27 21:50:21 +00:00

154 lines
6.2 KiB
Python

import os
import requests
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from webdriver_manager.chrome import ChromeDriverManager
from time import sleep
import json
from bs4 import BeautifulSoup
# --- Setup: browser driver, image output folder, and resume state ---
service = Service(ChromeDriverManager().install())
driver = webdriver.Chrome(service=service)

folder_path = "twachtman"
# exist_ok=True avoids the check-then-create race: even if another process
# creates the folder between the exists() test and makedirs(), we don't crash.
if not os.path.exists(folder_path):
    os.makedirs(folder_path, exist_ok=True)
    print(f"Folder '{folder_path}' został utworzony.")
else:
    print(f"Folder '{folder_path}' już istnieje.")

# Resume support: load artworks scraped on a previous run so ids keep
# incrementing and records are not duplicated in the output file.
if os.path.exists("twachtman_catalogue.json"):
    with open("twachtman_catalogue.json", "r", encoding="utf-8") as f:
        existing_artworks = json.load(f)
else:
    existing_artworks = []

existing_ids = {artwork['id'] for artwork in existing_artworks}
artworks = []  # records collected during this run only
def _tombstone_text(soup, class_name):
    """Return the stripped text of the tombstone <div> with *class_name*, or None."""
    tag = soup.find("div", class_=class_name)
    return tag.text.strip() if tag else None


def _section_lines(soup, section_id):
    """Return the non-empty stripped lines of the section <div id=...>, or None if absent."""
    tag = soup.find("div", id=section_id)
    if tag is None:
        return None
    return [line.strip() for line in tag.text.strip().split("\n") if line.strip()]


try:
    # Open the catalogue, dismiss the splash overlay, then log in by email.
    driver.get("https://www.jhtwachtman.org/catalogue/")
    sleep(6)
    body = driver.find_element(By.TAG_NAME, "body")
    body.click()
    sleep(6)
    email_input = driver.find_element(By.ID, "email")
    email_input.send_keys("darekkolano321@gmail.com")
    submit_button = driver.find_element(By.ID, "loginReturn")
    submit_button.click()
    sleep(6)
    print("Zalogowano")

    # The catalogue is paginated as index.php?pageNum=0 .. 12.
    pages_count = [f"index.php?pageNum={i}" for i in range(0, 13)]
    base_url = "https://www.jhtwachtman.org/catalogue/"
    # Continue numbering after the highest id from a previous run (0 if none).
    artwork_id = max(existing_ids, default=0) + 1

    for page in pages_count:
        print(f"Przetwarzam stronę: {base_url + page}")
        driver.get(base_url + page)
        sleep(3)
        soup = BeautifulSoup(driver.page_source, "html.parser")

        # Collect the per-artwork detail links on this listing page (deduplicated).
        artwork_links = []
        for link_tag in soup.find_all("a", class_="recordContainer image_noslide"):
            href = link_tag.get('href')
            if href and href not in artwork_links:
                artwork_links.append(f"https://www.jhtwachtman.org{href}")
        print(f"Znaleziono {len(artwork_links)} obrazow na tej stronie.")

        for artwork_url in artwork_links:
            try:
                driver.get(artwork_url)
                sleep(3)
                soup = BeautifulSoup(driver.page_source, "html.parser")

                artwork_info = {"id": artwork_id}
                artwork_id += 1

                title_tag = soup.find("div", class_="tombstone div_Title")
                # BUGFIX: strip('()[]') alone removes only bracket characters and
                # leaves surrounding whitespace; strip whitespace first, then brackets.
                artwork_info["title"] = (
                    title_tag.text.strip().strip('()[]') if title_tag else None
                )
                artwork_info["name_of_artist"] = "John Henry Twachtman"
                artwork_info["date"] = _tombstone_text(soup, "tombstone div_fullDate")
                artwork_info["dimensions"] = _tombstone_text(soup, "tombstone div_fullDimension")
                artwork_info["technique"] = _tombstone_text(soup, "tombstone div_fullMedium")
                artwork_info["signature"] = _tombstone_text(soup, "tombstone div_fullInscription")
                artwork_info["location"] = _tombstone_text(soup, "tombstone div_CreditLine")
                artwork_info["provenance"] = _section_lines(soup, "sectionProvenance")
                artwork_info["exhibitions"] = _section_lines(soup, "sectionExhibitions")
                artwork_info["bibliography"] = _section_lines(soup, "sectionLiterature")
                artwork_info["commentary"] = _section_lines(soup, "sectioncommentary")

                image_tag = soup.find("a", {"data-src": True})
                if image_tag:
                    artwork_info["image_url"] = image_tag["data-src"]
                    # BUGFIX: a timeout keeps one stalled download from hanging
                    # the entire scrape indefinitely.
                    image_response = requests.get(artwork_info["image_url"], timeout=30)
                    if image_response.status_code == 200:
                        image_filename = f"{folder_path}/{artwork_info['id']}.jpg"
                        with open(image_filename, 'wb') as img_file:
                            img_file.write(image_response.content)
                else:
                    artwork_info["image_url"] = None

                if artwork_info["id"] not in existing_ids:
                    artworks.append(artwork_info)
                    existing_ids.add(artwork_info["id"])
                print(json.dumps(artwork_info, indent=4, ensure_ascii=False))
            except Exception as e:
                # Best-effort per artwork: log and move on to the next link.
                print(f"Nie udało się przetworzyć linku {artwork_url}: {e}")

    print("Dane zostały zapisane do pliku 'twachtman_catalogue.json'.")
except KeyboardInterrupt:
    print("Przerwano")
    print("Dane zostały zapisane")
finally:
    # BUGFIX: previously the catalogue was written only on clean completion or
    # KeyboardInterrupt, so any other escaping exception lost all scraped data.
    # Saving in `finally` makes the write unconditional.
    with open("twachtman_catalogue.json", "w", encoding="utf-8") as f:
        json.dump(existing_artworks + artworks, f, indent=4, ensure_ascii=False)
    driver.quit()