# (viewer metadata residue removed: "183 lines, 7.4 KiB, Python" — not part of the script)
import json
import os
from time import sleep

import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from webdriver_manager.chrome import ChromeDriverManager
# Chrome driver setup: webdriver-manager fetches a chromedriver binary
# matching the locally installed Chrome, so no manual download is needed.
chrome_service = Service(ChromeDriverManager().install())
driver = webdriver.Chrome(service=chrome_service)
# Make sure the output folder for downloaded artwork images exists.
folder_path = "gorky"
if os.path.exists(folder_path):
    print(f"Folder '{folder_path}' już istnieje.")
else:
    os.makedirs(folder_path)
    print(f"Folder '{folder_path}' został utworzony.")
# Load previously scraped records (if any) so that re-runs append to the
# catalogue instead of overwriting it.
catalogue_file = "gorky_catalogue.json"
existing_artworks = []
if os.path.exists(catalogue_file):
    with open(catalogue_file, "r", encoding="utf-8") as f:
        existing_artworks = json.load(f)

# IDs already present in the catalogue — used both to continue the id
# sequence and to skip records that were saved in an earlier run.
existing_ids = {artwork['id'] for artwork in existing_artworks}
artworks = []
def _text_or_none(tag):
    """Return the stripped text of a bs4 tag, or None when the tag is missing."""
    return tag.text.strip() if tag else None


def _lines_or_none(tag):
    """Split a section tag's text into non-empty stripped lines, or None when missing."""
    if not tag:
        return None
    return [line.strip() for line in tag.text.strip().split("\n") if line.strip()]


def _save_catalogue():
    """Persist the combined previously-saved + newly-scraped records to disk."""
    with open("gorky_catalogue.json", "w", encoding="utf-8") as f:
        json.dump(existing_artworks + artworks, f, indent=4, ensure_ascii=False)


try:
    # Log in to the catalogue — the site only asks for an e-mail address.
    driver.get("https://www.gorkycatalogue.org/catalogue/")
    email_input = driver.find_element(By.ID, "email")
    email_input.send_keys("darekkolano321@gmail.com")
    submit_button = driver.find_element(By.ID, "loginReturn")
    submit_button.click()

    # Wait for the post-login page to load.
    sleep(5)
    print("Zalogowano")

    # Listing pages are addressed with the pageNum query parameter, starting at 0.
    pages_count = [f"index.php?pageNum={i}" for i in range(0, 35)]
    base_url = "https://www.gorkycatalogue.org/catalogue/"

    # Continue numbering after the highest id already stored in the catalogue.
    artwork_id = max(existing_ids, default=0) + 1

    for page in pages_count:
        print(f"Przetwarzam stronę: {base_url + page}")
        driver.get(base_url + page)
        sleep(3)  # wait for the listing page to render

        soup = BeautifulSoup(driver.page_source, "html.parser")

        # Collect detail-page links for every artwork on this listing page.
        # BUGFIX: the original compared the relative href against a list of
        # already-prefixed absolute URLs, so its duplicate check never matched;
        # dedupe on the raw href instead.
        artwork_links = []
        seen_hrefs = set()
        for link_tag in soup.find_all("a", class_="recordContainer image_noslide"):
            href = link_tag.get('href')
            if href and href not in seen_hrefs:
                seen_hrefs.add(href)
                artwork_links.append(f"https://www.gorkycatalogue.org{href}")

        print(f"Znaleziono {len(artwork_links)} obrazow na tej stronie.")

        # Scrape each artwork detail page.
        for artwork_url in artwork_links:
            try:
                driver.get(artwork_url)
                sleep(3)  # wait for the detail page to render
                soup = BeautifulSoup(driver.page_source, "html.parser")

                artwork_info = {"id": artwork_id}  # assign a fresh id
                artwork_id += 1

                # Title. NOTE(review): .strip('()[]') removes only those edge
                # characters, not surrounding whitespace — confirm intended.
                title_tag = soup.find("div", class_="tombstone div_Title")
                artwork_info["title"] = title_tag.text.strip('()[]') if title_tag else None

                artwork_info["name_of_artist"] = "Arshile Gorky"

                # Single-line tombstone fields (None when absent on the page).
                artwork_info["date"] = _text_or_none(soup.find("div", class_="tombstone div_fullDate"))
                artwork_info["dimensions"] = _text_or_none(soup.find("div", class_="tombstone div_fullDimension"))
                artwork_info["technique"] = _text_or_none(soup.find("div", class_="tombstone div_fullMedium"))
                artwork_info["signature"] = _text_or_none(soup.find("div", class_="tombstone div_fullInscription"))
                artwork_info["location"] = _text_or_none(soup.find("div", class_="tombstone div_CreditLine"))

                # Multi-line sections, stored as lists of non-empty lines.
                artwork_info["provenance"] = _lines_or_none(soup.find("div", id="sectionProvenance"))
                artwork_info["exhibitions"] = _lines_or_none(soup.find("div", id="sectionExhibitions"))
                artwork_info["bibliography"] = _lines_or_none(soup.find("div", id="sectionLiterature"))
                artwork_info["notes"] = _lines_or_none(soup.find("div", id="remarkText"))
                artwork_info["commentary"] = _lines_or_none(soup.find("div", id="commentaryText"))

                # Image: download to <folder_path>/<id>.jpg when available.
                image_tag = soup.find("a", {"data-src": True})
                if image_tag:
                    artwork_info["image_url"] = image_tag["data-src"]
                    # timeout added so a stalled download cannot hang the run
                    image_response = requests.get(artwork_info["image_url"], timeout=30)
                    if image_response.status_code == 200:
                        image_filename = f"{folder_path}/{artwork_info['id']}.jpg"
                        with open(image_filename, 'wb') as img_file:
                            img_file.write(image_response.content)
                    else:
                        artwork_info["image_url"] = None
                else:
                    # Previously the key was simply absent when no image tag was
                    # found; set it explicitly so every record has the same shape.
                    artwork_info["image_url"] = None

                # Skip ids already in the catalogue (fresh ids normally never
                # collide; this guards against a partially saved earlier run).
                if artwork_info["id"] not in existing_ids:
                    artworks.append(artwork_info)
                    existing_ids.add(artwork_info["id"])

                # Progress output: dump the record just scraped.
                print(json.dumps(artwork_info, indent=4, ensure_ascii=False))

            except Exception as e:
                print(f"Nie udało się przetworzyć linku {artwork_url}: {e}")

    # Save everything gathered during a complete run.
    _save_catalogue()
    print("Dane zostały zapisane do pliku 'gorky_catalogue.json'.")

# Ctrl+C: save whatever was collected so far before exiting.
except KeyboardInterrupt:
    print("Przerwano")
    _save_catalogue()
    print("Dane zostały zapisane")

finally:
    driver.quit()