# PROJEKT_ZALICZENIOWYY/gorky.py
import os
import requests
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from webdriver_manager.chrome import ChromeDriverManager
from time import sleep
import json
from bs4 import BeautifulSoup
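
# Helper used below for the multi-line catalogue sections (provenance,
# exhibitions, bibliography, notes, commentary), which all follow the same
# "split into non-empty lines" pattern.
def extract_section_list(soup, element_id):
    """Return the section's non-empty lines as a list, or None if the section is absent."""
    tag = soup.find("div", id=element_id)
    if tag is None:
        return None
    return [line.strip() for line in tag.text.strip().split("\n") if line.strip()]
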
# Set up chromedriver via webdriver-manager
service = Service(ChromeDriverManager().install())
driver = webdriver.Chrome(service=service)
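# Optional (sketch, commented out): run Chrome headless so the scraper can run
# without a visible browser window. Assumption: the catalogue renders the same
# markup headlessly.
# options = webdriver.ChromeOptions()
# options.add_argument("--headless=new")
# driver = webdriver.Chrome(service=service, options=options)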
# Create the "gorky" folder for downloaded images if it does not exist yet
folder_path = "gorky"
if not os.path.exists(folder_path):
    os.makedirs(folder_path)
    print(f"Folder '{folder_path}' created.")
else:
    print(f"Folder '{folder_path}' already exists.")
# Load existing data from the JSON file if it exists
if os.path.exists("gorky_catalogue.json"):
    with open("gorky_catalogue.json", "r", encoding="utf-8") as f:
        existing_artworks = json.load(f)
else:
    existing_artworks = []
existing_ids = {artwork['id'] for artwork in existing_artworks}
# Deduplicate by catalogue URL rather than by freshly generated id (the old
# id-based check could never fire, since new ids are always unique).
# Assumption: entries saved by this script carry a "url" field; older entries
# without one are simply re-scraped.
existing_urls = {artwork.get('url') for artwork in existing_artworks}
artworks = []
try:
    # Log in to the site
    driver.get("https://www.gorkycatalogue.org/catalogue/")
    email_input = driver.find_element(By.ID, "email")
    email_input.send_keys("darekkolano321@gmail.com")
    submit_button = driver.find_element(By.ID, "loginReturn")
    submit_button.click()
    # Fixed sleep to let the page load
    sleep(5)
    print("Logged in")
    # Catalogue pages are addressed with the pageNum query parameter, starting at 0
    pages_count = [f"index.php?pageNum={i}" for i in range(0, 35)]
    base_url = "https://www.gorkycatalogue.org/catalogue/"
    # Collect the data; new ids continue after the highest existing one
    artwork_id = max(existing_ids, default=0) + 1
    for page in pages_count:
        print(f"Processing page: {base_url + page}")
        driver.get(base_url + page)
        sleep(3)  # fixed sleep to let the page load
        soup = BeautifulSoup(driver.page_source, "html.parser")
        # Find the links to individual artworks (compare full URLs so the
        # duplicate check actually matches what is stored in the list)
        artwork_links = []
        for link_tag in soup.find_all("a", class_="recordContainer image_noslide"):
            href = link_tag.get('href')
            if href:
                full_url = f"https://www.gorkycatalogue.org{href}"
                if full_url not in artwork_links:
                    artwork_links.append(full_url)
        print(f"Found {len(artwork_links)} artworks on this page.")
        # Process each artwork page
        for artwork_url in artwork_links:
            if artwork_url in existing_urls:
                continue  # already scraped on a previous run
            try:
                driver.get(artwork_url)
                sleep(3)  # fixed sleep to let the page load
                soup = BeautifulSoup(driver.page_source, "html.parser")
                artwork_info = {"id": artwork_id, "url": artwork_url}  # assign a fresh id
                artwork_id += 1
                # Title (strip whitespace first, then any surrounding brackets)
                title_tag = soup.find("div", class_="tombstone div_Title")
                artwork_info["title"] = title_tag.text.strip().strip('()[]') if title_tag else None
                artwork_info["name_of_artist"] = "Arshile Gorky"
                # Date
                date_tag = soup.find("div", class_="tombstone div_fullDate")
                artwork_info["date"] = date_tag.text.strip() if date_tag else None
                # Dimensions
                dimensions_tag = soup.find("div", class_="tombstone div_fullDimension")
                artwork_info["dimensions"] = dimensions_tag.text.strip() if dimensions_tag else None
                # Technique / medium
                technique_tag = soup.find("div", class_="tombstone div_fullMedium")
                artwork_info["technique"] = technique_tag.text.strip() if technique_tag else None
                # Signature / inscription
                signature_tag = soup.find("div", class_="tombstone div_fullInscription")
                artwork_info["signature"] = signature_tag.text.strip() if signature_tag else None
                # Location / credit line
                location_tag = soup.find("div", class_="tombstone div_CreditLine")
                artwork_info["location"] = location_tag.text.strip() if location_tag else None
                # Multi-line sections, each split into a list of non-empty lines
                artwork_info["provenance"] = extract_section_list(soup, "sectionProvenance")
                artwork_info["exhibitions"] = extract_section_list(soup, "sectionExhibitions")
                artwork_info["bibliography"] = extract_section_list(soup, "sectionLiterature")
                artwork_info["notes"] = extract_section_list(soup, "remarkText")
                artwork_info["commentary"] = extract_section_list(soup, "commentaryText")
                # Image: the link carries the full-size image URL in data-src
                image_tag = soup.find("a", {"data-src": True})
                if image_tag:
                    artwork_info["image_url"] = image_tag["data-src"]
                    # Download the image and save it under the artwork's id
                    image_url = artwork_info["image_url"]
                    image_response = requests.get(image_url, timeout=30)
                    if image_response.status_code == 200:
                        image_filename = f"{folder_path}/{artwork_info['id']}.jpg"
                        with open(image_filename, 'wb') as img_file:
                            img_file.write(image_response.content)
                else:
                    artwork_info["image_url"] = None
                # Record the artwork and remember its URL to avoid future duplicates
                artworks.append(artwork_info)
                existing_urls.add(artwork_url)
                # Print the collected data
                print(json.dumps(artwork_info, indent=4, ensure_ascii=False))
            except Exception as e:
                print(f"Failed to process link {artwork_url}: {e}")
    # Save the combined data
    with open("gorky_catalogue.json", "w", encoding="utf-8") as f:
        json.dump(existing_artworks + artworks, f, indent=4, ensure_ascii=False)
    print("Data saved to 'gorky_catalogue.json'.")
# Handle Ctrl-C: save whatever has been collected so far before exiting
except KeyboardInterrupt:
    print("Interrupted")
    with open("gorky_catalogue.json", "w", encoding="utf-8") as f:
        json.dump(existing_artworks + artworks, f, indent=4, ensure_ascii=False)
    print("Data saved")
finally:
    driver.quit()