98 lines
4.0 KiB
Python
Executable File
98 lines
4.0 KiB
Python
Executable File
import json
import re
from pathlib import Path
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup
|
|
|
|
# Query-string suffixes for listing pages 1 through 21 of the catalogue.
pages_count = [f"?page={i}" for i in range(1, 22)]

# Listing URL that each page suffix is appended to.
main_url = "https://www.salvador-dali.org/en/artwork/catalogue-raisonne-paintings/obres/"

# Origin that the site's relative links and image paths are resolved against.
BASE_URL = "https://www.salvador-dali.org"
# Seconds to wait on any single HTTP request before giving up on it.
REQUEST_TIMEOUT = 30
# Output locations: raw page copies, downloaded images, and the JSON summary.
HTML_DIR = Path("html")
IMAGES_DIR = Path("images")
OUTPUT_FILE = "dali.json"

# Characters that are invalid in file names on at least one common platform.
_UNSAFE_FILENAME_CHARS = re.compile(r'[<>:"/\\|?*\x00-\x1f]')
# A plain string given to find(string=...) must equal the heading text exactly;
# a pattern also tolerates differences in case or surrounding words.
_EXHIBITIONS_HEADING = re.compile(r"exhibitions", re.IGNORECASE)
# Placeholder swapped in for <br> tags so their positions survive get_text().
_LINE_BREAK = "\x1f"


def safe_filename(name: str, fallback: str = "untitled") -> str:
    """Return *name* with spaces and path-unsafe characters replaced by ``_``.

    An empty result is replaced by *fallback* so the caller always receives a
    usable file name.
    """
    cleaned = _UNSAFE_FILENAME_CHARS.sub("_", name.replace(" ", "_"))
    return cleaned or fallback


def _fetch(session, url: str):
    """GET *url* and return the response, or ``None`` if it cannot be used.

    Network errors and non-success status codes are reported and skipped so
    that one bad page does not abort the whole run.
    """
    try:
        response = session.get(url, timeout=REQUEST_TIMEOUT)
    except requests.RequestException as error:
        print(f"Skipping {url}: {error}")
        return None
    if not response.ok:
        print(f"Skipping {url}: HTTP {response.status_code}")
        return None
    return response


def _painting_links(listing_soup):
    """Return the href of the first link inside every ``h2.obra__title``."""
    anchors = (
        heading.find('a', href=True)
        for heading in listing_soup.find_all('h2', class_='obra__title')
    )
    # Headings without a link are skipped.
    return [anchor['href'] for anchor in anchors if anchor]


def _lines_of(tag):
    """Return the text of *tag* as a list with one entry per ``<br>``-separated line.

    The ``<br>`` elements of *tag* are replaced in place, so call this only
    once the markup itself is no longer needed.
    """
    # get_text() drops markup, so the breaks have to be marked before it runs.
    for line_break in tag.find_all('br'):
        line_break.replace_with(_LINE_BREAK)
    segments = tag.get_text(separator=" ").split(_LINE_BREAK)
    return [segment.strip() for segment in segments if segment.strip()]


def _section_items(soup, heading):
    """Return the ``<li>`` texts of the ``<ol>`` that follows an ``<h2>`` heading.

    *heading* is a string (exact match) or a compiled pattern. Returns ``None``
    when the heading, or a list after it, is not present.
    """
    header = soup.find('h2', string=heading)
    if header is None:
        return None
    listing = header.find_next('ol')
    if listing is None:
        return None
    return [item.get_text(strip=True) for item in listing.find_all('li')]


def _save_image(session, image_url: str, stem: str) -> None:
    """Download *image_url* into ``images/<stem>.jpg`` when the server answers 200."""
    response = _fetch(session, image_url)
    if response is None or response.status_code != 200:
        return
    with open(IMAGES_DIR / f"{stem}.jpg", 'wb') as img_file:
        img_file.write(response.content)


def _scrape_painting(session, link: str):
    """Fetch one painting page, archive its HTML, and return its details.

    Returns a dict of the extracted fields, or ``None`` when the page could not
    be fetched or holds no ``div.fitxa-obra`` element.
    """
    response = _fetch(session, urljoin(BASE_URL, link))
    if response is None:
        return None
    soup = BeautifulSoup(response.text, 'html.parser')

    # Keep a prettified copy of the page next to the extracted data.
    with open(HTML_DIR / f"{safe_filename(link)}.html", 'w', encoding='utf-8') as file:
        file.write(soup.prettify())

    painting = soup.find('div', class_='fitxa-obra')
    if not painting:
        return None

    painting_info = {}

    # The translated title is looked up as a <p> first, then as a <span>.
    title = (
        painting.find('p', class_='titol-traduit')
        or painting.find('span', class_='titol-traduit')
    )
    if title:
        painting_info['title'] = title.text.strip()

    # Every <dt>/<dd> pair becomes a lower-cased key with a list of lines.
    for dl_tag in painting.find_all('dl'):
        for dt, dd in zip(dl_tag.find_all('dt'), dl_tag.find_all('dd')):
            painting_info[dt.text.strip().lower()] = _lines_of(dd)

    # The misspelled key is kept as-is: it is part of the emitted JSON schema.
    painting_info['name_of_aritst'] = "Salvador Dali"

    # Image URL and download.
    image_section = soup.find('figure', class_='foto-obra')
    image_tag = image_section.find('img') if image_section else None
    if image_tag and 'src' in image_tag.attrs:
        painting_info['image_url'] = urljoin(BASE_URL, image_tag['src'])
        # Fall back to the page link when no title was found, so a missing
        # title cannot abort the run.
        # NOTE: paintings that share a title still share one image file name.
        stem = safe_filename(
            painting_info.get('title', ''),
            fallback=safe_filename(link),
        )
        _save_image(session, painting_info['image_url'], stem)

    # "Provenance", "Exhibitions" and "Bibliography" sections.
    sections = (
        ('provenance', "Provenance"),
        ('exhibitions', _EXHIBITIONS_HEADING),
        ('bibliography', "Bibliography"),
    )
    for key, heading in sections:
        items = _section_items(soup, heading)
        if items is not None:
            painting_info[key] = items

    return painting_info


def main() -> None:
    """Scrape every listing page and write all paintings to ``dali.json``."""
    HTML_DIR.mkdir(parents=True, exist_ok=True)
    IMAGES_DIR.mkdir(parents=True, exist_ok=True)

    # Collected across all pages so the final dump contains every painting.
    data = []
    with requests.Session() as session:
        for page in pages_count:
            response = _fetch(session, f"{main_url}{page}")
            if response is None:
                continue
            listing_soup = BeautifulSoup(response.text, 'html.parser')
            for link in _painting_links(listing_soup):
                painting_info = _scrape_painting(session, link)
                if painting_info is not None:
                    data.append(painting_info)

    # Save the results to a file.
    with open(OUTPUT_FILE, 'w', encoding='utf-8') as f:
        json.dump(data, f, indent=4, ensure_ascii=False)


if __name__ == "__main__":
    main()
|