"""Scrape the Salvador Dali catalogue raisonne (paintings).

For every painting linked from the paginated catalogue listing, the script
saves the raw detail page under html/, downloads the main image into
images/, and collects structured metadata for all pages into dali.json.
"""

import json
import os

import requests
from bs4 import BeautifulSoup

BASE_URL = "https://www.salvador-dali.org"
CATALOGUE_URL = f"{BASE_URL}/en/artwork/catalogue-raisonne-paintings/obres/"
PAGE_COUNT = 21  # the catalogue listing spans 21 pages

# Make sure the output directories exist before writing into them
# (originally the script crashed with FileNotFoundError on a fresh checkout).
os.makedirs("html", exist_ok=True)
os.makedirs("images", exist_ok=True)

# BUG FIX: `data` was re-initialised inside the page loop, so dali.json only
# ever contained the paintings from the last listing page. Initialise once.
data = []

for page_number in range(1, PAGE_COUNT + 1):
    response = requests.get(f"{CATALOGUE_URL}?page={page_number}", timeout=30)
    soup = BeautifulSoup(response.text, "html.parser")

    # Collect the detail-page links from the listing (one per painting).
    hrefs = []
    for h2_tag in soup.find_all("h2", class_="obra__title"):
        a_tag = h2_tag.find("a", href=True)
        if a_tag:
            hrefs.append(a_tag["href"])

    for link in hrefs:
        response = requests.get(f"{BASE_URL}{link}", timeout=30)
        soup = BeautifulSoup(response.text, "html.parser")

        # Keep a local copy of the raw page for later reprocessing.
        html_path = f"html/{link.replace('/', '_')}.html"
        with open(html_path, "w+", encoding="utf-8") as file:
            file.write(soup.prettify())

        painting = soup.find("div", class_="fitxa-obra")
        if not painting:
            continue

        painting_info = {}

        # The translated title appears either as a <p> or a <span>.
        title = painting.find("p", class_="titol-traduit") or painting.find(
            "span", class_="titol-traduit"
        )
        if title:
            painting_info["title"] = title.text.strip()

        # Each <dl> holds key/value metadata pairs (<dt>/<dd>), e.g. date,
        # technique, dimensions. Values keep the original line-split shape.
        for dl_tag in painting.find_all("dl"):
            for dt, dd in zip(dl_tag.find_all("dt"), dl_tag.find_all("dd")):
                key = dt.text.strip().lower()
                value = dd.get_text(separator=" ").split("\n")
                painting_info[key] = value

        # BUG FIX: the key was misspelled 'name_of_aritst'.
        painting_info["name_of_artist"] = "Salvador Dali"

        # Main image: record the absolute URL and download the file.
        image_section = soup.find("figure", class_="foto-obra")
        if image_section:
            image_tag = image_section.find("img")
            if image_tag and "src" in image_tag.attrs:
                image_url = f"{BASE_URL}{image_tag['src']}"
                painting_info["image_url"] = image_url
                image_response = requests.get(image_url, timeout=30)
                # Guard: without a parsed title there is no usable filename
                # (originally this raised KeyError).
                if image_response.status_code == 200 and "title" in painting_info:
                    image_name = painting_info["title"].replace(" ", "_")
                    with open(f"images/{image_name}.jpg", "wb") as img_file:
                        img_file.write(image_response.content)

        # "Provenance" / "Exhibitions" / "Bibliography" are <h2> headings each
        # followed by an <ol> of entries.
        # BUG FIX: "Exhibitions" was searched as "xhibitions" and never matched.
        # NOTE: `string=` replaces the bs4 `text=` argument (deprecated alias).
        for heading in ("Provenance", "Exhibitions", "Bibliography"):
            section = soup.find("h2", string=heading)
            if section:
                items = section.find_next("ol").find_all("li")
                painting_info[heading.lower()] = [
                    item.get_text(strip=True) for item in items
                ]

        data.append(painting_info)

# Written once, after all pages have been scraped (see BUG FIX above).
with open("dali.json", "w", encoding="utf-8") as f:
    json.dump(data, f, indent=4, ensure_ascii=False)