commit b00e119e92f55fd76e079d0214a1c6ac4faa8c8c
Author: vboxuser
Date:   Tue Dec 10 18:49:40 2024 +0100

    first commit

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..d84ea44
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,2 @@
+/images
+/html
\ No newline at end of file
diff --git a/Dockerfile b/Dockerfile
new file mode 100755
index 0000000..f974c28
--- /dev/null
+++ b/Dockerfile
@@ -0,0 +1,3 @@
+FROM python:3.10.16-alpine3.21
+WORKDIR /app
+RUN pip install bs4 requests
diff --git a/dali.py b/dali.py
new file mode 100755
index 0000000..7c4c9a8
--- /dev/null
+++ b/dali.py
@@ -0,0 +1,141 @@
+"""Scrape the Salvador Dali catalogue raisonne of paintings.
+
+For every painting linked from the catalogue index pages this script saves
+the prettified detail page under ``html/``, downloads the painting image to
+``images/`` and writes the metadata of all paintings to ``dali.json``.
+"""
+import json
+import os
+
+import requests
+from bs4 import BeautifulSoup
+
+BASE_URL = "https://www.salvador-dali.org"
+MAIN_URL = f"{BASE_URL}/en/artwork/catalogue-raisonne-paintings/obres/"
+PAGE_QUERIES = [f"?page={i}" for i in range(1, 22)]
+HTML_DIR = "html"
+IMAGES_DIR = "images"
+# Seconds to wait for the server before giving up on a request.
+REQUEST_TIMEOUT = 30
+# Output key -> text of the <h2> heading that precedes the section's list.
+SECTIONS = (
+    ("provenance", "Provenance"),
+    ("exhibitions", "Exhibitions"),
+    ("bibliography", "Bibliography"),
+)
+# Maps spaces and path separators to underscores for image file names.
+FILENAME_TABLE = str.maketrans(" /\\", "___")
+
+
+def fetch(url):
+    """Return the response for *url*, or None on a network error or non-200."""
+    try:
+        response = requests.get(url, timeout=REQUEST_TIMEOUT)
+    except requests.RequestException as error:
+        print(f"Skipping {url}: {error}")
+        return None
+    if response.status_code != 200:
+        print(f"Skipping {url}: HTTP {response.status_code}")
+        return None
+    return response
+
+
+def collect_links(soup):
+    """Return the hrefs of the detail pages linked from an index page."""
+    anchors = (
+        heading.find("a", href=True)
+        for heading in soup.find_all("h2", class_="obra__title")
+    )
+    return [anchor["href"] for anchor in anchors if anchor]
+
+
+def section_items(soup, heading):
+    """Return the <li> texts of the list after *heading*, or None if absent."""
+    header = soup.find("h2", string=heading)
+    listing = header.find_next("ol") if header else None
+    if listing is None:
+        return None
+    return [item.get_text(strip=True) for item in listing.find_all("li")]
+
+
+def parse_painting(soup):
+    """Return the metadata of a detail page, or None if it has no record."""
+    painting = soup.find("div", class_="fitxa-obra")
+    if painting is None:
+        return None
+
+    info = {}
+    # The translated title is marked up either as <p> or as <span>.
+    title = painting.find("p", class_="titol-traduit") or painting.find(
+        "span", class_="titol-traduit"
+    )
+    if title:
+        info["title"] = title.text.strip()
+
+    # Each <dt>/<dd> pair of the record becomes one key/value entry.
+    for dl_tag in painting.find_all("dl"):
+        for dt, dd in zip(dl_tag.find_all("dt"), dl_tag.find_all("dd")):
+            # NOTE(review): the split separator was garbled in the reviewed
+            # copy of this script; "\n" is assumed - confirm the original.
+            key = dt.text.strip().lower()
+            info[key] = dd.get_text(separator=" ").split("\n")
+    # The misspelled key is kept so existing readers of dali.json still work.
+    info["name_of_aritst"] = "Salvador Dali"
+
+    figure = soup.find("figure", class_="foto-obra")
+    image = figure.find("img") if figure else None
+    if image and "src" in image.attrs:
+        info["image_url"] = f"{BASE_URL}{image['src']}"
+
+    for key, heading in SECTIONS:
+        items = section_items(soup, heading)
+        if items is not None:
+            info[key] = items
+    return info
+
+
+def download_image(url, name):
+    """Save the image at *url* as ``images/<name>.jpg``; skip it on failure."""
+    response = fetch(url)
+    if response is None:
+        return
+    path = os.path.join(IMAGES_DIR, f"{name.translate(FILENAME_TABLE)}.jpg")
+    with open(path, "wb") as image_file:
+        image_file.write(response.content)
+
+
+def main():
+    """Crawl every index page and write the collected metadata to dali.json."""
+    # Both directories are git-ignored, so a fresh checkout does not have them.
+    os.makedirs(HTML_DIR, exist_ok=True)
+    os.makedirs(IMAGES_DIR, exist_ok=True)
+
+    # One list for the whole crawl, so dali.json covers all index pages.
+    data = []
+    for page in PAGE_QUERIES:
+        index = fetch(f"{MAIN_URL}{page}")
+        if index is None:
+            continue
+        for link in collect_links(BeautifulSoup(index.text, "html.parser")):
+            detail = fetch(f"{BASE_URL}{link}")
+            if detail is None:
+                continue
+            soup = BeautifulSoup(detail.text, "html.parser")
+            html_path = os.path.join(HTML_DIR, f"{link.replace('/', '_')}.html")
+            with open(html_path, "w", encoding="utf-8") as html_file:
+                html_file.write(soup.prettify())
+
+            info = parse_painting(soup)
+            if info is None:
+                continue
+            if "image_url" in info:
+                # Fall back to the link when the page has no translated title.
+                download_image(info["image_url"], info.get("title") or link)
+            data.append(info)
+
+    with open("dali.json", "w", encoding="utf-8") as f:
+        json.dump(data, f, indent=4, ensure_ascii=False)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/docker-compose.yml b/docker-compose.yml
new file mode 100755
index 0000000..7aa70f2
--- /dev/null
+++ b/docker-compose.yml
@@ -0,0 +1,10 @@
+version: '3'
+
+services:
+  main:
+    build:
+      context: ./
+      dockerfile: ./Dockerfile
+    command: tail -f /dev/null
+    volumes:
+      - ./:/app
diff --git a/requirements.txt b/requirements.txt
new file mode 100755
index 0000000..dc1536f
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,2 @@
+requests
+bs4