first commit

vboxuser 2024-12-10 18:49:40 +01:00
commit b00e119e92
5 changed files with 114 additions and 0 deletions

2 .gitignore vendored Normal file

@@ -0,0 +1,2 @@
/images
/html
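
These are the two output directories dali.py writes into (saved artwork pages under html/, downloaded pictures under images/). The script opens files inside them without creating the directories first, so they must exist before the first run. A minimal helper sketch, assuming it is run from the repository root; the helper itself is hypothetical and not part of this commit:

import os

# create the gitignored output directories that dali.py writes into
for directory in ("html", "images"):
    os.makedirs(directory, exist_ok=True)  # no-op if the directory already exists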

3 Dockerfile Executable file

@@ -0,0 +1,3 @@
FROM python:3.10.16-alpine3.21
WORKDIR /app
RUN pip install bs4 requests

97 dali.py Executable file

@@ -0,0 +1,97 @@
import requests
from bs4 import BeautifulSoup
import json

pages_count = [f"?page={i}" for i in range(1, 22)]
main_url = "https://www.salvador-dali.org/en/artwork/catalogue-raisonne-paintings/obres/"

# collected info for every painting, accumulated across all catalogue pages
data = []

for page in pages_count:
    response = requests.get(f"{main_url}{page}")
    soup = BeautifulSoup(response.text, 'html.parser')

    # list of links to the individual artwork pages
    hrefs = []
    # find all links (<a> elements with an href attribute) inside the artwork titles
    for h2_tag in soup.find_all('h2', class_='obra__title'):
        # look inside the <h2> for an <a> element with an href attribute
        a_tag = h2_tag.find('a', href=True)
        if a_tag:  # check whether the <a> element was found
            hrefs.append(a_tag['href'])

    for link in hrefs:
        response = requests.get(f"https://www.salvador-dali.org{link}")
        soup = BeautifulSoup(response.text, 'html.parser')
        with open(f"html/{link.replace('/', '_')}.html", 'w+', encoding='utf-8') as file:
            file.write(soup.prettify())

        painting = soup.find('div', class_='fitxa-obra')
        if painting:
            # dictionary for this painting's information
            painting_info = {}

            # find the title of the work
            title = painting.find('p', class_='titol-traduit')
            title_2 = painting.find('span', class_='titol-traduit')
            if title:
                painting_info['title'] = title.text.strip()
            elif title_2:
                painting_info['title'] = title_2.text.strip()
                title = title_2

            # collect all <dt> keys and their <dd> values
            for dl_tag in painting.find_all('dl'):
                for dt, dd in zip(dl_tag.find_all('dt'), dl_tag.find_all('dd')):
                    # the key is the text of the <dt>
                    key = dt.text.strip().lower()
                    # the value is the text of the <dd>, with <br> tags treated as separators
                    for br in dd.find_all('br'):
                        br.replace_with('\n')
                    value = [part.strip() for part in dd.get_text(separator=" ").split('\n') if part.strip()]
                    painting_info[key] = value

            painting_info['name_of_artist'] = "Salvador Dali"

            # extract data from the image section
            image_section = soup.find('figure', class_='foto-obra')
            if image_section:
                image_tag = image_section.find('img')
                if image_tag and 'src' in image_tag.attrs:
                    painting_info['image_url'] = f"https://www.salvador-dali.org{image_tag['src']}"
                    # save the image
                    image_response = requests.get(painting_info['image_url'])
                    if image_response.status_code == 200:
                        with open(f"images/{painting_info['title'].replace(' ', '_')}.jpg", 'wb') as img_file:
                            img_file.write(image_response.content)

            # extract data from the "Provenance" section
            provenance_section = soup.find('h2', string="Provenance")
            if provenance_section:
                provenance_items = provenance_section.find_next('ol').find_all('li')
                painting_info['provenance'] = [item.get_text(strip=True) for item in provenance_items]

            # extract data from the "Exhibitions" section
            exhibitions_section = soup.find('h2', string="Exhibitions")
            if exhibitions_section:
                exhibitions_items = exhibitions_section.find_next('ol').find_all('li')
                painting_info['exhibitions'] = [item.get_text(strip=True) for item in exhibitions_items]

            # extract data from the "Bibliography" section
            bibliography_section = soup.find('h2', string="Bibliography")
            if bibliography_section:
                bibliography_items = bibliography_section.find_next('ol').find_all('li')
                painting_info['bibliography'] = [item.get_text(strip=True) for item in bibliography_items]

            # print the collected information
            # print(json.dumps(painting_info, indent=4, ensure_ascii=False))
            data.append(painting_info)

# save the results to a file
with open('dali.json', 'w', encoding='utf-8') as f:
    json.dump(data, f, indent=4, ensure_ascii=False)
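
The scraper leaves everything it collects in a single dali.json file, one object per painting. A short sketch for loading the file back and listing what was scraped; besides 'title', 'image_url', 'name_of_artist', and the provenance/exhibitions/bibliography lists, the remaining keys come from each page's <dt>/<dd> pairs:

import json

with open("dali.json", encoding="utf-8") as f:
    paintings = json.load(f)

print(f"{len(paintings)} paintings scraped")
for painting in paintings:
    # 'title' may be missing if neither title element was found on the page
    print(painting.get("title", "<untitled>"), "->", painting.get("image_url", "no image"))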

10 docker-compose.yml Executable file

@@ -0,0 +1,10 @@
version: '3'
services:
  main:
    build:
      context: ./
      dockerfile: ./Dockerfile
    command: tail -f /dev/null
    volumes:
      - ./:/app

2 requirements.txt Executable file

@@ -0,0 +1,2 @@
requests
bs4