first commit
This commit is contained in:
commit
b00e119e92
2
.gitignore
vendored
Normal file
2
.gitignore
vendored
Normal file
@@ -0,0 +1,2 @@
/images
/html
3
Dockerfile
Executable file
3
Dockerfile
Executable file
@@ -0,0 +1,3 @@
FROM python:3.10.16-alpine3.21
WORKDIR /app
RUN pip install bs4 requests
97
dali.py
Executable file
97
dali.py
Executable file
@@ -0,0 +1,97 @@
import requests
|
||||||
|
from bs4 import BeautifulSoup
|
||||||
|
import json
|
||||||
|
|
||||||
|
pages_count = [f"?page={i}" for i in range(1, 22)]
|
||||||
|
|
||||||
|
main_url = "https://www.salvador-dali.org/en/artwork/catalogue-raisonne-paintings/obres/"
|
||||||
|
|
||||||
|
for page in pages_count:
|
||||||
|
|
||||||
|
response = requests.get(f"{main_url}{page}")
|
||||||
|
soup = BeautifulSoup(response.text, 'html.parser')
|
||||||
|
|
||||||
|
data = []
|
||||||
|
# Tworzenie listy na linki
|
||||||
|
hrefs = []
|
||||||
|
|
||||||
|
# Znajdowanie wszystkich linków (elementy <a> z atrybutem href)
|
||||||
|
for h2_tag in soup.find_all('h2', class_='obra__title'):
|
||||||
|
# Szukanie wewnątrz <h2> elementu <a> z atrybutem href
|
||||||
|
a_tag = h2_tag.find('a', href=True)
|
||||||
|
if a_tag: # Sprawdzenie, czy element <a> został znaleziony
|
||||||
|
hrefs.append(a_tag['href'])
|
||||||
|
|
||||||
|
|
||||||
|
for link in hrefs:
|
||||||
|
response = requests.get(f"https://www.salvador-dali.org{link}")
|
||||||
|
soup = BeautifulSoup(response.text, 'html.parser')
|
||||||
|
with open(f"html/{link.replace('/', '_')}.html", 'w+', encoding='utf-8') as file:
|
||||||
|
file.write(soup.prettify())
|
||||||
|
|
||||||
|
|
||||||
|
painting = soup.find('div', class_='fitxa-obra')
|
||||||
|
if painting:
|
||||||
|
# Tworzenie słownika na informacje
|
||||||
|
painting_info = {}
|
||||||
|
|
||||||
|
# Znajdowanie tytułu pracy
|
||||||
|
title = painting.find('p', class_='titol-traduit')
|
||||||
|
title_2 = painting.find('span', class_='titol-traduit')
|
||||||
|
|
||||||
|
if title:
|
||||||
|
painting_info['title'] = title.text.strip()
|
||||||
|
elif title_2:
|
||||||
|
painting_info['title'] = title_2.text.strip()
|
||||||
|
title = title_2
|
||||||
|
|
||||||
|
# Szukanie wszystkich kluczy <dt> i wartości <dd>
|
||||||
|
for dl_tag in painting.find_all('dl'):
|
||||||
|
for dt, dd in zip(dl_tag.find_all('dt'), dl_tag.find_all('dd')):
|
||||||
|
# Klucz to tekst z <dt>
|
||||||
|
key = dt.text.strip().lower()
|
||||||
|
# Wartość to tekst z <dd>
|
||||||
|
value = dd.get_text(separator=" ").split('<br>')
|
||||||
|
painting_info[key] = value
|
||||||
|
painting_info['name_of_aritst'] = "Salvador Dali"
|
||||||
|
|
||||||
|
# Wyciągniecie danych z sekcji z obrazem
|
||||||
|
image_section = soup.find('figure', class_='foto-obra')
|
||||||
|
if image_section:
|
||||||
|
image_tag = image_section.find('img')
|
||||||
|
if image_tag and 'src' in image_tag.attrs:
|
||||||
|
painting_info['image_url'] = f"https://www.salvador-dali.org{image_tag['src']}"
|
||||||
|
|
||||||
|
# Zapis zdjęcia
|
||||||
|
image_response = requests.get(painting_info['image_url'])
|
||||||
|
if image_response.status_code == 200:
|
||||||
|
with open(f"images/{painting_info['title'].replace(' ', '_')}.jpg", 'wb') as img_file:
|
||||||
|
img_file.write(image_response.content)
|
||||||
|
|
||||||
|
# Wyciągnięcie danych z sekcji "Provenance"
|
||||||
|
provenance_section = soup.find('h2', text="Provenance")
|
||||||
|
if provenance_section:
|
||||||
|
provenance_items = provenance_section.find_next('ol').find_all('li')
|
||||||
|
painting_info['provenance'] = [item.get_text(strip=True) for item in provenance_items]
|
||||||
|
|
||||||
|
# Wyciągnięcie danych z sekcji "Exhibitions"
|
||||||
|
exhibitions_section = soup.find('h2', text="xhibitions")
|
||||||
|
if exhibitions_section:
|
||||||
|
exhibitions_items = exhibitions_section.find_next('ol').find_all('li')
|
||||||
|
painting_info['exhibitions'] = [item.get_text(strip=True) for item in exhibitions_items]
|
||||||
|
|
||||||
|
# Wyciągnięcie danych z sekcji "Bibliography"
|
||||||
|
bibliography_section = soup.find('h2', text="Bibliography")
|
||||||
|
if bibliography_section:
|
||||||
|
bibliography_items = bibliography_section.find_next('ol').find_all('li')
|
||||||
|
painting_info['bibliography'] = [item.get_text(strip=True) for item in bibliography_items]
|
||||||
|
|
||||||
|
# Wyświetlanie zebranych informacji
|
||||||
|
#print(json.dumps(painting_info, indent=4, ensure_ascii=False))
|
||||||
|
|
||||||
|
data.append(painting_info)
|
||||||
|
|
||||||
|
|
||||||
|
# Zapysanie wyników do pliku
|
||||||
|
with open('dali.json', 'w', encoding='utf-8') as f:
|
||||||
|
json.dump(data, f, indent=4, ensure_ascii=False)
|
10
docker-compose.yml
Executable file
10
docker-compose.yml
Executable file
@@ -0,0 +1,10 @@
version: '3'

services:
  main:
    build:
      context: ./
      dockerfile: ./Dockerfile
    command: tail -f /dev/null
    volumes:
      - ./:/app
2
requirements.txt
Executable file
2
requirements.txt
Executable file
@@ -0,0 +1,2 @@
requests
bs4
Loading…
Reference in New Issue
Block a user