first commit
This commit is contained in:
commit
b00e119e92
2
.gitignore
vendored
Normal file
2
.gitignore
vendored
Normal file
@ -0,0 +1,2 @@
|
||||
/images
|
||||
/html
|
3
Dockerfile
Executable file
3
Dockerfile
Executable file
@ -0,0 +1,3 @@
|
||||
# Base image: CPython 3.10.16 on Alpine Linux 3.21.
FROM python:3.10.16-alpine3.21

# All following instructions (and the container's default working directory)
# use /app.
WORKDIR /app

# Install the scraper's third-party dependencies. --no-cache-dir keeps pip's
# download cache out of the image layer.
RUN pip install --no-cache-dir bs4 requests
|
97
dali.py
Executable file
97
dali.py
Executable file
@ -0,0 +1,97 @@
|
||||
"""Scrape the Salvador Dali catalogue raisonne of paintings.

For every listing page the script collects the links to the individual
artwork pages, stores a prettified copy of each artwork page under ``html/``,
downloads the artwork image into ``images/`` and finally writes the metadata
collected for all artworks to ``dali.json``.
"""
import json
import os
import re
import sys
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup

# Query strings of the listing pages to visit (pages 1 to 21).
pages_count = [f"?page={i}" for i in range(1, 22)]

main_url = "https://www.salvador-dali.org/en/artwork/catalogue-raisonne-paintings/obres/"

# Root used to resolve the relative links and image paths found in the pages.
SITE_ROOT = "https://www.salvador-dali.org"

# Seconds to wait for the server before giving up on a single request.
REQUEST_TIMEOUT = 30

# Placeholder substituted for <br> tags so that a <dd> can be split on its
# explicit line breaks after the markup has been flattened to text.
_BR_MARKER = "\x1f"


def _fetch(session, url):
    """Return the response for *url*, or ``None`` when the request fails.

    Failures (connection errors, timeouts, HTTP error statuses) are reported
    on stderr and skipped so that one bad page does not abort the whole run.
    """
    try:
        response = session.get(url, timeout=REQUEST_TIMEOUT)
        response.raise_for_status()
    except requests.RequestException as error:
        print(f"Skipping {url}: {error}", file=sys.stderr)
        return None
    return response


def _artwork_links(listing_soup):
    """Return the href of the first ``<a>`` inside every ``h2.obra__title``."""
    anchors = (
        heading.find("a", href=True)
        for heading in listing_soup.find_all("h2", class_="obra__title")
    )
    return [anchor["href"] for anchor in anchors if anchor is not None]


def _definition_values(dd):
    """Return the text of a ``<dd>`` as a list, one entry per ``<br>``-separated line.

    ``get_text`` never returns markup, so splitting its result on the literal
    string ``'<br>'`` cannot work; the ``<br>`` tags are replaced by a marker
    first and the flattened text is split on that marker instead.
    """
    for line_break in dd.find_all("br"):
        line_break.replace_with(_BR_MARKER)
    chunks = dd.get_text(separator=" ").split(_BR_MARKER)
    # Collapse runs of whitespace and drop empty chunks.
    return [" ".join(chunk.split()) for chunk in chunks if chunk.strip()]


def _section_items(soup, heading):
    """Return the ``<li>`` texts of the ``<ol>`` following the ``<h2>`` named *heading*.

    Returns ``None`` when the heading or its list is missing. The heading is
    matched ignoring case and surrounding whitespace.
    """
    pattern = re.compile(rf"^\s*{re.escape(heading)}\s*$", re.IGNORECASE)
    title = soup.find("h2", string=pattern)
    if title is None:
        return None
    listing = title.find_next("ol")
    if listing is None:
        return None
    return [item.get_text(strip=True) for item in listing.find_all("li")]


def _image_url(soup):
    """Return the absolute URL of the image in ``figure.foto-obra``, or ``None``."""
    figure = soup.find("figure", class_="foto-obra")
    if figure is None:
        return None
    image = figure.find("img")
    if image is None or "src" not in image.attrs:
        return None
    return urljoin(SITE_ROOT, image["src"])


def _file_stem(title, fallback):
    """Return a file name stem built from *title*, or *fallback* if it is empty.

    Spaces become underscores (the original naming scheme) and path
    separators are replaced so a title can never point outside ``images/``.
    NOTE: works that share a title still map to the same file name.
    """
    stem = re.sub(r"[\\/]", "_", (title or "").replace(" ", "_"))
    return stem or fallback


def _scrape_painting(session, link):
    """Download one artwork page and return its metadata dict.

    Side effects: writes ``html/<slug>.html`` and, when an image is found and
    can be downloaded, ``images/<title>.jpg``. Returns ``None`` when the page
    cannot be fetched or has no ``div.fitxa-obra`` block.
    """
    response = _fetch(session, urljoin(SITE_ROOT, link))
    if response is None:
        return None
    soup = BeautifulSoup(response.text, "html.parser")

    # Keep a local copy of the page; this happens before the tree is modified
    # by _definition_values below.
    slug = link.replace("/", "_")
    with open(f"html/{slug}.html", "w", encoding="utf-8") as html_file:
        html_file.write(soup.prettify())

    painting = soup.find("div", class_="fitxa-obra")
    if painting is None:
        return None

    painting_info = {}

    # The translated title is either a <p> or a <span> with the same class.
    title_tag = painting.find("p", class_="titol-traduit")
    if title_tag is None:
        title_tag = painting.find("span", class_="titol-traduit")
    title_text = title_tag.text.strip() if title_tag is not None else None
    if title_text is not None:
        painting_info["title"] = title_text

    # Every <dt>/<dd> pair becomes one key/value entry.
    for dl_tag in painting.find_all("dl"):
        for dt, dd in zip(dl_tag.find_all("dt"), dl_tag.find_all("dd")):
            painting_info[dt.text.strip().lower()] = _definition_values(dd)

    # Key spelling kept as-is so existing consumers of dali.json keep working.
    painting_info["name_of_aritst"] = "Salvador Dali"

    image_url = _image_url(soup)
    if image_url is not None:
        painting_info["image_url"] = image_url
        image_response = _fetch(session, image_url)
        if image_response is not None:
            # Use the title captured above: a <dt> labelled "title" may have
            # replaced painting_info["title"] with a list.
            stem = _file_stem(title_text, slug)
            with open(f"images/{stem}.jpg", "wb") as img_file:
                img_file.write(image_response.content)

    sections = (
        ("provenance", "Provenance"),
        ("exhibitions", "Exhibitions"),
        ("bibliography", "Bibliography"),
    )
    for key, heading in sections:
        items = _section_items(soup, heading)
        if items is not None:
            painting_info[key] = items

    return painting_info


def main():
    """Scrape every listing page and write all collected records to ``dali.json``."""
    # Both output directories are git-ignored, so they do not exist in a
    # fresh checkout.
    os.makedirs("html", exist_ok=True)
    os.makedirs("images", exist_ok=True)

    # One list for the whole run, so records from every page are kept.
    data = []
    with requests.Session() as session:
        for page in pages_count:
            listing = _fetch(session, f"{main_url}{page}")
            if listing is None:
                continue
            listing_soup = BeautifulSoup(listing.text, "html.parser")
            for link in _artwork_links(listing_soup):
                painting_info = _scrape_painting(session, link)
                if painting_info is not None:
                    data.append(painting_info)

    # Write the results once, after all pages have been processed.
    with open("dali.json", "w", encoding="utf-8") as f:
        json.dump(data, f, indent=4, ensure_ascii=False)


if __name__ == "__main__":
    main()
|
10
docker-compose.yml
Executable file
10
docker-compose.yml
Executable file
@ -0,0 +1,10 @@
|
||||
# Compose file format version.
version: '3'

services:
  main:
    # Build the image from the Dockerfile in this directory.
    build:
      context: ./
      dockerfile: ./Dockerfile
    # Keep the container running without doing any work.
    command: tail -f /dev/null
    # Mount this directory at /app inside the container.
    volumes:
      - ./:/app
|
2
requirements.txt
Executable file
2
requirements.txt
Executable file
@ -0,0 +1,2 @@
|
||||
requests
|
||||
bs4
|
Loading…
Reference in New Issue
Block a user