first commit

vboxuser 2024-12-10 18:49:40 +01:00
commit b00e119e92
5 changed files with 114 additions and 0 deletions

2 .gitignore vendored Normal file

@@ -0,0 +1,2 @@
/images
/html
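
These are the two output directories dali.py writes into (saved artwork pages under html/, downloaded pictures under images/). The script opens files inside them without creating the directories first, so they must exist before the first run. A minimal helper sketch, assuming it is run from the repository root; the helper itself is hypothetical and not part of this commit:

import os

# create the gitignored output directories that dali.py writes into
for directory in ("html", "images"):
    os.makedirs(directory, exist_ok=True)  # no-op if the directory already exists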

3 Dockerfile Executable file

@@ -0,0 +1,3 @@
FROM python:3.10.16-alpine3.21
WORKDIR /app
RUN pip install bs4 requests

97 dali.py Executable file

@@ -0,0 +1,97 @@
import requests
from bs4 import BeautifulSoup
import json

pages_count = [f"?page={i}" for i in range(1, 22)]
main_url = "https://www.salvador-dali.org/en/artwork/catalogue-raisonne-paintings/obres/"

# collected info for every painting, accumulated across all catalogue pages
data = []

for page in pages_count:
    response = requests.get(f"{main_url}{page}")
    soup = BeautifulSoup(response.text, 'html.parser')

    # list of links to the individual artwork pages
    hrefs = []
    # find all links (<a> elements with an href attribute) inside the artwork titles
    for h2_tag in soup.find_all('h2', class_='obra__title'):
        # look inside the <h2> for an <a> element with an href attribute
        a_tag = h2_tag.find('a', href=True)
        if a_tag:  # check whether the <a> element was found
            hrefs.append(a_tag['href'])

    for link in hrefs:
        response = requests.get(f"https://www.salvador-dali.org{link}")
        soup = BeautifulSoup(response.text, 'html.parser')
        with open(f"html/{link.replace('/', '_')}.html", 'w+', encoding='utf-8') as file:
            file.write(soup.prettify())

        painting = soup.find('div', class_='fitxa-obra')
        if painting:
            # dictionary for this painting's information
            painting_info = {}

            # find the title of the work
            title = painting.find('p', class_='titol-traduit')
            title_2 = painting.find('span', class_='titol-traduit')
            if title:
                painting_info['title'] = title.text.strip()
            elif title_2:
                painting_info['title'] = title_2.text.strip()
                title = title_2

            # collect all <dt> keys and their <dd> values
            for dl_tag in painting.find_all('dl'):
                for dt, dd in zip(dl_tag.find_all('dt'), dl_tag.find_all('dd')):
                    # the key is the text of the <dt>
                    key = dt.text.strip().lower()
                    # the value is the text of the <dd>, with <br> tags treated as separators
                    for br in dd.find_all('br'):
                        br.replace_with('\n')
                    value = [part.strip() for part in dd.get_text(separator=" ").split('\n') if part.strip()]
                    painting_info[key] = value

            painting_info['name_of_artist'] = "Salvador Dali"

            # extract data from the image section
            image_section = soup.find('figure', class_='foto-obra')
            if image_section:
                image_tag = image_section.find('img')
                if image_tag and 'src' in image_tag.attrs:
                    painting_info['image_url'] = f"https://www.salvador-dali.org{image_tag['src']}"
                    # save the image
                    image_response = requests.get(painting_info['image_url'])
                    if image_response.status_code == 200:
                        with open(f"images/{painting_info['title'].replace(' ', '_')}.jpg", 'wb') as img_file:
                            img_file.write(image_response.content)

            # extract data from the "Provenance" section
            provenance_section = soup.find('h2', string="Provenance")
            if provenance_section:
                provenance_items = provenance_section.find_next('ol').find_all('li')
                painting_info['provenance'] = [item.get_text(strip=True) for item in provenance_items]

            # extract data from the "Exhibitions" section
            exhibitions_section = soup.find('h2', string="Exhibitions")
            if exhibitions_section:
                exhibitions_items = exhibitions_section.find_next('ol').find_all('li')
                painting_info['exhibitions'] = [item.get_text(strip=True) for item in exhibitions_items]

            # extract data from the "Bibliography" section
            bibliography_section = soup.find('h2', string="Bibliography")
            if bibliography_section:
                bibliography_items = bibliography_section.find_next('ol').find_all('li')
                painting_info['bibliography'] = [item.get_text(strip=True) for item in bibliography_items]

            # print the collected information
            # print(json.dumps(painting_info, indent=4, ensure_ascii=False))
            data.append(painting_info)

# save the results to a file
with open('dali.json', 'w', encoding='utf-8') as f:
    json.dump(data, f, indent=4, ensure_ascii=False)
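
The scraper leaves everything it collects in a single dali.json file, one object per painting. A short sketch for loading the file back and listing what was scraped; besides 'title', 'image_url', 'name_of_artist', and the provenance/exhibitions/bibliography lists, the remaining keys come from each page's <dt>/<dd> pairs:

import json

with open("dali.json", encoding="utf-8") as f:
    paintings = json.load(f)

print(f"{len(paintings)} paintings scraped")
for painting in paintings:
    # 'title' may be missing if neither title element was found on the page
    print(painting.get("title", "<untitled>"), "->", painting.get("image_url", "no image"))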

10 docker-compose.yml Executable file

@@ -0,0 +1,10 @@
version: '3'
services:
  main:
    build:
      context: ./
      dockerfile: ./Dockerfile
    command: tail -f /dev/null
    volumes:
      - ./:/app

2 requirements.txt Executable file

@@ -0,0 +1,2 @@
requests
bs4