comments
This commit is contained in:
parent
748407aeeb
commit
2b952e7108
@ -45,6 +45,8 @@ def scroll_to_end():
|
||||
break
|
||||
time.sleep(4)
|
||||
last_height = new_height
|
||||
|
||||
#funkcja pomocnicza do formatowania outputu
|
||||
|
||||
def set_value(dict_from, key_from, dict_to, key_to):
|
||||
try:
|
||||
@ -70,6 +72,9 @@ signal.signal(signal.SIGINT, signal_handler)
|
||||
site_url = "https://recherche.sik-isea.ch/en/everything/in/catalogues.hodler/work/tiles"
|
||||
|
||||
output = Path("Hodler/hodler.json")
|
||||
|
||||
#załadowanie .jsona jeśli taki istnieje, stworzenie pustego w innym przypadku
|
||||
|
||||
if output.exists():
|
||||
with open(output, 'r', encoding='utf-8') as file:
|
||||
paintings = json.load(file)
|
||||
@ -80,15 +85,11 @@ else:
|
||||
|
||||
#wejście na stronę i załadowanie wszystkich linków do tablicy:
|
||||
|
||||
driver.get(site_url)
|
||||
|
||||
loading_time = wait_for_element(driver, "//*[@id='App']/div[1]/div/a[2]")
|
||||
|
||||
scroll_to_end()
|
||||
|
||||
elements = driver.find_elements(By.XPATH, "//*[@id='App']/div[1]/div/a")
|
||||
|
||||
links = []
|
||||
driver.get(site_url)
|
||||
loading_time = wait_for_element(driver, "//*[@id='App']/div[1]/div/a[2]")
|
||||
scroll_to_end()
|
||||
elements = driver.find_elements(By.XPATH, "//*[@id='App']/div[1]/div/a")
|
||||
|
||||
for item in elements:
|
||||
links.append(item.get_attribute('href'))
|
||||
@ -100,6 +101,8 @@ skipped_counter = 0
|
||||
|
||||
for painting_id in range (starting_id, len(links)+1):
|
||||
|
||||
#sprawdzenie czy obraz jest już zapisany
|
||||
|
||||
if any(painting['id'] == painting_id for painting in paintings):
|
||||
skipped_counter = skipped_counter+1
|
||||
continue
|
||||
@ -108,12 +111,12 @@ for painting_id in range (starting_id, len(links)+1):
|
||||
print(f'skipped {skipped_counter} already saved images')
|
||||
skipped_counter = 0
|
||||
|
||||
#otwarcie podstrony ze szczegółami
|
||||
#otwarcie podstrony ze szczegółami
|
||||
|
||||
driver.get(links[painting_id-1])
|
||||
load= wait_for_element(driver, "//*[@id=\"body\"]/div[2]/div[2]/div[2]/div[1]/div/div[1]/div/div/div[2]/div[1]/div/div/div/div/ul/li/div/div/a/img")
|
||||
|
||||
#wydostanie wszystkich elementów zawierających właściwe informacje i umieszczenie ich w słowniku tymczasowym:
|
||||
#wydostanie wszystkich elementów zawierających właściwe informacje i umieszczenie ich w słowniku tymczasowym:
|
||||
|
||||
item = driver.find_element(By.CLASS_NAME,"Detail-sections")
|
||||
|
||||
@ -132,17 +135,12 @@ for painting_id in range (starting_id, len(links)+1):
|
||||
data = []
|
||||
elif child.tag_name == 'p':
|
||||
data.append(child.text)
|
||||
# elif child.tag_name == 'img':
|
||||
# try:
|
||||
# tmp_painting.setdefault('imgurl', child.get_attribute('src'))
|
||||
# except:
|
||||
# pass
|
||||
else:
|
||||
continue
|
||||
tmp_painting.setdefault(key, data)
|
||||
|
||||
|
||||
#zapisanie obrazu
|
||||
#zapisanie obrazu
|
||||
|
||||
try:
|
||||
|
||||
@ -185,11 +183,18 @@ for painting_id in range (starting_id, len(links)+1):
|
||||
painting_info.setdefault('exhibitions', tmp_painting.get('Exhibitions', []))
|
||||
painting_info.setdefault('bibliography', tmp_painting.get('Publications', []))
|
||||
|
||||
# Echo the freshly scraped record and add it to the in-memory collection.
print(json.dumps(painting_info, indent=4, ensure_ascii=False))
paintings.append(painting_info)

# Checkpoint: write everything collected so far to disk every 50 pages.
if painting_id % 50 == 0:
    # Write through `output` (the same Path the data is loaded from at
    # start-up) and with the same utf-8 encoding, so the load and save paths
    # cannot drift apart.
    with open(output, 'w', encoding='utf-8') as file:
        json.dump(paintings, file)
    print("saved all new data")
|
||||
|
||||
painting_id = painting_id+1
|
||||
|
||||
|
||||
|
||||
driver.quit()
|
@ -46,6 +46,8 @@ def scroll_to_end():
|
||||
time.sleep(2)
|
||||
last_height = new_height
|
||||
|
||||
#funkcja pomocnicza do formatowania outputu
|
||||
|
||||
def set_value(dict_from, key_from, dict_to, key_to):
|
||||
try:
|
||||
dict_to.setdefault(key_to, dict_from.get(key_from)[0])
|
||||
@ -69,6 +71,8 @@ signal.signal(signal.SIGINT, signal_handler)
|
||||
|
||||
site_url = "https://recherche.sik-isea.ch/en/everything/in/catalogues.manuel/work/tiles"
|
||||
|
||||
#załadowanie .jsona jeśli taki istnieje, stworzenie pustego w innym przypadku
|
||||
|
||||
output = Path("Manuel/manuel.json")
|
||||
if output.exists():
|
||||
with open(output, 'r', encoding='utf-8') as file:
|
||||
@ -80,15 +84,11 @@ else:
|
||||
|
||||
#wejście na stronę i załadowanie wszystkich linków do tablicy:
|
||||
|
||||
driver.get(site_url)
|
||||
|
||||
loading_time = wait_for_element(driver, "//*[@id='App']/div[1]/div/a[2]")
|
||||
|
||||
scroll_to_end()
|
||||
|
||||
elements = driver.find_elements(By.XPATH, "//*[@id='App']/div[1]/div/a")
|
||||
|
||||
links = []
|
||||
driver.get(site_url)
|
||||
loading_time = wait_for_element(driver, "//*[@id='App']/div[1]/div/a[2]")
|
||||
scroll_to_end()
|
||||
elements = driver.find_elements(By.XPATH, "//*[@id='App']/div[1]/div/a")
|
||||
|
||||
for item in elements:
|
||||
links.append(item.get_attribute('href'))
|
||||
@ -100,6 +100,8 @@ skipped_counter = 0
|
||||
|
||||
for painting_id in range (starting_id, len(links)+1):
|
||||
|
||||
#sprawdzenie czy obraz jest już zapisany
|
||||
|
||||
if any(painting['id'] == painting_id for painting in paintings):
|
||||
skipped_counter = skipped_counter+1
|
||||
continue
|
||||
@ -108,12 +110,12 @@ for painting_id in range (starting_id, len(links)+1):
|
||||
print(f'skipped {skipped_counter} already saved images')
|
||||
skipped_counter = 0
|
||||
|
||||
#otwarcie podstrony ze szczegółami
|
||||
#otwarcie podstrony ze szczegółami
|
||||
|
||||
driver.get(links[painting_id-1])
|
||||
load= wait_for_element(driver, "//*[@id=\"body\"]/div[2]/div[2]/div[2]/div[1]/div/div[1]/div/div/div[2]/div[1]/div/div/div/div/ul/li/div/div/a/img")
|
||||
|
||||
#wydostanie wszystkich elementów zawierających właściwe informacje i umieszczenie ich w słowniku tymczasowym:
|
||||
#wydostanie wszystkich elementów zawierających właściwe informacje i umieszczenie ich w słowniku tymczasowym:
|
||||
|
||||
item = driver.find_element(By.CLASS_NAME,"Detail-sections")
|
||||
|
||||
@ -132,17 +134,12 @@ for painting_id in range (starting_id, len(links)+1):
|
||||
data = []
|
||||
elif child.tag_name == 'p':
|
||||
data.append(child.text)
|
||||
# elif child.tag_name == 'img':
|
||||
# try:
|
||||
# tmp_painting.setdefault('imgurl', child.get_attribute('src'))
|
||||
# except:
|
||||
# pass
|
||||
else:
|
||||
continue
|
||||
tmp_painting.setdefault(key, data)
|
||||
|
||||
|
||||
#zapisanie obrazu
|
||||
#zapisanie obrazu
|
||||
|
||||
try:
|
||||
|
||||
@ -189,6 +186,13 @@ for painting_id in range (starting_id, len(links)+1):
|
||||
print(json.dumps(painting_info, indent=4, ensure_ascii=False))
|
||||
paintings.append(painting_info)
|
||||
|
||||
# Checkpoint: write everything collected so far to disk every 50 pages.
# The data must go to this script's own `output` file (Manuel/manuel.json).
# The previously hard-coded 'Hodler/hodler.json' overwrote the Hodler
# scraper's results and left the Manuel file, which is reloaded on the next
# run, without the new records.
if painting_id % 50 == 0:
    # utf-8 matches the encoding used when `output` is read at start-up.
    with open(output, 'w', encoding='utf-8') as file:
        json.dump(paintings, file)
    print("saved all new data")
|
||||
|
||||
painting_id = painting_id+1
|
||||
|
||||
|
||||
|
3
requirements.txt
Normal file
3
requirements.txt
Normal file
@ -0,0 +1,3 @@
|
||||
requests
|
||||
selenium
|
||||
webdriver_manager
|
Loading…
Reference in New Issue
Block a user