#!/usr/bin/env python3
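"""Scrape the parish directory at https://www.deon.pl/parafie-koscioly/.

Walks the paginated listing, follows each parish link to collect its address,
and prints one tab-separated row per parish to stdout; the header row is
printed in main().
"""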
import logging
import re
import unicodedata
from string import Template
from time import sleep

import requests
from bs4 import BeautifulSoup

logging.basicConfig(
    format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
def get_address(url):
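    """Fetch a parish detail page and return its address as a single string.

    The address is read from the first div.row that follows the element with
    class 'adres adres2'; its text fragments are joined with '|'.
    """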
    page = requests.get(url, timeout=10)
    soup = BeautifulSoup(page.text, 'html.parser')
    address = soup.find(class_='adres adres2').find_next('div', class_='row')
    return '|'.join(list(address.stripped_strings))

    # description = soup.find(class_='tytul5 clear').find_next(class_='row')
    # match = re.search('<b>www:</b> (.*?)<br', str(soup))
def process_page(url):
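    """Scrape one listing page and print a tab-separated row per parish.

    Every td.temat cell links to a parish detail page; the cells that follow
    it hold the city, province, diocese and deanery. Columns are printed in
    the order of the header emitted by main().
    """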
    page = requests.get(url, timeout=30)
    soup = BeautifulSoup(page.text, 'html.parser')
    for td in soup.find_all('td', class_='temat'):
        href = td.a['href']
        parish_name = td.a.get_text(strip=True)
        # parish_name = ' '.join(
        #     unicodedata.normalize("NFKD", parish_name).split())
        td_city = td.find_next('td')
        td_province = td_city.find_next('td')
        td_diocese = td_province.find_next('td')
        td_decanate = td_diocese.find_next('td')
        address = get_address(href)
        print('\t'.join([
            parish_name, td_city.get_text(strip=True), address,
            td_diocese.get_text(strip=True), td_decanate.get_text(strip=True),
            td_province.get_text(strip=True)
        ]))
def retry_download(url, sleep_time=0):
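    """Run process_page(url), retrying with exponential backoff on failure.

    The wait starts at 1.5 s and grows by a factor of 1.5 on each retry.
    Retries are recursive, so a permanently failing page will eventually hit
    the recursion limit.
    """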
    try:
        process_page(url)
    except Exception as e:
        if sleep_time == 0:
            sleep_time = 1.5
        logging.info(e)
        logging.info('Waiting {}s.\n'.format(sleep_time))
        sleep(sleep_time)
        retry_download(url, sleep_time * 1.5)
def main():
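    """Print the TSV header, then scrape every listing page in order."""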
    base_url = 'https://www.deon.pl/parafie-koscioly/'
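    # Listing pages 2..N are at <base_url>strona,<n>.html
    # ('strona' is Polish for 'page').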
    suffix = Template('strona,${page}.html')
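    # Polish header columns: parish, city, address, diocese, deanery, province.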
    print('\t'.join([
        'Parafia', 'Miejscowość', 'Adres', 'Diecezja', 'Dekanat', 'Województwo'
    ]))
    retry_download(base_url)
    for i in range(2, 1014):  # TODO: detect the last listing page on deon.pl
        url = base_url + suffix.substitute(page=str(i))
        retry_download(url)
        logging.info(i)
if __name__ == '__main__':
    main()