Modify error logging in get_parishes_urls.py. Enhance crawl_deon.py.

Fix Makefile - append instead of overwriting.
Dawid Jurkiewicz 2018-04-06 23:33:18 +02:00
parent ccc4af3d51
commit f9c5690657
3 changed files with 17 additions and 5 deletions

Makefile

@@ -14,7 +14,7 @@ parishwebsites/spider-commands.txt: parishes-with-urls.tsv
 	cut -f3 $< | tail -n +2 | grep http | parishwebsites/generate_spider_commands.sh | sort -u > $@
 parishes-with-urls.tsv: apikey.txt parishes-deon.tsv scraper/get_parishes_urls.py
-	scraper/get_parishes_urls.py -a $< -p $(word 2,$^) > $@ 2> get-parishes-urls.log
+	scraper/get_parishes_urls.py -a $< -p $(word 2,$^) >> $@ 2> get-parishes-urls.log
 parishes-deon.tsv: scraper/crawl_deon.py
 	scraper/crawl_deon.py > $@ 2> crawl-deon.log
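The > to >> change makes each run of scraper/get_parishes_urls.py append to parishes-with-urls.tsv instead of truncating it, so rows fetched before an interrupted run survive a restart. A minimal sketch of the resume pattern this enables, assuming the script can count rows it already wrote (count_done and the hard-coded file name are illustrative, not the repository's code):

    import sys


    def count_done(path):
        """Rows already written to the output file (0 if it does not exist yet)."""
        try:
            with open(path, encoding='utf-8') as f:
                return sum(1 for _ in f)
        except FileNotFoundError:
            return 0


    # Skip inputs whose results were already appended by a previous run,
    # then emit only the remaining rows; `>> $@` keeps the earlier ones.
    already_done = count_done('parishes-with-urls.tsv')
    for line_nr, line in enumerate(sys.stdin):
        if line_nr < already_done:
            continue
        sys.stdout.write(line)  # placeholder for the real lookup and TSV row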

scraper/crawl_deon.py

@@ -1,4 +1,5 @@
 #!/usr/bin/env python3
+from time import sleep
 import requests
 from string import Template
 import re
@@ -21,7 +22,7 @@ def get_address(url):
 def process_page(url):
-    page = requests.get(url, timeout=10)
+    page = requests.get(url, timeout=30)
     soup = BeautifulSoup(page.text, 'html.parser')
     for td in soup.find_all('td', class_='temat'):
         href = td.a['href']
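A hedged side note on the timeout bump: in requests, a single number bounds the TCP connect and each socket read separately rather than the whole request, and a (connect, read) tuple is also accepted when the two limits should differ:

    import requests

    # timeout=30 caps the connect and each read at 30 s apiece; a tuple makes
    # the two limits explicit (the values here are illustrative).
    page = requests.get('https://www.deon.pl/parafie-koscioly/', timeout=(10, 30))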
@@ -38,7 +39,16 @@ def process_page(url):
             address, td_diocese.get_text(strip=True), td_decanate.get_text(
                 strip=True), td_province.get_text(strip=True)
         ]))
+def retry_download(url, sleep_time=0):
+    try:
+        process_page(url)
+    except Exception as e:
+        if sleep_time == 0:
+            sleep_time = 1.5
+        logging.info(e)
+        logging.info('Waiting {}s.\n'.format(sleep_time))
+        sleep(sleep_time)
+        retry_download(url, sleep_time * 1.5)
 def main():
     base_url = 'https://www.deon.pl/parafie-koscioly/'
@@ -47,10 +57,10 @@ def main():
     print('\t'.join([
         'Parafia', 'Miejscowość', 'Adres', 'Diecezja', 'Dekanat', 'Województwo'
     ]))
-    process_page(base_url)
+    retry_download(base_url)
     for i in range(2, 1014):  # TODO: add search for last page nr on deon
         url = base_url + suffix.substitute(page=str(i))
-        process_page(url)
+        retry_download(url)
         logging.info(i)
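retry_download retries on any exception and multiplies the wait by 1.5 each time, but it recurses with no cap on attempts, so a permanently broken page would retry (and deepen the call stack) indefinitely. A hedged, iterative alternative with a bounded number of attempts could look like the sketch below; fetch_with_backoff, max_tries, first_sleep and factor are illustrative names, not part of this commit:

    import logging
    from time import sleep


    def fetch_with_backoff(fetch, url, max_tries=8, first_sleep=1.5, factor=1.5):
        """Call fetch(url), sleeping longer after each failure; give up after max_tries."""
        sleep_time = first_sleep
        for attempt in range(1, max_tries + 1):
            try:
                return fetch(url)
            except Exception as e:
                logging.info('Attempt %d on %s failed: %s', attempt, url, e)
                if attempt == max_tries:
                    raise  # surface the last error instead of retrying forever
                logging.info('Waiting %.1fs before retrying.', sleep_time)
                sleep(sleep_time)
                sleep_time *= factor

Called as fetch_with_backoff(process_page, url), it would slot into main() with the call sites otherwise unchanged.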

scraper/get_parishes_urls.py

@@ -1,4 +1,5 @@
 #!/usr/bin/env python3
+import traceback
 import sys
 from googleplaces import GooglePlaces, lang, GooglePlacesError, Place
 # import jsonlines
@@ -137,6 +138,7 @@ def main():
             NullPlace = namedtuple('NullPlace', ['website', 'place_id'])
             parish = NullPlace('', '')
         except Exception as e:
+            traceback.print_stack()
             logging.info('Probably limit exceeded. Exiting.\nException: {}'.format(e))
             # write_last_line_to_file(outputfile_path, line_nr)
             return
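Worth noting: traceback.print_stack() writes the call stack of the current frame to stderr, not the traceback of the exception that was just caught. If the goal is to log where the failure was raised, logging.exception (or traceback.format_exc()) captures that. A minimal sketch, with risky_lookup as a hypothetical stand-in for the Google Places query:

    import logging
    import traceback

    logging.basicConfig(level=logging.INFO)


    def risky_lookup():
        # Hypothetical stand-in for the Google Places request that hits the quota.
        raise RuntimeError('query limit exceeded')


    try:
        risky_lookup()
    except Exception as e:
        # Logs the message at ERROR level together with the exception's traceback.
        logging.exception('Probably limit exceeded. Exiting.')
        # Or keep the existing logging.info call and attach the formatted traceback.
        logging.info('Exception: {}\n{}'.format(e, traceback.format_exc()))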