From f9c569065705af3b14f4396bdf2c949de23573f5 Mon Sep 17 00:00:00 2001
From: Dawid Jurkiewicz
Date: Fri, 6 Apr 2018 23:33:18 +0200
Subject: [PATCH] Modify error logging in get_parishes_url.

Enhance crawl_deon.py.
Fix Makefile - append instead of rewrite.
---
 Makefile                     |  2 +-
 scraper/crawl_deon.py        | 18 ++++++++++++++----
 scraper/get_parishes_urls.py |  2 ++
 3 files changed, 17 insertions(+), 5 deletions(-)

diff --git a/Makefile b/Makefile
index 4dab6b2..a9c229c 100644
--- a/Makefile
+++ b/Makefile
@@ -14,7 +14,7 @@ parishwebsites/spider-commands.txt: parishes-with-urls.tsv
 	cut -f3 $< | tail -n +2 | grep http | parishwebsites/generate_spider_commands.sh | sort -u > $@
 
 parishes-with-urls.tsv: apikey.txt parishes-deon.tsv scraper/get_parishes_urls.py
-	scraper/get_parishes_urls.py -a $< -p $(word 2,$^) > $@ 2> get-parishes-urls.log
+	scraper/get_parishes_urls.py -a $< -p $(word 2,$^) >> $@ 2> get-parishes-urls.log
 
 parishes-deon.tsv: scraper/crawl_deon.py
 	scraper/crawl_deon.py > $@ 2> crawl-deon.log
diff --git a/scraper/crawl_deon.py b/scraper/crawl_deon.py
index 1c84375..0c9c007 100755
--- a/scraper/crawl_deon.py
+++ b/scraper/crawl_deon.py
@@ -1,4 +1,5 @@
 #!/usr/bin/env python3
+from time import sleep
 import requests
 from string import Template
 import re
@@ -21,7 +22,7 @@ def get_address(url):
 
 
 def process_page(url):
-    page = requests.get(url, timeout=10)
+    page = requests.get(url, timeout=30)
     soup = BeautifulSoup(page.text, 'html.parser')
     for td in soup.find_all('td', class_='temat'):
         href = td.a['href']
@@ -38,7 +39,16 @@ def process_page(url):
            address, td_diocese.get_text(strip=True), td_decanate.get_text(
                strip=True), td_province.get_text(strip=True)
        ]))
-
+def retry_download(url, sleep_time = 0):
+    try:
+        process_page(url)
+    except Exception as e:
+        if sleep_time == 0:
+            sleep_time = 1.5
+        logging.info(e)
+        logging.info('Waiting {}s.\n'.format(sleep_time))
+        sleep(sleep_time)
+        retry_download(url, sleep_time * 1.5)
 
 def main():
     base_url = 'https://www.deon.pl/parafie-koscioly/'
@@ -47,10 +57,10 @@ def main():
     print('\t'.join([
         'Parafia', 'Miejscowość', 'Adres', 'Diecezja', 'Dekanat', 'Województwo'
     ]))
-    process_page(base_url)
+    retry_download(base_url)
     for i in range(2, 1014):  # TODO: add search for last page nr on deon
         url = base_url + suffix.substitute(page=str(i))
-        process_page(url)
+        retry_download(url)
         logging.info(i)
 
 
diff --git a/scraper/get_parishes_urls.py b/scraper/get_parishes_urls.py
index d85a444..a59f43a 100755
--- a/scraper/get_parishes_urls.py
+++ b/scraper/get_parishes_urls.py
@@ -1,4 +1,5 @@
 #!/usr/bin/env python3
+import traceback
 import sys
 from googleplaces import GooglePlaces, lang, GooglePlacesError, Place
 # import jsonlines
@@ -137,6 +138,7 @@ def main():
             NullPlace = namedtuple('NullPlace', ['website', 'place_id'])
             parish = NullPlace('', '')
         except Exception as e:
+            traceback.print_stack()
             logging.info('Probably limit exceeded. Exiting.\nException: {}'.format(e))
             # write_last_line_to_file(outputfile_path, line_nr)
             return
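
Note (not part of the patch): retry_download added in crawl_deon.py retries a failed page download recursively, multiplying the wait by 1.5 on each attempt and never giving up. Below is a rough standalone sketch of the same retry-with-backoff idea, written iteratively and with a retry cap for illustration; the name fetch_with_backoff, the max_retries cap, and the raise_for_status() check are assumptions and do not appear in the commit.

#!/usr/bin/env python3
# Standalone sketch: retry a download with multiplicative backoff, as
# retry_download does in the patch, but iteratively and with a cap on the
# number of attempts (the cap is an assumption, not in the patch).
import logging
from time import sleep

import requests

logging.basicConfig(level=logging.INFO)


def fetch_with_backoff(url, max_retries=5, sleep_time=1.5, factor=1.5):
    """Return the page text, waiting longer after each failed attempt."""
    for _ in range(max_retries):
        try:
            page = requests.get(url, timeout=30)
            page.raise_for_status()
            return page.text
        except Exception as e:
            logging.info(e)
            logging.info('Waiting {}s before retrying.'.format(sleep_time))
            sleep(sleep_time)
            sleep_time *= factor
    raise RuntimeError('Giving up on {} after {} attempts'.format(url, max_retries))


if __name__ == '__main__':
    # Example: fetch the first listing page and show the start of the HTML.
    print(fetch_with_backoff('https://www.deon.pl/parafie-koscioly/')[:200])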