From f9c569065705af3b14f4396bdf2c949de23573f5 Mon Sep 17 00:00:00 2001
From: Dawid Jurkiewicz
Date: Fri, 6 Apr 2018 23:33:18 +0200
Subject: [PATCH] Modify error logging in get_parishes_url.

Enhance crawl_deon.py.
Fix Makefile - append instead of rewrite.
---
 Makefile                     |  2 +-
 scraper/crawl_deon.py        | 18 ++++++++++++++----
 scraper/get_parishes_urls.py |  2 ++
 3 files changed, 17 insertions(+), 5 deletions(-)

diff --git a/Makefile b/Makefile
index 4dab6b2..a9c229c 100644
--- a/Makefile
+++ b/Makefile
@@ -14,7 +14,7 @@ parishwebsites/spider-commands.txt: parishes-with-urls.tsv
 	cut -f3 $< | tail -n +2 | grep http | parishwebsites/generate_spider_commands.sh | sort -u > $@
 
 parishes-with-urls.tsv: apikey.txt parishes-deon.tsv scraper/get_parishes_urls.py
-	scraper/get_parishes_urls.py -a $< -p $(word 2,$^) > $@ 2> get-parishes-urls.log
+	scraper/get_parishes_urls.py -a $< -p $(word 2,$^) >> $@ 2> get-parishes-urls.log
 
 parishes-deon.tsv: scraper/crawl_deon.py
 	scraper/crawl_deon.py > $@ 2> crawl-deon.log
diff --git a/scraper/crawl_deon.py b/scraper/crawl_deon.py
index 1c84375..0c9c007 100755
--- a/scraper/crawl_deon.py
+++ b/scraper/crawl_deon.py
@@ -1,4 +1,5 @@
 #!/usr/bin/env python3
+from time import sleep
 import requests
 from string import Template
 import re
@@ -21,7 +22,7 @@ def get_address(url):
 
 
 def process_page(url):
-    page = requests.get(url, timeout=10)
+    page = requests.get(url, timeout=30)
     soup = BeautifulSoup(page.text, 'html.parser')
     for td in soup.find_all('td', class_='temat'):
         href = td.a['href']
@@ -38,7 +39,16 @@ def process_page(url):
            address, td_diocese.get_text(strip=True), td_decanate.get_text(
                strip=True), td_province.get_text(strip=True)
        ]))
-
+def retry_download(url, sleep_time = 0):
+    try:
+        process_page(url)
+    except Exception as e:
+        if sleep_time == 0:
+            sleep_time = 1.5
+        logging.info(e)
+        logging.info('Waiting {}s.\n'.format(sleep_time))
+        sleep(sleep_time)
+        retry_download(url, sleep_time * 1.5)
 
 def main():
     base_url = 'https://www.deon.pl/parafie-koscioly/'
@@ -47,10 +57,10 @@ def main():
     print('\t'.join([
         'Parafia', 'Miejscowość', 'Adres', 'Diecezja', 'Dekanat', 'Województwo'
     ]))
-    process_page(base_url)
+    retry_download(base_url)
     for i in range(2, 1014):  # TODO: add search for last page nr on deon
         url = base_url + suffix.substitute(page=str(i))
-        process_page(url)
+        retry_download(url)
         logging.info(i)
 
 
diff --git a/scraper/get_parishes_urls.py b/scraper/get_parishes_urls.py
index d85a444..a59f43a 100755
--- a/scraper/get_parishes_urls.py
+++ b/scraper/get_parishes_urls.py
@@ -1,4 +1,5 @@
 #!/usr/bin/env python3
+import traceback
 import sys
 from googleplaces import GooglePlaces, lang, GooglePlacesError, Place
 # import jsonlines
@@ -137,6 +138,7 @@ def main():
             NullPlace = namedtuple('NullPlace', ['website', 'place_id'])
             parish = NullPlace('', '')
         except Exception as e:
+            traceback.print_stack()
             logging.info('Probably limit exceeded. Exiting.\nException: {}'.format(e))
             # write_last_line_to_file(outputfile_path, line_nr)
             return
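
Note (not part of the patch): retry_download added in crawl_deon.py retries a failed page download recursively, multiplying the wait by 1.5 on each attempt and never giving up. Below is a rough standalone sketch of the same retry-with-backoff idea, written iteratively and with a retry cap for illustration; the name fetch_with_backoff, the max_retries cap, and the raise_for_status() check are assumptions and do not appear in the commit.

#!/usr/bin/env python3
# Standalone sketch: retry a download with multiplicative backoff, as
# retry_download does in the patch, but iteratively and with a cap on the
# number of attempts (the cap is an assumption, not in the patch).
import logging
from time import sleep

import requests

logging.basicConfig(level=logging.INFO)


def fetch_with_backoff(url, max_retries=5, sleep_time=1.5, factor=1.5):
    """Return the page text, waiting longer after each failed attempt."""
    for _ in range(max_retries):
        try:
            page = requests.get(url, timeout=30)
            page.raise_for_status()
            return page.text
        except Exception as e:
            logging.info(e)
            logging.info('Waiting {}s before retrying.'.format(sleep_time))
            sleep(sleep_time)
            sleep_time *= factor
    raise RuntimeError('Giving up on {} after {} attempts'.format(url, max_retries))


if __name__ == '__main__':
    # Example: fetch the first listing page and show the start of the HTML.
    print(fetch_with_backoff('https://www.deon.pl/parafie-koscioly/')[:200])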