Modify error logging in get_parishes_urls.py. Enhance crawl_deon.py

Fix Makefile: append to parishes-with-urls.tsv instead of overwriting it.
2018-04-06 23:33:18 +02:00 · 2018-04-06 23:33:18 +02:00 · f9c5690657
commit f9c5690657
parent ccc4af3d51
3 changed files with 17 additions and 5 deletions
--- a/Makefile
+++ b/Makefile
@@ -14,7 +14,7 @@ parishwebsites/spider-commands.txt: parishes-with-urls.tsv
 	cut -f3 $< | tail -n +2 | grep http | parishwebsites/generate_spider_commands.sh | sort -u > $@
 parishes-with-urls.tsv: apikey.txt parishes-deon.tsv scraper/get_parishes_urls.py
-	scraper/get_parishes_urls.py -a $< -p $(word 2,$^) > $@ 2> get-parishes-urls.log
+	scraper/get_parishes_urls.py -a $< -p $(word 2,$^) >> $@ 2> get-parishes-urls.log
 parishes-deon.tsv: scraper/crawl_deon.py
 	scraper/crawl_deon.py > $@ 2> crawl-deon.log
--- a/scraper/crawl_deon.py
+++ b/scraper/crawl_deon.py
@@ -1,4 +1,5 @@
 #!/usr/bin/env python3
+from time import sleep
 import requests
 from string import Template
 import re
@@ -21,7 +22,7 @@ def get_address(url):
 def process_page(url):
-    page = requests.get(url, timeout=10)
+    page = requests.get(url, timeout=30)
    soup = BeautifulSoup(page.text, 'html.parser')
    for td in soup.find_all('td', class_='temat'):
        href = td.a['href']
@@ -38,7 +39,16 @@ def process_page(url):
            address, td_diocese.get_text(strip=True), td_decanate.get_text(
                strip=True), td_province.get_text(strip=True)
        ]))
-
+def retry_download(url, sleep_time = 0):
+    try:
+        process_page(url)
+    except Exception as e:
+        if sleep_time == 0:
+            sleep_time = 1.5 
+        logging.info(e)
+        logging.info('Waiting {}s.\n'.format(sleep_time))
+        sleep(sleep_time)
+        retry_download(url, sleep_time * 1.5)
 def main():
    base_url = 'https://www.deon.pl/parafie-koscioly/'
@@ -47,10 +57,10 @@ def main():
    print('\t'.join([
        'Parafia', 'Miejscowość', 'Adres', 'Diecezja', 'Dekanat', 'Województwo'
    ]))
-    process_page(base_url)
+    retry_download(base_url)
    for i in range(2, 1014):  # TODO: add search for last page nr on deon
        url = base_url + suffix.substitute(page=str(i))
-        process_page(url)
+        retry_download(url)
        logging.info(i)
--- a/scraper/get_parishes_urls.py
+++ b/scraper/get_parishes_urls.py
@@ -1,4 +1,5 @@
 #!/usr/bin/env python3
+import traceback
 import sys
 from googleplaces import GooglePlaces, lang, GooglePlacesError, Place
 # import jsonlines
@@ -137,6 +138,7 @@ def main():
                NullPlace = namedtuple('NullPlace', ['website', 'place_id'])
                parish = NullPlace('', '')
        except Exception as e:
+            traceback.print_stack()
            logging.info('Probably limit exceeded. Exiting.\nException: {}'.format(e))
            # write_last_line_to_file(outputfile_path, line_nr)
            return