Modify error logging in get_parishes_urls.py. Enhance crawl_deon.py
Fix Makefile - append instead of rewrite.
This commit is contained in:
parent
ccc4af3d51
commit
f9c5690657
2
Makefile
2
Makefile
@ -14,7 +14,7 @@ parishwebsites/spider-commands.txt: parishes-with-urls.tsv
|
|||||||
cut -f3 $< | tail -n +2 | grep http | parishwebsites/generate_spider_commands.sh | sort -u > $@
|
cut -f3 $< | tail -n +2 | grep http | parishwebsites/generate_spider_commands.sh | sort -u > $@
|
||||||
|
|
||||||
parishes-with-urls.tsv: apikey.txt parishes-deon.tsv scraper/get_parishes_urls.py
|
parishes-with-urls.tsv: apikey.txt parishes-deon.tsv scraper/get_parishes_urls.py
|
||||||
scraper/get_parishes_urls.py -a $< -p $(word 2,$^) > $@ 2> get-parishes-urls.log
|
scraper/get_parishes_urls.py -a $< -p $(word 2,$^) >> $@ 2> get-parishes-urls.log
|
||||||
|
|
||||||
parishes-deon.tsv: scraper/crawl_deon.py
|
parishes-deon.tsv: scraper/crawl_deon.py
|
||||||
scraper/crawl_deon.py > $@ 2> crawl-deon.log
|
scraper/crawl_deon.py > $@ 2> crawl-deon.log
|
||||||
|
@ -1,4 +1,5 @@
|
|||||||
#!/usr/bin/env python3
|
#!/usr/bin/env python3
|
||||||
|
from time import sleep
|
||||||
import requests
|
import requests
|
||||||
from string import Template
|
from string import Template
|
||||||
import re
|
import re
|
||||||
@ -21,7 +22,7 @@ def get_address(url):
|
|||||||
|
|
||||||
|
|
||||||
def process_page(url):
|
def process_page(url):
|
||||||
page = requests.get(url, timeout=10)
|
page = requests.get(url, timeout=30)
|
||||||
soup = BeautifulSoup(page.text, 'html.parser')
|
soup = BeautifulSoup(page.text, 'html.parser')
|
||||||
for td in soup.find_all('td', class_='temat'):
|
for td in soup.find_all('td', class_='temat'):
|
||||||
href = td.a['href']
|
href = td.a['href']
|
||||||
@ -38,7 +39,16 @@ def process_page(url):
|
|||||||
address, td_diocese.get_text(strip=True), td_decanate.get_text(
|
address, td_diocese.get_text(strip=True), td_decanate.get_text(
|
||||||
strip=True), td_province.get_text(strip=True)
|
strip=True), td_province.get_text(strip=True)
|
||||||
]))
|
]))
|
||||||
|
def retry_download(url, sleep_time = 0):
|
||||||
|
try:
|
||||||
|
process_page(url)
|
||||||
|
except Exception as e:
|
||||||
|
if sleep_time == 0:
|
||||||
|
sleep_time = 1.5
|
||||||
|
logging.info(e)
|
||||||
|
logging.info('Waiting {}s.\n'.format(sleep_time))
|
||||||
|
sleep(sleep_time)
|
||||||
|
retry_download(url, sleep_time * 1.5)
|
||||||
|
|
||||||
def main():
|
def main():
|
||||||
base_url = 'https://www.deon.pl/parafie-koscioly/'
|
base_url = 'https://www.deon.pl/parafie-koscioly/'
|
||||||
@ -47,10 +57,10 @@ def main():
|
|||||||
print('\t'.join([
|
print('\t'.join([
|
||||||
'Parafia', 'Miejscowość', 'Adres', 'Diecezja', 'Dekanat', 'Województwo'
|
'Parafia', 'Miejscowość', 'Adres', 'Diecezja', 'Dekanat', 'Województwo'
|
||||||
]))
|
]))
|
||||||
process_page(base_url)
|
retry_download(base_url)
|
||||||
for i in range(2, 1014): # TODO: add search for last page nr on deon
|
for i in range(2, 1014): # TODO: add search for last page nr on deon
|
||||||
url = base_url + suffix.substitute(page=str(i))
|
url = base_url + suffix.substitute(page=str(i))
|
||||||
process_page(url)
|
retry_download(url)
|
||||||
logging.info(i)
|
logging.info(i)
|
||||||
|
|
||||||
|
|
||||||
|
@ -1,4 +1,5 @@
|
|||||||
#!/usr/bin/env python3
|
#!/usr/bin/env python3
|
||||||
|
import traceback
|
||||||
import sys
|
import sys
|
||||||
from googleplaces import GooglePlaces, lang, GooglePlacesError, Place
|
from googleplaces import GooglePlaces, lang, GooglePlacesError, Place
|
||||||
# import jsonlines
|
# import jsonlines
|
||||||
@ -137,6 +138,7 @@ def main():
|
|||||||
NullPlace = namedtuple('NullPlace', ['website', 'place_id'])
|
NullPlace = namedtuple('NullPlace', ['website', 'place_id'])
|
||||||
parish = NullPlace('', '')
|
parish = NullPlace('', '')
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
|
traceback.print_stack()
|
||||||
logging.info('Probably limit exceeded. Exiting.\nException: {}'.format(e))
|
logging.info('Probably limit exceeded. Exiting.\nException: {}'.format(e))
|
||||||
# write_last_line_to_file(outputfile_path, line_nr)
|
# write_last_line_to_file(outputfile_path, line_nr)
|
||||||
return
|
return
|
||||||
|
Loading…
Reference in New Issue
Block a user