add downloaded parishes

This commit is contained in:
siulkilulki 2017-04-21 00:29:17 +02:00
parent 35db6760f7
commit 35d3b11ec6
4 changed files with 17 additions and 8 deletions

0
2017-04-20 Normal file
View File

0
23:38:56.370827045 Normal file
View File

View File

@ -17,14 +17,7 @@ class ParishScraper(object):
def _scrap(self): def _scrap(self):
parishes = [] parishes = []
for page_nr in range(1, 11000): for page_nr in range(1, 11000):
page = requests.get(self.website_prefix + str(page_nr)) page = self._get_page_stubbornly(page_nr)
sleep_time = 2
while page.status_code == 500:
print('Status code 500 error')
sleep_time = sleep_time**2
print('Waiting ' + str(sleep_time) + ' sec')
time.sleep(sleep_time)
page = requests.get(self.website_prefix + str(page_nr))
if 'id' in page.url: if 'id' in page.url:
page_nr += 1 page_nr += 1
parish = self._retrieve_info(page) parish = self._retrieve_info(page)
@ -33,6 +26,22 @@ class ParishScraper(object):
parishes.append(parish) parishes.append(parish)
return parishes return parishes
def _get_page_stubbornly(self, page_nr):
    """Fetch page ``page_nr`` and keep retrying until it is served.

    The URL is ``self.website_prefix`` followed by ``page_nr``. A request
    that fails at the transport level (including the 10 second timeout)
    or that answers with HTTP 500 is retried after a pause. The pause
    doubles on every failed attempt and is capped at 60 seconds.

    Only ``requests`` errors are retried, so ``KeyboardInterrupt`` and
    ``SystemExit`` still propagate and the loop can be aborted.

    :param page_nr: page number appended to ``self.website_prefix``.
    :return: the first response whose status code is not 500.
    """
    url = self.website_prefix + str(page_nr)
    sleep_time = 1
    while True:
        try:
            page = requests.get(url, timeout=10)
        except requests.exceptions.RequestException as error:
            # Transport-level failure (timeout, refused connection, ...):
            # report it and fall through to the back-off below.
            print('Request failed: ' + str(error))
        else:
            if page.status_code != 500:
                return page
            print('Status code 500 error')
        # Exponential back-off that never exceeds 60 seconds.
        sleep_time = min(sleep_time * 2, 60)
        print('Waiting ' + str(sleep_time) + ' sec')
        time.sleep(sleep_time)
def _retrieve_info(self, page): def _retrieve_info(self, page):
page.encoding = 'utf-8' page.encoding = 'utf-8'
html_doc = page.text html_doc = page.text

BIN
parishes.dill Normal file

Binary file not shown.