add downloaded parishes
This commit is contained in:
parent
35db6760f7
commit
35d3b11ec6
0
2017-04-20
Normal file
0
2017-04-20
Normal file
0
23:38:56.370827045
Normal file
0
23:38:56.370827045
Normal file
@ -17,14 +17,7 @@ class ParishScraper(object):
|
|||||||
def _scrap(self):
|
def _scrap(self):
|
||||||
parishes = []
|
parishes = []
|
||||||
for page_nr in range(1, 11000):
|
for page_nr in range(1, 11000):
|
||||||
page = requests.get(self.website_prefix + str(page_nr))
|
page = self._get_page_stubbornly(page_nr)
|
||||||
sleep_time = 2
|
|
||||||
while page.status_code == 500:
|
|
||||||
print('Status code 500 error')
|
|
||||||
sleep_time = sleep_time**2
|
|
||||||
print('Waiting ' + str(sleep_time) + ' sec')
|
|
||||||
time.sleep(sleep_time)
|
|
||||||
page = requests.get(self.website_prefix + str(page_nr))
|
|
||||||
if 'id' in page.url:
|
if 'id' in page.url:
|
||||||
page_nr += 1
|
page_nr += 1
|
||||||
parish = self._retrieve_info(page)
|
parish = self._retrieve_info(page)
|
||||||
@ -33,6 +26,22 @@ class ParishScraper(object):
|
|||||||
parishes.append(parish)
|
parishes.append(parish)
|
||||||
return parishes
|
return parishes
|
||||||
|
|
||||||
|
def _get_page_stubbornly(self, page_nr):
    """Fetch the page numbered *page_nr*, retrying until it succeeds.

    The URL is ``self.website_prefix + str(page_nr)``. A request that
    fails at the transport level (connection error, timeout, ...) or
    that comes back with HTTP status 500 is retried after a pause. The
    pause doubles on every failed attempt (2, 4, 8, ... seconds) and is
    capped at 60 seconds. There is no limit on the number of attempts,
    so this call blocks for as long as the page keeps failing.

    :param page_nr: page number appended to ``self.website_prefix``.
    :return: the ``requests`` response of the first attempt that did
        not fail and did not report status 500.
    """
    url = self.website_prefix + str(page_nr)
    sleep_time = 1
    while True:
        try:
            page = requests.get(url, timeout=10)
        except requests.exceptions.RequestException as err:
            # Only network-level failures are retried. KeyboardInterrupt,
            # SystemExit and programming errors propagate to the caller
            # instead of being swallowed by the retry loop.
            print('Request failed: ' + repr(err))
        else:
            if page.status_code != 500:
                return page
            print('Status code 500 error')
        # Exponential backoff, never waiting longer than 60 seconds.
        sleep_time = min(sleep_time * 2, 60)
        print('Waiting ' + str(sleep_time) + ' sec')
        time.sleep(sleep_time)
|
||||||
|
|
||||||
def _retrieve_info(self, page):
|
def _retrieve_info(self, page):
|
||||||
page.encoding = 'utf-8'
|
page.encoding = 'utf-8'
|
||||||
html_doc = page.text
|
html_doc = page.text
|
||||||
|
BIN
parishes.dill
Normal file
BIN
parishes.dill
Normal file
Binary file not shown.
Loading…
Reference in New Issue
Block a user