import requests # from bs4 import BeautifulSoup import re from collections import namedtuple import pickle import time class ParishScraper(object): """Documentation for ParishScraper """ def __init__(self): self.website_prefix = 'http://colaska.pl/index/parafia/id/' def _scrap(self): parishes = [] for page_nr in range(1, 11000): page = requests.get(self.website_prefix + str(page_nr)) sleep_time = 2 while page.status_code == 500: print('Status code 500 error') sleep_time = sleep_time**2 print('Waiting ' + str(sleep_time) + ' sec') time.sleep(sleep_time) page = requests.get(self.website_prefix + str(page_nr)) if 'id' in page.url: page_nr += 1 parish = self._retrieve_info(page) print(parish) print('\n') parishes.append(parish) return parishes def _retrieve_info(self, page): page.encoding = 'utf-8' html_doc = page.text meta_url = page.url print(meta_url) try: search_result = re.search( 'pHead rel">[\w\W]*?
(.*?)
[\w\W]*?class="city">(.*?)[\w\W]*?(.*?)
(.*?)
(.*?)
[\w\W]*?class="city">(.*?)[\w\W]*?(.*?)
', html_doc) street = '' postal_code = search_result.group(3) else: street = search_result.group(3) postal_code = search_result.group(4) name = search_result.group(1) city = search_result.group(2) url_search = re.search('link mt10">', html_doc) url = '' if url_search is None else url_search.group(1) gps = re.search('id="tabsmaps" gps="(.*?)">