89 lines
2.8 KiB
Python
89 lines
2.8 KiB
Python
import requests
|
|
# from bs4 import BeautifulSoup
|
|
import re
|
|
from collections import namedtuple
|
|
import time
|
|
import dill
|
|
|
|
|
|
class ParishScraper(object):
|
|
"""Documentation for ParishScraper
|
|
|
|
"""
|
|
|
|
def __init__(self):
|
|
self.website_prefix = 'http://colaska.pl/index/parafia/id/'
|
|
|
|
def _scrap(self):
|
|
parishes = []
|
|
for page_nr in range(1, 11000):
|
|
page = requests.get(self.website_prefix + str(page_nr))
|
|
sleep_time = 2
|
|
while page.status_code == 500:
|
|
print('Status code 500 error')
|
|
sleep_time = sleep_time**2
|
|
print('Waiting ' + str(sleep_time) + ' sec')
|
|
time.sleep(sleep_time)
|
|
page = requests.get(self.website_prefix + str(page_nr))
|
|
if 'id' in page.url:
|
|
page_nr += 1
|
|
parish = self._retrieve_info(page)
|
|
print(parish)
|
|
print('\n')
|
|
parishes.append(parish)
|
|
return parishes
|
|
|
|
def _retrieve_info(self, page):
|
|
page.encoding = 'utf-8'
|
|
html_doc = page.text
|
|
meta_url = page.url
|
|
print(meta_url)
|
|
try:
|
|
search_result = re.search(
|
|
'pHead rel">[\w\W]*?<p class="title">(.*?)</p>[\w\W]*?class="city">(.*?)</span>[\w\W]*?<p>(.*?)<br />(.*?)</p>',
|
|
html_doc)
|
|
if search_result is None:
|
|
search_result = re.search(
|
|
'pHead rel">[\w\W]*?<p class="title">(.*?)</p>[\w\W]*?class="city">(.*?)</span>[\w\W]*?<p>(.*?)</p>',
|
|
html_doc)
|
|
street = ''
|
|
postal_code = search_result.group(3)
|
|
else:
|
|
street = search_result.group(3)
|
|
postal_code = search_result.group(4)
|
|
|
|
name = search_result.group(1)
|
|
city = search_result.group(2)
|
|
|
|
url_search = re.search('link mt10"><a href="(.*?)">', html_doc)
|
|
url = '' if url_search is None else url_search.group(1)
|
|
|
|
gps = re.search('id="tabsmaps" gps="(.*?)"><span',
|
|
html_doc).group(1)
|
|
Parish = namedtuple('Parish', [
|
|
'meta_url', 'url', 'name', 'city', 'street', 'postal_code',
|
|
'gps'
|
|
])
|
|
|
|
parish = Parish(meta_url, url, name, city, street, postal_code,
|
|
gps)
|
|
except AttributeError:
|
|
import ipdb
|
|
ipdb.set_trace()
|
|
return parish
|
|
|
|
def scrap_and_save(self):
|
|
parishes = self._scrap()
|
|
with open('parishes.dill', 'wb') as f:
|
|
dill.dump(parishes, f, dill.HIGHEST_PROTOCOL)
|
|
pass
|
|
|
|
|
|
def main():
|
|
parish_scraper = ParishScraper()
|
|
parish_scraper.scrap_and_save()
|
|
|
|
|
|
if __name__ == "__main__":
|
|
main()
|