diff --git a/AAAI96-155.pdf b/AAAI96-155.pdf new file mode 100644 index 0000000..a67be6b Binary files /dev/null and b/AAAI96-155.pdf differ diff --git a/dev-requirements.in b/dev-requirements.in new file mode 100644 index 0000000..c68ccdc --- /dev/null +++ b/dev-requirements.in @@ -0,0 +1,7 @@ +pip-tools +jedi +rope +importmagic +autopep8 +yapf +ipdb diff --git a/dev-requirements.txt b/dev-requirements.txt new file mode 100644 index 0000000..6bf5035 --- /dev/null +++ b/dev-requirements.txt @@ -0,0 +1,28 @@ +# +# This file is autogenerated by pip-compile +# To update, run: +# +# pip-compile --output-file dev-requirements.txt dev-requirements.in +# +autopep8==1.3.1 +click==6.7 # via pip-tools +decorator==4.0.11 # via ipython, traitlets +first==2.0.1 # via pip-tools +importmagic==0.1.7 +ipdb==0.10.2 +ipython-genutils==0.2.0 # via traitlets +ipython==5.3.0 # via ipdb +jedi==0.10.2 +pexpect==4.2.1 # via ipython +pickleshare==0.7.4 # via ipython +pip-tools==1.9.0 +prompt-toolkit==1.0.14 # via ipython +ptyprocess==0.5.1 # via pexpect +pycodestyle==2.3.1 # via autopep8 +pygments==2.2.0 # via ipython +rope==0.10.5 +simplegeneric==0.8.1 # via ipython +six==1.10.0 # via pip-tools, prompt-toolkit, traitlets +traitlets==4.3.2 # via ipython +wcwidth==0.1.7 # via prompt-toolkit +yapf==0.16.1 diff --git a/parish-scrapper.py b/parish-scrapper.py new file mode 100644 index 0000000..35bd11e --- /dev/null +++ b/parish-scrapper.py @@ -0,0 +1,88 @@ +import requests +# from bs4 import BeautifulSoup +import re +from collections import namedtuple +import time +import dill + + +class ParishScraper(object): + """Documentation for ParishScraper + + """ + + def __init__(self): + self.website_prefix = 'http://colaska.pl/index/parafia/id/' + + def _scrap(self): + parishes = [] + for page_nr in range(1, 11000): + page = requests.get(self.website_prefix + str(page_nr)) + sleep_time = 2 + while page.status_code == 500: + print('Status code 500 error') + sleep_time = sleep_time**2 + print('Waiting ' + str(sleep_time) + ' sec') + time.sleep(sleep_time) + page = requests.get(self.website_prefix + str(page_nr)) + if 'id' in page.url: + page_nr += 1 + parish = self._retrieve_info(page) + print(parish) + print('\n') + parishes.append(parish) + return parishes + + def _retrieve_info(self, page): + page.encoding = 'utf-8' + html_doc = page.text + meta_url = page.url + print(meta_url) + try: + search_result = re.search( + 'pHead rel">[\w\W]*?

(.*?)

[\w\W]*?class="city">(.*?)[\w\W]*?

(.*?)
(.*?)

', + html_doc) + if search_result is None: + search_result = re.search( + 'pHead rel">[\w\W]*?

(.*?)

[\w\W]*?class="city">(.*?)[\w\W]*?

(.*?)

', + html_doc) + street = '' + postal_code = search_result.group(3) + else: + street = search_result.group(3) + postal_code = search_result.group(4) + + name = search_result.group(1) + city = search_result.group(2) + + url_search = re.search('link mt10">', html_doc) + url = '' if url_search is None else url_search.group(1) + + gps = re.search('id="tabsmaps" gps="(.*?)">[\w\W]*?

(.*?)

[\w\W]*?class="city">(.*?)[\w\W]*?

(.*?)
(.*?)

', + html_doc) + if search_result is None: + search_result = re.search( + 'pHead rel">[\w\W]*?

(.*?)

[\w\W]*?class="city">(.*?)[\w\W]*?

(.*?)

', + html_doc) + street = '' + postal_code = search_result.group(3) + else: + street = search_result.group(3) + postal_code = search_result.group(4) + + name = search_result.group(1) + city = search_result.group(2) + + url_search = re.search('link mt10">
', html_doc) + url = '' if url_search is None else url_search.group(1) + + gps = re.search('id="tabsmaps" gps="(.*?)">