add parish scraping script

siulkilulki 2017-04-20 10:51:02 +02:00
parent 5dc436781b
commit 7aed0dda4f
7 changed files with 221 additions and 0 deletions

BIN
AAAI96-155.pdf Normal file

Binary file not shown.

7
dev-requirements.in Normal file

@@ -0,0 +1,7 @@
pip-tools
jedi
rope
importmagic
autopep8
yapf
ipdb

28
dev-requirements.txt Normal file

@@ -0,0 +1,28 @@
#
# This file is autogenerated by pip-compile
# To update, run:
#
# pip-compile --output-file dev-requirements.txt dev-requirements.in
#
autopep8==1.3.1
click==6.7 # via pip-tools
decorator==4.0.11 # via ipython, traitlets
first==2.0.1 # via pip-tools
importmagic==0.1.7
ipdb==0.10.2
ipython-genutils==0.2.0 # via traitlets
ipython==5.3.0 # via ipdb
jedi==0.10.2
pexpect==4.2.1 # via ipython
pickleshare==0.7.4 # via ipython
pip-tools==1.9.0
prompt-toolkit==1.0.14 # via ipython
ptyprocess==0.5.1 # via pexpect
pycodestyle==2.3.1 # via autopep8
pygments==2.2.0 # via ipython
rope==0.10.5
simplegeneric==0.8.1 # via ipython
six==1.10.0 # via pip-tools, prompt-toolkit, traitlets
traitlets==4.3.2 # via ipython
wcwidth==0.1.7 # via prompt-toolkit
yapf==0.16.1
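
Note: the pinned files above are meant to be regenerated rather than edited by hand. A minimal usage sketch of the pip-tools workflow referenced in the file headers (assuming pip-tools is installed in the active environment, and using the requirements.in/requirements.txt pair added further down in this commit):

pip-compile --output-file dev-requirements.txt dev-requirements.in
pip-compile --output-file requirements.txt requirements.in
pip-sync requirements.txt dev-requirements.txt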

88
parish-scrapper.py Normal file

@@ -0,0 +1,88 @@
import requests
# from bs4 import BeautifulSoup
import re
from collections import namedtuple
import time
import dill


class ParishScraper(object):
    """Scrapes basic parish data (name, address, website, GPS) from colaska.pl."""

    def __init__(self):
        self.website_prefix = 'http://colaska.pl/index/parafia/id/'

    def _scrap(self):
        """Walk over parish ids and return a list of Parish namedtuples."""
        parishes = []
        for page_nr in range(1, 11000):
            page = requests.get(self.website_prefix + str(page_nr))
            # Retry on HTTP 500; the wait is squared on every retry
            # (4, 16, 256, ... seconds), so it backs off quickly.
            sleep_time = 2
            while page.status_code == 500:
                print('Status code 500 error')
                sleep_time = sleep_time**2
                print('Waiting ' + str(sleep_time) + ' sec')
                time.sleep(sleep_time)
                page = requests.get(self.website_prefix + str(page_nr))
            # requests follows redirects, so only pages whose final URL still
            # contains 'id' (i.e. existing parish pages) are parsed.
            if 'id' in page.url:
                parish = self._retrieve_info(page)
                print(parish)
                print('\n')
                parishes.append(parish)
        return parishes

    def _retrieve_info(self, page):
        """Extract parish details from a single parish page."""
        page.encoding = 'utf-8'
        html_doc = page.text
        meta_url = page.url
        print(meta_url)
        try:
            # Address block with a street line: title, city, street, postal code.
            search_result = re.search(
                r'pHead rel">[\w\W]*?<p class="title">(.*?)</p>[\w\W]*?'
                r'class="city">(.*?)</span>[\w\W]*?<p>(.*?)<br />(.*?)</p>',
                html_doc)
            if search_result is None:
                # Fallback for pages without a street line.
                search_result = re.search(
                    r'pHead rel">[\w\W]*?<p class="title">(.*?)</p>[\w\W]*?'
                    r'class="city">(.*?)</span>[\w\W]*?<p>(.*?)</p>',
                    html_doc)
                street = ''
                postal_code = search_result.group(3)
            else:
                street = search_result.group(3)
                postal_code = search_result.group(4)
            name = search_result.group(1)
            city = search_result.group(2)
            url_search = re.search(r'link mt10"><a href="(.*?)">', html_doc)
            url = '' if url_search is None else url_search.group(1)
            gps = re.search(r'id="tabsmaps" gps="(.*?)"><span',
                            html_doc).group(1)
            Parish = namedtuple('Parish', [
                'meta_url', 'url', 'name', 'city', 'street', 'postal_code',
                'gps'
            ])
            parish = Parish(meta_url, url, name, city, street, postal_code,
                            gps)
        except AttributeError:
            # A pattern did not match; drop into the debugger to inspect the page.
            import ipdb
            ipdb.set_trace()
        return parish

    def scrap_and_save(self):
        """Scrape every parish and serialize the result to parishes.dill."""
        parishes = self._scrap()
        with open('parishes.dill', 'wb') as f:
            dill.dump(parishes, f, dill.HIGHEST_PROTOCOL)


def main():
    parish_scraper = ParishScraper()
    parish_scraper.scrap_and_save()


if __name__ == "__main__":
    main()
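
For reference, a minimal sketch of reading the dump back (it assumes the scraper has already run to completion and left parishes.dill in the current working directory):

import dill

# Load the list of Parish namedtuples written by scrap_and_save().
# dill serializes the locally defined Parish class by value, so the dump can
# be read back without importing anything from the scraper itself.
with open('parishes.dill', 'rb') as f:
    parishes = dill.load(f)

for parish in parishes[:5]:  # print a few entries as a sanity check
    print(parish.name, parish.city, parish.gps)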

88
parish-scrapper.py~ Normal file

@@ -0,0 +1,88 @@
import requests
# from bs4 import BeautifulSoup
import re
from collections import namedtuple
import pickle
import time


class ParishScraper(object):
    """Documentation for ParishScraper
    """

    def __init__(self):
        self.website_prefix = 'http://colaska.pl/index/parafia/id/'

    def _scrap(self):
        parishes = []
        for page_nr in range(1, 11000):
            page = requests.get(self.website_prefix + str(page_nr))
            sleep_time = 2
            while page.status_code == 500:
                print('Status code 500 error')
                sleep_time = sleep_time**2
                print('Waiting ' + str(sleep_time) + ' sec')
                time.sleep(sleep_time)
                page = requests.get(self.website_prefix + str(page_nr))
            if 'id' in page.url:
                page_nr += 1
                parish = self._retrieve_info(page)
                print(parish)
                print('\n')
                parishes.append(parish)
        return parishes

    def _retrieve_info(self, page):
        page.encoding = 'utf-8'
        html_doc = page.text
        meta_url = page.url
        print(meta_url)
        try:
            search_result = re.search(
                'pHead rel">[\w\W]*?<p class="title">(.*?)</p>[\w\W]*?class="city">(.*?)</span>[\w\W]*?<p>(.*?)<br />(.*?)</p>',
                html_doc)
            if search_result is None:
                search_result = re.search(
                    'pHead rel">[\w\W]*?<p class="title">(.*?)</p>[\w\W]*?class="city">(.*?)</span>[\w\W]*?<p>(.*?)</p>',
                    html_doc)
                street = ''
                postal_code = search_result.group(3)
            else:
                street = search_result.group(3)
                postal_code = search_result.group(4)
            name = search_result.group(1)
            city = search_result.group(2)
            url_search = re.search('link mt10"><a href="(.*?)">', html_doc)
            url = '' if url_search is None else url_search.group(1)
            gps = re.search('id="tabsmaps" gps="(.*?)"><span',
                            html_doc).group(1)
            Parish = namedtuple('Parish', [
                'meta_url', 'url', 'name', 'city', 'street', 'postal_code',
                'gps'
            ])
            parish = Parish(meta_url, url, name, city, street, postal_code,
                            gps)
        except AttributeError:
            import ipdb
            ipdb.set_trace()
        return parish

    def scrap_and_save(self):
        parishes = self._scrap()
        with open('parishes.pickle', 'wb') as f:
            pickle.dump(parishes, f, pickle.HIGHEST_PROTOCOL)
        pass


def main():
    parish_scraper = ParishScraper()
    parish_scraper.scrap_and_save()


if __name__ == "__main__":
    main()

2
requirements.in Normal file

@@ -0,0 +1,2 @@
requests
dill

8
requirements.txt Normal file

@@ -0,0 +1,8 @@
#
# This file is autogenerated by pip-compile
# To update, run:
#
# pip-compile --output-file requirements.txt requirements.in
#
dill==0.2.6
requests==2.13.0