add parish scraping script
commit 7aed0dda4f
parent 5dc436781b
AAAI96-155.pdf (new binary file; binary content not shown)
dev-requirements.in (new file, 7 lines)
@@ -0,0 +1,7 @@
pip-tools
jedi
rope
importmagic
autopep8
yapf
ipdb
dev-requirements.txt (new file, 28 lines)
@@ -0,0 +1,28 @@
#
# This file is autogenerated by pip-compile
# To update, run:
#
# pip-compile --output-file dev-requirements.txt dev-requirements.in
#
autopep8==1.3.1
click==6.7 # via pip-tools
decorator==4.0.11 # via ipython, traitlets
first==2.0.1 # via pip-tools
importmagic==0.1.7
ipdb==0.10.2
ipython-genutils==0.2.0 # via traitlets
ipython==5.3.0 # via ipdb
jedi==0.10.2
pexpect==4.2.1 # via ipython
pickleshare==0.7.4 # via ipython
pip-tools==1.9.0
prompt-toolkit==1.0.14 # via ipython
ptyprocess==0.5.1 # via pexpect
pycodestyle==2.3.1 # via autopep8
pygments==2.2.0 # via ipython
rope==0.10.5
simplegeneric==0.8.1 # via ipython
six==1.10.0 # via pip-tools, prompt-toolkit, traitlets
traitlets==4.3.2 # via ipython
wcwidth==0.1.7 # via prompt-toolkit
yapf==0.16.1
parish-scrapper.py (new file, 88 lines)
@@ -0,0 +1,88 @@
import re
import time
from collections import namedtuple

import dill
import requests
# from bs4 import BeautifulSoup


class ParishScraper(object):
    """Scrapes parish names, addresses and GPS coordinates from colaska.pl."""

    def __init__(self):
        self.website_prefix = 'http://colaska.pl/index/parafia/id/'

    def _scrap(self):
        parishes = []
        for page_nr in range(1, 11000):
            page = requests.get(self.website_prefix + str(page_nr))
            # Retry while the server answers HTTP 500; the delay is squared
            # on every attempt (4 s, 16 s, 256 s, ...).
            sleep_time = 2
            while page.status_code == 500:
                print('Status code 500 error')
                sleep_time = sleep_time**2
                print('Waiting ' + str(sleep_time) + ' sec')
                time.sleep(sleep_time)
                page = requests.get(self.website_prefix + str(page_nr))
            # Non-existent ids redirect away from an /id/ URL, so only
            # responses whose final URL still contains 'id' are parsed.
            if 'id' in page.url:
                parish = self._retrieve_info(page)
                print(parish)
                print('\n')
                parishes.append(parish)
        return parishes

    def _retrieve_info(self, page):
        page.encoding = 'utf-8'
        html_doc = page.text
        meta_url = page.url
        print(meta_url)
        try:
            # Pages with a street address carry name, city, street and
            # postal code; pages without a street omit the <br /> part.
            search_result = re.search(
                r'pHead rel">[\w\W]*?<p class="title">(.*?)</p>[\w\W]*?class="city">(.*?)</span>[\w\W]*?<p>(.*?)<br />(.*?)</p>',
                html_doc)
            if search_result is None:
                search_result = re.search(
                    r'pHead rel">[\w\W]*?<p class="title">(.*?)</p>[\w\W]*?class="city">(.*?)</span>[\w\W]*?<p>(.*?)</p>',
                    html_doc)
                street = ''
                postal_code = search_result.group(3)
            else:
                street = search_result.group(3)
                postal_code = search_result.group(4)

            name = search_result.group(1)
            city = search_result.group(2)

            url_search = re.search(r'link mt10"><a href="(.*?)">', html_doc)
            url = '' if url_search is None else url_search.group(1)

            gps = re.search(r'id="tabsmaps" gps="(.*?)"><span',
                            html_doc).group(1)
            # Defined locally, this class cannot be resolved by name, which
            # is why dill rather than plain pickle serializes it below.
            Parish = namedtuple('Parish', [
                'meta_url', 'url', 'name', 'city', 'street', 'postal_code',
                'gps'
            ])

            parish = Parish(meta_url, url, name, city, street, postal_code,
                            gps)
        except AttributeError:
            # A regex returned None on an unexpected page layout; drop into
            # the debugger to inspect, then re-raise.
            import ipdb
            ipdb.set_trace()
            raise
        return parish

    def scrap_and_save(self):
        parishes = self._scrap()
        with open('parishes.dill', 'wb') as f:
            dill.dump(parishes, f, dill.HIGHEST_PROTOCOL)


def main():
    parish_scraper = ParishScraper()
    parish_scraper.scrap_and_save()


if __name__ == "__main__":
    main()
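
For reference, reading the scraper's output back is straightforward with dill. A minimal sketch, assuming the script has run to completion and parishes.dill sits in the working directory; the loop and the printed fields are illustrative, not part of the commit:

import dill

with open('parishes.dill', 'rb') as f:
    parishes = dill.load(f)

# Each entry is a Parish namedtuple, so fields read as attributes.
for parish in parishes[:5]:
    print(parish.name, parish.city, parish.gps)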
parish-scrapper.py~ (new file, 88 lines)
Editor backup of parish-scrapper.py, committed alongside the script. Its 88 lines duplicate the file above, differing only in the serializer: import pickle in place of import dill, an output file named parishes.pickle, and pickle.dump(parishes, f, pickle.HIGHEST_PROTOCOL) as the dump call.
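
The script extracts fields from raw HTML with regular expressions; the commented-out BeautifulSoup import hints at the more robust alternative. A minimal sketch of the same extraction with bs4, where the selectors p.title and span.city are inferred from the regexes above, not verified against the site, and retrieve_info_bs4 is a hypothetical helper:

from bs4 import BeautifulSoup

def retrieve_info_bs4(html_doc):
    # Parse once, then query by CSS selector instead of regex.
    soup = BeautifulSoup(html_doc, 'html.parser')
    name_tag = soup.select_one('p.title')
    city_tag = soup.select_one('span.city')
    name = name_tag.get_text(strip=True) if name_tag else ''
    city = city_tag.get_text(strip=True) if city_tag else ''
    return name, city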
requirements.in (new file, 2 lines)
@@ -0,0 +1,2 @@
requests
dill
requirements.txt (new file, 8 lines)
@@ -0,0 +1,8 @@
#
# This file is autogenerated by pip-compile
# To update, run:
#
# pip-compile --output-file requirements.txt requirements.in
#
dill==0.2.6
requests==2.13.0