add parish scraping script
commit 7aed0dda4f
parent 5dc436781b
AAAI96-155.pdf (new binary file)
Binary file not shown.
dev-requirements.in (new file, 7 lines)
@@ -0,0 +1,7 @@
pip-tools
jedi
rope
importmagic
autopep8
yapf
ipdb
dev-requirements.txt (new file, 28 lines)
@@ -0,0 +1,28 @@
#
# This file is autogenerated by pip-compile
# To update, run:
#
# pip-compile --output-file dev-requirements.txt dev-requirements.in
#
autopep8==1.3.1
click==6.7                # via pip-tools
decorator==4.0.11         # via ipython, traitlets
first==2.0.1              # via pip-tools
importmagic==0.1.7
ipdb==0.10.2
ipython-genutils==0.2.0   # via traitlets
ipython==5.3.0            # via ipdb
jedi==0.10.2
pexpect==4.2.1            # via ipython
pickleshare==0.7.4        # via ipython
pip-tools==1.9.0
prompt-toolkit==1.0.14    # via ipython
ptyprocess==0.5.1         # via pexpect
pycodestyle==2.3.1        # via autopep8
pygments==2.2.0           # via ipython
rope==0.10.5
simplegeneric==0.8.1      # via ipython
six==1.10.0               # via pip-tools, prompt-toolkit, traitlets
traitlets==4.3.2          # via ipython
wcwidth==0.1.7            # via prompt-toolkit
yapf==0.16.1
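As a side note (not part of the commit), the pinned versions in dev-requirements.txt can be compared against what is actually installed. This is a minimal sketch, assuming Python 3.8+ so that importlib.metadata is available; the check_pins helper and its default path are illustrative names, not anything defined in this repository:

from importlib.metadata import version, PackageNotFoundError


def check_pins(path='dev-requirements.txt'):
    # Compare each "name==version" pin in the pip-compile output with the
    # version reported by the installed distribution, if any.
    with open(path) as f:
        for line in f:
            line = line.split('#')[0].strip()
            if not line or '==' not in line:
                continue
            name, pinned = line.split('==')
            try:
                installed = version(name)
            except PackageNotFoundError:
                installed = None
            status = 'OK' if installed == pinned else 'MISMATCH (installed: %s)' % installed
            print('%s==%s -> %s' % (name, pinned, status))


if __name__ == '__main__':
    check_pins()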
parish-scrapper.py (new file, 88 lines)
@@ -0,0 +1,88 @@
import requests
# from bs4 import BeautifulSoup
import re
from collections import namedtuple
import time
import dill


class ParishScraper(object):
    """Documentation for ParishScraper

    """

    def __init__(self):
        self.website_prefix = 'http://colaska.pl/index/parafia/id/'

    def _scrap(self):
        parishes = []
        for page_nr in range(1, 11000):
            page = requests.get(self.website_prefix + str(page_nr))
            sleep_time = 2
            while page.status_code == 500:
                print('Status code 500 error')
                sleep_time = sleep_time**2
                print('Waiting ' + str(sleep_time) + ' sec')
                time.sleep(sleep_time)
                page = requests.get(self.website_prefix + str(page_nr))
            if 'id' in page.url:
                page_nr += 1
                parish = self._retrieve_info(page)
                print(parish)
                print('\n')
                parishes.append(parish)
        return parishes

    def _retrieve_info(self, page):
        page.encoding = 'utf-8'
        html_doc = page.text
        meta_url = page.url
        print(meta_url)
        try:
            search_result = re.search(
                'pHead rel">[\w\W]*?<p class="title">(.*?)</p>[\w\W]*?class="city">(.*?)</span>[\w\W]*?<p>(.*?)<br />(.*?)</p>',
                html_doc)
            if search_result is None:
                search_result = re.search(
                    'pHead rel">[\w\W]*?<p class="title">(.*?)</p>[\w\W]*?class="city">(.*?)</span>[\w\W]*?<p>(.*?)</p>',
                    html_doc)
                street = ''
                postal_code = search_result.group(3)
            else:
                street = search_result.group(3)
                postal_code = search_result.group(4)

            name = search_result.group(1)
            city = search_result.group(2)

            url_search = re.search('link mt10"><a href="(.*?)">', html_doc)
            url = '' if url_search is None else url_search.group(1)

            gps = re.search('id="tabsmaps" gps="(.*?)"><span',
                            html_doc).group(1)
            Parish = namedtuple('Parish', [
                'meta_url', 'url', 'name', 'city', 'street', 'postal_code',
                'gps'
            ])

            parish = Parish(meta_url, url, name, city, street, postal_code,
                            gps)
        except AttributeError:
            import ipdb
            ipdb.set_trace()
        return parish

    def scrap_and_save(self):
        parishes = self._scrap()
        with open('parishes.dill', 'wb') as f:
            dill.dump(parishes, f, dill.HIGHEST_PROTOCOL)
        pass


def main():
    parish_scraper = ParishScraper()
    parish_scraper.scrap_and_save()


if __name__ == "__main__":
    main()
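The committed script dumps the scraped records to parishes.dill via dill. As a usage sketch (not part of the commit, and assuming parish-scrapper.py has run to completion and written parishes.dill in the working directory), the results can be loaded back and inspected like this; dill.load restores the list of Parish namedtuples, including the class that was defined inside _retrieve_info:

import dill

# Load the list of Parish namedtuples written by ParishScraper.scrap_and_save().
with open('parishes.dill', 'rb') as f:
    parishes = dill.load(f)

print(len(parishes), 'parishes loaded')
for parish in parishes[:5]:
    # Each record carries meta_url, url, name, city, street, postal_code, gps.
    print(parish.name, '-', parish.city, parish.gps)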
parish-scrapper.py~ (new file, 88 lines)
@@ -0,0 +1,88 @@
import requests
# from bs4 import BeautifulSoup
import re
from collections import namedtuple
import pickle
import time


class ParishScraper(object):
    """Documentation for ParishScraper

    """

    def __init__(self):
        self.website_prefix = 'http://colaska.pl/index/parafia/id/'

    def _scrap(self):
        parishes = []
        for page_nr in range(1, 11000):
            page = requests.get(self.website_prefix + str(page_nr))
            sleep_time = 2
            while page.status_code == 500:
                print('Status code 500 error')
                sleep_time = sleep_time**2
                print('Waiting ' + str(sleep_time) + ' sec')
                time.sleep(sleep_time)
                page = requests.get(self.website_prefix + str(page_nr))
            if 'id' in page.url:
                page_nr += 1
                parish = self._retrieve_info(page)
                print(parish)
                print('\n')
                parishes.append(parish)
        return parishes

    def _retrieve_info(self, page):
        page.encoding = 'utf-8'
        html_doc = page.text
        meta_url = page.url
        print(meta_url)
        try:
            search_result = re.search(
                'pHead rel">[\w\W]*?<p class="title">(.*?)</p>[\w\W]*?class="city">(.*?)</span>[\w\W]*?<p>(.*?)<br />(.*?)</p>',
                html_doc)
            if search_result is None:
                search_result = re.search(
                    'pHead rel">[\w\W]*?<p class="title">(.*?)</p>[\w\W]*?class="city">(.*?)</span>[\w\W]*?<p>(.*?)</p>',
                    html_doc)
                street = ''
                postal_code = search_result.group(3)
            else:
                street = search_result.group(3)
                postal_code = search_result.group(4)

            name = search_result.group(1)
            city = search_result.group(2)

            url_search = re.search('link mt10"><a href="(.*?)">', html_doc)
            url = '' if url_search is None else url_search.group(1)

            gps = re.search('id="tabsmaps" gps="(.*?)"><span',
                            html_doc).group(1)
            Parish = namedtuple('Parish', [
                'meta_url', 'url', 'name', 'city', 'street', 'postal_code',
                'gps'
            ])

            parish = Parish(meta_url, url, name, city, street, postal_code,
                            gps)
        except AttributeError:
            import ipdb
            ipdb.set_trace()
        return parish

    def scrap_and_save(self):
        parishes = self._scrap()
        with open('parishes.pickle', 'wb') as f:
            pickle.dump(parishes, f, pickle.HIGHEST_PROTOCOL)
        pass


def main():
    parish_scraper = ParishScraper()
    parish_scraper.scrap_and_save()


if __name__ == "__main__":
    main()
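The leftover parish-scrapper.py~ editor backup differs from the committed script mainly in serializing with pickle to parishes.pickle instead of dill. One plausible reason for the switch to dill (not stated in the commit) is that Parish is a namedtuple created inside _retrieve_info: the standard pickle module can only dump instances whose class is resolvable by name at module level, whereas dill can serialize the locally created class as well. A minimal sketch of that constraint, using illustrative names (make_record, LocalParish) that are not part of the repository:

import pickle
from collections import namedtuple

# Module-level class: pickle can resolve it by name, so dumping works.
Parish = namedtuple('Parish', ['name', 'city'])


def make_record():
    # Class created inside a function, as in ParishScraper._retrieve_info:
    # pickle cannot look it up by name later and refuses to dump its instances.
    LocalParish = namedtuple('LocalParish', ['name', 'city'])
    return LocalParish('St. Example', 'Example City')


if __name__ == '__main__':
    with open('ok.pickle', 'wb') as f:
        pickle.dump(Parish('St. Example', 'Example City'), f)  # fine

    try:
        with open('broken.pickle', 'wb') as f:
            pickle.dump(make_record(), f)
    except pickle.PicklingError as exc:
        print('pickle refused the locally defined namedtuple:', exc)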
requirements.in (new file, 2 lines)
@@ -0,0 +1,2 @@
requests
dill
requirements.txt (new file, 8 lines)
@@ -0,0 +1,8 @@
#
# This file is autogenerated by pip-compile
# To update, run:
#
# pip-compile --output-file requirements.txt requirements.in
#
dill==0.2.6
requests==2.13.0