add parish scraping script
commit 7aed0dda4f
parent 5dc436781b
AAAI96-155.pdf (new binary file; binary content not shown)
dev-requirements.in (new file, 7 lines)
@@ -0,0 +1,7 @@
pip-tools
jedi
rope
importmagic
autopep8
yapf
ipdb
dev-requirements.txt (new file, 28 lines)
@@ -0,0 +1,28 @@
#
# This file is autogenerated by pip-compile
# To update, run:
#
# pip-compile --output-file dev-requirements.txt dev-requirements.in
#
autopep8==1.3.1
click==6.7 # via pip-tools
decorator==4.0.11 # via ipython, traitlets
first==2.0.1 # via pip-tools
importmagic==0.1.7
ipdb==0.10.2
ipython-genutils==0.2.0 # via traitlets
ipython==5.3.0 # via ipdb
jedi==0.10.2
pexpect==4.2.1 # via ipython
pickleshare==0.7.4 # via ipython
pip-tools==1.9.0
prompt-toolkit==1.0.14 # via ipython
ptyprocess==0.5.1 # via pexpect
pycodestyle==2.3.1 # via autopep8
pygments==2.2.0 # via ipython
rope==0.10.5
simplegeneric==0.8.1 # via ipython
six==1.10.0 # via pip-tools, prompt-toolkit, traitlets
traitlets==4.3.2 # via ipython
wcwidth==0.1.7 # via prompt-toolkit
yapf==0.16.1
parish-scrapper.py (new file, 88 lines)
@@ -0,0 +1,88 @@
import re
import time
from collections import namedtuple

import dill
import requests
# from bs4 import BeautifulSoup


class ParishScraper(object):
    """Scrapes parish names, addresses and GPS coordinates from colaska.pl."""

    def __init__(self):
        self.website_prefix = 'http://colaska.pl/index/parafia/id/'

    def _scrap(self):
        parishes = []
        for page_nr in range(1, 11000):
            page = requests.get(self.website_prefix + str(page_nr))
            # Retry while the server answers HTTP 500; the delay is squared
            # on every attempt (4 s, 16 s, 256 s, ...).
            sleep_time = 2
            while page.status_code == 500:
                print('Status code 500 error')
                sleep_time = sleep_time**2
                print('Waiting ' + str(sleep_time) + ' sec')
                time.sleep(sleep_time)
                page = requests.get(self.website_prefix + str(page_nr))
            # Non-existent ids redirect away from an /id/ URL, so only
            # responses whose final URL still contains 'id' are parsed.
            if 'id' in page.url:
                parish = self._retrieve_info(page)
                print(parish)
                print('\n')
                parishes.append(parish)
        return parishes

    def _retrieve_info(self, page):
        page.encoding = 'utf-8'
        html_doc = page.text
        meta_url = page.url
        print(meta_url)
        try:
            # Pages with a street address carry name, city, street and
            # postal code; pages without a street omit the <br /> part.
            search_result = re.search(
                r'pHead rel">[\w\W]*?<p class="title">(.*?)</p>[\w\W]*?class="city">(.*?)</span>[\w\W]*?<p>(.*?)<br />(.*?)</p>',
                html_doc)
            if search_result is None:
                search_result = re.search(
                    r'pHead rel">[\w\W]*?<p class="title">(.*?)</p>[\w\W]*?class="city">(.*?)</span>[\w\W]*?<p>(.*?)</p>',
                    html_doc)
                street = ''
                postal_code = search_result.group(3)
            else:
                street = search_result.group(3)
                postal_code = search_result.group(4)

            name = search_result.group(1)
            city = search_result.group(2)

            url_search = re.search(r'link mt10"><a href="(.*?)">', html_doc)
            url = '' if url_search is None else url_search.group(1)

            gps = re.search(r'id="tabsmaps" gps="(.*?)"><span',
                            html_doc).group(1)
            # Defined locally, this class cannot be resolved by name, which
            # is why dill rather than plain pickle serializes it below.
            Parish = namedtuple('Parish', [
                'meta_url', 'url', 'name', 'city', 'street', 'postal_code',
                'gps'
            ])

            parish = Parish(meta_url, url, name, city, street, postal_code,
                            gps)
        except AttributeError:
            # A regex returned None on an unexpected page layout; drop into
            # the debugger to inspect, then re-raise.
            import ipdb
            ipdb.set_trace()
            raise
        return parish

    def scrap_and_save(self):
        parishes = self._scrap()
        with open('parishes.dill', 'wb') as f:
            dill.dump(parishes, f, dill.HIGHEST_PROTOCOL)


def main():
    parish_scraper = ParishScraper()
    parish_scraper.scrap_and_save()


if __name__ == "__main__":
    main()
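
For reference, reading the scraper's output back is straightforward with dill. A minimal sketch, assuming the script has run to completion and parishes.dill sits in the working directory; the loop and the printed fields are illustrative, not part of the commit:

import dill

with open('parishes.dill', 'rb') as f:
    parishes = dill.load(f)

# Each entry is a Parish namedtuple, so fields read as attributes.
for parish in parishes[:5]:
    print(parish.name, parish.city, parish.gps)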
parish-scrapper.py~ (new file, 88 lines)
Editor backup of parish-scrapper.py, committed alongside the script. Its 88 lines duplicate the file above, differing only in the serializer: import pickle in place of import dill, an output file named parishes.pickle, and pickle.dump(parishes, f, pickle.HIGHEST_PROTOCOL) as the dump call.
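
The script extracts fields from raw HTML with regular expressions; the commented-out BeautifulSoup import hints at the more robust alternative. A minimal sketch of the same extraction with bs4, where the selectors p.title and span.city are inferred from the regexes above, not verified against the site, and retrieve_info_bs4 is a hypothetical helper:

from bs4 import BeautifulSoup

def retrieve_info_bs4(html_doc):
    # Parse once, then query by CSS selector instead of regex.
    soup = BeautifulSoup(html_doc, 'html.parser')
    name_tag = soup.select_one('p.title')
    city_tag = soup.select_one('span.city')
    name = name_tag.get_text(strip=True) if name_tag else ''
    city = city_tag.get_text(strip=True) if city_tag else ''
    return name, city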
requirements.in (new file, 2 lines)
@@ -0,0 +1,2 @@
requests
dill
requirements.txt (new file, 8 lines)
@@ -0,0 +1,8 @@
#
# This file is autogenerated by pip-compile
# To update, run:
#
# pip-compile --output-file requirements.txt requirements.in
#
dill==0.2.6
requests==2.13.0