From 57315f9b3141c446caf5233f9761b9a41d594c3b Mon Sep 17 00:00:00 2001
From: siulkilulki
Date: Mon, 12 Jun 2017 22:08:29 +0200
Subject: [PATCH] proof of concept alpha

---
 duckduckgo.py      | 59 ++++++++++++++++++++++++++++++-------------
 full_scrapper.py   | 63 +++++++++++++++++++++++++++++++++++-----------
 parish-scrapper.py |  1 -
 parishes.tsv       |  2 ++
 proxy.py           |  2 +-
 requirements.in    |  4 ++-
 requirements.txt   |  7 +++---
 urls.txt           |  2 ++
 8 files changed, 102 insertions(+), 38 deletions(-)
 create mode 100644 parishes.tsv
 create mode 100644 urls.txt

diff --git a/duckduckgo.py b/duckduckgo.py
index 71ae4d2..37563f3 100644
--- a/duckduckgo.py
+++ b/duckduckgo.py
@@ -1,6 +1,9 @@
 import requests
 from string import Template
 from random import choice
+from proxy import Proxy
+from bs4 import BeautifulSoup
+from bs4.dammit import EncodingDetector
 
 
 class DuckDuckGo(object):
@@ -9,29 +12,51 @@ class DuckDuckGo(object):
     """
 
     def __init__(self, proxies=None, language=''):
-        self.proxies = [] if proxies is None else proxies
-        self.language = language
-        self.query = Template('https://duckduckgo.com/html/?q=$query&kl=$lang')
+        self.proxy_obj = Proxy() if proxies is None else Proxy(proxies)
+        self.query = Template('https://duckduckgo.com/html/?q=$query&kl=' +
+                              language)
 
-    def _get(self, query, language):
-        link = self.query.substitute(query=query, lang=language)
-        if self.proxies:
-            proxy = choice(self.proxies)
-            ip_and_port = proxy[0]
-            protocol = proxy[1]
-            proxies = {protocol: ip_and_port}
-            requests.get(link, proxies=proxies)
-        return requests.get(link)
+    def _get(self, query):
+        query = query.replace(' ', '+')
+        link = self.query.substitute(query=query)
+        if self.proxy_obj.proxies:
+            proxy = self.proxy_obj.random()
+            print(proxy)
+            return requests.post(link, proxies=proxy)
+        return requests.post(link)
 
-    def body(self, query, language):
-        pass
+    def _proxy_to_dict(self, proxy):
+        proxy_string = str(proxy[0]) + ':' + str(proxy[1])
+        return {"http": proxy_string, "https": proxy_string}
 
-    def links(self, query, language):
-        pass
+    def download_proxies(self):
+        self.proxy_obj.download()
+
+    def _soup(self, query):
+        resp = self._get(query)
+        content_type = resp.headers.get('content-type', '').lower()
+        http_encoding = resp.encoding if 'charset' in content_type else None
+        html_encoding = EncodingDetector.find_declared_encoding(
+            resp.content, is_html=True)
+        encoding = html_encoding or http_encoding
+        return BeautifulSoup(resp.content, 'lxml', from_encoding=encoding)
+
+    def html(self, query):
+        soup = self._soup(query)
+        return soup.prettify()
+
+    def links(self, query):
+        soup = self._soup(query)
+        return [
+            link.get('href')
+            for link in soup.find_all('a', class_='result__snippet')
+        ]
 
 
 def main():
-    pass
+    duck = DuckDuckGo(language='pl-pl')
+    links = duck.links('koscioly polska')
+    print(links)
 
 
 if __name__ == '__main__':
diff --git a/full_scrapper.py b/full_scrapper.py
index 5ff114d..601e6f5 100644
--- a/full_scrapper.py
+++ b/full_scrapper.py
@@ -1,17 +1,47 @@
 import dill
-from google import search
+from duckduckgo import DuckDuckGo
+from urllib.parse import urlparse
+import time
+import random
+tsv = ''
+urls = ''
 
 
-def check(parish):
-    if parish.url in search(query, lang='pl', stop=10, pause=3.0):
-        return true
+def check(parish, duck):
+    global urls
+    global tsv
+    links = _urls(parish, duck)
+    for link in links:
+        parish_root_url = urlparse(parish.url).netloc
+        if parish_root_url == urlparse(link).netloc:
+            urls += parish_root_url + '\n'
+            tsv += parish.name + '\t' + parish.city + '\t' + parish.street + \
+                '\t' + parish.postal_code + '\t' + parish_root_url + '\t' + parish.meta_url + '\t' + parish.gps + '\n'
+            print('added')
+            # TODO: save links to txt file, one per line
+            # TODO: wget -r -i file all links
+            # TODO: save parishes to jsonline format
+            return True  # mark as ok url
+    return False
+
+
+def _urls(parish, duck):
+    query = parish.name + ' ' + parish.street + ' ' + parish.postal_code
+    links = duck.links(query)
+    time.sleep(1)
+    while not links:
+        print('retry')
+        wait = random.randint(3, 10)
+        time.sleep(wait)
+        links = duck.links(query)
+    return links
 
 
 def find_url(parish):
-    pass
-
-
-def stem_url(url):
+    links = _urls(parish)  # FIXME: _urls() now also needs a DuckDuckGo instance
+    import ipdb
+    ipdb.set_trace()
+    print(links)
 
 
 def main():
@@ -19,14 +49,19 @@
     with open('./parishes.dill', 'rb') as f:
         parishes = dill.load(f)
 
+    duck = DuckDuckGo(language='pl-pl')
+    print('Downloading proxies')
+    duck.download_proxies()
+    i = 0
     for parish in parishes:
+        print(str(i * 100 / len(parishes)) + '% done. Nr: ' + str(i))
+        i += 1
         if parish.url:
-            check(parish)
-        else:
-            find_url(parish)
-
-        import ipdb
-        ipdb.set_trace()
+            check(parish, duck)
+    with open('urls.txt', 'w') as f:
+        f.write(urls)
+    with open('parishes.tsv', 'w') as f:
+        f.write(tsv)
 
 
 if __name__ == "__main__":
diff --git a/parish-scrapper.py b/parish-scrapper.py
index ae15364..862236d 100644
--- a/parish-scrapper.py
+++ b/parish-scrapper.py
@@ -1,5 +1,4 @@
 import requests
-# from bs4 import BeautifulSoup
 import re
 from collections import namedtuple
 import time
diff --git a/parishes.tsv b/parishes.tsv
new file mode 100644
index 0000000..b37358a
--- /dev/null
+++ b/parishes.tsv
@@ -0,0 +1,2 @@
+Parafia pod wezwaniem NMP Królowej Aniołów	Adamowice	ul. Poprzeczna 15	47-435 Raszyce	www.adamowice.katowice.opoka.org.pl	http://colaska.pl/index/parafia/id/2	18.2955971,50.5078563
+Parafia pod wezwaniem Narodzenia NMP	Albigowa	Albigowa 844	37-122 Albigowa	www.albigowa.parafia.info.pl	http://colaska.pl/index/parafia/id/6	22.229000329971313,50.01446141585083
diff --git a/proxy.py b/proxy.py
index f4cd6e5..3573f72 100644
--- a/proxy.py
+++ b/proxy.py
@@ -17,7 +17,7 @@ class Proxy():
         full_list_button = driver.find_element_by_xpath(
             '//input[@type="submit" and @value="Show Full List"]')
         full_list_button.click()
-        print(driver.page_source)
+        #print(driver.page_source)
         for match in re.finditer(
                 '
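Usage note (not part of the patch): a minimal smoke test for the reworked DuckDuckGo class, assuming proxy.py's Proxy can be constructed with no arguments as __init__ above does; the query string is illustrative.

    from duckduckgo import DuckDuckGo

    duck = DuckDuckGo(language='pl-pl')  # 'kl' locale appended to the search URL
    duck.download_proxies()              # optional: route requests through random proxies
    for href in duck.links('parafia Narodzenia NMP Albigowa'):
        print(href)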
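The "save parishes to jsonline format" TODO in check() could be filled in roughly as below; only a sketch, assuming parish is the namedtuple built by parish-scrapper.py and parishes.jsonl is a hypothetical output path.

    import json

    def parish_to_jsonline(parish):
        # namedtuple -> dict -> one JSON object per line
        return json.dumps(parish._asdict(), ensure_ascii=False) + '\n'

    with open('parishes.jsonl', 'a', encoding='utf-8') as f:
        f.write(parish_to_jsonline(parish))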