proof of concept alpha

siulkilulki 2017-06-12 22:08:29 +02:00
parent de56ecb253
commit 57315f9b31
8 changed files with 102 additions and 38 deletions

View File

@@ -1,6 +1,9 @@
 import requests
 from string import Template
 from random import choice
+from proxy import Proxy
+from bs4 import BeautifulSoup
+from bs4.dammit import EncodingDetector

 class DuckDuckGo(object):
@@ -9,29 +12,51 @@ class DuckDuckGo(object):
     """
     def __init__(self, proxies=None, language=''):
-        self.proxies = [] if proxies is None else proxies
-        self.language = language
-        self.query = Template('https://duckduckgo.com/html/?q=$query&kl=$lang')
+        self.proxy_obj = Proxy() if proxies is None else Proxy(proxies)
+        self.query = Template('https://duckduckgo.com/html/?q=$query&kl=' +
+                              language)

-    def _get(self, query, language):
-        link = self.query.substitute(query=query, lang=language)
-        if self.proxies:
-            proxy = choice(self.proxies)
-            ip_and_port = proxy[0]
-            protocol = proxy[1]
-            proxies = {protocol: ip_and_port}
-            requests.get(link, proxies=proxies)
-        return requests.get(link)
+    def _get(self, query):
+        query = query.replace(' ', '+')
+        link = self.query.substitute(query=query)
+        if self.proxy_obj.proxies:
+            proxy = self.proxy_obj.random()
+            print(proxy)
+            return requests.post(link, proxies=proxy)
+        return requests.post(link)

-    def body(self, query, language):
-        pass
+    def _proxy_to_dict(self, proxy):
+        proxy_string = str(proxy[0]) + ':' + str(proxy[1])
+        return {"http": proxy_string, "https": proxy_string}

-    def links(self, query, language):
-        pass
+    def download_proxies(self):
+        self.proxy_obj.download()
+
+    def _soup(self, query):
+        resp = self._get(query)
+        content_type = resp.headers.get('content-type', '').lower()
+        http_encoding = resp.encoding if 'charset' in content_type else None
+        html_encoding = EncodingDetector.find_declared_encoding(
+            resp.content, is_html=True)
+        encoding = html_encoding or http_encoding
+        return BeautifulSoup(resp.content, 'lxml', from_encoding=encoding)
+
+    def html(self, query):
+        soup = self._soup(query)
+        return soup.prettify()
+
+    def links(self, query):
+        soup = self._soup(query)
+        return [
+            link.get('href')
+            for link in soup.find_all('a', class_='result__snippet')
+        ]

 def main():
-    pass
+    duck = DuckDuckGo(language='pl-pl')
+    links = duck.links('koscioly polska')
+    print(links)

 if __name__ == '__main__':
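
Note on the hunk above: _proxy_to_dict builds exactly the mapping shape that requests expects for its proxies= argument. A minimal sketch, using a hypothetical (host, port) tuple of the kind proxy.py yields (the address is illustrative, not data from this repository):

    # Hypothetical (host, port) tuple; illustrative only.
    proxy = ('203.0.113.7', 8080)
    proxy_string = str(proxy[0]) + ':' + str(proxy[1])  # '203.0.113.7:8080'
    proxies = {"http": proxy_string, "https": proxy_string}
    # requests.post(link, proxies=proxies) then routes both
    # schemes through the same proxy.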

View File

@@ -1,17 +1,47 @@
 import dill
-from google import search
+from duckduckgo import DuckDuckGo
+from urllib.parse import urlparse
+import time
+import random

-def check(parish):
-    if parish.url in search(query, lang='pl', stop=10, pause=3.0):
-        return true
+tsv = ''
+urls = ''
+
+def check(parish, duck):
+    global urls
+    global tsv
+    links = _urls(parish, duck)
+    for link in links:
+        parish_root_url = urlparse(parish.url).netloc
+        if parish_root_url == urlparse(link).netloc:
+            urls += parish_root_url + '\n'
+            tsv += parish.name + '\t' + parish.city + '\t' + parish.street + '\t' + parish.postal_code + '\t' + parish_root_url + '\t' + parish.meta_url + '\t' + parish.gps + '\n'
+            print('added')
+            # TODO: save links to txt file, one per line
+            # TODO: wget -r -i file all links
+            # TODO: save parishes to jsonline format
+            return True  # mark as ok url
+    return False
+
+def _urls(parish, duck):
+    query = parish.name + ' ' + parish.street + ' ' + parish.postal_code
+    links = duck.links(query)
+    time.sleep(1)
+    while not links:
+        print('retry')
+        random.randint(3, 10)
+        time.sleep(10)
+        links = duck.links(query)
+    return links

 def find_url(parish):
-    pass
+    links = _urls(parish)
+    import ipdb
+    ipdb.set_trace()
+    print(links)

-def stem_url(url):
-
 def main():
@@ -19,14 +49,19 @@ def main():
     with open('./parishes.dill', 'rb') as f:
         parishes = dill.load(f)
+    duck = DuckDuckGo(language='pl-pl')
+    print('Downloading proxies')
+    duck.download_proxies()
+    i = 0
     for parish in parishes:
+        print(str(i / len(parishes)) + '% done. Nr: ' + str(i))
+        i += 1
         if parish.url:
-            check(parish)
-        else:
-            find_url(parish)
-
-    import ipdb
-    ipdb.set_trace()
+            check(parish, duck)
+    with open('urls.txt', 'w') as f:
+        f.write(urls)
+    with open('parishes.tsv', 'w') as f:
+        f.write(tsv)

 if __name__ == "__main__":
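
Note on _urls above: random.randint(3, 10) is called but its result is discarded, so every retry sleeps a flat 10 seconds. A minimal sketch of the presumably intended jittered delay, under that assumption:

    import random
    import time

    # Presumed intent: wait a random 3-10 s between retries, rather than
    # drawing a random number, discarding it, and sleeping a fixed 10 s.
    time.sleep(random.randint(3, 10))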

View File

@@ -1,5 +1,4 @@
 import requests
-# from bs4 import BeautifulSoup
 import re
 from collections import namedtuple
 import time

parishes.tsv Normal file
View File

@@ -0,0 +1,2 @@
+Parafia pod wezwaniem NMP Królowej Aniołów Adamowice ul. Poprzeczna 15 47-435 Raszyce www.adamowice.katowice.opoka.org.pl http://colaska.pl/index/parafia/id/2 18.2955971,50.5078563
+Parafia pod wezwaniem Narodzenia NMP Albigowa Albigowa 844 37-122 Albigowa www.albigowa.parafia.info.pl http://colaska.pl/index/parafia/id/6 22.229000329971313,50.01446141585083
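
(Column order follows the concatenation in check() above: name, city, street, postal code, root URL, meta URL, GPS coordinates; the tab separators are flattened to spaces in this view.)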

View File

@@ -17,7 +17,7 @@ class Proxy():
         full_list_button = driver.find_element_by_xpath(
             '//input[@type="submit" and @value="Show Full List"]')
         full_list_button.click()
-        print(driver.page_source)
+        #print(driver.page_source)
         for match in re.finditer(
                 '<a href="#(.*?)" class="inactive" onclick="gp.pageClick',
                 driver.page_source):
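
Note on the hunk above: it sits inside Proxy's Selenium scraper, which clicks the site's "Show Full List" button and then pulls the pager anchors out of the raw HTML with a regex rather than through the DOM. A minimal sketch of that pattern, assuming a driver setup and a placeholder URL (neither appears in this diff):

    import re
    from selenium import webdriver

    driver = webdriver.Firefox()      # assumed setup; not shown in the diff
    driver.get('http://example.com')  # placeholder; the real URL is elided

    # Click the "Show Full List" submit button.
    full_list_button = driver.find_element_by_xpath(
        '//input[@type="submit" and @value="Show Full List"]')
    full_list_button.click()

    # Collect pager fragments by regex over the raw page source.
    pages = [m.group(1) for m in re.finditer(
        '<a href="#(.*?)" class="inactive" onclick="gp.pageClick',
        driver.page_source)]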

View File

@@ -1,3 +1,5 @@
 requests
 dill
-dryscrape
+beautifulsoup4
+lxml
+selenium

View File

@@ -4,9 +4,8 @@
 #
 # pip-compile --output-file requirements.txt requirements.in
 #
+beautifulsoup4==4.6.0
 dill==0.2.6
-dryscrape==1.0
-lxml==3.8.0               # via dryscrape
+lxml==3.8.0
 requests==2.13.0
-webkit-server==1.0        # via dryscrape
-xvfbwrapper==0.2.9        # via dryscrape
+selenium==3.4.3

urls.txt Normal file
View File

@@ -0,0 +1,2 @@
+www.adamowice.katowice.opoka.org.pl
+www.albigowa.parafia.info.pl