From 57315f9b3141c446caf5233f9761b9a41d594c3b Mon Sep 17 00:00:00 2001
From: siulkilulki
Date: Mon, 12 Jun 2017 22:08:29 +0200
Subject: [PATCH] proof of concept alpha

---
 duckduckgo.py      | 59 ++++++++++++++++++++++++++++++-------------
 full_scrapper.py   | 63 +++++++++++++++++++++++++++++++++++-----------
 parish-scrapper.py |  1 -
 parishes.tsv       |  2 ++
 proxy.py           |  2 +-
 requirements.in    |  4 ++-
 requirements.txt   |  7 +++---
 urls.txt           |  2 ++
 8 files changed, 102 insertions(+), 38 deletions(-)
 create mode 100644 parishes.tsv
 create mode 100644 urls.txt

diff --git a/duckduckgo.py b/duckduckgo.py
index 71ae4d2..37563f3 100644
--- a/duckduckgo.py
+++ b/duckduckgo.py
@@ -1,6 +1,9 @@
 import requests
 from string import Template
 from random import choice
+from proxy import Proxy
+from bs4 import BeautifulSoup
+from bs4.dammit import EncodingDetector
 
 
 class DuckDuckGo(object):
@@ -9,29 +12,51 @@ class DuckDuckGo(object):
     """
 
     def __init__(self, proxies=None, language=''):
-        self.proxies = [] if proxies is None else proxies
-        self.language = language
-        self.query = Template('https://duckduckgo.com/html/?q=$query&kl=$lang')
+        self.proxy_obj = Proxy() if proxies is None else Proxy(proxies)
+        self.query = Template('https://duckduckgo.com/html/?q=$query&kl=' +
+                              language)
 
-    def _get(self, query, language):
-        link = self.query.substitute(query=query, lang=language)
-        if self.proxies:
-            proxy = choice(self.proxies)
-            ip_and_port = proxy[0]
-            protocol = proxy[1]
-            proxies = {protocol: ip_and_port}
-            requests.get(link, proxies=proxies)
-        return requests.get(link)
+    def _get(self, query):
+        query = query.replace(' ', '+')
+        link = self.query.substitute(query=query)
+        if self.proxy_obj.proxies:
+            proxy = self.proxy_obj.random()
+            print(proxy)
+            return requests.post(link, proxies=proxy)
+        return requests.post(link)
 
-    def body(self, query, language):
-        pass
+    def _proxy_to_dict(self, proxy):
+        proxy_string = str(proxy[0]) + ':' + str(proxy[1])
+        return {"http": proxy_string, "https": proxy_string}
 
-    def links(self, query, language):
-        pass
+    def download_proxies(self):
+        self.proxy_obj.download()
+
+    def _soup(self, query):
+        resp = self._get(query)
+        content_type = resp.headers.get('content-type', '').lower()
+        http_encoding = resp.encoding if 'charset' in content_type else None
+        html_encoding = EncodingDetector.find_declared_encoding(
+            resp.content, is_html=True)
+        encoding = html_encoding or http_encoding
+        return BeautifulSoup(resp.content, 'lxml', from_encoding=encoding)
+
+    def html(self, query):
+        soup = self._soup(query)
+        return soup.prettify()
+
+    def links(self, query):
+        soup = self._soup(query)
+        return [
+            link.get('href')
+            for link in soup.find_all('a', class_='result__snippet')
+        ]
 
 
 def main():
-    pass
+    duck = DuckDuckGo(language='pl-pl')
+    links = duck.links('koscioly polska')
+    print(links)
 
 
 if __name__ == '__main__':
diff --git a/full_scrapper.py b/full_scrapper.py
index 5ff114d..601e6f5 100644
--- a/full_scrapper.py
+++ b/full_scrapper.py
@@ -1,17 +1,47 @@
 import dill
-from google import search
+from duckduckgo import DuckDuckGo
+from urllib.parse import urlparse
+import time
+import random
+tsv = ''
+urls = ''
 
 
-def check(parish):
-    if parish.url in search(query, lang='pl', stop=10, pause=3.0):
-        return true
+def check(parish, duck):
+    global urls
+    global tsv
+    links = _urls(parish, duck)
+    for link in links:
+        parish_root_url = urlparse(parish.url).netloc
+        if parish_root_url == urlparse(link).netloc:
+            urls += parish_root_url + '\n'
+            tsv += parish.name + '\t' + parish.city + '\t' + parish.street + \
+                '\t' + parish.postal_code + '\t' + parish_root_url + '\t' + parish.meta_url + '\t' + parish.gps + '\n'
+            print('added')
+            # TODO: save links to txt file, one per line
+            # TODO: wget -r -i file all links
+            # TODO: save parishes to jsonline format
+            return True  # mark as ok url
+    return False
+
+
+def _urls(parish, duck):
+    query = parish.name + ' ' + parish.street + ' ' + parish.postal_code
+    links = duck.links(query)
+    time.sleep(1)
+    while not links:
+        print('retry')
+        wait = random.randint(3, 10)
+        time.sleep(wait)
+        links = duck.links(query)
+    return links
 
 
 def find_url(parish):
-    pass
-
-
-def stem_url(url):
+    links = _urls(parish)  # FIXME: _urls() now also needs a DuckDuckGo instance
+    import ipdb
+    ipdb.set_trace()
+    print(links)
 
 
 def main():
@@ -19,14 +49,19 @@
     with open('./parishes.dill', 'rb') as f:
         parishes = dill.load(f)
 
+    duck = DuckDuckGo(language='pl-pl')
+    print('Downloading proxies')
+    duck.download_proxies()
+    i = 0
     for parish in parishes:
+        print(str(i * 100 / len(parishes)) + '% done. Nr: ' + str(i))
+        i += 1
         if parish.url:
-            check(parish)
-        else:
-            find_url(parish)
-
-        import ipdb
-        ipdb.set_trace()
+            check(parish, duck)
+    with open('urls.txt', 'w') as f:
+        f.write(urls)
+    with open('parishes.tsv', 'w') as f:
+        f.write(tsv)
 
 
 if __name__ == "__main__":
diff --git a/parish-scrapper.py b/parish-scrapper.py
index ae15364..862236d 100644
--- a/parish-scrapper.py
+++ b/parish-scrapper.py
@@ -1,5 +1,4 @@
 import requests
-# from bs4 import BeautifulSoup
 import re
 from collections import namedtuple
 import time
diff --git a/parishes.tsv b/parishes.tsv
new file mode 100644
index 0000000..b37358a
--- /dev/null
+++ b/parishes.tsv
@@ -0,0 +1,2 @@
+Parafia pod wezwaniem NMP Królowej Aniołów	Adamowice	ul. Poprzeczna 15	47-435 Raszyce	www.adamowice.katowice.opoka.org.pl	http://colaska.pl/index/parafia/id/2	18.2955971,50.5078563
+Parafia pod wezwaniem Narodzenia NMP	Albigowa	Albigowa 844	37-122 Albigowa	www.albigowa.parafia.info.pl	http://colaska.pl/index/parafia/id/6	22.229000329971313,50.01446141585083
diff --git a/proxy.py b/proxy.py
index f4cd6e5..3573f72 100644
--- a/proxy.py
+++ b/proxy.py
@@ -17,7 +17,7 @@ class Proxy():
         full_list_button = driver.find_element_by_xpath(
             '//input[@type="submit" and @value="Show Full List"]')
         full_list_button.click()
-        print(driver.page_source)
+        #print(driver.page_source)
         for match in re.finditer(
                 '
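Usage note (not part of the patch): a minimal smoke test for the reworked DuckDuckGo class, assuming proxy.py's Proxy can be constructed with no arguments as __init__ above does; the query string is illustrative.

    from duckduckgo import DuckDuckGo

    duck = DuckDuckGo(language='pl-pl')  # 'kl' locale appended to the search URL
    duck.download_proxies()              # optional: route requests through random proxies
    for href in duck.links('parafia Narodzenia NMP Albigowa'):
        print(href)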
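The "save parishes to jsonline format" TODO in check() could be filled in roughly as below; only a sketch, assuming parish is the namedtuple built by parish-scrapper.py and parishes.jsonl is a hypothetical output path.

    import json

    def parish_to_jsonline(parish):
        # namedtuple -> dict -> one JSON object per line
        return json.dumps(parish._asdict(), ensure_ascii=False) + '\n'

    with open('parishes.jsonl', 'a', encoding='utf-8') as f:
        f.write(parish_to_jsonline(parish))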