proof of concept alpha

siulkilulki 2017-06-12 22:08:29 +02:00
parent de56ecb253
commit 57315f9b31
8 changed files with 102 additions and 38 deletions

View File

@@ -1,6 +1,9 @@
 import requests
 from string import Template
 from random import choice
+from proxy import Proxy
+from bs4 import BeautifulSoup
+from bs4.dammit import EncodingDetector

 class DuckDuckGo(object):
@@ -9,29 +12,51 @@ class DuckDuckGo(object):
     """
     def __init__(self, proxies=None, language=''):
-        self.proxies = [] if proxies is None else proxies
-        self.language = language
-        self.query = Template('https://duckduckgo.com/html/?q=$query&kl=$lang')
+        self.proxy_obj = Proxy() if proxies is None else Proxy(proxies)
+        self.query = Template('https://duckduckgo.com/html/?q=$query&kl=' +
+                              language)

-    def _get(self, query, language):
-        link = self.query.substitute(query=query, lang=language)
-        if self.proxies:
-            proxy = choice(self.proxies)
-            ip_and_port = proxy[0]
-            protocol = proxy[1]
-            proxies = {protocol: ip_and_port}
-            requests.get(link, proxies=proxies)
-        return requests.get(link)
+    def _get(self, query):
+        query = query.replace(' ', '+')
+        link = self.query.substitute(query=query)
+        if self.proxy_obj.proxies:
+            proxy = self.proxy_obj.random()
+            print(proxy)
+            return requests.post(link, proxies=proxy)
+        return requests.post(link)

-    def body(self, query, language):
-        pass
+    def _proxy_to_dict(self, proxy):
+        proxy_string = str(proxy[0]) + ':' + str(proxy[1])
+        return {"http": proxy_string, "https": proxy_string}

-    def links(self, query, language):
-        pass
+    def download_proxies(self):
+        self.proxy_obj.download()
+
+    def _soup(self, query):
+        resp = self._get(query)
+        content_type = resp.headers.get('content-type', '').lower()
+        http_encoding = resp.encoding if 'charset' in content_type else None
+        html_encoding = EncodingDetector.find_declared_encoding(
+            resp.content, is_html=True)
+        encoding = html_encoding or http_encoding
+        return BeautifulSoup(resp.content, 'lxml', from_encoding=encoding)
+
+    def html(self, query):
+        soup = self._soup(query)
+        return soup.prettify()
+
+    def links(self, query):
+        soup = self._soup(query)
+        return [
+            link.get('href')
+            for link in soup.find_all('a', class_='result__snippet')
+        ]

 def main():
-    pass
+    duck = DuckDuckGo(language='pl-pl')
+    links = duck.links('koscioly polska')
+    print(links)

 if __name__ == '__main__':
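
Note on the hunk above: _proxy_to_dict builds exactly the mapping shape that requests expects for its proxies= argument. A minimal sketch, using a hypothetical (host, port) tuple of the kind proxy.py yields (the address is illustrative, not data from this repository):

    # Hypothetical (host, port) tuple; illustrative only.
    proxy = ('203.0.113.7', 8080)
    proxy_string = str(proxy[0]) + ':' + str(proxy[1])  # '203.0.113.7:8080'
    proxies = {"http": proxy_string, "https": proxy_string}
    # requests.post(link, proxies=proxies) then routes both
    # schemes through the same proxy.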

View File

@@ -1,17 +1,47 @@
 import dill
-from google import search
+from duckduckgo import DuckDuckGo
+from urllib.parse import urlparse
+import time
+import random

-def check(parish):
-    if parish.url in search(query, lang='pl', stop=10, pause=3.0):
-        return true
+tsv = ''
+urls = ''
+
+def check(parish, duck):
+    global urls
+    global tsv
+    links = _urls(parish, duck)
+    for link in links:
+        parish_root_url = urlparse(parish.url).netloc
+        if parish_root_url == urlparse(link).netloc:
+            urls += parish_root_url + '\n'
+            tsv += parish.name + '\t' + parish.city + '\t' + parish.street + '\t' + parish.postal_code + '\t' + parish_root_url + '\t' + parish.meta_url + '\t' + parish.gps + '\n'
+            print('added')
+            # TODO: save links to txt file, one per line
+            # TODO: wget -r -i file all links
+            # TODO: save parishes to jsonline format
+            return True  # mark as ok url
+    return False
+
+def _urls(parish, duck):
+    query = parish.name + ' ' + parish.street + ' ' + parish.postal_code
+    links = duck.links(query)
+    time.sleep(1)
+    while not links:
+        print('retry')
+        random.randint(3, 10)
+        time.sleep(10)
+        links = duck.links(query)
+    return links

 def find_url(parish):
-    pass
+    links = _urls(parish)
+    import ipdb
+    ipdb.set_trace()
+    print(links)

-def stem_url(url):
-
 def main():
@@ -19,14 +49,19 @@ def main():
     with open('./parishes.dill', 'rb') as f:
         parishes = dill.load(f)
+    duck = DuckDuckGo(language='pl-pl')
+    print('Downloading proxies')
+    duck.download_proxies()
+    i = 0
     for parish in parishes:
+        print(str(i / len(parishes)) + '% done. Nr: ' + str(i))
+        i += 1
         if parish.url:
-            check(parish)
-        else:
-            find_url(parish)
-
-    import ipdb
-    ipdb.set_trace()
+            check(parish, duck)
+    with open('urls.txt', 'w') as f:
+        f.write(urls)
+    with open('parishes.tsv', 'w') as f:
+        f.write(tsv)

 if __name__ == "__main__":
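
Note on _urls above: random.randint(3, 10) is called but its result is discarded, so every retry sleeps a flat 10 seconds. A minimal sketch of the presumably intended jittered delay, under that assumption:

    import random
    import time

    # Presumed intent: wait a random 3-10 s between retries, rather than
    # drawing a random number, discarding it, and sleeping a fixed 10 s.
    time.sleep(random.randint(3, 10))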

View File

@@ -1,5 +1,4 @@
 import requests
-# from bs4 import BeautifulSoup
 import re
 from collections import namedtuple
 import time

parishes.tsv Normal file
View File

@@ -0,0 +1,2 @@
+Parafia pod wezwaniem NMP Królowej Aniołów Adamowice ul. Poprzeczna 15 47-435 Raszyce www.adamowice.katowice.opoka.org.pl http://colaska.pl/index/parafia/id/2 18.2955971,50.5078563
+Parafia pod wezwaniem Narodzenia NMP Albigowa Albigowa 844 37-122 Albigowa www.albigowa.parafia.info.pl http://colaska.pl/index/parafia/id/6 22.229000329971313,50.01446141585083
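
(Column order follows the concatenation in check() above: name, city, street, postal code, root URL, meta URL, GPS coordinates; the tab separators are flattened to spaces in this view.)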

View File

@@ -17,7 +17,7 @@ class Proxy():
         full_list_button = driver.find_element_by_xpath(
             '//input[@type="submit" and @value="Show Full List"]')
         full_list_button.click()
-        print(driver.page_source)
+        #print(driver.page_source)
         for match in re.finditer(
                 '<a href="#(.*?)" class="inactive" onclick="gp.pageClick',
                 driver.page_source):
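
Note on the hunk above: it sits inside Proxy's Selenium scraper, which clicks the site's "Show Full List" button and then pulls the pager anchors out of the raw HTML with a regex rather than through the DOM. A minimal sketch of that pattern, assuming a driver setup and a placeholder URL (neither appears in this diff):

    import re
    from selenium import webdriver

    driver = webdriver.Firefox()      # assumed setup; not shown in the diff
    driver.get('http://example.com')  # placeholder; the real URL is elided

    # Click the "Show Full List" submit button.
    full_list_button = driver.find_element_by_xpath(
        '//input[@type="submit" and @value="Show Full List"]')
    full_list_button.click()

    # Collect pager fragments by regex over the raw page source.
    pages = [m.group(1) for m in re.finditer(
        '<a href="#(.*?)" class="inactive" onclick="gp.pageClick',
        driver.page_source)]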

View File

@@ -1,3 +1,5 @@
 requests
 dill
-dryscrape
+beautifulsoup4
+lxml
+selenium

View File

@@ -4,9 +4,8 @@
 #
 # pip-compile --output-file requirements.txt requirements.in
 #
+beautifulsoup4==4.6.0
 dill==0.2.6
-dryscrape==1.0
-lxml==3.8.0               # via dryscrape
+lxml==3.8.0
 requests==2.13.0
-webkit-server==1.0        # via dryscrape
-xvfbwrapper==0.2.9        # via dryscrape
+selenium==3.4.3

urls.txt Normal file
View File

@@ -0,0 +1,2 @@
+www.adamowice.katowice.opoka.org.pl
+www.albigowa.parafia.info.pl