proof of concept alpha

siulkilulki 2017-06-12 22:08:29 +02:00
parent de56ecb253
commit 57315f9b31
8 changed files with 102 additions and 38 deletions

duckduckgo.py View File

@@ -1,6 +1,9 @@
import requests
from string import Template
from random import choice
from proxy import Proxy
from bs4 import BeautifulSoup
from bs4.dammit import EncodingDetector


class DuckDuckGo(object):
@@ -9,29 +12,51 @@ class DuckDuckGo(object):
    """
    def __init__(self, proxies=None, language=''):
        self.proxies = [] if proxies is None else proxies
        self.language = language
        self.query = Template('https://duckduckgo.com/html/?q=$query&kl=$lang')
        self.proxy_obj = Proxy() if proxies is None else Proxy(proxies)
        self.query = Template('https://duckduckgo.com/html/?q=$query&kl=' +
                              language)

    def _get(self, query, language):
        link = self.query.substitute(query=query, lang=language)
        if self.proxies:
            proxy = choice(self.proxies)
            ip_and_port = proxy[0]
            protocol = proxy[1]
            proxies = {protocol: ip_and_port}
            return requests.get(link, proxies=proxies)
        return requests.get(link)

    def _get(self, query):
        query = query.replace(' ', '+')
        link = self.query.substitute(query=query)
        if self.proxy_obj.proxies:
            proxy = self.proxy_obj.random()
            print(proxy)
            return requests.post(link, proxies=proxy)
        return requests.post(link)

    def body(self, query, language):
        pass

    def _proxy_to_dict(self, proxy):
        proxy_string = str(proxy[0]) + ':' + str(proxy[1])
        return {"http": proxy_string, "https": proxy_string}

    def links(self, query, language):
        pass

    def download_proxies(self):
        self.proxy_obj.download()

    def _soup(self, query):
        resp = self._get(query)
        # Prefer the charset declared inside the HTML itself; fall back to
        # the HTTP Content-Type charset when the markup declares none.
        content_type = resp.headers.get('content-type', '').lower()
        http_encoding = resp.encoding if 'charset' in content_type else None
        html_encoding = EncodingDetector.find_declared_encoding(
            resp.content, is_html=True)
        encoding = html_encoding or http_encoding
        return BeautifulSoup(resp.content, 'lxml', from_encoding=encoding)

    def html(self, query):
        soup = self._soup(query)
        return soup.prettify()

    def links(self, query):
        soup = self._soup(query)
        return [
            link.get('href')
            for link in soup.find_all('a', class_='result__snippet')
        ]

def main():
    pass
    duck = DuckDuckGo(language='pl-pl')
    links = duck.links('koscioly polska')
    print(links)


if __name__ == '__main__':
    main()
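
Note that _proxy_to_dict is defined above but never called, while requests.post(link, proxies=proxy) only works if Proxy.random() already returns a requests-style dict. A minimal sketch of the likely intended wiring, assuming random() yields an (ip, port) pair (an assumption; proxy.py is only partially shown in this commit):

import requests

# Hypothetical glue, not part of the commit: requests expects its
# `proxies` argument as a {scheme: 'host:port'} dict, which is what
# _proxy_to_dict builds. Assumes Proxy.random() returns (ip, port).
def proxied_post(link, proxy_obj):
    ip, port = proxy_obj.random()
    proxy_string = str(ip) + ':' + str(port)
    proxies = {'http': proxy_string, 'https': proxy_string}
    return requests.post(link, proxies=proxies)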

View File

@@ -1,17 +1,47 @@
import dill
from google import search
from duckduckgo import DuckDuckGo
from urllib.parse import urlparse
import time
import random

tsv = ''
urls = ''

def check(parish):
    # assumed query construction, mirroring _urls() below
    query = parish.name + ' ' + parish.street + ' ' + parish.postal_code
    if parish.url in search(query, lang='pl', stop=10, pause=3.0):
        return True

def check(parish, duck):
    global urls
    global tsv
    links = _urls(parish, duck)
    for link in links:
        parish_root_url = urlparse(parish.url).netloc
        if parish_root_url == urlparse(link).netloc:
            urls += parish_root_url + '\n'
            tsv += (parish.name + '\t' + parish.city + '\t' +
                    parish.street + '\t' + parish.postal_code + '\t' +
                    parish_root_url + '\t' + parish.meta_url + '\t' +
                    parish.gps + '\n')
            print('added')
            # TODO: save links to txt file, one per line
            # TODO: wget -r -i file all links
            # TODO: save parishes to jsonline format (sketched after this file)
            return True  # mark as ok url
    return False

def _urls(parish, duck):
    query = parish.name + ' ' + parish.street + ' ' + parish.postal_code
    links = duck.links(query)
    time.sleep(1)
    while not links:
        print('retry')
        time.sleep(random.randint(3, 10))  # sleep a random 3-10 s between retries
        links = duck.links(query)
    return links

def find_url(parish):
    pass

def stem_url(url):
    links = _urls(parish)
    import ipdb
    ipdb.set_trace()
    print(links)
def main():
@@ -19,14 +49,19 @@ def main():
    with open('./parishes.dill', 'rb') as f:
        parishes = dill.load(f)
    duck = DuckDuckGo(language='pl-pl')
    print('Downloading proxies')
    duck.download_proxies()
    i = 0
    for parish in parishes:
        print(str(i / len(parishes) * 100) + '% done. Nr: ' + str(i))
        i += 1
        if parish.url:
            check(parish)
        else:
            find_url(parish)
            import ipdb
            ipdb.set_trace()
            check(parish, duck)
    with open('urls.txt', 'w') as f:
        f.write(urls)
    with open('parishes.tsv', 'w') as f:
        f.write(tsv)
if __name__ == "__main__":

View File

@@ -1,5 +1,4 @@
import requests
# from bs4 import BeautifulSoup
import re
from collections import namedtuple
import time

parishes.tsv Normal file
View File

@@ -0,0 +1,2 @@
Parafia pod wezwaniem NMP Królowej Aniołów Adamowice ul. Poprzeczna 15 47-435 Raszyce www.adamowice.katowice.opoka.org.pl http://colaska.pl/index/parafia/id/2 18.2955971,50.5078563
Parafia pod wezwaniem Narodzenia NMP Albigowa Albigowa 844 37-122 Albigowa www.albigowa.parafia.info.pl http://colaska.pl/index/parafia/id/6 22.229000329971313,50.01446141585083
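For reference, the columns follow the field order concatenated in check(): name, city, street, postal code, root URL, meta URL, GPS coordinates.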

proxy.py View File

@@ -17,7 +17,7 @@ class Proxy():
        full_list_button = driver.find_element_by_xpath(
            '//input[@type="submit" and @value="Show Full List"]')
        full_list_button.click()
        print(driver.page_source)
        # print(driver.page_source)
        for match in re.finditer(
                '<a href="#(.*?)" class="inactive" onclick="gp.pageClick',
                driver.page_source):
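
The loop body is cut off by the hunk; a plausible continuation, assuming each matched pagination anchor is clicked to load the next page of proxies (hypothetical, the diff does not show it):

            # Hypothetical loop body, not part of the commit: follow the
            # matched pagination anchor, then scrape the re-rendered page.
            page_id = match.group(1)
            driver.find_element_by_xpath(
                '//a[@href="#' + page_id + '"]').click()
            # ... collect ip:port entries from driver.page_source here ...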

requirements.in View File

@@ -1,3 +1,5 @@
requests
dill
dryscrape
beautifulsoup4
lxml
selenium

requirements.txt View File

@@ -4,9 +4,8 @@
#
# pip-compile --output-file requirements.txt requirements.in
#
beautifulsoup4==4.6.0
dill==0.2.6
dryscrape==1.0
lxml==3.8.0 # via dryscrape
lxml==3.8.0
requests==2.13.0
webkit-server==1.0 # via dryscrape
xvfbwrapper==0.2.9 # via dryscrape
selenium==3.4.3

urls.txt Normal file
View File

@@ -0,0 +1,2 @@
www.adamowice.katowice.opoka.org.pl
www.albigowa.parafia.info.pl