proof of concept alpha
This commit is contained in:
parent de56ecb253
commit 57315f9b31
duckduckgo.py
@@ -1,6 +1,9 @@
 import requests
 from string import Template
 from random import choice
+from proxy import Proxy
+from bs4 import BeautifulSoup
+from bs4.dammit import EncodingDetector
 
 
 class DuckDuckGo(object):
@@ -9,29 +12,51 @@ class DuckDuckGo(object):
     """
 
     def __init__(self, proxies=None, language=''):
-        self.proxies = [] if proxies is None else proxies
-        self.language = language
-        self.query = Template('https://duckduckgo.com/html/?q=$query&kl=$lang')
+        self.proxy_obj = Proxy() if proxies is None else Proxy(proxies)
+        self.query = Template('https://duckduckgo.com/html/?q=$query&kl=' +
+                              language)
 
-    def _get(self, query, language):
-        link = self.query.substitute(query=query, lang=language)
-        if self.proxies:
-            proxy = choice(self.proxies)
-            ip_and_port = proxy[0]
-            protocol = proxy[1]
-            proxies = {protocol: ip_and_port}
-            requests.get(link, proxies=proxies)
-        return requests.get(link)
+    def _get(self, query):
+        query = query.replace(' ', '+')
+        link = self.query.substitute(query=query)
+        if self.proxy_obj.proxies:
+            proxy = self.proxy_obj.random()
+            print(proxy)
+            return requests.post(link, proxies=proxy)
+        return requests.post(link)
 
-    def body(self, query, language):
-        pass
+    def _proxy_to_dict(self, proxy):
+        proxy_string = str(proxy[0]) + ':' + str(proxy[1])
+        return {"http": proxy_string, "https": proxy_string}
 
-    def links(self, query, language):
-        pass
+    def download_proxies(self):
+        self.proxy_obj.download()
+
+    def _soup(self, query):
+        resp = self._get(query)
+        content_type = resp.headers.get('content-type', '').lower()
+        http_encoding = resp.encoding if 'charset' in content_type else None
+        html_encoding = EncodingDetector.find_declared_encoding(
+            resp.content, is_html=True)
+        encoding = html_encoding or http_encoding
+        return BeautifulSoup(resp.content, 'lxml', from_encoding=encoding)
+
+    def html(self, query):
+        soup = self._soup(query)
+        return soup.prettify()
+
+    def links(self, query):
+        soup = self._soup(query)
+        return [
+            link.get('href')
+            for link in soup.find_all('a', class_='result__snippet')
+        ]
 
 
 def main():
-    pass
+    duck = DuckDuckGo(language='pl-pl')
+    links = duck.links('koscioly polska')
+    print(links)
 
 
 if __name__ == '__main__':
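For orientation, a minimal usage sketch of the class this diff introduces, mirroring the new main(); it assumes proxy.py's Proxy exposes the download() and random() methods and the proxies list that the calls above imply:

    # sketch only: exercise the new DuckDuckGo wrapper
    from duckduckgo import DuckDuckGo

    duck = DuckDuckGo(language='pl-pl')  # value is appended to the &kl= parameter
    duck.download_proxies()              # optional: fill the proxy pool first
    for href in duck.links('koscioly polska'):
        print(href)  # hrefs scraped from the result__snippet anchors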
@@ -1,17 +1,47 @@
 import dill
-from google import search
+from duckduckgo import DuckDuckGo
+from urllib.parse import urlparse
+import time
+import random
 
+tsv = ''
+urls = ''
+
 
-def check(parish):
-    if parish.url in search(query, lang='pl', stop=10, pause=3.0):
-        return true
+def check(parish, duck):
+    global urls
+    global tsv
+    links = _urls(parish, duck)
+    for link in links:
+        parish_root_url = urlparse(parish.url).netloc
+        if parish_root_url == urlparse(link).netloc:
+            urls += parish_root_url + '\n'
+            tsv += parish.name + '\t' + parish.city + '\t' + parish.street + '\t' + parish.postal_code + '\t' + parish_root_url + '\t' + parish.meta_url + '\t' + parish.gps + '\n'
+            print('added')
+            # TODO: save links to txt file, one per line
+            # TODO: wget -r -i file all links
+            # TODO: save parishes to jsonline format
+            return True  # mark as ok url
+    return False
+
+
+def _urls(parish, duck):
+    query = parish.name + ' ' + parish.street + ' ' + parish.postal_code
+    links = duck.links(query)
+    time.sleep(1)
+    while not links:
+        print('retry')
+        random.randint(3, 10)
+        time.sleep(10)
+        links = duck.links(query)
+    return links
 
 
 def find_url(parish):
     pass
 
 
 def stem_url(url):
-    pass
+    links = _urls(parish)
+    import ipdb
+    ipdb.set_trace()
+    print(links)
 
 
@@ -19,14 +49,19 @@ def main():
     with open('./parishes.dill', 'rb') as f:
         parishes = dill.load(f)
 
+    duck = DuckDuckGo(language='pl-pl')
+    print('Downloading proxies')
+    duck.download_proxies()
+    i = 0
     for parish in parishes:
-        if parish.url:
-            check(parish)
-        else:
-            find_url(parish)
+        print(str(i / len(parishes)) + '% done. Nr: ' + str(i))
+        i += 1
+        check(parish, duck)
 
-    import ipdb
-    ipdb.set_trace()
+    with open('urls.txt', 'w') as f:
+        f.write(urls)
+    with open('parishes.tsv', 'w') as f:
+        f.write(tsv)
 
 
 if __name__ == "__main__":
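The TODO about a jsonline format could be served by the standard json module; a hedged sketch, where the parish attributes are the same ones concatenated into tsv above and parishes.jsonl is a hypothetical output path:

    import json

    def parish_to_jsonline(parish):
        # same attributes as the tsv += concatenation in check()
        record = {
            'name': parish.name,
            'city': parish.city,
            'street': parish.street,
            'postal_code': parish.postal_code,
            'url': parish.url,
            'meta_url': parish.meta_url,
            'gps': parish.gps,
        }
        # ensure_ascii=False keeps the Polish diacritics readable
        return json.dumps(record, ensure_ascii=False)

    # usage: append one JSON object per line
    # with open('parishes.jsonl', 'a', encoding='utf-8') as f:
    #     f.write(parish_to_jsonline(parish) + '\n')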
@@ -1,5 +1,4 @@
 import requests
-# from bs4 import BeautifulSoup
 import re
 from collections import namedtuple
 import time
parishes.tsv (new file, +2)
@@ -0,0 +1,2 @@
+Parafia pod wezwaniem NMP Królowej Aniołów Adamowice ul. Poprzeczna 15 47-435 Raszyce www.adamowice.katowice.opoka.org.pl http://colaska.pl/index/parafia/id/2 18.2955971,50.5078563
+Parafia pod wezwaniem Narodzenia NMP Albigowa Albigowa 844 37-122 Albigowa www.albigowa.parafia.info.pl http://colaska.pl/index/parafia/id/6 22.229000329971313,50.01446141585083
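The diff viewer flattened the tab characters in these two rows; the columns follow the order built in check() (name, city, street, postal_code, root URL, meta URL, GPS). A sketch of reading the file back under that assumption, with Parish as a hypothetical record type:

    import csv
    from collections import namedtuple

    # hypothetical record type; field order follows the tsv += line in check()
    Parish = namedtuple('Parish',
                        'name city street postal_code url meta_url gps')

    with open('parishes.tsv', newline='', encoding='utf-8') as f:
        for row in csv.reader(f, delimiter='\t'):
            print(Parish(*row))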
proxy.py
@@ -17,7 +17,7 @@ class Proxy():
         full_list_button = driver.find_element_by_xpath(
             '//input[@type="submit" and @value="Show Full List"]')
         full_list_button.click()
-        print(driver.page_source)
+        #print(driver.page_source)
         for match in re.finditer(
                 '<a href="#(.*?)" class="inactive" onclick="gp.pageClick',
                 driver.page_source):
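For illustration, what the re.finditer loop above captures; a self-contained sketch against a static snippet of markup rather than the live page:

    import re

    html = ('<a href="#1" class="inactive" onclick="gp.pageClick">1</a>'
            '<a href="#2" class="inactive" onclick="gp.pageClick">2</a>')
    for match in re.finditer(
            '<a href="#(.*?)" class="inactive" onclick="gp.pageClick', html):
        print(match.group(1))  # the pagination index captured by (.*?)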
requirements.in
@@ -1,3 +1,5 @@
 requests
 dill
-dryscrape
+beautifulsoup4
+lxml
+selenium
requirements.txt
@@ -4,9 +4,8 @@
 #
 # pip-compile --output-file requirements.txt requirements.in
 #
+beautifulsoup4==4.6.0
 dill==0.2.6
-dryscrape==1.0
-lxml==3.8.0          # via dryscrape
+lxml==3.8.0
 requests==2.13.0
-webkit-server==1.0   # via dryscrape
-xvfbwrapper==0.2.9   # via dryscrape
+selenium==3.4.3
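As the header comment notes, this file is generated; after the requirements.in change above, the pins can be rebuilt with the command quoted in that header (pip-compile ships with the pip-tools package):

    pip-compile --output-file requirements.txt requirements.in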