proof of concept alpha
This commit is contained in:
parent
de56ecb253
commit
57315f9b31
@ -1,6 +1,9 @@
|
|||||||
import requests
|
import requests
|
||||||
from string import Template
|
from string import Template
|
||||||
from random import choice
|
from random import choice
|
||||||
|
from proxy import Proxy
|
||||||
|
from bs4 import BeautifulSoup
|
||||||
|
from bs4.dammit import EncodingDetector
|
||||||
|
|
||||||
|
|
||||||
class DuckDuckGo(object):
|
class DuckDuckGo(object):
|
||||||
@ -9,29 +12,51 @@ class DuckDuckGo(object):
|
|||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self, proxies=None, language=''):
|
def __init__(self, proxies=None, language=''):
|
||||||
self.proxies = [] if proxies is None else proxies
|
self.proxy_obj = Proxy() if proxies is None else Proxy(proxies)
|
||||||
self.language = language
|
self.query = Template('https://duckduckgo.com/html/?q=$query&kl=' +
|
||||||
self.query = Template('https://duckduckgo.com/html/?q=$query&kl=$lang')
|
language)
|
||||||
|
|
||||||
def _get(self, query, language):
|
def _get(self, query):
|
||||||
link = self.query.substitute(query=query, lang=language)
|
query = query.replace(' ', '+')
|
||||||
if self.proxies:
|
link = self.query.substitute(query=query)
|
||||||
proxy = choice(self.proxies)
|
if self.proxy_obj.proxies:
|
||||||
ip_and_port = proxy[0]
|
proxy = self.proxy_obj.random()
|
||||||
protocol = proxy[1]
|
print(proxy)
|
||||||
proxies = {protocol: ip_and_port}
|
return requests.post(link, proxies=proxy)
|
||||||
requests.get(link, proxies=proxies)
|
return requests.post(link)
|
||||||
return requests.get(link)
|
|
||||||
|
|
||||||
def body(self, query, language):
|
def _proxy_to_dict(self, proxy):
|
||||||
pass
|
proxy_string = str(proxy[0]) + ':' + str(proxy[1])
|
||||||
|
return {"http": proxy_string, "https": proxy_string}
|
||||||
|
|
||||||
def links(self, query, language):
|
def download_proxies(self):
|
||||||
pass
|
self.proxy_obj.download()
|
||||||
|
|
||||||
|
def _soup(self, query):
|
||||||
|
resp = self._get(query)
|
||||||
|
content_type = resp.headers.get('content-type', '').lower()
|
||||||
|
http_encoding = resp.encoding if 'charset' in content_type else None
|
||||||
|
html_encoding = EncodingDetector.find_declared_encoding(
|
||||||
|
resp.content, is_html=True)
|
||||||
|
encoding = html_encoding or http_encoding
|
||||||
|
return BeautifulSoup(resp.content, 'lxml', from_encoding=encoding)
|
||||||
|
|
||||||
|
def html(self, query):
|
||||||
|
soup = self._soup(query)
|
||||||
|
return soup.prettify()
|
||||||
|
|
||||||
|
def links(self, query):
|
||||||
|
soup = self._soup(query)
|
||||||
|
return [
|
||||||
|
link.get('href')
|
||||||
|
for link in soup.find_all('a', class_='result__snippet')
|
||||||
|
]
|
||||||
|
|
||||||
|
|
||||||
def main():
|
def main():
|
||||||
pass
|
duck = DuckDuckGo(language='pl-pl')
|
||||||
|
links = duck.links('koscioly polska')
|
||||||
|
print(links)
|
||||||
|
|
||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
|
@ -1,17 +1,47 @@
|
|||||||
import dill
|
import dill
|
||||||
from google import search
|
from duckduckgo import DuckDuckGo
|
||||||
|
from urllib.parse import urlparse
|
||||||
|
import time
|
||||||
|
import random
|
||||||
|
|
||||||
|
tsv = ''
|
||||||
|
urls = ''
|
||||||
|
|
||||||
|
|
||||||
def check(parish):
|
def check(parish, duck):
|
||||||
if parish.url in search(query, lang='pl', stop=10, pause=3.0):
|
global urls
|
||||||
return true
|
global tsv
|
||||||
|
links = _urls(parish, duck)
|
||||||
|
for link in links:
|
||||||
|
parish_root_url = urlparse(parish.url).netloc
|
||||||
|
if parish_root_url == urlparse(link).netloc:
|
||||||
|
urls += parish_root_url + '\n'
|
||||||
|
tsv += parish.name + '\t' + parish.city + '\t' + parish.street + '\t' + parish.postal_code + '\t' + parish_root_url + '\t' + parish.meta_url + '\t' + parish.gps + '\n'
|
||||||
|
print('added')
|
||||||
|
# TODO: save links to txt file, one per line
|
||||||
|
# TODO: wget -r -i file all links
|
||||||
|
# TODO: save parishes to jsonline format
|
||||||
|
return True # mark as ok url
|
||||||
|
return False
|
||||||
|
|
||||||
|
|
||||||
|
def _urls(parish, duck):
|
||||||
|
query = parish.name + ' ' + parish.street + ' ' + parish.postal_code
|
||||||
|
links = duck.links(query)
|
||||||
|
time.sleep(1)
|
||||||
|
while not links:
|
||||||
|
print('retry')
|
||||||
|
random.randint(3, 10)
|
||||||
|
time.sleep(10)
|
||||||
|
links = duck.links(query)
|
||||||
|
return links
|
||||||
|
|
||||||
|
|
||||||
def find_url(parish):
|
def find_url(parish):
|
||||||
pass
|
links = _urls(parish)
|
||||||
|
import ipdb
|
||||||
|
ipdb.set_trace()
|
||||||
def stem_url(url):
|
print(links)
|
||||||
|
|
||||||
|
|
||||||
def main():
|
def main():
|
||||||
@ -19,14 +49,19 @@ def main():
|
|||||||
with open('./parishes.dill', 'rb') as f:
|
with open('./parishes.dill', 'rb') as f:
|
||||||
parishes = dill.load(f)
|
parishes = dill.load(f)
|
||||||
|
|
||||||
|
duck = DuckDuckGo(language='pl-pl')
|
||||||
|
print('Downloading proxies')
|
||||||
|
duck.download_proxies()
|
||||||
|
i = 0
|
||||||
for parish in parishes:
|
for parish in parishes:
|
||||||
|
print(str(i / len(parishes)) + '% done. Nr: ' + str(i))
|
||||||
|
i += 1
|
||||||
if parish.url:
|
if parish.url:
|
||||||
check(parish)
|
check(parish, duck)
|
||||||
else:
|
with open('urls.txt', 'w') as f:
|
||||||
find_url(parish)
|
f.write(urls)
|
||||||
|
with open('parishes.tsv', 'w') as f:
|
||||||
import ipdb
|
f.write(tsv)
|
||||||
ipdb.set_trace()
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
|
@ -1,5 +1,4 @@
|
|||||||
import requests
|
import requests
|
||||||
# from bs4 import BeautifulSoup
|
|
||||||
import re
|
import re
|
||||||
from collections import namedtuple
|
from collections import namedtuple
|
||||||
import time
|
import time
|
||||||
|
2
parishes.tsv
Normal file
2
parishes.tsv
Normal file
@ -0,0 +1,2 @@
|
|||||||
|
Parafia pod wezwaniem NMP Królowej Aniołów Adamowice ul. Poprzeczna 15 47-435 Raszyce www.adamowice.katowice.opoka.org.pl http://colaska.pl/index/parafia/id/2 18.2955971,50.5078563
|
||||||
|
Parafia pod wezwaniem Narodzenia NMP Albigowa Albigowa 844 37-122 Albigowa www.albigowa.parafia.info.pl http://colaska.pl/index/parafia/id/6 22.229000329971313,50.01446141585083
|
|
2
proxy.py
2
proxy.py
@ -17,7 +17,7 @@ class Proxy():
|
|||||||
full_list_button = driver.find_element_by_xpath(
|
full_list_button = driver.find_element_by_xpath(
|
||||||
'//input[@type="submit" and @value="Show Full List"]')
|
'//input[@type="submit" and @value="Show Full List"]')
|
||||||
full_list_button.click()
|
full_list_button.click()
|
||||||
print(driver.page_source)
|
#print(driver.page_source)
|
||||||
for match in re.finditer(
|
for match in re.finditer(
|
||||||
'<a href="#(.*?)" class="inactive" onclick="gp.pageClick',
|
'<a href="#(.*?)" class="inactive" onclick="gp.pageClick',
|
||||||
driver.page_source):
|
driver.page_source):
|
||||||
|
@ -1,3 +1,5 @@
|
|||||||
requests
|
requests
|
||||||
dill
|
dill
|
||||||
dryscrape
|
beautifulsoup4
|
||||||
|
lxml
|
||||||
|
selenium
|
||||||
|
@ -4,9 +4,8 @@
|
|||||||
#
|
#
|
||||||
# pip-compile --output-file requirements.txt requirements.in
|
# pip-compile --output-file requirements.txt requirements.in
|
||||||
#
|
#
|
||||||
|
beautifulsoup4==4.6.0
|
||||||
dill==0.2.6
|
dill==0.2.6
|
||||||
dryscrape==1.0
|
lxml==3.8.0
|
||||||
lxml==3.8.0 # via dryscrape
|
|
||||||
requests==2.13.0
|
requests==2.13.0
|
||||||
webkit-server==1.0 # via dryscrape
|
selenium==3.4.3
|
||||||
xvfbwrapper==0.2.9 # via dryscrape
|
|
||||||
|
Loading…
Reference in New Issue
Block a user