mass-scraper/duckduckgo.py

import requests
from string import Template
from urllib.parse import quote_plus
from proxy import Proxy
from bs4 import BeautifulSoup
from bs4.dammit import EncodingDetector


class DuckDuckGo(object):
    """Scrape DuckDuckGo HTML search results, optionally through proxies."""

    def __init__(self, proxies=None, language=''):
        self.proxy_obj = Proxy() if proxies is None else Proxy(proxies)
        # 'kl' is DuckDuckGo's region/language parameter (e.g. 'pl-pl').
        self.query = Template('https://duckduckgo.com/html/?q=$query&kl=' +
                              language)
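
    # Fetch the results page for a query, routing the request through a
    # random proxy when one is configured.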
    def _get(self, query):
        # quote_plus() escapes spaces and other special characters so the
        # query is safe to embed in the URL.
        query = quote_plus(query)
        link = self.query.substitute(query=query)
        if self.proxy_obj.proxies:
            proxy = self.proxy_obj.random()
            print(proxy)
            return requests.post(link, proxies=proxy)
        return requests.post(link)
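
    # Turn a proxy given as an indexable pair (e.g. host and port) into the
    # mapping format that requests expects for its proxies argument.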
    def _proxy_to_dict(self, proxy):
        proxy_string = str(proxy[0]) + ':' + str(proxy[1])
        return {"http": proxy_string, "https": proxy_string}
    def download_proxies(self):
        self.proxy_obj.download()
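
    # Parse the results page, preferring the encoding declared in the HTML
    # itself over the charset from the HTTP Content-Type header.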
    def _soup(self, query):
        resp = self._get(query)
        content_type = resp.headers.get('content-type', '').lower()
        http_encoding = resp.encoding if 'charset' in content_type else None
        html_encoding = EncodingDetector.find_declared_encoding(
            resp.content, is_html=True)
        encoding = html_encoding or http_encoding
        return BeautifulSoup(resp.content, 'lxml', from_encoding=encoding)
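
    # Return the prettified HTML of the results page.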
    def html(self, query):
        soup = self._soup(query)
        return soup.prettify()
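
    # Collect the href of each result snippet anchor on the first page.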
    def links(self, query):
        soup = self._soup(query)
        return [
            link.get('href')
            for link in soup.find_all('a', class_='result__snippet')
        ]
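

# Quick manual check: print the result links for a sample query.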
def main():
    # Sample query in Polish: 'koscioly polska' ("churches Poland").
    duck = DuckDuckGo(language='pl-pl')
    links = duck.links('koscioly polska')
    print(links)

if __name__ == '__main__':
    main()