from __future__ import unicode_literals
from __future__ import absolute_import

from future import standard_library
standard_library.install_aliases()
from builtins import range
from builtins import object
from .utils import _get_search_url, get_html
from bs4 import BeautifulSoup
import urllib.parse
from urllib.parse import parse_qs, urlparse
from unidecode import unidecode
from re import findall


class GoogleResult(object):

    """Represents a google search result."""

    def __init__(self):
        self.name = None  # The title of the link
        self.link = None  # The external link
        self.google_link = None  # The google link
        self.description = None  # The description of the link
        self.thumb = None  # Thumbnail link of website (NOT implemented yet)
        self.cached = None  # Cached version link of page
        self.page = None  # Results page this one was on
        self.index = None  # What index on this page it was on
        self.number_of_results = None  # The total number of results the query returned

    def __repr__(self):
        name = self._limit_str_size(self.name, 55)
        description = self._limit_str_size(self.description, 49)

        list_google = ["GoogleResult(",
                       "name={}".format(name), "\n", " " * 13,
                       "description={})".format(description)]

        return "".join(list_google)

    def _limit_str_size(self, str_element, size_limit):
        """Limit the characters of the string, adding ".." at the end."""
        if not str_element:
            return None
        elif len(str_element) > size_limit:
            return unidecode(str_element[:size_limit]) + ".."
        else:
            return unidecode(str_element)
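

# Illustrative behaviour of GoogleResult._limit_str_size (a sketch, assuming
# plain ASCII input so that unidecode is a no-op):
#
#     >>> GoogleResult()._limit_str_size("A long title that keeps going", 10)
#     'A long tit..'
#     >>> GoogleResult()._limit_str_size(None, 10)  # falsy input -> None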


# PUBLIC
def search(query, pages=1, lang='en', area='com', ncr=False, void=True,
           time_period=False, sort_by_date=False, first_page=0):
    """Return a list of GoogleResult objects for a query.

    Args:
        query: String to search in google.
        pages: Number of result pages to fetch.
        lang: Language code for the results (e.g. 'en').
        area: Top-level domain of the google homepage to use (e.g. 'com').
        ncr: If True, ask google for "no country redirect".
        void: If True, skip results that have no description.
        time_period: Optional time-period filter forwarded to the search URL.
        sort_by_date: If True, ask google to sort results by date.
        first_page: Zero-based index of the first page to fetch.

    TODO: add support to get the google results.

    Returns:
        A list of GoogleResult objects."""

    results = []
    for i in range(first_page, first_page + pages):
        url = _get_search_url(query, i, lang=lang, area=area, ncr=ncr,
                              time_period=time_period, sort_by_date=sort_by_date)
        html = get_html(url)

        if html:
            soup = BeautifulSoup(html, "html.parser")
            divs = soup.find_all("div", attrs={"class": "g"})

            results_div = soup.find("div", attrs={"id": "resultStats"})
            number_of_results = _get_number_of_results(results_div)

            j = 0
            for li in divs:
                res = GoogleResult()

                res.page = i
                res.index = j

                res.name = _get_name(li)
                res.link = _get_link(li)
                res.google_link = _get_google_link(li)
                res.description = _get_description(li)
                res.thumb = _get_thumb()
                res.cached = _get_cached(li)
                res.number_of_results = number_of_results

                # Skip results without a description unless void is disabled.
                if void is True and res.description is None:
                    continue
                results.append(res)
                j += 1
    return results
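

# Example usage (an illustrative sketch, not part of the module): search()
# performs live HTTP requests against Google, so output varies over time and
# scraping may be rate-limited or blocked. The query string is hypothetical.
#
#     results = search("python web scraping", pages=1)
#     for res in results:
#         print(res.name, res.link)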


# PRIVATE
def _get_name(li):
    """Return the name of a google search."""
    a = li.find("a")
    if a is not None:
        return a.text.strip()
    return None


def _filter_link(link):
    """Filter links found in the Google result pages HTML code.

    Returns None if the link doesn't yield a valid result.
    """
    try:
        # Valid results are absolute URLs not pointing to a Google domain
        # like images.google.com or googleusercontent.com
        o = urlparse(link, 'http')
        # link type-1
        # >>> "https://www.gitbook.com/book/ljalphabeta/python-"
        if o.netloc and 'google' not in o.netloc:
            return link
        # link type-2
        # >>> "http://www.google.com/url?url=http://python.jobbole.com/84108/&rct=j&frm=1&q=&esrc=s&sa=U&ved=0ahUKEwj3quDH-Y7UAhWG6oMKHdQ-BQMQFggUMAA&usg=AFQjCNHPws5Buru5Z71wooRLHT6mpvnZlA"
        if o.netloc and o.path.startswith('/url'):
            try:
                link = parse_qs(o.query)['url'][0]
                o = urlparse(link, 'http')
                if o.netloc and 'google' not in o.netloc:
                    return link
            except KeyError:
                pass
        # Decode hidden URLs.
        if link.startswith('/url?'):
            try:
                # link type-3
                # >>> "/url?q=http://python.jobbole.com/84108/&sa=U&ved=0ahUKEwjFw6Txg4_UAhVI5IMKHfqVAykQFggUMAA&usg=AFQjCNFOTLpmpfqctpIn0sAfaj5U5gAU9A"
                link = parse_qs(o.query)['q'][0]
                # Valid results are absolute URLs not pointing to a Google domain
                # like images.google.com or googleusercontent.com
                o = urlparse(link, 'http')
                if o.netloc and 'google' not in o.netloc:
                    return link
            except KeyError:
                # link type-4
                # >>> "/url?url=https://machine-learning-python.kspax.io/&rct=j&frm=1&q=&esrc=s&sa=U&ved=0ahUKEwj3quDH-Y7UAhWG6oMKHdQ-BQMQFggfMAI&usg=AFQjCNEfkUI0RP_RlwD3eI22rSfqbYM_nA"
                link = parse_qs(o.query)['url'][0]
                o = urlparse(link, 'http')
                if o.netloc and 'google' not in o.netloc:
                    return link

    # Otherwise, or on error, return None.
    except Exception:
        pass
    return None
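

# Doctest-style sketch of _filter_link on the link types above (hypothetical
# URLs; the last two calls return None):
#
#     >>> _filter_link("https://www.example.com/page")
#     'https://www.example.com/page'
#     >>> _filter_link("/url?q=http://www.example.com/&sa=U")
#     'http://www.example.com/'
#     >>> _filter_link("https://images.google.com/imgres")  # google domain
#     >>> _filter_link("no-scheme-or-host")  # no netloc after parsing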


def _get_link(li):
    """Return the external link from a search result."""
    try:
        a = li.find("a")
        link = a["href"]
    except Exception:
        return None
    return _filter_link(link)


def _get_google_link(li):
    """Return the google link from a search result."""
    try:
        a = li.find("a")
        link = a["href"]
    except Exception:
        return None

    if link.startswith("/url?") or link.startswith("/search?"):
        return urllib.parse.urljoin("http://www.google.com", link)
    else:
        return None


def _get_description(li):
    """Return the description of a google search result.

    TODO: There are some text encoding problems to resolve."""
    sdiv = li.find("div", attrs={"class": "s"})
    if sdiv:
        stspan = sdiv.find("span", attrs={"class": "st"})
        if stspan is not None:
            return stspan.text.strip()
    return None
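

# The lookups above assume Google's classic desktop result markup, roughly as
# sketched below (an assumption; Google changes this markup regularly, and
# when the selectors no longer match, the getters simply return None):
#
#     <div class="g">
#       <a href="/url?q=http://example.com/&sa=U">Title</a>
#       <div class="s"><span class="st">Snippet text ...</span></div>
#     </div>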


def _get_thumb():
    """Return the link to a thumbnail of the website (not implemented yet)."""
    return None


def _get_cached(li):
    """Return a link to the cached version of the page."""
    links = li.find_all("a")
    if len(links) > 1 and links[1].text == "Cached":
        link = links[1]["href"]
        if link.startswith("/url?") or link.startswith("/search?"):
            return urllib.parse.urljoin("http://www.google.com", link)
    return None


def _get_number_of_results(results_div):
    """Return the total number of results of the google search.

    Note that the returned value will be the same for all the GoogleResult
    objects from a specific query."""
    try:
        results_div_text = results_div.get_text()
        if results_div_text:
            regex = r"((?:\d+[,\.])*\d+)"
            m = findall(regex, results_div_text)

            # Strip thousands separators ("," or ".") before casting.
            num = m[0].replace(",", "").replace(".", "")

            return int(num)
    except Exception:
        pass
    return 0
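

# Illustrative sketch of the parsing above, assuming a typical "resultStats"
# text such as "About 12,300,000 results (0.45 seconds)":
#
#     >>> findall(r"((?:\d+[,\.])*\d+)", "About 12,300,000 results (0.45 seconds)")
#     ['12,300,000', '0.45']
#     >>> int("12,300,000".replace(",", "").replace(".", ""))
#     12300000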