# PCQRSCANER/venv/Lib/site-packages/google/modules/standard_search.py
from __future__ import unicode_literals
from __future__ import absolute_import

from future import standard_library
standard_library.install_aliases()

from builtins import range
from builtins import object

import urllib.parse
from urllib.parse import parse_qs, urlparse

from bs4 import BeautifulSoup
from unidecode import unidecode
from re import findall

from .utils import _get_search_url, get_html


class GoogleResult(object):
    """Represents a google search result."""

    def __init__(self):
        self.name = None  # The title of the link
        self.link = None  # The external link
        self.google_link = None  # The google link
        self.description = None  # The description of the link
        self.thumb = None  # Thumbnail link of website (NOT implemented yet)
        self.cached = None  # Cached version link of page
        self.page = None  # Results page this one was on
        self.index = None  # What index on this page it was on
        self.number_of_results = None  # The total number of results the query returned

    def __repr__(self):
        name = self._limit_str_size(self.name, 55)
        description = self._limit_str_size(self.description, 49)

        list_google = ["GoogleResult(",
                       "name={}".format(name), "\n", " " * 13,
                       "description={})".format(description)]

        return "".join(list_google)

    def _limit_str_size(self, str_element, size_limit):
        """Limit the characters of the string, adding ".." at the end."""
        if not str_element:
            return None
        elif len(str_element) > size_limit:
            return unidecode(str_element[:size_limit]) + ".."
        else:
            return unidecode(str_element)
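
    # Illustrative examples (not in the original source): the string is
    # transliterated with unidecode and truncated past size_limit.
    #   self._limit_str_size("Lorem ipsum dolor", 11)  ->  "Lorem ipsum.."
    #   self._limit_str_size(None, 11)                 ->  None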


# PUBLIC
def search(query, pages=1, lang='en', area='com', ncr=False, void=True,
           time_period=False, sort_by_date=False, first_page=0):
    """Returns a list of GoogleResult.

    Args:
        query: String to search in google.
        pages: Number of pages where results must be taken.
        area: Area of google homepages.
        first_page: First page.

    TODO: add support to get the google results.

    Returns:
        A list of GoogleResult objects."""
    results = []
    for i in range(first_page, first_page + pages):
        url = _get_search_url(query, i, lang=lang, area=area, ncr=ncr,
                              time_period=time_period, sort_by_date=sort_by_date)
        html = get_html(url)

        if html:
            soup = BeautifulSoup(html, "html.parser")
            divs = soup.find_all("div", attrs={"class": "g"})

            results_div = soup.find("div", attrs={"id": "resultStats"})
            number_of_results = _get_number_of_results(results_div)

            j = 0
            for li in divs:
                res = GoogleResult()

                res.page = i
                res.index = j

                res.name = _get_name(li)
                res.link = _get_link(li)
                res.google_link = _get_google_link(li)
                res.description = _get_description(li)
                res.thumb = _get_thumb()
                res.cached = _get_cached(li)
                res.number_of_results = number_of_results

                # Skip result blocks without a description (ads, widgets)
                # unless void is False.
                if void is True:
                    if res.description is None:
                        continue

                results.append(res)
                j += 1

    return results
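

# Pagination sketch (illustrative, not in the original source): the loop above
# walks range(first_page, first_page + pages), so this hypothetical call
# fetches Google result pages 0 and 1 and tags each hit with its position:
#
#   hits = search("web scraping", pages=2)
#   positions = [(h.page, h.index) for h in hits]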


# PRIVATE
def _get_name(li):
    """Return the name of a google search."""
    a = li.find("a")
    # return a.text.encode("utf-8").strip()
    if a is not None:
        return a.text.strip()
    return None


def _filter_link(link):
    """Filter links found in the Google result pages HTML code.

    Returns None if the link doesn't yield a valid result.
    """
    try:
        # Valid results are absolute URLs not pointing to a Google domain
        # like images.google.com or googleusercontent.com
        o = urlparse(link, 'http')

        # link type-1
        # >>> "https://www.gitbook.com/book/ljalphabeta/python-"
        if o.netloc and 'google' not in o.netloc:
            return link

        # link type-2
        # >>> "http://www.google.com/url?url=http://python.jobbole.com/84108/&rct=j&frm=1&q=&esrc=s&sa=U&ved=0ahUKEwj3quDH-Y7UAhWG6oMKHdQ-BQMQFggUMAA&usg=AFQjCNHPws5Buru5Z71wooRLHT6mpvnZlA"
        if o.netloc and o.path.startswith('/url'):
            try:
                link = parse_qs(o.query)['url'][0]
                o = urlparse(link, 'http')
                if o.netloc and 'google' not in o.netloc:
                    return link
            except KeyError:
                pass

        # Decode hidden URLs.
        if link.startswith('/url?'):
            try:
                # link type-3
                # >>> "/url?q=http://python.jobbole.com/84108/&sa=U&ved=0ahUKEwjFw6Txg4_UAhVI5IMKHfqVAykQFggUMAA&usg=AFQjCNFOTLpmpfqctpIn0sAfaj5U5gAU9A"
                link = parse_qs(o.query)['q'][0]
                # Valid results are absolute URLs not pointing to a Google domain
                # like images.google.com or googleusercontent.com
                o = urlparse(link, 'http')
                if o.netloc and 'google' not in o.netloc:
                    return link
            except KeyError:
                # link type-4
                # >>> "/url?url=https://machine-learning-python.kspax.io/&rct=j&frm=1&q=&esrc=s&sa=U&ved=0ahUKEwj3quDH-Y7UAhWG6oMKHdQ-BQMQFggfMAI&usg=AFQjCNEfkUI0RP_RlwD3eI22rSfqbYM_nA"
                link = parse_qs(o.query)['url'][0]
                o = urlparse(link, 'http')
                if o.netloc and 'google' not in o.netloc:
                    return link

    # Otherwise, or on error, return None.
    except Exception:
        pass

    return None
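

# Worked examples for the three accepted shapes (expected values derived from
# the parsing logic above; sample URLs shortened from the comments inside
# _filter_link):
#   _filter_link("https://www.gitbook.com/book/ljalphabeta/python-")
#       -> returned unchanged (absolute URL, non-Google netloc)
#   _filter_link("/url?q=http://python.jobbole.com/84108/&sa=U")
#       -> "http://python.jobbole.com/84108/" (unwrapped from the q= param)
#   _filter_link("https://images.google.com/whatever")
#       -> None (Google domain)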


def _get_link(li):
    """Return external link from a search."""
    try:
        a = li.find("a")
        link = a["href"]
    except Exception:
        return None

    return _filter_link(link)


def _get_google_link(li):
    """Return google link from a search."""
    try:
        a = li.find("a")
        link = a["href"]
    except Exception:
        return None

    if link.startswith("/url?") or link.startswith("/search?"):
        return urllib.parse.urljoin("http://www.google.com", link)
    else:
        return None
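

# Example of the urljoin call above (standard-library behavior of
# urllib.parse.urljoin, shown for illustration):
#   urllib.parse.urljoin("http://www.google.com", "/url?q=http://example.com")
#       -> "http://www.google.com/url?q=http://example.com"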


def _get_description(li):
    """Return the description of a google search.

    TODO: There are some text encoding problems to resolve."""
    sdiv = li.find("div", attrs={"class": "s"})
    if sdiv:
        stspan = sdiv.find("span", attrs={"class": "st"})
        if stspan is not None:
            # return stspan.text.encode("utf-8").strip()
            return stspan.text.strip()
    return None


def _get_thumb():
    """Return the link to a thumbnail of the website.

    NOTE: Not implemented yet; always returns None.
    """
    return None


def _get_cached(li):
    """Return a link to the cached version of the page."""
    links = li.find_all("a")
    if len(links) > 1 and links[1].text == "Cached":
        link = links[1]["href"]
        if link.startswith("/url?") or link.startswith("/search?"):
            return urllib.parse.urljoin("http://www.google.com", link)
    return None


def _get_number_of_results(results_div):
    """Return the total number of results of the google search.

    Note that the returned value will be the same for all the GoogleResult
    objects from a specific query."""
    try:
        results_div_text = results_div.get_text()
        if results_div_text:
            # The count may use "," or "." as thousands separator,
            # e.g. "About 1,340,000 results".
            regex = r"((?:\d+[,\.])*\d+)"
            m = findall(regex, results_div_text)
            # Clean up the number.
            num = m[0].replace(",", "").replace(".", "")
            return int(num)
    except Exception:
        pass
    return 0
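

# Minimal usage sketch (added for illustration; not part of the original
# module). It assumes network access and that Google still serves the legacy
# "div.g" / "span.st" markup this parser targets, which modern result pages
# generally no longer do, so an empty list is a likely outcome today.
if __name__ == "__main__":
    hits = search("python web scraping", pages=1)
    print("results parsed:", len(hits))
    for hit in hits:
        print(repr(hit))
        print("  link:", hit.link)
        print("  google_link:", hit.google_link)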