PCQRSCANER/venv/Lib/site-packages/google/modules/utils.py
2019-12-22 21:51:47 +01:00

561 lines
19 KiB
Python

from __future__ import unicode_literals
from __future__ import print_function
from __future__ import division
from future import standard_library
standard_library.install_aliases()
from builtins import range
from past.utils import old_div
import time
from selenium import webdriver
import urllib.request
import urllib.error
import urllib.parse
from functools import wraps
# import requests
from urllib.parse import urlencode
from fake_useragent import UserAgent
import sys
class AreaError(KeyError):
pass
def measure_time(fn):
def decorator(*args, **kwargs):
start = time.time()
res = fn(*args, **kwargs)
elapsed = time.time() - start
print(fn.__name__, "took", elapsed, "seconds")
return res
return decorator
def normalize_query(query):
return query.strip().replace(":", "%3A").replace("+", "%2B").replace("&", "%26").replace(" ", "+")
def _get_search_url(query, page=0, per_page=10, lang='en', area='com', ncr=False, time_period=False, sort_by_date=False):
# note: num per page might not be supported by google anymore (because of
# google instant)
params = {
'nl': lang,
'q': query.encode('utf8'),
'start': page * per_page,
'num': per_page
}
time_mapping = {
'hour': 'qdr:h',
'week': 'qdr:w',
'month': 'qdr:m',
'year': 'qdr:y'
}
tbs_param = []
# Set time period for query if given
if time_period and time_period in time_mapping:
tbs_param.append(time_mapping[time_period])
if sort_by_date:
tbs_param.append('sbd:1')
params['tbs'] = ','.join(tbs_param)
# This will allow to search Google with No Country Redirect
if ncr:
params['gl'] = 'us' # Geographic Location: US
params['pws'] = '0' # 'pws' = '0' disables personalised search
params['gws_rd'] = 'cr' # Google Web Server ReDirect: CountRy.
params = urlencode(params)
url = u"https://www.google.com/search?" + params
# @author JuaniFilardo:
# Workaround to switch between http and https, since this maneuver
# seems to avoid the 503 error when performing a lot of queries.
# Weird, but it works.
# You may also wanna wait some time between queries, say, randint(50,65)
# between each query, and randint(180,240) every 100 queries, which is
# what I found useful.
https = int(time.time()) % 2 == 0
bare_url = u"https://www.google.com/search?" if https else u"http://www.google.com/search?"
url = bare_url + params
# return u"http://www.google.com/search?hl=%s&q=%s&start=%i&num=%i" %
# (lang, normalize_query(query), page * per_page, per_page)
if not ncr:
if area == 'com':
url = u"http://www.google.com/search?"
elif area == 'is':
url = 'http://www.google.is/search?'
elif area == 'dk':
url = 'http://www.google.dk/search?'
elif area == 'no':
url = 'http://www.google.no/search?'
elif area == 'se':
url = 'http://www.google.se/search?'
elif area == 'fi':
url = 'http://www.google.fi/search?'
elif area == 'ee':
url = 'http://www.google.ee/search?'
elif area == 'lv':
url = 'http://www.google.lv/search?'
elif area == 'lt':
url = 'http://www.google.lt/search?'
elif area == 'ie':
url = 'http://www.google.ie/search?'
elif area == 'uk':
url = 'http://www.google.co.uk/search?'
elif area == 'gg':
url = 'http://www.google.gg/search?'
elif area == 'je':
url = 'http://www.google.je/search?'
elif area == 'im':
url = 'http://www.google.im/search?'
elif area == 'fr':
url = 'http://www.google.fr/search?'
elif area == 'nl':
url = 'http://www.google.nl/search?'
elif area == 'be':
url = 'http://www.google.be/search?'
elif area == 'lu':
url = 'http://www.google.lu/search?'
elif area == 'de':
url = 'http://www.google.de/search?'
elif area == 'at':
url = 'http://www.google.at/search?'
elif area == 'ch':
url = 'http://www.google.ch/search?'
elif area == 'li':
url = 'http://www.google.li/search?'
elif area == 'pt':
url = 'http://www.google.pt/search?'
elif area == 'es':
url = 'http://www.google.es/search?'
elif area == 'gi':
url = 'http://www.google.com.gi/search?'
elif area == 'ad':
url = 'http://www.google.ad/search?'
elif area == 'it':
url = 'http://www.google.it/search?'
elif area == 'mt':
url = 'http://www.google.com.mt/search?'
elif area == 'sm':
url = 'http://www.google.sm/search?'
elif area == 'gr':
url = 'http://www.google.gr/search?'
elif area == 'ru':
url = 'http://www.google.ru/search?'
elif area == 'by':
url = 'http://www.google.com.by/search?'
elif area == 'ua':
url = 'http://www.google.com.ua/search?'
elif area == 'pl':
url = 'http://www.google.pl/search?'
elif area == 'cz':
url = 'http://www.google.cz/search?'
elif area == 'sk':
url = 'http://www.google.sk/search?'
elif area == 'hu':
url = 'http://www.google.hu/search?'
elif area == 'si':
url = 'http://www.google.si/search?'
elif area == 'hr':
url = 'http://www.google.hr/search?'
elif area == 'ba':
url = 'http://www.google.ba/search?'
elif area == 'me':
url = 'http://www.google.me/search?'
elif area == 'rs':
url = 'http://www.google.rs/search?'
elif area == 'mk':
url = 'http://www.google.mk/search?'
elif area == 'bg':
url = 'http://www.google.bg/search?'
elif area == 'ro':
url = 'http://www.google.ro/search?'
elif area == 'md':
url = 'http://www.google.md/search?'
elif area == 'hk':
url = 'http://www.google.com.hk/search?'
elif area == 'mn':
url = 'http://www.google.mn/search?'
elif area == 'kr':
url = 'http://www.google.co.kr/search?'
elif area == 'jp':
url = 'http://www.google.co.jp/search?'
elif area == 'vn':
url = 'http://www.google.com.vn/search?'
elif area == 'la':
url = 'http://www.google.la/search?'
elif area == 'kh':
url = 'http://www.google.com.kh/search?'
elif area == 'th':
url = 'http://www.google.co.th/search?'
elif area == 'my':
url = 'http://www.google.com.my/search?'
elif area == 'sg':
url = 'http://www.google.com.sg/search?'
elif area == 'bn':
url = 'http://www.google.com.bn/search?'
elif area == 'ph':
url = 'http://www.google.com.ph/search?'
elif area == 'id':
url = 'http://www.google.co.id/search?'
elif area == 'tp':
url = 'http://www.google.tp/search?'
elif area == 'kz':
url = 'http://www.google.kz/search?'
elif area == 'kg':
url = 'http://www.google.kg/search?'
elif area == 'tj':
url = 'http://www.google.com.tj/search?'
elif area == 'uz':
url = 'http://www.google.co.uz/search?'
elif area == 'tm':
url = 'http://www.google.tm/search?'
elif area == 'af':
url = 'http://www.google.com.af/search?'
elif area == 'pk':
url = 'http://www.google.com.pk/search?'
elif area == 'np':
url = 'http://www.google.com.np/search?'
elif area == 'in':
url = 'http://www.google.co.in/search?'
elif area == 'bd':
url = 'http://www.google.com.bd/search?'
elif area == 'lk':
url = 'http://www.google.lk/search?'
elif area == 'mv':
url = 'http://www.google.mv/search?'
elif area == 'kw':
url = 'http://www.google.com.kw/search?'
elif area == 'sa':
url = 'http://www.google.com.sa/search?'
elif area == 'bh':
url = 'http://www.google.com.bh/search?'
elif area == 'ae':
url = 'http://www.google.ae/search?'
elif area == 'om':
url = 'http://www.google.com.om/search?'
elif area == 'jo':
url = 'http://www.google.jo/search?'
elif area == 'il':
url = 'http://www.google.co.il/search?'
elif area == 'lb':
url = 'http://www.google.com.lb/search?'
elif area == 'tr':
url = 'http://www.google.com.tr/search?'
elif area == 'az':
url = 'http://www.google.az/search?'
elif area == 'am':
url = 'http://www.google.am/search?'
elif area == 'ls':
url = 'http://www.google.co.ls/search?'
elif area == 'eg':
url = 'http://www.google.com.eg/search?'
elif area == 'ly':
url = 'http://www.google.com.ly/search?'
elif area == 'dz':
url = 'http://www.google.dz/search?'
elif area == 'ma':
url = 'http://www.google.co.ma/search?'
elif area == 'sn':
url = 'http://www.google.sn/search?'
elif area == 'gm':
url = 'http://www.google.gm/search?'
elif area == 'ml':
url = 'http://www.google.ml/search?'
elif area == 'bf':
url = 'http://www.google.bf/search?'
elif area == 'sl':
url = 'http://www.google.com.sl/search?'
elif area == 'ci':
url = 'http://www.google.ci/search?'
elif area == 'gh':
url = 'http://www.google.com.gh/search?'
elif area == 'tg':
url = 'http://www.google.tg/search?'
elif area == 'bj':
url = 'http://www.google.bj/search?'
elif area == 'ne':
url = 'http://www.google.ne/search?'
elif area == 'ng':
url = 'http://www.google.com.ng/search?'
elif area == 'sh':
url = 'http://www.google.sh/search?'
elif area == 'cm':
url = 'http://www.google.cm/search?'
elif area == 'td':
url = 'http://www.google.td/search?'
elif area == 'cf':
url = 'http://www.google.cf/search?'
elif area == 'ga':
url = 'http://www.google.ga/search?'
elif area == 'cg':
url = 'http://www.google.cg/search?'
elif area == 'cd':
url = 'http://www.google.cd/search?'
elif area == 'ao':
url = 'http://www.google.it.ao/search?'
elif area == 'et':
url = 'http://www.google.com.et/search?'
elif area == 'dj':
url = 'http://www.google.dj/search?'
elif area == 'ke':
url = 'http://www.google.co.ke/search?'
elif area == 'ug':
url = 'http://www.google.co.ug/search?'
elif area == 'tz':
url = 'http://www.google.co.tz/search?'
elif area == 'rw':
url = 'http://www.google.rw/search?'
elif area == 'bi':
url = 'http://www.google.bi/search?'
elif area == 'mw':
url = 'http://www.google.mw/search?'
elif area == 'mz':
url = 'http://www.google.co.mz/search?'
elif area == 'mg':
url = 'http://www.google.mg/search?'
elif area == 'sc':
url = 'http://www.google.sc/search?'
elif area == 'mu':
url = 'http://www.google.mu/search?'
elif area == 'zm':
url = 'http://www.google.co.zm/search?'
elif area == 'zw':
url = 'http://www.google.co.zw/search?'
elif area == 'bw':
url = 'http://www.google.co.bw/search?'
elif area == 'na':
url = 'http://www.google.com.na/search?'
elif area == 'za':
url = 'http://www.google.co.za/search?'
elif area == 'au':
url = 'http://www.google.com.au/search?'
elif area == 'nf':
url = 'http://www.google.com.nf/search?'
elif area == 'nz':
url = 'http://www.google.co.nz/search?'
elif area == 'sb':
url = 'http://www.google.com.sb/search?'
elif area == 'fj':
url = 'http://www.google.com.fj/search?'
elif area == 'fm':
url = 'http://www.google.fm/search?'
elif area == 'ki':
url = 'http://www.google.ki/search?'
elif area == 'nr':
url = 'http://www.google.nr/search?'
elif area == 'tk':
url = 'http://www.google.tk/search?'
elif area == 'ws':
url = 'http://www.google.ws/search?'
elif area == 'as':
url = 'http://www.google.as/search?'
elif area == 'to':
url = 'http://www.google.to/search?'
elif area == 'nu':
url = 'http://www.google.nu/search?'
elif area == 'ck':
url = 'http://www.google.co.ck/search?'
elif area == 'do':
url = 'http://www.google.com.do/search?'
elif area == 'tt':
url = 'http://www.google.tt/search?'
elif area == 'co':
url = 'http://www.google.com.co/search?'
elif area == 'ec':
url = 'http://www.google.com.ec/search?'
elif area == 've':
url = 'http://www.google.co.ve/search?'
elif area == 'gy':
url = 'http://www.google.gy/search?'
elif area == 'pe':
url = 'http://www.google.com.pe/search?'
elif area == 'bo':
url = 'http://www.google.com.bo/search?'
elif area == 'py':
url = 'http://www.google.com.py/search?'
elif area == 'br':
url = 'http://www.google.com.br/search?'
elif area == 'uy':
url = 'http://www.google.com.uy/search?'
elif area == 'ar':
url = 'http://www.google.com.ar/search?'
elif area == 'cl':
url = 'http://www.google.cl/search?'
elif area == 'gl':
url = 'http://www.google.gl/search?'
elif area == 'ca':
url = 'http://www.google.ca/search?'
elif area == 'mx':
url = 'http://www.google.com.mx/search?'
elif area == 'gt':
url = 'http://www.google.com.gt/search?'
elif area == 'bz':
url = 'http://www.google.com.bz/search?'
elif area == 'sv':
url = 'http://www.google.com.sv/search?'
elif area == 'hn':
url = 'http://www.google.hn/search?'
elif area == 'ni':
url = 'http://www.google.com.ni/search?'
elif area == 'cr':
url = 'http://www.google.co.cr/search?'
elif area == 'pa':
url = 'http://www.google.com.pa/search?'
elif area == 'bs':
url = 'http://www.google.bs/search?'
elif area == 'cu':
url = 'http://www.google.com.cu/search?'
elif area == 'jm':
url = 'http://www.google.com.jm/search?'
elif area == 'ht':
url = 'http://www.google.ht/search?'
else:
raise AreaError('invalid name, no area found')
url += params
return url
def get_html(url):
ua = UserAgent()
header = ua.random
try:
request = urllib.request.Request(url)
request.add_header("User-Agent", header)
html = urllib.request.urlopen(request).read()
return html
except urllib.error.HTTPError as e:
print("Error accessing:", url)
print(e)
if e.code == 503 and 'CaptchaRedirect' in e.read():
print("Google is requiring a Captcha. "
"For more information check: 'https://support.google.com/websearch/answer/86640'")
if e.code == 503:
sys.exit("503 Error: service is currently unavailable. Program will exit.")
return None
except Exception as e:
print("Error accessing:", url)
print(e)
return None
def write_html_to_file(html, filename):
of = open(filename, "w")
of.write(html.encode("utf-8"))
# of.flush()
of.close()
def get_browser_with_url(url, timeout=120, driver="firefox"):
"""Returns an open browser with a given url."""
# choose a browser
if driver == "firefox":
browser = webdriver.Firefox()
elif driver == "ie":
browser = webdriver.Ie()
elif driver == "chrome":
browser = webdriver.Chrome()
else:
print("Driver choosen is not recognized")
# set maximum load time
browser.set_page_load_timeout(timeout)
# open a browser with given url
browser.get(url)
time.sleep(0.5)
return browser
def get_html_from_dynamic_site(url, timeout=120,
driver="firefox", attempts=10):
"""Returns html from a dynamic site, opening it in a browser."""
RV = ""
# try several attempts
for i in range(attempts):
try:
# load browser
browser = get_browser_with_url(url, timeout, driver)
# get html
time.sleep(2)
content = browser.page_source
# try again if there is no content
if not content:
browser.quit()
raise Exception("No content!")
# if there is content gets out
browser.quit()
RV = content
break
except:
print("\nTry ", i, " of ", attempts, "\n")
time.sleep(5)
return RV
def timeit(func=None, loops=1, verbose=False):
if func:
def inner(*args, **kwargs):
sums = 0.0
mins = 1.7976931348623157e+308
maxs = 0.0
print('====%s Timing====' % func.__name__)
for i in range(0, loops):
t0 = time.time()
result = func(*args, **kwargs)
dt = time.time() - t0
mins = dt if dt < mins else mins
maxs = dt if dt > maxs else maxs
sums += dt
if verbose:
print('\t%r ran in %2.9f sec on run %s' %
(func.__name__, dt, i))
print('%r min run time was %2.9f sec' % (func.__name__, mins))
print('%r max run time was %2.9f sec' % (func.__name__, maxs))
print('%r avg run time was %2.9f sec in %s runs' %
(func.__name__, old_div(sums, loops), loops))
print('==== end ====')
return result
return inner
else:
def partial_inner(func):
return timeit(func, loops, verbose)
return partial_inner
def timing(f):
@wraps(f)
def wrap(*args, **kw):
ts = time.time()
result = f(*args, **kw)
te = time.time()
print('func:%r args:[%r, %r] took: %2.4f sec' %
(f.__name__, args, kw, te - ts))
return result
return wrap