561 lines
19 KiB
Python
561 lines
19 KiB
Python
|
from __future__ import unicode_literals
|
||
|
from __future__ import print_function
|
||
|
from __future__ import division
|
||
|
|
||
|
from future import standard_library
|
||
|
standard_library.install_aliases()
|
||
|
from builtins import range
|
||
|
from past.utils import old_div
|
||
|
import time
|
||
|
from selenium import webdriver
|
||
|
import urllib.request
|
||
|
import urllib.error
|
||
|
import urllib.parse
|
||
|
from functools import wraps
|
||
|
# import requests
|
||
|
from urllib.parse import urlencode
|
||
|
from fake_useragent import UserAgent
|
||
|
import sys
|
||
|
|
||
|
class AreaError(KeyError):
    """Raised when a search URL is requested for an unrecognised area code.

    Subclasses ``KeyError`` so existing ``except KeyError`` handlers keep
    catching it.
    """
|
||
|
|
||
|
|
||
|
def measure_time(fn):
    """Decorate ``fn`` so that each call prints how long it took.

    Args:
        fn: The callable to time.

    Returns:
        A wrapper that forwards all arguments to ``fn``, prints
        ``<name> took <elapsed> seconds`` after a successful call and
        returns ``fn``'s result unchanged. Nothing is printed if ``fn``
        raises.
    """

    # ``wraps`` keeps the wrapped function's __name__/__doc__ intact so that
    # introspection, logging and stacked decorators see the real function.
    @wraps(fn)
    def decorator(*args, **kwargs):
        start = time.time()

        res = fn(*args, **kwargs)

        elapsed = time.time() - start
        print(fn.__name__, "took", elapsed, "seconds")

        return res

    return decorator
|
||
|
|
||
|
|
||
|
def normalize_query(query):
    """Return ``query`` trimmed and escaped for use in a search URL.

    Surrounding whitespace is stripped, then ``:``, ``+`` and ``&`` are
    percent-encoded and every remaining space becomes ``+``.
    """
    # Order matters: literal "+" must be escaped before spaces are turned
    # into "+", otherwise the two could not be told apart afterwards.
    substitutions = (
        (":", "%3A"),
        ("+", "%2B"),
        ("&", "%26"),
        (" ", "+"),
    )

    escaped = query.strip()
    for raw, replacement in substitutions:
        escaped = escaped.replace(raw, replacement)
    return escaped
|
||
|
|
||
|
|
||
|
def _get_search_url(query, page=0, per_page=10, lang='en', area='com', ncr=False, time_period=False, sort_by_date=False):
|
||
|
# note: num per page might not be supported by google anymore (because of
|
||
|
# google instant)
|
||
|
|
||
|
params = {
|
||
|
'nl': lang,
|
||
|
'q': query.encode('utf8'),
|
||
|
'start': page * per_page,
|
||
|
'num': per_page
|
||
|
}
|
||
|
|
||
|
time_mapping = {
|
||
|
'hour': 'qdr:h',
|
||
|
'week': 'qdr:w',
|
||
|
'month': 'qdr:m',
|
||
|
'year': 'qdr:y'
|
||
|
}
|
||
|
|
||
|
|
||
|
tbs_param = []
|
||
|
# Set time period for query if given
|
||
|
if time_period and time_period in time_mapping:
|
||
|
tbs_param.append(time_mapping[time_period])
|
||
|
|
||
|
if sort_by_date:
|
||
|
tbs_param.append('sbd:1')
|
||
|
params['tbs'] = ','.join(tbs_param)
|
||
|
|
||
|
# This will allow to search Google with No Country Redirect
|
||
|
if ncr:
|
||
|
params['gl'] = 'us' # Geographic Location: US
|
||
|
params['pws'] = '0' # 'pws' = '0' disables personalised search
|
||
|
params['gws_rd'] = 'cr' # Google Web Server ReDirect: CountRy.
|
||
|
|
||
|
params = urlencode(params)
|
||
|
|
||
|
url = u"https://www.google.com/search?" + params
|
||
|
|
||
|
# @author JuaniFilardo:
|
||
|
# Workaround to switch between http and https, since this maneuver
|
||
|
# seems to avoid the 503 error when performing a lot of queries.
|
||
|
# Weird, but it works.
|
||
|
# You may also wanna wait some time between queries, say, randint(50,65)
|
||
|
# between each query, and randint(180,240) every 100 queries, which is
|
||
|
# what I found useful.
|
||
|
https = int(time.time()) % 2 == 0
|
||
|
bare_url = u"https://www.google.com/search?" if https else u"http://www.google.com/search?"
|
||
|
url = bare_url + params
|
||
|
|
||
|
# return u"http://www.google.com/search?hl=%s&q=%s&start=%i&num=%i" %
|
||
|
# (lang, normalize_query(query), page * per_page, per_page)
|
||
|
if not ncr:
|
||
|
if area == 'com':
|
||
|
url = u"http://www.google.com/search?"
|
||
|
elif area == 'is':
|
||
|
url = 'http://www.google.is/search?'
|
||
|
elif area == 'dk':
|
||
|
url = 'http://www.google.dk/search?'
|
||
|
elif area == 'no':
|
||
|
url = 'http://www.google.no/search?'
|
||
|
elif area == 'se':
|
||
|
url = 'http://www.google.se/search?'
|
||
|
elif area == 'fi':
|
||
|
url = 'http://www.google.fi/search?'
|
||
|
elif area == 'ee':
|
||
|
url = 'http://www.google.ee/search?'
|
||
|
elif area == 'lv':
|
||
|
url = 'http://www.google.lv/search?'
|
||
|
elif area == 'lt':
|
||
|
url = 'http://www.google.lt/search?'
|
||
|
elif area == 'ie':
|
||
|
url = 'http://www.google.ie/search?'
|
||
|
elif area == 'uk':
|
||
|
url = 'http://www.google.co.uk/search?'
|
||
|
elif area == 'gg':
|
||
|
url = 'http://www.google.gg/search?'
|
||
|
elif area == 'je':
|
||
|
url = 'http://www.google.je/search?'
|
||
|
elif area == 'im':
|
||
|
url = 'http://www.google.im/search?'
|
||
|
elif area == 'fr':
|
||
|
url = 'http://www.google.fr/search?'
|
||
|
elif area == 'nl':
|
||
|
url = 'http://www.google.nl/search?'
|
||
|
elif area == 'be':
|
||
|
url = 'http://www.google.be/search?'
|
||
|
elif area == 'lu':
|
||
|
url = 'http://www.google.lu/search?'
|
||
|
elif area == 'de':
|
||
|
url = 'http://www.google.de/search?'
|
||
|
elif area == 'at':
|
||
|
url = 'http://www.google.at/search?'
|
||
|
elif area == 'ch':
|
||
|
url = 'http://www.google.ch/search?'
|
||
|
elif area == 'li':
|
||
|
url = 'http://www.google.li/search?'
|
||
|
elif area == 'pt':
|
||
|
url = 'http://www.google.pt/search?'
|
||
|
elif area == 'es':
|
||
|
url = 'http://www.google.es/search?'
|
||
|
elif area == 'gi':
|
||
|
url = 'http://www.google.com.gi/search?'
|
||
|
elif area == 'ad':
|
||
|
url = 'http://www.google.ad/search?'
|
||
|
elif area == 'it':
|
||
|
url = 'http://www.google.it/search?'
|
||
|
elif area == 'mt':
|
||
|
url = 'http://www.google.com.mt/search?'
|
||
|
elif area == 'sm':
|
||
|
url = 'http://www.google.sm/search?'
|
||
|
elif area == 'gr':
|
||
|
url = 'http://www.google.gr/search?'
|
||
|
elif area == 'ru':
|
||
|
url = 'http://www.google.ru/search?'
|
||
|
elif area == 'by':
|
||
|
url = 'http://www.google.com.by/search?'
|
||
|
elif area == 'ua':
|
||
|
url = 'http://www.google.com.ua/search?'
|
||
|
elif area == 'pl':
|
||
|
url = 'http://www.google.pl/search?'
|
||
|
elif area == 'cz':
|
||
|
url = 'http://www.google.cz/search?'
|
||
|
elif area == 'sk':
|
||
|
url = 'http://www.google.sk/search?'
|
||
|
elif area == 'hu':
|
||
|
url = 'http://www.google.hu/search?'
|
||
|
elif area == 'si':
|
||
|
url = 'http://www.google.si/search?'
|
||
|
elif area == 'hr':
|
||
|
url = 'http://www.google.hr/search?'
|
||
|
elif area == 'ba':
|
||
|
url = 'http://www.google.ba/search?'
|
||
|
elif area == 'me':
|
||
|
url = 'http://www.google.me/search?'
|
||
|
elif area == 'rs':
|
||
|
url = 'http://www.google.rs/search?'
|
||
|
elif area == 'mk':
|
||
|
url = 'http://www.google.mk/search?'
|
||
|
elif area == 'bg':
|
||
|
url = 'http://www.google.bg/search?'
|
||
|
elif area == 'ro':
|
||
|
url = 'http://www.google.ro/search?'
|
||
|
elif area == 'md':
|
||
|
url = 'http://www.google.md/search?'
|
||
|
elif area == 'hk':
|
||
|
url = 'http://www.google.com.hk/search?'
|
||
|
elif area == 'mn':
|
||
|
url = 'http://www.google.mn/search?'
|
||
|
elif area == 'kr':
|
||
|
url = 'http://www.google.co.kr/search?'
|
||
|
elif area == 'jp':
|
||
|
url = 'http://www.google.co.jp/search?'
|
||
|
elif area == 'vn':
|
||
|
url = 'http://www.google.com.vn/search?'
|
||
|
elif area == 'la':
|
||
|
url = 'http://www.google.la/search?'
|
||
|
elif area == 'kh':
|
||
|
url = 'http://www.google.com.kh/search?'
|
||
|
elif area == 'th':
|
||
|
url = 'http://www.google.co.th/search?'
|
||
|
elif area == 'my':
|
||
|
url = 'http://www.google.com.my/search?'
|
||
|
elif area == 'sg':
|
||
|
url = 'http://www.google.com.sg/search?'
|
||
|
elif area == 'bn':
|
||
|
url = 'http://www.google.com.bn/search?'
|
||
|
elif area == 'ph':
|
||
|
url = 'http://www.google.com.ph/search?'
|
||
|
elif area == 'id':
|
||
|
url = 'http://www.google.co.id/search?'
|
||
|
elif area == 'tp':
|
||
|
url = 'http://www.google.tp/search?'
|
||
|
elif area == 'kz':
|
||
|
url = 'http://www.google.kz/search?'
|
||
|
elif area == 'kg':
|
||
|
url = 'http://www.google.kg/search?'
|
||
|
elif area == 'tj':
|
||
|
url = 'http://www.google.com.tj/search?'
|
||
|
elif area == 'uz':
|
||
|
url = 'http://www.google.co.uz/search?'
|
||
|
elif area == 'tm':
|
||
|
url = 'http://www.google.tm/search?'
|
||
|
elif area == 'af':
|
||
|
url = 'http://www.google.com.af/search?'
|
||
|
elif area == 'pk':
|
||
|
url = 'http://www.google.com.pk/search?'
|
||
|
elif area == 'np':
|
||
|
url = 'http://www.google.com.np/search?'
|
||
|
elif area == 'in':
|
||
|
url = 'http://www.google.co.in/search?'
|
||
|
elif area == 'bd':
|
||
|
url = 'http://www.google.com.bd/search?'
|
||
|
elif area == 'lk':
|
||
|
url = 'http://www.google.lk/search?'
|
||
|
elif area == 'mv':
|
||
|
url = 'http://www.google.mv/search?'
|
||
|
elif area == 'kw':
|
||
|
url = 'http://www.google.com.kw/search?'
|
||
|
elif area == 'sa':
|
||
|
url = 'http://www.google.com.sa/search?'
|
||
|
elif area == 'bh':
|
||
|
url = 'http://www.google.com.bh/search?'
|
||
|
elif area == 'ae':
|
||
|
url = 'http://www.google.ae/search?'
|
||
|
elif area == 'om':
|
||
|
url = 'http://www.google.com.om/search?'
|
||
|
elif area == 'jo':
|
||
|
url = 'http://www.google.jo/search?'
|
||
|
elif area == 'il':
|
||
|
url = 'http://www.google.co.il/search?'
|
||
|
elif area == 'lb':
|
||
|
url = 'http://www.google.com.lb/search?'
|
||
|
elif area == 'tr':
|
||
|
url = 'http://www.google.com.tr/search?'
|
||
|
elif area == 'az':
|
||
|
url = 'http://www.google.az/search?'
|
||
|
elif area == 'am':
|
||
|
url = 'http://www.google.am/search?'
|
||
|
elif area == 'ls':
|
||
|
url = 'http://www.google.co.ls/search?'
|
||
|
elif area == 'eg':
|
||
|
url = 'http://www.google.com.eg/search?'
|
||
|
elif area == 'ly':
|
||
|
url = 'http://www.google.com.ly/search?'
|
||
|
elif area == 'dz':
|
||
|
url = 'http://www.google.dz/search?'
|
||
|
elif area == 'ma':
|
||
|
url = 'http://www.google.co.ma/search?'
|
||
|
elif area == 'sn':
|
||
|
url = 'http://www.google.sn/search?'
|
||
|
elif area == 'gm':
|
||
|
url = 'http://www.google.gm/search?'
|
||
|
elif area == 'ml':
|
||
|
url = 'http://www.google.ml/search?'
|
||
|
elif area == 'bf':
|
||
|
url = 'http://www.google.bf/search?'
|
||
|
elif area == 'sl':
|
||
|
url = 'http://www.google.com.sl/search?'
|
||
|
elif area == 'ci':
|
||
|
url = 'http://www.google.ci/search?'
|
||
|
elif area == 'gh':
|
||
|
url = 'http://www.google.com.gh/search?'
|
||
|
elif area == 'tg':
|
||
|
url = 'http://www.google.tg/search?'
|
||
|
elif area == 'bj':
|
||
|
url = 'http://www.google.bj/search?'
|
||
|
elif area == 'ne':
|
||
|
url = 'http://www.google.ne/search?'
|
||
|
elif area == 'ng':
|
||
|
url = 'http://www.google.com.ng/search?'
|
||
|
elif area == 'sh':
|
||
|
url = 'http://www.google.sh/search?'
|
||
|
elif area == 'cm':
|
||
|
url = 'http://www.google.cm/search?'
|
||
|
elif area == 'td':
|
||
|
url = 'http://www.google.td/search?'
|
||
|
elif area == 'cf':
|
||
|
url = 'http://www.google.cf/search?'
|
||
|
elif area == 'ga':
|
||
|
url = 'http://www.google.ga/search?'
|
||
|
elif area == 'cg':
|
||
|
url = 'http://www.google.cg/search?'
|
||
|
elif area == 'cd':
|
||
|
url = 'http://www.google.cd/search?'
|
||
|
elif area == 'ao':
|
||
|
url = 'http://www.google.it.ao/search?'
|
||
|
elif area == 'et':
|
||
|
url = 'http://www.google.com.et/search?'
|
||
|
elif area == 'dj':
|
||
|
url = 'http://www.google.dj/search?'
|
||
|
elif area == 'ke':
|
||
|
url = 'http://www.google.co.ke/search?'
|
||
|
elif area == 'ug':
|
||
|
url = 'http://www.google.co.ug/search?'
|
||
|
elif area == 'tz':
|
||
|
url = 'http://www.google.co.tz/search?'
|
||
|
elif area == 'rw':
|
||
|
url = 'http://www.google.rw/search?'
|
||
|
elif area == 'bi':
|
||
|
url = 'http://www.google.bi/search?'
|
||
|
elif area == 'mw':
|
||
|
url = 'http://www.google.mw/search?'
|
||
|
elif area == 'mz':
|
||
|
url = 'http://www.google.co.mz/search?'
|
||
|
elif area == 'mg':
|
||
|
url = 'http://www.google.mg/search?'
|
||
|
elif area == 'sc':
|
||
|
url = 'http://www.google.sc/search?'
|
||
|
elif area == 'mu':
|
||
|
url = 'http://www.google.mu/search?'
|
||
|
elif area == 'zm':
|
||
|
url = 'http://www.google.co.zm/search?'
|
||
|
elif area == 'zw':
|
||
|
url = 'http://www.google.co.zw/search?'
|
||
|
elif area == 'bw':
|
||
|
url = 'http://www.google.co.bw/search?'
|
||
|
elif area == 'na':
|
||
|
url = 'http://www.google.com.na/search?'
|
||
|
elif area == 'za':
|
||
|
url = 'http://www.google.co.za/search?'
|
||
|
elif area == 'au':
|
||
|
url = 'http://www.google.com.au/search?'
|
||
|
elif area == 'nf':
|
||
|
url = 'http://www.google.com.nf/search?'
|
||
|
elif area == 'nz':
|
||
|
url = 'http://www.google.co.nz/search?'
|
||
|
elif area == 'sb':
|
||
|
url = 'http://www.google.com.sb/search?'
|
||
|
elif area == 'fj':
|
||
|
url = 'http://www.google.com.fj/search?'
|
||
|
elif area == 'fm':
|
||
|
url = 'http://www.google.fm/search?'
|
||
|
elif area == 'ki':
|
||
|
url = 'http://www.google.ki/search?'
|
||
|
elif area == 'nr':
|
||
|
url = 'http://www.google.nr/search?'
|
||
|
elif area == 'tk':
|
||
|
url = 'http://www.google.tk/search?'
|
||
|
elif area == 'ws':
|
||
|
url = 'http://www.google.ws/search?'
|
||
|
elif area == 'as':
|
||
|
url = 'http://www.google.as/search?'
|
||
|
elif area == 'to':
|
||
|
url = 'http://www.google.to/search?'
|
||
|
elif area == 'nu':
|
||
|
url = 'http://www.google.nu/search?'
|
||
|
elif area == 'ck':
|
||
|
url = 'http://www.google.co.ck/search?'
|
||
|
elif area == 'do':
|
||
|
url = 'http://www.google.com.do/search?'
|
||
|
elif area == 'tt':
|
||
|
url = 'http://www.google.tt/search?'
|
||
|
elif area == 'co':
|
||
|
url = 'http://www.google.com.co/search?'
|
||
|
elif area == 'ec':
|
||
|
url = 'http://www.google.com.ec/search?'
|
||
|
elif area == 've':
|
||
|
url = 'http://www.google.co.ve/search?'
|
||
|
elif area == 'gy':
|
||
|
url = 'http://www.google.gy/search?'
|
||
|
elif area == 'pe':
|
||
|
url = 'http://www.google.com.pe/search?'
|
||
|
elif area == 'bo':
|
||
|
url = 'http://www.google.com.bo/search?'
|
||
|
elif area == 'py':
|
||
|
url = 'http://www.google.com.py/search?'
|
||
|
elif area == 'br':
|
||
|
url = 'http://www.google.com.br/search?'
|
||
|
elif area == 'uy':
|
||
|
url = 'http://www.google.com.uy/search?'
|
||
|
elif area == 'ar':
|
||
|
url = 'http://www.google.com.ar/search?'
|
||
|
elif area == 'cl':
|
||
|
url = 'http://www.google.cl/search?'
|
||
|
elif area == 'gl':
|
||
|
url = 'http://www.google.gl/search?'
|
||
|
elif area == 'ca':
|
||
|
url = 'http://www.google.ca/search?'
|
||
|
elif area == 'mx':
|
||
|
url = 'http://www.google.com.mx/search?'
|
||
|
elif area == 'gt':
|
||
|
url = 'http://www.google.com.gt/search?'
|
||
|
elif area == 'bz':
|
||
|
url = 'http://www.google.com.bz/search?'
|
||
|
elif area == 'sv':
|
||
|
url = 'http://www.google.com.sv/search?'
|
||
|
elif area == 'hn':
|
||
|
url = 'http://www.google.hn/search?'
|
||
|
elif area == 'ni':
|
||
|
url = 'http://www.google.com.ni/search?'
|
||
|
elif area == 'cr':
|
||
|
url = 'http://www.google.co.cr/search?'
|
||
|
elif area == 'pa':
|
||
|
url = 'http://www.google.com.pa/search?'
|
||
|
elif area == 'bs':
|
||
|
url = 'http://www.google.bs/search?'
|
||
|
elif area == 'cu':
|
||
|
url = 'http://www.google.com.cu/search?'
|
||
|
elif area == 'jm':
|
||
|
url = 'http://www.google.com.jm/search?'
|
||
|
elif area == 'ht':
|
||
|
url = 'http://www.google.ht/search?'
|
||
|
else:
|
||
|
raise AreaError('invalid name, no area found')
|
||
|
url += params
|
||
|
return url
|
||
|
|
||
|
|
||
|
def get_html(url):
    """Download ``url`` with a random User-Agent and return the raw body.

    Args:
        url: Address to fetch.

    Returns:
        The response body as returned by ``read()``, or ``None`` if the
        request failed for any reason other than an HTTP 503.

    Raises:
        SystemExit: On an HTTP 503 response, after reporting whether the
            error page looks like a captcha redirect.
    """
    ua = UserAgent()
    header = ua.random

    try:
        request = urllib.request.Request(url)
        request.add_header("User-Agent", header)
        response = urllib.request.urlopen(request)
        try:
            return response.read()
        finally:
            # Always release the connection, even if read() fails.
            response.close()
    except urllib.error.HTTPError as e:
        print("Error accessing:", url)
        print(e)
        if e.code == 503:
            # The error body is bytes, so it must be searched with a bytes
            # literal; a text literal raises TypeError on Python 3.
            if b'CaptchaRedirect' in e.read():
                print("Google is requiring a Captcha. "
                      "For more information check: 'https://support.google.com/websearch/answer/86640'")
            sys.exit("503 Error: service is currently unavailable. Program will exit.")
        return None
    except Exception as e:
        print("Error accessing:", url)
        print(e)
        return None
|
||
|
|
||
|
|
||
|
def write_html_to_file(html, filename):
    """Write ``html`` to ``filename`` as UTF-8, replacing any existing file.

    Args:
        html: Markup to store. Text is encoded as UTF-8; bytes are written
            unchanged.
        filename: Path of the file to create or overwrite.
    """
    data = html if isinstance(html, bytes) else html.encode("utf-8")

    # Binary mode is required because the payload is already encoded bytes;
    # writing bytes to a text-mode file raises TypeError on Python 3.
    with open(filename, "wb") as out_file:
        out_file.write(data)
|
||
|
|
||
|
|
||
|
def get_browser_with_url(url, timeout=120, driver="firefox"):
    """Return an open browser that has loaded ``url``.

    Args:
        url: Address to open.
        timeout: Maximum page load time, passed to
            ``set_page_load_timeout``.
        driver: Which selenium driver to start: ``"firefox"``, ``"ie"`` or
            ``"chrome"``.

    Returns:
        The selenium browser instance. The caller must ``quit()`` it.

    Raises:
        ValueError: If ``driver`` is not one of the supported names.
    """
    # Driver name -> attribute of selenium's ``webdriver`` module.
    driver_classes = {"firefox": "Firefox", "ie": "Ie", "chrome": "Chrome"}

    # Fail with a clear error instead of printing a message and then
    # crashing on an unbound ``browser`` variable.
    if driver not in driver_classes:
        raise ValueError("Driver chosen is not recognized: %r" % (driver,))

    browser = getattr(webdriver, driver_classes[driver])()

    try:
        # set maximum load time
        browser.set_page_load_timeout(timeout)

        # open a browser with given url
        browser.get(url)

        time.sleep(0.5)
    except Exception:
        # Do not leave an orphaned browser process behind when loading fails.
        browser.quit()
        raise

    return browser
|
||
|
|
||
|
|
||
|
def get_html_from_dynamic_site(url, timeout=120,
                               driver="firefox", attempts=10):
    """Return the html of a dynamic site, opening it in a browser.

    The page is loaded up to ``attempts`` times; the first attempt that
    yields non-empty page source wins.

    Args:
        url: Address to open.
        timeout: Maximum page load time handed to the browser.
        driver: Browser driver name handed to ``get_browser_with_url``.
        attempts: Maximum number of tries.

    Returns:
        The page source, or an empty string if every attempt failed.
    """
    # try several attempts
    for i in range(attempts):
        try:
            # load browser
            browser = get_browser_with_url(url, timeout, driver)

            try:
                # get html
                time.sleep(2)
                content = browser.page_source
            finally:
                # Close the browser whether or not the page could be read.
                browser.quit()

            # try again if there is no content
            if not content:
                raise Exception("No content!")

            return content

        # ``Exception`` (not a bare except) so Ctrl-C and sys.exit() still
        # abort the retry loop.
        except Exception:
            print("\nTry ", i, " of ", attempts, "\n")
            time.sleep(5)

    return ""
|
||
|
|
||
|
|
||
|
def timeit(func=None, loops=1, verbose=False):
    """Decorator that runs a function ``loops`` times and prints timings.

    Usable bare (``@timeit``) or configured (``@timeit(loops=5)``).

    Args:
        func: The function to time, or ``None`` when called with options.
        loops: How many times each call runs the function.
        verbose: When true, also print the duration of every single run.

    Returns:
        The wrapped function, or a decorator when ``func`` is ``None``.
        The wrapped function returns the result of the last run, or
        ``None`` if ``loops`` is 0.
    """
    if func is None:
        def partial_inner(func):
            return timeit(func, loops, verbose)
        return partial_inner

    @wraps(func)
    def inner(*args, **kwargs):
        sums = 0.0
        mins = float('inf')
        maxs = 0.0
        # Defined up front so that ``loops=0`` returns None instead of
        # failing on an unbound name.
        result = None
        print('====%s Timing====' % func.__name__)
        for i in range(0, loops):
            t0 = time.time()
            result = func(*args, **kwargs)
            dt = time.time() - t0
            mins = dt if dt < mins else mins
            maxs = dt if dt > maxs else maxs
            sums += dt
            if verbose:
                print('\t%r ran in %2.9f sec on run %s' %
                      (func.__name__, dt, i))

        # ``sums`` is a float, so this is true division; guard against
        # dividing by zero when no run was requested.
        average = sums / loops if loops else 0.0
        print('%r min run time was %2.9f sec' % (func.__name__, mins))
        print('%r max run time was %2.9f sec' % (func.__name__, maxs))
        print('%r avg run time was %2.9f sec in %s runs' %
              (func.__name__, average, loops))
        print('==== end ====')
        return result

    return inner
|
||
|
|
||
|
|
||
|
def timing(f):
    """Decorate ``f`` so each successful call prints its arguments and
    how long it took, then returns ``f``'s result unchanged."""

    @wraps(f)
    def timed(*positional, **named):
        began = time.time()
        outcome = f(*positional, **named)
        duration = time.time() - began

        report = 'func:%r args:[%r, %r] took: %2.4f sec' % (
            f.__name__, positional, named, duration)
        print(report)

        return outcome

    return timed
|