# -*- coding: utf-8 -*-
import ast
import math
import operator
import re
import sys
import time

import requests

from googletrans.compat import PY3
from googletrans.compat import unicode
from googletrans.utils import rshift


class TokenAcquirer(object):
    """Google Translate API token generator

    translate.google.com uses a token to authorize the requests. If you are
    not Google, you do not have this token and will have to pay for use.
    This class is the result of reverse engineering on the obfuscated and
    minified code used by Google to generate such token.

    The token is based on a seed which is updated once per hour and on the
    text that will be translated.
    Both are combined - by some strange math - in order to generate a final
    token (e.g. 744915.856682) which is used by the API to validate the
    request.

    This operation will cause an additional request to get an initial
    token from translate.google.com.

    :Example:

    >>> from googletrans.gtoken import TokenAcquirer
    >>> acquirer = TokenAcquirer()
    >>> text = 'test'
    >>> tk = acquirer.do(text)
    >>> print(tk)
    950629.577246
    """

    # captures the body of the anonymous function that computes TKK
    RE_TKK = re.compile(r'TKK=eval\(\'\(\(function\(\)\{(.+?)\}\)\(\)\)\'\);', re.DOTALL)

    # binary operators that may replace the default addition when the two
    # seed parameters are combined (ast.Add is the default and not listed)
    _BINARY_OPS = {
        ast.Sub: operator.sub,
        ast.Mult: operator.mul,
        ast.Pow: operator.pow,
        ast.BitXor: operator.xor,
    }

    def __init__(self, tkk='0', session=None):
        """
        :param tkk: initial seed as a string; '0' means "no seed yet"
        :param session: object used to issue the HTTP GET; a new
                        ``requests.Session`` is created when omitted
        """
        self.session = session or requests.Session()
        self.tkk = tkk

    @staticmethod
    def _literal_number(node):
        """Return the number stored in a numeric literal AST node.

        Returns ``None`` when ``node`` is not a numeric literal. Numeric
        literals are ``ast.Constant`` nodes on Python 3.8+ and ``ast.Num``
        nodes on older interpreters; ``ast.Num`` is only touched on those
        older interpreters because it is deprecated on newer ones.
        """
        constant = getattr(ast, 'Constant', None)
        if constant is not None and isinstance(node, constant):
            value = node.value
            if isinstance(value, bool):
                return None
            if isinstance(value, (int, float, complex)):
                return value
            return None
        if sys.version_info < (3, 8) and isinstance(node, ast.Num):
            return node.n
        return None

    def _update(self):
        """Refresh ``self.tkk`` from translate.google.com when it is stale.

        The seed is considered valid while its integer part equals the
        current hour count since the epoch. When the page does not contain
        the expected TKK snippet the current seed is left untouched.
        """
        # we don't need to update the base TKK value when it is still valid
        now = math.floor(int(time.time() * 1000) / 3600000.0)
        if self.tkk and int(self.tkk.split('.')[0]) == now:
            return

        r = self.session.get('https://translate.google.com')
        found = self.RE_TKK.search(r.text)
        if found is None:
            # nothing to parse: keep whatever seed we already have
            return

        # this will be the same as python code after stripping out a
        # reserved word 'var'
        code = unicode(found.group(1)).replace('var ', '')
        # unescape special ascii characters such like a \x3d(=)
        if PY3:  # pragma: no cover
            code = code.encode().decode('unicode-escape')
        else:  # pragma: no cover
            code = code.decode('string_escape')

        if not code:
            return

        tree = ast.parse(code)
        visit_return = False
        combine = operator.add
        n, keys = 0, dict(a=0, b=0)
        # NOTE: the branch order below matters; ast.walk yields nodes
        # breadth-first and the last numeric literal seen after the
        # ``return`` statement ends up in ``n``.
        for node in ast.walk(tree):
            number = self._literal_number(node)
            if isinstance(node, ast.Assign):
                name = node.targets[0].id
                if name in keys:
                    assigned = self._literal_number(node.value)
                    if assigned is not None:
                        keys[name] = assigned
                    # the value can sometimes be negative
                    elif isinstance(node.value, ast.UnaryOp) and \
                            isinstance(node.value.op, ast.USub):
                        operand = self._literal_number(node.value.operand)
                        if operand is not None:
                            keys[name] = -operand
            elif isinstance(node, ast.Return):
                # parameters should be set after this point
                visit_return = True
            elif visit_return and number is not None:
                n = number
            elif visit_return and n > 0:
                # the default operator is '+' but implement some more for
                # all possible scenarios
                combine = self._BINARY_OPS.get(type(node), combine)

        # apply the operator directly instead of eval()-ing a built string
        value = combine(keys['a'], keys['b'])
        self.tkk = '{}.{}'.format(n, value)

    def _lazy(self, value):
        """like lazy evaluation, this method returns a lambda function that
        returns value given.
        We won't be needing this because this seems to have been built for
        code obfuscation.

        the original code of this method is as follows:

        .. code-block:: javascript

            var ek = function(a) {
                return function() {
                    return a;
                };
            }
        """
        return lambda: value

    def _xr(self, a, b):
        """Scramble the integer ``a`` following the instruction string ``b``.

        ``b`` is read in groups of three characters:

        * 1st: ``'+'`` adds the shifted value to ``a`` (kept to 32 bits),
          anything else XORs it into ``a``
        * 2nd: ``'+'`` shifts with ``rshift`` (presumably an unsigned right
          shift - confirm in googletrans.utils), anything else shifts left
        * 3rd: shift amount; a digit, or a letter where ``'a'`` means 10
        """
        for c in range(0, len(b) - 2, 3):
            d = b[c + 2]
            d = ord(d[0]) - 87 if 'a' <= d else int(d)
            d = rshift(a, d) if '+' == b[c + 1] else a << d
            a = a + d & 4294967295 if '+' == b[c] else a ^ d
        return a

    @staticmethod
    def _utf8_values(text):
        """Return the UTF-8 byte values of ``text`` as a list of ints.

        A high surrogate immediately followed by a low surrogate is combined
        into a single code point first; any other surrogate is encoded as an
        ordinary 3-byte sequence.
        """
        values = []
        size = len(text)
        pos = 0
        while pos < size:
            cp = ord(text[pos])
            pos += 1
            if (cp & 0xFC00) == 0xD800 and pos < size and \
                    (ord(text[pos]) & 0xFC00) == 0xDC00:
                # consume the low surrogate as part of this code point
                cp = 0x10000 + ((cp & 0x3FF) << 10) + (ord(text[pos]) & 0x3FF)
                pos += 1

            if cp < 0x80:
                values.append(cp)
            elif cp < 0x800:
                values.append((cp >> 6) | 0xC0)
                values.append((cp & 0x3F) | 0x80)
            elif cp < 0x10000:
                values.append((cp >> 12) | 0xE0)
                values.append(((cp >> 6) & 0x3F) | 0x80)
                values.append((cp & 0x3F) | 0x80)
            else:
                values.append((cp >> 18) | 0xF0)
                values.append(((cp >> 12) & 0x3F) | 0x80)
                values.append(((cp >> 6) & 0x3F) | 0x80)
                values.append((cp & 0x3F) | 0x80)
        return values

    def acquire(self, text):
        """Compute the token for ``text`` from the current ``self.tkk``.

        :param text: the text that is going to be translated
        :return: the token as a string of the form ``'<a>.<a xor seed>'``
        """
        seed = self.tkk if self.tkk != '0' else ''
        d = seed.split('.')
        # without a 'x.y' shaped seed both halves count as zero
        b = int(d[0]) if len(d) > 1 else 0

        a = b
        for value in self._utf8_values(text):
            a += value
            a = self._xr(a, '+-a^+6')
        a = self._xr(a, '+-3^+b+-f')
        a ^= int(d[1]) if len(d) > 1 else 0
        if a < 0:  # pragma: nocover
            a = (a & 2147483647) + 2147483648
        a %= 1000000  # int(1E6)

        return '{}.{}'.format(a, a ^ b)

    def do(self, text):
        """Refresh the seed if needed and return the token for ``text``."""
        self._update()
        tk = self.acquire(text)
        return tk