778b00758e
Co-authored-by: Bastien Vallet <b.vallet@criteo.com>
197 lines
6.8 KiB
Python
197 lines
6.8 KiB
Python
# -*- coding: utf-8 -*-
|
|
import ast
|
|
import math
|
|
import re
|
|
import time
|
|
|
|
import requests
|
|
|
|
from googletrans.utils import rshift
|
|
|
|
|
|
class TokenAcquirer:
    """Google Translate API token generator.

    translate.google.com uses a token to authorize the requests. If you are
    not Google, you do not have this token and will have to pay for use.
    This class is the result of reverse engineering on the obfuscated and
    minified code used by Google to generate such token.

    The token is based on a seed which is updated once per hour and on the
    text that will be translated.
    Both are combined - by some strange math - in order to generate a final
    token (e.g. 744915.856682) which is used by the API to validate the
    request.

    This operation will cause an additional request to get an initial
    token from translate.google.com.

    Example usage (the value depends on the current seed and the text):
        >>> from googletrans.gtoken import TokenAcquirer
        >>> acquirer = TokenAcquirer()
        >>> text = 'test'
        >>> tk = acquirer.do(text)  # doctest: +SKIP
        >>> tk  # doctest: +SKIP
        '950629.577246'
    """

    # Both patterns capture the value of ``tkk:'...'`` in the fetched page.
    # RE_RAWTKK is not used by this class any more; it is kept because it is
    # a public class attribute.
    RE_TKK = re.compile(r'tkk:\'(.+?)\'', re.DOTALL)
    RE_RAWTKK = re.compile(r'tkk:\'(.+?)\'', re.DOTALL)

    def __init__(self, tkk='0', session=None, host='translate.google.com'):
        """Create a token generator.

        :param tkk: initial seed in the form ``'<hour>.<salt>'``; the default
            ``'0'`` means "no seed yet".
        :param session: object with a ``get(url)`` method returning a response
            that has a ``text`` attribute; a new ``requests.Session`` is
            created when omitted.
        :param host: host name or URL of the page the seed is read from.
            ``https://`` is prepended unless an ``http://`` or ``https://``
            scheme is already present.
        """
        self.session = session or requests.Session()
        self.tkk = tkk
        # Test for a real scheme prefix: a plain substring test would treat
        # hosts such as 'httpbin.org' as if they already carried a scheme.
        if host.startswith(('http://', 'https://')):
            self.host = host
        else:
            self.host = 'https://' + host

    def _update(self):
        """Refresh ``self.tkk`` from ``self.host`` unless it is still valid.

        The part of the seed before the dot is compared with the current
        hour number (milliseconds since the epoch divided by 3600000); when
        they are equal no request is made.

        :raises AttributeError: when the fetched page contains no
            ``tkk:'...'`` value. The exception type is kept for backward
            compatibility: this situation used to surface as
            ``'NoneType' object has no attribute 'group'``.
        """
        # we don't need to update the base TKK value when it is still valid
        now = math.floor(int(time.time() * 1000) / 3600000.0)
        if self.tkk and int(self.tkk.split('.')[0]) == now:
            return

        r = self.session.get(self.host)

        raw_tkk = self.RE_TKK.search(r.text)
        if raw_tkk is None:
            # A second search with the same pattern over the same text can
            # never succeed, so fail here with a message that names the
            # actual problem.
            raise AttributeError(
                'could not find a TKK value in the response from '
                '{}'.format(self.host))

        self.tkk = raw_tkk.group(1)

    def _lazy(self, value):
        """like lazy evaluation, this method returns a lambda function that
        returns value given.
        We won't be needing this because this seems to have been built for
        code obfuscation.

        the original code of this method is as follows:

        .. code-block:: javascript

            var ek = function(a) {
                return function() {
                    return a;
                };
            }
        """
        return lambda: value

    def _xr(self, a, b):
        """Mix the integer ``a`` according to the instruction string ``b``.

        ``b`` is read three characters at a time:

        * 1st: ``'+'`` adds the shifted value to ``a`` (result masked to
          32 bits), anything else XORs it into ``a``;
        * 2nd: ``'+'`` shifts with the ``rshift`` helper, anything else
          shifts left;
        * 3rd: shift amount; a decimal digit, or a letter from ``'a'`` on
          standing for ``ord(letter) - 87`` (so ``'a'`` is 10).

        :returns: the mixed integer.
        """
        for pos in range(0, len(b) - 2, 3):
            combine, direction, amount = b[pos], b[pos + 1], b[pos + 2]
            shift = ord(amount[0]) - 87 if 'a' <= amount else int(amount)
            shifted = rshift(a, shift) if direction == '+' else a << shift
            if combine == '+':
                a = (a + shifted) & 4294967295
            else:
                a = a ^ shifted
        return a

    def acquire(self, text):
        """Compute the token for ``text`` from the current ``self.tkk``.

        No request is made here; call ``do`` to refresh the seed first.

        :param text: the string that will be translated.
        :returns: the token as a string of the form ``'<number>.<number>'``.
        """
        # Convert text to UTF-16 code units. Python doesn't natively use
        # Unicode surrogates, so characters outside the BMP are split into a
        # surrogate pair by hand.
        units = []
        for char in text:
            code_point = ord(char)
            if code_point < 0x10000:
                units.append(code_point)
            else:
                offset = code_point - 0x10000
                units.append(0xD800 + (offset >> 10))
                units.append(0xDC00 + (offset & 0x3FF))

        # Seed '<hour>.<salt>'. A seed without a dot (such as the initial
        # '0') or a missing seed contributes 0 for both parts.
        seed_parts = (self.tkk or '').split('.')
        has_seed = len(seed_parts) > 1
        seed_hour = int(seed_parts[0]) if has_seed else 0

        # Turn the code units into UTF-8 byte values.
        encoded = []
        index = 0
        count = len(units)
        while index < count:
            unit = units[index]
            if unit < 0x80:
                # just append if the unit is less than 128 (ascii: DEL)
                encoded.append(unit)
            elif unit < 0x800:
                encoded.append((unit >> 6) | 0xC0)
                encoded.append((unit & 0x3F) | 0x80)
            else:
                if (unit & 0xFC00) == 0xD800 and index + 1 < count and \
                        (units[index + 1] & 0xFC00) == 0xDC00:
                    # high surrogate followed by a low surrogate: merge the
                    # pair back into one code point and emit four bytes
                    index += 1
                    unit = (0x10000 + ((unit & 0x3FF) << 10)
                            + (units[index] & 0x3FF))
                    encoded.append((unit >> 18) | 0xF0)
                    encoded.append(((unit >> 12) & 0x3F) | 0x80)
                else:
                    encoded.append((unit >> 12) | 0xE0)
                encoded.append(((unit >> 6) & 0x3F) | 0x80)
                encoded.append((unit & 0x3F) | 0x80)
            index += 1

        # Fold every byte into the running value, then finalize.
        a = seed_hour
        for value in encoded:
            a += value
            a = self._xr(a, '+-a^+6')
        a = self._xr(a, '+-3^+b+-f')
        a ^= int(seed_parts[1]) if has_seed else 0
        if a < 0:  # pragma: nocover
            a = (a & 2147483647) + 2147483648
        a %= 1000000  # int(1E6)

        return '{}.{}'.format(a, a ^ seed_hour)

    def do(self, text):
        """Refresh the seed if needed and return the token for ``text``.

        :param text: the string that will be translated.
        :returns: the token string produced by ``acquire``.
        :raises AttributeError: when the seed cannot be found in the page
            fetched by ``_update``.
        """
        self._update()
        tk = self.acquire(text)
        return tk
|