wmt-2020-pl-en/googletrans/gtoken.py

# -*- coding: utf-8 -*-
import ast
import math
import re
import time

import requests


from googletrans.compat import PY3
from googletrans.compat import unicode
from googletrans.utils import rshift


class TokenAcquirer(object):
    """Google Translate API token generator

    translate.google.com uses a token to authorize the requests. If you are
    not Google, you do not have this token and will have to pay for use.
    This class is the result of reverse engineering on the obfuscated and
    minified code used by Google to generate such token.

    The token is based on a seed which is updated once per hour and on the
    text that will be translated.
    Both are combined - by some strange math - in order to generate a final
    token (e.g. 744915.856682) which is used by the API to validate the
    request.

    This operation will cause an additional request to get an initial
    token from translate.google.com.

    Example usage (the actual value depends on the current seed):
        >>> from googletrans.gtoken import TokenAcquirer
        >>> acquirer = TokenAcquirer()
        >>> text = 'test'
        >>> tk = acquirer.do(text)
        >>> tk
        '950629.577246'
    """

    RE_TKK = re.compile(r'TKK=eval\(\'\(\(function\(\)\{(.+?)\}\)\(\)\)\'\);',
                        re.DOTALL)

    # Binary operator AST nodes that may combine the two seed parts, paired
    # with the arithmetic they stand for. ``ast.Add`` is the default and is
    # therefore not listed.
    _BINARY_OPS = (
        (ast.Sub, lambda x, y: x - y),
        (ast.Mult, lambda x, y: x * y),
        (ast.Pow, lambda x, y: x ** y),
        (ast.BitXor, lambda x, y: x ^ y),
    )

    def __init__(self, tkk='0', session=None):
        """Create an acquirer.

        :param tkk: initial seed in ``'<hours>.<value>'`` form; the default
                    ``'0'`` forces a refresh on the first :meth:`do` call.
        :param session: object used to fetch the seed (its ``get`` method
                        is called); a new ``requests.Session`` is created
                        when omitted.
        """
        self.session = session or requests.Session()
        self.tkk = tkk

    def _update(self):
        """Refresh ``self.tkk`` from translate.google.com when it is stale.

        Nothing is fetched while the integer part of the current seed
        equals the current hour count since the epoch.

        :raises AttributeError: if ``RE_TKK`` does not match the page
                                (``search`` returns ``None``).
        """
        # we don't need to update the base TKK value when it is still valid
        now = math.floor(int(time.time() * 1000) / 3600000.0)
        if self.tkk and int(self.tkk.split('.')[0]) == now:
            return

        r = self.session.get('https://translate.google.com')
        # this will be the same as python code after stripping out a reserved
        # word 'var'
        code = unicode(self.RE_TKK.search(r.text).group(1)).replace('var ', '')
        # unescape special ascii characters such like a \x3d(=)
        if PY3:  # pragma: no cover
            code = code.encode().decode('unicode-escape')
        else:  # pragma: no cover
            code = code.decode('string_escape')

        if code:
            self.tkk = self._parse_tkk(code)

    def _parse_tkk(self, code):
        """Compute the seed string from the extracted seed snippet.

        ``code`` is expected to look like
        ``a=<number>;b=<number>;return <number>+'.'+(a+b)``. It is only
        parsed with :mod:`ast`, never executed.

        :param code: the seed snippet, already unescaped and without 'var'.
        :returns: ``'<n>.<value>'`` where ``n`` is the last numeric literal
                  met after the ``return`` statement and ``value`` is ``a``
                  and ``b`` combined with the detected operator.
        """
        tree = ast.parse(code)
        visit_return = False
        combine = lambda x, y: x + y  # noqa: E731 - default operator is '+'
        n, keys = 0, dict(a=0, b=0)
        for node in ast.walk(tree):
            if isinstance(node, ast.Assign):
                # targets that are not plain names have no ``id``; skip them
                name = getattr(node.targets[0], 'id', None)
                if name in keys:
                    number = self._number(node.value)
                    if number is not None:
                        keys[name] = number
                    # the value can sometimes be negative
                    elif isinstance(node.value, ast.UnaryOp) and \
                            isinstance(node.value.op, ast.USub):  # pragma: nocover
                        operand = self._number(node.value.operand)
                        if operand is not None:
                            keys[name] = -operand
                continue
            if isinstance(node, ast.Return):
                # parameters should be set after this point
                visit_return = True
                continue
            if not visit_return:
                continue

            number = self._number(node)
            if number is not None:
                n = number
            elif n > 0:
                # the default operator is '+' but implement some more for
                # all possible scenarios
                for node_type, candidate in self._BINARY_OPS:
                    if isinstance(node, node_type):  # pragma: nocover
                        combine = candidate
                        break

        # plain arithmetic on the parsed numbers; no eval() needed
        value = combine(keys['a'], keys['b'])
        return '{}.{}'.format(n, value)

    @staticmethod
    def _number(node):
        """Return the numeric literal held by an AST node, else ``None``.

        Handles both ``ast.Constant`` (current parsers) and the legacy
        ``Num`` node without touching the deprecated ``ast.Num`` name.
        """
        constant_type = getattr(ast, 'Constant', None)
        if constant_type is not None and isinstance(node, constant_type):
            value = node.value
            # bool is an int subclass but is not a numeric literal here
            if isinstance(value, bool) or not isinstance(value, (int, float)):
                return None
            return value
        if type(node).__name__ == 'Num':
            return node.n
        return None

    def _lazy(self, value):
        """like lazy evaluation, this method returns a lambda function that
        returns value given.
        We won't be needing this because this seems to have been built for
        code obfuscation.

        the original code of this method is as follows:

           .. code-block:: javascript

               var ek = function(a) {
                return function() {
                    return a;
                };
               }
        """
        return lambda: value

    def _xr(self, a, b):
        """Mix ``a`` according to the operation string ``b``.

        ``b`` is consumed three characters at a time:

        1. ``'+'`` adds the shifted value (result masked to 32 bits), any
           other character XORs it;
        2. ``'+'`` shifts right via ``rshift``, any other character shifts
           left;
        3. the shift amount: a decimal digit, or a letter from ``'a'`` on
           meaning 10, 11, ...

        :returns: the mixed integer.
        """
        for start in range(0, len(b) - 2, 3):
            amount_char = b[start + 2]
            if 'a' <= amount_char:
                amount = ord(amount_char[0]) - 87
            else:
                amount = int(amount_char)
            if '+' == b[start + 1]:
                shifted = rshift(a, amount)
            else:
                shifted = a << amount
            if '+' == b[start]:
                a = (a + shifted) & 4294967295
            else:
                a = a ^ shifted
        return a

    @staticmethod
    def _utf8_values(text):
        """Return the UTF-8 byte values of ``text`` as a list of ints.

        A high surrogate directly followed by a low surrogate is combined
        into one code point first, so strings made of UTF-16 code units and
        strings made of full code points give the same result. An unpaired
        surrogate is emitted as a three byte sequence.
        """
        values = []
        size = len(text)
        pos = 0
        while pos < size:
            code = ord(text[pos])
            pos += 1
            # merge a surrogate pair and skip its second half
            if (code & 64512) == 55296 and pos < size:
                low = ord(text[pos])
                if (low & 64512) == 56320:
                    code = 65536 + ((code & 1023) << 10) + (low & 1023)
                    pos += 1

            if code < 128:
                values.append(code)
            elif code < 2048:
                values.append((code >> 6) | 192)
                values.append((code & 63) | 128)
            elif code < 65536:
                values.append((code >> 12) | 224)
                values.append(((code >> 6) & 63) | 128)
                values.append((code & 63) | 128)
            else:
                values.append((code >> 18) | 240)
                values.append(((code >> 12) & 63) | 128)
                values.append(((code >> 6) & 63) | 128)
                values.append((code & 63) | 128)
        return values

    def acquire(self, text):
        """Compute the token for ``text`` from the current ``self.tkk``.

        :param text: the text that will be translated.
        :returns: the token as a ``'<a>.<b>'`` string.
        """
        b = self.tkk if self.tkk != '0' else ''
        d = b.split('.')
        # a seed without a '.' counts as "no seed": both parts become 0
        b = int(d[0]) if len(d) > 1 else 0

        a = b
        for value in self._utf8_values(text):
            a += value
            a = self._xr(a, '+-a^+6')
        a = self._xr(a, '+-3^+b+-f')
        a ^= int(d[1]) if len(d) > 1 else 0
        if a < 0:  # pragma: nocover
            # reinterpret a negative result as an unsigned 32 bit value
            a = (a & 2147483647) + 2147483648
        a %= 1000000  # int(1E6)

        return '{}.{}'.format(a, a ^ b)

    def do(self, text):
        """Refresh the seed if needed and return the token for ``text``."""
        self._update()
        tk = self.acquire(text)
        return tk
Add google translate token generator 2017-03-10 11:19:26 +01:00			`# -- coding: utf-8 --`
			`import ast`
			`import math`
			`import re`
			`import time`

			`import requests`


			`from googletrans.compat import PY3`
			`from googletrans.compat import unicode`
			`from googletrans.utils import rshift`


			`class TokenAcquirer(object):`
			`"""Google Translate API token generator`

			`translate.google.com uses a token to authorize the requests. If you are`
			`not Google, you do not have this token and will have to pay for use.`
			`This class is the result of reverse engineering on the obfuscated and`
			`minified code used by Google to generate such token.`

			`The token is based on a seed which is updated once per hour and on the`
			`text that will be translated.`
			`Both are combined - by some strange math - in order to generate a final`
			`token (e.g. 744915.856682) which is used by the API to validate the`
			`request.`

			`This operation will cause an additional request to get an initial`
			`token from translate.google.com.`

Update documentation with a new sphinx theme 'solar' 2017-03-15 13:42:54 +01:00			`Example usage:`
			`>>> from googletrans.gtoken import TokenAcquirer`
Add google translate token generator 2017-03-10 11:19:26 +01:00			`>>> acquirer = TokenAcquirer()`
			`>>> text = 'test'`
			`>>> tk = acquirer.do(text)`
Update documentation with a new sphinx theme 'solar' 2017-03-15 13:42:54 +01:00			`>>> tk`
Add google translate token generator 2017-03-10 11:19:26 +01:00			`950629.577246`
			`"""`

PEP8 2017-03-10 15:16:47 +01:00			`RE_TKK = re.compile(r'TKK=eval\(\'\(\(function\(\)\{(.+?)\}\)\(\)\)\'\);',`
			`re.DOTALL)`
Add google translate token generator 2017-03-10 11:19:26 +01:00
			`def __init__(self, tkk='0', session=None):`
			`self.session = session or requests.Session()`
			`self.tkk = tkk`

			`def _update(self):`
			`"""update tkk`
			`"""`
			`# we don't need to update the base TKK value when it is still valid`
			`now = math.floor(int(time.time() * 1000) / 3600000.0)`
			`if self.tkk and int(self.tkk.split('.')[0]) == now:`
			`return`

			`r = self.session.get('https://translate.google.com')`
			`# this will be the same as python code after stripping out a reserved word 'var'`
Better compatiblity with all python versions 2017-03-10 14:52:00 +01:00			`code = unicode(self.RE_TKK.search(r.text).group(1)).replace('var ', '')`
Add google translate token generator 2017-03-10 11:19:26 +01:00			`# unescape special ascii characters such like a \x3d(=)`
			`if PY3: # pragma: no cover`
			`code = code.encode().decode('unicode-escape')`
			`else: # pragma: no cover`
			`code = code.decode('string_escape')`

			`if code:`
			`tree = ast.parse(code)`
			`visit_return = False`
			`operator = '+'`
			`n, keys = 0, dict(a=0, b=0)`
			`for node in ast.walk(tree):`
			`if isinstance(node, ast.Assign):`
			`name = node.targets[0].id`
			`if name in keys:`
			`if isinstance(node.value, ast.Num):`
			`keys[name] = node.value.n`
			`# the value can sometimes be negative`
			`elif isinstance(node.value, ast.UnaryOp) and \`
Better compatiblity with all python versions 2017-03-10 14:52:00 +01:00			`isinstance(node.value.op, ast.USub): # pragma: nocover`
Add google translate token generator 2017-03-10 11:19:26 +01:00			`keys[name] = -node.value.operand.n`
			`elif isinstance(node, ast.Return):`
			`# parameters should be set after this point`
			`visit_return = True`
			`elif visit_return and isinstance(node, ast.Num):`
			`n = node.n`
			`elif visit_return and n > 0:`
			`# the default operator is '+' but implement some more for`
			`# all possible scenarios`
			`if isinstance(node, ast.Add): # pragma: nocover`
			`pass`
			`elif isinstance(node, ast.Sub): # pragma: nocover`
			`operator = '-'`
			`elif isinstance(node, ast.Mult): # pragma: nocover`
			`operator = '*'`
			`elif isinstance(node, ast.Pow): # pragma: nocover`
			`operator = '**'`
			`elif isinstance(node, ast.BitXor): # pragma: nocover`
			`operator = '^'`
			`# a safety way to avoid Exceptions`
PEP8 2017-03-10 15:16:47 +01:00			`clause = compile('{1}{0}{2}'.format(`
			`operator, keys['a'], keys['b']), '', 'eval')`
Add google translate token generator 2017-03-10 11:19:26 +01:00			`value = eval(clause, dict(__builtin__={}))`
			`result = '{}.{}'.format(n, value)`

			`self.tkk = result`

			`def _lazy(self, value):`
			`"""like lazy evaluation, this method returns a lambda function that`
			`returns value given.`
			`We won't be needing this because this seems to have been built for`
			`code obfuscation.`

			`the original code of this method is as follows:`

			`... code-block: javascript`

			`var ek = function(a) {`
			`return function() {`
			`return a;`
			`};`
			`}`
			`"""`
			`return lambda: value`

			`def _xr(self, a, b):`
			`size_b = len(b)`
			`c = 0`
			`while c < size_b - 2:`
			`d = b[c + 2]`
			`d = ord(d[0]) - 87 if 'a' <= d else int(d)`
			`d = rshift(a, d) if '+' == b[c + 1] else a << d`
			`a = a + d & 4294967295 if '+' == b[c] else a ^ d`

			`c += 3`
			`return a`

			`def acquire(self, text):`
			`b = self.tkk if self.tkk != '0' else ''`
			`d = b.split('.')`
			`b = int(d[0]) if len(d) > 1 else 0`

			`# assume e means char code array`
			`e = []`
			`g = 0`
			`size = len(text)`
			`for i, char in enumerate(text):`
			`l = ord(char)`
			`# just append if l is less than 128(ascii: DEL)`
			`if l < 128:`
			`e.append(l)`
			`# append calculated value if l is less than 2048`
			`else:`
Fix invalid tokenizer due to the complexity of the obfuscated code This commit solves #14 and bumps the version to 2.1.2 2017-04-07 15:43:48 +02:00			`if l < 2048:`
			`e.append(l >> 6 \| 192)`
			`else:`
			`# append calculated value if l matches special condition`
			`if (l & 64512) == 55296 and g + 1 < size and \`
			`ord(text[g + 1]) & 64512 == 56320:`
			`g += 1`
			`l = 65536 + ((l & 1023) << 10) + ord(text[g]) & 1023`
			`e.append(l >> 18 \| 240)`
			`e.append(l >> 12 & 63 \| 128)`
			`else:`
			`e.append(l >> 12 \| 224)`
			`e.append(l >> 6 & 63 \| 128)`
Add google translate token generator 2017-03-10 11:19:26 +01:00			`e.append(l & 63 \| 128)`
			`a = b`
			`for i, value in enumerate(e):`
			`a += value`
			`a = self._xr(a, '+-a^+6')`
			`a = self._xr(a, '+-3^+b+-f')`
			`a ^= int(d[1]) if len(d) > 1 else 0`
			`if a < 0: # pragma: nocover`
			`a = (a & 2147483647) + 2147483648`
			`a %= 1000000 # int(1E6)`

			`return '{}.{}'.format(a, a ^ b)`

			`def do(self, text):`
			`self._update()`
			`tk = self.acquire(text)`
			`return tk`