fix: token generation for text > 0x10000 (#75)

* Fix token generation for text containing characters with code points >= 0x10000. Also minor optimizations.
* Missed a variable rename.
2018-12-19 07:15:37 -08:00 · 2018-12-19 07:15:37 -08:00 · 759a0baf46
commit 759a0baf46
parent 48653332ef
1 changed files with 19 additions and 5 deletions
--- a/googletrans/gtoken.py
+++ b/googletrans/gtoken.py
@ -135,6 +135,19 @@ class TokenAcquirer(object):
        return a

    def acquire(self, text):
+        a = []
+        # Convert text to ints
+        for i in text:
+            val = ord(i)
+            if val < 0x10000:
+                a += [val]
+            else:
+                # Python doesn't natively use Unicode surrogates, so account for those
+                a += [
+                    math.floor((val - 0x10000)/0x400 + 0xD800),
+                    math.floor((val - 0x10000)%0x400 + 0xDC00)
+                    ]
+
        b = self.tkk if self.tkk != '0' else ''
        d = b.split('.')
        b = int(d[0]) if len(d) > 1 else 0
@ -143,8 +156,8 @@ class TokenAcquirer(object):
        e = []
        g = 0
        size = len(text)
-        for i, char in enumerate(text):
-            l = ord(char)
+        while g < size:
+            l = a[g]
            # just append if l is less than 128(ascii: DEL)
            if l < 128:
                e.append(l)
@ -155,15 +168,16 @@ class TokenAcquirer(object):
                else:
                    # append calculated value if l matches special condition
                    if (l & 64512) == 55296 and g + 1 < size and \
-                            ord(text[g + 1]) & 64512 == 56320:
+                            a[g + 1] & 64512 == 56320:
                        g += 1
-                        l = 65536 + ((l & 1023) << 10) + ord(text[g]) & 1023
+                        l = 65536 + ((l & 1023) << 10) + (a[g] & 1023) # This bracket is important
                        e.append(l >> 18 | 240)
                        e.append(l >> 12 & 63 | 128)
                    else:
                        e.append(l >> 12 | 224)
                    e.append(l >> 6 & 63 | 128)
-                e.append(l & 63 | 128)
+                e.append(l & 63 | 128)   
+            g += 1
        a = b
        for i, value in enumerate(e):
            a += value