From 759a0baf46ce4f4da97707d14918606df264cf36 Mon Sep 17 00:00:00 2001 From: Sarah Fletcher Date: Wed, 19 Dec 2018 07:15:37 -0800 Subject: [PATCH] fix: token generation for text > 0x10000 (#75) * Fix token generation for text > 0x10000 Also minor optimizations. * Missed a var rename --- googletrans/gtoken.py | 24 +++++++++++++++++++----- 1 file changed, 19 insertions(+), 5 deletions(-) diff --git a/googletrans/gtoken.py b/googletrans/gtoken.py index 0d23a09..407aba3 100644 --- a/googletrans/gtoken.py +++ b/googletrans/gtoken.py @@ -135,6 +135,19 @@ class TokenAcquirer(object): return a def acquire(self, text): + a = [] + # Convert text to ints + for i in text: + val = ord(i) + if val < 0x10000: + a += [val] + else: + # Python doesn't natively use Unicode surrogates, so account for those + a += [ + math.floor((val - 0x10000)/0x400 + 0xD800), + math.floor((val - 0x10000)%0x400 + 0xDC00) + ] + b = self.tkk if self.tkk != '0' else '' d = b.split('.') b = int(d[0]) if len(d) > 1 else 0 @@ -143,8 +156,8 @@ class TokenAcquirer(object): e = [] g = 0 size = len(text) - for i, char in enumerate(text): - l = ord(char) + while g < size: + l = a[g] # just append if l is less than 128(ascii: DEL) if l < 128: e.append(l) @@ -155,15 +168,16 @@ class TokenAcquirer(object): else: # append calculated value if l matches special condition if (l & 64512) == 55296 and g + 1 < size and \ - ord(text[g + 1]) & 64512 == 56320: + a[g + 1] & 64512 == 56320: g += 1 - l = 65536 + ((l & 1023) << 10) + ord(text[g]) & 1023 + l = 65536 + ((l & 1023) << 10) + (a[g] & 1023) # This bracket is important e.append(l >> 18 | 240) e.append(l >> 12 & 63 | 128) else: e.append(l >> 12 | 224) e.append(l >> 6 & 63 | 128) - e.append(l & 63 | 128) + e.append(l & 63 | 128) + g += 1 a = b for i, value in enumerate(e): a += value