fix: token generation for text containing code points >= 0x10000 (#75)

* Fix token generation for text containing code points >= 0x10000 (characters outside the Basic Multilingual Plane)

Also minor optimizations.

* Missed a var rename
This commit is contained in:
Sarah Fletcher 2018-12-19 07:15:37 -08:00 committed by SuHun Han
parent 48653332ef
commit 759a0baf46

View File

@@ -135,6 +135,19 @@ class TokenAcquirer(object):
return a return a
def acquire(self, text): def acquire(self, text):
a = []
# Convert text to ints
for i in text:
val = ord(i)
if val < 0x10000:
a += [val]
else:
# Python doesn't natively use Unicode surrogates, so account for those
a += [
math.floor((val - 0x10000)/0x400 + 0xD800),
math.floor((val - 0x10000)%0x400 + 0xDC00)
]
b = self.tkk if self.tkk != '0' else '' b = self.tkk if self.tkk != '0' else ''
d = b.split('.') d = b.split('.')
b = int(d[0]) if len(d) > 1 else 0 b = int(d[0]) if len(d) > 1 else 0
@@ -143,8 +156,8 @@ class TokenAcquirer(object):
e = [] e = []
g = 0 g = 0
size = len(text) size = len(text)
for i, char in enumerate(text): while g < size:
l = ord(char) l = a[g]
# just append if l is less than 128(ascii: DEL) # just append if l is less than 128(ascii: DEL)
if l < 128: if l < 128:
e.append(l) e.append(l)
@@ -155,15 +168,16 @@ class TokenAcquirer(object):
else: else:
# append calculated value if l matches special condition # append calculated value if l matches special condition
if (l & 64512) == 55296 and g + 1 < size and \ if (l & 64512) == 55296 and g + 1 < size and \
ord(text[g + 1]) & 64512 == 56320: a[g + 1] & 64512 == 56320:
g += 1 g += 1
l = 65536 + ((l & 1023) << 10) + ord(text[g]) & 1023 l = 65536 + ((l & 1023) << 10) + (a[g] & 1023) # This bracket is important
e.append(l >> 18 | 240) e.append(l >> 18 | 240)
e.append(l >> 12 & 63 | 128) e.append(l >> 12 & 63 | 128)
else: else:
e.append(l >> 12 | 224) e.append(l >> 12 | 224)
e.append(l >> 6 & 63 | 128) e.append(l >> 6 & 63 | 128)
e.append(l & 63 | 128) e.append(l & 63 | 128)
g += 1
a = b a = b
for i, value in enumerate(e): for i, value in enumerate(e):
a += value a += value