wmt-2020-pl-en/googletrans/utils.py

70 lines
1.7 KiB
Python
Raw Normal View History

2017-03-10 11:19:46 +01:00
"""A conversion module for googletrans"""
from __future__ import print_function
import re
import json
2017-03-10 14:44:39 +01:00
def build_params(query, src, dest, token):
params = {
'client': 't',
'sl': src,
'tl': dest,
'hl': dest,
'dt': ['at', 'bd', 'ex', 'ld', 'md', 'qca', 'rw', 'rm', 'ss', 't'],
'ie': 'UTF-8',
'oe': 'UTF-8',
'otf': 1,
'ssel': 0,
'tsel': 0,
'tk': token,
'q': query,
}
return params
def legacy_format_json(original):
2017-03-10 11:19:46 +01:00
# save state
states = []
text = original
# save position for double-quoted texts
2017-03-10 11:19:46 +01:00
for i, pos in enumerate(re.finditer('"', text)):
# pos.start() is a double-quote
2017-03-10 11:19:46 +01:00
p = pos.start() + 1
if i % 2 == 0:
nxt = text.find('"', p)
states.append((p, text[p:nxt]))
# replace all weired characters in text
while text.find(',,') > -1:
text = text.replace(',,', ',null,')
while text.find('[,') > -1:
text = text.replace('[,', '[null,')
# recover state
for i, pos in enumerate(re.finditer('"', text)):
p = pos.start() + 1
if i % 2 == 0:
j = int(i / 2)
nxt = text.find('"', p)
# replacing a portion of a string
# use slicing to extract those parts of the original string to be kept
text = text[:p] + states[j][1] + text[nxt:]
converted = json.loads(text)
return converted
def format_json(original):
try:
converted = json.loads(original)
except ValueError:
converted = legacy_format_json(original)
return converted
2017-03-10 11:19:46 +01:00
def rshift(val, n):
"""python port for '>>>'(right shift with padding)
"""
2017-03-10 14:44:39 +01:00
return (val % 0x100000000) >> n