wmt-2020-pl-en/googletrans/conversion.py

96 lines
2.3 KiB
Python

"""A conversion module for googletrans"""
from __future__ import print_function
import re
import traceback
import json
# Matches one complete JSON string literal. Backslash escapes are consumed as
# a unit so that an escaped quote (\") does not terminate the literal early.
# The capturing group makes ``split`` keep the literals in its result.
_JSON_STRING_RE = re.compile(r'("(?:\\.|[^"\\])*")', re.DOTALL)


def _fill_empty_slots(fragment):
    """Insert ``null`` into the empty array slots of a string-free fragment."""
    # One ``replace`` pass skips overlapping matches (',,,' -> ',null,,'),
    # so repeat until no empty slot is left.
    while ',,' in fragment:
        fragment = fragment.replace(',,', ',null,')
    # '[null,' can never form a new '[,', so a single pass is sufficient.
    return fragment.replace('[,', '[null,')


def format_json(original):
    """Parse JSON-like text whose arrays may contain empty slots.

    Sequences such as ``[,`` and ``,,`` (an array element that was left
    out) are not valid JSON.  They are rewritten to explicit ``null``
    values before the text is handed to ``json.loads``.  Only text
    *outside* string literals is rewritten; the contents of string
    literals are passed through untouched, including any ``,,`` or ``[,``
    they contain.

    :param original: the JSON-like text to parse.
    :return: the decoded Python object (empty slots become ``None``).
    :raises ValueError: if the patched text still is not valid JSON.  A
        line showing the original and the patched text is printed first
        to help diagnose the failure.
    """
    # Splitting on a pattern with a capturing group alternates the result:
    # even indices hold text outside string literals, odd indices hold the
    # string literals themselves.
    pieces = _JSON_STRING_RE.split(original)
    for index in range(0, len(pieces), 2):
        pieces[index] = _fill_empty_slots(pieces[index])
    text = ''.join(pieces)

    try:
        return json.loads(text)
    except ValueError:
        print('original text: ', original, ' => ', text)
        # Propagate the parse error (traceback included) instead of falling
        # through to a return of a variable that was never assigned.
        raise
# Lookup table from language code to lowercase English language name
# (multi-word names are joined with underscores).
# NOTE(review): the codes are presumably the ones the translation service
# accepts (e.g. 'iw' for Hebrew, region-tagged 'zh-CN'/'zh-TW') -- confirm
# against the caller before adding or renaming entries.
LANGUAGES = {
    'af': 'afrikaans',
    'sq': 'albanian',
    'ar': 'arabic',
    'be': 'belarusian',
    'bg': 'bulgarian',
    'ca': 'catalan',
    'zh-CN': 'chinese_simplified',
    'zh-TW': 'chinese_traditional',
    'hr': 'croatian',
    'cs': 'czech',
    'da': 'danish',
    'nl': 'dutch',
    'en': 'english',
    'eo': 'esperanto',
    'et': 'estonian',
    'tl': 'filipino',
    'fi': 'finnish',
    'fr': 'french',
    'gl': 'galician',
    'de': 'german',
    'el': 'greek',
    'iw': 'hebrew',
    'hi': 'hindi',
    'hu': 'hungarian',
    'is': 'icelandic',
    'id': 'indonesian',
    'ga': 'irish',
    'it': 'italian',
    'ja': 'japanese',
    'ko': 'korean',
    'la': 'latin',
    'lv': 'latvian',
    'lt': 'lithuanian',
    'mk': 'macedonian',
    'ms': 'malay',
    'mt': 'maltese',
    'no': 'norwegian',
    'fa': 'persian',
    'pl': 'polish',
    'pt': 'portuguese',
    'ro': 'romanian',
    'ru': 'russian',
    'sr': 'serbian',
    'sk': 'slovak',
    'sl': 'slovenian',
    'es': 'spanish',
    'sw': 'swahili',
    'sv': 'swedish',
    'th': 'thai',
    'tr': 'turkish',
    'uk': 'ukrainian',
    'vi': 'vietnamese',
    'cy': 'welsh',
    'yi': 'yiddish',
}