649ba7d828
refs #2
100 lines
2.3 KiB
Python
100 lines
2.3 KiB
Python
"""A conversion module for googletrans"""
|
|
from __future__ import print_function
|
|
import re
|
|
import traceback
|
|
import json
|
|
|
|
# Matches one complete double-quoted string literal.  Backslash escapes are
# consumed as a unit so that an escaped quote (\") does not end the literal.
# The capturing group makes re.split() keep the literals in its result.
_JSON_STRING_RE = re.compile(r'("(?:\\.|[^"\\])*")')


def format_json(original):
    """Parse JSON-like text whose arrays may contain empty elements.

    The input is treated as JSON in which array elements may be omitted,
    e.g. ``[1,,"a"]`` or ``[,2]``.  Every omitted element is filled in with
    ``null`` and the result is decoded with ``json.loads``.

    Only text *outside* double-quoted string literals is rewritten, so a
    string value such as ``"a,,b"`` or ``"x[,y"`` is returned unchanged.

    :param original: the text to convert.
    :return: the decoded Python object (empty elements become ``None``).
    :raises ValueError: if the text is still not valid JSON after the empty
        elements have been filled in.  The original and the rewritten text
        are printed before the error is re-raised.
    """
    # Splitting on a pattern with a capturing group yields alternating
    # pieces: even indexes hold the text between string literals, odd
    # indexes hold the string literals themselves (left untouched).
    pieces = _JSON_STRING_RE.split(original)
    for index in range(0, len(pieces), 2):
        outside = pieces[index]
        # Repeat until stable: a single pass over ',,,' yields ',null,,',
        # which still contains an empty element.
        while ',,' in outside:
            outside = outside.replace(',,', ',null,')
        # An empty first element: '[,' -> '[null,'.
        outside = outside.replace('[,', '[null,')
        pieces[index] = outside
    text = ''.join(pieces)

    try:
        return json.loads(text)
    except ValueError:
        # Show what was attempted, then let the caller see the real parse
        # error instead of failing later on an unbound local variable.
        print('original text: ', original, ' => ', text)
        raise
|
|
|
|
# Code aliases: maps a code that is not a key of LANGUAGES ('ee') to one
# that is ('et', i.e. 'estonian').
# NOTE(review): presumably applied by callers before a LANGUAGES lookup;
# the usage is not visible in this file -- confirm.
SPECIAL_CASES = {
    'ee': 'et',
}
|
|
|
|
# Language code -> lowercase English language name.  Multi-word names use
# an underscore instead of a space (e.g. 'chinese_simplified').
# Entries are ordered alphabetically by language name, not by code.
LANGUAGES = {
    'af': 'afrikaans',
    'sq': 'albanian',
    'ar': 'arabic',
    'be': 'belarusian',
    'bg': 'bulgarian',
    'ca': 'catalan',
    'zh-CN': 'chinese_simplified',
    'zh-TW': 'chinese_traditional',
    'hr': 'croatian',
    'cs': 'czech',
    'da': 'danish',
    'nl': 'dutch',
    'en': 'english',
    'eo': 'esperanto',
    'et': 'estonian',
    'tl': 'filipino',
    'fi': 'finnish',
    'fr': 'french',
    'gl': 'galician',
    'de': 'german',
    'el': 'greek',
    'iw': 'hebrew',
    'hi': 'hindi',
    'hu': 'hungarian',
    'is': 'icelandic',
    'id': 'indonesian',
    'ga': 'irish',
    'it': 'italian',
    'ja': 'japanese',
    'ko': 'korean',
    'la': 'latin',
    'lv': 'latvian',
    'lt': 'lithuanian',
    'mk': 'macedonian',
    'ms': 'malay',
    'mt': 'maltese',
    'no': 'norwegian',
    'fa': 'persian',
    'pl': 'polish',
    'pt': 'portuguese',
    'ro': 'romanian',
    'ru': 'russian',
    'sr': 'serbian',
    'sk': 'slovak',
    'sl': 'slovenian',
    'es': 'spanish',
    'sw': 'swahili',
    'sv': 'swedish',
    'th': 'thai',
    'tr': 'turkish',
    'uk': 'ukrainian',
    'vi': 'vietnamese',
    'cy': 'welsh',
    'yi': 'yiddish',
}
|