wmt-2020-pl-en/googletrans/conversion.py

"""A conversion module for googletrans"""
from __future__ import print_function
import re
import traceback
import json

def format_json(original):
    """Parse a JSON-like string whose arrays may contain empty slots.

    The input looks like JSON except that array elements may be missing
    altogether, e.g. ``[1,,"a"]`` or ``[,2]``.  Every empty slot that
    follows a ``,`` or a ``[`` is filled with ``null`` and the repaired text
    is decoded with ``json.loads``.  The contents of double-quoted string
    literals are never modified, even when they contain ``,,`` or ``[,``.

    :param original: the raw text to convert.
    :return: the decoded Python object.
    :raises ValueError: if the repaired text is still not valid JSON.  The
        original text, the repaired text and the traceback are printed
        before the exception is re-raised.
    """
    # Splitting on a capturing group keeps the delimiters in the result:
    # odd indices hold complete string literals (backslash escapes such as
    # \" are honoured), even indices hold the structural text between them.
    pieces = re.split(r'("(?:\\.|[^"\\])*")', original)

    for idx in range(0, len(pieces), 2):
        chunk = pieces[idx]
        # A single replace() pass skips overlapping matches (',,,' becomes
        # ',null,,'), so repeat until no hole is left.
        while ',,' in chunk:
            chunk = chunk.replace(',,', ',null,')
        while '[,' in chunk:
            chunk = chunk.replace('[,', '[null,')
        pieces[idx] = chunk

    text = ''.join(pieces)

    try:
        return json.loads(text)
    except ValueError:
        print('original text: ', original, ' => ', text)
        traceback.print_exc()
        # Propagate the real parsing error instead of failing later on an
        # unbound result variable.
        raise

# Lookup table: language code -> lowercase English language name.
# Entries are listed alphabetically by language name.
LANGUAGES = {
    'af': 'afrikaans', 'sq': 'albanian', 'ar': 'arabic',
    'be': 'belarusian', 'bg': 'bulgarian', 'ca': 'catalan',
    'zh-CN': 'chinese_simplified', 'zh-TW': 'chinese_traditional',
    'hr': 'croatian', 'cs': 'czech', 'da': 'danish',
    'nl': 'dutch', 'en': 'english', 'eo': 'esperanto',
    'et': 'estonian', 'tl': 'filipino', 'fi': 'finnish',
    'fr': 'french', 'gl': 'galician', 'de': 'german',
    'el': 'greek', 'iw': 'hebrew', 'hi': 'hindi',
    'hu': 'hungarian', 'is': 'icelandic', 'id': 'indonesian',
    'ga': 'irish', 'it': 'italian', 'ja': 'japanese',
    'ko': 'korean', 'la': 'latin', 'lv': 'latvian',
    'lt': 'lithuanian', 'mk': 'macedonian', 'ms': 'malay',
    'mt': 'maltese', 'no': 'norwegian', 'fa': 'persian',
    'pl': 'polish', 'pt': 'portuguese', 'ro': 'romanian',
    'ru': 'russian', 'sr': 'serbian', 'sk': 'slovak',
    'sl': 'slovenian', 'es': 'spanish', 'sw': 'swahili',
    'sv': 'swedish', 'th': 'thai', 'tr': 'turkish',
    'uk': 'ukrainian', 'vi': 'vietnamese', 'cy': 'welsh',
    'yi': 'yiddish',
}
add conversion module to process in a better way at least 2015-06-06 07:43:01 +02:00			`"""A conversion module for googletrans"""`
Add languages, and print excetion with details 2015-07-06 07:31:08 +02:00			`from __future__ import print_function`
add conversion module to process in a better way at least 2015-06-06 07:43:01 +02:00			`import re`
Add languages, and print excetion with details 2015-07-06 07:31:08 +02:00			`import traceback`
add conversion module to process in a better way at least 2015-06-06 07:43:01 +02:00			`import json`

Add languages, and print excetion with details 2015-07-06 07:31:08 +02:00			`def format_json(original):`
add conversion module to process in a better way at least 2015-06-06 07:43:01 +02:00			`# save state`
			`states = []`
Add languages, and print excetion with details 2015-07-06 07:31:08 +02:00			`text = original`
add conversion module to process in a better way at least 2015-06-06 07:43:01 +02:00			`for i, pos in enumerate(re.finditer('"', text)):`
			`p = pos.start() + 1`
			`if i % 2 == 0:`
			`nxt = text.find('"', p)`
			`states.append((p, text[p:nxt]))`

			`# replace all weired characters in text`
			`while text.find(',,') > -1:`
			`text = text.replace(',,', ',null,')`
			`while text.find('[,') > -1:`
			`text = text.replace('[,', '[null,')`

			`# recover state`
			`for i, pos in enumerate(re.finditer('"', text)):`
			`p = pos.start() + 1`
			`if i % 2 == 0:`
			`j = int(i / 2)`
			`nxt = text.find('"', p)`
			`# replacing a portion of a string`
			`# use slicing to extract those parts of the original string to be kept`
			`text = text[:p] + states[j][1] + text[nxt:]`

Add languages, and print excetion with details 2015-07-06 07:31:08 +02:00			`try:`
			`converted = json.loads(text)`
			`except ValueError as e:`
			`print('original text: ', original, ' => ', text)`
			`traceback.print_exc()`
add conversion module to process in a better way at least 2015-06-06 07:43:01 +02:00			`return converted`
Add languages, and print excetion with details 2015-07-06 07:31:08 +02:00
			`LANGUAGES = {`
			`'af': 'afrikaans',`
			`'sq': 'albanian',`
			`'ar': 'arabic',`
			`'be': 'belarusian',`
			`'bg': 'bulgarian',`
			`'ca': 'catalan',`
			`'zh-CN': 'chinese_simplified',`
			`'zh-TW': 'chinese_traditional',`
			`'hr': 'croatian',`
			`'cs': 'czech',`
			`'da': 'danish',`
			`'nl': 'dutch',`
			`'en': 'english',`
			`'eo': 'esperanto',`
			`'et': 'estonian',`
			`'tl': 'filipino',`
			`'fi': 'finnish',`
			`'fr': 'french',`
			`'gl': 'galician',`
			`'de': 'german',`
			`'el': 'greek',`
			`'iw': 'hebrew',`
			`'hi': 'hindi',`
			`'hu': 'hungarian',`
			`'is': 'icelandic',`
			`'id': 'indonesian',`
			`'ga': 'irish',`
			`'it': 'italian',`
			`'ja': 'japanese',`
			`'ko': 'korean',`
			`'la': 'latin',`
			`'lv': 'latvian',`
			`'lt': 'lithuanian',`
			`'mk': 'macedonian',`
			`'ms': 'malay',`
			`'mt': 'maltese',`
			`'no': 'norwegian',`
			`'fa': 'persian',`
			`'pl': 'polish',`
			`'pt': 'portuguese',`
			`'ro': 'romanian',`
			`'ru': 'russian',`
			`'sr': 'serbian',`
			`'sk': 'slovak',`
			`'sl': 'slovenian',`
			`'es': 'spanish',`
			`'sw': 'swahili',`
			`'sv': 'swedish',`
			`'th': 'thai',`
			`'tr': 'turkish',`
			`'uk': 'ukrainian',`
			`'vi': 'vietnamese',`
			`'cy': 'welsh',`
			`'yi': 'yiddish',`
			`}`