wmt-2020-pl-en/post_process.py

# -*- coding: utf-8 -*-
from transformers import MarianTokenizer, MarianMTModel
import pickle
import sys
import string
from googletrans import Translator
import morfeusz2
import time
# Shared googletrans client; process_words() falls back to it for tokens it
# cannot resolve from the vocabulary.
trans = Translator()
# Shared Morfeusz instance; process_words() calls its analyse() on every token.
morf = morfeusz2.Morfeusz()

# Lowercase Polish letters that carry diacritics. process_words() passes this
# list to check_letters() to test whether a token contains any of them.
pl_letters = ['ą','ę','ł','ź','ć','ś','ń','ó','ż']
def pl_trans():
    """Copy stdin to stdout, capitalizing every line.

    Trailing whitespace (including the newline) is removed from each input
    line before it is printed. ``str.capitalize`` upper-cases the first
    character and lower-cases every other character of the line.
    """
    for raw in sys.stdin:
        print(raw.rstrip().capitalize())
def process_words(voc):
    """Translate stdin token by token and print one line per input line.

    Every whitespace-separated token is first stripped of punctuation and
    digits, then resolved by the first rule that applies:

    1. the stripped token is a key of ``voc`` and is not a confusion word;
    2. the form reported by Morfeusz for the stripped token is a key of
       ``voc`` and is not a confusion word;
    3. the stripped token is a confusion word and the original token
       contains none of ``pl_letters``: the original token is kept as is;
    4. otherwise the original token is sent to Google Translate (pl -> en).

    Args:
        voc: Mapping from a source word to its replacement string
            (presumably the Polish->English dictionary from
            ``pl_en.pickle`` -- confirm against the caller).

    Each printed line keeps the historical format: every word, including
    the last one, is followed by a single space.
    """
    # Characters removed from a token before any lookup.
    exclude = frozenset('.?!,/-+=)(%0123456789[]:;\'"')
    # Words that must never be replaced through the vocabulary (presumably
    # because they are valid tokens in both languages -- TODO confirm).
    confusion_words = frozenset([
        'on', 'one', 'no', 'my', 'knot', 'but', 'chart', 'prom', 'pup',
        'much', 'lot', 'pan', 'herb', 'dude', 'to', 'wanna', 'unia', 'we',
    ])

    for line in sys.stdin:
        translated = []
        for word in line.split():
            p_word = ''.join(ch for ch in word if ch not in exclude)
            key = p_word.lower()

            # Reset for every token. Previously `lema` was only assigned
            # inside the loop below, so an empty analysis either raised
            # UnboundLocalError (first token) or silently reused the value
            # left over from the previous token.
            lema = None
            # Keep the value from the last interpretation, as before.
            # NOTE(review): Morfeusz 2 documents each result as
            # (start, end, (orth, base, tag, ...)); index 0 would then be the
            # surface form and index 1 the lemma -- confirm which is wanted.
            for _start, _end, interp in morf.analyse(p_word):
                lema = interp[0]

            if p_word in voc and key not in confusion_words:
                en_word = voc[p_word]
            elif (lema is not None
                    and str(lema) in voc
                    and lema.lower() not in confusion_words
                    and lema not in exclude):
                en_word = voc[str(lema)]
            elif key in confusion_words and check_letters(word.lower(), pl_letters):
                # Confusion word without Polish diacritics: leave untouched.
                en_word = word
            else:
                en_word = trans.translate(word, dest='en', src='pl').text
            translated.append(en_word)

        # Trailing space after every word preserves the original output.
        print(''.join(item + ' ' for item in translated))

def check_letters(text,pl):
    """Return True when ``text`` contains none of the characters in ``pl``.

    Args:
        text: String (or any iterable of characters) to inspect.
        pl: Container of characters to look for.

    Returns:
        False as soon as one character of ``text`` is found in ``pl``,
        True otherwise (including for empty ``text``).
    """
    return not any(letter in pl for letter in text)

# Alternative mode (currently disabled): load the pickled vocabulary and run
# the word-by-word translation in process_words() instead of pl_trans().
# NOTE(review): pickle.load executes arbitrary code from the file; only
# re-enable this with a trusted pl_en.pickle.
#voc = pickle.load(open('pl_en.pickle', 'rb'))
#process_words(voc)
# Active mode: runs at import time and capitalizes every line read from stdin.
pl_trans()