wmt-2020-pl-en/post_process.py

# -*- coding: utf-8 -*-
from transformers import MarianTokenizer, MarianMTModel
import pickle
import sys
import string
from googletrans import Translator
import morfeusz2
import time
# googletrans client; process_words() uses it as the pl -> en fallback.
trans = Translator()
# Morfeusz analyser; process_words() calls analyse() on every cleaned token.
morf = morfeusz2.Morfeusz()

# Lower-case Polish letters with diacritics; process_words() passes this list
# to check_letters() to detect tokens that contain any of them.
pl_letters = ['ą','ę','ł','ź','ć','ś','ń','ó','ż']
def pl_trans():
    """Capitalise the first character of every line read from stdin.

    Each input line has its trailing whitespace (newline included) removed,
    its first character upper-cased, and is then printed to stdout. Blank
    or whitespace-only lines are printed as a single empty line.
    """
    for line in sys.stdin:
        new_line = line.rstrip()
        # Slice the stripped text instead of indexing the raw line: for a
        # blank line the raw first character is the newline itself, which
        # would otherwise be echoed on top of the one print() appends.
        # Slicing also stays safe when the stripped line is empty.
        print(new_line[:1].upper() + new_line[1:])
def process_words(voc):
    """Translate stdin token by token and print one line per input line.

    Every whitespace-separated token is resolved in this order:

    1. the token with punctuation and digits removed, if it is a key of
       ``voc`` and is not a confusion word;
    2. the form taken from the Morfeusz analysis of that cleaned token, if
       it is a key of ``voc`` and is not a confusion word;
    3. the token unchanged, if the cleaned token is a confusion word and
       the token contains none of the letters in ``pl_letters``;
    4. the googletrans result (pl -> en), or the token unchanged when the
       translation call raises.

    Each resolved word is followed by a single space in the printed line.

    Args:
        voc: mapping from a source word to its replacement string.
    """
    # Characters stripped from a token before any dictionary lookup.
    exclude = frozenset([
        '.', '?', '!', ',', '/', '-', '+', '=', ')', '(', '%',
        '0', '1', '2', '3', '4', '5', '6', '7', '8', '9',
        '[', ']', ':', ';', "'", '"',
    ])
    # Words that are never replaced through ``voc``.
    confusion_words = frozenset([
        'on', 'one', 'no', 'my', 'knot', 'but', 'chart', 'prom', 'pup',
        'much', 'lot', 'pan', 'herb', 'dude', 'to', 'wanna', 'unia', 'we',
    ])

    for line in sys.stdin:
        translated = []
        for word in line.rstrip().split():
            p_word = ''.join(ch for ch in word if ch not in exclude)

            # Reset per token: when the analysis is empty there must be no
            # leftover value from the previous token (and no NameError on
            # the very first one).
            lema = None
            for _start, _end, interp in morf.analyse(p_word):
                # The last interpretation wins, as before.
                # NOTE(review): Morfeusz interpretations are presumably
                # (orth, lemma, tag, ...), so index 0 may be the surface
                # form rather than the lemma - confirm whether interp[1]
                # was intended.
                lema = interp[0]

            if p_word in voc and p_word.lower() not in confusion_words:
                en_word = voc[p_word]
            elif (lema is not None
                  and str(lema) in voc
                  and lema.lower() not in confusion_words
                  and lema not in exclude):
                en_word = voc[str(lema)]
            elif (p_word.lower() in confusion_words
                  and check_letters(word.lower(), pl_letters)):
                en_word = word
            else:
                try:
                    en_word = trans.translate(word, dest='en', src='pl').text
                except Exception:
                    # Best effort: keep the original token when the remote
                    # translation fails, but let KeyboardInterrupt and
                    # SystemExit propagate.
                    en_word = word
            translated.append(en_word)

        # Preserve the historical output format: every word is followed by
        # one space, including the last one.
        print(''.join(item + ' ' for item in translated))

def check_letters(text, pl):
    """Report whether ``text`` is free of every character listed in ``pl``.

    Returns True when no character of ``text`` occurs in ``pl`` (which
    includes the empty string), and False as soon as one does.
    """
    return not any(symbol in pl for symbol in text)

# Script entry. The dictionary-based pipeline below is disabled (commented
# out); only the capitalisation pass runs. There is no __main__ guard, so
# this executes whenever the module is loaded.
#voc = pickle.load(open('pl_en.pickle', 'rb'))
#process_words(voc)
pl_trans()
Add post-processing to 25k 2020-11-15 16:26:19 +01:00			`# -- coding: utf-8 --`
			`from transformers import MarianTokenizer, MarianMTModel`
			`import pickle`
			`import sys`
			`import string`
			`from googletrans import Translator`
			`import morfeusz2`
			`import time`
			`trans = Translator()`
			`morf = morfeusz2.Morfeusz()`

			`pl_letters = ['ą','ę','ł','ź','ć','ś','ń','ó','ż']`
			`def pl_trans():`
			`for line in sys.stdin:`
			`new_line = line.rstrip()`
Googletrans improvement 2020-11-15 18:57:38 +01:00			`p_line=line[0].upper()`
			`for token in range(1,len(new_line)):`
			`p_line = p_line + new_line[token]`
Add post-processing to 25k 2020-11-15 16:26:19 +01:00			`print(p_line)`
			`def process_words(voc):`
			`lines = []`
			`exclude = ['.','?','!',',','/','-','+','=',')','(','%','0','1','2','3','4','5','6','7','8','9','[',']',':',';',"'",'"']`
			`confusion_words = ['on', 'one', 'no', 'my', 'knot', 'but', 'chart', 'prom', 'pup', 'much', 'lot', 'pan', 'herb', 'dude', 'to', 'wanna', 'unia', 'we']`
			`file_to_process = {}`
			`for idx,line in enumerate(sys.stdin):`
			`line_to_process = line.rstrip().split()`
			`processed_line = ''`
			`new_line = []`
			`for word in line_to_process:`
			`en_word = ''`
			`p_word = ''.join(w for w in word if w not in exclude)`
			`analysis = morf.analyse(p_word)`
			`for i,j,l in analysis:`
			`lema = l[0]`
			`if str(p_word) in voc.keys() and p_word.lower() not in confusion_words and p_word not in exclude:`
			`en_word = voc[p_word]`
			`elif str(lema) in voc.keys() and lema.lower() not in confusion_words and lema not in exclude:`
			`en_word = voc[str(lema)]`
			`else:`
			`if p_word.lower() in confusion_words and check_letters(word.lower(),pl_letters):`
			`en_word = word`
			`else:`
Add post-processing to 25k 2020-11-15 18:27:16 +01:00			`try:`
			`en_word = trans.translate(word,dest='en',src='pl').text`
			`except:`
			`en_word = word`
Add post-processing to 25k 2020-11-15 16:26:19 +01:00			`processed_line = processed_line + en_word + ' '`
			`print(processed_line)`

			`def check_letters(text,pl):`
			`for ch in text:`
			`if ch in pl:`
			`return False`
			`return True`

Googletrans improvement 2020-11-15 18:57:38 +01:00			`#voc = pickle.load(open('pl_en.pickle', 'rb'))`
			`#process_words(voc)`
			`pl_trans()`