# wmt-2020-pl-en/post_process.py -- 53 lines, 2.0 KiB, Python

# -*- coding: utf-8 -*-
from transformers import MarianTokenizer, MarianMTModel
import pickle
import sys
import string
from googletrans import Translator
import morfeusz2
import time
# Module-level singletons shared by the functions below.
trans = Translator()          # googletrans client, used by process_words()
morf = morfeusz2.Morfeusz()   # Morfeusz analyser, used by process_words()

# Polish diacritic letters (lower case); passed to check_letters().
pl_letters = [
    'ą', 'ę', 'ł', 'ź', 'ć', 'ś', 'ń', 'ó', 'ż',
]
def pl_trans():
    """Echo stdin to stdout with every line capitalized.

    Each line read from ``sys.stdin`` has its trailing whitespace (including
    the newline) stripped, is passed through ``str.capitalize`` and is then
    printed on a line of its own. Blank input lines produce blank output
    lines.
    """
    for line in sys.stdin:
        # NOTE(review): str.capitalize() upper-cases the first character but
        # also lower-cases the rest of the line, so proper nouns and acronyms
        # lose their casing -- confirm that this is intended.
        print(line.rstrip().capitalize())
def process_words(voc):
    """Rewrite stdin word by word and print one output line per input line.

    Every whitespace-separated token is first stripped of the characters in
    ``exclude`` (punctuation and digits). The output word is then chosen by
    the first rule that applies:

    1. the stripped token is a key of ``voc`` and is not a confusion word
       -> ``voc[stripped token]``;
    2. the form taken from the Morfeusz analysis is a key of ``voc`` and is
       not a confusion word -> ``voc[form]``;
    3. the stripped token is a confusion word and the original token has no
       Polish diacritics -> the original token, unchanged;
    4. otherwise -> ``trans.translate(token, dest='en', src='pl').text``.

    Each chosen word is followed by a single space, so every printed line
    ends with a trailing space (blank input lines print as empty lines).

    Args:
        voc: Mapping whose keys are looked up with the stripped token or the
            analysed form and whose values are the strings to emit.
    """
    # Characters removed from a token before any lookup.
    exclude = frozenset(".?!,/-+=)(%0123456789[]:;'\"")
    # Tokens that are never replaced through `voc`.
    confusion_words = frozenset([
        'on', 'one', 'no', 'my', 'knot', 'but', 'chart', 'prom', 'pup',
        'much', 'lot', 'pan', 'herb', 'dude', 'to', 'wanna', 'unia', 'we',
    ])

    for line in sys.stdin:
        out_words = []
        for word in line.rstrip().split():
            p_word = ''.join(ch for ch in word if ch not in exclude)

            # Default to the stripped token: if the analysis yields nothing,
            # `lema` must neither be unbound (first word) nor carry over the
            # value computed for the previous word.
            lema = p_word
            # Only the last interpretation returned by the analyser is kept.
            # NOTE(review): index 0 of the interpretation tuple is used as the
            # lemma. The Morfeusz 2 Python binding documents interpretations
            # as (orth, base, tag, ...), which would make index 0 the surface
            # form and index 1 the base form -- confirm which is intended.
            for _start, _end, interp in morf.analyse(p_word):
                lema = interp[0]

            if p_word in voc and p_word.lower() not in confusion_words:
                en_word = voc[p_word]
            elif (str(lema) in voc
                    and lema.lower() not in confusion_words
                    and lema not in exclude):
                en_word = voc[str(lema)]
            elif (p_word.lower() in confusion_words
                    and check_letters(word.lower(), pl_letters)):
                # Confusion word without Polish diacritics: keep the token
                # exactly as it was, punctuation included.
                en_word = word
            else:
                en_word = trans.translate(word, dest='en', src='pl').text
            out_words.append(en_word)

        # One space after every word, including the last one.
        print(''.join(w + ' ' for w in out_words))
def check_letters(text, pl):
    """Return True when no character of *text* occurs in *pl*.

    Args:
        text: String that is scanned character by character.
        pl: Container of characters to look for; the caller in this file
            passes ``pl_letters``.

    Returns:
        ``False`` if at least one character of ``text`` is contained in
        ``pl``; ``True`` otherwise, including when ``text`` is empty.
    """
    return not any(ch in pl for ch in text)
# Disabled alternative entry point: load the pickled vocabulary from
# 'pl_en.pickle' and run the word-by-word pass (process_words).
#voc = pickle.load(open('pl_en.pickle', 'rb'))
#process_words(voc)
# Active entry point; runs unconditionally whenever this file is executed or
# imported: capitalize every line read from stdin.
pl_trans()