wmt-2020-pl-en/create_voc.py

52 lines
1.6 KiB
Python

import pickle
import os
from googletrans import Translator
import time
# Shared googletrans client; translate_list() sends its requests through it.
trans = Translator()
def translate_list(pl_file):
    """Translate the whole content of a Polish text file to English.

    The file is read in one piece and sent to the module-level ``trans``
    translator as a single request (source language ``pl``, target ``en``).

    Args:
        pl_file: Path of the text file to translate.

    Returns:
        A ``(source_lines, translated_lines)`` tuple: the file content and
        the translated text, each split on ``'\\n'``. If the file ends with
        a newline, ``source_lines`` ends with an empty string.
    """
    # Close the handle deterministically instead of leaking it; callers
    # delete the file right after this function returns.
    with open(pl_file, 'r') as f:
        content = f.read()
    translated = trans.translate(content, dest='en', src='pl').text
    return content.split('\n'), translated.split('\n')
def create_pl_en(pl, batch_size=499, scratch_path='500.txt'):
    """Build a Polish -> English dictionary by translating words in batches.

    Each batch is written, one word per line, to a scratch file that is
    handed to ``translate_list``; the translated lines are then paired with
    the source lines by position.

    Args:
        pl: Mapping (or any iterable) whose keys are the Polish words to
            translate. Every word must be a ``str``.
        batch_size: Maximum number of words sent per translation request.
        scratch_path: Path of the temporary file used to pass a batch to
            ``translate_list``. An existing file at this path is overwritten
            and the file is removed after every request.

    Returns:
        dict mapping each translated Polish word to its English translation.
        A batch is skipped when its request raises or when the number of
        translated lines differs from the number of source lines.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError('batch_size must be at least 1')

    words = list(pl)
    pl_en = {}
    for start in range(0, len(words), batch_size):
        print(start)  # progress: index of the first word of this batch
        batch = words[start:start + batch_size]

        # Mode 'w' truncates, so leftovers from an interrupted run cannot
        # leak into this batch, and leaving the with-block guarantees every
        # word is flushed to disk before translate_list reads the file.
        with open(scratch_path, 'w') as scratch:
            scratch.writelines(word + '\n' for word in batch)

        try:
            source_lines, translated_lines = translate_list(scratch_path)
        except Exception as err:
            # Best effort: a failed request costs one batch, not the run.
            print('Error with googletrans', err)
            continue
        finally:
            os.remove(scratch_path)
            # Short pause between requests to the translation service.
            time.sleep(0.5)

        # The scratch file ends with '\n', so splitting its content yields
        # one trailing empty entry that is not a word.
        source_lines = source_lines[:-1]
        if len(source_lines) != len(translated_lines):
            # Lines cannot be paired by position; do not guess.
            print('Skipping batch: translated line count does not match')
            continue

        print('Add to vocabulary')
        pl_en.update(zip(source_lines, translated_lines))
    return pl_en
# Script entry: load the Polish vocabulary, translate it, store the result.
# NOTE: pickle.load can execute arbitrary code while unpickling; only use a
# pl_words.pickle that comes from a trusted source.
with open('pl_words.pickle', 'rb') as voc_file:
    voc = pickle.load(voc_file)
print(len(voc))
pl_en = create_pl_en(voc)
# Close the output file deterministically so the pickle is fully flushed.
with open('pl_en.pickle', 'wb+') as pl_en_file:
    pickle.dump(pl_en, pl_en_file)