In [1]:
import pandas as pd
import spacy


# spaCy pipelines used to lemmatize the English (source) and Polish (result) sides.
spacy_nlp_en = spacy.load('en_core_web_sm')
spacy_nlp_pl = spacy.load("pl_core_news_sm")

# Two-column, header-less TSV: source term, then its translation.
glossary = pd.read_csv('kompendium.tsv', sep='\t', header=None, names=['source', 'result'])


def _lemmatize_phrase(nlp, phrase):
    """Return `phrase` as space-joined lemmas with tokenizer spacing undone.

    `nlp` is the spaCy pipeline to run on `phrase`. After joining the lemmas
    with single spaces, the spaces introduced around hyphens, apostrophes,
    slashes and parentheses are collapsed again, in that fixed order.
    """
    joined = ' '.join(token.lemma_ for token in nlp(phrase))
    for spaced, compact in ((' - ', '-'), (' ’', '’'), (' / ', '/'), (' ( ', '('), (' ) ', ')')):
        joined = joined.replace(spaced, compact)
    return joined


source_lemmatized = [_lemmatize_phrase(spacy_nlp_en, term) for term in glossary['source']]
result_lemmatized = [_lemmatize_phrase(spacy_nlp_pl, term) for term in glossary['result']]

glossary['source_lem'] = source_lemmatized
glossary['result_lem'] = result_lemmatized
glossary = glossary[['source', 'source_lem', 'result', 'result_lem']]
# Display only: the re-indexed frame is shown as the cell output, not stored.
glossary.set_index('source_lem')



Unnamed: 0_level_0,source,result,result_lem
source_lem,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
aaofi,aaofi,organizacja rachunkowości i audytu dla islamsk...,organizacja rachunkowość i audyt dla islamski ...
aca,aca,członek stowarzyszenia dyplomowanych biegłych ...,członek stowarzyszenie dyplomowany biegły rewi...
acca,acca,stowarzyszenie dyplomowanych biegłych rewidentów,stowarzyszenie dyplomowany biegły rewident
abacus,abacus,liczydło,liczydło
abandonment cost,abandonment costs,koszty zaniechania,koszt zaniechanie
...,...,...,...
ytd,ytd,od początku roku,od początek rok
year-end,year-end,koniec roku,koniec rok
year-to-date,year-to-date,od początku roku,od początek rok
zog,zog,zero wzrostu kosztów ogólnych,zero wzrost koszt ogólny


In [2]:

# Common prefix of the parallel dev files; '.en' / '.pl' are appended below.
dev_path = 'mt-summit-corpora/dev/dev'

# NOTE: this is a string, so `x not in skip_chars` below is a *substring* test:
# any lemma that is a contiguous piece of it (',', '.', './', ... and also the
# empty string) is dropped from the lemmatized line.
skip_chars = ''',./!?'''


def _lemmatize_file(path, nlp):
    """Lemmatize a text file line by line.

    Parameters:
        path: path of the text file to read (decoded as UTF-8).
        nlp: spaCy pipeline applied to every line.

    Returns:
        A list with one string per input line: the line's lemmas joined by
        single spaces, with lemmas found in `skip_chars` removed and the
        spaces around hyphens, apostrophes, slashes and parentheses collapsed.
    """
    lemmatized = []
    # Explicit encoding: the default depends on the platform locale, which
    # garbles Polish diacritics wherever that locale is not UTF-8.
    with open(path, 'r', encoding='utf-8') as file:
        for line in file:
            lemmas = [token.lemma_ for token in nlp(line)]
            text = ' '.join([x for x in lemmas if x not in skip_chars])
            lemmatized.append(text.replace(' - ', '-').replace(' ’', '’').replace(' / ', '/').replace(' ( ', '(').replace(' ) ', ')'))
    return lemmatized


file_lemmatized = _lemmatize_file(dev_path + '.en', spacy_nlp_en)
file_pl_lemmatized = _lemmatize_file(dev_path + '.pl', spacy_nlp_pl)



In [26]:
import copy
import pandas as pd
import rapidfuzz
from rapidfuzz.fuzz import *
import time
from rapidfuzz.utils import default_process


# Minimum fuzzy-match score a glossary term must reach against a sentence to be
# kept; passed as `score_cutoff` to `rapidfuzz.process.extract` below.
THRESHOLD = 88

def is_injectable(sentence_pl, sequence):
    """Return the best fuzzy score of `sequence` against any window of `sentence_pl`.

    The sentence is split on whitespace and scanned with a sliding window that
    is as many words wide as `sequence`; every window is scored against
    `sequence` with `rapidfuzz.fuzz.partial_ratio`.

    Parameters:
        sentence_pl: sentence to search in.
        sequence: term to look for.

    Returns:
        The highest window score, or 0 when the sentence has fewer words than
        `sequence` (there is no window to score).
    """
    sen = sentence_pl.split()
    windowSize = len(sequence.split())
    maxx = 0
    # "+ 1" so the final window (ending on the last word) is scored as well;
    # without it a term at the very end of the sentence was never seen.
    for i in range(len(sen) - windowSize + 1):
        current = rapidfuzz.fuzz.partial_ratio(' '.join(sen[i:i + windowSize]), sequence)
        if current > maxx:
            maxx = current
    return maxx

def inject(sentence, sequence):
    """Insert the glossary translation of `sequence` after its best match in `sentence`.

    The sentence is split on whitespace and scanned with a sliding window as
    wide as `sequence`; the window with the highest
    `rapidfuzz.fuzz.partial_ratio` score (first one on ties) is the injection
    point. The translation is looked up in the module-level `glossary` frame
    (rows whose `source_lem` equals `sequence`, column `result`).

    Parameters:
        sentence: sentence to inject into.
        sequence: glossary `source_lem` value that matched the sentence.

    Returns:
        A 1-D NumPy object array with one injected sentence per matching
        glossary row (callers take element `[0]`); empty when `sequence` is
        not in the glossary.
    """
    sen = sentence.split()
    windowSize = len(sequence.split())
    maxx = 0
    maxxi = 0
    # "+ 1" so the final window is a candidate too; without it a term at the end
    # of the sentence could never be chosen as the injection point.
    for i in range(len(sen) - windowSize + 1):
        current = rapidfuzz.fuzz.partial_ratio(' '.join(sen[i:i + windowSize]), sequence)
        if current > maxx:
            maxx = current
            maxxi = i

    translations = glossary.loc[glossary['source_lem'] == sequence, 'result'].astype(str).to_numpy(dtype=object)
    head = ' '.join(sen[:maxxi + windowSize])
    tail = ' '.join(sen[maxxi + windowSize:])
    # str + object array concatenates element-wise, keeping the array result.
    injected = head + ' ' + translations
    if tail:
        # Only append the remainder when there is one, to avoid a trailing space.
        injected = injected + ' ' + tail
    return injected

# Replace the in-memory glossary with the cleaned, lemmatized version and
# normalize its lookup column the same way the sentences are normalized below.
glossary = pd.read_csv('kompendium_lem_cleaned.tsv', sep='\t', header=0, index_col=0)
glossary['source_lem'] = [default_process(x) for x in glossary['source_lem']]

start_time = time.time_ns()
en = []                       # one injected English sentence per glossary match
translation_line_counts = []  # number of matches per input line, in input order
for line, line_pl in zip(file_lemmatized, file_pl_lemmatized):
    line = default_process(line)
    line_pl = default_process(line_pl)
    matchez = rapidfuzz.process.extract(query=line, choices=glossary['source_lem'], limit=5, score_cutoff=THRESHOLD, scorer=partial_ratio)
    translation_line_counts.append(len(matchez))
    for match in matchez:
        # NOTE: filtering on is_injectable() is intentionally disabled here
        # (`if is_injectable(line_pl, match[0]):`). translation_line_counts is
        # len(matchez), so it only stays aligned with `en` while every match is
        # injected.
        # NOTE(review): is_injectable() is given the English term match[0] to
        # score against the Polish sentence - the Polish translation may have
        # been intended; confirm before re-enabling the filter.
        print([match, is_injectable(line_pl, match[0]), glossary.loc[lambda df: df['source_lem'] == match[0]]['result'].astype(str).values.flatten()])
        print(line)
        # Compute the injected sentence once and reuse it for both the log
        # line and the collected output.
        injected = inject(line, match[0])[0]
        print(injected)
        print(line_pl)
        print('=====================================================================')
        en.append(injected)


stop = time.time_ns()
timex = (stop - start_time) / 1000000000  # elapsed wall-clock time in seconds
print(timex)


[('asset', 100.0, 105), 66.66666666666666, array(['aktywa'], dtype=object)]
where the carrying amount of an asset exceed its recoverable amount the asset be consider impaired and be write down to its recoverable amount
where the carrying amount of an asset aktywa exceed its recoverable amount the asset be consider impaired and be write down to its recoverable amount
jeśli wartość bilansowy składnik aktywa jest wysoki niż on wartość odzyskiwalny mieć miejsce utrata wartość i dokonywać się wówczas odpis do ustalonej wartość odzyskiwalny
[('recoverable amount', 100.0, 947), 50.0, array(['wartość odzyskiwalna'], dtype=object)]
where the carrying amount of an asset exceed its recoverable amount the asset be consider impaired and be write down to its recoverable amount
where the carrying amount of an asset exceed its recoverable amount wartość odzyskiwalna the asset be consider impaired and be write down to its recoverable amount
jeśli wartość bilansowy składnik aktywa jest wysoki niż on war

In [8]:
tlcs = copy.deepcopy(translation_line_counts)

# Read the Polish side as raw lines, the same way it was read for
# lemmatization, so that line i here corresponds to count i in `tlcs`.
# (Iterating a DataFrame yields its column labels, not its rows, so the
# previous read_csv + zip wrote the literal column name instead of the text.)
with open(dev_path + '.pl', 'r', encoding='utf-8') as file_src:
    translations = [line.rstrip('\n') for line in file_src]

# Repeat every Polish line once per glossary match found in its English
# counterpart, so the output stays parallel to the injected English file.
with open(dev_path + '.injected.crossvalidated.pl', 'w', encoding='utf-8') as file_pl:
    for line, translation_line_ct in zip(translations, tlcs):
        for i in range(translation_line_ct):
            file_pl.write(line + '\n')


with open(dev_path + '.injected.crossvalidated.en', 'w', encoding='utf-8') as file_en:
    for e in en:
        file_en.write(e + '\n')