mt-summit-corpora/jupyter-injector.ipynb

Lemmatize glossary

TODO: train/test split of the glossary (see the sketch after the export below)

import time

import pandas as pd
import spacy


spacy_nlp_en = spacy.load('en_core_web_sm')
spacy_nlp_pl = spacy.load("pl_core_news_sm")

glossary = pd.read_csv('kompendium.tsv', sep='\t', header=None, names=['source', 'result'])

# Lemmatize the English side of the glossary and undo the spaces the
# tokenizer puts around punctuation.
source_lemmatized = []
for word in glossary['source']:
    temp = []
    for token in spacy_nlp_en(word):
        temp.append(token.lemma_)
    source_lemmatized.append(' '.join(temp).replace(' - ', '-').replace('  ', ' ').replace(' / ', '/').replace(' ( ', '(').replace(' ) ', ')'))

# Same for the Polish side, using the Polish pipeline.
result_lemmatized = []
for word in glossary['result']:
    temp = []
    for token in spacy_nlp_pl(word):
        temp.append(token.lemma_)
    result_lemmatized.append(' '.join(temp).replace(' - ', '-').replace('  ', ' ').replace(' / ', '/').replace(' ( ', '(').replace(' ) ', ')'))

glossary['source_lem'] = source_lemmatized
glossary['result_lem'] = result_lemmatized
glossary = glossary[['source', 'source_lem', 'result', 'result_lem']]
glossary.set_index('source_lem')  # display only; the index change is not assigned back
source_lem             source                 result                                              result_lem
aaofi                  aaofi                  organizacja rachunkowości i audytu dla islamsk...   organizacja rachunkowość i audyt dla islamski ...
aca                    aca                    członek stowarzyszenia dyplomowanych biegłych ...   członek stowarzyszenie dyplomowany biegły rewi...
acca                   acca                   stowarzyszenie dyplomowanych biegłych rewidentów    stowarzyszenie dyplomowany biegły rewident
abacus                 abacus                 liczydło                                            liczydło
abandonment cost       abandonment costs      koszty zaniechania                                  koszt zaniechanie
...                    ...                    ...                                                 ...
ytd                    ytd                    od początku roku                                    od początek rok
year-end               year-end               koniec roku                                         koniec rok
year-to-date           year-to-date           od początku roku                                    od początek rok
zog                    zog                    zero wzrostu kosztów ogólnych                       zero wzrost koszt ogólny
zero overhead growth   zero overhead growth   zero wzrostu kosztów ogólnych                       zero wzrost koszt ogólny

1197 rows × 3 columns

glossary.to_csv('kompendium_lem.tsv', sep='\t')
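The TODO above (splitting the glossary into train and test parts) is only done implicitly later, where every sixth entry is held out. A minimal sketch of making that split explicit and saving both parts, assuming the same every-sixth-row convention; the output file names are hypothetical:

# Sketch of the TODO train/test split, following the every-6th-row convention
# used for train_glossary later in this notebook.
test_mask = glossary.index % 6 == 0
glossary[~test_mask].to_csv('kompendium_lem.train.tsv', sep='\t')  # hypothetical file name
glossary[test_mask].to_csv('kompendium_lem.test.tsv', sep='\t')    # hypothetical file name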

Lemmatize corpus

dev_path = 'mt-summit-corpora/dev/dev'

skip_chars = ''',./!?'''  # punctuation tokens to drop when lemmatizing the corpus

# Lemmatize the English side of the dev corpus line by line, dropping bare
# punctuation tokens and undoing the spaces added around punctuation.
with open(dev_path + '.en', 'r') as file:
    file_lemmatized = []
    for line in file:
        temp = []
        for token in spacy_nlp_en(line):
            temp.append(token.lemma_)
        file_lemmatized.append(' '.join([x for x in temp if x not in skip_chars]).replace(' - ', '-').replace('  ', ' ').replace(' / ', '/').replace(' ( ', '(').replace(' ) ', ')'))

# Same for the Polish side.
with open(dev_path + '.pl', 'r') as file:
    file_pl_lemmatized = []
    for line in file:
        temp = []
        for token in spacy_nlp_pl(line):
            temp.append(token.lemma_)
        file_pl_lemmatized.append(' '.join([x for x in temp if x not in skip_chars]).replace(' - ', '-').replace('  ', ' ').replace(' / ', '/').replace(' ( ', '(').replace(' ) ', ')'))

print(file_lemmatized[2])
print(file_pl_lemmatized[2])
in the course of the control the control audit firm shall fulfil the responsibility refer to in article 114 on date and in form specify by the controller 

w czas trwanie kontrola kontrolowany firma audytorski wypełnia obowiązek o których mowa w art 114 w ter-mina i forma wskazany przez osoba kontrolującą 
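Lemmatizing line by line calls the pipeline once per sentence; spaCy's nlp.pipe can batch the same work. A sketch of an equivalent batched pass over the English side, assuming the same dev file and the same cleanup chain as the cell above (the Polish side would be analogous):

# Sketch: batched lemmatization with nlp.pipe, same cleanup as above.
def clean_lemmas(doc):
    lemmas = [t.lemma_ for t in doc if t.lemma_ not in skip_chars]
    return (' '.join(lemmas).replace(' - ', '-').replace('  ', ' ')
            .replace(' / ', '/').replace(' ( ', '(').replace(' ) ', ')'))

with open(dev_path + '.en', 'r') as file:
    file_lemmatized = [clean_lemmas(doc) for doc in spacy_nlp_en.pipe(file, batch_size=64)]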

Inject glossary

!!! Obsolete !!!

import spacy
from spaczz.matcher import FuzzyMatcher


glossary = pd.read_csv('kompendium_lem.tsv', sep='\t', header=0, index_col=0)
bad_words = ['ocf', 'toc', 'vas', 'vat']  # glossary entries excluded from the matcher below
train_glossary = glossary.iloc[[x for x in range(len(glossary)) if x % 6 != 0]]  # hold out every 6th entry

nlp = spacy.blank("en")
matcher = FuzzyMatcher(nlp.vocab)
for word in train_glossary['source_lem']:
    if word not in bad_words:
        matcher.add(word, [nlp(word)])
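A quick way to see what the matcher produces before running it over the whole corpus. The example sentence below is made up, and the 4-tuple unpacking mirrors the injection loop that follows:

# Sanity check: each hit is (pattern label, start token, end token, similarity ratio).
example = nlp('the abandonment cost be recognise in the current period')
for match_id, start, end, ratio in matcher(example):
    print(match_id, '->', example[start:end].text, ratio)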


# For every fuzzy match above 90% similarity, emit a copy of the English line
# with the Polish glossary translation inserted after the matched span.
# translation_line_counts records how many variants each source line produced,
# so the Polish side can be expanded to stay aligned.
en = []
translation_line_counts = []
for line_id, line in enumerate(file_lemmatized):
    doc = nlp(line)
    matches = matcher(doc)

    injected = 0
    for match_id, start, end, ratio in matches:
        if ratio > 90:
            injected += 1
            glossary_translation = train_glossary.loc[train_glossary['source_lem'] == match_id, 'result'].astype(str).iloc[0]
            en.append(doc[:end].text + ' ' + glossary_translation + ' ' + doc[end:].text)

    if injected == 0:
        injected = 1
        en.append(line)
    translation_line_counts.append(injected)

import copy
tlcs = copy.deepcopy(translation_line_counts)

# Write the Polish side, repeating each source sentence once per injected
# English variant so the two files stay parallel.
translations = pd.read_csv(dev_path + '.pl', sep='\t', header=None, names=['text'])
with open(dev_path + '.injected.pl', 'w') as file_pl:
    for trans in translations.iterrows():
        try:
            for _ in range(tlcs.pop(0)):
                file_pl.write(trans[1]['text'] + '\n')
        except IndexError:
            # counts exhausted; remaining source sentences are skipped
            pass


# Write the matching English side.
with open(dev_path + '.injected.en', 'w') as file_en:
    for line in en:
        file_en.write(line)
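Since the Polish file is expanded according to translation_line_counts, the number of English lines should equal the sum of the per-line copy counts. A small check of that invariant, assuming the cells above ran in order:

# One English line is appended per injection (or one fallback line), so this must hold.
assert len(en) == sum(translation_line_counts), (len(en), sum(translation_line_counts))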

Inject glossary with Polish crosscheck

import spacy
from spaczz.matcher import FuzzyMatcher

# glossary
glossary = pd.read_csv('kompendium_lem.tsv', sep='\t', header=0, index_col=0)
train_glossary = glossary.iloc[[x for x in range(len(glossary)) if x % 6 != 0]]  # hold out every 6th entry

# add rules to the English matcher, labelled with the lemmatized source term
nlp = spacy.blank("en")
matcher = FuzzyMatcher(nlp.vocab)
for word in train_glossary['source_lem']:
    matcher.add(word, [nlp(word)])

# add rules to the Polish matcher, labelled with the lemmatized target term
nlp_pl = spacy.blank("pl")
matcher_pl = FuzzyMatcher(nlp_pl.vocab)
for word in train_glossary['result_lem']:
    matcher_pl.add(word, [nlp_pl(word)])

# As above, but only inject when the Polish side of the sentence pair also
# fuzzy-matches the corresponding Polish glossary term (crosscheck).
en = []
translation_line_counts = []
for line_id in range(len(file_lemmatized)):

    doc = nlp(file_lemmatized[line_id])
    matches = matcher(doc)

    injected = 0
    for match_id, start, end, ratio in matches:
        if ratio > 90:
            doc_pl = nlp_pl(file_pl_lemmatized[line_id])
            matches_pl = matcher_pl(doc_pl)

            expected_pl = glossary.loc[glossary['source_lem'] == match_id, 'result_lem'].iloc[0]
            for match_id_pl, start_pl, end_pl, ratio_pl in matches_pl:
                if match_id_pl == expected_pl:
                    injected += 1
                    glossary_translation = train_glossary.loc[train_glossary['source_lem'] == match_id, 'result'].astype(str).iloc[0]
                    en.append(doc[:end].text + ' ' + glossary_translation + ' ' + doc[end:].text)

    if injected == 0:
        injected = 1
        en.append(file_lemmatized[line_id])
    translation_line_counts.append(injected)
import copy


tlcs = copy.deepcopy(translation_line_counts)

# Write the crossvalidated pair: each distinct injected English line is written
# once, paired with its Polish source sentence; ctr tracks how many injected
# variants remain for the current source sentence before moving to the next one.
translations = pd.read_csv(dev_path + '.pl', sep='\t', header=None, names=['text'])
translations['id'] = [x for x in range(len(translations))]

ctr = 0
sentence = ''
with open(dev_path + '.injected.crossvalidated.en', 'w') as file_en:
    with open(dev_path + '.injected.crossvalidated.pl', 'w') as file_pl:
        for i in range(len(en)):
            if i > 0:
                if en[i-1] != en[i]:
                    if ctr == 0:
                        # move on to the next Polish source sentence
                        sentence = translations.iloc[0]
                        translations.drop(sentence['id'], inplace=True)
                        sentence = sentence['text']
                        try:
                            ctr = tlcs.pop(0)
                        except IndexError:
                            pass
                    file_en.write(en[i])
                    file_pl.write(sentence + '\n')
                ctr = ctr - 1
            else:
                # first line: take the first Polish sentence and its count
                try:
                    ctr = tlcs.pop(0) - 1
                except IndexError:
                    pass
                sentence = translations.iloc[0]
                translations.drop(sentence['id'], inplace=True)
                sentence = sentence['text']
                file_en.write(en[i])
                file_pl.write(sentence + '\n')
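The crossvalidated pair is only usable if both files stay line-aligned; a quick check after writing them, assuming the paths used above:

# Both output files should contain the same number of lines.
with open(dev_path + '.injected.crossvalidated.en') as f_en, \
        open(dev_path + '.injected.crossvalidated.pl') as f_pl:
    assert sum(1 for _ in f_en) == sum(1 for _ in f_pl)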

Inject glossary with Polish crosscheck (faster?)

import time
import spacy
from spaczz.matcher import FuzzyMatcher


# glossary
glossary = pd.read_csv('kompendium_lem.tsv', sep='\t', header=0, index_col=0)
train_glossary = glossary.iloc[[x for x in range(len(glossary)) if x % 6 != 0]]  # hold out every 6th entry

# add rules to the English matcher, labelled with the lemmatized source term
nlp = spacy.blank("en")
matcher = FuzzyMatcher(nlp.vocab)
for word in train_glossary['source_lem']:
    matcher.add(word, [nlp(word)])

# add rules to the Polish matcher, labelled with the lemmatized target term
nlp_pl = spacy.blank("pl")
matcher_pl = FuzzyMatcher(nlp_pl.vocab)
for word in train_glossary['result_lem']:
    matcher_pl.add(word, [nlp_pl(word)])

# Faster variant: each line is tokenized once, only the longest English match
# per line is considered, and the Polish crosscheck stops at the first hit.
start_time = time.time_ns()
en = []
injection_counter = 0
for line_id in range(len(file_lemmatized)):

    doc = nlp(file_lemmatized[line_id])
    matches = matcher(doc)

    not_injected = True
    if len(matches) > 0:
        # keep only the match whose glossary term (pattern label) is longest
        match_id, _, end, ratio = sorted(matches, key=lambda x: len(x[0]), reverse=True)[0]
        if ratio > 90:
            matches_pl = matcher_pl(nlp_pl(file_pl_lemmatized[line_id]))

            expected_pl = glossary.loc[glossary['source_lem'] == match_id, 'result_lem'].iloc[0]
            for match_id_pl, _, _, _ in matches_pl:
                if match_id_pl == expected_pl:
                    not_injected = False
                    injection_counter += 1
                    glossary_translation = train_glossary.loc[train_glossary['source_lem'] == match_id, 'result'].astype(str).iloc[0]
                    en.append(doc[:end].text + ' ' + glossary_translation + ' ' + doc[end:].text)
                    break

    if not_injected:
        en.append(file_lemmatized[line_id])

stop = time.time_ns()
timex = (stop - start_time) / 1000000000
print(f'took {timex} injected {injection_counter} words. rate {len(file_lemmatized)/timex} sen/s')
took 152.213599056 injected 63 words. rate 6.569715230451229 sen/s
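Most of the runtime above goes into re-running the blank pipelines inside the loop. A hedged sketch of precomputing both sides once with nlp.pipe before matching; the matching logic itself would stay as written above:

# Sketch: tokenize every line once up front instead of inside the loop.
docs_en = list(nlp.pipe(file_lemmatized, batch_size=256))
docs_pl = list(nlp_pl.pipe(file_pl_lemmatized, batch_size=256))
# ...then iterate over line_id and call matcher(docs_en[line_id]) / matcher_pl(docs_pl[line_id]).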
import copy


# Note: translation_line_counts is reused from the previous (slow) crosscheck
# run; the faster loop above does not rebuild it.
tlcs = copy.deepcopy(translation_line_counts)

translations = pd.read_csv(dev_path + '.pl', sep='\t', header=None, names=['text'])
translations['id'] = [x for x in range(len(translations))]

ctr = 0
sentence = ''
with open(dev_path + '.injected.crossvalidated.en', 'w') as file_en:
    with open(dev_path + '.injected.crossvalidated.pl', 'w') as file_pl:
        for i in range(len(en)):
            if i > 0:
                if en[i-1] != en[i]:
                    if ctr == 0:
                        # move on to the next Polish source sentence
                        sentence = translations.iloc[0]
                        translations.drop(sentence['id'], inplace=True)
                        sentence = sentence['text']
                        try:
                            ctr = tlcs.pop(0)
                        except IndexError:
                            pass
                    file_en.write(en[i])
                    file_pl.write(sentence + '\n')
                ctr = ctr - 1
            else:
                # first line: take the first Polish sentence and its count
                try:
                    ctr = tlcs.pop(0) - 1
                except IndexError:
                    pass
                sentence = translations.iloc[0]
                translations.drop(sentence['id'], inplace=True)
                sentence = sentence['text']
                file_en.write(en[i])
                file_pl.write(sentence + '\n')