mt-summit-corpora/jupyter-injector.ipynb

Lemmatize glossary

TODO: train/test split of the glossary (see the sketch after the export below)

import time

import pandas as pd
import spacy


spacy_nlp_en = spacy.load('en_core_web_sm')
spacy_nlp_pl = spacy.load("pl_core_news_sm")

glossary = pd.read_csv('kompendium.tsv', sep='\t', header=None, names=['source', 'result'])

# Lemmatize the English side of the glossary and undo the spaces the
# tokenizer puts around punctuation.
source_lemmatized = []
for word in glossary['source']:
    temp = []
    for token in spacy_nlp_en(word):
        temp.append(token.lemma_)
    source_lemmatized.append(' '.join(temp).replace(' - ', '-').replace('  ', ' ').replace(' / ', '/').replace(' ( ', '(').replace(' ) ', ')'))

# Same for the Polish side, using the Polish pipeline.
result_lemmatized = []
for word in glossary['result']:
    temp = []
    for token in spacy_nlp_pl(word):
        temp.append(token.lemma_)
    result_lemmatized.append(' '.join(temp).replace(' - ', '-').replace('  ', ' ').replace(' / ', '/').replace(' ( ', '(').replace(' ) ', ')'))

glossary['source_lem'] = source_lemmatized
glossary['result_lem'] = result_lemmatized
glossary = glossary[['source', 'source_lem', 'result', 'result_lem']]
glossary.set_index('source_lem')  # display only; the index change is not assigned back
source_lem             source                 result                                              result_lem
aaofi                  aaofi                  organizacja rachunkowości i audytu dla islamsk...   organizacja rachunkowość i audyt dla islamski ...
aca                    aca                    członek stowarzyszenia dyplomowanych biegłych ...   członek stowarzyszenie dyplomowany biegły rewi...
acca                   acca                   stowarzyszenie dyplomowanych biegłych rewidentów    stowarzyszenie dyplomowany biegły rewident
abacus                 abacus                 liczydło                                            liczydło
abandonment cost       abandonment costs      koszty zaniechania                                  koszt zaniechanie
...                    ...                    ...                                                 ...
ytd                    ytd                    od początku roku                                    od początek rok
year-end               year-end               koniec roku                                         koniec rok
year-to-date           year-to-date           od początku roku                                    od początek rok
zog                    zog                    zero wzrostu kosztów ogólnych                       zero wzrost koszt ogólny
zero overhead growth   zero overhead growth   zero wzrostu kosztów ogólnych                       zero wzrost koszt ogólny

1197 rows × 3 columns

glossary.to_csv('kompendium_lem.tsv', sep='\t')
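The TODO above (splitting the glossary into train and test parts) is only done implicitly later, where every sixth entry is held out. A minimal sketch of making that split explicit and saving both parts, assuming the same every-sixth-row convention; the output file names are hypothetical:

# Sketch of the TODO train/test split, following the every-6th-row convention
# used for train_glossary later in this notebook.
test_mask = glossary.index % 6 == 0
glossary[~test_mask].to_csv('kompendium_lem.train.tsv', sep='\t')  # hypothetical file name
glossary[test_mask].to_csv('kompendium_lem.test.tsv', sep='\t')    # hypothetical file name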

Lemmatize corpus

dev_path = 'mt-summit-corpora/dev/dev'

skip_chars = ''',./!?'''  # punctuation tokens to drop when lemmatizing the corpus

# Lemmatize the English side of the dev corpus line by line, dropping bare
# punctuation tokens and undoing the spaces added around punctuation.
with open(dev_path + '.en', 'r') as file:
    file_lemmatized = []
    for line in file:
        temp = []
        for token in spacy_nlp_en(line):
            temp.append(token.lemma_)
        file_lemmatized.append(' '.join([x for x in temp if x not in skip_chars]).replace(' - ', '-').replace('  ', ' ').replace(' / ', '/').replace(' ( ', '(').replace(' ) ', ')'))

# Same for the Polish side.
with open(dev_path + '.pl', 'r') as file:
    file_pl_lemmatized = []
    for line in file:
        temp = []
        for token in spacy_nlp_pl(line):
            temp.append(token.lemma_)
        file_pl_lemmatized.append(' '.join([x for x in temp if x not in skip_chars]).replace(' - ', '-').replace('  ', ' ').replace(' / ', '/').replace(' ( ', '(').replace(' ) ', ')'))

print(file_lemmatized[2])
print(file_pl_lemmatized[2])
in the course of the control the control audit firm shall fulfil the responsibility refer to in article 114 on date and in form specify by the controller 

w czas trwanie kontrola kontrolowany firma audytorski wypełnia obowiązek o których mowa w art 114 w ter-mina i forma wskazany przez osoba kontrolującą 
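Lemmatizing line by line calls the pipeline once per sentence; spaCy's nlp.pipe can batch the same work. A sketch of an equivalent batched pass over the English side, assuming the same dev file and the same cleanup chain as the cell above (the Polish side would be analogous):

# Sketch: batched lemmatization with nlp.pipe, same cleanup as above.
def clean_lemmas(doc):
    lemmas = [t.lemma_ for t in doc if t.lemma_ not in skip_chars]
    return (' '.join(lemmas).replace(' - ', '-').replace('  ', ' ')
            .replace(' / ', '/').replace(' ( ', '(').replace(' ) ', ')'))

with open(dev_path + '.en', 'r') as file:
    file_lemmatized = [clean_lemmas(doc) for doc in spacy_nlp_en.pipe(file, batch_size=64)]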

Inject glossary

!!! Obsolete !!!

import spacy
from spaczz.matcher import FuzzyMatcher


glossary = pd.read_csv('kompendium_lem.tsv', sep='\t', header=0, index_col=0)
bad_words = ['ocf', 'toc', 'vas', 'vat']  # glossary entries excluded from the matcher below
train_glossary = glossary.iloc[[x for x in range(len(glossary)) if x % 6 != 0]]  # hold out every 6th entry

nlp = spacy.blank("en")
matcher = FuzzyMatcher(nlp.vocab)
for word in train_glossary['source_lem']:
    if word not in bad_words:
        matcher.add(word, [nlp(word)])
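A quick way to see what the matcher produces before running it over the whole corpus. The example sentence below is made up, and the 4-tuple unpacking mirrors the injection loop that follows:

# Sanity check: each hit is (pattern label, start token, end token, similarity ratio).
example = nlp('the abandonment cost be recognise in the current period')
for match_id, start, end, ratio in matcher(example):
    print(match_id, '->', example[start:end].text, ratio)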


# For every fuzzy match above 90% similarity, emit a copy of the English line
# with the Polish glossary translation inserted after the matched span.
# translation_line_counts records how many variants each source line produced,
# so the Polish side can be expanded to stay aligned.
en = []
translation_line_counts = []
for line_id, line in enumerate(file_lemmatized):
    doc = nlp(line)
    matches = matcher(doc)

    injected = 0
    for match_id, start, end, ratio in matches:
        if ratio > 90:
            injected += 1
            glossary_translation = train_glossary.loc[train_glossary['source_lem'] == match_id, 'result'].astype(str).iloc[0]
            en.append(doc[:end].text + ' ' + glossary_translation + ' ' + doc[end:].text)

    if injected == 0:
        injected = 1
        en.append(line)
    translation_line_counts.append(injected)

import copy
tlcs = copy.deepcopy(translation_line_counts)

# Write the Polish side, repeating each source sentence once per injected
# English variant so the two files stay parallel.
translations = pd.read_csv(dev_path + '.pl', sep='\t', header=None, names=['text'])
with open(dev_path + '.injected.pl', 'w') as file_pl:
    for trans in translations.iterrows():
        try:
            for _ in range(tlcs.pop(0)):
                file_pl.write(trans[1]['text'] + '\n')
        except IndexError:
            # counts exhausted; remaining source sentences are skipped
            pass


# Write the matching English side.
with open(dev_path + '.injected.en', 'w') as file_en:
    for line in en:
        file_en.write(line)
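Since the Polish file is expanded according to translation_line_counts, the number of English lines should equal the sum of the per-line copy counts. A small check of that invariant, assuming the cells above ran in order:

# One English line is appended per injection (or one fallback line), so this must hold.
assert len(en) == sum(translation_line_counts), (len(en), sum(translation_line_counts))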

Inject glossary with Polish crosscheck

import spacy
from spaczz.matcher import FuzzyMatcher

# glossary
glossary = pd.read_csv('kompendium_lem.tsv', sep='\t', header=0, index_col=0)
train_glossary = glossary.iloc[[x for x in range(len(glossary)) if x % 6 != 0]]  # hold out every 6th entry

# add rules to the English matcher, labelled with the lemmatized source term
nlp = spacy.blank("en")
matcher = FuzzyMatcher(nlp.vocab)
for word in train_glossary['source_lem']:
    matcher.add(word, [nlp(word)])

# add rules to the Polish matcher, labelled with the lemmatized target term
nlp_pl = spacy.blank("pl")
matcher_pl = FuzzyMatcher(nlp_pl.vocab)
for word in train_glossary['result_lem']:
    matcher_pl.add(word, [nlp_pl(word)])

# As above, but only inject when the Polish side of the sentence pair also
# fuzzy-matches the corresponding Polish glossary term (crosscheck).
en = []
translation_line_counts = []
for line_id in range(len(file_lemmatized)):

    doc = nlp(file_lemmatized[line_id])
    matches = matcher(doc)

    injected = 0
    for match_id, start, end, ratio in matches:
        if ratio > 90:
            doc_pl = nlp_pl(file_pl_lemmatized[line_id])
            matches_pl = matcher_pl(doc_pl)

            expected_pl = glossary.loc[glossary['source_lem'] == match_id, 'result_lem'].iloc[0]
            for match_id_pl, start_pl, end_pl, ratio_pl in matches_pl:
                if match_id_pl == expected_pl:
                    injected += 1
                    glossary_translation = train_glossary.loc[train_glossary['source_lem'] == match_id, 'result'].astype(str).iloc[0]
                    en.append(doc[:end].text + ' ' + glossary_translation + ' ' + doc[end:].text)

    if injected == 0:
        injected = 1
        en.append(file_lemmatized[line_id])
    translation_line_counts.append(injected)
import copy


tlcs = copy.deepcopy(translation_line_counts)

# Write the crossvalidated pair: each distinct injected English line is written
# once, paired with its Polish source sentence; ctr tracks how many injected
# variants remain for the current source sentence before moving to the next one.
translations = pd.read_csv(dev_path + '.pl', sep='\t', header=None, names=['text'])
translations['id'] = [x for x in range(len(translations))]

ctr = 0
sentence = ''
with open(dev_path + '.injected.crossvalidated.en', 'w') as file_en:
    with open(dev_path + '.injected.crossvalidated.pl', 'w') as file_pl:
        for i in range(len(en)):
            if i > 0:
                if en[i-1] != en[i]:
                    if ctr == 0:
                        # move on to the next Polish source sentence
                        sentence = translations.iloc[0]
                        translations.drop(sentence['id'], inplace=True)
                        sentence = sentence['text']
                        try:
                            ctr = tlcs.pop(0)
                        except IndexError:
                            pass
                    file_en.write(en[i])
                    file_pl.write(sentence + '\n')
                ctr = ctr - 1
            else:
                # first line: take the first Polish sentence and its count
                try:
                    ctr = tlcs.pop(0) - 1
                except IndexError:
                    pass
                sentence = translations.iloc[0]
                translations.drop(sentence['id'], inplace=True)
                sentence = sentence['text']
                file_en.write(en[i])
                file_pl.write(sentence + '\n')
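The crossvalidated pair is only usable if both files stay line-aligned; a quick check after writing them, assuming the paths used above:

# Both output files should contain the same number of lines.
with open(dev_path + '.injected.crossvalidated.en') as f_en, \
        open(dev_path + '.injected.crossvalidated.pl') as f_pl:
    assert sum(1 for _ in f_en) == sum(1 for _ in f_pl)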

Inject glossary with Polish crosscheck (faster?)

import time
import spacy
from spaczz.matcher import FuzzyMatcher


# glossary
glossary = pd.read_csv('kompendium_lem.tsv', sep='\t', header=0, index_col=0)
train_glossary = glossary.iloc[[x for x in range(len(glossary)) if x % 6 != 0]]  # hold out every 6th entry

# add rules to the English matcher, labelled with the lemmatized source term
nlp = spacy.blank("en")
matcher = FuzzyMatcher(nlp.vocab)
for word in train_glossary['source_lem']:
    matcher.add(word, [nlp(word)])

# add rules to the Polish matcher, labelled with the lemmatized target term
nlp_pl = spacy.blank("pl")
matcher_pl = FuzzyMatcher(nlp_pl.vocab)
for word in train_glossary['result_lem']:
    matcher_pl.add(word, [nlp_pl(word)])

# Faster variant: each line is tokenized once, only the longest English match
# per line is considered, and the Polish crosscheck stops at the first hit.
start_time = time.time_ns()
en = []
injection_counter = 0
for line_id in range(len(file_lemmatized)):

    doc = nlp(file_lemmatized[line_id])
    matches = matcher(doc)

    not_injected = True
    if len(matches) > 0:
        # keep only the match whose glossary term (pattern label) is longest
        match_id, _, end, ratio = sorted(matches, key=lambda x: len(x[0]), reverse=True)[0]
        if ratio > 90:
            matches_pl = matcher_pl(nlp_pl(file_pl_lemmatized[line_id]))

            expected_pl = glossary.loc[glossary['source_lem'] == match_id, 'result_lem'].iloc[0]
            for match_id_pl, _, _, _ in matches_pl:
                if match_id_pl == expected_pl:
                    not_injected = False
                    injection_counter += 1
                    glossary_translation = train_glossary.loc[train_glossary['source_lem'] == match_id, 'result'].astype(str).iloc[0]
                    en.append(doc[:end].text + ' ' + glossary_translation + ' ' + doc[end:].text)
                    break

    if not_injected:
        en.append(file_lemmatized[line_id])

stop = time.time_ns()
timex = (stop - start_time) / 1000000000
print(f'took {timex} injected {injection_counter} words. rate {len(file_lemmatized)/timex} sen/s')
took 152.213599056 injected 63 words. rate 6.569715230451229 sen/s
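Most of the runtime above goes into re-running the blank pipelines inside the loop. A hedged sketch of precomputing both sides once with nlp.pipe before matching; the matching logic itself would stay as written above:

# Sketch: tokenize every line once up front instead of inside the loop.
docs_en = list(nlp.pipe(file_lemmatized, batch_size=256))
docs_pl = list(nlp_pl.pipe(file_pl_lemmatized, batch_size=256))
# ...then iterate over line_id and call matcher(docs_en[line_id]) / matcher_pl(docs_pl[line_id]).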
import copy


# Note: translation_line_counts is reused from the previous (slow) crosscheck
# run; the faster loop above does not rebuild it.
tlcs = copy.deepcopy(translation_line_counts)

translations = pd.read_csv(dev_path + '.pl', sep='\t', header=None, names=['text'])
translations['id'] = [x for x in range(len(translations))]

ctr = 0
sentence = ''
with open(dev_path + '.injected.crossvalidated.en', 'w') as file_en:
    with open(dev_path + '.injected.crossvalidated.pl', 'w') as file_pl:
        for i in range(len(en)):
            if i > 0:
                if en[i-1] != en[i]:
                    if ctr == 0:
                        # move on to the next Polish source sentence
                        sentence = translations.iloc[0]
                        translations.drop(sentence['id'], inplace=True)
                        sentence = sentence['text']
                        try:
                            ctr = tlcs.pop(0)
                        except IndexError:
                            pass
                    file_en.write(en[i])
                    file_pl.write(sentence + '\n')
                ctr = ctr - 1
            else:
                # first line: take the first Polish sentence and its count
                try:
                    ctr = tlcs.pop(0) - 1
                except IndexError:
                    pass
                sentence = translations.iloc[0]
                translations.drop(sentence['id'], inplace=True)
                sentence = sentence['text']
                file_en.write(en[i])
                file_pl.write(sentence + '\n')