transfix-mt/rapidfuzztest.ipynb



import nltk
import pandas as pd
import rapidfuzz
import time

from nltk.stem import WordNetLemmatizer
from rapidfuzz.fuzz import partial_ratio
from rapidfuzz.utils import default_process

nltk.download('wordnet')


wl = WordNetLemmatizer()

glossary = pd.read_csv('mt-summit-corpora/glossary.tsv', sep='\t', header=None, names=['source', 'result'])

source_lemmatized = []
for word in glossary['source']:
    word = nltk.word_tokenize(word)
    source_lemmatized.append(' '.join([wl.lemmatize(x) for x in word]))

glossary['source_lem'] = source_lemmatized
glossary = glossary[['source', 'source_lem', 'result']]
glossary.set_index('source_lem')  # display only: the result is not assigned, so glossary keeps 'source_lem' as a regular column

[nltk_data] Downloading package wordnet to /home/kuba/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
                                    source                                             result
source_lem
aaofi                                aaofi  organizacja rachunkowości i audytu dla islamsk...
aca                                    aca  członek stowarzyszenia dyplomowanych biegłych ...
acca                                  acca   stowarzyszenie dyplomowanych biegłych rewidentów
abacus                              abacus                                           liczydło
abandonment cost         abandonment costs                                 koszty zaniechania
...                                    ...                                                ...
ytd                                    ytd                                   od początku roku
year-end                          year-end                                        koniec roku
year-to-date                  year-to-date                                   od początku roku
zog                                    zog                      zero wzrostu kosztów ogólnych
zero overhead growth  zero overhead growth                      zero wzrostu kosztów ogólnych

1197 rows × 2 columns
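
As a quick sanity check of the lemmatization step above (illustrative words, not taken from the glossary), WordNetLemmatizer defaults to noun lemmas:

wl.lemmatize('costs')        # -> 'cost'
wl.lemmatize('abandonment')  # -> 'abandonment'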

# train_in_path = 'mt-summit-corpora/train/in.tsv'
# train_expected_path = 'mt-summit-corpora/train/expected.tsv'

train_in_path = 'mt-summit-corpora/dev-0/in.tsv'
train_expected_path = 'mt-summit-corpora/dev-0/expected.tsv'


start_time = time.time_ns()
file_lemmatized = []
with open(train_in_path, 'r') as file:
    for line in file:
        if len(file_lemmatized) % 50000 == 0:
            print('lemmatizing file: ' + train_in_path + ': ' + str(len(file_lemmatized)), end='\r')
        line = nltk.word_tokenize(line)
        file_lemmatized.append(' '.join([wl.lemmatize(x) for x in line]))

stop = time.time_ns()
timex = (stop - start_time) / 1e9  # nanoseconds -> seconds
print(timex)
0.191720194

THRESHOLD = 70
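
For intuition about the cutoff: fuzz.ratio compares whole strings, while partial_ratio (used for the glossary lookup below) scores the best-matching substring, so inflected forms of a term still clear 70. The scores here are illustrative:

rapidfuzz.fuzz.ratio('abandonment cost', 'abandonment costs')
# ~97 -- one extra character
rapidfuzz.fuzz.partial_ratio('abandonment cost', 'the abandonment costs were high')
# 100 -- the term matches a substring exactly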


def is_injectable(sentence_pl, sequence):
    """Return True if `sequence` (a Polish glossary translation) fuzzily
    matches some contiguous word window of the Polish sentence."""
    sen = sentence_pl.split()
    window_size = len(sequence.split())
    maxx = 0
    for i in range(len(sen) - window_size + 1):
        current = rapidfuzz.fuzz.ratio(' '.join(sen[i:i + window_size]), sequence)
        if current > maxx:
            maxx = current
    return maxx >= THRESHOLD

def get_injected(sentence, sentence_en, sequence, inject):
    """Find the best-scoring window for `sequence` (a lemmatized glossary
    term) in the lemmatized sentence and splice ' $inject$ ' (its Polish
    translation) into the original English sentence right after the
    corresponding window. If the best score is tied between two windows,
    the match is ambiguous and the sentence is returned unchanged."""
    sen = sentence.split()
    sen_en = sentence_en.split()
    window_size = len(sequence.split())
    maxx = 0
    maxx_prv = 0
    maxxi = 0
    for i in range(len(sen) - window_size + 1):
        current = rapidfuzz.fuzz.ratio(' '.join(sen[i:i + window_size]), sequence)
        if current >= maxx:
            maxx_prv = maxx  # remember the previous best to detect ties
            maxx = current
            maxxi = i
    if maxx_prv != maxx:
        return ' '.join(sen_en[:maxxi + window_size]) + ' $' + inject + '$ ' + ' '.join(sen_en[maxxi + window_size:])
    return sentence_en
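
A minimal sanity check of the two helpers on a made-up sentence pair (the strings below are illustrative, not taken from the corpus):

demo_lem = 'the abandonment cost be high this year'     # lemmatized English
demo_en = 'the abandonment costs were high this year'   # original English
demo_pl = 'koszty zaniechania byly w tym roku wysokie'  # Polish reference

is_injectable(demo_pl, 'koszty zaniechania')
# True -- the translation occurs verbatim, well above THRESHOLD
get_injected(demo_lem, demo_en, 'abandonment cost', 'koszty zaniechania')
# 'the abandonment costs $koszty zaniechania$ were high this year'

Note that get_injected splices by token index, so it implicitly assumes the lemmatized and original sentences tokenize to the same number of words.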

glossary['source_lem'] = [str(default_process(x)) for x in glossary['source_lem']]
# keep ~83% of glossary entries; note that Python's built-in hash() is salted
# per interpreter run unless PYTHONHASHSEED is fixed, so this split is not
# reproducible across runs
glossary['hash'] = [hash(x) for x in glossary['source']]
glossary = glossary[glossary['hash'] % 100 > 16]
file_pl = pd.read_csv(train_expected_path, sep='\t', header=None, names=['text'])
file_pl['text'] = [default_process(text) for text in file_pl['text'].values.tolist()]
file_en = pd.read_csv(train_in_path, sep='\t', header=None, names=['text'])
file_en['text'] = [default_process(text) for text in file_en['text'].values.tolist()]
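
For reference, default_process lowercases its input, replaces non-alphanumeric characters with whitespace, and trims the ends, e.g.:

default_process('Abandonment Costs!')  # -> 'abandonment costs'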

start_time = time.time_ns()
en = []
translation_line_counts = []
for line, line_en, line_pl in zip(file_lemmatized, file_en['text'].values.tolist(), file_pl['text'].values.tolist()):
    line = default_process(line)
    # up to 5 glossary terms whose lemmatized form fuzzily occurs in this line
    matchez = rapidfuzz.process.extract(query=line, choices=glossary['source_lem'], limit=5, score_cutoff=THRESHOLD, scorer=partial_ratio)
    if len(matchez) > 0:
        lines_added = 0
        for match in matchez:
            polish_translation = glossary.loc[lambda df: df['source_lem'] == match[0]]['result'].astype(str).values.flatten()[0]
            # inject only when the translation is also present in the Polish reference
            if is_injectable(line_pl, polish_translation):
                en.append(get_injected(line, line_en, match[0], polish_translation))
                lines_added += 1
        if lines_added == 0:
            en.append(line_en)
            lines_added = 1
        translation_line_counts.append(lines_added)
    else:
        translation_line_counts.append(1)
        en.append(line_en)


stop = time.time_ns()
timex = (stop - start_time) / 1e9  # nanoseconds -> seconds
print(timex)
1197
985
6.116408593

def full_strip(line):
    # collapse internal whitespace runs and strip the ends
    return ' '.join(line.split())

# duplicate each Polish line once per injected English variant so the two
# .injected files stay line-aligned
with open(train_expected_path + '.injected', 'w') as file_pl_write:
    for line, translation_line_ct in zip(file_pl['text'].values.tolist(), translation_line_counts):
        for i in range(translation_line_ct):
            file_pl_write.write(full_strip(line) + '\n')


with open(train_in_path + '.injected', 'w') as file_en_write:
    for e in en:
        file_en_write.write(e + '\n')
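
A final consistency check worth keeping at the end of the notebook (a sketch, relying on the invariant that the loop above appends exactly translation_line_counts[i] English lines per source line):

# the two .injected files must end up with the same number of lines
assert sum(translation_line_counts) == len(en)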