mt-summit-corpora/random-scripts/inject_rapid.py

131 lines
4.6 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import spacy
import copy
import pandas as pd
import rapidfuzz
from rapidfuzz.fuzz import partial_ratio
import time
from rapidfuzz.utils import default_process
import sys
# Ask spaCy for GPU execution before any pipeline is loaded.
# NOTE(review): presumably this fails on machines without a usable GPU —
# confirm against the spaCy documentation.
spacy.require_gpu()
# English and Polish spaCy pipelines; the code below only reads each
# token's ``lemma_`` from them.
spacy_nlp_en = spacy.load('en_core_web_sm')
spacy_nlp_pl = spacy.load("pl_core_news_sm")
def read_arguments():
    """Return the corpus and glossary paths given on the command line.

    Exactly two positional arguments are expected after the script name.

    Returns:
        tuple: ``(corpus_path, glossary_path)`` as strings.

    Exits:
        Prints an error message and terminates with status 1 when the
        number of arguments is not two.
    """
    try:
        # sys.argv[0] is the script name, so only the tail holds arguments.
        corpus_path, glossary_path = sys.argv[1:]
    except ValueError:
        print("ERROR: Wrong argument amount.")
        sys.exit(1)
    return corpus_path, glossary_path
# Load the raw two-column glossary (no header row): source term, target term.
glossary = pd.read_csv('mt-summit-corpora/glossary.tsv', sep='\t', header=None, names=['source', 'result'])

# Lemmatize every source term with the English pipeline and every target
# term with the Polish one, then apply the same chain of string replacements
# to the space-joined lemmas.
# NOTE(review): the second replacement's search string renders as a single
# space here, which would make the later ' / ', ' ( ' and ' ) ' replacements
# no-ops. The hosting page flagged ambiguous Unicode characters in this file,
# so confirm the exact literal against the raw file.
source_lemmatized = [
    ' '.join(token.lemma_ for token in spacy_nlp_en(term))
    .replace(' - ', '-').replace(' ', '').replace(' / ', '/').replace(' ( ', '(').replace(' ) ', ')')
    for term in glossary['source']
]
result_lemmatized = [
    ' '.join(token.lemma_ for token in spacy_nlp_pl(term))
    .replace(' - ', '-').replace(' ', '').replace(' / ', '/').replace(' ( ', '(').replace(' ) ', ')')
    for term in glossary['result']
]

# Attach the lemmatized columns, put each one next to its original column,
# and write the table out as TSV.
glossary['source_lem'] = source_lemmatized
glossary['result_lem'] = result_lemmatized
glossary = glossary[['source', 'source_lem', 'result', 'result_lem']]
glossary.to_csv('kompendium_lem.tsv', sep='\t')
corpus_path = 'mt-summit-corpora/train/'
# Lemmas that occur inside this string are dropped from every line.
# NOTE(review): ``x not in skip_chars`` is a substring test, so multi-character
# lemmas such as './' (and the empty string) are dropped as well — confirm
# that this is intended.
skip_chars = ''',./!?'''


def _lemmatize_lines(path, nlp):
    """Lemmatize the file at ``path`` line by line with the pipeline ``nlp``.

    Each line is run through ``nlp``, lemmas found in ``skip_chars`` are
    removed, the rest are joined with spaces and passed through the same
    replacement chain as the glossary terms. A progress counter is printed
    every 10000 lines.

    Returns:
        list[str]: one normalized string per input line, in file order.
    """
    lemmatized = []
    with open(path, 'r') as file:
        for line in file:
            # Progress is reported from this call's own counter; the original
            # Polish loop printed the English list's length by mistake.
            if len(lemmatized) % 10000 == 0:
                print(len(lemmatized), end='\r')
            lemmas = [token.lemma_ for token in nlp(line)]
            lemmatized.append(' '.join([x for x in lemmas if x not in skip_chars]).replace(' - ', '-').replace(' ', '').replace(' / ', '/').replace(' ( ', '(').replace(' ) ', ')'))
    return lemmatized


# English side (in.tsv) and Polish side (expected.tsv) of the corpus.
file_lemmatized = _lemmatize_lines(corpus_path + 'in.tsv', spacy_nlp_en)
file_pl_lemmatized = _lemmatize_lines(corpus_path + 'expected.tsv', spacy_nlp_pl)
# Minimum fuzzy-match score for a glossary entry to be kept; passed as
# ``score_cutoff`` to ``rapidfuzz.process.extract`` in the matching loop.
THRESHOLD = 88
def is_injectable(sentence_pl, sequence):
    """Return the best fuzzy score of ``sequence`` over windows of ``sentence_pl``.

    The sentence is split on whitespace and scanned with a sliding window as
    many tokens wide as ``sequence``. Each window is compared to ``sequence``
    with ``rapidfuzz.fuzz.partial_ratio``.

    Args:
        sentence_pl: Sentence to search, as a whitespace-separated string.
        sequence: Term to look for, as a whitespace-separated string.

    Returns:
        The highest score of any window, or 0 when the sentence has fewer
        tokens than ``sequence``.

    NOTE(review): despite the name, this returns a score rather than a
    boolean, so any non-zero score is truthy. Compare against a threshold at
    the call site if a yes/no answer is wanted.
    """
    sen = sentence_pl.split()
    window_size = len(sequence.split())
    # The last valid window starts at len(sen) - window_size, so the range
    # must run one past it. The original stopped one short and never scored
    # the final window.
    window_starts = range(len(sen) - window_size + 1)
    return max(
        (
            rapidfuzz.fuzz.partial_ratio(' '.join(sen[i:i + window_size]), sequence)
            for i in window_starts
        ),
        default=0,
    )
def inject(sentence, sequence):
    """Insert the glossary translation of ``sequence`` into ``sentence``.

    The sentence is scanned with a sliding window as many tokens wide as
    ``sequence``. The translation is inserted directly after the window that
    scores highest under ``rapidfuzz.fuzz.partial_ratio`` (the first such
    window on ties, the start of the sentence when nothing scores above 0).

    Args:
        sentence: Sentence to modify, as a whitespace-separated string.
        sequence: Lemmatized glossary term; looked up by exact equality in
            the ``source_lem`` column of the module-level ``glossary``.

    Returns:
        A NumPy array with one injected sentence per glossary row whose
        ``source_lem`` equals ``sequence`` (empty when there is none).
        Callers index the result, e.g. ``inject(...)[0]``.
    """
    sen = sentence.split()
    window_size = len(sequence.split())
    best_score = 0
    best_start = 0
    # +1 so the window ending on the last token is scored too. The original
    # range stopped one window short.
    for start in range(len(sen) - window_size + 1):
        score = rapidfuzz.fuzz.partial_ratio(' '.join(sen[start:start + window_size]), sequence)
        if score > best_score:
            best_score = score
            best_start = start
    split_at = best_start + window_size
    # Array of translations; str + array concatenates element-wise, which is
    # why the function returns an array and not a single string.
    translations = glossary.loc[lambda df: df['source_lem'] == sequence]['result'].astype(str).values.flatten()
    return ' '.join(sen[:split_at]) + ' ' \
        + translations \
        + ' ' + ' '.join(sen[split_at:])
# Replace the glossary with the cleaned, lemmatized table and normalize its
# lookup column the same way the corpus lines are normalized below.
# NOTE(review): the visible code writes 'kompendium_lem.tsv' but reads
# '../kompendium_lem_cleaned.tsv'; presumably a separate cleaning step
# produces the latter — confirm.
glossary = pd.read_csv('../kompendium_lem_cleaned.tsv', sep='\t', header=0, index_col=0)
glossary['source_lem'] = [default_process(x) for x in glossary['source_lem']]

start_time = time.time_ns()
en = []  # one injected English sentence per glossary match
translation_line_counts = []  # number of matches found for each corpus line
total_lines = len(file_lemmatized)
for line, line_pl in zip(file_lemmatized, file_pl_lemmatized):
    if len(translation_line_counts) % 50000 == 0:
        # end='\r' belongs to print(); the original passed it to str(), which
        # raised TypeError on the very first iteration.
        print(str(len(translation_line_counts)) + '/' + str(total_lines), end='\r')
    line = default_process(line)
    line_pl = default_process(line_pl)
    matchez = rapidfuzz.process.extract(query=line, choices=glossary['source_lem'], limit=5, score_cutoff=THRESHOLD, scorer=partial_ratio)
    translation_line_counts.append(len(matchez))
    for match in matchez:
        # Cross-check against the Polish side is currently disabled:
        # if is_injectable(line_pl, match[0]):
        en.append(inject(line, match[0])[0])

stop = time.time_ns()
timex = (stop - start_time) / 1000000000  # elapsed seconds
print(timex)

tlcs = copy.deepcopy(translation_line_counts)
translations = pd.read_csv(corpus_path + 'expected.tsv', sep='\t', header=None, names=['text'])
# Repeat each Polish line once per injected English sentence so that the two
# output files stay parallel.
# NOTE(review): 'extected' in the file name looks like a typo for 'expected';
# it is kept because downstream consumers may rely on the name.
# NOTE(review): the counts come from raw line iteration while the text comes
# from pandas, whose default quote handling and blank-line skipping could
# shift rows — confirm the two stay aligned on this corpus.
with open(corpus_path + 'extected.tsv.injected.crossvalidated.pl', 'w') as file_pl:
    # Iterate the column: iterating the DataFrame itself yields only the
    # column label 'text'.
    for line, translation_line_ct in zip(translations['text'], tlcs):
        for _ in range(translation_line_ct):
            # Parsed values carry no line terminator, so add one per record.
            file_pl.write(line + '\n')

with open(corpus_path + 'in.tsv.injected.crossvalidated.en', 'w') as file_en:
    for e in en:
        file_en.write(e + '\n')