import copy
import sys
import time

import pandas as pd
import rapidfuzz
import spacy
from rapidfuzz.fuzz import partial_ratio
from rapidfuzz.utils import default_process

spacy.require_gpu()
spacy_nlp_en = spacy.load('en_core_web_sm')
spacy_nlp_pl = spacy.load('pl_core_news_sm')


def read_arguments():
    # Currently unused: the paths are hardcoded below.
    try:
        _, corpus_path, glossary_path = sys.argv
        return corpus_path, glossary_path
    except ValueError:
        print('ERROR: Wrong argument amount.')
        sys.exit(1)


def join_lemmas(lemmas):
    # Re-join lemmas into a string and undo the spaces spaCy leaves
    # around punctuation that should stay attached to its neighbours.
    return ' '.join(lemmas) \
        .replace(' - ', '-') \
        .replace(' ’', '’') \
        .replace(' / ', '/') \
        .replace(' ( ', '(') \
        .replace(' ) ', ')')


# Build a lemmatized version of the glossary (English source terms,
# Polish target terms) and save it for later cleaning.
glossary = pd.read_csv('mt-summit-corpora/glossary.tsv', sep='\t',
                       header=None, names=['source', 'result'])
glossary['source_lem'] = [join_lemmas([t.lemma_ for t in spacy_nlp_en(word)])
                          for word in glossary['source']]
glossary['result_lem'] = [join_lemmas([t.lemma_ for t in spacy_nlp_pl(word)])
                          for word in glossary['result']]
glossary = glossary[['source', 'source_lem', 'result', 'result_lem']]
glossary.to_csv('kompendium_lem.tsv', sep='\t')

corpus_path = 'mt-summit-corpora/train/'
skip_chars = ',./!?'


def lemmatize_file(path, nlp):
    # Lemmatize a corpus file line by line, dropping bare punctuation tokens.
    lemmatized = []
    with open(path, 'r') as file:
        for line in file:
            if len(lemmatized) % 10000 == 0:
                print(len(lemmatized), end='\r')
            lemmas = [token.lemma_ for token in nlp(line)]
            lemmatized.append(join_lemmas([x for x in lemmas if x not in skip_chars]))
    return lemmatized


file_lemmatized = lemmatize_file(corpus_path + 'in.tsv', spacy_nlp_en)
file_pl_lemmatized = lemmatize_file(corpus_path + 'expected.tsv', spacy_nlp_pl)

THRESHOLD = 88


def is_injectable(sentence_pl, sequence):
    # True if some window of the Polish sentence fuzzy-matches the glossary
    # sequence at or above THRESHOLD.
    sen = sentence_pl.split()
    window_size = len(sequence.split())
    best = 0
    for i in range(len(sen) - window_size + 1):
        current = rapidfuzz.fuzz.partial_ratio(' '.join(sen[i:i + window_size]), sequence)
        if current > best:
            best = current
    return best >= THRESHOLD


def inject(sentence, sequence):
    # Insert the glossary translation of `sequence` directly after the window
    # of `sentence` that best fuzzy-matches it.
    sen = sentence.split()
    window_size = len(sequence.split())
    best = 0
    best_i = 0
    for i in range(len(sen) - window_size + 1):
        current = rapidfuzz.fuzz.partial_ratio(' '.join(sen[i:i + window_size]), sequence)
        if current > best:
            best = current
            best_i = i
    translation = glossary.loc[glossary['source_lem'] == sequence, 'result'].astype(str).iloc[0]
    return ' '.join(sen[:best_i + window_size]) + ' ' + translation + ' ' \
        + ' '.join(sen[best_i + window_size:])


# Reload the glossary; kompendium_lem_cleaned.tsv is assumed to be a manually
# cleaned version of the kompendium_lem.tsv written above.
glossary = pd.read_csv('../kompendium_lem_cleaned.tsv', sep='\t', header=0, index_col=0)
glossary['source_lem'] = [default_process(x) for x in glossary['source_lem']]

start_time = time.time_ns()
en = []
translation_line_counts = []
for line, line_pl in zip(file_lemmatized, file_pl_lemmatized):
    if len(translation_line_counts) % 50000 == 0:
        print(str(len(translation_line_counts)) + '/' + str(len(file_lemmatized)), end='\r')
    line = default_process(line)
    line_pl = default_process(line_pl)
    # Each match is a (matched source_lem, score, index) tuple.
    matchez = rapidfuzz.process.extract(query=line, choices=glossary['source_lem'],
                                        limit=5, score_cutoff=THRESHOLD,
                                        scorer=partial_ratio)
    translation_line_counts.append(len(matchez))
    for match in matchez:
        # if is_injectable(line_pl, match[0]):
        en.append(inject(line, match[0]))

stop = time.time_ns()
print((stop - start_time) / 1e9)  # elapsed time in seconds

tlcs = copy.deepcopy(translation_line_counts)

# Each source line yields one output line per injected glossary match, so the
# reference translation is repeated the same number of times to keep the two
# output files aligned.
translations = pd.read_csv(corpus_path + 'expected.tsv', sep='\t',
                           header=None, names=['text'])
with open(corpus_path + 'expected.tsv.injected.crossvalidated.pl', 'w') as file_pl:
    for line, translation_line_ct in zip(translations['text'], tlcs):
        for _ in range(translation_line_ct):
            file_pl.write(line + '\n')

with open(corpus_path + 'in.tsv.injected.crossvalidated.en', 'w') as file_en:
    for e in en:
        file_en.write(e + '\n')
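
# Illustrative example (hypothetical glossary entry, not from the data above):
# assuming the cleaned glossary maps the lemmatized source 'machine translation'
# to the Polish term 'tłumaczenie maszynowe', inject() appends that term right
# after the best-matching window of the lemmatized source sentence:
#
#   inject('this paper describe a machine translation system', 'machine translation')
#   # -> 'this paper describe a machine translation tłumaczenie maszynowe system'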