diff --git a/random-scripts/inject_rapid.py b/random-scripts/inject_rapid.py
new file mode 100644
index 0000000..d7a24b1
--- /dev/null
+++ b/random-scripts/inject_rapid.py
@@ -0,0 +1,136 @@
+import copy
+import sys
+import time
+
+import pandas as pd
+import rapidfuzz
+import spacy
+from rapidfuzz.fuzz import partial_ratio
+from rapidfuzz.utils import default_process
+
+spacy.require_gpu()
+
+spacy_nlp_en = spacy.load('en_core_web_sm')
+spacy_nlp_pl = spacy.load('pl_core_news_sm')
+
+
+def read_arguments():
+    # NOTE: currently unused; the paths below are hardcoded
+    try:
+        _, corpus_path, glossary_path = sys.argv
+        return corpus_path, glossary_path
+    except ValueError:
+        print('Usage: inject_rapid.py corpus_path glossary_path')
+        sys.exit(1)
+
+
+def fix_spacing(text):
+    # undo the spaces the tokenizer introduced around punctuation
+    return text.replace(' - ', '-').replace(' ’', '’').replace(' / ', '/').replace(' ( ', '(').replace(' ) ', ')')
+
+
+glossary = pd.read_csv('mt-summit-corpora/glossary.tsv', sep='\t', header=None, names=['source', 'result'])
+
+source_lemmatized = []
+for word in glossary['source']:
+    lemmas = [token.lemma_ for token in spacy_nlp_en(word)]
+    source_lemmatized.append(fix_spacing(' '.join(lemmas)))
+
+result_lemmatized = []
+for word in glossary['result']:
+    lemmas = [token.lemma_ for token in spacy_nlp_pl(word)]
+    result_lemmatized.append(fix_spacing(' '.join(lemmas)))
+
+glossary['source_lem'] = source_lemmatized
+glossary['result_lem'] = result_lemmatized
+glossary = glossary[['source', 'source_lem', 'result', 'result_lem']]
+glossary.to_csv('kompendium_lem.tsv', sep='\t')
+
+corpus_path = 'mt-summit-corpora/train/'
+
+skip_chars = ',./!?'
+
+with open(corpus_path + 'in.tsv', 'r') as file:
+    file_lemmatized = []
+    for line in file:
+        if len(file_lemmatized) % 10000 == 0:
+            print(len(file_lemmatized), end='\r')
+        lemmas = [token.lemma_ for token in spacy_nlp_en(line)]
+        file_lemmatized.append(fix_spacing(' '.join([x for x in lemmas if x not in skip_chars])))
+
+with open(corpus_path + 'expected.tsv', 'r') as file:
+    file_pl_lemmatized = []
+    for line in file:
+        if len(file_pl_lemmatized) % 10000 == 0:
+            print(len(file_pl_lemmatized), end='\r')
+        lemmas = [token.lemma_ for token in spacy_nlp_pl(line)]
+        file_pl_lemmatized.append(fix_spacing(' '.join([x for x in lemmas if x not in skip_chars])))
+
+THRESHOLD = 88
+
+
+def is_injectable(sentence_pl, sequence):
+    # does the Polish side contain a window that fuzzy-matches the sequence?
+    sen = sentence_pl.split()
+    window_size = len(sequence.split())
+    max_score = 0
+    for i in range(len(sen) - window_size + 1):
+        current = partial_ratio(' '.join(sen[i:i + window_size]), sequence)
+        if current > max_score:
+            max_score = current
+    return max_score >= THRESHOLD
+
+
+def inject(sentence, sequence):
+    # splice the glossary translation in right after the best-matching window
+    sen = sentence.split()
+    window_size = len(sequence.split())
+    max_score = 0
+    best_i = 0
+    for i in range(len(sen) - window_size + 1):
+        current = partial_ratio(' '.join(sen[i:i + window_size]), sequence)
+        if current > max_score:
+            max_score = current
+            best_i = i
+    translation = glossary.loc[glossary['source_lem'] == sequence, 'result'].astype(str).iloc[0]
+    return ' '.join(sen[:best_i + window_size]) + ' ' + translation + ' ' + ' '.join(sen[best_i + window_size:])
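+
+# Illustrative sketch (hypothetical data, not executed): assuming the glossary
+# contains the row ('gearbox', 'skrzynia biegów'), calling
+# inject('the gearbox be mount on the engine', 'gearbox') would return
+# 'the gearbox skrzynia biegów be mount on the engine', i.e. the Polish term
+# is spliced in right after the best-matching window.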
+
+# reload the cleaned version of the lemmatized glossary
+glossary = pd.read_csv('../kompendium_lem_cleaned.tsv', sep='\t', header=0, index_col=0)
+glossary['source_lem'] = [default_process(x) for x in glossary['source_lem']]
+
+start_time = time.time_ns()
+en = []
+translation_line_counts = []
+for line, line_pl in zip(file_lemmatized, file_pl_lemmatized):
+    if len(translation_line_counts) % 50000 == 0:
+        print(f'{len(translation_line_counts)}/{len(file_lemmatized)}', end='\r')
+    line = default_process(line)
+    line_pl = default_process(line_pl)
+    matches = rapidfuzz.process.extract(query=line, choices=glossary['source_lem'], limit=5, score_cutoff=THRESHOLD, scorer=partial_ratio)
+    translation_line_counts.append(len(matches))
+    for match in matches:
+        # if is_injectable(line_pl, match[0]):
+        en.append(inject(line, match[0]))
+
+stop = time.time_ns()
+timex = (stop - start_time) / 1e9  # ns -> s
+print(timex)
+
+tlcs = copy.deepcopy(translation_line_counts)
+
+translations = pd.read_csv(corpus_path + 'expected.tsv', sep='\t', header=None, names=['text'])
+with open(corpus_path + 'expected.tsv.injected.crossvalidated.pl', 'w') as file_pl:
+    for line, translation_line_ct in zip(translations['text'], tlcs):
+        for i in range(translation_line_ct):
+            file_pl.write(line + '\n')
+
+with open(corpus_path + 'in.tsv.injected.crossvalidated.en', 'w') as file_en:
+    for e in en:
+        file_en.write(e + '\n')
diff --git a/random-scripts/training-command.txt b/random-scripts/training-command.txt
new file mode 100644
index 0000000..9f72665
--- /dev/null
+++ b/random-scripts/training-command.txt
@@ -0,0 +1,31 @@
+first iteration:
+./marian/build/marian --model mt.npz \
+--type transformer --overwrite \
+--train-sets mt-summit-corpora/mt-summit-corpora/dev/dev.en \
+mt-summit-corpora/mt-summit-corpora/dev/dev.pl \
+--disp-freq 1000 \
+--save-freq 1000 \
+--optimizer adam \
+--lr-report
+
+next iterations:
+./marian/build/marian --model mt.npz \
+--type transformer --overwrite \
+--train-sets mt-summit-corpora/mt-summit-corpora/dev/dev.en \
+mt-summit-corpora/mt-summit-corpora/dev/dev.pl \
+--disp-freq 1000 \
+--save-freq 1000 \
+--optimizer adam \
+--lr-report \
+--pretrained-model mt.npz
+
+full train set:
+./marian/build/marian --model mt.npz \
+--type transformer --overwrite \
+--train-sets mt-summit-corpora/mt-summit-corpora/train/train.en \
+mt-summit-corpora/mt-summit-corpora/train/train.pl \
+--disp-freq 1000 \
+--save-freq 10000 \
+--optimizer adam \
+--lr-report \
+--pretrained-model mt.npz
diff --git a/random-scripts/venv-setup.sh b/random-scripts/venv-setup.sh
new file mode 100644
index 0000000..df18dde
--- /dev/null
+++ b/random-scripts/venv-setup.sh
@@ -0,0 +1,16 @@
+#!/bin/bash
+
+apt install python3-pip
+apt install python3-virtualenv
+virtualenv -p python3.8 gpu
+source gpu/bin/activate
+pip install pandas ipython
+pip install 'spacy[cuda114]'
+python -m spacy download en_core_web_sm
+python -m spacy download pl_core_news_sm
+pip install spaczz
+pip install rapidfuzz
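+
+# Hypothetical smoke test (an assumption, not documented here): with the venv
+# active, the injection script can then be run from the repository root:
+#   python random-scripts/inject_rapid.py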