commit 6b4c6e18f881c9390d615fda62b98773ad05e528
Author: jakubknczny
Date:   Sat Jan 22 00:04:56 2022 +0100

    init

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..ee04fb3
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,2 @@
+.idea
+mt-summit-corpora
\ No newline at end of file
diff --git a/random-scripts/inject_rapid.py b/random-scripts/inject_rapid.py
new file mode 100644
index 0000000..d7a24b1
--- /dev/null
+++ b/random-scripts/inject_rapid.py
@@ -0,0 +1,130 @@
+import spacy
+import copy
+import pandas as pd
+import rapidfuzz
+from rapidfuzz.fuzz import partial_ratio
+import time
+from rapidfuzz.utils import default_process
+import sys
+
+spacy.require_gpu()
+
+spacy_nlp_en = spacy.load('en_core_web_sm')
+spacy_nlp_pl = spacy.load("pl_core_news_sm")
+
+
+def read_arguments():
+    try:
+        _, corpus_path, glossary_path = sys.argv
+        return corpus_path, glossary_path
+    except ValueError:
+        print("ERROR: Wrong argument amount.")
+        sys.exit(1)
+
+
+glossary = pd.read_csv('mt-summit-corpora/glossary.tsv', sep='\t', header=None, names=['source', 'result'])
+
+source_lemmatized = []
+for word in glossary['source']:
+    temp = []
+    for token in spacy_nlp_en(word):
+        temp.append(token.lemma_)
+    source_lemmatized.append(' '.join(temp).replace(' - ', '-').replace(' ’', '’').replace(' / ', '/').replace(' ( ', '(').replace(' ) ', ')'))
+
+result_lemmatized = []
+for word in glossary['result']:
+    temp = []
+    for token in spacy_nlp_pl(word):
+        temp.append(token.lemma_)
+    result_lemmatized.append(' '.join(temp).replace(' - ', '-').replace(' ’', '’').replace(' / ', '/').replace(' ( ', '(').replace(' ) ', ')'))
+
+glossary['source_lem'] = source_lemmatized
+glossary['result_lem'] = result_lemmatized
+glossary = glossary[['source', 'source_lem', 'result', 'result_lem']]
+glossary.to_csv('kompendium_lem.tsv', sep='\t')
+
+corpus_path = 'mt-summit-corpora/train/'
+
+skip_chars = ''',./!?'''
+
+with open(corpus_path + 'in.tsv', 'r') as file:
+    file_lemmatized = []
+    for line in file:
+        if len(file_lemmatized) % 10000 == 0:
+            print(len(file_lemmatized), end='\r')
+        temp = []
+        for token in spacy_nlp_en(line):
+            temp.append(token.lemma_)
+        file_lemmatized.append(' '.join([x for x in temp if x not in skip_chars]).replace(' - ', '-').replace(' ’', '’').replace(' / ', '/').replace(' ( ', '(').replace(' ) ', ')'))
+
+with open(corpus_path + 'expected.tsv', 'r') as file:
+    file_pl_lemmatized = []
+    for line in file:
+        if len(file_pl_lemmatized) % 10000 == 0:
+            print(len(file_pl_lemmatized), end='\r')
+        temp = []
+        for token in spacy_nlp_pl(line):
+            temp.append(token.lemma_)
+        file_pl_lemmatized.append(' '.join([x for x in temp if x not in skip_chars]).replace(' - ', '-').replace(' ’', '’').replace(' / ', '/').replace(' ( ', '(').replace(' ) ', ')'))
+
+THRESHOLD = 88
+
+def is_injectable(sentence_pl, sequence):
+    sen = sentence_pl.split()
+    window_size = len(sequence.split())
+    maxx = 0
+    for i in range(len(sen) - window_size + 1):
+        current = rapidfuzz.fuzz.partial_ratio(' '.join(sen[i:i + window_size]), sequence)
+        if current > maxx:
+            maxx = current
+    return maxx >= THRESHOLD
+
+def inject(sentence, sequence):
+    sen = sentence.split()
+    window_size = len(sequence.split())
+    maxx = 0
+    maxxi = 0
+    for i in range(len(sen) - window_size + 1):
+        current = rapidfuzz.fuzz.partial_ratio(' '.join(sen[i:i + window_size]), sequence)
+        if current > maxx:
+            maxx = current
+            maxxi = i
+    return ' '.join(sen[:maxxi + window_size]) + ' ' \
+           + glossary.loc[lambda df: df['source_lem'] == sequence]['result'].astype(str).values.flatten()[0] \
+           + ' ' + ' '.join(sen[maxxi + window_size:])
+
+glossary = pd.read_csv('../kompendium_lem_cleaned.tsv', sep='\t', header=0, index_col=0)
+glossary['source_lem'] = [default_process(x) for x in glossary['source_lem']]
+
+start_time = time.time_ns()
+en = []
+translation_line_counts = []
+for line, line_pl in zip(file_lemmatized, file_pl_lemmatized):
+    if len(translation_line_counts) % 50000 == 0:
+        print(str(len(translation_line_counts)) + '/' + str(len(file_lemmatized)), end='\r')
+    line = default_process(line)
+    line_pl = default_process(line_pl)
+    matchez = rapidfuzz.process.extract(query=line, choices=glossary['source_lem'], limit=5, score_cutoff=THRESHOLD, scorer=partial_ratio)
+    translation_line_counts.append(len(matchez))
+    for match in matchez:
+        # if is_injectable(line_pl, match[0]):
+        en.append(inject(line, match[0]))
+
+
+stop = time.time_ns()
+timex = (stop - start_time) / 1000000000
+print(timex)
+
+tlcs = copy.deepcopy(translation_line_counts)
+
+translations = pd.read_csv(corpus_path + 'expected.tsv', sep='\t', header=None, names=['text'])
+with open(corpus_path + 'expected.tsv.injected.crossvalidated.pl', 'w') as file_pl:
+    for line, translation_line_ct in zip(translations['text'], tlcs):
+        for i in range(translation_line_ct):
+            file_pl.write(line + '\n')
+
+
+with open(corpus_path + 'in.tsv.injected.crossvalidated.en', 'w') as file_en:
+    for e in en:
+        file_en.write(e + '\n')
diff --git a/random-scripts/old/inject.py b/random-scripts/old/inject.py
new file mode 100644
index 0000000..476898c
--- /dev/null
+++ b/random-scripts/old/inject.py
@@ -0,0 +1,136 @@
+import copy
+import pandas as pd
+import spacy
+from spaczz.matcher import FuzzyMatcher
+
+# spacy.require_gpu()
+
+spacy_nlp_en = spacy.load('en_core_web_sm')
+spacy_nlp_pl = spacy.load('pl_core_news_sm')
+
+print('lemmatizing glossary')
+
+glossary = pd.read_csv('glossary.tsv', sep='\t', header=None, names=['source', 'result'])
+
+source_lemmatized = []
+for word in glossary['source']:
+    temp = []
+    for token in spacy_nlp_en(word):
+        temp.append(token.lemma_)
+    source_lemmatized.append(' '.join(temp).replace(' - ', '-').replace(' ’', '’').replace(' / ', '/').replace(' ( ', '(').replace(' ) ', ')'))
+
+result_lemmatized = []
+for word in glossary['result']:
+    temp = []
+    for token in spacy_nlp_pl(word):
+        temp.append(token.lemma_)
+    result_lemmatized.append(' '.join(temp).replace(' - ', '-').replace(' ’', '’').replace(' / ', '/').replace(' ( ', '(').replace(' ) ', ')'))
+
+glossary['source_lem'] = source_lemmatized
+glossary['result_lem'] = result_lemmatized
+glossary = glossary[['source', 'source_lem', 'result', 'result_lem']]
+glossary.set_index('source_lem')
+
+glossary.to_csv('glossary_lem.tsv', sep='\t')
+
+dev_path = 'dev-0/'
+
+print('lemmatizing corpus ' + dev_path)
+
+skip_chars = ''',./!?'''
+
+with open(dev_path + 'in.tsv', 'r') as file:
+    file_lemmatized = []
+    for line in file:
+        temp = []
+        for token in spacy_nlp_en(line):
+            temp.append(token.lemma_)
+        file_lemmatized.append(' '.join([x for x in temp if x not in skip_chars])
+                               .replace(' - ', '-').replace(' ’', '’').replace(' / ', '/').replace(' ( ', '(').replace(' ) ', ')'))
+
+with open(dev_path + 'expected.tsv', 'r') as file:
+    file_pl_lemmatized = []
+    for line in file:
+        temp = []
+        for token in spacy_nlp_pl(line):
+            temp.append(token.lemma_)
+        file_pl_lemmatized.append(' '.join([x for x in temp if x not in skip_chars])
+                                  .replace(' - ', '-').replace(' ’', '’').replace(' / ', '/').replace(' ( ', '(').replace(' ) ', ')'))
+
+# glossary
+glossary = pd.read_csv('glossary_lem.tsv', sep='\t', header=0, index_col=0)
+train_glossary = glossary.iloc[[x for x in range(len(glossary)) if x % 6 != 0]]
+
+# add rules to English matcher
+nlp = spacy.blank("en")
+matcher = FuzzyMatcher(nlp.vocab)
+for word in train_glossary['source_lem']:
+    matcher.add(word, [nlp(word)])
+
+# add rules to Polish matcher
+nlp_pl = spacy.blank("pl")
+matcher_pl = FuzzyMatcher(nlp_pl.vocab)
+for word, word_id in zip(train_glossary['result_lem'], train_glossary['source_lem']):
+    matcher_pl.add(word, [nlp_pl(word)])
+
+en = []
+translation_line_counts = []
+for line_id in range(len(file_lemmatized)):
+
+    if line_id % 100 == 0:
+        print('injecting glossary: ' + str(line_id) + "/" + str(len(file_lemmatized)), end='\r')
+
+    doc = nlp(file_lemmatized[line_id])
+    matches = matcher(doc)
+
+    line_counter = 0
+    for match_id, start, end, ratio in matches:
+        if ratio > 90:
+            doc_pl = nlp_pl(file_pl_lemmatized[line_id])
+            matches_pl = matcher_pl(doc_pl)
+
+            for match_id_pl, start_pl, end_pl, ratio_pl in matches_pl:
+                if match_id_pl == glossary[glossary['source_lem'] == match_id].values[0][3]:
+                    line_counter += 1
+                    en.append(''.join(doc[:end].text + ' ' + train_glossary.loc[lambda df: df['source_lem'] == match_id]['result'].astype(str).values.flatten() + ' ' + doc[end:].text))
+
+    if line_counter == 0:
+        line_counter = 1
+        en.append(file_lemmatized[line_id])
+    translation_line_counts.append(line_counter)
+
+print('saving files')
+tlcs = copy.deepcopy(translation_line_counts)
+
+translations = pd.read_csv(dev_path + 'expected.tsv', sep='\t', header=None, names=['text'])
+translations['id'] = [x for x in range(len(translations))]
+
+ctr = 0
+sentence = ''
+with open(dev_path + 'in.tsv.injected.crossvalidated', 'w') as file_en:
+    with open(dev_path + 'expected.tsv.injected.crossvalidated', 'w') as file_pl:
+        for i in range(len(en)):
+            if i > 0:
+                if en[i-1] != en[i]:
+                    if ctr == 0:
+                        sentence = translations.iloc[0]
+                        translations.drop(sentence['id'], inplace=True)
+                        sentence = sentence['text']
+                    try:
+                        ctr = tlcs.pop(0)
+                    except IndexError:
+                        pass
+                file_en.write(en[i])
+                file_pl.write(sentence + '\n')
+                ctr = ctr - 1
+            else:
+                try:
+                    ctr = tlcs.pop(0) - 1
+                except IndexError:
+                    pass
+                sentence = translations.iloc[0]
+                translations.drop(sentence['id'], inplace=True)
+                sentence = sentence['text']
+                file_en.write(en[i])
+                file_pl.write(sentence + '\n')
+
diff --git a/random-scripts/rapidfuzztest.ipynb b/random-scripts/rapidfuzztest.ipynb
new file mode 100644
index 0000000..caa86fb
--- /dev/null
+++ b/random-scripts/rapidfuzztest.ipynb
@@ -0,0 +1,196 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "outputs": [],
+   "source": [
+    "import pandas as pd\n",
+    "import nltk\n",
+    "from nltk.stem import WordNetLemmatizer\n",
+    "\n",
+    "\n",
+    "wl = WordNetLemmatizer()\n",
+    "\n",
+    "glossary = pd.read_csv('../kompendium.tsv', sep='\\t', header=None, names=['source', 'result'])\n",
+    "\n",
+    "source_lemmatized = []\n",
+    "for word in glossary['source']:\n",
+    "    word = nltk.word_tokenize(word)\n",
+    "    source_lemmatized.append(' '.join([wl.lemmatize(x) for x in word]))\n",
+    "\n",
+    "glossary['source_lem'] = source_lemmatized\n",
+    "glossary = glossary[['source', 'source_lem', 'result']]\n",
+    "glossary.set_index('source_lem')\n",
+    "\n"
+   ],
+   "metadata": {
+    "collapsed": false,
+    "pycharm": {
+     "name": "#%%\n",
+     "is_executing": true
+    }
+   }
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "outputs": [],
+   "source": [
+    "import time\n",
+    "\n",
+    "start_time = time.time_ns()\n",
+    "filex = []\n",
+ "with open(dev_path + '.pl', 'r') as file:\n", + " for line in file:\n", + " if len(filex) % 50000 == 0:\n", + " print(len(filex), end='\\r')\n", + " line = nltk.word_tokenize(line)\n", + " filex.append(' '.join([wl.lemmatize(x) for x in line]))\n", + "\n", + "\n", + "print(filex)\n", + "\n", + "stop = time.time_ns()\n", + "timex = (stop - start_time) / 1000000000\n", + "print(timex)\n" + ], + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%%\n", + "is_executing": true + } + } + }, + { + "cell_type": "code", + "execution_count": 23, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "78.948892319\n", + "640\n" + ] + } + ], + "source": [ + "import copy\n", + "import pandas as pd\n", + "import rapidfuzz\n", + "import time\n", + "\n", + "from rapidfuzz.fuzz import partial_ratio\n", + "from rapidfuzz.utils import default_process\n", + "\n", + "\n", + "THRESHOLD = 88\n", + "\n", + "def is_injectable(sentence_pl, sequence):\n", + " sen = sentence_pl.split()\n", + " window_size = len(sequence.split())\n", + " maxx = 0\n", + " for i in range(len(sen) - window_size):\n", + " current = rapidfuzz.fuzz.ratio(' '.join(sen[i:i + window_size]), sequence)\n", + " if current > maxx:\n", + " maxx = current\n", + " if maxx >= THRESHOLD:\n", + " return True\n", + " else:\n", + " return False\n", + "\n", + "def get_injected(sentence, sequence, inject):\n", + " sen = sentence.split()\n", + " window_size = len(sequence.split())\n", + " maxx = 0\n", + " maxxi = 0\n", + " for i in range(len(sen) - window_size + 1):\n", + " current = rapidfuzz.fuzz.ratio(' '.join(sen[i:i + window_size]), sequence)\n", + " if current >= maxx:\n", + " maxx = current\n", + " maxxi = i\n", + " return ' '.join(sen[:maxxi + window_size]) + ' ' + inject + ' ' + ' '.join(sen[maxxi + window_size:])\n", + "\n", + "glossary = pd.read_csv('../kompendium_lem_cleaned.tsv', sep='\\t', header=0, index_col=0)\n", + "glossary['source_lem'] = [' ' + str(default_process(x)) + ' ' for x in glossary['source_lem']]\n", + "\n", + "start_time = time.time_ns()\n", + "en = []\n", + "translation_line_counts = []\n", + "for line, line_pl in zip(file_lemmatized, file_pl_lemmatized):\n", + " line = default_process(line)\n", + " line_pl = default_process(line_pl)\n", + " matchez = rapidfuzz.process.extract(query=line, choices=glossary['source_lem'], limit=5, score_cutoff=THRESHOLD, scorer=partial_ratio)\n", + " if len(matchez) > 0:\n", + " translation_line_counts.append(len(matchez))\n", + " for match in matchez:\n", + " polish_translation = glossary.loc[lambda df: df['source_lem'] == match[0]]['result'].astype(str).values.flatten()[0]\n", + " if is_injectable(line_pl, polish_translation):\n", + " en.append(get_injected(line, match[0], polish_translation)[0])\n", + " else:\n", + " en.append(line)\n", + " else:\n", + " translation_line_counts.append(1)\n", + " en.append(line)\n", + "\n", + "\n", + "stop = time.time_ns()\n", + "timex = (stop - start_time) / 1000000000\n", + "print(timex)\n" + ], + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%%\n" + } + } + }, + { + "cell_type": "code", + "execution_count": 32, + "outputs": [], + "source": [ + "tlcs = copy.deepcopy(translation_line_counts)\n", + "\n", + "translations = pd.read_csv(dev_path + '.pl', sep='\\t', header=None, names=['text'])\n", + "with open(dev_path + '.injected.crossvalidated.pl', 'w') as file_pl:\n", + " for line, translation_line_ct in zip(translations, tlcs):\n", + " for i in range(translation_line_ct):\n", + " 
+    "            file_pl.write(line + '\\n')\n",
+    "\n",
+    "\n",
+    "with open(dev_path + '.injected.crossvalidated.en', 'w') as file_en:\n",
+    "    for e in en:\n",
+    "        file_en.write(e + '\\n')"
+   ],
+   "metadata": {
+    "collapsed": false,
+    "pycharm": {
+     "name": "#%%\n"
+    }
+   }
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 2
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython2",
+   "version": "2.7.6"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 0
+}
\ No newline at end of file
diff --git a/random-scripts/test.py b/random-scripts/test.py
new file mode 100644
index 0000000..92994ed
--- /dev/null
+++ b/random-scripts/test.py
@@ -0,0 +1,26 @@
+import time
+import nltk
+from nltk.stem import WordNetLemmatizer
+
+# nltk.download('omw-1.4')
+# nltk.download('punkt')
+nltk.download('wordnet')
+
+wl = WordNetLemmatizer()
+
+start_time = time.time_ns()
+filex = []
+with open('mt-summit-corpora/train/in.tsv', 'r') as file:
+    for line in file:
+        if len(filex) % 50000 == 0:
+            print(len(filex), end='\r')
+        line = nltk.word_tokenize(line)
+        filex.append(' '.join([wl.lemmatize(x) for x in line]))
+
+
+stop = time.time_ns()
+timex = (stop - start_time) / 1000000000
+print(timex)
+f = open('temp', 'w')
+for line in filex:
+    f.write(line + '\n')
diff --git a/random-scripts/training-command.txt b/random-scripts/training-command.txt
new file mode 100644
index 0000000..9f72665
--- /dev/null
+++ b/random-scripts/training-command.txt
@@ -0,0 +1,30 @@
+first iteration:
+./marian/build/marian --model mt.npz \
+--type transformer --overwrite \
+--train-sets mt-summit-corpora/mt-summit-corpora/dev/dev.en \
+mt-summit-corpora/mt-summit-corpora/dev/dev.pl \
+--disp-freq 1000 \
+--save-freq 1000 \
+--optimizer adam \
+--lr-report
+
+next iterations:
+./marian/build/marian --model mt.npz \
+--type transformer --overwrite \
+--train-sets mt-summit-corpora/mt-summit-corpora/dev/dev.en \
+mt-summit-corpora/mt-summit-corpora/dev/dev.pl \
+--disp-freq 1000 \
+--save-freq 1000 \
+--optimizer adam \
+--lr-report \
+--pretrained-model mt.npz
+
+./marian/build/marian --model mt.npz \
+--type transformer --overwrite \
+--train-sets mt-summit-corpora/mt-summit-corpora/train/train.en \
+mt-summit-corpora/mt-summit-corpora/train/train.pl \
+--disp-freq 1000 \
+--save-freq 10000 \
+--optimizer adam \
+--lr-report \
+--pretrained-model mt.npz
diff --git a/random-scripts/venv-setup.sh b/random-scripts/venv-setup.sh
new file mode 100644
index 0000000..3499ddb
--- /dev/null
+++ b/random-scripts/venv-setup.sh
@@ -0,0 +1,10 @@
+#!/bin/bash
+
+apt install python3-pip
+apt install python3-virtualenv
+virtualenv -p python3.8 gpu
+source gpu/bin/activate
+pip install pandas ipython
+pip install nltk
+python -c "import nltk; nltk.download('omw-1.4')"
+pip install rapidfuzz