From bb29472de9ef8a5281863a56e963b3d562473662 Mon Sep 17 00:00:00 2001 From: jakubknczny Date: Sun, 23 Jan 2022 16:01:44 +0100 Subject: [PATCH] ipnyb2py --- do_inject.sh | 7 ++ random-scripts/inject_rapid.py | 130 ------------------------------- random-scripts/old/inject.py | 136 --------------------------------- random-scripts/test.py | 26 ------- random-scripts/venv-setup.sh | 1 - rapidfuzztest.ipynb | 49 +++++++----- scripts/inject.py | 95 +++++++++++++++++++++++ scripts/lemmatize_glossary.py | 19 +++++ scripts/lemmatize_in.py | 23 ++++++ 9 files changed, 176 insertions(+), 310 deletions(-) create mode 100644 do_inject.sh delete mode 100644 random-scripts/inject_rapid.py delete mode 100644 random-scripts/old/inject.py delete mode 100644 random-scripts/test.py create mode 100644 scripts/inject.py create mode 100644 scripts/lemmatize_glossary.py create mode 100644 scripts/lemmatize_in.py diff --git a/do_inject.sh b/do_inject.sh new file mode 100644 index 0000000..f74cbdf --- /dev/null +++ b/do_inject.sh @@ -0,0 +1,7 @@ +#!/bin/bash + +source gpu/bin/activate + +python scripts/lemmatize_glossary.py +python scripts/lemmatize_in.py +python scripts/inject.py diff --git a/random-scripts/inject_rapid.py b/random-scripts/inject_rapid.py deleted file mode 100644 index d7a24b1..0000000 --- a/random-scripts/inject_rapid.py +++ /dev/null @@ -1,130 +0,0 @@ -import spacy -import copy -import pandas as pd -import rapidfuzz -from rapidfuzz.fuzz import partial_ratio -import time -from rapidfuzz.utils import default_process -import sys - -spacy.require_gpu() - -spacy_nlp_en = spacy.load('en_core_web_sm') -spacy_nlp_pl = spacy.load("pl_core_news_sm") - - -def read_arguments(): - try: - corpus_path, glossary_path = sys.argv - return corpus_path, glossary_path - except: - print("ERROR: Wrong argument amount.") - sys.exit(1) - - - -glossary = pd.read_csv('mt-summit-corpora/glossary.tsv', sep='\t', header=None, names=['source', 'result']) - -source_lemmatized = [] -for word in glossary['source']: - temp = [] - for token in spacy_nlp_en(word): - temp.append(token.lemma_) - source_lemmatized.append(' '.join(temp).replace(' - ', '-').replace(' ’', '’').replace(' / ', '/').replace(' ( ', '(').replace(' ) ', ')')) - -result_lemmatized = [] -for word in glossary['result']: - temp = [] - for token in spacy_nlp_pl(word): - temp.append(token.lemma_) - result_lemmatized.append(' '.join(temp).replace(' - ', '-').replace(' ’', '’').replace(' / ', '/').replace(' ( ', '(').replace(' ) ', ')')) - -glossary['source_lem'] = source_lemmatized -glossary['result_lem'] = result_lemmatized -glossary = glossary[['source', 'source_lem', 'result', 'result_lem']] -glossary.to_csv('kompendium_lem.tsv', sep='\t') - -corpus_path = 'mt-summit-corpora/train/' - -skip_chars = ''',./!?''' - -with open(corpus_path + 'in.tsv', 'r') as file: - file_lemmatized = [] - for line in file: - if len(file_lemmatized) % 10000 == 0: - print(len(file_lemmatized), end='\r') - temp = [] - for token in spacy_nlp_en(line): - temp.append(token.lemma_) - file_lemmatized.append(' '.join([x for x in temp if x not in skip_chars]).replace(' - ', '-').replace(' ’', '’').replace(' / ', '/').replace(' ( ', '(').replace(' ) ', ')')) - -with open(corpus_path + 'expected.tsv', 'r') as file: - file_pl_lemmatized = [] - for line in file: - if len(file_pl_lemmatized) % 10000 == 0: - print(len(file_lemmatized), end='\r') - temp = [] - for token in spacy_nlp_pl(line): - temp.append(token.lemma_) - file_pl_lemmatized.append(' '.join([x for x in temp if x not in 
skip_chars]).replace(' - ', '-').replace(' ’', '’').replace(' / ', '/').replace(' ( ', '(').replace(' ) ', ')')) - -THRESHOLD = 88 - -def is_injectable(sentence_pl, sequence): - sen = sentence_pl.split() - window_size = len(sequence.split()) - maxx = 0 - for i in range(len(sen) - window_size): - current = rapidfuzz.fuzz.partial_ratio(' '.join(sen[i:i + window_size]), sequence) - if current > maxx: - maxx = current - return maxx - -def inject(sentence, sequence): - sen = sentence.split() - window_size = len(sequence.split()) - maxx = 0 - maxxi = 0 - for i in range(len(sen) - window_size): - current = rapidfuzz.fuzz.partial_ratio(' '.join(sen[i:i + window_size]), sequence) - if current > maxx: - maxx = current - maxxi = i - return ' '.join(sen[:maxxi + window_size]) + ' ' \ - + glossary.loc[lambda df: df['source_lem'] == sequence]['result'].astype(str).values.flatten() \ - + ' ' + ' '.join(sen[maxxi + window_size:]) - -glossary = pd.read_csv('../kompendium_lem_cleaned.tsv', sep='\t', header=0, index_col=0) -glossary['source_lem'] = [default_process(x) for x in glossary['source_lem']] - -start_time = time.time_ns() -en = [] -translation_line_counts = [] -for line, line_pl in zip(file_lemmatized, file_pl_lemmatized): - if len(translation_line_counts) % 50000 == 0: - print(str(len(translation_line_counts)) + '/' + str(len(file_lemmatized), end='\r')) - line = default_process(line) - line_pl = default_process(line_pl) - matchez = rapidfuzz.process.extract(query=line, choices=glossary['source_lem'], limit=5, score_cutoff=THRESHOLD, scorer=partial_ratio) - translation_line_counts.append(len(matchez)) - for match in matchez: - # if is_injectable(line_pl, match[0]): - en.append(inject(line, match[0])[0]) - - -stop = time.time_ns() -timex = (stop - start_time) / 1000000000 -print(timex) - -tlcs = copy.deepcopy(translation_line_counts) - -translations = pd.read_csv(corpus_path + 'expected.tsv', sep='\t', header=None, names=['text']) -with open(corpus_path + 'extected.tsv.injected.crossvalidated.pl', 'w') as file_pl: - for line, translation_line_ct in zip(translations, tlcs): - for i in range(translation_line_ct): - file_pl.write(line) - - -with open(corpus_path + 'in.tsv.injected.crossvalidated.en', 'w') as file_en: - for e in en: - file_en.write(e + '\n') diff --git a/random-scripts/old/inject.py b/random-scripts/old/inject.py deleted file mode 100644 index 476898c..0000000 --- a/random-scripts/old/inject.py +++ /dev/null @@ -1,136 +0,0 @@ -import copy -import pandas as pd -import spacy -from spaczz.matcher import FuzzyMatcher - -# spacy.require_gpu() - -spacy_nlp_en = spacy.load('en_core_web_sm') -spacy_nlp_pl = spacy.load('pl_core_news_sm') - -print('lemmatizing glossary') - -glossary = pd.read_csv('glossary.tsv', sep='\t', header=None, names=['source', 'result']) - -source_lemmatized = [] -for word in glossary['source']: - temp = [] - for token in spacy_nlp_en(word): - temp.append(token.lemma_) - source_lemmatized.append(' '.join(temp).replace(' - ', '-').replace(' ’', '’').replace(' / ', '/').replace(' ( ', '(').replace(' ) ', ')')) - -result_lemmatized = [] -for word in glossary['result']: - temp = [] - for token in spacy_nlp_pl(word): - temp.append(token.lemma_) - result_lemmatized.append(' '.join(temp).replace(' - ', '-').replace(' ’', '’').replace(' / ', '/').replace(' ( ', '(').replace(' ) ', ')')) - -glossary['source_lem'] = source_lemmatized -glossary['result_lem'] = result_lemmatized -glossary = glossary[['source', 'source_lem', 'result', 'result_lem']] -glossary.set_index('source_lem') - 
-glossary.to_csv('glossary_lem.tsv', sep='\t') - -dev_path = 'dev-0/' - -print('lemmatizing corpus ' + dev_path) - -skip_chars = ''',./!?''' - -with open(dev_path + 'in.tsv', 'r') as file: - file_lemmatized = [] - for line in file: - temp = [] - for token in spacy_nlp_en(line): - temp.append(token.lemma_) - file_lemmatized.append(' '.join([x for x in temp if x not in skip_chars]) - .replace(' - ', '-').replace(' ’', '’').replace(' / ', '/').replace(' ( ', '(').replace(' ) ', ')')) - -with open(dev_path + 'expected.tsv', 'r') as file: - file_pl_lemmatized = [] - for line in file: - temp = [] - for token in spacy_nlp_pl(line): - temp.append(token.lemma_) - file_pl_lemmatized.append(' '.join([x for x in temp if x not in skip_chars]) - .replace(' - ', '-').replace(' ’', '’').replace(' / ', '/').replace(' ( ', '(').replace(' ) ', ')')) - -# glossary -glossary = pd.read_csv('glossary_lem.tsv', sep='\t', header=0, index_col=0) -train_glossary = glossary.iloc[[x for x in range(len(glossary)) if x % 6 != 0]] - -# add rules to English matcher -nlp = spacy.blank("en") -matcher = FuzzyMatcher(nlp.vocab) -for word in train_glossary['source_lem']: - matcher.add(word, [nlp(word)]) - -# add rules to Polish matcher -nlp_pl = spacy.blank("pl") -matcher_pl = FuzzyMatcher(nlp_pl.vocab) -for word, word_id in zip(train_glossary['result_lem'], train_glossary['source_lem']): - matcher_pl.add(word, [nlp_pl(word)]) - -en = [] -translation_line_counts = [] -for line_id in range(len(file_lemmatized)): - - if line_id % 100 == 0: - print('injecting glossary: ' + str(line_id) + "/" + str(len(file_lemmatized)), end='\r') - - doc = nlp(file_lemmatized[line_id]) - matches = matcher(doc) - - line_counter = 0 - for match_id, start, end, ratio in matches: - if ratio > 90: - doc_pl = nlp_pl(file_pl_lemmatized[line_id]) - matches_pl = matcher_pl(doc_pl) - - for match_id_pl, start_pl, end_pl, ratio_pl in matches_pl: - if match_id_pl == glossary[glossary['source_lem'] == match_id].values[0][3]: - line_counter += 1 - en.append(''.join(doc[:end].text + ' ' + train_glossary.loc[lambda df: df['source_lem'] == match_id]['result'].astype(str).values.flatten() + ' ' + doc[end:].text)) - - if line_counter == 0: - line_counter = 1 - en.append(file_lemmatized[line_id]) - translation_line_counts.append(line_counter) - -print('saving files') -tlcs = copy.deepcopy(translation_line_counts) - -translations = pd.read_csv(dev_path + 'expected.tsv', sep='\t', header=None, names=['text']) -translations['id'] = [x for x in range(len(translations))] - -ctr = 0 -sentence = '' -with open(dev_path + 'in.tsv.injected.crossvalidated', 'w') as file_en: - with open(dev_path + 'expected.tsv.injected.crossvalidated', 'w') as file_pl: - for i in range(len(en)): - if i > 0: - if en[i-1] != en[i]: - if ctr == 0: - sentence = translations.iloc[0] - translations.drop(sentence['id'], inplace=True) - sentence = sentence['text'] - try: - ctr = tlcs.pop(0) - except: - pass - file_en.write(en[i]) - file_pl.write(sentence + '\n') - ctr = ctr - 1 - else: - try: - ctr = tlcs.pop(0) - 1 - except: - pass - sentence = translations.iloc[0] - translations.drop(sentence['id'], inplace=True) - sentence = sentence['text'] - file_en.write(en[i]) - file_pl.write(sentence + '\n') - diff --git a/random-scripts/test.py b/random-scripts/test.py deleted file mode 100644 index 92994ed..0000000 --- a/random-scripts/test.py +++ /dev/null @@ -1,26 +0,0 @@ -import time -import nltk -from nltk.stem import WordNetLemmatizer - -# nltk.download('omw-1.4') -# nltk.download('punkt') 
-nltk.download('wordnet') - -wl = WordNetLemmatizer() - -start_time = time.time_ns() -filex = [] -with open('mt-summit-corpora/train/in.tsv', 'r') as file: - for line in file: - if len(filex) % 50000 == 0: - print(len(filex), end='\r') - line = nltk.word_tokenize(line) - filex.append(' '.join([wl.lemmatize(x) for x in line])) - - -stop = time.time_ns() -timex = (stop - start_time) / 1000000000 -print(timex) -f = open('temp', 'w') -for line in filex: - f.write(line + '\n') diff --git a/random-scripts/venv-setup.sh b/random-scripts/venv-setup.sh index 3499ddb..de0d0bf 100644 --- a/random-scripts/venv-setup.sh +++ b/random-scripts/venv-setup.sh @@ -6,5 +6,4 @@ virtualenv -p python3.8 gpu source gpu/bin/activate pip install pandas ipython pip install nltk -python "nltk.download('omw-1.4')" pip install rapidfuzz diff --git a/rapidfuzztest.ipynb b/rapidfuzztest.ipynb index 4e98ef0..99b117f 100644 --- a/rapidfuzztest.ipynb +++ b/rapidfuzztest.ipynb @@ -2,8 +2,18 @@ "cells": [ { "cell_type": "code", - "execution_count": null, - "outputs": [], + "execution_count": 2, + "outputs": [ + { + "data": { + "text/plain": " source \\\nsource_lem \naaofi aaofi \naca aca \nacca acca \nabacus abacus \nabandonment cost abandonment costs \n... ... \nytd ytd \nyear-end year-end \nyear-to-date year-to-date \nzog zog \nzero overhead growth zero overhead growth \n\n result \nsource_lem \naaofi organizacja rachunkowości i audytu dla islamsk... \naca członek stowarzyszenia dyplomowanych biegłych ... \nacca stowarzyszenie dyplomowanych biegłych rewidentów \nabacus liczydło \nabandonment cost koszty zaniechania \n... ... \nytd od początku roku \nyear-end koniec roku \nyear-to-date od początku roku \nzog zero wzrostu kosztów ogólnych \nzero overhead growth zero wzrostu kosztów ogólnych \n\n[1197 rows x 2 columns]", + "text/html": "
" + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "\n", "import nltk\n", @@ -15,6 +25,8 @@ "from rapidfuzz.fuzz import partial_ratio\n", "from rapidfuzz.utils import default_process\n", "\n", + "nltk.download('wordnet')\n", + "\n", "\n", "wl = WordNetLemmatizer()\n", "\n", @@ -33,20 +45,19 @@ "metadata": { "collapsed": false, "pycharm": { - "name": "#%%\n", - "is_executing": true + "name": "#%%\n" } } }, { "cell_type": "code", - "execution_count": 36, + "execution_count": 12, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "0.194806501\n" + "0.187306436\n" ] } ], @@ -80,13 +91,13 @@ }, { "cell_type": "code", - "execution_count": 64, + "execution_count": 19, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "6.915366953\n" + "6.592824061\n" ] } ], @@ -94,6 +105,7 @@ "\n", "THRESHOLD = 70\n", "\n", + "\n", "def is_injectable(sentence_pl, sequence):\n", " sen = sentence_pl.split()\n", " window_size = len(sequence.split())\n", @@ -102,24 +114,24 @@ " current = rapidfuzz.fuzz.ratio(' '.join(sen[i:i + window_size]), sequence)\n", " if current > maxx:\n", " maxx = current\n", - " if maxx >= THRESHOLD:\n", - " return True\n", - " else:\n", - " return False\n", + " return maxx >= THRESHOLD\n", "\n", "def get_injected(sentence, sentence_en, sequence, inject):\n", " sen = sentence.split()\n", " sen_en = sentence_en.split()\n", " window_size = len(sequence.split())\n", " maxx = 0\n", + " maxx_prv = 0\n", " maxxi = 0\n", " for i in range(len(sen) - window_size + 1):\n", " current = rapidfuzz.fuzz.ratio(' '.join(sen[i:i + window_size]), sequence)\n", " if current >= maxx:\n", + " maxx_prv = maxx\n", " maxx = current\n", " maxxi = i\n", - " temp = ' '.join(sen_en[:maxxi + window_size]) + ' $' + inject + '$ ' + ' '.join(sen_en[maxxi + window_size:])\n", - " return temp\n", + " if maxx_prv != maxx:\n", + " return ' '.join(sen_en[:maxxi + window_size]) + ' $' + inject + '$ ' + ' '.join(sen_en[maxxi + window_size:])\n", + " return sentence_en\n", "\n", "glossary['source_lem'] = [str(default_process(x)) for x in glossary['source_lem']]\n", "file_pl = pd.read_csv(train_expected_path, sep='\\t', header=None, names=['text'])\n", @@ -162,14 +174,17 @@ }, { "cell_type": "code", - "execution_count": 65, + "execution_count": 21, "outputs": [], "source": [ + "\n", + "def full_strip(line):\n", + " return ' '.join(line.split())\n", "\n", "with open(train_expected_path + '.injected', 'w') as file_pl_write:\n", " for line, translation_line_ct in zip(file_pl['text'].values.tolist(), translation_line_counts):\n", " for i in range(translation_line_ct):\n", - " file_pl_write.write(line + '\\n')\n", + " file_pl_write.write(full_strip(line) + '\\n')\n", "\n", "\n", "with open(train_in_path + '.injected', 'w') as file_en_write:\n", @@ -185,7 +200,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 16, "outputs": [], "source": [], "metadata": { diff --git a/scripts/inject.py b/scripts/inject.py new file mode 100644 index 0000000..f118357 --- /dev/null +++ b/scripts/inject.py @@ -0,0 +1,95 @@ +import pandas as pd +import rapidfuzz + +from rapidfuzz.fuzz import partial_ratio +from rapidfuzz.utils import default_process + + +def full_strip(line): + return ' '.join(line.split()) + + +def is_injectable(sentence_pl, sequence): + sen = sentence_pl.split() + window_size = len(sequence.split()) + maxx = 0 + for i in range(len(sen) - window_size + 1): + current = rapidfuzz.fuzz.ratio(' '.join(sen[i:i + window_size]), 
sequence) + if current > maxx: + maxx = current + return maxx >= THRESHOLD + + +def get_injected(sentence, sentence_en, sequence, inject): + sen = sentence.split() + sen_en = sentence_en.split() + window_size = len(sequence.split()) + maxx = 0 + maxx_prv = 0 + maxxi = 0 + for i in range(len(sen) - window_size + 1): + current = rapidfuzz.fuzz.ratio(' '.join(sen[i:i + window_size]), sequence) + if current >= maxx: + maxx_prv = maxx + maxx = current + maxxi = i + if maxx_prv != maxx: + return ' '.join(sen_en[:maxxi + window_size]) + ' $' + inject + '$ ' + ' '.join(sen_en[maxxi + window_size:]) + return sentence_en + + +THRESHOLD = 70 + +# train_in_path = 'mt-summit-corpora/train/in.tsv' +# train_expected_path = 'mt-summit-corpora/train/expected.tsv' + +train_in_path = 'mt-summit-corpora/dev-0/in.tsv' +train_expected_path = 'mt-summit-corpora/dev-0/expected.tsv' + +glossary = pd.read_csv('mt-summit-corpora/glossary_lem.tsv', sep='\t') +glossary['source_lem'] = [str(default_process(x)) for x in glossary['source_lem']] + +file_pl = pd.read_csv(train_expected_path, sep='\t', header=None, names=['text']) +file_pl['text'] = [default_process(text) for text in file_pl['text'].values.tolist()] +file_pl = file_pl['text'].values.tolist() + +file_en = pd.read_csv(train_in_path, sep='\t', header=None, names=['text']) +file_en['text'] = [default_process(text) for text in file_en['text'].values.tolist()] +file_en = file_en['text'].values.tolist() + +file_en_lemmatized = pd.read_csv(train_in_path + '.lemmatized', sep='\t', header=None, names=['text']) +file_en_lemmatized['text'] = [default_process(text) for text in file_en_lemmatized['text'].values.tolist()] +file_en_lemmatized = file_en_lemmatized['text'].values.tolist() + +en = [] +translation_line_counts = [] +for line, line_en, line_pl in zip(file_en_lemmatized, file_en, file_pl): + line = default_process(line) + matchez = rapidfuzz.process.extract( + query=line, choices=glossary['source_lem'], limit=5, score_cutoff=THRESHOLD, scorer=partial_ratio) + if len(matchez) > 0: + lines_added = 0 + for match in matchez: + polish_translation = \ + glossary.loc[lambda df: df['source_lem'] == match[0]]['result'].astype(str).values.flatten()[0] + if is_injectable(line_pl, polish_translation): + en.append(get_injected(line, line_en, match[0], polish_translation)) + lines_added += 1 + if lines_added == 0: + en.append(line_en) + lines_added = 1 + translation_line_counts.append(lines_added) + else: + translation_line_counts.append(1) + en.append(line_en) + + +with open(train_expected_path + '.injected', 'w') as file_pl_write: + for line, translation_line_ct in zip(file_pl, translation_line_counts): + for i in range(translation_line_ct): + file_pl_write.write(full_strip(line) + '\n') + + +with open(train_in_path + '.injected', 'w') as file_en_write: + for e in en: + file_en_write.write(e + '\n') diff --git a/scripts/lemmatize_glossary.py b/scripts/lemmatize_glossary.py new file mode 100644 index 0000000..a62cb5f --- /dev/null +++ b/scripts/lemmatize_glossary.py @@ -0,0 +1,19 @@ +import nltk +import pandas as pd + +from nltk.stem import WordNetLemmatizer + +nltk.download('wordnet') + +wl = WordNetLemmatizer() +glossary = pd.read_csv('mt-summit-corpora/glossary.tsv', sep='\t', header=None, names=['source', 'result']) +source_lemmatized = [] +for word in glossary['source']: + word = nltk.word_tokenize(word) + source_lemmatized.append(' '.join([wl.lemmatize(x) for x in word])) + +glossary['source_lem'] = source_lemmatized +glossary = glossary[['source', 'source_lem', 
'result']] +glossary.set_index('source_lem') + +glossary.to_csv('mt-summit-corpora/glossary_lem.tsv', sep='\t', index=False) diff --git a/scripts/lemmatize_in.py b/scripts/lemmatize_in.py new file mode 100644 index 0000000..190d7c6 --- /dev/null +++ b/scripts/lemmatize_in.py @@ -0,0 +1,23 @@ +import nltk +from nltk.stem import WordNetLemmatizer + + +wl = WordNetLemmatizer() + +# train_in_path = 'mt-summit-corpora/train/in.tsv' +# train_expected_path = 'mt-summit-corpora/train/expected.tsv' + +train_in_path = 'mt-summit-corpora/dev-0/in.tsv' +train_expected_path = 'mt-summit-corpora/dev-0/expected.tsv' + +file_lemmatized = [] +with open(train_in_path, 'r') as file: + for line in file: + if len(file_lemmatized) % 50000 == 0: + print('lemmatizing file: ' + train_in_path + ': ' + str(len(file_lemmatized)), end='\r') + line = nltk.word_tokenize(line) + file_lemmatized.append(' '.join([wl.lemmatize(x) for x in line])) + +with open(train_in_path + '.lemmatized', 'w') as file_write: + for line in file_lemmatized: + file_write.write(line + '\n')