add rapidfuzz

jakubknczny 2022-01-18 10:27:53 +01:00
parent cb4a413644
commit 05e1a10139
7 changed files with 13372 additions and 39 deletions

136
inject.py Normal file
View File

@ -0,0 +1,136 @@
import copy
import pandas as pd
import spacy
from spaczz.matcher import FuzzyMatcher

# spacy.require_gpu()

spacy_nlp_en = spacy.load('en_core_web_sm')
spacy_nlp_pl = spacy.load('pl_core_news_sm')

print('lemmatizing glossary')

glossary = pd.read_csv('glossary.tsv', sep='\t', header=None, names=['source', 'result'])

source_lemmatized = []
for word in glossary['source']:
    temp = []
    for token in spacy_nlp_en(word):
        temp.append(token.lemma_)
    source_lemmatized.append(' '.join(temp).replace(' - ', '-').replace('  ', ' ').replace(' / ', '/').replace(' ( ', '(').replace(' ) ', ')'))

result_lemmatized = []
for word in glossary['result']:
    temp = []
    for token in spacy_nlp_pl(word):
        temp.append(token.lemma_)
    result_lemmatized.append(' '.join(temp).replace(' - ', '-').replace('  ', ' ').replace(' / ', '/').replace(' ( ', '(').replace(' ) ', ')'))

glossary['source_lem'] = source_lemmatized
glossary['result_lem'] = result_lemmatized
glossary = glossary[['source', 'source_lem', 'result', 'result_lem']]
glossary.set_index('source_lem')

glossary.to_csv('glossary_lem.tsv', sep='\t')

dev_path = 'dev-0/'

print('lemmatizing corpus ' + dev_path)

skip_chars = ''',./!?'''

with open(dev_path + 'in.tsv', 'r') as file:
    file_lemmatized = []
    for line in file:
        temp = []
        for token in spacy_nlp_en(line):
            temp.append(token.lemma_)
        file_lemmatized.append(' '.join([x for x in temp if x not in skip_chars])
                               .replace(' - ', '-').replace('  ', ' ').replace(' / ', '/').replace(' ( ', '(').replace(' ) ', ')'))

with open(dev_path + 'expected.tsv', 'r') as file:
    file_pl_lemmatized = []
    for line in file:
        temp = []
        for token in spacy_nlp_pl(line):
            temp.append(token.lemma_)
        file_pl_lemmatized.append(' '.join([x for x in temp if x not in skip_chars])
                                  .replace(' - ', '-').replace('  ', ' ').replace(' / ', '/').replace(' ( ', '(').replace(' ) ', ')'))

# glossary
glossary = pd.read_csv('glossary_lem.tsv', sep='\t', header=0, index_col=0)
train_glossary = glossary.iloc[[x for x in range(len(glossary)) if x % 6 != 0]]

# add rules to English matcher
nlp = spacy.blank("en")
matcher = FuzzyMatcher(nlp.vocab)
for word in train_glossary['source_lem']:
    matcher.add(word, [nlp(word)])

# add rules to Polish matcher
nlp_pl = spacy.blank("pl")
matcher_pl = FuzzyMatcher(nlp_pl.vocab)
for word, word_id in zip(train_glossary['result_lem'], train_glossary['source_lem']):
    matcher_pl.add(word, [nlp_pl(word)])

en = []
translation_line_counts = []
for line_id in range(len(file_lemmatized)):
    if line_id % 100 == 0:
        print('injecting glossary: ' + str(line_id) + "/" + str(len(file_lemmatized)), end='\r')

    doc = nlp(file_lemmatized[line_id])
    matches = matcher(doc)

    # count how many injected variants of this source line are produced;
    # a line with no accepted match is passed through unchanged
    line_counter = 0
    for match_id, start, end, ratio in matches:
        if ratio > 90:
            doc_pl = nlp_pl(file_pl_lemmatized[line_id])
            matches_pl = matcher_pl(doc_pl)

            for match_id_pl, start_pl, end_pl, ratio_pl in matches_pl:
                if match_id_pl == glossary[glossary['source_lem'] == match_id].values[0][3]:
                    line_counter += 1
                    en.append(''.join(doc[:end].text + ' ' + train_glossary.loc[lambda df: df['source_lem'] == match_id]['result'].astype(str).values.flatten() + ' ' + doc[end:].text))

    if line_counter == 0:
        line_counter = 1
        en.append(file_lemmatized[line_id])
    translation_line_counts.append(line_counter)

print('saving files')

tlcs = copy.deepcopy(translation_line_counts)

translations = pd.read_csv(dev_path + 'expected.tsv', sep='\t', header=None, names=['text'])
translations['id'] = [x for x in range(len(translations))]

# write the injected English lines and repeat each Polish translation once per
# injected variant of its source line, so the two output files stay aligned
ctr = 0
sentence = ''
with open(dev_path + 'in.tsv.injected.crossvalidated', 'w') as file_en:
    with open(dev_path + 'expected.tsv.injected.crossvalidated', 'w') as file_pl:
        for i in range(len(en)):
            if i > 0:
                if en[i-1] != en[i]:
                    if ctr == 0:
                        sentence = translations.iloc[0]
                        translations.drop(sentence['id'], inplace=True)
                        sentence = sentence['text']
                        try:
                            ctr = tlcs.pop(0)
                        except:
                            pass
                file_en.write(en[i])
                file_pl.write(sentence + '\n')
                ctr = ctr - 1
            else:
                try:
                    ctr = tlcs.pop(0) - 1
                except:
                    pass
                sentence = translations.iloc[0]
                translations.drop(sentence['id'], inplace=True)
                sentence = sentence['text']
                file_en.write(en[i])
                file_pl.write(sentence + '\n')

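The least obvious part of inject.py is the final writer loop: translation_line_counts records how many injected English variants each source line produced, and the Polish translation for that line has to be repeated the same number of times so the two .injected.crossvalidated files stay parallel. A minimal sketch of that bookkeeping on made-up data (names like variants_per_line are illustrative, not from the script, and the ctr/tlcs control flow is collapsed into a plain repeat):

# Toy illustration of the alignment idea behind translation_line_counts.
# english_variants holds the injected English lines in order; variants_per_line[i]
# says how many of them came from source line i; each Polish line is repeated
# that many times so both output files stay line-aligned.
english_variants = [
    'abandonment cost be high koszty zaniechania',  # two variants of source line 0
    'abandonment cost be high koszt zaniechanie',
    'the abacus be old',                            # source line 1, kept unchanged
]
variants_per_line = [2, 1]
polish_lines = ['koszty zaniechania są wysokie', 'liczydło jest stare']

aligned_pl = []
for pl_line, count in zip(polish_lines, variants_per_line):
    aligned_pl.extend([pl_line] * count)

assert len(aligned_pl) == len(english_variants)
for en_line, pl_line in zip(english_variants, aligned_pl):
    print(en_line + '\t' + pl_line)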
124
inject_rapid.py Normal file
View File

@ -0,0 +1,124 @@
import copy
import os
import sys
import time

import pandas as pd
import rapidfuzz
import spacy
from rapidfuzz.fuzz import partial_ratio
from rapidfuzz.utils import default_process

spacy.require_gpu()

spacy_nlp_en = spacy.load('en_core_web_sm')
spacy_nlp_pl = spacy.load("pl_core_news_sm")


def read_arguments():
    try:
        # skip argv[0] (the script name) and expect exactly two arguments
        corpus_path, glossary_path = sys.argv[1:]
        return corpus_path, glossary_path
    except:
        print("ERROR: Wrong argument amount.")
        sys.exit(1)


glossary = pd.read_csv('~/mt-summit-corpora/glossary.tsv', sep='\t', header=None, names=['source', 'result'])

source_lemmatized = []
for word in glossary['source']:
    temp = []
    for token in spacy_nlp_en(word):
        temp.append(token.lemma_)
    source_lemmatized.append(' '.join(temp).replace(' - ', '-').replace('  ', ' ').replace(' / ', '/').replace(' ( ', '(').replace(' ) ', ')'))

result_lemmatized = []
for word in glossary['result']:
    temp = []
    for token in spacy_nlp_pl(word):
        temp.append(token.lemma_)
    result_lemmatized.append(' '.join(temp).replace(' - ', '-').replace('  ', ' ').replace(' / ', '/').replace(' ( ', '(').replace(' ) ', ')'))

glossary['source_lem'] = source_lemmatized
glossary['result_lem'] = result_lemmatized
glossary = glossary[['source', 'source_lem', 'result', 'result_lem']]

corpus_path = os.path.expanduser('~/mt-summit-corpora/train/')

skip_chars = ''',./!?'''

with open(corpus_path + 'in.tsv', 'r') as file:
    file_lemmatized = []
    for line in file:
        temp = []
        for token in spacy_nlp_en(line):
            temp.append(token.lemma_)
        file_lemmatized.append(' '.join([x for x in temp if x not in skip_chars]).replace(' - ', '-').replace('  ', ' ').replace(' / ', '/').replace(' ( ', '(').replace(' ) ', ')'))

with open(corpus_path + 'expected.tsv', 'r') as file:
    file_pl_lemmatized = []
    for line in file:
        temp = []
        for token in spacy_nlp_pl(line):
            temp.append(token.lemma_)
        file_pl_lemmatized.append(' '.join([x for x in temp if x not in skip_chars]).replace(' - ', '-').replace('  ', ' ').replace(' / ', '/').replace(' ( ', '(').replace(' ) ', ')'))

THRESHOLD = 88


def is_injectable(sentence_pl, sequence):
    # best partial_ratio between the glossary translation and any window of the
    # same length in the Polish sentence
    sen = sentence_pl.split()
    window_size = len(sequence.split())
    maxx = 0
    for i in range(len(sen) - window_size):
        current = rapidfuzz.fuzz.partial_ratio(' '.join(sen[i:i + window_size]), sequence)
        if current > maxx:
            maxx = current
    return maxx


def inject(sentence, sequence):
    # find the window that matches the glossary term best and append the
    # glossary translation right after it
    sen = sentence.split()
    window_size = len(sequence.split())
    maxx = 0
    maxxi = 0
    for i in range(len(sen) - window_size):
        current = rapidfuzz.fuzz.partial_ratio(' '.join(sen[i:i + window_size]), sequence)
        if current > maxx:
            maxx = current
            maxxi = i
    return ' '.join(sen[:maxxi + window_size]) + ' ' \
        + glossary.loc[lambda df: df['source_lem'] == sequence]['result'].astype(str).values.flatten() \
        + ' ' + ' '.join(sen[maxxi + window_size:])


glossary = pd.read_csv('kompendium_lem_cleaned.tsv', sep='\t', header=0, index_col=0)
glossary['source_lem'] = [default_process(x) for x in glossary['source_lem']]

start_time = time.time_ns()
en = []
translation_line_counts = []
for line, line_pl in zip(file_lemmatized, file_pl_lemmatized):
    line = default_process(line)
    line_pl = default_process(line_pl)

    matchez = rapidfuzz.process.extract(query=line, choices=glossary['source_lem'], limit=5, score_cutoff=THRESHOLD, scorer=partial_ratio)
    translation_line_counts.append(len(matchez))
    for match in matchez:
        # if is_injectable(line_pl, match[0]):
        en.append(inject(line, match[0])[0])

stop = time.time_ns()
timex = (stop - start_time) / 1000000000
print(timex)

tlcs = copy.deepcopy(translation_line_counts)

translations = pd.read_csv(corpus_path + 'expected.tsv', sep='\t', header=None, names=['text'])
# repeat each Polish line once per injected English variant so the files align
with open(corpus_path + 'extected.tsv.injected.crossvalidated.pl', 'w') as file_pl:
    for line, translation_line_ct in zip(translations['text'], tlcs):
        for i in range(translation_line_ct):
            file_pl.write(line + '\n')

with open(corpus_path + 'in.tsv.injected.crossvalidated.en', 'w') as file_en:
    for e in en:
        file_en.write(e + '\n')

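As a quick reference for the rapidfuzz calls inject_rapid.py relies on, here is a small self-contained sketch: default_process normalises both sides, partial_ratio scores the best local alignment, and process.extract returns the glossary candidates above the score cutoff. The glossary entries and the sentence are made up, and the (match, score, index) tuple layout shown is what current rapidfuzz releases return for list choices; older versions may differ slightly.

from rapidfuzz import process
from rapidfuzz.fuzz import partial_ratio
from rapidfuzz.utils import default_process

# made-up lemmatized glossary: English source_lem -> Polish translation
glossary = {
    'abandonment cost': 'koszty zaniechania',
    'year-end': 'koniec roku',
}
terms = list(glossary.keys())
choices = [default_process(t) for t in terms]

sentence = default_process('The abandonment costs were reported at year end.')

# pull up to 5 glossary terms whose best local match in the sentence scores
# at least 88, mirroring THRESHOLD and the extract() call in inject_rapid.py
matches = process.extract(query=sentence, choices=choices,
                          scorer=partial_ratio, limit=5, score_cutoff=88)

for matched_text, score, idx in matches:
    print(f'{terms[idx]!r} scored {score:.1f} -> inject {glossary[terms[idx]]!r}')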
View File

@ -15,19 +15,21 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 1, "execution_count": 2,
"outputs": [ "outputs": [
{ {
"data": { "data": {
"text/plain": " source \\\nsource_lem \naaofi aaofi \naca aca \nacca acca \nabacus abacus \nabandonment cost abandonment costs \n... ... \nytd ytd \nyear-end year-end \nyear-to-date year-to-date \nzog zog \nzero overhead growth zero overhead growth \n\n result \\\nsource_lem \naaofi organizacja rachunkowości i audytu dla islamsk... \naca członek stowarzyszenia dyplomowanych biegłych ... \nacca stowarzyszenie dyplomowanych biegłych rewidentów \nabacus liczydło \nabandonment cost koszty zaniechania \n... ... \nytd od początku roku \nyear-end koniec roku \nyear-to-date od początku roku \nzog zero wzrostu kosztów ogólnych \nzero overhead growth zero wzrostu kosztów ogólnych \n\n result_lem \nsource_lem \naaofi organizacja rachunkowość i audyt dla islamski ... \naca członek stowarzyszenie dyplomowany biegły rewi... \nacca stowarzyszenie dyplomowany biegły rewident \nabacus liczydło \nabandonment cost koszt zaniechanie \n... ... \nytd od początek rok \nyear-end koniec rok \nyear-to-date od początek rok \nzog zero wzrost koszt ogólny \nzero overhead growth zero wzrost koszt ogólny \n\n[1197 rows x 3 columns]", "text/plain": " source \\\nsource_lem \naaofi aaofi \naca aca \nacca acca \nabacus abacus \nabandonment cost abandonment costs \n... ... \nytd ytd \nyear-end year-end \nyear-to-date year-to-date \nzog zog \nzero overhead growth zero overhead growth \n\n result \\\nsource_lem \naaofi organizacja rachunkowości i audytu dla islamsk... \naca członek stowarzyszenia dyplomowanych biegłych ... \nacca stowarzyszenie dyplomowanych biegłych rewidentów \nabacus liczydło \nabandonment cost koszty zaniechania \n... ... \nytd od początku roku \nyear-end koniec roku \nyear-to-date od początku roku \nzog zero wzrostu kosztów ogólnych \nzero overhead growth zero wzrostu kosztów ogólnych \n\n result_lem \nsource_lem \naaofi organizacja rachunkowość i audyt dla islamski ... \naca członek stowarzyszenie dyplomowany biegły rewi... \nacca stowarzyszenie dyplomowany biegły rewident \nabacus liczydło \nabandonment cost koszt zaniechanie \n... ... \nytd od początek rok \nyear-end koniec rok \nyear-to-date od początek rok \nzog zero wzrost koszt ogólny \nzero overhead growth zero wzrost koszt ogólny \n\n[1197 rows x 3 columns]",
"text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>source</th>\n <th>result</th>\n <th>result_lem</th>\n </tr>\n <tr>\n <th>source_lem</th>\n <th></th>\n <th></th>\n <th></th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>aaofi</th>\n <td>aaofi</td>\n <td>organizacja rachunkowości i audytu dla islamsk...</td>\n <td>organizacja rachunkowość i audyt dla islamski ...</td>\n </tr>\n <tr>\n <th>aca</th>\n <td>aca</td>\n <td>członek stowarzyszenia dyplomowanych biegłych ...</td>\n <td>członek stowarzyszenie dyplomowany biegły rewi...</td>\n </tr>\n <tr>\n <th>acca</th>\n <td>acca</td>\n <td>stowarzyszenie dyplomowanych biegłych rewidentów</td>\n <td>stowarzyszenie dyplomowany biegły rewident</td>\n </tr>\n <tr>\n <th>abacus</th>\n <td>abacus</td>\n <td>liczydło</td>\n <td>liczydło</td>\n </tr>\n <tr>\n <th>abandonment cost</th>\n <td>abandonment costs</td>\n <td>koszty zaniechania</td>\n <td>koszt zaniechanie</td>\n </tr>\n <tr>\n <th>...</th>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n </tr>\n <tr>\n <th>ytd</th>\n <td>ytd</td>\n <td>od początku roku</td>\n <td>od początek rok</td>\n </tr>\n <tr>\n <th>year-end</th>\n <td>year-end</td>\n <td>koniec roku</td>\n <td>koniec rok</td>\n </tr>\n <tr>\n <th>year-to-date</th>\n <td>year-to-date</td>\n <td>od początku roku</td>\n <td>od początek rok</td>\n </tr>\n <tr>\n <th>zog</th>\n <td>zog</td>\n <td>zero wzrostu kosztów ogólnych</td>\n <td>zero wzrost koszt ogólny</td>\n </tr>\n <tr>\n <th>zero overhead growth</th>\n <td>zero overhead growth</td>\n <td>zero wzrostu kosztów ogólnych</td>\n <td>zero wzrost koszt ogólny</td>\n </tr>\n </tbody>\n</table>\n<p>1197 rows × 3 columns</p>\n</div>" "text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>source</th>\n <th>result</th>\n <th>result_lem</th>\n </tr>\n <tr>\n <th>source_lem</th>\n <th></th>\n <th></th>\n <th></th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>aaofi</th>\n <td>aaofi</td>\n <td>organizacja rachunkowości i audytu dla islamsk...</td>\n <td>organizacja rachunkowość i audyt dla islamski ...</td>\n </tr>\n <tr>\n <th>aca</th>\n <td>aca</td>\n <td>członek stowarzyszenia dyplomowanych biegłych ...</td>\n <td>członek stowarzyszenie dyplomowany biegły rewi...</td>\n </tr>\n <tr>\n <th>acca</th>\n <td>acca</td>\n <td>stowarzyszenie dyplomowanych biegłych rewidentów</td>\n <td>stowarzyszenie dyplomowany biegły rewident</td>\n </tr>\n <tr>\n <th>abacus</th>\n <td>abacus</td>\n <td>liczydło</td>\n <td>liczydło</td>\n </tr>\n <tr>\n <th>abandonment cost</th>\n <td>abandonment costs</td>\n <td>koszty zaniechania</td>\n <td>koszt zaniechanie</td>\n </tr>\n <tr>\n <th>...</th>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n </tr>\n <tr>\n <th>ytd</th>\n <td>ytd</td>\n <td>od początku roku</td>\n <td>od początek rok</td>\n </tr>\n <tr>\n <th>year-end</th>\n <td>year-end</td>\n <td>koniec roku</td>\n <td>koniec rok</td>\n </tr>\n <tr>\n <th>year-to-date</th>\n <td>year-to-date</td>\n <td>od początku roku</td>\n <td>od początek rok</td>\n </tr>\n <tr>\n 
<th>zog</th>\n <td>zog</td>\n <td>zero wzrostu kosztów ogólnych</td>\n <td>zero wzrost koszt ogólny</td>\n </tr>\n <tr>\n <th>zero overhead growth</th>\n <td>zero overhead growth</td>\n <td>zero wzrostu kosztów ogólnych</td>\n <td>zero wzrost koszt ogólny</td>\n </tr>\n </tbody>\n</table>\n<p>1197 rows × 3 columns</p>\n</div>"
}, },
"execution_count": 1, "execution_count": 2,
"metadata": {}, "metadata": {},
"output_type": "execute_result" "output_type": "execute_result"
} }
], ],
"source": [ "source": [
"import time\n",
"\n",
"import pandas as pd\n", "import pandas as pd\n",
"import spacy\n", "import spacy\n",
"\n", "\n",
@ -160,25 +162,8 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 5, "execution_count": null,
"outputs": [ "outputs": [],
{
"ename": "KeyboardInterrupt",
"evalue": "",
"output_type": "error",
"traceback": [
"\u001B[0;31m---------------------------------------------------------------------------\u001B[0m",
"\u001B[0;31mKeyboardInterrupt\u001B[0m Traceback (most recent call last)",
"\u001B[0;32m/tmp/ipykernel_1418662/149035253.py\u001B[0m in \u001B[0;36m<module>\u001B[0;34m\u001B[0m\n\u001B[1;32m 18\u001B[0m \u001B[0;32mfor\u001B[0m \u001B[0mline_id\u001B[0m\u001B[0;34m,\u001B[0m \u001B[0mline\u001B[0m \u001B[0;32min\u001B[0m \u001B[0menumerate\u001B[0m\u001B[0;34m(\u001B[0m\u001B[0mfile_lemmatized\u001B[0m\u001B[0;34m)\u001B[0m\u001B[0;34m:\u001B[0m\u001B[0;34m\u001B[0m\u001B[0;34m\u001B[0m\u001B[0m\n\u001B[1;32m 19\u001B[0m \u001B[0mdoc\u001B[0m \u001B[0;34m=\u001B[0m \u001B[0mnlp\u001B[0m\u001B[0;34m(\u001B[0m\u001B[0mline\u001B[0m\u001B[0;34m)\u001B[0m\u001B[0;34m\u001B[0m\u001B[0;34m\u001B[0m\u001B[0m\n\u001B[0;32m---> 20\u001B[0;31m \u001B[0mmatches\u001B[0m \u001B[0;34m=\u001B[0m \u001B[0mmatcher\u001B[0m\u001B[0;34m(\u001B[0m\u001B[0mdoc\u001B[0m\u001B[0;34m)\u001B[0m\u001B[0;34m\u001B[0m\u001B[0;34m\u001B[0m\u001B[0m\n\u001B[0m\u001B[1;32m 21\u001B[0m \u001B[0;34m\u001B[0m\u001B[0m\n\u001B[1;32m 22\u001B[0m \u001B[0mline_counter\u001B[0m \u001B[0;34m=\u001B[0m \u001B[0;36m0\u001B[0m\u001B[0;34m\u001B[0m\u001B[0;34m\u001B[0m\u001B[0m\n",
"\u001B[0;32m~/Workspace/Envs/trainMT/lib/python3.8/site-packages/spaczz/matcher/_phrasematcher.py\u001B[0m in \u001B[0;36m__call__\u001B[0;34m(self, doc)\u001B[0m\n\u001B[1;32m 95\u001B[0m \u001B[0;32mif\u001B[0m \u001B[0;32mnot\u001B[0m \u001B[0mkwargs\u001B[0m\u001B[0;34m:\u001B[0m\u001B[0;34m\u001B[0m\u001B[0;34m\u001B[0m\u001B[0m\n\u001B[1;32m 96\u001B[0m \u001B[0mkwargs\u001B[0m \u001B[0;34m=\u001B[0m \u001B[0mself\u001B[0m\u001B[0;34m.\u001B[0m\u001B[0mdefaults\u001B[0m\u001B[0;34m\u001B[0m\u001B[0;34m\u001B[0m\u001B[0m\n\u001B[0;32m---> 97\u001B[0;31m \u001B[0mmatches_wo_label\u001B[0m \u001B[0;34m=\u001B[0m \u001B[0mself\u001B[0m\u001B[0;34m.\u001B[0m\u001B[0m_searcher\u001B[0m\u001B[0;34m.\u001B[0m\u001B[0mmatch\u001B[0m\u001B[0;34m(\u001B[0m\u001B[0mdoc\u001B[0m\u001B[0;34m,\u001B[0m \u001B[0mpattern\u001B[0m\u001B[0;34m,\u001B[0m \u001B[0;34m**\u001B[0m\u001B[0mkwargs\u001B[0m\u001B[0;34m)\u001B[0m\u001B[0;34m\u001B[0m\u001B[0;34m\u001B[0m\u001B[0m\n\u001B[0m\u001B[1;32m 98\u001B[0m \u001B[0;32mif\u001B[0m \u001B[0mmatches_wo_label\u001B[0m\u001B[0;34m:\u001B[0m\u001B[0;34m\u001B[0m\u001B[0;34m\u001B[0m\u001B[0m\n\u001B[1;32m 99\u001B[0m matches_w_label = [\n",
"\u001B[0;32m~/Workspace/Envs/trainMT/lib/python3.8/site-packages/spaczz/search/_phrasesearcher.py\u001B[0m in \u001B[0;36mmatch\u001B[0;34m(self, doc, query, flex, min_r1, min_r2, thresh, *args, **kwargs)\u001B[0m\n\u001B[1;32m 137\u001B[0m \u001B[0mflex\u001B[0m \u001B[0;34m=\u001B[0m \u001B[0mself\u001B[0m\u001B[0;34m.\u001B[0m\u001B[0m_calc_flex\u001B[0m\u001B[0;34m(\u001B[0m\u001B[0mquery\u001B[0m\u001B[0;34m,\u001B[0m \u001B[0mflex\u001B[0m\u001B[0;34m)\u001B[0m\u001B[0;34m\u001B[0m\u001B[0;34m\u001B[0m\u001B[0m\n\u001B[1;32m 138\u001B[0m \u001B[0mmin_r1\u001B[0m\u001B[0;34m,\u001B[0m \u001B[0mmin_r2\u001B[0m\u001B[0;34m,\u001B[0m \u001B[0mthresh\u001B[0m \u001B[0;34m=\u001B[0m \u001B[0mself\u001B[0m\u001B[0;34m.\u001B[0m\u001B[0m_check_ratios\u001B[0m\u001B[0;34m(\u001B[0m\u001B[0mmin_r1\u001B[0m\u001B[0;34m,\u001B[0m \u001B[0mmin_r2\u001B[0m\u001B[0;34m,\u001B[0m \u001B[0mthresh\u001B[0m\u001B[0;34m,\u001B[0m \u001B[0mflex\u001B[0m\u001B[0;34m)\u001B[0m\u001B[0;34m\u001B[0m\u001B[0;34m\u001B[0m\u001B[0m\n\u001B[0;32m--> 139\u001B[0;31m \u001B[0mmatch_values\u001B[0m \u001B[0;34m=\u001B[0m \u001B[0mself\u001B[0m\u001B[0;34m.\u001B[0m\u001B[0m_scan\u001B[0m\u001B[0;34m(\u001B[0m\u001B[0mdoc\u001B[0m\u001B[0;34m,\u001B[0m \u001B[0mquery\u001B[0m\u001B[0;34m,\u001B[0m \u001B[0mmin_r1\u001B[0m\u001B[0;34m,\u001B[0m \u001B[0;34m*\u001B[0m\u001B[0margs\u001B[0m\u001B[0;34m,\u001B[0m \u001B[0;34m**\u001B[0m\u001B[0mkwargs\u001B[0m\u001B[0;34m)\u001B[0m\u001B[0;34m\u001B[0m\u001B[0;34m\u001B[0m\u001B[0m\n\u001B[0m\u001B[1;32m 140\u001B[0m \u001B[0;32mif\u001B[0m \u001B[0mmatch_values\u001B[0m\u001B[0;34m:\u001B[0m\u001B[0;34m\u001B[0m\u001B[0;34m\u001B[0m\u001B[0m\n\u001B[1;32m 141\u001B[0m \u001B[0mpositions\u001B[0m \u001B[0;34m=\u001B[0m \u001B[0mlist\u001B[0m\u001B[0;34m(\u001B[0m\u001B[0mmatch_values\u001B[0m\u001B[0;34m.\u001B[0m\u001B[0mkeys\u001B[0m\u001B[0;34m(\u001B[0m\u001B[0;34m)\u001B[0m\u001B[0;34m)\u001B[0m\u001B[0;34m\u001B[0m\u001B[0;34m\u001B[0m\u001B[0m\n",
"\u001B[0;32m~/Workspace/Envs/trainMT/lib/python3.8/site-packages/spaczz/search/_phrasesearcher.py\u001B[0m in \u001B[0;36m_scan\u001B[0;34m(self, doc, query, min_r1, *args, **kwargs)\u001B[0m\n\u001B[1;32m 282\u001B[0m \u001B[0mi\u001B[0m \u001B[0;34m=\u001B[0m \u001B[0;36m0\u001B[0m\u001B[0;34m\u001B[0m\u001B[0;34m\u001B[0m\u001B[0m\n\u001B[1;32m 283\u001B[0m \u001B[0;32mwhile\u001B[0m \u001B[0mi\u001B[0m \u001B[0;34m+\u001B[0m \u001B[0mlen\u001B[0m\u001B[0;34m(\u001B[0m\u001B[0mquery\u001B[0m\u001B[0;34m)\u001B[0m \u001B[0;34m<=\u001B[0m \u001B[0mlen\u001B[0m\u001B[0;34m(\u001B[0m\u001B[0mdoc\u001B[0m\u001B[0;34m)\u001B[0m\u001B[0;34m:\u001B[0m\u001B[0;34m\u001B[0m\u001B[0;34m\u001B[0m\u001B[0m\n\u001B[0;32m--> 284\u001B[0;31m \u001B[0mmatch\u001B[0m \u001B[0;34m=\u001B[0m \u001B[0mself\u001B[0m\u001B[0;34m.\u001B[0m\u001B[0mcompare\u001B[0m\u001B[0;34m(\u001B[0m\u001B[0mquery\u001B[0m\u001B[0;34m,\u001B[0m \u001B[0mdoc\u001B[0m\u001B[0;34m[\u001B[0m\u001B[0mi\u001B[0m \u001B[0;34m:\u001B[0m \u001B[0mi\u001B[0m \u001B[0;34m+\u001B[0m \u001B[0mlen\u001B[0m\u001B[0;34m(\u001B[0m\u001B[0mquery\u001B[0m\u001B[0;34m)\u001B[0m\u001B[0;34m]\u001B[0m\u001B[0;34m,\u001B[0m \u001B[0;34m*\u001B[0m\u001B[0margs\u001B[0m\u001B[0;34m,\u001B[0m \u001B[0;34m**\u001B[0m\u001B[0mkwargs\u001B[0m\u001B[0;34m)\u001B[0m\u001B[0;34m\u001B[0m\u001B[0;34m\u001B[0m\u001B[0m\n\u001B[0m\u001B[1;32m 285\u001B[0m \u001B[0;32mif\u001B[0m \u001B[0mmatch\u001B[0m \u001B[0;34m>=\u001B[0m \u001B[0mmin_r1\u001B[0m\u001B[0;34m:\u001B[0m\u001B[0;34m\u001B[0m\u001B[0;34m\u001B[0m\u001B[0m\n\u001B[1;32m 286\u001B[0m \u001B[0mmatch_values\u001B[0m\u001B[0;34m[\u001B[0m\u001B[0mi\u001B[0m\u001B[0;34m]\u001B[0m \u001B[0;34m=\u001B[0m \u001B[0mmatch\u001B[0m\u001B[0;34m\u001B[0m\u001B[0;34m\u001B[0m\u001B[0m\n",
"\u001B[0;32m~/Workspace/Envs/trainMT/lib/python3.8/site-packages/spacy/tokens/doc.pyx\u001B[0m in \u001B[0;36mspacy.tokens.doc.Doc.__getitem__\u001B[0;34m()\u001B[0m\n",
"\u001B[0;32m~/Workspace/Envs/trainMT/lib/python3.8/site-packages/spacy/util.py\u001B[0m in \u001B[0;36mnormalize_slice\u001B[0;34m(length, start, stop, step)\u001B[0m\n\u001B[1;32m 1199\u001B[0m \u001B[0;34m\u001B[0m\u001B[0m\n\u001B[1;32m 1200\u001B[0m \u001B[0;34m\u001B[0m\u001B[0m\n\u001B[0;32m-> 1201\u001B[0;31m def normalize_slice(\n\u001B[0m\u001B[1;32m 1202\u001B[0m \u001B[0mlength\u001B[0m\u001B[0;34m:\u001B[0m \u001B[0mint\u001B[0m\u001B[0;34m,\u001B[0m \u001B[0mstart\u001B[0m\u001B[0;34m:\u001B[0m \u001B[0mint\u001B[0m\u001B[0;34m,\u001B[0m \u001B[0mstop\u001B[0m\u001B[0;34m:\u001B[0m \u001B[0mint\u001B[0m\u001B[0;34m,\u001B[0m \u001B[0mstep\u001B[0m\u001B[0;34m:\u001B[0m \u001B[0mOptional\u001B[0m\u001B[0;34m[\u001B[0m\u001B[0mint\u001B[0m\u001B[0;34m]\u001B[0m \u001B[0;34m=\u001B[0m \u001B[0;32mNone\u001B[0m\u001B[0;34m\u001B[0m\u001B[0;34m\u001B[0m\u001B[0m\n\u001B[1;32m 1203\u001B[0m ) -> Tuple[int, int]:\n",
"\u001B[0;31mKeyboardInterrupt\u001B[0m: "
]
}
],
"source": [ "source": [
"import spacy\n", "import spacy\n",
"from spaczz.matcher import FuzzyMatcher\n", "from spaczz.matcher import FuzzyMatcher\n",
@ -201,23 +186,24 @@
" doc = nlp(line)\n", " doc = nlp(line)\n",
" matches = matcher(doc)\n", " matches = matcher(doc)\n",
"\n", "\n",
" line_counter = 0\n", " not_injected = 0\n",
" for match_id, start, end, ratio in matches:\n", " for match_id, start, end, ratio in matches:\n",
" if ratio > 90:\n", " if ratio > 90:\n",
" line_counter += 1\n", " not_injected += 1\n",
" en.append(''.join(doc[:end].text + ' ' + train_glossary.loc[lambda df: df['source_lem'] == match_id]['result'].astype(str).values.flatten() + ' ' + doc[end:].text))\n", " en.append(''.join(doc[:end].text + ' ' + train_glossary.loc[lambda df: df['source_lem'] == match_id]['result'].astype(str).values.flatten() + ' ' + doc[end:].text))\n",
"\n", "\n",
"\n", "\n",
" if line_counter == 0:\n", " if not_injected == 0:\n",
" line_counter = 1\n", " not_injected = 1\n",
" en.append(line)\n", " en.append(line)\n",
" translation_line_counts.append(line_counter)\n", " translation_line_counts.append(not_injected)\n",
"\n" "\n"
], ],
"metadata": { "metadata": {
"collapsed": false, "collapsed": false,
"pycharm": { "pycharm": {
"name": "#%%\n" "name": "#%%\n",
"is_executing": true
} }
} }
}, },
@ -283,7 +269,6 @@
"for word, word_id in zip(train_glossary['result_lem'], train_glossary['source_lem']):\n", "for word, word_id in zip(train_glossary['result_lem'], train_glossary['source_lem']):\n",
" matcher_pl.add(word, [nlp_pl(word)])\n", " matcher_pl.add(word, [nlp_pl(word)])\n",
"\n", "\n",
"# todo\n",
"en = []\n", "en = []\n",
"translation_line_counts = []\n", "translation_line_counts = []\n",
"for line_id in range(len(file_lemmatized)):\n", "for line_id in range(len(file_lemmatized)):\n",
@ -291,7 +276,7 @@
" doc = nlp(file_lemmatized[line_id])\n", " doc = nlp(file_lemmatized[line_id])\n",
" matches = matcher(doc)\n", " matches = matcher(doc)\n",
"\n", "\n",
" line_counter = 0\n", " not_injected = 0\n",
" for match_id, start, end, ratio in matches:\n", " for match_id, start, end, ratio in matches:\n",
" if ratio > 90:\n", " if ratio > 90:\n",
" doc_pl = nlp_pl(file_pl_lemmatized[line_id])\n", " doc_pl = nlp_pl(file_pl_lemmatized[line_id])\n",
@ -299,13 +284,13 @@
"\n", "\n",
" for match_id_pl, start_pl, end_pl, ratio_pl in matches_pl:\n", " for match_id_pl, start_pl, end_pl, ratio_pl in matches_pl:\n",
" if match_id_pl == glossary[glossary['source_lem'] == match_id].values[0][3]:\n", " if match_id_pl == glossary[glossary['source_lem'] == match_id].values[0][3]:\n",
" line_counter += 1\n", " not_injected += 1\n",
" en.append(''.join(doc[:end].text + ' ' + train_glossary.loc[lambda df: df['source_lem'] == match_id]['result'].astype(str).values.flatten() + ' ' + doc[end:].text))\n", " en.append(''.join(doc[:end].text + ' ' + train_glossary.loc[lambda df: df['source_lem'] == match_id]['result'].astype(str).values.flatten() + ' ' + doc[end:].text))\n",
"\n", "\n",
" if line_counter == 0:\n", " if not_injected == 0:\n",
" line_counter = 1\n", " not_injected = 1\n",
" en.append(file_lemmatized[line_id])\n", " en.append(file_lemmatized[line_id])\n",
" translation_line_counts.append(line_counter)\n" " translation_line_counts.append(not_injected)\n"
], ],
"metadata": { "metadata": {
"collapsed": false, "collapsed": false,
@ -362,6 +347,136 @@
"name": "#%%\n" "name": "#%%\n"
} }
} }
},
{
"cell_type": "markdown",
"source": [
"# Inject glossary Polish crosscheck fast?"
],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%% md\n"
}
}
},
{
"cell_type": "code",
"execution_count": 49,
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"took 152.213599056 injected 63 words. rate 6.569715230451229 sen/s\n"
]
}
],
"source": [
"import time\n",
"import spacy\n",
"from spaczz.matcher import FuzzyMatcher\n",
"\n",
"\n",
"# glossary\n",
"glossary = pd.read_csv('kompendium_lem.tsv', sep='\\t', header=0, index_col=0)\n",
"train_glossary = glossary.iloc[[x for x in range(len(glossary)) if x % 6 != 0]]\n",
"\n",
"# add rules to English matcher\n",
"nlp = spacy.blank(\"en\")\n",
"matcher = FuzzyMatcher(nlp.vocab)\n",
"for word in train_glossary['source_lem']:\n",
" matcher.add(word, [nlp(word)])\n",
"\n",
"# add rules to Polish matcher\n",
"nlp_pl = spacy.blank(\"pl\")\n",
"matcher_pl = FuzzyMatcher(nlp_pl.vocab)\n",
"for word, word_id in zip(train_glossary['result_lem'], train_glossary['source_lem']):\n",
" matcher_pl.add(word, [nlp_pl(word)])\n",
"\n",
"start_time = time.time_ns()\n",
"en = []\n",
"injection_counter = 0\n",
"for line_id in range(len(file_lemmatized)):\n",
"\n",
" doc = nlp(file_lemmatized[line_id])\n",
" matches = matcher(nlp(file_lemmatized[line_id]))\n",
"\n",
" not_injected = True\n",
" if len(matches) > 0:\n",
" match_id, _, end, ratio = sorted(matches, key=lambda x: len(x[0]), reverse=True)[0]\n",
" if ratio > 90:\n",
" matches_pl = matcher_pl(nlp_pl(file_pl_lemmatized[line_id]))\n",
"\n",
" for match_id_pl, _, _, _ in matches_pl:\n",
" if match_id_pl == glossary[glossary['source_lem'] == match_id].values[0][3]:\n",
" not_injected = False\n",
" injection_counter += 1\n",
" en.append(''.join(doc[:end].text + ' ' + train_glossary.loc[lambda df: df['source_lem'] == match_id]['result'].astype(str).values.flatten() + ' ' + doc[end:].text))\n",
" break\n",
"\n",
" if not_injected:\n",
" en.append(file_lemmatized[line_id])\n",
"\n",
"stop = time.time_ns()\n",
"timex = (stop - start_time) / 1000000000\n",
"print(f'took {timex} injected {injection_counter} words. rate {len(file_lemmatized)/timex} sen/s')"
],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
}
},
{
"cell_type": "code",
"execution_count": 9,
"outputs": [],
"source": [
"import copy\n",
"\n",
"\n",
"tlcs = copy.deepcopy(translation_line_counts)\n",
"\n",
"translations = pd.read_csv(dev_path + '.pl', sep='\\t', header=None, names=['text'])\n",
"translations['id'] = [x for x in range(len(translations))]\n",
"\n",
"ctr = 0\n",
"sentence = ''\n",
"with open(dev_path + '.injected.crossvalidated.en', 'w') as file_en:\n",
" with open(dev_path + '.injected.crossvalidated.pl', 'w') as file_pl:\n",
" for i in range(len(en)):\n",
" if i > 0:\n",
" if en[i-1] != en[i]:\n",
" if ctr == 0:\n",
" sentence = translations.iloc[0]\n",
" translations.drop(sentence['id'], inplace=True)\n",
" sentence = sentence['text']\n",
" try:\n",
" ctr = tlcs.pop(0)\n",
" except:\n",
" pass\n",
" file_en.write(en[i])\n",
" file_pl.write(sentence + '\\n')\n",
" ctr = ctr - 1\n",
" else:\n",
" try:\n",
" ctr = tlcs.pop(0) - 1\n",
" except:\n",
" pass\n",
" sentence = translations.iloc[0]\n",
" translations.drop(sentence['id'], inplace=True)\n",
" sentence = sentence['text']\n",
" file_en.write(en[i])\n",
" file_pl.write(sentence + '\\n')\n"
],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
}
} }
], ],
"metadata": { "metadata": {

1194
kompendium_lem_cleaned.tsv Normal file

File diff suppressed because it is too large

11740
rapidfuzztest.ipynb Normal file

File diff suppressed because it is too large

View File

@ -1,18 +1,30 @@
 first iteration:

-./marian/build/marian --model mt.npz --type transformer --overwrite \
+./marian/build/marian --model mt.npz \
+    --type transformer --overwrite \
     --train-sets mt-summit-corpora/mt-summit-corpora/dev/dev.en \
                  mt-summit-corpora/mt-summit-corpora/dev/dev.pl \
-    --disp-freq 1000 --save-freq 1000 --optimizer adam --lr-report
+    --disp-freq 1000 \
+    --save-freq 1000 \
+    --optimizer adam \
+    --lr-report

 next iterations:

-./marian/build/marian --model mt.npz --type transformer --overwrite \
+./marian/build/marian --model mt.npz \
+    --type transformer --overwrite \
     --train-sets mt-summit-corpora/mt-summit-corpora/dev/dev.en \
                  mt-summit-corpora/mt-summit-corpora/dev/dev.pl \
-    --disp-freq 1000 --save-freq 1000 --optimizer adam --lr-report \
+    --disp-freq 1000 \
+    --save-freq 1000 \
+    --optimizer adam \
+    --lr-report \
     --pretrained-model mt.npz

-./marian/build/marian --model mt.npz --type transformer --overwrite \
+./marian/build/marian --model mt.npz \
+    --type transformer --overwrite \
     --train-sets mt-summit-corpora/mt-summit-corpora/train/train.en \
                  mt-summit-corpora/mt-summit-corpora/train/train.pl \
-    --disp-freq 1000 --save-freq 10000 --optimizer adam --lr-report \
+    --disp-freq 1000 \
+    --save-freq 10000 \
+    --optimizer adam \
+    --lr-report \
     --pretrained-model mt.npz

12
venv-setup.sh Normal file
View File

@ -0,0 +1,12 @@
#!/bin/bash
apt install python3-pip
apt install python3-virtualenv
virtualenv -p python3.8 gpu
source gpu/bin/activate
pip install pandas ipython
pip install 'spacy[cuda114]'
python -m spacy download en_core_web_sm
python -m spacy download pl_core_news_sm
pip install spaczz
pip install rapidfuzz
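
A quick smoke test for the environment this script sets up (a sketch, not part of the repository; spacy.prefer_gpu() only reports whether the CUDA build is usable instead of failing on CPU-only machines):

# Verify that the packages installed by venv-setup.sh import and basically work.
import spacy
import rapidfuzz
from spaczz.matcher import FuzzyMatcher

print('GPU available to spaCy:', spacy.prefer_gpu())

nlp_en = spacy.load('en_core_web_sm')
nlp_pl = spacy.load('pl_core_news_sm')
print([token.lemma_ for token in nlp_en('abandonment costs')])
print([token.lemma_ for token in nlp_pl('koszty zaniechania')])

print('rapidfuzz partial_ratio:',
      rapidfuzz.fuzz.partial_ratio('abandonment cost', 'abandonment costs were high'))

matcher = FuzzyMatcher(nlp_en.vocab)
matcher.add('abandonment cost', [nlp_en('abandonment cost')])
print('spaczz matches:', matcher(nlp_en('the abandonment costs were high')))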