add rapidfuzz
This commit is contained in:
parent
cb4a413644
commit
05e1a10139
136
inject.py
Normal file
136
inject.py
Normal file
@ -0,0 +1,136 @@
|
||||
import copy

import pandas as pd
import spacy
from spaczz.matcher import FuzzyMatcher

# spacy.require_gpu()

# Shared spaCy pipelines: English for the glossary/corpus source side,
# Polish for the translation side.
spacy_nlp_en = spacy.load('en_core_web_sm')
spacy_nlp_pl = spacy.load('pl_core_news_sm')
||||
print('lemmatizing glossary')

# Glossary TSV: column 0 = English term, column 1 = Polish translation.
glossary = pd.read_csv('glossary.tsv', sep='\t', header=None, names=['source', 'result'])

# Lemmatize the English side of every glossary entry.
source_lemmatized = []
for word in glossary['source']:
    lemmas = [token.lemma_ for token in spacy_nlp_en(word)]
    # Re-join and undo the whitespace spaCy inserts around punctuation.
    source_lemmatized.append(' '.join(lemmas).replace(' - ', '-').replace(' ’', '’')
                             .replace(' / ', '/').replace(' ( ', '(').replace(' ) ', ')'))

# Lemmatize the Polish side with the Polish pipeline.
result_lemmatized = []
for word in glossary['result']:
    lemmas = [token.lemma_ for token in spacy_nlp_pl(word)]
    result_lemmatized.append(' '.join(lemmas).replace(' - ', '-').replace(' ’', '’')
                             .replace(' / ', '/').replace(' ( ', '(').replace(' ) ', ')'))

glossary['source_lem'] = source_lemmatized
glossary['result_lem'] = result_lemmatized
glossary = glossary[['source', 'source_lem', 'result', 'result_lem']]
# NOTE(review): the original called glossary.set_index('source_lem') without
# assigning the result -- a no-op, since set_index returns a new frame.  The
# later reader of glossary_lem.tsv (index_col=0) depends on the default
# integer index being the first column, so the dead call is removed rather
# than made effective.
glossary.to_csv('glossary_lem.tsv', sep='\t')
|
||||
|
||||
dev_path = 'dev-0/'

print('lemmatizing corpus ' + dev_path)

# Punctuation tokens to drop from lemmatized sentences.  The `in` test below
# is a substring check, so only single-character tokens are ever filtered.
skip_chars = ''',./!?'''

with open(dev_path + 'in.tsv', 'r') as file:
    file_lemmatized = []
    for line in file:
        lemmas = [token.lemma_ for token in spacy_nlp_en(line)]
        file_lemmatized.append(' '.join([x for x in lemmas if x not in skip_chars])
                               .replace(' - ', '-').replace(' ’', '’')
                               .replace(' / ', '/').replace(' ( ', '(').replace(' ) ', ')'))

with open(dev_path + 'expected.tsv', 'r') as file:
    file_pl_lemmatized = []
    for line in file:
        lemmas = [token.lemma_ for token in spacy_nlp_pl(line)]
        file_pl_lemmatized.append(' '.join([x for x in lemmas if x not in skip_chars])
                                  .replace(' - ', '-').replace(' ’', '’')
                                  .replace(' / ', '/').replace(' ( ', '(').replace(' ) ', ')'))
|
||||
|
||||
# Glossary with lemmatized columns, produced by the first half of this script.
glossary = pd.read_csv('glossary_lem.tsv', sep='\t', header=0, index_col=0)
# Hold out every 6th row for cross-validation; build matchers from the rest.
train_glossary = glossary.iloc[[x for x in range(len(glossary)) if x % 6 != 0]]

# English fuzzy matcher: one rule per lemmatized source term.  The rule label
# is the term itself, which the injection loop later receives as match_id.
nlp = spacy.blank("en")
matcher = FuzzyMatcher(nlp.vocab)
for word in train_glossary['source_lem']:
    matcher.add(word, [nlp(word)])

# Polish fuzzy matcher over the lemmatized target-side terms.
nlp_pl = spacy.blank("pl")
matcher_pl = FuzzyMatcher(nlp_pl.vocab)
for word in train_glossary['result_lem']:
    matcher_pl.add(word, [nlp_pl(word)])
|
||||
|
||||
# Cross-checked glossary injection: for every English sentence whose fuzzy
# match on a glossary source term scores > 90, and whose Polish reference
# also fuzzy-matches the corresponding Polish glossary term, emit a variant
# of the sentence with the glossary translation spliced in after the match.
en = []
translation_line_counts = []  # per source sentence: number of variants emitted
for line_id in range(len(file_lemmatized)):

    if line_id % 100 == 0:
        print('injecting glossary: ' + str(line_id) + "/" + str(len(file_lemmatized)), end='\r')

    doc = nlp(file_lemmatized[line_id])
    matches = matcher(doc)

    line_counter = 0
    for match_id, start, end, ratio in matches:
        if ratio > 90:
            doc_pl = nlp_pl(file_pl_lemmatized[line_id])
            matches_pl = matcher_pl(doc_pl)

            # values[0][3] is the 'result_lem' column of the glossary row whose
            # source_lem equals the English match label: inject only when the
            # Polish matcher found that same target term in the reference.
            for match_id_pl, start_pl, end_pl, ratio_pl in matches_pl:
                if match_id_pl == glossary[glossary['source_lem'] == match_id].values[0][3]:
                    line_counter += 1
                    # NOTE(review): values.flatten() is a numpy array; str +
                    # array broadcasts, and ''.join(...) re-joins the pieces.
                    # Works for the usual single-row match -- verify for terms
                    # with duplicate source_lem entries.
                    en.append(''.join(doc[:end].text + ' ' + train_glossary.loc[lambda df: df['source_lem'] == match_id]['result'].astype(str).values.flatten() + ' ' + doc[end:].text))

    # No injection: keep the original sentence, counted once.
    if line_counter == 0:
        line_counter = 1
        en.append(file_lemmatized[line_id])
    translation_line_counts.append(line_counter)
|
||||
|
||||
print('saving files')
tlcs = copy.deepcopy(translation_line_counts)

translations = pd.read_csv(dev_path + 'expected.tsv', sep='\t', header=None, names=['text'])
translations['id'] = [x for x in range(len(translations))]

# Walk the injected sentences and pair each with its Polish reference,
# repeating the reference once per injected variant of the same source line.
# NOTE(review): block nesting reconstructed from a whitespace-mangled source;
# verify the pairing against the original script's output.
ctr = 0
sentence = ''
with open(dev_path + 'in.tsv.injected.crossvalidated', 'w') as file_en:
    with open(dev_path + 'expected.tsv.injected.crossvalidated', 'w') as file_pl:
        for i in range(len(en)):
            if i > 0:
                if en[i - 1] != en[i]:
                    # Moved on to a new source sentence: advance the Polish
                    # reference and reload the per-sentence repeat counter.
                    if ctr == 0:
                        sentence = translations.iloc[0]
                        translations.drop(sentence['id'], inplace=True)
                        sentence = sentence['text']
                    try:
                        ctr = tlcs.pop(0)
                    except IndexError:  # counts exhausted; keep last value
                        pass
                file_en.write(en[i])
                file_pl.write(sentence + '\n')
                ctr = ctr - 1
            else:
                # First sentence: prime counter and reference.
                try:
                    ctr = tlcs.pop(0) - 1
                except IndexError:
                    pass
                sentence = translations.iloc[0]
                translations.drop(sentence['id'], inplace=True)
                sentence = sentence['text']
                file_en.write(en[i])
                file_pl.write(sentence + '\n')
||||
|
124
inject_rapid.py
Normal file
124
inject_rapid.py
Normal file
@ -0,0 +1,124 @@
|
||||
import copy
import sys
import time

import pandas as pd
import rapidfuzz
from rapidfuzz.fuzz import partial_ratio
from rapidfuzz.utils import default_process
import spacy

# Run the spaCy pipelines on the GPU (raises if none is available).
spacy.require_gpu()

spacy_nlp_en = spacy.load('en_core_web_sm')
spacy_nlp_pl = spacy.load("pl_core_news_sm")
|
||||
|
||||
|
||||
def read_arguments():
    """Read the corpus and glossary paths from the command line.

    Returns:
        (corpus_path, glossary_path) taken from sys.argv[1] and sys.argv[2].

    Exits with status 1 when the argument count is wrong.
    """
    try:
        # BUG FIX: the original unpacked sys.argv directly into two names,
        # but argv always starts with the script name, so the unpack could
        # never succeed with two real arguments.
        _, corpus_path, glossary_path = sys.argv
        return corpus_path, glossary_path
    except ValueError:  # wrong number of values to unpack
        print("ERROR: Wrong argument amount.")
        sys.exit(1)
|
||||
|
||||
|
||||
|
||||
import os.path

# Raw glossary: English term TAB Polish translation.  pandas expands '~'
# itself, so the path can stay as-is here.
glossary = pd.read_csv('~/mt-summit-corpora/glossary.tsv', sep='\t', header=None, names=['source', 'result'])

# Lemmatize the English side of every glossary entry.
source_lemmatized = []
for word in glossary['source']:
    lemmas = [token.lemma_ for token in spacy_nlp_en(word)]
    # Re-join and undo the whitespace spaCy inserts around punctuation.
    source_lemmatized.append(' '.join(lemmas).replace(' - ', '-').replace(' ’', '’')
                             .replace(' / ', '/').replace(' ( ', '(').replace(' ) ', ')'))

# Lemmatize the Polish side with the Polish pipeline.
result_lemmatized = []
for word in glossary['result']:
    lemmas = [token.lemma_ for token in spacy_nlp_pl(word)]
    result_lemmatized.append(' '.join(lemmas).replace(' - ', '-').replace(' ’', '’')
                             .replace(' / ', '/').replace(' ( ', '(').replace(' ) ', ')'))

glossary['source_lem'] = source_lemmatized
glossary['result_lem'] = result_lemmatized
glossary = glossary[['source', 'source_lem', 'result', 'result_lem']]

# BUG FIX: builtin open() does NOT expand '~', so the open() calls below
# failed with FileNotFoundError; expand the home directory explicitly.
corpus_path = os.path.expanduser('~/mt-summit-corpora/train/')

# Punctuation tokens to drop from lemmatized sentences (substring test, so
# only single-character tokens are ever filtered).
skip_chars = ''',./!?'''

with open(corpus_path + 'in.tsv', 'r') as file:
    file_lemmatized = []
    for line in file:
        lemmas = [token.lemma_ for token in spacy_nlp_en(line)]
        file_lemmatized.append(' '.join([x for x in lemmas if x not in skip_chars])
                               .replace(' - ', '-').replace(' ’', '’')
                               .replace(' / ', '/').replace(' ( ', '(').replace(' ) ', ')'))

with open(corpus_path + 'expected.tsv', 'r') as file:
    file_pl_lemmatized = []
    for line in file:
        lemmas = [token.lemma_ for token in spacy_nlp_pl(line)]
        file_pl_lemmatized.append(' '.join([x for x in lemmas if x not in skip_chars])
                                  .replace(' - ', '-').replace(' ’', '’')
                                  .replace(' / ', '/').replace(' ( ', '(').replace(' ) ', ')'))

# Minimum partial-ratio score for a glossary term to count as present.
THRESHOLD = 88
|
||||
|
||||
def is_injectable(sentence_pl, sequence):
    """Return the best partial-ratio score of `sequence` against any word
    window of matching length in `sentence_pl` (0 when no window fits).

    Despite the name this returns a score, not a bool; callers are expected
    to compare it against THRESHOLD themselves.
    """
    words = sentence_pl.split()
    window = len(sequence.split())
    # NOTE(review): range stops one window early (len - window), so the final
    # window is never scored -- kept identical to the original behavior.
    scores = (
        rapidfuzz.fuzz.partial_ratio(' '.join(words[i:i + window]), sequence)
        for i in range(len(words) - window)
    )
    return max(scores, default=0)
|
||||
|
||||
def inject(sentence, sequence):
    # Find the word window in `sentence` that best fuzzy-matches `sequence`
    # (same truncated range as is_injectable: the final window is never
    # scored), then splice the glossary translation in right after it.
    #
    # NOTE(review): the return value is NOT a str.  `values.flatten()` is a
    # numpy array and str + array broadcasts, so this returns an array of
    # strings -- one per glossary row whose source_lem equals `sequence`.
    # The caller takes element [0].
    sen = sentence.split()
    window_size = len(sequence.split())
    maxx = 0
    maxxi = 0
    for i in range(len(sen) - window_size):
        current = rapidfuzz.fuzz.partial_ratio(' '.join(sen[i:i + window_size]), sequence)
        if current > maxx:
            maxx = current
            maxxi = i
    return ' '.join(sen[:maxxi + window_size]) + ' ' \
        + glossary.loc[lambda df: df['source_lem'] == sequence]['result'].astype(str).values.flatten() \
        + ' ' + ' '.join(sen[maxxi + window_size:])
|
||||
|
||||
# Cleaned, lemmatized glossary; normalize the lookup column the same way the
# query lines are normalized so partial_ratio compares like with like.
glossary = pd.read_csv('kompendium_lem_cleaned.tsv', sep='\t', header=0, index_col=0)
glossary['source_lem'] = [default_process(x) for x in glossary['source_lem']]

start_time = time.time_ns()
en = []
translation_line_counts = []  # per source line: number of injected variants
for line, line_pl in zip(file_lemmatized, file_pl_lemmatized):
    line = default_process(line)
    line_pl = default_process(line_pl)
    # Up to 5 glossary terms scoring >= THRESHOLD against this line;
    # each element of `matchez` is (choice, score, index).
    matchez = rapidfuzz.process.extract(query=line, choices=glossary['source_lem'], limit=5, score_cutoff=THRESHOLD, scorer=partial_ratio)
    translation_line_counts.append(len(matchez))
    for match in matchez:
        # if is_injectable(line_pl, match[0]):
        # inject() returns a broadcast numpy array of strings; take the first.
        en.append(inject(line, match[0])[0])

stop = time.time_ns()
timex = (stop - start_time) / 1000000000  # elapsed seconds
print(timex)

tlcs = copy.deepcopy(translation_line_counts)

translations = pd.read_csv(corpus_path + 'expected.tsv', sep='\t', header=None, names=['text'])
# BUG FIX: output filename said 'extected' instead of 'expected' (the dev
# script writes 'expected.tsv.injected.crossvalidated').
with open(corpus_path + 'expected.tsv.injected.crossvalidated.pl', 'w') as file_pl:
    # BUG FIX: iterating a DataFrame yields column NAMES, not rows; iterate
    # the text column so each Polish sentence is written once per injected
    # English variant of the corresponding source line.
    for line, translation_line_ct in zip(translations['text'], tlcs):
        for i in range(translation_line_ct):
            # read_csv strips newlines, so add one back per written copy.
            file_pl.write(line + '\n')

with open(corpus_path + 'in.tsv.injected.crossvalidated.en', 'w') as file_en:
    for e in en:
        file_en.write(e + '\n')
|
@ -15,19 +15,21 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 1,
|
||||
"execution_count": 2,
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": " source \\\nsource_lem \naaofi aaofi \naca aca \nacca acca \nabacus abacus \nabandonment cost abandonment costs \n... ... \nytd ytd \nyear-end year-end \nyear-to-date year-to-date \nzog zog \nzero overhead growth zero overhead growth \n\n result \\\nsource_lem \naaofi organizacja rachunkowości i audytu dla islamsk... \naca członek stowarzyszenia dyplomowanych biegłych ... \nacca stowarzyszenie dyplomowanych biegłych rewidentów \nabacus liczydło \nabandonment cost koszty zaniechania \n... ... \nytd od początku roku \nyear-end koniec roku \nyear-to-date od początku roku \nzog zero wzrostu kosztów ogólnych \nzero overhead growth zero wzrostu kosztów ogólnych \n\n result_lem \nsource_lem \naaofi organizacja rachunkowość i audyt dla islamski ... \naca członek stowarzyszenie dyplomowany biegły rewi... \nacca stowarzyszenie dyplomowany biegły rewident \nabacus liczydło \nabandonment cost koszt zaniechanie \n... ... \nytd od początek rok \nyear-end koniec rok \nyear-to-date od początek rok \nzog zero wzrost koszt ogólny \nzero overhead growth zero wzrost koszt ogólny \n\n[1197 rows x 3 columns]",
|
||||
"text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>source</th>\n <th>result</th>\n <th>result_lem</th>\n </tr>\n <tr>\n <th>source_lem</th>\n <th></th>\n <th></th>\n <th></th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>aaofi</th>\n <td>aaofi</td>\n <td>organizacja rachunkowości i audytu dla islamsk...</td>\n <td>organizacja rachunkowość i audyt dla islamski ...</td>\n </tr>\n <tr>\n <th>aca</th>\n <td>aca</td>\n <td>członek stowarzyszenia dyplomowanych biegłych ...</td>\n <td>członek stowarzyszenie dyplomowany biegły rewi...</td>\n </tr>\n <tr>\n <th>acca</th>\n <td>acca</td>\n <td>stowarzyszenie dyplomowanych biegłych rewidentów</td>\n <td>stowarzyszenie dyplomowany biegły rewident</td>\n </tr>\n <tr>\n <th>abacus</th>\n <td>abacus</td>\n <td>liczydło</td>\n <td>liczydło</td>\n </tr>\n <tr>\n <th>abandonment cost</th>\n <td>abandonment costs</td>\n <td>koszty zaniechania</td>\n <td>koszt zaniechanie</td>\n </tr>\n <tr>\n <th>...</th>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n </tr>\n <tr>\n <th>ytd</th>\n <td>ytd</td>\n <td>od początku roku</td>\n <td>od początek rok</td>\n </tr>\n <tr>\n <th>year-end</th>\n <td>year-end</td>\n <td>koniec roku</td>\n <td>koniec rok</td>\n </tr>\n <tr>\n <th>year-to-date</th>\n <td>year-to-date</td>\n <td>od początku roku</td>\n <td>od początek rok</td>\n </tr>\n <tr>\n <th>zog</th>\n <td>zog</td>\n <td>zero wzrostu kosztów ogólnych</td>\n <td>zero wzrost koszt ogólny</td>\n </tr>\n <tr>\n <th>zero overhead growth</th>\n <td>zero overhead growth</td>\n <td>zero wzrostu kosztów ogólnych</td>\n <td>zero wzrost koszt ogólny</td>\n </tr>\n </tbody>\n</table>\n<p>1197 rows × 3 columns</p>\n</div>"
|
||||
},
|
||||
"execution_count": 1,
|
||||
"execution_count": 2,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"import time\n",
|
||||
"\n",
|
||||
"import pandas as pd\n",
|
||||
"import spacy\n",
|
||||
"\n",
|
||||
@ -160,25 +162,8 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 5,
|
||||
"outputs": [
|
||||
{
|
||||
"ename": "KeyboardInterrupt",
|
||||
"evalue": "",
|
||||
"output_type": "error",
|
||||
"traceback": [
|
||||
"\u001B[0;31m---------------------------------------------------------------------------\u001B[0m",
|
||||
"\u001B[0;31mKeyboardInterrupt\u001B[0m Traceback (most recent call last)",
|
||||
"\u001B[0;32m/tmp/ipykernel_1418662/149035253.py\u001B[0m in \u001B[0;36m<module>\u001B[0;34m\u001B[0m\n\u001B[1;32m 18\u001B[0m \u001B[0;32mfor\u001B[0m \u001B[0mline_id\u001B[0m\u001B[0;34m,\u001B[0m \u001B[0mline\u001B[0m \u001B[0;32min\u001B[0m \u001B[0menumerate\u001B[0m\u001B[0;34m(\u001B[0m\u001B[0mfile_lemmatized\u001B[0m\u001B[0;34m)\u001B[0m\u001B[0;34m:\u001B[0m\u001B[0;34m\u001B[0m\u001B[0;34m\u001B[0m\u001B[0m\n\u001B[1;32m 19\u001B[0m \u001B[0mdoc\u001B[0m \u001B[0;34m=\u001B[0m \u001B[0mnlp\u001B[0m\u001B[0;34m(\u001B[0m\u001B[0mline\u001B[0m\u001B[0;34m)\u001B[0m\u001B[0;34m\u001B[0m\u001B[0;34m\u001B[0m\u001B[0m\n\u001B[0;32m---> 20\u001B[0;31m \u001B[0mmatches\u001B[0m \u001B[0;34m=\u001B[0m \u001B[0mmatcher\u001B[0m\u001B[0;34m(\u001B[0m\u001B[0mdoc\u001B[0m\u001B[0;34m)\u001B[0m\u001B[0;34m\u001B[0m\u001B[0;34m\u001B[0m\u001B[0m\n\u001B[0m\u001B[1;32m 21\u001B[0m \u001B[0;34m\u001B[0m\u001B[0m\n\u001B[1;32m 22\u001B[0m \u001B[0mline_counter\u001B[0m \u001B[0;34m=\u001B[0m \u001B[0;36m0\u001B[0m\u001B[0;34m\u001B[0m\u001B[0;34m\u001B[0m\u001B[0m\n",
|
||||
"\u001B[0;32m~/Workspace/Envs/trainMT/lib/python3.8/site-packages/spaczz/matcher/_phrasematcher.py\u001B[0m in \u001B[0;36m__call__\u001B[0;34m(self, doc)\u001B[0m\n\u001B[1;32m 95\u001B[0m \u001B[0;32mif\u001B[0m \u001B[0;32mnot\u001B[0m \u001B[0mkwargs\u001B[0m\u001B[0;34m:\u001B[0m\u001B[0;34m\u001B[0m\u001B[0;34m\u001B[0m\u001B[0m\n\u001B[1;32m 96\u001B[0m \u001B[0mkwargs\u001B[0m \u001B[0;34m=\u001B[0m \u001B[0mself\u001B[0m\u001B[0;34m.\u001B[0m\u001B[0mdefaults\u001B[0m\u001B[0;34m\u001B[0m\u001B[0;34m\u001B[0m\u001B[0m\n\u001B[0;32m---> 97\u001B[0;31m \u001B[0mmatches_wo_label\u001B[0m \u001B[0;34m=\u001B[0m \u001B[0mself\u001B[0m\u001B[0;34m.\u001B[0m\u001B[0m_searcher\u001B[0m\u001B[0;34m.\u001B[0m\u001B[0mmatch\u001B[0m\u001B[0;34m(\u001B[0m\u001B[0mdoc\u001B[0m\u001B[0;34m,\u001B[0m \u001B[0mpattern\u001B[0m\u001B[0;34m,\u001B[0m \u001B[0;34m**\u001B[0m\u001B[0mkwargs\u001B[0m\u001B[0;34m)\u001B[0m\u001B[0;34m\u001B[0m\u001B[0;34m\u001B[0m\u001B[0m\n\u001B[0m\u001B[1;32m 98\u001B[0m \u001B[0;32mif\u001B[0m \u001B[0mmatches_wo_label\u001B[0m\u001B[0;34m:\u001B[0m\u001B[0;34m\u001B[0m\u001B[0;34m\u001B[0m\u001B[0m\n\u001B[1;32m 99\u001B[0m matches_w_label = [\n",
|
||||
"\u001B[0;32m~/Workspace/Envs/trainMT/lib/python3.8/site-packages/spaczz/search/_phrasesearcher.py\u001B[0m in \u001B[0;36mmatch\u001B[0;34m(self, doc, query, flex, min_r1, min_r2, thresh, *args, **kwargs)\u001B[0m\n\u001B[1;32m 137\u001B[0m \u001B[0mflex\u001B[0m \u001B[0;34m=\u001B[0m \u001B[0mself\u001B[0m\u001B[0;34m.\u001B[0m\u001B[0m_calc_flex\u001B[0m\u001B[0;34m(\u001B[0m\u001B[0mquery\u001B[0m\u001B[0;34m,\u001B[0m \u001B[0mflex\u001B[0m\u001B[0;34m)\u001B[0m\u001B[0;34m\u001B[0m\u001B[0;34m\u001B[0m\u001B[0m\n\u001B[1;32m 138\u001B[0m \u001B[0mmin_r1\u001B[0m\u001B[0;34m,\u001B[0m \u001B[0mmin_r2\u001B[0m\u001B[0;34m,\u001B[0m \u001B[0mthresh\u001B[0m \u001B[0;34m=\u001B[0m \u001B[0mself\u001B[0m\u001B[0;34m.\u001B[0m\u001B[0m_check_ratios\u001B[0m\u001B[0;34m(\u001B[0m\u001B[0mmin_r1\u001B[0m\u001B[0;34m,\u001B[0m \u001B[0mmin_r2\u001B[0m\u001B[0;34m,\u001B[0m \u001B[0mthresh\u001B[0m\u001B[0;34m,\u001B[0m \u001B[0mflex\u001B[0m\u001B[0;34m)\u001B[0m\u001B[0;34m\u001B[0m\u001B[0;34m\u001B[0m\u001B[0m\n\u001B[0;32m--> 139\u001B[0;31m \u001B[0mmatch_values\u001B[0m \u001B[0;34m=\u001B[0m \u001B[0mself\u001B[0m\u001B[0;34m.\u001B[0m\u001B[0m_scan\u001B[0m\u001B[0;34m(\u001B[0m\u001B[0mdoc\u001B[0m\u001B[0;34m,\u001B[0m \u001B[0mquery\u001B[0m\u001B[0;34m,\u001B[0m \u001B[0mmin_r1\u001B[0m\u001B[0;34m,\u001B[0m \u001B[0;34m*\u001B[0m\u001B[0margs\u001B[0m\u001B[0;34m,\u001B[0m \u001B[0;34m**\u001B[0m\u001B[0mkwargs\u001B[0m\u001B[0;34m)\u001B[0m\u001B[0;34m\u001B[0m\u001B[0;34m\u001B[0m\u001B[0m\n\u001B[0m\u001B[1;32m 140\u001B[0m \u001B[0;32mif\u001B[0m \u001B[0mmatch_values\u001B[0m\u001B[0;34m:\u001B[0m\u001B[0;34m\u001B[0m\u001B[0;34m\u001B[0m\u001B[0m\n\u001B[1;32m 141\u001B[0m \u001B[0mpositions\u001B[0m \u001B[0;34m=\u001B[0m 
\u001B[0mlist\u001B[0m\u001B[0;34m(\u001B[0m\u001B[0mmatch_values\u001B[0m\u001B[0;34m.\u001B[0m\u001B[0mkeys\u001B[0m\u001B[0;34m(\u001B[0m\u001B[0;34m)\u001B[0m\u001B[0;34m)\u001B[0m\u001B[0;34m\u001B[0m\u001B[0;34m\u001B[0m\u001B[0m\n",
|
||||
"\u001B[0;32m~/Workspace/Envs/trainMT/lib/python3.8/site-packages/spaczz/search/_phrasesearcher.py\u001B[0m in \u001B[0;36m_scan\u001B[0;34m(self, doc, query, min_r1, *args, **kwargs)\u001B[0m\n\u001B[1;32m 282\u001B[0m \u001B[0mi\u001B[0m \u001B[0;34m=\u001B[0m \u001B[0;36m0\u001B[0m\u001B[0;34m\u001B[0m\u001B[0;34m\u001B[0m\u001B[0m\n\u001B[1;32m 283\u001B[0m \u001B[0;32mwhile\u001B[0m \u001B[0mi\u001B[0m \u001B[0;34m+\u001B[0m \u001B[0mlen\u001B[0m\u001B[0;34m(\u001B[0m\u001B[0mquery\u001B[0m\u001B[0;34m)\u001B[0m \u001B[0;34m<=\u001B[0m \u001B[0mlen\u001B[0m\u001B[0;34m(\u001B[0m\u001B[0mdoc\u001B[0m\u001B[0;34m)\u001B[0m\u001B[0;34m:\u001B[0m\u001B[0;34m\u001B[0m\u001B[0;34m\u001B[0m\u001B[0m\n\u001B[0;32m--> 284\u001B[0;31m \u001B[0mmatch\u001B[0m \u001B[0;34m=\u001B[0m \u001B[0mself\u001B[0m\u001B[0;34m.\u001B[0m\u001B[0mcompare\u001B[0m\u001B[0;34m(\u001B[0m\u001B[0mquery\u001B[0m\u001B[0;34m,\u001B[0m \u001B[0mdoc\u001B[0m\u001B[0;34m[\u001B[0m\u001B[0mi\u001B[0m \u001B[0;34m:\u001B[0m \u001B[0mi\u001B[0m \u001B[0;34m+\u001B[0m \u001B[0mlen\u001B[0m\u001B[0;34m(\u001B[0m\u001B[0mquery\u001B[0m\u001B[0;34m)\u001B[0m\u001B[0;34m]\u001B[0m\u001B[0;34m,\u001B[0m \u001B[0;34m*\u001B[0m\u001B[0margs\u001B[0m\u001B[0;34m,\u001B[0m \u001B[0;34m**\u001B[0m\u001B[0mkwargs\u001B[0m\u001B[0;34m)\u001B[0m\u001B[0;34m\u001B[0m\u001B[0;34m\u001B[0m\u001B[0m\n\u001B[0m\u001B[1;32m 285\u001B[0m \u001B[0;32mif\u001B[0m \u001B[0mmatch\u001B[0m \u001B[0;34m>=\u001B[0m \u001B[0mmin_r1\u001B[0m\u001B[0;34m:\u001B[0m\u001B[0;34m\u001B[0m\u001B[0;34m\u001B[0m\u001B[0m\n\u001B[1;32m 286\u001B[0m \u001B[0mmatch_values\u001B[0m\u001B[0;34m[\u001B[0m\u001B[0mi\u001B[0m\u001B[0;34m]\u001B[0m \u001B[0;34m=\u001B[0m \u001B[0mmatch\u001B[0m\u001B[0;34m\u001B[0m\u001B[0;34m\u001B[0m\u001B[0m\n",
|
||||
"\u001B[0;32m~/Workspace/Envs/trainMT/lib/python3.8/site-packages/spacy/tokens/doc.pyx\u001B[0m in \u001B[0;36mspacy.tokens.doc.Doc.__getitem__\u001B[0;34m()\u001B[0m\n",
|
||||
"\u001B[0;32m~/Workspace/Envs/trainMT/lib/python3.8/site-packages/spacy/util.py\u001B[0m in \u001B[0;36mnormalize_slice\u001B[0;34m(length, start, stop, step)\u001B[0m\n\u001B[1;32m 1199\u001B[0m \u001B[0;34m\u001B[0m\u001B[0m\n\u001B[1;32m 1200\u001B[0m \u001B[0;34m\u001B[0m\u001B[0m\n\u001B[0;32m-> 1201\u001B[0;31m def normalize_slice(\n\u001B[0m\u001B[1;32m 1202\u001B[0m \u001B[0mlength\u001B[0m\u001B[0;34m:\u001B[0m \u001B[0mint\u001B[0m\u001B[0;34m,\u001B[0m \u001B[0mstart\u001B[0m\u001B[0;34m:\u001B[0m \u001B[0mint\u001B[0m\u001B[0;34m,\u001B[0m \u001B[0mstop\u001B[0m\u001B[0;34m:\u001B[0m \u001B[0mint\u001B[0m\u001B[0;34m,\u001B[0m \u001B[0mstep\u001B[0m\u001B[0;34m:\u001B[0m \u001B[0mOptional\u001B[0m\u001B[0;34m[\u001B[0m\u001B[0mint\u001B[0m\u001B[0;34m]\u001B[0m \u001B[0;34m=\u001B[0m \u001B[0;32mNone\u001B[0m\u001B[0;34m\u001B[0m\u001B[0;34m\u001B[0m\u001B[0m\n\u001B[1;32m 1203\u001B[0m ) -> Tuple[int, int]:\n",
|
||||
"\u001B[0;31mKeyboardInterrupt\u001B[0m: "
|
||||
]
|
||||
}
|
||||
],
|
||||
"execution_count": null,
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import spacy\n",
|
||||
"from spaczz.matcher import FuzzyMatcher\n",
|
||||
@ -201,23 +186,24 @@
|
||||
" doc = nlp(line)\n",
|
||||
" matches = matcher(doc)\n",
|
||||
"\n",
|
||||
" line_counter = 0\n",
|
||||
" not_injected = 0\n",
|
||||
" for match_id, start, end, ratio in matches:\n",
|
||||
" if ratio > 90:\n",
|
||||
" line_counter += 1\n",
|
||||
" not_injected += 1\n",
|
||||
" en.append(''.join(doc[:end].text + ' ' + train_glossary.loc[lambda df: df['source_lem'] == match_id]['result'].astype(str).values.flatten() + ' ' + doc[end:].text))\n",
|
||||
"\n",
|
||||
"\n",
|
||||
" if line_counter == 0:\n",
|
||||
" line_counter = 1\n",
|
||||
" if not_injected == 0:\n",
|
||||
" not_injected = 1\n",
|
||||
" en.append(line)\n",
|
||||
" translation_line_counts.append(line_counter)\n",
|
||||
" translation_line_counts.append(not_injected)\n",
|
||||
"\n"
|
||||
],
|
||||
"metadata": {
|
||||
"collapsed": false,
|
||||
"pycharm": {
|
||||
"name": "#%%\n"
|
||||
"name": "#%%\n",
|
||||
"is_executing": true
|
||||
}
|
||||
}
|
||||
},
|
||||
@ -283,7 +269,6 @@
|
||||
"for word, word_id in zip(train_glossary['result_lem'], train_glossary['source_lem']):\n",
|
||||
" matcher_pl.add(word, [nlp_pl(word)])\n",
|
||||
"\n",
|
||||
"# todo\n",
|
||||
"en = []\n",
|
||||
"translation_line_counts = []\n",
|
||||
"for line_id in range(len(file_lemmatized)):\n",
|
||||
@ -291,7 +276,7 @@
|
||||
" doc = nlp(file_lemmatized[line_id])\n",
|
||||
" matches = matcher(doc)\n",
|
||||
"\n",
|
||||
" line_counter = 0\n",
|
||||
" not_injected = 0\n",
|
||||
" for match_id, start, end, ratio in matches:\n",
|
||||
" if ratio > 90:\n",
|
||||
" doc_pl = nlp_pl(file_pl_lemmatized[line_id])\n",
|
||||
@ -299,13 +284,13 @@
|
||||
"\n",
|
||||
" for match_id_pl, start_pl, end_pl, ratio_pl in matches_pl:\n",
|
||||
" if match_id_pl == glossary[glossary['source_lem'] == match_id].values[0][3]:\n",
|
||||
" line_counter += 1\n",
|
||||
" not_injected += 1\n",
|
||||
" en.append(''.join(doc[:end].text + ' ' + train_glossary.loc[lambda df: df['source_lem'] == match_id]['result'].astype(str).values.flatten() + ' ' + doc[end:].text))\n",
|
||||
"\n",
|
||||
" if line_counter == 0:\n",
|
||||
" line_counter = 1\n",
|
||||
" if not_injected == 0:\n",
|
||||
" not_injected = 1\n",
|
||||
" en.append(file_lemmatized[line_id])\n",
|
||||
" translation_line_counts.append(line_counter)\n"
|
||||
" translation_line_counts.append(not_injected)\n"
|
||||
],
|
||||
"metadata": {
|
||||
"collapsed": false,
|
||||
@ -362,6 +347,136 @@
|
||||
"name": "#%%\n"
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"source": [
|
||||
"# Inject glossary Polish crosscheck fast?"
|
||||
],
|
||||
"metadata": {
|
||||
"collapsed": false,
|
||||
"pycharm": {
|
||||
"name": "#%% md\n"
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 49,
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"took 152.213599056 injected 63 words. rate 6.569715230451229 sen/s\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"import time\n",
|
||||
"import spacy\n",
|
||||
"from spaczz.matcher import FuzzyMatcher\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"# glossary\n",
|
||||
"glossary = pd.read_csv('kompendium_lem.tsv', sep='\\t', header=0, index_col=0)\n",
|
||||
"train_glossary = glossary.iloc[[x for x in range(len(glossary)) if x % 6 != 0]]\n",
|
||||
"\n",
|
||||
"# add rules to English matcher\n",
|
||||
"nlp = spacy.blank(\"en\")\n",
|
||||
"matcher = FuzzyMatcher(nlp.vocab)\n",
|
||||
"for word in train_glossary['source_lem']:\n",
|
||||
" matcher.add(word, [nlp(word)])\n",
|
||||
"\n",
|
||||
"# add rules to Polish matcher\n",
|
||||
"nlp_pl = spacy.blank(\"pl\")\n",
|
||||
"matcher_pl = FuzzyMatcher(nlp_pl.vocab)\n",
|
||||
"for word, word_id in zip(train_glossary['result_lem'], train_glossary['source_lem']):\n",
|
||||
" matcher_pl.add(word, [nlp_pl(word)])\n",
|
||||
"\n",
|
||||
"start_time = time.time_ns()\n",
|
||||
"en = []\n",
|
||||
"injection_counter = 0\n",
|
||||
"for line_id in range(len(file_lemmatized)):\n",
|
||||
"\n",
|
||||
" doc = nlp(file_lemmatized[line_id])\n",
|
||||
" matches = matcher(nlp(file_lemmatized[line_id]))\n",
|
||||
"\n",
|
||||
" not_injected = True\n",
|
||||
" if len(matches) > 0:\n",
|
||||
" match_id, _, end, ratio = sorted(matches, key=lambda x: len(x[0]), reverse=True)[0]\n",
|
||||
" if ratio > 90:\n",
|
||||
" matches_pl = matcher_pl(nlp_pl(file_pl_lemmatized[line_id]))\n",
|
||||
"\n",
|
||||
" for match_id_pl, _, _, _ in matches_pl:\n",
|
||||
" if match_id_pl == glossary[glossary['source_lem'] == match_id].values[0][3]:\n",
|
||||
" not_injected = False\n",
|
||||
" injection_counter += 1\n",
|
||||
" en.append(''.join(doc[:end].text + ' ' + train_glossary.loc[lambda df: df['source_lem'] == match_id]['result'].astype(str).values.flatten() + ' ' + doc[end:].text))\n",
|
||||
" break\n",
|
||||
"\n",
|
||||
" if not_injected:\n",
|
||||
" en.append(file_lemmatized[line_id])\n",
|
||||
"\n",
|
||||
"stop = time.time_ns()\n",
|
||||
"timex = (stop - start_time) / 1000000000\n",
|
||||
"print(f'took {timex} injected {injection_counter} words. rate {len(file_lemmatized)/timex} sen/s')"
|
||||
],
|
||||
"metadata": {
|
||||
"collapsed": false,
|
||||
"pycharm": {
|
||||
"name": "#%%\n"
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 9,
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import copy\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"tlcs = copy.deepcopy(translation_line_counts)\n",
|
||||
"\n",
|
||||
"translations = pd.read_csv(dev_path + '.pl', sep='\\t', header=None, names=['text'])\n",
|
||||
"translations['id'] = [x for x in range(len(translations))]\n",
|
||||
"\n",
|
||||
"ctr = 0\n",
|
||||
"sentence = ''\n",
|
||||
"with open(dev_path + '.injected.crossvalidated.en', 'w') as file_en:\n",
|
||||
" with open(dev_path + '.injected.crossvalidated.pl', 'w') as file_pl:\n",
|
||||
" for i in range(len(en)):\n",
|
||||
" if i > 0:\n",
|
||||
" if en[i-1] != en[i]:\n",
|
||||
" if ctr == 0:\n",
|
||||
" sentence = translations.iloc[0]\n",
|
||||
" translations.drop(sentence['id'], inplace=True)\n",
|
||||
" sentence = sentence['text']\n",
|
||||
" try:\n",
|
||||
" ctr = tlcs.pop(0)\n",
|
||||
" except:\n",
|
||||
" pass\n",
|
||||
" file_en.write(en[i])\n",
|
||||
" file_pl.write(sentence + '\\n')\n",
|
||||
" ctr = ctr - 1\n",
|
||||
" else:\n",
|
||||
" try:\n",
|
||||
" ctr = tlcs.pop(0) - 1\n",
|
||||
" except:\n",
|
||||
" pass\n",
|
||||
" sentence = translations.iloc[0]\n",
|
||||
" translations.drop(sentence['id'], inplace=True)\n",
|
||||
" sentence = sentence['text']\n",
|
||||
" file_en.write(en[i])\n",
|
||||
" file_pl.write(sentence + '\\n')\n"
|
||||
],
|
||||
"metadata": {
|
||||
"collapsed": false,
|
||||
"pycharm": {
|
||||
"name": "#%%\n"
|
||||
}
|
||||
}
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
|
1194
kompendium_lem_cleaned.tsv
Normal file
1194
kompendium_lem_cleaned.tsv
Normal file
File diff suppressed because it is too large
Load Diff
11740
rapidfuzztest.ipynb
Normal file
11740
rapidfuzztest.ipynb
Normal file
File diff suppressed because it is too large
Load Diff
@ -1,18 +1,30 @@
|
||||
first iteration:
|
||||
./marian/build/marian --model mt.npz --type transformer --overwrite \
|
||||
./marian/build/marian --model mt.npz \
|
||||
--type transformer --overwrite \
|
||||
--train-sets mt-summit-corpora/mt-summit-corpora/dev/dev.en \
|
||||
mt-summit-corpora/mt-summit-corpora/dev/dev.pl \
|
||||
--disp-freq 1000 --save-freq 1000 --optimizer adam --lr-report
|
||||
--disp-freq 1000 \
|
||||
--save-freq 1000 \
|
||||
--optimizer adam \
|
||||
--lr-report
|
||||
|
||||
next iterations:
|
||||
./marian/build/marian --model mt.npz --type transformer --overwrite \
|
||||
./marian/build/marian --model mt.npz \
|
||||
--type transformer --overwrite \
|
||||
--train-sets mt-summit-corpora/mt-summit-corpora/dev/dev.en \
|
||||
mt-summit-corpora/mt-summit-corpora/dev/dev.pl \
|
||||
--disp-freq 1000 --save-freq 1000 --optimizer adam --lr-report \
|
||||
--disp-freq 1000 \
|
||||
--save-freq 1000 \
|
||||
--optimizer adam \
|
||||
--lr-report \
|
||||
--pretrained-model mt.npz
|
||||
|
||||
./marian/build/marian --model mt.npz --type transformer --overwrite \
|
||||
./marian/build/marian --model mt.npz \
|
||||
--type transformer --overwrite \
|
||||
--train-sets mt-summit-corpora/mt-summit-corpora/train/train.en \
|
||||
mt-summit-corpora/mt-summit-corpora/train/train.pl \
|
||||
--disp-freq 1000 --save-freq 10000 --optimizer adam --lr-report \
|
||||
--disp-freq 1000 \
|
||||
--save-freq 10000 \
|
||||
--optimizer adam \
|
||||
--lr-report \
|
||||
--pretrained-model mt.npz
|
||||
|
12
venv-setup.sh
Normal file
12
venv-setup.sh
Normal file
@ -0,0 +1,12 @@
|
||||
#!/bin/bash
# Set up a Python 3.8 virtualenv named "gpu" with GPU spaCy and the
# fuzzy-matching dependencies used by inject.py / inject_rapid.py.
# BUG FIX: shebang was '#!/bin.bash' (not a valid interpreter path).
set -e

apt install python3-pip
apt install python3-virtualenv

virtualenv -p python3.8 gpu
source gpu/bin/activate

pip install pandas ipython
# Quoted so shells with glob expansion don't mangle the extras specifier.
pip install 'spacy[cuda114]'
python -m spacy download en_core_web_sm
python -m spacy download pl_core_news_sm
pip install spaczz
pip install rapidfuzz
|
Loading…
Reference in New Issue
Block a user