diff --git a/do_inject.sh b/do_inject.sh
new file mode 100644
index 0000000..f74cbdf
--- /dev/null
+++ b/do_inject.sh
@@ -0,0 +1,7 @@
+#!/bin/bash
+
+source gpu/bin/activate
+
+python scripts/lemmatize_glossary.py
+python scripts/lemmatize_in.py
+python scripts/inject.py
diff --git a/random-scripts/inject_rapid.py b/random-scripts/inject_rapid.py
deleted file mode 100644
index d7a24b1..0000000
--- a/random-scripts/inject_rapid.py
+++ /dev/null
@@ -1,130 +0,0 @@
-import spacy
-import copy
-import pandas as pd
-import rapidfuzz
-from rapidfuzz.fuzz import partial_ratio
-import time
-from rapidfuzz.utils import default_process
-import sys
-
-spacy.require_gpu()
-
-spacy_nlp_en = spacy.load('en_core_web_sm')
-spacy_nlp_pl = spacy.load("pl_core_news_sm")
-
-
-def read_arguments():
-    try:
-        corpus_path, glossary_path = sys.argv
-        return corpus_path, glossary_path
-    except:
-        print("ERROR: Wrong argument amount.")
-        sys.exit(1)
-
-
-
-glossary = pd.read_csv('mt-summit-corpora/glossary.tsv', sep='\t', header=None, names=['source', 'result'])
-
-source_lemmatized = []
-for word in glossary['source']:
-    temp = []
-    for token in spacy_nlp_en(word):
-        temp.append(token.lemma_)
-    source_lemmatized.append(' '.join(temp).replace(' - ', '-').replace(' ’', '’').replace(' / ', '/').replace(' ( ', '(').replace(' ) ', ')'))
-
-result_lemmatized = []
-for word in glossary['result']:
-    temp = []
-    for token in spacy_nlp_pl(word):
-        temp.append(token.lemma_)
-    result_lemmatized.append(' '.join(temp).replace(' - ', '-').replace(' ’', '’').replace(' / ', '/').replace(' ( ', '(').replace(' ) ', ')'))
-
-glossary['source_lem'] = source_lemmatized
-glossary['result_lem'] = result_lemmatized
-glossary = glossary[['source', 'source_lem', 'result', 'result_lem']]
-glossary.to_csv('kompendium_lem.tsv', sep='\t')
-
-corpus_path = 'mt-summit-corpora/train/'
-
-skip_chars = ''',./!?'''
-
-with open(corpus_path + 'in.tsv', 'r') as file:
-    file_lemmatized = []
-    for line in file:
-        if len(file_lemmatized) % 10000 == 0:
-            print(len(file_lemmatized), end='\r')
-        temp = []
-        for token in spacy_nlp_en(line):
-            temp.append(token.lemma_)
-        file_lemmatized.append(' '.join([x for x in temp if x not in skip_chars]).replace(' - ', '-').replace(' ’', '’').replace(' / ', '/').replace(' ( ', '(').replace(' ) ', ')'))
-
-with open(corpus_path + 'expected.tsv', 'r') as file:
-    file_pl_lemmatized = []
-    for line in file:
-        if len(file_pl_lemmatized) % 10000 == 0:
-            print(len(file_lemmatized), end='\r')
-        temp = []
-        for token in spacy_nlp_pl(line):
-            temp.append(token.lemma_)
-        file_pl_lemmatized.append(' '.join([x for x in temp if x not in skip_chars]).replace(' - ', '-').replace(' ’', '’').replace(' / ', '/').replace(' ( ', '(').replace(' ) ', ')'))
-
-THRESHOLD = 88
-
-def is_injectable(sentence_pl, sequence):
-    sen = sentence_pl.split()
-    window_size = len(sequence.split())
-    maxx = 0
-    for i in range(len(sen) - window_size):
-        current = rapidfuzz.fuzz.partial_ratio(' '.join(sen[i:i + window_size]), sequence)
-        if current > maxx:
-            maxx = current
-    return maxx
-
-def inject(sentence, sequence):
-    sen = sentence.split()
-    window_size = len(sequence.split())
-    maxx = 0
-    maxxi = 0
-    for i in range(len(sen) - window_size):
-        current = rapidfuzz.fuzz.partial_ratio(' '.join(sen[i:i + window_size]), sequence)
-        if current > maxx:
-            maxx = current
-            maxxi = i
-    return ' '.join(sen[:maxxi + window_size]) + ' ' \
-           + glossary.loc[lambda df: df['source_lem'] == sequence]['result'].astype(str).values.flatten() \
-           + ' ' + ' '.join(sen[maxxi + window_size:])
-
-glossary = pd.read_csv('../kompendium_lem_cleaned.tsv', sep='\t', header=0, index_col=0)
-glossary['source_lem'] = [default_process(x) for x in glossary['source_lem']]
-
-start_time = time.time_ns()
-en = []
-translation_line_counts = []
-for line, line_pl in zip(file_lemmatized, file_pl_lemmatized):
-    if len(translation_line_counts) % 50000 == 0:
-        print(str(len(translation_line_counts)) + '/' + str(len(file_lemmatized), end='\r'))
-    line = default_process(line)
-    line_pl = default_process(line_pl)
-    matchez = rapidfuzz.process.extract(query=line, choices=glossary['source_lem'], limit=5, score_cutoff=THRESHOLD, scorer=partial_ratio)
-    translation_line_counts.append(len(matchez))
-    for match in matchez:
-        # if is_injectable(line_pl, match[0]):
-        en.append(inject(line, match[0])[0])
-
-
-stop = time.time_ns()
-timex = (stop - start_time) / 1000000000
-print(timex)
-
-tlcs = copy.deepcopy(translation_line_counts)
-
-translations = pd.read_csv(corpus_path + 'expected.tsv', sep='\t', header=None, names=['text'])
-with open(corpus_path + 'extected.tsv.injected.crossvalidated.pl', 'w') as file_pl:
-    for line, translation_line_ct in zip(translations, tlcs):
-        for i in range(translation_line_ct):
-            file_pl.write(line)
-
-
-with open(corpus_path + 'in.tsv.injected.crossvalidated.en', 'w') as file_en:
-    for e in en:
-        file_en.write(e + '\n')
diff --git a/random-scripts/old/inject.py b/random-scripts/old/inject.py
deleted file mode 100644
index 476898c..0000000
--- a/random-scripts/old/inject.py
+++ /dev/null
@@ -1,136 +0,0 @@
-import copy
-import pandas as pd
-import spacy
-from spaczz.matcher import FuzzyMatcher
-
-# spacy.require_gpu()
-
-spacy_nlp_en = spacy.load('en_core_web_sm')
-spacy_nlp_pl = spacy.load('pl_core_news_sm')
-
-print('lemmatizing glossary')
-
-glossary = pd.read_csv('glossary.tsv', sep='\t', header=None, names=['source', 'result'])
-
-source_lemmatized = []
-for word in glossary['source']:
-    temp = []
-    for token in spacy_nlp_en(word):
-        temp.append(token.lemma_)
-    source_lemmatized.append(' '.join(temp).replace(' - ', '-').replace(' ’', '’').replace(' / ', '/').replace(' ( ', '(').replace(' ) ', ')'))
-
-result_lemmatized = []
-for word in glossary['result']:
-    temp = []
-    for token in spacy_nlp_pl(word):
-        temp.append(token.lemma_)
-    result_lemmatized.append(' '.join(temp).replace(' - ', '-').replace(' ’', '’').replace(' / ', '/').replace(' ( ', '(').replace(' ) ', ')'))
-
-glossary['source_lem'] = source_lemmatized
-glossary['result_lem'] = result_lemmatized
-glossary = glossary[['source', 'source_lem', 'result', 'result_lem']]
-glossary.set_index('source_lem')
-
-glossary.to_csv('glossary_lem.tsv', sep='\t')
-
-dev_path = 'dev-0/'
-
-print('lemmatizing corpus ' + dev_path)
-
-skip_chars = ''',./!?'''
-
-with open(dev_path + 'in.tsv', 'r') as file:
-    file_lemmatized = []
-    for line in file:
-        temp = []
-        for token in spacy_nlp_en(line):
-            temp.append(token.lemma_)
-        file_lemmatized.append(' '.join([x for x in temp if x not in skip_chars])
-                               .replace(' - ', '-').replace(' ’', '’').replace(' / ', '/').replace(' ( ', '(').replace(' ) ', ')'))
-
-with open(dev_path + 'expected.tsv', 'r') as file:
-    file_pl_lemmatized = []
-    for line in file:
-        temp = []
-        for token in spacy_nlp_pl(line):
-            temp.append(token.lemma_)
-        file_pl_lemmatized.append(' '.join([x for x in temp if x not in skip_chars])
-                                  .replace(' - ', '-').replace(' ’', '’').replace(' / ', '/').replace(' ( ', '(').replace(' ) ', ')'))
-
-# glossary
-glossary = pd.read_csv('glossary_lem.tsv', sep='\t', header=0, index_col=0)
-train_glossary = glossary.iloc[[x for x in range(len(glossary)) if x % 6 != 0]]
-
-# add rules to English matcher
-nlp = spacy.blank("en")
-matcher = FuzzyMatcher(nlp.vocab)
-for word in train_glossary['source_lem']:
-    matcher.add(word, [nlp(word)])
-
-# add rules to Polish matcher
-nlp_pl = spacy.blank("pl")
-matcher_pl = FuzzyMatcher(nlp_pl.vocab)
-for word, word_id in zip(train_glossary['result_lem'], train_glossary['source_lem']):
-    matcher_pl.add(word, [nlp_pl(word)])
-
-en = []
-translation_line_counts = []
-for line_id in range(len(file_lemmatized)):
-
-    if line_id % 100 == 0:
-        print('injecting glossary: ' + str(line_id) + "/" + str(len(file_lemmatized)), end='\r')
-
-    doc = nlp(file_lemmatized[line_id])
-    matches = matcher(doc)
-
-    line_counter = 0
-    for match_id, start, end, ratio in matches:
-        if ratio > 90:
-            doc_pl = nlp_pl(file_pl_lemmatized[line_id])
-            matches_pl = matcher_pl(doc_pl)
-
-            for match_id_pl, start_pl, end_pl, ratio_pl in matches_pl:
-                if match_id_pl == glossary[glossary['source_lem'] == match_id].values[0][3]:
-                    line_counter += 1
-                    en.append(''.join(doc[:end].text + ' ' + train_glossary.loc[lambda df: df['source_lem'] == match_id]['result'].astype(str).values.flatten() + ' ' + doc[end:].text))
-
-    if line_counter == 0:
-        line_counter = 1
-        en.append(file_lemmatized[line_id])
-    translation_line_counts.append(line_counter)
-
-print('saving files')
-tlcs = copy.deepcopy(translation_line_counts)
-
-translations = pd.read_csv(dev_path + 'expected.tsv', sep='\t', header=None, names=['text'])
-translations['id'] = [x for x in range(len(translations))]
-
-ctr = 0
-sentence = ''
-with open(dev_path + 'in.tsv.injected.crossvalidated', 'w') as file_en:
-    with open(dev_path + 'expected.tsv.injected.crossvalidated', 'w') as file_pl:
-        for i in range(len(en)):
-            if i > 0:
-                if en[i-1] != en[i]:
-                    if ctr == 0:
-                        sentence = translations.iloc[0]
-                        translations.drop(sentence['id'], inplace=True)
-                        sentence = sentence['text']
-                    try:
-                        ctr = tlcs.pop(0)
-                    except:
-                        pass
-                file_en.write(en[i])
-                file_pl.write(sentence + '\n')
-                ctr = ctr - 1
-            else:
-                try:
-                    ctr = tlcs.pop(0) - 1
-                except:
-                    pass
-                sentence = translations.iloc[0]
-                translations.drop(sentence['id'], inplace=True)
-                sentence = sentence['text']
-                file_en.write(en[i])
-                file_pl.write(sentence + '\n')
-
diff --git a/random-scripts/test.py b/random-scripts/test.py
deleted file mode 100644
index 92994ed..0000000
--- a/random-scripts/test.py
+++ /dev/null
@@ -1,26 +0,0 @@
-import time
-import nltk
-from nltk.stem import WordNetLemmatizer
-
-# nltk.download('omw-1.4')
-# nltk.download('punkt')
-nltk.download('wordnet')
-
-wl = WordNetLemmatizer()
-
-start_time = time.time_ns()
-filex = []
-with open('mt-summit-corpora/train/in.tsv', 'r') as file:
-    for line in file:
-        if len(filex) % 50000 == 0:
-            print(len(filex), end='\r')
-        line = nltk.word_tokenize(line)
-        filex.append(' '.join([wl.lemmatize(x) for x in line]))
-
-
-stop = time.time_ns()
-timex = (stop - start_time) / 1000000000
-print(timex)
-f = open('temp', 'w')
-for line in filex:
-    f.write(line + '\n')
diff --git a/random-scripts/venv-setup.sh b/random-scripts/venv-setup.sh
index 3499ddb..de0d0bf 100644
--- a/random-scripts/venv-setup.sh
+++ b/random-scripts/venv-setup.sh
@@ -6,5 +6,4 @@ virtualenv -p python3.8 gpu
 source gpu/bin/activate
 pip install pandas ipython
 pip install nltk
-python "nltk.download('omw-1.4')"
 pip install rapidfuzz
diff --git a/rapidfuzztest.ipynb b/rapidfuzztest.ipynb
index 4e98ef0..99b117f 100644
--- a/rapidfuzztest.ipynb
+++ b/rapidfuzztest.ipynb
@@ -2,8 +2,18 @@
  "cells": [
   {
    "cell_type": "code",
-   "execution_count": null,
-   "outputs": [],
+   "execution_count": 2,
+   "outputs": [
+    {
+     "data": {
+      "text/plain": " source \\\nsource_lem \naaofi aaofi \naca aca \nacca acca \nabacus abacus \nabandonment cost abandonment costs \n... ... \nytd ytd \nyear-end year-end \nyear-to-date year-to-date \nzog zog \nzero overhead growth zero overhead growth \n\n result \nsource_lem \naaofi organizacja rachunkowości i audytu dla islamsk... \naca członek stowarzyszenia dyplomowanych biegłych ... \nacca stowarzyszenie dyplomowanych biegłych rewidentów \nabacus liczydło \nabandonment cost koszty zaniechania \n... ... \nytd od początku roku \nyear-end koniec roku \nyear-to-date od początku roku \nzog zero wzrostu kosztów ogólnych \nzero overhead growth zero wzrostu kosztów ogólnych \n\n[1197 rows x 2 columns]",
+      "text/html": "[rendered HTML table omitted: it shows the same 1197-row glossary DataFrame (source, result, indexed by source_lem) as the text/plain output above]"