diff --git a/do_inject.sh b/inject.sh
similarity index 100%
rename from do_inject.sh
rename to inject.sh
diff --git a/scripts/do_inject.sh b/scripts/do_inject.sh
new file mode 100755
index 0000000..4e18549
--- /dev/null
+++ b/scripts/do_inject.sh
@@ -0,0 +1,19 @@
+#!/bin/bash
+
+# arguments:
+# 1. path to the glossary file, e.g. for a glossary at ~/data/glossary.tsv pass data/glossary.tsv
+# 2. path to the in.tsv file
+# 3. path to the expected.tsv file
+# all paths must be given relative to your home directory, without the leading ~/ (as in the example above)
+
+glossary_path="$1"
+in_path="$2"
+expected_path="$3"
+
+source ~/gpu/bin/activate
+
+cd ~/transfix-mt/scripts
+
+python lemmatize_glossary.py "$glossary_path"
+python lemmatize_in.py "$in_path" "$expected_path"
+python inject.py "$glossary_path" "$in_path" "$expected_path"
diff --git a/scripts/do_inject_helper.sh b/scripts/do_inject_helper.sh
deleted file mode 100755
index f2d62d3..0000000
--- a/scripts/do_inject_helper.sh
+++ /dev/null
@@ -1,9 +0,0 @@
-#!/bin/bash
-
-source ~/gpu/bin/activate
-
-cd ~/transfix-mt/scripts
-
-python lemmatize_glossary.py
-python lemmatize_in.py
-python inject.py
diff --git a/scripts/inject.py b/scripts/inject.py
index dfe8c4e..a9b2bb0 100644
--- a/scripts/inject.py
+++ b/scripts/inject.py
@@ -1,11 +1,21 @@
 import os
 import pandas as pd
 import rapidfuzz
+import sys
 from rapidfuzz.fuzz import partial_ratio
 from rapidfuzz.utils import default_process
 
 
+def read_arguments():
+    try:
+        path_glossary, path_in, path_expected = sys.argv[1:]
+        return path_glossary, path_in, path_expected
+    except ValueError:
+        print("ERROR: expected arguments: <glossary path> <in.tsv path> <expected.tsv path>")
+        sys.exit(1)
+
+
 def full_strip(line):
     return ' '.join(line.split())
 
 
@@ -41,10 +51,11 @@ def get_injected(sentence, sentence_en, sequence, inject):
 
 THRESHOLD = 70
 
-train_in_path = os.path.join(os.path.expanduser('~'), 'mt-summit-corpora/train/in.tsv')
-train_expected_path = os.path.join(os.path.expanduser('~'), 'mt-summit-corpora/train/expected.tsv')
+glossary_arg_path, in_arg_path, expected_arg_path = read_arguments()
+train_in_path = os.path.join(os.path.expanduser('~'), in_arg_path)
+train_expected_path = os.path.join(os.path.expanduser('~'), expected_arg_path)
 
-glossary = pd.read_csv('~/mt-summit-corpora/glossary.tsv.lemmatized', sep='\t')
+glossary = pd.read_csv(os.path.join(os.path.expanduser('~'), glossary_arg_path), sep='\t')
 glossary['source_lem'] = [str(default_process(x)) for x in glossary['source_lem']]
 glossary['hash'] = [hash(x) for x in glossary['source']]
 glossary = glossary[glossary['hash'] % 100 > 16]
@@ -83,17 +94,17 @@ for line, line_en, line_pl in zip(file_en_lemmatized, file_en, file_pl):
             translation_line_counts.append(1)
         en.append(line_en)
 
-    if len(translation_line_counts) % 50000 == 0:
-        print('injecting into file: ' + train_in_path + '.injected: ' + str(len(translation_line_counts)), end='\r')
+    if len(translation_line_counts) % 1000 == 0:
+        print('injecting into file: ' + train_in_path + ': ' + str(len(translation_line_counts)), end='\r')
 
 print('\n')
 
-with open(train_expected_path + '.injected', 'w') as file_pl_write:
+with open(train_expected_path, 'w') as file_pl_write:
     for line, translation_line_ct in zip(file_pl, translation_line_counts):
         for i in range(translation_line_ct):
             file_pl_write.write(full_strip(line) + '\n')
 
-with open(train_in_path + '.injected', 'w') as file_en_write:
+with open(train_in_path, 'w') as file_en_write:
     for e in en:
         file_en_write.write(e + '\n')
 
diff --git a/scripts/lemmatize_glossary.py b/scripts/lemmatize_glossary.py
index 4096a3b..6bea041 100644
--- a/scripts/lemmatize_glossary.py
+++ b/scripts/lemmatize_glossary.py
@@ -1,14 +1,25 @@
 import nltk
 import os
 import pandas as pd
+import sys
 from nltk.stem import WordNetLemmatizer
 
 nltk.download('wordnet')
 
+
+def read_arguments():
+    try:
+        glossary_arg_path = sys.argv[1]
+        return glossary_arg_path
+    except IndexError:
+        print("ERROR: expected argument: <glossary path>")
+        sys.exit(1)
+
+
 wl = WordNetLemmatizer()
 
-glossary_path = os.path.join(os.path.expanduser('~'), 'mt-summit-corpora/glossary.tsv')
+glossary_path = os.path.join(os.path.expanduser('~'), read_arguments())
 glossary = pd.read_csv(glossary_path, sep='\t', header=None, names=['source', 'result'])
 
 source_lemmatized = []
 
diff --git a/scripts/lemmatize_in.py b/scripts/lemmatize_in.py
index 7f9064e..2729870 100644
--- a/scripts/lemmatize_in.py
+++ b/scripts/lemmatize_in.py
@@ -1,13 +1,24 @@
 import nltk
 import os
+import sys
 from nltk.stem import WordNetLemmatizer
 
 
+def read_arguments():
+    try:
+        path_in, path_expected = sys.argv[1:]
+        return path_in, path_expected
+    except ValueError:
+        print("ERROR: expected arguments: <in.tsv path> <expected.tsv path>")
+        sys.exit(1)
+
+
 wl = WordNetLemmatizer()
 
-train_in_path = os.path.join(os.path.expanduser('~'), 'mt-summit-corpora/train/in.tsv')
-train_expected_path = os.path.join(os.path.expanduser('~'), 'mt-summit-corpora/train/expected.tsv')
+in_arg_path, expected_arg_path = read_arguments()
+train_in_path = os.path.join(os.path.expanduser('~'), in_arg_path)
+train_expected_path = os.path.join(os.path.expanduser('~'), expected_arg_path)
 
 file_lemmatized = []
 with open(train_in_path, 'r') as file:
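
Example invocation of the new scripts/do_inject.sh wrapper (a sketch; the mt-summit-corpora paths are the home-relative defaults this change removes from the Python scripts, and are assumed to still match the data layout under $HOME):

    ~/transfix-mt/scripts/do_inject.sh mt-summit-corpora/glossary.tsv mt-summit-corpora/train/in.tsv mt-summit-corpora/train/expected.tsv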