diff --git a/do_inject.sh b/do_inject.sh old mode 100644 new mode 100755 index f74cbdf..f4a0f4e --- a/do_inject.sh +++ b/do_inject.sh @@ -1,7 +1,3 @@ #!/bin/bash -source gpu/bin/activate - -python scripts/lemmatize_glossary.py -python scripts/lemmatize_in.py -python scripts/inject.py +. ./scripts/do_inject_helper.sh diff --git a/scripts/do_inject_helper.sh b/scripts/do_inject_helper.sh new file mode 100755 index 0000000..f2d62d3 --- /dev/null +++ b/scripts/do_inject_helper.sh @@ -0,0 +1,9 @@ +#!/bin/bash + +source ~/gpu/bin/activate + +cd ~/transfix-mt/scripts + +python lemmatize_glossary.py +python lemmatize_in.py +python inject.py diff --git a/scripts/inject.py b/scripts/inject.py index f118357..368d261 100644 --- a/scripts/inject.py +++ b/scripts/inject.py @@ -1,3 +1,4 @@ +import os import pandas as pd import rapidfuzz @@ -40,13 +41,13 @@ def get_injected(sentence, sentence_en, sequence, inject): THRESHOLD = 70 -# train_in_path = 'mt-summit-corpora/train/in.tsv' -# train_expected_path = 'mt-summit-corpora/train/expected.tsv' +# train_in_path = '~/mt-summit-corpora/train/in.tsv' +# train_expected_path = '~/mt-summit-corpora/train/expected.tsv' -train_in_path = 'mt-summit-corpora/dev-0/in.tsv' -train_expected_path = 'mt-summit-corpora/dev-0/expected.tsv' +train_in_path = os.path.join(os.path.expanduser('~'), 'mt-summit-corpora/dev-0/in.tsv') +train_expected_path = os.path.join(os.path.expanduser('~'), 'mt-summit-corpora/dev-0/expected.tsv') -glossary = pd.read_csv('mt-summit-corpora/glossary_lem.tsv', sep='\t') +glossary = pd.read_csv('~/mt-summit-corpora/glossary.tsv.lemmatized', sep='\t') glossary['source_lem'] = [str(default_process(x)) for x in glossary['source_lem']] file_pl = pd.read_csv(train_expected_path, sep='\t', header=None, names=['text']) diff --git a/scripts/lemmatize_glossary.py b/scripts/lemmatize_glossary.py index a62cb5f..5325c09 100644 --- a/scripts/lemmatize_glossary.py +++ b/scripts/lemmatize_glossary.py @@ -6,7 +6,10 @@ from nltk.stem import WordNetLemmatizer nltk.download('wordnet') wl = WordNetLemmatizer() -glossary = pd.read_csv('mt-summit-corpora/glossary.tsv', sep='\t', header=None, names=['source', 'result']) + +glossary_path = '~/mt-summit-corpora/glossary.tsv' + +glossary = pd.read_csv(glossary_path, sep='\t', header=None, names=['source', 'result']) source_lemmatized = [] for word in glossary['source']: word = nltk.word_tokenize(word) @@ -16,4 +19,4 @@ glossary['source_lem'] = source_lemmatized glossary = glossary[['source', 'source_lem', 'result']] glossary.set_index('source_lem') -glossary.to_csv('mt-summit-corpora/glossary_lem.tsv', sep='\t', index=False) +glossary.to_csv(glossary_path + '.lemmatized', sep='\t', index=False) diff --git a/scripts/lemmatize_in.py b/scripts/lemmatize_in.py index 190d7c6..2232f07 100644 --- a/scripts/lemmatize_in.py +++ b/scripts/lemmatize_in.py @@ -1,14 +1,16 @@ import nltk +import os + from nltk.stem import WordNetLemmatizer wl = WordNetLemmatizer() -# train_in_path = 'mt-summit-corpora/train/in.tsv' -# train_expected_path = 'mt-summit-corpora/train/expected.tsv' +# train_in_path = '~/mt-summit-corpora/train/in.tsv' +# train_expected_path = '~/mt-summit-corpora/train/expected.tsv' -train_in_path = 'mt-summit-corpora/dev-0/in.tsv' -train_expected_path = 'mt-summit-corpora/dev-0/expected.tsv' +train_in_path = os.path.join(os.path.expanduser('~'), 'mt-summit-corpora/dev-0/in.tsv') +train_expected_path = os.path.join(os.path.expanduser('~'), 'mt-summit-corpora/dev-0/expected.tsv') file_lemmatized = [] with open(train_in_path, 'r') as file: