From 13bc44a9757ecd90764faa0a12dcf234111030f2 Mon Sep 17 00:00:00 2001
From: jakubknczny
Date: Sun, 23 Jan 2022 16:58:40 +0100
Subject: [PATCH] add readme

---
 README.md                                    | 10 ++++++++++
 {random-scripts => env}/training-command.txt |  0
 {random-scripts => env}/venv-setup-helper.sh |  0
 {random-scripts => env}/venv-setup.sh        |  0
 scripts/inject.py                            |  7 ++-----
 scripts/lemmatize_glossary.py                |  3 ++-
 scripts/lemmatize_in.py                      |  8 +++-----
 7 files changed, 17 insertions(+), 11 deletions(-)
 rename {random-scripts => env}/training-command.txt (100%)
 rename {random-scripts => env}/venv-setup-helper.sh (100%)
 rename {random-scripts => env}/venv-setup.sh (100%)

diff --git a/README.md b/README.md
index e69de29..22e4c77 100644
--- a/README.md
+++ b/README.md
@@ -0,0 +1,10 @@
+# Transfix-mt
+
+The part of the Transfix project responsible for injecting the dictionary into
+source data for constrained translation.
+The scripts are compatible with a Gonito challenge that will be linked here in the future.
+
+It should be used in the following way in a Transfix-like environment/file structure:
+* git clone https://git.wmi.amu.edu.pl/s470607/transfix-mt.git
+* ./transfix-mt/env/venv-setup.sh
+* ./transfix-mt/do_inject.sh
diff --git a/random-scripts/training-command.txt b/env/training-command.txt
similarity index 100%
rename from random-scripts/training-command.txt
rename to env/training-command.txt
diff --git a/random-scripts/venv-setup-helper.sh b/env/venv-setup-helper.sh
similarity index 100%
rename from random-scripts/venv-setup-helper.sh
rename to env/venv-setup-helper.sh
diff --git a/random-scripts/venv-setup.sh b/env/venv-setup.sh
similarity index 100%
rename from random-scripts/venv-setup.sh
rename to env/venv-setup.sh
diff --git a/scripts/inject.py b/scripts/inject.py
index 368d261..66790a7 100644
--- a/scripts/inject.py
+++ b/scripts/inject.py
@@ -41,11 +41,8 @@ def get_injected(sentence, sentence_en, sequence, inject):
 
 THRESHOLD = 70
 
-# train_in_path = '~/mt-summit-corpora/train/in.tsv'
-# train_expected_path = '~/mt-summit-corpora/train/expected.tsv'
-
-train_in_path = os.path.join(os.path.expanduser('~'), 'mt-summit-corpora/dev-0/in.tsv')
-train_expected_path = os.path.join(os.path.expanduser('~'), 'mt-summit-corpora/dev-0/expected.tsv')
+train_in_path = os.path.join(os.path.expanduser('~'), 'mt-summit-corpora/train/in.tsv')
+train_expected_path = os.path.join(os.path.expanduser('~'), 'mt-summit-corpora/train/expected.tsv')
 
 glossary = pd.read_csv('~/mt-summit-corpora/glossary.tsv.lemmatized', sep='\t')
 glossary['source_lem'] = [str(default_process(x)) for x in glossary['source_lem']]
diff --git a/scripts/lemmatize_glossary.py b/scripts/lemmatize_glossary.py
index 5325c09..4096a3b 100644
--- a/scripts/lemmatize_glossary.py
+++ b/scripts/lemmatize_glossary.py
@@ -1,4 +1,5 @@
 import nltk
+import os
 import pandas as pd
 from nltk.stem import WordNetLemmatizer
 
@@ -7,7 +8,7 @@ nltk.download('wordnet')
 
 wl = WordNetLemmatizer()
 
-glossary_path = '~/mt-summit-corpora/glossary.tsv'
+glossary_path = os.path.join(os.path.expanduser('~'), 'mt-summit-corpora/glossary.tsv')
 glossary = pd.read_csv(glossary_path, sep='\t', header=None, names=['source', 'result'])
 
 source_lemmatized = []
diff --git a/scripts/lemmatize_in.py b/scripts/lemmatize_in.py
index 2232f07..7118eaa 100644
--- a/scripts/lemmatize_in.py
+++ b/scripts/lemmatize_in.py
@@ -6,11 +6,8 @@ from nltk.stem import WordNetLemmatizer
 
 wl = WordNetLemmatizer()
 
-# train_in_path = '~/mt-summit-corpora/train/in.tsv'
-# train_expected_path = '~/mt-summit-corpora/train/expected.tsv'
-
-train_in_path = os.path.join(os.path.expanduser('~'), 'mt-summit-corpora/dev-0/in.tsv')
-train_expected_path = os.path.join(os.path.expanduser('~'), 'mt-summit-corpora/dev-0/expected.tsv')
+train_in_path = os.path.join(os.path.expanduser('~'), 'mt-summit-corpora/train/in.tsv')
+train_expected_path = os.path.join(os.path.expanduser('~'), 'mt-summit-corpora/train/expected.tsv')
 
 file_lemmatized = []
 with open(train_in_path, 'r') as file:
@@ -19,6 +16,7 @@ with open(train_in_path, 'r') as file:
         print('lemmatizing file: ' + train_in_path + ': ' + str(len(file_lemmatized)), end='\r')
         line = nltk.word_tokenize(line)
         file_lemmatized.append(' '.join([wl.lemmatize(x) for x in line]))
+print('\n')
 
 with open(train_in_path + '.lemmatized', 'w') as file_write:
     for line in file_lemmatized:
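
Note on the path changes above: Python's built-in open() does not expand '~'
(it treats it as a literal directory name), while pandas.read_csv() expands it
itself, so the bare '~/...' strings only ever worked for the pd.read_csv calls.
The os.path.join(os.path.expanduser('~'), ...) pattern introduced here makes
the same path work for both. Below is a minimal, self-contained sketch of the
scripts/lemmatize_in.py flow after this patch, not the exact file: the
nltk.download('punkt') call and the body of the final write loop are
assumptions (the diff does not show them), and the dataset path is the one
used in the patch.

import os

import nltk
from nltk.stem import WordNetLemmatizer

# Assumption: nltk.word_tokenize needs the 'punkt' tokenizer models; the
# visible hunks only show nltk.download('wordnet') in lemmatize_glossary.py.
nltk.download('punkt')
nltk.download('wordnet')

wl = WordNetLemmatizer()

# open() treats '~' literally, so expand it explicitly, mirroring the change
# from bare '~/...' strings in the patch.
train_in_path = os.path.join(os.path.expanduser('~'), 'mt-summit-corpora/train/in.tsv')

file_lemmatized = []
with open(train_in_path, 'r') as file:
    for line in file:
        # end='\r' keeps the progress counter on one terminal line; the
        # print('\n') added by the patch moves the cursor past it afterwards.
        print('lemmatizing file: ' + train_in_path + ': ' + str(len(file_lemmatized)), end='\r')
        tokens = nltk.word_tokenize(line)
        file_lemmatized.append(' '.join(wl.lemmatize(x) for x in tokens))
print('\n')

# Assumption: the write loop is truncated in the diff; writing one lemmatized
# line per input line is the natural completion.
with open(train_in_path + '.lemmatized', 'w') as file_write:
    for line in file_lemmatized:
        file_write.write(line + '\n')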