add readme

This commit is contained in:
jakubknczny 2022-01-23 16:58:40 +01:00
parent 40ccea191b
commit 13bc44a975
7 changed files with 17 additions and 11 deletions

View File

@ -0,0 +1,10 @@
# Transfix-mt
Part of the Transfix project responsible for injecting the dictionary into
source data for constrained translation.
The scripts are compatible with a gonito challenge that will be linked here in the future.
It should be used in the following way in a Transfix-like environment/file structure:
* git clone https://git.wmi.amu.edu.pl/s470607/transfix-mt.git
* ./transfix-mt/env/venv-setup.sh
* ./transfix-mt/do_inject.sh

View File

@ -41,11 +41,8 @@ def get_injected(sentence, sentence_en, sequence, inject):
THRESHOLD = 70
# train_in_path = '~/mt-summit-corpora/train/in.tsv'
# train_expected_path = '~/mt-summit-corpora/train/expected.tsv'
train_in_path = os.path.join(os.path.expanduser('~'), 'mt-summit-corpora/dev-0/in.tsv')
train_expected_path = os.path.join(os.path.expanduser('~'), 'mt-summit-corpora/dev-0/expected.tsv')
train_in_path = os.path.join(os.path.expanduser('~'), 'mt-summit-corpora/train/in.tsv')
train_expected_path = os.path.join(os.path.expanduser('~'), 'mt-summit-corpora/train/expected.tsv')
glossary = pd.read_csv('~/mt-summit-corpora/glossary.tsv.lemmatized', sep='\t')
glossary['source_lem'] = [str(default_process(x)) for x in glossary['source_lem']]

View File

@ -1,4 +1,5 @@
import nltk
import os
import pandas as pd
from nltk.stem import WordNetLemmatizer
@ -7,7 +8,7 @@ nltk.download('wordnet')
wl = WordNetLemmatizer()
glossary_path = '~/mt-summit-corpora/glossary.tsv'
glossary_path = os.path.join(os.path.expanduser('~'), 'mt-summit-corpora/glossary.tsv')
glossary = pd.read_csv(glossary_path, sep='\t', header=None, names=['source', 'result'])
source_lemmatized = []

View File

@ -6,11 +6,8 @@ from nltk.stem import WordNetLemmatizer
wl = WordNetLemmatizer()
# train_in_path = '~/mt-summit-corpora/train/in.tsv'
# train_expected_path = '~/mt-summit-corpora/train/expected.tsv'
train_in_path = os.path.join(os.path.expanduser('~'), 'mt-summit-corpora/dev-0/in.tsv')
train_expected_path = os.path.join(os.path.expanduser('~'), 'mt-summit-corpora/dev-0/expected.tsv')
train_in_path = os.path.join(os.path.expanduser('~'), 'mt-summit-corpora/train/in.tsv')
train_expected_path = os.path.join(os.path.expanduser('~'), 'mt-summit-corpora/train/expected.tsv')
file_lemmatized = []
with open(train_in_path, 'r') as file:
@ -19,6 +16,7 @@ with open(train_in_path, 'r') as file:
print('lemmatizing file: ' + train_in_path + ': ' + str(len(file_lemmatized)), end='\r')
line = nltk.word_tokenize(line)
file_lemmatized.append(' '.join([wl.lemmatize(x) for x in line]))
print('\n')
with open(train_in_path + '.lemmatized', 'w') as file_write:
for line in file_lemmatized: