From 13bc44a9757ecd90764faa0a12dcf234111030f2 Mon Sep 17 00:00:00 2001
From: jakubknczny
Date: Sun, 23 Jan 2022 16:58:40 +0100
Subject: [PATCH] add readme

---
 README.md                                    | 10 ++++++++++
 {random-scripts => env}/training-command.txt |  0
 {random-scripts => env}/venv-setup-helper.sh |  0
 {random-scripts => env}/venv-setup.sh        |  0
 scripts/inject.py                            |  7 ++-----
 scripts/lemmatize_glossary.py                |  3 ++-
 scripts/lemmatize_in.py                      |  8 +++-----
 7 files changed, 17 insertions(+), 11 deletions(-)
 rename {random-scripts => env}/training-command.txt (100%)
 rename {random-scripts => env}/venv-setup-helper.sh (100%)
 rename {random-scripts => env}/venv-setup.sh (100%)

diff --git a/README.md b/README.md
index e69de29..22e4c77 100644
--- a/README.md
+++ b/README.md
@@ -0,0 +1,10 @@
+# Transfix-mt
+
+The part of the Transfix project responsible for injecting the dictionary into
+source data for constrained translation.
+The scripts are compatible with a Gonito challenge that will be linked here in the future.
+
+It should be used in the following way in a Transfix-like environment/file structure:
+* git clone https://git.wmi.amu.edu.pl/s470607/transfix-mt.git
+* ./transfix-mt/env/venv-setup.sh
+* ./transfix-mt/do_inject.sh
diff --git a/random-scripts/training-command.txt b/env/training-command.txt
similarity index 100%
rename from random-scripts/training-command.txt
rename to env/training-command.txt
diff --git a/random-scripts/venv-setup-helper.sh b/env/venv-setup-helper.sh
similarity index 100%
rename from random-scripts/venv-setup-helper.sh
rename to env/venv-setup-helper.sh
diff --git a/random-scripts/venv-setup.sh b/env/venv-setup.sh
similarity index 100%
rename from random-scripts/venv-setup.sh
rename to env/venv-setup.sh
diff --git a/scripts/inject.py b/scripts/inject.py
index 368d261..66790a7 100644
--- a/scripts/inject.py
+++ b/scripts/inject.py
@@ -41,11 +41,8 @@ def get_injected(sentence, sentence_en, sequence, inject):
 
 THRESHOLD = 70
 
-# train_in_path = '~/mt-summit-corpora/train/in.tsv'
-# train_expected_path = '~/mt-summit-corpora/train/expected.tsv'
-
-train_in_path = os.path.join(os.path.expanduser('~'), 'mt-summit-corpora/dev-0/in.tsv')
-train_expected_path = os.path.join(os.path.expanduser('~'), 'mt-summit-corpora/dev-0/expected.tsv')
+train_in_path = os.path.join(os.path.expanduser('~'), 'mt-summit-corpora/train/in.tsv')
+train_expected_path = os.path.join(os.path.expanduser('~'), 'mt-summit-corpora/train/expected.tsv')
 
 glossary = pd.read_csv('~/mt-summit-corpora/glossary.tsv.lemmatized', sep='\t')
 glossary['source_lem'] = [str(default_process(x)) for x in glossary['source_lem']]
diff --git a/scripts/lemmatize_glossary.py b/scripts/lemmatize_glossary.py
index 5325c09..4096a3b 100644
--- a/scripts/lemmatize_glossary.py
+++ b/scripts/lemmatize_glossary.py
@@ -1,4 +1,5 @@
 import nltk
+import os
 import pandas as pd
 from nltk.stem import WordNetLemmatizer
 
@@ -7,7 +8,7 @@ nltk.download('wordnet')
 
 wl = WordNetLemmatizer()
 
-glossary_path = '~/mt-summit-corpora/glossary.tsv'
+glossary_path = os.path.join(os.path.expanduser('~'), 'mt-summit-corpora/glossary.tsv')
 glossary = pd.read_csv(glossary_path, sep='\t', header=None, names=['source', 'result'])
 
 source_lemmatized = []
diff --git a/scripts/lemmatize_in.py b/scripts/lemmatize_in.py
index 2232f07..7118eaa 100644
--- a/scripts/lemmatize_in.py
+++ b/scripts/lemmatize_in.py
@@ -6,11 +6,8 @@ from nltk.stem import WordNetLemmatizer
 
 wl = WordNetLemmatizer()
 
-# train_in_path = '~/mt-summit-corpora/train/in.tsv'
-# train_expected_path = '~/mt-summit-corpora/train/expected.tsv'
-
-train_in_path = os.path.join(os.path.expanduser('~'), 'mt-summit-corpora/dev-0/in.tsv')
-train_expected_path = os.path.join(os.path.expanduser('~'), 'mt-summit-corpora/dev-0/expected.tsv')
+train_in_path = os.path.join(os.path.expanduser('~'), 'mt-summit-corpora/train/in.tsv')
+train_expected_path = os.path.join(os.path.expanduser('~'), 'mt-summit-corpora/train/expected.tsv')
 
 file_lemmatized = []
 with open(train_in_path, 'r') as file:
@@ -19,6 +16,7 @@ with open(train_in_path, 'r') as file:
         print('lemmatizing file: ' + train_in_path + ': ' + str(len(file_lemmatized)), end='\r')
         line = nltk.word_tokenize(line)
         file_lemmatized.append(' '.join([wl.lemmatize(x) for x in line]))
+print('\n')
 
 with open(train_in_path + '.lemmatized', 'w') as file_write:
     for line in file_lemmatized:
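
Note on the path changes above: Python's built-in open() does not expand '~'
(it treats it as a literal directory name), while pandas.read_csv() expands it
itself, so the bare '~/...' strings only ever worked for the pd.read_csv calls.
The os.path.join(os.path.expanduser('~'), ...) pattern introduced here makes
the same path work for both. Below is a minimal, self-contained sketch of the
scripts/lemmatize_in.py flow after this patch, not the exact file: the
nltk.download('punkt') call and the body of the final write loop are
assumptions (the diff does not show them), and the dataset path is the one
used in the patch.

import os

import nltk
from nltk.stem import WordNetLemmatizer

# Assumption: nltk.word_tokenize needs the 'punkt' tokenizer models; the
# visible hunks only show nltk.download('wordnet') in lemmatize_glossary.py.
nltk.download('punkt')
nltk.download('wordnet')

wl = WordNetLemmatizer()

# open() treats '~' literally, so expand it explicitly, mirroring the change
# from bare '~/...' strings in the patch.
train_in_path = os.path.join(os.path.expanduser('~'), 'mt-summit-corpora/train/in.tsv')

file_lemmatized = []
with open(train_in_path, 'r') as file:
    for line in file:
        # end='\r' keeps the progress counter on one terminal line; the
        # print('\n') added by the patch moves the cursor past it afterwards.
        print('lemmatizing file: ' + train_in_path + ': ' + str(len(file_lemmatized)), end='\r')
        tokens = nltk.word_tokenize(line)
        file_lemmatized.append(' '.join(wl.lemmatize(x) for x in tokens))
print('\n')

# Assumption: the write loop is truncated in the diff; writing one lemmatized
# line per input line is the natural completion.
with open(train_in_path + '.lemmatized', 'w') as file_write:
    for line in file_lemmatized:
        file_write.write(line + '\n')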