add readme

This commit is contained in:
jakubknczny 2022-01-23 16:58:40 +01:00
parent 40ccea191b
commit 13bc44a975
7 changed files with 17 additions and 11 deletions

View File

@ -0,0 +1,10 @@
# Transfix-mt
Part of the Transfix project responsible for injecting the dictionary into
source data for constrained translation.
The scripts are compatible with a gonito challenge that will be linked here in the future.
It should be used in the following way in a Transfix-like environment/file structure:
* git clone https://git.wmi.amu.edu.pl/s470607/transfix-mt.git
* ./transfix-mt/env/venv-setup.sh
* ./transfix-mt/do_inject.sh

View File

@ -41,11 +41,8 @@ def get_injected(sentence, sentence_en, sequence, inject):
THRESHOLD = 70
# train_in_path = '~/mt-summit-corpora/train/in.tsv'
# train_expected_path = '~/mt-summit-corpora/train/expected.tsv'
train_in_path = os.path.join(os.path.expanduser('~'), 'mt-summit-corpora/dev-0/in.tsv')
train_expected_path = os.path.join(os.path.expanduser('~'), 'mt-summit-corpora/dev-0/expected.tsv')
train_in_path = os.path.join(os.path.expanduser('~'), 'mt-summit-corpora/train/in.tsv')
train_expected_path = os.path.join(os.path.expanduser('~'), 'mt-summit-corpora/train/expected.tsv')
glossary = pd.read_csv('~/mt-summit-corpora/glossary.tsv.lemmatized', sep='\t')
glossary['source_lem'] = [str(default_process(x)) for x in glossary['source_lem']]

View File

@ -1,4 +1,5 @@
import nltk
import os
import pandas as pd
from nltk.stem import WordNetLemmatizer
@ -7,7 +8,7 @@ nltk.download('wordnet')
wl = WordNetLemmatizer()
glossary_path = '~/mt-summit-corpora/glossary.tsv'
glossary_path = os.path.join(os.path.expanduser('~'), 'mt-summit-corpora/glossary.tsv')
glossary = pd.read_csv(glossary_path, sep='\t', header=None, names=['source', 'result'])
source_lemmatized = []

View File

@ -6,11 +6,8 @@ from nltk.stem import WordNetLemmatizer
wl = WordNetLemmatizer()
# train_in_path = '~/mt-summit-corpora/train/in.tsv'
# train_expected_path = '~/mt-summit-corpora/train/expected.tsv'
train_in_path = os.path.join(os.path.expanduser('~'), 'mt-summit-corpora/dev-0/in.tsv')
train_expected_path = os.path.join(os.path.expanduser('~'), 'mt-summit-corpora/dev-0/expected.tsv')
train_in_path = os.path.join(os.path.expanduser('~'), 'mt-summit-corpora/train/in.tsv')
train_expected_path = os.path.join(os.path.expanduser('~'), 'mt-summit-corpora/train/expected.tsv')
file_lemmatized = []
with open(train_in_path, 'r') as file:
@ -19,6 +16,7 @@ with open(train_in_path, 'r') as file:
print('lemmatizing file: ' + train_in_path + ': ' + str(len(file_lemmatized)), end='\r')
line = nltk.word_tokenize(line)
file_lemmatized.append(' '.join([wl.lemmatize(x) for x in line]))
print('\n')
with open(train_in_path + '.lemmatized', 'w') as file_write:
for line in file_lemmatized: