add readme

jakubknczny 2022-01-23 16:58:40 +01:00
parent 40ccea191b
commit 13bc44a975
7 changed files with 17 additions and 11 deletions


@@ -0,0 +1,10 @@
# Transfix-mt
Part of the Transfix project, responsible for injecting the dictionary into
the source data for constrained translation.
The scripts are compatible with a gonito challenge that will be linked here in the future.
It should be used in the following way in a Transfix-like environment/file structure
(a rough sketch of the injection idea follows the setup steps):
* git clone https://git.wmi.amu.edu.pl/s470607/transfix-mt.git
* ./transfix-mt/env/venv-setup.sh
* ./transfix-mt/do_inject.sh
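The injection step itself appears only in fragments in the diffs below (`get_injected`, `THRESHOLD = 70`, rapidfuzz's `default_process`). As a rough sketch of the idea — fuzzy-match lemmatized glossary entries against each source sentence and append the matched target-language term — here is a minimal version, assuming rapidfuzz and a two-column glossary; `inject_terms` and the sample data are hypothetical, not the repo's code:

```python
# Sketch only: THRESHOLD = 70 and default_process appear in the diffs
# below; inject_terms and the sample data are assumptions.
from rapidfuzz import fuzz
from rapidfuzz.utils import default_process

THRESHOLD = 70

def inject_terms(sentence_lem, sentence, glossary):
    """Append the target term of every glossary entry whose lemmatized
    source side fuzzy-matches the lemmatized source sentence."""
    out = sentence
    for source_lem, target in glossary:
        score = fuzz.partial_ratio(default_process(sentence_lem),
                                   default_process(source_lem))
        if score >= THRESHOLD:
            out += ' ' + target
    return out

print(inject_terms('the cat sit on the mat',
                   'The cats sat on the mat.',
                   [('cat', 'kot'), ('dog', 'pies')]))
# -> 'The cats sat on the mat. kot'
```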


@@ -41,11 +41,8 @@ def get_injected(sentence, sentence_en, sequence, inject):
 THRESHOLD = 70
-# train_in_path = '~/mt-summit-corpora/train/in.tsv'
-# train_expected_path = '~/mt-summit-corpora/train/expected.tsv'
-train_in_path = os.path.join(os.path.expanduser('~'), 'mt-summit-corpora/dev-0/in.tsv')
-train_expected_path = os.path.join(os.path.expanduser('~'), 'mt-summit-corpora/dev-0/expected.tsv')
+train_in_path = os.path.join(os.path.expanduser('~'), 'mt-summit-corpora/train/in.tsv')
+train_expected_path = os.path.join(os.path.expanduser('~'), 'mt-summit-corpora/train/expected.tsv')
 glossary = pd.read_csv('~/mt-summit-corpora/glossary.tsv.lemmatized', sep='\t')
 glossary['source_lem'] = [str(default_process(x)) for x in glossary['source_lem']]
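The motivation for this change: `pandas.read_csv` expands `~` on its own, but the builtin `open()` that later reads `train_in_path` does not, so the bare-`~` strings had to be expanded explicitly — which is also why the `pd.read_csv('~/...')` glossary line can stay unchanged. A quick illustration (the paths are taken from the diff; this snippet is not repo code):

```python
import os

# open() treats '~' literally and raises FileNotFoundError on this:
literal = '~/mt-summit-corpora/train/in.tsv'

# the commit's replacement expands the home directory explicitly:
expanded = os.path.join(os.path.expanduser('~'), 'mt-summit-corpora/train/in.tsv')
print(expanded)  # e.g. /home/<user>/mt-summit-corpora/train/in.tsv
```

A single call, `os.path.expanduser('~/mt-summit-corpora/train/in.tsv')`, would produce the same result; the commit just spells out the join.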


@@ -1,4 +1,5 @@
 import nltk
+import os
 import pandas as pd
 from nltk.stem import WordNetLemmatizer
@@ -7,7 +8,7 @@ nltk.download('wordnet')
 wl = WordNetLemmatizer()
-glossary_path = '~/mt-summit-corpora/glossary.tsv'
+glossary_path = os.path.join(os.path.expanduser('~'), 'mt-summit-corpora/glossary.tsv')
 glossary = pd.read_csv(glossary_path, sep='\t', header=None, names=['source', 'result'])
 source_lemmatized = []
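The second hunk ends right where the loop that fills `source_lemmatized` begins. A minimal sketch of the tokenize-lemmatize-rejoin pattern presumably applied to `glossary['source']` (the sample terms and the `punkt` download are assumptions, not shown in the diff):

```python
import nltk
from nltk.stem import WordNetLemmatizer

nltk.download('wordnet')  # WordNet data, as in the script above
nltk.download('punkt')    # tokenizer models used by word_tokenize
wl = WordNetLemmatizer()

source_lemmatized = []
for term in ['glossary terms', 'source words']:  # stand-ins for glossary['source']
    tokens = nltk.word_tokenize(term)
    source_lemmatized.append(' '.join(wl.lemmatize(t) for t in tokens))

print(source_lemmatized)  # ['glossary term', 'source word']
```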


@@ -6,11 +6,8 @@ from nltk.stem import WordNetLemmatizer
 wl = WordNetLemmatizer()
-# train_in_path = '~/mt-summit-corpora/train/in.tsv'
-# train_expected_path = '~/mt-summit-corpora/train/expected.tsv'
-train_in_path = os.path.join(os.path.expanduser('~'), 'mt-summit-corpora/dev-0/in.tsv')
-train_expected_path = os.path.join(os.path.expanduser('~'), 'mt-summit-corpora/dev-0/expected.tsv')
+train_in_path = os.path.join(os.path.expanduser('~'), 'mt-summit-corpora/train/in.tsv')
+train_expected_path = os.path.join(os.path.expanduser('~'), 'mt-summit-corpora/train/expected.tsv')
 file_lemmatized = []
 with open(train_in_path, 'r') as file:
@@ -19,6 +16,7 @@ with open(train_in_path, 'r') as file:
         print('lemmatizing file: ' + train_in_path + ': ' + str(len(file_lemmatized)), end='\r')
         line = nltk.word_tokenize(line)
         file_lemmatized.append(' '.join([wl.lemmatize(x) for x in line]))
+print('\n')
 with open(train_in_path + '.lemmatized', 'w') as file_write:
     for line in file_lemmatized:
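The `end='\r'` print keeps rewriting one terminal line as a progress counter, so the added `print('\n')` steps off that line once the loop finishes; strictly it emits two newlines (the `'\n'` argument plus print's default terminator), where a bare `print()` would emit one. The pattern in isolation:

```python
import time

lines = ['alpha', 'beta', 'gamma']
done = []
for line in lines:
    # overwrite the same terminal line instead of scrolling
    print('lemmatizing file: in.tsv: ' + str(len(done)), end='\r')
    done.append(line)
    time.sleep(0.1)
print('\n')  # leave the progress line before normal output resumes
print('lemmatized ' + str(len(done)) + ' lines')
```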