add readme
This commit is contained in:
parent
40ccea191b
commit
13bc44a975
10
README.md
10
README.md
@ -0,0 +1,10 @@
|
||||
# Transfix-mt
|
||||
|
||||
Part of the Transfix project responsible for injecting the dictionary into
|
||||
source data for constrained translation.
|
||||
The scripts are compatible with a gonito challenge that will be linked here in the future.
|
||||
|
||||
It should be used in the following way in a Transfix-like environment/file structure:
|
||||
* git clone https://git.wmi.amu.edu.pl/s470607/transfix-mt.git
|
||||
* ./transfix-mt/env/venv-setup.sh
|
||||
* ./transfix-mt/do_inject.sh
|
@ -41,11 +41,8 @@ def get_injected(sentence, sentence_en, sequence, inject):
|
||||
|
||||
THRESHOLD = 70
|
||||
|
||||
# train_in_path = '~/mt-summit-corpora/train/in.tsv'
|
||||
# train_expected_path = '~/mt-summit-corpora/train/expected.tsv'
|
||||
|
||||
train_in_path = os.path.join(os.path.expanduser('~'), 'mt-summit-corpora/dev-0/in.tsv')
|
||||
train_expected_path = os.path.join(os.path.expanduser('~'), 'mt-summit-corpora/dev-0/expected.tsv')
|
||||
train_in_path = os.path.join(os.path.expanduser('~'), 'mt-summit-corpora/train/in.tsv')
|
||||
train_expected_path = os.path.join(os.path.expanduser('~'), 'mt-summit-corpora/train/expected.tsv')
|
||||
|
||||
glossary = pd.read_csv('~/mt-summit-corpora/glossary.tsv.lemmatized', sep='\t')
|
||||
glossary['source_lem'] = [str(default_process(x)) for x in glossary['source_lem']]
|
||||
|
@ -1,4 +1,5 @@
|
||||
import nltk
|
||||
import os
|
||||
import pandas as pd
|
||||
|
||||
from nltk.stem import WordNetLemmatizer
|
||||
@ -7,7 +8,7 @@ nltk.download('wordnet')
|
||||
|
||||
wl = WordNetLemmatizer()
|
||||
|
||||
glossary_path = '~/mt-summit-corpora/glossary.tsv'
|
||||
glossary_path = os.path.join(os.path.expanduser('~'), 'mt-summit-corpora/glossary.tsv')
|
||||
|
||||
glossary = pd.read_csv(glossary_path, sep='\t', header=None, names=['source', 'result'])
|
||||
source_lemmatized = []
|
||||
|
@ -6,11 +6,8 @@ from nltk.stem import WordNetLemmatizer
|
||||
|
||||
wl = WordNetLemmatizer()
|
||||
|
||||
# train_in_path = '~/mt-summit-corpora/train/in.tsv'
|
||||
# train_expected_path = '~/mt-summit-corpora/train/expected.tsv'
|
||||
|
||||
train_in_path = os.path.join(os.path.expanduser('~'), 'mt-summit-corpora/dev-0/in.tsv')
|
||||
train_expected_path = os.path.join(os.path.expanduser('~'), 'mt-summit-corpora/dev-0/expected.tsv')
|
||||
train_in_path = os.path.join(os.path.expanduser('~'), 'mt-summit-corpora/train/in.tsv')
|
||||
train_expected_path = os.path.join(os.path.expanduser('~'), 'mt-summit-corpora/train/expected.tsv')
|
||||
|
||||
file_lemmatized = []
|
||||
with open(train_in_path, 'r') as file:
|
||||
@ -19,6 +16,7 @@ with open(train_in_path, 'r') as file:
|
||||
print('lemmatizing file: ' + train_in_path + ': ' + str(len(file_lemmatized)), end='\r')
|
||||
line = nltk.word_tokenize(line)
|
||||
file_lemmatized.append(' '.join([wl.lemmatize(x) for x in line]))
|
||||
print('\n')
|
||||
|
||||
with open(train_in_path + '.lemmatized', 'w') as file_write:
|
||||
for line in file_lemmatized:
|
||||
|
Loading…
Reference in New Issue
Block a user