add readme

parent 40ccea191b
commit 13bc44a975

README.md (+10, -0)
@@ -0,0 +1,10 @@
+# Transfix-mt
+
+Part of the Transfix project responsible for injecting the dictionary into
+source data for constrained translation.
+The scripts are compatible with a Gonito challenge that will be linked here in the future.
+
+It should be used in the following way in a Transfix-like environment/file structure:
+* git clone https://git.wmi.amu.edu.pl/s470607/transfix-mt.git
+* ./transfix-mt/env/venv-setup.sh
+* ./transfix-mt/do_inject.sh
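For context, here is a minimal sketch of what the dictionary injection described above could look like. The function name, the `<trans>`/`</trans>` markers, and the example glossary are illustrative assumptions, not code from this repository:

```python
# Hypothetical sketch only: the real injection logic lives in the repo's
# scripts; the annotation markers and names here are assumptions.
def inject_translations(sentence, glossary):
    out = []
    for token in sentence.split():
        translation = glossary.get(token.lower())
        if translation:
            # Annotate the matched source term with its required
            # translation so a constrained decoder must produce it.
            out.append(f'{token} <trans> {translation} </trans>')
        else:
            out.append(token)
    return ' '.join(out)

print(inject_translations('The gearbox housing cracked',
                          {'gearbox': 'skrzynia biegów'}))
```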
@@ -41,11 +41,8 @@ def get_injected(sentence, sentence_en, sequence, inject):
 
 THRESHOLD = 70
 
-# train_in_path = '~/mt-summit-corpora/train/in.tsv'
-# train_expected_path = '~/mt-summit-corpora/train/expected.tsv'
-
-train_in_path = os.path.join(os.path.expanduser('~'), 'mt-summit-corpora/dev-0/in.tsv')
-train_expected_path = os.path.join(os.path.expanduser('~'), 'mt-summit-corpora/dev-0/expected.tsv')
+train_in_path = os.path.join(os.path.expanduser('~'), 'mt-summit-corpora/train/in.tsv')
+train_expected_path = os.path.join(os.path.expanduser('~'), 'mt-summit-corpora/train/expected.tsv')
 
 glossary = pd.read_csv('~/mt-summit-corpora/glossary.tsv.lemmatized', sep='\t')
 glossary['source_lem'] = [str(default_process(x)) for x in glossary['source_lem']]
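The `default_process` call and `THRESHOLD = 70` above suggest rapidfuzz-style fuzzy matching between lemmatized glossary entries and sentences. A minimal sketch of how such a threshold is commonly applied; pairing it with `fuzz.partial_ratio` is my assumption, not something shown in the diff:

```python
from rapidfuzz import fuzz
from rapidfuzz.utils import default_process

THRESHOLD = 70  # minimum 0-100 similarity score to count as a glossary hit

def find_glossary_hits(sentence_lem, glossary_terms):
    # default_process lowercases, trims, and strips non-alphanumeric
    # characters so both sides are compared in a normalized form.
    sentence_norm = str(default_process(sentence_lem))
    hits = []
    for term in glossary_terms:
        score = fuzz.partial_ratio(term, sentence_norm)
        if score >= THRESHOLD:
            hits.append((term, score))
    return hits

print(find_glossary_hits('the gearbox housing be crack',
                         ['gearbox housing', 'crankshaft']))
```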
@@ -1,4 +1,5 @@
 import nltk
+import os
 import pandas as pd
 
 from nltk.stem import WordNetLemmatizer
@@ -7,7 +8,7 @@ nltk.download('wordnet')
 
 wl = WordNetLemmatizer()
 
-glossary_path = '~/mt-summit-corpora/glossary.tsv'
+glossary_path = os.path.join(os.path.expanduser('~'), 'mt-summit-corpora/glossary.tsv')
 
 glossary = pd.read_csv(glossary_path, sep='\t', header=None, names=['source', 'result'])
 source_lemmatized = []
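The hunk stops right after `source_lemmatized = []` is initialized. A self-contained sketch of how that loop plausibly continues, assuming it mirrors the corpus lemmatizer in the next file; the loop body and the final `source_lem` assignment are assumptions:

```python
import nltk
import pandas as pd
from nltk.stem import WordNetLemmatizer

nltk.download('wordnet')
nltk.download('punkt')  # needed by nltk.word_tokenize

wl = WordNetLemmatizer()

# Stand-in path; the script reads ~/mt-summit-corpora/glossary.tsv.
glossary = pd.read_csv('glossary.tsv', sep='\t', header=None,
                       names=['source', 'result'])

source_lemmatized = []
for source in glossary['source']:
    # Lemmatize the glossary term token by token, the same way the
    # training corpus is lemmatized, so the two sides stay comparable.
    tokens = nltk.word_tokenize(source)
    source_lemmatized.append(' '.join(wl.lemmatize(t) for t in tokens))

# Presumably saved as the 'source_lem' column consumed by the injector.
glossary['source_lem'] = source_lemmatized
```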
@@ -6,11 +6,8 @@ from nltk.stem import WordNetLemmatizer
 
 wl = WordNetLemmatizer()
 
-# train_in_path = '~/mt-summit-corpora/train/in.tsv'
-# train_expected_path = '~/mt-summit-corpora/train/expected.tsv'
-
-train_in_path = os.path.join(os.path.expanduser('~'), 'mt-summit-corpora/dev-0/in.tsv')
-train_expected_path = os.path.join(os.path.expanduser('~'), 'mt-summit-corpora/dev-0/expected.tsv')
+train_in_path = os.path.join(os.path.expanduser('~'), 'mt-summit-corpora/train/in.tsv')
+train_expected_path = os.path.join(os.path.expanduser('~'), 'mt-summit-corpora/train/expected.tsv')
 
 file_lemmatized = []
 with open(train_in_path, 'r') as file:
@@ -19,6 +16,7 @@ with open(train_in_path, 'r') as file:
         print('lemmatizing file: ' + train_in_path + ': ' + str(len(file_lemmatized)), end='\r')
         line = nltk.word_tokenize(line)
         file_lemmatized.append(' '.join([wl.lemmatize(x) for x in line]))
+print('\n')
 
 with open(train_in_path + '.lemmatized', 'w') as file_write:
     for line in file_lemmatized:
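The last hunk cuts off inside the write loop. Assembled as one runnable sketch, with the final `write` call filled in as an assumption about how the loop continues:

```python
import nltk
from nltk.stem import WordNetLemmatizer

nltk.download('wordnet')
nltk.download('punkt')  # needed by nltk.word_tokenize

wl = WordNetLemmatizer()
train_in_path = 'in.tsv'  # stand-in for the mt-summit-corpora path

file_lemmatized = []
with open(train_in_path, 'r') as file:
    for line in file:
        # Progress indicator, overwritten in place via end='\r'.
        print('lemmatizing file: ' + train_in_path + ': '
              + str(len(file_lemmatized)), end='\r')
        tokens = nltk.word_tokenize(line)
        file_lemmatized.append(' '.join(wl.lemmatize(x) for x in tokens))
print('\n')

with open(train_in_path + '.lemmatized', 'w') as file_write:
    for line in file_lemmatized:
        # Assumed continuation: one lemmatized sentence per output line.
        file_write.write(line + '\n')
```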