Merge branch 'master' of git.wmi.amu.edu.pl:s470607/transfix-mt
This commit is contained in:
commit
40ccea191b
6
do_inject.sh
Normal file → Executable file
6
do_inject.sh
Normal file → Executable file
@@ -1,7 +1,3 @@
|
||||
#!/bin/bash
|
||||
|
||||
source gpu/bin/activate
|
||||
|
||||
python scripts/lemmatize_glossary.py
|
||||
python scripts/lemmatize_in.py
|
||||
python scripts/inject.py
|
||||
. ./scripts/do_inject_helper.sh
|
||||
|
9
scripts/do_inject_helper.sh
Executable file
9
scripts/do_inject_helper.sh
Executable file
@@ -0,0 +1,9 @@
|
||||
#!/bin/bash
|
||||
|
||||
source ~/gpu/bin/activate
|
||||
|
||||
cd ~/transfix-mt/scripts
|
||||
|
||||
python lemmatize_glossary.py
|
||||
python lemmatize_in.py
|
||||
python inject.py
|
@@ -1,3 +1,4 @@
|
||||
import os
|
||||
import pandas as pd
|
||||
import rapidfuzz
|
||||
|
||||
@@ -40,13 +41,13 @@ def get_injected(sentence, sentence_en, sequence, inject):
|
||||
|
||||
THRESHOLD = 70
|
||||
|
||||
# train_in_path = 'mt-summit-corpora/train/in.tsv'
|
||||
# train_expected_path = 'mt-summit-corpora/train/expected.tsv'
|
||||
# train_in_path = '~/mt-summit-corpora/train/in.tsv'
|
||||
# train_expected_path = '~/mt-summit-corpora/train/expected.tsv'
|
||||
|
||||
train_in_path = 'mt-summit-corpora/dev-0/in.tsv'
|
||||
train_expected_path = 'mt-summit-corpora/dev-0/expected.tsv'
|
||||
train_in_path = os.path.join(os.path.expanduser('~'), 'mt-summit-corpora/dev-0/in.tsv')
|
||||
train_expected_path = os.path.join(os.path.expanduser('~'), 'mt-summit-corpora/dev-0/expected.tsv')
|
||||
|
||||
glossary = pd.read_csv('mt-summit-corpora/glossary_lem.tsv', sep='\t')
|
||||
glossary = pd.read_csv('~/mt-summit-corpora/glossary.tsv.lemmatized', sep='\t')
|
||||
glossary['source_lem'] = [str(default_process(x)) for x in glossary['source_lem']]
|
||||
|
||||
file_pl = pd.read_csv(train_expected_path, sep='\t', header=None, names=['text'])
|
||||
|
@@ -6,7 +6,10 @@ from nltk.stem import WordNetLemmatizer
|
||||
nltk.download('wordnet')
|
||||
|
||||
wl = WordNetLemmatizer()
|
||||
glossary = pd.read_csv('mt-summit-corpora/glossary.tsv', sep='\t', header=None, names=['source', 'result'])
|
||||
|
||||
glossary_path = '~/mt-summit-corpora/glossary.tsv'
|
||||
|
||||
glossary = pd.read_csv(glossary_path, sep='\t', header=None, names=['source', 'result'])
|
||||
source_lemmatized = []
|
||||
for word in glossary['source']:
|
||||
word = nltk.word_tokenize(word)
|
||||
@@ -16,4 +19,4 @@ glossary['source_lem'] = source_lemmatized
|
||||
glossary = glossary[['source', 'source_lem', 'result']]
|
||||
glossary.set_index('source_lem')
|
||||
|
||||
glossary.to_csv('mt-summit-corpora/glossary_lem.tsv', sep='\t', index=False)
|
||||
glossary.to_csv(glossary_path + '.lemmatized', sep='\t', index=False)
|
||||
|
@@ -1,14 +1,16 @@
|
||||
import nltk
|
||||
import os
|
||||
|
||||
from nltk.stem import WordNetLemmatizer
|
||||
|
||||
|
||||
wl = WordNetLemmatizer()
|
||||
|
||||
# train_in_path = 'mt-summit-corpora/train/in.tsv'
|
||||
# train_expected_path = 'mt-summit-corpora/train/expected.tsv'
|
||||
# train_in_path = '~/mt-summit-corpora/train/in.tsv'
|
||||
# train_expected_path = '~/mt-summit-corpora/train/expected.tsv'
|
||||
|
||||
train_in_path = 'mt-summit-corpora/dev-0/in.tsv'
|
||||
train_expected_path = 'mt-summit-corpora/dev-0/expected.tsv'
|
||||
train_in_path = os.path.join(os.path.expanduser('~'), 'mt-summit-corpora/dev-0/in.tsv')
|
||||
train_expected_path = os.path.join(os.path.expanduser('~'), 'mt-summit-corpora/dev-0/expected.tsv')
|
||||
|
||||
file_lemmatized = []
|
||||
with open(train_in_path, 'r') as file:
|
||||
|
Loading…
Reference in New Issue
Block a user