Merge branch 'master' of git.wmi.amu.edu.pl:s470607/transfix-mt

This commit is contained in:
jakubknczny 2022-01-23 16:40:19 +01:00
commit 40ccea191b
5 changed files with 27 additions and 16 deletions

6
do_inject.sh Normal file → Executable file
View File

@ -1,7 +1,3 @@
#!/bin/bash
source gpu/bin/activate
python scripts/lemmatize_glossary.py
python scripts/lemmatize_in.py
python scripts/inject.py
. ./scripts/do_inject_helper.sh

9
scripts/do_inject_helper.sh Executable file
View File

@ -0,0 +1,9 @@
#!/bin/bash
source ~/gpu/bin/activate
cd ~/transfix-mt/scripts
python lemmatize_glossary.py
python lemmatize_in.py
python inject.py

View File

@ -1,3 +1,4 @@
import os
import pandas as pd
import rapidfuzz
@ -40,13 +41,13 @@ def get_injected(sentence, sentence_en, sequence, inject):
THRESHOLD = 70
# train_in_path = 'mt-summit-corpora/train/in.tsv'
# train_expected_path = 'mt-summit-corpora/train/expected.tsv'
# train_in_path = '~/mt-summit-corpora/train/in.tsv'
# train_expected_path = '~/mt-summit-corpora/train/expected.tsv'
train_in_path = 'mt-summit-corpora/dev-0/in.tsv'
train_expected_path = 'mt-summit-corpora/dev-0/expected.tsv'
train_in_path = os.path.join(os.path.expanduser('~'), 'mt-summit-corpora/dev-0/in.tsv')
train_expected_path = os.path.join(os.path.expanduser('~'), 'mt-summit-corpora/dev-0/expected.tsv')
glossary = pd.read_csv('mt-summit-corpora/glossary_lem.tsv', sep='\t')
glossary = pd.read_csv('~/mt-summit-corpora/glossary.tsv.lemmatized', sep='\t')
glossary['source_lem'] = [str(default_process(x)) for x in glossary['source_lem']]
file_pl = pd.read_csv(train_expected_path, sep='\t', header=None, names=['text'])

View File

@ -6,7 +6,10 @@ from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')
wl = WordNetLemmatizer()
glossary = pd.read_csv('mt-summit-corpora/glossary.tsv', sep='\t', header=None, names=['source', 'result'])
glossary_path = '~/mt-summit-corpora/glossary.tsv'
glossary = pd.read_csv(glossary_path, sep='\t', header=None, names=['source', 'result'])
source_lemmatized = []
for word in glossary['source']:
word = nltk.word_tokenize(word)
@ -16,4 +19,4 @@ glossary['source_lem'] = source_lemmatized
glossary = glossary[['source', 'source_lem', 'result']]
glossary.set_index('source_lem')
glossary.to_csv('mt-summit-corpora/glossary_lem.tsv', sep='\t', index=False)
glossary.to_csv(glossary_path + '.lemmatized', sep='\t', index=False)

View File

@ -1,14 +1,16 @@
import nltk
import os
from nltk.stem import WordNetLemmatizer
wl = WordNetLemmatizer()
# train_in_path = 'mt-summit-corpora/train/in.tsv'
# train_expected_path = 'mt-summit-corpora/train/expected.tsv'
# train_in_path = '~/mt-summit-corpora/train/in.tsv'
# train_expected_path = '~/mt-summit-corpora/train/expected.tsv'
train_in_path = 'mt-summit-corpora/dev-0/in.tsv'
train_expected_path = 'mt-summit-corpora/dev-0/expected.tsv'
train_in_path = os.path.join(os.path.expanduser('~'), 'mt-summit-corpora/dev-0/in.tsv')
train_expected_path = os.path.join(os.path.expanduser('~'), 'mt-summit-corpora/dev-0/expected.tsv')
file_lemmatized = []
with open(train_in_path, 'r') as file: