Merge branch 'master' of git.wmi.amu.edu.pl:s470607/transfix-mt
This commit is contained in:
commit
40ccea191b
6
do_inject.sh
Normal file → Executable file
6
do_inject.sh
Normal file → Executable file
@@ -1,7 +1,3 @@
|
|||||||
#!/bin/bash
|
#!/bin/bash
|
||||||
|
|
||||||
source gpu/bin/activate
|
. ./scripts/do_inject_helper.sh
|
||||||
|
|
||||||
python scripts/lemmatize_glossary.py
|
|
||||||
python scripts/lemmatize_in.py
|
|
||||||
python scripts/inject.py
|
|
||||||
|
9
scripts/do_inject_helper.sh
Executable file
9
scripts/do_inject_helper.sh
Executable file
@@ -0,0 +1,9 @@
|
|||||||
|
#!/bin/bash
|
||||||
|
|
||||||
|
source ~/gpu/bin/activate
|
||||||
|
|
||||||
|
cd ~/transfix-mt/scripts
|
||||||
|
|
||||||
|
python lemmatize_glossary.py
|
||||||
|
python lemmatize_in.py
|
||||||
|
python inject.py
|
@@ -1,3 +1,4 @@
|
|||||||
|
import os
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
import rapidfuzz
|
import rapidfuzz
|
||||||
|
|
||||||
@@ -40,13 +41,13 @@ def get_injected(sentence, sentence_en, sequence, inject):
|
|||||||
|
|
||||||
THRESHOLD = 70
|
THRESHOLD = 70
|
||||||
|
|
||||||
# train_in_path = 'mt-summit-corpora/train/in.tsv'
|
# train_in_path = '~/mt-summit-corpora/train/in.tsv'
|
||||||
# train_expected_path = 'mt-summit-corpora/train/expected.tsv'
|
# train_expected_path = '~/mt-summit-corpora/train/expected.tsv'
|
||||||
|
|
||||||
train_in_path = 'mt-summit-corpora/dev-0/in.tsv'
|
train_in_path = os.path.join(os.path.expanduser('~'), 'mt-summit-corpora/dev-0/in.tsv')
|
||||||
train_expected_path = 'mt-summit-corpora/dev-0/expected.tsv'
|
train_expected_path = os.path.join(os.path.expanduser('~'), 'mt-summit-corpora/dev-0/expected.tsv')
|
||||||
|
|
||||||
glossary = pd.read_csv('mt-summit-corpora/glossary_lem.tsv', sep='\t')
|
glossary = pd.read_csv('~/mt-summit-corpora/glossary.tsv.lemmatized', sep='\t')
|
||||||
glossary['source_lem'] = [str(default_process(x)) for x in glossary['source_lem']]
|
glossary['source_lem'] = [str(default_process(x)) for x in glossary['source_lem']]
|
||||||
|
|
||||||
file_pl = pd.read_csv(train_expected_path, sep='\t', header=None, names=['text'])
|
file_pl = pd.read_csv(train_expected_path, sep='\t', header=None, names=['text'])
|
||||||
|
@@ -6,7 +6,10 @@ from nltk.stem import WordNetLemmatizer
|
|||||||
nltk.download('wordnet')
|
nltk.download('wordnet')
|
||||||
|
|
||||||
wl = WordNetLemmatizer()
|
wl = WordNetLemmatizer()
|
||||||
glossary = pd.read_csv('mt-summit-corpora/glossary.tsv', sep='\t', header=None, names=['source', 'result'])
|
|
||||||
|
glossary_path = '~/mt-summit-corpora/glossary.tsv'
|
||||||
|
|
||||||
|
glossary = pd.read_csv(glossary_path, sep='\t', header=None, names=['source', 'result'])
|
||||||
source_lemmatized = []
|
source_lemmatized = []
|
||||||
for word in glossary['source']:
|
for word in glossary['source']:
|
||||||
word = nltk.word_tokenize(word)
|
word = nltk.word_tokenize(word)
|
||||||
@@ -16,4 +19,4 @@ glossary['source_lem'] = source_lemmatized
|
|||||||
glossary = glossary[['source', 'source_lem', 'result']]
|
glossary = glossary[['source', 'source_lem', 'result']]
|
||||||
glossary.set_index('source_lem')
|
glossary.set_index('source_lem')
|
||||||
|
|
||||||
glossary.to_csv('mt-summit-corpora/glossary_lem.tsv', sep='\t', index=False)
|
glossary.to_csv(glossary_path + '.lemmatized', sep='\t', index=False)
|
||||||
|
@@ -1,14 +1,16 @@
|
|||||||
import nltk
|
import nltk
|
||||||
|
import os
|
||||||
|
|
||||||
from nltk.stem import WordNetLemmatizer
|
from nltk.stem import WordNetLemmatizer
|
||||||
|
|
||||||
|
|
||||||
wl = WordNetLemmatizer()
|
wl = WordNetLemmatizer()
|
||||||
|
|
||||||
# train_in_path = 'mt-summit-corpora/train/in.tsv'
|
# train_in_path = '~/mt-summit-corpora/train/in.tsv'
|
||||||
# train_expected_path = 'mt-summit-corpora/train/expected.tsv'
|
# train_expected_path = '~/mt-summit-corpora/train/expected.tsv'
|
||||||
|
|
||||||
train_in_path = 'mt-summit-corpora/dev-0/in.tsv'
|
train_in_path = os.path.join(os.path.expanduser('~'), 'mt-summit-corpora/dev-0/in.tsv')
|
||||||
train_expected_path = 'mt-summit-corpora/dev-0/expected.tsv'
|
train_expected_path = os.path.join(os.path.expanduser('~'), 'mt-summit-corpora/dev-0/expected.tsv')
|
||||||
|
|
||||||
file_lemmatized = []
|
file_lemmatized = []
|
||||||
with open(train_in_path, 'r') as file:
|
with open(train_in_path, 'r') as file:
|
||||||
|
Loading…
Reference in New Issue
Block a user