giza aligner improvements
This commit is contained in:
parent
01a70fe444
commit
c712aa7c63
@ -2,8 +2,9 @@ SRC_LANG=pl
|
|||||||
TRG_LANG=en
|
TRG_LANG=en
|
||||||
CORPUS_NAME=europarl_sample
|
CORPUS_NAME=europarl_sample
|
||||||
SEPARATOR=@\#@
|
SEPARATOR=@\#@
|
||||||
|
DICTIONARY_WEIGHT=5
|
||||||
|
|
||||||
all: corpora/$(CORPUS_NAME)/aligned.txt corpora/$(CORPUS_NAME)/src_clean.txt corpora/$(CORPUS_NAME)/trg_clean.txt
|
all: corpora/$(CORPUS_NAME)/aligned_final.txt corpora/$(CORPUS_NAME)/src_final.txt corpora/$(CORPUS_NAME)/trg_final.txt
|
||||||
|
|
||||||
corpora/$(CORPUS_NAME)/aligned.txt: corpora/$(CORPUS_NAME)/giza.cfg corpora/$(CORPUS_NAME)/src.lem_trg.lem.cooc corpora/$(CORPUS_NAME)/src.lem_trg.lem.snt corpora/$(CORPUS_NAME)/src.lem.vcb corpora/$(CORPUS_NAME)/trg.lem.vcb
|
corpora/$(CORPUS_NAME)/aligned.txt: corpora/$(CORPUS_NAME)/giza.cfg corpora/$(CORPUS_NAME)/src.lem_trg.lem.cooc corpora/$(CORPUS_NAME)/src.lem_trg.lem.snt corpora/$(CORPUS_NAME)/src.lem.vcb corpora/$(CORPUS_NAME)/trg.lem.vcb
|
||||||
mgiza/mgizapp/bin/mgiza corpora/$(CORPUS_NAME)/giza.cfg
|
mgiza/mgizapp/bin/mgiza corpora/$(CORPUS_NAME)/giza.cfg
|
||||||
@ -11,26 +12,26 @@ corpora/$(CORPUS_NAME)/aligned.txt: corpora/$(CORPUS_NAME)/giza.cfg corpora/$(CO
|
|||||||
|
|
||||||
clean-intermediate-files:
|
clean-intermediate-files:
|
||||||
rm -f corpora/$(CORPUS_NAME)/*.lem
|
rm -f corpora/$(CORPUS_NAME)/*.lem
|
||||||
|
rm -f corpora/$(CORPUS_NAME)/*.tok
|
||||||
|
rm -f corpora/$(CORPUS_NAME)/*.dict
|
||||||
rm -f corpora/$(CORPUS_NAME)/*.classes
|
rm -f corpora/$(CORPUS_NAME)/*.classes
|
||||||
rm -f corpora/$(CORPUS_NAME)/*.classes.cats
|
rm -f corpora/$(CORPUS_NAME)/*.classes.cats
|
||||||
rm -f corpora/$(CORPUS_NAME)/*.vcb
|
rm -f corpora/$(CORPUS_NAME)/*.vcb
|
||||||
rm -f corpora/$(CORPUS_NAME)/*.snt
|
rm -f corpora/$(CORPUS_NAME)/*.snt
|
||||||
rm -f corpora/$(CORPUS_NAME)/*.cooc
|
rm -f corpora/$(CORPUS_NAME)/*.cooc
|
||||||
rm -f corpora/$(CORPUS_NAME)/aligned*part*
|
rm -f corpora/$(CORPUS_NAME)/aligned*part*
|
||||||
|
rm -f corpora/$(CORPUS_NAME)/aligned.txt
|
||||||
rm -f corpora/$(CORPUS_NAME)/giza.cfg
|
rm -f corpora/$(CORPUS_NAME)/giza.cfg
|
||||||
|
rm -f corpora/$(CORPUS_NAME)/aligned.gizacfg
|
||||||
rm -f corpora/$(CORPUS_NAME)/pasted.txt
|
rm -f corpora/$(CORPUS_NAME)/pasted.txt
|
||||||
rm -f corpora/$(CORPUS_NAME)/pasted_deduplicated.txt
|
rm -f corpora/$(CORPUS_NAME)/pasted_deduplicated.txt
|
||||||
rm -f corpora/$(CORPUS_NAME)/src_deduplicated.txt
|
|
||||||
rm -f corpora/$(CORPUS_NAME)/trg_deduplicated.txt
|
|
||||||
rm -f corpora/$(CORPUS_NAME)/src_deduplicated.tok
|
|
||||||
rm -f corpora/$(CORPUS_NAME)/trg_deduplicated.tok
|
|
||||||
rm -f corpora/$(CORPUS_NAME)/src_clean.tok
|
|
||||||
rm -f corpora/$(CORPUS_NAME)/trg_clean.tok
|
|
||||||
|
|
||||||
clean: clean-intermediate-files
|
|
||||||
rm -f corpora/$(CORPUS_NAME)/src_clean.txt
|
rm -f corpora/$(CORPUS_NAME)/src_clean.txt
|
||||||
rm -f corpora/$(CORPUS_NAME)/trg_clean.txt
|
rm -f corpora/$(CORPUS_NAME)/trg_clean.txt
|
||||||
rm -f corpora/$(CORPUS_NAME)/aligned*
|
|
||||||
|
clean: clean-intermediate-files
|
||||||
|
rm -f corpora/$(CORPUS_NAME)/src_final.txt
|
||||||
|
rm -f corpora/$(CORPUS_NAME)/trg_final.txt
|
||||||
|
rm -f corpora/$(CORPUS_NAME)/aligned_final.txt
|
||||||
|
|
||||||
corpora/$(CORPUS_NAME)/giza.cfg: giza.cfg.pattern
|
corpora/$(CORPUS_NAME)/giza.cfg: giza.cfg.pattern
|
||||||
sed 's/CORPUS_NAME/'$(CORPUS_NAME)'/' < $< > $@
|
sed 's/CORPUS_NAME/'$(CORPUS_NAME)'/' < $< > $@
|
||||||
@ -44,25 +45,42 @@ corpora/$(CORPUS_NAME)/src.lem_trg.lem.snt corpora/$(CORPUS_NAME)/trg.lem_src.le
|
|||||||
corpora/$(CORPUS_NAME)/%.classes: corpora/$(CORPUS_NAME)/%.lem
|
corpora/$(CORPUS_NAME)/%.classes: corpora/$(CORPUS_NAME)/%.lem
|
||||||
mgiza/mgizapp/bin/mkcls -n10 -p$< -V$@
|
mgiza/mgizapp/bin/mkcls -n10 -p$< -V$@
|
||||||
|
|
||||||
corpora/$(CORPUS_NAME)/trg.lem: corpora/$(CORPUS_NAME)/trg_clean.tok
|
corpora/$(CORPUS_NAME)/trg.lem: corpora/$(CORPUS_NAME)/trg_clean.lem corpora/$(CORPUS_NAME)/trg.dict
|
||||||
|
cat corpora/$(CORPUS_NAME)/trg_clean.lem corpora/$(CORPUS_NAME)/trg.dict > $@
|
||||||
|
|
||||||
|
corpora/$(CORPUS_NAME)/src.lem: corpora/$(CORPUS_NAME)/src_clean.lem corpora/$(CORPUS_NAME)/src.dict
|
||||||
|
cat corpora/$(CORPUS_NAME)/src_clean.lem corpora/$(CORPUS_NAME)/src.dict > $@
|
||||||
|
|
||||||
|
corpora/$(CORPUS_NAME)/src.dict:
|
||||||
|
./collect_dict.py $(SRC_LANG) $(TRG_LANG) $(DICTIONARY_WEIGHT) > $@
|
||||||
|
|
||||||
|
corpora/$(CORPUS_NAME)/trg.dict:
|
||||||
|
./collect_dict.py $(TRG_LANG) $(SRC_LANG) $(DICTIONARY_WEIGHT) > $@
|
||||||
|
|
||||||
|
|
||||||
|
corpora/$(CORPUS_NAME)/trg_clean.lem: corpora/$(CORPUS_NAME)/trg_clean.tok
|
||||||
mono LemmaGenSentenceLemmatizer/LemmaGenSentenceLemmatizer/bin/Debug/LemmaGenSentenceLemmatizer.exe $(TRG_LANG) < $< > $@
|
mono LemmaGenSentenceLemmatizer/LemmaGenSentenceLemmatizer/bin/Debug/LemmaGenSentenceLemmatizer.exe $(TRG_LANG) < $< > $@
|
||||||
|
|
||||||
|
corpora/$(CORPUS_NAME)/src_clean.lem: corpora/$(CORPUS_NAME)/src_clean.tok
|
||||||
corpora/$(CORPUS_NAME)/src.lem: corpora/$(CORPUS_NAME)/src_clean.tok
|
|
||||||
mono LemmaGenSentenceLemmatizer/LemmaGenSentenceLemmatizer/bin/Debug/LemmaGenSentenceLemmatizer.exe $(SRC_LANG) < $< > $@
|
mono LemmaGenSentenceLemmatizer/LemmaGenSentenceLemmatizer/bin/Debug/LemmaGenSentenceLemmatizer.exe $(SRC_LANG) < $< > $@
|
||||||
|
|
||||||
|
|
||||||
corpora/$(CORPUS_NAME)/src_clean.txt corpora/$(CORPUS_NAME)/trg_clean.txt corpora/$(CORPUS_NAME)/src_clean.tok corpora/$(CORPUS_NAME)/trg_clean.tok: corpora/$(CORPUS_NAME)/pasted_deduplicated.txt corpora/$(CORPUS_NAME)/src_deduplicated.tok corpora/$(CORPUS_NAME)/trg_deduplicated.tok
|
corpora/$(CORPUS_NAME)/trg.tok: corpora/$(CORPUS_NAME)/trg.txt
|
||||||
./clean_corpus.py $< corpora/$(CORPUS_NAME)/src_deduplicated.tok corpora/$(CORPUS_NAME)/trg_deduplicated.tok corpora/$(CORPUS_NAME)/src_clean.txt corpora/$(CORPUS_NAME)/trg_clean.txt corpora/$(CORPUS_NAME)/src_clean.tok corpora/$(CORPUS_NAME)/trg_clean.tok $(SEPARATOR)
|
|
||||||
|
|
||||||
corpora/$(CORPUS_NAME)/%_deduplicated.tok: corpora/$(CORPUS_NAME)/%_deduplicated.txt
|
|
||||||
concordia-sentence-tokenizer -c ../concordia.cfg < $< > $@
|
concordia-sentence-tokenizer -c ../concordia.cfg < $< > $@
|
||||||
|
|
||||||
corpora/$(CORPUS_NAME)/src_deduplicated.txt corpora/$(CORPUS_NAME)/trg_deduplicated.txt: corpora/$(CORPUS_NAME)/pasted_deduplicated.txt
|
|
||||||
./cut.py $< corpora/$(CORPUS_NAME)/src_deduplicated.txt corpora/$(CORPUS_NAME)/trg_deduplicated.txt $(SEPARATOR)
|
corpora/$(CORPUS_NAME)/src.tok: corpora/$(CORPUS_NAME)/src.txt
|
||||||
|
concordia-sentence-tokenizer -c ../concordia.cfg < $< > $@
|
||||||
|
|
||||||
|
|
||||||
|
corpora/$(CORPUS_NAME)/src_clean.txt corpora/$(CORPUS_NAME)/trg_clean.txt corpora/$(CORPUS_NAME)/src_clean.tok corpora/$(CORPUS_NAME)/trg_clean.tok: corpora/$(CORPUS_NAME)/src.txt corpora/$(CORPUS_NAME)/trg.txt corpora/$(CORPUS_NAME)/src.tok corpora/$(CORPUS_NAME)/trg.tok
|
||||||
|
./clean_corpus.py corpora/$(CORPUS_NAME)/src.txt corpora/$(CORPUS_NAME)/trg.txt corpora/$(CORPUS_NAME)/src.tok corpora/$(CORPUS_NAME)/trg.tok corpora/$(CORPUS_NAME)/src_clean.txt corpora/$(CORPUS_NAME)/trg_clean.txt corpora/$(CORPUS_NAME)/src_clean.tok corpora/$(CORPUS_NAME)/trg_clean.tok $(SEPARATOR)
|
||||||
|
|
||||||
|
corpora/$(CORPUS_NAME)/pasted.txt: corpora/$(CORPUS_NAME)/src_clean.txt corpora/$(CORPUS_NAME)/trg_clean.txt
|
||||||
|
./paste.py corpora/$(CORPUS_NAME)/src_clean.txt corpora/$(CORPUS_NAME)/trg_clean.txt $(SEPARATOR)> $@
|
||||||
|
|
||||||
corpora/$(CORPUS_NAME)/pasted_deduplicated.txt: corpora/$(CORPUS_NAME)/pasted.txt
|
corpora/$(CORPUS_NAME)/pasted_deduplicated.txt: corpora/$(CORPUS_NAME)/pasted.txt
|
||||||
sort -k 1.13 $< | uniq -s 12 | sort > $@
|
sort -k 1.13 $< | uniq -s 12 | sort > $@
|
||||||
|
|
||||||
corpora/$(CORPUS_NAME)/pasted.txt: corpora/$(CORPUS_NAME)/src.txt corpora/$(CORPUS_NAME)/trg.txt
|
corpora/$(CORPUS_NAME)/aligned_final.txt corpora/$(CORPUS_NAME)/src_final.txt corpora/$(CORPUS_NAME)/trg_final.txt: corpora/$(CORPUS_NAME)/pasted_deduplicated.txt corpora/$(CORPUS_NAME)/aligned.txt
|
||||||
./paste.py corpora/$(CORPUS_NAME)/src.txt corpora/$(CORPUS_NAME)/trg.txt $(SEPARATOR)> $@
|
./extract.py $< corpora/$(CORPUS_NAME)/aligned.txt corpora/$(CORPUS_NAME)/aligned_final.txt corpora/$(CORPUS_NAME)/src_final.txt corpora/$(CORPUS_NAME)/trg_final.txt $(SEPARATOR)
|
||||||
|
@ -6,13 +6,14 @@ import sys
|
|||||||
max_tokens = 100
|
max_tokens = 100
|
||||||
max_ratio = 4.0
|
max_ratio = 4.0
|
||||||
|
|
||||||
separator = sys.argv[8]
|
separator = sys.argv[9]
|
||||||
|
|
||||||
with open(sys.argv[1]) as pasted_file, open(sys.argv[2]) as src_deduplicated_tok, open(sys.argv[3]) as trg_deduplicated_tok, open(sys.argv[4], 'w') as src_clean, open(sys.argv[5], 'w') as trg_clean, open(sys.argv[6], 'w') as src_clean_tok, open(sys.argv[7], 'w') as trg_clean_tok:
|
with open(sys.argv[1]) as src_file, open(sys.argv[2]) as trg_file, open(sys.argv[3]) as src_tok, open(sys.argv[4]) as trg_tok, open(sys.argv[5], 'w') as src_clean, open(sys.argv[6], 'w') as trg_clean, open(sys.argv[7], 'w') as src_clean_tok, open(sys.argv[8], 'w') as trg_clean_tok:
|
||||||
for line in pasted_file:
|
for line in src_file:
|
||||||
src_line_orig, trg_line_orig = line.strip()[12:].split(separator)
|
src_line_orig = line.strip()
|
||||||
src_line_tok = src_deduplicated_tok.readline().strip()
|
trg_line_orig = trg_file.readline().strip()
|
||||||
trg_line_tok = trg_deduplicated_tok.readline().strip()
|
src_line_tok = src_tok.readline().strip()
|
||||||
|
trg_line_tok = trg_tok.readline().strip()
|
||||||
src_token_count = len(src_line_tok.split())
|
src_token_count = len(src_line_tok.split())
|
||||||
trg_token_count = len(trg_line_tok.split())
|
trg_token_count = len(trg_line_tok.split())
|
||||||
if (src_token_count > 0 and trg_token_count > 0 and src_token_count <= max_tokens and trg_token_count <= max_tokens):
|
if (src_token_count > 0 and trg_token_count > 0 and src_token_count <= max_tokens and trg_token_count <= max_tokens):
|
||||||
|
17
mgiza-aligner/collect_dict.py
Executable file
17
mgiza-aligner/collect_dict.py
Executable file
@ -0,0 +1,17 @@
|
|||||||
|
#!/usr/bin/python3
|
||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
|
||||||
|
import sys, os, bz2
|
||||||
|
|
||||||
|
def collect_dict(src_lang, trg_lang, weight, dictionaries_dir='dictionaries', out=None):
    """Write the src_lang side of every bilingual dictionary, each entry repeated.

    A dictionary is a sub-directory of ``dictionaries_dir`` that contains both
    ``<src_lang>.bz2`` and ``<trg_lang>.bz2``; sub-directories missing either
    file are ignored.  Only the ``src_lang`` file is read: every line is
    stripped of surrounding whitespace and written ``weight`` times.

    Parameters:
        src_lang: language code whose dictionary entries are emitted.
        trg_lang: language code that must also be present for the dictionary
            to be used.
        weight: number of times each entry is repeated.
        dictionaries_dir: directory holding one sub-directory per dictionary.
        out: writable text stream; defaults to ``sys.stdout``.
    """
    if out is None:
        out = sys.stdout

    # os.listdir() returns entries in arbitrary order.  The script is invoked
    # once per language direction, and the two outputs are expected to line up
    # entry-for-entry (presumably: they are appended to the parallel corpora),
    # so iterate in a deterministic, sorted order.
    for dname in sorted(os.listdir(dictionaries_dir)):
        src_path = os.path.join(dictionaries_dir, dname, '%s.bz2' % src_lang)
        trg_path = os.path.join(dictionaries_dir, dname, '%s.bz2' % trg_lang)
        if not (os.path.isfile(src_path) and os.path.isfile(trg_path)):
            continue
        with bz2.open(src_path, 'rt') as src_dict_file:
            for line in src_dict_file:
                entry = line.strip()
                for _ in range(weight):
                    out.write(entry + '\n')


if __name__ == '__main__':
    if len(sys.argv) != 4:
        sys.stderr.write('usage: %s SRC_LANG TRG_LANG WEIGHT\n' % sys.argv[0])
        sys.exit(2)
    collect_dict(sys.argv[1], sys.argv[2], int(sys.argv[3]))
|
@ -1,12 +0,0 @@
|
|||||||
#!/usr/bin/python3
|
|
||||||
# -*- coding: utf-8 -*-
|
|
||||||
|
|
||||||
import sys
|
|
||||||
|
|
||||||
separator = sys.argv[4]
|
|
||||||
|
|
||||||
with open(sys.argv[1]) as pasted_file, open(sys.argv[2], 'w') as src_file, open(sys.argv[3], 'w') as trg_file:
|
|
||||||
for line in pasted_file:
|
|
||||||
src_line, trg_line = line.strip()[12:].split(separator)
|
|
||||||
src_file.write(src_line+"\n")
|
|
||||||
trg_file.write(trg_line+"\n")
|
|
BIN
mgiza-aligner/dictionaries/test/en.bz2
Normal file
BIN
mgiza-aligner/dictionaries/test/en.bz2
Normal file
Binary file not shown.
BIN
mgiza-aligner/dictionaries/test/pl.bz2
Normal file
BIN
mgiza-aligner/dictionaries/test/pl.bz2
Normal file
Binary file not shown.
36
mgiza-aligner/extract.py
Executable file
36
mgiza-aligner/extract.py
Executable file
@ -0,0 +1,36 @@
|
|||||||
|
#!/usr/bin/python3
|
||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
|
||||||
|
import sys, re
|
||||||
|
|
||||||
|
# Sixth command-line argument: the string on which each pasted-file line is
# split into its source and target halves.
separator = sys.argv[6]
|
||||||
|
|
||||||
|
# Header line of one alignment record; group 1 captures the sentence number.
pair_pattern = re.compile(r'^# Sentence pair \((\d+)\).*')

def getNextSentencePair(f):
    """Read the next three-line alignment record from the open text file ``f``.

    The first line must match ``pair_pattern``; it and the two lines that
    follow it are returned verbatim (newlines included).

    Returns:
        dict with ``'number'`` (int parsed from the header line) and
        ``'lines'`` (list of the three raw lines).

    Raises:
        EOFError: ``f`` is exhausted, or ends in the middle of a record.
        Exception: the header line does not match ``pair_pattern``.
    """
    first_line = f.readline()
    if first_line == '':
        # readline() returns '' only at end of file; report that explicitly
        # instead of a confusing "could not read number from line: " message.
        raise EOFError("Unexpected end of alignment file while looking for the next sentence pair")

    m = pair_pattern.match(first_line)
    if not m:
        raise Exception("Could not read sentence pair number from line: %s" % first_line)

    lines = [first_line]
    for _ in range(2):
        next_line = f.readline()
        if next_line == '':
            # A record cut short would otherwise be written out incomplete.
            raise EOFError("Alignment file ends inside sentence pair %s" % m.group(1))
        lines.append(next_line)

    return {'number': int(m.group(1)), 'lines': lines}
|
||||||
|
|
||||||
|
# For every line of the pasted file (argv[1]) find the alignment record with
# the same sentence number in the aligned file (argv[2]) and write the record,
# the source text and the target text to argv[3], argv[4] and argv[5].
# NOTE(review): the forward-only scan assumes both inputs are ordered by
# ascending sentence number — confirm against how they are produced.
with open(sys.argv[1]) as pasted_file, open(sys.argv[2]) as aligned_file, open(sys.argv[3], 'w') as aligned_final, open(sys.argv[4], 'w') as src_final, open(sys.argv[5], 'w') as trg_final:
    for line in pasted_file:
        # The first 12 characters are the sentence number; the rest is
        # "<source><separator><target>".  Only trailing whitespace is stripped
        # before slicing so that the 12-character prefix is cut at the same
        # position that int(line[:12]) reads it from, even if it is padded.
        number = int(line[:12])
        src_line, trg_line = line.rstrip()[12:].split(separator)

        # Skip alignment records whose sentence is not in the pasted file.
        # getNextSentencePair raises when the aligned file runs out.
        sentence_pair = getNextSentencePair(aligned_file)
        while sentence_pair['number'] != number:
            sentence_pair = getNextSentencePair(aligned_file)

        # Write all three outputs only once the record is found, so the files
        # stay in step with each other if the lookup fails.
        src_final.write(src_line + "\n")
        trg_final.write(trg_line + "\n")
        aligned_final.write(''.join(sentence_pair['lines']))
|
@ -6,7 +6,7 @@ import sys
|
|||||||
separator = sys.argv[3]
|
separator = sys.argv[3]
|
||||||
|
|
||||||
with open(sys.argv[1]) as src_file, open(sys.argv[2]) as trg_file:
|
with open(sys.argv[1]) as src_file, open(sys.argv[2]) as trg_file:
|
||||||
index = 0
|
index = 1
|
||||||
for src_line in src_file:
|
for src_line in src_file:
|
||||||
trg_line = trg_file.readline()
|
trg_line = trg_file.readline()
|
||||||
if separator in src_line or separator in trg_line:
|
if separator in src_line or separator in trg_line:
|
||||||
|
@ -4,4 +4,4 @@ CORPUS_NAME="europarl_sample"
|
|||||||
SRC_LANG_ID=1
|
SRC_LANG_ID=1
|
||||||
TRG_LANG_ID=2
|
TRG_LANG_ID=2
|
||||||
|
|
||||||
./addAlignedLemmatizedTM.py $CORPUS_NAME ../mgiza-aligner/corpora/$CORPUS_NAME/src_clean.txt $SRC_LANG_ID ../mgiza-aligner/corpora/$CORPUS_NAME/trg_clean.txt $TRG_LANG_ID ../mgiza-aligner/corpora/$CORPUS_NAME/aligned.txt
|
./addAlignedLemmatizedTM.py $CORPUS_NAME ../mgiza-aligner/corpora/$CORPUS_NAME/src_final.txt $SRC_LANG_ID ../mgiza-aligner/corpora/$CORPUS_NAME/trg_final.txt $TRG_LANG_ID ../mgiza-aligner/corpora/$CORPUS_NAME/aligned_final.txt
|
||||||
|
Loading…
Reference in New Issue
Block a user