fast aligner

This commit is contained in:
Rafał Jaworski 2019-02-21 14:02:51 +01:00
parent ffe20cdc3b
commit b7da815f74
14 changed files with 52 additions and 18 deletions

View File

@ -5,16 +5,23 @@ SEPARATOR=@\#@
DICTIONARY_WEIGHT=3 DICTIONARY_WEIGHT=3
all: corpora/$(CORPUS_NAME)/falign_corpus.txt all: corpora/$(CORPUS_NAME)/alignments.txt corpora/$(CORPUS_NAME)/src_clean.txt corpora/$(CORPUS_NAME)/src_clean.lem corpora/$(CORPUS_NAME)/trg_clean.txt
clean: clean:
rm -f corpora/$(CORPUS_NAME)/*.lem rm -f corpora/$(CORPUS_NAME)/*.lem
rm -f corpora/$(CORPUS_NAME)/*.dict rm -f corpora/$(CORPUS_NAME)/*.dict
rm -f corpora/$(CORPUS_NAME)/src_clean.txt rm -f corpora/$(CORPUS_NAME)/src_clean.txt
rm -f corpora/$(CORPUS_NAME)/src_clean.lem
rm -f corpora/$(CORPUS_NAME)/trg_clean.txt rm -f corpora/$(CORPUS_NAME)/trg_clean.txt
rm -f corpora/$(CORPUS_NAME)/ids_clean.txt rm -f corpora/$(CORPUS_NAME)/ids_clean.txt
rm -f corpora/$(CORPUS_NAME)/falign_corpus.txt rm -f corpora/$(CORPUS_NAME)/falign_corpus.txt
rm -f corpora/$(CORPUS_NAME)/falign_result.txt
rm -f corpora/$(CORPUS_NAME)/alignments.txt
corpora/$(CORPUS_NAME)/alignments.txt: corpora/$(CORPUS_NAME)/falign_result.txt corpora/$(CORPUS_NAME)/src_clean.lem
./get_alignments.py corpora/$(CORPUS_NAME)/falign_result.txt corpora/$(CORPUS_NAME)/src_clean.lem > $@
corpora/$(CORPUS_NAME)/src.dict: corpora/$(CORPUS_NAME)/src.dict:
./collect_dict.py $(SRC_LANG) $(TRG_LANG) $(DICTIONARY_WEIGHT) > $@ ./collect_dict.py $(SRC_LANG) $(TRG_LANG) $(DICTIONARY_WEIGHT) > $@
@ -30,16 +37,9 @@ corpora/$(CORPUS_NAME)/trg.lem: corpora/$(CORPUS_NAME)/trg.txt
/usr/local/bin/concordia-sentence-tokenizer -c ../concordia.cfg < $< | ./sentence_lemmatizer.py $(TRG_LANG) > $@ /usr/local/bin/concordia-sentence-tokenizer -c ../concordia.cfg < $< | ./sentence_lemmatizer.py $(TRG_LANG) > $@
corpora/$(CORPUS_NAME)/src_clean.txt corpora/$(CORPUS_NAME)/src_clean.lem corpora/$(CORPUS_NAME)/trg_clean.txt corpora/$(CORPUS_NAME)/ids_clean.txt corpora/$(CORPUS_NAME)/falign_corpus.txt: corpora/$(CORPUS_NAME)/src.txt corpora/$(CORPUS_NAME)/trg.txt corpora/$(CORPUS_NAME)/ids.txt corpora/$(CORPUS_NAME)/src.lem corpora/$(CORPUS_NAME)/trg.lem corpora/$(CORPUS_NAME)/src.dict corpora/$(CORPUS_NAME)/trg.dict
./prepare_corpus.py corpora/$(CORPUS_NAME)/src.txt corpora/$(CORPUS_NAME)/trg.txt corpora/$(CORPUS_NAME)/ids.txt corpora/$(CORPUS_NAME)/src.lem corpora/$(CORPUS_NAME)/trg.lem corpora/$(CORPUS_NAME)/src.dict corpora/$(CORPUS_NAME)/trg.dict corpora/$(CORPUS_NAME)/src_clean.txt corpora/$(CORPUS_NAME)/src_clean.lem corpora/$(CORPUS_NAME)/trg_clean.txt corpora/$(CORPUS_NAME)/ids_clean.txt corpora/$(CORPUS_NAME)/falign_corpus.txt
corpora/$(CORPUS_NAME)/src_clean.txt corpora/$(CORPUS_NAME)/trg_clean.txt corpora/$(CORPUS_NAME)/ids_clean.txt corpora/$(CORPUS_NAME)/falign_corpus.txt: corpora/$(CORPUS_NAME)/src.txt corpora/$(CORPUS_NAME)/trg.txt corpora/$(CORPUS_NAME)/ids.txt corpora/$(CORPUS_NAME)/src.lem corpora/$(CORPUS_NAME)/trg.lem corpora/$(CORPUS_NAME)/falign_result.txt: corpora/$(CORPUS_NAME)/falign_corpus.txt
./clean_corpus.py corpora/$(CORPUS_NAME)/src.txt corpora/$(CORPUS_NAME)/trg.txt corpora/$(CORPUS_NAME)/ids.txt corpora/$(CORPUS_NAME)/src.lem corpora/$(CORPUS_NAME)/trg.lem corpora/$(CORPUS_NAME)/src_clean.txt corpora/$(CORPUS_NAME)/trg_clean.txt corpora/$(CORPUS_NAME)/ids_clean.txt corpora/$(CORPUS_NAME)/falign_corpus.txt fast_align -i $< -d -o -v > $@
corpora/$(CORPUS_NAME)/pasted.txt: corpora/$(CORPUS_NAME)/src_clean.txt corpora/$(CORPUS_NAME)/trg_clean.txt
./paste.py corpora/$(CORPUS_NAME)/src_clean.txt corpora/$(CORPUS_NAME)/trg_clean.txt $(SEPARATOR)> $@
corpora/$(CORPUS_NAME)/pasted_deduplicated.txt: corpora/$(CORPUS_NAME)/pasted.txt
sort -k 1.13 $< | uniq -s 12 | sort > $@
corpora/$(CORPUS_NAME)/aligned_final.txt corpora/$(CORPUS_NAME)/src_final.txt corpora/$(CORPUS_NAME)/trg_final.txt: corpora/$(CORPUS_NAME)/pasted_deduplicated.txt corpora/$(CORPUS_NAME)/aligned.txt
./extract.py $< corpora/$(CORPUS_NAME)/aligned.txt corpora/$(CORPUS_NAME)/aligned_final.txt corpora/$(CORPUS_NAME)/src_final.txt corpora/$(CORPUS_NAME)/trg_final.txt $(SEPARATOR)

View File

@ -7,9 +7,11 @@ src_lang = sys.argv[1]
trg_lang = sys.argv[2] trg_lang = sys.argv[2]
weight = int(sys.argv[3]) weight = int(sys.argv[3])
for dname in os.listdir('dictionaries'): dictionaries_path = '../dictionaries'
src_path = 'dictionaries/%s/%s.bz2' % (dname, src_lang)
trg_path = 'dictionaries/%s/%s.bz2' % (dname, trg_lang) for dname in os.listdir(dictionaries_path):
src_path = '%s/%s/%s.bz2' % (dictionaries_path, dname, src_lang)
trg_path = '%s/%s/%s.bz2' % (dictionaries_path, dname, trg_lang)
if os.path.isfile(src_path) and os.path.isfile(trg_path): if os.path.isfile(src_path) and os.path.isfile(trg_path):
with bz2.open(src_path, 'rt') as src_dict_file: with bz2.open(src_path, 'rt') as src_dict_file:
for line in src_dict_file: for line in src_dict_file:

25
fast-aligner/get_alignments.py Executable file
View File

@ -0,0 +1,25 @@
#!/usr/bin/python3
# -*- coding: utf-8 -*-
import sys
with open(sys.argv[1]) as falign_result, open(sys.argv[2]) as src_clean_lem:
for line in src_clean_lem:
src_line_lem = line.strip()
falign_alignment_pairs = falign_result.readline().rstrip().split()
falign_map = {}
for pair in falign_alignment_pairs:
numbers = pair.split('-')
s = int(numbers[0])
t = int(numbers[1])
if not s in falign_map:
falign_map[s] = []
falign_map[s].append(t)
res = []
for i in range(len(src_line_lem.split())):
if i in falign_map:
res.append(falign_map[i])
else:
res.append([])
print(res)

View File

@ -6,10 +6,12 @@ import sys
max_tokens = 100 max_tokens = 100
max_ratio = 4.0 max_ratio = 4.0
#./clean_corpus.py corpora/$(CORPUS_NAME)/src.txt corpora/$(CORPUS_NAME)/trg.txt corpora/$(CORPUS_NAME)/ids.txt corpora/$(CORPUS_NAME)/src.lem corpora/$(CORPUS_NAME)/trg.lem corpora/$(CORPUS_NAME)/src_clean.txt corpora/$(CORPUS_NAME)/trg_clean.txt corpora/$(CORPUS_NAME)/ids_clean.txt corpora/$(CORPUS_NAME)/falign_corpus.txt #./prepare_corpus.py corpora/$(CORPUS_NAME)/src.txt corpora/$(CORPUS_NAME)/trg.txt corpora/$(CORPUS_NAME)/ids.txt corpora/$(CORPUS_NAME)/src.lem corpora/$(CORPUS_NAME)/trg.lem corpora/$(CORPUS_NAME)/src.dict corpora/$(CORPUS_NAME)/trg.dict corpora/$(CORPUS_NAME)/src_clean.txt corpora/$(CORPUS_NAME)/src_clean.lem corpora/$(CORPUS_NAME)/trg_clean.txt corpora/$(CORPUS_NAME)/ids_clean.txt corpora/$(CORPUS_NAME)/falign_corpus.txt
with open(sys.argv[1]) as src, open(sys.argv[2]) as trg, open(sys.argv[3]) as ids, open(sys.argv[4]) as src_lem, open(sys.argv[5]) as trg_lem, open(sys.argv[6], 'w') as src_clean, open(sys.argv[7], 'w') as trg_clean, open(sys.argv[8], 'w') as ids_clean, open(sys.argv[9], 'w') as falign_corpus:
with open(sys.argv[1]) as src, open(sys.argv[2]) as trg, open(sys.argv[3]) as ids, open(sys.argv[4]) as src_lem, open(sys.argv[5]) as trg_lem, open(sys.argv[6]) as src_dict, open(sys.argv[7]) as trg_dict, open(sys.argv[8], 'w') as src_clean, open(sys.argv[9], 'w') as src_clean_lem, open(sys.argv[10], 'w') as trg_clean, open(sys.argv[11], 'w') as ids_clean, open(sys.argv[12], 'w') as falign_corpus:
for line in src: for line in src:
src_line_orig = line.strip() src_line_orig = line.strip()
trg_line_orig = trg.readline().strip() trg_line_orig = trg.readline().strip()
@ -22,6 +24,11 @@ with open(sys.argv[1]) as src, open(sys.argv[2]) as trg, open(sys.argv[3]) as id
ratio = float(src_token_count/trg_token_count) if src_token_count > trg_token_count else float(trg_token_count/src_token_count) ratio = float(src_token_count/trg_token_count) if src_token_count > trg_token_count else float(trg_token_count/src_token_count)
if (ratio <= max_ratio): if (ratio <= max_ratio):
src_clean.write(src_line_orig+"\n") src_clean.write(src_line_orig+"\n")
src_clean_lem.write(src_line_lem+"\n")
trg_clean.write(trg_line_orig+"\n") trg_clean.write(trg_line_orig+"\n")
ids_clean.write(id_orig+"\n") ids_clean.write(id_orig+"\n")
falign_corpus.write("%s ||| %s\n" % (src_line_lem, trg_line_lem)) falign_corpus.write("%s ||| %s\n" % (src_line_lem, trg_line_lem))
for line in src_dict:
src_word = line.rstrip()
trg_word = trg_dict.readline().rstrip()
falign_corpus.write("%s ||| %s\n" % (src_word, trg_word))