diff --git a/mgiza-aligner/dictionaries/enhr/en.bz2 b/dictionaries/enhr/en.bz2 similarity index 100% rename from mgiza-aligner/dictionaries/enhr/en.bz2 rename to dictionaries/enhr/en.bz2 diff --git a/mgiza-aligner/dictionaries/enhr/hr.bz2 b/dictionaries/enhr/hr.bz2 similarity index 100% rename from mgiza-aligner/dictionaries/enhr/hr.bz2 rename to dictionaries/enhr/hr.bz2 diff --git a/mgiza-aligner/dictionaries/icd/en.bz2 b/dictionaries/icd/en.bz2 similarity index 100% rename from mgiza-aligner/dictionaries/icd/en.bz2 rename to dictionaries/icd/en.bz2 diff --git a/mgiza-aligner/dictionaries/icd/pl.bz2 b/dictionaries/icd/pl.bz2 similarity index 100% rename from mgiza-aligner/dictionaries/icd/pl.bz2 rename to dictionaries/icd/pl.bz2 diff --git a/mgiza-aligner/dictionaries/logofag/en.bz2 b/dictionaries/logofag/en.bz2 similarity index 100% rename from mgiza-aligner/dictionaries/logofag/en.bz2 rename to dictionaries/logofag/en.bz2 diff --git a/mgiza-aligner/dictionaries/logofag/pl.bz2 b/dictionaries/logofag/pl.bz2 similarity index 100% rename from mgiza-aligner/dictionaries/logofag/pl.bz2 rename to dictionaries/logofag/pl.bz2 diff --git a/mgiza-aligner/dictionaries/logofag_full/en.bz2 b/dictionaries/logofag_full/en.bz2 similarity index 100% rename from mgiza-aligner/dictionaries/logofag_full/en.bz2 rename to dictionaries/logofag_full/en.bz2 diff --git a/mgiza-aligner/dictionaries/logofag_full/pl.bz2 b/dictionaries/logofag_full/pl.bz2 similarity index 100% rename from mgiza-aligner/dictionaries/logofag_full/pl.bz2 rename to dictionaries/logofag_full/pl.bz2 diff --git a/mgiza-aligner/dictionaries/test/en.bz2 b/dictionaries/test/en.bz2 similarity index 100% rename from mgiza-aligner/dictionaries/test/en.bz2 rename to dictionaries/test/en.bz2 diff --git a/mgiza-aligner/dictionaries/test/pl.bz2 b/dictionaries/test/pl.bz2 similarity index 100% rename from mgiza-aligner/dictionaries/test/pl.bz2 rename to dictionaries/test/pl.bz2 diff --git a/fast-aligner/Makefile b/fast-aligner/Makefile index 0a8d2d9..1ba1f54 100644 --- a/fast-aligner/Makefile +++ b/fast-aligner/Makefile @@ -5,16 +5,23 @@ SEPARATOR=@\#@ DICTIONARY_WEIGHT=3 -all: corpora/$(CORPUS_NAME)/falign_corpus.txt +all: corpora/$(CORPUS_NAME)/alignments.txt corpora/$(CORPUS_NAME)/src_clean.txt corpora/$(CORPUS_NAME)/src_clean.lem corpora/$(CORPUS_NAME)/trg_clean.txt clean: rm -f corpora/$(CORPUS_NAME)/*.lem rm -f corpora/$(CORPUS_NAME)/*.dict rm -f corpora/$(CORPUS_NAME)/src_clean.txt + rm -f corpora/$(CORPUS_NAME)/src_clean.lem rm -f corpora/$(CORPUS_NAME)/trg_clean.txt rm -f corpora/$(CORPUS_NAME)/ids_clean.txt rm -f corpora/$(CORPUS_NAME)/falign_corpus.txt + rm -f corpora/$(CORPUS_NAME)/falign_result.txt + rm -f corpora/$(CORPUS_NAME)/alignments.txt + + +corpora/$(CORPUS_NAME)/alignments.txt: corpora/$(CORPUS_NAME)/falign_result.txt corpora/$(CORPUS_NAME)/src_clean.lem + ./get_alignments.py corpora/$(CORPUS_NAME)/falign_result.txt corpora/$(CORPUS_NAME)/src_clean.lem > $@ corpora/$(CORPUS_NAME)/src.dict: ./collect_dict.py $(SRC_LANG) $(TRG_LANG) $(DICTIONARY_WEIGHT) > $@ @@ -30,16 +37,9 @@ corpora/$(CORPUS_NAME)/trg.lem: corpora/$(CORPUS_NAME)/trg.txt /usr/local/bin/concordia-sentence-tokenizer -c ../concordia.cfg < $< | ./sentence_lemmatizer.py $(TRG_LANG) > $@ +corpora/$(CORPUS_NAME)/src_clean.txt corpora/$(CORPUS_NAME)/src_clean.lem corpora/$(CORPUS_NAME)/trg_clean.txt corpora/$(CORPUS_NAME)/ids_clean.txt corpora/$(CORPUS_NAME)/falign_corpus.txt: corpora/$(CORPUS_NAME)/src.txt corpora/$(CORPUS_NAME)/trg.txt corpora/$(CORPUS_NAME)/ids.txt corpora/$(CORPUS_NAME)/src.lem corpora/$(CORPUS_NAME)/trg.lem corpora/$(CORPUS_NAME)/src.dict corpora/$(CORPUS_NAME)/trg.dict + ./prepare_corpus.py corpora/$(CORPUS_NAME)/src.txt corpora/$(CORPUS_NAME)/trg.txt corpora/$(CORPUS_NAME)/ids.txt corpora/$(CORPUS_NAME)/src.lem corpora/$(CORPUS_NAME)/trg.lem corpora/$(CORPUS_NAME)/src.dict corpora/$(CORPUS_NAME)/trg.dict corpora/$(CORPUS_NAME)/src_clean.txt corpora/$(CORPUS_NAME)/src_clean.lem corpora/$(CORPUS_NAME)/trg_clean.txt corpora/$(CORPUS_NAME)/ids_clean.txt corpora/$(CORPUS_NAME)/falign_corpus.txt -corpora/$(CORPUS_NAME)/src_clean.txt corpora/$(CORPUS_NAME)/trg_clean.txt corpora/$(CORPUS_NAME)/ids_clean.txt corpora/$(CORPUS_NAME)/falign_corpus.txt: corpora/$(CORPUS_NAME)/src.txt corpora/$(CORPUS_NAME)/trg.txt corpora/$(CORPUS_NAME)/ids.txt corpora/$(CORPUS_NAME)/src.lem corpora/$(CORPUS_NAME)/trg.lem - ./clean_corpus.py corpora/$(CORPUS_NAME)/src.txt corpora/$(CORPUS_NAME)/trg.txt corpora/$(CORPUS_NAME)/ids.txt corpora/$(CORPUS_NAME)/src.lem corpora/$(CORPUS_NAME)/trg.lem corpora/$(CORPUS_NAME)/src_clean.txt corpora/$(CORPUS_NAME)/trg_clean.txt corpora/$(CORPUS_NAME)/ids_clean.txt corpora/$(CORPUS_NAME)/falign_corpus.txt - -corpora/$(CORPUS_NAME)/pasted.txt: corpora/$(CORPUS_NAME)/src_clean.txt corpora/$(CORPUS_NAME)/trg_clean.txt - ./paste.py corpora/$(CORPUS_NAME)/src_clean.txt corpora/$(CORPUS_NAME)/trg_clean.txt $(SEPARATOR)> $@ - -corpora/$(CORPUS_NAME)/pasted_deduplicated.txt: corpora/$(CORPUS_NAME)/pasted.txt - sort -k 1.13 $< | uniq -s 12 | sort > $@ - -corpora/$(CORPUS_NAME)/aligned_final.txt corpora/$(CORPUS_NAME)/src_final.txt corpora/$(CORPUS_NAME)/trg_final.txt: corpora/$(CORPUS_NAME)/pasted_deduplicated.txt corpora/$(CORPUS_NAME)/aligned.txt - ./extract.py $< corpora/$(CORPUS_NAME)/aligned.txt corpora/$(CORPUS_NAME)/aligned_final.txt corpora/$(CORPUS_NAME)/src_final.txt corpora/$(CORPUS_NAME)/trg_final.txt $(SEPARATOR) +corpora/$(CORPUS_NAME)/falign_result.txt: corpora/$(CORPUS_NAME)/falign_corpus.txt + fast_align -i $< -d -o -v > $@ \ No newline at end of file diff --git a/fast-aligner/collect_dict.py b/fast-aligner/collect_dict.py index 3aacaba..9459673 100755 --- a/fast-aligner/collect_dict.py +++ b/fast-aligner/collect_dict.py @@ -7,9 +7,11 @@ src_lang = sys.argv[1] trg_lang = sys.argv[2] weight = int(sys.argv[3]) -for dname in os.listdir('dictionaries'): - src_path = 'dictionaries/%s/%s.bz2' % (dname, src_lang) - trg_path = 'dictionaries/%s/%s.bz2' % (dname, trg_lang) +dictionaries_path = '../dictionaries' + +for dname in os.listdir(dictionaries_path): + src_path = '%s/%s/%s.bz2' % (dictionaries_path, dname, src_lang) + trg_path = '%s/%s/%s.bz2' % (dictionaries_path, dname, trg_lang) if os.path.isfile(src_path) and os.path.isfile(trg_path): with bz2.open(src_path, 'rt') as src_dict_file: for line in src_dict_file: diff --git a/fast-aligner/get_alignments.py b/fast-aligner/get_alignments.py new file mode 100755 index 0000000..6ab644d --- /dev/null +++ b/fast-aligner/get_alignments.py @@ -0,0 +1,25 @@ +#!/usr/bin/python3 +# -*- coding: utf-8 -*- + +import sys + + +with open(sys.argv[1]) as falign_result, open(sys.argv[2]) as src_clean_lem: + for line in src_clean_lem: + src_line_lem = line.strip() + falign_alignment_pairs = falign_result.readline().rstrip().split() + falign_map = {} + for pair in falign_alignment_pairs: + numbers = pair.split('-') + s = int(numbers[0]) + t = int(numbers[1]) + if not s in falign_map: + falign_map[s] = [] + falign_map[s].append(t) + res = [] + for i in range(len(src_line_lem.split())): + if i in falign_map: + res.append(falign_map[i]) + else: + res.append([]) + print(res) diff --git a/fast-aligner/clean_corpus.py b/fast-aligner/prepare_corpus.py similarity index 50% rename from fast-aligner/clean_corpus.py rename to fast-aligner/prepare_corpus.py index 3b5ecbc..1853067 100755 --- a/fast-aligner/clean_corpus.py +++ b/fast-aligner/prepare_corpus.py @@ -6,10 +6,12 @@ import sys max_tokens = 100 max_ratio = 4.0 -#./clean_corpus.py corpora/$(CORPUS_NAME)/src.txt corpora/$(CORPUS_NAME)/trg.txt corpora/$(CORPUS_NAME)/ids.txt corpora/$(CORPUS_NAME)/src.lem corpora/$(CORPUS_NAME)/trg.lem corpora/$(CORPUS_NAME)/src_clean.txt corpora/$(CORPUS_NAME)/trg_clean.txt corpora/$(CORPUS_NAME)/ids_clean.txt corpora/$(CORPUS_NAME)/falign_corpus.txt +#./prepare_corpus.py corpora/$(CORPUS_NAME)/src.txt corpora/$(CORPUS_NAME)/trg.txt corpora/$(CORPUS_NAME)/ids.txt corpora/$(CORPUS_NAME)/src.lem corpora/$(CORPUS_NAME)/trg.lem corpora/$(CORPUS_NAME)/src.dict corpora/$(CORPUS_NAME)/trg.dict corpora/$(CORPUS_NAME)/src_clean.txt corpora/$(CORPUS_NAME)/src_clean.lem corpora/$(CORPUS_NAME)/trg_clean.txt corpora/$(CORPUS_NAME)/ids_clean.txt corpora/$(CORPUS_NAME)/falign_corpus.txt -with open(sys.argv[1]) as src, open(sys.argv[2]) as trg, open(sys.argv[3]) as ids, open(sys.argv[4]) as src_lem, open(sys.argv[5]) as trg_lem, open(sys.argv[6], 'w') as src_clean, open(sys.argv[7], 'w') as trg_clean, open(sys.argv[8], 'w') as ids_clean, open(sys.argv[9], 'w') as falign_corpus: + + +with open(sys.argv[1]) as src, open(sys.argv[2]) as trg, open(sys.argv[3]) as ids, open(sys.argv[4]) as src_lem, open(sys.argv[5]) as trg_lem, open(sys.argv[6]) as src_dict, open(sys.argv[7]) as trg_dict, open(sys.argv[8], 'w') as src_clean, open(sys.argv[9], 'w') as src_clean_lem, open(sys.argv[10], 'w') as trg_clean, open(sys.argv[11], 'w') as ids_clean, open(sys.argv[12], 'w') as falign_corpus: for line in src: src_line_orig = line.strip() trg_line_orig = trg.readline().strip() @@ -22,6 +24,11 @@ with open(sys.argv[1]) as src, open(sys.argv[2]) as trg, open(sys.argv[3]) as id ratio = float(src_token_count/trg_token_count) if src_token_count > trg_token_count else float(trg_token_count/src_token_count) if (ratio <= max_ratio): src_clean.write(src_line_orig+"\n") + src_clean_lem.write(src_line_lem+"\n") trg_clean.write(trg_line_orig+"\n") ids_clean.write(id_orig+"\n") - falign_corpus.write("%s|||%s\n" % (src_line_lem, trg_line_lem)) + falign_corpus.write("%s ||| %s\n" % (src_line_lem, trg_line_lem)) + for line in src_dict: + src_word = line.rstrip() + trg_word = trg_dict.readline().rstrip() + falign_corpus.write("%s ||| %s\n" % (src_word, trg_word)) \ No newline at end of file