# Patch: feed dictionary entries into the mgiza alignment pipeline and emit
# *_final.txt outputs.
#
# Per file:
#   mgiza-aligner/Makefile
#     - adds DICTIONARY_WEIGHT; `all` now builds aligned_final.txt,
#       src_final.txt and trg_final.txt;
#     - src.lem / trg.lem are the lemmatized clean corpus (*_clean.lem)
#       concatenated with a dictionary dump (*.dict) from collect_dict.py;
#     - tokenizing and cleaning run directly on src.txt / trg.txt; pasting and
#       de-duplication now run on the cleaned text;
#     - extract.py derives the *_final files from pasted_deduplicated.txt and
#       aligned.txt;
#     - clean-intermediate-files additionally removes *.tok, *.dict,
#       aligned.txt, aligned.gizacfg and *_clean.txt; clean removes *_final.txt.
#   mgiza-aligner/clean_corpus.py
#     - reads separate src/trg text and token files; separator is argv[9].
#   mgiza-aligner/collect_dict.py (new)
#     - for every directory under dictionaries/ holding both <src>.bz2 and
#       <trg>.bz2, prints each line of the <src> file `weight` times.
#   mgiza-aligner/cut.py (deleted)
#   mgiza-aligner/dictionaries/test/{en,pl}.bz2 (new, binary, payload absent)
#   mgiza-aligner/extract.py (new)
#     - for each line of the pasted file, writes the src/trg halves (text after
#       the 12-character prefix) and copies the 3-line "# Sentence pair (N)"
#       record with the same number, skipping records with other numbers.
#   mgiza-aligner/paste.py
#     - the `index` counter starts at 1 instead of 0.
#   tests/addLemmatizedTM.sh
#     - loads the *_final.txt files.
#
# Amendments relative to the patch as originally recorded:
#   - Makefile: .PHONY and .DELETE_ON_ERROR declarations added;
#   - Makefile: src.dict / trg.dict depend on collect_dict.py and the
#     dictionary archives, so they are rebuilt when either changes;
#   - collect_dict.py: dictionary directories are visited in sorted order.
#   New-side hunk positions/counts were adjusted for the added lines.
#
# NOTE(review): this patch was restored from a copy whose line breaks had been
# lost. Blank context lines cannot be recovered from that copy; in the third
# Makefile hunk two of them were placed (after each `mono ...` recipe) so the
# hunk matches its declared 25 old lines - confirm against the target tree
# before applying. The `index` hashes are carried over unchanged and do not
# describe the amended contents.
diff --git a/mgiza-aligner/Makefile b/mgiza-aligner/Makefile
index a93731a..f247dda 100644
--- a/mgiza-aligner/Makefile
+++ b/mgiza-aligner/Makefile
@@ -2,8 +2,14 @@ SRC_LANG=pl
 TRG_LANG=en
 CORPUS_NAME=europarl_sample
 SEPARATOR=@\#@
+DICTIONARY_WEIGHT=5
+
+# These goals are commands, not files.
+.PHONY: all clean clean-intermediate-files
+# Delete a target whose recipe failed, so a truncated file is never taken as up to date.
+.DELETE_ON_ERROR:
 
-all: corpora/$(CORPUS_NAME)/aligned.txt corpora/$(CORPUS_NAME)/src_clean.txt corpora/$(CORPUS_NAME)/trg_clean.txt
+all: corpora/$(CORPUS_NAME)/aligned_final.txt corpora/$(CORPUS_NAME)/src_final.txt corpora/$(CORPUS_NAME)/trg_final.txt
 
 corpora/$(CORPUS_NAME)/aligned.txt: corpora/$(CORPUS_NAME)/giza.cfg corpora/$(CORPUS_NAME)/src.lem_trg.lem.cooc corpora/$(CORPUS_NAME)/src.lem_trg.lem.snt corpora/$(CORPUS_NAME)/src.lem.vcb corpora/$(CORPUS_NAME)/trg.lem.vcb
 	mgiza/mgizapp/bin/mgiza corpora/$(CORPUS_NAME)/giza.cfg
@@ -11,26 +17,26 @@ corpora/$(CORPUS_NAME)/aligned.txt: corpora/$(CORPUS_NAME)/giza.cfg corpora/$(CO
 
 clean-intermediate-files:
 	rm -f corpora/$(CORPUS_NAME)/*.lem
+	rm -f corpora/$(CORPUS_NAME)/*.tok
+	rm -f corpora/$(CORPUS_NAME)/*.dict
 	rm -f corpora/$(CORPUS_NAME)/*.classes
 	rm -f corpora/$(CORPUS_NAME)/*.classes.cats
 	rm -f corpora/$(CORPUS_NAME)/*.vcb
 	rm -f corpora/$(CORPUS_NAME)/*.snt
 	rm -f corpora/$(CORPUS_NAME)/*.cooc
 	rm -f corpora/$(CORPUS_NAME)/aligned*part*
+	rm -f corpora/$(CORPUS_NAME)/aligned.txt
 	rm -f corpora/$(CORPUS_NAME)/giza.cfg
+	rm -f corpora/$(CORPUS_NAME)/aligned.gizacfg
 	rm -f corpora/$(CORPUS_NAME)/pasted.txt
 	rm -f corpora/$(CORPUS_NAME)/pasted_deduplicated.txt
-	rm -f corpora/$(CORPUS_NAME)/src_deduplicated.txt
-	rm -f corpora/$(CORPUS_NAME)/trg_deduplicated.txt
-	rm -f corpora/$(CORPUS_NAME)/src_deduplicated.tok
-	rm -f corpora/$(CORPUS_NAME)/trg_deduplicated.tok
-	rm -f corpora/$(CORPUS_NAME)/src_clean.tok
-	rm -f corpora/$(CORPUS_NAME)/trg_clean.tok
-
-clean: clean-intermediate-files
 	rm -f corpora/$(CORPUS_NAME)/src_clean.txt
 	rm -f corpora/$(CORPUS_NAME)/trg_clean.txt
-	rm -f corpora/$(CORPUS_NAME)/aligned*
+
+clean: clean-intermediate-files
+	rm -f corpora/$(CORPUS_NAME)/src_final.txt
+	rm -f corpora/$(CORPUS_NAME)/trg_final.txt
+	rm -f corpora/$(CORPUS_NAME)/aligned_final.txt
 
 corpora/$(CORPUS_NAME)/giza.cfg: giza.cfg.pattern
 	sed 's/CORPUS_NAME/'$(CORPUS_NAME)'/' < $< > $@
@@ -44,25 +50,43 @@ corpora/$(CORPUS_NAME)/src.lem_trg.lem.snt corpora/$(CORPUS_NAME)/trg.lem_src.le
 corpora/$(CORPUS_NAME)/%.classes: corpora/$(CORPUS_NAME)/%.lem
 	mgiza/mgizapp/bin/mkcls -n10 -p$< -V$@
 
-corpora/$(CORPUS_NAME)/trg.lem: corpora/$(CORPUS_NAME)/trg_clean.tok
+corpora/$(CORPUS_NAME)/trg.lem: corpora/$(CORPUS_NAME)/trg_clean.lem corpora/$(CORPUS_NAME)/trg.dict
+	cat corpora/$(CORPUS_NAME)/trg_clean.lem corpora/$(CORPUS_NAME)/trg.dict > $@
+
+corpora/$(CORPUS_NAME)/src.lem: corpora/$(CORPUS_NAME)/src_clean.lem corpora/$(CORPUS_NAME)/src.dict
+	cat corpora/$(CORPUS_NAME)/src_clean.lem corpora/$(CORPUS_NAME)/src.dict > $@
+
+# Rebuild the dictionary dumps when the collector script or a dictionary archive changes.
+corpora/$(CORPUS_NAME)/src.dict: collect_dict.py $(wildcard dictionaries/*/*.bz2)
+	./collect_dict.py $(SRC_LANG) $(TRG_LANG) $(DICTIONARY_WEIGHT) > $@
+
+corpora/$(CORPUS_NAME)/trg.dict: collect_dict.py $(wildcard dictionaries/*/*.bz2)
+	./collect_dict.py $(TRG_LANG) $(SRC_LANG) $(DICTIONARY_WEIGHT) > $@
+
+
+corpora/$(CORPUS_NAME)/trg_clean.lem: corpora/$(CORPUS_NAME)/trg_clean.tok
 	mono LemmaGenSentenceLemmatizer/LemmaGenSentenceLemmatizer/bin/Debug/LemmaGenSentenceLemmatizer.exe $(TRG_LANG) < $< > $@
 
-
-corpora/$(CORPUS_NAME)/src.lem: corpora/$(CORPUS_NAME)/src_clean.tok
+corpora/$(CORPUS_NAME)/src_clean.lem: corpora/$(CORPUS_NAME)/src_clean.tok
 	mono LemmaGenSentenceLemmatizer/LemmaGenSentenceLemmatizer/bin/Debug/LemmaGenSentenceLemmatizer.exe $(SRC_LANG) < $< > $@
 
 
-corpora/$(CORPUS_NAME)/src_clean.txt corpora/$(CORPUS_NAME)/trg_clean.txt corpora/$(CORPUS_NAME)/src_clean.tok corpora/$(CORPUS_NAME)/trg_clean.tok: corpora/$(CORPUS_NAME)/pasted_deduplicated.txt corpora/$(CORPUS_NAME)/src_deduplicated.tok corpora/$(CORPUS_NAME)/trg_deduplicated.tok
-	./clean_corpus.py $< corpora/$(CORPUS_NAME)/src_deduplicated.tok corpora/$(CORPUS_NAME)/trg_deduplicated.tok corpora/$(CORPUS_NAME)/src_clean.txt corpora/$(CORPUS_NAME)/trg_clean.txt corpora/$(CORPUS_NAME)/src_clean.tok corpora/$(CORPUS_NAME)/trg_clean.tok $(SEPARATOR)
-
-corpora/$(CORPUS_NAME)/%_deduplicated.tok: corpora/$(CORPUS_NAME)/%_deduplicated.txt
+corpora/$(CORPUS_NAME)/trg.tok: corpora/$(CORPUS_NAME)/trg.txt
 	concordia-sentence-tokenizer -c ../concordia.cfg < $< > $@
 
-corpora/$(CORPUS_NAME)/src_deduplicated.txt corpora/$(CORPUS_NAME)/trg_deduplicated.txt: corpora/$(CORPUS_NAME)/pasted_deduplicated.txt
-	./cut.py $< corpora/$(CORPUS_NAME)/src_deduplicated.txt corpora/$(CORPUS_NAME)/trg_deduplicated.txt $(SEPARATOR)
+
+corpora/$(CORPUS_NAME)/src.tok: corpora/$(CORPUS_NAME)/src.txt
+	concordia-sentence-tokenizer -c ../concordia.cfg < $< > $@
+
+
+corpora/$(CORPUS_NAME)/src_clean.txt corpora/$(CORPUS_NAME)/trg_clean.txt corpora/$(CORPUS_NAME)/src_clean.tok corpora/$(CORPUS_NAME)/trg_clean.tok: corpora/$(CORPUS_NAME)/src.txt corpora/$(CORPUS_NAME)/trg.txt corpora/$(CORPUS_NAME)/src.tok corpora/$(CORPUS_NAME)/trg.tok
+	./clean_corpus.py corpora/$(CORPUS_NAME)/src.txt corpora/$(CORPUS_NAME)/trg.txt corpora/$(CORPUS_NAME)/src.tok corpora/$(CORPUS_NAME)/trg.tok corpora/$(CORPUS_NAME)/src_clean.txt corpora/$(CORPUS_NAME)/trg_clean.txt corpora/$(CORPUS_NAME)/src_clean.tok corpora/$(CORPUS_NAME)/trg_clean.tok $(SEPARATOR)
+
+corpora/$(CORPUS_NAME)/pasted.txt: corpora/$(CORPUS_NAME)/src_clean.txt corpora/$(CORPUS_NAME)/trg_clean.txt
+	./paste.py corpora/$(CORPUS_NAME)/src_clean.txt corpora/$(CORPUS_NAME)/trg_clean.txt $(SEPARATOR)> $@
 
 corpora/$(CORPUS_NAME)/pasted_deduplicated.txt: corpora/$(CORPUS_NAME)/pasted.txt
 	sort -k 1.13 $< | uniq -s 12 | sort > $@
 
-corpora/$(CORPUS_NAME)/pasted.txt: corpora/$(CORPUS_NAME)/src.txt corpora/$(CORPUS_NAME)/trg.txt
-	./paste.py corpora/$(CORPUS_NAME)/src.txt corpora/$(CORPUS_NAME)/trg.txt $(SEPARATOR)> $@
+corpora/$(CORPUS_NAME)/aligned_final.txt corpora/$(CORPUS_NAME)/src_final.txt corpora/$(CORPUS_NAME)/trg_final.txt: corpora/$(CORPUS_NAME)/pasted_deduplicated.txt corpora/$(CORPUS_NAME)/aligned.txt
+	./extract.py $< corpora/$(CORPUS_NAME)/aligned.txt corpora/$(CORPUS_NAME)/aligned_final.txt corpora/$(CORPUS_NAME)/src_final.txt corpora/$(CORPUS_NAME)/trg_final.txt $(SEPARATOR)
diff --git a/mgiza-aligner/clean_corpus.py b/mgiza-aligner/clean_corpus.py
index 6c9ed9f..a545ff4 100755
--- a/mgiza-aligner/clean_corpus.py
+++ b/mgiza-aligner/clean_corpus.py
@@ -6,13 +6,14 @@ import sys
 max_tokens = 100
 max_ratio = 4.0
 
-separator = sys.argv[8]
+separator = sys.argv[9]
 
-with open(sys.argv[1]) as pasted_file, open(sys.argv[2]) as src_deduplicated_tok, open(sys.argv[3]) as trg_deduplicated_tok, open(sys.argv[4], 'w') as src_clean, open(sys.argv[5], 'w') as trg_clean, open(sys.argv[6], 'w') as src_clean_tok, open(sys.argv[7], 'w') as trg_clean_tok:
-    for line in pasted_file:
-        src_line_orig, trg_line_orig = line.strip()[12:].split(separator)
-        src_line_tok = src_deduplicated_tok.readline().strip()
-        trg_line_tok = trg_deduplicated_tok.readline().strip()
+with open(sys.argv[1]) as src_file, open(sys.argv[2]) as trg_file, open(sys.argv[3]) as src_tok, open(sys.argv[4]) as trg_tok, open(sys.argv[5], 'w') as src_clean, open(sys.argv[6], 'w') as trg_clean, open(sys.argv[7], 'w') as src_clean_tok, open(sys.argv[8], 'w') as trg_clean_tok:
+    for line in src_file:
+        src_line_orig = line.strip()
+        trg_line_orig = trg_file.readline().strip()
+        src_line_tok = src_tok.readline().strip()
+        trg_line_tok = trg_tok.readline().strip()
         src_token_count = len(src_line_tok.split())
         trg_token_count = len(trg_line_tok.split())
         if (src_token_count > 0 and trg_token_count > 0 and src_token_count <= max_tokens and trg_token_count <= max_tokens):
diff --git a/mgiza-aligner/collect_dict.py b/mgiza-aligner/collect_dict.py
new file mode 100755
index 0000000..3aacaba
--- /dev/null
+++ b/mgiza-aligner/collect_dict.py
@@ -0,0 +1,19 @@
+#!/usr/bin/python3
+# -*- coding: utf-8 -*-
+
+import sys, os, bz2
+
+src_lang = sys.argv[1]
+trg_lang = sys.argv[2]
+weight = int(sys.argv[3])
+
+# sorted(): os.listdir() returns names in arbitrary order, and the src and trg
+# dumps come from two separate runs that must visit dictionaries identically.
+for dname in sorted(os.listdir('dictionaries')):
+    src_path = 'dictionaries/%s/%s.bz2' % (dname, src_lang)
+    trg_path = 'dictionaries/%s/%s.bz2' % (dname, trg_lang)
+    if os.path.isfile(src_path) and os.path.isfile(trg_path):
+        with bz2.open(src_path, 'rt') as src_dict_file:
+            for line in src_dict_file:
+                for i in range(weight):
+                    print(line.strip())
diff --git a/mgiza-aligner/cut.py b/mgiza-aligner/cut.py
deleted file mode 100755
index e8361ed..0000000
--- a/mgiza-aligner/cut.py
+++ /dev/null
@@ -1,12 +0,0 @@
-#!/usr/bin/python3
-# -*- coding: utf-8 -*-
-
-import sys
-
-separator = sys.argv[4]
-
-with open(sys.argv[1]) as pasted_file, open(sys.argv[2], 'w') as src_file, open(sys.argv[3], 'w') as trg_file:
-    for line in pasted_file:
-        src_line, trg_line = line.strip()[12:].split(separator)
-        src_file.write(src_line+"\n")
-        trg_file.write(trg_line+"\n")
diff --git a/mgiza-aligner/dictionaries/test/en.bz2 b/mgiza-aligner/dictionaries/test/en.bz2
new file mode 100644
index 0000000..86f35c9
Binary files /dev/null and b/mgiza-aligner/dictionaries/test/en.bz2 differ
diff --git a/mgiza-aligner/dictionaries/test/pl.bz2 b/mgiza-aligner/dictionaries/test/pl.bz2
new file mode 100644
index 0000000..5fe8c26
Binary files /dev/null and b/mgiza-aligner/dictionaries/test/pl.bz2 differ
diff --git a/mgiza-aligner/extract.py b/mgiza-aligner/extract.py
new file mode 100755
index 0000000..97afa52
--- /dev/null
+++ b/mgiza-aligner/extract.py
@@ -0,0 +1,36 @@
+#!/usr/bin/python3
+# -*- coding: utf-8 -*-
+
+import sys, re
+
+separator = sys.argv[6]
+
+pair_pattern = re.compile(r'^# Sentence pair \((\d+)\).*')
+
+def getNextSentencePair(f):
+    result = dict()
+    first_line = f.readline()
+    m = pair_pattern.match(first_line)
+    if m:
+        result['number'] = int(m.group(1))
+    else:
+        raise Exception("Could not read sentence pair number from line: %s" %first_line)
+    lines = [first_line]
+    lines.append(f.readline())
+    lines.append(f.readline())
+    result['lines'] = lines
+    return result
+
+with open(sys.argv[1]) as pasted_file, open(sys.argv[2]) as aligned_file, open(sys.argv[3], 'w') as aligned_final, open(sys.argv[4], 'w') as src_final, open(sys.argv[5], 'w') as trg_final:
+    for line in pasted_file:
+        src_line, trg_line = line.strip()[12:].split(separator)
+        src_final.write(src_line+"\n")
+        trg_final.write(trg_line+"\n")
+
+        number = int(line[:12])
+        found = False
+        while not found:
+            sentence_pair = getNextSentencePair(aligned_file)
+            if (sentence_pair['number'] == number):
+                found = True
+                aligned_final.write(''.join(sentence_pair['lines']))
diff --git a/mgiza-aligner/paste.py b/mgiza-aligner/paste.py
index 77fba84..240589f 100755
--- a/mgiza-aligner/paste.py
+++ b/mgiza-aligner/paste.py
@@ -6,7 +6,7 @@ import sys
 separator = sys.argv[3]
 
 with open(sys.argv[1]) as src_file, open(sys.argv[2]) as trg_file:
-    index = 0
+    index = 1
     for src_line in src_file:
         trg_line = trg_file.readline()
         if separator in src_line or separator in trg_line:
diff --git a/tests/addLemmatizedTM.sh b/tests/addLemmatizedTM.sh
index fc34843..282f2c3 100755
--- a/tests/addLemmatizedTM.sh
+++ b/tests/addLemmatizedTM.sh
@@ -4,4 +4,4 @@ CORPUS_NAME="europarl_sample"
 SRC_LANG_ID=1
 TRG_LANG_ID=2
 
-./addAlignedLemmatizedTM.py $CORPUS_NAME ../mgiza-aligner/corpora/$CORPUS_NAME/src_clean.txt $SRC_LANG_ID ../mgiza-aligner/corpora/$CORPUS_NAME/trg_clean.txt $TRG_LANG_ID ../mgiza-aligner/corpora/$CORPUS_NAME/aligned.txt
+./addAlignedLemmatizedTM.py $CORPUS_NAME ../mgiza-aligner/corpora/$CORPUS_NAME/src_final.txt $SRC_LANG_ID ../mgiza-aligner/corpora/$CORPUS_NAME/trg_final.txt $TRG_LANG_ID ../mgiza-aligner/corpora/$CORPUS_NAME/aligned_final.txt