Optimized sorting of GIZA alignments

2017-10-18 10:07:25 +02:00 · 2017-10-18 10:07:25 +02:00 · 38e51a90f7
commit 38e51a90f7
parent a2b6853dcf
2 changed files with 77 additions and 5 deletions
--- a/mgiza-aligner/Makefile
+++ b/mgiza-aligner/Makefile
@ -1,15 +1,15 @@
-SRC_LANG=en
+SRC_LANG=pl
 TRG_LANG=en
-CORPUS_NAME=jrc_enes
+CORPUS_NAME=europarl_sample
 SEPARATOR=@\#@
-DICTIONARY_WEIGHT=5
+DICTIONARY_WEIGHT=0
 all: corpora/$(CORPUS_NAME)/aligned_final.txt corpora/$(CORPUS_NAME)/src_final.txt corpora/$(CORPUS_NAME)/trg_final.txt
 corpora/$(CORPUS_NAME)/aligned.txt: corpora/$(CORPUS_NAME)/giza.cfg corpora/$(CORPUS_NAME)/src.lem_trg.lem.cooc corpora/$(CORPUS_NAME)/src.lem_trg.lem.snt corpora/$(CORPUS_NAME)/src.lem.vcb corpora/$(CORPUS_NAME)/trg.lem.vcb
 	mgiza/mgizapp/bin/mgiza corpora/$(CORPUS_NAME)/giza.cfg
-	cat corpora/$(CORPUS_NAME)/aligned*part* | ./sortGizaAlignments.py > corpora/$(CORPUS_NAME)/aligned.txt
+	./mergeGizaAlignments.py corpora/$(CORPUS_NAME)/aligned*part* > corpora/$(CORPUS_NAME)/aligned.txt
 clean-intermediate-files:
 	rm -f corpora/$(CORPUS_NAME)/*.lem
--- a/mgiza-aligner/mergeGizaAlignments.py
+++ b/mgiza-aligner/mergeGizaAlignments.py
@ -0,0 +1,72 @@
 #!/usr/bin/python3
 import sys, re
 # Header line that opens every record, e.g. "# Sentence pair (12)".
 # Raw string: "\(" and "\d" are regex escapes, not Python string escapes.
 pair_pattern = re.compile(r"# Sentence pair \((\d+)\)")
 def getNextSentencePair(pairs_file):
    """Load the next three-line record of a part file into pairs_file['nextPair'].

    pairs_file is a dict with two keys:
      'file'     -- an open text file object positioned at the start of a record
      'nextPair' -- overwritten by this call

    After the call pairs_file['nextPair'] is either None (the file is
    exhausted) or a dict {'number': <int from the header line>,
    'lines': [header, second line, third line]} with the line endings kept.

    Nothing is returned; the result is stored in pairs_file.

    Raises Exception if the header line does not start with
    "# Sentence pair (<digits>)" or if the file ends before the record's
    three lines have been read.
    """
    first_line = pairs_file['file'].readline()
    if first_line == '':
        # readline() yields '' only at end of file (a blank line is '\n').
        pairs_file['nextPair'] = None
        return
    m = pair_pattern.match(first_line)
    if not m:
        raise Exception("Could not read sentence pair number from line: %s" %first_line)
    lines = [first_line]
    for _ in range(2):
        line = pairs_file['file'].readline()
        if line == '':
            # A record cut short would otherwise be emitted as if complete.
            raise Exception("Truncated sentence pair %s: expected 3 lines, got %d" % (m.group(1), len(lines)))
        lines.append(line)
    pairs_file['nextPair'] = {'number': int(m.group(1)), 'lines': lines}
 # K-way merge of the part files named on the command line: one record per
 # file is buffered, and the record with the smallest sentence pair number is
 # printed next. The output is globally ordered only if every part file is
 # itself in ascending order -- the merge does not verify that.
 files = []
 try:
    for arg in sys.argv[1:]:
        pairs_file = {'file':open(arg,'r'), 'nextPair':None}
        # Register the handle before reading from it so that it is still
        # closed below when its first record turns out to be malformed.
        files.append(pairs_file)
        getNextSentencePair(pairs_file)
    while True:
        # Only files that have not reached end of file take part.
        pending = [pf for pf in files if pf['nextPair'] is not None]
        if not pending:
            break
        # min() returns the first smallest element, so on equal numbers the
        # file given earlier on the command line wins.
        youngest_file = min(pending, key=lambda pf: pf['nextPair']['number'])
        # rstrip() drops the record's trailing newline; print() adds one back.
        print(''.join(youngest_file['nextPair']['lines']).rstrip())
        getNextSentencePair(youngest_file)
 finally:
    # Runs on errors too, so no handle is left open.
    for pairs_file in files:
        pairs_file['file'].close()
 # Legacy stdin-based implementation, kept for reference only. It is wrapped
 # in a bare triple-quoted string, so none of it is executed. It collected
 # every three-line record from stdin into examples_dict, keyed by the number
 # parsed from the first line, and printed the records in sorted key order.
 # NOTE(review): it uses `p` and `examples_dict`, neither of which is defined
 # in the visible part of this file, so it would not run if simply unquoted.
 """
 i = 0
 for line in sys.stdin:
    line = line.strip()
    if i % 3 == 0:
        current_example = [line]
        m = p.match(line)
        if m:
            current_key = int(m.group(1))
        else:
            raise Exception("Wrong line: "+line)
    elif i % 3 == 1:
        current_example.append(line)
    else:
        current_example.append(line)
        examples_dict[current_key] = current_example
    i+=1
 for key in sorted(examples_dict.keys()):
    print ('\n'.join(examples_dict[key]))
 """