#!/usr/bin/python3
# Provenance: this script was added as mgiza-aligner/mergeGizaAlignments.py by
# git patch 38e51a90f78f46fb3e1bf56958b28900d42ced4c
# ("optimized sorting giza alignments", rjawor, Wed, 18 Oct 2017 10:07:25 +0200).
# The same patch changed mgiza-aligner/Makefile:
#   SRC_LANG           en       -> pl
#   CORPUS_NAME        jrc_enes -> europarl_sample
#   DICTIONARY_WEIGHT  5        -> 0
#   aligned.txt recipe:
#     cat corpora/$(CORPUS_NAME)/aligned*part* | ./sortGizaAlignments.py
#       -> ./mergeGizaAlignments.py corpora/$(CORPUS_NAME)/aligned*part*
"""Merge alignment part files into one stream ordered by sentence-pair number.

Usage:
    mergeGizaAlignments.py PART_FILE [PART_FILE ...] > aligned.txt

Every part file is read as a sequence of three-line records whose first line
matches ``# Sentence pair (N)``.  At each step the record with the lowest N
among the records currently at the head of each file is printed, so the output
is globally sorted as long as every individual part file is sorted.  Only one
record per input file is held in memory at any time.
"""

import contextlib
import heapq
import re
import sys

# Header line that opens every record; group 1 is the sentence-pair number.
# Raw string: "\(" and "\d" are not valid escapes in an ordinary literal.
pair_pattern = re.compile(r"# Sentence pair \((\d+)\)")


def getNextSentencePair(pairs_file):
    """Read the next three-line record of *pairs_file* into its 'nextPair' slot.

    Args:
        pairs_file: dict with key 'file' (an object offering ``readline()``)
            and key 'nextPair'.  The dict is updated in place.

    After the call ``pairs_file['nextPair']`` is ``None`` when the file is
    exhausted, otherwise a dict with 'number' (int parsed from the header)
    and 'lines' (list of the three raw lines, newlines included; trailing
    entries are '' if the file ends in the middle of a record).

    Raises:
        ValueError: the line where a header is expected does not match
            ``pair_pattern``.  'nextPair' is left untouched in that case.
    """
    reader = pairs_file['file']
    first_line = reader.readline()
    if first_line == '':
        # readline() returns '' only at end of file.
        pairs_file['nextPair'] = None
        return

    header = pair_pattern.match(first_line)
    if header is None:
        raise ValueError(
            "Could not read sentence pair number from line: %s" % first_line)

    pairs_file['nextPair'] = {
        'number': int(header.group(1)),
        'lines': [first_line, reader.readline(), reader.readline()],
    }


def _merge_sentence_pairs(files):
    """Yield the text of each pending record of *files*, lowest number first.

    *files* is a list of dicts already primed by ``getNextSentencePair``.
    Heap entries are ``(number, position in files)``, so when two files offer
    the same number the one listed first is emitted first.
    """
    heap = [
        (pairs_file['nextPair']['number'], index)
        for index, pairs_file in enumerate(files)
        if pairs_file['nextPair'] is not None
    ]
    heapq.heapify(heap)

    while heap:
        index = heap[0][1]
        pairs_file = files[index]
        # Emit before refilling so a malformed following header cannot
        # swallow a record that was already read successfully.
        yield ''.join(pairs_file['nextPair']['lines']).rstrip()
        getNextSentencePair(pairs_file)
        if pairs_file['nextPair'] is None:
            heapq.heappop(heap)
        else:
            heapq.heapreplace(heap, (pairs_file['nextPair']['number'], index))


def main(argv=None):
    """Merge the part files named in *argv* and print the result to stdout.

    Args:
        argv: list of input paths; defaults to ``sys.argv[1:]``.

    Raises:
        ValueError: propagated from ``getNextSentencePair`` on a bad header.
        OSError: an input file cannot be opened.
    """
    paths = sys.argv[1:] if argv is None else argv
    # ExitStack closes every file opened so far even if a later open() or a
    # malformed record raises.
    with contextlib.ExitStack() as stack:
        files = []
        for path in paths:
            pairs_file = {
                'file': stack.enter_context(open(path, 'r')),
                'nextPair': None,
            }
            getNextSentencePair(pairs_file)
            files.append(pairs_file)

        for record in _merge_sentence_pairs(files):
            print(record)


if __name__ == '__main__':
    main()