optimized sorting giza alignments

2017-10-18 10:07:25 +02:00 · 2017-10-18 10:07:25 +02:00 · 38e51a90f7
commit 38e51a90f7
parent a2b6853dcf
2 changed files with 77 additions and 5 deletions
--- a/mgiza-aligner/Makefile
+++ b/mgiza-aligner/Makefile
@ -1,15 +1,17 @
-SRC_LANG=en
+SRC_LANG=pl
 TRG_LANG=en
-CORPUS_NAME=jrc_enes
+CORPUS_NAME=europarl_sample
 SEPARATOR=@\#@
 
-DICTIONARY_WEIGHT=5
- 
+DICTIONARY_WEIGHT=0
+
 all: corpora/$(CORPUS_NAME)/aligned_final.txt corpora/$(CORPUS_NAME)/src_final.txt corpora/$(CORPUS_NAME)/trg_final.txt
 
+# Run mgiza, then merge the aligned*part* files into aligned.txt. The merge goes to a temp file that is
+# renamed on success, so a failed merge never leaves a truncated aligned.txt that make treats as up to date.
 corpora/$(CORPUS_NAME)/aligned.txt: corpora/$(CORPUS_NAME)/giza.cfg corpora/$(CORPUS_NAME)/src.lem_trg.lem.cooc corpora/$(CORPUS_NAME)/src.lem_trg.lem.snt corpora/$(CORPUS_NAME)/src.lem.vcb corpora/$(CORPUS_NAME)/trg.lem.vcb
 	mgiza/mgizapp/bin/mgiza corpora/$(CORPUS_NAME)/giza.cfg
-	cat corpora/$(CORPUS_NAME)/aligned*part* | ./sortGizaAlignments.py > corpora/$(CORPUS_NAME)/aligned.txt
+	./mergeGizaAlignments.py corpora/$(CORPUS_NAME)/aligned*part* > $@.tmp && mv -f $@.tmp $@
 
 clean-intermediate-files:
 	rm -f corpora/$(CORPUS_NAME)/*.lem
--- a/mgiza-aligner/mergeGizaAlignments.py
+++ b/mgiza-aligner/mergeGizaAlignments.py
@ -0,0 +1,72 @@
+#!/usr/bin/python3
+
+import sys, re
+
+# Header line of each record: "# Sentence pair (N)"; group 1 captures N. Raw string keeps \( and \d as regex escapes.
+pair_pattern = re.compile(r"# Sentence pair \((\d+)\)")
+
+def getNextSentencePair(pairs_file):
+    """Load the next 3-line record of pairs_file['file'] into pairs_file['nextPair'].
+
+    Stores None at end of file; raises Exception when the header line does
+    not match pair_pattern.
+    """
+    stream = pairs_file['file']
+    header = stream.readline()
+    if header == '':
+        # end of file: nothing left to read from this part
+        pairs_file['nextPair'] = None
+        return
+    match = pair_pattern.match(header)
+    if not match:
+        raise Exception("Could not read sentence pair number from line: %s" %header)
+    record = [header, stream.readline(), stream.readline()]
+    pairs_file['nextPair'] = {'number': int(match.group(1)), 'lines': record}
+
+files = []
+for path in sys.argv[1:]:
+    # open each part file and preload its first sentence pair
+    part = {'file':open(path,'r'), 'nextPair':None}
+    getNextSentencePair(part)
+    files.append(part)
+
+# Merge loop: on every pass print the pending pair with the lowest number,
+# then advance the part file it came from.
+while True:
+    pending = [part for part in files if part['nextPair'] is not None]
+    if not pending:
+        # every part file has reached its end
+        break
+    # min() returns the first minimal entry, so ties go to the earliest
+    # file on the command line, exactly like a strict '<' scan would
+    lowest = min(pending, key=lambda part: part['nextPair']['number'])
+    # the record lines keep their own newlines; only whitespace at the very
+    # end of the joined text is dropped before print adds a newline
+    print(''.join(lowest['nextPair']['lines']).rstrip())
+    getNextSentencePair(lowest)
+
+for part in files:
+    part['file'].close()
+# NOTE(review): the string literal below is never executed; it looks like the earlier stdin-based sorter (it uses `p` and `examples_dict`, which this file does not define) - confirm it can be deleted.
+"""
+
+i = 0
+for line in sys.stdin:
+    line = line.strip()
+    if i % 3 == 0:
+        current_example = [line]
+        m = p.match(line)
+        if m:
+            current_key = int(m.group(1))
+        else:
+            raise Exception("Wrong line: "+line)
+    elif i % 3 == 1:
+        current_example.append(line)
+    else:
+        current_example.append(line)
+        examples_dict[current_key] = current_example
+    i+=1
+
+for key in sorted(examples_dict.keys()):
+    print ('\n'.join(examples_dict[key]))
+"""