Optimized sorting of GIZA alignments

This commit is contained in:
rjawor 2017-10-18 10:07:25 +02:00
parent a2b6853dcf
commit 38e51a90f7
2 changed files with 77 additions and 5 deletions

View File

@ -1,15 +1,15 @@
SRC_LANG=en
SRC_LANG=pl
TRG_LANG=en
CORPUS_NAME=jrc_enes
CORPUS_NAME=europarl_sample
SEPARATOR=@\#@
DICTIONARY_WEIGHT=5
DICTIONARY_WEIGHT=0
all: corpora/$(CORPUS_NAME)/aligned_final.txt corpora/$(CORPUS_NAME)/src_final.txt corpora/$(CORPUS_NAME)/trg_final.txt
corpora/$(CORPUS_NAME)/aligned.txt: corpora/$(CORPUS_NAME)/giza.cfg corpora/$(CORPUS_NAME)/src.lem_trg.lem.cooc corpora/$(CORPUS_NAME)/src.lem_trg.lem.snt corpora/$(CORPUS_NAME)/src.lem.vcb corpora/$(CORPUS_NAME)/trg.lem.vcb
mgiza/mgizapp/bin/mgiza corpora/$(CORPUS_NAME)/giza.cfg
cat corpora/$(CORPUS_NAME)/aligned*part* | ./sortGizaAlignments.py > corpora/$(CORPUS_NAME)/aligned.txt
./mergeGizaAlignments.py corpora/$(CORPUS_NAME)/aligned*part* > corpora/$(CORPUS_NAME)/aligned.txt
clean-intermediate-files:
rm -f corpora/$(CORPUS_NAME)/*.lem

View File

@ -0,0 +1,72 @@
#!/usr/bin/python3
import sys, re
# Matches the header line that opens every sentence-pair record and captures
# the pair number.  Written as a raw string so that "\(" and "\d" are passed
# to the regex engine verbatim instead of being treated as (invalid) string
# escapes, which newer Python versions warn about.
pair_pattern = re.compile(r"# Sentence pair \((\d+)\)")


def getNextSentencePair(pairs_file):
    """Read the next three-line sentence-pair record from an open file.

    The result is stored in ``pairs_file['nextPair']`` rather than returned:

    * ``None`` when the underlying file is exhausted;
    * otherwise a dict with ``'number'`` (the int parsed from the header
      line) and ``'lines'`` (the header plus the two lines that follow it,
      exactly as read, including their line terminators).

    Args:
        pairs_file: dict with a ``'file'`` entry (an object providing
            ``readline()``) and a ``'nextPair'`` entry that is overwritten.

    Raises:
        Exception: if the first line read does not start with a
            ``# Sentence pair (N)`` header.  ``pairs_file['nextPair']`` is
            left unchanged in that case.
    """
    source = pairs_file['file']
    first_line = source.readline()
    if first_line == '':
        # readline() returns '' only at end of file.
        pairs_file['nextPair'] = None
        return

    m = pair_pattern.match(first_line)
    if m is None:
        raise Exception("Could not read sentence pair number from line: %s" % first_line)

    # A record is the header followed by exactly two more lines.
    lines = [first_line, source.readline(), source.readline()]
    pairs_file['nextPair'] = {'number': int(m.group(1)), 'lines': lines}
def main(paths):
    """Merge sentence-pair files into one stream ordered by pair number.

    Every file named in ``paths`` is opened and its first record is loaded
    with ``getNextSentencePair``.  The record with the lowest pair number
    among all files is then printed to standard output and replaced by the
    next record from the same file, until every file is exhausted.  When two
    files offer the same number, the file listed first wins.

    Args:
        paths: iterable of file names to merge.
    """
    files = []
    try:
        for path in paths:
            pairs_file = {'file': open(path, 'r'), 'nextPair': None}
            # Register the handle before parsing so that it is still closed
            # if its very first record turns out to be malformed.
            files.append(pairs_file)
            getNextSentencePair(pairs_file)

        while True:
            # Only files that have not reached their end take part.
            pending = [f for f in files if f['nextPair'] is not None]
            if not pending:
                break
            # min() returns the first minimal element, so ties resolve in
            # favour of the file that appears earliest in `paths`.
            youngest_file = min(pending, key=lambda f: f['nextPair']['number'])
            print(''.join(youngest_file['nextPair']['lines']).rstrip())
            getNextSentencePair(youngest_file)
    finally:
        # Close every opened handle, including when an error interrupts the merge.
        for pairs_file in files:
            pairs_file['file'].close()


if __name__ == '__main__':
    main(sys.argv[1:])
"""
i = 0
for line in sys.stdin:
line = line.strip()
if i % 3 == 0:
current_example = [line]
m = p.match(line)
if m:
current_key = int(m.group(1))
else:
raise Exception("Wrong line: "+line)
elif i % 3 == 1:
current_example.append(line)
else:
current_example.append(line)
examples_dict[current_key] = current_example
i+=1
for key in sorted(examples_dict.keys()):
print ('\n'.join(examples_dict[key]))
"""