concordia-server/mgiza-aligner/mergeGizaAlignments.py

73 lines
1.9 KiB
Python
Raw Normal View History

2017-10-18 10:07:25 +02:00
#!/usr/bin/python3
import sys, re
# Header line that opens every sentence pair, e.g. "# Sentence pair (12) ...".
# Written as a raw string so that "\(" and "\d" reach the regex engine verbatim
# instead of being parsed as (invalid) string escape sequences.
pair_pattern = re.compile(r"# Sentence pair \((\d+)\)")


def getNextSentencePair(pairs_file):
    """Read the next three-line sentence pair from ``pairs_file['file']``.

    The result is stored in ``pairs_file['nextPair']`` (nothing is returned):

    * ``None`` when the file is already at its end;
    * otherwise a dict with ``'number'`` (the int captured from the
      "# Sentence pair (N)" header) and ``'lines'`` (the header line plus the
      two lines that follow it, exactly as read, line endings included).

    Args:
        pairs_file: dict with a ``'file'`` entry offering ``readline()`` and a
            ``'nextPair'`` entry that this function overwrites.

    Raises:
        ValueError: if the first line read is not a sentence pair header, or
            if the file ends before all three lines of the pair were read.
            ``pairs_file['nextPair']`` is left untouched in both cases.
    """
    source = pairs_file['file']
    first_line = source.readline()
    if first_line == '':
        # we reached the end of file
        pairs_file['nextPair'] = None
        return

    m = pair_pattern.match(first_line)
    if m is None:
        raise ValueError("Could not read sentence pair number from line: %s" % first_line)

    lines = [first_line, source.readline(), source.readline()]
    # readline() returns '' only at end of file (a blank line is '\n'), so an
    # empty last entry means the pair was cut short.
    if lines[-1] == '':
        raise ValueError("Incomplete sentence pair %s: file ended before its third line" % m.group(1))

    pairs_file['nextPair'] = {'number': int(m.group(1)), 'lines': lines}
# Merge the alignment files named on the command line into one stream on
# stdout. At every step the pair with the lowest sentence number among the
# files' pending pairs is printed, so the output is globally ordered provided
# each input file lists its own pairs in ascending order (assumed, not checked).
files = []
try:
    for arg in sys.argv[1:]:
        pairs_file = {'file': open(arg, 'r'), 'nextPair': None}
        # Register the handle before the first read so that it is closed by
        # the finally clause even if that read raises.
        files.append(pairs_file)
        getNextSentencePair(pairs_file)

    finished = False
    while not finished:
        # "youngest" = the file whose pending pair has the lowest number;
        # on ties the file given earlier on the command line wins.
        youngest_file = None
        for pairs_file in files:
            if pairs_file['nextPair'] is not None:
                # if the file is not at end
                if youngest_file is None or pairs_file['nextPair']['number'] < youngest_file['nextPair']['number']:
                    youngest_file = pairs_file
        if youngest_file is None:
            # every file is exhausted
            finished = True
        else:
            # The stored lines keep their line endings; strip the trailing one
            # because print() appends its own.
            print(''.join(youngest_file['nextPair']['lines']).rstrip())
            getNextSentencePair(youngest_file)
finally:
    # Close every handle that was opened, on success and on error alike.
    for pairs_file in files:
        pairs_file['file'].close()

# NOTE(review): the string literal below is never assigned or used. It looks
# like a retired stdin-based version of the merge, kept for reference. It
# refers to `p` and `examples_dict`, neither of which is defined in the code
# above, so it would not run if simply un-quoted.
"""
i = 0
for line in sys.stdin:
line = line.strip()
if i % 3 == 0:
current_example = [line]
m = p.match(line)
if m:
current_key = int(m.group(1))
else:
raise Exception("Wrong line: "+line)
elif i % 3 == 1:
current_example.append(line)
else:
current_example.append(line)
examples_dict[current_key] = current_example
i+=1
for key in sorted(examples_dict.keys()):
print ('\n'.join(examples_dict[key]))
"""