concordia-server/mgiza-aligner/extract.py

37 lines
1.2 KiB
Python
Raw Normal View History

2017-04-29 23:14:08 +02:00
#!/usr/bin/python3
# -*- coding: utf-8 -*-
import sys, re
separator = sys.argv[6]
pair_pattern = re.compile(r'^# Sentence pair \((\d+)\).*')
def getNextSentencePair(f):
result = dict()
first_line = f.readline()
m = pair_pattern.match(first_line)
if m:
result['number'] = int(m.group(1))
else:
raise Exception("Could not read sentence pair number from line: %s" %first_line)
lines = [first_line]
lines.append(f.readline())
lines.append(f.readline())
result['lines'] = lines
return result
with open(sys.argv[1]) as pasted_file, open(sys.argv[2]) as aligned_file, open(sys.argv[3], 'w') as aligned_final, open(sys.argv[4], 'w') as src_final, open(sys.argv[5], 'w') as trg_final:
for line in pasted_file:
src_line, trg_line = line.strip()[12:].split(separator)
src_final.write(src_line+"\n")
trg_final.write(trg_line+"\n")
number = int(line[:12])
found = False
while not found:
sentence_pair = getNextSentencePair(aligned_file)
if (sentence_pair['number'] == number):
found = True
aligned_final.write(''.join(sentence_pair['lines']))