concordia-server/mgiza-aligner/mgiza/mgizapp/scripts/sntpostproc.py

133 lines
2.9 KiB
Python
Raw Normal View History

2017-01-21 17:07:36 +01:00
#!/usr/bin/env python
#
# This file is part of mgiza++. Its use is licensed under the GNU General
# Public License version 2 or, at your option, any later version.
"""Post-process the snt file.
The file can be either in single-line format or in multi-line format.
The output, however, will always be in single-line format.
"""
from __future__ import unicode_literals
from optparse import OptionParser
import sys
import re;
import codecs
import io
usage = """
The script post process the snt file, the input could be single-line snt
file or multi-line, (triple line) and can insert sentence weight to the
file (-w) or add partial alignment to the file (-a)
Usage %prog -s sntfile -w weight-file -a alignfile -o outputfile
"""

# On Python 2 the standard streams are wrapped in UTF-8 codecs so that the
# unicode strings produced by this script can be written to them directly.
if sys.version_info < (3, 0, 0):
    sys.stdin = codecs.getreader('UTF-8')(sys.stdin)
    sys.stdout = codecs.getwriter('UTF-8')(sys.stdout)
    sys.stderr = codecs.getwriter('UTF-8')(sys.stderr)

# Build the parser exactly once so the usage text above is what
# print_help() actually displays.
parser = OptionParser(usage=usage)
parser.add_option("-s", "--snt", dest="snt", default=None,
                  help="The input snt file", metavar="FILE")
parser.add_option("-w", "--weight", dest="weight", default=None,
                  help="The input weight file", metavar="FILE")
parser.add_option("-o", "--output", dest="output", default="-",
                  help="The output file, \"-\" (the default) means standard output",
                  metavar="FILE")
parser.add_option("-a", "--align", dest="align", default=None,
                  help="The input partial alignment file, one sentence per line",
                  metavar="FILE")
(options, args) = parser.parse_args()

# The snt file is the only mandatory argument; without it there is nothing
# to do, so show the help text and stop.
if options.snt is None:
    parser.print_help()
    sys.exit()

sfile = io.open(options.snt, "r", encoding="UTF-8")

# "-" selects standard output instead of a named file.
if options.output == "-":
    ofile = sys.stdout
else:
    ofile = io.open(options.output, "w", encoding="UTF-8")

# Optional side inputs; each stays None when its option was not given.
wfile = None
if options.weight is not None:
    wfile = io.open(options.weight, "r", encoding="UTF-8")

afile = None
if options.align is not None:
    afile = io.open(options.align, "r", encoding="UTF-8")

# Field separators recognised in a single-line snt record: '|', '#' or '*'.
rr = re.compile(r"[|#*]")

# State of the record currently being processed: weight, alignment points,
# and the two sentences.
wt = 0.0
al = {}
e = ""
f = ""
def parse_ax(line):
    """Parse one line of alignment points.

    ``line`` is a string of whitespace-separated tokens, each of the form
    ``A-B``.  Tokens that do not split into exactly two parts on ``-`` are
    ignored.

    Returns a dict that maps every ``(A, B)`` tuple of strings to ``1``;
    the dict is used as a set, so repeated points collapse into one entry.
    """
    alq = {}
    # split() with no argument tokenizes on any run of whitespace, so tabs,
    # repeated blanks and the trailing newline never leak into a token.
    for token in line.split():
        alo = token.split("-")
        if len(alo) == 2:
            alq[tuple(alo)] = 1
    return alq
# Convert every record of the snt file to the single-line output format
#     WEIGHT | SENTENCE1 | SENTENCE2 [| A-B A-B ...]
# optionally replacing the weight with the next line of the weight file and
# merging in the alignment points from the next line of the alignment file.
try:
    while True:
        line = sfile.readline()
        if len(line) == 0:
            # readline() returns an empty string only at end of file.
            break

        lp = rr.split(line.strip())
        if len(lp) >= 3:
            # Single-line record: weight, two sentences, optional alignment.
            wt = float(lp[0])
            # Strip the padding around the separators, as the multi-line
            # branch does, so re-processing an already converted file does
            # not keep adding blanks around each sentence.
            e = lp[1].strip()
            f = lp[2].strip()
            if len(lp) > 3:
                al = parse_ax(lp[3])
            else:
                al = {}
        else:
            # Multi-line record: the weight line is followed by one line
            # for each of the two sentences.
            wt = float(line)
            e = sfile.readline().strip()
            f = sfile.readline().strip()
            al = {}

        if wfile is not None:
            # The weight file overrides the weight found in the snt file;
            # an empty (or missing) line means weight 1.
            lw = wfile.readline().strip()
            if len(lw) > 0:
                wt = float(lw)
            else:
                wt = 1

        if afile is not None:
            # Merge the extra alignment points into those already present.
            la = afile.readline().strip()
            if len(la) > 0:
                al.update(parse_ax(la))

        out = "%g | %s | %s" % (wt, e, f)
        if len(al) > 0:
            out += " |" + "".join(" %s-%s" % entry for entry in al)
        ofile.write(out + "\n")
finally:
    # Release the inputs and make sure buffered output reaches its
    # destination; standard output is flushed but left open.
    sfile.close()
    if wfile is not None:
        wfile.close()
    if afile is not None:
        afile.close()
    if ofile is sys.stdout:
        ofile.flush()
    else:
        ofile.close()