133 lines
2.9 KiB
Python
133 lines
2.9 KiB
Python
|
#!/usr/bin/env python
|
||
|
#
|
||
|
# This file is part of mgiza++. Its use is licensed under the GNU General
|
||
|
# Public License version 2 or, at your option, any later version.
|
||
|
|
||
"""Post-process the snt file.

The file can be either in single-line format or in multi-line format.
The output, however, will always be in single-line format.
"""
|
||
|
|
||
|
from __future__ import unicode_literals
|
||
|
from optparse import OptionParser
|
||
|
import sys
|
||
|
import re;
|
||
|
import codecs
|
||
|
import io
|
||
|
|
||
|
# Help text for the command-line parser; it is handed to OptionParser via
# its ``usage`` argument, where ``%prog`` stands for the script name.
usage = """
The script post process the snt file, the input could be single-line snt
file or multi-line, (triple line) and can insert sentence weight to the
file (-w) or add partial alignment to the file (-a)
Usage %prog -s sntfile -w weight-file -a alignfile -o outputfile
"""
|
||
|
|
||
|
# Python 2 only: wrap the three standard streams in UTF-8 codecs so the
# script can read and write unicode text through them.  On Python 3 the
# streams are left untouched.
if sys.version_info < (3, 0, 0):
    sys.stdin = codecs.getreader('UTF-8')(sys.stdin)
    sys.stdout = codecs.getwriter('UTF-8')(sys.stdout)
    sys.stderr = codecs.getwriter('UTF-8')(sys.stderr)
|
||
|
|
||
|
# Command-line interface.  The parser is created exactly once, with the
# usage text, so that --help actually displays it (a second, bare
# OptionParser() used to replace this one and silently dropped the text).
parser = OptionParser(usage=usage)

# -s: the snt file to post-process (checked for presence after parsing).
parser.add_option("-s", "--snt", dest="snt", default=None,
                  help="The input snt file", metavar="FILE")

# -w: optional file whose lines are read as sentence weights.
parser.add_option("-w", "--weight", dest="weight", default=None,
                  help="The input weight file", metavar="FILE")

# -o: where the result is written; "-" selects standard output.
parser.add_option("-o", "--output", dest="output", default="-",
                  help="The output file, '-' (the default) means standard output",
                  metavar="FILE")

# -a: optional file with partial alignments, one sentence per line.
parser.add_option("-a", "--align", dest="align", default=None,
                  help="The input partial alignment file, one sentence per line",
                  metavar="FILE")

(options, args) = parser.parse_args()
|
||
|
|
||
|
# The snt file is mandatory: without it, print the help text and stop.
if options.snt is None:
    parser.print_help()
    sys.exit()

sfile = io.open(options.snt, "r", encoding="UTF-8")

# "-" (the default value of --output) sends the result to standard output.
if options.output == "-":
    ofile = sys.stdout
else:
    ofile = io.open(options.output, "w", encoding="UTF-8")

# Optional weight file (-w); stays None when the option is not given.
wfile = None
if options.weight is not None:
    wfile = io.open(options.weight, "r", encoding="UTF-8")

# Optional partial-alignment file (-a); stays None when the option is not
# given.
afile = None
if options.align is not None:
    afile = io.open(options.align, "r", encoding="UTF-8")
|
||
|
|
||
|
# Separator used to split a single-line record into its fields: any one of
# the characters '|', '#' or '*'.
rr = re.compile("[\\|\\#\\*]")

# State of the record currently being processed.
wt = 0.0  # sentence weight
al = {}   # alignment points: (str, str) tuple -> 1
e = ""    # first sentence of the pair
f = ""    # second sentence of the pair
|
||
|
|
||
|
def parse_ax(line):
    """Parse a line of space-separated ``a-b`` alignment points.

    The line is stripped and split on single spaces.  Every non-blank token
    that consists of exactly two ``-``-separated parts is recorded; tokens
    with any other number of parts are ignored.

    Args:
        line: Text such as ``"1-2 3-4"``.

    Returns:
        A dict mapping each ``(a, b)`` tuple of strings to ``1``.  The parts
        are kept as strings, they are not converted to numbers.
    """
    points = {}
    for token in line.strip().split(" "):
        # Consecutive spaces yield empty tokens; skip them.
        if not token.strip():
            continue
        parts = token.split("-")
        if len(parts) == 2:
            points[tuple(parts)] = 1
    return points
|
||
|
|
||
|
|
||
|
|
||
|
|
||
|
|
||
|
|
||
|
# Rewrite every sentence pair of the snt file in single-line format:
#     weight | first | second [| a-b a-b ...]
while True:
    l = sfile.readline()
    if len(l) == 0:
        # readline() returns an empty string only at end of file.
        break
    if not l.strip():
        # A blank line at a record boundary (e.g. a trailing one) is not a
        # record; it would otherwise make float() raise below.
        continue
    lp = rr.split(l.strip())
    if len(lp) >= 3:
        # Single-line record: weight, two sentences and, optionally,
        # alignment points as a fourth field.
        wt = float(lp[0])
        e = lp[1]
        f = lp[2]
        al = parse_ax(lp[3]) if len(lp) > 3 else {}
    else:
        # Multi-line record: this line holds the weight, the next two lines
        # hold the sentences.
        wt = float(l)
        e = sfile.readline().strip()
        f = sfile.readline().strip()
        al = {}
    if wfile is not None:
        # The weight file overrides the weight; an empty line there (or a
        # weight file that ran out of lines) means weight 1.
        lw = wfile.readline().strip()
        wt = float(lw) if len(lw) > 0 else 1
    if afile is not None:
        # Alignment points from the alignment file are added to those
        # already found in the record.
        la = afile.readline().strip()
        if len(la) > 0:
            al.update(parse_ax(la))

    ofile.write("%g | %s | %s" % (wt, e, f))
    if len(al) > 0:
        ofile.write(" |")
        for entry in al:
            ofile.write(" %s-%s" % entry)
    ofile.write("\n")

# Release the file handles.  Standard output is left open for the
# interpreter to flush.
sfile.close()
if wfile is not None:
    wfile.close()
if afile is not None:
    afile.close()
if ofile is not sys.stdout:
    ofile.close()
|
||
|
|
||
|
|