#!/usr/bin/env python
#
# This file is part of mgiza++.  Its use is licensed under the GNU General
# Public License version 2 or, at your option, any later version.

"""Post-process the snt file.

The file can be either in single-line format or in multi-line format.
The output, however, will always be in single-line format.
"""

from __future__ import unicode_literals

import codecs
import io
import re
import sys
from optparse import OptionParser

usage = """
The script post process the snt file, the input could be single-line snt
file or multi-line, (triple line) and can insert sentence weight to the
file (-w) or add partial alignment to the file (-a)
Usage %prog -s sntfile -w weight-file -a alignfile -o outputfile
"""

# Field separators accepted in the single-line format: '|', '#' or '*'.
rr = re.compile(r"[\|\#\*]")


def parse_ax(line):
    """Parse a whitespace-separated list of ``src-tgt`` alignment links.

    Tokens that do not consist of exactly two ``-``-separated parts are
    ignored.

    Returns a dict whose keys are ``(src, tgt)`` string tuples and whose
    values are always 1 (the dict is used as a set of links).
    """
    links = {}
    for token in line.split():
        parts = token.split("-")
        if len(parts) == 2:
            links[tuple(parts)] = 1
    return links


def build_parser():
    """Return the command-line option parser for this script."""
    parser = OptionParser(usage=usage)
    parser.add_option("-s", "--snt", dest="snt", default=None,
                      help="The input snt file", metavar="FILE")
    parser.add_option("-w", "--weight", dest="weight", default=None,
                      help="The input weight file", metavar="FILE")
    parser.add_option("-o", "--output", dest="output", default="-",
                      help="The output file, '-' for standard output",
                      metavar="FILE")
    parser.add_option("-a", "--align", dest="align", default=None,
                      help="The input partial alignment file, one sentence "
                           "per line",
                      metavar="FILE")
    return parser


def read_sentences(snt_lines):
    """Yield ``(weight, source, target, alignment)`` for each sentence pair.

    *snt_lines* is any iterable of text lines (an open file works).  A line
    that splits into three or more fields on the separator characters is
    taken as a single-line record (weight, source, target and an optional
    alignment field).  Any other line is taken as the weight line of a
    three-line record, and the next two lines are consumed as source and
    target.

    Raises ValueError if a weight field cannot be converted to float.
    """
    lines = iter(snt_lines)
    for line in lines:
        fields = rr.split(line.strip())
        if len(fields) >= 3:
            weight = float(fields[0])
            # Strip the padding around the separators so that the output
            # spacing does not grow when a single-line file is re-processed.
            source = fields[1].strip()
            target = fields[2].strip()
            alignment = parse_ax(fields[3]) if len(fields) > 3 else {}
        else:
            weight = float(line)
            # A truncated record yields empty source/target strings.
            source = next(lines, "").strip()
            target = next(lines, "").strip()
            alignment = {}
        yield weight, source, target, alignment


def format_sentence(weight, source, target, alignment):
    """Return one single-line snt record, terminated by a newline.

    The alignment field is emitted only when *alignment* is non-empty.
    """
    record = "%g | %s | %s" % (weight, source, target)
    if alignment:
        record += " |" + "".join(" %s-%s" % link for link in alignment)
    return record + "\n"


def postprocess(snt_lines, weight_lines=None, align_lines=None):
    """Yield single-line snt records built from *snt_lines*.

    *weight_lines*, when given, supplies one line per sentence: a non-empty
    line replaces the sentence weight, while an empty line (or running out
    of lines) sets the weight to 1.  *align_lines*, when given, supplies
    one line of partial alignment links per sentence; the links are added
    to any links already present in the snt record.
    """
    weights = iter(weight_lines) if weight_lines is not None else None
    aligns = iter(align_lines) if align_lines is not None else None

    for weight, source, target, alignment in read_sentences(snt_lines):
        if weights is not None:
            weight_text = next(weights, "").strip()
            weight = float(weight_text) if weight_text else 1
        if aligns is not None:
            alignment.update(parse_ax(next(aligns, "")))
        yield format_sentence(weight, source, target, alignment)


def main(argv=None):
    """Run the command-line tool; *argv* defaults to ``sys.argv[1:]``."""
    if sys.version_info < (3, 0, 0):
        # Python 2 standard streams are byte streams; wrap them so that
        # unicode text can be read and written.
        sys.stdin = codecs.getreader('UTF-8')(sys.stdin)
        sys.stdout = codecs.getwriter('UTF-8')(sys.stdout)
        sys.stderr = codecs.getwriter('UTF-8')(sys.stderr)

    parser = build_parser()
    options, _ = parser.parse_args(argv)

    if options.snt is None:
        parser.print_help()
        sys.exit()

    opened = []  # every file opened here, so all get closed on exit

    def open_text(path, mode):
        handle = io.open(path, mode, encoding="UTF-8")
        opened.append(handle)
        return handle

    try:
        sfile = open_text(options.snt, "r")
        wfile = None
        if options.weight is not None:
            wfile = open_text(options.weight, "r")
        afile = None
        if options.align is not None:
            afile = open_text(options.align, "r")
        if options.output == "-":
            ofile = sys.stdout  # not in `opened`: never close stdout
        else:
            ofile = open_text(options.output, "w")

        for record in postprocess(sfile, wfile, afile):
            ofile.write(record)
    finally:
        for handle in opened:
            handle.close()


if __name__ == "__main__":
    main()