103 lines
2.4 KiB
Python
103 lines
2.4 KiB
Python
|
#!/usr/bin/env python
|
||
|
#
|
||
|
# This file is part of mgiza++. Its use is licensed under the GNU General
|
||
|
# Public License version 2 or, at your option, any later version.
|
||
|
|
||
|
from __future__ import unicode_literals
|
||
|
import sys
|
||
|
import codecs
|
||
|
import io
|
||
|
|
||
|
# On interpreters older than Python 3, rebind the three standard streams to
# UTF-8 codec wrappers around the originals.  Nothing happens on Python 3.
if sys.version_info[0] < 3:
    _utf8_reader = codecs.getreader('UTF-8')
    _utf8_writer = codecs.getwriter('UTF-8')
    sys.stdin = _utf8_reader(sys.stdin)
    sys.stdout = _utf8_writer(sys.stdout)
    sys.stderr = _utf8_writer(sys.stderr)
|
||
|
|
||
|
def loadvcb(fname, out):
    """Load a vocabulary file, copying every line to ``out`` as it is read.

    Each non-blank line is split on whitespace: the first field is parsed
    as the integer word id and the second field is the word.  Any further
    fields are ignored by this function.

    Args:
        fname: Path of the UTF-8 encoded vocabulary file to read.
        out: Writable text stream that receives a verbatim copy of every
            line read, blank lines included.

    Returns:
        A ``(vocab, last_id)`` tuple.  ``vocab`` maps each word to its id;
        ``last_id`` is the id of the last entry read, or 0 when the file
        holds no entries.

    Raises:
        ValueError: If the first field of a line is not an integer.
        IndexError: If a non-blank line has fewer than two fields.
    """
    vocab = {}
    # Start from 0 so an empty file yields a usable id instead of an
    # unbound local at the return statement.
    last_id = 0
    # The context manager closes the handle even when a line fails to parse.
    with io.open(fname, "r", encoding="UTF-8") as vcb_file:
        for line in vcb_file:
            out.write(line)
            fields = line.split()
            if not fields:
                # A blank line carries no entry; it is still echoed above.
                continue
            last_id = int(fields[0])
            vocab[fields[1]] = last_id
    return vocab, last_id
|
||
|
|
||
|
# The script needs eight positional arguments (sys.argv[1] .. sys.argv[8]).
# When any are missing, print the usage text to stderr and stop.
if len(sys.argv) < 9:
    sys.stderr.write("Error, the input should be \n")
    sys.stderr.write("%s evcb fvcb etxt ftxt esnt(out) fsnt(out) evcbx(out) fvcbx(out)\n" % sys.argv[0])
    sys.stderr.write("You should concatenate the evcbx and fvcbx to existing vcb files\n")
    # Exit with a non-zero status so calling scripts can detect the failure;
    # a bare sys.exit() would report success.
    sys.exit(1)
|
||
|
|
||
|
def _open_utf8(path, mode):
    """Open ``path`` as UTF-8 text in the given ``mode``."""
    return io.open(path, mode, encoding="UTF-8")


# Inputs: the two text files named "etxt" and "ftxt" in the usage message.
ein = _open_utf8(sys.argv[3], "r")
fin = _open_utf8(sys.argv[4], "r")

# Outputs: the two sentence files ("esnt" and "fsnt" in the usage message).
eout = _open_utf8(sys.argv[5], "w")
fout = _open_utf8(sys.argv[6], "w")

# Outputs: the extended vocabulary files ("evcbx" and "fvcbx").
evcbx = _open_utf8(sys.argv[7], "w")
fvcbx = _open_utf8(sys.argv[8], "w")

# Load the existing vocabularies; loadvcb also copies each existing line into
# the matching extended-vocabulary file and returns the last id it read.
evcb, esize = loadvcb(sys.argv[1], evcbx)
fvcb, fsize = loadvcb(sys.argv[2], fvcbx)
|
||
|
|
||
|
def _word_id(word, vcb, size, vcbx):
    """Return ``(word_id, size)`` for ``word``, extending ``vcb`` when needed.

    The word is looked up as written first, then lower-cased.  If neither
    form is present, ``size`` is incremented and used as the new id, the
    lower-cased form is stored in ``vcb``, and a line ``"<id> <word> 1"``
    (with the word as originally written) is appended to ``vcbx``.

    Args:
        word: Token to look up.
        vcb: Dict mapping word -> id; updated in place for new words.
        size: Highest id handed out so far for this vocabulary.
        vcbx: Writable text stream that receives the new entry, if any.

    Returns:
        The id for ``word`` and the (possibly incremented) ``size``.
    """
    # Use the ``in`` operator: dict.has_key() does not exist on Python 3.
    if word in vcb:
        return vcb[word], size
    lowered = word.lower()
    if lowered in vcb:
        return vcb[lowered], size
    size += 1
    vcb[lowered] = size
    vcbx.write("%d %s 1\n" % (size, word))
    return size, size


# Walk both input files in lockstep, one line from each per iteration, and
# stop as soon as either one is exhausted.
while True:
    eline = ein.readline()
    fline = fin.readline()
    if not eline or not fline:
        break

    # Map every token of each line to its vocabulary id.
    el = []
    for w in eline.split():
        wid, esize = _word_id(w, evcb, esize, evcbx)
        el.append(wid)
    fl = []
    for w in fline.split():
        wid, fsize = _word_id(w, fvcb, fsize, fvcbx)
        fl.append(wid)

    # Each id is followed by a single space, so a non-empty line keeps a
    # trailing space before the newline.
    e_ids = "".join("%d " % wid for wid in el) + "\n"
    f_ids = "".join("%d " % wid for wid in fl) + "\n"

    # Every sentence pair is written as three lines: a literal "1", then the
    # two id lines.  eout puts the e-side ids first; fout puts the f-side
    # ids first.
    eout.write("1\n" + e_ids + f_ids)
    fout.write("1\n" + f_ids + e_ids)
|
||
|
|
||
|
# Close every file this script opened.  The two input handles (ein, fin) are
# closed as well, so nothing is left to interpreter shutdown.
for handle in (fout, eout, fvcbx, evcbx, ein, fin):
    handle.close()
|
||
|
|