dodane programy do uaktualniania slownika form (compdic-*)

This commit is contained in:
Tomasz Obrebski 2012-03-13 17:18:41 +01:00
parent a15e59b825
commit f4bf33ed04
7 changed files with 393 additions and 2 deletions

View File

@ -167,14 +167,14 @@ void process_common_options(gengetopt_args_info* args, char* argv0)
if(args->output_given) if(args->output_given)
if(!(outputf=fopen(args->output_arg,"w"))) if(!(outputf=fopen(args->output_arg,"w")))
{ {
fprintf(stderr,"Cannot open output file: %s.\n", args->output_arg); fprintf(stderr,"Cannot open the output file: %s.\n", args->output_arg);
exit(1); exit(1);
} }
if(args->fail_given) if(args->fail_given)
if(!(failedf=fopen(args->fail_arg,"w"))) if(!(failedf=fopen(args->fail_arg,"w")))
{ {
fprintf(stderr,"Cannot open output file: %s.\n", args->fail_arg); fprintf(stderr,"Cannot open the output file: %s.\n", args->fail_arg);
exit(1); exit(1);
} }

View File

@ -15,6 +15,12 @@ aut2fsa: aut2fsa.cc
install: install:
ifdef BIN_DIR ifdef BIN_DIR
install -m 0755 compdic $(BIN_DIR) install -m 0755 compdic $(BIN_DIR)
install -m 0755 compdic-create-fst $(BIN_DIR)
install -m 0755 compdic-fst-add $(BIN_DIR)
install -m 0755 compdic-fst-minus $(BIN_DIR)
install -m 0755 compdic-fst-to-bin $(BIN_DIR)
install -m 0755 compdic-fst-update $(BIN_DIR)
install -m 0755 fsm2aut $(BIN_DIR) install -m 0755 fsm2aut $(BIN_DIR)
install -m 0755 aut2fsa $(BIN_DIR) install -m 0755 aut2fsa $(BIN_DIR)
install -m 0755 lst2fstext $(BIN_DIR) install -m 0755 lst2fstext $(BIN_DIR)
@ -24,6 +30,11 @@ endif
uninstall: uninstall:
ifdef BIN_DIR ifdef BIN_DIR
rm $(BIN_DIR)/compdic rm $(BIN_DIR)/compdic
rm $(BIN_DIR)/compdic-create-fst
rm $(BIN_DIR)/compdic-fst-add
rm $(BIN_DIR)/compdic-fst-minus
rm $(BIN_DIR)/compdic-fst-to-bin
rm $(BIN_DIR)/compdic-fst-update
rm $(BIN_DIR)/fsm2aut rm $(BIN_DIR)/fsm2aut
rm $(BIN_DIR)/aut2fsa rm $(BIN_DIR)/aut2fsa
rm $(BIN_DIR)/lst2fstext rm $(BIN_DIR)/lst2fstext

181
src/compdic/compdic-create-fst Executable file
View File

@ -0,0 +1,181 @@
# compdic-create-fst: command-line handling.
#
# Synopsis: compdic-create-fst [-p <parts>] <wordlist> <automaton>
#
# Sets:
#   no_of_parts - number of chunks the word list is split into
#                 (0 on the command line means "choose automatically")
# Leaves the two positional arguments in $1 (word list) and $2 (output file).

no_of_parts=0

# Options are consumed only while more than the two positional arguments
# remain, so a word list whose name starts with '-' is still accepted.
while [ $# -gt 2 ]
do
    case $1 in
        -p)
            no_of_parts=$2
            shift 2
            ;;
        *)
            # Unknown leading argument: report it and skip it.
            echo "The arguments to use are" >&2
            echo "-p: number of parts" >&2
            shift 1
            ;;
    esac
done

if [ $# -lt 2 ]
then
    echo "Usage:" >&2
    echo " compdic-create-fst [-p <parts>] <wordlist> <automaton>" >&2
    echo "where" >&2
    echo " <wordlist> - file containing a list of words, one per line, iso-8859-2 encoded" >&2
    echo " <automaton> - a file to which the compiled automaton in openfst format should be written" >&2
    exit 1
fi

# Reject a non-numeric -p value here; otherwise the numeric test and the
# arithmetic below would fail with an obscure shell error.
case $no_of_parts in
    ''|*[!0-9]*)
        echo "compdic-create-fst: -p expects a non-negative integer, got '$no_of_parts'" >&2
        exit 1
        ;;
esac

if [ ! -r "$1" ]
then
    echo "compdic-create-fst: cannot read word list: $1" >&2
    exit 1
fi

# Default: one part per 75000 lines of input (rounded up by the "+ 1").
if [ "$no_of_parts" -eq 0 ]
then
    no_of_parts=$(( $(wc -l < "$1") / 75000 + 1 ))
fi
echo "number of parts: $no_of_parts"
# Private scratch directory for all intermediate files of this run.
tempdir=`mktemp -d /tmp/compdic.XXXXXX` || exit 1

# Remove the scratch directory however the script ends; the signal trap
# turns HUP/INT/TERM into a normal exit so that the EXIT trap runs.
trap 'rm -rf "$tempdir"' EXIT
trap 'exit 1' HUP INT TERM

# Symbol table handed to fstcompile through --isymbols: one "<symbol> <id>"
# pair per line, with id 0 assigned to <eps>.
# The scratch directory is already unique, so a fixed file name is enough
# (the Debian-specific `tempfile` helper is not needed).
# NOTE(review): the non-ASCII symbols below must stay in the same encoding as
# the word lists (the usage text says iso-8859-2) - confirm before re-saving
# this file in another encoding.
alphabet=$tempdir/alphabet.syms
cat <<EOF > "$alphabet"
<eps> 0
a 1
A 2
ä 3
± 4
¡ 5
b 6
B 7
c 8
C 9
æ 10
Æ 11
d 12
D 13
e 14
E 15
é 16
ê 17
Ê 18
f 19
F 20
g 21
G 22
h 23
H 24
i 25
I 26
j 27
J 28
k 29
K 30
l 31
L 32
³ 33
£ 34
m 35
M 36
n 37
N 38
ñ 39
Ñ 40
o 41
O 42
ö 43
ó 44
Ó 45
p 46
P 47
q 48
Q 49
r 50
R 51
s 52
S 53
¶ 54
¦ 55
t 56
T 57
u 58
U 59
ü 60
v 61
V 62
w 63
W 64
x 65
X 66
y 67
Y 68
z 69
Z 70
¼ 71
¬ 72
¿ 73
¯ 74
0 75
1 76
2 77
3 78
4 79
5 80
6 81
7 82
8 83
9 84
_ 85
- 86
? 87
! 88
~ 89
; 90
, 91
/ 92
* 93
+ 94
EOF
# Build the automaton: split the word list ($1) into parts, compile every
# part separately and fold it into the accumulated automaton, then write the
# topologically sorted result to $2.
# Expects $tempdir (scratch directory), $alphabet (symbol table file) and
# $no_of_parts to be set by the preceding part of the script.

# Print a message, drop the scratch directory and abort.
fail()
{
    echo "compdic-create-fst: $1" >&2
    rm -rf "$tempdir"
    exit 1
}

# Lines per part; the "+ 1" rounds up so that at most $no_of_parts files
# are produced.
no_of_lines=$(( $(wc -l < "$1") / no_of_parts + 1 ))
split -l "$no_of_lines" "$1" "$tempdir/part." || fail "cannot split $1"

# The accumulator starts as the automaton compiled from empty input.
automaton=$tempdir/output.fst
fstcompile --acceptor --isymbols="$alphabet" < /dev/null > "$automaton" \
    || fail "cannot create the initial automaton"

# Scratch files reused by every iteration. Their names do not match the
# part.* pattern used for the split output.
temp1=$tempdir/stage1.fst
temp2=$tempdir/stage2.fst
temp3=$tempdir/stage3.fst

n=0
for f in "$tempdir"/part.*
do
    # An empty word list produces no part files; the unmatched pattern is
    # then passed through literally and must be skipped.
    [ -e "$f" ] || continue

    n=$(( n + 1 ))
    echo "processing part $n"

    # Only the last stage's status is reported by a pipeline; that is
    # enough when each stage rejects the empty output of a failed
    # predecessor (assumed for the OpenFst tools - TODO confirm).
    lst2fstext < "$f" |
        fstcompile --acceptor --isymbols="$alphabet" |
        fstrmepsilon |
        fstdeterminize > "$temp1" || fail "cannot compile part $n"
    fstminimize "$temp1" "$temp2" || fail "cannot minimize part $n"

    fstunion "$automaton" "$temp2" | fstrmepsilon | fstdeterminize > "$temp3" \
        || fail "cannot merge part $n"
    fstminimize "$temp3" "$automaton" || fail "cannot minimize after part $n"
done

echo "generating binary automaton file ..."
fsttopsort < "$automaton" > "$2" || fail "cannot write $2"
rm -r "$tempdir"

# Disabled step kept from the original script.
# NOTE(review): the sort command below has no output redirection - fix
# before re-enabling.
#echo generating cats file ...
#cat $1 | cut -d ',' -f 2 | sort -u $1.cats

18
src/compdic/compdic-fst-add Executable file
View File

@ -0,0 +1,18 @@
# compdic-fst-add <automaton1> <automaton2>
#
# Replaces <automaton1> with the union of both automata, after epsilon
# removal, determinization, minimization and topological sorting.
# Exit status: 0 on success, 1 on a usage error or when the new automaton
# could not be built (in which case <automaton1> is left untouched).

if [ $# -ne 2 ]
then
    echo "Usage:" >&2
    echo " compdic-fst-add <automaton1> <automaton2>" >&2
    echo "where" >&2
    echo " <automaton1> - automaton in openfst format" >&2
    echo " <automaton2> - automaton in openfst format containing paths to be added to <automaton1>" >&2
    exit 1
fi

tempdir=`mktemp -d /tmp/compdic.XXXXXX` || exit 1
automaton=$tempdir/output.fst

# The result is built in the scratch directory and moved over <automaton1>
# only when the pipeline ended successfully with non-empty output; otherwise
# a failure would silently destroy the original automaton.
if fstunion "$1" "$2" | fstrmepsilon | fstdeterminize | fstminimize | fsttopsort > "$automaton" \
    && [ -s "$automaton" ]
then
    mv "$automaton" "$1"
    status=$?
else
    echo "compdic-fst-add: could not build the new automaton; $1 left unchanged" >&2
    status=1
fi

rm -r "$tempdir"
exit $status

18
src/compdic/compdic-fst-minus Executable file
View File

@ -0,0 +1,18 @@
# compdic-fst-minus <automaton1> <automaton2>
#
# Replaces <automaton1> with the topologically sorted difference
# <automaton1> minus <automaton2>.
# Exit status: 0 on success, 1 on a usage error or when the new automaton
# could not be built (in which case <automaton1> is left untouched).

if [ $# -ne 2 ]
then
    echo "Usage:" >&2
    echo " compdic-fst-minus <automaton1> <automaton2>" >&2
    echo "where" >&2
    echo " <automaton1> - automaton in openfst format" >&2
    echo " <automaton2> - automaton in openfst format containing paths to be removed from <automaton1>" >&2
    exit 1
fi

tempdir=`mktemp -d /tmp/compdic.XXXXXX` || exit 1
automaton=$tempdir/output.fst

# The result is built in the scratch directory and moved over <automaton1>
# only when the pipeline ended successfully with non-empty output; otherwise
# a failure would silently destroy the original automaton.
if fstdifference "$1" "$2" | fsttopsort > "$automaton" && [ -s "$automaton" ]
then
    mv "$automaton" "$1"
    status=$?
else
    echo "compdic-fst-minus: could not build the new automaton; $1 left unchanged" >&2
    status=1
fi

rm -r "$tempdir"
exit $status

120
src/compdic/compdic-fst-to-bin Executable file
View File

@ -0,0 +1,120 @@
# compdic-fst-to-bin <fstautomaton> <binautomaton>
#
# Argument check: both file names are required. The usage text goes to
# stderr and the script stops with a non-zero status, so that callers can
# detect the misuse.
if [ $# -lt 2 ]
then
    echo "Usage:" >&2
    echo " compdic-fst-to-bin <fstautomaton> <binautomaton>" >&2
    echo "where" >&2
    echo " <fstautomaton> - file containing automaton in openfst format" >&2
    echo " <binautomaton> - a file to which the compiled binary automaton (lem/cor/kor format) should be written" >&2
    exit 1
fi
# Private scratch directory for the intermediate files of this run.
tempdir=`mktemp -d /tmp/compdic.XXXXXX` || exit 1

# Remove the scratch directory however the script ends; the signal trap
# turns HUP/INT/TERM into a normal exit so that the EXIT trap runs.
trap 'rm -rf "$tempdir"' EXIT
trap 'exit 1' HUP INT TERM

# Symbol table handed to fstprint through --isymbols: one "<symbol> <id>"
# pair per line, with id 0 assigned to <eps>.
# The scratch directory is already unique, so a fixed file name is enough
# (the Debian-specific `tempfile` helper is not needed).
# NOTE(review): the non-ASCII symbols below presumably have to match the
# encoding of the dictionary data (iso-8859-2 according to the sibling
# compdic scripts) - confirm before re-saving this file in another encoding.
alphabet=$tempdir/alphabet.syms
cat <<EOF > "$alphabet"
<eps> 0
a 1
A 2
ä 3
± 4
¡ 5
b 6
B 7
c 8
C 9
æ 10
Æ 11
d 12
D 13
e 14
E 15
é 16
ê 17
Ê 18
f 19
F 20
g 21
G 22
h 23
H 24
i 25
I 26
j 27
J 28
k 29
K 30
l 31
L 32
³ 33
£ 34
m 35
M 36
n 37
N 38
ñ 39
Ñ 40
o 41
O 42
ö 43
ó 44
Ó 45
p 46
P 47
q 48
Q 49
r 50
R 51
s 52
S 53
¶ 54
¦ 55
t 56
T 57
u 58
U 59
ü 60
v 61
V 62
w 63
W 64
x 65
X 66
y 67
Y 68
z 69
Z 70
¼ 71
¬ 72
¿ 73
¯ 74
0 75
1 76
2 77
3 78
4 79
5 80
6 81
7 82
8 83
9 84
_ 85
- 86
? 87
! 88
~ 89
; 90
, 91
/ 92
* 93
+ 94
EOF
# Convert the OpenFst automaton $1 into the binary automaton $2.
# Expects $tempdir (scratch directory) and $alphabet (symbol table file) to
# be set by the preceding part of the script.
#
# The conversion runs in two steps so that a failure of the OpenFst stages
# is noticed: a pipeline reports only the status of its last command, and
# whether fsm2aut/aut2fsa reject empty input is not visible from here.
fsttext=$tempdir/automaton.txt
status=0

if fstrmepsilon < "$1" | fstdeterminize | fstminimize |
    fstprint --acceptor --isymbols="$alphabet" > "$fsttext" && [ -s "$fsttext" ]
then
    fsm2aut < "$fsttext" | aut2fsa > "$2" || status=1
else
    status=1
fi

if [ $status -ne 0 ]
then
    echo "compdic-fst-to-bin: conversion of $1 failed" >&2
fi

rm -r "$tempdir"

# Disabled step kept from the original script.
# NOTE(review): the sort command below has no output redirection - fix
# before re-enabling.
#echo generating cats file ...
#cat $1 | cut -d ',' -f 2 | sort -u $2.cats

exit $status

43
src/compdic/compdic-fst-update Executable file
View File

@ -0,0 +1,43 @@
# compdic-fst-update <dictionary> <difference> [<difference> ...]
#
# Applies word-level changes to a dictionary automaton in place:
#   1. collects from every <difference> file the words on lines starting
#      with '>' (additions) and with '<' (removals),
#   2. compiles both word lists with compdic-create-fst,
#   3. subtracts the removals from <dictionary> and unions in the additions,
#   4. keeps the previous dictionary as <dictionary>~ and installs the result.
# Exit status is non-zero, and <dictionary> is left untouched, when any step
# of the rebuild fails.

if [ $# -lt 2 ]
then
    echo "Usage:" >&2
    echo " compdic-fst-update <dictionary> <difference> <difference> ..." >&2
    echo "where" >&2
    echo " <dictionary> - dictionary automaton in openfst format, updated in place (the previous version is kept as <dictionary>~)" >&2
    echo " <difference> - file listing the changes: lines starting with '>' are words to add, lines starting with '<' are words to remove" >&2
    exit 1
fi

tempdir=`mktemp -d /tmp/compdic.XXXXXX` || exit 1

# Print a message, drop the scratch directory and abort.
fail()
{
    echo "compdic-fst-update: $1" >&2
    rm -rf "$tempdir"
    exit 1
}

dict=$1
shift

dicplus=$tempdir/plus.dic
fstplus=$tempdir/plus.fst
dicminus=$tempdir/minus.dic
fstminus=$tempdir/minus.fst
tmpfst=$tempdir/tmp.fst
newfst=$tempdir/new.fst

# Both word lists must exist even if no file contributes to them.
: > "$dicplus"
: > "$dicminus"

for difference in "$@"
do
    echo "processing $difference ..."
    # Keep only the marked lines and strip the marker plus following blanks.
    sed -n 's/^> *//p' "$difference" >> "$dicplus"
    sed -n 's/^< *//p' "$difference" >> "$dicminus"
done

echo "updating $dict ..."
compdic-create-fst "$dicplus" "$fstplus" || fail "cannot compile the list of added words"
compdic-create-fst "$dicminus" "$fstminus" || fail "cannot compile the list of removed words"

fstdifference "$dict" "$fstminus" | fstdeterminize > "$tmpfst" \
    || fail "cannot remove words from $dict"

# The result goes to a separate file: redirecting to $tmpfst here would
# truncate it while fstunion is still supposed to read it.
# fstrmepsilon follows fstunion as in compdic-fst-add and compdic-create-fst.
fstunion "$tmpfst" "$fstplus" | fstrmepsilon | fstdeterminize | fstminimize > "$newfst" \
    || fail "cannot add words to $dict"
[ -s "$newfst" ] || fail "the updated automaton is empty; $dict left unchanged"

# Keep a backup of the previous dictionary, then install the new one.
mv "$dict" "$dict~" && mv "$newfst" "$dict" || fail "cannot install the updated $dict"
rm -r "$tempdir"