diff --git a/src/common/common.cc b/src/common/common.cc
index c7f4a57..d4a0b87 100644
--- a/src/common/common.cc
+++ b/src/common/common.cc
@@ -167,14 +167,14 @@ void process_common_options(gengetopt_args_info* args, char* argv0)
   if(args->output_given)
     if(!(outputf=fopen(args->output_arg,"w")))
       {
-	fprintf(stderr,"Cannot open output file: %s.\n", args->output_arg);
+	fprintf(stderr,"Cannot open the output file: %s.\n", args->output_arg);
 	exit(1);
       }
 
   if(args->fail_given)
     if(!(failedf=fopen(args->fail_arg,"w")))
       {
-	fprintf(stderr,"Cannot open output file: %s.\n", args->fail_arg);
+	fprintf(stderr,"Cannot open the output file: %s.\n", args->fail_arg);
 	exit(1);
       }
 
diff --git a/src/compdic/Makefile b/src/compdic/Makefile
index f4e8282..9201103 100644
--- a/src/compdic/Makefile
+++ b/src/compdic/Makefile
@@ -15,6 +15,12 @@ aut2fsa: aut2fsa.cc
 install:
 ifdef BIN_DIR
 	install -m 0755 compdic $(BIN_DIR)
+	install -m 0755 compdic-create-fst $(BIN_DIR)
+	install -m 0755 compdic-fst-add $(BIN_DIR)
+	install -m 0755 compdic-fst-minus $(BIN_DIR)
+	install -m 0755 compdic-fst-to-bin $(BIN_DIR)
+	install -m 0755 compdic-fst-update $(BIN_DIR)
+	install -m 0755 fsm2aut $(BIN_DIR)
 	install -m 0755 aut2fsa $(BIN_DIR)
 	install -m 0755 lst2fstext $(BIN_DIR)
 
@@ -24,6 +30,11 @@ endif
 uninstall:
 ifdef BIN_DIR
 	rm $(BIN_DIR)/compdic
+	rm $(BIN_DIR)/compdic-create-fst
+	rm $(BIN_DIR)/compdic-fst-add
+	rm $(BIN_DIR)/compdic-fst-minus
+	rm $(BIN_DIR)/compdic-fst-to-bin
+	rm $(BIN_DIR)/compdic-fst-update
 	rm $(BIN_DIR)/fsm2aut
 	rm $(BIN_DIR)/aut2fsa
 	rm $(BIN_DIR)/lst2fstext
diff --git a/src/compdic/compdic-create-fst b/src/compdic/compdic-create-fst
new file mode 100755
--- /dev/null
+++ b/src/compdic/compdic-create-fst
@@ -0,0 +1,204 @@
+#!/bin/sh
+#
+# compdic-create-fst - compile a word list into an automaton in OpenFst format.
+#
+# Usage: compdic-create-fst [-p <parts>] <wordlist> <automaton>
+#
+# The word list is split into <parts> chunks.  Each chunk is compiled,
+# determinized and minimized on its own and then merged into the automaton
+# built so far, so no single step has to process the whole list at once.
+
+no_of_parts=0
+
+while [ $# -gt 2 ]
+do
+    case $1
+    in
+        -p)
+            no_of_parts=$2
+            shift 2
+            ;;
+
+        *)
+            echo "The arguments to use are"
+            echo "-p: number of parts"
+            shift 1
+            ;;
+    esac
+done
+
+if [ $# -lt 2 ]
+then
+    echo "Usage:"
+    echo "        compdic-create-fst [-p <parts>] <wordlist> <automaton>"
+    echo "where"
+    echo "    <wordlist> - file containing a list of words, one per line, iso-8859-2 encoded"
+    echo "    <automaton> - a file to which the compiled automaton in openfst format should be written"
+    exit 1
+fi
+
+if [ ! -r "$1" ]
+then
+    echo "Cannot read the word list: $1." >&2
+    exit 1
+fi
+
+# Count the words once; the figure is needed both for the default number of
+# parts and for the size of each part.
+no_of_words=`wc -l < "$1"`
+
+if [ "$no_of_parts" -eq 0 ]
+then
+    no_of_parts=$(( $no_of_words / 75000 + 1 ))
+fi
+
+echo number of parts: $no_of_parts
+
+tempdir=`mktemp -d /tmp/compdic.XXXXXX` || exit 1
+# Remove the scratch directory however the script ends.
+trap 'rm -rf "$tempdir"' EXIT
+
+alphabet=$tempdir/alphabet.sym
+automaton=$tempdir/output.fst
+temp1=$tempdir/part-det.fst
+temp2=$tempdir/part-min.fst
+temp3=$tempdir/union-det.fst
+
+# Symbol table used by fstcompile below.
+# NOTE(review): the non-ASCII symbols are presumably single ISO-8859-2 bytes
+# (the word list is documented as iso-8859-2 encoded) - confirm the encoding
+# of this file before editing the table.  Label 0 is assumed to be <eps>,
+# the OpenFst name for epsilon - confirm.
+cat <<EOF > "$alphabet"
+<eps> 0
+a 1
+A 2
+ä 3
+± 4
+¡ 5
+b 6
+B 7
+c 8
+C 9
+æ 10
+Æ 11
+d 12
+D 13
+e 14
+E 15
+é 16
+ê 17
+Ê 18
+f 19
+F 20
+g 21
+G 22
+h 23
+H 24
+i 25
+I 26
+j 27
+J 28
+k 29
+K 30
+l 31
+L 32
+³ 33
+£ 34
+m 35
+M 36
+n 37
+N 38
+ñ 39
+Ñ 40
+o 41
+O 42
+ö 43
+ó 44
+Ó 45
+p 46
+P 47
+q 48
+Q 49
+r 50
+R 51
+s 52
+S 53
+¶ 54
+¦ 55
+t 56
+T 57
+u 58
+U 59
+ü 60
+v 61
+V 62
+w 63
+W 64
+x 65
+X 66
+y 67
+Y 68
+z 69
+Z 70
+¼ 71
+¬ 72
+¿ 73
+¯ 74
+0 75
+1 76
+2 77
+3 78
+4 79
+5 80
+6 81
+7 82
+8 83
+9 84
+_ 85
+- 86
+? 87
+! 88
+~ 89
+; 90
+, 91
+/ 92
+* 93
++ 94
+EOF
+
+# One more line than the exact quotient, so that split never produces more
+# than $no_of_parts files.
+no_of_lines=$(( $no_of_words / $no_of_parts + 1 ))
+
+split -l $no_of_lines "$1" "$tempdir/part."
+
+# Start from the automaton that accepts nothing, the neutral element of
+# fstunion.  It has to be a compiled FST: a zero-byte file has no header
+# and the OpenFst tools refuse to read it.
+fstcompile --acceptor --isymbols="$alphabet" < /dev/null > "$automaton"
+
+n=0
+
+for f in "$tempdir"/part.*
+do
+    # If split created no files (empty word list) the pattern above is
+    # left unexpanded and must be skipped.
+    [ -e "$f" ] || continue
+
+    n=$(( $n + 1 ))
+    echo processing part $n
+
+    lst2fstext < "$f" |
+        fstcompile --acceptor --isymbols="$alphabet" |
+        fstrmepsilon |
+        fstdeterminize > "$temp1"
+    fstminimize "$temp1" "$temp2"
+
+    fstunion "$automaton" "$temp2" | fstrmepsilon | fstdeterminize > "$temp3"
+    fstminimize "$temp3" "$automaton"
+done
+
+echo generating binary automaton file ...
+
+fsttopsort < "$automaton" > "$2"
diff --git a/src/compdic/compdic-fst-add b/src/compdic/compdic-fst-add
new file mode 100755
--- /dev/null
+++ b/src/compdic/compdic-fst-add
@@ -0,0 +1,31 @@
+#!/bin/sh
+#
+# compdic-fst-add - add the paths of one OpenFst automaton to another.
+#
+# Usage: compdic-fst-add <automaton1> <automaton2>
+#
+# <automaton1> is replaced by the minimized, topologically sorted union of
+# the two automata.
+
+if [ $# -ne 2 ]
+then
+    echo "Usage:"
+    echo "        compdic-fst-add <automaton1> <automaton2>"
+    echo "where"
+    echo "    <automaton1> - automaton in openfst format"
+    echo "    <automaton2> - automaton in openfst format containing paths to be added to <automaton1>"
+    exit 1
+fi
+
+tempdir=`mktemp -d /tmp/compdic.XXXXXX` || exit 1
+# Remove the scratch directory however the script ends.
+trap 'rm -rf "$tempdir"' EXIT
+
+automaton=$tempdir/output.fst
+
+# Build the result in a scratch file and replace <automaton1> only when the
+# pipeline reports success.  The shell checks the last stage only; an earlier
+# failure is expected to surface there because the following tools then
+# receive no valid FST.
+fstunion "$1" "$2" | fstrmepsilon | fstdeterminize | fstminimize | fsttopsort > "$automaton" &&
+    mv "$automaton" "$1"
diff --git a/src/compdic/compdic-fst-minus b/src/compdic/compdic-fst-minus
new file mode 100755
--- /dev/null
+++ b/src/compdic/compdic-fst-minus
@@ -0,0 +1,31 @@
+#!/bin/sh
+#
+# compdic-fst-minus - remove the paths of one OpenFst automaton from another.
+#
+# Usage: compdic-fst-minus <automaton1> <automaton2>
+#
+# <automaton1> is replaced by the topologically sorted difference of the
+# two automata.
+
+if [ $# -ne 2 ]
+then
+    echo "Usage:"
+    echo "        compdic-fst-minus <automaton1> <automaton2>"
+    echo "where"
+    echo "    <automaton1> - automaton in openfst format"
+    echo "    <automaton2> - automaton in openfst format containing paths to be removed from <automaton1>"
+    exit 1
+fi
+
+tempdir=`mktemp -d /tmp/compdic.XXXXXX` || exit 1
+# Remove the scratch directory however the script ends.
+trap 'rm -rf "$tempdir"' EXIT
+
+automaton=$tempdir/output.fst
+
+# Build the result in a scratch file and replace <automaton1> only when the
+# pipeline reports success.  The shell checks the last stage only; an earlier
+# failure is expected to surface there because the following tool then
+# receives no valid FST.
+fstdifference "$1" "$2" | fsttopsort > "$automaton" &&
+    mv "$automaton" "$1"
diff --git a/src/compdic/compdic-fst-to-bin b/src/compdic/compdic-fst-to-bin
new file mode 100755
--- /dev/null
+++ b/src/compdic/compdic-fst-to-bin
@@ -0,0 +1,130 @@
+#!/bin/sh
+#
+# compdic-fst-to-bin - convert an OpenFst automaton to the binary automaton format.
+#
+# Usage: compdic-fst-to-bin <fst> <binary>
+
+if [ $# -lt 2 ]
+then
+    echo "Usage:"
+    echo "        compdic-fst-to-bin <fst> <binary>"
+    echo "where"
+    echo "    <fst> - file containing automaton in openfst format"
+    echo "    <binary> - a file to which the compiled binary automaton (lem/cor/kor format) should be written"
+    exit 1
+fi
+
+tempdir=`mktemp -d /tmp/compdic.XXXXXX` || exit 1
+# Remove the scratch directory however the script ends.
+trap 'rm -rf "$tempdir"' EXIT
+
+alphabet=$tempdir/alphabet.sym
+
+# Symbol table used by fstprint below.
+# NOTE(review): the non-ASCII symbols are presumably single ISO-8859-2 bytes
+# - confirm the encoding of this file before editing the table.  Label 0 is
+# assumed to be <eps>, the OpenFst name for epsilon - confirm.
+cat <<EOF > "$alphabet"
+<eps> 0
+a 1
+A 2
+ä 3
+± 4
+¡ 5
+b 6
+B 7
+c 8
+C 9
+æ 10
+Æ 11
+d 12
+D 13
+e 14
+E 15
+é 16
+ê 17
+Ê 18
+f 19
+F 20
+g 21
+G 22
+h 23
+H 24
+i 25
+I 26
+j 27
+J 28
+k 29
+K 30
+l 31
+L 32
+³ 33
+£ 34
+m 35
+M 36
+n 37
+N 38
+ñ 39
+Ñ 40
+o 41
+O 42
+ö 43
+ó 44
+Ó 45
+p 46
+P 47
+q 48
+Q 49
+r 50
+R 51
+s 52
+S 53
+¶ 54
+¦ 55
+t 56
+T 57
+u 58
+U 59
+ü 60
+v 61
+V 62
+w 63
+W 64
+x 65
+X 66
+y 67
+Y 68
+z 69
+Z 70
+¼ 71
+¬ 72
+¿ 73
+¯ 74
+0 75
+1 76
+2 77
+3 78
+4 79
+5 80
+6 81
+7 82
+8 83
+9 84
+_ 85
+- 86
+? 87
+! 88
+~ 89
+; 90
+, 91
+/ 92
+* 93
++ 94
+EOF
+
+fstrmepsilon < "$1" |
+    fstdeterminize |
+    fstminimize |
+    fstprint --acceptor --isymbols="$alphabet" |
+    fsm2aut |
+    aut2fsa > "$2"
diff --git a/src/compdic/compdic-fst-update b/src/compdic/compdic-fst-update
new file mode 100755
--- /dev/null
+++ b/src/compdic/compdic-fst-update
@@ -0,0 +1,55 @@
+#!/bin/sh
+#
+# compdic-fst-update - apply word additions and removals to an OpenFst automaton.
+#
+# Usage: compdic-fst-update <automaton> <changes>...
+#
+# Lines of the <changes> files that start with '>' give words to add, lines
+# that start with '<' give words to remove.  The previous automaton is kept
+# as <automaton>~.
+
+if [ $# -lt 2 ]
+then
+    echo "Usage:"
+    echo "        compdic-fst-update <automaton> <changes>..."
+    echo "where"
+    echo "    <automaton> - automaton in openfst format to be updated"
+    echo "    <changes> - file with the words to add (lines starting with '>') and to remove (lines starting with '<')"
+    exit 1
+fi
+
+tempdir=`mktemp -d /tmp/compdic.XXXXXX` || exit 1
+# Remove the scratch directory however the script ends.
+trap 'rm -rf "$tempdir"' EXIT
+
+dict=$1
+shift
+
+dicplus=$tempdir/plus.dic
+fstplus=$tempdir/plus.fst
+dicminus=$tempdir/minus.dic
+fstminus=$tempdir/minus.fst
+tmpfst=$tempdir/tmp.fst
+newfst=$tempdir/new.fst
+
+touch "$dicplus"
+touch "$dicminus"
+
+while [ $# -gt 0 ]
+do
+    echo processing $1 ...
+    grep '^>' "$1" | sed 's/^> *//' >> "$dicplus"
+    grep '^<' "$1" | sed 's/^< *//' >> "$dicminus"
+    shift
+done
+
+echo updating $dict ...
+# The union is written to its own file: redirecting it back to $tmpfst can
+# truncate that file before fstunion has read it.  $dict is replaced only if
+# none of the steps reported a failure.
+compdic-create-fst "$dicplus" "$fstplus" &&
+    compdic-create-fst "$dicminus" "$fstminus" &&
+    fstdifference "$dict" "$fstminus" | fstdeterminize > "$tmpfst" &&
+    fstunion "$tmpfst" "$fstplus" | fstrmepsilon | fstdeterminize | fstminimize > "$newfst" &&
+    mv "$dict" "$dict~" &&
+    mv "$newfst" "$dict"