dodanie możliwości szybkiego uaktualniania słownika dla lema
This commit is contained in:
parent
f4bf33ed04
commit
555c7f814b
@ -15,11 +15,11 @@ aut2fsa: aut2fsa.cc
|
||||
install:
|
||||
ifdef BIN_DIR
|
||||
install -m 0755 compdic $(BIN_DIR)
|
||||
install -m 0755 compdic-create-fst $(BIN_DIR)
|
||||
install -m 0755 compdic-fst-add $(BIN_DIR)
|
||||
install -m 0755 compdic-fst-minus $(BIN_DIR)
|
||||
install -m 0755 compdic-update $(BIN_DIR)
|
||||
install -m 0755 compdic-update-fst $(BIN_DIR)
|
||||
install -m 0755 compdic-update-cats $(BIN_DIR)
|
||||
install -m 0755 compdic-dic-to-fst $(BIN_DIR)
|
||||
install -m 0755 compdic-fst-to-bin $(BIN_DIR)
|
||||
install -m 0755 compdic-fst-update $(BIN_DIR)
|
||||
|
||||
install -m 0755 fsm2aut $(BIN_DIR)
|
||||
install -m 0755 aut2fsa $(BIN_DIR)
|
||||
@ -30,11 +30,12 @@ endif
|
||||
uninstall:
|
||||
ifdef BIN_DIR
|
||||
rm $(BIN_DIR)/compdic
|
||||
rm $(BIN_DIR)/compdic-create-fst
|
||||
rm $(BIN_DIR)/compdic-fst-add
|
||||
rm $(BIN_DIR)/compdic-fst-minus
|
||||
rm $(BIN_DIR)/compdic-update
|
||||
rm $(BIN_DIR)/compdic-update-fst
|
||||
rm $(BIN_DIR)/compdic-update-cats
|
||||
rm $(BIN_DIR)/compdic-dic-to-fst
|
||||
rm $(BIN_DIR)/compdic-fst-to-bin
|
||||
rm $(BIN_DIR)/compdic-fst-update
|
||||
|
||||
rm $(BIN_DIR)/fsm2aut
|
||||
rm $(BIN_DIR)/aut2fsa
|
||||
rm $(BIN_DIR)/lst2fstext
|
||||
|
@ -1,3 +1,4 @@
|
||||
#!/bin/bash
|
||||
|
||||
no_of_parts=0
|
||||
|
||||
|
18
src/compdic/compdic-dic-to-cats
Executable file
18
src/compdic/compdic-dic-to-cats
Executable file
@ -0,0 +1,18 @@
#!/bin/bash
#
# compdic-dic-to-cats - extract the category list from a dictionary file.
#
# Usage: compdic-dic-to-cats <dic-file> <cats-file>
#
# Takes the second comma-separated field of every line of <dic-file>,
# sorts the values, drops duplicates and writes the result to <cats-file>.
# Exits with status 1 on a usage error or when <dic-file> cannot be read.

if [ $# -lt 2 ]
then
    # Misuse is reported on stderr with a non-zero status so that calling
    # scripts can detect it.
    {
        echo "Usage:"
        echo "        compdic-dic-to-cats <dic-file> <cats-file>"
        echo "where"
        echo "    <dic-file>  - dictionary source file, one comma-separated entry per line"
        echo "    <cats-file> - file to which the sorted, unique list of categories (second field) will be written"
    } >&2
    exit 1
fi

source=$1
cats=$2

# Fail early instead of silently truncating <cats-file> when the input
# is missing or unreadable.
if [ ! -r "$source" ]
then
    echo "compdic-dic-to-cats: cannot read $source" >&2
    exit 1
fi

echo generating cats file ...

# Paths are quoted so that file names containing whitespace work.
cut -d ',' -f 2 < "$source" | sort -u > "$cats"
|
||||
|
187
src/compdic/compdic-dic-to-fst
Executable file
187
src/compdic/compdic-dic-to-fst
Executable file
@ -0,0 +1,187 @@
|
||||
#! /bin/bash
#
# compdic-dic-to-fst - compile a word list into an automaton in OpenFst format.
#
# Usage: compdic-dic-to-fst [-p <parts>] <sourcefile> <fst>
#
#   -p <parts>    number of chunks the source is split into before
#                 compilation; when omitted (or 0) one chunk is used per
#                 75000 source lines, plus one.
#   <sourcefile>  list of words, one per line, iso-8859-2 encoded
#   <fst>         output file for the compiled automaton
#
# This first section parses the command line and creates the scratch
# directory ($tempdir) and the symbol-table file ($alphabet) that the
# rest of the script fills in and uses.

no_of_parts=0

# Options are consumed only while more than the two mandatory operands
# remain on the command line.
while [ $# -gt 2 ]
do
    case $1
    in
        -p)
            no_of_parts=$2
            shift 2
            ;;

        *)
            echo "The arguments to use are"
            echo "-p: number of parts"
            shift 1
            ;;
    esac
done

if [ $# -lt 2 ]
then
    # Misuse is reported on stderr with a non-zero status so that callers
    # (e.g. compdic-update-fst) can detect it.
    {
        echo "Usage:"
        echo "        compdic-dic-to-fst [-p <parts>] <sourcefile> <fst>"
        echo "where"
        echo "    <sourcefile> - file containing a list of words, one per line, iso-8859-2 encoded"
        echo "    <fst> - file to which the compiled automaton in openfst format will be written"
    } >&2
    exit 1
fi

source=$1
fst=$2

if [ ! -r "$source" ]
then
    echo "compdic-dic-to-fst: cannot read $source" >&2
    exit 1
fi

# Reject a non-numeric -p value here; otherwise it would only surface
# later as a confusing arithmetic or split(1) error.
case $no_of_parts
in
    ''|*[!0-9]*)
        echo "compdic-dic-to-fst: -p expects a non-negative integer, got '$no_of_parts'" >&2
        exit 1
        ;;
esac

# Force base 10 so that values such as "08" are not parsed as octal by
# the arithmetic expansions used further down.
no_of_parts=$(( 10#$no_of_parts ))

if [ "$no_of_parts" -eq 0 ]
then
    no_of_parts=$(( $(wc -l < "$source") / 75000 + 1 ))
fi

echo number of parts: $no_of_parts

tempdir=$(mktemp -d /tmp/compdic.XXXXXX) || {
    echo "compdic-dic-to-fst: cannot create temporary directory" >&2
    exit 1
}

# Remove the scratch directory on every exit path, including failures.
trap 'rm -rf "$tempdir"' EXIT

# mktemp replaces the Debian-specific, deprecated tempfile(1).
alphabet=$(mktemp "$tempdir/alphabet.XXXXXX") || {
    echo "compdic-dic-to-fst: cannot create symbol table file" >&2
    exit 1
}
|
||||
|
||||
cat <<EOF > $alphabet
|
||||
<eps> 0
|
||||
a 1
|
||||
A 2
|
||||
ä 3
|
||||
± 4
|
||||
¡ 5
|
||||
b 6
|
||||
B 7
|
||||
c 8
|
||||
C 9
|
||||
æ 10
|
||||
Æ 11
|
||||
d 12
|
||||
D 13
|
||||
e 14
|
||||
E 15
|
||||
é 16
|
||||
ê 17
|
||||
Ê 18
|
||||
f 19
|
||||
F 20
|
||||
g 21
|
||||
G 22
|
||||
h 23
|
||||
H 24
|
||||
i 25
|
||||
I 26
|
||||
j 27
|
||||
J 28
|
||||
k 29
|
||||
K 30
|
||||
l 31
|
||||
L 32
|
||||
³ 33
|
||||
£ 34
|
||||
m 35
|
||||
M 36
|
||||
n 37
|
||||
N 38
|
||||
ñ 39
|
||||
Ñ 40
|
||||
o 41
|
||||
O 42
|
||||
ö 43
|
||||
ó 44
|
||||
Ó 45
|
||||
p 46
|
||||
P 47
|
||||
q 48
|
||||
Q 49
|
||||
r 50
|
||||
R 51
|
||||
s 52
|
||||
S 53
|
||||
¶ 54
|
||||
¦ 55
|
||||
t 56
|
||||
T 57
|
||||
u 58
|
||||
U 59
|
||||
ü 60
|
||||
v 61
|
||||
V 62
|
||||
w 63
|
||||
W 64
|
||||
x 65
|
||||
X 66
|
||||
y 67
|
||||
Y 68
|
||||
z 69
|
||||
Z 70
|
||||
¼ 71
|
||||
¬ 72
|
||||
¿ 73
|
||||
¯ 74
|
||||
0 75
|
||||
1 76
|
||||
2 77
|
||||
3 78
|
||||
4 79
|
||||
5 80
|
||||
6 81
|
||||
7 82
|
||||
8 83
|
||||
9 84
|
||||
_ 85
|
||||
- 86
|
||||
? 87
|
||||
! 88
|
||||
~ 89
|
||||
; 90
|
||||
, 91
|
||||
/ 92
|
||||
* 93
|
||||
+ 94
|
||||
EOF
|
||||
|
||||
|
||||
# Report a failed step on stderr, remove the scratch directory and abort.
fail () {
    echo "compdic-dic-to-fst: $1" >&2
    rm -rf "$tempdir"
    exit 1
}

# Chunk size: source line count divided by the number of parts, plus one
# so that the chunk size is never zero.
no_of_lines=$(( ($(wc -l < "$source") / $no_of_parts) + 1 ))

split -l "$no_of_lines" "$source" "$tempdir/part." || fail "cannot split $source"

automaton=$tempdir/output.fst

# Start from the acceptor compiled from empty input; every chunk is
# unioned into it below.
fstcompile --acceptor --isymbols="$alphabet" < /dev/null > "$automaton"

n=0

for f in "$tempdir"/part.*
do
    # With an empty source split(1) creates no files and the glob stays
    # unexpanded; skip that literal pattern.
    [ -e "$f" ] || continue

    # mktemp replaces the Debian-specific, deprecated tempfile(1).
    temp1=$(mktemp "$tempdir/tmp.XXXXXX") || fail "cannot create temporary file"
    temp2=$(mktemp "$tempdir/tmp.XXXXXX") || fail "cannot create temporary file"
    temp3=$(mktemp "$tempdir/tmp.XXXXXX") || fail "cannot create temporary file"

    n=$(( n + 1 ))
    echo processing part $n

    # Compile this chunk into its own minimized acceptor.
    lst2fstext < "$f" |
        fstcompile --acceptor --isymbols="$alphabet" |
        fstrmepsilon |
        fstdeterminize > "$temp1" || fail "compilation of part $n failed"
    fstminimize "$temp1" "$temp2" || fail "minimization of part $n failed"

    # Merge the chunk into the accumulated automaton and re-minimize.
    fstunion "$automaton" "$temp2" | fstrmepsilon | fstdeterminize > "$temp3" \
        || fail "union with part $n failed"
    fstminimize "$temp3" "$automaton" || fail "minimization after part $n failed"

    # The per-part scratch files are no longer needed.
    rm -f "$temp1" "$temp2" "$temp3"
done

echo generating binary automaton file ...

fsttopsort < "$automaton" > "$fst" || fail "cannot write $fst"

rm -r "$tempdir"

# The category list is not produced here; compdic-dic-to-cats does that.
|
@ -1,4 +1,4 @@
|
||||
|
||||
#!/bin/bash
|
||||
|
||||
if [ $# -lt 2 ]
|
||||
then
|
||||
|
48
src/compdic/compdic-update
Executable file
48
src/compdic/compdic-update
Executable file
@ -0,0 +1,48 @@
|
||||
#! /bin/bash
#
# compdic-update - apply diff files to an installed dictionary.
#
# Usage: compdic-update [-d <dictionary-home-dir>] <dictionary-name> <difference> <difference> ...
#
# For the dictionary <dictionary-name> in the dictionary home directory
# this runs, in order:
#   compdic-update-fst  <home>/<name>.fst  <difference>...
#   compdic-fst-to-bin  <home>/<name>.fst  <home>/<name>.bin
#   compdic-update-cats <home>/<name>.cats <difference>...
#
# Exit status is non-zero on a usage error or when any step fails.

# Site configuration; presumably provides the default dictionary_home
# (TODO confirm against the installed compdic.conf).
. /etc/utt/compdic.conf

# Print the help text on stdout.
usage () {
    echo "Usage:"
    echo "        compdic-update [-d <dictionary-home-dir>] <dictionary-name> <difference> <difference> ..."
    echo "where"
    echo "    <dictionary-home-dir> - dictionary home directory"
    echo "    <dictionary-name> - dictionary name"
    echo "    <difference> - diff format file containing paths to be added/removed from dictionary"
}

# Only leading arguments that look like options are consumed. The
# previous "while more than two arguments remain" test also swallowed
# the dictionary name whenever two or more diff files were given.
while [ $# -gt 0 ]
do
    case $1
    in
        -d)
            if [ $# -lt 2 ]
            then
                usage >&2
                exit 1
            fi
            dictionary_home=$2
            shift 2
            ;;

        --)
            shift 1
            break
            ;;

        -*)
            echo "The arguments to use are" >&2
            echo "-d <dictionary-home-dir>" >&2
            exit 1
            ;;

        *)
            break
            ;;
    esac
done

if [ $# -lt 2 ]
then
    usage >&2
    exit 1
fi

# Without a home directory the paths below would point at the file
# system root.
if [ -z "$dictionary_home" ]
then
    echo "compdic-update: dictionary home directory is not set (use -d)" >&2
    exit 1
fi

fst=$dictionary_home/$1.fst
cats=$dictionary_home/$1.cats
bin=$dictionary_home/$1.bin
shift 1

# An array keeps diff paths containing whitespace intact.
diffs=("$@")

echo running "compdic-update-fst $fst ${diffs[*]}"
# Regenerating the binary from an automaton that failed to update would
# be pointless, so stop here on failure.
compdic-update-fst "$fst" "${diffs[@]}" || {
    echo "compdic-update: compdic-update-fst failed" >&2
    exit 1
}

status=0

echo running "compdic-fst-to-bin $fst $bin"
compdic-fst-to-bin "$fst" "$bin" || {
    echo "compdic-update: compdic-fst-to-bin failed" >&2
    status=1
}

echo running "compdic-update-cats $cats ${diffs[*]}"
compdic-update-cats "$cats" "${diffs[@]}" || {
    echo "compdic-update: compdic-update-cats failed" >&2
    status=1
}

exit $status
|
44
src/compdic/compdic-update-cats
Executable file
44
src/compdic/compdic-update-cats
Executable file
@ -0,0 +1,44 @@
|
||||
#! /bin/bash
#
# compdic-update-cats - add categories introduced by diff files to a
# category list.
#
# Usage: compdic-update-cats <catfile> <difference> <difference> ...
#
# For every line of each <difference> that starts with '>', the text up
# to the first whitespace is taken, its second comma-separated field is
# extracted and passed through canonize. The resulting categories are
# merged (sorted, duplicates removed) into <catfile>; the previous
# version is kept as <catfile>~.
#
# Lines starting with '<' are not subtracted from the list: that step
# was already disabled in the previous revision of this script,
# presumably because a category can still be used by other entries
# (TODO confirm). The unused "minus" list is therefore no longer built.

if [ $# -lt 2 ]
then
    # Misuse is reported on stderr with a non-zero status.
    {
        echo "Usage:"
        echo "        compdic-update-cats <catfile> <difference> <difference> ..."
        echo "where"
        echo "    <catfile> - file containing the list morphosyntactic categories"
        echo "    <difference> - diff-format file containing lines to be added to/removed from dictionary"
    } >&2
    exit 1
fi

cats=$1
shift

tempdir=$(mktemp -d /tmp/compdic.XXXXXX) || {
    echo "compdic-update-cats: cannot create temporary directory" >&2
    exit 1
}

# Remove the scratch directory on every exit path, including failures.
trap 'rm -rf "$tempdir"' EXIT

catplus=$tempdir/plus.cat
cattmp=$tempdir/tmp.cat

: > "$catplus"

for diff in "$@"
do
    echo processing $diff ...
    # grep -E replaces the obsolescent egrep.
    grep -E '^>' -- "$diff" | sed -r 's/^> *//;s/[[:space:]].*$//' | cut -d ',' -f 2 | sort -u | canonize >> "$catplus"
done

echo updating $cats ...

# Build the new list completely before touching the installed one.
if [ -e "$cats" ]
then
    sort -u "$cats" "$catplus" > "$cattmp"
else
    sort -u "$catplus" > "$cattmp"
fi || {
    echo "compdic-update-cats: cannot build the new category list" >&2
    exit 1
}

# Keep the previous version as a backup, if there was one.
if [ -e "$cats" ]
then
    mv "$cats" "$cats~" || exit 1
fi

mv "$cattmp" "$cats" || exit 1
chmod a+r "$cats"
|
67
src/compdic/compdic-update-fst
Executable file
67
src/compdic/compdic-update-fst
Executable file
@ -0,0 +1,67 @@
|
||||
#! /bin/bash
#
# compdic-update-fst - apply diff files to a dictionary automaton.
#
# Usage: compdic-update-fst <dictionary> <difference> <difference> ...
#
# From every <difference>, lines starting with '>' are collected as
# entries to add and lines starting with '<' as entries to remove (the
# text up to the first whitespace, passed through canonize). Each
# non-empty set is compiled with compdic-dic-to-fst; the removals are
# subtracted from <dictionary> with fstdifference and the additions are
# merged with fstunion. The result is epsilon-free, determinized and
# minimized. The previous automaton is kept as <dictionary>~.
#
# The new automaton is built completely in a scratch directory and only
# then moved into place, so a failing step leaves <dictionary> intact.

if [ $# -lt 2 ]
then
    # Misuse is reported on stderr with a non-zero status.
    {
        echo "Usage:"
        echo "        compdic-update-fst <dictionary> <difference> <difference> ..."
        echo "where"
        echo "    <dictionary> - fst format automaton"
        echo "    <difference> - diff format file containing paths to be added/removed from dictionary"
    } >&2
    exit 1
fi

dict=$1
shift

if [ ! -r "$dict" ]
then
    echo "compdic-update-fst: cannot read $dict" >&2
    exit 1
fi

tempdir=$(mktemp -d /tmp/compdic.XXXXXX) || {
    echo "compdic-update-fst: cannot create temporary directory" >&2
    exit 1
}

# Remove the scratch directory on every exit path, including failures.
trap 'rm -rf "$tempdir"' EXIT

# Report a failed step and abort; the EXIT trap cleans up.
fail () {
    echo "compdic-update-fst: $1" >&2
    exit 1
}

dicplus=$tempdir/plus.dic
fstplus=$tempdir/plus.fst
dicminus=$tempdir/minus.dic
fstminus=$tempdir/minus.fst
newdict=$tempdir/new.fst

: > "$dicplus"
: > "$dicminus"

for diff in "$@"
do
    echo processing $diff ...
    # grep -E replaces the obsolescent egrep.
    grep -E '^>' -- "$diff" | sed -r 's/^> *//;s/[[:space:]].*$//' | canonize >> "$dicplus"
    grep -E '^<' -- "$diff" | sed -r 's/^< *//;s/[[:space:]].*$//' | canonize >> "$dicminus"
done

echo updating $dict ...

# Step 1: subtract the removed entries, if there are any.
if [ -s "$dicminus" ]
then
    tmpfst1=$tempdir/tmp1.fst
    echo "running compdic-dic-to-fst $dicminus $fstminus"
    compdic-dic-to-fst "$dicminus" "$fstminus" || fail "compdic-dic-to-fst failed for removed entries"
    echo "running fstdifference $dict $fstminus > $tmpfst1"
    fstdifference "$dict" "$fstminus" > "$tmpfst1" || fail "fstdifference failed"
else
    tmpfst1=$dict
fi

# Step 2: merge in the added entries, if there are any.
if [ -s "$dicplus" ]
then
    tmpfst2=$tempdir/tmp2.fst
    echo "running compdic-dic-to-fst $dicplus $fstplus"
    compdic-dic-to-fst "$dicplus" "$fstplus" || fail "compdic-dic-to-fst failed for added entries"
    echo "running fstunion $tmpfst1 $fstplus > $tmpfst2"
    fstunion "$tmpfst1" "$fstplus" > "$tmpfst2" || fail "fstunion failed"
else
    tmpfst2=$tmpfst1
fi

# Step 3: normalize into a scratch file. When the diffs changed nothing,
# $tmpfst2 is $dict itself, so it must be read before $dict is renamed.
echo "running fstrmepsilon < ${tmpfst2} | fstdeterminize | fstminimize > ${newdict}"
fstrmepsilon < "$tmpfst2" | fstdeterminize | fstminimize > "$newdict" || fail "normalization of the updated automaton failed"

# Step 4: keep the old automaton as a backup and install the new one.
echo "running mv ${dict} ${dict}~"
mv "$dict" "$dict~" || fail "cannot back up $dict"

echo "running mv ${newdict} ${dict}"
mv "$newdict" "$dict" || {
    # Put the previous automaton back so the dictionary is never missing.
    mv "$dict~" "$dict"
    fail "cannot install the updated automaton"
}

chmod a+r "$dict"
|
Loading…
Reference in New Issue
Block a user