dodanie możliwości szybkiego uaktualniania słownika dla lema

This commit is contained in:
Tomasz Obrebski 2013-01-17 19:29:12 +01:00
parent f4bf33ed04
commit 555c7f814b
8 changed files with 375 additions and 9 deletions

View File

@ -15,11 +15,11 @@ aut2fsa: aut2fsa.cc
install: install:
ifdef BIN_DIR ifdef BIN_DIR
install -m 0755 compdic $(BIN_DIR) install -m 0755 compdic $(BIN_DIR)
install -m 0755 compdic-create-fst $(BIN_DIR) install -m 0755 compdic-update $(BIN_DIR)
install -m 0755 compdic-fst-add $(BIN_DIR) install -m 0755 compdic-update-fst $(BIN_DIR)
install -m 0755 compdic-fst-minus $(BIN_DIR) install -m 0755 compdic-update-cats $(BIN_DIR)
install -m 0755 compdic-dic-to-fst $(BIN_DIR)
install -m 0755 compdic-fst-to-bin $(BIN_DIR) install -m 0755 compdic-fst-to-bin $(BIN_DIR)
install -m 0755 compdic-fst-update $(BIN_DIR)
install -m 0755 fsm2aut $(BIN_DIR) install -m 0755 fsm2aut $(BIN_DIR)
install -m 0755 aut2fsa $(BIN_DIR) install -m 0755 aut2fsa $(BIN_DIR)
@ -30,11 +30,12 @@ endif
uninstall: uninstall:
ifdef BIN_DIR ifdef BIN_DIR
rm $(BIN_DIR)/compdic rm $(BIN_DIR)/compdic
rm $(BIN_DIR)/compdic-create-fst rm $(BIN_DIR)/compdic-update
rm $(BIN_DIR)/compdic-fst-add rm $(BIN_DIR)/compdic-update-fst
rm $(BIN_DIR)/compdic-fst-minus rm $(BIN_DIR)/compdic-update-cats
rm $(BIN_DIR)/compdic-dic-to-fst
rm $(BIN_DIR)/compdic-fst-to-bin rm $(BIN_DIR)/compdic-fst-to-bin
rm $(BIN_DIR)/compdic-fst-update
rm $(BIN_DIR)/fsm2aut rm $(BIN_DIR)/fsm2aut
rm $(BIN_DIR)/aut2fsa rm $(BIN_DIR)/aut2fsa
rm $(BIN_DIR)/lst2fstext rm $(BIN_DIR)/lst2fstext

View File

@ -1,3 +1,4 @@
#!/bin/bash
no_of_parts=0 no_of_parts=0

18
src/compdic/compdic-dic-to-cats Executable file
View File

@ -0,0 +1,18 @@
#! /bin/bash
# compdic-dic-to-cats: write the list of distinct categories found in a
# dictionary source file.
#
# Usage: compdic-dic-to-cats <dic-file> <cats-file>
#   <dic-file>  - input; the second comma-separated field of every line is used
#   <cats-file> - output; receives those fields sorted, with duplicates removed
#
# The shebang is required because the file is installed as an executable and
# would otherwise only run when started from another shell.

if [ $# -lt 2 ]
then
    echo "Usage:"
    echo "        compdic-dic-to-cats <dic-file> <cats-file>"
    echo "where"
    echo "    <dic-file> - dictionary source file, one entry per line, with the category in the second comma-separated field"
    echo "    <cats-file> - file to which the sorted list of distinct categories will be written"
    exit 0
fi

source=$1
cats=$2

echo generating cats file ...

# Paths are quoted so that file names containing spaces or glob characters work.
cut -d ',' -f 2 "$source" | sort -u > "$cats"

187
src/compdic/compdic-dic-to-fst Executable file
View File

@ -0,0 +1,187 @@
#! /bin/bash
# compdic-dic-to-fst: compile a word list into an automaton in OpenFst format.
#
# Usage: compdic-dic-to-fst [-p <parts>] <sourcefile> <fst>
#   -p <parts>   - number of parts the source is split into before compilation;
#                  when omitted (or 0) it is derived from the source size:
#                  one part per 75000 lines, plus one
#   <sourcefile> - input word list
#   <fst>        - output file for the compiled automaton

no_of_parts=0

# Consume leading arguments until only the two positional ones are left.
while [ $# -gt 2 ]
do
    case $1
    in
        -p)
            no_of_parts=$2
            shift 2
            ;;
        *)
            echo "The arguments to use are"
            echo "-p: number of parts"
            shift 1
            ;;
    esac
done

if [ $# -lt 2 ]
then
    echo "Usage:"
    echo "        compdic-dic-to-fst [-p <parts>] <sourcefile> <fst>"
    echo "where"
    echo "    <sourcefile> - file containing a list of words, one per line, iso-8859-2 encoded"
    echo "    <fst> - file to which the compiled automaton in openfst format will be written"
    exit 0
fi

source=$1
fst=$2

# Fail early: an unreadable source would otherwise only surface as an
# arithmetic error in the line counts below.
if [ ! -r "$source" ]
then
    echo "compdic-dic-to-fst: cannot read $source" >&2
    exit 1
fi

if [ "$no_of_parts" -eq 0 ]
then
    no_of_parts=$(( $(wc -l < "$source") / 75000 + 1 ))
fi

echo number of parts: $no_of_parts

# All intermediate files live in one private directory that is removed at the end.
tempdir=$(mktemp -d /tmp/compdic.XXXXXX) || exit 1

# mktemp replaces the deprecated debianutils 'tempfile'. The 'alphabet.' prefix
# cannot collide with the 'part.*' files created by split later on.
alphabet=$(mktemp "$tempdir/alphabet.XXXXXX") || exit 1
# Symbol table handed to fstcompile through --isymbols: one "<symbol> <label>"
# pair per line, with <eps> as label 0.
# NOTE(review): the non-ASCII symbols are presumably raw ISO-8859-2 bytes (the
# usage text says the word list is iso-8859-2 encoded), so they may display as
# unrelated glyphs in other encodings -- confirm, and do not re-encode this file.
cat <<EOF > $alphabet
<eps> 0
a 1
A 2
ä 3
± 4
¡ 5
b 6
B 7
c 8
C 9
æ 10
Æ 11
d 12
D 13
e 14
E 15
é 16
ê 17
Ê 18
f 19
F 20
g 21
G 22
h 23
H 24
i 25
I 26
j 27
J 28
k 29
K 30
l 31
L 32
³ 33
£ 34
m 35
M 36
n 37
N 38
ñ 39
Ñ 40
o 41
O 42
ö 43
ó 44
Ó 45
p 46
P 47
q 48
Q 49
r 50
R 51
s 52
S 53
¶ 54
¦ 55
t 56
T 57
u 58
U 59
ü 60
v 61
V 62
w 63
W 64
x 65
X 66
y 67
Y 68
z 69
Z 70
¼ 71
¬ 72
¿ 73
¯ 74
0 75
1 76
2 77
3 78
4 79
5 80
6 81
7 82
8 83
9 84
_ 85
- 86
? 87
! 88
~ 89
; 90
, 91
/ 92
* 93
+ 94
EOF
# Split the source into roughly $no_of_parts chunks of equal line count.
no_of_lines=$(( ($(wc -l < "$source") / $no_of_parts) + 1 ))
split -l "$no_of_lines" "$source" "$tempdir/part."

# The accumulator starts as the automaton compiled from empty input; every
# part is then merged into it by union.
automaton=$tempdir/output.fst
fstcompile --acceptor --isymbols="$alphabet" < /dev/null > "$automaton"

n=0

for f in "$tempdir"/part.*
do
    # mktemp replaces the deprecated debianutils 'tempfile'; the 'tmp.' prefix
    # keeps these names outside the 'part.*' glob above.
    temp1=$(mktemp "$tempdir/tmp.XXXXXX")
    temp2=$(mktemp "$tempdir/tmp.XXXXXX")
    temp3=$(mktemp "$tempdir/tmp.XXXXXX")

    n=$(( $n + 1 ))
    echo processing part $n

    # Compile this part on its own into a minimized automaton ($temp2) ...
    lst2fstext < "$f" |\
    fstcompile --acceptor --isymbols="$alphabet" |\
    fstrmepsilon |\
    fstdeterminize > "$temp1"
    fstminimize "$temp1" "$temp2"

    # ... then union it into the accumulator and minimize the result again.
    fstunion "$automaton" "$temp2" | fstrmepsilon | fstdeterminize > "$temp3"
    fstminimize "$temp3" "$automaton"

    # Per-part intermediates are no longer needed; drop them right away so
    # that they do not pile up for sources with many parts.
    rm -f "$temp1" "$temp2" "$temp3"
done

echo generating binary automaton file ...

fsttopsort < "$automaton" > "$fst"

rm -r "$tempdir"

View File

@ -1,4 +1,4 @@
#!/bin/bash
if [ $# -lt 2 ] if [ $# -lt 2 ]
then then

48
src/compdic/compdic-update Executable file
View File

@ -0,0 +1,48 @@
#! /bin/bash
# compdic-update: apply diff files to an installed dictionary.
#
# Usage: compdic-update [-d <dictionary-home-dir>] <dictionary-name> <difference> <difference> ...
#
# For the dictionary <dictionary-name> located in the dictionary home directory
# this runs, in order:
#   compdic-update-fst   on <name>.fst
#   compdic-fst-to-bin   to turn <name>.fst into <name>.bin
#   compdic-update-cats  on <name>.cats

# NOTE(review): presumably this file provides the default for dictionary_home,
# which -d overrides -- confirm against the installed configuration.
. /etc/utt/compdic.conf

# Parse leading options only. The loop stops at the first argument that is not
# an option, so the dictionary name is never consumed, no matter how many
# difference files follow it.
while [ $# -gt 0 ]
do
    case $1
    in
        -d)
            # A trailing -d without a value falls through to the usage message.
            [ $# -ge 2 ] || break
            dictionary_home=$2
            shift 2
            ;;
        -*)
            echo "The arguments to use are"
            echo "-d <dictionary-home-dir>"
            shift 1
            ;;
        *)
            break
            ;;
    esac
done

if [ $# -lt 2 ]
then
    echo "Usage:"
    echo "        compdic-update [-d <dictionary-home-dir>] <dictionary-name> <difference> <difference> ..."
    echo "where"
    echo "    <dictionary-home-dir> - dictionary home directory"
    echo "    <dictionary-name> - dictionary name"
    echo "    <difference> - diff format file containing paths to be added/removed from dictionary"
    exit 0
fi

# Without a home directory the paths below would point at the filesystem root.
if [ -z "$dictionary_home" ]
then
    echo "compdic-update: dictionary home directory is not set (use -d)" >&2
    exit 1
fi

fst=$dictionary_home/$1.fst
cats=$dictionary_home/$1.cats
bin=$dictionary_home/$1.bin

shift 1

# From here on "$@" holds the difference files; passing it quoted keeps file
# names with spaces intact. Each step must succeed before the next one runs,
# so that the .fst, .bin and .cats files are not left out of step.
echo running "compdic-update-fst $fst $*"
compdic-update-fst "$fst" "$@" || exit 1

echo running "compdic-fst-to-bin $fst $bin"
compdic-fst-to-bin "$fst" "$bin" || exit 1

echo running "compdic-update-cats $cats $*"
compdic-update-cats "$cats" "$@" || exit 1

44
src/compdic/compdic-update-cats Executable file
View File

@ -0,0 +1,44 @@
#! /bin/bash
# compdic-update-cats: add the categories introduced by diff files to a
# category list.
#
# Usage: compdic-update-cats <catfile> <difference> <difference> ...
#
# From every line of a difference file that starts with '>' the marker is
# stripped, everything from the first whitespace on is dropped, and the second
# comma-separated field is taken and passed through 'canonize'. The collected
# values are merged into <catfile>; the previous version is kept as <catfile>~.

if [ $# -lt 2 ]
then
    echo "Usage:"
    echo "        compdic-update-cats <catfile> <difference> <difference> ..."
    echo "where"
    echo "    <catfile> - file containing the list morphosyntactic categories"
    echo "    <difference> - diff-format file containing lines to be added to/removed from dictionary"
    exit 0
fi

tempdir=$(mktemp -d /tmp/compdic.XXXXXX) || exit 1

cats=$1
shift

catplus=$tempdir/plus.cat
cattmp=$tempdir/tmp.cat

touch "$catplus"

# Only added ('>') lines are collected. Categories of removed ('<') lines are
# deliberately never deleted from the list.
# NOTE(review): presumably because other dictionary entries may still use a
# category that one removed entry carried -- confirm.
while (($#))
do
    echo processing $1 ...

    # grep replaces the obsolescent 'egrep'; the pattern needs no extended syntax.
    grep '^>' "$1" | sed -r 's/^> *//;s/[[:space:]].*$//' | cut -d ',' -f 2 | sort -u | canonize >> "$catplus"

    shift
done

echo updating $cats ...

# Merge with the existing list, if there is one, and keep it as a backup.
if [ -e "$cats" ]
then
    sort -u "$cats" "$catplus" > "$cattmp"
    mv "$cats" "$cats~"
else
    sort -u "$catplus" > "$cattmp"
fi

mv "$cattmp" "$cats"
chmod a+r "$cats"

rm -r "$tempdir"

67
src/compdic/compdic-update-fst Executable file
View File

@ -0,0 +1,67 @@
#! /bin/bash
# compdic-update-fst: apply diff files to a dictionary automaton.
#
# Usage: compdic-update-fst <dictionary> <difference> <difference> ...
#
# Lines of the difference files starting with '<' are compiled into an
# automaton that is subtracted from <dictionary>; lines starting with '>' are
# compiled into an automaton that is united with it. The result is
# epsilon-removed, determinized and minimized, and replaces <dictionary>; the
# previous version is kept as <dictionary>~.

if [ $# -lt 2 ]
then
    echo "Usage:"
    echo "        compdic-update-fst <dictionary> <difference> <difference> ..."
    echo "where"
    echo "    <dictionary> - fst format automaton"
    echo "    <difference> - diff format file containing paths to be added/removed from dictionary"
    exit 0
fi

tempdir=$(mktemp -d /tmp/compdic.XXXXXX) || exit 1

dict=$1
shift

dicplus=$tempdir/plus.dic
fstplus=$tempdir/plus.fst
dicminus=$tempdir/minus.dic
fstminus=$tempdir/minus.fst
newdict=$tempdir/new.fst

touch "$dicplus"
touch "$dicminus"

# Collect added ('>') and removed ('<') entries from all difference files:
# strip the marker, drop everything from the first whitespace on, canonize.
while (($#))
do
    echo processing $1 ...

    # grep replaces the obsolescent 'egrep'; the patterns need no extended syntax.
    grep '^>' "$1" | sed -r 's/^> *//;s/[[:space:]].*$//' | canonize >> "$dicplus"
    grep '^<' "$1" | sed -r 's/^< *//;s/[[:space:]].*$//' | canonize >> "$dicminus"

    shift
done

echo updating $dict ...

# $current always names the automaton holding the result so far.
current=$dict

if [ "$(wc -l < "$dicminus")" -gt 0 ]
then
    tmpfst1=$tempdir/tmp1.fst
    echo "running compdic-dic-to-fst $dicminus $fstminus"
    compdic-dic-to-fst "$dicminus" "$fstminus"
    echo "running fstdifference $current $fstminus > $tmpfst1"
    fstdifference "$current" "$fstminus" > "$tmpfst1"
    current=$tmpfst1
fi

if [ "$(wc -l < "$dicplus")" -gt 0 ]
then
    tmpfst2=$tempdir/tmp2.fst
    echo "running compdic-dic-to-fst $dicplus $fstplus"
    compdic-dic-to-fst "$dicplus" "$fstplus"
    echo "running fstunion $current $fstplus > $tmpfst2"
    fstunion "$current" "$fstplus" > "$tmpfst2"
    current=$tmpfst2
fi

# Build the final automaton BEFORE touching the dictionary. When the
# difference files contain no usable lines, $current is the dictionary itself;
# renaming it first would leave nothing to read and destroy the dictionary.
echo "fstrmepsilon < ${current} | fstdeterminize | fstminimize > ${newdict}"
fstrmepsilon < "$current" | fstdeterminize | fstminimize > "$newdict"

# An empty result means that some stage of the pipeline failed; leave the
# existing dictionary alone in that case.
if [ ! -s "$newdict" ]
then
    echo "compdic-update-fst: building the updated automaton failed, $dict left unchanged" >&2
    rm -r "$tempdir"
    exit 1
fi

echo "running mv ${dict} ${dict}~"
mv "$dict" "$dict~"

echo "running mv ${newdict} ${dict}"
mv "$newdict" "$dict"
chmod a+r "$dict"

rm -r "$tempdir"