dodany komponent compdic - kompilator slownikow dla lem, cor, kor, gue

(compdic zastepuje compiledic)
This commit is contained in:
tom 2011-12-14 17:21:24 +01:00
parent a6e708f37f
commit 93afab8cc2
5 changed files with 278 additions and 0 deletions

View File

@ -0,0 +1,21 @@
# Build rules for the compdic dictionary-compiler component.
# Compiler flags used by the aut2fsa rule: 32-bit, optimized build.
CFLAG1 = -m32 -Wno-deprecated -O3 -fpermissive
# Static-link variant of the flags. No rule visible here references it;
# only the commented-out command under aut2fsa uses an equivalent flag set.
CFLAG_ST = -Wno-deprecated -O3 -fpermissive -static
# Default target: everything this directory provides.
all: compdic aut2fsa
# compdic has no prerequisites and no recipe, so nothing is built for it
# (presumably because it is a ready-to-run script -- TODO confirm).
compdic:
# Compile the aut2fsa converter from its single C++ source file.
aut2fsa: aut2fsa.cc
#g++ -m32 -Wno-deprecated -O3 -fpermissive -static -o aut2fsa aut2fsa.cc
g++ $(CFLAG1) -o aut2fsa aut2fsa.cc
# Copy the tools into UTT_BIN_DIR; does nothing when that variable is not defined.
copy:
ifdef UTT_BIN_DIR
cp compdic fsm2aut aut2fsa lst2fstext ${UTT_BIN_DIR}
endif
# Remove the compiled aut2fsa binary.
clean:
rm aut2fsa

View File

@ -0,0 +1,18 @@
#include <iostream>
#include <stdlib.h>
#include "../lib/tfti.h"
#include <fstream>
using namespace std;
int main()
{
TFTiv<char,char> a;
a.read();
a.save();
return 0;
}

175
_old/app/src/compdic/compdic Executable file
View File

@ -0,0 +1,175 @@
# compdic - compile a word list into an automaton file.
#
# Usage: compdic [-p <parts>] <wordlist> <automaton>
#
# The word list is processed in <parts> chunks; when -p is not given (or is 0)
# the number of chunks is derived from the word list size, one chunk per
# 75000 lines (rounded up by the "+ 1").
no_of_parts=0

# Consume leading options. The last two arguments are always the positional
# <wordlist> and <automaton>, hence the "-gt 2" bound.
while [ $# -gt 2 ]
do
    case $1 in
        -p)
            no_of_parts=$2
            shift 2
            ;;
        *)
            echo "The arguments to use are"
            echo "-p: number of parts"
            shift 1
            ;;
    esac
done

if [ $# -lt 2 ]
then
    echo "Usage:"
    echo " compdic [-p <parts>] <wordlist> <automaton>"
    echo "where"
    echo " <wordlist> - file containig a list of words, one per line, iso-8859-2 encoded"
    echo " <automaton> - a file to which the compiled automaton (cor/kor format) shoul be written"
    exit 0
fi

# Reject a non-numeric -p value up front; otherwise it only surfaces later as
# an obscure test/arithmetic error.
case $no_of_parts in
    ''|*[!0-9]*)
        echo "compdic: -p expects a non-negative integer, got '$no_of_parts'" >&2
        exit 1
        ;;
esac

# Fail early if the word list cannot be read, instead of silently compiling
# an empty automaton.
if [ ! -r "$1" ]
then
    echo "compdic: cannot read word list '$1'" >&2
    exit 1
fi

if [ "$no_of_parts" -eq 0 ]
then
    no_of_parts=$(( $(wc -l < "$1") / 75000 + 1 ))
fi
echo number of parts: $no_of_parts

# Private scratch directory for all intermediate files. The EXIT trap removes
# it even when the script stops early; -f keeps the trap quiet if the
# directory has already been removed by the time the script ends.
tempdir=`mktemp -d /tmp/compdic.XXXXXX` || exit 1
trap 'rm -rf "$tempdir"' EXIT

# File that receives the symbol table written by the here-document below.
# mktemp replaces the deprecated, Debian-specific tempfile(1).
alphabet=`mktemp "$tempdir/alphabet.XXXXXX"` || exit 1
# Write the symbol table that is later passed to fstcompile via --isymbols.
# Each line maps one symbol to its integer label; <eps> is label 0.
# NOTE(review): the non-ASCII entries are presumably ISO-8859-2 letters (the
# encoding named in the usage text) -- keep these bytes unchanged when editing
# and confirm the encoding this file is stored in.
cat <<EOF > $alphabet
<eps> 0
a 1
A 2
ä 3
± 4
¡ 5
b 6
B 7
c 8
C 9
æ 10
Æ 11
d 12
D 13
e 14
E 15
é 16
ê 17
Ê 18
f 19
F 20
g 21
G 22
h 23
H 24
i 25
I 26
j 27
J 28
k 29
K 30
l 31
L 32
³ 33
£ 34
m 35
M 36
n 37
N 38
ñ 39
Ñ 40
o 41
O 42
ö 43
ó 44
Ó 45
p 46
P 47
q 48
Q 49
r 50
R 51
s 52
S 53
¶ 54
¦ 55
t 56
T 57
u 58
U 59
ü 60
v 61
V 62
w 63
W 64
x 65
X 66
y 67
Y 68
z 69
Z 70
¼ 71
¬ 72
¿ 73
¯ 74
0 75
1 76
2 77
3 78
4 79
5 80
6 81
7 82
8 83
9 84
_ 85
- 86
? 87
! 88
~ 89
; 90
, 91
/ 92
* 93
+ 94
EOF
# Split the word list into $no_of_parts chunks of (almost) equal line count.
no_of_lines=$(( $(wc -l < "$1") / $no_of_parts + 1 ))
split -l "$no_of_lines" "$1" "$tempdir/part."

# Start from an automaton compiled from empty input; each chunk is unioned
# into it in turn.
automaton=$tempdir/output.fst
fstcompile --acceptor --isymbols="$alphabet" --keep_isymbols < /dev/null > "$automaton"

n=0
for f in "$tempdir"/part.*
do
    # With an empty word list split creates no files and the glob stays
    # unexpanded; skip that literal pattern instead of feeding it to cat.
    [ -e "$f" ] || continue

    # mktemp replaces the deprecated, Debian-specific tempfile(1).
    temp1=`mktemp "$tempdir/tmp.XXXXXX"`
    temp2=`mktemp "$tempdir/tmp.XXXXXX"`
    temp3=`mktemp "$tempdir/tmp.XXXXXX"`

    n=$(( $n + 1 ))
    echo processing part $n

    # Chunk -> text FST -> compiled, epsilon-free, deterministic automaton.
    lst2fstext < "$f" |
        fstcompile --acceptor --isymbols="$alphabet" --keep_isymbols |
        fstrmepsilon |
        fstdeterminize > "$temp1"
    fstminimize "$temp1" "$temp2"

    # Merge the chunk into the accumulated automaton and minimize again so
    # the intermediate result stays small.
    fstunion "$automaton" "$temp2" | fstrmepsilon | fstdeterminize > "$temp3"
    fstminimize "$temp3" "$automaton"

    # The per-chunk intermediates are no longer needed; drop them now rather
    # than letting them pile up until the end of the run.
    rm -f "$temp1" "$temp2" "$temp3"
done

# Convert the final automaton to its text form and on to the output format.
fsttopsort < "$automaton" | fstprint --acceptor | fsm2aut | aut2fsa > "$2"

rm -r "$tempdir"

44
_old/app/src/compdic/fsm2aut Executable file
View File

@ -0,0 +1,44 @@
#!/usr/bin/perl
# fsm2aut - convert a textual acceptor listing into the "aut" text format.
#
# Input lines are either
#   <source> <target> <symbol>   a transition (symbol is a single character)
#   <state>                      a final state
# Any other line aborts the conversion.
#
# Output is a header line "<states> <transitions> char void" followed by one
# line per state: the state number, "+" for a final state or "-" otherwise,
# and then a "<symbol> <target>" pair for every outgoing transition.
use strict;
use warnings;

# Read the listing from $in and write the converted automaton to $out.
#   $in  - filehandle to read from
#   $out - filehandle to write to
# Dies on a line that is neither a transition nor a final-state line.
sub fsm2aut {
    my ($in, $out) = @_;

    my @states;             # per state: flat list (symbol, target, symbol, target, ...)
    my @final;              # $final[$state] is true for final states
    my $transitions = 0;

    while (my $line = <$in>) {
        if ($line =~ /^\s*([0-9]+)\s+([0-9]+)\s+(.)(\s*)?$/) {
            my ($source, $target, $symbol) = ($1, $2, $3);
            push @{ $states[$source] }, ($symbol, $target);
            # Make sure states that only ever appear as a target are listed too.
            $#states = $target if $#states < $target;
            $transitions++;
        }
        elsif ($line =~ /^\s*([0-9]+)\s*$/) {
            my $state = $1;
            $final[$state] = 1;
            $#states = $state if $#states < $state;
        }
        else {
            die "Input error at line $.: $line";
        }
    }

    print {$out} scalar(@states), " ", $transitions, " char void\n";

    # Field width for the state-number column; the formula is kept as it was
    # so the output stays byte-for-byte the same.
    my $width = int(log(@states + 1) / log(10));

    my $index = 0;
    for my $arcs (@states) {
        my $mark = $final[$index] ? "+" : "-";
        printf {$out} "%${width}d %s", $index++, $mark;

        # States without outgoing transitions have no list at all.
        my @pairs = $arcs ? @{$arcs} : ();
        while (@pairs) {
            my $symbol = shift @pairs;
            my $target = shift @pairs;
            print {$out} " $symbol $target";
        }
        print {$out} "\n";
    }
    return;
}

# Run as a filter (files named on the command line, or standard input) when
# executed as a script; stay silent when loaded from other code.
fsm2aut(\*ARGV, \*STDOUT) unless caller;

20
_old/app/src/compdic/lst2fstext Executable file
View File

@ -0,0 +1,20 @@
#!/usr/bin/env perl
# lst2fstext - turn a word list (one word per line) into a textual acceptor.
#
# Every word becomes its own chain of states: an <eps> arc from state 0 to a
# fresh state, one arc per character, and an <eps> arc from the end of the
# chain to the single shared final state 1. The final state is printed last.
use strict;
use warnings;
use locale;

# Read words from $in and write the arc listing to $out.
#   $in  - filehandle to read words from
#   $out - filehandle to write to
sub lst2fstext {
    my ($in, $out) = @_;

    my $final = 1;      # the one final state shared by all words
    my $state = 1;      # highest state number handed out so far

    while (my $word = <$in>) {
        chomp $word;

        $state++;
        print {$out} "0 $state <eps>\n";

        # Iterate over the characters with a plain loop: a truth test on each
        # character would stop at "0" and cut the word short.
        for my $char (split //, $word) {
            # Compute the target state explicitly rather than using the
            # variable and its pre-increment in the same expression.
            my $next = $state + 1;
            print {$out} "$state $next $char\n";
            $state = $next;
        }

        print {$out} "$state $final <eps>\n";
    }

    print {$out} "$final\n";
    return;
}

# Run as a filter (files named on the command line, or standard input) when
# executed as a script; stay silent when loaded from other code.
lst2fstext(\*ARGV, \*STDOUT) unless caller;