git-svn-id: svn://atos.wmid.amu.edu.pl/utt@3 e293616e-ec6a-49c2-aa92-f4a8b91c5d16
This commit is contained in:
parent
75b8bc10d0
commit
f1563c0f02
8
app/conf/Makefile
Normal file
8
app/conf/Makefile
Normal file
@ -0,0 +1,8 @@
|
||||
main:
|
||||
|
||||
copy:
|
||||
ifdef UTT_CONF_DIR
|
||||
cp *.conf ${UTT_CONF_DIR}
|
||||
endif
|
||||
|
||||
clean:
|
7
app/conf/cor.conf
Normal file
7
app/conf/cor.conf
Normal file
@ -0,0 +1,7 @@
|
||||
# Plik konfiguracyjny dla komponentu cor.
|
||||
|
||||
# kazda linia ma postac:
|
||||
# nazwa_parametru [=] wartosc
|
||||
|
||||
# PRZYKLAD: analizuj tylko slowa
|
||||
# p = W
|
8
app/conf/gue.conf
Normal file
8
app/conf/gue.conf
Normal file
@ -0,0 +1,8 @@
|
||||
# Plik konfiguracyjny dla komponentu gue.
|
||||
|
||||
# kazda linia ma postac:
|
||||
# nazwa_parametru [=] wartosc
|
||||
|
||||
# PRZYKLAD: analizuj tylko slowa
|
||||
# p = W
|
||||
|
7
app/conf/lem.conf
Normal file
7
app/conf/lem.conf
Normal file
@ -0,0 +1,7 @@
|
||||
# Plik konfiguracyjny dla komponentu lem.
|
||||
|
||||
# kazda linia ma postac:
|
||||
# nazwa_parametru [=] wartosc
|
||||
|
||||
# PRZYKLAD: analizuj tylko slowa
|
||||
# p = W
|
BIN
nawszelkiwypadek/tools/aut2fsa
Executable file
BIN
nawszelkiwypadek/tools/aut2fsa
Executable file
Binary file not shown.
11
nawszelkiwypadek/tools/cor_dic/makeLabels.pl
Executable file
11
nawszelkiwypadek/tools/cor_dic/makeLabels.pl
Executable file
@ -0,0 +1,11 @@
|
||||
#!/usr/bin/perl

# Prints the symbol-class table for the cor dictionary tools to stdout, one
# class per line: the class name followed by its members (single characters
# or the names of other classes).  prep.pl redirects this output into
# labels.sym before running lexmakelab.

use locale;

print "lcase a ± b c æ d e ê f g h i j k l ³ m n ñ o ó p q r s ¶ t u v w x y z ¼ ¿ é ö ü ä\n";
print "ucase A ¡ B C Æ D E Ê F G H I J K L £ M N Ñ O Ó P Q R S ¦ T U V W X Y Z ¬ ¯\n";
print "letter lcase ucase\n";
print "digit 0 1 2 3 4 5 6 7 8 9\n";
# The newline has to come after "_": with "\n _" the underscore dropped out of
# the "signs" class and was glued to the front of the following "sem" line.
print "signs , . @ \/ \' _\n";
print "sem ~ ; - \\ \n";
print "all letter digit signs sem\n";
|
67
nawszelkiwypadek/tools/cor_dic/prep.pl
Executable file
67
nawszelkiwypadek/tools/cor_dic/prep.pl
Executable file
@ -0,0 +1,67 @@
|
||||
#! /usr/bin/perl
|
||||
|
||||
use locale;
|
||||
use strict;
|
||||
|
||||
my $file = shift;
|
||||
|
||||
if ($file eq "") {
|
||||
print "Podaj nazwê pliku.\n";
|
||||
exit(0);
|
||||
}
|
||||
|
||||
|
||||
`makeLabels.pl > labels.sym`;
|
||||
|
||||
`lexmakelab labels`;
|
||||
|
||||
print "Pobieram informacje ze s³ownika..........................";
|
||||
|
||||
`cut -d \\; -f 1 <$file > temp1`;
|
||||
|
||||
`sort -u < temp1 > temp2`;
|
||||
|
||||
print "OK\n";
|
||||
|
||||
print "Kompilujê automat........................................";
|
||||
|
||||
`lexcomplex -l labels.lab -S labels.scl <temp2 > temp1`;
|
||||
|
||||
print "OK\n";
|
||||
|
||||
print "Usuwam epsilon-przejscia.................................";
|
||||
|
||||
`fsmrmepsilon temp1> temp2`;
|
||||
|
||||
print "OK\n";
|
||||
|
||||
print "Determinizujê automat....................................";
|
||||
|
||||
`fsmdeterminize temp2 > temp1`;
|
||||
|
||||
print "OK\n";
|
||||
|
||||
print "Minimalizujê automat.....................................";
|
||||
|
||||
`fsmminimize temp1> temp2`;
|
||||
|
||||
print "OK\n";
|
||||
|
||||
print "Konwertujê automat do formatu fsa........................";
|
||||
|
||||
`fsmprint -i labels.lab temp2> temp1`;
|
||||
|
||||
`../fsm2aut temp1> temp2`;
|
||||
|
||||
`../aut2fsa < temp2> cor.dic`;
|
||||
|
||||
print "OK\n";
|
||||
|
||||
print "Czyszczê pliki pomocnicze................................";
|
||||
|
||||
`rm temp2`;
|
||||
`rm temp1`;
|
||||
`rm labels.*`;
|
||||
|
||||
print "OK\n";
|
||||
|
47
nawszelkiwypadek/tools/dist/Makefile
vendored
Normal file
47
nawszelkiwypadek/tools/dist/Makefile
vendored
Normal file
@ -0,0 +1,47 @@
|
||||
# main makefile
|
||||
|
||||
BIN=bin
|
||||
SRC=src
|
||||
DIR=$(shell pwd)
|
||||
##############################
|
||||
# CONFIGURATION
|
||||
|
||||
# UTT_BIN_DIR - directory for executables
|
||||
# UTT_SHARE_DIR - directory for data and stuff
|
||||
# UTT_DOC_DIR - directory for documentation
|
||||
|
||||
# example (site installation)
|
||||
# UTT_BIN_DIR=/usr/local/bin
|
||||
# UTT_DOC_DIR=/usr/share/doc/utt
|
||||
|
||||
UTT_BIN_DIR=~/utt/bin
|
||||
UTT_SHARE_DIR=~/.utt
|
||||
UTT_DOC_DIR=~/utt/doc
|
||||
UTT_LIB_DIR=$(UTT_SHARE_DIR)/lib
|
||||
##############################
|
||||
|
||||
install: make_dirs install_dta install_lib #install_doc install_components
|
||||
@echo "Installation completed successfully!"
|
||||
|
||||
install_components:
|
||||
cp -r bin/* $(UTT_BIN_DIR)/
|
||||
|
||||
install_dta:
|
||||
if [ -d data ]; then cp -r data/* $(UTT_SHARE_DIR)/; fi
|
||||
|
||||
install_doc:
|
||||
cp -r doc/* $(UTT_DOC_DIR)/
|
||||
|
||||
install_lib:
|
||||
cp -r lib/* $(UTT_LIB_DIR)/
|
||||
|
||||
make_dirs:
|
||||
#if [ -d $(UTT_BIN_DIR) ]; then true; else mkdir -p $(UTT_BIN_DIR); fi
|
||||
if [ -d $(UTT_SHARE_DIR) ]; then true; else mkdir -p $(UTT_SHARE_DIR); fi
|
||||
if [ -d $(UTT_LIB_DIR) ]; then true; else mkdir -p $(UTT_LIB_DIR); fi
|
||||
#if [ -d $(UTT_DOC_DIR) ]; then true; else mkdir -p $(UTT_DOC_DIR); fi
|
||||
|
||||
uninstall:
|
||||
rm -r $(UTT_SHARE_DIR)
|
||||
#rm -r $(UTT_BIN_DIR)
|
||||
#rm -r $(UTT_DOC_DIR)
|
5
nawszelkiwypadek/tools/dist/README
vendored
Normal file
5
nawszelkiwypadek/tools/dist/README
vendored
Normal file
@ -0,0 +1,5 @@
|
||||
Instalacja:
|
||||
|
||||
1) Przeniesc katalog .utt do swojego katalogu domowego.
|
||||
2) Dopisac do $PATH sciezke do katalogu bin.
|
||||
|
44
nawszelkiwypadek/tools/fsm2aut
Executable file
44
nawszelkiwypadek/tools/fsm2aut
Executable file
@ -0,0 +1,44 @@
|
||||
#!/usr/bin/perl

# Converts a textual automaton listing into the "aut" text format.
#
# Input (files named on the command line, or stdin), one item per line:
#   "<from> <to> <char>"   a transition labelled with a single character
#   "<state>"              marks <state> as final
# Any other line aborts the run with "Input error.".
#
# Output: a header "<number of states> <number of transitions> char void",
# then one line per state: the state number, "+" for a final state or "-"
# otherwise, followed by its " <char> <target>" pairs in input order.

my @arcs_of;            # $arcs_of[$state] = [char, target, char, target, ...]
my @is_final;           # $is_final[$state] is 1 for final states
my $arc_total = 0;      # number of transitions read

while (my $line = <>) {
    if ($line =~ /^\s*([0-9]+)\s+([0-9]+)\s+(.)(\s*)?$/) {
        my ($from, $to, $char) = ($1, $2, $3);
        push @{ $arcs_of[$from] }, $char, $to;
        # Grow the state table so the target state gets an output line too.
        $#arcs_of = $to if $#arcs_of < $to;
        $arc_total++;
        next;
    }

    if ($line =~ /^\s*([0-9]+)\s*$/) {
        my $state = $1;
        $is_final[$state] = 1;
        $#arcs_of = $state if $#arcs_of < $state;
        next;
    }

    die("Input error.");
}

print scalar(@arcs_of), " ", $arc_total, " char void\n";

# Field width for the state-number column, derived from the state count.
my $width = int(log(@arcs_of + 1) / log(10));

for my $state (0 .. $#arcs_of) {
    my $mark = $is_final[$state] ? "+" : "-";
    printf "%${width}d %s", $state, $mark;

    # States that only occur as targets have no transition list of their own.
    my @pairs = $arcs_of[$state] ? @{ $arcs_of[$state] } : ();
    for (my $k = 0; $k < @pairs; $k += 2) {
        print " $pairs[$k] $pairs[$k + 1]";
    }
    print "\n";
}
|
||||
|
||||
|
28
nawszelkiwypadek/tools/gue_dic/README
Normal file
28
nawszelkiwypadek/tools/gue_dic/README
Normal file
@ -0,0 +1,28 @@
|
||||
How to prepare gue dictionary?
|
||||
|
||||
1. Preparing input file.
|
||||
|
||||
Lines in input file should look like this:
|
||||
|
||||
prefix*suffix~weight;description
|
||||
|
||||
where:
|
||||
prefix - is the prefix of a word
|
||||
suffix - is the suffix of a word
|
||||
weight - is the weight saying how important the information in this line is
|
||||
description - is a description of the word
|
||||
(in any format - description will be in output of gue)
|
||||
|
||||
|
||||
2. Compiling a dictionary.
|
||||
|
||||
Let's say we have input file named "dict.in".
|
||||
Commands compiling dictionary:
|
||||
|
||||
prep_user_dict.pl < dict.in > dict.temp
|
||||
compile_user_dict.pl dict.temp
|
||||
|
||||
These commands should create a file called "gue.bin", which is the dictionary
|
||||
for gue component.
|
||||
|
||||
Good luck.
|
110
nawszelkiwypadek/tools/gue_dic/attr.pm
Normal file
110
nawszelkiwypadek/tools/gue_dic/attr.pm
Normal file
@ -0,0 +1,110 @@
|
||||
package attr;
|
||||
|
||||
use locale;
|
||||
use strict;
|
||||
|
||||
|
||||
# match
# args: two parsed descriptions, each an array [<cat>, <avs>] as produced by
#       parse (because of the prototype, callers pass the arrays themselves)
# returns: 1 if the descriptions are compatible, 0 otherwise
#
# The categories must be equal.  For every attribute of the first description
# that is also present in the second, the two value sets must share at least
# one value.  Attributes missing from the second description do not
# constrain the result.
sub match(\@\@)
{
    my ($cat1, $avs1) = @{ shift @_ };
    my ($cat2, $avs2) = @{ shift @_ };

    return 0 if $cat1 ne $cat2;

    ATTR:
    for my $attr (keys %$avs1)
    {
        # Attribute not present on the other side: nothing to compare.
        next ATTR unless $avs2->{$attr};

        for my $val (keys %{ $avs1->{$attr} })
        {
            next ATTR if $avs2->{$attr}->{$val};
        }

        # Both sides specify the attribute but have no value in common.
        return 0;
    }

    return 1;
}
|
||||
|
||||
# parse
# arg: a description string "<cat>/<attr><values><attr><values>..."
# returns: a reference to an array [<cat>, <avs>], where <avs> is a reference
#          to a hash of attribute => hash of values (value => 1 pairs), e.g.
#
# [
#   'ADJ',
#   {
#     'KOLEDZY' => {
#                    '<alojzy>' => 1,
#                    '<karol>' => 1,
#                    '<jan>' => 1
#                  },
#     'C' => {
#              'p' => 1,
#              'a' => 1,
#              'i' => 1
#            },
#     'N' => {
#              'p' => 1
#            }
#   }
# ];
#
# An attribute name is a run of upper-case letters.  Each value is either a
# single character (lower-case letter, +, ?, !, * or -) or a <...> group.
# An attribute whose values include '*' is left out of <avs> altogether, and
# a one-letter group such as <a> is stored as plain 'a'.

sub parse ($)
{
    my ($dstr) = @_;
    my $avs = {};
    my ($cat, $attrlist) = split '/', $dstr;

    ATTR:
    while ( $attrlist =~ /([[:upper:]]+)((?:[[:lower:]+?!*-]|<[^>\n]+>)+)/g )
    {
        my ($attrstr, $valstr) = ($1, $2);
        my %vals;

        # An explicit capture is used instead of $&.
        while ($valstr =~ /([[:lower:]+?!*-]|<[^>\n]+>)/g)
        {
            my $val = $1;
            next ATTR if $val eq '*';
            $val =~ s/^<([[:lower:]])>$/$1/;
            $vals{$val} = 1;
        }

        # %vals is a fresh lexical on every iteration, so each stored
        # reference points at its own hash and keeps it alive.
        $avs->{$attrstr} = \%vals;
    }

    return [$cat, $avs];
}
|
||||
|
||||
# unparse
# arg: a structure shaped like the value of parse, i.e. [<cat>, <avs>]
#      (because of the prototype, callers pass the array itself)
# returns: the description as a string - the category alone when there are
#          no attributes, otherwise "<cat>/" followed by every attribute in
#          sorted order, each directly followed by its sorted values

sub unparse (\@)
{
    my ($cat, $avs) = @{ shift @_ };

    my @names = sort keys %$avs;
    return $cat unless @names;

    my @pieces = map { $_ . join('', sort keys %{ $avs->{$_} }) } @names;
    return $cat . '/' . join('', @pieces);
}
|
||||
|
||||
|
||||
# canonize
# arg: a description string
# returns: the same description in canonical spelling, obtained by running
#          it through parse and then unparse (which sorts attributes and
#          values)
sub canonize ($)
{
    # $_[0] is the scalar argument; the original @_[0] was a one-element
    # array slice.
    return unparse(@{ parse($_[0]) });
}
|
||||
|
||||
|
||||
1;
|
9
nawszelkiwypadek/tools/gue_dic/canon.pl
Executable file
9
nawszelkiwypadek/tools/gue_dic/canon.pl
Executable file
@ -0,0 +1,9 @@
|
||||
#!/usr/bin/perl
|
||||
|
||||
use locale;
|
||||
use attr;
|
||||
|
||||
while (<>) {
|
||||
s/,(.*)$/','.attr::canonize($1)/e;
|
||||
print;
|
||||
}
|
197
nawszelkiwypadek/tools/gue_dic/compile_user_dict.pl
Executable file
197
nawszelkiwypadek/tools/gue_dic/compile_user_dict.pl
Executable file
@ -0,0 +1,197 @@
|
||||
#! /usr/bin/env perl
|
||||
|
||||
use locale;
|
||||
#use strict;
|
||||
|
||||
#
|
||||
##################################################
|
||||
$linesPerFile = 20000;
|
||||
|
||||
# The dictionary file is a required argument; without it, print the usage
# line (naming this script, not prep_user_dict.pl) and stop.
if (@ARGV < 1) {
    print "usage: compile_user_dict.pl dictionary_file\n";
    exit;
}
|
||||
|
||||
$file = shift; # @ARGV;
|
||||
|
||||
# Przygotowanie etykiet
|
||||
|
||||
`makeLabels.pl > labels.sym`;
|
||||
|
||||
`lexmakelab labels`;
|
||||
|
||||
# Analiza pliku s³ownika
|
||||
|
||||
|
||||
print "Kanonizujê opisy.........................................";
|
||||
|
||||
`canon.pl <$file >temp1`;
|
||||
|
||||
print "OK\n";
|
||||
|
||||
print "Sortujê plik.............................................";
|
||||
|
||||
`sort -t \\~ -k1,1 -k2,2nr <temp1 > temp2`;
|
||||
|
||||
print "OK\n";
|
||||
|
||||
print "Minimalizujê plik s³ownika...............................";
|
||||
|
||||
#`rmDup.pl < temp2 > temp1`;
|
||||
#`rmDup2.pl < temp1 > temp2`;
|
||||
|
||||
`cp temp1 temp2`;
|
||||
|
||||
`rm temp1`;
|
||||
|
||||
print "OK\n";
|
||||
|
||||
print "Czyszczê pliki...........................................";
|
||||
|
||||
`sed -r "s/([[:punct:]])/[\\1]/g" < temp2 > temp1`;
|
||||
|
||||
`cp temp1 temp2`;
|
||||
`rm temp1`;
|
||||
|
||||
print "OK\n";
|
||||
|
||||
#dzielimy plik na wiele czê¶ci, uruchamiamy lexcomplex dla ka¿dej
|
||||
#czê¶ci osobno, nastêpnie ³±czymy to za pomoc± programu fsmunion
|
||||
|
||||
print "Dzielê s³ownik na mniejsze czê¶ci........................";
|
||||
|
||||
# Split ./temp2 into chunk files LemTEMP/slo_0, LemTEMP/slo_1, ... so that
# lexcomplex can be run on each chunk separately.  A new chunk is started
# whenever the line counter reaches $linesPerFile.  Afterwards $fileCount
# holds the index of the last chunk written.
open(my $in, "<", "./temp2") or die "Cannot open ./temp2: $!\n";

$lineCount = 0;
$fileCount = 0;

`mkdir LemTEMP`;

open(my $chunk, ">", "LemTEMP/slo_0") or die "Cannot create LemTEMP/slo_0: $!\n";

while (<$in>) {

    if (++$lineCount >= $linesPerFile) {
        $fileCount++;
        $lineCount = 0;

        close($chunk);
        open($chunk, ">", "LemTEMP/slo_".$fileCount)
            or die "Cannot create LemTEMP/slo_$fileCount: $!\n";
    }

    print {$chunk} $_;
}

# Close the handles explicitly so the last chunk is complete on disk before
# the external tools read it.
close($chunk);
close($in);
|
||||
|
||||
print "OK\n";
|
||||
|
||||
print "Tworzê automaty po¶rednie";
|
||||
|
||||
#32 kropki, fileCount plikow
|
||||
$filesPerDot = $fileCount/32;
|
||||
$files=$filesPerDot;
|
||||
$dots=0;
|
||||
|
||||
for ($i=0; $i<=$fileCount; $i++) {
|
||||
|
||||
if ($files >= $filesPerDot) {
|
||||
$files = 0;
|
||||
print ".";
|
||||
$dots++;
|
||||
}
|
||||
$files++;
|
||||
|
||||
$command = "lexcomplex -l labels.lab -S labels.scl < LemTEMP/slo_".$i." > LemTEMP/slownik_".$i.".fsm";
|
||||
|
||||
`$command`;
|
||||
|
||||
}
|
||||
if ($dots < 32) {
|
||||
for ($i=0; $i<32 - $dots; $i++) {
|
||||
print ".";
|
||||
}
|
||||
}
|
||||
|
||||
print "OK\n";
|
||||
|
||||
`rm LemTEMP/slo_*`;
|
||||
|
||||
print "Tworzê automat koñcowy";
|
||||
|
||||
# 35 progress dots for this stage.
#
# A pairwise fsmunion loop used to sit here, but it could never run: its
# bound was the undefined $filecount (the counter is spelled $fileCount), and
# the command it built named LemTEMP/slownik_N without the ".fsm" suffix.
# The final automaton has always come from the single fsmunion call below,
# so only the progress output of that dead code is kept.
print "." x 35;

# Union of every intermediate automaton.  The slo_* chunk files were removed
# above, so only the slownik_*.fsm files are left in LemTEMP.
`fsmunion LemTEMP/* > slownik1.fsm`;
|
||||
|
||||
print "OK\n";
|
||||
|
||||
print "Usuwam epsilon-przejscia.................................";
|
||||
|
||||
`fsmrmepsilon slownik1.fsm > slownik2.fsm`;
|
||||
|
||||
`rm slownik1.fsm`;
|
||||
|
||||
print "OK\n";
|
||||
|
||||
print "Determinizujê automat....................................";
|
||||
|
||||
`fsmdeterminize slownik2.fsm > slownik1.fsm`;
|
||||
|
||||
`rm slownik2.fsm`;
|
||||
|
||||
print "OK\n";
|
||||
|
||||
print "Minimalizujê automat.....................................";
|
||||
|
||||
`fsmminimize slownik1.fsm > slownik.fsm`;
|
||||
|
||||
`rm slownik1.fsm`;
|
||||
|
||||
print "OK\n";
|
||||
|
||||
print "Konwertujê automat do formatu fsa........................";
|
||||
|
||||
`fsmprint -i labels.lab slownik.fsm > slownik.txt`;
|
||||
|
||||
`../fsm2aut slownik.txt > slownik.aut`;
|
||||
|
||||
`../aut2fsa < slownik.aut > gue.bin`;
|
||||
|
||||
print "OK\n";
|
||||
|
||||
print "Czyszczê pliki pomocnicze................................";
|
||||
|
||||
`rm LemTEMP/*`;
|
||||
`rmdir LemTEMP`;
|
||||
`rm temp2`;
|
||||
`rm slownik.fsm`;
|
||||
`rm slownik.txt`;
|
||||
`rm slownik.aut`;
|
||||
|
||||
print "OK\n";
|
86
nawszelkiwypadek/tools/gue_dic/count_prefs.pl
Executable file
86
nawszelkiwypadek/tools/gue_dic/count_prefs.pl
Executable file
@ -0,0 +1,86 @@
|
||||
#! /usr/bin/perl
|
||||
|
||||
use locale;
|
||||
use strict;
|
||||
|
||||
my @prefs;
|
||||
|
||||
# addPref
# args: $pref - a prefix, $desc - a description
# Counts one occurrence of $desc under $pref in the file-level @prefs list.
# Each element of @prefs is a pair [prefix, {description => count}].  The
# first element whose stored prefix starts with $pref is updated; if there
# is none, a new pair is appended.
sub addPref {

    my $pref = shift;
    my $desc = shift;

    for my $entry (@prefs) {
        # Plain element access: the original ${@{$prefs[$i]}}[0] used an
        # array as a reference, which current Perl rejects.
        if ($entry->[0] =~ /^$pref/) {
            $entry->[1]{$desc}++;
            return;
        }
    }

    push(@prefs, [ $pref, { $desc => 1 } ]);
}
|
||||
|
||||
# printPrefs
# Writes the contents of the file-level @prefs list to stdout.  For every
# [prefix, {description => count}] pair it prints one tab-separated line per
# description: prefix, description, that description's count, and the sum of
# all counts stored for the prefix.
sub printPrefs {

    for my $entry (@prefs) {
        my $prefix = $entry->[0];
        my %counts = %{ $entry->[1] };
        my @descs  = keys(%counts);

        my $total = 0;
        $total += $counts{$_} for @descs;

        for my $desc (@descs) {
            print "$prefix\t$desc\t$counts{$desc}\t$total\n";
        }
    }
}
|
||||
|
||||
if (@ARGV < 2) {
|
||||
print "USAGE: count_prefs.pl MIN_PREF_LEN MAX_PREF_LEN\n";
|
||||
exit;
|
||||
}
|
||||
|
||||
my $MIN = shift;
|
||||
my $MAX = shift;
|
||||
my $PART = shift;
|
||||
|
||||
if ($MIN > $MAX) {
|
||||
print "MIN_PREF_LEN > MAX_PREF_LEN! ($MIN > $MAX)\n";
|
||||
exit;
|
||||
}
|
||||
my $begin = "";
|
||||
while (<>) {
|
||||
my $len = $MIN;
|
||||
$_ =~ /(\w+);(.*)$/;
|
||||
my $pref = $1;
|
||||
my $desc = $2;
|
||||
if ($begin eq "") {
|
||||
$begin = substr($pref, 0, $MIN);
|
||||
}
|
||||
if ($pref !~ /^$begin.*/) {
|
||||
printPrefs();
|
||||
undef(@prefs);
|
||||
$begin = "";
|
||||
}
|
||||
while ($len <= $MAX) {
|
||||
addPref(substr($pref, 0, $len++), $desc);
|
||||
}
|
||||
}
|
||||
|
||||
printPrefs();
|
60
nawszelkiwypadek/tools/gue_dic/cut_prefs.pl
Executable file
60
nawszelkiwypadek/tools/gue_dic/cut_prefs.pl
Executable file
@ -0,0 +1,60 @@
|
||||
#! /usr/bin/perl
|
||||
|
||||
use locale;
|
||||
use strict;
|
||||
|
||||
if (@ARGV < 3) {
|
||||
print "USAGE: cut_prefs.pl CUT_OFF TOTAL_PER ABS_CUT\n\n";
|
||||
print "Obcina wpisy mało ważne, opis nie jest uwzględniany\n";
|
||||
print "jeżeli zachodzi jeden z warunków:\n";
|
||||
print " - jezeli liczba wystapien danego opisu jest mniejsza\n";
|
||||
print " od \$CUT_OFF*(liczba wystapien poprzedniego opisu)\n";
|
||||
print " - jezeli liczba wystapien danego opisu jest mniejsza\n";
|
||||
print " od \$TOTAL_PER*(suma wszystkich wystapien)\n";
|
||||
print " - jezeli liczba wystapien danego opisu jest mniejsza\n";
|
||||
print " od \$ABS_CUT\n";
|
||||
exit;
|
||||
}
|
||||
|
||||
# jezeli liczba wystapien danego opisu jest mniejsza
|
||||
# od $CUT_OFF*(liczba wystapien poprzedniego opisu) - opis nie jest uwzgledniany
|
||||
my $CUT_OFF = shift;
|
||||
|
||||
# jezeli liczba wystapien danego opisu jest mniejsza
|
||||
# od $TOTAL_PER*(suma wszystkich wystapien) - opis nie jest uwzgledniany
|
||||
my $TOTAL_PER = shift;
|
||||
|
||||
# jezeli liczba wystapien danego opisu jest mniejsza
|
||||
# od $ABS_CUT - opis nie jest uwzgledniany
|
||||
my $ABS_CUT = shift;
|
||||
|
||||
my $pref = "";
|
||||
my $oldPref = " ";
|
||||
my $countTotal = -1;
|
||||
my $count = -1;
|
||||
|
||||
# Filter loop.  Every input line is expected to look like
#   prefix <TAB> description <TAB> count <TAB> total
# with the lines of one prefix adjacent and ordered by descending count
# (prep.pl sorts the count_prefs.pl output that way before calling this
# script).
#
# The first line of a prefix becomes the baseline ($count, $countTotal).  A
# line is printed only if its count exceeds all three thresholds; the first
# line that fails cuts off the rest of its prefix.
#
# Fixes relative to the earlier version: "$pref =~ //" was meant as an
# emptiness test, but an empty pattern re-runs the last successful regex, so
# the baseline was set only once; and a prefix whose lines were all kept did
# not reset the baseline for the next prefix.
while (<>) {

    # Lines without the four-column shape cannot be evaluated.
    next unless /^(\w+)\t.+\t(\d+)\t(\d+)/;
    my ($linePref, $c, $lineTotal) = ($1, $2, $3);

    if ($linePref ne $pref) {
        # New prefix: its first line is the baseline.
        $pref       = $linePref;
        $count      = $c;
        $countTotal = $lineTotal;
    }
    elsif ($count == -1) {
        # Rest of a prefix that has already been cut off.
        next;
    }

    if (($CUT_OFF*$count < $c) && ($TOTAL_PER*$countTotal < $c) && ($ABS_CUT < $c)) {
        $count = $c;
        print $_;
    }
    else {
        $count = -1;
    }
}
|
0
nawszelkiwypadek/tools/gue_dic/gue.bin
Normal file
0
nawszelkiwypadek/tools/gue_dic/gue.bin
Normal file
95
nawszelkiwypadek/tools/gue_dic/labels.lab
Normal file
95
nawszelkiwypadek/tools/gue_dic/labels.lab
Normal file
@ -0,0 +1,95 @@
|
||||
<epsilon> 0
|
||||
a 1
|
||||
± 2
|
||||
b 3
|
||||
c 4
|
||||
æ 5
|
||||
d 6
|
||||
e 7
|
||||
ê 8
|
||||
f 9
|
||||
g 10
|
||||
h 11
|
||||
i 12
|
||||
j 13
|
||||
k 14
|
||||
l 15
|
||||
³ 16
|
||||
m 17
|
||||
n 18
|
||||
ñ 19
|
||||
o 20
|
||||
ó 21
|
||||
p 22
|
||||
q 23
|
||||
r 24
|
||||
s 25
|
||||
¶ 26
|
||||
t 27
|
||||
u 28
|
||||
v 29
|
||||
w 30
|
||||
x 31
|
||||
y 32
|
||||
z 33
|
||||
¼ 34
|
||||
¿ 35
|
||||
é 36
|
||||
ö 37
|
||||
ü 38
|
||||
ä 39
|
||||
A 40
|
||||
¡ 41
|
||||
B 42
|
||||
C 43
|
||||
Æ 44
|
||||
D 45
|
||||
E 46
|
||||
Ê 47
|
||||
F 48
|
||||
G 49
|
||||
H 50
|
||||
I 51
|
||||
J 52
|
||||
K 53
|
||||
L 54
|
||||
£ 55
|
||||
M 56
|
||||
N 57
|
||||
Ñ 58
|
||||
O 59
|
||||
Ó 60
|
||||
P 61
|
||||
Q 62
|
||||
R 63
|
||||
S 64
|
||||
¦ 65
|
||||
T 66
|
||||
U 67
|
||||
V 68
|
||||
W 69
|
||||
X 70
|
||||
Y 71
|
||||
Z 72
|
||||
¬ 73
|
||||
¯ 74
|
||||
0 75
|
||||
1 76
|
||||
2 77
|
||||
3 78
|
||||
4 79
|
||||
5 80
|
||||
6 81
|
||||
7 82
|
||||
8 83
|
||||
9 84
|
||||
, 85
|
||||
. 86
|
||||
@ 87
|
||||
/ 88
|
||||
' 89
|
||||
_ 90
|
||||
~ 91
|
||||
; 92
|
||||
- 93
|
||||
\ 94
|
356
nawszelkiwypadek/tools/gue_dic/labels.scl
Normal file
356
nawszelkiwypadek/tools/gue_dic/labels.scl
Normal file
@ -0,0 +1,356 @@
|
||||
lcase 1
|
||||
lcase 2
|
||||
lcase 3
|
||||
lcase 4
|
||||
lcase 5
|
||||
lcase 6
|
||||
lcase 7
|
||||
lcase 8
|
||||
lcase 9
|
||||
lcase 10
|
||||
lcase 11
|
||||
lcase 12
|
||||
lcase 13
|
||||
lcase 14
|
||||
lcase 15
|
||||
lcase 16
|
||||
lcase 17
|
||||
lcase 18
|
||||
lcase 19
|
||||
lcase 20
|
||||
lcase 21
|
||||
lcase 22
|
||||
lcase 23
|
||||
lcase 24
|
||||
lcase 25
|
||||
lcase 26
|
||||
lcase 27
|
||||
lcase 28
|
||||
lcase 29
|
||||
lcase 30
|
||||
lcase 31
|
||||
lcase 32
|
||||
lcase 33
|
||||
lcase 34
|
||||
lcase 35
|
||||
lcase 36
|
||||
lcase 37
|
||||
lcase 38
|
||||
lcase 39
|
||||
sem 91
|
||||
sem 92
|
||||
sem 93
|
||||
sem 94
|
||||
<sigma> 1
|
||||
<sigma> 2
|
||||
<sigma> 3
|
||||
<sigma> 4
|
||||
<sigma> 5
|
||||
<sigma> 6
|
||||
<sigma> 7
|
||||
<sigma> 8
|
||||
<sigma> 9
|
||||
<sigma> 10
|
||||
<sigma> 11
|
||||
<sigma> 12
|
||||
<sigma> 13
|
||||
<sigma> 14
|
||||
<sigma> 15
|
||||
<sigma> 16
|
||||
<sigma> 17
|
||||
<sigma> 18
|
||||
<sigma> 19
|
||||
<sigma> 20
|
||||
<sigma> 21
|
||||
<sigma> 22
|
||||
<sigma> 23
|
||||
<sigma> 24
|
||||
<sigma> 25
|
||||
<sigma> 26
|
||||
<sigma> 27
|
||||
<sigma> 28
|
||||
<sigma> 29
|
||||
<sigma> 30
|
||||
<sigma> 31
|
||||
<sigma> 32
|
||||
<sigma> 33
|
||||
<sigma> 34
|
||||
<sigma> 35
|
||||
<sigma> 36
|
||||
<sigma> 37
|
||||
<sigma> 38
|
||||
<sigma> 39
|
||||
<sigma> 40
|
||||
<sigma> 41
|
||||
<sigma> 42
|
||||
<sigma> 43
|
||||
<sigma> 44
|
||||
<sigma> 45
|
||||
<sigma> 46
|
||||
<sigma> 47
|
||||
<sigma> 48
|
||||
<sigma> 49
|
||||
<sigma> 50
|
||||
<sigma> 51
|
||||
<sigma> 52
|
||||
<sigma> 53
|
||||
<sigma> 54
|
||||
<sigma> 55
|
||||
<sigma> 56
|
||||
<sigma> 57
|
||||
<sigma> 58
|
||||
<sigma> 59
|
||||
<sigma> 60
|
||||
<sigma> 61
|
||||
<sigma> 62
|
||||
<sigma> 63
|
||||
<sigma> 64
|
||||
<sigma> 65
|
||||
<sigma> 66
|
||||
<sigma> 67
|
||||
<sigma> 68
|
||||
<sigma> 69
|
||||
<sigma> 70
|
||||
<sigma> 71
|
||||
<sigma> 72
|
||||
<sigma> 73
|
||||
<sigma> 74
|
||||
<sigma> 75
|
||||
<sigma> 76
|
||||
<sigma> 77
|
||||
<sigma> 78
|
||||
<sigma> 79
|
||||
<sigma> 80
|
||||
<sigma> 81
|
||||
<sigma> 82
|
||||
<sigma> 83
|
||||
<sigma> 84
|
||||
<sigma> 85
|
||||
<sigma> 86
|
||||
<sigma> 87
|
||||
<sigma> 88
|
||||
<sigma> 89
|
||||
<sigma> 90
|
||||
<sigma> 91
|
||||
<sigma> 92
|
||||
<sigma> 93
|
||||
<sigma> 94
|
||||
ucase 40
|
||||
ucase 41
|
||||
ucase 42
|
||||
ucase 43
|
||||
ucase 44
|
||||
ucase 45
|
||||
ucase 46
|
||||
ucase 47
|
||||
ucase 48
|
||||
ucase 49
|
||||
ucase 50
|
||||
ucase 51
|
||||
ucase 52
|
||||
ucase 53
|
||||
ucase 54
|
||||
ucase 55
|
||||
ucase 56
|
||||
ucase 57
|
||||
ucase 58
|
||||
ucase 59
|
||||
ucase 60
|
||||
ucase 61
|
||||
ucase 62
|
||||
ucase 63
|
||||
ucase 64
|
||||
ucase 65
|
||||
ucase 66
|
||||
ucase 67
|
||||
ucase 68
|
||||
ucase 69
|
||||
ucase 70
|
||||
ucase 71
|
||||
ucase 72
|
||||
ucase 73
|
||||
ucase 74
|
||||
letter 1
|
||||
letter 2
|
||||
letter 3
|
||||
letter 4
|
||||
letter 5
|
||||
letter 6
|
||||
letter 7
|
||||
letter 8
|
||||
letter 9
|
||||
letter 10
|
||||
letter 11
|
||||
letter 12
|
||||
letter 13
|
||||
letter 14
|
||||
letter 15
|
||||
letter 16
|
||||
letter 17
|
||||
letter 18
|
||||
letter 19
|
||||
letter 20
|
||||
letter 21
|
||||
letter 22
|
||||
letter 23
|
||||
letter 24
|
||||
letter 25
|
||||
letter 26
|
||||
letter 27
|
||||
letter 28
|
||||
letter 29
|
||||
letter 30
|
||||
letter 31
|
||||
letter 32
|
||||
letter 33
|
||||
letter 34
|
||||
letter 35
|
||||
letter 36
|
||||
letter 37
|
||||
letter 38
|
||||
letter 39
|
||||
letter 40
|
||||
letter 41
|
||||
letter 42
|
||||
letter 43
|
||||
letter 44
|
||||
letter 45
|
||||
letter 46
|
||||
letter 47
|
||||
letter 48
|
||||
letter 49
|
||||
letter 50
|
||||
letter 51
|
||||
letter 52
|
||||
letter 53
|
||||
letter 54
|
||||
letter 55
|
||||
letter 56
|
||||
letter 57
|
||||
letter 58
|
||||
letter 59
|
||||
letter 60
|
||||
letter 61
|
||||
letter 62
|
||||
letter 63
|
||||
letter 64
|
||||
letter 65
|
||||
letter 66
|
||||
letter 67
|
||||
letter 68
|
||||
letter 69
|
||||
letter 70
|
||||
letter 71
|
||||
letter 72
|
||||
letter 73
|
||||
letter 74
|
||||
all 1
|
||||
all 2
|
||||
all 3
|
||||
all 4
|
||||
all 5
|
||||
all 6
|
||||
all 7
|
||||
all 8
|
||||
all 9
|
||||
all 10
|
||||
all 11
|
||||
all 12
|
||||
all 13
|
||||
all 14
|
||||
all 15
|
||||
all 16
|
||||
all 17
|
||||
all 18
|
||||
all 19
|
||||
all 20
|
||||
all 21
|
||||
all 22
|
||||
all 23
|
||||
all 24
|
||||
all 25
|
||||
all 26
|
||||
all 27
|
||||
all 28
|
||||
all 29
|
||||
all 30
|
||||
all 31
|
||||
all 32
|
||||
all 33
|
||||
all 34
|
||||
all 35
|
||||
all 36
|
||||
all 37
|
||||
all 38
|
||||
all 39
|
||||
all 40
|
||||
all 41
|
||||
all 42
|
||||
all 43
|
||||
all 44
|
||||
all 45
|
||||
all 46
|
||||
all 47
|
||||
all 48
|
||||
all 49
|
||||
all 50
|
||||
all 51
|
||||
all 52
|
||||
all 53
|
||||
all 54
|
||||
all 55
|
||||
all 56
|
||||
all 57
|
||||
all 58
|
||||
all 59
|
||||
all 60
|
||||
all 61
|
||||
all 62
|
||||
all 63
|
||||
all 64
|
||||
all 65
|
||||
all 66
|
||||
all 67
|
||||
all 68
|
||||
all 69
|
||||
all 70
|
||||
all 71
|
||||
all 72
|
||||
all 73
|
||||
all 74
|
||||
all 75
|
||||
all 76
|
||||
all 77
|
||||
all 78
|
||||
all 79
|
||||
all 80
|
||||
all 81
|
||||
all 82
|
||||
all 83
|
||||
all 84
|
||||
all 85
|
||||
all 86
|
||||
all 87
|
||||
all 88
|
||||
all 89
|
||||
all 90
|
||||
all 91
|
||||
all 92
|
||||
all 93
|
||||
all 94
|
||||
digit 75
|
||||
digit 76
|
||||
digit 77
|
||||
digit 78
|
||||
digit 79
|
||||
digit 80
|
||||
digit 81
|
||||
digit 82
|
||||
digit 83
|
||||
digit 84
|
||||
signs 85
|
||||
signs 86
|
||||
signs 87
|
||||
signs 88
|
||||
signs 89
|
||||
signs 90
|
7
nawszelkiwypadek/tools/gue_dic/labels.sym
Normal file
7
nawszelkiwypadek/tools/gue_dic/labels.sym
Normal file
@ -0,0 +1,7 @@
|
||||
lcase a ± b c æ d e ê f g h i j k l ³ m n ñ o ó p q r s ¶ t u v w x y z ¼ ¿ é ö ü ä
|
||||
ucase A ¡ B C Æ D E Ê F G H I J K L £ M N Ñ O Ó P Q R S ¦ T U V W X Y Z ¬ ¯
|
||||
letter lcase ucase
|
||||
digit 0 1 2 3 4 5 6 7 8 9
|
||||
signs , . @ / ' _
|
||||
sem ~ ; - \
|
||||
all letter digit signs sem
|
11
nawszelkiwypadek/tools/gue_dic/makeLabels.pl
Executable file
11
nawszelkiwypadek/tools/gue_dic/makeLabels.pl
Executable file
@ -0,0 +1,11 @@
|
||||
#!/usr/bin/perl
|
||||
|
||||
use locale;
|
||||
|
||||
print "lcase a ± b c æ d e ê f g h i j k l ³ m n ñ o ó p q r s ¶ t u v w x y z ¼ ¿ é ö ü ä\n";
|
||||
print "ucase A ¡ B C Æ D E Ê F G H I J K L £ M N Ñ O Ó P Q R S ¦ T U V W X Y Z ¬ ¯\n";
|
||||
print "letter lcase ucase\n";
|
||||
print "digit 0 1 2 3 4 5 6 7 8 9\n";
|
||||
print "signs , . @ \/ \' _\n";
|
||||
print "sem ~ ; - \\ \n";
|
||||
print "all letter digit signs sem\n";
|
213
nawszelkiwypadek/tools/gue_dic/prep.pl
Executable file
213
nawszelkiwypadek/tools/gue_dic/prep.pl
Executable file
@ -0,0 +1,213 @@
|
||||
#! /usr/bin/perl
|
||||
|
||||
use locale;
|
||||
|
||||
$linesPerFile = 20000;
|
||||
|
||||
if (@ARGV < 1) {
|
||||
print "usage: prep.pl dictionary_file\n";
|
||||
exit;
|
||||
}
|
||||
|
||||
$file = shift; # @ARGV;
|
||||
$kind = shift;
|
||||
|
||||
if ($kind eq "") {
|
||||
$kind="suf";
|
||||
}
|
||||
|
||||
# Przygotowanie etykiet
|
||||
|
||||
`makeLabels.pl > labels.sym`;
|
||||
|
||||
`lexmakelab labels`;
|
||||
|
||||
# Analiza pliku s³ownika
|
||||
|
||||
|
||||
print "Kanonizujê opisy.........................................";
|
||||
|
||||
`canon.pl <$file >temp2`;
|
||||
|
||||
print "OK\n";
|
||||
|
||||
print "Analizuje prefiksy.......................................";
|
||||
|
||||
`count_prefs.pl 2 4 < temp2 > prefs`;
|
||||
`sort -k1,1 -k3,3nr prefs > prefsS`;
|
||||
`cut_prefs.pl 0.5 0.01 100 prefsS > prefs`;
|
||||
`rm prefsS`;
|
||||
|
||||
print "OK\n";
|
||||
|
||||
print "Analizujê plik s³ownika";
|
||||
|
||||
if ($kind eq "pre") {
|
||||
print "(pre).............................";
|
||||
`stat_pre.pl temp2 > temp1`;
|
||||
} else {
|
||||
print "(suf).............................";
|
||||
`stat.pl prefs < temp2 > temp1`;
|
||||
}
|
||||
|
||||
print "OK\n";
|
||||
|
||||
# zmniejszamy plik...
|
||||
|
||||
print "Sortujê plik.............................................";
|
||||
|
||||
`sort -t \\~ -k1,1 -k2,2nr <temp1 > temp2`;
|
||||
|
||||
print "OK\n";
|
||||
|
||||
print "Minimalizujê plik s³ownika...............................";
|
||||
|
||||
`rmDup.pl < temp2 > temp1`;
|
||||
#`rmDup2.pl < temp1 > temp2`;
|
||||
|
||||
`cp temp1 temp2`;
|
||||
|
||||
`rm temp1`;
|
||||
|
||||
print "OK\n";
|
||||
|
||||
#dzielimy plik na wiele czê¶ci, uruchamiamy lexcomplex dla ka¿dej
|
||||
#czê¶ci osobno, nastêpnie ³±czymy to za pomoc± programu fsmunion
|
||||
|
||||
print "Dzielê s³ownik na mniejsze czê¶ci........................";
|
||||
|
||||
# Split ./temp2 into chunk files LemTEMP/slo_0, LemTEMP/slo_1, ... so that
# lexcomplex can be run on each chunk separately.  A new chunk is started
# whenever the line counter reaches $linesPerFile.  Afterwards $fileCount
# holds the index of the last chunk written.
open(my $in, "<", "./temp2") or die "Cannot open ./temp2: $!\n";

$lineCount = 0;
$fileCount = 0;

`mkdir LemTEMP`;

open(my $chunk, ">", "LemTEMP/slo_0") or die "Cannot create LemTEMP/slo_0: $!\n";

while (<$in>) {

    if (++$lineCount >= $linesPerFile) {
        $fileCount++;
        $lineCount = 0;

        close($chunk);
        open($chunk, ">", "LemTEMP/slo_".$fileCount)
            or die "Cannot create LemTEMP/slo_$fileCount: $!\n";
    }

    print {$chunk} $_;
}

# Close the handles explicitly so the last chunk is complete on disk before
# the external tools read it.
close($chunk);
close($in);
|
||||
|
||||
print "OK\n";
|
||||
|
||||
print "Tworzê automaty po¶rednie";
|
||||
|
||||
#32 kropki, fileCount plikow
|
||||
$filesPerDot = $fileCount/32;
|
||||
$files=$filesPerDot;
|
||||
$dots=0;
|
||||
|
||||
for ($i=0; $i<=$fileCount; $i++) {
|
||||
|
||||
if ($files >= $filesPerDot) {
|
||||
$files = 0;
|
||||
print ".";
|
||||
$dots++;
|
||||
}
|
||||
$files++;
|
||||
|
||||
$command = "lexcomplex -l labels.lab -S labels.scl < LemTEMP/slo_".$i." > LemTEMP/slownik_".$i.".fsm";
|
||||
|
||||
`$command`;
|
||||
|
||||
}
|
||||
if ($dots < 32) {
|
||||
for ($i=0; $i<32 - $dots; $i++) {
|
||||
print ".";
|
||||
}
|
||||
}
|
||||
|
||||
print "OK\n";
|
||||
|
||||
`rm LemTEMP/slo_*`;
|
||||
|
||||
print "Tworzê automat koñcowy";
|
||||
|
||||
# 35 progress dots for this stage.
#
# A pairwise fsmunion loop used to sit here, but it could never run: its
# bound was the undefined $filecount (the counter is spelled $fileCount), and
# the command it built named LemTEMP/slownik_N without the ".fsm" suffix.
# The final automaton has always come from the single fsmunion call below,
# so only the progress output of that dead code is kept.
print "." x 35;

# Union of every intermediate automaton.  The slo_* chunk files were removed
# above, so only the slownik_*.fsm files are left in LemTEMP.
`fsmunion LemTEMP/* > slownik1.fsm`;
|
||||
|
||||
print "OK\n";
|
||||
|
||||
print "Usuwam epsilon-przejscia.................................";
|
||||
|
||||
`fsmrmepsilon slownik1.fsm > slownik2.fsm`;
|
||||
|
||||
`rm slownik1.fsm`;
|
||||
|
||||
print "OK\n";
|
||||
|
||||
print "Determinizujê automat....................................";
|
||||
|
||||
`fsmdeterminize slownik2.fsm > slownik1.fsm`;
|
||||
|
||||
`rm slownik2.fsm`;
|
||||
|
||||
print "OK\n";
|
||||
|
||||
print "Minimalizujê automat.....................................";
|
||||
|
||||
`fsmminimize slownik1.fsm > slownik.fsm`;
|
||||
|
||||
`rm slownik1.fsm`;
|
||||
|
||||
print "OK\n";
|
||||
|
||||
print "Konwertujê automat do formatu fsa........................";
|
||||
|
||||
`fsmprint -i labels.lab slownik.fsm > slownik.txt`;
|
||||
|
||||
`../fsm2aut slownik.txt > slownik.aut`;
|
||||
|
||||
`../aut2fsa < slownik.aut > gue.bin`;
|
||||
|
||||
print "OK\n";
|
||||
|
||||
print "Czyszczê pliki pomocnicze................................";
|
||||
|
||||
`rm LemTEMP/*`;
|
||||
`rmdir LemTEMP`;
|
||||
`rm temp2`;
|
||||
`rm slownik.fsm`;
|
||||
`rm slownik.txt`;
|
||||
`rm slownik.aut`;
|
||||
|
||||
print "OK\n";
|
31
nawszelkiwypadek/tools/gue_dic/prep_user_dict.pl
Executable file
31
nawszelkiwypadek/tools/gue_dic/prep_user_dict.pl
Executable file
@ -0,0 +1,31 @@
|
||||
#! /usr/bin/env perl
|
||||
|
||||
use locale;
|
||||
use strict;
|
||||
|
||||
##################################################
# Transforms a user dictionary into a dictionary
# understood by gue.
# Input format:
#     pref*kon~weight;description
# Output format:
#     nok_pref~weight;description
# where "nok" is the ending "kon" written backwards.
#
# The prefix may be empty, and so may the ending.
##################################################
|
||||
|
||||
# Convert every user-dictionary entry "pref*kon~weight;description" read
# from the input into "nok_pref~weight;description", where "nok" is the
# ending written backwards.  The "_pref" part is emitted only when the
# prefix is not empty.
while (my $line = <>) {

    # Lines that do not have the expected shape are skipped.  Without this
    # check the capture variables would still hold the values of an earlier
    # successful match and a stale entry would be printed again.
    next unless $line =~ /^(\w*)\*(\w*)(~.*)$/;
    my ($pref, $kon, $desc) = ($1, $2, $3);

    # The ending is written in reverse order.
    print scalar reverse($kon);

    # String comparison is required here: the numeric "!=" evaluated every
    # non-numeric prefix as 0, so the prefix was never printed.
    if ($pref ne "") {
        print "_$pref";
    }

    # $desc starts with '~' and carries the weight and the description.
    print "$desc\n";
}
|
55
nawszelkiwypadek/tools/gue_dic/rmDup.pl
Executable file
55
nawszelkiwypadek/tools/gue_dic/rmDup.pl
Executable file
@ -0,0 +1,55 @@
|
||||
#!/usr/bin/perl
|
||||
|
||||
# Usuwa zbędne powtórzenia scieżek
|
||||
|
||||
use locale;
|
||||
|
||||
#if (@ARGV < 1) {
|
||||
# print "USAGE: remDup.pl num\n\tGdzie \"num\" jest ilością powtórzeń, które zostawiamy\n";
|
||||
# exit(0);
|
||||
#}
|
||||
|
||||
|
||||
|
||||
# Number of lines that are kept for each ending.
$max = 10;

# The input is assumed to be sorted by ending and then by probability, so
# all lines sharing an ending are adjacent and the first $max lines of each
# run are the ones worth keeping.

my $current_ending;    # ending of the run that is currently being read
my $kept = 0;          # number of lines of that run seen so far

while (defined(my $line = <>)) {

    # Empty separator lines are not copied to the output.
    next if $line =~ /^$/;

    # The ending is everything in front of the first '~'.
    my ($ending) = $line =~ /^([^~]+)~/;

    # A line without an ending cannot belong to any run: pass it through
    # unchanged and start counting afresh.
    if (!defined $ending) {
        print $line;
        undef $current_ending;
        next;
    }

    # Runs are delimited by exact string comparison.  The previous regex
    # test ($end =~ m/$theEnd/) also accepted every ending that merely
    # contained the current one, merging unrelated runs.
    if (!defined $current_ending || $ending ne $current_ending) {
        $current_ending = $ending;
        $kept = 0;
    }

    # Print the first $max lines of the run and drop the rest.  The earlier
    # skip loop tested a variable ($oldEnd) that was never assigned before
    # the test, so surplus lines were not removed; it also kept reading past
    # the end of the input.
    print $line if $kept++ < $max;
}
|
165
nawszelkiwypadek/tools/gue_dic/stat.pl
Executable file
165
nawszelkiwypadek/tools/gue_dic/stat.pl
Executable file
@ -0,0 +1,165 @@
|
||||
#! /usr/bin/perl
|
||||
|
||||
use locale;
|
||||
|
||||
######################################################
# Input: a file with lines of the form
#     word;description
#
# Output: lines of the form
#     ending(rev)[_prefix]~weight;description
# where:
#  - ending(rev) is an ending of the word written in
#    reverse order; for every word in the dictionary
#    the endings of length 2 up to the length of the
#    word are written,
#  - _prefix is appended when a prefix loaded from the
#    optional prefix file matches the beginning of the
#    word together with its description,
#  - weight is the probability (in permille) of the
#    description for the given ending, computed by a
#    statistical analysis of the dictionary; e.g. 250
#    means that the description appears once per 4
#    occurrences of the ending.
# The weight is written as computed: the "1000 - p"
# inversion that would give the most probable path the
# lowest cost is currently commented out.
######################################################
|
||||
|
||||
######
# CONSTANTS
#
# A path is written only if its probability (in permille) is greater than
# this threshold.
$MIN_PROB = 0;
#
# Maximum number of repetitions of one ending (taken from the most
# probable one downwards).
# NOTE(review): this value is not read anywhere in the visible part of the
# script - confirm whether it is still needed.
$MAX_PATH = 10;
#
# Character that separates the ending from the prefix.
$PREF_SIGN = "_";
######
# Global variables
#
# Table of the prefixes that are to be included in the output file.
# Key   - the string "<prefix>$PREF_SIGN<description>".
# Value - 1 if the prefix is to be included; missing otherwise.
my %prefs = ();
#
# Length of the longest prefix that is analysed.
my $MAX_PREF = 0;
######
|
||||
|
||||
###########################################################
|
||||
# FUNKCJE
|
||||
|
||||
# Loads the prefixes into the %prefs hash.
# Parameters:
#  - name of the file the prefixes are read from
# File format, one entry per line:
#  prefix\tdescription\t...\n
# Effects: for every well-formed line the key
# "<prefix>$PREF_SIGN<description>" is set to 1 in %prefs, and $MAX_PREF is
# raised to the length of the longest prefix seen.
# Dies when the file cannot be opened.
sub load_prefs {

    my $file = shift;

    # Three-argument open on a lexical handle: the file name can no longer be
    # interpreted as an open mode, and a missing file is reported instead of
    # silently producing an empty prefix table.
    open(my $in, '<', $file)
        or die "Cannot open prefix file '$file': $!\n";

    while (my $line = <$in>) {

        # Lines that do not match the format are ignored; previously they
        # re-registered the captures left over from an earlier line.
        next unless $line =~ /^(\w+)\t([^\t]+)\t/;
        my ($pref, $desc) = ($1, $2);

        my $len = length($pref);
        if ($len > $MAX_PREF) {
            $MAX_PREF = $len;
        }
        $prefs{"$pref$PREF_SIGN$desc"} = 1;
    }

    close($in);
    return;
}
|
||||
|
||||
###########################################################
|
||||
|
||||
# If an argument is given, it is the name of the file with the prefixes.
if (@ARGV > 0) {
    load_prefs(shift);
}

# The whole dictionary is kept in memory because it is scanned once for
# every ending length.
my @input = <>;

# Length of the endings collected in the current pass.
my $n = 2;

# True as long as the last pass found at least one word with $n characters.
my $go = 1;

while ($go) {

    # All three tables are rebuilt for every ending length.  %sumy used to be
    # declared as the scalar "my $sumy", so the hash actually used was a
    # global that was never reset between passes.
    my %koncowki;    # "ending;description" => number of occurrences
    my %sumy;        # ending               => number of occurrences
    my %czesci;      # "ending;description" => [ending, suffix, prefix, description]

    $go = 0;
    for my $m (@input) {

        # Take the last $n word characters directly in front of the FIRST
        # ';' of the line, so that text inside the description can never be
        # mistaken for a word ending.
        next unless $m =~ /^[^;]*?(\w{$n});(.*)$/;
        $go = 1;
        my ($suffix, $desc) = ($1, $2);

        # Look for the longest loaded prefix that starts the word and is
        # registered for this description.
        my $pref = "";
        for (my $i = $MAX_PREF; $i > 0; $i--) {

            # A word with fewer than $i leading word characters has no prefix
            # of that length; without this check a stale capture was used.
            next unless $m =~ /^(\w{$i})/;
            my $candidate = $1;
            if ($prefs{"$candidate$PREF_SIGN$desc"}) {
                $pref = $candidate;
                last;
            }
        }

        my $ending = $pref eq "" ? $suffix : "$suffix$PREF_SIGN$pref";
        my $key    = $ending.";".$desc;

        $koncowki{$key}++;
        $sumy{$ending}++;

        # Remember the parts so that the output loop does not have to parse
        # the key again (the description may itself contain ';', and word
        # characters include '_', which made re-parsing ambiguous).
        $czesci{$key} = [$ending, $suffix, $pref, $desc];
    }

    print "\n";

    for my $key (keys %koncowki) {
        my ($ending, $suffix, $pref, $opis) = @{ $czesci{$key} };

        my $p = $koncowki{$key} / $sumy{$ending};
        $p *= 1000;    # value in permille

        if ($p <= $MIN_PROB) {
            next;
        }

        # The "1000 - $p" inversion is intentionally not applied here; it was
        # already commented out in the previous version of this script.

        # The suffix is written backwards, followed by the prefix if any.
        my $rev = reverse($suffix);
        if ($pref ne "") {
            $rev .= "$PREF_SIGN$pref";
        }

        # The '-' character is wrapped in [] for lextools.
        # NOTE(review): only the first '-' of the description is wrapped -
        # confirm whether descriptions can contain more than one.
        $opis =~ s/-/\[-\]/;

        printf "%s~%.0f;%s\n", $rev, $p, $opis;
    }

    $n++;
}
|
95
nawszelkiwypadek/tools/gue_dic/stat_pre.pl
Executable file
95
nawszelkiwypadek/tools/gue_dic/stat_pre.pl
Executable file
@ -0,0 +1,95 @@
|
||||
#! /usr/bin/perl
|
||||
|
||||
use locale;
|
||||
|
||||
######################################################
# Input: a file with lines of the form
#     word;description
#
# Output: lines of the form
#     prefix~cost;description
# where:
#  - prefix consists of the first n characters of the
#    word; for every word the prefixes of length 1 to
#    6 are written,
#  - cost is derived from the probability (in permille)
#    of the description for the given prefix, computed
#    by a statistical analysis of the dictionary; e.g.
#    a probability of 250 means that the description
#    appears once per 4 occurrences of the prefix.
# The value written is 1000 minus that probability, so
# that the most probable path has the lowest cost.
######################################################
|
||||
|
||||
######
# CONSTANTS
#
# A path is written only if its probability (in permille) is greater than
# this threshold.
$MIN_PROB = 0;
#
# Maximum number of repetitions of one prefix (taken from the most probable
# one downwards).
# NOTE(review): this value is not read anywhere in this script - confirm
# whether it is still needed.
$MAX_PATH = 10;
#
# Length of the longest prefix that is analysed.
my $MAX_PREF_LEN = 6;
######

# The whole dictionary is kept in memory because it is scanned once for
# every prefix length.
my @input = <>;

for my $n (1 .. $MAX_PREF_LEN) {

    # Both tables are rebuilt for every prefix length.  %sumy used to be
    # declared as the scalar "my $sumy", so the hash actually used was a
    # global that was never reset between passes.
    my %prefiksy;    # "prefix;description" => number of occurrences
    my %sumy;        # prefix               => number of occurrences

    for my $m (@input) {
        next unless $m =~ /^(\w{$n})\w*;(.*)$/;
        $prefiksy{$1.";".$2}++;
        $sumy{$1}++;
    }

    print "\n";

    for my $prefiks (keys %prefiksy) {

        # The prefix never contains ';', so the key is split at its first
        # ';'.  The captures are copied immediately: the substitution below
        # is itself a match and clears $1, which made the prefix disappear
        # from the output for every description containing '-'.
        next unless $prefiks =~ /^([^;]*);(.*)$/;
        my ($pref, $opis) = ($1, $2);

        my $p = $prefiksy{$prefiks} / $sumy{$pref};
        $p *= 1000;    # value in permille

        if ($p <= $MIN_PROB) {
            next;
        }

        # Inverted, so that the most probable path has the lowest cost.
        $p = 1000 - $p;

        # The '-' character is wrapped in [].
        # NOTE(review): only the first '-' of the description is wrapped -
        # confirm whether descriptions can contain more than one.
        $opis =~ s/-/\[-\]/;

        printf "%s~%.0f;%s\n", $pref, $p, $opis;
    }
}
|
11
nawszelkiwypadek/tools/lem_dic/makeLabels.pl
Executable file
11
nawszelkiwypadek/tools/lem_dic/makeLabels.pl
Executable file
@ -0,0 +1,11 @@
|
||||
#!/usr/bin/perl
|
||||
|
||||
use locale;
|
||||
|
||||
# Label definitions written to standard output, one definition per line.
# Each line presumably starts with the name of a symbol class followed by its
# members (single characters or names of other classes) - confirm against the
# lexmakelab documentation.  The non-ASCII letters are kept exactly as they
# appear in this file.
my @label_lines = (
    "lcase a ± b c æ d e ê f g h i j k l ³ m n ñ o ó p q r s ¶ t u v w x y z ¼ ¿ é ö ü ä\n",
    "ucase A ¡ B C Æ D E Ê F G H I J K L £ M N Ñ O Ó P Q R S ¦ T U V W X Y Z ¬ ¯\n",
    "letter lcase ucase\n",
    "digit 0 1 2 3 4 5 6 7 8 9\n",
    "signs , . @ \/ \'\n",
    "sem ~ ; _ - + ? \\ \n",
    "all letter digit signs sem\n",
);

print @label_lines;
|
75
nawszelkiwypadek/tools/lem_dic/prep.pl
Executable file
75
nawszelkiwypadek/tools/lem_dic/prep.pl
Executable file
@ -0,0 +1,75 @@
|
||||
#! /usr/bin/perl
|
||||
|
||||
use locale;
|
||||
use strict;
|
||||
|
||||
# Builds the binary dictionary <name>.bin from the source file <name>.dic
# given as the first argument, by running the external tools below in
# sequence.  Intermediate results are kept in temp1/temp2 in the current
# directory and removed at the end.

my $file = shift;
my $filename;

# The progress messages end without a newline; unbuffered output makes each
# of them visible before the (possibly long) step it announces is started.
$| = 1;

# Quotes a string so that the shell passes it on as one literal word.
sub _shell_quote {
    my ($text) = @_;
    $text =~ s/'/'\\''/g;
    return "'$text'";
}

# Runs one shell command the same way the backticks did (its standard output
# is captured and discarded) and reports a non-zero exit status instead of
# ignoring it.  A failure is only a warning, so the remaining steps still run.
sub _run_step {
    my ($command) = @_;
    `$command`;
    warn "Command failed (status $?): $command\n" if $? != 0;
    return;
}

if (!defined($file) || $file eq "") {
    print "Podaj nazwê pliku.\n";
    exit(1);
}

# The extension has to be at the very end of the name; the unanchored test
# also accepted names such as "x.dictionary".
if ($file =~ /^(.*)\.dic$/)
{
    $filename = $1;
}
else
{
    print "The input file must have .dic extension.";
    exit(1);
}

# File names are quoted before they are placed in a shell command, so names
# containing spaces or shell metacharacters are handled correctly.
my $source = _shell_quote($file);
my $target = _shell_quote("$filename.bin");

_run_step("makeLabels.pl > labels.sym");

_run_step("lexmakelab labels");

# NOTE(review): the canonisation step (canon.pl) is disabled, but its
# progress message is still printed and is not followed by "OK".
print "Kanonizujê opisy.........................................";

print "Kompilujê automat........................................";

_run_step("lexcomplex -l labels.lab -S labels.scl <$source > temp1");

print "OK\n";

print "Usuwam epsilon-przejscia.................................";

_run_step("fsmrmepsilon temp1> temp2");

print "OK\n";

print "Determinizujê automat....................................";

_run_step("fsmdeterminize temp2 > temp1");

print "OK\n";

print "Minimalizujê automat.....................................";

_run_step("fsmminimize temp1> temp2");

print "OK\n";

print "Konwertujê automat do formatu fsa........................";

_run_step("fsmprint -i labels.lab temp2> temp1");

# A copy of the fsmprint output is left behind in "pofsmprint".
_run_step("cp temp1 pofsmprint");

_run_step("../fsm2aut temp1> temp2");

_run_step("./aut2fsa.nowy < temp2> $target");

print "OK\n";

print "Czyszczê pliki pomocnicze................................";

_run_step("rm temp2");
_run_step("rm temp1");
# The labels.* files are deliberately left in place.

print "OK\n";
|
||||
|
34
www/copyright.html
Normal file
34
www/copyright.html
Normal file
@ -0,0 +1,34 @@
|
||||
<html>
|
||||
<head>
|
||||
<title>UAM Text Tools</title>
|
||||
</head>
|
||||
<body>
|
||||
<h1>UAM Text Tools</h1>
|
||||
|
||||
|
||||
<h3>UTT components</h3>
|
||||
<ul>
|
||||
<li>tok - tokenizer
|
||||
<li>lem - lemmatizer
|
||||
<li>gue - guesser
|
||||
<li>cor - corrector
|
||||
<li>ser - searcher
|
||||
<li>grp - grepper
|
||||
<li>con - concordancer
|
||||
<li>dgp - dependency graph parser
|
||||
<li>kot - rezinekot
|
||||
</ul>
|
||||
|
||||
<h3>Download</h3>
|
||||
<ul>
|
||||
<li>Software
|
||||
<li>Polish dictionary data (PMDB tagset)
|
||||
<li>Portuguese dictionary data (INTEX tagset)
|
||||
</ul>
|
||||
|
||||
|
||||
|
||||
Authors: Tomasz Obr&#281;bski, Micha&#322; Stolarski, Justyna Walkowska
|
||||
|
||||
</body>
|
||||
</html>
|
34
www/downloads.html
Normal file
34
www/downloads.html
Normal file
@ -0,0 +1,34 @@
|
||||
<html>
|
||||
<head>
|
||||
<title>UAM Text Tools</title>
|
||||
</head>
|
||||
<body>
|
||||
<h1>UAM Text Tools</h1>
|
||||
|
||||
|
||||
<h3>UTT components</h3>
|
||||
<ul>
|
||||
<li>tok - tokenizer
|
||||
<li>lem - lemmatizer
|
||||
<li>gue - guesser
|
||||
<li>cor - corrector
|
||||
<li>ser - searcher
|
||||
<li>grp - grepper
|
||||
<li>con - concordancer
|
||||
<li>dgp - dependency graph parser
|
||||
<li>kot - rezinekot
|
||||
</ul>
|
||||
|
||||
<h3>Download</h3>
|
||||
<ul>
|
||||
<li>Software
|
||||
<li>Polish dictionary data (PMDB tagset)
|
||||
<li>Portuguese dictionary data (INTEX tagset)
|
||||
</ul>
|
||||
|
||||
|
||||
|
||||
Authors: Tomasz Obr&#281;bski, Micha&#322; Stolarski, Justyna Walkowska
|
||||
|
||||
</body>
|
||||
</html>
|
34
www/index.html
Normal file
34
www/index.html
Normal file
@ -0,0 +1,34 @@
|
||||
<html>
|
||||
<head>
|
||||
<title>UAM Text Tools</title>
|
||||
</head>
|
||||
<body>
|
||||
<h1>UAM Text Tools</h1>
|
||||
|
||||
|
||||
<h3>UTT components</h3>
|
||||
<ul>
|
||||
<li>tok - tokenizer
|
||||
<li>lem - lemmatizer
|
||||
<li>gue - guesser
|
||||
<li>cor - corrector
|
||||
<li>ser - searcher
|
||||
<li>grp - grepper
|
||||
<li>con - concordancer
|
||||
<li>dgp - dependency graph parser
|
||||
<li>kot - rezinekot
|
||||
</ul>
|
||||
|
||||
<h3>Download</h3>
|
||||
<ul>
|
||||
<li>Software
|
||||
<li>Polish dictionary data (PMDB tagset)
|
||||
<li>Portuguese dictionary data (INTEX tagset)
|
||||
</ul>
|
||||
|
||||
|
||||
|
||||
Authors: Tomasz Obr&#281;bski, Micha&#322; Stolarski, Justyna Walkowska
|
||||
|
||||
</body>
|
||||
</html>
|
34
www/utt.html
Normal file
34
www/utt.html
Normal file
@ -0,0 +1,34 @@
|
||||
<html>
|
||||
<head>
|
||||
<title>UAM Text Tools</title>
|
||||
</head>
|
||||
<body>
|
||||
<h1>UAM Text Tools</h1>
|
||||
|
||||
|
||||
<h3>UTT components</h3>
|
||||
<ul>
|
||||
<li>tok - tokenizer
|
||||
<li>lem - lemmatizer
|
||||
<li>gue - guesser
|
||||
<li>cor - corrector
|
||||
<li>ser - searcher
|
||||
<li>grp - grepper
|
||||
<li>con - concordancer
|
||||
<li>dgp - dependency graph parser
|
||||
<li>kot - rezinekot
|
||||
</ul>
|
||||
|
||||
<h3>Download</h3>
|
||||
<ul>
|
||||
<li>Software
|
||||
<li>Polish dictionary data (PMDB tagset)
|
||||
<li>Portuguese dictionary data (INTEX tagset)
|
||||
</ul>
|
||||
|
||||
|
||||
|
||||
Authors: Tomasz Obr&#281;bski, Micha&#322; Stolarski, Justyna Walkowska
|
||||
|
||||
</body>
|
||||
</html>
|
Loading…
Reference in New Issue
Block a user