From 19760efd7bb36c499d7149474a4898332fa60a7c Mon Sep 17 00:00:00 2001 From: obrebski Date: Thu, 15 May 2008 19:55:02 +0000 Subject: [PATCH] uzupelnione configi oprawiona obsluga opcji weight w gue tre znajduje swoja biblioteke (nie znajdywal wczesniej) git-svn-id: svn://atos.wmid.amu.edu.pl/utt@51 e293616e-ec6a-49c2-aa92-f4a8b91c5d16 --- app/TODO | 1 + app/conf/kor.conf | 1 + app/conf/ser.conf | 1 + app/doc/utt.texinfo | 61 ++++++++++++++++++----------------- app/src/dgp/tre.rb | 7 ++-- app/src/gue/cmdline_guess.ggo | 2 +- app/src/gue/common_guess.cc | 4 +-- 7 files changed, 41 insertions(+), 36 deletions(-) diff --git a/app/TODO b/app/TODO index 2a4b8a4..08b8999 100644 --- a/app/TODO +++ b/app/TODO @@ -1,6 +1,7 @@ BARDZO WAZNE: * przemyslec sposob wybierania jezyka / slownika po zainstalowaniu roznych dystrybucji [PK, TO] * gue nie sortuje wynikow, opcja weights dziala na odwrot +* kor nie wykonuje zamian -> , np. ż rz WAZNE: * zamienic kota na lepszego (Kubis) [TO] diff --git a/app/conf/kor.conf b/app/conf/kor.conf index 6e743f3..e4b9315 100644 --- a/app/conf/kor.conf +++ b/app/conf/kor.conf @@ -13,3 +13,4 @@ dictionary-home = PATH_PREFIX/share/utt weights = PATH_PREFIX/share/utt/weights.kor threshold = 1.0 +process=W diff --git a/app/conf/ser.conf b/app/conf/ser.conf index 33fec25..3e52657 100644 --- a/app/conf/ser.conf +++ b/app/conf/ser.conf @@ -12,3 +12,4 @@ # macros = PATH_PREFIX/lib/utt/terms.m4 flex-template = PATH_PREFIX/lib/utt/ser.l.template +tags=uam diff --git a/app/doc/utt.texinfo b/app/doc/utt.texinfo index 1be4ccb..1102734 100644 --- a/app/doc/utt.texinfo +++ b/app/doc/utt.texinfo @@ -10,7 +10,7 @@ @copying This manual is for UAM Text Tools (version 0.90, November, 2007) -Copyright @copyright{} 2005, 2007 Tomasz Obrębski, Michał Stolarski, Justyna Walkowska, Paweł Konieczka. +Copyright @copyright{} 2005, 2007 Tomasz Obrębski, Michał Stolarski, Justyna Walkowska, Paweł Konieczka. 
Permission is granted to copy, distribute and/or modify this document under the terms of the GNU Free Documentation License, Version 1.2 @@ -127,6 +127,7 @@ List of contributors: @item Michal Stolarski @item Marcin Walas @item Justyna Walkowska +@item Paweł Wereński @end itemize @c ---------------------------------------------------------------------- @@ -248,7 +249,7 @@ sentence: @samp{Piszemy dobre progrumy.} @example 0000 00 BOS * -0000 07 W Piszemy lem:pisać,V +0000 07 W Piszemy lem:pisać,V 0007 01 S _ 0008 05 W dobre lem:dobry,ADJ 0013 01 S _ @@ -259,7 +260,7 @@ sentence: @samp{Piszemy dobre progrumy.} 0024 00 BOS * 0024 11 W Warszawiacy lem:Warszawiak,N 0035 01 S _ -0036 03 W też +0036 03 W też 0039 01 P . 0040 00 EOS * @@ -267,7 +268,7 @@ sentence: @samp{Piszemy dobre progrumy.} @example 0000 BOS * -0000 W Piszemy lem:pisać,V +0000 W Piszemy lem:pisać,V 0007 S _ 0008 W dobre lem:dobry,ADJ 0013 S _ @@ -280,7 +281,7 @@ Posion information may be provided only for some types of segments: @example 0000 BOS * -W Piszemy lem:pisać,V +W Piszemy lem:pisać,V S _ W dobre lem:dobry,ADJ S _ @@ -291,7 +292,7 @@ S _ 0024 BOS * W Warszawiacy lem:Warszawiak,N S _ -W też +W też P . EOS * @end example @@ -405,7 +406,7 @@ as ISO, ANSI, DOS, UTF-8 (probably: not tested yet). @c @end table -@c [JAK UZYSKAĆ POLSKIE CZCIONKI W DVI???] +@c [JAK UZYSKAĆ POLSKIE CZCIONKI W DVI???] 
@macro parhelp @item @b{@minus{}@minus{}help}, @b{@minus{}h} @@ -718,7 +719,7 @@ Sinks: programs which read UTT data and produce output in another format @c ---------------------------------------- @multitable {aaaaaaaaaaaaaaaaaaaaaaaaa} {aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa} -@item @strong{Authors:} @tab Tomasz Obrębski +@item @strong{Authors:} @tab Tomasz Obrębski @item @strong{Component category:} @tab source @end multitable @@ -820,7 +821,7 @@ Output: @c @node sen - sentencizer @c @chapter sen - sentencizer -@c Authors: Tomasz Obrębski +@c Authors: Tomasz Obrębski @c --------------------------------------------------------------------- @c LEM @@ -831,7 +832,7 @@ Output: @section lem - morphological analyzer @multitable {aaaaaaaaaaaaaaaaaaaaaaaaa} {aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa} -@item @strong{Authors:} @tab Tomasz Obrębski, Michał Stolarski +@item @strong{Authors:} @tab Tomasz Obrębski, Michał Stolarski @item @strong{Component category:} @tab filter @end multitable @@ -932,7 +933,7 @@ Input: Output (default): @example -0000 07 W Piszemy lem:pisać,V/AiVpMdTrfNpP1 +0000 07 W Piszemy lem:pisać,V/AiVpMdTrfNpP1 0007 01 B _ 0008 05 W dobre lem:dobry,ADJ/DpNpCnavGaifn 0008 05 W dobre lem:dobry,ADJ/DpNsCnavGn @@ -947,7 +948,7 @@ Output (default): Output (@option{--one-line} option): @example -0000 07 W Piszemy lem:pisać,V/AiVpMdTrfNpP1 +0000 07 W Piszemy lem:pisać,V/AiVpMdTrfNpP1 0007 01 S _ 0008 05 W dobre lem:dobry,ADJ/DpNpCnavGaifn lem:dobry,ADJ/DpNsCnavGn 0013 01 S _ @@ -959,7 +960,7 @@ Output (@option{--one-line} option): Output (@option{--one-field} option): @example -0000 07 W Piszemy lem:pisać,V/AiVpMdTrfNpP1 +0000 07 W Piszemy lem:pisać,V/AiVpMdTrfNpP1 0007 01 S _ 0008 05 W dobre lem:dobry,ADJ/DpNpCnavGaifn,ADJ/DpNsCnavGn 0013 01 S _ @@ -993,7 +994,7 @@ Dictionary entries have the following structure: meaning: replace prefix of length @code{} with string @code{}, replace suffix of length @code{} with string @code{}. 
For example @code{3t} transforms @samp{kocie} into -@samp{kot}, @code{3-4ały} transforms @samp{najbielsi} into @samp{biały} +@samp{kot}, @code{3-4ały} transforms @samp{najbielsi} into @samp{biały} Each dictionary entry must be written in one line and must not contain blank characters. @@ -1004,8 +1005,8 @@ kota;1,N/GaNsCg;1,N/GaNsCa kotu;1,N/GaNsCd kotem;2,N/GaNsCi kocie;3t,N/GaNsCl;3t,N/GaNsCv -najbielsi;3-4ały,ADJ/DsNpCnGp -najbielsze;3-5ały,ADJ/DsNpCnGaifn +najbielsi;3-4ały,ADJ/DsNpCnGp +najbielsze;3-5ały,ADJ/DsNpCnGaifn najlepsi;dobry,ADJ/DsNpCnGp najlepsze;dobry,ADJ/DsNpCnGaifn @end example @@ -1064,7 +1065,7 @@ located by default in: @multitable {aaaaaaaaaaaaaaaaaaaaaaaaa} {aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa} -@item @strong{Authors:} @tab Michał Stolarski, Tomasz Obrębski +@item @strong{Authors:} @tab Michał Stolarski, Tomasz Obrębski @item @strong{Component category:} @tab filter @end multitable @@ -1155,7 +1156,7 @@ string @var{add1}, replace suffix of length @var{cat2} with string @var{add2}. -Example: @code{3-4ały} transforms @i{najbielsi} into @i{biały} +Example: @code{3-4ały} transforms @i{najbielsi} into @i{biały} @var{description} contains the part of speech and morphosyntactic information (@xref{PMDBF dictionary}.). @@ -1164,8 +1165,8 @@ Example: @code{3-4a likelihood of the guess. @example -*łkę;1a,N/GfNsCa -naj*elszy;3-4ały,ADJ/...:... +*łkę;1a,N/GfNsCa +naj*elszy;3-4ały,ADJ/...:... 
@end example @@ -1178,7 +1179,7 @@ naj*elszy;3-4a @section cor - spelling corrector @multitable {aaaaaaaaaaaaaaaaaaaaaaaaa} {aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa} -@item @strong{Authors:} @tab Tomasz Obrębski, Michał Stolarski +@item @strong{Authors:} @tab Tomasz Obrębski, Michał Stolarski @item @strong{Component category:} @tab filter @end multitable @@ -1247,7 +1248,7 @@ odludek @multitable {aaaaaaaaaaaaaaaaaaaaaaaaa} {aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa} -@item @strong{Authors:} @tab Tomasz Obrębski +@item @strong{Authors:} @tab Tomasz Obrębski @item @strong{Component category:} @tab filter @end multitable @@ -1267,7 +1268,7 @@ odludek command: sen input: -0000 05 W Cześć +0000 05 W Cześć 0005 01 P ! 0006 01 S _ 0007 02 W To @@ -1278,7 +1279,7 @@ input: output: 0000 00 BOS * -0000 05 W Cześć +0000 05 W Cześć 0005 01 P ! 0006 00 EOS * 0006 00 BOS * @@ -1299,7 +1300,7 @@ output: @c @node gph - graphizer @c @chapter gph - graphizer -@c Authors: Tomasz Obrębski +@c Authors: Tomasz Obrębski @@ -1312,7 +1313,7 @@ output: @section ser - pattern search tool @multitable {aaaaaaaaaaaaaaaaaaaaaaaaa} {aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa} -@item @strong{Authors:} @tab Tomasz Obrębski +@item @strong{Authors:} @tab Tomasz Obrębski @item @strong{Component category:} @tab filter @end multitable @@ -1540,7 +1541,7 @@ installed in the system: @section grp - pattern search tool @multitable {aaaaaaaaaaaaaaaaaaaaaaaaa} {aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa} -@item @strong{Authors:} @tab Tomasz Obrębski +@item @strong{Authors:} @tab Tomasz Obrębski @item @strong{Component category:} @tab filter @end multitable @@ -1634,7 +1635,7 @@ lzop -cd corpus.grp.lzo | grp -a gP -e @var{EXPR} | ser -e @var{EXPR} @node kot @section kot - untokenizer -Authors: Tomasz Obrębski +Authors: Tomasz Obrębski @command{kot} is the opposite of @command{tok}. It changes UTT-formatted text into plain text. 
@@ -1849,7 +1850,7 @@ termination of the program. @section fla - the UTT file flattener @multitable {aaaaaaaaaaaaaaaaaaaaaaaaa} {aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa} -@item @strong{Authors:} @tab Tomasz Obrębski +@item @strong{Authors:} @tab Tomasz Obrębski @item @strong{Component category:} @tab filter @end multitable @c @@ -1888,7 +1889,7 @@ default, segments containing a field @code{BOS} are seeked. @section unfla - the UTT file unflattener @multitable {aaaaaaaaaaaaaaaaaaaaaaaaa} {aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa} -@item @strong{Authors:} @tab Tomasz Obrębski +@item @strong{Authors:} @tab Tomasz Obrębski @item @strong{Component category:} @tab filter @end multitable diff --git a/app/src/dgp/tre.rb b/app/src/dgp/tre.rb index 85a03d1..39ee7f7 100755 --- a/app/src/dgp/tre.rb +++ b/app/src/dgp/tre.rb @@ -1,6 +1,10 @@ #!/usr/bin/ruby -I /usr/local/lib/utt -I $HOME/.local/lib/utt +$: << "#{ENV['HOME']}/.local/lib/utt" +$: << "/usr/local/lib/utt" + require 'getoptlong' +require 'seg.rb' opts = GetoptLong.new( [ '--help', '-h', GetoptLong::NO_ARGUMENT ], @@ -60,9 +64,6 @@ if $INFO=='DEFAULT' end end -#require File.expand_path(File.dirname(__FILE__) + "../lib/utt/seg.rb") -require 'seg.rb' - $dgpsep=';' def tre(input) diff --git a/app/src/gue/cmdline_guess.ggo b/app/src/gue/cmdline_guess.ggo index 1b68b02..a568f44 100644 --- a/app/src/gue/cmdline_guess.ggo +++ b/app/src/gue/cmdline_guess.ggo @@ -7,6 +7,6 @@ option "cut-off" - "Do not display answers with less weight than cut-off" int d option "dictionary-home" - "dh" string typestr="FILENAME" no hidden option "dictionary" d "File with dictionary information" string typestr="filename" default="gue.bin" no option "per-info" v "Display performance information" flag off -option "weights" w "Print weights" flag off hidden +option "weights" w "Print weights" flag off option "no-uppercase" - "Do not process form containing uppercase letters" flag off diff --git a/app/src/gue/common_guess.cc 
b/app/src/gue/common_guess.cc index d49debf..08a178b 100644 --- a/app/src/gue/common_guess.cc +++ b/app/src/gue/common_guess.cc @@ -7,7 +7,7 @@ double delta=0.1; int cut_off=100; char dictionary[255]; bool per_info=false; -bool weights=true; +bool weights=false; void process_guess_options(gengetopt_args_info* args) { @@ -55,6 +55,6 @@ void process_guess_options(gengetopt_args_info* args) per_info=args->per_info_flag; if(args->weights_given) - weights=false; + weights=true; }