trochę zmian

M    app/doc/utt.texinfo
M    app/src/dgp/sgraph.hh
M    app/src/dgp/const.hh
M    app/src/dgp/grammar.hh
M    app/src/dgp/thesymbols.hh
M    app/src/dgp/dgc
M    app/src/dgp/sgraph.cc
M    app/src/dgp/grammar.cc


git-svn-id: svn://atos.wmid.amu.edu.pl/utt@63 e293616e-ec6a-49c2-aa92-f4a8b91c5d16
This commit is contained in:
obrebski 2008-12-11 21:20:14 +00:00
parent 91ed676c45
commit 9ace5d204d
8 changed files with 212 additions and 53 deletions

View File

@ -1,5 +1,7 @@
\input texinfo @c -*-texinfo-*- \input texinfo @c -*-texinfo-*-
@documentencoding ISO-8859-2 @c @documentencoding ISO-8859-2
@documentencoding UTF-8
@c @documentlanguage pl @c @documentlanguage pl
@c %**start of header @c %**start of header
@ -10,7 +12,7 @@
@copying @copying
This manual is for UAM Text Tools (version 0.90, October, 2008) This manual is for UAM Text Tools (version 0.90, October, 2008)
Copyright @copyright{} 2005, 2007 Tomasz Obrêbski, Micha³ Stolarski, Justyna Walkowska, Pawe³ Konieczka. Copyright @copyright{} 2005, 2007 Tomasz Obrębski, Michał Stolarski, Justyna Walkowska, Paweł Konieczka.
Permission is granted to copy, distribute and/or modify this document Permission is granted to copy, distribute and/or modify this document
under the terms of the GNU Free Documentation License, Version 1.2 or under the terms of the GNU Free Documentation License, Version 1.2 or
@ -30,7 +32,7 @@ Documentation License,,GNU Free Documentation License.
@title UAM Text Tools 0.90 - User Manual @title UAM Text Tools 0.90 - User Manual
@subtitle edition 0.01, @today @subtitle edition 0.01, @today
@subtitle status: prescript @subtitle status: prescript
@author by Justyna Walkowska, Tomasz Obr@,{}ebski and Micha@l{} Stolarski @author by Justyna Walkowska, Tomasz Obrębski and Michał Stolarski
@page @page
@vskip 0pt plus 1filll @vskip 0pt plus 1filll
@insertcopying @insertcopying
@ -41,9 +43,14 @@ Documentation License,,GNU Free Documentation License.
@c @paragraphindent none @c @paragraphindent none
@iftex @iftex
@tex
% \usepackage[T1]{fontenc}
% \usepackage[utf8]{inputenc}
% \usepackage{times}
@end tex
@parskip = 0.5@normalbaselineskip plus 3pt minus 1pt @parskip = 0.5@normalbaselineskip plus 3pt minus 1pt
@end iftex @end iftex
@c @headings off @c @headings off
@c @everyheading LEM(1) @| @| LEM(1) @c @everyheading LEM(1) @| @| LEM(1)
@everyfooting @today @c @| @thispage @| @everyfooting @today @c @| @thispage @|
@ -83,13 +90,13 @@ developed at Adam Mickiewicz University. Its functionality includes:
@itemize @bullet @itemize @bullet
@item @item
tokenization tokenization ółąż
@item @item
dictionary-based morphological analysis dictionary-based morphological analysis
@item @item
heuristic morphological analysis of unknown words heuristic morphological analysis of unknown words
@item @item
spelling correction spelling correction ółąśćż
@item @item
pattern search pattern search
@item @item
@ -124,11 +131,11 @@ List of contributors:
@itemize @itemize
@item Pawel Konieczka @item Pawel Konieczka
@item Tomasz Obrebski @item Tomasz Obrębski
@item Michal Stolarski @item Michał Stolarski
@item Marcin Walas @item Marcin Walas
@item Justyna Walkowska @item Justyna Walkowska
@item Pawel Werenski @item Paweł Wereński
@end itemize @end itemize
@c ---------------------------------------------------------------------- @c ----------------------------------------------------------------------
@ -250,7 +257,7 @@ sentence: @samp{Piszemy dobre progrumy.}
@example @example
0000 00 BOS * 0000 00 BOS *
0000 07 W Piszemy lem:pisaæ,V 0000 07 W Piszemy lem:pisać,V
0007 01 S _ 0007 01 S _
0008 05 W dobre lem:dobry,ADJ 0008 05 W dobre lem:dobry,ADJ
0013 01 S _ 0013 01 S _
@ -261,7 +268,7 @@ sentence: @samp{Piszemy dobre progrumy.}
0024 00 BOS * 0024 00 BOS *
0024 11 W Warszawiacy lem:Warszawiak,N 0024 11 W Warszawiacy lem:Warszawiak,N
0035 01 S _ 0035 01 S _
0036 03 W te¿ 0036 03 W też
0039 01 P . 0039 01 P .
0040 00 EOS * 0040 00 EOS *
@ -269,7 +276,7 @@ sentence: @samp{Piszemy dobre progrumy.}
@example @example
0000 BOS * 0000 BOS *
0000 W Piszemy lem:pisaæ,V 0000 W Piszemy lem:pisać,V
0007 S _ 0007 S _
0008 W dobre lem:dobry,ADJ 0008 W dobre lem:dobry,ADJ
0013 S _ 0013 S _
@ -282,7 +289,7 @@ Position information may be provided only for some types of segments:
@example @example
0000 BOS * 0000 BOS *
W Piszemy lem:pisaæ,V W Piszemy lem:pisać,V
S _ S _
W dobre lem:dobry,ADJ W dobre lem:dobry,ADJ
S _ S _
@ -293,7 +300,7 @@ S _
0024 BOS * 0024 BOS *
W Warszawiacy lem:Warszawiak,N W Warszawiacy lem:Warszawiak,N
S _ S _
W te¿ W też
P . P .
EOS * EOS *
@end example @end example
@ -428,7 +435,7 @@ as ISO, ANSI, DOS.
@c @end table @c @end table
@c [JAK UZYSKAÆ POLSKIE CZCIONKI W DVI???] @c [JAK UZYSKAÆ POLSKIE CZCIONKI W DVI???]
@macro parhelp @macro parhelp
@item @b{@minus{}@minus{}help}, @b{@minus{}h} @item @b{@minus{}@minus{}help}, @b{@minus{}h}
@ -650,7 +657,7 @@ Sinks: programs which read UTT data and produce output in another format
@c ---------------------------------------- @c ----------------------------------------
@multitable {aaaaaaaaaaaaaaaaaaaaaaaaa} {aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa} @multitable {aaaaaaaaaaaaaaaaaaaaaaaaa} {aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa}
@item @strong{Authors:} @tab Tomasz Obrêbski @item @strong{Authors:} @tab Tomasz Obrębski
@item @strong{Component category:} @tab source @item @strong{Component category:} @tab source
@item @strong{Input format:} @tab raw text file @item @strong{Input format:} @tab raw text file
@item @strong{Output format:} @tab UTT regular @item @strong{Output format:} @tab UTT regular
@ -755,7 +762,7 @@ Output:
@c @node sen - sentencizer @c @node sen - sentencizer
@c @chapter sen - sentencizer @c @chapter sen - sentencizer
@c Authors: Tomasz Obrêbski @c Authors: Tomasz Obrębski
@c --------------------------------------------------------------------- @c ---------------------------------------------------------------------
@c LEM @c LEM
@ -766,7 +773,7 @@ Output:
@section lem - morphological analyzer @section lem - morphological analyzer
@multitable {aaaaaaaaaaaaaaaaaaaaaaaaa} {aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa} @multitable {aaaaaaaaaaaaaaaaaaaaaaaaa} {aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa}
@item @strong{Authors:} @tab Tomasz Obrêbski, Micha³ Stolarski @item @strong{Authors:} @tab Tomasz Obrębski, Michał Stolarski
@item @strong{Component category:} @tab filter @item @strong{Component category:} @tab filter
@item @strong{Input format:} @tab UTT regular @item @strong{Input format:} @tab UTT regular
@item @strong{Output format:} @tab UTT regular @item @strong{Output format:} @tab UTT regular
@ -870,7 +877,7 @@ Input:
Output (default): Output (default):
@example @example
0000 07 W Piszemy lem:pisaæ,V/AiVpMdTrfNpP1 0000 07 W Piszemy lem:pisać,V/AiVpMdTrfNpP1
0007 01 B _ 0007 01 B _
0008 05 W dobre lem:dobry,ADJ/DpNpCnavGaifn 0008 05 W dobre lem:dobry,ADJ/DpNpCnavGaifn
0008 05 W dobre lem:dobry,ADJ/DpNsCnavGn 0008 05 W dobre lem:dobry,ADJ/DpNsCnavGn
@ -885,7 +892,7 @@ Output (default):
Output (@option{--one-line} option): Output (@option{--one-line} option):
@example @example
0000 07 W Piszemy lem:pisaæ,V/AiVpMdTrfNpP1 0000 07 W Piszemy lem:pisać,V/AiVpMdTrfNpP1
0007 01 S _ 0007 01 S _
0008 05 W dobre lem:dobry,ADJ/DpNpCnavGaifn lem:dobry,ADJ/DpNsCnavGn 0008 05 W dobre lem:dobry,ADJ/DpNpCnavGaifn lem:dobry,ADJ/DpNsCnavGn
0013 01 S _ 0013 01 S _
@ -897,7 +904,7 @@ Output (@option{--one-line} option):
Output (@option{--one-field} option): Output (@option{--one-field} option):
@example @example
0000 07 W Piszemy lem:pisaæ,V/AiVpMdTrfNpP1 0000 07 W Piszemy lem:pisać,V/AiVpMdTrfNpP1
0007 01 S _ 0007 01 S _
0008 05 W dobre lem:dobry,ADJ/DpNpCnavGaifn,ADJ/DpNsCnavGn 0008 05 W dobre lem:dobry,ADJ/DpNpCnavGaifn,ADJ/DpNsCnavGn
0013 01 S _ 0013 01 S _
@ -931,7 +938,7 @@ Dictionary entries have the following structure:
meaning: replace prefix of length @code{<cut1>} with meaning: replace prefix of length @code{<cut1>} with
string @code{<add1>}, replace suffix of length @code{<cut2>} with string string @code{<add1>}, replace suffix of length @code{<cut2>} with string
@code{<add2>}. For example @code{3t} transforms @samp{kocie} into @code{<add2>}. For example @code{3t} transforms @samp{kocie} into
@samp{kot}, @code{3-4a³y} transforms @samp{najbielsi} into @samp{bia³y} @samp{kot}, @code{3-4ały} transforms @samp{najbielsi} into @samp{biały}
Each dictionary entry must be written in one line and must not contain blank characters. Each dictionary entry must be written in one line and must not contain blank characters.
@ -942,8 +949,8 @@ kota;1,N/GaNsCg;1,N/GaNsCa
kotu;1,N/GaNsCd kotu;1,N/GaNsCd
kotem;2,N/GaNsCi kotem;2,N/GaNsCi
kocie;3t,N/GaNsCl;3t,N/GaNsCv kocie;3t,N/GaNsCl;3t,N/GaNsCv
najbielsi;3-4a³y,ADJ/DsNpCnGp najbielsi;3-4ały,ADJ/DsNpCnGp
najbielsze;3-5a³y,ADJ/DsNpCnGaifn najbielsze;3-5ały,ADJ/DsNpCnGaifn
najlepsi;dobry,ADJ/DsNpCnGp najlepsi;dobry,ADJ/DsNpCnGp
najlepsze;dobry,ADJ/DsNpCnGaifn najlepsze;dobry,ADJ/DsNpCnGaifn
@end example @end example
@ -1008,7 +1015,7 @@ lem -c -d <dict1> | lem -S lem -d <dict2>
@multitable {aaaaaaaaaaaaaaaaaaaaaaaaa} {aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa} @multitable {aaaaaaaaaaaaaaaaaaaaaaaaa} {aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa}
@item @strong{Authors:} @tab Micha³ Stolarski, Tomasz Obrêbski @item @strong{Authors:} @tab Michał Stolarski, Tomasz Obrębski
@item @strong{Component category:} @tab filter @item @strong{Component category:} @tab filter
@end multitable @end multitable
@ -1105,7 +1112,7 @@ string @var{add1}, replace suffix of length @var{cat2} with string
@var{add2}. @var{add2}.
Example: @code{3-4a³y} transforms @i{najbielsi} into @i{bia³y} Example: @code{3-4ały} transforms @i{najbielsi} into @i{biały}
@var{description} contains the part of speech and morphosyntactic information (@xref{PMDBF dictionary}.). @var{description} contains the part of speech and morphosyntactic information (@xref{PMDBF dictionary}.).
@ -1113,10 +1120,10 @@ Example: @code{3-4a³y} transforms @i{najbielsi} into @i{bia³y}
@var{weight} is an integer value between 1 and 999 indicating the @var{weight} is an integer value between 1 and 999 indicating the
likelihood of the guess. likelihood of the guess.
@example @c @example
*³kê;1a,N/GfNsCa @c *łkę;1a,N/GfNsCa
naj*elszy;3-4a³y,ADJ/...:... @c naj*elszy;3-4ały,ADJ/...:...
@end example @c @end example
@c --------------------------------------------------------------------- @c ---------------------------------------------------------------------
@ -1128,7 +1135,7 @@ naj*elszy;3-4a³y,ADJ/...:...
@section cor - spelling corrector @section cor - spelling corrector
@multitable {aaaaaaaaaaaaaaaaaaaaaaaaa} {aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa} @multitable {aaaaaaaaaaaaaaaaaaaaaaaaa} {aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa}
@item @strong{Authors:} @tab Tomasz Obrêbski, Micha³ Stolarski @item @strong{Authors:} @tab Tomasz Obrębski, Michał Stolarski
@item @strong{Component category:} @tab filter @item @strong{Component category:} @tab filter
@item @strong{Input format:} @tab UTT regular @item @strong{Input format:} @tab UTT regular
@item @strong{Output format:} @tab UTT regular @item @strong{Output format:} @tab UTT regular
@ -1215,7 +1222,116 @@ compiledic <dictionaryname>.dic
@node kor @node kor
@section kor - configurable spelling corrector @section kor - configurable spelling corrector
[TODO] @multitable {aaaaaaaaaaaaaaaaaaaaaaaaa} {aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa}
@item @strong{Authors:} @tab Paweł Wereński, Tomasz Obrębski, Michał Stolarski
@item @strong{Component category:} @tab filter
@item @strong{Input format:} @tab UTT regular
@item @strong{Output format:} @tab UTT regular
@item @strong{Required annotation:} @tab tok
@end multitable
@menu
* kor description::
* kor command line options::
* kor weights definition file::
* kor dictionaries::
@end menu
@node kor description
@subsection Description
The spelling corrector applies Paweł Wereński's dynamic programming
algorithm to the FSA representation of the set of word forms of the
Polex/PMDBF dictionary. The algorithm is an extension of K. Oflazer's
algorithm used by @command{cor}. In the extended version it is
possible to assign weights to individual edit operations.
Given an incorrect word form, it returns all word forms
present in the dictionary whose edit distance is smaller than the
threshold given as a parameter.
@node kor command line options
@subsection Command line options
@table @code
@parhelp
@parversion
@parinteractive
@c @parfile
@c @paroutput
@c @parfail
@c @parcopy
@parinputfield
@paroutputfield
@pardictionary
@parprocess
@parselect
@parunselect
@paroneline
@paronefield
@item @b{@minus{}@minus{}distance=@var{int}, @minus{}n @var{int}}
Maximum edit distance (default='1').
@item @b{@minus{}@minus{}weights=@var{filename}, @minus{}w @var{filename}}
Edit operations' weights file.
@c @item @b{@minus{}@minus{}replace, @minus{}r}
@c Replace original form with corrected form, place original form in the
@c cor field. This option has no effect in @option{--one-*} modes (default=off)
@end table
@node kor weights definition file
@subsection Weights definition file
Example:
@example
%stdcor 1
%xchg 1
ż rz 0.5
ch h 0.5
u ó 0.5
@end example
The default weight is set to 1 (@code{%stdcor 1}), the weight of the exchange
operation is set to 1 (@code{%xchg 1}), and the three principal orthographic
errors are assigned the weight 0.5.
The edit operation weight declaration, such as
@example
ż rz 0.5
@end example
works in both directions, i.e. ż->rz and rz->ż.
The default weights definition file for @code{kor} is:
@example
$HOME/.local/share/utt/weights.kor
@end example
or, if the above mentioned file is absent:
@example
/usr/local/share/utt/weights.kor
@end example
@node kor dictionaries
@subsection Dictionaries
see @command{cor}
@c --------------------------------------------------------------------- @c ---------------------------------------------------------------------
@c SEN @c SEN
@ -1227,7 +1343,7 @@ compiledic <dictionaryname>.dic
@multitable {aaaaaaaaaaaaaaaaaaaaaaaaa} {aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa} @multitable {aaaaaaaaaaaaaaaaaaaaaaaaa} {aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa}
@item @strong{Authors:} @tab Tomasz Obrêbski @item @strong{Authors:} @tab Tomasz Obrębski
@item @strong{Component category:} @tab filter @item @strong{Component category:} @tab filter
@item @strong{Input format:} @tab UTT regular @item @strong{Input format:} @tab UTT regular
@item @strong{Output format:} @tab UTT regular @item @strong{Output format:} @tab UTT regular
@ -1255,7 +1371,7 @@ compiledic <dictionaryname>.dic
command: sen command: sen
input: input:
0000 05 W Cze¶æ 0000 05 W Cześć
0005 01 P ! 0005 01 P !
0006 01 S _ 0006 01 S _
0007 02 W To 0007 02 W To
@ -1266,7 +1382,7 @@ input:
output: output:
0000 00 BOS * 0000 00 BOS *
0000 05 W Cze¶æ 0000 05 W Cześć
0005 01 P ! 0005 01 P !
0006 00 EOS * 0006 00 EOS *
0006 00 BOS * 0006 00 BOS *
@ -1287,7 +1403,7 @@ output:
@c @node gph - graphizer @c @node gph - graphizer
@c @chapter gph - graphizer @c @chapter gph - graphizer
@c Authors: Tomasz Obrêbski @c Authors: Tomasz Obrębski
@ -1300,7 +1416,7 @@ output:
@section ser - pattern search tool @section ser - pattern search tool
@multitable {aaaaaaaaaaaaaaaaaaaaaaaaa} {aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa} @multitable {aaaaaaaaaaaaaaaaaaaaaaaaa} {aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa}
@item @strong{Authors:} @tab Tomasz Obrêbski @item @strong{Authors:} @tab Tomasz Obrębski
@item @strong{Component category:} @tab filter @item @strong{Component category:} @tab filter
@item @strong{Input format:} @tab UTT regular @item @strong{Input format:} @tab UTT regular
@item @strong{Output format:} @tab UTT regular @item @strong{Output format:} @tab UTT regular
@ -1536,7 +1652,7 @@ installed in the system:
@section grp - pattern search tool @section grp - pattern search tool
@multitable {aaaaaaaaaaaaaaaaaaaaaaaaa} {aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa} @multitable {aaaaaaaaaaaaaaaaaaaaaaaaa} {aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa}
@item @strong{Authors:} @tab Tomasz Obrêbski @item @strong{Authors:} @tab Tomasz Obrębski
@item @strong{Component category:} @tab filter @item @strong{Component category:} @tab filter
@item @strong{Input format:} @tab UTT flattened @item @strong{Input format:} @tab UTT flattened
@item @strong{Output format:} @tab UTT flattened @item @strong{Output format:} @tab UTT flattened
@ -1625,7 +1741,7 @@ lzop -cd corpus.grp.lzo | grp -e @var{EXPR} | unfla | ser -e @var{EXPR}
@section mar @section mar
@multitable {aaaaaaaaaaaaaaaaaaaaaaaaa} {aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa} @multitable {aaaaaaaaaaaaaaaaaaaaaaaaa} {aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa}
@item @strong{Authors:} @tab Marcin Walas, Tomasz Obrêbski @item @strong{Authors:} @tab Marcin Walas, Tomasz Obrębski
@item @strong{Input format:} @tab UTT flattened @item @strong{Input format:} @tab UTT flattened
@item @strong{Output format:} @tab UTT flattened @item @strong{Output format:} @tab UTT flattened
@item @strong{Required annotation:} @tab tok, sen, lem -1 @item @strong{Required annotation:} @tab tok, sen, lem -1
@ -1645,7 +1761,7 @@ lzop -cd corpus.grp.lzo | grp -e @var{EXPR} | unfla | ser -e @var{EXPR}
@section kot - untokenizer @section kot - untokenizer
@multitable {aaaaaaaaaaaaaaaaaaaaaaaaa} {aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa} @multitable {aaaaaaaaaaaaaaaaaaaaaaaaa} {aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa}
@item @strong{Authors:} @tab Tomasz Obrêbski @item @strong{Authors:} @tab Tomasz Obrębski
@item @strong{Component category:} @tab filter @item @strong{Component category:} @tab filter
@item @strong{Input format:} @tab UTT regular @item @strong{Input format:} @tab UTT regular
@item @strong{Output format:} @tab text @item @strong{Output format:} @tab text
@ -1838,7 +1954,7 @@ sequence:
@section compiledic - the dictionary compiler @section compiledic - the dictionary compiler
@multitable {aaaaaaaaaaaaaaaaaaaaaaaaa} {aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa} @multitable {aaaaaaaaaaaaaaaaaaaaaaaaa} {aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa}
@item @strong{Authors:} @tab Michal Stolarski, Tomasz Obrebski @item @strong{Authors:} @tab Michał Stolarski, Tomasz Obrębski
@item @strong{Component category:} @tab additional tool @item @strong{Component category:} @tab additional tool
@end multitable @end multitable
@c @c
@ -1883,7 +1999,7 @@ termination of the program.
@section fla - the UTT file flattener @section fla - the UTT file flattener
@multitable {aaaaaaaaaaaaaaaaaaaaaaaaa} {aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa} @multitable {aaaaaaaaaaaaaaaaaaaaaaaaa} {aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa}
@item @strong{Authors:} @tab Tomasz Obrêbski @item @strong{Authors:} @tab Tomasz Obrębski
@item @strong{Input format:} @tab UTT regular @item @strong{Input format:} @tab UTT regular
@item @strong{Output format:} @tab UTT flattened @item @strong{Output format:} @tab UTT flattened
@item @strong{Required annotation:} @tab sen @item @strong{Required annotation:} @tab sen
@ -1931,7 +2047,7 @@ default, segments containing a field @code{BOS} are seeked.
@section unfla - the UTT file unflattener @section unfla - the UTT file unflattener
@multitable {aaaaaaaaaaaaaaaaaaaaaaaaa} {aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa} @multitable {aaaaaaaaaaaaaaaaaaaaaaaaa} {aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa}
@item @strong{Authors:} @tab Tomasz Obrêbski @item @strong{Authors:} @tab Tomasz Obrębski
@item @strong{Input format:} @tab UTT flattened @item @strong{Input format:} @tab UTT flattened
@item @strong{Output format:} @tab UTT regular @item @strong{Output format:} @tab UTT regular
@item @strong{Required annotation:} @tab - @item @strong{Required annotation:} @tab -
@ -2235,7 +2351,6 @@ descr = pos ( / ( attr val + ) + ) ?
@item @item
@tab @code{v} @tab vocative. @tab @code{v} @tab vocative.
@item @item
@item
@code{G} @tab @tab Gender @code{G} @tab @tab Gender
@item @item
@tab @code{p} @tab masculine-personal, @tab @code{p} @tab masculine-personal,
@ -2728,7 +2843,7 @@ Report bugs to <obrebski@@amu.edu.pl>.
@c @node Copyright @c @node Copyright
@c @chapter Copyright @c @chapter Copyright
@c @c
@c Copyright 2004 by Tomasz Obrebski @c Copyright 2004 by Tomasz Obrębski
@c This software is free for research and educational use. @c This software is free for research and educational use.
@c --------------------------------------------------------------------- @c ---------------------------------------------------------------------

View File

@ -2,6 +2,7 @@
#define CONST_HH #define CONST_HH
#define MAXTYPES 32 #define MAXTYPES 32
#define MAXFLAGS 64
#define MAXNODES 1024 #define MAXNODES 1024
#define MAXCONSTRS 32 #define MAXCONSTRS 32
#define MAXLINE 256 #define MAXLINE 256

View File

@ -97,6 +97,7 @@ my $nleft=0;
my $nright=0; my $nright=0;
my $nreq=0; my $nreq=0;
my $nlink=0; my $nlink=0;
my $nflag=0;
my %cats; my %cats;
my %roles; my %roles;
@ -114,7 +115,6 @@ else {
} }
loadcats($catfile) if $catfile; loadcats($catfile) if $catfile;
extractcats($dicfile) if $dicfile; extractcats($dicfile) if $dicfile;
@ -192,6 +192,11 @@ while(<INPUT>)
} }
} }
} }
elsif(/^FLAG\s+\S+$/)
{
++$nflag;
print OUTPUT "$_\n"
}
elsif(/^$/) { elsif(/^$/) {
# pomijamy puste linie oraz komentarze # pomijamy puste linie oraz komentarze
} }
@ -248,6 +253,7 @@ printf STDERR "%6d REQ statements\n", $nreq;
printf STDERR "%6d LEFT statements\n", $nleft; printf STDERR "%6d LEFT statements\n", $nleft;
printf STDERR "%6d RIGHT statements\n", $nright; printf STDERR "%6d RIGHT statements\n", $nright;
printf STDERR "%6d LINK statements\n", $nlink; printf STDERR "%6d LINK statements\n", $nlink;
printf STDERR "%6d FLAG statements\n", $nflag;
sub extractcats sub extractcats

View File

@ -46,6 +46,16 @@ void Grammar::add_type(const char* s)
} }
} }
// Adds the flag named `s` through Flag::add and, when the number of known
// flags exceeds the capacity tracked in `flags_sz`, enlarges `pass`
// in steps of 16 entries.
void Grammar::add_flag(const char* s)
{
  Flag::add(s);

  // Still within the reserved capacity: nothing to grow.
  if(Flag::count()<=flags_sz) return;

  flags_sz += 16;
  pass.resize(flags_sz);
}
void Grammar::set_lt(Role s, Role t) void Grammar::set_lt(Role s, Role t)
{ {
@ -123,6 +133,11 @@ bool Grammar::read(FILE* f)
if(chk_cat(arg1,lineno) + chk_cat(arg2,lineno) + chk_type(arg3,lineno) == 3) if(chk_cat(arg1,lineno) + chk_cat(arg2,lineno) + chk_type(arg3,lineno) == 3)
set_connect(arg1,arg2,arg3); set_connect(arg1,arg2,arg3);
} }
// FLAG DECLARATION
else if(strcmp(key,"FLAG")==0 && fields>=2)
{
add_flag(arg1);
}
else fprintf(stderr,"Invalid line %d. Ignored.\n", lineno); else fprintf(stderr,"Invalid line %d. Ignored.\n", lineno);
} }
@ -159,5 +174,8 @@ void Grammar::write(FILE* f)
for(Role t=1; t<Role::count(); ++t) for(Role t=1; t<Role::count(); ++t)
if(connect[c][d].count(t)) if(connect[c][d].count(t))
fprintf(f,"LINK\t%s\t%s\t%s\n",c.str(),d.str(),t.str()); fprintf(f,"LINK\t%s\t%s\t%s\n",c.str(),d.str(),t.str());
for(Flag i=1; i<Flag::count(); ++i)
fprintf(f,"FLAG\t%s\n",i.str());
} }

View File

@ -10,6 +10,15 @@
#include "thesymbols.hh" #include "thesymbols.hh"
#include "sgraph.hh" #include "sgraph.hh"
// Bundles a Role with two FlagSet values for one grammar link.
// NOTE(review): hflags/dflags presumably hold the flags of the head and of
// the dependent side of the link -- confirm against the code that uses Link.
// All members are private (class default) and no accessors are declared yet.
class Link
{
  Role role;
  FlagSet hflags;   // NOTE(review): presumably head-side flags -- confirm
  FlagSet dflags;   // NOTE(review): presumably dependent-side flags -- confirm
};  // the terminating ';' is required; without it the following
    // `class Grammar` declaration does not compile
class Grammar class Grammar
{ {
@ -17,10 +26,11 @@ class Grammar
// enum CONSTR { SGL, OBL, LEFT, RIGHT, INIT, NONINIT, FIN, NONFIN }; // enum CONSTR { SGL, OBL, LEFT, RIGHT, INIT, NONINIT, FIN, NONFIN };
Grammar() : types_sz(0), cats_sz(0) {} ; Grammar() : types_sz(0), cats_sz(0), flags_sz(0) {} ;
int types_sz; int types_sz;
int cats_sz; int cats_sz;
int flags_sz;
vector< vector< Roles > > connect; vector< vector< Roles > > connect;
RoleSet sgl; RoleSet sgl;
@ -30,11 +40,17 @@ class Grammar
vector< RoleSet > lt; vector< RoleSet > lt;
vector< RoleSet > gt; vector< RoleSet > gt;
// vector< vector< vector<
vector< FlagSet > set;
vector< FlagSet > pass;
bool read(FILE* f); bool read(FILE* f);
void write(FILE* f); void write(FILE* f);
void add_category(const char* s); void add_category(const char* s);
void add_type(const char* s); void add_type(const char* s);
void add_flag(const char* s);
void set_sgl(Role r) { sgl.set(r); } void set_sgl(Role r) { sgl.set(r); }
void set_obl(Cat c, Role r) { obl[c].set(r); } void set_obl(Cat c, Role r) { obl[c].set(r); }

View File

@ -127,10 +127,11 @@ int SGraph::sprint_node(char* buf, int nodeind, unsigned int info)
if (info&CONSTRAINTS)// buf+=sprint_node_constraints(buf,n); if (info&CONSTRAINTS)// buf+=sprint_node_constraints(buf,n);
{ {
buf+=sprintf(buf,";"); buf+=sprintf(buf,";");
int cont=0;
for(Role i=1; i<=Role::count(); ++i) for(Role i=1; i<=Role::count(); ++i)
if(node.prop.forbidden[i]) buf+=sprintf(buf,"!%s",i.str()); if(node.prop.forbidden[i]) buf+=sprintf(buf,"%s!%s",(cont++)?",":"",i.str());
for(Role i=1; i<=Role::count(); ++i) for(Role i=1; i<=Role::count(); ++i)
if(node.prop.required[i]) buf+=sprintf(buf,"&%s",i.str()); if(node.prop.required[i]) buf+=sprintf(buf,"%s&%s",(cont++)?",":"",i.str());
} }
// buf+=sprintf(buf,"\n"); // buf+=sprintf(buf,"\n");
@ -139,7 +140,7 @@ int SGraph::sprint_node(char* buf, int nodeind, unsigned int info)
} }
int SGraph::sprint_node_debug(char* buf, char* pref, int n) int SGraph::sprint_node_debug(char* buf, const char* pref, int n)
{ {
char *buf0 = buf; char *buf0 = buf;
buf+=sprintf(buf,"#%s",pref); buf+=sprintf(buf,"#%s",pref);
@ -148,7 +149,7 @@ int SGraph::sprint_node_debug(char* buf, char* pref, int n)
return buf-buf0; return buf-buf0;
} }
int SGraph::print_node_debug(FILE* f, char* pref, int n) int SGraph::print_node_debug(FILE* f, const char* pref, int n)
{ {
char buf[1000]; char buf[1000];
sprint_node_debug(buf,pref,n); sprint_node_debug(buf,pref,n);

View File

@ -87,8 +87,8 @@ public:
int sprint_node(char* buf, int n, unsigned int info); int sprint_node(char* buf, int n, unsigned int info);
int print_node(FILE* f, int n, unsigned int info); int print_node(FILE* f, int n, unsigned int info);
int sprint_node_debug(char* buf, char* pref, int n); int sprint_node_debug(char* buf, const char* pref, int n);
int print_node_debug(FILE* f, char* pref, int n); int print_node_debug(FILE* f, const char* pref, int n);
void print_arc(FILE* f, int left, int right, Role role, int dir); // 0 - left, 1 - right void print_arc(FILE* f, int left, int right, Role role, int dir); // 0 - left, 1 - right

View File

@ -22,6 +22,8 @@ typedef list<Constr> ConstrList;
typedef list<Constr>::iterator ConstrListIter; typedef list<Constr>::iterator ConstrListIter;
typedef Symbol<4> Rel; typedef Symbol<4> Rel;
typedef Symbol<5> Flag; typedef Symbol<5> Flag;
typedef bitset<MAXFLAGS> FlagSet;
#endif #endif