trochę zmian
M app/doc/utt.texinfo M app/src/dgp/sgraph.hh M app/src/dgp/const.hh M app/src/dgp/grammar.hh M app/src/dgp/thesymbols.hh M app/src/dgp/dgc M app/src/dgp/sgraph.cc M app/src/dgp/grammar.cc git-svn-id: svn://atos.wmid.amu.edu.pl/utt@63 e293616e-ec6a-49c2-aa92-f4a8b91c5d16
This commit is contained in:
parent
91ed676c45
commit
9ace5d204d
@ -1,5 +1,7 @@
|
|||||||
|
|
||||||
\input texinfo @c -*-texinfo-*-
|
\input texinfo @c -*-texinfo-*-
|
||||||
@documentencoding ISO-8859-2
|
@c @documentencoding ISO-8859-2
|
||||||
|
@documentencoding UTF-8
|
||||||
@c @documentlanguage pl
|
@c @documentlanguage pl
|
||||||
|
|
||||||
@c %**start of header
|
@c %**start of header
|
||||||
@ -10,7 +12,7 @@
|
|||||||
@copying
|
@copying
|
||||||
This manual is for UAM Text Tools (version 0.90, October, 2008)
|
This manual is for UAM Text Tools (version 0.90, October, 2008)
|
||||||
|
|
||||||
Copyright @copyright{} 2005, 2007 Tomasz Obrêbski, Micha³ Stolarski, Justyna Walkowska, Pawe³ Konieczka.
|
Copyright @copyright{} 2005, 2007 Tomasz Obrębski, Michał Stolarski, Justyna Walkowska, Paweł Konieczka.
|
||||||
|
|
||||||
Permission is granted to copy, distribute and/or modify this document
|
Permission is granted to copy, distribute and/or modify this document
|
||||||
under the terms of the GNU Free Documentation License, Version 1.2 or
|
under the terms of the GNU Free Documentation License, Version 1.2 or
|
||||||
@ -30,7 +32,7 @@ Documentation License,,GNU Free Documentation License.
|
|||||||
@title UAM Text Tools 0.90 - User Manual
|
@title UAM Text Tools 0.90 - User Manual
|
||||||
@subtitle edition 0.01, @today
|
@subtitle edition 0.01, @today
|
||||||
@subtitle status: prescript
|
@subtitle status: prescript
|
||||||
@author by Justyna Walkowska, Tomasz Obr@,{}ebski and Micha@l{} Stolarski
|
@author by Justyna Walkowska, Tomasz Obrębski and Michał Stolarski
|
||||||
@page
|
@page
|
||||||
@vskip 0pt plus 1filll
|
@vskip 0pt plus 1filll
|
||||||
@insertcopying
|
@insertcopying
|
||||||
@ -41,9 +43,14 @@ Documentation License,,GNU Free Documentation License.
|
|||||||
@c @paragraphindent none
|
@c @paragraphindent none
|
||||||
|
|
||||||
@iftex
|
@iftex
|
||||||
|
@tex
|
||||||
|
% \usepackage[T1]{fontenc}
|
||||||
|
% \usepackage[utf8]{inputenc}
|
||||||
|
% \usepackage{times}
|
||||||
|
@end tex
|
||||||
|
|
||||||
@parskip = 0.5@normalbaselineskip plus 3pt minus 1pt
|
@parskip = 0.5@normalbaselineskip plus 3pt minus 1pt
|
||||||
@end iftex
|
@end iftex
|
||||||
|
|
||||||
@c @headings off
|
@c @headings off
|
||||||
@c @everyheading LEM(1) @| @| LEM(1)
|
@c @everyheading LEM(1) @| @| LEM(1)
|
||||||
@everyfooting @today @c @| @thispage @|
|
@everyfooting @today @c @| @thispage @|
|
||||||
@ -83,13 +90,13 @@ developed at Adam Mickiewicz University. Its functionality includes:
|
|||||||
@itemize @bullet
|
@itemize @bullet
|
||||||
|
|
||||||
@item
|
@item
|
||||||
tokenization
|
tokenization
|
||||||
@item
|
@item
|
||||||
dictionary-based morphological analysis
|
dictionary-based morphological analysis
|
||||||
@item
|
@item
|
||||||
heuristic morphological analysis of unknown words
|
heuristic morphological analysis of unknown words
|
||||||
@item
|
@item
|
||||||
spelling correction
|
spelling correction
|
||||||
@item
|
@item
|
||||||
pattern search
|
pattern search
|
||||||
@item
|
@item
|
||||||
@ -124,11 +131,11 @@ List of contributors:
|
|||||||
|
|
||||||
@itemize
|
@itemize
|
||||||
@item Pawel Konieczka
|
@item Paweł Konieczka
|
||||||
@item Tomasz Obrebski
|
@item Tomasz Obrębski
|
||||||
@item Michal Stolarski
|
@item Michał Stolarski
|
||||||
@item Marcin Walas
|
@item Marcin Walas
|
||||||
@item Justyna Walkowska
|
@item Justyna Walkowska
|
||||||
@item Pawel Werenski
|
@item Paweł Wereński
|
||||||
@end itemize
|
@end itemize
|
||||||
|
|
||||||
@c ----------------------------------------------------------------------
|
@c ----------------------------------------------------------------------
|
||||||
@ -250,7 +257,7 @@ sentence: @samp{Piszemy dobre progrumy.}
|
|||||||
|
|
||||||
@example
|
@example
|
||||||
0000 00 BOS *
|
0000 00 BOS *
|
||||||
0000 07 W Piszemy lem:pisaæ,V
|
0000 07 W Piszemy lem:pisać,V
|
||||||
0007 01 S _
|
0007 01 S _
|
||||||
0008 05 W dobre lem:dobry,ADJ
|
0008 05 W dobre lem:dobry,ADJ
|
||||||
0013 01 S _
|
0013 01 S _
|
||||||
@ -261,7 +268,7 @@ sentence: @samp{Piszemy dobre progrumy.}
|
|||||||
0024 00 BOS *
|
0024 00 BOS *
|
||||||
0024 11 W Warszawiacy lem:Warszawiak,N
|
0024 11 W Warszawiacy lem:Warszawiak,N
|
||||||
0035 01 S _
|
0035 01 S _
|
||||||
0036 03 W te¿
|
0036 03 W też
|
||||||
0039 01 P .
|
0039 01 P .
|
||||||
0040 00 EOS *
|
0040 00 EOS *
|
||||||
|
|
||||||
@ -269,7 +276,7 @@ sentence: @samp{Piszemy dobre progrumy.}
|
|||||||
|
|
||||||
@example
|
@example
|
||||||
0000 BOS *
|
0000 BOS *
|
||||||
0000 W Piszemy lem:pisaæ,V
|
0000 W Piszemy lem:pisać,V
|
||||||
0007 S _
|
0007 S _
|
||||||
0008 W dobre lem:dobry,ADJ
|
0008 W dobre lem:dobry,ADJ
|
||||||
0013 S _
|
0013 S _
|
||||||
@ -282,7 +289,7 @@ Posion information may be provided only for some types of segments:
|
|||||||
|
|
||||||
@example
|
@example
|
||||||
0000 BOS *
|
0000 BOS *
|
||||||
W Piszemy lem:pisaæ,V
|
W Piszemy lem:pisać,V
|
||||||
S _
|
S _
|
||||||
W dobre lem:dobry,ADJ
|
W dobre lem:dobry,ADJ
|
||||||
S _
|
S _
|
||||||
@ -293,7 +300,7 @@ S _
|
|||||||
0024 BOS *
|
0024 BOS *
|
||||||
W Warszawiacy lem:Warszawiak,N
|
W Warszawiacy lem:Warszawiak,N
|
||||||
S _
|
S _
|
||||||
W te¿
|
W też
|
||||||
P .
|
P .
|
||||||
EOS *
|
EOS *
|
||||||
@end example
|
@end example
|
||||||
@ -428,7 +435,7 @@ as ISO, ANSI, DOS.
|
|||||||
@c @end table
|
@c @end table
|
||||||
|
|
||||||
|
|
||||||
@c [JAK UZYSKAÆ POLSKIE CZCIONKI W DVI???]
|
@c [JAK UZYSKAĆ POLSKIE CZCIONKI W DVI???]
|
||||||
|
|
||||||
@macro parhelp
|
@macro parhelp
|
||||||
@item @b{@minus{}@minus{}help}, @b{@minus{}h}
|
@item @b{@minus{}@minus{}help}, @b{@minus{}h}
|
||||||
@ -650,7 +657,7 @@ Sinks: programs which read UTT data and produce output in another format
|
|||||||
@c ----------------------------------------
|
@c ----------------------------------------
|
||||||
|
|
||||||
@multitable {aaaaaaaaaaaaaaaaaaaaaaaaa} {aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa}
|
@multitable {aaaaaaaaaaaaaaaaaaaaaaaaa} {aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa}
|
||||||
@item @strong{Authors:} @tab Tomasz Obrêbski
|
@item @strong{Authors:} @tab Tomasz Obrębski
|
||||||
@item @strong{Component category:} @tab source
|
@item @strong{Component category:} @tab source
|
||||||
@item @strong{Input format:} @tab raw text file
|
@item @strong{Input format:} @tab raw text file
|
||||||
@item @strong{Output format:} @tab UTT regular
|
@item @strong{Output format:} @tab UTT regular
|
||||||
@ -755,7 +762,7 @@ Output:
|
|||||||
@c @node sen - sentencizer
|
@c @node sen - sentencizer
|
||||||
@c @chapter sen - sentencizer
|
@c @chapter sen - sentencizer
|
||||||
|
|
||||||
@c Authors: Tomasz Obrêbski
|
@c Authors: Tomasz Obrębski
|
||||||
|
|
||||||
@c ---------------------------------------------------------------------
|
@c ---------------------------------------------------------------------
|
||||||
@c LEM
|
@c LEM
|
||||||
@ -766,7 +773,7 @@ Output:
|
|||||||
@section lem - morphological analyzer
|
@section lem - morphological analyzer
|
||||||
|
|
||||||
@multitable {aaaaaaaaaaaaaaaaaaaaaaaaa} {aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa}
|
@multitable {aaaaaaaaaaaaaaaaaaaaaaaaa} {aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa}
|
||||||
@item @strong{Authors:} @tab Tomasz Obrêbski, Micha³ Stolarski
|
@item @strong{Authors:} @tab Tomasz Obrębski, Michał Stolarski
|
||||||
@item @strong{Component category:} @tab filter
|
@item @strong{Component category:} @tab filter
|
||||||
@item @strong{Input format:} @tab UTT regular
|
@item @strong{Input format:} @tab UTT regular
|
||||||
@item @strong{Output format:} @tab UTT regular
|
@item @strong{Output format:} @tab UTT regular
|
||||||
@ -870,7 +877,7 @@ Input:
|
|||||||
Output (default):
|
Output (default):
|
||||||
|
|
||||||
@example
|
@example
|
||||||
0000 07 W Piszemy lem:pisaæ,V/AiVpMdTrfNpP1
|
0000 07 W Piszemy lem:pisać,V/AiVpMdTrfNpP1
|
||||||
0007 01 B _
|
0007 01 B _
|
||||||
0008 05 W dobre lem:dobry,ADJ/DpNpCnavGaifn
|
0008 05 W dobre lem:dobry,ADJ/DpNpCnavGaifn
|
||||||
0008 05 W dobre lem:dobry,ADJ/DpNsCnavGn
|
0008 05 W dobre lem:dobry,ADJ/DpNsCnavGn
|
||||||
@ -885,7 +892,7 @@ Output (default):
|
|||||||
Output (@option{--one-line} option):
|
Output (@option{--one-line} option):
|
||||||
|
|
||||||
@example
|
@example
|
||||||
0000 07 W Piszemy lem:pisaæ,V/AiVpMdTrfNpP1
|
0000 07 W Piszemy lem:pisać,V/AiVpMdTrfNpP1
|
||||||
0007 01 S _
|
0007 01 S _
|
||||||
0008 05 W dobre lem:dobry,ADJ/DpNpCnavGaifn lem:dobry,ADJ/DpNsCnavGn
|
0008 05 W dobre lem:dobry,ADJ/DpNpCnavGaifn lem:dobry,ADJ/DpNsCnavGn
|
||||||
0013 01 S _
|
0013 01 S _
|
||||||
@ -897,7 +904,7 @@ Output (@option{--one-line} option):
|
|||||||
Output (@option{--one-field} option):
|
Output (@option{--one-field} option):
|
||||||
|
|
||||||
@example
|
@example
|
||||||
0000 07 W Piszemy lem:pisaæ,V/AiVpMdTrfNpP1
|
0000 07 W Piszemy lem:pisać,V/AiVpMdTrfNpP1
|
||||||
0007 01 S _
|
0007 01 S _
|
||||||
0008 05 W dobre lem:dobry,ADJ/DpNpCnavGaifn,ADJ/DpNsCnavGn
|
0008 05 W dobre lem:dobry,ADJ/DpNpCnavGaifn,ADJ/DpNsCnavGn
|
||||||
0013 01 S _
|
0013 01 S _
|
||||||
@ -931,7 +938,7 @@ Dictionary entries have the following structure:
|
|||||||
meaning: replace prefix of length @code{<cut1>} with
|
meaning: replace prefix of length @code{<cut1>} with
|
||||||
string @code{<add1>}, replace suffix of length @code{<cut2>} with string
|
string @code{<add1>}, replace suffix of length @code{<cut2>} with string
|
||||||
@code{<add2>}. For example @code{3t} transforms @samp{kocie} into
|
@code{<add2>}. For example @code{3t} transforms @samp{kocie} into
|
||||||
@samp{kot}, @code{3-4a³y} transforms @samp{najbielsi} into @samp{bia³y}
|
@samp{kot}, @code{3-4ały} transforms @samp{najbielsi} into @samp{biały}
|
||||||
|
|
||||||
Each dictionary entry must be written in one line and must not contain blank characters.
|
Each dictionary entry must be written in one line and must not contain blank characters.
|
||||||
|
|
||||||
@ -942,8 +949,8 @@ kota;1,N/GaNsCg;1,N/GaNsCa
|
|||||||
kotu;1,N/GaNsCd
|
kotu;1,N/GaNsCd
|
||||||
kotem;2,N/GaNsCi
|
kotem;2,N/GaNsCi
|
||||||
kocie;3t,N/GaNsCl;3t,N/GaNsCv
|
kocie;3t,N/GaNsCl;3t,N/GaNsCv
|
||||||
najbielsi;3-4a³y,ADJ/DsNpCnGp
|
najbielsi;3-4ały,ADJ/DsNpCnGp
|
||||||
najbielsze;3-5a³y,ADJ/DsNpCnGaifn
|
najbielsze;3-5ały,ADJ/DsNpCnGaifn
|
||||||
najlepsi;dobry,ADJ/DsNpCnGp
|
najlepsi;dobry,ADJ/DsNpCnGp
|
||||||
najlepsze;dobry,ADJ/DsNpCnGaifn
|
najlepsze;dobry,ADJ/DsNpCnGaifn
|
||||||
@end example
|
@end example
|
||||||
@ -1008,7 +1015,7 @@ lem -c -d <dict1> | lem -S lem -d <dict2>
|
|||||||
|
|
||||||
@multitable {aaaaaaaaaaaaaaaaaaaaaaaaa} {aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa}
|
@multitable {aaaaaaaaaaaaaaaaaaaaaaaaa} {aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa}
|
||||||
|
|
||||||
@item @strong{Authors:} @tab Micha³ Stolarski, Tomasz Obrêbski
|
@item @strong{Authors:} @tab Michał Stolarski, Tomasz Obrębski
|
||||||
@item @strong{Component category:} @tab filter
|
@item @strong{Component category:} @tab filter
|
||||||
|
|
||||||
@end multitable
|
@end multitable
|
||||||
@ -1105,7 +1112,7 @@ string @var{add1}, replace suffix of length @var{cat2} with string
|
|||||||
@var{add2}.
|
@var{add2}.
|
||||||
|
|
||||||
|
|
||||||
Example: @code{3-4a³y} transforms @i{najbielsi} into @i{bia³y}
|
Example: @code{3-4ały} transforms @i{najbielsi} into @i{biały}
|
||||||
|
|
||||||
|
|
||||||
@var{description} contains the part of speech and morphosyntactic information (@xref{PMDBF dictionary}.).
|
@var{description} contains the part of speech and morphosyntactic information (@xref{PMDBF dictionary}.).
|
||||||
@ -1113,10 +1120,10 @@ Example: @code{3-4a³y} transforms @i{najbielsi} into @i{bia³y}
|
|||||||
@var{weight} is an integer value between 1 and 999 indicating the
|
@var{weight} is an integer value between 1 and 999 indicating the
|
||||||
likelihood of the guess.
|
likelihood of the guess.
|
||||||
|
|
||||||
@example
|
@c @example
|
||||||
*³kê;1a,N/GfNsCa
|
@c *łkę;1a,N/GfNsCa
|
||||||
naj*elszy;3-4a³y,ADJ/...:...
|
@c naj*elszy;3-4ały,ADJ/...:...
|
||||||
@end example
|
@c @end example
|
||||||
|
|
||||||
|
|
||||||
@c ---------------------------------------------------------------------
|
@c ---------------------------------------------------------------------
|
||||||
@ -1128,7 +1135,7 @@ naj*elszy;3-4a³y,ADJ/...:...
|
|||||||
@section cor - spelling corrector
|
@section cor - spelling corrector
|
||||||
|
|
||||||
@multitable {aaaaaaaaaaaaaaaaaaaaaaaaa} {aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa}
|
@multitable {aaaaaaaaaaaaaaaaaaaaaaaaa} {aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa}
|
||||||
@item @strong{Authors:} @tab Tomasz Obrêbski, Micha³ Stolarski
|
@item @strong{Authors:} @tab Tomasz Obrębski, Michał Stolarski
|
||||||
@item @strong{Component category:} @tab filter
|
@item @strong{Component category:} @tab filter
|
||||||
@item @strong{Input format:} @tab UTT regular
|
@item @strong{Input format:} @tab UTT regular
|
||||||
@item @strong{Output format:} @tab UTT regular
|
@item @strong{Output format:} @tab UTT regular
|
||||||
@ -1215,7 +1222,116 @@ compiledic <dictionaryname>.dic
|
|||||||
@node kor
|
@node kor
|
||||||
@section kor - configurable spelling corrector
|
@section kor - configurable spelling corrector
|
||||||
|
|
||||||
[TODO]
|
@multitable {aaaaaaaaaaaaaaaaaaaaaaaaa} {aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa}
|
||||||
|
@item @strong{Authors:} @tab Paweł Wereński, Tomasz Obrębski, Michał Stolarski
|
||||||
|
@item @strong{Component category:} @tab filter
|
||||||
|
@item @strong{Input format:} @tab UTT regular
|
||||||
|
@item @strong{Output format:} @tab UTT regular
|
||||||
|
@item @strong{Required annotation:} @tab tok
|
||||||
|
@end multitable
|
||||||
|
|
||||||
|
@menu
|
||||||
|
* kor description::
|
||||||
|
* kor command line options::
|
||||||
|
* kor weights definition file::
|
||||||
|
* kor dictionaries::
|
||||||
|
@end menu
|
||||||
|
|
||||||
|
|
||||||
|
@node kor description
|
||||||
|
@subsection Description
|
||||||
|
|
||||||
|
The spelling corrector applies Paweł Wereński's dynamic programming
|
||||||
|
algorithm to the FSA representation of the set of word forms of the
|
||||||
|
Polex/PMDBF dictionary. The algorithm is an extension of K. Oflazer's
|
||||||
|
algorithm used by @command{cor}. In the extended version it is
|
||||||
|
possible to assign weights to individual edit operations.
|
||||||
|
|
||||||
|
Given an incorrect word form it returns all word forms
|
||||||
|
present in the dictionary whose edit distance is smaller than the
|
||||||
|
threshold given as the parameter.
|
||||||
|
|
||||||
|
|
||||||
|
@node kor command line options
|
||||||
|
@subsection Command line options
|
||||||
|
|
||||||
|
@table @code
|
||||||
|
|
||||||
|
@parhelp
|
||||||
|
@parversion
|
||||||
|
@parinteractive
|
||||||
|
@c @parfile
|
||||||
|
@c @paroutput
|
||||||
|
@c @parfail
|
||||||
|
@c @parcopy
|
||||||
|
@parinputfield
|
||||||
|
@paroutputfield
|
||||||
|
@pardictionary
|
||||||
|
@parprocess
|
||||||
|
@parselect
|
||||||
|
@parunselect
|
||||||
|
@paroneline
|
||||||
|
@paronefield
|
||||||
|
|
||||||
|
@item @b{@minus{}@minus{}distance=@var{int}, @minus{}n @var{int}}
|
||||||
|
Maximum edit distance (default='1').
|
||||||
|
|
||||||
|
@item @b{@minus{}@minus{}weights=@var{filename}, @minus{}w @var{filename}}
|
||||||
|
Edit operations' weights file.
|
||||||
|
|
||||||
|
@c @item @b{@minus{}@minus{}replace, @minus{}r}
|
||||||
|
@c Replace original form with corrected form, place original form in the
|
||||||
|
@c cor field. This option has no effect in @option{--one-*} modes (default=off)
|
||||||
|
|
||||||
|
|
||||||
|
@end table
|
||||||
|
|
||||||
|
|
||||||
|
@node kor weights definition file
|
||||||
|
@subsection Weights definition file
|
||||||
|
|
||||||
|
Example:
|
||||||
|
|
||||||
|
@example
|
||||||
|
|
||||||
|
%stdcor 1
|
||||||
|
%xchg 1
|
||||||
|
ż rz 0.5
|
||||||
|
ch h 0.5
|
||||||
|
u ó 0.5
|
||||||
|
|
||||||
|
@end example
|
||||||
|
|
||||||
|
|
||||||
|
Default weight is set to 1 (@code{%stdcor 1}), the weight of exchange
|
||||||
|
operation is set to 1 (@code{%xchg 1}), the three principal orthographic
|
||||||
|
errors are assigned the weight 0.5.
|
||||||
|
|
||||||
|
The edit operation weight declaration, such as
|
||||||
|
|
||||||
|
@example
|
||||||
|
ż rz 0.5
|
||||||
|
@end example
|
||||||
|
|
||||||
|
works both ways, i.e. ż->rz and rz->ż.
|
||||||
|
|
||||||
|
The default weights definition file for @code{kor} is:
|
||||||
|
|
||||||
|
@example
|
||||||
|
$HOME/.local/share/utt/weights.kor
|
||||||
|
@end example
|
||||||
|
|
||||||
|
or, if the above mentioned file is absent:
|
||||||
|
|
||||||
|
@example
|
||||||
|
/usr/local/share/utt/weights.kor
|
||||||
|
@end example
|
||||||
|
|
||||||
|
|
||||||
|
@node kor dictionaries
|
||||||
|
@subsection Dictionaries
|
||||||
|
|
||||||
|
see @command{cor}
|
||||||
|
|
||||||
@c ---------------------------------------------------------------------
|
@c ---------------------------------------------------------------------
|
||||||
@c SEN
|
@c SEN
|
||||||
@ -1227,7 +1343,7 @@ compiledic <dictionaryname>.dic
|
|||||||
|
|
||||||
@multitable {aaaaaaaaaaaaaaaaaaaaaaaaa} {aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa}
|
@multitable {aaaaaaaaaaaaaaaaaaaaaaaaa} {aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa}
|
||||||
|
|
||||||
@item @strong{Authors:} @tab Tomasz Obrêbski
|
@item @strong{Authors:} @tab Tomasz Obrębski
|
||||||
@item @strong{Component category:} @tab filter
|
@item @strong{Component category:} @tab filter
|
||||||
@item @strong{Input format:} @tab UTT regular
|
@item @strong{Input format:} @tab UTT regular
|
||||||
@item @strong{Output format:} @tab UTT regular
|
@item @strong{Output format:} @tab UTT regular
|
||||||
@ -1255,7 +1371,7 @@ compiledic <dictionaryname>.dic
|
|||||||
command: sen
|
command: sen
|
||||||
|
|
||||||
input:
|
input:
|
||||||
0000 05 W Cze¶æ
|
0000 05 W Cześć
|
||||||
0005 01 P !
|
0005 01 P !
|
||||||
0006 01 S _
|
0006 01 S _
|
||||||
0007 02 W To
|
0007 02 W To
|
||||||
@ -1266,7 +1382,7 @@ input:
|
|||||||
|
|
||||||
output:
|
output:
|
||||||
0000 00 BOS *
|
0000 00 BOS *
|
||||||
0000 05 W Cze¶æ
|
0000 05 W Cześć
|
||||||
0005 01 P !
|
0005 01 P !
|
||||||
0006 00 EOS *
|
0006 00 EOS *
|
||||||
0006 00 BOS *
|
0006 00 BOS *
|
||||||
@ -1287,7 +1403,7 @@ output:
|
|||||||
@c @node gph - graphizer
|
@c @node gph - graphizer
|
||||||
@c @chapter gph - graphizer
|
@c @chapter gph - graphizer
|
||||||
|
|
||||||
@c Authors: Tomasz Obrêbski
|
@c Authors: Tomasz Obrębski
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
@ -1300,7 +1416,7 @@ output:
|
|||||||
@section ser - pattern search tool
|
@section ser - pattern search tool
|
||||||
|
|
||||||
@multitable {aaaaaaaaaaaaaaaaaaaaaaaaa} {aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa}
|
@multitable {aaaaaaaaaaaaaaaaaaaaaaaaa} {aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa}
|
||||||
@item @strong{Authors:} @tab Tomasz Obrêbski
|
@item @strong{Authors:} @tab Tomasz Obrębski
|
||||||
@item @strong{Component category:} @tab filter
|
@item @strong{Component category:} @tab filter
|
||||||
@item @strong{Input format:} @tab UTT regular
|
@item @strong{Input format:} @tab UTT regular
|
||||||
@item @strong{Output format:} @tab UTT regular
|
@item @strong{Output format:} @tab UTT regular
|
||||||
@ -1536,7 +1652,7 @@ installed in the system:
|
|||||||
@section grp - pattern search tool
|
@section grp - pattern search tool
|
||||||
|
|
||||||
@multitable {aaaaaaaaaaaaaaaaaaaaaaaaa} {aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa}
|
@multitable {aaaaaaaaaaaaaaaaaaaaaaaaa} {aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa}
|
||||||
@item @strong{Authors:} @tab Tomasz Obrêbski
|
@item @strong{Authors:} @tab Tomasz Obrębski
|
||||||
@item @strong{Component category:} @tab filter
|
@item @strong{Component category:} @tab filter
|
||||||
@item @strong{Input format:} @tab UTT flattened
|
@item @strong{Input format:} @tab UTT flattened
|
||||||
@item @strong{Output format:} @tab UTT flattened
|
@item @strong{Output format:} @tab UTT flattened
|
||||||
@ -1625,7 +1741,7 @@ lzop -cd corpus.grp.lzo | grp -e @var{EXPR} | unfla | ser -e @var{EXPR}
|
|||||||
@section mar
|
@section mar
|
||||||
|
|
||||||
@multitable {aaaaaaaaaaaaaaaaaaaaaaaaa} {aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa}
|
@multitable {aaaaaaaaaaaaaaaaaaaaaaaaa} {aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa}
|
||||||
@item @strong{Authors:} @tab Marcin Walas, Tomasz Obrêbski
|
@item @strong{Authors:} @tab Marcin Walas, Tomasz Obrębski
|
||||||
@item @strong{Input format:} @tab UTT flattened
|
@item @strong{Input format:} @tab UTT flattened
|
||||||
@item @strong{Output format:} @tab UTT flattened
|
@item @strong{Output format:} @tab UTT flattened
|
||||||
@item @strong{Required annotation:} @tab tok, sen, lem -1
|
@item @strong{Required annotation:} @tab tok, sen, lem -1
|
||||||
@ -1645,7 +1761,7 @@ lzop -cd corpus.grp.lzo | grp -e @var{EXPR} | unfla | ser -e @var{EXPR}
|
|||||||
@section kot - untokenizer
|
@section kot - untokenizer
|
||||||
|
|
||||||
@multitable {aaaaaaaaaaaaaaaaaaaaaaaaa} {aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa}
|
@multitable {aaaaaaaaaaaaaaaaaaaaaaaaa} {aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa}
|
||||||
@item @strong{Authors:} @tab Tomasz Obrêbski
|
@item @strong{Authors:} @tab Tomasz Obrębski
|
||||||
@item @strong{Component category:} @tab filter
|
@item @strong{Component category:} @tab filter
|
||||||
@item @strong{Input format:} @tab UTT regular
|
@item @strong{Input format:} @tab UTT regular
|
||||||
@item @strong{Output format:} @tab text
|
@item @strong{Output format:} @tab text
|
||||||
@ -1838,7 +1954,7 @@ sequence:
|
|||||||
@section compiledic - the dictionary compiler
|
@section compiledic - the dictionary compiler
|
||||||
|
|
||||||
@multitable {aaaaaaaaaaaaaaaaaaaaaaaaa} {aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa}
|
@multitable {aaaaaaaaaaaaaaaaaaaaaaaaa} {aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa}
|
||||||
@item @strong{Authors:} @tab Michal Stolarski, Tomasz Obrebski
|
@item @strong{Authors:} @tab Michał Stolarski, Tomasz Obrębski
|
||||||
@item @strong{Component category:} @tab additional tool
|
@item @strong{Component category:} @tab additional tool
|
||||||
@end multitable
|
@end multitable
|
||||||
@c
|
@c
|
||||||
@ -1883,7 +1999,7 @@ termination of the program.
|
|||||||
@section fla - the UTT file flattener
|
@section fla - the UTT file flattener
|
||||||
|
|
||||||
@multitable {aaaaaaaaaaaaaaaaaaaaaaaaa} {aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa}
|
@multitable {aaaaaaaaaaaaaaaaaaaaaaaaa} {aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa}
|
||||||
@item @strong{Authors:} @tab Tomasz Obrêbski
|
@item @strong{Authors:} @tab Tomasz Obrębski
|
||||||
@item @strong{Input format:} @tab UTT regular
|
@item @strong{Input format:} @tab UTT regular
|
||||||
@item @strong{Output format:} @tab UTT flattened
|
@item @strong{Output format:} @tab UTT flattened
|
||||||
@item @strong{Required annotation:} @tab sen
|
@item @strong{Required annotation:} @tab sen
|
||||||
@ -1931,7 +2047,7 @@ default, segments containing a field @code{BOS} are seeked.
|
|||||||
@section unfla - the UTT file unflattener
|
@section unfla - the UTT file unflattener
|
||||||
|
|
||||||
@multitable {aaaaaaaaaaaaaaaaaaaaaaaaa} {aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa}
|
@multitable {aaaaaaaaaaaaaaaaaaaaaaaaa} {aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa}
|
||||||
@item @strong{Authors:} @tab Tomasz Obrêbski
|
@item @strong{Authors:} @tab Tomasz Obrębski
|
||||||
@item @strong{Input format:} @tab UTT flattened
|
@item @strong{Input format:} @tab UTT flattened
|
||||||
@item @strong{Output format:} @tab UTT regular
|
@item @strong{Output format:} @tab UTT regular
|
||||||
@item @strong{Required annotation:} @tab -
|
@item @strong{Required annotation:} @tab -
|
||||||
@ -2235,7 +2351,6 @@ descr = pos ( / ( attr val + ) + ) ?
|
|||||||
@item
|
@item
|
||||||
@tab @code{v} @tab vocative.
|
@tab @code{v} @tab vocative.
|
||||||
@item
|
@item
|
||||||
@item
|
|
||||||
@code{G} @tab @tab Gender
|
@code{G} @tab @tab Gender
|
||||||
@item
|
@item
|
||||||
@tab @code{p} @tab masculine-personal,
|
@tab @code{p} @tab masculine-personal,
|
||||||
@ -2728,7 +2843,7 @@ Report bugs to <obrebski@@amu.edu.pl>.
|
|||||||
@c @node Copyright
|
@c @node Copyright
|
||||||
@c @chapter Copyright
|
@c @chapter Copyright
|
||||||
@c
|
@c
|
||||||
@c Copyright 2004 by Tomasz Obrebski
|
@c Copyright 2004 by Tomasz Obrębski
|
||||||
@c This software is free for research and educational use.
|
@c This software is free for research and educational use.
|
||||||
|
|
||||||
@c ---------------------------------------------------------------------
|
@c ---------------------------------------------------------------------
|
||||||
|
@ -2,6 +2,7 @@
|
|||||||
#define CONST_HH
|
#define CONST_HH
|
||||||
|
|
||||||
#define MAXTYPES 32
|
#define MAXTYPES 32
|
||||||
|
#define MAXFLAGS 64
|
||||||
#define MAXNODES 1024
|
#define MAXNODES 1024
|
||||||
#define MAXCONSTRS 32
|
#define MAXCONSTRS 32
|
||||||
#define MAXLINE 256
|
#define MAXLINE 256
|
||||||
|
@ -97,6 +97,7 @@ my $nleft=0;
|
|||||||
my $nright=0;
|
my $nright=0;
|
||||||
my $nreq=0;
|
my $nreq=0;
|
||||||
my $nlink=0;
|
my $nlink=0;
|
||||||
|
my $nflag=0;
|
||||||
|
|
||||||
my %cats;
|
my %cats;
|
||||||
my %roles;
|
my %roles;
|
||||||
@ -114,7 +115,6 @@ else {
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
loadcats($catfile) if $catfile;
|
loadcats($catfile) if $catfile;
|
||||||
extractcats($dicfile) if $dicfile;
|
extractcats($dicfile) if $dicfile;
|
||||||
|
|
||||||
@ -192,6 +192,11 @@ while(<INPUT>)
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
elsif(/^FLAG\s+\S+$/)
|
||||||
|
{
|
||||||
|
++$nflag;
|
||||||
|
print OUTPUT "$_\n"
|
||||||
|
}
|
||||||
elsif(/^$/) {
|
elsif(/^$/) {
|
||||||
# pomijamy puste linie oraz komentarze
|
# pomijamy puste linie oraz komentarze
|
||||||
}
|
}
|
||||||
@ -248,6 +253,7 @@ printf STDERR "%6d REQ statements\n", $nreq;
|
|||||||
printf STDERR "%6d LEFT statements\n", $nleft;
|
printf STDERR "%6d LEFT statements\n", $nleft;
|
||||||
printf STDERR "%6d RIGHT statements\n", $nright;
|
printf STDERR "%6d RIGHT statements\n", $nright;
|
||||||
printf STDERR "%6d LINK statements\n", $nlink;
|
printf STDERR "%6d LINK statements\n", $nlink;
|
||||||
|
printf STDERR "%6d FLAG statements\n", $nflag;
|
||||||
|
|
||||||
|
|
||||||
sub extractcats
|
sub extractcats
|
||||||
|
@ -46,6 +46,16 @@ void Grammar::add_type(const char* s)
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
void Grammar::add_flag(const char* s)
|
||||||
|
{
|
||||||
|
Flag::add(s);
|
||||||
|
if(Flag::count()>flags_sz)
|
||||||
|
{
|
||||||
|
flags_sz += 16;
|
||||||
|
pass.resize(flags_sz);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
void Grammar::set_lt(Role s, Role t)
|
void Grammar::set_lt(Role s, Role t)
|
||||||
{
|
{
|
||||||
@ -123,6 +133,11 @@ bool Grammar::read(FILE* f)
|
|||||||
if(chk_cat(arg1,lineno) + chk_cat(arg2,lineno) + chk_type(arg3,lineno) == 3)
|
if(chk_cat(arg1,lineno) + chk_cat(arg2,lineno) + chk_type(arg3,lineno) == 3)
|
||||||
set_connect(arg1,arg2,arg3);
|
set_connect(arg1,arg2,arg3);
|
||||||
}
|
}
|
||||||
|
// FLAG DECLARATION
|
||||||
|
else if(strcmp(key,"FLAG")==0 && fields>=2)
|
||||||
|
{
|
||||||
|
add_flag(arg1);
|
||||||
|
}
|
||||||
|
|
||||||
else fprintf(stderr,"Invalid line %d. Ignored.\n", lineno);
|
else fprintf(stderr,"Invalid line %d. Ignored.\n", lineno);
|
||||||
}
|
}
|
||||||
@ -159,5 +174,8 @@ void Grammar::write(FILE* f)
|
|||||||
for(Role t=1; t<Role::count(); ++t)
|
for(Role t=1; t<Role::count(); ++t)
|
||||||
if(connect[c][d].count(t))
|
if(connect[c][d].count(t))
|
||||||
fprintf(f,"LINK\t%s\t%s\t%s\n",c.str(),d.str(),t.str());
|
fprintf(f,"LINK\t%s\t%s\t%s\n",c.str(),d.str(),t.str());
|
||||||
|
|
||||||
|
for(Flag i=1; i<Flag::count(); ++i)
|
||||||
|
fprintf(f,"FLAG\t%s\n",i.str());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -10,6 +10,15 @@
|
|||||||
#include "thesymbols.hh"
|
#include "thesymbols.hh"
|
||||||
#include "sgraph.hh"
|
#include "sgraph.hh"
|
||||||
|
|
||||||
|
|
||||||
|
class Link
|
||||||
|
{
|
||||||
|
Role role;
|
||||||
|
FlagSet hflags;
|
||||||
|
FlagSet dflags;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
class Grammar
|
class Grammar
|
||||||
{
|
{
|
||||||
|
|
||||||
@ -17,10 +26,11 @@ class Grammar
|
|||||||
|
|
||||||
// enum CONSTR { SGL, OBL, LEFT, RIGHT, INIT, NONINIT, FIN, NONFIN };
|
// enum CONSTR { SGL, OBL, LEFT, RIGHT, INIT, NONINIT, FIN, NONFIN };
|
||||||
|
|
||||||
Grammar() : types_sz(0), cats_sz(0) {} ;
|
Grammar() : types_sz(0), cats_sz(0), flags_sz(0) {} ;
|
||||||
|
|
||||||
int types_sz;
|
int types_sz;
|
||||||
int cats_sz;
|
int cats_sz;
|
||||||
|
int flags_sz;
|
||||||
|
|
||||||
vector< vector< Roles > > connect;
|
vector< vector< Roles > > connect;
|
||||||
RoleSet sgl;
|
RoleSet sgl;
|
||||||
@ -30,11 +40,17 @@ class Grammar
|
|||||||
vector< RoleSet > lt;
|
vector< RoleSet > lt;
|
||||||
vector< RoleSet > gt;
|
vector< RoleSet > gt;
|
||||||
|
|
||||||
|
|
||||||
|
// vector< vector< vector<
|
||||||
|
vector< FlagSet > set;
|
||||||
|
vector< FlagSet > pass;
|
||||||
|
|
||||||
bool read(FILE* f);
|
bool read(FILE* f);
|
||||||
void write(FILE* f);
|
void write(FILE* f);
|
||||||
|
|
||||||
void add_category(const char* s);
|
void add_category(const char* s);
|
||||||
void add_type(const char* s);
|
void add_type(const char* s);
|
||||||
|
void add_flag(const char* s);
|
||||||
|
|
||||||
void set_sgl(Role r) { sgl.set(r); }
|
void set_sgl(Role r) { sgl.set(r); }
|
||||||
void set_obl(Cat c, Role r) { obl[c].set(r); }
|
void set_obl(Cat c, Role r) { obl[c].set(r); }
|
||||||
|
@ -127,10 +127,11 @@ int SGraph::sprint_node(char* buf, int nodeind, unsigned int info)
|
|||||||
if (info&CONSTRAINTS)// buf+=sprint_node_constraints(buf,n);
|
if (info&CONSTRAINTS)// buf+=sprint_node_constraints(buf,n);
|
||||||
{
|
{
|
||||||
buf+=sprintf(buf,";");
|
buf+=sprintf(buf,";");
|
||||||
|
int cont=0;
|
||||||
for(Role i=1; i<=Role::count(); ++i)
|
for(Role i=1; i<=Role::count(); ++i)
|
||||||
if(node.prop.forbidden[i]) buf+=sprintf(buf,"!%s",i.str());
|
if(node.prop.forbidden[i]) buf+=sprintf(buf,"%s!%s",(cont++)?",":"",i.str());
|
||||||
for(Role i=1; i<=Role::count(); ++i)
|
for(Role i=1; i<=Role::count(); ++i)
|
||||||
if(node.prop.required[i]) buf+=sprintf(buf,"&%s",i.str());
|
if(node.prop.required[i]) buf+=sprintf(buf,"%s&%s",(cont++)?",":"",i.str());
|
||||||
}
|
}
|
||||||
|
|
||||||
// buf+=sprintf(buf,"\n");
|
// buf+=sprintf(buf,"\n");
|
||||||
@ -139,7 +140,7 @@ int SGraph::sprint_node(char* buf, int nodeind, unsigned int info)
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
int SGraph::sprint_node_debug(char* buf, char* pref, int n)
|
int SGraph::sprint_node_debug(char* buf, const char* pref, int n)
|
||||||
{
|
{
|
||||||
char *buf0 = buf;
|
char *buf0 = buf;
|
||||||
buf+=sprintf(buf,"#%s",pref);
|
buf+=sprintf(buf,"#%s",pref);
|
||||||
@ -148,7 +149,7 @@ int SGraph::sprint_node_debug(char* buf, char* pref, int n)
|
|||||||
return buf-buf0;
|
return buf-buf0;
|
||||||
}
|
}
|
||||||
|
|
||||||
int SGraph::print_node_debug(FILE* f, char* pref, int n)
|
int SGraph::print_node_debug(FILE* f, const char* pref, int n)
|
||||||
{
|
{
|
||||||
char buf[1000];
|
char buf[1000];
|
||||||
sprint_node_debug(buf,pref,n);
|
sprint_node_debug(buf,pref,n);
|
||||||
|
@ -87,8 +87,8 @@ public:
|
|||||||
|
|
||||||
int sprint_node(char* buf, int n, unsigned int info);
|
int sprint_node(char* buf, int n, unsigned int info);
|
||||||
int print_node(FILE* f, int n, unsigned int info);
|
int print_node(FILE* f, int n, unsigned int info);
|
||||||
int sprint_node_debug(char* buf, char* pref, int n);
|
int sprint_node_debug(char* buf, const char* pref, int n);
|
||||||
int print_node_debug(FILE* f, char* pref, int n);
|
int print_node_debug(FILE* f, const char* pref, int n);
|
||||||
|
|
||||||
void print_arc(FILE* f, int left, int right, Role role, int dir); // 0 - left, 1 - right
|
void print_arc(FILE* f, int left, int right, Role role, int dir); // 0 - left, 1 - right
|
||||||
|
|
||||||
|
@ -22,6 +22,8 @@ typedef list<Constr> ConstrList;
|
|||||||
typedef list<Constr>::iterator ConstrListIter;
|
typedef list<Constr>::iterator ConstrListIter;
|
||||||
|
|
||||||
typedef Symbol<4> Rel;
|
typedef Symbol<4> Rel;
|
||||||
|
|
||||||
typedef Symbol<5> Flag;
|
typedef Symbol<5> Flag;
|
||||||
|
typedef bitset<MAXFLAGS> FlagSet;
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
Loading…
Reference in New Issue
Block a user