dokumentacja mar + drobne poprawki mar (version)
git-svn-id: svn://atos.wmid.amu.edu.pl/utt@64 e293616e-ec6a-49c2-aa92-f4a8b91c5d16
This commit is contained in:
parent
9ace5d204d
commit
2d89d4bc82
@ -1747,15 +1747,77 @@ lzop -cd corpus.grp.lzo | grp -e @var{EXPR} | unfla | ser -e @var{EXPR}
|
||||
@item @strong{Required annotation:} @tab tok, sen, lem -1
|
||||
@end multitable
|
||||
|
||||
[TODO]
|
||||
@subsection Description
|
||||
@code{mar} is a Perl script which matches a given pattern against UTT-formatted text
|
||||
and tags matching parts with any number of user-defined tags.
|
||||
|
||||
@subsection Command line options
|
||||
@table @code
|
||||
@parhelp
|
||||
@parversion
|
||||
|
||||
@item @b{@minus{}@minus{}pattern=@var{pattern}, @minus{}e @var{pattern}}
|
||||
The search pattern.
|
||||
@item @b{@minus{}@minus{}action=@var{action}, @minus{}a @var{action} [p] [s] [P]}
|
||||
Perform only indicated actions. Where:
|
||||
@multitable {aaaaaaaaaa} {aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa}
|
||||
@item @code{p} @tab preprocess
|
||||
@item @code{s} @tab search
|
||||
@item @code{P} @tab postprocess
|
||||
@end multitable
|
||||
default: psP
|
||||
|
||||
@item @b{@minus{}@minus{}command}
|
||||
print generated sed command, then exit
|
||||
|
||||
@item @b{@minus{}@minus{}help, @minus{}h}
|
||||
print help, then exit
|
||||
|
||||
@item @b{@minus{}@minus{}version, @minus{}v}
|
||||
print version, then exit
|
||||
@end table
|
||||
@subsection Tokens in pattern
|
||||
A @code{mar} pattern is based on @code{ser} patterns (@pxref{ser pattern}). A @code{mar} pattern is a @code{ser} pattern,
|
||||
in which you can add any number of matching tags, which will be printed in exactly the place where
|
||||
they were placed in the pattern. A valid token starts with @@ followed by any number of alphanumeric
|
||||
characters. For example valid match tokens are: @@STARTMATCH @@ENDMATCH
|
||||
|
||||
Matching tokens can be placed between, before or after any of @code{ser} pattern terms. They don't have
|
||||
to be paired. There can be any number of them in the pattern (zero or more). They don't have to be unique.
|
||||
They can be placed one after another. For example:
|
||||
|
||||
@multitable {aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa} {aaaaaaaaaaaaaaaaaaaaaaaaaa}
|
||||
@item @code{@@BOM lexeme(pomoc)} @tab place tag @b{BOM} before any form of the lexeme 'pomoc'
|
||||
@item @code{@@MATCH lexeme(pomoc) @@MATCH} @tab place tag @b{MATCH} before and after any form of the lexeme 'pomoc'
|
||||
@item @code{cat(<ADJ>) @@MATCH lexeme(pomoc) @@MATCH} @tab place tag @b{MATCH} before and after any form of the lexeme 'pomoc' which is followed by adjective
|
||||
@item @code{cat(<ADJ>) @@TAG @@BOM lexeme(pomoc) @@EOM} @tab place tags @b{TAG} and @b{BOM} before any form of the lexeme 'pomoc' which is followed by adjective and tag @b{EOM} after it
|
||||
@end multitable
|
||||
|
||||
(see mar's help 'mar -h' for some more information)
|
||||
|
||||
@subsection How mar works
|
||||
@code{mar} translates given @code{ser} pattern with @code{m4} macroprocessor to regular expression. Then it changes it into @code{sed} command script, which is then executed.
|
||||
|
||||
You can see translated sed script by using the @code{@minus{}@minus{}command} option.
|
||||
@subsection Limitations
|
||||
The complexity of computations performed by @code{mar} increases linearly with the number of placed tokens. So it is highly recommended not to place too many tokens.
|
||||
@subsection Requirements
|
||||
In order to run @code{mar}, the following programs must be installed in the system:
|
||||
|
||||
@itemize
|
||||
|
||||
@item @command{m4}
|
||||
@item @command{grep}
|
||||
@item @command{sed}
|
||||
|
||||
@end itemize
|
||||
|
||||
|
||||
(see mar's help 'mar -h' for some information)
|
||||
|
||||
@c ---------------------------------------------------------------------
|
||||
@c KOT
|
||||
@c ---------------------------------------------------------------------
|
||||
|
||||
|
||||
@page
|
||||
@node kot
|
||||
@section kot - untokenizer
|
||||
|
@ -10,6 +10,8 @@
|
||||
#which is one of the parameters of the script
|
||||
#contact: d287572@atos.wmid.amu.edu.pl, walasiek@gmail.com
|
||||
|
||||
my $version = '1.0';
|
||||
|
||||
use lib "/usr/local/lib/utt";
|
||||
use lib "$ENV{'HOME'}/.local/lib/utt";
|
||||
|
||||
@ -36,6 +38,7 @@ my $eos="seg(EOS)";
|
||||
my $explicit_space=0;
|
||||
my $morfield='lem';
|
||||
my $tags=0;
|
||||
my $show_version = 0;
|
||||
|
||||
#read configuration files###########################
|
||||
my $file;
|
||||
@ -89,11 +92,16 @@ GetOptions("pattern|e=s" => \$pattern,
|
||||
"command" => \$command,
|
||||
"action=s" => \$action,
|
||||
"help|h" => \$help,
|
||||
"space|s" => \$explicit_space
|
||||
"space|s" => \$explicit_space,
|
||||
"version|v" => \$show_version,
|
||||
);
|
||||
|
||||
|
||||
|
||||
if($show_version){
|
||||
print "Version: $version\n";
|
||||
exit 0;
|
||||
}
|
||||
|
||||
if($help)
|
||||
{
|
||||
@ -102,16 +110,17 @@ Usage: mar [OPTIONS] [file ..]
|
||||
|
||||
Options:
|
||||
--pattern -e PATTERN Pattern.
|
||||
--bos -E PATTERN Segment serving as sentence beginning marker. [TODO]
|
||||
--eos -E PATTERN Segment serving as sentence beginning marker. [TODO]
|
||||
--macros=FILE Read macrodefinitions from FILE. [TODO]
|
||||
--define=FILE Add macrodefinitions from FILE. [TODO]
|
||||
--action -a [p][s][P] Perform only indicated actions.
|
||||
p - preprocess
|
||||
s - search
|
||||
P - postprocess
|
||||
(default pgP)
|
||||
(default psP)
|
||||
--command Print generated shell command and exit.
|
||||
--help -h Print help.
|
||||
--version -v Script version
|
||||
|
||||
In patern you can put any tag. Tags should begin with the @ character.
|
||||
They don't have to be closed.
|
||||
|
Loading…
Reference in New Issue
Block a user