dokumentacja mar + drobne poprawki mar (version)
git-svn-id: svn://atos.wmid.amu.edu.pl/utt@64 e293616e-ec6a-49c2-aa92-f4a8b91c5d16
This commit is contained in:
parent
9ace5d204d
commit
2d89d4bc82
@ -1747,15 +1747,77 @@ lzop -cd corpus.grp.lzo | grp -e @var{EXPR} | unfla | ser -e @var{EXPR}
|
|||||||
@item @strong{Required annotation:} @tab tok, sen, lem -1
|
@item @strong{Required annotation:} @tab tok, sen, lem -1
|
||||||
@end multitable
|
@end multitable
|
||||||
|
|
||||||
[TODO]
|
@subsection Description
|
||||||
|
@code{mar} is a Perl script which matches a given pattern against utt-formatted text
|
||||||
|
and tags matching parts with any number of user-defined tags.
|
||||||
|
|
||||||
|
@subsection Command line options
|
||||||
|
@table @code
|
||||||
|
@parhelp
|
||||||
|
@parversion
|
||||||
|
|
||||||
|
@item @b{@minus{}@minus{}pattern=@var{pattern}, @minus{}e @var{pattern}}
|
||||||
|
The search pattern.
|
||||||
|
@item @b{@minus{}@minus{}action=@var{action}, @minus{}a @var{action} [p] [s] [P]}
|
||||||
|
Perform only indicated actions. Where:
|
||||||
|
@multitable {aaaaaaaaaa} {aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa}
|
||||||
|
@item @code{p} @tab preprocess
|
||||||
|
@item @code{s} @tab search
|
||||||
|
@item @code{P} @tab postprocess
|
||||||
|
@end multitable
|
||||||
|
default: psP
|
||||||
|
|
||||||
|
@item @b{@minus{}@minus{}command}
|
||||||
|
print generated sed command, then exit
|
||||||
|
|
||||||
|
@item @b{@minus{}@minus{}help, @minus{}h}
|
||||||
|
print help, then exit
|
||||||
|
|
||||||
|
@item @b{@minus{}@minus{}version, @minus{}v}
|
||||||
|
print version, then exit
|
||||||
|
@end table
|
||||||
|
@subsection Tokens in pattern
|
||||||
|
@code{mar} pattern is based on @code{ser} patterns (@pxref{ser pattern}). A @code{mar} pattern is a @code{ser} pattern,
|
||||||
|
in which you can add any number of matching tags, which will be printed in exactly the place where
|
||||||
|
they were placed in the pattern. A valid token starts with @@, followed by any number of alphanumeric
|
||||||
|
characters. For example valid match tokens are: @@STARTMATCH @@ENDMATCH
|
||||||
|
|
||||||
|
Matching tokens can be placed between, before or after any of @code{ser} pattern terms. They don't have
|
||||||
|
to be paired. There can be any number of them in the pattern (zero or more). They don't have to be unique.
|
||||||
|
They can be placed one after another. For example:
|
||||||
|
|
||||||
|
@multitable {aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa} {aaaaaaaaaaaaaaaaaaaaaaaaaa}
|
||||||
|
@item @code{@@BOM lexeme(pomoc)} @tab place tag @b{BOM} before any form of the lexeme 'pomoc'
|
||||||
|
@item @code{@@MATCH lexeme(pomoc) @@MATCH} @tab place tag @b{MATCH} before and after any form of the lexeme 'pomoc'
|
||||||
|
@item @code{cat(<ADJ>) @@MATCH lexeme(pomoc) @@MATCH} @tab place tag @b{MATCH} before and after any form of the lexeme 'pomoc' which is followed by adjective
|
||||||
|
@item @code{cat(<ADJ>) @@TAG @@BOM lexeme(pomoc) @@EOM} @tab place tags @b{TAG} and @b{BOM} before any form of the lexeme 'pomoc' which is followed by adjective and tag @b{EOM} after it
|
||||||
|
@end multitable
|
||||||
|
|
||||||
|
(see mar's help 'mar -h' for some more information)
|
||||||
|
|
||||||
|
@subsection How mar works
|
||||||
|
@code{mar} translates the given @code{ser} pattern into a regular expression using the @code{m4} macro processor. It then converts that expression into a @code{sed} command script, which is then executed.
|
||||||
|
|
||||||
|
You can see the translated @code{sed} script by using the @code{@minus{}@minus{}command} option.
|
||||||
|
@subsection Limitations
|
||||||
|
The complexity of computations performed by @code{mar} increases linearly with the number of placed tokens, so it is highly recommended not to place too many tokens.
|
||||||
|
@subsection Requirements
|
||||||
|
In order to run @code{mar}, the following programs must be installed in the system:
|
||||||
|
|
||||||
|
@itemize
|
||||||
|
|
||||||
|
@item @command{m4}
|
||||||
|
@item @command{grep}
|
||||||
|
@item @command{sed}
|
||||||
|
|
||||||
|
@end itemize
|
||||||
|
|
||||||
|
|
||||||
(see mar's help 'mar -h' for some information)
|
|
||||||
|
|
||||||
@c ---------------------------------------------------------------------
|
@c ---------------------------------------------------------------------
|
||||||
@c KOT
|
@c KOT
|
||||||
@c ---------------------------------------------------------------------
|
@c ---------------------------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
@page
|
@page
|
||||||
@node kot
|
@node kot
|
||||||
@section kot - untokenizer
|
@section kot - untokenizer
|
||||||
|
@ -10,6 +10,8 @@
|
|||||||
#which is one of the parametres of the script
|
#which is one of the parametres of the script
|
||||||
#contact: d287572@atos.wmid.amu.edu.pl, walasiek@gmail.com
|
#contact: d287572@atos.wmid.amu.edu.pl, walasiek@gmail.com
|
||||||
|
|
||||||
|
my $version = '1.0';
|
||||||
|
|
||||||
use lib "/usr/local/lib/utt";
|
use lib "/usr/local/lib/utt";
|
||||||
use lib "$ENV{'HOME'}/.local/lib/utt";
|
use lib "$ENV{'HOME'}/.local/lib/utt";
|
||||||
|
|
||||||
@ -36,6 +38,7 @@ my $eos="seg(EOS)";
|
|||||||
my $explicit_space=0;
|
my $explicit_space=0;
|
||||||
my $morfield='lem';
|
my $morfield='lem';
|
||||||
my $tags=0;
|
my $tags=0;
|
||||||
|
my $show_version = 0;
|
||||||
|
|
||||||
#read configuration files###########################
|
#read configuration files###########################
|
||||||
my $file;
|
my $file;
|
||||||
@ -89,11 +92,16 @@ GetOptions("pattern|e=s" => \$pattern,
|
|||||||
"command" => \$command,
|
"command" => \$command,
|
||||||
"action=s" => \$action,
|
"action=s" => \$action,
|
||||||
"help|h" => \$help,
|
"help|h" => \$help,
|
||||||
"space|s" => \$explicit_space
|
"space|s" => \$explicit_space,
|
||||||
|
"version|v" => \$show_version,
|
||||||
);
|
);
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
if($show_version){
|
||||||
|
print "Version: $version\n";
|
||||||
|
exit 0;
|
||||||
|
}
|
||||||
|
|
||||||
if($help)
|
if($help)
|
||||||
{
|
{
|
||||||
@ -102,16 +110,17 @@ Usage: mar [OPTIONS] [file ..]
|
|||||||
|
|
||||||
Options:
|
Options:
|
||||||
--pattern -e PATTERN Pattern.
|
--pattern -e PATTERN Pattern.
|
||||||
--bos -E PATTERN Segment serving as sentence beginning marker. [TODO]
|
--eos -E PATTERN Segment serving as sentence beginning marker. [TODO]
|
||||||
--macros=FILE Read macrodefinitions from FILE. [TODO]
|
--macros=FILE Read macrodefinitions from FILE. [TODO]
|
||||||
--define=FILE Add macrodefinitions from FILE. [TODO]
|
--define=FILE Add macrodefinitions from FILE. [TODO]
|
||||||
--action -a [p][s][P] Perform only indicated actions.
|
--action -a [p][s][P] Perform only indicated actions.
|
||||||
p - preprocess
|
p - preprocess
|
||||||
s - search
|
s - search
|
||||||
P - postprocess
|
P - postprocess
|
||||||
(default pgP)
|
(default psP)
|
||||||
--command Print generated shell command and exit.
|
--command Print generated shell command and exit.
|
||||||
--help -h Print help.
|
--help -h Print help.
|
||||||
|
--version -v Script version
|
||||||
|
|
||||||
In patern you can put any tag. Tags should begin with the @ character.
|
In patern you can put any tag. Tags should begin with the @ character.
|
||||||
They don't have to be closed.
|
They don't have to be closed.
|
||||||
|
Loading…
Reference in New Issue
Block a user