diff --git a/app/doc/utt.texinfo b/app/doc/utt.texinfo index be6f117..04bb6a8 100644 --- a/app/doc/utt.texinfo +++ b/app/doc/utt.texinfo @@ -1747,15 +1747,77 @@ lzop -cd corpus.grp.lzo | grp -e @var{EXPR} | unfla | ser -e @var{EXPR} @item @strong{Required annotation:} @tab tok, sen, lem -1 @end multitable -[TODO] +@subsection Description +@code{mar} is a Perl script which matches a given pattern against utt-formatted text +and tags the matching parts with any number of user-defined tags. + +@subsection Command line options +@table @code +@parhelp +@parversion + +@item @b{@minus{}@minus{}pattern=@var{pattern}, @minus{}e @var{pattern}} +The search pattern. +@item @b{@minus{}@minus{}action=@var{action}, @minus{}a @var{action} [p] [s] [P]} +Perform only indicated actions. Where: +@multitable {aaaaaaaaaa} {aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa} +@item @code{p} @tab preprocess +@item @code{s} @tab search +@item @code{P} @tab postprocess +@end multitable +default: psP + +@item @b{@minus{}@minus{}command} +print generated sed command, then exit + +@item @b{@minus{}@minus{}help, @minus{}h} +print help, then exit + +@item @b{@minus{}@minus{}version, @minus{}v} +print version, then exit +@end table +@subsection Tokens in pattern +A @code{mar} pattern is based on @code{ser} patterns (@pxref{ser pattern}). It is a @code{ser} pattern +in which you can add any number of matching tags, which will be printed in exactly the place where +they were placed in the pattern. A valid token starts with @@ followed by any number of alphanumeric +characters. For example, valid match tokens are: @@STARTMATCH @@ENDMATCH + +Matching tokens can be placed between, before or after any of the @code{ser} pattern terms. They don't have +to be paired. There can be any number of them in the pattern (zero or more). They don't have to be unique. +They can be placed one after another. 
For example: + +@multitable {aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa} {aaaaaaaaaaaaaaaaaaaaaaaaaa} +@item @code{@@BOM lexeme(pomoc)} @tab place tag @b{BOM} before any form of the lexeme 'pomoc' +@item @code{@@MATCH lexeme(pomoc) @@MATCH} @tab place tag @b{MATCH} before and after any form of the lexeme 'pomoc' +@item @code{cat() @@MATCH lexeme(pomoc) @@MATCH} @tab place tag @b{MATCH} before and after any form of the lexeme 'pomoc' which is followed by an adjective +@item @code{cat() @@TAG @@BOM lexeme(pomoc) @@EOM} @tab place tags @b{TAG} and @b{BOM} before any form of the lexeme 'pomoc' which is followed by an adjective, and tag @b{EOM} after it +@end multitable + +(see mar's help 'mar -h' for some more information) + +@subsection How mar works +@code{mar} translates the given @code{ser} pattern into a regular expression using the @code{m4} macro processor. It then turns that expression into a @code{sed} command script, which is then executed. + +You can see the translated sed script by using the @code{@minus{}@minus{}command} option. +@subsection Limitations +The complexity of computations performed by @code{mar} increases linearly with the number of placed tokens, so it is highly recommended not to place too many tokens. 
+@subsection Requirements +In order to run @code{mar}, the following programs must be installed in the system: + +@itemize + +@item @command{m4} +@item @command{grep} +@item @command{sed} + +@end itemize + -(see mar's help 'mar -h' for some information) @c --------------------------------------------------------------------- @c KOT @c --------------------------------------------------------------------- - @page @node kot @section kot - untokenizer diff --git a/app/src/mar/mar b/app/src/mar/mar index 14a888a..7b1077d 100755 --- a/app/src/mar/mar +++ b/app/src/mar/mar @@ -10,6 +10,8 @@ #which is one of the parametres of the script #contact: d287572@atos.wmid.amu.edu.pl, walasiek@gmail.com +my $version = '1.0'; + use lib "/usr/local/lib/utt"; use lib "$ENV{'HOME'}/.local/lib/utt"; @@ -36,6 +38,7 @@ my $eos="seg(EOS)"; my $explicit_space=0; my $morfield='lem'; my $tags=0; +my $show_version = 0; #read configuration files########################### my $file; @@ -89,11 +92,16 @@ GetOptions("pattern|e=s" => \$pattern, "command" => \$command, "action=s" => \$action, "help|h" => \$help, - "space|s" => \$explicit_space + "space|s" => \$explicit_space, + "version|v" => \$show_version, ); +if($show_version){ + print "Version: $version\n"; + exit 0; +} if($help) { @@ -102,16 +110,17 @@ Usage: mar [OPTIONS] [file ..] Options: --pattern -e PATTERN Pattern. - --bos -E PATTERN Segment serving as sentence beginning marker. [TODO] + --eos -E PATTERN Segment serving as sentence beginning marker. [TODO] --macros=FILE Read macrodefinitions from FILE. [TODO] --define=FILE Add macrodefinitions from FILE. [TODO] --action -a [p][s][P] Perform only indicated actions. p - preprocess s - search P - postprocess - (default pgP) + (default psP) --command Print generated shell command and exit. --help -h Print help. + --version -v Script version In patern you can put any tag. Tags should begin with the @ character. They don't have to be closed.