From 756213194ab50cc6e0ef6edfb9065ff3219f5372 Mon Sep 17 00:00:00 2001 From: obrebski Date: Wed, 7 May 2008 13:19:14 +0000 Subject: [PATCH] mar obsluguje opcje morph, tags i umie czytac z configa git-svn-id: svn://atos.wmid.amu.edu.pl/utt@39 e293616e-ec6a-49c2-aa92-f4a8b91c5d16 --- app/src/mar/mar | 189 ++++++++++++++++++++++++++++++++---------------- 1 file changed, 125 insertions(+), 64 deletions(-) diff --git a/app/src/mar/mar b/app/src/mar/mar index 3e7b3c8..bdd30ce 100755 --- a/app/src/mar/mar +++ b/app/src/mar/mar @@ -12,9 +12,15 @@ use strict; use Getopt::Long; +use File::HomeDir; use attr; + +my $LIB_DIR="/usr/local/lib/utt"; +my $systemconfigfile='/usr/local/etc/utt/mar.conf'; +my $userconfigfile=home()."/.utt/mar.conf"; + Getopt::Long::Configure('no_ignore_case_always'); my $help=0; @@ -25,6 +31,122 @@ my $command=0; my $action="pgP"; my $eos="seg(EOS)"; my $explicit_space=0; +my $morfield='lem'; +my $tags=0; + +#read configuration files########################### +my $file; +foreach $file ($systemconfigfile, $userconfigfile){ + if(open(CONFIG, $file)){ + while () { + chomp; + s/#.*//; + s/^\s+//; + s/\s+$//; + next unless length; + my ($name, $value) = split(/\s*=\s*/, $_, 2); + if(($name eq "pattern")or($name eq "e")){ + $pattern=$value; + } + elsif($name eq "eos"){ + $eos=$value; + } + elsif($name eq "macros"){ + $macrofile=$value; + } + elsif($name eq "tags"){ + $tags=$value; + } + elsif($name eq "morph"){ + $morfield=$value; + } + elsif($name eq "command"){ + $command=1; + } + elsif($name eq "action"){ + $action=$value; + } + elsif($name eq "space"){ + $explicit_space=1; + } + elsif(($name eq "help")or($name eq "h")){ + $help=1; + } + + } + close CONFIG; + } +} +######################################################### + +GetOptions("pattern|e=s" => \$pattern, + "eos|E=s" => \$eos, + "macros=s" => \$macrofile, + "define=s" => \$macrofile, + "command" => \$command, + "action=s" => \$action, + "help|h" => \$help, + "space|s" => \$explicit_space + ); + + + + +if($help) +{ + print <<'END' +Usage: mar [OPTIONS] [file ..] + +Options: + --pattern -e PATTERN Pattern. + --bos -E PATTERN Segment serving as sentence beginning marker. [TODO] + --macros=FILE Read macrodefinitions from FILE. [TODO] + --define=FILE Add macrodefinitions from FILE. [TODO] + --action -a [p][s][P] Perform only indicated actions. + p - preprocess + s - search + P - postprocess + (default pgP) + --command Print generated shell command and exit. + --help -h Print help. + +In patern you can put any tag. Tags should begin with the @ character. +They don't have to be closed. +They can't contain white spaces! + +Note: If you don't define any custom tags, whole pattern will be taged with + default tags (begining of match and end of match) + +Tags examples: + +mar -e '@BEG cat() @END' + it will find any adjectives in the text and tag them with surrounding tags +mar -e 'cat() @MYTAG cat()' + this will find two neighbouring adjectives and parcel them with tag MYTAG + +Some example patterns: +'word(domu)' - form of the word domu +'lexeme(dom)' - any form of lexeme dom +'space' - space +'cat()' - adjective + +You can use * in patterns to make zero or more counts of word. + +END +; + exit 0; +} + +die("$0: no pattern given. Run with -h to get help.\n") unless $pattern || $action !~ /g/; + +die("$0: macro file not found") unless + $macrofile or + -e "$LIB_DIR/terms.m4" and $macrofile="$LIB_DIR/terms.m4"; + +my $preproc = ($action =~ /p/) ? ' fla | ' : ''; + +my $postproc = ($action =~ /P/) ? ' | unfla ' : ''; + #this is our help function to cut the re to get another tag #it takes only one argument which is our patern (after m4 processing) @@ -83,68 +205,6 @@ sub restRe $temp; } -GetOptions("pattern|e=s" => \$pattern, - "eos|E=s" => \$eos, - "macros=s" => \$macrofile, - "define=s" => \$macrofile, - "command" => \$command, - "action=s" => \$action, - "help|h" => \$help, - "space|s" => \$explicit_space - ); - -if($help) -{ - print <<'END' -Usage: mar [OPTIONS] [file ..] - -Options: - --pattern -e PATTERN Pattern. - --bos -E PATTERN Segment serving as sentence beginning marker. [TODO] - --macros=FILE Read macrodefinitions from FILE. [TODO] - --define=FILE Add macrodefinitions from FILE. [TODO] - --action -a [p][s][P] Perform only indicated actions. - p - preprocess - s - search - P - postprocess - (default pgP) - --command Print generated shell command and exit. - --help -h Print help. - -In patern you can put any tag. Tags should begin with the @ character. -They don't have to be closed. -They can't contain white spaces! - -Note: If you don't define any custom tags, whole pattern will be taged with - default tags (begining of match and end of match) - -Tags examples: - -mar -e '@BEG cat() @END' - it will find any adjectives in the text and tag them with surrounding tags -mar -e 'cat() @MYTAG cat()' - this will find two neighbouring adjectives and parcel them with tag MYTAG - -Some example patterns: -'word(domu)' - form of the word domu -'lexeme(dom)' - any form of lexeme dom -'space' - space -'cat()' - adjective - -You can use * in patterns to make zero or more counts of word. - -END -; - exit 0; -} - -die("$0: no pattern given. Run with -h to get help.\n") unless $pattern || $action !~ /g/; - -die("$0: macro file not found") unless -e "terms.m4" and $macrofile="terms.m4"; - -my $preproc = ($action =~ /p/) ? ' fla | ' : ''; - -my $postproc = ($action =~ /P/) ? ' | unfla ' : ''; #here we are preparing re for extended matching my @tags; @@ -185,7 +245,8 @@ $patternmod =~ s/\\,/\\`\\`\\,''/g; # quoting commas in {m,n} r.e. operator $patternmod =~ s/(\{\d*),(\d*\})/\1\\`\\`,''\2/g; #print "After m4:".$re."\n"; -my $re = `echo \"$patternmod\" | m4 --define=ENDOFSEGMENT='[[:cntrl:]]' $macrofile - 2>/dev/null`; + +my $re = `echo \"$patternmod\" | m4 --define=ENDOFSEGMENT='[[:cntrl:]]' --define=MORFIELD=$morfield $macrofile - 2>/dev/null`; die("Incorrect pattern (m4).") if $? >> 8; @@ -194,7 +255,7 @@ chomp $re; # <> expansion -$re =~ s/<([^>]+)>/`echo $1 | .\/terms\.tag2re`/ge; +$re =~ s/<([^>]+)>/`echo $1 | $tags.tag2re`/ge; # Perl-like special sequences $re =~ s/\./[^ [:cntrl:]]/g;