mar obsluguje opcje morph, tags i umie czytac z configa

git-svn-id: svn://atos.wmid.amu.edu.pl/utt@39 e293616e-ec6a-49c2-aa92-f4a8b91c5d16
This commit is contained in:
obrebski 2008-05-07 13:19:14 +00:00
parent f5d3b20097
commit 756213194a

View File

@ -12,9 +12,15 @@
use strict; use strict;
use Getopt::Long; use Getopt::Long;
use File::HomeDir;
use attr; use attr;
# Directory checked for the default macro file (terms.m4) when no macro file
# has been configured.
my $LIB_DIR="/usr/local/lib/utt";
# System-wide configuration file; it is read first.
my $systemconfigfile='/usr/local/etc/utt/mar.conf';
# Per-user configuration file; it is read second, so its settings override
# those from the system-wide file.
my $userconfigfile=home()."/.utt/mar.conf";
Getopt::Long::Configure('no_ignore_case_always'); Getopt::Long::Configure('no_ignore_case_always');
my $help=0; my $help=0;
@ -25,6 +31,122 @@ my $command=0;
# Actions to perform, one letter each: "p" pipes the input through fla,
# "g" makes a pattern mandatory, "P" pipes the output through unfla.
my $action="pgP"; my $action="pgP";
# Default for the eos setting / --eos (-E) option.
my $eos="seg(EOS)"; my $eos="seg(EOS)";
# Set to 1 by the "space" setting / --space (-s) option.
my $explicit_space=0; my $explicit_space=0;
# Value handed to m4 as the MORFIELD macro definition.
my $morfield='lem';
# Prefix of the "<prefix>.tag2re" command used to expand <...> in patterns.
# NOTE(review): the default 0 yields the command name "0.tag2re" - confirm
# that a usable default is set somewhere else.
my $tags=0;
#read configuration files###########################
# The system-wide file is read first and the per-user file second, so a
# setting in the user's file overrides the system-wide one.  Command-line
# options are parsed after this section and override both.  A file that
# cannot be opened is skipped silently: configuration files are optional.
#
# File format: one "name = value" setting per line.  "#" starts a comment,
# surrounding whitespace and blank lines are ignored, unknown names are
# ignored.

# Settings that carry a value, mapped to the variable they fill in.
my %config_value_for = (
    pattern => \$pattern,
    e       => \$pattern,
    eos     => \$eos,
    macros  => \$macrofile,
    tags    => \$tags,
    morph   => \$morfield,
    action  => \$action,
);
# Settings that act as switches: their mere presence turns the option on,
# whatever is written after the "=".
my %config_flag_for = (
    command => \$command,
    space   => \$explicit_space,
    help    => \$help,
    h       => \$help,
);

my $file;
foreach $file ($systemconfigfile, $userconfigfile) {
    # Three-argument open with a lexical handle: the file name is never
    # interpreted as an open mode and the handle cannot clash with other code.
    open(my $config, '<', $file) or next;
    # A named line variable keeps the global $_ untouched.
    while (my $line = <$config>) {
        chomp $line;
        $line =~ s/#.*//;
        $line =~ s/^\s+//;
        $line =~ s/\s+$//;
        next unless length $line;
        my ($name, $value) = split(/\s*=\s*/, $line, 2);
        if (exists $config_flag_for{$name}) {
            ${ $config_flag_for{$name} } = 1;
        }
        elsif (exists $config_value_for{$name}) {
            # A value-taking setting written without "=" must not replace a
            # sensible default with undef.
            unless (defined $value) {
                warn("$0: $file: setting '$name' needs a value, ignored\n");
                next;
            }
            ${ $config_value_for{$name} } = $value;
        }
    }
    close $config;
}
#########################################################
# Parse command-line options.  This runs after the configuration files have
# been read, so an option given on the command line overrides the configured
# value.  --macros and --define both store into $macrofile.
# --morph and --tags expose on the command line what could previously be set
# only in a configuration file; -a is registered because the help text
# advertises it.
GetOptions("pattern|e=s" => \$pattern,
           "eos|E=s"     => \$eos,
           "macros=s"    => \$macrofile,
           "define=s"    => \$macrofile,
           "command"     => \$command,
           "action|a=s"  => \$action,
           "morph=s"     => \$morfield,
           "tags=s"      => \$tags,
           "help|h"      => \$help,
           "space|s"     => \$explicit_space
          );

# Print the usage text and exit successfully when help was requested.
# The text names --eos (the option actually registered) and the action
# letter g (the letter the script actually tests for).
if($help)
{
    print <<'END';
Usage: mar [OPTIONS] [file ..]
Options:
--pattern -e PATTERN Pattern.
--eos -E PATTERN Segment serving as end-of-sentence marker. [TODO]
--macros=FILE Read macrodefinitions from FILE. [TODO]
--define=FILE Add macrodefinitions from FILE. [TODO]
--action -a [p][g][P] Perform only indicated actions.
p - preprocess
g - search
P - postprocess
(default pgP)
--morph=FIELD Value substituted for the MORFIELD macro (default lem).
--tags=NAME Expand <...> in the pattern with the NAME.tag2re command.
--command Print generated shell command and exit.
--help -h Print help.
In patern you can put any tag. Tags should begin with the @ character.
They don't have to be closed.
They can't contain white spaces!
Note: If you don't define any custom tags, whole pattern will be taged with
default tags (begining of match and end of match)
Tags examples:
mar -e '@BEG cat(<ADJ>) @END'
it will find any adjectives in the text and tag them with surrounding tags
mar -e 'cat(<ADJ>) @MYTAG cat(<ADJ>)'
this will find two neighbouring adjectives and parcel them with tag MYTAG
Some example patterns:
'word(domu)' - form of the word domu
'lexeme(dom)' - any form of lexeme dom
'space' - space
'cat(<ADJ>)' - adjective
You can use * in patterns to make zero or more counts of word.
END
    exit 0;
}
# A pattern is mandatory whenever the action string contains "g".
if (!$pattern && $action =~ /g/) {
    die("$0: no pattern given. Run with -h to get help.\n");
}

# When no macro file has been configured, fall back to terms.m4 in the
# library directory; give up if that file does not exist either.
if (!$macrofile) {
    my $stock_macros = "$LIB_DIR/terms.m4";
    die("$0: macro file not found") unless -e $stock_macros;
    $macrofile = $stock_macros;
}

# Pipeline fragments: "p" in the action string prepends the fla stage,
# "P" appends the unfla stage; otherwise the fragment stays empty.
my $preproc = '';
$preproc = ' fla | ' if $action =~ /p/;
my $postproc = '';
$postproc = ' | unfla ' if $action =~ /P/;
#this is our help function to cut the re to get another tag #this is our help function to cut the re to get another tag
#it takes only one argument which is our patern (after m4 processing) #it takes only one argument which is our patern (after m4 processing)
@ -83,68 +205,6 @@ sub restRe
$temp; $temp;
} }
# Parse command-line options into the option variables.  --macros and
# --define both store into $macrofile.  -a is registered as the short form
# of --action because the help text advertises it.
GetOptions("pattern|e=s" => \$pattern,
           "eos|E=s"     => \$eos,
           "macros=s"    => \$macrofile,
           "define=s"    => \$macrofile,
           "command"     => \$command,
           "action|a=s"  => \$action,
           "help|h"      => \$help,
           "space|s"     => \$explicit_space
          );
# Print the usage text and exit successfully when help was requested.
# The text names --eos (the option actually registered as "eos|E") and the
# action letter g (the letter used by the default "pgP" and tested by the
# pattern check).
if($help)
{
    print <<'END';
Usage: mar [OPTIONS] [file ..]
Options:
--pattern -e PATTERN Pattern.
--eos -E PATTERN Segment serving as end-of-sentence marker. [TODO]
--macros=FILE Read macrodefinitions from FILE. [TODO]
--define=FILE Add macrodefinitions from FILE. [TODO]
--action -a [p][g][P] Perform only indicated actions.
p - preprocess
g - search
P - postprocess
(default pgP)
--command Print generated shell command and exit.
--help -h Print help.
In patern you can put any tag. Tags should begin with the @ character.
They don't have to be closed.
They can't contain white spaces!
Note: If you don't define any custom tags, whole pattern will be taged with
default tags (begining of match and end of match)
Tags examples:
mar -e '@BEG cat(<ADJ>) @END'
it will find any adjectives in the text and tag them with surrounding tags
mar -e 'cat(<ADJ>) @MYTAG cat(<ADJ>)'
this will find two neighbouring adjectives and parcel them with tag MYTAG
Some example patterns:
'word(domu)' - form of the word domu
'lexeme(dom)' - any form of lexeme dom
'space' - space
'cat(<ADJ>)' - adjective
You can use * in patterns to make zero or more counts of word.
END
    exit 0;
}
# A pattern is mandatory whenever the action string contains "g".
die("$0: no pattern given. Run with -h to get help.\n") unless $pattern || $action !~ /g/;
# Keep a macro file that has already been chosen (--macros / --define);
# only fall back to terms.m4 in the current directory when none was given.
# "and" binds tighter than "or", so the fallback assignment is attempted
# only if $macrofile is still empty.
die("$0: macro file not found") unless
    $macrofile or
    -e "terms.m4" and $macrofile="terms.m4";
# Pipeline fragments: "p" in the action string prepends the fla stage,
# "P" appends the unfla stage; otherwise the fragment is empty.
my $preproc = ($action =~ /p/) ? ' fla | ' : '';
my $postproc = ($action =~ /P/) ? ' | unfla ' : '';
#here we are preparing re for extended matching #here we are preparing re for extended matching
my @tags; my @tags;
@ -185,7 +245,8 @@ $patternmod =~ s/\\,/\\`\\`\\,''/g;
# Protect the comma inside a {m,n} repetition operator from m4 by wrapping
# it in m4 quotes.
# quoting commas in {m,n} r.e. operator # quoting commas in {m,n} r.e. operator
$patternmod =~ s/(\{\d*),(\d*\})/\1\\`\\`,''\2/g; $patternmod =~ s/(\{\d*),(\d*\})/\1\\`\\`,''\2/g;
#print "After m4:".$re."\n"; #print "After m4:".$re."\n";
# Expand macros: the pattern is echoed into m4, which reads the macro file
# first and then the pattern from standard input; m4's stderr is discarded.
# NOTE(review): $patternmod, $macrofile and $morfield are interpolated into
# the shell command line unquoted, so shell metacharacters in them are
# interpreted by the shell.
# NOTE(review): $re is assigned twice; the second command differs only by
# the added MORFIELD definition and its output is the one kept.
my $re = `echo \"$patternmod\" | m4 --define=ENDOFSEGMENT='[[:cntrl:]]' $macrofile - 2>/dev/null`;
my $re = `echo \"$patternmod\" | m4 --define=ENDOFSEGMENT='[[:cntrl:]]' --define=MORFIELD=$morfield $macrofile - 2>/dev/null`;
# $? >> 8 is the exit status of the pipeline; non-zero aborts the run.
die("Incorrect pattern (m4).") if $? >> 8; die("Incorrect pattern (m4).") if $? >> 8;
@ -194,7 +255,7 @@ chomp $re;
# Each <...> group is replaced by the output of a tag2re command that
# receives the text between the angle brackets on standard input.
# NOTE(review): the captured text is passed to the shell unquoted.
# NOTE(review): the second substitution builds the command name from $tags
# ("$tags.tag2re"); with $tags left at 0 that is "0.tag2re" - confirm.
# <> expansion # <> expansion
$re =~ s/<([^>]+)>/`echo $1 | .\/terms\.tag2re`/ge; $re =~ s/<([^>]+)>/`echo $1 | $tags.tag2re`/ge;
# Every "." in the expression becomes a character class matching any
# character except a space or a control character.
# Perl-like special sequences # Perl-like special sequences
$re =~ s/\./[^ [:cntrl:]]/g; $re =~ s/\./[^ [:cntrl:]]/g;