mar obsluguje opcje morph, tags i umie czytac z configa
git-svn-id: svn://atos.wmid.amu.edu.pl/utt@39 e293616e-ec6a-49c2-aa92-f4a8b91c5d16
This commit is contained in:
parent
f5d3b20097
commit
756213194a
189
app/src/mar/mar
189
app/src/mar/mar
@ -12,9 +12,15 @@
|
|||||||
|
|
||||||
use strict;
|
use strict;
|
||||||
use Getopt::Long;
|
use Getopt::Long;
|
||||||
|
use File::HomeDir;
|
||||||
|
|
||||||
use attr;
|
use attr;
|
||||||
|
|
||||||
|
|
||||||
|
my $LIB_DIR="/usr/local/lib/utt";
|
||||||
|
my $systemconfigfile='/usr/local/etc/utt/mar.conf';
|
||||||
|
my $userconfigfile=home()."/.utt/mar.conf";
|
||||||
|
|
||||||
Getopt::Long::Configure('no_ignore_case_always');
|
Getopt::Long::Configure('no_ignore_case_always');
|
||||||
|
|
||||||
my $help=0;
|
my $help=0;
|
||||||
@ -25,6 +31,122 @@ my $command=0;
|
|||||||
my $action="pgP";
|
my $action="pgP";
|
||||||
my $eos="seg(EOS)";
|
my $eos="seg(EOS)";
|
||||||
my $explicit_space=0;
|
my $explicit_space=0;
|
||||||
|
my $morfield='lem';
|
||||||
|
my $tags=0;
|
||||||
|
|
||||||
|
#read configuration files###########################
|
||||||
|
my $file;
|
||||||
|
foreach $file ($systemconfigfile, $userconfigfile){
|
||||||
|
if(open(CONFIG, $file)){
|
||||||
|
while (<CONFIG>) {
|
||||||
|
chomp;
|
||||||
|
s/#.*//;
|
||||||
|
s/^\s+//;
|
||||||
|
s/\s+$//;
|
||||||
|
next unless length;
|
||||||
|
my ($name, $value) = split(/\s*=\s*/, $_, 2);
|
||||||
|
if(($name eq "pattern")or($name eq "e")){
|
||||||
|
$pattern=$value;
|
||||||
|
}
|
||||||
|
elsif($name eq "eos"){
|
||||||
|
$eos=$value;
|
||||||
|
}
|
||||||
|
elsif($name eq "macros"){
|
||||||
|
$macrofile=$value;
|
||||||
|
}
|
||||||
|
elsif($name eq "tags"){
|
||||||
|
$tags=$value;
|
||||||
|
}
|
||||||
|
elsif($name eq "morph"){
|
||||||
|
$morfield=$value;
|
||||||
|
}
|
||||||
|
elsif($name eq "command"){
|
||||||
|
$command=1;
|
||||||
|
}
|
||||||
|
elsif($name eq "action"){
|
||||||
|
$action=$value;
|
||||||
|
}
|
||||||
|
elsif($name eq "space"){
|
||||||
|
$explicit_space=1;
|
||||||
|
}
|
||||||
|
elsif(($name eq "help")or($name eq "h")){
|
||||||
|
$help=1;
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
close CONFIG;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
#########################################################
|
||||||
|
|
||||||
|
GetOptions("pattern|e=s" => \$pattern,
|
||||||
|
"eos|E=s" => \$eos,
|
||||||
|
"macros=s" => \$macrofile,
|
||||||
|
"define=s" => \$macrofile,
|
||||||
|
"command" => \$command,
|
||||||
|
"action=s" => \$action,
|
||||||
|
"help|h" => \$help,
|
||||||
|
"space|s" => \$explicit_space
|
||||||
|
);
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
if($help)
|
||||||
|
{
|
||||||
|
print <<'END'
|
||||||
|
Usage: mar [OPTIONS] [file ..]
|
||||||
|
|
||||||
|
Options:
|
||||||
|
--pattern -e PATTERN Pattern.
|
||||||
|
--bos -E PATTERN Segment serving as sentence beginning marker. [TODO]
|
||||||
|
--macros=FILE Read macrodefinitions from FILE. [TODO]
|
||||||
|
--define=FILE Add macrodefinitions from FILE. [TODO]
|
||||||
|
--action -a [p][s][P] Perform only indicated actions.
|
||||||
|
p - preprocess
|
||||||
|
s - search
|
||||||
|
P - postprocess
|
||||||
|
(default pgP)
|
||||||
|
--command Print generated shell command and exit.
|
||||||
|
--help -h Print help.
|
||||||
|
|
||||||
|
In patern you can put any tag. Tags should begin with the @ character.
|
||||||
|
They don't have to be closed.
|
||||||
|
They can't contain white spaces!
|
||||||
|
|
||||||
|
Note: If you don't define any custom tags, whole pattern will be taged with
|
||||||
|
default tags (begining of match and end of match)
|
||||||
|
|
||||||
|
Tags examples:
|
||||||
|
|
||||||
|
mar -e '@BEG cat(<ADJ>) @END'
|
||||||
|
it will find any adjectives in the text and tag them with surrounding tags
|
||||||
|
mar -e 'cat(<ADJ>) @MYTAG cat(<ADJ>)'
|
||||||
|
this will find two neighbouring adjectives and parcel them with tag MYTAG
|
||||||
|
|
||||||
|
Some example patterns:
|
||||||
|
'word(domu)' - form of the word domu
|
||||||
|
'lexeme(dom)' - any form of lexeme dom
|
||||||
|
'space' - space
|
||||||
|
'cat(<ADJ>)' - adjective
|
||||||
|
|
||||||
|
You can use * in patterns to make zero or more counts of word.
|
||||||
|
|
||||||
|
END
|
||||||
|
;
|
||||||
|
exit 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
die("$0: no pattern given. Run with -h to get help.\n") unless $pattern || $action !~ /g/;
|
||||||
|
|
||||||
|
die("$0: macro file not found") unless
|
||||||
|
$macrofile or
|
||||||
|
-e "$LIB_DIR/terms.m4" and $macrofile="$LIB_DIR/terms.m4";
|
||||||
|
|
||||||
|
my $preproc = ($action =~ /p/) ? ' fla | ' : '';
|
||||||
|
|
||||||
|
my $postproc = ($action =~ /P/) ? ' | unfla ' : '';
|
||||||
|
|
||||||
|
|
||||||
#this is our helper function to cut the re to get another tag
|
#this is our helper function to cut the re to get another tag
|
||||||
#it takes only one argument which is our pattern (after m4 processing)
|
#it takes only one argument which is our pattern (after m4 processing)
|
||||||
@ -83,68 +205,6 @@ sub restRe
|
|||||||
$temp;
|
$temp;
|
||||||
}
|
}
|
||||||
|
|
||||||
GetOptions("pattern|e=s" => \$pattern,
|
|
||||||
"eos|E=s" => \$eos,
|
|
||||||
"macros=s" => \$macrofile,
|
|
||||||
"define=s" => \$macrofile,
|
|
||||||
"command" => \$command,
|
|
||||||
"action=s" => \$action,
|
|
||||||
"help|h" => \$help,
|
|
||||||
"space|s" => \$explicit_space
|
|
||||||
);
|
|
||||||
|
|
||||||
if($help)
|
|
||||||
{
|
|
||||||
print <<'END'
|
|
||||||
Usage: mar [OPTIONS] [file ..]
|
|
||||||
|
|
||||||
Options:
|
|
||||||
--pattern -e PATTERN Pattern.
|
|
||||||
--bos -E PATTERN Segment serving as sentence beginning marker. [TODO]
|
|
||||||
--macros=FILE Read macrodefinitions from FILE. [TODO]
|
|
||||||
--define=FILE Add macrodefinitions from FILE. [TODO]
|
|
||||||
--action -a [p][s][P] Perform only indicated actions.
|
|
||||||
p - preprocess
|
|
||||||
s - search
|
|
||||||
P - postprocess
|
|
||||||
(default pgP)
|
|
||||||
--command Print generated shell command and exit.
|
|
||||||
--help -h Print help.
|
|
||||||
|
|
||||||
In patern you can put any tag. Tags should begin with the @ character.
|
|
||||||
They don't have to be closed.
|
|
||||||
They can't contain white spaces!
|
|
||||||
|
|
||||||
Note: If you don't define any custom tags, whole pattern will be taged with
|
|
||||||
default tags (begining of match and end of match)
|
|
||||||
|
|
||||||
Tags examples:
|
|
||||||
|
|
||||||
mar -e '@BEG cat(<ADJ>) @END'
|
|
||||||
it will find any adjectives in the text and tag them with surrounding tags
|
|
||||||
mar -e 'cat(<ADJ>) @MYTAG cat(<ADJ>)'
|
|
||||||
this will find two neighbouring adjectives and parcel them with tag MYTAG
|
|
||||||
|
|
||||||
Some example patterns:
|
|
||||||
'word(domu)' - form of the word domu
|
|
||||||
'lexeme(dom)' - any form of lexeme dom
|
|
||||||
'space' - space
|
|
||||||
'cat(<ADJ>)' - adjective
|
|
||||||
|
|
||||||
You can use * in patterns to make zero or more counts of word.
|
|
||||||
|
|
||||||
END
|
|
||||||
;
|
|
||||||
exit 0;
|
|
||||||
}
|
|
||||||
|
|
||||||
die("$0: no pattern given. Run with -h to get help.\n") unless $pattern || $action !~ /g/;
|
|
||||||
|
|
||||||
die("$0: macro file not found") unless -e "terms.m4" and $macrofile="terms.m4";
|
|
||||||
|
|
||||||
my $preproc = ($action =~ /p/) ? ' fla | ' : '';
|
|
||||||
|
|
||||||
my $postproc = ($action =~ /P/) ? ' | unfla ' : '';
|
|
||||||
|
|
||||||
#here we are preparing re for extended matching
|
#here we are preparing re for extended matching
|
||||||
my @tags;
|
my @tags;
|
||||||
@ -185,7 +245,8 @@ $patternmod =~ s/\\,/\\`\\`\\,''/g;
|
|||||||
# quoting commas in {m,n} r.e. operator
|
# quoting commas in {m,n} r.e. operator
|
||||||
$patternmod =~ s/(\{\d*),(\d*\})/\1\\`\\`,''\2/g;
|
$patternmod =~ s/(\{\d*),(\d*\})/\1\\`\\`,''\2/g;
|
||||||
#print "After m4:".$re."\n";
|
#print "After m4:".$re."\n";
|
||||||
my $re = `echo \"$patternmod\" | m4 --define=ENDOFSEGMENT='[[:cntrl:]]' $macrofile - 2>/dev/null`;
|
|
||||||
|
my $re = `echo \"$patternmod\" | m4 --define=ENDOFSEGMENT='[[:cntrl:]]' --define=MORFIELD=$morfield $macrofile - 2>/dev/null`;
|
||||||
|
|
||||||
die("Incorrect pattern (m4).") if $? >> 8;
|
die("Incorrect pattern (m4).") if $? >> 8;
|
||||||
|
|
||||||
@ -194,7 +255,7 @@ chomp $re;
|
|||||||
|
|
||||||
# <> expansion
|
# <> expansion
|
||||||
|
|
||||||
$re =~ s/<([^>]+)>/`echo $1 | .\/terms\.tag2re`/ge;
|
$re =~ s/<([^>]+)>/`echo $1 | $tags.tag2re`/ge;
|
||||||
|
|
||||||
# Perl-like special sequences
|
# Perl-like special sequences
|
||||||
$re =~ s/\./[^ [:cntrl:]]/g;
|
$re =~ s/\./[^ [:cntrl:]]/g;
|
||||||
|
Loading…
Reference in New Issue
Block a user