mar obsluguje opcje morph, tags i umie czytac z configa
git-svn-id: svn://atos.wmid.amu.edu.pl/utt@39 e293616e-ec6a-49c2-aa92-f4a8b91c5d16
This commit is contained in:
parent
f5d3b20097
commit
756213194a
189
app/src/mar/mar
189
app/src/mar/mar
@ -12,9 +12,15 @@
|
||||
|
||||
use strict;
|
||||
use Getopt::Long;
|
||||
use File::HomeDir;
|
||||
|
||||
use attr;
|
||||
|
||||
|
||||
my $LIB_DIR="/usr/local/lib/utt";
|
||||
my $systemconfigfile='/usr/local/etc/utt/mar.conf';
|
||||
my $userconfigfile=home()."/.utt/mar.conf";
|
||||
|
||||
Getopt::Long::Configure('no_ignore_case_always');
|
||||
|
||||
my $help=0;
|
||||
@ -25,6 +31,122 @@ my $command=0;
|
||||
my $action="pgP";
|
||||
my $eos="seg(EOS)";
|
||||
my $explicit_space=0;
|
||||
my $morfield='lem';
|
||||
my $tags=0;
|
||||
|
||||
# Read configuration files ###########################
#
# Settings are read from the system-wide file first and from the per-user
# file second, so a value in the user file overwrites the system one.
# A file that cannot be opened is skipped silently.
#
# Format: one "name = value" pair per line.  Everything after '#' is a
# comment, surrounding whitespace is ignored and empty lines are skipped.
my $file;
foreach $file ($systemconfigfile, $userconfigfile){
    # Three-argument open with a lexical handle: the file name can never be
    # interpreted as an open mode or a pipe, and the handle is scoped to
    # this iteration instead of being a package-global bareword.
    open(my $config, '<', $file) or next;

    # A named line variable keeps the global $_ untouched.
    while (my $line = <$config>) {
        chomp $line;
        $line =~ s/#.*//;       # strip comment
        $line =~ s/^\s+//;      # strip leading whitespace
        $line =~ s/\s+$//;      # strip trailing whitespace
        next unless length $line;

        # Split on the first '=' only, so the value may itself contain '='.
        my ($name, $value) = split(/\s*=\s*/, $line, 2);

        if(($name eq "pattern")or($name eq "e")){
            $pattern=$value;
        }
        elsif($name eq "eos"){
            $eos=$value;
        }
        elsif($name eq "macros"){
            $macrofile=$value;
        }
        elsif($name eq "tags"){
            $tags=$value;
        }
        elsif($name eq "morph"){
            $morfield=$value;
        }
        # The three flags below are switched on by the mere presence of the
        # key; whatever value follows '=' is ignored.
        elsif($name eq "command"){
            $command=1;
        }
        elsif($name eq "action"){
            $action=$value;
        }
        elsif($name eq "space"){
            $explicit_space=1;
        }
        elsif(($name eq "help")or($name eq "h")){
            $help=1;
        }
        # Unknown keys are ignored.
    }
    close $config;
}
#########################################################
|
||||
|
||||
# Command-line options.  Each option writes to the same variable that the
# configuration files set, so a value given on the command line replaces
# the configured one.
#
# --morph and --tags expose $morfield and $tags, which could previously be
# set only from a configuration file.
GetOptions("pattern|e=s" => \$pattern,
           "eos|E=s"     => \$eos,
           "macros=s"    => \$macrofile,
           "define=s"    => \$macrofile,
           "command"     => \$command,
           "action=s"    => \$action,
           "help|h"      => \$help,
           "space|s"     => \$explicit_space,
           "morph=s"     => \$morfield,
           "tags=s"      => \$tags
          );
|
||||
|
||||
|
||||
|
||||
|
||||
# Print the usage text and exit successfully when help was requested.
#
# The action letters listed here are p, g and P: the script tests $action
# against /p/, /g/ and /P/, and the default action string is "pgP".
# NOTE(review): the text documents "--bos -E", while the option actually
# registered with GetOptions is "eos|E" -- confirm which name is intended.
if($help)
{
    # Single-quoted heredoc: the text is printed verbatim, nothing in it
    # is interpolated (so '@BEG', '@END' etc. are safe).
    print <<'END';
Usage: mar [OPTIONS] [file ..]

Options:
   --pattern -e PATTERN        Pattern.
   --bos -E PATTERN            Segment serving as sentence beginning marker. [TODO]
   --macros=FILE               Read macrodefinitions from FILE. [TODO]
   --define=FILE               Add macrodefinitions from FILE. [TODO]
   --action -a [p][g][P]       Perform only indicated actions.
                                   p - preprocess
                                   g - search
                                   P - postprocess
                               (default pgP)
   --command                   Print generated shell command and exit.
   --help -h                   Print help.

In pattern you can put any tag. Tags should begin with the @ character.
They don't have to be closed.
They can't contain white spaces!

Note: If you don't define any custom tags, whole pattern will be tagged with
      default tags (beginning of match and end of match)

Tags examples:

mar -e '@BEG cat(<ADJ>) @END'
   it will find any adjectives in the text and tag them with surrounding tags
mar -e 'cat(<ADJ>) @MYTAG cat(<ADJ>)'
   this will find two neighbouring adjectives and parcel them with tag MYTAG

Some example patterns:
'word(domu)'   - form of the word domu
'lexeme(dom)'  - any form of lexeme dom
'space'        - space
'cat(<ADJ>)'   - adjective

You can use * in patterns to make zero or more counts of word.

END
    exit 0;
}
|
||||
|
||||
# A pattern is mandatory whenever the action string contains 'g'.
if (!$pattern && $action =~ /g/) {
    die("$0: no pattern given. Run with -h to get help.\n");
}

# If no macro file was configured, fall back to terms.m4 in the library
# directory; give up if that file does not exist either.
unless ($macrofile) {
    if (-e "$LIB_DIR/terms.m4") {
        $macrofile = "$LIB_DIR/terms.m4";
    }
    else {
        die("$0: macro file not found");
    }
}

# Pipeline stage placed before the search: present only for action 'p'.
my $preproc = '';
if ($action =~ /p/) {
    $preproc = ' fla | ';
}

# Pipeline stage placed after the search: present only for action 'P'.
my $postproc = '';
if ($action =~ /P/) {
    $postproc = ' | unfla ';
}
|
||||
|
||||
|
||||
#this is our help function to cut the re to get another tag
|
||||
#it takes only one argument which is our patern (after m4 processing)
|
||||
@ -83,68 +205,6 @@ sub restRe
|
||||
$temp;
|
||||
}
|
||||
|
||||
GetOptions("pattern|e=s" => \$pattern,
|
||||
"eos|E=s" => \$eos,
|
||||
"macros=s" => \$macrofile,
|
||||
"define=s" => \$macrofile,
|
||||
"command" => \$command,
|
||||
"action=s" => \$action,
|
||||
"help|h" => \$help,
|
||||
"space|s" => \$explicit_space
|
||||
);
|
||||
|
||||
if($help)
|
||||
{
|
||||
print <<'END'
|
||||
Usage: mar [OPTIONS] [file ..]
|
||||
|
||||
Options:
|
||||
--pattern -e PATTERN Pattern.
|
||||
--bos -E PATTERN Segment serving as sentence beginning marker. [TODO]
|
||||
--macros=FILE Read macrodefinitions from FILE. [TODO]
|
||||
--define=FILE Add macrodefinitions from FILE. [TODO]
|
||||
--action -a [p][s][P] Perform only indicated actions.
|
||||
p - preprocess
|
||||
s - search
|
||||
P - postprocess
|
||||
(default pgP)
|
||||
--command Print generated shell command and exit.
|
||||
--help -h Print help.
|
||||
|
||||
In patern you can put any tag. Tags should begin with the @ character.
|
||||
They don't have to be closed.
|
||||
They can't contain white spaces!
|
||||
|
||||
Note: If you don't define any custom tags, whole pattern will be taged with
|
||||
default tags (begining of match and end of match)
|
||||
|
||||
Tags examples:
|
||||
|
||||
mar -e '@BEG cat(<ADJ>) @END'
|
||||
it will find any adjectives in the text and tag them with surrounding tags
|
||||
mar -e 'cat(<ADJ>) @MYTAG cat(<ADJ>)'
|
||||
this will find two neighbouring adjectives and parcel them with tag MYTAG
|
||||
|
||||
Some example patterns:
|
||||
'word(domu)' - form of the word domu
|
||||
'lexeme(dom)' - any form of lexeme dom
|
||||
'space' - space
|
||||
'cat(<ADJ>)' - adjective
|
||||
|
||||
You can use * in patterns to make zero or more counts of word.
|
||||
|
||||
END
|
||||
;
|
||||
exit 0;
|
||||
}
|
||||
|
||||
die("$0: no pattern given. Run with -h to get help.\n") unless $pattern || $action !~ /g/;
|
||||
|
||||
die("$0: macro file not found") unless -e "terms.m4" and $macrofile="terms.m4";
|
||||
|
||||
my $preproc = ($action =~ /p/) ? ' fla | ' : '';
|
||||
|
||||
my $postproc = ($action =~ /P/) ? ' | unfla ' : '';
|
||||
|
||||
#here we are preparing re for extended matching
|
||||
my @tags;
|
||||
@ -185,7 +245,8 @@ $patternmod =~ s/\\,/\\`\\`\\,''/g;
|
||||
# quoting commas in {m,n} r.e. operator
|
||||
$patternmod =~ s/(\{\d*),(\d*\})/\1\\`\\`,''\2/g;
|
||||
#print "After m4:".$re."\n";
|
||||
my $re = `echo \"$patternmod\" | m4 --define=ENDOFSEGMENT='[[:cntrl:]]' $macrofile - 2>/dev/null`;
|
||||
|
||||
my $re = `echo \"$patternmod\" | m4 --define=ENDOFSEGMENT='[[:cntrl:]]' --define=MORFIELD=$morfield $macrofile - 2>/dev/null`;
|
||||
|
||||
die("Incorrect pattern (m4).") if $? >> 8;
|
||||
|
||||
@ -194,7 +255,7 @@ chomp $re;
|
||||
|
||||
# <> expansion
|
||||
|
||||
$re =~ s/<([^>]+)>/`echo $1 | .\/terms\.tag2re`/ge;
|
||||
$re =~ s/<([^>]+)>/`echo $1 | $tags.tag2re`/ge;
|
||||
|
||||
# Perl-like special sequences
|
||||
$re =~ s/\./[^ [:cntrl:]]/g;
|
||||
|
Loading…
Reference in New Issue
Block a user