2008-03-11 14:02:41 +01:00
|
|
|
|
#!/usr/bin/perl
|
|
|
|
|
|
|
|
|
|
#package: UAM Text Tools
|
2008-03-19 12:29:21 +01:00
|
|
|
|
#component name: grp
|
|
|
|
|
#version: 1.0
|
|
|
|
|
#author: Tomasz Obrebski
|
2008-03-11 14:02:41 +01:00
|
|
|
|
|
|
|
|
|
use strict;
|
|
|
|
|
use Getopt::Long;
|
2008-03-19 12:29:21 +01:00
|
|
|
|
use File::HomeDir;
|
2008-03-11 14:02:41 +01:00
|
|
|
|
|
2008-03-19 12:29:21 +01:00
|
|
|
|
# katalog zawierajacy terms.m4
|
|
|
|
|
my $LIB_DIR="/usr/local/lib/utt";
|
2008-03-11 14:02:41 +01:00
|
|
|
|
|
|
|
|
|
my $systemconfigfile="/usr/local/etc/utt/grp.conf";
|
2008-03-19 12:29:21 +01:00
|
|
|
|
my $userconfigfile=home()."/.utt/grp.conf";
|
2008-03-11 14:02:41 +01:00
|
|
|
|
|
|
|
|
|
Getopt::Long::Configure('no_ignore_case_always');
|
|
|
|
|
|
|
|
|
|
my $help=0;
|
|
|
|
|
my $pattern=0;
|
|
|
|
|
my $matches_only=0;
|
|
|
|
|
my $macrofile=0;
|
|
|
|
|
my $define=0;
|
|
|
|
|
my $show_command=0;
|
|
|
|
|
my $action="pgP";
|
|
|
|
|
my $eos="seg(EOS)";
|
|
|
|
|
my $morfield='lem';
|
2008-05-07 14:48:49 +02:00
|
|
|
|
my $tags=0;
|
2008-03-11 14:02:41 +01:00
|
|
|
|
|
|
|
|
|
#read configuration files###########################
|
|
|
|
|
my $file;
|
|
|
|
|
foreach $file ($systemconfigfile, $userconfigfile){
|
|
|
|
|
if(open(CONFIG, $file)){
|
|
|
|
|
while (<CONFIG>) {
|
|
|
|
|
chomp;
|
|
|
|
|
s/#.*//;
|
|
|
|
|
s/^\s+//;
|
|
|
|
|
s/\s+$//;
|
|
|
|
|
next unless length;
|
|
|
|
|
my ($name, $value) = split(/\s*=\s*/, $_, 2);
|
|
|
|
|
if(($name eq "pattern")or($name eq "e")){
|
|
|
|
|
$pattern=$value;
|
|
|
|
|
}
|
|
|
|
|
elsif(($name eq "eos")or($name eq "E")){
|
|
|
|
|
$eos=$value;
|
|
|
|
|
}
|
|
|
|
|
elsif($name eq "morph"){
|
|
|
|
|
$morfield=$value;
|
|
|
|
|
}
|
|
|
|
|
elsif($name eq "macros"){
|
|
|
|
|
$macrofile=$value;
|
|
|
|
|
}
|
|
|
|
|
elsif($name eq "define"){
|
|
|
|
|
$define=$value;
|
|
|
|
|
}
|
|
|
|
|
elsif($name eq "command"){
|
|
|
|
|
$show_command=1;
|
|
|
|
|
}
|
|
|
|
|
elsif($name eq "action"){
|
|
|
|
|
$action;
|
|
|
|
|
}
|
2008-05-07 14:48:49 +02:00
|
|
|
|
elsif($name eq "tags"){
|
|
|
|
|
$tags=$value;
|
|
|
|
|
}
|
2008-03-11 14:02:41 +01:00
|
|
|
|
elsif(($name eq "help")or($name eq "h")){
|
|
|
|
|
$help=1;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
}
|
|
|
|
|
close CONFIG;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
#########################################################
|
|
|
|
|
|
|
|
|
|
GetOptions("pattern|e=s" => \$pattern,
|
|
|
|
|
"eos|E=s" => \$eos,
|
|
|
|
|
"morph=s" => \$morfield,
|
|
|
|
|
"macros=s" => \$macrofile,
|
|
|
|
|
"define=s" => \$macrofile,
|
|
|
|
|
"command" => \$show_command,
|
|
|
|
|
"action=s" => \$action,
|
2008-05-07 14:48:49 +02:00
|
|
|
|
"tags=s" => \$tags,
|
2008-03-11 14:02:41 +01:00
|
|
|
|
"help|h" => \$help);
|
|
|
|
|
|
|
|
|
|
if($help)
|
|
|
|
|
{
|
|
|
|
|
print <<'END'
|
|
|
|
|
Usage: gre [OPTIONS] [file ..]
|
|
|
|
|
|
|
|
|
|
Options:
|
|
|
|
|
--pattern -e PATTERN Pattern.
|
|
|
|
|
--eos -E PATTERN Segment serving as sentence delimiter.
|
|
|
|
|
--morph=STRING Field containing morphological information (default 'lem').
|
|
|
|
|
--macros=FILE Read macrodefinitions from FILE.
|
|
|
|
|
--define=FILE Add macrodefinitions from FILE.
|
|
|
|
|
--action -a [u][p][g][P] Perform only indicated actions.
|
|
|
|
|
u - uncompress with 'lzop -cd'
|
|
|
|
|
p - preprocess
|
|
|
|
|
g - grep
|
|
|
|
|
P - postprocess
|
|
|
|
|
(default pgP)
|
2008-05-07 14:48:49 +02:00
|
|
|
|
--tags=STRING Morphosyntactic tag format.
|
2008-03-11 14:02:41 +01:00
|
|
|
|
--command Print the shell command to be executed and exit.
|
|
|
|
|
--help -h Help.
|
|
|
|
|
END
|
|
|
|
|
;
|
|
|
|
|
exit 0;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
die("$0: no pattern given.\n") unless $pattern || $action !~ /g/;
|
|
|
|
|
|
|
|
|
|
die("$0: macro file not found") unless
|
|
|
|
|
$macrofile or
|
|
|
|
|
-e "$LIB_DIR/terms.m4" and $macrofile="$LIB_DIR/terms.m4";
|
|
|
|
|
|
2008-05-07 14:48:49 +02:00
|
|
|
|
die("$0: undefined tagset format (tags option missing)") unless
|
|
|
|
|
$tags;
|
|
|
|
|
|
|
|
|
|
die("$0: $tags.tag2re program not found") unless
|
|
|
|
|
1; #JAK NAPISAC WARUNEK???
|
|
|
|
|
|
|
|
|
|
|
2008-03-11 14:02:41 +01:00
|
|
|
|
my $uncompress = ($action =~ /u/) ? ' lzop -cd | ' : '';
|
|
|
|
|
my $preproc = ($action =~ /p/) ? ' fla | ' : '';
|
|
|
|
|
|
|
|
|
|
my $postproc = ($action =~ /P/) ? ' | unfla ' : '';
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# discarding spaces
|
|
|
|
|
$pattern =~ s/\s+/\\`'/g; #`
|
|
|
|
|
# quoting escaped commas
|
|
|
|
|
$pattern =~ s/\\,/\\`\\`\\,''/g;
|
|
|
|
|
# quoting commas in {m,n} r.e. operator
|
|
|
|
|
$pattern =~ s/(\{\d*),(\d*\})/\1\\`\\`,''\2/g;
|
|
|
|
|
|
|
|
|
|
my $grepre = `echo \"$pattern\" | m4 --define=ENDOFSEGMENT='[[:cntrl:]]' --define=MORFIELD=$morfield $macrofile - 2>/dev/null`;
|
|
|
|
|
|
|
|
|
|
die("Incorrect pattern (m4).") if $? >> 8;
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
chomp $grepre;
|
|
|
|
|
|
|
|
|
|
# <> expansion
|
|
|
|
|
|
2008-05-07 14:48:49 +02:00
|
|
|
|
$grepre =~ s/<([^>]+)>/`echo $1 | $tags.tag2re`/ge;
|
2008-03-11 14:02:41 +01:00
|
|
|
|
|
|
|
|
|
$grepre =~ s/\./[^ [:cntrl:]]/g;
|
|
|
|
|
|
|
|
|
|
$grepre =~ s/\\s/[ ]/g;
|
|
|
|
|
$grepre =~ s/\\S/[^ [:cntrl:]]/g;
|
|
|
|
|
$grepre =~ s/\\d/[0-9]/g;
|
|
|
|
|
$grepre =~ s/\\D/[^0-9 [:cntrl:]]/g;
|
|
|
|
|
$grepre =~ s/\\w/[a-z<><7A><EFBFBD><EFBFBD><EFBFBD>A-Z<><5A>ʣ<EFBFBD>Ӧ<EFBFBD><D3A6>0-9_]/g;
|
|
|
|
|
$grepre =~ s/\\W/[^a-z<><7A><EFBFBD><EFBFBD><EFBFBD>A-Z<><5A>ʣ<EFBFBD>Ӧ<EFBFBD><D3A6>0-9_ [:cntrl:]]/g;
|
|
|
|
|
# extensions
|
|
|
|
|
$grepre =~ s/\\l/[a-z<><7A><EFBFBD><EFBFBD><EFBFBD>]/g; #lowercase letter
|
|
|
|
|
$grepre =~ s/\\L/[A-Z<><5A>ʣ<EFBFBD>Ӧ<EFBFBD><D3A6>]/g; #upercase letter
|
|
|
|
|
|
|
|
|
|
my $grep_command = ($action =~ /g/) ? "egrep '$grepre'" : " cat ";
|
|
|
|
|
|
|
|
|
|
if($show_command)
|
|
|
|
|
{
|
|
|
|
|
print $grep_command."\n";
|
|
|
|
|
exit 0;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
#print $preproc.$grep_command.$postproc."\n";
|
|
|
|
|
|
|
|
|
|
exec $preproc.$grep_command.$postproc;
|