utt/src/mar.pl
Mateusz Hromada dc2176950f Migration to new build system.
* mar moved and checked
2009-06-09 20:17:04 +02:00

337 lines
7.7 KiB
Perl
Raw Blame History

#!/usr/bin/perl
#package: UAM Text Tools
#component: mar
#version: 1.0
#author: Marcin Walas
#this program tags the tokenized file with given tags
#tags can be given in any order and configuration through the expression
#which is one of the parametres of the script
#contact: d287572@atos.wmid.amu.edu.pl, walasiek@gmail.com
my $version = '1.0';
use lib "/usr/local/lib/utt";
use lib "$ENV{'HOME'}/.local/lib/utt";
use strict;
use Getopt::Long;
use File::HomeDir;
use attr;
my $LIB_DIR="/usr/local/lib/utt";
my $systemconfigfile='/usr/local/etc/utt/mar.conf';
my $userconfigfile=home()."/.utt/mar.conf";
Getopt::Long::Configure('no_ignore_case_always');
my $help=0;
my $pattern=0;
my $macrofile=0;
my $define=0;
my $command=0;
my $action="pgP";
my $eos="seg(EOS)";
my $explicit_space=0;
my $morfield='lem';
my $tags=0;
my $show_version = 0;
#read configuration files###########################
my $file;
foreach $file ($systemconfigfile, $userconfigfile){
if(open(CONFIG, $file)){
while (<CONFIG>) {
chomp;
s/#.*//;
s/^\s+//;
s/\s+$//;
next unless length;
my ($name, $value) = split(/\s*=\s*/, $_, 2);
if(($name eq "pattern")or($name eq "e")){
$pattern=$value;
}
elsif($name eq "eos"){
$eos=$value;
}
elsif($name eq "macros"){
$macrofile=$value;
}
elsif($name eq "tags"){
$tags=$value;
}
elsif($name eq "morph"){
$morfield=$value;
}
elsif($name eq "command"){
$command=1;
}
elsif($name eq "action"){
$action=$value;
}
elsif($name eq "space"){
$explicit_space=1;
}
elsif(($name eq "help")or($name eq "h")){
$help=1;
}
}
close CONFIG;
}
}
#########################################################
GetOptions("pattern|e=s" => \$pattern,
"eos|E=s" => \$eos,
"macros=s" => \$macrofile,
"define=s" => \$macrofile,
"command" => \$command,
"action=s" => \$action,
"help|h" => \$help,
"space|s" => \$explicit_space,
"version|v" => \$show_version,
);
if($show_version){
print "Version: $version\n";
exit 0;
}
if($help)
{
print <<'END'
Usage: mar [OPTIONS] [file ..]
Options:
--pattern -e PATTERN Pattern.
--eos -E PATTERN Segment serving as sentence beginning marker. [TODO]
--macros=FILE Read macrodefinitions from FILE. [TODO]
--define=FILE Add macrodefinitions from FILE. [TODO]
--action -a [p][s][P] Perform only indicated actions.
p - preprocess
s - search
P - postprocess
(default psP)
--command Print generated shell command and exit.
--help -h Print help.
--version -v Script version
In patern you can put any tag. Tags should begin with the @ character.
They don't have to be closed.
They can't contain white spaces!
Note: If you don't define any custom tags, whole pattern will be taged with
default tags (begining of match and end of match)
Tags examples:
mar -e '@BEG cat(<ADJ>) @END'
it will find any adjectives in the text and tag them with surrounding tags
mar -e 'cat(<ADJ>) @MYTAG cat(<ADJ>)'
this will find two neighbouring adjectives and parcel them with tag MYTAG
Some example patterns:
'word(domu)' - form of the word domu
'lexeme(dom)' - any form of lexeme dom
'space' - space
'cat(<ADJ>)' - adjective
You can use * in patterns to make zero or more counts of word.
END
;
exit 0;
}
die("$0: no pattern given. Run with -h to get help.\n") unless $pattern || $action !~ /g/;
die("$0: macro file not found") unless
$macrofile or
-e "$LIB_DIR/terms.m4" and $macrofile="$LIB_DIR/terms.m4";
my $preproc = ($action =~ /p/) ? ' fla | ' : '';
my $postproc = ($action =~ /P/) ? ' | unfla ' : '';
#this is our help function to cut the re to get another tag
#it takes only one argument which is our patern (after m4 processing)
#returns: the first root-level brace with content
sub cutRe
{
my $i = 0;
my $level = 0;
my $text = $_[0];
my $temp;
for( $i =0; $i < (length $text);$i++)
{
$temp = substr($text, $i,1);
if( $temp eq "(")
{#we have an opening
$level++;
}
elsif ( $temp eq ")")
{#we close
$level--;
}
if ( $level == 0)
{
$temp = substr($text,0,$i+1);
last;
}
}
$temp;
}
#the same function as above althought it returns everything after the
#first root level brace
sub restRe
{
my $i = 0;
my $level = 0;
my $text = $_[0];
my $temp;
for( $i =0; $i < (length $text);$i++)
{
$temp = substr($text, $i,1);
if( $temp eq "(")
{#we have an opening
$level++;
}
elsif ( $temp eq ")")
{#we close
$level--;
}
if ( $level == 0)
{ #we cut everything in the begining
$temp = substr($text,$i+1);
last;
}
}
$temp;
}
#here we are preparing re for extended matching
my @tags;
#we must find what our the tags
#some pattern adjustment
my $end = 0;
my $temp = " ".$pattern." ";
$temp =~ s/(\@[^ ]*) (\@[^ ]* )/\1 \2/g;
$pattern = $temp;
while ($end != 1)
{
#we seek for the first tag in pattern
if ($temp =~ /^.*?\@(.*?) /)
{
#we add this to tags array
push (@tags, $1);
#and cut the pattern
$temp =~ s/^.*?\@(.*?) / /;
#print $temp."\n";
}
else
{
#if we dont find any tags we end
$end = 1;
}
}
#here we have our patern with tags removed (we set sections of ()) between tags
my $patternmod = "( ".$pattern." )";
$patternmod =~ s/\s@.*?\s/\)\(/g;
#discarding spaces
$patternmod =~ s/\s+/\\`'/g; #`
# quoting escaped commas
$patternmod =~ s/\\,/\\`\\`\\,''/g;
# quoting commas in {m,n} r.e. operator
$patternmod =~ s/(\{\d*),(\d*\})/\1\\`\\`,''\2/g;
#print "After m4:".$re."\n";
my $re = `echo \"$patternmod\" | m4 --define=ENDOFSEGMENT='[[:cntrl:]]' --define=MORFIELD=$morfield $macrofile - 2>/dev/null`;
die("Incorrect pattern (m4).") if $? >> 8;
chomp $re;
# <> expansion
$re =~ s/<([^>]+)>/`echo $1 | $tags.tag2re`/ge;
# Perl-like special sequences
$re =~ s/\./[^ [:cntrl:]]/g;
$re =~ s/\\s/[ ]/g;
$re =~ s/\\S/[^ [:cntrl:]]/g;
$re =~ s/\\d/[0-9]/g;
$re =~ s/\\D/[^0-9 [:cntrl:]]/g;
$re =~ s/\\w/[a-z<><7A><EFBFBD><EFBFBD><EFBFBD>󶼿A-Z<><5A>ʣ<EFBFBD>Ӧ<EFBFBD><D3A6>0-9_]/g;
$re =~ s/\\W/[^a-z<><7A><EFBFBD><EFBFBD><EFBFBD>󶼿A-Z<><5A>ʣ<EFBFBD>Ӧ<EFBFBD><D3A6>0-9_ [:cntrl:]]/g;
# extensions
$re =~ s/\\l/[a-z<><7A><EFBFBD><EFBFBD><EFBFBD>󶼿]/g; #lowercase letter
$re =~ s/\\L/[A-Z<><5A>ʣ<EFBFBD>Ӧ<EFBFBD><D3A6>]/g; #upercase letter
my $sedcommand;
my $grepcommand;
#now we must built a sed script from our re
#we do this by cuting our re each tag until we cut them all
#if an user dint input any tags we do our default
my $defBOM = "BOM";
my $defEOM = "EOM";
my $defTempTagBeg = "####TempTAGBEG####";
my $defTempTagEnd = "####TempTAGEND####";
if (@tags == 0)
{
$sedcommand = "sed -r 's/($re)/\\500 $defBOM *\\f\\1###EOM###/g; s/###EOM###([0-9]+)/\\1 00 $defEOM *\\f\\1/g'";
}
else #we have custom tags
{
#first tag is easy to tag :)
my $sedscript="sed -r 's/($re)/\\600 $defTempTagBeg *\\f\\1###EOM###/g;s/###EOM###([0-9]+)/\\1 00 $defTempTagEnd *\\f\\1/g;";
#after first step we have temp tagged parts of input matching re
#now we need to insert our custom tags
#we will find temp tags and process our input
my $i = 0;
#copy of re which will be cut
my $rec = $re;
my $restre = $re;
for ($i = 0 ; $i < @tags ; $i++)
{
#re cutting
$rec = cutRe($restre);
$restre = restRe($restre);
if ($rec =~ / *\( *\) */)
{
$sedscript = $sedscript."s/([0-9]+) 00 $defTempTagBeg \\*\\f([0-9]+)/\\2 00 $tags[$i] *\\f\\2 00 $defTempTagBeg *\\f\\2/g;";
}
else
{
$sedscript = $sedscript."s/[0-9]+ 00 $defTempTagBeg \\*\\f($rec)/\\1###EOM###/g;s/###EOM###([0-9]+)/\\1 00 $tags[$i] *\\f\\1 00 $defTempTagBeg *\\f\\1/g;";
}
}
$sedcommand = $sedscript."s/[0-9]+ 00 $defTempTagBeg \\*\\f//g;s/[0-9]+ 00 $defTempTagEnd \\*\\f//g'";
}
if($command)
{
print $sedcommand."\n";
exit 0;
}
exec $preproc.$sedcommand.$postproc;