utt/src/mar.pl

#!/usr/bin/perl

#package:	UAM Text Tools
#component:	mar
#version:	1.0
#author:	Marcin Walas

#this program tags the tokenized file with given tags
#tags can be given in any order and configuration through the expression
#which is one of the parametres of the script
#contact: d287572@atos.wmid.amu.edu.pl, walasiek@gmail.com

my $version = '1.0';

use lib "/usr/local/lib/utt";
use lib "$ENV{'HOME'}/.local/lib/utt";

use strict;
use Getopt::Long;
use File::HomeDir;

use attr;


my $LIB_DIR="/usr/local/lib/utt";
my $systemconfigfile='/usr/local/etc/utt/mar.conf';
my $userconfigfile=home()."/.utt/mar.conf";

Getopt::Long::Configure('no_ignore_case_always');

my $help=0;
my $pattern=0;
my $macrofile=0;
my $define=0;
my $command=0;
my $action="pgP";
my $eos="seg(EOS)";
my $explicit_space=0;
my $morfield='lem';
my $tags=0;
my $show_version = 0;

#read configuration files###########################
my $file;
foreach $file ($systemconfigfile, $userconfigfile){
  if(open(CONFIG, $file)){
  	while (<CONFIG>) {
  		chomp;
      		s/#.*//;
	      	s/^\s+//;
      		s/\s+$//;
    		next unless length;
    		my ($name, $value) = split(/\s*=\s*/, $_, 2);
    		if(($name eq "pattern")or($name eq "e")){
			$pattern=$value;
    		}
    		elsif($name eq "eos"){
			$eos=$value;
    		}
    		elsif($name eq "macros"){
			$macrofile=$value;
    		}
    		elsif($name eq "tags"){
    			$tags=$value;
    		}
    		elsif($name eq "morph"){
			$morfield=$value;
    		}
    		elsif($name eq "command"){
			$command=1;
    		}
    		elsif($name eq "action"){
			$action=$value;
    		}
    		elsif($name eq "space"){
			$explicit_space=1;
    		}
    		elsif(($name eq "help")or($name eq "h")){
			$help=1;
    		}

	}
  	close CONFIG;
  }
}
#########################################################

GetOptions("pattern|e=s" => \$pattern,
	   "eos|E=s" => \$eos,
	   "macros=s" => \$macrofile,
	   "define=s" => \$macrofile,
	   "command" => \$command,
	   "action=s" => \$action,
	   "help|h" => \$help,
	   "space|s" => \$explicit_space,
       "version|v" => \$show_version,
   );


if($show_version){
    print "Version: $version\n";
    exit 0;
}

if($help)
{
    print <<'END'
Usage: mar [OPTIONS] [file ..]

Options:
   --pattern -e	PATTERN		Pattern.
   --eos -E PATTERN             Segment serving as sentence beginning marker. [TODO]
   --macros=FILE		Read macrodefinitions from FILE. [TODO]
   --define=FILE		Add macrodefinitions from FILE. [TODO]
   --action -a [p][s][P]	Perform only indicated actions.
				    p - preprocess
				    s - search
				    P - postprocess
				(default psP)
   --command			Print generated shell command and exit.
   --help -h			Print help.
   --version -v         Script version

In patern you can put any tag. Tags should begin with the @ character.
They don't have to be closed.
They can't contain white spaces!

Note: If you don't define any custom tags, whole pattern will be taged with
      default tags (begining of match and end of match)

Tags examples:

mar -e '@BEG cat(<ADJ>) @END'
  it will find any adjectives in the text and tag them with surrounding tags
mar -e 'cat(<ADJ>) @MYTAG cat(<ADJ>)'
  this will find two neighbouring adjectives and parcel them with tag MYTAG

Some example patterns:
'word(domu)'  - form of the word domu
'lexeme(dom)' - any form of lexeme dom
'space'       - space
'cat(<ADJ>)'  - adjective

You can use * in patterns to make zero or more counts of word.

END
;
    exit 0;
}

die("$0: no pattern given. Run with -h to get help.\n") unless $pattern || $action !~ /g/;

die("$0: macro file not found") unless
    $macrofile or
    -e "$LIB_DIR/terms.m4" and $macrofile="$LIB_DIR/terms.m4";

my $preproc    = ($action =~ /p/) ? ' fla  | '  : '';

my $postproc   = ($action =~ /P/) ? ' | unfla '  : '';


#this is our help function to  cut the re to get another tag
#it takes only one argument which is our patern (after m4 processing)
#returns: the first root-level brace with content
sub cutRe
{
 my $i = 0;
 my $level = 0;
 my $text = $_[0];
 my $temp;
 for( $i =0; $i < (length $text);$i++)
 {
	 $temp = substr($text, $i,1);
	 if( $temp eq "(")
	 {#we have an opening
		 $level++;
	 }
	 elsif ( $temp eq ")")
	 {#we close
		 $level--;
	 }
	 if ( $level == 0)
	 {
		$temp  = substr($text,0,$i+1);
		last;
	 }
 }
 $temp;
}

#the same function as above althought it returns everything after the
#first root level brace
sub restRe
{
 my $i = 0;
 my $level = 0;
 my $text = $_[0];
 my $temp;
 for( $i =0; $i < (length $text);$i++)
 {
	 $temp = substr($text, $i,1);
	 if( $temp eq "(")
	 {#we have an opening
		 $level++;
	 }
	 elsif ( $temp eq ")")
	 {#we close
		 $level--;
	 }
	 if ( $level == 0)
	 { #we cut everything in the begining
		$temp  = substr($text,$i+1);
		last;
	 }
 }
 $temp;
}


#here we are preparing re for extended matching
my @tags;

#we must find what our the tags
#some pattern adjustment
my $end = 0;
my $temp = " ".$pattern." ";
$temp =~ s/(\@[^ ]*) (\@[^ ]* )/\1  \2/g;
$pattern = $temp;

while ($end != 1)
{
	#we seek for the first tag in pattern
 if ($temp =~ /^.*?\@(.*?) /)
 {
	 #we add this to tags array
	 push (@tags, $1);
	 #and cut the pattern
	 $temp =~ s/^.*?\@(.*?) / /;
	 #print $temp."\n";
 }
 else
 {
	 #if we dont find any tags we end
	 $end = 1;
 }
}

#here we have our patern with tags removed (we set sections of ()) between tags
my $patternmod = "( ".$pattern." )";
$patternmod =~ s/\s@.*?\s/\)\(/g;

#discarding spaces
$patternmod =~ s/\s+/\\`'/g; #`
# quoting escaped commas
$patternmod =~ s/\\,/\\`\\`\\,''/g;
# quoting commas in {m,n} r.e. operator
$patternmod =~ s/(\{\d*),(\d*\})/\1\\`\\`,''\2/g;
#print "After m4:".$re."\n";

my $re = `echo \"$patternmod\" | m4 --define=ENDOFSEGMENT='[[:cntrl:]]' --define=MORFIELD=$morfield $macrofile - 2>/dev/null`;

die("Incorrect pattern (m4).") if $? >> 8;


chomp $re;

# <> expansion

$re =~ s/<([^>]+)>/`echo $1 | $tags.tag2re`/ge;

# Perl-like special sequences
$re =~ s/\./[^ [:cntrl:]]/g;
$re =~ s/\\s/[ ]/g;
$re =~ s/\\S/[^ [:cntrl:]]/g;
$re =~ s/\\d/[0-9]/g;
$re =~ s/\\D/[^0-9 [:cntrl:]]/g;
$re =~ s/\\w/[a-z<><7A><EFBFBD><EFBFBD><EFBFBD>󶼿A-Z<><5A>ʣ<EFBFBD>Ӧ<EFBFBD><D3A6>0-9_]/g;
$re =~ s/\\W/[^a-z<><7A><EFBFBD><EFBFBD><EFBFBD>󶼿A-Z<><5A>ʣ<EFBFBD>Ӧ<EFBFBD><D3A6>0-9_ [:cntrl:]]/g;
# extensions
$re =~ s/\\l/[a-z<><7A><EFBFBD><EFBFBD><EFBFBD>󶼿]/g; #lowercase letter
$re =~ s/\\L/[A-Z<><5A>ʣ<EFBFBD>Ӧ<EFBFBD><D3A6>]/g; #upercase letter

my $sedcommand;
my $grepcommand;

#now we must built a sed script from our re
#we do this by cuting our re each tag until we cut them all
#if an user dint input any tags we do our default
my $defBOM = "BOM";
my $defEOM = "EOM";
my $defTempTagBeg = "####TempTAGBEG####";
my $defTempTagEnd = "####TempTAGEND####";

if (@tags == 0)
{
	$sedcommand = "sed -r 's/($re)/\\500 $defBOM *\\f\\1###EOM###/g; s/###EOM###([0-9]+)/\\1 00 $defEOM *\\f\\1/g'";
}
else #we have custom tags
{
	#first tag is easy to tag :)
	my $sedscript="sed -r 's/($re)/\\600 $defTempTagBeg *\\f\\1###EOM###/g;s/###EOM###([0-9]+)/\\1 00 $defTempTagEnd *\\f\\1/g;";
    #after first step we have temp tagged parts of input matching re
    #now we need to insert our custom tags
    #we will find temp tags and process our input

	my $i = 0;
	#copy of re which will be cut
	my $rec = $re;
	my $restre = $re;

	for ($i = 0 ; $i < @tags ; $i++)
	{
	 #re cutting
	 $rec = cutRe($restre);
	 $restre = restRe($restre);
	 if ($rec =~ / *\( *\) */)
	 {
	    $sedscript = $sedscript."s/([0-9]+) 00 $defTempTagBeg \\*\\f([0-9]+)/\\2 00 $tags[$i] *\\f\\2 00 $defTempTagBeg *\\f\\2/g;";
	 }
	 else
	 {
	    $sedscript = $sedscript."s/[0-9]+ 00 $defTempTagBeg \\*\\f($rec)/\\1###EOM###/g;s/###EOM###([0-9]+)/\\1 00 $tags[$i] *\\f\\1 00 $defTempTagBeg *\\f\\1/g;";
	 }

	}

	$sedcommand = $sedscript."s/[0-9]+ 00 $defTempTagBeg \\*\\f//g;s/[0-9]+ 00 $defTempTagEnd \\*\\f//g'";
}

if($command)
{
    print $sedcommand."\n";
    exit 0;
}
exec $preproc.$sedcommand.$postproc;