168 lines
4.1 KiB
Plaintext
168 lines
4.1 KiB
Plaintext
|
#!/usr/bin/perl -w
|
||
|
|
||
|
# Sample Tokenizer
|
||
|
# written by Josh Schroeder, based on code by Philipp Koehn
|
||
|
|
||
|
binmode(STDIN, ":utf8");
|
||
|
binmode(STDOUT, ":utf8");
|
||
|
|
||
|
use FindBin qw($Bin);
|
||
|
use strict;
|
||
|
#use Time::HiRes;
|
||
|
|
||
|
my $mydir = "$Bin/nonbreaking_prefixes";
|
||
|
|
||
|
my %NONBREAKING_PREFIX = ();
|
||
|
my $language = "en";
|
||
|
my $QUIET = 0;
|
||
|
my $HELP = 0;
|
||
|
|
||
|
#my $start = [ Time::HiRes::gettimeofday( ) ];
|
||
|
|
||
|
while (@ARGV) {
|
||
|
$_ = shift;
|
||
|
/^-l$/ && ($language = shift, next);
|
||
|
/^-q$/ && ($QUIET = 1, next);
|
||
|
/^-h$/ && ($HELP = 1, next);
|
||
|
}
|
||
|
|
||
|
if ($HELP) {
|
||
|
print "Usage ./tokenizer.perl (-l [en|de|...]) < textfile > tokenizedfile\n";
|
||
|
exit;
|
||
|
}
|
||
|
if (!$QUIET) {
|
||
|
print STDERR "Tokenizer v3\n";
|
||
|
print STDERR "Language: $language\n";
|
||
|
}
|
||
|
|
||
|
load_prefixes($language,\%NONBREAKING_PREFIX);
|
||
|
|
||
|
if (scalar(%NONBREAKING_PREFIX) eq 0){
|
||
|
print STDERR "Warning: No known abbreviations for language '$language'\n";
|
||
|
}
|
||
|
|
||
|
while(<STDIN>) {
|
||
|
if (/^<.+>$/ || /^\s*$/) {
|
||
|
#don't try to tokenize XML/HTML tag lines
|
||
|
print $_;
|
||
|
}
|
||
|
else {
|
||
|
print &tokenize($_);
|
||
|
}
|
||
|
}
|
||
|
|
||
|
#my $duration = Time::HiRes::tv_interval( $start );
|
||
|
#print STDERR ("EXECUTION TIME: ".$duration."\n");
|
||
|
|
||
|
|
||
|
sub tokenize {
|
||
|
my($text) = @_;
|
||
|
chomp($text);
|
||
|
$text = " $text ";
|
||
|
|
||
|
# seperate out all "other" special characters
|
||
|
$text =~ s/([^\p{IsAlnum}\s\.\'\`\,\-])/ $1 /g;
|
||
|
|
||
|
#multi-dots stay together
|
||
|
$text =~ s/\.([\.]+)/ DOTMULTI$1/g;
|
||
|
while($text =~ /DOTMULTI\./) {
|
||
|
$text =~ s/DOTMULTI\.([^\.])/DOTDOTMULTI $1/g;
|
||
|
$text =~ s/DOTMULTI\./DOTDOTMULTI/g;
|
||
|
}
|
||
|
|
||
|
# seperate out "," except if within numbers (5,300)
|
||
|
$text =~ s/([^\p{IsN}])[,]([^\p{IsN}])/$1 , $2/g;
|
||
|
# separate , pre and post number
|
||
|
$text =~ s/([\p{IsN}])[,]([^\p{IsN}])/$1 , $2/g;
|
||
|
$text =~ s/([^\p{IsN}])[,]([\p{IsN}])/$1 , $2/g;
|
||
|
|
||
|
# turn `into '
|
||
|
$text =~ s/\`/\'/g;
|
||
|
|
||
|
#turn '' into "
|
||
|
$text =~ s/\'\'/ \" /g;
|
||
|
|
||
|
if ($language eq "en") {
|
||
|
#split contractions right
|
||
|
$text =~ s/([^\p{IsAlpha}])[']([^\p{IsAlpha}])/$1 ' $2/g;
|
||
|
$text =~ s/([^\p{IsAlpha}\p{IsN}])[']([\p{IsAlpha}])/$1 ' $2/g;
|
||
|
$text =~ s/([\p{IsAlpha}])[']([^\p{IsAlpha}])/$1 ' $2/g;
|
||
|
$text =~ s/([\p{IsAlpha}])[']([\p{IsAlpha}])/$1 '$2/g;
|
||
|
#special case for "1990's"
|
||
|
$text =~ s/([\p{IsN}])[']([s])/$1 '$2/g;
|
||
|
} elsif (($language eq "fr") or ($language eq "it")) {
|
||
|
#split contractions left
|
||
|
$text =~ s/([^\p{IsAlpha}])[']([^\p{IsAlpha}])/$1 ' $2/g;
|
||
|
$text =~ s/([^\p{IsAlpha}])[']([\p{IsAlpha}])/$1 ' $2/g;
|
||
|
$text =~ s/([\p{IsAlpha}])[']([^\p{IsAlpha}])/$1 ' $2/g;
|
||
|
$text =~ s/([\p{IsAlpha}])[']([\p{IsAlpha}])/$1' $2/g;
|
||
|
} else {
|
||
|
$text =~ s/\'/ \' /g;
|
||
|
}
|
||
|
|
||
|
#word token method
|
||
|
my @words = split(/\s/,$text);
|
||
|
$text = "";
|
||
|
for (my $i=0;$i<(scalar(@words));$i++) {
|
||
|
my $word = $words[$i];
|
||
|
if ( $word =~ /^(\S+)\.$/) {
|
||
|
my $pre = $1;
|
||
|
if (($pre =~ /\./ && $pre =~ /\p{IsAlpha}/) || ($NONBREAKING_PREFIX{$pre} && $NONBREAKING_PREFIX{$pre}==1) || ($i<scalar(@words)-1 && ($words[$i+1] =~ /^[\p{IsLower}]/))) {
|
||
|
#no change
|
||
|
} elsif (($NONBREAKING_PREFIX{$pre} && $NONBREAKING_PREFIX{$pre}==2) && ($i<scalar(@words)-1 && ($words[$i+1] =~ /^[0-9]+/))) {
|
||
|
#no change
|
||
|
} else {
|
||
|
$word = $pre." .";
|
||
|
}
|
||
|
}
|
||
|
$text .= $word." ";
|
||
|
}
|
||
|
|
||
|
# clean up extraneous spaces
|
||
|
$text =~ s/ +/ /g;
|
||
|
$text =~ s/^ //g;
|
||
|
$text =~ s/ $//g;
|
||
|
|
||
|
#restore multi-dots
|
||
|
while($text =~ /DOTDOTMULTI/) {
|
||
|
$text =~ s/DOTDOTMULTI/DOTMULTI./g;
|
||
|
}
|
||
|
$text =~ s/DOTMULTI/./g;
|
||
|
|
||
|
#ensure final line break
|
||
|
$text .= "\n" unless $text =~ /\n$/;
|
||
|
|
||
|
return $text;
|
||
|
}
|
||
|
|
||
|
sub load_prefixes {
|
||
|
my ($language, $PREFIX_REF) = @_;
|
||
|
|
||
|
my $prefixfile = "$mydir/nonbreaking_prefix.$language";
|
||
|
|
||
|
#default back to English if we don't have a language-specific prefix file
|
||
|
if (!(-e $prefixfile)) {
|
||
|
$prefixfile = "$mydir/nonbreaking_prefix.en";
|
||
|
print STDERR "WARNING: No known abbreviations for language '$language', attempting fall-back to English version...\n";
|
||
|
die ("ERROR: No abbreviations files found in $mydir\n") unless (-e $prefixfile);
|
||
|
}
|
||
|
|
||
|
if (-e "$prefixfile") {
|
||
|
open(PREFIX, "<:utf8", "$prefixfile");
|
||
|
while (<PREFIX>) {
|
||
|
my $item = $_;
|
||
|
chomp($item);
|
||
|
if (($item) && (substr($item,0,1) ne "#")) {
|
||
|
if ($item =~ /(.*)[\s]+(\#NUMERIC_ONLY\#)/) {
|
||
|
$PREFIX_REF->{$1} = 2;
|
||
|
} else {
|
||
|
$PREFIX_REF->{$item} = 1;
|
||
|
}
|
||
|
}
|
||
|
}
|
||
|
close(PREFIX);
|
||
|
}
|
||
|
|
||
|
}
|
||
|
|