#!/usr/bin/perl -w
#
# Sentence-aligns the per-day files of two languages.
#
# Usage: <this script> L1 L2
#
# Every file listed under txt/L1 that also exists under txt/L2 is handed to
# align(), which runs both versions through the sentence splitter
# ($preprocessor) and opens aligned/L1-L2/L1/<file> and
# aligned/L1-L2/L2/<file> for output.
#
# NOTE(review): this copy of the script was damaged before it reached
# review.  All line breaks had been collapsed, and every "<...>" span (a "<"
# directly followed by a non-blank, up to the next ">") had been removed, as
# an HTML tag stripper would do.  The line structure has been restored and
# the one unambiguous loss -- the <LS> readline in the driver loop -- has
# been repaired.  Three larger spans cannot be recovered from what is left.
# Each is marked FIXME(review) below; the tokens that survive around them
# are kept exactly as found, and nothing has been invented to fill them.
# The script will not compile until those spans are restored from a
# pristine copy.

use strict;
use Encode;

binmode(STDIN, ":utf8");
binmode(STDOUT, ":utf8");
binmode(STDERR, ":utf8");

my $dir = "txt";
my $outdir = "aligned";
my $preprocessor = "tools/split-sentences.perl -q";

my ($l1,$l2) = @ARGV;
# Without this check a missing argument turns "$dir/$l1" into "txt/", which
# exists whenever the input directory does, and the run continues with
# empty language codes.
die "usage: $0 L1 L2\n" unless defined $l1 && defined $l2;
die "input directory $dir/$l1 does not exist" unless -e "$dir/$l1";
die "input directory $dir/$l2 does not exist" unless -e "$dir/$l2";

# NOTE(review): $l1 and $l2 come straight from @ARGV and are interpolated
# into shell commands here, in the `ls` pipe below and in align().
`mkdir -p $outdir/$l1-$l2/$l1`;
`mkdir -p $outdir/$l1-$l2/$l2`;

my ($dayfile,$s1); # globals for reporting reasons
open(LS,"ls $dir/$l1|") or die "cannot list $dir/$l1: $!";
while($dayfile = <LS>) {
    chomp($dayfile);
    if (! -e "$dir/$l2/$dayfile") {
        print "$dayfile only for $l1, not $l2, skipping\n";
        next;
    }
    &align();
}
close(LS);

# Processes the current $dayfile for both languages.  Takes no arguments;
# reads the file-level $dir, $outdir, $preprocessor, $l1, $l2 and $dayfile.
# The surviving part sentence-splits both files, decodes the splitter output
# from UTF-8 and opens the OUT1/OUT2 output handles.
sub align {
    my @TXT1native= `$preprocessor -l $l1 < $dir/$l1/$dayfile`;
    my @TXT2native = `$preprocessor -l $l2 < $dir/$l2/$dayfile`;
    my @TXT1;
    my @TXT2;

    #change perl encoding
    foreach my $line (@TXT1native) {
        push(@TXT1,decode_utf8($line));
    }
    foreach my $line (@TXT2native) {
        push(@TXT2,decode_utf8($line));
    }

    open(OUT1, ">$outdir/$l1-$l2/$l1/$dayfile");
    open(OUT2, ">$outdir/$l1-$l2/$l2/$dayfile");
    binmode(OUT1, ":utf8");
    binmode(OUT2, ":utf8");

    # FIXME(review): source text is lost from the middle of this loop
    # header onwards.  Missing: the loop condition and body, the end of
    # align(), anything that was defined between the two subs, and the
    # header and loop head of the sub whose tail follows.
    for(my $i2=0,my $i1=0; $i1
    # ---- lost text ----
    # Tail of a sub whose name and parameter list were lost.  What survives
    # reads ${$TXT}[$$i] (an array ref indexed through a scalar ref) and
    # collects the lines into the array-of-arrays @P.  The condition ends in
    # a pattern match whose pattern was lost: a matching line is stored as a
    # group of its own, any other line is appended to the current group.
    /) {
            $p++ if $P[$p];
            # each XML tag has its own paragraph
            push @{$P[$p]}, ${$TXT}[$$i];
            $p++;
        }
        else {
            push @{$P[$p]}, ${$TXT}[$$i];
        }
    }
    return @P;
}

# this is a vanilla implementation of church and gale
#
# Arguments: two array refs ($P1, $P2) holding the sentences of one
# paragraph in each language.  The last character of every element is
# chopped off IN PLACE, so the caller's arrays are modified.
#
# Surviving logic: a dynamic program over (i1,i2) where each step consumes
# d1 sentences of $P1 and d2 of $P2.  The allowed (d1,d2) pairs and their
# prior probabilities are the entries of %PRIOR.  The cost of a step is the
# predecessor's cost, minus log(prior), plus match() applied to the two
# character-length differences.  @BACK stores the best predecessor; the
# backtracking loop turns those links into forward links in %NEXT.
sub sentence_align {
    my ($P1,$P2) = @_;
    chop(@{$P1});
    chop(@{$P2});

    # parameters
    my %PRIOR;
    $PRIOR{1}{1} = 0.89;
    $PRIOR{1}{0} = 0.01/2;
    $PRIOR{0}{1} = 0.01/2;
    $PRIOR{2}{1} = 0.089/2;
    $PRIOR{1}{2} = 0.089/2;
    # $PRIOR{2}{2} = 0.011;

    # compute length (in characters)
    my (@LEN1,@LEN2);
    $LEN1[0] = 0;
    # FIXME(review): source text is lost from the middle of this loop
    # header.  Missing: the code that fills @LEN1 and @LEN2, the
    # declaration and initialisation of @COST and @BACK, the two loops over
    # $i1 and $i2, the header of the loop over $d1 and the start of its
    # first statement.
    for(my $i=0;$i
    # ---- lost text ----
    $i1;
                foreach my $d2 (keys %{$PRIOR{$d1}}) {
                    next if $d2>$i2;
                    my $cost = $COST[$i1-$d1][$i2-$d2] - log($PRIOR{$d1}{$d2}) + &match($LEN1[$i1]-$LEN1[$i1-$d1], $LEN2[$i2]-$LEN2[$i2-$d2]);
                    # print "($i1->".($i1-$d1).",$i2->".($i2-$d2).") [".($LEN1[$i1]-$LEN1[$i1-$d1]).",".($LEN2[$i2]-$LEN2[$i2-$d2])."] = $COST[$i1-$d1][$i2-$d2] - ".log($PRIOR{$d1}{$d2})." + ".&match($LEN1[$i1]-$LEN1[$i1-$d1], $LEN2[$i2]-$LEN2[$i2-$d2])." = $cost\n";
                    if ($cost < $COST[$i1][$i2]) {
                        $COST[$i1][$i2] = $cost;
                        @{$BACK[$i1][$i2]} = ($i1-$d1,$i2-$d2);
                    }
                }
            }
            # print $COST[$i1][$i2]."($i1-$BACK[$i1][$i2][0],$i2-$BACK[$i1][$i2][1]) ";
        }
        # print "\n";
    }

    # back tracking
    my (%NEXT);
    my $i1 = scalar(@{$P1});
    my $i2 = scalar(@{$P2});
    while($i1>0 || $i2>0) {
        # print "back $i1 $i2\n";
        @{$NEXT{$BACK[$i1][$i2][0]}{$BACK[$i1][$i2][1]}} = ($i1,$i2);
        ($i1,$i2) = ($BACK[$i1][$i2][0],$BACK[$i1][$i2][1]);
    }
    # FIXME(review): source text is lost from the middle of this loop
    # header.  Missing: the loop that follows %NEXT forward from (0,0), the
    # end of sentence_align(), and the header and most of the body of the
    # sub whose tail follows.  The call sites above show that sub is
    # match(), called with two length differences.
    while($i1
    # ---- lost text ----
    # Tail of match(): returns -log($pd) when $pd is positive and the fixed
    # cost 25 otherwise (log is undefined for $pd <= 0).  How $pd is
    # computed is in the lost text.
    0) {
        return -log($pd);
    }
    return 25;
}

# Polynomial approximation of the standard normal cumulative distribution
# function for the value $z.  The constants are those of Abramowitz &
# Stegun formula 26.2.17: 0.3989423 is 1/sqrt(2*pi), and the five-term
# polynomial in $t = 1/(1 + 0.2316419*$z) is evaluated in Horner form.
# NOTE(review): that formula is stated for $z >= 0; whether the caller
# guarantees a non-negative $z cannot be checked, because the calling code
# is in the lost text.
sub pnorm {
    my ($z) = @_;
    my $t = 1/(1 + 0.2316419 * $z);
    return 1 - 0.3989423 * exp(-$z * $z / 2) *
        ((((1.330274429 * $t
            - 1.821255978) * $t
            + 1.781477937) * $t
            - 0.356563782) * $t
            + 0.319381530) * $t;
}