mgiza

2017-01-21 17:01:15 +01:00 · 2017-01-21 17:01:15 +01:00 · 35a78669a3
commit 35a78669a3
parent 254e028f23
17 changed files with 2211 additions and 0 deletions
--- a/mgiza-aligner/.gitignore
+++ b/mgiza-aligner/.gitignore
@ -0,0 +1 @@
+corpora/*
--- a/mgiza-aligner/Makefile
+++ b/mgiza-aligner/Makefile
@ -0,0 +1,39 @@
+# Word-alignment pipeline for one parallel corpus stored in corpora/$(CORPUS_NAME).
+# Inputs are corpora/$(CORPUS_NAME)/src.txt and trg.txt; the chain is
+#   *.txt -> tokenize -> *.tok -> lowercase -> *.low -> plain2snt -> snt2cooc -> mgiza
+# and the final result is corpora/$(CORPUS_NAME)/aligned.txt.
+SRC_LANG=en
+TRG_LANG=pl
+CORPUS_NAME=europarl
+
+# all and clean name actions, not files; keep them runnable even if a file
+# called "all" or "clean" ever appears in this directory.
+.PHONY: all clean
+
+all: corpora/$(CORPUS_NAME)/giza.cfg corpora/$(CORPUS_NAME)/src.low_trg.low.cooc corpora/$(CORPUS_NAME)/src.low_trg.low.snt corpora/$(CORPUS_NAME)/src.low.vcb corpora/$(CORPUS_NAME)/trg.low.vcb
+	mgiza/mgizapp/bin/mgiza corpora/$(CORPUS_NAME)/giza.cfg
+	cat corpora/$(CORPUS_NAME)/aligned*part* > corpora/$(CORPUS_NAME)/aligned.txt
+
+# Remove every generated file, leaving only the original src.txt / trg.txt.
+clean:
+	rm -f corpora/$(CORPUS_NAME)/*.tok
+	rm -f corpora/$(CORPUS_NAME)/*.low
+	rm -f corpora/$(CORPUS_NAME)/*.classes
+	rm -f corpora/$(CORPUS_NAME)/*.classes.cats
+	rm -f corpora/$(CORPUS_NAME)/*.vcb
+	rm -f corpora/$(CORPUS_NAME)/*.snt
+	rm -f corpora/$(CORPUS_NAME)/*.cooc
+	rm -f corpora/$(CORPUS_NAME)/aligned*
+	rm -f corpora/$(CORPUS_NAME)/giza.cfg
+
+# Instantiate the mgiza configuration from the template. The g flag makes sure
+# every CORPUS_NAME placeholder on a line is replaced, not only the first one.
+corpora/$(CORPUS_NAME)/giza.cfg: giza.cfg.pattern
+	sed 's/CORPUS_NAME/'$(CORPUS_NAME)'/g' < $< > $@
+
+# Co-occurrence file built from both vocabularies and the sentence file.
+corpora/$(CORPUS_NAME)/src.low_trg.low.cooc: corpora/$(CORPUS_NAME)/src.low.vcb corpora/$(CORPUS_NAME)/trg.low.vcb corpora/$(CORPUS_NAME)/src.low_trg.low.snt
+	mgiza/mgizapp/bin/snt2cooc $@ corpora/$(CORPUS_NAME)/src.low.vcb corpora/$(CORPUS_NAME)/trg.low.vcb corpora/$(CORPUS_NAME)/src.low_trg.low.snt
+
+# A single plain2snt run is listed as producing both .snt files and both .vcb files.
+corpora/$(CORPUS_NAME)/src.low_trg.low.snt corpora/$(CORPUS_NAME)/trg.low_src.low.snt corpora/$(CORPUS_NAME)/src.low.vcb corpora/$(CORPUS_NAME)/trg.low.vcb: corpora/$(CORPUS_NAME)/src.low corpora/$(CORPUS_NAME)/trg.low
+	mgiza/mgizapp/bin/plain2snt corpora/$(CORPUS_NAME)/src.low corpora/$(CORPUS_NAME)/trg.low
+
+# Word classes (not a prerequisite of "all"; build explicitly when needed).
+corpora/$(CORPUS_NAME)/%.classes: corpora/$(CORPUS_NAME)/%.low
+	mgiza/mgizapp/bin/mkcls -n10 -p$< -V$@
+
+# Lowercase the tokenized text.
+corpora/$(CORPUS_NAME)/%.low: corpora/$(CORPUS_NAME)/%.tok
+	tr '[:upper:]' '[:lower:]' < $< > $@
+
+# Tokenize each side with the nonbreaking-prefix list of its own language.
+corpora/$(CORPUS_NAME)/src.tok: corpora/$(CORPUS_NAME)/src.txt
+	europarl/tools/tokenizer.perl -l $(SRC_LANG) < $< > $@
+
+corpora/$(CORPUS_NAME)/trg.tok: corpora/$(CORPUS_NAME)/trg.txt
+	europarl/tools/tokenizer.perl -l $(TRG_LANG) < $< > $@
--- a/mgiza-aligner/europarl/README
+++ b/mgiza-aligner/europarl/README
@ -0,0 +1,58 @@
+Europarl Release v3 -- Sept 27, 2007
+===================================
+
+This is a parallel corpus that was extracted from the
+European Parliament web site by Philipp Koehn (University 
+of Edinburgh). It is fairly big, 40 million words per 
+language, and its main intended use is to aid 
+statistical machine translation research.
+
+More information can be found at
+	http://www.statmt.org/europarl/
+
+The main difference in this release vs. the first release
+in 2002 and second release in 2003 is that it is larger 
+and it comes with improved processing tools that allow 
+the creation of parallel corpora between any two of the 
+11 languages.
+
+Some data is now tagged with the original language the text
+was spoken in.
+
+Sentence aligner
+----------------
+You can create any parallel corpus with the command
+
+	./sentence-align-corpus.perl L1 L2
+
+where L1 and L2 can be any of the 11 languages
+	da de el en es fi fr it nl pt sv
+
+The output is stored in the aligned/ directory.
+
+NOTE: To use this corpus with tools like Giza++, you want to
+- lowercase the text (recommended)
+- strip empty lines and their correspondences (recommended)
+- tokenize words and punctuation (recommended)
+- remove lines with XML-Tags (starting with "<") (required) 
+
+The sentence aligner uses the split-sentences.perl script, 
+which does the sentence splitting. You may want to 
+use your own preprocessor. This requires changing an 
+obvious line in the sentence aligner code. A tokenizer.perl
+script is included as well.
+
+Source
+------
+http://www3.europarl.eu.int/omk/omnsapir.so/calendar?APP=CRE&LANGUE=EN
+
+Copyright in the Europarl service
+(c) European Communities
+Except where otherwise indicated, reproduction is authorised,
+provided that the source is acknowledged. 
+
+Change Log
+----------
+Preprocessing is improved.
+This release covers 9/1996 - 10/2006.
+Includes sentence aligner and tokenizer.
--- a/mgiza-aligner/europarl/sentence-align-corpus.perl
+++ b/mgiza-aligner/europarl/sentence-align-corpus.perl
@ -0,0 +1,253 @@
+#!/usr/bin/perl -w
+
+use strict;
+use Encode;
+
+binmode(STDIN, ":utf8");
+binmode(STDOUT, ":utf8");
+binmode(STDERR, ":utf8");
+
+
+# Directory layout: input is read from $dir/<lang>/<dayfile>, aligned output
+# is written to $outdir/<l1>-<l2>/<lang>/<dayfile>.
+my $dir = "txt";
+my $outdir = "aligned";
+my $preprocessor = "tools/split-sentences.perl -q";
+
+# Both language codes are mandatory. Without this check an empty code would
+# turn "$dir/$l1" into "$dir/", which exists, and the run would carry on with
+# undefined languages.
+my ($l1,$l2) = @ARGV;
+die "Usage: $0 L1 L2\n"
+  unless defined $l1 && defined $l2 && $l1 ne "" && $l2 ne "";
+die "ERROR: no input directory $dir/$l1\n" unless -e "$dir/$l1";
+die "ERROR: no input directory $dir/$l2\n" unless -e "$dir/$l2";
+
+# Create the output directories; list-form system() bypasses the shell and
+# lets us notice a failure.
+foreach my $lang ($l1,$l2) {
+  system("mkdir","-p","$outdir/$l1-$l2/$lang") == 0
+    or die "ERROR: cannot create $outdir/$l1-$l2/$lang\n";
+}
+
+my ($dayfile,$s1); # globals for reporting reasons
+
+# Walk over every file of the first language and align it with the file of
+# the same name in the second language, if there is one.
+open(LS,"-|","ls","$dir/$l1") or die "ERROR: cannot list $dir/$l1: $!\n";
+while($dayfile = <LS>) {
+  chomp($dayfile);
+  if (! -e "$dir/$l2/$dayfile") {
+    print "$dayfile only for $l1, not $l2, skipping\n";
+    next;
+  }
+  &align();
+}
+
+# Aligns the two language versions of the current day file.
+#
+# Takes no arguments; works on the file-level variables $dayfile, $l1, $l2,
+# $dir, $outdir and $preprocessor. Both sides are sentence-split, then walked
+# in parallel: <CHAPTER> and <SPEAKER> tags are synchronised by their numeric
+# ID (the side that is behind is skipped forward), and the text between tags
+# is cut into paragraphs that are sentence-aligned pairwise. Output goes to
+# the OUT1/OUT2 handles, which sentence_align() writes to as well.
+sub align {
+  # NOTE(review): language codes and file names are interpolated into a shell
+  # command line; this is only safe for names without shell metacharacters.
+  my @TXT1native= `$preprocessor -l $l1 < $dir/$l1/$dayfile`;
+  my @TXT2native = `$preprocessor -l $l2 < $dir/$l2/$dayfile`;
+
+  # change perl encoding: backtick output is raw bytes, decode to characters
+  my @TXT1 = map { decode_utf8($_) } @TXT1native;
+  my @TXT2 = map { decode_utf8($_) } @TXT2native;
+
+  # fail loudly instead of printing every sentence to an unopened handle
+  my $out1 = "$outdir/$l1-$l2/$l1/$dayfile";
+  my $out2 = "$outdir/$l1-$l2/$l2/$dayfile";
+  open(OUT1, ">", $out1) or die "ERROR: cannot write $out1: $!\n";
+  open(OUT2, ">", $out2) or die "ERROR: cannot write $out2: $!\n";
+  binmode(OUT1, ":utf8");
+  binmode(OUT2, ":utf8");
+
+  my ($i1,$i2) = (0,0);
+  while ($i1<scalar(@TXT1) && $i2<scalar(@TXT2)) {
+
+    # match chapter start
+    if ($TXT1[$i1] =~ /^<CHAPTER ID=\"?(\d+)\"?/) {
+      my $c1 = $1;
+      if ($TXT2[$i2] =~ /^<CHAPTER ID=\"?(\d+)\"?/) {
+        my $c2 = $1;
+        if ($c1 == $c2) {
+          print OUT1 $TXT1[$i1++];
+          print OUT2 $TXT2[$i2++];
+        }
+        elsif ($c1 < $c2) {
+          # side 1 is behind: drop its chapter and move to the next one
+          $i1 = &skip(\@TXT1,$i1+1,'^<CHAPTER ID=\"?\d+\"?');
+        }
+        else {
+          $i2 = &skip(\@TXT2,$i2+1,'^<CHAPTER ID=\"?\d+\"?');
+        }
+      }
+      else {
+        # side 2 is not at a chapter tag yet: advance it to one
+        $i2 = &skip(\@TXT2,$i2,'^<CHAPTER ID=\"?\d+\"?');
+      }
+    }
+
+    # match speaker start
+    elsif ($TXT1[$i1] =~ /^<SPEAKER ID=\"?(\d+)\"?/) {
+      $s1 = $1;
+      if ($TXT2[$i2] =~ /^<SPEAKER ID=\"?(\d+)\"?/) {
+        my $s2 = $1;
+        if ($s1 == $s2) {
+          print OUT1 $TXT1[$i1++];
+          print OUT2 $TXT2[$i2++];
+        }
+        elsif ($s1 < $s2) {
+          $i1 = &skip(\@TXT1,$i1+1,'^<SPEAKER ID=\"?\d+\"?');
+        }
+        else {
+          $i2 = &skip(\@TXT2,$i2+1,'^<SPEAKER ID=\"?\d+\"?');
+        }
+      }
+      else {
+        $i2 = &skip(\@TXT2,$i2,'^<SPEAKER ID=\"?\d+\"?');
+      }
+    }
+
+    # plain text: collect the paragraphs up to the next tag on both sides
+    else {
+      my @P1 = &extract_paragraph(\@TXT1,\$i1);
+      my @P2 = &extract_paragraph(\@TXT2,\$i2);
+      if (scalar(@P1) != scalar(@P2)) {
+        # paragraph structure differs: report and drop this stretch of text
+        print "$dayfile (speaker $s1) different number of paragraphs ".scalar(@P1)." != ".scalar(@P2)."\n";
+      }
+      else {
+        for(my $p=0;$p<scalar(@P1);$p++) {
+          &sentence_align(\@{$P1[$p]},\@{$P2[$p]});
+        }
+      }
+    }
+  }
+
+  # flush now so that a write error (e.g. disk full) is reported for this file
+  close(OUT1) or die "ERROR: cannot close $out1: $!\n";
+  close(OUT2) or die "ERROR: cannot close $out2: $!\n";
+}
+close(LS);
+
+# Moves forward through the lines in @$lines, starting at index $start, until
+# a line matches $pattern or the end of the array is reached. Reports the
+# skipped range on STDOUT and returns the index where the scan stopped.
+sub skip {
+  my ($lines,$start,$pattern) = @_;
+  my $pos = $start;
+  my $total = scalar(@{$lines});
+  until ($pos >= $total || $lines->[$pos] =~ /$pattern/) {
+    $pos++;
+  }
+  print "$dayfile skipped lines $start-$pos to reach '$pattern'\n";
+  return $pos;
+}
+
+# Collects lines from @$TXT, starting at index $$i, up to (not including) the
+# next <SPEAKER ID=..> or <CHAPTER ID=..> line or the end of the array.
+# $$i is advanced in place to the line where collection stopped.
+# Returns a list of paragraphs, each an array ref of lines; every line that
+# starts with <P> forms a paragraph of its own.
+sub extract_paragraph {
+  my ($TXT,$i) = @_;
+  my @paragraphs = ();
+  my $cur = 0;
+  my $total = scalar(@{$TXT});
+  while ($$i < $total) {
+    my $line = $TXT->[$$i];
+    last if $line =~ /^<SPEAKER ID=\"?\d+\"?/
+         || $line =~ /^<CHAPTER ID=\"?\d+\"?/;
+    my $is_break = ($line =~ /^<P>/);
+    # a break line closes the paragraph collected so far, if there is one
+    $cur++ if $is_break && $paragraphs[$cur];
+    push @{$paragraphs[$cur]}, $line;
+    # each XML tag has its own paragraph
+    $cur++ if $is_break;
+    $$i++;
+  }
+  return @paragraphs;
+}
+
+# this is a vanilla implementation of church and gale
+#
+# Sentence-aligns one paragraph pair by length.
+#   $P1, $P2 - array refs holding the sentences of the paragraph in each language.
+# The alignment with the lowest total cost is found by dynamic programming and
+# written to the OUT1 / OUT2 filehandles, one aligned group per line (sentences
+# merged into one group are joined with a single space). Nothing is returned.
+# Note: both input arrays are modified (the last character of every element is
+# chopped off).
+sub sentence_align {
+  my ($P1,$P2) = @_;
+  # presumably removes the trailing newline of each sentence -- chop drops the
+  # last character whatever it is
+  chop(@{$P1});
+  chop(@{$P2});
+
+  # parameters
+  # $PRIOR{n1}{n2}: prior probability of grouping n1 sentences of P1 with n2
+  # sentences of P2 in one alignment step (1-1, 1-0, 0-1, 2-1, 1-2)
+  my %PRIOR;
+  $PRIOR{1}{1} = 0.89;
+  $PRIOR{1}{0} = 0.01/2;
+  $PRIOR{0}{1} = 0.01/2;
+  $PRIOR{2}{1} = 0.089/2;
+  $PRIOR{1}{2} = 0.089/2;
+#  $PRIOR{2}{2} = 0.011;
+  
+  # compute length (in characters)
+  # LENx[i] is the cumulative number of non-whitespace characters in the first
+  # i sentences, so the length of any run of sentences is a simple difference
+  my (@LEN1,@LEN2);
+  $LEN1[0] = 0;
+  for(my $i=0;$i<scalar(@{$P1});$i++) {
+    my $line = $$P1[$i];
+    $line =~ s/[\s\r\n]+//g;
+#    print "1: $line\n";
+    $LEN1[$i+1] = $LEN1[$i] + length($line);
+  }
+  $LEN2[0] = 0;
+  for(my $i=0;$i<scalar(@{$P2});$i++) {
+    my $line = $$P2[$i];
+    $line =~ s/[\s\r\n]+//g;
+#    print "2: $line\n";
+    $LEN2[$i+1] = $LEN2[$i] + length($line);
+  }
+
+  # dynamic programming
+  # COST[i1][i2]: cheapest cost of aligning the first i1 sentences of P1 with
+  # the first i2 sentences of P2; BACK[i1][i2]: the cell that cost came from
+  my (@COST,@BACK);
+  $COST[0][0] = 0;
+  for(my $i1=0;$i1<=scalar(@{$P1});$i1++) {
+    for(my $i2=0;$i2<=scalar(@{$P2});$i2++) {
+      next if $i1 + $i2 == 0;
+      $COST[$i1][$i2] = 1e10;
+      # try every allowed group size (d1,d2) that fits before this cell
+      foreach my $d1 (keys %PRIOR) {
+	next if $d1>$i1;
+	foreach my $d2 (keys %{$PRIOR{$d1}}) {
+	  next if $d2>$i2;
+	  # cost = cost of predecessor cell - log(prior) + length-mismatch penalty
+	  my $cost = $COST[$i1-$d1][$i2-$d2] - log($PRIOR{$d1}{$d2}) +  
+	    &match($LEN1[$i1]-$LEN1[$i1-$d1], $LEN2[$i2]-$LEN2[$i2-$d2]);
+#	  print "($i1->".($i1-$d1).",$i2->".($i2-$d2).") [".($LEN1[$i1]-$LEN1[$i1-$d1]).",".($LEN2[$i2]-$LEN2[$i2-$d2])."] = $COST[$i1-$d1][$i2-$d2] - ".log($PRIOR{$d1}{$d2})." + ".&match($LEN1[$i1]-$LEN1[$i1-$d1], $LEN2[$i2]-$LEN2[$i2-$d2])." = $cost\n";
+	  if ($cost < $COST[$i1][$i2]) {
+	    $COST[$i1][$i2] = $cost;
+	    @{$BACK[$i1][$i2]} = ($i1-$d1,$i2-$d2);
+	  }
+	}
+      }
+#      print $COST[$i1][$i2]."($i1-$BACK[$i1][$i2][0],$i2-$BACK[$i1][$i2][1]) ";
+    }
+#    print "\n";
+  }
+  
+  # back tracking
+  # follow BACK from the final cell to (0,0), recording for every cell on the
+  # best path which cell comes after it
+  my (%NEXT);
+  my $i1 = scalar(@{$P1});
+  my $i2 = scalar(@{$P2});
+  while($i1>0 || $i2>0) {
+#    print "back $i1 $i2\n";
+    @{$NEXT{$BACK[$i1][$i2][0]}{$BACK[$i1][$i2][1]}} = ($i1,$i2);
+    ($i1,$i2) = ($BACK[$i1][$i2][0],$BACK[$i1][$i2][1]);
+  }
+  # replay the path forwards from (0,0); each step prints one output line per
+  # language (an empty line when that side contributes no sentence)
+  while($i1<scalar(@{$P1}) || $i2<scalar(@{$P2})) {
+#    print "fwd $i1 $i2\n";
+    for(my $i=$i1;$i<$NEXT{$i1}{$i2}[0];$i++) {
+      print OUT1 " " unless $i == $i1;
+      print OUT1 $$P1[$i];
+    }
+    print OUT1 "\n";
+    for(my $i=$i2;$i<$NEXT{$i1}{$i2}[1];$i++) {
+      print OUT2 " " unless $i == $i2;
+      print OUT2 $$P2[$i];
+    }
+    print OUT2 "\n";
+    ($i1,$i2) = @{$NEXT{$i1}{$i2}};
+  }  
+}
+
+# Length-mismatch cost for pairing $len1 characters of one language with
+# $len2 characters of the other: the negative log of the two-sided tail
+# probability of the normalised length difference. Returns 0 when both
+# lengths are 0 and the fixed cost 25 when the probability is not positive.
+sub match {
+  my ($len1,$len2) = @_;
+  my $ratio = 1;      # expected characters of side 2 per character of side 1
+  my $variance = 6.8; # scales the spread allowed for the length difference
+
+  return 0 if $len1==0 && $len2==0;
+
+  my $mean = ($len1 + $len2/$ratio) / 2;
+  my $z = abs(($ratio * $len1 - $len2)/sqrt($variance * $mean));
+  my $pd = 2 * (1 - &pnorm($z));
+  return $pd>0 ? -log($pd) : 25;
+}
+
+# Polynomial approximation of the standard normal cumulative distribution
+# function at $z (callers pass a non-negative $z).
+sub pnorm {
+  my ($z) = @_;
+  my $t = 1/(1 + 0.2316419 * $z);
+  # Horner evaluation of the degree-4 polynomial in $t, highest power first
+  my $poly = 1.330274429;
+  foreach my $coef (-1.821255978, 1.781477937, -0.356563782, 0.319381530) {
+    $poly = $poly * $t + $coef;
+  }
+  return 1 - 0.3989423 * exp(-$z * $z / 2) * $poly * $t;
+}
--- a/mgiza-aligner/europarl/tools/README
+++ b/mgiza-aligner/europarl/tools/README
@ -0,0 +1,73 @@
+Europarl v3 Preprocessing Tools
+===============================
+written by Philipp Koehn and Josh Schroeder
+
+
+Sentence Splitter
+=================
+Usage ./split-sentences.perl -l [en|de|...] < textfile > splitfile
+
+Uses punctuation and Capitalization clues to split paragraphs of 
+sentences into files with one sentence per line. For example:
+
+This is a paragraph. It contains several sentences. "But why," you ask?
+
+goes to:
+
+This is a paragraph.
+It contains several sentences.
+"But why," you ask?
+
+See more information in the Nonbreaking Prefixes section.
+
+
+Tokenizer
+=========
+Usage ./tokenizer.perl -l [en|de|...] < textfile > tokenizedfile
+
+Splits out most punctuation from words. Special cases where splits
+do not occur are documented in the code. 
+
+This E.U. treaty is, to use the words of Mr. Smith, "awesome." 
+
+goes to:
+
+This E.U. treaty is , to use the words of Mr. Smith , " awesome . "
+
+Like the sentence splitter, it makes use of the nonbreaking_prefixes
+directory.
+
+
+Nonbreaking Prefixes Directory
+==============================
+
+Nonbreaking prefixes are loosely defined as any word ending in a
+period that does NOT indicate an end of sentence marker. A basic
+example is Mr. and Ms. in English.
+
+The sentence splitter and tokenizer included with this release
+both use the nonbreaking prefix files included in this directory.
+
+To add a file for other languages, follow the naming convention
+nonbreaking_prefix.?? and use the two-letter language code you
+intend to use when calling split-sentences.perl and tokenizer.perl.
+
+Both split-sentences and tokenizer will first look for a file for the
+language they are processing, and fall back to English if a file
+for that language is not found. If the nonbreaking_prefixes directory does
+not exist at the same location as the split-sentences.perl and tokenizer.perl
+files, they will not run.
+
+For the splitter, normally a period followed by an uppercase word
+results in a sentence split. If the word preceding the period
+is a nonbreaking prefix, this line break is not inserted.
+
+For the tokenizer, a nonbreaking prefix is not separated from its 
+period with a space.
+
+A special case of prefixes, NUMERIC_ONLY, is included for special
+cases where the prefix should be handled ONLY when before numbers.
+For example, "Article No. 24 states this." the No. is a nonbreaking
+prefix. However, in "No. It is not true." No functions as a word.
+
+See the example prefix files included here for more examples.
--- a/mgiza-aligner/europarl/tools/nonbreaking_prefixes/nonbreaking_prefix.de
+++ b/mgiza-aligner/europarl/tools/nonbreaking_prefixes/nonbreaking_prefix.de
@ -0,0 +1,325 @@
+#Anything in this file, followed by a period (and an upper-case word), does NOT indicate an end-of-sentence marker.
+#Special cases are included for prefixes that ONLY appear before 0-9 numbers.
+
+#any single upper case letter  followed by a period is not a sentence ender (excluding I occasionally, but we leave it in)
+#usually upper case letters are initials in a name
+#no german words end in single lower-case letters, so we throw those in too.
+A
+B
+C
+D
+E
+F
+G
+H
+I
+J
+K
+L
+M
+N
+O
+P
+Q
+R
+S
+T
+U
+V
+W
+X
+Y
+Z
+a
+b
+c
+d
+e
+f
+g
+h
+i
+j
+k
+l
+m
+n
+o
+p
+q
+r
+s
+t
+u
+v
+w
+x
+y
+z
+
+
+#Roman Numerals. A dot after one of these is not a sentence break in German.
+I
+II
+III
+IV
+V
+VI
+VII
+VIII
+IX
+X
+XI
+XII
+XIII
+XIV
+XV
+XVI
+XVII
+XVIII
+XIX
+XX
+i
+ii
+iii
+iv
+v
+vi
+vii
+viii
+ix
+x
+xi
+xii
+xiii
+xiv
+xv
+xvi
+xvii
+xviii
+xix
+xx
+
+#Titles and Honorifics
+Adj
+Adm
+Adv
+Asst
+Bart
+Bldg
+Brig
+Bros
+Capt
+Cmdr
+Col
+Comdr
+Con
+Corp
+Cpl
+DR
+Dr
+Ens
+Gen
+Gov
+Hon
+Hosp
+Insp
+Lt
+MM
+MR
+MRS
+MS
+Maj
+Messrs
+Mlle
+Mme
+Mr
+Mrs
+Ms
+Msgr
+Op
+Ord
+Pfc
+Ph
+Prof
+Pvt
+Rep
+Reps
+Res
+Rev
+Rt
+Sen
+Sens
+Sfc
+Sgt
+Sr
+St
+Supt
+Surg
+
+#Misc symbols
+Mio
+Mrd
+bzw
+v
+vs
+usw
+d.h
+z.B
+u.a
+etc
+Mrd
+MwSt
+ggf
+d.J
+D.h
+m.E
+vgl
+I.F
+z.T
+sogen
+ff
+u.E
+g.U
+g.g.A
+c.-à-d
+Buchst
+u.s.w
+sog
+u.ä
+Std
+evtl
+Zt
+Chr
+u.U
+o.ä
+Ltd
+b.A
+z.Zt
+spp
+sen
+SA
+k.o
+jun
+i.H.v
+dgl
+dergl
+Co
+zzt
+usf
+s.p.a
+Dkr
+Corp
+bzgl
+BSE
+
+#Number indicators
+# add #NUMERIC_ONLY# after the word if it should ONLY be non-breaking when a 0-9 digit follows it
+No
+Nos
+Art
+Nr
+pp
+ca
+Ca
+
+#Ordinals are done with . in German - "1." = "1st" in English
+1
+2
+3
+4
+5
+6
+7
+8
+9
+10
+11
+12
+13
+14
+15
+16
+17
+18
+19
+20
+21
+22
+23
+24
+25
+26
+27
+28
+29
+30
+31
+32
+33
+34
+35
+36
+37
+38
+39
+40
+41
+42
+43
+44
+45
+46
+47
+48
+49
+50
+51
+52
+53
+54
+55
+56
+57
+58
+59
+60
+61
+62
+63
+64
+65
+66
+67
+68
+69
+70
+71
+72
+73
+74
+75
+76
+77
+78
+79
+80
+81
+82
+83
+84
+85
+86
+87
+88
+89
+90
+91
+92
+93
+94
+95
+96
+97
+98
+99
--- a/mgiza-aligner/europarl/tools/nonbreaking_prefixes/nonbreaking_prefix.el
+++ b/mgiza-aligner/europarl/tools/nonbreaking_prefixes/nonbreaking_prefix.el
@ -0,0 +1,2 @@
+# for now, just include the Greek equivalent of "Mr."
+κ
--- a/mgiza-aligner/europarl/tools/nonbreaking_prefixes/nonbreaking_prefix.en
+++ b/mgiza-aligner/europarl/tools/nonbreaking_prefixes/nonbreaking_prefix.en
@ -0,0 +1,107 @@
+#Anything in this file, followed by a period (and an upper-case word), does NOT indicate an end-of-sentence marker.
+#Special cases are included for prefixes that ONLY appear before 0-9 numbers.
+
+#any single upper case letter  followed by a period is not a sentence ender (excluding I occasionally, but we leave it in)
+#usually upper case letters are initials in a name
+A
+B
+C
+D
+E
+F
+G
+H
+I
+J
+K
+L
+M
+N
+O
+P
+Q
+R
+S
+T
+U
+V
+W
+X
+Y
+Z
+
+#List of titles. These are often followed by upper-case names, but do not indicate sentence breaks
+Adj
+Adm
+Adv
+Asst
+Bart
+Bldg
+Brig
+Bros
+Capt
+Cmdr
+Col
+Comdr
+Con
+Corp
+Cpl
+DR
+Dr
+Drs
+Ens
+Gen
+Gov
+Hon
+Hr
+Hosp
+Insp
+Lt
+MM
+MR
+MRS
+MS
+Maj
+Messrs
+Mlle
+Mme
+Mr
+Mrs
+Ms
+Msgr
+Op
+Ord
+Pfc
+Ph
+Prof
+Pvt
+Rep
+Reps
+Res
+Rev
+Rt
+Sen
+Sens
+Sfc
+Sgt
+Sr
+St
+Supt
+Surg
+
+#misc - odd period-ending items that NEVER indicate breaks (p.m. does NOT fall into this category - it sometimes ends a sentence)
+v
+vs
+i.e
+rev
+e.g
+
+#Numbers only. These are treated as non-breaking prefixes only when followed by a numeric sequence
+# add NUMERIC_ONLY after the word for this function
+#This case is mostly for the english "No." which can either be a sentence of its own, or
+#if followed by a number, a non-breaking prefix
+No #NUMERIC_ONLY# 
+Nos
+Art #NUMERIC_ONLY#
+Nr
+pp #NUMERIC_ONLY#
--- a/mgiza-aligner/europarl/tools/nonbreaking_prefixes/nonbreaking_prefix.es
+++ b/mgiza-aligner/europarl/tools/nonbreaking_prefixes/nonbreaking_prefix.es
@ -0,0 +1,246 @@
+#Anything in this file, followed by a period (and an upper-case word), does NOT indicate an end-of-sentence marker.
+#Special cases are included for prefixes that ONLY appear before 0-9 numbers.
+
+#any single upper case letter  followed by a period is not a sentence ender (excluding I occasionally, but we leave it in)
+#usually upper case letters are initials in a name
+A
+B
+C
+D
+E
+F
+G
+H
+I
+J
+K
+L
+M
+N
+O
+P
+Q
+R
+S
+T
+U
+V
+W
+X
+Y
+Z
+
+#Abbreviations
+a.c
+aa.rr
+abrev
+adj
+adm
+admón
+afma
+afmas
+afmo
+afmos
+ag
+am
+ap
+apdo
+art
+arts
+arz
+arzbpo
+assn
+atte
+av
+avda
+bros
+bv
+cap
+caps
+cg
+cgo
+cia
+cit
+cl
+cm
+co
+col
+corp
+cos
+cta
+cte
+ctra
+cts
+cía
+cía
+d.c
+dcha
+dept
+depto
+dg
+dl
+dm
+doc
+docs
+dpt
+dpto
+dr
+dra
+dras
+dres
+dto
+dupdo
+ed
+ee.uu
+ej
+emma
+emmas
+emmo
+emmos
+entlo
+entpo
+esp
+etc
+ex
+excm
+excma
+excmas
+excmo
+excmos
+fasc
+fdo
+fig
+figs
+fil
+fol
+fra
+gr
+grs
+gral
+ha
+hnos
+hros
+hz
+ib
+ibid
+ibíd
+id
+ilm
+ilma
+ilmas
+ilmo
+ilmos
+iltre
+inc
+intr
+izq
+izqda
+izqdo
+jr
+kc
+kcal
+kg
+khz
+kl
+km
+kw
+lda
+ldo
+lib
+lic
+lim
+loc
+ltd
+ltda
+lám
+ma
+mg
+mhz
+min
+mm
+mons
+mr
+mrs
+ms
+mss
+mtro
+máx
+mín
+ntra
+ntro
+núm
+ob
+obpo
+op
+pd
+ph
+pje
+pl
+plc
+pm
+pp
+ppal
+pral
+prof
+prov
+pról
+ps
+pta
+ptas
+pte
+pts
+pza
+pág
+págs
+párr
+rda
+rdo
+ref
+reg
+rel
+rev
+revda
+revdo
+rma
+rmo
+rte
+s
+sa
+sdad
+sec
+secret
+seg
+sg
+sig
+smo
+sr
+sra
+sras
+sres
+srs
+srta
+ss.mm
+sta
+sto
+sust
+tech
+tel
+telf
+teléf
+ten
+tfono
+tlf
+t.v.e
+tít
+ud
+uds
+vda
+vdo
+vid
+vol
+vols
+vra
+vro
+vta
+íd
+ít
--- a/mgiza-aligner/europarl/tools/nonbreaking_prefixes/nonbreaking_prefix.fr
+++ b/mgiza-aligner/europarl/tools/nonbreaking_prefixes/nonbreaking_prefix.fr
@ -0,0 +1,153 @@
+#Anything in this file, followed by a period (and an upper-case word), does NOT indicate an end-of-sentence marker.
+#Special cases are included for prefixes that ONLY appear before 0-9 numbers.
+
+#any single upper case letter  followed by a period is not a sentence ender
+#usually upper case letters are initials in a name
+#no French words end in single lower-case letters, so we throw those in too?
+A
+B
+C
+D
+E
+F
+G
+H
+I
+J
+K
+L
+M
+N
+O
+P
+Q
+R
+S
+T
+U
+V
+W
+X
+Y
+Z
+a
+b
+c
+d
+e
+f
+g
+h
+i
+j
+k
+l
+m
+n
+o
+p
+q
+r
+s
+t
+u
+v
+w
+x
+y
+z
+
+# Period-final abbreviation list for French
+A.C.N
+A.M
+art
+ann
+apr
+av
+auj
+lib
+B.P
+boul
+ca
+c.-à-d
+cf
+ch.-l
+chap
+contr
+C.P.I
+C.Q.F.D
+C.N
+C.N.S
+C.S
+dir
+éd
+e.g
+env
+al
+etc
+E.V
+ex
+fasc
+fém
+fig
+fr
+hab
+ibid
+id
+i.e
+inf
+LL.AA
+LL.AA.II
+LL.AA.RR
+LL.AA.SS
+L.D
+LL.EE
+LL.MM
+LL.MM.II.RR
+loc.cit
+masc
+MM
+ms
+N.B
+N.D.A
+N.D.L.R
+N.D.T
+n/réf
+NN.SS
+N.S
+N.D
+N.P.A.I
+p.c.c
+pl
+pp
+p.ex
+p.j
+P.S
+R.A.S
+R.-V
+R.P
+R.I.P
+SS
+S.S
+S.A
+S.A.I
+S.A.R
+S.A.S
+S.E
+sec
+sect
+sing
+S.M
+S.M.I.R
+sq
+sqq
+suiv
+sup
+suppl
+tél
+T.S.V.P
+vb
+vol
+vs
+X.O
+Z.I
--- a/mgiza-aligner/europarl/tools/nonbreaking_prefixes/nonbreaking_prefix.it
+++ b/mgiza-aligner/europarl/tools/nonbreaking_prefixes/nonbreaking_prefix.it
@ -0,0 +1,134 @@
+#Anything in this file, followed by a period (and an upper-case word), does NOT indicate an end-of-sentence marker.
+#Special cases are included for prefixes that ONLY appear before 0-9 numbers.
+
+#any single upper case letter  followed by a period is not a sentence ender
+#usually upper case letters are initials in a name
+#no Italian words end in single lower-case letters, so we throw those in too?
+A
+B
+C
+D
+E
+F
+G
+H
+I
+J
+K
+L
+M
+N
+O
+P
+Q
+R
+S
+T
+U
+V
+W
+X
+Y
+Z
+a
+b
+c
+d
+e
+f
+g
+h
+i
+j
+k
+l
+m
+n
+o
+p
+q
+r
+s
+t
+u
+v
+w
+x
+y
+z
+
+# Period-final abbreviation list from http://www.chass.utoronto.ca/~ngargano/corsi/corrisp/abbreviazioni.html
+a.c 
+es 
+all 
+Amn 
+Arch 
+Avv
+Bcc
+c.a
+C.A.P
+Cc
+banc
+post
+c.c.p
+c.m
+Co
+c.p
+C.P
+corr
+c.s
+c.v
+Dott
+Dr
+ecc
+Egr
+e.p.c
+fatt
+Geom
+gg
+Id
+Ing
+int
+lett
+Mo
+Mons
+N.B
+ogg
+on
+pp
+p.c
+p.c
+p.c.c
+p.es
+p.f
+p.r
+P.S
+p.v
+P.T
+Prof
+racc
+Rag
+Rev
+ric
+Rif
+RP
+RSVP
+S.A
+acc
+S.B.F
+seg
+sgg
+ss
+Sig
+Sigg
+s.n.c
+Soc
+S.p.A
+Spett
+S.P.M
+S.r.l
+tel
+u.s
+V.P
+v.r
+v.s
--- a/mgiza-aligner/europarl/tools/nonbreaking_prefixes/nonbreaking_prefix.nl
+++ b/mgiza-aligner/europarl/tools/nonbreaking_prefixes/nonbreaking_prefix.nl
@ -0,0 +1,115 @@
+#Anything in this file, followed by a period (and an upper-case word), does NOT indicate an end-of-sentence marker.
+#Special cases are included for prefixes that ONLY appear before 0-9 numbers.
+#Sources: http://nl.wikipedia.org/wiki/Lijst_van_afkortingen 
+#         http://nl.wikipedia.org/wiki/Aanspreekvorm
+#         http://nl.wikipedia.org/wiki/Titulatuur_in_het_Nederlands_hoger_onderwijs
+#any single upper case letter  followed by a period is not a sentence ender (excluding I occasionally, but we leave it in)
+#usually upper case letters are initials in a name
+A
+B
+C
+D
+E
+F
+G
+H
+I
+J
+K
+L
+M
+N
+O
+P
+Q
+R
+S
+T
+U
+V
+W
+X
+Y
+Z
+
+#List of titles. These are often followed by upper-case names, but do not indicate sentence breaks
+bacc
+bc
+bgen
+c.i
+dhr
+dr
+dr.h.c
+drs
+drs
+ds
+eint
+fa
+Fa
+fam
+gen
+genm
+ing
+ir
+jhr
+jkvr
+jr
+kand
+kol
+lgen
+lkol
+Lt
+maj
+Mej
+mevr
+Mme
+mr
+mr
+Mw
+o.b.s
+plv
+prof
+ritm
+tint
+Vz
+Z.D
+Z.D.H
+Z.E
+Z.Em
+Z.H
+Z.K.H
+Z.K.M
+Z.M
+z.v
+
+#misc - odd period-ending items that NEVER indicate breaks (p.m. does NOT fall into this category - it sometimes ends a sentence)
+#we seem to have a lot of these in dutch i.e.: i.p.v - in plaats van (in stead of) never ends a sentence
+a.g.v
+bijv
+bijz
+bv
+d.w.z
+e.c
+e.g
+e.k
+ev
+i.p.v
+i.s.m
+i.t.t
+i.v.m
+m.a.w
+m.b.t
+m.b.v
+m.h.o
+m.i
+m.i.v
+v.w.t
+
+#Numbers only. These are treated as non-breaking prefixes only when followed by a numeric sequence
+# add NUMERIC_ONLY after the word for this function
+#This case is mostly for the english "No." which can either be a sentence of its own, or
+#if followed by a number, a non-breaking prefix
+Nr #NUMERIC_ONLY# 
+Nrs 
+nrs
+nr #NUMERIC_ONLY#
--- a/mgiza-aligner/europarl/tools/nonbreaking_prefixes/nonbreaking_prefix.pl
+++ b/mgiza-aligner/europarl/tools/nonbreaking_prefixes/nonbreaking_prefix.pl
@ -0,0 +1,285 @@
+adw
+afr
+akad
+al
+Al
+am
+amer
+arch
+art
+Art
+artyst
+astr
+austr
+bałt
+bdb
+bł
+bm
+br
+bryg
+bryt
+centr
+ces
+chem
+chiń
+chir
+c.k
+c.o
+cyg
+cyw
+cyt
+czes
+czw
+cd
+Cd
+czyt
+ćw
+ćwicz
+daw
+dcn
+dekl
+demokr
+det
+diec
+dł
+dn
+dot
+dol
+dop
+dost
+dosł
+h.c
+ds
+dst
+duszp
+dypl
+egz
+ekol
+ekon
+elektr
+em
+ew
+fab
+farm
+fot
+fr
+gat
+gastr
+geogr
+geol
+gimn
+głęb
+gm
+godz
+górn
+gosp
+gr
+gram
+hist
+hiszp
+hr
+Hr
+hot
+id
+in
+im
+iron
+jn
+kard
+kat
+katol
+k.k
+kk
+kol
+kl
+k.p.a
+kpc
+k.p.c
+kpt
+kr
+k.r
+krak
+k.r.o
+kryt
+kult
+laic
+łac
+niem
+woj
+nb
+np
+Nb
+Np
+pol
+pow
+m.in
+pt
+ps
+Pt
+Ps
+cdn
+jw
+ryc
+rys
+Ryc
+Rys
+tj
+tzw
+Tzw
+tzn
+zob
+ang
+ub
+ul
+pw
+pn
+pl
+al
+k
+n
+nr #NUMERIC_ONLY#
+Nr #NUMERIC_ONLY#
+ww
+wł
+ur
+zm
+żyd
+żarg
+żyw
+wył
+bp
+bp
+wyst
+tow
+Tow
+o
+sp
+Sp
+st
+spółdz
+Spółdz
+społ
+spółgł
+stoł
+stow
+Stoł
+Stow
+zn
+zew
+zewn
+zdr
+zazw
+zast
+zaw
+zał
+zal
+zam
+zak
+zakł
+zagr
+zach
+adw
+Adw
+lek
+Lek
+med
+mec
+Mec
+doc
+Doc
+dyw
+dyr
+Dyw
+Dyr
+inż
+Inż
+mgr
+Mgr
+dh
+dr
+Dh
+Dr
+p
+P
+red
+Red
+prof
+prok
+Prof
+Prok
+hab
+płk
+Płk
+nadkom
+Nadkom
+podkom
+Podkom
+ks
+Ks
+gen
+Gen
+por
+Por
+reż
+Reż
+przyp
+Przyp
+śp
+św
+śW
+Śp
+Św
+ŚW
+szer
+Szer
+pkt #NUMERIC_ONLY#
+str #NUMERIC_ONLY#
+tab #NUMERIC_ONLY#
+Tab #NUMERIC_ONLY#
+tel
+ust #NUMERIC_ONLY#
+par #NUMERIC_ONLY#
+poz
+pok
+oo
+oO
+Oo
+OO
+r #NUMERIC_ONLY#
+l #NUMERIC_ONLY#
+s #NUMERIC_ONLY#
+najśw
+Najśw
+A
+B
+C
+D
+E
+F
+G
+H
+I
+J
+K
+L
+M
+N
+O
+P
+Q
+R
+S
+T
+U
+V
+W
+X
+Y
+Z
+Ś
+Ć
+Ż
+Ź
+Dz
+# (stray web-page footer text removed here; it was not a prefix)
+
--- a/mgiza-aligner/europarl/tools/split-sentences.perl
+++ b/mgiza-aligner/europarl/tools/split-sentences.perl
@ -0,0 +1,152 @@
+#!/usr/bin/perl -w
+
+# Based on Preprocessor written by Philipp Koehn
+
+binmode(STDIN, ":utf8");
+binmode(STDOUT, ":utf8");
+binmode(STDERR, ":utf8");
+
+use FindBin qw($Bin);
+use strict;
+
+# Directory with the nonbreaking_prefix.<lang> lists, next to this script.
+my $mydir = "$Bin/nonbreaking_prefixes";
+
+# prefix => 1 (never a sentence end) or 2 (non-breaking only before a number)
+my %NONBREAKING_PREFIX = ();
+my $language = "en";
+my $QUIET = 0;
+my $HELP = 0;
+
+# command line: -l LANG selects the language, -q silences the banner,
+# -h prints the usage line; anything else is ignored
+while (@ARGV) {
+	$_ = shift;
+	/^-l$/ && ($language = shift, next);
+	/^-q$/ && ($QUIET = 1, next);
+	/^-h$/ && ($HELP = 1, next);
+}
+
+if ($HELP) {
+    print "Usage ./split-sentences.perl (-l [en|de|...]) < textfile > splitfile\n";
+	exit;
+}
+if (!$QUIET) {
+	print STDERR "Sentence Splitter v3\n";
+	print STDERR "Language: $language\n";
+}
+
+my $prefixfile = "$mydir/nonbreaking_prefix.$language";
+
+#default back to English if we don't have a language-specific prefix file
+if (!(-e $prefixfile)) {
+	$prefixfile = "$mydir/nonbreaking_prefix.en";
+	print STDERR "WARNING: No known abbreviations for language '$language', attempting fall-back to English version...\n";
+	die ("ERROR: No abbreviations files found in $mydir\n") unless (-e $prefixfile);
+}
+
+# load the prefix list; the file is known to exist at this point, so a failed
+# open is a real error and must not be ignored
+open(PREFIX, "<:utf8", "$prefixfile") or die ("ERROR: Cannot read $prefixfile: $!\n");
+while (<PREFIX>) {
+	my $item = $_;
+	chomp($item);
+	# drop surrounding blanks (and a CR from DOS line ends) so that entries
+	# written as "Nrs " are stored under the key "Nrs" and can actually match
+	$item =~ s/^\s+//;
+	$item =~ s/\s+$//;
+	# skip blank lines and comments
+	next if $item eq "" || substr($item,0,1) eq "#";
+	if ($item =~ /(.*)[\s]+(\#NUMERIC_ONLY\#)/) {
+		$NONBREAKING_PREFIX{$1} = 2;
+	} else {
+		$NONBREAKING_PREFIX{$item} = 1;
+	}
+}
+close(PREFIX);
+
+##loop text, add lines together until we get a blank line or a <p>
+my $text = "";
+while(<STDIN>) {
+	# chomp, not chop: a last line without newline must keep its final character
+	chomp;
+	if (/^<.+>$/ || /^\s*$/) {
+		#time to process this block, we've hit a blank or <p>
+		&do_it_for($text,$_);
+		print "<P>\n" if (/^\s*$/ && $text); ##if we have text followed by <P>
+		$text = "";
+	}
+	else {
+		#append the text, with a space
+		$text .= $_. " ";
+	}
+}
+#do the leftover text
+&do_it_for($text,"") if $text;
+
+# Emits one collected paragraph, sentence-split, followed by the markup line
+# that ended it (only if that line is an XML/HTML tag line).
+#   $paragraph - accumulated paragraph text, may be empty
+#   $tag       - the line that closed the paragraph (tag line or blank line)
+sub do_it_for {
+	my($paragraph,$tag) = @_;
+	if ($paragraph) {
+		print &preprocess($paragraph);
+	}
+	print "$tag\n" if $tag =~ /^<.+>$/;
+}
+
+# Splits one paragraph into sentences.
+#   $text   - the paragraph as a single string (words separated by spaces)
+#   returns - the same text with a newline after every detected sentence end;
+#             always terminated by a newline
+# Sentence ends are detected from punctuation followed by an upper-case
+# sentence starter; the file-level %NONBREAKING_PREFIX table suppresses breaks
+# after known abbreviations (value 1: always, value 2: only before a number).
+sub preprocess {
+	#this is one paragraph
+	# (must be read before any clean-up: otherwise the substitutions below act
+	# on the file-level $text and the argument stays uncleaned)
+	my($text) = @_;
+	
+	# clean up spaces at head and tail of each line as well as any double-spacing,
+	# so that the split on single spaces further down never yields empty tokens
+	$text =~ s/ +/ /g;
+	$text =~ s/\n /\n/g;
+	$text =~ s/ \n/\n/g;
+	$text =~ s/^ //g;
+	$text =~ s/ $//g;
+	
+	#####add sentence breaks as needed#####
+	
+	#non-period end of sentence markers (?!) followed by sentence starters.
+	$text =~ s/([?!]) +([\'\"\(\[\¿\¡\p{IsPi}]*[\p{IsUpper}])/$1\n$2/g;
+		
+	#multi-dots followed by sentence starters
+	$text =~ s/(\.[\.]+) +([\'\"\(\[\¿\¡\p{IsPi}]*[\p{IsUpper}])/$1\n$2/g;
+	
+	# add breaks for sentences that end with some sort of punctuation inside a quote or parenthetical and are followed by a possible sentence starter punctuation and upper case
+	$text =~ s/([?!\.][\ ]*[\'\"\)\]\p{IsPf}]+) +([\'\"\(\[\¿\¡\p{IsPi}]*[\ ]*[\p{IsUpper}])/$1\n$2/g;
+		
+	# add breaks for sentences that end with some sort of punctuation and are followed by a sentence starter punctuation and upper case
+	$text =~ s/([?!\.]) +([\'\"\(\[\¿\¡\p{IsPi}]+[\ ]*[\p{IsUpper}])/$1\n$2/g;
+	
+	# special punctuation cases are covered. Check all remaining periods.
+	my $i;
+	my @words = split(/ /,$text);
+	$text = "";
+	for ($i=0;$i<(scalar(@words)-1);$i++) {
+		if ($words[$i] =~ /([\p{IsAlnum}\.\-]*)([\'\"\)\]\%\p{IsPf}]*)(\.+)$/) {
+			#check if $1 is a known honorific and $2 is empty, never break
+			my $prefix = $1;
+			my $starting_punct = $2;
+			if($prefix && $NONBREAKING_PREFIX{$prefix} && $NONBREAKING_PREFIX{$prefix} == 1 && !$starting_punct) {
+				#not breaking;
+			} elsif ($words[$i] =~ /(\.)[\p{IsUpper}\-]+(\.+)$/) {
+				#not breaking - upper case acronym	
+			} elsif($words[$i+1] =~ /^([ ]*[\'\"\(\[\¿\¡\p{IsPi}]*[ ]*[\p{IsUpper}0-9])/) {
+				#the next word has a bunch of initial quotes, maybe a space, then either upper case or a number
+				$words[$i] = $words[$i]."\n" unless ($prefix && $NONBREAKING_PREFIX{$prefix} && $NONBREAKING_PREFIX{$prefix} == 2 && !$starting_punct && ($words[$i+1] =~ /^[0-9]+/));
+				#we always add a return for these unless we have a numeric non-breaker and a number start
+			}
+			
+		}
+		$text = $text.$words[$i]." ";
+	}
+	
+	#we stopped one token from the end to allow for easy look-ahead. Append it now.
+	#(there is no last token when the paragraph held nothing but blanks)
+	$text = $text.$words[$i] if $i < scalar(@words);
+	
+	# clean up spaces at head and tail of each line as well as any double-spacing
+	$text =~ s/ +/ /g;
+	$text =~ s/\n /\n/g;
+	$text =~ s/ \n/\n/g;
+	$text =~ s/^ //g;
+	$text =~ s/ $//g;
+	
+	#add trailing break
+	$text .= "\n" unless $text =~ /\n$/;
+	
+	return $text;
+	
+}
+
+
--- a/mgiza-aligner/europarl/tools/tokenizer.perl
+++ b/mgiza-aligner/europarl/tools/tokenizer.perl
@ -0,0 +1,167 @@
+#!/usr/bin/perl -w
+
+# Sample Tokenizer
+# written by Josh Schroeder, based on code by Philipp Koehn
+#
+# Reads text from STDIN line by line and prints the tokenized text to STDOUT.
+# Lines that start with "<" and end with ">" (XML/HTML tag lines) and
+# whitespace-only lines are copied through unchanged.
+#
+# Options:
+#   -l LANG   language code used to select the nonbreaking-prefix file
+#             (default: "en")
+#   -q        quiet; do not print the version/language banner on STDERR
+#   -h        print the usage line and exit
+# Any other argument is ignored.
+
+use strict;
+use warnings;
+use FindBin qw($Bin);
+#use Time::HiRes;
+
+binmode(STDIN, ":utf8");
+binmode(STDOUT, ":utf8");
+
+# directory holding the nonbreaking_prefix.<lang> files, next to this script
+my $mydir = "$Bin/nonbreaking_prefixes";
+
+# prefix => 1 (never split the trailing period off)
+# prefix => 2 (do not split only when the next token starts with a digit)
+my %NONBREAKING_PREFIX = ();
+my $language = "en";
+my $QUIET = 0;
+my $HELP = 0;
+
+#my $start = [ Time::HiRes::gettimeofday( ) ];
+
+while (@ARGV) {
+	my $arg = shift @ARGV;
+	if ($arg =~ /^-l$/) {
+		# a trailing "-l" with no value must not wipe out the default language
+		if (@ARGV) {
+			$language = shift @ARGV;
+		} else {
+			print STDERR "WARNING: -l given without a language code, keeping '$language'\n";
+		}
+	} elsif ($arg =~ /^-q$/) {
+		$QUIET = 1;
+	} elsif ($arg =~ /^-h$/) {
+		$HELP = 1;
+	}
+}
+
+if ($HELP) {
+	print "Usage ./tokenizer.perl (-l [en|de|...]) < textfile > tokenizedfile\n";
+	exit;
+}
+if (!$QUIET) {
+	print STDERR "Tokenizer v3\n";
+	print STDERR "Language: $language\n";
+}
+
+load_prefixes($language,\%NONBREAKING_PREFIX);
+
+# a hash in boolean context is false exactly when it has no keys
+if (!%NONBREAKING_PREFIX) {
+	print STDERR "Warning: No known abbreviations for language '$language'\n";
+}
+
+while (my $line = <STDIN>) {
+	if ($line =~ /^<.+>$/ || $line =~ /^\s*$/) {
+		#don't try to tokenize XML/HTML tag lines or blank lines
+		print $line;
+	}
+	else {
+		print tokenize($line);
+	}
+}
+
+#my $duration = Time::HiRes::tv_interval( $start );
+#print STDERR ("EXECUTION TIME: ".$duration."\n");
+
+
+# Tokenize one line of text.
+#
+# Parameter:
+#   $text - the line to tokenize (a trailing newline is removed first)
+# Returns:
+#   the tokenized line, tokens separated by single spaces, ending in "\n"
+#
+# Reads the file-level variables $language (selects the apostrophe rules)
+# and %NONBREAKING_PREFIX (decides whether a word-final period is split off).
+sub tokenize {
+	my($text) = @_;
+	chomp($text);
+	# pad with spaces so the patterns below always have a neighbouring character
+	$text = " $text ";
+	
+	# separate out all "other" special characters: everything that is not
+	# alphanumeric, whitespace, period, apostrophe, backtick, comma or hyphen
+	$text =~ s/([^\p{IsAlnum}\s\.\'\`\,\-])/ $1 /g;
+	
+	# multi-dots stay together: a run of two or more periods is rewritten to a
+	# placeholder (one "DOT" per extra period, followed by "MULTI") so that the
+	# period handling below does not touch it; it is restored near the end
+	$text =~ s/\.([\.]+)/ DOTMULTI$1/g;
+	while($text =~ /DOTMULTI\./) {
+		$text =~ s/DOTMULTI\.([^\.])/DOTDOTMULTI $1/g;
+		$text =~ s/DOTMULTI\./DOTDOTMULTI/g;
+	}
+
+	# separate out "," except if within numbers (5,300)
+	$text =~ s/([^\p{IsN}])[,]([^\p{IsN}])/$1 , $2/g;
+	# separate , pre and post number
+	$text =~ s/([\p{IsN}])[,]([^\p{IsN}])/$1 , $2/g;
+	$text =~ s/([^\p{IsN}])[,]([\p{IsN}])/$1 , $2/g;
+	      
+	# turn ` into '
+	$text =~ s/\`/\'/g;
+	
+	# turn '' into " (surrounded by spaces)
+	$text =~ s/\'\'/ \" /g;
+
+	if ($language eq "en") {
+		# split contractions right: between two letters the apostrophe stays
+		# attached to what follows it (x'y becomes "x 'y"); elsewhere it becomes
+		# a separate token
+		$text =~ s/([^\p{IsAlpha}])[']([^\p{IsAlpha}])/$1 ' $2/g;
+		$text =~ s/([^\p{IsAlpha}\p{IsN}])[']([\p{IsAlpha}])/$1 ' $2/g;
+		$text =~ s/([\p{IsAlpha}])[']([^\p{IsAlpha}])/$1 ' $2/g;
+		$text =~ s/([\p{IsAlpha}])[']([\p{IsAlpha}])/$1 '$2/g;
+		#special case for "1990's"
+		$text =~ s/([\p{IsN}])[']([s])/$1 '$2/g;
+	} elsif (($language eq "fr") or ($language eq "it")) {
+		# split contractions left: between two letters the apostrophe stays
+		# attached to what precedes it (x'y becomes "x' y")
+		$text =~ s/([^\p{IsAlpha}])[']([^\p{IsAlpha}])/$1 ' $2/g;
+		$text =~ s/([^\p{IsAlpha}])[']([\p{IsAlpha}])/$1 ' $2/g;
+		$text =~ s/([\p{IsAlpha}])[']([^\p{IsAlpha}])/$1 ' $2/g;
+		$text =~ s/([\p{IsAlpha}])[']([\p{IsAlpha}])/$1' $2/g;
+	} else {
+		# every other language: each apostrophe becomes its own token
+		$text =~ s/\'/ \' /g;
+	}
+	
+	# word token method: look at each whitespace-separated word and decide
+	# whether a single trailing period is split off as its own token
+	my @words = split(/\s/,$text);
+	$text = "";
+	for (my $i=0;$i<(scalar(@words));$i++) {
+		my $word = $words[$i];
+		if ( $word =~ /^(\S+)\.$/) {
+			# $pre is the word without its final period
+			my $pre = $1;
+			# keep the period attached when $pre itself contains a period and a
+			# letter, when $pre is a prefix stored with value 1, or when the next
+			# word starts with a lower-case letter
+			if (($pre =~ /\./ && $pre =~ /\p{IsAlpha}/) || ($NONBREAKING_PREFIX{$pre} && $NONBREAKING_PREFIX{$pre}==1) || ($i<scalar(@words)-1 && ($words[$i+1] =~ /^[\p{IsLower}]/))) {
+				#no change
+			# prefixes stored with value 2 keep the period only before a digit
+			} elsif (($NONBREAKING_PREFIX{$pre} && $NONBREAKING_PREFIX{$pre}==2) && ($i<scalar(@words)-1 && ($words[$i+1] =~ /^[0-9]+/))) {
+				#no change
+			} else {
+				$word = $pre." .";
+			}
+		}
+		$text .= $word." ";
+	}		
+
+	# clean up extraneous spaces
+	$text =~ s/ +/ /g;
+	$text =~ s/^ //g;
+	$text =~ s/ $//g;
+
+	# restore multi-dots: each "DOT" in front of "MULTI" becomes one period
+	# after the placeholder, then the remaining "DOTMULTI" becomes the first one
+	while($text =~ /DOTDOTMULTI/) {
+		$text =~ s/DOTDOTMULTI/DOTMULTI./g;
+	}
+	$text =~ s/DOTMULTI/./g;
+	
+	#ensure final line break
+	$text .= "\n" unless $text =~ /\n$/;
+
+	return $text;
+}
+
+# Load the nonbreaking-prefix list for a language into a hash.
+#
+# Parameters:
+#   $language   - language code; selects "$mydir/nonbreaking_prefix.$language"
+#   $PREFIX_REF - reference to the hash that receives the entries
+#
+# Every non-empty line that does not start with "#" is one prefix. A prefix
+# followed by whitespace and "#NUMERIC_ONLY#" is stored with value 2; every
+# other prefix is stored with value 1.
+#
+# When no file exists for $language a warning is printed on STDERR and the
+# English file is used instead. Dies when neither file exists or when the
+# selected file cannot be opened.
+sub load_prefixes {
+	my ($language, $PREFIX_REF) = @_;
+	
+	my $prefixfile = "$mydir/nonbreaking_prefix.$language";
+	
+	#default back to English if we don't have a language-specific prefix file
+	if (!(-e $prefixfile)) {
+		$prefixfile = "$mydir/nonbreaking_prefix.en";
+		print STDERR "WARNING: No known abbreviations for language '$language', attempting fall-back to English version...\n";
+		die ("ERROR: No abbreviations files found in $mydir\n") unless (-e $prefixfile);
+	}
+	
+	# The file is known to exist at this point. Use a lexical handle and check
+	# the open so an unreadable file is reported instead of silently producing
+	# an empty prefix table.
+	open(my $prefix_fh, "<:utf8", $prefixfile)
+		or die ("ERROR: Cannot open $prefixfile: $!\n");
+	while (my $item = <$prefix_fh>) {
+		chomp($item);
+		# skip empty lines and comment lines
+		next if (!$item || substr($item,0,1) eq "#");
+		# non-greedy capture: all whitespace before the marker is excluded from
+		# the key, so "No   #NUMERIC_ONLY#" is stored as "No", not "No  "
+		if ($item =~ /^(.*?)\s+\#NUMERIC_ONLY\#/) {
+			$PREFIX_REF->{$1} = 2;
+		} else {
+			$PREFIX_REF->{$item} = 1;
+		}
+	}
+	close($prefix_fh);
+	
+	return;
+}
+
--- a/mgiza-aligner/giza.cfg.pattern
+++ b/mgiza-aligner/giza.cfg.pattern
@ -0,0 +1,100 @@
+adbackoff 0
+compactadtable 1
+compactalignmentformat 0
+coocurrencefile corpora/CORPUS_NAME/src.low_trg.low.cooc
+corpusfile corpora/CORPUS_NAME/src.low_trg.low.snt
+countcutoff 1e-06
+countcutoffal 1e-05
+countincreasecutoff 1e-06
+countincreasecutoffal 1e-05
+countoutputprefix
+d
+deficientdistortionforemptyword 0
+depm4 76
+depm5 68
+dictionary
+dopeggingyn 0
+dumpcount 0
+dumpcountusingwordstring 0
+emalignmentdependencies 2
+emalsmooth 0.2
+emprobforempty 0.4
+emsmoothhmm 2
+hmmdumpfrequency 0
+hmmiterations 5
+log 0
+logfile corpora/CORPUS_NAME/mgiza.log
+m1 5
+m2 0
+m3 3
+m4 3
+m5 0
+m5p0 -1
+m6 0
+manlexfactor1 0
+manlexfactor2 0
+manlexmaxmultiplicity 20
+maxfertility 10
+maxsentencelength 101
+mh 5
+mincountincrease 1e-07
+ml 101
+model1dumpfrequency 1
+model1iterations 5
+model23smoothfactor 0
+model2dumpfrequency 0
+model2iterations 0
+model345dumpfrequency 0
+model3dumpfrequency 0
+model3iterations 3
+model4iterations 3
+model4smoothfactor 0.4
+model5iterations 0
+model5smoothfactor 0.1
+model6iterations 0
+nbestalignments 0
+ncpus 2
+nodumps 1
+nofiledumpsyn 1
+noiterationsmodel1 5
+noiterationsmodel2 0
+noiterationsmodel3 3
+noiterationsmodel4 3
+noiterationsmodel5 0
+noiterationsmodel6 0
+nsmooth 4
+nsmoothgeneral 0
+numberofiterationsforhmmalignmentmodel 5
+onlyaldumps 1
+outputfileprefix corpora/CORPUS_NAME/aligned
+outputpath
+p 0
+p0 0.999
+peggedcutoff 0.03
+pegging 0
+previousa
+previousd
+previousd4
+previousd42
+previoushmm
+previousn
+previousp0
+previoust
+probcutoff 1e-07
+probsmooth 1e-07
+readtableprefix
+restart 0
+sourcevocabularyfile corpora/CORPUS_NAME/src.low.vcb
+t1 1
+t2 0
+t2to3 0
+t3 0
+t345 0
+targetvocabularyfile corpora/CORPUS_NAME/trg.low.vcb
+tc
+testcorpusfile
+th 0
+transferdumpfrequency 0
+v 0
+verbose 0
+verbosesentence -10
--- a/mgiza-aligner/mgiza
+++ b/mgiza-aligner/mgiza
@ -0,0 +1 @@
+Subproject commit d643960de98565d208114780ba8025799208afa7
				`@ -0,0 +1 @@`
				`Subproject commit d643960de98565d208114780ba8025799208afa7`