mgiza

2017-01-21 17:01:15 +01:00 · 2017-01-21 17:01:15 +01:00 · 35a78669a3
commit 35a78669a3
parent 254e028f23
17 changed files with 2211 additions and 0 deletions
--- a/mgiza-aligner/.gitignore
+++ b/mgiza-aligner/.gitignore
@ -0,0 +1 @@
 corpora/*
--- a/mgiza-aligner/Makefile
+++ b/mgiza-aligner/Makefile
@ -0,0 +1,39 @@
 # Language codes passed to the tokenizer (-l) and the name of the corpus
 # directory under corpora/ that holds src.txt and trg.txt.
 SRC_LANG=en
 TRG_LANG=pl
 CORPUS_NAME=europarl
 # Neither target produces a file of its own name; declaring them phony keeps
 # a stray file called "all" or "clean" from masking the rule.
 .PHONY: all clean
 # Run mgiza on the prepared corpus, then concatenate the aligned*part* files
 # it leaves in the corpus directory into a single aligned.txt.
 all: corpora/$(CORPUS_NAME)/giza.cfg corpora/$(CORPUS_NAME)/src.low_trg.low.cooc corpora/$(CORPUS_NAME)/src.low_trg.low.snt corpora/$(CORPUS_NAME)/src.low.vcb corpora/$(CORPUS_NAME)/trg.low.vcb
 	mgiza/mgizapp/bin/mgiza corpora/$(CORPUS_NAME)/giza.cfg
 	cat corpora/$(CORPUS_NAME)/aligned*part* > corpora/$(CORPUS_NAME)/aligned.txt
 # Remove every generated file; the src.txt / trg.txt inputs are left alone.
 clean:
 	rm -f corpora/$(CORPUS_NAME)/*.tok
 	rm -f corpora/$(CORPUS_NAME)/*.low
 	rm -f corpora/$(CORPUS_NAME)/*.classes
 	rm -f corpora/$(CORPUS_NAME)/*.classes.cats
 	rm -f corpora/$(CORPUS_NAME)/*.vcb
 	rm -f corpora/$(CORPUS_NAME)/*.snt
 	rm -f corpora/$(CORPUS_NAME)/*.cooc
 	rm -f corpora/$(CORPUS_NAME)/aligned*
 	rm -f corpora/$(CORPUS_NAME)/giza.cfg
 # Instantiate the mgiza configuration from the template by replacing the
 # literal token CORPUS_NAME (first occurrence on each line) with the corpus name.
 corpora/$(CORPUS_NAME)/giza.cfg: giza.cfg.pattern
 	sed 's/CORPUS_NAME/'$(CORPUS_NAME)'/' < $< > $@
 # Build the co-occurrence file from the two vocabularies and the sentence file.
 corpora/$(CORPUS_NAME)/src.low_trg.low.cooc: corpora/$(CORPUS_NAME)/src.low.vcb corpora/$(CORPUS_NAME)/trg.low.vcb corpora/$(CORPUS_NAME)/src.low_trg.low.snt
 	mgiza/mgizapp/bin/snt2cooc $@ corpora/$(CORPUS_NAME)/src.low.vcb corpora/$(CORPUS_NAME)/trg.low.vcb corpora/$(CORPUS_NAME)/src.low_trg.low.snt
 # A single plain2snt run on the two lowercased files is declared to produce
 # both .snt files and both .vcb files.
 corpora/$(CORPUS_NAME)/src.low_trg.low.snt corpora/$(CORPUS_NAME)/trg.low_src.low.snt corpora/$(CORPUS_NAME)/src.low.vcb corpora/$(CORPUS_NAME)/trg.low.vcb: corpora/$(CORPUS_NAME)/src.low corpora/$(CORPUS_NAME)/trg.low
 	mgiza/mgizapp/bin/plain2snt corpora/$(CORPUS_NAME)/src.low corpora/$(CORPUS_NAME)/trg.low
 # Run mkcls on a lowercased file, writing its result to the .classes target.
 # NOTE(review): `all` does not depend on any .classes file - confirm whether
 # this rule is still needed.
 corpora/$(CORPUS_NAME)/%.classes: corpora/$(CORPUS_NAME)/%.low
 	mgiza/mgizapp/bin/mkcls -n10 -p$< -V$@
 # Lowercase a tokenized file.
 corpora/$(CORPUS_NAME)/%.low: corpora/$(CORPUS_NAME)/%.tok
 	tr '[:upper:]' '[:lower:]' < $< > $@
 # Tokenize the raw source-language text with the Europarl tokenizer.
 corpora/$(CORPUS_NAME)/src.tok: corpora/$(CORPUS_NAME)/src.txt
 	europarl/tools/tokenizer.perl -l $(SRC_LANG) < $< > $@
 # Tokenize the raw target-language text with the Europarl tokenizer.
 corpora/$(CORPUS_NAME)/trg.tok: corpora/$(CORPUS_NAME)/trg.txt
 	europarl/tools/tokenizer.perl -l $(TRG_LANG) < $< > $@
--- a/mgiza-aligner/europarl/README
+++ b/mgiza-aligner/europarl/README
@ -0,0 +1,58 @@
 Europarl Release v3 -- Sept 27, 2007
 ===================================
 This is a parallel corpus that was extracted from the
 European Parliament web site by Philipp Koehn (University 
 of Edinburgh). It is fairly big, 40 million words per 
 language, and its main intended use is to aid 
 statistical machine translation research.
 More information can be found at
 	http://www.statmt.org/europarl/
 The main difference in this release vs. the first release
 in 2002 and second release in 2003 is that it is larger 
 and it comes with improved processing tools that allow 
 the creation of parallel corpora between any two of the 
 11 languages.
 Some data is now tagged with the original language the text
 was spoken in.
 Sentence aligner
 ----------------
 You can create any parallel corpus with the command
 	./sentence-align-corpus.perl L1 L2
 where L1 and L2 can be any of the 11 languages
 	da de el en es fi fr it nl pt sv
 The output is stored in the aligned/ directory.
 NOTE: To use this corpus with tools like Giza++, you want to
 - lowercase the text (recommended)
 - strip empty lines and their correspondences (recommended)
 - tokenize words and punctuation (recommended)
 - remove lines with XML-Tags (starting with "<") (required) 
 The sentence aligner uses the split-sentences.perl script, 
 which does the sentence splitting. You may want to 
 use your own preprocessor. This requires changing an 
 obvious line in the sentence aligner code. A tokenizer.perl
 script is included as well.
 Source
 ------
 http://www3.europarl.eu.int/omk/omnsapir.so/calendar?APP=CRE&LANGUE=EN
 Copyright in the Europarl service
 (c) European Communities
 Except where otherwise indicated, reproduction is authorised,
 provided that the source is acknowledged. 
 Change Log
 ----------
 Preprocessing is improved.
 This release covers 9/1996 - 10/2006.
 Includes sentence aligner and tokenizer.
--- a/mgiza-aligner/europarl/sentence-align-corpus.perl
+++ b/mgiza-aligner/europarl/sentence-align-corpus.perl
@ -0,0 +1,253 @@
 #!/usr/bin/perl -w
 # Sentence-align the day files of two languages.
 # Usage: ./sentence-align-corpus.perl L1 L2
 # Reads txt/L1/<dayfile> and txt/L2/<dayfile>; writes the aligned versions to
 # aligned/L1-L2/L1/<dayfile> and aligned/L1-L2/L2/<dayfile>.
 use strict;
 use Encode;
 binmode(STDIN, ":utf8");
 binmode(STDOUT, ":utf8");
 binmode(STDERR, ":utf8");
 my $dir = "txt";
 my $outdir = "aligned";
 my $preprocessor = "tools/split-sentences.perl -q";
 my ($l1,$l2) = @ARGV;
 # fail with an explanation instead of a bare "Died"
 die "Usage: $0 L1 L2\n" unless defined $l1 && defined $l2;
 die "ERROR: input directory $dir/$l1 does not exist\n" unless -e "$dir/$l1";
 die "ERROR: input directory $dir/$l2 does not exist\n" unless -e "$dir/$l2";
 # list-form system() keeps the language codes away from the shell
 foreach my $lang ($l1,$l2) {
  system("mkdir", "-p", "$outdir/$l1-$l2/$lang") == 0
    or die "ERROR: cannot create $outdir/$l1-$l2/$lang\n";
 }
 my ($dayfile,$s1); # globals for reporting reasons
 open(LS,"ls $dir/$l1|") or die "ERROR: cannot list $dir/$l1: $!\n";
 while($dayfile = <LS>) {
  # chomp only strips a line ending, it never eats a filename character
  chomp($dayfile);
  if (! -e "$dir/$l2/$dayfile") {
    print "$dayfile only for $l1, not $l2, skipping\n";
    next;
  }
  &align();
 }
 # Align the current day file (file-level $dayfile) of the two languages.
 # Both versions are sentence-split by the external preprocessor, then walked
 # in parallel: matching <CHAPTER> and <SPEAKER> tags are copied through,
 # tags whose IDs disagree cause lines to be skipped on the side that is
 # behind, and the text between tags is aligned paragraph by paragraph.
 # Takes no arguments and returns nothing useful; output goes to the
 # file-level handles OUT1/OUT2, diagnostics to STDOUT.
 sub align {
  my @TXT1native= `$preprocessor -l $l1 < $dir/$l1/$dayfile`;
  my @TXT2native = `$preprocessor -l $l2 < $dir/$l2/$dayfile`;
  #change perl encoding: backticks return bytes, decode them to characters
  my @TXT1 = map { decode_utf8($_) } @TXT1native;
  my @TXT2 = map { decode_utf8($_) } @TXT2native;
  # OUT1/OUT2 stay bareword handles because sentence_align prints to them
  open(OUT1, ">", "$outdir/$l1-$l2/$l1/$dayfile")
    or die "ERROR: cannot write $outdir/$l1-$l2/$l1/$dayfile: $!\n";
  open(OUT2, ">", "$outdir/$l1-$l2/$l2/$dayfile")
    or die "ERROR: cannot write $outdir/$l1-$l2/$l2/$dayfile: $!\n";
  binmode(OUT1, ":utf8");
  binmode(OUT2, ":utf8");
  my ($i1,$i2) = (0,0);
  while ($i1<scalar(@TXT1) && $i2<scalar(@TXT2)) {
    # match chapter start
    if ($TXT1[$i1] =~ /^<CHAPTER ID=\"?(\d+)\"?/) {
      my $c1 = $1;
      if ($TXT2[$i2] =~ /^<CHAPTER ID=\"?(\d+)\"?/) {
        my $c2 = $1;
        if ($c1 == $c2) {
          print OUT1 $TXT1[$i1++];
          print OUT2 $TXT2[$i2++];
        }
        elsif ($c1 < $c2) {
          # side 1 is behind: jump to its next chapter tag
          $i1 = &skip(\@TXT1,$i1+1,'^<CHAPTER ID=\"?\d+\"?');
        }
        else {
          $i2 = &skip(\@TXT2,$i2+1,'^<CHAPTER ID=\"?\d+\"?');
        }
      }
      else {
        # side 2 is not at a chapter tag yet: advance it to one
        $i2 = &skip(\@TXT2,$i2,'^<CHAPTER ID=\"?\d+\"?');
      }
    }
    # match speaker start
    elsif ($TXT1[$i1] =~ /^<SPEAKER ID=\"?(\d+)\"?/) {
      $s1 = $1; # file-level, reported in the paragraph-mismatch message
      if ($TXT2[$i2] =~ /^<SPEAKER ID=\"?(\d+)\"?/) {
        my $s2 = $1;
        if ($s1 == $s2) {
          print OUT1 $TXT1[$i1++];
          print OUT2 $TXT2[$i2++];
        }
        elsif ($s1 < $s2) {
          $i1 = &skip(\@TXT1,$i1+1,'^<SPEAKER ID=\"?\d+\"?');
        }
        else {
          $i2 = &skip(\@TXT2,$i2+1,'^<SPEAKER ID=\"?\d+\"?');
        }
      }
      else {
        $i2 = &skip(\@TXT2,$i2,'^<SPEAKER ID=\"?\d+\"?');
      }
    }
    else {
      # plain text: collect the paragraphs up to the next tag on each side
      my @P1 = &extract_paragraph(\@TXT1,\$i1);
      my @P2 = &extract_paragraph(\@TXT2,\$i2);
      if (scalar(@P1) != scalar(@P2)) {
        # nothing is written for a stretch whose paragraph counts differ
        print "$dayfile (speaker $s1) different number of paragraphs ".scalar(@P1)." != ".scalar(@P2)."\n";
      }
      else {
        for(my $p=0;$p<scalar(@P1);$p++) {
          &sentence_align(\@{$P1[$p]},\@{$P2[$p]});
        }
      }
    }
  }
  # close explicitly so that a failed write is reported for this day file
  close(OUT1) or die "ERROR: cannot close $outdir/$l1-$l2/$l1/$dayfile: $!\n";
  close(OUT2) or die "ERROR: cannot close $outdir/$l1-$l2/$l2/$dayfile: $!\n";
 }
 close(LS);
 # Move forward through the lines in @$TXT, starting at index $i, until a line
 # matches $pattern or the end of the array is reached.
 #   $TXT     - reference to an array of lines
 #   $i       - index to start scanning from
 #   $pattern - regular expression, given as a string
 # Reports the skipped range (with the file-level $dayfile) on STDOUT and
 # returns the index where scanning stopped.
 sub skip {
  my ($TXT,$i,$pattern) = @_;
  my $start = $i;
  my $total = scalar(@{$TXT});
  for (; $i < $total; $i++) {
    last if $TXT->[$i] =~ /$pattern/;
  }
  print "$dayfile skipped lines $start-$i to reach '$pattern'\n";
  return $i;
 }
 # Collect the lines from index $$i up to (not including) the next <SPEAKER>
 # or <CHAPTER> tag, grouped into paragraphs.
 #   $TXT - reference to an array of lines
 #   $i   - reference to the current index; left on the first line not consumed
 # Returns a list of array references, one per paragraph. A line starting
 # with <P> closes the running paragraph (if it has any content) and forms a
 # paragraph of its own.
 sub extract_paragraph {
  my ($TXT,$i) = @_;
  my @paragraphs;
  my $current = 0;
  while ($$i < scalar(@{$TXT})) {
    my $line = $TXT->[$$i];
    last if $line =~ /^<SPEAKER ID=\"?\d+\"?/;
    last if $line =~ /^<CHAPTER ID=\"?\d+\"?/;
    my $is_break = ($line =~ /^<P>/);
    # a break marker never shares a paragraph with preceding text
    $current++ if $is_break && $paragraphs[$current];
    push @{$paragraphs[$current]}, $line;
    # each XML tag has its own paragraph
    $current++ if $is_break;
    $$i++;
  }
  return @paragraphs;
 }
 # Align the sentences of two corresponding paragraphs and print the result.
 # this is a vanilla implementation of church and gale
 #   $P1, $P2 - references to arrays of sentences, one sentence per element
 # Every aligned segment becomes one line on OUT1 and one line on OUT2 (the
 # file-level output handles); sentences merged into one segment are joined
 # with a single space and a side with no counterpart gets an empty line.
 # Both input arrays are modified in place: their line endings are removed.
 sub sentence_align {
  my ($P1,$P2) = @_;
  # chomp rather than chop: a line that arrives without a trailing newline
  # must not lose its last character
  chomp(@{$P1});
  chomp(@{$P2});
  my $n1 = scalar(@{$P1});
  my $n2 = scalar(@{$P2});
  # parameters: prior probability of grouping d1 sentences of P1 with d2 of P2
  my %PRIOR;
  $PRIOR{1}{1} = 0.89;
  $PRIOR{1}{0} = 0.01/2;
  $PRIOR{0}{1} = 0.01/2;
  $PRIOR{2}{1} = 0.089/2;
  $PRIOR{1}{2} = 0.089/2;
 #  $PRIOR{2}{2} = 0.011;
  # compute length (in characters, whitespace excluded) as running totals, so
  # that the length of sentences a..b-1 is $LEN[b]-$LEN[a]
  my @LEN1 = (0);
  foreach my $sentence (@{$P1}) {
    (my $stripped = $sentence) =~ s/[\s\r\n]+//g;
    push @LEN1, $LEN1[-1] + length($stripped);
  }
  my @LEN2 = (0);
  foreach my $sentence (@{$P2}) {
    (my $stripped = $sentence) =~ s/[\s\r\n]+//g;
    push @LEN2, $LEN2[-1] + length($stripped);
  }
  # dynamic programming: $COST[i1][i2] is the cheapest way to align the first
  # i1 sentences of P1 with the first i2 of P2, $BACK[i1][i2] its predecessor
  my (@COST,@BACK);
  $COST[0][0] = 0;
  for my $i1 (0..$n1) {
    for my $i2 (0..$n2) {
      next if $i1 + $i2 == 0;
      $COST[$i1][$i2] = 1e10;
      # keys are visited in sorted order so that equal-cost alternatives are
      # resolved the same way on every run (hash order is not stable)
      foreach my $d1 (sort { $a <=> $b } keys %PRIOR) {
        next if $d1>$i1;
        foreach my $d2 (sort { $a <=> $b } keys %{$PRIOR{$d1}}) {
          next if $d2>$i2;
          my $cost = $COST[$i1-$d1][$i2-$d2] - log($PRIOR{$d1}{$d2}) +
            &match($LEN1[$i1]-$LEN1[$i1-$d1], $LEN2[$i2]-$LEN2[$i2-$d2]);
          if ($cost < $COST[$i1][$i2]) {
            $COST[$i1][$i2] = $cost;
            @{$BACK[$i1][$i2]} = ($i1-$d1,$i2-$d2);
          }
        }
      }
    }
  }
  # back tracking: turn the predecessor links into successor links
  my %NEXT;
  my ($i1,$i2) = ($n1,$n2);
  while($i1>0 || $i2>0) {
    my ($prev1,$prev2) = @{$BACK[$i1][$i2]};
    $NEXT{$prev1}{$prev2} = [$i1,$i2];
    ($i1,$i2) = ($prev1,$prev2);
  }
  # walk the chosen path from the start and emit one line per segment
  while($i1<$n1 || $i2<$n2) {
    my ($next1,$next2) = @{$NEXT{$i1}{$i2}};
    print OUT1 join(" ", @{$P1}[$i1..$next1-1]), "\n";
    print OUT2 join(" ", @{$P2}[$i2..$next2-1]), "\n";
    ($i1,$i2) = ($next1,$next2);
  }
 }
 # Length-based cost of pairing a segment of $len1 characters with a segment
 # of $len2 characters; a lower value means a more plausible pair.
 # Returns 0 when both segments are empty, otherwise the negative log of a
 # two-sided tail probability from pnorm, or the constant 25 once that
 # probability is no longer positive.
 sub match {
  my ($len1,$len2) = @_;
  return 0 if $len1==0 && $len2==0;
  # NOTE(review): presumably the Gale & Church length ratio and variance - confirm
  my $c = 1;
  my $s2 = 6.8;
  my $mean = ($len1 + $len2/$c) / 2;
  # only the magnitude of the normalised length difference matters
  my $z = abs(($c * $len1 - $len2)/sqrt($s2 * $mean));
  my $pd = 2 * (1 - &pnorm($z));
  return $pd>0 ? -log($pd) : 25;
 }
 # Polynomial approximation of the standard normal cumulative distribution
 # function at $z (match only ever calls it with $z >= 0).
 sub pnorm {
  my ($z) = @_;
  my $t = 1/(1 + 0.2316419 * $z);
  # Horner evaluation of the degree-five polynomial in $t, highest-order
  # coefficient first; the polynomial has no constant term
  my $poly = 0;
  foreach my $coef (1.330274429, -1.821255978, 1.781477937, -0.356563782, 0.319381530) {
    $poly = ($poly + $coef) * $t;
  }
  return 1 - 0.3989423 * exp(-$z * $z / 2) * $poly;
 }
--- a/mgiza-aligner/europarl/tools/README
+++ b/mgiza-aligner/europarl/tools/README
@ -0,0 +1,73 @@
 Europarl v3 Preprocessing Tools
 ===============================
 written by Philipp Koehn and Josh Schroeder
 Sentence Splitter
 =================
 Usage ./split-sentences.perl -l [en|de|...] < textfile > splitfile
 Uses punctuation and Capitalization clues to split paragraphs of 
 sentences into files with one sentence per line. For example:
 This is a paragraph. It contains several sentences. "But why," you ask?
 goes to:
 This is a paragraph.
 It contains several sentences.
 "But why," you ask?
 See more information in the Nonbreaking Prefixes section.
 Tokenizer
 =========
 Usage ./tokenizer.perl -l [en|de|...] < textfile > tokenizedfile
 Splits out most punctuation from words. Special cases where splits
 do not occur are documented in the code. 
 This E.U. treaty is, to use the words of Mr. Smith, "awesome." 
 goes to:
 This E.U. treaty is , to use the words of Mr. Smith , " awesome . "
 Like the sentence splitter, it makes use of the nonbreaking_prefixes
 directory.
 Nonbreaking Prefixes Directory
 ==============================
 Nonbreaking prefixes are loosely defined as any word ending in a
 period that does NOT indicate an end of sentence marker. A basic
 example is Mr. and Ms. in English.
 The sentence splitter and tokenizer included with this release
 both use the nonbreaking prefix files included in this directory.
 To add a file for other languages, follow the naming convention
 nonbreaking_prefix.?? and use the two-letter language code you
 intend to use when calling split-sentences.perl and tokenizer.perl.
 Both split-sentences and tokenizer will first look for a file for the
 language they are processing, and fall back to English if a file
 for that language is not found. If the nonbreaking_prefixes directory does
 not exist at the same location as the split-sentences.perl and tokenizer.perl
 files, they will not run.
 For the splitter, normally a period followed by an uppercase word
 results in a sentence split. If the word preceding the period
 is a nonbreaking prefix, this line break is not inserted.
 For the tokenizer, a nonbreaking prefix is not separated from its 
 period with a space.
 A special case of prefixes, NUMERIC_ONLY, is included for special
 cases where the prefix should be handled ONLY when before numbers.
 For example, "Article No. 24 states this." the No. is a nonbreaking
 prefix. However, in "No. It is not true." No functions as a word.
 See the example prefix files included here for more examples.
--- a/mgiza-aligner/europarl/tools/nonbreaking_prefixes/nonbreaking_prefix.de
+++ b/mgiza-aligner/europarl/tools/nonbreaking_prefixes/nonbreaking_prefix.de
@ -0,0 +1,325 @@
 #Anything in this file, followed by a period (and an upper-case word), does NOT indicate an end-of-sentence marker.
 #Special cases are included for prefixes that ONLY appear before 0-9 numbers.
 #any single upper case letter  followed by a period is not a sentence ender (excluding I occasionally, but we leave it in)
 #usually upper case letters are initials in a name
 #no german words end in single lower-case letters, so we throw those in too.
 A
 B
 C
 D
 E
 F
 G
 H
 I
 J
 K
 L
 M
 N
 O
 P
 Q
 R
 S
 T
 U
 V
 W
 X
 Y
 Z
 a
 b
 c
 d
 e
 f
 g
 h
 i
 j
 k
 l
 m
 n
 o
 p
 q
 r
 s
 t
 u
 v
 w
 x
 y
 z
 #Roman Numerals. A dot after one of these is not a sentence break in German.
 I
 II
 III
 IV
 V
 VI
 VII
 VIII
 IX
 X
 XI
 XII
 XIII
 XIV
 XV
 XVI
 XVII
 XVIII
 XIX
 XX
 i
 ii
 iii
 iv
 v
 vi
 vii
 viii
 ix
 x
 xi
 xii
 xiii
 xiv
 xv
 xvi
 xvii
 xviii
 xix
 xx
 #Titles and Honorifics
 Adj
 Adm
 Adv
 Asst
 Bart
 Bldg
 Brig
 Bros
 Capt
 Cmdr
 Col
 Comdr
 Con
 Corp
 Cpl
 DR
 Dr
 Ens
 Gen
 Gov
 Hon
 Hosp
 Insp
 Lt
 MM
 MR
 MRS
 MS
 Maj
 Messrs
 Mlle
 Mme
 Mr
 Mrs
 Ms
 Msgr
 Op
 Ord
 Pfc
 Ph
 Prof
 Pvt
 Rep
 Reps
 Res
 Rev
 Rt
 Sen
 Sens
 Sfc
 Sgt
 Sr
 St
 Supt
 Surg
 #Misc symbols
 Mio
 Mrd
 bzw
 v
 vs
 usw
 d.h
 z.B
 u.a
 etc
 Mrd
 MwSt
 ggf
 d.J
 D.h
 m.E
 vgl
 I.F
 z.T
 sogen
 ff
 u.E
 g.U
 g.g.A
 c.-à-d
 Buchst
 u.s.w
 sog
 u.ä
 Std
 evtl
 Zt
 Chr
 u.U
 o.ä
 Ltd
 b.A
 z.Zt
 spp
 sen
 SA
 k.o
 jun
 i.H.v
 dgl
 dergl
 Co
 zzt
 usf
 s.p.a
 Dkr
 Corp
 bzgl
 BSE
 #Number indicators
 # add #NUMERIC_ONLY# after the word if it should ONLY be non-breaking when a 0-9 digit follows it
 No
 Nos
 Art
 Nr
 pp
 ca
 Ca
 #Ordinals are done with . in German - "1." = "1st" in English
 1
 2
 3
 4
 5
 6
 7
 8
 9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
--- a/mgiza-aligner/europarl/tools/nonbreaking_prefixes/nonbreaking_prefix.el
+++ b/mgiza-aligner/europarl/tools/nonbreaking_prefixes/nonbreaking_prefix.el
@ -0,0 +1,2 @@
 # for now, just include the Greek equivalent of "Mr."
 κ
--- a/mgiza-aligner/europarl/tools/nonbreaking_prefixes/nonbreaking_prefix.en
+++ b/mgiza-aligner/europarl/tools/nonbreaking_prefixes/nonbreaking_prefix.en
@ -0,0 +1,107 @@
 #Anything in this file, followed by a period (and an upper-case word), does NOT indicate an end-of-sentence marker.
 #Special cases are included for prefixes that ONLY appear before 0-9 numbers.
 #any single upper case letter  followed by a period is not a sentence ender (excluding I occasionally, but we leave it in)
 #usually upper case letters are initials in a name
 A
 B
 C
 D
 E
 F
 G
 H
 I
 J
 K
 L
 M
 N
 O
 P
 Q
 R
 S
 T
 U
 V
 W
 X
 Y
 Z
 #List of titles. These are often followed by upper-case names, but do not indicate sentence breaks
 Adj
 Adm
 Adv
 Asst
 Bart
 Bldg
 Brig
 Bros
 Capt
 Cmdr
 Col
 Comdr
 Con
 Corp
 Cpl
 DR
 Dr
 Drs
 Ens
 Gen
 Gov
 Hon
 Hr
 Hosp
 Insp
 Lt
 MM
 MR
 MRS
 MS
 Maj
 Messrs
 Mlle
 Mme
 Mr
 Mrs
 Ms
 Msgr
 Op
 Ord
 Pfc
 Ph
 Prof
 Pvt
 Rep
 Reps
 Res
 Rev
 Rt
 Sen
 Sens
 Sfc
 Sgt
 Sr
 St
 Supt
 Surg
 #misc - odd period-ending items that NEVER indicate breaks (p.m. does NOT fall into this category - it sometimes ends a sentence)
 v
 vs
 i.e
 rev
 e.g
 #Numbers only. These should only induce breaks when followed by a numeric sequence
 # add NUMERIC_ONLY after the word for this function
 #This case is mostly for the english "No." which can either be a sentence of its own, or
 #if followed by a number, a non-breaking prefix
 No #NUMERIC_ONLY# 
 Nos
 Art #NUMERIC_ONLY#
 Nr
 pp #NUMERIC_ONLY#
--- a/mgiza-aligner/europarl/tools/nonbreaking_prefixes/nonbreaking_prefix.es
+++ b/mgiza-aligner/europarl/tools/nonbreaking_prefixes/nonbreaking_prefix.es
@ -0,0 +1,246 @@
 #Anything in this file, followed by a period (and an upper-case word), does NOT indicate an end-of-sentence marker.
 #Special cases are included for prefixes that ONLY appear before 0-9 numbers.
 #any single upper case letter  followed by a period is not a sentence ender (excluding I occasionally, but we leave it in)
 #usually upper case letters are initials in a name
 A
 B
 C
 D
 E
 F
 G
 H
 I
 J
 K
 L
 M
 N
 O
 P
 Q
 R
 S
 T
 U
 V
 W
 X
 Y
 Z
 #Abbreviations
 a.c
 aa.rr
 abrev
 adj
 adm
 admón
 afma
 afmas
 afmo
 afmos
 ag
 am
 ap
 apdo
 art
 arts
 arz
 arzbpo
 assn
 atte
 av
 avda
 bros
 bv
 cap
 caps
 cg
 cgo
 cia
 cit
 cl
 cm
 co
 col
 corp
 cos
 cta
 cte
 ctra
 cts
 cía
 cía
 d.c
 dcha
 dept
 depto
 dg
 dl
 dm
 doc
 docs
 dpt
 dpto
 dr
 dra
 dras
 dres
 dto
 dupdo
 ed
 ee.uu
 ej
 emma
 emmas
 emmo
 emmos
 entlo
 entpo
 esp
 etc
 ex
 excm
 excma
 excmas
 excmo
 excmos
 fasc
 fdo
 fig
 figs
 fil
 fol
 fra
 gr
 grs
 gral
 ha
 hnos
 hros
 hz
 ib
 ibid
 ibíd
 id
 ilm
 ilma
 ilmas
 ilmo
 ilmos
 iltre
 inc
 intr
 izq
 izqda
 izqdo
 jr
 kc
 kcal
 kg
 khz
 kl
 km
 kw
 lda
 ldo
 lib
 lic
 lim
 loc
 ltd
 ltda
 lám
 ma
 mg
 mhz
 min
 mm
 mons
 mr
 mrs
 ms
 mss
 mtro
 máx
 mín
 ntra
 ntro
 núm
 ob
 obpo
 op
 pd
 ph
 pje
 pl
 plc
 pm
 pp
 ppal
 pral
 prof
 prov
 pról
 ps
 pta
 ptas
 pte
 pts
 pza
 pág
 págs
 párr
 rda
 rdo
 ref
 reg
 rel
 rev
 revda
 revdo
 rma
 rmo
 rte
 s
 sa
 sdad
 sec
 secret
 seg
 sg
 sig
 smo
 sr
 sra
 sras
 sres
 srs
 srta
 ss.mm
 sta
 sto
 sust
 tech
 tel
 telf
 teléf
 ten
 tfono
 tlf
 t.v.e
 tít
 ud
 uds
 vda
 vdo
 vid
 vol
 vols
 vra
 vro
 vta
 íd
 ít
--- a/mgiza-aligner/europarl/tools/nonbreaking_prefixes/nonbreaking_prefix.fr
+++ b/mgiza-aligner/europarl/tools/nonbreaking_prefixes/nonbreaking_prefix.fr
@ -0,0 +1,153 @@
 #Anything in this file, followed by a period (and an upper-case word), does NOT indicate an end-of-sentence marker.
 #Special cases are included for prefixes that ONLY appear before 0-9 numbers.
 #any single upper case letter  followed by a period is not a sentence ender
 #usually upper case letters are initials in a name
 #no French words end in single lower-case letters, so we throw those in too?
 A
 B
 C
 D
 E
 F
 G
 H
 I
 J
 K
 L
 M
 N
 O
 P
 Q
 R
 S
 T
 U
 V
 W
 X
 Y
 Z
 a
 b
 c
 d
 e
 f
 g
 h
 i
 j
 k
 l
 m
 n
 o
 p
 q
 r
 s
 t
 u
 v
 w
 x
 y
 z
 # Period-final abbreviation list for French
 A.C.N
 A.M
 art
 ann
 apr
 av
 auj
 lib
 B.P
 boul
 ca
 c.-à-d
 cf
 ch.-l
 chap
 contr
 C.P.I
 C.Q.F.D
 C.N
 C.N.S
 C.S
 dir
 éd
 e.g
 env
 al
 etc
 E.V
 ex
 fasc
 fém
 fig
 fr
 hab
 ibid
 id
 i.e
 inf
 LL.AA
 LL.AA.II
 LL.AA.RR
 LL.AA.SS
 L.D
 LL.EE
 LL.MM
 LL.MM.II.RR
 loc.cit
 masc
 MM
 ms
 N.B
 N.D.A
 N.D.L.R
 N.D.T
 n/réf
 NN.SS
 N.S
 N.D
 N.P.A.I
 p.c.c
 pl
 pp
 p.ex
 p.j
 P.S
 R.A.S
 R.-V
 R.P
 R.I.P
 SS
 S.S
 S.A
 S.A.I
 S.A.R
 S.A.S
 S.E
 sec
 sect
 sing
 S.M
 S.M.I.R
 sq
 sqq
 suiv
 sup
 suppl
 tél
 T.S.V.P
 vb
 vol
 vs
 X.O
 Z.I
--- a/mgiza-aligner/europarl/tools/nonbreaking_prefixes/nonbreaking_prefix.it
+++ b/mgiza-aligner/europarl/tools/nonbreaking_prefixes/nonbreaking_prefix.it
@ -0,0 +1,134 @@
 #Anything in this file, followed by a period (and an upper-case word), does NOT indicate an end-of-sentence marker.
 #Special cases are included for prefixes that ONLY appear before 0-9 numbers.
 #any single upper case letter  followed by a period is not a sentence ender
 #usually upper case letters are initials in a name
 #no Italian words end in single lower-case letters, so we throw those in too?
 A
 B
 C
 D
 E
 F
 G
 H
 I
 J
 K
 L
 M
 N
 O
 P
 Q
 R
 S
 T
 U
 V
 W
 X
 Y
 Z
 a
 b
 c
 d
 e
 f
 g
 h
 i
 j
 k
 l
 m
 n
 o
 p
 q
 r
 s
 t
 u
 v
 w
 x
 y
 z
 # Period-final abbreviation list from http://www.chass.utoronto.ca/~ngargano/corsi/corrisp/abbreviazioni.html
 a.c 
 es 
 all 
 Amn 
 Arch 
 Avv
 Bcc
 c.a
 C.A.P
 Cc
 banc
 post
 c.c.p
 c.m
 Co
 c.p
 C.P
 corr
 c.s
 c.v
 Dott
 Dr
 ecc
 Egr
 e.p.c
 fatt
 Geom
 gg
 Id
 Ing
 int
 lett
 Mo
 Mons
 N.B
 ogg
 on
 pp
 p.c
 p.c
 p.c.c
 p.es
 p.f
 p.r
 P.S
 p.v
 P.T
 Prof
 racc
 Rag
 Rev
 ric
 Rif
 RP
 RSVP
 S.A
 acc
 S.B.F
 seg
 sgg
 ss
 Sig
 Sigg
 s.n.c
 Soc
 S.p.A
 Spett
 S.P.M
 S.r.l
 tel
 u.s
 V.P
 v.r
 v.s
--- a/mgiza-aligner/europarl/tools/nonbreaking_prefixes/nonbreaking_prefix.nl
+++ b/mgiza-aligner/europarl/tools/nonbreaking_prefixes/nonbreaking_prefix.nl
@ -0,0 +1,115 @@
 #Anything in this file, followed by a period (and an upper-case word), does NOT indicate an end-of-sentence marker.
 #Special cases are included for prefixes that ONLY appear before 0-9 numbers.
 #Sources: http://nl.wikipedia.org/wiki/Lijst_van_afkortingen 
 #         http://nl.wikipedia.org/wiki/Aanspreekvorm
 #         http://nl.wikipedia.org/wiki/Titulatuur_in_het_Nederlands_hoger_onderwijs
 #any single upper case letter  followed by a period is not a sentence ender (excluding I occasionally, but we leave it in)
 #usually upper case letters are initials in a name
 A
 B
 C
 D
 E
 F
 G
 H
 I
 J
 K
 L
 M
 N
 O
 P
 Q
 R
 S
 T
 U
 V
 W
 X
 Y
 Z
 #List of titles. These are often followed by upper-case names, but do not indicate sentence breaks
 bacc
 bc
 bgen
 c.i
 dhr
 dr
 dr.h.c
 drs
 drs
 ds
 eint
 fa
 Fa
 fam
 gen
 genm
 ing
 ir
 jhr
 jkvr
 jr
 kand
 kol
 lgen
 lkol
 Lt
 maj
 Mej
 mevr
 Mme
 mr
 mr
 Mw
 o.b.s
 plv
 prof
 ritm
 tint
 Vz
 Z.D
 Z.D.H
 Z.E
 Z.Em
 Z.H
 Z.K.H
 Z.K.M
 Z.M
 z.v
 #misc - odd period-ending items that NEVER indicate breaks (p.m. does NOT fall into this category - it sometimes ends a sentence)
 #we seem to have a lot of these in dutch i.e.: i.p.v - in plaats van (in stead of) never ends a sentence
 a.g.v
 bijv
 bijz
 bv
 d.w.z
 e.c
 e.g
 e.k
 ev
 i.p.v
 i.s.m
 i.t.t
 i.v.m
 m.a.w
 m.b.t
 m.b.v
 m.h.o
 m.i
 m.i.v
 v.w.t
 #Numbers only. These should only induce breaks when followed by a numeric sequence
 # add NUMERIC_ONLY after the word for this function
 #This case is mostly for the english "No." which can either be a sentence of its own, or
 #if followed by a number, a non-breaking prefix
 Nr #NUMERIC_ONLY# 
 Nrs 
 nrs
 nr #NUMERIC_ONLY#
--- a/mgiza-aligner/europarl/tools/nonbreaking_prefixes/nonbreaking_prefix.pl
+++ b/mgiza-aligner/europarl/tools/nonbreaking_prefixes/nonbreaking_prefix.pl
@ -0,0 +1,285 @@
 adw
 afr
 akad
 al
 Al
 am
 amer
 arch
 art
 Art
 artyst
 astr
 austr
 bałt
 bdb
 bł
 bm
 br
 bryg
 bryt
 centr
 ces
 chem
 chiń
 chir
 c.k
 c.o
 cyg
 cyw
 cyt
 czes
 czw
 cd
 Cd
 czyt
 ćw
 ćwicz
 daw
 dcn
 dekl
 demokr
 det
 diec
 dł
 dn
 dot
 dol
 dop
 dost
 dosł
 h.c
 ds
 dst
 duszp
 dypl
 egz
 ekol
 ekon
 elektr
 em
 ew
 fab
 farm
 fot
 fr
 gat
 gastr
 geogr
 geol
 gimn
 głęb
 gm
 godz
 górn
 gosp
 gr
 gram
 hist
 hiszp
 hr
 Hr
 hot
 id
 in
 im
 iron
 jn
 kard
 kat
 katol
 k.k
 kk
 kol
 kl
 k.p.a
 kpc
 k.p.c
 kpt
 kr
 k.r
 krak
 k.r.o
 kryt
 kult
 laic
 łac
 niem
 woj
 nb
 np
 Nb
 Np
 pol
 pow
 m.in
 pt
 ps
 Pt
 Ps
 cdn
 jw
 ryc
 rys
 Ryc
 Rys
 tj
 tzw
 Tzw
 tzn
 zob
 ang
 ub
 ul
 pw
 pn
 pl
 al
 k
 n
 nr #NUMERIC_ONLY#
 Nr #NUMERIC_ONLY#
 ww
 wł
 ur
 zm
 żyd
 żarg
 żyw
 wył
 bp
 bp
 wyst
 tow
 Tow
 o
 sp
 Sp
 st
 spółdz
 Spółdz
 społ
 spółgł
 stoł
 stow
 Stoł
 Stow
 zn
 zew
 zewn
 zdr
 zazw
 zast
 zaw
 zał
 zal
 zam
 zak
 zakł
 zagr
 zach
 adw
 Adw
 lek
 Lek
 med
 mec
 Mec
 doc
 Doc
 dyw
 dyr
 Dyw
 Dyr
 inż
 Inż
 mgr
 Mgr
 dh
 dr
 Dh
 Dr
 p
 P
 red
 Red
 prof
 prok
 Prof
 Prok
 hab
 płk
 Płk
 nadkom
 Nadkom
 podkom
 Podkom
 ks
 Ks
 gen
 Gen
 por
 Por
 reż
 Reż
 przyp
 Przyp
 śp
 św
 śW
 Śp
 Św
 ŚW
 szer
 Szer
 pkt #NUMERIC_ONLY#
 str #NUMERIC_ONLY#
 tab #NUMERIC_ONLY#
 Tab #NUMERIC_ONLY#
 tel
 ust #NUMERIC_ONLY#
 par #NUMERIC_ONLY#
 poz
 pok
 oo
 oO
 Oo
 OO
 r #NUMERIC_ONLY#
 l #NUMERIC_ONLY#
 s #NUMERIC_ONLY#
 najśw
 Najśw
 A
 B
 C
 D
 E
 F
 G
 H
 I
 J
 K
 L
 M
 N
 O
 P
 Q
 R
 S
 T
 U
 V
 W
 X
 Y
 Z
 Ś
 Ć
 Ż
 Ź
 Dz
 Contact GitHub API Training Shop Blog About
--- a/mgiza-aligner/europarl/tools/split-sentences.perl
+++ b/mgiza-aligner/europarl/tools/split-sentences.perl
@ -0,0 +1,152 @@
 #!/usr/bin/perl -w
 # Based on Preprocessor written by Philipp Koehn
 # Sentence splitter: reads text on STDIN and prints it one sentence per line.
 # Options: -l LANG (nonbreaking-prefix language, default "en"), -q (quiet),
 # -h (print usage and exit).
 binmode(STDIN, ":utf8");
 binmode(STDOUT, ":utf8");
 binmode(STDERR, ":utf8");
 use FindBin qw($Bin);
 use strict;
 # the prefix lists live next to this script
 my $mydir = "$Bin/nonbreaking_prefixes";
 # prefix => 1 (never ends a sentence) or 2 (only when a number follows)
 my %NONBREAKING_PREFIX = ();
 my $language = "en";
 my $QUIET = 0;
 my $HELP = 0;
 while (@ARGV) {
 	$_ = shift;
 	/^-l$/ && ($language = shift, next);
 	/^-q$/ && ($QUIET = 1, next);
 	/^-h$/ && ($HELP = 1, next);
 }
 if ($HELP) {
 	print "Usage ./split-sentences.perl (-l [en|de|...]) < textfile > splitfile\n";
 	exit;
 }
 if (!$QUIET) {
 	print STDERR "Sentence Splitter v3\n";
 	print STDERR "Language: $language\n";
 }
 my $prefixfile = "$mydir/nonbreaking_prefix.$language";
 #default back to English if we don't have a language-specific prefix file
 if (!(-e $prefixfile)) {
 	$prefixfile = "$mydir/nonbreaking_prefix.en";
 	print STDERR "WARNING: No known abbreviations for language '$language', attempting fall-back to English version...\n";
 	die ("ERROR: No abbreviations files found in $mydir\n") unless (-e $prefixfile);
 }
 # an unreadable prefix file is an error, not an empty prefix list
 open(PREFIX, "<:utf8", $prefixfile) or die ("ERROR: Cannot read $prefixfile: $!\n");
 while (my $item = <PREFIX>) {
 	chomp($item);
 	# several list entries carry trailing blanks; left in place they would
 	# become part of the hash key and the prefix would never match
 	$item =~ s/^\s+//;
 	$item =~ s/\s+$//;
 	# skip blank lines and comments (an entry "0" is a valid prefix)
 	next if $item eq "" || substr($item,0,1) eq "#";
 	if ($item =~ /(.*)[\s]+(\#NUMERIC_ONLY\#)/) {
 		$NONBREAKING_PREFIX{$1} = 2;
 	} else {
 		$NONBREAKING_PREFIX{$item} = 1;
 	}
 }
 close(PREFIX);
 ##loop text, add lines together until we get a blank line or a <p>
 # $text accumulates the lines of the current paragraph, separated by spaces
 my $text = "";
 while(<STDIN>) {
 	# chomp, not chop: a last line without a newline keeps its final
 	# character (chop would turn a closing "<P>" into "<P" and a word
 	# into a truncated one)
 	chomp;
 	if (/^<.+>$/ || /^\s*$/) {
 		#time to process this block, we've hit a blank or <p>
 		&do_it_for($text,$_);
 		print "<P>\n" if (/^\s*$/ && $text); ##if we have text followed by <P>
 		$text = "";
 	}
 	else {
 		#append the text, with a space
 		$text .= $_. " ";
 	}
 }
 #do the leftover text
 &do_it_for($text,"") if $text;
 # Print one finished paragraph followed by the markup line that ended it.
 #   $text   - accumulated paragraph text; nothing is printed for it when false
 #   $markup - the line that closed the paragraph; echoed only when the whole
 #             line is a tag (<...>)
 sub do_it_for {
 	my($text,$markup) = @_;
 	if ($text) {
 		print &preprocess($text);
 	}
 	if ($markup =~ /^<.+>$/) {
 		print "$markup\n";
 	}
 }
 # Split one paragraph into sentences.
 #   $text - the paragraph as a single string
 # Returns the paragraph with a newline after every detected sentence end,
 # always terminated by a newline. Consults the file-level
 # %NONBREAKING_PREFIX table (1 = never break after this word, 2 = do not
 # break when a number follows).
 sub preprocess {
 	#this is one paragraph
 	my($text) = @_;
 	# clean up spaces at head and tail of each line as well as any double-spacing
 	# This must come after $text is unpacked: ahead of the declaration the
 	# name refers to the file-level accumulator, the argument stays dirty,
 	# and a doubled space then yields an empty token in the split below
 	# that hides the word following a period.
 	$text =~ s/ +/ /g;
 	$text =~ s/\n /\n/g;
 	$text =~ s/ \n/\n/g;
 	$text =~ s/^ //g;
 	$text =~ s/ $//g;
 	#####add sentence breaks as needed#####
 	#non-period end of sentence markers (?!) followed by sentence starters.
 	$text =~ s/([?!]) +([\'\"\(\[\¿\¡\p{IsPi}]*[\p{IsUpper}])/$1\n$2/g;
 	#multi-dots followed by sentence starters
 	$text =~ s/(\.[\.]+) +([\'\"\(\[\¿\¡\p{IsPi}]*[\p{IsUpper}])/$1\n$2/g;
 	# add breaks for sentences that end with some sort of punctuation inside a quote or parenthetical and are followed by a possible sentence starter punctuation and upper case
 	$text =~ s/([?!\.][\ ]*[\'\"\)\]\p{IsPf}]+) +([\'\"\(\[\¿\¡\p{IsPi}]*[\ ]*[\p{IsUpper}])/$1\n$2/g;
 	# add breaks for sentences that end with some sort of punctuation are followed by a sentence starter punctuation and upper case
 	$text =~ s/([?!\.]) +([\'\"\(\[\¿\¡\p{IsPi}]+[\ ]*[\p{IsUpper}])/$1\n$2/g;
 	# special punctuation cases are covered. Check all remaining periods.
 	my $i;
 	my @words = split(/ /,$text);
 	$text = "";
 	for ($i=0;$i<(scalar(@words)-1);$i++) {
 		if ($words[$i] =~ /([\p{IsAlnum}\.\-]*)([\'\"\)\]\%\p{IsPf}]*)(\.+)$/) {
 			#check if $1 is a known honorific and $2 is empty, never break
 			my $prefix = $1;
 			my $starting_punct = $2;
 			if($prefix && $NONBREAKING_PREFIX{$prefix} && $NONBREAKING_PREFIX{$prefix} == 1 && !$starting_punct) {
 				#not breaking;
 			} elsif ($words[$i] =~ /(\.)[\p{IsUpper}\-]+(\.+)$/) {
 				#not breaking - upper case acronym
 			} elsif($words[$i+1] =~ /^([ ]*[\'\"\(\[\¿\¡\p{IsPi}]*[ ]*[\p{IsUpper}0-9])/) {
 				#the next word has a bunch of initial quotes, maybe a space, then either upper case or a number
 				$words[$i] = $words[$i]."\n" unless ($prefix && $NONBREAKING_PREFIX{$prefix} && $NONBREAKING_PREFIX{$prefix} == 2 && !$starting_punct && ($words[$i+1] =~ /^[0-9]+/));
 				#we always add a return for these unless we have a numeric non-breaker and a number start
 			}
 		}
 		$text = $text.$words[$i]." ";
 	}
 	#we stopped one token from the end to allow for easy look-ahead. Append it now.
 	# (there is no such token when the paragraph consisted of blanks only)
 	$text = $text.$words[$i] if $i < scalar(@words);
 	# clean up spaces at head and tail of each line as well as any double-spacing
 	$text =~ s/ +/ /g;
 	$text =~ s/\n /\n/g;
 	$text =~ s/ \n/\n/g;
 	$text =~ s/^ //g;
 	$text =~ s/ $//g;
 	#add trailing break
 	$text .= "\n" unless $text =~ /\n$/;
 	return $text;
 }
--- a/mgiza-aligner/europarl/tools/tokenizer.perl
+++ b/mgiza-aligner/europarl/tools/tokenizer.perl
@ -0,0 +1,167 @@
 #!/usr/bin/perl -w
 # Sample Tokenizer
 # written by Josh Schroeder, based on code by Philipp Koehn
 # Reads text on STDIN and prints it tokenized, one input line per output line.
 # Options: -l LANG (default "en"), -q (quiet), -h (print usage and exit).
 binmode(STDIN, ":utf8");
 binmode(STDOUT, ":utf8");
 # the diagnostics below echo the -l argument, so STDERR gets the same
 # encoding layer as in split-sentences.perl
 binmode(STDERR, ":utf8");
 use FindBin qw($Bin);
 use strict;
 #use Time::HiRes;
 # the prefix lists live next to this script
 my $mydir = "$Bin/nonbreaking_prefixes";
 # prefix => 1 (period always stays attached) or 2 (only before a number)
 my %NONBREAKING_PREFIX = ();
 my $language = "en";
 my $QUIET = 0;
 my $HELP = 0;
 #my $start = [ Time::HiRes::gettimeofday( ) ];
 while (@ARGV) {
 	$_ = shift;
 	/^-l$/ && ($language = shift, next);
 	/^-q$/ && ($QUIET = 1, next);
 	/^-h$/ && ($HELP = 1, next);
 }
 if ($HELP) {
 	print "Usage ./tokenizer.perl (-l [en|de|...]) < textfile > tokenizedfile\n";
 	exit;
 }
 if (!$QUIET) {
 	print STDERR "Tokenizer v3\n";
 	print STDERR "Language: $language\n";
 }
 load_prefixes($language,\%NONBREAKING_PREFIX);
 # a hash in boolean context is false exactly when it is empty
 if (!%NONBREAKING_PREFIX){
 	print STDERR "Warning: No known abbreviations for language '$language'\n";
 }
 while(<STDIN>) {
 	if (/^<.+>$/ || /^\s*$/) {
 		#don't try to tokenize XML/HTML tag lines
 		print $_;
 	}
 	else {
 		print &tokenize($_);
 	}
 }
 #my $duration = Time::HiRes::tv_interval( $start );
 #print STDERR ("EXECUTION TIME: ".$duration."\n");
 # Tokenize one line of text.
 #   $text - the input line (a trailing newline is removed first)
 # Returns the line with tokens separated by single spaces and terminated by
 # a newline. Reads the file-level $language (selects the apostrophe rules)
 # and %NONBREAKING_PREFIX (decides whether a final period stays attached).
 # The substitutions below depend on running in exactly this order.
 sub tokenize {
 	my($text) = @_;
 	chomp($text);
 	# pad with spaces so the patterns that need a neighbouring character
 	# also apply at the start and end of the line
 	$text = " $text ";
 	# separate out all "other" special characters
 	$text =~ s/([^\p{IsAlnum}\s\.\'\`\,\-])/ $1 /g;
 	#multi-dots stay together
 	# a run of dots is rewritten to a placeholder: the first dot becomes
 	# DOTMULTI, and the loop turns every further dot into an extra "DOT" prefix
 	$text =~ s/\.([\.]+)/ DOTMULTI$1/g;
 	while($text =~ /DOTMULTI\./) {
 		$text =~ s/DOTMULTI\.([^\.])/DOTDOTMULTI $1/g;
 		$text =~ s/DOTMULTI\./DOTDOTMULTI/g;
 	}
 	# separate out "," except if within numbers (5,300)
 	$text =~ s/([^\p{IsN}])[,]([^\p{IsN}])/$1 , $2/g;
 	# separate , pre and post number
 	$text =~ s/([\p{IsN}])[,]([^\p{IsN}])/$1 , $2/g;
 	$text =~ s/([^\p{IsN}])[,]([\p{IsN}])/$1 , $2/g;
 	# turn ` into '
 	$text =~ s/\`/\'/g;
 	#turn '' into "
 	$text =~ s/\'\'/ \" /g;
 	if ($language eq "en") {
 		#split contractions right
 		# (between two letters the apostrophe goes with the text after it)
 		$text =~ s/([^\p{IsAlpha}])[']([^\p{IsAlpha}])/$1 ' $2/g;
 		$text =~ s/([^\p{IsAlpha}\p{IsN}])[']([\p{IsAlpha}])/$1 ' $2/g;
 		$text =~ s/([\p{IsAlpha}])[']([^\p{IsAlpha}])/$1 ' $2/g;
 		$text =~ s/([\p{IsAlpha}])[']([\p{IsAlpha}])/$1 '$2/g;
 		#special case for "1990's"
 		$text =~ s/([\p{IsN}])[']([s])/$1 '$2/g;
 	} elsif (($language eq "fr") or ($language eq "it")) {
 		#split contractions left
 		# (between two letters the apostrophe stays with the text before it)
 		$text =~ s/([^\p{IsAlpha}])[']([^\p{IsAlpha}])/$1 ' $2/g;
 		$text =~ s/([^\p{IsAlpha}])[']([\p{IsAlpha}])/$1 ' $2/g;
 		$text =~ s/([\p{IsAlpha}])[']([^\p{IsAlpha}])/$1 ' $2/g;
 		$text =~ s/([\p{IsAlpha}])[']([\p{IsAlpha}])/$1' $2/g;
 	} else {
 		# every other language: the apostrophe is always a token of its own
 		$text =~ s/\'/ \' /g;
 	}
 	#word token method
 	# decide word by word whether a final period is split off
 	my @words = split(/\s/,$text);
 	$text = "";
 	for (my $i=0;$i<(scalar(@words));$i++) {
 		my $word = $words[$i];
 		if ( $word =~ /^(\S+)\.$/) {
 			my $pre = $1;
 			# keep the period attached when the word contains an inner period
 			# and a letter, is a type-1 prefix, or a lower-case word follows
 			if (($pre =~ /\./ && $pre =~ /\p{IsAlpha}/) || ($NONBREAKING_PREFIX{$pre} && $NONBREAKING_PREFIX{$pre}==1) || ($i<scalar(@words)-1 && ($words[$i+1] =~ /^[\p{IsLower}]/))) {
 				#no change
 			# a type-2 (numeric-only) prefix keeps its period before a number
 			} elsif (($NONBREAKING_PREFIX{$pre} && $NONBREAKING_PREFIX{$pre}==2) && ($i<scalar(@words)-1 && ($words[$i+1] =~ /^[0-9]+/))) {
 				#no change
 			} else {
 				$word = $pre." .";
 			}
 		}
 		$text .= $word." ";
 	}		
 	# clean up extraneous spaces
 	$text =~ s/ +/ /g;
 	$text =~ s/^ //g;
 	$text =~ s/ $//g;
 	#restore multi-dots
 	# each "DOT" prefix goes back to one trailing dot, then the base
 	# placeholder itself becomes a dot
 	while($text =~ /DOTDOTMULTI/) {
 		$text =~ s/DOTDOTMULTI/DOTMULTI./g;
 	}
 	$text =~ s/DOTMULTI/./g;
 	#ensure final line break
 	$text .= "\n" unless $text =~ /\n$/;
 	return $text;
 }
 # Fill a hash with the nonbreaking prefixes of one language.
 #   $language   - language code; selects $mydir/nonbreaking_prefix.<code>
 #   $PREFIX_REF - reference to the hash to fill: prefix => 1, or => 2 for
 #                 entries tagged #NUMERIC_ONLY#
 # Falls back to the English list (with a warning on STDERR) when the language
 # has no list; dies when no list can be found or read.
 sub load_prefixes {
 	my ($language, $PREFIX_REF) = @_;
 	my $prefixfile = "$mydir/nonbreaking_prefix.$language";
 	#default back to English if we don't have a language-specific prefix file
 	if (!(-e $prefixfile)) {
 		$prefixfile = "$mydir/nonbreaking_prefix.en";
 		print STDERR "WARNING: No known abbreviations for language '$language', attempting fall-back to English version...\n";
 		die ("ERROR: No abbreviations files found in $mydir\n") unless (-e $prefixfile);
 	}
 	# an unreadable prefix file is an error, not an empty prefix list
 	open(my $prefix_fh, "<:utf8", $prefixfile) or die ("ERROR: Cannot read $prefixfile: $!\n");
 	while (my $item = <$prefix_fh>) {
 		chomp($item);
 		# several list entries carry trailing blanks; left in place they
 		# would become part of the hash key and the prefix would never match
 		$item =~ s/^\s+//;
 		$item =~ s/\s+$//;
 		# skip blank lines and comments (an entry "0" is a valid prefix)
 		next if $item eq "" || substr($item,0,1) eq "#";
 		if ($item =~ /(.*)[\s]+(\#NUMERIC_ONLY\#)/) {
 			$PREFIX_REF->{$1} = 2;
 		} else {
 			$PREFIX_REF->{$item} = 1;
 		}
 	}
 	close($prefix_fh);
 }
--- a/mgiza-aligner/giza.cfg.pattern
+++ b/mgiza-aligner/giza.cfg.pattern
@ -0,0 +1,100 @@
 adbackoff 0
 compactadtable 1
 compactalignmentformat 0
 coocurrencefile corpora/CORPUS_NAME/src.low_trg.low.cooc
 corpusfile corpora/CORPUS_NAME/src.low_trg.low.snt
 countcutoff 1e-06
 countcutoffal 1e-05
 countincreasecutoff 1e-06
 countincreasecutoffal 1e-05
 countoutputprefix
 d
 deficientdistortionforemptyword 0
 depm4 76
 depm5 68
 dictionary
 dopeggingyn 0
 dumpcount 0
 dumpcountusingwordstring 0
 emalignmentdependencies 2
 emalsmooth 0.2
 emprobforempty 0.4
 emsmoothhmm 2
 hmmdumpfrequency 0
 hmmiterations 5
 log 0
 logfile corpora/CORPUS_NAME/mgiza.log
 m1 5
 m2 0
 m3 3
 m4 3
 m5 0
 m5p0 -1
 m6 0
 manlexfactor1 0
 manlexfactor2 0
 manlexmaxmultiplicity 20
 maxfertility 10
 maxsentencelength 101
 mh 5
 mincountincrease 1e-07
 ml 101
 model1dumpfrequency 1
 model1iterations 5
 model23smoothfactor 0
 model2dumpfrequency 0
 model2iterations 0
 model345dumpfrequency 0
 model3dumpfrequency 0
 model3iterations 3
 model4iterations 3
 model4smoothfactor 0.4
 model5iterations 0
 model5smoothfactor 0.1
 model6iterations 0
 nbestalignments 0
 ncpus 2
 nodumps 1
 nofiledumpsyn 1
 noiterationsmodel1 5
 noiterationsmodel2 0
 noiterationsmodel3 3
 noiterationsmodel4 3
 noiterationsmodel5 0
 noiterationsmodel6 0
 nsmooth 4
 nsmoothgeneral 0
 numberofiterationsforhmmalignmentmodel 5
 onlyaldumps 1
 outputfileprefix corpora/CORPUS_NAME/aligned
 outputpath
 p 0
 p0 0.999
 peggedcutoff 0.03
 pegging 0
 previousa
 previousd
 previousd4
 previousd42
 previoushmm
 previousn
 previousp0
 previoust
 probcutoff 1e-07
 probsmooth 1e-07
 readtableprefix
 restart 0
 sourcevocabularyfile corpora/CORPUS_NAME/src.low.vcb
 t1 1
 t2 0
 t2to3 0
 t3 0
 t345 0
 targetvocabularyfile corpora/CORPUS_NAME/trg.low.vcb
 tc
 testcorpusfile
 th 0
 transferdumpfrequency 0
 v 0
 verbose 0
 verbosesentence -10
--- a/mgiza-aligner/mgiza
+++ b/mgiza-aligner/mgiza
@ -0,0 +1 @@
 Subproject commit d643960de98565d208114780ba8025799208afa7
		`@ -0,0 +1,2 @@`
							`# for now, just include the Greek equivalent of "Mr."`
							`κ`
		`@ -0,0 +1 @@`
							`Subproject commit d643960de98565d208114780ba8025799208afa7`