mgiza

parent 254e028f23
commit 35a78669a3

1 mgiza-aligner/.gitignore (vendored) Normal file
@@ -0,0 +1 @@
corpora/*
39 mgiza-aligner/Makefile Normal file
@@ -0,0 +1,39 @@
SRC_LANG=en
TRG_LANG=pl
CORPUS_NAME=europarl

all: corpora/$(CORPUS_NAME)/giza.cfg corpora/$(CORPUS_NAME)/src.low_trg.low.cooc corpora/$(CORPUS_NAME)/src.low_trg.low.snt corpora/$(CORPUS_NAME)/src.low.vcb corpora/$(CORPUS_NAME)/trg.low.vcb
	mgiza/mgizapp/bin/mgiza corpora/$(CORPUS_NAME)/giza.cfg
	cat corpora/$(CORPUS_NAME)/aligned*part* > corpora/$(CORPUS_NAME)/aligned.txt

clean:
	rm -f corpora/$(CORPUS_NAME)/*.tok
	rm -f corpora/$(CORPUS_NAME)/*.low
	rm -f corpora/$(CORPUS_NAME)/*.classes
	rm -f corpora/$(CORPUS_NAME)/*.classes.cats
	rm -f corpora/$(CORPUS_NAME)/*.vcb
	rm -f corpora/$(CORPUS_NAME)/*.snt
	rm -f corpora/$(CORPUS_NAME)/*.cooc
	rm -f corpora/$(CORPUS_NAME)/aligned*
	rm -f corpora/$(CORPUS_NAME)/giza.cfg

corpora/$(CORPUS_NAME)/giza.cfg: giza.cfg.pattern
	sed 's/CORPUS_NAME/'$(CORPUS_NAME)'/' < $< > $@

corpora/$(CORPUS_NAME)/src.low_trg.low.cooc: corpora/$(CORPUS_NAME)/src.low.vcb corpora/$(CORPUS_NAME)/trg.low.vcb corpora/$(CORPUS_NAME)/src.low_trg.low.snt
	mgiza/mgizapp/bin/snt2cooc $@ corpora/$(CORPUS_NAME)/src.low.vcb corpora/$(CORPUS_NAME)/trg.low.vcb corpora/$(CORPUS_NAME)/src.low_trg.low.snt

corpora/$(CORPUS_NAME)/src.low_trg.low.snt corpora/$(CORPUS_NAME)/trg.low_src.low.snt corpora/$(CORPUS_NAME)/src.low.vcb corpora/$(CORPUS_NAME)/trg.low.vcb: corpora/$(CORPUS_NAME)/src.low corpora/$(CORPUS_NAME)/trg.low
	mgiza/mgizapp/bin/plain2snt corpora/$(CORPUS_NAME)/src.low corpora/$(CORPUS_NAME)/trg.low

corpora/$(CORPUS_NAME)/%.classes: corpora/$(CORPUS_NAME)/%.low
	mgiza/mgizapp/bin/mkcls -n10 -p$< -V$@

corpora/$(CORPUS_NAME)/%.low: corpora/$(CORPUS_NAME)/%.tok
	tr '[:upper:]' '[:lower:]' < $< > $@

corpora/$(CORPUS_NAME)/src.tok: corpora/$(CORPUS_NAME)/src.txt
	europarl/tools/tokenizer.perl -l $(SRC_LANG) < $< > $@

corpora/$(CORPUS_NAME)/trg.tok: corpora/$(CORPUS_NAME)/trg.txt
	europarl/tools/tokenizer.perl -l $(TRG_LANG) < $< > $@
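A minimal usage sketch for the Makefile above, assuming the raw parallel text has already been placed where the src.tok/trg.tok rules expect it; the paths are illustrative and the variables shown simply restate the defaults:

    mkdir -p corpora/europarl
    cp /path/to/source-side.txt corpora/europarl/src.txt     # one sentence per line
    cp /path/to/target-side.txt corpora/europarl/trg.txt     # line-aligned with src.txt
    make SRC_LANG=en TRG_LANG=pl CORPUS_NAME=europarl        # variables may be overridden on the command line
    # word alignments are concatenated into corpora/europarl/aligned.txt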
58 mgiza-aligner/europarl/README Normal file
@@ -0,0 +1,58 @@
Europarl Release v3 -- Sept 27, 2007
===================================

This is a parallel corpus that was extracted from the
European Parliament web site by Philipp Koehn (University
of Edinburgh). It is fairly big, 40 million words per
language, and its main intended use is to aid
statistical machine translation research.

More information can be found at
http://www.statmt.org/europarl/

The main difference in this release vs. the first release
in 2002 and second release in 2003 is that it is larger
and it comes with improved processing tools that allow
the creation of parallel corpora between any two of the
11 languages.

Some data is now tagged with the original language the text
was spoken in.

Sentence aligner
----------------
You can create any parallel corpus with the command

./sentence-align-corpus.perl L1 L2

where L1 and L2 can be any of the 11 languages
da de el en es fi fr it nl pt sv

The output is stored in the aligned/ directory.

NOTE: To use this corpus with tools like Giza++, you want to
- lowercase the text (recommended)
- strip empty lines and their correspondences (recommended)
- tokenize words and punctuation (recommended)
- remove lines with XML-Tags (starting with "<") (required)

The sentence aligner uses the split-sentences.perl script,
which does the sentence splitting. You may want to
use your own preprocessor. This requires changing an
obvious line in the sentence aligner code. A tokenizer.perl
script is included as well.

Source
------
http://www3.europarl.eu.int/omk/omnsapir.so/calendar?APP=CRE&LANGUE=EN

Copyright in the Europarl service
(c) European Communities
Except where otherwise indicated, reproduction is authorised,
provided that the source is acknowledged.

Change Log
----------
Preprocessing is improved.
This release covers 9/1996 - 10/2006.
Includes sentence aligner and tokenizer.
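A rough sketch of the preparation the NOTE above calls for, assuming the per-day files written by the aligner (directory layout as created by sentence-align-corpus.perl below) are merged into one file per language; L1/L2 stand for the language codes used above, and dropping empty line pairs has to be done on both sides together, which is omitted here:

    cat aligned/L1-L2/L1/* | grep -v '^<' > corpora/europarl/src.txt   # remove lines with XML tags (required)
    cat aligned/L1-L2/L2/* | grep -v '^<' > corpora/europarl/trg.txt
    # tokenization and lowercasing are then handled by the Makefile rules
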
253 mgiza-aligner/europarl/sentence-align-corpus.perl Executable file
@@ -0,0 +1,253 @@
|
||||
#!/usr/bin/perl -w
|
||||
|
||||
use strict;
|
||||
use Encode;
|
||||
|
||||
binmode(STDIN, ":utf8");
|
||||
binmode(STDOUT, ":utf8");
|
||||
binmode(STDERR, ":utf8");
|
||||
|
||||
|
||||
my $dir = "txt";
|
||||
my $outdir = "aligned";
|
||||
my $preprocessor = "tools/split-sentences.perl -q";
|
||||
|
||||
my ($l1,$l2) = @ARGV;
|
||||
die unless -e "$dir/$l1";
|
||||
die unless -e "$dir/$l2";
|
||||
|
||||
`mkdir -p $outdir/$l1-$l2/$l1`;
|
||||
`mkdir -p $outdir/$l1-$l2/$l2`;
|
||||
|
||||
my ($dayfile,$s1); # globals for reporting reasons
|
||||
open(LS,"ls $dir/$l1|");
|
||||
while($dayfile = <LS>) {
|
||||
chop($dayfile);
|
||||
if (! -e "$dir/$l2/$dayfile") {
|
||||
print "$dayfile only for $l1, not $l2, skipping\n";
|
||||
next;
|
||||
}
|
||||
&align();
|
||||
}
|
||||
|
||||
sub align {
|
||||
my @TXT1native= `$preprocessor -l $l1 < $dir/$l1/$dayfile`;
|
||||
my @TXT2native = `$preprocessor -l $l2 < $dir/$l2/$dayfile`;
|
||||
my @TXT1;
|
||||
my @TXT2;
|
||||
|
||||
|
||||
#change perl encoding
|
||||
foreach my $line (@TXT1native) {
|
||||
push(@TXT1,decode_utf8($line));
|
||||
}
|
||||
foreach my $line (@TXT2native) {
|
||||
push(@TXT2,decode_utf8($line));
|
||||
}
|
||||
|
||||
open(OUT1, ">$outdir/$l1-$l2/$l1/$dayfile");
|
||||
open(OUT2, ">$outdir/$l1-$l2/$l2/$dayfile");
|
||||
|
||||
binmode(OUT1, ":utf8");
|
||||
binmode(OUT2, ":utf8");
|
||||
|
||||
|
||||
for(my $i2=0,my $i1=0; $i1<scalar(@TXT1) && $i2<scalar(@TXT2);) {
|
||||
|
||||
# match chapter start
|
||||
if ($TXT1[$i1] =~ /^<CHAPTER ID=\"?(\d+)\"?/) {
|
||||
my $c1 = $1;
|
||||
#print "CHAPTER $1\n";
|
||||
if ($TXT2[$i2] =~ /^<CHAPTER ID=\"?(\d+)\"?/) {
|
||||
my $c2 = $1;
|
||||
if ($c1 == $c2) {
|
||||
print OUT1 $TXT1[$i1++];
|
||||
print OUT2 $TXT2[$i2++];
|
||||
}
|
||||
elsif ($c1 < $c2) {
|
||||
$i1 = &skip(\@TXT1,$i1+1,'^<CHAPTER ID=\"?\d+\"?');
|
||||
}
|
||||
else {
|
||||
$i2 = &skip(\@TXT2,$i2+1,'^<CHAPTER ID=\"?\d+\"?');
|
||||
}
|
||||
}
|
||||
else {
|
||||
$i2 = &skip(\@TXT2,$i2,'^<CHAPTER ID=\"?\d+\"?');
|
||||
}
|
||||
}
|
||||
|
||||
# match speaker start
|
||||
elsif ($TXT1[$i1] =~ /^<SPEAKER ID=\"?(\d+)\"?/) {
|
||||
$s1 = $1;
|
||||
#print "SPEAKER $1\n";
|
||||
if ($TXT2[$i2] =~ /^<SPEAKER ID=\"?(\d+)\"?/) {
|
||||
my $s2 = $1;
|
||||
if ($s1 == $s2) {
|
||||
print OUT1 $TXT1[$i1++];
|
||||
print OUT2 $TXT2[$i2++];
|
||||
}
|
||||
elsif ($s1 < $s2) {
|
||||
$i1 = &skip(\@TXT1,$i1+1,'^<SPEAKER ID=\"?\d+\"?');
|
||||
}
|
||||
else {
|
||||
$i2 = &skip(\@TXT2,$i2+1,'^<SPEAKER ID=\"?\d+\"?');
|
||||
}
|
||||
}
|
||||
else {
|
||||
$i2 = &skip(\@TXT2,$i2,'^<SPEAKER ID=\"?\d+\"?');
|
||||
}
|
||||
}
|
||||
else {
|
||||
#print "processing... $i1,$i2\n";
|
||||
my @P1 = &extract_paragraph(\@TXT1,\$i1);
|
||||
my @P2 = &extract_paragraph(\@TXT2,\$i2);
|
||||
if (scalar(@P1) != scalar(@P2)) {
|
||||
print "$dayfile (speaker $s1) different number of paragraphs ".scalar(@P1)." != ".scalar(@P2)."\n";
|
||||
}
|
||||
else {
|
||||
for(my $p=0;$p<scalar(@P1);$p++) {
|
||||
&sentence_align(\@{$P1[$p]},\@{$P2[$p]});
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
close(LS);
|
||||
|
||||
sub skip {
|
||||
my ($TXT,$i,$pattern) = @_;
|
||||
my $i_old = $i;
|
||||
while($i < scalar(@{$TXT})
|
||||
&& $$TXT[$i] !~ /$pattern/) {
|
||||
$i++;
|
||||
}
|
||||
print "$dayfile skipped lines $i_old-$i to reach '$pattern'\n";
|
||||
return $i;
|
||||
}
|
||||
|
||||
sub extract_paragraph {
|
||||
my ($TXT,$i) = @_;
|
||||
my @P = ();
|
||||
my $p=0;
|
||||
for(;$$i<scalar(@{$TXT})
|
||||
&& ${$TXT}[$$i] !~ /^<SPEAKER ID=\"?\d+\"?/
|
||||
&& ${$TXT}[$$i] !~ /^<CHAPTER ID=\"?\d+\"?/;$$i++) {
|
||||
if (${$TXT}[$$i] =~ /^<P>/) {
|
||||
$p++ if $P[$p];
|
||||
# each XML tag has its own paragraph
|
||||
push @{$P[$p]}, ${$TXT}[$$i];
|
||||
$p++;
|
||||
}
|
||||
else {
|
||||
push @{$P[$p]}, ${$TXT}[$$i];
|
||||
}
|
||||
}
|
||||
return @P;
|
||||
}
|
||||
|
||||
# this is a vanilla implementation of church and gale
|
||||
sub sentence_align {
|
||||
my ($P1,$P2) = @_;
|
||||
chop(@{$P1});
|
||||
chop(@{$P2});
|
||||
|
||||
# parameters
|
||||
my %PRIOR;
|
||||
$PRIOR{1}{1} = 0.89;
|
||||
$PRIOR{1}{0} = 0.01/2;
|
||||
$PRIOR{0}{1} = 0.01/2;
|
||||
$PRIOR{2}{1} = 0.089/2;
|
||||
$PRIOR{1}{2} = 0.089/2;
|
||||
# $PRIOR{2}{2} = 0.011;
|
||||
|
||||
# compute length (in characters)
|
||||
my (@LEN1,@LEN2);
|
||||
$LEN1[0] = 0;
|
||||
for(my $i=0;$i<scalar(@{$P1});$i++) {
|
||||
my $line = $$P1[$i];
|
||||
$line =~ s/[\s\r\n]+//g;
|
||||
# print "1: $line\n";
|
||||
$LEN1[$i+1] = $LEN1[$i] + length($line);
|
||||
}
|
||||
$LEN2[0] = 0;
|
||||
for(my $i=0;$i<scalar(@{$P2});$i++) {
|
||||
my $line = $$P2[$i];
|
||||
$line =~ s/[\s\r\n]+//g;
|
||||
# print "2: $line\n";
|
||||
$LEN2[$i+1] = $LEN2[$i] + length($line);
|
||||
}
|
||||
|
||||
# dynamic programming
|
||||
my (@COST,@BACK);
|
||||
$COST[0][0] = 0;
|
||||
for(my $i1=0;$i1<=scalar(@{$P1});$i1++) {
|
||||
for(my $i2=0;$i2<=scalar(@{$P2});$i2++) {
|
||||
next if $i1 + $i2 == 0;
|
||||
$COST[$i1][$i2] = 1e10;
|
||||
foreach my $d1 (keys %PRIOR) {
|
||||
next if $d1>$i1;
|
||||
foreach my $d2 (keys %{$PRIOR{$d1}}) {
|
||||
next if $d2>$i2;
|
||||
my $cost = $COST[$i1-$d1][$i2-$d2] - log($PRIOR{$d1}{$d2}) +
|
||||
&match($LEN1[$i1]-$LEN1[$i1-$d1], $LEN2[$i2]-$LEN2[$i2-$d2]);
|
||||
# print "($i1->".($i1-$d1).",$i2->".($i2-$d2).") [".($LEN1[$i1]-$LEN1[$i1-$d1]).",".($LEN2[$i2]-$LEN2[$i2-$d2])."] = $COST[$i1-$d1][$i2-$d2] - ".log($PRIOR{$d1}{$d2})." + ".&match($LEN1[$i1]-$LEN1[$i1-$d1], $LEN2[$i2]-$LEN2[$i2-$d2])." = $cost\n";
|
||||
if ($cost < $COST[$i1][$i2]) {
|
||||
$COST[$i1][$i2] = $cost;
|
||||
@{$BACK[$i1][$i2]} = ($i1-$d1,$i2-$d2);
|
||||
}
|
||||
}
|
||||
}
|
||||
# print $COST[$i1][$i2]."($i1-$BACK[$i1][$i2][0],$i2-$BACK[$i1][$i2][1]) ";
|
||||
}
|
||||
# print "\n";
|
||||
}
|
||||
|
||||
# back tracking
|
||||
my (%NEXT);
|
||||
my $i1 = scalar(@{$P1});
|
||||
my $i2 = scalar(@{$P2});
|
||||
while($i1>0 || $i2>0) {
|
||||
# print "back $i1 $i2\n";
|
||||
@{$NEXT{$BACK[$i1][$i2][0]}{$BACK[$i1][$i2][1]}} = ($i1,$i2);
|
||||
($i1,$i2) = ($BACK[$i1][$i2][0],$BACK[$i1][$i2][1]);
|
||||
}
|
||||
while($i1<scalar(@{$P1}) || $i2<scalar(@{$P2})) {
|
||||
# print "fwd $i1 $i2\n";
|
||||
for(my $i=$i1;$i<$NEXT{$i1}{$i2}[0];$i++) {
|
||||
print OUT1 " " unless $i == $i1;
|
||||
print OUT1 $$P1[$i];
|
||||
}
|
||||
print OUT1 "\n";
|
||||
for(my $i=$i2;$i<$NEXT{$i1}{$i2}[1];$i++) {
|
||||
print OUT2 " " unless $i == $i2;
|
||||
print OUT2 $$P2[$i];
|
||||
}
|
||||
print OUT2 "\n";
|
||||
($i1,$i2) = @{$NEXT{$i1}{$i2}};
|
||||
}
|
||||
}
|
||||
|
||||
sub match {
|
||||
my ($len1,$len2) = @_;
|
||||
my $c = 1;
|
||||
my $s2 = 6.8;
|
||||
|
||||
if ($len1==0 && $len2==0) { return 0; }
|
||||
my $mean = ($len1 + $len2/$c) / 2;
|
||||
my $z = ($c * $len1 - $len2)/sqrt($s2 * $mean);
|
||||
if ($z < 0) { $z = -$z; }
|
||||
my $pd = 2 * (1 - &pnorm($z));
|
||||
if ($pd>0) { return -log($pd); }
|
||||
return 25;
|
||||
}
|
||||
|
||||
sub pnorm {
|
||||
my ($z) = @_;
|
||||
my $t = 1/(1 + 0.2316419 * $z);
|
||||
return 1 - 0.3989423 * exp(-$z * $z / 2) *
|
||||
((((1.330274429 * $t
|
||||
- 1.821255978) * $t
|
||||
+ 1.781477937) * $t
|
||||
- 0.356563782) * $t
|
||||
+ 0.319381530) * $t;
|
||||
}
|
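For reference, the score computed by match() above, combined with the bead priors set in sentence_align(), is the usual Gale & Church length-based cost. Writing l_1 and l_2 for the character lengths of the two candidate beads, the code evaluates

    \[ z = \frac{c\,l_1 - l_2}{\sqrt{s^2\,(l_1 + l_2/c)/2}}, \qquad \mathrm{match}(l_1,l_2) = -\log\bigl(2\,(1-\Phi(|z|))\bigr), \]

with c = 1 and s^2 = 6.8 as set in the code, and Phi approximated by pnorm(); the dynamic-programming loop then minimises the sum of -log PRIOR(d_1,d_2) + match over the chosen beads.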
73 mgiza-aligner/europarl/tools/README Normal file
@@ -0,0 +1,73 @@
Europarl v3 Preprocessing Tools
===============================
written by Philipp Koehn and Josh Schroeder


Sentence Splitter
=================
Usage: ./split-sentences.perl -l [en|de|...] < textfile > splitfile

Uses punctuation and capitalization clues to split paragraphs of
sentences into files with one sentence per line. For example:

This is a paragraph. It contains several sentences. "But why," you ask?

goes to:

This is a paragraph.
It contains several sentences.
"But why," you ask?

See more information in the Nonbreaking Prefixes section.


Tokenizer
=========
Usage: ./tokenizer.perl -l [en|de|...] < textfile > tokenizedfile

Splits out most punctuation from words. Special cases where splits
do not occur are documented in the code.

This E.U. treaty is, to use the words of Mr. Smith, "awesome."

goes to:

This E.U. treaty is , to use the words of Mr. Smith , " awesome . "

Like the sentence splitter, it makes use of the nonbreaking_prefixes
directory.


Nonbreaking Prefixes Directory
==============================

Nonbreaking prefixes are loosely defined as any word ending in a
period that does NOT indicate an end-of-sentence marker. A basic
example is Mr. and Ms. in English.

The sentence splitter and tokenizer included with this release
both use the nonbreaking prefix files included in this directory.

To add a file for another language, follow the naming convention
nonbreaking_prefix.?? and use the two-letter language code you
intend to use when calling split-sentences.perl and tokenizer.perl.

Both split-sentences and tokenizer will first look for a file for the
language they are processing, and fall back to English if a file
for that language is not found. If the nonbreaking_prefixes directory does
not exist at the same location as the split-sentences.perl and tokenizer.perl
files, they will not run.

For the splitter, normally a period followed by an uppercase word
results in a sentence split. If the word preceding the period
is a nonbreaking prefix, this line break is not inserted.

For the tokenizer, a nonbreaking prefix is not separated from its
period with a space.

A special case of prefixes, NUMERIC_ONLY, is included for
cases where the prefix should be handled ONLY when it appears before numbers.
For example, in "Article No. 24 states this." the "No." is a nonbreaking
prefix. However, in "No. It is not true." "No" functions as a word.

See the example prefix files included here for more examples.
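Putting the two tools together, a typical invocation (using the file names from the usage lines above) chains the splitter into the tokenizer:

    ./split-sentences.perl -l en < textfile | ./tokenizer.perl -l en > tokenizedfile
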
@@ -0,0 +1,325 @@
|
||||
#Anything in this file, followed by a period (and an upper-case word), does NOT indicate an end-of-sentence marker.
|
||||
#Special cases are included for prefixes that ONLY appear before 0-9 numbers.
|
||||
|
||||
#any single upper case letter followed by a period is not a sentence ender (excluding I occasionally, but we leave it in)
|
||||
#usually upper case letters are initials in a name
|
||||
#no german words end in single lower-case letters, so we throw those in too.
|
||||
A
|
||||
B
|
||||
C
|
||||
D
|
||||
E
|
||||
F
|
||||
G
|
||||
H
|
||||
I
|
||||
J
|
||||
K
|
||||
L
|
||||
M
|
||||
N
|
||||
O
|
||||
P
|
||||
Q
|
||||
R
|
||||
S
|
||||
T
|
||||
U
|
||||
V
|
||||
W
|
||||
X
|
||||
Y
|
||||
Z
|
||||
a
|
||||
b
|
||||
c
|
||||
d
|
||||
e
|
||||
f
|
||||
g
|
||||
h
|
||||
i
|
||||
j
|
||||
k
|
||||
l
|
||||
m
|
||||
n
|
||||
o
|
||||
p
|
||||
q
|
||||
r
|
||||
s
|
||||
t
|
||||
u
|
||||
v
|
||||
w
|
||||
x
|
||||
y
|
||||
z
|
||||
|
||||
|
||||
#Roman Numerals. A dot after one of these is not a sentence break in German.
|
||||
I
|
||||
II
|
||||
III
|
||||
IV
|
||||
V
|
||||
VI
|
||||
VII
|
||||
VIII
|
||||
IX
|
||||
X
|
||||
XI
|
||||
XII
|
||||
XIII
|
||||
XIV
|
||||
XV
|
||||
XVI
|
||||
XVII
|
||||
XVIII
|
||||
XIX
|
||||
XX
|
||||
i
|
||||
ii
|
||||
iii
|
||||
iv
|
||||
v
|
||||
vi
|
||||
vii
|
||||
viii
|
||||
ix
|
||||
x
|
||||
xi
|
||||
xii
|
||||
xiii
|
||||
xiv
|
||||
xv
|
||||
xvi
|
||||
xvii
|
||||
xviii
|
||||
xix
|
||||
xx
|
||||
|
||||
#Titles and Honorifics
|
||||
Adj
|
||||
Adm
|
||||
Adv
|
||||
Asst
|
||||
Bart
|
||||
Bldg
|
||||
Brig
|
||||
Bros
|
||||
Capt
|
||||
Cmdr
|
||||
Col
|
||||
Comdr
|
||||
Con
|
||||
Corp
|
||||
Cpl
|
||||
DR
|
||||
Dr
|
||||
Ens
|
||||
Gen
|
||||
Gov
|
||||
Hon
|
||||
Hosp
|
||||
Insp
|
||||
Lt
|
||||
MM
|
||||
MR
|
||||
MRS
|
||||
MS
|
||||
Maj
|
||||
Messrs
|
||||
Mlle
|
||||
Mme
|
||||
Mr
|
||||
Mrs
|
||||
Ms
|
||||
Msgr
|
||||
Op
|
||||
Ord
|
||||
Pfc
|
||||
Ph
|
||||
Prof
|
||||
Pvt
|
||||
Rep
|
||||
Reps
|
||||
Res
|
||||
Rev
|
||||
Rt
|
||||
Sen
|
||||
Sens
|
||||
Sfc
|
||||
Sgt
|
||||
Sr
|
||||
St
|
||||
Supt
|
||||
Surg
|
||||
|
||||
#Misc symbols
|
||||
Mio
|
||||
Mrd
|
||||
bzw
|
||||
v
|
||||
vs
|
||||
usw
|
||||
d.h
|
||||
z.B
|
||||
u.a
|
||||
etc
|
||||
Mrd
|
||||
MwSt
|
||||
ggf
|
||||
d.J
|
||||
D.h
|
||||
m.E
|
||||
vgl
|
||||
I.F
|
||||
z.T
|
||||
sogen
|
||||
ff
|
||||
u.E
|
||||
g.U
|
||||
g.g.A
|
||||
c.-à-d
|
||||
Buchst
|
||||
u.s.w
|
||||
sog
|
||||
u.ä
|
||||
Std
|
||||
evtl
|
||||
Zt
|
||||
Chr
|
||||
u.U
|
||||
o.ä
|
||||
Ltd
|
||||
b.A
|
||||
z.Zt
|
||||
spp
|
||||
sen
|
||||
SA
|
||||
k.o
|
||||
jun
|
||||
i.H.v
|
||||
dgl
|
||||
dergl
|
||||
Co
|
||||
zzt
|
||||
usf
|
||||
s.p.a
|
||||
Dkr
|
||||
Corp
|
||||
bzgl
|
||||
BSE
|
||||
|
||||
#Number indicators
|
||||
# add #NUMERIC_ONLY# after the word if it should ONLY be non-breaking when a 0-9 digit follows it
|
||||
No
|
||||
Nos
|
||||
Art
|
||||
Nr
|
||||
pp
|
||||
ca
|
||||
Ca
|
||||
|
||||
#Ordinals are done with . in German - "1." = "1st" in English
|
||||
1
|
||||
2
|
||||
3
|
||||
4
|
||||
5
|
||||
6
|
||||
7
|
||||
8
|
||||
9
|
||||
10
|
||||
11
|
||||
12
|
||||
13
|
||||
14
|
||||
15
|
||||
16
|
||||
17
|
||||
18
|
||||
19
|
||||
20
|
||||
21
|
||||
22
|
||||
23
|
||||
24
|
||||
25
|
||||
26
|
||||
27
|
||||
28
|
||||
29
|
||||
30
|
||||
31
|
||||
32
|
||||
33
|
||||
34
|
||||
35
|
||||
36
|
||||
37
|
||||
38
|
||||
39
|
||||
40
|
||||
41
|
||||
42
|
||||
43
|
||||
44
|
||||
45
|
||||
46
|
||||
47
|
||||
48
|
||||
49
|
||||
50
|
||||
51
|
||||
52
|
||||
53
|
||||
54
|
||||
55
|
||||
56
|
||||
57
|
||||
58
|
||||
59
|
||||
60
|
||||
61
|
||||
62
|
||||
63
|
||||
64
|
||||
65
|
||||
66
|
||||
67
|
||||
68
|
||||
69
|
||||
70
|
||||
71
|
||||
72
|
||||
73
|
||||
74
|
||||
75
|
||||
76
|
||||
77
|
||||
78
|
||||
79
|
||||
80
|
||||
81
|
||||
82
|
||||
83
|
||||
84
|
||||
85
|
||||
86
|
||||
87
|
||||
88
|
||||
89
|
||||
90
|
||||
91
|
||||
92
|
||||
93
|
||||
94
|
||||
95
|
||||
96
|
||||
97
|
||||
98
|
||||
99
|
@@ -0,0 +1,2 @@
|
||||
# for now, just include the Greek equivalent of "Mr."
|
||||
κ
|
@@ -0,0 +1,107 @@
|
||||
#Anything in this file, followed by a period (and an upper-case word), does NOT indicate an end-of-sentence marker.
|
||||
#Special cases are included for prefixes that ONLY appear before 0-9 numbers.
|
||||
|
||||
#any single upper case letter followed by a period is not a sentence ender (excluding I occasionally, but we leave it in)
|
||||
#usually upper case letters are initials in a name
|
||||
A
|
||||
B
|
||||
C
|
||||
D
|
||||
E
|
||||
F
|
||||
G
|
||||
H
|
||||
I
|
||||
J
|
||||
K
|
||||
L
|
||||
M
|
||||
N
|
||||
O
|
||||
P
|
||||
Q
|
||||
R
|
||||
S
|
||||
T
|
||||
U
|
||||
V
|
||||
W
|
||||
X
|
||||
Y
|
||||
Z
|
||||
|
||||
#List of titles. These are often followed by upper-case names, but do not indicate sentence breaks
|
||||
Adj
|
||||
Adm
|
||||
Adv
|
||||
Asst
|
||||
Bart
|
||||
Bldg
|
||||
Brig
|
||||
Bros
|
||||
Capt
|
||||
Cmdr
|
||||
Col
|
||||
Comdr
|
||||
Con
|
||||
Corp
|
||||
Cpl
|
||||
DR
|
||||
Dr
|
||||
Drs
|
||||
Ens
|
||||
Gen
|
||||
Gov
|
||||
Hon
|
||||
Hr
|
||||
Hosp
|
||||
Insp
|
||||
Lt
|
||||
MM
|
||||
MR
|
||||
MRS
|
||||
MS
|
||||
Maj
|
||||
Messrs
|
||||
Mlle
|
||||
Mme
|
||||
Mr
|
||||
Mrs
|
||||
Ms
|
||||
Msgr
|
||||
Op
|
||||
Ord
|
||||
Pfc
|
||||
Ph
|
||||
Prof
|
||||
Pvt
|
||||
Rep
|
||||
Reps
|
||||
Res
|
||||
Rev
|
||||
Rt
|
||||
Sen
|
||||
Sens
|
||||
Sfc
|
||||
Sgt
|
||||
Sr
|
||||
St
|
||||
Supt
|
||||
Surg
|
||||
|
||||
#misc - odd period-ending items that NEVER indicate breaks (p.m. does NOT fall into this category - it sometimes ends a sentence)
|
||||
v
|
||||
vs
|
||||
i.e
|
||||
rev
|
||||
e.g
|
||||
|
||||
#Numbers only. These should only induce breaks when followed by a numeric sequence
|
||||
# add NUMERIC_ONLY after the word for this function
|
||||
#This case is mostly for the english "No." which can either be a sentence of its own, or
|
||||
#if followed by a number, a non-breaking prefix
|
||||
No #NUMERIC_ONLY#
|
||||
Nos
|
||||
Art #NUMERIC_ONLY#
|
||||
Nr
|
||||
pp #NUMERIC_ONLY#
|
@@ -0,0 +1,246 @@
|
||||
#Anything in this file, followed by a period (and an upper-case word), does NOT indicate an end-of-sentence marker.
|
||||
#Special cases are included for prefixes that ONLY appear before 0-9 numbers.
|
||||
|
||||
#any single upper case letter followed by a period is not a sentence ender (excluding I occasionally, but we leave it in)
|
||||
#usually upper case letters are initials in a name
|
||||
A
|
||||
B
|
||||
C
|
||||
D
|
||||
E
|
||||
F
|
||||
G
|
||||
H
|
||||
I
|
||||
J
|
||||
K
|
||||
L
|
||||
M
|
||||
N
|
||||
O
|
||||
P
|
||||
Q
|
||||
R
|
||||
S
|
||||
T
|
||||
U
|
||||
V
|
||||
W
|
||||
X
|
||||
Y
|
||||
Z
|
||||
|
||||
#Abbreviations
|
||||
a.c
|
||||
aa.rr
|
||||
abrev
|
||||
adj
|
||||
adm
|
||||
admón
|
||||
afma
|
||||
afmas
|
||||
afmo
|
||||
afmos
|
||||
ag
|
||||
am
|
||||
ap
|
||||
apdo
|
||||
art
|
||||
arts
|
||||
arz
|
||||
arzbpo
|
||||
assn
|
||||
atte
|
||||
av
|
||||
avda
|
||||
bros
|
||||
bv
|
||||
cap
|
||||
caps
|
||||
cg
|
||||
cgo
|
||||
cia
|
||||
cit
|
||||
cl
|
||||
cm
|
||||
co
|
||||
col
|
||||
corp
|
||||
cos
|
||||
cta
|
||||
cte
|
||||
ctra
|
||||
cts
|
||||
cía
|
||||
cía
|
||||
d.c
|
||||
dcha
|
||||
dept
|
||||
depto
|
||||
dg
|
||||
dl
|
||||
dm
|
||||
doc
|
||||
docs
|
||||
dpt
|
||||
dpto
|
||||
dr
|
||||
dra
|
||||
dras
|
||||
dres
|
||||
dto
|
||||
dupdo
|
||||
ed
|
||||
ee.uu
|
||||
ej
|
||||
emma
|
||||
emmas
|
||||
emmo
|
||||
emmos
|
||||
entlo
|
||||
entpo
|
||||
esp
|
||||
etc
|
||||
ex
|
||||
excm
|
||||
excma
|
||||
excmas
|
||||
excmo
|
||||
excmos
|
||||
fasc
|
||||
fdo
|
||||
fig
|
||||
figs
|
||||
fil
|
||||
fol
|
||||
fra
|
||||
gr
|
||||
grs
|
||||
gral
|
||||
ha
|
||||
hnos
|
||||
hros
|
||||
hz
|
||||
ib
|
||||
ibid
|
||||
ibíd
|
||||
id
|
||||
ilm
|
||||
ilma
|
||||
ilmas
|
||||
ilmo
|
||||
ilmos
|
||||
iltre
|
||||
inc
|
||||
intr
|
||||
izq
|
||||
izqda
|
||||
izqdo
|
||||
jr
|
||||
kc
|
||||
kcal
|
||||
kg
|
||||
khz
|
||||
kl
|
||||
km
|
||||
kw
|
||||
lda
|
||||
ldo
|
||||
lib
|
||||
lic
|
||||
lim
|
||||
loc
|
||||
ltd
|
||||
ltda
|
||||
lám
|
||||
ma
|
||||
mg
|
||||
mhz
|
||||
min
|
||||
mm
|
||||
mons
|
||||
mr
|
||||
mrs
|
||||
ms
|
||||
mss
|
||||
mtro
|
||||
máx
|
||||
mín
|
||||
ntra
|
||||
ntro
|
||||
núm
|
||||
ob
|
||||
obpo
|
||||
op
|
||||
pd
|
||||
ph
|
||||
pje
|
||||
pl
|
||||
plc
|
||||
pm
|
||||
pp
|
||||
ppal
|
||||
pral
|
||||
prof
|
||||
prov
|
||||
pról
|
||||
ps
|
||||
pta
|
||||
ptas
|
||||
pte
|
||||
pts
|
||||
pza
|
||||
pág
|
||||
págs
|
||||
párr
|
||||
rda
|
||||
rdo
|
||||
ref
|
||||
reg
|
||||
rel
|
||||
rev
|
||||
revda
|
||||
revdo
|
||||
rma
|
||||
rmo
|
||||
rte
|
||||
s
|
||||
sa
|
||||
sdad
|
||||
sec
|
||||
secret
|
||||
seg
|
||||
sg
|
||||
sig
|
||||
smo
|
||||
sr
|
||||
sra
|
||||
sras
|
||||
sres
|
||||
srs
|
||||
srta
|
||||
ss.mm
|
||||
sta
|
||||
sto
|
||||
sust
|
||||
tech
|
||||
tel
|
||||
telf
|
||||
teléf
|
||||
ten
|
||||
tfono
|
||||
tlf
|
||||
t.v.e
|
||||
tít
|
||||
ud
|
||||
uds
|
||||
vda
|
||||
vdo
|
||||
vid
|
||||
vol
|
||||
vols
|
||||
vra
|
||||
vro
|
||||
vta
|
||||
íd
|
||||
ít
|
@@ -0,0 +1,153 @@
|
||||
#Anything in this file, followed by a period (and an upper-case word), does NOT indicate an end-of-sentence marker.
|
||||
#Special cases are included for prefixes that ONLY appear before 0-9 numbers.
|
||||
|
||||
#any single upper case letter followed by a period is not a sentence ender
|
||||
#usually upper case letters are initials in a name
|
||||
#no French words end in single lower-case letters, so we throw those in too?
|
||||
A
|
||||
B
|
||||
C
|
||||
D
|
||||
E
|
||||
F
|
||||
G
|
||||
H
|
||||
I
|
||||
J
|
||||
K
|
||||
L
|
||||
M
|
||||
N
|
||||
O
|
||||
P
|
||||
Q
|
||||
R
|
||||
S
|
||||
T
|
||||
U
|
||||
V
|
||||
W
|
||||
X
|
||||
Y
|
||||
Z
|
||||
a
|
||||
b
|
||||
c
|
||||
d
|
||||
e
|
||||
f
|
||||
g
|
||||
h
|
||||
i
|
||||
j
|
||||
k
|
||||
l
|
||||
m
|
||||
n
|
||||
o
|
||||
p
|
||||
q
|
||||
r
|
||||
s
|
||||
t
|
||||
u
|
||||
v
|
||||
w
|
||||
x
|
||||
y
|
||||
z
|
||||
|
||||
# Period-final abbreviation list for French
|
||||
A.C.N
|
||||
A.M
|
||||
art
|
||||
ann
|
||||
apr
|
||||
av
|
||||
auj
|
||||
lib
|
||||
B.P
|
||||
boul
|
||||
ca
|
||||
c.-à-d
|
||||
cf
|
||||
ch.-l
|
||||
chap
|
||||
contr
|
||||
C.P.I
|
||||
C.Q.F.D
|
||||
C.N
|
||||
C.N.S
|
||||
C.S
|
||||
dir
|
||||
éd
|
||||
e.g
|
||||
env
|
||||
al
|
||||
etc
|
||||
E.V
|
||||
ex
|
||||
fasc
|
||||
fém
|
||||
fig
|
||||
fr
|
||||
hab
|
||||
ibid
|
||||
id
|
||||
i.e
|
||||
inf
|
||||
LL.AA
|
||||
LL.AA.II
|
||||
LL.AA.RR
|
||||
LL.AA.SS
|
||||
L.D
|
||||
LL.EE
|
||||
LL.MM
|
||||
LL.MM.II.RR
|
||||
loc.cit
|
||||
masc
|
||||
MM
|
||||
ms
|
||||
N.B
|
||||
N.D.A
|
||||
N.D.L.R
|
||||
N.D.T
|
||||
n/réf
|
||||
NN.SS
|
||||
N.S
|
||||
N.D
|
||||
N.P.A.I
|
||||
p.c.c
|
||||
pl
|
||||
pp
|
||||
p.ex
|
||||
p.j
|
||||
P.S
|
||||
R.A.S
|
||||
R.-V
|
||||
R.P
|
||||
R.I.P
|
||||
SS
|
||||
S.S
|
||||
S.A
|
||||
S.A.I
|
||||
S.A.R
|
||||
S.A.S
|
||||
S.E
|
||||
sec
|
||||
sect
|
||||
sing
|
||||
S.M
|
||||
S.M.I.R
|
||||
sq
|
||||
sqq
|
||||
suiv
|
||||
sup
|
||||
suppl
|
||||
tél
|
||||
T.S.V.P
|
||||
vb
|
||||
vol
|
||||
vs
|
||||
X.O
|
||||
Z.I
|
@@ -0,0 +1,134 @@
|
||||
#Anything in this file, followed by a period (and an upper-case word), does NOT indicate an end-of-sentence marker.
|
||||
#Special cases are included for prefixes that ONLY appear before 0-9 numbers.
|
||||
|
||||
#any single upper case letter followed by a period is not a sentence ender
|
||||
#usually upper case letters are initials in a name
|
||||
#no Italian words end in single lower-case letters, so we throw those in too?
|
||||
A
|
||||
B
|
||||
C
|
||||
D
|
||||
E
|
||||
F
|
||||
G
|
||||
H
|
||||
I
|
||||
J
|
||||
K
|
||||
L
|
||||
M
|
||||
N
|
||||
O
|
||||
P
|
||||
Q
|
||||
R
|
||||
S
|
||||
T
|
||||
U
|
||||
V
|
||||
W
|
||||
X
|
||||
Y
|
||||
Z
|
||||
a
|
||||
b
|
||||
c
|
||||
d
|
||||
e
|
||||
f
|
||||
g
|
||||
h
|
||||
i
|
||||
j
|
||||
k
|
||||
l
|
||||
m
|
||||
n
|
||||
o
|
||||
p
|
||||
q
|
||||
r
|
||||
s
|
||||
t
|
||||
u
|
||||
v
|
||||
w
|
||||
x
|
||||
y
|
||||
z
|
||||
|
||||
# Period-final abbreviation list from http://www.chass.utoronto.ca/~ngargano/corsi/corrisp/abbreviazioni.html
|
||||
a.c
|
||||
es
|
||||
all
|
||||
Amn
|
||||
Arch
|
||||
Avv
|
||||
Bcc
|
||||
c.a
|
||||
C.A.P
|
||||
Cc
|
||||
banc
|
||||
post
|
||||
c.c.p
|
||||
c.m
|
||||
Co
|
||||
c.p
|
||||
C.P
|
||||
corr
|
||||
c.s
|
||||
c.v
|
||||
Dott
|
||||
Dr
|
||||
ecc
|
||||
Egr
|
||||
e.p.c
|
||||
fatt
|
||||
Geom
|
||||
gg
|
||||
Id
|
||||
Ing
|
||||
int
|
||||
lett
|
||||
Mo
|
||||
Mons
|
||||
N.B
|
||||
ogg
|
||||
on
|
||||
pp
|
||||
p.c
|
||||
p.c
|
||||
p.c.c
|
||||
p.es
|
||||
p.f
|
||||
p.r
|
||||
P.S
|
||||
p.v
|
||||
P.T
|
||||
Prof
|
||||
racc
|
||||
Rag
|
||||
Rev
|
||||
ric
|
||||
Rif
|
||||
RP
|
||||
RSVP
|
||||
S.A
|
||||
acc
|
||||
S.B.F
|
||||
seg
|
||||
sgg
|
||||
ss
|
||||
Sig
|
||||
Sigg
|
||||
s.n.c
|
||||
Soc
|
||||
S.p.A
|
||||
Spett
|
||||
S.P.M
|
||||
S.r.l
|
||||
tel
|
||||
u.s
|
||||
V.P
|
||||
v.r
|
||||
v.s
|
@@ -0,0 +1,115 @@
|
||||
#Anything in this file, followed by a period (and an upper-case word), does NOT indicate an end-of-sentence marker.
|
||||
#Special cases are included for prefixes that ONLY appear before 0-9 numbers.
|
||||
#Sources: http://nl.wikipedia.org/wiki/Lijst_van_afkortingen
|
||||
# http://nl.wikipedia.org/wiki/Aanspreekvorm
|
||||
# http://nl.wikipedia.org/wiki/Titulatuur_in_het_Nederlands_hoger_onderwijs
|
||||
#any single upper case letter followed by a period is not a sentence ender (excluding I occasionally, but we leave it in)
|
||||
#usually upper case letters are initials in a name
|
||||
A
|
||||
B
|
||||
C
|
||||
D
|
||||
E
|
||||
F
|
||||
G
|
||||
H
|
||||
I
|
||||
J
|
||||
K
|
||||
L
|
||||
M
|
||||
N
|
||||
O
|
||||
P
|
||||
Q
|
||||
R
|
||||
S
|
||||
T
|
||||
U
|
||||
V
|
||||
W
|
||||
X
|
||||
Y
|
||||
Z
|
||||
|
||||
#List of titles. These are often followed by upper-case names, but do not indicate sentence breaks
|
||||
bacc
|
||||
bc
|
||||
bgen
|
||||
c.i
|
||||
dhr
|
||||
dr
|
||||
dr.h.c
|
||||
drs
|
||||
drs
|
||||
ds
|
||||
eint
|
||||
fa
|
||||
Fa
|
||||
fam
|
||||
gen
|
||||
genm
|
||||
ing
|
||||
ir
|
||||
jhr
|
||||
jkvr
|
||||
jr
|
||||
kand
|
||||
kol
|
||||
lgen
|
||||
lkol
|
||||
Lt
|
||||
maj
|
||||
Mej
|
||||
mevr
|
||||
Mme
|
||||
mr
|
||||
mr
|
||||
Mw
|
||||
o.b.s
|
||||
plv
|
||||
prof
|
||||
ritm
|
||||
tint
|
||||
Vz
|
||||
Z.D
|
||||
Z.D.H
|
||||
Z.E
|
||||
Z.Em
|
||||
Z.H
|
||||
Z.K.H
|
||||
Z.K.M
|
||||
Z.M
|
||||
z.v
|
||||
|
||||
#misc - odd period-ending items that NEVER indicate breaks (p.m. does NOT fall into this category - it sometimes ends a sentence)
|
||||
#we seem to have a lot of these in dutch i.e.: i.p.v - in plaats van (in stead of) never ends a sentence
|
||||
a.g.v
|
||||
bijv
|
||||
bijz
|
||||
bv
|
||||
d.w.z
|
||||
e.c
|
||||
e.g
|
||||
e.k
|
||||
ev
|
||||
i.p.v
|
||||
i.s.m
|
||||
i.t.t
|
||||
i.v.m
|
||||
m.a.w
|
||||
m.b.t
|
||||
m.b.v
|
||||
m.h.o
|
||||
m.i
|
||||
m.i.v
|
||||
v.w.t
|
||||
|
||||
#Numbers only. These should only induce breaks when followed by a numeric sequence
|
||||
# add NUMERIC_ONLY after the word for this function
|
||||
#This case is mostly for the english "No." which can either be a sentence of its own, or
|
||||
#if followed by a number, a non-breaking prefix
|
||||
Nr #NUMERIC_ONLY#
|
||||
Nrs
|
||||
nrs
|
||||
nr #NUMERIC_ONLY#
|
@@ -0,0 +1,285 @@
|
||||
adw
|
||||
afr
|
||||
akad
|
||||
al
|
||||
Al
|
||||
am
|
||||
amer
|
||||
arch
|
||||
art
|
||||
Art
|
||||
artyst
|
||||
astr
|
||||
austr
|
||||
bałt
|
||||
bdb
|
||||
bł
|
||||
bm
|
||||
br
|
||||
bryg
|
||||
bryt
|
||||
centr
|
||||
ces
|
||||
chem
|
||||
chiń
|
||||
chir
|
||||
c.k
|
||||
c.o
|
||||
cyg
|
||||
cyw
|
||||
cyt
|
||||
czes
|
||||
czw
|
||||
cd
|
||||
Cd
|
||||
czyt
|
||||
ćw
|
||||
ćwicz
|
||||
daw
|
||||
dcn
|
||||
dekl
|
||||
demokr
|
||||
det
|
||||
diec
|
||||
dł
|
||||
dn
|
||||
dot
|
||||
dol
|
||||
dop
|
||||
dost
|
||||
dosł
|
||||
h.c
|
||||
ds
|
||||
dst
|
||||
duszp
|
||||
dypl
|
||||
egz
|
||||
ekol
|
||||
ekon
|
||||
elektr
|
||||
em
|
||||
ew
|
||||
fab
|
||||
farm
|
||||
fot
|
||||
fr
|
||||
gat
|
||||
gastr
|
||||
geogr
|
||||
geol
|
||||
gimn
|
||||
głęb
|
||||
gm
|
||||
godz
|
||||
górn
|
||||
gosp
|
||||
gr
|
||||
gram
|
||||
hist
|
||||
hiszp
|
||||
hr
|
||||
Hr
|
||||
hot
|
||||
id
|
||||
in
|
||||
im
|
||||
iron
|
||||
jn
|
||||
kard
|
||||
kat
|
||||
katol
|
||||
k.k
|
||||
kk
|
||||
kol
|
||||
kl
|
||||
k.p.a
|
||||
kpc
|
||||
k.p.c
|
||||
kpt
|
||||
kr
|
||||
k.r
|
||||
krak
|
||||
k.r.o
|
||||
kryt
|
||||
kult
|
||||
laic
|
||||
łac
|
||||
niem
|
||||
woj
|
||||
nb
|
||||
np
|
||||
Nb
|
||||
Np
|
||||
pol
|
||||
pow
|
||||
m.in
|
||||
pt
|
||||
ps
|
||||
Pt
|
||||
Ps
|
||||
cdn
|
||||
jw
|
||||
ryc
|
||||
rys
|
||||
Ryc
|
||||
Rys
|
||||
tj
|
||||
tzw
|
||||
Tzw
|
||||
tzn
|
||||
zob
|
||||
ang
|
||||
ub
|
||||
ul
|
||||
pw
|
||||
pn
|
||||
pl
|
||||
al
|
||||
k
|
||||
n
|
||||
nr #NUMERIC_ONLY#
|
||||
Nr #NUMERIC_ONLY#
|
||||
ww
|
||||
wł
|
||||
ur
|
||||
zm
|
||||
żyd
|
||||
żarg
|
||||
żyw
|
||||
wył
|
||||
bp
|
||||
bp
|
||||
wyst
|
||||
tow
|
||||
Tow
|
||||
o
|
||||
sp
|
||||
Sp
|
||||
st
|
||||
spółdz
|
||||
Spółdz
|
||||
społ
|
||||
spółgł
|
||||
stoł
|
||||
stow
|
||||
Stoł
|
||||
Stow
|
||||
zn
|
||||
zew
|
||||
zewn
|
||||
zdr
|
||||
zazw
|
||||
zast
|
||||
zaw
|
||||
zał
|
||||
zal
|
||||
zam
|
||||
zak
|
||||
zakł
|
||||
zagr
|
||||
zach
|
||||
adw
|
||||
Adw
|
||||
lek
|
||||
Lek
|
||||
med
|
||||
mec
|
||||
Mec
|
||||
doc
|
||||
Doc
|
||||
dyw
|
||||
dyr
|
||||
Dyw
|
||||
Dyr
|
||||
inż
|
||||
Inż
|
||||
mgr
|
||||
Mgr
|
||||
dh
|
||||
dr
|
||||
Dh
|
||||
Dr
|
||||
p
|
||||
P
|
||||
red
|
||||
Red
|
||||
prof
|
||||
prok
|
||||
Prof
|
||||
Prok
|
||||
hab
|
||||
płk
|
||||
Płk
|
||||
nadkom
|
||||
Nadkom
|
||||
podkom
|
||||
Podkom
|
||||
ks
|
||||
Ks
|
||||
gen
|
||||
Gen
|
||||
por
|
||||
Por
|
||||
reż
|
||||
Reż
|
||||
przyp
|
||||
Przyp
|
||||
śp
|
||||
św
|
||||
śW
|
||||
Śp
|
||||
Św
|
||||
ŚW
|
||||
szer
|
||||
Szer
|
||||
pkt #NUMERIC_ONLY#
|
||||
str #NUMERIC_ONLY#
|
||||
tab #NUMERIC_ONLY#
|
||||
Tab #NUMERIC_ONLY#
|
||||
tel
|
||||
ust #NUMERIC_ONLY#
|
||||
par #NUMERIC_ONLY#
|
||||
poz
|
||||
pok
|
||||
oo
|
||||
oO
|
||||
Oo
|
||||
OO
|
||||
r #NUMERIC_ONLY#
|
||||
l #NUMERIC_ONLY#
|
||||
s #NUMERIC_ONLY#
|
||||
najśw
|
||||
Najśw
|
||||
A
|
||||
B
|
||||
C
|
||||
D
|
||||
E
|
||||
F
|
||||
G
|
||||
H
|
||||
I
|
||||
J
|
||||
K
|
||||
L
|
||||
M
|
||||
N
|
||||
O
|
||||
P
|
||||
Q
|
||||
R
|
||||
S
|
||||
T
|
||||
U
|
||||
V
|
||||
W
|
||||
X
|
||||
Y
|
||||
Z
|
||||
Ś
|
||||
Ć
|
||||
Ż
|
||||
Ź
|
||||
Dz
|
152 mgiza-aligner/europarl/tools/split-sentences.perl Executable file
@@ -0,0 +1,152 @@
|
||||
#!/usr/bin/perl -w
|
||||
|
||||
# Based on Preprocessor written by Philipp Koehn
|
||||
|
||||
binmode(STDIN, ":utf8");
|
||||
binmode(STDOUT, ":utf8");
|
||||
binmode(STDERR, ":utf8");
|
||||
|
||||
use FindBin qw($Bin);
|
||||
use strict;
|
||||
|
||||
my $mydir = "$Bin/nonbreaking_prefixes";
|
||||
|
||||
my %NONBREAKING_PREFIX = ();
|
||||
my $language = "en";
|
||||
my $QUIET = 0;
|
||||
my $HELP = 0;
|
||||
|
||||
while (@ARGV) {
|
||||
$_ = shift;
|
||||
/^-l$/ && ($language = shift, next);
|
||||
/^-q$/ && ($QUIET = 1, next);
|
||||
/^-h$/ && ($HELP = 1, next);
|
||||
}
|
||||
|
||||
if ($HELP) {
|
||||
print "Usage ./split-sentences.perl (-l [en|de|...]) < textfile > splitfile\n";
|
||||
exit;
|
||||
}
|
||||
if (!$QUIET) {
|
||||
print STDERR "Sentence Splitter v3\n";
|
||||
print STDERR "Language: $language\n";
|
||||
}
|
||||
|
||||
my $prefixfile = "$mydir/nonbreaking_prefix.$language";
|
||||
|
||||
#default back to English if we don't have a language-specific prefix file
|
||||
if (!(-e $prefixfile)) {
|
||||
$prefixfile = "$mydir/nonbreaking_prefix.en";
|
||||
print STDERR "WARNING: No known abbreviations for language '$language', attempting fall-back to English version...\n";
|
||||
die ("ERROR: No abbreviations files found in $mydir\n") unless (-e $prefixfile);
|
||||
}
|
||||
|
||||
if (-e "$prefixfile") {
|
||||
open(PREFIX, "<:utf8", "$prefixfile");
|
||||
while (<PREFIX>) {
|
||||
my $item = $_;
|
||||
chomp($item);
|
||||
if (($item) && (substr($item,0,1) ne "#")) {
|
||||
if ($item =~ /(.*)[\s]+(\#NUMERIC_ONLY\#)/) {
|
||||
$NONBREAKING_PREFIX{$1} = 2;
|
||||
} else {
|
||||
$NONBREAKING_PREFIX{$item} = 1;
|
||||
}
|
||||
}
|
||||
}
|
||||
close(PREFIX);
|
||||
}
|
||||
|
||||
##loop text, add lines together until we get a blank line or a <p>
|
||||
my $text = "";
|
||||
while(<STDIN>) {
|
||||
chop;
|
||||
if (/^<.+>$/ || /^\s*$/) {
|
||||
#time to process this block, we've hit a blank or <p>
|
||||
&do_it_for($text,$_);
|
||||
print "<P>\n" if (/^\s*$/ && $text); ##if we have text followed by <P>
|
||||
$text = "";
|
||||
}
|
||||
else {
|
||||
#append the text, with a space
|
||||
$text .= $_. " ";
|
||||
}
|
||||
}
|
||||
#do the leftover text
|
||||
&do_it_for($text,"") if $text;
|
||||
|
||||
|
||||
sub do_it_for {
|
||||
my($text,$markup) = @_;
|
||||
print &preprocess($text) if $text;
|
||||
print "$markup\n" if ($markup =~ /^<.+>$/);
|
||||
#chop($text);
|
||||
}
|
||||
|
||||
sub preprocess {
|
||||
# clean up spaces at head and tail of each line as well as any double-spacing
|
||||
$text =~ s/ +/ /g;
|
||||
$text =~ s/\n /\n/g;
|
||||
$text =~ s/ \n/\n/g;
|
||||
$text =~ s/^ //g;
|
||||
$text =~ s/ $//g;
|
||||
|
||||
#this is one paragraph
|
||||
my($text) = @_;
|
||||
|
||||
#####add sentence breaks as needed#####
|
||||
|
||||
#non-period end of sentence markers (?!) followed by sentence starters.
|
||||
$text =~ s/([?!]) +([\'\"\(\[\¿\¡\p{IsPi}]*[\p{IsUpper}])/$1\n$2/g;
|
||||
|
||||
#multi-dots followed by sentence starters
|
||||
$text =~ s/(\.[\.]+) +([\'\"\(\[\¿\¡\p{IsPi}]*[\p{IsUpper}])/$1\n$2/g;
|
||||
|
||||
# add breaks for sentences that end with some sort of punctuation inside a quote or parenthetical and are followed by a possible sentence starter punctuation and upper case
|
||||
$text =~ s/([?!\.][\ ]*[\'\"\)\]\p{IsPf}]+) +([\'\"\(\[\¿\¡\p{IsPi}]*[\ ]*[\p{IsUpper}])/$1\n$2/g;
|
||||
|
||||
# add breaks for sentences that end with some sort of punctuation are followed by a sentence starter punctuation and upper case
|
||||
$text =~ s/([?!\.]) +([\'\"\(\[\¿\¡\p{IsPi}]+[\ ]*[\p{IsUpper}])/$1\n$2/g;
|
||||
|
||||
# special punctuation cases are covered. Check all remaining periods.
|
||||
my $word;
|
||||
my $i;
|
||||
my @words = split(/ /,$text);
|
||||
$text = "";
|
||||
for ($i=0;$i<(scalar(@words)-1);$i++) {
|
||||
if ($words[$i] =~ /([\p{IsAlnum}\.\-]*)([\'\"\)\]\%\p{IsPf}]*)(\.+)$/) {
|
||||
#check if $1 is a known honorific and $2 is empty, never break
|
||||
my $prefix = $1;
|
||||
my $starting_punct = $2;
|
||||
if($prefix && $NONBREAKING_PREFIX{$prefix} && $NONBREAKING_PREFIX{$prefix} == 1 && !$starting_punct) {
|
||||
#not breaking;
|
||||
} elsif ($words[$i] =~ /(\.)[\p{IsUpper}\-]+(\.+)$/) {
|
||||
#not breaking - upper case acronym
|
||||
} elsif($words[$i+1] =~ /^([ ]*[\'\"\(\[\¿\¡\p{IsPi}]*[ ]*[\p{IsUpper}0-9])/) {
|
||||
#the next word has a bunch of initial quotes, maybe a space, then either upper case or a number
|
||||
$words[$i] = $words[$i]."\n" unless ($prefix && $NONBREAKING_PREFIX{$prefix} && $NONBREAKING_PREFIX{$prefix} == 2 && !$starting_punct && ($words[$i+1] =~ /^[0-9]+/));
|
||||
#we always add a return for these unless we have a numeric non-breaker and a number start
|
||||
}
|
||||
|
||||
}
|
||||
$text = $text.$words[$i]." ";
|
||||
}
|
||||
|
||||
#we stopped one token from the end to allow for easy look-ahead. Append it now.
|
||||
$text = $text.$words[$i];
|
||||
|
||||
# clean up spaces at head and tail of each line as well as any double-spacing
|
||||
$text =~ s/ +/ /g;
|
||||
$text =~ s/\n /\n/g;
|
||||
$text =~ s/ \n/\n/g;
|
||||
$text =~ s/^ //g;
|
||||
$text =~ s/ $//g;
|
||||
|
||||
#add trailing break
|
||||
$text .= "\n" unless $text =~ /\n$/;
|
||||
|
||||
return $text;
|
||||
|
||||
}
|
||||
|
||||
|
167 mgiza-aligner/europarl/tools/tokenizer.perl Executable file
@@ -0,0 +1,167 @@
|
||||
#!/usr/bin/perl -w
|
||||
|
||||
# Sample Tokenizer
|
||||
# written by Josh Schroeder, based on code by Philipp Koehn
|
||||
|
||||
binmode(STDIN, ":utf8");
|
||||
binmode(STDOUT, ":utf8");
|
||||
|
||||
use FindBin qw($Bin);
|
||||
use strict;
|
||||
#use Time::HiRes;
|
||||
|
||||
my $mydir = "$Bin/nonbreaking_prefixes";
|
||||
|
||||
my %NONBREAKING_PREFIX = ();
|
||||
my $language = "en";
|
||||
my $QUIET = 0;
|
||||
my $HELP = 0;
|
||||
|
||||
#my $start = [ Time::HiRes::gettimeofday( ) ];
|
||||
|
||||
while (@ARGV) {
|
||||
$_ = shift;
|
||||
/^-l$/ && ($language = shift, next);
|
||||
/^-q$/ && ($QUIET = 1, next);
|
||||
/^-h$/ && ($HELP = 1, next);
|
||||
}
|
||||
|
||||
if ($HELP) {
|
||||
print "Usage ./tokenizer.perl (-l [en|de|...]) < textfile > tokenizedfile\n";
|
||||
exit;
|
||||
}
|
||||
if (!$QUIET) {
|
||||
print STDERR "Tokenizer v3\n";
|
||||
print STDERR "Language: $language\n";
|
||||
}
|
||||
|
||||
load_prefixes($language,\%NONBREAKING_PREFIX);
|
||||
|
||||
if (scalar(%NONBREAKING_PREFIX) eq 0){
|
||||
print STDERR "Warning: No known abbreviations for language '$language'\n";
|
||||
}
|
||||
|
||||
while(<STDIN>) {
|
||||
if (/^<.+>$/ || /^\s*$/) {
|
||||
#don't try to tokenize XML/HTML tag lines
|
||||
print $_;
|
||||
}
|
||||
else {
|
||||
print &tokenize($_);
|
||||
}
|
||||
}
|
||||
|
||||
#my $duration = Time::HiRes::tv_interval( $start );
|
||||
#print STDERR ("EXECUTION TIME: ".$duration."\n");
|
||||
|
||||
|
||||
sub tokenize {
|
||||
my($text) = @_;
|
||||
chomp($text);
|
||||
$text = " $text ";
|
||||
|
||||
# separate out all "other" special characters
|
||||
$text =~ s/([^\p{IsAlnum}\s\.\'\`\,\-])/ $1 /g;
|
||||
|
||||
#multi-dots stay together
|
||||
$text =~ s/\.([\.]+)/ DOTMULTI$1/g;
|
||||
while($text =~ /DOTMULTI\./) {
|
||||
$text =~ s/DOTMULTI\.([^\.])/DOTDOTMULTI $1/g;
|
||||
$text =~ s/DOTMULTI\./DOTDOTMULTI/g;
|
||||
}
|
||||
|
||||
# separate out "," except if within numbers (5,300)
|
||||
$text =~ s/([^\p{IsN}])[,]([^\p{IsN}])/$1 , $2/g;
|
||||
# separate , pre and post number
|
||||
$text =~ s/([\p{IsN}])[,]([^\p{IsN}])/$1 , $2/g;
|
||||
$text =~ s/([^\p{IsN}])[,]([\p{IsN}])/$1 , $2/g;
|
||||
|
||||
# turn `into '
|
||||
$text =~ s/\`/\'/g;
|
||||
|
||||
#turn '' into "
|
||||
$text =~ s/\'\'/ \" /g;
|
||||
|
||||
if ($language eq "en") {
|
||||
#split contractions right
|
||||
$text =~ s/([^\p{IsAlpha}])[']([^\p{IsAlpha}])/$1 ' $2/g;
|
||||
$text =~ s/([^\p{IsAlpha}\p{IsN}])[']([\p{IsAlpha}])/$1 ' $2/g;
|
||||
$text =~ s/([\p{IsAlpha}])[']([^\p{IsAlpha}])/$1 ' $2/g;
|
||||
$text =~ s/([\p{IsAlpha}])[']([\p{IsAlpha}])/$1 '$2/g;
|
||||
#special case for "1990's"
|
||||
$text =~ s/([\p{IsN}])[']([s])/$1 '$2/g;
|
||||
} elsif (($language eq "fr") or ($language eq "it")) {
|
||||
#split contractions left
|
||||
$text =~ s/([^\p{IsAlpha}])[']([^\p{IsAlpha}])/$1 ' $2/g;
|
||||
$text =~ s/([^\p{IsAlpha}])[']([\p{IsAlpha}])/$1 ' $2/g;
|
||||
$text =~ s/([\p{IsAlpha}])[']([^\p{IsAlpha}])/$1 ' $2/g;
|
||||
$text =~ s/([\p{IsAlpha}])[']([\p{IsAlpha}])/$1' $2/g;
|
||||
} else {
|
||||
$text =~ s/\'/ \' /g;
|
||||
}
|
||||
|
||||
#word token method
|
||||
my @words = split(/\s/,$text);
|
||||
$text = "";
|
||||
for (my $i=0;$i<(scalar(@words));$i++) {
|
||||
my $word = $words[$i];
|
||||
if ( $word =~ /^(\S+)\.$/) {
|
||||
my $pre = $1;
|
||||
if (($pre =~ /\./ && $pre =~ /\p{IsAlpha}/) || ($NONBREAKING_PREFIX{$pre} && $NONBREAKING_PREFIX{$pre}==1) || ($i<scalar(@words)-1 && ($words[$i+1] =~ /^[\p{IsLower}]/))) {
|
||||
#no change
|
||||
} elsif (($NONBREAKING_PREFIX{$pre} && $NONBREAKING_PREFIX{$pre}==2) && ($i<scalar(@words)-1 && ($words[$i+1] =~ /^[0-9]+/))) {
|
||||
#no change
|
||||
} else {
|
||||
$word = $pre." .";
|
||||
}
|
||||
}
|
||||
$text .= $word." ";
|
||||
}
|
||||
|
||||
# clean up extraneous spaces
|
||||
$text =~ s/ +/ /g;
|
||||
$text =~ s/^ //g;
|
||||
$text =~ s/ $//g;
|
||||
|
||||
#restore multi-dots
|
||||
while($text =~ /DOTDOTMULTI/) {
|
||||
$text =~ s/DOTDOTMULTI/DOTMULTI./g;
|
||||
}
|
||||
$text =~ s/DOTMULTI/./g;
|
||||
|
||||
#ensure final line break
|
||||
$text .= "\n" unless $text =~ /\n$/;
|
||||
|
||||
return $text;
|
||||
}
|
||||
|
||||
sub load_prefixes {
|
||||
my ($language, $PREFIX_REF) = @_;
|
||||
|
||||
my $prefixfile = "$mydir/nonbreaking_prefix.$language";
|
||||
|
||||
#default back to English if we don't have a language-specific prefix file
|
||||
if (!(-e $prefixfile)) {
|
||||
$prefixfile = "$mydir/nonbreaking_prefix.en";
|
||||
print STDERR "WARNING: No known abbreviations for language '$language', attempting fall-back to English version...\n";
|
||||
die ("ERROR: No abbreviations files found in $mydir\n") unless (-e $prefixfile);
|
||||
}
|
||||
|
||||
if (-e "$prefixfile") {
|
||||
open(PREFIX, "<:utf8", "$prefixfile");
|
||||
while (<PREFIX>) {
|
||||
my $item = $_;
|
||||
chomp($item);
|
||||
if (($item) && (substr($item,0,1) ne "#")) {
|
||||
if ($item =~ /(.*)[\s]+(\#NUMERIC_ONLY\#)/) {
|
||||
$PREFIX_REF->{$1} = 2;
|
||||
} else {
|
||||
$PREFIX_REF->{$item} = 1;
|
||||
}
|
||||
}
|
||||
}
|
||||
close(PREFIX);
|
||||
}
|
||||
|
||||
}
|
||||
|
100 mgiza-aligner/giza.cfg.pattern Normal file
@@ -0,0 +1,100 @@
|
||||
adbackoff 0
|
||||
compactadtable 1
|
||||
compactalignmentformat 0
|
||||
coocurrencefile corpora/CORPUS_NAME/src.low_trg.low.cooc
|
||||
corpusfile corpora/CORPUS_NAME/src.low_trg.low.snt
|
||||
countcutoff 1e-06
|
||||
countcutoffal 1e-05
|
||||
countincreasecutoff 1e-06
|
||||
countincreasecutoffal 1e-05
|
||||
countoutputprefix
|
||||
d
|
||||
deficientdistortionforemptyword 0
|
||||
depm4 76
|
||||
depm5 68
|
||||
dictionary
|
||||
dopeggingyn 0
|
||||
dumpcount 0
|
||||
dumpcountusingwordstring 0
|
||||
emalignmentdependencies 2
|
||||
emalsmooth 0.2
|
||||
emprobforempty 0.4
|
||||
emsmoothhmm 2
|
||||
hmmdumpfrequency 0
|
||||
hmmiterations 5
|
||||
log 0
|
||||
logfile corpora/CORPUS_NAME/mgiza.log
|
||||
m1 5
|
||||
m2 0
|
||||
m3 3
|
||||
m4 3
|
||||
m5 0
|
||||
m5p0 -1
|
||||
m6 0
|
||||
manlexfactor1 0
|
||||
manlexfactor2 0
|
||||
manlexmaxmultiplicity 20
|
||||
maxfertility 10
|
||||
maxsentencelength 101
|
||||
mh 5
|
||||
mincountincrease 1e-07
|
||||
ml 101
|
||||
model1dumpfrequency 1
|
||||
model1iterations 5
|
||||
model23smoothfactor 0
|
||||
model2dumpfrequency 0
|
||||
model2iterations 0
|
||||
model345dumpfrequency 0
|
||||
model3dumpfrequency 0
|
||||
model3iterations 3
|
||||
model4iterations 3
|
||||
model4smoothfactor 0.4
|
||||
model5iterations 0
|
||||
model5smoothfactor 0.1
|
||||
model6iterations 0
|
||||
nbestalignments 0
|
||||
ncpus 2
|
||||
nodumps 1
|
||||
nofiledumpsyn 1
|
||||
noiterationsmodel1 5
|
||||
noiterationsmodel2 0
|
||||
noiterationsmodel3 3
|
||||
noiterationsmodel4 3
|
||||
noiterationsmodel5 0
|
||||
noiterationsmodel6 0
|
||||
nsmooth 4
|
||||
nsmoothgeneral 0
|
||||
numberofiterationsforhmmalignmentmodel 5
|
||||
onlyaldumps 1
|
||||
outputfileprefix corpora/CORPUS_NAME/aligned
|
||||
outputpath
|
||||
p 0
|
||||
p0 0.999
|
||||
peggedcutoff 0.03
|
||||
pegging 0
|
||||
previousa
|
||||
previousd
|
||||
previousd4
|
||||
previousd42
|
||||
previoushmm
|
||||
previousn
|
||||
previousp0
|
||||
previoust
|
||||
probcutoff 1e-07
|
||||
probsmooth 1e-07
|
||||
readtableprefix
|
||||
restart 0
|
||||
sourcevocabularyfile corpora/CORPUS_NAME/src.low.vcb
|
||||
t1 1
|
||||
t2 0
|
||||
t2to3 0
|
||||
t3 0
|
||||
t345 0
|
||||
targetvocabularyfile corpora/CORPUS_NAME/trg.low.vcb
|
||||
tc
|
||||
testcorpusfile
|
||||
th 0
|
||||
transferdumpfrequency 0
|
||||
v 0
|
||||
verbose 0
|
||||
verbosesentence -10
|
1 mgiza-aligner/mgiza Submodule
@@ -0,0 +1 @@
Subproject commit d643960de98565d208114780ba8025799208afa7