This commit is contained in:
rjawor 2017-01-21 17:01:15 +01:00
parent 254e028f23
commit 35a78669a3
17 changed files with 2211 additions and 0 deletions

1
mgiza-aligner/.gitignore vendored Normal file
View File

@ -0,0 +1 @@
corpora/*

39
mgiza-aligner/Makefile Normal file
View File

@ -0,0 +1,39 @@
# Word-alignment pipeline for the corpus in corpora/$(CORPUS_NAME):
#   src.txt/trg.txt -> *.tok (tokenizer.perl) -> *.low (lowercased)
#   -> *.vcb / *.snt (plain2snt) -> *.cooc (snt2cooc) -> mgiza -> aligned.txt
SRC_LANG=en
TRG_LANG=pl
CORPUS_NAME=europarl

# "all" and "clean" name actions, not files; declaring them phony keeps them
# working even if a file called "all" or "clean" appears in this directory.
.PHONY: all clean

# Run mgiza on the prepared corpus and concatenate its per-part outputs.
all: corpora/$(CORPUS_NAME)/giza.cfg corpora/$(CORPUS_NAME)/src.low_trg.low.cooc corpora/$(CORPUS_NAME)/src.low_trg.low.snt corpora/$(CORPUS_NAME)/src.low.vcb corpora/$(CORPUS_NAME)/trg.low.vcb
	mgiza/mgizapp/bin/mgiza corpora/$(CORPUS_NAME)/giza.cfg
	cat corpora/$(CORPUS_NAME)/aligned*part* > corpora/$(CORPUS_NAME)/aligned.txt

# Remove every generated file; the src.txt/trg.txt inputs are left alone.
clean:
	rm -f corpora/$(CORPUS_NAME)/*.tok
	rm -f corpora/$(CORPUS_NAME)/*.low
	rm -f corpora/$(CORPUS_NAME)/*.classes
	rm -f corpora/$(CORPUS_NAME)/*.classes.cats
	rm -f corpora/$(CORPUS_NAME)/*.vcb
	rm -f corpora/$(CORPUS_NAME)/*.snt
	rm -f corpora/$(CORPUS_NAME)/*.cooc
	rm -f corpora/$(CORPUS_NAME)/aligned*
	rm -f corpora/$(CORPUS_NAME)/giza.cfg

# Instantiate the mgiza configuration template for this corpus.
corpora/$(CORPUS_NAME)/giza.cfg: giza.cfg.pattern
	sed 's/CORPUS_NAME/'$(CORPUS_NAME)'/' < $< > $@

# Co-occurrence file built from both vocabularies and the sentence file.
corpora/$(CORPUS_NAME)/src.low_trg.low.cooc: corpora/$(CORPUS_NAME)/src.low.vcb corpora/$(CORPUS_NAME)/trg.low.vcb corpora/$(CORPUS_NAME)/src.low_trg.low.snt
	mgiza/mgizapp/bin/snt2cooc $@ corpora/$(CORPUS_NAME)/src.low.vcb corpora/$(CORPUS_NAME)/trg.low.vcb corpora/$(CORPUS_NAME)/src.low_trg.low.snt

# A single plain2snt run is listed as producing all four of these files.
corpora/$(CORPUS_NAME)/src.low_trg.low.snt corpora/$(CORPUS_NAME)/trg.low_src.low.snt corpora/$(CORPUS_NAME)/src.low.vcb corpora/$(CORPUS_NAME)/trg.low.vcb: corpora/$(CORPUS_NAME)/src.low corpora/$(CORPUS_NAME)/trg.low
	mgiza/mgizapp/bin/plain2snt corpora/$(CORPUS_NAME)/src.low corpora/$(CORPUS_NAME)/trg.low

# Word classes (not a prerequisite of "all"; build explicitly when needed).
corpora/$(CORPUS_NAME)/%.classes: corpora/$(CORPUS_NAME)/%.low
	mgiza/mgizapp/bin/mkcls -n10 -p$< -V$@

# Lowercase the tokenized text.
corpora/$(CORPUS_NAME)/%.low: corpora/$(CORPUS_NAME)/%.tok
	tr '[:upper:]' '[:lower:]' < $< > $@

# Tokenize each side with the language-specific settings.
corpora/$(CORPUS_NAME)/src.tok: corpora/$(CORPUS_NAME)/src.txt
	europarl/tools/tokenizer.perl -l $(SRC_LANG) < $< > $@

corpora/$(CORPUS_NAME)/trg.tok: corpora/$(CORPUS_NAME)/trg.txt
	europarl/tools/tokenizer.perl -l $(TRG_LANG) < $< > $@

View File

@ -0,0 +1,58 @@
Europarl Release v3 -- Sept 27, 2007
===================================
This is a parallel corpus that was extracted from the
European Parliament web site by Philipp Koehn (University
of Edinburgh). It is fairly big, 40 million words per
language, and its main intended use is to aid
statistical machine translation research.
More information can be found at
http://www.statmt.org/europarl/
The main difference in this release vs. the first release
in 2002 and second release in 2003 is that it is larger
and it comes with improved processing tools that allow
the creation of parallel corpora between any two of the
11 languages.
Some data is now tagged with the original language the text
was spoken in.
Sentence aligner
----------------
You can create any parallel corpus with the command
./sentence-align-corpus.perl L1 L2
where L1 and L2 can be any of the 11 languages
da de el en es fi fr it nl pt sv
The output is stored in the aligned/ directory.
NOTE: To use this corpus with tools like Giza++, you want to
- lowercase the text (recommended)
- strip empty lines and their correspondences (recommended)
- tokenize words and punctuation (recommended)
- remove lines with XML-Tags (starting with "<") (required)
The sentence aligner uses the split-sentences.perl script,
which does sentence splitting. You may want to
use your own preprocessor. This requires changing an
obvious line in the sentence aligner code. A tokenizer.perl
script is included as well.
Source
------
http://www3.europarl.eu.int/omk/omnsapir.so/calendar?APP=CRE&LANGUE=EN
Copyright in the Europarl service
(c) European Communities
Except where otherwise indicated, reproduction is authorised,
provided that the source is acknowledged.
Change Log
----------
Preprocessing is improved.
This release covers 9/1996 - 10/2006.
Includes sentence aligner and tokenizer.

View File

@ -0,0 +1,253 @@
#!/usr/bin/perl -w

# Sentence-aligns the day files of two languages.
#
# Usage: sentence-align-corpus.perl L1 L2
#
# For every file found in txt/L1 that also exists in txt/L2, align() is
# called; it writes the aligned text to aligned/L1-L2/L1/<file> and
# aligned/L1-L2/L2/<file>.

use strict;
use Encode;
use File::Path qw(make_path);

binmode(STDIN, ":utf8");
binmode(STDOUT, ":utf8");
binmode(STDERR, ":utf8");

my $dir = "txt";
my $outdir = "aligned";
my $preprocessor = "tools/split-sentences.perl -q";

# Both language codes are required. Without this check an undefined code
# would let the -e tests below succeed on the bare "$dir/" directory.
die "Usage: $0 L1 L2\n" unless @ARGV >= 2;
my ($l1,$l2) = @ARGV;

die "Input directory $dir/$l1 does not exist\n" unless -e "$dir/$l1";
die "Input directory $dir/$l2 does not exist\n" unless -e "$dir/$l2";

# Create the output directories without going through a shell;
# make_path croaks if a directory cannot be created.
make_path("$outdir/$l1-$l2/$l1", "$outdir/$l1-$l2/$l2");

my ($dayfile,$s1); # globals for reporting reasons

# List-form pipe open: the directory name is passed to ls as a plain
# argument and is never interpreted by a shell.
open(LS, '-|', 'ls', "$dir/$l1") or die "Cannot list $dir/$l1: $!\n";
while($dayfile = <LS>) {
    chomp($dayfile);
    if (! -e "$dir/$l2/$dayfile") {
        print "$dayfile only for $l1, not $l2, skipping\n";
        next;
    }
    align();
}
# Aligns the current day file of both languages.
#
# Takes no arguments: it reads the file-level $dir, $outdir, $preprocessor,
# $l1, $l2 and $dayfile, and records the last source speaker ID in $s1 so
# that diagnostics can mention it.
#
# Both versions of the file are run through the sentence splitter and then
# walked in parallel. <CHAPTER ID=n> and <SPEAKER ID=n> lines act as
# synchronisation points: equal IDs are copied to both outputs, otherwise the
# side with the smaller ID (or the side not sitting on a tag) is skipped
# forward. The text between tags is split into paragraphs and, when both sides
# have the same number of paragraphs, aligned with sentence_align(), which
# prints to OUT1/OUT2.
sub align {
    # Splitter output is raw UTF-8; decode it into Perl character strings.
    my @TXT1 = map { decode_utf8($_) } `$preprocessor -l $l1 < $dir/$l1/$dayfile`;
    my @TXT2 = map { decode_utf8($_) } `$preprocessor -l $l2 < $dir/$l2/$dayfile`;

    # Fail loudly instead of silently printing to an unopened handle.
    open(OUT1, ">", "$outdir/$l1-$l2/$l1/$dayfile")
        or die "Cannot write $outdir/$l1-$l2/$l1/$dayfile: $!\n";
    open(OUT2, ">", "$outdir/$l1-$l2/$l2/$dayfile")
        or die "Cannot write $outdir/$l1-$l2/$l2/$dayfile: $!\n";
    binmode(OUT1, ":utf8");
    binmode(OUT2, ":utf8");

    my ($i1,$i2) = (0,0);
    while ($i1<scalar(@TXT1) && $i2<scalar(@TXT2)) {

        # match chapter start
        if ($TXT1[$i1] =~ /^<CHAPTER ID=\"?(\d+)\"?/) {
            my $c1 = $1;
            if ($TXT2[$i2] =~ /^<CHAPTER ID=\"?(\d+)\"?/) {
                my $c2 = $1;
                if ($c1 == $c2) {
                    print OUT1 $TXT1[$i1++];
                    print OUT2 $TXT2[$i2++];
                }
                elsif ($c1 < $c2) {
                    # side 1 has an extra chapter: jump to its next chapter tag
                    $i1 = skip(\@TXT1,$i1+1,'^<CHAPTER ID=\"?\d+\"?');
                }
                else {
                    $i2 = skip(\@TXT2,$i2+1,'^<CHAPTER ID=\"?\d+\"?');
                }
            }
            else {
                # side 2 is not at a chapter tag yet: advance it to one
                $i2 = skip(\@TXT2,$i2,'^<CHAPTER ID=\"?\d+\"?');
            }
        }

        # match speaker start
        elsif ($TXT1[$i1] =~ /^<SPEAKER ID=\"?(\d+)\"?/) {
            $s1 = $1;
            if ($TXT2[$i2] =~ /^<SPEAKER ID=\"?(\d+)\"?/) {
                my $s2 = $1;
                if ($s1 == $s2) {
                    print OUT1 $TXT1[$i1++];
                    print OUT2 $TXT2[$i2++];
                }
                elsif ($s1 < $s2) {
                    $i1 = skip(\@TXT1,$i1+1,'^<SPEAKER ID=\"?\d+\"?');
                }
                else {
                    $i2 = skip(\@TXT2,$i2+1,'^<SPEAKER ID=\"?\d+\"?');
                }
            }
            else {
                $i2 = skip(\@TXT2,$i2,'^<SPEAKER ID=\"?\d+\"?');
            }
        }

        # plain text: collect the paragraphs up to the next tag on each side
        else {
            my @P1 = extract_paragraph(\@TXT1,\$i1);
            my @P2 = extract_paragraph(\@TXT2,\$i2);
            if (scalar(@P1) != scalar(@P2)) {
                print "$dayfile (speaker $s1) different number of paragraphs ".scalar(@P1)." != ".scalar(@P2)."\n";
            }
            else {
                for(my $p=0;$p<scalar(@P1);$p++) {
                    sentence_align(\@{$P1[$p]},\@{$P2[$p]});
                }
            }
        }
    }

    # Buffered write errors only surface at close time.
    close(OUT1) or die "Error closing $outdir/$l1-$l2/$l1/$dayfile: $!\n";
    close(OUT2) or die "Error closing $outdir/$l1-$l2/$l2/$dayfile: $!\n";
}
close(LS);
# Scans forward through a list of lines until one matches a pattern.
#
# Arguments: array ref of lines, index to start from, regex source string.
# Returns the index of the first matching line at or after the start index,
# or the number of lines if none matches. The skipped range is reported on
# STDOUT together with the file-level $dayfile.
sub skip {
    my ($lines, $start, $stop_at) = @_;
    my $pos = $start;
    $pos++ until $pos >= scalar(@{$lines}) || $lines->[$pos] =~ /$stop_at/;
    print "$dayfile skipped lines $start-$pos to reach '$stop_at'\n";
    return $pos;
}
# Collects the lines from the current position up to (not including) the next
# <SPEAKER ID=...> or <CHAPTER ID=...> line and groups them into paragraphs.
#
# Arguments: array ref of lines, scalar ref to the current index. The index is
# advanced in place to the tag line that stopped the scan (or to the end).
# Returns a list of array refs; every <P> line forms a paragraph of its own
# and separates the text paragraphs around it.
sub extract_paragraph {
    my ($lines, $pos) = @_;
    my @paragraphs = ();
    my $slot = 0;
    while ($$pos < scalar(@{$lines})) {
        my $line = $lines->[$$pos];
        last if $line =~ /^<SPEAKER ID=\"?\d+\"?/;
        last if $line =~ /^<CHAPTER ID=\"?\d+\"?/;
        if ($line =~ /^<P>/) {
            # close the text paragraph in progress, if any
            $slot++ if $paragraphs[$slot];
            # the tag line gets a slot to itself
            push @{$paragraphs[$slot]}, $line;
            $slot++;
        }
        else {
            push @{$paragraphs[$slot]}, $line;
        }
        $$pos++;
    }
    return @paragraphs;
}
# this is a vanilla implementation of church and gale
#
# Aligns the sentences of one paragraph pair by length and prints the result.
#
# Arguments: $P1 and $P2 are array refs holding the sentences of the paragraph
# in each language. Both arrays are modified in place: the last character of
# every sentence is removed with chop (presumably the trailing newline left by
# the sentence splitter -- verify against the caller).
#
# Output: one line per alignment group is printed to OUT1 and to OUT2, so both
# files receive the same number of lines; sentences merged into one group are
# joined with a single space. Nothing useful is returned.
sub sentence_align {
my ($P1,$P2) = @_;
chop(@{$P1});
chop(@{$P2});
# parameters
# $PRIOR{d1}{d2} is the prior for a group of d1 sentences from $P1 and d2
# sentences from $P2: 1-1, 1-0, 0-1, 2-1 and 1-2 are allowed.
my %PRIOR;
$PRIOR{1}{1} = 0.89;
$PRIOR{1}{0} = 0.01/2;
$PRIOR{0}{1} = 0.01/2;
$PRIOR{2}{1} = 0.089/2;
$PRIOR{1}{2} = 0.089/2;
# $PRIOR{2}{2} = 0.011;
# compute length (in characters)
# $LEN1[$k] / $LEN2[$k] is the cumulative number of non-whitespace characters
# in the first $k sentences, so a span length is a difference of two entries.
my (@LEN1,@LEN2);
$LEN1[0] = 0;
for(my $i=0;$i<scalar(@{$P1});$i++) {
my $line = $$P1[$i];
$line =~ s/[\s\r\n]+//g;
# print "1: $line\n";
$LEN1[$i+1] = $LEN1[$i] + length($line);
}
$LEN2[0] = 0;
for(my $i=0;$i<scalar(@{$P2});$i++) {
my $line = $$P2[$i];
$line =~ s/[\s\r\n]+//g;
# print "2: $line\n";
$LEN2[$i+1] = $LEN2[$i] + length($line);
}
# dynamic programming
# $COST[$i1][$i2] is the cheapest cost of aligning the first $i1 sentences of
# $P1 with the first $i2 of $P2; $BACK[$i1][$i2] is the cell it was reached
# from. Each step costs -log(prior) plus the length penalty from match().
# NOTE(review): "keys %PRIOR" has no defined order and only a strictly smaller
# cost replaces the current best, so equal-cost alternatives may be resolved
# differently from run to run.
my (@COST,@BACK);
$COST[0][0] = 0;
for(my $i1=0;$i1<=scalar(@{$P1});$i1++) {
for(my $i2=0;$i2<=scalar(@{$P2});$i2++) {
next if $i1 + $i2 == 0;
$COST[$i1][$i2] = 1e10;
foreach my $d1 (keys %PRIOR) {
next if $d1>$i1;
foreach my $d2 (keys %{$PRIOR{$d1}}) {
next if $d2>$i2;
my $cost = $COST[$i1-$d1][$i2-$d2] - log($PRIOR{$d1}{$d2}) +
&match($LEN1[$i1]-$LEN1[$i1-$d1], $LEN2[$i2]-$LEN2[$i2-$d2]);
# print "($i1->".($i1-$d1).",$i2->".($i2-$d2).") [".($LEN1[$i1]-$LEN1[$i1-$d1]).",".($LEN2[$i2]-$LEN2[$i2-$d2])."] = $COST[$i1-$d1][$i2-$d2] - ".log($PRIOR{$d1}{$d2})." + ".&match($LEN1[$i1]-$LEN1[$i1-$d1], $LEN2[$i2]-$LEN2[$i2-$d2])." = $cost\n";
if ($cost < $COST[$i1][$i2]) {
$COST[$i1][$i2] = $cost;
@{$BACK[$i1][$i2]} = ($i1-$d1,$i2-$d2);
}
}
}
# print $COST[$i1][$i2]."($i1-$BACK[$i1][$i2][0],$i2-$BACK[$i1][$i2][1]) ";
}
# print "\n";
}
# back tracking
# Follow the back pointers from the final cell to (0,0), recording for every
# visited cell which cell comes next on the best path.
my (%NEXT);
my $i1 = scalar(@{$P1});
my $i2 = scalar(@{$P2});
while($i1>0 || $i2>0) {
# print "back $i1 $i2\n";
@{$NEXT{$BACK[$i1][$i2][0]}{$BACK[$i1][$i2][1]}} = ($i1,$i2);
($i1,$i2) = ($BACK[$i1][$i2][0],$BACK[$i1][$i2][1]);
}
# Replay the path forwards from (0,0); each step emits one output line per
# language, which is empty when that side contributes no sentence.
while($i1<scalar(@{$P1}) || $i2<scalar(@{$P2})) {
# print "fwd $i1 $i2\n";
for(my $i=$i1;$i<$NEXT{$i1}{$i2}[0];$i++) {
print OUT1 " " unless $i == $i1;
print OUT1 $$P1[$i];
}
print OUT1 "\n";
for(my $i=$i2;$i<$NEXT{$i1}{$i2}[1];$i++) {
print OUT2 " " unless $i == $i2;
print OUT2 $$P2[$i];
}
print OUT2 "\n";
($i1,$i2) = @{$NEXT{$i1}{$i2}};
}
}
# Length-based cost of pairing a span of $len1 characters with a span of
# $len2 characters.
#
# Returns 0 when both spans are empty. Otherwise the length difference is
# normalised by sqrt(6.8 * mean length), turned into a two-sided tail
# probability with pnorm(), and the negative log of that probability is
# returned; 25 is returned when the probability is not positive.
sub match {
    my ($len1,$len2) = @_;
    return 0 if $len1==0 && $len2==0;

    my $ratio = 1;        # expected characters of side 2 per character of side 1
    my $variance = 6.8;

    my $expected = ($len1 + $len2/$ratio) / 2;
    my $delta = ($ratio * $len1 - $len2)/sqrt($variance * $expected);
    $delta = -$delta if $delta < 0;

    my $tail = 2 * (1 - pnorm($delta));
    return $tail > 0 ? -log($tail) : 25;
}
# Polynomial approximation of the standard normal cumulative distribution.
#
# Takes $z (callers in this file pass a non-negative value) and returns
# 1 - 0.3989423 * exp(-z^2/2) * poly(t) with t = 1/(1 + 0.2316419*z), where
# poly is a fifth-degree polynomial without constant term, evaluated by
# Horner's rule.
sub pnorm {
    my ($z) = @_;
    my $t = 1/(1 + 0.2316419 * $z);

    # Horner evaluation, highest-order coefficient first.
    my $poly = 1.330274429;
    foreach my $coefficient (-1.821255978, 1.781477937, -0.356563782, 0.319381530) {
        $poly = $poly * $t + $coefficient;
    }
    $poly = $poly * $t;

    my $density = 0.3989423 * exp(-$z * $z / 2);
    return 1 - $density * $poly;
}

View File

@ -0,0 +1,73 @@
Europarl v3 Preprocessing Tools
===============================
written by Philipp Koehn and Josh Schroeder
Sentence Splitter
=================
Usage ./split-sentences.perl -l [en|de|...] < textfile > splitfile
Uses punctuation and Capitalization clues to split paragraphs of
sentences into files with one sentence per line. For example:
This is a paragraph. It contains several sentences. "But why," you ask?
goes to:
This is a paragraph.
It contains several sentences.
"But why," you ask?
See more information in the Nonbreaking Prefixes section.
Tokenizer
=========
Usage ./tokenizer.perl -l [en|de|...] < textfile > tokenizedfile
Splits out most punctuation from words. Special cases where splits
do not occur are documented in the code.
This E.U. treaty is, to use the words of Mr. Smith, "awesome."
goes to:
This E.U. treaty is , to use the words of Mr. Smith , " awesome . "
Like the sentence splitter, it makes use of the nonbreaking_prefixes
directory.
Nonbreaking Prefixes Directory
==============================
Nonbreaking prefixes are loosely defined as any word ending in a
period that does NOT indicate an end of sentence marker. A basic
example is Mr. and Ms. in English.
The sentence splitter and tokenizer included with this release
both use the nonbreaking prefix files included in this directory.
To add a file for other languages, follow the naming convention
nonbreaking_prefix.?? and use the two-letter language code you
intend to use when calling split-sentences.perl and tokenizer.perl.
Both split-sentences and tokenizer will first look for a file for the
language they are processing, and fall back to English if a file
for that language is not found. If the nonbreaking_prefixes directory does
not exist at the same location as the split-sentences.perl and tokenizer.perl
files, they will not run.
For the splitter, normally a period followed by an uppercase word
results in a sentence split. If the word preceding the period
is a nonbreaking prefix, this line break is not inserted.
For the tokenizer, a nonbreaking prefix is not separated from its
period with a space.
A special case of prefixes, NUMERIC_ONLY, is included for special
cases where the prefix should be handled ONLY when before numbers.
For example, "Article No. 24 states this." the No. is a nonbreaking
prefix. However, in "No. It is not true." No functions as a word.
See the example prefix files included here for more examples.

View File

@ -0,0 +1,325 @@
#Anything in this file, followed by a period (and an upper-case word), does NOT indicate an end-of-sentence marker.
#Special cases are included for prefixes that ONLY appear before 0-9 numbers.
#any single upper case letter followed by a period is not a sentence ender (excluding I occasionally, but we leave it in)
#usually upper case letters are initials in a name
#no german words end in single lower-case letters, so we throw those in too.
A
B
C
D
E
F
G
H
I
J
K
L
M
N
O
P
Q
R
S
T
U
V
W
X
Y
Z
a
b
c
d
e
f
g
h
i
j
k
l
m
n
o
p
q
r
s
t
u
v
w
x
y
z
#Roman Numerals. A dot after one of these is not a sentence break in German.
I
II
III
IV
V
VI
VII
VIII
IX
X
XI
XII
XIII
XIV
XV
XVI
XVII
XVIII
XIX
XX
i
ii
iii
iv
v
vi
vii
viii
ix
x
xi
xii
xiii
xiv
xv
xvi
xvii
xviii
xix
xx
#Titles and Honorifics
Adj
Adm
Adv
Asst
Bart
Bldg
Brig
Bros
Capt
Cmdr
Col
Comdr
Con
Corp
Cpl
DR
Dr
Ens
Gen
Gov
Hon
Hosp
Insp
Lt
MM
MR
MRS
MS
Maj
Messrs
Mlle
Mme
Mr
Mrs
Ms
Msgr
Op
Ord
Pfc
Ph
Prof
Pvt
Rep
Reps
Res
Rev
Rt
Sen
Sens
Sfc
Sgt
Sr
St
Supt
Surg
#Misc symbols
Mio
Mrd
bzw
v
vs
usw
d.h
z.B
u.a
etc
Mrd
MwSt
ggf
d.J
D.h
m.E
vgl
I.F
z.T
sogen
ff
u.E
g.U
g.g.A
c.-à-d
Buchst
u.s.w
sog
u.ä
Std
evtl
Zt
Chr
u.U
o.ä
Ltd
b.A
z.Zt
spp
sen
SA
k.o
jun
i.H.v
dgl
dergl
Co
zzt
usf
s.p.a
Dkr
Corp
bzgl
BSE
#Number indicators
# add #NUMERIC_ONLY# after the word if it should ONLY be non-breaking when a 0-9 digit follows it
No
Nos
Art
Nr
pp
ca
Ca
#Ordinals are done with . in German - "1." = "1st" in English
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99

View File

@ -0,0 +1,2 @@
# for now, just include the Greek equivalent of "Mr."
κ

View File

@ -0,0 +1,107 @@
#Anything in this file, followed by a period (and an upper-case word), does NOT indicate an end-of-sentence marker.
#Special cases are included for prefixes that ONLY appear before 0-9 numbers.
#any single upper case letter followed by a period is not a sentence ender (excluding I occasionally, but we leave it in)
#usually upper case letters are initials in a name
A
B
C
D
E
F
G
H
I
J
K
L
M
N
O
P
Q
R
S
T
U
V
W
X
Y
Z
#List of titles. These are often followed by upper-case names, but do not indicate sentence breaks
Adj
Adm
Adv
Asst
Bart
Bldg
Brig
Bros
Capt
Cmdr
Col
Comdr
Con
Corp
Cpl
DR
Dr
Drs
Ens
Gen
Gov
Hon
Hr
Hosp
Insp
Lt
MM
MR
MRS
MS
Maj
Messrs
Mlle
Mme
Mr
Mrs
Ms
Msgr
Op
Ord
Pfc
Ph
Prof
Pvt
Rep
Reps
Res
Rev
Rt
Sen
Sens
Sfc
Sgt
Sr
St
Supt
Surg
#misc - odd period-ending items that NEVER indicate breaks (p.m. does NOT fall into this category - it sometimes ends a sentence)
v
vs
i.e
rev
e.g
#Numbers only. These should only induce breaks when followed by a numeric sequence
# add NUMERIC_ONLY after the word for this function
#This case is mostly for the english "No." which can either be a sentence of its own, or
#if followed by a number, a non-breaking prefix
No #NUMERIC_ONLY#
Nos
Art #NUMERIC_ONLY#
Nr
pp #NUMERIC_ONLY#

View File

@ -0,0 +1,246 @@
#Anything in this file, followed by a period (and an upper-case word), does NOT indicate an end-of-sentence marker.
#Special cases are included for prefixes that ONLY appear before 0-9 numbers.
#any single upper case letter followed by a period is not a sentence ender (excluding I occasionally, but we leave it in)
#usually upper case letters are initials in a name
A
B
C
D
E
F
G
H
I
J
K
L
M
N
O
P
Q
R
S
T
U
V
W
X
Y
Z
#Abbreviations
a.c
aa.rr
abrev
adj
adm
admón
afma
afmas
afmo
afmos
ag
am
ap
apdo
art
arts
arz
arzbpo
assn
atte
av
avda
bros
bv
cap
caps
cg
cgo
cia
cit
cl
cm
co
col
corp
cos
cta
cte
ctra
cts
cía
cía
d.c
dcha
dept
depto
dg
dl
dm
doc
docs
dpt
dpto
dr
dra
dras
dres
dto
dupdo
ed
ee.uu
ej
emma
emmas
emmo
emmos
entlo
entpo
esp
etc
ex
excm
excma
excmas
excmo
excmos
fasc
fdo
fig
figs
fil
fol
fra
gr
grs
gral
ha
hnos
hros
hz
ib
ibid
ibíd
id
ilm
ilma
ilmas
ilmo
ilmos
iltre
inc
intr
izq
izqda
izqdo
jr
kc
kcal
kg
khz
kl
km
kw
lda
ldo
lib
lic
lim
loc
ltd
ltda
lám
ma
mg
mhz
min
mm
mons
mr
mrs
ms
mss
mtro
máx
mín
ntra
ntro
núm
ob
obpo
op
pd
ph
pje
pl
plc
pm
pp
ppal
pral
prof
prov
pról
ps
pta
ptas
pte
pts
pza
pág
págs
párr
rda
rdo
ref
reg
rel
rev
revda
revdo
rma
rmo
rte
s
sa
sdad
sec
secret
seg
sg
sig
smo
sr
sra
sras
sres
srs
srta
ss.mm
sta
sto
sust
tech
tel
telf
teléf
ten
tfono
tlf
t.v.e
tít
ud
uds
vda
vdo
vid
vol
vols
vra
vro
vta
íd
ít

View File

@ -0,0 +1,153 @@
#Anything in this file, followed by a period (and an upper-case word), does NOT indicate an end-of-sentence marker.
#Special cases are included for prefixes that ONLY appear before 0-9 numbers.
#any single upper case letter followed by a period is not a sentence ender
#usually upper case letters are initials in a name
#no French words end in single lower-case letters, so we throw those in too?
A
B
C
D
E
F
G
H
I
J
K
L
M
N
O
P
Q
R
S
T
U
V
W
X
Y
Z
a
b
c
d
e
f
g
h
i
j
k
l
m
n
o
p
q
r
s
t
u
v
w
x
y
z
# Period-final abbreviation list for French
A.C.N
A.M
art
ann
apr
av
auj
lib
B.P
boul
ca
c.-à-d
cf
ch.-l
chap
contr
C.P.I
C.Q.F.D
C.N
C.N.S
C.S
dir
éd
e.g
env
al
etc
E.V
ex
fasc
fém
fig
fr
hab
ibid
id
i.e
inf
LL.AA
LL.AA.II
LL.AA.RR
LL.AA.SS
L.D
LL.EE
LL.MM
LL.MM.II.RR
loc.cit
masc
MM
ms
N.B
N.D.A
N.D.L.R
N.D.T
n/réf
NN.SS
N.S
N.D
N.P.A.I
p.c.c
pl
pp
p.ex
p.j
P.S
R.A.S
R.-V
R.P
R.I.P
SS
S.S
S.A
S.A.I
S.A.R
S.A.S
S.E
sec
sect
sing
S.M
S.M.I.R
sq
sqq
suiv
sup
suppl
tél
T.S.V.P
vb
vol
vs
X.O
Z.I

View File

@ -0,0 +1,134 @@
#Anything in this file, followed by a period (and an upper-case word), does NOT indicate an end-of-sentence marker.
#Special cases are included for prefixes that ONLY appear before 0-9 numbers.
#any single upper case letter followed by a period is not a sentence ender
#usually upper case letters are initials in a name
#no Italian words end in single lower-case letters, so we throw those in too?
A
B
C
D
E
F
G
H
I
J
K
L
M
N
O
P
Q
R
S
T
U
V
W
X
Y
Z
a
b
c
d
e
f
g
h
i
j
k
l
m
n
o
p
q
r
s
t
u
v
w
x
y
z
# Period-final abbreviation list from http://www.chass.utoronto.ca/~ngargano/corsi/corrisp/abbreviazioni.html
a.c
es
all
Amn
Arch
Avv
Bcc
c.a
C.A.P
Cc
banc
post
c.c.p
c.m
Co
c.p
C.P
corr
c.s
c.v
Dott
Dr
ecc
Egr
e.p.c
fatt
Geom
gg
Id
Ing
int
lett
Mo
Mons
N.B
ogg
on
pp
p.c
p.c
p.c.c
p.es
p.f
p.r
P.S
p.v
P.T
Prof
racc
Rag
Rev
ric
Rif
RP
RSVP
S.A
acc
S.B.F
seg
sgg
ss
Sig
Sigg
s.n.c
Soc
S.p.A
Spett
S.P.M
S.r.l
tel
u.s
V.P
v.r
v.s

View File

@ -0,0 +1,115 @@
#Anything in this file, followed by a period (and an upper-case word), does NOT indicate an end-of-sentence marker.
#Special cases are included for prefixes that ONLY appear before 0-9 numbers.
#Sources: http://nl.wikipedia.org/wiki/Lijst_van_afkortingen
# http://nl.wikipedia.org/wiki/Aanspreekvorm
# http://nl.wikipedia.org/wiki/Titulatuur_in_het_Nederlands_hoger_onderwijs
#any single upper case letter followed by a period is not a sentence ender (excluding I occasionally, but we leave it in)
#usually upper case letters are initials in a name
A
B
C
D
E
F
G
H
I
J
K
L
M
N
O
P
Q
R
S
T
U
V
W
X
Y
Z
#List of titles. These are often followed by upper-case names, but do not indicate sentence breaks
bacc
bc
bgen
c.i
dhr
dr
dr.h.c
drs
drs
ds
eint
fa
Fa
fam
gen
genm
ing
ir
jhr
jkvr
jr
kand
kol
lgen
lkol
Lt
maj
Mej
mevr
Mme
mr
mr
Mw
o.b.s
plv
prof
ritm
tint
Vz
Z.D
Z.D.H
Z.E
Z.Em
Z.H
Z.K.H
Z.K.M
Z.M
z.v
#misc - odd period-ending items that NEVER indicate breaks (p.m. does NOT fall into this category - it sometimes ends a sentence)
#we seem to have a lot of these in dutch i.e.: i.p.v - in plaats van (in stead of) never ends a sentence
a.g.v
bijv
bijz
bv
d.w.z
e.c
e.g
e.k
ev
i.p.v
i.s.m
i.t.t
i.v.m
m.a.w
m.b.t
m.b.v
m.h.o
m.i
m.i.v
v.w.t
#Numbers only. These should only induce breaks when followed by a numeric sequence
# add NUMERIC_ONLY after the word for this function
#This case is mostly for the english "No." which can either be a sentence of its own, or
#if followed by a number, a non-breaking prefix
Nr #NUMERIC_ONLY#
Nrs
nrs
nr #NUMERIC_ONLY#

View File

@ -0,0 +1,285 @@
adw
afr
akad
al
Al
am
amer
arch
art
Art
artyst
astr
austr
bałt
bdb
bm
br
bryg
bryt
centr
ces
chem
chiń
chir
c.k
c.o
cyg
cyw
cyt
czes
czw
cd
Cd
czyt
ćw
ćwicz
daw
dcn
dekl
demokr
det
diec
dn
dot
dol
dop
dost
dosł
h.c
ds
dst
duszp
dypl
egz
ekol
ekon
elektr
em
ew
fab
farm
fot
fr
gat
gastr
geogr
geol
gimn
głęb
gm
godz
górn
gosp
gr
gram
hist
hiszp
hr
Hr
hot
id
in
im
iron
jn
kard
kat
katol
k.k
kk
kol
kl
k.p.a
kpc
k.p.c
kpt
kr
k.r
krak
k.r.o
kryt
kult
laic
łac
niem
woj
nb
np
Nb
Np
pol
pow
m.in
pt
ps
Pt
Ps
cdn
jw
ryc
rys
Ryc
Rys
tj
tzw
Tzw
tzn
zob
ang
ub
ul
pw
pn
pl
al
k
n
nr #NUMERIC_ONLY#
Nr #NUMERIC_ONLY#
ww
ur
zm
żyd
żarg
żyw
wył
bp
bp
wyst
tow
Tow
o
sp
Sp
st
spółdz
Spółdz
społ
spółgł
stoł
stow
Stoł
Stow
zn
zew
zewn
zdr
zazw
zast
zaw
zał
zal
zam
zak
zakł
zagr
zach
adw
Adw
lek
Lek
med
mec
Mec
doc
Doc
dyw
dyr
Dyw
Dyr
inż
Inż
mgr
Mgr
dh
dr
Dh
Dr
p
P
red
Red
prof
prok
Prof
Prok
hab
płk
Płk
nadkom
Nadkom
podkom
Podkom
ks
Ks
gen
Gen
por
Por
reż
Reż
przyp
Przyp
śp
św
śW
Śp
Św
ŚW
szer
Szer
pkt #NUMERIC_ONLY#
str #NUMERIC_ONLY#
tab #NUMERIC_ONLY#
Tab #NUMERIC_ONLY#
tel
ust #NUMERIC_ONLY#
par #NUMERIC_ONLY#
poz
pok
oo
oO
Oo
OO
r #NUMERIC_ONLY#
l #NUMERIC_ONLY#
s #NUMERIC_ONLY#
najśw
Najśw
A
B
C
D
E
F
G
H
I
J
K
L
M
N
O
P
Q
R
S
T
U
V
W
X
Y
Z
Ś
Ć
Ż
Ź
Dz
Contact GitHub API Training Shop Blog About

View File

@ -0,0 +1,152 @@
#!/usr/bin/perl -w

# Based on Preprocessor written by Philipp Koehn
#
# Sentence splitter: reads text from STDIN and prints it with one sentence
# per line on STDOUT. Blank lines and lines consisting of a single <...> tag
# end the current paragraph.
#
# Options: -l LANG  language of the nonbreaking-prefix list (default "en")
#          -q       do not print the banner on STDERR
#          -h       print usage and exit

binmode(STDIN, ":utf8");
binmode(STDOUT, ":utf8");
binmode(STDERR, ":utf8");

use FindBin qw($Bin);
use strict;

my $mydir = "$Bin/nonbreaking_prefixes";

# prefix => 1 (never breaks) or 2 (does not break only before a number)
my %NONBREAKING_PREFIX = ();
my $language = "en";
my $QUIET = 0;
my $HELP = 0;

while (@ARGV) {
    $_ = shift;
    /^-l$/ && ($language = shift, next);
    /^-q$/ && ($QUIET = 1, next);
    /^-h$/ && ($HELP = 1, next);
}

if ($HELP) {
    print "Usage ./split-sentences.perl (-l [en|de|...]) < textfile > splitfile\n";
    exit;
}
if (!$QUIET) {
    print STDERR "Sentence Splitter v3\n";
    print STDERR "Language: $language\n";
}

my $prefixfile = "$mydir/nonbreaking_prefix.$language";

#default back to English if we don't have a language-specific prefix file
if (!(-e $prefixfile)) {
    $prefixfile = "$mydir/nonbreaking_prefix.en";
    print STDERR "WARNING: No known abbreviations for language '$language', attempting fall-back to English version...\n";
    die ("ERROR: No abbreviations files found in $mydir\n") unless (-e $prefixfile);
}

# The file is known to exist here; refuse to continue if it cannot be read,
# because an empty prefix table would silently change the splitting.
open(my $prefix_fh, "<:utf8", $prefixfile)
    or die ("ERROR: Cannot read $prefixfile: $!\n");
while (my $item = <$prefix_fh>) {
    chomp($item);
    if (($item) && (substr($item,0,1) ne "#")) {
        if ($item =~ /(.*)[\s]+(\#NUMERIC_ONLY\#)/) {
            $NONBREAKING_PREFIX{$1} = 2;
        } else {
            $NONBREAKING_PREFIX{$item} = 1;
        }
    }
}
close($prefix_fh);

##loop text, add lines together until we get a blank line or a <p>
my $text = "";
while(<STDIN>) {
    # chomp, not chop: a final line without a newline must keep its last character
    chomp;
    if (/^<.+>$/ || /^\s*$/) {
        #time to process this block, we've hit a blank or <p>
        &do_it_for($text,$_);
        print "<P>\n" if (/^\s*$/ && $text); ##if we have text followed by <P>
        $text = "";
    }
    else {
        #append the text, with a space
        $text .= $_. " ";
    }
}
#do the leftover text
&do_it_for($text,"") if $text;
# Emits one paragraph and the markup line that ended it.
#
# Arguments: the accumulated paragraph text and the line that closed it.
# A non-empty paragraph is printed after sentence splitting; the closing
# line is echoed only when it is a single <...> tag.
sub do_it_for {
    my ($paragraph, $closing_line) = @_;
    if ($paragraph) {
        print preprocess($paragraph);
    }
    if ($closing_line =~ /^<.+>$/) {
        print "$closing_line\n";
    }
}
# Splits one paragraph into sentences.
#
# Argument: the paragraph as a single string with words separated by spaces.
# Returns the same text with a newline inserted after every detected sentence
# end; the result always ends with a newline.
#
# Sentence ends are detected in two passes: a set of substitutions for
# punctuation patterns that are unambiguous, then a word-by-word check of the
# remaining periods against the file-level %NONBREAKING_PREFIX table
# (1 = never break after this prefix, 2 = do not break when a number follows).
sub preprocess {
    #this is one paragraph
    my($text) = @_;

    # clean up spaces at head and tail of each line as well as any double-spacing
    # (must run on the argument: a double space left in place would yield an
    # empty token below and hide the sentence break that follows it)
    $text =~ s/ +/ /g;
    $text =~ s/\n /\n/g;
    $text =~ s/ \n/\n/g;
    $text =~ s/^ //g;
    $text =~ s/ $//g;

    #####add sentence breaks as needed#####

    #non-period end of sentence markers (?!) followed by sentence starters.
    $text =~ s/([?!]) +([\'\"\(\[\¿\¡\p{IsPi}]*[\p{IsUpper}])/$1\n$2/g;

    #multi-dots followed by sentence starters
    $text =~ s/(\.[\.]+) +([\'\"\(\[\¿\¡\p{IsPi}]*[\p{IsUpper}])/$1\n$2/g;

    # add breaks for sentences that end with some sort of punctuation inside a quote or parenthetical and are followed by a possible sentence starter punctuation and upper case
    $text =~ s/([?!\.][\ ]*[\'\"\)\]\p{IsPf}]+) +([\'\"\(\[\¿\¡\p{IsPi}]*[\ ]*[\p{IsUpper}])/$1\n$2/g;

    # add breaks for sentences that end with some sort of punctuation are followed by a sentence starter punctuation and upper case
    $text =~ s/([?!\.]) +([\'\"\(\[\¿\¡\p{IsPi}]+[\ ]*[\p{IsUpper}])/$1\n$2/g;

    # special punctuation cases are covered. Check all remaining periods.
    my $i;
    my @words = split(/ /,$text);
    $text = "";
    for ($i=0;$i<(scalar(@words)-1);$i++) {
        if ($words[$i] =~ /([\p{IsAlnum}\.\-]*)([\'\"\)\]\%\p{IsPf}]*)(\.+)$/) {
            #check if $1 is a known honorific and $2 is empty, never break
            my $prefix = $1;
            my $starting_punct = $2;
            if($prefix && $NONBREAKING_PREFIX{$prefix} && $NONBREAKING_PREFIX{$prefix} == 1 && !$starting_punct) {
                #not breaking;
            } elsif ($words[$i] =~ /(\.)[\p{IsUpper}\-]+(\.+)$/) {
                #not breaking - upper case acronym
            } elsif($words[$i+1] =~ /^([ ]*[\'\"\(\[\¿\¡\p{IsPi}]*[ ]*[\p{IsUpper}0-9])/) {
                #the next word has a bunch of initial quotes, maybe a space, then either upper case or a number
                $words[$i] = $words[$i]."\n" unless ($prefix && $NONBREAKING_PREFIX{$prefix} && $NONBREAKING_PREFIX{$prefix} == 2 && !$starting_punct && ($words[$i+1] =~ /^[0-9]+/));
                #we always add a return for these unless we have a numeric non-breaker and a number start
            }
        }
        $text = $text.$words[$i]." ";
    }

    #we stopped one token from the end to allow for easy look-ahead. Append it now.
    #(nothing to append when the paragraph contained no words at all)
    $text = $text.$words[$i] if $i < scalar(@words);

    # clean up spaces at head and tail of each line as well as any double-spacing
    $text =~ s/ +/ /g;
    $text =~ s/\n /\n/g;
    $text =~ s/ \n/\n/g;
    $text =~ s/^ //g;
    $text =~ s/ $//g;

    #add trailing break
    $text .= "\n" unless $text =~ /\n$/;

    return $text;
}

View File

@ -0,0 +1,167 @@
#!/usr/bin/perl -w

# Sample Tokenizer
# written by Josh Schroeder, based on code by Philipp Koehn
#
# Reads text from STDIN and prints the tokenized text on STDOUT. Blank lines
# and lines consisting of a single <...> tag are copied through unchanged.
#
# Options: -l LANG  language of the nonbreaking-prefix list (default "en")
#          -q       do not print the banner on STDERR
#          -h       print usage and exit

binmode(STDIN, ":utf8");
binmode(STDOUT, ":utf8");

use FindBin qw($Bin);
use strict;
#use Time::HiRes;

my $mydir = "$Bin/nonbreaking_prefixes";

my %NONBREAKING_PREFIX = ();
my $language = "en";
my $QUIET = 0;
my $HELP = 0;

#my $start = [ Time::HiRes::gettimeofday( ) ];

# Minimal option parser; unrecognised arguments are ignored.
while (@ARGV) {
    $_ = shift;
    if (/^-l$/) {
        $language = shift;
    }
    elsif (/^-q$/) {
        $QUIET = 1;
    }
    elsif (/^-h$/) {
        $HELP = 1;
    }
}

if ($HELP) {
    print "Usage ./tokenizer.perl (-l [en|de|...]) < textfile > tokenizedfile\n";
    exit;
}

unless ($QUIET) {
    print STDERR "Tokenizer v3\n";
    print STDERR "Language: $language\n";
}

load_prefixes($language,\%NONBREAKING_PREFIX);

unless (%NONBREAKING_PREFIX) {
    print STDERR "Warning: No known abbreviations for language '$language'\n";
}

while (my $line = <STDIN>) {
    if ($line =~ /^<.+>$/ || $line =~ /^\s*$/) {
        # XML/HTML tag lines and blank lines pass through untouched
        print $line;
    }
    else {
        print tokenize($line);
    }
}

#my $duration = Time::HiRes::tv_interval( $start );
#print STDERR ("EXECUTION TIME: ".$duration."\n");
# Tokenizes one line of text.
#
# Argument: the line, with or without its trailing newline.
# Returns the line with punctuation separated from words by single spaces and
# a final newline. Apostrophe handling depends on the file-level $language;
# a word-final period stays attached when the file-level %NONBREAKING_PREFIX
# table, an embedded period, or a following lower-case word says it is not a
# sentence end.
sub tokenize {
    my($text) = @_;
    chomp($text);
    $text = " $text ";

    # put spaces around every character that is not alphanumeric, whitespace
    # or one of . ' ` , -
    $text =~ s/([^\p{IsAlnum}\s\.\'\`\,\-])/ $1 /g;

    # protect runs of dots by rewriting them as DOTMULTI placeholders
    $text =~ s/\.([\.]+)/ DOTMULTI$1/g;
    while($text =~ /DOTMULTI\./) {
        $text =~ s/DOTMULTI\.([^\.])/DOTDOTMULTI $1/g;
        $text =~ s/DOTMULTI\./DOTDOTMULTI/g;
    }

    # split off commas, except between two digits (5,300)
    $text =~ s/([^\p{IsN}])[,]([^\p{IsN}])/$1 , $2/g;
    # commas with a digit on one side only
    $text =~ s/([\p{IsN}])[,]([^\p{IsN}])/$1 , $2/g;
    $text =~ s/([^\p{IsN}])[,]([\p{IsN}])/$1 , $2/g;

    # backtick becomes apostrophe, doubled apostrophe becomes a double quote
    $text =~ s/\`/\'/g;
    $text =~ s/\'\'/ \" /g;

    if ($language eq "en") {
        # English: the apostrophe goes with the right-hand part (do n't -> do 'nt style splits)
        $text =~ s/([^\p{IsAlpha}])[']([^\p{IsAlpha}])/$1 ' $2/g;
        $text =~ s/([^\p{IsAlpha}\p{IsN}])[']([\p{IsAlpha}])/$1 ' $2/g;
        $text =~ s/([\p{IsAlpha}])[']([^\p{IsAlpha}])/$1 ' $2/g;
        $text =~ s/([\p{IsAlpha}])[']([\p{IsAlpha}])/$1 '$2/g;
        # digit followed by 's, as in "1990's"
        $text =~ s/([\p{IsN}])[']([s])/$1 '$2/g;
    }
    elsif ($language eq "fr" || $language eq "it") {
        # French/Italian: the apostrophe stays with the left-hand part
        $text =~ s/([^\p{IsAlpha}])[']([^\p{IsAlpha}])/$1 ' $2/g;
        $text =~ s/([^\p{IsAlpha}])[']([\p{IsAlpha}])/$1 ' $2/g;
        $text =~ s/([\p{IsAlpha}])[']([^\p{IsAlpha}])/$1 ' $2/g;
        $text =~ s/([\p{IsAlpha}])[']([\p{IsAlpha}])/$1' $2/g;
    }
    else {
        # any other language: every apostrophe is a token of its own
        $text =~ s/\'/ \' /g;
    }

    # decide, token by token, whether a final period is split off
    my @tokens = split(/\s/,$text);
    my @pieces = ();
    foreach my $idx (0 .. $#tokens) {
        my $token = $tokens[$idx];
        if ($token =~ /^(\S+)\.$/) {
            my $stem = $1;
            my $has_next = $idx < $#tokens;
            my $is_abbreviation = ($stem =~ /\./ && $stem =~ /\p{IsAlpha}/);
            my $always_prefix = ($NONBREAKING_PREFIX{$stem} && $NONBREAKING_PREFIX{$stem}==1);
            my $lower_follows = ($has_next && $tokens[$idx+1] =~ /^[\p{IsLower}]/);
            my $numeric_prefix = ($NONBREAKING_PREFIX{$stem} && $NONBREAKING_PREFIX{$stem}==2
                                  && $has_next && $tokens[$idx+1] =~ /^[0-9]+/);
            unless ($is_abbreviation || $always_prefix || $lower_follows || $numeric_prefix) {
                $token = $stem." .";
            }
        }
        push @pieces, $token." ";
    }
    $text = join("", @pieces);

    # collapse repeated spaces and trim both ends
    $text =~ s/ +/ /g;
    $text =~ s/^ //g;
    $text =~ s/ $//g;

    # turn the placeholders back into runs of dots
    while($text =~ /DOTDOTMULTI/) {
        $text =~ s/DOTDOTMULTI/DOTMULTI./g;
    }
    $text =~ s/DOTMULTI/./g;

    # the result always ends with a newline
    $text .= "\n" if $text !~ /\n$/;

    return $text;
}
# Loads the nonbreaking-prefix list for a language into a hash.
#
# Arguments: the language code and a hash ref to fill.
# The file $mydir/nonbreaking_prefix.<language> is read; if it does not exist
# the English list is used instead (with a warning on STDERR). Lines starting
# with "#" and empty lines are ignored. An entry followed by whitespace and
# "#NUMERIC_ONLY#" is stored with value 2, every other entry with value 1.
#
# Dies when neither list exists or when the chosen list cannot be opened.
sub load_prefixes {
    my ($language, $PREFIX_REF) = @_;

    my $prefixfile = "$mydir/nonbreaking_prefix.$language";

    #default back to English if we don't have a language-specific prefix file
    if (!(-e $prefixfile)) {
        $prefixfile = "$mydir/nonbreaking_prefix.en";
        print STDERR "WARNING: No known abbreviations for language '$language', attempting fall-back to English version...\n";
        die ("ERROR: No abbreviations files found in $mydir\n") unless (-e $prefixfile);
    }

    # The file exists at this point; an unreadable file must not silently
    # produce an empty prefix table.
    open(my $prefix_fh, "<:utf8", $prefixfile)
        or die ("ERROR: Cannot read $prefixfile: $!\n");
    while (my $item = <$prefix_fh>) {
        chomp($item);
        next unless (($item) && (substr($item,0,1) ne "#"));
        if ($item =~ /(.*)[\s]+(\#NUMERIC_ONLY\#)/) {
            $PREFIX_REF->{$1} = 2;
        } else {
            $PREFIX_REF->{$item} = 1;
        }
    }
    close($prefix_fh);
}

View File

@ -0,0 +1,100 @@
adbackoff 0
compactadtable 1
compactalignmentformat 0
coocurrencefile corpora/CORPUS_NAME/src.low_trg.low.cooc
corpusfile corpora/CORPUS_NAME/src.low_trg.low.snt
countcutoff 1e-06
countcutoffal 1e-05
countincreasecutoff 1e-06
countincreasecutoffal 1e-05
countoutputprefix
d
deficientdistortionforemptyword 0
depm4 76
depm5 68
dictionary
dopeggingyn 0
dumpcount 0
dumpcountusingwordstring 0
emalignmentdependencies 2
emalsmooth 0.2
emprobforempty 0.4
emsmoothhmm 2
hmmdumpfrequency 0
hmmiterations 5
log 0
logfile corpora/CORPUS_NAME/mgiza.log
m1 5
m2 0
m3 3
m4 3
m5 0
m5p0 -1
m6 0
manlexfactor1 0
manlexfactor2 0
manlexmaxmultiplicity 20
maxfertility 10
maxsentencelength 101
mh 5
mincountincrease 1e-07
ml 101
model1dumpfrequency 1
model1iterations 5
model23smoothfactor 0
model2dumpfrequency 0
model2iterations 0
model345dumpfrequency 0
model3dumpfrequency 0
model3iterations 3
model4iterations 3
model4smoothfactor 0.4
model5iterations 0
model5smoothfactor 0.1
model6iterations 0
nbestalignments 0
ncpus 2
nodumps 1
nofiledumpsyn 1
noiterationsmodel1 5
noiterationsmodel2 0
noiterationsmodel3 3
noiterationsmodel4 3
noiterationsmodel5 0
noiterationsmodel6 0
nsmooth 4
nsmoothgeneral 0
numberofiterationsforhmmalignmentmodel 5
onlyaldumps 1
outputfileprefix corpora/CORPUS_NAME/aligned
outputpath
p 0
p0 0.999
peggedcutoff 0.03
pegging 0
previousa
previousd
previousd4
previousd42
previoushmm
previousn
previousp0
previoust
probcutoff 1e-07
probsmooth 1e-07
readtableprefix
restart 0
sourcevocabularyfile corpora/CORPUS_NAME/src.low.vcb
t1 1
t2 0
t2to3 0
t3 0
t345 0
targetvocabularyfile corpora/CORPUS_NAME/trg.low.vcb
tc
testcorpusfile
th 0
transferdumpfrequency 0
v 0
verbose 0
verbosesentence -10

1
mgiza-aligner/mgiza Submodule

@ -0,0 +1 @@
Subproject commit d643960de98565d208114780ba8025799208afa7