#!/bin/bash
#
# Preprocesses a parallel corpus for NMT training: tokenization, cleaning of
# the training corpus, and truecasing, in preparation for subword (BPE)
# segmentation.
#
# Inputs  (relative to the current directory):
#   data/europarl.{SRC,TRG}        training corpus
#   data/valideuroparl.{SRC,TRG}   validation corpus
# Outputs:
#   data/<prefix>.tok.{SRC,TRG}    tokenized text
#   data/europarl.tok.clean.*      cleaned training corpus
#   model/tc.{SRC,TRG}             truecaser models
#   data/<prefix>.tc.{SRC,TRG}     truecased text
#
# For a different language pair, change SRC and TRG, optionally the number of
# BPE operations, and the corpus prefixes (currently europarl and
# valideuroparl).
#
# The tokenization step applies no language-specific normalization; add your
# own if the language pair needs it. You may also want to learn BPE
# segmentations separately for each language, especially if they differ in
# their alphabet.

# Abort on the first failing command, on use of an unset variable, and on a
# failure in any stage of a pipeline, so a broken step cannot silently feed
# empty files into the next one. These options stay in effect for the rest of
# the script.
set -euo pipefail

# Echo script lines as they are read. This used to be "-v" on the shebang
# line, where it was lost whenever the script was started as "bash <script>".
set -v

# Suffix of source language files
SRC=cs

# Suffix of target language files
TRG=en

# Number of merge operations. Network vocabulary should be slightly larger (to
# include characters), or smaller if the operations are learned on the joint
# vocabulary.
# NOTE(review): not referenced by the steps in this section; presumably used
# by a later BPE step - confirm.
bpe_operations=85000

# Path to the Moses scripts: https://github.com/moses-smt/mosesdecoder
# Override with the MOSESDECODER environment variable.
mosesdecoder=${MOSESDECODER:-/home/jakub/TAU/TAU_24/marian-dev/examples/tools/moses-scripts}

# Path to subword segmentation scripts: https://github.com/rsennrich/subword-nmt
# Override with the SUBWORD_NMT environment variable.
# NOTE(review): not referenced by the steps in this section; presumably used
# by a later BPE step - confirm.
subword_nmt=${SUBWORD_NMT:-/home/jakub/TAU/TAU_24/marian-dev/examples/tools/subword-nmt}

# Fail early, before any long-running step, if a required Moses script is
# missing from the configured location.
for tool in \
  tokenizer/tokenizer.perl \
  training/clean-corpus-n.perl \
  recaser/train-truecaser.perl \
  recaser/truecase.perl
do
  if [[ ! -f "$mosesdecoder/scripts/$tool" ]]; then
    printf 'error: %s not found\n' "$mosesdecoder/scripts/$tool" >&2
    exit 1
  fi
done

# The truecaser models are written to model/; make sure it exists.
mkdir -p model

# tokenize
for prefix in europarl valideuroparl; do
  for lang in "$SRC" "$TRG"; do
    "$mosesdecoder/scripts/tokenizer/tokenizer.perl" -a -l "$lang" \
      < "data/$prefix.$lang" > "data/$prefix.tok.$lang"
  done
done

# clean empty and long sentences, and sentences with high source-target ratio
# (training corpus only); keeps sentence pairs with 1 to 80 tokens
"$mosesdecoder/scripts/training/clean-corpus-n.perl" \
  data/europarl.tok "$SRC" "$TRG" data/europarl.tok.clean 1 80

# train truecaser (one model per language, on the cleaned training corpus)
for lang in "$SRC" "$TRG"; do
  "$mosesdecoder/scripts/recaser/train-truecaser.perl" \
    -corpus "data/europarl.tok.clean.$lang" -model "model/tc.$lang"
done

# apply truecaser (cleaned training corpus)
for prefix in europarl; do
  for lang in "$SRC" "$TRG"; do
    "$mosesdecoder/scripts/recaser/truecase.perl" -model "model/tc.$lang" \
      < "data/$prefix.tok.clean.$lang" > "data/$prefix.tc.$lang"
  done
done

# apply truecaser (dev/test files; these are tokenized but not cleaned)
for prefix in valideuroparl; do
  for lang in "$SRC" "$TRG"; do
    "$mosesdecoder/scripts/recaser/truecase.perl" -model "model/tc.$lang" \
      < "data/$prefix.tok.$lang" > "data/$prefix.tc.$lang"
  done
done
