TAU2019-028/preprocess-data_dev-0_test-A.sh

49 lines
1.6 KiB
Bash
Executable File

#!/bin/bash
# Preprocess dev-0 and test-A input files: tokenization and truecasing.
# Adapted from the Marian/Nematus sample preprocessing script. For a
# different language pair, change SRC/TRG, optionally the number of BPE
# operations, and the file names being processed below.
#
# NOTE(review): the upstream sample targeted Romanian-English and its
# comments referenced Romanian-specific normalization / diacritic removal;
# this copy processes cs-en, so Czech-specific normalization could be
# added here instead if needed.
#
# You may also want to learn BPE segmentations separately for each
# language, especially if the languages differ in their alphabet.

# Echo script lines as they are read (replaces '-v' in the shebang, which
# is silently dropped when the script is invoked as 'bash script.sh'),
# and abort on command or pipeline failure.
set -v
set -eo pipefail

# Suffix of source language files
SRC=cs
# Suffix of target language files
TRG=en

# Number of BPE merge operations. Network vocabulary should be slightly
# larger (to include characters), or smaller if the operations are
# learned on the joint vocabulary.
bpe_operations=85000

# Path to moses decoder scripts: https://github.com/moses-smt/mosesdecoder
# (overridable via the environment; defaults to the original location).
mosesdecoder=${mosesdecoder:-/home/jakub/TAU/TAU_24/marian-dev/examples/tools/moses-scripts}

# Path to subword segmentation scripts: https://github.com/rsennrich/subword-nmt
# (overridable via the environment; defaults to the original location).
subword_nmt=${subword_nmt:-/home/jakub/TAU/TAU_24/marian-dev/examples/tools/subword-nmt}
# Tokenize the dev/test inputs with the Moses tokenizer.
# NOTE(review): the original passed '-l tsv', but tokenizer.perl's '-l'
# option selects the tokenizer *language* and 'tsv' is not a language
# code, so language-specific rules (nonbreaking prefixes) were presumably
# not applied. Use the source-language suffix instead — confirm against
# the training-side preprocessing so both use the same tokenization.
for prefix in dev-0/in test-A/in
do
  "$mosesdecoder"/scripts/tokenizer/tokenizer.perl -a -l "$SRC" \
    < "$prefix.tsv" > "$prefix.tok.tsv"
done
# Apply the truecaser to the tokenized dev/test files, using the model
# previously trained under preprocess-train/model (tc.$SRC).
for prefix in dev-0/in test-A/in
do
  # Quote all expansions so paths survive unexpected whitespace.
  "$mosesdecoder"/scripts/recaser/truecase.perl \
    -model "preprocess-train/model/tc.$SRC" \
    < "$prefix.tok.tsv" > "$prefix.tc.$SRC"
done