TAU2019-028/train-and-translate.sh

84 lines
3.0 KiB
Bash
Raw Normal View History

#!/bin/bash
# Usage: train-and-translate.sh [GPU_ID...]
#
# Setup section: locates the Marian binaries, selects the GPUs to use and
# aborts early when the toolkit has not been built.
#
# Environment:
#   MARIAN  directory holding the compiled Marian tools
#           (optional; defaults to the path assigned below)

# Echo every script line as it is read. This is set here instead of in the
# shebang so that it also takes effect when the script is started as
# `bash train-and-translate.sh`, where shebang options are ignored.
set -v

MARIAN=${MARIAN:-/home/jakub/TAU/TAU_24/marian-dev/build}

# if we are in WSL, we need to add '.exe' to the tool names
# EXT is reset first so a stray EXT from the environment cannot alter the names.
EXT=""
if [ -e "/bin/wslpath" ]; then
  EXT=.exe
fi

MARIAN_TRAIN=$MARIAN/marian$EXT
MARIAN_DECODER=$MARIAN/marian-decoder$EXT
MARIAN_VOCAB=$MARIAN/marian-vocab$EXT
MARIAN_SCORER=$MARIAN/marian-scorer$EXT

# set chosen gpus
# Every command-line argument is taken as a device id; without arguments GPU 0
# is used. The ids are kept as ONE space-separated string because the commands
# further down expand $GPUS unquoted and rely on word splitting.
GPUS=0
if [ $# -ne 0 ]; then
  GPUS="$*"
fi
echo "Using GPUs: $GPUS"

# Nothing below can work without the trainer binary, so stop right away.
if [ ! -e "$MARIAN_TRAIN" ]; then
  echo "marian is not installed in $MARIAN, you need to compile the toolkit first" >&2
  exit 1
fi
# create vocabularies
# One vocabulary per language (cs, en), each built from the training corpus
# followed by the validation corpus. Both files are regenerated as soon as
# either of them is missing.
vocab_dir=preprocess-train/model
corpus_dir=preprocess-train/data
if ! { [ -e "$vocab_dir/vocab.cs.yml" ] && [ -e "$vocab_dir/vocab.en.yml" ]; }; then
  for lang in cs en; do
    cat "$corpus_dir/europarl.tc.$lang" "$corpus_dir/valideuroparl.tc.$lang" \
      | "$MARIAN_VOCAB" --max-size 50000 \
      > "$vocab_dir/vocab.$lang.yml"
  done
fi
# train model
# Training is skipped when the best-translation checkpoint already exists, so
# the script can be re-run to translate without retraining.
if [ ! -e "preprocess-train/model/model.npz.best-translation.npz" ]; then
  # NOTE: the last option line must NOT end with a backslash. A trailing
  # continuation there glues `fi` onto the marian command line, which leaves
  # the `if` unterminated and makes the whole script fail to parse.
  #
  # $GPUS is deliberately left unquoted: it may hold several space-separated
  # device ids that have to reach marian as separate arguments.
  # shellcheck disable=SC2086
  "$MARIAN_TRAIN" \
    --devices $GPUS \
    --type s2s \
    --early-stopping 10 -e 15 \
    --model preprocess-train/model/model.npz \
    --train-sets preprocess-train/data/europarl.tc.cs preprocess-train/data/europarl.tc.en \
    --vocabs preprocess-train/model/vocab.cs.yml preprocess-train/model/vocab.en.yml \
    --dim-vocabs 50000 50000 \
    --mini-batch-fit -w 1024 \
    --valid-freq 10000 --save-freq 10000 --disp-freq 1000 \
    --valid-mini-batch 23 --valid-max-length 100 \
    --valid-metrics cross-entropy translation \
    --valid-sets preprocess-train/data/valideuroparl.tc.cs preprocess-train/data/valideuroparl.tc.en \
    --valid-script-path "bash ./preprocess-train/scripts/validate.sh" \
    --log preprocess-train/model/train.log --valid-log preprocess-train/model/valid.log \
    --overwrite --keep-best
fi
# translate dev and test sets

# Directory with the Moses post-processing scripts. Both data sets share this
# single location; the test-set pipeline used to spell it with a doubled
# ".../examples/tools/tools/..." component, unlike the dev-set pipeline, which
# looks like a typo. Can be overridden from the environment.
MOSES_SCRIPTS=${MOSES_SCRIPTS:-/home/jakub/TAU/TAU_24/marian-dev/examples/tools/moses-scripts/scripts}

# Decoder configuration stored next to the best-translation checkpoint.
DECODER_CONFIG=preprocess-train/model/model.npz.best-translation.npz.decoder.yml

#######################################
# Translate one data set and post-process the decoder output.
# Globals:
#   MARIAN_DECODER (read)  decoder command
#   DECODER_CONFIG (read)  decoder configuration file
#   GPUS           (read)  space-separated device ids
#   MOSES_SCRIPTS  (read)  directory containing recaser/ and tokenizer/
# Arguments:
#   $1 - data set directory; input is read from "$1/in.tc.cs"
# Outputs:
#   writes "$1/in.tc.cs.output" and copies it to "$1/out.tsv"
#######################################
translate_set() {
  local set_dir=$1

  # Pipeline: decode -> delete every "@@ " marker -> detruecase -> detokenize (en).
  # $GPUS is deliberately left unquoted so that several device ids are passed
  # to the decoder as separate arguments.
  # shellcheck disable=SC2086
  "$MARIAN_DECODER" -c "$DECODER_CONFIG" -d $GPUS -b 12 -n1 \
      --mini-batch 64 --maxi-batch 10 --maxi-batch-sort src \
      < "$set_dir/in.tc.cs" \
    | sed 's/\@\@ //g' \
    | "$MOSES_SCRIPTS/recaser/detruecase.perl" \
    | "$MOSES_SCRIPTS/tokenizer/detokenizer.perl" -l en \
    > "$set_dir/in.tc.cs.output"

  cp "$set_dir/in.tc.cs.output" "$set_dir/out.tsv"
}

# translate dev set
translate_set dev-0

# translate test set
translate_set test-A

# Evaluate the dev-0 output with geval and append the result to the score log.
geval -t dev-0 >> scores_geval.txt