#!/bin/bash
#
# Train a Marian s2s model on the Europarl cs/en data under preprocess-train/,
# then translate dev-0/in.tc.cs and test-A/in.tc.cs and score dev-0 with geval.
#
# Usage: <this script> [GPU_ID ...]      (defaults to GPU 0)
#
# All data paths are relative, so run the script from the directory that
# contains preprocess-train/, dev-0/ and test-A/.
#
# Environment overrides (defaults are the original hard-coded locations):
#   MARIAN         directory holding the compiled marian binaries
#   MOSES_SCRIPTS  moses-scripts/scripts directory (recaser/, tokenizer/)

# Echo script lines as they are read (previously '-v' on the shebang line,
# which is silently dropped when the script is started as 'bash script.sh').
set -v
# Make a pipeline fail when any stage fails, not only the last one.
set -o pipefail

MARIAN=${MARIAN:-/home/jakub/TAU/TAU_24/marian-dev/build}

# The original test-set pipeline pointed at examples/tools/tools/moses-scripts
# (doubled "tools") while the dev-set pipeline used examples/tools/moses-scripts.
# Both now share this single location.
# NOTE(review): confirm that this is the path that exists on the machine.
MOSES_SCRIPTS=${MOSES_SCRIPTS:-/home/jakub/TAU/TAU_24/marian-dev/examples/tools/moses-scripts/scripts}

MODEL_DIR=preprocess-train/model
DATA_DIR=preprocess-train/data

# Print a message to stderr and abort.
die() {
  printf '%s\n' "$*" >&2
  exit 1
}

# if we are in WSL, we need to add '.exe' to the tool names
EXT=""
if [[ -e /bin/wslpath ]]; then
  EXT=.exe
fi

MARIAN_TRAIN=$MARIAN/marian$EXT
MARIAN_DECODER=$MARIAN/marian-decoder$EXT
MARIAN_VOCAB=$MARIAN/marian-vocab$EXT
MARIAN_SCORER=$MARIAN/marian-scorer$EXT

# set chosen gpus
GPUS=(0)
if (( $# > 0 )); then
  # Split on whitespace exactly like the former unquoted $GPUS expansion did
  # (so both '0 1' and 0 1 yield two ids), but without glob expansion.
  read -r -a GPUS <<<"$*"
fi
echo "Using GPUs: ${GPUS[*]}"

if [[ ! -e "$MARIAN_TRAIN" ]]; then
  die "marian is not installed in $MARIAN, you need to compile the toolkit first"
fi

# Build the vocabulary for one language from the training and validation text.
# Arguments: $1 - language suffix (cs or en)
# The result is written to a temporary file first, so a failed run does not
# leave a truncated vocab.*.yml behind that later runs would mistake for a
# finished one.
build_vocab() {
  local lang=$1
  local target="$MODEL_DIR/vocab.$lang.yml"
  local partial="$target.tmp"

  if ! cat "$DATA_DIR/europarl.tc.$lang" "$DATA_DIR/valideuroparl.tc.$lang" \
      | "$MARIAN_VOCAB" --max-size 50000 > "$partial"; then
    rm -f -- "$partial"
    die "creating vocabulary for '$lang' failed"
  fi
  mv -- "$partial" "$target" || die "cannot move $partial to $target"
}

# Translate <dir>/in.tc.cs with the best-translation model, undo the '@@ '
# subword joins, detruecase and detokenize (English), and store the result in
# <dir>/in.tc.cs.output plus a copy in <dir>/out.tsv.
# Arguments: $1 - data set directory (e.g. dev-0)
translate_set() {
  local set_dir=$1
  local input="$set_dir/in.tc.cs"
  local output="$input.output"

  "$MARIAN_DECODER" \
      -c "$MODEL_DIR/model.npz.best-translation.npz.decoder.yml" \
      -d "${GPUS[@]}" -b 12 -n1 \
      --mini-batch 64 --maxi-batch 10 --maxi-batch-sort src \
      < "$input" \
    | sed 's/\@\@ //g' \
    | "$MOSES_SCRIPTS/recaser/detruecase.perl" \
    | "$MOSES_SCRIPTS/tokenizer/detokenizer.perl" -l en \
    > "$output" \
    || die "translating $input failed"

  cp -- "$output" "$set_dir/out.tsv" || die "cannot write $set_dir/out.tsv"
}

mkdir -p -- "$MODEL_DIR" || die "cannot create $MODEL_DIR"

# create common vocabulary
if [[ ! -e "$MODEL_DIR/vocab.cs.yml" || ! -e "$MODEL_DIR/vocab.en.yml" ]]; then
  build_vocab cs
  build_vocab en
fi

# train model (skipped when a best-translation checkpoint already exists)
if [[ ! -e "$MODEL_DIR/model.npz.best-translation.npz" ]]; then
  "$MARIAN_TRAIN" \
    --devices "${GPUS[@]}" \
    --type s2s \
    --early-stopping 10 -e 15 \
    --model "$MODEL_DIR/model.npz" \
    --train-sets "$DATA_DIR/europarl.tc.cs" "$DATA_DIR/europarl.tc.en" \
    --vocabs "$MODEL_DIR/vocab.cs.yml" "$MODEL_DIR/vocab.en.yml" \
    --dim-vocabs 50000 50000 \
    --mini-batch-fit -w 1024 \
    --valid-freq 10000 --save-freq 10000 --disp-freq 1000 \
    --valid-mini-batch 23 --valid-max-length 100 \
    --valid-metrics cross-entropy translation \
    --valid-sets "$DATA_DIR/valideuroparl.tc.cs" "$DATA_DIR/valideuroparl.tc.en" \
    --valid-script-path "bash ./preprocess-train/scripts/validate.sh" \
    --log "$MODEL_DIR/train.log" --valid-log "$MODEL_DIR/valid.log" \
    --overwrite --keep-best \
    || die "marian training failed, see $MODEL_DIR/train.log"
fi

# translate dev set
translate_set dev-0

# translate test set
translate_set test-A

# append the dev-0 score to the running score log
geval -t dev-0 >> scores_geval.txt