#!/bin/bash
#
# Train a Marian s2s model (cs -> en) on the data under preprocess-train/,
# translate the dev-0 and test-A sets with the best checkpoint, and append the
# geval score for dev-0 to scores_geval.txt.
#
# Usage: <this-script> [GPU_ID ...]
#   GPU_ID  device ids handed to marian / marian-decoder (default: 0)
#
# Steps whose outputs already exist (vocabularies, best model) are skipped, so
# the script can be re-run after an interruption.

# Abort on the first failing command, unset variable or failing pipeline stage,
# so that a broken training/decoding run never overwrites out.tsv or appends a
# bogus score. Options are set here rather than in the shebang so they also
# apply when the script is started as `bash <script>`.
set -euo pipefail
# Echo script lines as they are read (previously requested via `#!/bin/bash -v`).
set -v

MARIAN=/home/jakub/TAU/TAU_24/marian-dev/build
MOSES_SCRIPTS=/home/jakub/TAU/TAU_24/marian-dev/examples/tools/moses-scripts/scripts

DATA_DIR=preprocess-train/data
MODEL_DIR=preprocess-train/model
BEST_MODEL=$MODEL_DIR/model.npz.best-translation.npz

# if we are in WSL, we need to add '.exe' to the tool names
EXT=""
if [[ -e /bin/wslpath ]]; then
  EXT=.exe
fi

MARIAN_TRAIN=$MARIAN/marian$EXT
MARIAN_DECODER=$MARIAN/marian-decoder$EXT
MARIAN_VOCAB=$MARIAN/marian-vocab$EXT
# Not used below; kept so the tool locations stay defined in one place.
# shellcheck disable=SC2034
MARIAN_SCORER=$MARIAN/marian-scorer$EXT

# set chosen gpus
# Kept as an array so every id reaches the tools as its own argument. Splitting
# "$*" on whitespace accepts both `script 0 1` and `script "0 1"`.
GPUS=(0)
if (( $# != 0 )); then
  read -r -a GPUS <<< "$*"
fi
echo "Using GPUs: ${GPUS[*]}"

if [[ ! -e "$MARIAN_TRAIN" ]]; then
  echo "marian is not installed in $MARIAN, you need to compile the toolkit first" >&2
  exit 1
fi

#######################################
# Translate <dir>/in.tc.cs with the best model, undo truecasing and
# tokenization, and store the result as <dir>/in.tc.cs.output and <dir>/out.tsv.
# Globals:   MARIAN_DECODER, BEST_MODEL, GPUS, MOSES_SCRIPTS (read)
# Arguments: $1 - data set directory (e.g. dev-0)
# Returns:   non-zero if any pipeline stage or the final copy fails
#######################################
translate_set() {
  local set_dir=$1

  "$MARIAN_DECODER" \
      -c "$BEST_MODEL.decoder.yml" \
      -d "${GPUS[@]}" -b 12 -n1 \
      --mini-batch 64 --maxi-batch 10 --maxi-batch-sort src \
      < "$set_dir/in.tc.cs" \
    | sed 's/\@\@ //g' \
    | "$MOSES_SCRIPTS/recaser/detruecase.perl" \
    | "$MOSES_SCRIPTS/tokenizer/detokenizer.perl" -l en \
    > "$set_dir/in.tc.cs.output"

  cp "$set_dir/in.tc.cs.output" "$set_dir/out.tsv"
}

# create source (cs) and target (en) vocabularies from training + validation data
if [[ ! -e "$MODEL_DIR/vocab.cs.yml" || ! -e "$MODEL_DIR/vocab.en.yml" ]]; then
  # The redirections below fail if the model directory does not exist yet.
  mkdir -p "$MODEL_DIR"
  cat "$DATA_DIR/europarl.tc.cs" "$DATA_DIR/valideuroparl.tc.cs" \
    | "$MARIAN_VOCAB" --max-size 50000 > "$MODEL_DIR/vocab.cs.yml"
  cat "$DATA_DIR/europarl.tc.en" "$DATA_DIR/valideuroparl.tc.en" \
    | "$MARIAN_VOCAB" --max-size 50000 > "$MODEL_DIR/vocab.en.yml"
fi

# train model (skipped when a best-translation checkpoint is already present)
if [[ ! -e "$BEST_MODEL" ]]; then
  "$MARIAN_TRAIN" \
    --devices "${GPUS[@]}" \
    --type s2s \
    --early-stopping 10 -e 15 \
    --model "$MODEL_DIR/model.npz" \
    --train-sets "$DATA_DIR/europarl.tc.cs" "$DATA_DIR/europarl.tc.en" \
    --vocabs "$MODEL_DIR/vocab.cs.yml" "$MODEL_DIR/vocab.en.yml" \
    --dim-vocabs 50000 50000 \
    --mini-batch-fit -w 1024 \
    --valid-freq 10000 --save-freq 10000 --disp-freq 1000 \
    --valid-mini-batch 23 --valid-max-length 100 \
    --valid-metrics cross-entropy translation \
    --valid-sets "$DATA_DIR/valideuroparl.tc.cs" "$DATA_DIR/valideuroparl.tc.en" \
    --valid-script-path "bash ./preprocess-train/scripts/validate.sh" \
    --log "$MODEL_DIR/train.log" --valid-log "$MODEL_DIR/valid.log" \
    --overwrite --keep-best
fi

# translate dev set
translate_set dev-0

# translate test set
translate_set test-A

# score the dev-set translation and append the result to the running log
geval -t dev-0 >> scores_geval.txt