#!/bin/bash
#
# Train a Marian transformer model on a gonito.net challenge-like corpus.
#
# arguments
# 1. root of gonito.net challenge-like filestructure
# 2. name of the model and therefore a directory that will contain the model
# 3. expected number of train epochs
#
# Inputs read:
#   <root>/train/in.tsv,  <root>/train/expected.tsv    (training pair)
#   <root>/test-A/in.tsv, <root>/test-A/expected.tsv   (validation pair)
# Outputs written into <model name>/:
#   model.npz, vocab.in.spm, vocab.expected.spm, train.log, valid.log,
#   valid.output
#
# Exit status: 2 on usage error, 1 on failed pre-flight checks, otherwise the
# exit status of the marian process.

set -euo pipefail

# Print a message to stderr and abort.
die() {
  printf 'error: %s\n' "$*" >&2
  exit 1
}

# Extra arguments beyond the third are ignored, as before.
if (( $# < 3 )); then
  printf 'Usage: %s <corpus_root> <model_name> <epochs>\n' "${0##*/}" >&2
  exit 2
fi

# path to corpus
corpus_path="$1"
model_name="$2"
epochs="$3"

# Empty values would silently resolve to paths under "/" — refuse them.
[[ -n "$corpus_path" ]] || die "corpus root (argument 1) must not be empty"
[[ -n "$model_name" ]] || die "model name (argument 2) must not be empty"
[[ "$epochs" =~ ^[0-9]+$ ]] \
  || die "epochs (argument 3) must be a non-negative integer, got '$epochs'"

source_file="$corpus_path"/train/in.tsv
source_file_valid="$corpus_path"/test-A/in.tsv

target_file="$corpus_path"/train/expected.tsv
target_file_valid="$corpus_path"/test-A/expected.tsv

# Check inputs up front so a typo in the corpus path does not leave behind an
# empty model directory.
for required in \
  "$source_file" "$target_file" \
  "$source_file_valid" "$target_file_valid"; do
  [[ -r "$required" ]] || die "cannot read input file: $required"
done

# -p: tolerate an existing directory (re-runs use --overwrite) and create
# missing parents for nested model names.
mkdir -p -- "$model_name" || die "cannot create model directory: $model_name"

# Trainer options, kept in an array so every value stays a separate word
# without relying on backslash line continuations.
marian_args=(
  --type transformer
  --model "$model_name"/model.npz
  --overwrite
  --train-sets "$source_file" "$target_file"
  --max-length 200
  --mini-batch-fit -w 10000 --maxi-batch 1000
  --valid-freq 5000
  --save-freq 5000
  --disp-freq 500
  --beam-size 6 --normalize 0.6
  --enc-depth 6 --dec-depth 6
  --transformer-heads 8
  --transformer-postprocess-emb d
  --transformer-postprocess dan
  --transformer-dropout 0.1 --label-smoothing 0.1
  --learn-rate 0.0003 --lr-warmup 16000 --lr-decay-inv-sqrt 16000 --lr-report
  --optimizer-params 0.9 0.98 1e-09 --clip-norm 5
  --tied-embeddings-all
  --exponential-smoothing
  --log "$model_name"/train.log
  --after-epochs="$epochs"
  --vocabs "$model_name"/vocab.in.spm "$model_name"/vocab.expected.spm
  --valid-log "$model_name"/valid.log
  --valid-metrics perplexity bleu
  --valid-mini-batch 64
  --valid-sets "$source_file_valid" "$target_file_valid"
  --valid-translation-output "$model_name"/valid.output --quiet-translation
)

# The marian binary is expected at this fixed location in the user's home.
~/marian/build/marian "${marian_args[@]}"