From 95f3b9043c6093a2637c5a2bcae7dab0ad4494ee Mon Sep 17 00:00:00 2001 From: jakubknczny Date: Mon, 31 Jan 2022 18:33:10 +0100 Subject: [PATCH] update marian-train --- README.md | 5 +++-- scripts/marian-train.sh | 38 ++++++++++++++++++-------------------- train.sh | 9 +++------ 3 files changed, 24 insertions(+), 28 deletions(-) diff --git a/README.md b/README.md index 56ef77e..514295c 100644 --- a/README.md +++ b/README.md @@ -1,5 +1,6 @@ -# PLEWI -- clone PLEWI repo into ~/ +# NEEDS TO BE UPDATED +## PLEWI +- clone PLEWI repo into ~ - run following commands: - git clone https://git.wmi.amu.edu.pl/s470607/transfix-train.git - cd transfix-train diff --git a/scripts/marian-train.sh b/scripts/marian-train.sh index f5d3710..a91b430 100755 --- a/scripts/marian-train.sh +++ b/scripts/marian-train.sh @@ -2,26 +2,28 @@ # arguments # 1. root of gonito.net challenge-like filestructure -# 2. number of BPE merge operations, E.g. 32000 +# 2. name of the model and therefore a directory that will contain the model # 3. expected number of train epochs # path to corpus corpus_path="$1" -bpe_merges="$2" +model_name="$2" epochs="$3" -source_file="$corpus_path"/train/in.tsv."$bpe_merges" -source_vocab="$source_file".vocab."$bpe_merges".yml +source_file="$corpus_path"/train/in.tsv +source_file_valid="$corpus_path"/test-A/in.tsv -target_file="$corpus_path"/train/expected.tsv."$bpe_merges" -target_vocab="$target_file".vocab."$bpe_merges".yml +target_file="$corpus_path"/train/expected.tsv +target_file_valid="$corpus_path"/test-A/expected.tsv +mkdir "$model_name" ~/marian/build/marian \ - --type transformer \ +--type transformer \ +--model "$model_name"/model.npz \ --overwrite \ --train-sets "$source_file" "$target_file" \ ---max-length 100 \ +--max-length 200 \ --mini-batch-fit -w 10000 --maxi-batch 1000 \ --valid-freq 5000 \ --save-freq 5000 \ @@ -36,15 +38,11 @@ target_vocab="$target_file".vocab."$bpe_merges".yml --optimizer-params 0.9 0.98 1e-09 --clip-norm 5 \ --tied-embeddings \ --exponential-smoothing \ ---log ~/train.log \ ---after-epochs="$epochs" -#--vocabs "$source_vocab" "$target_vocab" - -#--early-stopping 10 \ -#--model model/model.npz -#--valid-log model/valid.log \ -#--valid-metrics cross-entropy perplexity translation \ -#--valid-sets data/valid.bpe.en data/valid.bpe.de \ -#--valid-script-path ./scripts/validate.sh \ -#--valid-translation-output data/valid.bpe.en.output --quiet-translation \ -#--valid-mini-batch 64 \ +--log "$model_name"/train.log \ +--after-epochs="$epochs" \ +--vocabs "$model_name"/vocab.in.spm "$model_name"/vocab.expected.spm \ +--valid-log "$model_name"/valid.log \ +--valid-metrics cross-entropy perplexity bleu \ +--valid-mini-batch 64 \ +--valid-sets "$source_file_valid" "$target_file_valid" \ +--valid-translation-output "$model_name"/valid.output --quiet-translation diff --git a/train.sh b/train.sh index 2f8e504..8b0f796 100755 --- a/train.sh +++ b/train.sh @@ -2,13 +2,10 @@ # arguments # 1. root of gonito.net challenge-like filestructure -# 2. number of BPE merge operations, E.g. 32000 -# 3. expected number of train epochs +# 2. expected number of train epochs corpus_path="$1" -bpe_merges="$2" -epochs="$3" +epochs="$2" -./scripts/do-fastBPE.sh "$corpus_path" "$bpe_merges" -./scripts/marian-train.sh "$corpus_path" "$bpe_merges" "$epochs" \ No newline at end of file +./scripts/marian-train.sh "$corpus_path" "$epochs" \ No newline at end of file