adjust to gonito file structure

2022-01-15 18:07:27 +01:00 · 2022-01-15 18:07:27 +01:00 · 942e426366
commit 942e426366
parent 71f1c9580b
4 changed files with 45 additions and 44 deletions
--- a/scripts/do-fastBPE.sh
+++ b/scripts/do-fastBPE.sh
@ -2,26 +2,26 @@

 # arguments
 # 1. root of gonito.net challenge-like filestructure
-# 2. suffix of source sentences, E.g. en for files like train.en
-# 3. suffix of source sentences, E.g. pl for files like train.pl
-# 4. number of BPE merge operations, E.g. 32000
-# Usage example ./do-fastBPE.sh ~/myCorpora en pl 32000
+# 2. number of BPE merge operations, E.g. 32000
+# Usage example ./do-fastBPE.sh ~/sample-gonito-challenge 32000

 # path to corpus
 corpus_path="$1"
 # absolute paths to the files with source sentences
-source_suffix="$2"
-source_path_train="$corpus_path"/train/train."$source_suffix"
-source_path_test="$corpus_path"/test/test."$source_suffix"
+source_path_train="$corpus_path"/train/in.tsv
+source_path_test="$corpus_path"/test-A/in.tsv
 # absolute paths to the files with target sentences
-target_suffix="$3"
-target_path_train="$corpus_path"/train/train."$target_suffix"
-target_path_test="$corpus_path"/test/test."$target_suffix"
+target_path_train="$corpus_path"/train/expected.tsv
+target_path_test="$corpus_path"/test-A/expected.tsv
 # number of BPE merge operations, E.g. 32000
-bpe_merges="$4"
-# path to BPE merges?
+bpe_merges="$2"
+# path to BPE codes?
 codes="$corpus_path"/codes

+for file in "$source_path_train" "$source_path_test" "$target_path_train" "$target_path_test"
+do
+        ex -sc '%s/^/<s>/|%s/$/<\/s>/|x' "$file"
+done

 cd ~/fastBPE
 # learn BPE
@ -29,10 +29,19 @@ cd ~/fastBPE
 # apply codes to train
 ./fast applybpe "$source_path_train"."$bpe_merges" "$source_path_train" "$codes"
 ./fast applybpe "$target_path_train"."$bpe_merges" "$target_path_train" "$codes"
-# get train vocabulary
-./fast getvocab "$source_path_train"."$bpe_merges" > "$source_path_train".vocab."$bpe_merges"
-./fast getvocab "$target_path_train"."$bpe_merges" > "$target_path_train".vocab."$bpe_merges"
-# apply codes to test
-./fast applybpe "$source_path_test"."$bpe_merges" "$source_path_test" "$codes" "$source_path_train".vocab."$bpe_merges"
-./fast applybpe "$target_path_test"."$bpe_merges" "$target_path_test" "$codes" "$target_path_train".vocab."$bpe_merges"
+# get train vocabulary and .yml'ify it: each two-column getvocab row becomes
+# "token:<TAB>value". The output is piped (not redirected) so awk really
+# receives it, and no temp files are left behind in the working directory.
+# NOTE(review): confirm Marian accepts getvocab's second column as the value.
+./fast getvocab "$source_path_train"."$bpe_merges" \
+| awk '{ print $1 ":\t" $2 }' > "$source_path_train".vocab."$bpe_merges".yml
+
+# target side: turn each two-column getvocab row into "token:<TAB>value";
+# piping (instead of redirecting to a file) is what lets awk see the rows.
+# NOTE(review): confirm Marian accepts getvocab's second column as the value.
+./fast getvocab "$target_path_train"."$bpe_merges" \
+| awk '{ print $1 ":\t" $2 }' > "$target_path_train".vocab."$bpe_merges".yml
+## apply codes to test
+#./fast applybpe "$source_path_test"."$bpe_merges" "$source_path_test" "$codes" "$source_path_train".vocab."$bpe_merges"
+#./fast applybpe "$target_path_test"."$bpe_merges" "$target_path_test" "$codes" "$target_path_train".vocab."$bpe_merges"
 cd ~
--- a/scripts/marian-setup.sh
+++ b/scripts/marian-setup.sh
@ -1,7 +1,8 @@
 #!/bin/bash

-# install MarianMT pre-requirements
+# install MarianMT and its pre-requirements
 apt-get install --force-yes git cmake build-essential libboost-system-dev libprotobuf17 protobuf-compiler libprotobuf-dev openssl libssl-dev libgoogle-perftools-dev
+cd ~
 git clone https://github.com/marian-nmt/marian
 mkdir marian/build
 cd marian/build
--- a/scripts/marian-train.sh
+++ b/scripts/marian-train.sh
@ -2,26 +2,22 @@

 # arguments
 # 1. root of gonito.net challenge-like filestructure
-# 2. suffix of source sentences, E.g. en for files like train.en
-# 3. suffix of source sentences, E.g. pl for files like train.pl
-# 4. number of BPE merge operations, E.g. 32000
-# 5. expected number of train epochs
+# 2. number of BPE merge operations, E.g. 32000
+# 3. expected number of train epochs

 # path to corpus
 corpus_path="$1"
-source_suffix="$2"
-target_suffix="$3"
 bpe_merges="$4"
 epochs="$5"

-source_file="$corpus_path"/train/train."$source_suffix"
-source_vocab="$source_file".vocab."$bpe_merges"
+source_file="$corpus_path"/train/in.tsv
+source_vocab="$source_file".vocab."$bpe_merges".yml

-target_file="$corpus_path"/train/train."$target_suffix"
-target_vocab="$target_file".vocab."$bpe_merges"
+target_file="$corpus_path"/train/expected.tsv
+target_vocab="$target_file".vocab."$bpe_merges".yml


-../marian/build/marian \
+~/marian/build/marian \
 --type transformer \
 --overwrite \
 --train-sets "$source_file" "$target_file" \
@ -38,13 +34,12 @@ target_vocab="$target_file".vocab."$bpe_merges"
 --transformer-dropout 0.1 --label-smoothing 0.1 \
 --learn-rate 0.0003 --lr-warmup 16000 --lr-decay-inv-sqrt 16000 --lr-report \
 --optimizer-params 0.9 0.98 1e-09 --clip-norm 5 \
--tied-embeddings-all \
+--tied-embeddings \
 --exponential-smoothing \
 --log ~/train.log \
--vocabs "$source_vocab" "$target_vocab" \
--after-epochs "$epochs" \
+--after-epochs="$epochs" \
+--vocabs "$source_vocab" "$target_vocab"

-#--vocabs model/vocab.ende.yml model/vocab.ende.yml \
 #--early-stopping 10 \
 #--model model/model.npz
 #--valid-log model/valid.log \
--- a/train.sh
+++ b/train.sh
@ -2,17 +2,13 @@

 # arguments
 # 1. root of gonito.net challenge-like filestructure
-# 2. suffix of source sentences, E.g. en for files like train.en
-# 3. suffix of source sentences, E.g. pl for files like train.pl
-# 4. number of BPE merge operations, E.g. 32000
-# 5. expected number of train epochs
+# 2. number of BPE merge operations, E.g. 32000
+# 3. expected number of train epochs


 corpus_path="$1"
-source_suffix="$2"
-target_suffix="$3"
-bpe_merges="$4"
-epochs="$5"
+bpe_merges="$2"
+epochs="$3"

-./scripts/do-fastBPE.sh "$corpus_path" "$source_suffix" "$target_suffix" "$bpe_merges"
-./scripts/marian-train.sh "$corpus_path" "$source_suffix" "$target_suffix" "$bpe_merges" "$epochs"
+./scripts/do-fastBPE.sh "$corpus_path" "$bpe_merges"
+./scripts/marian-train.sh "$corpus_path" "$bpe_merges" "$epochs"