Switch the fastBPE / Marian helper scripts to the gonito.net challenge layout.

* Corpus files are now train/in.tsv, train/expected.tsv, test-A/in.tsv and
  test-A/expected.tsv, so the language-suffix arguments are dropped:
    scripts/do-fastBPE.sh   <corpus root> <BPE merges>
    scripts/marian-train.sh <corpus root> <BPE merges> <epochs>
    train.sh                <corpus root> <BPE merges> <epochs>
* do-fastBPE.sh appends </s> to every line of the four corpus files (in
  place) and writes each train vocabulary as <file>.vocab.<merges>.yml,
  which marian-train.sh passes to --vocabs.
* marian-setup.sh clones Marian into ~ and marian-train.sh runs
  ~/marian/build/marian.

diff --git a/scripts/do-fastBPE.sh b/scripts/do-fastBPE.sh
index 52c14cc..0b1e227 100755
--- a/scripts/do-fastBPE.sh
+++ b/scripts/do-fastBPE.sh
@@ -2,26 +2,26 @@
 
 # arguments
 # 1. root of gonito.net challenge-like filestructure
-# 2. suffix of source sentences, E.g. en for files like train.en
-# 3. suffix of source sentences, E.g. pl for files like train.pl
-# 4. number of BPE merge operations, E.g. 32000
-# Usage example ./do-fastBPE.sh ~/myCorpora en pl 32000
+# 2. number of BPE merge operations, E.g. 32000
+# Usage example ./do-fastBPE.sh ~/sample-gonito-challenge 32000
 
 # path to corpus
 corpus_path="$1"
 # absolute paths to the files with source sentences
-source_suffix="$2"
-source_path_train="$corpus_path"/train/train."$source_suffix"
-source_path_test="$corpus_path"/test/test."$source_suffix"
+source_path_train="$corpus_path"/train/in.tsv
+source_path_test="$corpus_path"/test-A/in.tsv
 
 # absolute paths to the files with target sentences
-target_suffix="$3"
-target_path_train="$corpus_path"/train/train."$target_suffix"
-target_path_test="$corpus_path"/test/test."$target_suffix"
+target_path_train="$corpus_path"/train/expected.tsv
+target_path_test="$corpus_path"/test-A/expected.tsv
 
 # number of BPE merge operations, E.g. 32000
-bpe_merges="$4"
-# path to BPE merges?
+bpe_merges="$2"
+# path to BPE codes?
 codes="$corpus_path"/codes
+for file in "$source_path_train" "$source_path_test" "$target_path_train" "$target_path_test"
+do
+  ex -sc '%s/^//|%s/$/<\/s>/|x' "$file"
+done
 cd ~/fastBPE
 # learn BPE
@@ -29,10 +29,19 @@ cd ~/fastBPE
 # apply codes to train
 ./fast applybpe "$source_path_train"."$bpe_merges" "$source_path_train" "$codes"
 ./fast applybpe "$target_path_train"."$bpe_merges" "$target_path_train" "$codes"
-# get train vocabulary
-./fast getvocab "$source_path_train"."$bpe_merges" > "$source_path_train".vocab."$bpe_merges"
-./fast getvocab "$target_path_train"."$bpe_merges" > "$target_path_train".vocab."$bpe_merges"
-# apply codes to test
-./fast applybpe "$source_path_test"."$bpe_merges" "$source_path_test" "$codes" "$source_path_train".vocab."$bpe_merges"
-./fast applybpe "$target_path_test"."$bpe_merges" "$target_path_test" "$codes" "$target_path_train".vocab."$bpe_merges"
+# get train vocabulary and .yml'ify it
+./fast getvocab "$source_path_train"."$bpe_merges" > "$source_path_train".vocab."$bpe_merges"
+cut -f 1 "$source_path_train".vocab."$bpe_merges" > temp1 && cut -f 2 "$source_path_train".vocab."$bpe_merges" > temp2
+ex -sc '%s/$/:/|x' temp1
+paste temp1 temp2 > "$source_path_train".vocab."$bpe_merges".yml
+rm temp1 temp2 "$source_path_train".vocab."$bpe_merges"
+
+./fast getvocab "$target_path_train"."$bpe_merges" > "$target_path_train".vocab."$bpe_merges"
+cut -f 1 "$target_path_train".vocab."$bpe_merges" > temp1 && cut -f 2 "$target_path_train".vocab."$bpe_merges" > temp2
+ex -sc '%s/$/:/|x' temp1
+paste temp1 temp2 > "$target_path_train".vocab."$bpe_merges".yml
+rm temp1 temp2 "$target_path_train".vocab."$bpe_merges"
+## apply codes to test
+#./fast applybpe "$source_path_test"."$bpe_merges" "$source_path_test" "$codes" "$source_path_train".vocab."$bpe_merges"
+#./fast applybpe "$target_path_test"."$bpe_merges" "$target_path_test" "$codes" "$target_path_train".vocab."$bpe_merges"
 cd ~
diff --git a/scripts/marian-setup.sh b/scripts/marian-setup.sh
index 7a9342d..eb4ae1d 100755
--- a/scripts/marian-setup.sh
+++ b/scripts/marian-setup.sh
@@ -1,7 +1,8 @@
 #!/bin/bash
 
-# install MarianMT pre-requirements
+# install MarianMT and its pre-requirements
 apt-get install --force-yes git cmake build-essential libboost-system-dev libprotobuf17 protobuf-compiler libprotobuf-dev openssl libssl-dev libgoogle-perftools-dev
+cd ~
 git clone https://github.com/marian-nmt/marian
 mkdir marian/build
 cd marian/build
diff --git a/scripts/marian-train.sh b/scripts/marian-train.sh
index bc93c1c..cfb30d2 100755
--- a/scripts/marian-train.sh
+++ b/scripts/marian-train.sh
@@ -2,26 +2,22 @@
 
 # arguments
 # 1. root of gonito.net challenge-like filestructure
-# 2. suffix of source sentences, E.g. en for files like train.en
-# 3. suffix of source sentences, E.g. pl for files like train.pl
-# 4. number of BPE merge operations, E.g. 32000
-# 5. expected number of train epochs
+# 2. number of BPE merge operations, E.g. 32000
+# 3. expected number of train epochs
 
 # path to corpus
 corpus_path="$1"
 
-source_suffix="$2"
-target_suffix="$3"
-bpe_merges="$4"
-epochs="$5"
+bpe_merges="$2"
+epochs="$3"
 
-source_file="$corpus_path"/train/train."$source_suffix"
-source_vocab="$source_file".vocab."$bpe_merges"
+source_file="$corpus_path"/train/in.tsv
+source_vocab="$source_file".vocab."$bpe_merges".yml
 
-target_file="$corpus_path"/train/train."$target_suffix"
-target_vocab="$target_file".vocab."$bpe_merges"
+target_file="$corpus_path"/train/expected.tsv
+target_vocab="$target_file".vocab."$bpe_merges".yml
 
-../marian/build/marian \
+~/marian/build/marian \
 --type transformer \
 --overwrite \
 --train-sets "$source_file" "$target_file" \
@@ -38,13 +34,12 @@ target_vocab="$target_file".vocab."$bpe_merges"
 --transformer-dropout 0.1 --label-smoothing 0.1 \
 --learn-rate 0.0003 --lr-warmup 16000 --lr-decay-inv-sqrt 16000 --lr-report \
 --optimizer-params 0.9 0.98 1e-09 --clip-norm 5 \
---tied-embeddings-all \
+--tied-embeddings \
 --exponential-smoothing \
 --log ~/train.log \
---vocabs "$source_vocab" "$target_vocab" \
---after-epochs "$epochs" \
+--after-epochs="$epochs" \
+--vocabs "$source_vocab" "$target_vocab"
 
-#--vocabs model/vocab.ende.yml model/vocab.ende.yml \
 #--early-stopping 10 \
 #--model model/model.npz
 #--valid-log model/valid.log \
diff --git a/train.sh b/train.sh
index fb53196..2f8e504 100755
--- a/train.sh
+++ b/train.sh
@@ -2,17 +2,13 @@
 
 # arguments
 # 1. root of gonito.net challenge-like filestructure
-# 2. suffix of source sentences, E.g. en for files like train.en
-# 3. suffix of source sentences, E.g. pl for files like train.pl
-# 4. number of BPE merge operations, E.g. 32000
-# 5. expected number of train epochs
+# 2. number of BPE merge operations, E.g. 32000
+# 3. expected number of train epochs
 
 corpus_path="$1"
 
-source_suffix="$2"
-target_suffix="$3"
-bpe_merges="$4"
-epochs="$5"
+bpe_merges="$2"
+epochs="$3"
 
-./scripts/do-fastBPE.sh "$corpus_path" "$source_suffix" "$target_suffix" "$bpe_merges"
-./scripts/marian-train.sh "$corpus_path" "$source_suffix" "$target_suffix" "$bpe_merges" "$epochs"
\ No newline at end of file
+./scripts/do-fastBPE.sh "$corpus_path" "$bpe_merges"
+./scripts/marian-train.sh "$corpus_path" "$bpe_merges" "$epochs"
\ No newline at end of file