adjust to gonito file structure

This commit is contained in:
jakubknczny 2022-01-15 18:07:27 +01:00
parent 71f1c9580b
commit 942e426366
4 changed files with 45 additions and 44 deletions

View File

@ -2,26 +2,26 @@
# arguments # arguments
# 1. root of gonito.net challenge-like filestructure # 1. root of gonito.net challenge-like filestructure
# 2. suffix of source sentences, E.g. en for files like train.en # 2. number of BPE merge operations, E.g. 32000
# 3. suffix of target sentences, E.g. pl for files like train.pl # Usage example ./do-fastBPE.sh ~/sample-gonito-challenge 32000
# 4. number of BPE merge operations, E.g. 32000
# Usage example ./do-fastBPE.sh ~/myCorpora en pl 32000
# path to corpus # path to corpus
corpus_path="$1" corpus_path="$1"
# absolute paths to the files with source sentences # absolute paths to the files with source sentences
source_suffix="$2" source_path_train="$corpus_path"/train/in.tsv
source_path_train="$corpus_path"/train/train."$source_suffix" source_path_test="$corpus_path"/test-A/in.tsv
source_path_test="$corpus_path"/test/test."$source_suffix"
# absolute paths to the files with target sentences # absolute paths to the files with target sentences
target_suffix="$3" target_path_train="$corpus_path"/train/expected.tsv
target_path_train="$corpus_path"/train/train."$target_suffix" target_path_test="$corpus_path"/test-A/expected.tsv
target_path_test="$corpus_path"/test/test."$target_suffix"
# number of BPE merge operations, E.g. 32000 # number of BPE merge operations, E.g. 32000
bpe_merges="$4" bpe_merges="$2"
# path to BPE merges? # path to BPE codes?
codes="$corpus_path"/codes codes="$corpus_path"/codes
for file in "$source_path_train" "$source_path_test" "$target_path_train" "$target_path_test"
do
ex -sc '%s/^/<s>/|%s/$/<\/s>/|x' "$file"
done
cd ~/fastBPE cd ~/fastBPE
# learn BPE # learn BPE
@ -29,10 +29,19 @@ cd ~/fastBPE
# apply codes to train # apply codes to train
./fast applybpe "$source_path_train"."$bpe_merges" "$source_path_train" "$codes" ./fast applybpe "$source_path_train"."$bpe_merges" "$source_path_train" "$codes"
./fast applybpe "$target_path_train"."$bpe_merges" "$target_path_train" "$codes" ./fast applybpe "$target_path_train"."$bpe_merges" "$target_path_train" "$codes"
# get train vocabulary # get train vocabulary and .yml'ify it
./fast getvocab "$source_path_train"."$bpe_merges" > "$source_path_train".vocab."$bpe_merges" ./fast getvocab "$source_path_train"."$bpe_merges" > "$source_path_train".vocab."$bpe_merges" \
./fast getvocab "$target_path_train"."$bpe_merges" > "$target_path_train".vocab."$bpe_merges" tee >(cut -f 1 > temp1) | cut -f 2 > temp2
# apply codes to test ex -sc '%s/$/:/|x' temp1
./fast applybpe "$source_path_test"."$bpe_merges" "$source_path_test" "$codes" "$source_path_train".vocab."$bpe_merges" paste temp1 temp2 > "$source_path_train".vocab."$bpe_merges".yml
./fast applybpe "$target_path_test"."$bpe_merges" "$target_path_test" "$codes" "$target_path_train".vocab."$bpe_merges" rm temp1 temp2 "$source_path_train".vocab."$bpe_merges"
./fast getvocab "$target_path_train"."$bpe_merges" > "$target_path_train".vocab."$bpe_merges" \
tee >(cut -f 1 > temp1) | cut -f 2 > temp2
ex -sc '%s/$/:/|x' temp1
paste temp1 temp2 > "$target_path_train".vocab."$bpe_merges".yml
rm temp1 temp2 "$target_path_train".vocab."$bpe_merges"
## apply codes to test
#./fast applybpe "$source_path_test"."$bpe_merges" "$source_path_test" "$codes" "$source_path_train".vocab."$bpe_merges"
#./fast applybpe "$target_path_test"."$bpe_merges" "$target_path_test" "$codes" "$target_path_train".vocab."$bpe_merges"
cd ~ cd ~

View File

@ -1,7 +1,8 @@
#!/bin/bash #!/bin/bash
# install MarianMT pre-requirements # install MarianMT and its pre-requirements
apt-get install --force-yes git cmake build-essential libboost-system-dev libprotobuf17 protobuf-compiler libprotobuf-dev openssl libssl-dev libgoogle-perftools-dev apt-get install --force-yes git cmake build-essential libboost-system-dev libprotobuf17 protobuf-compiler libprotobuf-dev openssl libssl-dev libgoogle-perftools-dev
cd ~
git clone https://github.com/marian-nmt/marian git clone https://github.com/marian-nmt/marian
mkdir marian/build mkdir marian/build
cd marian/build cd marian/build

View File

@ -2,26 +2,22 @@
# arguments # arguments
# 1. root of gonito.net challenge-like filestructure # 1. root of gonito.net challenge-like filestructure
# 2. suffix of source sentences, E.g. en for files like train.en # 2. number of BPE merge operations, E.g. 32000
# 3. suffix of target sentences, E.g. pl for files like train.pl # 3. expected number of train epochs
# 4. number of BPE merge operations, E.g. 32000
# 5. expected number of train epochs
# path to corpus # path to corpus
corpus_path="$1" corpus_path="$1"
source_suffix="$2"
target_suffix="$3"
bpe_merges="$4" bpe_merges="$4"
epochs="$5" epochs="$5"
source_file="$corpus_path"/train/train."$source_suffix" source_file="$corpus_path"/train/in.tsv
source_vocab="$source_file".vocab."$bpe_merges" source_vocab="$source_file".vocab."$bpe_merges".yml
target_file="$corpus_path"/train/train."$target_suffix" target_file="$corpus_path"/train/expected.tsv
target_vocab="$target_file".vocab."$bpe_merges" target_vocab="$target_file".vocab."$bpe_merges".yml
../marian/build/marian \ ~/marian/build/marian \
--type transformer \ --type transformer \
--overwrite \ --overwrite \
--train-sets "$source_file" "$target_file" \ --train-sets "$source_file" "$target_file" \
@ -38,13 +34,12 @@ target_vocab="$target_file".vocab."$bpe_merges"
--transformer-dropout 0.1 --label-smoothing 0.1 \ --transformer-dropout 0.1 --label-smoothing 0.1 \
--learn-rate 0.0003 --lr-warmup 16000 --lr-decay-inv-sqrt 16000 --lr-report \ --learn-rate 0.0003 --lr-warmup 16000 --lr-decay-inv-sqrt 16000 --lr-report \
--optimizer-params 0.9 0.98 1e-09 --clip-norm 5 \ --optimizer-params 0.9 0.98 1e-09 --clip-norm 5 \
--tied-embeddings-all \ --tied-embeddings \
--exponential-smoothing \ --exponential-smoothing \
--log ~/train.log \ --log ~/train.log \
--vocabs "$source_vocab" "$target_vocab" \ --after-epochs="$epochs" \
--after-epochs "$epochs" \ --vocabs "$source_vocab" "$target_vocab"
#--vocabs model/vocab.ende.yml model/vocab.ende.yml \
#--early-stopping 10 \ #--early-stopping 10 \
#--model model/model.npz #--model model/model.npz
#--valid-log model/valid.log \ #--valid-log model/valid.log \

View File

@ -2,17 +2,13 @@
# arguments # arguments
# 1. root of gonito.net challenge-like filestructure # 1. root of gonito.net challenge-like filestructure
# 2. suffix of source sentences, E.g. en for files like train.en # 2. number of BPE merge operations, E.g. 32000
# 3. suffix of target sentences, E.g. pl for files like train.pl # 3. expected number of train epochs
# 4. number of BPE merge operations, E.g. 32000
# 5. expected number of train epochs
corpus_path="$1" corpus_path="$1"
source_suffix="$2" bpe_merges="$2"
target_suffix="$3" epochs="$3"
bpe_merges="$4"
epochs="$5"
./scripts/do-fastBPE.sh "$corpus_path" "$source_suffix" "$target_suffix" "$bpe_merges" ./scripts/do-fastBPE.sh "$corpus_path" "$bpe_merges"
./scripts/marian-train.sh "$corpus_path" "$source_suffix" "$target_suffix" "$bpe_merges" "$epochs" ./scripts/marian-train.sh "$corpus_path" "$bpe_merges" "$epochs"