adjust to gonito file structure
This commit is contained in:
parent
71f1c9580b
commit
942e426366
@ -2,26 +2,26 @@
|
|||||||
|
|
||||||
# arguments
|
# arguments
|
||||||
# 1. root of gonito.net challenge-like filestructure
|
# 1. root of gonito.net challenge-like filestructure
|
||||||
# 2. suffix of source sentences, E.g. en for files like train.en
|
# 2. number of BPE merge operations, E.g. 32000
|
||||||
# 3. suffix of target sentences, E.g. pl for files like train.pl
|
# Usage example ./do-fastBPE.sh ~/sample-gonito-challenge 32000
|
||||||
# 4. number of BPE merge operations, E.g. 32000
|
|
||||||
# Usage example ./do-fastBPE.sh ~/myCorpora en pl 32000
|
|
||||||
|
|
||||||
# path to corpus
|
# path to corpus
|
||||||
corpus_path="$1"
|
corpus_path="$1"
|
||||||
# absolute paths to the files with source sentences
|
# absolute paths to the files with source sentences
|
||||||
source_suffix="$2"
|
source_path_train="$corpus_path"/train/in.tsv
|
||||||
source_path_train="$corpus_path"/train/train."$source_suffix"
|
source_path_test="$corpus_path"/test-A/in.tsv
|
||||||
source_path_test="$corpus_path"/test/test."$source_suffix"
|
|
||||||
# absolute paths to the files with target sentences
|
# absolute paths to the files with target sentences
|
||||||
target_suffix="$3"
|
target_path_train="$corpus_path"/train/expected.tsv
|
||||||
target_path_train="$corpus_path"/train/train."$target_suffix"
|
target_path_test="$corpus_path"/test-A/expected.tsv
|
||||||
target_path_test="$corpus_path"/test/test."$target_suffix"
|
|
||||||
# number of BPE merge operations, E.g. 32000
|
# number of BPE merge operations, E.g. 32000
|
||||||
bpe_merges="$4"
|
bpe_merges="$2"
|
||||||
# path to BPE merges?
|
# path to BPE codes?
|
||||||
codes="$corpus_path"/codes
|
codes="$corpus_path"/codes
|
||||||
|
|
||||||
|
for file in "$source_path_train" "$source_path_test" "$target_path_train" "$target_path_test"
|
||||||
|
do
|
||||||
|
ex -sc '%s/^/<s>/|%s/$/<\/s>/|x' "$file"
|
||||||
|
done
|
||||||
|
|
||||||
cd ~/fastBPE
|
cd ~/fastBPE
|
||||||
# learn BPE
|
# learn BPE
|
||||||
@ -29,10 +29,19 @@ cd ~/fastBPE
|
|||||||
# apply codes to train
|
# apply codes to train
|
||||||
./fast applybpe "$source_path_train"."$bpe_merges" "$source_path_train" "$codes"
|
./fast applybpe "$source_path_train"."$bpe_merges" "$source_path_train" "$codes"
|
||||||
./fast applybpe "$target_path_train"."$bpe_merges" "$target_path_train" "$codes"
|
./fast applybpe "$target_path_train"."$bpe_merges" "$target_path_train" "$codes"
|
||||||
# get train vocabulary
|
# get train vocabulary and .yml'ify it
|
||||||
./fast getvocab "$source_path_train"."$bpe_merges" > "$source_path_train".vocab."$bpe_merges"
|
./fast getvocab "$source_path_train"."$bpe_merges" > "$source_path_train".vocab."$bpe_merges" \
|
||||||
./fast getvocab "$target_path_train"."$bpe_merges" > "$target_path_train".vocab."$bpe_merges"
|
tee >(cut -f 1 > temp1) | cut -f 2 > temp2
|
||||||
# apply codes to test
|
ex -sc '%s/$/:/|x' temp1
|
||||||
./fast applybpe "$source_path_test"."$bpe_merges" "$source_path_test" "$codes" "$source_path_train".vocab."$bpe_merges"
|
paste temp1 temp2 > "$source_path_train".vocab."$bpe_merges".yml
|
||||||
./fast applybpe "$target_path_test"."$bpe_merges" "$target_path_test" "$codes" "$target_path_train".vocab."$bpe_merges"
|
rm temp1 temp2 "$source_path_train".vocab."$bpe_merges"
|
||||||
|
|
||||||
|
./fast getvocab "$target_path_train"."$bpe_merges" > "$target_path_train".vocab."$bpe_merges" \
|
||||||
|
tee >(cut -f 1 > temp1) | cut -f 2 > temp2
|
||||||
|
ex -sc '%s/$/:/|x' temp1
|
||||||
|
paste temp1 temp2 > "$target_path_train".vocab."$bpe_merges".yml
|
||||||
|
rm temp1 temp2 "$target_path_train".vocab."$bpe_merges"
|
||||||
|
## apply codes to test
|
||||||
|
#./fast applybpe "$source_path_test"."$bpe_merges" "$source_path_test" "$codes" "$source_path_train".vocab."$bpe_merges"
|
||||||
|
#./fast applybpe "$target_path_test"."$bpe_merges" "$target_path_test" "$codes" "$target_path_train".vocab."$bpe_merges"
|
||||||
cd ~
|
cd ~
|
||||||
|
@ -1,7 +1,8 @@
|
|||||||
#!/bin/bash
|
#!/bin/bash
|
||||||
|
|
||||||
# install MarianMT pre-requirements
|
# install MarianMT and its pre-requirements
|
||||||
apt-get install --force-yes git cmake build-essential libboost-system-dev libprotobuf17 protobuf-compiler libprotobuf-dev openssl libssl-dev libgoogle-perftools-dev
|
apt-get install --force-yes git cmake build-essential libboost-system-dev libprotobuf17 protobuf-compiler libprotobuf-dev openssl libssl-dev libgoogle-perftools-dev
|
||||||
|
cd ~
|
||||||
git clone https://github.com/marian-nmt/marian
|
git clone https://github.com/marian-nmt/marian
|
||||||
mkdir marian/build
|
mkdir marian/build
|
||||||
cd marian/build
|
cd marian/build
|
||||||
|
@ -2,26 +2,22 @@
|
|||||||
|
|
||||||
# arguments
|
# arguments
|
||||||
# 1. root of gonito.net challenge-like filestructure
|
# 1. root of gonito.net challenge-like filestructure
|
||||||
# 2. suffix of source sentences, E.g. en for files like train.en
|
# 2. number of BPE merge operations, E.g. 32000
|
||||||
# 3. suffix of target sentences, E.g. pl for files like train.pl
|
# 3. expected number of train epochs
|
||||||
# 4. number of BPE merge operations, E.g. 32000
|
|
||||||
# 5. expected number of train epochs
|
|
||||||
|
|
||||||
# path to corpus
|
# path to corpus
|
||||||
corpus_path="$1"
|
corpus_path="$1"
|
||||||
source_suffix="$2"
|
|
||||||
target_suffix="$3"
|
|
||||||
bpe_merges="$4"
|
bpe_merges="$4"
|
||||||
epochs="$5"
|
epochs="$5"
|
||||||
|
|
||||||
source_file="$corpus_path"/train/train."$source_suffix"
|
source_file="$corpus_path"/train/in.tsv
|
||||||
source_vocab="$source_file".vocab."$bpe_merges"
|
source_vocab="$source_file".vocab."$bpe_merges".yml
|
||||||
|
|
||||||
target_file="$corpus_path"/train/train."$target_suffix"
|
target_file="$corpus_path"/train/expected.tsv
|
||||||
target_vocab="$target_file".vocab."$bpe_merges"
|
target_vocab="$target_file".vocab."$bpe_merges".yml
|
||||||
|
|
||||||
|
|
||||||
../marian/build/marian \
|
~/marian/build/marian \
|
||||||
--type transformer \
|
--type transformer \
|
||||||
--overwrite \
|
--overwrite \
|
||||||
--train-sets "$source_file" "$target_file" \
|
--train-sets "$source_file" "$target_file" \
|
||||||
@ -38,13 +34,12 @@ target_vocab="$target_file".vocab."$bpe_merges"
|
|||||||
--transformer-dropout 0.1 --label-smoothing 0.1 \
|
--transformer-dropout 0.1 --label-smoothing 0.1 \
|
||||||
--learn-rate 0.0003 --lr-warmup 16000 --lr-decay-inv-sqrt 16000 --lr-report \
|
--learn-rate 0.0003 --lr-warmup 16000 --lr-decay-inv-sqrt 16000 --lr-report \
|
||||||
--optimizer-params 0.9 0.98 1e-09 --clip-norm 5 \
|
--optimizer-params 0.9 0.98 1e-09 --clip-norm 5 \
|
||||||
--tied-embeddings-all \
|
--tied-embeddings \
|
||||||
--exponential-smoothing \
|
--exponential-smoothing \
|
||||||
--log ~/train.log \
|
--log ~/train.log \
|
||||||
--vocabs "$source_vocab" "$target_vocab" \
|
--after-epochs="$epochs" \
|
||||||
--after-epochs "$epochs" \
|
--vocabs "$source_vocab" "$target_vocab"
|
||||||
|
|
||||||
#--vocabs model/vocab.ende.yml model/vocab.ende.yml \
|
|
||||||
#--early-stopping 10 \
|
#--early-stopping 10 \
|
||||||
#--model model/model.npz
|
#--model model/model.npz
|
||||||
#--valid-log model/valid.log \
|
#--valid-log model/valid.log \
|
||||||
|
16
train.sh
16
train.sh
@ -2,17 +2,13 @@
|
|||||||
|
|
||||||
# arguments
|
# arguments
|
||||||
# 1. root of gonito.net challenge-like filestructure
|
# 1. root of gonito.net challenge-like filestructure
|
||||||
# 2. suffix of source sentences, E.g. en for files like train.en
|
# 2. number of BPE merge operations, E.g. 32000
|
||||||
# 3. suffix of target sentences, E.g. pl for files like train.pl
|
# 3. expected number of train epochs
|
||||||
# 4. number of BPE merge operations, E.g. 32000
|
|
||||||
# 5. expected number of train epochs
|
|
||||||
|
|
||||||
|
|
||||||
corpus_path="$1"
|
corpus_path="$1"
|
||||||
source_suffix="$2"
|
bpe_merges="$2"
|
||||||
target_suffix="$3"
|
epochs="$3"
|
||||||
bpe_merges="$4"
|
|
||||||
epochs="$5"
|
|
||||||
|
|
||||||
./scripts/do-fastBPE.sh "$corpus_path" "$source_suffix" "$target_suffix" "$bpe_merges"
|
./scripts/do-fastBPE.sh "$corpus_path" "$bpe_merges"
|
||||||
./scripts/marian-train.sh "$corpus_path" "$source_suffix" "$target_suffix" "$bpe_merges" "$epochs"
|
./scripts/marian-train.sh "$corpus_path" "$bpe_merges" "$epochs"
|
Loading…
Reference in New Issue
Block a user