init
This commit is contained in:
commit
86603d4abd
1
.gitignore
vendored
Normal file
1
.gitignore
vendored
Normal file
@ -0,0 +1 @@
|
||||
.idea
|
38
scripts/do-fastBPE.sh
Executable file
38
scripts/do-fastBPE.sh
Executable file
@ -0,0 +1,38 @@
|
||||
#!/bin/bash
#
# Learn BPE codes with fastBPE from the source and target training files of a
# parallel corpus, then apply them to the train and test files.
#
# Arguments:
#   1. root of gonito.net challenge-like file structure
#   2. suffix of source sentences, e.g. en for files like train.en
#   3. suffix of target sentences, e.g. pl for files like train.pl
#   4. number of BPE merge operations, e.g. 32000
#
# Environment:
#   FASTBPE_BIN  fastBPE executable (default: ~/fastBPE/fast)
#
# Files written:
#   <root>/codes
#   <root>/train/train.<suffix>.<merges>
#   <root>/train/train.<suffix>.vocab.<merges>
#   <root>/test/test.<suffix>.<merges>
#
# Usage example: ./do-fastBPE.sh ~/myCorpora en pl 32000

set -euo pipefail

die() {
  printf '%s\n' "$*" >&2
  exit 1
}

if (( $# < 4 )); then
  printf 'Usage: %s <corpus_root> <source_suffix> <target_suffix> <bpe_merges>\n' \
    "${0##*/}" >&2
  exit 2
fi

# path to corpus
corpus_path="$1"

# paths to the files with source sentences
source_suffix="$2"
source_path_train="$corpus_path"/train/train."$source_suffix"
source_path_test="$corpus_path"/test/test."$source_suffix"

# paths to the files with target sentences
target_suffix="$3"
target_path_train="$corpus_path"/train/train."$target_suffix"
target_path_test="$corpus_path"/test/test."$target_suffix"

# number of BPE merge operations, e.g. 32000
bpe_merges="$4"
[[ "$bpe_merges" =~ ^[1-9][0-9]*$ ]] \
  || die "number of BPE merge operations must be a positive integer: $bpe_merges"

# file that receives the learned BPE codes
codes="$corpus_path"/codes

# The binary is called by path instead of after `cd ~/fastBPE`, so a relative
# corpus root keeps resolving against the caller's working directory and a
# missing fastBPE build is reported before any output file is touched.
fast="${FASTBPE_BIN:-$HOME/fastBPE/fast}"
[[ -x "$fast" ]] || die "fastBPE binary not found or not executable: $fast"

# Fail early: `> "$codes"` below would otherwise truncate an existing codes
# file even when an input is missing.
for input in "$source_path_train" "$target_path_train" \
             "$source_path_test" "$target_path_test"; do
  [[ -r "$input" ]] || die "cannot read input file: $input"
done

# learn BPE
"$fast" learnbpe "$bpe_merges" "$source_path_train" "$target_path_train" > "$codes"

# apply codes to train
"$fast" applybpe "$source_path_train"."$bpe_merges" "$source_path_train" "$codes"
"$fast" applybpe "$target_path_train"."$bpe_merges" "$target_path_train" "$codes"

# get train vocabulary
"$fast" getvocab "$source_path_train"."$bpe_merges" > "$source_path_train".vocab."$bpe_merges"
"$fast" getvocab "$target_path_train"."$bpe_merges" > "$target_path_train".vocab."$bpe_merges"

# apply codes to test, additionally passing the matching train vocabulary
"$fast" applybpe "$source_path_test"."$bpe_merges" "$source_path_test" "$codes" "$source_path_train".vocab."$bpe_merges"
"$fast" applybpe "$target_path_test"."$bpe_merges" "$target_path_test" "$codes" "$target_path_train".vocab."$bpe_merges"
|
7
scripts/fastBPE-setup.sh
Normal file
7
scripts/fastBPE-setup.sh
Normal file
@ -0,0 +1,7 @@
|
||||
#!/bin/bash
#
# Clone fastBPE into ~/fastBPE and compile the `fast` binary there.
# Safe to re-run: an existing checkout is reused and only rebuilt.

set -euo pipefail

fastbpe_dir="$HOME/fastBPE"

# Skip the clone when a checkout is already present; with `set -e` a failing
# `git clone` would otherwise abort every re-run.
if [[ ! -d "$fastbpe_dir/.git" ]]; then
  git clone https://github.com/glample/fastBPE.git "$fastbpe_dir"
fi

# Build inside a subshell so the directory change is checked (set -e) and
# does not leak out of the build step.
(
  cd "$fastbpe_dir"
  g++ -std=c++11 -pthread -O3 fastBPE/main.cc -IfastBPE -o fast
)
|
10
scripts/marian-setup.sh
Normal file
10
scripts/marian-setup.sh
Normal file
@ -0,0 +1,10 @@
|
||||
#!/bin/bash
#
# Install the MarianMT build prerequisites with apt-get, then clone and build
# Marian in ./marian (relative to the current working directory).
#
# apt-get needs sufficient privileges (e.g. run as root).

set -euo pipefail

# install MarianMT pre-requirements
# -y answers the confirmation prompt so the script can run unattended.
# The previously used --force-yes is deprecated (see apt-get(8)) and only
# bypasses safety checks.
# NOTE(review): libprotobuf17 is a release-specific package name - confirm it
# exists on the target distribution.
apt-get install -y \
  git cmake build-essential libboost-system-dev \
  libprotobuf17 protobuf-compiler libprotobuf-dev \
  openssl libssl-dev libgoogle-perftools-dev

# Reuse an existing checkout so the script can be re-run.
if [[ ! -d marian/.git ]]; then
  git clone https://github.com/marian-nmt/marian
fi

# -p: do not fail when the build directory is left over from an earlier run.
mkdir -p marian/build

# Build in a subshell: a failed `cd` aborts (set -e) instead of running
# cmake/make in the wrong directory.
(
  cd marian/build
  cmake ..
  make -j4
)
|
53
scripts/marian-train.sh
Normal file
53
scripts/marian-train.sh
Normal file
@ -0,0 +1,53 @@
|
||||
#!/bin/bash
#
# Train a Marian transformer model on the BPE-encoded training files produced
# by do-fastBPE.sh.
#
# Arguments:
#   1. root of gonito.net challenge-like file structure
#   2. suffix of source sentences, e.g. en for files like train.en
#   3. suffix of target sentences, e.g. pl for files like train.pl
#   4. number of BPE merge operations, e.g. 32000
#   5. expected number of train epochs
#
# Expects the Marian binary at ./marian/build/marian and writes the model and
# the training log to ./model (both relative to the working directory).

set -euo pipefail

if (( $# < 5 )); then
  printf 'Usage: %s <corpus_root> <source_suffix> <target_suffix> <bpe_merges> <epochs>\n' \
    "${0##*/}" >&2
  exit 2
fi

# path to corpus
corpus_path="$1"
source_suffix="$2"
target_suffix="$3"
bpe_merges="$4"
epochs="$5"

source_file="$corpus_path"/train/train."$source_suffix"
source_vocab="$source_file".vocab."$bpe_merges"

target_file="$corpus_path"/train/train."$target_suffix"
target_vocab="$target_file".vocab."$bpe_merges"

# Train on the BPE-encoded files (train.<suffix>.<merges>): the vocabularies
# above are extracted from those files, not from the raw text.
source_train="$source_file"."$bpe_merges"
target_train="$target_file"."$bpe_merges"

# The options are collected in an array instead of one backslash-continued
# command: a missing trailing backslash or a '#' comment line inside a
# continuation silently ends the command, and the remaining option lines are
# then executed as separate commands. Comments are fine inside an array.
marian_args=(
  --model model/model.npz --type transformer
  --overwrite
  --train-sets "$source_train" "$target_train"
  --max-length 100
  --vocabs "$source_vocab" "$target_vocab"
  #--vocabs model/vocab.ende.yml model/vocab.ende.yml
  --mini-batch-fit -w 10000 --maxi-batch 1000
  # Marian spells this option with a hyphen (--after-epochs).
  --after-epochs "$epochs"
  #--early-stopping 10
  --valid-freq 5000
  --save-freq 5000
  --disp-freq 500
  --beam-size 6 --normalize 0.6
  --enc-depth 6 --dec-depth 6
  --transformer-heads 8
  --transformer-postprocess-emb d
  --transformer-postprocess dan
  --transformer-dropout 0.1 --label-smoothing 0.1
  --learn-rate 0.0003 --lr-warmup 16000 --lr-decay-inv-sqrt 16000 --lr-report
  --optimizer-params 0.9 0.98 1e-09 --clip-norm 5
  --tied-embeddings-all
  --exponential-smoothing
  --log model/train.log
  #--valid-log model/valid.log
  #--valid-metrics cross-entropy perplexity translation
  #--valid-sets data/valid.bpe.en data/valid.bpe.de
  #--valid-script-path ./scripts/validate.sh
  #--valid-translation-output data/valid.bpe.en.output --quiet-translation
  #--valid-mini-batch 64
)

# The model and the log are written below model/; make sure it exists.
mkdir -p model

./marian/build/marian "${marian_args[@]}"
|
4
setup.sh
Executable file
4
setup.sh
Executable file
@ -0,0 +1,4 @@
|
||||
#!/bin/bash
#
# One-shot environment setup: build Marian, then build fastBPE.
# Marian is cloned into the current working directory, fastBPE into ~/fastBPE.

set -euo pipefail

# Locate the helper scripts relative to this file rather than the caller's
# working directory.
script_dir="$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" && pwd)"

# The helpers are committed without the executable bit, so run them through
# bash explicitly instead of relying on ./script.sh.
bash "$script_dir/scripts/marian-setup.sh"
bash "$script_dir/scripts/fastBPE-setup.sh"
|
18
train.sh
Normal file
18
train.sh
Normal file
@ -0,0 +1,18 @@
|
||||
#!/bin/bash
#
# Full training pipeline: BPE-encode the corpus with fastBPE, then train a
# Marian model on it.
#
# Arguments:
#   1. root of gonito.net challenge-like file structure
#   2. suffix of source sentences, e.g. en for files like train.en
#   3. suffix of target sentences, e.g. pl for files like train.pl
#   4. number of BPE merge operations, e.g. 32000
#   5. expected number of train epochs

set -euo pipefail

if (( $# < 5 )); then
  printf 'Usage: %s <corpus_root> <source_suffix> <target_suffix> <bpe_merges> <epochs>\n' \
    "${0##*/}" >&2
  exit 2
fi

corpus_path="$1"
source_suffix="$2"
target_suffix="$3"
bpe_merges="$4"
epochs="$5"

# Both helpers live in scripts/ next to this file; resolve them from this
# file's location and run them through bash, since marian-train.sh is
# committed without the executable bit.
script_dir="$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" && pwd)"

# With `set -e`, training is not started when the BPE step fails.
bash "$script_dir/scripts/do-fastBPE.sh" "$corpus_path" "$source_suffix" "$target_suffix" "$bpe_merges"
bash "$script_dir/scripts/marian-train.sh" "$corpus_path" "$source_suffix" "$target_suffix" "$bpe_merges" "$epochs"
|
Loading…
Reference in New Issue
Block a user