commit 86603d4abd22c3bafe1de83f6838580519e01d54
Author: jakubknczny
Date:   Fri Jan 14 18:45:36 2022 +0100

    init

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..723ef36
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1 @@
+.idea
\ No newline at end of file
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..e69de29
diff --git a/scripts/do-fastBPE.sh b/scripts/do-fastBPE.sh
new file mode 100755
index 0000000..52c14cc
--- /dev/null
+++ b/scripts/do-fastBPE.sh
@@ -0,0 +1,53 @@
+#!/bin/bash
+
+# Learn joint BPE codes with fastBPE on the training files and apply them
+# to the train and test files of a parallel corpus.
+#
+# arguments
+# 1. root of gonito.net challenge-like filestructure
+# 2. suffix of source sentences, E.g. en for files like train.en
+# 3. suffix of target sentences, E.g. pl for files like train.pl
+# 4. number of BPE merge operations, E.g. 32000
+# Usage example ./do-fastBPE.sh ~/myCorpora en pl 32000
+
+set -euo pipefail
+
+if [[ $# -ne 4 ]]; then
+  echo "Usage: $0 CORPUS_ROOT SOURCE_SUFFIX TARGET_SUFFIX BPE_MERGES" >&2
+  exit 2
+fi
+
+# path to corpus
+corpus_path="$1"
+# paths to the files with source sentences
+source_suffix="$2"
+source_path_train="$corpus_path"/train/train."$source_suffix"
+source_path_test="$corpus_path"/test/test."$source_suffix"
+# paths to the files with target sentences
+target_suffix="$3"
+target_path_train="$corpus_path"/train/train."$target_suffix"
+target_path_test="$corpus_path"/test/test."$target_suffix"
+# number of BPE merge operations, E.g. 32000
+bpe_merges="$4"
+# file the learned BPE codes are written to
+codes="$corpus_path"/codes
+
+# fastBPE binary; fastBPE-setup.sh builds it in ~/fastBPE. It is called by
+# its full path (no cd), so a relative corpus path keeps working.
+fast="$HOME"/fastBPE/fast
+if [[ ! -x "$fast" ]]; then
+  echo "fastBPE binary not found: $fast (run scripts/fastBPE-setup.sh first)" >&2
+  exit 1
+fi
+
+# learn BPE
+"$fast" learnbpe "$bpe_merges" "$source_path_train" "$target_path_train" > "$codes"
+# apply codes to train
+"$fast" applybpe "$source_path_train"."$bpe_merges" "$source_path_train" "$codes"
+"$fast" applybpe "$target_path_train"."$bpe_merges" "$target_path_train" "$codes"
+# get train vocabulary
+"$fast" getvocab "$source_path_train"."$bpe_merges" > "$source_path_train".vocab."$bpe_merges"
+"$fast" getvocab "$target_path_train"."$bpe_merges" > "$target_path_train".vocab."$bpe_merges"
+# apply codes to test
+"$fast" applybpe "$source_path_test"."$bpe_merges" "$source_path_test" "$codes" "$source_path_train".vocab."$bpe_merges"
+"$fast" applybpe "$target_path_test"."$bpe_merges" "$target_path_test" "$codes" "$target_path_train".vocab."$bpe_merges"
diff --git a/scripts/fastBPE-setup.sh b/scripts/fastBPE-setup.sh
new file mode 100644
index 0000000..84c5ab7
--- /dev/null
+++ b/scripts/fastBPE-setup.sh
@@ -0,0 +1,12 @@
+#!/bin/bash
+
+# Clone fastBPE into ~/fastBPE and compile its `fast` binary there.
+
+# stop at the first failing step instead of compiling in the wrong directory
+set -e
+
+cd ~
+# skip the clone when a previous run already created the checkout
+[[ -d fastBPE ]] || git clone https://github.com/glample/fastBPE.git
+cd fastBPE
+g++ -std=c++11 -pthread -O3 fastBPE/main.cc -IfastBPE -o fast
diff --git a/scripts/marian-setup.sh b/scripts/marian-setup.sh
new file mode 100644
index 0000000..7a9342d
--- /dev/null
+++ b/scripts/marian-setup.sh
@@ -0,0 +1,17 @@
+#!/bin/bash
+
+# Install the MarianMT build dependencies, clone Marian into ./marian
+# (relative to the current directory) and build it in ./marian/build.
+
+# stop at the first failing step instead of building in the wrong directory
+set -e
+
+# install MarianMT pre-requirements
+# -y answers the confirmation prompt; it replaces the deprecated --force-yes
+apt-get install -y git cmake build-essential libboost-system-dev libprotobuf17 protobuf-compiler libprotobuf-dev openssl libssl-dev libgoogle-perftools-dev
+# skip the clone when a previous run already created the checkout
+[[ -d marian ]] || git clone https://github.com/marian-nmt/marian
+mkdir -p marian/build
+cd marian/build
+cmake ..
+make -j4
diff --git a/scripts/marian-train.sh b/scripts/marian-train.sh
new file mode 100644
index 0000000..c655794
--- /dev/null
+++ b/scripts/marian-train.sh
@@ -0,0 +1,74 @@
+#!/bin/bash
+
+# Train a Marian transformer model; the model and the training log are
+# written to ./model.
+#
+# arguments
+# 1. root of gonito.net challenge-like filestructure
+# 2. suffix of source sentences, E.g. en for files like train.en
+# 3. suffix of target sentences, E.g. pl for files like train.pl
+# 4. number of BPE merge operations, E.g. 32000
+# 5. expected number of train epochs
+
+set -euo pipefail
+
+if [[ $# -ne 5 ]]; then
+  echo "Usage: $0 CORPUS_ROOT SOURCE_SUFFIX TARGET_SUFFIX BPE_MERGES EPOCHS" >&2
+  exit 2
+fi
+
+# path to corpus
+corpus_path="$1"
+source_suffix="$2"
+target_suffix="$3"
+bpe_merges="$4"
+epochs="$5"
+
+# NOTE(review): the train sets are the raw train.SUFFIX files while the
+# vocabularies come from the BPE-encoded train.SUFFIX.BPE_MERGES files
+# written by do-fastBPE.sh - confirm that this is intended.
+source_file="$corpus_path"/train/train."$source_suffix"
+source_vocab="$source_file".vocab."$bpe_merges"
+
+target_file="$corpus_path"/train/train."$target_suffix"
+target_vocab="$target_file".vocab."$bpe_merges"
+
+# --model and --log point into ./model, so make sure the directory exists
+mkdir -p model
+
+# The options are collected in an array: unlike a backslash-continued
+# command line, an array can hold commented-out entries without ending
+# the command early.
+marian_args=(
+  --model model/model.npz --type transformer
+  --overwrite
+  --train-sets "$source_file" "$target_file"
+  --max-length 100
+  --vocabs "$source_vocab" "$target_vocab"
+  #--vocabs model/vocab.ende.yml model/vocab.ende.yml
+  --mini-batch-fit -w 10000 --maxi-batch 1000
+  --after-epochs "$epochs"
+  #--early-stopping 10
+  --valid-freq 5000
+  --save-freq 5000
+  --disp-freq 500
+  --beam-size 6 --normalize 0.6
+  --enc-depth 6 --dec-depth 6
+  --transformer-heads 8
+  --transformer-postprocess-emb d
+  --transformer-postprocess dan
+  --transformer-dropout 0.1 --label-smoothing 0.1
+  --learn-rate 0.0003 --lr-warmup 16000 --lr-decay-inv-sqrt 16000 --lr-report
+  --optimizer-params 0.9 0.98 1e-09 --clip-norm 5
+  --tied-embeddings-all
+  --exponential-smoothing
+  --log model/train.log
+  #--valid-log model/valid.log
+  #--valid-metrics cross-entropy perplexity translation
+  #--valid-sets data/valid.bpe.en data/valid.bpe.de
+  #--valid-script-path ./scripts/validate.sh
+  #--valid-translation-output data/valid.bpe.en.output --quiet-translation
+  #--valid-mini-batch 64
+)
+
+./marian/build/marian "${marian_args[@]}"
diff --git a/setup.sh b/setup.sh
new file mode 100755
index 0000000..0803179
--- /dev/null
+++ b/setup.sh
@@ -0,0 +1,8 @@
+#!/bin/bash
+
+# Build Marian and fastBPE; run it from the repository root.
+# The setup scripts are committed without the executable bit (mode 100644),
+# so they are started through bash instead of being executed directly.
+
+bash ./scripts/marian-setup.sh
+bash ./scripts/fastBPE-setup.sh
diff --git a/train.sh b/train.sh
new file mode 100644
index 0000000..10216ee
--- /dev/null
+++ b/train.sh
@@ -0,0 +1,30 @@
+#!/bin/bash
+
+# BPE-encode the corpus and train the Marian model; run it from the
+# repository root.
+#
+# arguments
+# 1. root of gonito.net challenge-like filestructure
+# 2. suffix of source sentences, E.g. en for files like train.en
+# 3. suffix of target sentences, E.g. pl for files like train.pl
+# 4. number of BPE merge operations, E.g. 32000
+# 5. expected number of train epochs
+
+# do not start the training when the BPE step failed
+set -e
+
+if [[ $# -ne 5 ]]; then
+  echo "Usage: $0 CORPUS_ROOT SOURCE_SUFFIX TARGET_SUFFIX BPE_MERGES EPOCHS" >&2
+  exit 2
+fi
+
+corpus_path="$1"
+source_suffix="$2"
+target_suffix="$3"
+bpe_merges="$4"
+epochs="$5"
+
+# the scripts live in scripts/ and marian-train.sh is committed without the
+# executable bit, so both are started through bash
+bash ./scripts/do-fastBPE.sh "$corpus_path" "$source_suffix" "$target_suffix" "$bpe_merges"
+bash ./scripts/marian-train.sh "$corpus_path" "$source_suffix" "$target_suffix" "$bpe_merges" "$epochs"