39 lines
1.6 KiB
Bash
Executable File
39 lines
1.6 KiB
Bash
Executable File
#!/bin/bash
#
# Learn a shared set of BPE codes on the source and target training files of
# a parallel corpus with fastBPE, then apply those codes to the train and
# test splits.
#
# Arguments:
#   1. root of gonito.net challenge-like file structure; must contain
#      train/train.<suffix> and test/test.<suffix> for both suffixes
#   2. suffix of source sentences, e.g. en for files like train.en
#   3. suffix of target sentences, e.g. pl for files like train.pl
#   4. number of BPE merge operations, e.g. 32000
#
# Environment:
#   FASTBPE_DIR  directory holding the fastBPE `fast` binary
#                (default: $HOME/fastBPE)
#
# Files written (N = number of merge operations):
#   <root>/codes
#   <root>/train/train.<suffix>.N        BPE-encoded training data
#   <root>/train/train.<suffix>.vocab.N  vocabulary of the encoded train data
#   <root>/test/test.<suffix>.N          BPE-encoded test data
#
# Usage example: ./do-fastBPE.sh ~/myCorpora en pl 32000

# Stop at the first failing step so later steps never consume a truncated
# codes or vocabulary file.
set -euo pipefail

# Print a message to stderr and abort.
die() {
  printf 'error: %s\n' "$*" >&2
  exit 1
}

# Print the expected command line to stderr and abort.
usage() {
  printf 'Usage: %s <corpus_root> <source_suffix> <target_suffix> <bpe_merges>\n' \
    "${0##*/}" >&2
  exit 2
}

main() {
  (( $# == 4 )) || usage

  # path to corpus
  local corpus_path="$1"

  # paths to the files with source sentences
  local source_suffix="$2"
  local source_path_train="$corpus_path/train/train.$source_suffix"
  local source_path_test="$corpus_path/test/test.$source_suffix"

  # paths to the files with target sentences
  local target_suffix="$3"
  local target_path_train="$corpus_path/train/train.$target_suffix"
  local target_path_test="$corpus_path/test/test.$target_suffix"

  # number of BPE merge operations, e.g. 32000
  local bpe_merges="$4"

  # file that receives the codes printed by `fast learnbpe`
  local codes="$corpus_path/codes"

  # The binary is invoked through its path instead of after a `cd`, so a
  # relative corpus root keeps resolving against the caller's directory.
  local fastbpe_dir="${FASTBPE_DIR:-$HOME/fastBPE}"
  local fast="$fastbpe_dir/fast"

  [[ "$bpe_merges" =~ ^[0-9]+$ ]] \
    || die "number of BPE merges must be a non-negative integer, got '$bpe_merges'"
  [[ -x "$fast" ]] || die "fastBPE binary not found or not executable: $fast"

  local input
  for input in "$source_path_train" "$target_path_train" \
               "$source_path_test" "$target_path_test"; do
    [[ -f "$input" ]] || die "missing input file: $input"
  done

  # learn BPE on source and target training data together
  "$fast" learnbpe "$bpe_merges" "$source_path_train" "$target_path_train" > "$codes"

  # apply codes to train
  "$fast" applybpe "$source_path_train.$bpe_merges" "$source_path_train" "$codes"
  "$fast" applybpe "$target_path_train.$bpe_merges" "$target_path_train" "$codes"

  # get train vocabulary
  "$fast" getvocab "$source_path_train.$bpe_merges" > "$source_path_train.vocab.$bpe_merges"
  "$fast" getvocab "$target_path_train.$bpe_merges" > "$target_path_train.vocab.$bpe_merges"

  # apply codes to test, passing the matching train vocabulary as well
  "$fast" applybpe "$source_path_test.$bpe_merges" "$source_path_test" "$codes" \
    "$source_path_train.vocab.$bpe_merges"
  "$fast" applybpe "$target_path_test.$bpe_merges" "$target_path_test" "$codes" \
    "$target_path_train.vocab.$bpe_merges"
}

main "$@"