transfix-train/scripts/do-fastBPE.sh

39 lines
1.6 KiB
Bash
Raw Normal View History

2022-01-14 18:45:36 +01:00
#!/bin/bash
#
# Learn BPE codes with fastBPE from the source and target training files of a
# corpus, apply them to the train and test files, and extract the train
# vocabularies.
#
# Arguments:
#   1. root of a gonito.net challenge-like file structure
#   2. suffix of source sentences, e.g. en for files like train.en
#   3. suffix of target sentences, e.g. pl for files like train.pl
#   4. number of BPE merge operations, e.g. 32000
#
# Environment:
#   FASTBPE_DIR  directory containing the fastBPE `fast` binary
#                (optional, defaults to ~/fastBPE)
#
# Usage example: ./do-fastBPE.sh ~/myCorpora en pl 32000
#
# Files written (<root> = argument 1, <N> = argument 4):
#   <root>/codes                         learned BPE codes
#   <root>/train/train.<suffix>.<N>      BPE-encoded train files
#   <root>/train/train.<suffix>.vocab.<N> vocabulary of the encoded train files
#   <root>/test/test.<suffix>.<N>        BPE-encoded test files

# Stop at the first failing step so later steps never run on missing or
# partial output of an earlier one.
set -euo pipefail

# Print an error message to stderr and abort.
die() {
  printf '%s: %s\n' "${0##*/}" "$*" >&2
  exit 1
}

# Print a short usage line to stderr and abort.
usage() {
  printf 'Usage: %s <corpus_root> <source_suffix> <target_suffix> <bpe_merges>\n' \
    "${0##*/}" >&2
  exit 2
}

# Extra arguments are tolerated (and ignored), as before.
(( $# >= 4 )) || usage

# path to corpus
corpus_path="$1"
# paths to the files with source sentences
source_suffix="$2"
source_path_train="$corpus_path"/train/train."$source_suffix"
source_path_test="$corpus_path"/test/test."$source_suffix"
# paths to the files with target sentences
target_suffix="$3"
target_path_train="$corpus_path"/train/train."$target_suffix"
target_path_test="$corpus_path"/test/test."$target_suffix"
# number of BPE merge operations, e.g. 32000
bpe_merges="$4"
# file the learned BPE codes are written to
codes="$corpus_path"/codes

[[ "$bpe_merges" =~ ^0*[1-9][0-9]*$ ]] \
  || die "number of BPE merge operations must be a positive integer, got '$bpe_merges'"

# Locate the fastBPE binary. It is invoked by path instead of via `cd`, so the
# working directory is left alone and a relative corpus root keeps working.
fastbpe_dir="${FASTBPE_DIR:-}"
[[ -n "$fastbpe_dir" ]] || fastbpe_dir=~/fastBPE
fast="$fastbpe_dir"/fast
[[ -x "$fast" ]] || die "fastBPE binary not found or not executable: $fast"

# Check every input up front so a missing test file is reported before the
# learning step has been run.
for input_file in \
  "$source_path_train" "$target_path_train" \
  "$source_path_test" "$target_path_test"; do
  [[ -r "$input_file" ]] || die "cannot read input file: $input_file"
done

# BPE-encoded outputs and train vocabularies
source_bpe_train="$source_path_train"."$bpe_merges"
target_bpe_train="$target_path_train"."$bpe_merges"
source_bpe_test="$source_path_test"."$bpe_merges"
target_bpe_test="$target_path_test"."$bpe_merges"
source_vocab="$source_path_train".vocab."$bpe_merges"
target_vocab="$target_path_train".vocab."$bpe_merges"

# learn BPE codes from both training files at once
"$fast" learnbpe "$bpe_merges" "$source_path_train" "$target_path_train" > "$codes"

# apply codes to train
"$fast" applybpe "$source_bpe_train" "$source_path_train" "$codes"
"$fast" applybpe "$target_bpe_train" "$target_path_train" "$codes"

# get train vocabulary
"$fast" getvocab "$source_bpe_train" > "$source_vocab"
"$fast" getvocab "$target_bpe_train" > "$target_vocab"

# apply codes to test, passing the matching train vocabulary as the optional
# last argument of applybpe
"$fast" applybpe "$source_bpe_test" "$source_path_test" "$codes" "$source_vocab"
"$fast" applybpe "$target_bpe_test" "$target_path_test" "$codes" "$target_vocab"