#!/bin/bash
#
# Learn a joint BPE model on the train split of a parallel corpus with
# fastBPE, then apply it to the train and test splits.
#
# Arguments:
#   1. root of a gonito.net challenge-like file structure; it must contain
#      train/train.<suffix> and test/test.<suffix> for both suffixes
#   2. suffix of the source sentences, e.g. "en" for files like train.en
#   3. suffix of the target sentences, e.g. "pl" for files like train.pl
#   4. number of BPE merge operations, e.g. 32000
#
# Environment:
#   FASTBPE_DIR  directory containing the fastBPE "fast" binary
#                (default: ~/fastBPE)
#
# Files written (N = number of merge operations):
#   <root>/codes                       learned BPE codes
#   <root>/train/train.<suffix>.N      BPE-encoded train files
#   <root>/train/train.<suffix>.vocab.N  vocabulary of the encoded train files
#   <root>/test/test.<suffix>.N        BPE-encoded test files
#
# Usage example:
#   ./do-fastBPE.sh ~/myCorpora en pl 32000

set -euo pipefail

# Print a message to stderr and abort with a failure status.
die() {
  printf '%s\n' "$*" >&2
  exit 1
}

# Print the usage line to stderr and abort with the conventional status 2.
usage() {
  printf 'Usage: %s <corpus_root> <source_suffix> <target_suffix> <bpe_merges>\n' \
    "${0##*/}" >&2
  exit 2
}

main() {
  (( $# == 4 )) || usage

  local -r corpus_path=$1
  local -r source_suffix=$2
  local -r target_suffix=$3
  local -r bpe_merges=$4

  [[ -d "$corpus_path" ]] || die "corpus root is not a directory: $corpus_path"
  [[ "$bpe_merges" =~ ^[1-9][0-9]*$ ]] \
    || die "number of BPE merges must be a positive integer: $bpe_merges"

  # The binary is invoked by its full path instead of cd-ing into its
  # directory, so a relative corpus root keeps resolving correctly.
  local -r fast_bin="${FASTBPE_DIR:-$HOME/fastBPE}/fast"
  [[ -x "$fast_bin" ]] || die "fastBPE binary not found or not executable: $fast_bin"

  # Input files with source sentences.
  local -r source_path_train="$corpus_path/train/train.$source_suffix"
  local -r source_path_test="$corpus_path/test/test.$source_suffix"

  # Input files with target sentences.
  local -r target_path_train="$corpus_path/train/train.$target_suffix"
  local -r target_path_test="$corpus_path/test/test.$target_suffix"

  # Fail before doing any work if an input is missing.
  local input
  for input in "$source_path_train" "$source_path_test" \
    "$target_path_train" "$target_path_test"; do
    [[ -f "$input" ]] || die "input file not found: $input"
  done

  # File that receives the learned BPE codes.
  local -r codes="$corpus_path/codes"

  # Learn one set of codes from both train files together.
  "$fast_bin" learnbpe "$bpe_merges" "$source_path_train" "$target_path_train" \
    > "$codes"

  # Apply the codes to the train files.
  "$fast_bin" applybpe "$source_path_train.$bpe_merges" "$source_path_train" "$codes"
  "$fast_bin" applybpe "$target_path_train.$bpe_merges" "$target_path_train" "$codes"

  # Extract one vocabulary per language from the encoded train files.
  "$fast_bin" getvocab "$source_path_train.$bpe_merges" \
    > "$source_path_train.vocab.$bpe_merges"
  "$fast_bin" getvocab "$target_path_train.$bpe_merges" \
    > "$target_path_train.vocab.$bpe_merges"

  # Apply the codes to the test files, passing the matching train vocabulary
  # to applybpe.
  "$fast_bin" applybpe "$source_path_test.$bpe_merges" "$source_path_test" \
    "$codes" "$source_path_train.vocab.$bpe_merges"
  "$fast_bin" applybpe "$target_path_test.$bpe_merges" "$target_path_test" \
    "$codes" "$target_path_train.vocab.$bpe_merges"
}

main "$@"