#!/bin/bash
#
# Learn BPE codes with fastBPE on the training split of a gonito.net
# challenge-like corpus, apply them to the training files and write a
# YAML-style vocabulary file for each training file.
#
# Arguments:
#   1. root of gonito.net challenge-like file structure
#   2. number of BPE merge operations, e.g. 32000
#
# Environment:
#   FASTBPE_DIR  directory containing the `fast` binary (default: ~/fastBPE)
#
# Usage example:
#   ./do-fastBPE.sh ~/sample-gonito-challenge 32000
#
# NOTE: the input .tsv files are modified IN PLACE (an end-of-sentence tag is
# appended to every line), so running the script twice on the same corpus
# appends the tag twice.

set -euo pipefail

# Print a message to stderr and abort.
die() {
  printf '%s\n' "$*" >&2
  exit 1
}

if (( $# < 2 )); then
  printf 'Usage: %s <corpus-root> <bpe-merges>\n' "${0##*/}" >&2
  exit 2
fi

[[ -d "$1" ]] || die "corpus root is not a directory: $1"
[[ "$2" =~ ^[0-9]+$ ]] || die "number of BPE merges must be a non-negative integer: $2"

# Path to corpus. Resolved to an absolute path because the script changes
# into the fastBPE directory below, which would break a relative path.
corpus_path=$(cd -- "$1" && pwd) || die "cannot resolve corpus root: $1"

# absolute paths to the files with source sentences
source_path_train="$corpus_path/train/in.tsv"
source_path_test="$corpus_path/test-A/in.tsv"

# absolute paths to the files with target sentences
target_path_train="$corpus_path/train/expected.tsv"
target_path_test="$corpus_path/test-A/expected.tsv"

# number of BPE merge operations, e.g. 32000
bpe_merges="$2"

# file the learned BPE codes are written to
codes="$corpus_path/codes"

# location of the fastBPE checkout holding the `fast` binary
fastbpe_dir="${FASTBPE_DIR:-$HOME/fastBPE}"

# The training files are required by every later step.
for file in "$source_path_train" "$target_path_train"; do
  [[ -f "$file" ]] || die "missing training file: $file"
done

# Scratch directory for intermediate vocabulary files; removed on any exit.
tmp_dir=$(mktemp -d) || die "mktemp failed"
trap 'rm -rf -- "$tmp_dir"' EXIT

# Append the end-of-sentence tag </s> to every line, editing in place.
# NOTE(review): the first substitution, %s/^//, replaces the empty
# start-of-line match with nothing and is therefore a no-op; possibly a
# start-of-sentence tag was lost from it at some point -- confirm.
for file in "$source_path_train" "$source_path_test" \
            "$target_path_train" "$target_path_test"; do
  if [[ ! -f "$file" ]]; then
    # Test files are not used by the steps below; do not let ex create them.
    printf 'warning: skipping missing file: %s\n' "$file" >&2
    continue
  fi
  ex -sc '%s/^//|%s/$/<\/s>/|x' "$file"
done

cd -- "$fastbpe_dir" || die "cannot cd to fastBPE directory: $fastbpe_dir"
[[ -x ./fast ]] || die "fastBPE binary not found or not executable: $fastbpe_dir/fast"

# learn BPE
./fast learnbpe "$bpe_merges" "$source_path_train" "$target_path_train" > "$codes"

# apply codes to train
./fast applybpe "$source_path_train.$bpe_merges" "$source_path_train" "$codes"
./fast applybpe "$target_path_train.$bpe_merges" "$target_path_train" "$codes"

# get train vocabulary and .yml'ify it
for file in "$source_path_train" "$target_path_train"; do
  # Capture the vocabulary first and split it in sequential steps; splitting
  # via an asynchronous process substitution could leave a half-written
  # column file when the next command reads it.
  ./fast getvocab "$file.$bpe_merges" > "$tmp_dir/vocab"

  # First space-separated column with ':' appended, second column unchanged.
  cut -f 1 -d ' ' "$tmp_dir/vocab" | sed 's/$/:/' > "$tmp_dir/col1"
  cut -f 2 -d ' ' "$tmp_dir/vocab" > "$tmp_dir/col2"

  # Join the two columns (tab-separated) into "<col1>:<TAB><col2>" lines.
  paste "$tmp_dir/col1" "$tmp_dir/col2" > "$file.vocab.$bpe_merges.yml"
done

## apply codes to test
#./fast applybpe "$source_path_test"."$bpe_merges" "$source_path_test" "$codes" "$source_path_train".vocab."$bpe_merges"
#./fast applybpe "$target_path_test"."$bpe_merges" "$target_path_test" "$codes" "$target_path_train".vocab."$bpe_merges"

# Only has a visible effect if the script is sourced rather than executed.
cd ~