transfix-train/scripts/do-fastBPE.sh
2022-01-15 20:10:06 +01:00

44 lines
1.6 KiB
Bash
Executable File

#!/bin/bash
#
# Prepares a gonito.net challenge-like directory with fastBPE: wraps every
# line of the train/test-A files in <s>...</s>, learns BPE codes on the
# training files, applies them to the training files and writes one
# vocabulary .yml file per training file.
#
# arguments
# 1. root of gonito.net challenge-like file structure
# 2. number of BPE merge operations, e.g. 32000
# Usage example: ./do-fastBPE.sh ~/sample-gonito-challenge 32000

# Both arguments are mandatory: without them every path below would be
# rooted at "/" and the merge count would be empty.
if [[ $# -lt 2 ]]; then
  printf 'Usage: %s <challenge-root> <bpe-merges>\n' "${0##*/}" >&2
  exit 2
fi
if [[ ! "$2" =~ ^[0-9]+$ ]]; then
  printf '%s: number of BPE merge operations must be a non-negative integer, got: %s\n' \
    "${0##*/}" "$2" >&2
  exit 2
fi

# path to corpus, resolved to an absolute path: the script later changes into
# the fastBPE directory, which would silently invalidate a relative path
corpus_path=$(CDPATH='' cd -- "$1" && pwd) || {
  printf '%s: cannot access challenge root: %s\n' "${0##*/}" "$1" >&2
  exit 1
}

# absolute paths to the files with source sentences
source_path_train="$corpus_path"/train/in.tsv
source_path_test="$corpus_path"/test-A/in.tsv
# absolute paths to the files with target sentences
target_path_train="$corpus_path"/train/expected.tsv
target_path_test="$corpus_path"/test-A/expected.tsv
# number of BPE merge operations, e.g. 32000
bpe_merges="$2"
# file the learned BPE codes are written to
codes="$corpus_path"/codes
# Wrap every line of each corpus file in sentence markers, editing the file
# in place: "<s>" is inserted at the start and "</s>" appended at the end.
# NOTE: this is not idempotent - running the script again on the same corpus
# adds another pair of markers to every line.
for file in "$source_path_train" "$source_path_test" "$target_path_train" "$target_path_test"
do
  # Skip files that are not there (e.g. a challenge without test-A
  # expectations) rather than letting ex operate on a nonexistent file.
  if [[ ! -f "$file" ]]; then
    printf '%s: skipping missing file: %s\n' "${0##*/}" "$file" >&2
    continue
  fi
  ex -sc '%s/^/<s>/|%s/$/<\/s>/|x' "$file"
done
# Directory holding the fastBPE binary `fast`. Defaults to ~/fastBPE; set the
# FASTBPE_DIR environment variable to use a checkout located elsewhere.
fastbpe_dir=~/fastBPE
if [[ -n "${FASTBPE_DIR:-}" ]]; then
  fastbpe_dir=$FASTBPE_DIR
fi
# From here on the working directory is the fastBPE directory, so every
# corpus path used below has to be absolute.
cd "$fastbpe_dir" || {
  printf '%s: cannot enter fastBPE directory: %s\n' "${0##*/}" "$fastbpe_dir" >&2
  exit 1
}

# learn BPE: joint codes over the source and target training files
# Abort on failure: the redirection has already truncated the codes file, and
# applying an empty or partial codes file would produce useless output.
./fast learnbpe "$bpe_merges" "$source_path_train" "$target_path_train" > "$codes" || {
  printf '%s: learnbpe failed, codes file is incomplete: %s\n' "${0##*/}" "$codes" >&2
  exit 1
}

# apply codes to train; output goes to <input>.<number of merges>
./fast applybpe "$source_path_train"."$bpe_merges" "$source_path_train" "$codes" || {
  printf '%s: applybpe failed for: %s\n' "${0##*/}" "$source_path_train" >&2
  exit 1
}
./fast applybpe "$target_path_train"."$bpe_merges" "$target_path_train" "$codes" || {
  printf '%s: applybpe failed for: %s\n' "${0##*/}" "$target_path_train" >&2
  exit 1
}
# get train vocabulary and .yml'ify it
# For each BPE-encoded training file, turn the space-separated
# "<token> <count>" lines printed by `fast getvocab` into "<token>:<TAB><count>"
# lines stored in <file>.vocab.<number of merges>.yml.
# The conversion is a single awk pass. Splitting the stream with
# `tee >( ... )` into temporary files is racy, because bash does not wait for
# a process substitution to finish before running the next command, and it
# also leaves fixed-name temp files in the working directory.
for file in "$source_path_train" "$target_path_train"
do
  vocab_yml="$file".vocab."$bpe_merges".yml
  # -F '[ ]' splits on exactly one space, matching `cut -d " "` semantics
  ./fast getvocab "$file"."$bpe_merges" \
    | awk -F '[ ]' '{ printf "%s:\t%s\n", $1, $2 }' > "$vocab_yml"
  # PIPESTATUS must be read right after the pipeline; index 0 is getvocab.
  if [[ ${PIPESTATUS[0]} -ne 0 ]]; then
    printf '%s: getvocab failed for: %s\n' "${0##*/}" "$file"."$bpe_merges" >&2
    exit 1
  fi
done
## apply codes to test (currently disabled)
# NOTE(review): the vocabulary paths passed below lack the ".yml" suffix of
# the vocabulary files this script writes for the training data - confirm
# which vocabulary file (and which format) applybpe should receive before
# re-enabling these commands.
#./fast applybpe "$source_path_test"."$bpe_merges" "$source_path_test" "$codes" "$source_path_train".vocab."$bpe_merges"
#./fast applybpe "$target_path_test"."$bpe_merges" "$target_path_test" "$codes" "$target_path_train".vocab."$bpe_merges"
# Go back to the home directory. This only changes the working directory of
# the shell running the script; an invoking shell is affected only if the
# script is sourced instead of executed.
cd ~