transfix-train/scripts/do-fastBPE.sh

44 lines
1.6 KiB
Bash
Raw Permalink Normal View History

2022-01-14 18:45:36 +01:00
#!/bin/bash
# arguments
# 1. root of gonito.net challenge-like filestructure
2022-01-15 18:07:27 +01:00
# 2. number of BPE merge operations, e.g. 32000
# Usage example: ./do-fastBPE.sh ~/sample-gonito-challenge 32000
2022-01-14 18:45:36 +01:00
# challenge root directory, taken from the first positional argument
corpus_path=${1}
# absolute paths to the files with source sentences
2022-01-15 18:07:27 +01:00
# source side: training input and test-A input
source_path_train="${corpus_path}/train/in.tsv"
source_path_test="${corpus_path}/test-A/in.tsv"
2022-01-14 18:45:36 +01:00
# absolute paths to the files with target sentences
2022-01-15 18:07:27 +01:00
# target side: training references and test-A references
target_path_train="${corpus_path}/train/expected.tsv"
target_path_test="${corpus_path}/test-A/expected.tsv"
2022-01-14 18:45:36 +01:00
# number of BPE merge operations, e.g. 32000
2022-01-15 18:07:27 +01:00
# merge-operation count comes from the second positional argument
bpe_merges=${2}
# path of the file that receives the learned BPE codes
2022-01-14 18:45:36 +01:00
# the codes file lives directly under the challenge root
codes="${corpus_path}/codes"
2022-01-15 18:07:27 +01:00
#######################################
# Wrap every line of a sentence file in <s> ... </s>, editing it in place.
# Arguments: $1 - path of the file to edit
# Outputs:   a warning on stderr when the file is skipped
# Returns:   1 (without touching the filesystem) when $1 is not a regular
#            file; otherwise the exit status of ex.
# NOTE: the substitutions are unconditional, so running this twice on the
# same file wraps each line twice.
#######################################
add_sentence_markers() {
  local sentence_file=$1
  if [[ ! -f "$sentence_file" ]]; then
    # Without this guard ex is handed a path that does not exist; it
    # typically starts from an empty buffer, and the substitutions plus `x`
    # can then leave behind a bogus one-line file.
    printf 'do-fastBPE: skipping missing file: %s\n' "$sentence_file" >&2
    return 1
  fi
  # prefix each line with <s>, suffix it with </s>, then write and quit
  ex -sc '%s/^/<s>/|%s/$/<\/s>/|x' "$sentence_file"
}

# mark sentence boundaries in all four corpus files
for file in "$source_path_train" "$source_path_test" "$target_path_train" "$target_path_test"
do
  add_sentence_markers "$file"
done
2022-01-14 18:45:36 +01:00
# The ./fast binary is invoked relative to ~/fastBPE. Abort if the directory
# cannot be entered: otherwise every ./fast call below would run from the
# wrong place while the redirection still creates/truncates "$codes".
cd ~/fastBPE || exit 1
# learn BPE codes from both training files and store them in "$codes";
# stop on failure because the apply steps below depend on that file
./fast learnbpe "$bpe_merges" "$source_path_train" "$target_path_train" > "$codes" || exit 1
# apply codes to train; output goes to <input>.<number of merges>
./fast applybpe "$source_path_train"."$bpe_merges" "$source_path_train" "$codes"
./fast applybpe "$target_path_train"."$bpe_merges" "$target_path_train" "$codes"
2022-01-15 18:07:27 +01:00
# get train vocabulary and .yml'ify it
2022-01-15 20:10:06 +01:00
#######################################
# Turn "token count" lines (split on a single space) into "token:<TAB>count".
# Reads stdin, writes stdout.
# A single awk pass replaces the earlier tee/cut/ex/paste chain: that chain
# wrote temp1 from an asynchronous process substitution, which bash does not
# wait for, so temp1 could be read before it was complete. It also relied on
# fixed temp file names in the current directory.
#######################################
vocab_to_yml() {
  # '[ ]' forces splitting on exactly one space, like `cut -d " "`
  awk -F '[ ]' '{ printf "%s:\t%s\n", $1, $2 }'
}

# build <file>.vocab.<merges>.yml from the BPE-encoded source and target
# training files
for file in "$source_path_train" "$target_path_train"
do
  ./fast getvocab "$file"."$bpe_merges" | vocab_to_yml > "$file".vocab."$bpe_merges".yml
done
2022-01-15 18:07:27 +01:00
## apply codes to test
#./fast applybpe "$source_path_test"."$bpe_merges" "$source_path_test" "$codes" "$source_path_train".vocab."$bpe_merges"
#./fast applybpe "$target_path_test"."$bpe_merges" "$target_path_test" "$codes" "$target_path_train".vocab."$bpe_merges"
2022-01-14 18:45:36 +01:00
# finish in the invoking user's home directory
cd -- ~