2022-01-14 18:45:36 +01:00
|
|
|
#!/bin/bash
|
|
|
|
|
|
|
|
# arguments
|
|
|
|
# 1. root of gonito.net challenge-like filestructure
|
2022-01-15 18:07:27 +01:00
|
|
|
# 2. number of BPE merge operations, e.g. 32000
|
|
|
|
# Usage example: ./do-fastBPE.sh ~/sample-gonito-challenge 32000
|
2022-01-14 18:45:36 +01:00
|
|
|
|
|
|
|
# path to corpus
|
|
|
|
# Validate the command line before any file is touched: with a missing
# argument every derived path would silently point at the filesystem root.
if [ "$#" -lt 2 ]; then
  printf 'Usage: %s <challenge-root> <bpe-merges>\n' "${0##*/}" >&2
  exit 2
fi

# path to corpus, resolved to an absolute path. The script changes directory
# before invoking fastBPE, so a relative root would stop resolving at that point.
# CDPATH is cleared so that cd prints nothing into the command substitution.
if ! corpus_path=$(CDPATH= cd -- "$1" 2>/dev/null && pwd); then
  printf 'error: cannot access challenge root: %s\n' "$1" >&2
  exit 1
fi

# absolute paths to the files with source sentences
source_path_train="$corpus_path/train/in.tsv"
source_path_test="$corpus_path/test-A/in.tsv"

# absolute paths to the files with target sentences
target_path_train="$corpus_path/train/expected.tsv"
target_path_test="$corpus_path/test-A/expected.tsv"

# number of BPE merge operations, e.g. 32000; must be a positive integer
bpe_merges="$2"
case "$bpe_merges" in
  '' | 0 | *[!0-9]*)
    printf 'error: number of BPE merges must be a positive integer, got: %s\n' "$bpe_merges" >&2
    exit 2
    ;;
esac

# file that receives the learned BPE codes
codes="$corpus_path/codes"
|
|
|
|
|
2022-01-15 18:07:27 +01:00
|
|
|
# Wrap every line of the file given as $1 in <s> ... </s> sentence markers,
# editing the file in place with ex.
# Arguments: $1 - path of the file to modify
# Returns:   1 (and leaves the filesystem untouched) when $1 is not an
#            existing regular file; otherwise the exit status of ex.
# NOTE(review): without the existence check, Vim's ex appears to start from an
# empty buffer and write out a new file holding a single "<s></s>" line --
# confirm for other ex implementations.
add_sentence_markers() {
  if [ ! -f "$1" ]; then
    printf 'warning: %s not found, sentence markers not added\n' "$1" >&2
    return 1
  fi
  ex -sc '%s/^/<s>/|%s/$/<\/s>/|x' "$1"
}

# Mark sentence boundaries in all four corpus files. A missing file is
# reported and skipped rather than aborting the run.
# NOTE: the edit is in place and not idempotent -- running the script twice on
# the same corpus wraps every line twice.
for file in "$source_path_train" "$source_path_test" "$target_path_train" "$target_path_test"
do
  add_sentence_markers "$file"
done
|
2022-01-14 18:45:36 +01:00
|
|
|
|
|
|
|
# The fast binary is invoked relative to ~/fastBPE; stop immediately if that
# directory cannot be entered instead of running ./fast from the wrong place.
cd ~/fastBPE || {
  printf 'error: cannot change directory to %s\n' ~/fastBPE >&2
  exit 1
}

# learn BPE
# The redirection truncates "$codes" before fast runs, so a failure here would
# otherwise leave an empty or partial codes file for the steps below.
if ! ./fast learnbpe "$bpe_merges" "$source_path_train" "$target_path_train" > "$codes"; then
  printf 'error: learnbpe failed, %s is incomplete\n' "$codes" >&2
  exit 1
fi

# apply codes to train; the output goes to <input>.<number of merges>
./fast applybpe "$source_path_train.$bpe_merges" "$source_path_train" "$codes" || exit 1
./fast applybpe "$target_path_train.$bpe_merges" "$target_path_train" "$codes" || exit 1
|
2022-01-15 18:07:27 +01:00
|
|
|
# get train vocabulary and .yml'ify it
|
2022-01-15 20:10:06 +01:00
|
|
|
# Convert vocabulary lines read from stdin into "token:<TAB>count" lines on
# stdout. Each input line is split on single spaces; the first field becomes
# the key and the second field the value.
# Assumes the input consists of space-separated "token count" lines, as the
# field selection in this step implies -- confirm against fastBPE's getvocab.
vocab_to_yml() {
  awk -F'[ ]' '{ printf "%s:\t%s\n", $1, $2 }'
}

# Build <file>.vocab.<merges>.yml for both BPE-encoded training files.
# A single filter in the pipeline replaces the former tee/process-substitution
# split: bash does not wait for a process substitution to finish, so the
# follow-up commands could read a half-written temporary file. It also removes
# the fixed-name temporary files from the working directory.
for file in "$source_path_train" "$target_path_train"
do
  vocab_yml="$file.vocab.$bpe_merges.yml"
  ./fast getvocab "$file.$bpe_merges" | vocab_to_yml > "$vocab_yml"
  # PIPESTATUS[0] is the exit status of getvocab; it must be read right after
  # the pipeline, before any other command overwrites it.
  if [ "${PIPESTATUS[0]}" -ne 0 ]; then
    printf 'warning: getvocab failed for %s, removing %s\n' "$file.$bpe_merges" "$vocab_yml" >&2
    rm -f -- "$vocab_yml"
  fi
done
|
2022-01-15 18:07:27 +01:00
|
|
|
## apply codes to test
|
|
|
|
#./fast applybpe "$source_path_test"."$bpe_merges" "$source_path_test" "$codes" "$source_path_train".vocab."$bpe_merges"
|
|
|
|
#./fast applybpe "$target_path_test"."$bpe_merges" "$target_path_test" "$codes" "$target_path_train".vocab."$bpe_merges"
|
2022-01-14 18:45:36 +01:00
|
|
|
# Go back to the home directory. This only has a lasting effect on the
# invoking shell when the script is sourced rather than executed.
cd ~
|