add plewi support

This commit is contained in:
jakubknczny 2022-01-15 20:10:06 +01:00
parent 942e426366
commit 1339121797
3 changed files with 21 additions and 17 deletions

8
plewi.sh Normal file
View File

@ -0,0 +1,8 @@
#!/bin/bash
# Prepare the PLEWI training split: decompress train.tsv.xz, then write
# column 1 of train.tsv to in.tsv and column 2 to expected.tsv, all inside
# ~/PLEWI-polish-errors-correction-challenge/train.

# Stop at the first failing step so that a failed decompression or a missing
# train.tsv can never truncate in.tsv / expected.tsv to empty files.
set -euo pipefail

plewi_path=~/PLEWI-polish-errors-correction-challenge/train
train_file="$plewi_path"/train.tsv

# xz -d removes the archive once it succeeds, so on a re-run only train.tsv
# is left; skip decompression in that case instead of failing.
if [[ ! -f "$train_file" ]]; then
  xz -d -v "$train_file".xz
fi

# cut uses TAB as its default field delimiter; read the file directly
# instead of piping it through cat.
cut -f 1 "$train_file" > "$plewi_path"/in.tsv
cut -f 2 "$train_file" > "$plewi_path"/expected.tsv

View File

@ -30,17 +30,13 @@ cd ~/fastBPE
./fast applybpe "$source_path_train"."$bpe_merges" "$source_path_train" "$codes"
./fast applybpe "$target_path_train"."$bpe_merges" "$target_path_train" "$codes"
# get train vocabulary and .yml'ify it
./fast getvocab "$source_path_train"."$bpe_merges" > "$source_path_train".vocab."$bpe_merges" \
tee >(cut -f 1 > temp1) | cut -f 2 > temp2
ex -sc '%s/$/:/|x' temp1
paste temp1 temp2 > "$source_path_train".vocab."$bpe_merges".yml
rm temp1 temp2 "$source_path_train".vocab."$bpe_merges"
./fast getvocab "$target_path_train"."$bpe_merges" > "$target_path_train".vocab."$bpe_merges" \
tee >(cut -f 1 > temp1) | cut -f 2 > temp2
ex -sc '%s/$/:/|x' temp1
paste temp1 temp2 > "$target_path_train".vocab."$bpe_merges".yml
rm temp1 temp2 "$target_path_train".vocab."$bpe_merges"
# Build a .yml vocabulary for each BPE-encoded training file.
# Every line printed by `./fast getvocab` is split on single spaces; field 1
# gets a ":" appended and is joined to field 2 with a TAB, one entry per line.
# A single awk pass replaces the earlier tee >(cut ...) | cut ... pipeline:
# bash does not wait for a >(...) process substitution to finish, so the
# following `ex`/`paste` steps could read temp1 before it was fully written.
# It also removes the fixed temp1/temp2 scratch files from the working
# directory.
# NOTE(review): assumes every getvocab line is "<token> <count>" - confirm.
for file in "$source_path_train" "$target_path_train"
do
  ./fast getvocab "$file"."$bpe_merges" \
    | awk -F'[ ]' '{ print $1 ":\t" $2 }' > "$file".vocab."$bpe_merges".yml
done
## apply codes to test
#./fast applybpe "$source_path_test"."$bpe_merges" "$source_path_test" "$codes" "$source_path_train".vocab."$bpe_merges"
#./fast applybpe "$target_path_test"."$bpe_merges" "$target_path_test" "$codes" "$target_path_train".vocab."$bpe_merges"

View File

@ -7,13 +7,13 @@
# path to corpus
corpus_path="$1"
bpe_merges="$4"
epochs="$5"
bpe_merges="$2"
epochs="$3"
source_file="$corpus_path"/train/in.tsv
source_file="$corpus_path"/train/in.tsv."$bpe_merges"
source_vocab="$source_file".vocab."$bpe_merges".yml
target_file="$corpus_path"/train/expected.tsv
target_file="$corpus_path"/train/expected.tsv."$bpe_merges"
target_vocab="$target_file".vocab."$bpe_merges".yml
@ -37,8 +37,8 @@ target_vocab="$target_file".vocab."$bpe_merges".yml
--tied-embeddings \
--exponential-smoothing \
--log ~/train.log \
--after-epochs="$epochs" \
--vocabs "$source_vocab" "$target_vocab"
--after-epochs="$epochs"
#--vocabs "$source_vocab" "$target_vocab"
#--early-stopping 10 \
#--model model/model.npz