From 13391217978bcb6a96622026c72e6fa75cad70ce Mon Sep 17 00:00:00 2001 From: jakubknczny Date: Sat, 15 Jan 2022 20:10:06 +0100 Subject: [PATCH] add plewi support --- plewi.sh | 8 ++++++++ scripts/do-fastBPE.sh | 18 +++++++----------- scripts/marian-train.sh | 12 ++++++------ 3 files changed, 21 insertions(+), 17 deletions(-) create mode 100644 plewi.sh diff --git a/plewi.sh b/plewi.sh new file mode 100644 index 0000000..f810de6 --- /dev/null +++ b/plewi.sh @@ -0,0 +1,8 @@ +#!/bin/bash + +plewi_path=~/PLEWI-polish-errors-correction-challenge/train +train_file="$plewi_path"/train.tsv + +xz -d -v "$train_file".xz +cat "$train_file" | cut -f 1 > "$plewi_path"/in.tsv +cat "$train_file" | cut -f 2 > "$plewi_path"/expected.tsv \ No newline at end of file diff --git a/scripts/do-fastBPE.sh b/scripts/do-fastBPE.sh index 0b1e227..4fa8531 100755 --- a/scripts/do-fastBPE.sh +++ b/scripts/do-fastBPE.sh @@ -30,17 +30,13 @@ cd ~/fastBPE ./fast applybpe "$source_path_train"."$bpe_merges" "$source_path_train" "$codes" ./fast applybpe "$target_path_train"."$bpe_merges" "$target_path_train" "$codes" # get train vocabulary and .yml'ify it -./fast getvocab "$source_path_train"."$bpe_merges" > "$source_path_train".vocab."$bpe_merges" \ -tee >(cut -f 1 > temp1) | cut -f 2 > temp2 -ex -sc '%s/$/:/|x' temp1 -paste temp1 temp2 > "$source_path_train".vocab."$bpe_merges".yml -rm temp1 temp2 "$source_path_train".vocab."$bpe_merges" - -./fast getvocab "$target_path_train"."$bpe_merges" > "$target_path_train".vocab."$bpe_merges" \ -tee >(cut -f 1 > temp1) | cut -f 2 > temp2 -ex -sc '%s/$/:/|x' temp1 -paste temp1 temp2 > "$target_path_train".vocab."$bpe_merges".yml -rm temp1 temp2 "$target_path_train".vocab."$bpe_merges" +for file in "$source_path_train" "$target_path_train" +do + ./fast getvocab "$file"."$bpe_merges" | tee >( cut -f 1 -d " " > temp1 ) | cut -f 2 -d " " > temp2 + ex -sc '%s/$/:/|x' temp1 + paste temp1 temp2 > "$file".vocab."$bpe_merges".yml + rm temp1 temp2 +done ## apply codes to test #./fast applybpe "$source_path_test"."$bpe_merges" "$source_path_test" "$codes" "$source_path_train".vocab."$bpe_merges" #./fast applybpe "$target_path_test"."$bpe_merges" "$target_path_test" "$codes" "$target_path_train".vocab."$bpe_merges" diff --git a/scripts/marian-train.sh b/scripts/marian-train.sh index cfb30d2..f5d3710 100755 --- a/scripts/marian-train.sh +++ b/scripts/marian-train.sh @@ -7,13 +7,13 @@ # path to corpus corpus_path="$1" -bpe_merges="$4" -epochs="$5" +bpe_merges="$2" +epochs="$3" -source_file="$corpus_path"/train/in.tsv +source_file="$corpus_path"/train/in.tsv."$bpe_merges" source_vocab="$source_file".vocab."$bpe_merges".yml -target_file="$corpus_path"/train/expected.tsv +target_file="$corpus_path"/train/expected.tsv."$bpe_merges" target_vocab="$target_file".vocab."$bpe_merges".yml @@ -37,8 +37,8 @@ target_vocab="$target_file".vocab."$bpe_merges".yml --tied-embeddings \ --exponential-smoothing \ --log ~/train.log \ ---after-epochs="$epochs" \ ---vocabs "$source_vocab" "$target_vocab" +--after-epochs="$epochs" +#--vocabs "$source_vocab" "$target_vocab" #--early-stopping 10 \ #--model model/model.npz