wmt-2020-pl-en/prepare.sh
2021-02-07 20:40:38 +01:00

83 lines
2.1 KiB
Bash

#!/usr/bin/env bash
#
# Adapted from https://github.com/facebookresearch/MIXER/blob/master/prepareData.sh
SCRIPTS=../mosesdecoder/scripts
TOKENIZER=$SCRIPTS/tokenizer/tokenizer.perl
LC=$SCRIPTS/tokenizer/lowercase.perl
CLEAN=$SCRIPTS/training/clean-corpus-n.perl
BPEROOT=../subword-nmt/subword_nmt
BPE_TOKENS=1000
src=pl
tgt=en
lang=pl-en
prep=wmt2020.tokenized.pl-en
tmp=$prep/tmp
orig=orig
mkdir -p $orig $tmp $prep
echo "grep train"
for l in $src $tgt; do
f=train.tags.$lang.$l
tok=train.tags.$lang.tok.$l
cat $orig/$lang/$f | \
grep -v '<url>' | \
grep -v '<talkid>' | \
grep -v '<keywords>' | \
sed -e 's/<title>//g' | \
sed -e 's/<\/title>//g' | \
sed -e 's/<description>//g' | \
sed -e 's/<\/description>//g' | \
perl $TOKENIZER -threads 8 -l $l > $tmp/$tok
echo ""
done
perl $CLEAN -ratio 1.5 $tmp/train.tags.$lang.tok $src $tgt $tmp/train.tags.$lang.clean 1 175
for l in $src $tgt; do
perl $LC < $tmp/train.tags.$lang.clean.$l > $tmp/train.tags.$lang.$l
done
echo "grep valid"
for l in $src $tgt; do
f=valid.$lang.$l
tok=valid.$lang.tok.$l
cat $orig/$lang/$f | \
grep -v '<url>' | \
grep -v '<talkid>' | \
grep -v '<keywords>' | \
sed -e 's/<title>//g' | \
sed -e 's/<\/title>//g' | \
sed -e 's/<description>//g' | \
sed -e 's/<\/description>//g' | \
perl $TOKENIZER -threads 8 -l $l > $tmp/$tok
echo ""
done
perl $CLEAN -ratio 1.5 $tmp/valid.$lang.tok $src $tgt $tmp/valid.$lang.clean 1 175
for l in $src $tgt; do
perl $LC < $tmp/valid.$lang.clean.$l > $tmp/valid.$lang.$l
done
echo "creating train, valid, test..."
for l in $src $tgt; do
cat $tmp/valid.$lang.$l > $tmp/valid.$l
cat $tmp/train.tags.$lang.$l > $tmp/train.$l
done
TRAIN=$tmp/train.pl-en
BPE_CODE=$prep/code
rm -f $TRAIN
for l in $src $tgt; do
cat $tmp/train.$l >> $TRAIN
done
echo "learn_bpe.py on ${TRAIN}..."
python $BPEROOT/learn_bpe.py -s $BPE_TOKENS < $TRAIN > $BPE_CODE
for L in $src $tgt; do
for f in train.$L valid.$L; do
echo "apply_bpe.py to ${f}..."
python $BPEROOT/apply_bpe.py -c $BPE_CODE < $tmp/$f > $prep/$f
done
done