#!/usr/bin/env bash
#
# Adapted from https://github.com/facebookresearch/MIXER/blob/master/prepareData.sh
#
# Preprocesses a Polish-English parallel corpus: markup stripping, Moses
# tokenization, corpus cleaning, lowercasing and BPE encoding.
#
# This section defines the paths and parameters read by the rest of the script
# and creates the working directories. All paths are relative to the current
# working directory, so run the script from a directory that has
# ../mosesdecoder and ../subword-nmt available.

# Abort on a failing command, an unset variable, or a failing pipeline stage,
# so a missing tool or input file does not silently produce empty outputs.
# NOTE: under pipefail a `grep -v` stage that emits no lines counts as a
# failure, i.e. an input that is filtered down to nothing stops the script.
set -euo pipefail

# Moses helper scripts.
SCRIPTS=../mosesdecoder/scripts
TOKENIZER="$SCRIPTS/tokenizer/tokenizer.perl"
LC="$SCRIPTS/tokenizer/lowercase.perl"
CLEAN="$SCRIPTS/training/clean-corpus-n.perl"

# subword-nmt checkout and the value handed to learn_bpe.py via -s.
BPEROOT=../subword-nmt/subword_nmt
BPE_TOKENS=1000

# Language pair; $lang is derived so the three values cannot drift apart.
src=pl
tgt=en
lang="$src-$tgt"

# Directory layout: raw input is read from $orig/$lang, intermediate files are
# written to $tmp, and the final BPE-encoded files are written to $prep.
prep="wmt2020.tokenized.$lang"
tmp="$prep/tmp"
orig=orig

mkdir -p -- "$orig" "$tmp" "$prep"

# Training split: strip markup, tokenize, clean, lowercase.
# Reads  $orig/$lang/train.tags.$lang.{$src,$tgt}
# Writes $tmp/train.tags.$lang.{$src,$tgt} (plus .tok/.clean intermediates).
echo "grep train"
for l in "$src" "$tgt"; do
  f="train.tags.$lang.$l"
  tok="train.tags.$lang.tok.$l"

  # Fail early: without this check a missing file would only produce an
  # empty tokenized file and the error would surface much later.
  if [[ ! -r "$orig/$lang/$f" ]]; then
    echo "missing input file: $orig/$lang/$f" >&2
    exit 1
  fi

  # Drop lines containing <url>, <talkid> or <keywords>; remove the
  # <title>/<description> tags but keep their text; then tokenize for $l.
  grep -v -e '<url>' -e '<talkid>' -e '<keywords>' < "$orig/$lang/$f" \
    | sed -e 's/<title>//g' \
      -e 's/<\/title>//g' \
      -e 's/<description>//g' \
      -e 's/<\/description>//g' \
    | perl "$TOKENIZER" -threads 8 -l "$l" > "$tmp/$tok"
  echo ""
done

# clean-corpus-n.perl takes the corpus stem, both language suffixes and the
# output stem, followed by the length bounds 1 and 175; -ratio 1.5 is the
# length-ratio limit between the two sides.
perl "$CLEAN" -ratio 1.5 "$tmp/train.tags.$lang.tok" "$src" "$tgt" \
  "$tmp/train.tags.$lang.clean" 1 175

# Lowercase each side of the cleaned corpus.
for l in "$src" "$tgt"; do
  perl "$LC" < "$tmp/train.tags.$lang.clean.$l" > "$tmp/train.tags.$lang.$l"
done

# Validation split: strip markup, tokenize, clean, lowercase.
# Reads  $orig/$lang/valid.$lang.{$src,$tgt}
# Writes $tmp/valid.$lang.{$src,$tgt} (plus .tok/.clean intermediates).
echo "grep valid"
for l in "$src" "$tgt"; do
  f="valid.$lang.$l"
  tok="valid.$lang.tok.$l"

  # Fail early: without this check a missing file would only produce an
  # empty tokenized file and the error would surface much later.
  if [[ ! -r "$orig/$lang/$f" ]]; then
    echo "missing input file: $orig/$lang/$f" >&2
    exit 1
  fi

  # Drop lines containing <url>, <talkid> or <keywords>; remove the
  # <title>/<description> tags but keep their text; then tokenize for $l.
  grep -v -e '<url>' -e '<talkid>' -e '<keywords>' < "$orig/$lang/$f" \
    | sed -e 's/<title>//g' \
      -e 's/<\/title>//g' \
      -e 's/<description>//g' \
      -e 's/<\/description>//g' \
    | perl "$TOKENIZER" -threads 8 -l "$l" > "$tmp/$tok"
  echo ""
done

# clean-corpus-n.perl takes the corpus stem, both language suffixes and the
# output stem, followed by the length bounds 1 and 175; -ratio 1.5 is the
# length-ratio limit between the two sides.
perl "$CLEAN" -ratio 1.5 "$tmp/valid.$lang.tok" "$src" "$tgt" \
  "$tmp/valid.$lang.clean" 1 175

# Lowercase each side of the cleaned corpus.
for l in "$src" "$tgt"; do
  perl "$LC" < "$tmp/valid.$lang.clean.$l" > "$tmp/valid.$lang.$l"
done

# Give the preprocessed files their short names ($tmp/train.$l, $tmp/valid.$l)
# and build the joint text that the BPE codes are learned from.
# NOTE: despite the message below, only train and valid files are produced
# here; no test split is created.
echo "creating train, valid, test..."
for l in "$src" "$tgt"; do
  cp -- "$tmp/valid.$lang.$l" "$tmp/valid.$l"
  cp -- "$tmp/train.tags.$lang.$l" "$tmp/train.$l"
done

# Joint training text for BPE and the output path of the learned codes.
# Derived from $lang rather than a hard-coded "pl-en" so the name follows the
# configured language pair.
TRAIN="$tmp/train.$lang"
BPE_CODE="$prep/code"

# Source side first, then target side. The truncating redirect replaces any
# file left over from an earlier run.
cat -- "$tmp/train.$src" "$tmp/train.$tgt" > "$TRAIN"

# Learn one shared set of BPE codes from the joint training text, then encode
# the train and valid files of both languages into $prep.
echo "learn_bpe.py on ${TRAIN}..."
python "$BPEROOT/learn_bpe.py" -s "$BPE_TOKENS" < "$TRAIN" > "$BPE_CODE" || {
  # Applying an empty or partial codes file would silently produce wrongly
  # segmented data, so stop here instead.
  echo "learn_bpe.py failed; not applying BPE" >&2
  exit 1
}

for l in "$src" "$tgt"; do
  for f in "train.$l" "valid.$l"; do
    echo "apply_bpe.py to ${f}..."
    python "$BPEROOT/apply_bpe.py" -c "$BPE_CODE" < "$tmp/$f" > "$prep/$f"
  done
done