English-PolishEuroparl/m2m_100

75 lines
3.3 KiB
Plaintext
Raw Normal View History

2021-02-19 02:11:02 +01:00
# Fetch the M2M-100 assets used by the commands in this file:
#   model_dict.128k.txt              dictionary given to --fixed-dictionary
#   language_pairs_small_models.txt  pair list given to --lang-pairs
#   418M_last_checkpoint.pt          418M parameter model
#   1.2B_last_checkpoint.pt          1.2B parameter model
for m2m_asset in \
  model_dict.128k.txt \
  language_pairs_small_models.txt \
  418M_last_checkpoint.pt \
  1.2B_last_checkpoint.pt
do
  wget "https://dl.fbaipublicfiles.com/m2m_100/${m2m_asset}"
done
# Apply the SentencePiece model (spm.128k.model) to raw text, emitting pieces.
wget https://dl.fbaipublicfiles.com/m2m_100/spm.128k.model

# Generic form of the call; every /path/to/... value is a placeholder.
python /path/to/fairseq/scripts/spm_encode.py \
  --model spm.128k.model \
  --output_format=piece \
  --inputs=/path/to/input/file/here \
  --outputs=/path/to/output/file/here

# Concrete call: encode Europarl test-A/in.tsv and store the result as the
# .en side of the en-pl pair (spm.en-pl.en).
python3 /home/kasia/fairseq/scripts/spm_encode.py \
  --model /home/kasia/spm.128k.model \
  --output_format=piece \
  --inputs=/home/kasia/Pulpit/europarl/europarl/test-A/in.tsv \
  --outputs=/home/kasia/Pulpit/europarl/europarl/test-A/spm.en-pl.en
# de-fr example on the first 20 WMT19 sentences, run from the fairseq checkout.
# Replace the placeholder with the real location of the fairseq repository.
fairseq=/path/to/fairseq

# Stop if the checkout cannot be entered: every path below is relative to it,
# so continuing would write the inputs and the model into the wrong directory.
cd "$fairseq" || exit 1

# Source (de) and reference (fr) sides, truncated to 20 lines each.
sacrebleu --echo src -l de-fr -t wmt19 | head -n 20 > raw_input.de-fr.de
sacrebleu --echo ref -l de-fr -t wmt19 | head -n 20 > raw_input.de-fr.fr

wget https://dl.fbaipublicfiles.com/m2m_100/spm.128k.model

# Encode both sides with SentencePiece; writes spm.de-fr.de and spm.de-fr.fr.
for lang in de fr; do
  python scripts/spm_encode.py \
    --model spm.128k.model \
    --output_format=piece \
    --inputs="raw_input.de-fr.${lang}" \
    --outputs="spm.de-fr.${lang}"
done
# binarize data
wget https://dl.fbaipublicfiles.com/m2m_100/data_dict.128k.txt

# Language pair for the generic call. These were previously never assigned, so
# they expanded to nothing; default to the de-fr pair that the spm_encode loop
# in this file produces, while still honouring values set by the caller.
src=${src:-de}
tgt=${tgt:-fr}

# fairseq-preprocess appends ".<lang>" to --testpref, so the prefix must be
# "spm.<src>-<tgt>" to match the spm.de-fr.de / spm.de-fr.fr files written by
# the encode step (the old "spm.$src.$tgt" prefix pointed at files that are
# never created).
fairseq-preprocess \
  --source-lang "$src" --target-lang "$tgt" \
  --testpref "spm.${src}-${tgt}" \
  --thresholdsrc 0 --thresholdtgt 0 \
  --destdir data_bin \
  --srcdict data_dict.128k.txt --tgtdict data_dict.128k.txt

# Concrete en-pl call for Europarl test-A.
# NOTE(review): the encode step in this file only writes spm.en-pl.en; with
# --target-lang pl this presumably also needs spm.en-pl.pl to exist -- confirm
# how that file was produced.
fairseq-preprocess \
  --source-lang en --target-lang pl \
  --testpref /home/kasia/Pulpit/europarl/europarl/test-A/spm.en-pl \
  --thresholdsrc 0 --thresholdtgt 0 \
  --destdir /home/kasia/Pulpit/europarl/europarl/test-A/data_bin \
  --srcdict /home/kasia/data_dict.128k.txt --tgtdict /home/kasia/data_dict.128k.txt
# Generic generation call (de -> fr) with pipeline model parallelism; the
# device lists below reference GPU ids 0-3. Output is redirected to gen_out.
# NOTE(review): 12b_last_chk_4_gpus.pt and language_pairs.txt are not fetched
# by any wget in this file -- presumably they are obtained separately; confirm.
fairseq-generate \
  data_bin \
  --batch-size 1 \
  --path 12b_last_chk_4_gpus.pt \
  --fixed-dictionary model_dict.128k.txt \
  -s de \
  -t fr \
  --remove-bpe 'sentencepiece' \
  --beam 5 \
  --task translation_multi_simple_epoch \
  --lang-pairs language_pairs.txt \
  --decoder-langtok \
  --encoder-langtok src \
  --gen-subset test \
  --fp16 \
  --dataset-impl mmap \
  --distributed-world-size 1 \
  --distributed-no-spawn \
  --pipeline-model-parallel \
  --pipeline-chunks 1 \
  --pipeline-encoder-balance '[1,15,10]' \
  --pipeline-encoder-devices '[0,1,0]' \
  --pipeline-decoder-balance '[3,11,11,1]' \
  --pipeline-decoder-devices '[0,2,3,0]' \
  > gen_out

# Concrete en -> pl call on the binarized Europarl test-A data, using the
# 1.2B checkpoint and the small-model language-pair list.
fairseq-generate \
  /home/kasia/Pulpit/europarl/europarl/test-A/data_bin \
  --batch-size 32 \
  --path /home/kasia/Pulpit/europarl/europarl/1.2B_last_checkpoint.pt \
  --fixed-dictionary /home/kasia/model_dict.128k.txt \
  -s en \
  -t pl \
  --remove-bpe 'sentencepiece' \
  --beam 5 \
  --task translation_multi_simple_epoch \
  --lang-pairs /home/kasia/Pulpit/europarl/europarl/language_pairs_small_models.txt \
  --decoder-langtok \
  --encoder-langtok src \
  --gen-subset test \
  > /home/kasia/Pulpit/europarl/europarl/test-A/gen_out
# Turn the generation logs into tokenized hypothesis/reference files.
# tok.sh is invoked by relative name below, so abort if the examples directory
# cannot be entered instead of leaving empty hyp/ref files somewhere else.
cd "${fairseq}/examples/m2m_100" || exit 1

# Keep the lines starting with "H", version-sort them, keep tab-separated
# field 3 onward, and tokenize as French.
grep -P "^H" "${fairseq}/gen_out" \
  | sort -V \
  | cut -f 3- \
  | sh tok.sh fr > hyp

# Tokenize the French reference the same way.
sh tok.sh fr < "${fairseq}/raw_input.de-fr.fr" > ref

# Same extraction for the en -> pl Europarl run, tokenized as Polish and
# written to test-A/out.tsv.
grep -P "^H" /home/kasia/Pulpit/europarl/europarl/test-A/gen_out \
  | sort -V \
  | cut -f 3- \
  | sh /home/kasia/fairseq/examples/m2m_100/tok.sh pl \
  > /home/kasia/Pulpit/europarl/europarl/test-A/out.tsv