# Download the fixed model dictionary and the list of supported language pairs
wget https://dl.fbaipublicfiles.com/m2m_100/model_dict.128k.txt
wget https://dl.fbaipublicfiles.com/m2m_100/language_pairs_small_models.txt

# 418M parameter model
wget https://dl.fbaipublicfiles.com/m2m_100/418M_last_checkpoint.pt

# 1.2B parameter model
wget https://dl.fbaipublicfiles.com/m2m_100/1.2B_last_checkpoint.pt

# Apply SentencePiece (SPM) to the raw input text
wget https://dl.fbaipublicfiles.com/m2m_100/spm.128k.model
python /path/to/fairseq/scripts/spm_encode.py \
    --model spm.128k.model \
    --output_format=piece \
    --inputs=/path/to/input/file/here \
    --outputs=/path/to/output/file/here

# Concrete example: encode the Europarl en->pl test set
python3 /home/kasia/fairseq/scripts/spm_encode.py \
    --model /home/kasia/spm.128k.model \
    --output_format=piece \
    --inputs=/home/kasia/Pulpit/europarl/europarl/test-A/in.tsv \
    --outputs=/home/kasia/Pulpit/europarl/europarl/test-A/spm.en-pl.en

# Fetch a small de->fr sample from WMT19 with sacrebleu and SPM-encode it
fairseq=/path/to/fairseq
cd $fairseq
sacrebleu --echo src -l de-fr -t wmt19 | head -n 20 > raw_input.de-fr.de
sacrebleu --echo ref -l de-fr -t wmt19 | head -n 20 > raw_input.de-fr.fr
wget https://dl.fbaipublicfiles.com/m2m_100/spm.128k.model
for lang in de fr ; do
    python scripts/spm_encode.py \
        --model spm.128k.model \
        --output_format=piece \
        --inputs=raw_input.de-fr.${lang} \
        --outputs=spm.de-fr.${lang}
done

# Binarize the data against the fixed 128k data dictionary. The SPM outputs
# above are named spm.de-fr.de / spm.de-fr.fr, so the test prefix must be
# spm.$src-$tgt (not spm.$src.$tgt): fairseq-preprocess appends .$src / .$tgt
wget https://dl.fbaipublicfiles.com/m2m_100/data_dict.128k.txt
src=de
tgt=fr
fairseq-preprocess \
    --source-lang $src --target-lang $tgt \
    --testpref spm.$src-$tgt \
    --thresholdsrc 0 --thresholdtgt 0 \
    --destdir data_bin \
    --srcdict data_dict.128k.txt --tgtdict data_dict.128k.txt

# Concrete example: binarize the Europarl en->pl test set
fairseq-preprocess \
    --source-lang en --target-lang pl \
    --testpref /home/kasia/Pulpit/europarl/europarl/test-A/spm.en-pl \
    --thresholdsrc 0 --thresholdtgt 0 \
    --destdir /home/kasia/Pulpit/europarl/europarl/test-A/data_bin \
    --srcdict /home/kasia/data_dict.128k.txt \
    --tgtdict /home/kasia/data_dict.128k.txt

# Generate de->fr with the 12B checkpoint, pipeline-parallel across 4 GPUs
# (the encoder/decoder balance and device lists split the layers over GPUs)
fairseq-generate \
    data_bin \
    --batch-size 1 \
    --path 12b_last_chk_4_gpus.pt \
    --fixed-dictionary model_dict.128k.txt \
    -s de -t fr \
    --remove-bpe 'sentencepiece' \
    --beam 5 \
    --task translation_multi_simple_epoch \
    --lang-pairs language_pairs.txt \
    --decoder-langtok --encoder-langtok src \
    --gen-subset test \
    --fp16 \
    --dataset-impl mmap \
    --distributed-world-size 1 --distributed-no-spawn \
    --pipeline-model-parallel \
    --pipeline-chunks 1 \
    --pipeline-encoder-balance '[1,15,10]' \
    --pipeline-encoder-devices '[0,1,0]' \
    --pipeline-decoder-balance '[3,11,11,1]' \
    --pipeline-decoder-devices '[0,2,3,0]' > gen_out

# Concrete example: generate en->pl with the 1.2B checkpoint,
# without pipeline parallelism
fairseq-generate /home/kasia/Pulpit/europarl/europarl/test-A/data_bin \
    --batch-size 32 \
    --path /home/kasia/Pulpit/europarl/europarl/1.2B_last_checkpoint.pt \
    --fixed-dictionary /home/kasia/model_dict.128k.txt \
    -s en -t pl \
    --remove-bpe 'sentencepiece' \
    --beam 5 \
    --task translation_multi_simple_epoch \
    --lang-pairs /home/kasia/Pulpit/europarl/europarl/language_pairs_small_models.txt \
    --decoder-langtok --encoder-langtok src \
    --gen-subset test > /home/kasia/Pulpit/europarl/europarl/test-A/gen_out

# Extract the hypotheses from the generation log, restore corpus order,
# and tokenize hypotheses and references identically for scoring
cd ${fairseq}/examples/m2m_100
cat ${fairseq}/gen_out | grep -P "^H" | sort -V | cut -f 3- | sh tok.sh fr > hyp
cat ${fairseq}/raw_input.de-fr.fr | sh tok.sh fr > ref

# Concrete example: extract the en->pl hypotheses into out.tsv
cat /home/kasia/Pulpit/europarl/europarl/test-A/gen_out | grep -P "^H" | sort -V | cut -f 3- \
    | sh /home/kasia/fairseq/examples/m2m_100/tok.sh pl \
    > /home/kasia/Pulpit/europarl/europarl/test-A/out.tsv
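
A note on the extraction pipeline above: fairseq-generate interleaves source (S-*), target (T-*), hypothesis (H-*), and positional-score (P-*) lines in one log, in whatever order batches finish rather than in corpus order, and hypothesis lines are tab-separated as id, model score, text. The excerpt below is illustrative only (ids, scores, and text are made up), just to show what each stage of the pipeline does.

# Illustrative gen_out excerpt (values are made up):
#   S-12  __en__ ▁Good ▁morning ▁everyone .
#   H-12  -0.3817  Dzień dobry wszystkim .
#   P-12  -0.4102 -0.2911 -0.5523 -0.1733
#
# grep -P "^H"  -> keep only hypothesis lines
# sort -V       -> numeric "version" order, so H-2 precedes H-10
# cut -f 3-     -> drop the id and score columns, keep only the text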
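
To actually score the de->fr run, the natural last step (the one the upstream M2M-100 README uses) is sacrebleu with its internal tokenizer disabled, since tok.sh has already tokenized both sides identically:

# hyp and ref were tokenized by tok.sh, so tell sacrebleu not to re-tokenize
sacrebleu -tok 'none' ref < hyp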
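
For reuse, the en->pl steps can be collected into one script. This is a minimal sketch under the same assumptions as the commands above (the /home/kasia paths, the fixed 128k dictionaries, a source-only test set); FAIRSEQ, WORK, SRC, and TGT are placeholder variables of this sketch, not anything fairseq defines.

#!/usr/bin/env bash
# Sketch: SPM-encode -> binarize -> translate -> extract, for one direction.
set -euo pipefail

FAIRSEQ=/home/kasia/fairseq                        # fairseq checkout (assumed)
WORK=/home/kasia/Pulpit/europarl/europarl/test-A   # working directory (assumed)
SRC=en
TGT=pl

# 1. SentencePiece-encode the raw source file
python3 "$FAIRSEQ/scripts/spm_encode.py" \
    --model /home/kasia/spm.128k.model \
    --output_format=piece \
    --inputs="$WORK/in.tsv" \
    --outputs="$WORK/spm.$SRC-$TGT.$SRC"

# 2. Binarize against the fixed 128k data dictionary
#    (fairseq-preprocess binarizes both sides of a test prefix; with a
#    source-only test set, add --only-source or provide a target-side file)
fairseq-preprocess \
    --source-lang "$SRC" --target-lang "$TGT" \
    --testpref "$WORK/spm.$SRC-$TGT" \
    --thresholdsrc 0 --thresholdtgt 0 \
    --destdir "$WORK/data_bin" \
    --srcdict /home/kasia/data_dict.128k.txt \
    --tgtdict /home/kasia/data_dict.128k.txt

# 3. Translate with the 1.2B checkpoint
fairseq-generate "$WORK/data_bin" \
    --batch-size 32 --beam 5 \
    --path /home/kasia/Pulpit/europarl/europarl/1.2B_last_checkpoint.pt \
    --fixed-dictionary /home/kasia/model_dict.128k.txt \
    -s "$SRC" -t "$TGT" \
    --remove-bpe 'sentencepiece' \
    --task translation_multi_simple_epoch \
    --lang-pairs /home/kasia/Pulpit/europarl/europarl/language_pairs_small_models.txt \
    --decoder-langtok --encoder-langtok src \
    --gen-subset test > "$WORK/gen_out"

# 4. Pull the hypotheses back out in corpus order and tokenize them
grep -P "^H" "$WORK/gen_out" | sort -V | cut -f 3- \
    | sh "$FAIRSEQ/examples/m2m_100/tok.sh" "$TGT" > "$WORK/out.tsv"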