#!/usr/bin/env bash
# M2M-100 translation pipeline (fairseq): download models, apply SentencePiece,
# binarize, generate, and tokenize output for scoring.
# Template commands are followed by concrete en->pl invocations on local paths.
# Download the fixed model dictionary and the list of supported language pairs.
wget https://dl.fbaipublicfiles.com/m2m_100/model_dict.128k.txt
wget https://dl.fbaipublicfiles.com/m2m_100/language_pairs_small_models.txt
# 418M parameter model
wget https://dl.fbaipublicfiles.com/m2m_100/418M_last_checkpoint.pt
# 1.2B parameter model
wget https://dl.fbaipublicfiles.com/m2m_100/1.2B_last_checkpoint.pt
# apply SPM: SentencePiece-encode raw text with the shared 128k model.
# Template — replace the /path/to/... placeholders before running.
wget https://dl.fbaipublicfiles.com/m2m_100/spm.128k.model
python /path/to/fairseq/scripts/spm_encode.py \
  --model spm.128k.model \
  --output_format=piece \
  --inputs=/path/to/input/file/here \
  --outputs=/path/to/output/file/here
# Concrete example: SPM-encode the English side of the en->pl test set.
python3 /home/kasia/fairseq/scripts/spm_encode.py \
  --model /home/kasia/spm.128k.model \
  --output_format=piece \
  --inputs=/home/kasia/Pulpit/europarl/europarl/test-A/in.tsv \
  --outputs=/home/kasia/Pulpit/europarl/europarl/test-A/spm.en-pl.en
# Build a small de->fr evaluation sample (first 20 WMT19 sentences) with sacrebleu.
fairseq=/path/to/fairseq
cd "$fairseq" || exit 1   # abort if the fairseq checkout is missing
sacrebleu --echo src -l de-fr -t wmt19 | head -n 20 > raw_input.de-fr.de
sacrebleu --echo ref -l de-fr -t wmt19 | head -n 20 > raw_input.de-fr.fr
wget https://dl.fbaipublicfiles.com/m2m_100/spm.128k.model
# SPM-encode both sides of the de-fr sample; produces spm.de-fr.de / spm.de-fr.fr.
for lang in de fr; do
  python scripts/spm_encode.py \
    --model spm.128k.model \
    --output_format=piece \
    --inputs="raw_input.de-fr.${lang}" \
    --outputs="spm.de-fr.${lang}"
done
# binarize data
# Requires $src and $tgt to be set (e.g. src=de tgt=fr).
# NOTE: the SPM step above writes spm.$src-$tgt.$lang (e.g. spm.de-fr.de),
# so --testpref must be spm.$src-$tgt, not spm.$src.$tgt.
wget https://dl.fbaipublicfiles.com/m2m_100/data_dict.128k.txt
fairseq-preprocess \
  --source-lang "$src" --target-lang "$tgt" \
  --testpref "spm.$src-$tgt" \
  --thresholdsrc 0 --thresholdtgt 0 \
  --destdir data_bin \
  --srcdict data_dict.128k.txt --tgtdict data_dict.128k.txt
# Concrete example: binarize the SPM-encoded en->pl test set.
fairseq-preprocess \
  --source-lang en --target-lang pl \
  --testpref /home/kasia/Pulpit/europarl/europarl/test-A/spm.en-pl \
  --thresholdsrc 0 --thresholdtgt 0 \
  --destdir /home/kasia/Pulpit/europarl/europarl/test-A/data_bin \
  --srcdict /home/kasia/data_dict.128k.txt \
  --tgtdict /home/kasia/data_dict.128k.txt
# Translate the binarized data with the 12B checkpoint.
# The --pipeline-* flags split the model across 4 GPUs (model parallelism);
# the balance/devices lists assign encoder/decoder layers to devices.
fairseq-generate \
  data_bin \
  --batch-size 1 \
  --path 12b_last_chk_4_gpus.pt \
  --fixed-dictionary model_dict.128k.txt \
  -s de -t fr \
  --remove-bpe 'sentencepiece' \
  --beam 5 \
  --task translation_multi_simple_epoch \
  --lang-pairs language_pairs.txt \
  --decoder-langtok --encoder-langtok src \
  --gen-subset test \
  --fp16 \
  --dataset-impl mmap \
  --distributed-world-size 1 --distributed-no-spawn \
  --pipeline-model-parallel \
  --pipeline-chunks 1 \
  --pipeline-encoder-balance '[1,15,10]' \
  --pipeline-encoder-devices '[0,1,0]' \
  --pipeline-decoder-balance '[3,11,11,1]' \
  --pipeline-decoder-devices '[0,2,3,0]' > gen_out
# Concrete example: translate en->pl with the 1.2B checkpoint (single GPU/CPU).
fairseq-generate /home/kasia/Pulpit/europarl/europarl/test-A/data_bin \
  --batch-size 32 \
  --path /home/kasia/Pulpit/europarl/europarl/1.2B_last_checkpoint.pt \
  --fixed-dictionary /home/kasia/model_dict.128k.txt \
  -s en -t pl \
  --remove-bpe 'sentencepiece' \
  --beam 5 \
  --task translation_multi_simple_epoch \
  --lang-pairs /home/kasia/Pulpit/europarl/europarl/language_pairs_small_models.txt \
  --decoder-langtok --encoder-langtok src \
  --gen-subset test > /home/kasia/Pulpit/europarl/europarl/test-A/gen_out
# Extract hypotheses (H-lines) from the generate log, restore sentence order,
# strip the score column, and tokenize both hyp and ref for BLEU scoring.
cd "${fairseq}/examples/m2m_100" || exit 1
grep -P "^H" "${fairseq}/gen_out" | sort -V | cut -f 3- | sh tok.sh fr > hyp
sh tok.sh fr < "${fairseq}/raw_input.de-fr.fr" > ref
# Concrete example: extract and tokenize the Polish hypotheses into out.tsv.
grep -P "^H" /home/kasia/Pulpit/europarl/europarl/test-A/gen_out \
  | sort -V | cut -f 3- \
  | sh /home/kasia/fairseq/examples/m2m_100/tok.sh pl \
  > /home/kasia/Pulpit/europarl/europarl/test-A/out.tsv