# Bash snippet (as published: 13 lines, 523 B)
# Download the GPT-2 BPE files (encoder.json, vocab.bpe) into ./gpt2_bpe and
# run multiprocessing_bpe_encoder over the train/valid/test splits, reading
# wikitext-103-raw/wiki.<split>.raw and writing wikitext-103-raw/wiki.<split>.bpe.
#
# NOTE(review): assumes ./wikitext-103-raw already holds the raw splits and
# that the multiprocessing_bpe_encoder module is importable from the current
# directory -- confirm against the surrounding instructions.
#
# The steps run in a subshell with `set -e` so the sequence stops at the first
# failing command (e.g. a failed download) without changing the shell options
# of whoever pastes or sources this snippet.
(
  set -e

  mkdir -p gpt2_bpe
  wget -O gpt2_bpe/encoder.json https://dl.fbaipublicfiles.com/fairseq/gpt2_bpe/encoder.json
  wget -O gpt2_bpe/vocab.bpe https://dl.fbaipublicfiles.com/fairseq/gpt2_bpe/vocab.bpe

  for split in train valid test; do
    # --keep-empty presumably preserves empty input lines in the output, and
    # --workers 10 presumably sets the number of worker processes -- verify
    # against the encoder's own help text.
    python -m multiprocessing_bpe_encoder \
      --encoder-json gpt2_bpe/encoder.json \
      --vocab-bpe gpt2_bpe/vocab.bpe \
      --inputs "wikitext-103-raw/wiki.${split}.raw" \
      --outputs "wikitext-103-raw/wiki.${split}.bpe" \
      --keep-empty \
      --workers 10
  done
)