2023-05-31 10:09:09 +02:00
|
|
|
# wget https://huggingface.co/allegro/plt5-base/raw/main/tokenizer_config.json
|
|
|
|
# wget https://huggingface.co/allegro/plt5-base/raw/main/special_tokens_map.json
|
|
|
|
# wget https://huggingface.co/allegro/plt5-base/raw/main/config.json
|
2023-05-10 15:19:19 +02:00
|
|
|
|
2023-05-31 10:09:09 +02:00
|
|
|
# Train a SentencePiece tokenizer on the tokenized training CSV.
#   --model_prefix   prefix for the trainer's output file names ('spiece')
#   --vocab_size     target vocabulary size (50000)
#   --model_type     segmentation algorithm (unigram)
#   --train_extremely_large_corpus
#                    enabled here; presumably needed because of the corpus
#                    size -- see the SentencePiece training options for details
spm_train \
    --input='/mnt/gpu_data1/zrostek/diachronia-year-prediction/regular_roberta_from_scratch/train_tok_3.csv' \
    --model_prefix='spiece' \
    --vocab_size=50000 \
    --model_type=unigram \
    --train_extremely_large_corpus=true
|