19 lines
372 B
Python
19 lines
372 B
Python
|
from pathlib import Path
|
||
|
from tokenizers import ByteLevelBPETokenizer
|
||
|
|
||
|
paths = ['./train_in.csv']
|
||
|
|
||
|
# Initialize a tokenizer
|
||
|
tokenizer = ByteLevelBPETokenizer()
|
||
|
|
||
|
# Customize training
|
||
|
tokenizer.train(files=paths, vocab_size=50265, min_frequency=2, special_tokens=[
|
||
|
"<s>",
|
||
|
"<pad>",
|
||
|
"</s>",
|
||
|
"<unk>",
|
||
|
"<mask>",
|
||
|
])
|
||
|
|
||
|
tokenizer.save_model("./tokenizer_model")
|