import os

from tokenizers import ByteLevelBPETokenizer

# Training corpus: a CSV containing only the text column
paths = ["../train/in_text_only.csv"]

# Initialize a byte-level BPE tokenizer
tokenizer = ByteLevelBPETokenizer()

# Customize training (RoBERTa-style vocabulary size and special tokens)
tokenizer.train(
    files=paths,
    vocab_size=50265,
    min_frequency=2,
    special_tokens=[
        "<s>",
        "<pad>",
        "</s>",
        "<unk>",
        "<mask>",
    ],
)

# Make sure the output directory exists, then write vocab.json and merges.txt
os.makedirs("./tokenizer_model", exist_ok=True)
tokenizer.save_model("./tokenizer_model")
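
# Illustrative usage sketch (not part of the original script): reload the
# trained tokenizer from the vocab.json and merges.txt files assumed to have
# been written to ./tokenizer_model above, and encode a sample sentence.
from tokenizers import ByteLevelBPETokenizer

reloaded = ByteLevelBPETokenizer(
    "./tokenizer_model/vocab.json",
    "./tokenizer_model/merges.txt",
)
encoding = reloaded.encode("Hello world!")
print(encoding.tokens)  # byte-level BPE tokens
print(encoding.ids)     # corresponding vocabulary ids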