from transformers import AutoTokenizer
from datasets import load_dataset
Dataset prep
model_checkpoint = "distilroberta-base"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, use_fast=True)
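As a quick sanity check (not part of the original notebook), the tokenizer can be run on a single sentence to inspect the IDs it produces; the sample sentence below is illustrative only:

sample = tokenizer("The United States declared independence in 1776.")
print(sample["input_ids"])  # token IDs, including the <s>/</s> specials RoBERTa-style models add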
def tokenize_function(examples):
    # Truncate each example to the model's 512-token context window
    return tokenizer(examples["text"], max_length=512, truncation=True)
# Load the raw corpus to fine-tune on ("corpus.txt" is a placeholder; substitute your own data files)
datasets = load_dataset("text", data_files={"train": "corpus.txt"})

tokenized_datasets = datasets.map(tokenize_function, batched=True, num_proc=4, remove_columns=["text"])
Model training
from transformers import AutoModelForMaskedLM
from transformers import Trainer, TrainingArguments
model = AutoModelForMaskedLM.from_pretrained(model_checkpoint)
model_name = model_checkpoint.split("/")[-1]
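Before fine-tuning, it can be useful to see how the base model fills a mask; a minimal sketch using the fill-mask pipeline (the prompt is an illustrative example, not from the original):

from transformers import pipeline

fill_mask = pipeline("fill-mask", model=model, tokenizer=tokenizer)
# distilroberta-base uses <mask> as its mask token
print(fill_mask("The capital of the United States is <mask>."))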
training_args = TrainingArguments(
    f"{model_name}-finetuned-america",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    weight_decay=0.01,
)
from transformers import DataCollatorForLanguageModeling

# Randomly mask 15% of tokens at collation time so the model has an MLM objective to train on
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm_probability=0.15)

# Hold out 20% of the data for evaluation (a shuffled split; the original float slicing is invalid)
split = tokenized_datasets["train"].train_test_split(test_size=0.2)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=split["train"],
    eval_dataset=split["test"],
    data_collator=data_collator,
)
trainer.train()
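After training, the evaluation loss can be converted to perplexity, the usual metric for language-model fine-tuning; a short sketch:

import math

eval_results = trainer.evaluate()
print(f"Perplexity: {math.exp(eval_results['eval_loss']):.2f}")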