challenging-america-word-ga.../finetune_roberta.ipynb


from transformers import AutoTokenizer
from datasets import load_dataset

Dataset prep

model_checkpoint = "distilroberta-base"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, use_fast=True)
# The corpus load is missing in the original notebook; the data_files path below is a placeholder -- point it at your local copy
raw_datasets = load_dataset("text", data_files={"train": "train.txt"})
def tokenize_function(examples):
    return tokenizer(examples["text"], max_length=512, truncation=True)
tokenized_datasets = raw_datasets.map(tokenize_function, batched=True, num_proc=4, remove_columns=["text"])
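A quick sanity check (a sketch, assuming the placeholder load above): inspect one tokenized example to confirm the text column was replaced by token ids.

sample = tokenized_datasets["train"][0]
print(sample.keys())                          # input_ids, attention_mask
print(tokenizer.decode(sample["input_ids"][:20]))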

Model training

from transformers import AutoModelForMaskedLM
from transformers import Trainer, TrainingArguments
model = AutoModelForMaskedLM.from_pretrained(model_checkpoint)
model_name = model_checkpoint.split("/")[-1]
training_args = TrainingArguments(
    f"{model_name}-finetuned-america",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    weight_decay=0.01,
)
from transformers import DataCollatorForLanguageModeling
# The MLM data collator masks random tokens on the fly and builds the labels
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm_probability=0.15)
# Hold out 20% of the tokenized data for evaluation (integer-safe split instead of float slicing)
split = tokenized_datasets["train"].train_test_split(test_size=0.2)
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=split["train"],
    eval_dataset=split["test"],
    data_collator=data_collator,
)
trainer.train()
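Once training finishes, the fine-tuned model can fill word gaps directly. A minimal inference sketch, assuming the output directory name set in TrainingArguments above; the example sentence is illustrative only.

from transformers import pipeline
trainer.save_model()  # write the final weights to the output_dir
fill_mask = pipeline("fill-mask", model=f"{model_name}-finetuned-america", tokenizer=tokenizer)
for pred in fill_mask(f"The capital of the United States is {tokenizer.mask_token}."):
    print(pred["token_str"], round(pred["score"], 3))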