challenging-america-word-ga.../finetune_roberta.ipynb
2023-06-27 19:16:04 +02:00

159 lines
3.5 KiB
Plaintext

{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"from transformers import AutoTokenizer\n",
"from datasets import load_dataset"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"# Dataset prep"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"model_checkpoint = \"distilroberta-base\""
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, use_fast=True)"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [],
"source": [
"def tokenize_function(examples):\n",
" return tokenizer(examples[\"text\"], max_length=512, truncation=True)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"tokenized_datasets = datasets.map(tokenize_function, batched=True, num_proc=4, remove_columns=[\"text\"])"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"# Model training"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from transformers import AutoModelForMaskedLM\n",
"from transformers import Trainer, TrainingArguments"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [
{
"ename": "NameError",
"evalue": "name 'AutoModelForMaskedLM' is not defined",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)",
"Cell \u001b[0;32mIn[11], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m model \u001b[39m=\u001b[39m AutoModelForMaskedLM\u001b[39m.\u001b[39mfrom_pretrained(model_checkpoint)\n",
"\u001b[0;31mNameError\u001b[0m: name 'AutoModelForMaskedLM' is not defined"
]
}
],
"source": [
"model = AutoModelForMaskedLM.from_pretrained(model_checkpoint)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"model_name = model_checkpoint.split(\"/\")[-1]\n",
"training_args = TrainingArguments(\n",
" f\"{model_name}-finetuned-america\",\n",
" evaluation_strategy = \"epoch\",\n",
" learning_rate=2e-5,\n",
" weight_decay=0.01,\n",
")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"trainer = Trainer(\n",
" model=model,\n",
" args=training_args,\n",
" train_dataset=dataset[:len(dataset)*0.8],\n",
" eval_dataset=dataset[len(dataset)*0.8:]\n",
")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"trainer.train()"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "base",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.12"
},
"orig_nbformat": 4
},
"nbformat": 4,
"nbformat_minor": 2
}