{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Fine-tuning `distilroberta-base` with masked language modelling\n",
    "\n",
    "Tokenizes a text corpus, holds out the trailing 20% of it for evaluation and fine-tunes the checkpoint with the Hugging Face `Trainer`.\n",
    "\n",
    "**TODO:** the source corpus was never recorded in this notebook; confirm the dataset settings in the configuration cell before running."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from datasets import load_dataset\n",
    "from transformers import (\n",
    "    AutoModelForMaskedLM,\n",
    "    AutoTokenizer,\n",
    "    DataCollatorForLanguageModeling,\n",
    "    Trainer,\n",
    "    TrainingArguments,\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Configuration"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "model_checkpoint = \"distilroberta-base\"\n",
    "\n",
    "# NOTE(review): the original notebook never called `load_dataset`, so the real corpus is\n",
    "# unknown. These two values are placeholders: point them at a dataset with a \"text\" column.\n",
    "DATASET_PATH = \"text\"  # builder name or Hub dataset id\n",
    "DATA_FILES = \"data/corpus.txt\"  # set to None when DATASET_PATH is a Hub dataset id\n",
    "\n",
    "MAX_LENGTH = 512  # tokens kept per example; longer texts are truncated\n",
    "NUM_PROC = 4  # worker processes used while tokenizing\n",
    "EVAL_FRACTION = 0.2  # trailing share of the corpus held out for evaluation\n",
    "MLM_PROBABILITY = 0.15  # share of tokens masked in each training batch"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Dataset prep"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "raw_dataset = load_dataset(DATASET_PATH, data_files=DATA_FILES, split=\"train\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, use_fast=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def tokenize_function(examples):\n",
    "    \"\"\"Tokenize the \"text\" field of a batch, truncating each entry to MAX_LENGTH tokens.\"\"\"\n",
    "    return tokenizer(examples[\"text\"], max_length=MAX_LENGTH, truncation=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "tokenized_datasets = raw_dataset.map(\n",
    "    tokenize_function,\n",
    "    batched=True,\n",
    "    num_proc=NUM_PROC,\n",
    "    remove_columns=[\"text\"],\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# shuffle=False keeps the original intent: leading 80% for training, trailing 20% for evaluation.\n",
    "split_datasets = tokenized_datasets.train_test_split(test_size=EVAL_FRACTION, shuffle=False)\n",
    "assert len(split_datasets[\"train\"]) + len(split_datasets[\"test\"]) == len(tokenized_datasets)\n",
    "split_datasets"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Model training"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "model = AutoModelForMaskedLM.from_pretrained(model_checkpoint)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# The tokenized examples are unpadded and carry no labels; this collator pads each batch\n",
    "# and builds the masked-token labels that the masked-LM loss is computed from.\n",
    "data_collator = DataCollatorForLanguageModeling(\n",
    "    tokenizer=tokenizer,\n",
    "    mlm=True,\n",
    "    mlm_probability=MLM_PROBABILITY,\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "model_name = model_checkpoint.split(\"/\")[-1]\n",
    "training_args = TrainingArguments(\n",
    "    f\"{model_name}-finetuned-america\",\n",
    "    evaluation_strategy=\"epoch\",\n",
    "    learning_rate=2e-5,\n",
    "    weight_decay=0.01,\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "trainer = Trainer(\n",
    "    model=model,\n",
    "    args=training_args,\n",
    "    train_dataset=split_datasets[\"train\"],\n",
    "    eval_dataset=split_datasets[\"test\"],\n",
    "    data_collator=data_collator,\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "trainer.train()"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "base",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.12"
  },
  "orig_nbformat": 4
 },
 "nbformat": 4,
 "nbformat_minor": 2
}