{ "cells": [ { "cell_type": "markdown", "metadata": { "id": "ZXsOR6oJOJbd" }, "source": [ "# Instalacja pakietów" ] }, { "cell_type": "code", "execution_count": 1, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "8l0hzptKNiZS", "outputId": "29622435-2f50-4c0a-d921-e0bee5470440" }, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/\n", "Collecting transformers\n", " Downloading transformers-4.26.1-py3-none-any.whl (6.3 MB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m6.3/6.3 MB\u001b[0m \u001b[31m47.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25hCollecting datasets\n", " Downloading datasets-2.9.0-py3-none-any.whl (462 kB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m462.8/462.8 KB\u001b[0m \u001b[31m23.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25hRequirement already satisfied: torch in /usr/local/lib/python3.8/dist-packages (1.13.1+cu116)\n", "Collecting sentencepiece\n", " Downloading sentencepiece-0.1.97-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.3/1.3 MB\u001b[0m \u001b[31m13.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25hRequirement already satisfied: regex!=2019.12.17 in /usr/local/lib/python3.8/dist-packages (from transformers) (2022.6.2)\n", "Collecting tokenizers!=0.11.3,<0.14,>=0.11.1\n", " Downloading tokenizers-0.13.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m7.6/7.6 MB\u001b[0m \u001b[31m43.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25hRequirement already satisfied: pyyaml>=5.1 in /usr/local/lib/python3.8/dist-packages (from transformers) (6.0)\n", 
"Collecting huggingface-hub<1.0,>=0.11.0\n", " Downloading huggingface_hub-0.12.0-py3-none-any.whl (190 kB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m190.3/190.3 KB\u001b[0m \u001b[31m7.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25hRequirement already satisfied: tqdm>=4.27 in /usr/local/lib/python3.8/dist-packages (from transformers) (4.64.1)\n", "Requirement already satisfied: requests in /usr/local/lib/python3.8/dist-packages (from transformers) (2.25.1)\n", "Requirement already satisfied: packaging>=20.0 in /usr/local/lib/python3.8/dist-packages (from transformers) (23.0)\n", "Requirement already satisfied: filelock in /usr/local/lib/python3.8/dist-packages (from transformers) (3.9.0)\n", "Requirement already satisfied: numpy>=1.17 in /usr/local/lib/python3.8/dist-packages (from transformers) (1.21.6)\n", "Collecting xxhash\n", " Downloading xxhash-3.2.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (213 kB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m213.0/213.0 KB\u001b[0m \u001b[31m9.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25hRequirement already satisfied: pyarrow>=6.0.0 in /usr/local/lib/python3.8/dist-packages (from datasets) (9.0.0)\n", "Requirement already satisfied: aiohttp in /usr/local/lib/python3.8/dist-packages (from datasets) (3.8.3)\n", "Collecting responses<0.19\n", " Downloading responses-0.18.0-py3-none-any.whl (38 kB)\n", "Collecting multiprocess\n", " Downloading multiprocess-0.70.14-py38-none-any.whl (132 kB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m132.0/132.0 KB\u001b[0m \u001b[31m2.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25hRequirement already satisfied: pandas in /usr/local/lib/python3.8/dist-packages (from datasets) (1.3.5)\n", "Requirement already satisfied: fsspec[http]>=2021.11.1 in /usr/local/lib/python3.8/dist-packages (from datasets) 
(2023.1.0)\n", "Requirement already satisfied: dill<0.3.7 in /usr/local/lib/python3.8/dist-packages (from datasets) (0.3.6)\n", "Requirement already satisfied: typing-extensions in /usr/local/lib/python3.8/dist-packages (from torch) (4.4.0)\n", "Requirement already satisfied: multidict<7.0,>=4.5 in /usr/local/lib/python3.8/dist-packages (from aiohttp->datasets) (6.0.4)\n", "Requirement already satisfied: charset-normalizer<3.0,>=2.0 in /usr/local/lib/python3.8/dist-packages (from aiohttp->datasets) (2.1.1)\n", "Requirement already satisfied: async-timeout<5.0,>=4.0.0a3 in /usr/local/lib/python3.8/dist-packages (from aiohttp->datasets) (4.0.2)\n", "Requirement already satisfied: frozenlist>=1.1.1 in /usr/local/lib/python3.8/dist-packages (from aiohttp->datasets) (1.3.3)\n", "Requirement already satisfied: aiosignal>=1.1.2 in /usr/local/lib/python3.8/dist-packages (from aiohttp->datasets) (1.3.1)\n", "Requirement already satisfied: yarl<2.0,>=1.0 in /usr/local/lib/python3.8/dist-packages (from aiohttp->datasets) (1.8.2)\n", "Requirement already satisfied: attrs>=17.3.0 in /usr/local/lib/python3.8/dist-packages (from aiohttp->datasets) (22.2.0)\n", "Requirement already satisfied: chardet<5,>=3.0.2 in /usr/local/lib/python3.8/dist-packages (from requests->transformers) (4.0.0)\n", "Requirement already satisfied: urllib3<1.27,>=1.21.1 in /usr/local/lib/python3.8/dist-packages (from requests->transformers) (1.24.3)\n", "Requirement already satisfied: idna<3,>=2.5 in /usr/local/lib/python3.8/dist-packages (from requests->transformers) (2.10)\n", "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.8/dist-packages (from requests->transformers) (2022.12.7)\n", "Collecting urllib3<1.27,>=1.21.1\n", " Downloading urllib3-1.26.14-py2.py3-none-any.whl (140 kB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m140.6/140.6 KB\u001b[0m \u001b[31m9.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25hRequirement 
already satisfied: python-dateutil>=2.7.3 in /usr/local/lib/python3.8/dist-packages (from pandas->datasets) (2.8.2)\n", "Requirement already satisfied: pytz>=2017.3 in /usr/local/lib/python3.8/dist-packages (from pandas->datasets) (2022.7.1)\n", "Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.8/dist-packages (from python-dateutil>=2.7.3->pandas->datasets) (1.15.0)\n", "Installing collected packages: tokenizers, sentencepiece, xxhash, urllib3, multiprocess, responses, huggingface-hub, transformers, datasets\n", " Attempting uninstall: urllib3\n", " Found existing installation: urllib3 1.24.3\n", " Uninstalling urllib3-1.24.3:\n", " Successfully uninstalled urllib3-1.24.3\n", "Successfully installed datasets-2.9.0 huggingface-hub-0.12.0 multiprocess-0.70.14 responses-0.18.0 sentencepiece-0.1.97 tokenizers-0.13.2 transformers-4.26.1 urllib3-1.26.14 xxhash-3.2.0\n" ] } ], "source": [ "!pip install transformers datasets torch sentencepiece" ] }, { "cell_type": "markdown", "metadata": { "id": "dhN0rmb5Oi3d" }, "source": [ "# Załadowanie datasetu" ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "id": "tnaDkwZ2Pbnn" }, "outputs": [], "source": [ "from datasets import load_dataset" ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 231, "referenced_widgets": [ "d481c2d945214c45b716d266d5d75184", "b59689c435074e14a5ffab86f7358d79", "8e53b6fe3c9b4ddb95c6bc13431f330e", "a8e97add56944729a2ead40f46e0acd8", "5e960877834348318c208650676ac592", "df1e772af2fa43fd8d705e82aae29a28", "5de4c85766684223a885de341a2375a9", "951d4aa7857542599bfa653eb6e98dd8", "57e82888c8764d67b04362742b177118", "04c3671ec63e4e19a29df60b9e35b9f4", "6d4ff2e0c30e4b4891e1882a246191a7", "0a4967eb8656482988be61b4e884ab6b", "70bf1a3b71c349678cead7c1e7f59408", "692a335df661445cb4dfa84a60708884", "b37e5ec6cb7c4a07a904c8924f6d3ac9", "129641fbac844cda9261d89ba1b02a9c", "286c12dec8c84e5fbebc9bfecdcaf362", 
"c1ccf4622a57496684a1232730d12aae", "fe4d858e5a12465bb3e1ed4f0964a592", "d6d4fa7b46a24c90832fda7d030f729a", "09f51cd5ae6140d7b7d93f0682954024", "a35de7678ed04a0094d09162d6dc719b", "36974fdfee5141b7850289e25b23df3c", "5c5294c17e2a4c52912d9722210886cd", "99279b2dc5004a4f878e6e4d08257fb6", "ff6597d5d8cd457f9e8290339fcb42f5", "434546b6ac0a404aa36700a170709688", "973260a8e814424aac53d061ba9325bd", "f983b467dc0c433995aa865e5055fe05", "b5a31b3ad4224d009745b9604088476f", "dda925b25f6d42c5aca2fb619f3625e5", "f99fca76deeb408c8a953712adc4658c", "0b2f4e8e08474341be8e46ee255d8451", "4bbbc1eaabb0460cba315a082cd69783", "ec970d97055b434cac97bca5b1d24069", "6d28f8333ec2447cbf9f03d333d86e82", "af29ca374d42465b93525a7cb54c7869", "f6c0f046a62a489497e659d6a786324a", "0d1267b2242d4a93ab5791a0f45aaa84", "2af172c9becf4b24a12afa75f7c23bfa", "9c9c5376fbed445dbea430c02cff9194", "ca515e1f8f154ec398c8a823629a5622", "c93b2a9a14134fccb9b815e3228ed56a", "90d9bc90d3944b4297b13af2ba123f99", "7f68486bff314b178c4fdb5b54cca92e", "e11096eae0ef432dabb00cb49c5e96b8", "84576cd6a7c64d6f809dc708b137134c", "c096402cab6e4e4b911c4aa41b285e1b", "556b1cb4381b473db666c64ca193585c", "9213dd04267b42ffb51918b5a6e8bfe0", "69b79fd9c03b4801bef03d27604ef51c", "4e6240934876400db18bacae780b9839", "616c196921514481aef009cde26fe0a8", "6ac66769ad604b66aa658bf52b11914a", "75db994dd2c24faaa36a2fdddcd11fb9", "0cdcd5131b5148bf8ef7130e741e7f9b", "296cc18ff4b5480e9cf22136589b23b2", "1c465c9fe6314a75aefcf6293c79f800", "03cc9c6818ff45c1b791557da43d5d77", "19d794a37566410592951ac2f7d6f8d4", "7a965267d797415a94c4b1d08eb11e63", "1ea948fb5ea54ae3ae24fc124d64ac45", "454abc04b69a46b5b28c10be09db3adf", "41b58dbbb09c4b9497b0acb4a7484f45", "7a0a7e8ab334489d8d363459b9e0bd37", "7071959c1e2141d58bceb19e1e0759f3" ] }, "id": "cCiAuRqrOkvV", "outputId": "22cf5f97-411a-4e6c-b0bf-28b477607386" }, "outputs": [ { "output_type": "display_data", "data": { "text/plain": [ "Downloading builder script: 0%| | 0.00/3.21k [00:00, ?B/s]" ], 
"application/vnd.jupyter.widget-view+json": { "version_major": 2, "version_minor": 0, "model_id": "d481c2d945214c45b716d266d5d75184" } }, "metadata": {} }, { "output_type": "display_data", "data": { "text/plain": [ "Downloading metadata: 0%| | 0.00/1.69k [00:00, ?B/s]" ], "application/vnd.jupyter.widget-view+json": { "version_major": 2, "version_minor": 0, "model_id": "0a4967eb8656482988be61b4e884ab6b" } }, "metadata": {} }, { "output_type": "display_data", "data": { "text/plain": [ "Downloading readme: 0%| | 0.00/4.87k [00:00, ?B/s]" ], "application/vnd.jupyter.widget-view+json": { "version_major": 2, "version_minor": 0, "model_id": "36974fdfee5141b7850289e25b23df3c" } }, "metadata": {} }, { "output_type": "stream", "name": "stdout", "text": [ "Downloading and preparing dataset sms_spam/plain_text to /root/.cache/huggingface/datasets/sms_spam/plain_text/1.0.0/53f051d3b5f62d99d61792c91acefe4f1577ad3e4c216fb0ad39e30b9f20019c...\n" ] }, { "output_type": "display_data", "data": { "text/plain": [ "Downloading data: 0%| | 0.00/203k [00:00, ?B/s]" ], "application/vnd.jupyter.widget-view+json": { "version_major": 2, "version_minor": 0, "model_id": "4bbbc1eaabb0460cba315a082cd69783" } }, "metadata": {} }, { "output_type": "display_data", "data": { "text/plain": [ "Generating train split: 0%| | 0/5574 [00:00, ? examples/s]" ], "application/vnd.jupyter.widget-view+json": { "version_major": 2, "version_minor": 0, "model_id": "7f68486bff314b178c4fdb5b54cca92e" } }, "metadata": {} }, { "output_type": "stream", "name": "stdout", "text": [ "Dataset sms_spam downloaded and prepared to /root/.cache/huggingface/datasets/sms_spam/plain_text/1.0.0/53f051d3b5f62d99d61792c91acefe4f1577ad3e4c216fb0ad39e30b9f20019c. 
Subsequent calls will reuse this data.\n" ] }, { "output_type": "display_data", "data": { "text/plain": [ " 0%| | 0/1 [00:00, ?it/s]" ], "application/vnd.jupyter.widget-view+json": { "version_major": 2, "version_minor": 0, "model_id": "0cdcd5131b5148bf8ef7130e741e7f9b" } }, "metadata": {} } ], "source": [ "dataset = load_dataset(\"sms_spam\")" ] }, { "cell_type": "code", "execution_count": 4, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "JKFHPko3OnAV", "outputId": "2682981e-b242-4563-8e88-21f1b56f2d15" }, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "{'sms': 'Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...\\n',\n", " 'label': 0}" ] }, "metadata": {}, "execution_count": 4 } ], "source": [ "dataset['train'][0]" ] }, { "cell_type": "markdown", "metadata": { "id": "l140vJrgYxPr" }, "source": [ "# Modyfikacja datasetu - klasyfikacja" ] }, { "cell_type": "code", "execution_count": 5, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "1boUF-YiY3_y", "outputId": "f031453d-54d7-4fba-abc4-26ff8e110d96" }, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "{'sms': 'binary classification: Go until jurong point, crazy.. Available only in bugis n great world la e buffet... 
Cine there got amore wat...',\n", " 'label': '0'}" ] }, "metadata": {}, "execution_count": 5 } ], "source": [ "parsed_dataset = []\n", "\n", "for row in dataset['train']:\n", " text = \"binary classification: \" + row['sms'].replace(\"\\n\", \"\")\n", " new_row = {}\n", " new_row['sms'] = text\n", " if row['label'] == 0:\n", " new_row['label'] = \"0\"\n", " else:\n", " new_row['label'] = \"1\"\n", " parsed_dataset.append(new_row)\n", "\n", "parsed_dataset[0]" ] }, { "cell_type": "markdown", "metadata": { "id": "O-J-jBDxPJcn" }, "source": [ "# Tokenizer T5" ] }, { "cell_type": "code", "execution_count": 6, "metadata": { "id": "P23AYPX1PZ6g" }, "outputs": [], "source": [ "from transformers import T5Tokenizer" ] }, { "cell_type": "code", "execution_count": 7, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 203, "referenced_widgets": [ "e9563b43a79542b3ac4607b3b32fef36", "dd79782a047542c79cf638d50c4fb6b5", "ecada9c5179b40cfa2aba932c5a9aedd", "e64dfd8ce12043a48eb24ee8fafaaa5e", "fb537b30046a4b699cd0ac5d21c0cb5e", "e63a74cb438b488a9f1f5e340842e5e0", "4c3f3eeffbc946428cd198bf16fbdac0", "5f0ceff818cf43308975724aabc9d799", "e1e65e6fe54c4d60a6c93573d3cd1fd0", "b163c0caacc547199a2372e03bdabcce", "faf1774fb05940b8908415eafe3a1070", "bf509a7ff72a4e40bfea72feb958477c", "1479e73733d548d4881d163983549014", "1eff9f6872ca41feb2a67795d6b2af20", "054b198ab5174073bfaa2a951410dc37", "2f029974bf9643dab5e96eb15c6834b8", "28036b83d1fc438e913f8f3f4cb19391", "ef17ea825afa4cf8898f13c1cefa8475", "b31a9cf7195b47faaa22deb09fe1f0d1", "2c056fb1574a42eaaeb4d64c45b9e884", "28f716fbfec540819b13602a705e69cd", "5a0c5d19389e4ccbb552ed41111dfa2c" ] }, "id": "q5Jz0E_oPMBr", "outputId": "2a01700a-67b3-4347-afda-a2763b30c714" }, "outputs": [ { "output_type": "display_data", "data": { "text/plain": [ "Downloading (…)ve/main/spiece.model: 0%| | 0.00/792k [00:00, ?B/s]" ], "application/vnd.jupyter.widget-view+json": { "version_major": 2, "version_minor": 0, "model_id": 
"e9563b43a79542b3ac4607b3b32fef36" } }, "metadata": {} }, { "output_type": "display_data", "data": { "text/plain": [ "Downloading (…)lve/main/config.json: 0%| | 0.00/1.21k [00:00, ?B/s]" ], "application/vnd.jupyter.widget-view+json": { "version_major": 2, "version_minor": 0, "model_id": "bf509a7ff72a4e40bfea72feb958477c" } }, "metadata": {} }, { "output_type": "stream", "name": "stderr", "text": [ "/usr/local/lib/python3.8/dist-packages/transformers/models/t5/tokenization_t5.py:163: FutureWarning: This tokenizer was incorrectly instantiated with a model max length of 512 which will be corrected in Transformers v5.\n", "For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.\n", "- Be aware that you SHOULD NOT rely on t5-base automatically truncating your input to 512 when padding/encoding.\n", "- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.\n", "- To avoid this warning, please instantiate this tokenizer with `model_max_length` set to your preferred value.\n", " warnings.warn(\n" ] } ], "source": [ "tokenizer = T5Tokenizer.from_pretrained('t5-base')" ] }, { "cell_type": "code", "execution_count": 8, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "dfxJQpoePsvI", "outputId": "305252ef-73f2-4677-d7c9-31efc4d249a6" }, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "Original: binary classification: Go until jurong point, crazy.. Available only in bugis n great world la e buffet... 
Cine there got amore wat...\n", "Tokenized: ['▁binary', '▁classification', ':', '▁Go', '▁until', '▁jur', 'ong', '▁point', ',', '▁crazy', '.', '.', '▁Available', '▁only', '▁in', '▁bug', 'is', '▁', 'n', '▁great', '▁world', '▁la', '▁', 'e', '▁buffet', '...', '▁Cine', '▁there', '▁got', '▁', 'a', 'more', '▁wa', 't', '...']\n", "Token IDs: [14865, 13774, 10, 1263, 552, 10081, 2444, 500, 6, 6139, 5, 5, 8144, 163, 16, 8143, 159, 3, 29, 248, 296, 50, 3, 15, 15385, 233, 17270, 132, 530, 3, 9, 3706, 8036, 17, 233]\n" ] } ], "source": [ "sms = parsed_dataset[0]['sms']\n", "print('Original: ', sms)\n", "print('Tokenized: ', tokenizer.tokenize(sms))\n", "print('Token IDs: ', tokenizer.convert_tokens_to_ids(tokenizer.tokenize(sms)))" ] }, { "cell_type": "markdown", "metadata": { "id": "UpluhM8cU5Ir" }, "source": [ "# Check maximum lenght of a sentence" ] }, { "cell_type": "code", "execution_count": 9, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "7uNUkixPU85O", "outputId": "6812abc8-d279-4ee8-9947-9c9a5932e47c" }, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "Max sentence length: 341\n" ] } ], "source": [ "max_len = 0\n", "\n", "for sentence in parsed_dataset:\n", " input_ids = tokenizer.encode(sentence['sms'], add_special_tokens=True)\n", " max_len = max(max_len, len(input_ids))\n", "\n", "print('Max sentence length: ', max_len)" ] }, { "cell_type": "code", "execution_count": 10, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "lj0issBznZfK", "outputId": "9dccb0f2-6452-460b-b955-83468285a635" }, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "Max sentence length: 3\n" ] } ], "source": [ "max_label_len = 0\n", "\n", "for sentence in parsed_dataset:\n", " input_ids = tokenizer.encode(sentence['label'], add_special_tokens=True)\n", " max_label_len = max(max_label_len, len(input_ids))\n", "\n", "print('Max sentence length: ', max_label_len)" ] }, { "cell_type": "markdown", "metadata": { 
"id": "nfw62HdgSERb" }, "source": [ "# Pre train tokenization" ] }, { "cell_type": "code", "execution_count": 11, "metadata": { "id": "KTXYalS1VLqH" }, "outputs": [], "source": [ "import torch" ] }, { "cell_type": "code", "execution_count": 12, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "Z28QYfLnSGxR", "outputId": "bcb84fca-c7e0-4e4a-a8f6-ff3db86e143d" }, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "Original: {'sms': 'binary classification: Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...', 'label': '0'}\n", "Token IDs: tensor([14865, 13774, 10, 1263, 552, 10081, 2444, 500, 6, 6139,\n", " 5, 5, 8144, 163, 16, 8143, 159, 3, 29, 248,\n", " 296, 50, 3, 15, 15385, 233, 17270, 132, 530, 3,\n", " 9, 3706, 8036, 17, 233, 1, 0, 0, 0, 0,\n", " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", " 0])\n", "Label token IDs: tensor([ 3, 632, 1])\n" ] } ], 
"source": [ "input_ids = []\n", "target_ids = []\n", "attention_masks = []\n", "\n", "for sentence in parsed_dataset:\n", " encoded_dict = tokenizer.encode_plus(\n", " sentence['sms'],\n", " add_special_tokens = True,\n", " max_length = 341,\n", " padding = 'max_length',\n", " truncation=True,\n", " return_attention_mask = True,\n", " return_tensors = 'pt',\n", " )\n", " \n", " encoded_target_dict = tokenizer.encode_plus(\n", " sentence['label'],\n", " add_special_tokens = True,\n", " max_length = 3,\n", " padding = 'max_length',\n", " truncation=True,\n", " return_attention_mask = True,\n", " return_tensors = 'pt',\n", " )\n", " \n", " input_ids.append(encoded_dict['input_ids'])\n", " target_ids.append(encoded_target_dict['input_ids'])\n", " attention_masks.append(encoded_dict['attention_mask'])\n", "\n", "input_ids = torch.cat(input_ids, dim=0)\n", "target_ids = torch.cat(target_ids, dim=0)\n", "attention_masks = torch.cat(attention_masks, dim=0)\n", "\n", "print('Original: ', parsed_dataset[0])\n", "print('Token IDs:', input_ids[0])\n", "print('Label token IDs:', target_ids[0])" ] }, { "cell_type": "markdown", "metadata": { "id": "qD_t0y0KVVSy" }, "source": [ "# Split dataset" ] }, { "cell_type": "code", "execution_count": 13, "metadata": { "id": "vN_SatRIVa4c" }, "outputs": [], "source": [ "from torch.utils.data import TensorDataset, random_split" ] }, { "cell_type": "code", "execution_count": 14, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "Mm6vc6lLVW3l", "outputId": "2ff9533f-7117-4492-dae2-5a8dada86e41" }, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "1,000 test samples\n", "4,116 training samples\n", " 458 validation samples\n" ] } ], "source": [ "dataset = TensorDataset(input_ids, attention_masks, target_ids)\n", "\n", "test_size = 1000\n", "dataset_len = len(dataset)\n", "train_size = int(0.9 * (dataset_len-test_size))\n", "val_size = (dataset_len-test_size) - train_size\n", "\n", "test_dataset, 
train_dataset, val_dataset = random_split(dataset, [test_size, train_size, val_size])\n", "\n", "print('{:>5,} test samples'.format(test_size))\n", "print('{:>5,} training samples'.format(train_size))\n", "print('{:>5,} validation samples'.format(val_size))" ] }, { "cell_type": "markdown", "metadata": { "id": "bmgQOP4EVfA1" }, "source": [ "# Create train and validation loaders" ] }, { "cell_type": "code", "execution_count": 15, "metadata": { "id": "CxnQ3cmIVlNh" }, "outputs": [], "source": [ "from torch.utils.data import DataLoader, RandomSampler, SequentialSampler" ] }, { "cell_type": "code", "execution_count": 16, "metadata": { "id": "0hcpO_onVjEC" }, "outputs": [], "source": [ "batch_size = 16\n", "\n", "train_dataloader = DataLoader(\n", " train_dataset,\n", " sampler = RandomSampler(train_dataset),\n", " batch_size = batch_size\n", " )\n", "\n", "validation_dataloader = DataLoader(\n", " val_dataset,\n", " sampler = SequentialSampler(val_dataset),\n", " batch_size = batch_size\n", " )" ] }, { "cell_type": "markdown", "metadata": { "id": "efwhqLyyVu9z" }, "source": [ "# Device check" ] }, { "cell_type": "code", "execution_count": 17, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "ANBCfNGnVwVk", "outputId": "ff2ff959-f0e9-47f3-d504-9daa45f870c2" }, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "There are 1 GPU(s) available.\n", "We will use the GPU: Tesla T4\n" ] } ], "source": [ "if torch.cuda.is_available(): \n", " device = torch.device(\"cuda\")\n", "\n", " print('There are %d GPU(s) available.' 
% torch.cuda.device_count())\n", " print('We will use the GPU:', torch.cuda.get_device_name(0))\n", "\n", "else:\n", " print('No GPU available, using the CPU instead.')\n", " device = torch.device(\"cpu\")" ] }, { "cell_type": "markdown", "metadata": { "id": "okTx_ynMV0rH" }, "source": [ "# Load T5 model" ] }, { "cell_type": "code", "execution_count": 18, "metadata": { "id": "Eu-7Eed8WgN0" }, "outputs": [], "source": [ "from transformers import T5ForConditionalGeneration" ] }, { "cell_type": "code", "execution_count": 19, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 1000, "referenced_widgets": [ "68418b4f08654a2c8a19bdefa31ef7e2", "f59f1fe74df84329baa0137729651d7e", "4e6666f32de94c14973b2f5895c4f4ec", "9a8b0e9cf614453789dceff586f47682", "a4e1407e1a42416087a3138812851afa", "1813bc00d8db4de5a7bb7cd276346312", "ab6b0613a4934f34aad4d28cd855362d", "7514dfc8c5c34f29ab9a246ba6b45dc2", "017b00a3a26743d3a761a5b05f72fe73", "1cfe23326f964bb0a2925456aea14ad5", "384aac4ea3274eebbb43ea847036793a", "17986d272156460f8e9bcee2559088d9", "f1c7c8e7770848dabf155be27b342c6f", "719b8ebc46884edd9b36829f49680c98", "f28050af08f947678a41e1ea5611067f", "2ff5d9e91bf64330a2747c9c518ba31c", "85bd410d586b4ac98b8df72f980c0194", "feb7905c359e4acd9c9f848fb63d5d55", "b1d4154a8b054c8380a9ac70c311755b", "2fee9e3e54ae41c8977beaae6802010f", "42dc0f0578ed4105abeee4362667a98a", "04bb3488deec4565a0864049b122437d" ] }, "id": "JKv9O8kfV2zZ", "outputId": "ad88a39b-bdc7-4325-b588-ed5feb453c3e" }, "outputs": [ { "output_type": "display_data", "data": { "text/plain": [ "Downloading (…)\"pytorch_model.bin\";: 0%| | 0.00/892M [00:00, ?B/s]" ], "application/vnd.jupyter.widget-view+json": { "version_major": 2, "version_minor": 0, "model_id": "68418b4f08654a2c8a19bdefa31ef7e2" } }, "metadata": {} }, { "output_type": "display_data", "data": { "text/plain": [ "Downloading (…)neration_config.json: 0%| | 0.00/147 [00:00, ?B/s]" ], "application/vnd.jupyter.widget-view+json": { "version_major": 
2, "version_minor": 0, "model_id": "17986d272156460f8e9bcee2559088d9" } }, "metadata": {} }, { "output_type": "execute_result", "data": { "text/plain": [ "T5ForConditionalGeneration(\n", " (shared): Embedding(32128, 768)\n", " (encoder): T5Stack(\n", " (embed_tokens): Embedding(32128, 768)\n", " (block): ModuleList(\n", " (0): T5Block(\n", " (layer): ModuleList(\n", " (0): T5LayerSelfAttention(\n", " (SelfAttention): T5Attention(\n", " (q): Linear(in_features=768, out_features=768, bias=False)\n", " (k): Linear(in_features=768, out_features=768, bias=False)\n", " (v): Linear(in_features=768, out_features=768, bias=False)\n", " (o): Linear(in_features=768, out_features=768, bias=False)\n", " (relative_attention_bias): Embedding(32, 12)\n", " )\n", " (layer_norm): T5LayerNorm()\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " (1): T5LayerFF(\n", " (DenseReluDense): T5DenseActDense(\n", " (wi): Linear(in_features=768, out_features=3072, bias=False)\n", " (wo): Linear(in_features=3072, out_features=768, bias=False)\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " (act): ReLU()\n", " )\n", " (layer_norm): T5LayerNorm()\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " )\n", " )\n", " (1): T5Block(\n", " (layer): ModuleList(\n", " (0): T5LayerSelfAttention(\n", " (SelfAttention): T5Attention(\n", " (q): Linear(in_features=768, out_features=768, bias=False)\n", " (k): Linear(in_features=768, out_features=768, bias=False)\n", " (v): Linear(in_features=768, out_features=768, bias=False)\n", " (o): Linear(in_features=768, out_features=768, bias=False)\n", " )\n", " (layer_norm): T5LayerNorm()\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " (1): T5LayerFF(\n", " (DenseReluDense): T5DenseActDense(\n", " (wi): Linear(in_features=768, out_features=3072, bias=False)\n", " (wo): Linear(in_features=3072, out_features=768, bias=False)\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " (act): ReLU()\n", " )\n", " (layer_norm): 
T5LayerNorm()\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " )\n", " )\n", " (2): T5Block(\n", " (layer): ModuleList(\n", " (0): T5LayerSelfAttention(\n", " (SelfAttention): T5Attention(\n", " (q): Linear(in_features=768, out_features=768, bias=False)\n", " (k): Linear(in_features=768, out_features=768, bias=False)\n", " (v): Linear(in_features=768, out_features=768, bias=False)\n", " (o): Linear(in_features=768, out_features=768, bias=False)\n", " )\n", " (layer_norm): T5LayerNorm()\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " (1): T5LayerFF(\n", " (DenseReluDense): T5DenseActDense(\n", " (wi): Linear(in_features=768, out_features=3072, bias=False)\n", " (wo): Linear(in_features=3072, out_features=768, bias=False)\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " (act): ReLU()\n", " )\n", " (layer_norm): T5LayerNorm()\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " )\n", " )\n", " (3): T5Block(\n", " (layer): ModuleList(\n", " (0): T5LayerSelfAttention(\n", " (SelfAttention): T5Attention(\n", " (q): Linear(in_features=768, out_features=768, bias=False)\n", " (k): Linear(in_features=768, out_features=768, bias=False)\n", " (v): Linear(in_features=768, out_features=768, bias=False)\n", " (o): Linear(in_features=768, out_features=768, bias=False)\n", " )\n", " (layer_norm): T5LayerNorm()\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " (1): T5LayerFF(\n", " (DenseReluDense): T5DenseActDense(\n", " (wi): Linear(in_features=768, out_features=3072, bias=False)\n", " (wo): Linear(in_features=3072, out_features=768, bias=False)\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " (act): ReLU()\n", " )\n", " (layer_norm): T5LayerNorm()\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " )\n", " )\n", " (4): T5Block(\n", " (layer): ModuleList(\n", " (0): T5LayerSelfAttention(\n", " (SelfAttention): T5Attention(\n", " (q): Linear(in_features=768, out_features=768, bias=False)\n", " (k): 
Linear(in_features=768, out_features=768, bias=False)\n", " (v): Linear(in_features=768, out_features=768, bias=False)\n", " (o): Linear(in_features=768, out_features=768, bias=False)\n", " )\n", " (layer_norm): T5LayerNorm()\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " (1): T5LayerFF(\n", " (DenseReluDense): T5DenseActDense(\n", " (wi): Linear(in_features=768, out_features=3072, bias=False)\n", " (wo): Linear(in_features=3072, out_features=768, bias=False)\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " (act): ReLU()\n", " )\n", " (layer_norm): T5LayerNorm()\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " )\n", " )\n", " (5): T5Block(\n", " (layer): ModuleList(\n", " (0): T5LayerSelfAttention(\n", " (SelfAttention): T5Attention(\n", " (q): Linear(in_features=768, out_features=768, bias=False)\n", " (k): Linear(in_features=768, out_features=768, bias=False)\n", " (v): Linear(in_features=768, out_features=768, bias=False)\n", " (o): Linear(in_features=768, out_features=768, bias=False)\n", " )\n", " (layer_norm): T5LayerNorm()\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " (1): T5LayerFF(\n", " (DenseReluDense): T5DenseActDense(\n", " (wi): Linear(in_features=768, out_features=3072, bias=False)\n", " (wo): Linear(in_features=3072, out_features=768, bias=False)\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " (act): ReLU()\n", " )\n", " (layer_norm): T5LayerNorm()\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " )\n", " )\n", " (6): T5Block(\n", " (layer): ModuleList(\n", " (0): T5LayerSelfAttention(\n", " (SelfAttention): T5Attention(\n", " (q): Linear(in_features=768, out_features=768, bias=False)\n", " (k): Linear(in_features=768, out_features=768, bias=False)\n", " (v): Linear(in_features=768, out_features=768, bias=False)\n", " (o): Linear(in_features=768, out_features=768, bias=False)\n", " )\n", " (layer_norm): T5LayerNorm()\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " (1): 
T5LayerFF(\n", " (DenseReluDense): T5DenseActDense(\n", " (wi): Linear(in_features=768, out_features=3072, bias=False)\n", " (wo): Linear(in_features=3072, out_features=768, bias=False)\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " (act): ReLU()\n", " )\n", " (layer_norm): T5LayerNorm()\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " )\n", " )\n", " (7): T5Block(\n", " (layer): ModuleList(\n", " (0): T5LayerSelfAttention(\n", " (SelfAttention): T5Attention(\n", " (q): Linear(in_features=768, out_features=768, bias=False)\n", " (k): Linear(in_features=768, out_features=768, bias=False)\n", " (v): Linear(in_features=768, out_features=768, bias=False)\n", " (o): Linear(in_features=768, out_features=768, bias=False)\n", " )\n", " (layer_norm): T5LayerNorm()\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " (1): T5LayerFF(\n", " (DenseReluDense): T5DenseActDense(\n", " (wi): Linear(in_features=768, out_features=3072, bias=False)\n", " (wo): Linear(in_features=3072, out_features=768, bias=False)\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " (act): ReLU()\n", " )\n", " (layer_norm): T5LayerNorm()\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " )\n", " )\n", " (8): T5Block(\n", " (layer): ModuleList(\n", " (0): T5LayerSelfAttention(\n", " (SelfAttention): T5Attention(\n", " (q): Linear(in_features=768, out_features=768, bias=False)\n", " (k): Linear(in_features=768, out_features=768, bias=False)\n", " (v): Linear(in_features=768, out_features=768, bias=False)\n", " (o): Linear(in_features=768, out_features=768, bias=False)\n", " )\n", " (layer_norm): T5LayerNorm()\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " (1): T5LayerFF(\n", " (DenseReluDense): T5DenseActDense(\n", " (wi): Linear(in_features=768, out_features=3072, bias=False)\n", " (wo): Linear(in_features=3072, out_features=768, bias=False)\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " (act): ReLU()\n", " )\n", " (layer_norm): T5LayerNorm()\n", 
" (dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " )\n", " )\n", " (9): T5Block(\n", " (layer): ModuleList(\n", " (0): T5LayerSelfAttention(\n", " (SelfAttention): T5Attention(\n", " (q): Linear(in_features=768, out_features=768, bias=False)\n", " (k): Linear(in_features=768, out_features=768, bias=False)\n", " (v): Linear(in_features=768, out_features=768, bias=False)\n", " (o): Linear(in_features=768, out_features=768, bias=False)\n", " )\n", " (layer_norm): T5LayerNorm()\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " (1): T5LayerFF(\n", " (DenseReluDense): T5DenseActDense(\n", " (wi): Linear(in_features=768, out_features=3072, bias=False)\n", " (wo): Linear(in_features=3072, out_features=768, bias=False)\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " (act): ReLU()\n", " )\n", " (layer_norm): T5LayerNorm()\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " )\n", " )\n", " (10): T5Block(\n", " (layer): ModuleList(\n", " (0): T5LayerSelfAttention(\n", " (SelfAttention): T5Attention(\n", " (q): Linear(in_features=768, out_features=768, bias=False)\n", " (k): Linear(in_features=768, out_features=768, bias=False)\n", " (v): Linear(in_features=768, out_features=768, bias=False)\n", " (o): Linear(in_features=768, out_features=768, bias=False)\n", " )\n", " (layer_norm): T5LayerNorm()\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " (1): T5LayerFF(\n", " (DenseReluDense): T5DenseActDense(\n", " (wi): Linear(in_features=768, out_features=3072, bias=False)\n", " (wo): Linear(in_features=3072, out_features=768, bias=False)\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " (act): ReLU()\n", " )\n", " (layer_norm): T5LayerNorm()\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " )\n", " )\n", " (11): T5Block(\n", " (layer): ModuleList(\n", " (0): T5LayerSelfAttention(\n", " (SelfAttention): T5Attention(\n", " (q): Linear(in_features=768, out_features=768, bias=False)\n", " (k): Linear(in_features=768, 
out_features=768, bias=False)\n", " (v): Linear(in_features=768, out_features=768, bias=False)\n", " (o): Linear(in_features=768, out_features=768, bias=False)\n", " )\n", " (layer_norm): T5LayerNorm()\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " (1): T5LayerFF(\n", " (DenseReluDense): T5DenseActDense(\n", " (wi): Linear(in_features=768, out_features=3072, bias=False)\n", " (wo): Linear(in_features=3072, out_features=768, bias=False)\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " (act): ReLU()\n", " )\n", " (layer_norm): T5LayerNorm()\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " )\n", " )\n", " )\n", " (final_layer_norm): T5LayerNorm()\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " (decoder): T5Stack(\n", " (embed_tokens): Embedding(32128, 768)\n", " (block): ModuleList(\n", " (0): T5Block(\n", " (layer): ModuleList(\n", " (0): T5LayerSelfAttention(\n", " (SelfAttention): T5Attention(\n", " (q): Linear(in_features=768, out_features=768, bias=False)\n", " (k): Linear(in_features=768, out_features=768, bias=False)\n", " (v): Linear(in_features=768, out_features=768, bias=False)\n", " (o): Linear(in_features=768, out_features=768, bias=False)\n", " (relative_attention_bias): Embedding(32, 12)\n", " )\n", " (layer_norm): T5LayerNorm()\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " (1): T5LayerCrossAttention(\n", " (EncDecAttention): T5Attention(\n", " (q): Linear(in_features=768, out_features=768, bias=False)\n", " (k): Linear(in_features=768, out_features=768, bias=False)\n", " (v): Linear(in_features=768, out_features=768, bias=False)\n", " (o): Linear(in_features=768, out_features=768, bias=False)\n", " )\n", " (layer_norm): T5LayerNorm()\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " (2): T5LayerFF(\n", " (DenseReluDense): T5DenseActDense(\n", " (wi): Linear(in_features=768, out_features=3072, bias=False)\n", " (wo): Linear(in_features=3072, out_features=768, bias=False)\n", " 
(dropout): Dropout(p=0.1, inplace=False)\n", " (act): ReLU()\n", " )\n", " (layer_norm): T5LayerNorm()\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " )\n", " )\n", " (1): T5Block(\n", " (layer): ModuleList(\n", " (0): T5LayerSelfAttention(\n", " (SelfAttention): T5Attention(\n", " (q): Linear(in_features=768, out_features=768, bias=False)\n", " (k): Linear(in_features=768, out_features=768, bias=False)\n", " (v): Linear(in_features=768, out_features=768, bias=False)\n", " (o): Linear(in_features=768, out_features=768, bias=False)\n", " )\n", " (layer_norm): T5LayerNorm()\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " (1): T5LayerCrossAttention(\n", " (EncDecAttention): T5Attention(\n", " (q): Linear(in_features=768, out_features=768, bias=False)\n", " (k): Linear(in_features=768, out_features=768, bias=False)\n", " (v): Linear(in_features=768, out_features=768, bias=False)\n", " (o): Linear(in_features=768, out_features=768, bias=False)\n", " )\n", " (layer_norm): T5LayerNorm()\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " (2): T5LayerFF(\n", " (DenseReluDense): T5DenseActDense(\n", " (wi): Linear(in_features=768, out_features=3072, bias=False)\n", " (wo): Linear(in_features=3072, out_features=768, bias=False)\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " (act): ReLU()\n", " )\n", " (layer_norm): T5LayerNorm()\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " )\n", " )\n", " (2): T5Block(\n", " (layer): ModuleList(\n", " (0): T5LayerSelfAttention(\n", " (SelfAttention): T5Attention(\n", " (q): Linear(in_features=768, out_features=768, bias=False)\n", " (k): Linear(in_features=768, out_features=768, bias=False)\n", " (v): Linear(in_features=768, out_features=768, bias=False)\n", " (o): Linear(in_features=768, out_features=768, bias=False)\n", " )\n", " (layer_norm): T5LayerNorm()\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " (1): T5LayerCrossAttention(\n", " (EncDecAttention): 
T5Attention(\n", " (q): Linear(in_features=768, out_features=768, bias=False)\n", " (k): Linear(in_features=768, out_features=768, bias=False)\n", " (v): Linear(in_features=768, out_features=768, bias=False)\n", " (o): Linear(in_features=768, out_features=768, bias=False)\n", " )\n", " (layer_norm): T5LayerNorm()\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " (2): T5LayerFF(\n", " (DenseReluDense): T5DenseActDense(\n", " (wi): Linear(in_features=768, out_features=3072, bias=False)\n", " (wo): Linear(in_features=3072, out_features=768, bias=False)\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " (act): ReLU()\n", " )\n", " (layer_norm): T5LayerNorm()\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " )\n", " )\n", " (3): T5Block(\n", " (layer): ModuleList(\n", " (0): T5LayerSelfAttention(\n", " (SelfAttention): T5Attention(\n", " (q): Linear(in_features=768, out_features=768, bias=False)\n", " (k): Linear(in_features=768, out_features=768, bias=False)\n", " (v): Linear(in_features=768, out_features=768, bias=False)\n", " (o): Linear(in_features=768, out_features=768, bias=False)\n", " )\n", " (layer_norm): T5LayerNorm()\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " (1): T5LayerCrossAttention(\n", " (EncDecAttention): T5Attention(\n", " (q): Linear(in_features=768, out_features=768, bias=False)\n", " (k): Linear(in_features=768, out_features=768, bias=False)\n", " (v): Linear(in_features=768, out_features=768, bias=False)\n", " (o): Linear(in_features=768, out_features=768, bias=False)\n", " )\n", " (layer_norm): T5LayerNorm()\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " (2): T5LayerFF(\n", " (DenseReluDense): T5DenseActDense(\n", " (wi): Linear(in_features=768, out_features=3072, bias=False)\n", " (wo): Linear(in_features=3072, out_features=768, bias=False)\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " (act): ReLU()\n", " )\n", " (layer_norm): T5LayerNorm()\n", " (dropout): Dropout(p=0.1, 
inplace=False)\n", " )\n", " )\n", " )\n", " (4): T5Block(\n", " (layer): ModuleList(\n", " (0): T5LayerSelfAttention(\n", " (SelfAttention): T5Attention(\n", " (q): Linear(in_features=768, out_features=768, bias=False)\n", " (k): Linear(in_features=768, out_features=768, bias=False)\n", " (v): Linear(in_features=768, out_features=768, bias=False)\n", " (o): Linear(in_features=768, out_features=768, bias=False)\n", " )\n", " (layer_norm): T5LayerNorm()\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " (1): T5LayerCrossAttention(\n", " (EncDecAttention): T5Attention(\n", " (q): Linear(in_features=768, out_features=768, bias=False)\n", " (k): Linear(in_features=768, out_features=768, bias=False)\n", " (v): Linear(in_features=768, out_features=768, bias=False)\n", " (o): Linear(in_features=768, out_features=768, bias=False)\n", " )\n", " (layer_norm): T5LayerNorm()\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " (2): T5LayerFF(\n", " (DenseReluDense): T5DenseActDense(\n", " (wi): Linear(in_features=768, out_features=3072, bias=False)\n", " (wo): Linear(in_features=3072, out_features=768, bias=False)\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " (act): ReLU()\n", " )\n", " (layer_norm): T5LayerNorm()\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " )\n", " )\n", " (5): T5Block(\n", " (layer): ModuleList(\n", " (0): T5LayerSelfAttention(\n", " (SelfAttention): T5Attention(\n", " (q): Linear(in_features=768, out_features=768, bias=False)\n", " (k): Linear(in_features=768, out_features=768, bias=False)\n", " (v): Linear(in_features=768, out_features=768, bias=False)\n", " (o): Linear(in_features=768, out_features=768, bias=False)\n", " )\n", " (layer_norm): T5LayerNorm()\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " (1): T5LayerCrossAttention(\n", " (EncDecAttention): T5Attention(\n", " (q): Linear(in_features=768, out_features=768, bias=False)\n", " (k): Linear(in_features=768, out_features=768, bias=False)\n", " 
(v): Linear(in_features=768, out_features=768, bias=False)\n", " (o): Linear(in_features=768, out_features=768, bias=False)\n", " )\n", " (layer_norm): T5LayerNorm()\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " (2): T5LayerFF(\n", " (DenseReluDense): T5DenseActDense(\n", " (wi): Linear(in_features=768, out_features=3072, bias=False)\n", " (wo): Linear(in_features=3072, out_features=768, bias=False)\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " (act): ReLU()\n", " )\n", " (layer_norm): T5LayerNorm()\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " )\n", " )\n", " (6): T5Block(\n", " (layer): ModuleList(\n", " (0): T5LayerSelfAttention(\n", " (SelfAttention): T5Attention(\n", " (q): Linear(in_features=768, out_features=768, bias=False)\n", " (k): Linear(in_features=768, out_features=768, bias=False)\n", " (v): Linear(in_features=768, out_features=768, bias=False)\n", " (o): Linear(in_features=768, out_features=768, bias=False)\n", " )\n", " (layer_norm): T5LayerNorm()\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " (1): T5LayerCrossAttention(\n", " (EncDecAttention): T5Attention(\n", " (q): Linear(in_features=768, out_features=768, bias=False)\n", " (k): Linear(in_features=768, out_features=768, bias=False)\n", " (v): Linear(in_features=768, out_features=768, bias=False)\n", " (o): Linear(in_features=768, out_features=768, bias=False)\n", " )\n", " (layer_norm): T5LayerNorm()\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " (2): T5LayerFF(\n", " (DenseReluDense): T5DenseActDense(\n", " (wi): Linear(in_features=768, out_features=3072, bias=False)\n", " (wo): Linear(in_features=3072, out_features=768, bias=False)\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " (act): ReLU()\n", " )\n", " (layer_norm): T5LayerNorm()\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " )\n", " )\n", " (7): T5Block(\n", " (layer): ModuleList(\n", " (0): T5LayerSelfAttention(\n", " (SelfAttention): T5Attention(\n", " 
(q): Linear(in_features=768, out_features=768, bias=False)\n", " (k): Linear(in_features=768, out_features=768, bias=False)\n", " (v): Linear(in_features=768, out_features=768, bias=False)\n", " (o): Linear(in_features=768, out_features=768, bias=False)\n", " )\n", " (layer_norm): T5LayerNorm()\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " (1): T5LayerCrossAttention(\n", " (EncDecAttention): T5Attention(\n", " (q): Linear(in_features=768, out_features=768, bias=False)\n", " (k): Linear(in_features=768, out_features=768, bias=False)\n", " (v): Linear(in_features=768, out_features=768, bias=False)\n", " (o): Linear(in_features=768, out_features=768, bias=False)\n", " )\n", " (layer_norm): T5LayerNorm()\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " (2): T5LayerFF(\n", " (DenseReluDense): T5DenseActDense(\n", " (wi): Linear(in_features=768, out_features=3072, bias=False)\n", " (wo): Linear(in_features=3072, out_features=768, bias=False)\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " (act): ReLU()\n", " )\n", " (layer_norm): T5LayerNorm()\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " )\n", " )\n", " (8): T5Block(\n", " (layer): ModuleList(\n", " (0): T5LayerSelfAttention(\n", " (SelfAttention): T5Attention(\n", " (q): Linear(in_features=768, out_features=768, bias=False)\n", " (k): Linear(in_features=768, out_features=768, bias=False)\n", " (v): Linear(in_features=768, out_features=768, bias=False)\n", " (o): Linear(in_features=768, out_features=768, bias=False)\n", " )\n", " (layer_norm): T5LayerNorm()\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " (1): T5LayerCrossAttention(\n", " (EncDecAttention): T5Attention(\n", " (q): Linear(in_features=768, out_features=768, bias=False)\n", " (k): Linear(in_features=768, out_features=768, bias=False)\n", " (v): Linear(in_features=768, out_features=768, bias=False)\n", " (o): Linear(in_features=768, out_features=768, bias=False)\n", " )\n", " (layer_norm): 
T5LayerNorm()\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " (2): T5LayerFF(\n", " (DenseReluDense): T5DenseActDense(\n", " (wi): Linear(in_features=768, out_features=3072, bias=False)\n", " (wo): Linear(in_features=3072, out_features=768, bias=False)\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " (act): ReLU()\n", " )\n", " (layer_norm): T5LayerNorm()\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " )\n", " )\n", " (9): T5Block(\n", " (layer): ModuleList(\n", " (0): T5LayerSelfAttention(\n", " (SelfAttention): T5Attention(\n", " (q): Linear(in_features=768, out_features=768, bias=False)\n", " (k): Linear(in_features=768, out_features=768, bias=False)\n", " (v): Linear(in_features=768, out_features=768, bias=False)\n", " (o): Linear(in_features=768, out_features=768, bias=False)\n", " )\n", " (layer_norm): T5LayerNorm()\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " (1): T5LayerCrossAttention(\n", " (EncDecAttention): T5Attention(\n", " (q): Linear(in_features=768, out_features=768, bias=False)\n", " (k): Linear(in_features=768, out_features=768, bias=False)\n", " (v): Linear(in_features=768, out_features=768, bias=False)\n", " (o): Linear(in_features=768, out_features=768, bias=False)\n", " )\n", " (layer_norm): T5LayerNorm()\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " (2): T5LayerFF(\n", " (DenseReluDense): T5DenseActDense(\n", " (wi): Linear(in_features=768, out_features=3072, bias=False)\n", " (wo): Linear(in_features=3072, out_features=768, bias=False)\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " (act): ReLU()\n", " )\n", " (layer_norm): T5LayerNorm()\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " )\n", " )\n", " (10): T5Block(\n", " (layer): ModuleList(\n", " (0): T5LayerSelfAttention(\n", " (SelfAttention): T5Attention(\n", " (q): Linear(in_features=768, out_features=768, bias=False)\n", " (k): Linear(in_features=768, out_features=768, bias=False)\n", " (v): 
Linear(in_features=768, out_features=768, bias=False)\n", " (o): Linear(in_features=768, out_features=768, bias=False)\n", " )\n", " (layer_norm): T5LayerNorm()\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " (1): T5LayerCrossAttention(\n", " (EncDecAttention): T5Attention(\n", " (q): Linear(in_features=768, out_features=768, bias=False)\n", " (k): Linear(in_features=768, out_features=768, bias=False)\n", " (v): Linear(in_features=768, out_features=768, bias=False)\n", " (o): Linear(in_features=768, out_features=768, bias=False)\n", " )\n", " (layer_norm): T5LayerNorm()\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " (2): T5LayerFF(\n", " (DenseReluDense): T5DenseActDense(\n", " (wi): Linear(in_features=768, out_features=3072, bias=False)\n", " (wo): Linear(in_features=3072, out_features=768, bias=False)\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " (act): ReLU()\n", " )\n", " (layer_norm): T5LayerNorm()\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " )\n", " )\n", " (11): T5Block(\n", " (layer): ModuleList(\n", " (0): T5LayerSelfAttention(\n", " (SelfAttention): T5Attention(\n", " (q): Linear(in_features=768, out_features=768, bias=False)\n", " (k): Linear(in_features=768, out_features=768, bias=False)\n", " (v): Linear(in_features=768, out_features=768, bias=False)\n", " (o): Linear(in_features=768, out_features=768, bias=False)\n", " )\n", " (layer_norm): T5LayerNorm()\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " (1): T5LayerCrossAttention(\n", " (EncDecAttention): T5Attention(\n", " (q): Linear(in_features=768, out_features=768, bias=False)\n", " (k): Linear(in_features=768, out_features=768, bias=False)\n", " (v): Linear(in_features=768, out_features=768, bias=False)\n", " (o): Linear(in_features=768, out_features=768, bias=False)\n", " )\n", " (layer_norm): T5LayerNorm()\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " (2): T5LayerFF(\n", " (DenseReluDense): T5DenseActDense(\n", " (wi): 
Linear(in_features=768, out_features=3072, bias=False)\n", " (wo): Linear(in_features=3072, out_features=768, bias=False)\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " (act): ReLU()\n", " )\n", " (layer_norm): T5LayerNorm()\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " )\n", " )\n", " )\n", " (final_layer_norm): T5LayerNorm()\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " (lm_head): Linear(in_features=768, out_features=32128, bias=False)\n", ")" ] }, "metadata": {}, "execution_count": 19 } ], "source": [ "model = T5ForConditionalGeneration.from_pretrained('t5-base')\n", "\n", "model.cuda()" ] }, { "cell_type": "markdown", "metadata": { "id": "F_SDAwxoawDy" }, "source": [ "# Helper functions" ] }, { "cell_type": "code", "execution_count": 20, "metadata": { "id": "s-q6_F38bLVA" }, "outputs": [], "source": [ "import datetime\n", "import numpy as np" ] }, { "cell_type": "code", "execution_count": 21, "metadata": { "id": "FzUi8908ax61" }, "outputs": [], "source": [ "def calculate_accuracy(preds, target):\n", " results_ok = 0.0\n", " results_false = 0.0\n", "\n", " for idx, pred in enumerate(preds):\n", " if pred == target[idx]:\n", " results_ok += 1.0\n", " else:\n", " results_false += 1.0\n", "\n", " return results_ok / (results_ok + results_false)\n", "\n", "def format_time(elapsed):\n", " '''\n", " Takes a time in seconds and returns a string hh:mm:ss\n", " '''\n", " elapsed_rounded = int(round((elapsed)))\n", " return str(datetime.timedelta(seconds=elapsed_rounded))" ] }, { "cell_type": "markdown", "metadata": { "id": "ucChBa-9bXJy" }, "source": [ "# Init training" ] }, { "cell_type": "code", "execution_count": 22, "metadata": { "id": "A7XUF4PNbYy8" }, "outputs": [], "source": [ "optimizer = torch.optim.AdamW(model.parameters(),\n", " lr = 3e-4,\n", " eps = 1e-8\n", " )\n", "\n", "epochs = 4\n", "total_steps = len(train_dataloader) * epochs" ] }, { "cell_type": "markdown", "metadata": { "id": "DAzQWODja0A3" }, "source": [ "# 
Training" ] }, { "cell_type": "code", "execution_count": 23, "metadata": { "id": "Hoa7NlU0bI7G" }, "outputs": [], "source": [ "import random\n", "import time" ] }, { "cell_type": "code", "execution_count": 24, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "xsHxfslka1u5", "outputId": "e40d00a1-baf8-4554-e5ec-aeb87ee35f66" }, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "\n", "======== Epoch 1 / 4 ========\n", "Training...\n", " Batch 40 of 258. Elapsed: 0:01:12.\n", " Batch 80 of 258. Elapsed: 0:02:20.\n", " Batch 120 of 258. Elapsed: 0:03:28.\n", " Batch 160 of 258. Elapsed: 0:04:36.\n", " Batch 200 of 258. Elapsed: 0:05:45.\n", " Batch 240 of 258. Elapsed: 0:06:53.\n", "\n", " Average training loss: 0.09\n", " Average training acc: 0.81\n", " Training epcoh took: 0:07:23\n", "\n", "Running Validation...\n", " Accuracy: 0.83\n", " Validation took: 0:00:27\n", " Validation Loss: 0.00\n", "\n", "======== Epoch 2 / 4 ========\n", "Training...\n", " Batch 40 of 258. Elapsed: 0:01:09.\n", " Batch 80 of 258. Elapsed: 0:02:17.\n", " Batch 120 of 258. Elapsed: 0:03:25.\n", " Batch 160 of 258. Elapsed: 0:04:33.\n", " Batch 200 of 258. Elapsed: 0:05:42.\n", " Batch 240 of 258. Elapsed: 0:06:50.\n", "\n", " Average training loss: 0.00\n", " Average training acc: 0.86\n", " Training epcoh took: 0:07:19\n", "\n", "Running Validation...\n", " Accuracy: 0.83\n", " Validation took: 0:00:26\n", " Validation Loss: 0.00\n", "\n", "======== Epoch 3 / 4 ========\n", "Training...\n", " Batch 40 of 258. Elapsed: 0:01:08.\n", " Batch 80 of 258. Elapsed: 0:02:16.\n", " Batch 120 of 258. Elapsed: 0:03:24.\n", " Batch 160 of 258. Elapsed: 0:04:32.\n", " Batch 200 of 258. Elapsed: 0:05:41.\n", " Batch 240 of 258. 
Elapsed: 0:06:49.\n", "\n", " Average training loss: 0.00\n", " Average training acc: 0.85\n", " Training epcoh took: 0:07:18\n", "\n", "Running Validation...\n", " Accuracy: 0.83\n", " Validation took: 0:00:26\n", " Validation Loss: 0.00\n", "\n", "======== Epoch 4 / 4 ========\n", "Training...\n", " Batch 40 of 258. Elapsed: 0:01:08.\n", " Batch 80 of 258. Elapsed: 0:02:16.\n", " Batch 120 of 258. Elapsed: 0:03:24.\n", " Batch 160 of 258. Elapsed: 0:04:32.\n", " Batch 200 of 258. Elapsed: 0:05:41.\n", " Batch 240 of 258. Elapsed: 0:06:49.\n", "\n", " Average training loss: 0.00\n", " Average training acc: 0.86\n", " Training epcoh took: 0:07:18\n", "\n", "Running Validation...\n", " Accuracy: 0.83\n", " Validation took: 0:00:26\n", " Validation Loss: 0.00\n", "\n", "Training complete!\n", "Total training took 0:31:02 (h:mm:ss)\n" ] } ], "source": [ "# This training code is based on the `run_glue.py` script here:\n", "# https://github.com/huggingface/transformers/blob/5bfcd0485ece086ebcbed2d008813037968a9e58/examples/run_glue.py#L128\n", "\n", "seed_val = 42\n", "\n", "random.seed(seed_val)\n", "np.random.seed(seed_val)\n", "torch.manual_seed(seed_val)\n", "torch.cuda.manual_seed_all(seed_val)\n", "\n", "training_stats = []\n", "total_t0 = time.time()\n", "\n", "for epoch_i in range(0, epochs):\n", " \n", " # ========================================\n", " # Training\n", " # ========================================\n", "\n", " print(\"\")\n", " print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))\n", " print('Training...')\n", "\n", " t0 = time.time()\n", " total_train_loss = 0\n", " total_train_acc = 0\n", "\n", " model.train()\n", "\n", " for step, batch in enumerate(train_dataloader):\n", " if step % 40 == 0 and not step == 0:\n", " elapsed = format_time(time.time() - t0)\n", " print(' Batch {:>5,} of {:>5,}. 
Elapsed: {:}.'.format(step, len(train_dataloader), elapsed))\n", "\n", " b_input_ids = batch[0].to(device)\n", " b_input_mask = batch[1].to(device)\n", "\n", " y = batch[2].to(device)\n", " y_ids = y[:, :-1].contiguous()\n", " lm_labels = y[:, 1:].clone().detach()\n", " lm_labels[y[:, 1:] == tokenizer.pad_token_id] = -100 \n", "\n", " outputs = model(\n", " input_ids=b_input_ids,\n", " attention_mask=b_input_mask,\n", " decoder_input_ids=y_ids,\n", " labels=lm_labels\n", " )\n", "\n", " generated_ids = model.generate(\n", " input_ids = b_input_ids,\n", " attention_mask = b_input_mask, \n", " max_length=3, \n", " num_beams=2,\n", " repetition_penalty=2.5, \n", " length_penalty=1.0, \n", " early_stopping=True\n", " )\n", "\n", " preds = [tokenizer.decode(g, skip_special_tokens=True, clean_up_tokenization_spaces=True) for g in generated_ids]\n", " target = [tokenizer.decode(t, skip_special_tokens=True, clean_up_tokenization_spaces=True) for t in y]\n", " total_train_acc += calculate_accuracy(preds, target) \n", "\n", " loss = outputs[0]\n", "\n", " optimizer.zero_grad()\n", " loss.backward()\n", " optimizer.step()\n", "\n", " total_train_loss += loss.item()\n", "\n", " avg_train_loss = total_train_loss / len(train_dataloader) \n", " avg_train_acc = total_train_acc / len(train_dataloader) \n", " training_time = format_time(time.time() - t0)\n", "\n", " print(\"\")\n", " print(\" Average training loss: {0:.2f}\".format(avg_train_loss))\n", " print(\" Average training acc: {0:.2f}\".format(avg_train_acc))\n", " print(\" Training epoch took: {:}\".format(training_time))\n", " \n", " # ========================================\n", " # Validation\n", " # ========================================\n", "\n", " print(\"\")\n", " print(\"Running Validation...\")\n", "\n", " t0 = time.time()\n", " model.eval()\n", "\n", " total_eval_loss = 0\n", " total_eval_accuracy = 0\n", "\n", " for batch in validation_dataloader:\n", " b_input_ids = batch[0].to(device)\n", " b_input_mask = 
batch[1].to(device)\n", "\n", " y = batch[2].to(device)\n", " y_ids = y[:, :-1].contiguous()\n", " lm_labels = y[:, 1:].clone().detach()\n", " lm_labels[y[:, 1:] == tokenizer.pad_token_id] = -100\n", " \n", " with torch.no_grad(): \n", "\n", " outputs = model(\n", " input_ids=b_input_ids,\n", " attention_mask=b_input_mask,\n", " decoder_input_ids=y_ids,\n", " labels=lm_labels\n", " )\n", "\n", " loss = outputs[0]\n", " total_eval_loss += loss.item()\n", "\n", " generated_ids = model.generate(\n", " input_ids = b_input_ids,\n", " attention_mask = b_input_mask, \n", " max_length=3, \n", " num_beams=2,\n", " repetition_penalty=2.5, \n", " length_penalty=1.0, \n", " early_stopping=True\n", " )\n", "\n", " preds = [tokenizer.decode(g, skip_special_tokens=True, clean_up_tokenization_spaces=True) for g in generated_ids]\n", " target = [tokenizer.decode(t, skip_special_tokens=True, clean_up_tokenization_spaces=True) for t in y]\n", " total_eval_accuracy += calculate_accuracy(preds, target) \n", "\n", " avg_val_loss = total_eval_loss / len(validation_dataloader)\n", "\n", " avg_val_accuracy = total_eval_accuracy / len(validation_dataloader)\n", " print(\" Accuracy: {0:.2f}\".format(avg_val_accuracy))\n", " \n", " validation_time = format_time(time.time() - t0)\n", " print(\" Validation took: {:}\".format(validation_time))\n", " print(\" Validation Loss: {0:.2f}\".format(avg_val_loss))\n", "\n", " training_stats.append(\n", " {\n", " 'epoch': epoch_i + 1,\n", " 'Training Loss': avg_train_loss,\n", " 'Training Accur.': avg_train_acc,\n", " 'Valid. Loss': avg_val_loss,\n", " 'Valid. 
Accur.': avg_val_accuracy,\n", " 'Training Time': training_time,\n", " 'Validation Time': validation_time\n", " }\n", " )\n", "\n", "print(\"\")\n", "print(\"Training complete!\")\n", "\n", "print(\"Total training took {:} (h:mm:ss)\".format(format_time(time.time()-total_t0)))" ] }, { "cell_type": "markdown", "metadata": { "id": "xIpFPoRb91Or" }, "source": [ "# Train summary" ] }, { "cell_type": "code", "execution_count": 25, "metadata": { "id": "GjYqBrrO93Oh", "colab": { "base_uri": "https://localhost:8080/", "height": 204 }, "outputId": "326edb05-56a5-4376-d793-424e5e122507" }, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ " Training Loss Training Accur. Valid. Loss Valid. Accur. \\\n", "epoch \n", "1 9.03e-02 0.81 9.89e-07 0.83 \n", "2 1.30e-05 0.86 2.26e-08 0.83 \n", "3 3.05e-06 0.85 0.00e+00 0.83 \n", "4 5.13e-06 0.86 0.00e+00 0.83 \n", "\n", " Training Time Validation Time \n", "epoch \n", "1 0:07:23 0:00:27 \n", "2 0:07:19 0:00:26 \n", "3 0:07:18 0:00:26 \n", "4 0:07:18 0:00:26 " ], "text/html": [ "\n", "
\n", " | Training Loss | \n", "Training Accur. | \n", "Valid. Loss | \n", "Valid. Accur. | \n", "Training Time | \n", "Validation Time | \n", "
---|---|---|---|---|---|---|
epoch | \n", "\n", " | \n", " | \n", " | \n", " | \n", " | \n", " |
1 | \n", "9.03e-02 | \n", "0.81 | \n", "9.89e-07 | \n", "0.83 | \n", "0:07:23 | \n", "0:00:27 | \n", "
2 | \n", "1.30e-05 | \n", "0.86 | \n", "2.26e-08 | \n", "0.83 | \n", "0:07:19 | \n", "0:00:26 | \n", "
3 | \n", "3.05e-06 | \n", "0.85 | \n", "0.00e+00 | \n", "0.83 | \n", "0:07:18 | \n", "0:00:26 | \n", "
4 | \n", "5.13e-06 | \n", "0.86 | \n", "0.00e+00 | \n", "0.83 | \n", "0:07:18 | \n", "0:00:26 | \n", "