commit e457ab5ceeeb10cc85f1a5d566d3ffe7018ce079 Author: Michał Kozłowski Date: Mon Feb 13 14:22:50 2023 +0100 init diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..6f66c74 --- /dev/null +++ b/.gitignore @@ -0,0 +1 @@ +*.zip \ No newline at end of file diff --git a/RobertaSequanceClassification.ipynb b/RobertaSequanceClassification.ipynb new file mode 100644 index 0000000..156a820 --- /dev/null +++ b/RobertaSequanceClassification.ipynb @@ -0,0 +1,7430 @@ +{ + "cells": [ + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# RobertaForSequenceClassification training" + ] + },
+ { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Links:\n", + "- Tensorboard training run: https://tensorboard.dev/experiment/Hq95VFzqTQ2CyBb1S4SOpw/#scalars\n", + "- Hugging Face trained model: https://huggingface.co/Zombely/RobertaForSequenceClassification-sst2\n", + "- Hugging Face edited dataset: https://huggingface.co/datasets/Zombely/sst2-project-dataset" + ] + },
+ { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "wLIiUmt93mn5", + "outputId": "746cd310-e717-4757-ef01-217e6c27a16c" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m462.8/462.8 KB\u001b[0m \u001b[31m8.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m6.3/6.3 MB\u001b[0m \u001b[31m76.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m213.0/213.0 KB\u001b[0m \u001b[31m12.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m132.0/132.0 KB\u001b[0m \u001b[31m11.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m190.3/190.3 KB\u001b[0m \u001b[31m18.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m7.6/7.6 MB\u001b[0m \u001b[31m43.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m140.6/140.6 KB\u001b[0m \u001b[31m12.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25h" + ] + } + ], + "source": [ + "!pip install -q datasets transformers" + ] + },
+ { + "cell_type": "code", + "execution_count": 2, + "metadata": { + "id": "aAd7twoD3Fmj" + }, + "outputs": [], + "source": [ + "from datasets import load_dataset\n", + "import torch\n", + "from transformers import AutoTokenizer, RobertaForSequenceClassification, RobertaTokenizerFast, TrainingArguments, Trainer\n", + "from sklearn.metrics import accuracy_score, precision_recall_fscore_support\n", + "from transformers.integrations import TensorBoardCallback" + ] + },
+ { + "cell_type": "code", + "execution_count": 3, + "metadata": { + "id": "eedylD5a3IgG" + }, + "outputs": [], + "source": [ + "def load_and_process_dataset():\n", + "    dataset = load_dataset(\"sst2\")\n", + "    # remove_columns returns a new DatasetDict, so the result must be assigned\n", + "    dataset = dataset.remove_columns('idx')\n", + "    # the official SST-2 test split is unlabeled, so reuse the validation split as the test set\n", + "    del dataset['test']\n", + "    dataset['test'] = dataset['validation']\n", + "    del dataset['validation']\n", + "    # hold out 1600 training examples as the new validation set\n", + "    split_dataset = dataset['train'].train_test_split(test_size=1600)\n", + "    dataset['train'] = split_dataset['train']\n", + "    dataset['validation'] = split_dataset['test']\n", + "    return dataset" + ] + },
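+ { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Note on the split logic above: the official SST-2 test split ships without labels, so `load_and_process_dataset` promotes the official validation split to a test set and holds out 1,600 of the 67,349 training examples as a new validation set (leaving the 65,749 training examples reported in the training log below)." + ] + },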
+ { + "cell_type": "code", + "execution_count": 4, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 475 + }, + "id": "V6bE58rm3LiU", + "outputId": "5596da88-9a5d-4762-ae84-e792d20b96fd" + }, + "outputs": [ + { + "data": { + "text/plain": [ + "Downloading builder script:   0%|          | 0.00/3.77k [00:00<?, ?B/s]" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "dataset = load_and_process_dataset()" + ] + },
\n", + " \n", + " \n", + " [3081/3081 42:34, Epoch 2/3]\n", + "
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
EpochTraining LossValidation LossAccuracyF1PrecisionRecall
00.1957000.1589390.9362500.9427610.9448820.940649
10.1329000.1465190.9550000.9595510.9627960.956327
20.0397000.1507180.9556250.9603570.9576840.963046

" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Saving model checkpoint to ./results/checkpoint-500\n", + "Configuration saved in ./results/checkpoint-500/config.json\n", + "Model weights saved in ./results/checkpoint-500/pytorch_model.bin\n", + "Saving model checkpoint to ./results/checkpoint-1000\n", + "Configuration saved in ./results/checkpoint-1000/config.json\n", + "Model weights saved in ./results/checkpoint-1000/pytorch_model.bin\n", + "The following columns in the evaluation set don't have a corresponding argument in `RobertaForSequenceClassification.forward` and have been ignored: idx, sentence. If idx, sentence are not expected by `RobertaForSequenceClassification.forward`, you can safely ignore this message.\n", + "***** Running Evaluation *****\n", + " Num examples = 1600\n", + " Batch size = 8\n", + "Saving model checkpoint to ./results/checkpoint-1500\n", + "Configuration saved in ./results/checkpoint-1500/config.json\n", + "Model weights saved in ./results/checkpoint-1500/pytorch_model.bin\n", + "Saving model checkpoint to ./results/checkpoint-2000\n", + "Configuration saved in ./results/checkpoint-2000/config.json\n", + "Model weights saved in ./results/checkpoint-2000/pytorch_model.bin\n", + "The following columns in the evaluation set don't have a corresponding argument in `RobertaForSequenceClassification.forward` and have been ignored: idx, sentence. If idx, sentence are not expected by `RobertaForSequenceClassification.forward`, you can safely ignore this message.\n", + "***** Running Evaluation *****\n", + " Num examples = 1600\n", + " Batch size = 8\n", + "Saving model checkpoint to ./results/checkpoint-2500\n", + "Configuration saved in ./results/checkpoint-2500/config.json\n", + "Model weights saved in ./results/checkpoint-2500/pytorch_model.bin\n", + "Saving model checkpoint to ./results/checkpoint-3000\n", + "Configuration saved in ./results/checkpoint-3000/config.json\n", + "Model weights saved in ./results/checkpoint-3000/pytorch_model.bin\n", + "The following columns in the evaluation set don't have a corresponding argument in `RobertaForSequenceClassification.forward` and have been ignored: idx, sentence. If idx, sentence are not expected by `RobertaForSequenceClassification.forward`, you can safely ignore this message.\n", + "***** Running Evaluation *****\n", + " Num examples = 1600\n", + " Batch size = 8\n", + "\n", + "\n", + "Training completed. Do not forget to share your model on huggingface.co/models =)\n", + "\n", + "\n" + ] + }, + { + "data": { + "text/plain": [ + "TrainOutput(global_step=3081, training_loss=0.19893329894531087, metrics={'train_runtime': 2559.2258, 'train_samples_per_second': 77.073, 'train_steps_per_second': 1.204, 'total_flos': 6790599311126760.0, 'train_loss': 0.19893329894531087, 'epoch': 3.0})" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "trainer.train()" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 283 + }, + "id": "pA_VW8AZG2su", + "outputId": "d8f02cb1-fe12-4f73-eb87-dd24b7f496f2" + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "The following columns in the evaluation set don't have a corresponding argument in `RobertaForSequenceClassification.forward` and have been ignored: idx, sentence. 
+ { + "cell_type": "code", + "execution_count": 13, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 283 + }, + "id": "pA_VW8AZG2su", + "outputId": "d8f02cb1-fe12-4f73-eb87-dd24b7f496f2" + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "The following columns in the evaluation set don't have a corresponding argument in `RobertaForSequenceClassification.forward` and have been ignored: idx, sentence. If idx, sentence are not expected by `RobertaForSequenceClassification.forward`, you can safely ignore this message.\n", + "***** Running Evaluation *****\n", + "  Num examples = 1600\n", + "  Batch size = 8\n" + ] + }, + { + "data": { + "text/html": [ + "\n", + "    <div>\n", + "      <progress value='200' max='200' style='width:300px; height:20px; vertical-align: middle;'></progress>\n", + "      [200/200 00:03]\n", + "    </div>" + ], + "text/plain": [ + "<IPython.core.display.HTML object>" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "{'eval_loss': 0.15071792900562286,\n", + " 'eval_accuracy': 0.955625,\n", + " 'eval_f1': 0.96035734226689,\n", + " 'eval_precision': 0.9576837416481069,\n", + " 'eval_recall': 0.9630459126539753,\n", + " 'eval_runtime': 3.7924,\n", + " 'eval_samples_per_second': 421.898,\n", + " 'eval_steps_per_second': 52.737,\n", + " 'epoch': 3.0}" + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "trainer.evaluate()" + ] + },
\n", + " \n", + " \n", + " [200/200 00:05]\n", + "
\n", + " " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "{'eval_loss': 0.20586328208446503,\n", + " 'eval_accuracy': 0.9392201834862385,\n", + " 'eval_f1': 0.9407821229050279,\n", + " 'eval_precision': 0.9334811529933481,\n", + " 'eval_recall': 0.9481981981981982,\n", + " 'eval_runtime': 2.3748,\n", + " 'eval_samples_per_second': 367.184,\n", + " 'eval_steps_per_second': 45.898,\n", + " 'epoch': 3.0}" + ] + }, + "execution_count": 14, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "trainer.evaluate(test_data)" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "hWZ0b-Zy4j_y", + "outputId": "ae7202b0-014d-4e12-b093-f494fe816493" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "2023-02-13 13:06:49.916239: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/lib64-nvidia\n", + "2023-02-13 13:06:49.916330: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/lib64-nvidia\n", + "2023-02-13 13:06:49.916358: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly.\n", + "\n", + "***** TensorBoard Uploader *****\n", + "\n", + "This will upload your TensorBoard logs to https://tensorboard.dev/ from\n", + "the following directory:\n", + "\n", + "logs\n", + "\n", + "This TensorBoard will be visible to everyone. Do not upload sensitive\n", + "data.\n", + "\n", + "Your use of this service is subject to Google's Terms of Service\n", + " and Privacy Policy\n", + ", and TensorBoard.dev's Terms of Service\n", + ".\n", + "\n", + "This notice will not be shown again while you are logged into the uploader.\n", + "To log out, run `tensorboard dev auth revoke`.\n", + "\n", + "Continue? (yes/NO) yes\n", + "\n", + "To sign in with the TensorBoard uploader:\n", + "\n", + "1. On your computer or phone, visit:\n", + "\n", + " https://www.google.com/device\n", + "\n", + "2. Sign in with your Google account, then enter:\n", + "\n", + " YBVF-QCSV\n", + "\n", + "\n", + "Upload started and will continue reading any new data as it's added to the logdir.\n", + "\n", + "To stop uploading, press Ctrl-C.\n", + "\n", + "New experiment created. View your TensorBoard at: https://tensorboard.dev/experiment/Hq95VFzqTQ2CyBb1S4SOpw/\n", + "\n", + "\u001b[1m[2023-02-13T13:07:05]\u001b[0m Started scanning logdir.\n", + "\u001b[1m[2023-02-13T13:07:06]\u001b[0m Total uploaded: 2412 scalars, 10 tensors (7.1 kB), 0 binary objects\n", + "\n", + "\n", + "Interrupted. 
+ { + "cell_type": "code", + "execution_count": 15, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "hWZ0b-Zy4j_y", + "outputId": "ae7202b0-014d-4e12-b093-f494fe816493" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "2023-02-13 13:06:49.916239: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/lib64-nvidia\n", + "2023-02-13 13:06:49.916330: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/lib64-nvidia\n", + "2023-02-13 13:06:49.916358: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly.\n", + "\n", + "***** TensorBoard Uploader *****\n", + "\n", + "This will upload your TensorBoard logs to https://tensorboard.dev/ from\n", + "the following directory:\n", + "\n", + "logs\n", + "\n", + "This TensorBoard will be visible to everyone. Do not upload sensitive\n", + "data.\n", + "\n", + "Your use of this service is subject to Google's Terms of Service\n", + "<https://policies.google.com/terms> and Privacy Policy\n", + "<https://policies.google.com/privacy>, and TensorBoard.dev's Terms of Service\n", + "<https://tensorboard.dev/policy/terms/>.\n", + "\n", + "This notice will not be shown again while you are logged into the uploader.\n", + "To log out, run `tensorboard dev auth revoke`.\n", + "\n", + "Continue? (yes/NO) yes\n", + "\n", + "To sign in with the TensorBoard uploader:\n", + "\n", + "1. On your computer or phone, visit:\n", + "\n", + "   https://www.google.com/device\n", + "\n", + "2. Sign in with your Google account, then enter:\n", + "\n", + "   YBVF-QCSV\n", + "\n", + "\n", + "Upload started and will continue reading any new data as it's added to the logdir.\n", + "\n", + "To stop uploading, press Ctrl-C.\n", + "\n", + "New experiment created. View your TensorBoard at: https://tensorboard.dev/experiment/Hq95VFzqTQ2CyBb1S4SOpw/\n", + "\n", + "\u001b[1m[2023-02-13T13:07:05]\u001b[0m Started scanning logdir.\n", + "\u001b[1m[2023-02-13T13:07:06]\u001b[0m Total uploaded: 2412 scalars, 10 tensors (7.1 kB), 0 binary objects\n", + "\n", + "\n", + "Interrupted. View your TensorBoard at https://tensorboard.dev/experiment/Hq95VFzqTQ2CyBb1S4SOpw/\n", + "Traceback (most recent call last):\n", + "  File \"/usr/local/bin/tensorboard\", line 8, in <module>\n", + "    sys.exit(run_main())\n", + "  File \"/usr/local/lib/python3.8/dist-packages/tensorboard/main.py\", line 46, in run_main\n", + "    app.run(tensorboard.main, flags_parser=tensorboard.configure)\n", + "  File \"/usr/local/lib/python3.8/dist-packages/absl/app.py\", line 308, in run\n", + "    _run_main(main, args)\n", + "  File \"/usr/local/lib/python3.8/dist-packages/absl/app.py\", line 254, in _run_main\n", + "    sys.exit(main(argv))\n", + "  File \"/usr/local/lib/python3.8/dist-packages/tensorboard/program.py\", line 276, in main\n", + "    return runner(self.flags) or 0\n", + "  File \"/usr/local/lib/python3.8/dist-packages/tensorboard/uploader/uploader_subcommand.py\", line 691, in run\n", + "    return _run(flags, self._experiment_url_callback)\n", + "  File \"/usr/local/lib/python3.8/dist-packages/tensorboard/uploader/uploader_subcommand.py\", line 124, in _run\n", + "    intent.execute(server_info, channel)\n", + "  File \"/usr/local/lib/python3.8/dist-packages/tensorboard/uploader/uploader_subcommand.py\", line 507, in execute\n", + "    sys.stdout.write(end_message + \"\\n\")\n", + "KeyboardInterrupt\n", + "^C\n" + ] + } + ], + "source": [ + "!tensorboard dev upload --logdir logs --name RobertaForSequenceClassification" + ] + },
+ { + "cell_type": "code", + "execution_count": 16, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "x-_gsbS34_Mf", + "outputId": "ef2463c6-dc96-4af3-f752-c7f46d7f57ef" + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Configuration saved in ./model/config.json\n", + "Model weights saved in ./model/pytorch_model.bin\n" + ] + } + ], + "source": [ + "model.save_pretrained(\"./model\")" + ] + },
+ { + "cell_type": "code", + "execution_count": 17, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "gOeREu-e5IlE", + "outputId": "8415481a-0350-4433-b111-906d871a086f" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "    _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|\n", + "    _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|\n", + "    _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|\n", + "    _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|\n", + "    _|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|\n", + "    \n", + "    To login, `huggingface_hub` requires a token generated from https://huggingface.co/settings/tokens .\n", + "Token: \n", + "Add token as git credential? 
(Y/n) y\n", + "Token is valid.\n", + "\u001b[1m\u001b[31mCannot authenticate through git-credential as no helper is defined on your machine.\n", + "You might have to re-authenticate when pushing to the Hugging Face Hub.\n", + "Run the following command in your terminal in case you want to set the 'store' credential helper as default.\n", + "\n", + "git config --global credential.helper store\n", + "\n", + "Read https://git-scm.com/book/en/v2/Git-Tools-Credential-Storage for more details.\u001b[0m\n", + "Token has not been saved to git credential helper.\n", + "Your token has been saved to /root/.cache/huggingface/token\n", + "Login successful\n" + ] + } + ], + "source": [ + "!huggingface-cli login" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 185, + "referenced_widgets": [ + "cbd11116207b486181c4c43979b9dec3", + "831b6347e2e04644801fd735ff05ea0a", + "d64fa193dc124a44aa4473eaf68a6fe0", + "b9e21eeb459f44328539fb721769c9e1", + "f778812398c64e5598ebc45e79a11c09", + "1d3b54593d74415d82383336a8994c4e", + "d1f7d3e50d06480da69bd565eac0381a", + "391cf00a7b5943749ce0323917119a09", + "1f57def550df4a5d82a42a7c39bc8005", + "d3d4f102e8294de1a7c6d1696aa169ab", + "8b9d548b971549fe85942c08e85b4183", + "b50310cd013840fcbf73e09f07352d68", + "8a336087b0f44ad09f70fe767be9e48e", + "c5de2397a2584da093df0c260effb112", + "79ed5ee25b0247998de475eeff22c328", + "160f5a4de3f842368a6242dfb793077e", + "41ed68bb08d34fdb8fe91b5e89cc1eac", + "35d75260c1344c57a52102b72dcc4232", + "dc97ee07af61405e87ae433006fae390", + "019b00baaa27422e922fc689eb562fe9", + "2a4128adfb314488af5230e76d05a88c", + "62580a52989848c38235b4d17969cc83" + ] + }, + "id": "-NBagdkY5RI1", + "outputId": "7942fe7b-d998-4ce1-ba00-6bb94a3e294e" + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Configuration saved in /tmp/tmpga7eb38a/config.json\n", + "Model weights saved in /tmp/tmpga7eb38a/pytorch_model.bin\n", + "Uploading the following files to Zombely/RobertaForSequenceClassification-sst2: config.json,pytorch_model.bin\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "cbd11116207b486181c4c43979b9dec3", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Upload 1 LFS files: 0%| | 0/1 [00:00=1.17 in /usr/local/lib/python3.8/dist-packages (from datasets) (1.21.6)\n", + "Collecting huggingface-hub<1.0.0,>=0.2.0\n", + " Downloading huggingface_hub-0.12.0-py3-none-any.whl (190 kB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m190.3/190.3 KB\u001b[0m \u001b[31m14.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hRequirement already satisfied: requests>=2.19.0 in /usr/local/lib/python3.8/dist-packages (from datasets) (2.25.1)\n", + "Requirement already satisfied: pandas in /usr/local/lib/python3.8/dist-packages (from datasets) (1.3.5)\n", + "Requirement already satisfied: pyarrow>=6.0.0 in /usr/local/lib/python3.8/dist-packages (from datasets) (9.0.0)\n", + "Requirement already satisfied: packaging in /usr/local/lib/python3.8/dist-packages (from datasets) (23.0)\n", + "Collecting responses<0.19\n", + " Downloading responses-0.18.0-py3-none-any.whl (38 kB)\n", + "Collecting xxhash\n", + " Downloading xxhash-3.2.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (213 kB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m213.0/213.0 KB\u001b[0m \u001b[31m10.7 MB/s\u001b[0m eta 
\u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hRequirement already satisfied: dill<0.3.7 in /usr/local/lib/python3.8/dist-packages (from datasets) (0.3.6)\n", + "Requirement already satisfied: aiohttp in /usr/local/lib/python3.8/dist-packages (from datasets) (3.8.3)\n", + "Requirement already satisfied: fsspec[http]>=2021.11.1 in /usr/local/lib/python3.8/dist-packages (from datasets) (2023.1.0)\n", + "Requirement already satisfied: tqdm>=4.62.1 in /usr/local/lib/python3.8/dist-packages (from datasets) (4.64.1)\n", + "Requirement already satisfied: pyyaml>=5.1 in /usr/local/lib/python3.8/dist-packages (from datasets) (6.0)\n", + "Collecting multiprocess\n", + " Downloading multiprocess-0.70.14-py38-none-any.whl (132 kB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m132.0/132.0 KB\u001b[0m \u001b[31m10.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hCollecting tokenizers!=0.11.3,<0.14,>=0.11.1\n", + " Downloading tokenizers-0.13.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m7.6/7.6 MB\u001b[0m \u001b[31m72.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hRequirement already satisfied: filelock in /usr/local/lib/python3.8/dist-packages (from transformers) (3.9.0)\n", + "Requirement already satisfied: regex!=2019.12.17 in /usr/local/lib/python3.8/dist-packages (from transformers) (2022.6.2)\n", + "Requirement already satisfied: charset-normalizer<3.0,>=2.0 in /usr/local/lib/python3.8/dist-packages (from aiohttp->datasets) (2.1.1)\n", + "Requirement already satisfied: attrs>=17.3.0 in /usr/local/lib/python3.8/dist-packages (from aiohttp->datasets) (22.2.0)\n", + "Requirement already satisfied: multidict<7.0,>=4.5 in /usr/local/lib/python3.8/dist-packages (from aiohttp->datasets) (6.0.4)\n", + "Requirement already satisfied: frozenlist>=1.1.1 in /usr/local/lib/python3.8/dist-packages (from aiohttp->datasets) (1.3.3)\n", + "Requirement already satisfied: aiosignal>=1.1.2 in /usr/local/lib/python3.8/dist-packages (from aiohttp->datasets) (1.3.1)\n", + "Requirement already satisfied: async-timeout<5.0,>=4.0.0a3 in /usr/local/lib/python3.8/dist-packages (from aiohttp->datasets) (4.0.2)\n", + "Requirement already satisfied: yarl<2.0,>=1.0 in /usr/local/lib/python3.8/dist-packages (from aiohttp->datasets) (1.8.2)\n", + "Requirement already satisfied: typing-extensions>=3.7.4.3 in /usr/local/lib/python3.8/dist-packages (from huggingface-hub<1.0.0,>=0.2.0->datasets) (4.4.0)\n", + "Requirement already satisfied: urllib3<1.27,>=1.21.1 in /usr/local/lib/python3.8/dist-packages (from requests>=2.19.0->datasets) (1.24.3)\n", + "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.8/dist-packages (from requests>=2.19.0->datasets) (2022.12.7)\n", + "Requirement already satisfied: idna<3,>=2.5 in /usr/local/lib/python3.8/dist-packages (from requests>=2.19.0->datasets) (2.10)\n", + "Requirement already satisfied: chardet<5,>=3.0.2 in /usr/local/lib/python3.8/dist-packages (from requests>=2.19.0->datasets) (4.0.0)\n", + "Collecting urllib3<1.27,>=1.21.1\n", + " Downloading urllib3-1.26.14-py2.py3-none-any.whl (140 kB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m140.6/140.6 KB\u001b[0m \u001b[31m10.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hRequirement already satisfied: python-dateutil>=2.7.3 in /usr/local/lib/python3.8/dist-packages (from 
pandas->datasets) (2.8.2)\n", + "Requirement already satisfied: pytz>=2017.3 in /usr/local/lib/python3.8/dist-packages (from pandas->datasets) (2022.7.1)\n", + "Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.8/dist-packages (from python-dateutil>=2.7.3->pandas->datasets) (1.15.0)\n", + "Installing collected packages: tokenizers, sentencepiece, xxhash, urllib3, multiprocess, responses, huggingface-hub, transformers, datasets\n", + "  Attempting uninstall: urllib3\n", + "    Found existing installation: urllib3 1.24.3\n", + "    Uninstalling urllib3-1.24.3:\n", + "      Successfully uninstalled urllib3-1.24.3\n", + "Successfully installed datasets-2.9.0 huggingface-hub-0.12.0 multiprocess-0.70.14 responses-0.18.0 sentencepiece-0.1.97 tokenizers-0.13.2 transformers-4.26.1 urllib3-1.26.14 xxhash-3.2.0\n" + ] + } + ], + "source": [ + "!pip install datasets transformers sentencepiece" + ] + },
+ { + "cell_type": "code", + "execution_count": 2, + "metadata": { + "id": "Itn0ce_3P-Cv" + }, + "outputs": [], + "source": [ + "from datasets import load_dataset\n", + "import torch\n", + "from transformers import T5ForConditionalGeneration, T5Tokenizer, TrainingArguments, Trainer\n", + "from sklearn.metrics import accuracy_score, precision_recall_fscore_support\n", + "import tensorflow as tf\n", + "from torch.utils.data import DataLoader, RandomSampler, SequentialSampler, TensorDataset\n", + "import random\n", + "import time\n", + "import numpy as np\n", + "import datetime\n", + "import sklearn\n", + "from tqdm.notebook import tqdm\n", + "import os" + ] + },
+ { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Load data and transform dataset" + ] + },
+ { + "cell_type": "code", + "execution_count": 3, + "metadata": { + "id": "GQmeuSdNQkB7" + }, + "outputs": [], + "source": [ + "def load_and_process_dataset():\n", + "    dataset = load_dataset(\"sst2\")\n", + "    # remove_columns returns a new DatasetDict, so the result must be assigned\n", + "    dataset = dataset.remove_columns('idx')\n", + "    # the official SST-2 test split is unlabeled, so reuse the validation split as the test set\n", + "    del dataset['test']\n", + "    dataset['test'] = dataset['validation']\n", + "    del dataset['validation']\n", + "    # hold out 1600 training examples as the new validation set\n", + "    split_dataset = dataset['train'].train_test_split(test_size=1600)\n", + "    dataset['train'] = split_dataset['train']\n", + "    dataset['validation'] = split_dataset['test']\n", + "    return dataset" + ] + },
+ { + "cell_type": "code", + "execution_count": 29, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 329 + }, + "id": "BWonEzhAQmnF", + "outputId": "3bfe3731-974e-4a77-d7b1-5f3bfb806a4b" + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "WARNING:datasets.builder:Found cached dataset sst2 (/root/.cache/huggingface/datasets/sst2/default/2.0.0/9896208a8d85db057ac50c72282bcb8fe755accc671a57dd8059d4e130961ed5)\n" + ] + }, + { + "data": { + "text/plain": [ + "  0%|          | 0/3 [00:00<?, ?it/s]" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "dataset = load_and_process_dataset()" + ] + },
+ { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Your use of this service is subject to Google's Terms of Service\n", + "<https://policies.google.com/terms> and Privacy Policy\n", + "<https://policies.google.com/privacy>, and TensorBoard.dev's Terms of Service\n", + "<https://tensorboard.dev/policy/terms/>.\n", + "\n", + "This notice will not be shown again while you are 
logged into the uploader.\n", + "To log out, run `tensorboard dev auth revoke`.\n", + "\n", + "Continue? (yes/NO) yes\n", + "\n", + "To sign in with the TensorBoard uploader:\n", + "\n", + "1. On your computer or phone, visit:\n", + "\n", + "   https://www.google.com/device\n", + "\n", + "2. Sign in with your Google account, then enter:\n", + "\n", + "   ZBRH-SMMW\n", + "\n", + "\n", + "Upload started and will continue reading any new data as it's added to the logdir.\n", + "\n", + "To stop uploading, press Ctrl-C.\n", + "\n", + "New experiment created. View your TensorBoard at: https://tensorboard.dev/experiment/CgaWd9pATZeuquRT7TZp7w/\n", + "\n", + "\u001b[1m[2023-02-13T11:50:01]\u001b[0m Started scanning logdir.\n", + "\u001b[1m[2023-02-13T11:50:04]\u001b[0m Total uploaded: 12630 scalars, 0 tensors, 0 binary objects\n", + "\n", + "\n", + "Interrupted. View your TensorBoard at https://tensorboard.dev/experiment/CgaWd9pATZeuquRT7TZp7w/\n", + "Traceback (most recent call last):\n", + "  File \"/usr/local/bin/tensorboard\", line 8, in <module>\n", + "    sys.exit(run_main())\n", + "  File \"/usr/local/lib/python3.8/dist-packages/tensorboard/main.py\", line 46, in run_main\n", + "    app.run(tensorboard.main, flags_parser=tensorboard.configure)\n", + "  File \"/usr/local/lib/python3.8/dist-packages/absl/app.py\", line 308, in run\n", + "    _run_main(main, args)\n", + "  File \"/usr/local/lib/python3.8/dist-packages/absl/app.py\", line 254, in _run_main\n", + "    sys.exit(main(argv))\n", + "  File \"/usr/local/lib/python3.8/dist-packages/tensorboard/program.py\", line 276, in main\n", + "    return runner(self.flags) or 0\n", + "  File \"/usr/local/lib/python3.8/dist-packages/tensorboard/uploader/uploader_subcommand.py\", line 691, in run\n", + "    return _run(flags, self._experiment_url_callback)\n", + "  File \"/usr/local/lib/python3.8/dist-packages/tensorboard/uploader/uploader_subcommand.py\", line 124, in _run\n", + "    intent.execute(server_info, channel)\n", + "  File \"/usr/local/lib/python3.8/dist-packages/grpc/_channel.py\", line 1564, in __exit__\n", + "    self._close()\n", + "  File \"/usr/local/lib/python3.8/dist-packages/grpc/_channel.py\", line 1550, in _close\n", + "    self._channel.close(cygrpc.StatusCode.cancelled, 'Channel closed!')\n", + "  File \"src/python/grpcio/grpc/_cython/_cygrpc/channel.pyx.pxi\", line 513, in grpc._cython.cygrpc.Channel.close\n", + "  File \"src/python/grpcio/grpc/_cython/_cygrpc/channel.pyx.pxi\", line 399, in grpc._cython.cygrpc._close\n", + "  File \"src/python/grpcio/grpc/_cython/_cygrpc/channel.pyx.pxi\", line 429, in grpc._cython.cygrpc._close\n", + "  File \"/usr/lib/python3.8/threading.py\", line 364, in notify_all\n", + "    def notify_all(self):\n", + "KeyboardInterrupt\n", + "^C\n" + ] + } + ], + "source": [ + "!tensorboard dev upload --logdir logs --name t5-sst2" + ] + },
+ { + "cell_type": "code", + "execution_count": 17, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "CHSu41ZvnaBB", + "outputId": "2948e2b2-54a1-43aa-9d29-27a7b40538ed" + }, + "outputs": [ + { + "data": { + "text/plain": [ + "('./model/tokenizer_config.json',\n", + " './model/special_tokens_map.json',\n", + " './model/spiece.model',\n", + " './model/added_tokens.json')" + ] + }, + "execution_count": 17, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "t5model.model.save_pretrained(\"./model\")\n", + "t5model.tokenizer.save_pretrained(\"./model\")\n" + ] + },
+ { + "cell_type": "code", + "execution_count": 23, + "metadata": { + "colab": { + "base_uri": 
"https://localhost:8080/" + }, + "id": "Xja8cK6yoHcM", + "outputId": "df987295-d175-4d17-928e-d9b03207169c" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " adding: model/ (stored 0%)\n", + " adding: model/tokenizer_config.json (deflated 82%)\n", + " adding: model/config.json (deflated 62%)\n", + " adding: model/generation_config.json (deflated 29%)\n", + " adding: model/pytorch_model.bin (deflated 8%)\n", + " adding: model/special_tokens_map.json (deflated 86%)\n", + " adding: model/spiece.model (deflated 48%)\n" + ] + } + ], + "source": [ + "!zip -r /content/model model" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "16jB6CpRoslW", + "outputId": "15561efe-747b-4f21-aefc-84fb913c3037" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + " _| _| _| _| _|_|_| _|_|_| _|_|_| _| _| _|_|_| _|_|_|_| _|_| _|_|_| _|_|_|_|\n", + " _| _| _| _| _| _| _| _|_| _| _| _| _| _| _| _|\n", + " _|_|_|_| _| _| _| _|_| _| _|_| _| _| _| _| _| _|_| _|_|_| _|_|_|_| _| _|_|_|\n", + " _| _| _| _| _| _| _| _| _| _| _|_| _| _| _| _| _| _| _|\n", + " _| _| _|_| _|_|_| _|_|_| _|_|_| _| _| _|_|_| _| _| _| _|_|_| _|_|_|_|\n", + " \n", + " To login, `huggingface_hub` requires a token generated from https://huggingface.co/settings/tokens .\n", + "Token: \n", + "Add token as git credential? (Y/n) y\n", + "Token is valid.\n", + "\u001b[1m\u001b[31mCannot authenticate through git-credential as no helper is defined on your machine.\n", + "You might have to re-authenticate when pushing to the Hugging Face Hub.\n", + "Run the following command in your terminal in case you want to set the 'store' credential helper as default.\n", + "\n", + "git config --global credential.helper store\n", + "\n", + "Read https://git-scm.com/book/en/v2/Git-Tools-Credential-Storage for more details.\u001b[0m\n", + "Token has not been saved to git credential helper.\n", + "Your token has been saved to /root/.cache/huggingface/token\n", + "Login successful\n" + ] + } + ], + "source": [ + "!huggingface-cli login" + ] + }, + { + "cell_type": "code", + "execution_count": 33, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 133, + "referenced_widgets": [ + "7e8cfa36fed049e2a7b7d8666bd57cdf", + "3c3d6d206d4844b199743baffb14d6c8", + "7e4c949d6ed246359c2e683f7697886d", + "52a2d3aa6abc4ac0a74ec9a2fa400f2c", + "2b964660bb9f41f599c6282547e23bc2", + "a0a534ea53ba41c2bab6ca1e99fee542", + "5170299b7a664c34b5aad7ea5d0f85c0", + "27cd138062d94582b0323d477c2df364", + "9d1f7fa6d41e46ba8a3a4d330d73d05b", + "992bd52b3e474ce297dfa09fb0007a01", + "6a17c291ee0547f2a31e430db846397c", + "5a2c81a908b94bb8b2a4422effbce25c", + "c22999770c184330a75f5efa68e61cab", + "2f9ccc2d649f4b45a30f3851beff2629", + "b6f2f601496c4018ba14e5615e1e300f", + "1e9fb6e057014cc78f64006ef96d9cb1", + "0f244fbff6de4511883e953c494f0a22", + "5266c0225cc246eab466d882a47e60dd", + "fda5959a3987462b8e75ec01c0eb2fda", + "256c1b04cfd94ec984e6a844c60e881c", + "500de4e8a3c5444aa8395a6bb3fa90f3", + "25b3a226518f44f983cd12cdbde7e434" + ] + }, + "id": "DptjUZgypVN9", + "outputId": "339f2bc3-0ccc-4e72-9bb9-f90d056a6044" + }, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "7e8cfa36fed049e2a7b7d8666bd57cdf", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Upload 1 LFS files: 0%| | 0/1 [00:00