commit 69663af74a25f98dcc5dee499a21c3048faedce6 Author: Aleksandra Jonas Date: Tue Feb 14 23:44:20 2023 +0100 final ver diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..c7d30d1 --- /dev/null +++ b/.gitignore @@ -0,0 +1 @@ +*/.DS_Store diff --git a/all_models.ipynb b/all_models.ipynb new file mode 100644 index 0000000..07c6026 --- /dev/null +++ b/all_models.ipynb @@ -0,0 +1,6530 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "id": "svk2qSrl7ICc" + }, + "source": [ + "# **Uczenie Głębokie - projekt**\n", + "W projekcie wykorzystano dataset [emotion](https://huggingface.co/datasets/emotion), zawierający wpisy nacechowane określonymi emocjami.\n", + "\n", + "
\n", + "\n", + "Labels:\n", + "- 0 - sadness\n", + "- 1 - joy\n", + "- 2 - love\n", + "- 3 - anger\n", + "- 4 - fear\n", + "- 5 - surprise" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "wJ30OIAM7ICf" + }, + "source": [ + "### **REQUIREMENTS**" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "XkE5ENXV7ICf", + "outputId": "68ec24ee-8dcd-48b7-c0ce-3d18c1b9bcd6" + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/\n", + "Requirement already satisfied: transformers in /usr/local/lib/python3.8/dist-packages (4.23.1)\n", + "Requirement already satisfied: scikit-learn in /usr/local/lib/python3.8/dist-packages (1.2.1)\n", + "Requirement already satisfied: accelerate in /usr/local/lib/python3.8/dist-packages (0.16.0)\n", + "Requirement already satisfied: evaluate in /usr/local/lib/python3.8/dist-packages (0.4.0)\n", + "Requirement already satisfied: datasets in /usr/local/lib/python3.8/dist-packages (2.9.0)\n", + "Requirement already satisfied: torch in /usr/local/lib/python3.8/dist-packages (1.13.1)\n", + "Requirement already satisfied: sentencepiece in /usr/local/lib/python3.8/dist-packages (0.1.97)\n", + "Requirement already satisfied: torchvision in /usr/local/lib/python3.8/dist-packages (0.14.1+cu116)\n", + "Requirement already satisfied: filelock in /usr/local/lib/python3.8/dist-packages (from transformers) (3.9.0)\n", + "Requirement already satisfied: tqdm>=4.27 in /usr/local/lib/python3.8/dist-packages (from transformers) (4.64.1)\n", + "Requirement already satisfied: numpy>=1.17 in /usr/local/lib/python3.8/dist-packages (from transformers) (1.21.6)\n", + "Requirement already satisfied: huggingface-hub<1.0,>=0.10.0 in /usr/local/lib/python3.8/dist-packages (from transformers) (0.12.0)\n", + "Requirement already satisfied: 
regex!=2019.12.17 in /usr/local/lib/python3.8/dist-packages (from transformers) (2022.6.2)\n", + "Requirement already satisfied: tokenizers!=0.11.3,<0.14,>=0.11.1 in /usr/local/lib/python3.8/dist-packages (from transformers) (0.13.2)\n", + "Requirement already satisfied: requests in /usr/local/lib/python3.8/dist-packages (from transformers) (2.25.1)\n", + "Requirement already satisfied: pyyaml>=5.1 in /usr/local/lib/python3.8/dist-packages (from transformers) (6.0)\n", + "Requirement already satisfied: packaging>=20.0 in /usr/local/lib/python3.8/dist-packages (from transformers) (23.0)\n", + "Requirement already satisfied: threadpoolctl>=2.0.0 in /usr/local/lib/python3.8/dist-packages (from scikit-learn) (3.1.0)\n", + "Requirement already satisfied: joblib>=1.1.1 in /usr/local/lib/python3.8/dist-packages (from scikit-learn) (1.2.0)\n", + "Requirement already satisfied: scipy>=1.3.2 in /usr/local/lib/python3.8/dist-packages (from scikit-learn) (1.7.3)\n", + "Requirement already satisfied: psutil in /usr/local/lib/python3.8/dist-packages (from accelerate) (5.4.8)\n", + "Requirement already satisfied: dill in /usr/local/lib/python3.8/dist-packages (from evaluate) (0.3.6)\n", + "Requirement already satisfied: responses<0.19 in /usr/local/lib/python3.8/dist-packages (from evaluate) (0.18.0)\n", + "Requirement already satisfied: fsspec[http]>=2021.05.0 in /usr/local/lib/python3.8/dist-packages (from evaluate) (2023.1.0)\n", + "Requirement already satisfied: xxhash in /usr/local/lib/python3.8/dist-packages (from evaluate) (3.2.0)\n", + "Requirement already satisfied: pandas in /usr/local/lib/python3.8/dist-packages (from evaluate) (1.3.5)\n", + "Requirement already satisfied: multiprocess in /usr/local/lib/python3.8/dist-packages (from evaluate) (0.70.14)\n", + "Requirement already satisfied: pyarrow>=6.0.0 in /usr/local/lib/python3.8/dist-packages (from datasets) (9.0.0)\n", + "Requirement already satisfied: aiohttp in /usr/local/lib/python3.8/dist-packages (from 
datasets) (3.8.3)\n", + "Requirement already satisfied: nvidia-cublas-cu11==11.10.3.66 in /usr/local/lib/python3.8/dist-packages (from torch) (11.10.3.66)\n", + "Requirement already satisfied: nvidia-cuda-runtime-cu11==11.7.99 in /usr/local/lib/python3.8/dist-packages (from torch) (11.7.99)\n", + "Requirement already satisfied: typing-extensions in /usr/local/lib/python3.8/dist-packages (from torch) (4.4.0)\n", + "Requirement already satisfied: nvidia-cuda-nvrtc-cu11==11.7.99 in /usr/local/lib/python3.8/dist-packages (from torch) (11.7.99)\n", + "Requirement already satisfied: nvidia-cudnn-cu11==8.5.0.96 in /usr/local/lib/python3.8/dist-packages (from torch) (8.5.0.96)\n", + "Requirement already satisfied: wheel in /usr/local/lib/python3.8/dist-packages (from nvidia-cublas-cu11==11.10.3.66->torch) (0.38.4)\n", + "Requirement already satisfied: setuptools in /usr/local/lib/python3.8/dist-packages (from nvidia-cublas-cu11==11.10.3.66->torch) (57.4.0)\n", + "Requirement already satisfied: pillow!=8.3.*,>=5.3.0 in /usr/local/lib/python3.8/dist-packages (from torchvision) (7.1.2)\n", + "Requirement already satisfied: async-timeout<5.0,>=4.0.0a3 in /usr/local/lib/python3.8/dist-packages (from aiohttp->datasets) (4.0.2)\n", + "Requirement already satisfied: yarl<2.0,>=1.0 in /usr/local/lib/python3.8/dist-packages (from aiohttp->datasets) (1.8.2)\n", + "Requirement already satisfied: multidict<7.0,>=4.5 in /usr/local/lib/python3.8/dist-packages (from aiohttp->datasets) (6.0.4)\n", + "Requirement already satisfied: frozenlist>=1.1.1 in /usr/local/lib/python3.8/dist-packages (from aiohttp->datasets) (1.3.3)\n", + "Requirement already satisfied: aiosignal>=1.1.2 in /usr/local/lib/python3.8/dist-packages (from aiohttp->datasets) (1.3.1)\n", + "Requirement already satisfied: attrs>=17.3.0 in /usr/local/lib/python3.8/dist-packages (from aiohttp->datasets) (22.2.0)\n", + "Requirement already satisfied: charset-normalizer<3.0,>=2.0 in /usr/local/lib/python3.8/dist-packages (from 
aiohttp->datasets) (2.1.1)\n", + "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.8/dist-packages (from requests->transformers) (2022.12.7)\n", + "Requirement already satisfied: urllib3<1.27,>=1.21.1 in /usr/local/lib/python3.8/dist-packages (from requests->transformers) (1.26.14)\n", + "Requirement already satisfied: idna<3,>=2.5 in /usr/local/lib/python3.8/dist-packages (from requests->transformers) (2.10)\n", + "Requirement already satisfied: chardet<5,>=3.0.2 in /usr/local/lib/python3.8/dist-packages (from requests->transformers) (4.0.0)\n", + "Requirement already satisfied: python-dateutil>=2.7.3 in /usr/local/lib/python3.8/dist-packages (from pandas->evaluate) (2.8.2)\n", + "Requirement already satisfied: pytz>=2017.3 in /usr/local/lib/python3.8/dist-packages (from pandas->evaluate) (2022.7.1)\n", + "Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.8/dist-packages (from python-dateutil>=2.7.3->pandas->evaluate) (1.15.0)\n" + ] + } + ], + "source": [ + "!pip3 install transformers scikit-learn accelerate evaluate datasets torch sentencepiece torchvision" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": { + "id": "MrV5G1gW7ICg" + }, + "outputs": [], + "source": [ + "import os\n", + "import json\n", + "from pathlib import Path\n", + "from typing import Dict, List\n", + "from datasets import load_dataset\n", + "import torch\n", + "import pandas as pd\n", + "\n", + "os.environ['TOKENIZERS_PARALLELISM'] = 'true'" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "Y107u4JG7ICh" + }, + "source": [ + "### **DATA PREP**" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "PmgAAQFV7ICh", + "outputId": "e6f4f065-4d0d-4102-d96a-c5ca791dd113" + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "No config specified, defaulting to: emotion/split\n", + "Found cached 
dataset emotion (/root/.cache/huggingface/datasets/emotion/split/1.0.0/cca5efe2dfeb58c1d098e0f9eeb200e9927d889b5a03c67097275dfb5fe463bd)\n", + "\r 0% 0/3 [00:00,\n", + "ignore_data_skip=False,\n", + "include_inputs_for_metrics=False,\n", + "jit_mode_eval=False,\n", + "label_names=None,\n", + "label_smoothing_factor=0.0,\n", + "learning_rate=2e-05,\n", + "length_column_name=length,\n", + "load_best_model_at_end=False,\n", + "local_rank=-1,\n", + "log_level=passive,\n", + "log_level_replica=passive,\n", + "log_on_each_node=True,\n", + "logging_dir=out/emotion/roberta/runs/Feb14_21-45-00_fc0011e45a00,\n", + "logging_first_step=False,\n", + "logging_nan_inf_filter=True,\n", + "logging_steps=500,\n", + "logging_strategy=steps,\n", + "lr_scheduler_type=linear,\n", + "max_grad_norm=1.0,\n", + "max_steps=-1,\n", + "metric_for_best_model=None,\n", + "mp_parameters=,\n", + "no_cuda=False,\n", + "num_train_epochs=1.0,\n", + "optim=adamw_hf,\n", + "output_dir=out/emotion/roberta,\n", + "overwrite_output_dir=True,\n", + "past_index=-1,\n", + "per_device_eval_batch_size=24,\n", + "per_device_train_batch_size=24,\n", + "prediction_loss_only=False,\n", + "push_to_hub=False,\n", + "push_to_hub_model_id=None,\n", + "push_to_hub_organization=None,\n", + "push_to_hub_token=,\n", + "ray_scope=last,\n", + "remove_unused_columns=True,\n", + "report_to=['tensorboard'],\n", + "resume_from_checkpoint=None,\n", + "run_name=out/emotion/roberta,\n", + "save_on_each_node=False,\n", + "save_steps=500,\n", + "save_strategy=steps,\n", + "save_total_limit=None,\n", + "seed=42,\n", + "sharded_ddp=[],\n", + "skip_memory_metrics=True,\n", + "tf32=None,\n", + "torchdynamo=None,\n", + "tpu_metrics_debug=False,\n", + "tpu_num_cores=None,\n", + "use_ipex=False,\n", + "use_legacy_prediction_loop=False,\n", + "use_mps_device=False,\n", + "warmup_ratio=0.0,\n", + "warmup_steps=0,\n", + "weight_decay=0.0,\n", + "xpu_backend=None,\n", + ")\n", + "INFO:__main__:load a local file for train: data/train.json\n", + 
"INFO:__main__:load a local file for validation: data/valid.json\n", + "INFO:__main__:load a local file for test: data/test.json\n", + "WARNING:datasets.builder:Using custom data configuration default-01aa9d8252a24a0d\n", + "INFO:datasets.info:Loading Dataset Infos from /usr/local/lib/python3.8/dist-packages/datasets/packaged_modules/json\n", + "INFO:datasets.builder:Generating dataset json (/content/roberta_training_cache/json/default-01aa9d8252a24a0d/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51)\n", + "Downloading and preparing dataset json/default to /content/roberta_training_cache/json/default-01aa9d8252a24a0d/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51...\n", + "Downloading data files: 100% 3/3 [00:00<00:00, 11491.24it/s]\n", + "INFO:datasets.download.download_manager:Downloading took 0.0 min\n", + "INFO:datasets.download.download_manager:Checksum Computation took 0.0 min\n", + "Extracting data files: 100% 3/3 [00:00<00:00, 1882.54it/s]\n", + "INFO:datasets.utils.info_utils:Unable to verify checksums.\n", + "INFO:datasets.builder:Generating train split\n", + "INFO:datasets.builder:Generating validation split\n", + "INFO:datasets.builder:Generating test split\n", + "INFO:datasets.utils.info_utils:Unable to verify splits sizes.\n", + "Dataset json downloaded and prepared to /content/roberta_training_cache/json/default-01aa9d8252a24a0d/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51. 
Subsequent calls will reuse this data.\n", + "100% 3/3 [00:00<00:00, 573.49it/s]\n", + "Downloading (…)lve/main/config.json: 100% 481/481 [00:00<00:00, 83.8kB/s]\n", + "[INFO|configuration_utils.py:653] 2023-02-14 21:45:01,575 >> loading configuration file config.json from cache at roberta_training_cache/models--roberta-base/snapshots/ff46155979338ff8063cdad90908b498ab91b181/config.json\n", + "[INFO|configuration_utils.py:705] 2023-02-14 21:45:01,576 >> Model config RobertaConfig {\n", + " \"_name_or_path\": \"roberta-base\",\n", + " \"architectures\": [\n", + " \"RobertaForMaskedLM\"\n", + " ],\n", + " \"attention_probs_dropout_prob\": 0.1,\n", + " \"bos_token_id\": 0,\n", + " \"classifier_dropout\": null,\n", + " \"eos_token_id\": 2,\n", + " \"hidden_act\": \"gelu\",\n", + " \"hidden_dropout_prob\": 0.1,\n", + " \"hidden_size\": 768,\n", + " \"id2label\": {\n", + " \"0\": \"LABEL_0\",\n", + " \"1\": \"LABEL_1\",\n", + " \"2\": \"LABEL_2\",\n", + " \"3\": \"LABEL_3\",\n", + " \"4\": \"LABEL_4\",\n", + " \"5\": \"LABEL_5\"\n", + " },\n", + " \"initializer_range\": 0.02,\n", + " \"intermediate_size\": 3072,\n", + " \"label2id\": {\n", + " \"LABEL_0\": 0,\n", + " \"LABEL_1\": 1,\n", + " \"LABEL_2\": 2,\n", + " \"LABEL_3\": 3,\n", + " \"LABEL_4\": 4,\n", + " \"LABEL_5\": 5\n", + " },\n", + " \"layer_norm_eps\": 1e-05,\n", + " \"max_position_embeddings\": 514,\n", + " \"model_type\": \"roberta\",\n", + " \"num_attention_heads\": 12,\n", + " \"num_hidden_layers\": 12,\n", + " \"pad_token_id\": 1,\n", + " \"position_embedding_type\": \"absolute\",\n", + " \"transformers_version\": \"4.23.1\",\n", + " \"type_vocab_size\": 1,\n", + " \"use_cache\": true,\n", + " \"vocab_size\": 50265\n", + "}\n", + "\n", + "[INFO|tokenization_auto.py:418] 2023-02-14 21:45:01,670 >> Could not locate the tokenizer configuration file, will try to use the model config instead.\n", + "[INFO|configuration_utils.py:653] 2023-02-14 21:45:01,762 >> loading configuration file config.json from cache 
at roberta_training_cache/models--roberta-base/snapshots/ff46155979338ff8063cdad90908b498ab91b181/config.json\n", + "[INFO|configuration_utils.py:705] 2023-02-14 21:45:01,763 >> Model config RobertaConfig {\n", + " \"_name_or_path\": \"roberta-base\",\n", + " \"architectures\": [\n", + " \"RobertaForMaskedLM\"\n", + " ],\n", + " \"attention_probs_dropout_prob\": 0.1,\n", + " \"bos_token_id\": 0,\n", + " \"classifier_dropout\": null,\n", + " \"eos_token_id\": 2,\n", + " \"hidden_act\": \"gelu\",\n", + " \"hidden_dropout_prob\": 0.1,\n", + " \"hidden_size\": 768,\n", + " \"initializer_range\": 0.02,\n", + " \"intermediate_size\": 3072,\n", + " \"layer_norm_eps\": 1e-05,\n", + " \"max_position_embeddings\": 514,\n", + " \"model_type\": \"roberta\",\n", + " \"num_attention_heads\": 12,\n", + " \"num_hidden_layers\": 12,\n", + " \"pad_token_id\": 1,\n", + " \"position_embedding_type\": \"absolute\",\n", + " \"transformers_version\": \"4.23.1\",\n", + " \"type_vocab_size\": 1,\n", + " \"use_cache\": true,\n", + " \"vocab_size\": 50265\n", + "}\n", + "\n", + "Downloading (…)olve/main/vocab.json: 100% 899k/899k [00:00<00:00, 9.36MB/s]\n", + "Downloading (…)olve/main/merges.txt: 100% 456k/456k [00:00<00:00, 4.95MB/s]\n", + "Downloading (…)/main/tokenizer.json: 100% 1.36M/1.36M [00:00<00:00, 11.7MB/s]\n", + "[INFO|tokenization_utils_base.py:1773] 2023-02-14 21:45:02,975 >> loading file vocab.json from cache at roberta_training_cache/models--roberta-base/snapshots/ff46155979338ff8063cdad90908b498ab91b181/vocab.json\n", + "[INFO|tokenization_utils_base.py:1773] 2023-02-14 21:45:02,976 >> loading file merges.txt from cache at roberta_training_cache/models--roberta-base/snapshots/ff46155979338ff8063cdad90908b498ab91b181/merges.txt\n", + "[INFO|tokenization_utils_base.py:1773] 2023-02-14 21:45:02,976 >> loading file tokenizer.json from cache at roberta_training_cache/models--roberta-base/snapshots/ff46155979338ff8063cdad90908b498ab91b181/tokenizer.json\n", + 
"[INFO|tokenization_utils_base.py:1773] 2023-02-14 21:45:02,976 >> loading file added_tokens.json from cache at None\n", + "[INFO|tokenization_utils_base.py:1773] 2023-02-14 21:45:02,976 >> loading file special_tokens_map.json from cache at None\n", + "[INFO|tokenization_utils_base.py:1773] 2023-02-14 21:45:02,976 >> loading file tokenizer_config.json from cache at None\n", + "[INFO|configuration_utils.py:653] 2023-02-14 21:45:02,976 >> loading configuration file config.json from cache at roberta_training_cache/models--roberta-base/snapshots/ff46155979338ff8063cdad90908b498ab91b181/config.json\n", + "[INFO|configuration_utils.py:705] 2023-02-14 21:45:02,977 >> Model config RobertaConfig {\n", + " \"_name_or_path\": \"roberta-base\",\n", + " \"architectures\": [\n", + " \"RobertaForMaskedLM\"\n", + " ],\n", + " \"attention_probs_dropout_prob\": 0.1,\n", + " \"bos_token_id\": 0,\n", + " \"classifier_dropout\": null,\n", + " \"eos_token_id\": 2,\n", + " \"hidden_act\": \"gelu\",\n", + " \"hidden_dropout_prob\": 0.1,\n", + " \"hidden_size\": 768,\n", + " \"initializer_range\": 0.02,\n", + " \"intermediate_size\": 3072,\n", + " \"layer_norm_eps\": 1e-05,\n", + " \"max_position_embeddings\": 514,\n", + " \"model_type\": \"roberta\",\n", + " \"num_attention_heads\": 12,\n", + " \"num_hidden_layers\": 12,\n", + " \"pad_token_id\": 1,\n", + " \"position_embedding_type\": \"absolute\",\n", + " \"transformers_version\": \"4.23.1\",\n", + " \"type_vocab_size\": 1,\n", + " \"use_cache\": true,\n", + " \"vocab_size\": 50265\n", + "}\n", + "\n", + "INFO:__main__:Using implementation from class: AutoModelForSequenceClassification\n", + "Downloading (…)\"pytorch_model.bin\";: 100% 501M/501M [00:04<00:00, 105MB/s]\n", + "[INFO|modeling_utils.py:2156] 2023-02-14 21:45:08,072 >> loading weights file pytorch_model.bin from cache at roberta_training_cache/models--roberta-base/snapshots/ff46155979338ff8063cdad90908b498ab91b181/pytorch_model.bin\n", + "[WARNING|modeling_utils.py:2596] 
2023-02-14 21:45:09,415 >> Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.layer_norm.weight', 'lm_head.bias', 'roberta.pooler.dense.weight', 'roberta.pooler.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.decoder.weight']\n", + "- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).\n", + "- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n", + "[WARNING|modeling_utils.py:2608] 2023-02-14 21:45:09,415 >> Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.out_proj.bias', 'classifier.dense.bias', 'classifier.out_proj.weight', 'classifier.dense.weight']\n", + "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n", + "\n", + "\n", + "Frozen layers:\n", + "[('roberta.encoder.layer.0.attention.self.query.weight', False), ('roberta.encoder.layer.0.attention.self.query.bias', False), ('roberta.encoder.layer.0.attention.self.key.weight', False), ('roberta.encoder.layer.0.attention.self.key.bias', False), ('roberta.encoder.layer.0.attention.self.value.weight', False), ('roberta.encoder.layer.0.attention.self.value.bias', False), ('roberta.encoder.layer.0.attention.output.dense.weight', False), ('roberta.encoder.layer.0.attention.output.dense.bias', False), ('roberta.encoder.layer.0.attention.output.LayerNorm.weight', False), ('roberta.encoder.layer.0.attention.output.LayerNorm.bias', False), 
('roberta.encoder.layer.0.intermediate.dense.weight', False), ('roberta.encoder.layer.0.intermediate.dense.bias', False), ('roberta.encoder.layer.0.output.dense.weight', False), ('roberta.encoder.layer.0.output.dense.bias', False), ('roberta.encoder.layer.0.output.LayerNorm.weight', False), ('roberta.encoder.layer.0.output.LayerNorm.bias', False), ('roberta.encoder.layer.2.attention.self.query.weight', False), ('roberta.encoder.layer.2.attention.self.query.bias', False), ('roberta.encoder.layer.2.attention.self.key.weight', False), ('roberta.encoder.layer.2.attention.self.key.bias', False), ('roberta.encoder.layer.2.attention.self.value.weight', False), ('roberta.encoder.layer.2.attention.self.value.bias', False), ('roberta.encoder.layer.2.attention.output.dense.weight', False), ('roberta.encoder.layer.2.attention.output.dense.bias', False), ('roberta.encoder.layer.2.attention.output.LayerNorm.weight', False), ('roberta.encoder.layer.2.attention.output.LayerNorm.bias', False), ('roberta.encoder.layer.2.intermediate.dense.weight', False), ('roberta.encoder.layer.2.intermediate.dense.bias', False), ('roberta.encoder.layer.2.output.dense.weight', False), ('roberta.encoder.layer.2.output.dense.bias', False), ('roberta.encoder.layer.2.output.LayerNorm.weight', False), ('roberta.encoder.layer.2.output.LayerNorm.bias', False), ('roberta.encoder.layer.4.attention.self.query.weight', False), ('roberta.encoder.layer.4.attention.self.query.bias', False), ('roberta.encoder.layer.4.attention.self.key.weight', False), ('roberta.encoder.layer.4.attention.self.key.bias', False), ('roberta.encoder.layer.4.attention.self.value.weight', False), ('roberta.encoder.layer.4.attention.self.value.bias', False), ('roberta.encoder.layer.4.attention.output.dense.weight', False), ('roberta.encoder.layer.4.attention.output.dense.bias', False), ('roberta.encoder.layer.4.attention.output.LayerNorm.weight', False), ('roberta.encoder.layer.4.attention.output.LayerNorm.bias', False), 
('roberta.encoder.layer.4.intermediate.dense.weight', False), ('roberta.encoder.layer.4.intermediate.dense.bias', False), ('roberta.encoder.layer.4.output.dense.weight', False), ('roberta.encoder.layer.4.output.dense.bias', False), ('roberta.encoder.layer.4.output.LayerNorm.weight', False), ('roberta.encoder.layer.4.output.LayerNorm.bias', False), ('roberta.encoder.layer.6.attention.self.query.weight', False), ('roberta.encoder.layer.6.attention.self.query.bias', False), ('roberta.encoder.layer.6.attention.self.key.weight', False), ('roberta.encoder.layer.6.attention.self.key.bias', False), ('roberta.encoder.layer.6.attention.self.value.weight', False), ('roberta.encoder.layer.6.attention.self.value.bias', False), ('roberta.encoder.layer.6.attention.output.dense.weight', False), ('roberta.encoder.layer.6.attention.output.dense.bias', False), ('roberta.encoder.layer.6.attention.output.LayerNorm.weight', False), ('roberta.encoder.layer.6.attention.output.LayerNorm.bias', False), ('roberta.encoder.layer.6.intermediate.dense.weight', False), ('roberta.encoder.layer.6.intermediate.dense.bias', False), ('roberta.encoder.layer.6.output.dense.weight', False), ('roberta.encoder.layer.6.output.dense.bias', False), ('roberta.encoder.layer.6.output.LayerNorm.weight', False), ('roberta.encoder.layer.6.output.LayerNorm.bias', False), ('roberta.encoder.layer.8.attention.self.query.weight', False), ('roberta.encoder.layer.8.attention.self.query.bias', False), ('roberta.encoder.layer.8.attention.self.key.weight', False), ('roberta.encoder.layer.8.attention.self.key.bias', False), ('roberta.encoder.layer.8.attention.self.value.weight', False), ('roberta.encoder.layer.8.attention.self.value.bias', False), ('roberta.encoder.layer.8.attention.output.dense.weight', False), ('roberta.encoder.layer.8.attention.output.dense.bias', False), ('roberta.encoder.layer.8.attention.output.LayerNorm.weight', False), ('roberta.encoder.layer.8.attention.output.LayerNorm.bias', False), 
('roberta.encoder.layer.8.intermediate.dense.weight', False), ('roberta.encoder.layer.8.intermediate.dense.bias', False), ('roberta.encoder.layer.8.output.dense.weight', False), ('roberta.encoder.layer.8.output.dense.bias', False), ('roberta.encoder.layer.8.output.LayerNorm.weight', False), ('roberta.encoder.layer.8.output.LayerNorm.bias', False), ('roberta.encoder.layer.10.attention.self.query.weight', False), ('roberta.encoder.layer.10.attention.self.query.bias', False), ('roberta.encoder.layer.10.attention.self.key.weight', False), ('roberta.encoder.layer.10.attention.self.key.bias', False), ('roberta.encoder.layer.10.attention.self.value.weight', False), ('roberta.encoder.layer.10.attention.self.value.bias', False), ('roberta.encoder.layer.10.attention.output.dense.weight', False), ('roberta.encoder.layer.10.attention.output.dense.bias', False), ('roberta.encoder.layer.10.attention.output.LayerNorm.weight', False), ('roberta.encoder.layer.10.attention.output.LayerNorm.bias', False), ('roberta.encoder.layer.10.intermediate.dense.weight', False), ('roberta.encoder.layer.10.intermediate.dense.bias', False), ('roberta.encoder.layer.10.output.dense.weight', False), ('roberta.encoder.layer.10.output.dense.bias', False), ('roberta.encoder.layer.10.output.LayerNorm.weight', False), ('roberta.encoder.layer.10.output.LayerNorm.bias', False)] \n", + "\n", + "\n", + "Running tokenizer on dataset: 0% 0/16 [00:00> The following columns in the training set don't have a corresponding argument in `RobertaForSequenceClassification.forward` and have been ignored: text. If text are not expected by `RobertaForSequenceClassification.forward`, you can safely ignore this message.\n", + "/usr/local/lib/python3.8/dist-packages/transformers/optimization.py:306: FutureWarning: This implementation of AdamW is deprecated and will be removed in a future version. 
Use the PyTorch implementation torch.optim.AdamW instead, or set `no_deprecation_warning=True` to disable this warning\n", + " warnings.warn(\n", + "[INFO|trainer.py:1607] 2023-02-14 21:45:13,109 >> ***** Running training *****\n", + "[INFO|trainer.py:1608] 2023-02-14 21:45:13,109 >> Num examples = 16000\n", + "[INFO|trainer.py:1609] 2023-02-14 21:45:13,109 >> Num Epochs = 1\n", + "[INFO|trainer.py:1610] 2023-02-14 21:45:13,109 >> Instantaneous batch size per device = 24\n", + "[INFO|trainer.py:1611] 2023-02-14 21:45:13,109 >> Total train batch size (w. parallel, distributed & accumulation) = 24\n", + "[INFO|trainer.py:1612] 2023-02-14 21:45:13,109 >> Gradient Accumulation steps = 1\n", + "[INFO|trainer.py:1613] 2023-02-14 21:45:13,109 >> Total optimization steps = 667\n", + "{'loss': 0.8083, 'learning_rate': 5.0074962518740634e-06, 'epoch': 0.75}\n", + " 75% 500/667 [00:58<00:19, 8.76it/s][INFO|trainer.py:2656] 2023-02-14 21:46:11,148 >> Saving model checkpoint to out/emotion/roberta/checkpoint-500\n", + "[INFO|configuration_utils.py:447] 2023-02-14 21:46:11,149 >> Configuration saved in out/emotion/roberta/checkpoint-500/config.json\n", + "[INFO|modeling_utils.py:1624] 2023-02-14 21:46:12,047 >> Model weights saved in out/emotion/roberta/checkpoint-500/pytorch_model.bin\n", + "[INFO|tokenization_utils_base.py:2123] 2023-02-14 21:46:12,048 >> tokenizer config file saved in out/emotion/roberta/checkpoint-500/tokenizer_config.json\n", + "[INFO|tokenization_utils_base.py:2130] 2023-02-14 21:46:12,048 >> Special tokens file saved in out/emotion/roberta/checkpoint-500/special_tokens_map.json\n", + "100% 666/667 [01:19<00:00, 8.78it/s][INFO|trainer.py:1852] 2023-02-14 21:46:32,443 >> \n", + "\n", + "Training completed. 
Do not forget to share your model on huggingface.co/models =)\n", + "\n", + "\n", + "{'train_runtime': 79.3341, 'train_samples_per_second': 201.679, 'train_steps_per_second': 8.407, 'train_loss': 0.7161429089227359, 'epoch': 1.0}\n", + "100% 667/667 [01:19<00:00, 8.41it/s]\n", + "[INFO|trainer.py:2656] 2023-02-14 21:46:32,445 >> Saving model checkpoint to out/emotion/roberta\n", + "[INFO|configuration_utils.py:447] 2023-02-14 21:46:32,446 >> Configuration saved in out/emotion/roberta/config.json\n", + "[INFO|modeling_utils.py:1624] 2023-02-14 21:46:33,422 >> Model weights saved in out/emotion/roberta/pytorch_model.bin\n", + "[INFO|tokenization_utils_base.py:2123] 2023-02-14 21:46:33,422 >> tokenizer config file saved in out/emotion/roberta/tokenizer_config.json\n", + "[INFO|tokenization_utils_base.py:2130] 2023-02-14 21:46:33,423 >> Special tokens file saved in out/emotion/roberta/special_tokens_map.json\n", + "***** train metrics *****\n", + " epoch = 1.0\n", + " train_loss = 0.7161\n", + " train_runtime = 0:01:19.33\n", + " train_samples = 16000\n", + " train_samples_per_second = 201.679\n", + " train_steps_per_second = 8.407\n", + "INFO:__main__:*** Evaluate ***\n", + "[INFO|trainer.py:725] 2023-02-14 21:46:33,524 >> The following columns in the evaluation set don't have a corresponding argument in `RobertaForSequenceClassification.forward` and have been ignored: text. 
If text are not expected by `RobertaForSequenceClassification.forward`, you can safely ignore this message.\n", + "[INFO|trainer.py:2907] 2023-02-14 21:46:33,526 >> ***** Running Evaluation *****\n", + "[INFO|trainer.py:2909] 2023-02-14 21:46:33,526 >> Num examples = 2000\n", + "[INFO|trainer.py:2912] 2023-02-14 21:46:33,526 >> Batch size = 24\n", + "100% 84/84 [00:03<00:00, 23.66it/s]\n", + "***** eval metrics *****\n", + " epoch = 1.0\n", + " eval_accuracy = 0.889\n", + " eval_loss = 0.3302\n", + " eval_runtime = 0:00:03.59\n", + " eval_samples = 2000\n", + " eval_samples_per_second = 556.411\n", + " eval_steps_per_second = 23.369\n", + "INFO:__main__:*** Predict ***\n", + "[INFO|trainer.py:725] 2023-02-14 21:46:37,124 >> The following columns in the test set don't have a corresponding argument in `RobertaForSequenceClassification.forward` and have been ignored: text. If text are not expected by `RobertaForSequenceClassification.forward`, you can safely ignore this message.\n", + "[INFO|trainer.py:2907] 2023-02-14 21:46:37,125 >> ***** Running Prediction *****\n", + "[INFO|trainer.py:2909] 2023-02-14 21:46:37,125 >> Num examples = 2000\n", + "[INFO|trainer.py:2912] 2023-02-14 21:46:37,125 >> Batch size = 24\n", + "100% 84/84 [00:03<00:00, 23.68it/s]\n", + "INFO:__main__:***** Predict results None *****\n", + "[INFO|modelcard.py:444] 2023-02-14 21:46:40,840 >> Dropping the following result as it does not have all the necessary fields:\n", + "{'task': {'name': 'Text Classification', 'type': 'text-classification'}, 'metrics': [{'name': 'Accuracy', 'type': 'accuracy', 'value': 0.8889999985694885}]}\n" + ] + } + ], + "source": [ + "!python run_glue.py \\\n", + " --cache_dir roberta_training_cache \\\n", + " --model_name_or_path roberta-base \\\n", + " --train_file data/train.json \\\n", + " --validation_file data/valid.json \\\n", + " --test_file data/test.json \\\n", + " --per_device_train_batch_size 24 \\\n", + " --per_device_eval_batch_size 24 \\\n", + " --do_train 
\\n", + " --do_eval \\\n", + " --do_predict \\\n", + " --max_seq_length 128 \\\n", + " --learning_rate 2e-5 \\\n", + " --num_train_epochs 1 \\\n", + " --output_dir out/emotion/roberta \\\n", + " --overwrite_output_dir" + ] + }, + { + "cell_type": "markdown", + "source": [ + "- full data\n", + "- sequence length: 128\n", + "- LeakyReLU instead of ReLU\n", + "- every other layer frozen\n", + "- custom head" + ], + "metadata": { + "id": "b1iFFLFAf9PC" + } + }, + { + "cell_type": "code", + "source": [ + "!python run_glue.py \\\n", + " --cache_dir roberta_custom_training_cache \\\n", + " --model_name_or_path roberta-base \\\n", + " --custom_model roberta_custom \\\n", + " --train_file data/train.json \\\n", + " --validation_file data/valid.json \\\n", + " --test_file data/test.json \\\n", + " --per_device_train_batch_size 24 \\\n", + " --per_device_eval_batch_size 24 \\\n", + " --do_train \\\n", + " --do_eval \\\n", + " --do_predict \\\n", + " --max_seq_length 128 \\\n", + " --learning_rate 2e-5 \\\n", + " --num_train_epochs 1 \\\n", + " --output_dir out/emotion/roberta_custom \\\n", + " --overwrite_output_dir" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "WzRBwNKqkDAk", + "outputId": "8d042117-3af6-4041-d1a5-d70024df24fb" + }, + "execution_count": 9, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "2023-02-14 21:47:02.722049: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 AVX512F AVX512_VNNI FMA\n", + "To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.\n", + "2023-02-14 21:47:02.876002: I tensorflow/core/util/port.cc:104] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. 
To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.\n", + "2023-02-14 21:47:03.659342: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/lib64-nvidia\n", + "2023-02-14 21:47:03.659451: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/lib64-nvidia\n", + "2023-02-14 21:47:03.659470: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly.\n", + "WARNING:__main__:Process rank: -1, device: cuda:0, n_gpu: 1distributed training: False, 16-bits training: False\n", + "INFO:__main__:Training/evaluation parameters TrainingArguments(\n", + "_n_gpu=1,\n", + "adafactor=False,\n", + "adam_beta1=0.9,\n", + "adam_beta2=0.999,\n", + "adam_epsilon=1e-08,\n", + "auto_find_batch_size=False,\n", + "bf16=False,\n", + "bf16_full_eval=False,\n", + "data_seed=None,\n", + "dataloader_drop_last=False,\n", + "dataloader_num_workers=0,\n", + "dataloader_pin_memory=True,\n", + "ddp_bucket_cap_mb=None,\n", + "ddp_find_unused_parameters=None,\n", + "ddp_timeout=1800,\n", + "debug=[],\n", + "deepspeed=None,\n", + "disable_tqdm=False,\n", + "do_eval=True,\n", + "do_predict=True,\n", + "do_train=True,\n", + "eval_accumulation_steps=None,\n", + "eval_delay=0,\n", + "eval_steps=None,\n", + "evaluation_strategy=no,\n", + "fp16=False,\n", + "fp16_backend=auto,\n", + "fp16_full_eval=False,\n", + "fp16_opt_level=O1,\n", + "fsdp=[],\n", + "fsdp_min_num_params=0,\n", + "fsdp_transformer_layer_cls_to_wrap=None,\n", + 
"full_determinism=False,\n", + "gradient_accumulation_steps=1,\n", + "gradient_checkpointing=False,\n", + "greater_is_better=None,\n", + "group_by_length=False,\n", + "half_precision_backend=auto,\n", + "hub_model_id=None,\n", + "hub_private_repo=False,\n", + "hub_strategy=every_save,\n", + "hub_token=,\n", + "ignore_data_skip=False,\n", + "include_inputs_for_metrics=False,\n", + "jit_mode_eval=False,\n", + "label_names=None,\n", + "label_smoothing_factor=0.0,\n", + "learning_rate=2e-05,\n", + "length_column_name=length,\n", + "load_best_model_at_end=False,\n", + "local_rank=-1,\n", + "log_level=passive,\n", + "log_level_replica=passive,\n", + "log_on_each_node=True,\n", + "logging_dir=out/emotion/roberta_custom/runs/Feb14_21-47-05_fc0011e45a00,\n", + "logging_first_step=False,\n", + "logging_nan_inf_filter=True,\n", + "logging_steps=500,\n", + "logging_strategy=steps,\n", + "lr_scheduler_type=linear,\n", + "max_grad_norm=1.0,\n", + "max_steps=-1,\n", + "metric_for_best_model=None,\n", + "mp_parameters=,\n", + "no_cuda=False,\n", + "num_train_epochs=1.0,\n", + "optim=adamw_hf,\n", + "output_dir=out/emotion/roberta_custom,\n", + "overwrite_output_dir=True,\n", + "past_index=-1,\n", + "per_device_eval_batch_size=24,\n", + "per_device_train_batch_size=24,\n", + "prediction_loss_only=False,\n", + "push_to_hub=False,\n", + "push_to_hub_model_id=None,\n", + "push_to_hub_organization=None,\n", + "push_to_hub_token=,\n", + "ray_scope=last,\n", + "remove_unused_columns=True,\n", + "report_to=['tensorboard'],\n", + "resume_from_checkpoint=None,\n", + "run_name=out/emotion/roberta_custom,\n", + "save_on_each_node=False,\n", + "save_steps=500,\n", + "save_strategy=steps,\n", + "save_total_limit=None,\n", + "seed=42,\n", + "sharded_ddp=[],\n", + "skip_memory_metrics=True,\n", + "tf32=None,\n", + "torchdynamo=None,\n", + "tpu_metrics_debug=False,\n", + "tpu_num_cores=None,\n", + "use_ipex=False,\n", + "use_legacy_prediction_loop=False,\n", + "use_mps_device=False,\n", + 
"warmup_ratio=0.0,\n", + "warmup_steps=0,\n", + "weight_decay=0.0,\n", + "xpu_backend=None,\n", + ")\n", + "INFO:__main__:load a local file for train: data/train.json\n", + "INFO:__main__:load a local file for validation: data/valid.json\n", + "INFO:__main__:load a local file for test: data/test.json\n", + "WARNING:datasets.builder:Using custom data configuration default-01aa9d8252a24a0d\n", + "INFO:datasets.info:Loading Dataset Infos from /usr/local/lib/python3.8/dist-packages/datasets/packaged_modules/json\n", + "INFO:datasets.builder:Generating dataset json (/content/roberta_custom_training_cache/json/default-01aa9d8252a24a0d/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51)\n", + "Downloading and preparing dataset json/default to /content/roberta_custom_training_cache/json/default-01aa9d8252a24a0d/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51...\n", + "Downloading data files: 100% 3/3 [00:00<00:00, 14463.12it/s]\n", + "INFO:datasets.download.download_manager:Downloading took 0.0 min\n", + "INFO:datasets.download.download_manager:Checksum Computation took 0.0 min\n", + "Extracting data files: 100% 3/3 [00:00<00:00, 2119.76it/s]\n", + "INFO:datasets.utils.info_utils:Unable to verify checksums.\n", + "INFO:datasets.builder:Generating train split\n", + "INFO:datasets.builder:Generating validation split\n", + "INFO:datasets.builder:Generating test split\n", + "INFO:datasets.utils.info_utils:Unable to verify splits sizes.\n", + "Dataset json downloaded and prepared to /content/roberta_custom_training_cache/json/default-01aa9d8252a24a0d/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51. 
Subsequent calls will reuse this data.\n", + "100% 3/3 [00:00<00:00, 657.14it/s]\n", + "Downloading (…)lve/main/config.json: 100% 481/481 [00:00<00:00, 88.4kB/s]\n", + "[INFO|configuration_utils.py:653] 2023-02-14 21:47:06,896 >> loading configuration file config.json from cache at roberta_custom_training_cache/models--roberta-base/snapshots/ff46155979338ff8063cdad90908b498ab91b181/config.json\n", + "[INFO|configuration_utils.py:705] 2023-02-14 21:47:06,897 >> Model config RobertaConfig {\n", + " \"_name_or_path\": \"roberta-base\",\n", + " \"architectures\": [\n", + " \"RobertaForMaskedLM\"\n", + " ],\n", + " \"attention_probs_dropout_prob\": 0.1,\n", + " \"bos_token_id\": 0,\n", + " \"classifier_dropout\": null,\n", + " \"eos_token_id\": 2,\n", + " \"hidden_act\": \"gelu\",\n", + " \"hidden_dropout_prob\": 0.1,\n", + " \"hidden_size\": 768,\n", + " \"id2label\": {\n", + " \"0\": \"LABEL_0\",\n", + " \"1\": \"LABEL_1\",\n", + " \"2\": \"LABEL_2\",\n", + " \"3\": \"LABEL_3\",\n", + " \"4\": \"LABEL_4\",\n", + " \"5\": \"LABEL_5\"\n", + " },\n", + " \"initializer_range\": 0.02,\n", + " \"intermediate_size\": 3072,\n", + " \"label2id\": {\n", + " \"LABEL_0\": 0,\n", + " \"LABEL_1\": 1,\n", + " \"LABEL_2\": 2,\n", + " \"LABEL_3\": 3,\n", + " \"LABEL_4\": 4,\n", + " \"LABEL_5\": 5\n", + " },\n", + " \"layer_norm_eps\": 1e-05,\n", + " \"max_position_embeddings\": 514,\n", + " \"model_type\": \"roberta\",\n", + " \"num_attention_heads\": 12,\n", + " \"num_hidden_layers\": 12,\n", + " \"pad_token_id\": 1,\n", + " \"position_embedding_type\": \"absolute\",\n", + " \"transformers_version\": \"4.23.1\",\n", + " \"type_vocab_size\": 1,\n", + " \"use_cache\": true,\n", + " \"vocab_size\": 50265\n", + "}\n", + "\n", + "[INFO|tokenization_auto.py:418] 2023-02-14 21:47:06,989 >> Could not locate the tokenizer configuration file, will try to use the model config instead.\n", + "[INFO|configuration_utils.py:653] 2023-02-14 21:47:07,079 >> loading configuration file config.json from 
cache at roberta_custom_training_cache/models--roberta-base/snapshots/ff46155979338ff8063cdad90908b498ab91b181/config.json\n", + "[INFO|configuration_utils.py:705] 2023-02-14 21:47:07,080 >> Model config RobertaConfig {\n", + " \"_name_or_path\": \"roberta-base\",\n", + " \"architectures\": [\n", + " \"RobertaForMaskedLM\"\n", + " ],\n", + " \"attention_probs_dropout_prob\": 0.1,\n", + " \"bos_token_id\": 0,\n", + " \"classifier_dropout\": null,\n", + " \"eos_token_id\": 2,\n", + " \"hidden_act\": \"gelu\",\n", + " \"hidden_dropout_prob\": 0.1,\n", + " \"hidden_size\": 768,\n", + " \"initializer_range\": 0.02,\n", + " \"intermediate_size\": 3072,\n", + " \"layer_norm_eps\": 1e-05,\n", + " \"max_position_embeddings\": 514,\n", + " \"model_type\": \"roberta\",\n", + " \"num_attention_heads\": 12,\n", + " \"num_hidden_layers\": 12,\n", + " \"pad_token_id\": 1,\n", + " \"position_embedding_type\": \"absolute\",\n", + " \"transformers_version\": \"4.23.1\",\n", + " \"type_vocab_size\": 1,\n", + " \"use_cache\": true,\n", + " \"vocab_size\": 50265\n", + "}\n", + "\n", + "Downloading (…)olve/main/vocab.json: 100% 899k/899k [00:00<00:00, 9.35MB/s]\n", + "Downloading (…)olve/main/merges.txt: 100% 456k/456k [00:00<00:00, 4.91MB/s]\n", + "Downloading (…)/main/tokenizer.json: 100% 1.36M/1.36M [00:00<00:00, 10.3MB/s]\n", + "[INFO|tokenization_utils_base.py:1773] 2023-02-14 21:47:08,305 >> loading file vocab.json from cache at roberta_custom_training_cache/models--roberta-base/snapshots/ff46155979338ff8063cdad90908b498ab91b181/vocab.json\n", + "[INFO|tokenization_utils_base.py:1773] 2023-02-14 21:47:08,305 >> loading file merges.txt from cache at roberta_custom_training_cache/models--roberta-base/snapshots/ff46155979338ff8063cdad90908b498ab91b181/merges.txt\n", + "[INFO|tokenization_utils_base.py:1773] 2023-02-14 21:47:08,305 >> loading file tokenizer.json from cache at 
roberta_custom_training_cache/models--roberta-base/snapshots/ff46155979338ff8063cdad90908b498ab91b181/tokenizer.json\n", + "[INFO|tokenization_utils_base.py:1773] 2023-02-14 21:47:08,305 >> loading file added_tokens.json from cache at None\n", + "[INFO|tokenization_utils_base.py:1773] 2023-02-14 21:47:08,305 >> loading file special_tokens_map.json from cache at None\n", + "[INFO|tokenization_utils_base.py:1773] 2023-02-14 21:47:08,305 >> loading file tokenizer_config.json from cache at None\n", + "[INFO|configuration_utils.py:653] 2023-02-14 21:47:08,306 >> loading configuration file config.json from cache at roberta_custom_training_cache/models--roberta-base/snapshots/ff46155979338ff8063cdad90908b498ab91b181/config.json\n", + "[INFO|configuration_utils.py:705] 2023-02-14 21:47:08,306 >> Model config RobertaConfig {\n", + " \"_name_or_path\": \"roberta-base\",\n", + " \"architectures\": [\n", + " \"RobertaForMaskedLM\"\n", + " ],\n", + " \"attention_probs_dropout_prob\": 0.1,\n", + " \"bos_token_id\": 0,\n", + " \"classifier_dropout\": null,\n", + " \"eos_token_id\": 2,\n", + " \"hidden_act\": \"gelu\",\n", + " \"hidden_dropout_prob\": 0.1,\n", + " \"hidden_size\": 768,\n", + " \"initializer_range\": 0.02,\n", + " \"intermediate_size\": 3072,\n", + " \"layer_norm_eps\": 1e-05,\n", + " \"max_position_embeddings\": 514,\n", + " \"model_type\": \"roberta\",\n", + " \"num_attention_heads\": 12,\n", + " \"num_hidden_layers\": 12,\n", + " \"pad_token_id\": 1,\n", + " \"position_embedding_type\": \"absolute\",\n", + " \"transformers_version\": \"4.23.1\",\n", + " \"type_vocab_size\": 1,\n", + " \"use_cache\": true,\n", + " \"vocab_size\": 50265\n", + "}\n", + "\n", + "INFO:__main__:Using hidden states in model: False\n", + "INFO:__main__:Using implementation from class: RobertaForSequenceClassificationCustomAlternative\n", + "Downloading (…)\"pytorch_model.bin\";: 100% 501M/501M [00:04<00:00, 106MB/s]\n", + "[INFO|modeling_utils.py:2156] 2023-02-14 21:47:13,300 >> loading 
weights file pytorch_model.bin from cache at roberta_custom_training_cache/models--roberta-base/snapshots/ff46155979338ff8063cdad90908b498ab91b181/pytorch_model.bin\n", + "[WARNING|modeling_utils.py:2596] 2023-02-14 21:47:15,772 >> Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassificationCustomAlternative: ['roberta.pooler.dense.bias', 'lm_head.dense.weight', 'roberta.pooler.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.bias', 'lm_head.decoder.weight', 'lm_head.layer_norm.weight']\n", + "- This IS expected if you are initializing RobertaForSequenceClassificationCustomAlternative from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).\n", + "- This IS NOT expected if you are initializing RobertaForSequenceClassificationCustomAlternative from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n", + "[WARNING|modeling_utils.py:2608] 2023-02-14 21:47:15,772 >> Some weights of RobertaForSequenceClassificationCustomAlternative were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense_1_input.weight', 'classifier.dense_2.weight', 'classifier.out_proj.bias', 'classifier.dense_2.bias', 'classifier.dense_1_input.bias', 'classifier.dense_1_hidden.weight', 'classifier.dense_1_hidden.bias', 'classifier.out_proj.weight']\n", + "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n", + "\n", + "\n", + "Frozen layers:\n", + "[('roberta.encoder.layer.0.attention.self.query.weight', False), ('roberta.encoder.layer.0.attention.self.query.bias', False), ('roberta.encoder.layer.0.attention.self.key.weight', False), ('roberta.encoder.layer.0.attention.self.key.bias', 
False), ('roberta.encoder.layer.0.attention.self.value.weight', False), ('roberta.encoder.layer.0.attention.self.value.bias', False), ('roberta.encoder.layer.0.attention.output.dense.weight', False), ('roberta.encoder.layer.0.attention.output.dense.bias', False), ('roberta.encoder.layer.0.attention.output.LayerNorm.weight', False), ('roberta.encoder.layer.0.attention.output.LayerNorm.bias', False), ('roberta.encoder.layer.0.intermediate.dense.weight', False), ('roberta.encoder.layer.0.intermediate.dense.bias', False), ('roberta.encoder.layer.0.output.dense.weight', False), ('roberta.encoder.layer.0.output.dense.bias', False), ('roberta.encoder.layer.0.output.LayerNorm.weight', False), ('roberta.encoder.layer.0.output.LayerNorm.bias', False), ('roberta.encoder.layer.2.attention.self.query.weight', False), ('roberta.encoder.layer.2.attention.self.query.bias', False), ('roberta.encoder.layer.2.attention.self.key.weight', False), ('roberta.encoder.layer.2.attention.self.key.bias', False), ('roberta.encoder.layer.2.attention.self.value.weight', False), ('roberta.encoder.layer.2.attention.self.value.bias', False), ('roberta.encoder.layer.2.attention.output.dense.weight', False), ('roberta.encoder.layer.2.attention.output.dense.bias', False), ('roberta.encoder.layer.2.attention.output.LayerNorm.weight', False), ('roberta.encoder.layer.2.attention.output.LayerNorm.bias', False), ('roberta.encoder.layer.2.intermediate.dense.weight', False), ('roberta.encoder.layer.2.intermediate.dense.bias', False), ('roberta.encoder.layer.2.output.dense.weight', False), ('roberta.encoder.layer.2.output.dense.bias', False), ('roberta.encoder.layer.2.output.LayerNorm.weight', False), ('roberta.encoder.layer.2.output.LayerNorm.bias', False), ('roberta.encoder.layer.4.attention.self.query.weight', False), ('roberta.encoder.layer.4.attention.self.query.bias', False), ('roberta.encoder.layer.4.attention.self.key.weight', False), ('roberta.encoder.layer.4.attention.self.key.bias', False), 
('roberta.encoder.layer.4.attention.self.value.weight', False), ('roberta.encoder.layer.4.attention.self.value.bias', False), ('roberta.encoder.layer.4.attention.output.dense.weight', False), ('roberta.encoder.layer.4.attention.output.dense.bias', False), ('roberta.encoder.layer.4.attention.output.LayerNorm.weight', False), ('roberta.encoder.layer.4.attention.output.LayerNorm.bias', False), ('roberta.encoder.layer.4.intermediate.dense.weight', False), ('roberta.encoder.layer.4.intermediate.dense.bias', False), ('roberta.encoder.layer.4.output.dense.weight', False), ('roberta.encoder.layer.4.output.dense.bias', False), ('roberta.encoder.layer.4.output.LayerNorm.weight', False), ('roberta.encoder.layer.4.output.LayerNorm.bias', False), ('roberta.encoder.layer.6.attention.self.query.weight', False), ('roberta.encoder.layer.6.attention.self.query.bias', False), ('roberta.encoder.layer.6.attention.self.key.weight', False), ('roberta.encoder.layer.6.attention.self.key.bias', False), ('roberta.encoder.layer.6.attention.self.value.weight', False), ('roberta.encoder.layer.6.attention.self.value.bias', False), ('roberta.encoder.layer.6.attention.output.dense.weight', False), ('roberta.encoder.layer.6.attention.output.dense.bias', False), ('roberta.encoder.layer.6.attention.output.LayerNorm.weight', False), ('roberta.encoder.layer.6.attention.output.LayerNorm.bias', False), ('roberta.encoder.layer.6.intermediate.dense.weight', False), ('roberta.encoder.layer.6.intermediate.dense.bias', False), ('roberta.encoder.layer.6.output.dense.weight', False), ('roberta.encoder.layer.6.output.dense.bias', False), ('roberta.encoder.layer.6.output.LayerNorm.weight', False), ('roberta.encoder.layer.6.output.LayerNorm.bias', False), ('roberta.encoder.layer.8.attention.self.query.weight', False), ('roberta.encoder.layer.8.attention.self.query.bias', False), ('roberta.encoder.layer.8.attention.self.key.weight', False), ('roberta.encoder.layer.8.attention.self.key.bias', False), 
('roberta.encoder.layer.8.attention.self.value.weight', False), ('roberta.encoder.layer.8.attention.self.value.bias', False), ('roberta.encoder.layer.8.attention.output.dense.weight', False), ('roberta.encoder.layer.8.attention.output.dense.bias', False), ('roberta.encoder.layer.8.attention.output.LayerNorm.weight', False), ('roberta.encoder.layer.8.attention.output.LayerNorm.bias', False), ('roberta.encoder.layer.8.intermediate.dense.weight', False), ('roberta.encoder.layer.8.intermediate.dense.bias', False), ('roberta.encoder.layer.8.output.dense.weight', False), ('roberta.encoder.layer.8.output.dense.bias', False), ('roberta.encoder.layer.8.output.LayerNorm.weight', False), ('roberta.encoder.layer.8.output.LayerNorm.bias', False), ('roberta.encoder.layer.10.attention.self.query.weight', False), ('roberta.encoder.layer.10.attention.self.query.bias', False), ('roberta.encoder.layer.10.attention.self.key.weight', False), ('roberta.encoder.layer.10.attention.self.key.bias', False), ('roberta.encoder.layer.10.attention.self.value.weight', False), ('roberta.encoder.layer.10.attention.self.value.bias', False), ('roberta.encoder.layer.10.attention.output.dense.weight', False), ('roberta.encoder.layer.10.attention.output.dense.bias', False), ('roberta.encoder.layer.10.attention.output.LayerNorm.weight', False), ('roberta.encoder.layer.10.attention.output.LayerNorm.bias', False), ('roberta.encoder.layer.10.intermediate.dense.weight', False), ('roberta.encoder.layer.10.intermediate.dense.bias', False), ('roberta.encoder.layer.10.output.dense.weight', False), ('roberta.encoder.layer.10.output.dense.bias', False), ('roberta.encoder.layer.10.output.LayerNorm.weight', False), ('roberta.encoder.layer.10.output.LayerNorm.bias', False)] \n", + "\n", + "\n", + "Running tokenizer on dataset: 0% 0/16 [00:00> The following columns in the training set don't have a corresponding argument in `RobertaForSequenceClassificationCustomAlternative.forward` and have been ignored: text. 
If text are not expected by `RobertaForSequenceClassificationCustomAlternative.forward`, you can safely ignore this message.\n", + "/usr/local/lib/python3.8/dist-packages/transformers/optimization.py:306: FutureWarning: This implementation of AdamW is deprecated and will be removed in a future version. Use the PyTorch implementation torch.optim.AdamW instead, or set `no_deprecation_warning=True` to disable this warning\n", + " warnings.warn(\n", + "[INFO|trainer.py:1607] 2023-02-14 21:47:19,649 >> ***** Running training *****\n", + "[INFO|trainer.py:1608] 2023-02-14 21:47:19,649 >> Num examples = 16000\n", + "[INFO|trainer.py:1609] 2023-02-14 21:47:19,649 >> Num Epochs = 1\n", + "[INFO|trainer.py:1610] 2023-02-14 21:47:19,649 >> Instantaneous batch size per device = 24\n", + "[INFO|trainer.py:1611] 2023-02-14 21:47:19,649 >> Total train batch size (w. parallel, distributed & accumulation) = 24\n", + "[INFO|trainer.py:1612] 2023-02-14 21:47:19,649 >> Gradient Accumulation steps = 1\n", + "[INFO|trainer.py:1613] 2023-02-14 21:47:19,649 >> Total optimization steps = 667\n", + "{'loss': 0.8955, 'learning_rate': 5.0074962518740634e-06, 'epoch': 0.75}\n", + " 75% 500/667 [00:58<00:19, 8.75it/s][INFO|trainer.py:2656] 2023-02-14 21:48:17,996 >> Saving model checkpoint to out/emotion/roberta_custom/checkpoint-500\n", + "[INFO|configuration_utils.py:447] 2023-02-14 21:48:17,997 >> Configuration saved in out/emotion/roberta_custom/checkpoint-500/config.json\n", + "[INFO|modeling_utils.py:1624] 2023-02-14 21:48:19,015 >> Model weights saved in out/emotion/roberta_custom/checkpoint-500/pytorch_model.bin\n", + "[INFO|tokenization_utils_base.py:2123] 2023-02-14 21:48:19,016 >> tokenizer config file saved in out/emotion/roberta_custom/checkpoint-500/tokenizer_config.json\n", + "[INFO|tokenization_utils_base.py:2130] 2023-02-14 21:48:19,016 >> Special tokens file saved in out/emotion/roberta_custom/checkpoint-500/special_tokens_map.json\n", + "100% 666/667 [01:20<00:00, 
8.66it/s][INFO|trainer.py:1852] 2023-02-14 21:48:40,745 >> \n", + "\n", + "Training completed. Do not forget to share your model on huggingface.co/models =)\n", + "\n", + "\n", + "{'train_runtime': 81.0963, 'train_samples_per_second': 197.296, 'train_steps_per_second': 8.225, 'train_loss': 0.8004468377383573, 'epoch': 1.0}\n", + "100% 667/667 [01:21<00:00, 8.23it/s]\n", + "[INFO|trainer.py:2656] 2023-02-14 21:48:40,747 >> Saving model checkpoint to out/emotion/roberta_custom\n", + "[INFO|configuration_utils.py:447] 2023-02-14 21:48:40,748 >> Configuration saved in out/emotion/roberta_custom/config.json\n", + "[INFO|modeling_utils.py:1624] 2023-02-14 21:48:41,796 >> Model weights saved in out/emotion/roberta_custom/pytorch_model.bin\n", + "[INFO|tokenization_utils_base.py:2123] 2023-02-14 21:48:41,797 >> tokenizer config file saved in out/emotion/roberta_custom/tokenizer_config.json\n", + "[INFO|tokenization_utils_base.py:2130] 2023-02-14 21:48:41,797 >> Special tokens file saved in out/emotion/roberta_custom/special_tokens_map.json\n", + "***** train metrics *****\n", + " epoch = 1.0\n", + " train_loss = 0.8004\n", + " train_runtime = 0:01:21.09\n", + " train_samples = 16000\n", + " train_samples_per_second = 197.296\n", + " train_steps_per_second = 8.225\n", + "INFO:__main__:*** Evaluate ***\n", + "[INFO|trainer.py:725] 2023-02-14 21:48:41,898 >> The following columns in the evaluation set don't have a corresponding argument in `RobertaForSequenceClassificationCustomAlternative.forward` and have been ignored: text. 
If text are not expected by `RobertaForSequenceClassificationCustomAlternative.forward`, you can safely ignore this message.\n", + "[INFO|trainer.py:2907] 2023-02-14 21:48:41,899 >> ***** Running Evaluation *****\n", + "[INFO|trainer.py:2909] 2023-02-14 21:48:41,900 >> Num examples = 2000\n", + "[INFO|trainer.py:2912] 2023-02-14 21:48:41,900 >> Batch size = 24\n", + "100% 84/84 [00:03<00:00, 23.62it/s]\n", + "***** eval metrics *****\n", + " epoch = 1.0\n", + " eval_accuracy = 0.867\n", + " eval_loss = 0.39\n", + " eval_runtime = 0:00:03.59\n", + " eval_samples = 2000\n", + " eval_samples_per_second = 555.583\n", + " eval_steps_per_second = 23.334\n", + "INFO:__main__:*** Predict ***\n", + "[INFO|trainer.py:725] 2023-02-14 21:48:45,503 >> The following columns in the test set don't have a corresponding argument in `RobertaForSequenceClassificationCustomAlternative.forward` and have been ignored: text. If text are not expected by `RobertaForSequenceClassificationCustomAlternative.forward`, you can safely ignore this message.\n", + "[INFO|trainer.py:2907] 2023-02-14 21:48:45,504 >> ***** Running Prediction *****\n", + "[INFO|trainer.py:2909] 2023-02-14 21:48:45,504 >> Num examples = 2000\n", + "[INFO|trainer.py:2912] 2023-02-14 21:48:45,504 >> Batch size = 24\n", + "100% 84/84 [00:03<00:00, 23.74it/s]\n", + "INFO:__main__:***** Predict results None *****\n", + "[INFO|modelcard.py:444] 2023-02-14 21:48:49,211 >> Dropping the following result as it does not have all the necessary fields:\n", + "{'task': {'name': 'Text Classification', 'type': 'text-classification'}, 'metrics': [{'name': 'Accuracy', 'type': 'accuracy', 'value': 0.8669999837875366}]}\n" + ] + } + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "HUdoRk5o7ICl" + }, + "source": [ + "## **GPT2**" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "exFg0yb-7ICl" + }, + "source": [ + "- full data\n", + "- model `GPT2`\n", + "- sequence length: 128\n", + "- training epoch: 1" + ] + }, 
+ { + "cell_type": "code", + "execution_count": 10, + "metadata": { + "id": "DMHK35db7ICl", + "colab": { + "base_uri": "https://localhost:8080/" + }, + "outputId": "5a3776f5-7feb-480b-a433-a80ed81f3eb7" + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "2023-02-14 21:48:52.605236: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 AVX512F AVX512_VNNI FMA\n", + "To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.\n", + "2023-02-14 21:48:52.757779: I tensorflow/core/util/port.cc:104] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.\n", + "2023-02-14 21:48:53.540701: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/lib64-nvidia\n", + "2023-02-14 21:48:53.540799: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/lib64-nvidia\n", + "2023-02-14 21:48:53.540819: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. 
If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly.\n", + "WARNING:__main__:Process rank: -1, device: cuda:0, n_gpu: 1distributed training: False, 16-bits training: False\n", + "INFO:__main__:Training/evaluation parameters TrainingArguments(\n", + "_n_gpu=1,\n", + "adafactor=False,\n", + "adam_beta1=0.9,\n", + "adam_beta2=0.999,\n", + "adam_epsilon=1e-08,\n", + "auto_find_batch_size=False,\n", + "bf16=False,\n", + "bf16_full_eval=False,\n", + "data_seed=None,\n", + "dataloader_drop_last=False,\n", + "dataloader_num_workers=0,\n", + "dataloader_pin_memory=True,\n", + "ddp_bucket_cap_mb=None,\n", + "ddp_find_unused_parameters=None,\n", + "ddp_timeout=1800,\n", + "debug=[],\n", + "deepspeed=None,\n", + "disable_tqdm=False,\n", + "do_eval=True,\n", + "do_predict=True,\n", + "do_train=True,\n", + "eval_accumulation_steps=None,\n", + "eval_delay=0,\n", + "eval_steps=250,\n", + "evaluation_strategy=steps,\n", + "fp16=False,\n", + "fp16_backend=auto,\n", + "fp16_full_eval=False,\n", + "fp16_opt_level=O1,\n", + "fsdp=[],\n", + "fsdp_min_num_params=0,\n", + "fsdp_transformer_layer_cls_to_wrap=None,\n", + "full_determinism=False,\n", + "gradient_accumulation_steps=1,\n", + "gradient_checkpointing=False,\n", + "greater_is_better=True,\n", + "group_by_length=False,\n", + "half_precision_backend=auto,\n", + "hub_model_id=None,\n", + "hub_private_repo=False,\n", + "hub_strategy=every_save,\n", + "hub_token=,\n", + "ignore_data_skip=False,\n", + "include_inputs_for_metrics=False,\n", + "jit_mode_eval=False,\n", + "label_names=None,\n", + "label_smoothing_factor=0.0,\n", + "learning_rate=2e-05,\n", + "length_column_name=length,\n", + "load_best_model_at_end=True,\n", + "local_rank=-1,\n", + "log_level=passive,\n", + "log_level_replica=passive,\n", + "log_on_each_node=True,\n", + "logging_dir=out/emotion/gpt2/runs/Feb14_21-48-55_fc0011e45a00,\n", + "logging_first_step=False,\n", + 
"logging_nan_inf_filter=True,\n", + "logging_steps=100,\n", + "logging_strategy=steps,\n", + "lr_scheduler_type=linear,\n", + "max_grad_norm=1.0,\n", + "max_steps=2500,\n", + "metric_for_best_model=accuracy,\n", + "mp_parameters=,\n", + "no_cuda=False,\n", + "num_train_epochs=1.0,\n", + "optim=adamw_hf,\n", + "output_dir=out/emotion/gpt2,\n", + "overwrite_output_dir=True,\n", + "past_index=-1,\n", + "per_device_eval_batch_size=24,\n", + "per_device_train_batch_size=24,\n", + "prediction_loss_only=False,\n", + "push_to_hub=False,\n", + "push_to_hub_model_id=None,\n", + "push_to_hub_organization=None,\n", + "push_to_hub_token=,\n", + "ray_scope=last,\n", + "remove_unused_columns=True,\n", + "report_to=['tensorboard'],\n", + "resume_from_checkpoint=None,\n", + "run_name=out/emotion/gpt2,\n", + "save_on_each_node=False,\n", + "save_steps=500,\n", + "save_strategy=steps,\n", + "save_total_limit=5,\n", + "seed=42,\n", + "sharded_ddp=[],\n", + "skip_memory_metrics=True,\n", + "tf32=None,\n", + "torchdynamo=None,\n", + "tpu_metrics_debug=False,\n", + "tpu_num_cores=None,\n", + "use_ipex=False,\n", + "use_legacy_prediction_loop=False,\n", + "use_mps_device=False,\n", + "warmup_ratio=0.0,\n", + "warmup_steps=0,\n", + "weight_decay=0.0,\n", + "xpu_backend=None,\n", + ")\n", + "INFO:__main__:load a local file for train: data/train.json\n", + "INFO:__main__:load a local file for validation: data/valid.json\n", + "INFO:__main__:load a local file for test: data/test.json\n", + "WARNING:datasets.builder:Using custom data configuration default-01aa9d8252a24a0d\n", + "INFO:datasets.info:Loading Dataset Infos from /usr/local/lib/python3.8/dist-packages/datasets/packaged_modules/json\n", + "INFO:datasets.builder:Generating dataset json (/content/gtp_cache_training/json/default-01aa9d8252a24a0d/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51)\n", + "Downloading and preparing dataset json/default to 
/content/gtp_cache_training/json/default-01aa9d8252a24a0d/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51...\n", + "Downloading data files: 100% 3/3 [00:00<00:00, 12169.16it/s]\n", + "INFO:datasets.download.download_manager:Downloading took 0.0 min\n", + "INFO:datasets.download.download_manager:Checksum Computation took 0.0 min\n", + "Extracting data files: 100% 3/3 [00:00<00:00, 2183.40it/s]\n", + "INFO:datasets.utils.info_utils:Unable to verify checksums.\n", + "INFO:datasets.builder:Generating train split\n", + "INFO:datasets.builder:Generating validation split\n", + "INFO:datasets.builder:Generating test split\n", + "INFO:datasets.utils.info_utils:Unable to verify splits sizes.\n", + "Dataset json downloaded and prepared to /content/gtp_cache_training/json/default-01aa9d8252a24a0d/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51. Subsequent calls will reuse this data.\n", + "100% 3/3 [00:00<00:00, 665.62it/s]\n", + "Downloading (…)lve/main/config.json: 100% 665/665 [00:00<00:00, 125kB/s]\n", + "[INFO|configuration_utils.py:653] 2023-02-14 21:48:57,052 >> loading configuration file config.json from cache at gtp_cache_training/models--gpt2/snapshots/e7da7f221d5bf496a48136c0cd264e630fe9fcc8/config.json\n", + "[INFO|configuration_utils.py:705] 2023-02-14 21:48:57,053 >> Model config GPT2Config {\n", + " \"_name_or_path\": \"gpt2\",\n", + " \"activation_function\": \"gelu_new\",\n", + " \"architectures\": [\n", + " \"GPT2LMHeadModel\"\n", + " ],\n", + " \"attn_pdrop\": 0.1,\n", + " \"bos_token_id\": 50256,\n", + " \"embd_pdrop\": 0.1,\n", + " \"eos_token_id\": 50256,\n", + " \"id2label\": {\n", + " \"0\": \"LABEL_0\",\n", + " \"1\": \"LABEL_1\",\n", + " \"2\": \"LABEL_2\",\n", + " \"3\": \"LABEL_3\",\n", + " \"4\": \"LABEL_4\",\n", + " \"5\": \"LABEL_5\"\n", + " },\n", + " \"initializer_range\": 0.02,\n", + " \"label2id\": {\n", + " \"LABEL_0\": 0,\n", + " \"LABEL_1\": 1,\n", + " \"LABEL_2\": 2,\n", + " \"LABEL_3\": 
3,\n", + " \"LABEL_4\": 4,\n", + " \"LABEL_5\": 5\n", + " },\n", + " \"layer_norm_epsilon\": 1e-05,\n", + " \"model_type\": \"gpt2\",\n", + " \"n_ctx\": 1024,\n", + " \"n_embd\": 768,\n", + " \"n_head\": 12,\n", + " \"n_inner\": null,\n", + " \"n_layer\": 12,\n", + " \"n_positions\": 1024,\n", + " \"reorder_and_upcast_attn\": false,\n", + " \"resid_pdrop\": 0.1,\n", + " \"scale_attn_by_inverse_layer_idx\": false,\n", + " \"scale_attn_weights\": true,\n", + " \"summary_activation\": null,\n", + " \"summary_first_dropout\": 0.1,\n", + " \"summary_proj_to_labels\": true,\n", + " \"summary_type\": \"cls_index\",\n", + " \"summary_use_proj\": true,\n", + " \"task_specific_params\": {\n", + " \"text-generation\": {\n", + " \"do_sample\": true,\n", + " \"max_length\": 50\n", + " }\n", + " },\n", + " \"transformers_version\": \"4.23.1\",\n", + " \"use_cache\": true,\n", + " \"vocab_size\": 50257\n", + "}\n", + "\n", + "[INFO|tokenization_auto.py:418] 2023-02-14 21:48:57,145 >> Could not locate the tokenizer configuration file, will try to use the model config instead.\n", + "[INFO|configuration_utils.py:653] 2023-02-14 21:48:57,236 >> loading configuration file config.json from cache at gtp_cache_training/models--gpt2/snapshots/e7da7f221d5bf496a48136c0cd264e630fe9fcc8/config.json\n", + "[INFO|configuration_utils.py:705] 2023-02-14 21:48:57,237 >> Model config GPT2Config {\n", + " \"_name_or_path\": \"gpt2\",\n", + " \"activation_function\": \"gelu_new\",\n", + " \"architectures\": [\n", + " \"GPT2LMHeadModel\"\n", + " ],\n", + " \"attn_pdrop\": 0.1,\n", + " \"bos_token_id\": 50256,\n", + " \"embd_pdrop\": 0.1,\n", + " \"eos_token_id\": 50256,\n", + " \"initializer_range\": 0.02,\n", + " \"layer_norm_epsilon\": 1e-05,\n", + " \"model_type\": \"gpt2\",\n", + " \"n_ctx\": 1024,\n", + " \"n_embd\": 768,\n", + " \"n_head\": 12,\n", + " \"n_inner\": null,\n", + " \"n_layer\": 12,\n", + " \"n_positions\": 1024,\n", + " \"reorder_and_upcast_attn\": false,\n", + " \"resid_pdrop\": 
0.1,\n", + " \"scale_attn_by_inverse_layer_idx\": false,\n", + " \"scale_attn_weights\": true,\n", + " \"summary_activation\": null,\n", + " \"summary_first_dropout\": 0.1,\n", + " \"summary_proj_to_labels\": true,\n", + " \"summary_type\": \"cls_index\",\n", + " \"summary_use_proj\": true,\n", + " \"task_specific_params\": {\n", + " \"text-generation\": {\n", + " \"do_sample\": true,\n", + " \"max_length\": 50\n", + " }\n", + " },\n", + " \"transformers_version\": \"4.23.1\",\n", + " \"use_cache\": true,\n", + " \"vocab_size\": 50257\n", + "}\n", + "\n", + "Downloading (…)olve/main/vocab.json: 100% 1.04M/1.04M [00:00<00:00, 9.20MB/s]\n", + "Downloading (…)olve/main/merges.txt: 100% 456k/456k [00:00<00:00, 6.19MB/s]\n", + "Downloading (…)/main/tokenizer.json: 100% 1.36M/1.36M [00:00<00:00, 11.7MB/s]\n", + "[INFO|tokenization_utils_base.py:1773] 2023-02-14 21:48:58,447 >> loading file vocab.json from cache at gtp_cache_training/models--gpt2/snapshots/e7da7f221d5bf496a48136c0cd264e630fe9fcc8/vocab.json\n", + "[INFO|tokenization_utils_base.py:1773] 2023-02-14 21:48:58,447 >> loading file merges.txt from cache at gtp_cache_training/models--gpt2/snapshots/e7da7f221d5bf496a48136c0cd264e630fe9fcc8/merges.txt\n", + "[INFO|tokenization_utils_base.py:1773] 2023-02-14 21:48:58,447 >> loading file tokenizer.json from cache at gtp_cache_training/models--gpt2/snapshots/e7da7f221d5bf496a48136c0cd264e630fe9fcc8/tokenizer.json\n", + "[INFO|tokenization_utils_base.py:1773] 2023-02-14 21:48:58,447 >> loading file added_tokens.json from cache at None\n", + "[INFO|tokenization_utils_base.py:1773] 2023-02-14 21:48:58,447 >> loading file special_tokens_map.json from cache at None\n", + "[INFO|tokenization_utils_base.py:1773] 2023-02-14 21:48:58,447 >> loading file tokenizer_config.json from cache at None\n", + "[INFO|configuration_utils.py:653] 2023-02-14 21:48:58,447 >> loading configuration file config.json from cache at 
gtp_cache_training/models--gpt2/snapshots/e7da7f221d5bf496a48136c0cd264e630fe9fcc8/config.json\n", + "[INFO|configuration_utils.py:705] 2023-02-14 21:48:58,448 >> Model config GPT2Config {\n", + " \"_name_or_path\": \"gpt2\",\n", + " \"activation_function\": \"gelu_new\",\n", + " \"architectures\": [\n", + " \"GPT2LMHeadModel\"\n", + " ],\n", + " \"attn_pdrop\": 0.1,\n", + " \"bos_token_id\": 50256,\n", + " \"embd_pdrop\": 0.1,\n", + " \"eos_token_id\": 50256,\n", + " \"initializer_range\": 0.02,\n", + " \"layer_norm_epsilon\": 1e-05,\n", + " \"model_type\": \"gpt2\",\n", + " \"n_ctx\": 1024,\n", + " \"n_embd\": 768,\n", + " \"n_head\": 12,\n", + " \"n_inner\": null,\n", + " \"n_layer\": 12,\n", + " \"n_positions\": 1024,\n", + " \"reorder_and_upcast_attn\": false,\n", + " \"resid_pdrop\": 0.1,\n", + " \"scale_attn_by_inverse_layer_idx\": false,\n", + " \"scale_attn_weights\": true,\n", + " \"summary_activation\": null,\n", + " \"summary_first_dropout\": 0.1,\n", + " \"summary_proj_to_labels\": true,\n", + " \"summary_type\": \"cls_index\",\n", + " \"summary_use_proj\": true,\n", + " \"task_specific_params\": {\n", + " \"text-generation\": {\n", + " \"do_sample\": true,\n", + " \"max_length\": 50\n", + " }\n", + " },\n", + " \"transformers_version\": \"4.23.1\",\n", + " \"use_cache\": true,\n", + " \"vocab_size\": 50257\n", + "}\n", + "\n", + "INFO:__main__:Using implementation from class: AutoModelForSequenceClassification\n", + "Downloading (…)\"pytorch_model.bin\";: 100% 548M/548M [00:05<00:00, 108MB/s]\n", + "[INFO|modeling_utils.py:2156] 2023-02-14 21:49:03,784 >> loading weights file pytorch_model.bin from cache at gtp_cache_training/models--gpt2/snapshots/e7da7f221d5bf496a48136c0cd264e630fe9fcc8/pytorch_model.bin\n", + "[INFO|modeling_utils.py:2606] 2023-02-14 21:49:05,169 >> All model checkpoint weights were used when initializing GPT2ForSequenceClassification.\n", + "\n", + "[WARNING|modeling_utils.py:2608] 2023-02-14 21:49:05,169 >> Some weights of 
GPT2ForSequenceClassification were not initialized from the model checkpoint at gpt2 and are newly initialized: ['score.weight']\n", + "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n", + "[ERROR|tokenization_utils_base.py:1019] 2023-02-14 21:49:05,177 >> Using pad_token, but it is not set yet.\n", + "INFO:__main__:Set PAD token to EOS: <|endoftext|>\n", + "Running tokenizer on dataset: 0% 0/16 [00:00> max_steps is given, it will override any value given in num_train_epochs\n", + "[INFO|trainer.py:725] 2023-02-14 21:49:08,712 >> The following columns in the training set don't have a corresponding argument in `GPT2ForSequenceClassification.forward` and have been ignored: text. If text are not expected by `GPT2ForSequenceClassification.forward`, you can safely ignore this message.\n", + "/usr/local/lib/python3.8/dist-packages/transformers/optimization.py:306: FutureWarning: This implementation of AdamW is deprecated and will be removed in a future version. Use the PyTorch implementation torch.optim.AdamW instead, or set `no_deprecation_warning=True` to disable this warning\n", + " warnings.warn(\n", + "[INFO|trainer.py:1607] 2023-02-14 21:49:08,718 >> ***** Running training *****\n", + "[INFO|trainer.py:1608] 2023-02-14 21:49:08,718 >> Num examples = 16000\n", + "[INFO|trainer.py:1609] 2023-02-14 21:49:08,718 >> Num Epochs = 4\n", + "[INFO|trainer.py:1610] 2023-02-14 21:49:08,719 >> Instantaneous batch size per device = 24\n", + "[INFO|trainer.py:1611] 2023-02-14 21:49:08,719 >> Total train batch size (w. 
parallel, distributed & accumulation) = 24\n", + "[INFO|trainer.py:1612] 2023-02-14 21:49:08,719 >> Gradient Accumulation steps = 1\n", + "[INFO|trainer.py:1613] 2023-02-14 21:49:08,719 >> Total optimization steps = 2500\n", + "{'loss': 2.3442, 'learning_rate': 1.9200000000000003e-05, 'epoch': 0.15}\n", + "{'loss': 1.3126, 'learning_rate': 1.8400000000000003e-05, 'epoch': 0.3}\n", + " 10% 250/2500 [00:37<05:31, 6.79it/s][INFO|trainer.py:725] 2023-02-14 21:49:46,426 >> The following columns in the evaluation set don't have a corresponding argument in `GPT2ForSequenceClassification.forward` and have been ignored: text. If text are not expected by `GPT2ForSequenceClassification.forward`, you can safely ignore this message.\n", + "[INFO|trainer.py:2907] 2023-02-14 21:49:46,428 >> ***** Running Evaluation *****\n", + "[INFO|trainer.py:2909] 2023-02-14 21:49:46,428 >> Num examples = 2000\n", + "[INFO|trainer.py:2912] 2023-02-14 21:49:46,428 >> Batch size = 24\n", + "\n", + " 0% 0/84 [00:00> The following columns in the evaluation set don't have a corresponding argument in `GPT2ForSequenceClassification.forward` and have been ignored: text. 
If text are not expected by `GPT2ForSequenceClassification.forward`, you can safely ignore this message.\n", + "[INFO|trainer.py:2907] 2023-02-14 21:50:27,314 >> ***** Running Evaluation *****\n", + "[INFO|trainer.py:2909] 2023-02-14 21:50:27,314 >> Num examples = 2000\n", + "[INFO|trainer.py:2912] 2023-02-14 21:50:27,314 >> Batch size = 24\n", + "\n", + " 0% 0/84 [00:00> Saving model checkpoint to out/emotion/gpt2/checkpoint-500\n", + "[INFO|configuration_utils.py:447] 2023-02-14 21:50:31,308 >> Configuration saved in out/emotion/gpt2/checkpoint-500/config.json\n", + "[INFO|modeling_utils.py:1624] 2023-02-14 21:50:32,356 >> Model weights saved in out/emotion/gpt2/checkpoint-500/pytorch_model.bin\n", + "[INFO|tokenization_utils_base.py:2123] 2023-02-14 21:50:32,357 >> tokenizer config file saved in out/emotion/gpt2/checkpoint-500/tokenizer_config.json\n", + "[INFO|tokenization_utils_base.py:2130] 2023-02-14 21:50:32,357 >> Special tokens file saved in out/emotion/gpt2/checkpoint-500/special_tokens_map.json\n", + "{'loss': 0.3554, 'learning_rate': 1.5200000000000002e-05, 'epoch': 0.9}\n", + "{'loss': 0.2871, 'learning_rate': 1.4400000000000001e-05, 'epoch': 1.05}\n", + " 30% 750/2500 [02:02<04:19, 6.74it/s][INFO|trainer.py:725] 2023-02-14 21:51:11,104 >> The following columns in the evaluation set don't have a corresponding argument in `GPT2ForSequenceClassification.forward` and have been ignored: text. If text are not expected by `GPT2ForSequenceClassification.forward`, you can safely ignore this message.\n", + "[INFO|trainer.py:2907] 2023-02-14 21:51:11,106 >> ***** Running Evaluation *****\n", + "[INFO|trainer.py:2909] 2023-02-14 21:51:11,106 >> Num examples = 2000\n", + "[INFO|trainer.py:2912] 2023-02-14 21:51:11,106 >> Batch size = 24\n", + "\n", + " 0% 0/84 [00:00> The following columns in the evaluation set don't have a corresponding argument in `GPT2ForSequenceClassification.forward` and have been ignored: text. 
If text are not expected by `GPT2ForSequenceClassification.forward`, you can safely ignore this message.\n", + "[INFO|trainer.py:2907] 2023-02-14 21:51:51,749 >> ***** Running Evaluation *****\n", + "[INFO|trainer.py:2909] 2023-02-14 21:51:51,750 >> Num examples = 2000\n", + "[INFO|trainer.py:2912] 2023-02-14 21:51:51,750 >> Batch size = 24\n", + "\n", + " 0% 0/84 [00:00> Saving model checkpoint to out/emotion/gpt2/checkpoint-1000\n", + "[INFO|configuration_utils.py:447] 2023-02-14 21:51:55,717 >> Configuration saved in out/emotion/gpt2/checkpoint-1000/config.json\n", + "[INFO|modeling_utils.py:1624] 2023-02-14 21:51:56,708 >> Model weights saved in out/emotion/gpt2/checkpoint-1000/pytorch_model.bin\n", + "[INFO|tokenization_utils_base.py:2123] 2023-02-14 21:51:56,709 >> tokenizer config file saved in out/emotion/gpt2/checkpoint-1000/tokenizer_config.json\n", + "[INFO|tokenization_utils_base.py:2130] 2023-02-14 21:51:56,709 >> Special tokens file saved in out/emotion/gpt2/checkpoint-1000/special_tokens_map.json\n", + "{'loss': 0.1906, 'learning_rate': 1.1200000000000001e-05, 'epoch': 1.65}\n", + "{'loss': 0.1793, 'learning_rate': 1.04e-05, 'epoch': 1.8}\n", + " 50% 1250/2500 [03:26<03:04, 6.76it/s][INFO|trainer.py:725] 2023-02-14 21:52:35,220 >> The following columns in the evaluation set don't have a corresponding argument in `GPT2ForSequenceClassification.forward` and have been ignored: text. If text are not expected by `GPT2ForSequenceClassification.forward`, you can safely ignore this message.\n", + "[INFO|trainer.py:2907] 2023-02-14 21:52:35,222 >> ***** Running Evaluation *****\n", + "[INFO|trainer.py:2909] 2023-02-14 21:52:35,222 >> Num examples = 2000\n", + "[INFO|trainer.py:2912] 2023-02-14 21:52:35,222 >> Batch size = 24\n", + "\n", + " 0% 0/84 [00:00> The following columns in the evaluation set don't have a corresponding argument in `GPT2ForSequenceClassification.forward` and have been ignored: text. 
If text are not expected by `GPT2ForSequenceClassification.forward`, you can safely ignore this message.\n", + "[INFO|trainer.py:2907] 2023-02-14 21:53:15,833 >> ***** Running Evaluation *****\n", + "[INFO|trainer.py:2909] 2023-02-14 21:53:15,833 >> Num examples = 2000\n", + "[INFO|trainer.py:2912] 2023-02-14 21:53:15,833 >> Batch size = 24\n", + "\n", + " 0% 0/84 [00:00> Saving model checkpoint to out/emotion/gpt2/checkpoint-1500\n", + "[INFO|configuration_utils.py:447] 2023-02-14 21:53:19,812 >> Configuration saved in out/emotion/gpt2/checkpoint-1500/config.json\n", + "[INFO|modeling_utils.py:1624] 2023-02-14 21:53:21,455 >> Model weights saved in out/emotion/gpt2/checkpoint-1500/pytorch_model.bin\n", + "[INFO|tokenization_utils_base.py:2123] 2023-02-14 21:53:21,456 >> tokenizer config file saved in out/emotion/gpt2/checkpoint-1500/tokenizer_config.json\n", + "[INFO|tokenization_utils_base.py:2130] 2023-02-14 21:53:21,456 >> Special tokens file saved in out/emotion/gpt2/checkpoint-1500/special_tokens_map.json\n", + "{'loss': 0.157, 'learning_rate': 7.2000000000000005e-06, 'epoch': 2.4}\n", + "{'loss': 0.141, 'learning_rate': 6.4000000000000006e-06, 'epoch': 2.55}\n", + " 70% 1750/2500 [04:51<01:50, 6.80it/s][INFO|trainer.py:725] 2023-02-14 21:54:00,007 >> The following columns in the evaluation set don't have a corresponding argument in `GPT2ForSequenceClassification.forward` and have been ignored: text. If text are not expected by `GPT2ForSequenceClassification.forward`, you can safely ignore this message.\n", + "[INFO|trainer.py:2907] 2023-02-14 21:54:00,009 >> ***** Running Evaluation *****\n", + "[INFO|trainer.py:2909] 2023-02-14 21:54:00,009 >> Num examples = 2000\n", + "[INFO|trainer.py:2912] 2023-02-14 21:54:00,009 >> Batch size = 24\n", + "\n", + " 0% 0/84 [00:00> The following columns in the evaluation set don't have a corresponding argument in `GPT2ForSequenceClassification.forward` and have been ignored: text. 
If text are not expected by `GPT2ForSequenceClassification.forward`, you can safely ignore this message.\n", + "[INFO|trainer.py:2907] 2023-02-14 21:54:40,635 >> ***** Running Evaluation *****\n", + "[INFO|trainer.py:2909] 2023-02-14 21:54:40,635 >> Num examples = 2000\n", + "[INFO|trainer.py:2912] 2023-02-14 21:54:40,635 >> Batch size = 24\n", + "\n", + " 0% 0/84 [00:00> Saving model checkpoint to out/emotion/gpt2/checkpoint-2000\n", + "[INFO|configuration_utils.py:447] 2023-02-14 21:54:44,615 >> Configuration saved in out/emotion/gpt2/checkpoint-2000/config.json\n", + "[INFO|modeling_utils.py:1624] 2023-02-14 21:54:46,838 >> Model weights saved in out/emotion/gpt2/checkpoint-2000/pytorch_model.bin\n", + "[INFO|tokenization_utils_base.py:2123] 2023-02-14 21:54:46,839 >> tokenizer config file saved in out/emotion/gpt2/checkpoint-2000/tokenizer_config.json\n", + "[INFO|tokenization_utils_base.py:2130] 2023-02-14 21:54:46,839 >> Special tokens file saved in out/emotion/gpt2/checkpoint-2000/special_tokens_map.json\n", + "{'loss': 0.1256, 'learning_rate': 3.2000000000000003e-06, 'epoch': 3.15}\n", + "{'loss': 0.1246, 'learning_rate': 2.4000000000000003e-06, 'epoch': 3.3}\n", + " 90% 2250/2500 [06:16<00:36, 6.76it/s][INFO|trainer.py:725] 2023-02-14 21:55:25,309 >> The following columns in the evaluation set don't have a corresponding argument in `GPT2ForSequenceClassification.forward` and have been ignored: text. If text are not expected by `GPT2ForSequenceClassification.forward`, you can safely ignore this message.\n", + "[INFO|trainer.py:2907] 2023-02-14 21:55:25,311 >> ***** Running Evaluation *****\n", + "[INFO|trainer.py:2909] 2023-02-14 21:55:25,311 >> Num examples = 2000\n", + "[INFO|trainer.py:2912] 2023-02-14 21:55:25,311 >> Batch size = 24\n", + "\n", + " 0% 0/84 [00:00> The following columns in the evaluation set don't have a corresponding argument in `GPT2ForSequenceClassification.forward` and have been ignored: text. 
If text are not expected by `GPT2ForSequenceClassification.forward`, you can safely ignore this message.\n", + "[INFO|trainer.py:2907] 2023-02-14 21:56:05,971 >> ***** Running Evaluation *****\n", + "[INFO|trainer.py:2909] 2023-02-14 21:56:05,971 >> Num examples = 2000\n", + "[INFO|trainer.py:2912] 2023-02-14 21:56:05,971 >> Batch size = 24\n", + "\n", + " 0% 0/84 [00:00> Saving model checkpoint to out/emotion/gpt2/checkpoint-2500\n", + "[INFO|configuration_utils.py:447] 2023-02-14 21:56:09,957 >> Configuration saved in out/emotion/gpt2/checkpoint-2500/config.json\n", + "[INFO|modeling_utils.py:1624] 2023-02-14 21:56:10,953 >> Model weights saved in out/emotion/gpt2/checkpoint-2500/pytorch_model.bin\n", + "[INFO|tokenization_utils_base.py:2123] 2023-02-14 21:56:10,954 >> tokenizer config file saved in out/emotion/gpt2/checkpoint-2500/tokenizer_config.json\n", + "[INFO|tokenization_utils_base.py:2130] 2023-02-14 21:56:10,954 >> Special tokens file saved in out/emotion/gpt2/checkpoint-2500/special_tokens_map.json\n", + "[INFO|trainer.py:1852] 2023-02-14 21:56:12,777 >> \n", + "\n", + "Training completed. 
Do not forget to share your model on huggingface.co/models =)\n", + "\n", + "\n", + "[INFO|trainer.py:1946] 2023-02-14 21:56:12,778 >> Loading best model from out/emotion/gpt2/checkpoint-1500 (score: 0.9330000281333923).\n", + "{'train_runtime': 424.4983, 'train_samples_per_second': 141.343, 'train_steps_per_second': 5.889, 'train_loss': 0.351297896194458, 'epoch': 3.75}\n", + "100% 2500/2500 [07:04<00:00, 5.89it/s]\n", + "[INFO|trainer.py:2656] 2023-02-14 21:56:13,218 >> Saving model checkpoint to out/emotion/gpt2\n", + "[INFO|configuration_utils.py:447] 2023-02-14 21:56:13,220 >> Configuration saved in out/emotion/gpt2/config.json\n", + "[INFO|modeling_utils.py:1624] 2023-02-14 21:56:14,063 >> Model weights saved in out/emotion/gpt2/pytorch_model.bin\n", + "[INFO|tokenization_utils_base.py:2123] 2023-02-14 21:56:14,064 >> tokenizer config file saved in out/emotion/gpt2/tokenizer_config.json\n", + "[INFO|tokenization_utils_base.py:2130] 2023-02-14 21:56:14,064 >> Special tokens file saved in out/emotion/gpt2/special_tokens_map.json\n", + "***** train metrics *****\n", + " epoch = 3.75\n", + " train_loss = 0.3513\n", + " train_runtime = 0:07:04.49\n", + " train_samples = 16000\n", + " train_samples_per_second = 141.343\n", + " train_steps_per_second = 5.889\n", + "INFO:__main__:*** Evaluate ***\n", + "[INFO|trainer.py:725] 2023-02-14 21:56:14,169 >> The following columns in the evaluation set don't have a corresponding argument in `GPT2ForSequenceClassification.forward` and have been ignored: text. 
If text are not expected by `GPT2ForSequenceClassification.forward`, you can safely ignore this message.\n", + "[INFO|trainer.py:2907] 2023-02-14 21:56:14,170 >> ***** Running Evaluation *****\n", + "[INFO|trainer.py:2909] 2023-02-14 21:56:14,170 >> Num examples = 2000\n", + "[INFO|trainer.py:2912] 2023-02-14 21:56:14,170 >> Batch size = 24\n", + "100% 84/84 [00:03<00:00, 21.20it/s]\n", + "***** eval metrics *****\n", + " epoch = 3.75\n", + " eval_accuracy = 0.933\n", + " eval_loss = 0.1609\n", + " eval_runtime = 0:00:04.02\n", + " eval_samples = 2000\n", + " eval_samples_per_second = 497.496\n", + " eval_steps_per_second = 20.895\n", + "INFO:__main__:*** Predict ***\n", + "[INFO|trainer.py:725] 2023-02-14 21:56:18,194 >> The following columns in the test set don't have a corresponding argument in `GPT2ForSequenceClassification.forward` and have been ignored: text. If text are not expected by `GPT2ForSequenceClassification.forward`, you can safely ignore this message.\n", + "[INFO|trainer.py:2907] 2023-02-14 21:56:18,195 >> ***** Running Prediction *****\n", + "[INFO|trainer.py:2909] 2023-02-14 21:56:18,195 >> Num examples = 2000\n", + "[INFO|trainer.py:2912] 2023-02-14 21:56:18,195 >> Batch size = 24\n", + "100% 84/84 [00:03<00:00, 21.40it/s]\n", + "INFO:__main__:***** Predict results None *****\n", + "[INFO|modelcard.py:444] 2023-02-14 21:56:22,304 >> Dropping the following result as it does not have all the necessary fields:\n", + "{'task': {'name': 'Text Classification', 'type': 'text-classification'}, 'metrics': [{'name': 'Accuracy', 'type': 'accuracy', 'value': 0.9330000281333923}]}\n" + ] + } + ], + "source": [ + "!python run_glue.py \\\n", + " --cache_dir gtp_cache_training \\\n", + " --model_name_or_path gpt2 \\\n", + " --train_file data/train.json \\\n", + " --validation_file data/valid.json \\\n", + " --test_file data/test.json \\\n", + " --per_device_train_batch_size 24 \\\n", + " --per_device_eval_batch_size 24 \\\n", + " --do_train \\\n", + " 
--do_eval \\\n", + " --do_predict \\\n", + " --max_seq_length 128 \\\n", + " --learning_rate 2e-5 \\\n", + " --num_train_epochs 1 \\\n", + " --output_dir out/emotion/gpt2 \\\n", + " --overwrite_output_dir \\\n", + " --eval_steps 250 \\\n", + " --evaluation_strategy steps \\\n", + " --metric_for_best_model accuracy \\\n", + " --logging_steps 100 \\\n", + " --save_total_limit 5 \\\n", + " --max_steps 2500 \\\n", + " --load_best_model_at_end True " + ] + }, + { + "cell_type": "markdown", + "source": [ + "- full dataset\n", + "- custom head" + ], + "metadata": { + "id": "zJeUGay5n1JW" + } + }, + { + "cell_type": "code", + "source": [ + "!python run_glue.py \\\n", + " --cache_dir gtp_custom_cache_training \\\n", + " --model_name_or_path gpt2 \\\n", + " --custom_model gpt2_custom \\\n", + " --train_file data/train.json \\\n", + " --validation_file data/valid.json \\\n", + " --test_file data/test.json \\\n", + " --per_device_train_batch_size 24 \\\n", + " --per_device_eval_batch_size 24 \\\n", + " --do_train \\\n", + " --do_eval \\\n", + " --do_predict \\\n", + " --max_seq_length 128 \\\n", + " --learning_rate 2e-5 \\\n", + " --num_train_epochs 1 \\\n", + " --output_dir out/emotion/gpt2_custom \\\n", + " --overwrite_output_dir \\\n", + " --eval_steps 250 \\\n", + " --evaluation_strategy steps \\\n", + " --metric_for_best_model accuracy \\\n", + " --logging_steps 100 \\\n", + " --save_total_limit 5 \\\n", + " --max_steps 2500 \\\n", + " --load_best_model_at_end True " + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "LXRMDiD-n1nG", + "outputId": "1383e6a3-b485-49a0-d111-05bea71acd23" + }, + "execution_count": 11, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "2023-02-14 21:56:25.884599: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 
AVX512F AVX512_VNNI FMA\n", + "To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.\n", + "2023-02-14 21:56:26.040127: I tensorflow/core/util/port.cc:104] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.\n", + "2023-02-14 21:56:26.823479: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/lib64-nvidia\n", + "2023-02-14 21:56:26.823595: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/lib64-nvidia\n", + "2023-02-14 21:56:26.823615: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. 
If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly.\n", + "WARNING:__main__:Process rank: -1, device: cuda:0, n_gpu: 1distributed training: False, 16-bits training: False\n", + "INFO:__main__:Training/evaluation parameters TrainingArguments(\n", + "_n_gpu=1,\n", + "adafactor=False,\n", + "adam_beta1=0.9,\n", + "adam_beta2=0.999,\n", + "adam_epsilon=1e-08,\n", + "auto_find_batch_size=False,\n", + "bf16=False,\n", + "bf16_full_eval=False,\n", + "data_seed=None,\n", + "dataloader_drop_last=False,\n", + "dataloader_num_workers=0,\n", + "dataloader_pin_memory=True,\n", + "ddp_bucket_cap_mb=None,\n", + "ddp_find_unused_parameters=None,\n", + "ddp_timeout=1800,\n", + "debug=[],\n", + "deepspeed=None,\n", + "disable_tqdm=False,\n", + "do_eval=True,\n", + "do_predict=True,\n", + "do_train=True,\n", + "eval_accumulation_steps=None,\n", + "eval_delay=0,\n", + "eval_steps=250,\n", + "evaluation_strategy=steps,\n", + "fp16=False,\n", + "fp16_backend=auto,\n", + "fp16_full_eval=False,\n", + "fp16_opt_level=O1,\n", + "fsdp=[],\n", + "fsdp_min_num_params=0,\n", + "fsdp_transformer_layer_cls_to_wrap=None,\n", + "full_determinism=False,\n", + "gradient_accumulation_steps=1,\n", + "gradient_checkpointing=False,\n", + "greater_is_better=True,\n", + "group_by_length=False,\n", + "half_precision_backend=auto,\n", + "hub_model_id=None,\n", + "hub_private_repo=False,\n", + "hub_strategy=every_save,\n", + "hub_token=,\n", + "ignore_data_skip=False,\n", + "include_inputs_for_metrics=False,\n", + "jit_mode_eval=False,\n", + "label_names=None,\n", + "label_smoothing_factor=0.0,\n", + "learning_rate=2e-05,\n", + "length_column_name=length,\n", + "load_best_model_at_end=True,\n", + "local_rank=-1,\n", + "log_level=passive,\n", + "log_level_replica=passive,\n", + "log_on_each_node=True,\n", + "logging_dir=out/emotion/gpt2_custom/runs/Feb14_21-56-28_fc0011e45a00,\n", + "logging_first_step=False,\n", + 
"logging_nan_inf_filter=True,\n", + "logging_steps=100,\n", + "logging_strategy=steps,\n", + "lr_scheduler_type=linear,\n", + "max_grad_norm=1.0,\n", + "max_steps=2500,\n", + "metric_for_best_model=accuracy,\n", + "mp_parameters=,\n", + "no_cuda=False,\n", + "num_train_epochs=1.0,\n", + "optim=adamw_hf,\n", + "output_dir=out/emotion/gpt2_custom,\n", + "overwrite_output_dir=True,\n", + "past_index=-1,\n", + "per_device_eval_batch_size=24,\n", + "per_device_train_batch_size=24,\n", + "prediction_loss_only=False,\n", + "push_to_hub=False,\n", + "push_to_hub_model_id=None,\n", + "push_to_hub_organization=None,\n", + "push_to_hub_token=,\n", + "ray_scope=last,\n", + "remove_unused_columns=True,\n", + "report_to=['tensorboard'],\n", + "resume_from_checkpoint=None,\n", + "run_name=out/emotion/gpt2_custom,\n", + "save_on_each_node=False,\n", + "save_steps=500,\n", + "save_strategy=steps,\n", + "save_total_limit=5,\n", + "seed=42,\n", + "sharded_ddp=[],\n", + "skip_memory_metrics=True,\n", + "tf32=None,\n", + "torchdynamo=None,\n", + "tpu_metrics_debug=False,\n", + "tpu_num_cores=None,\n", + "use_ipex=False,\n", + "use_legacy_prediction_loop=False,\n", + "use_mps_device=False,\n", + "warmup_ratio=0.0,\n", + "warmup_steps=0,\n", + "weight_decay=0.0,\n", + "xpu_backend=None,\n", + ")\n", + "INFO:__main__:load a local file for train: data/train.json\n", + "INFO:__main__:load a local file for validation: data/valid.json\n", + "INFO:__main__:load a local file for test: data/test.json\n", + "WARNING:datasets.builder:Using custom data configuration default-01aa9d8252a24a0d\n", + "INFO:datasets.info:Loading Dataset Infos from /usr/local/lib/python3.8/dist-packages/datasets/packaged_modules/json\n", + "INFO:datasets.builder:Generating dataset json (/content/gtp_custom_cache_training/json/default-01aa9d8252a24a0d/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51)\n", + "Downloading and preparing dataset json/default to 
/content/gtp_custom_cache_training/json/default-01aa9d8252a24a0d/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51...\n", + "Downloading data files: 100% 3/3 [00:00<00:00, 14138.10it/s]\n", + "INFO:datasets.download.download_manager:Downloading took 0.0 min\n", + "INFO:datasets.download.download_manager:Checksum Computation took 0.0 min\n", + "Extracting data files: 100% 3/3 [00:00<00:00, 2175.09it/s]\n", + "INFO:datasets.utils.info_utils:Unable to verify checksums.\n", + "INFO:datasets.builder:Generating train split\n", + "INFO:datasets.builder:Generating validation split\n", + "INFO:datasets.builder:Generating test split\n", + "INFO:datasets.utils.info_utils:Unable to verify splits sizes.\n", + "Dataset json downloaded and prepared to /content/gtp_custom_cache_training/json/default-01aa9d8252a24a0d/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51. Subsequent calls will reuse this data.\n", + "100% 3/3 [00:00<00:00, 672.49it/s]\n", + "Downloading (…)lve/main/config.json: 100% 665/665 [00:00<00:00, 123kB/s]\n", + "[INFO|configuration_utils.py:653] 2023-02-14 21:56:30,068 >> loading configuration file config.json from cache at gtp_custom_cache_training/models--gpt2/snapshots/e7da7f221d5bf496a48136c0cd264e630fe9fcc8/config.json\n", + "[INFO|configuration_utils.py:705] 2023-02-14 21:56:30,068 >> Model config GPT2Config {\n", + " \"_name_or_path\": \"gpt2\",\n", + " \"activation_function\": \"gelu_new\",\n", + " \"architectures\": [\n", + " \"GPT2LMHeadModel\"\n", + " ],\n", + " \"attn_pdrop\": 0.1,\n", + " \"bos_token_id\": 50256,\n", + " \"embd_pdrop\": 0.1,\n", + " \"eos_token_id\": 50256,\n", + " \"id2label\": {\n", + " \"0\": \"LABEL_0\",\n", + " \"1\": \"LABEL_1\",\n", + " \"2\": \"LABEL_2\",\n", + " \"3\": \"LABEL_3\",\n", + " \"4\": \"LABEL_4\",\n", + " \"5\": \"LABEL_5\"\n", + " },\n", + " \"initializer_range\": 0.02,\n", + " \"label2id\": {\n", + " \"LABEL_0\": 0,\n", + " \"LABEL_1\": 1,\n", + " \"LABEL_2\": 2,\n", + 
" \"LABEL_3\": 3,\n", + " \"LABEL_4\": 4,\n", + " \"LABEL_5\": 5\n", + " },\n", + " \"layer_norm_epsilon\": 1e-05,\n", + " \"model_type\": \"gpt2\",\n", + " \"n_ctx\": 1024,\n", + " \"n_embd\": 768,\n", + " \"n_head\": 12,\n", + " \"n_inner\": null,\n", + " \"n_layer\": 12,\n", + " \"n_positions\": 1024,\n", + " \"reorder_and_upcast_attn\": false,\n", + " \"resid_pdrop\": 0.1,\n", + " \"scale_attn_by_inverse_layer_idx\": false,\n", + " \"scale_attn_weights\": true,\n", + " \"summary_activation\": null,\n", + " \"summary_first_dropout\": 0.1,\n", + " \"summary_proj_to_labels\": true,\n", + " \"summary_type\": \"cls_index\",\n", + " \"summary_use_proj\": true,\n", + " \"task_specific_params\": {\n", + " \"text-generation\": {\n", + " \"do_sample\": true,\n", + " \"max_length\": 50\n", + " }\n", + " },\n", + " \"transformers_version\": \"4.23.1\",\n", + " \"use_cache\": true,\n", + " \"vocab_size\": 50257\n", + "}\n", + "\n", + "[INFO|tokenization_auto.py:418] 2023-02-14 21:56:30,162 >> Could not locate the tokenizer configuration file, will try to use the model config instead.\n", + "[INFO|configuration_utils.py:653] 2023-02-14 21:56:30,251 >> loading configuration file config.json from cache at gtp_custom_cache_training/models--gpt2/snapshots/e7da7f221d5bf496a48136c0cd264e630fe9fcc8/config.json\n", + "[INFO|configuration_utils.py:705] 2023-02-14 21:56:30,252 >> Model config GPT2Config {\n", + " \"_name_or_path\": \"gpt2\",\n", + " \"activation_function\": \"gelu_new\",\n", + " \"architectures\": [\n", + " \"GPT2LMHeadModel\"\n", + " ],\n", + " \"attn_pdrop\": 0.1,\n", + " \"bos_token_id\": 50256,\n", + " \"embd_pdrop\": 0.1,\n", + " \"eos_token_id\": 50256,\n", + " \"initializer_range\": 0.02,\n", + " \"layer_norm_epsilon\": 1e-05,\n", + " \"model_type\": \"gpt2\",\n", + " \"n_ctx\": 1024,\n", + " \"n_embd\": 768,\n", + " \"n_head\": 12,\n", + " \"n_inner\": null,\n", + " \"n_layer\": 12,\n", + " \"n_positions\": 1024,\n", + " \"reorder_and_upcast_attn\": false,\n", 
+ " \"resid_pdrop\": 0.1,\n", + " \"scale_attn_by_inverse_layer_idx\": false,\n", + " \"scale_attn_weights\": true,\n", + " \"summary_activation\": null,\n", + " \"summary_first_dropout\": 0.1,\n", + " \"summary_proj_to_labels\": true,\n", + " \"summary_type\": \"cls_index\",\n", + " \"summary_use_proj\": true,\n", + " \"task_specific_params\": {\n", + " \"text-generation\": {\n", + " \"do_sample\": true,\n", + " \"max_length\": 50\n", + " }\n", + " },\n", + " \"transformers_version\": \"4.23.1\",\n", + " \"use_cache\": true,\n", + " \"vocab_size\": 50257\n", + "}\n", + "\n", + "Downloading (…)olve/main/vocab.json: 100% 1.04M/1.04M [00:00<00:00, 9.18MB/s]\n", + "Downloading (…)olve/main/merges.txt: 100% 456k/456k [00:00<00:00, 4.90MB/s]\n", + "Downloading (…)/main/tokenizer.json: 100% 1.36M/1.36M [00:00<00:00, 14.3MB/s]\n", + "[INFO|tokenization_utils_base.py:1773] 2023-02-14 21:56:31,525 >> loading file vocab.json from cache at gtp_custom_cache_training/models--gpt2/snapshots/e7da7f221d5bf496a48136c0cd264e630fe9fcc8/vocab.json\n", + "[INFO|tokenization_utils_base.py:1773] 2023-02-14 21:56:31,525 >> loading file merges.txt from cache at gtp_custom_cache_training/models--gpt2/snapshots/e7da7f221d5bf496a48136c0cd264e630fe9fcc8/merges.txt\n", + "[INFO|tokenization_utils_base.py:1773] 2023-02-14 21:56:31,525 >> loading file tokenizer.json from cache at gtp_custom_cache_training/models--gpt2/snapshots/e7da7f221d5bf496a48136c0cd264e630fe9fcc8/tokenizer.json\n", + "[INFO|tokenization_utils_base.py:1773] 2023-02-14 21:56:31,525 >> loading file added_tokens.json from cache at None\n", + "[INFO|tokenization_utils_base.py:1773] 2023-02-14 21:56:31,525 >> loading file special_tokens_map.json from cache at None\n", + "[INFO|tokenization_utils_base.py:1773] 2023-02-14 21:56:31,525 >> loading file tokenizer_config.json from cache at None\n", + "[INFO|configuration_utils.py:653] 2023-02-14 21:56:31,525 >> loading configuration file config.json from cache at 
gtp_custom_cache_training/models--gpt2/snapshots/e7da7f221d5bf496a48136c0cd264e630fe9fcc8/config.json\n", + "[INFO|configuration_utils.py:705] 2023-02-14 21:56:31,526 >> Model config GPT2Config {\n", + " \"_name_or_path\": \"gpt2\",\n", + " \"activation_function\": \"gelu_new\",\n", + " \"architectures\": [\n", + " \"GPT2LMHeadModel\"\n", + " ],\n", + " \"attn_pdrop\": 0.1,\n", + " \"bos_token_id\": 50256,\n", + " \"embd_pdrop\": 0.1,\n", + " \"eos_token_id\": 50256,\n", + " \"initializer_range\": 0.02,\n", + " \"layer_norm_epsilon\": 1e-05,\n", + " \"model_type\": \"gpt2\",\n", + " \"n_ctx\": 1024,\n", + " \"n_embd\": 768,\n", + " \"n_head\": 12,\n", + " \"n_inner\": null,\n", + " \"n_layer\": 12,\n", + " \"n_positions\": 1024,\n", + " \"reorder_and_upcast_attn\": false,\n", + " \"resid_pdrop\": 0.1,\n", + " \"scale_attn_by_inverse_layer_idx\": false,\n", + " \"scale_attn_weights\": true,\n", + " \"summary_activation\": null,\n", + " \"summary_first_dropout\": 0.1,\n", + " \"summary_proj_to_labels\": true,\n", + " \"summary_type\": \"cls_index\",\n", + " \"summary_use_proj\": true,\n", + " \"task_specific_params\": {\n", + " \"text-generation\": {\n", + " \"do_sample\": true,\n", + " \"max_length\": 50\n", + " }\n", + " },\n", + " \"transformers_version\": \"4.23.1\",\n", + " \"use_cache\": true,\n", + " \"vocab_size\": 50257\n", + "}\n", + "\n", + "INFO:__main__:Using hidden states in model: False\n", + "INFO:__main__:Using implementation from class: GPT2ForSequenceClassificationCustom\n", + "Downloading (…)\"pytorch_model.bin\";: 100% 548M/548M [00:05<00:00, 108MB/s]\n", + "[INFO|modeling_utils.py:2156] 2023-02-14 21:56:36,895 >> loading weights file pytorch_model.bin from cache at gtp_custom_cache_training/models--gpt2/snapshots/e7da7f221d5bf496a48136c0cd264e630fe9fcc8/pytorch_model.bin\n", + "[INFO|modeling_utils.py:2606] 2023-02-14 21:56:39,410 >> All model checkpoint weights were used when initializing GPT2ForSequenceClassificationCustom.\n", + "\n", + 
"[WARNING|modeling_utils.py:2608] 2023-02-14 21:56:39,410 >> Some weights of GPT2ForSequenceClassificationCustom were not initialized from the model checkpoint at gpt2 and are newly initialized: ['score.dense_1_hidden.bias', 'score.dense_1_input.weight', 'score.dense_2.bias', 'score.dense_2.weight', 'score.out_proj.weight', 'score.dense_1_hidden.weight', 'score.dense_1_input.bias']\n", + "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n", + "[ERROR|tokenization_utils_base.py:1019] 2023-02-14 21:56:39,418 >> Using pad_token, but it is not set yet.\n", + "INFO:__main__:Set PAD token to EOS: <|endoftext|>\n", + "Running tokenizer on dataset: 0% 0/16 [00:00> max_steps is given, it will override any value given in num_train_epochs\n", + "[INFO|trainer.py:725] 2023-02-14 21:56:42,941 >> The following columns in the training set don't have a corresponding argument in `GPT2ForSequenceClassificationCustom.forward` and have been ignored: text. If text are not expected by `GPT2ForSequenceClassificationCustom.forward`, you can safely ignore this message.\n", + "/usr/local/lib/python3.8/dist-packages/transformers/optimization.py:306: FutureWarning: This implementation of AdamW is deprecated and will be removed in a future version. Use the PyTorch implementation torch.optim.AdamW instead, or set `no_deprecation_warning=True` to disable this warning\n", + " warnings.warn(\n", + "[INFO|trainer.py:1607] 2023-02-14 21:56:42,947 >> ***** Running training *****\n", + "[INFO|trainer.py:1608] 2023-02-14 21:56:42,947 >> Num examples = 16000\n", + "[INFO|trainer.py:1609] 2023-02-14 21:56:42,947 >> Num Epochs = 4\n", + "[INFO|trainer.py:1610] 2023-02-14 21:56:42,947 >> Instantaneous batch size per device = 24\n", + "[INFO|trainer.py:1611] 2023-02-14 21:56:42,947 >> Total train batch size (w. 
parallel, distributed & accumulation) = 24\n", + "[INFO|trainer.py:1612] 2023-02-14 21:56:42,947 >> Gradient Accumulation steps = 1\n", + "[INFO|trainer.py:1613] 2023-02-14 21:56:42,947 >> Total optimization steps = 2500\n", + "{'loss': 1.6218, 'learning_rate': 1.9200000000000003e-05, 'epoch': 0.15}\n", + "{'loss': 1.1593, 'learning_rate': 1.8400000000000003e-05, 'epoch': 0.3}\n", + " 10% 250/2500 [00:39<05:43, 6.56it/s][INFO|trainer.py:725] 2023-02-14 21:57:22,025 >> The following columns in the evaluation set don't have a corresponding argument in `GPT2ForSequenceClassificationCustom.forward` and have been ignored: text. If text are not expected by `GPT2ForSequenceClassificationCustom.forward`, you can safely ignore this message.\n", + "[INFO|trainer.py:2907] 2023-02-14 21:57:22,027 >> ***** Running Evaluation *****\n", + "[INFO|trainer.py:2909] 2023-02-14 21:57:22,027 >> Num examples = 2000\n", + "[INFO|trainer.py:2912] 2023-02-14 21:57:22,027 >> Batch size = 24\n", + "\n", + " 0% 0/84 [00:00> The following columns in the evaluation set don't have a corresponding argument in `GPT2ForSequenceClassificationCustom.forward` and have been ignored: text. 
If text are not expected by `GPT2ForSequenceClassificationCustom.forward`, you can safely ignore this message.\n", + "[INFO|trainer.py:2907] 2023-02-14 21:58:04,248 >> ***** Running Evaluation *****\n", + "[INFO|trainer.py:2909] 2023-02-14 21:58:04,248 >> Num examples = 2000\n", + "[INFO|trainer.py:2912] 2023-02-14 21:58:04,248 >> Batch size = 24\n", + "\n", + " 0% 0/84 [00:00> Saving model checkpoint to out/emotion/gpt2_custom/checkpoint-500\n", + "[INFO|configuration_utils.py:447] 2023-02-14 21:58:08,381 >> Configuration saved in out/emotion/gpt2_custom/checkpoint-500/config.json\n", + "[INFO|modeling_utils.py:1624] 2023-02-14 21:58:09,983 >> Model weights saved in out/emotion/gpt2_custom/checkpoint-500/pytorch_model.bin\n", + "[INFO|tokenization_utils_base.py:2123] 2023-02-14 21:58:09,984 >> tokenizer config file saved in out/emotion/gpt2_custom/checkpoint-500/tokenizer_config.json\n", + "[INFO|tokenization_utils_base.py:2130] 2023-02-14 21:58:09,984 >> Special tokens file saved in out/emotion/gpt2_custom/checkpoint-500/special_tokens_map.json\n", + "{'loss': 0.356, 'learning_rate': 1.5200000000000002e-05, 'epoch': 0.9}\n", + "{'loss': 0.2714, 'learning_rate': 1.4400000000000001e-05, 'epoch': 1.05}\n", + " 30% 750/2500 [02:07<04:25, 6.59it/s][INFO|trainer.py:725] 2023-02-14 21:58:49,972 >> The following columns in the evaluation set don't have a corresponding argument in `GPT2ForSequenceClassificationCustom.forward` and have been ignored: text. 
If text are not expected by `GPT2ForSequenceClassificationCustom.forward`, you can safely ignore this message.\n", + "[INFO|trainer.py:2907] 2023-02-14 21:58:49,973 >> ***** Running Evaluation *****\n", + "[INFO|trainer.py:2909] 2023-02-14 21:58:49,974 >> Num examples = 2000\n", + "[INFO|trainer.py:2912] 2023-02-14 21:58:49,974 >> Batch size = 24\n", + "\n", + " 0% 0/84 [00:00> The following columns in the evaluation set don't have a corresponding argument in `GPT2ForSequenceClassificationCustom.forward` and have been ignored: text. If text are not expected by `GPT2ForSequenceClassificationCustom.forward`, you can safely ignore this message.\n", + "[INFO|trainer.py:2907] 2023-02-14 21:59:32,170 >> ***** Running Evaluation *****\n", + "[INFO|trainer.py:2909] 2023-02-14 21:59:32,170 >> Num examples = 2000\n", + "[INFO|trainer.py:2912] 2023-02-14 21:59:32,171 >> Batch size = 24\n", + "\n", + " 0% 0/84 [00:00> Saving model checkpoint to out/emotion/gpt2_custom/checkpoint-1000\n", + "[INFO|configuration_utils.py:447] 2023-02-14 21:59:36,294 >> Configuration saved in out/emotion/gpt2_custom/checkpoint-1000/config.json\n", + "[INFO|modeling_utils.py:1624] 2023-02-14 21:59:37,744 >> Model weights saved in out/emotion/gpt2_custom/checkpoint-1000/pytorch_model.bin\n", + "[INFO|tokenization_utils_base.py:2123] 2023-02-14 21:59:37,744 >> tokenizer config file saved in out/emotion/gpt2_custom/checkpoint-1000/tokenizer_config.json\n", + "[INFO|tokenization_utils_base.py:2130] 2023-02-14 21:59:37,744 >> Special tokens file saved in out/emotion/gpt2_custom/checkpoint-1000/special_tokens_map.json\n", + "{'loss': 0.1836, 'learning_rate': 1.1200000000000001e-05, 'epoch': 1.65}\n", + "{'loss': 0.1844, 'learning_rate': 1.04e-05, 'epoch': 1.8}\n", + " 50% 1250/2500 [03:34<03:09, 6.59it/s][INFO|trainer.py:725] 2023-02-14 22:00:17,827 >> The following columns in the evaluation set don't have a corresponding argument in `GPT2ForSequenceClassificationCustom.forward` and have been ignored: 
text. If text are not expected by `GPT2ForSequenceClassificationCustom.forward`, you can safely ignore this message.\n", + "[INFO|trainer.py:2907] 2023-02-14 22:00:17,829 >> ***** Running Evaluation *****\n", + "[INFO|trainer.py:2909] 2023-02-14 22:00:17,829 >> Num examples = 2000\n", + "[INFO|trainer.py:2912] 2023-02-14 22:00:17,829 >> Batch size = 24\n", + "\n", + " 0% 0/84 [00:00> The following columns in the evaluation set don't have a corresponding argument in `GPT2ForSequenceClassificationCustom.forward` and have been ignored: text. If text are not expected by `GPT2ForSequenceClassificationCustom.forward`, you can safely ignore this message.\n", + "[INFO|trainer.py:2907] 2023-02-14 22:00:59,988 >> ***** Running Evaluation *****\n", + "[INFO|trainer.py:2909] 2023-02-14 22:00:59,988 >> Num examples = 2000\n", + "[INFO|trainer.py:2912] 2023-02-14 22:00:59,988 >> Batch size = 24\n", + "\n", + " 0% 0/84 [00:00> Saving model checkpoint to out/emotion/gpt2_custom/checkpoint-1500\n", + "[INFO|configuration_utils.py:447] 2023-02-14 22:01:04,120 >> Configuration saved in out/emotion/gpt2_custom/checkpoint-1500/config.json\n", + "[INFO|modeling_utils.py:1624] 2023-02-14 22:01:05,576 >> Model weights saved in out/emotion/gpt2_custom/checkpoint-1500/pytorch_model.bin\n", + "[INFO|tokenization_utils_base.py:2123] 2023-02-14 22:01:05,576 >> tokenizer config file saved in out/emotion/gpt2_custom/checkpoint-1500/tokenizer_config.json\n", + "[INFO|tokenization_utils_base.py:2130] 2023-02-14 22:01:05,576 >> Special tokens file saved in out/emotion/gpt2_custom/checkpoint-1500/special_tokens_map.json\n", + "{'loss': 0.1497, 'learning_rate': 7.2000000000000005e-06, 'epoch': 2.4}\n", + "{'loss': 0.1496, 'learning_rate': 6.4000000000000006e-06, 'epoch': 2.55}\n", + " 70% 1750/2500 [05:02<01:54, 6.54it/s][INFO|trainer.py:725] 2023-02-14 22:01:45,617 >> The following columns in the evaluation set don't have a corresponding argument in `GPT2ForSequenceClassificationCustom.forward` and 
have been ignored: text. If text are not expected by `GPT2ForSequenceClassificationCustom.forward`, you can safely ignore this message.\n", + "[INFO|trainer.py:2907] 2023-02-14 22:01:45,618 >> ***** Running Evaluation *****\n", + "[INFO|trainer.py:2909] 2023-02-14 22:01:45,619 >> Num examples = 2000\n", + "[INFO|trainer.py:2912] 2023-02-14 22:01:45,619 >> Batch size = 24\n", + "\n", + " 0% 0/84 [00:00> The following columns in the evaluation set don't have a corresponding argument in `GPT2ForSequenceClassificationCustom.forward` and have been ignored: text. If text are not expected by `GPT2ForSequenceClassificationCustom.forward`, you can safely ignore this message.\n", + "[INFO|trainer.py:2907] 2023-02-14 22:02:27,846 >> ***** Running Evaluation *****\n", + "[INFO|trainer.py:2909] 2023-02-14 22:02:27,846 >> Num examples = 2000\n", + "[INFO|trainer.py:2912] 2023-02-14 22:02:27,846 >> Batch size = 24\n", + "\n", + " 0% 0/84 [00:00> Saving model checkpoint to out/emotion/gpt2_custom/checkpoint-2000\n", + "[INFO|configuration_utils.py:447] 2023-02-14 22:02:31,976 >> Configuration saved in out/emotion/gpt2_custom/checkpoint-2000/config.json\n", + "[INFO|modeling_utils.py:1624] 2023-02-14 22:02:33,429 >> Model weights saved in out/emotion/gpt2_custom/checkpoint-2000/pytorch_model.bin\n", + "[INFO|tokenization_utils_base.py:2123] 2023-02-14 22:02:33,430 >> tokenizer config file saved in out/emotion/gpt2_custom/checkpoint-2000/tokenizer_config.json\n", + "[INFO|tokenization_utils_base.py:2130] 2023-02-14 22:02:33,430 >> Special tokens file saved in out/emotion/gpt2_custom/checkpoint-2000/special_tokens_map.json\n", + "{'loss': 0.104, 'learning_rate': 3.2000000000000003e-06, 'epoch': 3.15}\n", + "{'loss': 0.1206, 'learning_rate': 2.4000000000000003e-06, 'epoch': 3.3}\n", + " 90% 2250/2500 [06:30<00:38, 6.55it/s][INFO|trainer.py:725] 2023-02-14 22:03:13,484 >> The following columns in the evaluation set don't have a corresponding argument in 
`GPT2ForSequenceClassificationCustom.forward` and have been ignored: text. If text are not expected by `GPT2ForSequenceClassificationCustom.forward`, you can safely ignore this message.\n", + "[INFO|trainer.py:2907] 2023-02-14 22:03:13,486 >> ***** Running Evaluation *****\n", + "[INFO|trainer.py:2909] 2023-02-14 22:03:13,486 >> Num examples = 2000\n", + "[INFO|trainer.py:2912] 2023-02-14 22:03:13,486 >> Batch size = 24\n", + "\n", + " 0% 0/84 [00:00> The following columns in the evaluation set don't have a corresponding argument in `GPT2ForSequenceClassificationCustom.forward` and have been ignored: text. If text are not expected by `GPT2ForSequenceClassificationCustom.forward`, you can safely ignore this message.\n", + "[INFO|trainer.py:2907] 2023-02-14 22:03:55,705 >> ***** Running Evaluation *****\n", + "[INFO|trainer.py:2909] 2023-02-14 22:03:55,705 >> Num examples = 2000\n", + "[INFO|trainer.py:2912] 2023-02-14 22:03:55,706 >> Batch size = 24\n", + "\n", + " 0% 0/84 [00:00> Saving model checkpoint to out/emotion/gpt2_custom/checkpoint-2500\n", + "[INFO|configuration_utils.py:447] 2023-02-14 22:03:59,823 >> Configuration saved in out/emotion/gpt2_custom/checkpoint-2500/config.json\n", + "[INFO|modeling_utils.py:1624] 2023-02-14 22:04:00,568 >> Model weights saved in out/emotion/gpt2_custom/checkpoint-2500/pytorch_model.bin\n", + "[INFO|tokenization_utils_base.py:2123] 2023-02-14 22:04:00,569 >> tokenizer config file saved in out/emotion/gpt2_custom/checkpoint-2500/tokenizer_config.json\n", + "[INFO|tokenization_utils_base.py:2130] 2023-02-14 22:04:00,569 >> Special tokens file saved in out/emotion/gpt2_custom/checkpoint-2500/special_tokens_map.json\n", + "[INFO|trainer.py:1852] 2023-02-14 22:04:02,582 >> \n", + "\n", + "Training completed. 
Do not forget to share your model on huggingface.co/models =)\n", + "\n", + "\n", + "[INFO|trainer.py:1946] 2023-02-14 22:04:02,582 >> Loading best model from out/emotion/gpt2_custom/checkpoint-2000 (score: 0.9365000128746033).\n", + "{'train_runtime': 440.0758, 'train_samples_per_second': 136.34, 'train_steps_per_second': 5.681, 'train_loss': 0.32335229415893557, 'epoch': 3.75}\n", + "100% 2500/2500 [07:20<00:00, 5.68it/s]\n", + "[INFO|trainer.py:2656] 2023-02-14 22:04:03,025 >> Saving model checkpoint to out/emotion/gpt2_custom\n", + "[INFO|configuration_utils.py:447] 2023-02-14 22:04:03,026 >> Configuration saved in out/emotion/gpt2_custom/config.json\n", + "[INFO|modeling_utils.py:1624] 2023-02-14 22:04:03,965 >> Model weights saved in out/emotion/gpt2_custom/pytorch_model.bin\n", + "[INFO|tokenization_utils_base.py:2123] 2023-02-14 22:04:03,966 >> tokenizer config file saved in out/emotion/gpt2_custom/tokenizer_config.json\n", + "[INFO|tokenization_utils_base.py:2130] 2023-02-14 22:04:03,966 >> Special tokens file saved in out/emotion/gpt2_custom/special_tokens_map.json\n", + "***** train metrics *****\n", + " epoch = 3.75\n", + " train_loss = 0.3234\n", + " train_runtime = 0:07:20.07\n", + " train_samples = 16000\n", + " train_samples_per_second = 136.34\n", + " train_steps_per_second = 5.681\n", + "INFO:__main__:*** Evaluate ***\n", + "[INFO|trainer.py:725] 2023-02-14 22:04:04,068 >> The following columns in the evaluation set don't have a corresponding argument in `GPT2ForSequenceClassificationCustom.forward` and have been ignored: text. 
If text are not expected by `GPT2ForSequenceClassificationCustom.forward`, you can safely ignore this message.\n", + "[INFO|trainer.py:2907] 2023-02-14 22:04:04,069 >> ***** Running Evaluation *****\n", + "[INFO|trainer.py:2909] 2023-02-14 22:04:04,069 >> Num examples = 2000\n", + "[INFO|trainer.py:2912] 2023-02-14 22:04:04,070 >> Batch size = 24\n", + "100% 84/84 [00:04<00:00, 20.35it/s]\n", + "***** eval metrics *****\n", + " epoch = 3.75\n", + " eval_accuracy = 0.9365\n", + " eval_loss = 0.1436\n", + " eval_runtime = 0:00:04.18\n", + " eval_samples = 2000\n", + " eval_samples_per_second = 477.778\n", + " eval_steps_per_second = 20.067\n", + "INFO:__main__:*** Predict ***\n", + "[INFO|trainer.py:725] 2023-02-14 22:04:08,259 >> The following columns in the test set don't have a corresponding argument in `GPT2ForSequenceClassificationCustom.forward` and have been ignored: text. If text are not expected by `GPT2ForSequenceClassificationCustom.forward`, you can safely ignore this message.\n", + "[INFO|trainer.py:2907] 2023-02-14 22:04:08,260 >> ***** Running Prediction *****\n", + "[INFO|trainer.py:2909] 2023-02-14 22:04:08,260 >> Num examples = 2000\n", + "[INFO|trainer.py:2912] 2023-02-14 22:04:08,260 >> Batch size = 24\n", + "100% 84/84 [00:04<00:00, 20.62it/s]\n", + "INFO:__main__:***** Predict results None *****\n", + "[INFO|modelcard.py:444] 2023-02-14 22:04:12,537 >> Dropping the following result as it does not have all the necessary fields:\n", + "{'task': {'name': 'Text Classification', 'type': 'text-classification'}, 'metrics': [{'name': 'Accuracy', 'type': 'accuracy', 'value': 0.9365000128746033}]}\n" + ] + } + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "VrHmnOaT7ICl" + }, + "source": [ + "## **T5**" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "CmuDde477ICl" + }, + "source": [ + "- full data\n", + "- model `T5`\n", + "- sequence length: 128\n", + "- training epoch: 1\n", + "- first few layers frozen" + ] + }, + { +
"cell_type": "code", + "execution_count": 12, + "metadata": { + "id": "2ruXjeqj7ICl", + "colab": { + "base_uri": "https://localhost:8080/" + }, + "outputId": "4d73b407-08c3-4007-aa32-c8709dd696fa" + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "2023-02-14 22:04:17.129470: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 AVX512F AVX512_VNNI FMA\n", + "To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.\n", + "2023-02-14 22:04:17.281426: I tensorflow/core/util/port.cc:104] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.\n", + "2023-02-14 22:04:18.087509: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/lib64-nvidia\n", + "2023-02-14 22:04:18.087605: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/lib64-nvidia\n", + "2023-02-14 22:04:18.087624: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. 
If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly.\n", + "WARNING:__main__:Process rank: -1, device: cuda:0, n_gpu: 1distributed training: False, 16-bits training: False\n", + "INFO:__main__:Training/evaluation parameters Seq2SeqTrainingArguments(\n", + "_n_gpu=1,\n", + "adafactor=False,\n", + "adam_beta1=0.9,\n", + "adam_beta2=0.999,\n", + "adam_epsilon=1e-08,\n", + "auto_find_batch_size=False,\n", + "bf16=False,\n", + "bf16_full_eval=False,\n", + "data_seed=None,\n", + "dataloader_drop_last=False,\n", + "dataloader_num_workers=0,\n", + "dataloader_pin_memory=True,\n", + "ddp_bucket_cap_mb=None,\n", + "ddp_find_unused_parameters=None,\n", + "ddp_timeout=1800,\n", + "debug=[],\n", + "deepspeed=None,\n", + "disable_tqdm=False,\n", + "do_eval=True,\n", + "do_predict=True,\n", + "do_train=True,\n", + "eval_accumulation_steps=None,\n", + "eval_delay=0,\n", + "eval_steps=250,\n", + "evaluation_strategy=steps,\n", + "fp16=False,\n", + "fp16_backend=auto,\n", + "fp16_full_eval=False,\n", + "fp16_opt_level=O1,\n", + "fsdp=[],\n", + "fsdp_min_num_params=0,\n", + "fsdp_transformer_layer_cls_to_wrap=None,\n", + "full_determinism=False,\n", + "generation_max_length=128,\n", + "generation_num_beams=None,\n", + "gradient_accumulation_steps=1,\n", + "gradient_checkpointing=False,\n", + "greater_is_better=True,\n", + "group_by_length=False,\n", + "half_precision_backend=auto,\n", + "hub_model_id=None,\n", + "hub_private_repo=False,\n", + "hub_strategy=every_save,\n", + "hub_token=,\n", + "ignore_data_skip=False,\n", + "include_inputs_for_metrics=False,\n", + "jit_mode_eval=False,\n", + "label_names=None,\n", + "label_smoothing_factor=0.0,\n", + "learning_rate=5e-05,\n", + "length_column_name=length,\n", + "load_best_model_at_end=True,\n", + "local_rank=-1,\n", + "log_level=passive,\n", + "log_level_replica=passive,\n", + "log_on_each_node=True,\n", + 
"logging_dir=out/emotion/t5_v1_1/runs/Feb14_22-04-20_fc0011e45a00,\n", + "logging_first_step=False,\n", + "logging_nan_inf_filter=True,\n", + "logging_steps=100,\n", + "logging_strategy=steps,\n", + "lr_scheduler_type=linear,\n", + "max_grad_norm=1.0,\n", + "max_steps=2500,\n", + "metric_for_best_model=accuracy,\n", + "mp_parameters=,\n", + "no_cuda=False,\n", + "num_train_epochs=1.0,\n", + "optim=adamw_hf,\n", + "output_dir=out/emotion/t5_v1_1,\n", + "overwrite_output_dir=True,\n", + "past_index=-1,\n", + "per_device_eval_batch_size=8,\n", + "per_device_train_batch_size=8,\n", + "predict_with_generate=True,\n", + "prediction_loss_only=False,\n", + "push_to_hub=False,\n", + "push_to_hub_model_id=None,\n", + "push_to_hub_organization=None,\n", + "push_to_hub_token=,\n", + "ray_scope=last,\n", + "remove_unused_columns=True,\n", + "report_to=['tensorboard'],\n", + "resume_from_checkpoint=None,\n", + "run_name=out/emotion/t5_v1_1,\n", + "save_on_each_node=False,\n", + "save_steps=500,\n", + "save_strategy=steps,\n", + "save_total_limit=5,\n", + "seed=42,\n", + "sharded_ddp=[],\n", + "skip_memory_metrics=True,\n", + "sortish_sampler=False,\n", + "tf32=None,\n", + "torchdynamo=None,\n", + "tpu_metrics_debug=False,\n", + "tpu_num_cores=None,\n", + "use_ipex=False,\n", + "use_legacy_prediction_loop=False,\n", + "use_mps_device=False,\n", + "warmup_ratio=0.0,\n", + "warmup_steps=0,\n", + "weight_decay=0.0,\n", + "xpu_backend=None,\n", + ")\n", + "WARNING:datasets.builder:Using custom data configuration default-a82ca4164dba097e\n", + "INFO:datasets.info:Loading Dataset Infos from /usr/local/lib/python3.8/dist-packages/datasets/packaged_modules/json\n", + "INFO:datasets.builder:Generating dataset json (/content/t5_cache_training/json/default-a82ca4164dba097e/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51)\n", + "Downloading and preparing dataset json/default to 
/content/t5_cache_training/json/default-a82ca4164dba097e/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51...\n", + "Downloading data files: 100% 3/3 [00:00<00:00, 11848.32it/s]\n", + "INFO:datasets.download.download_manager:Downloading took 0.0 min\n", + "INFO:datasets.download.download_manager:Checksum Computation took 0.0 min\n", + "Extracting data files: 100% 3/3 [00:00<00:00, 2097.85it/s]\n", + "INFO:datasets.utils.info_utils:Unable to verify checksums.\n", + "INFO:datasets.builder:Generating train split\n", + "INFO:datasets.builder:Generating validation split\n", + "INFO:datasets.builder:Generating test split\n", + "INFO:datasets.utils.info_utils:Unable to verify splits sizes.\n", + "Dataset json downloaded and prepared to /content/t5_cache_training/json/default-a82ca4164dba097e/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51. Subsequent calls will reuse this data.\n", + "100% 3/3 [00:00<00:00, 953.83it/s]\n", + "Downloading (…)lve/main/config.json: 100% 537/537 [00:00<00:00, 97.0kB/s]\n", + "[INFO|configuration_utils.py:653] 2023-02-14 22:04:20,972 >> loading configuration file config.json from cache at t5_cache_training/models--google--t5-v1_1-small/snapshots/8a88af75516269158a3aa488d1abdfd3d5e4ee49/config.json\n", + "[INFO|configuration_utils.py:705] 2023-02-14 22:04:20,975 >> Model config T5Config {\n", + " \"_name_or_path\": \"google/t5-v1_1-small\",\n", + " \"architectures\": [\n", + " \"T5ForConditionalGeneration\"\n", + " ],\n", + " \"d_ff\": 1024,\n", + " \"d_kv\": 64,\n", + " \"d_model\": 512,\n", + " \"decoder_start_token_id\": 0,\n", + " \"dense_act_fn\": \"gelu_new\",\n", + " \"dropout_rate\": 0.1,\n", + " \"eos_token_id\": 1,\n", + " \"feed_forward_proj\": \"gated-gelu\",\n", + " \"initializer_factor\": 1.0,\n", + " \"is_encoder_decoder\": true,\n", + " \"is_gated_act\": true,\n", + " \"layer_norm_epsilon\": 1e-06,\n", + " \"model_type\": \"t5\",\n", + " \"num_decoder_layers\": 8,\n", + " 
\"num_heads\": 6,\n", + " \"num_layers\": 8,\n", + " \"output_past\": true,\n", + " \"pad_token_id\": 0,\n", + " \"relative_attention_max_distance\": 128,\n", + " \"relative_attention_num_buckets\": 32,\n", + " \"tie_word_embeddings\": false,\n", + " \"transformers_version\": \"4.23.1\",\n", + " \"use_cache\": true,\n", + " \"vocab_size\": 32128\n", + "}\n", + "\n", + "Downloading (…)okenizer_config.json: 100% 1.86k/1.86k [00:00<00:00, 853kB/s]\n", + "[INFO|configuration_utils.py:653] 2023-02-14 22:04:21,160 >> loading configuration file config.json from cache at t5_cache_training/models--google--t5-v1_1-small/snapshots/8a88af75516269158a3aa488d1abdfd3d5e4ee49/config.json\n", + "[INFO|configuration_utils.py:705] 2023-02-14 22:04:21,160 >> Model config T5Config {\n", + " \"_name_or_path\": \"google/t5-v1_1-small\",\n", + " \"architectures\": [\n", + " \"T5ForConditionalGeneration\"\n", + " ],\n", + " \"d_ff\": 1024,\n", + " \"d_kv\": 64,\n", + " \"d_model\": 512,\n", + " \"decoder_start_token_id\": 0,\n", + " \"dense_act_fn\": \"gelu_new\",\n", + " \"dropout_rate\": 0.1,\n", + " \"eos_token_id\": 1,\n", + " \"feed_forward_proj\": \"gated-gelu\",\n", + " \"initializer_factor\": 1.0,\n", + " \"is_encoder_decoder\": true,\n", + " \"is_gated_act\": true,\n", + " \"layer_norm_epsilon\": 1e-06,\n", + " \"model_type\": \"t5\",\n", + " \"num_decoder_layers\": 8,\n", + " \"num_heads\": 6,\n", + " \"num_layers\": 8,\n", + " \"output_past\": true,\n", + " \"pad_token_id\": 0,\n", + " \"relative_attention_max_distance\": 128,\n", + " \"relative_attention_num_buckets\": 32,\n", + " \"tie_word_embeddings\": false,\n", + " \"transformers_version\": \"4.23.1\",\n", + " \"use_cache\": true,\n", + " \"vocab_size\": 32128\n", + "}\n", + "\n", + "Downloading (…)ve/main/spiece.model: 100% 792k/792k [00:00<00:00, 10.2MB/s]\n", + "Downloading (…)cial_tokens_map.json: 100% 1.79k/1.79k [00:00<00:00, 705kB/s]\n", + "[INFO|tokenization_utils_base.py:1773] 2023-02-14 22:04:21,837 >> loading 
file spiece.model from cache at t5_cache_training/models--google--t5-v1_1-small/snapshots/8a88af75516269158a3aa488d1abdfd3d5e4ee49/spiece.model\n", + "[INFO|tokenization_utils_base.py:1773] 2023-02-14 22:04:21,837 >> loading file tokenizer.json from cache at None\n", + "[INFO|tokenization_utils_base.py:1773] 2023-02-14 22:04:21,837 >> loading file added_tokens.json from cache at None\n", + "[INFO|tokenization_utils_base.py:1773] 2023-02-14 22:04:21,837 >> loading file special_tokens_map.json from cache at t5_cache_training/models--google--t5-v1_1-small/snapshots/8a88af75516269158a3aa488d1abdfd3d5e4ee49/special_tokens_map.json\n", + "[INFO|tokenization_utils_base.py:1773] 2023-02-14 22:04:21,837 >> loading file tokenizer_config.json from cache at t5_cache_training/models--google--t5-v1_1-small/snapshots/8a88af75516269158a3aa488d1abdfd3d5e4ee49/tokenizer_config.json\n", + "[INFO|configuration_utils.py:653] 2023-02-14 22:04:21,838 >> loading configuration file config.json from cache at t5_cache_training/models--google--t5-v1_1-small/snapshots/8a88af75516269158a3aa488d1abdfd3d5e4ee49/config.json\n", + "[INFO|configuration_utils.py:705] 2023-02-14 22:04:21,838 >> Model config T5Config {\n", + " \"_name_or_path\": \"google/t5-v1_1-small\",\n", + " \"architectures\": [\n", + " \"T5ForConditionalGeneration\"\n", + " ],\n", + " \"d_ff\": 1024,\n", + " \"d_kv\": 64,\n", + " \"d_model\": 512,\n", + " \"decoder_start_token_id\": 0,\n", + " \"dense_act_fn\": \"gelu_new\",\n", + " \"dropout_rate\": 0.1,\n", + " \"eos_token_id\": 1,\n", + " \"feed_forward_proj\": \"gated-gelu\",\n", + " \"initializer_factor\": 1.0,\n", + " \"is_encoder_decoder\": true,\n", + " \"is_gated_act\": true,\n", + " \"layer_norm_epsilon\": 1e-06,\n", + " \"model_type\": \"t5\",\n", + " \"num_decoder_layers\": 8,\n", + " \"num_heads\": 6,\n", + " \"num_layers\": 8,\n", + " \"output_past\": true,\n", + " \"pad_token_id\": 0,\n", + " \"relative_attention_max_distance\": 128,\n", + " 
\"relative_attention_num_buckets\": 32,\n", + " \"tie_word_embeddings\": false,\n", + " \"transformers_version\": \"4.23.1\",\n", + " \"use_cache\": true,\n", + " \"vocab_size\": 32128\n", + "}\n", + "\n", + "[INFO|configuration_utils.py:653] 2023-02-14 22:04:21,888 >> loading configuration file config.json from cache at t5_cache_training/models--google--t5-v1_1-small/snapshots/8a88af75516269158a3aa488d1abdfd3d5e4ee49/config.json\n", + "[INFO|configuration_utils.py:705] 2023-02-14 22:04:21,889 >> Model config T5Config {\n", + " \"_name_or_path\": \"google/t5-v1_1-small\",\n", + " \"architectures\": [\n", + " \"T5ForConditionalGeneration\"\n", + " ],\n", + " \"d_ff\": 1024,\n", + " \"d_kv\": 64,\n", + " \"d_model\": 512,\n", + " \"decoder_start_token_id\": 0,\n", + " \"dense_act_fn\": \"gelu_new\",\n", + " \"dropout_rate\": 0.1,\n", + " \"eos_token_id\": 1,\n", + " \"feed_forward_proj\": \"gated-gelu\",\n", + " \"initializer_factor\": 1.0,\n", + " \"is_encoder_decoder\": true,\n", + " \"is_gated_act\": true,\n", + " \"layer_norm_epsilon\": 1e-06,\n", + " \"model_type\": \"t5\",\n", + " \"num_decoder_layers\": 8,\n", + " \"num_heads\": 6,\n", + " \"num_layers\": 8,\n", + " \"output_past\": true,\n", + " \"pad_token_id\": 0,\n", + " \"relative_attention_max_distance\": 128,\n", + " \"relative_attention_num_buckets\": 32,\n", + " \"tie_word_embeddings\": false,\n", + " \"transformers_version\": \"4.23.1\",\n", + " \"use_cache\": true,\n", + " \"vocab_size\": 32128\n", + "}\n", + "\n", + "Downloading (…)\"pytorch_model.bin\";: 100% 308M/308M [00:03<00:00, 84.8MB/s]\n", + "[INFO|modeling_utils.py:2156] 2023-02-14 22:04:26,050 >> loading weights file pytorch_model.bin from cache at t5_cache_training/models--google--t5-v1_1-small/snapshots/8a88af75516269158a3aa488d1abdfd3d5e4ee49/pytorch_model.bin\n", + "[INFO|modeling_utils.py:2606] 2023-02-14 22:04:27,048 >> All model checkpoint weights were used when initializing T5ForConditionalGeneration.\n", + "\n", + 
"[INFO|modeling_utils.py:2614] 2023-02-14 22:04:27,048 >> All the weights of T5ForConditionalGeneration were initialized from the model checkpoint at google/t5-v1_1-small.\n", + "If your task is similar to the task the model of the checkpoint was trained on, you can already use T5ForConditionalGeneration for predictions without further training.\n", + "\n", + "\n", + "Frozen layers:\n", + "[('encoder.block.1.layer.0.SelfAttention.q.weight', False), ('encoder.block.1.layer.0.SelfAttention.k.weight', False), ('encoder.block.1.layer.0.SelfAttention.v.weight', False), ('encoder.block.1.layer.0.SelfAttention.o.weight', False), ('encoder.block.1.layer.0.layer_norm.weight', False), ('encoder.block.1.layer.1.DenseReluDense.wi_0.weight', False), ('encoder.block.1.layer.1.DenseReluDense.wi_1.weight', False), ('encoder.block.1.layer.1.DenseReluDense.wo.weight', False), ('encoder.block.1.layer.1.layer_norm.weight', False), ('encoder.block.2.layer.0.SelfAttention.q.weight', False), ('encoder.block.2.layer.0.SelfAttention.k.weight', False), ('encoder.block.2.layer.0.SelfAttention.v.weight', False), ('encoder.block.2.layer.0.SelfAttention.o.weight', False), ('encoder.block.2.layer.0.layer_norm.weight', False), ('encoder.block.2.layer.1.DenseReluDense.wi_0.weight', False), ('encoder.block.2.layer.1.DenseReluDense.wi_1.weight', False), ('encoder.block.2.layer.1.DenseReluDense.wo.weight', False), ('encoder.block.2.layer.1.layer_norm.weight', False), ('encoder.block.3.layer.0.SelfAttention.q.weight', False), ('encoder.block.3.layer.0.SelfAttention.k.weight', False), ('encoder.block.3.layer.0.SelfAttention.v.weight', False), ('encoder.block.3.layer.0.SelfAttention.o.weight', False), ('encoder.block.3.layer.0.layer_norm.weight', False), ('encoder.block.3.layer.1.DenseReluDense.wi_0.weight', False), ('encoder.block.3.layer.1.DenseReluDense.wi_1.weight', False), ('encoder.block.3.layer.1.DenseReluDense.wo.weight', False), ('encoder.block.3.layer.1.layer_norm.weight', False), 
('encoder.block.4.layer.0.SelfAttention.q.weight', False), ('encoder.block.4.layer.0.SelfAttention.k.weight', False), ('encoder.block.4.layer.0.SelfAttention.v.weight', False), ('encoder.block.4.layer.0.SelfAttention.o.weight', False), ('encoder.block.4.layer.0.layer_norm.weight', False), ('encoder.block.4.layer.1.DenseReluDense.wi_0.weight', False), ('encoder.block.4.layer.1.DenseReluDense.wi_1.weight', False), ('encoder.block.4.layer.1.DenseReluDense.wo.weight', False), ('encoder.block.4.layer.1.layer_norm.weight', False), ('encoder.block.5.layer.0.SelfAttention.q.weight', False), ('encoder.block.5.layer.0.SelfAttention.k.weight', False), ('encoder.block.5.layer.0.SelfAttention.v.weight', False), ('encoder.block.5.layer.0.SelfAttention.o.weight', False), ('encoder.block.5.layer.0.layer_norm.weight', False), ('encoder.block.5.layer.1.DenseReluDense.wi_0.weight', False), ('encoder.block.5.layer.1.DenseReluDense.wi_1.weight', False), ('encoder.block.5.layer.1.DenseReluDense.wo.weight', False), ('encoder.block.5.layer.1.layer_norm.weight', False), ('encoder.block.6.layer.0.SelfAttention.q.weight', False), ('encoder.block.6.layer.0.SelfAttention.k.weight', False), ('encoder.block.6.layer.0.SelfAttention.v.weight', False), ('encoder.block.6.layer.0.SelfAttention.o.weight', False), ('encoder.block.6.layer.0.layer_norm.weight', False), ('encoder.block.6.layer.1.DenseReluDense.wi_0.weight', False), ('encoder.block.6.layer.1.DenseReluDense.wi_1.weight', False), ('encoder.block.6.layer.1.DenseReluDense.wo.weight', False), ('encoder.block.6.layer.1.layer_norm.weight', False), ('encoder.block.7.layer.0.SelfAttention.q.weight', False), ('encoder.block.7.layer.0.SelfAttention.k.weight', False), ('encoder.block.7.layer.0.SelfAttention.v.weight', False), ('encoder.block.7.layer.0.SelfAttention.o.weight', False), ('encoder.block.7.layer.0.layer_norm.weight', False), ('encoder.block.7.layer.1.DenseReluDense.wi_0.weight', False), 
('encoder.block.7.layer.1.DenseReluDense.wi_1.weight', False), ('encoder.block.7.layer.1.DenseReluDense.wo.weight', False), ('encoder.block.7.layer.1.layer_norm.weight', False)] \n", + "\n", + "\n", + "INFO:__main__:Using translation prefix: \"emotion classification: \"\n", + "Running tokenizer on train dataset: 0% 0/16 [00:00> max_steps is given, it will override any value given in num_train_epochs\n", + "/usr/local/lib/python3.8/dist-packages/transformers/optimization.py:306: FutureWarning: This implementation of AdamW is deprecated and will be removed in a future version. Use the PyTorch implementation torch.optim.AdamW instead, or set `no_deprecation_warning=True` to disable this warning\n", + " warnings.warn(\n", + "[INFO|trainer.py:1607] 2023-02-14 22:04:30,911 >> ***** Running training *****\n", + "[INFO|trainer.py:1608] 2023-02-14 22:04:30,911 >> Num examples = 16000\n", + "[INFO|trainer.py:1609] 2023-02-14 22:04:30,911 >> Num Epochs = 2\n", + "[INFO|trainer.py:1610] 2023-02-14 22:04:30,911 >> Instantaneous batch size per device = 8\n", + "[INFO|trainer.py:1611] 2023-02-14 22:04:30,911 >> Total train batch size (w. parallel, distributed & accumulation) = 8\n", + "[INFO|trainer.py:1612] 2023-02-14 22:04:30,911 >> Gradient Accumulation steps = 1\n", + "[INFO|trainer.py:1613] 2023-02-14 22:04:30,911 >> Total optimization steps = 2500\n", + " 0% 0/2500 [00:00> You're using a T5TokenizerFast tokenizer. 
Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.\n", + "{'loss': 21.5908, 'learning_rate': 4.8e-05, 'epoch': 0.05}\n", + "{'loss': 14.8264, 'learning_rate': 4.600000000000001e-05, 'epoch': 0.1}\n", + " 10% 249/2500 [00:24<03:31, 10.64it/s][INFO|trainer.py:2907] 2023-02-14 22:04:55,366 >> ***** Running Evaluation *****\n", + "[INFO|trainer.py:2909] 2023-02-14 22:04:55,366 >> Num examples = 2000\n", + "[INFO|trainer.py:2912] 2023-02-14 22:04:55,366 >> Batch size = 8\n", + "\n", + " 0% 0/250 [00:00> ***** Running Evaluation *****\n", + "[INFO|trainer.py:2909] 2023-02-14 22:05:35,963 >> Num examples = 2000\n", + "[INFO|trainer.py:2912] 2023-02-14 22:05:35,963 >> Batch size = 8\n", + "\n", + " 0% 0/250 [00:00> Saving model checkpoint to out/emotion/t5_v1_1/checkpoint-500\n", + "[INFO|configuration_utils.py:447] 2023-02-14 22:05:53,120 >> Configuration saved in out/emotion/t5_v1_1/checkpoint-500/config.json\n", + "[INFO|modeling_utils.py:1624] 2023-02-14 22:05:53,749 >> Model weights saved in out/emotion/t5_v1_1/checkpoint-500/pytorch_model.bin\n", + "[INFO|tokenization_utils_base.py:2123] 2023-02-14 22:05:53,750 >> tokenizer config file saved in out/emotion/t5_v1_1/checkpoint-500/tokenizer_config.json\n", + "[INFO|tokenization_utils_base.py:2130] 2023-02-14 22:05:53,750 >> Special tokens file saved in out/emotion/t5_v1_1/checkpoint-500/special_tokens_map.json\n", + "[INFO|tokenization_t5_fast.py:187] 2023-02-14 22:05:53,788 >> Copy vocab file to out/emotion/t5_v1_1/checkpoint-500/spiece.model\n", + "{'loss': 3.7795, 'learning_rate': 3.8e-05, 'epoch': 0.3}\n", + "{'loss': 2.9169, 'learning_rate': 3.6e-05, 'epoch': 0.35}\n", + " 30% 749/2500 [01:47<02:43, 10.71it/s][INFO|trainer.py:2907] 2023-02-14 22:06:18,135 >> ***** Running Evaluation *****\n", + "[INFO|trainer.py:2909] 2023-02-14 22:06:18,136 >> Num examples = 2000\n", + 
"[INFO|trainer.py:2912] 2023-02-14 22:06:18,136 >> Batch size = 8\n", + "\n", + " 0% 0/250 [00:00> ***** Running Evaluation *****\n", + "[INFO|trainer.py:2909] 2023-02-14 22:06:58,636 >> Num examples = 2000\n", + "[INFO|trainer.py:2912] 2023-02-14 22:06:58,636 >> Batch size = 8\n", + "\n", + " 0% 0/250 [00:00> Saving model checkpoint to out/emotion/t5_v1_1/checkpoint-1000\n", + "[INFO|configuration_utils.py:447] 2023-02-14 22:07:15,785 >> Configuration saved in out/emotion/t5_v1_1/checkpoint-1000/config.json\n", + "[INFO|modeling_utils.py:1624] 2023-02-14 22:07:16,414 >> Model weights saved in out/emotion/t5_v1_1/checkpoint-1000/pytorch_model.bin\n", + "[INFO|tokenization_utils_base.py:2123] 2023-02-14 22:07:16,415 >> tokenizer config file saved in out/emotion/t5_v1_1/checkpoint-1000/tokenizer_config.json\n", + "[INFO|tokenization_utils_base.py:2130] 2023-02-14 22:07:16,416 >> Special tokens file saved in out/emotion/t5_v1_1/checkpoint-1000/special_tokens_map.json\n", + "[INFO|tokenization_t5_fast.py:187] 2023-02-14 22:07:16,453 >> Copy vocab file to out/emotion/t5_v1_1/checkpoint-1000/spiece.model\n", + "{'loss': 1.9003, 'learning_rate': 2.8000000000000003e-05, 'epoch': 0.55}\n", + "{'loss': 1.7884, 'learning_rate': 2.6000000000000002e-05, 'epoch': 0.6}\n", + " 50% 1249/2500 [03:09<01:59, 10.49it/s][INFO|trainer.py:2907] 2023-02-14 22:07:40,879 >> ***** Running Evaluation *****\n", + "[INFO|trainer.py:2909] 2023-02-14 22:07:40,879 >> Num examples = 2000\n", + "[INFO|trainer.py:2912] 2023-02-14 22:07:40,879 >> Batch size = 8\n", + "\n", + " 0% 0/250 [00:00> ***** Running Evaluation *****\n", + "[INFO|trainer.py:2909] 2023-02-14 22:08:21,433 >> Num examples = 2000\n", + "[INFO|trainer.py:2912] 2023-02-14 22:08:21,433 >> Batch size = 8\n", + "\n", + " 0% 0/250 [00:00> Saving model checkpoint to out/emotion/t5_v1_1/checkpoint-1500\n", + "[INFO|configuration_utils.py:447] 2023-02-14 22:08:38,814 >> Configuration saved in 
out/emotion/t5_v1_1/checkpoint-1500/config.json\n", + "[INFO|modeling_utils.py:1624] 2023-02-14 22:08:39,285 >> Model weights saved in out/emotion/t5_v1_1/checkpoint-1500/pytorch_model.bin\n", + "[INFO|tokenization_utils_base.py:2123] 2023-02-14 22:08:39,286 >> tokenizer config file saved in out/emotion/t5_v1_1/checkpoint-1500/tokenizer_config.json\n", + "[INFO|tokenization_utils_base.py:2130] 2023-02-14 22:08:39,286 >> Special tokens file saved in out/emotion/t5_v1_1/checkpoint-1500/special_tokens_map.json\n", + "[INFO|tokenization_t5_fast.py:187] 2023-02-14 22:08:39,322 >> Copy vocab file to out/emotion/t5_v1_1/checkpoint-1500/spiece.model\n", + "{'loss': 1.4835, 'learning_rate': 1.8e-05, 'epoch': 0.8}\n", + "{'loss': 1.449, 'learning_rate': 1.6000000000000003e-05, 'epoch': 0.85}\n", + " 70% 1749/2500 [04:32<01:10, 10.61it/s][INFO|trainer.py:2907] 2023-02-14 22:09:03,363 >> ***** Running Evaluation *****\n", + "[INFO|trainer.py:2909] 2023-02-14 22:09:03,363 >> Num examples = 2000\n", + "[INFO|trainer.py:2912] 2023-02-14 22:09:03,363 >> Batch size = 8\n", + "\n", + " 0% 0/250 [00:00> ***** Running Evaluation *****\n", + "[INFO|trainer.py:2909] 2023-02-14 22:09:43,863 >> Num examples = 2000\n", + "[INFO|trainer.py:2912] 2023-02-14 22:09:43,863 >> Batch size = 8\n", + "\n", + " 0% 0/250 [00:00> Saving model checkpoint to out/emotion/t5_v1_1/checkpoint-2000\n", + "[INFO|configuration_utils.py:447] 2023-02-14 22:10:01,105 >> Configuration saved in out/emotion/t5_v1_1/checkpoint-2000/config.json\n", + "[INFO|modeling_utils.py:1624] 2023-02-14 22:10:01,585 >> Model weights saved in out/emotion/t5_v1_1/checkpoint-2000/pytorch_model.bin\n", + "[INFO|tokenization_utils_base.py:2123] 2023-02-14 22:10:01,586 >> tokenizer config file saved in out/emotion/t5_v1_1/checkpoint-2000/tokenizer_config.json\n", + "[INFO|tokenization_utils_base.py:2130] 2023-02-14 22:10:01,586 >> Special tokens file saved in out/emotion/t5_v1_1/checkpoint-2000/special_tokens_map.json\n", + 
"[INFO|tokenization_t5_fast.py:187] 2023-02-14 22:10:01,623 >> Copy vocab file to out/emotion/t5_v1_1/checkpoint-2000/spiece.model\n", + "{'loss': 1.2708, 'learning_rate': 8.000000000000001e-06, 'epoch': 1.05}\n", + "{'loss': 1.3351, 'learning_rate': 6e-06, 'epoch': 1.1}\n", + " 90% 2249/2500 [05:54<00:23, 10.80it/s][INFO|trainer.py:2907] 2023-02-14 22:10:25,736 >> ***** Running Evaluation *****\n", + "[INFO|trainer.py:2909] 2023-02-14 22:10:25,736 >> Num examples = 2000\n", + "[INFO|trainer.py:2912] 2023-02-14 22:10:25,736 >> Batch size = 8\n", + "\n", + " 0% 0/250 [00:00> ***** Running Evaluation *****\n", + "[INFO|trainer.py:2909] 2023-02-14 22:11:06,283 >> Num examples = 2000\n", + "[INFO|trainer.py:2912] 2023-02-14 22:11:06,283 >> Batch size = 8\n", + "\n", + " 0% 0/250 [00:00> Saving model checkpoint to out/emotion/t5_v1_1/checkpoint-2500\n", + "[INFO|configuration_utils.py:447] 2023-02-14 22:11:23,557 >> Configuration saved in out/emotion/t5_v1_1/checkpoint-2500/config.json\n", + "[INFO|modeling_utils.py:1624] 2023-02-14 22:11:24,033 >> Model weights saved in out/emotion/t5_v1_1/checkpoint-2500/pytorch_model.bin\n", + "[INFO|tokenization_utils_base.py:2123] 2023-02-14 22:11:24,034 >> tokenizer config file saved in out/emotion/t5_v1_1/checkpoint-2500/tokenizer_config.json\n", + "[INFO|tokenization_utils_base.py:2130] 2023-02-14 22:11:24,034 >> Special tokens file saved in out/emotion/t5_v1_1/checkpoint-2500/special_tokens_map.json\n", + "[INFO|tokenization_t5_fast.py:187] 2023-02-14 22:11:24,070 >> Copy vocab file to out/emotion/t5_v1_1/checkpoint-2500/spiece.model\n", + "[INFO|trainer.py:1852] 2023-02-14 22:11:24,853 >> \n", + "\n", + "Training completed. 
Do not forget to share your model on huggingface.co/models =)\n", + "\n", + "\n", + "[INFO|trainer.py:1946] 2023-02-14 22:11:24,854 >> Loading best model from out/emotion/t5_v1_1/checkpoint-500 (score: 1.0).\n", + "{'train_runtime': 414.2608, 'train_samples_per_second': 48.279, 'train_steps_per_second': 6.035, 'train_loss': 3.8232721221923827, 'epoch': 1.25}\n", + "100% 2500/2500 [06:54<00:00, 6.03it/s]\n", + "[INFO|trainer.py:2656] 2023-02-14 22:11:25,173 >> Saving model checkpoint to out/emotion/t5_v1_1\n", + "[INFO|configuration_utils.py:447] 2023-02-14 22:11:25,174 >> Configuration saved in out/emotion/t5_v1_1/config.json\n", + "[INFO|modeling_utils.py:1624] 2023-02-14 22:11:25,662 >> Model weights saved in out/emotion/t5_v1_1/pytorch_model.bin\n", + "[INFO|tokenization_utils_base.py:2123] 2023-02-14 22:11:25,663 >> tokenizer config file saved in out/emotion/t5_v1_1/tokenizer_config.json\n", + "[INFO|tokenization_utils_base.py:2130] 2023-02-14 22:11:25,663 >> Special tokens file saved in out/emotion/t5_v1_1/special_tokens_map.json\n", + "[INFO|tokenization_t5_fast.py:187] 2023-02-14 22:11:25,703 >> Copy vocab file to out/emotion/t5_v1_1/spiece.model\n", + "***** train metrics *****\n", + " epoch = 1.25\n", + " train_loss = 3.8233\n", + " train_runtime = 0:06:54.26\n", + " train_samples = 16000\n", + " train_samples_per_second = 48.279\n", + " train_steps_per_second = 6.035\n", + "INFO:__main__:*** Evaluate ***\n", + "[INFO|trainer.py:2907] 2023-02-14 22:11:25,713 >> ***** Running Evaluation *****\n", + "[INFO|trainer.py:2909] 2023-02-14 22:11:25,713 >> Num examples = 2000\n", + "[INFO|trainer.py:2912] 2023-02-14 22:11:25,713 >> Batch size = 8\n", + "100% 250/250 [00:17<00:00, 14.50it/s]\n", + "***** eval metrics *****\n", + " epoch = 1.25\n", + " eval_accuracy = 1.0\n", + " eval_bleu = 0.0\n", + " eval_gen_len = 2.0\n", + " eval_loss = 2.1697\n", + " eval_runtime = 0:00:17.31\n", + " eval_samples = 2000\n", + " eval_samples_per_second = 115.494\n", + " 
eval_steps_per_second = 14.437\n", + "INFO:__main__:*** Predict ***\n", + "[INFO|trainer.py:2907] 2023-02-14 22:11:43,033 >> ***** Running Prediction *****\n", + "[INFO|trainer.py:2909] 2023-02-14 22:11:43,033 >> Num examples = 2000\n", + "[INFO|trainer.py:2912] 2023-02-14 22:11:43,034 >> Batch size = 8\n", + "100% 250/250 [00:17<00:00, 14.58it/s]\n", + "***** predict metrics *****\n", + " predict_accuracy = 1.0\n", + " predict_bleu = 0.0\n", + " predict_gen_len = 2.0\n", + " predict_loss = 2.1029\n", + " predict_runtime = 0:00:17.21\n", + " predict_samples = 2000\n", + " predict_samples_per_second = 116.158\n", + " predict_steps_per_second = 14.52\n", + "[INFO|modelcard.py:444] 2023-02-14 22:12:00,417 >> Dropping the following result as it does not have all the necessary fields:\n", + "{'task': {'name': 'Translation', 'type': 'translation'}, 'metrics': [{'name': 'Bleu', 'type': 'bleu', 'value': 0.0}, {'name': 'Accuracy', 'type': 'accuracy', 'value': 1.0}]}\n" + ] + } + ], + "source": [ + "!python run_translation.py \\\n", + " --cache_dir t5_cache_training \\\n", + " --model_name_or_path \"google/t5-v1_1-small\" \\\n", + " --train_file data/s2s-train.json \\\n", + " --validation_file data/s2s-valid.json \\\n", + " --test_file data/s2s-test.json \\\n", + " --per_device_train_batch_size 8 \\\n", + " --per_device_eval_batch_size 8 \\\n", + " --source_lang \"text\" \\\n", + " --target_lang \"label\" \\\n", + " --source_prefix \"emotion classification\" \\\n", + " --max_source_length 256 \\\n", + " --max_target_length 128 \\\n", + " --generation_max_length 128 \\\n", + " --do_train \\\n", + " --do_eval \\\n", + " --do_predict \\\n", + " --predict_with_generate \\\n", + " --num_train_epochs 1 \\\n", + " --output_dir out/emotion/t5_v1_1 \\\n", + " --overwrite_output_dir \\\n", + " --eval_steps 250 \\\n", + " --evaluation_strategy steps \\\n", + " --metric_for_best_model accuracy \\\n", + " --logging_steps 100 \\\n", + " --save_total_limit 5 \\\n", + " --max_steps 2500 
\\\n", + " --load_best_model_at_end True " + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "XyC_7Ov07ICm" + }, + "source": [ + "# **FLAN T5**" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": { + "id": "nX6LOzsF7ICm" + }, + "outputs": [], + "source": [ + "from transformers import pipeline, AutoTokenizer, AutoModelForSeq2SeqLM\n", + "import json" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": { + "id": "EEuIugWA7ICm" + }, + "outputs": [], + "source": [ + "if torch.cuda.is_available():\n", + " device = 0\n", + "else:\n", + " device = -1" + ] + }, + { + "cell_type": "code", + "source": [ + "def perform_shot_learning(pipeline_type, model_name, test_file):\n", + " class_type = AutoModelForSeq2SeqLM\n", + " model = class_type.from_pretrained(model_name, torch_dtype=torch.float32)\n", + " tokenizer = AutoTokenizer.from_pretrained(model_name)\n", + "\n", + " our_pipeline = pipeline(pipeline_type, model=model, tokenizer=tokenizer, device=device)\n", + "\n", + " correct = 0\n", + "\n", + " labels = \"possible labels: sadness, joy, love, anger, fear, surprise\"\n", + "\n", + " with open(test_file) as f:\n", + " f_lines = f.readlines()\n", + " for line in f_lines:\n", + " ex = json.loads(line)\n", + " prompt = ex['text']\n", + "\n", + " tmp = labels + '\\n' + f'text: {prompt}' + '\\n' + 'label: '\n", + " \n", + " predict = our_pipeline(tmp, do_sample=False)[0]['generated_text']\n", + "\n", + " if predict == ex['label']:\n", + " correct += 1\n", + "\n", + " print(f'Accuracy: {correct/len(f_lines)}')" + ], + "metadata": { + "id": "AtDz85GKalzg" + }, + "execution_count": 15, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "test_ds = 'data/s2s-test.json'" + ], + "metadata": { + "id": "q9-4fzxpaoff" + }, + "execution_count": 16, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "perform_shot_learning('text2text-generation', 'google/flan-t5-large', test_ds)" + ], + "metadata": { + 
"colab": { + "base_uri": "https://localhost:8080/", + "height": 219, + "referenced_widgets": [ + "18f03144f5194bd2a88064eaae1140f0", + "36b8333766d44ee2aaa8da8ee75975d2", + "f7a9b125cf1346468e428abd689ff800", + "9b9e6161874f41c98d5c5e55d8d4fc86", + "9925a6f17ba14eee96332f0ea1dc88e5", + "dce64adfb8334591a8ce182918ecb4e3", + "9efd8cd2208245aca3f369f0735e2ee1", + "3d05704ffb0040c8b5bfb5c068c3329b", + "9564dcdd10c64072bb09e70def311ff3", + "f406c9b52a274068bd636554558497b2", + "d97be50f8cc64f8680a6cce112863255", + "7d6b88e56dad4dcbb0f1b1720f1ff118", + "eabc78cbdeef40feb36cf90fdbcdfbc7", + "6477d99dffbc4cf39e2c6998f71e37f7", + "d63511a8852942309cabe53720939fcc", + "3096b59f64eb48659a8eedea5a171be4", + "acc58b06f3b54801b10ee872fab39e6e", + "c2bd9c9ddab848529e52adfdc7634044", + "8d7e8c29d7e247f1b55d329d40508526", + "457b70adcab0464c9f990b13f433c635", + "0858fe327ec549b488f6169de1d84654", + "e18a505153c7491f8900142fb1189cd7", + "945026e5e11448b39ab37fb2a0bd963c", + "8c3aa97d58cb4f21b59af6253c952859", + "848ff807a83c4a79a1b3d7d80c29499c", + "a7b1f6722fcd4e90811041b24df0fe7b", + "f815d05091814c39a467cd8f528db504", + "915449ab41d848d39d801b4feb932a4f", + "2937b015455647abb7a524f858a881d2", + "c2b6cda9a8e94f7e97d7fb032b8e2bc5", + "af885a022ad743098e5037e1c8dc760a", + "088ec36aff7f415abfc4fd926fa0f902", + "b1b99d863dc64208afc11416d4936c2c", + "cb9e02be7ec44f6bb6b8771691c114e4", + "f68a247bddf9484e9f7b1666802f4612", + "d8d89ac972084304bff515a16e009452", + "3495b00846ae49acbb0cf3e15edf361e", + "60f6f23e78ce4ee2abf7389ab936c3ac", + "9d428e02c4134510baf179ce9137d90c", + "5298f4cd4e2e404ea66d70c62bcfe439", + "cd9fdc3eb94a4d00b5af6115318dcf45", + "d664c674a977456cad109347c0206d0e", + "17e5dedc0aeb4a1da32113e51158fd74", + "9b70ec9f110f4080a6a26fd12044fe94" + ] + }, + "id": "7fWzF9PVatgL", + "outputId": "6c37c046-a14c-4cab-e285-fa1ddfeb3241" + }, + "execution_count": 17, + "outputs": [ + { + "output_type": "display_data", + "data": { + "text/plain": [ + "Downloading 
(…)okenizer_config.json: 0%| | 0.00/2.54k [00:00 None: + file_name = 's2s-' + original_save_path.name + file_path = original_save_path.parent / file_name + + print(f'Saving into: {file_path}') + with open(file_path, 'wt') as f_write: + for data_line in data_to_save: + label = data_line['label'] + new_label = MAP_LABEL_TRANSLATION[label] + data_line['label'] = new_label + data_line_str = json.dumps(data_line) + f_write.write(f'{data_line_str}\n') + +def main() -> None: + loaded_data = load_dataset('emotion') + logger.info(f'Loaded dataset emotion: {loaded_data}') + + save_path = Path('data') + save_train_path = save_path / 'train.json' + save_valid_path = save_path / 'valid.json' + save_test_path = save_path / 'test.json' + if not save_path.exists(): + save_path.mkdir() + + data_train, data_valid, data_test = [], [], [] + for source_data, dataset, max_size in [ + (loaded_data['train'], data_train, None), + (loaded_data['test'], data_test, None), + (loaded_data['validation'], data_valid, None) + ]: + for i, data in enumerate(source_data): + if max_size is not None and i >= max_size: + break + data_line = { + 'label': int(data['label']), + 'text': data['text'], + } + dataset.append(data_line) + logger.info(f'Train: {len(data_train):6d}') + logger.info(f'Test: {len(data_test):6d}') + logger.info(f'Validation: {len(data_valid):6d}') + + for file_path, data_to_save in [ + (save_train_path, data_train), + (save_valid_path, data_valid), + (save_test_path, data_test) + ]: + print(f'Saving into: {file_path}') + with open(file_path, 'wt') as f_write: + for data_line in data_to_save: + data_line_str = json.dumps(data_line) + f_write.write(f'{data_line_str}\n') + + save_as_translations(file_path, data_to_save) + + + +if __name__ == '__main__': + main() diff --git a/scripts/gpt2.py b/scripts/gpt2.py new file mode 100644 index 0000000..522f568 --- /dev/null +++ b/scripts/gpt2.py @@ -0,0 +1,205 @@ +import logging +from typing import Optional, Tuple, Union + +import torch +from 
torch import nn +from torch.nn import MSELoss, CrossEntropyLoss, BCEWithLogitsLoss +from transformers import GPT2Model, GPT2ForSequenceClassification +from transformers.modeling_outputs import SequenceClassifierOutputWithPast + + +logger = logging.getLogger(__name__) + + +# Simple version # + +class GPT2ClassificationHeadCustomSimple(nn.Module): + def __init__(self, config): + super().__init__() + hidden_size = config.n_embd + self.dense_1 = nn.Linear(hidden_size, 4 * hidden_size) + self.dense_2 = nn.Linear(4 * hidden_size, 2* hidden_size) + self.dense_3 = nn.Linear(2 * hidden_size, hidden_size) + self.dropout = nn.Dropout(config.resid_pdrop) + self.out_proj = nn.Linear(hidden_size, config.num_labels, bias=False) + + def forward(self, x): + x = self.dense_1(x) + x = torch.relu(x) + x = self.dropout(x) + + x = self.dense_2(x) + x = torch.relu(x) + x = self.dropout(x) + + x = self.dense_3(x) + x = torch.relu(x) + x = self.dropout(x) + + x = self.out_proj(x) + return x + + +class GPT2ForSequenceClassificationCustomSimple(GPT2ForSequenceClassification): + _keys_to_ignore_on_load_missing = [r"h\.\d+\.attn\.masked_bias", r"lm_head.weight"] + + def __init__(self, config): + super().__init__(config) + self.num_labels = config.num_labels + self.transformer = GPT2Model(config) + self.score = GPT2ClassificationHeadCustomSimple(config) + + # Model parallel + self.model_parallel = False + self.device_map = None + + # Initialize weights and apply final processing + self.post_init() + + +# Version with custom forward 1 # + +class GPT2ClassificationHeadCustom(nn.Module): + def __init__(self, config): + super().__init__() + hidden_size = config.n_embd + self.dense_1_input = nn.Linear(hidden_size, 2 * hidden_size) + self.dense_1_hidden = nn.Linear(hidden_size, 2 * hidden_size) + self.dense_2 = nn.Linear(4 * hidden_size, hidden_size) + self.dropout = nn.Dropout(config.resid_pdrop) + self.out_proj = nn.Linear(hidden_size, config.num_labels, bias=False) + + def forward(self, x, 
**kwargs): + if 'hidden_states' in kwargs and kwargs['hidden_states'] is not None: + # Get hidden states from second from the end + hidden = kwargs['hidden_states'][-2] + else: + hidden = torch.zeros(x.size(), dtype=x.dtype, device=x.device) + + x = self.dense_1_input(x) + x = torch.relu(x) + x = self.dropout(x) + + hidden = self.dense_1_hidden(hidden) + hidden = torch.relu(hidden) + hidden = self.dropout(hidden) + + x = torch.cat((x, hidden), dim=2) + x = self.dense_2(x) + x = torch.relu(x) + x = self.dropout(x) + + x = self.out_proj(x) + return x + + +class GPT2ForSequenceClassificationCustom(GPT2ForSequenceClassification): + _keys_to_ignore_on_load_missing = [r"h\.\d+\.attn\.masked_bias", r"lm_head.weight"] + + def __init__(self, config): + super().__init__(config) + self.num_labels = config.num_labels + self.transformer = GPT2Model(config) + self.score = GPT2ClassificationHeadCustom(config) + + # Model parallel + self.model_parallel = False + self.device_map = None + + # Initialize weights and apply final processing + self.post_init() + + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[Tuple[Tuple[torch.Tensor]]] = None, + attention_mask: Optional[torch.FloatTensor] = None, + token_type_ids: Optional[torch.LongTensor] = None, + position_ids: Optional[torch.LongTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, SequenceClassifierOutputWithPast]: + r""" + labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): + Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., + config.num_labels - 1]`. 
If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If + `config.num_labels > 1` a classification loss is computed (Cross-Entropy). + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + transformer_outputs = self.transformer( + input_ids, + past_key_values=past_key_values, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states or self.config.use_hidden_states, + return_dict=return_dict, + ) + hidden_states = transformer_outputs[0] + logits = self.score(hidden_states, hidden_states=transformer_outputs.hidden_states) + + if input_ids is not None: + batch_size, sequence_length = input_ids.shape[:2] + else: + batch_size, sequence_length = inputs_embeds.shape[:2] + + assert ( + self.config.pad_token_id is not None or batch_size == 1 + ), "Cannot handle batch sizes > 1 if no padding token is defined." + if self.config.pad_token_id is None: + sequence_lengths = -1 + else: + if input_ids is not None: + sequence_lengths = torch.ne(input_ids, self.config.pad_token_id).sum(-1) - 1 + else: + sequence_lengths = -1 + logger.warning( + f"{self.__class__.__name__} will not detect padding tokens in `inputs_embeds`. 
Results may be " + "unexpected if using padding tokens in conjunction with `inputs_embeds.`" + ) + + pooled_logits = logits[torch.arange(batch_size, device=logits.device), sequence_lengths] + + loss = None + if labels is not None: + if self.config.problem_type is None: + if self.num_labels == 1: + self.config.problem_type = "regression" + elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int): + self.config.problem_type = "single_label_classification" + else: + self.config.problem_type = "multi_label_classification" + + if self.config.problem_type == "regression": + loss_fct = MSELoss() + if self.num_labels == 1: + loss = loss_fct(pooled_logits.squeeze(), labels.squeeze()) + else: + loss = loss_fct(pooled_logits, labels) + elif self.config.problem_type == "single_label_classification": + loss_fct = CrossEntropyLoss() + loss = loss_fct(pooled_logits.view(-1, self.num_labels), labels.view(-1)) + elif self.config.problem_type == "multi_label_classification": + loss_fct = BCEWithLogitsLoss() + loss = loss_fct(pooled_logits, labels) + if not return_dict: + output = (pooled_logits,) + transformer_outputs[1:] + return ((loss,) + output) if loss is not None else output + + return SequenceClassifierOutputWithPast( + loss=loss, + logits=pooled_logits, + past_key_values=transformer_outputs.past_key_values, + hidden_states=transformer_outputs.hidden_states, + attentions=transformer_outputs.attentions, + ) diff --git a/scripts/roberta.py b/scripts/roberta.py new file mode 100644 index 0000000..5824775 --- /dev/null +++ b/scripts/roberta.py @@ -0,0 +1,334 @@ +from typing import Optional, Union, Tuple + +import torch +from torch import nn +from torch.nn import MSELoss, CrossEntropyLoss, BCEWithLogitsLoss +from transformers import RobertaForSequenceClassification, RobertaModel +from transformers.modeling_outputs import SequenceClassifierOutput + + +# Simple version # + +class RobertaClassificationHeadCustomSimple(nn.Module): + """Head for 
sentence-level classification tasks.""" + + def __init__(self, config): + super().__init__() + hidden_size = config.hidden_size + self.dense_1 = nn.Linear(hidden_size, 2 * hidden_size) + self.dense_1_hidden = nn.Linear(hidden_size, 2 * hidden_size) + self.dense_2 = nn.Linear(2 * hidden_size, hidden_size) + classifier_dropout = ( + config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob + ) + self.dropout = nn.Dropout(classifier_dropout) + self.relu = nn.LeakyReLU() + self.out_proj = nn.Linear(hidden_size, config.num_labels) + + def forward(self, features, **kwargs): + x = features[:, 0, :] # take token (equiv. to [CLS]) + + x = self.dense_1(x) + x = self.relu(x) + x = self.dropout(x) + + hidden = self.dense_1_hidden(hidden) + hidden = torch.relu(hidden) + hidden = self.dropout(hidden) + + x = torch.cat((x, hidden), dim=2) + + x = self.dense_2(x) + x = self.relu(x) + x = self.dropout(x) + + x = self.out_proj(x) + return x + + +class RobertaForSequenceClassificationCustomSimple(RobertaForSequenceClassification): + _keys_to_ignore_on_load_missing = [r"position_ids"] + + def __init__(self, config): + super().__init__(config) + self.num_labels = config.num_labels + self.config = config + + self.roberta = RobertaModel(config, add_pooling_layer=False) + self.classifier = RobertaClassificationHeadCustomSimple(config) + + # Initialize weights and apply final processing + self.init_weights() + + +# Version with custom forward 1 # + +class RobertaClassificationHeadCustom(nn.Module): + """Head for sentence-level classification tasks.""" + + def __init__(self, config): + super().__init__() + self.use_hidden_states = config.use_hidden_states + hidden_size = config.hidden_size + if self.use_hidden_states: + hidden_size *= 2 + + self.dense_1 = nn.Linear(hidden_size, 2 * hidden_size) + self.dense_2 = nn.Linear(2 * hidden_size, hidden_size) + classifier_dropout = ( + config.classifier_dropout if config.classifier_dropout is not None else 
class RobertaClassificationHeadCustom(nn.Module):
    """Head for sentence-level classification tasks.

    When ``config.use_hidden_states`` is true the head input is twice as wide
    (``2 * hidden_size``): the first-token vector of ``features`` concatenated
    with the first-token vector of the second-to-last entry of
    ``hidden_states``. Otherwise the input is ``hidden_size`` wide.

    Args:
        config: object exposing ``hidden_size``, ``num_labels``,
            ``classifier_dropout``, ``hidden_dropout_prob`` and, optionally,
            ``use_hidden_states`` (treated as ``False`` when absent).
    """

    def __init__(self, config):
        super().__init__()
        # Configs that were not prepared by the training script have no such
        # attribute; behave like the plain head instead of raising.
        self.use_hidden_states = getattr(config, "use_hidden_states", False)
        hidden_size = config.hidden_size
        if self.use_hidden_states:
            hidden_size *= 2

        self.dense_1 = nn.Linear(hidden_size, 2 * hidden_size)
        self.dense_2 = nn.Linear(2 * hidden_size, hidden_size)
        # Fall back to the encoder dropout when no classifier-specific value is set.
        classifier_dropout = (
            config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob
        )
        self.dropout = nn.Dropout(classifier_dropout)
        self.out_proj = nn.Linear(hidden_size, config.num_labels)

    def forward(self, features, **kwargs):
        """Compute logits from ``features`` and optional ``hidden_states``.

        Args:
            features: tensor indexed as ``features[:, 0, :]``.
            **kwargs: may contain ``hidden_states``, a sequence whose
                second-to-last element is indexed the same way. Other
                keys are ignored.

        Returns:
            Output of ``out_proj``.
        """
        first_token = features[:, 0, :]  # take <s> token (equiv. to [CLS])
        hidden_states = kwargs.get('hidden_states')

        if hidden_states is not None:
            # take <s> token (equiv. to [CLS]) from hidden states from second from the end
            penultimate = hidden_states[-2][:, 0, :]
            if self.use_hidden_states:
                x = torch.cat((first_token, penultimate), dim=1)
            else:
                # NOTE(review): with the flag off, supplying hidden states still
                # changes the result (they are summed in). Kept as-is; confirm
                # this is intended.
                x = first_token + penultimate
        elif self.use_hidden_states:
            # No hidden states available: pad with zeros to the width dense_1 expects.
            x = torch.cat((first_token, torch.zeros_like(first_token)), dim=1)
        else:
            x = first_token

        x = self.dense_1(x)
        x = torch.relu(x)
        x = self.dropout(x)

        x = self.dense_2(x)
        x = torch.relu(x)
        x = self.dropout(x)

        return self.out_proj(x)
class RobertaForSequenceClassificationCustom(RobertaForSequenceClassification):
    """RoBERTa encoder (built without pooling layer) with `RobertaClassificationHeadCustom` on top.

    The encoder's hidden states are forwarded to the head together with the
    final sequence output.
    """

    _keys_to_ignore_on_load_missing = [r"position_ids"]

    def __init__(self, config):
        super().__init__(config)
        self.num_labels = config.num_labels
        self.config = config

        self.roberta = RobertaModel(config, add_pooling_layer=False)
        self.classifier = RobertaClassificationHeadCustom(config)

        # Initialize weights and apply final processing
        self.post_init()

    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.FloatTensor] = None,
        token_type_ids: Optional[torch.LongTensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple[torch.Tensor], SequenceClassifierOutput]:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).

        Raises:
            NotImplementedError: when tuple outputs are requested (`return_dict` resolves to a false value).
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
        if not return_dict:
            # The head is fed `outputs.hidden_states`, an attribute of the dict-style
            # output only. Fail before running the encoder, and with a real
            # exception class (`NotImplemented` is a constant, not an exception).
            raise NotImplementedError('Not implemented for using non-dictionary object')

        outputs = self.roberta(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            # Hidden states are requested whenever the head is configured to use them.
            output_hidden_states=output_hidden_states or getattr(self.config, "use_hidden_states", False),
            return_dict=return_dict,
        )
        sequence_output = outputs[0]
        logits = self.classifier(sequence_output, hidden_states=outputs.hidden_states)

        loss = None
        if labels is not None:
            # Infer the problem type once and remember it on the config.
            if self.config.problem_type is None:
                if self.num_labels == 1:
                    self.config.problem_type = "regression"
                elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
                    self.config.problem_type = "single_label_classification"
                else:
                    self.config.problem_type = "multi_label_classification"

            if self.config.problem_type == "regression":
                loss_fct = MSELoss()
                if self.num_labels == 1:
                    loss = loss_fct(logits.squeeze(), labels.squeeze())
                else:
                    loss = loss_fct(logits, labels)
            elif self.config.problem_type == "single_label_classification":
                loss_fct = CrossEntropyLoss()
                loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
            elif self.config.problem_type == "multi_label_classification":
                loss_fct = BCEWithLogitsLoss()
                loss = loss_fct(logits, labels)

        return SequenceClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )
class RobertaClassificationHeadCustomAlternative(nn.Module):
    """Head for sentence-level classification tasks.

    Two parallel branches, each H -> 2H followed by ReLU and dropout:
    ``dense_1_input`` on the first token of ``features`` and
    ``dense_1_hidden`` on the first token of the second-to-last entry of
    ``hidden_states``. Their outputs are concatenated (4H), reduced by
    ``dense_2`` (4H -> H) and projected by ``out_proj`` (H -> ``num_labels``).

    Args:
        config: object exposing ``hidden_size``, ``num_labels``,
            ``classifier_dropout`` and ``hidden_dropout_prob``.
    """

    def __init__(self, config):
        super().__init__()
        hidden_size = config.hidden_size

        self.dense_1_input = nn.Linear(hidden_size, 2 * hidden_size)
        self.dense_1_hidden = nn.Linear(hidden_size, 2 * hidden_size)
        self.dense_2 = nn.Linear(4 * hidden_size, hidden_size)
        # Fall back to the encoder dropout when no classifier-specific value is set.
        classifier_dropout = (
            config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob
        )
        self.dropout = nn.Dropout(classifier_dropout)
        self.out_proj = nn.Linear(hidden_size, config.num_labels)

    def forward(self, features, **kwargs):
        """Compute logits from ``features`` and optional ``hidden_states``.

        Args:
            features: tensor indexed as ``features[:, 0, :]``.
            **kwargs: may contain ``hidden_states``, a sequence whose
                second-to-last element is indexed the same way. When it is
                missing or ``None`` the hidden branch receives zeros.

        Returns:
            Output of ``out_proj``.
        """
        x = features[:, 0, :]  # take <s> token (equiv. to [CLS])
        hidden_states = kwargs.get('hidden_states')
        if hidden_states is not None:
            # take <s> token (equiv. to [CLS]) from hidden states from second from the end
            hidden = hidden_states[-2][:, 0, :]
        else:
            # Same shape, dtype and device as `x`.
            hidden = torch.zeros_like(x)

        x = self.dense_1_input(x)
        x = torch.relu(x)
        x = self.dropout(x)

        hidden = self.dense_1_hidden(hidden)
        hidden = torch.relu(hidden)
        hidden = self.dropout(hidden)

        x = torch.cat((x, hidden), dim=1)
        x = self.dense_2(x)
        x = torch.relu(x)
        x = self.dropout(x)

        return self.out_proj(x)
class RobertaForSequenceClassificationCustomAlternative(RobertaForSequenceClassification):
    """RoBERTa encoder (built without pooling layer) with `RobertaClassificationHeadCustomAlternative` on top.

    The encoder's hidden states are forwarded to the head together with the
    final sequence output.
    """

    _keys_to_ignore_on_load_missing = [r"position_ids"]

    def __init__(self, config):
        super().__init__(config)
        self.num_labels = config.num_labels
        self.config = config

        self.roberta = RobertaModel(config, add_pooling_layer=False)
        self.classifier = RobertaClassificationHeadCustomAlternative(config)

        self.init_weights()

    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.FloatTensor] = None,
        token_type_ids: Optional[torch.LongTensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple[torch.Tensor], SequenceClassifierOutput]:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).

        Raises:
            NotImplementedError: when tuple outputs are requested (`return_dict` resolves to a false value).
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
        if not return_dict:
            # The head is fed `outputs.hidden_states`, an attribute of the dict-style
            # output only. Fail before running the encoder, and with a real
            # exception class (`NotImplemented` is a constant, not an exception).
            raise NotImplementedError('Not implemented for using non-dictionary object')

        outputs = self.roberta(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            # NOTE(review): hidden states are only requested when the caller asks for
            # them or `config.use_hidden_states` is true; otherwise the head receives
            # `hidden_states=None`. Confirm that is the intended default for this model.
            output_hidden_states=output_hidden_states or getattr(self.config, "use_hidden_states", False),
            return_dict=return_dict,
        )
        sequence_output = outputs[0]
        logits = self.classifier(sequence_output, hidden_states=outputs.hidden_states)

        loss = None
        if labels is not None:
            # Infer the problem type once and remember it on the config.
            if self.config.problem_type is None:
                if self.num_labels == 1:
                    self.config.problem_type = "regression"
                elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
                    self.config.problem_type = "single_label_classification"
                else:
                    self.config.problem_type = "multi_label_classification"

            if self.config.problem_type == "regression":
                loss_fct = MSELoss()
                if self.num_labels == 1:
                    loss = loss_fct(logits.squeeze(), labels.squeeze())
                else:
                    loss = loss_fct(logits, labels)
            elif self.config.problem_type == "single_label_classification":
                loss_fct = CrossEntropyLoss()
                loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
            elif self.config.problem_type == "multi_label_classification":
                loss_fct = BCEWithLogitsLoss()
                loss = loss_fct(logits, labels)

        return SequenceClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )
@dataclass
class DataTrainingArguments:
    """
    Options describing the data used for training and evaluation.

    The class is handed to `HfArgumentParser`, so every field below is also
    available as a command-line argument.
    """

    task_name: Optional[str] = field(
        default=None,
        metadata={"help": "The name of the task to train on: " + ", ".join(task_to_keys.keys())},
    )
    dataset_name: Optional[str] = field(
        default=None,
        metadata={"help": "The name of the dataset to use (via the datasets library)."},
    )
    dataset_config_name: Optional[str] = field(
        default=None,
        metadata={"help": "The configuration name of the dataset to use (via the datasets library)."},
    )
    max_seq_length: int = field(
        default=128,
        metadata={
            "help": (
                "The maximum total input sequence length after tokenization. Sequences longer "
                "than this will be truncated, sequences shorter will be padded."
            )
        },
    )
    overwrite_cache: bool = field(
        default=False,
        metadata={"help": "Overwrite the cached preprocessed datasets or not."},
    )
    pad_to_max_length: bool = field(
        default=True,
        metadata={
            "help": (
                "Whether to pad all samples to `max_seq_length`. "
                "If False, will pad the samples dynamically when batching to the maximum length in the batch."
            )
        },
    )
    max_train_samples: Optional[int] = field(
        default=None,
        metadata={
            "help": (
                "For debugging purposes or quicker training, truncate the number of training examples to this "
                "value if set."
            )
        },
    )
    max_eval_samples: Optional[int] = field(
        default=None,
        metadata={
            "help": (
                "For debugging purposes or quicker training, truncate the number of evaluation examples to this "
                "value if set."
            )
        },
    )
    max_predict_samples: Optional[int] = field(
        default=None,
        metadata={
            "help": (
                "For debugging purposes or quicker training, truncate the number of prediction examples to this "
                "value if set."
            )
        },
    )
    train_file: Optional[str] = field(
        default=None,
        metadata={"help": "A csv or a json file containing the training data."},
    )
    validation_file: Optional[str] = field(
        default=None,
        metadata={"help": "A csv or a json file containing the validation data."},
    )
    test_file: Optional[str] = field(
        default=None,
        metadata={"help": "A csv or a json file containing the test data."},
    )

    def __post_init__(self):
        """Validate the data source: a known task, a dataset name, or train/validation files."""
        # 1) A GLUE task name takes precedence; it is normalised to lower case.
        if self.task_name is not None:
            self.task_name = self.task_name.lower()
            if self.task_name not in task_to_keys.keys():
                raise ValueError("Unknown task, you should pick one in " + ",".join(task_to_keys.keys()))
            return

        # 2) A hub dataset name needs no further checks here.
        if self.dataset_name is not None:
            return

        # 3) Otherwise both local files are mandatory and must share an extension.
        if self.train_file is None or self.validation_file is None:
            raise ValueError("Need either a GLUE task, a training/validation file or a dataset name.")

        train_ext = self.train_file.split(".")[-1]
        assert train_ext in ["csv", "json"], "`train_file` should be a csv or a json file."
        valid_ext = self.validation_file.split(".")[-1]
        assert (
            valid_ext == train_ext
        ), "`validation_file` should have the same extension (csv or json) as `train_file`."
def main():
    """Fine-tune, evaluate and/or run prediction for a sequence-classification model.

    Steps, in order: parse `ModelArguments`, `DataTrainingArguments` and
    `TrainingArguments` (from the command line or a single JSON file), set up
    logging, load the dataset, build config/tokenizer/model (optionally one of
    the classes in `MODEL_NAME_TO_CLASS`), tokenize, then run `Trainer`
    training / evaluation / prediction as requested and finally push to the
    hub or create a model card.

    Raises:
        ValueError: for a non-empty output directory without checkpoint, a missing
            test file with `do_predict`, or a missing dataset split.
        RuntimeError: when `custom_model` and `model_name_or_path` name different
            model families.
    """
    # See all possible arguments in src/transformers/training_args.py
    # or by passing the --help flag to this script.
    # We now keep distinct sets of args, for a cleaner separation of concerns.

    parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TrainingArguments))
    if len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
        # If we pass only one argument to the script and it's the path to a json file,
        # let's parse it to get our arguments.
        model_args, data_args, training_args = parser.parse_json_file(json_file=os.path.abspath(sys.argv[1]))
    else:
        model_args, data_args, training_args = parser.parse_args_into_dataclasses()

    # Sending telemetry. Tracking the example usage helps us better allocate resources to maintain them. The
    # information sent is the one passed as arguments along with your Python/PyTorch versions.
    send_example_telemetry("run_glue", model_args, data_args)

    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        handlers=[logging.StreamHandler(sys.stdout)],
    )

    log_level = training_args.get_process_log_level()
    logger.setLevel(log_level)
    datasets.utils.logging.set_verbosity(log_level)
    transformers.utils.logging.set_verbosity(log_level)
    transformers.utils.logging.enable_default_handler()
    transformers.utils.logging.enable_explicit_format()

    # Log on each process the small summary:
    logger.warning(
        f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}, "
        + f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}"
    )
    logger.info(f"Training/evaluation parameters {training_args}")

    # Detecting last checkpoint.
    last_checkpoint = None
    if os.path.isdir(training_args.output_dir) and training_args.do_train and not training_args.overwrite_output_dir:
        last_checkpoint = get_last_checkpoint(training_args.output_dir)
        if last_checkpoint is None and len(os.listdir(training_args.output_dir)) > 0:
            raise ValueError(
                f"Output directory ({training_args.output_dir}) already exists and is not empty. "
                "Use --overwrite_output_dir to overcome."
            )
        elif last_checkpoint is not None and training_args.resume_from_checkpoint is None:
            logger.info(
                f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change "
                "the `--output_dir` or add `--overwrite_output_dir` to train from scratch."
            )

    # Set seed before initializing model.
    set_seed(training_args.seed)

    # Get the datasets: you can either provide your own CSV/JSON training and evaluation files (see below)
    # or specify a GLUE benchmark task (the dataset will be downloaded automatically from the datasets Hub).
    #
    # For CSV/JSON files, this script will use as labels the column called 'label' and as pair of sentences the
    # sentences in columns called 'sentence1' and 'sentence2' if such column exists or the first two columns not named
    # label if at least two columns are provided.
    #
    # If the CSVs/JSONs contain only one non-label column, the script does single sentence classification on this
    # single column. You can easily tweak this behavior (see below)
    #
    # In distributed training, the load_dataset function guarantee that only one local process can concurrently
    # download the dataset.
    if data_args.task_name is not None:
        # Downloading and loading a dataset from the hub.
        raw_datasets = load_dataset(
            "glue",
            data_args.task_name,
            cache_dir=model_args.cache_dir,
            use_auth_token=True if model_args.use_auth_token else None,
        )
    elif data_args.dataset_name is not None:
        # Downloading and loading a dataset from the hub.
        raw_datasets = load_dataset(
            data_args.dataset_name,
            data_args.dataset_config_name,
            cache_dir=model_args.cache_dir,
            use_auth_token=True if model_args.use_auth_token else None,
        )
    else:
        # Loading a dataset from your local files.
        # CSV/JSON training and evaluation files are needed.
        data_files = {"train": data_args.train_file, "validation": data_args.validation_file}

        # Get the test dataset: you can provide your own CSV/JSON test file (see below)
        # when you use `do_predict` without specifying a GLUE benchmark task.
        if training_args.do_predict:
            if data_args.test_file is not None:
                train_extension = data_args.train_file.split(".")[-1]
                test_extension = data_args.test_file.split(".")[-1]
                assert (
                    test_extension == train_extension
                ), "`test_file` should have the same extension (csv or json) as `train_file`."
                data_files["test"] = data_args.test_file
            else:
                raise ValueError("Need either a GLUE task or a test file for `do_predict`.")

        for key in data_files.keys():
            logger.info(f"load a local file for {key}: {data_files[key]}")

        if data_args.train_file.endswith(".csv"):
            # Loading a dataset from local csv files
            raw_datasets = load_dataset(
                "csv",
                data_files=data_files,
                cache_dir=model_args.cache_dir,
                use_auth_token=True if model_args.use_auth_token else None,
            )
        else:
            # Loading a dataset from local json files
            raw_datasets = load_dataset(
                "json",
                data_files=data_files,
                cache_dir=model_args.cache_dir,
                use_auth_token=True if model_args.use_auth_token else None,
            )
    # See more about loading any type of standard or custom dataset at
    # https://huggingface.co/docs/datasets/loading_datasets.html.

    # Labels
    if data_args.task_name is not None:
        is_regression = data_args.task_name == "stsb"
        if not is_regression:
            label_list = raw_datasets["train"].features["label"].names
            num_labels = len(label_list)
        else:
            num_labels = 1
    else:
        # Trying to have good defaults here, don't hesitate to tweak to your needs.
        is_regression = raw_datasets["train"].features["label"].dtype in ["float32", "float64"]
        if is_regression:
            num_labels = 1
        else:
            # A useful fast method:
            # https://huggingface.co/docs/datasets/package_reference/main_classes.html#datasets.Dataset.unique
            label_list = raw_datasets["train"].unique("label")
            label_list.sort()  # Let's sort it for determinism
            num_labels = len(label_list)

    # Load pretrained model and tokenizer
    #
    # In distributed training, the .from_pretrained methods guarantee that only one local process can concurrently
    # download model & vocab.
    config = AutoConfig.from_pretrained(
        model_args.config_name if model_args.config_name else model_args.model_name_or_path,
        num_labels=num_labels,
        finetuning_task=data_args.task_name,
        cache_dir=model_args.cache_dir,
        revision=model_args.model_revision,
        use_auth_token=True if model_args.use_auth_token else None,
    )
    tokenizer = AutoTokenizer.from_pretrained(
        model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path,
        cache_dir=model_args.cache_dir,
        use_fast=model_args.use_fast_tokenizer,
        revision=model_args.model_revision,
        use_auth_token=True if model_args.use_auth_token else None,
    )

    # Pick the model class: one of the custom implementations, or the stock auto class.
    custom_model = model_args.custom_model
    if custom_model is not None:
        if 'roberta' in custom_model and 'roberta' not in model_args.model_name_or_path:
            raise RuntimeError('Model and custom implementation should be the same type: RoBERTa')
        elif 'gpt2' in custom_model and 'gpt2' not in model_args.model_name_or_path:
            raise RuntimeError('Model and custom implementation should be the same type: GPT-2')
        # The flag is derived from the custom model's name and read back by the custom classes.
        config.use_hidden_states = 'hidden' in custom_model
        logger.info(f'Using hidden states in model: {config.use_hidden_states}')
        model_cls = MODEL_NAME_TO_CLASS[custom_model]
    else:
        model_cls = AutoModelForSequenceClassification
    logger.info(f'Using implementation from class: {model_cls.__name__}')
    model = model_cls.from_pretrained(
        model_args.model_name_or_path,
        from_tf=bool(".ckpt" in model_args.model_name_or_path),
        config=config,
        cache_dir=model_args.cache_dir,
        revision=model_args.model_revision,
        use_auth_token=True if model_args.use_auth_token else None,
        ignore_mismatched_sizes=model_args.ignore_mismatched_sizes,
    )
    # Freeze every even-numbered encoder layer of RoBERTa checkpoints.
    if model_args.model_name_or_path.startswith('roberta'):
        for name, param in model.named_parameters():
            if name.startswith('roberta.encoder.layer'):
                # Parameter names look like 'roberta.encoder.layer.<n>....'; index 3 is <n>.
                layer_number = name.split('.')[3]
                if int(layer_number) % 2 == 0:
                    param.requires_grad = False
    # NOTE(review): the report below is emitted for every model, not only RoBERTa - confirm intended scope.
    frozen_layers=[(name,param.requires_grad) for (name, param) in model.named_parameters() if not param.requires_grad]
    print('\n\nFrozen layers:')
    print(frozen_layers,'\n\n')

    # The GPT-2 tokenizer may come without a PAD token; reuse EOS for padding.
    if 'gpt2' in tokenizer.name_or_path and tokenizer.pad_token is None:
        logger.info(f'Set PAD token to EOS: {tokenizer.eos_token}')
        tokenizer._pad_token = tokenizer.eos_token
        model.config.pad_token_id = model.config.eos_token_id

    # Preprocessing the raw_datasets
    if data_args.task_name is not None:
        sentence1_key, sentence2_key = task_to_keys[data_args.task_name]
    else:
        # Again, we try to have some nice defaults but don't hesitate to tweak to your use case.
        non_label_column_names = [name for name in raw_datasets["train"].column_names if name != "label"]
        if "sentence1" in non_label_column_names and "sentence2" in non_label_column_names:
            sentence1_key, sentence2_key = "sentence1", "sentence2"
        else:
            if len(non_label_column_names) >= 2:
                sentence1_key, sentence2_key = non_label_column_names[:2]
            else:
                sentence1_key, sentence2_key = non_label_column_names[0], None

    # Padding strategy
    if data_args.pad_to_max_length:
        padding = "max_length"
    else:
        # We will pad later, dynamically at batch creation, to the max sequence length in each batch
        padding = False

    # Some models have set the order of the labels to use, so let's make sure we do use it.
    label_to_id = None
    if (
        model.config.label2id != PretrainedConfig(num_labels=num_labels).label2id
        and data_args.task_name is not None
        and not is_regression
    ):
        # Some have all caps in their config, some don't.
        label_name_to_id = {k.lower(): v for k, v in model.config.label2id.items()}
        if list(sorted(label_name_to_id.keys())) == list(sorted(label_list)):
            label_to_id = {i: int(label_name_to_id[label_list[i]]) for i in range(num_labels)}
        else:
            # One single message string: a comma after the first literal would turn the
            # rest into %-format arguments and the logging call would fail to format.
            logger.warning(
                "Your model seems to have been trained with labels, but they don't match the dataset: "
                f"model labels: {list(sorted(label_name_to_id.keys()))}, dataset labels: {list(sorted(label_list))}."
                "\nIgnoring the model labels as a result.",
            )
    elif data_args.task_name is None and not is_regression:
        label_to_id = {v: i for i, v in enumerate(label_list)}

    if label_to_id is not None:
        model.config.label2id = label_to_id
        model.config.id2label = {id: label for label, id in config.label2id.items()}
    elif data_args.task_name is not None and not is_regression:
        model.config.label2id = {l: i for i, l in enumerate(label_list)}
        model.config.id2label = {id: label for label, id in config.label2id.items()}

    if data_args.max_seq_length > tokenizer.model_max_length:
        logger.warning(
            f"The max_seq_length passed ({data_args.max_seq_length}) is larger than the maximum length for the "
            f"model ({tokenizer.model_max_length}). Using max_seq_length={tokenizer.model_max_length}."
        )
    max_seq_length = min(data_args.max_seq_length, tokenizer.model_max_length)

    def preprocess_function(examples):
        # Tokenize the texts
        args = (
            (examples[sentence1_key],) if sentence2_key is None else (examples[sentence1_key], examples[sentence2_key])
        )
        result = tokenizer(*args, padding=padding, max_length=max_seq_length, truncation=True)

        # Map labels to IDs (not necessary for GLUE tasks)
        if label_to_id is not None and "label" in examples:
            result["label"] = [(label_to_id[l] if l != -1 else -1) for l in examples["label"]]
        return result

    with training_args.main_process_first(desc="dataset map pre-processing"):
        raw_datasets = raw_datasets.map(
            preprocess_function,
            batched=True,
            load_from_cache_file=not data_args.overwrite_cache,
            desc="Running tokenizer on dataset",
        )
    if training_args.do_train:
        if "train" not in raw_datasets:
            raise ValueError("--do_train requires a train dataset")
        train_dataset = raw_datasets["train"]
        if data_args.max_train_samples is not None:
            max_train_samples = min(len(train_dataset), data_args.max_train_samples)
            train_dataset = train_dataset.select(range(max_train_samples))

    if training_args.do_eval:
        if "validation" not in raw_datasets and "validation_matched" not in raw_datasets:
            raise ValueError("--do_eval requires a validation dataset")
        eval_dataset = raw_datasets["validation_matched" if data_args.task_name == "mnli" else "validation"]
        if data_args.max_eval_samples is not None:
            max_eval_samples = min(len(eval_dataset), data_args.max_eval_samples)
            eval_dataset = eval_dataset.select(range(max_eval_samples))

    if training_args.do_predict or data_args.task_name is not None or data_args.test_file is not None:
        if "test" not in raw_datasets and "test_matched" not in raw_datasets:
            raise ValueError("--do_predict requires a test dataset")
        predict_dataset = raw_datasets["test_matched" if data_args.task_name == "mnli" else "test"]
        if data_args.max_predict_samples is not None:
            max_predict_samples = min(len(predict_dataset), data_args.max_predict_samples)
            predict_dataset = predict_dataset.select(range(max_predict_samples))

    # Log a few random samples from the training set:
    if training_args.do_train:
        for index in random.sample(range(len(train_dataset)), 3):
            logger.info(f"Sample {index} of the training set: {train_dataset[index]}.")

    # Get the metric function
    if data_args.task_name is not None:
        metric = evaluate.load("glue", data_args.task_name)
    else:
        metric = evaluate.load("accuracy")

    # You can define your custom compute_metrics function. It takes an `EvalPrediction` object (a namedtuple with a
    # predictions and label_ids field) and has to return a dictionary string to float.
    def compute_metrics(p: EvalPrediction):
        preds = p.predictions[0] if isinstance(p.predictions, tuple) else p.predictions
        preds = np.squeeze(preds) if is_regression else np.argmax(preds, axis=1)
        if data_args.task_name is not None:
            result = metric.compute(predictions=preds, references=p.label_ids)
            if len(result) > 1:
                result["combined_score"] = np.mean(list(result.values())).item()
            return result
        elif is_regression:
            return {"mse": ((preds - p.label_ids) ** 2).mean().item()}
        else:
            return {"accuracy": (preds == p.label_ids).astype(np.float32).mean().item()}

    # Data collator will default to DataCollatorWithPadding when the tokenizer is passed to Trainer, so we change it if
    # we already did the padding.
    if data_args.pad_to_max_length:
        data_collator = default_data_collator
    elif training_args.fp16:
        data_collator = DataCollatorWithPadding(tokenizer, pad_to_multiple_of=8)
    else:
        data_collator = None

    # Initialize our Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset if training_args.do_train else None,
        eval_dataset=eval_dataset if training_args.do_eval else None,
        compute_metrics=compute_metrics,
        tokenizer=tokenizer,
        data_collator=data_collator,
    )

    # Training
    if training_args.do_train:
        checkpoint = None
        if training_args.resume_from_checkpoint is not None:
            checkpoint = training_args.resume_from_checkpoint
        elif last_checkpoint is not None:
            checkpoint = last_checkpoint
        train_result = trainer.train(resume_from_checkpoint=checkpoint)
        metrics = train_result.metrics
        max_train_samples = (
            data_args.max_train_samples if data_args.max_train_samples is not None else len(train_dataset)
        )
        metrics["train_samples"] = min(max_train_samples, len(train_dataset))

        trainer.save_model()  # Saves the tokenizer too for easy upload

        trainer.log_metrics("train", metrics)
        trainer.save_metrics("train", metrics)
        trainer.save_state()

    # Evaluation
    if training_args.do_eval:
        logger.info("*** Evaluate ***")

        # Loop to handle MNLI double evaluation (matched, mis-matched)
        tasks = [data_args.task_name]
        eval_datasets = [eval_dataset]
        if data_args.task_name == "mnli":
            tasks.append("mnli-mm")
            valid_mm_dataset = raw_datasets["validation_mismatched"]
            if data_args.max_eval_samples is not None:
                max_eval_samples = min(len(valid_mm_dataset), data_args.max_eval_samples)
                valid_mm_dataset = valid_mm_dataset.select(range(max_eval_samples))
            eval_datasets.append(valid_mm_dataset)
            combined = {}

        for eval_dataset, task in zip(eval_datasets, tasks):
            metrics = trainer.evaluate(eval_dataset=eval_dataset)

            max_eval_samples = (
                data_args.max_eval_samples if data_args.max_eval_samples is not None else len(eval_dataset)
            )
            metrics["eval_samples"] = min(max_eval_samples, len(eval_dataset))

            if task == "mnli-mm":
                metrics = {k + "_mm": v for k, v in metrics.items()}
            if task is not None and "mnli" in task:
                combined.update(metrics)

            trainer.log_metrics("eval", metrics)
            trainer.save_metrics("eval", combined if task is not None and "mnli" in task else metrics)

    if training_args.do_predict:
        logger.info("*** Predict ***")

        # Loop to handle MNLI double evaluation (matched, mis-matched)
        tasks = [data_args.task_name]
        predict_datasets = [predict_dataset]
        if data_args.task_name == "mnli":
            tasks.append("mnli-mm")
            predict_datasets.append(raw_datasets["test_mismatched"])

        for predict_dataset, task in zip(predict_datasets, tasks):
            # Removing the `label` columns because it contains -1 and Trainer won't like that.
            predict_dataset = predict_dataset.remove_columns("label")
            predictions = trainer.predict(predict_dataset, metric_key_prefix="predict").predictions
            predictions = np.squeeze(predictions) if is_regression else np.argmax(predictions, axis=1)

            output_predict_file = os.path.join(training_args.output_dir, f"predict_results_{task}.txt")
            if trainer.is_world_process_zero():
                with open(output_predict_file, "w") as writer:
                    logger.info(f"***** Predict results {task} *****")
                    writer.write("index\tprediction\n")
                    for index, item in enumerate(predictions):
                        if is_regression:
                            writer.write(f"{index}\t{item:3.3f}\n")
                        else:
                            item = label_list[item]
                            writer.write(f"{index}\t{item}\n")

    kwargs = {"finetuned_from": model_args.model_name_or_path, "tasks": "text-classification"}
    if data_args.task_name is not None:
        kwargs["language"] = "en"
        kwargs["dataset_tags"] = "glue"
        kwargs["dataset_args"] = data_args.task_name
        kwargs["dataset"] = f"GLUE {data_args.task_name.upper()}"

    if training_args.push_to_hub:
        trainer.push_to_hub(**kwargs)
    else:
        trainer.create_model_card(**kwargs)
from collections import defaultdict
from dataclasses import dataclass, field
import logging
import os
import sys
from typing import Optional

import datasets
from datasets import load_dataset
import evaluate
import numpy as np
import torch
import transformers
from transformers import (
    AutoConfig,
    AutoModelForSeq2SeqLM,
    AutoTokenizer,
    DataCollatorForSeq2Seq,
    HfArgumentParser,
    M2M100Tokenizer,
    MBart50Tokenizer,
    MBart50TokenizerFast,
    MBartTokenizer,
    MBartTokenizerFast,
    Seq2SeqTrainer,
    Seq2SeqTrainingArguments,
    default_data_collator,
    set_seed,
)
from transformers.trainer_utils import get_last_checkpoint
from transformers.utils import check_min_version, send_example_telemetry
from transformers.utils.versions import require_version


# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
check_min_version("4.23.0")

require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/translation/requirements.txt")

logger = logging.getLogger(__name__)

# A list of all multilingual tokenizer which require src_lang and tgt_lang attributes.
MULTILINGUAL_TOKENIZERS = [MBartTokenizer, MBartTokenizerFast, MBart50Tokenizer, MBart50TokenizerFast, M2M100Tokenizer]

# Class id -> class name.
MAP_LABEL_TRANSLATION = {
    0: 'sadness',
    1: 'joy',
    2: 'love',
    3: 'anger',
    4: 'fear',
    5: 'surprise',
}

# Class name -> class id; the model generates label *text*, so metrics need the reverse direction.
_LABEL_NAME_TO_ID = {name: index for index, name in MAP_LABEL_TRANSLATION.items()}

# Sentinels for text that is not a known label. They differ on purpose so that an unparseable
# prediction can never be counted as matching an unparseable reference.
_UNKNOWN_PREDICTION_ID = -1
_UNKNOWN_REFERENCE_ID = -2


def _label_text_to_id(text, unknown=_UNKNOWN_PREDICTION_ID):
    """Map decoded label text to its class id.

    Accepts a class name from `MAP_LABEL_TRANSLATION` (e.g. ``"joy"``) or the decimal form of a
    class id (e.g. ``"1"``). Surrounding whitespace is ignored; matching is case-sensitive.

    Args:
        text: Decoded model output or reference string.
        unknown: Value returned when `text` is not a known label.

    Returns:
        The integer class id, or `unknown`.
    """
    key = text.strip()
    if key in _LABEL_NAME_TO_ID:
        return _LABEL_NAME_TO_ID[key]
    try:
        index = int(key)
    except ValueError:
        return unknown
    return index if index in MAP_LABEL_TRANSLATION else unknown


def _freeze_encoder_blocks(model, blocks=(0, 1), layers=(0, 1)):
    """Disable gradients for selected encoder sub-layers and return the affected parameter names.

    Only parameters named ``encoder.block.<block>.layer.<layer>...`` are considered; any other
    parameter is left untouched.

    NOTE(review): the previous condition was ``int(num_block) and int(num_layer) in [0, 1]``, which
    Python evaluates as ``int(num_block) != 0 and (int(num_layer) in [0, 1])``: block 0 was skipped
    and every other block was frozen. This assumes the intent was "block in [0, 1] and layer in
    [0, 1]" - confirm with the experiment owner.

    Args:
        model: Object exposing ``named_parameters()`` yielding ``(name, parameter)`` pairs.
        blocks: Encoder block indexes to freeze.
        layers: Sub-layer indexes (inside each block) to freeze.

    Returns:
        List of parameter names whose ``requires_grad`` was set to False.
    """
    frozen = []
    for name, param in model.named_parameters():
        if not name.startswith('encoder.block'):
            continue
        parts = name.split('.')
        if int(parts[2]) in blocks and int(parts[4]) in layers:
            param.requires_grad = False
            frozen.append(name)
    return frozen


def _balanced_sample_indexes(labels, max_samples):
    """Pick up to ``max_samples // n_classes`` row indexes per class, in ascending order.

    Args:
        labels: Sequence with the label of every row.
        max_samples: Upper bound on the total number of selected rows.

    Returns:
        Sorted list of selected row indexes; empty when `labels` is empty.
    """
    label_to_indexes = defaultdict(list)
    for index, label in enumerate(labels):
        label_to_indexes[label].append(index)
    if not label_to_indexes:
        # An empty split used to end in a ZeroDivisionError below.
        return []

    max_samples_per_label = max_samples // len(label_to_indexes)
    selected = []
    for label, indexes in label_to_indexes.items():
        selected.extend(indexes[:max_samples_per_label])
        logger.info(f"Set {max_samples_per_label} samples for {label}-class")
    selected.sort()
    return selected


@dataclass
class ModelArguments:
    """
    Command-line options selecting the pretrained model, config and tokenizer to fine-tune.
    """

    model_name_or_path: str = field(
        metadata={"help": "Path to pretrained model or model identifier from huggingface.co/models"}
    )
    config_name: Optional[str] = field(
        default=None, metadata={"help": "Pretrained config name or path if not the same as model_name"}
    )
    tokenizer_name: Optional[str] = field(
        default=None, metadata={"help": "Pretrained tokenizer name or path if not the same as model_name"}
    )
    cache_dir: Optional[str] = field(
        default=None,
        metadata={"help": "Where to store the pretrained models downloaded from huggingface.co"},
    )
    use_fast_tokenizer: bool = field(
        default=True,
        metadata={"help": "Whether to use one of the fast tokenizer (backed by the tokenizers library) or not."},
    )
    model_revision: str = field(
        default="main",
        metadata={"help": "The specific model version to use (can be a branch name, tag name or commit id)."},
    )
    use_auth_token: bool = field(
        default=False,
        metadata={
            "help": (
                "Will use the token generated when running `huggingface-cli login` (necessary to use this script "
                "with private models)."
            )
        },
    )
    # When set, `main` freezes the first encoder blocks (see `_freeze_encoder_blocks`).
    freeze_weights: bool = field(
        default=False,
        metadata={"help": "Freeze encoder weights"},
    )


@dataclass
class DataTrainingArguments:
    """
    Command-line options describing the data used for training, evaluation and prediction.
    """

    # Both are required by `__post_init__`, but `main` reads the fixed "text"/"label" columns instead.
    source_lang: str = field(default=None, metadata={"help": "Source language id for translation."})
    target_lang: str = field(default=None, metadata={"help": "Target language id for translation."})

    dataset_name: Optional[str] = field(
        default=None, metadata={"help": "The name of the dataset to use (via the datasets library)."}
    )
    dataset_config_name: Optional[str] = field(
        default=None, metadata={"help": "The configuration name of the dataset to use (via the datasets library)."}
    )
    train_file: Optional[str] = field(default=None, metadata={"help": "The input training data file (a jsonlines)."})
    validation_file: Optional[str] = field(
        default=None,
        metadata={
            "help": "An optional input evaluation data file to evaluate the metrics (sacrebleu) on a jsonlines file."
        },
    )
    test_file: Optional[str] = field(
        default=None,
        metadata={"help": "An optional input test data file to evaluate the metrics (sacrebleu) on a jsonlines file."},
    )
    overwrite_cache: bool = field(
        default=False, metadata={"help": "Overwrite the cached training and evaluation sets"}
    )
    preprocessing_num_workers: Optional[int] = field(
        default=None,
        metadata={"help": "The number of processes to use for the preprocessing."},
    )
    max_source_length: Optional[int] = field(
        default=1024,
        metadata={
            "help": (
                "The maximum total input sequence length after tokenization. Sequences longer "
                "than this will be truncated, sequences shorter will be padded."
            )
        },
    )
    max_target_length: Optional[int] = field(
        default=128,
        metadata={
            "help": (
                "The maximum total sequence length for target text after tokenization. Sequences longer "
                "than this will be truncated, sequences shorter will be padded."
            )
        },
    )
    val_max_target_length: Optional[int] = field(
        default=None,
        metadata={
            "help": (
                "The maximum total sequence length for validation target text after tokenization. Sequences longer "
                "than this will be truncated, sequences shorter will be padded. Will default to `max_target_length`."
                "This argument is also used to override the ``max_length`` param of ``model.generate``, which is used "
                "during ``evaluate`` and ``predict``."
            )
        },
    )
    pad_to_max_length: bool = field(
        default=False,
        metadata={
            "help": (
                "Whether to pad all samples to model maximum sentence length. "
                "If False, will pad the samples dynamically when batching to the maximum length in the batch. More "
                "efficient on GPU but very bad for TPU."
            )
        },
    )
    max_train_samples: Optional[int] = field(
        default=None,
        metadata={
            "help": (
                "For debugging purposes or quicker training, truncate the number of training examples to this "
                "value if set."
            )
        },
    )
    max_eval_samples: Optional[int] = field(
        default=None,
        metadata={
            "help": (
                "For debugging purposes or quicker training, truncate the number of evaluation examples to this "
                "value if set."
            )
        },
    )
    max_predict_samples: Optional[int] = field(
        default=None,
        metadata={
            "help": (
                "For debugging purposes or quicker training, truncate the number of prediction examples to this "
                "value if set."
            )
        },
    )
    num_beams: Optional[int] = field(
        default=None,
        metadata={
            "help": (
                "Number of beams to use for evaluation. This argument will be passed to ``model.generate``, "
                "which is used during ``evaluate`` and ``predict``."
            )
        },
    )
    ignore_pad_token_for_loss: bool = field(
        default=True,
        metadata={
            "help": "Whether to ignore the tokens corresponding to padded labels in the loss computation or not."
        },
    )
    source_prefix: Optional[str] = field(
        default=None, metadata={"help": "A prefix to add before every source text (useful for T5 models)."}
    )
    forced_bos_token: Optional[str] = field(
        default=None,
        metadata={
            "help": (
                "The token to force as the first generated token after the :obj:`decoder_start_token_id`.Useful for"
                " multilingual models like :doc:`mBART <../model_doc/mbart>` where the first generated token needs to"
                " be the target language token.(Usually it is the target language token)"
            )
        },
    )

    def __post_init__(self):
        """Validate the data options and default `val_max_target_length` to `max_target_length`.

        Raises:
            ValueError: If no dataset name or train/validation file is given, or a language id is missing.
            AssertionError: If `train_file` / `validation_file` does not end in ``.json`` or ``.jsonl``.
        """
        if self.dataset_name is None and self.train_file is None and self.validation_file is None:
            raise ValueError("Need either a dataset name or a training/validation file.")
        elif self.source_lang is None or self.target_lang is None:
            raise ValueError("Need to specify the source language and the target language.")

        # accepting both json and jsonl file extensions, as
        # many jsonlines files actually have a .json extension
        valid_extensions = ["json", "jsonl"]

        if self.train_file is not None:
            extension = self.train_file.split(".")[-1]
            assert extension in valid_extensions, "`train_file` should be a jsonlines file."
        if self.validation_file is not None:
            extension = self.validation_file.split(".")[-1]
            assert extension in valid_extensions, "`validation_file` should be a jsonlines file."
        if self.val_max_target_length is None:
            self.val_max_target_length = self.max_target_length


def main():
    """Fine-tune / evaluate / run a seq2seq model that "translates" a text into its emotion label.

    Arguments come either from a single JSON file given as the only CLI argument or from regular
    command-line flags (see `ModelArguments`, `DataTrainingArguments`, `Seq2SeqTrainingArguments`).

    Returns:
        The `results` dict, which this script never fills (kept for interface compatibility), or
        None when none of `do_train` / `do_eval` / `do_predict` is requested.

    Raises:
        ValueError: For a non-empty output directory without checkpoint, a missing dataset split or
            a missing `decoder_start_token_id`.
        RuntimeError: If `--source_prefix` does not contain the word "classification".
    """
    # See all possible arguments in src/transformers/training_args.py
    # or by passing the --help flag to this script.
    # We now keep distinct sets of args, for a cleaner separation of concerns.

    parser = HfArgumentParser((ModelArguments, DataTrainingArguments, Seq2SeqTrainingArguments))
    if len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
        # If we pass only one argument to the script and it's the path to a json file,
        # let's parse it to get our arguments.
        model_args, data_args, training_args = parser.parse_json_file(json_file=os.path.abspath(sys.argv[1]))
    else:
        model_args, data_args, training_args = parser.parse_args_into_dataclasses()

    # Sending telemetry. Tracking the example usage helps us better allocate resources to maintain them. The
    # information sent is the one passed as arguments along with your Python/PyTorch versions.
    send_example_telemetry("run_translation", model_args, data_args)

    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        handlers=[logging.StreamHandler(sys.stdout)],
    )

    log_level = training_args.get_process_log_level()
    logger.setLevel(log_level)
    datasets.utils.logging.set_verbosity(log_level)
    transformers.utils.logging.set_verbosity(log_level)
    transformers.utils.logging.enable_default_handler()
    transformers.utils.logging.enable_explicit_format()

    # Log on each process the small summary (the ", " joining both halves used to be missing).
    logger.warning(
        f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}, "
        + f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}"
    )
    logger.info(f"Training/evaluation parameters {training_args}")

    if data_args.source_prefix is None and model_args.model_name_or_path in [
        "t5-small",
        "t5-base",
        "t5-large",
        "t5-3b",
        "t5-11b",
    ]:
        logger.warning(
            "You're running a t5 model but didn't provide a source prefix, which is expected, e.g. with "
            "`--source_prefix 'translate English to German: ' `"
        )

    # Detecting last checkpoint.
    last_checkpoint = None
    if os.path.isdir(training_args.output_dir) and training_args.do_train and not training_args.overwrite_output_dir:
        last_checkpoint = get_last_checkpoint(training_args.output_dir)
        if last_checkpoint is None and len(os.listdir(training_args.output_dir)) > 0:
            raise ValueError(
                f"Output directory ({training_args.output_dir}) already exists and is not empty. "
                "Use --overwrite_output_dir to overcome."
            )
        elif last_checkpoint is not None and training_args.resume_from_checkpoint is None:
            logger.info(
                f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change "
                "the `--output_dir` or add `--overwrite_output_dir` to train from scratch."
            )

    # Set seed before initializing model.
    set_seed(training_args.seed)

    # Get the datasets: either the name of a public dataset on the hub or local JSON / JSON Lines files.
    # This script reads the "text" column as the source and the "label" column as the target.
    #
    # In distributed training, the load_dataset function guarantee that only one local process can concurrently
    # download the dataset.
    if data_args.dataset_name is not None:
        # Downloading and loading a dataset from the hub.
        raw_datasets = load_dataset(
            data_args.dataset_name,
            data_args.dataset_config_name,
            cache_dir=model_args.cache_dir,
            use_auth_token=True if model_args.use_auth_token else None,
        )
    else:
        data_files = {}
        if data_args.train_file is not None:
            data_files["train"] = data_args.train_file
            extension = data_args.train_file.split(".")[-1]
        if data_args.validation_file is not None:
            data_files["validation"] = data_args.validation_file
            extension = data_args.validation_file.split(".")[-1]
        if data_args.test_file is not None:
            data_files["test"] = data_args.test_file
            extension = data_args.test_file.split(".")[-1]
        # `.jsonl` files are accepted by `__post_init__`, but the loader that reads them is named "json".
        builder_name = "json" if extension == "jsonl" else extension
        raw_datasets = load_dataset(
            builder_name,
            data_files=data_files,
            cache_dir=model_args.cache_dir,
            use_auth_token=True if model_args.use_auth_token else None,
        )
    # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
    # https://huggingface.co/docs/datasets/loading_datasets.html.

    # Load pretrained model and tokenizer
    #
    # Distributed training:
    # The .from_pretrained methods guarantee that only one local process can concurrently
    # download model & vocab.
    config = AutoConfig.from_pretrained(
        model_args.config_name if model_args.config_name else model_args.model_name_or_path,
        cache_dir=model_args.cache_dir,
        revision=model_args.model_revision,
        use_auth_token=True if model_args.use_auth_token else None,
    )
    tokenizer = AutoTokenizer.from_pretrained(
        model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path,
        cache_dir=model_args.cache_dir,
        use_fast=model_args.use_fast_tokenizer,
        revision=model_args.model_revision,
        use_auth_token=True if model_args.use_auth_token else None,
    )
    model = AutoModelForSeq2SeqLM.from_pretrained(
        model_args.model_name_or_path,
        from_tf=bool(".ckpt" in model_args.model_name_or_path),
        config=config,
        cache_dir=model_args.cache_dir,
        revision=model_args.model_revision,
        use_auth_token=True if model_args.use_auth_token else None,
    )

    # Freezing is triggered by `--freeze_weights` (previously declared but never read) and, as
    # before, implicitly for every `google/t5*` checkpoint.
    if model_args.freeze_weights or model_args.model_name_or_path.startswith('google/t5'):
        _freeze_encoder_blocks(model)
    frozen_layers = [(name, param.requires_grad) for (name, param) in model.named_parameters() if not param.requires_grad]
    print('\n\nFrozen layers:')
    print(frozen_layers, '\n\n')

    model.resize_token_embeddings(len(tokenizer))

    # Set decoder_start_token_id
    if model.config.decoder_start_token_id is None and isinstance(tokenizer, (MBartTokenizer, MBartTokenizerFast)):
        if isinstance(tokenizer, MBartTokenizer):
            model.config.decoder_start_token_id = tokenizer.lang_code_to_id[data_args.target_lang]
        else:
            model.config.decoder_start_token_id = tokenizer.convert_tokens_to_ids(data_args.target_lang)

    if model.config.decoder_start_token_id is None:
        raise ValueError("Make sure that `config.decoder_start_token_id` is correctly defined")

    # The prefix must mention "classification"; it is normalised to end with ": ".
    prefix = data_args.source_prefix if data_args.source_prefix is not None else ""
    if 'classification' not in prefix:
        raise RuntimeError('Not found "classification" prefix!')
    prefix = prefix.strip()
    if not prefix.endswith(':'):
        prefix += ':'
    prefix += ' '
    logger.info(f'Using translation prefix: "{prefix}"')

    # Preprocessing the datasets.
    # We need to tokenize inputs and targets.
    if training_args.do_train:
        column_names = raw_datasets["train"].column_names
    elif training_args.do_eval:
        column_names = raw_datasets["validation"].column_names
    elif training_args.do_predict:
        column_names = raw_datasets["test"].column_names
    else:
        logger.info("There is nothing to do. Please pass `do_train`, `do_eval` and/or `do_predict`.")
        return

    # For translation we set the codes of our source and target languages (only useful for mBART, the others will
    # ignore those attributes).
    if isinstance(tokenizer, tuple(MULTILINGUAL_TOKENIZERS)):
        assert data_args.target_lang is not None and data_args.source_lang is not None, (
            f"{tokenizer.__class__.__name__} is a multilingual tokenizer which requires --source_lang and "
            "--target_lang arguments."
        )

        tokenizer.src_lang = 'text'
        tokenizer.tgt_lang = 'label'

        # For multilingual translation models like mBART-50 and M2M100 we need to force the target language token
        # as the first generated token. We ask the user to explicitly provide this as --forced_bos_token argument.
        forced_bos_token_id = (
            tokenizer.lang_code_to_id[data_args.forced_bos_token] if data_args.forced_bos_token is not None else None
        )
        model.config.forced_bos_token_id = forced_bos_token_id

    # Column names used as source / target (fixed; `--source_lang` / `--target_lang` are not consulted here).
    source_lang = 'text'
    target_lang = 'label'

    # Temporarily set max_target_length for training.
    max_target_length = data_args.max_target_length
    padding = "max_length" if data_args.pad_to_max_length else False

    if training_args.label_smoothing_factor > 0 and not hasattr(model, "prepare_decoder_input_ids_from_labels"):
        logger.warning(
            "label_smoothing is enabled but the `prepare_decoder_input_ids_from_labels` method is not defined for "
            f"`{model.__class__.__name__}`. This will lead to loss being calculated twice and will take up more memory"
        )

    def preprocess_function(examples):
        """Tokenize a batch: prefixed source texts become inputs, label texts become `labels`."""
        inputs = [prefix + text for text in examples[source_lang]]
        targets = list(examples[target_lang])
        model_inputs = tokenizer(inputs, max_length=data_args.max_source_length, padding=padding, truncation=True)

        # Tokenize targets with the `text_target` keyword argument
        labels = tokenizer(text_target=targets, max_length=max_target_length, padding=padding, truncation=True)

        # If we are padding here, replace all tokenizer.pad_token_id in the labels by -100 when we want to ignore
        # padding in the loss.
        if padding == "max_length" and data_args.ignore_pad_token_for_loss:
            labels["input_ids"] = [
                [(token if token != tokenizer.pad_token_id else -100) for token in label]
                for label in labels["input_ids"]
            ]

        model_inputs["labels"] = labels["input_ids"]
        return model_inputs

    if training_args.do_train:
        if "train" not in raw_datasets:
            raise ValueError("--do_train requires a train dataset")
        train_dataset = raw_datasets["train"]
        if data_args.max_train_samples is not None:
            max_train_samples = min(len(train_dataset), data_args.max_train_samples)
            train_dataset = train_dataset.select(range(max_train_samples))
        with training_args.main_process_first(desc="train dataset map pre-processing"):
            train_dataset = train_dataset.map(
                preprocess_function,
                batched=True,
                num_proc=data_args.preprocessing_num_workers,
                remove_columns=column_names,
                load_from_cache_file=not data_args.overwrite_cache,
                desc="Running tokenizer on train dataset",
            )

    if training_args.do_eval:
        max_target_length = data_args.val_max_target_length
        if "validation" not in raw_datasets:
            raise ValueError("--do_eval requires a validation dataset")
        eval_dataset = raw_datasets["validation"]
        if data_args.max_eval_samples is not None:
            # Truncate class-balanced instead of taking the first N rows.
            max_eval_samples = min(len(eval_dataset), data_args.max_eval_samples)
            eval_sample_indexes = _balanced_sample_indexes(eval_dataset['label'], max_eval_samples)
            eval_dataset = eval_dataset.select(eval_sample_indexes)

        with training_args.main_process_first(desc="validation dataset map pre-processing"):
            eval_dataset = eval_dataset.map(
                preprocess_function,
                batched=True,
                num_proc=data_args.preprocessing_num_workers,
                remove_columns=column_names,
                load_from_cache_file=not data_args.overwrite_cache,
                desc="Running tokenizer on validation dataset",
            )

    if training_args.do_predict:
        max_target_length = data_args.val_max_target_length
        if "test" not in raw_datasets:
            raise ValueError("--do_predict requires a test dataset")
        predict_dataset = raw_datasets["test"]
        if data_args.max_predict_samples is not None:
            max_predict_samples = min(len(predict_dataset), data_args.max_predict_samples)
            predict_dataset = predict_dataset.select(range(max_predict_samples))
        with training_args.main_process_first(desc="prediction dataset map pre-processing"):
            predict_dataset = predict_dataset.map(
                preprocess_function,
                batched=True,
                num_proc=data_args.preprocessing_num_workers,
                remove_columns=column_names,
                load_from_cache_file=not data_args.overwrite_cache,
                desc="Running tokenizer on prediction dataset",
            )

    # Data collator
    label_pad_token_id = -100 if data_args.ignore_pad_token_for_loss else tokenizer.pad_token_id
    if data_args.pad_to_max_length:
        data_collator = default_data_collator
    else:
        data_collator = DataCollatorForSeq2Seq(
            tokenizer,
            model=model,
            label_pad_token_id=label_pad_token_id,
            pad_to_multiple_of=8 if training_args.fp16 else None,
        )

    # Metric
    metric = evaluate.load("sacrebleu")
    metric_accuracy = evaluate.load("accuracy")

    def postprocess_text(preds, labels):
        """Strip whitespace; each reference is wrapped in a list, as passed to the sacrebleu metric below."""
        preds = [pred.strip() for pred in preds]
        labels = [[label.strip()] for label in labels]

        return preds, labels

    def compute_metrics(eval_preds):
        """Return BLEU, label accuracy and mean generated length for one evaluation pass."""
        preds, labels = eval_preds
        if isinstance(preds, tuple):
            preds = preds[0]
        decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
        if data_args.ignore_pad_token_for_loss:
            # Replace -100 in the labels as we can't decode them.
            labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
        decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

        # Some simple post-processing
        decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)

        # Accuracy compares class ids. The lookup must go text -> id: querying the id -> name dict
        # with text (as before) missed every time, turning all values into -1 and accuracy into 1.0.
        decoded_preds_accuracy = [
            _label_text_to_id(decoded_pred, _UNKNOWN_PREDICTION_ID) for decoded_pred in decoded_preds
        ]
        decoded_labels_accuracy = [
            _label_text_to_id(decoded_label[0], _UNKNOWN_REFERENCE_ID) for decoded_label in decoded_labels
        ]

        result = metric.compute(predictions=decoded_preds, references=decoded_labels)
        result_accuracy = metric_accuracy.compute(
            predictions=decoded_preds_accuracy, references=decoded_labels_accuracy
        )
        result = {"bleu": result["score"], "accuracy": result_accuracy["accuracy"]}

        prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in preds]
        result["gen_len"] = np.mean(prediction_lens)
        result = {k: round(v, 4) for k, v in result.items()}
        return result

    # Initialize our Trainer
    trainer = Seq2SeqTrainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset if training_args.do_train else None,
        eval_dataset=eval_dataset if training_args.do_eval else None,
        tokenizer=tokenizer,
        data_collator=data_collator,
        compute_metrics=compute_metrics if training_args.predict_with_generate else None,
    )

    # Training
    if training_args.do_train:
        checkpoint = None
        if training_args.resume_from_checkpoint is not None:
            checkpoint = training_args.resume_from_checkpoint
        elif last_checkpoint is not None:
            checkpoint = last_checkpoint
        train_result = trainer.train(resume_from_checkpoint=checkpoint)
        trainer.save_model()  # Saves the tokenizer too for easy upload

        metrics = train_result.metrics
        max_train_samples = (
            data_args.max_train_samples if data_args.max_train_samples is not None else len(train_dataset)
        )
        metrics["train_samples"] = min(max_train_samples, len(train_dataset))

        trainer.log_metrics("train", metrics)
        trainer.save_metrics("train", metrics)
        trainer.save_state()

    # Evaluation
    results = {}
    max_length = (
        training_args.generation_max_length
        if training_args.generation_max_length is not None
        else data_args.val_max_target_length
    )
    num_beams = data_args.num_beams if data_args.num_beams is not None else training_args.generation_num_beams
    if training_args.do_eval:
        logger.info("*** Evaluate ***")

        metrics = trainer.evaluate(max_length=max_length, num_beams=num_beams, metric_key_prefix="eval")
        max_eval_samples = data_args.max_eval_samples if data_args.max_eval_samples is not None else len(eval_dataset)
        metrics["eval_samples"] = min(max_eval_samples, len(eval_dataset))

        trainer.log_metrics("eval", metrics)
        trainer.save_metrics("eval", metrics)

    if training_args.do_predict:
        logger.info("*** Predict ***")

        predict_results = trainer.predict(
            predict_dataset, metric_key_prefix="predict", max_length=max_length, num_beams=num_beams
        )
        metrics = predict_results.metrics
        max_predict_samples = (
            data_args.max_predict_samples if data_args.max_predict_samples is not None else len(predict_dataset)
        )
        metrics["predict_samples"] = min(max_predict_samples, len(predict_dataset))

        trainer.log_metrics("predict", metrics)
        trainer.save_metrics("predict", metrics)

        if trainer.is_world_process_zero():
            if training_args.predict_with_generate:
                predictions = tokenizer.batch_decode(
                    predict_results.predictions, skip_special_tokens=True, clean_up_tokenization_spaces=True
                )
                predictions = [pred.strip() for pred in predictions]
                output_prediction_file = os.path.join(training_args.output_dir, "generated_predictions.txt")
                with open(output_prediction_file, "w", encoding="utf-8") as writer:
                    writer.write("\n".join(predictions))

    kwargs = {"finetuned_from": model_args.model_name_or_path, "tasks": "translation"}
    if data_args.dataset_name is not None:
        kwargs["dataset_tags"] = data_args.dataset_name
        if data_args.dataset_config_name is not None:
            kwargs["dataset_args"] = data_args.dataset_config_name
            kwargs["dataset"] = f"{data_args.dataset_name} {data_args.dataset_config_name}"
        else:
            kwargs["dataset"] = data_args.dataset_name

    languages = [lang for lang in [data_args.source_lang, data_args.target_lang] if lang is not None]
    if len(languages) > 0:
        kwargs["language"] = languages

    if training_args.push_to_hub:
        trainer.push_to_hub(**kwargs)
    else:
        trainer.create_model_card(**kwargs)

    return results


def _mp_fn(index):
    """Entry point for xla_spawn (TPUs); `index` is accepted but unused."""
    main()


if __name__ == "__main__":
    main()