{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "!pip install transformers torch datasets evaluate scikit-learn sacremoses sentencepiece ipywidgets > /dev/null" ] }, { "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ "# Roberta" ] }, { "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ "## Modifications" ] }, { "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ "- Custom classification head with bigger hidden size\n", "- Changed activation function to GELU" ] }, { "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ "## Code" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "from torch import nn\n", "from transformers import RobertaForSequenceClassification, RobertaModel\n", "\n", "\n", "# Simple version #\n", "\n", "class RobertaClassificationHeadCustomSimple(nn.Module):\n", " \"\"\"Head for sentence-level classification tasks.\"\"\"\n", "\n", " def __init__(self, config):\n", " super().__init__()\n", " hidden_size = config.hidden_size\n", " self.dense_1 = nn.Linear(hidden_size, 4 * hidden_size)\n", " self.dense_2 = nn.Linear(4 * hidden_size, hidden_size)\n", " classifier_dropout = (\n", " config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob\n", " )\n", " self.dropout = nn.Dropout(classifier_dropout)\n", " self.out_proj = nn.Linear(hidden_size, config.num_labels)\n", " self.activation = nn.GELU()\n", "\n", " def forward(self, features, **kwargs):\n", " x = features[:, 0, :] # take token (equiv. to [CLS])\n", "\n", " x = self.dense_1(x)\n", " x = self.activation(x)\n", " x = self.dropout(x)\n", "\n", " x = self.dense_2(x)\n", " x = self.activation(x)\n", " x = self.dropout(x)\n", "\n", " x = self.out_proj(x)\n", " return x\n", "\n", "\n", "class RobertaForSequenceClassificationCustomSimple(RobertaForSequenceClassification):\n", " _keys_to_ignore_on_load_missing = [r\"position_ids\"]\n", "\n", " def __init__(self, config):\n", " super().__init__(config)\n", " self.num_labels = config.num_labels\n", " self.config = config\n", "\n", " self.roberta = RobertaModel(config, add_pooling_layer=False)\n", " self.classifier = RobertaClassificationHeadCustomSimple(config)\n", "\n", " # Initialize weights and apply final processing\n", " self.post_init()\n" ] }, { "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ "## Model" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassificationCustomSimple: ['roberta.pooler.dense.weight', 'lm_head.dense.bias', 'lm_head.decoder.weight', 'lm_head.bias', 'lm_head.dense.weight', 'roberta.pooler.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias']\n", "- This IS expected if you are initializing RobertaForSequenceClassificationCustomSimple from the checkpoint of a model trained on another task or with another architecture (e.g. 
initializing a BertForSequenceClassification model from a BertForPreTraining model).\n", "- This IS NOT expected if you are initializing RobertaForSequenceClassificationCustomSimple from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n", "Some weights of RobertaForSequenceClassificationCustomSimple were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense_1.weight', 'classifier.out_proj.bias', 'classifier.dense_2.bias', 'classifier.dense_2.weight', 'classifier.out_proj.weight', 'classifier.dense_1.bias']\n", "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n" ] }, { "data": { "text/plain": [ "RobertaForSequenceClassificationCustomSimple(\n", " (roberta): RobertaModel(\n", " (embeddings): RobertaEmbeddings(\n", " (word_embeddings): Embedding(50265, 768, padding_idx=1)\n", " (position_embeddings): Embedding(514, 768, padding_idx=1)\n", " (token_type_embeddings): Embedding(1, 768)\n", " (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " (encoder): RobertaEncoder(\n", " (layer): ModuleList(\n", " (0): RobertaLayer(\n", " (attention): RobertaAttention(\n", " (self): RobertaSelfAttention(\n", " (query): Linear(in_features=768, out_features=768, bias=True)\n", " (key): Linear(in_features=768, out_features=768, bias=True)\n", " (value): Linear(in_features=768, out_features=768, bias=True)\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " (output): RobertaSelfOutput(\n", " (dense): Linear(in_features=768, out_features=768, bias=True)\n", " (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " )\n", " (intermediate): RobertaIntermediate(\n", " (dense): Linear(in_features=768, out_features=3072, bias=True)\n", " (intermediate_act_fn): GELUActivation()\n", " )\n", " (output): RobertaOutput(\n", " (dense): Linear(in_features=3072, out_features=768, bias=True)\n", " (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " )\n", " (1): RobertaLayer(\n", " (attention): RobertaAttention(\n", " (self): RobertaSelfAttention(\n", " (query): Linear(in_features=768, out_features=768, bias=True)\n", " (key): Linear(in_features=768, out_features=768, bias=True)\n", " (value): Linear(in_features=768, out_features=768, bias=True)\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " (output): RobertaSelfOutput(\n", " (dense): Linear(in_features=768, out_features=768, bias=True)\n", " (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " )\n", " (intermediate): RobertaIntermediate(\n", " (dense): Linear(in_features=768, out_features=3072, bias=True)\n", " (intermediate_act_fn): GELUActivation()\n", " )\n", " (output): RobertaOutput(\n", " (dense): Linear(in_features=3072, out_features=768, bias=True)\n", " (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " )\n", " (2): RobertaLayer(\n", " (attention): RobertaAttention(\n", " (self): RobertaSelfAttention(\n", " (query): Linear(in_features=768, out_features=768, bias=True)\n", " (key): Linear(in_features=768, out_features=768, bias=True)\n", " (value): 
Linear(in_features=768, out_features=768, bias=True)\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " (output): RobertaSelfOutput(\n", " (dense): Linear(in_features=768, out_features=768, bias=True)\n", " (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " )\n", " (intermediate): RobertaIntermediate(\n", " (dense): Linear(in_features=768, out_features=3072, bias=True)\n", " (intermediate_act_fn): GELUActivation()\n", " )\n", " (output): RobertaOutput(\n", " (dense): Linear(in_features=3072, out_features=768, bias=True)\n", " (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " )\n", " (3): RobertaLayer(\n", " (attention): RobertaAttention(\n", " (self): RobertaSelfAttention(\n", " (query): Linear(in_features=768, out_features=768, bias=True)\n", " (key): Linear(in_features=768, out_features=768, bias=True)\n", " (value): Linear(in_features=768, out_features=768, bias=True)\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " (output): RobertaSelfOutput(\n", " (dense): Linear(in_features=768, out_features=768, bias=True)\n", " (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " )\n", " (intermediate): RobertaIntermediate(\n", " (dense): Linear(in_features=768, out_features=3072, bias=True)\n", " (intermediate_act_fn): GELUActivation()\n", " )\n", " (output): RobertaOutput(\n", " (dense): Linear(in_features=3072, out_features=768, bias=True)\n", " (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " )\n", " (4): RobertaLayer(\n", " (attention): RobertaAttention(\n", " (self): RobertaSelfAttention(\n", " (query): Linear(in_features=768, out_features=768, bias=True)\n", " (key): Linear(in_features=768, out_features=768, bias=True)\n", " (value): Linear(in_features=768, out_features=768, bias=True)\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " (output): RobertaSelfOutput(\n", " (dense): Linear(in_features=768, out_features=768, bias=True)\n", " (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " )\n", " (intermediate): RobertaIntermediate(\n", " (dense): Linear(in_features=768, out_features=3072, bias=True)\n", " (intermediate_act_fn): GELUActivation()\n", " )\n", " (output): RobertaOutput(\n", " (dense): Linear(in_features=3072, out_features=768, bias=True)\n", " (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " )\n", " (5): RobertaLayer(\n", " (attention): RobertaAttention(\n", " (self): RobertaSelfAttention(\n", " (query): Linear(in_features=768, out_features=768, bias=True)\n", " (key): Linear(in_features=768, out_features=768, bias=True)\n", " (value): Linear(in_features=768, out_features=768, bias=True)\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " (output): RobertaSelfOutput(\n", " (dense): Linear(in_features=768, out_features=768, bias=True)\n", " (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " )\n", " (intermediate): RobertaIntermediate(\n", " (dense): Linear(in_features=768, out_features=3072, bias=True)\n", " (intermediate_act_fn): GELUActivation()\n", " )\n", " (output): RobertaOutput(\n", " (dense): 
Linear(in_features=3072, out_features=768, bias=True)\n", " (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " )\n", " (6): RobertaLayer(\n", " (attention): RobertaAttention(\n", " (self): RobertaSelfAttention(\n", " (query): Linear(in_features=768, out_features=768, bias=True)\n", " (key): Linear(in_features=768, out_features=768, bias=True)\n", " (value): Linear(in_features=768, out_features=768, bias=True)\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " (output): RobertaSelfOutput(\n", " (dense): Linear(in_features=768, out_features=768, bias=True)\n", " (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " )\n", " (intermediate): RobertaIntermediate(\n", " (dense): Linear(in_features=768, out_features=3072, bias=True)\n", " (intermediate_act_fn): GELUActivation()\n", " )\n", " (output): RobertaOutput(\n", " (dense): Linear(in_features=3072, out_features=768, bias=True)\n", " (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " )\n", " (7): RobertaLayer(\n", " (attention): RobertaAttention(\n", " (self): RobertaSelfAttention(\n", " (query): Linear(in_features=768, out_features=768, bias=True)\n", " (key): Linear(in_features=768, out_features=768, bias=True)\n", " (value): Linear(in_features=768, out_features=768, bias=True)\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " (output): RobertaSelfOutput(\n", " (dense): Linear(in_features=768, out_features=768, bias=True)\n", " (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " )\n", " (intermediate): RobertaIntermediate(\n", " (dense): Linear(in_features=768, out_features=3072, bias=True)\n", " (intermediate_act_fn): GELUActivation()\n", " )\n", " (output): RobertaOutput(\n", " (dense): Linear(in_features=3072, out_features=768, bias=True)\n", " (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " )\n", " (8): RobertaLayer(\n", " (attention): RobertaAttention(\n", " (self): RobertaSelfAttention(\n", " (query): Linear(in_features=768, out_features=768, bias=True)\n", " (key): Linear(in_features=768, out_features=768, bias=True)\n", " (value): Linear(in_features=768, out_features=768, bias=True)\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " (output): RobertaSelfOutput(\n", " (dense): Linear(in_features=768, out_features=768, bias=True)\n", " (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " )\n", " (intermediate): RobertaIntermediate(\n", " (dense): Linear(in_features=768, out_features=3072, bias=True)\n", " (intermediate_act_fn): GELUActivation()\n", " )\n", " (output): RobertaOutput(\n", " (dense): Linear(in_features=3072, out_features=768, bias=True)\n", " (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " )\n", " (9): RobertaLayer(\n", " (attention): RobertaAttention(\n", " (self): RobertaSelfAttention(\n", " (query): Linear(in_features=768, out_features=768, bias=True)\n", " (key): Linear(in_features=768, out_features=768, bias=True)\n", " (value): Linear(in_features=768, out_features=768, bias=True)\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " (output): 
RobertaSelfOutput(\n", " (dense): Linear(in_features=768, out_features=768, bias=True)\n", " (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " )\n", " (intermediate): RobertaIntermediate(\n", " (dense): Linear(in_features=768, out_features=3072, bias=True)\n", " (intermediate_act_fn): GELUActivation()\n", " )\n", " (output): RobertaOutput(\n", " (dense): Linear(in_features=3072, out_features=768, bias=True)\n", " (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " )\n", " (10): RobertaLayer(\n", " (attention): RobertaAttention(\n", " (self): RobertaSelfAttention(\n", " (query): Linear(in_features=768, out_features=768, bias=True)\n", " (key): Linear(in_features=768, out_features=768, bias=True)\n", " (value): Linear(in_features=768, out_features=768, bias=True)\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " (output): RobertaSelfOutput(\n", " (dense): Linear(in_features=768, out_features=768, bias=True)\n", " (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " )\n", " (intermediate): RobertaIntermediate(\n", " (dense): Linear(in_features=768, out_features=3072, bias=True)\n", " (intermediate_act_fn): GELUActivation()\n", " )\n", " (output): RobertaOutput(\n", " (dense): Linear(in_features=3072, out_features=768, bias=True)\n", " (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " )\n", " (11): RobertaLayer(\n", " (attention): RobertaAttention(\n", " (self): RobertaSelfAttention(\n", " (query): Linear(in_features=768, out_features=768, bias=True)\n", " (key): Linear(in_features=768, out_features=768, bias=True)\n", " (value): Linear(in_features=768, out_features=768, bias=True)\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " (output): RobertaSelfOutput(\n", " (dense): Linear(in_features=768, out_features=768, bias=True)\n", " (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " )\n", " (intermediate): RobertaIntermediate(\n", " (dense): Linear(in_features=768, out_features=3072, bias=True)\n", " (intermediate_act_fn): GELUActivation()\n", " )\n", " (output): RobertaOutput(\n", " (dense): Linear(in_features=3072, out_features=768, bias=True)\n", " (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " )\n", " )\n", " )\n", " )\n", " (classifier): RobertaClassificationHeadCustomSimple(\n", " (dense_1): Linear(in_features=768, out_features=3072, bias=True)\n", " (dense_2): Linear(in_features=3072, out_features=768, bias=True)\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " (out_proj): Linear(in_features=768, out_features=2, bias=True)\n", " (activation): GELU(approximate='none')\n", " )\n", ")" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "RobertaForSequenceClassificationCustomSimple.from_pretrained(\"roberta-base\")" ] }, { "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ "## Training" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "02/16/2023 15:21:14 - WARNING - __main__ - Process rank: -1, device: cuda:0, n_gpu: 1distributed training: False, 16-bits training: 
False\n", "02/16/2023 15:21:14 - INFO - __main__ - Training/evaluation parameters TrainingArguments(\n", "_n_gpu=1,\n", "adafactor=False,\n", "adam_beta1=0.9,\n", "adam_beta2=0.999,\n", "adam_epsilon=1e-08,\n", "auto_find_batch_size=False,\n", "bf16=False,\n", "bf16_full_eval=False,\n", "data_seed=None,\n", "dataloader_drop_last=False,\n", "dataloader_num_workers=0,\n", "dataloader_pin_memory=True,\n", "ddp_bucket_cap_mb=None,\n", "ddp_find_unused_parameters=None,\n", "ddp_timeout=1800,\n", "debug=[],\n", "deepspeed=None,\n", "disable_tqdm=False,\n", "do_eval=True,\n", "do_predict=True,\n", "do_train=True,\n", "eval_accumulation_steps=None,\n", "eval_delay=0,\n", "eval_steps=250,\n", "evaluation_strategy=steps,\n", "fp16=False,\n", "fp16_backend=auto,\n", "fp16_full_eval=False,\n", "fp16_opt_level=O1,\n", "fsdp=[],\n", "fsdp_min_num_params=0,\n", "fsdp_transformer_layer_cls_to_wrap=None,\n", "full_determinism=False,\n", "gradient_accumulation_steps=1,\n", "gradient_checkpointing=False,\n", "greater_is_better=True,\n", "group_by_length=False,\n", "half_precision_backend=auto,\n", "hub_model_id=None,\n", "hub_private_repo=False,\n", "hub_strategy=every_save,\n", "hub_token=,\n", "ignore_data_skip=False,\n", "include_inputs_for_metrics=False,\n", "jit_mode_eval=False,\n", "label_names=None,\n", "label_smoothing_factor=0.0,\n", "learning_rate=2e-05,\n", "length_column_name=length,\n", "load_best_model_at_end=True,\n", "local_rank=-1,\n", "log_level=passive,\n", "log_level_replica=passive,\n", "log_on_each_node=True,\n", "logging_dir=out/roberta/runs/Feb16_15-21-13_DESKTOP-R7JO8BQ,\n", "logging_first_step=False,\n", "logging_nan_inf_filter=True,\n", "logging_steps=100,\n", "logging_strategy=steps,\n", "lr_scheduler_type=linear,\n", "max_grad_norm=1.0,\n", "max_steps=2500,\n", "metric_for_best_model=accuracy,\n", "mp_parameters=,\n", "no_cuda=False,\n", "num_train_epochs=1.0,\n", "optim=adamw_hf,\n", "optim_args=None,\n", "output_dir=out/roberta,\n", "overwrite_output_dir=False,\n", "past_index=-1,\n", "per_device_eval_batch_size=8,\n", "per_device_train_batch_size=8,\n", "prediction_loss_only=False,\n", "push_to_hub=False,\n", "push_to_hub_model_id=None,\n", "push_to_hub_organization=None,\n", "push_to_hub_token=,\n", "ray_scope=last,\n", "remove_unused_columns=True,\n", "report_to=[],\n", "resume_from_checkpoint=None,\n", "run_name=out/roberta,\n", "save_on_each_node=False,\n", "save_steps=250,\n", "save_strategy=steps,\n", "save_total_limit=5,\n", "seed=42,\n", "sharded_ddp=[],\n", "skip_memory_metrics=True,\n", "tf32=None,\n", "torch_compile=False,\n", "torch_compile_backend=None,\n", "torch_compile_mode=None,\n", "torchdynamo=None,\n", "tpu_metrics_debug=False,\n", "tpu_num_cores=None,\n", "use_ipex=False,\n", "use_legacy_prediction_loop=False,\n", "use_mps_device=False,\n", "warmup_ratio=0.0,\n", "warmup_steps=0,\n", "weight_decay=0.0,\n", "xpu_backend=None,\n", ")\n", "02/16/2023 15:21:14 - INFO - __main__ - Checkpoint detected, resuming training at out/roberta/checkpoint-2500. 
To avoid this behavior, change the `--output_dir` or add `--overwrite_output_dir` to train from scratch.\n", "02/16/2023 15:21:14 - INFO - __main__ - load a local file for train: data/train.json\n", "02/16/2023 15:21:14 - INFO - __main__ - load a local file for validation: data/valid.json\n", "02/16/2023 15:21:14 - INFO - __main__ - load a local file for test: data/test.json\n", "02/16/2023 15:21:14 - WARNING - datasets.builder - Using custom data configuration default-f6e8039906850c57\n", "02/16/2023 15:21:14 - INFO - datasets.info - Loading Dataset Infos from /home/jacob/anaconda3/envs/ugp/lib/python3.10/site-packages/datasets/packaged_modules/json\n", "02/16/2023 15:21:14 - INFO - datasets.builder - Overwrite dataset info from restored data version.\n", "02/16/2023 15:21:14 - INFO - datasets.info - Loading Dataset info from .cache_training/json/default-f6e8039906850c57/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51\n", "02/16/2023 15:21:14 - WARNING - datasets.builder - Found cached dataset json (/home/jacob/code/university/uczenie_glebokie_w_przetwarzaniu_tekstu/.cache_training/json/default-f6e8039906850c57/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51)\n", "02/16/2023 15:21:14 - INFO - datasets.info - Loading Dataset info from /home/jacob/code/university/uczenie_glebokie_w_przetwarzaniu_tekstu/.cache_training/json/default-f6e8039906850c57/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51\n", "100%|█████████████████████████████████████████████| 3/3 [00:00<00:00, 48.00it/s]\n", "[INFO|configuration_utils.py:660] 2023-02-16 15:21:15,174 >> loading configuration file config.json from cache at .cache_training/models--roberta-base/snapshots/ff46155979338ff8063cdad90908b498ab91b181/config.json\n", "[INFO|configuration_utils.py:712] 2023-02-16 15:21:15,175 >> Model config RobertaConfig {\n", " \"_name_or_path\": \"roberta-base\",\n", " \"architectures\": [\n", " \"RobertaForMaskedLM\"\n", " ],\n", " \"attention_probs_dropout_prob\": 0.1,\n", " \"bos_token_id\": 0,\n", " \"classifier_dropout\": null,\n", " \"eos_token_id\": 2,\n", " \"hidden_act\": \"gelu\",\n", " \"hidden_dropout_prob\": 0.1,\n", " \"hidden_size\": 768,\n", " \"id2label\": {\n", " \"0\": \"LABEL_0\",\n", " \"1\": \"LABEL_1\",\n", " \"2\": \"LABEL_2\",\n", " \"3\": \"LABEL_3\"\n", " },\n", " \"initializer_range\": 0.02,\n", " \"intermediate_size\": 3072,\n", " \"label2id\": {\n", " \"LABEL_0\": 0,\n", " \"LABEL_1\": 1,\n", " \"LABEL_2\": 2,\n", " \"LABEL_3\": 3\n", " },\n", " \"layer_norm_eps\": 1e-05,\n", " \"max_position_embeddings\": 514,\n", " \"model_type\": \"roberta\",\n", " \"num_attention_heads\": 12,\n", " \"num_hidden_layers\": 12,\n", " \"pad_token_id\": 1,\n", " \"position_embedding_type\": \"absolute\",\n", " \"transformers_version\": \"4.26.1\",\n", " \"type_vocab_size\": 1,\n", " \"use_cache\": true,\n", " \"vocab_size\": 50265\n", "}\n", "\n", "[INFO|tokenization_auto.py:458] 2023-02-16 15:21:15,654 >> Could not locate the tokenizer configuration file, will try to use the model config instead.\n", "[INFO|configuration_utils.py:660] 2023-02-16 15:21:16,123 >> loading configuration file config.json from cache at .cache_training/models--roberta-base/snapshots/ff46155979338ff8063cdad90908b498ab91b181/config.json\n", "[INFO|configuration_utils.py:712] 2023-02-16 15:21:16,123 >> Model config RobertaConfig {\n", " \"_name_or_path\": \"roberta-base\",\n", " \"architectures\": [\n", " \"RobertaForMaskedLM\"\n", " ],\n", " 
\"attention_probs_dropout_prob\": 0.1,\n", " \"bos_token_id\": 0,\n", " \"classifier_dropout\": null,\n", " \"eos_token_id\": 2,\n", " \"hidden_act\": \"gelu\",\n", " \"hidden_dropout_prob\": 0.1,\n", " \"hidden_size\": 768,\n", " \"initializer_range\": 0.02,\n", " \"intermediate_size\": 3072,\n", " \"layer_norm_eps\": 1e-05,\n", " \"max_position_embeddings\": 514,\n", " \"model_type\": \"roberta\",\n", " \"num_attention_heads\": 12,\n", " \"num_hidden_layers\": 12,\n", " \"pad_token_id\": 1,\n", " \"position_embedding_type\": \"absolute\",\n", " \"transformers_version\": \"4.26.1\",\n", " \"type_vocab_size\": 1,\n", " \"use_cache\": true,\n", " \"vocab_size\": 50265\n", "}\n", "\n", "[INFO|tokenization_utils_base.py:1802] 2023-02-16 15:21:17,045 >> loading file vocab.json from cache at .cache_training/models--roberta-base/snapshots/ff46155979338ff8063cdad90908b498ab91b181/vocab.json\n", "[INFO|tokenization_utils_base.py:1802] 2023-02-16 15:21:17,045 >> loading file merges.txt from cache at .cache_training/models--roberta-base/snapshots/ff46155979338ff8063cdad90908b498ab91b181/merges.txt\n", "[INFO|tokenization_utils_base.py:1802] 2023-02-16 15:21:17,045 >> loading file tokenizer.json from cache at .cache_training/models--roberta-base/snapshots/ff46155979338ff8063cdad90908b498ab91b181/tokenizer.json\n", "[INFO|tokenization_utils_base.py:1802] 2023-02-16 15:21:17,045 >> loading file added_tokens.json from cache at None\n", "[INFO|tokenization_utils_base.py:1802] 2023-02-16 15:21:17,045 >> loading file special_tokens_map.json from cache at None\n", "[INFO|tokenization_utils_base.py:1802] 2023-02-16 15:21:17,045 >> loading file tokenizer_config.json from cache at None\n", "[INFO|configuration_utils.py:660] 2023-02-16 15:21:17,045 >> loading configuration file config.json from cache at .cache_training/models--roberta-base/snapshots/ff46155979338ff8063cdad90908b498ab91b181/config.json\n", "[INFO|configuration_utils.py:712] 2023-02-16 15:21:17,046 >> Model config RobertaConfig {\n", " \"_name_or_path\": \"roberta-base\",\n", " \"architectures\": [\n", " \"RobertaForMaskedLM\"\n", " ],\n", " \"attention_probs_dropout_prob\": 0.1,\n", " \"bos_token_id\": 0,\n", " \"classifier_dropout\": null,\n", " \"eos_token_id\": 2,\n", " \"hidden_act\": \"gelu\",\n", " \"hidden_dropout_prob\": 0.1,\n", " \"hidden_size\": 768,\n", " \"initializer_range\": 0.02,\n", " \"intermediate_size\": 3072,\n", " \"layer_norm_eps\": 1e-05,\n", " \"max_position_embeddings\": 514,\n", " \"model_type\": \"roberta\",\n", " \"num_attention_heads\": 12,\n", " \"num_hidden_layers\": 12,\n", " \"pad_token_id\": 1,\n", " \"position_embedding_type\": \"absolute\",\n", " \"transformers_version\": \"4.26.1\",\n", " \"type_vocab_size\": 1,\n", " \"use_cache\": true,\n", " \"vocab_size\": 50265\n", "}\n", "\n", "02/16/2023 15:21:17 - INFO - __main__ - Using hidden states in model: False\n", "-------------------------------------------------------- Using hidden: False\n", "02/16/2023 15:21:17 - INFO - __main__ - Using implementation from class: RobertaForSequenceClassificationCustomSimple\n", "[INFO|modeling_utils.py:2275] 2023-02-16 15:21:17,101 >> loading weights file pytorch_model.bin from cache at .cache_training/models--roberta-base/snapshots/ff46155979338ff8063cdad90908b498ab91b181/pytorch_model.bin\n", "[WARNING|modeling_utils.py:2847] 2023-02-16 15:21:22,965 >> Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassificationCustomSimple: ['lm_head.dense.weight', 'lm_head.bias', 
'lm_head.dense.bias', 'lm_head.decoder.weight', 'lm_head.layer_norm.bias', 'roberta.pooler.dense.weight', 'lm_head.layer_norm.weight', 'roberta.pooler.dense.bias']\n", "- This IS expected if you are initializing RobertaForSequenceClassificationCustomSimple from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).\n", "- This IS NOT expected if you are initializing RobertaForSequenceClassificationCustomSimple from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n", "[WARNING|modeling_utils.py:2859] 2023-02-16 15:21:22,965 >> Some weights of RobertaForSequenceClassificationCustomSimple were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense_1.bias', 'classifier.dense_2.weight', 'classifier.out_proj.weight', 'classifier.dense_2.bias', 'classifier.out_proj.bias', 'classifier.dense_1.weight']\n", "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n", "RobertaForSequenceClassificationCustomSimple(\n", " (roberta): RobertaModel(\n", " (embeddings): RobertaEmbeddings(\n", " (word_embeddings): Embedding(50265, 768, padding_idx=1)\n", " (position_embeddings): Embedding(514, 768, padding_idx=1)\n", " (token_type_embeddings): Embedding(1, 768)\n", " (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " (encoder): RobertaEncoder(\n", " (layer): ModuleList(\n", " (0): RobertaLayer(\n", " (attention): RobertaAttention(\n", " (self): RobertaSelfAttention(\n", " (query): Linear(in_features=768, out_features=768, bias=True)\n", " (key): Linear(in_features=768, out_features=768, bias=True)\n", " (value): Linear(in_features=768, out_features=768, bias=True)\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " (output): RobertaSelfOutput(\n", " (dense): Linear(in_features=768, out_features=768, bias=True)\n", " (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " )\n", " (intermediate): RobertaIntermediate(\n", " (dense): Linear(in_features=768, out_features=3072, bias=True)\n", " (intermediate_act_fn): GELUActivation()\n", " )\n", " (output): RobertaOutput(\n", " (dense): Linear(in_features=3072, out_features=768, bias=True)\n", " (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " )\n", " (1): RobertaLayer(\n", " (attention): RobertaAttention(\n", " (self): RobertaSelfAttention(\n", " (query): Linear(in_features=768, out_features=768, bias=True)\n", " (key): Linear(in_features=768, out_features=768, bias=True)\n", " (value): Linear(in_features=768, out_features=768, bias=True)\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " (output): RobertaSelfOutput(\n", " (dense): Linear(in_features=768, out_features=768, bias=True)\n", " (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " )\n", " (intermediate): RobertaIntermediate(\n", " (dense): Linear(in_features=768, out_features=3072, bias=True)\n", " (intermediate_act_fn): GELUActivation()\n", " )\n", " (output): RobertaOutput(\n", " (dense): Linear(in_features=3072, out_features=768, bias=True)\n", " 
(LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " )\n", " (2): RobertaLayer(\n", " (attention): RobertaAttention(\n", " (self): RobertaSelfAttention(\n", " (query): Linear(in_features=768, out_features=768, bias=True)\n", " (key): Linear(in_features=768, out_features=768, bias=True)\n", " (value): Linear(in_features=768, out_features=768, bias=True)\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " (output): RobertaSelfOutput(\n", " (dense): Linear(in_features=768, out_features=768, bias=True)\n", " (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " )\n", " (intermediate): RobertaIntermediate(\n", " (dense): Linear(in_features=768, out_features=3072, bias=True)\n", " (intermediate_act_fn): GELUActivation()\n", " )\n", " (output): RobertaOutput(\n", " (dense): Linear(in_features=3072, out_features=768, bias=True)\n", " (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " )\n", " (3): RobertaLayer(\n", " (attention): RobertaAttention(\n", " (self): RobertaSelfAttention(\n", " (query): Linear(in_features=768, out_features=768, bias=True)\n", " (key): Linear(in_features=768, out_features=768, bias=True)\n", " (value): Linear(in_features=768, out_features=768, bias=True)\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " (output): RobertaSelfOutput(\n", " (dense): Linear(in_features=768, out_features=768, bias=True)\n", " (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " )\n", " (intermediate): RobertaIntermediate(\n", " (dense): Linear(in_features=768, out_features=3072, bias=True)\n", " (intermediate_act_fn): GELUActivation()\n", " )\n", " (output): RobertaOutput(\n", " (dense): Linear(in_features=3072, out_features=768, bias=True)\n", " (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " )\n", " (4): RobertaLayer(\n", " (attention): RobertaAttention(\n", " (self): RobertaSelfAttention(\n", " (query): Linear(in_features=768, out_features=768, bias=True)\n", " (key): Linear(in_features=768, out_features=768, bias=True)\n", " (value): Linear(in_features=768, out_features=768, bias=True)\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " (output): RobertaSelfOutput(\n", " (dense): Linear(in_features=768, out_features=768, bias=True)\n", " (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " )\n", " (intermediate): RobertaIntermediate(\n", " (dense): Linear(in_features=768, out_features=3072, bias=True)\n", " (intermediate_act_fn): GELUActivation()\n", " )\n", " (output): RobertaOutput(\n", " (dense): Linear(in_features=3072, out_features=768, bias=True)\n", " (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " )\n", " (5): RobertaLayer(\n", " (attention): RobertaAttention(\n", " (self): RobertaSelfAttention(\n", " (query): Linear(in_features=768, out_features=768, bias=True)\n", " (key): Linear(in_features=768, out_features=768, bias=True)\n", " (value): Linear(in_features=768, out_features=768, bias=True)\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " (output): RobertaSelfOutput(\n", " (dense): Linear(in_features=768, 
out_features=768, bias=True)\n", " (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " )\n", " (intermediate): RobertaIntermediate(\n", " (dense): Linear(in_features=768, out_features=3072, bias=True)\n", " (intermediate_act_fn): GELUActivation()\n", " )\n", " (output): RobertaOutput(\n", " (dense): Linear(in_features=3072, out_features=768, bias=True)\n", " (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " )\n", " (6): RobertaLayer(\n", " (attention): RobertaAttention(\n", " (self): RobertaSelfAttention(\n", " (query): Linear(in_features=768, out_features=768, bias=True)\n", " (key): Linear(in_features=768, out_features=768, bias=True)\n", " (value): Linear(in_features=768, out_features=768, bias=True)\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " (output): RobertaSelfOutput(\n", " (dense): Linear(in_features=768, out_features=768, bias=True)\n", " (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " )\n", " (intermediate): RobertaIntermediate(\n", " (dense): Linear(in_features=768, out_features=3072, bias=True)\n", " (intermediate_act_fn): GELUActivation()\n", " )\n", " (output): RobertaOutput(\n", " (dense): Linear(in_features=3072, out_features=768, bias=True)\n", " (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " )\n", " (7): RobertaLayer(\n", " (attention): RobertaAttention(\n", " (self): RobertaSelfAttention(\n", " (query): Linear(in_features=768, out_features=768, bias=True)\n", " (key): Linear(in_features=768, out_features=768, bias=True)\n", " (value): Linear(in_features=768, out_features=768, bias=True)\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " (output): RobertaSelfOutput(\n", " (dense): Linear(in_features=768, out_features=768, bias=True)\n", " (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " )\n", " (intermediate): RobertaIntermediate(\n", " (dense): Linear(in_features=768, out_features=3072, bias=True)\n", " (intermediate_act_fn): GELUActivation()\n", " )\n", " (output): RobertaOutput(\n", " (dense): Linear(in_features=3072, out_features=768, bias=True)\n", " (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " )\n", " (8): RobertaLayer(\n", " (attention): RobertaAttention(\n", " (self): RobertaSelfAttention(\n", " (query): Linear(in_features=768, out_features=768, bias=True)\n", " (key): Linear(in_features=768, out_features=768, bias=True)\n", " (value): Linear(in_features=768, out_features=768, bias=True)\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " (output): RobertaSelfOutput(\n", " (dense): Linear(in_features=768, out_features=768, bias=True)\n", " (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " )\n", " (intermediate): RobertaIntermediate(\n", " (dense): Linear(in_features=768, out_features=3072, bias=True)\n", " (intermediate_act_fn): GELUActivation()\n", " )\n", " (output): RobertaOutput(\n", " (dense): Linear(in_features=3072, out_features=768, bias=True)\n", " (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " 
)\n", " (9): RobertaLayer(\n", " (attention): RobertaAttention(\n", " (self): RobertaSelfAttention(\n", " (query): Linear(in_features=768, out_features=768, bias=True)\n", " (key): Linear(in_features=768, out_features=768, bias=True)\n", " (value): Linear(in_features=768, out_features=768, bias=True)\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " (output): RobertaSelfOutput(\n", " (dense): Linear(in_features=768, out_features=768, bias=True)\n", " (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " )\n", " (intermediate): RobertaIntermediate(\n", " (dense): Linear(in_features=768, out_features=3072, bias=True)\n", " (intermediate_act_fn): GELUActivation()\n", " )\n", " (output): RobertaOutput(\n", " (dense): Linear(in_features=3072, out_features=768, bias=True)\n", " (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " )\n", " (10): RobertaLayer(\n", " (attention): RobertaAttention(\n", " (self): RobertaSelfAttention(\n", " (query): Linear(in_features=768, out_features=768, bias=True)\n", " (key): Linear(in_features=768, out_features=768, bias=True)\n", " (value): Linear(in_features=768, out_features=768, bias=True)\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " (output): RobertaSelfOutput(\n", " (dense): Linear(in_features=768, out_features=768, bias=True)\n", " (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " )\n", " (intermediate): RobertaIntermediate(\n", " (dense): Linear(in_features=768, out_features=3072, bias=True)\n", " (intermediate_act_fn): GELUActivation()\n", " )\n", " (output): RobertaOutput(\n", " (dense): Linear(in_features=3072, out_features=768, bias=True)\n", " (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " )\n", " (11): RobertaLayer(\n", " (attention): RobertaAttention(\n", " (self): RobertaSelfAttention(\n", " (query): Linear(in_features=768, out_features=768, bias=True)\n", " (key): Linear(in_features=768, out_features=768, bias=True)\n", " (value): Linear(in_features=768, out_features=768, bias=True)\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " (output): RobertaSelfOutput(\n", " (dense): Linear(in_features=768, out_features=768, bias=True)\n", " (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " )\n", " (intermediate): RobertaIntermediate(\n", " (dense): Linear(in_features=768, out_features=3072, bias=True)\n", " (intermediate_act_fn): GELUActivation()\n", " )\n", " (output): RobertaOutput(\n", " (dense): Linear(in_features=3072, out_features=768, bias=True)\n", " (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " )\n", " )\n", " )\n", " )\n", " (classifier): RobertaClassificationHeadCustomSimple(\n", " (dense_1): Linear(in_features=768, out_features=3072, bias=True)\n", " (dense_2): Linear(in_features=3072, out_features=768, bias=True)\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " (out_proj): Linear(in_features=768, out_features=4, bias=True)\n", " (activation): GELU(approximate='none')\n", " )\n", ")\n", "02/16/2023 15:21:22 - WARNING - datasets.arrow_dataset - Loading cached processed dataset at 
/home/jacob/code/university/uczenie_glebokie_w_przetwarzaniu_tekstu/.cache_training/json/default-f6e8039906850c57/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51/cache-204a6dc6fcae3352.arrow\n", "Running tokenizer on dataset: 0%| | 0/4 [00:00> max_steps is given, it will override any value given in num_train_epochs\n", "[INFO|trainer.py:1972] 2023-02-16 15:21:27,576 >> Loading model from out/roberta/checkpoint-2500.\n", "[INFO|trainer.py:710] 2023-02-16 15:21:29,498 >> The following columns in the training set don't have a corresponding argument in `RobertaForSequenceClassificationCustomSimple.forward` and have been ignored: text. If text are not expected by `RobertaForSequenceClassificationCustomSimple.forward`, you can safely ignore this message.\n", "/home/jacob/anaconda3/envs/ugp/lib/python3.10/site-packages/transformers/optimization.py:306: FutureWarning: This implementation of AdamW is deprecated and will be removed in a future version. Use the PyTorch implementation torch.optim.AdamW instead, or set `no_deprecation_warning=True` to disable this warning\n", " warnings.warn(\n", "[INFO|trainer.py:1650] 2023-02-16 15:21:31,949 >> ***** Running training *****\n", "[INFO|trainer.py:1651] 2023-02-16 15:21:31,950 >> Num examples = 120000\n", "[INFO|trainer.py:1652] 2023-02-16 15:21:31,950 >> Num Epochs = 1\n", "[INFO|trainer.py:1653] 2023-02-16 15:21:31,950 >> Instantaneous batch size per device = 8\n", "[INFO|trainer.py:1654] 2023-02-16 15:21:31,950 >> Total train batch size (w. parallel, distributed & accumulation) = 8\n", "[INFO|trainer.py:1655] 2023-02-16 15:21:31,950 >> Gradient Accumulation steps = 1\n", "[INFO|trainer.py:1656] 2023-02-16 15:21:31,950 >> Total optimization steps = 2500\n", "[INFO|trainer.py:1657] 2023-02-16 15:21:31,951 >> Number of trainable parameters = 128780548\n", "[INFO|trainer.py:1679] 2023-02-16 15:21:31,951 >> Continuing training from checkpoint, will skip to saved global_step\n", "[INFO|trainer.py:1680] 2023-02-16 15:21:31,951 >> Continuing training from epoch 0\n", "[INFO|trainer.py:1681] 2023-02-16 15:21:31,951 >> Continuing training from global step 2500\n", "[INFO|trainer.py:1683] 2023-02-16 15:21:31,951 >> Will skip the first 0 epochs then the first 2500 batches in the first epoch. If this takes a lot of time, you can add the `--ignore_data_skip` flag to your launch command, but you will resume the training on data already seen by your model.\n", "Skipping the first batches: 0%| | 0/2500 [00:00> \n", "\n", "Training completed. 
Do not forget to share your model on huggingface.co/models =)\n", "\n", "\n", "[INFO|trainer.py:2025] 2023-02-16 15:21:36,738 >> Loading best model from out/roberta/checkpoint-2500 (score: 0.9229999780654907).\n", "\n", "\u001b[A{'train_runtime': 5.7972, 'train_samples_per_second': 3449.95, 'train_steps_per_second': 431.244, 'train_loss': 3.2215512862971954e-06, 'epoch': 0.17}\n", "\n", "2501it [00:05, 431.57it/s]\u001b[A\n", "[INFO|trainer.py:2709] 2023-02-16 15:21:37,750 >> Saving model checkpoint to out/roberta\n", "[INFO|configuration_utils.py:453] 2023-02-16 15:21:37,751 >> Configuration saved in out/roberta/config.json\n", "[INFO|modeling_utils.py:1704] 2023-02-16 15:21:38,719 >> Model weights saved in out/roberta/pytorch_model.bin\n", "[INFO|tokenization_utils_base.py:2160] 2023-02-16 15:21:38,742 >> tokenizer config file saved in out/roberta/tokenizer_config.json\n", "[INFO|tokenization_utils_base.py:2167] 2023-02-16 15:21:38,743 >> Special tokens file saved in out/roberta/special_tokens_map.json\n", "***** train metrics *****\n", " epoch = 0.17\n", " train_loss = 0.0\n", " train_runtime = 0:00:05.79\n", " train_samples = 120000\n", " train_samples_per_second = 3449.95\n", " train_steps_per_second = 431.244\n", "02/16/2023 15:21:38 - INFO - __main__ - *** Evaluate ***\n", "[INFO|trainer.py:710] 2023-02-16 15:21:38,862 >> The following columns in the evaluation set don't have a corresponding argument in `RobertaForSequenceClassificationCustomSimple.forward` and have been ignored: text. If text are not expected by `RobertaForSequenceClassificationCustomSimple.forward`, you can safely ignore this message.\n", "[INFO|trainer.py:2964] 2023-02-16 15:21:38,863 >> ***** Running Evaluation *****\n", "[INFO|trainer.py:2966] 2023-02-16 15:21:38,863 >> Num examples = 2000\n", "[INFO|trainer.py:2969] 2023-02-16 15:21:38,863 >> Batch size = 8\n", "100%|█████████████████████████████████████████| 250/250 [00:16<00:00, 14.75it/s]\n", "***** eval metrics *****\n", " epoch = 0.17\n", " eval_accuracy = 0.923\n", " eval_loss = 0.296\n", " eval_runtime = 0:00:17.06\n", " eval_samples = 2000\n", " eval_samples_per_second = 117.168\n", " eval_steps_per_second = 14.646\n", "02/16/2023 15:21:55 - INFO - __main__ - *** Predict ***\n", "[INFO|trainer.py:710] 2023-02-16 15:21:55,934 >> The following columns in the test set don't have a corresponding argument in `RobertaForSequenceClassificationCustomSimple.forward` and have been ignored: text. 
If text are not expected by `RobertaForSequenceClassificationCustomSimple.forward`, you can safely ignore this message.\n", "[INFO|trainer.py:2964] 2023-02-16 15:21:55,935 >> ***** Running Prediction *****\n", "[INFO|trainer.py:2966] 2023-02-16 15:21:55,935 >> Num examples = 3800\n", "[INFO|trainer.py:2969] 2023-02-16 15:21:55,935 >> Batch size = 8\n", "100%|█████████████████████████████████████████| 475/475 [00:32<00:00, 14.74it/s]\n", "02/16/2023 15:22:28 - INFO - __main__ - ***** Predict results None *****\n", "[INFO|modelcard.py:449] 2023-02-16 15:22:28,796 >> Dropping the following result as it does not have all the necessary fields:\n", "{'task': {'name': 'Text Classification', 'type': 'text-classification'}, 'metrics': [{'name': 'Accuracy', 'type': 'accuracy', 'value': 0.9229999780654907}]}\n" ] } ], "source": [ "!python run_glue.py \\\n", " --cache_dir .cache_training \\\n", " --model_name_or_path roberta-base \\\n", " --custom_model roberta_simple \\\n", " --train_file data/train.json \\\n", " --validation_file data/valid.json \\\n", " --test_file data/test.json \\\n", " --per_device_train_batch_size 8 \\\n", " --per_device_eval_batch_size 8 \\\n", " --do_train \\\n", " --do_eval \\\n", " --do_predict \\\n", " --max_seq_length 128 \\\n", " --learning_rate 2e-5 \\\n", " --max_eval_samples 2000 \\\n", " --max_steps 2500 \\\n", " --num_train_epochs 1 \\\n", " --save_strategy steps \\\n", " --save_steps 250 \\\n", " --save_total_limit 5 \\\n", " --logging_strategy steps \\\n", " --logging_steps 100 \\\n", " --eval_steps 250 \\\n", " --evaluation_strategy steps \\\n", " --metric_for_best_model accuracy \\\n", " --greater_is_better True \\\n", " --load_best_model_at_end True \\\n", " --output_dir out/roberta" ] }, { "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ "## Evaluation" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "02/16/2023 16:46:49 - WARNING - __main__ - Process rank: -1, device: cuda:0, n_gpu: 1distributed training: False, 16-bits training: False\n", "02/16/2023 16:46:49 - INFO - __main__ - Training/evaluation parameters TrainingArguments(\n", "_n_gpu=1,\n", "adafactor=False,\n", "adam_beta1=0.9,\n", "adam_beta2=0.999,\n", "adam_epsilon=1e-08,\n", "auto_find_batch_size=False,\n", "bf16=False,\n", "bf16_full_eval=False,\n", "data_seed=None,\n", "dataloader_drop_last=False,\n", "dataloader_num_workers=0,\n", "dataloader_pin_memory=True,\n", "ddp_bucket_cap_mb=None,\n", "ddp_find_unused_parameters=None,\n", "ddp_timeout=1800,\n", "debug=[],\n", "deepspeed=None,\n", "disable_tqdm=False,\n", "do_eval=True,\n", "do_predict=True,\n", "do_train=False,\n", "eval_accumulation_steps=None,\n", "eval_delay=0,\n", "eval_steps=250,\n", "evaluation_strategy=steps,\n", "fp16=False,\n", "fp16_backend=auto,\n", "fp16_full_eval=False,\n", "fp16_opt_level=O1,\n", "fsdp=[],\n", "fsdp_min_num_params=0,\n", "fsdp_transformer_layer_cls_to_wrap=None,\n", "full_determinism=False,\n", "gradient_accumulation_steps=1,\n", "gradient_checkpointing=False,\n", "greater_is_better=True,\n", "group_by_length=False,\n", "half_precision_backend=auto,\n", "hub_model_id=None,\n", "hub_private_repo=False,\n", "hub_strategy=every_save,\n", "hub_token=,\n", "ignore_data_skip=False,\n", "include_inputs_for_metrics=False,\n", "jit_mode_eval=False,\n", "label_names=None,\n", "label_smoothing_factor=0.0,\n", "learning_rate=2e-05,\n", "length_column_name=length,\n", "load_best_model_at_end=True,\n", 
"local_rank=-1,\n", "log_level=passive,\n", "log_level_replica=passive,\n", "log_on_each_node=True,\n", "logging_dir=out/roberta_results/runs/Feb16_16-46-48_DESKTOP-R7JO8BQ,\n", "logging_first_step=False,\n", "logging_nan_inf_filter=True,\n", "logging_steps=100,\n", "logging_strategy=steps,\n", "lr_scheduler_type=linear,\n", "max_grad_norm=1.0,\n", "max_steps=2500,\n", "metric_for_best_model=accuracy,\n", "mp_parameters=,\n", "no_cuda=False,\n", "num_train_epochs=1.0,\n", "optim=adamw_hf,\n", "optim_args=None,\n", "output_dir=out/roberta_results,\n", "overwrite_output_dir=False,\n", "past_index=-1,\n", "per_device_eval_batch_size=8,\n", "per_device_train_batch_size=8,\n", "prediction_loss_only=False,\n", "push_to_hub=False,\n", "push_to_hub_model_id=None,\n", "push_to_hub_organization=None,\n", "push_to_hub_token=,\n", "ray_scope=last,\n", "remove_unused_columns=True,\n", "report_to=[],\n", "resume_from_checkpoint=None,\n", "run_name=out/roberta_results,\n", "save_on_each_node=False,\n", "save_steps=250,\n", "save_strategy=steps,\n", "save_total_limit=5,\n", "seed=42,\n", "sharded_ddp=[],\n", "skip_memory_metrics=True,\n", "tf32=None,\n", "torch_compile=False,\n", "torch_compile_backend=None,\n", "torch_compile_mode=None,\n", "torchdynamo=None,\n", "tpu_metrics_debug=False,\n", "tpu_num_cores=None,\n", "use_ipex=False,\n", "use_legacy_prediction_loop=False,\n", "use_mps_device=False,\n", "warmup_ratio=0.0,\n", "warmup_steps=0,\n", "weight_decay=0.0,\n", "xpu_backend=None,\n", ")\n", "02/16/2023 16:46:49 - INFO - __main__ - load a local file for train: data/train.json\n", "02/16/2023 16:46:49 - INFO - __main__ - load a local file for validation: data/valid.json\n", "02/16/2023 16:46:49 - INFO - __main__ - load a local file for test: data/test.json\n", "02/16/2023 16:46:50 - WARNING - datasets.builder - Using custom data configuration default-f6e8039906850c57\n", "02/16/2023 16:46:50 - INFO - datasets.info - Loading Dataset Infos from /home/jacob/anaconda3/envs/ugp/lib/python3.10/site-packages/datasets/packaged_modules/json\n", "02/16/2023 16:46:50 - INFO - datasets.builder - Overwrite dataset info from restored data version.\n", "02/16/2023 16:46:50 - INFO - datasets.info - Loading Dataset info from .cache_training/json/default-f6e8039906850c57/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51\n", "02/16/2023 16:46:50 - WARNING - datasets.builder - Found cached dataset json (/home/jacob/code/university/uczenie_glebokie_w_przetwarzaniu_tekstu/.cache_training/json/default-f6e8039906850c57/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51)\n", "02/16/2023 16:46:50 - INFO - datasets.info - Loading Dataset info from /home/jacob/code/university/uczenie_glebokie_w_przetwarzaniu_tekstu/.cache_training/json/default-f6e8039906850c57/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51\n", "100%|████████████████████████████████████████████| 3/3 [00:00<00:00, 752.21it/s]\n", "[INFO|configuration_utils.py:658] 2023-02-16 16:46:50,276 >> loading configuration file out/roberta/config.json\n", "[INFO|configuration_utils.py:712] 2023-02-16 16:46:50,277 >> Model config RobertaConfig {\n", " \"_name_or_path\": \"out/roberta\",\n", " \"architectures\": [\n", " \"RobertaForSequenceClassificationCustomSimple\"\n", " ],\n", " \"attention_probs_dropout_prob\": 0.1,\n", " \"bos_token_id\": 0,\n", " \"classifier_dropout\": null,\n", " \"eos_token_id\": 2,\n", " \"hidden_act\": \"gelu\",\n", " \"hidden_dropout_prob\": 0.1,\n", " \"hidden_size\": 768,\n", " 
\"id2label\": {\n", " \"0\": 0,\n", " \"1\": 1,\n", " \"2\": 2,\n", " \"3\": 3\n", " },\n", " \"initializer_range\": 0.02,\n", " \"intermediate_size\": 3072,\n", " \"label2id\": {\n", " \"0\": 0,\n", " \"1\": 1,\n", " \"2\": 2,\n", " \"3\": 3\n", " },\n", " \"layer_norm_eps\": 1e-05,\n", " \"max_position_embeddings\": 514,\n", " \"model_type\": \"roberta\",\n", " \"num_attention_heads\": 12,\n", " \"num_hidden_layers\": 12,\n", " \"pad_token_id\": 1,\n", " \"position_embedding_type\": \"absolute\",\n", " \"problem_type\": \"single_label_classification\",\n", " \"torch_dtype\": \"float32\",\n", " \"transformers_version\": \"4.26.1\",\n", " \"type_vocab_size\": 1,\n", " \"use_cache\": true,\n", " \"use_hidden_states\": false,\n", " \"vocab_size\": 50265\n", "}\n", "\n", "[INFO|tokenization_utils_base.py:1800] 2023-02-16 16:46:50,283 >> loading file vocab.json\n", "[INFO|tokenization_utils_base.py:1800] 2023-02-16 16:46:50,283 >> loading file merges.txt\n", "[INFO|tokenization_utils_base.py:1800] 2023-02-16 16:46:50,284 >> loading file tokenizer.json\n", "[INFO|tokenization_utils_base.py:1800] 2023-02-16 16:46:50,284 >> loading file added_tokens.json\n", "[INFO|tokenization_utils_base.py:1800] 2023-02-16 16:46:50,284 >> loading file special_tokens_map.json\n", "[INFO|tokenization_utils_base.py:1800] 2023-02-16 16:46:50,284 >> loading file tokenizer_config.json\n", "02/16/2023 16:46:50 - INFO - __main__ - Using hidden states in model: False\n", "-------------------------------------------------------- Using hidden: False\n", "02/16/2023 16:46:50 - INFO - __main__ - Using implementation from class: RobertaForSequenceClassificationCustomSimple\n", "[INFO|modeling_utils.py:2272] 2023-02-16 16:46:50,339 >> loading weights file out/roberta/pytorch_model.bin\n", "[INFO|modeling_utils.py:2857] 2023-02-16 16:46:52,079 >> All model checkpoint weights were used when initializing RobertaForSequenceClassificationCustomSimple.\n", "\n", "[INFO|modeling_utils.py:2865] 2023-02-16 16:46:52,079 >> All the weights of RobertaForSequenceClassificationCustomSimple were initialized from the model checkpoint at out/roberta.\n", "If your task is similar to the task the model of the checkpoint was trained on, you can already use RobertaForSequenceClassificationCustomSimple for predictions without further training.\n", "RobertaForSequenceClassificationCustomSimple(\n", " (roberta): RobertaModel(\n", " (embeddings): RobertaEmbeddings(\n", " (word_embeddings): Embedding(50265, 768, padding_idx=1)\n", " (position_embeddings): Embedding(514, 768, padding_idx=1)\n", " (token_type_embeddings): Embedding(1, 768)\n", " (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " (encoder): RobertaEncoder(\n", " (layer): ModuleList(\n", " (0): RobertaLayer(\n", " (attention): RobertaAttention(\n", " (self): RobertaSelfAttention(\n", " (query): Linear(in_features=768, out_features=768, bias=True)\n", " (key): Linear(in_features=768, out_features=768, bias=True)\n", " (value): Linear(in_features=768, out_features=768, bias=True)\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " (output): RobertaSelfOutput(\n", " (dense): Linear(in_features=768, out_features=768, bias=True)\n", " (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " )\n", " (intermediate): RobertaIntermediate(\n", " (dense): Linear(in_features=768, out_features=3072, bias=True)\n", " (intermediate_act_fn): 
GELUActivation()\n", " )\n", " (output): RobertaOutput(\n", " (dense): Linear(in_features=3072, out_features=768, bias=True)\n", " (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " )\n", " (1): RobertaLayer(\n", " (attention): RobertaAttention(\n", " (self): RobertaSelfAttention(\n", " (query): Linear(in_features=768, out_features=768, bias=True)\n", " (key): Linear(in_features=768, out_features=768, bias=True)\n", " (value): Linear(in_features=768, out_features=768, bias=True)\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " (output): RobertaSelfOutput(\n", " (dense): Linear(in_features=768, out_features=768, bias=True)\n", " (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " )\n", " (intermediate): RobertaIntermediate(\n", " (dense): Linear(in_features=768, out_features=3072, bias=True)\n", " (intermediate_act_fn): GELUActivation()\n", " )\n", " (output): RobertaOutput(\n", " (dense): Linear(in_features=3072, out_features=768, bias=True)\n", " (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " )\n", " (2): RobertaLayer(\n", " (attention): RobertaAttention(\n", " (self): RobertaSelfAttention(\n", " (query): Linear(in_features=768, out_features=768, bias=True)\n", " (key): Linear(in_features=768, out_features=768, bias=True)\n", " (value): Linear(in_features=768, out_features=768, bias=True)\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " (output): RobertaSelfOutput(\n", " (dense): Linear(in_features=768, out_features=768, bias=True)\n", " (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " )\n", " (intermediate): RobertaIntermediate(\n", " (dense): Linear(in_features=768, out_features=3072, bias=True)\n", " (intermediate_act_fn): GELUActivation()\n", " )\n", " (output): RobertaOutput(\n", " (dense): Linear(in_features=3072, out_features=768, bias=True)\n", " (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " )\n", " (3): RobertaLayer(\n", " (attention): RobertaAttention(\n", " (self): RobertaSelfAttention(\n", " (query): Linear(in_features=768, out_features=768, bias=True)\n", " (key): Linear(in_features=768, out_features=768, bias=True)\n", " (value): Linear(in_features=768, out_features=768, bias=True)\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " (output): RobertaSelfOutput(\n", " (dense): Linear(in_features=768, out_features=768, bias=True)\n", " (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " )\n", " (intermediate): RobertaIntermediate(\n", " (dense): Linear(in_features=768, out_features=3072, bias=True)\n", " (intermediate_act_fn): GELUActivation()\n", " )\n", " (output): RobertaOutput(\n", " (dense): Linear(in_features=3072, out_features=768, bias=True)\n", " (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " )\n", " (4): RobertaLayer(\n", " (attention): RobertaAttention(\n", " (self): RobertaSelfAttention(\n", " (query): Linear(in_features=768, out_features=768, bias=True)\n", " (key): Linear(in_features=768, out_features=768, bias=True)\n", " (value): Linear(in_features=768, out_features=768, bias=True)\n", " 
(dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " (output): RobertaSelfOutput(\n", " (dense): Linear(in_features=768, out_features=768, bias=True)\n", " (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " )\n", " (intermediate): RobertaIntermediate(\n", " (dense): Linear(in_features=768, out_features=3072, bias=True)\n", " (intermediate_act_fn): GELUActivation()\n", " )\n", " (output): RobertaOutput(\n", " (dense): Linear(in_features=3072, out_features=768, bias=True)\n", " (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " )\n", " (5): RobertaLayer(\n", " (attention): RobertaAttention(\n", " (self): RobertaSelfAttention(\n", " (query): Linear(in_features=768, out_features=768, bias=True)\n", " (key): Linear(in_features=768, out_features=768, bias=True)\n", " (value): Linear(in_features=768, out_features=768, bias=True)\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " (output): RobertaSelfOutput(\n", " (dense): Linear(in_features=768, out_features=768, bias=True)\n", " (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " )\n", " (intermediate): RobertaIntermediate(\n", " (dense): Linear(in_features=768, out_features=3072, bias=True)\n", " (intermediate_act_fn): GELUActivation()\n", " )\n", " (output): RobertaOutput(\n", " (dense): Linear(in_features=3072, out_features=768, bias=True)\n", " (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " )\n", " (6): RobertaLayer(\n", " (attention): RobertaAttention(\n", " (self): RobertaSelfAttention(\n", " (query): Linear(in_features=768, out_features=768, bias=True)\n", " (key): Linear(in_features=768, out_features=768, bias=True)\n", " (value): Linear(in_features=768, out_features=768, bias=True)\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " (output): RobertaSelfOutput(\n", " (dense): Linear(in_features=768, out_features=768, bias=True)\n", " (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " )\n", " (intermediate): RobertaIntermediate(\n", " (dense): Linear(in_features=768, out_features=3072, bias=True)\n", " (intermediate_act_fn): GELUActivation()\n", " )\n", " (output): RobertaOutput(\n", " (dense): Linear(in_features=3072, out_features=768, bias=True)\n", " (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " )\n", " (7): RobertaLayer(\n", " (attention): RobertaAttention(\n", " (self): RobertaSelfAttention(\n", " (query): Linear(in_features=768, out_features=768, bias=True)\n", " (key): Linear(in_features=768, out_features=768, bias=True)\n", " (value): Linear(in_features=768, out_features=768, bias=True)\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " (output): RobertaSelfOutput(\n", " (dense): Linear(in_features=768, out_features=768, bias=True)\n", " (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " )\n", " (intermediate): RobertaIntermediate(\n", " (dense): Linear(in_features=768, out_features=3072, bias=True)\n", " (intermediate_act_fn): GELUActivation()\n", " )\n", " (output): RobertaOutput(\n", " (dense): Linear(in_features=3072, out_features=768, bias=True)\n", " 
(LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " )\n", " (8): RobertaLayer(\n", " (attention): RobertaAttention(\n", " (self): RobertaSelfAttention(\n", " (query): Linear(in_features=768, out_features=768, bias=True)\n", " (key): Linear(in_features=768, out_features=768, bias=True)\n", " (value): Linear(in_features=768, out_features=768, bias=True)\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " (output): RobertaSelfOutput(\n", " (dense): Linear(in_features=768, out_features=768, bias=True)\n", " (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " )\n", " (intermediate): RobertaIntermediate(\n", " (dense): Linear(in_features=768, out_features=3072, bias=True)\n", " (intermediate_act_fn): GELUActivation()\n", " )\n", " (output): RobertaOutput(\n", " (dense): Linear(in_features=3072, out_features=768, bias=True)\n", " (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " )\n", " (9): RobertaLayer(\n", " (attention): RobertaAttention(\n", " (self): RobertaSelfAttention(\n", " (query): Linear(in_features=768, out_features=768, bias=True)\n", " (key): Linear(in_features=768, out_features=768, bias=True)\n", " (value): Linear(in_features=768, out_features=768, bias=True)\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " (output): RobertaSelfOutput(\n", " (dense): Linear(in_features=768, out_features=768, bias=True)\n", " (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " )\n", " (intermediate): RobertaIntermediate(\n", " (dense): Linear(in_features=768, out_features=3072, bias=True)\n", " (intermediate_act_fn): GELUActivation()\n", " )\n", " (output): RobertaOutput(\n", " (dense): Linear(in_features=3072, out_features=768, bias=True)\n", " (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " )\n", " (10): RobertaLayer(\n", " (attention): RobertaAttention(\n", " (self): RobertaSelfAttention(\n", " (query): Linear(in_features=768, out_features=768, bias=True)\n", " (key): Linear(in_features=768, out_features=768, bias=True)\n", " (value): Linear(in_features=768, out_features=768, bias=True)\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " (output): RobertaSelfOutput(\n", " (dense): Linear(in_features=768, out_features=768, bias=True)\n", " (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " )\n", " (intermediate): RobertaIntermediate(\n", " (dense): Linear(in_features=768, out_features=3072, bias=True)\n", " (intermediate_act_fn): GELUActivation()\n", " )\n", " (output): RobertaOutput(\n", " (dense): Linear(in_features=3072, out_features=768, bias=True)\n", " (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " )\n", " (11): RobertaLayer(\n", " (attention): RobertaAttention(\n", " (self): RobertaSelfAttention(\n", " (query): Linear(in_features=768, out_features=768, bias=True)\n", " (key): Linear(in_features=768, out_features=768, bias=True)\n", " (value): Linear(in_features=768, out_features=768, bias=True)\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " (output): RobertaSelfOutput(\n", " (dense): Linear(in_features=768, 
out_features=768, bias=True)\n", " (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " )\n", " (intermediate): RobertaIntermediate(\n", " (dense): Linear(in_features=768, out_features=3072, bias=True)\n", " (intermediate_act_fn): GELUActivation()\n", " )\n", " (output): RobertaOutput(\n", " (dense): Linear(in_features=3072, out_features=768, bias=True)\n", " (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " )\n", " )\n", " )\n", " )\n", " (classifier): RobertaClassificationHeadCustomSimple(\n", " (dense_1): Linear(in_features=768, out_features=3072, bias=True)\n", " (dense_2): Linear(in_features=3072, out_features=768, bias=True)\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " (out_proj): Linear(in_features=768, out_features=4, bias=True)\n", " (activation): GELU(approximate='none')\n", " )\n", ")\n", "02/16/2023 16:46:52 - WARNING - datasets.arrow_dataset - Loading cached processed dataset at /home/jacob/code/university/uczenie_glebokie_w_przetwarzaniu_tekstu/.cache_training/json/default-f6e8039906850c57/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51/cache-df96547ec55a44ce.arrow\n", "02/16/2023 16:46:52 - WARNING - datasets.arrow_dataset - Loading cached processed dataset at /home/jacob/code/university/uczenie_glebokie_w_przetwarzaniu_tekstu/.cache_training/json/default-f6e8039906850c57/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51/cache-67b1030adaffbb4a.arrow\n", "02/16/2023 16:46:52 - WARNING - datasets.arrow_dataset - Loading cached processed dataset at /home/jacob/code/university/uczenie_glebokie_w_przetwarzaniu_tekstu/.cache_training/json/default-f6e8039906850c57/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51/cache-ae09252df5e9bac1.arrow\n", "02/16/2023 16:46:52 - INFO - __main__ - Set 500 samples for 0-class\n", "02/16/2023 16:46:52 - INFO - __main__ - Set 500 samples for 1-class\n", "02/16/2023 16:46:52 - INFO - __main__ - Set 500 samples for 2-class\n", "02/16/2023 16:46:52 - INFO - __main__ - Set 500 samples for 3-class\n", "[INFO|trainer.py:511] 2023-02-16 16:46:55,346 >> max_steps is given, it will override any value given in num_train_epochs\n", "02/16/2023 16:46:55 - INFO - __main__ - *** Evaluate ***\n", "[INFO|trainer.py:710] 2023-02-16 16:46:55,346 >> The following columns in the evaluation set don't have a corresponding argument in `RobertaForSequenceClassificationCustomSimple.forward` and have been ignored: text. If text are not expected by `RobertaForSequenceClassificationCustomSimple.forward`, you can safely ignore this message.\n", "[INFO|trainer.py:2964] 2023-02-16 16:46:55,348 >> ***** Running Evaluation *****\n", "[INFO|trainer.py:2966] 2023-02-16 16:46:55,348 >> Num examples = 2000\n", "[INFO|trainer.py:2969] 2023-02-16 16:46:55,348 >> Batch size = 8\n", "100%|█████████████████████████████████████████| 250/250 [00:17<00:00, 14.53it/s]\n", "***** eval metrics *****\n", " eval_accuracy = 0.923\n", " eval_loss = 0.296\n", " eval_runtime = 0:00:17.81\n", " eval_samples = 2000\n", " eval_samples_per_second = 112.255\n", " eval_steps_per_second = 14.032\n", "02/16/2023 16:47:13 - INFO - __main__ - *** Predict ***\n", "[INFO|trainer.py:710] 2023-02-16 16:47:13,166 >> The following columns in the test set don't have a corresponding argument in `RobertaForSequenceClassificationCustomSimple.forward` and have been ignored: text. 
If text are not expected by `RobertaForSequenceClassificationCustomSimple.forward`, you can safely ignore this message.\n", "[INFO|trainer.py:2964] 2023-02-16 16:47:13,167 >> ***** Running Prediction *****\n", "[INFO|trainer.py:2966] 2023-02-16 16:47:13,167 >> Num examples = 3800\n", "[INFO|trainer.py:2969] 2023-02-16 16:47:13,167 >> Batch size = 8\n", "100%|█████████████████████████████████████████| 475/475 [00:32<00:00, 14.53it/s]\n", "02/16/2023 16:47:45 - INFO - __main__ - ***** Predict results None *****\n", "[INFO|modelcard.py:449] 2023-02-16 16:47:46,438 >> Dropping the following result as it does not have all the necessary fields:\n", "{'task': {'name': 'Text Classification', 'type': 'text-classification'}}\n" ] } ], "source": [ "!python run_glue.py \\\n", " --cache_dir .cache_training \\\n", " --model_name_or_path out/roberta \\\n", " --custom_model roberta_simple \\\n", " --train_file data/train.json \\\n", " --validation_file data/valid.json \\\n", " --test_file data/test.json \\\n", " --per_device_train_batch_size 8 \\\n", " --per_device_eval_batch_size 8 \\\n", " --do_eval \\\n", " --do_predict \\\n", " --max_seq_length 128 \\\n", " --learning_rate 2e-5 \\\n", " --max_eval_samples 2000 \\\n", " --max_steps 2500 \\\n", " --num_train_epochs 1 \\\n", " --save_strategy steps \\\n", " --save_steps 250 \\\n", " --save_total_limit 5 \\\n", " --logging_strategy steps \\\n", " --logging_steps 100 \\\n", " --eval_steps 250 \\\n", " --evaluation_strategy steps \\\n", " --metric_for_best_model accuracy \\\n", " --greater_is_better True \\\n", " --load_best_model_at_end True \\\n", " --output_dir out/roberta_results" ] }, { "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ "## Results" ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\u001b[0;39m0.9229999780654907\u001b[0m\n" ] } ], "source": [ "!cat out/roberta_results/eval_results.json | jq .eval_accuracy" ] }, { "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ "# GPT2" ] }, { "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ "## Modifications" ] }, { "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ "- Custom classification head with 3 dense layers\n", "- Using hidden states from last layer" ] }, { "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ "## Code" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [], "source": [ "import torch\n", "from torch import nn\n", "from transformers import GPT2PreTrainedModel, GPT2Model\n", "from transformers.modeling_outputs import SequenceClassifierOutputWithPast\n", "\n", "class GPT2ForSequenceClassification(GPT2PreTrainedModel):\n", " def __init__(self, config):\n", " super().__init__(config)\n", " self.num_labels = config.num_labels\n", " self.transformer = GPT2Model(config)\n", " self.score = nn.Linear(config.n_embd, self.num_labels, bias=False)\n", "\n", " # Model parallel\n", " self.model_parallel = False\n", " self.device_map = None\n", "\n", " # Initialize weights and apply final processing\n", " self.post_init()\n", "\n", "\n", "class GPT2ClassificationHeadCustom(nn.Module):\n", " def __init__(self, config):\n", " super().__init__()\n", " hidden_size = config.n_embd\n", " self.dense_1_input = nn.Linear(hidden_size, 2 * hidden_size)\n", " self.dense_1_hidden = nn.Linear(hidden_size, 2 * hidden_size)\n", " self.dense_2 = nn.Linear(4 * hidden_size, hidden_size)\n", " 
self.dropout = nn.Dropout(config.resid_pdrop)\n", " self.out_proj = nn.Linear(hidden_size, config.num_labels, bias=False)\n", "\n", " def forward(self, x, **kwargs):\n", " # use the last hidden-state layer when provided, otherwise fall back to zeros\n", " if 'hidden_states' in kwargs and kwargs['hidden_states'] is not None:\n", " hidden = kwargs['hidden_states'][-1]\n", " else:\n", " hidden = torch.zeros(x.size(), dtype=x.dtype, device=x.device)\n", "\n", " x = self.dense_1_input(x)\n", " x = torch.relu(x)\n", " x = self.dropout(x)\n", "\n", " hidden = self.dense_1_hidden(hidden)\n", " hidden = torch.relu(hidden)\n", " hidden = self.dropout(hidden)\n", "\n", " # concatenate the projected input and hidden-state features along the feature dimension\n", " x = torch.cat((x, hidden), dim=2)\n", " x = self.dense_2(x)\n", " x = torch.relu(x)\n", " x = self.dropout(x)\n", "\n", " x = self.out_proj(x)\n", " return x\n", "\n", "class GPT2ForSequenceClassificationCustom(GPT2ForSequenceClassification):\n", " def __init__(self, config):\n", " super().__init__(config)\n", " self.num_labels = config.num_labels\n", " self.transformer = GPT2Model(config)\n", " self.score = GPT2ClassificationHeadCustom(config)\n", "\n", " self.init_weights()\n", "\n", " # Model parallel\n", " self.model_parallel = False\n", " self.device_map = None\n", "\n", " def forward(\n", " self,\n", " input_ids=None,\n", " past_key_values=None,\n", " attention_mask=None,\n", " token_type_ids=None,\n", " position_ids=None,\n", " head_mask=None,\n", " inputs_embeds=None,\n", " labels=None,\n", " use_cache=None,\n", " output_attentions=None,\n", " output_hidden_states=None,\n", " return_dict=None,\n", " ):\n", " r\"\"\"\n", " labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`):\n", " Labels for computing the sequence classification/regression loss. Indices should be in :obj:`[0, ...,\n", " config.num_labels - 1]`. If :obj:`config.num_labels == 1` a regression loss is computed (Mean-Square loss),\n", " If :obj:`config.num_labels > 1` a classification loss is computed (Cross-Entropy).\n", " \"\"\"\n", " return_dict = return_dict if return_dict is not None else self.config.use_return_dict\n", "\n", " transformer_outputs = self.transformer(\n", " input_ids,\n", " past_key_values=past_key_values,\n", " attention_mask=attention_mask,\n", " token_type_ids=token_type_ids,\n", " position_ids=position_ids,\n", " head_mask=head_mask,\n", " inputs_embeds=inputs_embeds,\n", " use_cache=use_cache,\n", " output_attentions=output_attentions,\n", " output_hidden_states=output_hidden_states,\n", " return_dict=return_dict,\n", " )\n", " hidden_states = transformer_outputs[0]\n", " if return_dict:\n", " logits = self.score(hidden_states, hidden_states=transformer_outputs.hidden_states)\n", " else:\n", " raise NotImplementedError('Not implemented for using non-dictionary object')\n", "\n", " if input_ids is not None:\n", " batch_size, sequence_length = input_ids.shape[:2]\n", " else:\n", " batch_size, sequence_length = inputs_embeds.shape[:2]\n", "\n", " assert (\n", " self.config.pad_token_id is not None or batch_size == 1\n", " ), \"Cannot handle batch sizes > 1 if no padding token is defined.\"\n", " if self.config.pad_token_id is None:\n", " sequence_lengths = -1\n", " else:\n", " if input_ids is not None:\n", " sequence_lengths = torch.ne(input_ids, self.config.pad_token_id).sum(-1) - 1\n", " else:\n", " sequence_lengths = -1\n", "\n", " pooled_logits = logits[range(batch_size), sequence_lengths]\n", "\n", " loss = None\n", " if labels is not None:\n", " if self.num_labels == 1:\n", " # We are doing regression\n", " loss_fct = nn.MSELoss()\n", " loss = loss_fct(pooled_logits.view(-1), 
labels.to(self.dtype).view(-1))\n", " else:\n", " loss_fct = nn.CrossEntropyLoss()\n", " loss = loss_fct(pooled_logits.view(-1, self.num_labels), labels.view(-1))\n", "\n", " if not return_dict:\n", " output = (pooled_logits,) + transformer_outputs[1:]\n", " return ((loss,) + output) if loss is not None else output\n", "\n", " return SequenceClassifierOutputWithPast(\n", " loss=loss,\n", " logits=pooled_logits,\n", " past_key_values=transformer_outputs.past_key_values,\n", " hidden_states=transformer_outputs.hidden_states,\n", " attentions=transformer_outputs.attentions,\n", " )" ] }, { "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ "## Model" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "4f980b257c2b453797f63ddc89c98923", "version_major": 2, "version_minor": 0 }, "text/plain": [ "Downloading (…)lve/main/config.json: 0%| | 0.00/665 [00:00,\n", "ignore_data_skip=False,\n", "include_inputs_for_metrics=False,\n", "jit_mode_eval=False,\n", "label_names=None,\n", "label_smoothing_factor=0.0,\n", "learning_rate=2e-05,\n", "length_column_name=length,\n", "load_best_model_at_end=True,\n", "local_rank=-1,\n", "log_level=passive,\n", "log_level_replica=passive,\n", "log_on_each_node=True,\n", "logging_dir=out/gpt2/runs/Feb16_15-22-36_DESKTOP-R7JO8BQ,\n", "logging_first_step=False,\n", "logging_nan_inf_filter=True,\n", "logging_steps=100,\n", "logging_strategy=steps,\n", "lr_scheduler_type=linear,\n", "max_grad_norm=1.0,\n", "max_steps=2500,\n", "metric_for_best_model=accuracy,\n", "mp_parameters=,\n", "no_cuda=False,\n", "num_train_epochs=1.0,\n", "optim=adamw_hf,\n", "optim_args=None,\n", "output_dir=out/gpt2,\n", "overwrite_output_dir=False,\n", "past_index=-1,\n", "per_device_eval_batch_size=8,\n", "per_device_train_batch_size=8,\n", "prediction_loss_only=False,\n", "push_to_hub=False,\n", "push_to_hub_model_id=None,\n", "push_to_hub_organization=None,\n", "push_to_hub_token=,\n", "ray_scope=last,\n", "remove_unused_columns=True,\n", "report_to=[],\n", "resume_from_checkpoint=None,\n", "run_name=out/gpt2,\n", "save_on_each_node=False,\n", "save_steps=250,\n", "save_strategy=steps,\n", "save_total_limit=5,\n", "seed=42,\n", "sharded_ddp=[],\n", "skip_memory_metrics=True,\n", "tf32=None,\n", "torch_compile=False,\n", "torch_compile_backend=None,\n", "torch_compile_mode=None,\n", "torchdynamo=None,\n", "tpu_metrics_debug=False,\n", "tpu_num_cores=None,\n", "use_ipex=False,\n", "use_legacy_prediction_loop=False,\n", "use_mps_device=False,\n", "warmup_ratio=0.0,\n", "warmup_steps=0,\n", "weight_decay=0.0,\n", "xpu_backend=None,\n", ")\n", "02/16/2023 15:22:37 - INFO - __main__ - Checkpoint detected, resuming training at out/gpt2/checkpoint-2500. 
To avoid this behavior, change the `--output_dir` or add `--overwrite_output_dir` to train from scratch.\n", "02/16/2023 15:22:37 - INFO - __main__ - load a local file for train: data/train.json\n", "02/16/2023 15:22:37 - INFO - __main__ - load a local file for validation: data/valid.json\n", "02/16/2023 15:22:37 - WARNING - datasets.builder - Using custom data configuration default-e10a382a423bbb9a\n", "02/16/2023 15:22:37 - INFO - datasets.info - Loading Dataset Infos from /home/jacob/anaconda3/envs/ugp/lib/python3.10/site-packages/datasets/packaged_modules/json\n", "02/16/2023 15:22:37 - INFO - datasets.builder - Generating dataset json (/home/jacob/code/university/uczenie_glebokie_w_przetwarzaniu_tekstu/.cache_training/json/default-e10a382a423bbb9a/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51)\n", "Downloading and preparing dataset json/default to /home/jacob/code/university/uczenie_glebokie_w_przetwarzaniu_tekstu/.cache_training/json/default-e10a382a423bbb9a/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51...\n", "Downloading data files: 100%|██████████████████| 2/2 [00:00<00:00, 14820.86it/s]\n", "02/16/2023 15:22:37 - INFO - datasets.download.download_manager - Downloading took 0.0 min\n", "02/16/2023 15:22:37 - INFO - datasets.download.download_manager - Checksum Computation took 0.0 min\n", "Extracting data files: 100%|████████████████████| 2/2 [00:00<00:00, 2476.71it/s]\n", "02/16/2023 15:22:37 - INFO - datasets.utils.info_utils - Unable to verify checksums.\n", "02/16/2023 15:22:37 - INFO - datasets.builder - Generating train split\n", "02/16/2023 15:22:37 - INFO - datasets.builder - Generating validation split\n", "02/16/2023 15:22:37 - INFO - datasets.utils.info_utils - Unable to verify splits sizes.\n", "Dataset json downloaded and prepared to /home/jacob/code/university/uczenie_glebokie_w_przetwarzaniu_tekstu/.cache_training/json/default-e10a382a423bbb9a/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51. 
Subsequent calls will reuse this data.\n", "100%|████████████████████████████████████████████| 2/2 [00:00<00:00, 642.61it/s]\n", "[INFO|configuration_utils.py:660] 2023-02-16 15:22:38,465 >> loading configuration file config.json from cache at .cache_training/models--gpt2/snapshots/e7da7f221d5bf496a48136c0cd264e630fe9fcc8/config.json\n", "[INFO|configuration_utils.py:712] 2023-02-16 15:22:38,465 >> Model config GPT2Config {\n", " \"_name_or_path\": \"gpt2\",\n", " \"activation_function\": \"gelu_new\",\n", " \"architectures\": [\n", " \"GPT2LMHeadModel\"\n", " ],\n", " \"attn_pdrop\": 0.1,\n", " \"bos_token_id\": 50256,\n", " \"embd_pdrop\": 0.1,\n", " \"eos_token_id\": 50256,\n", " \"id2label\": {\n", " \"0\": \"LABEL_0\",\n", " \"1\": \"LABEL_1\",\n", " \"2\": \"LABEL_2\",\n", " \"3\": \"LABEL_3\"\n", " },\n", " \"initializer_range\": 0.02,\n", " \"label2id\": {\n", " \"LABEL_0\": 0,\n", " \"LABEL_1\": 1,\n", " \"LABEL_2\": 2,\n", " \"LABEL_3\": 3\n", " },\n", " \"layer_norm_epsilon\": 1e-05,\n", " \"model_type\": \"gpt2\",\n", " \"n_ctx\": 1024,\n", " \"n_embd\": 768,\n", " \"n_head\": 12,\n", " \"n_inner\": null,\n", " \"n_layer\": 12,\n", " \"n_positions\": 1024,\n", " \"reorder_and_upcast_attn\": false,\n", " \"resid_pdrop\": 0.1,\n", " \"scale_attn_by_inverse_layer_idx\": false,\n", " \"scale_attn_weights\": true,\n", " \"summary_activation\": null,\n", " \"summary_first_dropout\": 0.1,\n", " \"summary_proj_to_labels\": true,\n", " \"summary_type\": \"cls_index\",\n", " \"summary_use_proj\": true,\n", " \"task_specific_params\": {\n", " \"text-generation\": {\n", " \"do_sample\": true,\n", " \"max_length\": 50\n", " }\n", " },\n", " \"transformers_version\": \"4.26.1\",\n", " \"use_cache\": true,\n", " \"vocab_size\": 50257\n", "}\n", "\n", "[INFO|tokenization_auto.py:458] 2023-02-16 15:22:38,945 >> Could not locate the tokenizer configuration file, will try to use the model config instead.\n", "[INFO|configuration_utils.py:660] 2023-02-16 15:22:39,423 >> loading configuration file config.json from cache at .cache_training/models--gpt2/snapshots/e7da7f221d5bf496a48136c0cd264e630fe9fcc8/config.json\n", "[INFO|configuration_utils.py:712] 2023-02-16 15:22:39,424 >> Model config GPT2Config {\n", " \"_name_or_path\": \"gpt2\",\n", " \"activation_function\": \"gelu_new\",\n", " \"architectures\": [\n", " \"GPT2LMHeadModel\"\n", " ],\n", " \"attn_pdrop\": 0.1,\n", " \"bos_token_id\": 50256,\n", " \"embd_pdrop\": 0.1,\n", " \"eos_token_id\": 50256,\n", " \"initializer_range\": 0.02,\n", " \"layer_norm_epsilon\": 1e-05,\n", " \"model_type\": \"gpt2\",\n", " \"n_ctx\": 1024,\n", " \"n_embd\": 768,\n", " \"n_head\": 12,\n", " \"n_inner\": null,\n", " \"n_layer\": 12,\n", " \"n_positions\": 1024,\n", " \"reorder_and_upcast_attn\": false,\n", " \"resid_pdrop\": 0.1,\n", " \"scale_attn_by_inverse_layer_idx\": false,\n", " \"scale_attn_weights\": true,\n", " \"summary_activation\": null,\n", " \"summary_first_dropout\": 0.1,\n", " \"summary_proj_to_labels\": true,\n", " \"summary_type\": \"cls_index\",\n", " \"summary_use_proj\": true,\n", " \"task_specific_params\": {\n", " \"text-generation\": {\n", " \"do_sample\": true,\n", " \"max_length\": 50\n", " }\n", " },\n", " \"transformers_version\": \"4.26.1\",\n", " \"use_cache\": true,\n", " \"vocab_size\": 50257\n", "}\n", "\n", "[INFO|tokenization_utils_base.py:1802] 2023-02-16 15:22:40,400 >> loading file vocab.json from cache at .cache_training/models--gpt2/snapshots/e7da7f221d5bf496a48136c0cd264e630fe9fcc8/vocab.json\n", 
"[INFO|tokenization_utils_base.py:1802] 2023-02-16 15:22:40,400 >> loading file merges.txt from cache at .cache_training/models--gpt2/snapshots/e7da7f221d5bf496a48136c0cd264e630fe9fcc8/merges.txt\n", "[INFO|tokenization_utils_base.py:1802] 2023-02-16 15:22:40,400 >> loading file tokenizer.json from cache at .cache_training/models--gpt2/snapshots/e7da7f221d5bf496a48136c0cd264e630fe9fcc8/tokenizer.json\n", "[INFO|tokenization_utils_base.py:1802] 2023-02-16 15:22:40,400 >> loading file added_tokens.json from cache at None\n", "[INFO|tokenization_utils_base.py:1802] 2023-02-16 15:22:40,400 >> loading file special_tokens_map.json from cache at None\n", "[INFO|tokenization_utils_base.py:1802] 2023-02-16 15:22:40,400 >> loading file tokenizer_config.json from cache at None\n", "[INFO|configuration_utils.py:660] 2023-02-16 15:22:40,400 >> loading configuration file config.json from cache at .cache_training/models--gpt2/snapshots/e7da7f221d5bf496a48136c0cd264e630fe9fcc8/config.json\n", "[INFO|configuration_utils.py:712] 2023-02-16 15:22:40,400 >> Model config GPT2Config {\n", " \"_name_or_path\": \"gpt2\",\n", " \"activation_function\": \"gelu_new\",\n", " \"architectures\": [\n", " \"GPT2LMHeadModel\"\n", " ],\n", " \"attn_pdrop\": 0.1,\n", " \"bos_token_id\": 50256,\n", " \"embd_pdrop\": 0.1,\n", " \"eos_token_id\": 50256,\n", " \"initializer_range\": 0.02,\n", " \"layer_norm_epsilon\": 1e-05,\n", " \"model_type\": \"gpt2\",\n", " \"n_ctx\": 1024,\n", " \"n_embd\": 768,\n", " \"n_head\": 12,\n", " \"n_inner\": null,\n", " \"n_layer\": 12,\n", " \"n_positions\": 1024,\n", " \"reorder_and_upcast_attn\": false,\n", " \"resid_pdrop\": 0.1,\n", " \"scale_attn_by_inverse_layer_idx\": false,\n", " \"scale_attn_weights\": true,\n", " \"summary_activation\": null,\n", " \"summary_first_dropout\": 0.1,\n", " \"summary_proj_to_labels\": true,\n", " \"summary_type\": \"cls_index\",\n", " \"summary_use_proj\": true,\n", " \"task_specific_params\": {\n", " \"text-generation\": {\n", " \"do_sample\": true,\n", " \"max_length\": 50\n", " }\n", " },\n", " \"transformers_version\": \"4.26.1\",\n", " \"use_cache\": true,\n", " \"vocab_size\": 50257\n", "}\n", "\n", "02/16/2023 15:22:40 - INFO - __main__ - Using hidden states in model: True\n", "-------------------------------------------------------- Using hidden: True\n", "02/16/2023 15:22:40 - INFO - __main__ - Using implementation from class: GPT2ForSequenceClassificationCustom\n", "[INFO|modeling_utils.py:2275] 2023-02-16 15:22:40,458 >> loading weights file pytorch_model.bin from cache at .cache_training/models--gpt2/snapshots/e7da7f221d5bf496a48136c0cd264e630fe9fcc8/pytorch_model.bin\n", "[INFO|modeling_utils.py:2857] 2023-02-16 15:22:42,848 >> All model checkpoint weights were used when initializing GPT2ForSequenceClassificationCustom.\n", "\n", "[WARNING|modeling_utils.py:2859] 2023-02-16 15:22:42,849 >> Some weights of GPT2ForSequenceClassificationCustom were not initialized from the model checkpoint at gpt2 and are newly initialized: ['h.11.attn.masked_bias', 'score.out_proj.weight', 'h.7.attn.masked_bias', 'h.6.attn.masked_bias', 'h.8.attn.masked_bias', 'h.5.attn.masked_bias', 'score.dense_2.weight', 'h.9.attn.masked_bias', 'score.dense_4.bias', 'score.dense_1_input.bias', 'score.dense_3.weight', 'score.dense_1_hidden.bias', 'score.dense_1_input.weight', 'h.1.attn.masked_bias', 'score.dense_3.bias', 'h.10.attn.masked_bias', 'h.2.attn.masked_bias', 'h.4.attn.masked_bias', 'score.dense_1_hidden.weight', 'score.dense_2.bias', 'score.dense_4.weight', 
'h.0.attn.masked_bias', 'h.3.attn.masked_bias']\n", "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n", "GPT2ForSequenceClassificationCustom(\n", " (transformer): GPT2Model(\n", " (wte): Embedding(50257, 768)\n", " (wpe): Embedding(1024, 768)\n", " (drop): Dropout(p=0.1, inplace=False)\n", " (h): ModuleList(\n", " (0): GPT2Block(\n", " (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n", " (attn): GPT2Attention(\n", " (c_attn): Conv1D()\n", " (c_proj): Conv1D()\n", " (attn_dropout): Dropout(p=0.1, inplace=False)\n", " (resid_dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n", " (mlp): GPT2MLP(\n", " (c_fc): Conv1D()\n", " (c_proj): Conv1D()\n", " (act): NewGELUActivation()\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " )\n", " (1): GPT2Block(\n", " (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n", " (attn): GPT2Attention(\n", " (c_attn): Conv1D()\n", " (c_proj): Conv1D()\n", " (attn_dropout): Dropout(p=0.1, inplace=False)\n", " (resid_dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n", " (mlp): GPT2MLP(\n", " (c_fc): Conv1D()\n", " (c_proj): Conv1D()\n", " (act): NewGELUActivation()\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " )\n", " (2): GPT2Block(\n", " (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n", " (attn): GPT2Attention(\n", " (c_attn): Conv1D()\n", " (c_proj): Conv1D()\n", " (attn_dropout): Dropout(p=0.1, inplace=False)\n", " (resid_dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n", " (mlp): GPT2MLP(\n", " (c_fc): Conv1D()\n", " (c_proj): Conv1D()\n", " (act): NewGELUActivation()\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " )\n", " (3): GPT2Block(\n", " (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n", " (attn): GPT2Attention(\n", " (c_attn): Conv1D()\n", " (c_proj): Conv1D()\n", " (attn_dropout): Dropout(p=0.1, inplace=False)\n", " (resid_dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n", " (mlp): GPT2MLP(\n", " (c_fc): Conv1D()\n", " (c_proj): Conv1D()\n", " (act): NewGELUActivation()\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " )\n", " (4): GPT2Block(\n", " (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n", " (attn): GPT2Attention(\n", " (c_attn): Conv1D()\n", " (c_proj): Conv1D()\n", " (attn_dropout): Dropout(p=0.1, inplace=False)\n", " (resid_dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n", " (mlp): GPT2MLP(\n", " (c_fc): Conv1D()\n", " (c_proj): Conv1D()\n", " (act): NewGELUActivation()\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " )\n", " (5): GPT2Block(\n", " (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n", " (attn): GPT2Attention(\n", " (c_attn): Conv1D()\n", " (c_proj): Conv1D()\n", " (attn_dropout): Dropout(p=0.1, inplace=False)\n", " (resid_dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n", " (mlp): GPT2MLP(\n", " (c_fc): Conv1D()\n", " (c_proj): Conv1D()\n", " (act): NewGELUActivation()\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " )\n", " (6): GPT2Block(\n", " (ln_1): LayerNorm((768,), eps=1e-05, 
elementwise_affine=True)\n", " (attn): GPT2Attention(\n", " (c_attn): Conv1D()\n", " (c_proj): Conv1D()\n", " (attn_dropout): Dropout(p=0.1, inplace=False)\n", " (resid_dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n", " (mlp): GPT2MLP(\n", " (c_fc): Conv1D()\n", " (c_proj): Conv1D()\n", " (act): NewGELUActivation()\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " )\n", " (7): GPT2Block(\n", " (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n", " (attn): GPT2Attention(\n", " (c_attn): Conv1D()\n", " (c_proj): Conv1D()\n", " (attn_dropout): Dropout(p=0.1, inplace=False)\n", " (resid_dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n", " (mlp): GPT2MLP(\n", " (c_fc): Conv1D()\n", " (c_proj): Conv1D()\n", " (act): NewGELUActivation()\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " )\n", " (8): GPT2Block(\n", " (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n", " (attn): GPT2Attention(\n", " (c_attn): Conv1D()\n", " (c_proj): Conv1D()\n", " (attn_dropout): Dropout(p=0.1, inplace=False)\n", " (resid_dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n", " (mlp): GPT2MLP(\n", " (c_fc): Conv1D()\n", " (c_proj): Conv1D()\n", " (act): NewGELUActivation()\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " )\n", " (9): GPT2Block(\n", " (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n", " (attn): GPT2Attention(\n", " (c_attn): Conv1D()\n", " (c_proj): Conv1D()\n", " (attn_dropout): Dropout(p=0.1, inplace=False)\n", " (resid_dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n", " (mlp): GPT2MLP(\n", " (c_fc): Conv1D()\n", " (c_proj): Conv1D()\n", " (act): NewGELUActivation()\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " )\n", " (10): GPT2Block(\n", " (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n", " (attn): GPT2Attention(\n", " (c_attn): Conv1D()\n", " (c_proj): Conv1D()\n", " (attn_dropout): Dropout(p=0.1, inplace=False)\n", " (resid_dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n", " (mlp): GPT2MLP(\n", " (c_fc): Conv1D()\n", " (c_proj): Conv1D()\n", " (act): NewGELUActivation()\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " )\n", " (11): GPT2Block(\n", " (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n", " (attn): GPT2Attention(\n", " (c_attn): Conv1D()\n", " (c_proj): Conv1D()\n", " (attn_dropout): Dropout(p=0.1, inplace=False)\n", " (resid_dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n", " (mlp): GPT2MLP(\n", " (c_fc): Conv1D()\n", " (c_proj): Conv1D()\n", " (act): NewGELUActivation()\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " )\n", " )\n", " (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n", " )\n", " (score): GPT2ClassificationHeadCustom(\n", " (dense_1_input): Linear(in_features=768, out_features=1536, bias=True)\n", " (dense_1_hidden): Linear(in_features=768, out_features=1536, bias=True)\n", " (dense_2): Linear(in_features=3072, out_features=3072, bias=True)\n", " (dense_3): Linear(in_features=3072, out_features=3072, bias=True)\n", " (dense_4): Linear(in_features=3072, out_features=768, bias=True)\n", " (dropout): 
Dropout(p=0.1, inplace=False)\n", " (out_proj): Linear(in_features=768, out_features=4, bias=False)\n", " )\n", ")\n", "[ERROR|tokenization_utils_base.py:1042] 2023-02-16 15:22:42,852 >> Using pad_token, but it is not set yet.\n", "02/16/2023 15:22:42 - INFO - __main__ - Set PAD token to EOS: <|endoftext|>\n", "Running tokenizer on dataset: 0%| | 0/120 [00:00\n", " main()\n", " File \"/home/jacob/code/university/uczenie_glebokie_w_przetwarzaniu_tekstu/run_glue.py\", line 533, in main\n", " raise ValueError(\"--do_predict requires a test dataset\")\n", "ValueError: --do_predict requires a test dataset\n" ] } ], "source": [ "!python run_glue.py \\\n", " --cache_dir .cache_training \\\n", " --model_name_or_path gpt2 \\\n", " --custom_model gpt2_hidden \\\n", " --train_file data/train.json \\\n", " --validation_file data/valid.json \\\n", " --test_file data/test.json \\\n", " --per_device_train_batch_size 8 \\\n", " --per_device_eval_batch_size 8 \\\n", " --do_train \\\n", " --do_eval \\\n", " --max_seq_length 128 \\\n", " --learning_rate 2e-5 \\\n", " --max_eval_samples 2000 \\\n", " --max_steps 2500 \\\n", " --num_train_epochs 1 \\\n", " --save_strategy steps \\\n", " --save_steps 250 \\\n", " --save_total_limit 5 \\\n", " --logging_strategy steps \\\n", " --logging_steps 100 \\\n", " --eval_steps 250 \\\n", " --evaluation_strategy steps \\\n", " --metric_for_best_model accuracy \\\n", " --greater_is_better True \\\n", " --load_best_model_at_end True \\\n", " --output_dir out/gpt2 " ] }, { "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ "## Evaluation" ] }, { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "02/16/2023 16:51:20 - WARNING - __main__ - Process rank: -1, device: cuda:0, n_gpu: 1distributed training: False, 16-bits training: False\n", "02/16/2023 16:51:20 - INFO - __main__ - Training/evaluation parameters TrainingArguments(\n", "_n_gpu=1,\n", "adafactor=False,\n", "adam_beta1=0.9,\n", "adam_beta2=0.999,\n", "adam_epsilon=1e-08,\n", "auto_find_batch_size=False,\n", "bf16=False,\n", "bf16_full_eval=False,\n", "data_seed=None,\n", "dataloader_drop_last=False,\n", "dataloader_num_workers=0,\n", "dataloader_pin_memory=True,\n", "ddp_bucket_cap_mb=None,\n", "ddp_find_unused_parameters=None,\n", "ddp_timeout=1800,\n", "debug=[],\n", "deepspeed=None,\n", "disable_tqdm=False,\n", "do_eval=True,\n", "do_predict=True,\n", "do_train=False,\n", "eval_accumulation_steps=None,\n", "eval_delay=0,\n", "eval_steps=250,\n", "evaluation_strategy=steps,\n", "fp16=False,\n", "fp16_backend=auto,\n", "fp16_full_eval=False,\n", "fp16_opt_level=O1,\n", "fsdp=[],\n", "fsdp_min_num_params=0,\n", "fsdp_transformer_layer_cls_to_wrap=None,\n", "full_determinism=False,\n", "gradient_accumulation_steps=1,\n", "gradient_checkpointing=False,\n", "greater_is_better=True,\n", "group_by_length=False,\n", "half_precision_backend=auto,\n", "hub_model_id=None,\n", "hub_private_repo=False,\n", "hub_strategy=every_save,\n", "hub_token=,\n", "ignore_data_skip=False,\n", "include_inputs_for_metrics=False,\n", "jit_mode_eval=False,\n", "label_names=None,\n", "label_smoothing_factor=0.0,\n", "learning_rate=2e-05,\n", "length_column_name=length,\n", "load_best_model_at_end=True,\n", "local_rank=-1,\n", "log_level=passive,\n", "log_level_replica=passive,\n", "log_on_each_node=True,\n", "logging_dir=out/gpt2_results/runs/Feb16_16-51-19_DESKTOP-R7JO8BQ,\n", "logging_first_step=False,\n", "logging_nan_inf_filter=True,\n", 
"logging_steps=100,\n", "logging_strategy=steps,\n", "lr_scheduler_type=linear,\n", "max_grad_norm=1.0,\n", "max_steps=2500,\n", "metric_for_best_model=accuracy,\n", "mp_parameters=,\n", "no_cuda=False,\n", "num_train_epochs=1.0,\n", "optim=adamw_hf,\n", "optim_args=None,\n", "output_dir=out/gpt2_results,\n", "overwrite_output_dir=False,\n", "past_index=-1,\n", "per_device_eval_batch_size=8,\n", "per_device_train_batch_size=8,\n", "prediction_loss_only=False,\n", "push_to_hub=False,\n", "push_to_hub_model_id=None,\n", "push_to_hub_organization=None,\n", "push_to_hub_token=,\n", "ray_scope=last,\n", "remove_unused_columns=True,\n", "report_to=[],\n", "resume_from_checkpoint=None,\n", "run_name=out/gpt2_results,\n", "save_on_each_node=False,\n", "save_steps=250,\n", "save_strategy=steps,\n", "save_total_limit=5,\n", "seed=42,\n", "sharded_ddp=[],\n", "skip_memory_metrics=True,\n", "tf32=None,\n", "torch_compile=False,\n", "torch_compile_backend=None,\n", "torch_compile_mode=None,\n", "torchdynamo=None,\n", "tpu_metrics_debug=False,\n", "tpu_num_cores=None,\n", "use_ipex=False,\n", "use_legacy_prediction_loop=False,\n", "use_mps_device=False,\n", "warmup_ratio=0.0,\n", "warmup_steps=0,\n", "weight_decay=0.0,\n", "xpu_backend=None,\n", ")\n", "02/16/2023 16:51:20 - INFO - __main__ - load a local file for train: data/train.json\n", "02/16/2023 16:51:20 - INFO - __main__ - load a local file for validation: data/valid.json\n", "02/16/2023 16:51:20 - INFO - __main__ - load a local file for test: data/test.json\n", "02/16/2023 16:51:20 - WARNING - datasets.builder - Using custom data configuration default-f6e8039906850c57\n", "02/16/2023 16:51:20 - INFO - datasets.info - Loading Dataset Infos from /home/jacob/anaconda3/envs/ugp/lib/python3.10/site-packages/datasets/packaged_modules/json\n", "02/16/2023 16:51:20 - INFO - datasets.builder - Overwrite dataset info from restored data version.\n", "02/16/2023 16:51:20 - INFO - datasets.info - Loading Dataset info from .cache_training/json/default-f6e8039906850c57/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51\n", "02/16/2023 16:51:20 - WARNING - datasets.builder - Found cached dataset json (/home/jacob/code/university/uczenie_glebokie_w_przetwarzaniu_tekstu/.cache_training/json/default-f6e8039906850c57/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51)\n", "02/16/2023 16:51:20 - INFO - datasets.info - Loading Dataset info from /home/jacob/code/university/uczenie_glebokie_w_przetwarzaniu_tekstu/.cache_training/json/default-f6e8039906850c57/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51\n", "100%|████████████████████████████████████████████| 3/3 [00:00<00:00, 591.33it/s]\n", "[INFO|configuration_utils.py:658] 2023-02-16 16:51:20,920 >> loading configuration file out/gpt2/config.json\n", "[INFO|configuration_utils.py:712] 2023-02-16 16:51:20,921 >> Model config GPT2Config {\n", " \"_name_or_path\": \"out/gpt2\",\n", " \"activation_function\": \"gelu_new\",\n", " \"architectures\": [\n", " \"GPT2ForSequenceClassificationCustom\"\n", " ],\n", " \"attn_pdrop\": 0.1,\n", " \"bos_token_id\": 50256,\n", " \"embd_pdrop\": 0.1,\n", " \"eos_token_id\": 50256,\n", " \"id2label\": {\n", " \"0\": 0,\n", " \"1\": 1,\n", " \"2\": 2,\n", " \"3\": 3\n", " },\n", " \"initializer_range\": 0.02,\n", " \"label2id\": {\n", " \"0\": 0,\n", " \"1\": 1,\n", " \"2\": 2,\n", " \"3\": 3\n", " },\n", " \"layer_norm_epsilon\": 1e-05,\n", " \"model_type\": \"gpt2\",\n", " \"n_ctx\": 1024,\n", " \"n_embd\": 768,\n", " 
\"n_head\": 12,\n", " \"n_inner\": null,\n", " \"n_layer\": 12,\n", " \"n_positions\": 1024,\n", " \"pad_token_id\": 50256,\n", " \"reorder_and_upcast_attn\": false,\n", " \"resid_pdrop\": 0.1,\n", " \"scale_attn_by_inverse_layer_idx\": false,\n", " \"scale_attn_weights\": true,\n", " \"summary_activation\": null,\n", " \"summary_first_dropout\": 0.1,\n", " \"summary_proj_to_labels\": true,\n", " \"summary_type\": \"cls_index\",\n", " \"summary_use_proj\": true,\n", " \"task_specific_params\": {\n", " \"text-generation\": {\n", " \"do_sample\": true,\n", " \"max_length\": 50\n", " }\n", " },\n", " \"torch_dtype\": \"float32\",\n", " \"transformers_version\": \"4.26.1\",\n", " \"use_cache\": true,\n", " \"use_hidden_states\": true,\n", " \"vocab_size\": 50257\n", "}\n", "\n", "[INFO|tokenization_utils_base.py:1800] 2023-02-16 16:51:20,929 >> loading file vocab.json\n", "[INFO|tokenization_utils_base.py:1800] 2023-02-16 16:51:20,929 >> loading file merges.txt\n", "[INFO|tokenization_utils_base.py:1800] 2023-02-16 16:51:20,929 >> loading file tokenizer.json\n", "[INFO|tokenization_utils_base.py:1800] 2023-02-16 16:51:20,929 >> loading file added_tokens.json\n", "[INFO|tokenization_utils_base.py:1800] 2023-02-16 16:51:20,929 >> loading file special_tokens_map.json\n", "[INFO|tokenization_utils_base.py:1800] 2023-02-16 16:51:20,929 >> loading file tokenizer_config.json\n", "02/16/2023 16:51:20 - INFO - __main__ - Using hidden states in model: True\n", "-------------------------------------------------------- Using hidden: True\n", "02/16/2023 16:51:20 - INFO - __main__ - Using implementation from class: GPT2ForSequenceClassificationCustom\n", "[INFO|modeling_utils.py:2272] 2023-02-16 16:51:20,982 >> loading weights file out/gpt2/pytorch_model.bin\n", "[INFO|modeling_utils.py:2857] 2023-02-16 16:51:23,451 >> All model checkpoint weights were used when initializing GPT2ForSequenceClassificationCustom.\n", "\n", "[INFO|modeling_utils.py:2865] 2023-02-16 16:51:23,451 >> All the weights of GPT2ForSequenceClassificationCustom were initialized from the model checkpoint at out/gpt2.\n", "If your task is similar to the task the model of the checkpoint was trained on, you can already use GPT2ForSequenceClassificationCustom for predictions without further training.\n", "GPT2ForSequenceClassificationCustom(\n", " (transformer): GPT2Model(\n", " (wte): Embedding(50257, 768)\n", " (wpe): Embedding(1024, 768)\n", " (drop): Dropout(p=0.1, inplace=False)\n", " (h): ModuleList(\n", " (0): GPT2Block(\n", " (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n", " (attn): GPT2Attention(\n", " (c_attn): Conv1D()\n", " (c_proj): Conv1D()\n", " (attn_dropout): Dropout(p=0.1, inplace=False)\n", " (resid_dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n", " (mlp): GPT2MLP(\n", " (c_fc): Conv1D()\n", " (c_proj): Conv1D()\n", " (act): NewGELUActivation()\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " )\n", " (1): GPT2Block(\n", " (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n", " (attn): GPT2Attention(\n", " (c_attn): Conv1D()\n", " (c_proj): Conv1D()\n", " (attn_dropout): Dropout(p=0.1, inplace=False)\n", " (resid_dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n", " (mlp): GPT2MLP(\n", " (c_fc): Conv1D()\n", " (c_proj): Conv1D()\n", " (act): NewGELUActivation()\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " )\n", " (2): 
GPT2Block(\n", " (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n", " (attn): GPT2Attention(\n", " (c_attn): Conv1D()\n", " (c_proj): Conv1D()\n", " (attn_dropout): Dropout(p=0.1, inplace=False)\n", " (resid_dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n", " (mlp): GPT2MLP(\n", " (c_fc): Conv1D()\n", " (c_proj): Conv1D()\n", " (act): NewGELUActivation()\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " )\n", " (3): GPT2Block(\n", " (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n", " (attn): GPT2Attention(\n", " (c_attn): Conv1D()\n", " (c_proj): Conv1D()\n", " (attn_dropout): Dropout(p=0.1, inplace=False)\n", " (resid_dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n", " (mlp): GPT2MLP(\n", " (c_fc): Conv1D()\n", " (c_proj): Conv1D()\n", " (act): NewGELUActivation()\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " )\n", " (4): GPT2Block(\n", " (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n", " (attn): GPT2Attention(\n", " (c_attn): Conv1D()\n", " (c_proj): Conv1D()\n", " (attn_dropout): Dropout(p=0.1, inplace=False)\n", " (resid_dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n", " (mlp): GPT2MLP(\n", " (c_fc): Conv1D()\n", " (c_proj): Conv1D()\n", " (act): NewGELUActivation()\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " )\n", " (5): GPT2Block(\n", " (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n", " (attn): GPT2Attention(\n", " (c_attn): Conv1D()\n", " (c_proj): Conv1D()\n", " (attn_dropout): Dropout(p=0.1, inplace=False)\n", " (resid_dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n", " (mlp): GPT2MLP(\n", " (c_fc): Conv1D()\n", " (c_proj): Conv1D()\n", " (act): NewGELUActivation()\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " )\n", " (6): GPT2Block(\n", " (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n", " (attn): GPT2Attention(\n", " (c_attn): Conv1D()\n", " (c_proj): Conv1D()\n", " (attn_dropout): Dropout(p=0.1, inplace=False)\n", " (resid_dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n", " (mlp): GPT2MLP(\n", " (c_fc): Conv1D()\n", " (c_proj): Conv1D()\n", " (act): NewGELUActivation()\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " )\n", " (7): GPT2Block(\n", " (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n", " (attn): GPT2Attention(\n", " (c_attn): Conv1D()\n", " (c_proj): Conv1D()\n", " (attn_dropout): Dropout(p=0.1, inplace=False)\n", " (resid_dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n", " (mlp): GPT2MLP(\n", " (c_fc): Conv1D()\n", " (c_proj): Conv1D()\n", " (act): NewGELUActivation()\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " )\n", " (8): GPT2Block(\n", " (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n", " (attn): GPT2Attention(\n", " (c_attn): Conv1D()\n", " (c_proj): Conv1D()\n", " (attn_dropout): Dropout(p=0.1, inplace=False)\n", " (resid_dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n", " (mlp): GPT2MLP(\n", " (c_fc): Conv1D()\n", " (c_proj): Conv1D()\n", " (act): NewGELUActivation()\n", " (dropout): 
Dropout(p=0.1, inplace=False)\n", " )\n", " )\n", " (9): GPT2Block(\n", " (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n", " (attn): GPT2Attention(\n", " (c_attn): Conv1D()\n", " (c_proj): Conv1D()\n", " (attn_dropout): Dropout(p=0.1, inplace=False)\n", " (resid_dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n", " (mlp): GPT2MLP(\n", " (c_fc): Conv1D()\n", " (c_proj): Conv1D()\n", " (act): NewGELUActivation()\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " )\n", " (10): GPT2Block(\n", " (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n", " (attn): GPT2Attention(\n", " (c_attn): Conv1D()\n", " (c_proj): Conv1D()\n", " (attn_dropout): Dropout(p=0.1, inplace=False)\n", " (resid_dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n", " (mlp): GPT2MLP(\n", " (c_fc): Conv1D()\n", " (c_proj): Conv1D()\n", " (act): NewGELUActivation()\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " )\n", " (11): GPT2Block(\n", " (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n", " (attn): GPT2Attention(\n", " (c_attn): Conv1D()\n", " (c_proj): Conv1D()\n", " (attn_dropout): Dropout(p=0.1, inplace=False)\n", " (resid_dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n", " (mlp): GPT2MLP(\n", " (c_fc): Conv1D()\n", " (c_proj): Conv1D()\n", " (act): NewGELUActivation()\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " )\n", " )\n", " (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n", " )\n", " (score): GPT2ClassificationHeadCustom(\n", " (dense_1_input): Linear(in_features=768, out_features=1536, bias=True)\n", " (dense_1_hidden): Linear(in_features=768, out_features=1536, bias=True)\n", " (dense_2): Linear(in_features=3072, out_features=3072, bias=True)\n", " (dense_3): Linear(in_features=3072, out_features=3072, bias=True)\n", " (dense_4): Linear(in_features=3072, out_features=768, bias=True)\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " (out_proj): Linear(in_features=768, out_features=4, bias=False)\n", " )\n", ")\n", "Running tokenizer on dataset: 0%| | 0/120 [00:00> max_steps is given, it will override any value given in num_train_epochs\n", "02/16/2023 16:51:35 - INFO - __main__ - *** Evaluate ***\n", "[INFO|trainer.py:710] 2023-02-16 16:51:35,120 >> The following columns in the evaluation set don't have a corresponding argument in `GPT2ForSequenceClassificationCustom.forward` and have been ignored: text. If text are not expected by `GPT2ForSequenceClassificationCustom.forward`, you can safely ignore this message.\n", "[INFO|trainer.py:2964] 2023-02-16 16:51:35,123 >> ***** Running Evaluation *****\n", "[INFO|trainer.py:2966] 2023-02-16 16:51:35,123 >> Num examples = 2000\n", "[INFO|trainer.py:2969] 2023-02-16 16:51:35,123 >> Batch size = 8\n", "100%|█████████████████████████████████████████| 250/250 [00:23<00:00, 10.65it/s]\n", "***** eval metrics *****\n", " eval_accuracy = 0.9195\n", " eval_loss = 0.302\n", " eval_runtime = 0:00:24.11\n", " eval_samples = 2000\n", " eval_samples_per_second = 82.94\n", " eval_steps_per_second = 10.367\n", "02/16/2023 16:51:59 - INFO - __main__ - *** Predict ***\n", "[INFO|trainer.py:710] 2023-02-16 16:51:59,239 >> The following columns in the test set don't have a corresponding argument in `GPT2ForSequenceClassificationCustom.forward` and have been ignored: text. 
If text are not expected by `GPT2ForSequenceClassificationCustom.forward`, you can safely ignore this message.\n", "[INFO|trainer.py:2964] 2023-02-16 16:51:59,240 >> ***** Running Prediction *****\n", "[INFO|trainer.py:2966] 2023-02-16 16:51:59,240 >> Num examples = 3800\n", "[INFO|trainer.py:2969] 2023-02-16 16:51:59,240 >> Batch size = 8\n", "100%|█████████████████████████████████████████| 475/475 [00:43<00:00, 10.84it/s]\n", "02/16/2023 16:52:43 - INFO - __main__ - ***** Predict results None *****\n", "[INFO|modelcard.py:449] 2023-02-16 16:52:43,692 >> Dropping the following result as it does not have all the necessary fields:\n", "{'task': {'name': 'Text Classification', 'type': 'text-classification'}}\n" ] } ], "source": [ "!python run_glue.py \\\n", " --cache_dir .cache_training \\\n", " --model_name_or_path out/gpt2 \\\n", " --custom_model gpt2_hidden \\\n", " --train_file data/train.json \\\n", " --validation_file data/valid.json \\\n", " --test_file data/test.json \\\n", " --per_device_train_batch_size 8 \\\n", " --per_device_eval_batch_size 8 \\\n", " --do_eval \\\n", " --do_predict \\\n", " --max_seq_length 128 \\\n", " --learning_rate 2e-5 \\\n", " --max_eval_samples 2000 \\\n", " --max_steps 2500 \\\n", " --num_train_epochs 1 \\\n", " --save_strategy steps \\\n", " --save_steps 250 \\\n", " --save_total_limit 5 \\\n", " --logging_strategy steps \\\n", " --logging_steps 100 \\\n", " --eval_steps 250 \\\n", " --evaluation_strategy steps \\\n", " --metric_for_best_model accuracy \\\n", " --greater_is_better True \\\n", " --load_best_model_at_end True \\\n", " --output_dir out/gpt2_results " ] }, { "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ "## Results" ] }, { "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\u001b[0;39m0.9194999933242798\u001b[0m\n" ] } ], "source": [ "!cat out/gpt2_results/eval_results.json | jq .eval_accuracy" ] }, { "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ "# T5" ] }, { "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ "## Modifications" ] }, { "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ "- Custom classification head with 3 dense layers\n", "- Encoder layers frozen\n", "- Decoder layers frozen" ] }, { "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ "## Code" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [], "source": [ "import torch\n", "import copy\n", "from torch import nn\n", "from transformers import T5PreTrainedModel, T5Config\n", "from transformers.models.t5.modeling_t5 import T5Stack\n", "from transformers.modeling_outputs import SequenceClassifierOutput\n", "\n", "\n", "class T5ClassificationHead(nn.Module):\n", " def __init__(self, config: T5Config):\n", " super().__init__()\n", "\n", " self.dense_in = nn.Linear(config.d_model, 768)\n", " self.dense = nn.Linear(768, 768)\n", " self.dense_out = nn.Linear(768, config.num_labels)\n", " self.dropout = nn.Dropout(0.1)\n", "\n", " def forward(self, features, **kwargs):\n", " x = features[:, 0, :]\n", " x = self.dropout(x)\n", " x = self.dense_in(x)\n", " x = torch.relu(x)\n", " x = self.dropout(x)\n", " x = self.dense(x)\n", " x = torch.relu(x)\n", " x = self.dropout(x)\n", " x = self.dense_out(x)\n", "\n", " return x\n", "\n", "\n", "class T5ForClassification(T5PreTrainedModel):\n", " def __init__(self, config: T5Config):\n", " super().__init__(config)\n", " 
self.model_dim = config.d_model\n", "\n", " self.shared = nn.Embedding(config.vocab_size, config.d_model)\n", "\n", " encoder_config = copy.deepcopy(config)\n", " encoder_config.is_decoder = False\n", " encoder_config.use_cache = False\n", " encoder_config.is_encoder_decoder = False\n", " self.encoder = T5Stack(encoder_config, self.shared)\n", "\n", " decoder_config = copy.deepcopy(config)\n", " decoder_config.is_decoder = True\n", " decoder_config.is_encoder_decoder = False\n", " decoder_config.num_layers = config.num_decoder_layers\n", " self.decoder = T5Stack(decoder_config, self.shared)\n", "\n", " modules_to_freeze = [self.encoder.block[i].layer[0] for i in range(len(self.encoder.block))]\n", " modules_to_freeze.extend([self.decoder.block[i].layer[0] for i in range(len(self.decoder.block))])\n", " modules_to_freeze.extend([self.decoder.block[i].layer[1] for i in range(len(self.decoder.block))])\n", "\n", " for module in modules_to_freeze:\n", " for param in module.parameters():\n", " param.requires_grad = False\n", "\n", " self.lm_head = T5ClassificationHead(config)\n", "\n", " # Initialize weights and apply final processing\n", " self.post_init()\n", "\n", " # Model parallel\n", " self.model_parallel = False\n", " self.device_map = None\n", "\n", "\n", " def forward(\n", " self,\n", " input_ids=None,\n", " attention_mask=None,\n", " head_mask=None,\n", " cross_attn_head_mask=None,\n", " past_key_values=None,\n", " inputs_embeds=None,\n", " decoder_inputs_embeds=None,\n", " use_cache=None,\n", " output_attentions=None,\n", " output_hidden_states=None,\n", " return_dict=None,\n", " labels=None\n", " ):\n", " return_dict = return_dict if return_dict is not None else self.config.use_return_dict\n", "\n", " outputs = self.encoder(\n", " input_ids,\n", " attention_mask=attention_mask,\n", " head_mask=head_mask,\n", " cross_attn_head_mask=cross_attn_head_mask,\n", " past_key_values=past_key_values,\n", " inputs_embeds=inputs_embeds,\n", " use_cache=use_cache,\n", " output_attentions=output_attentions,\n", " output_hidden_states=output_hidden_states,\n", " return_dict=return_dict,\n", " )\n", "\n", " outputs = self.decoder(\n", " input_ids,\n", " attention_mask=attention_mask,\n", " head_mask=head_mask,\n", " cross_attn_head_mask=cross_attn_head_mask,\n", " past_key_values=past_key_values,\n", " inputs_embeds=inputs_embeds,\n", " use_cache=use_cache,\n", " output_attentions=output_attentions,\n", " output_hidden_states=output_hidden_states,\n", " return_dict=return_dict,\n", " )\n", "\n", "\n", " logits = self.lm_head(outputs[0])\n", "\n", "\n", " loss = None\n", " if labels is not None:\n", " loss_fct = nn.CrossEntropyLoss()\n", " loss = loss_fct(logits.view(-1, self.config.num_labels), labels.view(-1))\n", "\n", "\n", " return SequenceClassifierOutput(\n", " loss=loss,\n", " logits=logits,\n", " )\n" ] }, { "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ "## Model" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "fda885ac92b1459ba9c0faf41a9d925f", "version_major": 2, "version_minor": 0 }, "text/plain": [ "Downloading (…)lve/main/config.json: 0%| | 0.00/1.21k [00:00,\n", "ignore_data_skip=False,\n", "include_inputs_for_metrics=False,\n", "jit_mode_eval=False,\n", "label_names=None,\n", "label_smoothing_factor=0.0,\n", "learning_rate=2e-05,\n", "length_column_name=length,\n", "load_best_model_at_end=True,\n", "local_rank=-1,\n", "log_level=passive,\n", 
"log_level_replica=passive,\n", "log_on_each_node=True,\n", "logging_dir=out/t5/runs/Feb16_15-24-12_DESKTOP-R7JO8BQ,\n", "logging_first_step=False,\n", "logging_nan_inf_filter=True,\n", "logging_steps=100,\n", "logging_strategy=steps,\n", "lr_scheduler_type=linear,\n", "max_grad_norm=1.0,\n", "max_steps=2500,\n", "metric_for_best_model=accuracy,\n", "mp_parameters=,\n", "no_cuda=False,\n", "num_train_epochs=1.0,\n", "optim=adamw_hf,\n", "optim_args=None,\n", "output_dir=out/t5,\n", "overwrite_output_dir=False,\n", "past_index=-1,\n", "per_device_eval_batch_size=8,\n", "per_device_train_batch_size=8,\n", "prediction_loss_only=False,\n", "push_to_hub=False,\n", "push_to_hub_model_id=None,\n", "push_to_hub_organization=None,\n", "push_to_hub_token=,\n", "ray_scope=last,\n", "remove_unused_columns=True,\n", "report_to=[],\n", "resume_from_checkpoint=None,\n", "run_name=out/t5,\n", "save_on_each_node=False,\n", "save_steps=250,\n", "save_strategy=steps,\n", "save_total_limit=5,\n", "seed=42,\n", "sharded_ddp=[],\n", "skip_memory_metrics=True,\n", "tf32=None,\n", "torch_compile=False,\n", "torch_compile_backend=None,\n", "torch_compile_mode=None,\n", "torchdynamo=None,\n", "tpu_metrics_debug=False,\n", "tpu_num_cores=None,\n", "use_ipex=False,\n", "use_legacy_prediction_loop=False,\n", "use_mps_device=False,\n", "warmup_ratio=0.0,\n", "warmup_steps=0,\n", "weight_decay=0.0,\n", "xpu_backend=None,\n", ")\n", "02/16/2023 15:24:13 - INFO - __main__ - Checkpoint detected, resuming training at out/t5/checkpoint-2500. To avoid this behavior, change the `--output_dir` or add `--overwrite_output_dir` to train from scratch.\n", "02/16/2023 15:24:13 - INFO - __main__ - load a local file for train: data/train.json\n", "02/16/2023 15:24:13 - INFO - __main__ - load a local file for validation: data/valid.json\n", "02/16/2023 15:24:13 - WARNING - datasets.builder - Using custom data configuration default-e10a382a423bbb9a\n", "02/16/2023 15:24:13 - INFO - datasets.info - Loading Dataset Infos from /home/jacob/anaconda3/envs/ugp/lib/python3.10/site-packages/datasets/packaged_modules/json\n", "02/16/2023 15:24:13 - INFO - datasets.builder - Overwrite dataset info from restored data version.\n", "02/16/2023 15:24:13 - INFO - datasets.info - Loading Dataset info from .cache_training/json/default-e10a382a423bbb9a/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51\n", "02/16/2023 15:24:13 - WARNING - datasets.builder - Found cached dataset json (/home/jacob/code/university/uczenie_glebokie_w_przetwarzaniu_tekstu/.cache_training/json/default-e10a382a423bbb9a/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51)\n", "02/16/2023 15:24:13 - INFO - datasets.info - Loading Dataset info from /home/jacob/code/university/uczenie_glebokie_w_przetwarzaniu_tekstu/.cache_training/json/default-e10a382a423bbb9a/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51\n", "100%|████████████████████████████████████████████| 2/2 [00:00<00:00, 426.97it/s]\n", "[INFO|configuration_utils.py:660] 2023-02-16 15:24:14,422 >> loading configuration file config.json from cache at .cache_training/models--t5-base/snapshots/0db7e623bcaee2daf9b859a646637ea39bf016cd/config.json\n", "[INFO|configuration_utils.py:712] 2023-02-16 15:24:14,423 >> Model config T5Config {\n", " \"_name_or_path\": \"t5-base\",\n", " \"architectures\": [\n", " \"T5ForConditionalGeneration\"\n", " ],\n", " \"d_ff\": 3072,\n", " \"d_kv\": 64,\n", " \"d_model\": 768,\n", " \"decoder_start_token_id\": 0,\n", " 
\"dense_act_fn\": \"relu\",\n", " \"dropout_rate\": 0.1,\n", " \"eos_token_id\": 1,\n", " \"feed_forward_proj\": \"relu\",\n", " \"id2label\": {\n", " \"0\": \"LABEL_0\",\n", " \"1\": \"LABEL_1\",\n", " \"2\": \"LABEL_2\",\n", " \"3\": \"LABEL_3\"\n", " },\n", " \"initializer_factor\": 1.0,\n", " \"is_encoder_decoder\": true,\n", " \"is_gated_act\": false,\n", " \"label2id\": {\n", " \"LABEL_0\": 0,\n", " \"LABEL_1\": 1,\n", " \"LABEL_2\": 2,\n", " \"LABEL_3\": 3\n", " },\n", " \"layer_norm_epsilon\": 1e-06,\n", " \"model_type\": \"t5\",\n", " \"n_positions\": 512,\n", " \"num_decoder_layers\": 12,\n", " \"num_heads\": 12,\n", " \"num_layers\": 12,\n", " \"output_past\": true,\n", " \"pad_token_id\": 0,\n", " \"relative_attention_max_distance\": 128,\n", " \"relative_attention_num_buckets\": 32,\n", " \"task_specific_params\": {\n", " \"summarization\": {\n", " \"early_stopping\": true,\n", " \"length_penalty\": 2.0,\n", " \"max_length\": 200,\n", " \"min_length\": 30,\n", " \"no_repeat_ngram_size\": 3,\n", " \"num_beams\": 4,\n", " \"prefix\": \"summarize: \"\n", " },\n", " \"translation_en_to_de\": {\n", " \"early_stopping\": true,\n", " \"max_length\": 300,\n", " \"num_beams\": 4,\n", " \"prefix\": \"translate English to German: \"\n", " },\n", " \"translation_en_to_fr\": {\n", " \"early_stopping\": true,\n", " \"max_length\": 300,\n", " \"num_beams\": 4,\n", " \"prefix\": \"translate English to French: \"\n", " },\n", " \"translation_en_to_ro\": {\n", " \"early_stopping\": true,\n", " \"max_length\": 300,\n", " \"num_beams\": 4,\n", " \"prefix\": \"translate English to Romanian: \"\n", " }\n", " },\n", " \"transformers_version\": \"4.26.1\",\n", " \"use_cache\": true,\n", " \"vocab_size\": 32128\n", "}\n", "\n", "[INFO|tokenization_auto.py:458] 2023-02-16 15:24:14,918 >> Could not locate the tokenizer configuration file, will try to use the model config instead.\n", "[INFO|configuration_utils.py:660] 2023-02-16 15:24:15,378 >> loading configuration file config.json from cache at .cache_training/models--t5-base/snapshots/0db7e623bcaee2daf9b859a646637ea39bf016cd/config.json\n", "[INFO|configuration_utils.py:712] 2023-02-16 15:24:15,378 >> Model config T5Config {\n", " \"_name_or_path\": \"t5-base\",\n", " \"architectures\": [\n", " \"T5ForConditionalGeneration\"\n", " ],\n", " \"d_ff\": 3072,\n", " \"d_kv\": 64,\n", " \"d_model\": 768,\n", " \"decoder_start_token_id\": 0,\n", " \"dense_act_fn\": \"relu\",\n", " \"dropout_rate\": 0.1,\n", " \"eos_token_id\": 1,\n", " \"feed_forward_proj\": \"relu\",\n", " \"initializer_factor\": 1.0,\n", " \"is_encoder_decoder\": true,\n", " \"is_gated_act\": false,\n", " \"layer_norm_epsilon\": 1e-06,\n", " \"model_type\": \"t5\",\n", " \"n_positions\": 512,\n", " \"num_decoder_layers\": 12,\n", " \"num_heads\": 12,\n", " \"num_layers\": 12,\n", " \"output_past\": true,\n", " \"pad_token_id\": 0,\n", " \"relative_attention_max_distance\": 128,\n", " \"relative_attention_num_buckets\": 32,\n", " \"task_specific_params\": {\n", " \"summarization\": {\n", " \"early_stopping\": true,\n", " \"length_penalty\": 2.0,\n", " \"max_length\": 200,\n", " \"min_length\": 30,\n", " \"no_repeat_ngram_size\": 3,\n", " \"num_beams\": 4,\n", " \"prefix\": \"summarize: \"\n", " },\n", " \"translation_en_to_de\": {\n", " \"early_stopping\": true,\n", " \"max_length\": 300,\n", " \"num_beams\": 4,\n", " \"prefix\": \"translate English to German: \"\n", " },\n", " \"translation_en_to_fr\": {\n", " \"early_stopping\": true,\n", " \"max_length\": 300,\n", " \"num_beams\": 
4,\n", " \"prefix\": \"translate English to French: \"\n", " },\n", " \"translation_en_to_ro\": {\n", " \"early_stopping\": true,\n", " \"max_length\": 300,\n", " \"num_beams\": 4,\n", " \"prefix\": \"translate English to Romanian: \"\n", " }\n", " },\n", " \"transformers_version\": \"4.26.1\",\n", " \"use_cache\": true,\n", " \"vocab_size\": 32128\n", "}\n", "\n", "[INFO|tokenization_utils_base.py:1802] 2023-02-16 15:24:16,341 >> loading file spiece.model from cache at .cache_training/models--t5-base/snapshots/0db7e623bcaee2daf9b859a646637ea39bf016cd/spiece.model\n", "[INFO|tokenization_utils_base.py:1802] 2023-02-16 15:24:16,341 >> loading file tokenizer.json from cache at .cache_training/models--t5-base/snapshots/0db7e623bcaee2daf9b859a646637ea39bf016cd/tokenizer.json\n", "[INFO|tokenization_utils_base.py:1802] 2023-02-16 15:24:16,341 >> loading file added_tokens.json from cache at None\n", "[INFO|tokenization_utils_base.py:1802] 2023-02-16 15:24:16,341 >> loading file special_tokens_map.json from cache at None\n", "[INFO|tokenization_utils_base.py:1802] 2023-02-16 15:24:16,341 >> loading file tokenizer_config.json from cache at None\n", "[INFO|configuration_utils.py:660] 2023-02-16 15:24:16,342 >> loading configuration file config.json from cache at .cache_training/models--t5-base/snapshots/0db7e623bcaee2daf9b859a646637ea39bf016cd/config.json\n", "[INFO|configuration_utils.py:712] 2023-02-16 15:24:16,342 >> Model config T5Config {\n", " \"_name_or_path\": \"t5-base\",\n", " \"architectures\": [\n", " \"T5ForConditionalGeneration\"\n", " ],\n", " \"d_ff\": 3072,\n", " \"d_kv\": 64,\n", " \"d_model\": 768,\n", " \"decoder_start_token_id\": 0,\n", " \"dense_act_fn\": \"relu\",\n", " \"dropout_rate\": 0.1,\n", " \"eos_token_id\": 1,\n", " \"feed_forward_proj\": \"relu\",\n", " \"initializer_factor\": 1.0,\n", " \"is_encoder_decoder\": true,\n", " \"is_gated_act\": false,\n", " \"layer_norm_epsilon\": 1e-06,\n", " \"model_type\": \"t5\",\n", " \"n_positions\": 512,\n", " \"num_decoder_layers\": 12,\n", " \"num_heads\": 12,\n", " \"num_layers\": 12,\n", " \"output_past\": true,\n", " \"pad_token_id\": 0,\n", " \"relative_attention_max_distance\": 128,\n", " \"relative_attention_num_buckets\": 32,\n", " \"task_specific_params\": {\n", " \"summarization\": {\n", " \"early_stopping\": true,\n", " \"length_penalty\": 2.0,\n", " \"max_length\": 200,\n", " \"min_length\": 30,\n", " \"no_repeat_ngram_size\": 3,\n", " \"num_beams\": 4,\n", " \"prefix\": \"summarize: \"\n", " },\n", " \"translation_en_to_de\": {\n", " \"early_stopping\": true,\n", " \"max_length\": 300,\n", " \"num_beams\": 4,\n", " \"prefix\": \"translate English to German: \"\n", " },\n", " \"translation_en_to_fr\": {\n", " \"early_stopping\": true,\n", " \"max_length\": 300,\n", " \"num_beams\": 4,\n", " \"prefix\": \"translate English to French: \"\n", " },\n", " \"translation_en_to_ro\": {\n", " \"early_stopping\": true,\n", " \"max_length\": 300,\n", " \"num_beams\": 4,\n", " \"prefix\": \"translate English to Romanian: \"\n", " }\n", " },\n", " \"transformers_version\": \"4.26.1\",\n", " \"use_cache\": true,\n", " \"vocab_size\": 32128\n", "}\n", "\n", "/home/jacob/anaconda3/envs/ugp/lib/python3.10/site-packages/transformers/models/t5/tokenization_t5_fast.py:155: FutureWarning: This tokenizer was incorrectly instantiated with a model max length of 512 which will be corrected in Transformers v5.\n", "For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.\n", "- 
Be aware that you SHOULD NOT rely on t5-base automatically truncating your input to 512 when padding/encoding.\n", "- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.\n", "- To avoid this warning, please instantiate this tokenizer with `model_max_length` set to your preferred value.\n", " warnings.warn(\n", "02/16/2023 15:24:16 - INFO - __main__ - Using hidden states in model: False\n", "-------------------------------------------------------- Using hidden: False\n", "02/16/2023 15:24:16 - INFO - __main__ - Using implementation from class: T5ForClassification\n", "[INFO|modeling_utils.py:2275] 2023-02-16 15:24:16,391 >> loading weights file pytorch_model.bin from cache at .cache_training/models--t5-base/snapshots/0db7e623bcaee2daf9b859a646637ea39bf016cd/pytorch_model.bin\n", "[WARNING|modeling_utils.py:2847] 2023-02-16 15:24:19,101 >> Some weights of the model checkpoint at t5-base were not used when initializing T5ForClassification: ['decoder.block.0.layer.1.EncDecAttention.relative_attention_bias.weight']\n", "- This IS expected if you are initializing T5ForClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).\n", "- This IS NOT expected if you are initializing T5ForClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n", "[WARNING|modeling_utils.py:2859] 2023-02-16 15:24:19,102 >> Some weights of T5ForClassification were not initialized from the model checkpoint at t5-base and are newly initialized: ['decoder.embed_tokens.weight', 'lm_head.dense.bias', 'lm_head.dense_out.bias', 'encoder.embed_tokens.weight', 'lm_head.dense_in.bias', 'lm_head.dense_in.weight', 'lm_head.dense.weight', 'lm_head.dense_out.weight']\n", "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n", "T5ForClassification(\n", " (shared): Embedding(32128, 768)\n", " (encoder): T5Stack(\n", " (embed_tokens): Embedding(32128, 768)\n", " (block): ModuleList(\n", " (0): T5Block(\n", " (layer): ModuleList(\n", " (0): T5LayerSelfAttention(\n", " (SelfAttention): T5Attention(\n", " (q): Linear(in_features=768, out_features=768, bias=False)\n", " (k): Linear(in_features=768, out_features=768, bias=False)\n", " (v): Linear(in_features=768, out_features=768, bias=False)\n", " (o): Linear(in_features=768, out_features=768, bias=False)\n", " (relative_attention_bias): Embedding(32, 12)\n", " )\n", " (layer_norm): T5LayerNorm()\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " (1): T5LayerFF(\n", " (DenseReluDense): T5DenseActDense(\n", " (wi): Linear(in_features=768, out_features=3072, bias=False)\n", " (wo): Linear(in_features=3072, out_features=768, bias=False)\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " (act): ReLU()\n", " )\n", " (layer_norm): T5LayerNorm()\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " )\n", " )\n", " (1): T5Block(\n", " (layer): ModuleList(\n", " (0): T5LayerSelfAttention(\n", " (SelfAttention): T5Attention(\n", " (q): Linear(in_features=768, out_features=768, bias=False)\n", " (k): Linear(in_features=768, out_features=768, bias=False)\n", " (v): Linear(in_features=768, out_features=768, bias=False)\n", " (o): Linear(in_features=768, 
out_features=768, bias=False)\n", " )\n", " (layer_norm): T5LayerNorm()\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " (1): T5LayerFF(\n", " (DenseReluDense): T5DenseActDense(\n", " (wi): Linear(in_features=768, out_features=3072, bias=False)\n", " (wo): Linear(in_features=3072, out_features=768, bias=False)\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " (act): ReLU()\n", " )\n", " (layer_norm): T5LayerNorm()\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " )\n", " )\n", " (2): T5Block(\n", " (layer): ModuleList(\n", " (0): T5LayerSelfAttention(\n", " (SelfAttention): T5Attention(\n", " (q): Linear(in_features=768, out_features=768, bias=False)\n", " (k): Linear(in_features=768, out_features=768, bias=False)\n", " (v): Linear(in_features=768, out_features=768, bias=False)\n", " (o): Linear(in_features=768, out_features=768, bias=False)\n", " )\n", " (layer_norm): T5LayerNorm()\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " (1): T5LayerFF(\n", " (DenseReluDense): T5DenseActDense(\n", " (wi): Linear(in_features=768, out_features=3072, bias=False)\n", " (wo): Linear(in_features=3072, out_features=768, bias=False)\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " (act): ReLU()\n", " )\n", " (layer_norm): T5LayerNorm()\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " )\n", " )\n", " (3): T5Block(\n", " (layer): ModuleList(\n", " (0): T5LayerSelfAttention(\n", " (SelfAttention): T5Attention(\n", " (q): Linear(in_features=768, out_features=768, bias=False)\n", " (k): Linear(in_features=768, out_features=768, bias=False)\n", " (v): Linear(in_features=768, out_features=768, bias=False)\n", " (o): Linear(in_features=768, out_features=768, bias=False)\n", " )\n", " (layer_norm): T5LayerNorm()\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " (1): T5LayerFF(\n", " (DenseReluDense): T5DenseActDense(\n", " (wi): Linear(in_features=768, out_features=3072, bias=False)\n", " (wo): Linear(in_features=3072, out_features=768, bias=False)\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " (act): ReLU()\n", " )\n", " (layer_norm): T5LayerNorm()\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " )\n", " )\n", " (4): T5Block(\n", " (layer): ModuleList(\n", " (0): T5LayerSelfAttention(\n", " (SelfAttention): T5Attention(\n", " (q): Linear(in_features=768, out_features=768, bias=False)\n", " (k): Linear(in_features=768, out_features=768, bias=False)\n", " (v): Linear(in_features=768, out_features=768, bias=False)\n", " (o): Linear(in_features=768, out_features=768, bias=False)\n", " )\n", " (layer_norm): T5LayerNorm()\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " (1): T5LayerFF(\n", " (DenseReluDense): T5DenseActDense(\n", " (wi): Linear(in_features=768, out_features=3072, bias=False)\n", " (wo): Linear(in_features=3072, out_features=768, bias=False)\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " (act): ReLU()\n", " )\n", " (layer_norm): T5LayerNorm()\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " )\n", " )\n", " (5): T5Block(\n", " (layer): ModuleList(\n", " (0): T5LayerSelfAttention(\n", " (SelfAttention): T5Attention(\n", " (q): Linear(in_features=768, out_features=768, bias=False)\n", " (k): Linear(in_features=768, out_features=768, bias=False)\n", " (v): Linear(in_features=768, out_features=768, bias=False)\n", " (o): Linear(in_features=768, out_features=768, bias=False)\n", " )\n", " (layer_norm): T5LayerNorm()\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " (1): T5LayerFF(\n", " 
(DenseReluDense): T5DenseActDense(\n", " (wi): Linear(in_features=768, out_features=3072, bias=False)\n", " (wo): Linear(in_features=3072, out_features=768, bias=False)\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " (act): ReLU()\n", " )\n", " (layer_norm): T5LayerNorm()\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " )\n", " )\n", " (6): T5Block(\n", " (layer): ModuleList(\n", " (0): T5LayerSelfAttention(\n", " (SelfAttention): T5Attention(\n", " (q): Linear(in_features=768, out_features=768, bias=False)\n", " (k): Linear(in_features=768, out_features=768, bias=False)\n", " (v): Linear(in_features=768, out_features=768, bias=False)\n", " (o): Linear(in_features=768, out_features=768, bias=False)\n", " )\n", " (layer_norm): T5LayerNorm()\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " (1): T5LayerFF(\n", " (DenseReluDense): T5DenseActDense(\n", " (wi): Linear(in_features=768, out_features=3072, bias=False)\n", " (wo): Linear(in_features=3072, out_features=768, bias=False)\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " (act): ReLU()\n", " )\n", " (layer_norm): T5LayerNorm()\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " )\n", " )\n", " (7): T5Block(\n", " (layer): ModuleList(\n", " (0): T5LayerSelfAttention(\n", " (SelfAttention): T5Attention(\n", " (q): Linear(in_features=768, out_features=768, bias=False)\n", " (k): Linear(in_features=768, out_features=768, bias=False)\n", " (v): Linear(in_features=768, out_features=768, bias=False)\n", " (o): Linear(in_features=768, out_features=768, bias=False)\n", " )\n", " (layer_norm): T5LayerNorm()\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " (1): T5LayerFF(\n", " (DenseReluDense): T5DenseActDense(\n", " (wi): Linear(in_features=768, out_features=3072, bias=False)\n", " (wo): Linear(in_features=3072, out_features=768, bias=False)\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " (act): ReLU()\n", " )\n", " (layer_norm): T5LayerNorm()\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " )\n", " )\n", " (8): T5Block(\n", " (layer): ModuleList(\n", " (0): T5LayerSelfAttention(\n", " (SelfAttention): T5Attention(\n", " (q): Linear(in_features=768, out_features=768, bias=False)\n", " (k): Linear(in_features=768, out_features=768, bias=False)\n", " (v): Linear(in_features=768, out_features=768, bias=False)\n", " (o): Linear(in_features=768, out_features=768, bias=False)\n", " )\n", " (layer_norm): T5LayerNorm()\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " (1): T5LayerFF(\n", " (DenseReluDense): T5DenseActDense(\n", " (wi): Linear(in_features=768, out_features=3072, bias=False)\n", " (wo): Linear(in_features=3072, out_features=768, bias=False)\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " (act): ReLU()\n", " )\n", " (layer_norm): T5LayerNorm()\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " )\n", " )\n", " (9): T5Block(\n", " (layer): ModuleList(\n", " (0): T5LayerSelfAttention(\n", " (SelfAttention): T5Attention(\n", " (q): Linear(in_features=768, out_features=768, bias=False)\n", " (k): Linear(in_features=768, out_features=768, bias=False)\n", " (v): Linear(in_features=768, out_features=768, bias=False)\n", " (o): Linear(in_features=768, out_features=768, bias=False)\n", " )\n", " (layer_norm): T5LayerNorm()\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " (1): T5LayerFF(\n", " (DenseReluDense): T5DenseActDense(\n", " (wi): Linear(in_features=768, out_features=3072, bias=False)\n", " (wo): Linear(in_features=3072, 
out_features=768, bias=False)\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " (act): ReLU()\n", " )\n", " (layer_norm): T5LayerNorm()\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " )\n", " )\n", " (10): T5Block(\n", " (layer): ModuleList(\n", " (0): T5LayerSelfAttention(\n", " (SelfAttention): T5Attention(\n", " (q): Linear(in_features=768, out_features=768, bias=False)\n", " (k): Linear(in_features=768, out_features=768, bias=False)\n", " (v): Linear(in_features=768, out_features=768, bias=False)\n", " (o): Linear(in_features=768, out_features=768, bias=False)\n", " )\n", " (layer_norm): T5LayerNorm()\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " (1): T5LayerFF(\n", " (DenseReluDense): T5DenseActDense(\n", " (wi): Linear(in_features=768, out_features=3072, bias=False)\n", " (wo): Linear(in_features=3072, out_features=768, bias=False)\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " (act): ReLU()\n", " )\n", " (layer_norm): T5LayerNorm()\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " )\n", " )\n", " (11): T5Block(\n", " (layer): ModuleList(\n", " (0): T5LayerSelfAttention(\n", " (SelfAttention): T5Attention(\n", " (q): Linear(in_features=768, out_features=768, bias=False)\n", " (k): Linear(in_features=768, out_features=768, bias=False)\n", " (v): Linear(in_features=768, out_features=768, bias=False)\n", " (o): Linear(in_features=768, out_features=768, bias=False)\n", " )\n", " (layer_norm): T5LayerNorm()\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " (1): T5LayerFF(\n", " (DenseReluDense): T5DenseActDense(\n", " (wi): Linear(in_features=768, out_features=3072, bias=False)\n", " (wo): Linear(in_features=3072, out_features=768, bias=False)\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " (act): ReLU()\n", " )\n", " (layer_norm): T5LayerNorm()\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " )\n", " )\n", " )\n", " (final_layer_norm): T5LayerNorm()\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " (decoder): T5Stack(\n", " (embed_tokens): Embedding(32128, 768)\n", " (block): ModuleList(\n", " (0): T5Block(\n", " (layer): ModuleList(\n", " (0): T5LayerSelfAttention(\n", " (SelfAttention): T5Attention(\n", " (q): Linear(in_features=768, out_features=768, bias=False)\n", " (k): Linear(in_features=768, out_features=768, bias=False)\n", " (v): Linear(in_features=768, out_features=768, bias=False)\n", " (o): Linear(in_features=768, out_features=768, bias=False)\n", " (relative_attention_bias): Embedding(32, 12)\n", " )\n", " (layer_norm): T5LayerNorm()\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " (1): T5LayerCrossAttention(\n", " (EncDecAttention): T5Attention(\n", " (q): Linear(in_features=768, out_features=768, bias=False)\n", " (k): Linear(in_features=768, out_features=768, bias=False)\n", " (v): Linear(in_features=768, out_features=768, bias=False)\n", " (o): Linear(in_features=768, out_features=768, bias=False)\n", " )\n", " (layer_norm): T5LayerNorm()\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " (2): T5LayerFF(\n", " (DenseReluDense): T5DenseActDense(\n", " (wi): Linear(in_features=768, out_features=3072, bias=False)\n", " (wo): Linear(in_features=3072, out_features=768, bias=False)\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " (act): ReLU()\n", " )\n", " (layer_norm): T5LayerNorm()\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " )\n", " )\n", " (1): T5Block(\n", " (layer): ModuleList(\n", " (0): T5LayerSelfAttention(\n", " (SelfAttention): 
T5Attention(\n", " (q): Linear(in_features=768, out_features=768, bias=False)\n", " (k): Linear(in_features=768, out_features=768, bias=False)\n", " (v): Linear(in_features=768, out_features=768, bias=False)\n", " (o): Linear(in_features=768, out_features=768, bias=False)\n", " )\n", " (layer_norm): T5LayerNorm()\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " (1): T5LayerCrossAttention(\n", " (EncDecAttention): T5Attention(\n", " (q): Linear(in_features=768, out_features=768, bias=False)\n", " (k): Linear(in_features=768, out_features=768, bias=False)\n", " (v): Linear(in_features=768, out_features=768, bias=False)\n", " (o): Linear(in_features=768, out_features=768, bias=False)\n", " )\n", " (layer_norm): T5LayerNorm()\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " (2): T5LayerFF(\n", " (DenseReluDense): T5DenseActDense(\n", " (wi): Linear(in_features=768, out_features=3072, bias=False)\n", " (wo): Linear(in_features=3072, out_features=768, bias=False)\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " (act): ReLU()\n", " )\n", " (layer_norm): T5LayerNorm()\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " )\n", " )\n", " (2): T5Block(\n", " (layer): ModuleList(\n", " (0): T5LayerSelfAttention(\n", " (SelfAttention): T5Attention(\n", " (q): Linear(in_features=768, out_features=768, bias=False)\n", " (k): Linear(in_features=768, out_features=768, bias=False)\n", " (v): Linear(in_features=768, out_features=768, bias=False)\n", " (o): Linear(in_features=768, out_features=768, bias=False)\n", " )\n", " (layer_norm): T5LayerNorm()\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " (1): T5LayerCrossAttention(\n", " (EncDecAttention): T5Attention(\n", " (q): Linear(in_features=768, out_features=768, bias=False)\n", " (k): Linear(in_features=768, out_features=768, bias=False)\n", " (v): Linear(in_features=768, out_features=768, bias=False)\n", " (o): Linear(in_features=768, out_features=768, bias=False)\n", " )\n", " (layer_norm): T5LayerNorm()\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " (2): T5LayerFF(\n", " (DenseReluDense): T5DenseActDense(\n", " (wi): Linear(in_features=768, out_features=3072, bias=False)\n", " (wo): Linear(in_features=3072, out_features=768, bias=False)\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " (act): ReLU()\n", " )\n", " (layer_norm): T5LayerNorm()\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " )\n", " )\n", " (3): T5Block(\n", " (layer): ModuleList(\n", " (0): T5LayerSelfAttention(\n", " (SelfAttention): T5Attention(\n", " (q): Linear(in_features=768, out_features=768, bias=False)\n", " (k): Linear(in_features=768, out_features=768, bias=False)\n", " (v): Linear(in_features=768, out_features=768, bias=False)\n", " (o): Linear(in_features=768, out_features=768, bias=False)\n", " )\n", " (layer_norm): T5LayerNorm()\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " (1): T5LayerCrossAttention(\n", " (EncDecAttention): T5Attention(\n", " (q): Linear(in_features=768, out_features=768, bias=False)\n", " (k): Linear(in_features=768, out_features=768, bias=False)\n", " (v): Linear(in_features=768, out_features=768, bias=False)\n", " (o): Linear(in_features=768, out_features=768, bias=False)\n", " )\n", " (layer_norm): T5LayerNorm()\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " (2): T5LayerFF(\n", " (DenseReluDense): T5DenseActDense(\n", " (wi): Linear(in_features=768, out_features=3072, bias=False)\n", " (wo): Linear(in_features=3072, out_features=768, 
bias=False)\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " (act): ReLU()\n", " )\n", " (layer_norm): T5LayerNorm()\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " )\n", " )\n", " (4): T5Block(\n", " (layer): ModuleList(\n", " (0): T5LayerSelfAttention(\n", " (SelfAttention): T5Attention(\n", " (q): Linear(in_features=768, out_features=768, bias=False)\n", " (k): Linear(in_features=768, out_features=768, bias=False)\n", " (v): Linear(in_features=768, out_features=768, bias=False)\n", " (o): Linear(in_features=768, out_features=768, bias=False)\n", " )\n", " (layer_norm): T5LayerNorm()\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " (1): T5LayerCrossAttention(\n", " (EncDecAttention): T5Attention(\n", " (q): Linear(in_features=768, out_features=768, bias=False)\n", " (k): Linear(in_features=768, out_features=768, bias=False)\n", " (v): Linear(in_features=768, out_features=768, bias=False)\n", " (o): Linear(in_features=768, out_features=768, bias=False)\n", " )\n", " (layer_norm): T5LayerNorm()\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " (2): T5LayerFF(\n", " (DenseReluDense): T5DenseActDense(\n", " (wi): Linear(in_features=768, out_features=3072, bias=False)\n", " (wo): Linear(in_features=3072, out_features=768, bias=False)\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " (act): ReLU()\n", " )\n", " (layer_norm): T5LayerNorm()\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " )\n", " )\n", " (5): T5Block(\n", " (layer): ModuleList(\n", " (0): T5LayerSelfAttention(\n", " (SelfAttention): T5Attention(\n", " (q): Linear(in_features=768, out_features=768, bias=False)\n", " (k): Linear(in_features=768, out_features=768, bias=False)\n", " (v): Linear(in_features=768, out_features=768, bias=False)\n", " (o): Linear(in_features=768, out_features=768, bias=False)\n", " )\n", " (layer_norm): T5LayerNorm()\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " (1): T5LayerCrossAttention(\n", " (EncDecAttention): T5Attention(\n", " (q): Linear(in_features=768, out_features=768, bias=False)\n", " (k): Linear(in_features=768, out_features=768, bias=False)\n", " (v): Linear(in_features=768, out_features=768, bias=False)\n", " (o): Linear(in_features=768, out_features=768, bias=False)\n", " )\n", " (layer_norm): T5LayerNorm()\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " (2): T5LayerFF(\n", " (DenseReluDense): T5DenseActDense(\n", " (wi): Linear(in_features=768, out_features=3072, bias=False)\n", " (wo): Linear(in_features=3072, out_features=768, bias=False)\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " (act): ReLU()\n", " )\n", " (layer_norm): T5LayerNorm()\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " )\n", " )\n", " (6): T5Block(\n", " (layer): ModuleList(\n", " (0): T5LayerSelfAttention(\n", " (SelfAttention): T5Attention(\n", " (q): Linear(in_features=768, out_features=768, bias=False)\n", " (k): Linear(in_features=768, out_features=768, bias=False)\n", " (v): Linear(in_features=768, out_features=768, bias=False)\n", " (o): Linear(in_features=768, out_features=768, bias=False)\n", " )\n", " (layer_norm): T5LayerNorm()\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " (1): T5LayerCrossAttention(\n", " (EncDecAttention): T5Attention(\n", " (q): Linear(in_features=768, out_features=768, bias=False)\n", " (k): Linear(in_features=768, out_features=768, bias=False)\n", " (v): Linear(in_features=768, out_features=768, bias=False)\n", " (o): Linear(in_features=768, out_features=768, 
bias=False)\n", " )\n", " (layer_norm): T5LayerNorm()\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " (2): T5LayerFF(\n", " (DenseReluDense): T5DenseActDense(\n", " (wi): Linear(in_features=768, out_features=3072, bias=False)\n", " (wo): Linear(in_features=3072, out_features=768, bias=False)\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " (act): ReLU()\n", " )\n", " (layer_norm): T5LayerNorm()\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " )\n", " )\n", " (7): T5Block(\n", " (layer): ModuleList(\n", " (0): T5LayerSelfAttention(\n", " (SelfAttention): T5Attention(\n", " (q): Linear(in_features=768, out_features=768, bias=False)\n", " (k): Linear(in_features=768, out_features=768, bias=False)\n", " (v): Linear(in_features=768, out_features=768, bias=False)\n", " (o): Linear(in_features=768, out_features=768, bias=False)\n", " )\n", " (layer_norm): T5LayerNorm()\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " (1): T5LayerCrossAttention(\n", " (EncDecAttention): T5Attention(\n", " (q): Linear(in_features=768, out_features=768, bias=False)\n", " (k): Linear(in_features=768, out_features=768, bias=False)\n", " (v): Linear(in_features=768, out_features=768, bias=False)\n", " (o): Linear(in_features=768, out_features=768, bias=False)\n", " )\n", " (layer_norm): T5LayerNorm()\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " (2): T5LayerFF(\n", " (DenseReluDense): T5DenseActDense(\n", " (wi): Linear(in_features=768, out_features=3072, bias=False)\n", " (wo): Linear(in_features=3072, out_features=768, bias=False)\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " (act): ReLU()\n", " )\n", " (layer_norm): T5LayerNorm()\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " )\n", " )\n", " (8): T5Block(\n", " (layer): ModuleList(\n", " (0): T5LayerSelfAttention(\n", " (SelfAttention): T5Attention(\n", " (q): Linear(in_features=768, out_features=768, bias=False)\n", " (k): Linear(in_features=768, out_features=768, bias=False)\n", " (v): Linear(in_features=768, out_features=768, bias=False)\n", " (o): Linear(in_features=768, out_features=768, bias=False)\n", " )\n", " (layer_norm): T5LayerNorm()\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " (1): T5LayerCrossAttention(\n", " (EncDecAttention): T5Attention(\n", " (q): Linear(in_features=768, out_features=768, bias=False)\n", " (k): Linear(in_features=768, out_features=768, bias=False)\n", " (v): Linear(in_features=768, out_features=768, bias=False)\n", " (o): Linear(in_features=768, out_features=768, bias=False)\n", " )\n", " (layer_norm): T5LayerNorm()\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " (2): T5LayerFF(\n", " (DenseReluDense): T5DenseActDense(\n", " (wi): Linear(in_features=768, out_features=3072, bias=False)\n", " (wo): Linear(in_features=3072, out_features=768, bias=False)\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " (act): ReLU()\n", " )\n", " (layer_norm): T5LayerNorm()\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " )\n", " )\n", " (9): T5Block(\n", " (layer): ModuleList(\n", " (0): T5LayerSelfAttention(\n", " (SelfAttention): T5Attention(\n", " (q): Linear(in_features=768, out_features=768, bias=False)\n", " (k): Linear(in_features=768, out_features=768, bias=False)\n", " (v): Linear(in_features=768, out_features=768, bias=False)\n", " (o): Linear(in_features=768, out_features=768, bias=False)\n", " )\n", " (layer_norm): T5LayerNorm()\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " (1): T5LayerCrossAttention(\n", 
" (EncDecAttention): T5Attention(\n", " (q): Linear(in_features=768, out_features=768, bias=False)\n", " (k): Linear(in_features=768, out_features=768, bias=False)\n", " (v): Linear(in_features=768, out_features=768, bias=False)\n", " (o): Linear(in_features=768, out_features=768, bias=False)\n", " )\n", " (layer_norm): T5LayerNorm()\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " (2): T5LayerFF(\n", " (DenseReluDense): T5DenseActDense(\n", " (wi): Linear(in_features=768, out_features=3072, bias=False)\n", " (wo): Linear(in_features=3072, out_features=768, bias=False)\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " (act): ReLU()\n", " )\n", " (layer_norm): T5LayerNorm()\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " )\n", " )\n", " (10): T5Block(\n", " (layer): ModuleList(\n", " (0): T5LayerSelfAttention(\n", " (SelfAttention): T5Attention(\n", " (q): Linear(in_features=768, out_features=768, bias=False)\n", " (k): Linear(in_features=768, out_features=768, bias=False)\n", " (v): Linear(in_features=768, out_features=768, bias=False)\n", " (o): Linear(in_features=768, out_features=768, bias=False)\n", " )\n", " (layer_norm): T5LayerNorm()\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " (1): T5LayerCrossAttention(\n", " (EncDecAttention): T5Attention(\n", " (q): Linear(in_features=768, out_features=768, bias=False)\n", " (k): Linear(in_features=768, out_features=768, bias=False)\n", " (v): Linear(in_features=768, out_features=768, bias=False)\n", " (o): Linear(in_features=768, out_features=768, bias=False)\n", " )\n", " (layer_norm): T5LayerNorm()\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " (2): T5LayerFF(\n", " (DenseReluDense): T5DenseActDense(\n", " (wi): Linear(in_features=768, out_features=3072, bias=False)\n", " (wo): Linear(in_features=3072, out_features=768, bias=False)\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " (act): ReLU()\n", " )\n", " (layer_norm): T5LayerNorm()\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " )\n", " )\n", " (11): T5Block(\n", " (layer): ModuleList(\n", " (0): T5LayerSelfAttention(\n", " (SelfAttention): T5Attention(\n", " (q): Linear(in_features=768, out_features=768, bias=False)\n", " (k): Linear(in_features=768, out_features=768, bias=False)\n", " (v): Linear(in_features=768, out_features=768, bias=False)\n", " (o): Linear(in_features=768, out_features=768, bias=False)\n", " )\n", " (layer_norm): T5LayerNorm()\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " (1): T5LayerCrossAttention(\n", " (EncDecAttention): T5Attention(\n", " (q): Linear(in_features=768, out_features=768, bias=False)\n", " (k): Linear(in_features=768, out_features=768, bias=False)\n", " (v): Linear(in_features=768, out_features=768, bias=False)\n", " (o): Linear(in_features=768, out_features=768, bias=False)\n", " )\n", " (layer_norm): T5LayerNorm()\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " (2): T5LayerFF(\n", " (DenseReluDense): T5DenseActDense(\n", " (wi): Linear(in_features=768, out_features=3072, bias=False)\n", " (wo): Linear(in_features=3072, out_features=768, bias=False)\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " (act): ReLU()\n", " )\n", " (layer_norm): T5LayerNorm()\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " )\n", " )\n", " )\n", " (final_layer_norm): T5LayerNorm()\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " (lm_head): T5ClassificationHead(\n", " (dense_in): Linear(in_features=768, out_features=768, bias=True)\n", " 
(dense): Linear(in_features=768, out_features=768, bias=True)\n", " (dense_out): Linear(in_features=768, out_features=4, bias=True)\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " )\n", ")\n", "Running tokenizer on dataset: 0%| | 0/120 [00:00\n", " main()\n", " File \"/home/jacob/code/university/uczenie_glebokie_w_przetwarzaniu_tekstu/run_glue.py\", line 533, in main\n", " raise ValueError(\"--do_predict requires a test dataset\")\n", "ValueError: --do_predict requires a test dataset\n" ] } ], "source": [ "!python run_glue.py \\\n", " --cache_dir .cache_training \\\n", " --model_name_or_path t5-base \\\n", " --custom_model t5_custom \\\n", " --train_file data/train.json \\\n", " --validation_file data/valid.json \\\n", " --test_file data/test.json \\\n", " --per_device_train_batch_size 8 \\\n", " --per_device_eval_batch_size 8 \\\n", " --do_train \\\n", " --do_eval \\\n", " --max_seq_length 128 \\\n", " --learning_rate 2e-5 \\\n", " --max_eval_samples 2000 \\\n", " --max_steps 2500 \\\n", " --num_train_epochs 1 \\\n", " --save_strategy steps \\\n", " --save_steps 250 \\\n", " --save_total_limit 5 \\\n", " --logging_strategy steps \\\n", " --logging_steps 100 \\\n", " --eval_steps 250 \\\n", " --evaluation_strategy steps \\\n", " --metric_for_best_model accuracy \\\n", " --greater_is_better True \\\n", " --load_best_model_at_end True \\\n", " --output_dir out/t5" ] }, { "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ "## Evaluation" ] }, { "cell_type": "code", "execution_count": 17, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "02/16/2023 16:52:57 - WARNING - __main__ - Process rank: -1, device: cuda:0, n_gpu: 1distributed training: False, 16-bits training: False\n", "02/16/2023 16:52:57 - INFO - __main__ - Training/evaluation parameters TrainingArguments(\n", "_n_gpu=1,\n", "adafactor=False,\n", "adam_beta1=0.9,\n", "adam_beta2=0.999,\n", "adam_epsilon=1e-08,\n", "auto_find_batch_size=False,\n", "bf16=False,\n", "bf16_full_eval=False,\n", "data_seed=None,\n", "dataloader_drop_last=False,\n", "dataloader_num_workers=0,\n", "dataloader_pin_memory=True,\n", "ddp_bucket_cap_mb=None,\n", "ddp_find_unused_parameters=None,\n", "ddp_timeout=1800,\n", "debug=[],\n", "deepspeed=None,\n", "disable_tqdm=False,\n", "do_eval=True,\n", "do_predict=True,\n", "do_train=False,\n", "eval_accumulation_steps=None,\n", "eval_delay=0,\n", "eval_steps=250,\n", "evaluation_strategy=steps,\n", "fp16=False,\n", "fp16_backend=auto,\n", "fp16_full_eval=False,\n", "fp16_opt_level=O1,\n", "fsdp=[],\n", "fsdp_min_num_params=0,\n", "fsdp_transformer_layer_cls_to_wrap=None,\n", "full_determinism=False,\n", "gradient_accumulation_steps=1,\n", "gradient_checkpointing=False,\n", "greater_is_better=True,\n", "group_by_length=False,\n", "half_precision_backend=auto,\n", "hub_model_id=None,\n", "hub_private_repo=False,\n", "hub_strategy=every_save,\n", "hub_token=,\n", "ignore_data_skip=False,\n", "include_inputs_for_metrics=False,\n", "jit_mode_eval=False,\n", "label_names=None,\n", "label_smoothing_factor=0.0,\n", "learning_rate=2e-05,\n", "length_column_name=length,\n", "load_best_model_at_end=True,\n", "local_rank=-1,\n", "log_level=passive,\n", "log_level_replica=passive,\n", "log_on_each_node=True,\n", "logging_dir=out/t5_results/runs/Feb16_16-52-56_DESKTOP-R7JO8BQ,\n", "logging_first_step=False,\n", "logging_nan_inf_filter=True,\n", "logging_steps=100,\n", "logging_strategy=steps,\n", "lr_scheduler_type=linear,\n", "max_grad_norm=1.0,\n", 
"max_steps=2500,\n", "metric_for_best_model=accuracy,\n", "mp_parameters=,\n", "no_cuda=False,\n", "num_train_epochs=1.0,\n", "optim=adamw_hf,\n", "optim_args=None,\n", "output_dir=out/t5_results,\n", "overwrite_output_dir=False,\n", "past_index=-1,\n", "per_device_eval_batch_size=8,\n", "per_device_train_batch_size=8,\n", "prediction_loss_only=False,\n", "push_to_hub=False,\n", "push_to_hub_model_id=None,\n", "push_to_hub_organization=None,\n", "push_to_hub_token=,\n", "ray_scope=last,\n", "remove_unused_columns=True,\n", "report_to=[],\n", "resume_from_checkpoint=None,\n", "run_name=out/t5_results,\n", "save_on_each_node=False,\n", "save_steps=250,\n", "save_strategy=steps,\n", "save_total_limit=5,\n", "seed=42,\n", "sharded_ddp=[],\n", "skip_memory_metrics=True,\n", "tf32=None,\n", "torch_compile=False,\n", "torch_compile_backend=None,\n", "torch_compile_mode=None,\n", "torchdynamo=None,\n", "tpu_metrics_debug=False,\n", "tpu_num_cores=None,\n", "use_ipex=False,\n", "use_legacy_prediction_loop=False,\n", "use_mps_device=False,\n", "warmup_ratio=0.0,\n", "warmup_steps=0,\n", "weight_decay=0.0,\n", "xpu_backend=None,\n", ")\n", "02/16/2023 16:52:57 - INFO - __main__ - load a local file for train: data/train.json\n", "02/16/2023 16:52:57 - INFO - __main__ - load a local file for validation: data/valid.json\n", "02/16/2023 16:52:57 - INFO - __main__ - load a local file for test: data/test.json\n", "02/16/2023 16:52:58 - WARNING - datasets.builder - Using custom data configuration default-f6e8039906850c57\n", "02/16/2023 16:52:58 - INFO - datasets.info - Loading Dataset Infos from /home/jacob/anaconda3/envs/ugp/lib/python3.10/site-packages/datasets/packaged_modules/json\n", "02/16/2023 16:52:58 - INFO - datasets.builder - Overwrite dataset info from restored data version.\n", "02/16/2023 16:52:58 - INFO - datasets.info - Loading Dataset info from .cache_training/json/default-f6e8039906850c57/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51\n", "02/16/2023 16:52:58 - WARNING - datasets.builder - Found cached dataset json (/home/jacob/code/university/uczenie_glebokie_w_przetwarzaniu_tekstu/.cache_training/json/default-f6e8039906850c57/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51)\n", "02/16/2023 16:52:58 - INFO - datasets.info - Loading Dataset info from /home/jacob/code/university/uczenie_glebokie_w_przetwarzaniu_tekstu/.cache_training/json/default-f6e8039906850c57/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51\n", "100%|████████████████████████████████████████████| 3/3 [00:00<00:00, 769.41it/s]\n", "[INFO|configuration_utils.py:658] 2023-02-16 16:52:58,326 >> loading configuration file out/t5/config.json\n", "[INFO|configuration_utils.py:712] 2023-02-16 16:52:58,327 >> Model config T5Config {\n", " \"_name_or_path\": \"out/t5\",\n", " \"architectures\": [\n", " \"T5ForClassification\"\n", " ],\n", " \"d_ff\": 3072,\n", " \"d_kv\": 64,\n", " \"d_model\": 768,\n", " \"decoder_start_token_id\": 0,\n", " \"dense_act_fn\": \"relu\",\n", " \"dropout_rate\": 0.1,\n", " \"eos_token_id\": 1,\n", " \"feed_forward_proj\": \"relu\",\n", " \"id2label\": {\n", " \"0\": 0,\n", " \"1\": 1,\n", " \"2\": 2,\n", " \"3\": 3\n", " },\n", " \"initializer_factor\": 1.0,\n", " \"is_encoder_decoder\": true,\n", " \"is_gated_act\": false,\n", " \"label2id\": {\n", " \"0\": 0,\n", " \"1\": 1,\n", " \"2\": 2,\n", " \"3\": 3\n", " },\n", " \"layer_norm_epsilon\": 1e-06,\n", " \"model_type\": \"t5\",\n", " \"n_positions\": 512,\n", " 
\"num_decoder_layers\": 12,\n", " \"num_heads\": 12,\n", " \"num_layers\": 12,\n", " \"output_past\": true,\n", " \"pad_token_id\": 0,\n", " \"relative_attention_max_distance\": 128,\n", " \"relative_attention_num_buckets\": 32,\n", " \"task_specific_params\": {\n", " \"summarization\": {\n", " \"early_stopping\": true,\n", " \"length_penalty\": 2.0,\n", " \"max_length\": 200,\n", " \"min_length\": 30,\n", " \"no_repeat_ngram_size\": 3,\n", " \"num_beams\": 4,\n", " \"prefix\": \"summarize: \"\n", " },\n", " \"translation_en_to_de\": {\n", " \"early_stopping\": true,\n", " \"max_length\": 300,\n", " \"num_beams\": 4,\n", " \"prefix\": \"translate English to German: \"\n", " },\n", " \"translation_en_to_fr\": {\n", " \"early_stopping\": true,\n", " \"max_length\": 300,\n", " \"num_beams\": 4,\n", " \"prefix\": \"translate English to French: \"\n", " },\n", " \"translation_en_to_ro\": {\n", " \"early_stopping\": true,\n", " \"max_length\": 300,\n", " \"num_beams\": 4,\n", " \"prefix\": \"translate English to Romanian: \"\n", " }\n", " },\n", " \"torch_dtype\": \"float32\",\n", " \"transformers_version\": \"4.26.1\",\n", " \"use_cache\": true,\n", " \"use_hidden_states\": false,\n", " \"vocab_size\": 32128\n", "}\n", "\n", "[INFO|tokenization_utils_base.py:1800] 2023-02-16 16:52:58,328 >> loading file spiece.model\n", "[INFO|tokenization_utils_base.py:1800] 2023-02-16 16:52:58,328 >> loading file tokenizer.json\n", "[INFO|tokenization_utils_base.py:1800] 2023-02-16 16:52:58,328 >> loading file added_tokens.json\n", "[INFO|tokenization_utils_base.py:1800] 2023-02-16 16:52:58,328 >> loading file special_tokens_map.json\n", "[INFO|tokenization_utils_base.py:1800] 2023-02-16 16:52:58,328 >> loading file tokenizer_config.json\n", "02/16/2023 16:52:58 - INFO - __main__ - Using hidden states in model: False\n", "-------------------------------------------------------- Using hidden: False\n", "02/16/2023 16:52:58 - INFO - __main__ - Using implementation from class: T5ForClassification\n", "[INFO|modeling_utils.py:2272] 2023-02-16 16:52:58,375 >> loading weights file out/t5/pytorch_model.bin\n", "[INFO|modeling_utils.py:2857] 2023-02-16 16:53:00,690 >> All model checkpoint weights were used when initializing T5ForClassification.\n", "\n", "[INFO|modeling_utils.py:2865] 2023-02-16 16:53:00,690 >> All the weights of T5ForClassification were initialized from the model checkpoint at out/t5.\n", "If your task is similar to the task the model of the checkpoint was trained on, you can already use T5ForClassification for predictions without further training.\n", "T5ForClassification(\n", " (shared): Embedding(32128, 768)\n", " (encoder): T5Stack(\n", " (embed_tokens): Embedding(32128, 768)\n", " (block): ModuleList(\n", " (0): T5Block(\n", " (layer): ModuleList(\n", " (0): T5LayerSelfAttention(\n", " (SelfAttention): T5Attention(\n", " (q): Linear(in_features=768, out_features=768, bias=False)\n", " (k): Linear(in_features=768, out_features=768, bias=False)\n", " (v): Linear(in_features=768, out_features=768, bias=False)\n", " (o): Linear(in_features=768, out_features=768, bias=False)\n", " (relative_attention_bias): Embedding(32, 12)\n", " )\n", " (layer_norm): T5LayerNorm()\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " (1): T5LayerFF(\n", " (DenseReluDense): T5DenseActDense(\n", " (wi): Linear(in_features=768, out_features=3072, bias=False)\n", " (wo): Linear(in_features=3072, out_features=768, bias=False)\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " (act): ReLU()\n", " )\n", " 
(layer_norm): T5LayerNorm()\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " )\n", " )\n", " (1): T5Block(\n", " (layer): ModuleList(\n", " (0): T5LayerSelfAttention(\n", " (SelfAttention): T5Attention(\n", " (q): Linear(in_features=768, out_features=768, bias=False)\n", " (k): Linear(in_features=768, out_features=768, bias=False)\n", " (v): Linear(in_features=768, out_features=768, bias=False)\n", " (o): Linear(in_features=768, out_features=768, bias=False)\n", " )\n", " (layer_norm): T5LayerNorm()\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " (1): T5LayerFF(\n", " (DenseReluDense): T5DenseActDense(\n", " (wi): Linear(in_features=768, out_features=3072, bias=False)\n", " (wo): Linear(in_features=3072, out_features=768, bias=False)\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " (act): ReLU()\n", " )\n", " (layer_norm): T5LayerNorm()\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " )\n", " )\n", " (2): T5Block(\n", " (layer): ModuleList(\n", " (0): T5LayerSelfAttention(\n", " (SelfAttention): T5Attention(\n", " (q): Linear(in_features=768, out_features=768, bias=False)\n", " (k): Linear(in_features=768, out_features=768, bias=False)\n", " (v): Linear(in_features=768, out_features=768, bias=False)\n", " (o): Linear(in_features=768, out_features=768, bias=False)\n", " )\n", " (layer_norm): T5LayerNorm()\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " (1): T5LayerFF(\n", " (DenseReluDense): T5DenseActDense(\n", " (wi): Linear(in_features=768, out_features=3072, bias=False)\n", " (wo): Linear(in_features=3072, out_features=768, bias=False)\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " (act): ReLU()\n", " )\n", " (layer_norm): T5LayerNorm()\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " )\n", " )\n", " (3): T5Block(\n", " (layer): ModuleList(\n", " (0): T5LayerSelfAttention(\n", " (SelfAttention): T5Attention(\n", " (q): Linear(in_features=768, out_features=768, bias=False)\n", " (k): Linear(in_features=768, out_features=768, bias=False)\n", " (v): Linear(in_features=768, out_features=768, bias=False)\n", " (o): Linear(in_features=768, out_features=768, bias=False)\n", " )\n", " (layer_norm): T5LayerNorm()\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " (1): T5LayerFF(\n", " (DenseReluDense): T5DenseActDense(\n", " (wi): Linear(in_features=768, out_features=3072, bias=False)\n", " (wo): Linear(in_features=3072, out_features=768, bias=False)\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " (act): ReLU()\n", " )\n", " (layer_norm): T5LayerNorm()\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " )\n", " )\n", " (4): T5Block(\n", " (layer): ModuleList(\n", " (0): T5LayerSelfAttention(\n", " (SelfAttention): T5Attention(\n", " (q): Linear(in_features=768, out_features=768, bias=False)\n", " (k): Linear(in_features=768, out_features=768, bias=False)\n", " (v): Linear(in_features=768, out_features=768, bias=False)\n", " (o): Linear(in_features=768, out_features=768, bias=False)\n", " )\n", " (layer_norm): T5LayerNorm()\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " (1): T5LayerFF(\n", " (DenseReluDense): T5DenseActDense(\n", " (wi): Linear(in_features=768, out_features=3072, bias=False)\n", " (wo): Linear(in_features=3072, out_features=768, bias=False)\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " (act): ReLU()\n", " )\n", " (layer_norm): T5LayerNorm()\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " )\n", " )\n", " (5): T5Block(\n", " (layer): ModuleList(\n", " 
(0): T5LayerSelfAttention(\n", " (SelfAttention): T5Attention(\n", " (q): Linear(in_features=768, out_features=768, bias=False)\n", " (k): Linear(in_features=768, out_features=768, bias=False)\n", " (v): Linear(in_features=768, out_features=768, bias=False)\n", " (o): Linear(in_features=768, out_features=768, bias=False)\n", " )\n", " (layer_norm): T5LayerNorm()\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " (1): T5LayerFF(\n", " (DenseReluDense): T5DenseActDense(\n", " (wi): Linear(in_features=768, out_features=3072, bias=False)\n", " (wo): Linear(in_features=3072, out_features=768, bias=False)\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " (act): ReLU()\n", " )\n", " (layer_norm): T5LayerNorm()\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " )\n", " )\n", " (6): T5Block(\n", " (layer): ModuleList(\n", " (0): T5LayerSelfAttention(\n", " (SelfAttention): T5Attention(\n", " (q): Linear(in_features=768, out_features=768, bias=False)\n", " (k): Linear(in_features=768, out_features=768, bias=False)\n", " (v): Linear(in_features=768, out_features=768, bias=False)\n", " (o): Linear(in_features=768, out_features=768, bias=False)\n", " )\n", " (layer_norm): T5LayerNorm()\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " (1): T5LayerFF(\n", " (DenseReluDense): T5DenseActDense(\n", " (wi): Linear(in_features=768, out_features=3072, bias=False)\n", " (wo): Linear(in_features=3072, out_features=768, bias=False)\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " (act): ReLU()\n", " )\n", " (layer_norm): T5LayerNorm()\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " )\n", " )\n", " (7): T5Block(\n", " (layer): ModuleList(\n", " (0): T5LayerSelfAttention(\n", " (SelfAttention): T5Attention(\n", " (q): Linear(in_features=768, out_features=768, bias=False)\n", " (k): Linear(in_features=768, out_features=768, bias=False)\n", " (v): Linear(in_features=768, out_features=768, bias=False)\n", " (o): Linear(in_features=768, out_features=768, bias=False)\n", " )\n", " (layer_norm): T5LayerNorm()\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " (1): T5LayerFF(\n", " (DenseReluDense): T5DenseActDense(\n", " (wi): Linear(in_features=768, out_features=3072, bias=False)\n", " (wo): Linear(in_features=3072, out_features=768, bias=False)\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " (act): ReLU()\n", " )\n", " (layer_norm): T5LayerNorm()\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " )\n", " )\n", " (8): T5Block(\n", " (layer): ModuleList(\n", " (0): T5LayerSelfAttention(\n", " (SelfAttention): T5Attention(\n", " (q): Linear(in_features=768, out_features=768, bias=False)\n", " (k): Linear(in_features=768, out_features=768, bias=False)\n", " (v): Linear(in_features=768, out_features=768, bias=False)\n", " (o): Linear(in_features=768, out_features=768, bias=False)\n", " )\n", " (layer_norm): T5LayerNorm()\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " (1): T5LayerFF(\n", " (DenseReluDense): T5DenseActDense(\n", " (wi): Linear(in_features=768, out_features=3072, bias=False)\n", " (wo): Linear(in_features=3072, out_features=768, bias=False)\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " (act): ReLU()\n", " )\n", " (layer_norm): T5LayerNorm()\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " )\n", " )\n", " (9): T5Block(\n", " (layer): ModuleList(\n", " (0): T5LayerSelfAttention(\n", " (SelfAttention): T5Attention(\n", " (q): Linear(in_features=768, out_features=768, bias=False)\n", " (k): 
Linear(in_features=768, out_features=768, bias=False)\n", " (v): Linear(in_features=768, out_features=768, bias=False)\n", " (o): Linear(in_features=768, out_features=768, bias=False)\n", " )\n", " (layer_norm): T5LayerNorm()\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " (1): T5LayerFF(\n", " (DenseReluDense): T5DenseActDense(\n", " (wi): Linear(in_features=768, out_features=3072, bias=False)\n", " (wo): Linear(in_features=3072, out_features=768, bias=False)\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " (act): ReLU()\n", " )\n", " (layer_norm): T5LayerNorm()\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " )\n", " )\n", " (10): T5Block(\n", " (layer): ModuleList(\n", " (0): T5LayerSelfAttention(\n", " (SelfAttention): T5Attention(\n", " (q): Linear(in_features=768, out_features=768, bias=False)\n", " (k): Linear(in_features=768, out_features=768, bias=False)\n", " (v): Linear(in_features=768, out_features=768, bias=False)\n", " (o): Linear(in_features=768, out_features=768, bias=False)\n", " )\n", " (layer_norm): T5LayerNorm()\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " (1): T5LayerFF(\n", " (DenseReluDense): T5DenseActDense(\n", " (wi): Linear(in_features=768, out_features=3072, bias=False)\n", " (wo): Linear(in_features=3072, out_features=768, bias=False)\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " (act): ReLU()\n", " )\n", " (layer_norm): T5LayerNorm()\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " )\n", " )\n", " (11): T5Block(\n", " (layer): ModuleList(\n", " (0): T5LayerSelfAttention(\n", " (SelfAttention): T5Attention(\n", " (q): Linear(in_features=768, out_features=768, bias=False)\n", " (k): Linear(in_features=768, out_features=768, bias=False)\n", " (v): Linear(in_features=768, out_features=768, bias=False)\n", " (o): Linear(in_features=768, out_features=768, bias=False)\n", " )\n", " (layer_norm): T5LayerNorm()\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " (1): T5LayerFF(\n", " (DenseReluDense): T5DenseActDense(\n", " (wi): Linear(in_features=768, out_features=3072, bias=False)\n", " (wo): Linear(in_features=3072, out_features=768, bias=False)\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " (act): ReLU()\n", " )\n", " (layer_norm): T5LayerNorm()\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " )\n", " )\n", " )\n", " (final_layer_norm): T5LayerNorm()\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " (decoder): T5Stack(\n", " (embed_tokens): Embedding(32128, 768)\n", " (block): ModuleList(\n", " (0): T5Block(\n", " (layer): ModuleList(\n", " (0): T5LayerSelfAttention(\n", " (SelfAttention): T5Attention(\n", " (q): Linear(in_features=768, out_features=768, bias=False)\n", " (k): Linear(in_features=768, out_features=768, bias=False)\n", " (v): Linear(in_features=768, out_features=768, bias=False)\n", " (o): Linear(in_features=768, out_features=768, bias=False)\n", " (relative_attention_bias): Embedding(32, 12)\n", " )\n", " (layer_norm): T5LayerNorm()\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " (1): T5LayerCrossAttention(\n", " (EncDecAttention): T5Attention(\n", " (q): Linear(in_features=768, out_features=768, bias=False)\n", " (k): Linear(in_features=768, out_features=768, bias=False)\n", " (v): Linear(in_features=768, out_features=768, bias=False)\n", " (o): Linear(in_features=768, out_features=768, bias=False)\n", " )\n", " (layer_norm): T5LayerNorm()\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " (2): T5LayerFF(\n", " 
(DenseReluDense): T5DenseActDense(\n", " (wi): Linear(in_features=768, out_features=3072, bias=False)\n", " (wo): Linear(in_features=3072, out_features=768, bias=False)\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " (act): ReLU()\n", " )\n", " (layer_norm): T5LayerNorm()\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " )\n", " )\n", " (1): T5Block(\n", " (layer): ModuleList(\n", " (0): T5LayerSelfAttention(\n", " (SelfAttention): T5Attention(\n", " (q): Linear(in_features=768, out_features=768, bias=False)\n", " (k): Linear(in_features=768, out_features=768, bias=False)\n", " (v): Linear(in_features=768, out_features=768, bias=False)\n", " (o): Linear(in_features=768, out_features=768, bias=False)\n", " )\n", " (layer_norm): T5LayerNorm()\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " (1): T5LayerCrossAttention(\n", " (EncDecAttention): T5Attention(\n", " (q): Linear(in_features=768, out_features=768, bias=False)\n", " (k): Linear(in_features=768, out_features=768, bias=False)\n", " (v): Linear(in_features=768, out_features=768, bias=False)\n", " (o): Linear(in_features=768, out_features=768, bias=False)\n", " )\n", " (layer_norm): T5LayerNorm()\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " (2): T5LayerFF(\n", " (DenseReluDense): T5DenseActDense(\n", " (wi): Linear(in_features=768, out_features=3072, bias=False)\n", " (wo): Linear(in_features=3072, out_features=768, bias=False)\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " (act): ReLU()\n", " )\n", " (layer_norm): T5LayerNorm()\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " )\n", " )\n", " (2): T5Block(\n", " (layer): ModuleList(\n", " (0): T5LayerSelfAttention(\n", " (SelfAttention): T5Attention(\n", " (q): Linear(in_features=768, out_features=768, bias=False)\n", " (k): Linear(in_features=768, out_features=768, bias=False)\n", " (v): Linear(in_features=768, out_features=768, bias=False)\n", " (o): Linear(in_features=768, out_features=768, bias=False)\n", " )\n", " (layer_norm): T5LayerNorm()\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " (1): T5LayerCrossAttention(\n", " (EncDecAttention): T5Attention(\n", " (q): Linear(in_features=768, out_features=768, bias=False)\n", " (k): Linear(in_features=768, out_features=768, bias=False)\n", " (v): Linear(in_features=768, out_features=768, bias=False)\n", " (o): Linear(in_features=768, out_features=768, bias=False)\n", " )\n", " (layer_norm): T5LayerNorm()\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " (2): T5LayerFF(\n", " (DenseReluDense): T5DenseActDense(\n", " (wi): Linear(in_features=768, out_features=3072, bias=False)\n", " (wo): Linear(in_features=3072, out_features=768, bias=False)\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " (act): ReLU()\n", " )\n", " (layer_norm): T5LayerNorm()\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " )\n", " )\n", " (3): T5Block(\n", " (layer): ModuleList(\n", " (0): T5LayerSelfAttention(\n", " (SelfAttention): T5Attention(\n", " (q): Linear(in_features=768, out_features=768, bias=False)\n", " (k): Linear(in_features=768, out_features=768, bias=False)\n", " (v): Linear(in_features=768, out_features=768, bias=False)\n", " (o): Linear(in_features=768, out_features=768, bias=False)\n", " )\n", " (layer_norm): T5LayerNorm()\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " (1): T5LayerCrossAttention(\n", " (EncDecAttention): T5Attention(\n", " (q): Linear(in_features=768, out_features=768, bias=False)\n", " (k): Linear(in_features=768, 
out_features=768, bias=False)\n", " (v): Linear(in_features=768, out_features=768, bias=False)\n", " (o): Linear(in_features=768, out_features=768, bias=False)\n", " )\n", " (layer_norm): T5LayerNorm()\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " (2): T5LayerFF(\n", " (DenseReluDense): T5DenseActDense(\n", " (wi): Linear(in_features=768, out_features=3072, bias=False)\n", " (wo): Linear(in_features=3072, out_features=768, bias=False)\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " (act): ReLU()\n", " )\n", " (layer_norm): T5LayerNorm()\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " )\n", " )\n", " (4): T5Block(\n", " (layer): ModuleList(\n", " (0): T5LayerSelfAttention(\n", " (SelfAttention): T5Attention(\n", " (q): Linear(in_features=768, out_features=768, bias=False)\n", " (k): Linear(in_features=768, out_features=768, bias=False)\n", " (v): Linear(in_features=768, out_features=768, bias=False)\n", " (o): Linear(in_features=768, out_features=768, bias=False)\n", " )\n", " (layer_norm): T5LayerNorm()\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " (1): T5LayerCrossAttention(\n", " (EncDecAttention): T5Attention(\n", " (q): Linear(in_features=768, out_features=768, bias=False)\n", " (k): Linear(in_features=768, out_features=768, bias=False)\n", " (v): Linear(in_features=768, out_features=768, bias=False)\n", " (o): Linear(in_features=768, out_features=768, bias=False)\n", " )\n", " (layer_norm): T5LayerNorm()\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " (2): T5LayerFF(\n", " (DenseReluDense): T5DenseActDense(\n", " (wi): Linear(in_features=768, out_features=3072, bias=False)\n", " (wo): Linear(in_features=3072, out_features=768, bias=False)\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " (act): ReLU()\n", " )\n", " (layer_norm): T5LayerNorm()\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " )\n", " )\n", " (5): T5Block(\n", " (layer): ModuleList(\n", " (0): T5LayerSelfAttention(\n", " (SelfAttention): T5Attention(\n", " (q): Linear(in_features=768, out_features=768, bias=False)\n", " (k): Linear(in_features=768, out_features=768, bias=False)\n", " (v): Linear(in_features=768, out_features=768, bias=False)\n", " (o): Linear(in_features=768, out_features=768, bias=False)\n", " )\n", " (layer_norm): T5LayerNorm()\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " (1): T5LayerCrossAttention(\n", " (EncDecAttention): T5Attention(\n", " (q): Linear(in_features=768, out_features=768, bias=False)\n", " (k): Linear(in_features=768, out_features=768, bias=False)\n", " (v): Linear(in_features=768, out_features=768, bias=False)\n", " (o): Linear(in_features=768, out_features=768, bias=False)\n", " )\n", " (layer_norm): T5LayerNorm()\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " (2): T5LayerFF(\n", " (DenseReluDense): T5DenseActDense(\n", " (wi): Linear(in_features=768, out_features=3072, bias=False)\n", " (wo): Linear(in_features=3072, out_features=768, bias=False)\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " (act): ReLU()\n", " )\n", " (layer_norm): T5LayerNorm()\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " )\n", " )\n", " (6): T5Block(\n", " (layer): ModuleList(\n", " (0): T5LayerSelfAttention(\n", " (SelfAttention): T5Attention(\n", " (q): Linear(in_features=768, out_features=768, bias=False)\n", " (k): Linear(in_features=768, out_features=768, bias=False)\n", " (v): Linear(in_features=768, out_features=768, bias=False)\n", " (o): Linear(in_features=768, 
out_features=768, bias=False)\n", " )\n", " (layer_norm): T5LayerNorm()\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " (1): T5LayerCrossAttention(\n", " (EncDecAttention): T5Attention(\n", " (q): Linear(in_features=768, out_features=768, bias=False)\n", " (k): Linear(in_features=768, out_features=768, bias=False)\n", " (v): Linear(in_features=768, out_features=768, bias=False)\n", " (o): Linear(in_features=768, out_features=768, bias=False)\n", " )\n", " (layer_norm): T5LayerNorm()\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " (2): T5LayerFF(\n", " (DenseReluDense): T5DenseActDense(\n", " (wi): Linear(in_features=768, out_features=3072, bias=False)\n", " (wo): Linear(in_features=3072, out_features=768, bias=False)\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " (act): ReLU()\n", " )\n", " (layer_norm): T5LayerNorm()\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " )\n", " )\n", " (7): T5Block(\n", " (layer): ModuleList(\n", " (0): T5LayerSelfAttention(\n", " (SelfAttention): T5Attention(\n", " (q): Linear(in_features=768, out_features=768, bias=False)\n", " (k): Linear(in_features=768, out_features=768, bias=False)\n", " (v): Linear(in_features=768, out_features=768, bias=False)\n", " (o): Linear(in_features=768, out_features=768, bias=False)\n", " )\n", " (layer_norm): T5LayerNorm()\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " (1): T5LayerCrossAttention(\n", " (EncDecAttention): T5Attention(\n", " (q): Linear(in_features=768, out_features=768, bias=False)\n", " (k): Linear(in_features=768, out_features=768, bias=False)\n", " (v): Linear(in_features=768, out_features=768, bias=False)\n", " (o): Linear(in_features=768, out_features=768, bias=False)\n", " )\n", " (layer_norm): T5LayerNorm()\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " (2): T5LayerFF(\n", " (DenseReluDense): T5DenseActDense(\n", " (wi): Linear(in_features=768, out_features=3072, bias=False)\n", " (wo): Linear(in_features=3072, out_features=768, bias=False)\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " (act): ReLU()\n", " )\n", " (layer_norm): T5LayerNorm()\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " )\n", " )\n", " (8): T5Block(\n", " (layer): ModuleList(\n", " (0): T5LayerSelfAttention(\n", " (SelfAttention): T5Attention(\n", " (q): Linear(in_features=768, out_features=768, bias=False)\n", " (k): Linear(in_features=768, out_features=768, bias=False)\n", " (v): Linear(in_features=768, out_features=768, bias=False)\n", " (o): Linear(in_features=768, out_features=768, bias=False)\n", " )\n", " (layer_norm): T5LayerNorm()\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " (1): T5LayerCrossAttention(\n", " (EncDecAttention): T5Attention(\n", " (q): Linear(in_features=768, out_features=768, bias=False)\n", " (k): Linear(in_features=768, out_features=768, bias=False)\n", " (v): Linear(in_features=768, out_features=768, bias=False)\n", " (o): Linear(in_features=768, out_features=768, bias=False)\n", " )\n", " (layer_norm): T5LayerNorm()\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " (2): T5LayerFF(\n", " (DenseReluDense): T5DenseActDense(\n", " (wi): Linear(in_features=768, out_features=3072, bias=False)\n", " (wo): Linear(in_features=3072, out_features=768, bias=False)\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " (act): ReLU()\n", " )\n", " (layer_norm): T5LayerNorm()\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " )\n", " )\n", " (9): T5Block(\n", " (layer): ModuleList(\n", " (0): 
T5LayerSelfAttention(\n", " (SelfAttention): T5Attention(\n", " (q): Linear(in_features=768, out_features=768, bias=False)\n", " (k): Linear(in_features=768, out_features=768, bias=False)\n", " (v): Linear(in_features=768, out_features=768, bias=False)\n", " (o): Linear(in_features=768, out_features=768, bias=False)\n", " )\n", " (layer_norm): T5LayerNorm()\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " (1): T5LayerCrossAttention(\n", " (EncDecAttention): T5Attention(\n", " (q): Linear(in_features=768, out_features=768, bias=False)\n", " (k): Linear(in_features=768, out_features=768, bias=False)\n", " (v): Linear(in_features=768, out_features=768, bias=False)\n", " (o): Linear(in_features=768, out_features=768, bias=False)\n", " )\n", " (layer_norm): T5LayerNorm()\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " (2): T5LayerFF(\n", " (DenseReluDense): T5DenseActDense(\n", " (wi): Linear(in_features=768, out_features=3072, bias=False)\n", " (wo): Linear(in_features=3072, out_features=768, bias=False)\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " (act): ReLU()\n", " )\n", " (layer_norm): T5LayerNorm()\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " )\n", " )\n", " (10): T5Block(\n", " (layer): ModuleList(\n", " (0): T5LayerSelfAttention(\n", " (SelfAttention): T5Attention(\n", " (q): Linear(in_features=768, out_features=768, bias=False)\n", " (k): Linear(in_features=768, out_features=768, bias=False)\n", " (v): Linear(in_features=768, out_features=768, bias=False)\n", " (o): Linear(in_features=768, out_features=768, bias=False)\n", " )\n", " (layer_norm): T5LayerNorm()\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " (1): T5LayerCrossAttention(\n", " (EncDecAttention): T5Attention(\n", " (q): Linear(in_features=768, out_features=768, bias=False)\n", " (k): Linear(in_features=768, out_features=768, bias=False)\n", " (v): Linear(in_features=768, out_features=768, bias=False)\n", " (o): Linear(in_features=768, out_features=768, bias=False)\n", " )\n", " (layer_norm): T5LayerNorm()\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " (2): T5LayerFF(\n", " (DenseReluDense): T5DenseActDense(\n", " (wi): Linear(in_features=768, out_features=3072, bias=False)\n", " (wo): Linear(in_features=3072, out_features=768, bias=False)\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " (act): ReLU()\n", " )\n", " (layer_norm): T5LayerNorm()\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " )\n", " )\n", " (11): T5Block(\n", " (layer): ModuleList(\n", " (0): T5LayerSelfAttention(\n", " (SelfAttention): T5Attention(\n", " (q): Linear(in_features=768, out_features=768, bias=False)\n", " (k): Linear(in_features=768, out_features=768, bias=False)\n", " (v): Linear(in_features=768, out_features=768, bias=False)\n", " (o): Linear(in_features=768, out_features=768, bias=False)\n", " )\n", " (layer_norm): T5LayerNorm()\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " (1): T5LayerCrossAttention(\n", " (EncDecAttention): T5Attention(\n", " (q): Linear(in_features=768, out_features=768, bias=False)\n", " (k): Linear(in_features=768, out_features=768, bias=False)\n", " (v): Linear(in_features=768, out_features=768, bias=False)\n", " (o): Linear(in_features=768, out_features=768, bias=False)\n", " )\n", " (layer_norm): T5LayerNorm()\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " (2): T5LayerFF(\n", " (DenseReluDense): T5DenseActDense(\n", " (wi): Linear(in_features=768, out_features=3072, bias=False)\n", " (wo): 
Linear(in_features=3072, out_features=768, bias=False)\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " (act): ReLU()\n", " )\n", " (layer_norm): T5LayerNorm()\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " )\n", " )\n", " )\n", " (final_layer_norm): T5LayerNorm()\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " (lm_head): T5ClassificationHead(\n", " (dense_in): Linear(in_features=768, out_features=768, bias=True)\n", " (dense): Linear(in_features=768, out_features=768, bias=True)\n", " (dense_out): Linear(in_features=768, out_features=4, bias=True)\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " )\n", ")\n", "Running tokenizer on dataset: 0%| | 0/120 [00:00> max_steps is given, it will override any value given in num_train_epochs\n", "02/16/2023 16:53:12 - INFO - __main__ - *** Evaluate ***\n", "[INFO|trainer.py:710] 2023-02-16 16:53:12,739 >> The following columns in the evaluation set don't have a corresponding argument in `T5ForClassification.forward` and have been ignored: text. If text are not expected by `T5ForClassification.forward`, you can safely ignore this message.\n", "[INFO|trainer.py:2964] 2023-02-16 16:53:12,740 >> ***** Running Evaluation *****\n", "[INFO|trainer.py:2966] 2023-02-16 16:53:12,740 >> Num examples = 2000\n", "[INFO|trainer.py:2969] 2023-02-16 16:53:12,740 >> Batch size = 8\n", "100%|█████████████████████████████████████████| 250/250 [00:39<00:00, 6.26it/s]\n", "***** eval metrics *****\n", " eval_accuracy = 0.4675\n", " eval_loss = 1.2139\n", " eval_runtime = 0:00:40.56\n", " eval_samples = 2000\n", " eval_samples_per_second = 49.303\n", " eval_steps_per_second = 6.163\n", "02/16/2023 16:53:53 - INFO - __main__ - *** Predict ***\n", "[INFO|trainer.py:710] 2023-02-16 16:53:53,307 >> The following columns in the test set don't have a corresponding argument in `T5ForClassification.forward` and have been ignored: text. 
If text are not expected by `T5ForClassification.forward`, you can safely ignore this message.\n", "[INFO|trainer.py:2964] 2023-02-16 16:53:53,308 >> ***** Running Prediction *****\n", "[INFO|trainer.py:2966] 2023-02-16 16:53:53,308 >> Num examples = 3800\n", "[INFO|trainer.py:2969] 2023-02-16 16:53:53,308 >> Batch size = 8\n", "100%|█████████████████████████████████████████| 475/475 [01:15<00:00, 6.32it/s]\n", "02/16/2023 16:55:08 - INFO - __main__ - ***** Predict results None *****\n", "[INFO|modelcard.py:449] 2023-02-16 16:55:09,179 >> Dropping the following result as it does not have all the necessary fields:\n", "{'task': {'name': 'Text Classification', 'type': 'text-classification'}}\n" ] } ], "source": [ "!python run_glue.py \\\n", " --cache_dir .cache_training \\\n", " --model_name_or_path out/t5 \\\n", " --custom_model t5_custom \\\n", " --train_file data/train.json \\\n", " --validation_file data/valid.json \\\n", " --test_file data/test.json \\\n", " --per_device_train_batch_size 8 \\\n", " --per_device_eval_batch_size 8 \\\n", " --do_eval \\\n", " --do_predict \\\n", " --max_seq_length 128 \\\n", " --learning_rate 2e-5 \\\n", " --max_eval_samples 2000 \\\n", " --max_steps 2500 \\\n", " --num_train_epochs 1 \\\n", " --save_strategy steps \\\n", " --save_steps 250 \\\n", " --save_total_limit 5 \\\n", " --logging_strategy steps \\\n", " --logging_steps 100 \\\n", " --eval_steps 250 \\\n", " --evaluation_strategy steps \\\n", " --metric_for_best_model accuracy \\\n", " --greater_is_better True \\\n", " --load_best_model_at_end True \\\n", " --output_dir out/t5_results" ] }, { "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ "## Result" ] }, { "cell_type": "code", "execution_count": 18, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\u001b[0;39m0.4675000011920929\u001b[0m\n" ] } ], "source": [ "!cat out/t5_results/eval_results.json | jq .eval_accuracy" ] }, { "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ "# Bart - Zero shot" ] }, { "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ "## Code" ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "8de84b2cf8ed46488a6eb0bb4e0b11ef", "version_major": 2, "version_minor": 0 }, "text/plain": [ "Downloading (…)lve/main/config.json: 0%| | 0.00/1.40k [00:00