UGP/projektV2.ipynb
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"!pip install transformers torch datasets evaluate scikit-learn sacremoses sentencepiece ipywidgets > /dev/null"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"# RoBERTa"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"## Modifications"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"- Custom classification head with a larger intermediate layer (4x the encoder hidden size)\n",
"- Head activation changed from tanh to GELU"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"## Code"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"from torch import nn\n",
"from transformers import RobertaForSequenceClassification, RobertaModel\n",
"\n",
"\n",
"# Simple version #\n",
"\n",
"class RobertaClassificationHeadCustomSimple(nn.Module):\n",
" \"\"\"Head for sentence-level classification tasks.\"\"\"\n",
"\n",
" def __init__(self, config):\n",
" super().__init__()\n",
" hidden_size = config.hidden_size\n",
" self.dense_1 = nn.Linear(hidden_size, 4 * hidden_size)\n",
" self.dense_2 = nn.Linear(4 * hidden_size, hidden_size)\n",
" classifier_dropout = (\n",
" config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob\n",
" )\n",
" self.dropout = nn.Dropout(classifier_dropout)\n",
" self.out_proj = nn.Linear(hidden_size, config.num_labels)\n",
" self.activation = nn.GELU()\n",
"\n",
" def forward(self, features, **kwargs):\n",
" x = features[:, 0, :] # take <s> token (equiv. to [CLS])\n",
"\n",
" x = self.dense_1(x)\n",
" x = self.activation(x)\n",
" x = self.dropout(x)\n",
"\n",
" x = self.dense_2(x)\n",
" x = self.activation(x)\n",
" x = self.dropout(x)\n",
"\n",
" x = self.out_proj(x)\n",
" return x\n",
"\n",
"\n",
"class RobertaForSequenceClassificationCustomSimple(RobertaForSequenceClassification):\n",
" _keys_to_ignore_on_load_missing = [r\"position_ids\"]\n",
"\n",
" def __init__(self, config):\n",
" super().__init__(config)\n",
" self.num_labels = config.num_labels\n",
" self.config = config\n",
"\n",
" self.roberta = RobertaModel(config, add_pooling_layer=False)\n",
" self.classifier = RobertaClassificationHeadCustomSimple(config)\n",
"\n",
" # Initialize weights and apply final processing\n",
" self.post_init()\n"
]
},
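{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"A quick shape check of the custom head in isolation (an illustrative sketch; the batch size, sequence length and `num_labels=4` below are assumed demo values, not taken from the training setup):"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import torch\n",
"from transformers import RobertaConfig\n",
"\n",
"# Illustrative sketch: push random encoder features through the custom head\n",
"# and confirm the logits shape; all sizes here are assumed demo values.\n",
"config = RobertaConfig(num_labels=4)\n",
"head = RobertaClassificationHeadCustomSimple(config)\n",
"\n",
"features = torch.randn(2, 16, config.hidden_size)  # (batch, seq_len, hidden)\n",
"logits = head(features)\n",
"logits.shape  # expected: torch.Size([2, 4])"
]
},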
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"## Model"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassificationCustomSimple: ['roberta.pooler.dense.weight', 'lm_head.dense.bias', 'lm_head.decoder.weight', 'lm_head.bias', 'lm_head.dense.weight', 'roberta.pooler.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias']\n",
"- This IS expected if you are initializing RobertaForSequenceClassificationCustomSimple from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).\n",
"- This IS NOT expected if you are initializing RobertaForSequenceClassificationCustomSimple from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n",
"Some weights of RobertaForSequenceClassificationCustomSimple were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense_1.weight', 'classifier.out_proj.bias', 'classifier.dense_2.bias', 'classifier.dense_2.weight', 'classifier.out_proj.weight', 'classifier.dense_1.bias']\n",
"You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n"
]
},
{
"data": {
"text/plain": [
"RobertaForSequenceClassificationCustomSimple(\n",
" (roberta): RobertaModel(\n",
" (embeddings): RobertaEmbeddings(\n",
" (word_embeddings): Embedding(50265, 768, padding_idx=1)\n",
" (position_embeddings): Embedding(514, 768, padding_idx=1)\n",
" (token_type_embeddings): Embedding(1, 768)\n",
" (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" (encoder): RobertaEncoder(\n",
" (layer): ModuleList(\n",
" (0): RobertaLayer(\n",
" (attention): RobertaAttention(\n",
" (self): RobertaSelfAttention(\n",
" (query): Linear(in_features=768, out_features=768, bias=True)\n",
" (key): Linear(in_features=768, out_features=768, bias=True)\n",
" (value): Linear(in_features=768, out_features=768, bias=True)\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" (output): RobertaSelfOutput(\n",
" (dense): Linear(in_features=768, out_features=768, bias=True)\n",
" (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" )\n",
" (intermediate): RobertaIntermediate(\n",
" (dense): Linear(in_features=768, out_features=3072, bias=True)\n",
" (intermediate_act_fn): GELUActivation()\n",
" )\n",
" (output): RobertaOutput(\n",
" (dense): Linear(in_features=3072, out_features=768, bias=True)\n",
" (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" )\n",
" (1): RobertaLayer(\n",
" (attention): RobertaAttention(\n",
" (self): RobertaSelfAttention(\n",
" (query): Linear(in_features=768, out_features=768, bias=True)\n",
" (key): Linear(in_features=768, out_features=768, bias=True)\n",
" (value): Linear(in_features=768, out_features=768, bias=True)\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" (output): RobertaSelfOutput(\n",
" (dense): Linear(in_features=768, out_features=768, bias=True)\n",
" (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" )\n",
" (intermediate): RobertaIntermediate(\n",
" (dense): Linear(in_features=768, out_features=3072, bias=True)\n",
" (intermediate_act_fn): GELUActivation()\n",
" )\n",
" (output): RobertaOutput(\n",
" (dense): Linear(in_features=3072, out_features=768, bias=True)\n",
" (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" )\n",
" (2): RobertaLayer(\n",
" (attention): RobertaAttention(\n",
" (self): RobertaSelfAttention(\n",
" (query): Linear(in_features=768, out_features=768, bias=True)\n",
" (key): Linear(in_features=768, out_features=768, bias=True)\n",
" (value): Linear(in_features=768, out_features=768, bias=True)\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" (output): RobertaSelfOutput(\n",
" (dense): Linear(in_features=768, out_features=768, bias=True)\n",
" (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" )\n",
" (intermediate): RobertaIntermediate(\n",
" (dense): Linear(in_features=768, out_features=3072, bias=True)\n",
" (intermediate_act_fn): GELUActivation()\n",
" )\n",
" (output): RobertaOutput(\n",
" (dense): Linear(in_features=3072, out_features=768, bias=True)\n",
" (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" )\n",
" (3): RobertaLayer(\n",
" (attention): RobertaAttention(\n",
" (self): RobertaSelfAttention(\n",
" (query): Linear(in_features=768, out_features=768, bias=True)\n",
" (key): Linear(in_features=768, out_features=768, bias=True)\n",
" (value): Linear(in_features=768, out_features=768, bias=True)\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" (output): RobertaSelfOutput(\n",
" (dense): Linear(in_features=768, out_features=768, bias=True)\n",
" (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" )\n",
" (intermediate): RobertaIntermediate(\n",
" (dense): Linear(in_features=768, out_features=3072, bias=True)\n",
" (intermediate_act_fn): GELUActivation()\n",
" )\n",
" (output): RobertaOutput(\n",
" (dense): Linear(in_features=3072, out_features=768, bias=True)\n",
" (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" )\n",
" (4): RobertaLayer(\n",
" (attention): RobertaAttention(\n",
" (self): RobertaSelfAttention(\n",
" (query): Linear(in_features=768, out_features=768, bias=True)\n",
" (key): Linear(in_features=768, out_features=768, bias=True)\n",
" (value): Linear(in_features=768, out_features=768, bias=True)\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" (output): RobertaSelfOutput(\n",
" (dense): Linear(in_features=768, out_features=768, bias=True)\n",
" (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" )\n",
" (intermediate): RobertaIntermediate(\n",
" (dense): Linear(in_features=768, out_features=3072, bias=True)\n",
" (intermediate_act_fn): GELUActivation()\n",
" )\n",
" (output): RobertaOutput(\n",
" (dense): Linear(in_features=3072, out_features=768, bias=True)\n",
" (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" )\n",
" (5): RobertaLayer(\n",
" (attention): RobertaAttention(\n",
" (self): RobertaSelfAttention(\n",
" (query): Linear(in_features=768, out_features=768, bias=True)\n",
" (key): Linear(in_features=768, out_features=768, bias=True)\n",
" (value): Linear(in_features=768, out_features=768, bias=True)\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" (output): RobertaSelfOutput(\n",
" (dense): Linear(in_features=768, out_features=768, bias=True)\n",
" (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" )\n",
" (intermediate): RobertaIntermediate(\n",
" (dense): Linear(in_features=768, out_features=3072, bias=True)\n",
" (intermediate_act_fn): GELUActivation()\n",
" )\n",
" (output): RobertaOutput(\n",
" (dense): Linear(in_features=3072, out_features=768, bias=True)\n",
" (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" )\n",
" (6): RobertaLayer(\n",
" (attention): RobertaAttention(\n",
" (self): RobertaSelfAttention(\n",
" (query): Linear(in_features=768, out_features=768, bias=True)\n",
" (key): Linear(in_features=768, out_features=768, bias=True)\n",
" (value): Linear(in_features=768, out_features=768, bias=True)\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" (output): RobertaSelfOutput(\n",
" (dense): Linear(in_features=768, out_features=768, bias=True)\n",
" (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" )\n",
" (intermediate): RobertaIntermediate(\n",
" (dense): Linear(in_features=768, out_features=3072, bias=True)\n",
" (intermediate_act_fn): GELUActivation()\n",
" )\n",
" (output): RobertaOutput(\n",
" (dense): Linear(in_features=3072, out_features=768, bias=True)\n",
" (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" )\n",
" (7): RobertaLayer(\n",
" (attention): RobertaAttention(\n",
" (self): RobertaSelfAttention(\n",
" (query): Linear(in_features=768, out_features=768, bias=True)\n",
" (key): Linear(in_features=768, out_features=768, bias=True)\n",
" (value): Linear(in_features=768, out_features=768, bias=True)\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" (output): RobertaSelfOutput(\n",
" (dense): Linear(in_features=768, out_features=768, bias=True)\n",
" (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" )\n",
" (intermediate): RobertaIntermediate(\n",
" (dense): Linear(in_features=768, out_features=3072, bias=True)\n",
" (intermediate_act_fn): GELUActivation()\n",
" )\n",
" (output): RobertaOutput(\n",
" (dense): Linear(in_features=3072, out_features=768, bias=True)\n",
" (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" )\n",
" (8): RobertaLayer(\n",
" (attention): RobertaAttention(\n",
" (self): RobertaSelfAttention(\n",
" (query): Linear(in_features=768, out_features=768, bias=True)\n",
" (key): Linear(in_features=768, out_features=768, bias=True)\n",
" (value): Linear(in_features=768, out_features=768, bias=True)\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" (output): RobertaSelfOutput(\n",
" (dense): Linear(in_features=768, out_features=768, bias=True)\n",
" (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" )\n",
" (intermediate): RobertaIntermediate(\n",
" (dense): Linear(in_features=768, out_features=3072, bias=True)\n",
" (intermediate_act_fn): GELUActivation()\n",
" )\n",
" (output): RobertaOutput(\n",
" (dense): Linear(in_features=3072, out_features=768, bias=True)\n",
" (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" )\n",
" (9): RobertaLayer(\n",
" (attention): RobertaAttention(\n",
" (self): RobertaSelfAttention(\n",
" (query): Linear(in_features=768, out_features=768, bias=True)\n",
" (key): Linear(in_features=768, out_features=768, bias=True)\n",
" (value): Linear(in_features=768, out_features=768, bias=True)\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" (output): RobertaSelfOutput(\n",
" (dense): Linear(in_features=768, out_features=768, bias=True)\n",
" (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" )\n",
" (intermediate): RobertaIntermediate(\n",
" (dense): Linear(in_features=768, out_features=3072, bias=True)\n",
" (intermediate_act_fn): GELUActivation()\n",
" )\n",
" (output): RobertaOutput(\n",
" (dense): Linear(in_features=3072, out_features=768, bias=True)\n",
" (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" )\n",
" (10): RobertaLayer(\n",
" (attention): RobertaAttention(\n",
" (self): RobertaSelfAttention(\n",
" (query): Linear(in_features=768, out_features=768, bias=True)\n",
" (key): Linear(in_features=768, out_features=768, bias=True)\n",
" (value): Linear(in_features=768, out_features=768, bias=True)\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" (output): RobertaSelfOutput(\n",
" (dense): Linear(in_features=768, out_features=768, bias=True)\n",
" (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" )\n",
" (intermediate): RobertaIntermediate(\n",
" (dense): Linear(in_features=768, out_features=3072, bias=True)\n",
" (intermediate_act_fn): GELUActivation()\n",
" )\n",
" (output): RobertaOutput(\n",
" (dense): Linear(in_features=3072, out_features=768, bias=True)\n",
" (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" )\n",
" (11): RobertaLayer(\n",
" (attention): RobertaAttention(\n",
" (self): RobertaSelfAttention(\n",
" (query): Linear(in_features=768, out_features=768, bias=True)\n",
" (key): Linear(in_features=768, out_features=768, bias=True)\n",
" (value): Linear(in_features=768, out_features=768, bias=True)\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" (output): RobertaSelfOutput(\n",
" (dense): Linear(in_features=768, out_features=768, bias=True)\n",
" (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" )\n",
" (intermediate): RobertaIntermediate(\n",
" (dense): Linear(in_features=768, out_features=3072, bias=True)\n",
" (intermediate_act_fn): GELUActivation()\n",
" )\n",
" (output): RobertaOutput(\n",
" (dense): Linear(in_features=3072, out_features=768, bias=True)\n",
" (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" )\n",
" )\n",
" )\n",
" )\n",
" (classifier): RobertaClassificationHeadCustomSimple(\n",
" (dense_1): Linear(in_features=768, out_features=3072, bias=True)\n",
" (dense_2): Linear(in_features=3072, out_features=768, bias=True)\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" (out_proj): Linear(in_features=768, out_features=2, bias=True)\n",
" (activation): GELU(approximate='none')\n",
" )\n",
")"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"RobertaForSequenceClassificationCustomSimple.from_pretrained(\"roberta-base\")"
]
},
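{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"Before fine-tuning, the freshly initialized head produces meaningless logits. A minimal forward-pass sketch (assumes the `roberta-base` tokenizer can be downloaded; the input sentence is an arbitrary example):"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import torch\n",
"from transformers import AutoTokenizer\n",
"\n",
"# Sketch of a single untrained forward pass; the sentence is arbitrary.\n",
"tokenizer = AutoTokenizer.from_pretrained(\"roberta-base\")\n",
"model = RobertaForSequenceClassificationCustomSimple.from_pretrained(\"roberta-base\")\n",
"model.eval()\n",
"\n",
"inputs = tokenizer(\"An off-duty policeman watched a train crash.\", return_tensors=\"pt\")\n",
"with torch.no_grad():\n",
"    logits = model(**inputs).logits\n",
"logits  # shape (1, num_labels); values stay random-ish until the head is trained"
]
},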
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"## Training"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"02/16/2023 15:21:14 - WARNING - __main__ - Process rank: -1, device: cuda:0, n_gpu: 1distributed training: False, 16-bits training: False\n",
"02/16/2023 15:21:14 - INFO - __main__ - Training/evaluation parameters TrainingArguments(\n",
"_n_gpu=1,\n",
"adafactor=False,\n",
"adam_beta1=0.9,\n",
"adam_beta2=0.999,\n",
"adam_epsilon=1e-08,\n",
"auto_find_batch_size=False,\n",
"bf16=False,\n",
"bf16_full_eval=False,\n",
"data_seed=None,\n",
"dataloader_drop_last=False,\n",
"dataloader_num_workers=0,\n",
"dataloader_pin_memory=True,\n",
"ddp_bucket_cap_mb=None,\n",
"ddp_find_unused_parameters=None,\n",
"ddp_timeout=1800,\n",
"debug=[],\n",
"deepspeed=None,\n",
"disable_tqdm=False,\n",
"do_eval=True,\n",
"do_predict=True,\n",
"do_train=True,\n",
"eval_accumulation_steps=None,\n",
"eval_delay=0,\n",
"eval_steps=250,\n",
"evaluation_strategy=steps,\n",
"fp16=False,\n",
"fp16_backend=auto,\n",
"fp16_full_eval=False,\n",
"fp16_opt_level=O1,\n",
"fsdp=[],\n",
"fsdp_min_num_params=0,\n",
"fsdp_transformer_layer_cls_to_wrap=None,\n",
"full_determinism=False,\n",
"gradient_accumulation_steps=1,\n",
"gradient_checkpointing=False,\n",
"greater_is_better=True,\n",
"group_by_length=False,\n",
"half_precision_backend=auto,\n",
"hub_model_id=None,\n",
"hub_private_repo=False,\n",
"hub_strategy=every_save,\n",
"hub_token=<HUB_TOKEN>,\n",
"ignore_data_skip=False,\n",
"include_inputs_for_metrics=False,\n",
"jit_mode_eval=False,\n",
"label_names=None,\n",
"label_smoothing_factor=0.0,\n",
"learning_rate=2e-05,\n",
"length_column_name=length,\n",
"load_best_model_at_end=True,\n",
"local_rank=-1,\n",
"log_level=passive,\n",
"log_level_replica=passive,\n",
"log_on_each_node=True,\n",
"logging_dir=out/roberta/runs/Feb16_15-21-13_DESKTOP-R7JO8BQ,\n",
"logging_first_step=False,\n",
"logging_nan_inf_filter=True,\n",
"logging_steps=100,\n",
"logging_strategy=steps,\n",
"lr_scheduler_type=linear,\n",
"max_grad_norm=1.0,\n",
"max_steps=2500,\n",
"metric_for_best_model=accuracy,\n",
"mp_parameters=,\n",
"no_cuda=False,\n",
"num_train_epochs=1.0,\n",
"optim=adamw_hf,\n",
"optim_args=None,\n",
"output_dir=out/roberta,\n",
"overwrite_output_dir=False,\n",
"past_index=-1,\n",
"per_device_eval_batch_size=8,\n",
"per_device_train_batch_size=8,\n",
"prediction_loss_only=False,\n",
"push_to_hub=False,\n",
"push_to_hub_model_id=None,\n",
"push_to_hub_organization=None,\n",
"push_to_hub_token=<PUSH_TO_HUB_TOKEN>,\n",
"ray_scope=last,\n",
"remove_unused_columns=True,\n",
"report_to=[],\n",
"resume_from_checkpoint=None,\n",
"run_name=out/roberta,\n",
"save_on_each_node=False,\n",
"save_steps=250,\n",
"save_strategy=steps,\n",
"save_total_limit=5,\n",
"seed=42,\n",
"sharded_ddp=[],\n",
"skip_memory_metrics=True,\n",
"tf32=None,\n",
"torch_compile=False,\n",
"torch_compile_backend=None,\n",
"torch_compile_mode=None,\n",
"torchdynamo=None,\n",
"tpu_metrics_debug=False,\n",
"tpu_num_cores=None,\n",
"use_ipex=False,\n",
"use_legacy_prediction_loop=False,\n",
"use_mps_device=False,\n",
"warmup_ratio=0.0,\n",
"warmup_steps=0,\n",
"weight_decay=0.0,\n",
"xpu_backend=None,\n",
")\n",
"02/16/2023 15:21:14 - INFO - __main__ - Checkpoint detected, resuming training at out/roberta/checkpoint-2500. To avoid this behavior, change the `--output_dir` or add `--overwrite_output_dir` to train from scratch.\n",
"02/16/2023 15:21:14 - INFO - __main__ - load a local file for train: data/train.json\n",
"02/16/2023 15:21:14 - INFO - __main__ - load a local file for validation: data/valid.json\n",
"02/16/2023 15:21:14 - INFO - __main__ - load a local file for test: data/test.json\n",
"02/16/2023 15:21:14 - WARNING - datasets.builder - Using custom data configuration default-f6e8039906850c57\n",
"02/16/2023 15:21:14 - INFO - datasets.info - Loading Dataset Infos from /home/jacob/anaconda3/envs/ugp/lib/python3.10/site-packages/datasets/packaged_modules/json\n",
"02/16/2023 15:21:14 - INFO - datasets.builder - Overwrite dataset info from restored data version.\n",
"02/16/2023 15:21:14 - INFO - datasets.info - Loading Dataset info from .cache_training/json/default-f6e8039906850c57/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51\n",
"02/16/2023 15:21:14 - WARNING - datasets.builder - Found cached dataset json (/home/jacob/code/university/uczenie_glebokie_w_przetwarzaniu_tekstu/.cache_training/json/default-f6e8039906850c57/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51)\n",
"02/16/2023 15:21:14 - INFO - datasets.info - Loading Dataset info from /home/jacob/code/university/uczenie_glebokie_w_przetwarzaniu_tekstu/.cache_training/json/default-f6e8039906850c57/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51\n",
"100%|█████████████████████████████████████████████| 3/3 [00:00<00:00, 48.00it/s]\n",
"[INFO|configuration_utils.py:660] 2023-02-16 15:21:15,174 >> loading configuration file config.json from cache at .cache_training/models--roberta-base/snapshots/ff46155979338ff8063cdad90908b498ab91b181/config.json\n",
"[INFO|configuration_utils.py:712] 2023-02-16 15:21:15,175 >> Model config RobertaConfig {\n",
" \"_name_or_path\": \"roberta-base\",\n",
" \"architectures\": [\n",
" \"RobertaForMaskedLM\"\n",
" ],\n",
" \"attention_probs_dropout_prob\": 0.1,\n",
" \"bos_token_id\": 0,\n",
" \"classifier_dropout\": null,\n",
" \"eos_token_id\": 2,\n",
" \"hidden_act\": \"gelu\",\n",
" \"hidden_dropout_prob\": 0.1,\n",
" \"hidden_size\": 768,\n",
" \"id2label\": {\n",
" \"0\": \"LABEL_0\",\n",
" \"1\": \"LABEL_1\",\n",
" \"2\": \"LABEL_2\",\n",
" \"3\": \"LABEL_3\"\n",
" },\n",
" \"initializer_range\": 0.02,\n",
" \"intermediate_size\": 3072,\n",
" \"label2id\": {\n",
" \"LABEL_0\": 0,\n",
" \"LABEL_1\": 1,\n",
" \"LABEL_2\": 2,\n",
" \"LABEL_3\": 3\n",
" },\n",
" \"layer_norm_eps\": 1e-05,\n",
" \"max_position_embeddings\": 514,\n",
" \"model_type\": \"roberta\",\n",
" \"num_attention_heads\": 12,\n",
" \"num_hidden_layers\": 12,\n",
" \"pad_token_id\": 1,\n",
" \"position_embedding_type\": \"absolute\",\n",
" \"transformers_version\": \"4.26.1\",\n",
" \"type_vocab_size\": 1,\n",
" \"use_cache\": true,\n",
" \"vocab_size\": 50265\n",
"}\n",
"\n",
"[INFO|tokenization_auto.py:458] 2023-02-16 15:21:15,654 >> Could not locate the tokenizer configuration file, will try to use the model config instead.\n",
"[INFO|configuration_utils.py:660] 2023-02-16 15:21:16,123 >> loading configuration file config.json from cache at .cache_training/models--roberta-base/snapshots/ff46155979338ff8063cdad90908b498ab91b181/config.json\n",
"[INFO|configuration_utils.py:712] 2023-02-16 15:21:16,123 >> Model config RobertaConfig {\n",
" \"_name_or_path\": \"roberta-base\",\n",
" \"architectures\": [\n",
" \"RobertaForMaskedLM\"\n",
" ],\n",
" \"attention_probs_dropout_prob\": 0.1,\n",
" \"bos_token_id\": 0,\n",
" \"classifier_dropout\": null,\n",
" \"eos_token_id\": 2,\n",
" \"hidden_act\": \"gelu\",\n",
" \"hidden_dropout_prob\": 0.1,\n",
" \"hidden_size\": 768,\n",
" \"initializer_range\": 0.02,\n",
" \"intermediate_size\": 3072,\n",
" \"layer_norm_eps\": 1e-05,\n",
" \"max_position_embeddings\": 514,\n",
" \"model_type\": \"roberta\",\n",
" \"num_attention_heads\": 12,\n",
" \"num_hidden_layers\": 12,\n",
" \"pad_token_id\": 1,\n",
" \"position_embedding_type\": \"absolute\",\n",
" \"transformers_version\": \"4.26.1\",\n",
" \"type_vocab_size\": 1,\n",
" \"use_cache\": true,\n",
" \"vocab_size\": 50265\n",
"}\n",
"\n",
"[INFO|tokenization_utils_base.py:1802] 2023-02-16 15:21:17,045 >> loading file vocab.json from cache at .cache_training/models--roberta-base/snapshots/ff46155979338ff8063cdad90908b498ab91b181/vocab.json\n",
"[INFO|tokenization_utils_base.py:1802] 2023-02-16 15:21:17,045 >> loading file merges.txt from cache at .cache_training/models--roberta-base/snapshots/ff46155979338ff8063cdad90908b498ab91b181/merges.txt\n",
"[INFO|tokenization_utils_base.py:1802] 2023-02-16 15:21:17,045 >> loading file tokenizer.json from cache at .cache_training/models--roberta-base/snapshots/ff46155979338ff8063cdad90908b498ab91b181/tokenizer.json\n",
"[INFO|tokenization_utils_base.py:1802] 2023-02-16 15:21:17,045 >> loading file added_tokens.json from cache at None\n",
"[INFO|tokenization_utils_base.py:1802] 2023-02-16 15:21:17,045 >> loading file special_tokens_map.json from cache at None\n",
"[INFO|tokenization_utils_base.py:1802] 2023-02-16 15:21:17,045 >> loading file tokenizer_config.json from cache at None\n",
"[INFO|configuration_utils.py:660] 2023-02-16 15:21:17,045 >> loading configuration file config.json from cache at .cache_training/models--roberta-base/snapshots/ff46155979338ff8063cdad90908b498ab91b181/config.json\n",
"[INFO|configuration_utils.py:712] 2023-02-16 15:21:17,046 >> Model config RobertaConfig {\n",
" \"_name_or_path\": \"roberta-base\",\n",
" \"architectures\": [\n",
" \"RobertaForMaskedLM\"\n",
" ],\n",
" \"attention_probs_dropout_prob\": 0.1,\n",
" \"bos_token_id\": 0,\n",
" \"classifier_dropout\": null,\n",
" \"eos_token_id\": 2,\n",
" \"hidden_act\": \"gelu\",\n",
" \"hidden_dropout_prob\": 0.1,\n",
" \"hidden_size\": 768,\n",
" \"initializer_range\": 0.02,\n",
" \"intermediate_size\": 3072,\n",
" \"layer_norm_eps\": 1e-05,\n",
" \"max_position_embeddings\": 514,\n",
" \"model_type\": \"roberta\",\n",
" \"num_attention_heads\": 12,\n",
" \"num_hidden_layers\": 12,\n",
" \"pad_token_id\": 1,\n",
" \"position_embedding_type\": \"absolute\",\n",
" \"transformers_version\": \"4.26.1\",\n",
" \"type_vocab_size\": 1,\n",
" \"use_cache\": true,\n",
" \"vocab_size\": 50265\n",
"}\n",
"\n",
"02/16/2023 15:21:17 - INFO - __main__ - Using hidden states in model: False\n",
"-------------------------------------------------------- Using hidden: False\n",
"02/16/2023 15:21:17 - INFO - __main__ - Using implementation from class: RobertaForSequenceClassificationCustomSimple\n",
"[INFO|modeling_utils.py:2275] 2023-02-16 15:21:17,101 >> loading weights file pytorch_model.bin from cache at .cache_training/models--roberta-base/snapshots/ff46155979338ff8063cdad90908b498ab91b181/pytorch_model.bin\n",
"[WARNING|modeling_utils.py:2847] 2023-02-16 15:21:22,965 >> Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassificationCustomSimple: ['lm_head.dense.weight', 'lm_head.bias', 'lm_head.dense.bias', 'lm_head.decoder.weight', 'lm_head.layer_norm.bias', 'roberta.pooler.dense.weight', 'lm_head.layer_norm.weight', 'roberta.pooler.dense.bias']\n",
"- This IS expected if you are initializing RobertaForSequenceClassificationCustomSimple from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).\n",
"- This IS NOT expected if you are initializing RobertaForSequenceClassificationCustomSimple from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n",
"[WARNING|modeling_utils.py:2859] 2023-02-16 15:21:22,965 >> Some weights of RobertaForSequenceClassificationCustomSimple were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense_1.bias', 'classifier.dense_2.weight', 'classifier.out_proj.weight', 'classifier.dense_2.bias', 'classifier.out_proj.bias', 'classifier.dense_1.weight']\n",
"You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n",
"RobertaForSequenceClassificationCustomSimple(\n",
" (roberta): RobertaModel(\n",
" (embeddings): RobertaEmbeddings(\n",
" (word_embeddings): Embedding(50265, 768, padding_idx=1)\n",
" (position_embeddings): Embedding(514, 768, padding_idx=1)\n",
" (token_type_embeddings): Embedding(1, 768)\n",
" (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" (encoder): RobertaEncoder(\n",
" (layer): ModuleList(\n",
" (0): RobertaLayer(\n",
" (attention): RobertaAttention(\n",
" (self): RobertaSelfAttention(\n",
" (query): Linear(in_features=768, out_features=768, bias=True)\n",
" (key): Linear(in_features=768, out_features=768, bias=True)\n",
" (value): Linear(in_features=768, out_features=768, bias=True)\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" (output): RobertaSelfOutput(\n",
" (dense): Linear(in_features=768, out_features=768, bias=True)\n",
" (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" )\n",
" (intermediate): RobertaIntermediate(\n",
" (dense): Linear(in_features=768, out_features=3072, bias=True)\n",
" (intermediate_act_fn): GELUActivation()\n",
" )\n",
" (output): RobertaOutput(\n",
" (dense): Linear(in_features=3072, out_features=768, bias=True)\n",
" (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" )\n",
" (1): RobertaLayer(\n",
" (attention): RobertaAttention(\n",
" (self): RobertaSelfAttention(\n",
" (query): Linear(in_features=768, out_features=768, bias=True)\n",
" (key): Linear(in_features=768, out_features=768, bias=True)\n",
" (value): Linear(in_features=768, out_features=768, bias=True)\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" (output): RobertaSelfOutput(\n",
" (dense): Linear(in_features=768, out_features=768, bias=True)\n",
" (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" )\n",
" (intermediate): RobertaIntermediate(\n",
" (dense): Linear(in_features=768, out_features=3072, bias=True)\n",
" (intermediate_act_fn): GELUActivation()\n",
" )\n",
" (output): RobertaOutput(\n",
" (dense): Linear(in_features=3072, out_features=768, bias=True)\n",
" (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" )\n",
" (2): RobertaLayer(\n",
" (attention): RobertaAttention(\n",
" (self): RobertaSelfAttention(\n",
" (query): Linear(in_features=768, out_features=768, bias=True)\n",
" (key): Linear(in_features=768, out_features=768, bias=True)\n",
" (value): Linear(in_features=768, out_features=768, bias=True)\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" (output): RobertaSelfOutput(\n",
" (dense): Linear(in_features=768, out_features=768, bias=True)\n",
" (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" )\n",
" (intermediate): RobertaIntermediate(\n",
" (dense): Linear(in_features=768, out_features=3072, bias=True)\n",
" (intermediate_act_fn): GELUActivation()\n",
" )\n",
" (output): RobertaOutput(\n",
" (dense): Linear(in_features=3072, out_features=768, bias=True)\n",
" (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" )\n",
" (3): RobertaLayer(\n",
" (attention): RobertaAttention(\n",
" (self): RobertaSelfAttention(\n",
" (query): Linear(in_features=768, out_features=768, bias=True)\n",
" (key): Linear(in_features=768, out_features=768, bias=True)\n",
" (value): Linear(in_features=768, out_features=768, bias=True)\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" (output): RobertaSelfOutput(\n",
" (dense): Linear(in_features=768, out_features=768, bias=True)\n",
" (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" )\n",
" (intermediate): RobertaIntermediate(\n",
" (dense): Linear(in_features=768, out_features=3072, bias=True)\n",
" (intermediate_act_fn): GELUActivation()\n",
" )\n",
" (output): RobertaOutput(\n",
" (dense): Linear(in_features=3072, out_features=768, bias=True)\n",
" (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" )\n",
" (4): RobertaLayer(\n",
" (attention): RobertaAttention(\n",
" (self): RobertaSelfAttention(\n",
" (query): Linear(in_features=768, out_features=768, bias=True)\n",
" (key): Linear(in_features=768, out_features=768, bias=True)\n",
" (value): Linear(in_features=768, out_features=768, bias=True)\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" (output): RobertaSelfOutput(\n",
" (dense): Linear(in_features=768, out_features=768, bias=True)\n",
" (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" )\n",
" (intermediate): RobertaIntermediate(\n",
" (dense): Linear(in_features=768, out_features=3072, bias=True)\n",
" (intermediate_act_fn): GELUActivation()\n",
" )\n",
" (output): RobertaOutput(\n",
" (dense): Linear(in_features=3072, out_features=768, bias=True)\n",
" (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" )\n",
" (5): RobertaLayer(\n",
" (attention): RobertaAttention(\n",
" (self): RobertaSelfAttention(\n",
" (query): Linear(in_features=768, out_features=768, bias=True)\n",
" (key): Linear(in_features=768, out_features=768, bias=True)\n",
" (value): Linear(in_features=768, out_features=768, bias=True)\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" (output): RobertaSelfOutput(\n",
" (dense): Linear(in_features=768, out_features=768, bias=True)\n",
" (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" )\n",
" (intermediate): RobertaIntermediate(\n",
" (dense): Linear(in_features=768, out_features=3072, bias=True)\n",
" (intermediate_act_fn): GELUActivation()\n",
" )\n",
" (output): RobertaOutput(\n",
" (dense): Linear(in_features=3072, out_features=768, bias=True)\n",
" (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" )\n",
" (6): RobertaLayer(\n",
" (attention): RobertaAttention(\n",
" (self): RobertaSelfAttention(\n",
" (query): Linear(in_features=768, out_features=768, bias=True)\n",
" (key): Linear(in_features=768, out_features=768, bias=True)\n",
" (value): Linear(in_features=768, out_features=768, bias=True)\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" (output): RobertaSelfOutput(\n",
" (dense): Linear(in_features=768, out_features=768, bias=True)\n",
" (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" )\n",
" (intermediate): RobertaIntermediate(\n",
" (dense): Linear(in_features=768, out_features=3072, bias=True)\n",
" (intermediate_act_fn): GELUActivation()\n",
" )\n",
" (output): RobertaOutput(\n",
" (dense): Linear(in_features=3072, out_features=768, bias=True)\n",
" (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" )\n",
" (7): RobertaLayer(\n",
" (attention): RobertaAttention(\n",
" (self): RobertaSelfAttention(\n",
" (query): Linear(in_features=768, out_features=768, bias=True)\n",
" (key): Linear(in_features=768, out_features=768, bias=True)\n",
" (value): Linear(in_features=768, out_features=768, bias=True)\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" (output): RobertaSelfOutput(\n",
" (dense): Linear(in_features=768, out_features=768, bias=True)\n",
" (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" )\n",
" (intermediate): RobertaIntermediate(\n",
" (dense): Linear(in_features=768, out_features=3072, bias=True)\n",
" (intermediate_act_fn): GELUActivation()\n",
" )\n",
" (output): RobertaOutput(\n",
" (dense): Linear(in_features=3072, out_features=768, bias=True)\n",
" (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" )\n",
" (8): RobertaLayer(\n",
" (attention): RobertaAttention(\n",
" (self): RobertaSelfAttention(\n",
" (query): Linear(in_features=768, out_features=768, bias=True)\n",
" (key): Linear(in_features=768, out_features=768, bias=True)\n",
" (value): Linear(in_features=768, out_features=768, bias=True)\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" (output): RobertaSelfOutput(\n",
" (dense): Linear(in_features=768, out_features=768, bias=True)\n",
" (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" )\n",
" (intermediate): RobertaIntermediate(\n",
" (dense): Linear(in_features=768, out_features=3072, bias=True)\n",
" (intermediate_act_fn): GELUActivation()\n",
" )\n",
" (output): RobertaOutput(\n",
" (dense): Linear(in_features=3072, out_features=768, bias=True)\n",
" (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" )\n",
" (9): RobertaLayer(\n",
" (attention): RobertaAttention(\n",
" (self): RobertaSelfAttention(\n",
" (query): Linear(in_features=768, out_features=768, bias=True)\n",
" (key): Linear(in_features=768, out_features=768, bias=True)\n",
" (value): Linear(in_features=768, out_features=768, bias=True)\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" (output): RobertaSelfOutput(\n",
" (dense): Linear(in_features=768, out_features=768, bias=True)\n",
" (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" )\n",
" (intermediate): RobertaIntermediate(\n",
" (dense): Linear(in_features=768, out_features=3072, bias=True)\n",
" (intermediate_act_fn): GELUActivation()\n",
" )\n",
" (output): RobertaOutput(\n",
" (dense): Linear(in_features=3072, out_features=768, bias=True)\n",
" (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" )\n",
" (10): RobertaLayer(\n",
" (attention): RobertaAttention(\n",
" (self): RobertaSelfAttention(\n",
" (query): Linear(in_features=768, out_features=768, bias=True)\n",
" (key): Linear(in_features=768, out_features=768, bias=True)\n",
" (value): Linear(in_features=768, out_features=768, bias=True)\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" (output): RobertaSelfOutput(\n",
" (dense): Linear(in_features=768, out_features=768, bias=True)\n",
" (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" )\n",
" (intermediate): RobertaIntermediate(\n",
" (dense): Linear(in_features=768, out_features=3072, bias=True)\n",
" (intermediate_act_fn): GELUActivation()\n",
" )\n",
" (output): RobertaOutput(\n",
" (dense): Linear(in_features=3072, out_features=768, bias=True)\n",
" (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" )\n",
" (11): RobertaLayer(\n",
" (attention): RobertaAttention(\n",
" (self): RobertaSelfAttention(\n",
" (query): Linear(in_features=768, out_features=768, bias=True)\n",
" (key): Linear(in_features=768, out_features=768, bias=True)\n",
" (value): Linear(in_features=768, out_features=768, bias=True)\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" (output): RobertaSelfOutput(\n",
" (dense): Linear(in_features=768, out_features=768, bias=True)\n",
" (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" )\n",
" (intermediate): RobertaIntermediate(\n",
" (dense): Linear(in_features=768, out_features=3072, bias=True)\n",
" (intermediate_act_fn): GELUActivation()\n",
" )\n",
" (output): RobertaOutput(\n",
" (dense): Linear(in_features=3072, out_features=768, bias=True)\n",
" (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" )\n",
" )\n",
" )\n",
" )\n",
" (classifier): RobertaClassificationHeadCustomSimple(\n",
" (dense_1): Linear(in_features=768, out_features=3072, bias=True)\n",
" (dense_2): Linear(in_features=3072, out_features=768, bias=True)\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" (out_proj): Linear(in_features=768, out_features=4, bias=True)\n",
" (activation): GELU(approximate='none')\n",
" )\n",
")\n",
"02/16/2023 15:21:22 - WARNING - datasets.arrow_dataset - Loading cached processed dataset at /home/jacob/code/university/uczenie_glebokie_w_przetwarzaniu_tekstu/.cache_training/json/default-f6e8039906850c57/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51/cache-204a6dc6fcae3352.arrow\n",
"Running tokenizer on dataset: 0%| | 0/4 [00:00<?, ?ba/s]02/16/2023 15:21:23 - INFO - datasets.arrow_dataset - Caching processed dataset at /home/jacob/code/university/uczenie_glebokie_w_przetwarzaniu_tekstu/.cache_training/json/default-f6e8039906850c57/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51/cache-9091129e58fb62d5.arrow\n",
"Running tokenizer on dataset: 100%|███████████████| 4/4 [00:00<00:00, 15.86ba/s]\n",
"02/16/2023 15:21:23 - WARNING - datasets.arrow_dataset - Loading cached processed dataset at /home/jacob/code/university/uczenie_glebokie_w_przetwarzaniu_tekstu/.cache_training/json/default-f6e8039906850c57/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51/cache-bdfe4224bf4c9f20.arrow\n",
"02/16/2023 15:21:23 - INFO - __main__ - Set 500 samples for 0-class\n",
"02/16/2023 15:21:23 - INFO - __main__ - Set 500 samples for 1-class\n",
"02/16/2023 15:21:23 - INFO - __main__ - Set 500 samples for 2-class\n",
"02/16/2023 15:21:23 - INFO - __main__ - Set 500 samples for 3-class\n",
"02/16/2023 15:21:23 - INFO - __main__ - Sample 83810 of the training set: {'label': 0, 'text': \"Policeman 'saw fatal train crash' An off-duty policeman watched a train plough into a car on a level crossing in Berkshire, killing six people.\", 'input_ids': [0, 510, 12589, 5649, 128, 35349, 6484, 2341, 2058, 108, 660, 160, 12, 15593, 20976, 3996, 10, 2341, 2968, 4894, 88, 10, 512, 15, 10, 672, 6724, 1437, 11, 16563, 6, 2429, 411, 82, 4, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]}.\n",
"02/16/2023 15:21:23 - INFO - __main__ - Sample 14592 of the training set: {'label': 1, 'text': 'Silver finale for USA In the last event of the 2004 Olympic Games, the United States track team produced one last surprise. Meb Keflezighi, a native of Eritrea who moved to the United States as ', 'input_ids': [0, 39008, 7712, 13, 2805, 96, 5, 94, 515, 9, 5, 4482, 3336, 3100, 6, 5, 315, 532, 1349, 165, 2622, 65, 94, 2755, 4, 256, 3209, 229, 4550, 23250, 8774, 118, 6, 10, 3763, 9, 24372, 9891, 54, 1410, 7, 5, 315, 532, 25, 1437, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]}.\n",
"02/16/2023 15:21:23 - INFO - __main__ - Sample 3278 of the training set: {'label': 3, 'text': 'Compuware Blasts IBM #39;s Legal Tactics Two years ago, IBM was ordered to produce the source code for its products, which Compuware identified as containing its pirated intellectual property. The code was missing. But lo and behold -- last week, they called and said they had it, quot; ...', 'input_ids': [0, 24699, 257, 10680, 2091, 13651, 11510, 849, 3416, 131, 29, 10661, 45689, 1596, 107, 536, 6, 11510, 21, 2740, 7, 2592, 5, 1300, 3260, 13, 63, 785, 6, 61, 10081, 257, 10680, 2006, 25, 8200, 63, 36287, 1070, 9594, 1038, 4, 20, 3260, 21, 1716, 4, 125, 4600, 8, 29308, 480, 94, 186, 6, 51, 373, 8, 26, 51, 56, 24, 6, 39809, 131, 1666, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]}.\n",
"[INFO|trainer.py:511] 2023-02-16 15:21:27,576 >> max_steps is given, it will override any value given in num_train_epochs\n",
"[INFO|trainer.py:1972] 2023-02-16 15:21:27,576 >> Loading model from out/roberta/checkpoint-2500.\n",
"[INFO|trainer.py:710] 2023-02-16 15:21:29,498 >> The following columns in the training set don't have a corresponding argument in `RobertaForSequenceClassificationCustomSimple.forward` and have been ignored: text. If text are not expected by `RobertaForSequenceClassificationCustomSimple.forward`, you can safely ignore this message.\n",
"/home/jacob/anaconda3/envs/ugp/lib/python3.10/site-packages/transformers/optimization.py:306: FutureWarning: This implementation of AdamW is deprecated and will be removed in a future version. Use the PyTorch implementation torch.optim.AdamW instead, or set `no_deprecation_warning=True` to disable this warning\n",
" warnings.warn(\n",
"[INFO|trainer.py:1650] 2023-02-16 15:21:31,949 >> ***** Running training *****\n",
"[INFO|trainer.py:1651] 2023-02-16 15:21:31,950 >> Num examples = 120000\n",
"[INFO|trainer.py:1652] 2023-02-16 15:21:31,950 >> Num Epochs = 1\n",
"[INFO|trainer.py:1653] 2023-02-16 15:21:31,950 >> Instantaneous batch size per device = 8\n",
"[INFO|trainer.py:1654] 2023-02-16 15:21:31,950 >> Total train batch size (w. parallel, distributed & accumulation) = 8\n",
"[INFO|trainer.py:1655] 2023-02-16 15:21:31,950 >> Gradient Accumulation steps = 1\n",
"[INFO|trainer.py:1656] 2023-02-16 15:21:31,950 >> Total optimization steps = 2500\n",
"[INFO|trainer.py:1657] 2023-02-16 15:21:31,951 >> Number of trainable parameters = 128780548\n",
"[INFO|trainer.py:1679] 2023-02-16 15:21:31,951 >> Continuing training from checkpoint, will skip to saved global_step\n",
"[INFO|trainer.py:1680] 2023-02-16 15:21:31,951 >> Continuing training from epoch 0\n",
"[INFO|trainer.py:1681] 2023-02-16 15:21:31,951 >> Continuing training from global step 2500\n",
"[INFO|trainer.py:1683] 2023-02-16 15:21:31,951 >> Will skip the first 0 epochs then the first 2500 batches in the first epoch. If this takes a lot of time, you can add the `--ignore_data_skip` flag to your launch command, but you will resume the training on data already seen by your model.\n",
"Skipping the first batches: 0%| | 0/2500 [00:00<?, ?it/s]\n",
"Skipping the first batches: 100%|██████████| 2500/2500 [00:03<00:00, 717.10it/s]\u001b[A\n",
"\n",
"2501it [00:04, 522.91it/s] \u001b[A[INFO|trainer.py:1901] 2023-02-16 15:21:36,738 >> \n",
"\n",
"Training completed. Do not forget to share your model on huggingface.co/models =)\n",
"\n",
"\n",
"[INFO|trainer.py:2025] 2023-02-16 15:21:36,738 >> Loading best model from out/roberta/checkpoint-2500 (score: 0.9229999780654907).\n",
"\n",
"\u001b[A{'train_runtime': 5.7972, 'train_samples_per_second': 3449.95, 'train_steps_per_second': 431.244, 'train_loss': 3.2215512862971954e-06, 'epoch': 0.17}\n",
"\n",
"2501it [00:05, 431.57it/s]\u001b[A\n",
"[INFO|trainer.py:2709] 2023-02-16 15:21:37,750 >> Saving model checkpoint to out/roberta\n",
"[INFO|configuration_utils.py:453] 2023-02-16 15:21:37,751 >> Configuration saved in out/roberta/config.json\n",
"[INFO|modeling_utils.py:1704] 2023-02-16 15:21:38,719 >> Model weights saved in out/roberta/pytorch_model.bin\n",
"[INFO|tokenization_utils_base.py:2160] 2023-02-16 15:21:38,742 >> tokenizer config file saved in out/roberta/tokenizer_config.json\n",
"[INFO|tokenization_utils_base.py:2167] 2023-02-16 15:21:38,743 >> Special tokens file saved in out/roberta/special_tokens_map.json\n",
"***** train metrics *****\n",
" epoch = 0.17\n",
" train_loss = 0.0\n",
" train_runtime = 0:00:05.79\n",
" train_samples = 120000\n",
" train_samples_per_second = 3449.95\n",
" train_steps_per_second = 431.244\n",
"02/16/2023 15:21:38 - INFO - __main__ - *** Evaluate ***\n",
"[INFO|trainer.py:710] 2023-02-16 15:21:38,862 >> The following columns in the evaluation set don't have a corresponding argument in `RobertaForSequenceClassificationCustomSimple.forward` and have been ignored: text. If text are not expected by `RobertaForSequenceClassificationCustomSimple.forward`, you can safely ignore this message.\n",
"[INFO|trainer.py:2964] 2023-02-16 15:21:38,863 >> ***** Running Evaluation *****\n",
"[INFO|trainer.py:2966] 2023-02-16 15:21:38,863 >> Num examples = 2000\n",
"[INFO|trainer.py:2969] 2023-02-16 15:21:38,863 >> Batch size = 8\n",
"100%|█████████████████████████████████████████| 250/250 [00:16<00:00, 14.75it/s]\n",
"***** eval metrics *****\n",
" epoch = 0.17\n",
" eval_accuracy = 0.923\n",
" eval_loss = 0.296\n",
" eval_runtime = 0:00:17.06\n",
" eval_samples = 2000\n",
" eval_samples_per_second = 117.168\n",
" eval_steps_per_second = 14.646\n",
"02/16/2023 15:21:55 - INFO - __main__ - *** Predict ***\n",
"[INFO|trainer.py:710] 2023-02-16 15:21:55,934 >> The following columns in the test set don't have a corresponding argument in `RobertaForSequenceClassificationCustomSimple.forward` and have been ignored: text. If text are not expected by `RobertaForSequenceClassificationCustomSimple.forward`, you can safely ignore this message.\n",
"[INFO|trainer.py:2964] 2023-02-16 15:21:55,935 >> ***** Running Prediction *****\n",
"[INFO|trainer.py:2966] 2023-02-16 15:21:55,935 >> Num examples = 3800\n",
"[INFO|trainer.py:2969] 2023-02-16 15:21:55,935 >> Batch size = 8\n",
"100%|█████████████████████████████████████████| 475/475 [00:32<00:00, 14.74it/s]\n",
"02/16/2023 15:22:28 - INFO - __main__ - ***** Predict results None *****\n",
"[INFO|modelcard.py:449] 2023-02-16 15:22:28,796 >> Dropping the following result as it does not have all the necessary fields:\n",
"{'task': {'name': 'Text Classification', 'type': 'text-classification'}, 'metrics': [{'name': 'Accuracy', 'type': 'accuracy', 'value': 0.9229999780654907}]}\n"
]
}
],
"source": [
"!python run_glue.py \\\n",
" --cache_dir .cache_training \\\n",
" --model_name_or_path roberta-base \\\n",
" --custom_model roberta_simple \\\n",
" --train_file data/train.json \\\n",
" --validation_file data/valid.json \\\n",
" --test_file data/test.json \\\n",
" --per_device_train_batch_size 8 \\\n",
" --per_device_eval_batch_size 8 \\\n",
" --do_train \\\n",
" --do_eval \\\n",
" --do_predict \\\n",
" --max_seq_length 128 \\\n",
" --learning_rate 2e-5 \\\n",
" --max_eval_samples 2000 \\\n",
" --max_steps 2500 \\\n",
" --num_train_epochs 1 \\\n",
" --save_strategy steps \\\n",
" --save_steps 250 \\\n",
" --save_total_limit 5 \\\n",
" --logging_strategy steps \\\n",
" --logging_steps 100 \\\n",
" --eval_steps 250 \\\n",
" --evaluation_strategy steps \\\n",
" --metric_for_best_model accuracy \\\n",
" --greater_is_better True \\\n",
" --load_best_model_at_end True \\\n",
" --output_dir out/roberta"
]
},
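{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"The run above saved the fine-tuned weights, config and tokenizer to `out/roberta` (see the log). A hedged inference sketch, assuming that checkpoint directory exists locally; the example text is taken from one of the training samples logged above, and the labels are the integer ids 0-3 from the saved config:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import torch\n",
"from transformers import AutoTokenizer\n",
"\n",
"# Load the checkpoint written by run_glue.py (see the training log above).\n",
"tokenizer = AutoTokenizer.from_pretrained(\"out/roberta\")\n",
"model = RobertaForSequenceClassificationCustomSimple.from_pretrained(\"out/roberta\")\n",
"model.eval()\n",
"\n",
"text = \"Silver finale for USA In the last event of the 2004 Olympic Games, the United States track team produced one last surprise.\"\n",
"inputs = tokenizer(text, return_tensors=\"pt\")\n",
"with torch.no_grad():\n",
"    pred = model(**inputs).logits.argmax(dim=-1).item()\n",
"pred  # predicted class id (0-3)"
]
},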
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"## Evaluation"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"02/16/2023 16:46:49 - WARNING - __main__ - Process rank: -1, device: cuda:0, n_gpu: 1distributed training: False, 16-bits training: False\n",
"02/16/2023 16:46:49 - INFO - __main__ - Training/evaluation parameters TrainingArguments(\n",
"_n_gpu=1,\n",
"adafactor=False,\n",
"adam_beta1=0.9,\n",
"adam_beta2=0.999,\n",
"adam_epsilon=1e-08,\n",
"auto_find_batch_size=False,\n",
"bf16=False,\n",
"bf16_full_eval=False,\n",
"data_seed=None,\n",
"dataloader_drop_last=False,\n",
"dataloader_num_workers=0,\n",
"dataloader_pin_memory=True,\n",
"ddp_bucket_cap_mb=None,\n",
"ddp_find_unused_parameters=None,\n",
"ddp_timeout=1800,\n",
"debug=[],\n",
"deepspeed=None,\n",
"disable_tqdm=False,\n",
"do_eval=True,\n",
"do_predict=True,\n",
"do_train=False,\n",
"eval_accumulation_steps=None,\n",
"eval_delay=0,\n",
"eval_steps=250,\n",
"evaluation_strategy=steps,\n",
"fp16=False,\n",
"fp16_backend=auto,\n",
"fp16_full_eval=False,\n",
"fp16_opt_level=O1,\n",
"fsdp=[],\n",
"fsdp_min_num_params=0,\n",
"fsdp_transformer_layer_cls_to_wrap=None,\n",
"full_determinism=False,\n",
"gradient_accumulation_steps=1,\n",
"gradient_checkpointing=False,\n",
"greater_is_better=True,\n",
"group_by_length=False,\n",
"half_precision_backend=auto,\n",
"hub_model_id=None,\n",
"hub_private_repo=False,\n",
"hub_strategy=every_save,\n",
"hub_token=<HUB_TOKEN>,\n",
"ignore_data_skip=False,\n",
"include_inputs_for_metrics=False,\n",
"jit_mode_eval=False,\n",
"label_names=None,\n",
"label_smoothing_factor=0.0,\n",
"learning_rate=2e-05,\n",
"length_column_name=length,\n",
"load_best_model_at_end=True,\n",
"local_rank=-1,\n",
"log_level=passive,\n",
"log_level_replica=passive,\n",
"log_on_each_node=True,\n",
"logging_dir=out/roberta_results/runs/Feb16_16-46-48_DESKTOP-R7JO8BQ,\n",
"logging_first_step=False,\n",
"logging_nan_inf_filter=True,\n",
"logging_steps=100,\n",
"logging_strategy=steps,\n",
"lr_scheduler_type=linear,\n",
"max_grad_norm=1.0,\n",
"max_steps=2500,\n",
"metric_for_best_model=accuracy,\n",
"mp_parameters=,\n",
"no_cuda=False,\n",
"num_train_epochs=1.0,\n",
"optim=adamw_hf,\n",
"optim_args=None,\n",
"output_dir=out/roberta_results,\n",
"overwrite_output_dir=False,\n",
"past_index=-1,\n",
"per_device_eval_batch_size=8,\n",
"per_device_train_batch_size=8,\n",
"prediction_loss_only=False,\n",
"push_to_hub=False,\n",
"push_to_hub_model_id=None,\n",
"push_to_hub_organization=None,\n",
"push_to_hub_token=<PUSH_TO_HUB_TOKEN>,\n",
"ray_scope=last,\n",
"remove_unused_columns=True,\n",
"report_to=[],\n",
"resume_from_checkpoint=None,\n",
"run_name=out/roberta_results,\n",
"save_on_each_node=False,\n",
"save_steps=250,\n",
"save_strategy=steps,\n",
"save_total_limit=5,\n",
"seed=42,\n",
"sharded_ddp=[],\n",
"skip_memory_metrics=True,\n",
"tf32=None,\n",
"torch_compile=False,\n",
"torch_compile_backend=None,\n",
"torch_compile_mode=None,\n",
"torchdynamo=None,\n",
"tpu_metrics_debug=False,\n",
"tpu_num_cores=None,\n",
"use_ipex=False,\n",
"use_legacy_prediction_loop=False,\n",
"use_mps_device=False,\n",
"warmup_ratio=0.0,\n",
"warmup_steps=0,\n",
"weight_decay=0.0,\n",
"xpu_backend=None,\n",
")\n",
"02/16/2023 16:46:49 - INFO - __main__ - load a local file for train: data/train.json\n",
"02/16/2023 16:46:49 - INFO - __main__ - load a local file for validation: data/valid.json\n",
"02/16/2023 16:46:49 - INFO - __main__ - load a local file for test: data/test.json\n",
"02/16/2023 16:46:50 - WARNING - datasets.builder - Using custom data configuration default-f6e8039906850c57\n",
"02/16/2023 16:46:50 - INFO - datasets.info - Loading Dataset Infos from /home/jacob/anaconda3/envs/ugp/lib/python3.10/site-packages/datasets/packaged_modules/json\n",
"02/16/2023 16:46:50 - INFO - datasets.builder - Overwrite dataset info from restored data version.\n",
"02/16/2023 16:46:50 - INFO - datasets.info - Loading Dataset info from .cache_training/json/default-f6e8039906850c57/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51\n",
"02/16/2023 16:46:50 - WARNING - datasets.builder - Found cached dataset json (/home/jacob/code/university/uczenie_glebokie_w_przetwarzaniu_tekstu/.cache_training/json/default-f6e8039906850c57/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51)\n",
"02/16/2023 16:46:50 - INFO - datasets.info - Loading Dataset info from /home/jacob/code/university/uczenie_glebokie_w_przetwarzaniu_tekstu/.cache_training/json/default-f6e8039906850c57/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51\n",
"100%|████████████████████████████████████████████| 3/3 [00:00<00:00, 752.21it/s]\n",
"[INFO|configuration_utils.py:658] 2023-02-16 16:46:50,276 >> loading configuration file out/roberta/config.json\n",
"[INFO|configuration_utils.py:712] 2023-02-16 16:46:50,277 >> Model config RobertaConfig {\n",
" \"_name_or_path\": \"out/roberta\",\n",
" \"architectures\": [\n",
" \"RobertaForSequenceClassificationCustomSimple\"\n",
" ],\n",
" \"attention_probs_dropout_prob\": 0.1,\n",
" \"bos_token_id\": 0,\n",
" \"classifier_dropout\": null,\n",
" \"eos_token_id\": 2,\n",
" \"hidden_act\": \"gelu\",\n",
" \"hidden_dropout_prob\": 0.1,\n",
" \"hidden_size\": 768,\n",
" \"id2label\": {\n",
" \"0\": 0,\n",
" \"1\": 1,\n",
" \"2\": 2,\n",
" \"3\": 3\n",
" },\n",
" \"initializer_range\": 0.02,\n",
" \"intermediate_size\": 3072,\n",
" \"label2id\": {\n",
" \"0\": 0,\n",
" \"1\": 1,\n",
" \"2\": 2,\n",
" \"3\": 3\n",
" },\n",
" \"layer_norm_eps\": 1e-05,\n",
" \"max_position_embeddings\": 514,\n",
" \"model_type\": \"roberta\",\n",
" \"num_attention_heads\": 12,\n",
" \"num_hidden_layers\": 12,\n",
" \"pad_token_id\": 1,\n",
" \"position_embedding_type\": \"absolute\",\n",
" \"problem_type\": \"single_label_classification\",\n",
" \"torch_dtype\": \"float32\",\n",
" \"transformers_version\": \"4.26.1\",\n",
" \"type_vocab_size\": 1,\n",
" \"use_cache\": true,\n",
" \"use_hidden_states\": false,\n",
" \"vocab_size\": 50265\n",
"}\n",
"\n",
"[INFO|tokenization_utils_base.py:1800] 2023-02-16 16:46:50,283 >> loading file vocab.json\n",
"[INFO|tokenization_utils_base.py:1800] 2023-02-16 16:46:50,283 >> loading file merges.txt\n",
"[INFO|tokenization_utils_base.py:1800] 2023-02-16 16:46:50,284 >> loading file tokenizer.json\n",
"[INFO|tokenization_utils_base.py:1800] 2023-02-16 16:46:50,284 >> loading file added_tokens.json\n",
"[INFO|tokenization_utils_base.py:1800] 2023-02-16 16:46:50,284 >> loading file special_tokens_map.json\n",
"[INFO|tokenization_utils_base.py:1800] 2023-02-16 16:46:50,284 >> loading file tokenizer_config.json\n",
"02/16/2023 16:46:50 - INFO - __main__ - Using hidden states in model: False\n",
"-------------------------------------------------------- Using hidden: False\n",
"02/16/2023 16:46:50 - INFO - __main__ - Using implementation from class: RobertaForSequenceClassificationCustomSimple\n",
"[INFO|modeling_utils.py:2272] 2023-02-16 16:46:50,339 >> loading weights file out/roberta/pytorch_model.bin\n",
"[INFO|modeling_utils.py:2857] 2023-02-16 16:46:52,079 >> All model checkpoint weights were used when initializing RobertaForSequenceClassificationCustomSimple.\n",
"\n",
"[INFO|modeling_utils.py:2865] 2023-02-16 16:46:52,079 >> All the weights of RobertaForSequenceClassificationCustomSimple were initialized from the model checkpoint at out/roberta.\n",
"If your task is similar to the task the model of the checkpoint was trained on, you can already use RobertaForSequenceClassificationCustomSimple for predictions without further training.\n",
"RobertaForSequenceClassificationCustomSimple(\n",
" (roberta): RobertaModel(\n",
" (embeddings): RobertaEmbeddings(\n",
" (word_embeddings): Embedding(50265, 768, padding_idx=1)\n",
" (position_embeddings): Embedding(514, 768, padding_idx=1)\n",
" (token_type_embeddings): Embedding(1, 768)\n",
" (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" (encoder): RobertaEncoder(\n",
" (layer): ModuleList(\n",
" (0): RobertaLayer(\n",
" (attention): RobertaAttention(\n",
" (self): RobertaSelfAttention(\n",
" (query): Linear(in_features=768, out_features=768, bias=True)\n",
" (key): Linear(in_features=768, out_features=768, bias=True)\n",
" (value): Linear(in_features=768, out_features=768, bias=True)\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" (output): RobertaSelfOutput(\n",
" (dense): Linear(in_features=768, out_features=768, bias=True)\n",
" (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" )\n",
" (intermediate): RobertaIntermediate(\n",
" (dense): Linear(in_features=768, out_features=3072, bias=True)\n",
" (intermediate_act_fn): GELUActivation()\n",
" )\n",
" (output): RobertaOutput(\n",
" (dense): Linear(in_features=3072, out_features=768, bias=True)\n",
" (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" )\n",
" (1): RobertaLayer(\n",
" (attention): RobertaAttention(\n",
" (self): RobertaSelfAttention(\n",
" (query): Linear(in_features=768, out_features=768, bias=True)\n",
" (key): Linear(in_features=768, out_features=768, bias=True)\n",
" (value): Linear(in_features=768, out_features=768, bias=True)\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" (output): RobertaSelfOutput(\n",
" (dense): Linear(in_features=768, out_features=768, bias=True)\n",
" (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" )\n",
" (intermediate): RobertaIntermediate(\n",
" (dense): Linear(in_features=768, out_features=3072, bias=True)\n",
" (intermediate_act_fn): GELUActivation()\n",
" )\n",
" (output): RobertaOutput(\n",
" (dense): Linear(in_features=3072, out_features=768, bias=True)\n",
" (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" )\n",
" (2): RobertaLayer(\n",
" (attention): RobertaAttention(\n",
" (self): RobertaSelfAttention(\n",
" (query): Linear(in_features=768, out_features=768, bias=True)\n",
" (key): Linear(in_features=768, out_features=768, bias=True)\n",
" (value): Linear(in_features=768, out_features=768, bias=True)\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" (output): RobertaSelfOutput(\n",
" (dense): Linear(in_features=768, out_features=768, bias=True)\n",
" (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" )\n",
" (intermediate): RobertaIntermediate(\n",
" (dense): Linear(in_features=768, out_features=3072, bias=True)\n",
" (intermediate_act_fn): GELUActivation()\n",
" )\n",
" (output): RobertaOutput(\n",
" (dense): Linear(in_features=3072, out_features=768, bias=True)\n",
" (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" )\n",
" (3): RobertaLayer(\n",
" (attention): RobertaAttention(\n",
" (self): RobertaSelfAttention(\n",
" (query): Linear(in_features=768, out_features=768, bias=True)\n",
" (key): Linear(in_features=768, out_features=768, bias=True)\n",
" (value): Linear(in_features=768, out_features=768, bias=True)\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" (output): RobertaSelfOutput(\n",
" (dense): Linear(in_features=768, out_features=768, bias=True)\n",
" (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" )\n",
" (intermediate): RobertaIntermediate(\n",
" (dense): Linear(in_features=768, out_features=3072, bias=True)\n",
" (intermediate_act_fn): GELUActivation()\n",
" )\n",
" (output): RobertaOutput(\n",
" (dense): Linear(in_features=3072, out_features=768, bias=True)\n",
" (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" )\n",
" (4): RobertaLayer(\n",
" (attention): RobertaAttention(\n",
" (self): RobertaSelfAttention(\n",
" (query): Linear(in_features=768, out_features=768, bias=True)\n",
" (key): Linear(in_features=768, out_features=768, bias=True)\n",
" (value): Linear(in_features=768, out_features=768, bias=True)\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" (output): RobertaSelfOutput(\n",
" (dense): Linear(in_features=768, out_features=768, bias=True)\n",
" (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" )\n",
" (intermediate): RobertaIntermediate(\n",
" (dense): Linear(in_features=768, out_features=3072, bias=True)\n",
" (intermediate_act_fn): GELUActivation()\n",
" )\n",
" (output): RobertaOutput(\n",
" (dense): Linear(in_features=3072, out_features=768, bias=True)\n",
" (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" )\n",
" (5): RobertaLayer(\n",
" (attention): RobertaAttention(\n",
" (self): RobertaSelfAttention(\n",
" (query): Linear(in_features=768, out_features=768, bias=True)\n",
" (key): Linear(in_features=768, out_features=768, bias=True)\n",
" (value): Linear(in_features=768, out_features=768, bias=True)\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" (output): RobertaSelfOutput(\n",
" (dense): Linear(in_features=768, out_features=768, bias=True)\n",
" (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" )\n",
" (intermediate): RobertaIntermediate(\n",
" (dense): Linear(in_features=768, out_features=3072, bias=True)\n",
" (intermediate_act_fn): GELUActivation()\n",
" )\n",
" (output): RobertaOutput(\n",
" (dense): Linear(in_features=3072, out_features=768, bias=True)\n",
" (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" )\n",
" (6): RobertaLayer(\n",
" (attention): RobertaAttention(\n",
" (self): RobertaSelfAttention(\n",
" (query): Linear(in_features=768, out_features=768, bias=True)\n",
" (key): Linear(in_features=768, out_features=768, bias=True)\n",
" (value): Linear(in_features=768, out_features=768, bias=True)\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" (output): RobertaSelfOutput(\n",
" (dense): Linear(in_features=768, out_features=768, bias=True)\n",
" (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" )\n",
" (intermediate): RobertaIntermediate(\n",
" (dense): Linear(in_features=768, out_features=3072, bias=True)\n",
" (intermediate_act_fn): GELUActivation()\n",
" )\n",
" (output): RobertaOutput(\n",
" (dense): Linear(in_features=3072, out_features=768, bias=True)\n",
" (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" )\n",
" (7): RobertaLayer(\n",
" (attention): RobertaAttention(\n",
" (self): RobertaSelfAttention(\n",
" (query): Linear(in_features=768, out_features=768, bias=True)\n",
" (key): Linear(in_features=768, out_features=768, bias=True)\n",
" (value): Linear(in_features=768, out_features=768, bias=True)\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" (output): RobertaSelfOutput(\n",
" (dense): Linear(in_features=768, out_features=768, bias=True)\n",
" (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" )\n",
" (intermediate): RobertaIntermediate(\n",
" (dense): Linear(in_features=768, out_features=3072, bias=True)\n",
" (intermediate_act_fn): GELUActivation()\n",
" )\n",
" (output): RobertaOutput(\n",
" (dense): Linear(in_features=3072, out_features=768, bias=True)\n",
" (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" )\n",
" (8): RobertaLayer(\n",
" (attention): RobertaAttention(\n",
" (self): RobertaSelfAttention(\n",
" (query): Linear(in_features=768, out_features=768, bias=True)\n",
" (key): Linear(in_features=768, out_features=768, bias=True)\n",
" (value): Linear(in_features=768, out_features=768, bias=True)\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" (output): RobertaSelfOutput(\n",
" (dense): Linear(in_features=768, out_features=768, bias=True)\n",
" (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" )\n",
" (intermediate): RobertaIntermediate(\n",
" (dense): Linear(in_features=768, out_features=3072, bias=True)\n",
" (intermediate_act_fn): GELUActivation()\n",
" )\n",
" (output): RobertaOutput(\n",
" (dense): Linear(in_features=3072, out_features=768, bias=True)\n",
" (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" )\n",
" (9): RobertaLayer(\n",
" (attention): RobertaAttention(\n",
" (self): RobertaSelfAttention(\n",
" (query): Linear(in_features=768, out_features=768, bias=True)\n",
" (key): Linear(in_features=768, out_features=768, bias=True)\n",
" (value): Linear(in_features=768, out_features=768, bias=True)\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" (output): RobertaSelfOutput(\n",
" (dense): Linear(in_features=768, out_features=768, bias=True)\n",
" (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" )\n",
" (intermediate): RobertaIntermediate(\n",
" (dense): Linear(in_features=768, out_features=3072, bias=True)\n",
" (intermediate_act_fn): GELUActivation()\n",
" )\n",
" (output): RobertaOutput(\n",
" (dense): Linear(in_features=3072, out_features=768, bias=True)\n",
" (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" )\n",
" (10): RobertaLayer(\n",
" (attention): RobertaAttention(\n",
" (self): RobertaSelfAttention(\n",
" (query): Linear(in_features=768, out_features=768, bias=True)\n",
" (key): Linear(in_features=768, out_features=768, bias=True)\n",
" (value): Linear(in_features=768, out_features=768, bias=True)\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" (output): RobertaSelfOutput(\n",
" (dense): Linear(in_features=768, out_features=768, bias=True)\n",
" (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" )\n",
" (intermediate): RobertaIntermediate(\n",
" (dense): Linear(in_features=768, out_features=3072, bias=True)\n",
" (intermediate_act_fn): GELUActivation()\n",
" )\n",
" (output): RobertaOutput(\n",
" (dense): Linear(in_features=3072, out_features=768, bias=True)\n",
" (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" )\n",
" (11): RobertaLayer(\n",
" (attention): RobertaAttention(\n",
" (self): RobertaSelfAttention(\n",
" (query): Linear(in_features=768, out_features=768, bias=True)\n",
" (key): Linear(in_features=768, out_features=768, bias=True)\n",
" (value): Linear(in_features=768, out_features=768, bias=True)\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" (output): RobertaSelfOutput(\n",
" (dense): Linear(in_features=768, out_features=768, bias=True)\n",
" (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" )\n",
" (intermediate): RobertaIntermediate(\n",
" (dense): Linear(in_features=768, out_features=3072, bias=True)\n",
" (intermediate_act_fn): GELUActivation()\n",
" )\n",
" (output): RobertaOutput(\n",
" (dense): Linear(in_features=3072, out_features=768, bias=True)\n",
" (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" )\n",
" )\n",
" )\n",
" )\n",
" (classifier): RobertaClassificationHeadCustomSimple(\n",
" (dense_1): Linear(in_features=768, out_features=3072, bias=True)\n",
" (dense_2): Linear(in_features=3072, out_features=768, bias=True)\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" (out_proj): Linear(in_features=768, out_features=4, bias=True)\n",
" (activation): GELU(approximate='none')\n",
" )\n",
")\n",
"02/16/2023 16:46:52 - WARNING - datasets.arrow_dataset - Loading cached processed dataset at /home/jacob/code/university/uczenie_glebokie_w_przetwarzaniu_tekstu/.cache_training/json/default-f6e8039906850c57/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51/cache-df96547ec55a44ce.arrow\n",
"02/16/2023 16:46:52 - WARNING - datasets.arrow_dataset - Loading cached processed dataset at /home/jacob/code/university/uczenie_glebokie_w_przetwarzaniu_tekstu/.cache_training/json/default-f6e8039906850c57/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51/cache-67b1030adaffbb4a.arrow\n",
"02/16/2023 16:46:52 - WARNING - datasets.arrow_dataset - Loading cached processed dataset at /home/jacob/code/university/uczenie_glebokie_w_przetwarzaniu_tekstu/.cache_training/json/default-f6e8039906850c57/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51/cache-ae09252df5e9bac1.arrow\n",
"02/16/2023 16:46:52 - INFO - __main__ - Set 500 samples for 0-class\n",
"02/16/2023 16:46:52 - INFO - __main__ - Set 500 samples for 1-class\n",
"02/16/2023 16:46:52 - INFO - __main__ - Set 500 samples for 2-class\n",
"02/16/2023 16:46:52 - INFO - __main__ - Set 500 samples for 3-class\n",
"[INFO|trainer.py:511] 2023-02-16 16:46:55,346 >> max_steps is given, it will override any value given in num_train_epochs\n",
"02/16/2023 16:46:55 - INFO - __main__ - *** Evaluate ***\n",
"[INFO|trainer.py:710] 2023-02-16 16:46:55,346 >> The following columns in the evaluation set don't have a corresponding argument in `RobertaForSequenceClassificationCustomSimple.forward` and have been ignored: text. If text are not expected by `RobertaForSequenceClassificationCustomSimple.forward`, you can safely ignore this message.\n",
"[INFO|trainer.py:2964] 2023-02-16 16:46:55,348 >> ***** Running Evaluation *****\n",
"[INFO|trainer.py:2966] 2023-02-16 16:46:55,348 >> Num examples = 2000\n",
"[INFO|trainer.py:2969] 2023-02-16 16:46:55,348 >> Batch size = 8\n",
"100%|█████████████████████████████████████████| 250/250 [00:17<00:00, 14.53it/s]\n",
"***** eval metrics *****\n",
" eval_accuracy = 0.923\n",
" eval_loss = 0.296\n",
" eval_runtime = 0:00:17.81\n",
" eval_samples = 2000\n",
" eval_samples_per_second = 112.255\n",
" eval_steps_per_second = 14.032\n",
"02/16/2023 16:47:13 - INFO - __main__ - *** Predict ***\n",
"[INFO|trainer.py:710] 2023-02-16 16:47:13,166 >> The following columns in the test set don't have a corresponding argument in `RobertaForSequenceClassificationCustomSimple.forward` and have been ignored: text. If text are not expected by `RobertaForSequenceClassificationCustomSimple.forward`, you can safely ignore this message.\n",
"[INFO|trainer.py:2964] 2023-02-16 16:47:13,167 >> ***** Running Prediction *****\n",
"[INFO|trainer.py:2966] 2023-02-16 16:47:13,167 >> Num examples = 3800\n",
"[INFO|trainer.py:2969] 2023-02-16 16:47:13,167 >> Batch size = 8\n",
"100%|█████████████████████████████████████████| 475/475 [00:32<00:00, 14.53it/s]\n",
"02/16/2023 16:47:45 - INFO - __main__ - ***** Predict results None *****\n",
"[INFO|modelcard.py:449] 2023-02-16 16:47:46,438 >> Dropping the following result as it does not have all the necessary fields:\n",
"{'task': {'name': 'Text Classification', 'type': 'text-classification'}}\n"
]
}
],
"source": [
"!python run_glue.py \\\n",
" --cache_dir .cache_training \\\n",
" --model_name_or_path out/roberta \\\n",
" --custom_model roberta_simple \\\n",
" --train_file data/train.json \\\n",
" --validation_file data/valid.json \\\n",
" --test_file data/test.json \\\n",
" --per_device_train_batch_size 8 \\\n",
" --per_device_eval_batch_size 8 \\\n",
" --do_eval \\\n",
" --do_predict \\\n",
" --max_seq_length 128 \\\n",
" --learning_rate 2e-5 \\\n",
" --max_eval_samples 2000 \\\n",
" --max_steps 2500 \\\n",
" --num_train_epochs 1 \\\n",
" --save_strategy steps \\\n",
" --save_steps 250 \\\n",
" --save_total_limit 5 \\\n",
" --logging_strategy steps \\\n",
" --logging_steps 100 \\\n",
" --eval_steps 250 \\\n",
" --evaluation_strategy steps \\\n",
" --metric_for_best_model accuracy \\\n",
" --greater_is_better True \\\n",
" --load_best_model_at_end True \\\n",
" --output_dir out/roberta_results"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"## Results"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\u001b[0;39m0.9229999780654907\u001b[0m\n"
]
}
],
"source": [
"!cat out/roberta_results/eval_results.json | jq .eval_accuracy"
]
},
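{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"The same metrics file can also be read from Python instead of shelling out to `jq`. A minimal sketch, assuming the evaluation run above has written `out/roberta_results/eval_results.json`:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import json\n",
"\n",
"# Read back every metric the Trainer wrote, not only eval_accuracy.\n",
"with open('out/roberta_results/eval_results.json') as f:\n",
"    metrics = json.load(f)\n",
"\n",
"for name, value in sorted(metrics.items()):\n",
"    print(f'{name}: {value}')"
]
},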
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"# GPT2"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"## Modifications"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"- Custom classification head with 3 dense layers\n",
"- Using hidden states from last layer"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"## Code"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"import torch\n",
"from torch import nn\n",
"from transformers import GPT2PreTrainedModel, GPT2Model\n",
"from transformers.modeling_outputs import SequenceClassifierOutputWithPast\n",
"\n",
"class GPT2ForSequenceClassification(GPT2PreTrainedModel):\n",
" def __init__(self, config):\n",
" super().__init__(config)\n",
" self.num_labels = config.num_labels\n",
" self.transformer = GPT2Model(config)\n",
" self.score = nn.Linear(config.n_embd, self.num_labels, bias=False)\n",
"\n",
" # Model parallel\n",
" self.model_parallel = False\n",
" self.device_map = None\n",
"\n",
" # Initialize weights and apply final processing\n",
" self.post_init()\n",
"\n",
"\n",
"class GPT2ClassificationHeadCustom(nn.Module):\n",
" def __init__(self, config):\n",
" super().__init__()\n",
" hidden_size = config.n_embd\n",
" self.dense_1_input = nn.Linear(hidden_size, 2 * hidden_size)\n",
" self.dense_1_hidden = nn.Linear(hidden_size, 2 * hidden_size)\n",
" self.dense_2 = nn.Linear(4 * hidden_size, hidden_size)\n",
" self.dropout = nn.Dropout(config.resid_pdrop)\n",
" self.out_proj = nn.Linear(hidden_size, config.num_labels, bias=False)\n",
"\n",
" def forward(self, x, **kwargs):\n",
" if 'hidden_states' in kwargs and kwargs['hidden_states'] is not None:\n",
" hidden = kwargs['hidden_states'][-1]\n",
" else:\n",
" hidden = torch.zeros(x.size(), dtype=x.dtype, device=x.device)\n",
"\n",
" x = self.dense_1_input(x)\n",
" x = torch.relu(x)\n",
" x = self.dropout(x)\n",
"\n",
" hidden = self.dense_1_hidden(hidden)\n",
" hidden = torch.relu(hidden)\n",
" hidden = self.dropout(hidden)\n",
"\n",
" x = torch.cat((x, hidden), dim=2)\n",
" x = self.dense_2(x)\n",
" x = torch.relu(x)\n",
" x = self.dropout(x)\n",
"\n",
" x = self.out_proj(x)\n",
" return x\n",
"\n",
"class GPT2ForSequenceClassificationCustom(GPT2ForSequenceClassification):\n",
" def __init__(self, config):\n",
" super().__init__(config)\n",
" self.num_labels = config.num_labels\n",
" self.transformer = GPT2Model(config)\n",
" self.score = GPT2ClassificationHeadCustom(config)\n",
"\n",
" self.init_weights()\n",
"\n",
" # Model parallel\n",
" self.model_parallel = False\n",
" self.device_map = None\n",
"\n",
" def forward(\n",
" self,\n",
" input_ids=None,\n",
" past_key_values=None,\n",
" attention_mask=None,\n",
" token_type_ids=None,\n",
" position_ids=None,\n",
" head_mask=None,\n",
" inputs_embeds=None,\n",
" labels=None,\n",
" use_cache=None,\n",
" output_attentions=None,\n",
" output_hidden_states=None,\n",
" return_dict=None,\n",
" ):\n",
" r\"\"\"\n",
" labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`):\n",
" Labels for computing the sequence classification/regression loss. Indices should be in :obj:`[0, ...,\n",
" config.num_labels - 1]`. If :obj:`config.num_labels == 1` a regression loss is computed (Mean-Square loss),\n",
" If :obj:`config.num_labels > 1` a classification loss is computed (Cross-Entropy).\n",
" \"\"\"\n",
" return_dict = return_dict if return_dict is not None else self.config.use_return_dict\n",
"\n",
" transformer_outputs = self.transformer(\n",
" input_ids,\n",
" past_key_values=past_key_values,\n",
" attention_mask=attention_mask,\n",
" token_type_ids=token_type_ids,\n",
" position_ids=position_ids,\n",
" head_mask=head_mask,\n",
" inputs_embeds=inputs_embeds,\n",
" use_cache=use_cache,\n",
" output_attentions=output_attentions,\n",
" output_hidden_states=output_hidden_states,\n",
" return_dict=return_dict,\n",
" )\n",
" hidden_states = transformer_outputs[0]\n",
" if return_dict:\n",
" logits = self.score(hidden_states, hidden_states=transformer_outputs.hidden_states)\n",
" else:\n",
" raise NotImplemented('Not implemented for using non-dictionary object')\n",
"\n",
" if input_ids is not None:\n",
" batch_size, sequence_length = input_ids.shape[:2]\n",
" else:\n",
" batch_size, sequence_length = inputs_embeds.shape[:2]\n",
"\n",
" assert (\n",
" self.config.pad_token_id is not None or batch_size == 1\n",
" ), \"Cannot handle batch sizes > 1 if no padding token is defined.\"\n",
" if self.config.pad_token_id is None:\n",
" sequence_lengths = -1\n",
" else:\n",
" if input_ids is not None:\n",
" sequence_lengths = torch.ne(input_ids, self.config.pad_token_id).sum(-1) - 1\n",
" else:\n",
" sequence_lengths = -1\n",
"\n",
" pooled_logits = logits[range(batch_size), sequence_lengths]\n",
"\n",
" loss = None\n",
" if labels is not None:\n",
" if self.num_labels == 1:\n",
" # We are doing regression\n",
" loss_fct = nn.MSELoss()\n",
" loss = loss_fct(pooled_logits.view(-1), labels.to(self.dtype).view(-1))\n",
" else:\n",
" loss_fct = nn.CrossEntropyLoss()\n",
" loss = loss_fct(pooled_logits.view(-1, self.num_labels), labels.view(-1))\n",
"\n",
" if not return_dict:\n",
" output = (pooled_logits,) + transformer_outputs[1:]\n",
" return ((loss,) + output) if loss is not None else output\n",
"\n",
" return SequenceClassifierOutputWithPast(\n",
" loss=loss,\n",
" logits=pooled_logits,\n",
" past_key_values=transformer_outputs.past_key_values,\n",
" hidden_states=transformer_outputs.hidden_states,\n",
" attentions=transformer_outputs.attentions,\n",
" )"
]
},
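{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"A quick sanity check of the custom head on random tensors (a minimal sketch: untrained weights, arbitrary `batch_size` and `seq_len`, shapes only)."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import torch\n",
"from transformers import GPT2Config\n",
"\n",
"# Smoke-test GPT2ClassificationHeadCustom on random activations (no pretrained weights).\n",
"config = GPT2Config(num_labels=4)\n",
"head = GPT2ClassificationHeadCustom(config)\n",
"\n",
"batch_size, seq_len = 2, 16\n",
"last_hidden = torch.randn(batch_size, seq_len, config.n_embd)\n",
"# Mimic transformer_outputs.hidden_states: a tuple whose last element is the final layer.\n",
"all_hidden_states = (torch.randn(batch_size, seq_len, config.n_embd), last_hidden)\n",
"\n",
"logits = head(last_hidden, hidden_states=all_hidden_states)\n",
"print(logits.shape)  # expected: torch.Size([2, 16, 4]), i.e. per-token logits pooled later in forward()"
]
},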
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"## Model"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "4f980b257c2b453797f63ddc89c98923",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"Downloading (…)lve/main/config.json: 0%| | 0.00/665 [00:00<?, ?B/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"Some weights of GPT2ForSequenceClassificationCustom were not initialized from the model checkpoint at gpt2 and are newly initialized: ['h.10.attn.masked_bias', 'h.2.attn.masked_bias', 'h.5.attn.masked_bias', 'score.dense_2.weight', 'h.9.attn.masked_bias', 'score.dense_1_input.bias', 'score.out_proj.weight', 'h.7.attn.masked_bias', 'h.4.attn.masked_bias', 'h.3.attn.masked_bias', 'h.11.attn.masked_bias', 'h.6.attn.masked_bias', 'h.8.attn.masked_bias', 'score.dense_1_hidden.weight', 'h.1.attn.masked_bias', 'h.0.attn.masked_bias', 'score.dense_1_input.weight', 'score.dense_1_hidden.bias', 'score.dense_2.bias']\n",
"You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n"
]
},
{
"data": {
"text/plain": [
"GPT2ForSequenceClassificationCustom(\n",
" (transformer): GPT2Model(\n",
" (wte): Embedding(50257, 768)\n",
" (wpe): Embedding(1024, 768)\n",
" (drop): Dropout(p=0.1, inplace=False)\n",
" (h): ModuleList(\n",
" (0): GPT2Block(\n",
" (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
" (attn): GPT2Attention(\n",
" (c_attn): Conv1D()\n",
" (c_proj): Conv1D()\n",
" (attn_dropout): Dropout(p=0.1, inplace=False)\n",
" (resid_dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
" (mlp): GPT2MLP(\n",
" (c_fc): Conv1D()\n",
" (c_proj): Conv1D()\n",
" (act): NewGELUActivation()\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" )\n",
" (1): GPT2Block(\n",
" (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
" (attn): GPT2Attention(\n",
" (c_attn): Conv1D()\n",
" (c_proj): Conv1D()\n",
" (attn_dropout): Dropout(p=0.1, inplace=False)\n",
" (resid_dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
" (mlp): GPT2MLP(\n",
" (c_fc): Conv1D()\n",
" (c_proj): Conv1D()\n",
" (act): NewGELUActivation()\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" )\n",
" (2): GPT2Block(\n",
" (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
" (attn): GPT2Attention(\n",
" (c_attn): Conv1D()\n",
" (c_proj): Conv1D()\n",
" (attn_dropout): Dropout(p=0.1, inplace=False)\n",
" (resid_dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
" (mlp): GPT2MLP(\n",
" (c_fc): Conv1D()\n",
" (c_proj): Conv1D()\n",
" (act): NewGELUActivation()\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" )\n",
" (3): GPT2Block(\n",
" (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
" (attn): GPT2Attention(\n",
" (c_attn): Conv1D()\n",
" (c_proj): Conv1D()\n",
" (attn_dropout): Dropout(p=0.1, inplace=False)\n",
" (resid_dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
" (mlp): GPT2MLP(\n",
" (c_fc): Conv1D()\n",
" (c_proj): Conv1D()\n",
" (act): NewGELUActivation()\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" )\n",
" (4): GPT2Block(\n",
" (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
" (attn): GPT2Attention(\n",
" (c_attn): Conv1D()\n",
" (c_proj): Conv1D()\n",
" (attn_dropout): Dropout(p=0.1, inplace=False)\n",
" (resid_dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
" (mlp): GPT2MLP(\n",
" (c_fc): Conv1D()\n",
" (c_proj): Conv1D()\n",
" (act): NewGELUActivation()\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" )\n",
" (5): GPT2Block(\n",
" (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
" (attn): GPT2Attention(\n",
" (c_attn): Conv1D()\n",
" (c_proj): Conv1D()\n",
" (attn_dropout): Dropout(p=0.1, inplace=False)\n",
" (resid_dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
" (mlp): GPT2MLP(\n",
" (c_fc): Conv1D()\n",
" (c_proj): Conv1D()\n",
" (act): NewGELUActivation()\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" )\n",
" (6): GPT2Block(\n",
" (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
" (attn): GPT2Attention(\n",
" (c_attn): Conv1D()\n",
" (c_proj): Conv1D()\n",
" (attn_dropout): Dropout(p=0.1, inplace=False)\n",
" (resid_dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
" (mlp): GPT2MLP(\n",
" (c_fc): Conv1D()\n",
" (c_proj): Conv1D()\n",
" (act): NewGELUActivation()\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" )\n",
" (7): GPT2Block(\n",
" (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
" (attn): GPT2Attention(\n",
" (c_attn): Conv1D()\n",
" (c_proj): Conv1D()\n",
" (attn_dropout): Dropout(p=0.1, inplace=False)\n",
" (resid_dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
" (mlp): GPT2MLP(\n",
" (c_fc): Conv1D()\n",
" (c_proj): Conv1D()\n",
" (act): NewGELUActivation()\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" )\n",
" (8): GPT2Block(\n",
" (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
" (attn): GPT2Attention(\n",
" (c_attn): Conv1D()\n",
" (c_proj): Conv1D()\n",
" (attn_dropout): Dropout(p=0.1, inplace=False)\n",
" (resid_dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
" (mlp): GPT2MLP(\n",
" (c_fc): Conv1D()\n",
" (c_proj): Conv1D()\n",
" (act): NewGELUActivation()\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" )\n",
" (9): GPT2Block(\n",
" (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
" (attn): GPT2Attention(\n",
" (c_attn): Conv1D()\n",
" (c_proj): Conv1D()\n",
" (attn_dropout): Dropout(p=0.1, inplace=False)\n",
" (resid_dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
" (mlp): GPT2MLP(\n",
" (c_fc): Conv1D()\n",
" (c_proj): Conv1D()\n",
" (act): NewGELUActivation()\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" )\n",
" (10): GPT2Block(\n",
" (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
" (attn): GPT2Attention(\n",
" (c_attn): Conv1D()\n",
" (c_proj): Conv1D()\n",
" (attn_dropout): Dropout(p=0.1, inplace=False)\n",
" (resid_dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
" (mlp): GPT2MLP(\n",
" (c_fc): Conv1D()\n",
" (c_proj): Conv1D()\n",
" (act): NewGELUActivation()\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" )\n",
" (11): GPT2Block(\n",
" (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
" (attn): GPT2Attention(\n",
" (c_attn): Conv1D()\n",
" (c_proj): Conv1D()\n",
" (attn_dropout): Dropout(p=0.1, inplace=False)\n",
" (resid_dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
" (mlp): GPT2MLP(\n",
" (c_fc): Conv1D()\n",
" (c_proj): Conv1D()\n",
" (act): NewGELUActivation()\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" )\n",
" )\n",
" (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
" )\n",
" (score): GPT2ClassificationHeadCustom(\n",
" (dense_1_input): Linear(in_features=768, out_features=1536, bias=True)\n",
" (dense_1_hidden): Linear(in_features=768, out_features=1536, bias=True)\n",
" (dense_2): Linear(in_features=3072, out_features=768, bias=True)\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" (out_proj): Linear(in_features=768, out_features=2, bias=False)\n",
" )\n",
")"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"GPT2ForSequenceClassificationCustom.from_pretrained('gpt2')"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"## Training"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"02/16/2023 15:22:37 - WARNING - __main__ - Process rank: -1, device: cuda:0, n_gpu: 1distributed training: False, 16-bits training: False\n",
"02/16/2023 15:22:37 - INFO - __main__ - Training/evaluation parameters TrainingArguments(\n",
"_n_gpu=1,\n",
"adafactor=False,\n",
"adam_beta1=0.9,\n",
"adam_beta2=0.999,\n",
"adam_epsilon=1e-08,\n",
"auto_find_batch_size=False,\n",
"bf16=False,\n",
"bf16_full_eval=False,\n",
"data_seed=None,\n",
"dataloader_drop_last=False,\n",
"dataloader_num_workers=0,\n",
"dataloader_pin_memory=True,\n",
"ddp_bucket_cap_mb=None,\n",
"ddp_find_unused_parameters=None,\n",
"ddp_timeout=1800,\n",
"debug=[],\n",
"deepspeed=None,\n",
"disable_tqdm=False,\n",
"do_eval=True,\n",
"do_predict=False,\n",
"do_train=True,\n",
"eval_accumulation_steps=None,\n",
"eval_delay=0,\n",
"eval_steps=250,\n",
"evaluation_strategy=steps,\n",
"fp16=False,\n",
"fp16_backend=auto,\n",
"fp16_full_eval=False,\n",
"fp16_opt_level=O1,\n",
"fsdp=[],\n",
"fsdp_min_num_params=0,\n",
"fsdp_transformer_layer_cls_to_wrap=None,\n",
"full_determinism=False,\n",
"gradient_accumulation_steps=1,\n",
"gradient_checkpointing=False,\n",
"greater_is_better=True,\n",
"group_by_length=False,\n",
"half_precision_backend=auto,\n",
"hub_model_id=None,\n",
"hub_private_repo=False,\n",
"hub_strategy=every_save,\n",
"hub_token=<HUB_TOKEN>,\n",
"ignore_data_skip=False,\n",
"include_inputs_for_metrics=False,\n",
"jit_mode_eval=False,\n",
"label_names=None,\n",
"label_smoothing_factor=0.0,\n",
"learning_rate=2e-05,\n",
"length_column_name=length,\n",
"load_best_model_at_end=True,\n",
"local_rank=-1,\n",
"log_level=passive,\n",
"log_level_replica=passive,\n",
"log_on_each_node=True,\n",
"logging_dir=out/gpt2/runs/Feb16_15-22-36_DESKTOP-R7JO8BQ,\n",
"logging_first_step=False,\n",
"logging_nan_inf_filter=True,\n",
"logging_steps=100,\n",
"logging_strategy=steps,\n",
"lr_scheduler_type=linear,\n",
"max_grad_norm=1.0,\n",
"max_steps=2500,\n",
"metric_for_best_model=accuracy,\n",
"mp_parameters=,\n",
"no_cuda=False,\n",
"num_train_epochs=1.0,\n",
"optim=adamw_hf,\n",
"optim_args=None,\n",
"output_dir=out/gpt2,\n",
"overwrite_output_dir=False,\n",
"past_index=-1,\n",
"per_device_eval_batch_size=8,\n",
"per_device_train_batch_size=8,\n",
"prediction_loss_only=False,\n",
"push_to_hub=False,\n",
"push_to_hub_model_id=None,\n",
"push_to_hub_organization=None,\n",
"push_to_hub_token=<PUSH_TO_HUB_TOKEN>,\n",
"ray_scope=last,\n",
"remove_unused_columns=True,\n",
"report_to=[],\n",
"resume_from_checkpoint=None,\n",
"run_name=out/gpt2,\n",
"save_on_each_node=False,\n",
"save_steps=250,\n",
"save_strategy=steps,\n",
"save_total_limit=5,\n",
"seed=42,\n",
"sharded_ddp=[],\n",
"skip_memory_metrics=True,\n",
"tf32=None,\n",
"torch_compile=False,\n",
"torch_compile_backend=None,\n",
"torch_compile_mode=None,\n",
"torchdynamo=None,\n",
"tpu_metrics_debug=False,\n",
"tpu_num_cores=None,\n",
"use_ipex=False,\n",
"use_legacy_prediction_loop=False,\n",
"use_mps_device=False,\n",
"warmup_ratio=0.0,\n",
"warmup_steps=0,\n",
"weight_decay=0.0,\n",
"xpu_backend=None,\n",
")\n",
"02/16/2023 15:22:37 - INFO - __main__ - Checkpoint detected, resuming training at out/gpt2/checkpoint-2500. To avoid this behavior, change the `--output_dir` or add `--overwrite_output_dir` to train from scratch.\n",
"02/16/2023 15:22:37 - INFO - __main__ - load a local file for train: data/train.json\n",
"02/16/2023 15:22:37 - INFO - __main__ - load a local file for validation: data/valid.json\n",
"02/16/2023 15:22:37 - WARNING - datasets.builder - Using custom data configuration default-e10a382a423bbb9a\n",
"02/16/2023 15:22:37 - INFO - datasets.info - Loading Dataset Infos from /home/jacob/anaconda3/envs/ugp/lib/python3.10/site-packages/datasets/packaged_modules/json\n",
"02/16/2023 15:22:37 - INFO - datasets.builder - Generating dataset json (/home/jacob/code/university/uczenie_glebokie_w_przetwarzaniu_tekstu/.cache_training/json/default-e10a382a423bbb9a/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51)\n",
"Downloading and preparing dataset json/default to /home/jacob/code/university/uczenie_glebokie_w_przetwarzaniu_tekstu/.cache_training/json/default-e10a382a423bbb9a/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51...\n",
"Downloading data files: 100%|██████████████████| 2/2 [00:00<00:00, 14820.86it/s]\n",
"02/16/2023 15:22:37 - INFO - datasets.download.download_manager - Downloading took 0.0 min\n",
"02/16/2023 15:22:37 - INFO - datasets.download.download_manager - Checksum Computation took 0.0 min\n",
"Extracting data files: 100%|████████████████████| 2/2 [00:00<00:00, 2476.71it/s]\n",
"02/16/2023 15:22:37 - INFO - datasets.utils.info_utils - Unable to verify checksums.\n",
"02/16/2023 15:22:37 - INFO - datasets.builder - Generating train split\n",
"02/16/2023 15:22:37 - INFO - datasets.builder - Generating validation split\n",
"02/16/2023 15:22:37 - INFO - datasets.utils.info_utils - Unable to verify splits sizes.\n",
"Dataset json downloaded and prepared to /home/jacob/code/university/uczenie_glebokie_w_przetwarzaniu_tekstu/.cache_training/json/default-e10a382a423bbb9a/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51. Subsequent calls will reuse this data.\n",
"100%|████████████████████████████████████████████| 2/2 [00:00<00:00, 642.61it/s]\n",
"[INFO|configuration_utils.py:660] 2023-02-16 15:22:38,465 >> loading configuration file config.json from cache at .cache_training/models--gpt2/snapshots/e7da7f221d5bf496a48136c0cd264e630fe9fcc8/config.json\n",
"[INFO|configuration_utils.py:712] 2023-02-16 15:22:38,465 >> Model config GPT2Config {\n",
" \"_name_or_path\": \"gpt2\",\n",
" \"activation_function\": \"gelu_new\",\n",
" \"architectures\": [\n",
" \"GPT2LMHeadModel\"\n",
" ],\n",
" \"attn_pdrop\": 0.1,\n",
" \"bos_token_id\": 50256,\n",
" \"embd_pdrop\": 0.1,\n",
" \"eos_token_id\": 50256,\n",
" \"id2label\": {\n",
" \"0\": \"LABEL_0\",\n",
" \"1\": \"LABEL_1\",\n",
" \"2\": \"LABEL_2\",\n",
" \"3\": \"LABEL_3\"\n",
" },\n",
" \"initializer_range\": 0.02,\n",
" \"label2id\": {\n",
" \"LABEL_0\": 0,\n",
" \"LABEL_1\": 1,\n",
" \"LABEL_2\": 2,\n",
" \"LABEL_3\": 3\n",
" },\n",
" \"layer_norm_epsilon\": 1e-05,\n",
" \"model_type\": \"gpt2\",\n",
" \"n_ctx\": 1024,\n",
" \"n_embd\": 768,\n",
" \"n_head\": 12,\n",
" \"n_inner\": null,\n",
" \"n_layer\": 12,\n",
" \"n_positions\": 1024,\n",
" \"reorder_and_upcast_attn\": false,\n",
" \"resid_pdrop\": 0.1,\n",
" \"scale_attn_by_inverse_layer_idx\": false,\n",
" \"scale_attn_weights\": true,\n",
" \"summary_activation\": null,\n",
" \"summary_first_dropout\": 0.1,\n",
" \"summary_proj_to_labels\": true,\n",
" \"summary_type\": \"cls_index\",\n",
" \"summary_use_proj\": true,\n",
" \"task_specific_params\": {\n",
" \"text-generation\": {\n",
" \"do_sample\": true,\n",
" \"max_length\": 50\n",
" }\n",
" },\n",
" \"transformers_version\": \"4.26.1\",\n",
" \"use_cache\": true,\n",
" \"vocab_size\": 50257\n",
"}\n",
"\n",
"[INFO|tokenization_auto.py:458] 2023-02-16 15:22:38,945 >> Could not locate the tokenizer configuration file, will try to use the model config instead.\n",
"[INFO|configuration_utils.py:660] 2023-02-16 15:22:39,423 >> loading configuration file config.json from cache at .cache_training/models--gpt2/snapshots/e7da7f221d5bf496a48136c0cd264e630fe9fcc8/config.json\n",
"[INFO|configuration_utils.py:712] 2023-02-16 15:22:39,424 >> Model config GPT2Config {\n",
" \"_name_or_path\": \"gpt2\",\n",
" \"activation_function\": \"gelu_new\",\n",
" \"architectures\": [\n",
" \"GPT2LMHeadModel\"\n",
" ],\n",
" \"attn_pdrop\": 0.1,\n",
" \"bos_token_id\": 50256,\n",
" \"embd_pdrop\": 0.1,\n",
" \"eos_token_id\": 50256,\n",
" \"initializer_range\": 0.02,\n",
" \"layer_norm_epsilon\": 1e-05,\n",
" \"model_type\": \"gpt2\",\n",
" \"n_ctx\": 1024,\n",
" \"n_embd\": 768,\n",
" \"n_head\": 12,\n",
" \"n_inner\": null,\n",
" \"n_layer\": 12,\n",
" \"n_positions\": 1024,\n",
" \"reorder_and_upcast_attn\": false,\n",
" \"resid_pdrop\": 0.1,\n",
" \"scale_attn_by_inverse_layer_idx\": false,\n",
" \"scale_attn_weights\": true,\n",
" \"summary_activation\": null,\n",
" \"summary_first_dropout\": 0.1,\n",
" \"summary_proj_to_labels\": true,\n",
" \"summary_type\": \"cls_index\",\n",
" \"summary_use_proj\": true,\n",
" \"task_specific_params\": {\n",
" \"text-generation\": {\n",
" \"do_sample\": true,\n",
" \"max_length\": 50\n",
" }\n",
" },\n",
" \"transformers_version\": \"4.26.1\",\n",
" \"use_cache\": true,\n",
" \"vocab_size\": 50257\n",
"}\n",
"\n",
"[INFO|tokenization_utils_base.py:1802] 2023-02-16 15:22:40,400 >> loading file vocab.json from cache at .cache_training/models--gpt2/snapshots/e7da7f221d5bf496a48136c0cd264e630fe9fcc8/vocab.json\n",
"[INFO|tokenization_utils_base.py:1802] 2023-02-16 15:22:40,400 >> loading file merges.txt from cache at .cache_training/models--gpt2/snapshots/e7da7f221d5bf496a48136c0cd264e630fe9fcc8/merges.txt\n",
"[INFO|tokenization_utils_base.py:1802] 2023-02-16 15:22:40,400 >> loading file tokenizer.json from cache at .cache_training/models--gpt2/snapshots/e7da7f221d5bf496a48136c0cd264e630fe9fcc8/tokenizer.json\n",
"[INFO|tokenization_utils_base.py:1802] 2023-02-16 15:22:40,400 >> loading file added_tokens.json from cache at None\n",
"[INFO|tokenization_utils_base.py:1802] 2023-02-16 15:22:40,400 >> loading file special_tokens_map.json from cache at None\n",
"[INFO|tokenization_utils_base.py:1802] 2023-02-16 15:22:40,400 >> loading file tokenizer_config.json from cache at None\n",
"[INFO|configuration_utils.py:660] 2023-02-16 15:22:40,400 >> loading configuration file config.json from cache at .cache_training/models--gpt2/snapshots/e7da7f221d5bf496a48136c0cd264e630fe9fcc8/config.json\n",
"[INFO|configuration_utils.py:712] 2023-02-16 15:22:40,400 >> Model config GPT2Config {\n",
" \"_name_or_path\": \"gpt2\",\n",
" \"activation_function\": \"gelu_new\",\n",
" \"architectures\": [\n",
" \"GPT2LMHeadModel\"\n",
" ],\n",
" \"attn_pdrop\": 0.1,\n",
" \"bos_token_id\": 50256,\n",
" \"embd_pdrop\": 0.1,\n",
" \"eos_token_id\": 50256,\n",
" \"initializer_range\": 0.02,\n",
" \"layer_norm_epsilon\": 1e-05,\n",
" \"model_type\": \"gpt2\",\n",
" \"n_ctx\": 1024,\n",
" \"n_embd\": 768,\n",
" \"n_head\": 12,\n",
" \"n_inner\": null,\n",
" \"n_layer\": 12,\n",
" \"n_positions\": 1024,\n",
" \"reorder_and_upcast_attn\": false,\n",
" \"resid_pdrop\": 0.1,\n",
" \"scale_attn_by_inverse_layer_idx\": false,\n",
" \"scale_attn_weights\": true,\n",
" \"summary_activation\": null,\n",
" \"summary_first_dropout\": 0.1,\n",
" \"summary_proj_to_labels\": true,\n",
" \"summary_type\": \"cls_index\",\n",
" \"summary_use_proj\": true,\n",
" \"task_specific_params\": {\n",
" \"text-generation\": {\n",
" \"do_sample\": true,\n",
" \"max_length\": 50\n",
" }\n",
" },\n",
" \"transformers_version\": \"4.26.1\",\n",
" \"use_cache\": true,\n",
" \"vocab_size\": 50257\n",
"}\n",
"\n",
"02/16/2023 15:22:40 - INFO - __main__ - Using hidden states in model: True\n",
"-------------------------------------------------------- Using hidden: True\n",
"02/16/2023 15:22:40 - INFO - __main__ - Using implementation from class: GPT2ForSequenceClassificationCustom\n",
"[INFO|modeling_utils.py:2275] 2023-02-16 15:22:40,458 >> loading weights file pytorch_model.bin from cache at .cache_training/models--gpt2/snapshots/e7da7f221d5bf496a48136c0cd264e630fe9fcc8/pytorch_model.bin\n",
"[INFO|modeling_utils.py:2857] 2023-02-16 15:22:42,848 >> All model checkpoint weights were used when initializing GPT2ForSequenceClassificationCustom.\n",
"\n",
"[WARNING|modeling_utils.py:2859] 2023-02-16 15:22:42,849 >> Some weights of GPT2ForSequenceClassificationCustom were not initialized from the model checkpoint at gpt2 and are newly initialized: ['h.11.attn.masked_bias', 'score.out_proj.weight', 'h.7.attn.masked_bias', 'h.6.attn.masked_bias', 'h.8.attn.masked_bias', 'h.5.attn.masked_bias', 'score.dense_2.weight', 'h.9.attn.masked_bias', 'score.dense_4.bias', 'score.dense_1_input.bias', 'score.dense_3.weight', 'score.dense_1_hidden.bias', 'score.dense_1_input.weight', 'h.1.attn.masked_bias', 'score.dense_3.bias', 'h.10.attn.masked_bias', 'h.2.attn.masked_bias', 'h.4.attn.masked_bias', 'score.dense_1_hidden.weight', 'score.dense_2.bias', 'score.dense_4.weight', 'h.0.attn.masked_bias', 'h.3.attn.masked_bias']\n",
"You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n",
"GPT2ForSequenceClassificationCustom(\n",
" (transformer): GPT2Model(\n",
" (wte): Embedding(50257, 768)\n",
" (wpe): Embedding(1024, 768)\n",
" (drop): Dropout(p=0.1, inplace=False)\n",
" (h): ModuleList(\n",
" (0): GPT2Block(\n",
" (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
" (attn): GPT2Attention(\n",
" (c_attn): Conv1D()\n",
" (c_proj): Conv1D()\n",
" (attn_dropout): Dropout(p=0.1, inplace=False)\n",
" (resid_dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
" (mlp): GPT2MLP(\n",
" (c_fc): Conv1D()\n",
" (c_proj): Conv1D()\n",
" (act): NewGELUActivation()\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" )\n",
" (1): GPT2Block(\n",
" (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
" (attn): GPT2Attention(\n",
" (c_attn): Conv1D()\n",
" (c_proj): Conv1D()\n",
" (attn_dropout): Dropout(p=0.1, inplace=False)\n",
" (resid_dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
" (mlp): GPT2MLP(\n",
" (c_fc): Conv1D()\n",
" (c_proj): Conv1D()\n",
" (act): NewGELUActivation()\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" )\n",
" (2): GPT2Block(\n",
" (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
" (attn): GPT2Attention(\n",
" (c_attn): Conv1D()\n",
" (c_proj): Conv1D()\n",
" (attn_dropout): Dropout(p=0.1, inplace=False)\n",
" (resid_dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
" (mlp): GPT2MLP(\n",
" (c_fc): Conv1D()\n",
" (c_proj): Conv1D()\n",
" (act): NewGELUActivation()\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" )\n",
" (3): GPT2Block(\n",
" (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
" (attn): GPT2Attention(\n",
" (c_attn): Conv1D()\n",
" (c_proj): Conv1D()\n",
" (attn_dropout): Dropout(p=0.1, inplace=False)\n",
" (resid_dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
" (mlp): GPT2MLP(\n",
" (c_fc): Conv1D()\n",
" (c_proj): Conv1D()\n",
" (act): NewGELUActivation()\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" )\n",
" (4): GPT2Block(\n",
" (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
" (attn): GPT2Attention(\n",
" (c_attn): Conv1D()\n",
" (c_proj): Conv1D()\n",
" (attn_dropout): Dropout(p=0.1, inplace=False)\n",
" (resid_dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
" (mlp): GPT2MLP(\n",
" (c_fc): Conv1D()\n",
" (c_proj): Conv1D()\n",
" (act): NewGELUActivation()\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" )\n",
" (5): GPT2Block(\n",
" (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
" (attn): GPT2Attention(\n",
" (c_attn): Conv1D()\n",
" (c_proj): Conv1D()\n",
" (attn_dropout): Dropout(p=0.1, inplace=False)\n",
" (resid_dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
" (mlp): GPT2MLP(\n",
" (c_fc): Conv1D()\n",
" (c_proj): Conv1D()\n",
" (act): NewGELUActivation()\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" )\n",
" (6): GPT2Block(\n",
" (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
" (attn): GPT2Attention(\n",
" (c_attn): Conv1D()\n",
" (c_proj): Conv1D()\n",
" (attn_dropout): Dropout(p=0.1, inplace=False)\n",
" (resid_dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
" (mlp): GPT2MLP(\n",
" (c_fc): Conv1D()\n",
" (c_proj): Conv1D()\n",
" (act): NewGELUActivation()\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" )\n",
" (7): GPT2Block(\n",
" (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
" (attn): GPT2Attention(\n",
" (c_attn): Conv1D()\n",
" (c_proj): Conv1D()\n",
" (attn_dropout): Dropout(p=0.1, inplace=False)\n",
" (resid_dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
" (mlp): GPT2MLP(\n",
" (c_fc): Conv1D()\n",
" (c_proj): Conv1D()\n",
" (act): NewGELUActivation()\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" )\n",
" (8): GPT2Block(\n",
" (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
" (attn): GPT2Attention(\n",
" (c_attn): Conv1D()\n",
" (c_proj): Conv1D()\n",
" (attn_dropout): Dropout(p=0.1, inplace=False)\n",
" (resid_dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
" (mlp): GPT2MLP(\n",
" (c_fc): Conv1D()\n",
" (c_proj): Conv1D()\n",
" (act): NewGELUActivation()\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" )\n",
" (9): GPT2Block(\n",
" (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
" (attn): GPT2Attention(\n",
" (c_attn): Conv1D()\n",
" (c_proj): Conv1D()\n",
" (attn_dropout): Dropout(p=0.1, inplace=False)\n",
" (resid_dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
" (mlp): GPT2MLP(\n",
" (c_fc): Conv1D()\n",
" (c_proj): Conv1D()\n",
" (act): NewGELUActivation()\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" )\n",
" (10): GPT2Block(\n",
" (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
" (attn): GPT2Attention(\n",
" (c_attn): Conv1D()\n",
" (c_proj): Conv1D()\n",
" (attn_dropout): Dropout(p=0.1, inplace=False)\n",
" (resid_dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
" (mlp): GPT2MLP(\n",
" (c_fc): Conv1D()\n",
" (c_proj): Conv1D()\n",
" (act): NewGELUActivation()\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" )\n",
" (11): GPT2Block(\n",
" (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
" (attn): GPT2Attention(\n",
" (c_attn): Conv1D()\n",
" (c_proj): Conv1D()\n",
" (attn_dropout): Dropout(p=0.1, inplace=False)\n",
" (resid_dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
" (mlp): GPT2MLP(\n",
" (c_fc): Conv1D()\n",
" (c_proj): Conv1D()\n",
" (act): NewGELUActivation()\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" )\n",
" )\n",
" (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
" )\n",
" (score): GPT2ClassificationHeadCustom(\n",
" (dense_1_input): Linear(in_features=768, out_features=1536, bias=True)\n",
" (dense_1_hidden): Linear(in_features=768, out_features=1536, bias=True)\n",
" (dense_2): Linear(in_features=3072, out_features=3072, bias=True)\n",
" (dense_3): Linear(in_features=3072, out_features=3072, bias=True)\n",
" (dense_4): Linear(in_features=3072, out_features=768, bias=True)\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" (out_proj): Linear(in_features=768, out_features=4, bias=False)\n",
" )\n",
")\n",
"[ERROR|tokenization_utils_base.py:1042] 2023-02-16 15:22:42,852 >> Using pad_token, but it is not set yet.\n",
"02/16/2023 15:22:42 - INFO - __main__ - Set PAD token to EOS: <|endoftext|>\n",
"Running tokenizer on dataset: 0%| | 0/120 [00:00<?, ?ba/s]02/16/2023 15:22:42 - INFO - datasets.arrow_dataset - Caching processed dataset at /home/jacob/code/university/uczenie_glebokie_w_przetwarzaniu_tekstu/.cache_training/json/default-e10a382a423bbb9a/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51/cache-d91f860557c08124.arrow\n",
"Running tokenizer on dataset: 100%|███████████| 120/120 [00:06<00:00, 17.67ba/s]\n",
"Running tokenizer on dataset: 0%| | 0/4 [00:00<?, ?ba/s]02/16/2023 15:22:49 - INFO - datasets.arrow_dataset - Caching processed dataset at /home/jacob/code/university/uczenie_glebokie_w_przetwarzaniu_tekstu/.cache_training/json/default-e10a382a423bbb9a/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51/cache-b30f34d164a78c00.arrow\n",
"Running tokenizer on dataset: 100%|███████████████| 4/4 [00:00<00:00, 19.47ba/s]\n",
"02/16/2023 15:22:50 - INFO - __main__ - Set 500 samples for 0-class\n",
"02/16/2023 15:22:50 - INFO - __main__ - Set 500 samples for 1-class\n",
"02/16/2023 15:22:50 - INFO - __main__ - Set 500 samples for 2-class\n",
"02/16/2023 15:22:50 - INFO - __main__ - Set 500 samples for 3-class\n",
"Traceback (most recent call last):\n",
" File \"/home/jacob/code/university/uczenie_glebokie_w_przetwarzaniu_tekstu/run_glue.py\", line 685, in <module>\n",
" main()\n",
" File \"/home/jacob/code/university/uczenie_glebokie_w_przetwarzaniu_tekstu/run_glue.py\", line 533, in main\n",
" raise ValueError(\"--do_predict requires a test dataset\")\n",
"ValueError: --do_predict requires a test dataset\n"
]
}
],
"source": [
"!python run_glue.py \\\n",
" --cache_dir .cache_training \\\n",
" --model_name_or_path gpt2 \\\n",
" --custom_model gpt2_hidden \\\n",
" --train_file data/train.json \\\n",
" --validation_file data/valid.json \\\n",
" --test_file data/test.json \\\n",
" --per_device_train_batch_size 8 \\\n",
" --per_device_eval_batch_size 8 \\\n",
" --do_train \\\n",
" --do_eval \\\n",
" --max_seq_length 128 \\\n",
" --learning_rate 2e-5 \\\n",
" --max_eval_samples 2000 \\\n",
" --max_steps 2500 \\\n",
" --num_train_epochs 1 \\\n",
" --save_strategy steps \\\n",
" --save_steps 250 \\\n",
" --save_total_limit 5 \\\n",
" --logging_strategy steps \\\n",
" --logging_steps 100 \\\n",
" --eval_steps 250 \\\n",
" --evaluation_strategy steps \\\n",
" --metric_for_best_model accuracy \\\n",
" --greater_is_better True \\\n",
" --load_best_model_at_end True \\\n",
" --output_dir out/gpt2 "
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"## Evaluation"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"02/16/2023 16:51:20 - WARNING - __main__ - Process rank: -1, device: cuda:0, n_gpu: 1distributed training: False, 16-bits training: False\n",
"02/16/2023 16:51:20 - INFO - __main__ - Training/evaluation parameters TrainingArguments(\n",
"_n_gpu=1,\n",
"adafactor=False,\n",
"adam_beta1=0.9,\n",
"adam_beta2=0.999,\n",
"adam_epsilon=1e-08,\n",
"auto_find_batch_size=False,\n",
"bf16=False,\n",
"bf16_full_eval=False,\n",
"data_seed=None,\n",
"dataloader_drop_last=False,\n",
"dataloader_num_workers=0,\n",
"dataloader_pin_memory=True,\n",
"ddp_bucket_cap_mb=None,\n",
"ddp_find_unused_parameters=None,\n",
"ddp_timeout=1800,\n",
"debug=[],\n",
"deepspeed=None,\n",
"disable_tqdm=False,\n",
"do_eval=True,\n",
"do_predict=True,\n",
"do_train=False,\n",
"eval_accumulation_steps=None,\n",
"eval_delay=0,\n",
"eval_steps=250,\n",
"evaluation_strategy=steps,\n",
"fp16=False,\n",
"fp16_backend=auto,\n",
"fp16_full_eval=False,\n",
"fp16_opt_level=O1,\n",
"fsdp=[],\n",
"fsdp_min_num_params=0,\n",
"fsdp_transformer_layer_cls_to_wrap=None,\n",
"full_determinism=False,\n",
"gradient_accumulation_steps=1,\n",
"gradient_checkpointing=False,\n",
"greater_is_better=True,\n",
"group_by_length=False,\n",
"half_precision_backend=auto,\n",
"hub_model_id=None,\n",
"hub_private_repo=False,\n",
"hub_strategy=every_save,\n",
"hub_token=<HUB_TOKEN>,\n",
"ignore_data_skip=False,\n",
"include_inputs_for_metrics=False,\n",
"jit_mode_eval=False,\n",
"label_names=None,\n",
"label_smoothing_factor=0.0,\n",
"learning_rate=2e-05,\n",
"length_column_name=length,\n",
"load_best_model_at_end=True,\n",
"local_rank=-1,\n",
"log_level=passive,\n",
"log_level_replica=passive,\n",
"log_on_each_node=True,\n",
"logging_dir=out/gpt2_results/runs/Feb16_16-51-19_DESKTOP-R7JO8BQ,\n",
"logging_first_step=False,\n",
"logging_nan_inf_filter=True,\n",
"logging_steps=100,\n",
"logging_strategy=steps,\n",
"lr_scheduler_type=linear,\n",
"max_grad_norm=1.0,\n",
"max_steps=2500,\n",
"metric_for_best_model=accuracy,\n",
"mp_parameters=,\n",
"no_cuda=False,\n",
"num_train_epochs=1.0,\n",
"optim=adamw_hf,\n",
"optim_args=None,\n",
"output_dir=out/gpt2_results,\n",
"overwrite_output_dir=False,\n",
"past_index=-1,\n",
"per_device_eval_batch_size=8,\n",
"per_device_train_batch_size=8,\n",
"prediction_loss_only=False,\n",
"push_to_hub=False,\n",
"push_to_hub_model_id=None,\n",
"push_to_hub_organization=None,\n",
"push_to_hub_token=<PUSH_TO_HUB_TOKEN>,\n",
"ray_scope=last,\n",
"remove_unused_columns=True,\n",
"report_to=[],\n",
"resume_from_checkpoint=None,\n",
"run_name=out/gpt2_results,\n",
"save_on_each_node=False,\n",
"save_steps=250,\n",
"save_strategy=steps,\n",
"save_total_limit=5,\n",
"seed=42,\n",
"sharded_ddp=[],\n",
"skip_memory_metrics=True,\n",
"tf32=None,\n",
"torch_compile=False,\n",
"torch_compile_backend=None,\n",
"torch_compile_mode=None,\n",
"torchdynamo=None,\n",
"tpu_metrics_debug=False,\n",
"tpu_num_cores=None,\n",
"use_ipex=False,\n",
"use_legacy_prediction_loop=False,\n",
"use_mps_device=False,\n",
"warmup_ratio=0.0,\n",
"warmup_steps=0,\n",
"weight_decay=0.0,\n",
"xpu_backend=None,\n",
")\n",
"02/16/2023 16:51:20 - INFO - __main__ - load a local file for train: data/train.json\n",
"02/16/2023 16:51:20 - INFO - __main__ - load a local file for validation: data/valid.json\n",
"02/16/2023 16:51:20 - INFO - __main__ - load a local file for test: data/test.json\n",
"02/16/2023 16:51:20 - WARNING - datasets.builder - Using custom data configuration default-f6e8039906850c57\n",
"02/16/2023 16:51:20 - INFO - datasets.info - Loading Dataset Infos from /home/jacob/anaconda3/envs/ugp/lib/python3.10/site-packages/datasets/packaged_modules/json\n",
"02/16/2023 16:51:20 - INFO - datasets.builder - Overwrite dataset info from restored data version.\n",
"02/16/2023 16:51:20 - INFO - datasets.info - Loading Dataset info from .cache_training/json/default-f6e8039906850c57/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51\n",
"02/16/2023 16:51:20 - WARNING - datasets.builder - Found cached dataset json (/home/jacob/code/university/uczenie_glebokie_w_przetwarzaniu_tekstu/.cache_training/json/default-f6e8039906850c57/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51)\n",
"02/16/2023 16:51:20 - INFO - datasets.info - Loading Dataset info from /home/jacob/code/university/uczenie_glebokie_w_przetwarzaniu_tekstu/.cache_training/json/default-f6e8039906850c57/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51\n",
"100%|████████████████████████████████████████████| 3/3 [00:00<00:00, 591.33it/s]\n",
"[INFO|configuration_utils.py:658] 2023-02-16 16:51:20,920 >> loading configuration file out/gpt2/config.json\n",
"[INFO|configuration_utils.py:712] 2023-02-16 16:51:20,921 >> Model config GPT2Config {\n",
" \"_name_or_path\": \"out/gpt2\",\n",
" \"activation_function\": \"gelu_new\",\n",
" \"architectures\": [\n",
" \"GPT2ForSequenceClassificationCustom\"\n",
" ],\n",
" \"attn_pdrop\": 0.1,\n",
" \"bos_token_id\": 50256,\n",
" \"embd_pdrop\": 0.1,\n",
" \"eos_token_id\": 50256,\n",
" \"id2label\": {\n",
" \"0\": 0,\n",
" \"1\": 1,\n",
" \"2\": 2,\n",
" \"3\": 3\n",
" },\n",
" \"initializer_range\": 0.02,\n",
" \"label2id\": {\n",
" \"0\": 0,\n",
" \"1\": 1,\n",
" \"2\": 2,\n",
" \"3\": 3\n",
" },\n",
" \"layer_norm_epsilon\": 1e-05,\n",
" \"model_type\": \"gpt2\",\n",
" \"n_ctx\": 1024,\n",
" \"n_embd\": 768,\n",
" \"n_head\": 12,\n",
" \"n_inner\": null,\n",
" \"n_layer\": 12,\n",
" \"n_positions\": 1024,\n",
" \"pad_token_id\": 50256,\n",
" \"reorder_and_upcast_attn\": false,\n",
" \"resid_pdrop\": 0.1,\n",
" \"scale_attn_by_inverse_layer_idx\": false,\n",
" \"scale_attn_weights\": true,\n",
" \"summary_activation\": null,\n",
" \"summary_first_dropout\": 0.1,\n",
" \"summary_proj_to_labels\": true,\n",
" \"summary_type\": \"cls_index\",\n",
" \"summary_use_proj\": true,\n",
" \"task_specific_params\": {\n",
" \"text-generation\": {\n",
" \"do_sample\": true,\n",
" \"max_length\": 50\n",
" }\n",
" },\n",
" \"torch_dtype\": \"float32\",\n",
" \"transformers_version\": \"4.26.1\",\n",
" \"use_cache\": true,\n",
" \"use_hidden_states\": true,\n",
" \"vocab_size\": 50257\n",
"}\n",
"\n",
"[INFO|tokenization_utils_base.py:1800] 2023-02-16 16:51:20,929 >> loading file vocab.json\n",
"[INFO|tokenization_utils_base.py:1800] 2023-02-16 16:51:20,929 >> loading file merges.txt\n",
"[INFO|tokenization_utils_base.py:1800] 2023-02-16 16:51:20,929 >> loading file tokenizer.json\n",
"[INFO|tokenization_utils_base.py:1800] 2023-02-16 16:51:20,929 >> loading file added_tokens.json\n",
"[INFO|tokenization_utils_base.py:1800] 2023-02-16 16:51:20,929 >> loading file special_tokens_map.json\n",
"[INFO|tokenization_utils_base.py:1800] 2023-02-16 16:51:20,929 >> loading file tokenizer_config.json\n",
"02/16/2023 16:51:20 - INFO - __main__ - Using hidden states in model: True\n",
"-------------------------------------------------------- Using hidden: True\n",
"02/16/2023 16:51:20 - INFO - __main__ - Using implementation from class: GPT2ForSequenceClassificationCustom\n",
"[INFO|modeling_utils.py:2272] 2023-02-16 16:51:20,982 >> loading weights file out/gpt2/pytorch_model.bin\n",
"[INFO|modeling_utils.py:2857] 2023-02-16 16:51:23,451 >> All model checkpoint weights were used when initializing GPT2ForSequenceClassificationCustom.\n",
"\n",
"[INFO|modeling_utils.py:2865] 2023-02-16 16:51:23,451 >> All the weights of GPT2ForSequenceClassificationCustom were initialized from the model checkpoint at out/gpt2.\n",
"If your task is similar to the task the model of the checkpoint was trained on, you can already use GPT2ForSequenceClassificationCustom for predictions without further training.\n",
"GPT2ForSequenceClassificationCustom(\n",
" (transformer): GPT2Model(\n",
" (wte): Embedding(50257, 768)\n",
" (wpe): Embedding(1024, 768)\n",
" (drop): Dropout(p=0.1, inplace=False)\n",
" (h): ModuleList(\n",
" (0): GPT2Block(\n",
" (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
" (attn): GPT2Attention(\n",
" (c_attn): Conv1D()\n",
" (c_proj): Conv1D()\n",
" (attn_dropout): Dropout(p=0.1, inplace=False)\n",
" (resid_dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
" (mlp): GPT2MLP(\n",
" (c_fc): Conv1D()\n",
" (c_proj): Conv1D()\n",
" (act): NewGELUActivation()\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" )\n",
" (1): GPT2Block(\n",
" (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
" (attn): GPT2Attention(\n",
" (c_attn): Conv1D()\n",
" (c_proj): Conv1D()\n",
" (attn_dropout): Dropout(p=0.1, inplace=False)\n",
" (resid_dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
" (mlp): GPT2MLP(\n",
" (c_fc): Conv1D()\n",
" (c_proj): Conv1D()\n",
" (act): NewGELUActivation()\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" )\n",
" (2): GPT2Block(\n",
" (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
" (attn): GPT2Attention(\n",
" (c_attn): Conv1D()\n",
" (c_proj): Conv1D()\n",
" (attn_dropout): Dropout(p=0.1, inplace=False)\n",
" (resid_dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
" (mlp): GPT2MLP(\n",
" (c_fc): Conv1D()\n",
" (c_proj): Conv1D()\n",
" (act): NewGELUActivation()\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" )\n",
" (3): GPT2Block(\n",
" (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
" (attn): GPT2Attention(\n",
" (c_attn): Conv1D()\n",
" (c_proj): Conv1D()\n",
" (attn_dropout): Dropout(p=0.1, inplace=False)\n",
" (resid_dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
" (mlp): GPT2MLP(\n",
" (c_fc): Conv1D()\n",
" (c_proj): Conv1D()\n",
" (act): NewGELUActivation()\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" )\n",
" (4): GPT2Block(\n",
" (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
" (attn): GPT2Attention(\n",
" (c_attn): Conv1D()\n",
" (c_proj): Conv1D()\n",
" (attn_dropout): Dropout(p=0.1, inplace=False)\n",
" (resid_dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
" (mlp): GPT2MLP(\n",
" (c_fc): Conv1D()\n",
" (c_proj): Conv1D()\n",
" (act): NewGELUActivation()\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" )\n",
" (5): GPT2Block(\n",
" (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
" (attn): GPT2Attention(\n",
" (c_attn): Conv1D()\n",
" (c_proj): Conv1D()\n",
" (attn_dropout): Dropout(p=0.1, inplace=False)\n",
" (resid_dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
" (mlp): GPT2MLP(\n",
" (c_fc): Conv1D()\n",
" (c_proj): Conv1D()\n",
" (act): NewGELUActivation()\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" )\n",
" (6): GPT2Block(\n",
" (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
" (attn): GPT2Attention(\n",
" (c_attn): Conv1D()\n",
" (c_proj): Conv1D()\n",
" (attn_dropout): Dropout(p=0.1, inplace=False)\n",
" (resid_dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
" (mlp): GPT2MLP(\n",
" (c_fc): Conv1D()\n",
" (c_proj): Conv1D()\n",
" (act): NewGELUActivation()\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" )\n",
" (7): GPT2Block(\n",
" (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
" (attn): GPT2Attention(\n",
" (c_attn): Conv1D()\n",
" (c_proj): Conv1D()\n",
" (attn_dropout): Dropout(p=0.1, inplace=False)\n",
" (resid_dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
" (mlp): GPT2MLP(\n",
" (c_fc): Conv1D()\n",
" (c_proj): Conv1D()\n",
" (act): NewGELUActivation()\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" )\n",
" (8): GPT2Block(\n",
" (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
" (attn): GPT2Attention(\n",
" (c_attn): Conv1D()\n",
" (c_proj): Conv1D()\n",
" (attn_dropout): Dropout(p=0.1, inplace=False)\n",
" (resid_dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
" (mlp): GPT2MLP(\n",
" (c_fc): Conv1D()\n",
" (c_proj): Conv1D()\n",
" (act): NewGELUActivation()\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" )\n",
" (9): GPT2Block(\n",
" (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
" (attn): GPT2Attention(\n",
" (c_attn): Conv1D()\n",
" (c_proj): Conv1D()\n",
" (attn_dropout): Dropout(p=0.1, inplace=False)\n",
" (resid_dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
" (mlp): GPT2MLP(\n",
" (c_fc): Conv1D()\n",
" (c_proj): Conv1D()\n",
" (act): NewGELUActivation()\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" )\n",
" (10): GPT2Block(\n",
" (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
" (attn): GPT2Attention(\n",
" (c_attn): Conv1D()\n",
" (c_proj): Conv1D()\n",
" (attn_dropout): Dropout(p=0.1, inplace=False)\n",
" (resid_dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
" (mlp): GPT2MLP(\n",
" (c_fc): Conv1D()\n",
" (c_proj): Conv1D()\n",
" (act): NewGELUActivation()\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" )\n",
" (11): GPT2Block(\n",
" (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
" (attn): GPT2Attention(\n",
" (c_attn): Conv1D()\n",
" (c_proj): Conv1D()\n",
" (attn_dropout): Dropout(p=0.1, inplace=False)\n",
" (resid_dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
" (mlp): GPT2MLP(\n",
" (c_fc): Conv1D()\n",
" (c_proj): Conv1D()\n",
" (act): NewGELUActivation()\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" )\n",
" )\n",
" (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
" )\n",
" (score): GPT2ClassificationHeadCustom(\n",
" (dense_1_input): Linear(in_features=768, out_features=1536, bias=True)\n",
" (dense_1_hidden): Linear(in_features=768, out_features=1536, bias=True)\n",
" (dense_2): Linear(in_features=3072, out_features=3072, bias=True)\n",
" (dense_3): Linear(in_features=3072, out_features=3072, bias=True)\n",
" (dense_4): Linear(in_features=3072, out_features=768, bias=True)\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" (out_proj): Linear(in_features=768, out_features=4, bias=False)\n",
" )\n",
")\n",
"Running tokenizer on dataset: 0%| | 0/120 [00:00<?, ?ba/s]02/16/2023 16:51:23 - INFO - datasets.arrow_dataset - Caching processed dataset at /home/jacob/code/university/uczenie_glebokie_w_przetwarzaniu_tekstu/.cache_training/json/default-f6e8039906850c57/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51/cache-7179a56e6d5f6003.arrow\n",
"Running tokenizer on dataset: 100%|███████████| 120/120 [00:07<00:00, 15.47ba/s]\n",
"Running tokenizer on dataset: 0%| | 0/4 [00:00<?, ?ba/s]02/16/2023 16:51:31 - INFO - datasets.arrow_dataset - Caching processed dataset at /home/jacob/code/university/uczenie_glebokie_w_przetwarzaniu_tekstu/.cache_training/json/default-f6e8039906850c57/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51/cache-dd7e86ec7f74125a.arrow\n",
"Running tokenizer on dataset: 100%|███████████████| 4/4 [00:00<00:00, 15.75ba/s]\n",
"Running tokenizer on dataset: 0%| | 0/4 [00:00<?, ?ba/s]02/16/2023 16:51:31 - INFO - datasets.arrow_dataset - Caching processed dataset at /home/jacob/code/university/uczenie_glebokie_w_przetwarzaniu_tekstu/.cache_training/json/default-f6e8039906850c57/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51/cache-a11e14ac330179d1.arrow\n",
"Running tokenizer on dataset: 100%|███████████████| 4/4 [00:00<00:00, 15.37ba/s]\n",
"02/16/2023 16:51:32 - INFO - __main__ - Set 500 samples for 0-class\n",
"02/16/2023 16:51:32 - INFO - __main__ - Set 500 samples for 1-class\n",
"02/16/2023 16:51:32 - INFO - __main__ - Set 500 samples for 2-class\n",
"02/16/2023 16:51:32 - INFO - __main__ - Set 500 samples for 3-class\n",
"[INFO|trainer.py:511] 2023-02-16 16:51:35,119 >> max_steps is given, it will override any value given in num_train_epochs\n",
"02/16/2023 16:51:35 - INFO - __main__ - *** Evaluate ***\n",
"[INFO|trainer.py:710] 2023-02-16 16:51:35,120 >> The following columns in the evaluation set don't have a corresponding argument in `GPT2ForSequenceClassificationCustom.forward` and have been ignored: text. If text are not expected by `GPT2ForSequenceClassificationCustom.forward`, you can safely ignore this message.\n",
"[INFO|trainer.py:2964] 2023-02-16 16:51:35,123 >> ***** Running Evaluation *****\n",
"[INFO|trainer.py:2966] 2023-02-16 16:51:35,123 >> Num examples = 2000\n",
"[INFO|trainer.py:2969] 2023-02-16 16:51:35,123 >> Batch size = 8\n",
"100%|█████████████████████████████████████████| 250/250 [00:23<00:00, 10.65it/s]\n",
"***** eval metrics *****\n",
" eval_accuracy = 0.9195\n",
" eval_loss = 0.302\n",
" eval_runtime = 0:00:24.11\n",
" eval_samples = 2000\n",
" eval_samples_per_second = 82.94\n",
" eval_steps_per_second = 10.367\n",
"02/16/2023 16:51:59 - INFO - __main__ - *** Predict ***\n",
"[INFO|trainer.py:710] 2023-02-16 16:51:59,239 >> The following columns in the test set don't have a corresponding argument in `GPT2ForSequenceClassificationCustom.forward` and have been ignored: text. If text are not expected by `GPT2ForSequenceClassificationCustom.forward`, you can safely ignore this message.\n",
"[INFO|trainer.py:2964] 2023-02-16 16:51:59,240 >> ***** Running Prediction *****\n",
"[INFO|trainer.py:2966] 2023-02-16 16:51:59,240 >> Num examples = 3800\n",
"[INFO|trainer.py:2969] 2023-02-16 16:51:59,240 >> Batch size = 8\n",
"100%|█████████████████████████████████████████| 475/475 [00:43<00:00, 10.84it/s]\n",
"02/16/2023 16:52:43 - INFO - __main__ - ***** Predict results None *****\n",
"[INFO|modelcard.py:449] 2023-02-16 16:52:43,692 >> Dropping the following result as it does not have all the necessary fields:\n",
"{'task': {'name': 'Text Classification', 'type': 'text-classification'}}\n"
]
}
],
"source": [
"!python run_glue.py \\\n",
" --cache_dir .cache_training \\\n",
" --model_name_or_path out/gpt2 \\\n",
" --custom_model gpt2_hidden \\\n",
" --train_file data/train.json \\\n",
" --validation_file data/valid.json \\\n",
" --test_file data/test.json \\\n",
" --per_device_train_batch_size 8 \\\n",
" --per_device_eval_batch_size 8 \\\n",
" --do_eval \\\n",
" --do_predict \\\n",
" --max_seq_length 128 \\\n",
" --learning_rate 2e-5 \\\n",
" --max_eval_samples 2000 \\\n",
" --max_steps 2500 \\\n",
" --num_train_epochs 1 \\\n",
" --save_strategy steps \\\n",
" --save_steps 250 \\\n",
" --save_total_limit 5 \\\n",
" --logging_strategy steps \\\n",
" --logging_steps 100 \\\n",
" --eval_steps 250 \\\n",
" --evaluation_strategy steps \\\n",
" --metric_for_best_model accuracy \\\n",
" --greater_is_better True \\\n",
" --load_best_model_at_end True \\\n",
" --output_dir out/gpt2_results "
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"## Results"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\u001b[0;39m0.9194999933242798\u001b[0m\n"
]
}
],
"source": [
"!cat out/gpt2_results/eval_results.json | jq .eval_accuracy"
]
},
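  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "The same metrics file can also be read back in Python instead of `jq` (a minimal sketch; it assumes the evaluation run above wrote `out/gpt2_results/eval_results.json`):"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import json\n",
    "\n",
    "# Load the metrics written by the evaluation run above and print the\n",
    "# same eval_accuracy value that the jq query returns.\n",
    "with open(\"out/gpt2_results/eval_results.json\") as f:\n",
    "    metrics = json.load(f)\n",
    "print(metrics[\"eval_accuracy\"])"
   ]
  },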
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"# T5"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"## Modifications"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"- Custom classification head with 3 dense layers\n",
"- Encoder layers frozen\n",
"- Decoder layers frozen"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"## Code"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [],
"source": [
"import torch\n",
"import copy\n",
"from torch import nn\n",
"from transformers import T5PreTrainedModel, T5Config\n",
"from transformers.models.t5.modeling_t5 import T5Stack\n",
"from transformers.modeling_outputs import SequenceClassifierOutput\n",
"\n",
"\n",
"class T5ClassificationHead(nn.Module):\n",
" def __init__(self, config: T5Config):\n",
" super().__init__()\n",
"\n",
" self.dense_in = nn.Linear(config.d_model, 768)\n",
" self.dense = nn.Linear(768, 768)\n",
" self.dense_out = nn.Linear(768, config.num_labels)\n",
" self.dropout = nn.Dropout(0.1)\n",
"\n",
" def forward(self, features, **kwargs):\n",
" x = features[:, 0, :]\n",
" x = self.dropout(x)\n",
" x = self.dense_in(x)\n",
" x = torch.relu(x)\n",
" x = self.dropout(x)\n",
" x = self.dense(x)\n",
" x = torch.relu(x)\n",
" x = self.dropout(x)\n",
" x = self.dense_out(x)\n",
"\n",
" return x\n",
"\n",
"\n",
"class T5ForClassification(T5PreTrainedModel):\n",
" def __init__(self, config: T5Config):\n",
" super().__init__(config)\n",
" self.model_dim = config.d_model\n",
"\n",
" self.shared = nn.Embedding(config.vocab_size, config.d_model)\n",
"\n",
" encoder_config = copy.deepcopy(config)\n",
" encoder_config.is_decoder = False\n",
" encoder_config.use_cache = False\n",
" encoder_config.is_encoder_decoder = False\n",
" self.encoder = T5Stack(encoder_config, self.shared)\n",
"\n",
" decoder_config = copy.deepcopy(config)\n",
" decoder_config.is_decoder = True\n",
" decoder_config.is_encoder_decoder = False\n",
" decoder_config.num_layers = config.num_decoder_layers\n",
" self.decoder = T5Stack(decoder_config, self.shared)\n",
"\n",
" modules_to_freeze = [self.encoder.block[i].layer[0] for i in range(len(self.encoder.block))]\n",
" modules_to_freeze.extend([self.decoder.block[i].layer[0] for i in range(len(self.decoder.block))])\n",
" modules_to_freeze.extend([self.decoder.block[i].layer[1] for i in range(len(self.decoder.block))])\n",
"\n",
" for module in modules_to_freeze:\n",
" for param in module.parameters():\n",
" param.requires_grad = False\n",
"\n",
" self.lm_head = T5ClassificationHead(config)\n",
"\n",
" # Initialize weights and apply final processing\n",
" self.post_init()\n",
"\n",
" # Model parallel\n",
" self.model_parallel = False\n",
" self.device_map = None\n",
"\n",
"\n",
" def forward(\n",
" self,\n",
" input_ids=None,\n",
" attention_mask=None,\n",
" head_mask=None,\n",
" cross_attn_head_mask=None,\n",
" past_key_values=None,\n",
" inputs_embeds=None,\n",
" decoder_inputs_embeds=None,\n",
" use_cache=None,\n",
" output_attentions=None,\n",
" output_hidden_states=None,\n",
" return_dict=None,\n",
" labels=None\n",
" ):\n",
" return_dict = return_dict if return_dict is not None else self.config.use_return_dict\n",
"\n",
" outputs = self.encoder(\n",
" input_ids,\n",
" attention_mask=attention_mask,\n",
" head_mask=head_mask,\n",
" cross_attn_head_mask=cross_attn_head_mask,\n",
" past_key_values=past_key_values,\n",
" inputs_embeds=inputs_embeds,\n",
" use_cache=use_cache,\n",
" output_attentions=output_attentions,\n",
" output_hidden_states=output_hidden_states,\n",
" return_dict=return_dict,\n",
" )\n",
"\n",
" outputs = self.decoder(\n",
" input_ids,\n",
" attention_mask=attention_mask,\n",
" head_mask=head_mask,\n",
" cross_attn_head_mask=cross_attn_head_mask,\n",
" past_key_values=past_key_values,\n",
" inputs_embeds=inputs_embeds,\n",
" use_cache=use_cache,\n",
" output_attentions=output_attentions,\n",
" output_hidden_states=output_hidden_states,\n",
" return_dict=return_dict,\n",
" )\n",
"\n",
"\n",
" logits = self.lm_head(outputs[0])\n",
"\n",
"\n",
" loss = None\n",
" if labels is not None:\n",
" loss_fct = nn.CrossEntropyLoss()\n",
" loss = loss_fct(logits.view(-1, self.config.num_labels), labels.view(-1))\n",
"\n",
"\n",
" return SequenceClassifierOutput(\n",
" loss=loss,\n",
" logits=logits,\n",
" )\n"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"## Model"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "fda885ac92b1459ba9c0faf41a9d925f",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"Downloading (…)lve/main/config.json: 0%| | 0.00/1.21k [00:00<?, ?B/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "7b82fb0c2b284fcd940e67f81abbf397",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"Downloading (…)\"pytorch_model.bin\";: 0%| | 0.00/892M [00:00<?, ?B/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"Some weights of the model checkpoint at t5-base were not used when initializing T5ForClassification: ['decoder.block.0.layer.1.EncDecAttention.relative_attention_bias.weight']\n",
"- This IS expected if you are initializing T5ForClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).\n",
"- This IS NOT expected if you are initializing T5ForClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n",
"Some weights of T5ForClassification were not initialized from the model checkpoint at t5-base and are newly initialized: ['lm_head.dense_out.bias', 'lm_head.dense.bias', 'encoder.embed_tokens.weight', 'decoder.embed_tokens.weight', 'lm_head.dense_out.weight', 'lm_head.dense.weight', 'lm_head.dense_in.bias', 'lm_head.dense_in.weight']\n",
"You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n"
]
},
{
"data": {
"text/plain": [
"T5ForClassification(\n",
" (shared): Embedding(32128, 768)\n",
" (encoder): T5Stack(\n",
" (embed_tokens): Embedding(32128, 768)\n",
" (block): ModuleList(\n",
" (0): T5Block(\n",
" (layer): ModuleList(\n",
" (0): T5LayerSelfAttention(\n",
" (SelfAttention): T5Attention(\n",
" (q): Linear(in_features=768, out_features=768, bias=False)\n",
" (k): Linear(in_features=768, out_features=768, bias=False)\n",
" (v): Linear(in_features=768, out_features=768, bias=False)\n",
" (o): Linear(in_features=768, out_features=768, bias=False)\n",
" (relative_attention_bias): Embedding(32, 12)\n",
" )\n",
" (layer_norm): T5LayerNorm()\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" (1): T5LayerFF(\n",
" (DenseReluDense): T5DenseActDense(\n",
" (wi): Linear(in_features=768, out_features=3072, bias=False)\n",
" (wo): Linear(in_features=3072, out_features=768, bias=False)\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" (act): ReLU()\n",
" )\n",
" (layer_norm): T5LayerNorm()\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" )\n",
" )\n",
" (1): T5Block(\n",
" (layer): ModuleList(\n",
" (0): T5LayerSelfAttention(\n",
" (SelfAttention): T5Attention(\n",
" (q): Linear(in_features=768, out_features=768, bias=False)\n",
" (k): Linear(in_features=768, out_features=768, bias=False)\n",
" (v): Linear(in_features=768, out_features=768, bias=False)\n",
" (o): Linear(in_features=768, out_features=768, bias=False)\n",
" )\n",
" (layer_norm): T5LayerNorm()\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" (1): T5LayerFF(\n",
" (DenseReluDense): T5DenseActDense(\n",
" (wi): Linear(in_features=768, out_features=3072, bias=False)\n",
" (wo): Linear(in_features=3072, out_features=768, bias=False)\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" (act): ReLU()\n",
" )\n",
" (layer_norm): T5LayerNorm()\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" )\n",
" )\n",
" (2): T5Block(\n",
" (layer): ModuleList(\n",
" (0): T5LayerSelfAttention(\n",
" (SelfAttention): T5Attention(\n",
" (q): Linear(in_features=768, out_features=768, bias=False)\n",
" (k): Linear(in_features=768, out_features=768, bias=False)\n",
" (v): Linear(in_features=768, out_features=768, bias=False)\n",
" (o): Linear(in_features=768, out_features=768, bias=False)\n",
" )\n",
" (layer_norm): T5LayerNorm()\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" (1): T5LayerFF(\n",
" (DenseReluDense): T5DenseActDense(\n",
" (wi): Linear(in_features=768, out_features=3072, bias=False)\n",
" (wo): Linear(in_features=3072, out_features=768, bias=False)\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" (act): ReLU()\n",
" )\n",
" (layer_norm): T5LayerNorm()\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" )\n",
" )\n",
" (3): T5Block(\n",
" (layer): ModuleList(\n",
" (0): T5LayerSelfAttention(\n",
" (SelfAttention): T5Attention(\n",
" (q): Linear(in_features=768, out_features=768, bias=False)\n",
" (k): Linear(in_features=768, out_features=768, bias=False)\n",
" (v): Linear(in_features=768, out_features=768, bias=False)\n",
" (o): Linear(in_features=768, out_features=768, bias=False)\n",
" )\n",
" (layer_norm): T5LayerNorm()\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" (1): T5LayerFF(\n",
" (DenseReluDense): T5DenseActDense(\n",
" (wi): Linear(in_features=768, out_features=3072, bias=False)\n",
" (wo): Linear(in_features=3072, out_features=768, bias=False)\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" (act): ReLU()\n",
" )\n",
" (layer_norm): T5LayerNorm()\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" )\n",
" )\n",
" (4): T5Block(\n",
" (layer): ModuleList(\n",
" (0): T5LayerSelfAttention(\n",
" (SelfAttention): T5Attention(\n",
" (q): Linear(in_features=768, out_features=768, bias=False)\n",
" (k): Linear(in_features=768, out_features=768, bias=False)\n",
" (v): Linear(in_features=768, out_features=768, bias=False)\n",
" (o): Linear(in_features=768, out_features=768, bias=False)\n",
" )\n",
" (layer_norm): T5LayerNorm()\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" (1): T5LayerFF(\n",
" (DenseReluDense): T5DenseActDense(\n",
" (wi): Linear(in_features=768, out_features=3072, bias=False)\n",
" (wo): Linear(in_features=3072, out_features=768, bias=False)\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" (act): ReLU()\n",
" )\n",
" (layer_norm): T5LayerNorm()\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" )\n",
" )\n",
" (5): T5Block(\n",
" (layer): ModuleList(\n",
" (0): T5LayerSelfAttention(\n",
" (SelfAttention): T5Attention(\n",
" (q): Linear(in_features=768, out_features=768, bias=False)\n",
" (k): Linear(in_features=768, out_features=768, bias=False)\n",
" (v): Linear(in_features=768, out_features=768, bias=False)\n",
" (o): Linear(in_features=768, out_features=768, bias=False)\n",
" )\n",
" (layer_norm): T5LayerNorm()\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" (1): T5LayerFF(\n",
" (DenseReluDense): T5DenseActDense(\n",
" (wi): Linear(in_features=768, out_features=3072, bias=False)\n",
" (wo): Linear(in_features=3072, out_features=768, bias=False)\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" (act): ReLU()\n",
" )\n",
" (layer_norm): T5LayerNorm()\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" )\n",
" )\n",
" (6): T5Block(\n",
" (layer): ModuleList(\n",
" (0): T5LayerSelfAttention(\n",
" (SelfAttention): T5Attention(\n",
" (q): Linear(in_features=768, out_features=768, bias=False)\n",
" (k): Linear(in_features=768, out_features=768, bias=False)\n",
" (v): Linear(in_features=768, out_features=768, bias=False)\n",
" (o): Linear(in_features=768, out_features=768, bias=False)\n",
" )\n",
" (layer_norm): T5LayerNorm()\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" (1): T5LayerFF(\n",
" (DenseReluDense): T5DenseActDense(\n",
" (wi): Linear(in_features=768, out_features=3072, bias=False)\n",
" (wo): Linear(in_features=3072, out_features=768, bias=False)\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" (act): ReLU()\n",
" )\n",
" (layer_norm): T5LayerNorm()\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" )\n",
" )\n",
" (7): T5Block(\n",
" (layer): ModuleList(\n",
" (0): T5LayerSelfAttention(\n",
" (SelfAttention): T5Attention(\n",
" (q): Linear(in_features=768, out_features=768, bias=False)\n",
" (k): Linear(in_features=768, out_features=768, bias=False)\n",
" (v): Linear(in_features=768, out_features=768, bias=False)\n",
" (o): Linear(in_features=768, out_features=768, bias=False)\n",
" )\n",
" (layer_norm): T5LayerNorm()\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" (1): T5LayerFF(\n",
" (DenseReluDense): T5DenseActDense(\n",
" (wi): Linear(in_features=768, out_features=3072, bias=False)\n",
" (wo): Linear(in_features=3072, out_features=768, bias=False)\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" (act): ReLU()\n",
" )\n",
" (layer_norm): T5LayerNorm()\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" )\n",
" )\n",
" (8): T5Block(\n",
" (layer): ModuleList(\n",
" (0): T5LayerSelfAttention(\n",
" (SelfAttention): T5Attention(\n",
" (q): Linear(in_features=768, out_features=768, bias=False)\n",
" (k): Linear(in_features=768, out_features=768, bias=False)\n",
" (v): Linear(in_features=768, out_features=768, bias=False)\n",
" (o): Linear(in_features=768, out_features=768, bias=False)\n",
" )\n",
" (layer_norm): T5LayerNorm()\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" (1): T5LayerFF(\n",
" (DenseReluDense): T5DenseActDense(\n",
" (wi): Linear(in_features=768, out_features=3072, bias=False)\n",
" (wo): Linear(in_features=3072, out_features=768, bias=False)\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" (act): ReLU()\n",
" )\n",
" (layer_norm): T5LayerNorm()\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" )\n",
" )\n",
" (9): T5Block(\n",
" (layer): ModuleList(\n",
" (0): T5LayerSelfAttention(\n",
" (SelfAttention): T5Attention(\n",
" (q): Linear(in_features=768, out_features=768, bias=False)\n",
" (k): Linear(in_features=768, out_features=768, bias=False)\n",
" (v): Linear(in_features=768, out_features=768, bias=False)\n",
" (o): Linear(in_features=768, out_features=768, bias=False)\n",
" )\n",
" (layer_norm): T5LayerNorm()\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" (1): T5LayerFF(\n",
" (DenseReluDense): T5DenseActDense(\n",
" (wi): Linear(in_features=768, out_features=3072, bias=False)\n",
" (wo): Linear(in_features=3072, out_features=768, bias=False)\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" (act): ReLU()\n",
" )\n",
" (layer_norm): T5LayerNorm()\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" )\n",
" )\n",
" (10): T5Block(\n",
" (layer): ModuleList(\n",
" (0): T5LayerSelfAttention(\n",
" (SelfAttention): T5Attention(\n",
" (q): Linear(in_features=768, out_features=768, bias=False)\n",
" (k): Linear(in_features=768, out_features=768, bias=False)\n",
" (v): Linear(in_features=768, out_features=768, bias=False)\n",
" (o): Linear(in_features=768, out_features=768, bias=False)\n",
" )\n",
" (layer_norm): T5LayerNorm()\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" (1): T5LayerFF(\n",
" (DenseReluDense): T5DenseActDense(\n",
" (wi): Linear(in_features=768, out_features=3072, bias=False)\n",
" (wo): Linear(in_features=3072, out_features=768, bias=False)\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" (act): ReLU()\n",
" )\n",
" (layer_norm): T5LayerNorm()\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" )\n",
" )\n",
" (11): T5Block(\n",
" (layer): ModuleList(\n",
" (0): T5LayerSelfAttention(\n",
" (SelfAttention): T5Attention(\n",
" (q): Linear(in_features=768, out_features=768, bias=False)\n",
" (k): Linear(in_features=768, out_features=768, bias=False)\n",
" (v): Linear(in_features=768, out_features=768, bias=False)\n",
" (o): Linear(in_features=768, out_features=768, bias=False)\n",
" )\n",
" (layer_norm): T5LayerNorm()\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" (1): T5LayerFF(\n",
" (DenseReluDense): T5DenseActDense(\n",
" (wi): Linear(in_features=768, out_features=3072, bias=False)\n",
" (wo): Linear(in_features=3072, out_features=768, bias=False)\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" (act): ReLU()\n",
" )\n",
" (layer_norm): T5LayerNorm()\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" )\n",
" )\n",
" )\n",
" (final_layer_norm): T5LayerNorm()\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" (decoder): T5Stack(\n",
" (embed_tokens): Embedding(32128, 768)\n",
" (block): ModuleList(\n",
" (0): T5Block(\n",
" (layer): ModuleList(\n",
" (0): T5LayerSelfAttention(\n",
" (SelfAttention): T5Attention(\n",
" (q): Linear(in_features=768, out_features=768, bias=False)\n",
" (k): Linear(in_features=768, out_features=768, bias=False)\n",
" (v): Linear(in_features=768, out_features=768, bias=False)\n",
" (o): Linear(in_features=768, out_features=768, bias=False)\n",
" (relative_attention_bias): Embedding(32, 12)\n",
" )\n",
" (layer_norm): T5LayerNorm()\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" (1): T5LayerCrossAttention(\n",
" (EncDecAttention): T5Attention(\n",
" (q): Linear(in_features=768, out_features=768, bias=False)\n",
" (k): Linear(in_features=768, out_features=768, bias=False)\n",
" (v): Linear(in_features=768, out_features=768, bias=False)\n",
" (o): Linear(in_features=768, out_features=768, bias=False)\n",
" )\n",
" (layer_norm): T5LayerNorm()\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" (2): T5LayerFF(\n",
" (DenseReluDense): T5DenseActDense(\n",
" (wi): Linear(in_features=768, out_features=3072, bias=False)\n",
" (wo): Linear(in_features=3072, out_features=768, bias=False)\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" (act): ReLU()\n",
" )\n",
" (layer_norm): T5LayerNorm()\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" )\n",
" )\n",
" (1): T5Block(\n",
" (layer): ModuleList(\n",
" (0): T5LayerSelfAttention(\n",
" (SelfAttention): T5Attention(\n",
" (q): Linear(in_features=768, out_features=768, bias=False)\n",
" (k): Linear(in_features=768, out_features=768, bias=False)\n",
" (v): Linear(in_features=768, out_features=768, bias=False)\n",
" (o): Linear(in_features=768, out_features=768, bias=False)\n",
" )\n",
" (layer_norm): T5LayerNorm()\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" (1): T5LayerCrossAttention(\n",
" (EncDecAttention): T5Attention(\n",
" (q): Linear(in_features=768, out_features=768, bias=False)\n",
" (k): Linear(in_features=768, out_features=768, bias=False)\n",
" (v): Linear(in_features=768, out_features=768, bias=False)\n",
" (o): Linear(in_features=768, out_features=768, bias=False)\n",
" )\n",
" (layer_norm): T5LayerNorm()\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" (2): T5LayerFF(\n",
" (DenseReluDense): T5DenseActDense(\n",
" (wi): Linear(in_features=768, out_features=3072, bias=False)\n",
" (wo): Linear(in_features=3072, out_features=768, bias=False)\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" (act): ReLU()\n",
" )\n",
" (layer_norm): T5LayerNorm()\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" )\n",
" )\n",
" (2): T5Block(\n",
" (layer): ModuleList(\n",
" (0): T5LayerSelfAttention(\n",
" (SelfAttention): T5Attention(\n",
" (q): Linear(in_features=768, out_features=768, bias=False)\n",
" (k): Linear(in_features=768, out_features=768, bias=False)\n",
" (v): Linear(in_features=768, out_features=768, bias=False)\n",
" (o): Linear(in_features=768, out_features=768, bias=False)\n",
" )\n",
" (layer_norm): T5LayerNorm()\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" (1): T5LayerCrossAttention(\n",
" (EncDecAttention): T5Attention(\n",
" (q): Linear(in_features=768, out_features=768, bias=False)\n",
" (k): Linear(in_features=768, out_features=768, bias=False)\n",
" (v): Linear(in_features=768, out_features=768, bias=False)\n",
" (o): Linear(in_features=768, out_features=768, bias=False)\n",
" )\n",
" (layer_norm): T5LayerNorm()\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" (2): T5LayerFF(\n",
" (DenseReluDense): T5DenseActDense(\n",
" (wi): Linear(in_features=768, out_features=3072, bias=False)\n",
" (wo): Linear(in_features=3072, out_features=768, bias=False)\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" (act): ReLU()\n",
" )\n",
" (layer_norm): T5LayerNorm()\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" )\n",
" )\n",
" (3): T5Block(\n",
" (layer): ModuleList(\n",
" (0): T5LayerSelfAttention(\n",
" (SelfAttention): T5Attention(\n",
" (q): Linear(in_features=768, out_features=768, bias=False)\n",
" (k): Linear(in_features=768, out_features=768, bias=False)\n",
" (v): Linear(in_features=768, out_features=768, bias=False)\n",
" (o): Linear(in_features=768, out_features=768, bias=False)\n",
" )\n",
" (layer_norm): T5LayerNorm()\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" (1): T5LayerCrossAttention(\n",
" (EncDecAttention): T5Attention(\n",
" (q): Linear(in_features=768, out_features=768, bias=False)\n",
" (k): Linear(in_features=768, out_features=768, bias=False)\n",
" (v): Linear(in_features=768, out_features=768, bias=False)\n",
" (o): Linear(in_features=768, out_features=768, bias=False)\n",
" )\n",
" (layer_norm): T5LayerNorm()\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" (2): T5LayerFF(\n",
" (DenseReluDense): T5DenseActDense(\n",
" (wi): Linear(in_features=768, out_features=3072, bias=False)\n",
" (wo): Linear(in_features=3072, out_features=768, bias=False)\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" (act): ReLU()\n",
" )\n",
" (layer_norm): T5LayerNorm()\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" )\n",
" )\n",
" (4): T5Block(\n",
" (layer): ModuleList(\n",
" (0): T5LayerSelfAttention(\n",
" (SelfAttention): T5Attention(\n",
" (q): Linear(in_features=768, out_features=768, bias=False)\n",
" (k): Linear(in_features=768, out_features=768, bias=False)\n",
" (v): Linear(in_features=768, out_features=768, bias=False)\n",
" (o): Linear(in_features=768, out_features=768, bias=False)\n",
" )\n",
" (layer_norm): T5LayerNorm()\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" (1): T5LayerCrossAttention(\n",
" (EncDecAttention): T5Attention(\n",
" (q): Linear(in_features=768, out_features=768, bias=False)\n",
" (k): Linear(in_features=768, out_features=768, bias=False)\n",
" (v): Linear(in_features=768, out_features=768, bias=False)\n",
" (o): Linear(in_features=768, out_features=768, bias=False)\n",
" )\n",
" (layer_norm): T5LayerNorm()\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" (2): T5LayerFF(\n",
" (DenseReluDense): T5DenseActDense(\n",
" (wi): Linear(in_features=768, out_features=3072, bias=False)\n",
" (wo): Linear(in_features=3072, out_features=768, bias=False)\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" (act): ReLU()\n",
" )\n",
" (layer_norm): T5LayerNorm()\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" )\n",
" )\n",
" (5): T5Block(\n",
" (layer): ModuleList(\n",
" (0): T5LayerSelfAttention(\n",
" (SelfAttention): T5Attention(\n",
" (q): Linear(in_features=768, out_features=768, bias=False)\n",
" (k): Linear(in_features=768, out_features=768, bias=False)\n",
" (v): Linear(in_features=768, out_features=768, bias=False)\n",
" (o): Linear(in_features=768, out_features=768, bias=False)\n",
" )\n",
" (layer_norm): T5LayerNorm()\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" (1): T5LayerCrossAttention(\n",
" (EncDecAttention): T5Attention(\n",
" (q): Linear(in_features=768, out_features=768, bias=False)\n",
" (k): Linear(in_features=768, out_features=768, bias=False)\n",
" (v): Linear(in_features=768, out_features=768, bias=False)\n",
" (o): Linear(in_features=768, out_features=768, bias=False)\n",
" )\n",
" (layer_norm): T5LayerNorm()\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" (2): T5LayerFF(\n",
" (DenseReluDense): T5DenseActDense(\n",
" (wi): Linear(in_features=768, out_features=3072, bias=False)\n",
" (wo): Linear(in_features=3072, out_features=768, bias=False)\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" (act): ReLU()\n",
" )\n",
" (layer_norm): T5LayerNorm()\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" )\n",
" )\n",
" (6): T5Block(\n",
" (layer): ModuleList(\n",
" (0): T5LayerSelfAttention(\n",
" (SelfAttention): T5Attention(\n",
" (q): Linear(in_features=768, out_features=768, bias=False)\n",
" (k): Linear(in_features=768, out_features=768, bias=False)\n",
" (v): Linear(in_features=768, out_features=768, bias=False)\n",
" (o): Linear(in_features=768, out_features=768, bias=False)\n",
" )\n",
" (layer_norm): T5LayerNorm()\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" (1): T5LayerCrossAttention(\n",
" (EncDecAttention): T5Attention(\n",
" (q): Linear(in_features=768, out_features=768, bias=False)\n",
" (k): Linear(in_features=768, out_features=768, bias=False)\n",
" (v): Linear(in_features=768, out_features=768, bias=False)\n",
" (o): Linear(in_features=768, out_features=768, bias=False)\n",
" )\n",
" (layer_norm): T5LayerNorm()\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" (2): T5LayerFF(\n",
" (DenseReluDense): T5DenseActDense(\n",
" (wi): Linear(in_features=768, out_features=3072, bias=False)\n",
" (wo): Linear(in_features=3072, out_features=768, bias=False)\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" (act): ReLU()\n",
" )\n",
" (layer_norm): T5LayerNorm()\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" )\n",
" )\n",
" (7): T5Block(\n",
" (layer): ModuleList(\n",
" (0): T5LayerSelfAttention(\n",
" (SelfAttention): T5Attention(\n",
" (q): Linear(in_features=768, out_features=768, bias=False)\n",
" (k): Linear(in_features=768, out_features=768, bias=False)\n",
" (v): Linear(in_features=768, out_features=768, bias=False)\n",
" (o): Linear(in_features=768, out_features=768, bias=False)\n",
" )\n",
" (layer_norm): T5LayerNorm()\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" (1): T5LayerCrossAttention(\n",
" (EncDecAttention): T5Attention(\n",
" (q): Linear(in_features=768, out_features=768, bias=False)\n",
" (k): Linear(in_features=768, out_features=768, bias=False)\n",
" (v): Linear(in_features=768, out_features=768, bias=False)\n",
" (o): Linear(in_features=768, out_features=768, bias=False)\n",
" )\n",
" (layer_norm): T5LayerNorm()\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" (2): T5LayerFF(\n",
" (DenseReluDense): T5DenseActDense(\n",
" (wi): Linear(in_features=768, out_features=3072, bias=False)\n",
" (wo): Linear(in_features=3072, out_features=768, bias=False)\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" (act): ReLU()\n",
" )\n",
" (layer_norm): T5LayerNorm()\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" )\n",
" )\n",
" (8): T5Block(\n",
" (layer): ModuleList(\n",
" (0): T5LayerSelfAttention(\n",
" (SelfAttention): T5Attention(\n",
" (q): Linear(in_features=768, out_features=768, bias=False)\n",
" (k): Linear(in_features=768, out_features=768, bias=False)\n",
" (v): Linear(in_features=768, out_features=768, bias=False)\n",
" (o): Linear(in_features=768, out_features=768, bias=False)\n",
" )\n",
" (layer_norm): T5LayerNorm()\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" (1): T5LayerCrossAttention(\n",
" (EncDecAttention): T5Attention(\n",
" (q): Linear(in_features=768, out_features=768, bias=False)\n",
" (k): Linear(in_features=768, out_features=768, bias=False)\n",
" (v): Linear(in_features=768, out_features=768, bias=False)\n",
" (o): Linear(in_features=768, out_features=768, bias=False)\n",
" )\n",
" (layer_norm): T5LayerNorm()\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" (2): T5LayerFF(\n",
" (DenseReluDense): T5DenseActDense(\n",
" (wi): Linear(in_features=768, out_features=3072, bias=False)\n",
" (wo): Linear(in_features=3072, out_features=768, bias=False)\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" (act): ReLU()\n",
" )\n",
" (layer_norm): T5LayerNorm()\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" )\n",
" )\n",
" (9): T5Block(\n",
" (layer): ModuleList(\n",
" (0): T5LayerSelfAttention(\n",
" (SelfAttention): T5Attention(\n",
" (q): Linear(in_features=768, out_features=768, bias=False)\n",
" (k): Linear(in_features=768, out_features=768, bias=False)\n",
" (v): Linear(in_features=768, out_features=768, bias=False)\n",
" (o): Linear(in_features=768, out_features=768, bias=False)\n",
" )\n",
" (layer_norm): T5LayerNorm()\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" (1): T5LayerCrossAttention(\n",
" (EncDecAttention): T5Attention(\n",
" (q): Linear(in_features=768, out_features=768, bias=False)\n",
" (k): Linear(in_features=768, out_features=768, bias=False)\n",
" (v): Linear(in_features=768, out_features=768, bias=False)\n",
" (o): Linear(in_features=768, out_features=768, bias=False)\n",
" )\n",
" (layer_norm): T5LayerNorm()\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" (2): T5LayerFF(\n",
" (DenseReluDense): T5DenseActDense(\n",
" (wi): Linear(in_features=768, out_features=3072, bias=False)\n",
" (wo): Linear(in_features=3072, out_features=768, bias=False)\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" (act): ReLU()\n",
" )\n",
" (layer_norm): T5LayerNorm()\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" )\n",
" )\n",
" (10): T5Block(\n",
" (layer): ModuleList(\n",
" (0): T5LayerSelfAttention(\n",
" (SelfAttention): T5Attention(\n",
" (q): Linear(in_features=768, out_features=768, bias=False)\n",
" (k): Linear(in_features=768, out_features=768, bias=False)\n",
" (v): Linear(in_features=768, out_features=768, bias=False)\n",
" (o): Linear(in_features=768, out_features=768, bias=False)\n",
" )\n",
" (layer_norm): T5LayerNorm()\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" (1): T5LayerCrossAttention(\n",
" (EncDecAttention): T5Attention(\n",
" (q): Linear(in_features=768, out_features=768, bias=False)\n",
" (k): Linear(in_features=768, out_features=768, bias=False)\n",
" (v): Linear(in_features=768, out_features=768, bias=False)\n",
" (o): Linear(in_features=768, out_features=768, bias=False)\n",
" )\n",
" (layer_norm): T5LayerNorm()\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" (2): T5LayerFF(\n",
" (DenseReluDense): T5DenseActDense(\n",
" (wi): Linear(in_features=768, out_features=3072, bias=False)\n",
" (wo): Linear(in_features=3072, out_features=768, bias=False)\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" (act): ReLU()\n",
" )\n",
" (layer_norm): T5LayerNorm()\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" )\n",
" )\n",
" (11): T5Block(\n",
" (layer): ModuleList(\n",
" (0): T5LayerSelfAttention(\n",
" (SelfAttention): T5Attention(\n",
" (q): Linear(in_features=768, out_features=768, bias=False)\n",
" (k): Linear(in_features=768, out_features=768, bias=False)\n",
" (v): Linear(in_features=768, out_features=768, bias=False)\n",
" (o): Linear(in_features=768, out_features=768, bias=False)\n",
" )\n",
" (layer_norm): T5LayerNorm()\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" (1): T5LayerCrossAttention(\n",
" (EncDecAttention): T5Attention(\n",
" (q): Linear(in_features=768, out_features=768, bias=False)\n",
" (k): Linear(in_features=768, out_features=768, bias=False)\n",
" (v): Linear(in_features=768, out_features=768, bias=False)\n",
" (o): Linear(in_features=768, out_features=768, bias=False)\n",
" )\n",
" (layer_norm): T5LayerNorm()\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" (2): T5LayerFF(\n",
" (DenseReluDense): T5DenseActDense(\n",
" (wi): Linear(in_features=768, out_features=3072, bias=False)\n",
" (wo): Linear(in_features=3072, out_features=768, bias=False)\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" (act): ReLU()\n",
" )\n",
" (layer_norm): T5LayerNorm()\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" )\n",
" )\n",
" )\n",
" (final_layer_norm): T5LayerNorm()\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" (lm_head): T5ClassificationHead(\n",
" (dense_in): Linear(in_features=768, out_features=768, bias=True)\n",
" (dense): Linear(in_features=768, out_features=768, bias=True)\n",
" (dense_out): Linear(in_features=768, out_features=2, bias=True)\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
")"
]
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"T5ForClassification.from_pretrained(\"t5-base\")"
]
},
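  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Before training, a quick sanity check (an illustrative sketch, not part of the original pipeline) confirms the freezing logic by counting trainable vs. frozen parameters:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Sanity check: with the attention sublayers frozen, a large share of the\n",
    "# parameters should report requires_grad=False.\n",
    "model = T5ForClassification.from_pretrained(\"t5-base\")\n",
    "trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)\n",
    "frozen = sum(p.numel() for p in model.parameters() if not p.requires_grad)\n",
    "print(f\"trainable: {trainable:,} | frozen: {frozen:,}\")"
   ]
  },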
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"## Training"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"02/16/2023 15:24:13 - WARNING - __main__ - Process rank: -1, device: cuda:0, n_gpu: 1distributed training: False, 16-bits training: False\n",
"02/16/2023 15:24:13 - INFO - __main__ - Training/evaluation parameters TrainingArguments(\n",
"_n_gpu=1,\n",
"adafactor=False,\n",
"adam_beta1=0.9,\n",
"adam_beta2=0.999,\n",
"adam_epsilon=1e-08,\n",
"auto_find_batch_size=False,\n",
"bf16=False,\n",
"bf16_full_eval=False,\n",
"data_seed=None,\n",
"dataloader_drop_last=False,\n",
"dataloader_num_workers=0,\n",
"dataloader_pin_memory=True,\n",
"ddp_bucket_cap_mb=None,\n",
"ddp_find_unused_parameters=None,\n",
"ddp_timeout=1800,\n",
"debug=[],\n",
"deepspeed=None,\n",
"disable_tqdm=False,\n",
"do_eval=True,\n",
"do_predict=False,\n",
"do_train=True,\n",
"eval_accumulation_steps=None,\n",
"eval_delay=0,\n",
"eval_steps=250,\n",
"evaluation_strategy=steps,\n",
"fp16=False,\n",
"fp16_backend=auto,\n",
"fp16_full_eval=False,\n",
"fp16_opt_level=O1,\n",
"fsdp=[],\n",
"fsdp_min_num_params=0,\n",
"fsdp_transformer_layer_cls_to_wrap=None,\n",
"full_determinism=False,\n",
"gradient_accumulation_steps=1,\n",
"gradient_checkpointing=False,\n",
"greater_is_better=True,\n",
"group_by_length=False,\n",
"half_precision_backend=auto,\n",
"hub_model_id=None,\n",
"hub_private_repo=False,\n",
"hub_strategy=every_save,\n",
"hub_token=<HUB_TOKEN>,\n",
"ignore_data_skip=False,\n",
"include_inputs_for_metrics=False,\n",
"jit_mode_eval=False,\n",
"label_names=None,\n",
"label_smoothing_factor=0.0,\n",
"learning_rate=2e-05,\n",
"length_column_name=length,\n",
"load_best_model_at_end=True,\n",
"local_rank=-1,\n",
"log_level=passive,\n",
"log_level_replica=passive,\n",
"log_on_each_node=True,\n",
"logging_dir=out/t5/runs/Feb16_15-24-12_DESKTOP-R7JO8BQ,\n",
"logging_first_step=False,\n",
"logging_nan_inf_filter=True,\n",
"logging_steps=100,\n",
"logging_strategy=steps,\n",
"lr_scheduler_type=linear,\n",
"max_grad_norm=1.0,\n",
"max_steps=2500,\n",
"metric_for_best_model=accuracy,\n",
"mp_parameters=,\n",
"no_cuda=False,\n",
"num_train_epochs=1.0,\n",
"optim=adamw_hf,\n",
"optim_args=None,\n",
"output_dir=out/t5,\n",
"overwrite_output_dir=False,\n",
"past_index=-1,\n",
"per_device_eval_batch_size=8,\n",
"per_device_train_batch_size=8,\n",
"prediction_loss_only=False,\n",
"push_to_hub=False,\n",
"push_to_hub_model_id=None,\n",
"push_to_hub_organization=None,\n",
"push_to_hub_token=<PUSH_TO_HUB_TOKEN>,\n",
"ray_scope=last,\n",
"remove_unused_columns=True,\n",
"report_to=[],\n",
"resume_from_checkpoint=None,\n",
"run_name=out/t5,\n",
"save_on_each_node=False,\n",
"save_steps=250,\n",
"save_strategy=steps,\n",
"save_total_limit=5,\n",
"seed=42,\n",
"sharded_ddp=[],\n",
"skip_memory_metrics=True,\n",
"tf32=None,\n",
"torch_compile=False,\n",
"torch_compile_backend=None,\n",
"torch_compile_mode=None,\n",
"torchdynamo=None,\n",
"tpu_metrics_debug=False,\n",
"tpu_num_cores=None,\n",
"use_ipex=False,\n",
"use_legacy_prediction_loop=False,\n",
"use_mps_device=False,\n",
"warmup_ratio=0.0,\n",
"warmup_steps=0,\n",
"weight_decay=0.0,\n",
"xpu_backend=None,\n",
")\n",
"02/16/2023 15:24:13 - INFO - __main__ - Checkpoint detected, resuming training at out/t5/checkpoint-2500. To avoid this behavior, change the `--output_dir` or add `--overwrite_output_dir` to train from scratch.\n",
"02/16/2023 15:24:13 - INFO - __main__ - load a local file for train: data/train.json\n",
"02/16/2023 15:24:13 - INFO - __main__ - load a local file for validation: data/valid.json\n",
"02/16/2023 15:24:13 - WARNING - datasets.builder - Using custom data configuration default-e10a382a423bbb9a\n",
"02/16/2023 15:24:13 - INFO - datasets.info - Loading Dataset Infos from /home/jacob/anaconda3/envs/ugp/lib/python3.10/site-packages/datasets/packaged_modules/json\n",
"02/16/2023 15:24:13 - INFO - datasets.builder - Overwrite dataset info from restored data version.\n",
"02/16/2023 15:24:13 - INFO - datasets.info - Loading Dataset info from .cache_training/json/default-e10a382a423bbb9a/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51\n",
"02/16/2023 15:24:13 - WARNING - datasets.builder - Found cached dataset json (/home/jacob/code/university/uczenie_glebokie_w_przetwarzaniu_tekstu/.cache_training/json/default-e10a382a423bbb9a/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51)\n",
"02/16/2023 15:24:13 - INFO - datasets.info - Loading Dataset info from /home/jacob/code/university/uczenie_glebokie_w_przetwarzaniu_tekstu/.cache_training/json/default-e10a382a423bbb9a/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51\n",
"100%|████████████████████████████████████████████| 2/2 [00:00<00:00, 426.97it/s]\n",
"[INFO|configuration_utils.py:660] 2023-02-16 15:24:14,422 >> loading configuration file config.json from cache at .cache_training/models--t5-base/snapshots/0db7e623bcaee2daf9b859a646637ea39bf016cd/config.json\n",
"[INFO|configuration_utils.py:712] 2023-02-16 15:24:14,423 >> Model config T5Config {\n",
" \"_name_or_path\": \"t5-base\",\n",
" \"architectures\": [\n",
" \"T5ForConditionalGeneration\"\n",
" ],\n",
" \"d_ff\": 3072,\n",
" \"d_kv\": 64,\n",
" \"d_model\": 768,\n",
" \"decoder_start_token_id\": 0,\n",
" \"dense_act_fn\": \"relu\",\n",
" \"dropout_rate\": 0.1,\n",
" \"eos_token_id\": 1,\n",
" \"feed_forward_proj\": \"relu\",\n",
" \"id2label\": {\n",
" \"0\": \"LABEL_0\",\n",
" \"1\": \"LABEL_1\",\n",
" \"2\": \"LABEL_2\",\n",
" \"3\": \"LABEL_3\"\n",
" },\n",
" \"initializer_factor\": 1.0,\n",
" \"is_encoder_decoder\": true,\n",
" \"is_gated_act\": false,\n",
" \"label2id\": {\n",
" \"LABEL_0\": 0,\n",
" \"LABEL_1\": 1,\n",
" \"LABEL_2\": 2,\n",
" \"LABEL_3\": 3\n",
" },\n",
" \"layer_norm_epsilon\": 1e-06,\n",
" \"model_type\": \"t5\",\n",
" \"n_positions\": 512,\n",
" \"num_decoder_layers\": 12,\n",
" \"num_heads\": 12,\n",
" \"num_layers\": 12,\n",
" \"output_past\": true,\n",
" \"pad_token_id\": 0,\n",
" \"relative_attention_max_distance\": 128,\n",
" \"relative_attention_num_buckets\": 32,\n",
" \"task_specific_params\": {\n",
" \"summarization\": {\n",
" \"early_stopping\": true,\n",
" \"length_penalty\": 2.0,\n",
" \"max_length\": 200,\n",
" \"min_length\": 30,\n",
" \"no_repeat_ngram_size\": 3,\n",
" \"num_beams\": 4,\n",
" \"prefix\": \"summarize: \"\n",
" },\n",
" \"translation_en_to_de\": {\n",
" \"early_stopping\": true,\n",
" \"max_length\": 300,\n",
" \"num_beams\": 4,\n",
" \"prefix\": \"translate English to German: \"\n",
" },\n",
" \"translation_en_to_fr\": {\n",
" \"early_stopping\": true,\n",
" \"max_length\": 300,\n",
" \"num_beams\": 4,\n",
" \"prefix\": \"translate English to French: \"\n",
" },\n",
" \"translation_en_to_ro\": {\n",
" \"early_stopping\": true,\n",
" \"max_length\": 300,\n",
" \"num_beams\": 4,\n",
" \"prefix\": \"translate English to Romanian: \"\n",
" }\n",
" },\n",
" \"transformers_version\": \"4.26.1\",\n",
" \"use_cache\": true,\n",
" \"vocab_size\": 32128\n",
"}\n",
"\n",
"[INFO|tokenization_auto.py:458] 2023-02-16 15:24:14,918 >> Could not locate the tokenizer configuration file, will try to use the model config instead.\n",
"[INFO|configuration_utils.py:660] 2023-02-16 15:24:15,378 >> loading configuration file config.json from cache at .cache_training/models--t5-base/snapshots/0db7e623bcaee2daf9b859a646637ea39bf016cd/config.json\n",
"[INFO|configuration_utils.py:712] 2023-02-16 15:24:15,378 >> Model config T5Config {\n",
" \"_name_or_path\": \"t5-base\",\n",
" \"architectures\": [\n",
" \"T5ForConditionalGeneration\"\n",
" ],\n",
" \"d_ff\": 3072,\n",
" \"d_kv\": 64,\n",
" \"d_model\": 768,\n",
" \"decoder_start_token_id\": 0,\n",
" \"dense_act_fn\": \"relu\",\n",
" \"dropout_rate\": 0.1,\n",
" \"eos_token_id\": 1,\n",
" \"feed_forward_proj\": \"relu\",\n",
" \"initializer_factor\": 1.0,\n",
" \"is_encoder_decoder\": true,\n",
" \"is_gated_act\": false,\n",
" \"layer_norm_epsilon\": 1e-06,\n",
" \"model_type\": \"t5\",\n",
" \"n_positions\": 512,\n",
" \"num_decoder_layers\": 12,\n",
" \"num_heads\": 12,\n",
" \"num_layers\": 12,\n",
" \"output_past\": true,\n",
" \"pad_token_id\": 0,\n",
" \"relative_attention_max_distance\": 128,\n",
" \"relative_attention_num_buckets\": 32,\n",
" \"task_specific_params\": {\n",
" \"summarization\": {\n",
" \"early_stopping\": true,\n",
" \"length_penalty\": 2.0,\n",
" \"max_length\": 200,\n",
" \"min_length\": 30,\n",
" \"no_repeat_ngram_size\": 3,\n",
" \"num_beams\": 4,\n",
" \"prefix\": \"summarize: \"\n",
" },\n",
" \"translation_en_to_de\": {\n",
" \"early_stopping\": true,\n",
" \"max_length\": 300,\n",
" \"num_beams\": 4,\n",
" \"prefix\": \"translate English to German: \"\n",
" },\n",
" \"translation_en_to_fr\": {\n",
" \"early_stopping\": true,\n",
" \"max_length\": 300,\n",
" \"num_beams\": 4,\n",
" \"prefix\": \"translate English to French: \"\n",
" },\n",
" \"translation_en_to_ro\": {\n",
" \"early_stopping\": true,\n",
" \"max_length\": 300,\n",
" \"num_beams\": 4,\n",
" \"prefix\": \"translate English to Romanian: \"\n",
" }\n",
" },\n",
" \"transformers_version\": \"4.26.1\",\n",
" \"use_cache\": true,\n",
" \"vocab_size\": 32128\n",
"}\n",
"\n",
"[INFO|tokenization_utils_base.py:1802] 2023-02-16 15:24:16,341 >> loading file spiece.model from cache at .cache_training/models--t5-base/snapshots/0db7e623bcaee2daf9b859a646637ea39bf016cd/spiece.model\n",
"[INFO|tokenization_utils_base.py:1802] 2023-02-16 15:24:16,341 >> loading file tokenizer.json from cache at .cache_training/models--t5-base/snapshots/0db7e623bcaee2daf9b859a646637ea39bf016cd/tokenizer.json\n",
"[INFO|tokenization_utils_base.py:1802] 2023-02-16 15:24:16,341 >> loading file added_tokens.json from cache at None\n",
"[INFO|tokenization_utils_base.py:1802] 2023-02-16 15:24:16,341 >> loading file special_tokens_map.json from cache at None\n",
"[INFO|tokenization_utils_base.py:1802] 2023-02-16 15:24:16,341 >> loading file tokenizer_config.json from cache at None\n",
"[INFO|configuration_utils.py:660] 2023-02-16 15:24:16,342 >> loading configuration file config.json from cache at .cache_training/models--t5-base/snapshots/0db7e623bcaee2daf9b859a646637ea39bf016cd/config.json\n",
"[INFO|configuration_utils.py:712] 2023-02-16 15:24:16,342 >> Model config T5Config {\n",
" \"_name_or_path\": \"t5-base\",\n",
" \"architectures\": [\n",
" \"T5ForConditionalGeneration\"\n",
" ],\n",
" \"d_ff\": 3072,\n",
" \"d_kv\": 64,\n",
" \"d_model\": 768,\n",
" \"decoder_start_token_id\": 0,\n",
" \"dense_act_fn\": \"relu\",\n",
" \"dropout_rate\": 0.1,\n",
" \"eos_token_id\": 1,\n",
" \"feed_forward_proj\": \"relu\",\n",
" \"initializer_factor\": 1.0,\n",
" \"is_encoder_decoder\": true,\n",
" \"is_gated_act\": false,\n",
" \"layer_norm_epsilon\": 1e-06,\n",
" \"model_type\": \"t5\",\n",
" \"n_positions\": 512,\n",
" \"num_decoder_layers\": 12,\n",
" \"num_heads\": 12,\n",
" \"num_layers\": 12,\n",
" \"output_past\": true,\n",
" \"pad_token_id\": 0,\n",
" \"relative_attention_max_distance\": 128,\n",
" \"relative_attention_num_buckets\": 32,\n",
" \"task_specific_params\": {\n",
" \"summarization\": {\n",
" \"early_stopping\": true,\n",
" \"length_penalty\": 2.0,\n",
" \"max_length\": 200,\n",
" \"min_length\": 30,\n",
" \"no_repeat_ngram_size\": 3,\n",
" \"num_beams\": 4,\n",
" \"prefix\": \"summarize: \"\n",
" },\n",
" \"translation_en_to_de\": {\n",
" \"early_stopping\": true,\n",
" \"max_length\": 300,\n",
" \"num_beams\": 4,\n",
" \"prefix\": \"translate English to German: \"\n",
" },\n",
" \"translation_en_to_fr\": {\n",
" \"early_stopping\": true,\n",
" \"max_length\": 300,\n",
" \"num_beams\": 4,\n",
" \"prefix\": \"translate English to French: \"\n",
" },\n",
" \"translation_en_to_ro\": {\n",
" \"early_stopping\": true,\n",
" \"max_length\": 300,\n",
" \"num_beams\": 4,\n",
" \"prefix\": \"translate English to Romanian: \"\n",
" }\n",
" },\n",
" \"transformers_version\": \"4.26.1\",\n",
" \"use_cache\": true,\n",
" \"vocab_size\": 32128\n",
"}\n",
"\n",
"/home/jacob/anaconda3/envs/ugp/lib/python3.10/site-packages/transformers/models/t5/tokenization_t5_fast.py:155: FutureWarning: This tokenizer was incorrectly instantiated with a model max length of 512 which will be corrected in Transformers v5.\n",
"For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.\n",
"- Be aware that you SHOULD NOT rely on t5-base automatically truncating your input to 512 when padding/encoding.\n",
"- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.\n",
"- To avoid this warning, please instantiate this tokenizer with `model_max_length` set to your preferred value.\n",
" warnings.warn(\n",
"02/16/2023 15:24:16 - INFO - __main__ - Using hidden states in model: False\n",
"-------------------------------------------------------- Using hidden: False\n",
"02/16/2023 15:24:16 - INFO - __main__ - Using implementation from class: T5ForClassification\n",
"[INFO|modeling_utils.py:2275] 2023-02-16 15:24:16,391 >> loading weights file pytorch_model.bin from cache at .cache_training/models--t5-base/snapshots/0db7e623bcaee2daf9b859a646637ea39bf016cd/pytorch_model.bin\n",
"[WARNING|modeling_utils.py:2847] 2023-02-16 15:24:19,101 >> Some weights of the model checkpoint at t5-base were not used when initializing T5ForClassification: ['decoder.block.0.layer.1.EncDecAttention.relative_attention_bias.weight']\n",
"- This IS expected if you are initializing T5ForClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).\n",
"- This IS NOT expected if you are initializing T5ForClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n",
"[WARNING|modeling_utils.py:2859] 2023-02-16 15:24:19,102 >> Some weights of T5ForClassification were not initialized from the model checkpoint at t5-base and are newly initialized: ['decoder.embed_tokens.weight', 'lm_head.dense.bias', 'lm_head.dense_out.bias', 'encoder.embed_tokens.weight', 'lm_head.dense_in.bias', 'lm_head.dense_in.weight', 'lm_head.dense.weight', 'lm_head.dense_out.weight']\n",
"You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n",
"T5ForClassification(\n",
" (shared): Embedding(32128, 768)\n",
" (encoder): T5Stack(\n",
" (embed_tokens): Embedding(32128, 768)\n",
" (block): ModuleList(\n",
" (0): T5Block(\n",
" (layer): ModuleList(\n",
" (0): T5LayerSelfAttention(\n",
" (SelfAttention): T5Attention(\n",
" (q): Linear(in_features=768, out_features=768, bias=False)\n",
" (k): Linear(in_features=768, out_features=768, bias=False)\n",
" (v): Linear(in_features=768, out_features=768, bias=False)\n",
" (o): Linear(in_features=768, out_features=768, bias=False)\n",
" (relative_attention_bias): Embedding(32, 12)\n",
" )\n",
" (layer_norm): T5LayerNorm()\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" (1): T5LayerFF(\n",
" (DenseReluDense): T5DenseActDense(\n",
" (wi): Linear(in_features=768, out_features=3072, bias=False)\n",
" (wo): Linear(in_features=3072, out_features=768, bias=False)\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" (act): ReLU()\n",
" )\n",
" (layer_norm): T5LayerNorm()\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" )\n",
" )\n",
" (1): T5Block(\n",
" (layer): ModuleList(\n",
" (0): T5LayerSelfAttention(\n",
" (SelfAttention): T5Attention(\n",
" (q): Linear(in_features=768, out_features=768, bias=False)\n",
" (k): Linear(in_features=768, out_features=768, bias=False)\n",
" (v): Linear(in_features=768, out_features=768, bias=False)\n",
" (o): Linear(in_features=768, out_features=768, bias=False)\n",
" )\n",
" (layer_norm): T5LayerNorm()\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" (1): T5LayerFF(\n",
" (DenseReluDense): T5DenseActDense(\n",
" (wi): Linear(in_features=768, out_features=3072, bias=False)\n",
" (wo): Linear(in_features=3072, out_features=768, bias=False)\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" (act): ReLU()\n",
" )\n",
" (layer_norm): T5LayerNorm()\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" )\n",
" )\n",
" (2): T5Block(\n",
" (layer): ModuleList(\n",
" (0): T5LayerSelfAttention(\n",
" (SelfAttention): T5Attention(\n",
" (q): Linear(in_features=768, out_features=768, bias=False)\n",
" (k): Linear(in_features=768, out_features=768, bias=False)\n",
" (v): Linear(in_features=768, out_features=768, bias=False)\n",
" (o): Linear(in_features=768, out_features=768, bias=False)\n",
" )\n",
" (layer_norm): T5LayerNorm()\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" (1): T5LayerFF(\n",
" (DenseReluDense): T5DenseActDense(\n",
" (wi): Linear(in_features=768, out_features=3072, bias=False)\n",
" (wo): Linear(in_features=3072, out_features=768, bias=False)\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" (act): ReLU()\n",
" )\n",
" (layer_norm): T5LayerNorm()\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" )\n",
" )\n",
" (3): T5Block(\n",
" (layer): ModuleList(\n",
" (0): T5LayerSelfAttention(\n",
" (SelfAttention): T5Attention(\n",
" (q): Linear(in_features=768, out_features=768, bias=False)\n",
" (k): Linear(in_features=768, out_features=768, bias=False)\n",
" (v): Linear(in_features=768, out_features=768, bias=False)\n",
" (o): Linear(in_features=768, out_features=768, bias=False)\n",
" )\n",
" (layer_norm): T5LayerNorm()\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" (1): T5LayerFF(\n",
" (DenseReluDense): T5DenseActDense(\n",
" (wi): Linear(in_features=768, out_features=3072, bias=False)\n",
" (wo): Linear(in_features=3072, out_features=768, bias=False)\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" (act): ReLU()\n",
" )\n",
" (layer_norm): T5LayerNorm()\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" )\n",
" )\n",
" (4): T5Block(\n",
" (layer): ModuleList(\n",
" (0): T5LayerSelfAttention(\n",
" (SelfAttention): T5Attention(\n",
" (q): Linear(in_features=768, out_features=768, bias=False)\n",
" (k): Linear(in_features=768, out_features=768, bias=False)\n",
" (v): Linear(in_features=768, out_features=768, bias=False)\n",
" (o): Linear(in_features=768, out_features=768, bias=False)\n",
" )\n",
" (layer_norm): T5LayerNorm()\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" (1): T5LayerFF(\n",
" (DenseReluDense): T5DenseActDense(\n",
" (wi): Linear(in_features=768, out_features=3072, bias=False)\n",
" (wo): Linear(in_features=3072, out_features=768, bias=False)\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" (act): ReLU()\n",
" )\n",
" (layer_norm): T5LayerNorm()\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" )\n",
" )\n",
" (5): T5Block(\n",
" (layer): ModuleList(\n",
" (0): T5LayerSelfAttention(\n",
" (SelfAttention): T5Attention(\n",
" (q): Linear(in_features=768, out_features=768, bias=False)\n",
" (k): Linear(in_features=768, out_features=768, bias=False)\n",
" (v): Linear(in_features=768, out_features=768, bias=False)\n",
" (o): Linear(in_features=768, out_features=768, bias=False)\n",
" )\n",
" (layer_norm): T5LayerNorm()\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" (1): T5LayerFF(\n",
" (DenseReluDense): T5DenseActDense(\n",
" (wi): Linear(in_features=768, out_features=3072, bias=False)\n",
" (wo): Linear(in_features=3072, out_features=768, bias=False)\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" (act): ReLU()\n",
" )\n",
" (layer_norm): T5LayerNorm()\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" )\n",
" )\n",
" (6): T5Block(\n",
" (layer): ModuleList(\n",
" (0): T5LayerSelfAttention(\n",
" (SelfAttention): T5Attention(\n",
" (q): Linear(in_features=768, out_features=768, bias=False)\n",
" (k): Linear(in_features=768, out_features=768, bias=False)\n",
" (v): Linear(in_features=768, out_features=768, bias=False)\n",
" (o): Linear(in_features=768, out_features=768, bias=False)\n",
" )\n",
" (layer_norm): T5LayerNorm()\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" (1): T5LayerFF(\n",
" (DenseReluDense): T5DenseActDense(\n",
" (wi): Linear(in_features=768, out_features=3072, bias=False)\n",
" (wo): Linear(in_features=3072, out_features=768, bias=False)\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" (act): ReLU()\n",
" )\n",
" (layer_norm): T5LayerNorm()\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" )\n",
" )\n",
" (7): T5Block(\n",
" (layer): ModuleList(\n",
" (0): T5LayerSelfAttention(\n",
" (SelfAttention): T5Attention(\n",
" (q): Linear(in_features=768, out_features=768, bias=False)\n",
" (k): Linear(in_features=768, out_features=768, bias=False)\n",
" (v): Linear(in_features=768, out_features=768, bias=False)\n",
" (o): Linear(in_features=768, out_features=768, bias=False)\n",
" )\n",
" (layer_norm): T5LayerNorm()\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" (1): T5LayerFF(\n",
" (DenseReluDense): T5DenseActDense(\n",
" (wi): Linear(in_features=768, out_features=3072, bias=False)\n",
" (wo): Linear(in_features=3072, out_features=768, bias=False)\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" (act): ReLU()\n",
" )\n",
" (layer_norm): T5LayerNorm()\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" )\n",
" )\n",
" (8): T5Block(\n",
" (layer): ModuleList(\n",
" (0): T5LayerSelfAttention(\n",
" (SelfAttention): T5Attention(\n",
" (q): Linear(in_features=768, out_features=768, bias=False)\n",
" (k): Linear(in_features=768, out_features=768, bias=False)\n",
" (v): Linear(in_features=768, out_features=768, bias=False)\n",
" (o): Linear(in_features=768, out_features=768, bias=False)\n",
" )\n",
" (layer_norm): T5LayerNorm()\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" (1): T5LayerFF(\n",
" (DenseReluDense): T5DenseActDense(\n",
" (wi): Linear(in_features=768, out_features=3072, bias=False)\n",
" (wo): Linear(in_features=3072, out_features=768, bias=False)\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" (act): ReLU()\n",
" )\n",
" (layer_norm): T5LayerNorm()\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" )\n",
" )\n",
" (9): T5Block(\n",
" (layer): ModuleList(\n",
" (0): T5LayerSelfAttention(\n",
" (SelfAttention): T5Attention(\n",
" (q): Linear(in_features=768, out_features=768, bias=False)\n",
" (k): Linear(in_features=768, out_features=768, bias=False)\n",
" (v): Linear(in_features=768, out_features=768, bias=False)\n",
" (o): Linear(in_features=768, out_features=768, bias=False)\n",
" )\n",
" (layer_norm): T5LayerNorm()\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" (1): T5LayerFF(\n",
" (DenseReluDense): T5DenseActDense(\n",
" (wi): Linear(in_features=768, out_features=3072, bias=False)\n",
" (wo): Linear(in_features=3072, out_features=768, bias=False)\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" (act): ReLU()\n",
" )\n",
" (layer_norm): T5LayerNorm()\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" )\n",
" )\n",
" (10): T5Block(\n",
" (layer): ModuleList(\n",
" (0): T5LayerSelfAttention(\n",
" (SelfAttention): T5Attention(\n",
" (q): Linear(in_features=768, out_features=768, bias=False)\n",
" (k): Linear(in_features=768, out_features=768, bias=False)\n",
" (v): Linear(in_features=768, out_features=768, bias=False)\n",
" (o): Linear(in_features=768, out_features=768, bias=False)\n",
" )\n",
" (layer_norm): T5LayerNorm()\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" (1): T5LayerFF(\n",
" (DenseReluDense): T5DenseActDense(\n",
" (wi): Linear(in_features=768, out_features=3072, bias=False)\n",
" (wo): Linear(in_features=3072, out_features=768, bias=False)\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" (act): ReLU()\n",
" )\n",
" (layer_norm): T5LayerNorm()\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" )\n",
" )\n",
" (11): T5Block(\n",
" (layer): ModuleList(\n",
" (0): T5LayerSelfAttention(\n",
" (SelfAttention): T5Attention(\n",
" (q): Linear(in_features=768, out_features=768, bias=False)\n",
" (k): Linear(in_features=768, out_features=768, bias=False)\n",
" (v): Linear(in_features=768, out_features=768, bias=False)\n",
" (o): Linear(in_features=768, out_features=768, bias=False)\n",
" )\n",
" (layer_norm): T5LayerNorm()\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" (1): T5LayerFF(\n",
" (DenseReluDense): T5DenseActDense(\n",
" (wi): Linear(in_features=768, out_features=3072, bias=False)\n",
" (wo): Linear(in_features=3072, out_features=768, bias=False)\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" (act): ReLU()\n",
" )\n",
" (layer_norm): T5LayerNorm()\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" )\n",
" )\n",
" )\n",
" (final_layer_norm): T5LayerNorm()\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" (decoder): T5Stack(\n",
" (embed_tokens): Embedding(32128, 768)\n",
" (block): ModuleList(\n",
" (0): T5Block(\n",
" (layer): ModuleList(\n",
" (0): T5LayerSelfAttention(\n",
" (SelfAttention): T5Attention(\n",
" (q): Linear(in_features=768, out_features=768, bias=False)\n",
" (k): Linear(in_features=768, out_features=768, bias=False)\n",
" (v): Linear(in_features=768, out_features=768, bias=False)\n",
" (o): Linear(in_features=768, out_features=768, bias=False)\n",
" (relative_attention_bias): Embedding(32, 12)\n",
" )\n",
" (layer_norm): T5LayerNorm()\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" (1): T5LayerCrossAttention(\n",
" (EncDecAttention): T5Attention(\n",
" (q): Linear(in_features=768, out_features=768, bias=False)\n",
" (k): Linear(in_features=768, out_features=768, bias=False)\n",
" (v): Linear(in_features=768, out_features=768, bias=False)\n",
" (o): Linear(in_features=768, out_features=768, bias=False)\n",
" )\n",
" (layer_norm): T5LayerNorm()\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" (2): T5LayerFF(\n",
" (DenseReluDense): T5DenseActDense(\n",
" (wi): Linear(in_features=768, out_features=3072, bias=False)\n",
" (wo): Linear(in_features=3072, out_features=768, bias=False)\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" (act): ReLU()\n",
" )\n",
" (layer_norm): T5LayerNorm()\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" )\n",
" )\n",
" (1): T5Block(\n",
" (layer): ModuleList(\n",
" (0): T5LayerSelfAttention(\n",
" (SelfAttention): T5Attention(\n",
" (q): Linear(in_features=768, out_features=768, bias=False)\n",
" (k): Linear(in_features=768, out_features=768, bias=False)\n",
" (v): Linear(in_features=768, out_features=768, bias=False)\n",
" (o): Linear(in_features=768, out_features=768, bias=False)\n",
" )\n",
" (layer_norm): T5LayerNorm()\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" (1): T5LayerCrossAttention(\n",
" (EncDecAttention): T5Attention(\n",
" (q): Linear(in_features=768, out_features=768, bias=False)\n",
" (k): Linear(in_features=768, out_features=768, bias=False)\n",
" (v): Linear(in_features=768, out_features=768, bias=False)\n",
" (o): Linear(in_features=768, out_features=768, bias=False)\n",
" )\n",
" (layer_norm): T5LayerNorm()\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" (2): T5LayerFF(\n",
" (DenseReluDense): T5DenseActDense(\n",
" (wi): Linear(in_features=768, out_features=3072, bias=False)\n",
" (wo): Linear(in_features=3072, out_features=768, bias=False)\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" (act): ReLU()\n",
" )\n",
" (layer_norm): T5LayerNorm()\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" )\n",
" )\n",
" (2): T5Block(\n",
" (layer): ModuleList(\n",
" (0): T5LayerSelfAttention(\n",
" (SelfAttention): T5Attention(\n",
" (q): Linear(in_features=768, out_features=768, bias=False)\n",
" (k): Linear(in_features=768, out_features=768, bias=False)\n",
" (v): Linear(in_features=768, out_features=768, bias=False)\n",
" (o): Linear(in_features=768, out_features=768, bias=False)\n",
" )\n",
" (layer_norm): T5LayerNorm()\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" (1): T5LayerCrossAttention(\n",
" (EncDecAttention): T5Attention(\n",
" (q): Linear(in_features=768, out_features=768, bias=False)\n",
" (k): Linear(in_features=768, out_features=768, bias=False)\n",
" (v): Linear(in_features=768, out_features=768, bias=False)\n",
" (o): Linear(in_features=768, out_features=768, bias=False)\n",
" )\n",
" (layer_norm): T5LayerNorm()\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" (2): T5LayerFF(\n",
" (DenseReluDense): T5DenseActDense(\n",
" (wi): Linear(in_features=768, out_features=3072, bias=False)\n",
" (wo): Linear(in_features=3072, out_features=768, bias=False)\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" (act): ReLU()\n",
" )\n",
" (layer_norm): T5LayerNorm()\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" )\n",
" )\n",
" (3): T5Block(\n",
" (layer): ModuleList(\n",
" (0): T5LayerSelfAttention(\n",
" (SelfAttention): T5Attention(\n",
" (q): Linear(in_features=768, out_features=768, bias=False)\n",
" (k): Linear(in_features=768, out_features=768, bias=False)\n",
" (v): Linear(in_features=768, out_features=768, bias=False)\n",
" (o): Linear(in_features=768, out_features=768, bias=False)\n",
" )\n",
" (layer_norm): T5LayerNorm()\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" (1): T5LayerCrossAttention(\n",
" (EncDecAttention): T5Attention(\n",
" (q): Linear(in_features=768, out_features=768, bias=False)\n",
" (k): Linear(in_features=768, out_features=768, bias=False)\n",
" (v): Linear(in_features=768, out_features=768, bias=False)\n",
" (o): Linear(in_features=768, out_features=768, bias=False)\n",
" )\n",
" (layer_norm): T5LayerNorm()\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" (2): T5LayerFF(\n",
" (DenseReluDense): T5DenseActDense(\n",
" (wi): Linear(in_features=768, out_features=3072, bias=False)\n",
" (wo): Linear(in_features=3072, out_features=768, bias=False)\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" (act): ReLU()\n",
" )\n",
" (layer_norm): T5LayerNorm()\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" )\n",
" )\n",
" (4): T5Block(\n",
" (layer): ModuleList(\n",
" (0): T5LayerSelfAttention(\n",
" (SelfAttention): T5Attention(\n",
" (q): Linear(in_features=768, out_features=768, bias=False)\n",
" (k): Linear(in_features=768, out_features=768, bias=False)\n",
" (v): Linear(in_features=768, out_features=768, bias=False)\n",
" (o): Linear(in_features=768, out_features=768, bias=False)\n",
" )\n",
" (layer_norm): T5LayerNorm()\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" (1): T5LayerCrossAttention(\n",
" (EncDecAttention): T5Attention(\n",
" (q): Linear(in_features=768, out_features=768, bias=False)\n",
" (k): Linear(in_features=768, out_features=768, bias=False)\n",
" (v): Linear(in_features=768, out_features=768, bias=False)\n",
" (o): Linear(in_features=768, out_features=768, bias=False)\n",
" )\n",
" (layer_norm): T5LayerNorm()\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" (2): T5LayerFF(\n",
" (DenseReluDense): T5DenseActDense(\n",
" (wi): Linear(in_features=768, out_features=3072, bias=False)\n",
" (wo): Linear(in_features=3072, out_features=768, bias=False)\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" (act): ReLU()\n",
" )\n",
" (layer_norm): T5LayerNorm()\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" )\n",
" )\n",
" (5): T5Block(\n",
" (layer): ModuleList(\n",
" (0): T5LayerSelfAttention(\n",
" (SelfAttention): T5Attention(\n",
" (q): Linear(in_features=768, out_features=768, bias=False)\n",
" (k): Linear(in_features=768, out_features=768, bias=False)\n",
" (v): Linear(in_features=768, out_features=768, bias=False)\n",
" (o): Linear(in_features=768, out_features=768, bias=False)\n",
" )\n",
" (layer_norm): T5LayerNorm()\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" (1): T5LayerCrossAttention(\n",
" (EncDecAttention): T5Attention(\n",
" (q): Linear(in_features=768, out_features=768, bias=False)\n",
" (k): Linear(in_features=768, out_features=768, bias=False)\n",
" (v): Linear(in_features=768, out_features=768, bias=False)\n",
" (o): Linear(in_features=768, out_features=768, bias=False)\n",
" )\n",
" (layer_norm): T5LayerNorm()\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" (2): T5LayerFF(\n",
" (DenseReluDense): T5DenseActDense(\n",
" (wi): Linear(in_features=768, out_features=3072, bias=False)\n",
" (wo): Linear(in_features=3072, out_features=768, bias=False)\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" (act): ReLU()\n",
" )\n",
" (layer_norm): T5LayerNorm()\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" )\n",
" )\n",
" (6): T5Block(\n",
" (layer): ModuleList(\n",
" (0): T5LayerSelfAttention(\n",
" (SelfAttention): T5Attention(\n",
" (q): Linear(in_features=768, out_features=768, bias=False)\n",
" (k): Linear(in_features=768, out_features=768, bias=False)\n",
" (v): Linear(in_features=768, out_features=768, bias=False)\n",
" (o): Linear(in_features=768, out_features=768, bias=False)\n",
" )\n",
" (layer_norm): T5LayerNorm()\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" (1): T5LayerCrossAttention(\n",
" (EncDecAttention): T5Attention(\n",
" (q): Linear(in_features=768, out_features=768, bias=False)\n",
" (k): Linear(in_features=768, out_features=768, bias=False)\n",
" (v): Linear(in_features=768, out_features=768, bias=False)\n",
" (o): Linear(in_features=768, out_features=768, bias=False)\n",
" )\n",
" (layer_norm): T5LayerNorm()\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" (2): T5LayerFF(\n",
" (DenseReluDense): T5DenseActDense(\n",
" (wi): Linear(in_features=768, out_features=3072, bias=False)\n",
" (wo): Linear(in_features=3072, out_features=768, bias=False)\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" (act): ReLU()\n",
" )\n",
" (layer_norm): T5LayerNorm()\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" )\n",
" )\n",
" (7): T5Block(\n",
" (layer): ModuleList(\n",
" (0): T5LayerSelfAttention(\n",
" (SelfAttention): T5Attention(\n",
" (q): Linear(in_features=768, out_features=768, bias=False)\n",
" (k): Linear(in_features=768, out_features=768, bias=False)\n",
" (v): Linear(in_features=768, out_features=768, bias=False)\n",
" (o): Linear(in_features=768, out_features=768, bias=False)\n",
" )\n",
" (layer_norm): T5LayerNorm()\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" (1): T5LayerCrossAttention(\n",
" (EncDecAttention): T5Attention(\n",
" (q): Linear(in_features=768, out_features=768, bias=False)\n",
" (k): Linear(in_features=768, out_features=768, bias=False)\n",
" (v): Linear(in_features=768, out_features=768, bias=False)\n",
" (o): Linear(in_features=768, out_features=768, bias=False)\n",
" )\n",
" (layer_norm): T5LayerNorm()\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" (2): T5LayerFF(\n",
" (DenseReluDense): T5DenseActDense(\n",
" (wi): Linear(in_features=768, out_features=3072, bias=False)\n",
" (wo): Linear(in_features=3072, out_features=768, bias=False)\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" (act): ReLU()\n",
" )\n",
" (layer_norm): T5LayerNorm()\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" )\n",
" )\n",
" (8): T5Block(\n",
" (layer): ModuleList(\n",
" (0): T5LayerSelfAttention(\n",
" (SelfAttention): T5Attention(\n",
" (q): Linear(in_features=768, out_features=768, bias=False)\n",
" (k): Linear(in_features=768, out_features=768, bias=False)\n",
" (v): Linear(in_features=768, out_features=768, bias=False)\n",
" (o): Linear(in_features=768, out_features=768, bias=False)\n",
" )\n",
" (layer_norm): T5LayerNorm()\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" (1): T5LayerCrossAttention(\n",
" (EncDecAttention): T5Attention(\n",
" (q): Linear(in_features=768, out_features=768, bias=False)\n",
" (k): Linear(in_features=768, out_features=768, bias=False)\n",
" (v): Linear(in_features=768, out_features=768, bias=False)\n",
" (o): Linear(in_features=768, out_features=768, bias=False)\n",
" )\n",
" (layer_norm): T5LayerNorm()\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" (2): T5LayerFF(\n",
" (DenseReluDense): T5DenseActDense(\n",
" (wi): Linear(in_features=768, out_features=3072, bias=False)\n",
" (wo): Linear(in_features=3072, out_features=768, bias=False)\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" (act): ReLU()\n",
" )\n",
" (layer_norm): T5LayerNorm()\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" )\n",
" )\n",
" (9): T5Block(\n",
" (layer): ModuleList(\n",
" (0): T5LayerSelfAttention(\n",
" (SelfAttention): T5Attention(\n",
" (q): Linear(in_features=768, out_features=768, bias=False)\n",
" (k): Linear(in_features=768, out_features=768, bias=False)\n",
" (v): Linear(in_features=768, out_features=768, bias=False)\n",
" (o): Linear(in_features=768, out_features=768, bias=False)\n",
" )\n",
" (layer_norm): T5LayerNorm()\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" (1): T5LayerCrossAttention(\n",
" (EncDecAttention): T5Attention(\n",
" (q): Linear(in_features=768, out_features=768, bias=False)\n",
" (k): Linear(in_features=768, out_features=768, bias=False)\n",
" (v): Linear(in_features=768, out_features=768, bias=False)\n",
" (o): Linear(in_features=768, out_features=768, bias=False)\n",
" )\n",
" (layer_norm): T5LayerNorm()\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" (2): T5LayerFF(\n",
" (DenseReluDense): T5DenseActDense(\n",
" (wi): Linear(in_features=768, out_features=3072, bias=False)\n",
" (wo): Linear(in_features=3072, out_features=768, bias=False)\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" (act): ReLU()\n",
" )\n",
" (layer_norm): T5LayerNorm()\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" )\n",
" )\n",
" (10): T5Block(\n",
" (layer): ModuleList(\n",
" (0): T5LayerSelfAttention(\n",
" (SelfAttention): T5Attention(\n",
" (q): Linear(in_features=768, out_features=768, bias=False)\n",
" (k): Linear(in_features=768, out_features=768, bias=False)\n",
" (v): Linear(in_features=768, out_features=768, bias=False)\n",
" (o): Linear(in_features=768, out_features=768, bias=False)\n",
" )\n",
" (layer_norm): T5LayerNorm()\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" (1): T5LayerCrossAttention(\n",
" (EncDecAttention): T5Attention(\n",
" (q): Linear(in_features=768, out_features=768, bias=False)\n",
" (k): Linear(in_features=768, out_features=768, bias=False)\n",
" (v): Linear(in_features=768, out_features=768, bias=False)\n",
" (o): Linear(in_features=768, out_features=768, bias=False)\n",
" )\n",
" (layer_norm): T5LayerNorm()\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" (2): T5LayerFF(\n",
" (DenseReluDense): T5DenseActDense(\n",
" (wi): Linear(in_features=768, out_features=3072, bias=False)\n",
" (wo): Linear(in_features=3072, out_features=768, bias=False)\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" (act): ReLU()\n",
" )\n",
" (layer_norm): T5LayerNorm()\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" )\n",
" )\n",
" (11): T5Block(\n",
" (layer): ModuleList(\n",
" (0): T5LayerSelfAttention(\n",
" (SelfAttention): T5Attention(\n",
" (q): Linear(in_features=768, out_features=768, bias=False)\n",
" (k): Linear(in_features=768, out_features=768, bias=False)\n",
" (v): Linear(in_features=768, out_features=768, bias=False)\n",
" (o): Linear(in_features=768, out_features=768, bias=False)\n",
" )\n",
" (layer_norm): T5LayerNorm()\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" (1): T5LayerCrossAttention(\n",
" (EncDecAttention): T5Attention(\n",
" (q): Linear(in_features=768, out_features=768, bias=False)\n",
" (k): Linear(in_features=768, out_features=768, bias=False)\n",
" (v): Linear(in_features=768, out_features=768, bias=False)\n",
" (o): Linear(in_features=768, out_features=768, bias=False)\n",
" )\n",
" (layer_norm): T5LayerNorm()\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" (2): T5LayerFF(\n",
" (DenseReluDense): T5DenseActDense(\n",
" (wi): Linear(in_features=768, out_features=3072, bias=False)\n",
" (wo): Linear(in_features=3072, out_features=768, bias=False)\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" (act): ReLU()\n",
" )\n",
" (layer_norm): T5LayerNorm()\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" )\n",
" )\n",
" )\n",
" (final_layer_norm): T5LayerNorm()\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" (lm_head): T5ClassificationHead(\n",
" (dense_in): Linear(in_features=768, out_features=768, bias=True)\n",
" (dense): Linear(in_features=768, out_features=768, bias=True)\n",
" (dense_out): Linear(in_features=768, out_features=4, bias=True)\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
")\n",
"Running tokenizer on dataset: 0%| | 0/120 [00:00<?, ?ba/s]02/16/2023 15:24:19 - INFO - datasets.arrow_dataset - Caching processed dataset at /home/jacob/code/university/uczenie_glebokie_w_przetwarzaniu_tekstu/.cache_training/json/default-e10a382a423bbb9a/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51/cache-0f99c998b010fbf8.arrow\n",
"Running tokenizer on dataset: 100%|███████████| 120/120 [00:07<00:00, 15.69ba/s]\n",
"Running tokenizer on dataset: 0%| | 0/4 [00:00<?, ?ba/s]02/16/2023 15:24:26 - INFO - datasets.arrow_dataset - Caching processed dataset at /home/jacob/code/university/uczenie_glebokie_w_przetwarzaniu_tekstu/.cache_training/json/default-e10a382a423bbb9a/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51/cache-0cfaba6ab7fdc0e3.arrow\n",
"Running tokenizer on dataset: 100%|███████████████| 4/4 [00:00<00:00, 17.12ba/s]\n",
"02/16/2023 15:24:27 - INFO - __main__ - Set 500 samples for 0-class\n",
"02/16/2023 15:24:27 - INFO - __main__ - Set 500 samples for 1-class\n",
"02/16/2023 15:24:27 - INFO - __main__ - Set 500 samples for 2-class\n",
"02/16/2023 15:24:27 - INFO - __main__ - Set 500 samples for 3-class\n",
"Traceback (most recent call last):\n",
" File \"/home/jacob/code/university/uczenie_glebokie_w_przetwarzaniu_tekstu/run_glue.py\", line 685, in <module>\n",
" main()\n",
" File \"/home/jacob/code/university/uczenie_glebokie_w_przetwarzaniu_tekstu/run_glue.py\", line 533, in main\n",
" raise ValueError(\"--do_predict requires a test dataset\")\n",
"ValueError: --do_predict requires a test dataset\n"
]
}
],
"source": [
"!python run_glue.py \\\n",
" --cache_dir .cache_training \\\n",
" --model_name_or_path t5-base \\\n",
" --custom_model t5_custom \\\n",
" --train_file data/train.json \\\n",
" --validation_file data/valid.json \\\n",
" --test_file data/test.json \\\n",
" --per_device_train_batch_size 8 \\\n",
" --per_device_eval_batch_size 8 \\\n",
" --do_train \\\n",
" --do_eval \\\n",
" --max_seq_length 128 \\\n",
" --learning_rate 2e-5 \\\n",
" --max_eval_samples 2000 \\\n",
" --max_steps 2500 \\\n",
" --num_train_epochs 1 \\\n",
" --save_strategy steps \\\n",
" --save_steps 250 \\\n",
" --save_total_limit 5 \\\n",
" --logging_strategy steps \\\n",
" --logging_steps 100 \\\n",
" --eval_steps 250 \\\n",
" --evaluation_strategy steps \\\n",
" --metric_for_best_model accuracy \\\n",
" --greater_is_better True \\\n",
" --load_best_model_at_end True \\\n",
" --output_dir out/t5"
]
},
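  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "The invocation above detected the finished run at `out/t5/checkpoint-2500` and then aborted with `ValueError: --do_predict requires a test dataset`, so no predictions were produced here; the fine-tuned weights and tokenizer remain under `out/t5`. Below is a minimal sanity-check sketch for loading them directly; the exact inputs and return type of `T5ForClassification.forward` are defined earlier in the notebook."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import torch\n",
    "from transformers import AutoTokenizer\n",
    "\n",
    "# Sketch: assumes the custom T5ForClassification class from earlier cells\n",
    "# and the tokenizer files saved alongside the checkpoint in out/t5.\n",
    "tokenizer = AutoTokenizer.from_pretrained(\"out/t5\")\n",
    "model = T5ForClassification.from_pretrained(\"out/t5\")\n",
    "model.eval()\n",
    "\n",
    "inputs = tokenizer(\"example text to classify\", return_tensors=\"pt\")\n",
    "with torch.no_grad():\n",
    "    outputs = model(**inputs)\n",
    "print(outputs)"
   ]
  },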
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"## Evaluation"
]
},
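  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Both runs rank checkpoints by accuracy (`metric_for_best_model=accuracy`). As a quick reference, a sketch of that metric computed with the `evaluate` library on toy labels:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import evaluate\n",
    "\n",
    "# 3 of 4 toy predictions match the references -> accuracy 0.75.\n",
    "accuracy = evaluate.load(\"accuracy\")\n",
    "print(accuracy.compute(predictions=[0, 1, 2, 3], references=[0, 1, 2, 2]))"
   ]
  },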
{
"cell_type": "code",
"execution_count": 17,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"02/16/2023 16:52:57 - WARNING - __main__ - Process rank: -1, device: cuda:0, n_gpu: 1distributed training: False, 16-bits training: False\n",
"02/16/2023 16:52:57 - INFO - __main__ - Training/evaluation parameters TrainingArguments(\n",
"_n_gpu=1,\n",
"adafactor=False,\n",
"adam_beta1=0.9,\n",
"adam_beta2=0.999,\n",
"adam_epsilon=1e-08,\n",
"auto_find_batch_size=False,\n",
"bf16=False,\n",
"bf16_full_eval=False,\n",
"data_seed=None,\n",
"dataloader_drop_last=False,\n",
"dataloader_num_workers=0,\n",
"dataloader_pin_memory=True,\n",
"ddp_bucket_cap_mb=None,\n",
"ddp_find_unused_parameters=None,\n",
"ddp_timeout=1800,\n",
"debug=[],\n",
"deepspeed=None,\n",
"disable_tqdm=False,\n",
"do_eval=True,\n",
"do_predict=True,\n",
"do_train=False,\n",
"eval_accumulation_steps=None,\n",
"eval_delay=0,\n",
"eval_steps=250,\n",
"evaluation_strategy=steps,\n",
"fp16=False,\n",
"fp16_backend=auto,\n",
"fp16_full_eval=False,\n",
"fp16_opt_level=O1,\n",
"fsdp=[],\n",
"fsdp_min_num_params=0,\n",
"fsdp_transformer_layer_cls_to_wrap=None,\n",
"full_determinism=False,\n",
"gradient_accumulation_steps=1,\n",
"gradient_checkpointing=False,\n",
"greater_is_better=True,\n",
"group_by_length=False,\n",
"half_precision_backend=auto,\n",
"hub_model_id=None,\n",
"hub_private_repo=False,\n",
"hub_strategy=every_save,\n",
"hub_token=<HUB_TOKEN>,\n",
"ignore_data_skip=False,\n",
"include_inputs_for_metrics=False,\n",
"jit_mode_eval=False,\n",
"label_names=None,\n",
"label_smoothing_factor=0.0,\n",
"learning_rate=2e-05,\n",
"length_column_name=length,\n",
"load_best_model_at_end=True,\n",
"local_rank=-1,\n",
"log_level=passive,\n",
"log_level_replica=passive,\n",
"log_on_each_node=True,\n",
"logging_dir=out/t5_results/runs/Feb16_16-52-56_DESKTOP-R7JO8BQ,\n",
"logging_first_step=False,\n",
"logging_nan_inf_filter=True,\n",
"logging_steps=100,\n",
"logging_strategy=steps,\n",
"lr_scheduler_type=linear,\n",
"max_grad_norm=1.0,\n",
"max_steps=2500,\n",
"metric_for_best_model=accuracy,\n",
"mp_parameters=,\n",
"no_cuda=False,\n",
"num_train_epochs=1.0,\n",
"optim=adamw_hf,\n",
"optim_args=None,\n",
"output_dir=out/t5_results,\n",
"overwrite_output_dir=False,\n",
"past_index=-1,\n",
"per_device_eval_batch_size=8,\n",
"per_device_train_batch_size=8,\n",
"prediction_loss_only=False,\n",
"push_to_hub=False,\n",
"push_to_hub_model_id=None,\n",
"push_to_hub_organization=None,\n",
"push_to_hub_token=<PUSH_TO_HUB_TOKEN>,\n",
"ray_scope=last,\n",
"remove_unused_columns=True,\n",
"report_to=[],\n",
"resume_from_checkpoint=None,\n",
"run_name=out/t5_results,\n",
"save_on_each_node=False,\n",
"save_steps=250,\n",
"save_strategy=steps,\n",
"save_total_limit=5,\n",
"seed=42,\n",
"sharded_ddp=[],\n",
"skip_memory_metrics=True,\n",
"tf32=None,\n",
"torch_compile=False,\n",
"torch_compile_backend=None,\n",
"torch_compile_mode=None,\n",
"torchdynamo=None,\n",
"tpu_metrics_debug=False,\n",
"tpu_num_cores=None,\n",
"use_ipex=False,\n",
"use_legacy_prediction_loop=False,\n",
"use_mps_device=False,\n",
"warmup_ratio=0.0,\n",
"warmup_steps=0,\n",
"weight_decay=0.0,\n",
"xpu_backend=None,\n",
")\n",
"02/16/2023 16:52:57 - INFO - __main__ - load a local file for train: data/train.json\n",
"02/16/2023 16:52:57 - INFO - __main__ - load a local file for validation: data/valid.json\n",
"02/16/2023 16:52:57 - INFO - __main__ - load a local file for test: data/test.json\n",
"02/16/2023 16:52:58 - WARNING - datasets.builder - Using custom data configuration default-f6e8039906850c57\n",
"02/16/2023 16:52:58 - INFO - datasets.info - Loading Dataset Infos from /home/jacob/anaconda3/envs/ugp/lib/python3.10/site-packages/datasets/packaged_modules/json\n",
"02/16/2023 16:52:58 - INFO - datasets.builder - Overwrite dataset info from restored data version.\n",
"02/16/2023 16:52:58 - INFO - datasets.info - Loading Dataset info from .cache_training/json/default-f6e8039906850c57/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51\n",
"02/16/2023 16:52:58 - WARNING - datasets.builder - Found cached dataset json (/home/jacob/code/university/uczenie_glebokie_w_przetwarzaniu_tekstu/.cache_training/json/default-f6e8039906850c57/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51)\n",
"02/16/2023 16:52:58 - INFO - datasets.info - Loading Dataset info from /home/jacob/code/university/uczenie_glebokie_w_przetwarzaniu_tekstu/.cache_training/json/default-f6e8039906850c57/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51\n",
"100%|████████████████████████████████████████████| 3/3 [00:00<00:00, 769.41it/s]\n",
"[INFO|configuration_utils.py:658] 2023-02-16 16:52:58,326 >> loading configuration file out/t5/config.json\n",
"[INFO|configuration_utils.py:712] 2023-02-16 16:52:58,327 >> Model config T5Config {\n",
" \"_name_or_path\": \"out/t5\",\n",
" \"architectures\": [\n",
" \"T5ForClassification\"\n",
" ],\n",
" \"d_ff\": 3072,\n",
" \"d_kv\": 64,\n",
" \"d_model\": 768,\n",
" \"decoder_start_token_id\": 0,\n",
" \"dense_act_fn\": \"relu\",\n",
" \"dropout_rate\": 0.1,\n",
" \"eos_token_id\": 1,\n",
" \"feed_forward_proj\": \"relu\",\n",
" \"id2label\": {\n",
" \"0\": 0,\n",
" \"1\": 1,\n",
" \"2\": 2,\n",
" \"3\": 3\n",
" },\n",
" \"initializer_factor\": 1.0,\n",
" \"is_encoder_decoder\": true,\n",
" \"is_gated_act\": false,\n",
" \"label2id\": {\n",
" \"0\": 0,\n",
" \"1\": 1,\n",
" \"2\": 2,\n",
" \"3\": 3\n",
" },\n",
" \"layer_norm_epsilon\": 1e-06,\n",
" \"model_type\": \"t5\",\n",
" \"n_positions\": 512,\n",
" \"num_decoder_layers\": 12,\n",
" \"num_heads\": 12,\n",
" \"num_layers\": 12,\n",
" \"output_past\": true,\n",
" \"pad_token_id\": 0,\n",
" \"relative_attention_max_distance\": 128,\n",
" \"relative_attention_num_buckets\": 32,\n",
" \"task_specific_params\": {\n",
" \"summarization\": {\n",
" \"early_stopping\": true,\n",
" \"length_penalty\": 2.0,\n",
" \"max_length\": 200,\n",
" \"min_length\": 30,\n",
" \"no_repeat_ngram_size\": 3,\n",
" \"num_beams\": 4,\n",
" \"prefix\": \"summarize: \"\n",
" },\n",
" \"translation_en_to_de\": {\n",
" \"early_stopping\": true,\n",
" \"max_length\": 300,\n",
" \"num_beams\": 4,\n",
" \"prefix\": \"translate English to German: \"\n",
" },\n",
" \"translation_en_to_fr\": {\n",
" \"early_stopping\": true,\n",
" \"max_length\": 300,\n",
" \"num_beams\": 4,\n",
" \"prefix\": \"translate English to French: \"\n",
" },\n",
" \"translation_en_to_ro\": {\n",
" \"early_stopping\": true,\n",
" \"max_length\": 300,\n",
" \"num_beams\": 4,\n",
" \"prefix\": \"translate English to Romanian: \"\n",
" }\n",
" },\n",
" \"torch_dtype\": \"float32\",\n",
" \"transformers_version\": \"4.26.1\",\n",
" \"use_cache\": true,\n",
" \"use_hidden_states\": false,\n",
" \"vocab_size\": 32128\n",
"}\n",
"\n",
"[INFO|tokenization_utils_base.py:1800] 2023-02-16 16:52:58,328 >> loading file spiece.model\n",
"[INFO|tokenization_utils_base.py:1800] 2023-02-16 16:52:58,328 >> loading file tokenizer.json\n",
"[INFO|tokenization_utils_base.py:1800] 2023-02-16 16:52:58,328 >> loading file added_tokens.json\n",
"[INFO|tokenization_utils_base.py:1800] 2023-02-16 16:52:58,328 >> loading file special_tokens_map.json\n",
"[INFO|tokenization_utils_base.py:1800] 2023-02-16 16:52:58,328 >> loading file tokenizer_config.json\n",
"02/16/2023 16:52:58 - INFO - __main__ - Using hidden states in model: False\n",
"-------------------------------------------------------- Using hidden: False\n",
"02/16/2023 16:52:58 - INFO - __main__ - Using implementation from class: T5ForClassification\n",
"[INFO|modeling_utils.py:2272] 2023-02-16 16:52:58,375 >> loading weights file out/t5/pytorch_model.bin\n",
"[INFO|modeling_utils.py:2857] 2023-02-16 16:53:00,690 >> All model checkpoint weights were used when initializing T5ForClassification.\n",
"\n",
"[INFO|modeling_utils.py:2865] 2023-02-16 16:53:00,690 >> All the weights of T5ForClassification were initialized from the model checkpoint at out/t5.\n",
"If your task is similar to the task the model of the checkpoint was trained on, you can already use T5ForClassification for predictions without further training.\n",
"T5ForClassification(\n",
" (shared): Embedding(32128, 768)\n",
" (encoder): T5Stack(\n",
" (embed_tokens): Embedding(32128, 768)\n",
" (block): ModuleList(\n",
" (0): T5Block(\n",
" (layer): ModuleList(\n",
" (0): T5LayerSelfAttention(\n",
" (SelfAttention): T5Attention(\n",
" (q): Linear(in_features=768, out_features=768, bias=False)\n",
" (k): Linear(in_features=768, out_features=768, bias=False)\n",
" (v): Linear(in_features=768, out_features=768, bias=False)\n",
" (o): Linear(in_features=768, out_features=768, bias=False)\n",
" (relative_attention_bias): Embedding(32, 12)\n",
" )\n",
" (layer_norm): T5LayerNorm()\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" (1): T5LayerFF(\n",
" (DenseReluDense): T5DenseActDense(\n",
" (wi): Linear(in_features=768, out_features=3072, bias=False)\n",
" (wo): Linear(in_features=3072, out_features=768, bias=False)\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" (act): ReLU()\n",
" )\n",
" (layer_norm): T5LayerNorm()\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" )\n",
" )\n",
" (1): T5Block(\n",
" (layer): ModuleList(\n",
" (0): T5LayerSelfAttention(\n",
" (SelfAttention): T5Attention(\n",
" (q): Linear(in_features=768, out_features=768, bias=False)\n",
" (k): Linear(in_features=768, out_features=768, bias=False)\n",
" (v): Linear(in_features=768, out_features=768, bias=False)\n",
" (o): Linear(in_features=768, out_features=768, bias=False)\n",
" )\n",
" (layer_norm): T5LayerNorm()\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" (1): T5LayerFF(\n",
" (DenseReluDense): T5DenseActDense(\n",
" (wi): Linear(in_features=768, out_features=3072, bias=False)\n",
" (wo): Linear(in_features=3072, out_features=768, bias=False)\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" (act): ReLU()\n",
" )\n",
" (layer_norm): T5LayerNorm()\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" )\n",
" )\n",
" (2): T5Block(\n",
" (layer): ModuleList(\n",
" (0): T5LayerSelfAttention(\n",
" (SelfAttention): T5Attention(\n",
" (q): Linear(in_features=768, out_features=768, bias=False)\n",
" (k): Linear(in_features=768, out_features=768, bias=False)\n",
" (v): Linear(in_features=768, out_features=768, bias=False)\n",
" (o): Linear(in_features=768, out_features=768, bias=False)\n",
" )\n",
" (layer_norm): T5LayerNorm()\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" (1): T5LayerFF(\n",
" (DenseReluDense): T5DenseActDense(\n",
" (wi): Linear(in_features=768, out_features=3072, bias=False)\n",
" (wo): Linear(in_features=3072, out_features=768, bias=False)\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" (act): ReLU()\n",
" )\n",
" (layer_norm): T5LayerNorm()\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" )\n",
" )\n",
" (3): T5Block(\n",
" (layer): ModuleList(\n",
" (0): T5LayerSelfAttention(\n",
" (SelfAttention): T5Attention(\n",
" (q): Linear(in_features=768, out_features=768, bias=False)\n",
" (k): Linear(in_features=768, out_features=768, bias=False)\n",
" (v): Linear(in_features=768, out_features=768, bias=False)\n",
" (o): Linear(in_features=768, out_features=768, bias=False)\n",
" )\n",
" (layer_norm): T5LayerNorm()\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" (1): T5LayerFF(\n",
" (DenseReluDense): T5DenseActDense(\n",
" (wi): Linear(in_features=768, out_features=3072, bias=False)\n",
" (wo): Linear(in_features=3072, out_features=768, bias=False)\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" (act): ReLU()\n",
" )\n",
" (layer_norm): T5LayerNorm()\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" )\n",
" )\n",
" (4): T5Block(\n",
" (layer): ModuleList(\n",
" (0): T5LayerSelfAttention(\n",
" (SelfAttention): T5Attention(\n",
" (q): Linear(in_features=768, out_features=768, bias=False)\n",
" (k): Linear(in_features=768, out_features=768, bias=False)\n",
" (v): Linear(in_features=768, out_features=768, bias=False)\n",
" (o): Linear(in_features=768, out_features=768, bias=False)\n",
" )\n",
" (layer_norm): T5LayerNorm()\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" (1): T5LayerFF(\n",
" (DenseReluDense): T5DenseActDense(\n",
" (wi): Linear(in_features=768, out_features=3072, bias=False)\n",
" (wo): Linear(in_features=3072, out_features=768, bias=False)\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" (act): ReLU()\n",
" )\n",
" (layer_norm): T5LayerNorm()\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" )\n",
" )\n",
" (5): T5Block(\n",
" (layer): ModuleList(\n",
" (0): T5LayerSelfAttention(\n",
" (SelfAttention): T5Attention(\n",
" (q): Linear(in_features=768, out_features=768, bias=False)\n",
" (k): Linear(in_features=768, out_features=768, bias=False)\n",
" (v): Linear(in_features=768, out_features=768, bias=False)\n",
" (o): Linear(in_features=768, out_features=768, bias=False)\n",
" )\n",
" (layer_norm): T5LayerNorm()\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" (1): T5LayerFF(\n",
" (DenseReluDense): T5DenseActDense(\n",
" (wi): Linear(in_features=768, out_features=3072, bias=False)\n",
" (wo): Linear(in_features=3072, out_features=768, bias=False)\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" (act): ReLU()\n",
" )\n",
" (layer_norm): T5LayerNorm()\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" )\n",
" )\n",
" (6): T5Block(\n",
" (layer): ModuleList(\n",
" (0): T5LayerSelfAttention(\n",
" (SelfAttention): T5Attention(\n",
" (q): Linear(in_features=768, out_features=768, bias=False)\n",
" (k): Linear(in_features=768, out_features=768, bias=False)\n",
" (v): Linear(in_features=768, out_features=768, bias=False)\n",
" (o): Linear(in_features=768, out_features=768, bias=False)\n",
" )\n",
" (layer_norm): T5LayerNorm()\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" (1): T5LayerFF(\n",
" (DenseReluDense): T5DenseActDense(\n",
" (wi): Linear(in_features=768, out_features=3072, bias=False)\n",
" (wo): Linear(in_features=3072, out_features=768, bias=False)\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" (act): ReLU()\n",
" )\n",
" (layer_norm): T5LayerNorm()\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" )\n",
" )\n",
" (7): T5Block(\n",
" (layer): ModuleList(\n",
" (0): T5LayerSelfAttention(\n",
" (SelfAttention): T5Attention(\n",
" (q): Linear(in_features=768, out_features=768, bias=False)\n",
" (k): Linear(in_features=768, out_features=768, bias=False)\n",
" (v): Linear(in_features=768, out_features=768, bias=False)\n",
" (o): Linear(in_features=768, out_features=768, bias=False)\n",
" )\n",
" (layer_norm): T5LayerNorm()\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" (1): T5LayerFF(\n",
" (DenseReluDense): T5DenseActDense(\n",
" (wi): Linear(in_features=768, out_features=3072, bias=False)\n",
" (wo): Linear(in_features=3072, out_features=768, bias=False)\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" (act): ReLU()\n",
" )\n",
" (layer_norm): T5LayerNorm()\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" )\n",
" )\n",
" (8): T5Block(\n",
" (layer): ModuleList(\n",
" (0): T5LayerSelfAttention(\n",
" (SelfAttention): T5Attention(\n",
" (q): Linear(in_features=768, out_features=768, bias=False)\n",
" (k): Linear(in_features=768, out_features=768, bias=False)\n",
" (v): Linear(in_features=768, out_features=768, bias=False)\n",
" (o): Linear(in_features=768, out_features=768, bias=False)\n",
" )\n",
" (layer_norm): T5LayerNorm()\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" (1): T5LayerFF(\n",
" (DenseReluDense): T5DenseActDense(\n",
" (wi): Linear(in_features=768, out_features=3072, bias=False)\n",
" (wo): Linear(in_features=3072, out_features=768, bias=False)\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" (act): ReLU()\n",
" )\n",
" (layer_norm): T5LayerNorm()\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" )\n",
" )\n",
" (9): T5Block(\n",
" (layer): ModuleList(\n",
" (0): T5LayerSelfAttention(\n",
" (SelfAttention): T5Attention(\n",
" (q): Linear(in_features=768, out_features=768, bias=False)\n",
" (k): Linear(in_features=768, out_features=768, bias=False)\n",
" (v): Linear(in_features=768, out_features=768, bias=False)\n",
" (o): Linear(in_features=768, out_features=768, bias=False)\n",
" )\n",
" (layer_norm): T5LayerNorm()\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" (1): T5LayerFF(\n",
" (DenseReluDense): T5DenseActDense(\n",
" (wi): Linear(in_features=768, out_features=3072, bias=False)\n",
" (wo): Linear(in_features=3072, out_features=768, bias=False)\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" (act): ReLU()\n",
" )\n",
" (layer_norm): T5LayerNorm()\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" )\n",
" )\n",
" (10): T5Block(\n",
" (layer): ModuleList(\n",
" (0): T5LayerSelfAttention(\n",
" (SelfAttention): T5Attention(\n",
" (q): Linear(in_features=768, out_features=768, bias=False)\n",
" (k): Linear(in_features=768, out_features=768, bias=False)\n",
" (v): Linear(in_features=768, out_features=768, bias=False)\n",
" (o): Linear(in_features=768, out_features=768, bias=False)\n",
" )\n",
" (layer_norm): T5LayerNorm()\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" (1): T5LayerFF(\n",
" (DenseReluDense): T5DenseActDense(\n",
" (wi): Linear(in_features=768, out_features=3072, bias=False)\n",
" (wo): Linear(in_features=3072, out_features=768, bias=False)\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" (act): ReLU()\n",
" )\n",
" (layer_norm): T5LayerNorm()\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" )\n",
" )\n",
" (11): T5Block(\n",
" (layer): ModuleList(\n",
" (0): T5LayerSelfAttention(\n",
" (SelfAttention): T5Attention(\n",
" (q): Linear(in_features=768, out_features=768, bias=False)\n",
" (k): Linear(in_features=768, out_features=768, bias=False)\n",
" (v): Linear(in_features=768, out_features=768, bias=False)\n",
" (o): Linear(in_features=768, out_features=768, bias=False)\n",
" )\n",
" (layer_norm): T5LayerNorm()\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" (1): T5LayerFF(\n",
" (DenseReluDense): T5DenseActDense(\n",
" (wi): Linear(in_features=768, out_features=3072, bias=False)\n",
" (wo): Linear(in_features=3072, out_features=768, bias=False)\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" (act): ReLU()\n",
" )\n",
" (layer_norm): T5LayerNorm()\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" )\n",
" )\n",
" )\n",
" (final_layer_norm): T5LayerNorm()\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" (decoder): T5Stack(\n",
" (embed_tokens): Embedding(32128, 768)\n",
" (block): ModuleList(\n",
" (0): T5Block(\n",
" (layer): ModuleList(\n",
" (0): T5LayerSelfAttention(\n",
" (SelfAttention): T5Attention(\n",
" (q): Linear(in_features=768, out_features=768, bias=False)\n",
" (k): Linear(in_features=768, out_features=768, bias=False)\n",
" (v): Linear(in_features=768, out_features=768, bias=False)\n",
" (o): Linear(in_features=768, out_features=768, bias=False)\n",
" (relative_attention_bias): Embedding(32, 12)\n",
" )\n",
" (layer_norm): T5LayerNorm()\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" (1): T5LayerCrossAttention(\n",
" (EncDecAttention): T5Attention(\n",
" (q): Linear(in_features=768, out_features=768, bias=False)\n",
" (k): Linear(in_features=768, out_features=768, bias=False)\n",
" (v): Linear(in_features=768, out_features=768, bias=False)\n",
" (o): Linear(in_features=768, out_features=768, bias=False)\n",
" )\n",
" (layer_norm): T5LayerNorm()\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" (2): T5LayerFF(\n",
" (DenseReluDense): T5DenseActDense(\n",
" (wi): Linear(in_features=768, out_features=3072, bias=False)\n",
" (wo): Linear(in_features=3072, out_features=768, bias=False)\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" (act): ReLU()\n",
" )\n",
" (layer_norm): T5LayerNorm()\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" )\n",
" )\n",
" (1): T5Block(\n",
" (layer): ModuleList(\n",
" (0): T5LayerSelfAttention(\n",
" (SelfAttention): T5Attention(\n",
" (q): Linear(in_features=768, out_features=768, bias=False)\n",
" (k): Linear(in_features=768, out_features=768, bias=False)\n",
" (v): Linear(in_features=768, out_features=768, bias=False)\n",
" (o): Linear(in_features=768, out_features=768, bias=False)\n",
" )\n",
" (layer_norm): T5LayerNorm()\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" (1): T5LayerCrossAttention(\n",
" (EncDecAttention): T5Attention(\n",
" (q): Linear(in_features=768, out_features=768, bias=False)\n",
" (k): Linear(in_features=768, out_features=768, bias=False)\n",
" (v): Linear(in_features=768, out_features=768, bias=False)\n",
" (o): Linear(in_features=768, out_features=768, bias=False)\n",
" )\n",
" (layer_norm): T5LayerNorm()\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" (2): T5LayerFF(\n",
" (DenseReluDense): T5DenseActDense(\n",
" (wi): Linear(in_features=768, out_features=3072, bias=False)\n",
" (wo): Linear(in_features=3072, out_features=768, bias=False)\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" (act): ReLU()\n",
" )\n",
" (layer_norm): T5LayerNorm()\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" )\n",
" )\n",
" (2): T5Block(\n",
" (layer): ModuleList(\n",
" (0): T5LayerSelfAttention(\n",
" (SelfAttention): T5Attention(\n",
" (q): Linear(in_features=768, out_features=768, bias=False)\n",
" (k): Linear(in_features=768, out_features=768, bias=False)\n",
" (v): Linear(in_features=768, out_features=768, bias=False)\n",
" (o): Linear(in_features=768, out_features=768, bias=False)\n",
" )\n",
" (layer_norm): T5LayerNorm()\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" (1): T5LayerCrossAttention(\n",
" (EncDecAttention): T5Attention(\n",
" (q): Linear(in_features=768, out_features=768, bias=False)\n",
" (k): Linear(in_features=768, out_features=768, bias=False)\n",
" (v): Linear(in_features=768, out_features=768, bias=False)\n",
" (o): Linear(in_features=768, out_features=768, bias=False)\n",
" )\n",
" (layer_norm): T5LayerNorm()\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" (2): T5LayerFF(\n",
" (DenseReluDense): T5DenseActDense(\n",
" (wi): Linear(in_features=768, out_features=3072, bias=False)\n",
" (wo): Linear(in_features=3072, out_features=768, bias=False)\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" (act): ReLU()\n",
" )\n",
" (layer_norm): T5LayerNorm()\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" )\n",
" )\n",
" (3): T5Block(\n",
" (layer): ModuleList(\n",
" (0): T5LayerSelfAttention(\n",
" (SelfAttention): T5Attention(\n",
" (q): Linear(in_features=768, out_features=768, bias=False)\n",
" (k): Linear(in_features=768, out_features=768, bias=False)\n",
" (v): Linear(in_features=768, out_features=768, bias=False)\n",
" (o): Linear(in_features=768, out_features=768, bias=False)\n",
" )\n",
" (layer_norm): T5LayerNorm()\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" (1): T5LayerCrossAttention(\n",
" (EncDecAttention): T5Attention(\n",
" (q): Linear(in_features=768, out_features=768, bias=False)\n",
" (k): Linear(in_features=768, out_features=768, bias=False)\n",
" (v): Linear(in_features=768, out_features=768, bias=False)\n",
" (o): Linear(in_features=768, out_features=768, bias=False)\n",
" )\n",
" (layer_norm): T5LayerNorm()\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" (2): T5LayerFF(\n",
" (DenseReluDense): T5DenseActDense(\n",
" (wi): Linear(in_features=768, out_features=3072, bias=False)\n",
" (wo): Linear(in_features=3072, out_features=768, bias=False)\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" (act): ReLU()\n",
" )\n",
" (layer_norm): T5LayerNorm()\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" )\n",
" )\n",
" (4): T5Block(\n",
" (layer): ModuleList(\n",
" (0): T5LayerSelfAttention(\n",
" (SelfAttention): T5Attention(\n",
" (q): Linear(in_features=768, out_features=768, bias=False)\n",
" (k): Linear(in_features=768, out_features=768, bias=False)\n",
" (v): Linear(in_features=768, out_features=768, bias=False)\n",
" (o): Linear(in_features=768, out_features=768, bias=False)\n",
" )\n",
" (layer_norm): T5LayerNorm()\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" (1): T5LayerCrossAttention(\n",
" (EncDecAttention): T5Attention(\n",
" (q): Linear(in_features=768, out_features=768, bias=False)\n",
" (k): Linear(in_features=768, out_features=768, bias=False)\n",
" (v): Linear(in_features=768, out_features=768, bias=False)\n",
" (o): Linear(in_features=768, out_features=768, bias=False)\n",
" )\n",
" (layer_norm): T5LayerNorm()\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" (2): T5LayerFF(\n",
" (DenseReluDense): T5DenseActDense(\n",
" (wi): Linear(in_features=768, out_features=3072, bias=False)\n",
" (wo): Linear(in_features=3072, out_features=768, bias=False)\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" (act): ReLU()\n",
" )\n",
" (layer_norm): T5LayerNorm()\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" )\n",
" )\n",
" (5): T5Block(\n",
" (layer): ModuleList(\n",
" (0): T5LayerSelfAttention(\n",
" (SelfAttention): T5Attention(\n",
" (q): Linear(in_features=768, out_features=768, bias=False)\n",
" (k): Linear(in_features=768, out_features=768, bias=False)\n",
" (v): Linear(in_features=768, out_features=768, bias=False)\n",
" (o): Linear(in_features=768, out_features=768, bias=False)\n",
" )\n",
" (layer_norm): T5LayerNorm()\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" (1): T5LayerCrossAttention(\n",
" (EncDecAttention): T5Attention(\n",
" (q): Linear(in_features=768, out_features=768, bias=False)\n",
" (k): Linear(in_features=768, out_features=768, bias=False)\n",
" (v): Linear(in_features=768, out_features=768, bias=False)\n",
" (o): Linear(in_features=768, out_features=768, bias=False)\n",
" )\n",
" (layer_norm): T5LayerNorm()\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" (2): T5LayerFF(\n",
" (DenseReluDense): T5DenseActDense(\n",
" (wi): Linear(in_features=768, out_features=3072, bias=False)\n",
" (wo): Linear(in_features=3072, out_features=768, bias=False)\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" (act): ReLU()\n",
" )\n",
" (layer_norm): T5LayerNorm()\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" )\n",
" )\n",
" (6): T5Block(\n",
" (layer): ModuleList(\n",
" (0): T5LayerSelfAttention(\n",
" (SelfAttention): T5Attention(\n",
" (q): Linear(in_features=768, out_features=768, bias=False)\n",
" (k): Linear(in_features=768, out_features=768, bias=False)\n",
" (v): Linear(in_features=768, out_features=768, bias=False)\n",
" (o): Linear(in_features=768, out_features=768, bias=False)\n",
" )\n",
" (layer_norm): T5LayerNorm()\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" (1): T5LayerCrossAttention(\n",
" (EncDecAttention): T5Attention(\n",
" (q): Linear(in_features=768, out_features=768, bias=False)\n",
" (k): Linear(in_features=768, out_features=768, bias=False)\n",
" (v): Linear(in_features=768, out_features=768, bias=False)\n",
" (o): Linear(in_features=768, out_features=768, bias=False)\n",
" )\n",
" (layer_norm): T5LayerNorm()\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" (2): T5LayerFF(\n",
" (DenseReluDense): T5DenseActDense(\n",
" (wi): Linear(in_features=768, out_features=3072, bias=False)\n",
" (wo): Linear(in_features=3072, out_features=768, bias=False)\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" (act): ReLU()\n",
" )\n",
" (layer_norm): T5LayerNorm()\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" )\n",
" )\n",
" (7): T5Block(\n",
" (layer): ModuleList(\n",
" (0): T5LayerSelfAttention(\n",
" (SelfAttention): T5Attention(\n",
" (q): Linear(in_features=768, out_features=768, bias=False)\n",
" (k): Linear(in_features=768, out_features=768, bias=False)\n",
" (v): Linear(in_features=768, out_features=768, bias=False)\n",
" (o): Linear(in_features=768, out_features=768, bias=False)\n",
" )\n",
" (layer_norm): T5LayerNorm()\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" (1): T5LayerCrossAttention(\n",
" (EncDecAttention): T5Attention(\n",
" (q): Linear(in_features=768, out_features=768, bias=False)\n",
" (k): Linear(in_features=768, out_features=768, bias=False)\n",
" (v): Linear(in_features=768, out_features=768, bias=False)\n",
" (o): Linear(in_features=768, out_features=768, bias=False)\n",
" )\n",
" (layer_norm): T5LayerNorm()\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" (2): T5LayerFF(\n",
" (DenseReluDense): T5DenseActDense(\n",
" (wi): Linear(in_features=768, out_features=3072, bias=False)\n",
" (wo): Linear(in_features=3072, out_features=768, bias=False)\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" (act): ReLU()\n",
" )\n",
" (layer_norm): T5LayerNorm()\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" )\n",
" )\n",
" (8): T5Block(\n",
" (layer): ModuleList(\n",
" (0): T5LayerSelfAttention(\n",
" (SelfAttention): T5Attention(\n",
" (q): Linear(in_features=768, out_features=768, bias=False)\n",
" (k): Linear(in_features=768, out_features=768, bias=False)\n",
" (v): Linear(in_features=768, out_features=768, bias=False)\n",
" (o): Linear(in_features=768, out_features=768, bias=False)\n",
" )\n",
" (layer_norm): T5LayerNorm()\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" (1): T5LayerCrossAttention(\n",
" (EncDecAttention): T5Attention(\n",
" (q): Linear(in_features=768, out_features=768, bias=False)\n",
" (k): Linear(in_features=768, out_features=768, bias=False)\n",
" (v): Linear(in_features=768, out_features=768, bias=False)\n",
" (o): Linear(in_features=768, out_features=768, bias=False)\n",
" )\n",
" (layer_norm): T5LayerNorm()\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" (2): T5LayerFF(\n",
" (DenseReluDense): T5DenseActDense(\n",
" (wi): Linear(in_features=768, out_features=3072, bias=False)\n",
" (wo): Linear(in_features=3072, out_features=768, bias=False)\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" (act): ReLU()\n",
" )\n",
" (layer_norm): T5LayerNorm()\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" )\n",
" )\n",
" (9): T5Block(\n",
" (layer): ModuleList(\n",
" (0): T5LayerSelfAttention(\n",
" (SelfAttention): T5Attention(\n",
" (q): Linear(in_features=768, out_features=768, bias=False)\n",
" (k): Linear(in_features=768, out_features=768, bias=False)\n",
" (v): Linear(in_features=768, out_features=768, bias=False)\n",
" (o): Linear(in_features=768, out_features=768, bias=False)\n",
" )\n",
" (layer_norm): T5LayerNorm()\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" (1): T5LayerCrossAttention(\n",
" (EncDecAttention): T5Attention(\n",
" (q): Linear(in_features=768, out_features=768, bias=False)\n",
" (k): Linear(in_features=768, out_features=768, bias=False)\n",
" (v): Linear(in_features=768, out_features=768, bias=False)\n",
" (o): Linear(in_features=768, out_features=768, bias=False)\n",
" )\n",
" (layer_norm): T5LayerNorm()\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" (2): T5LayerFF(\n",
" (DenseReluDense): T5DenseActDense(\n",
" (wi): Linear(in_features=768, out_features=3072, bias=False)\n",
" (wo): Linear(in_features=3072, out_features=768, bias=False)\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" (act): ReLU()\n",
" )\n",
" (layer_norm): T5LayerNorm()\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" )\n",
" )\n",
" (10): T5Block(\n",
" (layer): ModuleList(\n",
" (0): T5LayerSelfAttention(\n",
" (SelfAttention): T5Attention(\n",
" (q): Linear(in_features=768, out_features=768, bias=False)\n",
" (k): Linear(in_features=768, out_features=768, bias=False)\n",
" (v): Linear(in_features=768, out_features=768, bias=False)\n",
" (o): Linear(in_features=768, out_features=768, bias=False)\n",
" )\n",
" (layer_norm): T5LayerNorm()\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" (1): T5LayerCrossAttention(\n",
" (EncDecAttention): T5Attention(\n",
" (q): Linear(in_features=768, out_features=768, bias=False)\n",
" (k): Linear(in_features=768, out_features=768, bias=False)\n",
" (v): Linear(in_features=768, out_features=768, bias=False)\n",
" (o): Linear(in_features=768, out_features=768, bias=False)\n",
" )\n",
" (layer_norm): T5LayerNorm()\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" (2): T5LayerFF(\n",
" (DenseReluDense): T5DenseActDense(\n",
" (wi): Linear(in_features=768, out_features=3072, bias=False)\n",
" (wo): Linear(in_features=3072, out_features=768, bias=False)\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" (act): ReLU()\n",
" )\n",
" (layer_norm): T5LayerNorm()\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" )\n",
" )\n",
" (11): T5Block(\n",
" (layer): ModuleList(\n",
" (0): T5LayerSelfAttention(\n",
" (SelfAttention): T5Attention(\n",
" (q): Linear(in_features=768, out_features=768, bias=False)\n",
" (k): Linear(in_features=768, out_features=768, bias=False)\n",
" (v): Linear(in_features=768, out_features=768, bias=False)\n",
" (o): Linear(in_features=768, out_features=768, bias=False)\n",
" )\n",
" (layer_norm): T5LayerNorm()\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" (1): T5LayerCrossAttention(\n",
" (EncDecAttention): T5Attention(\n",
" (q): Linear(in_features=768, out_features=768, bias=False)\n",
" (k): Linear(in_features=768, out_features=768, bias=False)\n",
" (v): Linear(in_features=768, out_features=768, bias=False)\n",
" (o): Linear(in_features=768, out_features=768, bias=False)\n",
" )\n",
" (layer_norm): T5LayerNorm()\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" (2): T5LayerFF(\n",
" (DenseReluDense): T5DenseActDense(\n",
" (wi): Linear(in_features=768, out_features=3072, bias=False)\n",
" (wo): Linear(in_features=3072, out_features=768, bias=False)\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" (act): ReLU()\n",
" )\n",
" (layer_norm): T5LayerNorm()\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" )\n",
" )\n",
" )\n",
" (final_layer_norm): T5LayerNorm()\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" (lm_head): T5ClassificationHead(\n",
" (dense_in): Linear(in_features=768, out_features=768, bias=True)\n",
" (dense): Linear(in_features=768, out_features=768, bias=True)\n",
" (dense_out): Linear(in_features=768, out_features=4, bias=True)\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
")\n",
"Running tokenizer on dataset: 0%| | 0/120 [00:00<?, ?ba/s]02/16/2023 16:53:00 - INFO - datasets.arrow_dataset - Caching processed dataset at /home/jacob/code/university/uczenie_glebokie_w_przetwarzaniu_tekstu/.cache_training/json/default-f6e8039906850c57/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51/cache-461127b59c7ea04e.arrow\n",
"Running tokenizer on dataset: 100%|███████████| 120/120 [00:08<00:00, 14.36ba/s]\n",
"Running tokenizer on dataset: 0%| | 0/4 [00:00<?, ?ba/s]02/16/2023 16:53:09 - INFO - datasets.arrow_dataset - Caching processed dataset at /home/jacob/code/university/uczenie_glebokie_w_przetwarzaniu_tekstu/.cache_training/json/default-f6e8039906850c57/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51/cache-bbee377e7bea95e7.arrow\n",
"Running tokenizer on dataset: 100%|███████████████| 4/4 [00:00<00:00, 15.94ba/s]\n",
"Running tokenizer on dataset: 0%| | 0/4 [00:00<?, ?ba/s]02/16/2023 16:53:09 - INFO - datasets.arrow_dataset - Caching processed dataset at /home/jacob/code/university/uczenie_glebokie_w_przetwarzaniu_tekstu/.cache_training/json/default-f6e8039906850c57/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51/cache-4e0cbdadca2e6dc6.arrow\n",
"Running tokenizer on dataset: 100%|███████████████| 4/4 [00:00<00:00, 16.87ba/s]\n",
"02/16/2023 16:53:10 - INFO - __main__ - Set 500 samples for 0-class\n",
"02/16/2023 16:53:10 - INFO - __main__ - Set 500 samples for 1-class\n",
"02/16/2023 16:53:10 - INFO - __main__ - Set 500 samples for 2-class\n",
"02/16/2023 16:53:10 - INFO - __main__ - Set 500 samples for 3-class\n",
"[INFO|trainer.py:511] 2023-02-16 16:53:12,738 >> max_steps is given, it will override any value given in num_train_epochs\n",
"02/16/2023 16:53:12 - INFO - __main__ - *** Evaluate ***\n",
"[INFO|trainer.py:710] 2023-02-16 16:53:12,739 >> The following columns in the evaluation set don't have a corresponding argument in `T5ForClassification.forward` and have been ignored: text. If text are not expected by `T5ForClassification.forward`, you can safely ignore this message.\n",
"[INFO|trainer.py:2964] 2023-02-16 16:53:12,740 >> ***** Running Evaluation *****\n",
"[INFO|trainer.py:2966] 2023-02-16 16:53:12,740 >> Num examples = 2000\n",
"[INFO|trainer.py:2969] 2023-02-16 16:53:12,740 >> Batch size = 8\n",
"100%|█████████████████████████████████████████| 250/250 [00:39<00:00, 6.26it/s]\n",
"***** eval metrics *****\n",
" eval_accuracy = 0.4675\n",
" eval_loss = 1.2139\n",
" eval_runtime = 0:00:40.56\n",
" eval_samples = 2000\n",
" eval_samples_per_second = 49.303\n",
" eval_steps_per_second = 6.163\n",
"02/16/2023 16:53:53 - INFO - __main__ - *** Predict ***\n",
"[INFO|trainer.py:710] 2023-02-16 16:53:53,307 >> The following columns in the test set don't have a corresponding argument in `T5ForClassification.forward` and have been ignored: text. If text are not expected by `T5ForClassification.forward`, you can safely ignore this message.\n",
"[INFO|trainer.py:2964] 2023-02-16 16:53:53,308 >> ***** Running Prediction *****\n",
"[INFO|trainer.py:2966] 2023-02-16 16:53:53,308 >> Num examples = 3800\n",
"[INFO|trainer.py:2969] 2023-02-16 16:53:53,308 >> Batch size = 8\n",
"100%|█████████████████████████████████████████| 475/475 [01:15<00:00, 6.32it/s]\n",
"02/16/2023 16:55:08 - INFO - __main__ - ***** Predict results None *****\n",
"[INFO|modelcard.py:449] 2023-02-16 16:55:09,179 >> Dropping the following result as it does not have all the necessary fields:\n",
"{'task': {'name': 'Text Classification', 'type': 'text-classification'}}\n"
]
}
],
"source": [
"!python run_glue.py \\\n",
" --cache_dir .cache_training \\\n",
" --model_name_or_path out/t5 \\\n",
" --custom_model t5_custom \\\n",
" --train_file data/train.json \\\n",
" --validation_file data/valid.json \\\n",
" --test_file data/test.json \\\n",
" --per_device_train_batch_size 8 \\\n",
" --per_device_eval_batch_size 8 \\\n",
" --do_eval \\\n",
" --do_predict \\\n",
" --max_seq_length 128 \\\n",
" --learning_rate 2e-5 \\\n",
" --max_eval_samples 2000 \\\n",
" --max_steps 2500 \\\n",
" --num_train_epochs 1 \\\n",
" --save_strategy steps \\\n",
" --save_steps 250 \\\n",
" --save_total_limit 5 \\\n",
" --logging_strategy steps \\\n",
" --logging_steps 100 \\\n",
" --eval_steps 250 \\\n",
" --evaluation_strategy steps \\\n",
" --metric_for_best_model accuracy \\\n",
" --greater_is_better True \\\n",
" --load_best_model_at_end True \\\n",
" --output_dir out/t5_results"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"## Result"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\u001b[0;39m0.4675000011920929\u001b[0m\n"
]
}
],
"source": [
"!cat out/t5_results/eval_results.json | jq .eval_accuracy"
]
},
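  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "The same value can be read without `jq`; a minimal sketch in Python, assuming the evaluation above wrote `out/t5_results/eval_results.json`:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import json\n",
    "\n",
    "# Read the metrics file written by run_glue.py and print the accuracy\n",
    "with open('out/t5_results/eval_results.json') as f:\n",
    "    eval_results = json.load(f)\n",
    "print(eval_results['eval_accuracy'])"
   ]
  },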
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"# Bart - Zero shot"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"## Code"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "8de84b2cf8ed46488a6eb0bb4e0b11ef",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"Downloading (…)lve/main/config.json: 0%| | 0.00/1.40k [00:00<?, ?B/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "a0821410f9c64d608250175972c7e65e",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"Downloading (…)\"pytorch_model.bin\";: 0%| | 0.00/990M [00:00<?, ?B/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "10ea3442bf2e4af88050e6b6bf9ced14",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"Downloading (…)neration_config.json: 0%| | 0.00/147 [00:00<?, ?B/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "aef52d2ec9594d21a1e328b8cd9b78e4",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"Downloading (…)okenizer_config.json: 0%| | 0.00/2.54k [00:00<?, ?B/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "68abdd279b314c9794d8d7c697f534cd",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"Downloading (…)\"spiece.model\";: 0%| | 0.00/792k [00:00<?, ?B/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "ee44259445634805b86f28a00817e036",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"Downloading (…)/main/tokenizer.json: 0%| | 0.00/2.42M [00:00<?, ?B/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "5a7a9a0ca77d4cfba7132ee76fec44e7",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"Downloading (…)cial_tokens_map.json: 0%| | 0.00/2.20k [00:00<?, ?B/s]"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, pipeline\n",
"from datasets import load_dataset\n",
"from tqdm.notebook import tqdm\n",
"model = AutoModelForSeq2SeqLM.from_pretrained(\"google/flan-t5-base\")\n",
"tokenizer = AutoTokenizer.from_pretrained(\"google/flan-t5-base\")\n",
"\n",
"pipeline = pipeline(\"text2text-generation\", model=model, tokenizer=tokenizer)"
]
},
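  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "A quick smoke test of the prompt format used for zero-shot classification below; a minimal sketch, and the headline text is a made-up example:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Hypothetical headline, used only to check that the model answers with a bare label\n",
    "sample_prompt = (\n",
    "    'classify with possible labels: sport, world, business, scitech\\n'\n",
    "    'text: The championship final was decided in extra time.'\n",
    ")\n",
    "print(generator(sample_prompt, do_sample=False)[0]['generated_text'])"
   ]
  },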
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"Using custom data configuration default-20e4aa4ef5e587fb\n",
"Found cached dataset json (/home/jacob/.cache/huggingface/datasets/json/default-20e4aa4ef5e587fb/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51)\n"
]
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "575936fbea7d4ceabb455ed732bace7e",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
" 0%| | 0/1 [00:00<?, ?it/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"Loading cached processed dataset at /home/jacob/.cache/huggingface/datasets/json/default-20e4aa4ef5e587fb/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51/cache-6a7c4b64ea03ea9d.arrow\n"
]
}
],
"source": [
"MAP_LABEL_TRANSLATION = {\n",
" 0: 'world',\n",
" 1: 'sport',\n",
" 2: 'business',\n",
" 3: 'scitech'\n",
"}\n",
"dataset = load_dataset(\"json\", data_files={'test': 'data/test.json'})\n",
"\n",
"dataset['test'] = dataset['test'].map(lambda x: { 'label': MAP_LABEL_TRANSLATION[x['label']], 'text': x['text']})"
]
},
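  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "A small sanity check that the integer labels were remapped to the text labels the generative model is expected to produce:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# The label of the first test example should now be one of: world, sport, business, scitech\n",
    "print(dataset['test'][0]['label'])"
   ]
  },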
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"## Model"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"T5ForConditionalGeneration(\n",
" (shared): Embedding(32128, 768)\n",
" (encoder): T5Stack(\n",
" (embed_tokens): Embedding(32128, 768)\n",
" (block): ModuleList(\n",
" (0): T5Block(\n",
" (layer): ModuleList(\n",
" (0): T5LayerSelfAttention(\n",
" (SelfAttention): T5Attention(\n",
" (q): Linear(in_features=768, out_features=768, bias=False)\n",
" (k): Linear(in_features=768, out_features=768, bias=False)\n",
" (v): Linear(in_features=768, out_features=768, bias=False)\n",
" (o): Linear(in_features=768, out_features=768, bias=False)\n",
" (relative_attention_bias): Embedding(32, 12)\n",
" )\n",
" (layer_norm): T5LayerNorm()\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" (1): T5LayerFF(\n",
" (DenseReluDense): T5DenseGatedActDense(\n",
" (wi_0): Linear(in_features=768, out_features=2048, bias=False)\n",
" (wi_1): Linear(in_features=768, out_features=2048, bias=False)\n",
" (wo): Linear(in_features=2048, out_features=768, bias=False)\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" (act): NewGELUActivation()\n",
" )\n",
" (layer_norm): T5LayerNorm()\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" )\n",
" )\n",
" (1): T5Block(\n",
" (layer): ModuleList(\n",
" (0): T5LayerSelfAttention(\n",
" (SelfAttention): T5Attention(\n",
" (q): Linear(in_features=768, out_features=768, bias=False)\n",
" (k): Linear(in_features=768, out_features=768, bias=False)\n",
" (v): Linear(in_features=768, out_features=768, bias=False)\n",
" (o): Linear(in_features=768, out_features=768, bias=False)\n",
" )\n",
" (layer_norm): T5LayerNorm()\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" (1): T5LayerFF(\n",
" (DenseReluDense): T5DenseGatedActDense(\n",
" (wi_0): Linear(in_features=768, out_features=2048, bias=False)\n",
" (wi_1): Linear(in_features=768, out_features=2048, bias=False)\n",
" (wo): Linear(in_features=2048, out_features=768, bias=False)\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" (act): NewGELUActivation()\n",
" )\n",
" (layer_norm): T5LayerNorm()\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" )\n",
" )\n",
" (2): T5Block(\n",
" (layer): ModuleList(\n",
" (0): T5LayerSelfAttention(\n",
" (SelfAttention): T5Attention(\n",
" (q): Linear(in_features=768, out_features=768, bias=False)\n",
" (k): Linear(in_features=768, out_features=768, bias=False)\n",
" (v): Linear(in_features=768, out_features=768, bias=False)\n",
" (o): Linear(in_features=768, out_features=768, bias=False)\n",
" )\n",
" (layer_norm): T5LayerNorm()\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" (1): T5LayerFF(\n",
" (DenseReluDense): T5DenseGatedActDense(\n",
" (wi_0): Linear(in_features=768, out_features=2048, bias=False)\n",
" (wi_1): Linear(in_features=768, out_features=2048, bias=False)\n",
" (wo): Linear(in_features=2048, out_features=768, bias=False)\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" (act): NewGELUActivation()\n",
" )\n",
" (layer_norm): T5LayerNorm()\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" )\n",
" )\n",
" (3): T5Block(\n",
" (layer): ModuleList(\n",
" (0): T5LayerSelfAttention(\n",
" (SelfAttention): T5Attention(\n",
" (q): Linear(in_features=768, out_features=768, bias=False)\n",
" (k): Linear(in_features=768, out_features=768, bias=False)\n",
" (v): Linear(in_features=768, out_features=768, bias=False)\n",
" (o): Linear(in_features=768, out_features=768, bias=False)\n",
" )\n",
" (layer_norm): T5LayerNorm()\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" (1): T5LayerFF(\n",
" (DenseReluDense): T5DenseGatedActDense(\n",
" (wi_0): Linear(in_features=768, out_features=2048, bias=False)\n",
" (wi_1): Linear(in_features=768, out_features=2048, bias=False)\n",
" (wo): Linear(in_features=2048, out_features=768, bias=False)\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" (act): NewGELUActivation()\n",
" )\n",
" (layer_norm): T5LayerNorm()\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" )\n",
" )\n",
" (4): T5Block(\n",
" (layer): ModuleList(\n",
" (0): T5LayerSelfAttention(\n",
" (SelfAttention): T5Attention(\n",
" (q): Linear(in_features=768, out_features=768, bias=False)\n",
" (k): Linear(in_features=768, out_features=768, bias=False)\n",
" (v): Linear(in_features=768, out_features=768, bias=False)\n",
" (o): Linear(in_features=768, out_features=768, bias=False)\n",
" )\n",
" (layer_norm): T5LayerNorm()\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" (1): T5LayerFF(\n",
" (DenseReluDense): T5DenseGatedActDense(\n",
" (wi_0): Linear(in_features=768, out_features=2048, bias=False)\n",
" (wi_1): Linear(in_features=768, out_features=2048, bias=False)\n",
" (wo): Linear(in_features=2048, out_features=768, bias=False)\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" (act): NewGELUActivation()\n",
" )\n",
" (layer_norm): T5LayerNorm()\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" )\n",
" )\n",
" (5): T5Block(\n",
" (layer): ModuleList(\n",
" (0): T5LayerSelfAttention(\n",
" (SelfAttention): T5Attention(\n",
" (q): Linear(in_features=768, out_features=768, bias=False)\n",
" (k): Linear(in_features=768, out_features=768, bias=False)\n",
" (v): Linear(in_features=768, out_features=768, bias=False)\n",
" (o): Linear(in_features=768, out_features=768, bias=False)\n",
" )\n",
" (layer_norm): T5LayerNorm()\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" (1): T5LayerFF(\n",
" (DenseReluDense): T5DenseGatedActDense(\n",
" (wi_0): Linear(in_features=768, out_features=2048, bias=False)\n",
" (wi_1): Linear(in_features=768, out_features=2048, bias=False)\n",
" (wo): Linear(in_features=2048, out_features=768, bias=False)\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" (act): NewGELUActivation()\n",
" )\n",
" (layer_norm): T5LayerNorm()\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" )\n",
" )\n",
" (6): T5Block(\n",
" (layer): ModuleList(\n",
" (0): T5LayerSelfAttention(\n",
" (SelfAttention): T5Attention(\n",
" (q): Linear(in_features=768, out_features=768, bias=False)\n",
" (k): Linear(in_features=768, out_features=768, bias=False)\n",
" (v): Linear(in_features=768, out_features=768, bias=False)\n",
" (o): Linear(in_features=768, out_features=768, bias=False)\n",
" )\n",
" (layer_norm): T5LayerNorm()\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" (1): T5LayerFF(\n",
" (DenseReluDense): T5DenseGatedActDense(\n",
" (wi_0): Linear(in_features=768, out_features=2048, bias=False)\n",
" (wi_1): Linear(in_features=768, out_features=2048, bias=False)\n",
" (wo): Linear(in_features=2048, out_features=768, bias=False)\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" (act): NewGELUActivation()\n",
" )\n",
" (layer_norm): T5LayerNorm()\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" )\n",
" )\n",
" (7): T5Block(\n",
" (layer): ModuleList(\n",
" (0): T5LayerSelfAttention(\n",
" (SelfAttention): T5Attention(\n",
" (q): Linear(in_features=768, out_features=768, bias=False)\n",
" (k): Linear(in_features=768, out_features=768, bias=False)\n",
" (v): Linear(in_features=768, out_features=768, bias=False)\n",
" (o): Linear(in_features=768, out_features=768, bias=False)\n",
" )\n",
" (layer_norm): T5LayerNorm()\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" (1): T5LayerFF(\n",
" (DenseReluDense): T5DenseGatedActDense(\n",
" (wi_0): Linear(in_features=768, out_features=2048, bias=False)\n",
" (wi_1): Linear(in_features=768, out_features=2048, bias=False)\n",
" (wo): Linear(in_features=2048, out_features=768, bias=False)\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" (act): NewGELUActivation()\n",
" )\n",
" (layer_norm): T5LayerNorm()\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" )\n",
" )\n",
" (8): T5Block(\n",
" (layer): ModuleList(\n",
" (0): T5LayerSelfAttention(\n",
" (SelfAttention): T5Attention(\n",
" (q): Linear(in_features=768, out_features=768, bias=False)\n",
" (k): Linear(in_features=768, out_features=768, bias=False)\n",
" (v): Linear(in_features=768, out_features=768, bias=False)\n",
" (o): Linear(in_features=768, out_features=768, bias=False)\n",
" )\n",
" (layer_norm): T5LayerNorm()\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" (1): T5LayerFF(\n",
" (DenseReluDense): T5DenseGatedActDense(\n",
" (wi_0): Linear(in_features=768, out_features=2048, bias=False)\n",
" (wi_1): Linear(in_features=768, out_features=2048, bias=False)\n",
" (wo): Linear(in_features=2048, out_features=768, bias=False)\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" (act): NewGELUActivation()\n",
" )\n",
" (layer_norm): T5LayerNorm()\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" )\n",
" )\n",
" (9): T5Block(\n",
" (layer): ModuleList(\n",
" (0): T5LayerSelfAttention(\n",
" (SelfAttention): T5Attention(\n",
" (q): Linear(in_features=768, out_features=768, bias=False)\n",
" (k): Linear(in_features=768, out_features=768, bias=False)\n",
" (v): Linear(in_features=768, out_features=768, bias=False)\n",
" (o): Linear(in_features=768, out_features=768, bias=False)\n",
" )\n",
" (layer_norm): T5LayerNorm()\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" (1): T5LayerFF(\n",
" (DenseReluDense): T5DenseGatedActDense(\n",
" (wi_0): Linear(in_features=768, out_features=2048, bias=False)\n",
" (wi_1): Linear(in_features=768, out_features=2048, bias=False)\n",
" (wo): Linear(in_features=2048, out_features=768, bias=False)\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" (act): NewGELUActivation()\n",
" )\n",
" (layer_norm): T5LayerNorm()\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" )\n",
" )\n",
" (10): T5Block(\n",
" (layer): ModuleList(\n",
" (0): T5LayerSelfAttention(\n",
" (SelfAttention): T5Attention(\n",
" (q): Linear(in_features=768, out_features=768, bias=False)\n",
" (k): Linear(in_features=768, out_features=768, bias=False)\n",
" (v): Linear(in_features=768, out_features=768, bias=False)\n",
" (o): Linear(in_features=768, out_features=768, bias=False)\n",
" )\n",
" (layer_norm): T5LayerNorm()\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" (1): T5LayerFF(\n",
" (DenseReluDense): T5DenseGatedActDense(\n",
" (wi_0): Linear(in_features=768, out_features=2048, bias=False)\n",
" (wi_1): Linear(in_features=768, out_features=2048, bias=False)\n",
" (wo): Linear(in_features=2048, out_features=768, bias=False)\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" (act): NewGELUActivation()\n",
" )\n",
" (layer_norm): T5LayerNorm()\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" )\n",
" )\n",
" (11): T5Block(\n",
" (layer): ModuleList(\n",
" (0): T5LayerSelfAttention(\n",
" (SelfAttention): T5Attention(\n",
" (q): Linear(in_features=768, out_features=768, bias=False)\n",
" (k): Linear(in_features=768, out_features=768, bias=False)\n",
" (v): Linear(in_features=768, out_features=768, bias=False)\n",
" (o): Linear(in_features=768, out_features=768, bias=False)\n",
" )\n",
" (layer_norm): T5LayerNorm()\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" (1): T5LayerFF(\n",
" (DenseReluDense): T5DenseGatedActDense(\n",
" (wi_0): Linear(in_features=768, out_features=2048, bias=False)\n",
" (wi_1): Linear(in_features=768, out_features=2048, bias=False)\n",
" (wo): Linear(in_features=2048, out_features=768, bias=False)\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" (act): NewGELUActivation()\n",
" )\n",
" (layer_norm): T5LayerNorm()\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" )\n",
" )\n",
" )\n",
" (final_layer_norm): T5LayerNorm()\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" (decoder): T5Stack(\n",
" (embed_tokens): Embedding(32128, 768)\n",
" (block): ModuleList(\n",
" (0): T5Block(\n",
" (layer): ModuleList(\n",
" (0): T5LayerSelfAttention(\n",
" (SelfAttention): T5Attention(\n",
" (q): Linear(in_features=768, out_features=768, bias=False)\n",
" (k): Linear(in_features=768, out_features=768, bias=False)\n",
" (v): Linear(in_features=768, out_features=768, bias=False)\n",
" (o): Linear(in_features=768, out_features=768, bias=False)\n",
" (relative_attention_bias): Embedding(32, 12)\n",
" )\n",
" (layer_norm): T5LayerNorm()\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" (1): T5LayerCrossAttention(\n",
" (EncDecAttention): T5Attention(\n",
" (q): Linear(in_features=768, out_features=768, bias=False)\n",
" (k): Linear(in_features=768, out_features=768, bias=False)\n",
" (v): Linear(in_features=768, out_features=768, bias=False)\n",
" (o): Linear(in_features=768, out_features=768, bias=False)\n",
" )\n",
" (layer_norm): T5LayerNorm()\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" (2): T5LayerFF(\n",
" (DenseReluDense): T5DenseGatedActDense(\n",
" (wi_0): Linear(in_features=768, out_features=2048, bias=False)\n",
" (wi_1): Linear(in_features=768, out_features=2048, bias=False)\n",
" (wo): Linear(in_features=2048, out_features=768, bias=False)\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" (act): NewGELUActivation()\n",
" )\n",
" (layer_norm): T5LayerNorm()\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" )\n",
" )\n",
" (1): T5Block(\n",
" (layer): ModuleList(\n",
" (0): T5LayerSelfAttention(\n",
" (SelfAttention): T5Attention(\n",
" (q): Linear(in_features=768, out_features=768, bias=False)\n",
" (k): Linear(in_features=768, out_features=768, bias=False)\n",
" (v): Linear(in_features=768, out_features=768, bias=False)\n",
" (o): Linear(in_features=768, out_features=768, bias=False)\n",
" )\n",
" (layer_norm): T5LayerNorm()\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" (1): T5LayerCrossAttention(\n",
" (EncDecAttention): T5Attention(\n",
" (q): Linear(in_features=768, out_features=768, bias=False)\n",
" (k): Linear(in_features=768, out_features=768, bias=False)\n",
" (v): Linear(in_features=768, out_features=768, bias=False)\n",
" (o): Linear(in_features=768, out_features=768, bias=False)\n",
" )\n",
" (layer_norm): T5LayerNorm()\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" (2): T5LayerFF(\n",
" (DenseReluDense): T5DenseGatedActDense(\n",
" (wi_0): Linear(in_features=768, out_features=2048, bias=False)\n",
" (wi_1): Linear(in_features=768, out_features=2048, bias=False)\n",
" (wo): Linear(in_features=2048, out_features=768, bias=False)\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" (act): NewGELUActivation()\n",
" )\n",
" (layer_norm): T5LayerNorm()\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" )\n",
" )\n",
" (2): T5Block(\n",
" (layer): ModuleList(\n",
" (0): T5LayerSelfAttention(\n",
" (SelfAttention): T5Attention(\n",
" (q): Linear(in_features=768, out_features=768, bias=False)\n",
" (k): Linear(in_features=768, out_features=768, bias=False)\n",
" (v): Linear(in_features=768, out_features=768, bias=False)\n",
" (o): Linear(in_features=768, out_features=768, bias=False)\n",
" )\n",
" (layer_norm): T5LayerNorm()\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" (1): T5LayerCrossAttention(\n",
" (EncDecAttention): T5Attention(\n",
" (q): Linear(in_features=768, out_features=768, bias=False)\n",
" (k): Linear(in_features=768, out_features=768, bias=False)\n",
" (v): Linear(in_features=768, out_features=768, bias=False)\n",
" (o): Linear(in_features=768, out_features=768, bias=False)\n",
" )\n",
" (layer_norm): T5LayerNorm()\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" (2): T5LayerFF(\n",
" (DenseReluDense): T5DenseGatedActDense(\n",
" (wi_0): Linear(in_features=768, out_features=2048, bias=False)\n",
" (wi_1): Linear(in_features=768, out_features=2048, bias=False)\n",
" (wo): Linear(in_features=2048, out_features=768, bias=False)\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" (act): NewGELUActivation()\n",
" )\n",
" (layer_norm): T5LayerNorm()\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" )\n",
" )\n",
" (3): T5Block(\n",
" (layer): ModuleList(\n",
" (0): T5LayerSelfAttention(\n",
" (SelfAttention): T5Attention(\n",
" (q): Linear(in_features=768, out_features=768, bias=False)\n",
" (k): Linear(in_features=768, out_features=768, bias=False)\n",
" (v): Linear(in_features=768, out_features=768, bias=False)\n",
" (o): Linear(in_features=768, out_features=768, bias=False)\n",
" )\n",
" (layer_norm): T5LayerNorm()\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" (1): T5LayerCrossAttention(\n",
" (EncDecAttention): T5Attention(\n",
" (q): Linear(in_features=768, out_features=768, bias=False)\n",
" (k): Linear(in_features=768, out_features=768, bias=False)\n",
" (v): Linear(in_features=768, out_features=768, bias=False)\n",
" (o): Linear(in_features=768, out_features=768, bias=False)\n",
" )\n",
" (layer_norm): T5LayerNorm()\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" (2): T5LayerFF(\n",
" (DenseReluDense): T5DenseGatedActDense(\n",
" (wi_0): Linear(in_features=768, out_features=2048, bias=False)\n",
" (wi_1): Linear(in_features=768, out_features=2048, bias=False)\n",
" (wo): Linear(in_features=2048, out_features=768, bias=False)\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" (act): NewGELUActivation()\n",
" )\n",
" (layer_norm): T5LayerNorm()\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" )\n",
" )\n",
" (4): T5Block(\n",
" (layer): ModuleList(\n",
" (0): T5LayerSelfAttention(\n",
" (SelfAttention): T5Attention(\n",
" (q): Linear(in_features=768, out_features=768, bias=False)\n",
" (k): Linear(in_features=768, out_features=768, bias=False)\n",
" (v): Linear(in_features=768, out_features=768, bias=False)\n",
" (o): Linear(in_features=768, out_features=768, bias=False)\n",
" )\n",
" (layer_norm): T5LayerNorm()\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" (1): T5LayerCrossAttention(\n",
" (EncDecAttention): T5Attention(\n",
" (q): Linear(in_features=768, out_features=768, bias=False)\n",
" (k): Linear(in_features=768, out_features=768, bias=False)\n",
" (v): Linear(in_features=768, out_features=768, bias=False)\n",
" (o): Linear(in_features=768, out_features=768, bias=False)\n",
" )\n",
" (layer_norm): T5LayerNorm()\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" (2): T5LayerFF(\n",
" (DenseReluDense): T5DenseGatedActDense(\n",
" (wi_0): Linear(in_features=768, out_features=2048, bias=False)\n",
" (wi_1): Linear(in_features=768, out_features=2048, bias=False)\n",
" (wo): Linear(in_features=2048, out_features=768, bias=False)\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" (act): NewGELUActivation()\n",
" )\n",
" (layer_norm): T5LayerNorm()\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" )\n",
" )\n",
" (5): T5Block(\n",
" (layer): ModuleList(\n",
" (0): T5LayerSelfAttention(\n",
" (SelfAttention): T5Attention(\n",
" (q): Linear(in_features=768, out_features=768, bias=False)\n",
" (k): Linear(in_features=768, out_features=768, bias=False)\n",
" (v): Linear(in_features=768, out_features=768, bias=False)\n",
" (o): Linear(in_features=768, out_features=768, bias=False)\n",
" )\n",
" (layer_norm): T5LayerNorm()\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" (1): T5LayerCrossAttention(\n",
" (EncDecAttention): T5Attention(\n",
" (q): Linear(in_features=768, out_features=768, bias=False)\n",
" (k): Linear(in_features=768, out_features=768, bias=False)\n",
" (v): Linear(in_features=768, out_features=768, bias=False)\n",
" (o): Linear(in_features=768, out_features=768, bias=False)\n",
" )\n",
" (layer_norm): T5LayerNorm()\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" (2): T5LayerFF(\n",
" (DenseReluDense): T5DenseGatedActDense(\n",
" (wi_0): Linear(in_features=768, out_features=2048, bias=False)\n",
" (wi_1): Linear(in_features=768, out_features=2048, bias=False)\n",
" (wo): Linear(in_features=2048, out_features=768, bias=False)\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" (act): NewGELUActivation()\n",
" )\n",
" (layer_norm): T5LayerNorm()\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" )\n",
" )\n",
" (6): T5Block(\n",
" (layer): ModuleList(\n",
" (0): T5LayerSelfAttention(\n",
" (SelfAttention): T5Attention(\n",
" (q): Linear(in_features=768, out_features=768, bias=False)\n",
" (k): Linear(in_features=768, out_features=768, bias=False)\n",
" (v): Linear(in_features=768, out_features=768, bias=False)\n",
" (o): Linear(in_features=768, out_features=768, bias=False)\n",
" )\n",
" (layer_norm): T5LayerNorm()\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" (1): T5LayerCrossAttention(\n",
" (EncDecAttention): T5Attention(\n",
" (q): Linear(in_features=768, out_features=768, bias=False)\n",
" (k): Linear(in_features=768, out_features=768, bias=False)\n",
" (v): Linear(in_features=768, out_features=768, bias=False)\n",
" (o): Linear(in_features=768, out_features=768, bias=False)\n",
" )\n",
" (layer_norm): T5LayerNorm()\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" (2): T5LayerFF(\n",
" (DenseReluDense): T5DenseGatedActDense(\n",
" (wi_0): Linear(in_features=768, out_features=2048, bias=False)\n",
" (wi_1): Linear(in_features=768, out_features=2048, bias=False)\n",
" (wo): Linear(in_features=2048, out_features=768, bias=False)\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" (act): NewGELUActivation()\n",
" )\n",
" (layer_norm): T5LayerNorm()\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" )\n",
" )\n",
" (7): T5Block(\n",
" (layer): ModuleList(\n",
" (0): T5LayerSelfAttention(\n",
" (SelfAttention): T5Attention(\n",
" (q): Linear(in_features=768, out_features=768, bias=False)\n",
" (k): Linear(in_features=768, out_features=768, bias=False)\n",
" (v): Linear(in_features=768, out_features=768, bias=False)\n",
" (o): Linear(in_features=768, out_features=768, bias=False)\n",
" )\n",
" (layer_norm): T5LayerNorm()\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" (1): T5LayerCrossAttention(\n",
" (EncDecAttention): T5Attention(\n",
" (q): Linear(in_features=768, out_features=768, bias=False)\n",
" (k): Linear(in_features=768, out_features=768, bias=False)\n",
" (v): Linear(in_features=768, out_features=768, bias=False)\n",
" (o): Linear(in_features=768, out_features=768, bias=False)\n",
" )\n",
" (layer_norm): T5LayerNorm()\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" (2): T5LayerFF(\n",
" (DenseReluDense): T5DenseGatedActDense(\n",
" (wi_0): Linear(in_features=768, out_features=2048, bias=False)\n",
" (wi_1): Linear(in_features=768, out_features=2048, bias=False)\n",
" (wo): Linear(in_features=2048, out_features=768, bias=False)\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" (act): NewGELUActivation()\n",
" )\n",
" (layer_norm): T5LayerNorm()\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" )\n",
" )\n",
" (8): T5Block(\n",
" (layer): ModuleList(\n",
" (0): T5LayerSelfAttention(\n",
" (SelfAttention): T5Attention(\n",
" (q): Linear(in_features=768, out_features=768, bias=False)\n",
" (k): Linear(in_features=768, out_features=768, bias=False)\n",
" (v): Linear(in_features=768, out_features=768, bias=False)\n",
" (o): Linear(in_features=768, out_features=768, bias=False)\n",
" )\n",
" (layer_norm): T5LayerNorm()\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" (1): T5LayerCrossAttention(\n",
" (EncDecAttention): T5Attention(\n",
" (q): Linear(in_features=768, out_features=768, bias=False)\n",
" (k): Linear(in_features=768, out_features=768, bias=False)\n",
" (v): Linear(in_features=768, out_features=768, bias=False)\n",
" (o): Linear(in_features=768, out_features=768, bias=False)\n",
" )\n",
" (layer_norm): T5LayerNorm()\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" (2): T5LayerFF(\n",
" (DenseReluDense): T5DenseGatedActDense(\n",
" (wi_0): Linear(in_features=768, out_features=2048, bias=False)\n",
" (wi_1): Linear(in_features=768, out_features=2048, bias=False)\n",
" (wo): Linear(in_features=2048, out_features=768, bias=False)\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" (act): NewGELUActivation()\n",
" )\n",
" (layer_norm): T5LayerNorm()\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" )\n",
" )\n",
" (9): T5Block(\n",
" (layer): ModuleList(\n",
" (0): T5LayerSelfAttention(\n",
" (SelfAttention): T5Attention(\n",
" (q): Linear(in_features=768, out_features=768, bias=False)\n",
" (k): Linear(in_features=768, out_features=768, bias=False)\n",
" (v): Linear(in_features=768, out_features=768, bias=False)\n",
" (o): Linear(in_features=768, out_features=768, bias=False)\n",
" )\n",
" (layer_norm): T5LayerNorm()\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" (1): T5LayerCrossAttention(\n",
" (EncDecAttention): T5Attention(\n",
" (q): Linear(in_features=768, out_features=768, bias=False)\n",
" (k): Linear(in_features=768, out_features=768, bias=False)\n",
" (v): Linear(in_features=768, out_features=768, bias=False)\n",
" (o): Linear(in_features=768, out_features=768, bias=False)\n",
" )\n",
" (layer_norm): T5LayerNorm()\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" (2): T5LayerFF(\n",
" (DenseReluDense): T5DenseGatedActDense(\n",
" (wi_0): Linear(in_features=768, out_features=2048, bias=False)\n",
" (wi_1): Linear(in_features=768, out_features=2048, bias=False)\n",
" (wo): Linear(in_features=2048, out_features=768, bias=False)\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" (act): NewGELUActivation()\n",
" )\n",
" (layer_norm): T5LayerNorm()\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" )\n",
" )\n",
" (10): T5Block(\n",
" (layer): ModuleList(\n",
" (0): T5LayerSelfAttention(\n",
" (SelfAttention): T5Attention(\n",
" (q): Linear(in_features=768, out_features=768, bias=False)\n",
" (k): Linear(in_features=768, out_features=768, bias=False)\n",
" (v): Linear(in_features=768, out_features=768, bias=False)\n",
" (o): Linear(in_features=768, out_features=768, bias=False)\n",
" )\n",
" (layer_norm): T5LayerNorm()\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" (1): T5LayerCrossAttention(\n",
" (EncDecAttention): T5Attention(\n",
" (q): Linear(in_features=768, out_features=768, bias=False)\n",
" (k): Linear(in_features=768, out_features=768, bias=False)\n",
" (v): Linear(in_features=768, out_features=768, bias=False)\n",
" (o): Linear(in_features=768, out_features=768, bias=False)\n",
" )\n",
" (layer_norm): T5LayerNorm()\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" (2): T5LayerFF(\n",
" (DenseReluDense): T5DenseGatedActDense(\n",
" (wi_0): Linear(in_features=768, out_features=2048, bias=False)\n",
" (wi_1): Linear(in_features=768, out_features=2048, bias=False)\n",
" (wo): Linear(in_features=2048, out_features=768, bias=False)\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" (act): NewGELUActivation()\n",
" )\n",
" (layer_norm): T5LayerNorm()\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" )\n",
" )\n",
" (11): T5Block(\n",
" (layer): ModuleList(\n",
" (0): T5LayerSelfAttention(\n",
" (SelfAttention): T5Attention(\n",
" (q): Linear(in_features=768, out_features=768, bias=False)\n",
" (k): Linear(in_features=768, out_features=768, bias=False)\n",
" (v): Linear(in_features=768, out_features=768, bias=False)\n",
" (o): Linear(in_features=768, out_features=768, bias=False)\n",
" )\n",
" (layer_norm): T5LayerNorm()\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" (1): T5LayerCrossAttention(\n",
" (EncDecAttention): T5Attention(\n",
" (q): Linear(in_features=768, out_features=768, bias=False)\n",
" (k): Linear(in_features=768, out_features=768, bias=False)\n",
" (v): Linear(in_features=768, out_features=768, bias=False)\n",
" (o): Linear(in_features=768, out_features=768, bias=False)\n",
" )\n",
" (layer_norm): T5LayerNorm()\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" (2): T5LayerFF(\n",
" (DenseReluDense): T5DenseGatedActDense(\n",
" (wi_0): Linear(in_features=768, out_features=2048, bias=False)\n",
" (wi_1): Linear(in_features=768, out_features=2048, bias=False)\n",
" (wo): Linear(in_features=2048, out_features=768, bias=False)\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" (act): NewGELUActivation()\n",
" )\n",
" (layer_norm): T5LayerNorm()\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" )\n",
" )\n",
" )\n",
" (final_layer_norm): T5LayerNorm()\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" (lm_head): Linear(in_features=768, out_features=32128, bias=False)\n",
")"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"model"
]
},
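  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "A quick way to gauge the size of the checkpoint printed above:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Total number of parameters in the loaded flan-t5-base model\n",
    "num_params = sum(p.numel() for p in model.parameters())\n",
    "print(f'{num_params / 1e6:.1f}M parameters')"
   ]
  },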
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"## Validation"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Accuracy: 0.7560526315789474\n"
]
}
],
"source": [
"correct = 0\n",
"labels = \"sport, world, business, scitech\"\n",
"\n",
"for entry in dataset['test']:\n",
" prompt = f\"classify with possible labels: {labels}\\ntext: {entry['text']}\"\n",
" output = pipeline(prompt, do_sample=False)[0]['generated_text'].lower()\n",
" if output == entry['label']:\n",
" correct += 1\n",
"\n",
"accuracy = correct / len(dataset['test'])\n",
"print(f\"Accuracy: {accuracy}\")"
]
},
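  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "The overall accuracy hides which categories the zero-shot model confuses. A minimal sketch of a per-label breakdown, assuming `generator`, `dataset` and `labels` from the cells above (it runs the pipeline over the test set a second time):"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from collections import Counter\n",
    "\n",
    "# Re-run the zero-shot pipeline and keep per-label tallies\n",
    "total = Counter()\n",
    "correct_per_label = Counter()\n",
    "for entry in dataset['test']:\n",
    "    prompt = f'classify with possible labels: {labels}\\ntext: {entry[\"text\"]}'\n",
    "    output = generator(prompt, do_sample=False)[0]['generated_text'].lower()\n",
    "    total[entry['label']] += 1\n",
    "    correct_per_label[entry['label']] += int(output == entry['label'])\n",
    "\n",
    "for label in sorted(total):\n",
    "    print(f'{label}: {correct_per_label[label] / total[label]:.3f}')"
   ]
  },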
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"# Summary"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"| |Roberta|GPT2 |T5 |Flan-T5|\n",
"|--------|-------|-----|-----|-------|\n",
"|Accuracy|92.2% |91.9%|46.7%|75.6% |"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3.10.9 ('ugp')",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.9"
},
"orig_nbformat": 4,
"vscode": {
"interpreter": {
"hash": "4f917e9727e89f2278497f95f2732cc5b9cb99f840615e0399f81b235c1c2211"
}
}
},
"nbformat": 4,
"nbformat_minor": 2
}