!pip install transformers torch datasets evaluate scikit-learn sacremoses sentencepiece ipywidgets > /dev/null
RoBERTa
Modifications
- Custom classification head with a larger hidden size (two dense layers expanding to 4 × hidden_size and back)
- Activation function changed from the default tanh to GELU
Code
from torch import nn
from transformers import RobertaForSequenceClassification, RobertaModel


# Simple version #
class RobertaClassificationHeadCustomSimple(nn.Module):
    """Head for sentence-level classification tasks."""

    def __init__(self, config):
        super().__init__()
        hidden_size = config.hidden_size
        self.dense_1 = nn.Linear(hidden_size, 4 * hidden_size)
        self.dense_2 = nn.Linear(4 * hidden_size, hidden_size)
        classifier_dropout = (
            config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob
        )
        self.dropout = nn.Dropout(classifier_dropout)
        self.out_proj = nn.Linear(hidden_size, config.num_labels)
        self.activation = nn.GELU()

    def forward(self, features, **kwargs):
        x = features[:, 0, :]  # take <s> token (equiv. to [CLS])
        x = self.dense_1(x)
        x = self.activation(x)
        x = self.dropout(x)
        x = self.dense_2(x)
        x = self.activation(x)
        x = self.dropout(x)
        x = self.out_proj(x)
        return x


class RobertaForSequenceClassificationCustomSimple(RobertaForSequenceClassification):
    _keys_to_ignore_on_load_missing = [r"position_ids"]

    def __init__(self, config):
        super().__init__(config)
        self.num_labels = config.num_labels
        self.config = config

        self.roberta = RobertaModel(config, add_pooling_layer=False)
        self.classifier = RobertaClassificationHeadCustomSimple(config)

        # Initialize weights and apply final processing
        self.post_init()
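As a quick sanity check of the head wiring, the head can be instantiated on its own and fed a dummy hidden-state tensor. This is a minimal sketch; the dummy batch shape and num_labels=4 are illustrative and not part of the training pipeline.

import torch
from transformers import RobertaConfig

# Illustrative smoke test of the custom head (random weights, no pretrained checkpoint)
config = RobertaConfig.from_pretrained("roberta-base", num_labels=4)
head = RobertaClassificationHeadCustomSimple(config)
dummy_hidden_states = torch.randn(2, 128, config.hidden_size)  # (batch, seq_len, hidden)
logits = head(dummy_hidden_states)
print(logits.shape)  # torch.Size([2, 4])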
Model
RobertaForSequenceClassificationCustomSimple.from_pretrained("roberta-base")
Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassificationCustomSimple: ['roberta.pooler.dense.weight', 'lm_head.dense.bias', 'lm_head.decoder.weight', 'lm_head.bias', 'lm_head.dense.weight', 'roberta.pooler.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias']
- This IS expected if you are initializing RobertaForSequenceClassificationCustomSimple from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassificationCustomSimple from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassificationCustomSimple were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense_1.weight', 'classifier.out_proj.bias', 'classifier.dense_2.bias', 'classifier.dense_2.weight', 'classifier.out_proj.weight', 'classifier.dense_1.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
RobertaForSequenceClassificationCustomSimple( (roberta): RobertaModel( (embeddings): RobertaEmbeddings( (word_embeddings): Embedding(50265, 768, padding_idx=1) (position_embeddings): Embedding(514, 768, padding_idx=1) (token_type_embeddings): Embedding(1, 768) (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) (encoder): RobertaEncoder( (layer): ModuleList( (0): RobertaLayer( (attention): RobertaAttention( (self): RobertaSelfAttention( (query): Linear(in_features=768, out_features=768, bias=True) (key): Linear(in_features=768, out_features=768, bias=True) (value): Linear(in_features=768, out_features=768, bias=True) (dropout): Dropout(p=0.1, inplace=False) ) (output): RobertaSelfOutput( (dense): Linear(in_features=768, out_features=768, bias=True) (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (intermediate): RobertaIntermediate( (dense): Linear(in_features=768, out_features=3072, bias=True) (intermediate_act_fn): GELUActivation() ) (output): RobertaOutput( (dense): Linear(in_features=3072, out_features=768, bias=True) (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (1): RobertaLayer( (attention): RobertaAttention( (self): RobertaSelfAttention( (query): Linear(in_features=768, out_features=768, bias=True) (key): Linear(in_features=768, out_features=768, bias=True) (value): Linear(in_features=768, out_features=768, bias=True) (dropout): Dropout(p=0.1, inplace=False) ) (output): RobertaSelfOutput( (dense): Linear(in_features=768, out_features=768, bias=True) (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (intermediate): RobertaIntermediate( (dense): Linear(in_features=768, out_features=3072, bias=True) (intermediate_act_fn): GELUActivation() ) (output): RobertaOutput( (dense): Linear(in_features=3072, out_features=768, bias=True) (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (2): RobertaLayer( (attention): RobertaAttention( (self): RobertaSelfAttention( (query): Linear(in_features=768, out_features=768, bias=True) (key): Linear(in_features=768, out_features=768, bias=True) (value): Linear(in_features=768, out_features=768, bias=True) (dropout): Dropout(p=0.1, inplace=False) ) (output): RobertaSelfOutput( (dense): Linear(in_features=768, out_features=768, bias=True) (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (intermediate): RobertaIntermediate( (dense): Linear(in_features=768, out_features=3072, bias=True) (intermediate_act_fn): GELUActivation() ) (output): RobertaOutput( (dense): Linear(in_features=3072, out_features=768, bias=True) (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (3): RobertaLayer( (attention): RobertaAttention( (self): RobertaSelfAttention( (query): Linear(in_features=768, out_features=768, bias=True) (key): Linear(in_features=768, out_features=768, bias=True) (value): Linear(in_features=768, out_features=768, bias=True) (dropout): Dropout(p=0.1, inplace=False) ) (output): RobertaSelfOutput( (dense): Linear(in_features=768, out_features=768, bias=True) (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (intermediate): RobertaIntermediate( (dense): Linear(in_features=768, 
out_features=3072, bias=True) (intermediate_act_fn): GELUActivation() ) (output): RobertaOutput( (dense): Linear(in_features=3072, out_features=768, bias=True) (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (4): RobertaLayer( (attention): RobertaAttention( (self): RobertaSelfAttention( (query): Linear(in_features=768, out_features=768, bias=True) (key): Linear(in_features=768, out_features=768, bias=True) (value): Linear(in_features=768, out_features=768, bias=True) (dropout): Dropout(p=0.1, inplace=False) ) (output): RobertaSelfOutput( (dense): Linear(in_features=768, out_features=768, bias=True) (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (intermediate): RobertaIntermediate( (dense): Linear(in_features=768, out_features=3072, bias=True) (intermediate_act_fn): GELUActivation() ) (output): RobertaOutput( (dense): Linear(in_features=3072, out_features=768, bias=True) (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (5): RobertaLayer( (attention): RobertaAttention( (self): RobertaSelfAttention( (query): Linear(in_features=768, out_features=768, bias=True) (key): Linear(in_features=768, out_features=768, bias=True) (value): Linear(in_features=768, out_features=768, bias=True) (dropout): Dropout(p=0.1, inplace=False) ) (output): RobertaSelfOutput( (dense): Linear(in_features=768, out_features=768, bias=True) (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (intermediate): RobertaIntermediate( (dense): Linear(in_features=768, out_features=3072, bias=True) (intermediate_act_fn): GELUActivation() ) (output): RobertaOutput( (dense): Linear(in_features=3072, out_features=768, bias=True) (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (6): RobertaLayer( (attention): RobertaAttention( (self): RobertaSelfAttention( (query): Linear(in_features=768, out_features=768, bias=True) (key): Linear(in_features=768, out_features=768, bias=True) (value): Linear(in_features=768, out_features=768, bias=True) (dropout): Dropout(p=0.1, inplace=False) ) (output): RobertaSelfOutput( (dense): Linear(in_features=768, out_features=768, bias=True) (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (intermediate): RobertaIntermediate( (dense): Linear(in_features=768, out_features=3072, bias=True) (intermediate_act_fn): GELUActivation() ) (output): RobertaOutput( (dense): Linear(in_features=3072, out_features=768, bias=True) (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (7): RobertaLayer( (attention): RobertaAttention( (self): RobertaSelfAttention( (query): Linear(in_features=768, out_features=768, bias=True) (key): Linear(in_features=768, out_features=768, bias=True) (value): Linear(in_features=768, out_features=768, bias=True) (dropout): Dropout(p=0.1, inplace=False) ) (output): RobertaSelfOutput( (dense): Linear(in_features=768, out_features=768, bias=True) (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (intermediate): RobertaIntermediate( (dense): Linear(in_features=768, out_features=3072, bias=True) (intermediate_act_fn): GELUActivation() ) (output): RobertaOutput( (dense): Linear(in_features=3072, 
out_features=768, bias=True) (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (8): RobertaLayer( (attention): RobertaAttention( (self): RobertaSelfAttention( (query): Linear(in_features=768, out_features=768, bias=True) (key): Linear(in_features=768, out_features=768, bias=True) (value): Linear(in_features=768, out_features=768, bias=True) (dropout): Dropout(p=0.1, inplace=False) ) (output): RobertaSelfOutput( (dense): Linear(in_features=768, out_features=768, bias=True) (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (intermediate): RobertaIntermediate( (dense): Linear(in_features=768, out_features=3072, bias=True) (intermediate_act_fn): GELUActivation() ) (output): RobertaOutput( (dense): Linear(in_features=3072, out_features=768, bias=True) (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (9): RobertaLayer( (attention): RobertaAttention( (self): RobertaSelfAttention( (query): Linear(in_features=768, out_features=768, bias=True) (key): Linear(in_features=768, out_features=768, bias=True) (value): Linear(in_features=768, out_features=768, bias=True) (dropout): Dropout(p=0.1, inplace=False) ) (output): RobertaSelfOutput( (dense): Linear(in_features=768, out_features=768, bias=True) (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (intermediate): RobertaIntermediate( (dense): Linear(in_features=768, out_features=3072, bias=True) (intermediate_act_fn): GELUActivation() ) (output): RobertaOutput( (dense): Linear(in_features=3072, out_features=768, bias=True) (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (10): RobertaLayer( (attention): RobertaAttention( (self): RobertaSelfAttention( (query): Linear(in_features=768, out_features=768, bias=True) (key): Linear(in_features=768, out_features=768, bias=True) (value): Linear(in_features=768, out_features=768, bias=True) (dropout): Dropout(p=0.1, inplace=False) ) (output): RobertaSelfOutput( (dense): Linear(in_features=768, out_features=768, bias=True) (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (intermediate): RobertaIntermediate( (dense): Linear(in_features=768, out_features=3072, bias=True) (intermediate_act_fn): GELUActivation() ) (output): RobertaOutput( (dense): Linear(in_features=3072, out_features=768, bias=True) (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (11): RobertaLayer( (attention): RobertaAttention( (self): RobertaSelfAttention( (query): Linear(in_features=768, out_features=768, bias=True) (key): Linear(in_features=768, out_features=768, bias=True) (value): Linear(in_features=768, out_features=768, bias=True) (dropout): Dropout(p=0.1, inplace=False) ) (output): RobertaSelfOutput( (dense): Linear(in_features=768, out_features=768, bias=True) (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (intermediate): RobertaIntermediate( (dense): Linear(in_features=768, out_features=3072, bias=True) (intermediate_act_fn): GELUActivation() ) (output): RobertaOutput( (dense): Linear(in_features=3072, out_features=768, bias=True) (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) ) ) 
) (classifier): RobertaClassificationHeadCustomSimple( (dense_1): Linear(in_features=768, out_features=3072, bias=True) (dense_2): Linear(in_features=3072, out_features=768, bias=True) (dropout): Dropout(p=0.1, inplace=False) (out_proj): Linear(in_features=768, out_features=2, bias=True) (activation): GELU(approximate='none') ) )
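As the warning above notes, the encoder weights come from the checkpoint while the classifier head is randomly initialized. A quick, illustrative way to see how many parameters that new head adds (not part of the training script):

# Illustrative: size of the newly initialized head vs. the whole model
model = RobertaForSequenceClassificationCustomSimple.from_pretrained("roberta-base")
head_params = sum(p.numel() for p in model.classifier.parameters())
total_params = sum(p.numel() for p in model.parameters())
print(f"classifier head: {head_params:,} of {total_params:,} total parameters")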
Training
!python run_glue.py \
--cache_dir .cache_training \
--model_name_or_path roberta-base \
--custom_model roberta_simple \
--train_file data/train.json \
--validation_file data/valid.json \
--test_file data/test.json \
--per_device_train_batch_size 8 \
--per_device_eval_batch_size 8 \
--do_train \
--do_eval \
--do_predict \
--max_seq_length 128 \
--learning_rate 2e-5 \
--max_eval_samples 2000 \
--max_steps 2500 \
--num_train_epochs 1 \
--save_strategy steps \
--save_steps 250 \
--save_total_limit 5 \
--logging_strategy steps \
--logging_steps 100 \
--eval_steps 250 \
--evaluation_strategy steps \
--metric_for_best_model accuracy \
--greater_is_better True \
--load_best_model_at_end True \
--output_dir out/roberta
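The best checkpoint is selected by accuracy (--metric_for_best_model accuracy), evaluated every --eval_steps. Inside run_glue.py the Trainer is given a compute_metrics function of roughly this shape; this is a sketch assuming the evaluate accuracy metric, not a verbatim excerpt from the script.

import numpy as np
import evaluate

accuracy = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    # eval_pred bundles the model's logits and the gold labels for the eval set
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return accuracy.compute(predictions=predictions, references=labels)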
02/16/2023 15:21:14 - WARNING - __main__ - Process rank: -1, device: cuda:0, n_gpu: 1distributed training: False, 16-bits training: False 02/16/2023 15:21:14 - INFO - __main__ - Training/evaluation parameters TrainingArguments( _n_gpu=1, adafactor=False, adam_beta1=0.9, adam_beta2=0.999, adam_epsilon=1e-08, auto_find_batch_size=False, bf16=False, bf16_full_eval=False, data_seed=None, dataloader_drop_last=False, dataloader_num_workers=0, dataloader_pin_memory=True, ddp_bucket_cap_mb=None, ddp_find_unused_parameters=None, ddp_timeout=1800, debug=[], deepspeed=None, disable_tqdm=False, do_eval=True, do_predict=True, do_train=True, eval_accumulation_steps=None, eval_delay=0, eval_steps=250, evaluation_strategy=steps, fp16=False, fp16_backend=auto, fp16_full_eval=False, fp16_opt_level=O1, fsdp=[], fsdp_min_num_params=0, fsdp_transformer_layer_cls_to_wrap=None, full_determinism=False, gradient_accumulation_steps=1, gradient_checkpointing=False, greater_is_better=True, group_by_length=False, half_precision_backend=auto, hub_model_id=None, hub_private_repo=False, hub_strategy=every_save, hub_token=<HUB_TOKEN>, ignore_data_skip=False, include_inputs_for_metrics=False, jit_mode_eval=False, label_names=None, label_smoothing_factor=0.0, learning_rate=2e-05, length_column_name=length, load_best_model_at_end=True, local_rank=-1, log_level=passive, log_level_replica=passive, log_on_each_node=True, logging_dir=out/roberta/runs/Feb16_15-21-13_DESKTOP-R7JO8BQ, logging_first_step=False, logging_nan_inf_filter=True, logging_steps=100, logging_strategy=steps, lr_scheduler_type=linear, max_grad_norm=1.0, max_steps=2500, metric_for_best_model=accuracy, mp_parameters=, no_cuda=False, num_train_epochs=1.0, optim=adamw_hf, optim_args=None, output_dir=out/roberta, overwrite_output_dir=False, past_index=-1, per_device_eval_batch_size=8, per_device_train_batch_size=8, prediction_loss_only=False, push_to_hub=False, push_to_hub_model_id=None, push_to_hub_organization=None, push_to_hub_token=<PUSH_TO_HUB_TOKEN>, ray_scope=last, remove_unused_columns=True, report_to=[], resume_from_checkpoint=None, run_name=out/roberta, save_on_each_node=False, save_steps=250, save_strategy=steps, save_total_limit=5, seed=42, sharded_ddp=[], skip_memory_metrics=True, tf32=None, torch_compile=False, torch_compile_backend=None, torch_compile_mode=None, torchdynamo=None, tpu_metrics_debug=False, tpu_num_cores=None, use_ipex=False, use_legacy_prediction_loop=False, use_mps_device=False, warmup_ratio=0.0, warmup_steps=0, weight_decay=0.0, xpu_backend=None, ) 02/16/2023 15:21:14 - INFO - __main__ - Checkpoint detected, resuming training at out/roberta/checkpoint-2500. To avoid this behavior, change the `--output_dir` or add `--overwrite_output_dir` to train from scratch. 02/16/2023 15:21:14 - INFO - __main__ - load a local file for train: data/train.json 02/16/2023 15:21:14 - INFO - __main__ - load a local file for validation: data/valid.json 02/16/2023 15:21:14 - INFO - __main__ - load a local file for test: data/test.json 02/16/2023 15:21:14 - WARNING - datasets.builder - Using custom data configuration default-f6e8039906850c57 02/16/2023 15:21:14 - INFO - datasets.info - Loading Dataset Infos from /home/jacob/anaconda3/envs/ugp/lib/python3.10/site-packages/datasets/packaged_modules/json 02/16/2023 15:21:14 - INFO - datasets.builder - Overwrite dataset info from restored data version. 
02/16/2023 15:21:14 - INFO - datasets.info - Loading Dataset info from .cache_training/json/default-f6e8039906850c57/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51 02/16/2023 15:21:14 - WARNING - datasets.builder - Found cached dataset json (/home/jacob/code/university/uczenie_glebokie_w_przetwarzaniu_tekstu/.cache_training/json/default-f6e8039906850c57/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51) 02/16/2023 15:21:14 - INFO - datasets.info - Loading Dataset info from /home/jacob/code/university/uczenie_glebokie_w_przetwarzaniu_tekstu/.cache_training/json/default-f6e8039906850c57/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51 100%|█████████████████████████████████████████████| 3/3 [00:00<00:00, 48.00it/s] [INFO|configuration_utils.py:660] 2023-02-16 15:21:15,174 >> loading configuration file config.json from cache at .cache_training/models--roberta-base/snapshots/ff46155979338ff8063cdad90908b498ab91b181/config.json [INFO|configuration_utils.py:712] 2023-02-16 15:21:15,175 >> Model config RobertaConfig { "_name_or_path": "roberta-base", "architectures": [ "RobertaForMaskedLM" ], "attention_probs_dropout_prob": 0.1, "bos_token_id": 0, "classifier_dropout": null, "eos_token_id": 2, "hidden_act": "gelu", "hidden_dropout_prob": 0.1, "hidden_size": 768, "id2label": { "0": "LABEL_0", "1": "LABEL_1", "2": "LABEL_2", "3": "LABEL_3" }, "initializer_range": 0.02, "intermediate_size": 3072, "label2id": { "LABEL_0": 0, "LABEL_1": 1, "LABEL_2": 2, "LABEL_3": 3 }, "layer_norm_eps": 1e-05, "max_position_embeddings": 514, "model_type": "roberta", "num_attention_heads": 12, "num_hidden_layers": 12, "pad_token_id": 1, "position_embedding_type": "absolute", "transformers_version": "4.26.1", "type_vocab_size": 1, "use_cache": true, "vocab_size": 50265 } [INFO|tokenization_auto.py:458] 2023-02-16 15:21:15,654 >> Could not locate the tokenizer configuration file, will try to use the model config instead. 
[INFO|configuration_utils.py:660] 2023-02-16 15:21:16,123 >> loading configuration file config.json from cache at .cache_training/models--roberta-base/snapshots/ff46155979338ff8063cdad90908b498ab91b181/config.json [INFO|configuration_utils.py:712] 2023-02-16 15:21:16,123 >> Model config RobertaConfig { "_name_or_path": "roberta-base", "architectures": [ "RobertaForMaskedLM" ], "attention_probs_dropout_prob": 0.1, "bos_token_id": 0, "classifier_dropout": null, "eos_token_id": 2, "hidden_act": "gelu", "hidden_dropout_prob": 0.1, "hidden_size": 768, "initializer_range": 0.02, "intermediate_size": 3072, "layer_norm_eps": 1e-05, "max_position_embeddings": 514, "model_type": "roberta", "num_attention_heads": 12, "num_hidden_layers": 12, "pad_token_id": 1, "position_embedding_type": "absolute", "transformers_version": "4.26.1", "type_vocab_size": 1, "use_cache": true, "vocab_size": 50265 } [INFO|tokenization_utils_base.py:1802] 2023-02-16 15:21:17,045 >> loading file vocab.json from cache at .cache_training/models--roberta-base/snapshots/ff46155979338ff8063cdad90908b498ab91b181/vocab.json [INFO|tokenization_utils_base.py:1802] 2023-02-16 15:21:17,045 >> loading file merges.txt from cache at .cache_training/models--roberta-base/snapshots/ff46155979338ff8063cdad90908b498ab91b181/merges.txt [INFO|tokenization_utils_base.py:1802] 2023-02-16 15:21:17,045 >> loading file tokenizer.json from cache at .cache_training/models--roberta-base/snapshots/ff46155979338ff8063cdad90908b498ab91b181/tokenizer.json [INFO|tokenization_utils_base.py:1802] 2023-02-16 15:21:17,045 >> loading file added_tokens.json from cache at None [INFO|tokenization_utils_base.py:1802] 2023-02-16 15:21:17,045 >> loading file special_tokens_map.json from cache at None [INFO|tokenization_utils_base.py:1802] 2023-02-16 15:21:17,045 >> loading file tokenizer_config.json from cache at None [INFO|configuration_utils.py:660] 2023-02-16 15:21:17,045 >> loading configuration file config.json from cache at .cache_training/models--roberta-base/snapshots/ff46155979338ff8063cdad90908b498ab91b181/config.json [INFO|configuration_utils.py:712] 2023-02-16 15:21:17,046 >> Model config RobertaConfig { "_name_or_path": "roberta-base", "architectures": [ "RobertaForMaskedLM" ], "attention_probs_dropout_prob": 0.1, "bos_token_id": 0, "classifier_dropout": null, "eos_token_id": 2, "hidden_act": "gelu", "hidden_dropout_prob": 0.1, "hidden_size": 768, "initializer_range": 0.02, "intermediate_size": 3072, "layer_norm_eps": 1e-05, "max_position_embeddings": 514, "model_type": "roberta", "num_attention_heads": 12, "num_hidden_layers": 12, "pad_token_id": 1, "position_embedding_type": "absolute", "transformers_version": "4.26.1", "type_vocab_size": 1, "use_cache": true, "vocab_size": 50265 } 02/16/2023 15:21:17 - INFO - __main__ - Using hidden states in model: False -------------------------------------------------------- Using hidden: False 02/16/2023 15:21:17 - INFO - __main__ - Using implementation from class: RobertaForSequenceClassificationCustomSimple [INFO|modeling_utils.py:2275] 2023-02-16 15:21:17,101 >> loading weights file pytorch_model.bin from cache at .cache_training/models--roberta-base/snapshots/ff46155979338ff8063cdad90908b498ab91b181/pytorch_model.bin [WARNING|modeling_utils.py:2847] 2023-02-16 15:21:22,965 >> Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassificationCustomSimple: ['lm_head.dense.weight', 'lm_head.bias', 'lm_head.dense.bias', 'lm_head.decoder.weight', 
'lm_head.layer_norm.bias', 'roberta.pooler.dense.weight', 'lm_head.layer_norm.weight', 'roberta.pooler.dense.bias'] - This IS expected if you are initializing RobertaForSequenceClassificationCustomSimple from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model). - This IS NOT expected if you are initializing RobertaForSequenceClassificationCustomSimple from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model). [WARNING|modeling_utils.py:2859] 2023-02-16 15:21:22,965 >> Some weights of RobertaForSequenceClassificationCustomSimple were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense_1.bias', 'classifier.dense_2.weight', 'classifier.out_proj.weight', 'classifier.dense_2.bias', 'classifier.out_proj.bias', 'classifier.dense_1.weight'] You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference. RobertaForSequenceClassificationCustomSimple( (roberta): RobertaModel( (embeddings): RobertaEmbeddings( (word_embeddings): Embedding(50265, 768, padding_idx=1) (position_embeddings): Embedding(514, 768, padding_idx=1) (token_type_embeddings): Embedding(1, 768) (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) (encoder): RobertaEncoder( (layer): ModuleList( (0): RobertaLayer( (attention): RobertaAttention( (self): RobertaSelfAttention( (query): Linear(in_features=768, out_features=768, bias=True) (key): Linear(in_features=768, out_features=768, bias=True) (value): Linear(in_features=768, out_features=768, bias=True) (dropout): Dropout(p=0.1, inplace=False) ) (output): RobertaSelfOutput( (dense): Linear(in_features=768, out_features=768, bias=True) (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (intermediate): RobertaIntermediate( (dense): Linear(in_features=768, out_features=3072, bias=True) (intermediate_act_fn): GELUActivation() ) (output): RobertaOutput( (dense): Linear(in_features=3072, out_features=768, bias=True) (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (1): RobertaLayer( (attention): RobertaAttention( (self): RobertaSelfAttention( (query): Linear(in_features=768, out_features=768, bias=True) (key): Linear(in_features=768, out_features=768, bias=True) (value): Linear(in_features=768, out_features=768, bias=True) (dropout): Dropout(p=0.1, inplace=False) ) (output): RobertaSelfOutput( (dense): Linear(in_features=768, out_features=768, bias=True) (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (intermediate): RobertaIntermediate( (dense): Linear(in_features=768, out_features=3072, bias=True) (intermediate_act_fn): GELUActivation() ) (output): RobertaOutput( (dense): Linear(in_features=3072, out_features=768, bias=True) (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (2): RobertaLayer( (attention): RobertaAttention( (self): RobertaSelfAttention( (query): Linear(in_features=768, out_features=768, bias=True) (key): Linear(in_features=768, out_features=768, bias=True) (value): Linear(in_features=768, out_features=768, bias=True) (dropout): Dropout(p=0.1, 
inplace=False) ) (output): RobertaSelfOutput( (dense): Linear(in_features=768, out_features=768, bias=True) (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (intermediate): RobertaIntermediate( (dense): Linear(in_features=768, out_features=3072, bias=True) (intermediate_act_fn): GELUActivation() ) (output): RobertaOutput( (dense): Linear(in_features=3072, out_features=768, bias=True) (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (3): RobertaLayer( (attention): RobertaAttention( (self): RobertaSelfAttention( (query): Linear(in_features=768, out_features=768, bias=True) (key): Linear(in_features=768, out_features=768, bias=True) (value): Linear(in_features=768, out_features=768, bias=True) (dropout): Dropout(p=0.1, inplace=False) ) (output): RobertaSelfOutput( (dense): Linear(in_features=768, out_features=768, bias=True) (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (intermediate): RobertaIntermediate( (dense): Linear(in_features=768, out_features=3072, bias=True) (intermediate_act_fn): GELUActivation() ) (output): RobertaOutput( (dense): Linear(in_features=3072, out_features=768, bias=True) (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (4): RobertaLayer( (attention): RobertaAttention( (self): RobertaSelfAttention( (query): Linear(in_features=768, out_features=768, bias=True) (key): Linear(in_features=768, out_features=768, bias=True) (value): Linear(in_features=768, out_features=768, bias=True) (dropout): Dropout(p=0.1, inplace=False) ) (output): RobertaSelfOutput( (dense): Linear(in_features=768, out_features=768, bias=True) (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (intermediate): RobertaIntermediate( (dense): Linear(in_features=768, out_features=3072, bias=True) (intermediate_act_fn): GELUActivation() ) (output): RobertaOutput( (dense): Linear(in_features=3072, out_features=768, bias=True) (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (5): RobertaLayer( (attention): RobertaAttention( (self): RobertaSelfAttention( (query): Linear(in_features=768, out_features=768, bias=True) (key): Linear(in_features=768, out_features=768, bias=True) (value): Linear(in_features=768, out_features=768, bias=True) (dropout): Dropout(p=0.1, inplace=False) ) (output): RobertaSelfOutput( (dense): Linear(in_features=768, out_features=768, bias=True) (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (intermediate): RobertaIntermediate( (dense): Linear(in_features=768, out_features=3072, bias=True) (intermediate_act_fn): GELUActivation() ) (output): RobertaOutput( (dense): Linear(in_features=3072, out_features=768, bias=True) (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (6): RobertaLayer( (attention): RobertaAttention( (self): RobertaSelfAttention( (query): Linear(in_features=768, out_features=768, bias=True) (key): Linear(in_features=768, out_features=768, bias=True) (value): Linear(in_features=768, out_features=768, bias=True) (dropout): Dropout(p=0.1, inplace=False) ) (output): RobertaSelfOutput( (dense): Linear(in_features=768, out_features=768, bias=True) (LayerNorm): LayerNorm((768,), 
eps=1e-05, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (intermediate): RobertaIntermediate( (dense): Linear(in_features=768, out_features=3072, bias=True) (intermediate_act_fn): GELUActivation() ) (output): RobertaOutput( (dense): Linear(in_features=3072, out_features=768, bias=True) (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (7): RobertaLayer( (attention): RobertaAttention( (self): RobertaSelfAttention( (query): Linear(in_features=768, out_features=768, bias=True) (key): Linear(in_features=768, out_features=768, bias=True) (value): Linear(in_features=768, out_features=768, bias=True) (dropout): Dropout(p=0.1, inplace=False) ) (output): RobertaSelfOutput( (dense): Linear(in_features=768, out_features=768, bias=True) (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (intermediate): RobertaIntermediate( (dense): Linear(in_features=768, out_features=3072, bias=True) (intermediate_act_fn): GELUActivation() ) (output): RobertaOutput( (dense): Linear(in_features=3072, out_features=768, bias=True) (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (8): RobertaLayer( (attention): RobertaAttention( (self): RobertaSelfAttention( (query): Linear(in_features=768, out_features=768, bias=True) (key): Linear(in_features=768, out_features=768, bias=True) (value): Linear(in_features=768, out_features=768, bias=True) (dropout): Dropout(p=0.1, inplace=False) ) (output): RobertaSelfOutput( (dense): Linear(in_features=768, out_features=768, bias=True) (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (intermediate): RobertaIntermediate( (dense): Linear(in_features=768, out_features=3072, bias=True) (intermediate_act_fn): GELUActivation() ) (output): RobertaOutput( (dense): Linear(in_features=3072, out_features=768, bias=True) (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (9): RobertaLayer( (attention): RobertaAttention( (self): RobertaSelfAttention( (query): Linear(in_features=768, out_features=768, bias=True) (key): Linear(in_features=768, out_features=768, bias=True) (value): Linear(in_features=768, out_features=768, bias=True) (dropout): Dropout(p=0.1, inplace=False) ) (output): RobertaSelfOutput( (dense): Linear(in_features=768, out_features=768, bias=True) (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (intermediate): RobertaIntermediate( (dense): Linear(in_features=768, out_features=3072, bias=True) (intermediate_act_fn): GELUActivation() ) (output): RobertaOutput( (dense): Linear(in_features=3072, out_features=768, bias=True) (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (10): RobertaLayer( (attention): RobertaAttention( (self): RobertaSelfAttention( (query): Linear(in_features=768, out_features=768, bias=True) (key): Linear(in_features=768, out_features=768, bias=True) (value): Linear(in_features=768, out_features=768, bias=True) (dropout): Dropout(p=0.1, inplace=False) ) (output): RobertaSelfOutput( (dense): Linear(in_features=768, out_features=768, bias=True) (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (intermediate): RobertaIntermediate( (dense): 
Linear(in_features=768, out_features=3072, bias=True) (intermediate_act_fn): GELUActivation() ) (output): RobertaOutput( (dense): Linear(in_features=3072, out_features=768, bias=True) (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (11): RobertaLayer( (attention): RobertaAttention( (self): RobertaSelfAttention( (query): Linear(in_features=768, out_features=768, bias=True) (key): Linear(in_features=768, out_features=768, bias=True) (value): Linear(in_features=768, out_features=768, bias=True) (dropout): Dropout(p=0.1, inplace=False) ) (output): RobertaSelfOutput( (dense): Linear(in_features=768, out_features=768, bias=True) (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (intermediate): RobertaIntermediate( (dense): Linear(in_features=768, out_features=3072, bias=True) (intermediate_act_fn): GELUActivation() ) (output): RobertaOutput( (dense): Linear(in_features=3072, out_features=768, bias=True) (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) ) ) ) (classifier): RobertaClassificationHeadCustomSimple( (dense_1): Linear(in_features=768, out_features=3072, bias=True) (dense_2): Linear(in_features=3072, out_features=768, bias=True) (dropout): Dropout(p=0.1, inplace=False) (out_proj): Linear(in_features=768, out_features=4, bias=True) (activation): GELU(approximate='none') ) ) 02/16/2023 15:21:22 - WARNING - datasets.arrow_dataset - Loading cached processed dataset at /home/jacob/code/university/uczenie_glebokie_w_przetwarzaniu_tekstu/.cache_training/json/default-f6e8039906850c57/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51/cache-204a6dc6fcae3352.arrow Running tokenizer on dataset: 0%| | 0/4 [00:00<?, ?ba/s]02/16/2023 15:21:23 - INFO - datasets.arrow_dataset - Caching processed dataset at /home/jacob/code/university/uczenie_glebokie_w_przetwarzaniu_tekstu/.cache_training/json/default-f6e8039906850c57/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51/cache-9091129e58fb62d5.arrow Running tokenizer on dataset: 100%|███████████████| 4/4 [00:00<00:00, 15.86ba/s] 02/16/2023 15:21:23 - WARNING - datasets.arrow_dataset - Loading cached processed dataset at /home/jacob/code/university/uczenie_glebokie_w_przetwarzaniu_tekstu/.cache_training/json/default-f6e8039906850c57/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51/cache-bdfe4224bf4c9f20.arrow 02/16/2023 15:21:23 - INFO - __main__ - Set 500 samples for 0-class 02/16/2023 15:21:23 - INFO - __main__ - Set 500 samples for 1-class 02/16/2023 15:21:23 - INFO - __main__ - Set 500 samples for 2-class 02/16/2023 15:21:23 - INFO - __main__ - Set 500 samples for 3-class 02/16/2023 15:21:23 - INFO - __main__ - Sample 83810 of the training set: {'label': 0, 'text': "Policeman 'saw fatal train crash' An off-duty policeman watched a train plough into a car on a level crossing in Berkshire, killing six people.", 'input_ids': [0, 510, 12589, 5649, 128, 35349, 6484, 2341, 2058, 108, 660, 160, 12, 15593, 20976, 3996, 10, 2341, 2968, 4894, 88, 10, 512, 15, 10, 672, 6724, 1437, 11, 16563, 6, 2429, 411, 82, 4, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'attention_mask': [1, 1, 1, 1, 
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]}. 02/16/2023 15:21:23 - INFO - __main__ - Sample 14592 of the training set: {'label': 1, 'text': 'Silver finale for USA In the last event of the 2004 Olympic Games, the United States track team produced one last surprise. Meb Keflezighi, a native of Eritrea who moved to the United States as ', 'input_ids': [0, 39008, 7712, 13, 2805, 96, 5, 94, 515, 9, 5, 4482, 3336, 3100, 6, 5, 315, 532, 1349, 165, 2622, 65, 94, 2755, 4, 256, 3209, 229, 4550, 23250, 8774, 118, 6, 10, 3763, 9, 24372, 9891, 54, 1410, 7, 5, 315, 532, 25, 1437, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]}. 02/16/2023 15:21:23 - INFO - __main__ - Sample 3278 of the training set: {'label': 3, 'text': 'Compuware Blasts IBM #39;s Legal Tactics Two years ago, IBM was ordered to produce the source code for its products, which Compuware identified as containing its pirated intellectual property. The code was missing. But lo and behold -- last week, they called and said they had it, quot; ...', 'input_ids': [0, 24699, 257, 10680, 2091, 13651, 11510, 849, 3416, 131, 29, 10661, 45689, 1596, 107, 536, 6, 11510, 21, 2740, 7, 2592, 5, 1300, 3260, 13, 63, 785, 6, 61, 10081, 257, 10680, 2006, 25, 8200, 63, 36287, 1070, 9594, 1038, 4, 20, 3260, 21, 1716, 4, 125, 4600, 8, 29308, 480, 94, 186, 6, 51, 373, 8, 26, 51, 56, 24, 6, 39809, 131, 1666, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]}. [INFO|trainer.py:511] 2023-02-16 15:21:27,576 >> max_steps is given, it will override any value given in num_train_epochs [INFO|trainer.py:1972] 2023-02-16 15:21:27,576 >> Loading model from out/roberta/checkpoint-2500. [INFO|trainer.py:710] 2023-02-16 15:21:29,498 >> The following columns in the training set don't have a corresponding argument in `RobertaForSequenceClassificationCustomSimple.forward` and have been ignored: text. If text are not expected by `RobertaForSequenceClassificationCustomSimple.forward`, you can safely ignore this message. 
/home/jacob/anaconda3/envs/ugp/lib/python3.10/site-packages/transformers/optimization.py:306: FutureWarning: This implementation of AdamW is deprecated and will be removed in a future version. Use the PyTorch implementation torch.optim.AdamW instead, or set `no_deprecation_warning=True` to disable this warning warnings.warn( [INFO|trainer.py:1650] 2023-02-16 15:21:31,949 >> ***** Running training ***** [INFO|trainer.py:1651] 2023-02-16 15:21:31,950 >> Num examples = 120000 [INFO|trainer.py:1652] 2023-02-16 15:21:31,950 >> Num Epochs = 1 [INFO|trainer.py:1653] 2023-02-16 15:21:31,950 >> Instantaneous batch size per device = 8 [INFO|trainer.py:1654] 2023-02-16 15:21:31,950 >> Total train batch size (w. parallel, distributed & accumulation) = 8 [INFO|trainer.py:1655] 2023-02-16 15:21:31,950 >> Gradient Accumulation steps = 1 [INFO|trainer.py:1656] 2023-02-16 15:21:31,950 >> Total optimization steps = 2500 [INFO|trainer.py:1657] 2023-02-16 15:21:31,951 >> Number of trainable parameters = 128780548 [INFO|trainer.py:1679] 2023-02-16 15:21:31,951 >> Continuing training from checkpoint, will skip to saved global_step [INFO|trainer.py:1680] 2023-02-16 15:21:31,951 >> Continuing training from epoch 0 [INFO|trainer.py:1681] 2023-02-16 15:21:31,951 >> Continuing training from global step 2500 [INFO|trainer.py:1683] 2023-02-16 15:21:31,951 >> Will skip the first 0 epochs then the first 2500 batches in the first epoch. If this takes a lot of time, you can add the `--ignore_data_skip` flag to your launch command, but you will resume the training on data already seen by your model. Skipping the first batches: 0%| | 0/2500 [00:00<?, ?it/s] Skipping the first batches: 100%|██████████| 2500/2500 [00:03<00:00, 717.10it/s][A 2501it [00:04, 522.91it/s] [A[INFO|trainer.py:1901] 2023-02-16 15:21:36,738 >> Training completed. Do not forget to share your model on huggingface.co/models =) [INFO|trainer.py:2025] 2023-02-16 15:21:36,738 >> Loading best model from out/roberta/checkpoint-2500 (score: 0.9229999780654907). [A{'train_runtime': 5.7972, 'train_samples_per_second': 3449.95, 'train_steps_per_second': 431.244, 'train_loss': 3.2215512862971954e-06, 'epoch': 0.17} 2501it [00:05, 431.57it/s][A [INFO|trainer.py:2709] 2023-02-16 15:21:37,750 >> Saving model checkpoint to out/roberta [INFO|configuration_utils.py:453] 2023-02-16 15:21:37,751 >> Configuration saved in out/roberta/config.json [INFO|modeling_utils.py:1704] 2023-02-16 15:21:38,719 >> Model weights saved in out/roberta/pytorch_model.bin [INFO|tokenization_utils_base.py:2160] 2023-02-16 15:21:38,742 >> tokenizer config file saved in out/roberta/tokenizer_config.json [INFO|tokenization_utils_base.py:2167] 2023-02-16 15:21:38,743 >> Special tokens file saved in out/roberta/special_tokens_map.json ***** train metrics ***** epoch = 0.17 train_loss = 0.0 train_runtime = 0:00:05.79 train_samples = 120000 train_samples_per_second = 3449.95 train_steps_per_second = 431.244 02/16/2023 15:21:38 - INFO - __main__ - *** Evaluate *** [INFO|trainer.py:710] 2023-02-16 15:21:38,862 >> The following columns in the evaluation set don't have a corresponding argument in `RobertaForSequenceClassificationCustomSimple.forward` and have been ignored: text. If text are not expected by `RobertaForSequenceClassificationCustomSimple.forward`, you can safely ignore this message. 
[INFO|trainer.py:2964] 2023-02-16 15:21:38,863 >> ***** Running Evaluation ***** [INFO|trainer.py:2966] 2023-02-16 15:21:38,863 >> Num examples = 2000 [INFO|trainer.py:2969] 2023-02-16 15:21:38,863 >> Batch size = 8 100%|█████████████████████████████████████████| 250/250 [00:16<00:00, 14.75it/s] ***** eval metrics ***** epoch = 0.17 eval_accuracy = 0.923 eval_loss = 0.296 eval_runtime = 0:00:17.06 eval_samples = 2000 eval_samples_per_second = 117.168 eval_steps_per_second = 14.646 02/16/2023 15:21:55 - INFO - __main__ - *** Predict *** [INFO|trainer.py:710] 2023-02-16 15:21:55,934 >> The following columns in the test set don't have a corresponding argument in `RobertaForSequenceClassificationCustomSimple.forward` and have been ignored: text. If text are not expected by `RobertaForSequenceClassificationCustomSimple.forward`, you can safely ignore this message. [INFO|trainer.py:2964] 2023-02-16 15:21:55,935 >> ***** Running Prediction ***** [INFO|trainer.py:2966] 2023-02-16 15:21:55,935 >> Num examples = 3800 [INFO|trainer.py:2969] 2023-02-16 15:21:55,935 >> Batch size = 8 100%|█████████████████████████████████████████| 475/475 [00:32<00:00, 14.74it/s] 02/16/2023 15:22:28 - INFO - __main__ - ***** Predict results None ***** [INFO|modelcard.py:449] 2023-02-16 15:22:28,796 >> Dropping the following result as it does not have all the necessary fields: {'task': {'name': 'Text Classification', 'type': 'text-classification'}, 'metrics': [{'name': 'Accuracy', 'type': 'accuracy', 'value': 0.9229999780654907}]}
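Since the best model and tokenizer were saved to out/roberta, the fine-tuned classifier can also be reloaded directly in Python for ad-hoc predictions before re-running the script in evaluation mode. A minimal sketch, assuming the tokenizer files were saved alongside the model; the example headline is illustrative.

import torch
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("out/roberta")
model = RobertaForSequenceClassificationCustomSimple.from_pretrained("out/roberta")
model.eval()

text = "Silver finale for USA in the last event of the 2004 Olympic Games."
inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=128)
with torch.no_grad():
    logits = model(**inputs).logits
print(int(logits.argmax(dim=-1)))  # predicted label id (0-3)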
Evaluation
!python run_glue.py \
--cache_dir .cache_training \
--model_name_or_path out/roberta \
--custom_model roberta_simple \
--train_file data/train.json \
--validation_file data/valid.json \
--test_file data/test.json \
--per_device_train_batch_size 8 \
--per_device_eval_batch_size 8 \
--do_eval \
--do_predict \
--max_seq_length 128 \
--learning_rate 2e-5 \
--max_eval_samples 2000 \
--max_steps 2500 \
--num_train_epochs 1 \
--save_strategy steps \
--save_steps 250 \
--save_total_limit 5 \
--logging_strategy steps \
--logging_steps 100 \
--eval_steps 250 \
--evaluation_strategy steps \
--metric_for_best_model accuracy \
--greater_is_better True \
--load_best_model_at_end True \
--output_dir out/roberta_results
02/16/2023 16:46:49 - WARNING - __main__ - Process rank: -1, device: cuda:0, n_gpu: 1distributed training: False, 16-bits training: False 02/16/2023 16:46:49 - INFO - __main__ - Training/evaluation parameters TrainingArguments( _n_gpu=1, adafactor=False, adam_beta1=0.9, adam_beta2=0.999, adam_epsilon=1e-08, auto_find_batch_size=False, bf16=False, bf16_full_eval=False, data_seed=None, dataloader_drop_last=False, dataloader_num_workers=0, dataloader_pin_memory=True, ddp_bucket_cap_mb=None, ddp_find_unused_parameters=None, ddp_timeout=1800, debug=[], deepspeed=None, disable_tqdm=False, do_eval=True, do_predict=True, do_train=False, eval_accumulation_steps=None, eval_delay=0, eval_steps=250, evaluation_strategy=steps, fp16=False, fp16_backend=auto, fp16_full_eval=False, fp16_opt_level=O1, fsdp=[], fsdp_min_num_params=0, fsdp_transformer_layer_cls_to_wrap=None, full_determinism=False, gradient_accumulation_steps=1, gradient_checkpointing=False, greater_is_better=True, group_by_length=False, half_precision_backend=auto, hub_model_id=None, hub_private_repo=False, hub_strategy=every_save, hub_token=<HUB_TOKEN>, ignore_data_skip=False, include_inputs_for_metrics=False, jit_mode_eval=False, label_names=None, label_smoothing_factor=0.0, learning_rate=2e-05, length_column_name=length, load_best_model_at_end=True, local_rank=-1, log_level=passive, log_level_replica=passive, log_on_each_node=True, logging_dir=out/roberta_results/runs/Feb16_16-46-48_DESKTOP-R7JO8BQ, logging_first_step=False, logging_nan_inf_filter=True, logging_steps=100, logging_strategy=steps, lr_scheduler_type=linear, max_grad_norm=1.0, max_steps=2500, metric_for_best_model=accuracy, mp_parameters=, no_cuda=False, num_train_epochs=1.0, optim=adamw_hf, optim_args=None, output_dir=out/roberta_results, overwrite_output_dir=False, past_index=-1, per_device_eval_batch_size=8, per_device_train_batch_size=8, prediction_loss_only=False, push_to_hub=False, push_to_hub_model_id=None, push_to_hub_organization=None, push_to_hub_token=<PUSH_TO_HUB_TOKEN>, ray_scope=last, remove_unused_columns=True, report_to=[], resume_from_checkpoint=None, run_name=out/roberta_results, save_on_each_node=False, save_steps=250, save_strategy=steps, save_total_limit=5, seed=42, sharded_ddp=[], skip_memory_metrics=True, tf32=None, torch_compile=False, torch_compile_backend=None, torch_compile_mode=None, torchdynamo=None, tpu_metrics_debug=False, tpu_num_cores=None, use_ipex=False, use_legacy_prediction_loop=False, use_mps_device=False, warmup_ratio=0.0, warmup_steps=0, weight_decay=0.0, xpu_backend=None, ) 02/16/2023 16:46:49 - INFO - __main__ - load a local file for train: data/train.json 02/16/2023 16:46:49 - INFO - __main__ - load a local file for validation: data/valid.json 02/16/2023 16:46:49 - INFO - __main__ - load a local file for test: data/test.json 02/16/2023 16:46:50 - WARNING - datasets.builder - Using custom data configuration default-f6e8039906850c57 02/16/2023 16:46:50 - INFO - datasets.info - Loading Dataset Infos from /home/jacob/anaconda3/envs/ugp/lib/python3.10/site-packages/datasets/packaged_modules/json 02/16/2023 16:46:50 - INFO - datasets.builder - Overwrite dataset info from restored data version. 
02/16/2023 16:46:50 - INFO - datasets.info - Loading Dataset info from .cache_training/json/default-f6e8039906850c57/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51 02/16/2023 16:46:50 - WARNING - datasets.builder - Found cached dataset json (/home/jacob/code/university/uczenie_glebokie_w_przetwarzaniu_tekstu/.cache_training/json/default-f6e8039906850c57/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51) 02/16/2023 16:46:50 - INFO - datasets.info - Loading Dataset info from /home/jacob/code/university/uczenie_glebokie_w_przetwarzaniu_tekstu/.cache_training/json/default-f6e8039906850c57/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51 100%|████████████████████████████████████████████| 3/3 [00:00<00:00, 752.21it/s] [INFO|configuration_utils.py:658] 2023-02-16 16:46:50,276 >> loading configuration file out/roberta/config.json [INFO|configuration_utils.py:712] 2023-02-16 16:46:50,277 >> Model config RobertaConfig { "_name_or_path": "out/roberta", "architectures": [ "RobertaForSequenceClassificationCustomSimple" ], "attention_probs_dropout_prob": 0.1, "bos_token_id": 0, "classifier_dropout": null, "eos_token_id": 2, "hidden_act": "gelu", "hidden_dropout_prob": 0.1, "hidden_size": 768, "id2label": { "0": 0, "1": 1, "2": 2, "3": 3 }, "initializer_range": 0.02, "intermediate_size": 3072, "label2id": { "0": 0, "1": 1, "2": 2, "3": 3 }, "layer_norm_eps": 1e-05, "max_position_embeddings": 514, "model_type": "roberta", "num_attention_heads": 12, "num_hidden_layers": 12, "pad_token_id": 1, "position_embedding_type": "absolute", "problem_type": "single_label_classification", "torch_dtype": "float32", "transformers_version": "4.26.1", "type_vocab_size": 1, "use_cache": true, "use_hidden_states": false, "vocab_size": 50265 } [INFO|tokenization_utils_base.py:1800] 2023-02-16 16:46:50,283 >> loading file vocab.json [INFO|tokenization_utils_base.py:1800] 2023-02-16 16:46:50,283 >> loading file merges.txt [INFO|tokenization_utils_base.py:1800] 2023-02-16 16:46:50,284 >> loading file tokenizer.json [INFO|tokenization_utils_base.py:1800] 2023-02-16 16:46:50,284 >> loading file added_tokens.json [INFO|tokenization_utils_base.py:1800] 2023-02-16 16:46:50,284 >> loading file special_tokens_map.json [INFO|tokenization_utils_base.py:1800] 2023-02-16 16:46:50,284 >> loading file tokenizer_config.json 02/16/2023 16:46:50 - INFO - __main__ - Using hidden states in model: False -------------------------------------------------------- Using hidden: False 02/16/2023 16:46:50 - INFO - __main__ - Using implementation from class: RobertaForSequenceClassificationCustomSimple [INFO|modeling_utils.py:2272] 2023-02-16 16:46:50,339 >> loading weights file out/roberta/pytorch_model.bin [INFO|modeling_utils.py:2857] 2023-02-16 16:46:52,079 >> All model checkpoint weights were used when initializing RobertaForSequenceClassificationCustomSimple. [INFO|modeling_utils.py:2865] 2023-02-16 16:46:52,079 >> All the weights of RobertaForSequenceClassificationCustomSimple were initialized from the model checkpoint at out/roberta. If your task is similar to the task the model of the checkpoint was trained on, you can already use RobertaForSequenceClassificationCustomSimple for predictions without further training. 
RobertaForSequenceClassificationCustomSimple( (roberta): RobertaModel( (embeddings): RobertaEmbeddings( (word_embeddings): Embedding(50265, 768, padding_idx=1) (position_embeddings): Embedding(514, 768, padding_idx=1) (token_type_embeddings): Embedding(1, 768) (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) (encoder): RobertaEncoder( (layer): ModuleList( (0): RobertaLayer( (attention): RobertaAttention( (self): RobertaSelfAttention( (query): Linear(in_features=768, out_features=768, bias=True) (key): Linear(in_features=768, out_features=768, bias=True) (value): Linear(in_features=768, out_features=768, bias=True) (dropout): Dropout(p=0.1, inplace=False) ) (output): RobertaSelfOutput( (dense): Linear(in_features=768, out_features=768, bias=True) (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (intermediate): RobertaIntermediate( (dense): Linear(in_features=768, out_features=3072, bias=True) (intermediate_act_fn): GELUActivation() ) (output): RobertaOutput( (dense): Linear(in_features=3072, out_features=768, bias=True) (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (1): RobertaLayer( (attention): RobertaAttention( (self): RobertaSelfAttention( (query): Linear(in_features=768, out_features=768, bias=True) (key): Linear(in_features=768, out_features=768, bias=True) (value): Linear(in_features=768, out_features=768, bias=True) (dropout): Dropout(p=0.1, inplace=False) ) (output): RobertaSelfOutput( (dense): Linear(in_features=768, out_features=768, bias=True) (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (intermediate): RobertaIntermediate( (dense): Linear(in_features=768, out_features=3072, bias=True) (intermediate_act_fn): GELUActivation() ) (output): RobertaOutput( (dense): Linear(in_features=3072, out_features=768, bias=True) (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (2): RobertaLayer( (attention): RobertaAttention( (self): RobertaSelfAttention( (query): Linear(in_features=768, out_features=768, bias=True) (key): Linear(in_features=768, out_features=768, bias=True) (value): Linear(in_features=768, out_features=768, bias=True) (dropout): Dropout(p=0.1, inplace=False) ) (output): RobertaSelfOutput( (dense): Linear(in_features=768, out_features=768, bias=True) (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (intermediate): RobertaIntermediate( (dense): Linear(in_features=768, out_features=3072, bias=True) (intermediate_act_fn): GELUActivation() ) (output): RobertaOutput( (dense): Linear(in_features=3072, out_features=768, bias=True) (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (3): RobertaLayer( (attention): RobertaAttention( (self): RobertaSelfAttention( (query): Linear(in_features=768, out_features=768, bias=True) (key): Linear(in_features=768, out_features=768, bias=True) (value): Linear(in_features=768, out_features=768, bias=True) (dropout): Dropout(p=0.1, inplace=False) ) (output): RobertaSelfOutput( (dense): Linear(in_features=768, out_features=768, bias=True) (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (intermediate): RobertaIntermediate( (dense): Linear(in_features=768, 
out_features=3072, bias=True) (intermediate_act_fn): GELUActivation() ) (output): RobertaOutput( (dense): Linear(in_features=3072, out_features=768, bias=True) (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (4): RobertaLayer( (attention): RobertaAttention( (self): RobertaSelfAttention( (query): Linear(in_features=768, out_features=768, bias=True) (key): Linear(in_features=768, out_features=768, bias=True) (value): Linear(in_features=768, out_features=768, bias=True) (dropout): Dropout(p=0.1, inplace=False) ) (output): RobertaSelfOutput( (dense): Linear(in_features=768, out_features=768, bias=True) (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (intermediate): RobertaIntermediate( (dense): Linear(in_features=768, out_features=3072, bias=True) (intermediate_act_fn): GELUActivation() ) (output): RobertaOutput( (dense): Linear(in_features=3072, out_features=768, bias=True) (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (5): RobertaLayer( (attention): RobertaAttention( (self): RobertaSelfAttention( (query): Linear(in_features=768, out_features=768, bias=True) (key): Linear(in_features=768, out_features=768, bias=True) (value): Linear(in_features=768, out_features=768, bias=True) (dropout): Dropout(p=0.1, inplace=False) ) (output): RobertaSelfOutput( (dense): Linear(in_features=768, out_features=768, bias=True) (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (intermediate): RobertaIntermediate( (dense): Linear(in_features=768, out_features=3072, bias=True) (intermediate_act_fn): GELUActivation() ) (output): RobertaOutput( (dense): Linear(in_features=3072, out_features=768, bias=True) (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (6): RobertaLayer( (attention): RobertaAttention( (self): RobertaSelfAttention( (query): Linear(in_features=768, out_features=768, bias=True) (key): Linear(in_features=768, out_features=768, bias=True) (value): Linear(in_features=768, out_features=768, bias=True) (dropout): Dropout(p=0.1, inplace=False) ) (output): RobertaSelfOutput( (dense): Linear(in_features=768, out_features=768, bias=True) (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (intermediate): RobertaIntermediate( (dense): Linear(in_features=768, out_features=3072, bias=True) (intermediate_act_fn): GELUActivation() ) (output): RobertaOutput( (dense): Linear(in_features=3072, out_features=768, bias=True) (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (7): RobertaLayer( (attention): RobertaAttention( (self): RobertaSelfAttention( (query): Linear(in_features=768, out_features=768, bias=True) (key): Linear(in_features=768, out_features=768, bias=True) (value): Linear(in_features=768, out_features=768, bias=True) (dropout): Dropout(p=0.1, inplace=False) ) (output): RobertaSelfOutput( (dense): Linear(in_features=768, out_features=768, bias=True) (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (intermediate): RobertaIntermediate( (dense): Linear(in_features=768, out_features=3072, bias=True) (intermediate_act_fn): GELUActivation() ) (output): RobertaOutput( (dense): Linear(in_features=3072, 
out_features=768, bias=True) (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (8): RobertaLayer( (attention): RobertaAttention( (self): RobertaSelfAttention( (query): Linear(in_features=768, out_features=768, bias=True) (key): Linear(in_features=768, out_features=768, bias=True) (value): Linear(in_features=768, out_features=768, bias=True) (dropout): Dropout(p=0.1, inplace=False) ) (output): RobertaSelfOutput( (dense): Linear(in_features=768, out_features=768, bias=True) (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (intermediate): RobertaIntermediate( (dense): Linear(in_features=768, out_features=3072, bias=True) (intermediate_act_fn): GELUActivation() ) (output): RobertaOutput( (dense): Linear(in_features=3072, out_features=768, bias=True) (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (9): RobertaLayer( (attention): RobertaAttention( (self): RobertaSelfAttention( (query): Linear(in_features=768, out_features=768, bias=True) (key): Linear(in_features=768, out_features=768, bias=True) (value): Linear(in_features=768, out_features=768, bias=True) (dropout): Dropout(p=0.1, inplace=False) ) (output): RobertaSelfOutput( (dense): Linear(in_features=768, out_features=768, bias=True) (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (intermediate): RobertaIntermediate( (dense): Linear(in_features=768, out_features=3072, bias=True) (intermediate_act_fn): GELUActivation() ) (output): RobertaOutput( (dense): Linear(in_features=3072, out_features=768, bias=True) (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (10): RobertaLayer( (attention): RobertaAttention( (self): RobertaSelfAttention( (query): Linear(in_features=768, out_features=768, bias=True) (key): Linear(in_features=768, out_features=768, bias=True) (value): Linear(in_features=768, out_features=768, bias=True) (dropout): Dropout(p=0.1, inplace=False) ) (output): RobertaSelfOutput( (dense): Linear(in_features=768, out_features=768, bias=True) (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (intermediate): RobertaIntermediate( (dense): Linear(in_features=768, out_features=3072, bias=True) (intermediate_act_fn): GELUActivation() ) (output): RobertaOutput( (dense): Linear(in_features=3072, out_features=768, bias=True) (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (11): RobertaLayer( (attention): RobertaAttention( (self): RobertaSelfAttention( (query): Linear(in_features=768, out_features=768, bias=True) (key): Linear(in_features=768, out_features=768, bias=True) (value): Linear(in_features=768, out_features=768, bias=True) (dropout): Dropout(p=0.1, inplace=False) ) (output): RobertaSelfOutput( (dense): Linear(in_features=768, out_features=768, bias=True) (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (intermediate): RobertaIntermediate( (dense): Linear(in_features=768, out_features=3072, bias=True) (intermediate_act_fn): GELUActivation() ) (output): RobertaOutput( (dense): Linear(in_features=3072, out_features=768, bias=True) (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) ) ) 
) (classifier): RobertaClassificationHeadCustomSimple( (dense_1): Linear(in_features=768, out_features=3072, bias=True) (dense_2): Linear(in_features=3072, out_features=768, bias=True) (dropout): Dropout(p=0.1, inplace=False) (out_proj): Linear(in_features=768, out_features=4, bias=True) (activation): GELU(approximate='none') ) ) 02/16/2023 16:46:52 - WARNING - datasets.arrow_dataset - Loading cached processed dataset at /home/jacob/code/university/uczenie_glebokie_w_przetwarzaniu_tekstu/.cache_training/json/default-f6e8039906850c57/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51/cache-df96547ec55a44ce.arrow 02/16/2023 16:46:52 - WARNING - datasets.arrow_dataset - Loading cached processed dataset at /home/jacob/code/university/uczenie_glebokie_w_przetwarzaniu_tekstu/.cache_training/json/default-f6e8039906850c57/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51/cache-67b1030adaffbb4a.arrow 02/16/2023 16:46:52 - WARNING - datasets.arrow_dataset - Loading cached processed dataset at /home/jacob/code/university/uczenie_glebokie_w_przetwarzaniu_tekstu/.cache_training/json/default-f6e8039906850c57/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51/cache-ae09252df5e9bac1.arrow 02/16/2023 16:46:52 - INFO - __main__ - Set 500 samples for 0-class 02/16/2023 16:46:52 - INFO - __main__ - Set 500 samples for 1-class 02/16/2023 16:46:52 - INFO - __main__ - Set 500 samples for 2-class 02/16/2023 16:46:52 - INFO - __main__ - Set 500 samples for 3-class [INFO|trainer.py:511] 2023-02-16 16:46:55,346 >> max_steps is given, it will override any value given in num_train_epochs 02/16/2023 16:46:55 - INFO - __main__ - *** Evaluate *** [INFO|trainer.py:710] 2023-02-16 16:46:55,346 >> The following columns in the evaluation set don't have a corresponding argument in `RobertaForSequenceClassificationCustomSimple.forward` and have been ignored: text. If text are not expected by `RobertaForSequenceClassificationCustomSimple.forward`, you can safely ignore this message. [INFO|trainer.py:2964] 2023-02-16 16:46:55,348 >> ***** Running Evaluation ***** [INFO|trainer.py:2966] 2023-02-16 16:46:55,348 >> Num examples = 2000 [INFO|trainer.py:2969] 2023-02-16 16:46:55,348 >> Batch size = 8 100%|█████████████████████████████████████████| 250/250 [00:17<00:00, 14.53it/s] ***** eval metrics ***** eval_accuracy = 0.923 eval_loss = 0.296 eval_runtime = 0:00:17.81 eval_samples = 2000 eval_samples_per_second = 112.255 eval_steps_per_second = 14.032 02/16/2023 16:47:13 - INFO - __main__ - *** Predict *** [INFO|trainer.py:710] 2023-02-16 16:47:13,166 >> The following columns in the test set don't have a corresponding argument in `RobertaForSequenceClassificationCustomSimple.forward` and have been ignored: text. If text are not expected by `RobertaForSequenceClassificationCustomSimple.forward`, you can safely ignore this message. [INFO|trainer.py:2964] 2023-02-16 16:47:13,167 >> ***** Running Prediction ***** [INFO|trainer.py:2966] 2023-02-16 16:47:13,167 >> Num examples = 3800 [INFO|trainer.py:2969] 2023-02-16 16:47:13,167 >> Batch size = 8 100%|█████████████████████████████████████████| 475/475 [00:32<00:00, 14.53it/s] 02/16/2023 16:47:45 - INFO - __main__ - ***** Predict results None ***** [INFO|modelcard.py:449] 2023-02-16 16:47:46,438 >> Dropping the following result as it does not have all the necessary fields: {'task': {'name': 'Text Classification', 'type': 'text-classification'}}
Results
!cat out/roberta_results/eval_results.json | jq .eval_accuracy
0.9229999780654907
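The same number can be read back without jq; a minimal sketch, assuming the eval_results.json layout written by the evaluation run above:
import json
# Path produced by the RoBERTa evaluation run above (assumption: unchanged output_dir).
with open("out/roberta_results/eval_results.json") as f:
    results = json.load(f)
print(results["eval_accuracy"])  # ~0.923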
GPT2
Modifications
- Custom classification head with 3 dense layers
- Using hidden states from last layer
Code
import torch
from torch import nn
from transformers import GPT2PreTrainedModel, GPT2Model
from transformers.modeling_outputs import SequenceClassifierOutputWithPast
class GPT2ForSequenceClassification(GPT2PreTrainedModel):
def __init__(self, config):
super().__init__(config)
self.num_labels = config.num_labels
self.transformer = GPT2Model(config)
self.score = nn.Linear(config.n_embd, self.num_labels, bias=False)
# Model parallel
self.model_parallel = False
self.device_map = None
# Initialize weights and apply final processing
self.post_init()
class GPT2ClassificationHeadCustom(nn.Module):
def __init__(self, config):
super().__init__()
hidden_size = config.n_embd
self.dense_1_input = nn.Linear(hidden_size, 2 * hidden_size)
self.dense_1_hidden = nn.Linear(hidden_size, 2 * hidden_size)
self.dense_2 = nn.Linear(4 * hidden_size, hidden_size)
self.dropout = nn.Dropout(config.resid_pdrop)
self.out_proj = nn.Linear(hidden_size, config.num_labels, bias=False)
def forward(self, x, **kwargs):
if 'hidden_states' in kwargs and kwargs['hidden_states'] is not None:
hidden = kwargs['hidden_states'][-1]
else:
hidden = torch.zeros(x.size(), dtype=x.dtype, device=x.device)
x = self.dense_1_input(x)
x = torch.relu(x)
x = self.dropout(x)
hidden = self.dense_1_hidden(hidden)
hidden = torch.relu(hidden)
hidden = self.dropout(hidden)
x = torch.cat((x, hidden), dim=2)
x = self.dense_2(x)
x = torch.relu(x)
x = self.dropout(x)
x = self.out_proj(x)
return x
class GPT2ForSequenceClassificationCustom(GPT2ForSequenceClassification):
def __init__(self, config):
super().__init__(config)
self.num_labels = config.num_labels
self.transformer = GPT2Model(config)
self.score = GPT2ClassificationHeadCustom(config)
self.init_weights()
# Model parallel
self.model_parallel = False
self.device_map = None
def forward(
self,
input_ids=None,
past_key_values=None,
attention_mask=None,
token_type_ids=None,
position_ids=None,
head_mask=None,
inputs_embeds=None,
labels=None,
use_cache=None,
output_attentions=None,
output_hidden_states=None,
return_dict=None,
):
r"""
labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`):
Labels for computing the sequence classification/regression loss. Indices should be in :obj:`[0, ...,
config.num_labels - 1]`. If :obj:`config.num_labels == 1` a regression loss is computed (Mean-Square loss),
If :obj:`config.num_labels > 1` a classification loss is computed (Cross-Entropy).
"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
transformer_outputs = self.transformer(
input_ids,
past_key_values=past_key_values,
attention_mask=attention_mask,
token_type_ids=token_type_ids,
position_ids=position_ids,
head_mask=head_mask,
inputs_embeds=inputs_embeds,
use_cache=use_cache,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
hidden_states = transformer_outputs[0]
if return_dict:
logits = self.score(hidden_states, hidden_states=transformer_outputs.hidden_states)
else:
raise NotImplementedError('Not implemented for using non-dictionary object')
if input_ids is not None:
batch_size, sequence_length = input_ids.shape[:2]
else:
batch_size, sequence_length = inputs_embeds.shape[:2]
assert (
self.config.pad_token_id is not None or batch_size == 1
), "Cannot handle batch sizes > 1 if no padding token is defined."
if self.config.pad_token_id is None:
sequence_lengths = -1
else:
if input_ids is not None:
sequence_lengths = torch.ne(input_ids, self.config.pad_token_id).sum(-1) - 1
else:
sequence_lengths = -1
pooled_logits = logits[range(batch_size), sequence_lengths]
loss = None
if labels is not None:
if self.num_labels == 1:
# We are doing regression
loss_fct = nn.MSELoss()
loss = loss_fct(pooled_logits.view(-1), labels.to(self.dtype).view(-1))
else:
loss_fct = nn.CrossEntropyLoss()
loss = loss_fct(pooled_logits.view(-1, self.num_labels), labels.view(-1))
if not return_dict:
output = (pooled_logits,) + transformer_outputs[1:]
return ((loss,) + output) if loss is not None else output
return SequenceClassifierOutputWithPast(
loss=loss,
logits=pooled_logits,
past_key_values=transformer_outputs.past_key_values,
hidden_states=transformer_outputs.hidden_states,
attentions=transformer_outputs.attentions,
)
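Note that the hidden branch of GPT2ClassificationHeadCustom only sees real activations when the backbone is asked to return hidden states; otherwise it falls back to zeros. A minimal usage sketch (not part of the training script; the 4-label setting and sample texts are placeholder assumptions) that exercises the intended path:
import torch
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token  # GPT-2 has no pad token by default

model = GPT2ForSequenceClassificationCustom.from_pretrained("gpt2", num_labels=4)
model.config.pad_token_id = tokenizer.pad_token_id  # needed for batch sizes > 1
model.eval()

batch = tokenizer(["sample text one", "sample text two"], padding=True, return_tensors="pt")

with torch.no_grad():
    # output_hidden_states=True makes transformer_outputs.hidden_states a tuple,
    # so dense_1_hidden receives hidden_states[-1] instead of the zero fallback.
    outputs = model(**batch, output_hidden_states=True)

print(outputs.logits.shape)  # (batch_size, num_labels), pooled at the last non-pad token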
Model
GPT2ForSequenceClassificationCustom.from_pretrained('gpt2')
Downloading (…)lve/main/config.json: 0%| | 0.00/665 [00:00<?, ?B/s]
Some weights of GPT2ForSequenceClassificationCustom were not initialized from the model checkpoint at gpt2 and are newly initialized: ['h.10.attn.masked_bias', 'h.2.attn.masked_bias', 'h.5.attn.masked_bias', 'score.dense_2.weight', 'h.9.attn.masked_bias', 'score.dense_1_input.bias', 'score.out_proj.weight', 'h.7.attn.masked_bias', 'h.4.attn.masked_bias', 'h.3.attn.masked_bias', 'h.11.attn.masked_bias', 'h.6.attn.masked_bias', 'h.8.attn.masked_bias', 'score.dense_1_hidden.weight', 'h.1.attn.masked_bias', 'h.0.attn.masked_bias', 'score.dense_1_input.weight', 'score.dense_1_hidden.bias', 'score.dense_2.bias'] You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
GPT2ForSequenceClassificationCustom( (transformer): GPT2Model( (wte): Embedding(50257, 768) (wpe): Embedding(1024, 768) (drop): Dropout(p=0.1, inplace=False) (h): ModuleList( (0): GPT2Block( (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (attn): GPT2Attention( (c_attn): Conv1D() (c_proj): Conv1D() (attn_dropout): Dropout(p=0.1, inplace=False) (resid_dropout): Dropout(p=0.1, inplace=False) ) (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (mlp): GPT2MLP( (c_fc): Conv1D() (c_proj): Conv1D() (act): NewGELUActivation() (dropout): Dropout(p=0.1, inplace=False) ) ) (1): GPT2Block( (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (attn): GPT2Attention( (c_attn): Conv1D() (c_proj): Conv1D() (attn_dropout): Dropout(p=0.1, inplace=False) (resid_dropout): Dropout(p=0.1, inplace=False) ) (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (mlp): GPT2MLP( (c_fc): Conv1D() (c_proj): Conv1D() (act): NewGELUActivation() (dropout): Dropout(p=0.1, inplace=False) ) ) (2): GPT2Block( (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (attn): GPT2Attention( (c_attn): Conv1D() (c_proj): Conv1D() (attn_dropout): Dropout(p=0.1, inplace=False) (resid_dropout): Dropout(p=0.1, inplace=False) ) (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (mlp): GPT2MLP( (c_fc): Conv1D() (c_proj): Conv1D() (act): NewGELUActivation() (dropout): Dropout(p=0.1, inplace=False) ) ) (3): GPT2Block( (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (attn): GPT2Attention( (c_attn): Conv1D() (c_proj): Conv1D() (attn_dropout): Dropout(p=0.1, inplace=False) (resid_dropout): Dropout(p=0.1, inplace=False) ) (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (mlp): GPT2MLP( (c_fc): Conv1D() (c_proj): Conv1D() (act): NewGELUActivation() (dropout): Dropout(p=0.1, inplace=False) ) ) (4): GPT2Block( (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (attn): GPT2Attention( (c_attn): Conv1D() (c_proj): Conv1D() (attn_dropout): Dropout(p=0.1, inplace=False) (resid_dropout): Dropout(p=0.1, inplace=False) ) (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (mlp): GPT2MLP( (c_fc): Conv1D() (c_proj): Conv1D() (act): NewGELUActivation() (dropout): Dropout(p=0.1, inplace=False) ) ) (5): GPT2Block( (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (attn): GPT2Attention( (c_attn): Conv1D() (c_proj): Conv1D() (attn_dropout): Dropout(p=0.1, inplace=False) (resid_dropout): Dropout(p=0.1, inplace=False) ) (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (mlp): GPT2MLP( (c_fc): Conv1D() (c_proj): Conv1D() (act): NewGELUActivation() (dropout): Dropout(p=0.1, inplace=False) ) ) (6): GPT2Block( (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (attn): GPT2Attention( (c_attn): Conv1D() (c_proj): Conv1D() (attn_dropout): Dropout(p=0.1, inplace=False) (resid_dropout): Dropout(p=0.1, inplace=False) ) (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (mlp): GPT2MLP( (c_fc): Conv1D() (c_proj): Conv1D() (act): NewGELUActivation() (dropout): Dropout(p=0.1, inplace=False) ) ) (7): GPT2Block( (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (attn): GPT2Attention( (c_attn): Conv1D() (c_proj): Conv1D() (attn_dropout): Dropout(p=0.1, inplace=False) (resid_dropout): Dropout(p=0.1, inplace=False) ) (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (mlp): GPT2MLP( (c_fc): Conv1D() (c_proj): Conv1D() (act): NewGELUActivation() (dropout): Dropout(p=0.1, inplace=False) ) ) (8): GPT2Block( 
(ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (attn): GPT2Attention( (c_attn): Conv1D() (c_proj): Conv1D() (attn_dropout): Dropout(p=0.1, inplace=False) (resid_dropout): Dropout(p=0.1, inplace=False) ) (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (mlp): GPT2MLP( (c_fc): Conv1D() (c_proj): Conv1D() (act): NewGELUActivation() (dropout): Dropout(p=0.1, inplace=False) ) ) (9): GPT2Block( (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (attn): GPT2Attention( (c_attn): Conv1D() (c_proj): Conv1D() (attn_dropout): Dropout(p=0.1, inplace=False) (resid_dropout): Dropout(p=0.1, inplace=False) ) (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (mlp): GPT2MLP( (c_fc): Conv1D() (c_proj): Conv1D() (act): NewGELUActivation() (dropout): Dropout(p=0.1, inplace=False) ) ) (10): GPT2Block( (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (attn): GPT2Attention( (c_attn): Conv1D() (c_proj): Conv1D() (attn_dropout): Dropout(p=0.1, inplace=False) (resid_dropout): Dropout(p=0.1, inplace=False) ) (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (mlp): GPT2MLP( (c_fc): Conv1D() (c_proj): Conv1D() (act): NewGELUActivation() (dropout): Dropout(p=0.1, inplace=False) ) ) (11): GPT2Block( (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (attn): GPT2Attention( (c_attn): Conv1D() (c_proj): Conv1D() (attn_dropout): Dropout(p=0.1, inplace=False) (resid_dropout): Dropout(p=0.1, inplace=False) ) (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (mlp): GPT2MLP( (c_fc): Conv1D() (c_proj): Conv1D() (act): NewGELUActivation() (dropout): Dropout(p=0.1, inplace=False) ) ) ) (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True) ) (score): GPT2ClassificationHeadCustom( (dense_1_input): Linear(in_features=768, out_features=1536, bias=True) (dense_1_hidden): Linear(in_features=768, out_features=1536, bias=True) (dense_2): Linear(in_features=3072, out_features=768, bias=True) (dropout): Dropout(p=0.1, inplace=False) (out_proj): Linear(in_features=768, out_features=2, bias=False) ) )
Training
!python run_glue.py \
--cache_dir .cache_training \
--model_name_or_path gpt2 \
--custom_model gpt2_hidden \
--train_file data/train.json \
--validation_file data/valid.json \
--test_file data/test.json \
--per_device_train_batch_size 8 \
--per_device_eval_batch_size 8 \
--do_train \
--do_eval \
--max_seq_length 128 \
--learning_rate 2e-5 \
--max_eval_samples 2000 \
--max_steps 2500 \
--num_train_epochs 1 \
--save_strategy steps \
--save_steps 250 \
--save_total_limit 5 \
--logging_strategy steps \
--logging_steps 100 \
--eval_steps 250 \
--evaluation_strategy steps \
--metric_for_best_model accuracy \
--greater_is_better True \
--load_best_model_at_end True \
--output_dir out/gpt2
02/16/2023 15:22:37 - WARNING - __main__ - Process rank: -1, device: cuda:0, n_gpu: 1distributed training: False, 16-bits training: False 02/16/2023 15:22:37 - INFO - __main__ - Training/evaluation parameters TrainingArguments( _n_gpu=1, adafactor=False, adam_beta1=0.9, adam_beta2=0.999, adam_epsilon=1e-08, auto_find_batch_size=False, bf16=False, bf16_full_eval=False, data_seed=None, dataloader_drop_last=False, dataloader_num_workers=0, dataloader_pin_memory=True, ddp_bucket_cap_mb=None, ddp_find_unused_parameters=None, ddp_timeout=1800, debug=[], deepspeed=None, disable_tqdm=False, do_eval=True, do_predict=False, do_train=True, eval_accumulation_steps=None, eval_delay=0, eval_steps=250, evaluation_strategy=steps, fp16=False, fp16_backend=auto, fp16_full_eval=False, fp16_opt_level=O1, fsdp=[], fsdp_min_num_params=0, fsdp_transformer_layer_cls_to_wrap=None, full_determinism=False, gradient_accumulation_steps=1, gradient_checkpointing=False, greater_is_better=True, group_by_length=False, half_precision_backend=auto, hub_model_id=None, hub_private_repo=False, hub_strategy=every_save, hub_token=<HUB_TOKEN>, ignore_data_skip=False, include_inputs_for_metrics=False, jit_mode_eval=False, label_names=None, label_smoothing_factor=0.0, learning_rate=2e-05, length_column_name=length, load_best_model_at_end=True, local_rank=-1, log_level=passive, log_level_replica=passive, log_on_each_node=True, logging_dir=out/gpt2/runs/Feb16_15-22-36_DESKTOP-R7JO8BQ, logging_first_step=False, logging_nan_inf_filter=True, logging_steps=100, logging_strategy=steps, lr_scheduler_type=linear, max_grad_norm=1.0, max_steps=2500, metric_for_best_model=accuracy, mp_parameters=, no_cuda=False, num_train_epochs=1.0, optim=adamw_hf, optim_args=None, output_dir=out/gpt2, overwrite_output_dir=False, past_index=-1, per_device_eval_batch_size=8, per_device_train_batch_size=8, prediction_loss_only=False, push_to_hub=False, push_to_hub_model_id=None, push_to_hub_organization=None, push_to_hub_token=<PUSH_TO_HUB_TOKEN>, ray_scope=last, remove_unused_columns=True, report_to=[], resume_from_checkpoint=None, run_name=out/gpt2, save_on_each_node=False, save_steps=250, save_strategy=steps, save_total_limit=5, seed=42, sharded_ddp=[], skip_memory_metrics=True, tf32=None, torch_compile=False, torch_compile_backend=None, torch_compile_mode=None, torchdynamo=None, tpu_metrics_debug=False, tpu_num_cores=None, use_ipex=False, use_legacy_prediction_loop=False, use_mps_device=False, warmup_ratio=0.0, warmup_steps=0, weight_decay=0.0, xpu_backend=None, ) 02/16/2023 15:22:37 - INFO - __main__ - Checkpoint detected, resuming training at out/gpt2/checkpoint-2500. To avoid this behavior, change the `--output_dir` or add `--overwrite_output_dir` to train from scratch. 
02/16/2023 15:22:37 - INFO - __main__ - load a local file for train: data/train.json 02/16/2023 15:22:37 - INFO - __main__ - load a local file for validation: data/valid.json 02/16/2023 15:22:37 - WARNING - datasets.builder - Using custom data configuration default-e10a382a423bbb9a 02/16/2023 15:22:37 - INFO - datasets.info - Loading Dataset Infos from /home/jacob/anaconda3/envs/ugp/lib/python3.10/site-packages/datasets/packaged_modules/json 02/16/2023 15:22:37 - INFO - datasets.builder - Generating dataset json (/home/jacob/code/university/uczenie_glebokie_w_przetwarzaniu_tekstu/.cache_training/json/default-e10a382a423bbb9a/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51) Downloading and preparing dataset json/default to /home/jacob/code/university/uczenie_glebokie_w_przetwarzaniu_tekstu/.cache_training/json/default-e10a382a423bbb9a/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51... Downloading data files: 100%|██████████████████| 2/2 [00:00<00:00, 14820.86it/s] 02/16/2023 15:22:37 - INFO - datasets.download.download_manager - Downloading took 0.0 min 02/16/2023 15:22:37 - INFO - datasets.download.download_manager - Checksum Computation took 0.0 min Extracting data files: 100%|████████████████████| 2/2 [00:00<00:00, 2476.71it/s] 02/16/2023 15:22:37 - INFO - datasets.utils.info_utils - Unable to verify checksums. 02/16/2023 15:22:37 - INFO - datasets.builder - Generating train split 02/16/2023 15:22:37 - INFO - datasets.builder - Generating validation split 02/16/2023 15:22:37 - INFO - datasets.utils.info_utils - Unable to verify splits sizes. Dataset json downloaded and prepared to /home/jacob/code/university/uczenie_glebokie_w_przetwarzaniu_tekstu/.cache_training/json/default-e10a382a423bbb9a/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51. Subsequent calls will reuse this data. 100%|████████████████████████████████████████████| 2/2 [00:00<00:00, 642.61it/s] [INFO|configuration_utils.py:660] 2023-02-16 15:22:38,465 >> loading configuration file config.json from cache at .cache_training/models--gpt2/snapshots/e7da7f221d5bf496a48136c0cd264e630fe9fcc8/config.json [INFO|configuration_utils.py:712] 2023-02-16 15:22:38,465 >> Model config GPT2Config { "_name_or_path": "gpt2", "activation_function": "gelu_new", "architectures": [ "GPT2LMHeadModel" ], "attn_pdrop": 0.1, "bos_token_id": 50256, "embd_pdrop": 0.1, "eos_token_id": 50256, "id2label": { "0": "LABEL_0", "1": "LABEL_1", "2": "LABEL_2", "3": "LABEL_3" }, "initializer_range": 0.02, "label2id": { "LABEL_0": 0, "LABEL_1": 1, "LABEL_2": 2, "LABEL_3": 3 }, "layer_norm_epsilon": 1e-05, "model_type": "gpt2", "n_ctx": 1024, "n_embd": 768, "n_head": 12, "n_inner": null, "n_layer": 12, "n_positions": 1024, "reorder_and_upcast_attn": false, "resid_pdrop": 0.1, "scale_attn_by_inverse_layer_idx": false, "scale_attn_weights": true, "summary_activation": null, "summary_first_dropout": 0.1, "summary_proj_to_labels": true, "summary_type": "cls_index", "summary_use_proj": true, "task_specific_params": { "text-generation": { "do_sample": true, "max_length": 50 } }, "transformers_version": "4.26.1", "use_cache": true, "vocab_size": 50257 } [INFO|tokenization_auto.py:458] 2023-02-16 15:22:38,945 >> Could not locate the tokenizer configuration file, will try to use the model config instead. 
[INFO|configuration_utils.py:660] 2023-02-16 15:22:39,423 >> loading configuration file config.json from cache at .cache_training/models--gpt2/snapshots/e7da7f221d5bf496a48136c0cd264e630fe9fcc8/config.json [INFO|configuration_utils.py:712] 2023-02-16 15:22:39,424 >> Model config GPT2Config { "_name_or_path": "gpt2", "activation_function": "gelu_new", "architectures": [ "GPT2LMHeadModel" ], "attn_pdrop": 0.1, "bos_token_id": 50256, "embd_pdrop": 0.1, "eos_token_id": 50256, "initializer_range": 0.02, "layer_norm_epsilon": 1e-05, "model_type": "gpt2", "n_ctx": 1024, "n_embd": 768, "n_head": 12, "n_inner": null, "n_layer": 12, "n_positions": 1024, "reorder_and_upcast_attn": false, "resid_pdrop": 0.1, "scale_attn_by_inverse_layer_idx": false, "scale_attn_weights": true, "summary_activation": null, "summary_first_dropout": 0.1, "summary_proj_to_labels": true, "summary_type": "cls_index", "summary_use_proj": true, "task_specific_params": { "text-generation": { "do_sample": true, "max_length": 50 } }, "transformers_version": "4.26.1", "use_cache": true, "vocab_size": 50257 } [INFO|tokenization_utils_base.py:1802] 2023-02-16 15:22:40,400 >> loading file vocab.json from cache at .cache_training/models--gpt2/snapshots/e7da7f221d5bf496a48136c0cd264e630fe9fcc8/vocab.json [INFO|tokenization_utils_base.py:1802] 2023-02-16 15:22:40,400 >> loading file merges.txt from cache at .cache_training/models--gpt2/snapshots/e7da7f221d5bf496a48136c0cd264e630fe9fcc8/merges.txt [INFO|tokenization_utils_base.py:1802] 2023-02-16 15:22:40,400 >> loading file tokenizer.json from cache at .cache_training/models--gpt2/snapshots/e7da7f221d5bf496a48136c0cd264e630fe9fcc8/tokenizer.json [INFO|tokenization_utils_base.py:1802] 2023-02-16 15:22:40,400 >> loading file added_tokens.json from cache at None [INFO|tokenization_utils_base.py:1802] 2023-02-16 15:22:40,400 >> loading file special_tokens_map.json from cache at None [INFO|tokenization_utils_base.py:1802] 2023-02-16 15:22:40,400 >> loading file tokenizer_config.json from cache at None [INFO|configuration_utils.py:660] 2023-02-16 15:22:40,400 >> loading configuration file config.json from cache at .cache_training/models--gpt2/snapshots/e7da7f221d5bf496a48136c0cd264e630fe9fcc8/config.json [INFO|configuration_utils.py:712] 2023-02-16 15:22:40,400 >> Model config GPT2Config { "_name_or_path": "gpt2", "activation_function": "gelu_new", "architectures": [ "GPT2LMHeadModel" ], "attn_pdrop": 0.1, "bos_token_id": 50256, "embd_pdrop": 0.1, "eos_token_id": 50256, "initializer_range": 0.02, "layer_norm_epsilon": 1e-05, "model_type": "gpt2", "n_ctx": 1024, "n_embd": 768, "n_head": 12, "n_inner": null, "n_layer": 12, "n_positions": 1024, "reorder_and_upcast_attn": false, "resid_pdrop": 0.1, "scale_attn_by_inverse_layer_idx": false, "scale_attn_weights": true, "summary_activation": null, "summary_first_dropout": 0.1, "summary_proj_to_labels": true, "summary_type": "cls_index", "summary_use_proj": true, "task_specific_params": { "text-generation": { "do_sample": true, "max_length": 50 } }, "transformers_version": "4.26.1", "use_cache": true, "vocab_size": 50257 } 02/16/2023 15:22:40 - INFO - __main__ - Using hidden states in model: True -------------------------------------------------------- Using hidden: True 02/16/2023 15:22:40 - INFO - __main__ - Using implementation from class: GPT2ForSequenceClassificationCustom [INFO|modeling_utils.py:2275] 2023-02-16 15:22:40,458 >> loading weights file pytorch_model.bin from cache at 
.cache_training/models--gpt2/snapshots/e7da7f221d5bf496a48136c0cd264e630fe9fcc8/pytorch_model.bin [INFO|modeling_utils.py:2857] 2023-02-16 15:22:42,848 >> All model checkpoint weights were used when initializing GPT2ForSequenceClassificationCustom. [WARNING|modeling_utils.py:2859] 2023-02-16 15:22:42,849 >> Some weights of GPT2ForSequenceClassificationCustom were not initialized from the model checkpoint at gpt2 and are newly initialized: ['h.11.attn.masked_bias', 'score.out_proj.weight', 'h.7.attn.masked_bias', 'h.6.attn.masked_bias', 'h.8.attn.masked_bias', 'h.5.attn.masked_bias', 'score.dense_2.weight', 'h.9.attn.masked_bias', 'score.dense_4.bias', 'score.dense_1_input.bias', 'score.dense_3.weight', 'score.dense_1_hidden.bias', 'score.dense_1_input.weight', 'h.1.attn.masked_bias', 'score.dense_3.bias', 'h.10.attn.masked_bias', 'h.2.attn.masked_bias', 'h.4.attn.masked_bias', 'score.dense_1_hidden.weight', 'score.dense_2.bias', 'score.dense_4.weight', 'h.0.attn.masked_bias', 'h.3.attn.masked_bias'] You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference. GPT2ForSequenceClassificationCustom( (transformer): GPT2Model( (wte): Embedding(50257, 768) (wpe): Embedding(1024, 768) (drop): Dropout(p=0.1, inplace=False) (h): ModuleList( (0): GPT2Block( (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (attn): GPT2Attention( (c_attn): Conv1D() (c_proj): Conv1D() (attn_dropout): Dropout(p=0.1, inplace=False) (resid_dropout): Dropout(p=0.1, inplace=False) ) (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (mlp): GPT2MLP( (c_fc): Conv1D() (c_proj): Conv1D() (act): NewGELUActivation() (dropout): Dropout(p=0.1, inplace=False) ) ) (1): GPT2Block( (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (attn): GPT2Attention( (c_attn): Conv1D() (c_proj): Conv1D() (attn_dropout): Dropout(p=0.1, inplace=False) (resid_dropout): Dropout(p=0.1, inplace=False) ) (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (mlp): GPT2MLP( (c_fc): Conv1D() (c_proj): Conv1D() (act): NewGELUActivation() (dropout): Dropout(p=0.1, inplace=False) ) ) (2): GPT2Block( (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (attn): GPT2Attention( (c_attn): Conv1D() (c_proj): Conv1D() (attn_dropout): Dropout(p=0.1, inplace=False) (resid_dropout): Dropout(p=0.1, inplace=False) ) (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (mlp): GPT2MLP( (c_fc): Conv1D() (c_proj): Conv1D() (act): NewGELUActivation() (dropout): Dropout(p=0.1, inplace=False) ) ) (3): GPT2Block( (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (attn): GPT2Attention( (c_attn): Conv1D() (c_proj): Conv1D() (attn_dropout): Dropout(p=0.1, inplace=False) (resid_dropout): Dropout(p=0.1, inplace=False) ) (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (mlp): GPT2MLP( (c_fc): Conv1D() (c_proj): Conv1D() (act): NewGELUActivation() (dropout): Dropout(p=0.1, inplace=False) ) ) (4): GPT2Block( (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (attn): GPT2Attention( (c_attn): Conv1D() (c_proj): Conv1D() (attn_dropout): Dropout(p=0.1, inplace=False) (resid_dropout): Dropout(p=0.1, inplace=False) ) (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (mlp): GPT2MLP( (c_fc): Conv1D() (c_proj): Conv1D() (act): NewGELUActivation() (dropout): Dropout(p=0.1, inplace=False) ) ) (5): GPT2Block( (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (attn): GPT2Attention( (c_attn): Conv1D() (c_proj): Conv1D() 
(attn_dropout): Dropout(p=0.1, inplace=False) (resid_dropout): Dropout(p=0.1, inplace=False) ) (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (mlp): GPT2MLP( (c_fc): Conv1D() (c_proj): Conv1D() (act): NewGELUActivation() (dropout): Dropout(p=0.1, inplace=False) ) ) (6): GPT2Block( (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (attn): GPT2Attention( (c_attn): Conv1D() (c_proj): Conv1D() (attn_dropout): Dropout(p=0.1, inplace=False) (resid_dropout): Dropout(p=0.1, inplace=False) ) (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (mlp): GPT2MLP( (c_fc): Conv1D() (c_proj): Conv1D() (act): NewGELUActivation() (dropout): Dropout(p=0.1, inplace=False) ) ) (7): GPT2Block( (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (attn): GPT2Attention( (c_attn): Conv1D() (c_proj): Conv1D() (attn_dropout): Dropout(p=0.1, inplace=False) (resid_dropout): Dropout(p=0.1, inplace=False) ) (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (mlp): GPT2MLP( (c_fc): Conv1D() (c_proj): Conv1D() (act): NewGELUActivation() (dropout): Dropout(p=0.1, inplace=False) ) ) (8): GPT2Block( (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (attn): GPT2Attention( (c_attn): Conv1D() (c_proj): Conv1D() (attn_dropout): Dropout(p=0.1, inplace=False) (resid_dropout): Dropout(p=0.1, inplace=False) ) (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (mlp): GPT2MLP( (c_fc): Conv1D() (c_proj): Conv1D() (act): NewGELUActivation() (dropout): Dropout(p=0.1, inplace=False) ) ) (9): GPT2Block( (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (attn): GPT2Attention( (c_attn): Conv1D() (c_proj): Conv1D() (attn_dropout): Dropout(p=0.1, inplace=False) (resid_dropout): Dropout(p=0.1, inplace=False) ) (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (mlp): GPT2MLP( (c_fc): Conv1D() (c_proj): Conv1D() (act): NewGELUActivation() (dropout): Dropout(p=0.1, inplace=False) ) ) (10): GPT2Block( (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (attn): GPT2Attention( (c_attn): Conv1D() (c_proj): Conv1D() (attn_dropout): Dropout(p=0.1, inplace=False) (resid_dropout): Dropout(p=0.1, inplace=False) ) (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (mlp): GPT2MLP( (c_fc): Conv1D() (c_proj): Conv1D() (act): NewGELUActivation() (dropout): Dropout(p=0.1, inplace=False) ) ) (11): GPT2Block( (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (attn): GPT2Attention( (c_attn): Conv1D() (c_proj): Conv1D() (attn_dropout): Dropout(p=0.1, inplace=False) (resid_dropout): Dropout(p=0.1, inplace=False) ) (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (mlp): GPT2MLP( (c_fc): Conv1D() (c_proj): Conv1D() (act): NewGELUActivation() (dropout): Dropout(p=0.1, inplace=False) ) ) ) (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True) ) (score): GPT2ClassificationHeadCustom( (dense_1_input): Linear(in_features=768, out_features=1536, bias=True) (dense_1_hidden): Linear(in_features=768, out_features=1536, bias=True) (dense_2): Linear(in_features=3072, out_features=3072, bias=True) (dense_3): Linear(in_features=3072, out_features=3072, bias=True) (dense_4): Linear(in_features=3072, out_features=768, bias=True) (dropout): Dropout(p=0.1, inplace=False) (out_proj): Linear(in_features=768, out_features=4, bias=False) ) ) [ERROR|tokenization_utils_base.py:1042] 2023-02-16 15:22:42,852 >> Using pad_token, but it is not set yet. 
02/16/2023 15:22:42 - INFO - __main__ - Set PAD token to EOS: <|endoftext|> Running tokenizer on dataset: 0%| | 0/120 [00:00<?, ?ba/s]02/16/2023 15:22:42 - INFO - datasets.arrow_dataset - Caching processed dataset at /home/jacob/code/university/uczenie_glebokie_w_przetwarzaniu_tekstu/.cache_training/json/default-e10a382a423bbb9a/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51/cache-d91f860557c08124.arrow Running tokenizer on dataset: 100%|███████████| 120/120 [00:06<00:00, 17.67ba/s] Running tokenizer on dataset: 0%| | 0/4 [00:00<?, ?ba/s]02/16/2023 15:22:49 - INFO - datasets.arrow_dataset - Caching processed dataset at /home/jacob/code/university/uczenie_glebokie_w_przetwarzaniu_tekstu/.cache_training/json/default-e10a382a423bbb9a/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51/cache-b30f34d164a78c00.arrow Running tokenizer on dataset: 100%|███████████████| 4/4 [00:00<00:00, 19.47ba/s] 02/16/2023 15:22:50 - INFO - __main__ - Set 500 samples for 0-class 02/16/2023 15:22:50 - INFO - __main__ - Set 500 samples for 1-class 02/16/2023 15:22:50 - INFO - __main__ - Set 500 samples for 2-class 02/16/2023 15:22:50 - INFO - __main__ - Set 500 samples for 3-class Traceback (most recent call last): File "/home/jacob/code/university/uczenie_glebokie_w_przetwarzaniu_tekstu/run_glue.py", line 685, in <module> main() File "/home/jacob/code/university/uczenie_glebokie_w_przetwarzaniu_tekstu/run_glue.py", line 533, in main raise ValueError("--do_predict requires a test dataset") ValueError: --do_predict requires a test dataset
Evaluation
!python run_glue.py \
--cache_dir .cache_training \
--model_name_or_path out/gpt2 \
--custom_model gpt2_hidden \
--train_file data/train.json \
--validation_file data/valid.json \
--test_file data/test.json \
--per_device_train_batch_size 8 \
--per_device_eval_batch_size 8 \
--do_eval \
--do_predict \
--max_seq_length 128 \
--learning_rate 2e-5 \
--max_eval_samples 2000 \
--max_steps 2500 \
--num_train_epochs 1 \
--save_strategy steps \
--save_steps 250 \
--save_total_limit 5 \
--logging_strategy steps \
--logging_steps 100 \
--eval_steps 250 \
--evaluation_strategy steps \
--metric_for_best_model accuracy \
--greater_is_better True \
--load_best_model_at_end True \
--output_dir out/gpt2_results
02/16/2023 16:51:20 - WARNING - __main__ - Process rank: -1, device: cuda:0, n_gpu: 1distributed training: False, 16-bits training: False 02/16/2023 16:51:20 - INFO - __main__ - Training/evaluation parameters TrainingArguments( _n_gpu=1, adafactor=False, adam_beta1=0.9, adam_beta2=0.999, adam_epsilon=1e-08, auto_find_batch_size=False, bf16=False, bf16_full_eval=False, data_seed=None, dataloader_drop_last=False, dataloader_num_workers=0, dataloader_pin_memory=True, ddp_bucket_cap_mb=None, ddp_find_unused_parameters=None, ddp_timeout=1800, debug=[], deepspeed=None, disable_tqdm=False, do_eval=True, do_predict=True, do_train=False, eval_accumulation_steps=None, eval_delay=0, eval_steps=250, evaluation_strategy=steps, fp16=False, fp16_backend=auto, fp16_full_eval=False, fp16_opt_level=O1, fsdp=[], fsdp_min_num_params=0, fsdp_transformer_layer_cls_to_wrap=None, full_determinism=False, gradient_accumulation_steps=1, gradient_checkpointing=False, greater_is_better=True, group_by_length=False, half_precision_backend=auto, hub_model_id=None, hub_private_repo=False, hub_strategy=every_save, hub_token=<HUB_TOKEN>, ignore_data_skip=False, include_inputs_for_metrics=False, jit_mode_eval=False, label_names=None, label_smoothing_factor=0.0, learning_rate=2e-05, length_column_name=length, load_best_model_at_end=True, local_rank=-1, log_level=passive, log_level_replica=passive, log_on_each_node=True, logging_dir=out/gpt2_results/runs/Feb16_16-51-19_DESKTOP-R7JO8BQ, logging_first_step=False, logging_nan_inf_filter=True, logging_steps=100, logging_strategy=steps, lr_scheduler_type=linear, max_grad_norm=1.0, max_steps=2500, metric_for_best_model=accuracy, mp_parameters=, no_cuda=False, num_train_epochs=1.0, optim=adamw_hf, optim_args=None, output_dir=out/gpt2_results, overwrite_output_dir=False, past_index=-1, per_device_eval_batch_size=8, per_device_train_batch_size=8, prediction_loss_only=False, push_to_hub=False, push_to_hub_model_id=None, push_to_hub_organization=None, push_to_hub_token=<PUSH_TO_HUB_TOKEN>, ray_scope=last, remove_unused_columns=True, report_to=[], resume_from_checkpoint=None, run_name=out/gpt2_results, save_on_each_node=False, save_steps=250, save_strategy=steps, save_total_limit=5, seed=42, sharded_ddp=[], skip_memory_metrics=True, tf32=None, torch_compile=False, torch_compile_backend=None, torch_compile_mode=None, torchdynamo=None, tpu_metrics_debug=False, tpu_num_cores=None, use_ipex=False, use_legacy_prediction_loop=False, use_mps_device=False, warmup_ratio=0.0, warmup_steps=0, weight_decay=0.0, xpu_backend=None, ) 02/16/2023 16:51:20 - INFO - __main__ - load a local file for train: data/train.json 02/16/2023 16:51:20 - INFO - __main__ - load a local file for validation: data/valid.json 02/16/2023 16:51:20 - INFO - __main__ - load a local file for test: data/test.json 02/16/2023 16:51:20 - WARNING - datasets.builder - Using custom data configuration default-f6e8039906850c57 02/16/2023 16:51:20 - INFO - datasets.info - Loading Dataset Infos from /home/jacob/anaconda3/envs/ugp/lib/python3.10/site-packages/datasets/packaged_modules/json 02/16/2023 16:51:20 - INFO - datasets.builder - Overwrite dataset info from restored data version. 
02/16/2023 16:51:20 - INFO - datasets.info - Loading Dataset info from .cache_training/json/default-f6e8039906850c57/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51 02/16/2023 16:51:20 - WARNING - datasets.builder - Found cached dataset json (/home/jacob/code/university/uczenie_glebokie_w_przetwarzaniu_tekstu/.cache_training/json/default-f6e8039906850c57/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51) 02/16/2023 16:51:20 - INFO - datasets.info - Loading Dataset info from /home/jacob/code/university/uczenie_glebokie_w_przetwarzaniu_tekstu/.cache_training/json/default-f6e8039906850c57/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51 100%|████████████████████████████████████████████| 3/3 [00:00<00:00, 591.33it/s] [INFO|configuration_utils.py:658] 2023-02-16 16:51:20,920 >> loading configuration file out/gpt2/config.json [INFO|configuration_utils.py:712] 2023-02-16 16:51:20,921 >> Model config GPT2Config { "_name_or_path": "out/gpt2", "activation_function": "gelu_new", "architectures": [ "GPT2ForSequenceClassificationCustom" ], "attn_pdrop": 0.1, "bos_token_id": 50256, "embd_pdrop": 0.1, "eos_token_id": 50256, "id2label": { "0": 0, "1": 1, "2": 2, "3": 3 }, "initializer_range": 0.02, "label2id": { "0": 0, "1": 1, "2": 2, "3": 3 }, "layer_norm_epsilon": 1e-05, "model_type": "gpt2", "n_ctx": 1024, "n_embd": 768, "n_head": 12, "n_inner": null, "n_layer": 12, "n_positions": 1024, "pad_token_id": 50256, "reorder_and_upcast_attn": false, "resid_pdrop": 0.1, "scale_attn_by_inverse_layer_idx": false, "scale_attn_weights": true, "summary_activation": null, "summary_first_dropout": 0.1, "summary_proj_to_labels": true, "summary_type": "cls_index", "summary_use_proj": true, "task_specific_params": { "text-generation": { "do_sample": true, "max_length": 50 } }, "torch_dtype": "float32", "transformers_version": "4.26.1", "use_cache": true, "use_hidden_states": true, "vocab_size": 50257 } [INFO|tokenization_utils_base.py:1800] 2023-02-16 16:51:20,929 >> loading file vocab.json [INFO|tokenization_utils_base.py:1800] 2023-02-16 16:51:20,929 >> loading file merges.txt [INFO|tokenization_utils_base.py:1800] 2023-02-16 16:51:20,929 >> loading file tokenizer.json [INFO|tokenization_utils_base.py:1800] 2023-02-16 16:51:20,929 >> loading file added_tokens.json [INFO|tokenization_utils_base.py:1800] 2023-02-16 16:51:20,929 >> loading file special_tokens_map.json [INFO|tokenization_utils_base.py:1800] 2023-02-16 16:51:20,929 >> loading file tokenizer_config.json 02/16/2023 16:51:20 - INFO - __main__ - Using hidden states in model: True -------------------------------------------------------- Using hidden: True 02/16/2023 16:51:20 - INFO - __main__ - Using implementation from class: GPT2ForSequenceClassificationCustom [INFO|modeling_utils.py:2272] 2023-02-16 16:51:20,982 >> loading weights file out/gpt2/pytorch_model.bin [INFO|modeling_utils.py:2857] 2023-02-16 16:51:23,451 >> All model checkpoint weights were used when initializing GPT2ForSequenceClassificationCustom. [INFO|modeling_utils.py:2865] 2023-02-16 16:51:23,451 >> All the weights of GPT2ForSequenceClassificationCustom were initialized from the model checkpoint at out/gpt2. If your task is similar to the task the model of the checkpoint was trained on, you can already use GPT2ForSequenceClassificationCustom for predictions without further training. 
GPT2ForSequenceClassificationCustom( (transformer): GPT2Model( (wte): Embedding(50257, 768) (wpe): Embedding(1024, 768) (drop): Dropout(p=0.1, inplace=False) (h): ModuleList( (0): GPT2Block( (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (attn): GPT2Attention( (c_attn): Conv1D() (c_proj): Conv1D() (attn_dropout): Dropout(p=0.1, inplace=False) (resid_dropout): Dropout(p=0.1, inplace=False) ) (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (mlp): GPT2MLP( (c_fc): Conv1D() (c_proj): Conv1D() (act): NewGELUActivation() (dropout): Dropout(p=0.1, inplace=False) ) ) (1): GPT2Block( (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (attn): GPT2Attention( (c_attn): Conv1D() (c_proj): Conv1D() (attn_dropout): Dropout(p=0.1, inplace=False) (resid_dropout): Dropout(p=0.1, inplace=False) ) (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (mlp): GPT2MLP( (c_fc): Conv1D() (c_proj): Conv1D() (act): NewGELUActivation() (dropout): Dropout(p=0.1, inplace=False) ) ) (2): GPT2Block( (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (attn): GPT2Attention( (c_attn): Conv1D() (c_proj): Conv1D() (attn_dropout): Dropout(p=0.1, inplace=False) (resid_dropout): Dropout(p=0.1, inplace=False) ) (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (mlp): GPT2MLP( (c_fc): Conv1D() (c_proj): Conv1D() (act): NewGELUActivation() (dropout): Dropout(p=0.1, inplace=False) ) ) (3): GPT2Block( (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (attn): GPT2Attention( (c_attn): Conv1D() (c_proj): Conv1D() (attn_dropout): Dropout(p=0.1, inplace=False) (resid_dropout): Dropout(p=0.1, inplace=False) ) (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (mlp): GPT2MLP( (c_fc): Conv1D() (c_proj): Conv1D() (act): NewGELUActivation() (dropout): Dropout(p=0.1, inplace=False) ) ) (4): GPT2Block( (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (attn): GPT2Attention( (c_attn): Conv1D() (c_proj): Conv1D() (attn_dropout): Dropout(p=0.1, inplace=False) (resid_dropout): Dropout(p=0.1, inplace=False) ) (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (mlp): GPT2MLP( (c_fc): Conv1D() (c_proj): Conv1D() (act): NewGELUActivation() (dropout): Dropout(p=0.1, inplace=False) ) ) (5): GPT2Block( (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (attn): GPT2Attention( (c_attn): Conv1D() (c_proj): Conv1D() (attn_dropout): Dropout(p=0.1, inplace=False) (resid_dropout): Dropout(p=0.1, inplace=False) ) (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (mlp): GPT2MLP( (c_fc): Conv1D() (c_proj): Conv1D() (act): NewGELUActivation() (dropout): Dropout(p=0.1, inplace=False) ) ) (6): GPT2Block( (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (attn): GPT2Attention( (c_attn): Conv1D() (c_proj): Conv1D() (attn_dropout): Dropout(p=0.1, inplace=False) (resid_dropout): Dropout(p=0.1, inplace=False) ) (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (mlp): GPT2MLP( (c_fc): Conv1D() (c_proj): Conv1D() (act): NewGELUActivation() (dropout): Dropout(p=0.1, inplace=False) ) ) (7): GPT2Block( (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (attn): GPT2Attention( (c_attn): Conv1D() (c_proj): Conv1D() (attn_dropout): Dropout(p=0.1, inplace=False) (resid_dropout): Dropout(p=0.1, inplace=False) ) (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (mlp): GPT2MLP( (c_fc): Conv1D() (c_proj): Conv1D() (act): NewGELUActivation() (dropout): Dropout(p=0.1, inplace=False) ) ) (8): GPT2Block( 
(ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (attn): GPT2Attention( (c_attn): Conv1D() (c_proj): Conv1D() (attn_dropout): Dropout(p=0.1, inplace=False) (resid_dropout): Dropout(p=0.1, inplace=False) ) (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (mlp): GPT2MLP( (c_fc): Conv1D() (c_proj): Conv1D() (act): NewGELUActivation() (dropout): Dropout(p=0.1, inplace=False) ) ) (9): GPT2Block( (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (attn): GPT2Attention( (c_attn): Conv1D() (c_proj): Conv1D() (attn_dropout): Dropout(p=0.1, inplace=False) (resid_dropout): Dropout(p=0.1, inplace=False) ) (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (mlp): GPT2MLP( (c_fc): Conv1D() (c_proj): Conv1D() (act): NewGELUActivation() (dropout): Dropout(p=0.1, inplace=False) ) ) (10): GPT2Block( (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (attn): GPT2Attention( (c_attn): Conv1D() (c_proj): Conv1D() (attn_dropout): Dropout(p=0.1, inplace=False) (resid_dropout): Dropout(p=0.1, inplace=False) ) (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (mlp): GPT2MLP( (c_fc): Conv1D() (c_proj): Conv1D() (act): NewGELUActivation() (dropout): Dropout(p=0.1, inplace=False) ) ) (11): GPT2Block( (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (attn): GPT2Attention( (c_attn): Conv1D() (c_proj): Conv1D() (attn_dropout): Dropout(p=0.1, inplace=False) (resid_dropout): Dropout(p=0.1, inplace=False) ) (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (mlp): GPT2MLP( (c_fc): Conv1D() (c_proj): Conv1D() (act): NewGELUActivation() (dropout): Dropout(p=0.1, inplace=False) ) ) ) (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True) ) (score): GPT2ClassificationHeadCustom( (dense_1_input): Linear(in_features=768, out_features=1536, bias=True) (dense_1_hidden): Linear(in_features=768, out_features=1536, bias=True) (dense_2): Linear(in_features=3072, out_features=3072, bias=True) (dense_3): Linear(in_features=3072, out_features=3072, bias=True) (dense_4): Linear(in_features=3072, out_features=768, bias=True) (dropout): Dropout(p=0.1, inplace=False) (out_proj): Linear(in_features=768, out_features=4, bias=False) ) ) Running tokenizer on dataset: 0%| | 0/120 [00:00<?, ?ba/s]02/16/2023 16:51:23 - INFO - datasets.arrow_dataset - Caching processed dataset at /home/jacob/code/university/uczenie_glebokie_w_przetwarzaniu_tekstu/.cache_training/json/default-f6e8039906850c57/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51/cache-7179a56e6d5f6003.arrow Running tokenizer on dataset: 100%|███████████| 120/120 [00:07<00:00, 15.47ba/s] Running tokenizer on dataset: 0%| | 0/4 [00:00<?, ?ba/s]02/16/2023 16:51:31 - INFO - datasets.arrow_dataset - Caching processed dataset at /home/jacob/code/university/uczenie_glebokie_w_przetwarzaniu_tekstu/.cache_training/json/default-f6e8039906850c57/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51/cache-dd7e86ec7f74125a.arrow Running tokenizer on dataset: 100%|███████████████| 4/4 [00:00<00:00, 15.75ba/s] Running tokenizer on dataset: 0%| | 0/4 [00:00<?, ?ba/s]02/16/2023 16:51:31 - INFO - datasets.arrow_dataset - Caching processed dataset at /home/jacob/code/university/uczenie_glebokie_w_przetwarzaniu_tekstu/.cache_training/json/default-f6e8039906850c57/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51/cache-a11e14ac330179d1.arrow Running tokenizer on dataset: 100%|███████████████| 4/4 [00:00<00:00, 15.37ba/s] 02/16/2023 16:51:32 - INFO - 
__main__ - Set 500 samples for 0-class 02/16/2023 16:51:32 - INFO - __main__ - Set 500 samples for 1-class 02/16/2023 16:51:32 - INFO - __main__ - Set 500 samples for 2-class 02/16/2023 16:51:32 - INFO - __main__ - Set 500 samples for 3-class [INFO|trainer.py:511] 2023-02-16 16:51:35,119 >> max_steps is given, it will override any value given in num_train_epochs 02/16/2023 16:51:35 - INFO - __main__ - *** Evaluate *** [INFO|trainer.py:710] 2023-02-16 16:51:35,120 >> The following columns in the evaluation set don't have a corresponding argument in `GPT2ForSequenceClassificationCustom.forward` and have been ignored: text. If text are not expected by `GPT2ForSequenceClassificationCustom.forward`, you can safely ignore this message. [INFO|trainer.py:2964] 2023-02-16 16:51:35,123 >> ***** Running Evaluation ***** [INFO|trainer.py:2966] 2023-02-16 16:51:35,123 >> Num examples = 2000 [INFO|trainer.py:2969] 2023-02-16 16:51:35,123 >> Batch size = 8 100%|█████████████████████████████████████████| 250/250 [00:23<00:00, 10.65it/s] ***** eval metrics ***** eval_accuracy = 0.9195 eval_loss = 0.302 eval_runtime = 0:00:24.11 eval_samples = 2000 eval_samples_per_second = 82.94 eval_steps_per_second = 10.367 02/16/2023 16:51:59 - INFO - __main__ - *** Predict *** [INFO|trainer.py:710] 2023-02-16 16:51:59,239 >> The following columns in the test set don't have a corresponding argument in `GPT2ForSequenceClassificationCustom.forward` and have been ignored: text. If text are not expected by `GPT2ForSequenceClassificationCustom.forward`, you can safely ignore this message. [INFO|trainer.py:2964] 2023-02-16 16:51:59,240 >> ***** Running Prediction ***** [INFO|trainer.py:2966] 2023-02-16 16:51:59,240 >> Num examples = 3800 [INFO|trainer.py:2969] 2023-02-16 16:51:59,240 >> Batch size = 8 100%|█████████████████████████████████████████| 475/475 [00:43<00:00, 10.84it/s] 02/16/2023 16:52:43 - INFO - __main__ - ***** Predict results None ***** [INFO|modelcard.py:449] 2023-02-16 16:52:43,692 >> Dropping the following result as it does not have all the necessary fields: {'task': {'name': 'Text Classification', 'type': 'text-classification'}}
Results
!cat out/gpt2_results/eval_results.json | jq .eval_accuracy
0.9194999933242798
T5
Modifications
- Custom classification head with 3 dense layers
- Encoder self-attention sub-layers frozen
- Decoder self-attention and cross-attention sub-layers frozen
Code
import torch
import copy
from torch import nn
from transformers import T5PreTrainedModel, T5Config
from transformers.models.t5.modeling_t5 import T5Stack
from transformers.modeling_outputs import SequenceClassifierOutput
class T5ClassificationHead(nn.Module):
def __init__(self, config: T5Config):
super().__init__()
self.dense_in = nn.Linear(config.d_model, 768)
self.dense = nn.Linear(768, 768)
self.dense_out = nn.Linear(768, config.num_labels)
self.dropout = nn.Dropout(0.1)
def forward(self, features, **kwargs):
x = features[:, 0, :]
x = self.dropout(x)
x = self.dense_in(x)
x = torch.relu(x)
x = self.dropout(x)
x = self.dense(x)
x = torch.relu(x)
x = self.dropout(x)
x = self.dense_out(x)
return x
class T5ForClassification(T5PreTrainedModel):
def __init__(self, config: T5Config):
super().__init__(config)
self.model_dim = config.d_model
self.shared = nn.Embedding(config.vocab_size, config.d_model)
encoder_config = copy.deepcopy(config)
encoder_config.is_decoder = False
encoder_config.use_cache = False
encoder_config.is_encoder_decoder = False
self.encoder = T5Stack(encoder_config, self.shared)
decoder_config = copy.deepcopy(config)
decoder_config.is_decoder = True
decoder_config.is_encoder_decoder = False
decoder_config.num_layers = config.num_decoder_layers
self.decoder = T5Stack(decoder_config, self.shared)
modules_to_freeze = [self.encoder.block[i].layer[0] for i in range(len(self.encoder.block))]
modules_to_freeze.extend([self.decoder.block[i].layer[0] for i in range(len(self.decoder.block))])
modules_to_freeze.extend([self.decoder.block[i].layer[1] for i in range(len(self.decoder.block))])
for module in modules_to_freeze:
for param in module.parameters():
param.requires_grad = False
self.lm_head = T5ClassificationHead(config)
# Initialize weights and apply final processing
self.post_init()
# Model parallel
self.model_parallel = False
self.device_map = None
def forward(
self,
input_ids=None,
attention_mask=None,
head_mask=None,
cross_attn_head_mask=None,
past_key_values=None,
inputs_embeds=None,
decoder_inputs_embeds=None,
use_cache=None,
output_attentions=None,
output_hidden_states=None,
return_dict=None,
labels=None
):
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
outputs = self.encoder(
input_ids,
attention_mask=attention_mask,
head_mask=head_mask,
cross_attn_head_mask=cross_attn_head_mask,
past_key_values=past_key_values,
inputs_embeds=inputs_embeds,
use_cache=use_cache,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
# Note: the decoder is called directly on input_ids and overwrites `outputs`;
# since no encoder_hidden_states are passed, the decoder's cross-attention
# sub-layers are skipped and the encoder output computed above is unused.
outputs = self.decoder(
input_ids,
attention_mask=attention_mask,
head_mask=head_mask,
cross_attn_head_mask=cross_attn_head_mask,
past_key_values=past_key_values,
inputs_embeds=inputs_embeds,
use_cache=use_cache,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
logits = self.lm_head(outputs[0])
loss = None
if labels is not None:
loss_fct = nn.CrossEntropyLoss()
loss = loss_fct(logits.view(-1, self.config.num_labels), labels.view(-1))
return SequenceClassifierOutput(
loss=loss,
logits=logits,
)
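As an optional sanity check (not in the original notebook) that the freezing in __init__ matches the Modifications list, the frozen and trainable parameters can be counted after loading; this assumes the default loading path, which keeps the requires_grad flags set in __init__, and a 4-label config:
model = T5ForClassification.from_pretrained("t5-base", num_labels=4)

# Count parameters left trainable vs. frozen by the loop in __init__.
frozen = sum(p.numel() for p in model.parameters() if not p.requires_grad)
trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f"frozen: {frozen:,}  trainable: {trainable:,}")

# The frozen modules should be the encoder self-attention and the decoder
# self-/cross-attention sub-layers, and nothing else.
frozen_modules = sorted({name.rsplit(".", 1)[0] for name, p in model.named_parameters() if not p.requires_grad})
print(frozen_modules[:5])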
Model
T5ForClassification.from_pretrained("t5-base")
Downloading (…)lve/main/config.json: 0%| | 0.00/1.21k [00:00<?, ?B/s]
Downloading (…)"pytorch_model.bin";: 0%| | 0.00/892M [00:00<?, ?B/s]
Some weights of the model checkpoint at t5-base were not used when initializing T5ForClassification: ['decoder.block.0.layer.1.EncDecAttention.relative_attention_bias.weight'] - This IS expected if you are initializing T5ForClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model). - This IS NOT expected if you are initializing T5ForClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model). Some weights of T5ForClassification were not initialized from the model checkpoint at t5-base and are newly initialized: ['lm_head.dense_out.bias', 'lm_head.dense.bias', 'encoder.embed_tokens.weight', 'decoder.embed_tokens.weight', 'lm_head.dense_out.weight', 'lm_head.dense.weight', 'lm_head.dense_in.bias', 'lm_head.dense_in.weight'] You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
T5ForClassification( (shared): Embedding(32128, 768) (encoder): T5Stack( (embed_tokens): Embedding(32128, 768) (block): ModuleList( (0): T5Block( (layer): ModuleList( (0): T5LayerSelfAttention( (SelfAttention): T5Attention( (q): Linear(in_features=768, out_features=768, bias=False) (k): Linear(in_features=768, out_features=768, bias=False) (v): Linear(in_features=768, out_features=768, bias=False) (o): Linear(in_features=768, out_features=768, bias=False) (relative_attention_bias): Embedding(32, 12) ) (layer_norm): T5LayerNorm() (dropout): Dropout(p=0.1, inplace=False) ) (1): T5LayerFF( (DenseReluDense): T5DenseActDense( (wi): Linear(in_features=768, out_features=3072, bias=False) (wo): Linear(in_features=3072, out_features=768, bias=False) (dropout): Dropout(p=0.1, inplace=False) (act): ReLU() ) (layer_norm): T5LayerNorm() (dropout): Dropout(p=0.1, inplace=False) ) ) ) (1): T5Block( (layer): ModuleList( (0): T5LayerSelfAttention( (SelfAttention): T5Attention( (q): Linear(in_features=768, out_features=768, bias=False) (k): Linear(in_features=768, out_features=768, bias=False) (v): Linear(in_features=768, out_features=768, bias=False) (o): Linear(in_features=768, out_features=768, bias=False) ) (layer_norm): T5LayerNorm() (dropout): Dropout(p=0.1, inplace=False) ) (1): T5LayerFF( (DenseReluDense): T5DenseActDense( (wi): Linear(in_features=768, out_features=3072, bias=False) (wo): Linear(in_features=3072, out_features=768, bias=False) (dropout): Dropout(p=0.1, inplace=False) (act): ReLU() ) (layer_norm): T5LayerNorm() (dropout): Dropout(p=0.1, inplace=False) ) ) ) (2): T5Block( (layer): ModuleList( (0): T5LayerSelfAttention( (SelfAttention): T5Attention( (q): Linear(in_features=768, out_features=768, bias=False) (k): Linear(in_features=768, out_features=768, bias=False) (v): Linear(in_features=768, out_features=768, bias=False) (o): Linear(in_features=768, out_features=768, bias=False) ) (layer_norm): T5LayerNorm() (dropout): Dropout(p=0.1, inplace=False) ) (1): T5LayerFF( (DenseReluDense): T5DenseActDense( (wi): Linear(in_features=768, out_features=3072, bias=False) (wo): Linear(in_features=3072, out_features=768, bias=False) (dropout): Dropout(p=0.1, inplace=False) (act): ReLU() ) (layer_norm): T5LayerNorm() (dropout): Dropout(p=0.1, inplace=False) ) ) ) (3): T5Block( (layer): ModuleList( (0): T5LayerSelfAttention( (SelfAttention): T5Attention( (q): Linear(in_features=768, out_features=768, bias=False) (k): Linear(in_features=768, out_features=768, bias=False) (v): Linear(in_features=768, out_features=768, bias=False) (o): Linear(in_features=768, out_features=768, bias=False) ) (layer_norm): T5LayerNorm() (dropout): Dropout(p=0.1, inplace=False) ) (1): T5LayerFF( (DenseReluDense): T5DenseActDense( (wi): Linear(in_features=768, out_features=3072, bias=False) (wo): Linear(in_features=3072, out_features=768, bias=False) (dropout): Dropout(p=0.1, inplace=False) (act): ReLU() ) (layer_norm): T5LayerNorm() (dropout): Dropout(p=0.1, inplace=False) ) ) ) (4): T5Block( (layer): ModuleList( (0): T5LayerSelfAttention( (SelfAttention): T5Attention( (q): Linear(in_features=768, out_features=768, bias=False) (k): Linear(in_features=768, out_features=768, bias=False) (v): Linear(in_features=768, out_features=768, bias=False) (o): Linear(in_features=768, out_features=768, bias=False) ) (layer_norm): T5LayerNorm() (dropout): Dropout(p=0.1, inplace=False) ) (1): T5LayerFF( (DenseReluDense): T5DenseActDense( (wi): Linear(in_features=768, out_features=3072, bias=False) (wo): Linear(in_features=3072, 
out_features=768, bias=False) (dropout): Dropout(p=0.1, inplace=False) (act): ReLU() ) (layer_norm): T5LayerNorm() (dropout): Dropout(p=0.1, inplace=False) ) ) ) (5): T5Block( (layer): ModuleList( (0): T5LayerSelfAttention( (SelfAttention): T5Attention( (q): Linear(in_features=768, out_features=768, bias=False) (k): Linear(in_features=768, out_features=768, bias=False) (v): Linear(in_features=768, out_features=768, bias=False) (o): Linear(in_features=768, out_features=768, bias=False) ) (layer_norm): T5LayerNorm() (dropout): Dropout(p=0.1, inplace=False) ) (1): T5LayerFF( (DenseReluDense): T5DenseActDense( (wi): Linear(in_features=768, out_features=3072, bias=False) (wo): Linear(in_features=3072, out_features=768, bias=False) (dropout): Dropout(p=0.1, inplace=False) (act): ReLU() ) (layer_norm): T5LayerNorm() (dropout): Dropout(p=0.1, inplace=False) ) ) ) (6): T5Block( (layer): ModuleList( (0): T5LayerSelfAttention( (SelfAttention): T5Attention( (q): Linear(in_features=768, out_features=768, bias=False) (k): Linear(in_features=768, out_features=768, bias=False) (v): Linear(in_features=768, out_features=768, bias=False) (o): Linear(in_features=768, out_features=768, bias=False) ) (layer_norm): T5LayerNorm() (dropout): Dropout(p=0.1, inplace=False) ) (1): T5LayerFF( (DenseReluDense): T5DenseActDense( (wi): Linear(in_features=768, out_features=3072, bias=False) (wo): Linear(in_features=3072, out_features=768, bias=False) (dropout): Dropout(p=0.1, inplace=False) (act): ReLU() ) (layer_norm): T5LayerNorm() (dropout): Dropout(p=0.1, inplace=False) ) ) ) (7): T5Block( (layer): ModuleList( (0): T5LayerSelfAttention( (SelfAttention): T5Attention( (q): Linear(in_features=768, out_features=768, bias=False) (k): Linear(in_features=768, out_features=768, bias=False) (v): Linear(in_features=768, out_features=768, bias=False) (o): Linear(in_features=768, out_features=768, bias=False) ) (layer_norm): T5LayerNorm() (dropout): Dropout(p=0.1, inplace=False) ) (1): T5LayerFF( (DenseReluDense): T5DenseActDense( (wi): Linear(in_features=768, out_features=3072, bias=False) (wo): Linear(in_features=3072, out_features=768, bias=False) (dropout): Dropout(p=0.1, inplace=False) (act): ReLU() ) (layer_norm): T5LayerNorm() (dropout): Dropout(p=0.1, inplace=False) ) ) ) (8): T5Block( (layer): ModuleList( (0): T5LayerSelfAttention( (SelfAttention): T5Attention( (q): Linear(in_features=768, out_features=768, bias=False) (k): Linear(in_features=768, out_features=768, bias=False) (v): Linear(in_features=768, out_features=768, bias=False) (o): Linear(in_features=768, out_features=768, bias=False) ) (layer_norm): T5LayerNorm() (dropout): Dropout(p=0.1, inplace=False) ) (1): T5LayerFF( (DenseReluDense): T5DenseActDense( (wi): Linear(in_features=768, out_features=3072, bias=False) (wo): Linear(in_features=3072, out_features=768, bias=False) (dropout): Dropout(p=0.1, inplace=False) (act): ReLU() ) (layer_norm): T5LayerNorm() (dropout): Dropout(p=0.1, inplace=False) ) ) ) (9): T5Block( (layer): ModuleList( (0): T5LayerSelfAttention( (SelfAttention): T5Attention( (q): Linear(in_features=768, out_features=768, bias=False) (k): Linear(in_features=768, out_features=768, bias=False) (v): Linear(in_features=768, out_features=768, bias=False) (o): Linear(in_features=768, out_features=768, bias=False) ) (layer_norm): T5LayerNorm() (dropout): Dropout(p=0.1, inplace=False) ) (1): T5LayerFF( (DenseReluDense): T5DenseActDense( (wi): Linear(in_features=768, out_features=3072, bias=False) (wo): Linear(in_features=3072, out_features=768, 
bias=False) (dropout): Dropout(p=0.1, inplace=False) (act): ReLU() ) (layer_norm): T5LayerNorm() (dropout): Dropout(p=0.1, inplace=False) ) ) ) (10): T5Block( (layer): ModuleList( (0): T5LayerSelfAttention( (SelfAttention): T5Attention( (q): Linear(in_features=768, out_features=768, bias=False) (k): Linear(in_features=768, out_features=768, bias=False) (v): Linear(in_features=768, out_features=768, bias=False) (o): Linear(in_features=768, out_features=768, bias=False) ) (layer_norm): T5LayerNorm() (dropout): Dropout(p=0.1, inplace=False) ) (1): T5LayerFF( (DenseReluDense): T5DenseActDense( (wi): Linear(in_features=768, out_features=3072, bias=False) (wo): Linear(in_features=3072, out_features=768, bias=False) (dropout): Dropout(p=0.1, inplace=False) (act): ReLU() ) (layer_norm): T5LayerNorm() (dropout): Dropout(p=0.1, inplace=False) ) ) ) (11): T5Block( (layer): ModuleList( (0): T5LayerSelfAttention( (SelfAttention): T5Attention( (q): Linear(in_features=768, out_features=768, bias=False) (k): Linear(in_features=768, out_features=768, bias=False) (v): Linear(in_features=768, out_features=768, bias=False) (o): Linear(in_features=768, out_features=768, bias=False) ) (layer_norm): T5LayerNorm() (dropout): Dropout(p=0.1, inplace=False) ) (1): T5LayerFF( (DenseReluDense): T5DenseActDense( (wi): Linear(in_features=768, out_features=3072, bias=False) (wo): Linear(in_features=3072, out_features=768, bias=False) (dropout): Dropout(p=0.1, inplace=False) (act): ReLU() ) (layer_norm): T5LayerNorm() (dropout): Dropout(p=0.1, inplace=False) ) ) ) ) (final_layer_norm): T5LayerNorm() (dropout): Dropout(p=0.1, inplace=False) ) (decoder): T5Stack( (embed_tokens): Embedding(32128, 768) (block): ModuleList( (0): T5Block( (layer): ModuleList( (0): T5LayerSelfAttention( (SelfAttention): T5Attention( (q): Linear(in_features=768, out_features=768, bias=False) (k): Linear(in_features=768, out_features=768, bias=False) (v): Linear(in_features=768, out_features=768, bias=False) (o): Linear(in_features=768, out_features=768, bias=False) (relative_attention_bias): Embedding(32, 12) ) (layer_norm): T5LayerNorm() (dropout): Dropout(p=0.1, inplace=False) ) (1): T5LayerCrossAttention( (EncDecAttention): T5Attention( (q): Linear(in_features=768, out_features=768, bias=False) (k): Linear(in_features=768, out_features=768, bias=False) (v): Linear(in_features=768, out_features=768, bias=False) (o): Linear(in_features=768, out_features=768, bias=False) ) (layer_norm): T5LayerNorm() (dropout): Dropout(p=0.1, inplace=False) ) (2): T5LayerFF( (DenseReluDense): T5DenseActDense( (wi): Linear(in_features=768, out_features=3072, bias=False) (wo): Linear(in_features=3072, out_features=768, bias=False) (dropout): Dropout(p=0.1, inplace=False) (act): ReLU() ) (layer_norm): T5LayerNorm() (dropout): Dropout(p=0.1, inplace=False) ) ) ) (1): T5Block( (layer): ModuleList( (0): T5LayerSelfAttention( (SelfAttention): T5Attention( (q): Linear(in_features=768, out_features=768, bias=False) (k): Linear(in_features=768, out_features=768, bias=False) (v): Linear(in_features=768, out_features=768, bias=False) (o): Linear(in_features=768, out_features=768, bias=False) ) (layer_norm): T5LayerNorm() (dropout): Dropout(p=0.1, inplace=False) ) (1): T5LayerCrossAttention( (EncDecAttention): T5Attention( (q): Linear(in_features=768, out_features=768, bias=False) (k): Linear(in_features=768, out_features=768, bias=False) (v): Linear(in_features=768, out_features=768, bias=False) (o): Linear(in_features=768, out_features=768, bias=False) ) (layer_norm): 
T5LayerNorm() (dropout): Dropout(p=0.1, inplace=False) ) (2): T5LayerFF( (DenseReluDense): T5DenseActDense( (wi): Linear(in_features=768, out_features=3072, bias=False) (wo): Linear(in_features=3072, out_features=768, bias=False) (dropout): Dropout(p=0.1, inplace=False) (act): ReLU() ) (layer_norm): T5LayerNorm() (dropout): Dropout(p=0.1, inplace=False) ) ) ) (2): T5Block( (layer): ModuleList( (0): T5LayerSelfAttention( (SelfAttention): T5Attention( (q): Linear(in_features=768, out_features=768, bias=False) (k): Linear(in_features=768, out_features=768, bias=False) (v): Linear(in_features=768, out_features=768, bias=False) (o): Linear(in_features=768, out_features=768, bias=False) ) (layer_norm): T5LayerNorm() (dropout): Dropout(p=0.1, inplace=False) ) (1): T5LayerCrossAttention( (EncDecAttention): T5Attention( (q): Linear(in_features=768, out_features=768, bias=False) (k): Linear(in_features=768, out_features=768, bias=False) (v): Linear(in_features=768, out_features=768, bias=False) (o): Linear(in_features=768, out_features=768, bias=False) ) (layer_norm): T5LayerNorm() (dropout): Dropout(p=0.1, inplace=False) ) (2): T5LayerFF( (DenseReluDense): T5DenseActDense( (wi): Linear(in_features=768, out_features=3072, bias=False) (wo): Linear(in_features=3072, out_features=768, bias=False) (dropout): Dropout(p=0.1, inplace=False) (act): ReLU() ) (layer_norm): T5LayerNorm() (dropout): Dropout(p=0.1, inplace=False) ) ) ) (3): T5Block( (layer): ModuleList( (0): T5LayerSelfAttention( (SelfAttention): T5Attention( (q): Linear(in_features=768, out_features=768, bias=False) (k): Linear(in_features=768, out_features=768, bias=False) (v): Linear(in_features=768, out_features=768, bias=False) (o): Linear(in_features=768, out_features=768, bias=False) ) (layer_norm): T5LayerNorm() (dropout): Dropout(p=0.1, inplace=False) ) (1): T5LayerCrossAttention( (EncDecAttention): T5Attention( (q): Linear(in_features=768, out_features=768, bias=False) (k): Linear(in_features=768, out_features=768, bias=False) (v): Linear(in_features=768, out_features=768, bias=False) (o): Linear(in_features=768, out_features=768, bias=False) ) (layer_norm): T5LayerNorm() (dropout): Dropout(p=0.1, inplace=False) ) (2): T5LayerFF( (DenseReluDense): T5DenseActDense( (wi): Linear(in_features=768, out_features=3072, bias=False) (wo): Linear(in_features=3072, out_features=768, bias=False) (dropout): Dropout(p=0.1, inplace=False) (act): ReLU() ) (layer_norm): T5LayerNorm() (dropout): Dropout(p=0.1, inplace=False) ) ) ) (4): T5Block( (layer): ModuleList( (0): T5LayerSelfAttention( (SelfAttention): T5Attention( (q): Linear(in_features=768, out_features=768, bias=False) (k): Linear(in_features=768, out_features=768, bias=False) (v): Linear(in_features=768, out_features=768, bias=False) (o): Linear(in_features=768, out_features=768, bias=False) ) (layer_norm): T5LayerNorm() (dropout): Dropout(p=0.1, inplace=False) ) (1): T5LayerCrossAttention( (EncDecAttention): T5Attention( (q): Linear(in_features=768, out_features=768, bias=False) (k): Linear(in_features=768, out_features=768, bias=False) (v): Linear(in_features=768, out_features=768, bias=False) (o): Linear(in_features=768, out_features=768, bias=False) ) (layer_norm): T5LayerNorm() (dropout): Dropout(p=0.1, inplace=False) ) (2): T5LayerFF( (DenseReluDense): T5DenseActDense( (wi): Linear(in_features=768, out_features=3072, bias=False) (wo): Linear(in_features=3072, out_features=768, bias=False) (dropout): Dropout(p=0.1, inplace=False) (act): ReLU() ) (layer_norm): T5LayerNorm() (dropout): 
Dropout(p=0.1, inplace=False) ) ) ) (5): T5Block( (layer): ModuleList( (0): T5LayerSelfAttention( (SelfAttention): T5Attention( (q): Linear(in_features=768, out_features=768, bias=False) (k): Linear(in_features=768, out_features=768, bias=False) (v): Linear(in_features=768, out_features=768, bias=False) (o): Linear(in_features=768, out_features=768, bias=False) ) (layer_norm): T5LayerNorm() (dropout): Dropout(p=0.1, inplace=False) ) (1): T5LayerCrossAttention( (EncDecAttention): T5Attention( (q): Linear(in_features=768, out_features=768, bias=False) (k): Linear(in_features=768, out_features=768, bias=False) (v): Linear(in_features=768, out_features=768, bias=False) (o): Linear(in_features=768, out_features=768, bias=False) ) (layer_norm): T5LayerNorm() (dropout): Dropout(p=0.1, inplace=False) ) (2): T5LayerFF( (DenseReluDense): T5DenseActDense( (wi): Linear(in_features=768, out_features=3072, bias=False) (wo): Linear(in_features=3072, out_features=768, bias=False) (dropout): Dropout(p=0.1, inplace=False) (act): ReLU() ) (layer_norm): T5LayerNorm() (dropout): Dropout(p=0.1, inplace=False) ) ) ) (6): T5Block( (layer): ModuleList( (0): T5LayerSelfAttention( (SelfAttention): T5Attention( (q): Linear(in_features=768, out_features=768, bias=False) (k): Linear(in_features=768, out_features=768, bias=False) (v): Linear(in_features=768, out_features=768, bias=False) (o): Linear(in_features=768, out_features=768, bias=False) ) (layer_norm): T5LayerNorm() (dropout): Dropout(p=0.1, inplace=False) ) (1): T5LayerCrossAttention( (EncDecAttention): T5Attention( (q): Linear(in_features=768, out_features=768, bias=False) (k): Linear(in_features=768, out_features=768, bias=False) (v): Linear(in_features=768, out_features=768, bias=False) (o): Linear(in_features=768, out_features=768, bias=False) ) (layer_norm): T5LayerNorm() (dropout): Dropout(p=0.1, inplace=False) ) (2): T5LayerFF( (DenseReluDense): T5DenseActDense( (wi): Linear(in_features=768, out_features=3072, bias=False) (wo): Linear(in_features=3072, out_features=768, bias=False) (dropout): Dropout(p=0.1, inplace=False) (act): ReLU() ) (layer_norm): T5LayerNorm() (dropout): Dropout(p=0.1, inplace=False) ) ) ) (7): T5Block( (layer): ModuleList( (0): T5LayerSelfAttention( (SelfAttention): T5Attention( (q): Linear(in_features=768, out_features=768, bias=False) (k): Linear(in_features=768, out_features=768, bias=False) (v): Linear(in_features=768, out_features=768, bias=False) (o): Linear(in_features=768, out_features=768, bias=False) ) (layer_norm): T5LayerNorm() (dropout): Dropout(p=0.1, inplace=False) ) (1): T5LayerCrossAttention( (EncDecAttention): T5Attention( (q): Linear(in_features=768, out_features=768, bias=False) (k): Linear(in_features=768, out_features=768, bias=False) (v): Linear(in_features=768, out_features=768, bias=False) (o): Linear(in_features=768, out_features=768, bias=False) ) (layer_norm): T5LayerNorm() (dropout): Dropout(p=0.1, inplace=False) ) (2): T5LayerFF( (DenseReluDense): T5DenseActDense( (wi): Linear(in_features=768, out_features=3072, bias=False) (wo): Linear(in_features=3072, out_features=768, bias=False) (dropout): Dropout(p=0.1, inplace=False) (act): ReLU() ) (layer_norm): T5LayerNorm() (dropout): Dropout(p=0.1, inplace=False) ) ) ) (8): T5Block( (layer): ModuleList( (0): T5LayerSelfAttention( (SelfAttention): T5Attention( (q): Linear(in_features=768, out_features=768, bias=False) (k): Linear(in_features=768, out_features=768, bias=False) (v): Linear(in_features=768, out_features=768, bias=False) (o): 
Linear(in_features=768, out_features=768, bias=False) ) (layer_norm): T5LayerNorm() (dropout): Dropout(p=0.1, inplace=False) ) (1): T5LayerCrossAttention( (EncDecAttention): T5Attention( (q): Linear(in_features=768, out_features=768, bias=False) (k): Linear(in_features=768, out_features=768, bias=False) (v): Linear(in_features=768, out_features=768, bias=False) (o): Linear(in_features=768, out_features=768, bias=False) ) (layer_norm): T5LayerNorm() (dropout): Dropout(p=0.1, inplace=False) ) (2): T5LayerFF( (DenseReluDense): T5DenseActDense( (wi): Linear(in_features=768, out_features=3072, bias=False) (wo): Linear(in_features=3072, out_features=768, bias=False) (dropout): Dropout(p=0.1, inplace=False) (act): ReLU() ) (layer_norm): T5LayerNorm() (dropout): Dropout(p=0.1, inplace=False) ) ) ) (9): T5Block( (layer): ModuleList( (0): T5LayerSelfAttention( (SelfAttention): T5Attention( (q): Linear(in_features=768, out_features=768, bias=False) (k): Linear(in_features=768, out_features=768, bias=False) (v): Linear(in_features=768, out_features=768, bias=False) (o): Linear(in_features=768, out_features=768, bias=False) ) (layer_norm): T5LayerNorm() (dropout): Dropout(p=0.1, inplace=False) ) (1): T5LayerCrossAttention( (EncDecAttention): T5Attention( (q): Linear(in_features=768, out_features=768, bias=False) (k): Linear(in_features=768, out_features=768, bias=False) (v): Linear(in_features=768, out_features=768, bias=False) (o): Linear(in_features=768, out_features=768, bias=False) ) (layer_norm): T5LayerNorm() (dropout): Dropout(p=0.1, inplace=False) ) (2): T5LayerFF( (DenseReluDense): T5DenseActDense( (wi): Linear(in_features=768, out_features=3072, bias=False) (wo): Linear(in_features=3072, out_features=768, bias=False) (dropout): Dropout(p=0.1, inplace=False) (act): ReLU() ) (layer_norm): T5LayerNorm() (dropout): Dropout(p=0.1, inplace=False) ) ) ) (10): T5Block( (layer): ModuleList( (0): T5LayerSelfAttention( (SelfAttention): T5Attention( (q): Linear(in_features=768, out_features=768, bias=False) (k): Linear(in_features=768, out_features=768, bias=False) (v): Linear(in_features=768, out_features=768, bias=False) (o): Linear(in_features=768, out_features=768, bias=False) ) (layer_norm): T5LayerNorm() (dropout): Dropout(p=0.1, inplace=False) ) (1): T5LayerCrossAttention( (EncDecAttention): T5Attention( (q): Linear(in_features=768, out_features=768, bias=False) (k): Linear(in_features=768, out_features=768, bias=False) (v): Linear(in_features=768, out_features=768, bias=False) (o): Linear(in_features=768, out_features=768, bias=False) ) (layer_norm): T5LayerNorm() (dropout): Dropout(p=0.1, inplace=False) ) (2): T5LayerFF( (DenseReluDense): T5DenseActDense( (wi): Linear(in_features=768, out_features=3072, bias=False) (wo): Linear(in_features=3072, out_features=768, bias=False) (dropout): Dropout(p=0.1, inplace=False) (act): ReLU() ) (layer_norm): T5LayerNorm() (dropout): Dropout(p=0.1, inplace=False) ) ) ) (11): T5Block( (layer): ModuleList( (0): T5LayerSelfAttention( (SelfAttention): T5Attention( (q): Linear(in_features=768, out_features=768, bias=False) (k): Linear(in_features=768, out_features=768, bias=False) (v): Linear(in_features=768, out_features=768, bias=False) (o): Linear(in_features=768, out_features=768, bias=False) ) (layer_norm): T5LayerNorm() (dropout): Dropout(p=0.1, inplace=False) ) (1): T5LayerCrossAttention( (EncDecAttention): T5Attention( (q): Linear(in_features=768, out_features=768, bias=False) (k): Linear(in_features=768, out_features=768, bias=False) (v): 
Linear(in_features=768, out_features=768, bias=False) (o): Linear(in_features=768, out_features=768, bias=False) ) (layer_norm): T5LayerNorm() (dropout): Dropout(p=0.1, inplace=False) ) (2): T5LayerFF( (DenseReluDense): T5DenseActDense( (wi): Linear(in_features=768, out_features=3072, bias=False) (wo): Linear(in_features=3072, out_features=768, bias=False) (dropout): Dropout(p=0.1, inplace=False) (act): ReLU() ) (layer_norm): T5LayerNorm() (dropout): Dropout(p=0.1, inplace=False) ) ) ) ) (final_layer_norm): T5LayerNorm() (dropout): Dropout(p=0.1, inplace=False) ) (lm_head): T5ClassificationHead( (dense_in): Linear(in_features=768, out_features=768, bias=True) (dense): Linear(in_features=768, out_features=768, bias=True) (dense_out): Linear(in_features=768, out_features=2, bias=True) (dropout): Dropout(p=0.1, inplace=False) ) )
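One detail worth noting in the printout above: loaded standalone like this, the model falls back to the default num_labels of 2, which is why dense_out ends in out_features=2 here, whereas the training run below (where run_glue.py derives the label list from the dataset) builds the same head with out_features=4. When instantiating the model manually, the label count can be passed through from_pretrained, e.g.:
model = T5ForClassification.from_pretrained("t5-base", num_labels=4)  # 4 classes, as in the dataset used here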
Training
!python run_glue.py \
--cache_dir .cache_training \
--model_name_or_path t5-base \
--custom_model t5_custom \
--train_file data/train.json \
--validation_file data/valid.json \
--test_file data/test.json \
--per_device_train_batch_size 8 \
--per_device_eval_batch_size 8 \
--do_train \
--do_eval \
--max_seq_length 128 \
--learning_rate 2e-5 \
--max_eval_samples 2000 \
--max_steps 2500 \
--num_train_epochs 1 \
--save_strategy steps \
--save_steps 250 \
--save_total_limit 5 \
--logging_strategy steps \
--logging_steps 100 \
--eval_steps 250 \
--evaluation_strategy steps \
--metric_for_best_model accuracy \
--greater_is_better True \
--load_best_model_at_end True \
--output_dir out/t5
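As the Trainer logs elsewhere in this notebook point out, --max_steps overrides --num_train_epochs, so the training budget is fixed by the step count. A quick back-of-the-envelope check using only the flags above:
max_steps = 2500
per_device_train_batch_size = 8
print(max_steps * per_device_train_batch_size)  # 20,000 example presentations (repeats across epochs included)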
02/16/2023 15:24:13 - WARNING - __main__ - Process rank: -1, device: cuda:0, n_gpu: 1distributed training: False, 16-bits training: False 02/16/2023 15:24:13 - INFO - __main__ - Training/evaluation parameters TrainingArguments( _n_gpu=1, adafactor=False, adam_beta1=0.9, adam_beta2=0.999, adam_epsilon=1e-08, auto_find_batch_size=False, bf16=False, bf16_full_eval=False, data_seed=None, dataloader_drop_last=False, dataloader_num_workers=0, dataloader_pin_memory=True, ddp_bucket_cap_mb=None, ddp_find_unused_parameters=None, ddp_timeout=1800, debug=[], deepspeed=None, disable_tqdm=False, do_eval=True, do_predict=False, do_train=True, eval_accumulation_steps=None, eval_delay=0, eval_steps=250, evaluation_strategy=steps, fp16=False, fp16_backend=auto, fp16_full_eval=False, fp16_opt_level=O1, fsdp=[], fsdp_min_num_params=0, fsdp_transformer_layer_cls_to_wrap=None, full_determinism=False, gradient_accumulation_steps=1, gradient_checkpointing=False, greater_is_better=True, group_by_length=False, half_precision_backend=auto, hub_model_id=None, hub_private_repo=False, hub_strategy=every_save, hub_token=<HUB_TOKEN>, ignore_data_skip=False, include_inputs_for_metrics=False, jit_mode_eval=False, label_names=None, label_smoothing_factor=0.0, learning_rate=2e-05, length_column_name=length, load_best_model_at_end=True, local_rank=-1, log_level=passive, log_level_replica=passive, log_on_each_node=True, logging_dir=out/t5/runs/Feb16_15-24-12_DESKTOP-R7JO8BQ, logging_first_step=False, logging_nan_inf_filter=True, logging_steps=100, logging_strategy=steps, lr_scheduler_type=linear, max_grad_norm=1.0, max_steps=2500, metric_for_best_model=accuracy, mp_parameters=, no_cuda=False, num_train_epochs=1.0, optim=adamw_hf, optim_args=None, output_dir=out/t5, overwrite_output_dir=False, past_index=-1, per_device_eval_batch_size=8, per_device_train_batch_size=8, prediction_loss_only=False, push_to_hub=False, push_to_hub_model_id=None, push_to_hub_organization=None, push_to_hub_token=<PUSH_TO_HUB_TOKEN>, ray_scope=last, remove_unused_columns=True, report_to=[], resume_from_checkpoint=None, run_name=out/t5, save_on_each_node=False, save_steps=250, save_strategy=steps, save_total_limit=5, seed=42, sharded_ddp=[], skip_memory_metrics=True, tf32=None, torch_compile=False, torch_compile_backend=None, torch_compile_mode=None, torchdynamo=None, tpu_metrics_debug=False, tpu_num_cores=None, use_ipex=False, use_legacy_prediction_loop=False, use_mps_device=False, warmup_ratio=0.0, warmup_steps=0, weight_decay=0.0, xpu_backend=None, ) 02/16/2023 15:24:13 - INFO - __main__ - Checkpoint detected, resuming training at out/t5/checkpoint-2500. To avoid this behavior, change the `--output_dir` or add `--overwrite_output_dir` to train from scratch. 02/16/2023 15:24:13 - INFO - __main__ - load a local file for train: data/train.json 02/16/2023 15:24:13 - INFO - __main__ - load a local file for validation: data/valid.json 02/16/2023 15:24:13 - WARNING - datasets.builder - Using custom data configuration default-e10a382a423bbb9a 02/16/2023 15:24:13 - INFO - datasets.info - Loading Dataset Infos from /home/jacob/anaconda3/envs/ugp/lib/python3.10/site-packages/datasets/packaged_modules/json 02/16/2023 15:24:13 - INFO - datasets.builder - Overwrite dataset info from restored data version. 
02/16/2023 15:24:13 - INFO - datasets.info - Loading Dataset info from .cache_training/json/default-e10a382a423bbb9a/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51 02/16/2023 15:24:13 - WARNING - datasets.builder - Found cached dataset json (/home/jacob/code/university/uczenie_glebokie_w_przetwarzaniu_tekstu/.cache_training/json/default-e10a382a423bbb9a/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51) 02/16/2023 15:24:13 - INFO - datasets.info - Loading Dataset info from /home/jacob/code/university/uczenie_glebokie_w_przetwarzaniu_tekstu/.cache_training/json/default-e10a382a423bbb9a/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51 100%|████████████████████████████████████████████| 2/2 [00:00<00:00, 426.97it/s] [INFO|configuration_utils.py:660] 2023-02-16 15:24:14,422 >> loading configuration file config.json from cache at .cache_training/models--t5-base/snapshots/0db7e623bcaee2daf9b859a646637ea39bf016cd/config.json [INFO|configuration_utils.py:712] 2023-02-16 15:24:14,423 >> Model config T5Config { "_name_or_path": "t5-base", "architectures": [ "T5ForConditionalGeneration" ], "d_ff": 3072, "d_kv": 64, "d_model": 768, "decoder_start_token_id": 0, "dense_act_fn": "relu", "dropout_rate": 0.1, "eos_token_id": 1, "feed_forward_proj": "relu", "id2label": { "0": "LABEL_0", "1": "LABEL_1", "2": "LABEL_2", "3": "LABEL_3" }, "initializer_factor": 1.0, "is_encoder_decoder": true, "is_gated_act": false, "label2id": { "LABEL_0": 0, "LABEL_1": 1, "LABEL_2": 2, "LABEL_3": 3 }, "layer_norm_epsilon": 1e-06, "model_type": "t5", "n_positions": 512, "num_decoder_layers": 12, "num_heads": 12, "num_layers": 12, "output_past": true, "pad_token_id": 0, "relative_attention_max_distance": 128, "relative_attention_num_buckets": 32, "task_specific_params": { "summarization": { "early_stopping": true, "length_penalty": 2.0, "max_length": 200, "min_length": 30, "no_repeat_ngram_size": 3, "num_beams": 4, "prefix": "summarize: " }, "translation_en_to_de": { "early_stopping": true, "max_length": 300, "num_beams": 4, "prefix": "translate English to German: " }, "translation_en_to_fr": { "early_stopping": true, "max_length": 300, "num_beams": 4, "prefix": "translate English to French: " }, "translation_en_to_ro": { "early_stopping": true, "max_length": 300, "num_beams": 4, "prefix": "translate English to Romanian: " } }, "transformers_version": "4.26.1", "use_cache": true, "vocab_size": 32128 } [INFO|tokenization_auto.py:458] 2023-02-16 15:24:14,918 >> Could not locate the tokenizer configuration file, will try to use the model config instead. 
[INFO|configuration_utils.py:660] 2023-02-16 15:24:15,378 >> loading configuration file config.json from cache at .cache_training/models--t5-base/snapshots/0db7e623bcaee2daf9b859a646637ea39bf016cd/config.json [INFO|configuration_utils.py:712] 2023-02-16 15:24:15,378 >> Model config T5Config { "_name_or_path": "t5-base", "architectures": [ "T5ForConditionalGeneration" ], "d_ff": 3072, "d_kv": 64, "d_model": 768, "decoder_start_token_id": 0, "dense_act_fn": "relu", "dropout_rate": 0.1, "eos_token_id": 1, "feed_forward_proj": "relu", "initializer_factor": 1.0, "is_encoder_decoder": true, "is_gated_act": false, "layer_norm_epsilon": 1e-06, "model_type": "t5", "n_positions": 512, "num_decoder_layers": 12, "num_heads": 12, "num_layers": 12, "output_past": true, "pad_token_id": 0, "relative_attention_max_distance": 128, "relative_attention_num_buckets": 32, "task_specific_params": { "summarization": { "early_stopping": true, "length_penalty": 2.0, "max_length": 200, "min_length": 30, "no_repeat_ngram_size": 3, "num_beams": 4, "prefix": "summarize: " }, "translation_en_to_de": { "early_stopping": true, "max_length": 300, "num_beams": 4, "prefix": "translate English to German: " }, "translation_en_to_fr": { "early_stopping": true, "max_length": 300, "num_beams": 4, "prefix": "translate English to French: " }, "translation_en_to_ro": { "early_stopping": true, "max_length": 300, "num_beams": 4, "prefix": "translate English to Romanian: " } }, "transformers_version": "4.26.1", "use_cache": true, "vocab_size": 32128 } [INFO|tokenization_utils_base.py:1802] 2023-02-16 15:24:16,341 >> loading file spiece.model from cache at .cache_training/models--t5-base/snapshots/0db7e623bcaee2daf9b859a646637ea39bf016cd/spiece.model [INFO|tokenization_utils_base.py:1802] 2023-02-16 15:24:16,341 >> loading file tokenizer.json from cache at .cache_training/models--t5-base/snapshots/0db7e623bcaee2daf9b859a646637ea39bf016cd/tokenizer.json [INFO|tokenization_utils_base.py:1802] 2023-02-16 15:24:16,341 >> loading file added_tokens.json from cache at None [INFO|tokenization_utils_base.py:1802] 2023-02-16 15:24:16,341 >> loading file special_tokens_map.json from cache at None [INFO|tokenization_utils_base.py:1802] 2023-02-16 15:24:16,341 >> loading file tokenizer_config.json from cache at None [INFO|configuration_utils.py:660] 2023-02-16 15:24:16,342 >> loading configuration file config.json from cache at .cache_training/models--t5-base/snapshots/0db7e623bcaee2daf9b859a646637ea39bf016cd/config.json [INFO|configuration_utils.py:712] 2023-02-16 15:24:16,342 >> Model config T5Config { "_name_or_path": "t5-base", "architectures": [ "T5ForConditionalGeneration" ], "d_ff": 3072, "d_kv": 64, "d_model": 768, "decoder_start_token_id": 0, "dense_act_fn": "relu", "dropout_rate": 0.1, "eos_token_id": 1, "feed_forward_proj": "relu", "initializer_factor": 1.0, "is_encoder_decoder": true, "is_gated_act": false, "layer_norm_epsilon": 1e-06, "model_type": "t5", "n_positions": 512, "num_decoder_layers": 12, "num_heads": 12, "num_layers": 12, "output_past": true, "pad_token_id": 0, "relative_attention_max_distance": 128, "relative_attention_num_buckets": 32, "task_specific_params": { "summarization": { "early_stopping": true, "length_penalty": 2.0, "max_length": 200, "min_length": 30, "no_repeat_ngram_size": 3, "num_beams": 4, "prefix": "summarize: " }, "translation_en_to_de": { "early_stopping": true, "max_length": 300, "num_beams": 4, "prefix": "translate English to German: " }, "translation_en_to_fr": { "early_stopping": true, "max_length": 
300, "num_beams": 4, "prefix": "translate English to French: " }, "translation_en_to_ro": { "early_stopping": true, "max_length": 300, "num_beams": 4, "prefix": "translate English to Romanian: " } }, "transformers_version": "4.26.1", "use_cache": true, "vocab_size": 32128 } /home/jacob/anaconda3/envs/ugp/lib/python3.10/site-packages/transformers/models/t5/tokenization_t5_fast.py:155: FutureWarning: This tokenizer was incorrectly instantiated with a model max length of 512 which will be corrected in Transformers v5. For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`. - Be aware that you SHOULD NOT rely on t5-base automatically truncating your input to 512 when padding/encoding. - If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding. - To avoid this warning, please instantiate this tokenizer with `model_max_length` set to your preferred value. warnings.warn( 02/16/2023 15:24:16 - INFO - __main__ - Using hidden states in model: False -------------------------------------------------------- Using hidden: False 02/16/2023 15:24:16 - INFO - __main__ - Using implementation from class: T5ForClassification [INFO|modeling_utils.py:2275] 2023-02-16 15:24:16,391 >> loading weights file pytorch_model.bin from cache at .cache_training/models--t5-base/snapshots/0db7e623bcaee2daf9b859a646637ea39bf016cd/pytorch_model.bin [WARNING|modeling_utils.py:2847] 2023-02-16 15:24:19,101 >> Some weights of the model checkpoint at t5-base were not used when initializing T5ForClassification: ['decoder.block.0.layer.1.EncDecAttention.relative_attention_bias.weight'] - This IS expected if you are initializing T5ForClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model). - This IS NOT expected if you are initializing T5ForClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model). [WARNING|modeling_utils.py:2859] 2023-02-16 15:24:19,102 >> Some weights of T5ForClassification were not initialized from the model checkpoint at t5-base and are newly initialized: ['decoder.embed_tokens.weight', 'lm_head.dense.bias', 'lm_head.dense_out.bias', 'encoder.embed_tokens.weight', 'lm_head.dense_in.bias', 'lm_head.dense_in.weight', 'lm_head.dense.weight', 'lm_head.dense_out.weight'] You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference. 
T5ForClassification( (shared): Embedding(32128, 768) (encoder): T5Stack( (embed_tokens): Embedding(32128, 768) (block): ModuleList( (0): T5Block( (layer): ModuleList( (0): T5LayerSelfAttention( (SelfAttention): T5Attention( (q): Linear(in_features=768, out_features=768, bias=False) (k): Linear(in_features=768, out_features=768, bias=False) (v): Linear(in_features=768, out_features=768, bias=False) (o): Linear(in_features=768, out_features=768, bias=False) (relative_attention_bias): Embedding(32, 12) ) (layer_norm): T5LayerNorm() (dropout): Dropout(p=0.1, inplace=False) ) (1): T5LayerFF( (DenseReluDense): T5DenseActDense( (wi): Linear(in_features=768, out_features=3072, bias=False) (wo): Linear(in_features=3072, out_features=768, bias=False) (dropout): Dropout(p=0.1, inplace=False) (act): ReLU() ) (layer_norm): T5LayerNorm() (dropout): Dropout(p=0.1, inplace=False) ) ) ) (1): T5Block( (layer): ModuleList( (0): T5LayerSelfAttention( (SelfAttention): T5Attention( (q): Linear(in_features=768, out_features=768, bias=False) (k): Linear(in_features=768, out_features=768, bias=False) (v): Linear(in_features=768, out_features=768, bias=False) (o): Linear(in_features=768, out_features=768, bias=False) ) (layer_norm): T5LayerNorm() (dropout): Dropout(p=0.1, inplace=False) ) (1): T5LayerFF( (DenseReluDense): T5DenseActDense( (wi): Linear(in_features=768, out_features=3072, bias=False) (wo): Linear(in_features=3072, out_features=768, bias=False) (dropout): Dropout(p=0.1, inplace=False) (act): ReLU() ) (layer_norm): T5LayerNorm() (dropout): Dropout(p=0.1, inplace=False) ) ) ) (2): T5Block( (layer): ModuleList( (0): T5LayerSelfAttention( (SelfAttention): T5Attention( (q): Linear(in_features=768, out_features=768, bias=False) (k): Linear(in_features=768, out_features=768, bias=False) (v): Linear(in_features=768, out_features=768, bias=False) (o): Linear(in_features=768, out_features=768, bias=False) ) (layer_norm): T5LayerNorm() (dropout): Dropout(p=0.1, inplace=False) ) (1): T5LayerFF( (DenseReluDense): T5DenseActDense( (wi): Linear(in_features=768, out_features=3072, bias=False) (wo): Linear(in_features=3072, out_features=768, bias=False) (dropout): Dropout(p=0.1, inplace=False) (act): ReLU() ) (layer_norm): T5LayerNorm() (dropout): Dropout(p=0.1, inplace=False) ) ) ) (3): T5Block( (layer): ModuleList( (0): T5LayerSelfAttention( (SelfAttention): T5Attention( (q): Linear(in_features=768, out_features=768, bias=False) (k): Linear(in_features=768, out_features=768, bias=False) (v): Linear(in_features=768, out_features=768, bias=False) (o): Linear(in_features=768, out_features=768, bias=False) ) (layer_norm): T5LayerNorm() (dropout): Dropout(p=0.1, inplace=False) ) (1): T5LayerFF( (DenseReluDense): T5DenseActDense( (wi): Linear(in_features=768, out_features=3072, bias=False) (wo): Linear(in_features=3072, out_features=768, bias=False) (dropout): Dropout(p=0.1, inplace=False) (act): ReLU() ) (layer_norm): T5LayerNorm() (dropout): Dropout(p=0.1, inplace=False) ) ) ) (4): T5Block( (layer): ModuleList( (0): T5LayerSelfAttention( (SelfAttention): T5Attention( (q): Linear(in_features=768, out_features=768, bias=False) (k): Linear(in_features=768, out_features=768, bias=False) (v): Linear(in_features=768, out_features=768, bias=False) (o): Linear(in_features=768, out_features=768, bias=False) ) (layer_norm): T5LayerNorm() (dropout): Dropout(p=0.1, inplace=False) ) (1): T5LayerFF( (DenseReluDense): T5DenseActDense( (wi): Linear(in_features=768, out_features=3072, bias=False) (wo): Linear(in_features=3072, 
out_features=768, bias=False) (dropout): Dropout(p=0.1, inplace=False) (act): ReLU() ) (layer_norm): T5LayerNorm() (dropout): Dropout(p=0.1, inplace=False) ) ) ) (5): T5Block( (layer): ModuleList( (0): T5LayerSelfAttention( (SelfAttention): T5Attention( (q): Linear(in_features=768, out_features=768, bias=False) (k): Linear(in_features=768, out_features=768, bias=False) (v): Linear(in_features=768, out_features=768, bias=False) (o): Linear(in_features=768, out_features=768, bias=False) ) (layer_norm): T5LayerNorm() (dropout): Dropout(p=0.1, inplace=False) ) (1): T5LayerFF( (DenseReluDense): T5DenseActDense( (wi): Linear(in_features=768, out_features=3072, bias=False) (wo): Linear(in_features=3072, out_features=768, bias=False) (dropout): Dropout(p=0.1, inplace=False) (act): ReLU() ) (layer_norm): T5LayerNorm() (dropout): Dropout(p=0.1, inplace=False) ) ) ) (6): T5Block( (layer): ModuleList( (0): T5LayerSelfAttention( (SelfAttention): T5Attention( (q): Linear(in_features=768, out_features=768, bias=False) (k): Linear(in_features=768, out_features=768, bias=False) (v): Linear(in_features=768, out_features=768, bias=False) (o): Linear(in_features=768, out_features=768, bias=False) ) (layer_norm): T5LayerNorm() (dropout): Dropout(p=0.1, inplace=False) ) (1): T5LayerFF( (DenseReluDense): T5DenseActDense( (wi): Linear(in_features=768, out_features=3072, bias=False) (wo): Linear(in_features=3072, out_features=768, bias=False) (dropout): Dropout(p=0.1, inplace=False) (act): ReLU() ) (layer_norm): T5LayerNorm() (dropout): Dropout(p=0.1, inplace=False) ) ) ) (7): T5Block( (layer): ModuleList( (0): T5LayerSelfAttention( (SelfAttention): T5Attention( (q): Linear(in_features=768, out_features=768, bias=False) (k): Linear(in_features=768, out_features=768, bias=False) (v): Linear(in_features=768, out_features=768, bias=False) (o): Linear(in_features=768, out_features=768, bias=False) ) (layer_norm): T5LayerNorm() (dropout): Dropout(p=0.1, inplace=False) ) (1): T5LayerFF( (DenseReluDense): T5DenseActDense( (wi): Linear(in_features=768, out_features=3072, bias=False) (wo): Linear(in_features=3072, out_features=768, bias=False) (dropout): Dropout(p=0.1, inplace=False) (act): ReLU() ) (layer_norm): T5LayerNorm() (dropout): Dropout(p=0.1, inplace=False) ) ) ) (8): T5Block( (layer): ModuleList( (0): T5LayerSelfAttention( (SelfAttention): T5Attention( (q): Linear(in_features=768, out_features=768, bias=False) (k): Linear(in_features=768, out_features=768, bias=False) (v): Linear(in_features=768, out_features=768, bias=False) (o): Linear(in_features=768, out_features=768, bias=False) ) (layer_norm): T5LayerNorm() (dropout): Dropout(p=0.1, inplace=False) ) (1): T5LayerFF( (DenseReluDense): T5DenseActDense( (wi): Linear(in_features=768, out_features=3072, bias=False) (wo): Linear(in_features=3072, out_features=768, bias=False) (dropout): Dropout(p=0.1, inplace=False) (act): ReLU() ) (layer_norm): T5LayerNorm() (dropout): Dropout(p=0.1, inplace=False) ) ) ) (9): T5Block( (layer): ModuleList( (0): T5LayerSelfAttention( (SelfAttention): T5Attention( (q): Linear(in_features=768, out_features=768, bias=False) (k): Linear(in_features=768, out_features=768, bias=False) (v): Linear(in_features=768, out_features=768, bias=False) (o): Linear(in_features=768, out_features=768, bias=False) ) (layer_norm): T5LayerNorm() (dropout): Dropout(p=0.1, inplace=False) ) (1): T5LayerFF( (DenseReluDense): T5DenseActDense( (wi): Linear(in_features=768, out_features=3072, bias=False) (wo): Linear(in_features=3072, out_features=768, 
bias=False) (dropout): Dropout(p=0.1, inplace=False) (act): ReLU() ) (layer_norm): T5LayerNorm() (dropout): Dropout(p=0.1, inplace=False) ) ) ) (10): T5Block( (layer): ModuleList( (0): T5LayerSelfAttention( (SelfAttention): T5Attention( (q): Linear(in_features=768, out_features=768, bias=False) (k): Linear(in_features=768, out_features=768, bias=False) (v): Linear(in_features=768, out_features=768, bias=False) (o): Linear(in_features=768, out_features=768, bias=False) ) (layer_norm): T5LayerNorm() (dropout): Dropout(p=0.1, inplace=False) ) (1): T5LayerFF( (DenseReluDense): T5DenseActDense( (wi): Linear(in_features=768, out_features=3072, bias=False) (wo): Linear(in_features=3072, out_features=768, bias=False) (dropout): Dropout(p=0.1, inplace=False) (act): ReLU() ) (layer_norm): T5LayerNorm() (dropout): Dropout(p=0.1, inplace=False) ) ) ) (11): T5Block( (layer): ModuleList( (0): T5LayerSelfAttention( (SelfAttention): T5Attention( (q): Linear(in_features=768, out_features=768, bias=False) (k): Linear(in_features=768, out_features=768, bias=False) (v): Linear(in_features=768, out_features=768, bias=False) (o): Linear(in_features=768, out_features=768, bias=False) ) (layer_norm): T5LayerNorm() (dropout): Dropout(p=0.1, inplace=False) ) (1): T5LayerFF( (DenseReluDense): T5DenseActDense( (wi): Linear(in_features=768, out_features=3072, bias=False) (wo): Linear(in_features=3072, out_features=768, bias=False) (dropout): Dropout(p=0.1, inplace=False) (act): ReLU() ) (layer_norm): T5LayerNorm() (dropout): Dropout(p=0.1, inplace=False) ) ) ) ) (final_layer_norm): T5LayerNorm() (dropout): Dropout(p=0.1, inplace=False) ) (decoder): T5Stack( (embed_tokens): Embedding(32128, 768) (block): ModuleList( (0): T5Block( (layer): ModuleList( (0): T5LayerSelfAttention( (SelfAttention): T5Attention( (q): Linear(in_features=768, out_features=768, bias=False) (k): Linear(in_features=768, out_features=768, bias=False) (v): Linear(in_features=768, out_features=768, bias=False) (o): Linear(in_features=768, out_features=768, bias=False) (relative_attention_bias): Embedding(32, 12) ) (layer_norm): T5LayerNorm() (dropout): Dropout(p=0.1, inplace=False) ) (1): T5LayerCrossAttention( (EncDecAttention): T5Attention( (q): Linear(in_features=768, out_features=768, bias=False) (k): Linear(in_features=768, out_features=768, bias=False) (v): Linear(in_features=768, out_features=768, bias=False) (o): Linear(in_features=768, out_features=768, bias=False) ) (layer_norm): T5LayerNorm() (dropout): Dropout(p=0.1, inplace=False) ) (2): T5LayerFF( (DenseReluDense): T5DenseActDense( (wi): Linear(in_features=768, out_features=3072, bias=False) (wo): Linear(in_features=3072, out_features=768, bias=False) (dropout): Dropout(p=0.1, inplace=False) (act): ReLU() ) (layer_norm): T5LayerNorm() (dropout): Dropout(p=0.1, inplace=False) ) ) ) (1): T5Block( (layer): ModuleList( (0): T5LayerSelfAttention( (SelfAttention): T5Attention( (q): Linear(in_features=768, out_features=768, bias=False) (k): Linear(in_features=768, out_features=768, bias=False) (v): Linear(in_features=768, out_features=768, bias=False) (o): Linear(in_features=768, out_features=768, bias=False) ) (layer_norm): T5LayerNorm() (dropout): Dropout(p=0.1, inplace=False) ) (1): T5LayerCrossAttention( (EncDecAttention): T5Attention( (q): Linear(in_features=768, out_features=768, bias=False) (k): Linear(in_features=768, out_features=768, bias=False) (v): Linear(in_features=768, out_features=768, bias=False) (o): Linear(in_features=768, out_features=768, bias=False) ) (layer_norm): 
T5LayerNorm() (dropout): Dropout(p=0.1, inplace=False) ) (2): T5LayerFF( (DenseReluDense): T5DenseActDense( (wi): Linear(in_features=768, out_features=3072, bias=False) (wo): Linear(in_features=3072, out_features=768, bias=False) (dropout): Dropout(p=0.1, inplace=False) (act): ReLU() ) (layer_norm): T5LayerNorm() (dropout): Dropout(p=0.1, inplace=False) ) ) ) (2): T5Block( (layer): ModuleList( (0): T5LayerSelfAttention( (SelfAttention): T5Attention( (q): Linear(in_features=768, out_features=768, bias=False) (k): Linear(in_features=768, out_features=768, bias=False) (v): Linear(in_features=768, out_features=768, bias=False) (o): Linear(in_features=768, out_features=768, bias=False) ) (layer_norm): T5LayerNorm() (dropout): Dropout(p=0.1, inplace=False) ) (1): T5LayerCrossAttention( (EncDecAttention): T5Attention( (q): Linear(in_features=768, out_features=768, bias=False) (k): Linear(in_features=768, out_features=768, bias=False) (v): Linear(in_features=768, out_features=768, bias=False) (o): Linear(in_features=768, out_features=768, bias=False) ) (layer_norm): T5LayerNorm() (dropout): Dropout(p=0.1, inplace=False) ) (2): T5LayerFF( (DenseReluDense): T5DenseActDense( (wi): Linear(in_features=768, out_features=3072, bias=False) (wo): Linear(in_features=3072, out_features=768, bias=False) (dropout): Dropout(p=0.1, inplace=False) (act): ReLU() ) (layer_norm): T5LayerNorm() (dropout): Dropout(p=0.1, inplace=False) ) ) ) (3): T5Block( (layer): ModuleList( (0): T5LayerSelfAttention( (SelfAttention): T5Attention( (q): Linear(in_features=768, out_features=768, bias=False) (k): Linear(in_features=768, out_features=768, bias=False) (v): Linear(in_features=768, out_features=768, bias=False) (o): Linear(in_features=768, out_features=768, bias=False) ) (layer_norm): T5LayerNorm() (dropout): Dropout(p=0.1, inplace=False) ) (1): T5LayerCrossAttention( (EncDecAttention): T5Attention( (q): Linear(in_features=768, out_features=768, bias=False) (k): Linear(in_features=768, out_features=768, bias=False) (v): Linear(in_features=768, out_features=768, bias=False) (o): Linear(in_features=768, out_features=768, bias=False) ) (layer_norm): T5LayerNorm() (dropout): Dropout(p=0.1, inplace=False) ) (2): T5LayerFF( (DenseReluDense): T5DenseActDense( (wi): Linear(in_features=768, out_features=3072, bias=False) (wo): Linear(in_features=3072, out_features=768, bias=False) (dropout): Dropout(p=0.1, inplace=False) (act): ReLU() ) (layer_norm): T5LayerNorm() (dropout): Dropout(p=0.1, inplace=False) ) ) ) (4): T5Block( (layer): ModuleList( (0): T5LayerSelfAttention( (SelfAttention): T5Attention( (q): Linear(in_features=768, out_features=768, bias=False) (k): Linear(in_features=768, out_features=768, bias=False) (v): Linear(in_features=768, out_features=768, bias=False) (o): Linear(in_features=768, out_features=768, bias=False) ) (layer_norm): T5LayerNorm() (dropout): Dropout(p=0.1, inplace=False) ) (1): T5LayerCrossAttention( (EncDecAttention): T5Attention( (q): Linear(in_features=768, out_features=768, bias=False) (k): Linear(in_features=768, out_features=768, bias=False) (v): Linear(in_features=768, out_features=768, bias=False) (o): Linear(in_features=768, out_features=768, bias=False) ) (layer_norm): T5LayerNorm() (dropout): Dropout(p=0.1, inplace=False) ) (2): T5LayerFF( (DenseReluDense): T5DenseActDense( (wi): Linear(in_features=768, out_features=3072, bias=False) (wo): Linear(in_features=3072, out_features=768, bias=False) (dropout): Dropout(p=0.1, inplace=False) (act): ReLU() ) (layer_norm): T5LayerNorm() (dropout): 
Dropout(p=0.1, inplace=False) ) ) ) (5): T5Block( (layer): ModuleList( (0): T5LayerSelfAttention( (SelfAttention): T5Attention( (q): Linear(in_features=768, out_features=768, bias=False) (k): Linear(in_features=768, out_features=768, bias=False) (v): Linear(in_features=768, out_features=768, bias=False) (o): Linear(in_features=768, out_features=768, bias=False) ) (layer_norm): T5LayerNorm() (dropout): Dropout(p=0.1, inplace=False) ) (1): T5LayerCrossAttention( (EncDecAttention): T5Attention( (q): Linear(in_features=768, out_features=768, bias=False) (k): Linear(in_features=768, out_features=768, bias=False) (v): Linear(in_features=768, out_features=768, bias=False) (o): Linear(in_features=768, out_features=768, bias=False) ) (layer_norm): T5LayerNorm() (dropout): Dropout(p=0.1, inplace=False) ) (2): T5LayerFF( (DenseReluDense): T5DenseActDense( (wi): Linear(in_features=768, out_features=3072, bias=False) (wo): Linear(in_features=3072, out_features=768, bias=False) (dropout): Dropout(p=0.1, inplace=False) (act): ReLU() ) (layer_norm): T5LayerNorm() (dropout): Dropout(p=0.1, inplace=False) ) ) ) (6): T5Block( (layer): ModuleList( (0): T5LayerSelfAttention( (SelfAttention): T5Attention( (q): Linear(in_features=768, out_features=768, bias=False) (k): Linear(in_features=768, out_features=768, bias=False) (v): Linear(in_features=768, out_features=768, bias=False) (o): Linear(in_features=768, out_features=768, bias=False) ) (layer_norm): T5LayerNorm() (dropout): Dropout(p=0.1, inplace=False) ) (1): T5LayerCrossAttention( (EncDecAttention): T5Attention( (q): Linear(in_features=768, out_features=768, bias=False) (k): Linear(in_features=768, out_features=768, bias=False) (v): Linear(in_features=768, out_features=768, bias=False) (o): Linear(in_features=768, out_features=768, bias=False) ) (layer_norm): T5LayerNorm() (dropout): Dropout(p=0.1, inplace=False) ) (2): T5LayerFF( (DenseReluDense): T5DenseActDense( (wi): Linear(in_features=768, out_features=3072, bias=False) (wo): Linear(in_features=3072, out_features=768, bias=False) (dropout): Dropout(p=0.1, inplace=False) (act): ReLU() ) (layer_norm): T5LayerNorm() (dropout): Dropout(p=0.1, inplace=False) ) ) ) (7): T5Block( (layer): ModuleList( (0): T5LayerSelfAttention( (SelfAttention): T5Attention( (q): Linear(in_features=768, out_features=768, bias=False) (k): Linear(in_features=768, out_features=768, bias=False) (v): Linear(in_features=768, out_features=768, bias=False) (o): Linear(in_features=768, out_features=768, bias=False) ) (layer_norm): T5LayerNorm() (dropout): Dropout(p=0.1, inplace=False) ) (1): T5LayerCrossAttention( (EncDecAttention): T5Attention( (q): Linear(in_features=768, out_features=768, bias=False) (k): Linear(in_features=768, out_features=768, bias=False) (v): Linear(in_features=768, out_features=768, bias=False) (o): Linear(in_features=768, out_features=768, bias=False) ) (layer_norm): T5LayerNorm() (dropout): Dropout(p=0.1, inplace=False) ) (2): T5LayerFF( (DenseReluDense): T5DenseActDense( (wi): Linear(in_features=768, out_features=3072, bias=False) (wo): Linear(in_features=3072, out_features=768, bias=False) (dropout): Dropout(p=0.1, inplace=False) (act): ReLU() ) (layer_norm): T5LayerNorm() (dropout): Dropout(p=0.1, inplace=False) ) ) ) (8): T5Block( (layer): ModuleList( (0): T5LayerSelfAttention( (SelfAttention): T5Attention( (q): Linear(in_features=768, out_features=768, bias=False) (k): Linear(in_features=768, out_features=768, bias=False) (v): Linear(in_features=768, out_features=768, bias=False) (o): 
Linear(in_features=768, out_features=768, bias=False) ) (layer_norm): T5LayerNorm() (dropout): Dropout(p=0.1, inplace=False) ) (1): T5LayerCrossAttention( (EncDecAttention): T5Attention( (q): Linear(in_features=768, out_features=768, bias=False) (k): Linear(in_features=768, out_features=768, bias=False) (v): Linear(in_features=768, out_features=768, bias=False) (o): Linear(in_features=768, out_features=768, bias=False) ) (layer_norm): T5LayerNorm() (dropout): Dropout(p=0.1, inplace=False) ) (2): T5LayerFF( (DenseReluDense): T5DenseActDense( (wi): Linear(in_features=768, out_features=3072, bias=False) (wo): Linear(in_features=3072, out_features=768, bias=False) (dropout): Dropout(p=0.1, inplace=False) (act): ReLU() ) (layer_norm): T5LayerNorm() (dropout): Dropout(p=0.1, inplace=False) ) ) ) (9): T5Block( (layer): ModuleList( (0): T5LayerSelfAttention( (SelfAttention): T5Attention( (q): Linear(in_features=768, out_features=768, bias=False) (k): Linear(in_features=768, out_features=768, bias=False) (v): Linear(in_features=768, out_features=768, bias=False) (o): Linear(in_features=768, out_features=768, bias=False) ) (layer_norm): T5LayerNorm() (dropout): Dropout(p=0.1, inplace=False) ) (1): T5LayerCrossAttention( (EncDecAttention): T5Attention( (q): Linear(in_features=768, out_features=768, bias=False) (k): Linear(in_features=768, out_features=768, bias=False) (v): Linear(in_features=768, out_features=768, bias=False) (o): Linear(in_features=768, out_features=768, bias=False) ) (layer_norm): T5LayerNorm() (dropout): Dropout(p=0.1, inplace=False) ) (2): T5LayerFF( (DenseReluDense): T5DenseActDense( (wi): Linear(in_features=768, out_features=3072, bias=False) (wo): Linear(in_features=3072, out_features=768, bias=False) (dropout): Dropout(p=0.1, inplace=False) (act): ReLU() ) (layer_norm): T5LayerNorm() (dropout): Dropout(p=0.1, inplace=False) ) ) ) (10): T5Block( (layer): ModuleList( (0): T5LayerSelfAttention( (SelfAttention): T5Attention( (q): Linear(in_features=768, out_features=768, bias=False) (k): Linear(in_features=768, out_features=768, bias=False) (v): Linear(in_features=768, out_features=768, bias=False) (o): Linear(in_features=768, out_features=768, bias=False) ) (layer_norm): T5LayerNorm() (dropout): Dropout(p=0.1, inplace=False) ) (1): T5LayerCrossAttention( (EncDecAttention): T5Attention( (q): Linear(in_features=768, out_features=768, bias=False) (k): Linear(in_features=768, out_features=768, bias=False) (v): Linear(in_features=768, out_features=768, bias=False) (o): Linear(in_features=768, out_features=768, bias=False) ) (layer_norm): T5LayerNorm() (dropout): Dropout(p=0.1, inplace=False) ) (2): T5LayerFF( (DenseReluDense): T5DenseActDense( (wi): Linear(in_features=768, out_features=3072, bias=False) (wo): Linear(in_features=3072, out_features=768, bias=False) (dropout): Dropout(p=0.1, inplace=False) (act): ReLU() ) (layer_norm): T5LayerNorm() (dropout): Dropout(p=0.1, inplace=False) ) ) ) (11): T5Block( (layer): ModuleList( (0): T5LayerSelfAttention( (SelfAttention): T5Attention( (q): Linear(in_features=768, out_features=768, bias=False) (k): Linear(in_features=768, out_features=768, bias=False) (v): Linear(in_features=768, out_features=768, bias=False) (o): Linear(in_features=768, out_features=768, bias=False) ) (layer_norm): T5LayerNorm() (dropout): Dropout(p=0.1, inplace=False) ) (1): T5LayerCrossAttention( (EncDecAttention): T5Attention( (q): Linear(in_features=768, out_features=768, bias=False) (k): Linear(in_features=768, out_features=768, bias=False) (v): 
Linear(in_features=768, out_features=768, bias=False) (o): Linear(in_features=768, out_features=768, bias=False) ) (layer_norm): T5LayerNorm() (dropout): Dropout(p=0.1, inplace=False) ) (2): T5LayerFF( (DenseReluDense): T5DenseActDense( (wi): Linear(in_features=768, out_features=3072, bias=False) (wo): Linear(in_features=3072, out_features=768, bias=False) (dropout): Dropout(p=0.1, inplace=False) (act): ReLU() ) (layer_norm): T5LayerNorm() (dropout): Dropout(p=0.1, inplace=False) ) ) ) ) (final_layer_norm): T5LayerNorm() (dropout): Dropout(p=0.1, inplace=False) ) (lm_head): T5ClassificationHead( (dense_in): Linear(in_features=768, out_features=768, bias=True) (dense): Linear(in_features=768, out_features=768, bias=True) (dense_out): Linear(in_features=768, out_features=4, bias=True) (dropout): Dropout(p=0.1, inplace=False) ) ) Running tokenizer on dataset: 0%| | 0/120 [00:00<?, ?ba/s]02/16/2023 15:24:19 - INFO - datasets.arrow_dataset - Caching processed dataset at /home/jacob/code/university/uczenie_glebokie_w_przetwarzaniu_tekstu/.cache_training/json/default-e10a382a423bbb9a/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51/cache-0f99c998b010fbf8.arrow Running tokenizer on dataset: 100%|███████████| 120/120 [00:07<00:00, 15.69ba/s] Running tokenizer on dataset: 0%| | 0/4 [00:00<?, ?ba/s]02/16/2023 15:24:26 - INFO - datasets.arrow_dataset - Caching processed dataset at /home/jacob/code/university/uczenie_glebokie_w_przetwarzaniu_tekstu/.cache_training/json/default-e10a382a423bbb9a/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51/cache-0cfaba6ab7fdc0e3.arrow Running tokenizer on dataset: 100%|███████████████| 4/4 [00:00<00:00, 17.12ba/s] 02/16/2023 15:24:27 - INFO - __main__ - Set 500 samples for 0-class 02/16/2023 15:24:27 - INFO - __main__ - Set 500 samples for 1-class 02/16/2023 15:24:27 - INFO - __main__ - Set 500 samples for 2-class 02/16/2023 15:24:27 - INFO - __main__ - Set 500 samples for 3-class Traceback (most recent call last): File "/home/jacob/code/university/uczenie_glebokie_w_przetwarzaniu_tekstu/run_glue.py", line 685, in <module> main() File "/home/jacob/code/university/uczenie_glebokie_w_przetwarzaniu_tekstu/run_glue.py", line 533, in main raise ValueError("--do_predict requires a test dataset") ValueError: --do_predict requires a test dataset
Evaluation
!python run_glue.py \
--cache_dir .cache_training \
--model_name_or_path out/t5 \
--custom_model t5_custom \
--train_file data/train.json \
--validation_file data/valid.json \
--test_file data/test.json \
--per_device_train_batch_size 8 \
--per_device_eval_batch_size 8 \
--do_eval \
--do_predict \
--max_seq_length 128 \
--learning_rate 2e-5 \
--max_eval_samples 2000 \
--max_steps 2500 \
--num_train_epochs 1 \
--save_strategy steps \
--save_steps 250 \
--save_total_limit 5 \
--logging_strategy steps \
--logging_steps 100 \
--eval_steps 250 \
--evaluation_strategy steps \
--metric_for_best_model accuracy \
--greater_is_better True \
--load_best_model_at_end True \
--output_dir out/t5_results
02/16/2023 16:52:57 - WARNING - __main__ - Process rank: -1, device: cuda:0, n_gpu: 1distributed training: False, 16-bits training: False 02/16/2023 16:52:57 - INFO - __main__ - Training/evaluation parameters TrainingArguments( _n_gpu=1, adafactor=False, adam_beta1=0.9, adam_beta2=0.999, adam_epsilon=1e-08, auto_find_batch_size=False, bf16=False, bf16_full_eval=False, data_seed=None, dataloader_drop_last=False, dataloader_num_workers=0, dataloader_pin_memory=True, ddp_bucket_cap_mb=None, ddp_find_unused_parameters=None, ddp_timeout=1800, debug=[], deepspeed=None, disable_tqdm=False, do_eval=True, do_predict=True, do_train=False, eval_accumulation_steps=None, eval_delay=0, eval_steps=250, evaluation_strategy=steps, fp16=False, fp16_backend=auto, fp16_full_eval=False, fp16_opt_level=O1, fsdp=[], fsdp_min_num_params=0, fsdp_transformer_layer_cls_to_wrap=None, full_determinism=False, gradient_accumulation_steps=1, gradient_checkpointing=False, greater_is_better=True, group_by_length=False, half_precision_backend=auto, hub_model_id=None, hub_private_repo=False, hub_strategy=every_save, hub_token=<HUB_TOKEN>, ignore_data_skip=False, include_inputs_for_metrics=False, jit_mode_eval=False, label_names=None, label_smoothing_factor=0.0, learning_rate=2e-05, length_column_name=length, load_best_model_at_end=True, local_rank=-1, log_level=passive, log_level_replica=passive, log_on_each_node=True, logging_dir=out/t5_results/runs/Feb16_16-52-56_DESKTOP-R7JO8BQ, logging_first_step=False, logging_nan_inf_filter=True, logging_steps=100, logging_strategy=steps, lr_scheduler_type=linear, max_grad_norm=1.0, max_steps=2500, metric_for_best_model=accuracy, mp_parameters=, no_cuda=False, num_train_epochs=1.0, optim=adamw_hf, optim_args=None, output_dir=out/t5_results, overwrite_output_dir=False, past_index=-1, per_device_eval_batch_size=8, per_device_train_batch_size=8, prediction_loss_only=False, push_to_hub=False, push_to_hub_model_id=None, push_to_hub_organization=None, push_to_hub_token=<PUSH_TO_HUB_TOKEN>, ray_scope=last, remove_unused_columns=True, report_to=[], resume_from_checkpoint=None, run_name=out/t5_results, save_on_each_node=False, save_steps=250, save_strategy=steps, save_total_limit=5, seed=42, sharded_ddp=[], skip_memory_metrics=True, tf32=None, torch_compile=False, torch_compile_backend=None, torch_compile_mode=None, torchdynamo=None, tpu_metrics_debug=False, tpu_num_cores=None, use_ipex=False, use_legacy_prediction_loop=False, use_mps_device=False, warmup_ratio=0.0, warmup_steps=0, weight_decay=0.0, xpu_backend=None, ) 02/16/2023 16:52:57 - INFO - __main__ - load a local file for train: data/train.json 02/16/2023 16:52:57 - INFO - __main__ - load a local file for validation: data/valid.json 02/16/2023 16:52:57 - INFO - __main__ - load a local file for test: data/test.json 02/16/2023 16:52:58 - WARNING - datasets.builder - Using custom data configuration default-f6e8039906850c57 02/16/2023 16:52:58 - INFO - datasets.info - Loading Dataset Infos from /home/jacob/anaconda3/envs/ugp/lib/python3.10/site-packages/datasets/packaged_modules/json 02/16/2023 16:52:58 - INFO - datasets.builder - Overwrite dataset info from restored data version. 
02/16/2023 16:52:58 - INFO - datasets.info - Loading Dataset info from .cache_training/json/default-f6e8039906850c57/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51 02/16/2023 16:52:58 - WARNING - datasets.builder - Found cached dataset json (/home/jacob/code/university/uczenie_glebokie_w_przetwarzaniu_tekstu/.cache_training/json/default-f6e8039906850c57/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51) 02/16/2023 16:52:58 - INFO - datasets.info - Loading Dataset info from /home/jacob/code/university/uczenie_glebokie_w_przetwarzaniu_tekstu/.cache_training/json/default-f6e8039906850c57/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51 100%|████████████████████████████████████████████| 3/3 [00:00<00:00, 769.41it/s] [INFO|configuration_utils.py:658] 2023-02-16 16:52:58,326 >> loading configuration file out/t5/config.json [INFO|configuration_utils.py:712] 2023-02-16 16:52:58,327 >> Model config T5Config { "_name_or_path": "out/t5", "architectures": [ "T5ForClassification" ], "d_ff": 3072, "d_kv": 64, "d_model": 768, "decoder_start_token_id": 0, "dense_act_fn": "relu", "dropout_rate": 0.1, "eos_token_id": 1, "feed_forward_proj": "relu", "id2label": { "0": 0, "1": 1, "2": 2, "3": 3 }, "initializer_factor": 1.0, "is_encoder_decoder": true, "is_gated_act": false, "label2id": { "0": 0, "1": 1, "2": 2, "3": 3 }, "layer_norm_epsilon": 1e-06, "model_type": "t5", "n_positions": 512, "num_decoder_layers": 12, "num_heads": 12, "num_layers": 12, "output_past": true, "pad_token_id": 0, "relative_attention_max_distance": 128, "relative_attention_num_buckets": 32, "task_specific_params": { "summarization": { "early_stopping": true, "length_penalty": 2.0, "max_length": 200, "min_length": 30, "no_repeat_ngram_size": 3, "num_beams": 4, "prefix": "summarize: " }, "translation_en_to_de": { "early_stopping": true, "max_length": 300, "num_beams": 4, "prefix": "translate English to German: " }, "translation_en_to_fr": { "early_stopping": true, "max_length": 300, "num_beams": 4, "prefix": "translate English to French: " }, "translation_en_to_ro": { "early_stopping": true, "max_length": 300, "num_beams": 4, "prefix": "translate English to Romanian: " } }, "torch_dtype": "float32", "transformers_version": "4.26.1", "use_cache": true, "use_hidden_states": false, "vocab_size": 32128 } [INFO|tokenization_utils_base.py:1800] 2023-02-16 16:52:58,328 >> loading file spiece.model [INFO|tokenization_utils_base.py:1800] 2023-02-16 16:52:58,328 >> loading file tokenizer.json [INFO|tokenization_utils_base.py:1800] 2023-02-16 16:52:58,328 >> loading file added_tokens.json [INFO|tokenization_utils_base.py:1800] 2023-02-16 16:52:58,328 >> loading file special_tokens_map.json [INFO|tokenization_utils_base.py:1800] 2023-02-16 16:52:58,328 >> loading file tokenizer_config.json 02/16/2023 16:52:58 - INFO - __main__ - Using hidden states in model: False -------------------------------------------------------- Using hidden: False 02/16/2023 16:52:58 - INFO - __main__ - Using implementation from class: T5ForClassification [INFO|modeling_utils.py:2272] 2023-02-16 16:52:58,375 >> loading weights file out/t5/pytorch_model.bin [INFO|modeling_utils.py:2857] 2023-02-16 16:53:00,690 >> All model checkpoint weights were used when initializing T5ForClassification. [INFO|modeling_utils.py:2865] 2023-02-16 16:53:00,690 >> All the weights of T5ForClassification were initialized from the model checkpoint at out/t5. 
If your task is similar to the task the model of the checkpoint was trained on, you can already use T5ForClassification for predictions without further training. T5ForClassification( (shared): Embedding(32128, 768) (encoder): T5Stack( (embed_tokens): Embedding(32128, 768) (block): ModuleList( (0): T5Block( (layer): ModuleList( (0): T5LayerSelfAttention( (SelfAttention): T5Attention( (q): Linear(in_features=768, out_features=768, bias=False) (k): Linear(in_features=768, out_features=768, bias=False) (v): Linear(in_features=768, out_features=768, bias=False) (o): Linear(in_features=768, out_features=768, bias=False) (relative_attention_bias): Embedding(32, 12) ) (layer_norm): T5LayerNorm() (dropout): Dropout(p=0.1, inplace=False) ) (1): T5LayerFF( (DenseReluDense): T5DenseActDense( (wi): Linear(in_features=768, out_features=3072, bias=False) (wo): Linear(in_features=3072, out_features=768, bias=False) (dropout): Dropout(p=0.1, inplace=False) (act): ReLU() ) (layer_norm): T5LayerNorm() (dropout): Dropout(p=0.1, inplace=False) ) ) ) (1): T5Block( (layer): ModuleList( (0): T5LayerSelfAttention( (SelfAttention): T5Attention( (q): Linear(in_features=768, out_features=768, bias=False) (k): Linear(in_features=768, out_features=768, bias=False) (v): Linear(in_features=768, out_features=768, bias=False) (o): Linear(in_features=768, out_features=768, bias=False) ) (layer_norm): T5LayerNorm() (dropout): Dropout(p=0.1, inplace=False) ) (1): T5LayerFF( (DenseReluDense): T5DenseActDense( (wi): Linear(in_features=768, out_features=3072, bias=False) (wo): Linear(in_features=3072, out_features=768, bias=False) (dropout): Dropout(p=0.1, inplace=False) (act): ReLU() ) (layer_norm): T5LayerNorm() (dropout): Dropout(p=0.1, inplace=False) ) ) ) (2): T5Block( (layer): ModuleList( (0): T5LayerSelfAttention( (SelfAttention): T5Attention( (q): Linear(in_features=768, out_features=768, bias=False) (k): Linear(in_features=768, out_features=768, bias=False) (v): Linear(in_features=768, out_features=768, bias=False) (o): Linear(in_features=768, out_features=768, bias=False) ) (layer_norm): T5LayerNorm() (dropout): Dropout(p=0.1, inplace=False) ) (1): T5LayerFF( (DenseReluDense): T5DenseActDense( (wi): Linear(in_features=768, out_features=3072, bias=False) (wo): Linear(in_features=3072, out_features=768, bias=False) (dropout): Dropout(p=0.1, inplace=False) (act): ReLU() ) (layer_norm): T5LayerNorm() (dropout): Dropout(p=0.1, inplace=False) ) ) ) (3): T5Block( (layer): ModuleList( (0): T5LayerSelfAttention( (SelfAttention): T5Attention( (q): Linear(in_features=768, out_features=768, bias=False) (k): Linear(in_features=768, out_features=768, bias=False) (v): Linear(in_features=768, out_features=768, bias=False) (o): Linear(in_features=768, out_features=768, bias=False) ) (layer_norm): T5LayerNorm() (dropout): Dropout(p=0.1, inplace=False) ) (1): T5LayerFF( (DenseReluDense): T5DenseActDense( (wi): Linear(in_features=768, out_features=3072, bias=False) (wo): Linear(in_features=3072, out_features=768, bias=False) (dropout): Dropout(p=0.1, inplace=False) (act): ReLU() ) (layer_norm): T5LayerNorm() (dropout): Dropout(p=0.1, inplace=False) ) ) ) (4): T5Block( (layer): ModuleList( (0): T5LayerSelfAttention( (SelfAttention): T5Attention( (q): Linear(in_features=768, out_features=768, bias=False) (k): Linear(in_features=768, out_features=768, bias=False) (v): Linear(in_features=768, out_features=768, bias=False) (o): Linear(in_features=768, out_features=768, bias=False) ) (layer_norm): T5LayerNorm() (dropout): Dropout(p=0.1, 
inplace=False) ) (1): T5LayerFF( (DenseReluDense): T5DenseActDense( (wi): Linear(in_features=768, out_features=3072, bias=False) (wo): Linear(in_features=3072, out_features=768, bias=False) (dropout): Dropout(p=0.1, inplace=False) (act): ReLU() ) (layer_norm): T5LayerNorm() (dropout): Dropout(p=0.1, inplace=False) ) ) ) (5): T5Block( (layer): ModuleList( (0): T5LayerSelfAttention( (SelfAttention): T5Attention( (q): Linear(in_features=768, out_features=768, bias=False) (k): Linear(in_features=768, out_features=768, bias=False) (v): Linear(in_features=768, out_features=768, bias=False) (o): Linear(in_features=768, out_features=768, bias=False) ) (layer_norm): T5LayerNorm() (dropout): Dropout(p=0.1, inplace=False) ) (1): T5LayerFF( (DenseReluDense): T5DenseActDense( (wi): Linear(in_features=768, out_features=3072, bias=False) (wo): Linear(in_features=3072, out_features=768, bias=False) (dropout): Dropout(p=0.1, inplace=False) (act): ReLU() ) (layer_norm): T5LayerNorm() (dropout): Dropout(p=0.1, inplace=False) ) ) ) (6): T5Block( (layer): ModuleList( (0): T5LayerSelfAttention( (SelfAttention): T5Attention( (q): Linear(in_features=768, out_features=768, bias=False) (k): Linear(in_features=768, out_features=768, bias=False) (v): Linear(in_features=768, out_features=768, bias=False) (o): Linear(in_features=768, out_features=768, bias=False) ) (layer_norm): T5LayerNorm() (dropout): Dropout(p=0.1, inplace=False) ) (1): T5LayerFF( (DenseReluDense): T5DenseActDense( (wi): Linear(in_features=768, out_features=3072, bias=False) (wo): Linear(in_features=3072, out_features=768, bias=False) (dropout): Dropout(p=0.1, inplace=False) (act): ReLU() ) (layer_norm): T5LayerNorm() (dropout): Dropout(p=0.1, inplace=False) ) ) ) (7): T5Block( (layer): ModuleList( (0): T5LayerSelfAttention( (SelfAttention): T5Attention( (q): Linear(in_features=768, out_features=768, bias=False) (k): Linear(in_features=768, out_features=768, bias=False) (v): Linear(in_features=768, out_features=768, bias=False) (o): Linear(in_features=768, out_features=768, bias=False) ) (layer_norm): T5LayerNorm() (dropout): Dropout(p=0.1, inplace=False) ) (1): T5LayerFF( (DenseReluDense): T5DenseActDense( (wi): Linear(in_features=768, out_features=3072, bias=False) (wo): Linear(in_features=3072, out_features=768, bias=False) (dropout): Dropout(p=0.1, inplace=False) (act): ReLU() ) (layer_norm): T5LayerNorm() (dropout): Dropout(p=0.1, inplace=False) ) ) ) (8): T5Block( (layer): ModuleList( (0): T5LayerSelfAttention( (SelfAttention): T5Attention( (q): Linear(in_features=768, out_features=768, bias=False) (k): Linear(in_features=768, out_features=768, bias=False) (v): Linear(in_features=768, out_features=768, bias=False) (o): Linear(in_features=768, out_features=768, bias=False) ) (layer_norm): T5LayerNorm() (dropout): Dropout(p=0.1, inplace=False) ) (1): T5LayerFF( (DenseReluDense): T5DenseActDense( (wi): Linear(in_features=768, out_features=3072, bias=False) (wo): Linear(in_features=3072, out_features=768, bias=False) (dropout): Dropout(p=0.1, inplace=False) (act): ReLU() ) (layer_norm): T5LayerNorm() (dropout): Dropout(p=0.1, inplace=False) ) ) ) (9): T5Block( (layer): ModuleList( (0): T5LayerSelfAttention( (SelfAttention): T5Attention( (q): Linear(in_features=768, out_features=768, bias=False) (k): Linear(in_features=768, out_features=768, bias=False) (v): Linear(in_features=768, out_features=768, bias=False) (o): Linear(in_features=768, out_features=768, bias=False) ) (layer_norm): T5LayerNorm() (dropout): Dropout(p=0.1, inplace=False) ) (1): 
T5LayerFF( (DenseReluDense): T5DenseActDense( (wi): Linear(in_features=768, out_features=3072, bias=False) (wo): Linear(in_features=3072, out_features=768, bias=False) (dropout): Dropout(p=0.1, inplace=False) (act): ReLU() ) (layer_norm): T5LayerNorm() (dropout): Dropout(p=0.1, inplace=False) ) ) ) (10): T5Block( (layer): ModuleList( (0): T5LayerSelfAttention( (SelfAttention): T5Attention( (q): Linear(in_features=768, out_features=768, bias=False) (k): Linear(in_features=768, out_features=768, bias=False) (v): Linear(in_features=768, out_features=768, bias=False) (o): Linear(in_features=768, out_features=768, bias=False) ) (layer_norm): T5LayerNorm() (dropout): Dropout(p=0.1, inplace=False) ) (1): T5LayerFF( (DenseReluDense): T5DenseActDense( (wi): Linear(in_features=768, out_features=3072, bias=False) (wo): Linear(in_features=3072, out_features=768, bias=False) (dropout): Dropout(p=0.1, inplace=False) (act): ReLU() ) (layer_norm): T5LayerNorm() (dropout): Dropout(p=0.1, inplace=False) ) ) ) (11): T5Block( (layer): ModuleList( (0): T5LayerSelfAttention( (SelfAttention): T5Attention( (q): Linear(in_features=768, out_features=768, bias=False) (k): Linear(in_features=768, out_features=768, bias=False) (v): Linear(in_features=768, out_features=768, bias=False) (o): Linear(in_features=768, out_features=768, bias=False) ) (layer_norm): T5LayerNorm() (dropout): Dropout(p=0.1, inplace=False) ) (1): T5LayerFF( (DenseReluDense): T5DenseActDense( (wi): Linear(in_features=768, out_features=3072, bias=False) (wo): Linear(in_features=3072, out_features=768, bias=False) (dropout): Dropout(p=0.1, inplace=False) (act): ReLU() ) (layer_norm): T5LayerNorm() (dropout): Dropout(p=0.1, inplace=False) ) ) ) ) (final_layer_norm): T5LayerNorm() (dropout): Dropout(p=0.1, inplace=False) ) (decoder): T5Stack( (embed_tokens): Embedding(32128, 768) (block): ModuleList( (0): T5Block( (layer): ModuleList( (0): T5LayerSelfAttention( (SelfAttention): T5Attention( (q): Linear(in_features=768, out_features=768, bias=False) (k): Linear(in_features=768, out_features=768, bias=False) (v): Linear(in_features=768, out_features=768, bias=False) (o): Linear(in_features=768, out_features=768, bias=False) (relative_attention_bias): Embedding(32, 12) ) (layer_norm): T5LayerNorm() (dropout): Dropout(p=0.1, inplace=False) ) (1): T5LayerCrossAttention( (EncDecAttention): T5Attention( (q): Linear(in_features=768, out_features=768, bias=False) (k): Linear(in_features=768, out_features=768, bias=False) (v): Linear(in_features=768, out_features=768, bias=False) (o): Linear(in_features=768, out_features=768, bias=False) ) (layer_norm): T5LayerNorm() (dropout): Dropout(p=0.1, inplace=False) ) (2): T5LayerFF( (DenseReluDense): T5DenseActDense( (wi): Linear(in_features=768, out_features=3072, bias=False) (wo): Linear(in_features=3072, out_features=768, bias=False) (dropout): Dropout(p=0.1, inplace=False) (act): ReLU() ) (layer_norm): T5LayerNorm() (dropout): Dropout(p=0.1, inplace=False) ) ) ) (1): T5Block( (layer): ModuleList( (0): T5LayerSelfAttention( (SelfAttention): T5Attention( (q): Linear(in_features=768, out_features=768, bias=False) (k): Linear(in_features=768, out_features=768, bias=False) (v): Linear(in_features=768, out_features=768, bias=False) (o): Linear(in_features=768, out_features=768, bias=False) ) (layer_norm): T5LayerNorm() (dropout): Dropout(p=0.1, inplace=False) ) (1): T5LayerCrossAttention( (EncDecAttention): T5Attention( (q): Linear(in_features=768, out_features=768, bias=False) (k): Linear(in_features=768, 
out_features=768, bias=False) (v): Linear(in_features=768, out_features=768, bias=False) (o): Linear(in_features=768, out_features=768, bias=False) ) (layer_norm): T5LayerNorm() (dropout): Dropout(p=0.1, inplace=False) ) (2): T5LayerFF( (DenseReluDense): T5DenseActDense( (wi): Linear(in_features=768, out_features=3072, bias=False) (wo): Linear(in_features=3072, out_features=768, bias=False) (dropout): Dropout(p=0.1, inplace=False) (act): ReLU() ) (layer_norm): T5LayerNorm() (dropout): Dropout(p=0.1, inplace=False) ) ) ) (2): T5Block( (layer): ModuleList( (0): T5LayerSelfAttention( (SelfAttention): T5Attention( (q): Linear(in_features=768, out_features=768, bias=False) (k): Linear(in_features=768, out_features=768, bias=False) (v): Linear(in_features=768, out_features=768, bias=False) (o): Linear(in_features=768, out_features=768, bias=False) ) (layer_norm): T5LayerNorm() (dropout): Dropout(p=0.1, inplace=False) ) (1): T5LayerCrossAttention( (EncDecAttention): T5Attention( (q): Linear(in_features=768, out_features=768, bias=False) (k): Linear(in_features=768, out_features=768, bias=False) (v): Linear(in_features=768, out_features=768, bias=False) (o): Linear(in_features=768, out_features=768, bias=False) ) (layer_norm): T5LayerNorm() (dropout): Dropout(p=0.1, inplace=False) ) (2): T5LayerFF( (DenseReluDense): T5DenseActDense( (wi): Linear(in_features=768, out_features=3072, bias=False) (wo): Linear(in_features=3072, out_features=768, bias=False) (dropout): Dropout(p=0.1, inplace=False) (act): ReLU() ) (layer_norm): T5LayerNorm() (dropout): Dropout(p=0.1, inplace=False) ) ) ) (3): T5Block( (layer): ModuleList( (0): T5LayerSelfAttention( (SelfAttention): T5Attention( (q): Linear(in_features=768, out_features=768, bias=False) (k): Linear(in_features=768, out_features=768, bias=False) (v): Linear(in_features=768, out_features=768, bias=False) (o): Linear(in_features=768, out_features=768, bias=False) ) (layer_norm): T5LayerNorm() (dropout): Dropout(p=0.1, inplace=False) ) (1): T5LayerCrossAttention( (EncDecAttention): T5Attention( (q): Linear(in_features=768, out_features=768, bias=False) (k): Linear(in_features=768, out_features=768, bias=False) (v): Linear(in_features=768, out_features=768, bias=False) (o): Linear(in_features=768, out_features=768, bias=False) ) (layer_norm): T5LayerNorm() (dropout): Dropout(p=0.1, inplace=False) ) (2): T5LayerFF( (DenseReluDense): T5DenseActDense( (wi): Linear(in_features=768, out_features=3072, bias=False) (wo): Linear(in_features=3072, out_features=768, bias=False) (dropout): Dropout(p=0.1, inplace=False) (act): ReLU() ) (layer_norm): T5LayerNorm() (dropout): Dropout(p=0.1, inplace=False) ) ) ) (4): T5Block( (layer): ModuleList( (0): T5LayerSelfAttention( (SelfAttention): T5Attention( (q): Linear(in_features=768, out_features=768, bias=False) (k): Linear(in_features=768, out_features=768, bias=False) (v): Linear(in_features=768, out_features=768, bias=False) (o): Linear(in_features=768, out_features=768, bias=False) ) (layer_norm): T5LayerNorm() (dropout): Dropout(p=0.1, inplace=False) ) (1): T5LayerCrossAttention( (EncDecAttention): T5Attention( (q): Linear(in_features=768, out_features=768, bias=False) (k): Linear(in_features=768, out_features=768, bias=False) (v): Linear(in_features=768, out_features=768, bias=False) (o): Linear(in_features=768, out_features=768, bias=False) ) (layer_norm): T5LayerNorm() (dropout): Dropout(p=0.1, inplace=False) ) (2): T5LayerFF( (DenseReluDense): T5DenseActDense( (wi): Linear(in_features=768, out_features=3072, 
bias=False) (wo): Linear(in_features=3072, out_features=768, bias=False) (dropout): Dropout(p=0.1, inplace=False) (act): ReLU() ) (layer_norm): T5LayerNorm() (dropout): Dropout(p=0.1, inplace=False) ) ) ) (5): T5Block( (layer): ModuleList( (0): T5LayerSelfAttention( (SelfAttention): T5Attention( (q): Linear(in_features=768, out_features=768, bias=False) (k): Linear(in_features=768, out_features=768, bias=False) (v): Linear(in_features=768, out_features=768, bias=False) (o): Linear(in_features=768, out_features=768, bias=False) ) (layer_norm): T5LayerNorm() (dropout): Dropout(p=0.1, inplace=False) ) (1): T5LayerCrossAttention( (EncDecAttention): T5Attention( (q): Linear(in_features=768, out_features=768, bias=False) (k): Linear(in_features=768, out_features=768, bias=False) (v): Linear(in_features=768, out_features=768, bias=False) (o): Linear(in_features=768, out_features=768, bias=False) ) (layer_norm): T5LayerNorm() (dropout): Dropout(p=0.1, inplace=False) ) (2): T5LayerFF( (DenseReluDense): T5DenseActDense( (wi): Linear(in_features=768, out_features=3072, bias=False) (wo): Linear(in_features=3072, out_features=768, bias=False) (dropout): Dropout(p=0.1, inplace=False) (act): ReLU() ) (layer_norm): T5LayerNorm() (dropout): Dropout(p=0.1, inplace=False) ) ) ) (6): T5Block( (layer): ModuleList( (0): T5LayerSelfAttention( (SelfAttention): T5Attention( (q): Linear(in_features=768, out_features=768, bias=False) (k): Linear(in_features=768, out_features=768, bias=False) (v): Linear(in_features=768, out_features=768, bias=False) (o): Linear(in_features=768, out_features=768, bias=False) ) (layer_norm): T5LayerNorm() (dropout): Dropout(p=0.1, inplace=False) ) (1): T5LayerCrossAttention( (EncDecAttention): T5Attention( (q): Linear(in_features=768, out_features=768, bias=False) (k): Linear(in_features=768, out_features=768, bias=False) (v): Linear(in_features=768, out_features=768, bias=False) (o): Linear(in_features=768, out_features=768, bias=False) ) (layer_norm): T5LayerNorm() (dropout): Dropout(p=0.1, inplace=False) ) (2): T5LayerFF( (DenseReluDense): T5DenseActDense( (wi): Linear(in_features=768, out_features=3072, bias=False) (wo): Linear(in_features=3072, out_features=768, bias=False) (dropout): Dropout(p=0.1, inplace=False) (act): ReLU() ) (layer_norm): T5LayerNorm() (dropout): Dropout(p=0.1, inplace=False) ) ) ) (7): T5Block( (layer): ModuleList( (0): T5LayerSelfAttention( (SelfAttention): T5Attention( (q): Linear(in_features=768, out_features=768, bias=False) (k): Linear(in_features=768, out_features=768, bias=False) (v): Linear(in_features=768, out_features=768, bias=False) (o): Linear(in_features=768, out_features=768, bias=False) ) (layer_norm): T5LayerNorm() (dropout): Dropout(p=0.1, inplace=False) ) (1): T5LayerCrossAttention( (EncDecAttention): T5Attention( (q): Linear(in_features=768, out_features=768, bias=False) (k): Linear(in_features=768, out_features=768, bias=False) (v): Linear(in_features=768, out_features=768, bias=False) (o): Linear(in_features=768, out_features=768, bias=False) ) (layer_norm): T5LayerNorm() (dropout): Dropout(p=0.1, inplace=False) ) (2): T5LayerFF( (DenseReluDense): T5DenseActDense( (wi): Linear(in_features=768, out_features=3072, bias=False) (wo): Linear(in_features=3072, out_features=768, bias=False) (dropout): Dropout(p=0.1, inplace=False) (act): ReLU() ) (layer_norm): T5LayerNorm() (dropout): Dropout(p=0.1, inplace=False) ) ) ) (8): T5Block( (layer): ModuleList( (0): T5LayerSelfAttention( (SelfAttention): T5Attention( (q): Linear(in_features=768, 
out_features=768, bias=False) (k): Linear(in_features=768, out_features=768, bias=False) (v): Linear(in_features=768, out_features=768, bias=False) (o): Linear(in_features=768, out_features=768, bias=False) ) (layer_norm): T5LayerNorm() (dropout): Dropout(p=0.1, inplace=False) ) (1): T5LayerCrossAttention( (EncDecAttention): T5Attention( (q): Linear(in_features=768, out_features=768, bias=False) (k): Linear(in_features=768, out_features=768, bias=False) (v): Linear(in_features=768, out_features=768, bias=False) (o): Linear(in_features=768, out_features=768, bias=False) ) (layer_norm): T5LayerNorm() (dropout): Dropout(p=0.1, inplace=False) ) (2): T5LayerFF( (DenseReluDense): T5DenseActDense( (wi): Linear(in_features=768, out_features=3072, bias=False) (wo): Linear(in_features=3072, out_features=768, bias=False) (dropout): Dropout(p=0.1, inplace=False) (act): ReLU() ) (layer_norm): T5LayerNorm() (dropout): Dropout(p=0.1, inplace=False) ) ) ) (9): T5Block( (layer): ModuleList( (0): T5LayerSelfAttention( (SelfAttention): T5Attention( (q): Linear(in_features=768, out_features=768, bias=False) (k): Linear(in_features=768, out_features=768, bias=False) (v): Linear(in_features=768, out_features=768, bias=False) (o): Linear(in_features=768, out_features=768, bias=False) ) (layer_norm): T5LayerNorm() (dropout): Dropout(p=0.1, inplace=False) ) (1): T5LayerCrossAttention( (EncDecAttention): T5Attention( (q): Linear(in_features=768, out_features=768, bias=False) (k): Linear(in_features=768, out_features=768, bias=False) (v): Linear(in_features=768, out_features=768, bias=False) (o): Linear(in_features=768, out_features=768, bias=False) ) (layer_norm): T5LayerNorm() (dropout): Dropout(p=0.1, inplace=False) ) (2): T5LayerFF( (DenseReluDense): T5DenseActDense( (wi): Linear(in_features=768, out_features=3072, bias=False) (wo): Linear(in_features=3072, out_features=768, bias=False) (dropout): Dropout(p=0.1, inplace=False) (act): ReLU() ) (layer_norm): T5LayerNorm() (dropout): Dropout(p=0.1, inplace=False) ) ) ) (10): T5Block( (layer): ModuleList( (0): T5LayerSelfAttention( (SelfAttention): T5Attention( (q): Linear(in_features=768, out_features=768, bias=False) (k): Linear(in_features=768, out_features=768, bias=False) (v): Linear(in_features=768, out_features=768, bias=False) (o): Linear(in_features=768, out_features=768, bias=False) ) (layer_norm): T5LayerNorm() (dropout): Dropout(p=0.1, inplace=False) ) (1): T5LayerCrossAttention( (EncDecAttention): T5Attention( (q): Linear(in_features=768, out_features=768, bias=False) (k): Linear(in_features=768, out_features=768, bias=False) (v): Linear(in_features=768, out_features=768, bias=False) (o): Linear(in_features=768, out_features=768, bias=False) ) (layer_norm): T5LayerNorm() (dropout): Dropout(p=0.1, inplace=False) ) (2): T5LayerFF( (DenseReluDense): T5DenseActDense( (wi): Linear(in_features=768, out_features=3072, bias=False) (wo): Linear(in_features=3072, out_features=768, bias=False) (dropout): Dropout(p=0.1, inplace=False) (act): ReLU() ) (layer_norm): T5LayerNorm() (dropout): Dropout(p=0.1, inplace=False) ) ) ) (11): T5Block( (layer): ModuleList( (0): T5LayerSelfAttention( (SelfAttention): T5Attention( (q): Linear(in_features=768, out_features=768, bias=False) (k): Linear(in_features=768, out_features=768, bias=False) (v): Linear(in_features=768, out_features=768, bias=False) (o): Linear(in_features=768, out_features=768, bias=False) ) (layer_norm): T5LayerNorm() (dropout): Dropout(p=0.1, inplace=False) ) (1): T5LayerCrossAttention( (EncDecAttention): 
T5Attention( (q): Linear(in_features=768, out_features=768, bias=False) (k): Linear(in_features=768, out_features=768, bias=False) (v): Linear(in_features=768, out_features=768, bias=False) (o): Linear(in_features=768, out_features=768, bias=False) ) (layer_norm): T5LayerNorm() (dropout): Dropout(p=0.1, inplace=False) ) (2): T5LayerFF( (DenseReluDense): T5DenseActDense( (wi): Linear(in_features=768, out_features=3072, bias=False) (wo): Linear(in_features=3072, out_features=768, bias=False) (dropout): Dropout(p=0.1, inplace=False) (act): ReLU() ) (layer_norm): T5LayerNorm() (dropout): Dropout(p=0.1, inplace=False) ) ) ) ) (final_layer_norm): T5LayerNorm() (dropout): Dropout(p=0.1, inplace=False) ) (lm_head): T5ClassificationHead( (dense_in): Linear(in_features=768, out_features=768, bias=True) (dense): Linear(in_features=768, out_features=768, bias=True) (dense_out): Linear(in_features=768, out_features=4, bias=True) (dropout): Dropout(p=0.1, inplace=False) ) ) Running tokenizer on dataset: 0%| | 0/120 [00:00<?, ?ba/s]02/16/2023 16:53:00 - INFO - datasets.arrow_dataset - Caching processed dataset at /home/jacob/code/university/uczenie_glebokie_w_przetwarzaniu_tekstu/.cache_training/json/default-f6e8039906850c57/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51/cache-461127b59c7ea04e.arrow Running tokenizer on dataset: 100%|███████████| 120/120 [00:08<00:00, 14.36ba/s] Running tokenizer on dataset: 0%| | 0/4 [00:00<?, ?ba/s]02/16/2023 16:53:09 - INFO - datasets.arrow_dataset - Caching processed dataset at /home/jacob/code/university/uczenie_glebokie_w_przetwarzaniu_tekstu/.cache_training/json/default-f6e8039906850c57/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51/cache-bbee377e7bea95e7.arrow Running tokenizer on dataset: 100%|███████████████| 4/4 [00:00<00:00, 15.94ba/s] Running tokenizer on dataset: 0%| | 0/4 [00:00<?, ?ba/s]02/16/2023 16:53:09 - INFO - datasets.arrow_dataset - Caching processed dataset at /home/jacob/code/university/uczenie_glebokie_w_przetwarzaniu_tekstu/.cache_training/json/default-f6e8039906850c57/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51/cache-4e0cbdadca2e6dc6.arrow Running tokenizer on dataset: 100%|███████████████| 4/4 [00:00<00:00, 16.87ba/s] 02/16/2023 16:53:10 - INFO - __main__ - Set 500 samples for 0-class 02/16/2023 16:53:10 - INFO - __main__ - Set 500 samples for 1-class 02/16/2023 16:53:10 - INFO - __main__ - Set 500 samples for 2-class 02/16/2023 16:53:10 - INFO - __main__ - Set 500 samples for 3-class [INFO|trainer.py:511] 2023-02-16 16:53:12,738 >> max_steps is given, it will override any value given in num_train_epochs 02/16/2023 16:53:12 - INFO - __main__ - *** Evaluate *** [INFO|trainer.py:710] 2023-02-16 16:53:12,739 >> The following columns in the evaluation set don't have a corresponding argument in `T5ForClassification.forward` and have been ignored: text. If text are not expected by `T5ForClassification.forward`, you can safely ignore this message. 
[INFO|trainer.py:2964] 2023-02-16 16:53:12,740 >> ***** Running Evaluation ***** [INFO|trainer.py:2966] 2023-02-16 16:53:12,740 >> Num examples = 2000 [INFO|trainer.py:2969] 2023-02-16 16:53:12,740 >> Batch size = 8 100%|█████████████████████████████████████████| 250/250 [00:39<00:00, 6.26it/s] ***** eval metrics ***** eval_accuracy = 0.4675 eval_loss = 1.2139 eval_runtime = 0:00:40.56 eval_samples = 2000 eval_samples_per_second = 49.303 eval_steps_per_second = 6.163 02/16/2023 16:53:53 - INFO - __main__ - *** Predict *** [INFO|trainer.py:710] 2023-02-16 16:53:53,307 >> The following columns in the test set don't have a corresponding argument in `T5ForClassification.forward` and have been ignored: text. If text are not expected by `T5ForClassification.forward`, you can safely ignore this message. [INFO|trainer.py:2964] 2023-02-16 16:53:53,308 >> ***** Running Prediction ***** [INFO|trainer.py:2966] 2023-02-16 16:53:53,308 >> Num examples = 3800 [INFO|trainer.py:2969] 2023-02-16 16:53:53,308 >> Batch size = 8 100%|█████████████████████████████████████████| 475/475 [01:15<00:00, 6.32it/s] 02/16/2023 16:55:08 - INFO - __main__ - ***** Predict results None ***** [INFO|modelcard.py:449] 2023-02-16 16:55:09,179 >> Dropping the following result as it does not have all the necessary fields: {'task': {'name': 'Text Classification', 'type': 'text-classification'}}
Result
!cat out/t5_results/eval_results.json | jq .eval_accuracy
0.4675000011920929
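The other metrics logged above (eval_loss, runtime, samples per second) should land in the same file; a minimal sketch for reading them from Python, assuming run_glue.py saved them alongside eval_accuracy in out/t5_results/eval_results.json:
import json

# Read the metrics saved by run_glue.py (path taken from --output_dir above;
# assumes the other logged metrics sit next to eval_accuracy in the same file).
with open("out/t5_results/eval_results.json") as f:
    eval_results = json.load(f)

print(eval_results["eval_accuracy"])
print(eval_results.get("eval_loss"))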
Flan-T5 - Zero shot
Code
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, pipeline
from datasets import load_dataset
from tqdm.notebook import tqdm
model = AutoModelForSeq2SeqLM.from_pretrained("google/flan-t5-base")
tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-base")
pipeline = pipeline("text2text-generation", model=model, tokenizer=tokenizer)
Downloading (…)lve/main/config.json: 0%| | 0.00/1.40k [00:00<?, ?B/s]
Downloading (…)"pytorch_model.bin";: 0%| | 0.00/990M [00:00<?, ?B/s]
Downloading (…)neration_config.json: 0%| | 0.00/147 [00:00<?, ?B/s]
Downloading (…)okenizer_config.json: 0%| | 0.00/2.54k [00:00<?, ?B/s]
Downloading (…)"spiece.model";: 0%| | 0.00/792k [00:00<?, ?B/s]
Downloading (…)/main/tokenizer.json: 0%| | 0.00/2.42M [00:00<?, ?B/s]
Downloading (…)cial_tokens_map.json: 0%| | 0.00/2.20k [00:00<?, ?B/s]
MAP_LABEL_TRANSLATION = {
0: 'world',
1: 'sport',
2: 'business',
3: 'scitech'
}
dataset = load_dataset("json", data_files={'test': 'data/test.json'})
dataset['test'] = dataset['test'].map(lambda x: { 'label': MAP_LABEL_TRANSLATION[x['label']], 'text': x['text']})
Using custom data configuration default-20e4aa4ef5e587fb Found cached dataset json (/home/jacob/.cache/huggingface/datasets/json/default-20e4aa4ef5e587fb/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51)
0%| | 0/1 [00:00<?, ?it/s]
Loading cached processed dataset at /home/jacob/.cache/huggingface/datasets/json/default-20e4aa4ef5e587fb/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51/cache-6a7c4b64ea03ea9d.arrow
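Before scoring the whole test set, the prompt format can be sanity-checked on a single made-up sentence (illustrative only, not taken from the dataset):
# Quick sanity check of the zero-shot prompt; the sentence below is invented for illustration.
example_text = "The champions lifted the trophy after a dramatic penalty shootout."
example_prompt = f"classify with possible labels: world, sport, business, scitech\ntext: {example_text}"
print(pipeline(example_prompt, do_sample=False)[0]['generated_text'])
# should print one of the four label words, e.g. "sport" (not verified here)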
Model
model
T5ForConditionalGeneration( (shared): Embedding(32128, 768) (encoder): T5Stack( (embed_tokens): Embedding(32128, 768) (block): ModuleList( (0): T5Block( (layer): ModuleList( (0): T5LayerSelfAttention( (SelfAttention): T5Attention( (q): Linear(in_features=768, out_features=768, bias=False) (k): Linear(in_features=768, out_features=768, bias=False) (v): Linear(in_features=768, out_features=768, bias=False) (o): Linear(in_features=768, out_features=768, bias=False) (relative_attention_bias): Embedding(32, 12) ) (layer_norm): T5LayerNorm() (dropout): Dropout(p=0.1, inplace=False) ) (1): T5LayerFF( (DenseReluDense): T5DenseGatedActDense( (wi_0): Linear(in_features=768, out_features=2048, bias=False) (wi_1): Linear(in_features=768, out_features=2048, bias=False) (wo): Linear(in_features=2048, out_features=768, bias=False) (dropout): Dropout(p=0.1, inplace=False) (act): NewGELUActivation() ) (layer_norm): T5LayerNorm() (dropout): Dropout(p=0.1, inplace=False) ) ) ) (1): T5Block( (layer): ModuleList( (0): T5LayerSelfAttention( (SelfAttention): T5Attention( (q): Linear(in_features=768, out_features=768, bias=False) (k): Linear(in_features=768, out_features=768, bias=False) (v): Linear(in_features=768, out_features=768, bias=False) (o): Linear(in_features=768, out_features=768, bias=False) ) (layer_norm): T5LayerNorm() (dropout): Dropout(p=0.1, inplace=False) ) (1): T5LayerFF( (DenseReluDense): T5DenseGatedActDense( (wi_0): Linear(in_features=768, out_features=2048, bias=False) (wi_1): Linear(in_features=768, out_features=2048, bias=False) (wo): Linear(in_features=2048, out_features=768, bias=False) (dropout): Dropout(p=0.1, inplace=False) (act): NewGELUActivation() ) (layer_norm): T5LayerNorm() (dropout): Dropout(p=0.1, inplace=False) ) ) ) (2): T5Block( (layer): ModuleList( (0): T5LayerSelfAttention( (SelfAttention): T5Attention( (q): Linear(in_features=768, out_features=768, bias=False) (k): Linear(in_features=768, out_features=768, bias=False) (v): Linear(in_features=768, out_features=768, bias=False) (o): Linear(in_features=768, out_features=768, bias=False) ) (layer_norm): T5LayerNorm() (dropout): Dropout(p=0.1, inplace=False) ) (1): T5LayerFF( (DenseReluDense): T5DenseGatedActDense( (wi_0): Linear(in_features=768, out_features=2048, bias=False) (wi_1): Linear(in_features=768, out_features=2048, bias=False) (wo): Linear(in_features=2048, out_features=768, bias=False) (dropout): Dropout(p=0.1, inplace=False) (act): NewGELUActivation() ) (layer_norm): T5LayerNorm() (dropout): Dropout(p=0.1, inplace=False) ) ) ) (3): T5Block( (layer): ModuleList( (0): T5LayerSelfAttention( (SelfAttention): T5Attention( (q): Linear(in_features=768, out_features=768, bias=False) (k): Linear(in_features=768, out_features=768, bias=False) (v): Linear(in_features=768, out_features=768, bias=False) (o): Linear(in_features=768, out_features=768, bias=False) ) (layer_norm): T5LayerNorm() (dropout): Dropout(p=0.1, inplace=False) ) (1): T5LayerFF( (DenseReluDense): T5DenseGatedActDense( (wi_0): Linear(in_features=768, out_features=2048, bias=False) (wi_1): Linear(in_features=768, out_features=2048, bias=False) (wo): Linear(in_features=2048, out_features=768, bias=False) (dropout): Dropout(p=0.1, inplace=False) (act): NewGELUActivation() ) (layer_norm): T5LayerNorm() (dropout): Dropout(p=0.1, inplace=False) ) ) ) (4): T5Block( (layer): ModuleList( (0): T5LayerSelfAttention( (SelfAttention): T5Attention( (q): Linear(in_features=768, out_features=768, bias=False) (k): Linear(in_features=768, out_features=768, bias=False) (v): 
Linear(in_features=768, out_features=768, bias=False) (o): Linear(in_features=768, out_features=768, bias=False) ) (layer_norm): T5LayerNorm() (dropout): Dropout(p=0.1, inplace=False) ) (1): T5LayerFF( (DenseReluDense): T5DenseGatedActDense( (wi_0): Linear(in_features=768, out_features=2048, bias=False) (wi_1): Linear(in_features=768, out_features=2048, bias=False) (wo): Linear(in_features=2048, out_features=768, bias=False) (dropout): Dropout(p=0.1, inplace=False) (act): NewGELUActivation() ) (layer_norm): T5LayerNorm() (dropout): Dropout(p=0.1, inplace=False) ) ) ) (5): T5Block( (layer): ModuleList( (0): T5LayerSelfAttention( (SelfAttention): T5Attention( (q): Linear(in_features=768, out_features=768, bias=False) (k): Linear(in_features=768, out_features=768, bias=False) (v): Linear(in_features=768, out_features=768, bias=False) (o): Linear(in_features=768, out_features=768, bias=False) ) (layer_norm): T5LayerNorm() (dropout): Dropout(p=0.1, inplace=False) ) (1): T5LayerFF( (DenseReluDense): T5DenseGatedActDense( (wi_0): Linear(in_features=768, out_features=2048, bias=False) (wi_1): Linear(in_features=768, out_features=2048, bias=False) (wo): Linear(in_features=2048, out_features=768, bias=False) (dropout): Dropout(p=0.1, inplace=False) (act): NewGELUActivation() ) (layer_norm): T5LayerNorm() (dropout): Dropout(p=0.1, inplace=False) ) ) ) (6): T5Block( (layer): ModuleList( (0): T5LayerSelfAttention( (SelfAttention): T5Attention( (q): Linear(in_features=768, out_features=768, bias=False) (k): Linear(in_features=768, out_features=768, bias=False) (v): Linear(in_features=768, out_features=768, bias=False) (o): Linear(in_features=768, out_features=768, bias=False) ) (layer_norm): T5LayerNorm() (dropout): Dropout(p=0.1, inplace=False) ) (1): T5LayerFF( (DenseReluDense): T5DenseGatedActDense( (wi_0): Linear(in_features=768, out_features=2048, bias=False) (wi_1): Linear(in_features=768, out_features=2048, bias=False) (wo): Linear(in_features=2048, out_features=768, bias=False) (dropout): Dropout(p=0.1, inplace=False) (act): NewGELUActivation() ) (layer_norm): T5LayerNorm() (dropout): Dropout(p=0.1, inplace=False) ) ) ) (7): T5Block( (layer): ModuleList( (0): T5LayerSelfAttention( (SelfAttention): T5Attention( (q): Linear(in_features=768, out_features=768, bias=False) (k): Linear(in_features=768, out_features=768, bias=False) (v): Linear(in_features=768, out_features=768, bias=False) (o): Linear(in_features=768, out_features=768, bias=False) ) (layer_norm): T5LayerNorm() (dropout): Dropout(p=0.1, inplace=False) ) (1): T5LayerFF( (DenseReluDense): T5DenseGatedActDense( (wi_0): Linear(in_features=768, out_features=2048, bias=False) (wi_1): Linear(in_features=768, out_features=2048, bias=False) (wo): Linear(in_features=2048, out_features=768, bias=False) (dropout): Dropout(p=0.1, inplace=False) (act): NewGELUActivation() ) (layer_norm): T5LayerNorm() (dropout): Dropout(p=0.1, inplace=False) ) ) ) (8): T5Block( (layer): ModuleList( (0): T5LayerSelfAttention( (SelfAttention): T5Attention( (q): Linear(in_features=768, out_features=768, bias=False) (k): Linear(in_features=768, out_features=768, bias=False) (v): Linear(in_features=768, out_features=768, bias=False) (o): Linear(in_features=768, out_features=768, bias=False) ) (layer_norm): T5LayerNorm() (dropout): Dropout(p=0.1, inplace=False) ) (1): T5LayerFF( (DenseReluDense): T5DenseGatedActDense( (wi_0): Linear(in_features=768, out_features=2048, bias=False) (wi_1): Linear(in_features=768, out_features=2048, bias=False) (wo): Linear(in_features=2048, 
out_features=768, bias=False) (dropout): Dropout(p=0.1, inplace=False) (act): NewGELUActivation() ) (layer_norm): T5LayerNorm() (dropout): Dropout(p=0.1, inplace=False) ) ) ) (9): T5Block( (layer): ModuleList( (0): T5LayerSelfAttention( (SelfAttention): T5Attention( (q): Linear(in_features=768, out_features=768, bias=False) (k): Linear(in_features=768, out_features=768, bias=False) (v): Linear(in_features=768, out_features=768, bias=False) (o): Linear(in_features=768, out_features=768, bias=False) ) (layer_norm): T5LayerNorm() (dropout): Dropout(p=0.1, inplace=False) ) (1): T5LayerFF( (DenseReluDense): T5DenseGatedActDense( (wi_0): Linear(in_features=768, out_features=2048, bias=False) (wi_1): Linear(in_features=768, out_features=2048, bias=False) (wo): Linear(in_features=2048, out_features=768, bias=False) (dropout): Dropout(p=0.1, inplace=False) (act): NewGELUActivation() ) (layer_norm): T5LayerNorm() (dropout): Dropout(p=0.1, inplace=False) ) ) ) (10): T5Block( (layer): ModuleList( (0): T5LayerSelfAttention( (SelfAttention): T5Attention( (q): Linear(in_features=768, out_features=768, bias=False) (k): Linear(in_features=768, out_features=768, bias=False) (v): Linear(in_features=768, out_features=768, bias=False) (o): Linear(in_features=768, out_features=768, bias=False) ) (layer_norm): T5LayerNorm() (dropout): Dropout(p=0.1, inplace=False) ) (1): T5LayerFF( (DenseReluDense): T5DenseGatedActDense( (wi_0): Linear(in_features=768, out_features=2048, bias=False) (wi_1): Linear(in_features=768, out_features=2048, bias=False) (wo): Linear(in_features=2048, out_features=768, bias=False) (dropout): Dropout(p=0.1, inplace=False) (act): NewGELUActivation() ) (layer_norm): T5LayerNorm() (dropout): Dropout(p=0.1, inplace=False) ) ) ) (11): T5Block( (layer): ModuleList( (0): T5LayerSelfAttention( (SelfAttention): T5Attention( (q): Linear(in_features=768, out_features=768, bias=False) (k): Linear(in_features=768, out_features=768, bias=False) (v): Linear(in_features=768, out_features=768, bias=False) (o): Linear(in_features=768, out_features=768, bias=False) ) (layer_norm): T5LayerNorm() (dropout): Dropout(p=0.1, inplace=False) ) (1): T5LayerFF( (DenseReluDense): T5DenseGatedActDense( (wi_0): Linear(in_features=768, out_features=2048, bias=False) (wi_1): Linear(in_features=768, out_features=2048, bias=False) (wo): Linear(in_features=2048, out_features=768, bias=False) (dropout): Dropout(p=0.1, inplace=False) (act): NewGELUActivation() ) (layer_norm): T5LayerNorm() (dropout): Dropout(p=0.1, inplace=False) ) ) ) ) (final_layer_norm): T5LayerNorm() (dropout): Dropout(p=0.1, inplace=False) ) (decoder): T5Stack( (embed_tokens): Embedding(32128, 768) (block): ModuleList( (0): T5Block( (layer): ModuleList( (0): T5LayerSelfAttention( (SelfAttention): T5Attention( (q): Linear(in_features=768, out_features=768, bias=False) (k): Linear(in_features=768, out_features=768, bias=False) (v): Linear(in_features=768, out_features=768, bias=False) (o): Linear(in_features=768, out_features=768, bias=False) (relative_attention_bias): Embedding(32, 12) ) (layer_norm): T5LayerNorm() (dropout): Dropout(p=0.1, inplace=False) ) (1): T5LayerCrossAttention( (EncDecAttention): T5Attention( (q): Linear(in_features=768, out_features=768, bias=False) (k): Linear(in_features=768, out_features=768, bias=False) (v): Linear(in_features=768, out_features=768, bias=False) (o): Linear(in_features=768, out_features=768, bias=False) ) (layer_norm): T5LayerNorm() (dropout): Dropout(p=0.1, inplace=False) ) (2): T5LayerFF( (DenseReluDense): 
T5DenseGatedActDense( (wi_0): Linear(in_features=768, out_features=2048, bias=False) (wi_1): Linear(in_features=768, out_features=2048, bias=False) (wo): Linear(in_features=2048, out_features=768, bias=False) (dropout): Dropout(p=0.1, inplace=False) (act): NewGELUActivation() ) (layer_norm): T5LayerNorm() (dropout): Dropout(p=0.1, inplace=False) ) ) ) (1): T5Block( (layer): ModuleList( (0): T5LayerSelfAttention( (SelfAttention): T5Attention( (q): Linear(in_features=768, out_features=768, bias=False) (k): Linear(in_features=768, out_features=768, bias=False) (v): Linear(in_features=768, out_features=768, bias=False) (o): Linear(in_features=768, out_features=768, bias=False) ) (layer_norm): T5LayerNorm() (dropout): Dropout(p=0.1, inplace=False) ) (1): T5LayerCrossAttention( (EncDecAttention): T5Attention( (q): Linear(in_features=768, out_features=768, bias=False) (k): Linear(in_features=768, out_features=768, bias=False) (v): Linear(in_features=768, out_features=768, bias=False) (o): Linear(in_features=768, out_features=768, bias=False) ) (layer_norm): T5LayerNorm() (dropout): Dropout(p=0.1, inplace=False) ) (2): T5LayerFF( (DenseReluDense): T5DenseGatedActDense( (wi_0): Linear(in_features=768, out_features=2048, bias=False) (wi_1): Linear(in_features=768, out_features=2048, bias=False) (wo): Linear(in_features=2048, out_features=768, bias=False) (dropout): Dropout(p=0.1, inplace=False) (act): NewGELUActivation() ) (layer_norm): T5LayerNorm() (dropout): Dropout(p=0.1, inplace=False) ) ) ) (2): T5Block( (layer): ModuleList( (0): T5LayerSelfAttention( (SelfAttention): T5Attention( (q): Linear(in_features=768, out_features=768, bias=False) (k): Linear(in_features=768, out_features=768, bias=False) (v): Linear(in_features=768, out_features=768, bias=False) (o): Linear(in_features=768, out_features=768, bias=False) ) (layer_norm): T5LayerNorm() (dropout): Dropout(p=0.1, inplace=False) ) (1): T5LayerCrossAttention( (EncDecAttention): T5Attention( (q): Linear(in_features=768, out_features=768, bias=False) (k): Linear(in_features=768, out_features=768, bias=False) (v): Linear(in_features=768, out_features=768, bias=False) (o): Linear(in_features=768, out_features=768, bias=False) ) (layer_norm): T5LayerNorm() (dropout): Dropout(p=0.1, inplace=False) ) (2): T5LayerFF( (DenseReluDense): T5DenseGatedActDense( (wi_0): Linear(in_features=768, out_features=2048, bias=False) (wi_1): Linear(in_features=768, out_features=2048, bias=False) (wo): Linear(in_features=2048, out_features=768, bias=False) (dropout): Dropout(p=0.1, inplace=False) (act): NewGELUActivation() ) (layer_norm): T5LayerNorm() (dropout): Dropout(p=0.1, inplace=False) ) ) ) (3): T5Block( (layer): ModuleList( (0): T5LayerSelfAttention( (SelfAttention): T5Attention( (q): Linear(in_features=768, out_features=768, bias=False) (k): Linear(in_features=768, out_features=768, bias=False) (v): Linear(in_features=768, out_features=768, bias=False) (o): Linear(in_features=768, out_features=768, bias=False) ) (layer_norm): T5LayerNorm() (dropout): Dropout(p=0.1, inplace=False) ) (1): T5LayerCrossAttention( (EncDecAttention): T5Attention( (q): Linear(in_features=768, out_features=768, bias=False) (k): Linear(in_features=768, out_features=768, bias=False) (v): Linear(in_features=768, out_features=768, bias=False) (o): Linear(in_features=768, out_features=768, bias=False) ) (layer_norm): T5LayerNorm() (dropout): Dropout(p=0.1, inplace=False) ) (2): T5LayerFF( (DenseReluDense): T5DenseGatedActDense( (wi_0): Linear(in_features=768, out_features=2048, 
bias=False) (wi_1): Linear(in_features=768, out_features=2048, bias=False) (wo): Linear(in_features=2048, out_features=768, bias=False) (dropout): Dropout(p=0.1, inplace=False) (act): NewGELUActivation() ) (layer_norm): T5LayerNorm() (dropout): Dropout(p=0.1, inplace=False) ) ) ) (4): T5Block( (layer): ModuleList( (0): T5LayerSelfAttention( (SelfAttention): T5Attention( (q): Linear(in_features=768, out_features=768, bias=False) (k): Linear(in_features=768, out_features=768, bias=False) (v): Linear(in_features=768, out_features=768, bias=False) (o): Linear(in_features=768, out_features=768, bias=False) ) (layer_norm): T5LayerNorm() (dropout): Dropout(p=0.1, inplace=False) ) (1): T5LayerCrossAttention( (EncDecAttention): T5Attention( (q): Linear(in_features=768, out_features=768, bias=False) (k): Linear(in_features=768, out_features=768, bias=False) (v): Linear(in_features=768, out_features=768, bias=False) (o): Linear(in_features=768, out_features=768, bias=False) ) (layer_norm): T5LayerNorm() (dropout): Dropout(p=0.1, inplace=False) ) (2): T5LayerFF( (DenseReluDense): T5DenseGatedActDense( (wi_0): Linear(in_features=768, out_features=2048, bias=False) (wi_1): Linear(in_features=768, out_features=2048, bias=False) (wo): Linear(in_features=2048, out_features=768, bias=False) (dropout): Dropout(p=0.1, inplace=False) (act): NewGELUActivation() ) (layer_norm): T5LayerNorm() (dropout): Dropout(p=0.1, inplace=False) ) ) ) (5): T5Block( (layer): ModuleList( (0): T5LayerSelfAttention( (SelfAttention): T5Attention( (q): Linear(in_features=768, out_features=768, bias=False) (k): Linear(in_features=768, out_features=768, bias=False) (v): Linear(in_features=768, out_features=768, bias=False) (o): Linear(in_features=768, out_features=768, bias=False) ) (layer_norm): T5LayerNorm() (dropout): Dropout(p=0.1, inplace=False) ) (1): T5LayerCrossAttention( (EncDecAttention): T5Attention( (q): Linear(in_features=768, out_features=768, bias=False) (k): Linear(in_features=768, out_features=768, bias=False) (v): Linear(in_features=768, out_features=768, bias=False) (o): Linear(in_features=768, out_features=768, bias=False) ) (layer_norm): T5LayerNorm() (dropout): Dropout(p=0.1, inplace=False) ) (2): T5LayerFF( (DenseReluDense): T5DenseGatedActDense( (wi_0): Linear(in_features=768, out_features=2048, bias=False) (wi_1): Linear(in_features=768, out_features=2048, bias=False) (wo): Linear(in_features=2048, out_features=768, bias=False) (dropout): Dropout(p=0.1, inplace=False) (act): NewGELUActivation() ) (layer_norm): T5LayerNorm() (dropout): Dropout(p=0.1, inplace=False) ) ) ) (6): T5Block( (layer): ModuleList( (0): T5LayerSelfAttention( (SelfAttention): T5Attention( (q): Linear(in_features=768, out_features=768, bias=False) (k): Linear(in_features=768, out_features=768, bias=False) (v): Linear(in_features=768, out_features=768, bias=False) (o): Linear(in_features=768, out_features=768, bias=False) ) (layer_norm): T5LayerNorm() (dropout): Dropout(p=0.1, inplace=False) ) (1): T5LayerCrossAttention( (EncDecAttention): T5Attention( (q): Linear(in_features=768, out_features=768, bias=False) (k): Linear(in_features=768, out_features=768, bias=False) (v): Linear(in_features=768, out_features=768, bias=False) (o): Linear(in_features=768, out_features=768, bias=False) ) (layer_norm): T5LayerNorm() (dropout): Dropout(p=0.1, inplace=False) ) (2): T5LayerFF( (DenseReluDense): T5DenseGatedActDense( (wi_0): Linear(in_features=768, out_features=2048, bias=False) (wi_1): Linear(in_features=768, out_features=2048, bias=False) (wo): 
Linear(in_features=2048, out_features=768, bias=False) (dropout): Dropout(p=0.1, inplace=False) (act): NewGELUActivation() ) (layer_norm): T5LayerNorm() (dropout): Dropout(p=0.1, inplace=False) ) ) ) (7): T5Block( (layer): ModuleList( (0): T5LayerSelfAttention( (SelfAttention): T5Attention( (q): Linear(in_features=768, out_features=768, bias=False) (k): Linear(in_features=768, out_features=768, bias=False) (v): Linear(in_features=768, out_features=768, bias=False) (o): Linear(in_features=768, out_features=768, bias=False) ) (layer_norm): T5LayerNorm() (dropout): Dropout(p=0.1, inplace=False) ) (1): T5LayerCrossAttention( (EncDecAttention): T5Attention( (q): Linear(in_features=768, out_features=768, bias=False) (k): Linear(in_features=768, out_features=768, bias=False) (v): Linear(in_features=768, out_features=768, bias=False) (o): Linear(in_features=768, out_features=768, bias=False) ) (layer_norm): T5LayerNorm() (dropout): Dropout(p=0.1, inplace=False) ) (2): T5LayerFF( (DenseReluDense): T5DenseGatedActDense( (wi_0): Linear(in_features=768, out_features=2048, bias=False) (wi_1): Linear(in_features=768, out_features=2048, bias=False) (wo): Linear(in_features=2048, out_features=768, bias=False) (dropout): Dropout(p=0.1, inplace=False) (act): NewGELUActivation() ) (layer_norm): T5LayerNorm() (dropout): Dropout(p=0.1, inplace=False) ) ) ) (8): T5Block( (layer): ModuleList( (0): T5LayerSelfAttention( (SelfAttention): T5Attention( (q): Linear(in_features=768, out_features=768, bias=False) (k): Linear(in_features=768, out_features=768, bias=False) (v): Linear(in_features=768, out_features=768, bias=False) (o): Linear(in_features=768, out_features=768, bias=False) ) (layer_norm): T5LayerNorm() (dropout): Dropout(p=0.1, inplace=False) ) (1): T5LayerCrossAttention( (EncDecAttention): T5Attention( (q): Linear(in_features=768, out_features=768, bias=False) (k): Linear(in_features=768, out_features=768, bias=False) (v): Linear(in_features=768, out_features=768, bias=False) (o): Linear(in_features=768, out_features=768, bias=False) ) (layer_norm): T5LayerNorm() (dropout): Dropout(p=0.1, inplace=False) ) (2): T5LayerFF( (DenseReluDense): T5DenseGatedActDense( (wi_0): Linear(in_features=768, out_features=2048, bias=False) (wi_1): Linear(in_features=768, out_features=2048, bias=False) (wo): Linear(in_features=2048, out_features=768, bias=False) (dropout): Dropout(p=0.1, inplace=False) (act): NewGELUActivation() ) (layer_norm): T5LayerNorm() (dropout): Dropout(p=0.1, inplace=False) ) ) ) (9): T5Block( (layer): ModuleList( (0): T5LayerSelfAttention( (SelfAttention): T5Attention( (q): Linear(in_features=768, out_features=768, bias=False) (k): Linear(in_features=768, out_features=768, bias=False) (v): Linear(in_features=768, out_features=768, bias=False) (o): Linear(in_features=768, out_features=768, bias=False) ) (layer_norm): T5LayerNorm() (dropout): Dropout(p=0.1, inplace=False) ) (1): T5LayerCrossAttention( (EncDecAttention): T5Attention( (q): Linear(in_features=768, out_features=768, bias=False) (k): Linear(in_features=768, out_features=768, bias=False) (v): Linear(in_features=768, out_features=768, bias=False) (o): Linear(in_features=768, out_features=768, bias=False) ) (layer_norm): T5LayerNorm() (dropout): Dropout(p=0.1, inplace=False) ) (2): T5LayerFF( (DenseReluDense): T5DenseGatedActDense( (wi_0): Linear(in_features=768, out_features=2048, bias=False) (wi_1): Linear(in_features=768, out_features=2048, bias=False) (wo): Linear(in_features=2048, out_features=768, bias=False) (dropout): Dropout(p=0.1, 
inplace=False) (act): NewGELUActivation() ) (layer_norm): T5LayerNorm() (dropout): Dropout(p=0.1, inplace=False) ) ) ) (10): T5Block( (layer): ModuleList( (0): T5LayerSelfAttention( (SelfAttention): T5Attention( (q): Linear(in_features=768, out_features=768, bias=False) (k): Linear(in_features=768, out_features=768, bias=False) (v): Linear(in_features=768, out_features=768, bias=False) (o): Linear(in_features=768, out_features=768, bias=False) ) (layer_norm): T5LayerNorm() (dropout): Dropout(p=0.1, inplace=False) ) (1): T5LayerCrossAttention( (EncDecAttention): T5Attention( (q): Linear(in_features=768, out_features=768, bias=False) (k): Linear(in_features=768, out_features=768, bias=False) (v): Linear(in_features=768, out_features=768, bias=False) (o): Linear(in_features=768, out_features=768, bias=False) ) (layer_norm): T5LayerNorm() (dropout): Dropout(p=0.1, inplace=False) ) (2): T5LayerFF( (DenseReluDense): T5DenseGatedActDense( (wi_0): Linear(in_features=768, out_features=2048, bias=False) (wi_1): Linear(in_features=768, out_features=2048, bias=False) (wo): Linear(in_features=2048, out_features=768, bias=False) (dropout): Dropout(p=0.1, inplace=False) (act): NewGELUActivation() ) (layer_norm): T5LayerNorm() (dropout): Dropout(p=0.1, inplace=False) ) ) ) (11): T5Block( (layer): ModuleList( (0): T5LayerSelfAttention( (SelfAttention): T5Attention( (q): Linear(in_features=768, out_features=768, bias=False) (k): Linear(in_features=768, out_features=768, bias=False) (v): Linear(in_features=768, out_features=768, bias=False) (o): Linear(in_features=768, out_features=768, bias=False) ) (layer_norm): T5LayerNorm() (dropout): Dropout(p=0.1, inplace=False) ) (1): T5LayerCrossAttention( (EncDecAttention): T5Attention( (q): Linear(in_features=768, out_features=768, bias=False) (k): Linear(in_features=768, out_features=768, bias=False) (v): Linear(in_features=768, out_features=768, bias=False) (o): Linear(in_features=768, out_features=768, bias=False) ) (layer_norm): T5LayerNorm() (dropout): Dropout(p=0.1, inplace=False) ) (2): T5LayerFF( (DenseReluDense): T5DenseGatedActDense( (wi_0): Linear(in_features=768, out_features=2048, bias=False) (wi_1): Linear(in_features=768, out_features=2048, bias=False) (wo): Linear(in_features=2048, out_features=768, bias=False) (dropout): Dropout(p=0.1, inplace=False) (act): NewGELUActivation() ) (layer_norm): T5LayerNorm() (dropout): Dropout(p=0.1, inplace=False) ) ) ) ) (final_layer_norm): T5LayerNorm() (dropout): Dropout(p=0.1, inplace=False) ) (lm_head): Linear(in_features=768, out_features=32128, bias=False) )
Validation
correct = 0
labels = "sport, world, business, scitech"
for entry in dataset['test']:
    prompt = f"classify with possible labels: {labels}\ntext: {entry['text']}"
    output = pipeline(prompt, do_sample=False)[0]['generated_text'].lower()
    if output == entry['label']:
        correct += 1
accuracy = correct / len(dataset['test'])
print(f"Accuracy: {accuracy}")
Accuracy: 0.7560526315789474
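The single number hides how the four classes behave individually; a minimal per-class breakdown, reusing the same loop (and the tqdm import from above), is sketched below:
from collections import Counter

# Per-class zero-shot accuracy (sketch): same prompt and pipeline as the loop above,
# with whitespace stripped from the generated text before comparison.
total, hits = Counter(), Counter()
for entry in tqdm(dataset['test']):
    prompt = f"classify with possible labels: {labels}\ntext: {entry['text']}"
    output = pipeline(prompt, do_sample=False)[0]['generated_text'].lower().strip()
    total[entry['label']] += 1
    hits[entry['label']] += output == entry['label']

for label in sorted(total):
    print(f"{label}: {hits[label] / total[label]:.3f}")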
Summary
| | Roberta | GPT2 | T5 | Flan-T5 |
| --- | --- | --- | --- | --- |
| Accuracy | 92.2% | 91.9% | 46.7% | 75.6% |