348 KiB
348 KiB
# Install the Python packages this notebook relies on.
# NOTE(review): no versions are pinned, so pip resolves whatever is current at run time — confirm this is intended.
! pip install datasets transformers torch scikit-learn evaluate
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/ Collecting datasets Downloading datasets-2.9.0-py3-none-any.whl (462 kB) [2K [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m462.8/462.8 KB[0m [31m8.5 MB/s[0m eta [36m0:00:00[0m [?25hCollecting transformers Downloading transformers-4.26.1-py3-none-any.whl (6.3 MB) [2K [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.3/6.3 MB[0m [31m66.9 MB/s[0m eta [36m0:00:00[0m [?25hRequirement already satisfied: torch in /usr/local/lib/python3.8/dist-packages (1.13.1+cu116) Requirement already satisfied: scikit-learn in /usr/local/lib/python3.8/dist-packages (1.0.2) Collecting evaluate Downloading evaluate-0.4.0-py3-none-any.whl (81 kB) [2K [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m81.4/81.4 KB[0m [31m8.5 MB/s[0m eta [36m0:00:00[0m [?25hRequirement already satisfied: pyarrow>=6.0.0 in /usr/local/lib/python3.8/dist-packages (from datasets) (9.0.0) Requirement already satisfied: dill<0.3.7 in /usr/local/lib/python3.8/dist-packages (from datasets) (0.3.6) Requirement already satisfied: aiohttp in /usr/local/lib/python3.8/dist-packages (from datasets) (3.8.3) Requirement already satisfied: requests>=2.19.0 in /usr/local/lib/python3.8/dist-packages (from datasets) (2.25.1) Requirement already satisfied: pyyaml>=5.1 in /usr/local/lib/python3.8/dist-packages (from datasets) (6.0) Collecting xxhash Downloading xxhash-3.2.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (213 kB) [2K [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m213.0/213.0 KB[0m [31m15.9 MB/s[0m eta [36m0:00:00[0m [?25hRequirement already satisfied: numpy>=1.17 in /usr/local/lib/python3.8/dist-packages (from datasets) (1.21.6) Collecting responses<0.19 Downloading responses-0.18.0-py3-none-any.whl (38 kB) Collecting multiprocess Downloading multiprocess-0.70.14-py38-none-any.whl (132 kB) [2K [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m132.0/132.0 KB[0m [31m14.3 MB/s[0m eta 
[36m0:00:00[0m [?25hRequirement already satisfied: fsspec[http]>=2021.11.1 in /usr/local/lib/python3.8/dist-packages (from datasets) (2023.1.0) Requirement already satisfied: tqdm>=4.62.1 in /usr/local/lib/python3.8/dist-packages (from datasets) (4.64.1) Collecting huggingface-hub<1.0.0,>=0.2.0 Downloading huggingface_hub-0.12.0-py3-none-any.whl (190 kB) [2K [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m190.3/190.3 KB[0m [31m18.6 MB/s[0m eta [36m0:00:00[0m [?25hRequirement already satisfied: pandas in /usr/local/lib/python3.8/dist-packages (from datasets) (1.3.5) Requirement already satisfied: packaging in /usr/local/lib/python3.8/dist-packages (from datasets) (23.0) Requirement already satisfied: filelock in /usr/local/lib/python3.8/dist-packages (from transformers) (3.9.0) Requirement already satisfied: regex!=2019.12.17 in /usr/local/lib/python3.8/dist-packages (from transformers) (2022.6.2) Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 Downloading tokenizers-0.13.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB) [2K [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[0m [31m86.4 MB/s[0m eta [36m0:00:00[0m [?25hRequirement already satisfied: typing-extensions in /usr/local/lib/python3.8/dist-packages (from torch) (4.4.0) Requirement already satisfied: joblib>=0.11 in /usr/local/lib/python3.8/dist-packages (from scikit-learn) (1.2.0) Requirement already satisfied: scipy>=1.1.0 in /usr/local/lib/python3.8/dist-packages (from scikit-learn) (1.7.3) Requirement already satisfied: threadpoolctl>=2.0.0 in /usr/local/lib/python3.8/dist-packages (from scikit-learn) (3.1.0) Requirement already satisfied: yarl<2.0,>=1.0 in /usr/local/lib/python3.8/dist-packages (from aiohttp->datasets) (1.8.2) Requirement already satisfied: frozenlist>=1.1.1 in /usr/local/lib/python3.8/dist-packages (from aiohttp->datasets) (1.3.3) Requirement already satisfied: aiosignal>=1.1.2 in /usr/local/lib/python3.8/dist-packages (from aiohttp->datasets) (1.3.1) 
Requirement already satisfied: attrs>=17.3.0 in /usr/local/lib/python3.8/dist-packages (from aiohttp->datasets) (22.2.0) Requirement already satisfied: multidict<7.0,>=4.5 in /usr/local/lib/python3.8/dist-packages (from aiohttp->datasets) (6.0.4) Requirement already satisfied: charset-normalizer<3.0,>=2.0 in /usr/local/lib/python3.8/dist-packages (from aiohttp->datasets) (2.1.1) Requirement already satisfied: async-timeout<5.0,>=4.0.0a3 in /usr/local/lib/python3.8/dist-packages (from aiohttp->datasets) (4.0.2) Requirement already satisfied: chardet<5,>=3.0.2 in /usr/local/lib/python3.8/dist-packages (from requests>=2.19.0->datasets) (4.0.0) Requirement already satisfied: idna<3,>=2.5 in /usr/local/lib/python3.8/dist-packages (from requests>=2.19.0->datasets) (2.10) Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.8/dist-packages (from requests>=2.19.0->datasets) (2022.12.7) Requirement already satisfied: urllib3<1.27,>=1.21.1 in /usr/local/lib/python3.8/dist-packages (from requests>=2.19.0->datasets) (1.24.3) Collecting urllib3<1.27,>=1.21.1 Downloading urllib3-1.26.14-py2.py3-none-any.whl (140 kB) [2K [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m140.6/140.6 KB[0m [31m13.8 MB/s[0m eta [36m0:00:00[0m [?25hRequirement already satisfied: python-dateutil>=2.7.3 in /usr/local/lib/python3.8/dist-packages (from pandas->datasets) (2.8.2) Requirement already satisfied: pytz>=2017.3 in /usr/local/lib/python3.8/dist-packages (from pandas->datasets) (2022.7.1) Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.8/dist-packages (from python-dateutil>=2.7.3->pandas->datasets) (1.15.0) Installing collected packages: tokenizers, xxhash, urllib3, multiprocess, responses, huggingface-hub, transformers, datasets, evaluate Attempting uninstall: urllib3 Found existing installation: urllib3 1.24.3 Uninstalling urllib3-1.24.3: Successfully uninstalled urllib3-1.24.3 Successfully installed datasets-2.9.0 evaluate-0.4.0 huggingface-hub-0.12.0 
multiprocess-0.70.14 responses-0.18.0 tokenizers-0.13.2 transformers-4.26.1 urllib3-1.26.14 xxhash-3.2.0
# Download the run_glue.py example script from the transformers v4.23.1 tag and save it locally as run_glue.py.
# NOTE(review): the script is pinned to v4.23.1 while the pip install in this notebook is unpinned
# (its captured log shows transformers-4.26.1) — confirm the two versions are compatible.
!wget 'https://raw.githubusercontent.com/huggingface/transformers/v4.23.1/examples/pytorch/text-classification/run_glue.py' -O 'run_glue.py'
--2023-02-10 21:16:19-- https://raw.githubusercontent.com/huggingface/transformers/v4.23.1/examples/pytorch/text-classification/run_glue.py Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ... Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected. HTTP request sent, awaiting response... 200 OK Length: 27259 (27K) [text/plain] Saving to: ‘run_glue.py’ run_glue.py 0%[ ] 0 --.-KB/s run_glue.py 100%[===================>] 26.62K --.-KB/s in 0.003s 2023-02-10 21:16:19 (9.37 MB/s) - ‘run_glue.py’ saved [27259/27259]
import json
from pathlib import Path
from typing import Dict, List
from datasets import load_dataset
# Load the 'emotion' dataset; the returned object is indexed by split name further down.
loaded_data = load_dataset('emotion')
# Create the data/ output directory (-p: do not fail if it already exists, -v: report what is created).
!mkdir -v -p data
# Destination files for the three splits (numeric labels).
train_path = Path('data/train.json')
valid_path = Path('data/valid.json')
test_path = Path('data/test.json')

data_train, data_valid, data_test = [], [], []

# Each entry is (source split, destination list, optional row cap); None disables the cap.
# The dataset's 'test' split is collected into data_valid and divided into
# validation and test portions below.
split_plan = [
    (loaded_data['train'], data_train, None),
    (loaded_data['test'], data_valid, None),
]
for source_rows, target_rows, row_cap in split_plan:
    for row_index, row in enumerate(source_rows):
        if row_cap is not None and row_index >= row_cap:
            break
        # Keep only the two fields that are written out, with the label as a plain int.
        target_rows.append({'label': int(row['label']), 'text': row['text']})

print(f'Train: {len(data_train):6d}')
print(f'Valid: {len(data_valid):6d}')

# Only rows labelled 0 or 1 are kept; rows with any other label are dropped here.
data_class_1 = [row for row in data_valid if row['label'] == 0]
data_class_2 = [row for row in data_valid if row['label'] == 1]

print(f'Label 1: {len(data_class_1):6d}')
print(f'Label 2: {len(data_class_2):6d}')

size_half_class_1 = len(data_class_1) // 2
size_half_class_2 = len(data_class_2) // 2

# First half of each class goes to validation, the remainder to test.
data_valid = data_class_1[:size_half_class_1] + data_class_2[:size_half_class_2]
data_test = data_class_1[size_half_class_1:] + data_class_2[size_half_class_2:]

print(f'Valid: {len(data_valid):6d}')
print(f'Test : {len(data_test):6d}')
# Maps the integer class ids stored in the numeric JSON files to the label
# names written to the 's2s-' prefixed files.
MAP_LABEL_TRANSLATION = {
    0: 'sadness',
    1: 'joy',
    2: 'love',
    3: 'anger',
    4: 'fear',
    5: 'surprise',
}


def save_as_translations(original_save_path: Path, data_to_save: List[Dict]) -> None:
    """Write a JSON-lines copy of ``data_to_save`` with label ids replaced by names.

    The output file lives next to ``original_save_path`` and carries the same
    file name prefixed with ``'s2s-'``. Each record is written as one JSON
    object per line, identical to the input record except that ``'label'`` is
    looked up in ``MAP_LABEL_TRANSLATION``.

    The input records are NOT modified: a translated copy of each record is
    serialized instead. Mutating the caller's dicts would leave them holding
    string labels, so a second call on the same data would fail the lookup.

    Args:
        original_save_path: Path of the numeric-label file; only its parent
            directory and file name are used to derive the output path.
        data_to_save: Records, each a dict with an integer ``'label'`` key.

    Raises:
        KeyError: If a record has no ``'label'`` key or its label is not a key
            of ``MAP_LABEL_TRANSLATION``.
    """
    file_name = 's2s-' + original_save_path.name
    file_path = original_save_path.parent / file_name

    print(f'Saving into: {file_path}')
    with open(file_path, 'wt') as f_write:
        for data_line in data_to_save:
            # Build a new dict so the caller's record keeps its integer label;
            # key order is preserved because 'label' already exists in data_line.
            translated_line = {**data_line, 'label': MAP_LABEL_TRANSLATION[data_line['label']]}
            f_write.write(f'{json.dumps(translated_line)}\n')
# Pair every destination file with the records that belong in it.
save_jobs = [
    (train_path, data_train),
    (valid_path, data_valid),
    (test_path, data_test),
]
for file_path, data_to_save in save_jobs:
    print(f'Saving into: {file_path}')
    # Numeric-label file: one JSON object per line.
    with open(file_path, 'wt') as f_write:
        f_write.writelines(json.dumps(record) + '\n' for record in data_to_save)
    # The label-name variant is produced only after the numeric file has been written.
    save_as_translations(file_path, data_to_save)
WARNING:datasets.builder:No config specified, defaulting to: emotion/split WARNING:datasets.builder:Found cached dataset emotion (/root/.cache/huggingface/datasets/emotion/split/1.0.0/cca5efe2dfeb58c1d098e0f9eeb200e9927d889b5a03c67097275dfb5fe463bd)
0%| | 0/3 [00:00<?, ?it/s]
Train: 16000 Valid: 2000 Label 1: 581 Label 2: 695 Valid: 637 Test : 639 Saving into: data/train.json Saving into: data/s2s-train.json Saving into: data/valid.json Saving into: data/s2s-valid.json Saving into: data/test.json Saving into: data/s2s-test.json
# Build data/train-5k.json from the first 2500 and the last 2500 lines of data/train.json
# ('>' creates/truncates the file, '>>' appends to it), then print its line count.
!head -n 2500 data/train.json > data/train-5k.json
!tail -n 2500 data/train.json >> data/train-5k.json
!wc -l data/train-5k.json
5000 data/train-5k.json
from pathlib import Path
# Number of records taken from each end of a file; a subset therefore holds
# at most 2 * HALF_SUBSET_SIZE records.
HALF_SUBSET_SIZE = 2500

# For every split file, write a '<name>-5k.json' subset made of the first and
# last HALF_SUBSET_SIZE records.
for file_name in ["train", "valid", "test", "s2s-train", "s2s-valid", "s2s-test"]:
    print(f"=== {file_name} ===")
    # splitlines() yields no trailing empty entry for the file's final newline.
    # split('\n') does, which made the tail slice one record short and put an
    # empty line into the output.
    all_text = Path(f"data/{file_name}.json").read_text().splitlines()
    if len(all_text) <= 2 * HALF_SUBSET_SIZE:
        # Head and tail slices would overlap and duplicate records in a short
        # file, so keep the file as it is.
        text = all_text
    else:
        text = all_text[:HALF_SUBSET_SIZE] + all_text[-HALF_SUBSET_SIZE:]
    # Terminate every record with a newline, matching the source files.
    Path(f"data/{file_name}-5k.json").write_text("".join(f"{line}\n" for line in text))
=== train === === valid === === test === === s2s-train === === s2s-valid === === s2s-test ===
import os
# Set the TOKENIZERS_PARALLELISM environment variable to 'true' before launching the training script.
os.environ['TOKENIZERS_PARALLELISM'] = 'true'
# Run run_glue.py with the gpt2 checkpoint on the '-5k' subset files: train, evaluate and predict,
# 5 epochs, batch size 24, max sequence length 128, learning rate 2e-5.
# NOTE(review): the output directory is named 'imdb-5k' although the data files in this notebook are
# built from the 'emotion' dataset — confirm the name is intentional.
# NOTE(review): the captured log of this cell reports "Checkpoint detected, resuming training at
# out/imdb-5k/gpt2/checkpoint-500"; for a run from scratch the log suggests changing --output_dir
# or adding --overwrite_output_dir.
!python run_glue.py \
--cache_dir .cache_training \
--model_name_or_path gpt2 \
--train_file data/train-5k.json \
--validation_file data/valid-5k.json \
--test_file data/test-5k.json \
--per_device_train_batch_size 24 \
--per_device_eval_batch_size 24 \
--do_train \
--do_eval \
--do_predict \
--max_seq_length 128 \
--learning_rate 2e-5 \
--num_train_epochs 5 \
--output_dir out/imdb-5k/gpt2
2023-02-10 21:22:54.785242: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 AVX512F AVX512_VNNI FMA To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. 2023-02-10 21:22:54.925689: I tensorflow/core/util/port.cc:104] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`. 2023-02-10 21:22:55.662472: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/lib64-nvidia 2023-02-10 21:22:55.662568: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/lib64-nvidia 2023-02-10 21:22:55.662585: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. 
WARNING:__main__:Process rank: -1, device: cuda:0, n_gpu: 1distributed training: False, 16-bits training: False INFO:__main__:Training/evaluation parameters TrainingArguments( _n_gpu=1, adafactor=False, adam_beta1=0.9, adam_beta2=0.999, adam_epsilon=1e-08, auto_find_batch_size=False, bf16=False, bf16_full_eval=False, data_seed=None, dataloader_drop_last=False, dataloader_num_workers=0, dataloader_pin_memory=True, ddp_bucket_cap_mb=None, ddp_find_unused_parameters=None, ddp_timeout=1800, debug=[], deepspeed=None, disable_tqdm=False, do_eval=True, do_predict=True, do_train=True, eval_accumulation_steps=None, eval_delay=0, eval_steps=None, evaluation_strategy=no, fp16=False, fp16_backend=auto, fp16_full_eval=False, fp16_opt_level=O1, fsdp=[], fsdp_min_num_params=0, fsdp_transformer_layer_cls_to_wrap=None, full_determinism=False, gradient_accumulation_steps=1, gradient_checkpointing=False, greater_is_better=None, group_by_length=False, half_precision_backend=auto, hub_model_id=None, hub_private_repo=False, hub_strategy=every_save, hub_token=<HUB_TOKEN>, ignore_data_skip=False, include_inputs_for_metrics=False, jit_mode_eval=False, label_names=None, label_smoothing_factor=0.0, learning_rate=2e-05, length_column_name=length, load_best_model_at_end=False, local_rank=-1, log_level=passive, log_level_replica=passive, log_on_each_node=True, logging_dir=out/imdb-5k/gpt2/runs/Feb10_21-22-58_4bf02db3dc1f, logging_first_step=False, logging_nan_inf_filter=True, logging_steps=500, logging_strategy=steps, lr_scheduler_type=linear, max_grad_norm=1.0, max_steps=-1, metric_for_best_model=None, mp_parameters=, no_cuda=False, num_train_epochs=5.0, optim=adamw_hf, optim_args=None, output_dir=out/imdb-5k/gpt2, overwrite_output_dir=False, past_index=-1, per_device_eval_batch_size=24, per_device_train_batch_size=24, prediction_loss_only=False, push_to_hub=False, push_to_hub_model_id=None, push_to_hub_organization=None, push_to_hub_token=<PUSH_TO_HUB_TOKEN>, ray_scope=last, 
remove_unused_columns=True, report_to=['tensorboard'], resume_from_checkpoint=None, run_name=out/imdb-5k/gpt2, save_on_each_node=False, save_steps=500, save_strategy=steps, save_total_limit=None, seed=42, sharded_ddp=[], skip_memory_metrics=True, tf32=None, torch_compile=False, torch_compile_backend=None, torch_compile_mode=None, torchdynamo=None, tpu_metrics_debug=False, tpu_num_cores=None, use_ipex=False, use_legacy_prediction_loop=False, use_mps_device=False, warmup_ratio=0.0, warmup_steps=0, weight_decay=0.0, xpu_backend=None, ) INFO:__main__:Checkpoint detected, resuming training at out/imdb-5k/gpt2/checkpoint-500. To avoid this behavior, change the `--output_dir` or add `--overwrite_output_dir` to train from scratch. INFO:__main__:load a local file for train: data/train-5k.json INFO:__main__:load a local file for validation: data/valid-5k.json INFO:__main__:load a local file for test: data/test-5k.json WARNING:datasets.builder:Using custom data configuration default-58ab9a923ac72046 INFO:datasets.info:Loading Dataset Infos from /usr/local/lib/python3.8/dist-packages/datasets/packaged_modules/json INFO:datasets.builder:Overwrite dataset info from restored data version. 
INFO:datasets.info:Loading Dataset info from .cache_training/json/default-58ab9a923ac72046/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51 WARNING:datasets.builder:Found cached dataset json (/content/.cache_training/json/default-58ab9a923ac72046/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51) INFO:datasets.info:Loading Dataset info from /content/.cache_training/json/default-58ab9a923ac72046/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51 100% 3/3 [00:00<00:00, 880.54it/s] [INFO|configuration_utils.py:660] 2023-02-10 21:22:59,860 >> loading configuration file config.json from cache at .cache_training/models--gpt2/snapshots/e7da7f221d5bf496a48136c0cd264e630fe9fcc8/config.json [INFO|configuration_utils.py:712] 2023-02-10 21:22:59,863 >> Model config GPT2Config { "_name_or_path": "gpt2", "activation_function": "gelu_new", "architectures": [ "GPT2LMHeadModel" ], "attn_pdrop": 0.1, "bos_token_id": 50256, "embd_pdrop": 0.1, "eos_token_id": 50256, "id2label": { "0": "LABEL_0", "1": "LABEL_1", "2": "LABEL_2", "3": "LABEL_3", "4": "LABEL_4", "5": "LABEL_5" }, "initializer_range": 0.02, "label2id": { "LABEL_0": 0, "LABEL_1": 1, "LABEL_2": 2, "LABEL_3": 3, "LABEL_4": 4, "LABEL_5": 5 }, "layer_norm_epsilon": 1e-05, "model_type": "gpt2", "n_ctx": 1024, "n_embd": 768, "n_head": 12, "n_inner": null, "n_layer": 12, "n_positions": 1024, "reorder_and_upcast_attn": false, "resid_pdrop": 0.1, "scale_attn_by_inverse_layer_idx": false, "scale_attn_weights": true, "summary_activation": null, "summary_first_dropout": 0.1, "summary_proj_to_labels": true, "summary_type": "cls_index", "summary_use_proj": true, "task_specific_params": { "text-generation": { "do_sample": true, "max_length": 50 } }, "transformers_version": "4.26.1", "use_cache": true, "vocab_size": 50257 } [INFO|tokenization_auto.py:458] 2023-02-10 21:22:59,992 >> Could not locate the tokenizer configuration file, will try to use the model config instead. 
[INFO|configuration_utils.py:660] 2023-02-10 21:23:00,119 >> loading configuration file config.json from cache at .cache_training/models--gpt2/snapshots/e7da7f221d5bf496a48136c0cd264e630fe9fcc8/config.json [INFO|configuration_utils.py:712] 2023-02-10 21:23:00,120 >> Model config GPT2Config { "_name_or_path": "gpt2", "activation_function": "gelu_new", "architectures": [ "GPT2LMHeadModel" ], "attn_pdrop": 0.1, "bos_token_id": 50256, "embd_pdrop": 0.1, "eos_token_id": 50256, "initializer_range": 0.02, "layer_norm_epsilon": 1e-05, "model_type": "gpt2", "n_ctx": 1024, "n_embd": 768, "n_head": 12, "n_inner": null, "n_layer": 12, "n_positions": 1024, "reorder_and_upcast_attn": false, "resid_pdrop": 0.1, "scale_attn_by_inverse_layer_idx": false, "scale_attn_weights": true, "summary_activation": null, "summary_first_dropout": 0.1, "summary_proj_to_labels": true, "summary_type": "cls_index", "summary_use_proj": true, "task_specific_params": { "text-generation": { "do_sample": true, "max_length": 50 } }, "transformers_version": "4.26.1", "use_cache": true, "vocab_size": 50257 } [INFO|tokenization_utils_base.py:1802] 2023-02-10 21:23:00,397 >> loading file vocab.json from cache at .cache_training/models--gpt2/snapshots/e7da7f221d5bf496a48136c0cd264e630fe9fcc8/vocab.json [INFO|tokenization_utils_base.py:1802] 2023-02-10 21:23:00,397 >> loading file merges.txt from cache at .cache_training/models--gpt2/snapshots/e7da7f221d5bf496a48136c0cd264e630fe9fcc8/merges.txt [INFO|tokenization_utils_base.py:1802] 2023-02-10 21:23:00,397 >> loading file tokenizer.json from cache at .cache_training/models--gpt2/snapshots/e7da7f221d5bf496a48136c0cd264e630fe9fcc8/tokenizer.json [INFO|tokenization_utils_base.py:1802] 2023-02-10 21:23:00,397 >> loading file added_tokens.json from cache at None [INFO|tokenization_utils_base.py:1802] 2023-02-10 21:23:00,397 >> loading file special_tokens_map.json from cache at None [INFO|tokenization_utils_base.py:1802] 2023-02-10 21:23:00,397 >> loading file 
tokenizer_config.json from cache at None [INFO|configuration_utils.py:660] 2023-02-10 21:23:00,397 >> loading configuration file config.json from cache at .cache_training/models--gpt2/snapshots/e7da7f221d5bf496a48136c0cd264e630fe9fcc8/config.json [INFO|configuration_utils.py:712] 2023-02-10 21:23:00,398 >> Model config GPT2Config { "_name_or_path": "gpt2", "activation_function": "gelu_new", "architectures": [ "GPT2LMHeadModel" ], "attn_pdrop": 0.1, "bos_token_id": 50256, "embd_pdrop": 0.1, "eos_token_id": 50256, "initializer_range": 0.02, "layer_norm_epsilon": 1e-05, "model_type": "gpt2", "n_ctx": 1024, "n_embd": 768, "n_head": 12, "n_inner": null, "n_layer": 12, "n_positions": 1024, "reorder_and_upcast_attn": false, "resid_pdrop": 0.1, "scale_attn_by_inverse_layer_idx": false, "scale_attn_weights": true, "summary_activation": null, "summary_first_dropout": 0.1, "summary_proj_to_labels": true, "summary_type": "cls_index", "summary_use_proj": true, "task_specific_params": { "text-generation": { "do_sample": true, "max_length": 50 } }, "transformers_version": "4.26.1", "use_cache": true, "vocab_size": 50257 } [INFO|modeling_utils.py:2275] 2023-02-10 21:23:00,491 >> loading weights file pytorch_model.bin from cache at .cache_training/models--gpt2/snapshots/e7da7f221d5bf496a48136c0cd264e630fe9fcc8/pytorch_model.bin [INFO|modeling_utils.py:2857] 2023-02-10 21:23:01,899 >> All model checkpoint weights were used when initializing GPT2ForSequenceClassification. [WARNING|modeling_utils.py:2859] 2023-02-10 21:23:01,899 >> Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at gpt2 and are newly initialized: ['score.weight'] You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference. 
WARNING:datasets.arrow_dataset:Loading cached processed dataset at /content/.cache_training/json/default-58ab9a923ac72046/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51/cache-d1ffff8de8defc1a.arrow Running tokenizer on dataset: 0% 0/2 [00:00<?, ?ba/s][ERROR|tokenization_utils_base.py:1042] 2023-02-10 21:23:04,511 >> Using pad_token, but it is not set yet. INFO:__main__:Set PAD token to EOS: <|endoftext|> INFO:datasets.arrow_dataset:Caching processed dataset at /content/.cache_training/json/default-58ab9a923ac72046/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51/cache-c4ceea5bc782c3e6.arrow Running tokenizer on dataset: 100% 2/2 [00:00<00:00, 26.57ba/s] WARNING:datasets.arrow_dataset:Loading cached processed dataset at /content/.cache_training/json/default-58ab9a923ac72046/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51/cache-4279d22576228078.arrow INFO:__main__:Sample 912 of the training set: {'label': 2, 'text': 'i feel we need a little romantic boost in the relationship', 'input_ids': [72, 1254, 356, 761, 257, 1310, 14348, 5750, 287, 262, 2776, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 
1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]}. INFO:__main__:Sample 204 of the training set: {'label': 1, 'text': 'i feel pretty mellow so far about whatever healing wounding process may be getting underway', 'input_ids': [72, 1254, 2495, 33748, 322, 523, 1290, 546, 4232, 11516, 40942, 1429, 743, 307, 1972, 17715, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]}. 
INFO:__main__:Sample 2253 of the training set: {'label': 1, 'text': 'i feel ive answered those questions for her and shes pretty trusting for the most part', 'input_ids': [72, 1254, 220, 425, 9373, 883, 2683, 329, 607, 290, 673, 82, 2495, 33914, 329, 262, 749, 636, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]}. [INFO|trainer.py:1972] 2023-02-10 21:23:07,962 >> Loading model from out/imdb-5k/gpt2/checkpoint-500. [INFO|trainer.py:710] 2023-02-10 21:23:08,382 >> The following columns in the training set don't have a corresponding argument in `GPT2ForSequenceClassification.forward` and have been ignored: text. If text are not expected by `GPT2ForSequenceClassification.forward`, you can safely ignore this message. 
/usr/local/lib/python3.8/dist-packages/transformers/optimization.py:306: FutureWarning: This implementation of AdamW is deprecated and will be removed in a future version. Use the PyTorch implementation torch.optim.AdamW instead, or set `no_deprecation_warning=True` to disable this warning warnings.warn( [INFO|trainer.py:1650] 2023-02-10 21:23:09,160 >> ***** Running training ***** [INFO|trainer.py:1651] 2023-02-10 21:23:09,160 >> Num examples = 4999 [INFO|trainer.py:1652] 2023-02-10 21:23:09,160 >> Num Epochs = 5 [INFO|trainer.py:1653] 2023-02-10 21:23:09,160 >> Instantaneous batch size per device = 24 [INFO|trainer.py:1654] 2023-02-10 21:23:09,160 >> Total train batch size (w. parallel, distributed & accumulation) = 24 [INFO|trainer.py:1655] 2023-02-10 21:23:09,160 >> Gradient Accumulation steps = 1 [INFO|trainer.py:1656] 2023-02-10 21:23:09,160 >> Total optimization steps = 1045 [INFO|trainer.py:1657] 2023-02-10 21:23:09,161 >> Number of trainable parameters = 124444416 [INFO|trainer.py:1679] 2023-02-10 21:23:09,161 >> Continuing training from checkpoint, will skip to saved global_step [INFO|trainer.py:1680] 2023-02-10 21:23:09,161 >> Continuing training from epoch 2 [INFO|trainer.py:1681] 2023-02-10 21:23:09,161 >> Continuing training from global step 500 [INFO|trainer.py:1683] 2023-02-10 21:23:09,161 >> Will skip the first 2 epochs then the first 82 batches in the first epoch. If this takes a lot of time, you can add the `--ignore_data_skip` flag to your launch command, but you will resume the training on data already seen by your model. 
Skipping the first batches: 0% 0/82 [00:00<?, ?it/s] Skipping the first batches: 100% 82/82 [00:00<00:00, 209.80it/s] 48% 501/1045 [00:01<00:01, 346.02it/s][A 51% 536/1045 [00:06<00:08, 62.48it/s] [A 53% 551/1045 [00:08<00:11, 43.73it/s][A 54% 560/1045 [00:10<00:13, 35.46it/s][A 54% 566/1045 [00:10<00:15, 30.43it/s][A 55% 570/1045 [00:11<00:17, 26.96it/s][A 55% 573/1045 [00:12<00:19, 24.22it/s][A 55% 575/1045 [00:12<00:21, 22.27it/s][A 55% 577/1045 [00:12<00:23, 20.14it/s][A 55% 579/1045 [00:12<00:25, 17.95it/s][A 56% 581/1045 [00:13<00:29, 15.83it/s][A 56% 582/1045 [00:13<00:31, 14.75it/s][A 56% 583/1045 [00:13<00:33, 13.59it/s][A 56% 584/1045 [00:13<00:37, 12.43it/s][A 56% 585/1045 [00:13<00:40, 11.32it/s][A 56% 586/1045 [00:13<00:44, 10.33it/s][A 56% 587/1045 [00:14<00:48, 9.49it/s][A 56% 588/1045 [00:14<00:51, 8.81it/s][A 56% 589/1045 [00:14<00:55, 8.28it/s][A 56% 590/1045 [00:14<00:57, 7.86it/s][A 57% 591/1045 [00:14<01:00, 7.56it/s][A 57% 592/1045 [00:14<01:01, 7.35it/s][A 57% 593/1045 [00:14<01:02, 7.20it/s][A 57% 594/1045 [00:15<01:03, 7.08it/s][A 57% 595/1045 [00:15<01:04, 7.00it/s][A 57% 596/1045 [00:15<01:04, 6.94it/s][A 57% 597/1045 [00:15<01:04, 6.91it/s][A 57% 598/1045 [00:15<01:04, 6.88it/s][A 57% 599/1045 [00:15<01:04, 6.87it/s][A 57% 600/1045 [00:16<01:04, 6.85it/s][A 58% 601/1045 [00:16<01:04, 6.84it/s][A 58% 602/1045 [00:16<01:04, 6.84it/s][A 58% 603/1045 [00:16<01:04, 6.82it/s][A 58% 604/1045 [00:16<01:04, 6.82it/s][A 58% 605/1045 [00:16<01:04, 6.82it/s][A 58% 606/1045 [00:16<01:04, 6.82it/s][A 58% 607/1045 [00:17<01:04, 6.82it/s][A 58% 608/1045 [00:17<01:04, 6.80it/s][A 58% 609/1045 [00:17<01:04, 6.79it/s][A 58% 610/1045 [00:17<01:04, 6.79it/s][A 58% 611/1045 [00:17<01:04, 6.77it/s][A 59% 612/1045 [00:17<01:03, 6.77it/s][A 59% 613/1045 [00:17<01:03, 6.77it/s][A 59% 614/1045 [00:18<01:03, 6.78it/s][A 59% 615/1045 [00:18<01:03, 6.77it/s][A 59% 616/1045 [00:18<01:03, 6.78it/s][A 59% 617/1045 [00:18<01:03, 6.78it/s][A 59% 618/1045 [00:18<01:03, 
6.77it/s][A 59% 619/1045 [00:18<01:02, 6.78it/s][A 59% 620/1045 [00:18<01:02, 6.79it/s][A 59% 621/1045 [00:19<01:02, 6.79it/s][A 60% 622/1045 [00:19<01:02, 6.79it/s][A 60% 623/1045 [00:19<01:02, 6.79it/s][A 60% 624/1045 [00:19<01:02, 6.79it/s][A 60% 625/1045 [00:19<01:01, 6.80it/s][A 60% 626/1045 [00:19<01:01, 6.80it/s][A 60% 628/1045 [00:20<00:53, 7.75it/s][A 60% 629/1045 [00:20<00:55, 7.50it/s][A 60% 630/1045 [00:20<00:56, 7.31it/s][A 60% 631/1045 [00:20<00:57, 7.17it/s][A 60% 632/1045 [00:20<00:59, 6.97it/s][A 61% 633/1045 [00:20<00:59, 6.91it/s][A 61% 634/1045 [00:20<00:59, 6.88it/s][A 61% 635/1045 [00:21<00:59, 6.87it/s][A 61% 636/1045 [00:21<00:59, 6.86it/s][A 61% 637/1045 [00:21<00:59, 6.85it/s][A 61% 638/1045 [00:21<00:59, 6.83it/s][A 61% 639/1045 [00:21<00:59, 6.83it/s][A 61% 640/1045 [00:21<00:59, 6.82it/s][A 61% 641/1045 [00:21<00:59, 6.82it/s][A 61% 642/1045 [00:22<00:59, 6.82it/s][A 62% 643/1045 [00:22<00:59, 6.81it/s][A 62% 644/1045 [00:22<00:58, 6.81it/s][A 62% 645/1045 [00:22<00:58, 6.81it/s][A 62% 646/1045 [00:22<00:58, 6.81it/s][A 62% 647/1045 [00:22<00:58, 6.80it/s][A 62% 648/1045 [00:22<00:58, 6.80it/s][A 62% 649/1045 [00:23<00:58, 6.79it/s][A 62% 650/1045 [00:23<00:58, 6.76it/s][A 62% 651/1045 [00:23<00:58, 6.77it/s][A 62% 652/1045 [00:23<00:57, 6.78it/s][A 62% 653/1045 [00:23<00:57, 6.79it/s][A 63% 654/1045 [00:23<00:57, 6.78it/s][A 63% 655/1045 [00:24<00:57, 6.79it/s][A 63% 656/1045 [00:24<00:57, 6.80it/s][A 63% 657/1045 [00:24<00:56, 6.81it/s][A 63% 658/1045 [00:24<00:56, 6.81it/s][A 63% 659/1045 [00:24<00:56, 6.79it/s][A 63% 660/1045 [00:24<00:56, 6.80it/s][A 63% 661/1045 [00:24<00:56, 6.80it/s][A 63% 662/1045 [00:25<00:56, 6.80it/s][A 63% 663/1045 [00:25<00:56, 6.81it/s][A 64% 664/1045 [00:25<00:55, 6.80it/s][A 64% 665/1045 [00:25<00:56, 6.69it/s][A 64% 666/1045 [00:25<00:56, 6.72it/s][A 64% 667/1045 [00:25<00:56, 6.74it/s][A 64% 668/1045 [00:25<00:55, 6.75it/s][A 64% 669/1045 [00:26<00:55, 6.77it/s][A 64% 670/1045 [00:26<00:55, 
6.79it/s][A 64% 671/1045 [00:26<00:55, 6.77it/s][A 64% 672/1045 [00:26<00:54, 6.78it/s][A 64% 673/1045 [00:26<00:54, 6.77it/s][A 64% 674/1045 [00:26<00:54, 6.79it/s][A 65% 675/1045 [00:26<00:54, 6.76it/s][A 65% 676/1045 [00:27<00:54, 6.77it/s][A 65% 677/1045 [00:27<00:54, 6.77it/s][A 65% 678/1045 [00:27<00:54, 6.79it/s][A 65% 679/1045 [00:27<00:53, 6.78it/s][A 65% 680/1045 [00:27<00:53, 6.79it/s][A 65% 681/1045 [00:27<00:53, 6.79it/s][A 65% 682/1045 [00:28<00:53, 6.79it/s][A 65% 683/1045 [00:28<00:53, 6.80it/s][A 65% 684/1045 [00:28<00:53, 6.81it/s][A 66% 685/1045 [00:28<00:53, 6.69it/s][A 66% 686/1045 [00:28<00:53, 6.71it/s][A 66% 687/1045 [00:28<00:53, 6.74it/s][A 66% 688/1045 [00:28<00:52, 6.76it/s][A 66% 689/1045 [00:29<00:52, 6.78it/s][A 66% 690/1045 [00:29<00:52, 6.78it/s][A 66% 691/1045 [00:29<00:52, 6.78it/s][A 66% 692/1045 [00:29<00:52, 6.78it/s][A 66% 693/1045 [00:29<00:51, 6.78it/s][A 66% 694/1045 [00:29<00:51, 6.79it/s][A 67% 695/1045 [00:29<00:51, 6.79it/s][A 67% 696/1045 [00:30<00:52, 6.67it/s][A 67% 697/1045 [00:30<00:51, 6.71it/s][A 67% 698/1045 [00:30<00:51, 6.74it/s][A 67% 699/1045 [00:30<00:51, 6.77it/s][A 67% 700/1045 [00:30<00:50, 6.77it/s][A 67% 701/1045 [00:30<00:50, 6.78it/s][A 67% 702/1045 [00:30<00:50, 6.79it/s][A 67% 703/1045 [00:31<00:50, 6.79it/s][A 67% 704/1045 [00:31<00:50, 6.79it/s][A 67% 705/1045 [00:31<00:50, 6.78it/s][A 68% 706/1045 [00:31<00:49, 6.78it/s][A 68% 707/1045 [00:31<00:49, 6.79it/s][A 68% 708/1045 [00:31<00:49, 6.79it/s][A 68% 709/1045 [00:31<00:49, 6.80it/s][A 68% 710/1045 [00:32<00:49, 6.78it/s][A 68% 711/1045 [00:32<00:49, 6.80it/s][A 68% 712/1045 [00:32<00:48, 6.80it/s][A 68% 713/1045 [00:32<00:48, 6.81it/s][A 68% 714/1045 [00:32<00:49, 6.70it/s][A 68% 715/1045 [00:32<00:49, 6.72it/s][A 69% 716/1045 [00:33<00:48, 6.74it/s][A 69% 717/1045 [00:33<00:48, 6.77it/s][A 69% 718/1045 [00:33<00:48, 6.78it/s][A 69% 719/1045 [00:33<00:47, 6.80it/s][A 69% 720/1045 [00:33<00:47, 6.81it/s][A 69% 721/1045 [00:33<00:47, 
6.82it/s][A 69% 722/1045 [00:33<00:47, 6.82it/s][A 69% 723/1045 [00:34<00:47, 6.82it/s][A 69% 724/1045 [00:34<00:47, 6.82it/s][A 69% 725/1045 [00:34<00:46, 6.83it/s][A 69% 726/1045 [00:34<00:46, 6.82it/s][A 70% 727/1045 [00:34<00:46, 6.83it/s][A 70% 728/1045 [00:34<00:46, 6.83it/s][A 70% 729/1045 [00:34<00:46, 6.83it/s][A 70% 730/1045 [00:35<00:46, 6.83it/s][A 70% 731/1045 [00:35<00:46, 6.82it/s][A 70% 732/1045 [00:35<00:45, 6.83it/s][A 70% 733/1045 [00:35<00:45, 6.82it/s][A 70% 734/1045 [00:35<00:45, 6.81it/s][A 70% 735/1045 [00:35<00:45, 6.81it/s][A 70% 736/1045 [00:35<00:45, 6.82it/s][A 71% 737/1045 [00:36<00:45, 6.81it/s][A 71% 738/1045 [00:36<00:45, 6.81it/s][A 71% 739/1045 [00:36<00:45, 6.80it/s][A 71% 740/1045 [00:36<00:44, 6.80it/s][A 71% 741/1045 [00:36<00:44, 6.81it/s][A 71% 742/1045 [00:36<00:44, 6.81it/s][A 71% 743/1045 [00:36<00:44, 6.82it/s][A 71% 744/1045 [00:37<00:44, 6.81it/s][A 71% 745/1045 [00:37<00:44, 6.81it/s][A 71% 746/1045 [00:37<00:43, 6.82it/s][A 71% 747/1045 [00:37<00:43, 6.81it/s][A 72% 748/1045 [00:37<00:43, 6.82it/s][A 72% 749/1045 [00:37<00:43, 6.83it/s][A 72% 750/1045 [00:38<00:43, 6.82it/s][A 72% 751/1045 [00:38<00:43, 6.82it/s][A 72% 752/1045 [00:38<00:42, 6.81it/s][A 72% 753/1045 [00:38<00:42, 6.81it/s][A 72% 754/1045 [00:38<00:42, 6.81it/s][A 72% 755/1045 [00:38<00:42, 6.81it/s][A 72% 756/1045 [00:38<00:42, 6.81it/s][A 72% 757/1045 [00:39<00:42, 6.80it/s][A 73% 758/1045 [00:39<00:42, 6.80it/s][A 73% 759/1045 [00:39<00:42, 6.81it/s][A 73% 760/1045 [00:39<00:41, 6.81it/s][A 73% 761/1045 [00:39<00:41, 6.78it/s][A 73% 762/1045 [00:39<00:41, 6.79it/s][A 73% 763/1045 [00:39<00:41, 6.80it/s][A 73% 764/1045 [00:40<00:41, 6.81it/s][A 73% 765/1045 [00:40<00:41, 6.80it/s][A 73% 766/1045 [00:40<00:40, 6.81it/s][A 73% 767/1045 [00:40<00:40, 6.81it/s][A 73% 768/1045 [00:40<00:40, 6.82it/s][A 74% 769/1045 [00:40<00:41, 6.70it/s][A 74% 770/1045 [00:40<00:40, 6.72it/s][A 74% 771/1045 [00:41<00:40, 6.75it/s][A 74% 772/1045 [00:41<00:40, 
6.75it/s][A 74% 773/1045 [00:41<00:40, 6.76it/s][A 74% 774/1045 [00:41<00:40, 6.77it/s][A 74% 775/1045 [00:41<00:39, 6.78it/s][A 74% 776/1045 [00:41<00:39, 6.77it/s][A 74% 777/1045 [00:42<00:39, 6.77it/s][A 74% 778/1045 [00:42<00:39, 6.78it/s][A 75% 779/1045 [00:42<00:39, 6.78it/s][A 75% 780/1045 [00:42<00:39, 6.78it/s][A 75% 781/1045 [00:42<00:38, 6.78it/s][A 75% 782/1045 [00:42<00:38, 6.77it/s][A 75% 783/1045 [00:42<00:38, 6.77it/s][A 75% 784/1045 [00:43<00:38, 6.76it/s][A 75% 785/1045 [00:43<00:38, 6.77it/s][A 75% 786/1045 [00:43<00:38, 6.77it/s][A 75% 787/1045 [00:43<00:38, 6.77it/s][A 75% 788/1045 [00:43<00:37, 6.78it/s][A 76% 789/1045 [00:43<00:37, 6.77it/s][A 76% 790/1045 [00:43<00:37, 6.78it/s][A 76% 791/1045 [00:44<00:37, 6.78it/s][A 76% 792/1045 [00:44<00:37, 6.79it/s][A 76% 793/1045 [00:44<00:37, 6.80it/s][A 76% 794/1045 [00:44<00:36, 6.81it/s][A 76% 795/1045 [00:44<00:36, 6.81it/s][A 76% 796/1045 [00:44<00:36, 6.82it/s][A 76% 797/1045 [00:44<00:36, 6.81it/s][A 76% 798/1045 [00:45<00:36, 6.81it/s][A 76% 799/1045 [00:45<00:36, 6.80it/s][A 77% 800/1045 [00:45<00:35, 6.81it/s][A 77% 801/1045 [00:45<00:35, 6.79it/s][A 77% 802/1045 [00:45<00:35, 6.79it/s][A 77% 803/1045 [00:45<00:35, 6.80it/s][A 77% 804/1045 [00:45<00:35, 6.79it/s][A 77% 805/1045 [00:46<00:35, 6.80it/s][A 77% 806/1045 [00:46<00:35, 6.81it/s][A 77% 807/1045 [00:46<00:34, 6.80it/s][A 77% 808/1045 [00:46<00:34, 6.80it/s][A 77% 809/1045 [00:46<00:34, 6.80it/s][A 78% 810/1045 [00:46<00:34, 6.79it/s][A 78% 811/1045 [00:47<00:34, 6.79it/s][A 78% 812/1045 [00:47<00:34, 6.78it/s][A 78% 813/1045 [00:47<00:34, 6.79it/s][A 78% 814/1045 [00:47<00:34, 6.73it/s][A 78% 815/1045 [00:47<00:34, 6.75it/s][A 78% 816/1045 [00:47<00:33, 6.76it/s][A 78% 817/1045 [00:47<00:33, 6.76it/s][A 78% 818/1045 [00:48<00:33, 6.78it/s][A 78% 819/1045 [00:48<00:33, 6.78it/s][A 78% 820/1045 [00:48<00:33, 6.79it/s][A 79% 821/1045 [00:48<00:32, 6.80it/s][A 79% 822/1045 [00:48<00:32, 6.80it/s][A 79% 823/1045 [00:48<00:32, 
6.81it/s][A 79% 824/1045 [00:48<00:32, 6.81it/s][A 79% 825/1045 [00:49<00:32, 6.82it/s][A 79% 826/1045 [00:49<00:32, 6.81it/s][A 79% 827/1045 [00:49<00:31, 6.82it/s][A 79% 828/1045 [00:49<00:31, 6.81it/s][A 79% 829/1045 [00:49<00:31, 6.79it/s][A 79% 830/1045 [00:49<00:31, 6.80it/s][A 80% 831/1045 [00:49<00:31, 6.77it/s][A 80% 832/1045 [00:50<00:31, 6.77it/s][A 80% 833/1045 [00:50<00:31, 6.79it/s][A 80% 834/1045 [00:50<00:31, 6.79it/s][A 80% 835/1045 [00:50<00:30, 6.80it/s][A 80% 837/1045 [00:50<00:26, 7.79it/s][A 80% 838/1045 [00:50<00:27, 7.52it/s][A 80% 839/1045 [00:51<00:28, 7.32it/s][A 80% 840/1045 [00:51<00:28, 7.17it/s][A 80% 841/1045 [00:51<00:28, 7.07it/s][A 81% 842/1045 [00:51<00:29, 6.98it/s][A 81% 843/1045 [00:51<00:29, 6.94it/s][A 81% 844/1045 [00:51<00:29, 6.91it/s][A 81% 845/1045 [00:51<00:29, 6.89it/s][A 81% 846/1045 [00:52<00:28, 6.86it/s][A 81% 847/1045 [00:52<00:28, 6.86it/s][A 81% 848/1045 [00:52<00:28, 6.85it/s][A 81% 849/1045 [00:52<00:28, 6.85it/s][A 81% 850/1045 [00:52<00:28, 6.83it/s][A 81% 851/1045 [00:52<00:28, 6.81it/s][A 82% 852/1045 [00:52<00:28, 6.80it/s][A 82% 853/1045 [00:53<00:28, 6.80it/s][A 82% 854/1045 [00:53<00:28, 6.80it/s][A 82% 855/1045 [00:53<00:27, 6.80it/s][A 82% 856/1045 [00:53<00:27, 6.81it/s][A 82% 857/1045 [00:53<00:27, 6.80it/s][A 82% 858/1045 [00:53<00:27, 6.81it/s][A 82% 859/1045 [00:53<00:27, 6.79it/s][A 82% 860/1045 [00:54<00:27, 6.80it/s][A 82% 861/1045 [00:54<00:27, 6.80it/s][A 82% 862/1045 [00:54<00:26, 6.79it/s][A 83% 863/1045 [00:54<00:26, 6.79it/s][A 83% 864/1045 [00:54<00:26, 6.78it/s][A 83% 865/1045 [00:54<00:26, 6.80it/s][A 83% 866/1045 [00:55<00:26, 6.80it/s][A 83% 867/1045 [00:55<00:26, 6.80it/s][A 83% 868/1045 [00:55<00:25, 6.81it/s][A 83% 869/1045 [00:55<00:25, 6.82it/s][A 83% 870/1045 [00:55<00:25, 6.79it/s][A 83% 871/1045 [00:55<00:25, 6.80it/s][A 83% 872/1045 [00:55<00:25, 6.79it/s][A 84% 873/1045 [00:56<00:25, 6.80it/s][A 84% 874/1045 [00:56<00:25, 6.79it/s][A 84% 875/1045 [00:56<00:25, 
6.80it/s][A 84% 876/1045 [00:56<00:24, 6.80it/s][A 84% 877/1045 [00:56<00:24, 6.80it/s][A 84% 878/1045 [00:56<00:24, 6.79it/s][A 84% 879/1045 [00:56<00:24, 6.79it/s][A 84% 880/1045 [00:57<00:24, 6.80it/s][A 84% 881/1045 [00:57<00:24, 6.80it/s][A 84% 882/1045 [00:57<00:23, 6.82it/s][A 84% 883/1045 [00:57<00:23, 6.82it/s][A 85% 884/1045 [00:57<00:23, 6.82it/s][A 85% 885/1045 [00:57<00:23, 6.80it/s][A 85% 886/1045 [00:57<00:23, 6.81it/s][A 85% 887/1045 [00:58<00:23, 6.79it/s][A 85% 888/1045 [00:58<00:23, 6.80it/s][A 85% 889/1045 [00:58<00:23, 6.75it/s][A 85% 890/1045 [00:58<00:22, 6.75it/s][A 85% 891/1045 [00:58<00:22, 6.75it/s][A 85% 892/1045 [00:58<00:22, 6.77it/s][A 85% 893/1045 [00:58<00:22, 6.78it/s][A 86% 894/1045 [00:59<00:22, 6.74it/s][A 86% 895/1045 [00:59<00:22, 6.76it/s][A 86% 896/1045 [00:59<00:21, 6.78it/s][A 86% 897/1045 [00:59<00:21, 6.80it/s][A 86% 898/1045 [00:59<00:21, 6.79it/s][A 86% 899/1045 [00:59<00:21, 6.80it/s][A 86% 900/1045 [01:00<00:21, 6.80it/s][A 86% 901/1045 [01:00<00:21, 6.80it/s][A 86% 902/1045 [01:00<00:21, 6.80it/s][A 86% 903/1045 [01:00<00:20, 6.81it/s][A 87% 904/1045 [01:00<00:20, 6.80it/s][A 87% 905/1045 [01:00<00:20, 6.78it/s][A 87% 906/1045 [01:00<00:20, 6.79it/s][A 87% 907/1045 [01:01<00:20, 6.80it/s][A 87% 908/1045 [01:01<00:20, 6.80it/s][A 87% 909/1045 [01:01<00:20, 6.76it/s][A 87% 910/1045 [01:01<00:19, 6.78it/s][A 87% 911/1045 [01:01<00:19, 6.79it/s][A 87% 912/1045 [01:01<00:19, 6.80it/s][A 87% 913/1045 [01:01<00:19, 6.80it/s][A 87% 914/1045 [01:02<00:19, 6.81it/s][A 88% 915/1045 [01:02<00:19, 6.81it/s][A 88% 916/1045 [01:02<00:18, 6.81it/s][A 88% 917/1045 [01:02<00:18, 6.81it/s][A 88% 918/1045 [01:02<00:18, 6.79it/s][A 88% 919/1045 [01:02<00:18, 6.80it/s][A 88% 920/1045 [01:02<00:18, 6.81it/s][A 88% 921/1045 [01:03<00:18, 6.81it/s][A 88% 922/1045 [01:03<00:18, 6.81it/s][A 88% 923/1045 [01:03<00:17, 6.82it/s][A 88% 924/1045 [01:03<00:17, 6.82it/s][A 89% 925/1045 [01:03<00:17, 6.82it/s][A 89% 926/1045 [01:03<00:17, 
6.81it/s][A 89% 927/1045 [01:03<00:17, 6.82it/s][A 89% 928/1045 [01:04<00:17, 6.82it/s][A 89% 929/1045 [01:04<00:17, 6.80it/s][A 89% 930/1045 [01:04<00:16, 6.81it/s][A 89% 931/1045 [01:04<00:16, 6.82it/s][A 89% 932/1045 [01:04<00:16, 6.82it/s][A 89% 933/1045 [01:04<00:16, 6.71it/s][A 89% 934/1045 [01:05<00:16, 6.75it/s][A 89% 935/1045 [01:05<00:16, 6.77it/s][A 90% 936/1045 [01:05<00:16, 6.79it/s][A 90% 937/1045 [01:05<00:15, 6.78it/s][A 90% 938/1045 [01:05<00:15, 6.77it/s][A 90% 939/1045 [01:05<00:15, 6.79it/s][A 90% 940/1045 [01:05<00:15, 6.80it/s][A 90% 941/1045 [01:06<00:15, 6.80it/s][A 90% 942/1045 [01:06<00:15, 6.81it/s][A 90% 943/1045 [01:06<00:15, 6.71it/s][A 90% 944/1045 [01:06<00:15, 6.73it/s][A 90% 945/1045 [01:06<00:14, 6.75it/s][A 91% 946/1045 [01:06<00:14, 6.76it/s][A 91% 947/1045 [01:06<00:14, 6.77it/s][A 91% 948/1045 [01:07<00:14, 6.78it/s][A 91% 949/1045 [01:07<00:14, 6.80it/s][A 91% 950/1045 [01:07<00:13, 6.80it/s][A 91% 951/1045 [01:07<00:13, 6.80it/s][A 91% 952/1045 [01:07<00:13, 6.80it/s][A 91% 953/1045 [01:07<00:13, 6.81it/s][A 91% 954/1045 [01:07<00:13, 6.81it/s][A 91% 955/1045 [01:08<00:13, 6.80it/s][A 91% 956/1045 [01:08<00:13, 6.79it/s][A 92% 957/1045 [01:08<00:12, 6.79it/s][A 92% 958/1045 [01:08<00:12, 6.78it/s][A 92% 959/1045 [01:08<00:12, 6.79it/s][A 92% 960/1045 [01:08<00:12, 6.79it/s][A 92% 961/1045 [01:09<00:12, 6.80it/s][A 92% 962/1045 [01:09<00:12, 6.80it/s][A 92% 963/1045 [01:09<00:12, 6.80it/s][A 92% 964/1045 [01:09<00:11, 6.80it/s][A 92% 965/1045 [01:09<00:11, 6.80it/s][A 92% 966/1045 [01:09<00:11, 6.80it/s][A 93% 967/1045 [01:09<00:11, 6.81it/s][A 93% 968/1045 [01:10<00:11, 6.82it/s][A 93% 969/1045 [01:10<00:11, 6.81it/s][A 93% 970/1045 [01:10<00:11, 6.81it/s][A 93% 971/1045 [01:10<00:10, 6.80it/s][A 93% 972/1045 [01:10<00:10, 6.80it/s][A 93% 973/1045 [01:10<00:10, 6.80it/s][A 93% 974/1045 [01:10<00:10, 6.81it/s][A 93% 975/1045 [01:11<00:10, 6.82it/s][A 93% 976/1045 [01:11<00:10, 6.81it/s][A 93% 977/1045 [01:11<00:09, 
6.82it/s][A 94% 978/1045 [01:11<00:09, 6.82it/s][A 94% 979/1045 [01:11<00:09, 6.81it/s][A 94% 980/1045 [01:11<00:09, 6.81it/s][A 94% 981/1045 [01:11<00:09, 6.81it/s][A 94% 982/1045 [01:12<00:09, 6.82it/s][A 94% 983/1045 [01:12<00:09, 6.82it/s][A 94% 984/1045 [01:12<00:08, 6.82it/s][A 94% 985/1045 [01:12<00:08, 6.82it/s][A 94% 986/1045 [01:12<00:08, 6.82it/s][A 94% 987/1045 [01:12<00:08, 6.82it/s][A 95% 988/1045 [01:12<00:08, 6.81it/s][A 95% 989/1045 [01:13<00:08, 6.80it/s][A 95% 990/1045 [01:13<00:08, 6.81it/s][A 95% 991/1045 [01:13<00:07, 6.82it/s][A 95% 992/1045 [01:13<00:07, 6.82it/s][A 95% 993/1045 [01:13<00:07, 6.82it/s][A 95% 994/1045 [01:13<00:07, 6.82it/s][A 95% 995/1045 [01:13<00:07, 6.82it/s][A 95% 996/1045 [01:14<00:07, 6.81it/s][A 95% 997/1045 [01:14<00:07, 6.82it/s][A 96% 998/1045 [01:14<00:06, 6.82it/s][A 96% 999/1045 [01:14<00:06, 6.82it/s][A 96% 1000/1045 [01:14<00:06, 6.83it/s][A [A{'loss': 0.2421, 'learning_rate': 8.612440191387561e-07, 'epoch': 4.78} 96% 1000/1045 [01:14<00:06, 6.83it/s][A[INFO|trainer.py:2709] 2023-02-10 21:24:23,900 >> Saving model checkpoint to out/imdb-5k/gpt2/checkpoint-1000 [INFO|configuration_utils.py:453] 2023-02-10 21:24:23,901 >> Configuration saved in out/imdb-5k/gpt2/checkpoint-1000/config.json [INFO|modeling_utils.py:1704] 2023-02-10 21:24:24,615 >> Model weights saved in out/imdb-5k/gpt2/checkpoint-1000/pytorch_model.bin [INFO|tokenization_utils_base.py:2160] 2023-02-10 21:24:24,616 >> tokenizer config file saved in out/imdb-5k/gpt2/checkpoint-1000/tokenizer_config.json [INFO|tokenization_utils_base.py:2167] 2023-02-10 21:24:24,616 >> Special tokens file saved in out/imdb-5k/gpt2/checkpoint-1000/special_tokens_map.json 96% 1001/1045 [01:17<00:36, 1.20it/s][A 96% 1002/1045 [01:17<00:26, 1.59it/s][A 96% 1003/1045 [01:17<00:20, 2.07it/s][A 96% 1004/1045 [01:17<00:15, 2.62it/s][A 96% 1005/1045 [01:17<00:12, 3.21it/s][A 96% 1006/1045 [01:17<00:10, 3.81it/s][A 96% 1007/1045 [01:18<00:08, 4.39it/s][A 96% 1008/1045 
[01:18<00:07, 4.90it/s][A 97% 1009/1045 [01:18<00:06, 5.36it/s][A 97% 1010/1045 [01:18<00:06, 5.64it/s][A 97% 1011/1045 [01:18<00:05, 5.93it/s][A 97% 1012/1045 [01:18<00:05, 6.17it/s][A 97% 1013/1045 [01:18<00:05, 6.35it/s][A 97% 1014/1045 [01:19<00:04, 6.49it/s][A 97% 1015/1045 [01:19<00:04, 6.57it/s][A 97% 1016/1045 [01:19<00:04, 6.64it/s][A 97% 1017/1045 [01:19<00:04, 6.70it/s][A 97% 1018/1045 [01:19<00:04, 6.74it/s][A 98% 1019/1045 [01:19<00:03, 6.77it/s][A 98% 1020/1045 [01:19<00:03, 6.79it/s][A 98% 1021/1045 [01:20<00:03, 6.79it/s][A 98% 1022/1045 [01:20<00:03, 6.81it/s][A 98% 1023/1045 [01:20<00:03, 6.81it/s][A 98% 1024/1045 [01:20<00:03, 6.80it/s][A 98% 1025/1045 [01:20<00:02, 6.81it/s][A 98% 1026/1045 [01:20<00:02, 6.81it/s][A 98% 1027/1045 [01:20<00:02, 6.81it/s][A 98% 1028/1045 [01:21<00:02, 6.81it/s][A 98% 1029/1045 [01:21<00:02, 6.81it/s][A 99% 1030/1045 [01:21<00:02, 6.80it/s][A 99% 1031/1045 [01:21<00:02, 6.81it/s][A 99% 1032/1045 [01:21<00:01, 6.81it/s][A 99% 1033/1045 [01:21<00:01, 6.81it/s][A 99% 1034/1045 [01:22<00:01, 6.81it/s][A 99% 1035/1045 [01:22<00:01, 6.81it/s][A 99% 1036/1045 [01:22<00:01, 6.70it/s][A 99% 1037/1045 [01:22<00:01, 6.73it/s][A 99% 1038/1045 [01:22<00:01, 6.75it/s][A 99% 1039/1045 [01:22<00:00, 6.77it/s][A 100% 1040/1045 [01:22<00:00, 6.78it/s][A 100% 1041/1045 [01:23<00:00, 6.79it/s][A 100% 1042/1045 [01:23<00:00, 6.80it/s][A 100% 1043/1045 [01:23<00:00, 6.80it/s][A 100% 1044/1045 [01:23<00:00, 6.81it/s][A[INFO|trainer.py:1901] 2023-02-10 21:24:32,729 >> Training completed. 
Do not forget to share your model on huggingface.co/models =) [A{'train_runtime': 83.5689, 'train_samples_per_second': 299.094, 'train_steps_per_second': 12.505, 'train_loss': 0.1263527454942037, 'epoch': 5.0} 100% 1045/1045 [01:23<00:00, 12.51it/s] [INFO|trainer.py:2709] 2023-02-10 21:24:32,732 >> Saving model checkpoint to out/imdb-5k/gpt2 [INFO|configuration_utils.py:453] 2023-02-10 21:24:32,733 >> Configuration saved in out/imdb-5k/gpt2/config.json [INFO|modeling_utils.py:1704] 2023-02-10 21:24:33,797 >> Model weights saved in out/imdb-5k/gpt2/pytorch_model.bin [INFO|tokenization_utils_base.py:2160] 2023-02-10 21:24:33,797 >> tokenizer config file saved in out/imdb-5k/gpt2/tokenizer_config.json [INFO|tokenization_utils_base.py:2167] 2023-02-10 21:24:33,798 >> Special tokens file saved in out/imdb-5k/gpt2/special_tokens_map.json ***** train metrics ***** epoch = 5.0 train_loss = 0.1264 train_runtime = 0:01:23.56 train_samples = 4999 train_samples_per_second = 299.094 train_steps_per_second = 12.505 INFO:__main__:*** Evaluate *** [INFO|trainer.py:710] 2023-02-10 21:24:33,908 >> The following columns in the evaluation set don't have a corresponding argument in `GPT2ForSequenceClassification.forward` and have been ignored: text. If text are not expected by `GPT2ForSequenceClassification.forward`, you can safely ignore this message. 
[INFO|trainer.py:2964] 2023-02-10 21:24:33,910 >> ***** Running Evaluation ***** [INFO|trainer.py:2966] 2023-02-10 21:24:33,910 >> Num examples = 1274 [INFO|trainer.py:2969] 2023-02-10 21:24:33,910 >> Batch size = 24 100% 54/54 [00:02<00:00, 21.70it/s] ***** eval metrics ***** epoch = 5.0 eval_accuracy = 0.9278 eval_loss = 0.1801 eval_runtime = 0:00:02.53 eval_samples = 1274 eval_samples_per_second = 502.583 eval_steps_per_second = 21.303 INFO:__main__:*** Predict *** [INFO|trainer.py:710] 2023-02-10 21:24:36,448 >> The following columns in the test set don't have a corresponding argument in `GPT2ForSequenceClassification.forward` and have been ignored: text. If text are not expected by `GPT2ForSequenceClassification.forward`, you can safely ignore this message. [INFO|trainer.py:2964] 2023-02-10 21:24:36,449 >> ***** Running Prediction ***** [INFO|trainer.py:2966] 2023-02-10 21:24:36,450 >> Num examples = 1278 [INFO|trainer.py:2969] 2023-02-10 21:24:36,450 >> Batch size = 24 100% 54/54 [00:02<00:00, 22.06it/s] INFO:__main__:***** Predict results None ***** [INFO|modelcard.py:449] 2023-02-10 21:24:39,131 >> Dropping the following result as it does not have all the necessary fields: {'task': {'name': 'Text Classification', 'type': 'text-classification'}, 'metrics': [{'name': 'Accuracy', 'type': 'accuracy', 'value': 0.9277864694595337}]}
from transformers import GPT2Config, GPT2Tokenizer, GPT2Model, Trainer, TrainingArguments, GPT2ForSequenceClassification
from datasets import load_dataset
import torch
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
# --- Tokenizer / model setup --------------------------------------------------
# NOTE(review): `config` is never passed to `from_pretrained` below, so it has
# no effect on the loaded model. It is kept only so the name stays defined.
config = GPT2Config(vocab_size=2048, return_token_type_ids=False)

# Number of target classes. The "emotion" dataset loaded later in this script
# is a six-way task according to its dataset card -- TODO confirm if the
# dataset is swapped. Without this the classification head defaults to 2
# outputs and any label >= 2 is out of range for the loss.
NUM_LABELS = 6

tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
# GPT-2 ships without a padding token; register one so batches can be padded.
tokenizer.add_special_tokens({'pad_token': '[PAD]'})

model = GPT2ForSequenceClassification.from_pretrained('gpt2', num_labels=NUM_LABELS)
# '[PAD]' was appended to the vocabulary, so its id lies one past the end of
# the pretrained embedding matrix. Grow the embeddings to match the tokenizer,
# otherwise every padded batch indexes out of bounds (on GPU this surfaces as
# an opaque CUDA/cuBLAS error rather than an IndexError).
model.resize_token_embeddings(len(tokenizer))
# The sequence-classification head needs the pad id to locate the last
# non-padding token of each row when the batch size is > 1.
model.config.pad_token_id = tokenizer.pad_token_id
def tokenization(batched_text):
    """Tokenize the ``'text'`` column of a batch for use with ``Dataset.map``.

    Args:
        batched_text: Mapping with a ``'text'`` entry holding the strings to
            encode (presumably a batch dict supplied by ``datasets`` when
            ``batched=True`` -- verify against the caller).

    Returns:
        The tokenizer's encoding of the batch as PyTorch tensors, padded to the
        longest sequence in the batch.
    """
    return tokenizer(
        batched_text['text'],
        return_tensors='pt',
        padding=True,
        # Cap sequences at the tokenizer's maximum length so an unusually long
        # text cannot overflow the model's position embeddings.
        truncation=True,
    )
def compute_metrics(pred):
    """Compute accuracy and macro-averaged precision/recall/F1 for an evaluation pass.

    Args:
        pred: Object exposing ``label_ids`` (gold labels) and ``predictions``
            (per-class scores; the arg-max over the last axis is taken as the
            predicted class). Presumably a ``transformers.EvalPrediction``
            handed over by ``Trainer`` -- verify against the caller.

    Returns:
        dict with keys ``'accuracy'``, ``'f1'``, ``'precision'`` and ``'recall'``.
    """
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    # Macro averaging is used on purpose: with 'micro' averaging on a
    # single-label multiclass task, precision, recall and F1 all collapse to
    # the accuracy value, so the four reported numbers would be identical.
    # zero_division=0 keeps the score at 0 (without a warning flood) for
    # classes the model never predicts, e.g. early in training.
    precision, recall, f1, _ = precision_recall_fscore_support(
        labels, preds, average='macro', zero_division=0
    )
    acc = accuracy_score(labels, preds)
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }
# --- Data ---------------------------------------------------------------------
dataset = load_dataset("emotion")
train_data, test_data, eval_data = (
    dataset[split_name] for split_name in ("train", "test", "validation")
)

# Each split is tokenized as ONE batch (batch_size == split size) so that
# `padding=True` pads every row of the split to the same length.
# `test_data` is intentionally left untokenized here.
train_data = train_data.map(
    tokenization,
    batched=True,
    batch_size=len(train_data),
)
train_data.set_format('torch', columns=['input_ids', 'attention_mask', 'label'])

eval_data = eval_data.map(
    tokenization,
    batched=True,
    batch_size=len(eval_data),
)
eval_data.set_format('torch', columns=['input_ids', 'attention_mask', 'label'])

# --- Training configuration -----------------------------------------------------
training_args = TrainingArguments(
    # bookkeeping
    output_dir="./output",
    run_name='gpt-2-classification',
    logging_steps=4,
    disable_tqdm=False,
    # schedule: evaluate and checkpoint once per epoch, keep the best checkpoint
    num_train_epochs=3,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    # batching: 8 samples per step, gradients accumulated over 16 steps
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    gradient_accumulation_steps=16,
    dataloader_num_workers=2,
    # optimisation
    warmup_steps=10,
    weight_decay=0.01,
    fp16=True,
)

# --- Train ----------------------------------------------------------------------
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_data,
    eval_dataset=eval_data,
    compute_metrics=compute_metrics,
)
trainer.train()
# --- Manual accuracy check on the raw (untokenized) test split ------------------
i = 0          # number of test examples processed so far
sum_preds = 0  # number of correct predictions so far
model = model.to('cpu')
# Switch to inference mode: disables dropout so predictions are deterministic.
model.eval()
# No gradients are needed for scoring; skipping autograd saves memory and time.
with torch.no_grad():
    for line in test_data:
        inputs = tokenizer(line.get('text'), return_tensors="pt")
        # No labels are passed: the loss is not needed here, and feeding a
        # made-up label only to shift the logits to index 1 was misleading.
        outputs = model(**inputs)
        a = int(outputs.logits.argmax(dim=-1))  # predicted class id (batch of 1)
        b = line.get('label')                   # gold class id
        print(i)
        i += 1
        sum_preds += int(a == b)
if i:
    print(f"ACCURACY: {(sum_preds/i * 100)}")
else:
    # Avoid a ZeroDivisionError when the test split is empty.
    print("ACCURACY: n/a (test split is empty)")
Downloading (…)olve/main/vocab.json: 0%| | 0.00/1.04M [00:00<?, ?B/s]
Downloading (…)olve/main/merges.txt: 0%| | 0.00/456k [00:00<?, ?B/s]
Downloading (…)lve/main/config.json: 0%| | 0.00/665 [00:00<?, ?B/s]
Downloading (…)"pytorch_model.bin";: 0%| | 0.00/548M [00:00<?, ?B/s]
Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at gpt2 and are newly initialized: ['score.weight'] You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Downloading builder script: 0%| | 0.00/3.97k [00:00<?, ?B/s]
Downloading metadata: 0%| | 0.00/3.28k [00:00<?, ?B/s]
Downloading readme: 0%| | 0.00/8.78k [00:00<?, ?B/s]
WARNING:datasets.builder:No config specified, defaulting to: emotion/split
Downloading and preparing dataset emotion/split to /root/.cache/huggingface/datasets/emotion/split/1.0.0/cca5efe2dfeb58c1d098e0f9eeb200e9927d889b5a03c67097275dfb5fe463bd...
Downloading data files: 0%| | 0/3 [00:00<?, ?it/s]
Downloading data: 0%| | 0.00/592k [00:00<?, ?B/s]
Downloading data: 0%| | 0.00/74.0k [00:00<?, ?B/s]
Downloading data: 0%| | 0.00/74.9k [00:00<?, ?B/s]
Extracting data files: 0%| | 0/3 [00:00<?, ?it/s]
Generating train split: 0%| | 0/16000 [00:00<?, ? examples/s]
Generating validation split: 0%| | 0/2000 [00:00<?, ? examples/s]
Generating test split: 0%| | 0/2000 [00:00<?, ? examples/s]
Dataset emotion downloaded and prepared to /root/.cache/huggingface/datasets/emotion/split/1.0.0/cca5efe2dfeb58c1d098e0f9eeb200e9927d889b5a03c67097275dfb5fe463bd. Subsequent calls will reuse this data.
0%| | 0/3 [00:00<?, ?it/s]
0%| | 0/1 [00:00<?, ?ba/s]
0%| | 0/1 [00:00<?, ?ba/s]
Using cuda_amp half precision backend The following columns in the training set don't have a corresponding argument in `GPT2ForSequenceClassification.forward` and have been ignored: text. If text are not expected by `GPT2ForSequenceClassification.forward`, you can safely ignore this message. /usr/local/lib/python3.8/dist-packages/transformers/optimization.py:306: FutureWarning: This implementation of AdamW is deprecated and will be removed in a future version. Use the PyTorch implementation torch.optim.AdamW instead, or set `no_deprecation_warning=True` to disable this warning warnings.warn( ***** Running training ***** Num examples = 16000 Num Epochs = 3 Instantaneous batch size per device = 8 Total train batch size (w. parallel, distributed & accumulation) = 128 Gradient Accumulation steps = 16 Total optimization steps = 375 Number of trainable parameters = 124441344
[0;31m---------------------------------------------------------------------------[0m [0;31mRuntimeError[0m Traceback (most recent call last) [0;32m<ipython-input-2-c792703afcef>[0m in [0;36m<module>[0;34m[0m [1;32m 66[0m ) [1;32m 67[0m [0;34m[0m[0m [0;32m---> 68[0;31m [0mtrainer[0m[0;34m.[0m[0mtrain[0m[0;34m([0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m [0m[1;32m 69[0m [0mi[0m [0;34m=[0m [0;36m0[0m[0;34m[0m[0;34m[0m[0m [1;32m 70[0m [0msum_preds[0m [0;34m=[0m [0;36m0[0m[0;34m[0m[0;34m[0m[0m [0;32m/usr/local/lib/python3.8/dist-packages/transformers/trainer.py[0m in [0;36mtrain[0;34m(self, resume_from_checkpoint, trial, ignore_keys_for_eval, **kwargs)[0m [1;32m 1541[0m [0mself[0m[0;34m.[0m[0m_inner_training_loop[0m[0;34m,[0m [0mself[0m[0;34m.[0m[0m_train_batch_size[0m[0;34m,[0m [0margs[0m[0;34m.[0m[0mauto_find_batch_size[0m[0;34m[0m[0;34m[0m[0m [1;32m 1542[0m ) [0;32m-> 1543[0;31m return inner_training_loop( [0m[1;32m 1544[0m [0margs[0m[0;34m=[0m[0margs[0m[0;34m,[0m[0;34m[0m[0;34m[0m[0m [1;32m 1545[0m [0mresume_from_checkpoint[0m[0;34m=[0m[0mresume_from_checkpoint[0m[0;34m,[0m[0;34m[0m[0;34m[0m[0m [0;32m/usr/local/lib/python3.8/dist-packages/transformers/trainer.py[0m in [0;36m_inner_training_loop[0;34m(self, batch_size, args, resume_from_checkpoint, trial, ignore_keys_for_eval)[0m [1;32m 1789[0m [0mtr_loss_step[0m [0;34m=[0m [0mself[0m[0;34m.[0m[0mtraining_step[0m[0;34m([0m[0mmodel[0m[0;34m,[0m [0minputs[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m [1;32m 1790[0m [0;32melse[0m[0;34m:[0m[0;34m[0m[0;34m[0m[0m [0;32m-> 1791[0;31m [0mtr_loss_step[0m [0;34m=[0m [0mself[0m[0;34m.[0m[0mtraining_step[0m[0;34m([0m[0mmodel[0m[0;34m,[0m [0minputs[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m [0m[1;32m 1792[0m [0;34m[0m[0m [1;32m 1793[0m if ( [0;32m/usr/local/lib/python3.8/dist-packages/transformers/trainer.py[0m in [0;36mtraining_step[0;34m(self, model, inputs)[0m [1;32m 2537[0m [0;34m[0m[0m [1;32m 2538[0m [0;32mwith[0m 
[0mself[0m[0;34m.[0m[0mcompute_loss_context_manager[0m[0;34m([0m[0;34m)[0m[0;34m:[0m[0;34m[0m[0;34m[0m[0m [0;32m-> 2539[0;31m [0mloss[0m [0;34m=[0m [0mself[0m[0;34m.[0m[0mcompute_loss[0m[0;34m([0m[0mmodel[0m[0;34m,[0m [0minputs[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m [0m[1;32m 2540[0m [0;34m[0m[0m [1;32m 2541[0m [0;32mif[0m [0mself[0m[0;34m.[0m[0margs[0m[0;34m.[0m[0mn_gpu[0m [0;34m>[0m [0;36m1[0m[0;34m:[0m[0;34m[0m[0;34m[0m[0m [0;32m/usr/local/lib/python3.8/dist-packages/transformers/trainer.py[0m in [0;36mcompute_loss[0;34m(self, model, inputs, return_outputs)[0m [1;32m 2569[0m [0;32melse[0m[0;34m:[0m[0;34m[0m[0;34m[0m[0m [1;32m 2570[0m [0mlabels[0m [0;34m=[0m [0;32mNone[0m[0;34m[0m[0;34m[0m[0m [0;32m-> 2571[0;31m [0moutputs[0m [0;34m=[0m [0mmodel[0m[0;34m([0m[0;34m**[0m[0minputs[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m [0m[1;32m 2572[0m [0;31m# Save past state if it exists[0m[0;34m[0m[0;34m[0m[0;34m[0m[0m [1;32m 2573[0m [0;31m# TODO: this needs to be fixed and made cleaner later.[0m[0;34m[0m[0;34m[0m[0;34m[0m[0m [0;32m/usr/local/lib/python3.8/dist-packages/torch/nn/modules/module.py[0m in [0;36m_call_impl[0;34m(self, *input, **kwargs)[0m [1;32m 1192[0m if not (self._backward_hooks or self._forward_hooks or self._forward_pre_hooks or _global_backward_hooks [1;32m 1193[0m or _global_forward_hooks or _global_forward_pre_hooks): [0;32m-> 1194[0;31m [0;32mreturn[0m [0mforward_call[0m[0;34m([0m[0;34m*[0m[0minput[0m[0;34m,[0m [0;34m**[0m[0mkwargs[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m [0m[1;32m 1195[0m [0;31m# Do not call functions when jit is used[0m[0;34m[0m[0;34m[0m[0;34m[0m[0m [1;32m 1196[0m [0mfull_backward_hooks[0m[0;34m,[0m [0mnon_full_backward_hooks[0m [0;34m=[0m [0;34m[[0m[0;34m][0m[0;34m,[0m [0;34m[[0m[0;34m][0m[0;34m[0m[0;34m[0m[0m [0;32m/usr/local/lib/python3.8/dist-packages/transformers/models/gpt2/modeling_gpt2.py[0m in [0;36mforward[0;34m(self, input_ids, past_key_values, attention_mask, token_type_ids, position_ids, head_mask, inputs_embeds, labels, use_cache, 
output_attentions, output_hidden_states, return_dict)[0m [1;32m 1368[0m [0mreturn_dict[0m [0;34m=[0m [0mreturn_dict[0m [0;32mif[0m [0mreturn_dict[0m [0;32mis[0m [0;32mnot[0m [0;32mNone[0m [0;32melse[0m [0mself[0m[0;34m.[0m[0mconfig[0m[0;34m.[0m[0muse_return_dict[0m[0;34m[0m[0;34m[0m[0m [1;32m 1369[0m [0;34m[0m[0m [0;32m-> 1370[0;31m transformer_outputs = self.transformer( [0m[1;32m 1371[0m [0minput_ids[0m[0;34m,[0m[0;34m[0m[0;34m[0m[0m [1;32m 1372[0m [0mpast_key_values[0m[0;34m=[0m[0mpast_key_values[0m[0;34m,[0m[0;34m[0m[0;34m[0m[0m [0;32m/usr/local/lib/python3.8/dist-packages/torch/nn/modules/module.py[0m in [0;36m_call_impl[0;34m(self, *input, **kwargs)[0m [1;32m 1192[0m if not (self._backward_hooks or self._forward_hooks or self._forward_pre_hooks or _global_backward_hooks [1;32m 1193[0m or _global_forward_hooks or _global_forward_pre_hooks): [0;32m-> 1194[0;31m [0;32mreturn[0m [0mforward_call[0m[0;34m([0m[0;34m*[0m[0minput[0m[0;34m,[0m [0;34m**[0m[0mkwargs[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m [0m[1;32m 1195[0m [0;31m# Do not call functions when jit is used[0m[0;34m[0m[0;34m[0m[0;34m[0m[0m [1;32m 1196[0m [0mfull_backward_hooks[0m[0;34m,[0m [0mnon_full_backward_hooks[0m [0;34m=[0m [0;34m[[0m[0;34m][0m[0;34m,[0m [0;34m[[0m[0;34m][0m[0;34m[0m[0;34m[0m[0m [0;32m/usr/local/lib/python3.8/dist-packages/transformers/models/gpt2/modeling_gpt2.py[0m in [0;36mforward[0;34m(self, input_ids, past_key_values, attention_mask, token_type_ids, position_ids, head_mask, inputs_embeds, encoder_hidden_states, encoder_attention_mask, use_cache, output_attentions, output_hidden_states, return_dict)[0m [1;32m 885[0m ) [1;32m 886[0m [0;32melse[0m[0;34m:[0m[0;34m[0m[0;34m[0m[0m [0;32m--> 887[0;31m outputs = block( [0m[1;32m 888[0m [0mhidden_states[0m[0;34m,[0m[0;34m[0m[0;34m[0m[0m [1;32m 889[0m [0mlayer_past[0m[0;34m=[0m[0mlayer_past[0m[0;34m,[0m[0;34m[0m[0;34m[0m[0m [0;32m/usr/local/lib/python3.8/dist-packages/torch/nn/modules/module.py[0m in [0;36m_call_impl[0;34m(self, *input, 
**kwargs)[0m [1;32m 1192[0m if not (self._backward_hooks or self._forward_hooks or self._forward_pre_hooks or _global_backward_hooks [1;32m 1193[0m or _global_forward_hooks or _global_forward_pre_hooks): [0;32m-> 1194[0;31m [0;32mreturn[0m [0mforward_call[0m[0;34m([0m[0;34m*[0m[0minput[0m[0;34m,[0m [0;34m**[0m[0mkwargs[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m [0m[1;32m 1195[0m [0;31m# Do not call functions when jit is used[0m[0;34m[0m[0;34m[0m[0;34m[0m[0m [1;32m 1196[0m [0mfull_backward_hooks[0m[0;34m,[0m [0mnon_full_backward_hooks[0m [0;34m=[0m [0;34m[[0m[0;34m][0m[0;34m,[0m [0;34m[[0m[0;34m][0m[0;34m[0m[0;34m[0m[0m [0;32m/usr/local/lib/python3.8/dist-packages/transformers/models/gpt2/modeling_gpt2.py[0m in [0;36mforward[0;34m(self, hidden_states, layer_past, attention_mask, head_mask, encoder_hidden_states, encoder_attention_mask, use_cache, output_attentions)[0m [1;32m 386[0m [0mresidual[0m [0;34m=[0m [0mhidden_states[0m[0;34m[0m[0;34m[0m[0m [1;32m 387[0m [0mhidden_states[0m [0;34m=[0m [0mself[0m[0;34m.[0m[0mln_1[0m[0;34m([0m[0mhidden_states[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m [0;32m--> 388[0;31m attn_outputs = self.attn( [0m[1;32m 389[0m [0mhidden_states[0m[0;34m,[0m[0;34m[0m[0;34m[0m[0m [1;32m 390[0m [0mlayer_past[0m[0;34m=[0m[0mlayer_past[0m[0;34m,[0m[0;34m[0m[0;34m[0m[0m [0;32m/usr/local/lib/python3.8/dist-packages/torch/nn/modules/module.py[0m in [0;36m_call_impl[0;34m(self, *input, **kwargs)[0m [1;32m 1192[0m if not (self._backward_hooks or self._forward_hooks or self._forward_pre_hooks or _global_backward_hooks [1;32m 1193[0m or _global_forward_hooks or _global_forward_pre_hooks): [0;32m-> 1194[0;31m [0;32mreturn[0m [0mforward_call[0m[0;34m([0m[0;34m*[0m[0minput[0m[0;34m,[0m [0;34m**[0m[0mkwargs[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m [0m[1;32m 1195[0m [0;31m# Do not call functions when jit is used[0m[0;34m[0m[0;34m[0m[0;34m[0m[0m [1;32m 1196[0m [0mfull_backward_hooks[0m[0;34m,[0m [0mnon_full_backward_hooks[0m [0;34m=[0m [0;34m[[0m[0;34m][0m[0;34m,[0m 
[0;34m[[0m[0;34m][0m[0;34m[0m[0;34m[0m[0m [0;32m/usr/local/lib/python3.8/dist-packages/transformers/models/gpt2/modeling_gpt2.py[0m in [0;36mforward[0;34m(self, hidden_states, layer_past, attention_mask, head_mask, encoder_hidden_states, encoder_attention_mask, use_cache, output_attentions)[0m [1;32m 308[0m [0mattention_mask[0m [0;34m=[0m [0mencoder_attention_mask[0m[0;34m[0m[0;34m[0m[0m [1;32m 309[0m [0;32melse[0m[0;34m:[0m[0;34m[0m[0;34m[0m[0m [0;32m--> 310[0;31m [0mquery[0m[0;34m,[0m [0mkey[0m[0;34m,[0m [0mvalue[0m [0;34m=[0m [0mself[0m[0;34m.[0m[0mc_attn[0m[0;34m([0m[0mhidden_states[0m[0;34m)[0m[0;34m.[0m[0msplit[0m[0;34m([0m[0mself[0m[0;34m.[0m[0msplit_size[0m[0;34m,[0m [0mdim[0m[0;34m=[0m[0;36m2[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m [0m[1;32m 311[0m [0;34m[0m[0m [1;32m 312[0m [0mquery[0m [0;34m=[0m [0mself[0m[0;34m.[0m[0m_split_heads[0m[0;34m([0m[0mquery[0m[0;34m,[0m [0mself[0m[0;34m.[0m[0mnum_heads[0m[0;34m,[0m [0mself[0m[0;34m.[0m[0mhead_dim[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m [0;32m/usr/local/lib/python3.8/dist-packages/torch/nn/modules/module.py[0m in [0;36m_call_impl[0;34m(self, *input, **kwargs)[0m [1;32m 1192[0m if not (self._backward_hooks or self._forward_hooks or self._forward_pre_hooks or _global_backward_hooks [1;32m 1193[0m or _global_forward_hooks or _global_forward_pre_hooks): [0;32m-> 1194[0;31m [0;32mreturn[0m [0mforward_call[0m[0;34m([0m[0;34m*[0m[0minput[0m[0;34m,[0m [0;34m**[0m[0mkwargs[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m [0m[1;32m 1195[0m [0;31m# Do not call functions when jit is used[0m[0;34m[0m[0;34m[0m[0;34m[0m[0m [1;32m 1196[0m [0mfull_backward_hooks[0m[0;34m,[0m [0mnon_full_backward_hooks[0m [0;34m=[0m [0;34m[[0m[0;34m][0m[0;34m,[0m [0;34m[[0m[0;34m][0m[0;34m[0m[0;34m[0m[0m [0;32m/usr/local/lib/python3.8/dist-packages/transformers/pytorch_utils.py[0m in [0;36mforward[0;34m(self, x)[0m [1;32m 113[0m [0;32mdef[0m [0mforward[0m[0;34m([0m[0mself[0m[0;34m,[0m [0mx[0m[0;34m)[0m[0;34m:[0m[0;34m[0m[0;34m[0m[0m [1;32m 114[0m [0msize_out[0m [0;34m=[0m 
[0mx[0m[0;34m.[0m[0msize[0m[0;34m([0m[0;34m)[0m[0;34m[[0m[0;34m:[0m[0;34m-[0m[0;36m1[0m[0;34m][0m [0;34m+[0m [0;34m([0m[0mself[0m[0;34m.[0m[0mnf[0m[0;34m,[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m [0;32m--> 115[0;31m [0mx[0m [0;34m=[0m [0mtorch[0m[0;34m.[0m[0maddmm[0m[0;34m([0m[0mself[0m[0;34m.[0m[0mbias[0m[0;34m,[0m [0mx[0m[0;34m.[0m[0mview[0m[0;34m([0m[0;34m-[0m[0;36m1[0m[0;34m,[0m [0mx[0m[0;34m.[0m[0msize[0m[0;34m([0m[0;34m-[0m[0;36m1[0m[0;34m)[0m[0;34m)[0m[0;34m,[0m [0mself[0m[0;34m.[0m[0mweight[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m [0m[1;32m 116[0m [0mx[0m [0;34m=[0m [0mx[0m[0;34m.[0m[0mview[0m[0;34m([0m[0msize_out[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m [1;32m 117[0m [0;32mreturn[0m [0mx[0m[0;34m[0m[0;34m[0m[0m [0;31mRuntimeError[0m: CUDA error: CUBLAS_STATUS_NOT_INITIALIZED when calling `cublasCreate(handle)`