Uczenie-Glebokie/uczenie_glebokie.ipynb

GPT2

!python run_glue_no_trainer.py \
  --model_name_or_path gpt2 \
  --train_file data/train.json  \
  --validation_file data/valid.json \
  --test_file data/test.json \
  --per_device_train_batch_size 24 \
  --per_device_eval_batch_size 24 \
  --max_length 128 \
  --learning_rate 2e-5 \
  --num_train_epochs 1 \
  --output_dir out/tweet/gpt2
02/16/2022 00:13:42 - INFO - __main__ - Distributed environment: NO
Num processes: 1
Process index: 0
Local process index: 0
Device: cpu
Use FP16 precision: False

02/16/2022 00:13:43 - WARNING - datasets.builder - Using custom data configuration default-67c9d932a627b7b8
02/16/2022 00:13:43 - WARNING - datasets.builder - Reusing dataset json (C:\Users\Foka\.cache\huggingface\datasets\json\default-67c9d932a627b7b8\0.0.0\c2d554c3377ea79c7664b93dc65d0803b45e3279000f993c7bfd18937fd7f426)

  0%|          | 0/3 [00:00<?, ?it/s]
100%|##########| 3/3 [00:00<00:00, 1491.40it/s]
loading configuration file https://huggingface.co/gpt2/resolve/main/config.json from cache at C:\Users\Foka/.cache\huggingface\transformers\fc674cd6907b4c9e933cb42d67662436b89fa9540a1f40d7c919d0109289ad01.7d2e0efa5ca20cef4fb199382111e9d3ad96fd77b849e1d4bed13a66e1336f51
Model config GPT2Config {
  "activation_function": "gelu_new",
  "architectures": [
    "GPT2LMHeadModel"
  ],
  "attn_pdrop": 0.1,
  "bos_token_id": 50256,
  "embd_pdrop": 0.1,
  "eos_token_id": 50256,
  "initializer_range": 0.02,
  "layer_norm_epsilon": 1e-05,
  "model_type": "gpt2",
  "n_ctx": 1024,
  "n_embd": 768,
  "n_head": 12,
  "n_inner": null,
  "n_layer": 12,
  "n_positions": 1024,
  "reorder_and_upcast_attn": false,
  "resid_pdrop": 0.1,
  "scale_attn_by_inverse_layer_idx": false,
  "scale_attn_weights": true,
  "summary_activation": null,
  "summary_first_dropout": 0.1,
  "summary_proj_to_labels": true,
  "summary_type": "cls_index",
  "summary_use_proj": true,
  "task_specific_params": {
    "text-generation": {
      "do_sample": true,
      "max_length": 50
    }
  },
  "transformers_version": "4.12.5",
  "use_cache": true,
  "vocab_size": 50257
}

Could not locate the tokenizer configuration file, will try to use the model config instead.
loading configuration file https://huggingface.co/gpt2/resolve/main/config.json from cache at C:\Users\Foka/.cache\huggingface\transformers\fc674cd6907b4c9e933cb42d67662436b89fa9540a1f40d7c919d0109289ad01.7d2e0efa5ca20cef4fb199382111e9d3ad96fd77b849e1d4bed13a66e1336f51
Model config GPT2Config {
  "activation_function": "gelu_new",
  "architectures": [
    "GPT2LMHeadModel"
  ],
  "attn_pdrop": 0.1,
  "bos_token_id": 50256,
  "embd_pdrop": 0.1,
  "eos_token_id": 50256,
  "initializer_range": 0.02,
  "layer_norm_epsilon": 1e-05,
  "model_type": "gpt2",
  "n_ctx": 1024,
  "n_embd": 768,
  "n_head": 12,
  "n_inner": null,
  "n_layer": 12,
  "n_positions": 1024,
  "reorder_and_upcast_attn": false,
  "resid_pdrop": 0.1,
  "scale_attn_by_inverse_layer_idx": false,
  "scale_attn_weights": true,
  "summary_activation": null,
  "summary_first_dropout": 0.1,
  "summary_proj_to_labels": true,
  "summary_type": "cls_index",
  "summary_use_proj": true,
  "task_specific_params": {
    "text-generation": {
      "do_sample": true,
      "max_length": 50
    }
  },
  "transformers_version": "4.12.5",
  "use_cache": true,
  "vocab_size": 50257
}

loading file https://huggingface.co/gpt2/resolve/main/vocab.json from cache at C:\Users\Foka/.cache\huggingface\transformers\684fe667923972fb57f6b4dcb61a3c92763ad89882f3da5da9866baf14f2d60f.c7ed1f96aac49e745788faa77ba0a26a392643a50bb388b9c04ff469e555241f
loading file https://huggingface.co/gpt2/resolve/main/merges.txt from cache at C:\Users\Foka/.cache\huggingface\transformers\c0c761a63004025aeadd530c4c27b860ec4ecbe8a00531233de21d865a402598.5d12962c5ee615a4c803841266e9c3be9a691a924f72d395d3a6c6c81157788b
loading file https://huggingface.co/gpt2/resolve/main/tokenizer.json from cache at C:\Users\Foka/.cache\huggingface\transformers\16a2f78023c8dc511294f0c97b5e10fde3ef9889ad6d11ffaa2a00714e73926e.cf2d0ecb83b6df91b3dbb53f1d1e4c311578bfd3aa0e04934215a49bf9898df0
loading file https://huggingface.co/gpt2/resolve/main/added_tokens.json from cache at None
loading file https://huggingface.co/gpt2/resolve/main/special_tokens_map.json from cache at None
loading file https://huggingface.co/gpt2/resolve/main/tokenizer_config.json from cache at None
loading configuration file https://huggingface.co/gpt2/resolve/main/config.json from cache at C:\Users\Foka/.cache\huggingface\transformers\fc674cd6907b4c9e933cb42d67662436b89fa9540a1f40d7c919d0109289ad01.7d2e0efa5ca20cef4fb199382111e9d3ad96fd77b849e1d4bed13a66e1336f51
Model config GPT2Config {
  "activation_function": "gelu_new",
  "architectures": [
    "GPT2LMHeadModel"
  ],
  "attn_pdrop": 0.1,
  "bos_token_id": 50256,
  "embd_pdrop": 0.1,
  "eos_token_id": 50256,
  "initializer_range": 0.02,
  "layer_norm_epsilon": 1e-05,
  "model_type": "gpt2",
  "n_ctx": 1024,
  "n_embd": 768,
  "n_head": 12,
  "n_inner": null,
  "n_layer": 12,
  "n_positions": 1024,
  "reorder_and_upcast_attn": false,
  "resid_pdrop": 0.1,
  "scale_attn_by_inverse_layer_idx": false,
  "scale_attn_weights": true,
  "summary_activation": null,
  "summary_first_dropout": 0.1,
  "summary_proj_to_labels": true,
  "summary_type": "cls_index",
  "summary_use_proj": true,
  "task_specific_params": {
    "text-generation": {
      "do_sample": true,
      "max_length": 50
    }
  },
  "transformers_version": "4.12.5",
  "use_cache": true,
  "vocab_size": 50257
}

02/16/2022 00:13:48 - INFO - __main__ - Return hidden states from model: False
02/16/2022 00:13:48 - INFO - __main__ - Using implementation from: AutoModelForSequenceClassification
loading weights file https://huggingface.co/gpt2/resolve/main/pytorch_model.bin from cache at C:\Users\Foka/.cache\huggingface\transformers\752929ace039baa8ef70fe21cdf9ab9445773d20e733cf693d667982e210837e.323c769945a351daa25546176f8208b3004b6f563438a7603e7932bae9025925
All model checkpoint weights were used when initializing GPT2ForSequenceClassification.

Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at gpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Using pad_token, but it is not set yet.
02/16/2022 00:13:50 - INFO - __main__ - Set PAD token to EOS: <|endoftext|>
02/16/2022 00:13:50 - WARNING - datasets.arrow_dataset - Loading cached processed dataset at C:\Users\Foka\.cache\huggingface\datasets\json\default-67c9d932a627b7b8\0.0.0\c2d554c3377ea79c7664b93dc65d0803b45e3279000f993c7bfd18937fd7f426\cache-18c6f53370629db4.arrow
02/16/2022 00:13:50 - WARNING - datasets.arrow_dataset - Loading cached processed dataset at C:\Users\Foka\.cache\huggingface\datasets\json\default-67c9d932a627b7b8\0.0.0\c2d554c3377ea79c7664b93dc65d0803b45e3279000f993c7bfd18937fd7f426\cache-da48038acf63cb08.arrow

Running tokenizer on dataset:   0%|          | 0/1 [00:00<?, ?ba/s]
Running tokenizer on dataset: 100%|##########| 1/1 [00:00<00:00, 55.70ba/s]
02/16/2022 00:13:50 - INFO - __main__ - Sample 2755 of the training set: {'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'input_ids': [2435, 284, 651, 3772, 0, 340, 338, 264, 3658, 6184, 108, 126, 253, 126, 240, 126, 246, 220, 220, 220, 1303, 82, 3658, 1303, 10464, 437, 220], 'labels': 0}.
02/16/2022 00:13:50 - INFO - __main__ - Sample 2054 of the training set: {'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'input_ids': [31, 7220, 2488, 7220, 220, 909, 1689, 1222, 696, 26, 8406, 268, 389, 262, 749, 1303, 17096, 11186, 220, 1893, 1222, 696, 26, 410, 79, 287, 2106, 13, 1303, 40954], 'labels': 1}.
02/16/2022 00:13:50 - INFO - __main__ - Sample 551 of the training set: {'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'input_ids': [31, 7220, 523, 318, 340, 572, 605, 326, 1303, 73, 15515, 389, 8720, 220, 287, 262, 2951, 286, 262, 1303, 8019, 83, 446, 14568, 30, 220], 'labels': 1}.
02/16/2022 00:13:51 - INFO - __main__ - ***** Running training *****
02/16/2022 00:13:51 - INFO - __main__ -   Num examples = 4742
02/16/2022 00:13:51 - INFO - __main__ -   Num Epochs = 1
02/16/2022 00:13:51 - INFO - __main__ -   Instantaneous batch size per device = 24
02/16/2022 00:13:51 - INFO - __main__ -   Total train batch size (w. parallel, distributed & accumulation) = 24
02/16/2022 00:13:51 - INFO - __main__ -   Gradient Accumulation steps = 1
02/16/2022 00:13:51 - INFO - __main__ -   Total optimization steps = 198

  0%|          | 0/198 [00:00<?, ?it/s]
  ...
100%|##########| 198/198 [12:25<00:00,  3.28s/it]
02/16/2022 00:26:49 - INFO - __main__ - Epoch 0: {'accuracy': 0.884}
02/16/2022 00:27:16 - INFO - __main__ - Test-set evaluation: {'accuracy': 0.864}
Configuration saved in out/tweet/gpt2\config.json
Model weights saved in out/tweet/gpt2\pytorch_model.bin
tokenizer config file saved in out/tweet/gpt2\tokenizer_config.json
Special tokens file saved in out/tweet/gpt2\special_tokens_map.json

100%|##########| 198/198 [13:25<00:00,  4.07s/it]
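
GPT-2 ships without a padding token, which is why the log above first warns "Using pad_token, but it is not set yet." and then reports "Set PAD token to EOS". A minimal sketch of that setup step with the stock transformers classes (num_labels=2 matches the 0/1 labels in the samples above); the script's own code may differ in detail:

from transformers import AutoTokenizer, AutoModelForSequenceClassification

tokenizer = AutoTokenizer.from_pretrained("gpt2")
model = AutoModelForSequenceClassification.from_pretrained("gpt2", num_labels=2)

# GPT-2 has no dedicated PAD token; reuse EOS so batches can be padded,
# and tell the model which id marks padding so it can locate the last real token.
tokenizer.pad_token = tokenizer.eos_token
model.config.pad_token_id = tokenizer.eos_token_id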

GPT2 version 2 (--freeze_model)

!python run_glue_no_trainer.py \
  --model_name_or_path gpt2 \
  --train_file data/train.json  \
  --validation_file data/valid.json \
  --test_file data/test.json \
  --per_device_train_batch_size 24 \
  --per_device_eval_batch_size 24 \
  --max_length 128 \
  --freeze_model \
  --learning_rate 2e-5 \
  --num_train_epochs 1 \
  --output_dir out/tweet/gpt2_version_2
02/16/2022 00:27:21 - INFO - __main__ - Distributed environment: NO
Num processes: 1
Process index: 0
Local process index: 0
Device: cpu
Use FP16 precision: False

02/16/2022 00:27:22 - WARNING - datasets.builder - Using custom data configuration default-67c9d932a627b7b8
02/16/2022 00:27:22 - WARNING - datasets.builder - Reusing dataset json (C:\Users\Foka\.cache\huggingface\datasets\json\default-67c9d932a627b7b8\0.0.0\c2d554c3377ea79c7664b93dc65d0803b45e3279000f993c7bfd18937fd7f426)

  0%|          | 0/3 [00:00<?, ?it/s]
100%|##########| 3/3 [00:00<00:00, 176.25it/s]
loading configuration file https://huggingface.co/gpt2/resolve/main/config.json from cache at C:\Users\Foka/.cache\huggingface\transformers\fc674cd6907b4c9e933cb42d67662436b89fa9540a1f40d7c919d0109289ad01.7d2e0efa5ca20cef4fb199382111e9d3ad96fd77b849e1d4bed13a66e1336f51
Model config GPT2Config {
  "activation_function": "gelu_new",
  "architectures": [
    "GPT2LMHeadModel"
  ],
  "attn_pdrop": 0.1,
  "bos_token_id": 50256,
  "embd_pdrop": 0.1,
  "eos_token_id": 50256,
  "initializer_range": 0.02,
  "layer_norm_epsilon": 1e-05,
  "model_type": "gpt2",
  "n_ctx": 1024,
  "n_embd": 768,
  "n_head": 12,
  "n_inner": null,
  "n_layer": 12,
  "n_positions": 1024,
  "reorder_and_upcast_attn": false,
  "resid_pdrop": 0.1,
  "scale_attn_by_inverse_layer_idx": false,
  "scale_attn_weights": true,
  "summary_activation": null,
  "summary_first_dropout": 0.1,
  "summary_proj_to_labels": true,
  "summary_type": "cls_index",
  "summary_use_proj": true,
  "task_specific_params": {
    "text-generation": {
      "do_sample": true,
      "max_length": 50
    }
  },
  "transformers_version": "4.12.5",
  "use_cache": true,
  "vocab_size": 50257
}

Could not locate the tokenizer configuration file, will try to use the model config instead.
loading configuration file https://huggingface.co/gpt2/resolve/main/config.json from cache at C:\Users\Foka/.cache\huggingface\transformers\fc674cd6907b4c9e933cb42d67662436b89fa9540a1f40d7c919d0109289ad01.7d2e0efa5ca20cef4fb199382111e9d3ad96fd77b849e1d4bed13a66e1336f51
Model config GPT2Config {
  "activation_function": "gelu_new",
  "architectures": [
    "GPT2LMHeadModel"
  ],
  "attn_pdrop": 0.1,
  "bos_token_id": 50256,
  "embd_pdrop": 0.1,
  "eos_token_id": 50256,
  "initializer_range": 0.02,
  "layer_norm_epsilon": 1e-05,
  "model_type": "gpt2",
  "n_ctx": 1024,
  "n_embd": 768,
  "n_head": 12,
  "n_inner": null,
  "n_layer": 12,
  "n_positions": 1024,
  "reorder_and_upcast_attn": false,
  "resid_pdrop": 0.1,
  "scale_attn_by_inverse_layer_idx": false,
  "scale_attn_weights": true,
  "summary_activation": null,
  "summary_first_dropout": 0.1,
  "summary_proj_to_labels": true,
  "summary_type": "cls_index",
  "summary_use_proj": true,
  "task_specific_params": {
    "text-generation": {
      "do_sample": true,
      "max_length": 50
    }
  },
  "transformers_version": "4.12.5",
  "use_cache": true,
  "vocab_size": 50257
}

loading file https://huggingface.co/gpt2/resolve/main/vocab.json from cache at C:\Users\Foka/.cache\huggingface\transformers\684fe667923972fb57f6b4dcb61a3c92763ad89882f3da5da9866baf14f2d60f.c7ed1f96aac49e745788faa77ba0a26a392643a50bb388b9c04ff469e555241f
loading file https://huggingface.co/gpt2/resolve/main/merges.txt from cache at C:\Users\Foka/.cache\huggingface\transformers\c0c761a63004025aeadd530c4c27b860ec4ecbe8a00531233de21d865a402598.5d12962c5ee615a4c803841266e9c3be9a691a924f72d395d3a6c6c81157788b
loading file https://huggingface.co/gpt2/resolve/main/tokenizer.json from cache at C:\Users\Foka/.cache\huggingface\transformers\16a2f78023c8dc511294f0c97b5e10fde3ef9889ad6d11ffaa2a00714e73926e.cf2d0ecb83b6df91b3dbb53f1d1e4c311578bfd3aa0e04934215a49bf9898df0
loading file https://huggingface.co/gpt2/resolve/main/added_tokens.json from cache at None
loading file https://huggingface.co/gpt2/resolve/main/special_tokens_map.json from cache at None
loading file https://huggingface.co/gpt2/resolve/main/tokenizer_config.json from cache at None
loading configuration file https://huggingface.co/gpt2/resolve/main/config.json from cache at C:\Users\Foka/.cache\huggingface\transformers\fc674cd6907b4c9e933cb42d67662436b89fa9540a1f40d7c919d0109289ad01.7d2e0efa5ca20cef4fb199382111e9d3ad96fd77b849e1d4bed13a66e1336f51
Model config GPT2Config {
  "activation_function": "gelu_new",
  "architectures": [
    "GPT2LMHeadModel"
  ],
  "attn_pdrop": 0.1,
  "bos_token_id": 50256,
  "embd_pdrop": 0.1,
  "eos_token_id": 50256,
  "initializer_range": 0.02,
  "layer_norm_epsilon": 1e-05,
  "model_type": "gpt2",
  "n_ctx": 1024,
  "n_embd": 768,
  "n_head": 12,
  "n_inner": null,
  "n_layer": 12,
  "n_positions": 1024,
  "reorder_and_upcast_attn": false,
  "resid_pdrop": 0.1,
  "scale_attn_by_inverse_layer_idx": false,
  "scale_attn_weights": true,
  "summary_activation": null,
  "summary_first_dropout": 0.1,
  "summary_proj_to_labels": true,
  "summary_type": "cls_index",
  "summary_use_proj": true,
  "task_specific_params": {
    "text-generation": {
      "do_sample": true,
      "max_length": 50
    }
  },
  "transformers_version": "4.12.5",
  "use_cache": true,
  "vocab_size": 50257
}

02/16/2022 00:27:28 - INFO - __main__ - Return hidden states from model: False
02/16/2022 00:27:28 - INFO - __main__ - Using implementation from: AutoModelForSequenceClassification
loading weights file https://huggingface.co/gpt2/resolve/main/pytorch_model.bin from cache at C:\Users\Foka/.cache\huggingface\transformers\752929ace039baa8ef70fe21cdf9ab9445773d20e733cf693d667982e210837e.323c769945a351daa25546176f8208b3004b6f563438a7603e7932bae9025925
All model checkpoint weights were used when initializing GPT2ForSequenceClassification.

Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at gpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
02/16/2022 00:27:29 - INFO - __main__ - Freezing model weights
Using pad_token, but it is not set yet.
02/16/2022 00:27:29 - INFO - __main__ - Set PAD token to EOS: <|endoftext|>
02/16/2022 00:27:29 - WARNING - datasets.arrow_dataset - Loading cached processed dataset at C:\Users\Foka\.cache\huggingface\datasets\json\default-67c9d932a627b7b8\0.0.0\c2d554c3377ea79c7664b93dc65d0803b45e3279000f993c7bfd18937fd7f426\cache-ba0dca0006a47e01.arrow

Running tokenizer on dataset:   0%|          | 0/1 [00:00<?, ?ba/s]
Running tokenizer on dataset: 100%|##########| 1/1 [00:00<00:00, 71.63ba/s]
02/16/2022 00:27:29 - WARNING - datasets.arrow_dataset - Loading cached processed dataset at C:\Users\Foka\.cache\huggingface\datasets\json\default-67c9d932a627b7b8\0.0.0\c2d554c3377ea79c7664b93dc65d0803b45e3279000f993c7bfd18937fd7f426\cache-d41f6257e87d100c.arrow
02/16/2022 00:27:29 - INFO - __main__ - Sample 826 of the training set: {'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'input_ids': [22940, 126, 222, 126, 250, 732, 262, 661, 22940, 126, 222, 126, 251, 6198, 4001, 6184, 95, 126, 222, 126, 250, 732, 262, 2330, 11, 1956, 19216, 10835, 13, 22940, 126, 222, 126, 251, 220, 220, 220, 220, 220, 6184, 95, 126, 222, 126, 99, 1303, 5304, 259, 19, 10879, 22940, 126, 222, 126, 99, 220], 'labels': 1}.
02/16/2022 00:27:29 - INFO - __main__ - Sample 521 of the training set: {'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'input_ids': [31, 7220, 8425, 31582, 416, 2488, 7220, 287, 269, 30520, 13, 884, 23374, 986, 220, 220], 'labels': 1}.
02/16/2022 00:27:29 - INFO - __main__ - Sample 2806 of the training set: {'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'input_ids': [4623, 68, 4964, 2168, 352, 286, 1303, 1169, 43764, 523, 355, 284, 3190, 3368, 4346, 13, 220, 220], 'labels': 0}.
02/16/2022 00:27:30 - INFO - __main__ - ***** Running training *****
02/16/2022 00:27:30 - INFO - __main__ -   Num examples = 4742
02/16/2022 00:27:30 - INFO - __main__ -   Num Epochs = 1
02/16/2022 00:27:30 - INFO - __main__ -   Instantaneous batch size per device = 24
02/16/2022 00:27:30 - INFO - __main__ -   Total train batch size (w. parallel, distributed & accumulation) = 24
02/16/2022 00:27:30 - INFO - __main__ -   Gradient Accumulation steps = 1
02/16/2022 00:27:30 - INFO - __main__ -   Total optimization steps = 198

  0%|          | 0/198 [00:00<?, ?it/s]
  ...
100%|##########| 198/198 [04:26<00:00,  1.09s/it]
02/16/2022 00:32:30 - INFO - __main__ - Epoch 0: {'accuracy': 0.846}
02/16/2022 00:32:57 - INFO - __main__ - Test-set evaluation: {'accuracy': 0.904}
Configuration saved in out/tweet/gpt2_version_2\config.json
Model weights saved in out/tweet/gpt2_version_2\pytorch_model.bin
tokenizer config file saved in out/tweet/gpt2_version_2\tokenizer_config.json
Special tokens file saved in out/tweet/gpt2_version_2\special_tokens_map.json

100%|##########| 198/198 [05:27<00:00,  1.65s/it]
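
Version 2 adds --freeze_model: the pretrained GPT-2 body stays fixed and only the freshly initialized score head is trained, which is why the epoch takes roughly 5 minutes here instead of 13. A rough sketch of such a freezing step, assuming the flag simply flips requires_grad on everything outside the head (the script's exact criterion is not shown in this log):

from transformers import AutoModelForSequenceClassification

model = AutoModelForSequenceClassification.from_pretrained("gpt2", num_labels=2)

# freeze the pretrained transformer body; only the new `score` head stays trainable
for name, param in model.named_parameters():
    if not name.startswith("score"):
        param.requires_grad = False

print([n for n, p in model.named_parameters() if p.requires_grad])  # ['score.weight']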

GPT2 version 3 (--freeze_model --custom_model)

!python run_glue_no_trainer.py \
  --model_name_or_path gpt2 \
  --train_file data/train.json  \
  --validation_file data/valid.json \
  --test_file data/test.json \
  --per_device_train_batch_size 24 \
  --per_device_eval_batch_size 24 \
  --max_length 128 \
  --freeze_model \
  --custom_model \
  --learning_rate 2e-5 \
  --num_train_epochs 1 \
  --output_dir out/tweet/gpt2_version_3
02/16/2022 00:33:00 - INFO - __main__ - Distributed environment: NO
Num processes: 1
Process index: 0
Local process index: 0
Device: cpu
Use FP16 precision: False

02/16/2022 00:33:00 - WARNING - datasets.builder - Using custom data configuration default-67c9d932a627b7b8
02/16/2022 00:33:00 - WARNING - datasets.builder - Reusing dataset json (C:\Users\Foka\.cache\huggingface\datasets\json\default-67c9d932a627b7b8\0.0.0\c2d554c3377ea79c7664b93dc65d0803b45e3279000f993c7bfd18937fd7f426)

  0%|          | 0/3 [00:00<?, ?it/s]
100%|##########| 3/3 [00:00<00:00, 1504.23it/s]
loading configuration file https://huggingface.co/gpt2/resolve/main/config.json from cache at C:\Users\Foka/.cache\huggingface\transformers\fc674cd6907b4c9e933cb42d67662436b89fa9540a1f40d7c919d0109289ad01.7d2e0efa5ca20cef4fb199382111e9d3ad96fd77b849e1d4bed13a66e1336f51
Model config GPT2Config {
  "activation_function": "gelu_new",
  "architectures": [
    "GPT2LMHeadModel"
  ],
  "attn_pdrop": 0.1,
  "bos_token_id": 50256,
  "embd_pdrop": 0.1,
  "eos_token_id": 50256,
  "initializer_range": 0.02,
  "layer_norm_epsilon": 1e-05,
  "model_type": "gpt2",
  "n_ctx": 1024,
  "n_embd": 768,
  "n_head": 12,
  "n_inner": null,
  "n_layer": 12,
  "n_positions": 1024,
  "reorder_and_upcast_attn": false,
  "resid_pdrop": 0.1,
  "scale_attn_by_inverse_layer_idx": false,
  "scale_attn_weights": true,
  "summary_activation": null,
  "summary_first_dropout": 0.1,
  "summary_proj_to_labels": true,
  "summary_type": "cls_index",
  "summary_use_proj": true,
  "task_specific_params": {
    "text-generation": {
      "do_sample": true,
      "max_length": 50
    }
  },
  "transformers_version": "4.12.5",
  "use_cache": true,
  "vocab_size": 50257
}

Could not locate the tokenizer configuration file, will try to use the model config instead.
loading configuration file https://huggingface.co/gpt2/resolve/main/config.json from cache at C:\Users\Foka/.cache\huggingface\transformers\fc674cd6907b4c9e933cb42d67662436b89fa9540a1f40d7c919d0109289ad01.7d2e0efa5ca20cef4fb199382111e9d3ad96fd77b849e1d4bed13a66e1336f51
Model config GPT2Config {
  "activation_function": "gelu_new",
  "architectures": [
    "GPT2LMHeadModel"
  ],
  "attn_pdrop": 0.1,
  "bos_token_id": 50256,
  "embd_pdrop": 0.1,
  "eos_token_id": 50256,
  "initializer_range": 0.02,
  "layer_norm_epsilon": 1e-05,
  "model_type": "gpt2",
  "n_ctx": 1024,
  "n_embd": 768,
  "n_head": 12,
  "n_inner": null,
  "n_layer": 12,
  "n_positions": 1024,
  "reorder_and_upcast_attn": false,
  "resid_pdrop": 0.1,
  "scale_attn_by_inverse_layer_idx": false,
  "scale_attn_weights": true,
  "summary_activation": null,
  "summary_first_dropout": 0.1,
  "summary_proj_to_labels": true,
  "summary_type": "cls_index",
  "summary_use_proj": true,
  "task_specific_params": {
    "text-generation": {
      "do_sample": true,
      "max_length": 50
    }
  },
  "transformers_version": "4.12.5",
  "use_cache": true,
  "vocab_size": 50257
}

loading file https://huggingface.co/gpt2/resolve/main/vocab.json from cache at C:\Users\Foka/.cache\huggingface\transformers\684fe667923972fb57f6b4dcb61a3c92763ad89882f3da5da9866baf14f2d60f.c7ed1f96aac49e745788faa77ba0a26a392643a50bb388b9c04ff469e555241f
loading file https://huggingface.co/gpt2/resolve/main/merges.txt from cache at C:\Users\Foka/.cache\huggingface\transformers\c0c761a63004025aeadd530c4c27b860ec4ecbe8a00531233de21d865a402598.5d12962c5ee615a4c803841266e9c3be9a691a924f72d395d3a6c6c81157788b
loading file https://huggingface.co/gpt2/resolve/main/tokenizer.json from cache at C:\Users\Foka/.cache\huggingface\transformers\16a2f78023c8dc511294f0c97b5e10fde3ef9889ad6d11ffaa2a00714e73926e.cf2d0ecb83b6df91b3dbb53f1d1e4c311578bfd3aa0e04934215a49bf9898df0
loading file https://huggingface.co/gpt2/resolve/main/added_tokens.json from cache at None
loading file https://huggingface.co/gpt2/resolve/main/special_tokens_map.json from cache at None
loading file https://huggingface.co/gpt2/resolve/main/tokenizer_config.json from cache at None
loading configuration file https://huggingface.co/gpt2/resolve/main/config.json from cache at C:\Users\Foka/.cache\huggingface\transformers\fc674cd6907b4c9e933cb42d67662436b89fa9540a1f40d7c919d0109289ad01.7d2e0efa5ca20cef4fb199382111e9d3ad96fd77b849e1d4bed13a66e1336f51
Model config GPT2Config {
  "activation_function": "gelu_new",
  "architectures": [
    "GPT2LMHeadModel"
  ],
  "attn_pdrop": 0.1,
  "bos_token_id": 50256,
  "embd_pdrop": 0.1,
  "eos_token_id": 50256,
  "initializer_range": 0.02,
  "layer_norm_epsilon": 1e-05,
  "model_type": "gpt2",
  "n_ctx": 1024,
  "n_embd": 768,
  "n_head": 12,
  "n_inner": null,
  "n_layer": 12,
  "n_positions": 1024,
  "reorder_and_upcast_attn": false,
  "resid_pdrop": 0.1,
  "scale_attn_by_inverse_layer_idx": false,
  "scale_attn_weights": true,
  "summary_activation": null,
  "summary_first_dropout": 0.1,
  "summary_proj_to_labels": true,
  "summary_type": "cls_index",
  "summary_use_proj": true,
  "task_specific_params": {
    "text-generation": {
      "do_sample": true,
      "max_length": 50
    }
  },
  "transformers_version": "4.12.5",
  "use_cache": true,
  "vocab_size": 50257
}

02/16/2022 00:33:06 - INFO - __main__ - Return hidden states from model: False
02/16/2022 00:33:06 - INFO - __main__ - Using implementation from: GPT2ForSequenceClassificationCustom
loading weights file https://huggingface.co/gpt2/resolve/main/pytorch_model.bin from cache at C:\Users\Foka/.cache\huggingface\transformers\752929ace039baa8ef70fe21cdf9ab9445773d20e733cf693d667982e210837e.323c769945a351daa25546176f8208b3004b6f563438a7603e7932bae9025925
All model checkpoint weights were used when initializing GPT2ForSequenceClassificationCustom.

Some weights of GPT2ForSequenceClassificationCustom were not initialized from the model checkpoint at gpt2 and are newly initialized: ['score.dense_1_hidden.bias', 'score.dense_1_input.bias', 'score.out_proj.weight', 'score.dense_2.bias', 'score.dense_1_hidden.weight', 'score.dense_2.weight', 'score.dense_1_input.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
02/16/2022 00:33:08 - INFO - __main__ - Freezing model weights
Using pad_token, but it is not set yet.
02/16/2022 00:33:08 - INFO - __main__ - Set PAD token to EOS: <|endoftext|>
02/16/2022 00:33:08 - WARNING - datasets.arrow_dataset - Loading cached processed dataset at C:\Users\Foka\.cache\huggingface\datasets\json\default-67c9d932a627b7b8\0.0.0\c2d554c3377ea79c7664b93dc65d0803b45e3279000f993c7bfd18937fd7f426\cache-f4385b00908c069e.arrow

Running tokenizer on dataset:   0%|          | 0/1 [00:00<?, ?ba/s]
Running tokenizer on dataset: 100%|##########| 1/1 [00:00<00:00, 32.35ba/s]
02/16/2022 00:33:08 - WARNING - datasets.arrow_dataset - Loading cached processed dataset at C:\Users\Foka\.cache\huggingface\datasets\json\default-67c9d932a627b7b8\0.0.0\c2d554c3377ea79c7664b93dc65d0803b45e3279000f993c7bfd18937fd7f426\cache-c36412d695a9c6f1.arrow
02/16/2022 00:33:08 - INFO - __main__ - Sample 1528 of the training set: {'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'input_ids': [258, 338, 281, 555, 2382, 7490, 764, 1303, 22584, 220], 'labels': 1}.
02/16/2022 00:33:08 - INFO - __main__ - Sample 113 of the training set: {'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'input_ids': [31, 7220, 43646, 2148, 20577, 1303, 37098, 13948, 1337, 329, 1303, 11085, 77, 602, 25, 5387, 16155, 220, 1303, 17089, 6894, 5171, 4763, 220], 'labels': 1}.
02/16/2022 00:33:08 - INFO - __main__ - Sample 485 of the training set: {'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'input_ids': [31, 7220, 2488, 7220, 883, 6886, 284, 220, 1303, 12480, 4604, 594, 1303, 5183, 445, 1303, 259, 31012, 1303, 42570, 6098, 999, 1303, 721, 16207, 481, 1309, 1303, 40954, 760, 674, 8666, 1303, 5539], 'labels': 1}.
02/16/2022 00:33:09 - INFO - __main__ - ***** Running training *****
02/16/2022 00:33:09 - INFO - __main__ -   Num examples = 4742
02/16/2022 00:33:09 - INFO - __main__ -   Num Epochs = 1
02/16/2022 00:33:09 - INFO - __main__ -   Instantaneous batch size per device = 24
02/16/2022 00:33:09 - INFO - __main__ -   Total train batch size (w. parallel, distributed & accumulation) = 24
02/16/2022 00:33:09 - INFO - __main__ -   Gradient Accumulation steps = 1
02/16/2022 00:33:09 - INFO - __main__ -   Total optimization steps = 198

  0%|          | 0/198 [00:00<?, ?it/s]
  ...
100%|##########| 198/198 [04:52<00:00,  1.21s/it]
02/16/2022 00:38:36 - INFO - __main__ - Epoch 0: {'accuracy': 0.676}
02/16/2022 00:39:05 - INFO - __main__ - Test-set evaluation: {'accuracy': 0.636}
Configuration saved in out/tweet/gpt2_version_3\config.json
Model weights saved in out/tweet/gpt2_version_3\pytorch_model.bin
tokenizer config file saved in out/tweet/gpt2_version_3\tokenizer_config.json
Special tokens file saved in out/tweet/gpt2_version_3\special_tokens_map.json

100%|##########| 198/198 [05:56<00:00,  1.80s/it]
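
The run above leaves a complete checkpoint under out/tweet/gpt2_version_3 (config, weights, tokenizer files, special tokens map). A minimal sketch of reloading it for inference with the standard transformers API follows; the directory name is the one from the log, everything else is an assumption and not part of run_glue_no_trainer.py itself.

from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Sketch (assumption): reload the checkpoint saved above and classify one tweet.
checkpoint_dir = "out/tweet/gpt2_version_3"
tokenizer = AutoTokenizer.from_pretrained(checkpoint_dir)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint_dir)
model.config.pad_token_id = tokenizer.eos_token_id  # GPT-2 reuses EOS for padding (see the log above)
model.eval()

inputs = tokenizer("example tweet text", truncation=True, max_length=128, return_tensors="pt")
predicted_label = model(**inputs).logits.argmax(dim=-1).item()  # 0 or 1, as in the training samples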

GPT2 version 4

!python run_glue_no_trainer.py \
  --model_name_or_path gpt2 \
  --train_file data/train.json  \
  --validation_file data/valid.json \
  --test_file data/test.json \
  --per_device_train_batch_size 24 \
  --per_device_eval_batch_size 24 \
  --max_length 128 \
  --freeze_model \
  --custom_model \
  --return_hidden_states \
  --learning_rate 2e-5 \
  --num_train_epochs 1 \
  --output_dir out/tweet/gpt2_version_4
02/16/2022 00:39:07 - INFO - __main__ - Distributed environment: NO
Num processes: 1
Process index: 0
Local process index: 0
Device: cpu
Use FP16 precision: False

02/16/2022 00:39:08 - WARNING - datasets.builder - Using custom data configuration default-67c9d932a627b7b8
02/16/2022 00:39:08 - WARNING - datasets.builder - Reusing dataset json (C:\Users\Foka\.cache\huggingface\datasets\json\default-67c9d932a627b7b8\0.0.0\c2d554c3377ea79c7664b93dc65d0803b45e3279000f993c7bfd18937fd7f426)

  0%|          | 0/3 [00:00<?, ?it/s]
100%|##########| 3/3 [00:00<00:00, 1505.31it/s]
loading configuration file https://huggingface.co/gpt2/resolve/main/config.json from cache at C:\Users\Foka/.cache\huggingface\transformers\fc674cd6907b4c9e933cb42d67662436b89fa9540a1f40d7c919d0109289ad01.7d2e0efa5ca20cef4fb199382111e9d3ad96fd77b849e1d4bed13a66e1336f51
Model config GPT2Config {
  "activation_function": "gelu_new",
  "architectures": [
    "GPT2LMHeadModel"
  ],
  "attn_pdrop": 0.1,
  "bos_token_id": 50256,
  "embd_pdrop": 0.1,
  "eos_token_id": 50256,
  "initializer_range": 0.02,
  "layer_norm_epsilon": 1e-05,
  "model_type": "gpt2",
  "n_ctx": 1024,
  "n_embd": 768,
  "n_head": 12,
  "n_inner": null,
  "n_layer": 12,
  "n_positions": 1024,
  "reorder_and_upcast_attn": false,
  "resid_pdrop": 0.1,
  "scale_attn_by_inverse_layer_idx": false,
  "scale_attn_weights": true,
  "summary_activation": null,
  "summary_first_dropout": 0.1,
  "summary_proj_to_labels": true,
  "summary_type": "cls_index",
  "summary_use_proj": true,
  "task_specific_params": {
    "text-generation": {
      "do_sample": true,
      "max_length": 50
    }
  },
  "transformers_version": "4.12.5",
  "use_cache": true,
  "vocab_size": 50257
}

Could not locate the tokenizer configuration file, will try to use the model config instead.
loading configuration file https://huggingface.co/gpt2/resolve/main/config.json from cache at C:\Users\Foka/.cache\huggingface\transformers\fc674cd6907b4c9e933cb42d67662436b89fa9540a1f40d7c919d0109289ad01.7d2e0efa5ca20cef4fb199382111e9d3ad96fd77b849e1d4bed13a66e1336f51
Model config GPT2Config {
  "activation_function": "gelu_new",
  "architectures": [
    "GPT2LMHeadModel"
  ],
  "attn_pdrop": 0.1,
  "bos_token_id": 50256,
  "embd_pdrop": 0.1,
  "eos_token_id": 50256,
  "initializer_range": 0.02,
  "layer_norm_epsilon": 1e-05,
  "model_type": "gpt2",
  "n_ctx": 1024,
  "n_embd": 768,
  "n_head": 12,
  "n_inner": null,
  "n_layer": 12,
  "n_positions": 1024,
  "reorder_and_upcast_attn": false,
  "resid_pdrop": 0.1,
  "scale_attn_by_inverse_layer_idx": false,
  "scale_attn_weights": true,
  "summary_activation": null,
  "summary_first_dropout": 0.1,
  "summary_proj_to_labels": true,
  "summary_type": "cls_index",
  "summary_use_proj": true,
  "task_specific_params": {
    "text-generation": {
      "do_sample": true,
      "max_length": 50
    }
  },
  "transformers_version": "4.12.5",
  "use_cache": true,
  "vocab_size": 50257
}

loading file https://huggingface.co/gpt2/resolve/main/vocab.json from cache at C:\Users\Foka/.cache\huggingface\transformers\684fe667923972fb57f6b4dcb61a3c92763ad89882f3da5da9866baf14f2d60f.c7ed1f96aac49e745788faa77ba0a26a392643a50bb388b9c04ff469e555241f
loading file https://huggingface.co/gpt2/resolve/main/merges.txt from cache at C:\Users\Foka/.cache\huggingface\transformers\c0c761a63004025aeadd530c4c27b860ec4ecbe8a00531233de21d865a402598.5d12962c5ee615a4c803841266e9c3be9a691a924f72d395d3a6c6c81157788b
loading file https://huggingface.co/gpt2/resolve/main/tokenizer.json from cache at C:\Users\Foka/.cache\huggingface\transformers\16a2f78023c8dc511294f0c97b5e10fde3ef9889ad6d11ffaa2a00714e73926e.cf2d0ecb83b6df91b3dbb53f1d1e4c311578bfd3aa0e04934215a49bf9898df0
loading file https://huggingface.co/gpt2/resolve/main/added_tokens.json from cache at None
loading file https://huggingface.co/gpt2/resolve/main/special_tokens_map.json from cache at None
loading file https://huggingface.co/gpt2/resolve/main/tokenizer_config.json from cache at None
loading configuration file https://huggingface.co/gpt2/resolve/main/config.json from cache at C:\Users\Foka/.cache\huggingface\transformers\fc674cd6907b4c9e933cb42d67662436b89fa9540a1f40d7c919d0109289ad01.7d2e0efa5ca20cef4fb199382111e9d3ad96fd77b849e1d4bed13a66e1336f51
Model config GPT2Config {
  "activation_function": "gelu_new",
  "architectures": [
    "GPT2LMHeadModel"
  ],
  "attn_pdrop": 0.1,
  "bos_token_id": 50256,
  "embd_pdrop": 0.1,
  "eos_token_id": 50256,
  "initializer_range": 0.02,
  "layer_norm_epsilon": 1e-05,
  "model_type": "gpt2",
  "n_ctx": 1024,
  "n_embd": 768,
  "n_head": 12,
  "n_inner": null,
  "n_layer": 12,
  "n_positions": 1024,
  "reorder_and_upcast_attn": false,
  "resid_pdrop": 0.1,
  "scale_attn_by_inverse_layer_idx": false,
  "scale_attn_weights": true,
  "summary_activation": null,
  "summary_first_dropout": 0.1,
  "summary_proj_to_labels": true,
  "summary_type": "cls_index",
  "summary_use_proj": true,
  "task_specific_params": {
    "text-generation": {
      "do_sample": true,
      "max_length": 50
    }
  },
  "transformers_version": "4.12.5",
  "use_cache": true,
  "vocab_size": 50257
}

02/16/2022 00:39:14 - INFO - __main__ - Return hidden states from model: True
02/16/2022 00:39:14 - INFO - __main__ - Using implementation from: GPT2ForSequenceClassificationCustom
loading weights file https://huggingface.co/gpt2/resolve/main/pytorch_model.bin from cache at C:\Users\Foka/.cache\huggingface\transformers\752929ace039baa8ef70fe21cdf9ab9445773d20e733cf693d667982e210837e.323c769945a351daa25546176f8208b3004b6f563438a7603e7932bae9025925
All model checkpoint weights were used when initializing GPT2ForSequenceClassificationCustom.

Some weights of GPT2ForSequenceClassificationCustom were not initialized from the model checkpoint at gpt2 and are newly initialized: ['score.dense_1_hidden.weight', 'score.out_proj.weight', 'score.dense_1_input.bias', 'score.dense_2.bias', 'score.dense_2.weight', 'score.dense_1_input.weight', 'score.dense_1_hidden.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
02/16/2022 00:39:16 - INFO - __main__ - Freezing model weights
Using pad_token, but it is not set yet.
02/16/2022 00:39:16 - INFO - __main__ - Set PAD token to EOS: <|endoftext|>
02/16/2022 00:39:16 - WARNING - datasets.arrow_dataset - Loading cached processed dataset at C:\Users\Foka\.cache\huggingface\datasets\json\default-67c9d932a627b7b8\0.0.0\c2d554c3377ea79c7664b93dc65d0803b45e3279000f993c7bfd18937fd7f426\cache-5a65b7038a57b5cc.arrow

Running tokenizer on dataset:   0%|          | 0/1 [00:00<?, ?ba/s]
Running tokenizer on dataset: 100%|##########| 1/1 [00:00<00:00, 34.58ba/s]
02/16/2022 00:39:16 - WARNING - datasets.arrow_dataset - Loading cached processed dataset at C:\Users\Foka\.cache\huggingface\datasets\json\default-67c9d932a627b7b8\0.0.0\c2d554c3377ea79c7664b93dc65d0803b45e3279000f993c7bfd18937fd7f426\cache-5ed4052179e59c20.arrow
02/16/2022 00:39:16 - INFO - __main__ - Sample 3838 of the training set: {'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'input_ids': [31, 7220, 662, 12, 24071, 2488, 7220, 1303, 83, 34715, 34191, 40252, 1492, 1909, 6184, 108, 126, 253, 126, 239, 26604, 27214, 126, 253, 126, 237, 126, 120, 220, 220, 1303, 3605, 76, 13513], 'labels': 0}.
02/16/2022 00:39:16 - INFO - __main__ - Sample 1761 of the training set: {'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'input_ids': [10919, 257, 220, 995, 13, 611, 257, 582, 1718, 257, 15647, 588, 326, 11, 661, 561, 910, 340, 373, 5969, 13, 475, 275, 14, 66, 1303, 81, 623, 283, 1076, 88, 318, 257, 2415, 428, 318, 2938, 13], 'labels': 1}.
02/16/2022 00:39:16 - INFO - __main__ - Sample 1111 of the training set: {'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'input_ids': [31, 7220, 2488, 7220, 612, 318, 645, 3338, 1295, 329, 15102, 287, 428, 1499, 780, 286, 661, 588, 345, 1303, 65, 1967, 220], 'labels': 1}.
02/16/2022 00:39:17 - INFO - __main__ - ***** Running training *****
02/16/2022 00:39:17 - INFO - __main__ -   Num examples = 4742
02/16/2022 00:39:17 - INFO - __main__ -   Num Epochs = 1
02/16/2022 00:39:17 - INFO - __main__ -   Instantaneous batch size per device = 24
02/16/2022 00:39:17 - INFO - __main__ -   Total train batch size (w. parallel, distributed & accumulation) = 24
02/16/2022 00:39:17 - INFO - __main__ -   Gradient Accumulation steps = 1
02/16/2022 00:39:17 - INFO - __main__ -   Total optimization steps = 198

  0%|          | 0/198 [00:00<?, ?it/s]
100%|##########| 198/198 [04:49<00:00,  1.21s/it]
02/16/2022 00:44:41 - INFO - __main__ - Epoch 0: {'accuracy': 0.728}
02/16/2022 00:45:10 - INFO - __main__ - Test-set evaluation: {'accuracy': 0.732}
Configuration saved in out/tweet/gpt2_version_4\config.json
Model weights saved in out/tweet/gpt2_version_4\pytorch_model.bin
tokenizer config file saved in out/tweet/gpt2_version_4\tokenizer_config.json
Special tokens file saved in out/tweet/gpt2_version_4\special_tokens_map.json

100%|##########| 198/198 [05:53<00:00,  1.78s/it]
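
Version 4 switches to the custom head: per the log, --custom_model selects GPT2ForSequenceClassificationCustom, --return_hidden_states feeds it an additional hidden state, and --freeze_model leaves only the head trainable; in this run the test accuracy moves from 0.636 to 0.732. The log lists the freshly initialized parameters (score.dense_1_input, score.dense_1_hidden, score.dense_2, score.out_proj), which suggests roughly the two-branch head sketched below. This is only a reconstruction from those parameter names, not the actual implementation in run_glue_no_trainer.py.

import torch
from torch import nn


class GPT2ClassificationHeadSketch(nn.Module):
    """Rough sketch of a head whose parameter names match the log
    (dense_1_input, dense_1_hidden, dense_2, out_proj); the real
    GPT2ForSequenceClassificationCustom may wire these differently."""

    def __init__(self, hidden_size: int = 768, num_labels: int = 2):
        super().__init__()
        self.dense_1_input = nn.Linear(hidden_size, hidden_size)    # branch over the last hidden state
        self.dense_1_hidden = nn.Linear(hidden_size, hidden_size)   # branch over the extra hidden state (--return_hidden_states)
        self.dense_2 = nn.Linear(2 * hidden_size, hidden_size)      # merge the two branches
        self.out_proj = nn.Linear(hidden_size, num_labels, bias=False)  # no bias, matching the log

    def forward(self, last_hidden: torch.Tensor, extra_hidden: torch.Tensor) -> torch.Tensor:
        merged = torch.cat(
            (torch.relu(self.dense_1_input(last_hidden)),
             torch.relu(self.dense_1_hidden(extra_hidden))),
            dim=-1,
        )
        return self.out_proj(torch.relu(self.dense_2(merged)))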

GPT2 version 5

!python run_glue_no_trainer.py \
  --model_name_or_path gpt2 \
  --train_file data/train.json  \
  --validation_file data/valid.json \
  --test_file data/test.json \
  --per_device_train_batch_size 32 \
  --per_device_eval_batch_size 32 \
  --max_length 128 \
  --freeze_model \
  --custom_model \
  --return_hidden_states \
  --learning_rate 2e-5 \
  --num_train_epochs 1 \
  --output_dir out/tweet/gpt2_version_5
02/17/2022 17:37:38 - INFO - __main__ - Distributed environment: NO
Num processes: 1
Process index: 0
Local process index: 0
Device: cpu
Use FP16 precision: False

02/17/2022 17:37:39 - WARNING - datasets.builder - Using custom data configuration default-67c9d932a627b7b8
02/17/2022 17:37:39 - WARNING - datasets.builder - Reusing dataset json (C:\Users\Foka\.cache\huggingface\datasets\json\default-67c9d932a627b7b8\0.0.0\c2d554c3377ea79c7664b93dc65d0803b45e3279000f993c7bfd18937fd7f426)

  0%|          | 0/3 [00:00<?, ?it/s]
100%|##########| 3/3 [00:00<00:00, 1503.87it/s]
loading configuration file https://huggingface.co/gpt2/resolve/main/config.json from cache at C:\Users\Foka/.cache\huggingface\transformers\fc674cd6907b4c9e933cb42d67662436b89fa9540a1f40d7c919d0109289ad01.7d2e0efa5ca20cef4fb199382111e9d3ad96fd77b849e1d4bed13a66e1336f51
Model config GPT2Config {
  "activation_function": "gelu_new",
  "architectures": [
    "GPT2LMHeadModel"
  ],
  "attn_pdrop": 0.1,
  "bos_token_id": 50256,
  "embd_pdrop": 0.1,
  "eos_token_id": 50256,
  "initializer_range": 0.02,
  "layer_norm_epsilon": 1e-05,
  "model_type": "gpt2",
  "n_ctx": 1024,
  "n_embd": 768,
  "n_head": 12,
  "n_inner": null,
  "n_layer": 12,
  "n_positions": 1024,
  "reorder_and_upcast_attn": false,
  "resid_pdrop": 0.1,
  "scale_attn_by_inverse_layer_idx": false,
  "scale_attn_weights": true,
  "summary_activation": null,
  "summary_first_dropout": 0.1,
  "summary_proj_to_labels": true,
  "summary_type": "cls_index",
  "summary_use_proj": true,
  "task_specific_params": {
    "text-generation": {
      "do_sample": true,
      "max_length": 50
    }
  },
  "transformers_version": "4.12.5",
  "use_cache": true,
  "vocab_size": 50257
}

Could not locate the tokenizer configuration file, will try to use the model config instead.
loading configuration file https://huggingface.co/gpt2/resolve/main/config.json from cache at C:\Users\Foka/.cache\huggingface\transformers\fc674cd6907b4c9e933cb42d67662436b89fa9540a1f40d7c919d0109289ad01.7d2e0efa5ca20cef4fb199382111e9d3ad96fd77b849e1d4bed13a66e1336f51
Model config GPT2Config {
  "activation_function": "gelu_new",
  "architectures": [
    "GPT2LMHeadModel"
  ],
  "attn_pdrop": 0.1,
  "bos_token_id": 50256,
  "embd_pdrop": 0.1,
  "eos_token_id": 50256,
  "initializer_range": 0.02,
  "layer_norm_epsilon": 1e-05,
  "model_type": "gpt2",
  "n_ctx": 1024,
  "n_embd": 768,
  "n_head": 12,
  "n_inner": null,
  "n_layer": 12,
  "n_positions": 1024,
  "reorder_and_upcast_attn": false,
  "resid_pdrop": 0.1,
  "scale_attn_by_inverse_layer_idx": false,
  "scale_attn_weights": true,
  "summary_activation": null,
  "summary_first_dropout": 0.1,
  "summary_proj_to_labels": true,
  "summary_type": "cls_index",
  "summary_use_proj": true,
  "task_specific_params": {
    "text-generation": {
      "do_sample": true,
      "max_length": 50
    }
  },
  "transformers_version": "4.12.5",
  "use_cache": true,
  "vocab_size": 50257
}

loading file https://huggingface.co/gpt2/resolve/main/vocab.json from cache at C:\Users\Foka/.cache\huggingface\transformers\684fe667923972fb57f6b4dcb61a3c92763ad89882f3da5da9866baf14f2d60f.c7ed1f96aac49e745788faa77ba0a26a392643a50bb388b9c04ff469e555241f
loading file https://huggingface.co/gpt2/resolve/main/merges.txt from cache at C:\Users\Foka/.cache\huggingface\transformers\c0c761a63004025aeadd530c4c27b860ec4ecbe8a00531233de21d865a402598.5d12962c5ee615a4c803841266e9c3be9a691a924f72d395d3a6c6c81157788b
loading file https://huggingface.co/gpt2/resolve/main/tokenizer.json from cache at C:\Users\Foka/.cache\huggingface\transformers\16a2f78023c8dc511294f0c97b5e10fde3ef9889ad6d11ffaa2a00714e73926e.cf2d0ecb83b6df91b3dbb53f1d1e4c311578bfd3aa0e04934215a49bf9898df0
loading file https://huggingface.co/gpt2/resolve/main/added_tokens.json from cache at None
loading file https://huggingface.co/gpt2/resolve/main/special_tokens_map.json from cache at None
loading file https://huggingface.co/gpt2/resolve/main/tokenizer_config.json from cache at None
loading configuration file https://huggingface.co/gpt2/resolve/main/config.json from cache at C:\Users\Foka/.cache\huggingface\transformers\fc674cd6907b4c9e933cb42d67662436b89fa9540a1f40d7c919d0109289ad01.7d2e0efa5ca20cef4fb199382111e9d3ad96fd77b849e1d4bed13a66e1336f51
Model config GPT2Config {
  "activation_function": "gelu_new",
  "architectures": [
    "GPT2LMHeadModel"
  ],
  "attn_pdrop": 0.1,
  "bos_token_id": 50256,
  "embd_pdrop": 0.1,
  "eos_token_id": 50256,
  "initializer_range": 0.02,
  "layer_norm_epsilon": 1e-05,
  "model_type": "gpt2",
  "n_ctx": 1024,
  "n_embd": 768,
  "n_head": 12,
  "n_inner": null,
  "n_layer": 12,
  "n_positions": 1024,
  "reorder_and_upcast_attn": false,
  "resid_pdrop": 0.1,
  "scale_attn_by_inverse_layer_idx": false,
  "scale_attn_weights": true,
  "summary_activation": null,
  "summary_first_dropout": 0.1,
  "summary_proj_to_labels": true,
  "summary_type": "cls_index",
  "summary_use_proj": true,
  "task_specific_params": {
    "text-generation": {
      "do_sample": true,
      "max_length": 50
    }
  },
  "transformers_version": "4.12.5",
  "use_cache": true,
  "vocab_size": 50257
}

02/17/2022 17:37:45 - INFO - __main__ - Return hidden states from model: True
02/17/2022 17:37:45 - INFO - __main__ - Using implementation from: GPT2ForSequenceClassificationCustom
loading weights file https://huggingface.co/gpt2/resolve/main/pytorch_model.bin from cache at C:\Users\Foka/.cache\huggingface\transformers\752929ace039baa8ef70fe21cdf9ab9445773d20e733cf693d667982e210837e.323c769945a351daa25546176f8208b3004b6f563438a7603e7932bae9025925
All model checkpoint weights were used when initializing GPT2ForSequenceClassificationCustom.

Some weights of GPT2ForSequenceClassificationCustom were not initialized from the model checkpoint at gpt2 and are newly initialized: ['score.dense_1_input.weight', 'score.dense_2.weight', 'score.dense_2.bias', 'score.out_proj.weight', 'score.dense_1_hidden.weight', 'score.dense_1_hidden.bias', 'score.dense_1_input.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
02/17/2022 17:37:47 - INFO - __main__ - Freezing model weights
Using pad_token, but it is not set yet.
02/17/2022 17:37:47 - INFO - __main__ - Set PAD token to EOS: <|endoftext|>

Running tokenizer on dataset:   0%|          | 0/5 [00:00<?, ?ba/s]
Running tokenizer on dataset: 100%|##########| 5/5 [00:00<00:00, 46.33ba/s]

Running tokenizer on dataset:   0%|          | 0/1 [00:00<?, ?ba/s]
Running tokenizer on dataset: 100%|##########| 1/1 [00:00<00:00, 83.55ba/s]

Running tokenizer on dataset:   0%|          | 0/1 [00:00<?, ?ba/s]
Running tokenizer on dataset: 100%|##########| 1/1 [00:00<00:00, 100.09ba/s]
02/17/2022 17:37:48 - INFO - __main__ - Sample 4558 of the training set: {'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'input_ids': [31, 7220, 4953, 287, 262, 3223, 329, 616, 717, 2646, 286, 2488, 7220, 543, 318, 2488, 7220, 220, 220, 220, 1303, 41364, 469, 988, 1303, 276, 26240, 23411, 6184, 95, 126, 222, 126, 242, 986, 220], 'labels': 0}.
02/17/2022 17:37:48 - INFO - __main__ - Sample 2249 of the training set: {'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'input_ids': [1169, 1306, 1524, 614, 318, 262, 614, 329, 26420, 13, 27214, 126, 253, 126, 246, 5196, 460, 470, 892, 546, 326, 6184, 108, 126, 253, 126, 246, 3907, 1303, 14347, 1303, 1069, 4105, 220, 220, 1303, 37035, 1303, 320, 12756, 1303, 529, 669, 6042, 1303, 260, 10396, 3508, 1251, 1303, 15219], 'labels': 0}.
02/17/2022 17:37:48 - INFO - __main__ - Sample 1448 of the training set: {'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'input_ids': [2, 1416, 81, 315, 259, 1096, 1303, 82, 5570, 1222, 696, 26, 1303, 6381, 3455, 1303, 403, 6667, 11203, 540, 1303, 354, 5233, 1303, 2256, 6615, 287, 705, 32243, 1028, 10713, 25, 9265, 6, 220, 220], 'labels': 1}.
02/17/2022 17:37:48 - INFO - __main__ - ***** Running training *****
02/17/2022 17:37:48 - INFO - __main__ -   Num examples = 4742
02/17/2022 17:37:48 - INFO - __main__ -   Num Epochs = 1
02/17/2022 17:37:48 - INFO - __main__ -   Instantaneous batch size per device = 32
02/17/2022 17:37:48 - INFO - __main__ -   Total train batch size (w. parallel, distributed & accumulation) = 32
02/17/2022 17:37:48 - INFO - __main__ -   Gradient Accumulation steps = 1
02/17/2022 17:37:48 - INFO - __main__ -   Total optimization steps = 149

  0%|          | 0/149 [00:00<?, ?it/s]
100%|##########| 149/149 [05:13<00:00,  1.62s/it]
02/17/2022 17:43:39 - INFO - __main__ - Epoch 0: {'accuracy': 0.888}
02/17/2022 17:44:11 - INFO - __main__ - Test-set evaluation: {'accuracy': 0.914}
Configuration saved in out/tweet/gpt2_version_5\config.json
Model weights saved in out/tweet/gpt2_version_5\pytorch_model.bin
tokenizer config file saved in out/tweet/gpt2_version_5\tokenizer_config.json
Special tokens file saved in out/tweet/gpt2_version_5\special_tokens_map.json

100%|##########| 149/149 [06:23<00:00,  2.57s/it]
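
Version 5 only raises the batch size from 24 to 32, so the step count drops from ceil(4742 / 24) = 198 to ceil(4742 / 32) = 149; with the backbone frozen, this run reaches 0.914 on the test set. The two preparation steps the log reports for every GPT-2 run ("Set PAD token to EOS" and "Freezing model weights") would look roughly as follows outside the script; this is a minimal sketch assuming the stock GPT2ForSequenceClassification class rather than the custom one.

from transformers import AutoTokenizer, GPT2ForSequenceClassification

tokenizer = AutoTokenizer.from_pretrained("gpt2")
model = GPT2ForSequenceClassification.from_pretrained("gpt2", num_labels=2)

# GPT-2 ships without a PAD token, so reuse EOS (<|endoftext|>) for padding,
# as the "Set PAD token to EOS" log line indicates.
tokenizer.pad_token = tokenizer.eos_token
model.config.pad_token_id = tokenizer.eos_token_id

# --freeze_model: keep the pretrained transformer fixed and train only the head.
for param in model.transformer.parameters():
    param.requires_grad = False
trainable = [name for name, p in model.named_parameters() if p.requires_grad]  # only score.* remains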

RoBERTa

!python run_glue_no_trainer.py \
  --model_name_or_path roberta-base \
  --train_file data/train.json  \
  --validation_file data/valid.json \
  --test_file data/test.json \
  --per_device_train_batch_size 24 \
  --per_device_eval_batch_size 24 \
  --max_length 128 \
  --learning_rate 2e-5 \
  --num_train_epochs 1 \
  --output_dir out/tweet/roberta
02/16/2022 00:45:12 - INFO - __main__ - Distributed environment: NO
Num processes: 1
Process index: 0
Local process index: 0
Device: cpu
Use FP16 precision: False

02/16/2022 00:45:12 - WARNING - datasets.builder - Using custom data configuration default-67c9d932a627b7b8
02/16/2022 00:45:12 - WARNING - datasets.builder - Reusing dataset json (C:\Users\Foka\.cache\huggingface\datasets\json\default-67c9d932a627b7b8\0.0.0\c2d554c3377ea79c7664b93dc65d0803b45e3279000f993c7bfd18937fd7f426)

  0%|          | 0/3 [00:00<?, ?it/s]
100%|##########| 3/3 [00:00<00:00, 1503.87it/s]
loading configuration file https://huggingface.co/roberta-base/resolve/main/config.json from cache at C:\Users\Foka/.cache\huggingface\transformers\733bade19e5f0ce98e6531021dd5180994bb2f7b8bd7e80c7968805834ba351e.35205c6cfc956461d8515139f0f8dd5d207a2f336c0c3a83b4bc8dca3518e37b
Model config RobertaConfig {
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "transformers_version": "4.12.5",
  "type_vocab_size": 1,
  "use_cache": true,
  "vocab_size": 50265
}

Could not locate the tokenizer configuration file, will try to use the model config instead.
loading configuration file https://huggingface.co/roberta-base/resolve/main/config.json from cache at C:\Users\Foka/.cache\huggingface\transformers\733bade19e5f0ce98e6531021dd5180994bb2f7b8bd7e80c7968805834ba351e.35205c6cfc956461d8515139f0f8dd5d207a2f336c0c3a83b4bc8dca3518e37b
Model config RobertaConfig {
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "transformers_version": "4.12.5",
  "type_vocab_size": 1,
  "use_cache": true,
  "vocab_size": 50265
}

loading file https://huggingface.co/roberta-base/resolve/main/vocab.json from cache at C:\Users\Foka/.cache\huggingface\transformers\d3ccdbfeb9aaa747ef20432d4976c32ee3fa69663b379deb253ccfce2bb1fdc5.d67d6b367eb24ab43b08ad55e014cf254076934f71d832bbab9ad35644a375ab
loading file https://huggingface.co/roberta-base/resolve/main/merges.txt from cache at C:\Users\Foka/.cache\huggingface\transformers\cafdecc90fcab17011e12ac813dd574b4b3fea39da6dd817813efa010262ff3f.5d12962c5ee615a4c803841266e9c3be9a691a924f72d395d3a6c6c81157788b
loading file https://huggingface.co/roberta-base/resolve/main/tokenizer.json from cache at C:\Users\Foka/.cache\huggingface\transformers\d53fc0fa09b8342651efd4073d75e19617b3e51287c2a535becda5808a8db287.fc9576039592f026ad76a1c231b89aee8668488c671dfbe6616bab2ed298d730
loading file https://huggingface.co/roberta-base/resolve/main/added_tokens.json from cache at None
loading file https://huggingface.co/roberta-base/resolve/main/special_tokens_map.json from cache at None
loading file https://huggingface.co/roberta-base/resolve/main/tokenizer_config.json from cache at None
loading configuration file https://huggingface.co/roberta-base/resolve/main/config.json from cache at C:\Users\Foka/.cache\huggingface\transformers\733bade19e5f0ce98e6531021dd5180994bb2f7b8bd7e80c7968805834ba351e.35205c6cfc956461d8515139f0f8dd5d207a2f336c0c3a83b4bc8dca3518e37b
Model config RobertaConfig {
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "transformers_version": "4.12.5",
  "type_vocab_size": 1,
  "use_cache": true,
  "vocab_size": 50265
}

02/16/2022 00:45:18 - INFO - __main__ - Return hidden states from model: False
02/16/2022 00:45:18 - INFO - __main__ - Using implementation from: AutoModelForSequenceClassification
loading weights file https://huggingface.co/roberta-base/resolve/main/pytorch_model.bin from cache at C:\Users\Foka/.cache\huggingface\transformers\51ba668f7ff34e7cdfa9561e8361747738113878850a7d717dbc69de8683aaad.c7efaa30a0d80b2958b876969faa180e485944a849deee4ad482332de65365a7
Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'lm_head.bias', 'lm_head.layer_norm.bias', 'roberta.pooler.dense.weight', 'lm_head.decoder.weight', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.out_proj.weight', 'classifier.dense.weight', 'classifier.dense.bias', 'classifier.out_proj.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
02/16/2022 00:45:20 - WARNING - datasets.arrow_dataset - Loading cached processed dataset at C:\Users\Foka\.cache\huggingface\datasets\json\default-67c9d932a627b7b8\0.0.0\c2d554c3377ea79c7664b93dc65d0803b45e3279000f993c7bfd18937fd7f426\cache-9bed43ed70dc0bb2.arrow

Running tokenizer on dataset:   0%|          | 0/1 [00:00<?, ?ba/s]
Running tokenizer on dataset: 100%|##########| 1/1 [00:00<00:00, 35.81ba/s]
02/16/2022 00:45:20 - WARNING - datasets.arrow_dataset - Loading cached processed dataset at C:\Users\Foka\.cache\huggingface\datasets\json\default-67c9d932a627b7b8\0.0.0\c2d554c3377ea79c7664b93dc65d0803b45e3279000f993c7bfd18937fd7f426\cache-a7293927c8abf169.arrow
02/16/2022 00:45:20 - INFO - __main__ - Sample 528 of the training set: {'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'input_ids': [0, 9226, 269, 16, 103, 9869, 138, 47, 33, 89, 6, 4716, 1827, 8, 787, 12105, 157, 626, 4, 1437, 1437, 849, 22122, 991, 30619, 849, 21363, 46730, 219, 2], 'labels': 1}.
02/16/2022 00:45:20 - INFO - __main__ - Sample 3981 of the training set: {'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'input_ids': [0, 39398, 4056, 4333, 4056, 10674, 4056, 46, 849, 17693, 849, 16063, 1073, 5715, 849, 17827, 20168, 4183, 857, 299, 4, 35103, 849, 90, 25933, 849, 438, 4467, 849, 1794, 849, 28878, 16170, 849, 28481, 1794, 1437, 1437, 849, 28481, 3695, 4056, 7471, 4056, 18164, 1437, 2], 'labels': 0}.
02/16/2022 00:45:20 - INFO - __main__ - Sample 4184 of the training set: {'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'input_ids': [0, 2716, 18, 4076, 103, 849, 267, 6988, 428, 33726, 849, 1452, 10071, 849, 1452, 10071, 9029, 849, 4082, 5536, 11819, 849, 10393, 19347, 849, 37096, 1437, 1437, 849, 31518, 849, 1193, 366, 3695, 4056, 7471, 4056, 18164, 1437, 2], 'labels': 0}.
02/16/2022 00:45:21 - INFO - __main__ - ***** Running training *****
02/16/2022 00:45:21 - INFO - __main__ -   Num examples = 4742
02/16/2022 00:45:21 - INFO - __main__ -   Num Epochs = 1
02/16/2022 00:45:21 - INFO - __main__ -   Instantaneous batch size per device = 24
02/16/2022 00:45:21 - INFO - __main__ -   Total train batch size (w. parallel, distributed & accumulation) = 24
02/16/2022 00:45:21 - INFO - __main__ -   Gradient Accumulation steps = 1
02/16/2022 00:45:21 - INFO - __main__ -   Total optimization steps = 198

  0%|          | 0/198 [00:00<?, ?it/s]
100%|##########| 198/198 [10:41<00:00,  2.92s/it]
02/16/2022 00:56:30 - INFO - __main__ - Epoch 0: {'accuracy': 0.948}
02/16/2022 00:56:53 - INFO - __main__ - Test-set evaluation: {'accuracy': 0.942}
Configuration saved in out/tweet/roberta\config.json
Model weights saved in out/tweet/roberta\pytorch_model.bin
tokenizer config file saved in out/tweet/roberta\tokenizer_config.json
Special tokens file saved in out/tweet/roberta\special_tokens_map.json

100%|##########| 198/198 [11:32<00:00,  3.50s/it]
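
For roberta-base no custom head is involved: the log shows AutoModelForSequenceClassification, i.e. the pretrained encoder plus a freshly initialized classifier, fine-tuned end to end for one epoch, which yields 0.942 on the test set. The equivalent model setup in plain transformers code is roughly the sketch below (standard API usage, not the script itself).

from transformers import AutoTokenizer, AutoModelForSequenceClassification

tokenizer = AutoTokenizer.from_pretrained("roberta-base")
model = AutoModelForSequenceClassification.from_pretrained("roberta-base", num_labels=2)

# The warning above ("You should probably TRAIN this model...") refers to the
# classifier.dense / classifier.out_proj layers, which start from random weights
# and are what the 198 fine-tuning steps actually fit.
batch = tokenizer(["example tweet text"], truncation=True, max_length=128, return_tensors="pt")
logits = model(**batch).logits  # shape [1, 2]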

RoBERTa version 2

!python run_glue_no_trainer.py \
  --model_name_or_path roberta-base \
  --train_file data/train.json  \
  --validation_file data/valid.json \
  --test_file data/test.json \
  --per_device_train_batch_size 24 \
  --per_device_eval_batch_size 24 \
  --max_length 128 \
  --freeze_model \
  --learning_rate 2e-5 \
  --num_train_epochs 1 \
  --output_dir out/tweet/roberta_version_2
02/16/2022 00:56:55 - INFO - __main__ - Distributed environment: NO
Num processes: 1
Process index: 0
Local process index: 0
Device: cpu
Use FP16 precision: False

02/16/2022 00:56:56 - WARNING - datasets.builder - Using custom data configuration default-67c9d932a627b7b8
02/16/2022 00:56:56 - WARNING - datasets.builder - Reusing dataset json (C:\Users\Foka\.cache\huggingface\datasets\json\default-67c9d932a627b7b8\0.0.0\c2d554c3377ea79c7664b93dc65d0803b45e3279000f993c7bfd18937fd7f426)

  0%|          | 0/3 [00:00<?, ?it/s]
100%|##########| 3/3 [00:00<00:00, 1504.59it/s]
loading configuration file https://huggingface.co/roberta-base/resolve/main/config.json from cache at C:\Users\Foka/.cache\huggingface\transformers\733bade19e5f0ce98e6531021dd5180994bb2f7b8bd7e80c7968805834ba351e.35205c6cfc956461d8515139f0f8dd5d207a2f336c0c3a83b4bc8dca3518e37b
Model config RobertaConfig {
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "transformers_version": "4.12.5",
  "type_vocab_size": 1,
  "use_cache": true,
  "vocab_size": 50265
}

Could not locate the tokenizer configuration file, will try to use the model config instead.
loading configuration file https://huggingface.co/roberta-base/resolve/main/config.json from cache at C:\Users\Foka/.cache\huggingface\transformers\733bade19e5f0ce98e6531021dd5180994bb2f7b8bd7e80c7968805834ba351e.35205c6cfc956461d8515139f0f8dd5d207a2f336c0c3a83b4bc8dca3518e37b
Model config RobertaConfig {
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "transformers_version": "4.12.5",
  "type_vocab_size": 1,
  "use_cache": true,
  "vocab_size": 50265
}

loading file https://huggingface.co/roberta-base/resolve/main/vocab.json from cache at C:\Users\Foka/.cache\huggingface\transformers\d3ccdbfeb9aaa747ef20432d4976c32ee3fa69663b379deb253ccfce2bb1fdc5.d67d6b367eb24ab43b08ad55e014cf254076934f71d832bbab9ad35644a375ab
loading file https://huggingface.co/roberta-base/resolve/main/merges.txt from cache at C:\Users\Foka/.cache\huggingface\transformers\cafdecc90fcab17011e12ac813dd574b4b3fea39da6dd817813efa010262ff3f.5d12962c5ee615a4c803841266e9c3be9a691a924f72d395d3a6c6c81157788b
loading file https://huggingface.co/roberta-base/resolve/main/tokenizer.json from cache at C:\Users\Foka/.cache\huggingface\transformers\d53fc0fa09b8342651efd4073d75e19617b3e51287c2a535becda5808a8db287.fc9576039592f026ad76a1c231b89aee8668488c671dfbe6616bab2ed298d730
loading file https://huggingface.co/roberta-base/resolve/main/added_tokens.json from cache at None
loading file https://huggingface.co/roberta-base/resolve/main/special_tokens_map.json from cache at None
loading file https://huggingface.co/roberta-base/resolve/main/tokenizer_config.json from cache at None
loading configuration file https://huggingface.co/roberta-base/resolve/main/config.json from cache at C:\Users\Foka/.cache\huggingface\transformers\733bade19e5f0ce98e6531021dd5180994bb2f7b8bd7e80c7968805834ba351e.35205c6cfc956461d8515139f0f8dd5d207a2f336c0c3a83b4bc8dca3518e37b
Model config RobertaConfig {
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "transformers_version": "4.12.5",
  "type_vocab_size": 1,
  "use_cache": true,
  "vocab_size": 50265
}

02/16/2022 00:57:02 - INFO - __main__ - Return hidden states from model: False
02/16/2022 00:57:02 - INFO - __main__ - Using implementation from: AutoModelForSequenceClassification
loading weights file https://huggingface.co/roberta-base/resolve/main/pytorch_model.bin from cache at C:\Users\Foka/.cache\huggingface\transformers\51ba668f7ff34e7cdfa9561e8361747738113878850a7d717dbc69de8683aaad.c7efaa30a0d80b2958b876969faa180e485944a849deee4ad482332de65365a7
Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.dense.bias', 'lm_head.decoder.weight', 'lm_head.layer_norm.weight', 'roberta.pooler.dense.bias', 'lm_head.bias', 'roberta.pooler.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.out_proj.weight', 'classifier.out_proj.bias', 'classifier.dense.bias', 'classifier.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
02/16/2022 00:57:03 - INFO - __main__ - Freezing model weights
02/16/2022 00:57:03 - WARNING - datasets.arrow_dataset - Loading cached processed dataset at C:\Users\Foka\.cache\huggingface\datasets\json\default-67c9d932a627b7b8\0.0.0\c2d554c3377ea79c7664b93dc65d0803b45e3279000f993c7bfd18937fd7f426\cache-12e8873686c6be8d.arrow

Running tokenizer on dataset:   0%|          | 0/1 [00:00<?, ?ba/s]
Running tokenizer on dataset: 100%|##########| 1/1 [00:00<00:00, 35.81ba/s]
02/16/2022 00:57:03 - WARNING - datasets.arrow_dataset - Loading cached processed dataset at C:\Users\Foka\.cache\huggingface\datasets\json\default-67c9d932a627b7b8\0.0.0\c2d554c3377ea79c7664b93dc65d0803b45e3279000f993c7bfd18937fd7f426\cache-6af3944f94b779cb.arrow
02/16/2022 00:57:03 - INFO - __main__ - Sample 2678 of the training set: {'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'input_ids': [0, 118, 437, 98, 1437, 1437, 8, 849, 6504, 39264, 122, 14, 111, 849, 3707, 9856, 1635, 1437, 2], 'labels': 0}.
02/16/2022 00:57:03 - INFO - __main__ - Sample 1289 of the training set: {'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'input_ids': [0, 1322, 47, 849, 14178, 359, 3914, 131, 619, 101, 952, 7258, 4056, 7471, 4056, 18164, 32, 30309, 154, 15, 47, 116, 4161, 1437, 849, 4903, 21210, 849, 90, 20564, 849, 119, 40879, 3695, 4056, 7471, 4056, 18164, 1437, 2], 'labels': 1}.
02/16/2022 00:57:03 - INFO - __main__ - Sample 2660 of the training set: {'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'input_ids': [0, 21714, 3308, 7512, 13, 127, 15382, 186, 11, 885, 4575, 1437, 1437, 2], 'labels': 0}.
02/16/2022 00:57:04 - INFO - __main__ - ***** Running training *****
02/16/2022 00:57:04 - INFO - __main__ -   Num examples = 4742
02/16/2022 00:57:04 - INFO - __main__ -   Num Epochs = 1
02/16/2022 00:57:04 - INFO - __main__ -   Instantaneous batch size per device = 24
02/16/2022 00:57:04 - INFO - __main__ -   Total train batch size (w. parallel, distributed & accumulation) = 24
02/16/2022 00:57:04 - INFO - __main__ -   Gradient Accumulation steps = 1
02/16/2022 00:57:04 - INFO - __main__ -   Total optimization steps = 198
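(For reference: 4742 training examples with a per-device batch size of 24 and no gradient accumulation give ceil(4742 / 24) = 198 optimization steps per epoch.)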

  0%|          | 0/198 [00:00<?, ?it/s]
100%|##########| 198/198 [03:42<00:00,  1.05s/it]
02/16/2022 01:01:14 - INFO - __main__ - Epoch 0: {'accuracy': 0.938}
02/16/2022 01:01:37 - INFO - __main__ - Test-set evaluation: {'accuracy': 1.0}
Configuration saved in out/tweet/roberta_version_2\config.json
Model weights saved in out/tweet/roberta_version_2\pytorch_model.bin
tokenizer config file saved in out/tweet/roberta_version_2\tokenizer_config.json
Special tokens file saved in out/tweet/roberta_version_2\special_tokens_map.json

100%|##########| 198/198 [04:33<00:00,  1.38s/it]

RoBERTa version 3
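
Version 3 keeps the frozen encoder and additionally passes --custom_model, which (per the log below) swaps the standard head for RobertaForSequenceClassificationCustomAlternative, with extra classifier layers dense_1_input, dense_1_hidden, dense_2 and out_proj. The real architecture is defined alongside run_glue_no_trainer.py; the sketch below only illustrates the general shape of such a deeper head, with layer names taken from the initialization log and all sizes and wiring assumed:

import torch.nn as nn

class CustomClassificationHeadSketch(nn.Module):
    # Layer names follow the initialization log below; dimensions and wiring are assumptions.
    def __init__(self, hidden_size=768, num_labels=2, dropout=0.1):
        super().__init__()
        self.dense_1_input = nn.Linear(hidden_size, hidden_size)
        self.dense_1_hidden = nn.Linear(hidden_size, hidden_size)
        self.dense_2 = nn.Linear(hidden_size, hidden_size)
        self.out_proj = nn.Linear(hidden_size, num_labels)
        self.dropout = nn.Dropout(dropout)
        self.act = nn.Tanh()

    def forward(self, features, hidden_features=None):
        # features: the <s> token representation from the last encoder layer, (batch, hidden_size)
        x = self.act(self.dense_1_input(self.dropout(features)))
        if hidden_features is not None:
            # Version 4 below presumably feeds a summary of intermediate hidden states here.
            x = x + self.act(self.dense_1_hidden(self.dropout(hidden_features)))
        x = self.act(self.dense_2(self.dropout(x)))
        return self.out_proj(self.dropout(x))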

!python run_glue_no_trainer.py \
  --model_name_or_path roberta-base \
  --train_file data/train.json  \
  --validation_file data/valid.json \
  --test_file data/test.json \
  --per_device_train_batch_size 24 \
  --per_device_eval_batch_size 24 \
  --max_length 128 \
  --freeze_model \
  --custom_model \
  --learning_rate 2e-5 \
  --num_train_epochs 1 \
  --output_dir out/tweet/roberta_version_3
02/16/2022 01:01:39 - INFO - __main__ - Distributed environment: NO
Num processes: 1
Process index: 0
Local process index: 0
Device: cpu
Use FP16 precision: False

02/16/2022 01:01:40 - WARNING - datasets.builder - Using custom data configuration default-67c9d932a627b7b8
02/16/2022 01:01:40 - WARNING - datasets.builder - Reusing dataset json (C:\Users\Foka\.cache\huggingface\datasets\json\default-67c9d932a627b7b8\0.0.0\c2d554c3377ea79c7664b93dc65d0803b45e3279000f993c7bfd18937fd7f426)

  0%|          | 0/3 [00:00<?, ?it/s]
100%|##########| 3/3 [00:00<00:00, 1507.66it/s]
loading configuration file https://huggingface.co/roberta-base/resolve/main/config.json from cache at C:\Users\Foka/.cache\huggingface\transformers\733bade19e5f0ce98e6531021dd5180994bb2f7b8bd7e80c7968805834ba351e.35205c6cfc956461d8515139f0f8dd5d207a2f336c0c3a83b4bc8dca3518e37b
Model config RobertaConfig {
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "transformers_version": "4.12.5",
  "type_vocab_size": 1,
  "use_cache": true,
  "vocab_size": 50265
}

Could not locate the tokenizer configuration file, will try to use the model config instead.
loading configuration file https://huggingface.co/roberta-base/resolve/main/config.json from cache at C:\Users\Foka/.cache\huggingface\transformers\733bade19e5f0ce98e6531021dd5180994bb2f7b8bd7e80c7968805834ba351e.35205c6cfc956461d8515139f0f8dd5d207a2f336c0c3a83b4bc8dca3518e37b
Model config RobertaConfig {
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "transformers_version": "4.12.5",
  "type_vocab_size": 1,
  "use_cache": true,
  "vocab_size": 50265
}

loading file https://huggingface.co/roberta-base/resolve/main/vocab.json from cache at C:\Users\Foka/.cache\huggingface\transformers\d3ccdbfeb9aaa747ef20432d4976c32ee3fa69663b379deb253ccfce2bb1fdc5.d67d6b367eb24ab43b08ad55e014cf254076934f71d832bbab9ad35644a375ab
loading file https://huggingface.co/roberta-base/resolve/main/merges.txt from cache at C:\Users\Foka/.cache\huggingface\transformers\cafdecc90fcab17011e12ac813dd574b4b3fea39da6dd817813efa010262ff3f.5d12962c5ee615a4c803841266e9c3be9a691a924f72d395d3a6c6c81157788b
loading file https://huggingface.co/roberta-base/resolve/main/tokenizer.json from cache at C:\Users\Foka/.cache\huggingface\transformers\d53fc0fa09b8342651efd4073d75e19617b3e51287c2a535becda5808a8db287.fc9576039592f026ad76a1c231b89aee8668488c671dfbe6616bab2ed298d730
loading file https://huggingface.co/roberta-base/resolve/main/added_tokens.json from cache at None
loading file https://huggingface.co/roberta-base/resolve/main/special_tokens_map.json from cache at None
loading file https://huggingface.co/roberta-base/resolve/main/tokenizer_config.json from cache at None
loading configuration file https://huggingface.co/roberta-base/resolve/main/config.json from cache at C:\Users\Foka/.cache\huggingface\transformers\733bade19e5f0ce98e6531021dd5180994bb2f7b8bd7e80c7968805834ba351e.35205c6cfc956461d8515139f0f8dd5d207a2f336c0c3a83b4bc8dca3518e37b
Model config RobertaConfig {
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "transformers_version": "4.12.5",
  "type_vocab_size": 1,
  "use_cache": true,
  "vocab_size": 50265
}

02/16/2022 01:01:46 - INFO - __main__ - Return hidden states from model: False
02/16/2022 01:01:46 - INFO - __main__ - Using implementation from: RobertaForSequenceClassificationCustomAlternative
loading weights file https://huggingface.co/roberta-base/resolve/main/pytorch_model.bin from cache at C:\Users\Foka/.cache\huggingface\transformers\51ba668f7ff34e7cdfa9561e8361747738113878850a7d717dbc69de8683aaad.c7efaa30a0d80b2958b876969faa180e485944a849deee4ad482332de65365a7
Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassificationCustomAlternative: ['lm_head.dense.weight', 'lm_head.layer_norm.bias', 'roberta.pooler.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.bias', 'roberta.pooler.dense.weight', 'lm_head.decoder.weight', 'lm_head.dense.bias']
- This IS expected if you are initializing RobertaForSequenceClassificationCustomAlternative from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassificationCustomAlternative from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassificationCustomAlternative were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.out_proj.bias', 'classifier.dense_2.weight', 'classifier.dense_1_input.weight', 'classifier.dense_2.bias', 'classifier.out_proj.weight', 'classifier.dense_1_hidden.bias', 'classifier.dense_1_input.bias', 'classifier.dense_1_hidden.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
02/16/2022 01:01:48 - INFO - __main__ - Freezing model weights
02/16/2022 01:01:48 - WARNING - datasets.arrow_dataset - Loading cached processed dataset at C:\Users\Foka\.cache\huggingface\datasets\json\default-67c9d932a627b7b8\0.0.0\c2d554c3377ea79c7664b93dc65d0803b45e3279000f993c7bfd18937fd7f426\cache-ba2b749ff70d20c2.arrow

Running tokenizer on dataset:   0%|          | 0/1 [00:00<?, ?ba/s]
Running tokenizer on dataset: 100%|##########| 1/1 [00:00<00:00, 29.49ba/s]
02/16/2022 01:01:48 - WARNING - datasets.arrow_dataset - Loading cached processed dataset at C:\Users\Foka\.cache\huggingface\datasets\json\default-67c9d932a627b7b8\0.0.0\c2d554c3377ea79c7664b93dc65d0803b45e3279000f993c7bfd18937fd7f426\cache-376097e0887bad71.arrow
02/16/2022 01:01:48 - INFO - __main__ - Sample 4466 of the training set: {'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'input_ids': [0, 8338, 365, 849, 11970, 409, 31, 3970, 727, 849, 28481, 268, 15, 849, 48056, 939, 437, 98, 1437, 1437, 849, 8656, 849, 8656, 254, 849, 45864, 849, 26949, 8585, 849, 12689, 627, 17693, 2], 'labels': 0}.
02/16/2022 01:01:48 - INFO - __main__ - Sample 979 of the training set: {'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'input_ids': [0, 1039, 12105, 849, 18897, 2527, 718, 5, 4117, 12, 267, 13760, 4289, 16, 7, 6876, 14, 952, 7258, 4056, 7471, 4056, 48, 405, 531, 33, 57, 16, 354, 4, 3695, 4056, 7471, 4056, 46, 1437, 952, 7258, 4056, 7471, 4056, 18164, 1437, 2], 'labels': 1}.
02/16/2022 01:01:48 - INFO - __main__ - Sample 2927 of the training set: {'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'input_ids': [0, 1039, 12105, 77, 16, 5, 92, 2642, 145, 703, 59, 787, 12105, 8, 110, 1108, 62, 116, 1437, 1437, 849, 17693, 1843, 10339, 4489, 849, 10120, 571, 5434, 2], 'labels': 0}.
02/16/2022 01:01:49 - INFO - __main__ - ***** Running training *****
02/16/2022 01:01:49 - INFO - __main__ -   Num examples = 4742
02/16/2022 01:01:49 - INFO - __main__ -   Num Epochs = 1
02/16/2022 01:01:49 - INFO - __main__ -   Instantaneous batch size per device = 24
02/16/2022 01:01:49 - INFO - __main__ -   Total train batch size (w. parallel, distributed & accumulation) = 24
02/16/2022 01:01:49 - INFO - __main__ -   Gradient Accumulation steps = 1
02/16/2022 01:01:49 - INFO - __main__ -   Total optimization steps = 198

  0%|          | 0/198 [00:00<?, ?it/s]
100%|##########| 198/198 [03:49<00:00,  1.19it/s]
02/16/2022 01:06:06 - INFO - __main__ - Epoch 0: {'accuracy': 0.938}
02/16/2022 01:06:29 - INFO - __main__ - Test-set evaluation: {'accuracy': 1.0}
Configuration saved in out/tweet/roberta_version_3\config.json
Model weights saved in out/tweet/roberta_version_3\pytorch_model.bin
tokenizer config file saved in out/tweet/roberta_version_3\tokenizer_config.json
Special tokens file saved in out/tweet/roberta_version_3\special_tokens_map.json

100%|##########| 198/198 [04:40<00:00,  1.42s/it]

RoBERTa version 4
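
Version 4 adds --return_hidden_states on top of the frozen, custom-head setup; the log below reports "Return hidden states from model: True", so the classifier can also look at intermediate encoder layers instead of only the final one. How the layers are combined is defined in the notebook's code; one common way to summarize them (mean of the <s> token over all layers), shown purely as an assumed illustration, is:

import torch
from transformers import AutoModel, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("roberta-base")
encoder = AutoModel.from_pretrained("roberta-base")

batch = tokenizer(["example tweet"], return_tensors="pt")
outputs = encoder(**batch, output_hidden_states=True)

# outputs.hidden_states is a tuple of 13 tensors (embeddings + 12 layers),
# each of shape (batch, seq_len, 768).
cls_per_layer = torch.stack([h[:, 0, :] for h in outputs.hidden_states], dim=0)
hidden_summary = cls_per_layer.mean(dim=0)  # (batch, 768); could feed dense_1_hidden in the sketch above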

!python run_glue_no_trainer.py \
  --model_name_or_path roberta-base \
  --train_file data/train.json  \
  --validation_file data/valid.json \
  --test_file data/test.json \
  --per_device_train_batch_size 24 \
  --per_device_eval_batch_size 24 \
  --max_length 128 \
  --freeze_model \
  --custom_model \
  --return_hidden_states \
  --learning_rate 2e-5 \
  --num_train_epochs 1 \
  --output_dir out/tweet/roberta_version_4
02/16/2022 01:06:31 - INFO - __main__ - Distributed environment: NO
Num processes: 1
Process index: 0
Local process index: 0
Device: cpu
Use FP16 precision: False

02/16/2022 01:06:32 - WARNING - datasets.builder - Using custom data configuration default-67c9d932a627b7b8
02/16/2022 01:06:32 - WARNING - datasets.builder - Reusing dataset json (C:\Users\Foka\.cache\huggingface\datasets\json\default-67c9d932a627b7b8\0.0.0\c2d554c3377ea79c7664b93dc65d0803b45e3279000f993c7bfd18937fd7f426)

  0%|          | 0/3 [00:00<?, ?it/s]
100%|##########| 3/3 [00:00<00:00, 1507.84it/s]
loading configuration file https://huggingface.co/roberta-base/resolve/main/config.json from cache at C:\Users\Foka/.cache\huggingface\transformers\733bade19e5f0ce98e6531021dd5180994bb2f7b8bd7e80c7968805834ba351e.35205c6cfc956461d8515139f0f8dd5d207a2f336c0c3a83b4bc8dca3518e37b
Model config RobertaConfig {
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "transformers_version": "4.12.5",
  "type_vocab_size": 1,
  "use_cache": true,
  "vocab_size": 50265
}

Could not locate the tokenizer configuration file, will try to use the model config instead.
loading configuration file https://huggingface.co/roberta-base/resolve/main/config.json from cache at C:\Users\Foka/.cache\huggingface\transformers\733bade19e5f0ce98e6531021dd5180994bb2f7b8bd7e80c7968805834ba351e.35205c6cfc956461d8515139f0f8dd5d207a2f336c0c3a83b4bc8dca3518e37b
Model config RobertaConfig {
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "transformers_version": "4.12.5",
  "type_vocab_size": 1,
  "use_cache": true,
  "vocab_size": 50265
}

loading file https://huggingface.co/roberta-base/resolve/main/vocab.json from cache at C:\Users\Foka/.cache\huggingface\transformers\d3ccdbfeb9aaa747ef20432d4976c32ee3fa69663b379deb253ccfce2bb1fdc5.d67d6b367eb24ab43b08ad55e014cf254076934f71d832bbab9ad35644a375ab
loading file https://huggingface.co/roberta-base/resolve/main/merges.txt from cache at C:\Users\Foka/.cache\huggingface\transformers\cafdecc90fcab17011e12ac813dd574b4b3fea39da6dd817813efa010262ff3f.5d12962c5ee615a4c803841266e9c3be9a691a924f72d395d3a6c6c81157788b
loading file https://huggingface.co/roberta-base/resolve/main/tokenizer.json from cache at C:\Users\Foka/.cache\huggingface\transformers\d53fc0fa09b8342651efd4073d75e19617b3e51287c2a535becda5808a8db287.fc9576039592f026ad76a1c231b89aee8668488c671dfbe6616bab2ed298d730
loading file https://huggingface.co/roberta-base/resolve/main/added_tokens.json from cache at None
loading file https://huggingface.co/roberta-base/resolve/main/special_tokens_map.json from cache at None
loading file https://huggingface.co/roberta-base/resolve/main/tokenizer_config.json from cache at None
loading configuration file https://huggingface.co/roberta-base/resolve/main/config.json from cache at C:\Users\Foka/.cache\huggingface\transformers\733bade19e5f0ce98e6531021dd5180994bb2f7b8bd7e80c7968805834ba351e.35205c6cfc956461d8515139f0f8dd5d207a2f336c0c3a83b4bc8dca3518e37b
Model config RobertaConfig {
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "transformers_version": "4.12.5",
  "type_vocab_size": 1,
  "use_cache": true,
  "vocab_size": 50265
}

02/16/2022 01:06:38 - INFO - __main__ - Return hidden states from model: True
02/16/2022 01:06:38 - INFO - __main__ - Using implementation from: RobertaForSequenceClassificationCustomAlternative
loading weights file https://huggingface.co/roberta-base/resolve/main/pytorch_model.bin from cache at C:\Users\Foka/.cache\huggingface\transformers\51ba668f7ff34e7cdfa9561e8361747738113878850a7d717dbc69de8683aaad.c7efaa30a0d80b2958b876969faa180e485944a849deee4ad482332de65365a7
Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassificationCustomAlternative: ['lm_head.decoder.weight', 'roberta.pooler.dense.bias', 'lm_head.bias', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.bias', 'lm_head.dense.weight', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassificationCustomAlternative from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassificationCustomAlternative from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassificationCustomAlternative were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense_1_hidden.weight', 'classifier.dense_1_hidden.bias', 'classifier.dense_1_input.bias', 'classifier.dense_2.bias', 'classifier.dense_2.weight', 'classifier.out_proj.weight', 'classifier.dense_1_input.weight', 'classifier.out_proj.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
02/16/2022 01:06:40 - INFO - __main__ - Freezing model weights
02/16/2022 01:06:40 - WARNING - datasets.arrow_dataset - Loading cached processed dataset at C:\Users\Foka\.cache\huggingface\datasets\json\default-67c9d932a627b7b8\0.0.0\c2d554c3377ea79c7664b93dc65d0803b45e3279000f993c7bfd18937fd7f426\cache-73165df4ba3ef6cf.arrow

Running tokenizer on dataset:   0%|          | 0/1 [00:00<?, ?ba/s]
Running tokenizer on dataset: 100%|##########| 1/1 [00:00<00:00, 31.33ba/s]
02/16/2022 01:06:40 - WARNING - datasets.arrow_dataset - Loading cached processed dataset at C:\Users\Foka\.cache\huggingface\datasets\json\default-67c9d932a627b7b8\0.0.0\c2d554c3377ea79c7664b93dc65d0803b45e3279000f993c7bfd18937fd7f426\cache-015ce493f6b049f3.arrow
02/16/2022 01:06:40 - INFO - __main__ - Sample 3979 of the training set: {'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'input_ids': [0, 1039, 12105, 787, 12105, 787, 12105, 787, 12105, 787, 12105, 45365, 5, 2526, 9, 84, 184, 1269, 4, 1437, 1437, 2], 'labels': 0}.
02/16/2022 01:06:40 - INFO - __main__ - Sample 2415 of the training set: {'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'input_ids': [0, 10669, 99, 84, 247, 439, 149, 42, 94, 76, 7, 192, 82, 836, 10, 22, 27076, 113, 7, 5, 4773, 359, 3914, 131, 283, 259, 13, 960, 53, 1037, 1437, 1437, 2], 'labels': 0}.
02/16/2022 01:06:40 - INFO - __main__ - Sample 2136 of the training set: {'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'input_ids': [0, 1039, 12105, 849, 41468, 1809, 4473, 20126, 849, 41468, 1809, 4742, 21929, 1809, 849, 41468, 1809, 119, 1350, 90, 428, 4759, 415, 596, 1437, 849, 31336, 28465, 16, 8266, 1437, 787, 12105, 2], 'labels': 1}.
02/16/2022 01:06:41 - INFO - __main__ - ***** Running training *****
02/16/2022 01:06:41 - INFO - __main__ -   Num examples = 4742
02/16/2022 01:06:41 - INFO - __main__ -   Num Epochs = 1
02/16/2022 01:06:41 - INFO - __main__ -   Instantaneous batch size per device = 24
02/16/2022 01:06:41 - INFO - __main__ -   Total train batch size (w. parallel, distributed & accumulation) = 24
02/16/2022 01:06:41 - INFO - __main__ -   Gradient Accumulation steps = 1
02/16/2022 01:06:41 - INFO - __main__ -   Total optimization steps = 198

  0%|          | 0/198 [00:00<?, ?it/s]
100%|##########| 198/198 [03:49<00:00,  1.27it/s]
02/16/2022 01:10:58 - INFO - __main__ - Epoch 0: {'accuracy': 0.938}
02/16/2022 01:11:22 - INFO - __main__ - Test-set evaluation: {'accuracy': 1.0}
Configuration saved in out/tweet/roberta_version_4\config.json
Model weights saved in out/tweet/roberta_version_4\pytorch_model.bin
tokenizer config file saved in out/tweet/roberta_version_4\tokenizer_config.json
Special tokens file saved in out/tweet/roberta_version_4\special_tokens_map.json

100%|##########| 198/198 [04:40<00:00,  1.42s/it]

T5
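
The run below reuses the (locally adapted) translation example script, so the labelled tweets first have to be rewritten as source/target text pairs. A minimal conversion sketch is shown here; the input field names (tweet, label), the nested translation keys and the verbalized label texts are assumptions and may differ from the exact format the adapted run_translation_no_trainer.py expects.

import json

# Hypothetical label verbalization - the actual target strings used to build
# data/translations-*.json may differ.
LABEL_TEXT = {0: "no hate", 1: "hate"}

def to_seq2seq(src_path, dst_path):
    # Rewrites a JSON-lines classification file (assumed fields: "tweet", "label")
    # into translation-style source/target pairs for the seq2seq script.
    with open(src_path, encoding="utf-8") as src, open(dst_path, "w", encoding="utf-8") as dst:
        for line in src:
            row = json.loads(line)
            pair = {"translation": {"tweet": row["tweet"],
                                    "label": LABEL_TEXT[int(row["label"])]}}
            dst.write(json.dumps(pair, ensure_ascii=False) + "\n")

to_seq2seq("data/train.json", "data/translations-train.json")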

!python run_translation_no_trainer.py \
  --model_name_or_path t5-small \
  --train_file data/translations-train.json \
  --validation_file data/translations-valid.json \
  --test_file data/translations-test.json \
  --per_device_train_batch_size 16 \
  --per_device_eval_batch_size 16 \
  --source_prefix "tweet classification" \
  --max_source_length 256 \
  --max_target_length 128 \
  --max_length 128 \
  --num_train_epochs 1 \
  --output_dir out/tweet/t5
02/17/2022 17:13:52 - INFO - __main__ - Distributed environment: NO
Num processes: 1
Process index: 0
Local process index: 0
Device: cpu
Use FP16 precision: False

02/17/2022 17:13:53 - WARNING - datasets.builder - Using custom data configuration default-c1907d9305fb2fbb
02/17/2022 17:13:53 - WARNING - datasets.builder - Reusing dataset json (C:\Users\Foka\.cache\huggingface\datasets\json\default-c1907d9305fb2fbb\0.0.0\c2d554c3377ea79c7664b93dc65d0803b45e3279000f993c7bfd18937fd7f426)

  0%|          | 0/3 [00:00<?, ?it/s]
100%|##########| 3/3 [00:00<00:00, 143.23it/s]
loading configuration file https://huggingface.co/t5-small/resolve/main/config.json from cache at C:\Users\Foka/.cache\huggingface\transformers\fe501e8fd6425b8ec93df37767fcce78ce626e34cc5edc859c662350cf712e41.406701565c0afd9899544c1cb8b93185a76f00b31e5ce7f6e18bbaef02241985
Model config T5Config {
  "architectures": [
    "T5WithLMHeadModel"
  ],
  "d_ff": 2048,
  "d_kv": 64,
  "d_model": 512,
  "decoder_start_token_id": 0,
  "dropout_rate": 0.1,
  "eos_token_id": 1,
  "feed_forward_proj": "relu",
  "initializer_factor": 1.0,
  "is_encoder_decoder": true,
  "layer_norm_epsilon": 1e-06,
  "model_type": "t5",
  "n_positions": 512,
  "num_decoder_layers": 6,
  "num_heads": 8,
  "num_layers": 6,
  "output_past": true,
  "pad_token_id": 0,
  "relative_attention_num_buckets": 32,
  "task_specific_params": {
    "summarization": {
      "early_stopping": true,
      "length_penalty": 2.0,
      "max_length": 200,
      "min_length": 30,
      "no_repeat_ngram_size": 3,
      "num_beams": 4,
      "prefix": "summarize: "
    },
    "translation_en_to_de": {
      "early_stopping": true,
      "max_length": 300,
      "num_beams": 4,
      "prefix": "translate English to German: "
    },
    "translation_en_to_fr": {
      "early_stopping": true,
      "max_length": 300,
      "num_beams": 4,
      "prefix": "translate English to French: "
    },
    "translation_en_to_ro": {
      "early_stopping": true,
      "max_length": 300,
      "num_beams": 4,
      "prefix": "translate English to Romanian: "
    }
  },
  "transformers_version": "4.12.5",
  "use_cache": true,
  "vocab_size": 32128
}

Could not locate the tokenizer configuration file, will try to use the model config instead.
loading configuration file https://huggingface.co/t5-small/resolve/main/config.json from cache at C:\Users\Foka/.cache\huggingface\transformers\fe501e8fd6425b8ec93df37767fcce78ce626e34cc5edc859c662350cf712e41.406701565c0afd9899544c1cb8b93185a76f00b31e5ce7f6e18bbaef02241985
Model config T5Config {
  "architectures": [
    "T5WithLMHeadModel"
  ],
  "d_ff": 2048,
  "d_kv": 64,
  "d_model": 512,
  "decoder_start_token_id": 0,
  "dropout_rate": 0.1,
  "eos_token_id": 1,
  "feed_forward_proj": "relu",
  "initializer_factor": 1.0,
  "is_encoder_decoder": true,
  "layer_norm_epsilon": 1e-06,
  "model_type": "t5",
  "n_positions": 512,
  "num_decoder_layers": 6,
  "num_heads": 8,
  "num_layers": 6,
  "output_past": true,
  "pad_token_id": 0,
  "relative_attention_num_buckets": 32,
  "task_specific_params": {
    "summarization": {
      "early_stopping": true,
      "length_penalty": 2.0,
      "max_length": 200,
      "min_length": 30,
      "no_repeat_ngram_size": 3,
      "num_beams": 4,
      "prefix": "summarize: "
    },
    "translation_en_to_de": {
      "early_stopping": true,
      "max_length": 300,
      "num_beams": 4,
      "prefix": "translate English to German: "
    },
    "translation_en_to_fr": {
      "early_stopping": true,
      "max_length": 300,
      "num_beams": 4,
      "prefix": "translate English to French: "
    },
    "translation_en_to_ro": {
      "early_stopping": true,
      "max_length": 300,
      "num_beams": 4,
      "prefix": "translate English to Romanian: "
    }
  },
  "transformers_version": "4.12.5",
  "use_cache": true,
  "vocab_size": 32128
}

loading file https://huggingface.co/t5-small/resolve/main/spiece.model from cache at C:\Users\Foka/.cache\huggingface\transformers\65fc04e21f45f61430aea0c4fedffac16a4d20d78b8e6601d8d996ebefefecd2.3b69006860e7b5d0a63ffdddc01ddcd6b7c318a6f4fd793596552c741734c62d
loading file https://huggingface.co/t5-small/resolve/main/tokenizer.json from cache at C:\Users\Foka/.cache\huggingface\transformers\06779097c78e12f47ef67ecb728810c2ae757ee0a9efe9390c6419783d99382d.8627f1bd5d270a9fd2e5a51c8bec3223896587cc3cfe13edeabb0992ab43c529
loading file https://huggingface.co/t5-small/resolve/main/added_tokens.json from cache at None
loading file https://huggingface.co/t5-small/resolve/main/special_tokens_map.json from cache at None
loading file https://huggingface.co/t5-small/resolve/main/tokenizer_config.json from cache at None
loading configuration file https://huggingface.co/t5-small/resolve/main/config.json from cache at C:\Users\Foka/.cache\huggingface\transformers\fe501e8fd6425b8ec93df37767fcce78ce626e34cc5edc859c662350cf712e41.406701565c0afd9899544c1cb8b93185a76f00b31e5ce7f6e18bbaef02241985
Model config T5Config {
  "architectures": [
    "T5WithLMHeadModel"
  ],
  "d_ff": 2048,
  "d_kv": 64,
  "d_model": 512,
  "decoder_start_token_id": 0,
  "dropout_rate": 0.1,
  "eos_token_id": 1,
  "feed_forward_proj": "relu",
  "initializer_factor": 1.0,
  "is_encoder_decoder": true,
  "layer_norm_epsilon": 1e-06,
  "model_type": "t5",
  "n_positions": 512,
  "num_decoder_layers": 6,
  "num_heads": 8,
  "num_layers": 6,
  "output_past": true,
  "pad_token_id": 0,
  "relative_attention_num_buckets": 32,
  "task_specific_params": {
    "summarization": {
      "early_stopping": true,
      "length_penalty": 2.0,
      "max_length": 200,
      "min_length": 30,
      "no_repeat_ngram_size": 3,
      "num_beams": 4,
      "prefix": "summarize: "
    },
    "translation_en_to_de": {
      "early_stopping": true,
      "max_length": 300,
      "num_beams": 4,
      "prefix": "translate English to German: "
    },
    "translation_en_to_fr": {
      "early_stopping": true,
      "max_length": 300,
      "num_beams": 4,
      "prefix": "translate English to French: "
    },
    "translation_en_to_ro": {
      "early_stopping": true,
      "max_length": 300,
      "num_beams": 4,
      "prefix": "translate English to Romanian: "
    }
  },
  "transformers_version": "4.12.5",
  "use_cache": true,
  "vocab_size": 32128
}

loading weights file https://huggingface.co/t5-small/resolve/main/pytorch_model.bin from cache at C:\Users\Foka/.cache\huggingface\transformers\fee5a3a0ae379232608b6eed45d2d7a0d2966b9683728838412caccc41b4b0ed.ddacdc89ec88482db20c676f0861a336f3d0409f94748c209847b49529d73885
All model checkpoint weights were used when initializing T5ForConditionalGeneration.

All the weights of T5ForConditionalGeneration were initialized from the model checkpoint at t5-small.
If your task is similar to the task the model of the checkpoint was trained on, you can already use T5ForConditionalGeneration for predictions without further training.
02/17/2022 17:14:00 - INFO - __main__ - Using translation prefix: "tweet classification: "

Running tokenizer on dataset:   0%|          | 0/5 [00:00<?, ?ba/s]
Running tokenizer on dataset:  60%|######    | 3/5 [00:00<00:00, 28.92ba/s]
Running tokenizer on dataset: 100%|##########| 5/5 [00:00<00:00, 32.34ba/s]

Running tokenizer on dataset:   0%|          | 0/1 [00:00<?, ?ba/s]
Running tokenizer on dataset: 100%|##########| 1/1 [00:00<00:00, 66.84ba/s]

Running tokenizer on dataset:   0%|          | 0/1 [00:00<?, ?ba/s]
Running tokenizer on dataset: 100%|##########| 1/1 [00:00<00:00, 77.13ba/s]
02/17/2022 17:14:00 - INFO - __main__ - Sample 2469 of the training set: {'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'input_ids': [10657, 13774, 10, 62, 33, 1095, 385, 151, 7, 3, 2, 1095, 1024, 9632, 151, 1713, 9229, 324, 1713, 2138, 1713, 19699, 9229, 324, 1439, 2, 1], 'labels': [150, 5591, 1]}.
02/17/2022 17:14:00 - INFO - __main__ - Sample 3112 of the training set: {'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'input_ids': [10657, 13774, 10, 175, 3075, 56, 129, 25, 2787, 21, 8, 647, 1439, 2, 1713, 3470, 1713, 28984, 1713, 89, 76, 2693, 1713, 14814, 1], 'labels': [150, 5591, 1]}.
02/17/2022 17:14:00 - INFO - __main__ - Sample 1243 of the training set: {'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'input_ids': [10657, 13774, 10, 3320, 10041, 125, 31, 7, 8, 1750, 344, 3, 9, 528, 210, 11, 3, 9, 6871, 58, 3, 9, 6871, 744, 31, 17, 3, 7, 11763, 16, 8, 4836, 5, 10802, 7, 1713, 1924, 210, 1273, 1927, 1050, 1439, 2, 1], 'labels': [5591, 1]}.

Downloading:   0%|          | 0.00/2.37k [00:00<?, ?B/s]
Downloading: 5.67kB [00:00, 1.42MB/s]                   
02/17/2022 17:14:02 - INFO - __main__ - ***** Running training *****
02/17/2022 17:14:02 - INFO - __main__ -   Num examples = 4742
02/17/2022 17:14:02 - INFO - __main__ -   Num Epochs = 1
02/17/2022 17:14:02 - INFO - __main__ -   Instantaneous batch size per device = 16
02/17/2022 17:14:02 - INFO - __main__ -   Total train batch size (w. parallel, distributed & accumulation) = 16
02/17/2022 17:14:02 - INFO - __main__ -   Gradient Accumulation steps = 1
02/17/2022 17:14:02 - INFO - __main__ -   Total optimization steps = 297

  0%|          | 0/297 [00:00<?, ?it/s]
  0%|          | 1/297 [00:00<04:27,  1.11it/s]
  1%|          | 2/297 [00:01<04:16,  1.15it/s]
  1%|1         | 3/297 [00:02<04:22,  1.12it/s]
  1%|1         | 4/297 [00:03<04:21,  1.12it/s]
  2%|1         | 5/297 [00:04<04:51,  1.00it/s]
  2%|2         | 6/297 [00:05<04:41,  1.04it/s]
  2%|2         | 7/297 [00:06<04:35,  1.05it/s]
  3%|2         | 8/297 [00:07<04:21,  1.10it/s]
  3%|3         | 9/297 [00:08<04:21,  1.10it/s]
  3%|3         | 10/297 [00:09<04:20,  1.10it/s]
  4%|3         | 11/297 [00:10<04:14,  1.12it/s]
  4%|4         | 12/297 [00:11<04:19,  1.10it/s]
  4%|4         | 13/297 [00:11<04:15,  1.11it/s]
  5%|4         | 14/297 [00:12<04:11,  1.12it/s]
  5%|5         | 15/297 [00:13<04:07,  1.14it/s]
  5%|5         | 16/297 [00:14<04:14,  1.10it/s]
  6%|5         | 17/297 [00:15<04:07,  1.13it/s]
  6%|6         | 18/297 [00:16<04:10,  1.11it/s]
  6%|6         | 19/297 [00:17<04:22,  1.06it/s]
  7%|6         | 20/297 [00:18<04:17,  1.07it/s]
  7%|7         | 21/297 [00:19<04:19,  1.07it/s]
  7%|7         | 22/297 [00:20<04:12,  1.09it/s]
  8%|7         | 23/297 [00:20<04:03,  1.12it/s]
  8%|8         | 24/297 [00:21<04:01,  1.13it/s]
  8%|8         | 25/297 [00:22<04:02,  1.12it/s]
  9%|8         | 26/297 [00:23<04:05,  1.10it/s]
  9%|9         | 27/297 [00:24<04:01,  1.12it/s]
  9%|9         | 28/297 [00:25<03:52,  1.16it/s]
 10%|9         | 29/297 [00:26<03:49,  1.17it/s]
 10%|#         | 30/297 [00:27<03:52,  1.15it/s]
 10%|#         | 31/297 [00:27<03:43,  1.19it/s]
 11%|#         | 32/297 [00:28<03:35,  1.23it/s]
 11%|#1        | 33/297 [00:29<03:35,  1.23it/s]
 11%|#1        | 34/297 [00:30<03:30,  1.25it/s]
 12%|#1        | 35/297 [00:30<03:31,  1.24it/s]
 12%|#2        | 36/297 [00:31<03:35,  1.21it/s]
 12%|#2        | 37/297 [00:32<03:30,  1.23it/s]
 13%|#2        | 38/297 [00:33<03:44,  1.15it/s]
 13%|#3        | 39/297 [00:34<03:45,  1.14it/s]
 13%|#3        | 40/297 [00:35<04:07,  1.04it/s]
 14%|#3        | 41/297 [00:36<03:59,  1.07it/s]
 14%|#4        | 42/297 [00:37<03:53,  1.09it/s]
 14%|#4        | 43/297 [00:38<03:42,  1.14it/s]
 15%|#4        | 44/297 [00:39<03:35,  1.17it/s]
 15%|#5        | 45/297 [00:39<03:39,  1.15it/s]
 15%|#5        | 46/297 [00:40<03:43,  1.12it/s]
 16%|#5        | 47/297 [00:41<03:52,  1.08it/s]
 16%|#6        | 48/297 [00:42<03:50,  1.08it/s]
 16%|#6        | 49/297 [00:43<03:43,  1.11it/s]
 17%|#6        | 50/297 [00:44<03:41,  1.12it/s]
 17%|#7        | 51/297 [00:45<03:35,  1.14it/s]
 18%|#7        | 52/297 [00:46<03:42,  1.10it/s]
 18%|#7        | 53/297 [00:47<03:34,  1.14it/s]
 18%|#8        | 54/297 [00:47<03:26,  1.18it/s]
 19%|#8        | 55/297 [00:48<03:28,  1.16it/s]
 19%|#8        | 56/297 [00:49<03:24,  1.18it/s]
 19%|#9        | 57/297 [00:50<03:19,  1.20it/s]
 20%|#9        | 58/297 [00:51<03:19,  1.20it/s]
 20%|#9        | 59/297 [00:52<03:21,  1.18it/s]
 20%|##        | 60/297 [00:53<03:26,  1.15it/s]
 21%|##        | 61/297 [00:53<03:24,  1.16it/s]
 21%|##        | 62/297 [00:55<03:40,  1.06it/s]
 21%|##1       | 63/297 [00:55<03:38,  1.07it/s]
 22%|##1       | 64/297 [00:56<03:30,  1.11it/s]
 22%|##1       | 65/297 [00:57<03:30,  1.10it/s]
 22%|##2       | 66/297 [00:58<03:22,  1.14it/s]
 23%|##2       | 67/297 [00:59<03:25,  1.12it/s]
 23%|##2       | 68/297 [01:00<03:25,  1.12it/s]
 23%|##3       | 69/297 [01:01<03:23,  1.12it/s]
 24%|##3       | 70/297 [01:02<03:17,  1.15it/s]
 24%|##3       | 71/297 [01:02<03:13,  1.17it/s]
 24%|##4       | 72/297 [01:03<03:10,  1.18it/s]
 25%|##4       | 73/297 [01:04<03:14,  1.15it/s]
 25%|##4       | 74/297 [01:05<03:25,  1.08it/s]
 25%|##5       | 75/297 [01:06<03:20,  1.11it/s]
 26%|##5       | 76/297 [01:07<03:18,  1.12it/s]
 26%|##5       | 77/297 [01:08<03:13,  1.14it/s]
 26%|##6       | 78/297 [01:08<03:04,  1.18it/s]
 27%|##6       | 79/297 [01:09<03:12,  1.13it/s]
 27%|##6       | 80/297 [01:10<03:12,  1.12it/s]
 27%|##7       | 81/297 [01:11<03:09,  1.14it/s]
 28%|##7       | 82/297 [01:12<03:08,  1.14it/s]
 28%|##7       | 83/297 [01:13<03:10,  1.12it/s]
 28%|##8       | 84/297 [01:14<03:13,  1.10it/s]
 29%|##8       | 85/297 [01:15<03:09,  1.12it/s]
 29%|##8       | 86/297 [01:16<03:06,  1.13it/s]
 29%|##9       | 87/297 [01:17<03:05,  1.13it/s]
 30%|##9       | 88/297 [01:17<03:06,  1.12it/s]
 30%|##9       | 89/297 [01:18<03:03,  1.14it/s]
 30%|###       | 90/297 [01:19<03:04,  1.12it/s]
 31%|###       | 91/297 [01:20<03:06,  1.10it/s]
 31%|###       | 92/297 [01:21<03:06,  1.10it/s]
 31%|###1      | 93/297 [01:22<03:08,  1.08it/s]
 32%|###1      | 94/297 [01:23<03:15,  1.04it/s]
 32%|###1      | 95/297 [01:24<03:10,  1.06it/s]
 32%|###2      | 96/297 [01:25<03:05,  1.08it/s]
 33%|###2      | 97/297 [01:26<03:01,  1.10it/s]
 33%|###2      | 98/297 [01:27<03:07,  1.06it/s]
 33%|###3      | 99/297 [01:28<03:02,  1.09it/s]
 34%|###3      | 100/297 [01:29<02:59,  1.10it/s]
 34%|###4      | 101/297 [01:29<02:59,  1.09it/s]
 34%|###4      | 102/297 [01:30<02:56,  1.11it/s]
 35%|###4      | 103/297 [01:31<02:58,  1.09it/s]
 35%|###5      | 104/297 [01:32<02:58,  1.08it/s]
 35%|###5      | 105/297 [01:33<02:56,  1.09it/s]
 36%|###5      | 106/297 [01:34<02:53,  1.10it/s]
 36%|###6      | 107/297 [01:35<02:55,  1.08it/s]
 36%|###6      | 108/297 [01:36<02:51,  1.10it/s]
 37%|###6      | 109/297 [01:37<02:51,  1.09it/s]
 37%|###7      | 110/297 [01:38<02:54,  1.07it/s]
 37%|###7      | 111/297 [01:39<02:51,  1.09it/s]
 38%|###7      | 112/297 [01:40<02:49,  1.09it/s]
 38%|###8      | 113/297 [01:40<02:46,  1.10it/s]
 38%|###8      | 114/297 [01:41<02:43,  1.12it/s]
 39%|###8      | 115/297 [01:42<02:42,  1.12it/s]
 39%|###9      | 116/297 [01:43<02:38,  1.14it/s]
 39%|###9      | 117/297 [01:44<02:39,  1.13it/s]
 40%|###9      | 118/297 [01:45<02:44,  1.08it/s]
 40%|####      | 119/297 [01:46<02:41,  1.10it/s]
 40%|####      | 120/297 [01:47<02:38,  1.12it/s]
 41%|####      | 121/297 [01:48<02:44,  1.07it/s]
 41%|####1     | 122/297 [01:49<02:40,  1.09it/s]
 41%|####1     | 123/297 [01:49<02:36,  1.11it/s]
 42%|####1     | 124/297 [01:50<02:36,  1.11it/s]
 42%|####2     | 125/297 [01:51<02:35,  1.11it/s]
 42%|####2     | 126/297 [01:52<02:35,  1.10it/s]
 43%|####2     | 127/297 [01:53<02:34,  1.10it/s]
 43%|####3     | 128/297 [01:54<02:31,  1.12it/s]
 43%|####3     | 129/297 [01:55<02:35,  1.08it/s]
 44%|####3     | 130/297 [01:56<02:29,  1.12it/s]
 44%|####4     | 131/297 [01:57<02:28,  1.12it/s]
 44%|####4     | 132/297 [01:58<02:28,  1.11it/s]
 45%|####4     | 133/297 [01:58<02:26,  1.12it/s]
 45%|####5     | 134/297 [01:59<02:27,  1.11it/s]
 45%|####5     | 135/297 [02:00<02:27,  1.10it/s]
 46%|####5     | 136/297 [02:01<02:25,  1.10it/s]
 46%|####6     | 137/297 [02:02<02:26,  1.09it/s]
 46%|####6     | 138/297 [02:03<02:22,  1.11it/s]
 47%|####6     | 139/297 [02:04<02:21,  1.11it/s]
 47%|####7     | 140/297 [02:05<02:21,  1.11it/s]
 47%|####7     | 141/297 [02:06<02:23,  1.09it/s]
 48%|####7     | 142/297 [02:07<02:18,  1.12it/s]
 48%|####8     | 143/297 [02:07<02:17,  1.12it/s]
 48%|####8     | 144/297 [02:08<02:14,  1.14it/s]
 49%|####8     | 145/297 [02:09<02:14,  1.13it/s]
 49%|####9     | 146/297 [02:10<02:11,  1.15it/s]
 49%|####9     | 147/297 [02:11<02:12,  1.13it/s]
 50%|####9     | 148/297 [02:12<02:10,  1.14it/s]
 50%|#####     | 149/297 [02:13<02:08,  1.15it/s]
 51%|#####     | 150/297 [02:14<02:13,  1.10it/s]
 51%|#####     | 151/297 [02:15<02:10,  1.12it/s]
 51%|#####1    | 152/297 [02:15<02:11,  1.11it/s]
 52%|#####1    | 153/297 [02:16<02:08,  1.12it/s]
 52%|#####1    | 154/297 [02:17<02:08,  1.11it/s]
 52%|#####2    | 155/297 [02:18<02:08,  1.11it/s]
 53%|#####2    | 156/297 [02:19<02:09,  1.09it/s]
 53%|#####2    | 157/297 [02:20<02:07,  1.10it/s]
 53%|#####3    | 158/297 [02:21<02:04,  1.12it/s]
 54%|#####3    | 159/297 [02:22<02:02,  1.13it/s]
 54%|#####3    | 160/297 [02:23<02:01,  1.13it/s]
 54%|#####4    | 161/297 [02:23<01:56,  1.17it/s]
 55%|#####4    | 162/297 [02:24<01:56,  1.16it/s]
 55%|#####4    | 163/297 [02:25<01:59,  1.12it/s]
 55%|#####5    | 164/297 [02:26<01:58,  1.12it/s]
 56%|#####5    | 165/297 [02:27<01:55,  1.14it/s]
 56%|#####5    | 166/297 [02:28<01:55,  1.14it/s]
 56%|#####6    | 167/297 [02:29<01:54,  1.14it/s]
 57%|#####6    | 168/297 [02:30<01:54,  1.12it/s]
 57%|#####6    | 169/297 [02:31<02:02,  1.05it/s]
 57%|#####7    | 170/297 [02:32<01:59,  1.06it/s]
 58%|#####7    | 171/297 [02:33<01:57,  1.07it/s]
 58%|#####7    | 172/297 [02:34<02:17,  1.10s/it]
 58%|#####8    | 173/297 [02:35<02:11,  1.06s/it]
 59%|#####8    | 174/297 [02:36<02:05,  1.02s/it]
 59%|#####8    | 175/297 [02:37<01:58,  1.03it/s]
 59%|#####9    | 176/297 [02:38<01:56,  1.04it/s]
 60%|#####9    | 177/297 [02:39<01:57,  1.02it/s]
 60%|#####9    | 178/297 [02:40<01:56,  1.02it/s]
 60%|######    | 179/297 [02:41<01:53,  1.04it/s]
 61%|######    | 180/297 [02:42<01:49,  1.07it/s]
 61%|######    | 181/297 [02:42<01:47,  1.08it/s]
 61%|######1   | 182/297 [02:43<01:45,  1.09it/s]
 62%|######1   | 183/297 [02:44<01:43,  1.10it/s]
 62%|######1   | 184/297 [02:45<01:43,  1.09it/s]
 62%|######2   | 185/297 [02:46<01:45,  1.06it/s]
 63%|######2   | 186/297 [02:47<01:43,  1.08it/s]
 63%|######2   | 187/297 [02:48<01:45,  1.05it/s]
 63%|######3   | 188/297 [02:49<01:40,  1.09it/s]
 64%|######3   | 189/297 [02:50<01:37,  1.11it/s]
 64%|######3   | 190/297 [02:51<01:35,  1.11it/s]
 64%|######4   | 191/297 [02:52<01:36,  1.10it/s]
 65%|######4   | 192/297 [02:53<01:35,  1.10it/s]
 65%|######4   | 193/297 [02:54<01:37,  1.06it/s]
 65%|######5   | 194/297 [02:54<01:35,  1.07it/s]
 66%|######5   | 195/297 [02:55<01:37,  1.05it/s]
 66%|######5   | 196/297 [02:56<01:32,  1.10it/s]
 66%|######6   | 197/297 [02:57<01:29,  1.12it/s]
 67%|######6   | 198/297 [02:58<01:41,  1.02s/it]
 67%|######7   | 199/297 [03:00<01:41,  1.03s/it]
 67%|######7   | 200/297 [03:00<01:37,  1.01s/it]
 68%|######7   | 201/297 [03:01<01:32,  1.04it/s]
 68%|######8   | 202/297 [03:02<01:27,  1.09it/s]
 68%|######8   | 203/297 [03:03<01:24,  1.12it/s]
 69%|######8   | 204/297 [03:04<01:23,  1.11it/s]
 69%|######9   | 205/297 [03:05<01:26,  1.06it/s]
 69%|######9   | 206/297 [03:06<01:28,  1.03it/s]
 70%|######9   | 207/297 [03:07<01:26,  1.05it/s]
 70%|#######   | 208/297 [03:08<01:26,  1.02it/s]
 70%|#######   | 209/297 [03:09<01:22,  1.07it/s]
 71%|#######   | 210/297 [03:10<01:21,  1.06it/s]
 71%|#######1  | 211/297 [03:11<01:19,  1.08it/s]
 71%|#######1  | 212/297 [03:12<01:18,  1.09it/s]
 72%|#######1  | 213/297 [03:12<01:17,  1.08it/s]
 72%|#######2  | 214/297 [03:13<01:17,  1.07it/s]
 72%|#######2  | 215/297 [03:14<01:15,  1.08it/s]
 73%|#######2  | 216/297 [03:15<01:14,  1.08it/s]
 73%|#######3  | 217/297 [03:16<01:12,  1.10it/s]
 73%|#######3  | 218/297 [03:17<01:10,  1.12it/s]
 74%|#######3  | 219/297 [03:18<01:10,  1.11it/s]
 74%|#######4  | 220/297 [03:19<01:09,  1.11it/s]
 74%|#######4  | 221/297 [03:20<01:08,  1.12it/s]
 75%|#######4  | 222/297 [03:20<01:05,  1.14it/s]
 75%|#######5  | 223/297 [03:21<01:05,  1.13it/s]
 75%|#######5  | 224/297 [03:22<01:05,  1.11it/s]
 76%|#######5  | 225/297 [03:23<01:07,  1.06it/s]
 76%|#######6  | 226/297 [03:24<01:04,  1.09it/s]
 76%|#######6  | 227/297 [03:25<01:03,  1.10it/s]
 77%|#######6  | 228/297 [03:26<01:01,  1.11it/s]
 77%|#######7  | 229/297 [03:27<01:01,  1.10it/s]
 77%|#######7  | 230/297 [03:28<01:01,  1.09it/s]
 78%|#######7  | 231/297 [03:29<00:58,  1.13it/s]
 78%|#######8  | 232/297 [03:29<00:55,  1.18it/s]
 78%|#######8  | 233/297 [03:30<00:54,  1.18it/s]
 79%|#######8  | 234/297 [03:31<00:55,  1.14it/s]
 79%|#######9  | 235/297 [03:32<00:55,  1.12it/s]
 79%|#######9  | 236/297 [03:33<00:55,  1.10it/s]
 80%|#######9  | 237/297 [03:34<00:54,  1.11it/s]
 80%|########  | 238/297 [03:35<00:54,  1.08it/s]
 80%|########  | 239/297 [03:36<00:54,  1.06it/s]
 81%|########  | 240/297 [03:37<00:52,  1.09it/s]
 81%|########1 | 241/297 [03:38<00:51,  1.08it/s]
 81%|########1 | 242/297 [03:39<00:49,  1.10it/s]
 82%|########1 | 243/297 [03:39<00:47,  1.13it/s]
 82%|########2 | 244/297 [03:40<00:46,  1.14it/s]
 82%|########2 | 245/297 [03:41<00:46,  1.12it/s]
 83%|########2 | 246/297 [03:42<00:45,  1.11it/s]
 83%|########3 | 247/297 [03:43<00:44,  1.11it/s]
 84%|########3 | 248/297 [03:44<00:48,  1.00it/s]
 84%|########3 | 249/297 [03:45<00:47,  1.01it/s]
 84%|########4 | 250/297 [03:46<00:45,  1.03it/s]
 85%|########4 | 251/297 [03:47<00:44,  1.03it/s]
 85%|########4 | 252/297 [03:48<00:42,  1.05it/s]
 85%|########5 | 253/297 [03:49<00:41,  1.07it/s]
 86%|########5 | 254/297 [03:50<00:40,  1.06it/s]
 86%|########5 | 255/297 [03:51<00:40,  1.04it/s]
 86%|########6 | 256/297 [03:52<00:40,  1.01it/s]
 87%|########6 | 257/297 [03:53<00:38,  1.05it/s]
 87%|########6 | 258/297 [03:54<00:36,  1.06it/s]
 87%|########7 | 259/297 [03:55<00:35,  1.08it/s]
 88%|########7 | 260/297 [03:56<00:34,  1.07it/s]
 88%|########7 | 261/297 [03:57<00:33,  1.08it/s]
 88%|########8 | 262/297 [03:57<00:32,  1.06it/s]
 89%|########8 | 263/297 [03:58<00:31,  1.08it/s]
 89%|########8 | 264/297 [03:59<00:31,  1.05it/s]
 89%|########9 | 265/297 [04:00<00:30,  1.04it/s]
 90%|########9 | 266/297 [04:01<00:30,  1.01it/s]
 90%|########9 | 267/297 [04:02<00:28,  1.06it/s]
 90%|######### | 268/297 [04:03<00:26,  1.08it/s]
 91%|######### | 269/297 [04:04<00:26,  1.04it/s]
 91%|######### | 270/297 [04:05<00:25,  1.04it/s]
 91%|#########1| 271/297 [04:06<00:23,  1.09it/s]
 92%|#########1| 272/297 [04:07<00:23,  1.07it/s]
 92%|#########1| 273/297 [04:08<00:22,  1.08it/s]
 92%|#########2| 274/297 [04:09<00:21,  1.08it/s]
 93%|#########2| 275/297 [04:10<00:20,  1.08it/s]
 93%|#########2| 276/297 [04:11<00:19,  1.10it/s]
 93%|#########3| 277/297 [04:12<00:18,  1.06it/s]
 94%|#########3| 278/297 [04:12<00:17,  1.07it/s]
 94%|#########3| 279/297 [04:13<00:16,  1.07it/s]
 94%|#########4| 280/297 [04:14<00:16,  1.06it/s]
 95%|#########4| 281/297 [04:15<00:14,  1.07it/s]
 95%|#########4| 282/297 [04:16<00:13,  1.11it/s]
 95%|#########5| 283/297 [04:17<00:12,  1.09it/s]
 96%|#########5| 284/297 [04:18<00:13,  1.00s/it]
 96%|#########5| 285/297 [04:19<00:11,  1.01it/s]
 96%|#########6| 286/297 [04:20<00:10,  1.04it/s]
 97%|#########6| 287/297 [04:21<00:09,  1.08it/s]
 97%|#########6| 288/297 [04:22<00:08,  1.10it/s]
 97%|#########7| 289/297 [04:23<00:07,  1.08it/s]
 98%|#########7| 290/297 [04:24<00:06,  1.11it/s]
 98%|#########7| 291/297 [04:25<00:05,  1.09it/s]
 98%|#########8| 292/297 [04:26<00:04,  1.07it/s]
 99%|#########8| 293/297 [04:26<00:03,  1.10it/s]
 99%|#########8| 294/297 [04:27<00:02,  1.12it/s]
 99%|#########9| 295/297 [04:28<00:01,  1.07it/s]
100%|#########9| 296/297 [04:29<00:00,  1.03it/s]
100%|##########| 297/297 [04:30<00:00,  1.19it/s]
02/17/2022 17:18:41 - INFO - __main__ - Validation-set | bleu: 0.0 | accuracy: 1.0
02/17/2022 17:18:49 - INFO - __main__ - Test-set | bleu: 0.0 | accuracy: 1.0
Configuration saved in out/tweet/t5\config.json
Model weights saved in out/tweet/t5\pytorch_model.bin
tokenizer config file saved in out/tweet/t5\tokenizer_config.json
Special tokens file saved in out/tweet/t5\special_tokens_map.json
Copy vocab file to out/tweet/t5\spiece.model

100%|##########| 297/297 [04:46<00:00,  1.04it/s]
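
The "Sample ... of the training set" lines above only show raw token ids; decoding them with the same t5-small tokenizer makes it easy to see what the model actually receives. A quick sketch (the ids are copied from the first logged sample):

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("t5-small")

# input_ids / labels copied from the "Sample 2469 of the training set" log line above
input_ids = [10657, 13774, 10, 62, 33, 1095, 385, 151, 7, 3, 2, 1095, 1024, 9632, 151,
             1713, 9229, 324, 1713, 2138, 1713, 19699, 9229, 324, 1439, 2, 1]
labels = [150, 5591, 1]

print(tokenizer.decode(input_ids))                         # source text incl. the task prefix
print(tokenizer.decode(labels, skip_special_tokens=True))  # verbalized target label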

T5 version 2
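
The only change compared to the previous run is the --freeze_encoder flag. The flag comes from the locally adapted run_translation_no_trainer.py, so its exact implementation is not shown here; a rough sketch of what freezing the encoder typically means:

import torch
from transformers import T5ForConditionalGeneration

model = T5ForConditionalGeneration.from_pretrained("t5-small")

# Keep the encoder weights fixed so that only the decoder (and LM head) is updated.
for param in model.encoder.parameters():
    param.requires_grad = False

# Build the optimizer over the remaining trainable parameters only.
optimizer = torch.optim.AdamW(
    (p for p in model.parameters() if p.requires_grad),
    lr=5e-5,  # assumed: the run above relies on the script's default learning rate
)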

!python run_translation_no_trainer.py \
  --model_name_or_path t5-small \
  --train_file data/translations-train.json \
  --validation_file data/translations-valid.json \
  --test_file data/translations-test.json \
  --per_device_train_batch_size 16 \
  --per_device_eval_batch_size 16 \
  --source_prefix "tweet classification" \
  --max_source_length 256 \
  --max_target_length 128 \
  --max_length 128 \
  --num_train_epochs 1 \
  --freeze_encoder \
  --output_dir out/tweet/t5_version_2
02/17/2022 17:23:00 - INFO - __main__ - Distributed environment: NO
Num processes: 1
Process index: 0
Local process index: 0
Device: cpu
Use FP16 precision: False

02/17/2022 17:23:00 - WARNING - datasets.builder - Using custom data configuration default-c1907d9305fb2fbb
02/17/2022 17:23:00 - WARNING - datasets.builder - Reusing dataset json (C:\Users\Foka\.cache\huggingface\datasets\json\default-c1907d9305fb2fbb\0.0.0\c2d554c3377ea79c7664b93dc65d0803b45e3279000f993c7bfd18937fd7f426)

  0%|          | 0/3 [00:00<?, ?it/s]
100%|##########| 3/3 [00:00<00:00, 1504.41it/s]
loading configuration file https://huggingface.co/t5-small/resolve/main/config.json from cache at C:\Users\Foka/.cache\huggingface\transformers\fe501e8fd6425b8ec93df37767fcce78ce626e34cc5edc859c662350cf712e41.406701565c0afd9899544c1cb8b93185a76f00b31e5ce7f6e18bbaef02241985
Model config T5Config {
  "architectures": [
    "T5WithLMHeadModel"
  ],
  "d_ff": 2048,
  "d_kv": 64,
  "d_model": 512,
  "decoder_start_token_id": 0,
  "dropout_rate": 0.1,
  "eos_token_id": 1,
  "feed_forward_proj": "relu",
  "initializer_factor": 1.0,
  "is_encoder_decoder": true,
  "layer_norm_epsilon": 1e-06,
  "model_type": "t5",
  "n_positions": 512,
  "num_decoder_layers": 6,
  "num_heads": 8,
  "num_layers": 6,
  "output_past": true,
  "pad_token_id": 0,
  "relative_attention_num_buckets": 32,
  "task_specific_params": {
    "summarization": {
      "early_stopping": true,
      "length_penalty": 2.0,
      "max_length": 200,
      "min_length": 30,
      "no_repeat_ngram_size": 3,
      "num_beams": 4,
      "prefix": "summarize: "
    },
    "translation_en_to_de": {
      "early_stopping": true,
      "max_length": 300,
      "num_beams": 4,
      "prefix": "translate English to German: "
    },
    "translation_en_to_fr": {
      "early_stopping": true,
      "max_length": 300,
      "num_beams": 4,
      "prefix": "translate English to French: "
    },
    "translation_en_to_ro": {
      "early_stopping": true,
      "max_length": 300,
      "num_beams": 4,
      "prefix": "translate English to Romanian: "
    }
  },
  "transformers_version": "4.12.5",
  "use_cache": true,
  "vocab_size": 32128
}

Could not locate the tokenizer configuration file, will try to use the model config instead.
loading configuration file https://huggingface.co/t5-small/resolve/main/config.json from cache at C:\Users\Foka/.cache\huggingface\transformers\fe501e8fd6425b8ec93df37767fcce78ce626e34cc5edc859c662350cf712e41.406701565c0afd9899544c1cb8b93185a76f00b31e5ce7f6e18bbaef02241985
Model config T5Config {
  "architectures": [
    "T5WithLMHeadModel"
  ],
  "d_ff": 2048,
  "d_kv": 64,
  "d_model": 512,
  "decoder_start_token_id": 0,
  "dropout_rate": 0.1,
  "eos_token_id": 1,
  "feed_forward_proj": "relu",
  "initializer_factor": 1.0,
  "is_encoder_decoder": true,
  "layer_norm_epsilon": 1e-06,
  "model_type": "t5",
  "n_positions": 512,
  "num_decoder_layers": 6,
  "num_heads": 8,
  "num_layers": 6,
  "output_past": true,
  "pad_token_id": 0,
  "relative_attention_num_buckets": 32,
  "task_specific_params": {
    "summarization": {
      "early_stopping": true,
      "length_penalty": 2.0,
      "max_length": 200,
      "min_length": 30,
      "no_repeat_ngram_size": 3,
      "num_beams": 4,
      "prefix": "summarize: "
    },
    "translation_en_to_de": {
      "early_stopping": true,
      "max_length": 300,
      "num_beams": 4,
      "prefix": "translate English to German: "
    },
    "translation_en_to_fr": {
      "early_stopping": true,
      "max_length": 300,
      "num_beams": 4,
      "prefix": "translate English to French: "
    },
    "translation_en_to_ro": {
      "early_stopping": true,
      "max_length": 300,
      "num_beams": 4,
      "prefix": "translate English to Romanian: "
    }
  },
  "transformers_version": "4.12.5",
  "use_cache": true,
  "vocab_size": 32128
}

loading file https://huggingface.co/t5-small/resolve/main/spiece.model from cache at C:\Users\Foka/.cache\huggingface\transformers\65fc04e21f45f61430aea0c4fedffac16a4d20d78b8e6601d8d996ebefefecd2.3b69006860e7b5d0a63ffdddc01ddcd6b7c318a6f4fd793596552c741734c62d
loading file https://huggingface.co/t5-small/resolve/main/tokenizer.json from cache at C:\Users\Foka/.cache\huggingface\transformers\06779097c78e12f47ef67ecb728810c2ae757ee0a9efe9390c6419783d99382d.8627f1bd5d270a9fd2e5a51c8bec3223896587cc3cfe13edeabb0992ab43c529
loading file https://huggingface.co/t5-small/resolve/main/added_tokens.json from cache at None
loading file https://huggingface.co/t5-small/resolve/main/special_tokens_map.json from cache at None
loading file https://huggingface.co/t5-small/resolve/main/tokenizer_config.json from cache at None
loading configuration file https://huggingface.co/t5-small/resolve/main/config.json from cache at C:\Users\Foka/.cache\huggingface\transformers\fe501e8fd6425b8ec93df37767fcce78ce626e34cc5edc859c662350cf712e41.406701565c0afd9899544c1cb8b93185a76f00b31e5ce7f6e18bbaef02241985
Model config T5Config {
  "architectures": [
    "T5WithLMHeadModel"
  ],
  "d_ff": 2048,
  "d_kv": 64,
  "d_model": 512,
  "decoder_start_token_id": 0,
  "dropout_rate": 0.1,
  "eos_token_id": 1,
  "feed_forward_proj": "relu",
  "initializer_factor": 1.0,
  "is_encoder_decoder": true,
  "layer_norm_epsilon": 1e-06,
  "model_type": "t5",
  "n_positions": 512,
  "num_decoder_layers": 6,
  "num_heads": 8,
  "num_layers": 6,
  "output_past": true,
  "pad_token_id": 0,
  "relative_attention_num_buckets": 32,
  "task_specific_params": {
    "summarization": {
      "early_stopping": true,
      "length_penalty": 2.0,
      "max_length": 200,
      "min_length": 30,
      "no_repeat_ngram_size": 3,
      "num_beams": 4,
      "prefix": "summarize: "
    },
    "translation_en_to_de": {
      "early_stopping": true,
      "max_length": 300,
      "num_beams": 4,
      "prefix": "translate English to German: "
    },
    "translation_en_to_fr": {
      "early_stopping": true,
      "max_length": 300,
      "num_beams": 4,
      "prefix": "translate English to French: "
    },
    "translation_en_to_ro": {
      "early_stopping": true,
      "max_length": 300,
      "num_beams": 4,
      "prefix": "translate English to Romanian: "
    }
  },
  "transformers_version": "4.12.5",
  "use_cache": true,
  "vocab_size": 32128
}

loading weights file https://huggingface.co/t5-small/resolve/main/pytorch_model.bin from cache at C:\Users\Foka/.cache\huggingface\transformers\fee5a3a0ae379232608b6eed45d2d7a0d2966b9683728838412caccc41b4b0ed.ddacdc89ec88482db20c676f0861a336f3d0409f94748c209847b49529d73885
All model checkpoint weights were used when initializing T5ForConditionalGeneration.

All the weights of T5ForConditionalGeneration were initialized from the model checkpoint at t5-small.
If your task is similar to the task the model of the checkpoint was trained on, you can already use T5ForConditionalGeneration for predictions without further training.
02/17/2022 17:23:07 - INFO - __main__ - Freezing model weights
02/17/2022 17:23:07 - INFO - __main__ - Using translation prefix: "tweet classification: "

Running tokenizer on dataset:   0%|          | 0/5 [00:00<?, ?ba/s]
Running tokenizer on dataset:  80%|########  | 4/5 [00:00<00:00, 31.58ba/s]
Running tokenizer on dataset: 100%|##########| 5/5 [00:00<00:00, 33.64ba/s]

Running tokenizer on dataset:   0%|          | 0/1 [00:00<?, ?ba/s]
Running tokenizer on dataset: 100%|##########| 1/1 [00:00<00:00, 66.85ba/s]

Running tokenizer on dataset:   0%|          | 0/1 [00:00<?, ?ba/s]
Running tokenizer on dataset: 100%|##########| 1/1 [00:00<00:00, 77.13ba/s]
02/17/2022 17:23:07 - INFO - __main__ - Sample 4497 of the training set: {'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'input_ids': [10657, 13774, 10, 34, 31, 7, 16713, 239, 3158, 3, 2, 1], 'labels': [150, 5591, 1]}.
02/17/2022 17:23:07 - INFO - __main__ - Sample 697 of the training set: {'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'input_ids': [10657, 13774, 10, 3320, 10041, 3, 6631, 7, 55, 3, 23, 410, 34, 541, 55, 3, 19293, 430, 18659, 2983, 89, 16948, 55, 1713, 7, 9, 26, 1713, 7, 127, 15, 2298, 49, 3, 24778, 1713, 1788, 6938, 2910, 29, 53, 1], 'labels': [5591, 1]}.
02/17/2022 17:23:07 - INFO - __main__ - Sample 3411 of the training set: {'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'input_ids': [10657, 13774, 10, 8441, 352, 12, 217, 3320, 10041, 16, 20, 75, 3, 10266, 55, 1], 'labels': [150, 5591, 1]}.
02/17/2022 17:23:09 - INFO - __main__ - ***** Running training *****
02/17/2022 17:23:09 - INFO - __main__ -   Num examples = 4742
02/17/2022 17:23:09 - INFO - __main__ -   Num Epochs = 1
02/17/2022 17:23:09 - INFO - __main__ -   Instantaneous batch size per device = 16
02/17/2022 17:23:09 - INFO - __main__ -   Total train batch size (w. parallel, distributed & accumulation) = 16
02/17/2022 17:23:09 - INFO - __main__ -   Gradient Accumulation steps = 1
02/17/2022 17:23:09 - INFO - __main__ -   Total optimization steps = 297

  0%|          | 0/297 [00:00<?, ?it/s]
  0%|          | 1/297 [00:00<02:34,  1.92it/s]
  1%|          | 2/297 [00:00<02:08,  2.29it/s]
  1%|1         | 3/297 [00:01<01:58,  2.47it/s]
  1%|1         | 4/297 [00:01<01:53,  2.58it/s]
  2%|1         | 5/297 [00:02<01:59,  2.45it/s]
  2%|2         | 6/297 [00:02<02:02,  2.37it/s]
  2%|2         | 7/297 [00:02<01:58,  2.46it/s]
  3%|2         | 8/297 [00:03<01:53,  2.55it/s]
  3%|3         | 9/297 [00:03<02:14,  2.14it/s]
  3%|3         | 10/297 [00:04<02:12,  2.17it/s]
  4%|3         | 11/297 [00:04<02:05,  2.28it/s]
  4%|4         | 12/297 [00:05<02:01,  2.34it/s]
  4%|4         | 13/297 [00:05<02:00,  2.36it/s]
  5%|4         | 14/297 [00:05<01:57,  2.42it/s]
  5%|5         | 15/297 [00:06<01:54,  2.45it/s]
  5%|5         | 16/297 [00:06<01:50,  2.54it/s]
  6%|5         | 17/297 [00:07<01:49,  2.57it/s]
  6%|6         | 18/297 [00:07<01:48,  2.58it/s]
  6%|6         | 19/297 [00:07<01:50,  2.53it/s]
  7%|6         | 20/297 [00:08<01:52,  2.46it/s]
  7%|7         | 21/297 [00:08<01:49,  2.53it/s]
  7%|7         | 22/297 [00:09<01:52,  2.45it/s]
  8%|7         | 23/297 [00:09<01:48,  2.52it/s]
  8%|8         | 24/297 [00:09<01:47,  2.55it/s]
  8%|8         | 25/297 [00:10<01:48,  2.51it/s]
  9%|8         | 26/297 [00:10<01:45,  2.56it/s]
  9%|9         | 27/297 [00:11<01:45,  2.56it/s]
  9%|9         | 28/297 [00:11<01:45,  2.55it/s]
 10%|9         | 29/297 [00:11<01:45,  2.54it/s]
 10%|#         | 30/297 [00:12<01:41,  2.62it/s]
 10%|#         | 31/297 [00:12<01:42,  2.59it/s]
 11%|#         | 32/297 [00:12<01:39,  2.66it/s]
 11%|#1        | 33/297 [00:13<01:39,  2.64it/s]
 11%|#1        | 34/297 [00:13<01:38,  2.67it/s]
 12%|#1        | 35/297 [00:14<01:38,  2.67it/s]
 12%|#2        | 36/297 [00:14<01:40,  2.60it/s]
 12%|#2        | 37/297 [00:14<01:45,  2.46it/s]
 13%|#2        | 38/297 [00:15<01:44,  2.48it/s]
 13%|#3        | 39/297 [00:15<01:42,  2.52it/s]
 13%|#3        | 40/297 [00:16<01:41,  2.53it/s]
 14%|#3        | 41/297 [00:16<01:42,  2.50it/s]
 14%|#4        | 42/297 [00:16<01:37,  2.62it/s]
 14%|#4        | 43/297 [00:17<01:38,  2.59it/s]
 15%|#4        | 44/297 [00:17<01:40,  2.52it/s]
 15%|#5        | 45/297 [00:18<01:37,  2.59it/s]
 15%|#5        | 46/297 [00:18<01:37,  2.59it/s]
 16%|#5        | 47/297 [00:18<01:37,  2.56it/s]
 16%|#6        | 48/297 [00:19<01:36,  2.57it/s]
 16%|#6        | 49/297 [00:19<01:36,  2.58it/s]
 17%|#6        | 50/297 [00:19<01:35,  2.59it/s]
 17%|#7        | 51/297 [00:20<01:35,  2.56it/s]
 18%|#7        | 52/297 [00:20<01:34,  2.58it/s]
 18%|#7        | 53/297 [00:21<01:34,  2.58it/s]
 18%|#8        | 54/297 [00:21<01:33,  2.60it/s]
 19%|#8        | 55/297 [00:21<01:34,  2.55it/s]
 19%|#8        | 56/297 [00:22<01:34,  2.54it/s]
 19%|#9        | 57/297 [00:22<01:36,  2.50it/s]
 20%|#9        | 58/297 [00:23<01:35,  2.52it/s]
 20%|#9        | 59/297 [00:23<01:33,  2.55it/s]
 20%|##        | 60/297 [00:23<01:31,  2.58it/s]
 21%|##        | 61/297 [00:24<01:36,  2.46it/s]
 21%|##        | 62/297 [00:24<01:32,  2.53it/s]
 21%|##1       | 63/297 [00:25<01:35,  2.45it/s]
 22%|##1       | 64/297 [00:25<01:32,  2.53it/s]
 22%|##1       | 65/297 [00:25<01:32,  2.51it/s]
 22%|##2       | 66/297 [00:26<01:30,  2.54it/s]
 23%|##2       | 67/297 [00:26<01:30,  2.54it/s]
 23%|##2       | 68/297 [00:27<01:36,  2.38it/s]
 23%|##3       | 69/297 [00:27<01:32,  2.46it/s]
 24%|##3       | 70/297 [00:27<01:33,  2.43it/s]
 24%|##3       | 71/297 [00:28<01:29,  2.52it/s]
 24%|##4       | 72/297 [00:28<01:29,  2.52it/s]
 25%|##4       | 73/297 [00:29<01:29,  2.49it/s]
 25%|##4       | 74/297 [00:29<01:31,  2.43it/s]
 25%|##5       | 75/297 [00:29<01:32,  2.39it/s]
 26%|##5       | 76/297 [00:30<01:31,  2.42it/s]
 26%|##5       | 77/297 [00:30<01:31,  2.40it/s]
 26%|##6       | 78/297 [00:31<01:29,  2.45it/s]
 27%|##6       | 79/297 [00:31<01:27,  2.48it/s]
 27%|##6       | 80/297 [00:31<01:26,  2.51it/s]
 27%|##7       | 81/297 [00:32<01:25,  2.53it/s]
 28%|##7       | 82/297 [00:32<01:26,  2.48it/s]
 28%|##7       | 83/297 [00:33<01:26,  2.47it/s]
 28%|##8       | 84/297 [00:33<01:29,  2.38it/s]
 29%|##8       | 85/297 [00:34<01:25,  2.49it/s]
 29%|##8       | 86/297 [00:34<01:23,  2.53it/s]
 29%|##9       | 87/297 [00:34<01:25,  2.46it/s]
 30%|##9       | 88/297 [00:35<01:27,  2.40it/s]
 30%|##9       | 89/297 [00:35<01:28,  2.35it/s]
 30%|###       | 90/297 [00:36<01:26,  2.39it/s]
 31%|###       | 91/297 [00:36<01:27,  2.35it/s]
 31%|###       | 92/297 [00:36<01:23,  2.44it/s]
 31%|###1      | 93/297 [00:37<01:22,  2.48it/s]
 32%|###1      | 94/297 [00:37<01:24,  2.40it/s]
 32%|###1      | 95/297 [00:38<01:21,  2.47it/s]
 32%|###2      | 96/297 [00:38<01:20,  2.50it/s]
 33%|###2      | 97/297 [00:38<01:21,  2.46it/s]
 33%|###2      | 98/297 [00:39<01:19,  2.49it/s]
 33%|###3      | 99/297 [00:39<01:19,  2.49it/s]
 34%|###3      | 100/297 [00:40<01:16,  2.56it/s]
 34%|###4      | 101/297 [00:40<01:15,  2.60it/s]
 34%|###4      | 102/297 [00:40<01:17,  2.52it/s]
 35%|###4      | 103/297 [00:41<01:21,  2.39it/s]
 35%|###5      | 104/297 [00:41<01:18,  2.46it/s]
 35%|###5      | 105/297 [00:42<01:17,  2.47it/s]
 36%|###5      | 106/297 [00:42<01:14,  2.55it/s]
 36%|###6      | 107/297 [00:42<01:15,  2.50it/s]
 36%|###6      | 108/297 [00:43<01:14,  2.53it/s]
 37%|###6      | 109/297 [00:43<01:14,  2.53it/s]
 37%|###7      | 110/297 [00:44<01:12,  2.57it/s]
 37%|###7      | 111/297 [00:44<01:11,  2.59it/s]
 38%|###7      | 112/297 [00:44<01:11,  2.60it/s]
 38%|###8      | 113/297 [00:45<01:09,  2.65it/s]
 38%|###8      | 114/297 [00:45<01:09,  2.64it/s]
 39%|###8      | 115/297 [00:46<01:12,  2.52it/s]
 39%|###9      | 116/297 [00:46<01:15,  2.41it/s]
 39%|###9      | 117/297 [00:46<01:10,  2.55it/s]
 40%|###9      | 118/297 [00:47<01:09,  2.58it/s]
 40%|####      | 119/297 [00:47<01:10,  2.52it/s]
 40%|####      | 120/297 [00:48<01:10,  2.53it/s]
 41%|####      | 121/297 [00:48<01:08,  2.56it/s]
 41%|####1     | 122/297 [00:48<01:08,  2.57it/s]
 41%|####1     | 123/297 [00:49<01:08,  2.55it/s]
 42%|####1     | 124/297 [00:49<01:11,  2.43it/s]
 42%|####2     | 125/297 [00:50<01:07,  2.53it/s]
 42%|####2     | 126/297 [00:50<01:11,  2.38it/s]
 43%|####2     | 127/297 [00:50<01:09,  2.45it/s]
 43%|####3     | 128/297 [00:51<01:07,  2.51it/s]
 43%|####3     | 129/297 [00:51<01:07,  2.49it/s]
 44%|####3     | 130/297 [00:52<01:07,  2.46it/s]
 44%|####4     | 131/297 [00:52<01:04,  2.56it/s]
 44%|####4     | 132/297 [00:52<01:07,  2.43it/s]
 45%|####4     | 133/297 [00:53<01:06,  2.45it/s]
 45%|####5     | 134/297 [00:53<01:06,  2.45it/s]
 45%|####5     | 135/297 [00:54<01:06,  2.43it/s]
 46%|####5     | 136/297 [00:54<01:02,  2.59it/s]
 46%|####6     | 137/297 [00:54<01:02,  2.55it/s]
 46%|####6     | 138/297 [00:55<01:05,  2.44it/s]
 47%|####6     | 139/297 [00:55<01:02,  2.53it/s]
 47%|####7     | 140/297 [00:56<01:02,  2.53it/s]
 47%|####7     | 141/297 [00:56<01:02,  2.51it/s]
 48%|####7     | 142/297 [00:56<00:59,  2.62it/s]
 48%|####8     | 143/297 [00:57<00:58,  2.63it/s]
 48%|####8     | 144/297 [00:57<00:58,  2.63it/s]
 49%|####8     | 145/297 [00:57<00:59,  2.57it/s]
 49%|####9     | 146/297 [00:58<00:59,  2.53it/s]
 49%|####9     | 147/297 [00:58<01:00,  2.48it/s]
 50%|####9     | 148/297 [00:59<00:58,  2.56it/s]
 50%|#####     | 149/297 [00:59<01:00,  2.44it/s]
 51%|#####     | 150/297 [00:59<00:58,  2.53it/s]
 51%|#####     | 151/297 [01:00<00:57,  2.56it/s]
 51%|#####1    | 152/297 [01:00<00:56,  2.55it/s]
 52%|#####1    | 153/297 [01:01<00:56,  2.56it/s]
 52%|#####1    | 154/297 [01:01<00:56,  2.55it/s]
 52%|#####2    | 155/297 [01:01<00:55,  2.56it/s]
 53%|#####2    | 156/297 [01:02<00:55,  2.54it/s]
 53%|#####2    | 157/297 [01:02<00:54,  2.59it/s]
 53%|#####3    | 158/297 [01:03<00:54,  2.53it/s]
 54%|#####3    | 159/297 [01:03<00:53,  2.58it/s]
 54%|#####3    | 160/297 [01:03<00:52,  2.62it/s]
 54%|#####4    | 161/297 [01:04<00:52,  2.60it/s]
 55%|#####4    | 162/297 [01:04<00:51,  2.63it/s]
 55%|#####4    | 163/297 [01:04<00:51,  2.61it/s]
 55%|#####5    | 164/297 [01:05<00:51,  2.56it/s]
 56%|#####5    | 165/297 [01:05<00:51,  2.57it/s]
 56%|#####5    | 166/297 [01:06<00:51,  2.52it/s]
 56%|#####6    | 167/297 [01:06<00:51,  2.51it/s]
 57%|#####6    | 168/297 [01:06<00:51,  2.53it/s]
 57%|#####6    | 169/297 [01:07<00:51,  2.50it/s]
 57%|#####7    | 170/297 [01:07<00:51,  2.48it/s]
 58%|#####7    | 171/297 [01:08<00:51,  2.46it/s]
 58%|#####7    | 172/297 [01:08<00:50,  2.46it/s]
 58%|#####8    | 173/297 [01:09<00:50,  2.46it/s]
 59%|#####8    | 174/297 [01:09<00:48,  2.54it/s]
 59%|#####8    | 175/297 [01:09<00:46,  2.60it/s]
 59%|#####9    | 176/297 [01:10<00:46,  2.60it/s]
 60%|#####9    | 177/297 [01:10<00:45,  2.66it/s]
 60%|#####9    | 178/297 [01:10<00:47,  2.52it/s]
 60%|######    | 179/297 [01:11<00:47,  2.49it/s]
 61%|######    | 180/297 [01:11<00:45,  2.60it/s]
 61%|######    | 181/297 [01:12<00:44,  2.63it/s]
 61%|######1   | 182/297 [01:12<00:44,  2.56it/s]
 62%|######1   | 183/297 [01:12<00:44,  2.56it/s]
 62%|######1   | 184/297 [01:13<00:43,  2.60it/s]
 62%|######2   | 185/297 [01:13<00:48,  2.33it/s]
 63%|######2   | 186/297 [01:14<00:46,  2.40it/s]
 63%|######2   | 187/297 [01:14<00:44,  2.45it/s]
 63%|######3   | 188/297 [01:14<00:44,  2.45it/s]
 64%|######3   | 189/297 [01:15<00:44,  2.41it/s]
 64%|######3   | 190/297 [01:15<00:43,  2.46it/s]
 64%|######4   | 191/297 [01:16<00:41,  2.54it/s]
 65%|######4   | 192/297 [01:16<00:41,  2.52it/s]
 65%|######4   | 193/297 [01:16<00:41,  2.49it/s]
 65%|######5   | 194/297 [01:17<00:40,  2.51it/s]
 66%|######5   | 195/297 [01:17<00:40,  2.54it/s]
 66%|######5   | 196/297 [01:18<00:40,  2.51it/s]
 66%|######6   | 197/297 [01:18<00:39,  2.52it/s]
 67%|######6   | 198/297 [01:18<00:39,  2.50it/s]
 67%|######7   | 199/297 [01:19<00:39,  2.49it/s]
 67%|######7   | 200/297 [01:19<00:39,  2.44it/s]
 68%|######7   | 201/297 [01:20<00:43,  2.23it/s]
 68%|######8   | 202/297 [01:20<00:40,  2.32it/s]
 68%|######8   | 203/297 [01:21<00:39,  2.38it/s]
 69%|######8   | 204/297 [01:21<00:37,  2.45it/s]
 69%|######9   | 205/297 [01:21<00:37,  2.43it/s]
 69%|######9   | 206/297 [01:22<00:36,  2.47it/s]
 70%|######9   | 207/297 [01:22<00:36,  2.44it/s]
 70%|#######   | 208/297 [01:23<00:36,  2.45it/s]
 70%|#######   | 209/297 [01:23<00:36,  2.39it/s]
 71%|#######   | 210/297 [01:23<00:35,  2.43it/s]
 71%|#######1  | 211/297 [01:24<00:34,  2.53it/s]
 71%|#######1  | 212/297 [01:24<00:33,  2.56it/s]
 72%|#######1  | 213/297 [01:25<00:32,  2.61it/s]
 72%|#######2  | 214/297 [01:25<00:32,  2.52it/s]
 72%|#######2  | 215/297 [01:25<00:32,  2.54it/s]
 73%|#######2  | 216/297 [01:26<00:34,  2.36it/s]
 73%|#######3  | 217/297 [01:26<00:33,  2.37it/s]
 73%|#######3  | 218/297 [01:27<00:32,  2.42it/s]
 74%|#######3  | 219/297 [01:27<00:32,  2.40it/s]
 74%|#######4  | 220/297 [01:28<00:31,  2.42it/s]
 74%|#######4  | 221/297 [01:28<00:31,  2.44it/s]
 75%|#######4  | 222/297 [01:28<00:30,  2.45it/s]
 75%|#######5  | 223/297 [01:29<00:30,  2.45it/s]
 75%|#######5  | 224/297 [01:29<00:32,  2.26it/s]
 76%|#######5  | 225/297 [01:30<00:31,  2.30it/s]
 76%|#######6  | 226/297 [01:30<00:29,  2.43it/s]
 76%|#######6  | 227/297 [01:30<00:28,  2.48it/s]
 77%|#######6  | 228/297 [01:31<00:28,  2.43it/s]
 77%|#######7  | 229/297 [01:31<00:27,  2.48it/s]
 77%|#######7  | 230/297 [01:32<00:26,  2.56it/s]
 78%|#######7  | 231/297 [01:32<00:25,  2.55it/s]
 78%|#######8  | 232/297 [01:32<00:25,  2.55it/s]
 78%|#######8  | 233/297 [01:33<00:24,  2.60it/s]
 79%|#######8  | 234/297 [01:33<00:24,  2.57it/s]
 79%|#######9  | 235/297 [01:34<00:24,  2.54it/s]
 79%|#######9  | 236/297 [01:34<00:24,  2.44it/s]
 80%|#######9  | 237/297 [01:34<00:24,  2.50it/s]
 80%|########  | 238/297 [01:35<00:23,  2.50it/s]
 80%|########  | 239/297 [01:35<00:22,  2.58it/s]
 81%|########  | 240/297 [01:35<00:21,  2.68it/s]
 81%|########1 | 241/297 [01:36<00:21,  2.57it/s]
 81%|########1 | 242/297 [01:36<00:21,  2.58it/s]
 82%|########1 | 243/297 [01:37<00:22,  2.44it/s]
 82%|########2 | 244/297 [01:37<00:21,  2.49it/s]
 82%|########2 | 245/297 [01:38<00:21,  2.41it/s]
 83%|########2 | 246/297 [01:38<00:20,  2.49it/s]
 83%|########3 | 247/297 [01:38<00:19,  2.58it/s]
 84%|########3 | 248/297 [01:39<00:18,  2.59it/s]
 84%|########3 | 249/297 [01:39<00:18,  2.66it/s]
 84%|########4 | 250/297 [01:39<00:17,  2.62it/s]
 85%|########4 | 251/297 [01:40<00:18,  2.55it/s]
 85%|########4 | 252/297 [01:40<00:17,  2.58it/s]
 85%|########5 | 253/297 [01:41<00:16,  2.60it/s]
 86%|########5 | 254/297 [01:41<00:16,  2.58it/s]
 86%|########5 | 255/297 [01:41<00:16,  2.60it/s]
 86%|########6 | 256/297 [01:42<00:15,  2.59it/s]
 87%|########6 | 257/297 [01:42<00:15,  2.64it/s]
 87%|########6 | 258/297 [01:43<00:15,  2.56it/s]
 87%|########7 | 259/297 [01:43<00:15,  2.49it/s]
 88%|########7 | 260/297 [01:43<00:14,  2.49it/s]
 88%|########7 | 261/297 [01:44<00:14,  2.49it/s]
 88%|########8 | 262/297 [01:44<00:14,  2.48it/s]
 89%|########8 | 263/297 [01:45<00:13,  2.45it/s]
 89%|########8 | 264/297 [01:45<00:13,  2.48it/s]
 89%|########9 | 265/297 [01:45<00:13,  2.45it/s]
 90%|########9 | 266/297 [01:46<00:12,  2.49it/s]
 90%|########9 | 267/297 [01:46<00:13,  2.30it/s]
 90%|######### | 268/297 [01:47<00:12,  2.38it/s]
 91%|######### | 269/297 [01:47<00:11,  2.38it/s]
 91%|######### | 270/297 [01:48<00:11,  2.33it/s]
 91%|#########1| 271/297 [01:48<00:10,  2.44it/s]
 92%|#########1| 272/297 [01:48<00:10,  2.39it/s]
 92%|#########1| 273/297 [01:49<00:09,  2.44it/s]
 92%|#########2| 274/297 [01:49<00:09,  2.42it/s]
 93%|#########2| 275/297 [01:50<00:08,  2.52it/s]
 93%|#########2| 276/297 [01:50<00:08,  2.44it/s]
 93%|#########3| 277/297 [01:50<00:08,  2.46it/s]
 94%|#########3| 278/297 [01:51<00:07,  2.46it/s]
 94%|#########3| 279/297 [01:51<00:07,  2.35it/s]
 94%|#########4| 280/297 [01:52<00:07,  2.41it/s]
 95%|#########4| 281/297 [01:52<00:06,  2.52it/s]
 95%|#########4| 282/297 [01:52<00:05,  2.53it/s]
 95%|#########5| 283/297 [01:53<00:05,  2.57it/s]
 96%|#########5| 284/297 [01:53<00:04,  2.66it/s]
 96%|#########5| 285/297 [01:54<00:04,  2.58it/s]
 96%|#########6| 286/297 [01:54<00:04,  2.59it/s]
 97%|#########6| 287/297 [01:54<00:03,  2.61it/s]
 97%|#########6| 288/297 [01:55<00:03,  2.64it/s]
 97%|#########7| 289/297 [01:55<00:03,  2.59it/s]
 98%|#########7| 290/297 [01:55<00:02,  2.56it/s]
 98%|#########7| 291/297 [01:56<00:02,  2.52it/s]
 98%|#########8| 292/297 [01:56<00:01,  2.54it/s]
 99%|#########8| 293/297 [01:57<00:01,  2.54it/s]
 99%|#########8| 294/297 [01:57<00:01,  2.48it/s]
 99%|#########9| 295/297 [01:57<00:00,  2.45it/s]
100%|#########9| 296/297 [01:58<00:00,  2.43it/s]
100%|##########| 297/297 [01:58<00:00,  2.87it/s]
02/17/2022 17:25:16 - INFO - __main__ - Validation-set | bleu: 6.74998952187005 | accuracy: 1.0
02/17/2022 17:25:24 - INFO - __main__ - Test-set | bleu: 0.0 | accuracy: 1.0
Configuration saved in out/tweet/t5_version_2\config.json
Model weights saved in out/tweet/t5_version_2\pytorch_model.bin
tokenizer config file saved in out/tweet/t5_version_2\tokenizer_config.json
Special tokens file saved in out/tweet/t5_version_2\special_tokens_map.json
Copy vocab file to out/tweet/t5_version_2\spiece.model

100%|##########| 297/297 [02:15<00:00,  2.19it/s]
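
After a run finishes, the saved checkpoint can also be queried directly to see what the model generates for a single input. A small sketch (the tweet text is made up; the prefix matches the one logged above):

from transformers import AutoTokenizer, T5ForConditionalGeneration

model_dir = "out/tweet/t5_version_2"
tokenizer = AutoTokenizer.from_pretrained(model_dir)
model = T5ForConditionalGeneration.from_pretrained(model_dir)

text = "tweet classification: this is just an example tweet"  # made-up input
inputs = tokenizer(text, return_tensors="pt")
output_ids = model.generate(**inputs, max_length=8)
print(tokenizer.decode(output_ids[0], skip_special_tokens=True))  # predicted label text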

EVALUATING MODELS

Roberta
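
Besides re-running run_glue.py with --do_eval, a saved checkpoint can be spot-checked on a single tweet. The sketch below assumes the checkpoint can be loaded with the stock sequence-classification head; the runs above use the custom classes from the local run_glue.py, so some head weights may be re-initialized on load (as the warnings further down also show).

import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

model_dir = "out/tweet/roberta_version_2"
tokenizer = AutoTokenizer.from_pretrained(model_dir)
model = AutoModelForSequenceClassification.from_pretrained(model_dir)
model.eval()

text = "this is just an example tweet"  # made-up input
inputs = tokenizer(text, truncation=True, max_length=128, return_tensors="pt")
with torch.no_grad():
    logits = model(**inputs).logits
print(logits.softmax(dim=-1).tolist(), "predicted label:", logits.argmax(dim=-1).item())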

#valid
!python run_glue.py \
  --model_name_or_path out/tweet/roberta_version_2 \
  --output_dir out/tweet/roberta_version_2-evaluation \
  --train_file data/train.json --validation_file data/valid.json \
  --do_eval \
  --per_device_eval_batch_size 24 --max_seq_length 128 \
  --return_hidden_states --custom_model
02/17/2022 17:22:05 - WARNING - __main__ - Process rank: -1, device: cpu, n_gpu: 0distributed training: False, 16-bits training: False
  0%|          | 0/2 [00:00<?, ?it/s]
100%|##########| 2/2 [00:00<00:00, 143.26it/s]
[INFO|configuration_utils.py:586] 2022-02-17 17:22:05,892 >> loading configuration file out/tweet/roberta_version_2\config.json
[INFO|configuration_utils.py:625] 2022-02-17 17:22:05,893 >> Model config RobertaConfig {
  "_name_or_path": "roberta-base",
02/17/2022 17:22:05 - INFO - __main__ - Training/evaluation parameters TrainingArguments(
_n_gpu=0,
adafactor=False,
adam_beta1=0.9,
adam_beta2=0.999,
adam_epsilon=1e-08,
dataloader_drop_last=False,
dataloader_num_workers=0,
dataloader_pin_memory=True,
ddp_find_unused_parameters=None,
debug=[],
deepspeed=None,
disable_tqdm=False,
do_eval=True,
do_predict=False,
do_train=False,
eval_accumulation_steps=None,
eval_steps=None,
evaluation_strategy=IntervalStrategy.NO,
fp16=False,
fp16_backend=auto,
fp16_full_eval=False,
fp16_opt_level=O1,
gradient_accumulation_steps=1,
gradient_checkpointing=False,
greater_is_better=None,
group_by_length=False,
hub_model_id=None,
hub_strategy=HubStrategy.EVERY_SAVE,
hub_token=<HUB_TOKEN>,
ignore_data_skip=False,
label_names=None,
label_smoothing_factor=0.0,
learning_rate=5e-05,
length_column_name=length,
load_best_model_at_end=False,
local_rank=-1,
log_level=-1,
log_level_replica=-1,
log_on_each_node=True,
logging_dir=out/tweet/roberta_version_2-evaluation\runs\Feb17_17-22-05_DESKTOP-K706NKK,
logging_first_step=False,
logging_nan_inf_filter=True,
logging_steps=500,
logging_strategy=IntervalStrategy.STEPS,
lr_scheduler_type=SchedulerType.LINEAR,
max_grad_norm=1.0,
max_steps=-1,
metric_for_best_model=None,
mp_parameters=,
no_cuda=False,
num_train_epochs=3.0,
output_dir=out/tweet/roberta_version_2-evaluation,
overwrite_output_dir=False,
past_index=-1,
per_device_eval_batch_size=24,
per_device_train_batch_size=8,
prediction_loss_only=False,
push_to_hub=False,
push_to_hub_model_id=None,
push_to_hub_organization=None,
push_to_hub_token=<PUSH_TO_HUB_TOKEN>,
remove_unused_columns=True,
report_to=[],
resume_from_checkpoint=None,
run_name=out/tweet/roberta_version_2-evaluation,
save_on_each_node=False,
save_steps=500,
save_strategy=IntervalStrategy.STEPS,
save_total_limit=None,
seed=42,
sharded_ddp=[],
skip_memory_metrics=True,
tpu_metrics_debug=False,
tpu_num_cores=None,
use_legacy_prediction_loop=False,
warmup_ratio=0.0,
warmup_steps=0,
weight_decay=0.0,
xpu_backend=None,
)
02/17/2022 17:22:05 - INFO - __main__ - load a local file for train: data/train.json
02/17/2022 17:22:05 - INFO - __main__ - load a local file for validation: data/valid.json
02/17/2022 17:22:05 - WARNING - datasets.builder - Using custom data configuration default-f2672b914d9c5a33
02/17/2022 17:22:05 - INFO - datasets.builder - Overwrite dataset info from restored data version.
02/17/2022 17:22:05 - INFO - datasets.info - Loading Dataset info from C:\Users\Foka\.cache\huggingface\datasets\json\default-f2672b914d9c5a33\0.0.0\c2d554c3377ea79c7664b93dc65d0803b45e3279000f993c7bfd18937fd7f426
02/17/2022 17:22:05 - WARNING - datasets.builder - Reusing dataset json (C:\Users\Foka\.cache\huggingface\datasets\json\default-f2672b914d9c5a33\0.0.0\c2d554c3377ea79c7664b93dc65d0803b45e3279000f993c7bfd18937fd7f426)
02/17/2022 17:22:05 - INFO - datasets.info - Loading Dataset info from C:\Users\Foka\.cache\huggingface\datasets\json\default-f2672b914d9c5a33\0.0.0\c2d554c3377ea79c7664b93dc65d0803b45e3279000f993c7bfd18937fd7f426
02/17/2022 17:22:05 - INFO - __main__ - Return hidden states from model: True
02/17/2022 17:22:05 - INFO - __main__ - Using implementation from: RobertaForSequenceClassificationCustomAlternative
02/17/2022 17:22:07 - INFO - datasets.arrow_dataset - Caching processed dataset at C:\Users\Foka\.cache\huggingface\datasets\json\default-f2672b914d9c5a33\0.0.0\c2d554c3377ea79c7664b93dc65d0803b45e3279000f993c7bfd18937fd7f426\cache-d1d24efe1f314f1d.arrow
02/17/2022 17:22:07 - INFO - datasets.arrow_dataset - Caching processed dataset at C:\Users\Foka\.cache\huggingface\datasets\json\default-f2672b914d9c5a33\0.0.0\c2d554c3377ea79c7664b93dc65d0803b45e3279000f993c7bfd18937fd7f426\cache-74073ef035f90484.arrow
02/17/2022 17:22:08 - INFO - __main__ - *** Evaluate ***
***** eval metrics *****
  eval_accuracy           =      0.938
  eval_loss               =      0.673
  eval_runtime            = 0:00:46.31
  eval_samples            =        500
  eval_samples_per_second =     10.795
  eval_steps_per_second   =      0.453
  "architectures": [
    "RobertaForSequenceClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": 0,
    "1": 1
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "0": 0,
    "1": 1
  },
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "problem_type": "single_label_classification",
  "torch_dtype": "float32",
  "transformers_version": "4.12.5",
  "type_vocab_size": 1,
  "use_cache": true,
  "vocab_size": 50265
}

[INFO|tokenization_utils_base.py:1671] 2022-02-17 17:22:05,900 >> Didn't find file out/tweet/roberta_version_2\added_tokens.json. We won't load it.
[INFO|tokenization_utils_base.py:1740] 2022-02-17 17:22:05,900 >> loading file out/tweet/roberta_version_2\vocab.json
[INFO|tokenization_utils_base.py:1740] 2022-02-17 17:22:05,900 >> loading file out/tweet/roberta_version_2\merges.txt
[INFO|tokenization_utils_base.py:1740] 2022-02-17 17:22:05,900 >> loading file out/tweet/roberta_version_2\tokenizer.json
[INFO|tokenization_utils_base.py:1740] 2022-02-17 17:22:05,900 >> loading file None
[INFO|tokenization_utils_base.py:1740] 2022-02-17 17:22:05,900 >> loading file out/tweet/roberta_version_2\special_tokens_map.json
[INFO|tokenization_utils_base.py:1740] 2022-02-17 17:22:05,900 >> loading file out/tweet/roberta_version_2\tokenizer_config.json
[INFO|modeling_utils.py:1349] 2022-02-17 17:22:05,959 >> loading weights file out/tweet/roberta_version_2\pytorch_model.bin
[WARNING|modeling_utils.py:1609] 2022-02-17 17:22:07,196 >> Some weights of the model checkpoint at out/tweet/roberta_version_2 were not used when initializing RobertaForSequenceClassificationCustomAlternative: ['classifier.dense.weight', 'classifier.dense.bias']
- This IS expected if you are initializing RobertaForSequenceClassificationCustomAlternative from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassificationCustomAlternative from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
[WARNING|modeling_utils.py:1620] 2022-02-17 17:22:07,196 >> Some weights of RobertaForSequenceClassificationCustomAlternative were not initialized from the model checkpoint at out/tweet/roberta_version_2 and are newly initialized: ['classifier.dense_1_hidden.weight', 'classifier.dense_2.weight', 'classifier.dense_1_input.weight', 'classifier.dense_1_hidden.bias', 'classifier.dense_1_input.bias', 'classifier.dense_2.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.

Running tokenizer on dataset:   0%|          | 0/5 [00:00<?, ?ba/s]
Running tokenizer on dataset:  60%|######    | 3/5 [00:00<00:00, 22.77ba/s]
Running tokenizer on dataset: 100%|##########| 5/5 [00:00<00:00, 26.11ba/s]

Running tokenizer on dataset:   0%|          | 0/1 [00:00<?, ?ba/s]
Running tokenizer on dataset: 100%|##########| 1/1 [00:00<00:00, 58.98ba/s]
[INFO|trainer.py:540] 2022-02-17 17:22:08,390 >> The following columns in the evaluation set  don't have a corresponding argument in `RobertaForSequenceClassificationCustomAlternative.forward` and have been ignored: tweet.
[INFO|trainer.py:2243] 2022-02-17 17:22:08,392 >> ***** Running Evaluation *****
[INFO|trainer.py:2245] 2022-02-17 17:22:08,392 >>   Num examples = 500
[INFO|trainer.py:2248] 2022-02-17 17:22:08,392 >>   Batch size = 24

  0%|          | 0/21 [00:00<?, ?it/s]
100%|##########| 21/21 [00:44<00:00,  2.10s/it]
[INFO|modelcard.py:449] 2022-02-17 17:22:55,278 >> Dropping the following result as it does not have all the necessary fields:
{'task': {'name': 'Text Classification', 'type': 'text-classification'}}
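
The warnings above show that loading this checkpoint into RobertaForSequenceClassificationCustomAlternative leaves the extra classifier layers (classifier.dense_1_input, classifier.dense_1_hidden, classifier.dense_2) freshly initialized. The actual class is defined in run_glue.py in this repository; the snippet below is only a minimal sketch of a head with that layer layout, under the assumption that it combines a pooled representation with an additional hidden-state feature, and every name other than the three layer names is illustrative.

# Illustrative sketch only; not the RobertaForSequenceClassificationCustomAlternative
# implementation from run_glue.py.
import torch
from torch import nn

class SketchClassificationHead(nn.Module):
    def __init__(self, hidden_size: int, num_labels: int, dropout: float = 0.1):
        super().__init__()
        self.dense_1_input = nn.Linear(hidden_size, hidden_size)   # branch fed by the pooled <s> token
        self.dense_1_hidden = nn.Linear(hidden_size, hidden_size)  # branch fed by an extra hidden-state feature
        self.dense_2 = nn.Linear(2 * hidden_size, hidden_size)     # merges the two branches
        self.out_proj = nn.Linear(hidden_size, num_labels)
        self.dropout = nn.Dropout(dropout)

    def forward(self, pooled: torch.Tensor, hidden_feature: torch.Tensor) -> torch.Tensor:
        a = torch.tanh(self.dense_1_input(self.dropout(pooled)))
        b = torch.tanh(self.dense_1_hidden(self.dropout(hidden_feature)))
        merged = torch.tanh(self.dense_2(self.dropout(torch.cat([a, b], dim=-1))))
        return self.out_proj(self.dropout(merged))

head = SketchClassificationHead(hidden_size=768, num_labels=2)  # dims match the config above
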
# evaluate on the test set
!python run_glue.py \
--model_name_or_path out/tweet/roberta_version_4 \
--output_dir out/tweet/roberta_version_4-evaluation \
--return_hidden_states --custom_model \
--train_file data/train.json --validation_file data/test.json \
--do_eval \
--per_device_eval_batch_size 24 --max_seq_length 128
02/16/2022 01:12:34 - WARNING - __main__ - Process rank: -1, device: cpu, n_gpu: 0, distributed training: False, 16-bits training: False
02/16/2022 01:12:34 - INFO - __main__ - Training/evaluation parameters TrainingArguments(
_n_gpu=0,
adafactor=False,
adam_beta1=0.9,
adam_beta2=0.999,
adam_epsilon=1e-08,
dataloader_drop_last=False,
dataloader_num_workers=0,
dataloader_pin_memory=True,
ddp_find_unused_parameters=None,
  0%|          | 0/2 [00:00<?, ?it/s]
100%|##########| 2/2 [00:00<00:00, 167.11it/s]
[INFO|configuration_utils.py:586] 2022-02-16 01:12:34,776 >> loading configuration file out/tweet/roberta_version_4\config.json
[INFO|configuration_utils.py:625] 2022-02-16 01:12:34,776 >> Model config RobertaConfig {
  "_name_or_path": "roberta-base",
  "architectures": [
    "RobertaForSequenceClassificationCustomAlternative"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": 0,
    "1": 1
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "0": 0,
    "1": 1
  },
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "problem_type": "single_label_classification",
  "torch_dtype": "float32",
  "transformers_version": "4.12.5",
  "type_vocab_size": 1,
  "use_cache": true,
  "vocab_size": 50265
}

[INFO|tokenization_utils_base.py:1671] 2022-02-16 01:12:34,779 >> Didn't find file out/tweet/roberta_version_4\added_tokens.json. We won't load it.
[INFO|tokenization_utils_base.py:1740] 2022-02-16 01:12:34,779 >> loading file out/tweet/roberta_version_4\vocab.json
[INFO|tokenization_utils_base.py:1740] 2022-02-16 01:12:34,779 >> loading file out/tweet/roberta_version_4\merges.txt
[INFO|tokenization_utils_base.py:1740] 2022-02-16 01:12:34,779 >> loading file out/tweet/roberta_version_4\tokenizer.json
[INFO|tokenization_utils_base.py:1740] 2022-02-16 01:12:34,779 >> loading file None
[INFO|tokenization_utils_base.py:1740] 2022-02-16 01:12:34,779 >> loading file out/tweet/roberta_version_4\special_tokens_map.json
[INFO|tokenization_utils_base.py:1740] 2022-02-16 01:12:34,780 >> loading file out/tweet/roberta_version_4\tokenizer_config.json
[INFO|modeling_utils.py:1349] 2022-02-16 01:12:34,829 >> loading weights file out/tweet/roberta_version_4\pytorch_model.bin
[INFO|modeling_utils.py:1618] 2022-02-16 01:12:35,990 >> All model checkpoint weights were used when initializing RobertaForSequenceClassificationCustomAlternative.

[INFO|modeling_utils.py:1626] 2022-02-16 01:12:35,990 >> All the weights of RobertaForSequenceClassificationCustomAlternative were initialized from the model checkpoint at out/tweet/roberta_version_4.
If your task is similar to the task the model of the checkpoint was trained on, you can already use RobertaForSequenceClassificationCustomAlternative for predictions without further training.

Running tokenizer on dataset:   0%|          | 0/1 [00:00<?, ?ba/s]
Running tokenizer on dataset: 100%|##########| 1/1 [00:00<00:00, 26.36ba/s]
[INFO|trainer.py:540] 2022-02-16 01:12:36,822 >> The following columns in the evaluation set  don't have a corresponding argument in `RobertaForSequenceClassificationCustomAlternative.forward` and have been ignored: tweet.
[INFO|trainer.py:2243] 2022-02-16 01:12:36,823 >> ***** Running Evaluation *****
[INFO|trainer.py:2245] 2022-02-16 01:12:36,824 >>   Num examples = 500
[INFO|trainer.py:2248] 2022-02-16 01:12:36,824 >>   Batch size = 24

  0%|          | 0/21 [00:00<?, ?it/s]
debug=[],
deepspeed=None,
disable_tqdm=False,
do_eval=True,
do_predict=False,
do_train=False,
eval_accumulation_steps=None,
eval_steps=None,
evaluation_strategy=IntervalStrategy.NO,
fp16=False,
fp16_backend=auto,
fp16_full_eval=False,
fp16_opt_level=O1,
gradient_accumulation_steps=1,
gradient_checkpointing=False,
greater_is_better=None,
group_by_length=False,
hub_model_id=None,
hub_strategy=HubStrategy.EVERY_SAVE,
hub_token=<HUB_TOKEN>,
ignore_data_skip=False,
label_names=None,
label_smoothing_factor=0.0,
learning_rate=5e-05,
length_column_name=length,
load_best_model_at_end=False,
local_rank=-1,
log_level=-1,
log_level_replica=-1,
log_on_each_node=True,
logging_dir=out/tweet/roberta_version_4-evaluation\runs\Feb16_01-12-34_DESKTOP-K706NKK,
logging_first_step=False,
logging_nan_inf_filter=True,
logging_steps=500,
logging_strategy=IntervalStrategy.STEPS,
lr_scheduler_type=SchedulerType.LINEAR,
max_grad_norm=1.0,
max_steps=-1,
metric_for_best_model=None,
mp_parameters=,
no_cuda=False,
num_train_epochs=3.0,
output_dir=out/tweet/roberta_version_4-evaluation,
overwrite_output_dir=False,
past_index=-1,
per_device_eval_batch_size=24,
per_device_train_batch_size=8,
prediction_loss_only=False,
push_to_hub=False,
push_to_hub_model_id=None,
push_to_hub_organization=None,
push_to_hub_token=<PUSH_TO_HUB_TOKEN>,
remove_unused_columns=True,
report_to=[],
resume_from_checkpoint=None,
run_name=out/tweet/roberta_version_4-evaluation,
save_on_each_node=False,
save_steps=500,
save_strategy=IntervalStrategy.STEPS,
save_total_limit=None,
seed=42,
sharded_ddp=[],
skip_memory_metrics=True,
tpu_metrics_debug=False,
tpu_num_cores=None,
use_legacy_prediction_loop=False,
warmup_ratio=0.0,
warmup_steps=0,
weight_decay=0.0,
xpu_backend=None,
)
02/16/2022 01:12:34 - INFO - __main__ - load a local file for train: data/train.json
02/16/2022 01:12:34 - INFO - __main__ - load a local file for validation: data/test.json
02/16/2022 01:12:34 - WARNING - datasets.builder - Using custom data configuration default-aa408910693fa782
02/16/2022 01:12:34 - INFO - datasets.builder - Overwrite dataset info from restored data version.
02/16/2022 01:12:34 - INFO - datasets.info - Loading Dataset info from C:\Users\Foka\.cache\huggingface\datasets\json\default-aa408910693fa782\0.0.0\c2d554c3377ea79c7664b93dc65d0803b45e3279000f993c7bfd18937fd7f426
02/16/2022 01:12:34 - WARNING - datasets.builder - Reusing dataset json (C:\Users\Foka\.cache\huggingface\datasets\json\default-aa408910693fa782\0.0.0\c2d554c3377ea79c7664b93dc65d0803b45e3279000f993c7bfd18937fd7f426)
02/16/2022 01:12:34 - INFO - datasets.info - Loading Dataset info from C:\Users\Foka\.cache\huggingface\datasets\json\default-aa408910693fa782\0.0.0\c2d554c3377ea79c7664b93dc65d0803b45e3279000f993c7bfd18937fd7f426
02/16/2022 01:12:34 - INFO - __main__ - Return hidden states from model: True
02/16/2022 01:12:34 - INFO - __main__ - Using implementation from: RobertaForSequenceClassificationCustomAlternative
02/16/2022 01:12:36 - WARNING - datasets.arrow_dataset - Loading cached processed dataset at C:\Users\Foka\.cache\huggingface\datasets\json\default-aa408910693fa782\0.0.0\c2d554c3377ea79c7664b93dc65d0803b45e3279000f993c7bfd18937fd7f426\cache-7c7dda0a4623bcbe.arrow
02/16/2022 01:12:36 - INFO - datasets.arrow_dataset - Caching processed dataset at C:\Users\Foka\.cache\huggingface\datasets\json\default-aa408910693fa782\0.0.0\c2d554c3377ea79c7664b93dc65d0803b45e3279000f993c7bfd18937fd7f426\cache-eec123a569b1837d.arrow
02/16/2022 01:12:36 - INFO - __main__ - *** Evaluate ***
***** eval metrics *****
  eval_accuracy           =        1.0
  eval_loss               =     0.6472
  eval_runtime            = 0:00:45.49
  eval_samples            =        500
  eval_samples_per_second =     10.991
  eval_steps_per_second   =      0.462
100%|##########| 21/21 [00:43<00:00,  2.06s/it]
[INFO|modelcard.py:449] 2022-02-16 01:13:22,843 >> Dropping the following result as it does not have all the necessary fields:
{'task': {'name': 'Text Classification', 'type': 'text-classification'}}
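
For reference, the eval_accuracy values in these blocks come from a compute_metrics callback that the Trainer applies to its predictions; the exact function used here is the one defined in run_glue.py. A minimal sketch of the usual pattern, assuming single-label classification and the datasets accuracy metric, looks like this:

import numpy as np
from datasets import load_metric

accuracy_metric = load_metric("accuracy")

def compute_metrics(eval_pred):
    # eval_pred is the (logits, labels) pair the Trainer passes in during --do_eval
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)  # highest-scoring class per example
    return accuracy_metric.compute(predictions=predictions, references=labels)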

GPT2

# evaluate on the validation set
!python run_glue.py \
--model_name_or_path out/tweet/gpt2_version_2 \
--output_dir out/tweet/gpt2_version_2-evaluation \
--return_hidden_states --custom_model \
--train_file data/train.json --validation_file data/valid.json \
--do_eval \
--per_device_eval_batch_size 24 --max_seq_length 128
02/17/2022 17:25:29 - WARNING - __main__ - Process rank: -1, device: cpu, n_gpu: 0, distributed training: False, 16-bits training: False
02/17/2022 17:25:29 - INFO - __main__ - Training/evaluation parameters TrainingArguments(
_n_gpu=0,
adafactor=False,
adam_beta1=0.9,
adam_beta2=0.999,
adam_epsilon=1e-08,
dataloader_drop_last=False,
dataloader_num_workers=0,
dataloader_pin_memory=True,
ddp_find_unused_parameters=None,
debug=[],
deepspeed=None,
disable_tqdm=False,
do_eval=True,
do_predict=False,
do_train=False,
eval_accumulation_steps=None,
eval_steps=None,
evaluation_strategy=IntervalStrategy.NO,
fp16=False,
fp16_backend=auto,
fp16_full_eval=False,
fp16_opt_level=O1,
gradient_accumulation_steps=1,
gradient_checkpointing=False,
greater_is_better=None,
group_by_length=False,
hub_model_id=None,
hub_strategy=HubStrategy.EVERY_SAVE,
hub_token=<HUB_TOKEN>,
ignore_data_skip=False,
label_names=None,
label_smoothing_factor=0.0,
learning_rate=5e-05,
length_column_name=length,
load_best_model_at_end=False,
local_rank=-1,
log_level=-1,
log_level_replica=-1,
log_on_each_node=True,
logging_dir=out/tweet/gpt2_version_2-evaluation\runs\Feb17_17-25-29_DESKTOP-K706NKK,
logging_first_step=False,
logging_nan_inf_filter=True,
logging_steps=500,
logging_strategy=IntervalStrategy.STEPS,
lr_scheduler_type=SchedulerType.LINEAR,
max_grad_norm=1.0,
max_steps=-1,
metric_for_best_model=None,
mp_parameters=,
no_cuda=False,
num_train_epochs=3.0,
output_dir=out/tweet/gpt2_version_2-evaluation,
overwrite_output_dir=False,
past_index=-1,
per_device_eval_batch_size=24,
per_device_train_batch_size=8,
prediction_loss_only=False,
push_to_hub=False,
push_to_hub_model_id=None,
push_to_hub_organization=None,
push_to_hub_token=<PUSH_TO_HUB_TOKEN>,
remove_unused_columns=True,
report_to=[],
resume_from_checkpoint=None,
run_name=out/tweet/gpt2_version_2-evaluation,
save_on_each_node=False,
save_steps=500,
save_strategy=IntervalStrategy.STEPS,
save_total_limit=None,
seed=42,
sharded_ddp=[],
skip_memory_metrics=True,
tpu_metrics_debug=False,
tpu_num_cores=None,
use_legacy_prediction_loop=False,
warmup_ratio=0.0,
warmup_steps=0,
weight_decay=0.0,
xpu_backend=None,
)
02/17/2022 17:25:29 - INFO - __main__ - load a local file for train: data/train.json
02/17/2022 17:25:29 - INFO - __main__ - load a local file for validation: data/valid.json
02/17/2022 17:25:29 - WARNING - datasets.builder - Using custom data configuration default-f2672b914d9c5a33
02/17/2022 17:25:29 - INFO - datasets.builder - Overwrite dataset info from restored data version.
02/17/2022 17:25:29 - INFO - datasets.info - Loading Dataset info from C:\Users\Foka\.cache\huggingface\datasets\json\default-f2672b914d9c5a33\0.0.0\c2d554c3377ea79c7664b93dc65d0803b45e3279000f993c7bfd18937fd7f426
02/17/2022 17:25:29 - WARNING - datasets.builder - Reusing dataset json (C:\Users\Foka\.cache\huggingface\datasets\json\default-f2672b914d9c5a33\0.0.0\c2d554c3377ea79c7664b93dc65d0803b45e3279000f993c7bfd18937fd7f426)
02/17/2022 17:25:29 - INFO - datasets.info - Loading Dataset info from C:\Users\Foka\.cache\huggingface\datasets\json\default-f2672b914d9c5a33\0.0.0\c2d554c3377ea79c7664b93dc65d0803b45e3279000f993c7bfd18937fd7f426
02/17/2022 17:25:29 - INFO - __main__ - Return hidden states from model: True
02/17/2022 17:25:29 - INFO - __main__ - Using implementation from: GPT2ForSequenceClassificationCustom
02/17/2022 17:25:31 - INFO - datasets.arrow_dataset - Caching processed dataset at C:\Users\Foka\.cache\huggingface\datasets\json\default-f2672b914d9c5a33\0.0.0\c2d554c3377ea79c7664b93dc65d0803b45e3279000f993c7bfd18937fd7f426\cache-212f78cac2ca92a1.arrow
02/17/2022 17:25:31 - INFO - datasets.arrow_dataset - Caching processed dataset at C:\Users\Foka\.cache\huggingface\datasets\json\default-f2672b914d9c5a33\0.0.0\c2d554c3377ea79c7664b93dc65d0803b45e3279000f993c7bfd18937fd7f426\cache-95c22eb06b0faad8.arrow
02/17/2022 17:25:32 - INFO - __main__ - *** Evaluate ***
***** eval metrics *****
  eval_accuracy           =      0.938
  eval_loss               =     0.4886
  eval_runtime            = 0:01:01.53
  eval_samples            =        500
  eval_samples_per_second =      8.126
  eval_steps_per_second   =      0.341
  0%|          | 0/2 [00:00<?, ?it/s]
100%|##########| 2/2 [00:00<00:00, 2018.43it/s]
[INFO|configuration_utils.py:586] 2022-02-17 17:25:29,863 >> loading configuration file out/tweet/gpt2_version_2\config.json
[INFO|configuration_utils.py:625] 2022-02-17 17:25:29,864 >> Model config GPT2Config {
  "_name_or_path": "gpt2",
  "activation_function": "gelu_new",
  "architectures": [
    "GPT2ForSequenceClassification"
  ],
  "attn_pdrop": 0.1,
  "bos_token_id": 50256,
  "embd_pdrop": 0.1,
  "eos_token_id": 50256,
  "id2label": {
    "0": 0,
    "1": 1
  },
  "initializer_range": 0.02,
  "label2id": {
    "0": 0,
    "1": 1
  },
  "layer_norm_epsilon": 1e-05,
  "model_type": "gpt2",
  "n_ctx": 1024,
  "n_embd": 768,
  "n_head": 12,
  "n_inner": null,
  "n_layer": 12,
  "n_positions": 1024,
  "pad_token_id": 50256,
  "reorder_and_upcast_attn": false,
  "resid_pdrop": 0.1,
  "scale_attn_by_inverse_layer_idx": false,
  "scale_attn_weights": true,
  "summary_activation": null,
  "summary_first_dropout": 0.1,
  "summary_proj_to_labels": true,
  "summary_type": "cls_index",
  "summary_use_proj": true,
  "task_specific_params": {
    "text-generation": {
      "do_sample": true,
      "max_length": 50
    }
  },
  "torch_dtype": "float32",
  "transformers_version": "4.12.5",
  "use_cache": true,
  "vocab_size": 50257
}

[INFO|tokenization_utils_base.py:1671] 2022-02-17 17:25:29,868 >> Didn't find file out/tweet/gpt2_version_2\added_tokens.json. We won't load it.
[INFO|tokenization_utils_base.py:1740] 2022-02-17 17:25:29,869 >> loading file out/tweet/gpt2_version_2\vocab.json
[INFO|tokenization_utils_base.py:1740] 2022-02-17 17:25:29,869 >> loading file out/tweet/gpt2_version_2\merges.txt
[INFO|tokenization_utils_base.py:1740] 2022-02-17 17:25:29,869 >> loading file out/tweet/gpt2_version_2\tokenizer.json
[INFO|tokenization_utils_base.py:1740] 2022-02-17 17:25:29,869 >> loading file None
[INFO|tokenization_utils_base.py:1740] 2022-02-17 17:25:29,869 >> loading file out/tweet/gpt2_version_2\special_tokens_map.json
[INFO|tokenization_utils_base.py:1740] 2022-02-17 17:25:29,869 >> loading file out/tweet/gpt2_version_2\tokenizer_config.json
[INFO|modeling_utils.py:1349] 2022-02-17 17:25:29,927 >> loading weights file out/tweet/gpt2_version_2\pytorch_model.bin
[WARNING|modeling_utils.py:1609] 2022-02-17 17:25:31,677 >> Some weights of the model checkpoint at out/tweet/gpt2_version_2 were not used when initializing GPT2ForSequenceClassificationCustom: ['score.weight']
- This IS expected if you are initializing GPT2ForSequenceClassificationCustom from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing GPT2ForSequenceClassificationCustom from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
[WARNING|modeling_utils.py:1620] 2022-02-17 17:25:31,677 >> Some weights of GPT2ForSequenceClassificationCustom were not initialized from the model checkpoint at out/tweet/gpt2_version_2 and are newly initialized: ['score.out_proj.weight', 'score.dense_1_input.bias', 'score.dense_1_hidden.bias', 'score.dense_2.weight', 'score.dense_2.bias', 'score.dense_1_hidden.weight', 'score.dense_1_input.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.

Running tokenizer on dataset:   0%|          | 0/5 [00:00<?, ?ba/s]
Running tokenizer on dataset:  40%|####      | 2/5 [00:00<00:00, 18.16ba/s]
Running tokenizer on dataset: 100%|##########| 5/5 [00:00<00:00, 25.52ba/s]

Running tokenizer on dataset:   0%|          | 0/1 [00:00<?, ?ba/s]
Running tokenizer on dataset: 100%|##########| 1/1 [00:00<00:00, 58.98ba/s]
[INFO|trainer.py:540] 2022-02-17 17:25:32,736 >> The following columns in the evaluation set  don't have a corresponding argument in `GPT2ForSequenceClassificationCustom.forward` and have been ignored: tweet.
[INFO|trainer.py:2243] 2022-02-17 17:25:32,737 >> ***** Running Evaluation *****
[INFO|trainer.py:2245] 2022-02-17 17:25:32,737 >>   Num examples = 500
[INFO|trainer.py:2248] 2022-02-17 17:25:32,737 >>   Batch size = 24

  0%|          | 0/21 [00:00<?, ?it/s]
100%|##########| 21/21 [00:58<00:00,  2.79s/it]
[INFO|modelcard.py:449] 2022-02-17 17:26:34,864 >> Dropping the following result as it does not have all the necessary fields:
{'task': {'name': 'Text Classification', 'type': 'text-classification'}}
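
Unlike RoBERTa, GPT-2 has no dedicated padding token, which is why the config above sets pad_token_id to 50256, i.e. the EOS token. The snippet below is a hypothetical quick check of the fine-tuned checkpoint outside run_glue.py; it assumes the stock GPT2ForSequenceClassification head is compatible with the saved weights (the "architectures" field in the config above suggests so), since the custom GPT2ForSequenceClassificationCustom class is only available inside run_glue.py. The example tweet is made up.

import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

checkpoint = "out/tweet/gpt2_version_2"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)

# Reuse the EOS token (id 50256) for padding, matching pad_token_id in the config dump above.
tokenizer.pad_token = tokenizer.eos_token
model.config.pad_token_id = tokenizer.pad_token_id

inputs = tokenizer(["example tweet text"], padding=True, truncation=True,
                   max_length=128, return_tensors="pt")
with torch.no_grad():
    logits = model(**inputs).logits
print(logits.argmax(dim=-1))  # predicted class id (0 or 1)
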

T5

# evaluate on the test set
!python run_translation.py \
--model_name_or_path out/tweet/t5_version_2 \
--output_dir out/tweet/t5-evaluation \
--train_file data/translations-train.json \
--validation_file data/translations-test.json \
--do_eval \
--per_device_eval_batch_size 16 \
--source_lang text \
--target_lang label \
--source_prefix "tweet classification" \
--max_source_length 256 \
--max_target_length 128 \
--predict_with_generate
02/17/2022 17:36:52 - WARNING - __main__ - Process rank: -1, device: cpu, n_gpu: 0, distributed training: False, 16-bits training: False
02/17/2022 17:36:52 - INFO - __main__ - Training/evaluation parameters Seq2SeqTrainingArguments(
_n_gpu=0,
adafactor=False,
adam_beta1=0.9,
adam_beta2=0.999,
adam_epsilon=1e-08,
dataloader_drop_last=False,
dataloader_num_workers=0,
dataloader_pin_memory=True,
ddp_find_unused_parameters=None,
debug=[],
deepspeed=None,
disable_tqdm=False,
do_eval=True,
do_predict=False,
do_train=False,
eval_accumulation_steps=None,
eval_steps=None,
evaluation_strategy=IntervalStrategy.NO,
fp16=False,
fp16_backend=auto,
fp16_full_eval=False,
fp16_opt_level=O1,
generation_max_length=None,
generation_num_beams=None,
gradient_accumulation_steps=1,
gradient_checkpointing=False,
greater_is_better=None,
group_by_length=False,
hub_model_id=None,
hub_strategy=HubStrategy.EVERY_SAVE,
hub_token=<HUB_TOKEN>,
ignore_data_skip=False,
label_names=None,
label_smoothing_factor=0.0,
learning_rate=5e-05,
length_column_name=length,
load_best_model_at_end=False,
local_rank=-1,
log_level=-1,
log_level_replica=-1,
log_on_each_node=True,
logging_dir=out/tweet/t5-evaluation\runs\Feb17_17-36-52_DESKTOP-K706NKK,
logging_first_step=False,
logging_nan_inf_filter=True,
logging_steps=500,
logging_strategy=IntervalStrategy.STEPS,
lr_scheduler_type=SchedulerType.LINEAR,
max_grad_norm=1.0,
max_steps=-1,
metric_for_best_model=None,
mp_parameters=,
no_cuda=False,
num_train_epochs=3.0,
output_dir=out/tweet/t5-evaluation,
overwrite_output_dir=False,
past_index=-1,
per_device_eval_batch_size=16,
per_device_train_batch_size=8,
predict_with_generate=True,
prediction_loss_only=False,
push_to_hub=False,
push_to_hub_model_id=None,
push_to_hub_organization=None,
push_to_hub_token=<PUSH_TO_HUB_TOKEN>,
remove_unused_columns=True,
report_to=[],
resume_from_checkpoint=None,
run_name=out/tweet/t5-evaluation,
save_on_each_node=False,
save_steps=500,
save_strategy=IntervalStrategy.STEPS,
save_total_limit=None,
seed=42,
sharded_ddp=[],
skip_memory_metrics=True,
sortish_sampler=False,
tpu_metrics_debug=False,
tpu_num_cores=None,
use_legacy_prediction_loop=False,
warmup_ratio=0.0,
warmup_steps=0,
weight_decay=0.0,
xpu_backend=None,
)
02/17/2022 17:36:52 - WARNING - datasets.builder - Using custom data configuration default-6d5bc754bbaa91d7
02/17/2022 17:36:52 - INFO - datasets.builder - Overwrite dataset info from restored data version.
02/17/2022 17:36:52 - INFO - datasets.info - Loading Dataset info from C:\Users\Foka\.cache\huggingface\datasets\json\default-6d5bc754bbaa91d7\0.0.0\c2d554c3377ea79c7664b93dc65d0803b45e3279000f993c7bfd18937fd7f426
02/17/2022 17:36:52 - WARNING - datasets.builder - Reusing dataset json (C:\Users\Foka\.cache\huggingface\datasets\json\default-6d5bc754bbaa91d7\0.0.0\c2d554c3377ea79c7664b93dc65d0803b45e3279000f993c7bfd18937fd7f426)
02/17/2022 17:36:52 - INFO - datasets.info - Loading Dataset info from C:\Users\Foka\.cache\huggingface\datasets\json\default-6d5bc754bbaa91d7\0.0.0\c2d554c3377ea79c7664b93dc65d0803b45e3279000f993c7bfd18937fd7f426
02/17/2022 17:36:53 - INFO - __main__ - Using translation prefix: "tweet classification: "
02/17/2022 17:36:53 - INFO - datasets.arrow_dataset - Caching processed dataset at C:\Users\Foka\.cache\huggingface\datasets\json\default-6d5bc754bbaa91d7\0.0.0\c2d554c3377ea79c7664b93dc65d0803b45e3279000f993c7bfd18937fd7f426\cache-96f3d337ad66e082.arrow
02/17/2022 17:36:55 - INFO - __main__ - *** Evaluate ***
02/17/2022 17:37:09 - INFO - datasets.metric - Removing C:\Users\Foka\.cache\huggingface\metrics\accuracy\default\default_experiment-1-0.arrow
02/17/2022 17:37:09 - INFO - datasets.metric - Removing C:\Users\Foka\.cache\huggingface\metrics\sacrebleu\default\default_experiment-1-0.arrow
***** eval metrics *****
  eval_accuracy           =        1.0
  eval_bleu               =        0.0
  eval_gen_len            =      2.272
  eval_loss               =     0.5538
  eval_runtime            = 0:00:14.42
  eval_samples            =        500
  eval_samples_per_second =     34.659
  eval_steps_per_second   =      2.218
  0%|          | 0/2 [00:00<?, ?it/s]
100%|##########| 2/2 [00:00<00:00, 2020.86it/s]
[INFO|configuration_utils.py:586] 2022-02-17 17:36:52,675 >> loading configuration file out/tweet/t5_version_2\config.json
[INFO|configuration_utils.py:625] 2022-02-17 17:36:52,677 >> Model config T5Config {
  "_name_or_path": "t5-small",
  "architectures": [
    "T5ForConditionalGeneration"
  ],
  "d_ff": 2048,
  "d_kv": 64,
  "d_model": 512,
  "decoder_start_token_id": 0,
  "dropout_rate": 0.1,
  "eos_token_id": 1,
  "feed_forward_proj": "relu",
  "initializer_factor": 1.0,
  "is_encoder_decoder": true,
  "layer_norm_epsilon": 1e-06,
  "model_type": "t5",
  "n_positions": 512,
  "num_decoder_layers": 6,
  "num_heads": 8,
  "num_layers": 6,
  "output_past": true,
  "pad_token_id": 0,
  "relative_attention_num_buckets": 32,
  "task_specific_params": {
    "summarization": {
      "early_stopping": true,
      "length_penalty": 2.0,
      "max_length": 200,
      "min_length": 30,
      "no_repeat_ngram_size": 3,
      "num_beams": 4,
      "prefix": "summarize: "
    },
    "translation_en_to_de": {
      "early_stopping": true,
      "max_length": 300,
      "num_beams": 4,
      "prefix": "translate English to German: "
    },
    "translation_en_to_fr": {
      "early_stopping": true,
      "max_length": 300,
      "num_beams": 4,
      "prefix": "translate English to French: "
    },
    "translation_en_to_ro": {
      "early_stopping": true,
      "max_length": 300,
      "num_beams": 4,
      "prefix": "translate English to Romanian: "
    }
  },
  "torch_dtype": "float32",
  "transformers_version": "4.12.5",
  "use_cache": true,
  "vocab_size": 32100
}

[INFO|tokenization_utils_base.py:1671] 2022-02-17 17:36:52,677 >> Didn't find file out/tweet/t5_version_2\added_tokens.json. We won't load it.
[INFO|tokenization_utils_base.py:1740] 2022-02-17 17:36:52,677 >> loading file out/tweet/t5_version_2\spiece.model
[INFO|tokenization_utils_base.py:1740] 2022-02-17 17:36:52,677 >> loading file out/tweet/t5_version_2\tokenizer.json
[INFO|tokenization_utils_base.py:1740] 2022-02-17 17:36:52,677 >> loading file None
[INFO|tokenization_utils_base.py:1740] 2022-02-17 17:36:52,677 >> loading file out/tweet/t5_version_2\special_tokens_map.json
[INFO|tokenization_utils_base.py:1740] 2022-02-17 17:36:52,677 >> loading file out/tweet/t5_version_2\tokenizer_config.json
[INFO|modeling_utils.py:1349] 2022-02-17 17:36:52,771 >> loading weights file out/tweet/t5_version_2\pytorch_model.bin
[INFO|modeling_utils.py:1618] 2022-02-17 17:36:53,190 >> All model checkpoint weights were used when initializing T5ForConditionalGeneration.

[INFO|modeling_utils.py:1626] 2022-02-17 17:36:53,190 >> All the weights of T5ForConditionalGeneration were initialized from the model checkpoint at out/tweet/t5_version_2.
If your task is similar to the task the model of the checkpoint was trained on, you can already use T5ForConditionalGeneration for predictions without further training.

Running tokenizer on validation dataset:   0%|          | 0/1 [00:00<?, ?ba/s]
Running tokenizer on validation dataset: 100%|##########| 1/1 [00:00<00:00, 34.57ba/s]
[INFO|trainer.py:2243] 2022-02-17 17:36:55,016 >> ***** Running Evaluation *****
[INFO|trainer.py:2245] 2022-02-17 17:36:55,016 >>   Num examples = 500
[INFO|trainer.py:2248] 2022-02-17 17:36:55,016 >>   Batch size = 16

  0%|          | 0/32 [00:00<?, ?it/s]
100%|##########| 32/32 [00:13<00:00,  2.30it/s]
[INFO|modelcard.py:449] 2022-02-17 17:37:10,066 >> Dropping the following result as it does not have all the necessary fields:
{'task': {'name': 'Translation', 'type': 'translation'}}
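
The T5 run above casts classification as text-to-text "translation": run_translation.py reads JSON lines whose "translation" object is keyed by the --source_lang / --target_lang names ("text" and "label"), prefixes each source with the logged "tweet classification: " prefix, and scores the generated label strings. An eval_accuracy of 1.0 next to an eval_bleu of 0.0 is consistent with the references being one-word labels, for which BLEU's higher-order n-gram precisions are zero. Below is a minimal sketch of one such record and of generating a label from the checkpoint; the tweet text and the "1" label are illustrative, not taken from the data files.

import json
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# One JSON line per example, keyed by --source_lang / --target_lang ("text" / "label").
record = {"translation": {"text": "some tweet text", "label": "1"}}
print(json.dumps(record))

checkpoint = "out/tweet/t5_version_2"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)

# The prefix matches the one reported in the log above.
inputs = tokenizer("tweet classification: some tweet text",
                   max_length=256, truncation=True, return_tensors="pt")
output_ids = model.generate(**inputs, max_length=128)
print(tokenizer.decode(output_ids[0], skip_special_tokens=True))  # generated label string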