{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# GPT2: fine-tuning for sequence classification"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "pycharm": {
     "is_executing": true
    }
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "02/16/2022 00:13:42 - INFO - __main__ - Distributed environment: NO\n",
      "Num processes: 1\n",
      "Process index: 0\n",
      "Local process index: 0\n",
      "Device: cpu\n",
      "Use FP16 precision: False\n",
      "\n",
      "02/16/2022 00:13:43 - WARNING - datasets.builder - Using custom data configuration default-67c9d932a627b7b8\n",
      "02/16/2022 00:13:43 - WARNING - datasets.builder - Reusing dataset json (C:\\Users\\Foka\\.cache\\huggingface\\datasets\\json\\default-67c9d932a627b7b8\\0.0.0\\c2d554c3377ea79c7664b93dc65d0803b45e3279000f993c7bfd18937fd7f426)\n",
      "\n",
      "  0%|          | 0/3 [00:00<?, ?it/s]\n",
      "100%|##########| 3/3 [00:00<00:00, 1491.40it/s]\n",
      "loading configuration file https://huggingface.co/gpt2/resolve/main/config.json from cache at C:\\Users\\Foka/.cache\\huggingface\\transformers\\fc674cd6907b4c9e933cb42d67662436b89fa9540a1f40d7c919d0109289ad01.7d2e0efa5ca20cef4fb199382111e9d3ad96fd77b849e1d4bed13a66e1336f51\n",
      "Model config GPT2Config {\n",
      "  \"activation_function\": \"gelu_new\",\n",
      "  \"architectures\": [\n",
      "    \"GPT2LMHeadModel\"\n",
      "  ],\n",
      "  \"attn_pdrop\": 0.1,\n",
      "  \"bos_token_id\": 50256,\n",
      "  \"embd_pdrop\": 0.1,\n",
      "  \"eos_token_id\": 50256,\n",
      "  \"initializer_range\": 0.02,\n",
      "  \"layer_norm_epsilon\": 1e-05,\n",
      "  \"model_type\": \"gpt2\",\n",
      "  \"n_ctx\": 1024,\n",
      "  \"n_embd\": 768,\n",
      "  \"n_head\": 12,\n",
      "  \"n_inner\": null,\n",
      "  \"n_layer\": 12,\n",
      "  \"n_positions\": 1024,\n",
      "  \"reorder_and_upcast_attn\": false,\n",
      "  \"resid_pdrop\": 0.1,\n",
      "  \"scale_attn_by_inverse_layer_idx\": false,\n",
      "  \"scale_attn_weights\": true,\n",
      "  \"summary_activation\": null,\n",
      "  \"summary_first_dropout\": 0.1,\n",
      "  \"summary_proj_to_labels\": true,\n",
      "  \"summary_type\": \"cls_index\",\n",
      "  \"summary_use_proj\": true,\n",
      "  \"task_specific_params\": {\n",
      "    \"text-generation\": {\n",
      "      \"do_sample\": true,\n",
      "      \"max_length\": 50\n",
      "    }\n",
      "  },\n",
      "  \"transformers_version\": \"4.12.5\",\n",
      "  \"use_cache\": true,\n",
      "  \"vocab_size\": 50257\n",
      "}\n",
      "\n",
      "Could not locate the tokenizer configuration file, will try to use the model config instead.\n",
      "loading configuration file https://huggingface.co/gpt2/resolve/main/config.json from cache at C:\\Users\\Foka/.cache\\huggingface\\transformers\\fc674cd6907b4c9e933cb42d67662436b89fa9540a1f40d7c919d0109289ad01.7d2e0efa5ca20cef4fb199382111e9d3ad96fd77b849e1d4bed13a66e1336f51\n",
      "Model config GPT2Config {\n",
      "  \"activation_function\": \"gelu_new\",\n",
      "  \"architectures\": [\n",
      "    \"GPT2LMHeadModel\"\n",
      "  ],\n",
      "  \"attn_pdrop\": 0.1,\n",
      "  \"bos_token_id\": 50256,\n",
      "  \"embd_pdrop\": 0.1,\n",
      "  \"eos_token_id\": 50256,\n",
      "  \"initializer_range\": 0.02,\n",
      "  \"layer_norm_epsilon\": 1e-05,\n",
      "  \"model_type\": \"gpt2\",\n",
      "  \"n_ctx\": 1024,\n",
      "  \"n_embd\": 768,\n",
      "  \"n_head\": 12,\n",
      "  \"n_inner\": null,\n",
      "  \"n_layer\": 12,\n",
      "  \"n_positions\": 1024,\n",
      "  \"reorder_and_upcast_attn\": false,\n",
      "  \"resid_pdrop\": 0.1,\n",
      "  \"scale_attn_by_inverse_layer_idx\": false,\n",
      "  \"scale_attn_weights\": true,\n",
      "  \"summary_activation\": null,\n",
      "  \"summary_first_dropout\": 0.1,\n",
      "  \"summary_proj_to_labels\": true,\n",
      "  \"summary_type\": \"cls_index\",\n",
      "  \"summary_use_proj\": true,\n",
      "  \"task_specific_params\": {\n",
      "    \"text-generation\": {\n",
      "      \"do_sample\": true,\n",
      "      \"max_length\": 50\n",
      "    }\n",
      "  },\n",
      "  \"transformers_version\": \"4.12.5\",\n",
      "  \"use_cache\": true,\n",
      "  \"vocab_size\": 50257\n",
      "}\n",
      "\n",
      "loading file https://huggingface.co/gpt2/resolve/main/vocab.json from cache at C:\\Users\\Foka/.cache\\huggingface\\transformers\\684fe667923972fb57f6b4dcb61a3c92763ad89882f3da5da9866baf14f2d60f.c7ed1f96aac49e745788faa77ba0a26a392643a50bb388b9c04ff469e555241f\n",
      "loading file https://huggingface.co/gpt2/resolve/main/merges.txt from cache at C:\\Users\\Foka/.cache\\huggingface\\transformers\\c0c761a63004025aeadd530c4c27b860ec4ecbe8a00531233de21d865a402598.5d12962c5ee615a4c803841266e9c3be9a691a924f72d395d3a6c6c81157788b\n",
      "loading file https://huggingface.co/gpt2/resolve/main/tokenizer.json from cache at C:\\Users\\Foka/.cache\\huggingface\\transformers\\16a2f78023c8dc511294f0c97b5e10fde3ef9889ad6d11ffaa2a00714e73926e.cf2d0ecb83b6df91b3dbb53f1d1e4c311578bfd3aa0e04934215a49bf9898df0\n",
      "loading file https://huggingface.co/gpt2/resolve/main/added_tokens.json from cache at None\n",
      "loading file https://huggingface.co/gpt2/resolve/main/special_tokens_map.json from cache at None\n",
      "loading file https://huggingface.co/gpt2/resolve/main/tokenizer_config.json from cache at None\n",
      "loading configuration file https://huggingface.co/gpt2/resolve/main/config.json from cache at C:\\Users\\Foka/.cache\\huggingface\\transformers\\fc674cd6907b4c9e933cb42d67662436b89fa9540a1f40d7c919d0109289ad01.7d2e0efa5ca20cef4fb199382111e9d3ad96fd77b849e1d4bed13a66e1336f51\n",
      "Model config GPT2Config {\n",
      "  \"activation_function\": \"gelu_new\",\n",
      "  \"architectures\": [\n",
      "    \"GPT2LMHeadModel\"\n",
      "  ],\n",
      "  \"attn_pdrop\": 0.1,\n",
      "  \"bos_token_id\": 50256,\n",
      "  \"embd_pdrop\": 0.1,\n",
      "  \"eos_token_id\": 50256,\n",
      "  \"initializer_range\": 0.02,\n",
      "  \"layer_norm_epsilon\": 1e-05,\n",
      "  \"model_type\": \"gpt2\",\n",
      "  \"n_ctx\": 1024,\n",
      "  \"n_embd\": 768,\n",
      "  \"n_head\": 12,\n",
      "  \"n_inner\": null,\n",
      "  \"n_layer\": 12,\n",
      "  \"n_positions\": 1024,\n",
      "  \"reorder_and_upcast_attn\": false,\n",
      "  \"resid_pdrop\": 0.1,\n",
      "  \"scale_attn_by_inverse_layer_idx\": false,\n",
      "  \"scale_attn_weights\": true,\n",
      "  \"summary_activation\": null,\n",
      "  \"summary_first_dropout\": 0.1,\n",
      "  \"summary_proj_to_labels\": true,\n",
      "  \"summary_type\": \"cls_index\",\n",
      "  \"summary_use_proj\": true,\n",
      "  \"task_specific_params\": {\n",
      "    \"text-generation\": {\n",
      "      \"do_sample\": true,\n",
      "      \"max_length\": 50\n",
      "    }\n",
      "  },\n",
      "  \"transformers_version\": \"4.12.5\",\n",
      "  \"use_cache\": true,\n",
      "  \"vocab_size\": 50257\n",
      "}\n",
      "\n",
      "02/16/2022 00:13:48 - INFO - __main__ - Return hidden states from model: False\n",
      "02/16/2022 00:13:48 - INFO - __main__ - Using implementation from: AutoModelForSequenceClassification\n",
      "loading weights file https://huggingface.co/gpt2/resolve/main/pytorch_model.bin from cache at C:\\Users\\Foka/.cache\\huggingface\\transformers\\752929ace039baa8ef70fe21cdf9ab9445773d20e733cf693d667982e210837e.323c769945a351daa25546176f8208b3004b6f563438a7603e7932bae9025925\n",
      "All model checkpoint weights were used when initializing GPT2ForSequenceClassification.\n",
      "\n",
      "Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at gpt2 and are newly initialized: ['score.weight']\n",
      "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n",
      "Using pad_token, but it is not set yet.\n",
      "02/16/2022 00:13:50 - INFO - __main__ - Set PAD token to EOS: <|endoftext|>\n",
      "02/16/2022 00:13:50 - WARNING - datasets.arrow_dataset - Loading cached processed dataset at C:\\Users\\Foka\\.cache\\huggingface\\datasets\\json\\default-67c9d932a627b7b8\\0.0.0\\c2d554c3377ea79c7664b93dc65d0803b45e3279000f993c7bfd18937fd7f426\\cache-18c6f53370629db4.arrow\n",
      "02/16/2022 00:13:50 - WARNING - datasets.arrow_dataset - Loading cached processed dataset at C:\\Users\\Foka\\.cache\\huggingface\\datasets\\json\\default-67c9d932a627b7b8\\0.0.0\\c2d554c3377ea79c7664b93dc65d0803b45e3279000f993c7bfd18937fd7f426\\cache-da48038acf63cb08.arrow\n",
      "\n",
      "Running tokenizer on dataset:   0%|          | 0/1 [00:00<?, ?ba/s]\n",
      "Running tokenizer on dataset: 100%|##########| 1/1 [00:00<00:00, 55.70ba/s]\n",
      "02/16/2022 00:13:50 - INFO - __main__ - Sample 2755 of the training set: {'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'input_ids': [2435, 284, 651, 3772, 0, 340, 338, 264, 3658, 6184, 108, 126, 253, 126, 240, 126, 246, 220, 220, 220, 1303, 82, 3658, 1303, 10464, 437, 220], 'labels': 0}.\n",
      "02/16/2022 00:13:50 - INFO - __main__ - Sample 2054 of the training set: {'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'input_ids': [31, 7220, 2488, 7220, 220, 909, 1689, 1222, 696, 26, 8406, 268, 389, 262, 749, 1303, 17096, 11186, 220, 1893, 1222, 696, 26, 410, 79, 287, 2106, 13, 1303, 40954], 'labels': 1}.\n",
      "02/16/2022 00:13:50 - INFO - __main__ - Sample 551 of the training set: {'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'input_ids': [31, 7220, 523, 318, 340, 572, 605, 326, 1303, 73, 15515, 389, 8720, 220, 287, 262, 2951, 286, 262, 1303, 8019, 83, 446, 14568, 30, 220], 'labels': 1}.\n",
      "02/16/2022 00:13:51 - INFO - __main__ - ***** Running training *****\n",
      "02/16/2022 00:13:51 - INFO - __main__ -   Num examples = 4742\n",
      "02/16/2022 00:13:51 - INFO - __main__ -   Num Epochs = 1\n",
      "02/16/2022 00:13:51 - INFO - __main__ -   Instantaneous batch size per device = 24\n",
      "02/16/2022 00:13:51 - INFO - __main__ -   Total train batch size (w. parallel, distributed & accumulation) = 24\n",
      "02/16/2022 00:13:51 - INFO - __main__ -   Gradient Accumulation steps = 1\n",
      "02/16/2022 00:13:51 - INFO - __main__ -   Total optimization steps = 198\n",
      "\n",
      "  0%|          | 0/198 [00:00<?, ?it/s]\n",
      "  1%|          | 1/198 [00:03<10:40,  3.25s/it]\n",
      "  1%|1         | 2/198 [00:06<10:12,  3.12s/it]\n",
      "  2%|1         | 3/198 [00:10<11:42,  3.60s/it]\n",
      "  2%|2         | 4/198 [00:13<10:52,  3.37s/it]\n",
      "  3%|2         | 5/198 [00:17<11:54,  3.70s/it]\n",
      "  3%|3         | 6/198 [00:25<16:09,  5.05s/it]\n",
      "  4%|3         | 7/198 [00:30<16:15,  5.11s/it]\n",
      "  4%|4         | 8/198 [00:37<18:02,  5.70s/it]\n",
      "  5%|4         | 9/198 [00:42<17:15,  5.48s/it]\n",
      "  5%|5         | 10/198 [00:48<17:15,  5.51s/it]\n",
      "  6%|5         | 11/198 [00:51<15:20,  4.92s/it]\n",
      "  6%|6         | 12/198 [00:55<13:50,  4.47s/it]\n",
      "  7%|6         | 13/198 [01:00<14:10,  4.60s/it]\n",
      "  7%|7         | 14/198 [01:02<12:23,  4.04s/it]\n",
      "  8%|7         | 15/198 [01:10<15:20,  5.03s/it]\n",
      "  8%|8         | 16/198 [01:14<14:41,  4.84s/it]\n",
      "  9%|8         | 17/198 [01:18<13:32,  4.49s/it]\n",
      "  9%|9         | 18/198 [01:21<12:04,  4.03s/it]\n",
      " 10%|9         | 19/198 [01:24<11:16,  3.78s/it]\n",
      " 10%|#         | 20/198 [01:27<10:43,  3.61s/it]\n",
      " 11%|#         | 21/198 [01:31<11:11,  3.79s/it]\n",
      " 11%|#1        | 22/198 [01:35<10:44,  3.66s/it]\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      " 12%|#1        | 23/198 [01:40<11:44,  4.02s/it]\n",
      " 12%|#2        | 24/198 [01:44<11:37,  4.01s/it]\n",
      " 13%|#2        | 25/198 [01:47<10:46,  3.74s/it]\n",
      " 13%|#3        | 26/198 [01:51<11:16,  3.93s/it]\n",
      " 14%|#3        | 27/198 [01:55<11:30,  4.04s/it]\n",
      " 14%|#4        | 28/198 [02:00<11:42,  4.13s/it]\n",
      " 15%|#4        | 29/198 [02:03<10:55,  3.88s/it]\n",
      " 15%|#5        | 30/198 [02:07<10:44,  3.84s/it]\n",
      " 16%|#5        | 31/198 [02:10<10:06,  3.63s/it]\n",
      " 16%|#6        | 32/198 [02:13<10:00,  3.62s/it]\n",
      " 17%|#6        | 33/198 [02:17<09:35,  3.49s/it]\n",
      " 17%|#7        | 34/198 [02:21<10:11,  3.73s/it]\n",
      " 18%|#7        | 35/198 [02:25<10:17,  3.79s/it]\n",
      " 18%|#8        | 36/198 [02:28<09:29,  3.51s/it]\n",
      " 19%|#8        | 37/198 [02:33<11:12,  4.18s/it]\n",
      " 19%|#9        | 38/198 [02:36<10:13,  3.84s/it]\n",
      " 20%|#9        | 39/198 [02:40<10:02,  3.79s/it]\n",
      " 20%|##        | 40/198 [02:44<10:18,  3.92s/it]\n",
      " 21%|##        | 41/198 [02:48<09:38,  3.68s/it]\n",
      " 21%|##1       | 42/198 [02:52<10:11,  3.92s/it]\n",
      " 22%|##1       | 43/198 [02:58<11:44,  4.55s/it]\n",
      " 22%|##2       | 44/198 [03:02<11:02,  4.30s/it]\n",
      " 23%|##2       | 45/198 [03:06<11:16,  4.42s/it]\n",
      " 23%|##3       | 46/198 [03:09<10:02,  3.96s/it]\n",
      " 24%|##3       | 47/198 [03:13<09:44,  3.87s/it]\n",
      " 24%|##4       | 48/198 [03:16<08:55,  3.57s/it]\n",
      " 25%|##4       | 49/198 [03:21<09:54,  3.99s/it]\n",
      " 25%|##5       | 50/198 [03:28<12:26,  5.04s/it]\n",
      " 26%|##5       | 51/198 [03:32<11:09,  4.55s/it]\n",
      " 26%|##6       | 52/198 [03:35<10:14,  4.21s/it]\n",
      " 27%|##6       | 53/198 [03:39<09:42,  4.02s/it]\n",
      " 27%|##7       | 54/198 [03:46<11:52,  4.95s/it]\n",
      " 28%|##7       | 55/198 [03:49<10:34,  4.44s/it]\n",
      " 28%|##8       | 56/198 [03:51<09:02,  3.82s/it]\n",
      " 29%|##8       | 57/198 [03:56<09:16,  3.95s/it]\n",
      " 29%|##9       | 58/198 [03:59<08:56,  3.83s/it]\n",
      " 30%|##9       | 59/198 [04:02<08:02,  3.47s/it]\n",
      " 30%|###       | 60/198 [04:05<07:40,  3.34s/it]\n",
      " 31%|###       | 61/198 [04:12<10:15,  4.49s/it]\n",
      " 31%|###1      | 62/198 [04:14<08:45,  3.86s/it]\n",
      " 32%|###1      | 63/198 [04:19<08:55,  3.97s/it]\n",
      " 32%|###2      | 64/198 [04:23<09:05,  4.07s/it]\n",
      " 33%|###2      | 65/198 [04:27<09:05,  4.10s/it]\n",
      " 33%|###3      | 66/198 [04:31<09:04,  4.12s/it]\n",
      " 34%|###3      | 67/198 [04:34<08:15,  3.79s/it]\n",
      " 34%|###4      | 68/198 [04:37<07:34,  3.50s/it]\n",
      " 35%|###4      | 69/198 [04:44<09:48,  4.56s/it]\n",
      " 35%|###5      | 70/198 [04:47<08:53,  4.17s/it]\n",
      " 36%|###5      | 71/198 [04:52<08:49,  4.17s/it]\n",
      " 36%|###6      | 72/198 [04:56<08:46,  4.18s/it]\n",
      " 37%|###6      | 73/198 [04:59<08:01,  3.85s/it]\n",
      " 37%|###7      | 74/198 [05:02<07:26,  3.60s/it]\n",
      " 38%|###7      | 75/198 [05:08<08:49,  4.31s/it]\n",
      " 38%|###8      | 76/198 [05:12<08:36,  4.23s/it]\n",
      " 39%|###8      | 77/198 [05:16<08:33,  4.24s/it]\n",
      " 39%|###9      | 78/198 [05:20<08:29,  4.25s/it]\n",
      " 40%|###9      | 79/198 [05:23<07:37,  3.84s/it]\n",
      " 40%|####      | 80/198 [05:26<06:55,  3.52s/it]\n",
      " 41%|####      | 81/198 [05:30<07:07,  3.66s/it]\n",
      " 41%|####1     | 82/198 [05:33<06:32,  3.39s/it]\n",
      " 42%|####1     | 83/198 [05:37<06:40,  3.48s/it]\n",
      " 42%|####2     | 84/198 [05:41<07:02,  3.71s/it]\n",
      " 43%|####2     | 85/198 [05:44<06:27,  3.43s/it]\n",
      " 43%|####3     | 86/198 [05:48<06:54,  3.70s/it]\n",
      " 44%|####3     | 87/198 [05:52<07:09,  3.87s/it]\n",
      " 44%|####4     | 88/198 [05:55<06:29,  3.54s/it]\n",
      " 45%|####4     | 89/198 [06:01<07:48,  4.29s/it]\n",
      " 45%|####5     | 90/198 [06:04<06:50,  3.80s/it]\n",
      " 46%|####5     | 91/198 [06:06<06:05,  3.42s/it]\n",
      " 46%|####6     | 92/198 [06:09<05:42,  3.23s/it]\n",
      " 47%|####6     | 93/198 [06:13<05:59,  3.42s/it]\n",
      " 47%|####7     | 94/198 [06:16<05:45,  3.33s/it]\n",
      " 48%|####7     | 95/198 [06:18<05:16,  3.07s/it]\n",
      " 48%|####8     | 96/198 [06:22<05:13,  3.07s/it]\n",
      " 49%|####8     | 97/198 [06:25<05:12,  3.09s/it]\n",
      " 49%|####9     | 98/198 [06:28<05:03,  3.03s/it]\n",
      " 50%|#####     | 99/198 [06:31<05:00,  3.03s/it]\n",
      " 51%|#####     | 100/198 [06:33<04:53,  3.00s/it]\n",
      " 51%|#####1    | 101/198 [06:36<04:50,  2.99s/it]\n",
      " 52%|#####1    | 102/198 [06:41<05:21,  3.35s/it]\n",
      " 52%|#####2    | 103/198 [06:45<05:39,  3.57s/it]\n",
      " 53%|#####2    | 104/198 [06:48<05:28,  3.49s/it]\n",
      " 53%|#####3    | 105/198 [06:54<06:28,  4.18s/it]\n",
      " 54%|#####3    | 106/198 [06:56<05:42,  3.72s/it]\n",
      " 54%|#####4    | 107/198 [07:00<05:21,  3.53s/it]\n",
      " 55%|#####4    | 108/198 [07:02<04:57,  3.30s/it]\n",
      " 55%|#####5    | 109/198 [07:05<04:32,  3.06s/it]\n",
      " 56%|#####5    | 110/198 [07:08<04:23,  2.99s/it]\n",
      " 56%|#####6    | 111/198 [07:10<04:10,  2.88s/it]\n",
      " 57%|#####6    | 112/198 [07:13<04:00,  2.80s/it]\n",
      " 57%|#####7    | 113/198 [07:16<04:15,  3.01s/it]\n",
      " 58%|#####7    | 114/198 [07:20<04:23,  3.13s/it]\n",
      " 58%|#####8    | 115/198 [07:23<04:26,  3.21s/it]\n",
      " 59%|#####8    | 116/198 [07:26<04:18,  3.15s/it]\n",
      " 59%|#####9    | 117/198 [07:30<04:21,  3.22s/it]\n",
      " 60%|#####9    | 118/198 [07:37<05:52,  4.41s/it]\n",
      " 60%|######    | 119/198 [07:42<06:16,  4.76s/it]\n",
      " 61%|######    | 120/198 [07:47<05:58,  4.60s/it]\n",
      " 61%|######1   | 121/198 [07:49<05:07,  4.00s/it]\n",
      " 62%|######1   | 122/198 [07:52<04:34,  3.61s/it]\n",
      " 62%|######2   | 123/198 [07:55<04:14,  3.40s/it]\n",
      " 63%|######2   | 124/198 [07:57<03:55,  3.19s/it]\n",
      " 63%|######3   | 125/198 [08:02<04:13,  3.47s/it]\n",
      " 64%|######3   | 126/198 [08:05<04:17,  3.57s/it]\n",
      " 64%|######4   | 127/198 [08:10<04:27,  3.77s/it]\n",
      " 65%|######4   | 128/198 [08:12<04:02,  3.47s/it]\n",
      " 65%|######5   | 129/198 [08:17<04:24,  3.84s/it]\n",
      " 66%|######5   | 130/198 [08:21<04:28,  3.95s/it]\n",
      " 66%|######6   | 131/198 [08:24<03:52,  3.47s/it]\n",
      " 67%|######6   | 132/198 [08:27<03:40,  3.34s/it]\n",
      " 67%|######7   | 133/198 [08:31<03:58,  3.66s/it]\n",
      " 68%|######7   | 134/198 [08:36<04:11,  3.93s/it]\n",
      " 68%|######8   | 135/198 [08:38<03:45,  3.58s/it]\n",
      " 69%|######8   | 136/198 [08:41<03:26,  3.32s/it]\n",
      " 69%|######9   | 137/198 [08:45<03:32,  3.49s/it]\n",
      " 70%|######9   | 138/198 [08:49<03:43,  3.72s/it]\n",
      " 70%|#######   | 139/198 [08:53<03:37,  3.68s/it]\n",
      " 71%|#######   | 140/198 [08:57<03:38,  3.76s/it]\n",
      " 71%|#######1  | 141/198 [09:00<03:18,  3.49s/it]\n",
      " 72%|#######1  | 142/198 [09:03<03:07,  3.34s/it]\n",
      " 72%|#######2  | 143/198 [09:07<03:17,  3.59s/it]\n",
      " 73%|#######2  | 144/198 [09:10<03:03,  3.41s/it]\n",
      " 73%|#######3  | 145/198 [09:13<02:58,  3.37s/it]\n",
      " 74%|#######3  | 146/198 [09:17<03:07,  3.60s/it]\n",
      " 74%|#######4  | 147/198 [09:21<03:01,  3.56s/it]\n",
      " 75%|#######4  | 148/198 [09:25<03:13,  3.88s/it]\n",
      " 75%|#######5  | 149/198 [09:29<03:02,  3.72s/it]\n",
      " 76%|#######5  | 150/198 [09:33<03:05,  3.86s/it]\n",
      " 76%|#######6  | 151/198 [09:36<02:51,  3.65s/it]\n",
      " 77%|#######6  | 152/198 [09:40<02:51,  3.73s/it]\n",
      " 77%|#######7  | 153/198 [09:43<02:40,  3.56s/it]\n",
      " 78%|#######7  | 154/198 [09:46<02:27,  3.35s/it]\n",
      " 78%|#######8  | 155/198 [09:50<02:31,  3.51s/it]\n",
      " 79%|#######8  | 156/198 [09:53<02:19,  3.33s/it]\n",
      " 79%|#######9  | 157/198 [09:56<02:12,  3.24s/it]\n",
      " 80%|#######9  | 158/198 [09:58<02:02,  3.05s/it]\n",
      " 80%|########  | 159/198 [10:01<01:52,  2.89s/it]\n",
      " 81%|########  | 160/198 [10:04<01:46,  2.81s/it]\n",
      " 81%|########1 | 161/198 [10:08<01:59,  3.23s/it]\n",
      " 82%|########1 | 162/198 [10:11<02:00,  3.36s/it]\n",
      " 82%|########2 | 163/198 [10:15<01:56,  3.32s/it]\n",
      " 83%|########2 | 164/198 [10:19<02:01,  3.58s/it]\n",
      " 83%|########3 | 165/198 [10:23<01:59,  3.63s/it]\n",
      " 84%|########3 | 166/198 [10:27<02:00,  3.78s/it]\n",
      " 84%|########4 | 167/198 [10:31<01:58,  3.83s/it]\n",
      " 85%|########4 | 168/198 [10:38<02:23,  4.79s/it]\n",
      " 85%|########5 | 169/198 [10:41<02:05,  4.33s/it]\n",
      " 86%|########5 | 170/198 [10:43<01:44,  3.74s/it]\n",
      " 86%|########6 | 171/198 [10:46<01:31,  3.40s/it]\n",
      " 87%|########6 | 172/198 [10:50<01:35,  3.66s/it]\n",
      " 87%|########7 | 173/198 [10:54<01:35,  3.81s/it]\n",
      " 88%|########7 | 174/198 [10:59<01:36,  4.00s/it]\n",
      " 88%|########8 | 175/198 [11:02<01:24,  3.68s/it]\n",
      " 89%|########8 | 176/198 [11:06<01:26,  3.94s/it]\n",
      " 89%|########9 | 177/198 [11:10<01:21,  3.89s/it]\n",
      " 90%|########9 | 178/198 [11:14<01:16,  3.85s/it]\n",
      " 90%|######### | 179/198 [11:17<01:07,  3.56s/it]\n",
      " 91%|######### | 180/198 [11:20<01:00,  3.34s/it]\n",
      " 91%|#########1| 181/198 [11:22<00:54,  3.18s/it]\n",
      " 92%|#########1| 182/198 [11:27<00:55,  3.49s/it]\n",
      " 92%|#########2| 183/198 [11:30<00:50,  3.36s/it]\n",
      " 93%|#########2| 184/198 [11:34<00:50,  3.64s/it]\n",
      " 93%|#########3| 185/198 [11:39<00:53,  4.08s/it]\n",
      " 94%|#########3| 186/198 [11:42<00:43,  3.66s/it]\n",
      " 94%|#########4| 187/198 [11:46<00:41,  3.80s/it]\n",
      " 95%|#########4| 188/198 [11:50<00:38,  3.84s/it]\n",
      " 95%|#########5| 189/198 [11:52<00:31,  3.47s/it]\n",
      " 96%|#########5| 190/198 [11:55<00:26,  3.34s/it]\n",
      " 96%|#########6| 191/198 [11:59<00:24,  3.52s/it]\n",
      " 97%|#########6| 192/198 [12:02<00:19,  3.22s/it]\n",
      " 97%|#########7| 193/198 [12:05<00:16,  3.31s/it]\n",
      " 98%|#########7| 194/198 [12:10<00:15,  3.81s/it]\n",
      " 98%|#########8| 195/198 [12:17<00:13,  4.65s/it]\n",
      " 99%|#########8| 196/198 [12:20<00:08,  4.17s/it]\n",
      " 99%|#########9| 197/198 [12:23<00:03,  3.68s/it]\n",
      "100%|##########| 198/198 [12:25<00:00,  3.28s/it]02/16/2022 00:26:49 - INFO - __main__ - Epoch 0: {'accuracy': 0.884}\n",
      "02/16/2022 00:27:16 - INFO - __main__ - Test-set evaluation: {'accuracy': 0.864}\n",
      "Configuration saved in out/tweet/gpt2\\config.json\n",
      "Model weights saved in out/tweet/gpt2\\pytorch_model.bin\n",
      "tokenizer config file saved in out/tweet/gpt2\\tokenizer_config.json\n",
      "Special tokens file saved in out/tweet/gpt2\\special_tokens_map.json\n",
      "\n",
      "100%|##########| 198/198 [13:25<00:00,  4.07s/it]\n"
     ]
    }
   ],
   "source": [
    "# Fine-tune the `gpt2` checkpoint with run_glue_no_trainer.py on the local\n",
    "# train/valid/test JSON splits for one epoch; the config, model weights and\n",
    "# tokenizer files are saved under out/tweet/gpt2.\n",
    "# --seed fixes the random state so repeated runs are comparable\n",
    "# (assumes this copy of the script still accepts --seed; confirm with --help).\n",
    "!python run_glue_no_trainer.py \\\n",
    "  --model_name_or_path gpt2 \\\n",
    "  --train_file data/train.json \\\n",
    "  --validation_file data/valid.json \\\n",
    "  --test_file data/test.json \\\n",
    "  --per_device_train_batch_size 24 \\\n",
    "  --per_device_eval_batch_size 24 \\\n",
    "  --max_length 128 \\\n",
    "  --learning_rate 2e-5 \\\n",
    "  --num_train_epochs 1 \\\n",
    "  --seed 42 \\\n",
    "  --output_dir out/tweet/gpt2"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# GPT2 version 2 (frozen model weights)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "02/16/2022 00:27:21 - INFO - __main__ - Distributed environment: NO\n",
      "Num processes: 1\n",
      "Process index: 0\n",
      "Local process index: 0\n",
      "Device: cpu\n",
      "Use FP16 precision: False\n",
      "\n",
      "02/16/2022 00:27:22 - WARNING - datasets.builder - Using custom data configuration default-67c9d932a627b7b8\n",
      "02/16/2022 00:27:22 - WARNING - datasets.builder - Reusing dataset json (C:\\Users\\Foka\\.cache\\huggingface\\datasets\\json\\default-67c9d932a627b7b8\\0.0.0\\c2d554c3377ea79c7664b93dc65d0803b45e3279000f993c7bfd18937fd7f426)\n",
      "\n",
      "  0%|          | 0/3 [00:00<?, ?it/s]\n",
      "100%|##########| 3/3 [00:00<00:00, 176.25it/s]\n",
      "loading configuration file https://huggingface.co/gpt2/resolve/main/config.json from cache at C:\\Users\\Foka/.cache\\huggingface\\transformers\\fc674cd6907b4c9e933cb42d67662436b89fa9540a1f40d7c919d0109289ad01.7d2e0efa5ca20cef4fb199382111e9d3ad96fd77b849e1d4bed13a66e1336f51\n",
      "Model config GPT2Config {\n",
      "  \"activation_function\": \"gelu_new\",\n",
      "  \"architectures\": [\n",
      "    \"GPT2LMHeadModel\"\n",
      "  ],\n",
      "  \"attn_pdrop\": 0.1,\n",
      "  \"bos_token_id\": 50256,\n",
      "  \"embd_pdrop\": 0.1,\n",
      "  \"eos_token_id\": 50256,\n",
      "  \"initializer_range\": 0.02,\n",
      "  \"layer_norm_epsilon\": 1e-05,\n",
      "  \"model_type\": \"gpt2\",\n",
      "  \"n_ctx\": 1024,\n",
      "  \"n_embd\": 768,\n",
      "  \"n_head\": 12,\n",
      "  \"n_inner\": null,\n",
      "  \"n_layer\": 12,\n",
      "  \"n_positions\": 1024,\n",
      "  \"reorder_and_upcast_attn\": false,\n",
      "  \"resid_pdrop\": 0.1,\n",
      "  \"scale_attn_by_inverse_layer_idx\": false,\n",
      "  \"scale_attn_weights\": true,\n",
      "  \"summary_activation\": null,\n",
      "  \"summary_first_dropout\": 0.1,\n",
      "  \"summary_proj_to_labels\": true,\n",
      "  \"summary_type\": \"cls_index\",\n",
      "  \"summary_use_proj\": true,\n",
      "  \"task_specific_params\": {\n",
      "    \"text-generation\": {\n",
      "      \"do_sample\": true,\n",
      "      \"max_length\": 50\n",
      "    }\n",
      "  },\n",
      "  \"transformers_version\": \"4.12.5\",\n",
      "  \"use_cache\": true,\n",
      "  \"vocab_size\": 50257\n",
      "}\n",
      "\n",
      "Could not locate the tokenizer configuration file, will try to use the model config instead.\n",
      "loading configuration file https://huggingface.co/gpt2/resolve/main/config.json from cache at C:\\Users\\Foka/.cache\\huggingface\\transformers\\fc674cd6907b4c9e933cb42d67662436b89fa9540a1f40d7c919d0109289ad01.7d2e0efa5ca20cef4fb199382111e9d3ad96fd77b849e1d4bed13a66e1336f51\n",
      "Model config GPT2Config {\n",
      "  \"activation_function\": \"gelu_new\",\n",
      "  \"architectures\": [\n",
      "    \"GPT2LMHeadModel\"\n",
      "  ],\n",
      "  \"attn_pdrop\": 0.1,\n",
      "  \"bos_token_id\": 50256,\n",
      "  \"embd_pdrop\": 0.1,\n",
      "  \"eos_token_id\": 50256,\n",
      "  \"initializer_range\": 0.02,\n",
      "  \"layer_norm_epsilon\": 1e-05,\n",
      "  \"model_type\": \"gpt2\",\n",
      "  \"n_ctx\": 1024,\n",
      "  \"n_embd\": 768,\n",
      "  \"n_head\": 12,\n",
      "  \"n_inner\": null,\n",
      "  \"n_layer\": 12,\n",
      "  \"n_positions\": 1024,\n",
      "  \"reorder_and_upcast_attn\": false,\n",
      "  \"resid_pdrop\": 0.1,\n",
      "  \"scale_attn_by_inverse_layer_idx\": false,\n",
      "  \"scale_attn_weights\": true,\n",
      "  \"summary_activation\": null,\n",
      "  \"summary_first_dropout\": 0.1,\n",
      "  \"summary_proj_to_labels\": true,\n",
      "  \"summary_type\": \"cls_index\",\n",
      "  \"summary_use_proj\": true,\n",
      "  \"task_specific_params\": {\n",
      "    \"text-generation\": {\n",
      "      \"do_sample\": true,\n",
      "      \"max_length\": 50\n",
      "    }\n",
      "  },\n",
      "  \"transformers_version\": \"4.12.5\",\n",
      "  \"use_cache\": true,\n",
      "  \"vocab_size\": 50257\n",
      "}\n",
      "\n",
      "loading file https://huggingface.co/gpt2/resolve/main/vocab.json from cache at C:\\Users\\Foka/.cache\\huggingface\\transformers\\684fe667923972fb57f6b4dcb61a3c92763ad89882f3da5da9866baf14f2d60f.c7ed1f96aac49e745788faa77ba0a26a392643a50bb388b9c04ff469e555241f\n",
      "loading file https://huggingface.co/gpt2/resolve/main/merges.txt from cache at C:\\Users\\Foka/.cache\\huggingface\\transformers\\c0c761a63004025aeadd530c4c27b860ec4ecbe8a00531233de21d865a402598.5d12962c5ee615a4c803841266e9c3be9a691a924f72d395d3a6c6c81157788b\n",
      "loading file https://huggingface.co/gpt2/resolve/main/tokenizer.json from cache at C:\\Users\\Foka/.cache\\huggingface\\transformers\\16a2f78023c8dc511294f0c97b5e10fde3ef9889ad6d11ffaa2a00714e73926e.cf2d0ecb83b6df91b3dbb53f1d1e4c311578bfd3aa0e04934215a49bf9898df0\n",
      "loading file https://huggingface.co/gpt2/resolve/main/added_tokens.json from cache at None\n",
      "loading file https://huggingface.co/gpt2/resolve/main/special_tokens_map.json from cache at None\n",
      "loading file https://huggingface.co/gpt2/resolve/main/tokenizer_config.json from cache at None\n",
      "loading configuration file https://huggingface.co/gpt2/resolve/main/config.json from cache at C:\\Users\\Foka/.cache\\huggingface\\transformers\\fc674cd6907b4c9e933cb42d67662436b89fa9540a1f40d7c919d0109289ad01.7d2e0efa5ca20cef4fb199382111e9d3ad96fd77b849e1d4bed13a66e1336f51\n",
      "Model config GPT2Config {\n",
      "  \"activation_function\": \"gelu_new\",\n",
      "  \"architectures\": [\n",
      "    \"GPT2LMHeadModel\"\n",
      "  ],\n",
      "  \"attn_pdrop\": 0.1,\n",
      "  \"bos_token_id\": 50256,\n",
      "  \"embd_pdrop\": 0.1,\n",
      "  \"eos_token_id\": 50256,\n",
      "  \"initializer_range\": 0.02,\n",
      "  \"layer_norm_epsilon\": 1e-05,\n",
      "  \"model_type\": \"gpt2\",\n",
      "  \"n_ctx\": 1024,\n",
      "  \"n_embd\": 768,\n",
      "  \"n_head\": 12,\n",
      "  \"n_inner\": null,\n",
      "  \"n_layer\": 12,\n",
      "  \"n_positions\": 1024,\n",
      "  \"reorder_and_upcast_attn\": false,\n",
      "  \"resid_pdrop\": 0.1,\n",
      "  \"scale_attn_by_inverse_layer_idx\": false,\n",
      "  \"scale_attn_weights\": true,\n",
      "  \"summary_activation\": null,\n",
      "  \"summary_first_dropout\": 0.1,\n",
      "  \"summary_proj_to_labels\": true,\n",
      "  \"summary_type\": \"cls_index\",\n",
      "  \"summary_use_proj\": true,\n",
      "  \"task_specific_params\": {\n",
      "    \"text-generation\": {\n",
      "      \"do_sample\": true,\n",
      "      \"max_length\": 50\n",
      "    }\n",
      "  },\n",
      "  \"transformers_version\": \"4.12.5\",\n",
      "  \"use_cache\": true,\n",
      "  \"vocab_size\": 50257\n",
      "}\n",
      "\n",
      "02/16/2022 00:27:28 - INFO - __main__ - Return hidden states from model: False\n",
      "02/16/2022 00:27:28 - INFO - __main__ - Using implementation from: AutoModelForSequenceClassification\n",
      "loading weights file https://huggingface.co/gpt2/resolve/main/pytorch_model.bin from cache at C:\\Users\\Foka/.cache\\huggingface\\transformers\\752929ace039baa8ef70fe21cdf9ab9445773d20e733cf693d667982e210837e.323c769945a351daa25546176f8208b3004b6f563438a7603e7932bae9025925\n",
      "All model checkpoint weights were used when initializing GPT2ForSequenceClassification.\n",
      "\n",
      "Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at gpt2 and are newly initialized: ['score.weight']\n",
      "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n",
      "02/16/2022 00:27:29 - INFO - __main__ - Freezing model weights\n",
      "Using pad_token, but it is not set yet.\n",
      "02/16/2022 00:27:29 - INFO - __main__ - Set PAD token to EOS: <|endoftext|>\n",
      "02/16/2022 00:27:29 - WARNING - datasets.arrow_dataset - Loading cached processed dataset at C:\\Users\\Foka\\.cache\\huggingface\\datasets\\json\\default-67c9d932a627b7b8\\0.0.0\\c2d554c3377ea79c7664b93dc65d0803b45e3279000f993c7bfd18937fd7f426\\cache-ba0dca0006a47e01.arrow\n",
      "\n",
      "Running tokenizer on dataset:   0%|          | 0/1 [00:00<?, ?ba/s]\n",
      "Running tokenizer on dataset: 100%|##########| 1/1 [00:00<00:00, 71.63ba/s]\n",
      "02/16/2022 00:27:29 - WARNING - datasets.arrow_dataset - Loading cached processed dataset at C:\\Users\\Foka\\.cache\\huggingface\\datasets\\json\\default-67c9d932a627b7b8\\0.0.0\\c2d554c3377ea79c7664b93dc65d0803b45e3279000f993c7bfd18937fd7f426\\cache-d41f6257e87d100c.arrow\n",
      "02/16/2022 00:27:29 - INFO - __main__ - Sample 826 of the training set: {'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'input_ids': [22940, 126, 222, 126, 250, 732, 262, 661, 22940, 126, 222, 126, 251, 6198, 4001, 6184, 95, 126, 222, 126, 250, 732, 262, 2330, 11, 1956, 19216, 10835, 13, 22940, 126, 222, 126, 251, 220, 220, 220, 220, 220, 6184, 95, 126, 222, 126, 99, 1303, 5304, 259, 19, 10879, 22940, 126, 222, 126, 99, 220], 'labels': 1}.\n",
      "02/16/2022 00:27:29 - INFO - __main__ - Sample 521 of the training set: {'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'input_ids': [31, 7220, 8425, 31582, 416, 2488, 7220, 287, 269, 30520, 13, 884, 23374, 986, 220, 220], 'labels': 1}.\n",
      "02/16/2022 00:27:29 - INFO - __main__ - Sample 2806 of the training set: {'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'input_ids': [4623, 68, 4964, 2168, 352, 286, 1303, 1169, 43764, 523, 355, 284, 3190, 3368, 4346, 13, 220, 220], 'labels': 0}.\n",
      "02/16/2022 00:27:30 - INFO - __main__ - ***** Running training *****\n",
      "02/16/2022 00:27:30 - INFO - __main__ -   Num examples = 4742\n",
      "02/16/2022 00:27:30 - INFO - __main__ -   Num Epochs = 1\n",
      "02/16/2022 00:27:30 - INFO - __main__ -   Instantaneous batch size per device = 24\n",
      "02/16/2022 00:27:30 - INFO - __main__ -   Total train batch size (w. parallel, distributed & accumulation) = 24\n",
      "02/16/2022 00:27:30 - INFO - __main__ -   Gradient Accumulation steps = 1\n",
      "02/16/2022 00:27:30 - INFO - __main__ -   Total optimization steps = 198\n",
      "\n",
      "  0%|          | 0/198 [00:00<?, ?it/s]\n",
      "  1%|          | 1/198 [00:01<05:14,  1.59s/it]\n",
      "  1%|1         | 2/198 [00:02<04:21,  1.33s/it]\n",
      "  2%|1         | 3/198 [00:03<03:49,  1.18s/it]\n",
      "  2%|2         | 4/198 [00:05<04:26,  1.38s/it]\n",
      "  3%|2         | 5/198 [00:07<05:00,  1.56s/it]\n",
      "  3%|3         | 6/198 [00:08<04:27,  1.39s/it]\n",
      "  4%|3         | 7/198 [00:10<04:50,  1.52s/it]\n",
      "  4%|4         | 8/198 [00:11<04:25,  1.40s/it]\n",
      "  5%|4         | 9/198 [00:12<04:34,  1.45s/it]\n",
      "  5%|5         | 10/198 [00:14<04:35,  1.46s/it]\n",
      "  6%|5         | 11/198 [00:15<04:21,  1.40s/it]\n",
      "  6%|6         | 12/198 [00:16<03:55,  1.27s/it]\n",
      "  7%|6         | 13/198 [00:18<04:08,  1.34s/it]\n",
      "  7%|7         | 14/198 [00:20<05:34,  1.82s/it]\n",
      "  8%|7         | 15/198 [00:22<05:06,  1.67s/it]\n",
      "  8%|8         | 16/198 [00:23<04:25,  1.46s/it]\n",
      "  9%|8         | 17/198 [00:24<04:00,  1.33s/it]\n",
      "  9%|9         | 18/198 [00:27<05:22,  1.79s/it]\n",
      " 10%|9         | 19/198 [00:28<05:07,  1.72s/it]\n",
      " 10%|#         | 20/198 [00:29<04:24,  1.49s/it]\n",
      " 11%|#         | 21/198 [00:32<05:15,  1.78s/it]\n",
      " 11%|#1        | 22/198 [00:33<05:00,  1.71s/it]\n",
      " 12%|#1        | 23/198 [00:34<04:19,  1.48s/it]\n",
      " 12%|#2        | 24/198 [00:35<03:58,  1.37s/it]\n",
      " 13%|#2        | 25/198 [00:36<03:40,  1.27s/it]\n",
      " 13%|#3        | 26/198 [00:38<03:52,  1.35s/it]\n",
      " 14%|#3        | 27/198 [00:39<03:49,  1.34s/it]\n",
      " 14%|#4        | 28/198 [00:40<03:28,  1.22s/it]\n",
      " 15%|#4        | 29/198 [00:42<03:41,  1.31s/it]\n",
      " 15%|#5        | 30/198 [00:42<03:18,  1.18s/it]\n",
      " 16%|#5        | 31/198 [00:44<03:43,  1.34s/it]\n",
      " 16%|#6        | 32/198 [00:46<03:45,  1.36s/it]\n",
      " 17%|#6        | 33/198 [00:47<03:49,  1.39s/it]\n",
      " 17%|#7        | 34/198 [00:48<03:34,  1.31s/it]\n",
      " 18%|#7        | 35/198 [00:49<03:23,  1.25s/it]\n",
      " 18%|#8        | 36/198 [00:50<03:11,  1.18s/it]\n",
      " 19%|#8        | 37/198 [00:52<03:30,  1.31s/it]\n",
      " 19%|#9        | 38/198 [00:53<03:12,  1.20s/it]\n",
      " 20%|#9        | 39/198 [00:54<03:02,  1.15s/it]\n",
      " 20%|##        | 40/198 [00:56<03:29,  1.33s/it]\n",
      " 21%|##        | 41/198 [00:57<03:17,  1.26s/it]\n",
      " 21%|##1       | 42/198 [00:58<03:29,  1.35s/it]\n",
      " 22%|##1       | 43/198 [00:59<03:08,  1.22s/it]\n",
      " 22%|##2       | 44/198 [01:00<03:03,  1.19s/it]\n",
      " 23%|##2       | 45/198 [01:02<03:14,  1.27s/it]\n",
      " 23%|##3       | 46/198 [01:03<03:26,  1.36s/it]\n",
      " 24%|##3       | 47/198 [01:04<03:07,  1.24s/it]\n",
      " 24%|##4       | 48/198 [01:06<03:18,  1.32s/it]\n",
      " 25%|##4       | 49/198 [01:07<02:58,  1.19s/it]\n",
      " 25%|##5       | 50/198 [01:08<02:46,  1.12s/it]\n",
      " 26%|##5       | 51/198 [01:09<03:04,  1.26s/it]\n",
      " 26%|##6       | 52/198 [01:11<03:16,  1.34s/it]\n",
      " 27%|##6       | 53/198 [01:12<02:55,  1.21s/it]\n",
      " 27%|##7       | 54/198 [01:13<03:07,  1.30s/it]\n",
      " 28%|##7       | 55/198 [01:14<03:00,  1.26s/it]\n",
      " 28%|##8       | 56/198 [01:15<02:40,  1.13s/it]\n",
      " 29%|##8       | 57/198 [01:17<03:10,  1.35s/it]\n",
      " 29%|##9       | 58/198 [01:18<03:02,  1.30s/it]\n",
      " 30%|##9       | 59/198 [01:20<03:09,  1.37s/it]\n",
      " 30%|###       | 60/198 [01:21<02:45,  1.20s/it]\n",
      " 31%|###       | 61/198 [01:22<02:40,  1.17s/it]\n",
      " 31%|###1      | 62/198 [01:23<02:41,  1.18s/it]\n",
      " 32%|###1      | 63/198 [01:24<02:54,  1.29s/it]\n",
      " 32%|###2      | 64/198 [01:26<02:48,  1.26s/it]\n",
      " 33%|###2      | 65/198 [01:27<02:56,  1.33s/it]\n",
      " 33%|###3      | 66/198 [01:29<03:03,  1.39s/it]\n",
      " 34%|###3      | 67/198 [01:30<03:10,  1.45s/it]\n",
      " 34%|###4      | 68/198 [01:33<03:44,  1.73s/it]\n",
      " 35%|###4      | 69/198 [01:34<03:16,  1.52s/it]\n",
      " 35%|###5      | 70/198 [01:35<03:16,  1.53s/it]\n",
      " 36%|###5      | 71/198 [01:36<02:53,  1.37s/it]\n",
      " 36%|###6      | 72/198 [01:38<03:00,  1.43s/it]\n",
      " 37%|###6      | 73/198 [01:39<02:58,  1.43s/it]\n",
      " 37%|###7      | 74/198 [01:41<02:59,  1.45s/it]\n",
      " 38%|###7      | 75/198 [01:42<02:45,  1.34s/it]\n",
      " 38%|###8      | 76/198 [01:43<02:35,  1.28s/it]\n",
      " 39%|###8      | 77/198 [01:44<02:40,  1.33s/it]\n",
      " 39%|###9      | 78/198 [01:46<02:32,  1.27s/it]\n",
      " 40%|###9      | 79/198 [01:47<02:31,  1.27s/it]\n",
      " 40%|####      | 80/198 [01:48<02:28,  1.26s/it]\n",
      " 41%|####      | 81/198 [01:49<02:18,  1.19s/it]\n",
      " 41%|####1     | 82/198 [01:52<03:16,  1.69s/it]\n",
      " 42%|####1     | 83/198 [01:53<03:10,  1.65s/it]\n",
      " 42%|####2     | 84/198 [01:55<02:49,  1.49s/it]\n",
      " 43%|####2     | 85/198 [01:56<02:56,  1.56s/it]\n",
      " 43%|####3     | 86/198 [01:57<02:29,  1.34s/it]\n",
      " 44%|####3     | 87/198 [01:58<02:24,  1.30s/it]\n",
      " 44%|####4     | 88/198 [01:59<02:14,  1.23s/it]\n",
      " 45%|####4     | 89/198 [02:01<02:13,  1.22s/it]\n",
      " 45%|####5     | 90/198 [02:02<02:14,  1.24s/it]\n",
      " 46%|####5     | 91/198 [02:03<02:09,  1.21s/it]\n",
      " 46%|####6     | 92/198 [02:04<01:59,  1.13s/it]\n",
      " 47%|####6     | 93/198 [02:05<01:53,  1.08s/it]\n",
      " 47%|####7     | 94/198 [02:06<01:53,  1.09s/it]\n",
      " 48%|####7     | 95/198 [02:07<01:45,  1.02s/it]\n",
      " 48%|####8     | 96/198 [02:08<01:59,  1.17s/it]\n",
      " 49%|####8     | 97/198 [02:09<01:53,  1.12s/it]\n",
      " 49%|####9     | 98/198 [02:11<02:14,  1.35s/it]\n",
      " 50%|#####     | 99/198 [02:13<02:13,  1.35s/it]\n",
      " 51%|#####     | 100/198 [02:15<02:51,  1.75s/it]\n",
      " 51%|#####1    | 101/198 [02:18<03:02,  1.88s/it]\n",
      " 52%|#####1    | 102/198 [02:18<02:33,  1.60s/it]\n",
      " 52%|#####2    | 103/198 [02:19<02:09,  1.36s/it]\n",
      " 53%|#####2    | 104/198 [02:20<01:59,  1.27s/it]\n",
      " 53%|#####3    | 105/198 [02:22<02:07,  1.37s/it]\n",
      " 54%|#####3    | 106/198 [02:23<02:07,  1.38s/it]\n",
      " 54%|#####4    | 107/198 [02:25<02:06,  1.39s/it]\n",
      " 55%|#####4    | 108/198 [02:26<02:09,  1.43s/it]\n",
      " 55%|#####5    | 109/198 [02:29<02:47,  1.88s/it]\n",
      " 56%|#####5    | 110/198 [02:30<02:28,  1.69s/it]\n",
      " 56%|#####6    | 111/198 [02:31<02:07,  1.47s/it]\n",
      " 57%|#####6    | 112/198 [02:33<02:06,  1.47s/it]\n",
      " 57%|#####7    | 113/198 [02:34<01:51,  1.31s/it]\n",
      " 58%|#####7    | 114/198 [02:35<01:56,  1.39s/it]\n",
      " 58%|#####8    | 115/198 [02:36<01:46,  1.29s/it]\n",
      " 59%|#####8    | 116/198 [02:38<01:46,  1.29s/it]\n",
      " 59%|#####9    | 117/198 [02:39<01:39,  1.23s/it]\n",
      " 60%|#####9    | 118/198 [02:40<01:39,  1.25s/it]\n",
      " 60%|######    | 119/198 [02:41<01:30,  1.15s/it]\n",
      " 61%|######    | 120/198 [02:42<01:26,  1.11s/it]\n",
      " 61%|######1   | 121/198 [02:43<01:31,  1.19s/it]\n",
      " 62%|######1   | 122/198 [02:45<01:27,  1.15s/it]\n",
      " 62%|######2   | 123/198 [02:46<01:31,  1.22s/it]\n",
      " 63%|######2   | 124/198 [02:47<01:29,  1.21s/it]\n",
      " 63%|######3   | 125/198 [02:49<01:37,  1.33s/it]\n",
      " 64%|######3   | 126/198 [02:50<01:30,  1.25s/it]\n",
      " 64%|######4   | 127/198 [02:52<01:39,  1.41s/it]\n",
      " 65%|######4   | 128/198 [02:53<01:39,  1.42s/it]\n",
      " 65%|######5   | 129/198 [02:54<01:32,  1.34s/it]\n",
      " 66%|######5   | 130/198 [02:55<01:25,  1.25s/it]\n",
      " 66%|######6   | 131/198 [02:58<01:52,  1.68s/it]\n",
      " 67%|######6   | 132/198 [02:59<01:37,  1.47s/it]\n",
      " 67%|######7   | 133/198 [03:01<01:41,  1.56s/it]\n",
      " 68%|######7   | 134/198 [03:02<01:36,  1.50s/it]\n",
      " 68%|######8   | 135/198 [03:03<01:28,  1.41s/it]\n",
      " 69%|######8   | 136/198 [03:05<01:30,  1.47s/it]\n",
      " 69%|######9   | 137/198 [03:06<01:17,  1.26s/it]\n",
      " 70%|######9   | 138/198 [03:07<01:15,  1.27s/it]\n",
      " 70%|#######   | 139/198 [03:08<01:06,  1.13s/it]\n",
      " 71%|#######   | 140/198 [03:09<01:10,  1.21s/it]\n",
      " 71%|#######1  | 141/198 [03:11<01:28,  1.55s/it]\n",
      " 72%|#######1  | 142/198 [03:13<01:31,  1.63s/it]\n",
      " 72%|#######2  | 143/198 [03:15<01:26,  1.58s/it]\n",
      " 73%|#######2  | 144/198 [03:15<01:12,  1.35s/it]\n",
      " 73%|#######3  | 145/198 [03:16<01:03,  1.19s/it]\n",
      " 74%|#######3  | 146/198 [03:17<00:59,  1.15s/it]\n",
      " 74%|#######4  | 147/198 [03:18<00:56,  1.11s/it]\n",
      " 75%|#######4  | 148/198 [03:19<00:53,  1.07s/it]\n",
      " 75%|#######5  | 149/198 [03:21<00:59,  1.21s/it]\n",
      " 76%|#######5  | 150/198 [03:22<00:54,  1.14s/it]\n",
      " 76%|#######6  | 151/198 [03:23<00:50,  1.08s/it]\n",
      " 77%|#######6  | 152/198 [03:25<01:00,  1.32s/it]\n",
      " 77%|#######7  | 153/198 [03:26<01:02,  1.38s/it]\n",
      " 78%|#######7  | 154/198 [03:27<00:55,  1.27s/it]\n",
      " 78%|#######8  | 155/198 [03:29<00:55,  1.29s/it]\n",
      " 79%|#######8  | 156/198 [03:30<00:56,  1.35s/it]\n",
      " 79%|#######9  | 157/198 [03:31<00:49,  1.22s/it]\n",
      " 80%|#######9  | 158/198 [03:33<00:59,  1.49s/it]\n",
      " 80%|########  | 159/198 [03:34<00:52,  1.34s/it]\n",
      " 81%|########  | 160/198 [03:35<00:51,  1.36s/it]\n",
      " 81%|########1 | 161/198 [03:37<00:47,  1.27s/it]\n",
      " 82%|########1 | 162/198 [03:37<00:41,  1.15s/it]\n",
      " 82%|########2 | 163/198 [03:38<00:36,  1.06s/it]\n",
      " 83%|########2 | 164/198 [03:40<00:40,  1.20s/it]\n",
      " 83%|########3 | 165/198 [03:41<00:42,  1.30s/it]\n",
      " 84%|########3 | 166/198 [03:44<00:54,  1.69s/it]\n",
      " 84%|########4 | 167/198 [03:47<01:03,  2.06s/it]\n",
      " 85%|########4 | 168/198 [03:48<00:54,  1.81s/it]\n",
      " 85%|########5 | 169/198 [03:50<00:49,  1.70s/it]\n",
      " 86%|########5 | 170/198 [03:51<00:41,  1.49s/it]\n",
      " 86%|########6 | 171/198 [03:52<00:37,  1.37s/it]\n",
      " 87%|########6 | 172/198 [03:52<00:30,  1.18s/it]\n",
      " 87%|########7 | 173/198 [03:53<00:27,  1.09s/it]\n",
      " 88%|########7 | 174/198 [03:54<00:24,  1.03s/it]\n",
      " 88%|########8 | 175/198 [03:56<00:26,  1.15s/it]\n",
      " 89%|########8 | 176/198 [03:58<00:36,  1.67s/it]\n",
      " 89%|########9 | 177/198 [04:00<00:32,  1.56s/it]\n",
      " 90%|########9 | 178/198 [04:01<00:27,  1.38s/it]\n",
      " 90%|######### | 179/198 [04:02<00:23,  1.24s/it]\n",
      " 91%|######### | 180/198 [04:03<00:21,  1.18s/it]\n",
      " 91%|#########1| 181/198 [04:04<00:19,  1.12s/it]\n",
      " 92%|#########1| 182/198 [04:05<00:20,  1.26s/it]\n",
      " 92%|#########2| 183/198 [04:07<00:19,  1.28s/it]\n",
      " 93%|#########2| 184/198 [04:09<00:21,  1.54s/it]\n",
      " 93%|#########3| 185/198 [04:10<00:20,  1.54s/it]\n",
      " 94%|#########3| 186/198 [04:11<00:16,  1.42s/it]\n",
      " 94%|#########4| 187/198 [04:12<00:14,  1.31s/it]\n",
      " 95%|#########4| 188/198 [04:14<00:12,  1.29s/it]\n",
      " 95%|#########5| 189/198 [04:15<00:10,  1.15s/it]\n",
      " 96%|#########5| 190/198 [04:16<00:09,  1.25s/it]\n",
      " 96%|#########6| 191/198 [04:18<00:09,  1.34s/it]\n",
      " 97%|#########6| 192/198 [04:18<00:06,  1.17s/it]\n",
      " 97%|#########7| 193/198 [04:21<00:07,  1.51s/it]\n",
      " 98%|#########7| 194/198 [04:22<00:05,  1.39s/it]\n",
      " 98%|#########8| 195/198 [04:23<00:03,  1.31s/it]\n",
      " 99%|#########8| 196/198 [04:24<00:02,  1.17s/it]\n",
      " 99%|#########9| 197/198 [04:25<00:01,  1.10s/it]\n",
      "100%|##########| 198/198 [04:26<00:00,  1.09s/it]02/16/2022 00:32:30 - INFO - __main__ - Epoch 0: {'accuracy': 0.846}\n",
      "02/16/2022 00:32:57 - INFO - __main__ - Test-set evaluation: {'accuracy': 0.904}\n",
      "Configuration saved in out/tweet/gpt2_version_2\\config.json\n",
      "Model weights saved in out/tweet/gpt2_version_2\\pytorch_model.bin\n",
      "tokenizer config file saved in out/tweet/gpt2_version_2\\tokenizer_config.json\n",
      "Special tokens file saved in out/tweet/gpt2_version_2\\special_tokens_map.json\n",
      "\n",
      "100%|##########| 198/198 [05:27<00:00,  1.65s/it]\n"
     ]
    }
   ],
   "source": [
    "!python run_glue_no_trainer.py \\\n",
    "  --model_name_or_path gpt2 \\\n",
    "  --train_file data/train.json  \\\n",
    "  --validation_file data/valid.json \\\n",
    "  --test_file data/test.json \\\n",
    "  --per_device_train_batch_size 24 \\\n",
    "  --per_device_eval_batch_size 24 \\\n",
    "  --max_length 128 \\\n",
    "  --freeze_model \\\n",
    "  --learning_rate 2e-5 \\\n",
    "  --num_train_epochs 1 \\\n",
    "  --output_dir out/tweet/gpt2_version_2"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# GPT2 version 3 "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "02/16/2022 00:33:00 - INFO - __main__ - Distributed environment: NO\n",
      "Num processes: 1\n",
      "Process index: 0\n",
      "Local process index: 0\n",
      "Device: cpu\n",
      "Use FP16 precision: False\n",
      "\n",
      "02/16/2022 00:33:00 - WARNING - datasets.builder - Using custom data configuration default-67c9d932a627b7b8\n",
      "02/16/2022 00:33:00 - WARNING - datasets.builder - Reusing dataset json (C:\\Users\\Foka\\.cache\\huggingface\\datasets\\json\\default-67c9d932a627b7b8\\0.0.0\\c2d554c3377ea79c7664b93dc65d0803b45e3279000f993c7bfd18937fd7f426)\n",
      "\n",
      "  0%|          | 0/3 [00:00<?, ?it/s]\n",
      "100%|##########| 3/3 [00:00<00:00, 1504.23it/s]\n",
      "loading configuration file https://huggingface.co/gpt2/resolve/main/config.json from cache at C:\\Users\\Foka/.cache\\huggingface\\transformers\\fc674cd6907b4c9e933cb42d67662436b89fa9540a1f40d7c919d0109289ad01.7d2e0efa5ca20cef4fb199382111e9d3ad96fd77b849e1d4bed13a66e1336f51\n",
      "Model config GPT2Config {\n",
      "  \"activation_function\": \"gelu_new\",\n",
      "  \"architectures\": [\n",
      "    \"GPT2LMHeadModel\"\n",
      "  ],\n",
      "  \"attn_pdrop\": 0.1,\n",
      "  \"bos_token_id\": 50256,\n",
      "  \"embd_pdrop\": 0.1,\n",
      "  \"eos_token_id\": 50256,\n",
      "  \"initializer_range\": 0.02,\n",
      "  \"layer_norm_epsilon\": 1e-05,\n",
      "  \"model_type\": \"gpt2\",\n",
      "  \"n_ctx\": 1024,\n",
      "  \"n_embd\": 768,\n",
      "  \"n_head\": 12,\n",
      "  \"n_inner\": null,\n",
      "  \"n_layer\": 12,\n",
      "  \"n_positions\": 1024,\n",
      "  \"reorder_and_upcast_attn\": false,\n",
      "  \"resid_pdrop\": 0.1,\n",
      "  \"scale_attn_by_inverse_layer_idx\": false,\n",
      "  \"scale_attn_weights\": true,\n",
      "  \"summary_activation\": null,\n",
      "  \"summary_first_dropout\": 0.1,\n",
      "  \"summary_proj_to_labels\": true,\n",
      "  \"summary_type\": \"cls_index\",\n",
      "  \"summary_use_proj\": true,\n",
      "  \"task_specific_params\": {\n",
      "    \"text-generation\": {\n",
      "      \"do_sample\": true,\n",
      "      \"max_length\": 50\n",
      "    }\n",
      "  },\n",
      "  \"transformers_version\": \"4.12.5\",\n",
      "  \"use_cache\": true,\n",
      "  \"vocab_size\": 50257\n",
      "}\n",
      "\n",
      "Could not locate the tokenizer configuration file, will try to use the model config instead.\n",
      "loading configuration file https://huggingface.co/gpt2/resolve/main/config.json from cache at C:\\Users\\Foka/.cache\\huggingface\\transformers\\fc674cd6907b4c9e933cb42d67662436b89fa9540a1f40d7c919d0109289ad01.7d2e0efa5ca20cef4fb199382111e9d3ad96fd77b849e1d4bed13a66e1336f51\n",
      "Model config GPT2Config {\n",
      "  \"activation_function\": \"gelu_new\",\n",
      "  \"architectures\": [\n",
      "    \"GPT2LMHeadModel\"\n",
      "  ],\n",
      "  \"attn_pdrop\": 0.1,\n",
      "  \"bos_token_id\": 50256,\n",
      "  \"embd_pdrop\": 0.1,\n",
      "  \"eos_token_id\": 50256,\n",
      "  \"initializer_range\": 0.02,\n",
      "  \"layer_norm_epsilon\": 1e-05,\n",
      "  \"model_type\": \"gpt2\",\n",
      "  \"n_ctx\": 1024,\n",
      "  \"n_embd\": 768,\n",
      "  \"n_head\": 12,\n",
      "  \"n_inner\": null,\n",
      "  \"n_layer\": 12,\n",
      "  \"n_positions\": 1024,\n",
      "  \"reorder_and_upcast_attn\": false,\n",
      "  \"resid_pdrop\": 0.1,\n",
      "  \"scale_attn_by_inverse_layer_idx\": false,\n",
      "  \"scale_attn_weights\": true,\n",
      "  \"summary_activation\": null,\n",
      "  \"summary_first_dropout\": 0.1,\n",
      "  \"summary_proj_to_labels\": true,\n",
      "  \"summary_type\": \"cls_index\",\n",
      "  \"summary_use_proj\": true,\n",
      "  \"task_specific_params\": {\n",
      "    \"text-generation\": {\n",
      "      \"do_sample\": true,\n",
      "      \"max_length\": 50\n",
      "    }\n",
      "  },\n",
      "  \"transformers_version\": \"4.12.5\",\n",
      "  \"use_cache\": true,\n",
      "  \"vocab_size\": 50257\n",
      "}\n",
      "\n",
      "loading file https://huggingface.co/gpt2/resolve/main/vocab.json from cache at C:\\Users\\Foka/.cache\\huggingface\\transformers\\684fe667923972fb57f6b4dcb61a3c92763ad89882f3da5da9866baf14f2d60f.c7ed1f96aac49e745788faa77ba0a26a392643a50bb388b9c04ff469e555241f\n",
      "loading file https://huggingface.co/gpt2/resolve/main/merges.txt from cache at C:\\Users\\Foka/.cache\\huggingface\\transformers\\c0c761a63004025aeadd530c4c27b860ec4ecbe8a00531233de21d865a402598.5d12962c5ee615a4c803841266e9c3be9a691a924f72d395d3a6c6c81157788b\n",
      "loading file https://huggingface.co/gpt2/resolve/main/tokenizer.json from cache at C:\\Users\\Foka/.cache\\huggingface\\transformers\\16a2f78023c8dc511294f0c97b5e10fde3ef9889ad6d11ffaa2a00714e73926e.cf2d0ecb83b6df91b3dbb53f1d1e4c311578bfd3aa0e04934215a49bf9898df0\n",
      "loading file https://huggingface.co/gpt2/resolve/main/added_tokens.json from cache at None\n",
      "loading file https://huggingface.co/gpt2/resolve/main/special_tokens_map.json from cache at None\n",
      "loading file https://huggingface.co/gpt2/resolve/main/tokenizer_config.json from cache at None\n",
      "loading configuration file https://huggingface.co/gpt2/resolve/main/config.json from cache at C:\\Users\\Foka/.cache\\huggingface\\transformers\\fc674cd6907b4c9e933cb42d67662436b89fa9540a1f40d7c919d0109289ad01.7d2e0efa5ca20cef4fb199382111e9d3ad96fd77b849e1d4bed13a66e1336f51\n",
      "Model config GPT2Config {\n",
      "  \"activation_function\": \"gelu_new\",\n",
      "  \"architectures\": [\n",
      "    \"GPT2LMHeadModel\"\n",
      "  ],\n",
      "  \"attn_pdrop\": 0.1,\n",
      "  \"bos_token_id\": 50256,\n",
      "  \"embd_pdrop\": 0.1,\n",
      "  \"eos_token_id\": 50256,\n",
      "  \"initializer_range\": 0.02,\n",
      "  \"layer_norm_epsilon\": 1e-05,\n",
      "  \"model_type\": \"gpt2\",\n",
      "  \"n_ctx\": 1024,\n",
      "  \"n_embd\": 768,\n",
      "  \"n_head\": 12,\n",
      "  \"n_inner\": null,\n",
      "  \"n_layer\": 12,\n",
      "  \"n_positions\": 1024,\n",
      "  \"reorder_and_upcast_attn\": false,\n",
      "  \"resid_pdrop\": 0.1,\n",
      "  \"scale_attn_by_inverse_layer_idx\": false,\n",
      "  \"scale_attn_weights\": true,\n",
      "  \"summary_activation\": null,\n",
      "  \"summary_first_dropout\": 0.1,\n",
      "  \"summary_proj_to_labels\": true,\n",
      "  \"summary_type\": \"cls_index\",\n",
      "  \"summary_use_proj\": true,\n",
      "  \"task_specific_params\": {\n",
      "    \"text-generation\": {\n",
      "      \"do_sample\": true,\n",
      "      \"max_length\": 50\n",
      "    }\n",
      "  },\n",
      "  \"transformers_version\": \"4.12.5\",\n",
      "  \"use_cache\": true,\n",
      "  \"vocab_size\": 50257\n",
      "}\n",
      "\n",
      "02/16/2022 00:33:06 - INFO - __main__ - Return hidden states from model: False\n",
      "02/16/2022 00:33:06 - INFO - __main__ - Using implementation from: GPT2ForSequenceClassificationCustom\n",
      "loading weights file https://huggingface.co/gpt2/resolve/main/pytorch_model.bin from cache at C:\\Users\\Foka/.cache\\huggingface\\transformers\\752929ace039baa8ef70fe21cdf9ab9445773d20e733cf693d667982e210837e.323c769945a351daa25546176f8208b3004b6f563438a7603e7932bae9025925\n",
      "All model checkpoint weights were used when initializing GPT2ForSequenceClassificationCustom.\n",
      "\n",
      "Some weights of GPT2ForSequenceClassificationCustom were not initialized from the model checkpoint at gpt2 and are newly initialized: ['score.dense_1_hidden.bias', 'score.dense_1_input.bias', 'score.out_proj.weight', 'score.dense_2.bias', 'score.dense_1_hidden.weight', 'score.dense_2.weight', 'score.dense_1_input.weight']\n",
      "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n",
      "02/16/2022 00:33:08 - INFO - __main__ - Freezing model weights\n",
      "Using pad_token, but it is not set yet.\n",
      "02/16/2022 00:33:08 - INFO - __main__ - Set PAD token to EOS: <|endoftext|>\n",
      "02/16/2022 00:33:08 - WARNING - datasets.arrow_dataset - Loading cached processed dataset at C:\\Users\\Foka\\.cache\\huggingface\\datasets\\json\\default-67c9d932a627b7b8\\0.0.0\\c2d554c3377ea79c7664b93dc65d0803b45e3279000f993c7bfd18937fd7f426\\cache-f4385b00908c069e.arrow\n",
      "\n",
      "Running tokenizer on dataset:   0%|          | 0/1 [00:00<?, ?ba/s]\n",
      "Running tokenizer on dataset: 100%|##########| 1/1 [00:00<00:00, 32.35ba/s]\n",
      "02/16/2022 00:33:08 - WARNING - datasets.arrow_dataset - Loading cached processed dataset at C:\\Users\\Foka\\.cache\\huggingface\\datasets\\json\\default-67c9d932a627b7b8\\0.0.0\\c2d554c3377ea79c7664b93dc65d0803b45e3279000f993c7bfd18937fd7f426\\cache-c36412d695a9c6f1.arrow\n",
      "02/16/2022 00:33:08 - INFO - __main__ - Sample 1528 of the training set: {'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'input_ids': [258, 338, 281, 555, 2382, 7490, 764, 1303, 22584, 220], 'labels': 1}.\n",
      "02/16/2022 00:33:08 - INFO - __main__ - Sample 113 of the training set: {'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'input_ids': [31, 7220, 43646, 2148, 20577, 1303, 37098, 13948, 1337, 329, 1303, 11085, 77, 602, 25, 5387, 16155, 220, 1303, 17089, 6894, 5171, 4763, 220], 'labels': 1}.\n",
      "02/16/2022 00:33:08 - INFO - __main__ - Sample 485 of the training set: {'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'input_ids': [31, 7220, 2488, 7220, 883, 6886, 284, 220, 1303, 12480, 4604, 594, 1303, 5183, 445, 1303, 259, 31012, 1303, 42570, 6098, 999, 1303, 721, 16207, 481, 1309, 1303, 40954, 760, 674, 8666, 1303, 5539], 'labels': 1}.\n",
      "02/16/2022 00:33:09 - INFO - __main__ - ***** Running training *****\n",
      "02/16/2022 00:33:09 - INFO - __main__ -   Num examples = 4742\n",
      "02/16/2022 00:33:09 - INFO - __main__ -   Num Epochs = 1\n",
      "02/16/2022 00:33:09 - INFO - __main__ -   Instantaneous batch size per device = 24\n",
      "02/16/2022 00:33:09 - INFO - __main__ -   Total train batch size (w. parallel, distributed & accumulation) = 24\n",
      "02/16/2022 00:33:09 - INFO - __main__ -   Gradient Accumulation steps = 1\n",
      "02/16/2022 00:33:09 - INFO - __main__ -   Total optimization steps = 198\n",
      "\n",
      "  0%|          | 0/198 [00:00<?, ?it/s]\n",
      "  1%|          | 1/198 [00:01<03:23,  1.03s/it]\n",
      "  1%|1         | 2/198 [00:02<03:32,  1.08s/it]\n",
      "  2%|1         | 3/198 [00:04<04:40,  1.44s/it]\n",
      "  2%|2         | 4/198 [00:05<04:33,  1.41s/it]\n",
      "  3%|2         | 5/198 [00:06<04:33,  1.42s/it]\n",
      "  3%|3         | 6/198 [00:08<04:18,  1.35s/it]\n",
      "  4%|3         | 7/198 [00:09<04:03,  1.28s/it]\n",
      "  4%|4         | 8/198 [00:10<03:43,  1.17s/it]\n",
      "  5%|4         | 9/198 [00:12<05:08,  1.63s/it]\n",
      "  5%|5         | 10/198 [00:14<05:01,  1.60s/it]\n",
      "  6%|5         | 11/198 [00:15<04:29,  1.44s/it]\n",
      "  6%|6         | 12/198 [00:17<04:39,  1.50s/it]\n",
      "  7%|6         | 13/198 [00:18<04:23,  1.42s/it]\n",
      "  7%|7         | 14/198 [00:19<04:04,  1.33s/it]\n",
      "  8%|7         | 15/198 [00:22<05:33,  1.82s/it]\n",
      "  8%|8         | 16/198 [00:23<05:13,  1.72s/it]\n",
      "  9%|8         | 17/198 [00:25<05:16,  1.75s/it]\n",
      "  9%|9         | 18/198 [00:26<04:28,  1.49s/it]\n",
      " 10%|9         | 19/198 [00:28<04:34,  1.53s/it]\n",
      " 10%|#         | 20/198 [00:30<04:55,  1.66s/it]\n",
      " 11%|#         | 21/198 [00:31<04:30,  1.53s/it]\n",
      " 11%|#1        | 22/198 [00:32<03:57,  1.35s/it]\n",
      " 12%|#1        | 23/198 [00:33<04:04,  1.39s/it]\n",
      " 12%|#2        | 24/198 [00:36<04:50,  1.67s/it]\n",
      " 13%|#2        | 25/198 [00:37<04:31,  1.57s/it]\n",
      " 13%|#3        | 26/198 [00:38<04:05,  1.43s/it]\n",
      " 14%|#3        | 27/198 [00:39<03:49,  1.34s/it]\n",
      " 14%|#4        | 28/198 [00:40<03:37,  1.28s/it]\n",
      " 15%|#4        | 29/198 [00:42<03:48,  1.35s/it]\n",
      " 15%|#5        | 30/198 [00:44<04:06,  1.47s/it]\n",
      " 16%|#5        | 31/198 [00:45<03:43,  1.34s/it]\n",
      " 16%|#6        | 32/198 [00:46<03:35,  1.30s/it]\n",
      " 17%|#6        | 33/198 [00:47<03:27,  1.26s/it]\n",
      " 17%|#7        | 34/198 [00:50<04:49,  1.76s/it]\n",
      " 18%|#7        | 35/198 [00:51<04:15,  1.57s/it]\n",
      " 18%|#8        | 36/198 [00:53<04:21,  1.61s/it]\n",
      " 19%|#8        | 37/198 [00:54<04:23,  1.64s/it]\n",
      " 19%|#9        | 38/198 [00:56<04:25,  1.66s/it]\n",
      " 20%|#9        | 39/198 [00:58<04:25,  1.67s/it]\n",
      " 20%|##        | 40/198 [00:59<04:03,  1.54s/it]\n",
      " 21%|##        | 41/198 [01:00<03:52,  1.48s/it]\n",
      " 21%|##1       | 42/198 [01:01<03:32,  1.36s/it]\n",
      " 22%|##1       | 43/198 [01:03<03:29,  1.35s/it]\n",
      " 22%|##2       | 44/198 [01:04<03:16,  1.28s/it]\n",
      " 23%|##2       | 45/198 [01:06<03:31,  1.38s/it]\n",
      " 23%|##3       | 46/198 [01:07<03:37,  1.43s/it]\n",
      " 24%|##3       | 47/198 [01:08<03:17,  1.31s/it]\n",
      " 24%|##4       | 48/198 [01:09<02:57,  1.19s/it]\n",
      " 25%|##4       | 49/198 [01:10<02:52,  1.16s/it]\n",
      " 25%|##5       | 50/198 [01:12<03:23,  1.38s/it]\n",
      " 26%|##5       | 51/198 [01:13<03:16,  1.34s/it]\n",
      " 26%|##6       | 52/198 [01:14<03:09,  1.30s/it]\n",
      " 27%|##6       | 53/198 [01:16<03:07,  1.29s/it]\n",
      " 27%|##7       | 54/198 [01:17<03:08,  1.31s/it]\n",
      " 28%|##7       | 55/198 [01:19<03:22,  1.42s/it]\n",
      " 28%|##8       | 56/198 [01:21<04:06,  1.74s/it]\n",
      " 29%|##8       | 57/198 [01:22<03:35,  1.53s/it]\n",
      " 29%|##9       | 58/198 [01:23<03:16,  1.40s/it]\n",
      " 30%|##9       | 59/198 [01:26<04:01,  1.74s/it]\n",
      " 30%|###       | 60/198 [01:28<03:57,  1.72s/it]\n",
      " 31%|###       | 61/198 [01:29<03:49,  1.67s/it]\n",
      " 31%|###1      | 62/198 [01:31<04:04,  1.80s/it]\n",
      " 32%|###1      | 63/198 [01:33<03:44,  1.67s/it]\n",
      " 32%|###2      | 64/198 [01:34<03:18,  1.48s/it]\n",
      " 33%|###2      | 65/198 [01:35<02:57,  1.33s/it]\n",
      " 33%|###3      | 66/198 [01:36<02:44,  1.25s/it]\n",
      " 34%|###3      | 67/198 [01:37<02:59,  1.37s/it]\n",
      " 34%|###4      | 68/198 [01:40<03:45,  1.73s/it]\n",
      " 35%|###4      | 69/198 [01:41<03:29,  1.63s/it]\n",
      " 35%|###5      | 70/198 [01:42<03:10,  1.49s/it]\n",
      " 36%|###5      | 71/198 [01:44<03:17,  1.55s/it]\n",
      " 36%|###6      | 72/198 [01:46<03:32,  1.69s/it]\n",
      " 37%|###6      | 73/198 [01:47<03:06,  1.49s/it]\n",
      " 37%|###7      | 74/198 [01:48<02:50,  1.38s/it]\n",
      " 38%|###7      | 75/198 [01:49<02:38,  1.29s/it]\n",
      " 38%|###8      | 76/198 [01:51<02:50,  1.40s/it]\n",
      " 39%|###8      | 77/198 [01:52<02:50,  1.41s/it]\n",
      " 39%|###9      | 78/198 [01:54<02:41,  1.34s/it]\n",
      " 40%|###9      | 79/198 [01:55<02:54,  1.46s/it]\n",
      " 40%|####      | 80/198 [01:58<03:39,  1.86s/it]\n",
      " 41%|####      | 81/198 [01:59<03:05,  1.59s/it]\n",
      " 41%|####1     | 82/198 [02:00<02:42,  1.40s/it]\n",
      " 42%|####1     | 83/198 [02:01<02:26,  1.28s/it]\n",
      " 42%|####2     | 84/198 [02:02<02:23,  1.25s/it]\n",
      " 43%|####2     | 85/198 [02:03<02:12,  1.17s/it]\n",
      " 43%|####3     | 86/198 [02:05<02:14,  1.20s/it]\n",
      " 44%|####3     | 87/198 [02:06<02:19,  1.26s/it]\n",
      " 44%|####4     | 88/198 [02:07<02:21,  1.29s/it]\n",
      " 45%|####4     | 89/198 [02:08<02:11,  1.20s/it]\n",
      " 45%|####5     | 90/198 [02:11<02:49,  1.57s/it]\n",
      " 46%|####5     | 91/198 [02:14<03:32,  1.98s/it]\n",
      " 46%|####6     | 92/198 [02:15<03:18,  1.87s/it]\n",
      " 47%|####6     | 93/198 [02:16<02:48,  1.61s/it]\n",
      " 47%|####7     | 94/198 [02:17<02:32,  1.47s/it]\n",
      " 48%|####7     | 95/198 [02:19<02:41,  1.57s/it]\n",
      " 48%|####8     | 96/198 [02:21<02:43,  1.60s/it]\n",
      " 49%|####8     | 97/198 [02:22<02:27,  1.46s/it]\n",
      " 49%|####9     | 98/198 [02:24<02:43,  1.63s/it]\n",
      " 50%|#####     | 99/198 [02:26<02:42,  1.64s/it]\n",
      " 51%|#####     | 100/198 [02:27<02:42,  1.66s/it]\n",
      " 51%|#####1    | 101/198 [02:29<02:29,  1.54s/it]\n",
      " 52%|#####1    | 102/198 [02:32<03:13,  2.01s/it]\n",
      " 52%|#####2    | 103/198 [02:33<02:45,  1.74s/it]\n",
      " 53%|#####2    | 104/198 [02:34<02:25,  1.55s/it]\n",
      " 53%|#####3    | 105/198 [02:35<02:19,  1.50s/it]\n",
      " 54%|#####3    | 106/198 [02:36<02:06,  1.37s/it]\n",
      " 54%|#####4    | 107/198 [02:38<01:57,  1.29s/it]\n",
      " 55%|#####4    | 108/198 [02:40<02:15,  1.50s/it]\n",
      " 55%|#####5    | 109/198 [02:41<02:18,  1.56s/it]\n",
      " 56%|#####5    | 110/198 [02:43<02:13,  1.52s/it]\n",
      " 56%|#####6    | 111/198 [02:44<02:09,  1.48s/it]\n",
      " 57%|#####6    | 112/198 [02:46<02:09,  1.51s/it]\n",
      " 57%|#####7    | 113/198 [02:47<02:03,  1.45s/it]\n",
      " 58%|#####7    | 114/198 [02:49<02:12,  1.58s/it]\n",
      " 58%|#####8    | 115/198 [02:50<02:01,  1.46s/it]\n",
      " 59%|#####8    | 116/198 [02:53<02:39,  1.95s/it]\n",
      " 59%|#####9    | 117/198 [02:54<02:21,  1.74s/it]\n",
      " 60%|#####9    | 118/198 [02:56<02:05,  1.57s/it]\n",
      " 60%|######    | 119/198 [02:57<02:04,  1.57s/it]\n",
      " 61%|######    | 120/198 [02:58<01:47,  1.38s/it]\n",
      " 61%|######1   | 121/198 [03:00<01:55,  1.50s/it]\n",
      " 62%|######1   | 122/198 [03:01<01:52,  1.48s/it]\n",
      " 62%|######2   | 123/198 [03:03<01:54,  1.53s/it]\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      " 63%|######2   | 124/198 [03:04<01:46,  1.44s/it]\n",
      " 63%|######3   | 125/198 [03:06<01:50,  1.51s/it]\n",
      " 64%|######3   | 126/198 [03:07<01:40,  1.39s/it]\n",
      " 64%|######4   | 127/198 [03:08<01:28,  1.24s/it]\n",
      " 65%|######4   | 128/198 [03:09<01:23,  1.19s/it]\n",
      " 65%|######5   | 129/198 [03:10<01:24,  1.22s/it]\n",
      " 66%|######5   | 130/198 [03:11<01:21,  1.20s/it]\n",
      " 66%|######6   | 131/198 [03:13<01:21,  1.21s/it]\n",
      " 67%|######6   | 132/198 [03:13<01:14,  1.12s/it]\n",
      " 67%|######7   | 133/198 [03:15<01:24,  1.30s/it]\n",
      " 68%|######7   | 134/198 [03:16<01:21,  1.27s/it]\n",
      " 68%|######8   | 135/198 [03:17<01:14,  1.18s/it]\n",
      " 69%|######8   | 136/198 [03:18<01:09,  1.12s/it]\n",
      " 69%|######9   | 137/198 [03:20<01:09,  1.14s/it]\n",
      " 70%|######9   | 138/198 [03:21<01:17,  1.29s/it]\n",
      " 70%|#######   | 139/198 [03:24<01:48,  1.83s/it]\n",
      " 71%|#######   | 140/198 [03:26<01:43,  1.79s/it]\n",
      " 71%|#######1  | 141/198 [03:28<01:41,  1.78s/it]\n",
      " 72%|#######1  | 142/198 [03:29<01:37,  1.73s/it]\n",
      " 72%|#######2  | 143/198 [03:31<01:39,  1.81s/it]\n",
      " 73%|#######2  | 144/198 [03:33<01:38,  1.83s/it]\n",
      " 73%|#######3  | 145/198 [03:36<01:56,  2.20s/it]\n",
      " 74%|#######3  | 146/198 [03:38<01:47,  2.06s/it]\n",
      " 74%|#######4  | 147/198 [03:39<01:29,  1.75s/it]\n",
      " 75%|#######4  | 148/198 [03:40<01:22,  1.65s/it]\n",
      " 75%|#######5  | 149/198 [03:42<01:14,  1.52s/it]\n",
      " 76%|#######5  | 150/198 [03:43<01:08,  1.43s/it]\n",
      " 76%|#######6  | 151/198 [03:44<01:01,  1.31s/it]\n",
      " 77%|#######6  | 152/198 [03:45<00:54,  1.19s/it]\n",
      " 77%|#######7  | 153/198 [03:47<01:09,  1.55s/it]\n",
      " 78%|#######7  | 154/198 [03:49<01:10,  1.60s/it]\n",
      " 78%|#######8  | 155/198 [03:50<01:08,  1.59s/it]\n",
      " 79%|#######8  | 156/198 [03:51<00:58,  1.39s/it]\n",
      " 79%|#######9  | 157/198 [03:54<01:17,  1.89s/it]\n",
      " 80%|#######9  | 158/198 [03:55<01:04,  1.62s/it]\n",
      " 80%|########  | 159/198 [03:57<00:59,  1.54s/it]\n",
      " 81%|########  | 160/198 [03:58<00:53,  1.40s/it]\n",
      " 81%|########1 | 161/198 [04:00<00:57,  1.56s/it]\n",
      " 82%|########1 | 162/198 [04:01<00:54,  1.51s/it]\n",
      " 82%|########2 | 163/198 [04:03<00:55,  1.57s/it]\n",
      " 83%|########2 | 164/198 [04:04<00:52,  1.56s/it]\n",
      " 83%|########3 | 165/198 [04:06<00:50,  1.54s/it]\n",
      " 84%|########3 | 166/198 [04:07<00:45,  1.42s/it]\n",
      " 84%|########4 | 167/198 [04:09<00:44,  1.43s/it]\n",
      " 85%|########4 | 168/198 [04:10<00:45,  1.51s/it]\n",
      " 85%|########5 | 169/198 [04:12<00:45,  1.57s/it]\n",
      " 86%|########5 | 170/198 [04:14<00:45,  1.63s/it]\n",
      " 86%|########6 | 171/198 [04:15<00:40,  1.52s/it]\n",
      " 87%|########6 | 172/198 [04:17<00:40,  1.57s/it]\n",
      " 87%|########7 | 173/198 [04:18<00:40,  1.62s/it]\n",
      " 88%|########7 | 174/198 [04:20<00:38,  1.59s/it]\n",
      " 88%|########8 | 175/198 [04:21<00:33,  1.44s/it]\n",
      " 89%|########8 | 176/198 [04:23<00:35,  1.59s/it]\n",
      " 89%|########9 | 177/198 [04:25<00:34,  1.64s/it]\n",
      " 90%|########9 | 178/198 [04:26<00:28,  1.43s/it]\n",
      " 90%|######### | 179/198 [04:27<00:25,  1.36s/it]\n",
      " 91%|######### | 180/198 [04:28<00:23,  1.29s/it]\n",
      " 91%|#########1| 181/198 [04:31<00:30,  1.81s/it]\n",
      " 92%|#########1| 182/198 [04:32<00:25,  1.57s/it]\n",
      " 92%|#########2| 183/198 [04:33<00:21,  1.41s/it]\n",
      " 93%|#########2| 184/198 [04:34<00:19,  1.36s/it]\n",
      " 93%|#########3| 185/198 [04:36<00:17,  1.35s/it]\n",
      " 94%|#########3| 186/198 [04:37<00:17,  1.47s/it]\n",
      " 94%|#########4| 187/198 [04:39<00:16,  1.54s/it]\n",
      " 95%|#########4| 188/198 [04:40<00:13,  1.40s/it]\n",
      " 95%|#########5| 189/198 [04:41<00:11,  1.29s/it]\n",
      " 96%|#########5| 190/198 [04:42<00:10,  1.29s/it]\n",
      " 96%|#########6| 191/198 [04:44<00:08,  1.26s/it]\n",
      " 97%|#########6| 192/198 [04:45<00:07,  1.17s/it]\n",
      " 97%|#########7| 193/198 [04:46<00:06,  1.32s/it]\n",
      " 98%|#########7| 194/198 [04:47<00:05,  1.26s/it]\n",
      " 98%|#########8| 195/198 [04:49<00:03,  1.28s/it]\n",
      " 99%|#########8| 196/198 [04:50<00:02,  1.27s/it]\n",
      " 99%|#########9| 197/198 [04:51<00:01,  1.20s/it]\n",
      "100%|##########| 198/198 [04:52<00:00,  1.21s/it]02/16/2022 00:38:36 - INFO - __main__ - Epoch 0: {'accuracy': 0.676}\n",
      "02/16/2022 00:39:05 - INFO - __main__ - Test-set evaluation: {'accuracy': 0.636}\n",
      "Configuration saved in out/tweet/gpt2_version_3\\config.json\n",
      "Model weights saved in out/tweet/gpt2_version_3\\pytorch_model.bin\n",
      "tokenizer config file saved in out/tweet/gpt2_version_3\\tokenizer_config.json\n",
      "Special tokens file saved in out/tweet/gpt2_version_3\\special_tokens_map.json\n",
      "\n",
      "100%|##########| 198/198 [05:56<00:00,  1.80s/it]\n"
     ]
    }
   ],
   "source": [
    "# Run gpt2_version_3: one training epoch of GPT-2 with --freeze_model and --custom_model.\n",
    "# The script logs per-epoch and test-set accuracy, then saves the model and tokenizer\n",
    "# under out/tweet/gpt2_version_3.\n",
    "# NOTE(review): no --seed is passed, so results presumably vary between runs;\n",
    "# confirm before comparing accuracies across versions.\n",
    "!python run_glue_no_trainer.py \\\n",
    "  --model_name_or_path gpt2 \\\n",
    "  --custom_model \\\n",
    "  --freeze_model \\\n",
    "  --max_length 128 \\\n",
    "  --train_file data/train.json \\\n",
    "  --validation_file data/valid.json \\\n",
    "  --test_file data/test.json \\\n",
    "  --per_device_train_batch_size 24 \\\n",
    "  --per_device_eval_batch_size 24 \\\n",
    "  --num_train_epochs 1 \\\n",
    "  --learning_rate 2e-5 \\\n",
    "  --output_dir out/tweet/gpt2_version_3"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# GPT2 version 4"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "02/16/2022 00:39:07 - INFO - __main__ - Distributed environment: NO\n",
      "Num processes: 1\n",
      "Process index: 0\n",
      "Local process index: 0\n",
      "Device: cpu\n",
      "Use FP16 precision: False\n",
      "\n",
      "02/16/2022 00:39:08 - WARNING - datasets.builder - Using custom data configuration default-67c9d932a627b7b8\n",
      "02/16/2022 00:39:08 - WARNING - datasets.builder - Reusing dataset json (C:\\Users\\Foka\\.cache\\huggingface\\datasets\\json\\default-67c9d932a627b7b8\\0.0.0\\c2d554c3377ea79c7664b93dc65d0803b45e3279000f993c7bfd18937fd7f426)\n",
      "\n",
      "  0%|          | 0/3 [00:00<?, ?it/s]\n",
      "100%|##########| 3/3 [00:00<00:00, 1505.31it/s]\n",
      "loading configuration file https://huggingface.co/gpt2/resolve/main/config.json from cache at C:\\Users\\Foka/.cache\\huggingface\\transformers\\fc674cd6907b4c9e933cb42d67662436b89fa9540a1f40d7c919d0109289ad01.7d2e0efa5ca20cef4fb199382111e9d3ad96fd77b849e1d4bed13a66e1336f51\n",
      "Model config GPT2Config {\n",
      "  \"activation_function\": \"gelu_new\",\n",
      "  \"architectures\": [\n",
      "    \"GPT2LMHeadModel\"\n",
      "  ],\n",
      "  \"attn_pdrop\": 0.1,\n",
      "  \"bos_token_id\": 50256,\n",
      "  \"embd_pdrop\": 0.1,\n",
      "  \"eos_token_id\": 50256,\n",
      "  \"initializer_range\": 0.02,\n",
      "  \"layer_norm_epsilon\": 1e-05,\n",
      "  \"model_type\": \"gpt2\",\n",
      "  \"n_ctx\": 1024,\n",
      "  \"n_embd\": 768,\n",
      "  \"n_head\": 12,\n",
      "  \"n_inner\": null,\n",
      "  \"n_layer\": 12,\n",
      "  \"n_positions\": 1024,\n",
      "  \"reorder_and_upcast_attn\": false,\n",
      "  \"resid_pdrop\": 0.1,\n",
      "  \"scale_attn_by_inverse_layer_idx\": false,\n",
      "  \"scale_attn_weights\": true,\n",
      "  \"summary_activation\": null,\n",
      "  \"summary_first_dropout\": 0.1,\n",
      "  \"summary_proj_to_labels\": true,\n",
      "  \"summary_type\": \"cls_index\",\n",
      "  \"summary_use_proj\": true,\n",
      "  \"task_specific_params\": {\n",
      "    \"text-generation\": {\n",
      "      \"do_sample\": true,\n",
      "      \"max_length\": 50\n",
      "    }\n",
      "  },\n",
      "  \"transformers_version\": \"4.12.5\",\n",
      "  \"use_cache\": true,\n",
      "  \"vocab_size\": 50257\n",
      "}\n",
      "\n",
      "Could not locate the tokenizer configuration file, will try to use the model config instead.\n",
      "loading configuration file https://huggingface.co/gpt2/resolve/main/config.json from cache at C:\\Users\\Foka/.cache\\huggingface\\transformers\\fc674cd6907b4c9e933cb42d67662436b89fa9540a1f40d7c919d0109289ad01.7d2e0efa5ca20cef4fb199382111e9d3ad96fd77b849e1d4bed13a66e1336f51\n",
      "Model config GPT2Config {\n",
      "  \"activation_function\": \"gelu_new\",\n",
      "  \"architectures\": [\n",
      "    \"GPT2LMHeadModel\"\n",
      "  ],\n",
      "  \"attn_pdrop\": 0.1,\n",
      "  \"bos_token_id\": 50256,\n",
      "  \"embd_pdrop\": 0.1,\n",
      "  \"eos_token_id\": 50256,\n",
      "  \"initializer_range\": 0.02,\n",
      "  \"layer_norm_epsilon\": 1e-05,\n",
      "  \"model_type\": \"gpt2\",\n",
      "  \"n_ctx\": 1024,\n",
      "  \"n_embd\": 768,\n",
      "  \"n_head\": 12,\n",
      "  \"n_inner\": null,\n",
      "  \"n_layer\": 12,\n",
      "  \"n_positions\": 1024,\n",
      "  \"reorder_and_upcast_attn\": false,\n",
      "  \"resid_pdrop\": 0.1,\n",
      "  \"scale_attn_by_inverse_layer_idx\": false,\n",
      "  \"scale_attn_weights\": true,\n",
      "  \"summary_activation\": null,\n",
      "  \"summary_first_dropout\": 0.1,\n",
      "  \"summary_proj_to_labels\": true,\n",
      "  \"summary_type\": \"cls_index\",\n",
      "  \"summary_use_proj\": true,\n",
      "  \"task_specific_params\": {\n",
      "    \"text-generation\": {\n",
      "      \"do_sample\": true,\n",
      "      \"max_length\": 50\n",
      "    }\n",
      "  },\n",
      "  \"transformers_version\": \"4.12.5\",\n",
      "  \"use_cache\": true,\n",
      "  \"vocab_size\": 50257\n",
      "}\n",
      "\n",
      "loading file https://huggingface.co/gpt2/resolve/main/vocab.json from cache at C:\\Users\\Foka/.cache\\huggingface\\transformers\\684fe667923972fb57f6b4dcb61a3c92763ad89882f3da5da9866baf14f2d60f.c7ed1f96aac49e745788faa77ba0a26a392643a50bb388b9c04ff469e555241f\n",
      "loading file https://huggingface.co/gpt2/resolve/main/merges.txt from cache at C:\\Users\\Foka/.cache\\huggingface\\transformers\\c0c761a63004025aeadd530c4c27b860ec4ecbe8a00531233de21d865a402598.5d12962c5ee615a4c803841266e9c3be9a691a924f72d395d3a6c6c81157788b\n",
      "loading file https://huggingface.co/gpt2/resolve/main/tokenizer.json from cache at C:\\Users\\Foka/.cache\\huggingface\\transformers\\16a2f78023c8dc511294f0c97b5e10fde3ef9889ad6d11ffaa2a00714e73926e.cf2d0ecb83b6df91b3dbb53f1d1e4c311578bfd3aa0e04934215a49bf9898df0\n",
      "loading file https://huggingface.co/gpt2/resolve/main/added_tokens.json from cache at None\n",
      "loading file https://huggingface.co/gpt2/resolve/main/special_tokens_map.json from cache at None\n",
      "loading file https://huggingface.co/gpt2/resolve/main/tokenizer_config.json from cache at None\n",
      "loading configuration file https://huggingface.co/gpt2/resolve/main/config.json from cache at C:\\Users\\Foka/.cache\\huggingface\\transformers\\fc674cd6907b4c9e933cb42d67662436b89fa9540a1f40d7c919d0109289ad01.7d2e0efa5ca20cef4fb199382111e9d3ad96fd77b849e1d4bed13a66e1336f51\n",
      "Model config GPT2Config {\n",
      "  \"activation_function\": \"gelu_new\",\n",
      "  \"architectures\": [\n",
      "    \"GPT2LMHeadModel\"\n",
      "  ],\n",
      "  \"attn_pdrop\": 0.1,\n",
      "  \"bos_token_id\": 50256,\n",
      "  \"embd_pdrop\": 0.1,\n",
      "  \"eos_token_id\": 50256,\n",
      "  \"initializer_range\": 0.02,\n",
      "  \"layer_norm_epsilon\": 1e-05,\n",
      "  \"model_type\": \"gpt2\",\n",
      "  \"n_ctx\": 1024,\n",
      "  \"n_embd\": 768,\n",
      "  \"n_head\": 12,\n",
      "  \"n_inner\": null,\n",
      "  \"n_layer\": 12,\n",
      "  \"n_positions\": 1024,\n",
      "  \"reorder_and_upcast_attn\": false,\n",
      "  \"resid_pdrop\": 0.1,\n",
      "  \"scale_attn_by_inverse_layer_idx\": false,\n",
      "  \"scale_attn_weights\": true,\n",
      "  \"summary_activation\": null,\n",
      "  \"summary_first_dropout\": 0.1,\n",
      "  \"summary_proj_to_labels\": true,\n",
      "  \"summary_type\": \"cls_index\",\n",
      "  \"summary_use_proj\": true,\n",
      "  \"task_specific_params\": {\n",
      "    \"text-generation\": {\n",
      "      \"do_sample\": true,\n",
      "      \"max_length\": 50\n",
      "    }\n",
      "  },\n",
      "  \"transformers_version\": \"4.12.5\",\n",
      "  \"use_cache\": true,\n",
      "  \"vocab_size\": 50257\n",
      "}\n",
      "\n",
      "02/16/2022 00:39:14 - INFO - __main__ - Return hidden states from model: True\n",
      "02/16/2022 00:39:14 - INFO - __main__ - Using implementation from: GPT2ForSequenceClassificationCustom\n",
      "loading weights file https://huggingface.co/gpt2/resolve/main/pytorch_model.bin from cache at C:\\Users\\Foka/.cache\\huggingface\\transformers\\752929ace039baa8ef70fe21cdf9ab9445773d20e733cf693d667982e210837e.323c769945a351daa25546176f8208b3004b6f563438a7603e7932bae9025925\n",
      "All model checkpoint weights were used when initializing GPT2ForSequenceClassificationCustom.\n",
      "\n",
      "Some weights of GPT2ForSequenceClassificationCustom were not initialized from the model checkpoint at gpt2 and are newly initialized: ['score.dense_1_hidden.weight', 'score.out_proj.weight', 'score.dense_1_input.bias', 'score.dense_2.bias', 'score.dense_2.weight', 'score.dense_1_input.weight', 'score.dense_1_hidden.bias']\n",
      "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n",
      "02/16/2022 00:39:16 - INFO - __main__ - Freezing model weights\n",
      "Using pad_token, but it is not set yet.\n",
      "02/16/2022 00:39:16 - INFO - __main__ - Set PAD token to EOS: <|endoftext|>\n",
      "02/16/2022 00:39:16 - WARNING - datasets.arrow_dataset - Loading cached processed dataset at C:\\Users\\Foka\\.cache\\huggingface\\datasets\\json\\default-67c9d932a627b7b8\\0.0.0\\c2d554c3377ea79c7664b93dc65d0803b45e3279000f993c7bfd18937fd7f426\\cache-5a65b7038a57b5cc.arrow\n",
      "\n",
      "Running tokenizer on dataset:   0%|          | 0/1 [00:00<?, ?ba/s]\n",
      "Running tokenizer on dataset: 100%|##########| 1/1 [00:00<00:00, 34.58ba/s]\n",
      "02/16/2022 00:39:16 - WARNING - datasets.arrow_dataset - Loading cached processed dataset at C:\\Users\\Foka\\.cache\\huggingface\\datasets\\json\\default-67c9d932a627b7b8\\0.0.0\\c2d554c3377ea79c7664b93dc65d0803b45e3279000f993c7bfd18937fd7f426\\cache-5ed4052179e59c20.arrow\n",
      "02/16/2022 00:39:16 - INFO - __main__ - Sample 3838 of the training set: {'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'input_ids': [31, 7220, 662, 12, 24071, 2488, 7220, 1303, 83, 34715, 34191, 40252, 1492, 1909, 6184, 108, 126, 253, 126, 239, 26604, 27214, 126, 253, 126, 237, 126, 120, 220, 220, 1303, 3605, 76, 13513], 'labels': 0}.\n",
      "02/16/2022 00:39:16 - INFO - __main__ - Sample 1761 of the training set: {'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'input_ids': [10919, 257, 220, 995, 13, 611, 257, 582, 1718, 257, 15647, 588, 326, 11, 661, 561, 910, 340, 373, 5969, 13, 475, 275, 14, 66, 1303, 81, 623, 283, 1076, 88, 318, 257, 2415, 428, 318, 2938, 13], 'labels': 1}.\n",
      "02/16/2022 00:39:16 - INFO - __main__ - Sample 1111 of the training set: {'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'input_ids': [31, 7220, 2488, 7220, 612, 318, 645, 3338, 1295, 329, 15102, 287, 428, 1499, 780, 286, 661, 588, 345, 1303, 65, 1967, 220], 'labels': 1}.\n",
      "02/16/2022 00:39:17 - INFO - __main__ - ***** Running training *****\n",
      "02/16/2022 00:39:17 - INFO - __main__ -   Num examples = 4742\n",
      "02/16/2022 00:39:17 - INFO - __main__ -   Num Epochs = 1\n",
      "02/16/2022 00:39:17 - INFO - __main__ -   Instantaneous batch size per device = 24\n",
      "02/16/2022 00:39:17 - INFO - __main__ -   Total train batch size (w. parallel, distributed & accumulation) = 24\n",
      "02/16/2022 00:39:17 - INFO - __main__ -   Gradient Accumulation steps = 1\n",
      "02/16/2022 00:39:17 - INFO - __main__ -   Total optimization steps = 198\n",
      "\n",
      "  0%|          | 0/198 [00:00<?, ?it/s]\n",
      "  1%|          | 1/198 [00:01<05:35,  1.70s/it]\n",
      "  1%|1         | 2/198 [00:02<04:33,  1.39s/it]\n",
      "  2%|1         | 3/198 [00:04<04:24,  1.35s/it]\n",
      "  2%|2         | 4/198 [00:07<06:43,  2.08s/it]\n",
      "  3%|2         | 5/198 [00:08<05:50,  1.81s/it]\n",
      "  3%|3         | 6/198 [00:10<05:21,  1.68s/it]\n",
      "  4%|3         | 7/198 [00:11<05:11,  1.63s/it]\n",
      "  4%|4         | 8/198 [00:12<04:40,  1.48s/it]\n",
      "  5%|4         | 9/198 [00:14<04:30,  1.43s/it]\n",
      "  5%|5         | 10/198 [00:15<04:09,  1.32s/it]\n",
      "  6%|5         | 11/198 [00:16<04:29,  1.44s/it]\n",
      "  6%|6         | 12/198 [00:17<03:58,  1.28s/it]\n",
      "  7%|6         | 13/198 [00:19<04:18,  1.40s/it]\n",
      "  7%|7         | 14/198 [00:20<04:07,  1.35s/it]\n",
      "  8%|7         | 15/198 [00:22<04:03,  1.33s/it]\n",
      "  8%|8         | 16/198 [00:23<04:14,  1.40s/it]\n",
      "  9%|8         | 17/198 [00:24<04:01,  1.33s/it]\n",
      "  9%|9         | 18/198 [00:25<03:32,  1.18s/it]\n",
      " 10%|9         | 19/198 [00:27<03:50,  1.29s/it]\n",
      " 10%|#         | 20/198 [00:28<04:09,  1.40s/it]\n",
      " 11%|#         | 21/198 [00:29<03:54,  1.32s/it]\n",
      " 11%|#1        | 22/198 [00:30<03:33,  1.21s/it]\n",
      " 12%|#1        | 23/198 [00:32<03:57,  1.36s/it]\n",
      " 12%|#2        | 24/198 [00:33<03:45,  1.30s/it]\n",
      " 13%|#2        | 25/198 [00:35<03:52,  1.34s/it]\n",
      " 13%|#3        | 26/198 [00:36<03:39,  1.28s/it]\n",
      " 14%|#3        | 27/198 [00:37<03:31,  1.23s/it]\n",
      " 14%|#4        | 28/198 [00:39<04:07,  1.46s/it]\n",
      " 15%|#4        | 29/198 [00:40<04:03,  1.44s/it]\n",
      " 15%|#5        | 30/198 [00:41<03:35,  1.28s/it]\n",
      " 16%|#5        | 31/198 [00:43<03:55,  1.41s/it]\n",
      " 16%|#6        | 32/198 [00:45<04:08,  1.50s/it]\n",
      " 17%|#6        | 33/198 [00:46<03:45,  1.36s/it]\n",
      " 17%|#7        | 34/198 [00:48<04:39,  1.70s/it]\n",
      " 18%|#7        | 35/198 [00:50<04:25,  1.63s/it]\n",
      " 18%|#8        | 36/198 [00:51<04:02,  1.50s/it]\n",
      " 19%|#8        | 37/198 [00:52<03:48,  1.42s/it]\n",
      " 19%|#9        | 38/198 [00:53<03:35,  1.35s/it]\n",
      " 20%|#9        | 39/198 [00:55<03:43,  1.40s/it]\n",
      " 20%|##        | 40/198 [00:56<03:49,  1.45s/it]\n",
      " 21%|##        | 41/198 [00:57<03:25,  1.31s/it]\n",
      " 21%|##1       | 42/198 [00:59<03:33,  1.37s/it]\n",
      " 22%|##1       | 43/198 [01:00<03:15,  1.26s/it]\n",
      " 22%|##2       | 44/198 [01:01<03:06,  1.21s/it]\n",
      " 23%|##2       | 45/198 [01:02<02:52,  1.13s/it]\n",
      " 23%|##3       | 46/198 [01:03<02:58,  1.17s/it]\n",
      " 24%|##3       | 47/198 [01:04<02:45,  1.09s/it]\n",
      " 24%|##4       | 48/198 [01:05<02:39,  1.06s/it]\n",
      " 25%|##4       | 49/198 [01:07<03:20,  1.34s/it]\n",
      " 25%|##5       | 50/198 [01:08<03:16,  1.33s/it]\n",
      " 26%|##5       | 51/198 [01:10<03:31,  1.44s/it]\n",
      " 26%|##6       | 52/198 [01:12<03:31,  1.45s/it]\n",
      " 27%|##6       | 53/198 [01:13<03:13,  1.33s/it]\n",
      " 27%|##7       | 54/198 [01:14<02:53,  1.21s/it]\n",
      " 28%|##7       | 55/198 [01:15<02:51,  1.20s/it]\n",
      " 28%|##8       | 56/198 [01:16<03:11,  1.35s/it]\n",
      " 29%|##8       | 57/198 [01:17<02:54,  1.24s/it]\n",
      " 29%|##9       | 58/198 [01:19<02:58,  1.28s/it]\n",
      " 30%|##9       | 59/198 [01:20<02:49,  1.22s/it]\n",
      " 30%|###       | 60/198 [01:21<02:53,  1.26s/it]\n",
      " 31%|###       | 61/198 [01:22<02:39,  1.17s/it]\n",
      " 31%|###1      | 62/198 [01:25<03:56,  1.74s/it]\n",
      " 32%|###1      | 63/198 [01:26<03:28,  1.54s/it]\n",
      " 32%|###2      | 64/198 [01:29<04:29,  2.01s/it]\n",
      " 33%|###2      | 65/198 [01:31<04:06,  1.85s/it]\n",
      " 33%|###3      | 66/198 [01:33<04:24,  2.00s/it]\n",
      " 34%|###3      | 67/198 [01:35<04:12,  1.93s/it]\n",
      " 34%|###4      | 68/198 [01:36<03:33,  1.64s/it]\n",
      " 35%|###4      | 69/198 [01:37<03:21,  1.56s/it]\n",
      " 35%|###5      | 70/198 [01:38<02:58,  1.39s/it]\n",
      " 36%|###5      | 71/198 [01:40<02:50,  1.34s/it]\n",
      " 36%|###6      | 72/198 [01:41<03:01,  1.44s/it]\n",
      " 37%|###6      | 73/198 [01:43<02:58,  1.43s/it]\n",
      " 37%|###7      | 74/198 [01:44<02:46,  1.34s/it]\n",
      " 38%|###7      | 75/198 [01:45<02:58,  1.45s/it]\n",
      " 38%|###8      | 76/198 [01:47<03:17,  1.62s/it]\n",
      " 39%|###8      | 77/198 [01:49<03:12,  1.59s/it]\n",
      " 39%|###9      | 78/198 [01:51<03:26,  1.72s/it]\n",
      " 40%|###9      | 79/198 [01:52<03:07,  1.57s/it]\n",
      " 40%|####      | 80/198 [01:54<03:11,  1.62s/it]\n",
      " 41%|####      | 81/198 [01:55<02:47,  1.43s/it]\n",
      " 41%|####1     | 82/198 [01:57<02:51,  1.48s/it]\n",
      " 42%|####1     | 83/198 [01:58<02:42,  1.41s/it]\n",
      " 42%|####2     | 84/198 [01:59<02:32,  1.34s/it]\n",
      " 43%|####2     | 85/198 [02:00<02:26,  1.30s/it]\n",
      " 43%|####3     | 86/198 [02:01<02:13,  1.19s/it]\n",
      " 44%|####3     | 87/198 [02:02<02:04,  1.12s/it]\n",
      " 44%|####4     | 88/198 [02:03<02:10,  1.19s/it]\n",
      " 45%|####4     | 89/198 [02:05<02:14,  1.23s/it]\n",
      " 45%|####5     | 90/198 [02:08<03:13,  1.79s/it]\n",
      " 46%|####5     | 91/198 [02:11<03:39,  2.05s/it]\n",
      " 46%|####6     | 92/198 [02:12<03:08,  1.77s/it]\n",
      " 47%|####6     | 93/198 [02:13<02:46,  1.59s/it]\n",
      " 47%|####7     | 94/198 [02:15<02:57,  1.71s/it]\n",
      " 48%|####7     | 95/198 [02:16<02:43,  1.59s/it]\n",
      " 48%|####8     | 96/198 [02:18<02:47,  1.65s/it]\n",
      " 49%|####8     | 97/198 [02:19<02:35,  1.54s/it]\n",
      " 49%|####9     | 98/198 [02:20<02:23,  1.44s/it]\n",
      " 50%|#####     | 99/198 [02:22<02:36,  1.58s/it]\n",
      " 51%|#####     | 100/198 [02:25<03:02,  1.87s/it]\n",
      " 51%|#####1    | 101/198 [02:26<02:43,  1.68s/it]\n",
      " 52%|#####1    | 102/198 [02:28<02:43,  1.71s/it]\n",
      " 52%|#####2    | 103/198 [02:30<02:49,  1.78s/it]\n",
      " 53%|#####2    | 104/198 [02:32<02:48,  1.79s/it]\n",
      " 53%|#####3    | 105/198 [02:33<02:24,  1.55s/it]\n",
      " 54%|#####3    | 106/198 [02:34<02:12,  1.44s/it]\n",
      " 54%|#####4    | 107/198 [02:35<01:57,  1.29s/it]\n",
      " 55%|#####4    | 108/198 [02:36<02:05,  1.39s/it]\n",
      " 55%|#####5    | 109/198 [02:38<02:09,  1.45s/it]\n",
      " 56%|#####5    | 110/198 [02:40<02:15,  1.54s/it]\n",
      " 56%|#####6    | 111/198 [02:41<02:18,  1.59s/it]\n",
      " 57%|#####6    | 112/198 [02:43<02:19,  1.63s/it]\n",
      " 57%|#####7    | 113/198 [02:44<02:00,  1.41s/it]\n",
      " 58%|#####7    | 114/198 [02:45<01:54,  1.36s/it]\n",
      " 58%|#####8    | 115/198 [02:47<02:01,  1.46s/it]\n",
      " 59%|#####8    | 116/198 [02:49<02:05,  1.53s/it]\n",
      " 59%|#####9    | 117/198 [02:50<01:50,  1.36s/it]\n",
      " 60%|#####9    | 118/198 [02:51<01:48,  1.36s/it]\n",
      " 60%|######    | 119/198 [02:53<01:56,  1.47s/it]\n",
      " 61%|######    | 120/198 [02:54<01:42,  1.31s/it]\n",
      " 61%|######1   | 121/198 [02:55<01:34,  1.22s/it]\n",
      " 62%|######1   | 122/198 [02:56<01:43,  1.36s/it]\n",
      " 62%|######2   | 123/198 [02:57<01:37,  1.30s/it]\n",
      " 63%|######2   | 124/198 [02:59<01:32,  1.26s/it]\n",
      " 63%|######3   | 125/198 [03:00<01:28,  1.21s/it]\n",
      " 64%|######3   | 126/198 [03:01<01:36,  1.34s/it]\n",
      " 64%|######4   | 127/198 [03:03<01:37,  1.37s/it]\n",
      " 65%|######4   | 128/198 [03:04<01:32,  1.32s/it]\n",
      " 65%|######5   | 129/198 [03:05<01:28,  1.28s/it]\n",
      " 66%|######5   | 130/198 [03:07<01:34,  1.39s/it]\n",
      " 66%|######6   | 131/198 [03:10<02:06,  1.89s/it]\n",
      " 67%|######6   | 132/198 [03:11<01:53,  1.73s/it]\n",
      " 67%|######7   | 133/198 [03:14<02:20,  2.17s/it]\n",
      " 68%|######7   | 134/198 [03:16<02:14,  2.10s/it]\n",
      " 68%|######8   | 135/198 [03:18<02:06,  2.00s/it]\n",
      " 69%|######8   | 136/198 [03:19<01:44,  1.68s/it]\n",
      " 69%|######9   | 137/198 [03:20<01:29,  1.47s/it]\n",
      " 70%|######9   | 138/198 [03:21<01:20,  1.34s/it]\n",
      " 70%|#######   | 139/198 [03:22<01:17,  1.32s/it]\n",
      " 71%|#######   | 140/198 [03:24<01:12,  1.25s/it]\n",
      " 71%|#######1  | 141/198 [03:26<01:39,  1.75s/it]\n",
      " 72%|#######1  | 142/198 [03:28<01:35,  1.70s/it]\n",
      " 72%|#######2  | 143/198 [03:29<01:22,  1.50s/it]\n",
      " 73%|#######2  | 144/198 [03:30<01:14,  1.38s/it]\n",
      " 73%|#######3  | 145/198 [03:31<01:07,  1.27s/it]\n",
      " 74%|#######3  | 146/198 [03:33<01:12,  1.40s/it]\n",
      " 74%|#######4  | 147/198 [03:34<01:06,  1.30s/it]\n",
      " 75%|#######4  | 148/198 [03:37<01:29,  1.78s/it]\n",
      " 75%|#######5  | 149/198 [03:38<01:18,  1.61s/it]\n",
      " 76%|#######5  | 150/198 [03:40<01:23,  1.73s/it]\n",
      " 76%|#######6  | 151/198 [03:41<01:13,  1.57s/it]\n",
      " 77%|#######6  | 152/198 [03:42<01:02,  1.37s/it]\n",
      " 77%|#######7  | 153/198 [03:43<00:57,  1.29s/it]\n",
      " 78%|#######7  | 154/198 [03:45<01:02,  1.41s/it]\n",
      " 78%|#######8  | 155/198 [03:47<01:05,  1.52s/it]\n",
      " 79%|#######8  | 156/198 [03:48<00:59,  1.43s/it]\n",
      " 79%|#######9  | 157/198 [03:50<01:02,  1.51s/it]\n",
      " 80%|#######9  | 158/198 [03:51<00:54,  1.37s/it]\n",
      " 80%|########  | 159/198 [03:52<00:49,  1.27s/it]\n",
      " 81%|########  | 160/198 [03:53<00:46,  1.22s/it]\n",
      " 81%|########1 | 161/198 [03:54<00:42,  1.15s/it]\n",
      " 82%|########1 | 162/198 [03:55<00:41,  1.15s/it]\n",
      " 82%|########2 | 163/198 [03:56<00:38,  1.09s/it]\n",
      " 83%|########2 | 164/198 [03:57<00:35,  1.05s/it]\n",
      " 83%|########3 | 165/198 [03:58<00:40,  1.23s/it]\n",
      " 84%|########3 | 166/198 [04:00<00:39,  1.24s/it]\n",
      " 84%|########4 | 167/198 [04:01<00:41,  1.35s/it]\n",
      " 85%|########4 | 168/198 [04:04<00:54,  1.81s/it]\n",
      " 85%|########5 | 169/198 [04:07<00:57,  2.00s/it]\n",
      " 86%|########5 | 170/198 [04:09<00:57,  2.04s/it]\n",
      " 86%|########6 | 171/198 [04:10<00:47,  1.75s/it]\n",
      " 87%|########6 | 172/198 [04:12<00:44,  1.73s/it]\n",
      " 87%|########7 | 173/198 [04:13<00:37,  1.49s/it]\n",
      " 88%|########7 | 174/198 [04:14<00:38,  1.62s/it]\n",
      " 88%|########8 | 175/198 [04:16<00:36,  1.58s/it]\n",
      " 89%|########8 | 176/198 [04:17<00:30,  1.40s/it]\n",
      " 89%|########9 | 177/198 [04:19<00:32,  1.54s/it]\n",
      " 90%|########9 | 178/198 [04:20<00:27,  1.36s/it]\n",
      " 90%|######### | 179/198 [04:21<00:27,  1.43s/it]\n",
      " 91%|######### | 180/198 [04:22<00:23,  1.33s/it]\n",
      " 91%|#########1| 181/198 [04:24<00:21,  1.28s/it]\n",
      " 92%|#########1| 182/198 [04:25<00:19,  1.21s/it]\n",
      " 92%|#########2| 183/198 [04:27<00:23,  1.57s/it]\n",
      " 93%|#########2| 184/198 [04:28<00:20,  1.49s/it]\n",
      " 93%|#########3| 185/198 [04:29<00:18,  1.39s/it]\n",
      " 94%|#########3| 186/198 [04:33<00:22,  1.88s/it]\n",
      " 94%|#########4| 187/198 [04:34<00:17,  1.63s/it]\n",
      " 95%|#########4| 188/198 [04:35<00:16,  1.64s/it]\n",
      " 95%|#########5| 189/198 [04:36<00:13,  1.52s/it]\n",
      " 96%|#########5| 190/198 [04:37<00:10,  1.37s/it]\n",
      " 96%|#########6| 191/198 [04:39<00:09,  1.41s/it]\n",
      " 97%|#########6| 192/198 [04:41<00:09,  1.55s/it]\n",
      " 97%|#########7| 193/198 [04:43<00:08,  1.62s/it]\n",
      " 98%|#########7| 194/198 [04:44<00:06,  1.65s/it]\n",
      " 98%|#########8| 195/198 [04:46<00:04,  1.63s/it]\n",
      " 99%|#########8| 196/198 [04:47<00:03,  1.53s/it]\n",
      " 99%|#########9| 197/198 [04:49<00:01,  1.50s/it]\n",
      "100%|##########| 198/198 [04:49<00:00,  1.21s/it]02/16/2022 00:44:41 - INFO - __main__ - Epoch 0: {'accuracy': 0.728}\n",
      "02/16/2022 00:45:10 - INFO - __main__ - Test-set evaluation: {'accuracy': 0.732}\n",
      "Configuration saved in out/tweet/gpt2_version_4\\config.json\n",
      "Model weights saved in out/tweet/gpt2_version_4\\pytorch_model.bin\n",
      "tokenizer config file saved in out/tweet/gpt2_version_4\\tokenizer_config.json\n",
      "Special tokens file saved in out/tweet/gpt2_version_4\\special_tokens_map.json\n",
      "\n",
      "100%|##########| 198/198 [05:53<00:00,  1.78s/it]\n"
     ]
    }
   ],
   "source": [
    "# Run gpt2_version_4: one training epoch of GPT-2 with --freeze_model, --custom_model\n",
    "# and --return_hidden_states. Per the run log this uses GPT2ForSequenceClassificationCustom,\n",
    "# freezes the model weights and returns hidden states from the model.\n",
    "# The script logs per-epoch and test-set accuracy, then saves the model and tokenizer\n",
    "# under out/tweet/gpt2_version_4.\n",
    "# NOTE(review): the score.* head weights are newly initialized and no --seed is passed,\n",
    "# so results presumably vary between runs; confirm before comparing accuracies across versions.\n",
    "!python run_glue_no_trainer.py \\\n",
    "  --model_name_or_path gpt2 \\\n",
    "  --custom_model \\\n",
    "  --return_hidden_states \\\n",
    "  --freeze_model \\\n",
    "  --max_length 128 \\\n",
    "  --train_file data/train.json \\\n",
    "  --validation_file data/valid.json \\\n",
    "  --test_file data/test.json \\\n",
    "  --per_device_train_batch_size 24 \\\n",
    "  --per_device_eval_batch_size 24 \\\n",
    "  --num_train_epochs 1 \\\n",
    "  --learning_rate 2e-5 \\\n",
    "  --output_dir out/tweet/gpt2_version_4"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# GPT2 version 5"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "02/17/2022 17:37:38 - INFO - __main__ - Distributed environment: NO\n",
      "Num processes: 1\n",
      "Process index: 0\n",
      "Local process index: 0\n",
      "Device: cpu\n",
      "Use FP16 precision: False\n",
      "\n",
      "02/17/2022 17:37:39 - WARNING - datasets.builder - Using custom data configuration default-67c9d932a627b7b8\n",
      "02/17/2022 17:37:39 - WARNING - datasets.builder - Reusing dataset json (C:\\Users\\Foka\\.cache\\huggingface\\datasets\\json\\default-67c9d932a627b7b8\\0.0.0\\c2d554c3377ea79c7664b93dc65d0803b45e3279000f993c7bfd18937fd7f426)\n",
      "\n",
      "  0%|          | 0/3 [00:00<?, ?it/s]\n",
      "100%|##########| 3/3 [00:00<00:00, 1503.87it/s]\n",
      "loading configuration file https://huggingface.co/gpt2/resolve/main/config.json from cache at C:\\Users\\Foka/.cache\\huggingface\\transformers\\fc674cd6907b4c9e933cb42d67662436b89fa9540a1f40d7c919d0109289ad01.7d2e0efa5ca20cef4fb199382111e9d3ad96fd77b849e1d4bed13a66e1336f51\n",
      "Model config GPT2Config {\n",
      "  \"activation_function\": \"gelu_new\",\n",
      "  \"architectures\": [\n",
      "    \"GPT2LMHeadModel\"\n",
      "  ],\n",
      "  \"attn_pdrop\": 0.1,\n",
      "  \"bos_token_id\": 50256,\n",
      "  \"embd_pdrop\": 0.1,\n",
      "  \"eos_token_id\": 50256,\n",
      "  \"initializer_range\": 0.02,\n",
      "  \"layer_norm_epsilon\": 1e-05,\n",
      "  \"model_type\": \"gpt2\",\n",
      "  \"n_ctx\": 1024,\n",
      "  \"n_embd\": 768,\n",
      "  \"n_head\": 12,\n",
      "  \"n_inner\": null,\n",
      "  \"n_layer\": 12,\n",
      "  \"n_positions\": 1024,\n",
      "  \"reorder_and_upcast_attn\": false,\n",
      "  \"resid_pdrop\": 0.1,\n",
      "  \"scale_attn_by_inverse_layer_idx\": false,\n",
      "  \"scale_attn_weights\": true,\n",
      "  \"summary_activation\": null,\n",
      "  \"summary_first_dropout\": 0.1,\n",
      "  \"summary_proj_to_labels\": true,\n",
      "  \"summary_type\": \"cls_index\",\n",
      "  \"summary_use_proj\": true,\n",
      "  \"task_specific_params\": {\n",
      "    \"text-generation\": {\n",
      "      \"do_sample\": true,\n",
      "      \"max_length\": 50\n",
      "    }\n",
      "  },\n",
      "  \"transformers_version\": \"4.12.5\",\n",
      "  \"use_cache\": true,\n",
      "  \"vocab_size\": 50257\n",
      "}\n",
      "\n",
      "Could not locate the tokenizer configuration file, will try to use the model config instead.\n",
      "loading configuration file https://huggingface.co/gpt2/resolve/main/config.json from cache at C:\\Users\\Foka/.cache\\huggingface\\transformers\\fc674cd6907b4c9e933cb42d67662436b89fa9540a1f40d7c919d0109289ad01.7d2e0efa5ca20cef4fb199382111e9d3ad96fd77b849e1d4bed13a66e1336f51\n",
      "Model config GPT2Config {\n",
      "  \"activation_function\": \"gelu_new\",\n",
      "  \"architectures\": [\n",
      "    \"GPT2LMHeadModel\"\n",
      "  ],\n",
      "  \"attn_pdrop\": 0.1,\n",
      "  \"bos_token_id\": 50256,\n",
      "  \"embd_pdrop\": 0.1,\n",
      "  \"eos_token_id\": 50256,\n",
      "  \"initializer_range\": 0.02,\n",
      "  \"layer_norm_epsilon\": 1e-05,\n",
      "  \"model_type\": \"gpt2\",\n",
      "  \"n_ctx\": 1024,\n",
      "  \"n_embd\": 768,\n",
      "  \"n_head\": 12,\n",
      "  \"n_inner\": null,\n",
      "  \"n_layer\": 12,\n",
      "  \"n_positions\": 1024,\n",
      "  \"reorder_and_upcast_attn\": false,\n",
      "  \"resid_pdrop\": 0.1,\n",
      "  \"scale_attn_by_inverse_layer_idx\": false,\n",
      "  \"scale_attn_weights\": true,\n",
      "  \"summary_activation\": null,\n",
      "  \"summary_first_dropout\": 0.1,\n",
      "  \"summary_proj_to_labels\": true,\n",
      "  \"summary_type\": \"cls_index\",\n",
      "  \"summary_use_proj\": true,\n",
      "  \"task_specific_params\": {\n",
      "    \"text-generation\": {\n",
      "      \"do_sample\": true,\n",
      "      \"max_length\": 50\n",
      "    }\n",
      "  },\n",
      "  \"transformers_version\": \"4.12.5\",\n",
      "  \"use_cache\": true,\n",
      "  \"vocab_size\": 50257\n",
      "}\n",
      "\n",
      "loading file https://huggingface.co/gpt2/resolve/main/vocab.json from cache at C:\\Users\\Foka/.cache\\huggingface\\transformers\\684fe667923972fb57f6b4dcb61a3c92763ad89882f3da5da9866baf14f2d60f.c7ed1f96aac49e745788faa77ba0a26a392643a50bb388b9c04ff469e555241f\n",
      "loading file https://huggingface.co/gpt2/resolve/main/merges.txt from cache at C:\\Users\\Foka/.cache\\huggingface\\transformers\\c0c761a63004025aeadd530c4c27b860ec4ecbe8a00531233de21d865a402598.5d12962c5ee615a4c803841266e9c3be9a691a924f72d395d3a6c6c81157788b\n",
      "loading file https://huggingface.co/gpt2/resolve/main/tokenizer.json from cache at C:\\Users\\Foka/.cache\\huggingface\\transformers\\16a2f78023c8dc511294f0c97b5e10fde3ef9889ad6d11ffaa2a00714e73926e.cf2d0ecb83b6df91b3dbb53f1d1e4c311578bfd3aa0e04934215a49bf9898df0\n",
      "loading file https://huggingface.co/gpt2/resolve/main/added_tokens.json from cache at None\n",
      "loading file https://huggingface.co/gpt2/resolve/main/special_tokens_map.json from cache at None\n",
      "loading file https://huggingface.co/gpt2/resolve/main/tokenizer_config.json from cache at None\n",
      "loading configuration file https://huggingface.co/gpt2/resolve/main/config.json from cache at C:\\Users\\Foka/.cache\\huggingface\\transformers\\fc674cd6907b4c9e933cb42d67662436b89fa9540a1f40d7c919d0109289ad01.7d2e0efa5ca20cef4fb199382111e9d3ad96fd77b849e1d4bed13a66e1336f51\n",
      "Model config GPT2Config {\n",
      "  \"activation_function\": \"gelu_new\",\n",
      "  \"architectures\": [\n",
      "    \"GPT2LMHeadModel\"\n",
      "  ],\n",
      "  \"attn_pdrop\": 0.1,\n",
      "  \"bos_token_id\": 50256,\n",
      "  \"embd_pdrop\": 0.1,\n",
      "  \"eos_token_id\": 50256,\n",
      "  \"initializer_range\": 0.02,\n",
      "  \"layer_norm_epsilon\": 1e-05,\n",
      "  \"model_type\": \"gpt2\",\n",
      "  \"n_ctx\": 1024,\n",
      "  \"n_embd\": 768,\n",
      "  \"n_head\": 12,\n",
      "  \"n_inner\": null,\n",
      "  \"n_layer\": 12,\n",
      "  \"n_positions\": 1024,\n",
      "  \"reorder_and_upcast_attn\": false,\n",
      "  \"resid_pdrop\": 0.1,\n",
      "  \"scale_attn_by_inverse_layer_idx\": false,\n",
      "  \"scale_attn_weights\": true,\n",
      "  \"summary_activation\": null,\n",
      "  \"summary_first_dropout\": 0.1,\n",
      "  \"summary_proj_to_labels\": true,\n",
      "  \"summary_type\": \"cls_index\",\n",
      "  \"summary_use_proj\": true,\n",
      "  \"task_specific_params\": {\n",
      "    \"text-generation\": {\n",
      "      \"do_sample\": true,\n",
      "      \"max_length\": 50\n",
      "    }\n",
      "  },\n",
      "  \"transformers_version\": \"4.12.5\",\n",
      "  \"use_cache\": true,\n",
      "  \"vocab_size\": 50257\n",
      "}\n",
      "\n",
      "02/17/2022 17:37:45 - INFO - __main__ - Return hidden states from model: True\n",
      "02/17/2022 17:37:45 - INFO - __main__ - Using implementation from: GPT2ForSequenceClassificationCustom\n",
      "loading weights file https://huggingface.co/gpt2/resolve/main/pytorch_model.bin from cache at C:\\Users\\Foka/.cache\\huggingface\\transformers\\752929ace039baa8ef70fe21cdf9ab9445773d20e733cf693d667982e210837e.323c769945a351daa25546176f8208b3004b6f563438a7603e7932bae9025925\n",
      "All model checkpoint weights were used when initializing GPT2ForSequenceClassificationCustom.\n",
      "\n",
      "Some weights of GPT2ForSequenceClassificationCustom were not initialized from the model checkpoint at gpt2 and are newly initialized: ['score.dense_1_input.weight', 'score.dense_2.weight', 'score.dense_2.bias', 'score.out_proj.weight', 'score.dense_1_hidden.weight', 'score.dense_1_hidden.bias', 'score.dense_1_input.bias']\n",
      "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n",
      "02/17/2022 17:37:47 - INFO - __main__ - Freezing model weights\n",
      "Using pad_token, but it is not set yet.\n",
      "02/17/2022 17:37:47 - INFO - __main__ - Set PAD token to EOS: <|endoftext|>\n",
      "\n",
      "Running tokenizer on dataset:   0%|          | 0/5 [00:00<?, ?ba/s]\n",
      "Running tokenizer on dataset: 100%|##########| 5/5 [00:00<00:00, 46.33ba/s]\n",
      "Running tokenizer on dataset: 100%|##########| 5/5 [00:00<00:00, 46.33ba/s]\n",
      "\n",
      "Running tokenizer on dataset:   0%|          | 0/1 [00:00<?, ?ba/s]\n",
      "Running tokenizer on dataset: 100%|##########| 1/1 [00:00<00:00, 83.55ba/s]\n",
      "\n",
      "Running tokenizer on dataset:   0%|          | 0/1 [00:00<?, ?ba/s]\n",
      "Running tokenizer on dataset: 100%|##########| 1/1 [00:00<00:00, 100.09ba/s]\n",
      "02/17/2022 17:37:48 - INFO - __main__ - Sample 4558 of the training set: {'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'input_ids': [31, 7220, 4953, 287, 262, 3223, 329, 616, 717, 2646, 286, 2488, 7220, 543, 318, 2488, 7220, 220, 220, 220, 1303, 41364, 469, 988, 1303, 276, 26240, 23411, 6184, 95, 126, 222, 126, 242, 986, 220], 'labels': 0}.\n",
      "02/17/2022 17:37:48 - INFO - __main__ - Sample 2249 of the training set: {'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'input_ids': [1169, 1306, 1524, 614, 318, 262, 614, 329, 26420, 13, 27214, 126, 253, 126, 246, 5196, 460, 470, 892, 546, 326, 6184, 108, 126, 253, 126, 246, 3907, 1303, 14347, 1303, 1069, 4105, 220, 220, 1303, 37035, 1303, 320, 12756, 1303, 529, 669, 6042, 1303, 260, 10396, 3508, 1251, 1303, 15219], 'labels': 0}.\n",
      "02/17/2022 17:37:48 - INFO - __main__ - Sample 1448 of the training set: {'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'input_ids': [2, 1416, 81, 315, 259, 1096, 1303, 82, 5570, 1222, 696, 26, 1303, 6381, 3455, 1303, 403, 6667, 11203, 540, 1303, 354, 5233, 1303, 2256, 6615, 287, 705, 32243, 1028, 10713, 25, 9265, 6, 220, 220], 'labels': 1}.\n",
      "02/17/2022 17:37:48 - INFO - __main__ - ***** Running training *****\n",
      "02/17/2022 17:37:48 - INFO - __main__ -   Num examples = 4742\n",
      "02/17/2022 17:37:48 - INFO - __main__ -   Num Epochs = 1\n",
      "02/17/2022 17:37:48 - INFO - __main__ -   Instantaneous batch size per device = 32\n",
      "02/17/2022 17:37:48 - INFO - __main__ -   Total train batch size (w. parallel, distributed & accumulation) = 32\n",
      "02/17/2022 17:37:48 - INFO - __main__ -   Gradient Accumulation steps = 1\n",
      "02/17/2022 17:37:48 - INFO - __main__ -   Total optimization steps = 149\n",
      "\n",
      "  0%|          | 0/149 [00:00<?, ?it/s]\n",
      "  1%|          | 1/149 [00:01<03:56,  1.60s/it]\n",
      "  1%|1         | 2/149 [00:02<03:34,  1.46s/it]\n",
      "  2%|2         | 3/149 [00:04<03:39,  1.50s/it]\n",
      "  3%|2         | 4/149 [00:06<03:45,  1.56s/it]\n",
      "  3%|3         | 5/149 [00:08<04:23,  1.83s/it]\n",
      "  4%|4         | 6/149 [00:10<04:38,  1.95s/it]\n",
      "  5%|4         | 7/149 [00:12<04:25,  1.87s/it]\n",
      "  5%|5         | 8/149 [00:15<05:38,  2.40s/it]\n",
      "  6%|6         | 9/149 [00:17<05:01,  2.15s/it]\n",
      "  7%|6         | 10/149 [00:18<04:14,  1.83s/it]\n",
      "  7%|7         | 11/149 [00:20<04:15,  1.85s/it]\n",
      "  8%|8         | 12/149 [00:22<04:06,  1.80s/it]\n",
      "  9%|8         | 13/149 [00:23<04:01,  1.78s/it]\n",
      "  9%|9         | 14/149 [00:25<03:45,  1.67s/it]\n",
      " 10%|#         | 15/149 [00:27<04:19,  1.94s/it]\n",
      " 11%|#         | 16/149 [00:32<06:04,  2.74s/it]\n",
      " 11%|#1        | 17/149 [00:35<06:24,  2.91s/it]\n",
      " 12%|#2        | 18/149 [00:38<06:25,  2.94s/it]\n",
      " 13%|#2        | 19/149 [00:42<06:51,  3.16s/it]\n",
      " 13%|#3        | 20/149 [00:43<05:40,  2.64s/it]\n",
      " 14%|#4        | 21/149 [00:46<05:43,  2.69s/it]\n",
      " 15%|#4        | 22/149 [00:49<05:27,  2.58s/it]\n",
      " 15%|#5        | 23/149 [00:50<04:55,  2.34s/it]\n",
      " 16%|#6        | 24/149 [00:52<04:25,  2.12s/it]\n",
      " 17%|#6        | 25/149 [00:54<04:03,  1.96s/it]\n",
      " 17%|#7        | 26/149 [00:58<05:31,  2.70s/it]\n",
      " 18%|#8        | 27/149 [01:00<04:47,  2.36s/it]\n",
      " 19%|#8        | 28/149 [01:01<04:25,  2.20s/it]\n",
      " 19%|#9        | 29/149 [01:06<05:41,  2.85s/it]\n",
      " 20%|##        | 30/149 [01:07<04:59,  2.52s/it]\n",
      " 21%|##        | 31/149 [01:10<04:42,  2.40s/it]\n",
      " 21%|##1       | 32/149 [01:11<04:10,  2.14s/it]\n",
      " 22%|##2       | 33/149 [01:13<03:50,  1.99s/it]\n",
      " 23%|##2       | 34/149 [01:17<05:09,  2.69s/it]\n",
      " 23%|##3       | 35/149 [01:20<05:08,  2.71s/it]\n",
      " 24%|##4       | 36/149 [01:24<06:03,  3.22s/it]\n",
      " 25%|##4       | 37/149 [01:27<05:31,  2.96s/it]\n",
      " 26%|##5       | 38/149 [01:29<05:04,  2.75s/it]\n",
      " 26%|##6       | 39/149 [01:31<04:47,  2.62s/it]\n",
      " 27%|##6       | 40/149 [01:33<04:18,  2.37s/it]\n",
      " 28%|##7       | 41/149 [01:34<03:43,  2.07s/it]\n",
      " 28%|##8       | 42/149 [01:37<03:48,  2.13s/it]\n",
      " 29%|##8       | 43/149 [01:38<03:22,  1.91s/it]\n",
      " 30%|##9       | 44/149 [01:39<03:06,  1.77s/it]\n",
      " 30%|###       | 45/149 [01:41<02:49,  1.63s/it]\n",
      " 31%|###       | 46/149 [01:43<03:07,  1.82s/it]\n",
      " 32%|###1      | 47/149 [01:44<02:52,  1.69s/it]\n",
      " 32%|###2      | 48/149 [01:46<02:47,  1.66s/it]\n",
      " 33%|###2      | 49/149 [01:48<02:47,  1.67s/it]\n",
      " 34%|###3      | 50/149 [01:50<03:03,  1.86s/it]\n",
      " 34%|###4      | 51/149 [01:53<03:50,  2.35s/it]\n",
      " 35%|###4      | 52/149 [01:56<03:44,  2.31s/it]\n",
      " 36%|###5      | 53/149 [01:58<03:40,  2.30s/it]\n",
      " 36%|###6      | 54/149 [02:00<03:27,  2.18s/it]\n",
      " 37%|###6      | 55/149 [02:03<03:38,  2.33s/it]\n",
      " 38%|###7      | 56/149 [02:05<03:38,  2.34s/it]\n",
      " 38%|###8      | 57/149 [02:07<03:33,  2.32s/it]\n",
      " 39%|###8      | 58/149 [02:11<04:01,  2.65s/it]\n",
      " 40%|###9      | 59/149 [02:13<03:44,  2.50s/it]\n",
      " 40%|####      | 60/149 [02:15<03:38,  2.45s/it]\n",
      " 41%|####      | 61/149 [02:17<03:11,  2.18s/it]\n",
      " 42%|####1     | 62/149 [02:18<02:45,  1.90s/it]\n",
      " 42%|####2     | 63/149 [02:20<02:53,  2.02s/it]\n",
      " 43%|####2     | 64/149 [02:22<02:46,  1.96s/it]\n",
      " 44%|####3     | 65/149 [02:23<02:32,  1.81s/it]\n",
      " 44%|####4     | 66/149 [02:25<02:35,  1.88s/it]\n",
      " 45%|####4     | 67/149 [02:28<02:50,  2.08s/it]\n",
      " 46%|####5     | 68/149 [02:29<02:30,  1.86s/it]\n",
      " 46%|####6     | 69/149 [02:32<02:47,  2.09s/it]\n",
      " 47%|####6     | 70/149 [02:34<02:31,  1.91s/it]\n",
      " 48%|####7     | 71/149 [02:36<02:37,  2.02s/it]\n",
      " 48%|####8     | 72/149 [02:38<02:44,  2.14s/it]\n",
      " 49%|####8     | 73/149 [02:41<02:47,  2.20s/it]\n",
      " 50%|####9     | 74/149 [02:42<02:27,  1.97s/it]\n",
      " 50%|#####     | 75/149 [02:43<02:14,  1.82s/it]\n",
      " 51%|#####1    | 76/149 [02:45<02:05,  1.72s/it]\n",
      " 52%|#####1    | 77/149 [02:47<02:15,  1.88s/it]\n",
      " 52%|#####2    | 78/149 [02:48<02:00,  1.70s/it]\n",
      " 53%|#####3    | 79/149 [02:50<01:50,  1.58s/it]\n",
      " 54%|#####3    | 80/149 [02:52<02:11,  1.90s/it]\n",
      " 54%|#####4    | 81/149 [02:54<02:07,  1.87s/it]\n",
      " 55%|#####5    | 82/149 [02:56<02:12,  1.98s/it]\n",
      " 56%|#####5    | 83/149 [02:59<02:16,  2.07s/it]\n",
      " 56%|#####6    | 84/149 [03:01<02:12,  2.03s/it]\n",
      " 57%|#####7    | 85/149 [03:05<02:50,  2.67s/it]\n",
      " 58%|#####7    | 86/149 [03:06<02:23,  2.27s/it]\n",
      " 58%|#####8    | 87/149 [03:08<02:10,  2.11s/it]\n",
      " 59%|#####9    | 88/149 [03:10<01:59,  1.95s/it]\n",
      " 60%|#####9    | 89/149 [03:12<02:02,  2.04s/it]\n",
      " 60%|######    | 90/149 [03:14<02:01,  2.06s/it]\n",
      " 61%|######1   | 91/149 [03:15<01:51,  1.93s/it]\n",
      " 62%|######1   | 92/149 [03:17<01:40,  1.76s/it]\n",
      " 62%|######2   | 93/149 [03:18<01:29,  1.60s/it]\n",
      " 63%|######3   | 94/149 [03:22<02:13,  2.42s/it]\n",
      " 64%|######3   | 95/149 [03:25<02:15,  2.50s/it]\n",
      " 64%|######4   | 96/149 [03:29<02:37,  2.96s/it]\n",
      " 65%|######5   | 97/149 [03:31<02:17,  2.64s/it]\n",
      " 66%|######5   | 98/149 [03:33<02:03,  2.43s/it]\n",
      " 66%|######6   | 99/149 [03:35<01:51,  2.24s/it]\n",
      " 67%|######7   | 100/149 [03:37<01:49,  2.24s/it]\n",
      " 68%|######7   | 101/149 [03:38<01:35,  2.00s/it]\n",
      " 68%|######8   | 102/149 [03:40<01:23,  1.79s/it]\n",
      " 69%|######9   | 103/149 [03:44<01:54,  2.49s/it]\n",
      " 70%|######9   | 104/149 [03:46<01:42,  2.29s/it]\n",
      " 70%|#######   | 105/149 [03:48<01:40,  2.28s/it]\n",
      " 71%|#######1  | 106/149 [03:49<01:24,  1.96s/it]\n",
      " 72%|#######1  | 107/149 [03:51<01:27,  2.08s/it]\n",
      " 72%|#######2  | 108/149 [03:53<01:22,  2.02s/it]\n",
      " 73%|#######3  | 109/149 [03:55<01:16,  1.92s/it]\n",
      " 74%|#######3  | 110/149 [03:57<01:20,  2.06s/it]\n",
      " 74%|#######4  | 111/149 [03:59<01:16,  2.01s/it]\n",
      " 75%|#######5  | 112/149 [04:02<01:18,  2.11s/it]\n",
      " 76%|#######5  | 113/149 [04:04<01:15,  2.10s/it]\n",
      " 77%|#######6  | 114/149 [04:06<01:18,  2.25s/it]\n",
      " 77%|#######7  | 115/149 [04:08<01:13,  2.15s/it]\n",
      " 78%|#######7  | 116/149 [04:11<01:16,  2.33s/it]\n",
      " 79%|#######8  | 117/149 [04:14<01:22,  2.59s/it]\n",
      " 79%|#######9  | 118/149 [04:16<01:15,  2.44s/it]\n",
      " 80%|#######9  | 119/149 [04:18<01:03,  2.11s/it]\n",
      " 81%|########  | 120/149 [04:19<00:57,  1.97s/it]\n",
      " 81%|########1 | 121/149 [04:21<00:51,  1.83s/it]\n",
      " 82%|########1 | 122/149 [04:22<00:45,  1.68s/it]\n",
      " 83%|########2 | 123/149 [04:24<00:48,  1.86s/it]\n",
      " 83%|########3 | 124/149 [04:27<00:48,  1.94s/it]\n",
      " 84%|########3 | 125/149 [04:28<00:42,  1.79s/it]\n",
      " 85%|########4 | 126/149 [04:30<00:42,  1.84s/it]\n",
      " 85%|########5 | 127/149 [04:31<00:38,  1.74s/it]\n",
      " 86%|########5 | 128/149 [04:33<00:33,  1.58s/it]\n",
      " 87%|########6 | 129/149 [04:35<00:35,  1.78s/it]\n",
      " 87%|########7 | 130/149 [04:38<00:42,  2.23s/it]\n",
      " 88%|########7 | 131/149 [04:39<00:34,  1.93s/it]\n",
      " 89%|########8 | 132/149 [04:41<00:30,  1.77s/it]\n",
      " 89%|########9 | 133/149 [04:43<00:29,  1.83s/it]\n",
      " 90%|########9 | 134/149 [04:44<00:24,  1.65s/it]\n",
      " 91%|######### | 135/149 [04:46<00:24,  1.72s/it]\n",
      " 91%|#########1| 136/149 [04:47<00:21,  1.69s/it]\n",
      " 92%|#########1| 137/149 [04:50<00:22,  1.86s/it]\n",
      " 93%|#########2| 138/149 [04:51<00:19,  1.74s/it]\n",
      " 93%|#########3| 139/149 [04:55<00:23,  2.37s/it]\n",
      " 94%|#########3| 140/149 [04:57<00:20,  2.29s/it]\n",
      " 95%|#########4| 141/149 [04:59<00:16,  2.06s/it]\n",
      " 95%|#########5| 142/149 [05:00<00:13,  1.97s/it]\n",
      " 96%|#########5| 143/149 [05:02<00:10,  1.83s/it]\n",
      " 97%|#########6| 144/149 [05:04<00:09,  1.96s/it]\n",
      " 97%|#########7| 145/149 [05:06<00:08,  2.01s/it]\n",
      " 98%|#########7| 146/149 [05:08<00:05,  1.87s/it]\n",
      " 99%|#########8| 147/149 [05:11<00:04,  2.17s/it]\n",
      " 99%|#########9| 148/149 [05:13<00:02,  2.22s/it]\n",
      "100%|##########| 149/149 [05:13<00:00,  1.62s/it]02/17/2022 17:43:39 - INFO - __main__ - Epoch 0: {'accuracy': 0.888}\n",
      "02/17/2022 17:44:11 - INFO - __main__ - Test-set evaluation: {'accuracy': 0.914}\n",
      "Configuration saved in out/tweet/gpt2_version_5\\config.json\n",
      "Model weights saved in out/tweet/gpt2_version_5\\pytorch_model.bin\n",
      "tokenizer config file saved in out/tweet/gpt2_version_5\\tokenizer_config.json\n",
      "Special tokens file saved in out/tweet/gpt2_version_5\\special_tokens_map.json\n",
      "\n",
      "100%|##########| 149/149 [06:23<00:00,  2.57s/it]\n"
     ]
    }
   ],
   "source": [
    "!python run_glue_no_trainer.py \\\n",
    "  --model_name_or_path gpt2 \\\n",
    "  --train_file data/train.json  \\\n",
    "  --validation_file data/valid.json \\\n",
    "  --test_file data/test.json \\\n",
    "  --per_device_train_batch_size 32 \\\n",
    "  --per_device_eval_batch_size 32 \\\n",
    "  --max_length 128 \\\n",
    "  --freeze_model \\\n",
    "  --custom_model \\\n",
    "  --return_hidden_states \\\n",
    "  --learning_rate 2e-5 \\\n",
    "  --num_train_epochs 1 \\\n",
    "  --output_dir out/tweet/gpt2_version_5"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# RoBERTa"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "02/16/2022 00:45:12 - INFO - __main__ - Distributed environment: NO\n",
      "Num processes: 1\n",
      "Process index: 0\n",
      "Local process index: 0\n",
      "Device: cpu\n",
      "Use FP16 precision: False\n",
      "\n",
      "02/16/2022 00:45:12 - WARNING - datasets.builder - Using custom data configuration default-67c9d932a627b7b8\n",
      "02/16/2022 00:45:12 - WARNING - datasets.builder - Reusing dataset json (C:\\Users\\Foka\\.cache\\huggingface\\datasets\\json\\default-67c9d932a627b7b8\\0.0.0\\c2d554c3377ea79c7664b93dc65d0803b45e3279000f993c7bfd18937fd7f426)\n",
      "\n",
      "  0%|          | 0/3 [00:00<?, ?it/s]\n",
      "100%|##########| 3/3 [00:00<00:00, 1503.87it/s]\n",
      "loading configuration file https://huggingface.co/roberta-base/resolve/main/config.json from cache at C:\\Users\\Foka/.cache\\huggingface\\transformers\\733bade19e5f0ce98e6531021dd5180994bb2f7b8bd7e80c7968805834ba351e.35205c6cfc956461d8515139f0f8dd5d207a2f336c0c3a83b4bc8dca3518e37b\n",
      "Model config RobertaConfig {\n",
      "  \"architectures\": [\n",
      "    \"RobertaForMaskedLM\"\n",
      "  ],\n",
      "  \"attention_probs_dropout_prob\": 0.1,\n",
      "  \"bos_token_id\": 0,\n",
      "  \"classifier_dropout\": null,\n",
      "  \"eos_token_id\": 2,\n",
      "  \"hidden_act\": \"gelu\",\n",
      "  \"hidden_dropout_prob\": 0.1,\n",
      "  \"hidden_size\": 768,\n",
      "  \"initializer_range\": 0.02,\n",
      "  \"intermediate_size\": 3072,\n",
      "  \"layer_norm_eps\": 1e-05,\n",
      "  \"max_position_embeddings\": 514,\n",
      "  \"model_type\": \"roberta\",\n",
      "  \"num_attention_heads\": 12,\n",
      "  \"num_hidden_layers\": 12,\n",
      "  \"pad_token_id\": 1,\n",
      "  \"position_embedding_type\": \"absolute\",\n",
      "  \"transformers_version\": \"4.12.5\",\n",
      "  \"type_vocab_size\": 1,\n",
      "  \"use_cache\": true,\n",
      "  \"vocab_size\": 50265\n",
      "}\n",
      "\n",
      "Could not locate the tokenizer configuration file, will try to use the model config instead.\n",
      "loading configuration file https://huggingface.co/roberta-base/resolve/main/config.json from cache at C:\\Users\\Foka/.cache\\huggingface\\transformers\\733bade19e5f0ce98e6531021dd5180994bb2f7b8bd7e80c7968805834ba351e.35205c6cfc956461d8515139f0f8dd5d207a2f336c0c3a83b4bc8dca3518e37b\n",
      "Model config RobertaConfig {\n",
      "  \"architectures\": [\n",
      "    \"RobertaForMaskedLM\"\n",
      "  ],\n",
      "  \"attention_probs_dropout_prob\": 0.1,\n",
      "  \"bos_token_id\": 0,\n",
      "  \"classifier_dropout\": null,\n",
      "  \"eos_token_id\": 2,\n",
      "  \"hidden_act\": \"gelu\",\n",
      "  \"hidden_dropout_prob\": 0.1,\n",
      "  \"hidden_size\": 768,\n",
      "  \"initializer_range\": 0.02,\n",
      "  \"intermediate_size\": 3072,\n",
      "  \"layer_norm_eps\": 1e-05,\n",
      "  \"max_position_embeddings\": 514,\n",
      "  \"model_type\": \"roberta\",\n",
      "  \"num_attention_heads\": 12,\n",
      "  \"num_hidden_layers\": 12,\n",
      "  \"pad_token_id\": 1,\n",
      "  \"position_embedding_type\": \"absolute\",\n",
      "  \"transformers_version\": \"4.12.5\",\n",
      "  \"type_vocab_size\": 1,\n",
      "  \"use_cache\": true,\n",
      "  \"vocab_size\": 50265\n",
      "}\n",
      "\n",
      "loading file https://huggingface.co/roberta-base/resolve/main/vocab.json from cache at C:\\Users\\Foka/.cache\\huggingface\\transformers\\d3ccdbfeb9aaa747ef20432d4976c32ee3fa69663b379deb253ccfce2bb1fdc5.d67d6b367eb24ab43b08ad55e014cf254076934f71d832bbab9ad35644a375ab\n",
      "loading file https://huggingface.co/roberta-base/resolve/main/merges.txt from cache at C:\\Users\\Foka/.cache\\huggingface\\transformers\\cafdecc90fcab17011e12ac813dd574b4b3fea39da6dd817813efa010262ff3f.5d12962c5ee615a4c803841266e9c3be9a691a924f72d395d3a6c6c81157788b\n",
      "loading file https://huggingface.co/roberta-base/resolve/main/tokenizer.json from cache at C:\\Users\\Foka/.cache\\huggingface\\transformers\\d53fc0fa09b8342651efd4073d75e19617b3e51287c2a535becda5808a8db287.fc9576039592f026ad76a1c231b89aee8668488c671dfbe6616bab2ed298d730\n",
      "loading file https://huggingface.co/roberta-base/resolve/main/added_tokens.json from cache at None\n",
      "loading file https://huggingface.co/roberta-base/resolve/main/special_tokens_map.json from cache at None\n",
      "loading file https://huggingface.co/roberta-base/resolve/main/tokenizer_config.json from cache at None\n",
      "loading configuration file https://huggingface.co/roberta-base/resolve/main/config.json from cache at C:\\Users\\Foka/.cache\\huggingface\\transformers\\733bade19e5f0ce98e6531021dd5180994bb2f7b8bd7e80c7968805834ba351e.35205c6cfc956461d8515139f0f8dd5d207a2f336c0c3a83b4bc8dca3518e37b\n",
      "Model config RobertaConfig {\n",
      "  \"architectures\": [\n",
      "    \"RobertaForMaskedLM\"\n",
      "  ],\n",
      "  \"attention_probs_dropout_prob\": 0.1,\n",
      "  \"bos_token_id\": 0,\n",
      "  \"classifier_dropout\": null,\n",
      "  \"eos_token_id\": 2,\n",
      "  \"hidden_act\": \"gelu\",\n",
      "  \"hidden_dropout_prob\": 0.1,\n",
      "  \"hidden_size\": 768,\n",
      "  \"initializer_range\": 0.02,\n",
      "  \"intermediate_size\": 3072,\n",
      "  \"layer_norm_eps\": 1e-05,\n",
      "  \"max_position_embeddings\": 514,\n",
      "  \"model_type\": \"roberta\",\n",
      "  \"num_attention_heads\": 12,\n",
      "  \"num_hidden_layers\": 12,\n",
      "  \"pad_token_id\": 1,\n",
      "  \"position_embedding_type\": \"absolute\",\n",
      "  \"transformers_version\": \"4.12.5\",\n",
      "  \"type_vocab_size\": 1,\n",
      "  \"use_cache\": true,\n",
      "  \"vocab_size\": 50265\n",
      "}\n",
      "\n",
      "02/16/2022 00:45:18 - INFO - __main__ - Return hidden states from model: False\n",
      "02/16/2022 00:45:18 - INFO - __main__ - Using implementation from: AutoModelForSequenceClassification\n",
      "loading weights file https://huggingface.co/roberta-base/resolve/main/pytorch_model.bin from cache at C:\\Users\\Foka/.cache\\huggingface\\transformers\\51ba668f7ff34e7cdfa9561e8361747738113878850a7d717dbc69de8683aaad.c7efaa30a0d80b2958b876969faa180e485944a849deee4ad482332de65365a7\n",
      "Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'lm_head.bias', 'lm_head.layer_norm.bias', 'roberta.pooler.dense.weight', 'lm_head.decoder.weight', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.weight']\n",
      "- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).\n",
      "- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n",
      "Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.out_proj.weight', 'classifier.dense.weight', 'classifier.dense.bias', 'classifier.out_proj.bias']\n",
      "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n",
      "02/16/2022 00:45:20 - WARNING - datasets.arrow_dataset - Loading cached processed dataset at C:\\Users\\Foka\\.cache\\huggingface\\datasets\\json\\default-67c9d932a627b7b8\\0.0.0\\c2d554c3377ea79c7664b93dc65d0803b45e3279000f993c7bfd18937fd7f426\\cache-9bed43ed70dc0bb2.arrow\n",
      "\n",
      "Running tokenizer on dataset:   0%|          | 0/1 [00:00<?, ?ba/s]\n",
      "Running tokenizer on dataset: 100%|##########| 1/1 [00:00<00:00, 35.81ba/s]\n",
      "02/16/2022 00:45:20 - WARNING - datasets.arrow_dataset - Loading cached processed dataset at C:\\Users\\Foka\\.cache\\huggingface\\datasets\\json\\default-67c9d932a627b7b8\\0.0.0\\c2d554c3377ea79c7664b93dc65d0803b45e3279000f993c7bfd18937fd7f426\\cache-a7293927c8abf169.arrow\n",
      "02/16/2022 00:45:20 - INFO - __main__ - Sample 528 of the training set: {'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'input_ids': [0, 9226, 269, 16, 103, 9869, 138, 47, 33, 89, 6, 4716, 1827, 8, 787, 12105, 157, 626, 4, 1437, 1437, 849, 22122, 991, 30619, 849, 21363, 46730, 219, 2], 'labels': 1}.\n",
      "02/16/2022 00:45:20 - INFO - __main__ - Sample 3981 of the training set: {'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'input_ids': [0, 39398, 4056, 4333, 4056, 10674, 4056, 46, 849, 17693, 849, 16063, 1073, 5715, 849, 17827, 20168, 4183, 857, 299, 4, 35103, 849, 90, 25933, 849, 438, 4467, 849, 1794, 849, 28878, 16170, 849, 28481, 1794, 1437, 1437, 849, 28481, 3695, 4056, 7471, 4056, 18164, 1437, 2], 'labels': 0}.\n",
      "02/16/2022 00:45:20 - INFO - __main__ - Sample 4184 of the training set: {'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'input_ids': [0, 2716, 18, 4076, 103, 849, 267, 6988, 428, 33726, 849, 1452, 10071, 849, 1452, 10071, 9029, 849, 4082, 5536, 11819, 849, 10393, 19347, 849, 37096, 1437, 1437, 849, 31518, 849, 1193, 366, 3695, 4056, 7471, 4056, 18164, 1437, 2], 'labels': 0}.\n",
      "02/16/2022 00:45:21 - INFO - __main__ - ***** Running training *****\n",
      "02/16/2022 00:45:21 - INFO - __main__ -   Num examples = 4742\n",
      "02/16/2022 00:45:21 - INFO - __main__ -   Num Epochs = 1\n",
      "02/16/2022 00:45:21 - INFO - __main__ -   Instantaneous batch size per device = 24\n",
      "02/16/2022 00:45:21 - INFO - __main__ -   Total train batch size (w. parallel, distributed & accumulation) = 24\n",
      "02/16/2022 00:45:21 - INFO - __main__ -   Gradient Accumulation steps = 1\n",
      "02/16/2022 00:45:21 - INFO - __main__ -   Total optimization steps = 198\n",
      "\n",
      "  0%|          | 0/198 [00:00<?, ?it/s]\n",
      "  1%|          | 1/198 [00:03<12:09,  3.70s/it]\n",
      "  1%|1         | 2/198 [00:06<09:32,  2.92s/it]\n",
      "  2%|1         | 3/198 [00:08<08:22,  2.57s/it]\n",
      "  2%|2         | 4/198 [00:14<12:25,  3.84s/it]\n",
      "  3%|2         | 5/198 [00:17<12:05,  3.76s/it]\n",
      "  3%|3         | 6/198 [00:20<10:48,  3.38s/it]\n",
      "  4%|3         | 7/198 [00:23<11:00,  3.46s/it]\n",
      "  4%|4         | 8/198 [00:27<11:16,  3.56s/it]\n",
      "  5%|4         | 9/198 [00:30<10:53,  3.46s/it]\n",
      "  5%|5         | 10/198 [00:33<10:11,  3.25s/it]\n",
      "  6%|5         | 11/198 [00:36<09:26,  3.03s/it]\n",
      "  6%|6         | 12/198 [00:38<09:01,  2.91s/it]\n",
      "  7%|6         | 13/198 [00:42<09:18,  3.02s/it]\n",
      "  7%|7         | 14/198 [00:45<09:07,  2.98s/it]\n",
      "  8%|7         | 15/198 [00:47<08:47,  2.88s/it]\n",
      "  8%|8         | 16/198 [00:49<08:05,  2.67s/it]\n",
      "  9%|8         | 17/198 [00:52<07:52,  2.61s/it]\n",
      "  9%|9         | 18/198 [00:55<08:38,  2.88s/it]\n",
      " 10%|9         | 19/198 [00:58<08:41,  2.91s/it]\n",
      " 10%|#         | 20/198 [01:01<08:23,  2.83s/it]\n",
      " 11%|#         | 21/198 [01:04<08:14,  2.79s/it]\n",
      " 11%|#1        | 22/198 [01:07<08:45,  2.99s/it]\n",
      " 12%|#1        | 23/198 [01:11<09:15,  3.17s/it]\n",
      " 12%|#2        | 24/198 [01:17<11:41,  4.03s/it]\n",
      " 13%|#2        | 25/198 [01:21<11:51,  4.11s/it]\n",
      " 13%|#3        | 26/198 [01:25<11:24,  3.98s/it]\n",
      " 14%|#3        | 27/198 [01:27<09:59,  3.50s/it]\n",
      " 14%|#4        | 28/198 [01:30<09:05,  3.21s/it]\n",
      " 15%|#4        | 29/198 [01:32<08:37,  3.06s/it]\n",
      " 15%|#5        | 30/198 [01:35<08:19,  2.97s/it]\n",
      " 16%|#5        | 31/198 [01:39<09:09,  3.29s/it]\n",
      " 16%|#6        | 32/198 [01:42<08:22,  3.03s/it]\n",
      " 17%|#6        | 33/198 [01:44<07:45,  2.82s/it]\n",
      " 17%|#7        | 34/198 [01:47<07:36,  2.78s/it]\n",
      " 18%|#7        | 35/198 [01:50<07:55,  2.92s/it]\n",
      " 18%|#8        | 36/198 [01:53<07:49,  2.90s/it]\n",
      " 19%|#8        | 37/198 [01:59<10:23,  3.87s/it]\n",
      " 19%|#9        | 38/198 [02:03<10:44,  4.03s/it]\n",
      " 20%|#9        | 39/198 [02:06<09:27,  3.57s/it]\n",
      " 20%|##        | 40/198 [02:09<09:31,  3.62s/it]\n",
      " 21%|##        | 41/198 [02:13<09:30,  3.63s/it]\n",
      " 21%|##1       | 42/198 [02:16<08:53,  3.42s/it]\n",
      " 22%|##1       | 43/198 [02:22<10:45,  4.17s/it]\n",
      " 22%|##2       | 44/198 [02:26<10:21,  4.04s/it]\n",
      " 23%|##2       | 45/198 [02:28<09:18,  3.65s/it]\n",
      " 23%|##3       | 46/198 [02:31<08:46,  3.46s/it]\n",
      " 24%|##3       | 47/198 [02:34<07:59,  3.18s/it]\n",
      " 24%|##4       | 48/198 [02:36<07:27,  2.98s/it]\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      " 25%|##4       | 49/198 [02:40<07:56,  3.20s/it]\n",
      " 25%|##5       | 50/198 [02:43<07:30,  3.04s/it]\n",
      " 26%|##5       | 51/198 [02:47<07:55,  3.24s/it]\n",
      " 26%|##6       | 52/198 [02:50<08:15,  3.40s/it]\n",
      " 27%|##6       | 53/198 [02:53<07:46,  3.22s/it]\n",
      " 27%|##7       | 54/198 [02:55<06:56,  2.90s/it]\n",
      " 28%|##7       | 55/198 [03:01<09:10,  3.85s/it]\n",
      " 28%|##8       | 56/198 [03:04<08:23,  3.55s/it]\n",
      " 29%|##8       | 57/198 [03:08<08:11,  3.49s/it]\n",
      " 29%|##9       | 58/198 [03:11<07:48,  3.35s/it]\n",
      " 30%|##9       | 59/198 [03:15<08:45,  3.78s/it]\n",
      " 30%|###       | 60/198 [03:18<07:58,  3.47s/it]\n",
      " 31%|###       | 61/198 [03:24<09:37,  4.22s/it]\n",
      " 31%|###1      | 62/198 [03:27<08:49,  3.89s/it]\n",
      " 32%|###1      | 63/198 [03:30<07:56,  3.53s/it]\n",
      " 32%|###2      | 64/198 [03:33<07:37,  3.41s/it]\n",
      " 33%|###2      | 65/198 [03:37<07:43,  3.49s/it]\n",
      " 33%|###3      | 66/198 [03:39<06:54,  3.14s/it]\n",
      " 34%|###3      | 67/198 [03:42<06:38,  3.04s/it]\n",
      " 34%|###4      | 68/198 [03:46<07:15,  3.35s/it]\n",
      " 35%|###4      | 69/198 [03:50<07:50,  3.65s/it]\n",
      " 35%|###5      | 70/198 [03:54<07:33,  3.55s/it]\n",
      " 36%|###5      | 71/198 [03:57<07:24,  3.50s/it]\n",
      " 36%|###6      | 72/198 [03:59<06:45,  3.22s/it]\n",
      " 37%|###6      | 73/198 [04:04<07:14,  3.47s/it]\n",
      " 37%|###7      | 74/198 [04:06<06:32,  3.16s/it]\n",
      " 38%|###7      | 75/198 [04:08<06:00,  2.93s/it]\n",
      " 38%|###8      | 76/198 [04:11<05:40,  2.79s/it]\n",
      " 39%|###8      | 77/198 [04:15<06:27,  3.20s/it]\n",
      " 39%|###9      | 78/198 [04:18<06:19,  3.16s/it]\n",
      " 40%|###9      | 79/198 [04:21<05:55,  2.99s/it]\n",
      " 40%|####      | 80/198 [04:24<05:56,  3.02s/it]\n",
      " 41%|####      | 81/198 [04:28<06:44,  3.46s/it]\n",
      " 41%|####1     | 82/198 [04:31<06:15,  3.24s/it]\n",
      " 42%|####1     | 83/198 [04:34<05:59,  3.12s/it]\n",
      " 42%|####2     | 84/198 [04:36<05:35,  2.94s/it]\n",
      " 43%|####2     | 85/198 [04:42<07:11,  3.82s/it]\n",
      " 43%|####3     | 86/198 [04:44<06:13,  3.34s/it]\n",
      " 44%|####3     | 87/198 [04:47<05:43,  3.09s/it]\n",
      " 44%|####4     | 88/198 [04:49<05:12,  2.84s/it]\n",
      " 45%|####4     | 89/198 [04:52<05:11,  2.86s/it]\n",
      " 45%|####5     | 90/198 [04:56<05:52,  3.27s/it]\n",
      " 46%|####5     | 91/198 [04:59<05:29,  3.08s/it]\n",
      " 46%|####6     | 92/198 [05:02<05:28,  3.10s/it]\n",
      " 47%|####6     | 93/198 [05:04<05:01,  2.87s/it]\n",
      " 47%|####7     | 94/198 [05:07<04:58,  2.87s/it]\n",
      " 48%|####7     | 95/198 [05:11<05:34,  3.25s/it]\n",
      " 48%|####8     | 96/198 [05:15<05:44,  3.38s/it]\n",
      " 49%|####8     | 97/198 [05:19<05:49,  3.46s/it]\n",
      " 49%|####9     | 98/198 [05:22<05:51,  3.51s/it]\n",
      " 50%|#####     | 99/198 [05:25<05:16,  3.20s/it]\n",
      " 51%|#####     | 100/198 [05:29<05:27,  3.34s/it]\n",
      " 51%|#####1    | 101/198 [05:32<05:35,  3.46s/it]\n",
      " 52%|#####1    | 102/198 [05:35<05:14,  3.28s/it]\n",
      " 52%|#####2    | 103/198 [05:37<04:40,  2.95s/it]\n",
      " 53%|#####2    | 104/198 [05:40<04:40,  2.98s/it]\n",
      " 53%|#####3    | 105/198 [05:42<04:13,  2.73s/it]\n",
      " 54%|#####3    | 106/198 [05:45<04:12,  2.74s/it]\n",
      " 54%|#####4    | 107/198 [05:48<04:05,  2.70s/it]\n",
      " 55%|#####4    | 108/198 [05:51<04:05,  2.73s/it]\n",
      " 55%|#####5    | 109/198 [05:54<04:08,  2.80s/it]\n",
      " 56%|#####5    | 110/198 [05:57<04:20,  2.96s/it]\n",
      " 56%|#####6    | 111/198 [06:00<04:24,  3.04s/it]\n",
      " 57%|#####6    | 112/198 [06:04<04:37,  3.23s/it]\n",
      " 57%|#####7    | 113/198 [06:07<04:25,  3.12s/it]\n",
      " 58%|#####7    | 114/198 [06:09<04:12,  3.01s/it]\n",
      " 58%|#####8    | 115/198 [06:13<04:23,  3.18s/it]\n",
      " 59%|#####8    | 116/198 [06:16<04:19,  3.16s/it]\n",
      " 59%|#####9    | 117/198 [06:19<04:11,  3.11s/it]\n",
      " 60%|#####9    | 118/198 [06:23<04:26,  3.33s/it]\n",
      " 60%|######    | 119/198 [06:25<03:57,  3.00s/it]\n",
      " 61%|######    | 120/198 [06:29<04:09,  3.20s/it]\n",
      " 61%|######1   | 121/198 [06:33<04:17,  3.34s/it]\n",
      " 62%|######1   | 122/198 [06:39<05:14,  4.14s/it]\n",
      " 62%|######2   | 123/198 [06:43<05:09,  4.12s/it]\n",
      " 63%|######2   | 124/198 [06:45<04:35,  3.73s/it]\n",
      " 63%|######3   | 125/198 [06:48<04:11,  3.44s/it]\n",
      " 64%|######3   | 126/198 [06:51<03:46,  3.14s/it]\n",
      " 64%|######4   | 127/198 [06:54<03:53,  3.29s/it]\n",
      " 65%|######4   | 128/198 [06:57<03:39,  3.14s/it]\n",
      " 65%|######5   | 129/198 [07:00<03:33,  3.09s/it]\n",
      " 66%|######5   | 130/198 [07:03<03:27,  3.05s/it]\n",
      " 66%|######6   | 131/198 [07:06<03:14,  2.91s/it]\n",
      " 67%|######6   | 132/198 [07:08<03:02,  2.77s/it]\n",
      " 67%|######7   | 133/198 [07:10<02:52,  2.65s/it]\n",
      " 68%|######7   | 134/198 [07:13<02:43,  2.56s/it]\n",
      " 68%|######8   | 135/198 [07:16<02:55,  2.78s/it]\n",
      " 69%|######8   | 136/198 [07:19<02:48,  2.71s/it]\n",
      " 69%|######9   | 137/198 [07:22<03:04,  3.02s/it]\n",
      " 70%|######9   | 138/198 [07:28<03:43,  3.72s/it]\n",
      " 70%|#######   | 139/198 [07:31<03:29,  3.56s/it]\n",
      " 71%|#######   | 140/198 [07:34<03:26,  3.56s/it]\n",
      " 71%|#######1  | 141/198 [07:37<03:12,  3.37s/it]\n",
      " 72%|#######1  | 142/198 [07:41<03:12,  3.44s/it]\n",
      " 72%|#######2  | 143/198 [07:45<03:12,  3.50s/it]\n",
      " 73%|#######2  | 144/198 [07:47<02:56,  3.27s/it]\n",
      " 73%|#######3  | 145/198 [07:50<02:46,  3.14s/it]\n",
      " 74%|#######3  | 146/198 [07:53<02:36,  3.00s/it]\n",
      " 74%|#######4  | 147/198 [07:55<02:25,  2.85s/it]\n",
      " 75%|#######4  | 148/198 [07:59<02:34,  3.08s/it]\n",
      " 75%|#######5  | 149/198 [08:03<02:40,  3.27s/it]\n",
      " 76%|#######5  | 150/198 [08:05<02:23,  2.99s/it]\n",
      " 76%|#######6  | 151/198 [08:09<02:30,  3.21s/it]\n",
      " 77%|#######6  | 152/198 [08:12<02:33,  3.34s/it]\n",
      " 77%|#######7  | 153/198 [08:15<02:21,  3.14s/it]\n",
      " 78%|#######7  | 154/198 [08:19<02:25,  3.31s/it]\n",
      " 78%|#######8  | 155/198 [08:21<02:12,  3.08s/it]\n",
      " 79%|#######8  | 156/198 [08:24<01:58,  2.81s/it]\n",
      " 79%|#######9  | 157/198 [08:27<02:07,  3.10s/it]\n",
      " 80%|#######9  | 158/198 [08:30<02:00,  3.01s/it]\n",
      " 80%|########  | 159/198 [08:35<02:17,  3.52s/it]\n",
      " 81%|########  | 160/198 [08:37<02:01,  3.20s/it]\n",
      " 81%|########1 | 161/198 [08:40<01:51,  3.02s/it]\n",
      " 82%|########1 | 162/198 [08:45<02:10,  3.62s/it]\n",
      " 82%|########2 | 163/198 [08:48<02:02,  3.51s/it]\n",
      " 83%|########2 | 164/198 [08:52<02:02,  3.59s/it]\n",
      " 83%|########3 | 165/198 [08:58<02:22,  4.31s/it]\n",
      " 84%|########3 | 166/198 [09:00<02:00,  3.75s/it]\n",
      " 84%|########4 | 167/198 [09:03<01:42,  3.30s/it]\n",
      " 85%|########4 | 168/198 [09:06<01:41,  3.39s/it]\n",
      " 85%|########5 | 169/198 [09:09<01:33,  3.21s/it]\n",
      " 86%|########5 | 170/198 [09:12<01:25,  3.06s/it]\n",
      " 86%|########6 | 171/198 [09:17<01:38,  3.67s/it]\n",
      " 87%|########6 | 172/198 [09:19<01:27,  3.38s/it]\n",
      " 87%|########7 | 173/198 [09:25<01:41,  4.07s/it]\n",
      " 88%|########7 | 174/198 [09:28<01:28,  3.68s/it]\n",
      " 88%|########8 | 175/198 [09:31<01:22,  3.60s/it]\n",
      " 89%|########8 | 176/198 [09:35<01:21,  3.69s/it]\n",
      " 89%|########9 | 177/198 [09:38<01:09,  3.31s/it]\n",
      " 90%|########9 | 178/198 [09:40<00:59,  2.97s/it]\n",
      " 90%|######### | 179/198 [09:43<01:00,  3.17s/it]\n",
      " 91%|######### | 180/198 [09:47<00:58,  3.25s/it]\n",
      " 91%|#########1| 181/198 [09:50<00:52,  3.07s/it]\n",
      " 92%|#########1| 182/198 [09:53<00:49,  3.10s/it]\n",
      " 92%|#########2| 183/198 [09:55<00:41,  2.79s/it]\n",
      " 93%|#########2| 184/198 [09:58<00:42,  3.03s/it]\n",
      " 93%|#########3| 185/198 [10:01<00:36,  2.84s/it]\n",
      " 94%|#########3| 186/198 [10:03<00:31,  2.66s/it]\n",
      " 94%|#########4| 187/198 [10:05<00:27,  2.53s/it]\n",
      " 95%|#########4| 188/198 [10:08<00:25,  2.57s/it]\n",
      " 95%|#########5| 189/198 [10:13<00:29,  3.25s/it]\n",
      " 96%|#########5| 190/198 [10:16<00:26,  3.36s/it]\n",
      " 96%|#########6| 191/198 [10:19<00:21,  3.03s/it]\n",
      " 97%|#########6| 192/198 [10:22<00:18,  3.13s/it]\n",
      " 97%|#########7| 193/198 [10:24<00:14,  2.91s/it]\n",
      " 98%|#########7| 194/198 [10:28<00:12,  3.13s/it]\n",
      " 98%|#########8| 195/198 [10:31<00:09,  3.06s/it]\n",
      " 99%|#########8| 196/198 [10:36<00:07,  3.64s/it]\n",
      " 99%|#########9| 197/198 [10:39<00:03,  3.58s/it]\n",
      "100%|##########| 198/198 [10:41<00:00,  2.92s/it]02/16/2022 00:56:30 - INFO - __main__ - Epoch 0: {'accuracy': 0.948}\n",
      "02/16/2022 00:56:53 - INFO - __main__ - Test-set evaluation: {'accuracy': 0.942}\n",
      "Configuration saved in out/tweet/roberta\\config.json\n",
      "Model weights saved in out/tweet/roberta\\pytorch_model.bin\n",
      "tokenizer config file saved in out/tweet/roberta\\tokenizer_config.json\n",
      "Special tokens file saved in out/tweet/roberta\\special_tokens_map.json\n",
      "\n",
      "100%|##########| 198/198 [11:32<00:00,  3.50s/it]\n"
     ]
    }
   ],
   "source": [
    "# Fine-tune roberta-base for one epoch with run_glue_no_trainer.py on the local JSON\n",
    "# train/valid/test splits; the model and tokenizer files are saved under out/tweet/roberta.\n",
    "!python run_glue_no_trainer.py \\\n",
    "  --model_name_or_path roberta-base \\\n",
    "  --train_file data/train.json  \\\n",
    "  --validation_file data/valid.json \\\n",
    "  --test_file data/test.json \\\n",
    "  --per_device_train_batch_size 24 \\\n",
    "  --per_device_eval_batch_size 24 \\\n",
    "  --max_length 128 \\\n",
    "  --learning_rate 2e-5 \\\n",
    "  --num_train_epochs 1 \\\n",
    "  --output_dir out/tweet/roberta"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Roberta version 2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "02/16/2022 00:56:55 - INFO - __main__ - Distributed environment: NO\n",
      "Num processes: 1\n",
      "Process index: 0\n",
      "Local process index: 0\n",
      "Device: cpu\n",
      "Use FP16 precision: False\n",
      "\n",
      "02/16/2022 00:56:56 - WARNING - datasets.builder - Using custom data configuration default-67c9d932a627b7b8\n",
      "02/16/2022 00:56:56 - WARNING - datasets.builder - Reusing dataset json (C:\\Users\\Foka\\.cache\\huggingface\\datasets\\json\\default-67c9d932a627b7b8\\0.0.0\\c2d554c3377ea79c7664b93dc65d0803b45e3279000f993c7bfd18937fd7f426)\n",
      "\n",
      "  0%|          | 0/3 [00:00<?, ?it/s]\n",
      "100%|##########| 3/3 [00:00<00:00, 1504.59it/s]\n",
      "loading configuration file https://huggingface.co/roberta-base/resolve/main/config.json from cache at C:\\Users\\Foka/.cache\\huggingface\\transformers\\733bade19e5f0ce98e6531021dd5180994bb2f7b8bd7e80c7968805834ba351e.35205c6cfc956461d8515139f0f8dd5d207a2f336c0c3a83b4bc8dca3518e37b\n",
      "Model config RobertaConfig {\n",
      "  \"architectures\": [\n",
      "    \"RobertaForMaskedLM\"\n",
      "  ],\n",
      "  \"attention_probs_dropout_prob\": 0.1,\n",
      "  \"bos_token_id\": 0,\n",
      "  \"classifier_dropout\": null,\n",
      "  \"eos_token_id\": 2,\n",
      "  \"hidden_act\": \"gelu\",\n",
      "  \"hidden_dropout_prob\": 0.1,\n",
      "  \"hidden_size\": 768,\n",
      "  \"initializer_range\": 0.02,\n",
      "  \"intermediate_size\": 3072,\n",
      "  \"layer_norm_eps\": 1e-05,\n",
      "  \"max_position_embeddings\": 514,\n",
      "  \"model_type\": \"roberta\",\n",
      "  \"num_attention_heads\": 12,\n",
      "  \"num_hidden_layers\": 12,\n",
      "  \"pad_token_id\": 1,\n",
      "  \"position_embedding_type\": \"absolute\",\n",
      "  \"transformers_version\": \"4.12.5\",\n",
      "  \"type_vocab_size\": 1,\n",
      "  \"use_cache\": true,\n",
      "  \"vocab_size\": 50265\n",
      "}\n",
      "\n",
      "Could not locate the tokenizer configuration file, will try to use the model config instead.\n",
      "loading configuration file https://huggingface.co/roberta-base/resolve/main/config.json from cache at C:\\Users\\Foka/.cache\\huggingface\\transformers\\733bade19e5f0ce98e6531021dd5180994bb2f7b8bd7e80c7968805834ba351e.35205c6cfc956461d8515139f0f8dd5d207a2f336c0c3a83b4bc8dca3518e37b\n",
      "Model config RobertaConfig {\n",
      "  \"architectures\": [\n",
      "    \"RobertaForMaskedLM\"\n",
      "  ],\n",
      "  \"attention_probs_dropout_prob\": 0.1,\n",
      "  \"bos_token_id\": 0,\n",
      "  \"classifier_dropout\": null,\n",
      "  \"eos_token_id\": 2,\n",
      "  \"hidden_act\": \"gelu\",\n",
      "  \"hidden_dropout_prob\": 0.1,\n",
      "  \"hidden_size\": 768,\n",
      "  \"initializer_range\": 0.02,\n",
      "  \"intermediate_size\": 3072,\n",
      "  \"layer_norm_eps\": 1e-05,\n",
      "  \"max_position_embeddings\": 514,\n",
      "  \"model_type\": \"roberta\",\n",
      "  \"num_attention_heads\": 12,\n",
      "  \"num_hidden_layers\": 12,\n",
      "  \"pad_token_id\": 1,\n",
      "  \"position_embedding_type\": \"absolute\",\n",
      "  \"transformers_version\": \"4.12.5\",\n",
      "  \"type_vocab_size\": 1,\n",
      "  \"use_cache\": true,\n",
      "  \"vocab_size\": 50265\n",
      "}\n",
      "\n",
      "loading file https://huggingface.co/roberta-base/resolve/main/vocab.json from cache at C:\\Users\\Foka/.cache\\huggingface\\transformers\\d3ccdbfeb9aaa747ef20432d4976c32ee3fa69663b379deb253ccfce2bb1fdc5.d67d6b367eb24ab43b08ad55e014cf254076934f71d832bbab9ad35644a375ab\n",
      "loading file https://huggingface.co/roberta-base/resolve/main/merges.txt from cache at C:\\Users\\Foka/.cache\\huggingface\\transformers\\cafdecc90fcab17011e12ac813dd574b4b3fea39da6dd817813efa010262ff3f.5d12962c5ee615a4c803841266e9c3be9a691a924f72d395d3a6c6c81157788b\n",
      "loading file https://huggingface.co/roberta-base/resolve/main/tokenizer.json from cache at C:\\Users\\Foka/.cache\\huggingface\\transformers\\d53fc0fa09b8342651efd4073d75e19617b3e51287c2a535becda5808a8db287.fc9576039592f026ad76a1c231b89aee8668488c671dfbe6616bab2ed298d730\n",
      "loading file https://huggingface.co/roberta-base/resolve/main/added_tokens.json from cache at None\n",
      "loading file https://huggingface.co/roberta-base/resolve/main/special_tokens_map.json from cache at None\n",
      "loading file https://huggingface.co/roberta-base/resolve/main/tokenizer_config.json from cache at None\n",
      "loading configuration file https://huggingface.co/roberta-base/resolve/main/config.json from cache at C:\\Users\\Foka/.cache\\huggingface\\transformers\\733bade19e5f0ce98e6531021dd5180994bb2f7b8bd7e80c7968805834ba351e.35205c6cfc956461d8515139f0f8dd5d207a2f336c0c3a83b4bc8dca3518e37b\n",
      "Model config RobertaConfig {\n",
      "  \"architectures\": [\n",
      "    \"RobertaForMaskedLM\"\n",
      "  ],\n",
      "  \"attention_probs_dropout_prob\": 0.1,\n",
      "  \"bos_token_id\": 0,\n",
      "  \"classifier_dropout\": null,\n",
      "  \"eos_token_id\": 2,\n",
      "  \"hidden_act\": \"gelu\",\n",
      "  \"hidden_dropout_prob\": 0.1,\n",
      "  \"hidden_size\": 768,\n",
      "  \"initializer_range\": 0.02,\n",
      "  \"intermediate_size\": 3072,\n",
      "  \"layer_norm_eps\": 1e-05,\n",
      "  \"max_position_embeddings\": 514,\n",
      "  \"model_type\": \"roberta\",\n",
      "  \"num_attention_heads\": 12,\n",
      "  \"num_hidden_layers\": 12,\n",
      "  \"pad_token_id\": 1,\n",
      "  \"position_embedding_type\": \"absolute\",\n",
      "  \"transformers_version\": \"4.12.5\",\n",
      "  \"type_vocab_size\": 1,\n",
      "  \"use_cache\": true,\n",
      "  \"vocab_size\": 50265\n",
      "}\n",
      "\n",
      "02/16/2022 00:57:02 - INFO - __main__ - Return hidden states from model: False\n",
      "02/16/2022 00:57:02 - INFO - __main__ - Using implementation from: AutoModelForSequenceClassification\n",
      "loading weights file https://huggingface.co/roberta-base/resolve/main/pytorch_model.bin from cache at C:\\Users\\Foka/.cache\\huggingface\\transformers\\51ba668f7ff34e7cdfa9561e8361747738113878850a7d717dbc69de8683aaad.c7efaa30a0d80b2958b876969faa180e485944a849deee4ad482332de65365a7\n",
      "Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.dense.bias', 'lm_head.decoder.weight', 'lm_head.layer_norm.weight', 'roberta.pooler.dense.bias', 'lm_head.bias', 'roberta.pooler.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.weight']\n",
      "- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).\n",
      "- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n",
      "Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.out_proj.weight', 'classifier.out_proj.bias', 'classifier.dense.bias', 'classifier.dense.weight']\n",
      "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n",
      "02/16/2022 00:57:03 - INFO - __main__ - Freezing model weights\n",
      "02/16/2022 00:57:03 - WARNING - datasets.arrow_dataset - Loading cached processed dataset at C:\\Users\\Foka\\.cache\\huggingface\\datasets\\json\\default-67c9d932a627b7b8\\0.0.0\\c2d554c3377ea79c7664b93dc65d0803b45e3279000f993c7bfd18937fd7f426\\cache-12e8873686c6be8d.arrow\n",
      "\n",
      "Running tokenizer on dataset:   0%|          | 0/1 [00:00<?, ?ba/s]\n",
      "Running tokenizer on dataset: 100%|##########| 1/1 [00:00<00:00, 35.81ba/s]\n",
      "02/16/2022 00:57:03 - WARNING - datasets.arrow_dataset - Loading cached processed dataset at C:\\Users\\Foka\\.cache\\huggingface\\datasets\\json\\default-67c9d932a627b7b8\\0.0.0\\c2d554c3377ea79c7664b93dc65d0803b45e3279000f993c7bfd18937fd7f426\\cache-6af3944f94b779cb.arrow\n",
      "02/16/2022 00:57:03 - INFO - __main__ - Sample 2678 of the training set: {'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'input_ids': [0, 118, 437, 98, 1437, 1437, 8, 849, 6504, 39264, 122, 14, 111, 849, 3707, 9856, 1635, 1437, 2], 'labels': 0}.\n",
      "02/16/2022 00:57:03 - INFO - __main__ - Sample 1289 of the training set: {'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'input_ids': [0, 1322, 47, 849, 14178, 359, 3914, 131, 619, 101, 952, 7258, 4056, 7471, 4056, 18164, 32, 30309, 154, 15, 47, 116, 4161, 1437, 849, 4903, 21210, 849, 90, 20564, 849, 119, 40879, 3695, 4056, 7471, 4056, 18164, 1437, 2], 'labels': 1}.\n",
      "02/16/2022 00:57:03 - INFO - __main__ - Sample 2660 of the training set: {'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'input_ids': [0, 21714, 3308, 7512, 13, 127, 15382, 186, 11, 885, 4575, 1437, 1437, 2], 'labels': 0}.\n",
      "02/16/2022 00:57:04 - INFO - __main__ - ***** Running training *****\n",
      "02/16/2022 00:57:04 - INFO - __main__ -   Num examples = 4742\n",
      "02/16/2022 00:57:04 - INFO - __main__ -   Num Epochs = 1\n",
      "02/16/2022 00:57:04 - INFO - __main__ -   Instantaneous batch size per device = 24\n",
      "02/16/2022 00:57:04 - INFO - __main__ -   Total train batch size (w. parallel, distributed & accumulation) = 24\n",
      "02/16/2022 00:57:04 - INFO - __main__ -   Gradient Accumulation steps = 1\n",
      "02/16/2022 00:57:04 - INFO - __main__ -   Total optimization steps = 198\n",
      "\n",
      "  0%|          | 0/198 [00:00<?, ?it/s]\n",
      "  1%|          | 1/198 [00:00<02:44,  1.20it/s]\n",
      "  1%|1         | 2/198 [00:01<02:38,  1.24it/s]\n",
      "  2%|1         | 3/198 [00:02<03:22,  1.04s/it]\n",
      "  2%|2         | 4/198 [00:03<02:45,  1.17it/s]\n",
      "  3%|2         | 5/198 [00:04<03:19,  1.03s/it]\n",
      "  3%|3         | 6/198 [00:06<03:31,  1.10s/it]\n",
      "  4%|3         | 7/198 [00:06<03:15,  1.03s/it]\n",
      "  4%|4         | 8/198 [00:08<03:43,  1.18s/it]\n",
      "  5%|4         | 9/198 [00:09<03:29,  1.11s/it]\n",
      "  5%|5         | 10/198 [00:10<03:05,  1.01it/s]\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "  6%|5         | 11/198 [00:10<02:55,  1.06it/s]\n",
      "  6%|6         | 12/198 [00:11<02:57,  1.05it/s]\n",
      "  7%|6         | 13/198 [00:12<02:56,  1.05it/s]\n",
      "  7%|7         | 14/198 [00:13<02:41,  1.14it/s]\n",
      "  8%|7         | 15/198 [00:14<02:49,  1.08it/s]\n",
      "  8%|8         | 16/198 [00:15<02:47,  1.09it/s]\n",
      "  9%|8         | 17/198 [00:16<02:52,  1.05it/s]\n",
      "  9%|9         | 18/198 [00:18<03:28,  1.16s/it]\n",
      " 10%|9         | 19/198 [00:19<03:23,  1.14s/it]\n",
      " 10%|#         | 20/198 [00:20<02:59,  1.01s/it]\n",
      " 11%|#         | 21/198 [00:21<03:18,  1.12s/it]\n",
      " 11%|#1        | 22/198 [00:22<03:02,  1.03s/it]\n",
      " 12%|#1        | 23/198 [00:23<02:52,  1.01it/s]\n",
      " 12%|#2        | 24/198 [00:24<03:08,  1.08s/it]\n",
      " 13%|#2        | 25/198 [00:25<03:17,  1.14s/it]\n",
      " 13%|#3        | 26/198 [00:27<03:28,  1.21s/it]\n",
      " 14%|#3        | 27/198 [00:27<03:05,  1.09s/it]\n",
      " 14%|#4        | 28/198 [00:28<02:48,  1.01it/s]\n",
      " 15%|#4        | 29/198 [00:29<02:37,  1.08it/s]\n",
      " 15%|#5        | 30/198 [00:30<02:36,  1.07it/s]\n",
      " 16%|#5        | 31/198 [00:31<02:28,  1.12it/s]\n",
      " 16%|#6        | 32/198 [00:32<02:32,  1.09it/s]\n",
      " 17%|#6        | 33/198 [00:32<02:27,  1.12it/s]\n",
      " 17%|#7        | 34/198 [00:34<02:40,  1.02it/s]\n",
      " 18%|#7        | 35/198 [00:35<03:02,  1.12s/it]\n",
      " 18%|#8        | 36/198 [00:36<03:09,  1.17s/it]\n",
      " 19%|#8        | 37/198 [00:37<02:53,  1.08s/it]\n",
      " 19%|#9        | 38/198 [00:38<02:37,  1.02it/s]\n",
      " 20%|#9        | 39/198 [00:39<02:50,  1.08s/it]\n",
      " 20%|##        | 40/198 [00:41<02:56,  1.12s/it]\n",
      " 21%|##        | 41/198 [00:42<03:00,  1.15s/it]\n",
      " 21%|##1       | 42/198 [00:42<02:36,  1.00s/it]\n",
      " 22%|##1       | 43/198 [00:44<02:58,  1.15s/it]\n",
      " 22%|##2       | 44/198 [00:45<03:13,  1.26s/it]\n",
      " 23%|##2       | 45/198 [00:46<03:03,  1.20s/it]\n",
      " 23%|##3       | 46/198 [00:47<02:45,  1.09s/it]\n",
      " 24%|##3       | 47/198 [00:49<02:54,  1.15s/it]\n",
      " 24%|##4       | 48/198 [00:49<02:36,  1.05s/it]\n",
      " 25%|##4       | 49/198 [00:50<02:33,  1.03s/it]\n",
      " 25%|##5       | 50/198 [00:52<03:07,  1.27s/it]\n",
      " 26%|##5       | 51/198 [00:53<02:44,  1.12s/it]\n",
      " 26%|##6       | 52/198 [00:54<02:46,  1.14s/it]\n",
      " 27%|##6       | 53/198 [00:55<02:49,  1.17s/it]\n",
      " 27%|##7       | 54/198 [00:57<02:51,  1.19s/it]\n",
      " 28%|##7       | 55/198 [00:58<02:57,  1.24s/it]\n",
      " 28%|##8       | 56/198 [00:59<02:34,  1.09s/it]\n",
      " 29%|##8       | 57/198 [01:01<03:30,  1.49s/it]\n",
      " 29%|##9       | 58/198 [01:02<03:20,  1.43s/it]\n",
      " 30%|##9       | 59/198 [01:04<03:12,  1.39s/it]\n",
      " 30%|###       | 60/198 [01:05<02:50,  1.23s/it]\n",
      " 31%|###       | 61/198 [01:06<02:34,  1.13s/it]\n",
      " 31%|###1      | 62/198 [01:06<02:15,  1.00it/s]\n",
      " 32%|###1      | 63/198 [01:07<02:09,  1.04it/s]\n",
      " 32%|###2      | 64/198 [01:08<02:01,  1.10it/s]\n",
      " 33%|###2      | 65/198 [01:09<02:25,  1.09s/it]\n",
      " 33%|###3      | 66/198 [01:12<03:09,  1.44s/it]\n",
      " 34%|###3      | 67/198 [01:13<02:54,  1.33s/it]\n",
      " 34%|###4      | 68/198 [01:15<03:22,  1.56s/it]\n",
      " 35%|###4      | 69/198 [01:16<03:17,  1.53s/it]\n",
      " 35%|###5      | 70/198 [01:17<03:02,  1.43s/it]\n",
      " 36%|###5      | 71/198 [01:18<02:39,  1.25s/it]\n",
      " 36%|###6      | 72/198 [01:20<02:41,  1.28s/it]\n",
      " 37%|###6      | 73/198 [01:22<03:23,  1.62s/it]\n",
      " 37%|###7      | 74/198 [01:23<02:52,  1.39s/it]\n",
      " 38%|###7      | 75/198 [01:24<02:34,  1.26s/it]\n",
      " 38%|###8      | 76/198 [01:25<02:35,  1.27s/it]\n",
      " 39%|###8      | 77/198 [01:26<02:34,  1.28s/it]\n",
      " 39%|###9      | 78/198 [01:28<02:48,  1.40s/it]\n",
      " 40%|###9      | 79/198 [01:29<02:21,  1.19s/it]\n",
      " 40%|####      | 80/198 [01:30<02:25,  1.23s/it]\n",
      " 41%|####      | 81/198 [01:31<02:13,  1.14s/it]\n",
      " 41%|####1     | 82/198 [01:32<01:58,  1.03s/it]\n",
      " 42%|####1     | 83/198 [01:33<01:54,  1.00it/s]\n",
      " 42%|####2     | 84/198 [01:34<01:49,  1.04it/s]\n",
      " 43%|####2     | 85/198 [01:35<01:58,  1.05s/it]\n",
      " 43%|####3     | 86/198 [01:37<02:40,  1.43s/it]\n",
      " 44%|####3     | 87/198 [01:39<02:42,  1.46s/it]\n",
      " 44%|####4     | 88/198 [01:40<02:29,  1.36s/it]\n",
      " 45%|####4     | 89/198 [01:41<02:11,  1.21s/it]\n",
      " 45%|####5     | 90/198 [01:42<01:56,  1.08s/it]\n",
      " 46%|####5     | 91/198 [01:42<01:43,  1.04it/s]\n",
      " 46%|####6     | 92/198 [01:43<01:44,  1.02it/s]\n",
      " 47%|####6     | 93/198 [01:45<02:04,  1.18s/it]\n",
      " 47%|####7     | 94/198 [01:46<01:52,  1.08s/it]\n",
      " 48%|####7     | 95/198 [01:47<01:58,  1.15s/it]\n",
      " 48%|####8     | 96/198 [01:48<02:04,  1.22s/it]\n",
      " 49%|####8     | 97/198 [01:49<01:54,  1.13s/it]\n",
      " 49%|####9     | 98/198 [01:50<01:42,  1.02s/it]\n",
      " 50%|#####     | 99/198 [01:51<01:41,  1.02s/it]\n",
      " 51%|#####     | 100/198 [01:53<01:49,  1.12s/it]\n",
      " 51%|#####1    | 101/198 [01:53<01:38,  1.02s/it]\n",
      " 52%|#####1    | 102/198 [01:54<01:29,  1.08it/s]\n",
      " 52%|#####2    | 103/198 [01:55<01:42,  1.08s/it]\n",
      " 53%|#####2    | 104/198 [01:57<01:45,  1.12s/it]\n",
      " 53%|#####3    | 105/198 [01:58<01:40,  1.08s/it]\n",
      " 54%|#####3    | 106/198 [01:59<01:45,  1.14s/it]\n",
      " 54%|#####4    | 107/198 [02:00<01:47,  1.18s/it]\n",
      " 55%|#####4    | 108/198 [02:01<01:37,  1.09s/it]\n",
      " 55%|#####5    | 109/198 [02:02<01:39,  1.11s/it]\n",
      " 56%|#####5    | 110/198 [02:03<01:36,  1.09s/it]\n",
      " 56%|#####6    | 111/198 [02:05<01:38,  1.14s/it]\n",
      " 57%|#####6    | 112/198 [02:05<01:27,  1.02s/it]\n",
      " 57%|#####7    | 113/198 [02:06<01:24,  1.00it/s]\n",
      " 58%|#####7    | 114/198 [02:08<01:31,  1.08s/it]\n",
      " 58%|#####8    | 115/198 [02:08<01:25,  1.03s/it]\n",
      " 59%|#####8    | 116/198 [02:09<01:22,  1.01s/it]\n",
      " 59%|#####9    | 117/198 [02:10<01:16,  1.05it/s]\n",
      " 60%|#####9    | 118/198 [02:11<01:10,  1.13it/s]\n",
      " 60%|######    | 119/198 [02:12<01:11,  1.11it/s]\n",
      " 61%|######    | 120/198 [02:14<01:45,  1.36s/it]\n",
      " 61%|######1   | 121/198 [02:15<01:27,  1.13s/it]\n",
      " 62%|######1   | 122/198 [02:16<01:19,  1.04s/it]\n",
      " 62%|######2   | 123/198 [02:17<01:24,  1.13s/it]\n",
      " 63%|######2   | 124/198 [02:18<01:28,  1.19s/it]\n",
      " 63%|######3   | 125/198 [02:20<01:29,  1.23s/it]\n",
      " 64%|######3   | 126/198 [02:20<01:15,  1.05s/it]\n",
      " 64%|######4   | 127/198 [02:21<01:16,  1.08s/it]\n",
      " 65%|######4   | 128/198 [02:22<01:07,  1.04it/s]\n",
      " 65%|######5   | 129/198 [02:24<01:26,  1.26s/it]\n",
      " 66%|######5   | 130/198 [02:25<01:13,  1.08s/it]\n",
      " 66%|######6   | 131/198 [02:26<01:05,  1.03it/s]\n",
      " 67%|######6   | 132/198 [02:27<01:15,  1.14s/it]\n",
      " 67%|######7   | 133/198 [02:28<01:06,  1.02s/it]\n",
      " 68%|######7   | 134/198 [02:28<00:58,  1.10it/s]\n",
      " 68%|######8   | 135/198 [02:29<00:53,  1.18it/s]\n",
      " 69%|######8   | 136/198 [02:30<01:01,  1.01it/s]\n",
      " 69%|######9   | 137/198 [02:31<00:59,  1.03it/s]\n",
      " 70%|######9   | 138/198 [02:33<01:07,  1.12s/it]\n",
      " 70%|#######   | 139/198 [02:34<00:59,  1.01s/it]\n",
      " 71%|#######   | 140/198 [02:35<00:57,  1.01it/s]\n",
      " 71%|#######1  | 141/198 [02:37<01:14,  1.30s/it]\n",
      " 72%|#######1  | 142/198 [02:39<01:31,  1.63s/it]\n",
      " 72%|#######2  | 143/198 [02:40<01:17,  1.41s/it]\n",
      " 73%|#######2  | 144/198 [02:41<01:07,  1.25s/it]\n",
      " 73%|#######3  | 145/198 [02:42<01:01,  1.17s/it]\n",
      " 74%|#######3  | 146/198 [02:42<00:53,  1.04s/it]\n",
      " 74%|#######4  | 147/198 [02:43<00:46,  1.10it/s]\n",
      " 75%|#######4  | 148/198 [02:44<00:45,  1.09it/s]\n",
      " 75%|#######5  | 149/198 [02:46<01:05,  1.34s/it]\n",
      " 76%|#######5  | 150/198 [02:47<00:58,  1.22s/it]\n",
      " 76%|#######6  | 151/198 [02:49<01:02,  1.32s/it]\n",
      " 77%|#######6  | 152/198 [02:51<01:15,  1.65s/it]\n",
      " 77%|#######7  | 153/198 [02:52<01:01,  1.38s/it]\n",
      " 78%|#######7  | 154/198 [02:53<01:00,  1.38s/it]\n",
      " 78%|#######8  | 155/198 [02:54<00:52,  1.22s/it]\n",
      " 79%|#######8  | 156/198 [02:55<00:49,  1.18s/it]\n",
      " 79%|#######9  | 157/198 [02:56<00:44,  1.09s/it]\n",
      " 80%|#######9  | 158/198 [02:57<00:40,  1.02s/it]\n",
      " 80%|########  | 159/198 [02:58<00:40,  1.03s/it]\n",
      " 81%|########  | 160/198 [02:59<00:41,  1.09s/it]\n",
      " 81%|########1 | 161/198 [03:00<00:37,  1.01s/it]\n",
      " 82%|########1 | 162/198 [03:01<00:32,  1.10it/s]\n",
      " 82%|########2 | 163/198 [03:02<00:31,  1.12it/s]\n",
      " 83%|########2 | 164/198 [03:03<00:32,  1.04it/s]\n",
      " 83%|########3 | 165/198 [03:05<00:46,  1.41s/it]\n",
      " 84%|########3 | 166/198 [03:06<00:40,  1.25s/it]\n",
      " 84%|########4 | 167/198 [03:07<00:37,  1.22s/it]\n",
      " 85%|########4 | 168/198 [03:09<00:37,  1.25s/it]\n",
      " 85%|########5 | 169/198 [03:10<00:36,  1.26s/it]\n",
      " 86%|########5 | 170/198 [03:11<00:31,  1.12s/it]\n",
      " 86%|########6 | 171/198 [03:12<00:31,  1.18s/it]\n",
      " 87%|########6 | 172/198 [03:13<00:31,  1.20s/it]\n",
      " 87%|########7 | 173/198 [03:14<00:28,  1.14s/it]\n",
      " 88%|########7 | 174/198 [03:15<00:24,  1.03s/it]\n",
      " 88%|########8 | 175/198 [03:16<00:22,  1.04it/s]\n",
      " 89%|########8 | 176/198 [03:17<00:22,  1.01s/it]\n",
      " 89%|########9 | 177/198 [03:18<00:22,  1.07s/it]\n",
      " 90%|########9 | 178/198 [03:21<00:29,  1.48s/it]\n",
      " 90%|######### | 179/198 [03:22<00:25,  1.37s/it]\n",
      " 91%|######### | 180/198 [03:23<00:21,  1.22s/it]\n",
      " 91%|#########1| 181/198 [03:23<00:19,  1.13s/it]\n",
      " 92%|#########1| 182/198 [03:25<00:17,  1.12s/it]\n",
      " 92%|#########2| 183/198 [03:25<00:15,  1.02s/it]\n",
      " 93%|#########2| 184/198 [03:26<00:13,  1.02it/s]\n",
      " 93%|#########3| 185/198 [03:28<00:13,  1.08s/it]\n",
      " 94%|#########3| 186/198 [03:28<00:11,  1.01it/s]\n",
      " 94%|#########4| 187/198 [03:30<00:12,  1.09s/it]\n",
      " 95%|#########4| 188/198 [03:32<00:13,  1.34s/it]\n",
      " 95%|#########5| 189/198 [03:33<00:10,  1.22s/it]\n",
      " 96%|#########5| 190/198 [03:33<00:08,  1.11s/it]\n",
      " 96%|#########6| 191/198 [03:34<00:07,  1.00s/it]\n",
      " 97%|#########6| 192/198 [03:36<00:06,  1.16s/it]\n",
      " 97%|#########7| 193/198 [03:37<00:05,  1.12s/it]\n",
      " 98%|#########7| 194/198 [03:38<00:04,  1.08s/it]\n",
      " 98%|#########8| 195/198 [03:38<00:02,  1.02it/s]\n",
      " 99%|#########8| 196/198 [03:40<00:02,  1.22s/it]\n",
      " 99%|#########9| 197/198 [03:41<00:01,  1.16s/it]\n",
      "100%|##########| 198/198 [03:42<00:00,  1.05s/it]02/16/2022 01:01:14 - INFO - __main__ - Epoch 0: {'accuracy': 0.938}\n",
      "02/16/2022 01:01:37 - INFO - __main__ - Test-set evaluation: {'accuracy': 1.0}\n",
      "Configuration saved in out/tweet/roberta_version_2\\config.json\n",
      "Model weights saved in out/tweet/roberta_version_2\\pytorch_model.bin\n",
      "tokenizer config file saved in out/tweet/roberta_version_2\\tokenizer_config.json\n",
      "Special tokens file saved in out/tweet/roberta_version_2\\special_tokens_map.json\n",
      "\n",
      "100%|##########| 198/198 [04:33<00:00,  1.38s/it]\n"
     ]
    }
   ],
   "source": [
    "# Roberta version 2: run the local run_glue_no_trainer.py script on roberta-base with the\n",
    "# JSON splits data/train.json, data/valid.json and data/test.json, for a single epoch.\n",
    "# Settings: batch size 24 for both train and eval, max_length 128, learning rate 2e-5.\n",
    "# Per the log, config, weights and tokenizer files are saved under out/tweet/roberta_version_2.\n",
    "# NOTE(review): --freeze_model presumably freezes the pretrained weights - confirm in the script.\n",
    "# NOTE(review): no seed flag is passed, so a re-run may not reproduce the logged accuracies;\n",
    "# consider pinning a seed if the script exposes one.\n",
    "!python run_glue_no_trainer.py \\\n",
    "  --model_name_or_path roberta-base \\\n",
    "  --train_file data/train.json  \\\n",
    "  --validation_file data/valid.json \\\n",
    "  --test_file data/test.json \\\n",
    "  --per_device_train_batch_size 24 \\\n",
    "  --per_device_eval_batch_size 24 \\\n",
    "  --max_length 128 \\\n",
    "  --freeze_model \\\n",
    "  --learning_rate 2e-5 \\\n",
    "  --num_train_epochs 1 \\\n",
    "  --output_dir out/tweet/roberta_version_2"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Roberta version 3"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "02/16/2022 01:01:39 - INFO - __main__ - Distributed environment: NO\n",
      "Num processes: 1\n",
      "Process index: 0\n",
      "Local process index: 0\n",
      "Device: cpu\n",
      "Use FP16 precision: False\n",
      "\n",
      "02/16/2022 01:01:40 - WARNING - datasets.builder - Using custom data configuration default-67c9d932a627b7b8\n",
      "02/16/2022 01:01:40 - WARNING - datasets.builder - Reusing dataset json (C:\\Users\\Foka\\.cache\\huggingface\\datasets\\json\\default-67c9d932a627b7b8\\0.0.0\\c2d554c3377ea79c7664b93dc65d0803b45e3279000f993c7bfd18937fd7f426)\n",
      "\n",
      "  0%|          | 0/3 [00:00<?, ?it/s]\n",
      "100%|##########| 3/3 [00:00<00:00, 1507.66it/s]\n",
      "loading configuration file https://huggingface.co/roberta-base/resolve/main/config.json from cache at C:\\Users\\Foka/.cache\\huggingface\\transformers\\733bade19e5f0ce98e6531021dd5180994bb2f7b8bd7e80c7968805834ba351e.35205c6cfc956461d8515139f0f8dd5d207a2f336c0c3a83b4bc8dca3518e37b\n",
      "Model config RobertaConfig {\n",
      "  \"architectures\": [\n",
      "    \"RobertaForMaskedLM\"\n",
      "  ],\n",
      "  \"attention_probs_dropout_prob\": 0.1,\n",
      "  \"bos_token_id\": 0,\n",
      "  \"classifier_dropout\": null,\n",
      "  \"eos_token_id\": 2,\n",
      "  \"hidden_act\": \"gelu\",\n",
      "  \"hidden_dropout_prob\": 0.1,\n",
      "  \"hidden_size\": 768,\n",
      "  \"initializer_range\": 0.02,\n",
      "  \"intermediate_size\": 3072,\n",
      "  \"layer_norm_eps\": 1e-05,\n",
      "  \"max_position_embeddings\": 514,\n",
      "  \"model_type\": \"roberta\",\n",
      "  \"num_attention_heads\": 12,\n",
      "  \"num_hidden_layers\": 12,\n",
      "  \"pad_token_id\": 1,\n",
      "  \"position_embedding_type\": \"absolute\",\n",
      "  \"transformers_version\": \"4.12.5\",\n",
      "  \"type_vocab_size\": 1,\n",
      "  \"use_cache\": true,\n",
      "  \"vocab_size\": 50265\n",
      "}\n",
      "\n",
      "Could not locate the tokenizer configuration file, will try to use the model config instead.\n",
      "loading configuration file https://huggingface.co/roberta-base/resolve/main/config.json from cache at C:\\Users\\Foka/.cache\\huggingface\\transformers\\733bade19e5f0ce98e6531021dd5180994bb2f7b8bd7e80c7968805834ba351e.35205c6cfc956461d8515139f0f8dd5d207a2f336c0c3a83b4bc8dca3518e37b\n",
      "Model config RobertaConfig {\n",
      "  \"architectures\": [\n",
      "    \"RobertaForMaskedLM\"\n",
      "  ],\n",
      "  \"attention_probs_dropout_prob\": 0.1,\n",
      "  \"bos_token_id\": 0,\n",
      "  \"classifier_dropout\": null,\n",
      "  \"eos_token_id\": 2,\n",
      "  \"hidden_act\": \"gelu\",\n",
      "  \"hidden_dropout_prob\": 0.1,\n",
      "  \"hidden_size\": 768,\n",
      "  \"initializer_range\": 0.02,\n",
      "  \"intermediate_size\": 3072,\n",
      "  \"layer_norm_eps\": 1e-05,\n",
      "  \"max_position_embeddings\": 514,\n",
      "  \"model_type\": \"roberta\",\n",
      "  \"num_attention_heads\": 12,\n",
      "  \"num_hidden_layers\": 12,\n",
      "  \"pad_token_id\": 1,\n",
      "  \"position_embedding_type\": \"absolute\",\n",
      "  \"transformers_version\": \"4.12.5\",\n",
      "  \"type_vocab_size\": 1,\n",
      "  \"use_cache\": true,\n",
      "  \"vocab_size\": 50265\n",
      "}\n",
      "\n",
      "loading file https://huggingface.co/roberta-base/resolve/main/vocab.json from cache at C:\\Users\\Foka/.cache\\huggingface\\transformers\\d3ccdbfeb9aaa747ef20432d4976c32ee3fa69663b379deb253ccfce2bb1fdc5.d67d6b367eb24ab43b08ad55e014cf254076934f71d832bbab9ad35644a375ab\n",
      "loading file https://huggingface.co/roberta-base/resolve/main/merges.txt from cache at C:\\Users\\Foka/.cache\\huggingface\\transformers\\cafdecc90fcab17011e12ac813dd574b4b3fea39da6dd817813efa010262ff3f.5d12962c5ee615a4c803841266e9c3be9a691a924f72d395d3a6c6c81157788b\n",
      "loading file https://huggingface.co/roberta-base/resolve/main/tokenizer.json from cache at C:\\Users\\Foka/.cache\\huggingface\\transformers\\d53fc0fa09b8342651efd4073d75e19617b3e51287c2a535becda5808a8db287.fc9576039592f026ad76a1c231b89aee8668488c671dfbe6616bab2ed298d730\n",
      "loading file https://huggingface.co/roberta-base/resolve/main/added_tokens.json from cache at None\n",
      "loading file https://huggingface.co/roberta-base/resolve/main/special_tokens_map.json from cache at None\n",
      "loading file https://huggingface.co/roberta-base/resolve/main/tokenizer_config.json from cache at None\n",
      "loading configuration file https://huggingface.co/roberta-base/resolve/main/config.json from cache at C:\\Users\\Foka/.cache\\huggingface\\transformers\\733bade19e5f0ce98e6531021dd5180994bb2f7b8bd7e80c7968805834ba351e.35205c6cfc956461d8515139f0f8dd5d207a2f336c0c3a83b4bc8dca3518e37b\n",
      "Model config RobertaConfig {\n",
      "  \"architectures\": [\n",
      "    \"RobertaForMaskedLM\"\n",
      "  ],\n",
      "  \"attention_probs_dropout_prob\": 0.1,\n",
      "  \"bos_token_id\": 0,\n",
      "  \"classifier_dropout\": null,\n",
      "  \"eos_token_id\": 2,\n",
      "  \"hidden_act\": \"gelu\",\n",
      "  \"hidden_dropout_prob\": 0.1,\n",
      "  \"hidden_size\": 768,\n",
      "  \"initializer_range\": 0.02,\n",
      "  \"intermediate_size\": 3072,\n",
      "  \"layer_norm_eps\": 1e-05,\n",
      "  \"max_position_embeddings\": 514,\n",
      "  \"model_type\": \"roberta\",\n",
      "  \"num_attention_heads\": 12,\n",
      "  \"num_hidden_layers\": 12,\n",
      "  \"pad_token_id\": 1,\n",
      "  \"position_embedding_type\": \"absolute\",\n",
      "  \"transformers_version\": \"4.12.5\",\n",
      "  \"type_vocab_size\": 1,\n",
      "  \"use_cache\": true,\n",
      "  \"vocab_size\": 50265\n",
      "}\n",
      "\n",
      "02/16/2022 01:01:46 - INFO - __main__ - Return hidden states from model: False\n",
      "02/16/2022 01:01:46 - INFO - __main__ - Using implementation from: RobertaForSequenceClassificationCustomAlternative\n",
      "loading weights file https://huggingface.co/roberta-base/resolve/main/pytorch_model.bin from cache at C:\\Users\\Foka/.cache\\huggingface\\transformers\\51ba668f7ff34e7cdfa9561e8361747738113878850a7d717dbc69de8683aaad.c7efaa30a0d80b2958b876969faa180e485944a849deee4ad482332de65365a7\n",
      "Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassificationCustomAlternative: ['lm_head.dense.weight', 'lm_head.layer_norm.bias', 'roberta.pooler.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.bias', 'roberta.pooler.dense.weight', 'lm_head.decoder.weight', 'lm_head.dense.bias']\n",
      "- This IS expected if you are initializing RobertaForSequenceClassificationCustomAlternative from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).\n",
      "- This IS NOT expected if you are initializing RobertaForSequenceClassificationCustomAlternative from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n",
      "Some weights of RobertaForSequenceClassificationCustomAlternative were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.out_proj.bias', 'classifier.dense_2.weight', 'classifier.dense_1_input.weight', 'classifier.dense_2.bias', 'classifier.out_proj.weight', 'classifier.dense_1_hidden.bias', 'classifier.dense_1_input.bias', 'classifier.dense_1_hidden.weight']\n",
      "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n",
      "02/16/2022 01:01:48 - INFO - __main__ - Freezing model weights\n",
      "02/16/2022 01:01:48 - WARNING - datasets.arrow_dataset - Loading cached processed dataset at C:\\Users\\Foka\\.cache\\huggingface\\datasets\\json\\default-67c9d932a627b7b8\\0.0.0\\c2d554c3377ea79c7664b93dc65d0803b45e3279000f993c7bfd18937fd7f426\\cache-ba2b749ff70d20c2.arrow\n",
      "\n",
      "Running tokenizer on dataset:   0%|          | 0/1 [00:00<?, ?ba/s]\n",
      "Running tokenizer on dataset: 100%|##########| 1/1 [00:00<00:00, 29.49ba/s]\n",
      "02/16/2022 01:01:48 - WARNING - datasets.arrow_dataset - Loading cached processed dataset at C:\\Users\\Foka\\.cache\\huggingface\\datasets\\json\\default-67c9d932a627b7b8\\0.0.0\\c2d554c3377ea79c7664b93dc65d0803b45e3279000f993c7bfd18937fd7f426\\cache-376097e0887bad71.arrow\n",
      "02/16/2022 01:01:48 - INFO - __main__ - Sample 4466 of the training set: {'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'input_ids': [0, 8338, 365, 849, 11970, 409, 31, 3970, 727, 849, 28481, 268, 15, 849, 48056, 939, 437, 98, 1437, 1437, 849, 8656, 849, 8656, 254, 849, 45864, 849, 26949, 8585, 849, 12689, 627, 17693, 2], 'labels': 0}.\n",
      "02/16/2022 01:01:48 - INFO - __main__ - Sample 979 of the training set: {'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'input_ids': [0, 1039, 12105, 849, 18897, 2527, 718, 5, 4117, 12, 267, 13760, 4289, 16, 7, 6876, 14, 952, 7258, 4056, 7471, 4056, 48, 405, 531, 33, 57, 16, 354, 4, 3695, 4056, 7471, 4056, 46, 1437, 952, 7258, 4056, 7471, 4056, 18164, 1437, 2], 'labels': 1}.\n",
      "02/16/2022 01:01:48 - INFO - __main__ - Sample 2927 of the training set: {'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'input_ids': [0, 1039, 12105, 77, 16, 5, 92, 2642, 145, 703, 59, 787, 12105, 8, 110, 1108, 62, 116, 1437, 1437, 849, 17693, 1843, 10339, 4489, 849, 10120, 571, 5434, 2], 'labels': 0}.\n",
      "02/16/2022 01:01:49 - INFO - __main__ - ***** Running training *****\n",
      "02/16/2022 01:01:49 - INFO - __main__ -   Num examples = 4742\n",
      "02/16/2022 01:01:49 - INFO - __main__ -   Num Epochs = 1\n",
      "02/16/2022 01:01:49 - INFO - __main__ -   Instantaneous batch size per device = 24\n",
      "02/16/2022 01:01:49 - INFO - __main__ -   Total train batch size (w. parallel, distributed & accumulation) = 24\n",
      "02/16/2022 01:01:49 - INFO - __main__ -   Gradient Accumulation steps = 1\n",
      "02/16/2022 01:01:49 - INFO - __main__ -   Total optimization steps = 198\n",
      "\n",
      "  0%|          | 0/198 [00:00<?, ?it/s]\n",
      "  1%|          | 1/198 [00:00<02:57,  1.11it/s]\n",
      "  1%|1         | 2/198 [00:02<03:38,  1.12s/it]\n",
      "  2%|1         | 3/198 [00:03<04:04,  1.25s/it]\n",
      "  2%|2         | 4/198 [00:04<03:42,  1.15s/it]\n",
      "  3%|2         | 5/198 [00:05<03:13,  1.00s/it]\n",
      "  3%|3         | 6/198 [00:06<03:48,  1.19s/it]\n",
      "  4%|3         | 7/198 [00:07<03:25,  1.08s/it]\n",
      "  4%|4         | 8/198 [00:09<04:20,  1.37s/it]\n",
      "  5%|4         | 9/198 [00:12<05:25,  1.72s/it]\n",
      "  5%|5         | 10/198 [00:13<04:31,  1.45s/it]\n",
      "  6%|5         | 11/198 [00:14<04:18,  1.38s/it]\n",
      "  6%|6         | 12/198 [00:15<04:15,  1.38s/it]\n",
      "  7%|6         | 13/198 [00:17<04:30,  1.46s/it]\n",
      "  7%|7         | 14/198 [00:18<04:13,  1.38s/it]\n",
      "  8%|7         | 15/198 [00:19<04:11,  1.37s/it]\n",
      "  8%|8         | 16/198 [00:21<04:22,  1.44s/it]\n",
      "  9%|8         | 17/198 [00:22<04:27,  1.48s/it]\n",
      "  9%|9         | 18/198 [00:23<03:47,  1.27s/it]\n",
      " 10%|9         | 19/198 [00:25<04:00,  1.34s/it]\n",
      " 10%|#         | 20/198 [00:26<03:28,  1.17s/it]\n",
      " 11%|#         | 21/198 [00:26<03:13,  1.09s/it]\n",
      " 11%|#1        | 22/198 [00:29<04:03,  1.38s/it]\n",
      " 12%|#1        | 23/198 [00:29<03:29,  1.20s/it]\n",
      " 12%|#2        | 24/198 [00:30<03:08,  1.08s/it]\n",
      " 13%|#2        | 25/198 [00:31<03:13,  1.12s/it]\n",
      " 13%|#3        | 26/198 [00:32<03:01,  1.05s/it]\n",
      " 14%|#3        | 27/198 [00:34<03:15,  1.15s/it]\n",
      " 14%|#4        | 28/198 [00:35<03:23,  1.20s/it]\n",
      " 15%|#4        | 29/198 [00:36<03:28,  1.23s/it]\n",
      " 15%|#5        | 30/198 [00:38<03:33,  1.27s/it]\n",
      " 16%|#5        | 31/198 [00:39<03:36,  1.29s/it]\n",
      " 16%|#6        | 32/198 [00:40<03:13,  1.17s/it]\n",
      " 17%|#6        | 33/198 [00:41<03:18,  1.20s/it]\n",
      " 17%|#7        | 34/198 [00:42<02:59,  1.09s/it]\n",
      " 18%|#7        | 35/198 [00:43<03:17,  1.21s/it]\n",
      " 18%|#8        | 36/198 [00:44<03:04,  1.14s/it]\n",
      " 19%|#8        | 37/198 [00:46<03:12,  1.19s/it]\n",
      " 19%|#9        | 38/198 [00:47<03:07,  1.17s/it]\n",
      " 20%|#9        | 39/198 [00:48<03:12,  1.21s/it]\n",
      " 20%|##        | 40/198 [00:49<02:52,  1.09s/it]\n",
      " 21%|##        | 41/198 [00:50<02:36,  1.00it/s]\n",
      " 21%|##1       | 42/198 [00:51<02:31,  1.03it/s]\n",
      " 22%|##1       | 43/198 [00:52<02:31,  1.02it/s]\n",
      " 22%|##2       | 44/198 [00:53<02:34,  1.00s/it]\n",
      " 23%|##2       | 45/198 [00:54<02:27,  1.04it/s]\n",
      " 23%|##3       | 46/198 [00:54<02:18,  1.10it/s]\n",
      " 24%|##3       | 47/198 [00:55<02:12,  1.14it/s]\n",
      " 24%|##4       | 48/198 [00:56<02:12,  1.13it/s]\n",
      " 25%|##4       | 49/198 [00:57<02:32,  1.03s/it]\n",
      " 25%|##5       | 50/198 [01:00<03:31,  1.43s/it]\n",
      " 26%|##5       | 51/198 [01:01<03:22,  1.38s/it]\n",
      " 26%|##6       | 52/198 [01:02<03:01,  1.25s/it]\n",
      " 27%|##6       | 53/198 [01:03<02:39,  1.10s/it]\n",
      " 27%|##7       | 54/198 [01:04<02:33,  1.06s/it]\n",
      " 28%|##7       | 55/198 [01:05<02:33,  1.07s/it]\n",
      " 28%|##8       | 56/198 [01:06<02:18,  1.03it/s]\n",
      " 29%|##8       | 57/198 [01:06<02:15,  1.04it/s]\n",
      " 29%|##9       | 58/198 [01:07<02:16,  1.02it/s]\n",
      " 30%|##9       | 59/198 [01:08<02:06,  1.10it/s]\n",
      " 30%|###       | 60/198 [01:09<02:02,  1.13it/s]\n",
      " 31%|###       | 61/198 [01:10<02:02,  1.12it/s]\n",
      " 31%|###1      | 62/198 [01:11<02:18,  1.02s/it]\n",
      " 32%|###1      | 63/198 [01:12<02:16,  1.01s/it]\n",
      " 32%|###2      | 64/198 [01:13<02:01,  1.10it/s]\n",
      " 33%|###2      | 65/198 [01:14<02:05,  1.06it/s]\n",
      " 33%|###3      | 66/198 [01:15<02:23,  1.09s/it]\n",
      " 34%|###3      | 67/198 [01:16<02:08,  1.02it/s]\n",
      " 34%|###4      | 68/198 [01:17<02:04,  1.04it/s]\n",
      " 35%|###4      | 69/198 [01:18<02:13,  1.03s/it]\n",
      " 35%|###5      | 70/198 [01:19<02:06,  1.01it/s]\n",
      " 36%|###5      | 71/198 [01:20<01:56,  1.09it/s]\n",
      " 36%|###6      | 72/198 [01:21<01:56,  1.08it/s]\n",
      " 37%|###6      | 73/198 [01:22<02:10,  1.04s/it]\n",
      " 37%|###7      | 74/198 [01:23<02:20,  1.13s/it]\n",
      " 38%|###7      | 75/198 [01:24<02:11,  1.07s/it]\n",
      " 38%|###8      | 76/198 [01:26<02:19,  1.15s/it]\n",
      " 39%|###8      | 77/198 [01:27<02:32,  1.26s/it]\n",
      " 39%|###9      | 78/198 [01:28<02:18,  1.15s/it]\n",
      " 40%|###9      | 79/198 [01:29<02:09,  1.09s/it]\n",
      " 40%|####      | 80/198 [01:30<02:00,  1.02s/it]\n",
      " 41%|####      | 81/198 [01:31<01:51,  1.05it/s]\n",
      " 41%|####1     | 82/198 [01:32<01:51,  1.04it/s]\n",
      " 42%|####1     | 83/198 [01:33<01:59,  1.04s/it]\n",
      " 42%|####2     | 84/198 [01:34<01:51,  1.03it/s]\n",
      " 43%|####2     | 85/198 [01:35<01:53,  1.00s/it]\n",
      " 43%|####3     | 86/198 [01:36<01:50,  1.02it/s]\n",
      " 44%|####3     | 87/198 [01:37<01:54,  1.03s/it]\n",
      " 44%|####4     | 88/198 [01:38<01:44,  1.05it/s]\n",
      " 45%|####4     | 89/198 [01:39<01:46,  1.03it/s]\n",
      " 45%|####5     | 90/198 [01:40<01:53,  1.05s/it]\n",
      " 46%|####5     | 91/198 [01:41<02:01,  1.14s/it]\n",
      " 46%|####6     | 92/198 [01:44<02:40,  1.52s/it]\n",
      " 47%|####6     | 93/198 [01:46<03:02,  1.74s/it]\n",
      " 47%|####7     | 94/198 [01:47<02:34,  1.49s/it]\n",
      " 48%|####7     | 95/198 [01:48<02:10,  1.27s/it]\n",
      " 48%|####8     | 96/198 [01:49<02:11,  1.29s/it]\n",
      " 49%|####8     | 97/198 [01:50<02:05,  1.24s/it]\n",
      " 49%|####9     | 98/198 [01:51<02:01,  1.21s/it]\n",
      " 50%|#####     | 99/198 [01:52<01:52,  1.14s/it]\n",
      " 51%|#####     | 100/198 [01:53<01:41,  1.03s/it]\n",
      " 51%|#####1    | 101/198 [01:54<01:42,  1.05s/it]\n",
      " 52%|#####1    | 102/198 [01:55<01:50,  1.15s/it]\n",
      " 52%|#####2    | 103/198 [01:56<01:38,  1.04s/it]\n",
      " 53%|#####2    | 104/198 [01:57<01:42,  1.09s/it]\n",
      " 53%|#####3    | 105/198 [01:59<02:01,  1.31s/it]\n",
      " 54%|#####3    | 106/198 [02:01<02:13,  1.46s/it]\n",
      " 54%|#####4    | 107/198 [02:02<02:09,  1.42s/it]\n",
      " 55%|#####4    | 108/198 [02:03<01:51,  1.24s/it]\n",
      " 55%|#####5    | 109/198 [02:04<01:48,  1.22s/it]\n",
      " 56%|#####5    | 110/198 [02:05<01:35,  1.09s/it]\n",
      " 56%|#####6    | 111/198 [02:06<01:28,  1.02s/it]\n",
      " 57%|#####6    | 112/198 [02:07<01:20,  1.06it/s]\n",
      " 57%|#####7    | 113/198 [02:08<01:25,  1.00s/it]\n",
      " 58%|#####7    | 114/198 [02:09<01:16,  1.10it/s]\n",
      " 58%|#####8    | 115/198 [02:09<01:13,  1.13it/s]\n",
      " 59%|#####8    | 116/198 [02:10<01:09,  1.18it/s]\n",
      " 59%|#####9    | 117/198 [02:12<01:20,  1.00it/s]\n",
      " 60%|#####9    | 118/198 [02:12<01:16,  1.04it/s]\n",
      " 60%|######    | 119/198 [02:14<01:24,  1.06s/it]\n",
      " 61%|######    | 120/198 [02:15<01:18,  1.00s/it]\n",
      " 61%|######1   | 121/198 [02:17<01:40,  1.31s/it]\n",
      " 62%|######1   | 122/198 [02:18<01:30,  1.19s/it]\n",
      " 62%|######2   | 123/198 [02:19<01:28,  1.18s/it]\n",
      " 63%|######2   | 124/198 [02:20<01:30,  1.22s/it]\n",
      " 63%|######3   | 125/198 [02:21<01:21,  1.12s/it]\n",
      " 64%|######3   | 126/198 [02:22<01:15,  1.05s/it]\n",
      " 64%|######4   | 127/198 [02:23<01:16,  1.08s/it]\n",
      " 65%|######4   | 128/198 [02:25<01:44,  1.49s/it]\n",
      " 65%|######5   | 129/198 [02:27<01:40,  1.46s/it]\n",
      " 66%|######5   | 130/198 [02:28<01:36,  1.42s/it]\n",
      " 66%|######6   | 131/198 [02:29<01:26,  1.29s/it]\n",
      " 67%|######6   | 132/198 [02:30<01:25,  1.29s/it]\n",
      " 67%|######7   | 133/198 [02:31<01:16,  1.18s/it]\n",
      " 68%|######7   | 134/198 [02:32<01:07,  1.06s/it]\n",
      " 68%|######8   | 135/198 [02:34<01:17,  1.23s/it]\n",
      " 69%|######8   | 136/198 [02:35<01:16,  1.23s/it]\n",
      " 69%|######9   | 137/198 [02:36<01:05,  1.07s/it]\n",
      " 70%|######9   | 138/198 [02:38<01:21,  1.36s/it]\n",
      " 70%|#######   | 139/198 [02:40<01:37,  1.66s/it]\n",
      " 71%|#######   | 140/198 [02:41<01:24,  1.45s/it]\n",
      " 71%|#######1  | 141/198 [02:42<01:20,  1.41s/it]\n",
      " 72%|#######1  | 142/198 [02:44<01:18,  1.40s/it]\n",
      " 72%|#######2  | 143/198 [02:44<01:06,  1.20s/it]\n",
      " 73%|#######2  | 144/198 [02:45<01:01,  1.13s/it]\n",
      " 73%|#######3  | 145/198 [02:48<01:20,  1.52s/it]\n",
      " 74%|#######3  | 146/198 [02:49<01:21,  1.57s/it]\n",
      " 74%|#######4  | 147/198 [02:51<01:16,  1.50s/it]\n",
      " 75%|#######4  | 148/198 [02:52<01:12,  1.45s/it]\n",
      " 75%|#######5  | 149/198 [02:53<01:03,  1.29s/it]\n",
      " 76%|#######5  | 150/198 [02:54<01:02,  1.30s/it]\n",
      " 76%|#######6  | 151/198 [02:55<00:56,  1.20s/it]\n",
      " 77%|#######6  | 152/198 [02:56<00:49,  1.08s/it]\n",
      " 77%|#######7  | 153/198 [02:58<00:55,  1.23s/it]\n",
      " 78%|#######7  | 154/198 [02:59<00:51,  1.16s/it]\n",
      " 78%|#######8  | 155/198 [03:00<00:45,  1.06s/it]\n",
      " 79%|#######8  | 156/198 [03:00<00:41,  1.02it/s]\n",
      " 79%|#######9  | 157/198 [03:01<00:41,  1.02s/it]\n",
      " 80%|#######9  | 158/198 [03:03<00:40,  1.02s/it]\n",
      " 80%|########  | 159/198 [03:03<00:37,  1.05it/s]\n",
      " 81%|########  | 160/198 [03:05<00:39,  1.05s/it]\n",
      " 81%|########1 | 161/198 [03:06<00:41,  1.13s/it]\n",
      " 82%|########1 | 162/198 [03:07<00:36,  1.00s/it]\n",
      " 82%|########2 | 163/198 [03:08<00:38,  1.10s/it]\n",
      " 83%|########2 | 164/198 [03:09<00:36,  1.08s/it]\n",
      " 83%|########3 | 165/198 [03:10<00:34,  1.03s/it]\n",
      " 84%|########3 | 166/198 [03:11<00:32,  1.03s/it]\n",
      " 84%|########4 | 167/198 [03:12<00:29,  1.04it/s]\n",
      " 85%|########4 | 168/198 [03:13<00:29,  1.02it/s]\n",
      " 85%|########5 | 169/198 [03:15<00:36,  1.25s/it]\n",
      " 86%|########5 | 170/198 [03:16<00:35,  1.25s/it]\n",
      " 86%|########6 | 171/198 [03:17<00:31,  1.18s/it]\n",
      " 87%|########6 | 172/198 [03:18<00:29,  1.13s/it]\n",
      " 87%|########7 | 173/198 [03:19<00:27,  1.09s/it]\n",
      " 88%|########7 | 174/198 [03:20<00:25,  1.05s/it]\n",
      " 88%|########8 | 175/198 [03:21<00:26,  1.16s/it]\n",
      " 89%|########8 | 176/198 [03:23<00:26,  1.20s/it]\n",
      " 89%|########9 | 177/198 [03:24<00:25,  1.20s/it]\n",
      " 90%|########9 | 178/198 [03:26<00:30,  1.55s/it]\n",
      " 90%|######### | 179/198 [03:27<00:27,  1.46s/it]\n",
      " 91%|######### | 180/198 [03:28<00:22,  1.26s/it]\n",
      " 91%|#########1| 181/198 [03:31<00:27,  1.63s/it]\n",
      " 92%|#########1| 182/198 [03:31<00:22,  1.38s/it]\n",
      " 92%|#########2| 183/198 [03:33<00:19,  1.28s/it]\n",
      " 93%|#########2| 184/198 [03:34<00:18,  1.29s/it]\n",
      " 93%|#########3| 185/198 [03:35<00:17,  1.37s/it]\n",
      " 94%|#########3| 186/198 [03:37<00:15,  1.30s/it]\n",
      " 94%|#########4| 187/198 [03:38<00:14,  1.31s/it]\n",
      " 95%|#########4| 188/198 [03:39<00:11,  1.15s/it]\n",
      " 95%|#########5| 189/198 [03:41<00:13,  1.49s/it]\n",
      " 96%|#########5| 190/198 [03:42<00:10,  1.27s/it]\n",
      " 96%|#########6| 191/198 [03:43<00:08,  1.27s/it]\n",
      " 97%|#########6| 192/198 [03:44<00:07,  1.22s/it]\n",
      " 97%|#########7| 193/198 [03:45<00:05,  1.10s/it]\n",
      " 98%|#########7| 194/198 [03:46<00:03,  1.01it/s]\n",
      " 98%|#########8| 195/198 [03:47<00:02,  1.03it/s]\n",
      " 99%|#########8| 196/198 [03:47<00:01,  1.11it/s]\n",
      " 99%|#########9| 197/198 [03:48<00:00,  1.01it/s]\n",
      "100%|##########| 198/198 [03:49<00:00,  1.19it/s]02/16/2022 01:06:06 - INFO - __main__ - Epoch 0: {'accuracy': 0.938}\n",
      "02/16/2022 01:06:29 - INFO - __main__ - Test-set evaluation: {'accuracy': 1.0}\n",
      "Configuration saved in out/tweet/roberta_version_3\\config.json\n",
      "Model weights saved in out/tweet/roberta_version_3\\pytorch_model.bin\n",
      "tokenizer config file saved in out/tweet/roberta_version_3\\tokenizer_config.json\n",
      "Special tokens file saved in out/tweet/roberta_version_3\\special_tokens_map.json\n",
      "\n",
      "100%|##########| 198/198 [04:40<00:00,  1.42s/it]\n"
     ]
    }
   ],
   "source": [
    "# Roberta version 3: same invocation as the version 2 cell, plus --custom_model and a\n",
    "# separate output directory (out/tweet/roberta_version_3).\n",
    "# For this run the log reports the implementation RobertaForSequenceClassificationCustomAlternative,\n",
    "# newly initialized classifier.* weights, and the message 'Freezing model weights'.\n",
    "# Settings: batch size 24 for both train and eval, max_length 128, learning rate 2e-5, 1 epoch.\n",
    "# NOTE(review): presumably --custom_model is what selects the alternative classifier - confirm in the script.\n",
    "# NOTE(review): no seed flag is passed although the classifier weights are newly initialized,\n",
    "# so a re-run may not reproduce the logged accuracies; consider pinning a seed if the script exposes one.\n",
    "!python run_glue_no_trainer.py \\\n",
    "  --model_name_or_path roberta-base \\\n",
    "  --train_file data/train.json  \\\n",
    "  --validation_file data/valid.json \\\n",
    "  --test_file data/test.json \\\n",
    "  --per_device_train_batch_size 24 \\\n",
    "  --per_device_eval_batch_size 24 \\\n",
    "  --max_length 128 \\\n",
    "  --freeze_model \\\n",
    "  --custom_model \\\n",
    "  --learning_rate 2e-5 \\\n",
    "  --num_train_epochs 1 \\\n",
    "  --output_dir out/tweet/roberta_version_3"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Roberta version 4"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "02/16/2022 01:06:31 - INFO - __main__ - Distributed environment: NO\n",
      "Num processes: 1\n",
      "Process index: 0\n",
      "Local process index: 0\n",
      "Device: cpu\n",
      "Use FP16 precision: False\n",
      "\n",
      "02/16/2022 01:06:32 - WARNING - datasets.builder - Using custom data configuration default-67c9d932a627b7b8\n",
      "02/16/2022 01:06:32 - WARNING - datasets.builder - Reusing dataset json (C:\\Users\\Foka\\.cache\\huggingface\\datasets\\json\\default-67c9d932a627b7b8\\0.0.0\\c2d554c3377ea79c7664b93dc65d0803b45e3279000f993c7bfd18937fd7f426)\n",
      "\n",
      "  0%|          | 0/3 [00:00<?, ?it/s]\n",
      "100%|##########| 3/3 [00:00<00:00, 1507.84it/s]\n",
      "loading configuration file https://huggingface.co/roberta-base/resolve/main/config.json from cache at C:\\Users\\Foka/.cache\\huggingface\\transformers\\733bade19e5f0ce98e6531021dd5180994bb2f7b8bd7e80c7968805834ba351e.35205c6cfc956461d8515139f0f8dd5d207a2f336c0c3a83b4bc8dca3518e37b\n",
      "Model config RobertaConfig {\n",
      "  \"architectures\": [\n",
      "    \"RobertaForMaskedLM\"\n",
      "  ],\n",
      "  \"attention_probs_dropout_prob\": 0.1,\n",
      "  \"bos_token_id\": 0,\n",
      "  \"classifier_dropout\": null,\n",
      "  \"eos_token_id\": 2,\n",
      "  \"hidden_act\": \"gelu\",\n",
      "  \"hidden_dropout_prob\": 0.1,\n",
      "  \"hidden_size\": 768,\n",
      "  \"initializer_range\": 0.02,\n",
      "  \"intermediate_size\": 3072,\n",
      "  \"layer_norm_eps\": 1e-05,\n",
      "  \"max_position_embeddings\": 514,\n",
      "  \"model_type\": \"roberta\",\n",
      "  \"num_attention_heads\": 12,\n",
      "  \"num_hidden_layers\": 12,\n",
      "  \"pad_token_id\": 1,\n",
      "  \"position_embedding_type\": \"absolute\",\n",
      "  \"transformers_version\": \"4.12.5\",\n",
      "  \"type_vocab_size\": 1,\n",
      "  \"use_cache\": true,\n",
      "  \"vocab_size\": 50265\n",
      "}\n",
      "\n",
      "Could not locate the tokenizer configuration file, will try to use the model config instead.\n",
      "loading configuration file https://huggingface.co/roberta-base/resolve/main/config.json from cache at C:\\Users\\Foka/.cache\\huggingface\\transformers\\733bade19e5f0ce98e6531021dd5180994bb2f7b8bd7e80c7968805834ba351e.35205c6cfc956461d8515139f0f8dd5d207a2f336c0c3a83b4bc8dca3518e37b\n",
      "Model config RobertaConfig {\n",
      "  \"architectures\": [\n",
      "    \"RobertaForMaskedLM\"\n",
      "  ],\n",
      "  \"attention_probs_dropout_prob\": 0.1,\n",
      "  \"bos_token_id\": 0,\n",
      "  \"classifier_dropout\": null,\n",
      "  \"eos_token_id\": 2,\n",
      "  \"hidden_act\": \"gelu\",\n",
      "  \"hidden_dropout_prob\": 0.1,\n",
      "  \"hidden_size\": 768,\n",
      "  \"initializer_range\": 0.02,\n",
      "  \"intermediate_size\": 3072,\n",
      "  \"layer_norm_eps\": 1e-05,\n",
      "  \"max_position_embeddings\": 514,\n",
      "  \"model_type\": \"roberta\",\n",
      "  \"num_attention_heads\": 12,\n",
      "  \"num_hidden_layers\": 12,\n",
      "  \"pad_token_id\": 1,\n",
      "  \"position_embedding_type\": \"absolute\",\n",
      "  \"transformers_version\": \"4.12.5\",\n",
      "  \"type_vocab_size\": 1,\n",
      "  \"use_cache\": true,\n",
      "  \"vocab_size\": 50265\n",
      "}\n",
      "\n",
      "loading file https://huggingface.co/roberta-base/resolve/main/vocab.json from cache at C:\\Users\\Foka/.cache\\huggingface\\transformers\\d3ccdbfeb9aaa747ef20432d4976c32ee3fa69663b379deb253ccfce2bb1fdc5.d67d6b367eb24ab43b08ad55e014cf254076934f71d832bbab9ad35644a375ab\n",
      "loading file https://huggingface.co/roberta-base/resolve/main/merges.txt from cache at C:\\Users\\Foka/.cache\\huggingface\\transformers\\cafdecc90fcab17011e12ac813dd574b4b3fea39da6dd817813efa010262ff3f.5d12962c5ee615a4c803841266e9c3be9a691a924f72d395d3a6c6c81157788b\n",
      "loading file https://huggingface.co/roberta-base/resolve/main/tokenizer.json from cache at C:\\Users\\Foka/.cache\\huggingface\\transformers\\d53fc0fa09b8342651efd4073d75e19617b3e51287c2a535becda5808a8db287.fc9576039592f026ad76a1c231b89aee8668488c671dfbe6616bab2ed298d730\n",
      "loading file https://huggingface.co/roberta-base/resolve/main/added_tokens.json from cache at None\n",
      "loading file https://huggingface.co/roberta-base/resolve/main/special_tokens_map.json from cache at None\n",
      "loading file https://huggingface.co/roberta-base/resolve/main/tokenizer_config.json from cache at None\n",
      "loading configuration file https://huggingface.co/roberta-base/resolve/main/config.json from cache at C:\\Users\\Foka/.cache\\huggingface\\transformers\\733bade19e5f0ce98e6531021dd5180994bb2f7b8bd7e80c7968805834ba351e.35205c6cfc956461d8515139f0f8dd5d207a2f336c0c3a83b4bc8dca3518e37b\n",
      "Model config RobertaConfig {\n",
      "  \"architectures\": [\n",
      "    \"RobertaForMaskedLM\"\n",
      "  ],\n",
      "  \"attention_probs_dropout_prob\": 0.1,\n",
      "  \"bos_token_id\": 0,\n",
      "  \"classifier_dropout\": null,\n",
      "  \"eos_token_id\": 2,\n",
      "  \"hidden_act\": \"gelu\",\n",
      "  \"hidden_dropout_prob\": 0.1,\n",
      "  \"hidden_size\": 768,\n",
      "  \"initializer_range\": 0.02,\n",
      "  \"intermediate_size\": 3072,\n",
      "  \"layer_norm_eps\": 1e-05,\n",
      "  \"max_position_embeddings\": 514,\n",
      "  \"model_type\": \"roberta\",\n",
      "  \"num_attention_heads\": 12,\n",
      "  \"num_hidden_layers\": 12,\n",
      "  \"pad_token_id\": 1,\n",
      "  \"position_embedding_type\": \"absolute\",\n",
      "  \"transformers_version\": \"4.12.5\",\n",
      "  \"type_vocab_size\": 1,\n",
      "  \"use_cache\": true,\n",
      "  \"vocab_size\": 50265\n",
      "}\n",
      "\n",
      "02/16/2022 01:06:38 - INFO - __main__ - Return hidden states from model: True\n",
      "02/16/2022 01:06:38 - INFO - __main__ - Using implementation from: RobertaForSequenceClassificationCustomAlternative\n",
      "loading weights file https://huggingface.co/roberta-base/resolve/main/pytorch_model.bin from cache at C:\\Users\\Foka/.cache\\huggingface\\transformers\\51ba668f7ff34e7cdfa9561e8361747738113878850a7d717dbc69de8683aaad.c7efaa30a0d80b2958b876969faa180e485944a849deee4ad482332de65365a7\n",
      "Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassificationCustomAlternative: ['lm_head.decoder.weight', 'roberta.pooler.dense.bias', 'lm_head.bias', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.bias', 'lm_head.dense.weight', 'roberta.pooler.dense.weight']\n",
      "- This IS expected if you are initializing RobertaForSequenceClassificationCustomAlternative from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).\n",
      "- This IS NOT expected if you are initializing RobertaForSequenceClassificationCustomAlternative from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n",
      "Some weights of RobertaForSequenceClassificationCustomAlternative were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense_1_hidden.weight', 'classifier.dense_1_hidden.bias', 'classifier.dense_1_input.bias', 'classifier.dense_2.bias', 'classifier.dense_2.weight', 'classifier.out_proj.weight', 'classifier.dense_1_input.weight', 'classifier.out_proj.bias']\n",
      "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n",
      "02/16/2022 01:06:40 - INFO - __main__ - Freezing model weights\n",
      "02/16/2022 01:06:40 - WARNING - datasets.arrow_dataset - Loading cached processed dataset at C:\\Users\\Foka\\.cache\\huggingface\\datasets\\json\\default-67c9d932a627b7b8\\0.0.0\\c2d554c3377ea79c7664b93dc65d0803b45e3279000f993c7bfd18937fd7f426\\cache-73165df4ba3ef6cf.arrow\n",
      "\n",
      "Running tokenizer on dataset:   0%|          | 0/1 [00:00<?, ?ba/s]\n",
      "Running tokenizer on dataset: 100%|##########| 1/1 [00:00<00:00, 31.33ba/s]\n",
      "02/16/2022 01:06:40 - WARNING - datasets.arrow_dataset - Loading cached processed dataset at C:\\Users\\Foka\\.cache\\huggingface\\datasets\\json\\default-67c9d932a627b7b8\\0.0.0\\c2d554c3377ea79c7664b93dc65d0803b45e3279000f993c7bfd18937fd7f426\\cache-015ce493f6b049f3.arrow\n",
      "02/16/2022 01:06:40 - INFO - __main__ - Sample 3979 of the training set: {'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'input_ids': [0, 1039, 12105, 787, 12105, 787, 12105, 787, 12105, 787, 12105, 45365, 5, 2526, 9, 84, 184, 1269, 4, 1437, 1437, 2], 'labels': 0}.\n",
      "02/16/2022 01:06:40 - INFO - __main__ - Sample 2415 of the training set: {'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'input_ids': [0, 10669, 99, 84, 247, 439, 149, 42, 94, 76, 7, 192, 82, 836, 10, 22, 27076, 113, 7, 5, 4773, 359, 3914, 131, 283, 259, 13, 960, 53, 1037, 1437, 1437, 2], 'labels': 0}.\n",
      "02/16/2022 01:06:40 - INFO - __main__ - Sample 2136 of the training set: {'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'input_ids': [0, 1039, 12105, 849, 41468, 1809, 4473, 20126, 849, 41468, 1809, 4742, 21929, 1809, 849, 41468, 1809, 119, 1350, 90, 428, 4759, 415, 596, 1437, 849, 31336, 28465, 16, 8266, 1437, 787, 12105, 2], 'labels': 1}.\n",
      "02/16/2022 01:06:41 - INFO - __main__ - ***** Running training *****\n",
      "02/16/2022 01:06:41 - INFO - __main__ -   Num examples = 4742\n",
      "02/16/2022 01:06:41 - INFO - __main__ -   Num Epochs = 1\n",
      "02/16/2022 01:06:41 - INFO - __main__ -   Instantaneous batch size per device = 24\n",
      "02/16/2022 01:06:41 - INFO - __main__ -   Total train batch size (w. parallel, distributed & accumulation) = 24\n",
      "02/16/2022 01:06:41 - INFO - __main__ -   Gradient Accumulation steps = 1\n",
      "02/16/2022 01:06:41 - INFO - __main__ -   Total optimization steps = 198\n",
      "\n",
      "  0%|          | 0/198 [00:00<?, ?it/s]\n",
      "  1%|          | 1/198 [00:02<07:48,  2.38s/it]\n",
      "  1%|1         | 2/198 [00:04<07:47,  2.39s/it]\n",
      "  2%|1         | 3/198 [00:06<06:30,  2.00s/it]\n",
      "  2%|2         | 4/198 [00:07<04:55,  1.52s/it]\n",
      "  3%|2         | 5/198 [00:08<04:20,  1.35s/it]\n",
      "  3%|3         | 6/198 [00:09<04:16,  1.34s/it]\n",
      "  4%|3         | 7/198 [00:10<04:14,  1.33s/it]\n",
      "  4%|4         | 8/198 [00:12<04:12,  1.33s/it]\n",
      "  5%|4         | 9/198 [00:13<03:49,  1.22s/it]\n",
      "  5%|5         | 10/198 [00:14<03:40,  1.17s/it]\n",
      "  6%|5         | 11/198 [00:15<03:24,  1.09s/it]\n",
      "  6%|6         | 12/198 [00:15<03:07,  1.01s/it]\n",
      "  7%|6         | 13/198 [00:16<03:09,  1.02s/it]\n",
      "  7%|7         | 14/198 [00:17<03:03,  1.00it/s]\n",
      "  8%|7         | 15/198 [00:18<02:56,  1.04it/s]\n",
      "  8%|8         | 16/198 [00:19<02:49,  1.08it/s]\n",
      "  9%|8         | 17/198 [00:20<02:56,  1.03it/s]\n",
      "  9%|9         | 18/198 [00:21<02:47,  1.07it/s]\n",
      " 10%|9         | 19/198 [00:22<02:53,  1.03it/s]\n",
      " 10%|#         | 20/198 [00:24<03:27,  1.16s/it]\n",
      " 11%|#         | 21/198 [00:24<03:01,  1.03s/it]\n",
      " 11%|#1        | 22/198 [00:26<03:34,  1.22s/it]\n",
      " 12%|#1        | 23/198 [00:27<03:20,  1.14s/it]\n",
      " 12%|#2        | 24/198 [00:28<03:08,  1.08s/it]\n",
      " 13%|#2        | 25/198 [00:29<03:17,  1.14s/it]\n",
      " 13%|#3        | 26/198 [00:31<03:34,  1.24s/it]\n",
      " 14%|#3        | 27/198 [00:32<03:38,  1.28s/it]\n",
      " 14%|#4        | 28/198 [00:33<03:16,  1.16s/it]\n",
      " 15%|#4        | 29/198 [00:34<03:02,  1.08s/it]\n",
      " 15%|#5        | 30/198 [00:35<02:55,  1.05s/it]\n",
      " 16%|#5        | 31/198 [00:36<02:43,  1.02it/s]\n",
      " 16%|#6        | 32/198 [00:37<02:48,  1.01s/it]\n",
      " 17%|#6        | 33/198 [00:38<02:51,  1.04s/it]\n",
      " 17%|#7        | 34/198 [00:39<02:37,  1.04it/s]\n",
      " 18%|#7        | 35/198 [00:39<02:29,  1.09it/s]\n",
      " 18%|#8        | 36/198 [00:41<03:20,  1.23s/it]\n",
      " 19%|#8        | 37/198 [00:44<04:16,  1.60s/it]\n",
      " 19%|#9        | 38/198 [00:45<03:42,  1.39s/it]\n",
      " 20%|#9        | 39/198 [00:46<03:35,  1.36s/it]\n",
      " 20%|##        | 40/198 [00:48<03:39,  1.39s/it]\n",
      " 21%|##        | 41/198 [00:48<03:16,  1.25s/it]\n",
      " 21%|##1       | 42/198 [00:50<03:28,  1.34s/it]\n",
      " 22%|##1       | 43/198 [00:52<03:57,  1.53s/it]\n",
      " 22%|##2       | 44/198 [00:53<03:30,  1.37s/it]\n",
      " 23%|##2       | 45/198 [00:54<03:05,  1.21s/it]\n",
      " 23%|##3       | 46/198 [00:55<02:46,  1.09s/it]\n",
      " 24%|##3       | 47/198 [00:55<02:30,  1.00it/s]\n",
      " 24%|##4       | 48/198 [00:56<02:20,  1.07it/s]\n",
      " 25%|##4       | 49/198 [00:57<02:11,  1.13it/s]\n",
      " 25%|##5       | 50/198 [00:58<02:15,  1.09it/s]\n",
      " 26%|##5       | 51/198 [00:59<02:09,  1.13it/s]\n",
      " 26%|##6       | 52/198 [01:00<02:24,  1.01it/s]\n",
      " 27%|##6       | 53/198 [01:01<02:11,  1.10it/s]\n",
      " 27%|##7       | 54/198 [01:02<02:15,  1.06it/s]\n",
      " 28%|##7       | 55/198 [01:03<02:28,  1.04s/it]\n",
      " 28%|##8       | 56/198 [01:04<02:14,  1.06it/s]\n",
      " 29%|##8       | 57/198 [01:05<02:30,  1.07s/it]\n",
      " 29%|##9       | 58/198 [01:06<02:24,  1.03s/it]\n",
      " 30%|##9       | 59/198 [01:07<02:35,  1.12s/it]\n",
      " 30%|###       | 60/198 [01:08<02:27,  1.07s/it]\n",
      " 31%|###       | 61/198 [01:10<02:35,  1.14s/it]\n",
      " 31%|###1      | 62/198 [01:11<02:43,  1.20s/it]\n",
      " 32%|###1      | 63/198 [01:12<02:48,  1.25s/it]\n",
      " 32%|###2      | 64/198 [01:13<02:36,  1.17s/it]\n",
      " 33%|###2      | 65/198 [01:14<02:15,  1.02s/it]\n",
      " 33%|###3      | 66/198 [01:16<02:40,  1.22s/it]\n",
      " 34%|###3      | 67/198 [01:18<03:20,  1.53s/it]\n",
      " 34%|###4      | 68/198 [01:19<02:54,  1.35s/it]\n",
      " 35%|###4      | 69/198 [01:20<03:02,  1.41s/it]\n",
      " 35%|###5      | 70/198 [01:21<02:44,  1.29s/it]\n",
      " 36%|###5      | 71/198 [01:22<02:27,  1.16s/it]\n",
      " 36%|###6      | 72/198 [01:23<02:23,  1.14s/it]\n",
      " 37%|###6      | 73/198 [01:25<02:25,  1.16s/it]\n",
      " 37%|###7      | 74/198 [01:26<02:32,  1.23s/it]\n",
      " 38%|###7      | 75/198 [01:27<02:15,  1.10s/it]\n",
      " 38%|###8      | 76/198 [01:29<03:03,  1.50s/it]\n",
      " 39%|###8      | 77/198 [01:30<02:47,  1.39s/it]\n",
      " 39%|###9      | 78/198 [01:31<02:28,  1.24s/it]\n",
      " 40%|###9      | 79/198 [01:32<02:12,  1.11s/it]\n",
      " 40%|####      | 80/198 [01:33<02:08,  1.09s/it]\n",
      " 41%|####      | 81/198 [01:34<01:56,  1.01it/s]\n",
      " 41%|####1     | 82/198 [01:35<01:46,  1.09it/s]\n",
      " 42%|####1     | 83/198 [01:36<01:52,  1.02it/s]\n",
      " 42%|####2     | 84/198 [01:36<01:46,  1.07it/s]\n",
      " 43%|####2     | 85/198 [01:37<01:41,  1.11it/s]\n",
      " 43%|####3     | 86/198 [01:38<01:38,  1.14it/s]\n",
      " 44%|####3     | 87/198 [01:41<02:30,  1.35s/it]\n",
      " 44%|####4     | 88/198 [01:42<02:35,  1.41s/it]\n",
      " 45%|####4     | 89/198 [01:43<02:15,  1.25s/it]\n",
      " 45%|####5     | 90/198 [01:44<01:59,  1.10s/it]\n",
      " 46%|####5     | 91/198 [01:46<02:41,  1.51s/it]\n",
      " 46%|####6     | 92/198 [01:47<02:21,  1.34s/it]\n",
      " 47%|####6     | 93/198 [01:48<02:09,  1.23s/it]\n",
      " 47%|####7     | 94/198 [01:49<02:01,  1.17s/it]\n",
      " 48%|####7     | 95/198 [01:50<01:52,  1.09s/it]\n",
      " 48%|####8     | 96/198 [01:52<02:05,  1.23s/it]\n",
      " 49%|####8     | 97/198 [01:52<01:51,  1.10s/it]\n",
      " 49%|####9     | 98/198 [01:53<01:41,  1.02s/it]\n",
      " 50%|#####     | 99/198 [01:54<01:38,  1.01it/s]\n",
      " 51%|#####     | 100/198 [01:55<01:35,  1.03it/s]\n",
      " 51%|#####1    | 101/198 [01:56<01:45,  1.09s/it]\n",
      " 52%|#####1    | 102/198 [01:58<01:47,  1.12s/it]\n",
      " 52%|#####2    | 103/198 [01:58<01:38,  1.03s/it]\n",
      " 53%|#####2    | 104/198 [02:00<01:45,  1.12s/it]\n",
      " 53%|#####3    | 105/198 [02:01<01:54,  1.23s/it]\n",
      " 54%|#####3    | 106/198 [02:02<01:48,  1.18s/it]\n",
      " 54%|#####4    | 107/198 [02:03<01:41,  1.12s/it]\n",
      " 55%|#####4    | 108/198 [02:04<01:30,  1.00s/it]\n",
      " 55%|#####5    | 109/198 [02:05<01:26,  1.03it/s]\n",
      " 56%|#####5    | 110/198 [02:06<01:34,  1.07s/it]\n",
      " 56%|#####6    | 111/198 [02:07<01:33,  1.08s/it]\n",
      " 57%|#####6    | 112/198 [02:08<01:33,  1.09s/it]\n",
      " 57%|#####7    | 113/198 [02:09<01:26,  1.02s/it]\n",
      " 58%|#####7    | 114/198 [02:11<01:34,  1.12s/it]\n",
      " 58%|#####8    | 115/198 [02:11<01:25,  1.03s/it]\n",
      " 59%|#####8    | 116/198 [02:13<01:31,  1.12s/it]\n",
      " 59%|#####9    | 117/198 [02:14<01:34,  1.16s/it]\n",
      " 60%|#####9    | 118/198 [02:15<01:28,  1.11s/it]\n",
      " 60%|######    | 119/198 [02:16<01:31,  1.16s/it]\n",
      " 61%|######    | 120/198 [02:18<01:34,  1.21s/it]\n",
      " 61%|######1   | 121/198 [02:19<01:26,  1.12s/it]\n",
      " 62%|######1   | 122/198 [02:20<01:22,  1.09s/it]\n",
      " 62%|######2   | 123/198 [02:21<01:26,  1.15s/it]\n",
      " 63%|######2   | 124/198 [02:22<01:28,  1.20s/it]\n",
      " 63%|######3   | 125/198 [02:25<01:55,  1.59s/it]\n",
      " 64%|######3   | 126/198 [02:27<02:00,  1.67s/it]\n",
      " 64%|######4   | 127/198 [02:27<01:37,  1.38s/it]\n",
      " 65%|######4   | 128/198 [02:29<01:35,  1.37s/it]\n",
      " 65%|######5   | 129/198 [02:30<01:33,  1.35s/it]\n",
      " 66%|######5   | 130/198 [02:31<01:21,  1.21s/it]\n",
      " 66%|######6   | 131/198 [02:32<01:11,  1.06s/it]\n",
      " 67%|######6   | 132/198 [02:32<01:07,  1.02s/it]\n",
      " 67%|######7   | 133/198 [02:34<01:16,  1.17s/it]\n",
      " 68%|######7   | 134/198 [02:35<01:13,  1.14s/it]\n",
      " 68%|######8   | 135/198 [02:37<01:28,  1.40s/it]\n",
      " 69%|######8   | 136/198 [02:38<01:15,  1.22s/it]\n",
      " 69%|######9   | 137/198 [02:39<01:17,  1.27s/it]\n",
      " 70%|######9   | 138/198 [02:40<01:14,  1.24s/it]\n",
      " 70%|#######   | 139/198 [02:42<01:13,  1.25s/it]\n",
      " 71%|#######   | 140/198 [02:44<01:31,  1.58s/it]\n",
      " 71%|#######1  | 141/198 [02:45<01:15,  1.33s/it]\n",
      " 72%|#######1  | 142/198 [02:46<01:11,  1.28s/it]\n",
      " 72%|#######2  | 143/198 [02:48<01:28,  1.60s/it]\n",
      " 73%|#######2  | 144/198 [02:49<01:16,  1.42s/it]\n",
      " 73%|#######3  | 145/198 [02:51<01:14,  1.40s/it]\n",
      " 74%|#######3  | 146/198 [02:51<01:03,  1.23s/it]\n",
      " 74%|#######4  | 147/198 [02:52<00:55,  1.09s/it]\n",
      " 75%|#######4  | 148/198 [02:53<00:51,  1.02s/it]\n",
      " 75%|#######5  | 149/198 [02:54<00:49,  1.01s/it]\n",
      " 76%|#######5  | 150/198 [02:55<00:46,  1.04it/s]\n",
      " 76%|#######6  | 151/198 [02:56<00:44,  1.07it/s]\n",
      " 77%|#######6  | 152/198 [02:57<00:48,  1.05s/it]\n",
      " 77%|#######7  | 153/198 [02:58<00:49,  1.11s/it]\n",
      " 78%|#######7  | 154/198 [02:59<00:44,  1.01s/it]\n",
      " 78%|#######8  | 155/198 [03:00<00:41,  1.04it/s]\n",
      " 79%|#######8  | 156/198 [03:01<00:44,  1.06s/it]\n",
      " 79%|#######9  | 157/198 [03:02<00:40,  1.02it/s]\n",
      " 80%|#######9  | 158/198 [03:03<00:40,  1.01s/it]\n",
      " 80%|########  | 159/198 [03:04<00:43,  1.10s/it]\n",
      " 81%|########  | 160/198 [03:06<00:42,  1.11s/it]\n",
      " 81%|########1 | 161/198 [03:07<00:38,  1.04s/it]\n",
      " 82%|########1 | 162/198 [03:08<00:38,  1.06s/it]\n",
      " 82%|########2 | 163/198 [03:09<00:45,  1.31s/it]\n",
      " 83%|########2 | 164/198 [03:10<00:40,  1.19s/it]\n",
      " 83%|########3 | 165/198 [03:11<00:36,  1.10s/it]\n",
      " 84%|########3 | 166/198 [03:12<00:32,  1.01s/it]\n",
      " 84%|########4 | 167/198 [03:13<00:33,  1.07s/it]\n",
      " 85%|########4 | 168/198 [03:15<00:34,  1.14s/it]\n",
      " 85%|########5 | 169/198 [03:16<00:31,  1.10s/it]\n",
      " 86%|########5 | 170/198 [03:17<00:34,  1.22s/it]\n",
      " 86%|########6 | 171/198 [03:18<00:33,  1.25s/it]\n",
      " 87%|########6 | 172/198 [03:19<00:30,  1.16s/it]\n",
      " 87%|########7 | 173/198 [03:21<00:35,  1.43s/it]\n",
      " 88%|########7 | 174/198 [03:23<00:33,  1.39s/it]\n",
      " 88%|########8 | 175/198 [03:24<00:33,  1.46s/it]\n",
      " 89%|########8 | 176/198 [03:26<00:31,  1.41s/it]\n",
      " 89%|########9 | 177/198 [03:27<00:29,  1.40s/it]\n",
      " 90%|########9 | 178/198 [03:28<00:27,  1.37s/it]\n",
      " 90%|######### | 179/198 [03:30<00:25,  1.32s/it]\n",
      " 91%|######### | 180/198 [03:30<00:21,  1.17s/it]\n",
      " 91%|#########1| 181/198 [03:31<00:18,  1.10s/it]\n",
      " 92%|#########1| 182/198 [03:33<00:18,  1.13s/it]\n",
      " 92%|#########2| 183/198 [03:33<00:15,  1.05s/it]\n",
      " 93%|#########2| 184/198 [03:35<00:16,  1.15s/it]\n",
      " 93%|#########3| 185/198 [03:36<00:13,  1.06s/it]\n",
      " 94%|#########3| 186/198 [03:37<00:13,  1.15s/it]\n",
      " 94%|#########4| 187/198 [03:38<00:11,  1.03s/it]\n",
      " 95%|#########4| 188/198 [03:39<00:10,  1.03s/it]\n",
      " 95%|#########5| 189/198 [03:40<00:09,  1.02s/it]\n",
      " 96%|#########5| 190/198 [03:41<00:08,  1.10s/it]\n",
      " 96%|#########6| 191/198 [03:42<00:08,  1.15s/it]\n",
      " 97%|#########6| 192/198 [03:44<00:07,  1.21s/it]\n",
      " 97%|#########7| 193/198 [03:45<00:05,  1.18s/it]\n",
      " 98%|#########7| 194/198 [03:46<00:04,  1.08s/it]\n",
      " 98%|#########8| 195/198 [03:46<00:02,  1.00it/s]\n",
      " 99%|#########8| 196/198 [03:47<00:01,  1.04it/s]\n",
      " 99%|#########9| 197/198 [03:48<00:00,  1.05it/s]\n",
      "100%|##########| 198/198 [03:49<00:00,  1.27it/s]02/16/2022 01:10:58 - INFO - __main__ - Epoch 0: {'accuracy': 0.938}\n",
      "02/16/2022 01:11:22 - INFO - __main__ - Test-set evaluation: {'accuracy': 1.0}\n",
      "Configuration saved in out/tweet/roberta_version_4\\config.json\n",
      "Model weights saved in out/tweet/roberta_version_4\\pytorch_model.bin\n",
      "tokenizer config file saved in out/tweet/roberta_version_4\\tokenizer_config.json\n",
      "Special tokens file saved in out/tweet/roberta_version_4\\special_tokens_map.json\n",
      "\n",
      "100%|##########| 198/198 [04:40<00:00,  1.42s/it]\n"
     ]
    }
   ],
   "source": [
    "!python run_glue_no_trainer.py \\\n",
    "  --model_name_or_path=roberta-base \\\n",
    "  --train_file=data/train.json \\\n",
    "  --validation_file=data/valid.json \\\n",
    "  --test_file=data/test.json \\\n",
    "  --per_device_train_batch_size=24 \\\n",
    "  --per_device_eval_batch_size=24 \\\n",
    "  --max_length=128 \\\n",
    "  --freeze_model \\\n",
    "  --custom_model \\\n",
    "  --return_hidden_states \\\n",
    "  --learning_rate=2e-5 \\\n",
    "  --num_train_epochs=1 \\\n",
    "  --output_dir=out/tweet/roberta_version_4"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# T5"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "02/17/2022 17:13:52 - INFO - __main__ - Distributed environment: NO\n",
      "Num processes: 1\n",
      "Process index: 0\n",
      "Local process index: 0\n",
      "Device: cpu\n",
      "Use FP16 precision: False\n",
      "\n",
      "02/17/2022 17:13:53 - WARNING - datasets.builder - Using custom data configuration default-c1907d9305fb2fbb\n",
      "02/17/2022 17:13:53 - WARNING - datasets.builder - Reusing dataset json (C:\\Users\\Foka\\.cache\\huggingface\\datasets\\json\\default-c1907d9305fb2fbb\\0.0.0\\c2d554c3377ea79c7664b93dc65d0803b45e3279000f993c7bfd18937fd7f426)\n",
      "\n",
      "  0%|          | 0/3 [00:00<?, ?it/s]\n",
      "100%|##########| 3/3 [00:00<00:00, 143.23it/s]\n",
      "loading configuration file https://huggingface.co/t5-small/resolve/main/config.json from cache at C:\\Users\\Foka/.cache\\huggingface\\transformers\\fe501e8fd6425b8ec93df37767fcce78ce626e34cc5edc859c662350cf712e41.406701565c0afd9899544c1cb8b93185a76f00b31e5ce7f6e18bbaef02241985\n",
      "Model config T5Config {\n",
      "  \"architectures\": [\n",
      "    \"T5WithLMHeadModel\"\n",
      "  ],\n",
      "  \"d_ff\": 2048,\n",
      "  \"d_kv\": 64,\n",
      "  \"d_model\": 512,\n",
      "  \"decoder_start_token_id\": 0,\n",
      "  \"dropout_rate\": 0.1,\n",
      "  \"eos_token_id\": 1,\n",
      "  \"feed_forward_proj\": \"relu\",\n",
      "  \"initializer_factor\": 1.0,\n",
      "  \"is_encoder_decoder\": true,\n",
      "  \"layer_norm_epsilon\": 1e-06,\n",
      "  \"model_type\": \"t5\",\n",
      "  \"n_positions\": 512,\n",
      "  \"num_decoder_layers\": 6,\n",
      "  \"num_heads\": 8,\n",
      "  \"num_layers\": 6,\n",
      "  \"output_past\": true,\n",
      "  \"pad_token_id\": 0,\n",
      "  \"relative_attention_num_buckets\": 32,\n",
      "  \"task_specific_params\": {\n",
      "    \"summarization\": {\n",
      "      \"early_stopping\": true,\n",
      "      \"length_penalty\": 2.0,\n",
      "      \"max_length\": 200,\n",
      "      \"min_length\": 30,\n",
      "      \"no_repeat_ngram_size\": 3,\n",
      "      \"num_beams\": 4,\n",
      "      \"prefix\": \"summarize: \"\n",
      "    },\n",
      "    \"translation_en_to_de\": {\n",
      "      \"early_stopping\": true,\n",
      "      \"max_length\": 300,\n",
      "      \"num_beams\": 4,\n",
      "      \"prefix\": \"translate English to German: \"\n",
      "    },\n",
      "    \"translation_en_to_fr\": {\n",
      "      \"early_stopping\": true,\n",
      "      \"max_length\": 300,\n",
      "      \"num_beams\": 4,\n",
      "      \"prefix\": \"translate English to French: \"\n",
      "    },\n",
      "    \"translation_en_to_ro\": {\n",
      "      \"early_stopping\": true,\n",
      "      \"max_length\": 300,\n",
      "      \"num_beams\": 4,\n",
      "      \"prefix\": \"translate English to Romanian: \"\n",
      "    }\n",
      "  },\n",
      "  \"transformers_version\": \"4.12.5\",\n",
      "  \"use_cache\": true,\n",
      "  \"vocab_size\": 32128\n",
      "}\n",
      "\n",
      "Could not locate the tokenizer configuration file, will try to use the model config instead.\n",
      "loading configuration file https://huggingface.co/t5-small/resolve/main/config.json from cache at C:\\Users\\Foka/.cache\\huggingface\\transformers\\fe501e8fd6425b8ec93df37767fcce78ce626e34cc5edc859c662350cf712e41.406701565c0afd9899544c1cb8b93185a76f00b31e5ce7f6e18bbaef02241985\n",
      "Model config T5Config {\n",
      "  \"architectures\": [\n",
      "    \"T5WithLMHeadModel\"\n",
      "  ],\n",
      "  \"d_ff\": 2048,\n",
      "  \"d_kv\": 64,\n",
      "  \"d_model\": 512,\n",
      "  \"decoder_start_token_id\": 0,\n",
      "  \"dropout_rate\": 0.1,\n",
      "  \"eos_token_id\": 1,\n",
      "  \"feed_forward_proj\": \"relu\",\n",
      "  \"initializer_factor\": 1.0,\n",
      "  \"is_encoder_decoder\": true,\n",
      "  \"layer_norm_epsilon\": 1e-06,\n",
      "  \"model_type\": \"t5\",\n",
      "  \"n_positions\": 512,\n",
      "  \"num_decoder_layers\": 6,\n",
      "  \"num_heads\": 8,\n",
      "  \"num_layers\": 6,\n",
      "  \"output_past\": true,\n",
      "  \"pad_token_id\": 0,\n",
      "  \"relative_attention_num_buckets\": 32,\n",
      "  \"task_specific_params\": {\n",
      "    \"summarization\": {\n",
      "      \"early_stopping\": true,\n",
      "      \"length_penalty\": 2.0,\n",
      "      \"max_length\": 200,\n",
      "      \"min_length\": 30,\n",
      "      \"no_repeat_ngram_size\": 3,\n",
      "      \"num_beams\": 4,\n",
      "      \"prefix\": \"summarize: \"\n",
      "    },\n",
      "    \"translation_en_to_de\": {\n",
      "      \"early_stopping\": true,\n",
      "      \"max_length\": 300,\n",
      "      \"num_beams\": 4,\n",
      "      \"prefix\": \"translate English to German: \"\n",
      "    },\n",
      "    \"translation_en_to_fr\": {\n",
      "      \"early_stopping\": true,\n",
      "      \"max_length\": 300,\n",
      "      \"num_beams\": 4,\n",
      "      \"prefix\": \"translate English to French: \"\n",
      "    },\n",
      "    \"translation_en_to_ro\": {\n",
      "      \"early_stopping\": true,\n",
      "      \"max_length\": 300,\n",
      "      \"num_beams\": 4,\n",
      "      \"prefix\": \"translate English to Romanian: \"\n",
      "    }\n",
      "  },\n",
      "  \"transformers_version\": \"4.12.5\",\n",
      "  \"use_cache\": true,\n",
      "  \"vocab_size\": 32128\n",
      "}\n",
      "\n",
      "loading file https://huggingface.co/t5-small/resolve/main/spiece.model from cache at C:\\Users\\Foka/.cache\\huggingface\\transformers\\65fc04e21f45f61430aea0c4fedffac16a4d20d78b8e6601d8d996ebefefecd2.3b69006860e7b5d0a63ffdddc01ddcd6b7c318a6f4fd793596552c741734c62d\n",
      "loading file https://huggingface.co/t5-small/resolve/main/tokenizer.json from cache at C:\\Users\\Foka/.cache\\huggingface\\transformers\\06779097c78e12f47ef67ecb728810c2ae757ee0a9efe9390c6419783d99382d.8627f1bd5d270a9fd2e5a51c8bec3223896587cc3cfe13edeabb0992ab43c529\n",
      "loading file https://huggingface.co/t5-small/resolve/main/added_tokens.json from cache at None\n",
      "loading file https://huggingface.co/t5-small/resolve/main/special_tokens_map.json from cache at None\n",
      "loading file https://huggingface.co/t5-small/resolve/main/tokenizer_config.json from cache at None\n",
      "loading configuration file https://huggingface.co/t5-small/resolve/main/config.json from cache at C:\\Users\\Foka/.cache\\huggingface\\transformers\\fe501e8fd6425b8ec93df37767fcce78ce626e34cc5edc859c662350cf712e41.406701565c0afd9899544c1cb8b93185a76f00b31e5ce7f6e18bbaef02241985\n",
      "Model config T5Config {\n",
      "  \"architectures\": [\n",
      "    \"T5WithLMHeadModel\"\n",
      "  ],\n",
      "  \"d_ff\": 2048,\n",
      "  \"d_kv\": 64,\n",
      "  \"d_model\": 512,\n",
      "  \"decoder_start_token_id\": 0,\n",
      "  \"dropout_rate\": 0.1,\n",
      "  \"eos_token_id\": 1,\n",
      "  \"feed_forward_proj\": \"relu\",\n",
      "  \"initializer_factor\": 1.0,\n",
      "  \"is_encoder_decoder\": true,\n",
      "  \"layer_norm_epsilon\": 1e-06,\n",
      "  \"model_type\": \"t5\",\n",
      "  \"n_positions\": 512,\n",
      "  \"num_decoder_layers\": 6,\n",
      "  \"num_heads\": 8,\n",
      "  \"num_layers\": 6,\n",
      "  \"output_past\": true,\n",
      "  \"pad_token_id\": 0,\n",
      "  \"relative_attention_num_buckets\": 32,\n",
      "  \"task_specific_params\": {\n",
      "    \"summarization\": {\n",
      "      \"early_stopping\": true,\n",
      "      \"length_penalty\": 2.0,\n",
      "      \"max_length\": 200,\n",
      "      \"min_length\": 30,\n",
      "      \"no_repeat_ngram_size\": 3,\n",
      "      \"num_beams\": 4,\n",
      "      \"prefix\": \"summarize: \"\n",
      "    },\n",
      "    \"translation_en_to_de\": {\n",
      "      \"early_stopping\": true,\n",
      "      \"max_length\": 300,\n",
      "      \"num_beams\": 4,\n",
      "      \"prefix\": \"translate English to German: \"\n",
      "    },\n",
      "    \"translation_en_to_fr\": {\n",
      "      \"early_stopping\": true,\n",
      "      \"max_length\": 300,\n",
      "      \"num_beams\": 4,\n",
      "      \"prefix\": \"translate English to French: \"\n",
      "    },\n",
      "    \"translation_en_to_ro\": {\n",
      "      \"early_stopping\": true,\n",
      "      \"max_length\": 300,\n",
      "      \"num_beams\": 4,\n",
      "      \"prefix\": \"translate English to Romanian: \"\n",
      "    }\n",
      "  },\n",
      "  \"transformers_version\": \"4.12.5\",\n",
      "  \"use_cache\": true,\n",
      "  \"vocab_size\": 32128\n",
      "}\n",
      "\n",
      "loading weights file https://huggingface.co/t5-small/resolve/main/pytorch_model.bin from cache at C:\\Users\\Foka/.cache\\huggingface\\transformers\\fee5a3a0ae379232608b6eed45d2d7a0d2966b9683728838412caccc41b4b0ed.ddacdc89ec88482db20c676f0861a336f3d0409f94748c209847b49529d73885\n",
      "All model checkpoint weights were used when initializing T5ForConditionalGeneration.\n",
      "\n",
      "All the weights of T5ForConditionalGeneration were initialized from the model checkpoint at t5-small.\n",
      "If your task is similar to the task the model of the checkpoint was trained on, you can already use T5ForConditionalGeneration for predictions without further training.\n",
      "02/17/2022 17:14:00 - INFO - __main__ - Using translation prefix: \"tweet classification: \"\n",
      "\n",
      "Running tokenizer on dataset:   0%|          | 0/5 [00:00<?, ?ba/s]\n",
      "Running tokenizer on dataset:  60%|######    | 3/5 [00:00<00:00, 28.92ba/s]\n",
      "Running tokenizer on dataset: 100%|##########| 5/5 [00:00<00:00, 32.34ba/s]\n",
      "\n",
      "Running tokenizer on dataset:   0%|          | 0/1 [00:00<?, ?ba/s]\n",
      "Running tokenizer on dataset: 100%|##########| 1/1 [00:00<00:00, 66.84ba/s]\n",
      "\n",
      "Running tokenizer on dataset:   0%|          | 0/1 [00:00<?, ?ba/s]\n",
      "Running tokenizer on dataset: 100%|##########| 1/1 [00:00<00:00, 77.13ba/s]\n",
      "02/17/2022 17:14:00 - INFO - __main__ - Sample 2469 of the training set: {'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'input_ids': [10657, 13774, 10, 62, 33, 1095, 385, 151, 7, 3, 2, 1095, 1024, 9632, 151, 1713, 9229, 324, 1713, 2138, 1713, 19699, 9229, 324, 1439, 2, 1], 'labels': [150, 5591, 1]}.\n",
      "02/17/2022 17:14:00 - INFO - __main__ - Sample 3112 of the training set: {'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'input_ids': [10657, 13774, 10, 175, 3075, 56, 129, 25, 2787, 21, 8, 647, 1439, 2, 1713, 3470, 1713, 28984, 1713, 89, 76, 2693, 1713, 14814, 1], 'labels': [150, 5591, 1]}.\n",
      "02/17/2022 17:14:00 - INFO - __main__ - Sample 1243 of the training set: {'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'input_ids': [10657, 13774, 10, 3320, 10041, 125, 31, 7, 8, 1750, 344, 3, 9, 528, 210, 11, 3, 9, 6871, 58, 3, 9, 6871, 744, 31, 17, 3, 7, 11763, 16, 8, 4836, 5, 10802, 7, 1713, 1924, 210, 1273, 1927, 1050, 1439, 2, 1], 'labels': [5591, 1]}.\n",
      "\n",
      "Downloading:   0%|          | 0.00/2.37k [00:00<?, ?B/s]\n",
      "Downloading: 5.67kB [00:00, 1.42MB/s]                   \n",
      "02/17/2022 17:14:02 - INFO - __main__ - ***** Running training *****\n",
      "02/17/2022 17:14:02 - INFO - __main__ -   Num examples = 4742\n",
      "02/17/2022 17:14:02 - INFO - __main__ -   Num Epochs = 1\n",
      "02/17/2022 17:14:02 - INFO - __main__ -   Instantaneous batch size per device = 16\n",
      "02/17/2022 17:14:02 - INFO - __main__ -   Total train batch size (w. parallel, distributed & accumulation) = 16\n",
      "02/17/2022 17:14:02 - INFO - __main__ -   Gradient Accumulation steps = 1\n",
      "02/17/2022 17:14:02 - INFO - __main__ -   Total optimization steps = 297\n",
      "\n",
      "  0%|          | 0/297 [00:00<?, ?it/s]\n",
      "  0%|          | 1/297 [00:00<04:27,  1.11it/s]\n",
      "  1%|          | 2/297 [00:01<04:16,  1.15it/s]\n",
      "  1%|1         | 3/297 [00:02<04:22,  1.12it/s]\n",
      "  1%|1         | 4/297 [00:03<04:21,  1.12it/s]\n",
      "  2%|1         | 5/297 [00:04<04:51,  1.00it/s]\n",
      "  2%|2         | 6/297 [00:05<04:41,  1.04it/s]\n",
      "  2%|2         | 7/297 [00:06<04:35,  1.05it/s]\n",
      "  3%|2         | 8/297 [00:07<04:21,  1.10it/s]\n",
      "  3%|3         | 9/297 [00:08<04:21,  1.10it/s]\n",
      "  3%|3         | 10/297 [00:09<04:20,  1.10it/s]\n",
      "  4%|3         | 11/297 [00:10<04:14,  1.12it/s]\n",
      "  4%|4         | 12/297 [00:11<04:19,  1.10it/s]\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "  4%|4         | 13/297 [00:11<04:15,  1.11it/s]\n",
      "  5%|4         | 14/297 [00:12<04:11,  1.12it/s]\n",
      "  5%|5         | 15/297 [00:13<04:07,  1.14it/s]\n",
      "  5%|5         | 16/297 [00:14<04:14,  1.10it/s]\n",
      "  6%|5         | 17/297 [00:15<04:07,  1.13it/s]\n",
      "  6%|6         | 18/297 [00:16<04:10,  1.11it/s]\n",
      "  6%|6         | 19/297 [00:17<04:22,  1.06it/s]\n",
      "  7%|6         | 20/297 [00:18<04:17,  1.07it/s]\n",
      "  7%|7         | 21/297 [00:19<04:19,  1.07it/s]\n",
      "  7%|7         | 22/297 [00:20<04:12,  1.09it/s]\n",
      "  8%|7         | 23/297 [00:20<04:03,  1.12it/s]\n",
      "  8%|8         | 24/297 [00:21<04:01,  1.13it/s]\n",
      "  8%|8         | 25/297 [00:22<04:02,  1.12it/s]\n",
      "  9%|8         | 26/297 [00:23<04:05,  1.10it/s]\n",
      "  9%|9         | 27/297 [00:24<04:01,  1.12it/s]\n",
      "  9%|9         | 28/297 [00:25<03:52,  1.16it/s]\n",
      " 10%|9         | 29/297 [00:26<03:49,  1.17it/s]\n",
      " 10%|#         | 30/297 [00:27<03:52,  1.15it/s]\n",
      " 10%|#         | 31/297 [00:27<03:43,  1.19it/s]\n",
      " 11%|#         | 32/297 [00:28<03:35,  1.23it/s]\n",
      " 11%|#1        | 33/297 [00:29<03:35,  1.23it/s]\n",
      " 11%|#1        | 34/297 [00:30<03:30,  1.25it/s]\n",
      " 12%|#1        | 35/297 [00:30<03:31,  1.24it/s]\n",
      " 12%|#2        | 36/297 [00:31<03:35,  1.21it/s]\n",
      " 12%|#2        | 37/297 [00:32<03:30,  1.23it/s]\n",
      " 13%|#2        | 38/297 [00:33<03:44,  1.15it/s]\n",
      " 13%|#3        | 39/297 [00:34<03:45,  1.14it/s]\n",
      " 13%|#3        | 40/297 [00:35<04:07,  1.04it/s]\n",
      " 14%|#3        | 41/297 [00:36<03:59,  1.07it/s]\n",
      " 14%|#4        | 42/297 [00:37<03:53,  1.09it/s]\n",
      " 14%|#4        | 43/297 [00:38<03:42,  1.14it/s]\n",
      " 15%|#4        | 44/297 [00:39<03:35,  1.17it/s]\n",
      " 15%|#5        | 45/297 [00:39<03:39,  1.15it/s]\n",
      " 15%|#5        | 46/297 [00:40<03:43,  1.12it/s]\n",
      " 16%|#5        | 47/297 [00:41<03:52,  1.08it/s]\n",
      " 16%|#6        | 48/297 [00:42<03:50,  1.08it/s]\n",
      " 16%|#6        | 49/297 [00:43<03:43,  1.11it/s]\n",
      " 17%|#6        | 50/297 [00:44<03:41,  1.12it/s]\n",
      " 17%|#7        | 51/297 [00:45<03:35,  1.14it/s]\n",
      " 18%|#7        | 52/297 [00:46<03:42,  1.10it/s]\n",
      " 18%|#7        | 53/297 [00:47<03:34,  1.14it/s]\n",
      " 18%|#8        | 54/297 [00:47<03:26,  1.18it/s]\n",
      " 19%|#8        | 55/297 [00:48<03:28,  1.16it/s]\n",
      " 19%|#8        | 56/297 [00:49<03:24,  1.18it/s]\n",
      " 19%|#9        | 57/297 [00:50<03:19,  1.20it/s]\n",
      " 20%|#9        | 58/297 [00:51<03:19,  1.20it/s]\n",
      " 20%|#9        | 59/297 [00:52<03:21,  1.18it/s]\n",
      " 20%|##        | 60/297 [00:53<03:26,  1.15it/s]\n",
      " 21%|##        | 61/297 [00:53<03:24,  1.16it/s]\n",
      " 21%|##        | 62/297 [00:55<03:40,  1.06it/s]\n",
      " 21%|##1       | 63/297 [00:55<03:38,  1.07it/s]\n",
      " 22%|##1       | 64/297 [00:56<03:30,  1.11it/s]\n",
      " 22%|##1       | 65/297 [00:57<03:30,  1.10it/s]\n",
      " 22%|##2       | 66/297 [00:58<03:22,  1.14it/s]\n",
      " 23%|##2       | 67/297 [00:59<03:25,  1.12it/s]\n",
      " 23%|##2       | 68/297 [01:00<03:25,  1.12it/s]\n",
      " 23%|##3       | 69/297 [01:01<03:23,  1.12it/s]\n",
      " 24%|##3       | 70/297 [01:02<03:17,  1.15it/s]\n",
      " 24%|##3       | 71/297 [01:02<03:13,  1.17it/s]\n",
      " 24%|##4       | 72/297 [01:03<03:10,  1.18it/s]\n",
      " 25%|##4       | 73/297 [01:04<03:14,  1.15it/s]\n",
      " 25%|##4       | 74/297 [01:05<03:25,  1.08it/s]\n",
      " 25%|##5       | 75/297 [01:06<03:20,  1.11it/s]\n",
      " 26%|##5       | 76/297 [01:07<03:18,  1.12it/s]\n",
      " 26%|##5       | 77/297 [01:08<03:13,  1.14it/s]\n",
      " 26%|##6       | 78/297 [01:08<03:04,  1.18it/s]\n",
      " 27%|##6       | 79/297 [01:09<03:12,  1.13it/s]\n",
      " 27%|##6       | 80/297 [01:10<03:12,  1.12it/s]\n",
      " 27%|##7       | 81/297 [01:11<03:09,  1.14it/s]\n",
      " 28%|##7       | 82/297 [01:12<03:08,  1.14it/s]\n",
      " 28%|##7       | 83/297 [01:13<03:10,  1.12it/s]\n",
      " 28%|##8       | 84/297 [01:14<03:13,  1.10it/s]\n",
      " 29%|##8       | 85/297 [01:15<03:09,  1.12it/s]\n",
      " 29%|##8       | 86/297 [01:16<03:06,  1.13it/s]\n",
      " 29%|##9       | 87/297 [01:17<03:05,  1.13it/s]\n",
      " 30%|##9       | 88/297 [01:17<03:06,  1.12it/s]\n",
      " 30%|##9       | 89/297 [01:18<03:03,  1.14it/s]\n",
      " 30%|###       | 90/297 [01:19<03:04,  1.12it/s]\n",
      " 31%|###       | 91/297 [01:20<03:06,  1.10it/s]\n",
      " 31%|###       | 92/297 [01:21<03:06,  1.10it/s]\n",
      " 31%|###1      | 93/297 [01:22<03:08,  1.08it/s]\n",
      " 32%|###1      | 94/297 [01:23<03:15,  1.04it/s]\n",
      " 32%|###1      | 95/297 [01:24<03:10,  1.06it/s]\n",
      " 32%|###2      | 96/297 [01:25<03:05,  1.08it/s]\n",
      " 33%|###2      | 97/297 [01:26<03:01,  1.10it/s]\n",
      " 33%|###2      | 98/297 [01:27<03:07,  1.06it/s]\n",
      " 33%|###3      | 99/297 [01:28<03:02,  1.09it/s]\n",
      " 34%|###3      | 100/297 [01:29<02:59,  1.10it/s]\n",
      " 34%|###4      | 101/297 [01:29<02:59,  1.09it/s]\n",
      " 34%|###4      | 102/297 [01:30<02:56,  1.11it/s]\n",
      " 35%|###4      | 103/297 [01:31<02:58,  1.09it/s]\n",
      " 35%|###5      | 104/297 [01:32<02:58,  1.08it/s]\n",
      " 35%|###5      | 105/297 [01:33<02:56,  1.09it/s]\n",
      " 36%|###5      | 106/297 [01:34<02:53,  1.10it/s]\n",
      " 36%|###6      | 107/297 [01:35<02:55,  1.08it/s]\n",
      " 36%|###6      | 108/297 [01:36<02:51,  1.10it/s]\n",
      " 37%|###6      | 109/297 [01:37<02:51,  1.09it/s]\n",
      " 37%|###7      | 110/297 [01:38<02:54,  1.07it/s]\n",
      " 37%|###7      | 111/297 [01:39<02:51,  1.09it/s]\n",
      " 38%|###7      | 112/297 [01:40<02:49,  1.09it/s]\n",
      " 38%|###8      | 113/297 [01:40<02:46,  1.10it/s]\n",
      " 38%|###8      | 114/297 [01:41<02:43,  1.12it/s]\n",
      " 39%|###8      | 115/297 [01:42<02:42,  1.12it/s]\n",
      " 39%|###9      | 116/297 [01:43<02:38,  1.14it/s]\n",
      " 39%|###9      | 117/297 [01:44<02:39,  1.13it/s]\n",
      " 40%|###9      | 118/297 [01:45<02:44,  1.08it/s]\n",
      " 40%|####      | 119/297 [01:46<02:41,  1.10it/s]\n",
      " 40%|####      | 120/297 [01:47<02:38,  1.12it/s]\n",
      " 41%|####      | 121/297 [01:48<02:44,  1.07it/s]\n",
      " 41%|####1     | 122/297 [01:49<02:40,  1.09it/s]\n",
      " 41%|####1     | 123/297 [01:49<02:36,  1.11it/s]\n",
      " 42%|####1     | 124/297 [01:50<02:36,  1.11it/s]\n",
      " 42%|####2     | 125/297 [01:51<02:35,  1.11it/s]\n",
      " 42%|####2     | 126/297 [01:52<02:35,  1.10it/s]\n",
      " 43%|####2     | 127/297 [01:53<02:34,  1.10it/s]\n",
      " 43%|####3     | 128/297 [01:54<02:31,  1.12it/s]\n",
      " 43%|####3     | 129/297 [01:55<02:35,  1.08it/s]\n",
      " 44%|####3     | 130/297 [01:56<02:29,  1.12it/s]\n",
      " 44%|####4     | 131/297 [01:57<02:28,  1.12it/s]\n",
      " 44%|####4     | 132/297 [01:58<02:28,  1.11it/s]\n",
      " 45%|####4     | 133/297 [01:58<02:26,  1.12it/s]\n",
      " 45%|####5     | 134/297 [01:59<02:27,  1.11it/s]\n",
      " 45%|####5     | 135/297 [02:00<02:27,  1.10it/s]\n",
      " 46%|####5     | 136/297 [02:01<02:25,  1.10it/s]\n",
      " 46%|####6     | 137/297 [02:02<02:26,  1.09it/s]\n",
      " 46%|####6     | 138/297 [02:03<02:22,  1.11it/s]\n",
      " 47%|####6     | 139/297 [02:04<02:21,  1.11it/s]\n",
      " 47%|####7     | 140/297 [02:05<02:21,  1.11it/s]\n",
      " 47%|####7     | 141/297 [02:06<02:23,  1.09it/s]\n",
      " 48%|####7     | 142/297 [02:07<02:18,  1.12it/s]\n",
      " 48%|####8     | 143/297 [02:07<02:17,  1.12it/s]\n",
      " 48%|####8     | 144/297 [02:08<02:14,  1.14it/s]\n",
      " 49%|####8     | 145/297 [02:09<02:14,  1.13it/s]\n",
      " 49%|####9     | 146/297 [02:10<02:11,  1.15it/s]\n",
      " 49%|####9     | 147/297 [02:11<02:12,  1.13it/s]\n",
      " 50%|####9     | 148/297 [02:12<02:10,  1.14it/s]\n",
      " 50%|#####     | 149/297 [02:13<02:08,  1.15it/s]\n",
      " 51%|#####     | 150/297 [02:14<02:13,  1.10it/s]\n",
      " 51%|#####     | 151/297 [02:15<02:10,  1.12it/s]\n",
      " 51%|#####1    | 152/297 [02:15<02:11,  1.11it/s]\n",
      " 52%|#####1    | 153/297 [02:16<02:08,  1.12it/s]\n",
      " 52%|#####1    | 154/297 [02:17<02:08,  1.11it/s]\n",
      " 52%|#####2    | 155/297 [02:18<02:08,  1.11it/s]\n",
      " 53%|#####2    | 156/297 [02:19<02:09,  1.09it/s]\n",
      " 53%|#####2    | 157/297 [02:20<02:07,  1.10it/s]\n",
      " 53%|#####3    | 158/297 [02:21<02:04,  1.12it/s]\n",
      " 54%|#####3    | 159/297 [02:22<02:02,  1.13it/s]\n",
      " 54%|#####3    | 160/297 [02:23<02:01,  1.13it/s]\n",
      " 54%|#####4    | 161/297 [02:23<01:56,  1.17it/s]\n",
      " 55%|#####4    | 162/297 [02:24<01:56,  1.16it/s]\n",
      " 55%|#####4    | 163/297 [02:25<01:59,  1.12it/s]\n",
      " 55%|#####5    | 164/297 [02:26<01:58,  1.12it/s]\n",
      " 56%|#####5    | 165/297 [02:27<01:55,  1.14it/s]\n",
      " 56%|#####5    | 166/297 [02:28<01:55,  1.14it/s]\n",
      " 56%|#####6    | 167/297 [02:29<01:54,  1.14it/s]\n",
      " 57%|#####6    | 168/297 [02:30<01:54,  1.12it/s]\n",
      " 57%|#####6    | 169/297 [02:31<02:02,  1.05it/s]\n",
      " 57%|#####7    | 170/297 [02:32<01:59,  1.06it/s]\n",
      " 58%|#####7    | 171/297 [02:33<01:57,  1.07it/s]\n",
      " 58%|#####7    | 172/297 [02:34<02:17,  1.10s/it]\n",
      " 58%|#####8    | 173/297 [02:35<02:11,  1.06s/it]\n",
      " 59%|#####8    | 174/297 [02:36<02:05,  1.02s/it]\n",
      " 59%|#####8    | 175/297 [02:37<01:58,  1.03it/s]\n",
      " 59%|#####9    | 176/297 [02:38<01:56,  1.04it/s]\n",
      " 60%|#####9    | 177/297 [02:39<01:57,  1.02it/s]\n",
      " 60%|#####9    | 178/297 [02:40<01:56,  1.02it/s]\n",
      " 60%|######    | 179/297 [02:41<01:53,  1.04it/s]\n",
      " 61%|######    | 180/297 [02:42<01:49,  1.07it/s]\n",
      " 61%|######    | 181/297 [02:42<01:47,  1.08it/s]\n",
      " 61%|######1   | 182/297 [02:43<01:45,  1.09it/s]\n",
      " 62%|######1   | 183/297 [02:44<01:43,  1.10it/s]\n",
      " 62%|######1   | 184/297 [02:45<01:43,  1.09it/s]\n",
      " 62%|######2   | 185/297 [02:46<01:45,  1.06it/s]\n",
      " 63%|######2   | 186/297 [02:47<01:43,  1.08it/s]\n",
      " 63%|######2   | 187/297 [02:48<01:45,  1.05it/s]\n",
      " 63%|######3   | 188/297 [02:49<01:40,  1.09it/s]\n",
      " 64%|######3   | 189/297 [02:50<01:37,  1.11it/s]\n",
      " 64%|######3   | 190/297 [02:51<01:35,  1.11it/s]\n",
      " 64%|######4   | 191/297 [02:52<01:36,  1.10it/s]\n",
      " 65%|######4   | 192/297 [02:53<01:35,  1.10it/s]\n",
      " 65%|######4   | 193/297 [02:54<01:37,  1.06it/s]\n",
      " 65%|######5   | 194/297 [02:54<01:35,  1.07it/s]\n",
      " 66%|######5   | 195/297 [02:55<01:37,  1.05it/s]\n",
      " 66%|######5   | 196/297 [02:56<01:32,  1.10it/s]\n",
      " 66%|######6   | 197/297 [02:57<01:29,  1.12it/s]\n",
      " 67%|######6   | 198/297 [02:58<01:41,  1.02s/it]\n",
      " 67%|######7   | 199/297 [03:00<01:41,  1.03s/it]\n",
      " 67%|######7   | 200/297 [03:00<01:37,  1.01s/it]\n",
      " 68%|######7   | 201/297 [03:01<01:32,  1.04it/s]\n",
      " 68%|######8   | 202/297 [03:02<01:27,  1.09it/s]\n",
      " 68%|######8   | 203/297 [03:03<01:24,  1.12it/s]\n",
      " 69%|######8   | 204/297 [03:04<01:23,  1.11it/s]\n",
      " 69%|######9   | 205/297 [03:05<01:26,  1.06it/s]\n",
      " 69%|######9   | 206/297 [03:06<01:28,  1.03it/s]\n",
      " 70%|######9   | 207/297 [03:07<01:26,  1.05it/s]\n",
      " 70%|#######   | 208/297 [03:08<01:26,  1.02it/s]\n",
      " 70%|#######   | 209/297 [03:09<01:22,  1.07it/s]\n",
      " 71%|#######   | 210/297 [03:10<01:21,  1.06it/s]\n",
      " 71%|#######1  | 211/297 [03:11<01:19,  1.08it/s]\n",
      " 71%|#######1  | 212/297 [03:12<01:18,  1.09it/s]\n",
      " 72%|#######1  | 213/297 [03:12<01:17,  1.08it/s]\n",
      " 72%|#######2  | 214/297 [03:13<01:17,  1.07it/s]\n",
      " 72%|#######2  | 215/297 [03:14<01:15,  1.08it/s]\n",
      " 73%|#######2  | 216/297 [03:15<01:14,  1.08it/s]\n",
      " 73%|#######3  | 217/297 [03:16<01:12,  1.10it/s]\n",
      " 73%|#######3  | 218/297 [03:17<01:10,  1.12it/s]\n",
      " 74%|#######3  | 219/297 [03:18<01:10,  1.11it/s]\n",
      " 74%|#######4  | 220/297 [03:19<01:09,  1.11it/s]\n",
      " 74%|#######4  | 221/297 [03:20<01:08,  1.12it/s]\n",
      " 75%|#######4  | 222/297 [03:20<01:05,  1.14it/s]\n",
      " 75%|#######5  | 223/297 [03:21<01:05,  1.13it/s]\n",
      " 75%|#######5  | 224/297 [03:22<01:05,  1.11it/s]\n",
      " 76%|#######5  | 225/297 [03:23<01:07,  1.06it/s]\n",
      " 76%|#######6  | 226/297 [03:24<01:04,  1.09it/s]\n",
      " 76%|#######6  | 227/297 [03:25<01:03,  1.10it/s]\n",
      " 77%|#######6  | 228/297 [03:26<01:01,  1.11it/s]\n",
      " 77%|#######7  | 229/297 [03:27<01:01,  1.10it/s]\n",
      " 77%|#######7  | 230/297 [03:28<01:01,  1.09it/s]\n",
      " 78%|#######7  | 231/297 [03:29<00:58,  1.13it/s]\n",
      " 78%|#######8  | 232/297 [03:29<00:55,  1.18it/s]\n",
      " 78%|#######8  | 233/297 [03:30<00:54,  1.18it/s]\n",
      " 79%|#######8  | 234/297 [03:31<00:55,  1.14it/s]\n",
      " 79%|#######9  | 235/297 [03:32<00:55,  1.12it/s]\n",
      " 79%|#######9  | 236/297 [03:33<00:55,  1.10it/s]\n",
      " 80%|#######9  | 237/297 [03:34<00:54,  1.11it/s]\n",
      " 80%|########  | 238/297 [03:35<00:54,  1.08it/s]\n",
      " 80%|########  | 239/297 [03:36<00:54,  1.06it/s]\n",
      " 81%|########  | 240/297 [03:37<00:52,  1.09it/s]\n",
      " 81%|########1 | 241/297 [03:38<00:51,  1.08it/s]\n",
      " 81%|########1 | 242/297 [03:39<00:49,  1.10it/s]\n",
      " 82%|########1 | 243/297 [03:39<00:47,  1.13it/s]\n",
      " 82%|########2 | 244/297 [03:40<00:46,  1.14it/s]\n",
      " 82%|########2 | 245/297 [03:41<00:46,  1.12it/s]\n",
      " 83%|########2 | 246/297 [03:42<00:45,  1.11it/s]\n",
      " 83%|########3 | 247/297 [03:43<00:44,  1.11it/s]\n",
      " 84%|########3 | 248/297 [03:44<00:48,  1.00it/s]\n",
      " 84%|########3 | 249/297 [03:45<00:47,  1.01it/s]\n",
      " 84%|########4 | 250/297 [03:46<00:45,  1.03it/s]\n",
      " 85%|########4 | 251/297 [03:47<00:44,  1.03it/s]\n",
      " 85%|########4 | 252/297 [03:48<00:42,  1.05it/s]\n",
      " 85%|########5 | 253/297 [03:49<00:41,  1.07it/s]\n",
      " 86%|########5 | 254/297 [03:50<00:40,  1.06it/s]\n",
      " 86%|########5 | 255/297 [03:51<00:40,  1.04it/s]\n",
      " 86%|########6 | 256/297 [03:52<00:40,  1.01it/s]\n",
      " 87%|########6 | 257/297 [03:53<00:38,  1.05it/s]\n",
      " 87%|########6 | 258/297 [03:54<00:36,  1.06it/s]\n",
      " 87%|########7 | 259/297 [03:55<00:35,  1.08it/s]\n",
      " 88%|########7 | 260/297 [03:56<00:34,  1.07it/s]\n",
      " 88%|########7 | 261/297 [03:57<00:33,  1.08it/s]\n",
      " 88%|########8 | 262/297 [03:57<00:32,  1.06it/s]\n",
      " 89%|########8 | 263/297 [03:58<00:31,  1.08it/s]\n",
      " 89%|########8 | 264/297 [03:59<00:31,  1.05it/s]\n",
      " 89%|########9 | 265/297 [04:00<00:30,  1.04it/s]\n",
      " 90%|########9 | 266/297 [04:01<00:30,  1.01it/s]\n",
      " 90%|########9 | 267/297 [04:02<00:28,  1.06it/s]\n",
      " 90%|######### | 268/297 [04:03<00:26,  1.08it/s]\n",
      " 91%|######### | 269/297 [04:04<00:26,  1.04it/s]\n",
      " 91%|######### | 270/297 [04:05<00:25,  1.04it/s]\n",
      " 91%|#########1| 271/297 [04:06<00:23,  1.09it/s]\n",
      " 92%|#########1| 272/297 [04:07<00:23,  1.07it/s]\n",
      " 92%|#########1| 273/297 [04:08<00:22,  1.08it/s]\n",
      " 92%|#########2| 274/297 [04:09<00:21,  1.08it/s]\n",
      " 93%|#########2| 275/297 [04:10<00:20,  1.08it/s]\n",
      " 93%|#########2| 276/297 [04:11<00:19,  1.10it/s]\n",
      " 93%|#########3| 277/297 [04:12<00:18,  1.06it/s]\n",
      " 94%|#########3| 278/297 [04:12<00:17,  1.07it/s]\n",
      " 94%|#########3| 279/297 [04:13<00:16,  1.07it/s]\n",
      " 94%|#########4| 280/297 [04:14<00:16,  1.06it/s]\n",
      " 95%|#########4| 281/297 [04:15<00:14,  1.07it/s]\n",
      " 95%|#########4| 282/297 [04:16<00:13,  1.11it/s]\n",
      " 95%|#########5| 283/297 [04:17<00:12,  1.09it/s]\n",
      " 96%|#########5| 284/297 [04:18<00:13,  1.00s/it]\n",
      " 96%|#########5| 285/297 [04:19<00:11,  1.01it/s]\n",
      " 96%|#########6| 286/297 [04:20<00:10,  1.04it/s]\n",
      " 97%|#########6| 287/297 [04:21<00:09,  1.08it/s]\n",
      " 97%|#########6| 288/297 [04:22<00:08,  1.10it/s]\n",
      " 97%|#########7| 289/297 [04:23<00:07,  1.08it/s]\n",
      " 98%|#########7| 290/297 [04:24<00:06,  1.11it/s]\n",
      " 98%|#########7| 291/297 [04:25<00:05,  1.09it/s]\n",
      " 98%|#########8| 292/297 [04:26<00:04,  1.07it/s]\n",
      " 99%|#########8| 293/297 [04:26<00:03,  1.10it/s]\n",
      " 99%|#########8| 294/297 [04:27<00:02,  1.12it/s]\n",
      " 99%|#########9| 295/297 [04:28<00:01,  1.07it/s]\n",
      "100%|#########9| 296/297 [04:29<00:00,  1.03it/s]\n",
      "100%|##########| 297/297 [04:30<00:00,  1.19it/s]02/17/2022 17:18:41 - INFO - __main__ - Validation-set | bleu: 0.0 | accuracy: 1.0\n",
      "02/17/2022 17:18:49 - INFO - __main__ - Test-set | bleu: 0.0 | accuracy: 1.0\n",
      "Configuration saved in out/tweet/t5\\config.json\n",
      "Model weights saved in out/tweet/t5\\pytorch_model.bin\n",
      "tokenizer config file saved in out/tweet/t5\\tokenizer_config.json\n",
      "Special tokens file saved in out/tweet/t5\\special_tokens_map.json\n",
      "Copy vocab file to out/tweet/t5\\spiece.model\n",
      "\n",
      "100%|##########| 297/297 [04:46<00:00,  1.04it/s]\n"
     ]
    }
   ],
   "source": [
    "# Fine-tune t5-small for one epoch on the translations-train/valid/test JSON\n",
    "# splits; the script saves the model and tokenizer files under out/tweet/t5.\n",
    "!python run_translation_no_trainer.py \\\n",
    "  --model_name_or_path t5-small \\\n",
    "  --source_prefix \"tweet classification\" \\\n",
    "  --train_file data/translations-train.json \\\n",
    "  --validation_file data/translations-valid.json \\\n",
    "  --test_file data/translations-test.json \\\n",
    "  --max_source_length 256 \\\n",
    "  --max_target_length 128 \\\n",
    "  --max_length 128 \\\n",
    "  --per_device_train_batch_size 16 \\\n",
    "  --per_device_eval_batch_size 16 \\\n",
    "  --num_train_epochs 1 \\\n",
    "  --output_dir out/tweet/t5"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# T5 version 2 (frozen model weights)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "02/17/2022 17:23:00 - INFO - __main__ - Distributed environment: NO\n",
      "Num processes: 1\n",
      "Process index: 0\n",
      "Local process index: 0\n",
      "Device: cpu\n",
      "Use FP16 precision: False\n",
      "\n",
      "02/17/2022 17:23:00 - WARNING - datasets.builder - Using custom data configuration default-c1907d9305fb2fbb\n",
      "02/17/2022 17:23:00 - WARNING - datasets.builder - Reusing dataset json (C:\\Users\\Foka\\.cache\\huggingface\\datasets\\json\\default-c1907d9305fb2fbb\\0.0.0\\c2d554c3377ea79c7664b93dc65d0803b45e3279000f993c7bfd18937fd7f426)\n",
      "\n",
      "  0%|          | 0/3 [00:00<?, ?it/s]\n",
      "100%|##########| 3/3 [00:00<00:00, 1504.41it/s]\n",
      "loading configuration file https://huggingface.co/t5-small/resolve/main/config.json from cache at C:\\Users\\Foka/.cache\\huggingface\\transformers\\fe501e8fd6425b8ec93df37767fcce78ce626e34cc5edc859c662350cf712e41.406701565c0afd9899544c1cb8b93185a76f00b31e5ce7f6e18bbaef02241985\n",
      "Model config T5Config {\n",
      "  \"architectures\": [\n",
      "    \"T5WithLMHeadModel\"\n",
      "  ],\n",
      "  \"d_ff\": 2048,\n",
      "  \"d_kv\": 64,\n",
      "  \"d_model\": 512,\n",
      "  \"decoder_start_token_id\": 0,\n",
      "  \"dropout_rate\": 0.1,\n",
      "  \"eos_token_id\": 1,\n",
      "  \"feed_forward_proj\": \"relu\",\n",
      "  \"initializer_factor\": 1.0,\n",
      "  \"is_encoder_decoder\": true,\n",
      "  \"layer_norm_epsilon\": 1e-06,\n",
      "  \"model_type\": \"t5\",\n",
      "  \"n_positions\": 512,\n",
      "  \"num_decoder_layers\": 6,\n",
      "  \"num_heads\": 8,\n",
      "  \"num_layers\": 6,\n",
      "  \"output_past\": true,\n",
      "  \"pad_token_id\": 0,\n",
      "  \"relative_attention_num_buckets\": 32,\n",
      "  \"task_specific_params\": {\n",
      "    \"summarization\": {\n",
      "      \"early_stopping\": true,\n",
      "      \"length_penalty\": 2.0,\n",
      "      \"max_length\": 200,\n",
      "      \"min_length\": 30,\n",
      "      \"no_repeat_ngram_size\": 3,\n",
      "      \"num_beams\": 4,\n",
      "      \"prefix\": \"summarize: \"\n",
      "    },\n",
      "    \"translation_en_to_de\": {\n",
      "      \"early_stopping\": true,\n",
      "      \"max_length\": 300,\n",
      "      \"num_beams\": 4,\n",
      "      \"prefix\": \"translate English to German: \"\n",
      "    },\n",
      "    \"translation_en_to_fr\": {\n",
      "      \"early_stopping\": true,\n",
      "      \"max_length\": 300,\n",
      "      \"num_beams\": 4,\n",
      "      \"prefix\": \"translate English to French: \"\n",
      "    },\n",
      "    \"translation_en_to_ro\": {\n",
      "      \"early_stopping\": true,\n",
      "      \"max_length\": 300,\n",
      "      \"num_beams\": 4,\n",
      "      \"prefix\": \"translate English to Romanian: \"\n",
      "    }\n",
      "  },\n",
      "  \"transformers_version\": \"4.12.5\",\n",
      "  \"use_cache\": true,\n",
      "  \"vocab_size\": 32128\n",
      "}\n",
      "\n",
      "Could not locate the tokenizer configuration file, will try to use the model config instead.\n",
      "loading configuration file https://huggingface.co/t5-small/resolve/main/config.json from cache at C:\\Users\\Foka/.cache\\huggingface\\transformers\\fe501e8fd6425b8ec93df37767fcce78ce626e34cc5edc859c662350cf712e41.406701565c0afd9899544c1cb8b93185a76f00b31e5ce7f6e18bbaef02241985\n",
      "Model config T5Config {\n",
      "  \"architectures\": [\n",
      "    \"T5WithLMHeadModel\"\n",
      "  ],\n",
      "  \"d_ff\": 2048,\n",
      "  \"d_kv\": 64,\n",
      "  \"d_model\": 512,\n",
      "  \"decoder_start_token_id\": 0,\n",
      "  \"dropout_rate\": 0.1,\n",
      "  \"eos_token_id\": 1,\n",
      "  \"feed_forward_proj\": \"relu\",\n",
      "  \"initializer_factor\": 1.0,\n",
      "  \"is_encoder_decoder\": true,\n",
      "  \"layer_norm_epsilon\": 1e-06,\n",
      "  \"model_type\": \"t5\",\n",
      "  \"n_positions\": 512,\n",
      "  \"num_decoder_layers\": 6,\n",
      "  \"num_heads\": 8,\n",
      "  \"num_layers\": 6,\n",
      "  \"output_past\": true,\n",
      "  \"pad_token_id\": 0,\n",
      "  \"relative_attention_num_buckets\": 32,\n",
      "  \"task_specific_params\": {\n",
      "    \"summarization\": {\n",
      "      \"early_stopping\": true,\n",
      "      \"length_penalty\": 2.0,\n",
      "      \"max_length\": 200,\n",
      "      \"min_length\": 30,\n",
      "      \"no_repeat_ngram_size\": 3,\n",
      "      \"num_beams\": 4,\n",
      "      \"prefix\": \"summarize: \"\n",
      "    },\n",
      "    \"translation_en_to_de\": {\n",
      "      \"early_stopping\": true,\n",
      "      \"max_length\": 300,\n",
      "      \"num_beams\": 4,\n",
      "      \"prefix\": \"translate English to German: \"\n",
      "    },\n",
      "    \"translation_en_to_fr\": {\n",
      "      \"early_stopping\": true,\n",
      "      \"max_length\": 300,\n",
      "      \"num_beams\": 4,\n",
      "      \"prefix\": \"translate English to French: \"\n",
      "    },\n",
      "    \"translation_en_to_ro\": {\n",
      "      \"early_stopping\": true,\n",
      "      \"max_length\": 300,\n",
      "      \"num_beams\": 4,\n",
      "      \"prefix\": \"translate English to Romanian: \"\n",
      "    }\n",
      "  },\n",
      "  \"transformers_version\": \"4.12.5\",\n",
      "  \"use_cache\": true,\n",
      "  \"vocab_size\": 32128\n",
      "}\n",
      "\n",
      "loading file https://huggingface.co/t5-small/resolve/main/spiece.model from cache at C:\\Users\\Foka/.cache\\huggingface\\transformers\\65fc04e21f45f61430aea0c4fedffac16a4d20d78b8e6601d8d996ebefefecd2.3b69006860e7b5d0a63ffdddc01ddcd6b7c318a6f4fd793596552c741734c62d\n",
      "loading file https://huggingface.co/t5-small/resolve/main/tokenizer.json from cache at C:\\Users\\Foka/.cache\\huggingface\\transformers\\06779097c78e12f47ef67ecb728810c2ae757ee0a9efe9390c6419783d99382d.8627f1bd5d270a9fd2e5a51c8bec3223896587cc3cfe13edeabb0992ab43c529\n",
      "loading file https://huggingface.co/t5-small/resolve/main/added_tokens.json from cache at None\n",
      "loading file https://huggingface.co/t5-small/resolve/main/special_tokens_map.json from cache at None\n",
      "loading file https://huggingface.co/t5-small/resolve/main/tokenizer_config.json from cache at None\n",
      "loading configuration file https://huggingface.co/t5-small/resolve/main/config.json from cache at C:\\Users\\Foka/.cache\\huggingface\\transformers\\fe501e8fd6425b8ec93df37767fcce78ce626e34cc5edc859c662350cf712e41.406701565c0afd9899544c1cb8b93185a76f00b31e5ce7f6e18bbaef02241985\n",
      "Model config T5Config {\n",
      "  \"architectures\": [\n",
      "    \"T5WithLMHeadModel\"\n",
      "  ],\n",
      "  \"d_ff\": 2048,\n",
      "  \"d_kv\": 64,\n",
      "  \"d_model\": 512,\n",
      "  \"decoder_start_token_id\": 0,\n",
      "  \"dropout_rate\": 0.1,\n",
      "  \"eos_token_id\": 1,\n",
      "  \"feed_forward_proj\": \"relu\",\n",
      "  \"initializer_factor\": 1.0,\n",
      "  \"is_encoder_decoder\": true,\n",
      "  \"layer_norm_epsilon\": 1e-06,\n",
      "  \"model_type\": \"t5\",\n",
      "  \"n_positions\": 512,\n",
      "  \"num_decoder_layers\": 6,\n",
      "  \"num_heads\": 8,\n",
      "  \"num_layers\": 6,\n",
      "  \"output_past\": true,\n",
      "  \"pad_token_id\": 0,\n",
      "  \"relative_attention_num_buckets\": 32,\n",
      "  \"task_specific_params\": {\n",
      "    \"summarization\": {\n",
      "      \"early_stopping\": true,\n",
      "      \"length_penalty\": 2.0,\n",
      "      \"max_length\": 200,\n",
      "      \"min_length\": 30,\n",
      "      \"no_repeat_ngram_size\": 3,\n",
      "      \"num_beams\": 4,\n",
      "      \"prefix\": \"summarize: \"\n",
      "    },\n",
      "    \"translation_en_to_de\": {\n",
      "      \"early_stopping\": true,\n",
      "      \"max_length\": 300,\n",
      "      \"num_beams\": 4,\n",
      "      \"prefix\": \"translate English to German: \"\n",
      "    },\n",
      "    \"translation_en_to_fr\": {\n",
      "      \"early_stopping\": true,\n",
      "      \"max_length\": 300,\n",
      "      \"num_beams\": 4,\n",
      "      \"prefix\": \"translate English to French: \"\n",
      "    },\n",
      "    \"translation_en_to_ro\": {\n",
      "      \"early_stopping\": true,\n",
      "      \"max_length\": 300,\n",
      "      \"num_beams\": 4,\n",
      "      \"prefix\": \"translate English to Romanian: \"\n",
      "    }\n",
      "  },\n",
      "  \"transformers_version\": \"4.12.5\",\n",
      "  \"use_cache\": true,\n",
      "  \"vocab_size\": 32128\n",
      "}\n",
      "\n",
      "loading weights file https://huggingface.co/t5-small/resolve/main/pytorch_model.bin from cache at C:\\Users\\Foka/.cache\\huggingface\\transformers\\fee5a3a0ae379232608b6eed45d2d7a0d2966b9683728838412caccc41b4b0ed.ddacdc89ec88482db20c676f0861a336f3d0409f94748c209847b49529d73885\n",
      "All model checkpoint weights were used when initializing T5ForConditionalGeneration.\n",
      "\n",
      "All the weights of T5ForConditionalGeneration were initialized from the model checkpoint at t5-small.\n",
      "If your task is similar to the task the model of the checkpoint was trained on, you can already use T5ForConditionalGeneration for predictions without further training.\n",
      "02/17/2022 17:23:07 - INFO - __main__ - Freezing model weights\n",
      "02/17/2022 17:23:07 - INFO - __main__ - Using translation prefix: \"tweet classification: \"\n",
      "\n",
      "Running tokenizer on dataset:   0%|          | 0/5 [00:00<?, ?ba/s]\n",
      "Running tokenizer on dataset:  80%|########  | 4/5 [00:00<00:00, 31.58ba/s]\n",
      "Running tokenizer on dataset: 100%|##########| 5/5 [00:00<00:00, 33.64ba/s]\n",
      "\n",
      "Running tokenizer on dataset:   0%|          | 0/1 [00:00<?, ?ba/s]\n",
      "Running tokenizer on dataset: 100%|##########| 1/1 [00:00<00:00, 66.85ba/s]\n",
      "\n",
      "Running tokenizer on dataset:   0%|          | 0/1 [00:00<?, ?ba/s]\n",
      "Running tokenizer on dataset: 100%|##########| 1/1 [00:00<00:00, 77.13ba/s]\n",
      "02/17/2022 17:23:07 - INFO - __main__ - Sample 4497 of the training set: {'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'input_ids': [10657, 13774, 10, 34, 31, 7, 16713, 239, 3158, 3, 2, 1], 'labels': [150, 5591, 1]}.\n",
      "02/17/2022 17:23:07 - INFO - __main__ - Sample 697 of the training set: {'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'input_ids': [10657, 13774, 10, 3320, 10041, 3, 6631, 7, 55, 3, 23, 410, 34, 541, 55, 3, 19293, 430, 18659, 2983, 89, 16948, 55, 1713, 7, 9, 26, 1713, 7, 127, 15, 2298, 49, 3, 24778, 1713, 1788, 6938, 2910, 29, 53, 1], 'labels': [5591, 1]}.\n",
      "02/17/2022 17:23:07 - INFO - __main__ - Sample 3411 of the training set: {'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'input_ids': [10657, 13774, 10, 8441, 352, 12, 217, 3320, 10041, 16, 20, 75, 3, 10266, 55, 1], 'labels': [150, 5591, 1]}.\n",
      "02/17/2022 17:23:09 - INFO - __main__ - ***** Running training *****\n",
      "02/17/2022 17:23:09 - INFO - __main__ -   Num examples = 4742\n",
      "02/17/2022 17:23:09 - INFO - __main__ -   Num Epochs = 1\n",
      "02/17/2022 17:23:09 - INFO - __main__ -   Instantaneous batch size per device = 16\n",
      "02/17/2022 17:23:09 - INFO - __main__ -   Total train batch size (w. parallel, distributed & accumulation) = 16\n",
      "02/17/2022 17:23:09 - INFO - __main__ -   Gradient Accumulation steps = 1\n",
      "02/17/2022 17:23:09 - INFO - __main__ -   Total optimization steps = 297\n",
      "\n",
      "  0%|          | 0/297 [00:00<?, ?it/s]\n",
      "  0%|          | 1/297 [00:00<02:34,  1.92it/s]\n",
      "  1%|          | 2/297 [00:00<02:08,  2.29it/s]\n",
      "  1%|1         | 3/297 [00:01<01:58,  2.47it/s]\n",
      "  1%|1         | 4/297 [00:01<01:53,  2.58it/s]\n",
      "  2%|1         | 5/297 [00:02<01:59,  2.45it/s]\n",
      "  2%|2         | 6/297 [00:02<02:02,  2.37it/s]\n",
      "  2%|2         | 7/297 [00:02<01:58,  2.46it/s]\n",
      "  3%|2         | 8/297 [00:03<01:53,  2.55it/s]\n",
      "  3%|3         | 9/297 [00:03<02:14,  2.14it/s]\n",
      "  3%|3         | 10/297 [00:04<02:12,  2.17it/s]\n",
      "  4%|3         | 11/297 [00:04<02:05,  2.28it/s]\n",
      "  4%|4         | 12/297 [00:05<02:01,  2.34it/s]\n",
      "  4%|4         | 13/297 [00:05<02:00,  2.36it/s]\n",
      "  5%|4         | 14/297 [00:05<01:57,  2.42it/s]\n",
      "  5%|5         | 15/297 [00:06<01:54,  2.45it/s]\n",
      "  5%|5         | 16/297 [00:06<01:50,  2.54it/s]\n",
      "  6%|5         | 17/297 [00:07<01:49,  2.57it/s]\n",
      "  6%|6         | 18/297 [00:07<01:48,  2.58it/s]\n",
      "  6%|6         | 19/297 [00:07<01:50,  2.53it/s]\n",
      "  7%|6         | 20/297 [00:08<01:52,  2.46it/s]\n",
      "  7%|7         | 21/297 [00:08<01:49,  2.53it/s]\n",
      "  7%|7         | 22/297 [00:09<01:52,  2.45it/s]\n",
      "  8%|7         | 23/297 [00:09<01:48,  2.52it/s]\n",
      "  8%|8         | 24/297 [00:09<01:47,  2.55it/s]\n",
      "  8%|8         | 25/297 [00:10<01:48,  2.51it/s]\n",
      "  9%|8         | 26/297 [00:10<01:45,  2.56it/s]\n",
      "  9%|9         | 27/297 [00:11<01:45,  2.56it/s]\n",
      "  9%|9         | 28/297 [00:11<01:45,  2.55it/s]\n",
      " 10%|9         | 29/297 [00:11<01:45,  2.54it/s]\n",
      " 10%|#         | 30/297 [00:12<01:41,  2.62it/s]\n",
      " 10%|#         | 31/297 [00:12<01:42,  2.59it/s]\n",
      " 11%|#         | 32/297 [00:12<01:39,  2.66it/s]\n",
      " 11%|#1        | 33/297 [00:13<01:39,  2.64it/s]\n",
      " 11%|#1        | 34/297 [00:13<01:38,  2.67it/s]\n",
      " 12%|#1        | 35/297 [00:14<01:38,  2.67it/s]\n",
      " 12%|#2        | 36/297 [00:14<01:40,  2.60it/s]\n",
      " 12%|#2        | 37/297 [00:14<01:45,  2.46it/s]\n",
      " 13%|#2        | 38/297 [00:15<01:44,  2.48it/s]\n",
      " 13%|#3        | 39/297 [00:15<01:42,  2.52it/s]\n",
      " 13%|#3        | 40/297 [00:16<01:41,  2.53it/s]\n",
      " 14%|#3        | 41/297 [00:16<01:42,  2.50it/s]\n",
      " 14%|#4        | 42/297 [00:16<01:37,  2.62it/s]\n",
      " 14%|#4        | 43/297 [00:17<01:38,  2.59it/s]\n",
      " 15%|#4        | 44/297 [00:17<01:40,  2.52it/s]\n",
      " 15%|#5        | 45/297 [00:18<01:37,  2.59it/s]\n",
      " 15%|#5        | 46/297 [00:18<01:37,  2.59it/s]\n",
      " 16%|#5        | 47/297 [00:18<01:37,  2.56it/s]\n",
      " 16%|#6        | 48/297 [00:19<01:36,  2.57it/s]\n",
      " 16%|#6        | 49/297 [00:19<01:36,  2.58it/s]\n",
      " 17%|#6        | 50/297 [00:19<01:35,  2.59it/s]\n",
      " 17%|#7        | 51/297 [00:20<01:35,  2.56it/s]\n",
      " 18%|#7        | 52/297 [00:20<01:34,  2.58it/s]\n",
      " 18%|#7        | 53/297 [00:21<01:34,  2.58it/s]\n",
      " 18%|#8        | 54/297 [00:21<01:33,  2.60it/s]\n",
      " 19%|#8        | 55/297 [00:21<01:34,  2.55it/s]\n",
      " 19%|#8        | 56/297 [00:22<01:34,  2.54it/s]\n",
      " 19%|#9        | 57/297 [00:22<01:36,  2.50it/s]\n",
      " 20%|#9        | 58/297 [00:23<01:35,  2.52it/s]\n",
      " 20%|#9        | 59/297 [00:23<01:33,  2.55it/s]\n",
      " 20%|##        | 60/297 [00:23<01:31,  2.58it/s]\n",
      " 21%|##        | 61/297 [00:24<01:36,  2.46it/s]\n",
      " 21%|##        | 62/297 [00:24<01:32,  2.53it/s]\n",
      " 21%|##1       | 63/297 [00:25<01:35,  2.45it/s]\n",
      " 22%|##1       | 64/297 [00:25<01:32,  2.53it/s]\n",
      " 22%|##1       | 65/297 [00:25<01:32,  2.51it/s]\n",
      " 22%|##2       | 66/297 [00:26<01:30,  2.54it/s]\n",
      " 23%|##2       | 67/297 [00:26<01:30,  2.54it/s]\n",
      " 23%|##2       | 68/297 [00:27<01:36,  2.38it/s]\n",
      " 23%|##3       | 69/297 [00:27<01:32,  2.46it/s]\n",
      " 24%|##3       | 70/297 [00:27<01:33,  2.43it/s]\n",
      " 24%|##3       | 71/297 [00:28<01:29,  2.52it/s]\n",
      " 24%|##4       | 72/297 [00:28<01:29,  2.52it/s]\n",
      " 25%|##4       | 73/297 [00:29<01:29,  2.49it/s]\n",
      " 25%|##4       | 74/297 [00:29<01:31,  2.43it/s]\n",
      " 25%|##5       | 75/297 [00:29<01:32,  2.39it/s]\n",
      " 26%|##5       | 76/297 [00:30<01:31,  2.42it/s]\n",
      " 26%|##5       | 77/297 [00:30<01:31,  2.40it/s]\n",
      " 26%|##6       | 78/297 [00:31<01:29,  2.45it/s]\n",
      " 27%|##6       | 79/297 [00:31<01:27,  2.48it/s]\n",
      " 27%|##6       | 80/297 [00:31<01:26,  2.51it/s]\n",
      " 27%|##7       | 81/297 [00:32<01:25,  2.53it/s]\n",
      " 28%|##7       | 82/297 [00:32<01:26,  2.48it/s]\n",
      " 28%|##7       | 83/297 [00:33<01:26,  2.47it/s]\n",
      " 28%|##8       | 84/297 [00:33<01:29,  2.38it/s]\n",
      " 29%|##8       | 85/297 [00:34<01:25,  2.49it/s]\n",
      " 29%|##8       | 86/297 [00:34<01:23,  2.53it/s]\n",
      " 29%|##9       | 87/297 [00:34<01:25,  2.46it/s]\n",
      " 30%|##9       | 88/297 [00:35<01:27,  2.40it/s]\n",
      " 30%|##9       | 89/297 [00:35<01:28,  2.35it/s]\n",
      " 30%|###       | 90/297 [00:36<01:26,  2.39it/s]\n",
      " 31%|###       | 91/297 [00:36<01:27,  2.35it/s]\n",
      " 31%|###       | 92/297 [00:36<01:23,  2.44it/s]\n",
      " 31%|###1      | 93/297 [00:37<01:22,  2.48it/s]\n",
      " 32%|###1      | 94/297 [00:37<01:24,  2.40it/s]\n",
      " 32%|###1      | 95/297 [00:38<01:21,  2.47it/s]\n",
      " 32%|###2      | 96/297 [00:38<01:20,  2.50it/s]\n",
      " 33%|###2      | 97/297 [00:38<01:21,  2.46it/s]\n",
      " 33%|###2      | 98/297 [00:39<01:19,  2.49it/s]\n",
      " 33%|###3      | 99/297 [00:39<01:19,  2.49it/s]\n",
      " 34%|###3      | 100/297 [00:40<01:16,  2.56it/s]\n",
      " 34%|###4      | 101/297 [00:40<01:15,  2.60it/s]\n",
      " 34%|###4      | 102/297 [00:40<01:17,  2.52it/s]\n",
      " 35%|###4      | 103/297 [00:41<01:21,  2.39it/s]\n",
      " 35%|###5      | 104/297 [00:41<01:18,  2.46it/s]\n",
      " 35%|###5      | 105/297 [00:42<01:17,  2.47it/s]\n",
      " 36%|###5      | 106/297 [00:42<01:14,  2.55it/s]\n",
      " 36%|###6      | 107/297 [00:42<01:15,  2.50it/s]\n",
      " 36%|###6      | 108/297 [00:43<01:14,  2.53it/s]\n",
      " 37%|###6      | 109/297 [00:43<01:14,  2.53it/s]\n",
      " 37%|###7      | 110/297 [00:44<01:12,  2.57it/s]\n",
      " 37%|###7      | 111/297 [00:44<01:11,  2.59it/s]\n",
      " 38%|###7      | 112/297 [00:44<01:11,  2.60it/s]\n",
      " 38%|###8      | 113/297 [00:45<01:09,  2.65it/s]\n",
      " 38%|###8      | 114/297 [00:45<01:09,  2.64it/s]\n",
      " 39%|###8      | 115/297 [00:46<01:12,  2.52it/s]\n",
      " 39%|###9      | 116/297 [00:46<01:15,  2.41it/s]\n",
      " 39%|###9      | 117/297 [00:46<01:10,  2.55it/s]\n",
      " 40%|###9      | 118/297 [00:47<01:09,  2.58it/s]\n",
      " 40%|####      | 119/297 [00:47<01:10,  2.52it/s]\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      " 40%|####      | 120/297 [00:48<01:10,  2.53it/s]\n",
      " 41%|####      | 121/297 [00:48<01:08,  2.56it/s]\n",
      " 41%|####1     | 122/297 [00:48<01:08,  2.57it/s]\n",
      " 41%|####1     | 123/297 [00:49<01:08,  2.55it/s]\n",
      " 42%|####1     | 124/297 [00:49<01:11,  2.43it/s]\n",
      " 42%|####2     | 125/297 [00:50<01:07,  2.53it/s]\n",
      " 42%|####2     | 126/297 [00:50<01:11,  2.38it/s]\n",
      " 43%|####2     | 127/297 [00:50<01:09,  2.45it/s]\n",
      " 43%|####3     | 128/297 [00:51<01:07,  2.51it/s]\n",
      " 43%|####3     | 129/297 [00:51<01:07,  2.49it/s]\n",
      " 44%|####3     | 130/297 [00:52<01:07,  2.46it/s]\n",
      " 44%|####4     | 131/297 [00:52<01:04,  2.56it/s]\n",
      " 44%|####4     | 132/297 [00:52<01:07,  2.43it/s]\n",
      " 45%|####4     | 133/297 [00:53<01:06,  2.45it/s]\n",
      " 45%|####5     | 134/297 [00:53<01:06,  2.45it/s]\n",
      " 45%|####5     | 135/297 [00:54<01:06,  2.43it/s]\n",
      " 46%|####5     | 136/297 [00:54<01:02,  2.59it/s]\n",
      " 46%|####6     | 137/297 [00:54<01:02,  2.55it/s]\n",
      " 46%|####6     | 138/297 [00:55<01:05,  2.44it/s]\n",
      " 47%|####6     | 139/297 [00:55<01:02,  2.53it/s]\n",
      " 47%|####7     | 140/297 [00:56<01:02,  2.53it/s]\n",
      " 47%|####7     | 141/297 [00:56<01:02,  2.51it/s]\n",
      " 48%|####7     | 142/297 [00:56<00:59,  2.62it/s]\n",
      " 48%|####8     | 143/297 [00:57<00:58,  2.63it/s]\n",
      " 48%|####8     | 144/297 [00:57<00:58,  2.63it/s]\n",
      " 49%|####8     | 145/297 [00:57<00:59,  2.57it/s]\n",
      " 49%|####9     | 146/297 [00:58<00:59,  2.53it/s]\n",
      " 49%|####9     | 147/297 [00:58<01:00,  2.48it/s]\n",
      " 50%|####9     | 148/297 [00:59<00:58,  2.56it/s]\n",
      " 50%|#####     | 149/297 [00:59<01:00,  2.44it/s]\n",
      " 51%|#####     | 150/297 [00:59<00:58,  2.53it/s]\n",
      " 51%|#####     | 151/297 [01:00<00:57,  2.56it/s]\n",
      " 51%|#####1    | 152/297 [01:00<00:56,  2.55it/s]\n",
      " 52%|#####1    | 153/297 [01:01<00:56,  2.56it/s]\n",
      " 52%|#####1    | 154/297 [01:01<00:56,  2.55it/s]\n",
      " 52%|#####2    | 155/297 [01:01<00:55,  2.56it/s]\n",
      " 53%|#####2    | 156/297 [01:02<00:55,  2.54it/s]\n",
      " 53%|#####2    | 157/297 [01:02<00:54,  2.59it/s]\n",
      " 53%|#####3    | 158/297 [01:03<00:54,  2.53it/s]\n",
      " 54%|#####3    | 159/297 [01:03<00:53,  2.58it/s]\n",
      " 54%|#####3    | 160/297 [01:03<00:52,  2.62it/s]\n",
      " 54%|#####4    | 161/297 [01:04<00:52,  2.60it/s]\n",
      " 55%|#####4    | 162/297 [01:04<00:51,  2.63it/s]\n",
      " 55%|#####4    | 163/297 [01:04<00:51,  2.61it/s]\n",
      " 55%|#####5    | 164/297 [01:05<00:51,  2.56it/s]\n",
      " 56%|#####5    | 165/297 [01:05<00:51,  2.57it/s]\n",
      " 56%|#####5    | 166/297 [01:06<00:51,  2.52it/s]\n",
      " 56%|#####6    | 167/297 [01:06<00:51,  2.51it/s]\n",
      " 57%|#####6    | 168/297 [01:06<00:51,  2.53it/s]\n",
      " 57%|#####6    | 169/297 [01:07<00:51,  2.50it/s]\n",
      " 57%|#####7    | 170/297 [01:07<00:51,  2.48it/s]\n",
      " 58%|#####7    | 171/297 [01:08<00:51,  2.46it/s]\n",
      " 58%|#####7    | 172/297 [01:08<00:50,  2.46it/s]\n",
      " 58%|#####8    | 173/297 [01:09<00:50,  2.46it/s]\n",
      " 59%|#####8    | 174/297 [01:09<00:48,  2.54it/s]\n",
      " 59%|#####8    | 175/297 [01:09<00:46,  2.60it/s]\n",
      " 59%|#####9    | 176/297 [01:10<00:46,  2.60it/s]\n",
      " 60%|#####9    | 177/297 [01:10<00:45,  2.66it/s]\n",
      " 60%|#####9    | 178/297 [01:10<00:47,  2.52it/s]\n",
      " 60%|######    | 179/297 [01:11<00:47,  2.49it/s]\n",
      " 61%|######    | 180/297 [01:11<00:45,  2.60it/s]\n",
      " 61%|######    | 181/297 [01:12<00:44,  2.63it/s]\n",
      " 61%|######1   | 182/297 [01:12<00:44,  2.56it/s]\n",
      " 62%|######1   | 183/297 [01:12<00:44,  2.56it/s]\n",
      " 62%|######1   | 184/297 [01:13<00:43,  2.60it/s]\n",
      " 62%|######2   | 185/297 [01:13<00:48,  2.33it/s]\n",
      " 63%|######2   | 186/297 [01:14<00:46,  2.40it/s]\n",
      " 63%|######2   | 187/297 [01:14<00:44,  2.45it/s]\n",
      " 63%|######3   | 188/297 [01:14<00:44,  2.45it/s]\n",
      " 64%|######3   | 189/297 [01:15<00:44,  2.41it/s]\n",
      " 64%|######3   | 190/297 [01:15<00:43,  2.46it/s]\n",
      " 64%|######4   | 191/297 [01:16<00:41,  2.54it/s]\n",
      " 65%|######4   | 192/297 [01:16<00:41,  2.52it/s]\n",
      " 65%|######4   | 193/297 [01:16<00:41,  2.49it/s]\n",
      " 65%|######5   | 194/297 [01:17<00:40,  2.51it/s]\n",
      " 66%|######5   | 195/297 [01:17<00:40,  2.54it/s]\n",
      " 66%|######5   | 196/297 [01:18<00:40,  2.51it/s]\n",
      " 66%|######6   | 197/297 [01:18<00:39,  2.52it/s]\n",
      " 67%|######6   | 198/297 [01:18<00:39,  2.50it/s]\n",
      " 67%|######7   | 199/297 [01:19<00:39,  2.49it/s]\n",
      " 67%|######7   | 200/297 [01:19<00:39,  2.44it/s]\n",
      " 68%|######7   | 201/297 [01:20<00:43,  2.23it/s]\n",
      " 68%|######8   | 202/297 [01:20<00:40,  2.32it/s]\n",
      " 68%|######8   | 203/297 [01:21<00:39,  2.38it/s]\n",
      " 69%|######8   | 204/297 [01:21<00:37,  2.45it/s]\n",
      " 69%|######9   | 205/297 [01:21<00:37,  2.43it/s]\n",
      " 69%|######9   | 206/297 [01:22<00:36,  2.47it/s]\n",
      " 70%|######9   | 207/297 [01:22<00:36,  2.44it/s]\n",
      " 70%|#######   | 208/297 [01:23<00:36,  2.45it/s]\n",
      " 70%|#######   | 209/297 [01:23<00:36,  2.39it/s]\n",
      " 71%|#######   | 210/297 [01:23<00:35,  2.43it/s]\n",
      " 71%|#######1  | 211/297 [01:24<00:34,  2.53it/s]\n",
      " 71%|#######1  | 212/297 [01:24<00:33,  2.56it/s]\n",
      " 72%|#######1  | 213/297 [01:25<00:32,  2.61it/s]\n",
      " 72%|#######2  | 214/297 [01:25<00:32,  2.52it/s]\n",
      " 72%|#######2  | 215/297 [01:25<00:32,  2.54it/s]\n",
      " 73%|#######2  | 216/297 [01:26<00:34,  2.36it/s]\n",
      " 73%|#######3  | 217/297 [01:26<00:33,  2.37it/s]\n",
      " 73%|#######3  | 218/297 [01:27<00:32,  2.42it/s]\n",
      " 74%|#######3  | 219/297 [01:27<00:32,  2.40it/s]\n",
      " 74%|#######4  | 220/297 [01:28<00:31,  2.42it/s]\n",
      " 74%|#######4  | 221/297 [01:28<00:31,  2.44it/s]\n",
      " 75%|#######4  | 222/297 [01:28<00:30,  2.45it/s]\n",
      " 75%|#######5  | 223/297 [01:29<00:30,  2.45it/s]\n",
      " 75%|#######5  | 224/297 [01:29<00:32,  2.26it/s]\n",
      " 76%|#######5  | 225/297 [01:30<00:31,  2.30it/s]\n",
      " 76%|#######6  | 226/297 [01:30<00:29,  2.43it/s]\n",
      " 76%|#######6  | 227/297 [01:30<00:28,  2.48it/s]\n",
      " 77%|#######6  | 228/297 [01:31<00:28,  2.43it/s]\n",
      " 77%|#######7  | 229/297 [01:31<00:27,  2.48it/s]\n",
      " 77%|#######7  | 230/297 [01:32<00:26,  2.56it/s]\n",
      " 78%|#######7  | 231/297 [01:32<00:25,  2.55it/s]\n",
      " 78%|#######8  | 232/297 [01:32<00:25,  2.55it/s]\n",
      " 78%|#######8  | 233/297 [01:33<00:24,  2.60it/s]\n",
      " 79%|#######8  | 234/297 [01:33<00:24,  2.57it/s]\n",
      " 79%|#######9  | 235/297 [01:34<00:24,  2.54it/s]\n",
      " 79%|#######9  | 236/297 [01:34<00:24,  2.44it/s]\n",
      " 80%|#######9  | 237/297 [01:34<00:24,  2.50it/s]\n",
      " 80%|########  | 238/297 [01:35<00:23,  2.50it/s]\n",
      " 80%|########  | 239/297 [01:35<00:22,  2.58it/s]\n",
      " 81%|########  | 240/297 [01:35<00:21,  2.68it/s]\n",
      " 81%|########1 | 241/297 [01:36<00:21,  2.57it/s]\n",
      " 81%|########1 | 242/297 [01:36<00:21,  2.58it/s]\n",
      " 82%|########1 | 243/297 [01:37<00:22,  2.44it/s]\n",
      " 82%|########2 | 244/297 [01:37<00:21,  2.49it/s]\n",
      " 82%|########2 | 245/297 [01:38<00:21,  2.41it/s]\n",
      " 83%|########2 | 246/297 [01:38<00:20,  2.49it/s]\n",
      " 83%|########3 | 247/297 [01:38<00:19,  2.58it/s]\n",
      " 84%|########3 | 248/297 [01:39<00:18,  2.59it/s]\n",
      " 84%|########3 | 249/297 [01:39<00:18,  2.66it/s]\n",
      " 84%|########4 | 250/297 [01:39<00:17,  2.62it/s]\n",
      " 85%|########4 | 251/297 [01:40<00:18,  2.55it/s]\n",
      " 85%|########4 | 252/297 [01:40<00:17,  2.58it/s]\n",
      " 85%|########5 | 253/297 [01:41<00:16,  2.60it/s]\n",
      " 86%|########5 | 254/297 [01:41<00:16,  2.58it/s]\n",
      " 86%|########5 | 255/297 [01:41<00:16,  2.60it/s]\n",
      " 86%|########6 | 256/297 [01:42<00:15,  2.59it/s]\n",
      " 87%|########6 | 257/297 [01:42<00:15,  2.64it/s]\n",
      " 87%|########6 | 258/297 [01:43<00:15,  2.56it/s]\n",
      " 87%|########7 | 259/297 [01:43<00:15,  2.49it/s]\n",
      " 88%|########7 | 260/297 [01:43<00:14,  2.49it/s]\n",
      " 88%|########7 | 261/297 [01:44<00:14,  2.49it/s]\n",
      " 88%|########8 | 262/297 [01:44<00:14,  2.48it/s]\n",
      " 89%|########8 | 263/297 [01:45<00:13,  2.45it/s]\n",
      " 89%|########8 | 264/297 [01:45<00:13,  2.48it/s]\n",
      " 89%|########9 | 265/297 [01:45<00:13,  2.45it/s]\n",
      " 90%|########9 | 266/297 [01:46<00:12,  2.49it/s]\n",
      " 90%|########9 | 267/297 [01:46<00:13,  2.30it/s]\n",
      " 90%|######### | 268/297 [01:47<00:12,  2.38it/s]\n",
      " 91%|######### | 269/297 [01:47<00:11,  2.38it/s]\n",
      " 91%|######### | 270/297 [01:48<00:11,  2.33it/s]\n",
      " 91%|#########1| 271/297 [01:48<00:10,  2.44it/s]\n",
      " 92%|#########1| 272/297 [01:48<00:10,  2.39it/s]\n",
      " 92%|#########1| 273/297 [01:49<00:09,  2.44it/s]\n",
      " 92%|#########2| 274/297 [01:49<00:09,  2.42it/s]\n",
      " 93%|#########2| 275/297 [01:50<00:08,  2.52it/s]\n",
      " 93%|#########2| 276/297 [01:50<00:08,  2.44it/s]\n",
      " 93%|#########3| 277/297 [01:50<00:08,  2.46it/s]\n",
      " 94%|#########3| 278/297 [01:51<00:07,  2.46it/s]\n",
      " 94%|#########3| 279/297 [01:51<00:07,  2.35it/s]\n",
      " 94%|#########4| 280/297 [01:52<00:07,  2.41it/s]\n",
      " 95%|#########4| 281/297 [01:52<00:06,  2.52it/s]\n",
      " 95%|#########4| 282/297 [01:52<00:05,  2.53it/s]\n",
      " 95%|#########5| 283/297 [01:53<00:05,  2.57it/s]\n",
      " 96%|#########5| 284/297 [01:53<00:04,  2.66it/s]\n",
      " 96%|#########5| 285/297 [01:54<00:04,  2.58it/s]\n",
      " 96%|#########6| 286/297 [01:54<00:04,  2.59it/s]\n",
      " 97%|#########6| 287/297 [01:54<00:03,  2.61it/s]\n",
      " 97%|#########6| 288/297 [01:55<00:03,  2.64it/s]\n",
      " 97%|#########7| 289/297 [01:55<00:03,  2.59it/s]\n",
      " 98%|#########7| 290/297 [01:55<00:02,  2.56it/s]\n",
      " 98%|#########7| 291/297 [01:56<00:02,  2.52it/s]\n",
      " 98%|#########8| 292/297 [01:56<00:01,  2.54it/s]\n",
      " 99%|#########8| 293/297 [01:57<00:01,  2.54it/s]\n",
      " 99%|#########8| 294/297 [01:57<00:01,  2.48it/s]\n",
      " 99%|#########9| 295/297 [01:57<00:00,  2.45it/s]\n",
      "100%|#########9| 296/297 [01:58<00:00,  2.43it/s]\n",
      "100%|##########| 297/297 [01:58<00:00,  2.87it/s]02/17/2022 17:25:16 - INFO - __main__ - Validation-set | bleu: 6.74998952187005 | accuracy: 1.0\n",
      "02/17/2022 17:25:24 - INFO - __main__ - Test-set | bleu: 0.0 | accuracy: 1.0\n",
      "Configuration saved in out/tweet/t5_version_2\\config.json\n",
      "Model weights saved in out/tweet/t5_version_2\\pytorch_model.bin\n",
      "tokenizer config file saved in out/tweet/t5_version_2\\tokenizer_config.json\n",
      "Special tokens file saved in out/tweet/t5_version_2\\special_tokens_map.json\n",
      "Copy vocab file to out/tweet/t5_version_2\\spiece.model\n",
      "\n",
      "100%|##########| 297/297 [02:15<00:00,  2.19it/s]\n"
     ]
    }
   ],
   "source": [
    "!python run_translation_no_trainer.py \\\n",
    "  --model_name_or_path t5-small \\\n",
    "  --train_file data/translations-train.json \\\n",
    "  --validation_file data/translations-valid.json \\\n",
    "  --test_file data/translations-test.json \\\n",
    "  --per_device_train_batch_size 16 \\\n",
    "  --per_device_eval_batch_size 16 \\\n",
    "  --source_prefix \"tweet classification\" \\\n",
    "  --max_source_length 256 \\\n",
    "  --max_target_length 128 \\\n",
    "  --max_length 128 \\\n",
    "  --num_train_epochs 1 \\\n",
    "  --freeze_encoder \\\n",
    "  --output_dir out/tweet/t5_version_2"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# EVALUATING MODELS"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Roberta"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "02/17/2022 17:22:05 - WARNING - __main__ - Process rank: -1, device: cpu, n_gpu: 0distributed training: False, 16-bits training: False"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\n",
      "  0%|          | 0/2 [00:00<?, ?it/s]\n",
      "100%|##########| 2/2 [00:00<00:00, 143.26it/s]\n",
      "[INFO|configuration_utils.py:586] 2022-02-17 17:22:05,892 >> loading configuration file out/tweet/roberta_version_2\\config.json\n",
      "[INFO|configuration_utils.py:625] 2022-02-17 17:22:05,893 >> Model config RobertaConfig {\n",
      "  \"_name_or_path\": \"roberta-base\",\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "02/17/2022 17:22:05 - INFO - __main__ - Training/evaluation parameters TrainingArguments(\n",
      "_n_gpu=0,\n",
      "adafactor=False,\n",
      "adam_beta1=0.9,\n",
      "adam_beta2=0.999,\n",
      "adam_epsilon=1e-08,\n",
      "dataloader_drop_last=False,\n",
      "dataloader_num_workers=0,\n",
      "dataloader_pin_memory=True,\n",
      "ddp_find_unused_parameters=None,\n",
      "debug=[],\n",
      "deepspeed=None,\n",
      "disable_tqdm=False,\n",
      "do_eval=True,\n",
      "do_predict=False,\n",
      "do_train=False,\n",
      "eval_accumulation_steps=None,\n",
      "eval_steps=None,\n",
      "evaluation_strategy=IntervalStrategy.NO,\n",
      "fp16=False,\n",
      "fp16_backend=auto,\n",
      "fp16_full_eval=False,\n",
      "fp16_opt_level=O1,\n",
      "gradient_accumulation_steps=1,\n",
      "gradient_checkpointing=False,\n",
      "greater_is_better=None,\n",
      "group_by_length=False,\n",
      "hub_model_id=None,\n",
      "hub_strategy=HubStrategy.EVERY_SAVE,\n",
      "hub_token=<HUB_TOKEN>,\n",
      "ignore_data_skip=False,\n",
      "label_names=None,\n",
      "label_smoothing_factor=0.0,\n",
      "learning_rate=5e-05,\n",
      "length_column_name=length,\n",
      "load_best_model_at_end=False,\n",
      "local_rank=-1,\n",
      "log_level=-1,\n",
      "log_level_replica=-1,\n",
      "log_on_each_node=True,\n",
      "logging_dir=out/tweet/roberta_version_2-evaluation\\runs\\Feb17_17-22-05_DESKTOP-K706NKK,\n",
      "logging_first_step=False,\n",
      "logging_nan_inf_filter=True,\n",
      "logging_steps=500,\n",
      "logging_strategy=IntervalStrategy.STEPS,\n",
      "lr_scheduler_type=SchedulerType.LINEAR,\n",
      "max_grad_norm=1.0,\n",
      "max_steps=-1,\n",
      "metric_for_best_model=None,\n",
      "mp_parameters=,\n",
      "no_cuda=False,\n",
      "num_train_epochs=3.0,\n",
      "output_dir=out/tweet/roberta_version_2-evaluation,\n",
      "overwrite_output_dir=False,\n",
      "past_index=-1,\n",
      "per_device_eval_batch_size=24,\n",
      "per_device_train_batch_size=8,\n",
      "prediction_loss_only=False,\n",
      "push_to_hub=False,\n",
      "push_to_hub_model_id=None,\n",
      "push_to_hub_organization=None,\n",
      "push_to_hub_token=<PUSH_TO_HUB_TOKEN>,\n",
      "remove_unused_columns=True,\n",
      "report_to=[],\n",
      "resume_from_checkpoint=None,\n",
      "run_name=out/tweet/roberta_version_2-evaluation,\n",
      "save_on_each_node=False,\n",
      "save_steps=500,\n",
      "save_strategy=IntervalStrategy.STEPS,\n",
      "save_total_limit=None,\n",
      "seed=42,\n",
      "sharded_ddp=[],\n",
      "skip_memory_metrics=True,\n",
      "tpu_metrics_debug=False,\n",
      "tpu_num_cores=None,\n",
      "use_legacy_prediction_loop=False,\n",
      "warmup_ratio=0.0,\n",
      "warmup_steps=0,\n",
      "weight_decay=0.0,\n",
      "xpu_backend=None,\n",
      ")\n",
      "02/17/2022 17:22:05 - INFO - __main__ - load a local file for train: data/train.json\n",
      "02/17/2022 17:22:05 - INFO - __main__ - load a local file for validation: data/valid.json\n",
      "02/17/2022 17:22:05 - WARNING - datasets.builder - Using custom data configuration default-f2672b914d9c5a33\n",
      "02/17/2022 17:22:05 - INFO - datasets.builder - Overwrite dataset info from restored data version.\n",
      "02/17/2022 17:22:05 - INFO - datasets.info - Loading Dataset info from C:\\Users\\Foka\\.cache\\huggingface\\datasets\\json\\default-f2672b914d9c5a33\\0.0.0\\c2d554c3377ea79c7664b93dc65d0803b45e3279000f993c7bfd18937fd7f426\n",
      "02/17/2022 17:22:05 - WARNING - datasets.builder - Reusing dataset json (C:\\Users\\Foka\\.cache\\huggingface\\datasets\\json\\default-f2672b914d9c5a33\\0.0.0\\c2d554c3377ea79c7664b93dc65d0803b45e3279000f993c7bfd18937fd7f426)\n",
      "02/17/2022 17:22:05 - INFO - datasets.info - Loading Dataset info from C:\\Users\\Foka\\.cache\\huggingface\\datasets\\json\\default-f2672b914d9c5a33\\0.0.0\\c2d554c3377ea79c7664b93dc65d0803b45e3279000f993c7bfd18937fd7f426\n",
      "02/17/2022 17:22:05 - INFO - __main__ - Return hidden states from model: True\n",
      "02/17/2022 17:22:05 - INFO - __main__ - Using implementation from: RobertaForSequenceClassificationCustomAlternative\n",
      "02/17/2022 17:22:07 - INFO - datasets.arrow_dataset - Caching processed dataset at C:\\Users\\Foka\\.cache\\huggingface\\datasets\\json\\default-f2672b914d9c5a33\\0.0.0\\c2d554c3377ea79c7664b93dc65d0803b45e3279000f993c7bfd18937fd7f426\\cache-d1d24efe1f314f1d.arrow\n",
      "02/17/2022 17:22:07 - INFO - datasets.arrow_dataset - Caching processed dataset at C:\\Users\\Foka\\.cache\\huggingface\\datasets\\json\\default-f2672b914d9c5a33\\0.0.0\\c2d554c3377ea79c7664b93dc65d0803b45e3279000f993c7bfd18937fd7f426\\cache-74073ef035f90484.arrow\n",
      "02/17/2022 17:22:08 - INFO - __main__ - *** Evaluate ***\n",
      "***** eval metrics *****\n",
      "  eval_accuracy           =      0.938\n",
      "  eval_loss               =      0.673\n",
      "  eval_runtime            = 0:00:46.31\n",
      "  eval_samples            =        500\n",
      "  eval_samples_per_second =     10.795\n",
      "  eval_steps_per_second   =      0.453\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "  \"architectures\": [\n",
      "    \"RobertaForSequenceClassification\"\n",
      "  ],\n",
      "  \"attention_probs_dropout_prob\": 0.1,\n",
      "  \"bos_token_id\": 0,\n",
      "  \"classifier_dropout\": null,\n",
      "  \"eos_token_id\": 2,\n",
      "  \"hidden_act\": \"gelu\",\n",
      "  \"hidden_dropout_prob\": 0.1,\n",
      "  \"hidden_size\": 768,\n",
      "  \"id2label\": {\n",
      "    \"0\": 0,\n",
      "    \"1\": 1\n",
      "  },\n",
      "  \"initializer_range\": 0.02,\n",
      "  \"intermediate_size\": 3072,\n",
      "  \"label2id\": {\n",
      "    \"0\": 0,\n",
      "    \"1\": 1\n",
      "  },\n",
      "  \"layer_norm_eps\": 1e-05,\n",
      "  \"max_position_embeddings\": 514,\n",
      "  \"model_type\": \"roberta\",\n",
      "  \"num_attention_heads\": 12,\n",
      "  \"num_hidden_layers\": 12,\n",
      "  \"pad_token_id\": 1,\n",
      "  \"position_embedding_type\": \"absolute\",\n",
      "  \"problem_type\": \"single_label_classification\",\n",
      "  \"torch_dtype\": \"float32\",\n",
      "  \"transformers_version\": \"4.12.5\",\n",
      "  \"type_vocab_size\": 1,\n",
      "  \"use_cache\": true,\n",
      "  \"vocab_size\": 50265\n",
      "}\n",
      "\n",
      "[INFO|tokenization_utils_base.py:1671] 2022-02-17 17:22:05,900 >> Didn't find file out/tweet/roberta_version_2\\added_tokens.json. We won't load it.\n",
      "[INFO|tokenization_utils_base.py:1740] 2022-02-17 17:22:05,900 >> loading file out/tweet/roberta_version_2\\vocab.json\n",
      "[INFO|tokenization_utils_base.py:1740] 2022-02-17 17:22:05,900 >> loading file out/tweet/roberta_version_2\\merges.txt\n",
      "[INFO|tokenization_utils_base.py:1740] 2022-02-17 17:22:05,900 >> loading file out/tweet/roberta_version_2\\tokenizer.json\n",
      "[INFO|tokenization_utils_base.py:1740] 2022-02-17 17:22:05,900 >> loading file None\n",
      "[INFO|tokenization_utils_base.py:1740] 2022-02-17 17:22:05,900 >> loading file out/tweet/roberta_version_2\\special_tokens_map.json\n",
      "[INFO|tokenization_utils_base.py:1740] 2022-02-17 17:22:05,900 >> loading file out/tweet/roberta_version_2\\tokenizer_config.json\n",
      "[INFO|modeling_utils.py:1349] 2022-02-17 17:22:05,959 >> loading weights file out/tweet/roberta_version_2\\pytorch_model.bin\n",
      "[WARNING|modeling_utils.py:1609] 2022-02-17 17:22:07,196 >> Some weights of the model checkpoint at out/tweet/roberta_version_2 were not used when initializing RobertaForSequenceClassificationCustomAlternative: ['classifier.dense.weight', 'classifier.dense.bias']\n",
      "- This IS expected if you are initializing RobertaForSequenceClassificationCustomAlternative from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).\n",
      "- This IS NOT expected if you are initializing RobertaForSequenceClassificationCustomAlternative from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n",
      "[WARNING|modeling_utils.py:1620] 2022-02-17 17:22:07,196 >> Some weights of RobertaForSequenceClassificationCustomAlternative were not initialized from the model checkpoint at out/tweet/roberta_version_2 and are newly initialized: ['classifier.dense_1_hidden.weight', 'classifier.dense_2.weight', 'classifier.dense_1_input.weight', 'classifier.dense_1_hidden.bias', 'classifier.dense_1_input.bias', 'classifier.dense_2.bias']\n",
      "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n",
      "\n",
      "Running tokenizer on dataset:   0%|          | 0/5 [00:00<?, ?ba/s]\n",
      "Running tokenizer on dataset:  60%|######    | 3/5 [00:00<00:00, 22.77ba/s]\n",
      "Running tokenizer on dataset: 100%|##########| 5/5 [00:00<00:00, 26.11ba/s]\n",
      "\n",
      "Running tokenizer on dataset:   0%|          | 0/1 [00:00<?, ?ba/s]\n",
      "Running tokenizer on dataset: 100%|##########| 1/1 [00:00<00:00, 58.98ba/s]\n",
      "[INFO|trainer.py:540] 2022-02-17 17:22:08,390 >> The following columns in the evaluation set  don't have a corresponding argument in `RobertaForSequenceClassificationCustomAlternative.forward` and have been ignored: tweet.\n",
      "[INFO|trainer.py:2243] 2022-02-17 17:22:08,392 >> ***** Running Evaluation *****\n",
      "[INFO|trainer.py:2245] 2022-02-17 17:22:08,392 >>   Num examples = 500\n",
      "[INFO|trainer.py:2248] 2022-02-17 17:22:08,392 >>   Batch size = 24\n",
      "\n",
      "  0%|          | 0/21 [00:00<?, ?it/s]\n",
      " 10%|9         | 2/21 [00:02<00:20,  1.09s/it]\n",
      " 14%|#4        | 3/21 [00:04<00:27,  1.55s/it]\n",
      " 19%|#9        | 4/21 [00:06<00:30,  1.80s/it]\n",
      " 24%|##3       | 5/21 [00:08<00:30,  1.94s/it]\n",
      " 29%|##8       | 6/21 [00:11<00:30,  2.03s/it]\n",
      " 33%|###3      | 7/21 [00:13<00:29,  2.09s/it]\n",
      " 38%|###8      | 8/21 [00:15<00:27,  2.13s/it]\n",
      " 43%|####2     | 9/21 [00:17<00:25,  2.16s/it]\n",
      " 48%|####7     | 10/21 [00:19<00:24,  2.18s/it]\n",
      " 52%|#####2    | 11/21 [00:22<00:21,  2.20s/it]\n",
      " 57%|#####7    | 12/21 [00:24<00:20,  2.25s/it]\n",
      " 62%|######1   | 13/21 [00:26<00:17,  2.25s/it]\n",
      " 67%|######6   | 14/21 [00:28<00:15,  2.23s/it]\n",
      " 71%|#######1  | 15/21 [00:31<00:13,  2.24s/it]\n",
      " 76%|#######6  | 16/21 [00:33<00:11,  2.24s/it]\n",
      " 81%|########  | 17/21 [00:35<00:08,  2.22s/it]\n",
      " 86%|########5 | 18/21 [00:37<00:06,  2.23s/it]\n",
      " 90%|######### | 19/21 [00:40<00:04,  2.21s/it]\n",
      " 95%|#########5| 20/21 [00:42<00:02,  2.20s/it]\n",
      "100%|##########| 21/21 [00:44<00:00,  2.11s/it]\n",
      "100%|##########| 21/21 [00:44<00:00,  2.10s/it]\n",
      "[INFO|modelcard.py:449] 2022-02-17 17:22:55,278 >> Dropping the following result as it does not have all the necessary fields:\n",
      "{'task': {'name': 'Text Classification', 'type': 'text-classification'}}\n"
     ]
    }
   ],
   "source": [
    "# Validation run: score the checkpoint in out/tweet/roberta_version_2 on data/valid.json.\n",
    "# Only --do_eval is passed, so no training is requested; metrics go to the -evaluation output dir.\n",
    "# The custom-model flags are given once (they were previously repeated at the end of the command).\n",
    "!python run_glue.py \\\n",
    "--model_name_or_path out/tweet/roberta_version_2 \\\n",
    "--output_dir out/tweet/roberta_version_2-evaluation \\\n",
    "--return_hidden_states --custom_model \\\n",
    "--train_file data/train.json --validation_file data/valid.json \\\n",
    "--do_eval \\\n",
    "--per_device_eval_batch_size 24 --max_seq_length 128"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "02/16/2022 01:12:34 - WARNING - __main__ - Process rank: -1, device: cpu, n_gpu: 0distributed training: False, 16-bits training: False\n",
      "02/16/2022 01:12:34 - INFO - __main__ - Training/evaluation parameters TrainingArguments(\n",
      "_n_gpu=0,\n",
      "adafactor=False,\n",
      "adam_beta1=0.9,\n",
      "adam_beta2=0.999,\n",
      "adam_epsilon=1e-08,\n",
      "dataloader_drop_last=False,\n",
      "dataloader_num_workers=0,\n",
      "dataloader_pin_memory=True,\n",
      "ddp_find_unused_parameters=None,"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\n",
      "  0%|          | 0/2 [00:00<?, ?it/s]\n",
      "100%|##########| 2/2 [00:00<00:00, 167.11it/s]\n",
      "[INFO|configuration_utils.py:586] 2022-02-16 01:12:34,776 >> loading configuration file out/tweet/roberta_version_4\\config.json\n",
      "[INFO|configuration_utils.py:625] 2022-02-16 01:12:34,776 >> Model config RobertaConfig {\n",
      "  \"_name_or_path\": \"roberta-base\",\n",
      "  \"architectures\": [\n",
      "    \"RobertaForSequenceClassificationCustomAlternative\"\n",
      "  ],\n",
      "  \"attention_probs_dropout_prob\": 0.1,\n",
      "  \"bos_token_id\": 0,\n",
      "  \"classifier_dropout\": null,\n",
      "  \"eos_token_id\": 2,\n",
      "  \"hidden_act\": \"gelu\",\n",
      "  \"hidden_dropout_prob\": 0.1,\n",
      "  \"hidden_size\": 768,\n",
      "  \"id2label\": {\n",
      "    \"0\": 0,\n",
      "    \"1\": 1\n",
      "  },\n",
      "  \"initializer_range\": 0.02,\n",
      "  \"intermediate_size\": 3072,\n",
      "  \"label2id\": {\n",
      "    \"0\": 0,\n",
      "    \"1\": 1\n",
      "  },\n",
      "  \"layer_norm_eps\": 1e-05,\n",
      "  \"max_position_embeddings\": 514,\n",
      "  \"model_type\": \"roberta\",\n",
      "  \"num_attention_heads\": 12,\n",
      "  \"num_hidden_layers\": 12,\n",
      "  \"pad_token_id\": 1,\n",
      "  \"position_embedding_type\": \"absolute\",\n",
      "  \"problem_type\": \"single_label_classification\",\n",
      "  \"torch_dtype\": \"float32\",\n",
      "  \"transformers_version\": \"4.12.5\",\n",
      "  \"type_vocab_size\": 1,\n",
      "  \"use_cache\": true,\n",
      "  \"vocab_size\": 50265\n",
      "}\n",
      "\n",
      "[INFO|tokenization_utils_base.py:1671] 2022-02-16 01:12:34,779 >> Didn't find file out/tweet/roberta_version_4\\added_tokens.json. We won't load it.\n",
      "[INFO|tokenization_utils_base.py:1740] 2022-02-16 01:12:34,779 >> loading file out/tweet/roberta_version_4\\vocab.json\n",
      "[INFO|tokenization_utils_base.py:1740] 2022-02-16 01:12:34,779 >> loading file out/tweet/roberta_version_4\\merges.txt\n",
      "[INFO|tokenization_utils_base.py:1740] 2022-02-16 01:12:34,779 >> loading file out/tweet/roberta_version_4\\tokenizer.json\n",
      "[INFO|tokenization_utils_base.py:1740] 2022-02-16 01:12:34,779 >> loading file None\n",
      "[INFO|tokenization_utils_base.py:1740] 2022-02-16 01:12:34,779 >> loading file out/tweet/roberta_version_4\\special_tokens_map.json\n",
      "[INFO|tokenization_utils_base.py:1740] 2022-02-16 01:12:34,780 >> loading file out/tweet/roberta_version_4\\tokenizer_config.json\n",
      "[INFO|modeling_utils.py:1349] 2022-02-16 01:12:34,829 >> loading weights file out/tweet/roberta_version_4\\pytorch_model.bin\n",
      "[INFO|modeling_utils.py:1618] 2022-02-16 01:12:35,990 >> All model checkpoint weights were used when initializing RobertaForSequenceClassificationCustomAlternative.\n",
      "\n",
      "[INFO|modeling_utils.py:1626] 2022-02-16 01:12:35,990 >> All the weights of RobertaForSequenceClassificationCustomAlternative were initialized from the model checkpoint at out/tweet/roberta_version_4.\n",
      "If your task is similar to the task the model of the checkpoint was trained on, you can already use RobertaForSequenceClassificationCustomAlternative for predictions without further training.\n",
      "\n",
      "Running tokenizer on dataset:   0%|          | 0/1 [00:00<?, ?ba/s]\n",
      "Running tokenizer on dataset: 100%|##########| 1/1 [00:00<00:00, 26.36ba/s]\n",
      "[INFO|trainer.py:540] 2022-02-16 01:12:36,822 >> The following columns in the evaluation set  don't have a corresponding argument in `RobertaForSequenceClassificationCustomAlternative.forward` and have been ignored: tweet.\n",
      "[INFO|trainer.py:2243] 2022-02-16 01:12:36,823 >> ***** Running Evaluation *****\n",
      "[INFO|trainer.py:2245] 2022-02-16 01:12:36,824 >>   Num examples = 500\n",
      "[INFO|trainer.py:2248] 2022-02-16 01:12:36,824 >>   Batch size = 24\n",
      "\n",
      "  0%|          | 0/21 [00:00<?, ?it/s]\n",
      " 10%|9         | 2/21 [00:02<00:20,  1.08s/it]\n",
      " 14%|#4        | 3/21 [00:04<00:27,  1.52s/it]\n",
      " 19%|#9        | 4/21 [00:06<00:29,  1.76s/it]\n",
      " 24%|##3       | 5/21 [00:08<00:30,  1.91s/it]\n",
      " 29%|##8       | 6/21 [00:10<00:30,  2.00s/it]\n",
      " 33%|###3      | 7/21 [00:13<00:29,  2.07s/it]\n",
      " 38%|###8      | 8/21 [00:15<00:27,  2.12s/it]\n",
      " 43%|####2     | 9/21 [00:17<00:25,  2.14s/it]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "debug=[],\n",
      "deepspeed=None,\n",
      "disable_tqdm=False,\n",
      "do_eval=True,\n",
      "do_predict=False,\n",
      "do_train=False,\n",
      "eval_accumulation_steps=None,\n",
      "eval_steps=None,\n",
      "evaluation_strategy=IntervalStrategy.NO,\n",
      "fp16=False,\n",
      "fp16_backend=auto,\n",
      "fp16_full_eval=False,\n",
      "fp16_opt_level=O1,\n",
      "gradient_accumulation_steps=1,\n",
      "gradient_checkpointing=False,\n",
      "greater_is_better=None,\n",
      "group_by_length=False,\n",
      "hub_model_id=None,\n",
      "hub_strategy=HubStrategy.EVERY_SAVE,\n",
      "hub_token=<HUB_TOKEN>,\n",
      "ignore_data_skip=False,\n",
      "label_names=None,\n",
      "label_smoothing_factor=0.0,\n",
      "learning_rate=5e-05,\n",
      "length_column_name=length,\n",
      "load_best_model_at_end=False,\n",
      "local_rank=-1,\n",
      "log_level=-1,\n",
      "log_level_replica=-1,\n",
      "log_on_each_node=True,\n",
      "logging_dir=out/tweet/roberta_version_4-evaluation\\runs\\Feb16_01-12-34_DESKTOP-K706NKK,\n",
      "logging_first_step=False,\n",
      "logging_nan_inf_filter=True,\n",
      "logging_steps=500,\n",
      "logging_strategy=IntervalStrategy.STEPS,\n",
      "lr_scheduler_type=SchedulerType.LINEAR,\n",
      "max_grad_norm=1.0,\n",
      "max_steps=-1,\n",
      "metric_for_best_model=None,\n",
      "mp_parameters=,\n",
      "no_cuda=False,\n",
      "num_train_epochs=3.0,\n",
      "output_dir=out/tweet/roberta_version_4-evaluation,\n",
      "overwrite_output_dir=False,\n",
      "past_index=-1,\n",
      "per_device_eval_batch_size=24,\n",
      "per_device_train_batch_size=8,\n",
      "prediction_loss_only=False,\n",
      "push_to_hub=False,\n",
      "push_to_hub_model_id=None,\n",
      "push_to_hub_organization=None,\n",
      "push_to_hub_token=<PUSH_TO_HUB_TOKEN>,\n",
      "remove_unused_columns=True,\n",
      "report_to=[],\n",
      "resume_from_checkpoint=None,\n",
      "run_name=out/tweet/roberta_version_4-evaluation,\n",
      "save_on_each_node=False,\n",
      "save_steps=500,\n",
      "save_strategy=IntervalStrategy.STEPS,\n",
      "save_total_limit=None,\n",
      "seed=42,\n",
      "sharded_ddp=[],\n",
      "skip_memory_metrics=True,\n",
      "tpu_metrics_debug=False,\n",
      "tpu_num_cores=None,\n",
      "use_legacy_prediction_loop=False,\n",
      "warmup_ratio=0.0,\n",
      "warmup_steps=0,\n",
      "weight_decay=0.0,\n",
      "xpu_backend=None,\n",
      ")\n",
      "02/16/2022 01:12:34 - INFO - __main__ - load a local file for train: data/train.json\n",
      "02/16/2022 01:12:34 - INFO - __main__ - load a local file for validation: data/test.json\n",
      "02/16/2022 01:12:34 - WARNING - datasets.builder - Using custom data configuration default-aa408910693fa782\n",
      "02/16/2022 01:12:34 - INFO - datasets.builder - Overwrite dataset info from restored data version.\n",
      "02/16/2022 01:12:34 - INFO - datasets.info - Loading Dataset info from C:\\Users\\Foka\\.cache\\huggingface\\datasets\\json\\default-aa408910693fa782\\0.0.0\\c2d554c3377ea79c7664b93dc65d0803b45e3279000f993c7bfd18937fd7f426\n",
      "02/16/2022 01:12:34 - WARNING - datasets.builder - Reusing dataset json (C:\\Users\\Foka\\.cache\\huggingface\\datasets\\json\\default-aa408910693fa782\\0.0.0\\c2d554c3377ea79c7664b93dc65d0803b45e3279000f993c7bfd18937fd7f426)\n",
      "02/16/2022 01:12:34 - INFO - datasets.info - Loading Dataset info from C:\\Users\\Foka\\.cache\\huggingface\\datasets\\json\\default-aa408910693fa782\\0.0.0\\c2d554c3377ea79c7664b93dc65d0803b45e3279000f993c7bfd18937fd7f426\n",
      "02/16/2022 01:12:34 - INFO - __main__ - Return hidden states from model: True\n",
      "02/16/2022 01:12:34 - INFO - __main__ - Using implementation from: RobertaForSequenceClassificationCustomAlternative\n",
      "02/16/2022 01:12:36 - WARNING - datasets.arrow_dataset - Loading cached processed dataset at C:\\Users\\Foka\\.cache\\huggingface\\datasets\\json\\default-aa408910693fa782\\0.0.0\\c2d554c3377ea79c7664b93dc65d0803b45e3279000f993c7bfd18937fd7f426\\cache-7c7dda0a4623bcbe.arrow\n",
      "02/16/2022 01:12:36 - INFO - datasets.arrow_dataset - Caching processed dataset at C:\\Users\\Foka\\.cache\\huggingface\\datasets\\json\\default-aa408910693fa782\\0.0.0\\c2d554c3377ea79c7664b93dc65d0803b45e3279000f993c7bfd18937fd7f426\\cache-eec123a569b1837d.arrow\n",
      "02/16/2022 01:12:36 - INFO - __main__ - *** Evaluate ***\n",
      "***** eval metrics *****\n",
      "  eval_accuracy           =        1.0\n",
      "  eval_loss               =     0.6472\n",
      "  eval_runtime            = 0:00:45.49\n",
      "  eval_samples            =        500\n",
      "  eval_samples_per_second =     10.991\n",
      "  eval_steps_per_second   =      0.462\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\n",
      " 48%|####7     | 10/21 [00:19<00:23,  2.17s/it]\n",
      " 52%|#####2    | 11/21 [00:21<00:21,  2.16s/it]\n",
      " 57%|#####7    | 12/21 [00:24<00:19,  2.18s/it]\n",
      " 62%|######1   | 13/21 [00:26<00:17,  2.18s/it]\n",
      " 67%|######6   | 14/21 [00:28<00:15,  2.18s/it]\n",
      " 71%|#######1  | 15/21 [00:30<00:13,  2.17s/it]\n",
      " 76%|#######6  | 16/21 [00:32<00:10,  2.18s/it]\n",
      " 81%|########  | 17/21 [00:34<00:08,  2.19s/it]\n",
      " 86%|########5 | 18/21 [00:37<00:06,  2.19s/it]\n",
      " 90%|######### | 19/21 [00:39<00:04,  2.19s/it]\n",
      " 95%|#########5| 20/21 [00:41<00:02,  2.17s/it]\n",
      "100%|##########| 21/21 [00:43<00:00,  2.06s/it]\n",
      "100%|##########| 21/21 [00:43<00:00,  2.06s/it]\n",
      "[INFO|modelcard.py:449] 2022-02-16 01:13:22,843 >> Dropping the following result as it does not have all the necessary fields:\n",
      "{'task': {'name': 'Text Classification', 'type': 'text-classification'}}\n"
     ]
    }
   ],
   "source": [
    "# Test run: score the checkpoint in out/tweet/roberta_version_4 on data/test.json.\n",
    "# The test split is supplied through --validation_file, which is the file --do_eval evaluates.\n",
    "# The custom-model flags are given once (they were previously repeated at the end of the command).\n",
    "!python run_glue.py \\\n",
    "--model_name_or_path out/tweet/roberta_version_4 \\\n",
    "--output_dir out/tweet/roberta_version_4-evaluation \\\n",
    "--return_hidden_states --custom_model \\\n",
    "--train_file data/train.json --validation_file data/test.json \\\n",
    "--do_eval \\\n",
    "--per_device_eval_batch_size 24 --max_seq_length 128"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# GPT2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "02/17/2022 17:25:29 - WARNING - __main__ - Process rank: -1, device: cpu, n_gpu: 0distributed training: False, 16-bits training: False\n",
      "02/17/2022 17:25:29 - INFO - __main__ - Training/evaluation parameters TrainingArguments(\n",
      "_n_gpu=0,\n",
      "adafactor=False,\n",
      "adam_beta1=0.9,\n",
      "adam_beta2=0.999,\n",
      "adam_epsilon=1e-08,\n",
      "dataloader_drop_last=False,\n",
      "dataloader_num_workers=0,\n",
      "dataloader_pin_memory=True,\n",
      "ddp_find_unused_parameters=None,\n",
      "debug=[],\n",
      "deepspeed=None,\n",
      "disable_tqdm=False,\n",
      "do_eval=True,\n",
      "do_predict=False,\n",
      "do_train=False,\n",
      "eval_accumulation_steps=None,\n",
      "eval_steps=None,\n",
      "evaluation_strategy=IntervalStrategy.NO,\n",
      "fp16=False,\n",
      "fp16_backend=auto,\n",
      "fp16_full_eval=False,\n",
      "fp16_opt_level=O1,\n",
      "gradient_accumulation_steps=1,\n",
      "gradient_checkpointing=False,\n",
      "greater_is_better=None,\n",
      "group_by_length=False,\n",
      "hub_model_id=None,\n",
      "hub_strategy=HubStrategy.EVERY_SAVE,\n",
      "hub_token=<HUB_TOKEN>,\n",
      "ignore_data_skip=False,\n",
      "label_names=None,\n",
      "label_smoothing_factor=0.0,\n",
      "learning_rate=5e-05,\n",
      "length_column_name=length,\n",
      "load_best_model_at_end=False,\n",
      "local_rank=-1,\n",
      "log_level=-1,\n",
      "log_level_replica=-1,\n",
      "log_on_each_node=True,\n",
      "logging_dir=out/tweet/gpt2_version_2-evaluation\\runs\\Feb17_17-25-29_DESKTOP-K706NKK,\n",
      "logging_first_step=False,\n",
      "logging_nan_inf_filter=True,\n",
      "logging_steps=500,\n",
      "logging_strategy=IntervalStrategy.STEPS,\n",
      "lr_scheduler_type=SchedulerType.LINEAR,\n",
      "max_grad_norm=1.0,\n",
      "max_steps=-1,\n",
      "metric_for_best_model=None,\n",
      "mp_parameters=,\n",
      "no_cuda=False,\n",
      "num_train_epochs=3.0,\n",
      "output_dir=out/tweet/gpt2_version_2-evaluation,\n",
      "overwrite_output_dir=False,\n",
      "past_index=-1,\n",
      "per_device_eval_batch_size=24,\n",
      "per_device_train_batch_size=8,\n",
      "prediction_loss_only=False,\n",
      "push_to_hub=False,\n",
      "push_to_hub_model_id=None,\n",
      "push_to_hub_organization=None,\n",
      "push_to_hub_token=<PUSH_TO_HUB_TOKEN>,\n",
      "remove_unused_columns=True,\n",
      "report_to=[],\n",
      "resume_from_checkpoint=None,\n",
      "run_name=out/tweet/gpt2_version_2-evaluation,\n",
      "save_on_each_node=False,\n",
      "save_steps=500,\n",
      "save_strategy=IntervalStrategy.STEPS,\n",
      "save_total_limit=None,\n",
      "seed=42,\n",
      "sharded_ddp=[],\n",
      "skip_memory_metrics=True,\n",
      "tpu_metrics_debug=False,\n",
      "tpu_num_cores=None,\n",
      "use_legacy_prediction_loop=False,\n",
      "warmup_ratio=0.0,\n",
      "warmup_steps=0,\n",
      "weight_decay=0.0,\n",
      "xpu_backend=None,\n",
      ")\n",
      "02/17/2022 17:25:29 - INFO - __main__ - load a local file for train: data/train.json\n",
      "02/17/2022 17:25:29 - INFO - __main__ - load a local file for validation: data/valid.json\n",
      "02/17/2022 17:25:29 - WARNING - datasets.builder - Using custom data configuration default-f2672b914d9c5a33\n",
      "02/17/2022 17:25:29 - INFO - datasets.builder - Overwrite dataset info from restored data version.\n",
      "02/17/2022 17:25:29 - INFO - datasets.info - Loading Dataset info from C:\\Users\\Foka\\.cache\\huggingface\\datasets\\json\\default-f2672b914d9c5a33\\0.0.0\\c2d554c3377ea79c7664b93dc65d0803b45e3279000f993c7bfd18937fd7f426\n",
      "02/17/2022 17:25:29 - WARNING - datasets.builder - Reusing dataset json (C:\\Users\\Foka\\.cache\\huggingface\\datasets\\json\\default-f2672b914d9c5a33\\0.0.0\\c2d554c3377ea79c7664b93dc65d0803b45e3279000f993c7bfd18937fd7f426)\n",
      "02/17/2022 17:25:29 - INFO - datasets.info - Loading Dataset info from C:\\Users\\Foka\\.cache\\huggingface\\datasets\\json\\default-f2672b914d9c5a33\\0.0.0\\c2d554c3377ea79c7664b93dc65d0803b45e3279000f993c7bfd18937fd7f426\n",
      "02/17/2022 17:25:29 - INFO - __main__ - Return hidden states from model: True\n",
      "02/17/2022 17:25:29 - INFO - __main__ - Using implementation from: GPT2ForSequenceClassificationCustom\n",
      "02/17/2022 17:25:31 - INFO - datasets.arrow_dataset - Caching processed dataset at C:\\Users\\Foka\\.cache\\huggingface\\datasets\\json\\default-f2672b914d9c5a33\\0.0.0\\c2d554c3377ea79c7664b93dc65d0803b45e3279000f993c7bfd18937fd7f426\\cache-212f78cac2ca92a1.arrow\n",
      "02/17/2022 17:25:31 - INFO - datasets.arrow_dataset - Caching processed dataset at C:\\Users\\Foka\\.cache\\huggingface\\datasets\\json\\default-f2672b914d9c5a33\\0.0.0\\c2d554c3377ea79c7664b93dc65d0803b45e3279000f993c7bfd18937fd7f426\\cache-95c22eb06b0faad8.arrow\n",
      "02/17/2022 17:25:32 - INFO - __main__ - *** Evaluate ***\n",
      "***** eval metrics *****\n",
      "  eval_accuracy           =      0.938\n",
      "  eval_loss               =     0.4886\n",
      "  eval_runtime            = 0:01:01.53\n",
      "  eval_samples            =        500\n",
      "  eval_samples_per_second =      8.126\n",
      "  eval_steps_per_second   =      0.341\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\n",
      "  0%|          | 0/2 [00:00<?, ?it/s]\n",
      "100%|##########| 2/2 [00:00<00:00, 2018.43it/s]\n",
      "[INFO|configuration_utils.py:586] 2022-02-17 17:25:29,863 >> loading configuration file out/tweet/gpt2_version_2\\config.json\n",
      "[INFO|configuration_utils.py:625] 2022-02-17 17:25:29,864 >> Model config GPT2Config {\n",
      "  \"_name_or_path\": \"gpt2\",\n",
      "  \"activation_function\": \"gelu_new\",\n",
      "  \"architectures\": [\n",
      "    \"GPT2ForSequenceClassification\"\n",
      "  ],\n",
      "  \"attn_pdrop\": 0.1,\n",
      "  \"bos_token_id\": 50256,\n",
      "  \"embd_pdrop\": 0.1,\n",
      "  \"eos_token_id\": 50256,\n",
      "  \"id2label\": {\n",
      "    \"0\": 0,\n",
      "    \"1\": 1\n",
      "  },\n",
      "  \"initializer_range\": 0.02,\n",
      "  \"label2id\": {\n",
      "    \"0\": 0,\n",
      "    \"1\": 1\n",
      "  },\n",
      "  \"layer_norm_epsilon\": 1e-05,\n",
      "  \"model_type\": \"gpt2\",\n",
      "  \"n_ctx\": 1024,\n",
      "  \"n_embd\": 768,\n",
      "  \"n_head\": 12,\n",
      "  \"n_inner\": null,\n",
      "  \"n_layer\": 12,\n",
      "  \"n_positions\": 1024,\n",
      "  \"pad_token_id\": 50256,\n",
      "  \"reorder_and_upcast_attn\": false,\n",
      "  \"resid_pdrop\": 0.1,\n",
      "  \"scale_attn_by_inverse_layer_idx\": false,\n",
      "  \"scale_attn_weights\": true,\n",
      "  \"summary_activation\": null,\n",
      "  \"summary_first_dropout\": 0.1,\n",
      "  \"summary_proj_to_labels\": true,\n",
      "  \"summary_type\": \"cls_index\",\n",
      "  \"summary_use_proj\": true,\n",
      "  \"task_specific_params\": {\n",
      "    \"text-generation\": {\n",
      "      \"do_sample\": true,\n",
      "      \"max_length\": 50\n",
      "    }\n",
      "  },\n",
      "  \"torch_dtype\": \"float32\",\n",
      "  \"transformers_version\": \"4.12.5\",\n",
      "  \"use_cache\": true,\n",
      "  \"vocab_size\": 50257\n",
      "}\n",
      "\n",
      "[INFO|tokenization_utils_base.py:1671] 2022-02-17 17:25:29,868 >> Didn't find file out/tweet/gpt2_version_2\\added_tokens.json. We won't load it.\n",
      "[INFO|tokenization_utils_base.py:1740] 2022-02-17 17:25:29,869 >> loading file out/tweet/gpt2_version_2\\vocab.json\n",
      "[INFO|tokenization_utils_base.py:1740] 2022-02-17 17:25:29,869 >> loading file out/tweet/gpt2_version_2\\merges.txt\n",
      "[INFO|tokenization_utils_base.py:1740] 2022-02-17 17:25:29,869 >> loading file out/tweet/gpt2_version_2\\tokenizer.json\n",
      "[INFO|tokenization_utils_base.py:1740] 2022-02-17 17:25:29,869 >> loading file None\n",
      "[INFO|tokenization_utils_base.py:1740] 2022-02-17 17:25:29,869 >> loading file out/tweet/gpt2_version_2\\special_tokens_map.json\n",
      "[INFO|tokenization_utils_base.py:1740] 2022-02-17 17:25:29,869 >> loading file out/tweet/gpt2_version_2\\tokenizer_config.json\n",
      "[INFO|modeling_utils.py:1349] 2022-02-17 17:25:29,927 >> loading weights file out/tweet/gpt2_version_2\\pytorch_model.bin\n",
      "[WARNING|modeling_utils.py:1609] 2022-02-17 17:25:31,677 >> Some weights of the model checkpoint at out/tweet/gpt2_version_2 were not used when initializing GPT2ForSequenceClassificationCustom: ['score.weight']\n",
      "- This IS expected if you are initializing GPT2ForSequenceClassificationCustom from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).\n",
      "- This IS NOT expected if you are initializing GPT2ForSequenceClassificationCustom from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n",
      "[WARNING|modeling_utils.py:1620] 2022-02-17 17:25:31,677 >> Some weights of GPT2ForSequenceClassificationCustom were not initialized from the model checkpoint at out/tweet/gpt2_version_2 and are newly initialized: ['score.out_proj.weight', 'score.dense_1_input.bias', 'score.dense_1_hidden.bias', 'score.dense_2.weight', 'score.dense_2.bias', 'score.dense_1_hidden.weight', 'score.dense_1_input.weight']\n",
      "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n",
      "\n",
      "Running tokenizer on dataset:   0%|          | 0/5 [00:00<?, ?ba/s]\n",
      "Running tokenizer on dataset:  40%|####      | 2/5 [00:00<00:00, 18.16ba/s]\n",
      "Running tokenizer on dataset: 100%|##########| 5/5 [00:00<00:00, 25.52ba/s]\n",
      "\n",
      "Running tokenizer on dataset:   0%|          | 0/1 [00:00<?, ?ba/s]\n",
      "Running tokenizer on dataset: 100%|##########| 1/1 [00:00<00:00, 58.98ba/s]\n",
      "[INFO|trainer.py:540] 2022-02-17 17:25:32,736 >> The following columns in the evaluation set  don't have a corresponding argument in `GPT2ForSequenceClassificationCustom.forward` and have been ignored: tweet.\n",
      "[INFO|trainer.py:2243] 2022-02-17 17:25:32,737 >> ***** Running Evaluation *****\n",
      "[INFO|trainer.py:2245] 2022-02-17 17:25:32,737 >>   Num examples = 500\n",
      "[INFO|trainer.py:2248] 2022-02-17 17:25:32,737 >>   Batch size = 24\n",
      "\n",
      "  0%|          | 0/21 [00:00<?, ?it/s]\n",
      " 10%|9         | 2/21 [00:02<00:28,  1.48s/it]\n",
      " 14%|#4        | 3/21 [00:05<00:37,  2.08s/it]\n",
      " 19%|#9        | 4/21 [00:08<00:40,  2.40s/it]\n",
      " 24%|##3       | 5/21 [00:11<00:41,  2.60s/it]\n",
      " 29%|##8       | 6/21 [00:14<00:40,  2.70s/it]\n",
      " 33%|###3      | 7/21 [00:17<00:38,  2.77s/it]\n",
      " 38%|###8      | 8/21 [00:20<00:36,  2.81s/it]\n",
      " 43%|####2     | 9/21 [00:23<00:34,  2.86s/it]\n",
      " 48%|####7     | 10/21 [00:26<00:31,  2.89s/it]\n",
      " 52%|#####2    | 11/21 [00:29<00:29,  2.90s/it]\n",
      " 57%|#####7    | 12/21 [00:32<00:26,  2.91s/it]\n",
      " 62%|######1   | 13/21 [00:35<00:23,  2.94s/it]\n",
      " 67%|######6   | 14/21 [00:38<00:20,  2.96s/it]\n",
      " 71%|#######1  | 15/21 [00:41<00:17,  2.97s/it]\n",
      " 76%|#######6  | 16/21 [00:44<00:14,  2.95s/it]\n",
      " 81%|########  | 17/21 [00:47<00:11,  2.95s/it]\n",
      " 86%|########5 | 18/21 [00:50<00:08,  2.96s/it]\n",
      " 90%|######### | 19/21 [00:53<00:05,  2.96s/it]\n",
      " 95%|#########5| 20/21 [00:56<00:02,  2.97s/it]\n",
      "100%|##########| 21/21 [00:58<00:00,  2.83s/it]\n",
      "100%|##########| 21/21 [00:58<00:00,  2.79s/it]\n",
      "[INFO|modelcard.py:449] 2022-02-17 17:26:34,864 >> Dropping the following result as it does not have all the necessary fields:\n",
      "{'task': {'name': 'Text Classification', 'type': 'text-classification'}}\n"
     ]
    }
   ],
   "source": [
    "# Validation run: score the checkpoint in out/tweet/gpt2_version_2 on data/valid.json.\n",
    "# Only --do_eval is passed, so no training is requested; metrics go to the -evaluation output dir.\n",
    "# NOTE(review): the saved stderr of this run warns that the checkpoint's 'score.weight' was not used\n",
    "# and that the score.* layers of GPT2ForSequenceClassificationCustom were newly initialized, so the\n",
    "# reported metrics may come from an untrained head. Confirm that --custom_model matches how\n",
    "# gpt2_version_2 was trained before relying on these numbers.\n",
    "# The custom-model flags are given once (they were previously repeated at the end of the command).\n",
    "!python run_glue.py \\\n",
    "--model_name_or_path out/tweet/gpt2_version_2 \\\n",
    "--output_dir out/tweet/gpt2_version_2-evaluation \\\n",
    "--return_hidden_states --custom_model \\\n",
    "--train_file data/train.json --validation_file data/valid.json \\\n",
    "--do_eval \\\n",
    "--per_device_eval_batch_size 24 --max_seq_length 128"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# T5"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "02/17/2022 17:36:52 - WARNING - __main__ - Process rank: -1, device: cpu, n_gpu: 0distributed training: False, 16-bits training: False\n",
      "02/17/2022 17:36:52 - INFO - __main__ - Training/evaluation parameters Seq2SeqTrainingArguments(\n",
      "_n_gpu=0,\n",
      "adafactor=False,\n",
      "adam_beta1=0.9,\n",
      "adam_beta2=0.999,\n",
      "adam_epsilon=1e-08,\n",
      "dataloader_drop_last=False,\n",
      "dataloader_num_workers=0,\n",
      "dataloader_pin_memory=True,\n",
      "ddp_find_unused_parameters=None,\n",
      "debug=[],\n",
      "deepspeed=None,\n",
      "disable_tqdm=False,\n",
      "do_eval=True,\n",
      "do_predict=False,\n",
      "do_train=False,\n",
      "eval_accumulation_steps=None,\n",
      "eval_steps=None,\n",
      "evaluation_strategy=IntervalStrategy.NO,\n",
      "fp16=False,\n",
      "fp16_backend=auto,\n",
      "fp16_full_eval=False,\n",
      "fp16_opt_level=O1,\n",
      "generation_max_length=None,\n",
      "generation_num_beams=None,\n",
      "gradient_accumulation_steps=1,\n",
      "gradient_checkpointing=False,\n",
      "greater_is_better=None,\n",
      "group_by_length=False,\n",
      "hub_model_id=None,\n",
      "hub_strategy=HubStrategy.EVERY_SAVE,\n",
      "hub_token=<HUB_TOKEN>,\n",
      "ignore_data_skip=False,\n",
      "label_names=None,\n",
      "label_smoothing_factor=0.0,\n",
      "learning_rate=5e-05,\n",
      "length_column_name=length,\n",
      "load_best_model_at_end=False,\n",
      "local_rank=-1,\n",
      "log_level=-1,\n",
      "log_level_replica=-1,\n",
      "log_on_each_node=True,\n",
      "logging_dir=out/tweet/t5-evaluation\\runs\\Feb17_17-36-52_DESKTOP-K706NKK,\n",
      "logging_first_step=False,\n",
      "logging_nan_inf_filter=True,\n",
      "logging_steps=500,\n",
      "logging_strategy=IntervalStrategy.STEPS,\n",
      "lr_scheduler_type=SchedulerType.LINEAR,\n",
      "max_grad_norm=1.0,\n",
      "max_steps=-1,\n",
      "metric_for_best_model=None,\n",
      "mp_parameters=,\n",
      "no_cuda=False,\n",
      "num_train_epochs=3.0,\n",
      "output_dir=out/tweet/t5-evaluation,\n",
      "overwrite_output_dir=False,\n",
      "past_index=-1,\n",
      "per_device_eval_batch_size=16,\n",
      "per_device_train_batch_size=8,\n",
      "predict_with_generate=True,\n",
      "prediction_loss_only=False,\n",
      "push_to_hub=False,\n",
      "push_to_hub_model_id=None,\n",
      "push_to_hub_organization=None,\n",
      "push_to_hub_token=<PUSH_TO_HUB_TOKEN>,\n",
      "remove_unused_columns=True,\n",
      "report_to=[],\n",
      "resume_from_checkpoint=None,\n",
      "run_name=out/tweet/t5-evaluation,\n",
      "save_on_each_node=False,\n",
      "save_steps=500,\n",
      "save_strategy=IntervalStrategy.STEPS,\n",
      "save_total_limit=None,\n",
      "seed=42,\n",
      "sharded_ddp=[],\n",
      "skip_memory_metrics=True,\n",
      "sortish_sampler=False,\n",
      "tpu_metrics_debug=False,\n",
      "tpu_num_cores=None,\n",
      "use_legacy_prediction_loop=False,\n",
      "warmup_ratio=0.0,\n",
      "warmup_steps=0,\n",
      "weight_decay=0.0,\n",
      "xpu_backend=None,\n",
      ")\n",
      "02/17/2022 17:36:52 - WARNING - datasets.builder - Using custom data configuration default-6d5bc754bbaa91d7\n",
      "02/17/2022 17:36:52 - INFO - datasets.builder - Overwrite dataset info from restored data version.\n",
      "02/17/2022 17:36:52 - INFO - datasets.info - Loading Dataset info from C:\\Users\\Foka\\.cache\\huggingface\\datasets\\json\\default-6d5bc754bbaa91d7\\0.0.0\\c2d554c3377ea79c7664b93dc65d0803b45e3279000f993c7bfd18937fd7f426\n",
      "02/17/2022 17:36:52 - WARNING - datasets.builder - Reusing dataset json (C:\\Users\\Foka\\.cache\\huggingface\\datasets\\json\\default-6d5bc754bbaa91d7\\0.0.0\\c2d554c3377ea79c7664b93dc65d0803b45e3279000f993c7bfd18937fd7f426)\n",
      "02/17/2022 17:36:52 - INFO - datasets.info - Loading Dataset info from C:\\Users\\Foka\\.cache\\huggingface\\datasets\\json\\default-6d5bc754bbaa91d7\\0.0.0\\c2d554c3377ea79c7664b93dc65d0803b45e3279000f993c7bfd18937fd7f426\n",
      "02/17/2022 17:36:53 - INFO - __main__ - Using translation prefix: \"tweet classification: \"\n",
      "02/17/2022 17:36:53 - INFO - datasets.arrow_dataset - Caching processed dataset at C:\\Users\\Foka\\.cache\\huggingface\\datasets\\json\\default-6d5bc754bbaa91d7\\0.0.0\\c2d554c3377ea79c7664b93dc65d0803b45e3279000f993c7bfd18937fd7f426\\cache-96f3d337ad66e082.arrow\n",
      "02/17/2022 17:36:55 - INFO - __main__ - *** Evaluate ***\n",
      "02/17/2022 17:37:09 - INFO - datasets.metric - Removing C:\\Users\\Foka\\.cache\\huggingface\\metrics\\accuracy\\default\\default_experiment-1-0.arrow\n",
      "02/17/2022 17:37:09 - INFO - datasets.metric - Removing C:\\Users\\Foka\\.cache\\huggingface\\metrics\\sacrebleu\\default\\default_experiment-1-0.arrow\n",
      "***** eval metrics *****\n",
      "  eval_accuracy           =        1.0\n",
      "  eval_bleu               =        0.0\n",
      "  eval_gen_len            =      2.272\n",
      "  eval_loss               =     0.5538\n",
      "  eval_runtime            = 0:00:14.42\n",
      "  eval_samples            =        500\n",
      "  eval_samples_per_second =     34.659\n",
      "  eval_steps_per_second   =      2.218\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\n",
      "  0%|          | 0/2 [00:00<?, ?it/s]\n",
      "100%|##########| 2/2 [00:00<00:00, 2020.86it/s]\n",
      "[INFO|configuration_utils.py:586] 2022-02-17 17:36:52,675 >> loading configuration file out/tweet/t5_version_2\\config.json\n",
      "[INFO|configuration_utils.py:625] 2022-02-17 17:36:52,677 >> Model config T5Config {\n",
      "  \"_name_or_path\": \"t5-small\",\n",
      "  \"architectures\": [\n",
      "    \"T5ForConditionalGeneration\"\n",
      "  ],\n",
      "  \"d_ff\": 2048,\n",
      "  \"d_kv\": 64,\n",
      "  \"d_model\": 512,\n",
      "  \"decoder_start_token_id\": 0,\n",
      "  \"dropout_rate\": 0.1,\n",
      "  \"eos_token_id\": 1,\n",
      "  \"feed_forward_proj\": \"relu\",\n",
      "  \"initializer_factor\": 1.0,\n",
      "  \"is_encoder_decoder\": true,\n",
      "  \"layer_norm_epsilon\": 1e-06,\n",
      "  \"model_type\": \"t5\",\n",
      "  \"n_positions\": 512,\n",
      "  \"num_decoder_layers\": 6,\n",
      "  \"num_heads\": 8,\n",
      "  \"num_layers\": 6,\n",
      "  \"output_past\": true,\n",
      "  \"pad_token_id\": 0,\n",
      "  \"relative_attention_num_buckets\": 32,\n",
      "  \"task_specific_params\": {\n",
      "    \"summarization\": {\n",
      "      \"early_stopping\": true,\n",
      "      \"length_penalty\": 2.0,\n",
      "      \"max_length\": 200,\n",
      "      \"min_length\": 30,\n",
      "      \"no_repeat_ngram_size\": 3,\n",
      "      \"num_beams\": 4,\n",
      "      \"prefix\": \"summarize: \"\n",
      "    },\n",
      "    \"translation_en_to_de\": {\n",
      "      \"early_stopping\": true,\n",
      "      \"max_length\": 300,\n",
      "      \"num_beams\": 4,\n",
      "      \"prefix\": \"translate English to German: \"\n",
      "    },\n",
      "    \"translation_en_to_fr\": {\n",
      "      \"early_stopping\": true,\n",
      "      \"max_length\": 300,\n",
      "      \"num_beams\": 4,\n",
      "      \"prefix\": \"translate English to French: \"\n",
      "    },\n",
      "    \"translation_en_to_ro\": {\n",
      "      \"early_stopping\": true,\n",
      "      \"max_length\": 300,\n",
      "      \"num_beams\": 4,\n",
      "      \"prefix\": \"translate English to Romanian: \"\n",
      "    }\n",
      "  },\n",
      "  \"torch_dtype\": \"float32\",\n",
      "  \"transformers_version\": \"4.12.5\",\n",
      "  \"use_cache\": true,\n",
      "  \"vocab_size\": 32100\n",
      "}\n",
      "\n",
      "[INFO|tokenization_utils_base.py:1671] 2022-02-17 17:36:52,677 >> Didn't find file out/tweet/t5_version_2\\added_tokens.json. We won't load it.\n",
      "[INFO|tokenization_utils_base.py:1740] 2022-02-17 17:36:52,677 >> loading file out/tweet/t5_version_2\\spiece.model\n",
      "[INFO|tokenization_utils_base.py:1740] 2022-02-17 17:36:52,677 >> loading file out/tweet/t5_version_2\\tokenizer.json\n",
      "[INFO|tokenization_utils_base.py:1740] 2022-02-17 17:36:52,677 >> loading file None\n",
      "[INFO|tokenization_utils_base.py:1740] 2022-02-17 17:36:52,677 >> loading file out/tweet/t5_version_2\\special_tokens_map.json\n",
      "[INFO|tokenization_utils_base.py:1740] 2022-02-17 17:36:52,677 >> loading file out/tweet/t5_version_2\\tokenizer_config.json\n",
      "[INFO|modeling_utils.py:1349] 2022-02-17 17:36:52,771 >> loading weights file out/tweet/t5_version_2\\pytorch_model.bin\n",
      "[INFO|modeling_utils.py:1618] 2022-02-17 17:36:53,190 >> All model checkpoint weights were used when initializing T5ForConditionalGeneration.\n",
      "\n",
      "[INFO|modeling_utils.py:1626] 2022-02-17 17:36:53,190 >> All the weights of T5ForConditionalGeneration were initialized from the model checkpoint at out/tweet/t5_version_2.\n",
      "If your task is similar to the task the model of the checkpoint was trained on, you can already use T5ForConditionalGeneration for predictions without further training.\n",
      "\n",
      "Running tokenizer on validation dataset:   0%|          | 0/1 [00:00<?, ?ba/s]\n",
      "Running tokenizer on validation dataset: 100%|##########| 1/1 [00:00<00:00, 34.57ba/s]\n",
      "[INFO|trainer.py:2243] 2022-02-17 17:36:55,016 >> ***** Running Evaluation *****\n",
      "[INFO|trainer.py:2245] 2022-02-17 17:36:55,016 >>   Num examples = 500\n",
      "[INFO|trainer.py:2248] 2022-02-17 17:36:55,016 >>   Batch size = 16\n",
      "\n",
      "  0%|          | 0/32 [00:00<?, ?it/s]\n",
      "  6%|6         | 2/32 [00:00<00:06,  4.31it/s]\n",
      "  9%|9         | 3/32 [00:01<00:10,  2.75it/s]\n",
      " 12%|#2        | 4/32 [00:01<00:10,  2.57it/s]\n",
      " 16%|#5        | 5/32 [00:01<00:11,  2.43it/s]\n",
      " 19%|#8        | 6/32 [00:02<00:11,  2.32it/s]\n",
      " 22%|##1       | 7/32 [00:02<00:11,  2.14it/s]\n",
      " 25%|##5       | 8/32 [00:03<00:11,  2.11it/s]\n",
      " 28%|##8       | 9/32 [00:03<00:10,  2.12it/s]\n",
      " 31%|###1      | 10/32 [00:04<00:09,  2.20it/s]\n",
      " 34%|###4      | 11/32 [00:04<00:09,  2.33it/s]\n",
      " 38%|###7      | 12/32 [00:05<00:08,  2.26it/s]\n",
      " 41%|####      | 13/32 [00:05<00:08,  2.23it/s]\n",
      " 44%|####3     | 14/32 [00:06<00:08,  2.23it/s]\n",
      " 47%|####6     | 15/32 [00:06<00:07,  2.26it/s]\n",
      " 50%|#####     | 16/32 [00:06<00:07,  2.25it/s]\n",
      " 53%|#####3    | 17/32 [00:07<00:07,  2.09it/s]\n",
      " 56%|#####6    | 18/32 [00:07<00:06,  2.15it/s]\n",
      " 59%|#####9    | 19/32 [00:08<00:05,  2.21it/s]\n",
      " 62%|######2   | 20/32 [00:08<00:05,  2.26it/s]\n",
      " 66%|######5   | 21/32 [00:09<00:05,  2.16it/s]\n",
      " 69%|######8   | 22/32 [00:09<00:04,  2.05it/s]\n",
      " 72%|#######1  | 23/32 [00:10<00:04,  2.14it/s]\n",
      " 75%|#######5  | 24/32 [00:10<00:03,  2.11it/s]\n",
      " 78%|#######8  | 25/32 [00:11<00:03,  2.23it/s]\n",
      " 81%|########1 | 26/32 [00:11<00:02,  2.14it/s]\n",
      " 84%|########4 | 27/32 [00:12<00:02,  2.25it/s]\n",
      " 88%|########7 | 28/32 [00:12<00:01,  2.12it/s]\n",
      " 91%|######### | 29/32 [00:12<00:01,  2.23it/s]\n",
      " 94%|#########3| 30/32 [00:13<00:00,  2.27it/s]\n",
      " 97%|#########6| 31/32 [00:13<00:00,  2.34it/s]\n",
      "100%|##########| 32/32 [00:13<00:00,  2.98it/s]\n",
      "100%|##########| 32/32 [00:13<00:00,  2.30it/s]\n",
      "[INFO|modelcard.py:449] 2022-02-17 17:37:10,066 >> Dropping the following result as it does not have all the necessary fields:\n",
      "{'task': {'name': 'Translation', 'type': 'translation'}}\n"
     ]
    }
   ],
   "source": [
    "# Evaluate only (--do_eval, no --do_train): score the fine-tuned T5 checkpoint on the test split\n",
    "!python run_translation.py \\\n",
    "--model_name_or_path out/tweet/t5_version_2 \\\n",
    "--output_dir out/tweet/t5-evaluation \\\n",
    "--train_file data/translations-train.json \\\n",
    "--validation_file data/translations-test.json \\\n",
    "--do_eval \\\n",
    "--per_device_eval_batch_size 16 \\\n",
    "--source_lang text \\\n",
    "--target_lang label \\\n",
    "--source_prefix \"tweet classification\" \\\n",
    "--max_source_length 256 \\\n",
    "--max_target_length 128 \\\n",
    "--predict_with_generate"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.9"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 1
}