challenging-america-word-ga.../nn_trigram.ipynb

{
"cells": [
{
"cell_type": "code",
"execution_count": 76,
"metadata": {},
"outputs": [],
"source": [
"import torch\n",
"import lzma\n",
"from itertools import islice\n",
"import re\n",
"import sys\n",
"from torchtext.vocab import build_vocab_from_iterator\n",
"from torch import nn\n",
"from torch.utils.data import IterableDataset, DataLoader\n",
"import itertools\n",
"import matplotlib.pyplot as plt"
]
},
{
"cell_type": "code",
"execution_count": 77,
"metadata": {},
"outputs": [],
"source": [
"VOCAB_SIZE = 10_000\n",
"EMBED_SIZE = 400"
]
},
{
"cell_type": "code",
"execution_count": 78,
"metadata": {},
"outputs": [
{
"ename": "KeyboardInterrupt",
"evalue": "",
"output_type": "error",
"traceback": [
"\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[1;31mKeyboardInterrupt\u001b[0m Traceback (most recent call last)",
"\u001b[1;32md:\\studia\\challenging-america-word-gap-prediction\\nn_trigram.ipynb Cell 3\u001b[0m in \u001b[0;36m<cell line: 17>\u001b[1;34m()\u001b[0m\n\u001b[0;32m <a href='vscode-notebook-cell:/d%3A/studia/challenging-america-word-gap-prediction/nn_trigram.ipynb#W2sZmlsZQ%3D%3D?line=12'>13</a>\u001b[0m \u001b[39myield\u001b[39;00m get_words_from_line(line)\n\u001b[0;32m <a href='vscode-notebook-cell:/d%3A/studia/challenging-america-word-gap-prediction/nn_trigram.ipynb#W2sZmlsZQ%3D%3D?line=14'>15</a>\u001b[0m vocab_size \u001b[39m=\u001b[39m \u001b[39m1_000\u001b[39m\n\u001b[1;32m---> <a href='vscode-notebook-cell:/d%3A/studia/challenging-america-word-gap-prediction/nn_trigram.ipynb#W2sZmlsZQ%3D%3D?line=16'>17</a>\u001b[0m vocab \u001b[39m=\u001b[39m build_vocab_from_iterator(\n\u001b[0;32m <a href='vscode-notebook-cell:/d%3A/studia/challenging-america-word-gap-prediction/nn_trigram.ipynb#W2sZmlsZQ%3D%3D?line=17'>18</a>\u001b[0m get_word_lines_from_file(\u001b[39m\"\u001b[39;49m\u001b[39mtrain/in.tsv.xz\u001b[39;49m\u001b[39m\"\u001b[39;49m),\n\u001b[0;32m <a href='vscode-notebook-cell:/d%3A/studia/challenging-america-word-gap-prediction/nn_trigram.ipynb#W2sZmlsZQ%3D%3D?line=18'>19</a>\u001b[0m max_tokens \u001b[39m=\u001b[39;49m VOCAB_SIZE,\n\u001b[0;32m <a href='vscode-notebook-cell:/d%3A/studia/challenging-america-word-gap-prediction/nn_trigram.ipynb#W2sZmlsZQ%3D%3D?line=19'>20</a>\u001b[0m specials \u001b[39m=\u001b[39;49m [\u001b[39m'\u001b[39;49m\u001b[39m<unk>\u001b[39;49m\u001b[39m'\u001b[39;49m])\n",
"File \u001b[1;32mc:\\PROGRAMY\\Anaconda3\\envs\\modelowanie-jezyka\\lib\\site-packages\\torchtext\\vocab\\vocab_factory.py:98\u001b[0m, in \u001b[0;36mbuild_vocab_from_iterator\u001b[1;34m(iterator, min_freq, specials, special_first, max_tokens)\u001b[0m\n\u001b[0;32m 72\u001b[0m \u001b[39m\u001b[39m\u001b[39m\"\"\"\u001b[39;00m\n\u001b[0;32m 73\u001b[0m \u001b[39mBuild a Vocab from an iterator.\u001b[39;00m\n\u001b[0;32m 74\u001b[0m \n\u001b[1;32m (...)\u001b[0m\n\u001b[0;32m 94\u001b[0m \u001b[39m >>> vocab = build_vocab_from_iterator(yield_tokens(file_path), specials=[\"<unk>\"])\u001b[39;00m\n\u001b[0;32m 95\u001b[0m \u001b[39m\"\"\"\u001b[39;00m\n\u001b[0;32m 97\u001b[0m counter \u001b[39m=\u001b[39m Counter()\n\u001b[1;32m---> 98\u001b[0m \u001b[39mfor\u001b[39;00m tokens \u001b[39min\u001b[39;00m iterator:\n\u001b[0;32m 99\u001b[0m counter\u001b[39m.\u001b[39mupdate(tokens)\n\u001b[0;32m 101\u001b[0m specials \u001b[39m=\u001b[39m specials \u001b[39mor\u001b[39;00m []\n",
"\u001b[1;32md:\\studia\\challenging-america-word-gap-prediction\\nn_trigram.ipynb Cell 3\u001b[0m in \u001b[0;36mget_word_lines_from_file\u001b[1;34m(file_name)\u001b[0m\n\u001b[0;32m <a href='vscode-notebook-cell:/d%3A/studia/challenging-america-word-gap-prediction/nn_trigram.ipynb#W2sZmlsZQ%3D%3D?line=9'>10</a>\u001b[0m \u001b[39mdef\u001b[39;00m \u001b[39mget_word_lines_from_file\u001b[39m(file_name):\n\u001b[0;32m <a href='vscode-notebook-cell:/d%3A/studia/challenging-america-word-gap-prediction/nn_trigram.ipynb#W2sZmlsZQ%3D%3D?line=10'>11</a>\u001b[0m \u001b[39mwith\u001b[39;00m lzma\u001b[39m.\u001b[39mopen(file_name, encoding\u001b[39m=\u001b[39m\u001b[39m'\u001b[39m\u001b[39mutf8\u001b[39m\u001b[39m'\u001b[39m, mode\u001b[39m=\u001b[39m\u001b[39m\"\u001b[39m\u001b[39mrt\u001b[39m\u001b[39m\"\u001b[39m) \u001b[39mas\u001b[39;00m fh:\n\u001b[1;32m---> <a href='vscode-notebook-cell:/d%3A/studia/challenging-america-word-gap-prediction/nn_trigram.ipynb#W2sZmlsZQ%3D%3D?line=11'>12</a>\u001b[0m \u001b[39mfor\u001b[39;00m line \u001b[39min\u001b[39;00m fh:\n\u001b[0;32m <a href='vscode-notebook-cell:/d%3A/studia/challenging-america-word-gap-prediction/nn_trigram.ipynb#W2sZmlsZQ%3D%3D?line=12'>13</a>\u001b[0m \u001b[39myield\u001b[39;00m get_words_from_line(line)\n",
"File \u001b[1;32mc:\\PROGRAMY\\Anaconda3\\envs\\modelowanie-jezyka\\lib\\lzma.py:212\u001b[0m, in \u001b[0;36mLZMAFile.read1\u001b[1;34m(self, size)\u001b[0m\n\u001b[0;32m 210\u001b[0m \u001b[39mif\u001b[39;00m size \u001b[39m<\u001b[39m \u001b[39m0\u001b[39m:\n\u001b[0;32m 211\u001b[0m size \u001b[39m=\u001b[39m io\u001b[39m.\u001b[39mDEFAULT_BUFFER_SIZE\n\u001b[1;32m--> 212\u001b[0m \u001b[39mreturn\u001b[39;00m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49m_buffer\u001b[39m.\u001b[39;49mread1(size)\n",
"File \u001b[1;32mc:\\PROGRAMY\\Anaconda3\\envs\\modelowanie-jezyka\\lib\\_compression.py:68\u001b[0m, in \u001b[0;36mDecompressReader.readinto\u001b[1;34m(self, b)\u001b[0m\n\u001b[0;32m 66\u001b[0m \u001b[39mdef\u001b[39;00m \u001b[39mreadinto\u001b[39m(\u001b[39mself\u001b[39m, b):\n\u001b[0;32m 67\u001b[0m \u001b[39mwith\u001b[39;00m \u001b[39mmemoryview\u001b[39m(b) \u001b[39mas\u001b[39;00m view, view\u001b[39m.\u001b[39mcast(\u001b[39m\"\u001b[39m\u001b[39mB\u001b[39m\u001b[39m\"\u001b[39m) \u001b[39mas\u001b[39;00m byte_view:\n\u001b[1;32m---> 68\u001b[0m data \u001b[39m=\u001b[39m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49mread(\u001b[39mlen\u001b[39;49m(byte_view))\n\u001b[0;32m 69\u001b[0m byte_view[:\u001b[39mlen\u001b[39m(data)] \u001b[39m=\u001b[39m data\n\u001b[0;32m 70\u001b[0m \u001b[39mreturn\u001b[39;00m \u001b[39mlen\u001b[39m(data)\n",
"File \u001b[1;32mc:\\PROGRAMY\\Anaconda3\\envs\\modelowanie-jezyka\\lib\\_compression.py:103\u001b[0m, in \u001b[0;36mDecompressReader.read\u001b[1;34m(self, size)\u001b[0m\n\u001b[0;32m 101\u001b[0m \u001b[39melse\u001b[39;00m:\n\u001b[0;32m 102\u001b[0m rawblock \u001b[39m=\u001b[39m \u001b[39mb\u001b[39m\u001b[39m\"\u001b[39m\u001b[39m\"\u001b[39m\n\u001b[1;32m--> 103\u001b[0m data \u001b[39m=\u001b[39m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49m_decompressor\u001b[39m.\u001b[39;49mdecompress(rawblock, size)\n\u001b[0;32m 104\u001b[0m \u001b[39mif\u001b[39;00m data:\n\u001b[0;32m 105\u001b[0m \u001b[39mbreak\u001b[39;00m\n",
"\u001b[1;31mKeyboardInterrupt\u001b[0m: "
]
}
],
"source": [
"def get_words_from_line(line):\n",
" line = line.rstrip()\n",
" line = line.split(\"\\t\")\n",
" text = line[-2] + \" \" + line[-1]\n",
" text = re.sub(r\"\\\\+n\", \" \", text)\n",
" text = re.sub('[^A-Za-z ]+', '', text)\n",
" for t in text.split():\n",
" yield t\n",
"\n",
"def get_word_lines_from_file(file_name):\n",
" with lzma.open(file_name, encoding='utf8', mode=\"rt\") as fh:\n",
" for line in fh:\n",
" yield get_words_from_line(line)\n",
"\n",
"vocab = build_vocab_from_iterator(\n",
" get_word_lines_from_file(\"train/in.tsv.xz\"),\n",
" max_tokens = VOCAB_SIZE,\n",
" specials = ['<unk>'])"
]
},
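{
"cell_type": "markdown",
"metadata": {},
"source": [
"A quick sanity check on the vocabulary (a sketch, assuming the cell above ran to completion): the `Vocab` object maps tokens to integer ids, with `<unk>` at index 0 catching everything outside the 10k most frequent words."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Sketch: inspect the vocabulary built above (assumes `vocab` exists).\n",
"vocab.set_default_index(vocab['<unk>'])  # out-of-vocabulary tokens map to <unk>\n",
"print(len(vocab))  # at most VOCAB_SIZE entries\n",
"print(vocab.lookup_tokens(list(range(10))))  # specials first, then the most frequent tokens\n",
"print(vocab['the'])  # id of a frequent word; unseen words fall back to vocab['<unk>']"
]
},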
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def look_ahead_iterator(gen):\n",
" first = None\n",
" second = None\n",
" for item in gen:\n",
" if first is not None and second is not None:\n",
" yield ((first, item), second)\n",
" first = second\n",
" second = item\n",
"\n",
"class Trigrams(IterableDataset):\n",
" def __init__(self, text_file, vocabulary_size):\n",
" self.vocab = vocab\n",
" self.vocab.set_default_index(self.vocab['<unk>'])\n",
" self.vocabulary_size = VOCAB_SIZE\n",
" self.text_file = text_file\n",
"\n",
" def __iter__(self):\n",
" return look_ahead_iterator(\n",
" (self.vocab[t] for t in itertools.chain.from_iterable(get_word_lines_from_file(self.text_file))))\n",
"\n",
"train_dataset = Trigrams(\"train/in.tsv.xz\", VOCAB_SIZE)"
]
},
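{
"cell_type": "markdown",
"metadata": {},
"source": [
"`look_ahead_iterator` turns a flat stream of token ids `w1 w2 w3 ...` into training pairs `((w1, w3), w2)`: the two outer words are the input and the middle word is the prediction target, which matches the word-gap task. A minimal toy demonstration (added sketch, hand-made ids instead of the real corpus):"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Toy stream of four token ids; no file access needed.\n",
"print(list(look_ahead_iterator(iter([10, 20, 30, 40]))))\n",
"# expected: [((10, 30), 20), ((20, 40), 30)]"
]
},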
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"class TrigramNNModel(nn.Module):\n",
" def __init__(self, VOCAB_SIZE, EMBED_SIZE):\n",
" super(TrigramNNModel, self).__init__()\n",
" self.embeddings = nn.Embedding(VOCAB_SIZE, EMBED_SIZE)\n",
" self.hidden_layer = nn.Linear(EMBED_SIZE*2, 1200)\n",
" self.output_layer = nn.Linear(1200, VOCAB_SIZE)\n",
" self.softmax = nn.Softmax()\n",
"\n",
" def forward(self, x):\n",
" emb_2 = self.embeddings(x[0])\n",
" emb_1 = self.embeddings(x[1])\n",
" x = torch.cat([emb_2, emb_1], dim=1)\n",
" x = self.hidden_layer(x)\n",
" x = self.output_layer(x)\n",
" x = self.softmax(x)\n",
" return x\n",
"\n",
"model = TrigramNNModel(vocab_size, embed_size)\n",
"\n",
"vocab.set_default_index(vocab['<unk>'])"
]
},
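{
"cell_type": "markdown",
"metadata": {},
"source": [
"A shape check on the untrained model (a sketch; the word choices are arbitrary examples): for a batch of two (left, right) context pairs the output should be a `(2, VOCAB_SIZE)` tensor whose rows are probability distributions, since the network ends in a softmax."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Sketch: one forward pass with a batch of two trigram contexts.\n",
"left = torch.tensor([vocab['he'], vocab['she']])   # words to the left of the gap\n",
"right = torch.tensor([vocab['to'], vocab['of']])   # words to the right of the gap\n",
"probs = model((left, right))\n",
"print(probs.shape)       # torch.Size([2, 10000])\n",
"print(probs.sum(dim=1))  # each row sums to ~1.0"
]
},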
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"C:\\Users\\micha\\AppData\\Local\\Temp\\ipykernel_14016\\2809838665.py:15: UserWarning: Implicit dimension choice for softmax has been deprecated. Change the call to include dim=X as an argument.\n",
" x = self.softmax(x)\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"0 tensor(9.2713, grad_fn=<NllLossBackward0>)\n",
"1 LOSS DIFF: tensor(8.2370, grad_fn=<NllLossBackward0>) tensor(8.2154, grad_fn=<NllLossBackward0>)\n",
"2 LOSS DIFF: tensor(8.0085, grad_fn=<NllLossBackward0>) tensor(7.9711, grad_fn=<NllLossBackward0>)\n",
"3 LOSS DIFF: tensor(8.0149, grad_fn=<NllLossBackward0>) tensor(8.0085, grad_fn=<NllLossBackward0>)\n",
"4 LOSS DIFF: tensor(7.5328, grad_fn=<NllLossBackward0>) tensor(7.4404, grad_fn=<NllLossBackward0>)\n",
"5 LOSS DIFF: tensor(7.5367, grad_fn=<NllLossBackward0>) tensor(7.5328, grad_fn=<NllLossBackward0>)\n",
"6 LOSS DIFF: tensor(7.6733, grad_fn=<NllLossBackward0>) tensor(7.5367, grad_fn=<NllLossBackward0>)\n",
"7 LOSS DIFF: tensor(7.4703, grad_fn=<NllLossBackward0>) tensor(7.3663, grad_fn=<NllLossBackward0>)\n",
"8 LOSS DIFF: tensor(7.2923, grad_fn=<NllLossBackward0>) tensor(7.1224, grad_fn=<NllLossBackward0>)\n",
"9 LOSS DIFF: tensor(7.2912, grad_fn=<NllLossBackward0>) tensor(7.0721, grad_fn=<NllLossBackward0>)\n",
"10 LOSS DIFF: tensor(7.4529, grad_fn=<NllLossBackward0>) tensor(7.0255, grad_fn=<NllLossBackward0>)\n",
"11 LOSS DIFF: tensor(7.2017, grad_fn=<NllLossBackward0>) tensor(7.0108, grad_fn=<NllLossBackward0>)\n",
"12 LOSS DIFF: tensor(7.0689, grad_fn=<NllLossBackward0>) tensor(6.7964, grad_fn=<NllLossBackward0>)\n",
"13 LOSS DIFF: tensor(7.1870, grad_fn=<NllLossBackward0>) tensor(6.7505, grad_fn=<NllLossBackward0>)\n",
"14 LOSS DIFF: tensor(7.0149, grad_fn=<NllLossBackward0>) tensor(6.7360, grad_fn=<NllLossBackward0>)\n",
"15 LOSS DIFF: tensor(7.0185, grad_fn=<NllLossBackward0>) tensor(6.5064, grad_fn=<NllLossBackward0>)\n",
"16 LOSS DIFF: tensor(6.6809, grad_fn=<NllLossBackward0>) tensor(6.6315, grad_fn=<NllLossBackward0>)\n",
"17 LOSS DIFF: tensor(6.6161, grad_fn=<NllLossBackward0>) tensor(6.5363, grad_fn=<NllLossBackward0>)\n",
"18 LOSS DIFF: tensor(6.6186, grad_fn=<NllLossBackward0>) tensor(6.4474, grad_fn=<NllLossBackward0>)\n",
"19 LOSS DIFF: tensor(6.7242, grad_fn=<NllLossBackward0>) tensor(6.6186, grad_fn=<NllLossBackward0>)\n",
"20 LOSS DIFF: tensor(6.8363, grad_fn=<NllLossBackward0>) tensor(6.4740, grad_fn=<NllLossBackward0>)\n",
"21 LOSS DIFF: tensor(6.4746, grad_fn=<NllLossBackward0>) tensor(6.3583, grad_fn=<NllLossBackward0>)\n",
"22 LOSS DIFF: tensor(6.2821, grad_fn=<NllLossBackward0>) tensor(6.2621, grad_fn=<NllLossBackward0>)\n",
"23 LOSS DIFF: tensor(6.5530, grad_fn=<NllLossBackward0>) tensor(6.2821, grad_fn=<NllLossBackward0>)\n",
"24 LOSS DIFF: tensor(6.3082, grad_fn=<NllLossBackward0>) tensor(6.1749, grad_fn=<NllLossBackward0>)\n",
"25 LOSS DIFF: tensor(6.3215, grad_fn=<NllLossBackward0>) tensor(6.0069, grad_fn=<NllLossBackward0>)\n",
"26 LOSS DIFF: tensor(6.3455, grad_fn=<NllLossBackward0>) tensor(6.1887, grad_fn=<NllLossBackward0>)\n",
"27 LOSS DIFF: tensor(6.0695, grad_fn=<NllLossBackward0>) tensor(6.0053, grad_fn=<NllLossBackward0>)\n",
"28 LOSS DIFF: tensor(6.2298, grad_fn=<NllLossBackward0>) tensor(6.0553, grad_fn=<NllLossBackward0>)\n",
"29 LOSS DIFF: tensor(6.2879, grad_fn=<NllLossBackward0>) tensor(6.2298, grad_fn=<NllLossBackward0>)\n",
"30 LOSS DIFF: tensor(5.8552, grad_fn=<NllLossBackward0>) tensor(5.7972, grad_fn=<NllLossBackward0>)\n",
"31 LOSS DIFF: tensor(5.8884, grad_fn=<NllLossBackward0>) tensor(5.8552, grad_fn=<NllLossBackward0>)\n",
"32 LOSS DIFF: tensor(6.0852, grad_fn=<NllLossBackward0>) tensor(5.8884, grad_fn=<NllLossBackward0>)\n",
"33 LOSS DIFF: tensor(6.2040, grad_fn=<NllLossBackward0>) tensor(6.0852, grad_fn=<NllLossBackward0>)\n",
"34 LOSS DIFF: tensor(6.1036, grad_fn=<NllLossBackward0>) tensor(5.9439, grad_fn=<NllLossBackward0>)\n",
"35 LOSS DIFF: tensor(6.0782, grad_fn=<NllLossBackward0>) tensor(5.9413, grad_fn=<NllLossBackward0>)\n",
"36 LOSS DIFF: tensor(5.9607, grad_fn=<NllLossBackward0>) tensor(5.7949, grad_fn=<NllLossBackward0>)\n",
"37 LOSS DIFF: tensor(6.0354, grad_fn=<NllLossBackward0>) tensor(5.9607, grad_fn=<NllLossBackward0>)\n",
"38 LOSS DIFF: tensor(6.2669, grad_fn=<NllLossBackward0>) tensor(6.0243, grad_fn=<NllLossBackward0>)\n",
"39 LOSS DIFF: tensor(5.8678, grad_fn=<NllLossBackward0>) tensor(5.6556, grad_fn=<NllLossBackward0>)\n",
"40 LOSS DIFF: tensor(6.0265, grad_fn=<NllLossBackward0>) tensor(5.8678, grad_fn=<NllLossBackward0>)\n",
"41 LOSS DIFF: tensor(6.1147, grad_fn=<NllLossBackward0>) tensor(5.8050, grad_fn=<NllLossBackward0>)\n",
"100 tensor(5.8244, grad_fn=<NllLossBackward0>)\n",
"42 LOSS DIFF: tensor(5.8244, grad_fn=<NllLossBackward0>) tensor(5.7412, grad_fn=<NllLossBackward0>)\n",
"43 LOSS DIFF: tensor(5.9226, grad_fn=<NllLossBackward0>) tensor(5.8244, grad_fn=<NllLossBackward0>)\n",
"44 LOSS DIFF: tensor(5.9487, grad_fn=<NllLossBackward0>) tensor(5.9226, grad_fn=<NllLossBackward0>)\n",
"45 LOSS DIFF: tensor(5.8844, grad_fn=<NllLossBackward0>) tensor(5.3183, grad_fn=<NllLossBackward0>)\n",
"46 LOSS DIFF: tensor(6.0141, grad_fn=<NllLossBackward0>) tensor(5.8844, grad_fn=<NllLossBackward0>)\n",
"47 LOSS DIFF: tensor(6.1782, grad_fn=<NllLossBackward0>) tensor(5.8340, grad_fn=<NllLossBackward0>)\n",
"48 LOSS DIFF: tensor(5.8840, grad_fn=<NllLossBackward0>) tensor(5.7920, grad_fn=<NllLossBackward0>)\n",
"49 LOSS DIFF: tensor(5.7265, grad_fn=<NllLossBackward0>) tensor(5.6177, grad_fn=<NllLossBackward0>)\n",
"50 LOSS DIFF: tensor(5.9389, grad_fn=<NllLossBackward0>) tensor(5.7265, grad_fn=<NllLossBackward0>)\n",
"51 LOSS DIFF: tensor(5.6946, grad_fn=<NllLossBackward0>) tensor(5.6487, grad_fn=<NllLossBackward0>)\n",
"52 LOSS DIFF: tensor(5.8837, grad_fn=<NllLossBackward0>) tensor(5.6946, grad_fn=<NllLossBackward0>)\n",
"53 LOSS DIFF: tensor(5.9090, grad_fn=<NllLossBackward0>) tensor(5.8837, grad_fn=<NllLossBackward0>)\n",
"54 LOSS DIFF: tensor(5.9914, grad_fn=<NllLossBackward0>) tensor(5.9090, grad_fn=<NllLossBackward0>)\n",
"55 LOSS DIFF: tensor(5.8042, grad_fn=<NllLossBackward0>) tensor(5.7994, grad_fn=<NllLossBackward0>)\n",
"56 LOSS DIFF: tensor(5.9282, grad_fn=<NllLossBackward0>) tensor(5.8042, grad_fn=<NllLossBackward0>)\n",
"57 LOSS DIFF: tensor(5.9366, grad_fn=<NllLossBackward0>) tensor(5.7254, grad_fn=<NllLossBackward0>)\n",
"58 LOSS DIFF: tensor(5.7995, grad_fn=<NllLossBackward0>) tensor(5.7486, grad_fn=<NllLossBackward0>)\n",
"59 LOSS DIFF: tensor(5.6361, grad_fn=<NllLossBackward0>) tensor(5.5307, grad_fn=<NllLossBackward0>)\n",
"60 LOSS DIFF: tensor(5.7078, grad_fn=<NllLossBackward0>) tensor(5.6361, grad_fn=<NllLossBackward0>)\n",
"61 LOSS DIFF: tensor(5.7592, grad_fn=<NllLossBackward0>) tensor(5.7078, grad_fn=<NllLossBackward0>)\n",
"62 LOSS DIFF: tensor(5.7625, grad_fn=<NllLossBackward0>) tensor(5.5981, grad_fn=<NllLossBackward0>)\n",
"63 LOSS DIFF: tensor(5.8389, grad_fn=<NllLossBackward0>) tensor(5.7625, grad_fn=<NllLossBackward0>)\n",
"64 LOSS DIFF: tensor(5.7739, grad_fn=<NllLossBackward0>) tensor(5.7312, grad_fn=<NllLossBackward0>)\n",
"65 LOSS DIFF: tensor(5.9031, grad_fn=<NllLossBackward0>) tensor(5.6170, grad_fn=<NllLossBackward0>)\n",
"66 LOSS DIFF: tensor(5.7173, grad_fn=<NllLossBackward0>) tensor(5.5232, grad_fn=<NllLossBackward0>)\n",
"67 LOSS DIFF: tensor(5.7408, grad_fn=<NllLossBackward0>) tensor(5.7173, grad_fn=<NllLossBackward0>)\n",
"68 LOSS DIFF: tensor(5.8191, grad_fn=<NllLossBackward0>) tensor(5.7408, grad_fn=<NllLossBackward0>)\n",
"69 LOSS DIFF: tensor(6.0318, grad_fn=<NllLossBackward0>) tensor(5.8191, grad_fn=<NllLossBackward0>)\n",
"70 LOSS DIFF: tensor(5.6656, grad_fn=<NllLossBackward0>) tensor(5.5086, grad_fn=<NllLossBackward0>)\n",
"71 LOSS DIFF: tensor(5.7288, grad_fn=<NllLossBackward0>) tensor(5.6656, grad_fn=<NllLossBackward0>)\n",
"72 LOSS DIFF: tensor(6.0700, grad_fn=<NllLossBackward0>) tensor(5.7288, grad_fn=<NllLossBackward0>)\n",
"73 LOSS DIFF: tensor(5.8114, grad_fn=<NllLossBackward0>) tensor(5.5442, grad_fn=<NllLossBackward0>)\n",
"74 LOSS DIFF: tensor(5.8363, grad_fn=<NllLossBackward0>) tensor(5.5099, grad_fn=<NllLossBackward0>)\n",
"75 LOSS DIFF: tensor(5.8545, grad_fn=<NllLossBackward0>) tensor(5.8363, grad_fn=<NllLossBackward0>)\n",
"76 LOSS DIFF: tensor(5.9820, grad_fn=<NllLossBackward0>) tensor(5.8545, grad_fn=<NllLossBackward0>)\n",
"77 LOSS DIFF: tensor(5.8431, grad_fn=<NllLossBackward0>) tensor(5.7144, grad_fn=<NllLossBackward0>)\n",
"78 LOSS DIFF: tensor(5.9114, grad_fn=<NllLossBackward0>) tensor(5.8431, grad_fn=<NllLossBackward0>)\n",
"79 LOSS DIFF: tensor(5.8020, grad_fn=<NllLossBackward0>) tensor(5.4449, grad_fn=<NllLossBackward0>)\n",
"80 LOSS DIFF: tensor(5.8973, grad_fn=<NllLossBackward0>) tensor(5.5983, grad_fn=<NllLossBackward0>)\n",
"81 LOSS DIFF: tensor(5.6962, grad_fn=<NllLossBackward0>) tensor(5.6396, grad_fn=<NllLossBackward0>)\n",
"82 LOSS DIFF: tensor(5.6928, grad_fn=<NllLossBackward0>) tensor(5.5821, grad_fn=<NllLossBackward0>)\n",
"83 LOSS DIFF: tensor(5.7957, grad_fn=<NllLossBackward0>) tensor(5.6928, grad_fn=<NllLossBackward0>)\n",
"84 LOSS DIFF: tensor(5.5650, grad_fn=<NllLossBackward0>) tensor(5.5055, grad_fn=<NllLossBackward0>)\n",
"85 LOSS DIFF: tensor(5.6884, grad_fn=<NllLossBackward0>) tensor(5.5650, grad_fn=<NllLossBackward0>)\n",
"86 LOSS DIFF: tensor(5.7350, grad_fn=<NllLossBackward0>) tensor(5.6884, grad_fn=<NllLossBackward0>)\n",
"87 LOSS DIFF: tensor(5.6654, grad_fn=<NllLossBackward0>) tensor(5.5815, grad_fn=<NllLossBackward0>)\n",
"88 LOSS DIFF: tensor(5.7693, grad_fn=<NllLossBackward0>) tensor(5.3977, grad_fn=<NllLossBackward0>)\n",
"89 LOSS DIFF: tensor(5.5829, grad_fn=<NllLossBackward0>) tensor(5.5628, grad_fn=<NllLossBackward0>)\n",
"90 LOSS DIFF: tensor(5.8661, grad_fn=<NllLossBackward0>) tensor(5.5829, grad_fn=<NllLossBackward0>)\n",
"91 LOSS DIFF: tensor(5.4884, grad_fn=<NllLossBackward0>) tensor(5.4546, grad_fn=<NllLossBackward0>)\n",
"92 LOSS DIFF: tensor(5.6575, grad_fn=<NllLossBackward0>) tensor(5.4884, grad_fn=<NllLossBackward0>)\n",
"93 LOSS DIFF: tensor(5.8113, grad_fn=<NllLossBackward0>) tensor(5.6575, grad_fn=<NllLossBackward0>)\n",
"94 LOSS DIFF: tensor(5.6923, grad_fn=<NllLossBackward0>) tensor(5.5077, grad_fn=<NllLossBackward0>)\n",
"95 LOSS DIFF: tensor(5.7196, grad_fn=<NllLossBackward0>) tensor(5.6923, grad_fn=<NllLossBackward0>)\n",
"96 LOSS DIFF: tensor(5.6317, grad_fn=<NllLossBackward0>) tensor(5.6262, grad_fn=<NllLossBackward0>)\n",
"97 LOSS DIFF: tensor(5.7707, grad_fn=<NllLossBackward0>) tensor(5.6099, grad_fn=<NllLossBackward0>)\n",
"200 tensor(5.4212, grad_fn=<NllLossBackward0>)\n",
"98 LOSS DIFF: tensor(5.5956, grad_fn=<NllLossBackward0>) tensor(5.4212, grad_fn=<NllLossBackward0>)\n",
"99 LOSS DIFF: tensor(5.7422, grad_fn=<NllLossBackward0>) tensor(5.5956, grad_fn=<NllLossBackward0>)\n",
"100 LOSS DIFF: tensor(5.8166, grad_fn=<NllLossBackward0>) tensor(5.7422, grad_fn=<NllLossBackward0>)\n",
"101 LOSS DIFF: tensor(5.8615, grad_fn=<NllLossBackward0>) tensor(5.8166, grad_fn=<NllLossBackward0>)\n",
"102 LOSS DIFF: tensor(5.9617, grad_fn=<NllLossBackward0>) tensor(5.8615, grad_fn=<NllLossBackward0>)\n",
"103 LOSS DIFF: tensor(5.9847, grad_fn=<NllLossBackward0>) tensor(5.9617, grad_fn=<NllLossBackward0>)\n",
"104 LOSS DIFF: tensor(5.8443, grad_fn=<NllLossBackward0>) tensor(5.6014, grad_fn=<NllLossBackward0>)\n",
"105 LOSS DIFF: tensor(5.7755, grad_fn=<NllLossBackward0>) tensor(5.7413, grad_fn=<NllLossBackward0>)\n",
"106 LOSS DIFF: tensor(6.0574, grad_fn=<NllLossBackward0>) tensor(5.6690, grad_fn=<NllLossBackward0>)\n",
"107 LOSS DIFF: tensor(5.4708, grad_fn=<NllLossBackward0>) tensor(5.4460, grad_fn=<NllLossBackward0>)\n",
"108 LOSS DIFF: tensor(5.6402, grad_fn=<NllLossBackward0>) tensor(5.4708, grad_fn=<NllLossBackward0>)\n",
"109 LOSS DIFF: tensor(5.7016, grad_fn=<NllLossBackward0>) tensor(5.6402, grad_fn=<NllLossBackward0>)\n",
"110 LOSS DIFF: tensor(5.5643, grad_fn=<NllLossBackward0>) tensor(5.4158, grad_fn=<NllLossBackward0>)\n",
"111 LOSS DIFF: tensor(5.6958, grad_fn=<NllLossBackward0>) tensor(5.3094, grad_fn=<NllLossBackward0>)\n",
"112 LOSS DIFF: tensor(5.8296, grad_fn=<NllLossBackward0>) tensor(5.4617, grad_fn=<NllLossBackward0>)\n",
"113 LOSS DIFF: tensor(5.6992, grad_fn=<NllLossBackward0>) tensor(5.5483, grad_fn=<NllLossBackward0>)\n",
"114 LOSS DIFF: tensor(5.4980, grad_fn=<NllLossBackward0>) tensor(5.4310, grad_fn=<NllLossBackward0>)\n",
"115 LOSS DIFF: tensor(5.4942, grad_fn=<NllLossBackward0>) tensor(5.3832, grad_fn=<NllLossBackward0>)\n",
"116 LOSS DIFF: tensor(5.6928, grad_fn=<NllLossBackward0>) tensor(5.4942, grad_fn=<NllLossBackward0>)\n",
"117 LOSS DIFF: tensor(5.6334, grad_fn=<NllLossBackward0>) tensor(5.5606, grad_fn=<NllLossBackward0>)\n",
"118 LOSS DIFF: tensor(5.7307, grad_fn=<NllLossBackward0>) tensor(5.5210, grad_fn=<NllLossBackward0>)\n",
"119 LOSS DIFF: tensor(5.5673, grad_fn=<NllLossBackward0>) tensor(5.5488, grad_fn=<NllLossBackward0>)\n",
"120 LOSS DIFF: tensor(6.0060, grad_fn=<NllLossBackward0>) tensor(5.4800, grad_fn=<NllLossBackward0>)\n",
"121 LOSS DIFF: tensor(5.5278, grad_fn=<NllLossBackward0>) tensor(5.1856, grad_fn=<NllLossBackward0>)\n",
"122 LOSS DIFF: tensor(5.5388, grad_fn=<NllLossBackward0>) tensor(5.5278, grad_fn=<NllLossBackward0>)\n",
"123 LOSS DIFF: tensor(5.6835, grad_fn=<NllLossBackward0>) tensor(5.5388, grad_fn=<NllLossBackward0>)\n",
"124 LOSS DIFF: tensor(5.6808, grad_fn=<NllLossBackward0>) tensor(5.5417, grad_fn=<NllLossBackward0>)\n",
"125 LOSS DIFF: tensor(5.8665, grad_fn=<NllLossBackward0>) tensor(5.5828, grad_fn=<NllLossBackward0>)\n",
"126 LOSS DIFF: tensor(5.7710, grad_fn=<NllLossBackward0>) tensor(5.5468, grad_fn=<NllLossBackward0>)\n",
"127 LOSS DIFF: tensor(5.6604, grad_fn=<NllLossBackward0>) tensor(5.6368, grad_fn=<NllLossBackward0>)\n",
"128 LOSS DIFF: tensor(5.5983, grad_fn=<NllLossBackward0>) tensor(5.5213, grad_fn=<NllLossBackward0>)\n",
"129 LOSS DIFF: tensor(5.6943, grad_fn=<NllLossBackward0>) tensor(5.4842, grad_fn=<NllLossBackward0>)\n",
"130 LOSS DIFF: tensor(5.5073, grad_fn=<NllLossBackward0>) tensor(5.4259, grad_fn=<NllLossBackward0>)\n",
"131 LOSS DIFF: tensor(5.5320, grad_fn=<NllLossBackward0>) tensor(5.5073, grad_fn=<NllLossBackward0>)\n",
"132 LOSS DIFF: tensor(5.6082, grad_fn=<NllLossBackward0>) tensor(5.4292, grad_fn=<NllLossBackward0>)\n",
"133 LOSS DIFF: tensor(5.6768, grad_fn=<NllLossBackward0>) tensor(5.4724, grad_fn=<NllLossBackward0>)\n",
"134 LOSS DIFF: tensor(5.5272, grad_fn=<NllLossBackward0>) tensor(5.5222, grad_fn=<NllLossBackward0>)\n",
"135 LOSS DIFF: tensor(5.5190, grad_fn=<NllLossBackward0>) tensor(5.5016, grad_fn=<NllLossBackward0>)\n",
"136 LOSS DIFF: tensor(5.6560, grad_fn=<NllLossBackward0>) tensor(5.5190, grad_fn=<NllLossBackward0>)\n",
"137 LOSS DIFF: tensor(5.6775, grad_fn=<NllLossBackward0>) tensor(5.6560, grad_fn=<NllLossBackward0>)\n",
"138 LOSS DIFF: tensor(5.6694, grad_fn=<NllLossBackward0>) tensor(5.6686, grad_fn=<NllLossBackward0>)\n",
"139 LOSS DIFF: tensor(5.5788, grad_fn=<NllLossBackward0>) tensor(5.2768, grad_fn=<NllLossBackward0>)\n",
"140 LOSS DIFF: tensor(5.3935, grad_fn=<NllLossBackward0>) tensor(5.3774, grad_fn=<NllLossBackward0>)\n",
"141 LOSS DIFF: tensor(5.6068, grad_fn=<NllLossBackward0>) tensor(5.3935, grad_fn=<NllLossBackward0>)\n",
"142 LOSS DIFF: tensor(5.6336, grad_fn=<NllLossBackward0>) tensor(5.6068, grad_fn=<NllLossBackward0>)\n",
"143 LOSS DIFF: tensor(5.7687, grad_fn=<NllLossBackward0>) tensor(5.5630, grad_fn=<NllLossBackward0>)\n",
"144 LOSS DIFF: tensor(5.7539, grad_fn=<NllLossBackward0>) tensor(5.6827, grad_fn=<NllLossBackward0>)\n",
"145 LOSS DIFF: tensor(5.7485, grad_fn=<NllLossBackward0>) tensor(5.6277, grad_fn=<NllLossBackward0>)\n",
"300 tensor(5.8304, grad_fn=<NllLossBackward0>)\n",
"146 LOSS DIFF: tensor(5.8304, grad_fn=<NllLossBackward0>) tensor(5.5549, grad_fn=<NllLossBackward0>)\n",
"147 LOSS DIFF: tensor(5.5819, grad_fn=<NllLossBackward0>) tensor(5.4616, grad_fn=<NllLossBackward0>)\n",
"148 LOSS DIFF: tensor(5.6154, grad_fn=<NllLossBackward0>) tensor(5.5819, grad_fn=<NllLossBackward0>)\n",
"149 LOSS DIFF: tensor(5.7859, grad_fn=<NllLossBackward0>) tensor(5.3329, grad_fn=<NllLossBackward0>)\n",
"150 LOSS DIFF: tensor(5.5458, grad_fn=<NllLossBackward0>) tensor(5.5438, grad_fn=<NllLossBackward0>)\n",
"151 LOSS DIFF: tensor(5.7121, grad_fn=<NllLossBackward0>) tensor(5.5458, grad_fn=<NllLossBackward0>)\n",
"152 LOSS DIFF: tensor(5.6329, grad_fn=<NllLossBackward0>) tensor(5.2700, grad_fn=<NllLossBackward0>)\n",
"153 LOSS DIFF: tensor(5.6739, grad_fn=<NllLossBackward0>) tensor(5.3680, grad_fn=<NllLossBackward0>)\n",
"154 LOSS DIFF: tensor(5.7045, grad_fn=<NllLossBackward0>) tensor(5.6739, grad_fn=<NllLossBackward0>)\n",
"155 LOSS DIFF: tensor(5.5067, grad_fn=<NllLossBackward0>) tensor(5.2978, grad_fn=<NllLossBackward0>)\n",
"156 LOSS DIFF: tensor(5.5102, grad_fn=<NllLossBackward0>) tensor(5.5067, grad_fn=<NllLossBackward0>)\n",
"157 LOSS DIFF: tensor(5.5956, grad_fn=<NllLossBackward0>) tensor(5.4116, grad_fn=<NllLossBackward0>)\n",
"158 LOSS DIFF: tensor(5.5993, grad_fn=<NllLossBackward0>) tensor(5.4012, grad_fn=<NllLossBackward0>)\n",
"159 LOSS DIFF: tensor(5.6150, grad_fn=<NllLossBackward0>) tensor(5.3476, grad_fn=<NllLossBackward0>)\n",
"160 LOSS DIFF: tensor(5.4375, grad_fn=<NllLossBackward0>) tensor(5.4351, grad_fn=<NllLossBackward0>)\n",
"161 LOSS DIFF: tensor(5.7052, grad_fn=<NllLossBackward0>) tensor(5.4375, grad_fn=<NllLossBackward0>)\n",
"162 LOSS DIFF: tensor(5.7059, grad_fn=<NllLossBackward0>) tensor(5.5050, grad_fn=<NllLossBackward0>)\n",
"163 LOSS DIFF: tensor(5.7356, grad_fn=<NllLossBackward0>) tensor(5.5716, grad_fn=<NllLossBackward0>)\n",
"164 LOSS DIFF: tensor(5.7517, grad_fn=<NllLossBackward0>) tensor(5.5423, grad_fn=<NllLossBackward0>)\n",
"165 LOSS DIFF: tensor(5.7358, grad_fn=<NllLossBackward0>) tensor(5.4403, grad_fn=<NllLossBackward0>)\n",
"166 LOSS DIFF: tensor(5.6180, grad_fn=<NllLossBackward0>) tensor(5.4437, grad_fn=<NllLossBackward0>)\n",
"167 LOSS DIFF: tensor(5.5725, grad_fn=<NllLossBackward0>) tensor(5.2734, grad_fn=<NllLossBackward0>)\n",
"168 LOSS DIFF: tensor(5.8849, grad_fn=<NllLossBackward0>) tensor(5.3810, grad_fn=<NllLossBackward0>)\n",
"169 LOSS DIFF: tensor(5.5414, grad_fn=<NllLossBackward0>) tensor(5.5272, grad_fn=<NllLossBackward0>)\n",
"170 LOSS DIFF: tensor(5.5738, grad_fn=<NllLossBackward0>) tensor(5.3898, grad_fn=<NllLossBackward0>)\n",
"171 LOSS DIFF: tensor(5.7096, grad_fn=<NllLossBackward0>) tensor(5.2583, grad_fn=<NllLossBackward0>)\n",
"172 LOSS DIFF: tensor(5.7039, grad_fn=<NllLossBackward0>) tensor(5.6133, grad_fn=<NllLossBackward0>)\n",
"173 LOSS DIFF: tensor(5.5324, grad_fn=<NllLossBackward0>) tensor(5.5068, grad_fn=<NllLossBackward0>)\n",
"174 LOSS DIFF: tensor(5.5902, grad_fn=<NllLossBackward0>) tensor(5.4034, grad_fn=<NllLossBackward0>)\n",
"175 LOSS DIFF: tensor(5.5912, grad_fn=<NllLossBackward0>) tensor(5.5902, grad_fn=<NllLossBackward0>)\n",
"176 LOSS DIFF: tensor(5.7047, grad_fn=<NllLossBackward0>) tensor(5.5912, grad_fn=<NllLossBackward0>)\n",
"177 LOSS DIFF: tensor(5.6506, grad_fn=<NllLossBackward0>) tensor(5.4474, grad_fn=<NllLossBackward0>)\n",
"178 LOSS DIFF: tensor(5.5547, grad_fn=<NllLossBackward0>) tensor(5.5172, grad_fn=<NllLossBackward0>)\n",
"179 LOSS DIFF: tensor(5.5271, grad_fn=<NllLossBackward0>) tensor(5.2485, grad_fn=<NllLossBackward0>)\n",
"180 LOSS DIFF: tensor(5.5400, grad_fn=<NllLossBackward0>) tensor(5.4519, grad_fn=<NllLossBackward0>)\n",
"181 LOSS DIFF: tensor(5.6702, grad_fn=<NllLossBackward0>) tensor(5.5037, grad_fn=<NllLossBackward0>)\n",
"182 LOSS DIFF: tensor(5.5462, grad_fn=<NllLossBackward0>) tensor(5.4319, grad_fn=<NllLossBackward0>)\n",
"183 LOSS DIFF: tensor(5.5346, grad_fn=<NllLossBackward0>) tensor(5.4046, grad_fn=<NllLossBackward0>)\n",
"184 LOSS DIFF: tensor(5.5779, grad_fn=<NllLossBackward0>) tensor(5.5096, grad_fn=<NllLossBackward0>)\n",
"185 LOSS DIFF: tensor(5.5979, grad_fn=<NllLossBackward0>) tensor(5.4310, grad_fn=<NllLossBackward0>)\n",
"186 LOSS DIFF: tensor(5.4231, grad_fn=<NllLossBackward0>) tensor(5.2371, grad_fn=<NllLossBackward0>)\n",
"187 LOSS DIFF: tensor(5.6120, grad_fn=<NllLossBackward0>) tensor(5.4231, grad_fn=<NllLossBackward0>)\n",
"188 LOSS DIFF: tensor(5.4934, grad_fn=<NllLossBackward0>) tensor(5.1333, grad_fn=<NllLossBackward0>)\n",
"189 LOSS DIFF: tensor(5.5445, grad_fn=<NllLossBackward0>) tensor(5.2967, grad_fn=<NllLossBackward0>)\n",
"190 LOSS DIFF: tensor(5.5506, grad_fn=<NllLossBackward0>) tensor(5.5445, grad_fn=<NllLossBackward0>)\n",
"191 LOSS DIFF: tensor(5.6374, grad_fn=<NllLossBackward0>) tensor(5.5506, grad_fn=<NllLossBackward0>)\n",
"400 tensor(5.5743, grad_fn=<NllLossBackward0>)\n",
"192 LOSS DIFF: tensor(5.6050, grad_fn=<NllLossBackward0>) tensor(5.5743, grad_fn=<NllLossBackward0>)\n",
"193 LOSS DIFF: tensor(5.5826, grad_fn=<NllLossBackward0>) tensor(5.3787, grad_fn=<NllLossBackward0>)\n",
"194 LOSS DIFF: tensor(5.5223, grad_fn=<NllLossBackward0>) tensor(5.3267, grad_fn=<NllLossBackward0>)\n",
"195 LOSS DIFF: tensor(5.4600, grad_fn=<NllLossBackward0>) tensor(5.4485, grad_fn=<NllLossBackward0>)\n",
"196 LOSS DIFF: tensor(5.5178, grad_fn=<NllLossBackward0>) tensor(5.4600, grad_fn=<NllLossBackward0>)\n",
"197 LOSS DIFF: tensor(5.5514, grad_fn=<NllLossBackward0>) tensor(5.2249, grad_fn=<NllLossBackward0>)\n",
"198 LOSS DIFF: tensor(5.5651, grad_fn=<NllLossBackward0>) tensor(5.4807, grad_fn=<NllLossBackward0>)\n",
"199 LOSS DIFF: tensor(5.4252, grad_fn=<NllLossBackward0>) tensor(5.1542, grad_fn=<NllLossBackward0>)\n",
"200 LOSS DIFF: tensor(5.6503, grad_fn=<NllLossBackward0>) tensor(5.4252, grad_fn=<NllLossBackward0>)\n",
"201 LOSS DIFF: tensor(5.5460, grad_fn=<NllLossBackward0>) tensor(5.3643, grad_fn=<NllLossBackward0>)\n",
"202 LOSS DIFF: tensor(5.7145, grad_fn=<NllLossBackward0>) tensor(5.4959, grad_fn=<NllLossBackward0>)\n",
"203 LOSS DIFF: tensor(5.4506, grad_fn=<NllLossBackward0>) tensor(5.4382, grad_fn=<NllLossBackward0>)\n",
"204 LOSS DIFF: tensor(5.5514, grad_fn=<NllLossBackward0>) tensor(5.4506, grad_fn=<NllLossBackward0>)\n",
"205 LOSS DIFF: tensor(5.5680, grad_fn=<NllLossBackward0>) tensor(5.5468, grad_fn=<NllLossBackward0>)\n",
"206 LOSS DIFF: tensor(5.5970, grad_fn=<NllLossBackward0>) tensor(5.5680, grad_fn=<NllLossBackward0>)\n",
"207 LOSS DIFF: tensor(5.6742, grad_fn=<NllLossBackward0>) tensor(5.5970, grad_fn=<NllLossBackward0>)\n",
"208 LOSS DIFF: tensor(5.5306, grad_fn=<NllLossBackward0>) tensor(5.2061, grad_fn=<NllLossBackward0>)\n",
"209 LOSS DIFF: tensor(5.7571, grad_fn=<NllLossBackward0>) tensor(5.5306, grad_fn=<NllLossBackward0>)\n",
"210 LOSS DIFF: tensor(5.6525, grad_fn=<NllLossBackward0>) tensor(5.3833, grad_fn=<NllLossBackward0>)\n",
"211 LOSS DIFF: tensor(5.5354, grad_fn=<NllLossBackward0>) tensor(5.3948, grad_fn=<NllLossBackward0>)\n",
"212 LOSS DIFF: tensor(5.5960, grad_fn=<NllLossBackward0>) tensor(5.5354, grad_fn=<NllLossBackward0>)\n",
"213 LOSS DIFF: tensor(5.7113, grad_fn=<NllLossBackward0>) tensor(5.5470, grad_fn=<NllLossBackward0>)\n",
"214 LOSS DIFF: tensor(5.4059, grad_fn=<NllLossBackward0>) tensor(5.3649, grad_fn=<NllLossBackward0>)\n",
"215 LOSS DIFF: tensor(5.4863, grad_fn=<NllLossBackward0>) tensor(5.4004, grad_fn=<NllLossBackward0>)\n",
"216 LOSS DIFF: tensor(5.5381, grad_fn=<NllLossBackward0>) tensor(5.4863, grad_fn=<NllLossBackward0>)\n",
"217 LOSS DIFF: tensor(5.3652, grad_fn=<NllLossBackward0>) tensor(5.3540, grad_fn=<NllLossBackward0>)\n",
"218 LOSS DIFF: tensor(5.3894, grad_fn=<NllLossBackward0>) tensor(5.1646, grad_fn=<NllLossBackward0>)\n",
"219 LOSS DIFF: tensor(5.6803, grad_fn=<NllLossBackward0>) tensor(5.3894, grad_fn=<NllLossBackward0>)\n",
"220 LOSS DIFF: tensor(5.6113, grad_fn=<NllLossBackward0>) tensor(5.4769, grad_fn=<NllLossBackward0>)\n",
"221 LOSS DIFF: tensor(5.6813, grad_fn=<NllLossBackward0>) tensor(5.2015, grad_fn=<NllLossBackward0>)\n",
"222 LOSS DIFF: tensor(5.3458, grad_fn=<NllLossBackward0>) tensor(5.2679, grad_fn=<NllLossBackward0>)\n",
"223 LOSS DIFF: tensor(5.2445, grad_fn=<NllLossBackward0>) tensor(5.1445, grad_fn=<NllLossBackward0>)\n",
"224 LOSS DIFF: tensor(5.6649, grad_fn=<NllLossBackward0>) tensor(5.2441, grad_fn=<NllLossBackward0>)\n",
"225 LOSS DIFF: tensor(5.8539, grad_fn=<NllLossBackward0>) tensor(5.6026, grad_fn=<NllLossBackward0>)\n",
"226 LOSS DIFF: tensor(5.4560, grad_fn=<NllLossBackward0>) tensor(5.4208, grad_fn=<NllLossBackward0>)\n",
"227 LOSS DIFF: tensor(5.5729, grad_fn=<NllLossBackward0>) tensor(5.4560, grad_fn=<NllLossBackward0>)\n",
"228 LOSS DIFF: tensor(5.5996, grad_fn=<NllLossBackward0>) tensor(5.3175, grad_fn=<NllLossBackward0>)\n",
"229 LOSS DIFF: tensor(5.6685, grad_fn=<NllLossBackward0>) tensor(5.2451, grad_fn=<NllLossBackward0>)\n",
"230 LOSS DIFF: tensor(5.5938, grad_fn=<NllLossBackward0>) tensor(5.4874, grad_fn=<NllLossBackward0>)\n",
"231 LOSS DIFF: tensor(5.6228, grad_fn=<NllLossBackward0>) tensor(5.2840, grad_fn=<NllLossBackward0>)\n",
"232 LOSS DIFF: tensor(5.3415, grad_fn=<NllLossBackward0>) tensor(5.3339, grad_fn=<NllLossBackward0>)\n",
"233 LOSS DIFF: tensor(5.3861, grad_fn=<NllLossBackward0>) tensor(5.1807, grad_fn=<NllLossBackward0>)\n",
"234 LOSS DIFF: tensor(5.4093, grad_fn=<NllLossBackward0>) tensor(5.3861, grad_fn=<NllLossBackward0>)\n",
"235 LOSS DIFF: tensor(5.6085, grad_fn=<NllLossBackward0>) tensor(5.4093, grad_fn=<NllLossBackward0>)\n",
"236 LOSS DIFF: tensor(5.3475, grad_fn=<NllLossBackward0>) tensor(5.1380, grad_fn=<NllLossBackward0>)\n",
"237 LOSS DIFF: tensor(5.6542, grad_fn=<NllLossBackward0>) tensor(5.3475, grad_fn=<NllLossBackward0>)\n",
"238 LOSS DIFF: tensor(5.6034, grad_fn=<NllLossBackward0>) tensor(5.2396, grad_fn=<NllLossBackward0>)\n",
"239 LOSS DIFF: tensor(5.5599, grad_fn=<NllLossBackward0>) tensor(5.2510, grad_fn=<NllLossBackward0>)\n",
"240 LOSS DIFF: tensor(5.4534, grad_fn=<NllLossBackward0>) tensor(5.3629, grad_fn=<NllLossBackward0>)\n",
"500 tensor(5.5447, grad_fn=<NllLossBackward0>)\n",
"241 LOSS DIFF: tensor(5.5447, grad_fn=<NllLossBackward0>) tensor(5.4534, grad_fn=<NllLossBackward0>)\n",
"242 LOSS DIFF: tensor(5.4929, grad_fn=<NllLossBackward0>) tensor(5.3445, grad_fn=<NllLossBackward0>)\n",
"243 LOSS DIFF: tensor(5.4963, grad_fn=<NllLossBackward0>) tensor(5.3411, grad_fn=<NllLossBackward0>)\n",
"244 LOSS DIFF: tensor(5.3306, grad_fn=<NllLossBackward0>) tensor(5.1341, grad_fn=<NllLossBackward0>)\n",
"245 LOSS DIFF: tensor(5.3853, grad_fn=<NllLossBackward0>) tensor(5.3306, grad_fn=<NllLossBackward0>)\n",
"246 LOSS DIFF: tensor(5.5949, grad_fn=<NllLossBackward0>) tensor(5.3853, grad_fn=<NllLossBackward0>)\n",
"247 LOSS DIFF: tensor(5.5202, grad_fn=<NllLossBackward0>) tensor(5.2283, grad_fn=<NllLossBackward0>)\n",
"248 LOSS DIFF: tensor(5.5862, grad_fn=<NllLossBackward0>) tensor(5.5202, grad_fn=<NllLossBackward0>)\n",
"249 LOSS DIFF: tensor(5.5425, grad_fn=<NllLossBackward0>) tensor(5.2707, grad_fn=<NllLossBackward0>)\n",
"250 LOSS DIFF: tensor(5.6233, grad_fn=<NllLossBackward0>) tensor(5.2300, grad_fn=<NllLossBackward0>)\n",
"251 LOSS DIFF: tensor(5.4803, grad_fn=<NllLossBackward0>) tensor(5.3777, grad_fn=<NllLossBackward0>)\n",
"252 LOSS DIFF: tensor(5.6414, grad_fn=<NllLossBackward0>) tensor(5.3601, grad_fn=<NllLossBackward0>)\n",
"253 LOSS DIFF: tensor(5.2371, grad_fn=<NllLossBackward0>) tensor(5.2364, grad_fn=<NllLossBackward0>)\n",
"254 LOSS DIFF: tensor(5.3186, grad_fn=<NllLossBackward0>) tensor(5.2371, grad_fn=<NllLossBackward0>)\n",
"255 LOSS DIFF: tensor(5.6731, grad_fn=<NllLossBackward0>) tensor(5.3186, grad_fn=<NllLossBackward0>)\n",
"256 LOSS DIFF: tensor(5.5774, grad_fn=<NllLossBackward0>) tensor(5.5003, grad_fn=<NllLossBackward0>)\n",
"257 LOSS DIFF: tensor(5.6139, grad_fn=<NllLossBackward0>) tensor(5.0909, grad_fn=<NllLossBackward0>)\n",
"258 LOSS DIFF: tensor(5.4975, grad_fn=<NllLossBackward0>) tensor(5.3252, grad_fn=<NllLossBackward0>)\n",
"259 LOSS DIFF: tensor(5.1695, grad_fn=<NllLossBackward0>) tensor(5.1682, grad_fn=<NllLossBackward0>)\n",
"260 LOSS DIFF: tensor(5.4441, grad_fn=<NllLossBackward0>) tensor(5.1695, grad_fn=<NllLossBackward0>)\n",
"261 LOSS DIFF: tensor(5.5408, grad_fn=<NllLossBackward0>) tensor(5.4441, grad_fn=<NllLossBackward0>)\n",
"262 LOSS DIFF: tensor(5.5618, grad_fn=<NllLossBackward0>) tensor(5.5408, grad_fn=<NllLossBackward0>)\n",
"263 LOSS DIFF: tensor(5.5545, grad_fn=<NllLossBackward0>) tensor(5.5457, grad_fn=<NllLossBackward0>)\n",
"264 LOSS DIFF: tensor(5.6082, grad_fn=<NllLossBackward0>) tensor(5.5545, grad_fn=<NllLossBackward0>)\n",
"265 LOSS DIFF: tensor(5.3351, grad_fn=<NllLossBackward0>) tensor(5.3258, grad_fn=<NllLossBackward0>)\n",
"266 LOSS DIFF: tensor(5.5028, grad_fn=<NllLossBackward0>) tensor(5.3351, grad_fn=<NllLossBackward0>)\n",
"267 LOSS DIFF: tensor(5.4873, grad_fn=<NllLossBackward0>) tensor(5.3415, grad_fn=<NllLossBackward0>)\n",
"268 LOSS DIFF: tensor(5.5458, grad_fn=<NllLossBackward0>) tensor(5.4873, grad_fn=<NllLossBackward0>)\n",
"269 LOSS DIFF: tensor(5.3706, grad_fn=<NllLossBackward0>) tensor(5.3371, grad_fn=<NllLossBackward0>)\n",
"270 LOSS DIFF: tensor(5.5207, grad_fn=<NllLossBackward0>) tensor(5.3706, grad_fn=<NllLossBackward0>)\n",
"271 LOSS DIFF: tensor(5.4275, grad_fn=<NllLossBackward0>) tensor(5.3686, grad_fn=<NllLossBackward0>)\n",
"272 LOSS DIFF: tensor(5.5256, grad_fn=<NllLossBackward0>) tensor(5.4275, grad_fn=<NllLossBackward0>)\n",
"273 LOSS DIFF: tensor(5.3044, grad_fn=<NllLossBackward0>) tensor(5.1722, grad_fn=<NllLossBackward0>)\n",
"274 LOSS DIFF: tensor(5.1798, grad_fn=<NllLossBackward0>) tensor(5.0866, grad_fn=<NllLossBackward0>)\n",
"275 LOSS DIFF: tensor(5.5159, grad_fn=<NllLossBackward0>) tensor(5.1798, grad_fn=<NllLossBackward0>)\n",
"276 LOSS DIFF: tensor(5.3755, grad_fn=<NllLossBackward0>) tensor(5.3404, grad_fn=<NllLossBackward0>)\n",
"277 LOSS DIFF: tensor(5.3817, grad_fn=<NllLossBackward0>) tensor(5.3755, grad_fn=<NllLossBackward0>)\n",
"278 LOSS DIFF: tensor(5.5214, grad_fn=<NllLossBackward0>) tensor(5.3817, grad_fn=<NllLossBackward0>)\n",
"279 LOSS DIFF: tensor(5.4231, grad_fn=<NllLossBackward0>) tensor(5.4104, grad_fn=<NllLossBackward0>)\n",
"280 LOSS DIFF: tensor(5.7068, grad_fn=<NllLossBackward0>) tensor(5.4231, grad_fn=<NllLossBackward0>)\n",
"281 LOSS DIFF: tensor(5.6217, grad_fn=<NllLossBackward0>) tensor(5.3672, grad_fn=<NllLossBackward0>)\n",
"282 LOSS DIFF: tensor(5.5297, grad_fn=<NllLossBackward0>) tensor(5.2592, grad_fn=<NllLossBackward0>)\n",
"283 LOSS DIFF: tensor(5.4354, grad_fn=<NllLossBackward0>) tensor(5.1583, grad_fn=<NllLossBackward0>)\n",
"284 LOSS DIFF: tensor(5.3529, grad_fn=<NllLossBackward0>) tensor(5.3227, grad_fn=<NllLossBackward0>)\n",
"285 LOSS DIFF: tensor(5.5201, grad_fn=<NllLossBackward0>) tensor(5.3529, grad_fn=<NllLossBackward0>)\n",
"286 LOSS DIFF: tensor(5.3654, grad_fn=<NllLossBackward0>) tensor(5.3083, grad_fn=<NllLossBackward0>)\n",
"287 LOSS DIFF: tensor(5.3719, grad_fn=<NllLossBackward0>) tensor(5.3654, grad_fn=<NllLossBackward0>)\n",
"288 LOSS DIFF: tensor(5.7598, grad_fn=<NllLossBackward0>) tensor(5.3256, grad_fn=<NllLossBackward0>)\n",
"289 LOSS DIFF: tensor(5.4723, grad_fn=<NllLossBackward0>) tensor(5.3773, grad_fn=<NllLossBackward0>)\n",
"600 tensor(5.1854, grad_fn=<NllLossBackward0>)\n",
"290 LOSS DIFF: tensor(5.2626, grad_fn=<NllLossBackward0>) tensor(5.1854, grad_fn=<NllLossBackward0>)\n",
"291 LOSS DIFF: tensor(5.3265, grad_fn=<NllLossBackward0>) tensor(5.2626, grad_fn=<NllLossBackward0>)\n",
"292 LOSS DIFF: tensor(5.3546, grad_fn=<NllLossBackward0>) tensor(5.3265, grad_fn=<NllLossBackward0>)\n",
"293 LOSS DIFF: tensor(5.4134, grad_fn=<NllLossBackward0>) tensor(5.3546, grad_fn=<NllLossBackward0>)\n",
"294 LOSS DIFF: tensor(5.3317, grad_fn=<NllLossBackward0>) tensor(5.3061, grad_fn=<NllLossBackward0>)\n",
"295 LOSS DIFF: tensor(5.5886, grad_fn=<NllLossBackward0>) tensor(5.3317, grad_fn=<NllLossBackward0>)\n",
"296 LOSS DIFF: tensor(5.2714, grad_fn=<NllLossBackward0>) tensor(5.2538, grad_fn=<NllLossBackward0>)\n",
"297 LOSS DIFF: tensor(5.4437, grad_fn=<NllLossBackward0>) tensor(5.2699, grad_fn=<NllLossBackward0>)\n",
"298 LOSS DIFF: tensor(5.4026, grad_fn=<NllLossBackward0>) tensor(5.3539, grad_fn=<NllLossBackward0>)\n",
"299 LOSS DIFF: tensor(5.5344, grad_fn=<NllLossBackward0>) tensor(5.4026, grad_fn=<NllLossBackward0>)\n",
"300 LOSS DIFF: tensor(5.2724, grad_fn=<NllLossBackward0>) tensor(5.1554, grad_fn=<NllLossBackward0>)\n",
"301 LOSS DIFF: tensor(5.4204, grad_fn=<NllLossBackward0>) tensor(5.2614, grad_fn=<NllLossBackward0>)\n",
"302 LOSS DIFF: tensor(5.5588, grad_fn=<NllLossBackward0>) tensor(5.4204, grad_fn=<NllLossBackward0>)\n",
"303 LOSS DIFF: tensor(5.4821, grad_fn=<NllLossBackward0>) tensor(5.2939, grad_fn=<NllLossBackward0>)\n",
"304 LOSS DIFF: tensor(5.5529, grad_fn=<NllLossBackward0>) tensor(5.4821, grad_fn=<NllLossBackward0>)\n",
"305 LOSS DIFF: tensor(5.5659, grad_fn=<NllLossBackward0>) tensor(5.5529, grad_fn=<NllLossBackward0>)\n",
"306 LOSS DIFF: tensor(5.3128, grad_fn=<NllLossBackward0>) tensor(5.1975, grad_fn=<NllLossBackward0>)\n",
"307 LOSS DIFF: tensor(5.4044, grad_fn=<NllLossBackward0>) tensor(5.2514, grad_fn=<NllLossBackward0>)\n",
"308 LOSS DIFF: tensor(5.5461, grad_fn=<NllLossBackward0>) tensor(5.4044, grad_fn=<NllLossBackward0>)\n",
"309 LOSS DIFF: tensor(5.4835, grad_fn=<NllLossBackward0>) tensor(5.4153, grad_fn=<NllLossBackward0>)\n",
"310 LOSS DIFF: tensor(5.4990, grad_fn=<NllLossBackward0>) tensor(5.3391, grad_fn=<NllLossBackward0>)\n",
"311 LOSS DIFF: tensor(5.5111, grad_fn=<NllLossBackward0>) tensor(5.4990, grad_fn=<NllLossBackward0>)\n",
"312 LOSS DIFF: tensor(5.4828, grad_fn=<NllLossBackward0>) tensor(5.3784, grad_fn=<NllLossBackward0>)\n",
"313 LOSS DIFF: tensor(5.4165, grad_fn=<NllLossBackward0>) tensor(5.0706, grad_fn=<NllLossBackward0>)\n",
"314 LOSS DIFF: tensor(5.5142, grad_fn=<NllLossBackward0>) tensor(5.4165, grad_fn=<NllLossBackward0>)\n",
"315 LOSS DIFF: tensor(5.3397, grad_fn=<NllLossBackward0>) tensor(5.1207, grad_fn=<NllLossBackward0>)\n",
"316 LOSS DIFF: tensor(5.6205, grad_fn=<NllLossBackward0>) tensor(5.3397, grad_fn=<NllLossBackward0>)\n",
"317 LOSS DIFF: tensor(5.4190, grad_fn=<NllLossBackward0>) tensor(5.3573, grad_fn=<NllLossBackward0>)\n",
"318 LOSS DIFF: tensor(5.2788, grad_fn=<NllLossBackward0>) tensor(5.2728, grad_fn=<NllLossBackward0>)\n",
"319 LOSS DIFF: tensor(5.3070, grad_fn=<NllLossBackward0>) tensor(5.2788, grad_fn=<NllLossBackward0>)\n",
"320 LOSS DIFF: tensor(5.5223, grad_fn=<NllLossBackward0>) tensor(5.3070, grad_fn=<NllLossBackward0>)\n",
"321 LOSS DIFF: tensor(5.3895, grad_fn=<NllLossBackward0>) tensor(5.2946, grad_fn=<NllLossBackward0>)\n",
"322 LOSS DIFF: tensor(5.6954, grad_fn=<NllLossBackward0>) tensor(5.2766, grad_fn=<NllLossBackward0>)\n",
"323 LOSS DIFF: tensor(5.3206, grad_fn=<NllLossBackward0>) tensor(5.2566, grad_fn=<NllLossBackward0>)\n",
"324 LOSS DIFF: tensor(5.4333, grad_fn=<NllLossBackward0>) tensor(5.1247, grad_fn=<NllLossBackward0>)\n",
"325 LOSS DIFF: tensor(5.5108, grad_fn=<NllLossBackward0>) tensor(5.2871, grad_fn=<NllLossBackward0>)\n",
"326 LOSS DIFF: tensor(5.3659, grad_fn=<NllLossBackward0>) tensor(5.2939, grad_fn=<NllLossBackward0>)\n",
"327 LOSS DIFF: tensor(5.4602, grad_fn=<NllLossBackward0>) tensor(5.2214, grad_fn=<NllLossBackward0>)\n",
"328 LOSS DIFF: tensor(5.1405, grad_fn=<NllLossBackward0>) tensor(4.9549, grad_fn=<NllLossBackward0>)\n",
"329 LOSS DIFF: tensor(5.4136, grad_fn=<NllLossBackward0>) tensor(4.9053, grad_fn=<NllLossBackward0>)\n",
"330 LOSS DIFF: tensor(5.7120, grad_fn=<NllLossBackward0>) tensor(5.2294, grad_fn=<NllLossBackward0>)\n",
"331 LOSS DIFF: tensor(5.4775, grad_fn=<NllLossBackward0>) tensor(5.3224, grad_fn=<NllLossBackward0>)\n",
"332 LOSS DIFF: tensor(5.2917, grad_fn=<NllLossBackward0>) tensor(5.1672, grad_fn=<NllLossBackward0>)\n",
"333 LOSS DIFF: tensor(5.3209, grad_fn=<NllLossBackward0>) tensor(5.2917, grad_fn=<NllLossBackward0>)\n",
"334 LOSS DIFF: tensor(5.3745, grad_fn=<NllLossBackward0>) tensor(5.3209, grad_fn=<NllLossBackward0>)\n",
"335 LOSS DIFF: tensor(5.4889, grad_fn=<NllLossBackward0>) tensor(5.3172, grad_fn=<NllLossBackward0>)\n",
"336 LOSS DIFF: tensor(5.3614, grad_fn=<NllLossBackward0>) tensor(5.2868, grad_fn=<NllLossBackward0>)\n",
"337 LOSS DIFF: tensor(5.4456, grad_fn=<NllLossBackward0>) tensor(5.3614, grad_fn=<NllLossBackward0>)\n",
"338 LOSS DIFF: tensor(5.3012, grad_fn=<NllLossBackward0>) tensor(5.2641, grad_fn=<NllLossBackward0>)\n",
"339 LOSS DIFF: tensor(5.5309, grad_fn=<NllLossBackward0>) tensor(5.3012, grad_fn=<NllLossBackward0>)\n",
"340 LOSS DIFF: tensor(5.2953, grad_fn=<NllLossBackward0>) tensor(5.1931, grad_fn=<NllLossBackward0>)\n",
"341 LOSS DIFF: tensor(5.3908, grad_fn=<NllLossBackward0>) tensor(5.2953, grad_fn=<NllLossBackward0>)\n",
"342 LOSS DIFF: tensor(5.5060, grad_fn=<NllLossBackward0>) tensor(5.1682, grad_fn=<NllLossBackward0>)\n",
"700 tensor(5.1404, grad_fn=<NllLossBackward0>)\n",
"343 LOSS DIFF: tensor(5.3184, grad_fn=<NllLossBackward0>) tensor(4.8281, grad_fn=<NllLossBackward0>)\n",
"344 LOSS DIFF: tensor(5.4549, grad_fn=<NllLossBackward0>) tensor(5.3184, grad_fn=<NllLossBackward0>)\n",
"345 LOSS DIFF: tensor(5.4196, grad_fn=<NllLossBackward0>) tensor(5.4127, grad_fn=<NllLossBackward0>)\n",
"346 LOSS DIFF: tensor(5.4480, grad_fn=<NllLossBackward0>) tensor(5.4196, grad_fn=<NllLossBackward0>)\n",
"347 LOSS DIFF: tensor(5.5778, grad_fn=<NllLossBackward0>) tensor(5.3616, grad_fn=<NllLossBackward0>)\n",
"348 LOSS DIFF: tensor(5.2266, grad_fn=<NllLossBackward0>) tensor(5.1052, grad_fn=<NllLossBackward0>)\n",
"349 LOSS DIFF: tensor(5.4058, grad_fn=<NllLossBackward0>) tensor(5.2266, grad_fn=<NllLossBackward0>)\n",
"350 LOSS DIFF: tensor(5.2772, grad_fn=<NllLossBackward0>) tensor(5.1653, grad_fn=<NllLossBackward0>)\n",
"351 LOSS DIFF: tensor(5.3236, grad_fn=<NllLossBackward0>) tensor(5.2772, grad_fn=<NllLossBackward0>)\n",
"352 LOSS DIFF: tensor(5.3818, grad_fn=<NllLossBackward0>) tensor(5.3236, grad_fn=<NllLossBackward0>)\n",
"353 LOSS DIFF: tensor(5.1957, grad_fn=<NllLossBackward0>) tensor(5.1122, grad_fn=<NllLossBackward0>)\n",
"354 LOSS DIFF: tensor(5.2754, grad_fn=<NllLossBackward0>) tensor(5.1957, grad_fn=<NllLossBackward0>)\n",
"355 LOSS DIFF: tensor(5.4069, grad_fn=<NllLossBackward0>) tensor(5.2754, grad_fn=<NllLossBackward0>)\n",
"356 LOSS DIFF: tensor(5.3361, grad_fn=<NllLossBackward0>) tensor(5.1708, grad_fn=<NllLossBackward0>)\n",
"357 LOSS DIFF: tensor(5.5310, grad_fn=<NllLossBackward0>) tensor(5.2320, grad_fn=<NllLossBackward0>)\n",
"358 LOSS DIFF: tensor(5.5582, grad_fn=<NllLossBackward0>) tensor(5.3281, grad_fn=<NllLossBackward0>)\n",
"359 LOSS DIFF: tensor(5.4403, grad_fn=<NllLossBackward0>) tensor(5.0958, grad_fn=<NllLossBackward0>)\n",
"360 LOSS DIFF: tensor(5.3855, grad_fn=<NllLossBackward0>) tensor(5.3547, grad_fn=<NllLossBackward0>)\n",
"361 LOSS DIFF: tensor(5.4341, grad_fn=<NllLossBackward0>) tensor(5.3628, grad_fn=<NllLossBackward0>)\n",
"362 LOSS DIFF: tensor(5.4064, grad_fn=<NllLossBackward0>) tensor(5.3641, grad_fn=<NllLossBackward0>)\n",
"363 LOSS DIFF: tensor(5.4232, grad_fn=<NllLossBackward0>) tensor(5.4064, grad_fn=<NllLossBackward0>)\n",
"364 LOSS DIFF: tensor(5.4929, grad_fn=<NllLossBackward0>) tensor(5.2922, grad_fn=<NllLossBackward0>)\n",
"365 LOSS DIFF: tensor(5.2788, grad_fn=<NllLossBackward0>) tensor(5.1483, grad_fn=<NllLossBackward0>)\n",
"366 LOSS DIFF: tensor(5.3894, grad_fn=<NllLossBackward0>) tensor(5.1464, grad_fn=<NllLossBackward0>)\n",
"367 LOSS DIFF: tensor(5.5410, grad_fn=<NllLossBackward0>) tensor(5.3032, grad_fn=<NllLossBackward0>)\n",
"368 LOSS DIFF: tensor(5.4745, grad_fn=<NllLossBackward0>) tensor(5.3954, grad_fn=<NllLossBackward0>)\n",
"369 LOSS DIFF: tensor(5.4002, grad_fn=<NllLossBackward0>) tensor(5.2852, grad_fn=<NllLossBackward0>)\n",
"370 LOSS DIFF: tensor(5.5121, grad_fn=<NllLossBackward0>) tensor(5.1010, grad_fn=<NllLossBackward0>)\n",
"371 LOSS DIFF: tensor(5.1770, grad_fn=<NllLossBackward0>) tensor(4.9924, grad_fn=<NllLossBackward0>)\n",
"372 LOSS DIFF: tensor(5.2602, grad_fn=<NllLossBackward0>) tensor(5.0630, grad_fn=<NllLossBackward0>)\n",
"373 LOSS DIFF: tensor(5.1854, grad_fn=<NllLossBackward0>) tensor(5.1847, grad_fn=<NllLossBackward0>)\n",
"374 LOSS DIFF: tensor(5.4752, grad_fn=<NllLossBackward0>) tensor(5.1854, grad_fn=<NllLossBackward0>)\n",
"375 LOSS DIFF: tensor(5.3940, grad_fn=<NllLossBackward0>) tensor(4.9471, grad_fn=<NllLossBackward0>)\n",
"376 LOSS DIFF: tensor(5.4444, grad_fn=<NllLossBackward0>) tensor(5.3940, grad_fn=<NllLossBackward0>)\n",
"377 LOSS DIFF: tensor(5.2639, grad_fn=<NllLossBackward0>) tensor(5.2434, grad_fn=<NllLossBackward0>)\n",
"378 LOSS DIFF: tensor(5.5010, grad_fn=<NllLossBackward0>) tensor(5.2639, grad_fn=<NllLossBackward0>)\n",
"379 LOSS DIFF: tensor(5.3871, grad_fn=<NllLossBackward0>) tensor(5.2697, grad_fn=<NllLossBackward0>)\n",
"380 LOSS DIFF: tensor(5.5319, grad_fn=<NllLossBackward0>) tensor(5.2951, grad_fn=<NllLossBackward0>)\n",
"381 LOSS DIFF: tensor(5.2672, grad_fn=<NllLossBackward0>) tensor(5.0885, grad_fn=<NllLossBackward0>)\n",
"382 LOSS DIFF: tensor(5.3262, grad_fn=<NllLossBackward0>) tensor(5.2672, grad_fn=<NllLossBackward0>)\n",
"383 LOSS DIFF: tensor(5.4015, grad_fn=<NllLossBackward0>) tensor(5.3262, grad_fn=<NllLossBackward0>)\n",
"384 LOSS DIFF: tensor(5.2618, grad_fn=<NllLossBackward0>) tensor(5.2335, grad_fn=<NllLossBackward0>)\n",
"385 LOSS DIFF: tensor(5.3040, grad_fn=<NllLossBackward0>) tensor(5.2618, grad_fn=<NllLossBackward0>)\n",
"386 LOSS DIFF: tensor(5.2459, grad_fn=<NllLossBackward0>) tensor(5.0806, grad_fn=<NllLossBackward0>)\n",
"387 LOSS DIFF: tensor(5.3756, grad_fn=<NllLossBackward0>) tensor(5.2459, grad_fn=<NllLossBackward0>)\n",
"388 LOSS DIFF: tensor(5.3504, grad_fn=<NllLossBackward0>) tensor(5.1054, grad_fn=<NllLossBackward0>)\n",
"389 LOSS DIFF: tensor(5.2258, grad_fn=<NllLossBackward0>) tensor(5.1519, grad_fn=<NllLossBackward0>)\n",
"390 LOSS DIFF: tensor(5.2802, grad_fn=<NllLossBackward0>) tensor(5.2258, grad_fn=<NllLossBackward0>)\n",
"391 LOSS DIFF: tensor(5.3461, grad_fn=<NllLossBackward0>) tensor(5.2802, grad_fn=<NllLossBackward0>)\n",
"392 LOSS DIFF: tensor(5.3227, grad_fn=<NllLossBackward0>) tensor(5.2572, grad_fn=<NllLossBackward0>)\n",
"800 tensor(5.1938, grad_fn=<NllLossBackward0>)\n",
"393 LOSS DIFF: tensor(5.4509, grad_fn=<NllLossBackward0>) tensor(5.1938, grad_fn=<NllLossBackward0>)\n",
"394 LOSS DIFF: tensor(5.1965, grad_fn=<NllLossBackward0>) tensor(5.1726, grad_fn=<NllLossBackward0>)\n",
"395 LOSS DIFF: tensor(5.3317, grad_fn=<NllLossBackward0>) tensor(5.1965, grad_fn=<NllLossBackward0>)\n",
"396 LOSS DIFF: tensor(5.2442, grad_fn=<NllLossBackward0>) tensor(5.0167, grad_fn=<NllLossBackward0>)\n",
"397 LOSS DIFF: tensor(5.2592, grad_fn=<NllLossBackward0>) tensor(5.2442, grad_fn=<NllLossBackward0>)\n",
"398 LOSS DIFF: tensor(5.2272, grad_fn=<NllLossBackward0>) tensor(5.1738, grad_fn=<NllLossBackward0>)\n",
"399 LOSS DIFF: tensor(5.2863, grad_fn=<NllLossBackward0>) tensor(5.2272, grad_fn=<NllLossBackward0>)\n",
"400 LOSS DIFF: tensor(5.3143, grad_fn=<NllLossBackward0>) tensor(5.2863, grad_fn=<NllLossBackward0>)\n",
"401 LOSS DIFF: tensor(5.0616, grad_fn=<NllLossBackward0>) tensor(5.0013, grad_fn=<NllLossBackward0>)\n",
"402 LOSS DIFF: tensor(5.4039, grad_fn=<NllLossBackward0>) tensor(5.0616, grad_fn=<NllLossBackward0>)\n",
"403 LOSS DIFF: tensor(5.3913, grad_fn=<NllLossBackward0>) tensor(4.9984, grad_fn=<NllLossBackward0>)\n",
"404 LOSS DIFF: tensor(5.2658, grad_fn=<NllLossBackward0>) tensor(5.2179, grad_fn=<NllLossBackward0>)\n",
"405 LOSS DIFF: tensor(5.2846, grad_fn=<NllLossBackward0>) tensor(5.2658, grad_fn=<NllLossBackward0>)\n",
"406 LOSS DIFF: tensor(5.3590, grad_fn=<NllLossBackward0>) tensor(5.2846, grad_fn=<NllLossBackward0>)\n",
"407 LOSS DIFF: tensor(5.4706, grad_fn=<NllLossBackward0>) tensor(5.0496, grad_fn=<NllLossBackward0>)\n",
"408 LOSS DIFF: tensor(5.6955, grad_fn=<NllLossBackward0>) tensor(5.4706, grad_fn=<NllLossBackward0>)\n",
"409 LOSS DIFF: tensor(5.4540, grad_fn=<NllLossBackward0>) tensor(4.9054, grad_fn=<NllLossBackward0>)\n",
"410 LOSS DIFF: tensor(5.1788, grad_fn=<NllLossBackward0>) tensor(5.0048, grad_fn=<NllLossBackward0>)\n",
"411 LOSS DIFF: tensor(5.2213, grad_fn=<NllLossBackward0>) tensor(5.1788, grad_fn=<NllLossBackward0>)\n",
"412 LOSS DIFF: tensor(5.2282, grad_fn=<NllLossBackward0>) tensor(5.2213, grad_fn=<NllLossBackward0>)\n",
"413 LOSS DIFF: tensor(5.4138, grad_fn=<NllLossBackward0>) tensor(5.1972, grad_fn=<NllLossBackward0>)\n",
"414 LOSS DIFF: tensor(5.3300, grad_fn=<NllLossBackward0>) tensor(4.9654, grad_fn=<NllLossBackward0>)\n",
"415 LOSS DIFF: tensor(5.0692, grad_fn=<NllLossBackward0>) tensor(4.9775, grad_fn=<NllLossBackward0>)\n",
"416 LOSS DIFF: tensor(5.1780, grad_fn=<NllLossBackward0>) tensor(5.0692, grad_fn=<NllLossBackward0>)\n",
"417 LOSS DIFF: tensor(5.4131, grad_fn=<NllLossBackward0>) tensor(5.1780, grad_fn=<NllLossBackward0>)\n",
"418 LOSS DIFF: tensor(5.5625, grad_fn=<NllLossBackward0>) tensor(5.4131, grad_fn=<NllLossBackward0>)\n",
"419 LOSS DIFF: tensor(5.1862, grad_fn=<NllLossBackward0>) tensor(5.1502, grad_fn=<NllLossBackward0>)\n",
"420 LOSS DIFF: tensor(5.2858, grad_fn=<NllLossBackward0>) tensor(5.1862, grad_fn=<NllLossBackward0>)\n",
"421 LOSS DIFF: tensor(5.2607, grad_fn=<NllLossBackward0>) tensor(5.2394, grad_fn=<NllLossBackward0>)\n",
"422 LOSS DIFF: tensor(5.4085, grad_fn=<NllLossBackward0>) tensor(5.2607, grad_fn=<NllLossBackward0>)\n",
"423 LOSS DIFF: tensor(5.3268, grad_fn=<NllLossBackward0>) tensor(5.3040, grad_fn=<NllLossBackward0>)\n",
"424 LOSS DIFF: tensor(5.4477, grad_fn=<NllLossBackward0>) tensor(5.3268, grad_fn=<NllLossBackward0>)\n",
"425 LOSS DIFF: tensor(5.3032, grad_fn=<NllLossBackward0>) tensor(5.2228, grad_fn=<NllLossBackward0>)\n",
"426 LOSS DIFF: tensor(5.4339, grad_fn=<NllLossBackward0>) tensor(5.2517, grad_fn=<NllLossBackward0>)\n",
"427 LOSS DIFF: tensor(5.3693, grad_fn=<NllLossBackward0>) tensor(5.0677, grad_fn=<NllLossBackward0>)\n",
"428 LOSS DIFF: tensor(5.2379, grad_fn=<NllLossBackward0>) tensor(5.2100, grad_fn=<NllLossBackward0>)\n",
"429 LOSS DIFF: tensor(5.2541, grad_fn=<NllLossBackward0>) tensor(5.2379, grad_fn=<NllLossBackward0>)\n",
"430 LOSS DIFF: tensor(5.2259, grad_fn=<NllLossBackward0>) tensor(5.1291, grad_fn=<NllLossBackward0>)\n",
"431 LOSS DIFF: tensor(5.2455, grad_fn=<NllLossBackward0>) tensor(5.1523, grad_fn=<NllLossBackward0>)\n",
"432 LOSS DIFF: tensor(5.3854, grad_fn=<NllLossBackward0>) tensor(5.2147, grad_fn=<NllLossBackward0>)\n",
"433 LOSS DIFF: tensor(5.2580, grad_fn=<NllLossBackward0>) tensor(5.1674, grad_fn=<NllLossBackward0>)\n",
"434 LOSS DIFF: tensor(5.3666, grad_fn=<NllLossBackward0>) tensor(5.2580, grad_fn=<NllLossBackward0>)\n",
"435 LOSS DIFF: tensor(5.3990, grad_fn=<NllLossBackward0>) tensor(5.2895, grad_fn=<NllLossBackward0>)\n",
"436 LOSS DIFF: tensor(5.4095, grad_fn=<NllLossBackward0>) tensor(5.2050, grad_fn=<NllLossBackward0>)\n",
"437 LOSS DIFF: tensor(5.3580, grad_fn=<NllLossBackward0>) tensor(5.1551, grad_fn=<NllLossBackward0>)\n",
"438 LOSS DIFF: tensor(5.5038, grad_fn=<NllLossBackward0>) tensor(5.2894, grad_fn=<NllLossBackward0>)\n",
"439 LOSS DIFF: tensor(5.3097, grad_fn=<NllLossBackward0>) tensor(5.1047, grad_fn=<NllLossBackward0>)\n",
"440 LOSS DIFF: tensor(5.4076, grad_fn=<NllLossBackward0>) tensor(5.3097, grad_fn=<NllLossBackward0>)\n",
"441 LOSS DIFF: tensor(5.3938, grad_fn=<NllLossBackward0>) tensor(5.2490, grad_fn=<NllLossBackward0>)\n",
"442 LOSS DIFF: tensor(5.6185, grad_fn=<NllLossBackward0>) tensor(5.3873, grad_fn=<NllLossBackward0>)\n",
"900 tensor(5.2894, grad_fn=<NllLossBackward0>)\n",
"443 LOSS DIFF: tensor(5.2605, grad_fn=<NllLossBackward0>) tensor(5.0513, grad_fn=<NllLossBackward0>)\n",
"444 LOSS DIFF: tensor(5.5549, grad_fn=<NllLossBackward0>) tensor(5.2605, grad_fn=<NllLossBackward0>)\n",
"445 LOSS DIFF: tensor(5.1775, grad_fn=<NllLossBackward0>) tensor(5.1379, grad_fn=<NllLossBackward0>)\n",
"446 LOSS DIFF: tensor(5.3998, grad_fn=<NllLossBackward0>) tensor(5.1775, grad_fn=<NllLossBackward0>)\n",
"447 LOSS DIFF: tensor(5.4069, grad_fn=<NllLossBackward0>) tensor(5.3169, grad_fn=<NllLossBackward0>)\n",
"448 LOSS DIFF: tensor(5.2558, grad_fn=<NllLossBackward0>) tensor(4.9919, grad_fn=<NllLossBackward0>)\n",
"449 LOSS DIFF: tensor(5.4139, grad_fn=<NllLossBackward0>) tensor(5.2558, grad_fn=<NllLossBackward0>)\n",
"450 LOSS DIFF: tensor(5.4725, grad_fn=<NllLossBackward0>) tensor(5.4139, grad_fn=<NllLossBackward0>)\n",
"451 LOSS DIFF: tensor(5.3004, grad_fn=<NllLossBackward0>) tensor(5.1489, grad_fn=<NllLossBackward0>)\n",
"452 LOSS DIFF: tensor(5.3943, grad_fn=<NllLossBackward0>) tensor(5.3004, grad_fn=<NllLossBackward0>)\n",
"453 LOSS DIFF: tensor(5.2652, grad_fn=<NllLossBackward0>) tensor(5.0230, grad_fn=<NllLossBackward0>)\n",
"454 LOSS DIFF: tensor(5.3982, grad_fn=<NllLossBackward0>) tensor(5.2229, grad_fn=<NllLossBackward0>)\n",
"455 LOSS DIFF: tensor(5.4184, grad_fn=<NllLossBackward0>) tensor(5.2137, grad_fn=<NllLossBackward0>)\n",
"456 LOSS DIFF: tensor(5.6858, grad_fn=<NllLossBackward0>) tensor(5.1474, grad_fn=<NllLossBackward0>)\n",
"457 LOSS DIFF: tensor(5.3886, grad_fn=<NllLossBackward0>) tensor(5.1649, grad_fn=<NllLossBackward0>)\n",
"458 LOSS DIFF: tensor(5.3129, grad_fn=<NllLossBackward0>) tensor(5.2705, grad_fn=<NllLossBackward0>)\n",
"459 LOSS DIFF: tensor(5.4430, grad_fn=<NllLossBackward0>) tensor(5.0307, grad_fn=<NllLossBackward0>)\n",
"460 LOSS DIFF: tensor(5.4555, grad_fn=<NllLossBackward0>) tensor(5.3132, grad_fn=<NllLossBackward0>)\n",
"461 LOSS DIFF: tensor(5.2490, grad_fn=<NllLossBackward0>) tensor(4.9971, grad_fn=<NllLossBackward0>)\n",
"462 LOSS DIFF: tensor(5.4743, grad_fn=<NllLossBackward0>) tensor(5.1878, grad_fn=<NllLossBackward0>)\n",
"463 LOSS DIFF: tensor(5.2897, grad_fn=<NllLossBackward0>) tensor(4.9685, grad_fn=<NllLossBackward0>)\n",
"464 LOSS DIFF: tensor(5.3322, grad_fn=<NllLossBackward0>) tensor(5.1790, grad_fn=<NllLossBackward0>)\n",
"465 LOSS DIFF: tensor(5.2013, grad_fn=<NllLossBackward0>) tensor(5.0778, grad_fn=<NllLossBackward0>)\n",
"466 LOSS DIFF: tensor(5.2347, grad_fn=<NllLossBackward0>) tensor(5.0395, grad_fn=<NllLossBackward0>)\n",
"467 LOSS DIFF: tensor(5.2472, grad_fn=<NllLossBackward0>) tensor(5.2347, grad_fn=<NllLossBackward0>)\n",
"468 LOSS DIFF: tensor(5.3672, grad_fn=<NllLossBackward0>) tensor(5.1695, grad_fn=<NllLossBackward0>)\n",
"469 LOSS DIFF: tensor(5.3892, grad_fn=<NllLossBackward0>) tensor(5.3672, grad_fn=<NllLossBackward0>)\n",
"470 LOSS DIFF: tensor(5.1295, grad_fn=<NllLossBackward0>) tensor(5.1241, grad_fn=<NllLossBackward0>)\n",
"471 LOSS DIFF: tensor(5.2935, grad_fn=<NllLossBackward0>) tensor(5.1295, grad_fn=<NllLossBackward0>)\n",
"472 LOSS DIFF: tensor(5.4916, grad_fn=<NllLossBackward0>) tensor(5.2935, grad_fn=<NllLossBackward0>)\n",
"473 LOSS DIFF: tensor(5.2570, grad_fn=<NllLossBackward0>) tensor(5.0166, grad_fn=<NllLossBackward0>)\n",
"474 LOSS DIFF: tensor(5.3124, grad_fn=<NllLossBackward0>) tensor(5.1387, grad_fn=<NllLossBackward0>)\n",
"475 LOSS DIFF: tensor(5.2445, grad_fn=<NllLossBackward0>) tensor(5.1581, grad_fn=<NllLossBackward0>)\n",
"476 LOSS DIFF: tensor(5.4986, grad_fn=<NllLossBackward0>) tensor(5.2445, grad_fn=<NllLossBackward0>)\n",
"477 LOSS DIFF: tensor(5.2073, grad_fn=<NllLossBackward0>) tensor(5.1772, grad_fn=<NllLossBackward0>)\n",
"478 LOSS DIFF: tensor(5.2213, grad_fn=<NllLossBackward0>) tensor(5.0682, grad_fn=<NllLossBackward0>)\n",
"479 LOSS DIFF: tensor(5.2317, grad_fn=<NllLossBackward0>) tensor(5.2213, grad_fn=<NllLossBackward0>)\n",
"480 LOSS DIFF: tensor(5.2169, grad_fn=<NllLossBackward0>) tensor(4.8229, grad_fn=<NllLossBackward0>)\n",
"481 LOSS DIFF: tensor(5.4192, grad_fn=<NllLossBackward0>) tensor(5.2169, grad_fn=<NllLossBackward0>)\n",
"482 LOSS DIFF: tensor(5.3481, grad_fn=<NllLossBackward0>) tensor(5.1884, grad_fn=<NllLossBackward0>)\n",
"483 LOSS DIFF: tensor(5.4329, grad_fn=<NllLossBackward0>) tensor(5.3481, grad_fn=<NllLossBackward0>)\n",
"484 LOSS DIFF: tensor(5.1482, grad_fn=<NllLossBackward0>) tensor(4.8979, grad_fn=<NllLossBackward0>)\n",
"485 LOSS DIFF: tensor(5.3562, grad_fn=<NllLossBackward0>) tensor(5.1482, grad_fn=<NllLossBackward0>)\n",
"486 LOSS DIFF: tensor(5.5739, grad_fn=<NllLossBackward0>) tensor(5.3562, grad_fn=<NllLossBackward0>)\n",
"487 LOSS DIFF: tensor(5.0749, grad_fn=<NllLossBackward0>) tensor(4.9742, grad_fn=<NllLossBackward0>)\n",
"488 LOSS DIFF: tensor(5.2301, grad_fn=<NllLossBackward0>) tensor(5.0749, grad_fn=<NllLossBackward0>)\n",
"489 LOSS DIFF: tensor(5.4543, grad_fn=<NllLossBackward0>) tensor(5.2301, grad_fn=<NllLossBackward0>)\n",
"490 LOSS DIFF: tensor(5.2210, grad_fn=<NllLossBackward0>) tensor(4.9663, grad_fn=<NllLossBackward0>)\n",
"491 LOSS DIFF: tensor(5.3469, grad_fn=<NllLossBackward0>) tensor(5.2210, grad_fn=<NllLossBackward0>)\n",
"1000 tensor(5.4116, grad_fn=<NllLossBackward0>)\n",
"492 LOSS DIFF: tensor(5.4116, grad_fn=<NllLossBackward0>) tensor(5.2156, grad_fn=<NllLossBackward0>)\n",
"493 LOSS DIFF: tensor(5.1600, grad_fn=<NllLossBackward0>) tensor(4.9976, grad_fn=<NllLossBackward0>)\n",
"494 LOSS DIFF: tensor(5.2190, grad_fn=<NllLossBackward0>) tensor(5.1102, grad_fn=<NllLossBackward0>)\n",
"495 LOSS DIFF: tensor(5.1974, grad_fn=<NllLossBackward0>) tensor(5.0123, grad_fn=<NllLossBackward0>)\n",
"496 LOSS DIFF: tensor(5.3085, grad_fn=<NllLossBackward0>) tensor(5.1974, grad_fn=<NllLossBackward0>)\n",
"497 LOSS DIFF: tensor(5.3090, grad_fn=<NllLossBackward0>) tensor(5.3085, grad_fn=<NllLossBackward0>)\n",
"498 LOSS DIFF: tensor(5.3978, grad_fn=<NllLossBackward0>) tensor(5.0467, grad_fn=<NllLossBackward0>)\n",
"499 LOSS DIFF: tensor(5.3369, grad_fn=<NllLossBackward0>) tensor(5.0919, grad_fn=<NllLossBackward0>)\n",
"500 LOSS DIFF: tensor(5.3036, grad_fn=<NllLossBackward0>) tensor(5.2151, grad_fn=<NllLossBackward0>)\n"
]
}
],
"source": [
"device = 'cpu'\n",
"model = TrigramNNModel(VOCAB_SIZE, EMBED_SIZE).to(device)\n",
"data = DataLoader(train_dataset, batch_size=2_000)\n",
"optimizer = torch.optim.Adam(model.parameters())\n",
"criterion = torch.nn.NLLLoss()\n",
"\n",
"loss_track = []\n",
"last_loss = 1_000\n",
"trigger_count = 0\n",
"\n",
"model.train()\n",
"step = 0\n",
"for x, y in data:\n",
" x[0] = x[0].to(device)\n",
" x[1] = x[1].to(device)\n",
" y = y.to(device)\n",
" optimizer.zero_grad()\n",
" ypredicted = model(x)\n",
" loss = criterion(torch.log(ypredicted), y)\n",
" if step % 100 == 0:\n",
" print(step, loss)\n",
" step += 1\n",
" loss.backward()\n",
" optimizer.step()\n",
"\n",
" if loss > last_loss:\n",
" trigger_count += 1 \n",
" print(trigger_count, 'LOSS DIFF:', loss, last_loss)\n",
"\n",
" if trigger_count >= 500:\n",
" break\n",
"\n",
" loss_track.append(loss)\n",
" last_loss = loss"
]
},
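{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Quick sanity check: NLLLoss with its default mean reduction is the average\n",
"# negative log-likelihood per word, so exp(loss) is the model's perplexity\n",
"# (about exp(5.3) ~ 200 against the 10,000-word vocabulary).\n",
"import math\n",
"\n",
"print('final loss:', loss_track[-1])\n",
"print('perplexity:', math.exp(loss_track[-1]))"
]
},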
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAhYAAAGdCAYAAABO2DpVAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjcuMSwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/bCgiHAAAACXBIWXMAAA9hAAAPYQGoP6dpAABdaElEQVR4nO3dd3hTZfsH8G+SpumgAygUCmXvDbKXoggiL+JWRAVxi+t1veLe4PwpvoobF+rr3ooIyJC9KSjDsvfqnknO74826XNOzknOSU+SNv1+rstLmvnkNM25cz/3cz8WSZIkEBEREZnAGukBEBERUfRgYEFERESmYWBBREREpmFgQURERKZhYEFERESmYWBBREREpmFgQURERKZhYEFERESmiQn3E7rdbhw8eBBJSUmwWCzhfnoiIiIKgiRJyM/PR0ZGBqxW7bxE2AOLgwcPIjMzM9xPS0RERCbYt28fmjdvrnl92AOLpKQkABUDS05ODvfTExERURDy8vKQmZnpPY9rCXtg4Zn+SE5OZmBBRERUywQqY2DxJhEREZmGgQURERGZhoEFERERmYaBBREREZmGgQURERGZhoEFERERmYaBBREREZmGgQURERGZhoEFERERmYaBBREREZmGgQURERGZhoEFERERmSbsm5CFyku/bUNOcTluHdEOjZPjIj0cIiKiOilqMhafrt6HD5fvwbGC0kgPhYiIqM6KmsAi1lbxUsqc7giPhIiIqO6KmsDCEcPAgoiIKNKiJrCI9QQWLgYWREREkRJ9gQUzFkRERBETPYEFayyIiIgiLnoCC06FEBERRVzUBRalzFgQERFFTPQEFpVTIeXMWBAREUVM9AQWLN4kIiKKOAYWREREZJqoCSzYIIuIiCjyoiaw8C43ZY0FERFRxERPYMGMBRERUcRFXWDB5aZERESREz2Bhc0GgFMhREREkRQ9gQWnQoiIiCLOcGCRn5+PO++8Ey1btkR8fDwGDx6M1atXh2JshjCwICIiijzDgcV1112HefPm4aOPPsLmzZsxatQojBw5EgcOHAjF+HRjYEFERBR5hgKL4uJifPXVV3juuecwfPhwtGvXDo899hjatWuHWbNmhWqMuji43JSIiCjiYozc2Ol0wuVyIS4uTnZ5fHw8li5dqnqf0tJSlJaWen/Oy8sLYpiBMWNBREQUeYYyFklJSRg0aBCefPJJHDx4EC6XCx9//DGWL1+OQ4cOqd5n+vTpSElJ8f6XmZlpysCVGFgQERFFnuEai48++giSJKFZs2ZwOByYOXMmJkyYAKtV/aGmTZuG3Nxc73/79u2r9qDVeDpvlnIqhIiIKGIMTYUAQNu2bbFo0SIUFhYiLy8PTZs2xWWXXYY2bdqo3t7hcMDhcFR7oIEwY0FERBR5QfexSExMRNOmTXHq1CnMnTsX48ePN3NchlUFFq6IjoOIiKguM5yxmDt3LiRJQseOHbFz507ce++96NSpE6655ppQjE83b2DBqRAiIqKIMZyxyM3NxdSpU9GpUydcffXVGDp0KObOnQu73R6K8enm3d2UUyFEREQRYzhjcemll+LSSy8NxViqxcEaCyIioojjXiFERERkmugLLFhjQUREFDHRE1hU1liUuyS43VKER0NERFQ3RU9gEVP1Upi1ICIiigwGFkRERGSa6AksbEJgwQJOIiKiiIiawMJisbCXBRERUYRFTWABcMkpERFRpEVnYMEaCyIiooiIrsCCUyFEREQRFV2BRWXGopSBBRERUUREZWDBjAUREVFkRFdgYWONBRERUSRFV2DBjAUREVFEMbAgIiIi00RVYOHwLjd1RXgkREREdVNUBRZcbkpERBRZ0RVYcCqEiIgooqIysGAfCyIiosiIqsDCzuWmREREERVVgQWnQoiIiCIrugKLyoxFOTMWREREERFVgYXdZgEAOF1ShEdCRERUN0VVYGGzVrwcp5uBBRERUSREVWBRlbHgVAgREVEkRFVgEVOZsShnxoKIiCgioiuwqMxYuFhjQUREFBHRFVhYKwKLcjenQoiIiCIhugKLyuWmXBVCREQUGdEVWFRmLFyssSAiIoqI6AosKmss2CCLiIgoMqIqsLCzjwUREVFERVVgYaucCmFgQUREFBlRFVjEsEEWERFRREVVYGHnqhAiIqKIiqrAgn0siIiIIiu6Agsbl5sSERFFUnQFFp69QjgVQkREFBHRFViweJOIiCiioiuwqMxYcCqEiIgoMqIrsLCxeJOIiCiSoiqw8HbeZI0FERFRRERVYMHOm0RERJEVVYGFncWbREREERVVgUUMO28SERFFVHQFFpVTIWXMWBAREUVEVAUWSXExAIBSpxvlDC6IiIjCLsoCC7v337nF5REcCRERUd0UVYGFzWrxZi1yihhYEBERhVtUBRYAkJpQkbXYuC8nsgMhIiKqg6IusNh3shgAcPcXGyM8EiIioron6gILIiIiipyoCyz6tEiN9BCIiIjqrKgLLJ4Y3w0A0DAxNsIjISIiqnuiLrCIj7UBAPtYEBERRUDUBRaxlW29y9nWm4iIKOyiLrCwe/YLcTNjQUREFG5RF1jEVO5wWu6SIEnMWhAREYVT1AUWnowFwOkQIiKicIu6wCJWCCw4HUJERBReURdYeKZCAKDcyYwFERFROEVfYGGtCizKuOSUiIgorKIusLBYLN7pEE6FEBERhVfUBRaAsDKEUyFERERhZSiwcLlcePjhh9G6dWvEx8ejbdu2ePLJJ2vcsk7PyhBOhRAREYVXjJEbP/vss5g1axY++OADdO3aFWvWrME111yDlJQU3H777aEao2FskkVERBQZhjIWy5Ytw/jx4zF27Fi0atUKF198MUaNGoVVq1aFanxBsVdOhbz42/Yal00hIiKKZoYCi8GDB2P+/PnYvn07AGDjxo1YunQpxowZE5LBBetQbgkAYN7WI9h5tCDCoyEiIqo7DE2F3H///cjLy0OnTp1gs9ngcrnw9NNPY+LEiZr3KS0tRWlpqffnvLy84EcbhMIyV1ifj4iIqC4zlLH4/PPPMWfOHHzyySdYt24dPvjgA7zwwgv44IMPNO8zffp0pKSkeP/LzMys9qCN4PbpRERE4WORDBQhZGZm4v7778fUqVO9lz311FP4+OOP8ffff6veRy1jkZmZidzcXCQnJ1dj6Npa3f+T999zrhuAIe3SQvI8REREdUVeXh5SUlICnr8NZSyKiopgtcrvYrPZ4Paz+sLhcCA5OVn2X6i9O6mv999cckpERBQ+hmosxo0bh6effhotWrRA165dsX79erz00kuYMmVKqMYXlLM6p6N3i1Ss35uDMicDCyIionAxFFi8+uqrePjhh3HLLbfg6NGjyMjIwI033ohHHnkkVOMLmqetN2ssiIiIwsdQYJGUlISXX34ZL7/8coiGY57YmMrum8xYEBERhU1U7hUCMGNBREQUCVEbWFTtF8LOm0REROEStYEFp0KIiIjCL2oDCzunQoiIiMIuagMLZiyIiIjCL3oDi8odTpmxICIiCp+oDSyqijcZWBARE
YVL1AYWnAohIiIKv6gNLFi8SUREFH5RG1gwY0FERBR+URtY2CuLN51skEVERBQ2URtYxFRu7+50M7AgIiIKl+gNLDwZCzenQoiIiMIlegMLT8aCUyFERERhE8WBhSdjwcCCiIgoXKI3sLAxsCAiIgq3qA0sbJ6MBftYEBERhU3UBhaeGotl/5zApW8ux76TRREeERERUfSL3sCicioEAFbtOolHv98SwdEQERHVDdEbWFgtsp9zisoiNBIiIqK6I3oDC5v8pdltUftSiYiIaoyoPdsqMxaevUOIiIgodKL2bGtTBhbMWBAREYVc1J5t7TaL4ueofalEREQ1RtSebW1WRY0Fp0KIiIhCLmrPtsoaC2UGg4iIiMwXvYGFjTUWRERE4Ra1Z9sYxVSIspiTiIiIzBfFgYU8kHBL3IyMiIgo1KI2sFBmKMpdDCyIiIhCLWoDC+XyUu5ySkREFHpRG1j4ZCzczFgQERGFWtQGFsrlpT9tOgQ3gwsiIqKQitrAQm0VyMwFOwAAx/JLUebk1Ag
"text/plain": [
"<Figure size 640x480 with 1 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"plt.plot([t.detach().numpy() for t in loss_track])\n",
"plt.show()"
]
},
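{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Optional: the raw curve is noisy from batch to batch; a simple moving\n",
"# average makes the overall trend easier to read.\n",
"import numpy as np\n",
"\n",
"window = 50\n",
"if len(loss_track) > window:\n",
"    smoothed = np.convolve(loss_track, np.ones(window) / window, mode='valid')\n",
"    plt.plot(smoothed)\n",
"    plt.xlabel('step')\n",
"    plt.ylabel(f'loss ({window}-step moving average)')\n",
"    plt.show()"
]
},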
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"torch.save(model.state_dict(), f'model_trigram-EMBED_SIZE={EMBED_SIZE}.bin')"
]
},
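{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Optional sketch for a later session: rebuild the model and load the weights\n",
"# saved above (assumes the checkpoint file from the previous cell exists).\n",
"model = TrigramNNModel(VOCAB_SIZE, EMBED_SIZE).to(device)\n",
"model.load_state_dict(torch.load(f'model_trigram-EMBED_SIZE={EMBED_SIZE}.bin'))\n",
"_ = model.eval()"
]
},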
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"vocab_unique = set(vocab.get_stoi().keys())"
]
},
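{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Defensive step: torchtext's Vocab raises on lookups of unknown tokens unless\n",
"# a default index is set, so route anything unexpected to <unk> as well.\n",
"vocab.set_default_index(vocab['<unk>'])"
]
},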
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"C:\\Users\\micha\\AppData\\Local\\Temp\\ipykernel_14016\\2809838665.py:15: UserWarning: Implicit dimension choice for softmax has been deprecated. Change the call to include dim=X as an argument.\n",
" x = self.softmax(x)\n"
]
}
],
"source": [
"output = []\n",
"with lzma.open(\"dev-0/in.tsv.xz\", encoding='utf8', mode=\"rt\") as file:\n",
" for line in file:\n",
" line = line.split(\"\\t\")\n",
"\n",
" first_word = re.sub(r\"\\\\+n\", \" \", line[-2]).split()[-1]\n",
" first_word = re.sub('[^A-Za-z]+', '', first_word)\n",
"\n",
" second_word = re.sub(r\"\\\\+n\", \" \", line[-1]).split()[0]\n",
" second_word = re.sub('[^A-Za-z]+', '', second_word)\n",
"\n",
" if first_word not in vocab_unique:\n",
" word = \"<unk>\"\n",
" if second_word not in vocab_unique:\n",
" word = \"<unk>\"\n",
"\n",
" input_tokens = torch.tensor([vocab.forward([first_word]), vocab.forward([second_word])]).to(device)\n",
" out = model(input_tokens)\n",
"\n",
" top = torch.topk(out[0], 10)\n",
" top_indices = top.indices.tolist()\n",
" top_probs = top.values.tolist()\n",
" unk_bonus = 1 - sum(top_probs)\n",
" top_words = vocab.lookup_tokens(top_indices)\n",
" top_zipped = list(zip(top_words, top_probs))\n",
"\n",
" res = \"\"\n",
" for w, p in top_zipped:\n",
" if w == \"<unk>\":\n",
" res += f\":{(p + unk_bonus):.4f} \"\n",
" else:\n",
" res += f\"{w}:{p:.4f} \"\n",
" \n",
" res = res[:-1]\n",
" res += \"\\n\"\n",
" output.append(res)\n",
"\n",
"with open(f\"dev-0/out-EMBED_SIZE={EMBED_SIZE}.tsv\", mode=\"w\") as file:\n",
" file.writelines(output)"
]
},
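{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Illustrative peek at the output format: ten word:probability pairs per line,\n",
"# with the leftover probability mass written on the empty (unknown) token.\n",
"with open(f\"dev-0/out-EMBED_SIZE={EMBED_SIZE}.tsv\") as file:\n",
"    for line in itertools.islice(file, 3):\n",
"        print(line.rstrip())"
]
},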
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"C:\\Users\\micha\\AppData\\Local\\Temp\\ipykernel_14016\\2809838665.py:15: UserWarning: Implicit dimension choice for softmax has been deprecated. Change the call to include dim=X as an argument.\n",
" x = self.softmax(x)\n"
]
}
],
"source": [
"model.eval()\n",
"\n",
"output = []\n",
"with lzma.open(\"test-A/in.tsv.xz\", encoding='utf8', mode=\"rt\") as file:\n",
" for line in file:\n",
" line = line.split(\"\\t\")\n",
"\n",
" first_word = re.sub(r\"\\\\+n\", \" \", line[-2]).split()[-1]\n",
" first_word = re.sub('[^A-Za-z]+', '', first_word)\n",
"\n",
" second_word = re.sub(r\"\\\\+n\", \" \", line[-1]).split()[0]\n",
" second_word = re.sub('[^A-Za-z]+', '', second_word)\n",
"\n",
" if first_word not in vocab_unique:\n",
" word = \"<unk>\"\n",
" if second_word not in vocab_unique:\n",
" word = \"<unk>\"\n",
"\n",
" input_tokens = torch.tensor([vocab.forward([first_word]), vocab.forward([second_word])]).to(device)\n",
" out = model(input_tokens)\n",
"\n",
" top = torch.topk(out[0], 10)\n",
" top_indices = top.indices.tolist()\n",
" top_probs = top.values.tolist()\n",
" unk_bonus = 1 - sum(top_probs)\n",
" top_words = vocab.lookup_tokens(top_indices)\n",
" top_zipped = list(zip(top_words, top_probs))\n",
"\n",
" res = \"\"\n",
" for w, p in top_zipped:\n",
" if w == \"<unk>\":\n",
" res += f\":{(p + unk_bonus):.4f} \"\n",
" else:\n",
" res += f\"{w}:{p:.4f} \"\n",
" \n",
" res = res[:-1]\n",
" res += \"\\n\"\n",
" output.append(res)\n",
"\n",
"with open(f\"test-A/out-EMBED_SIZE={EMBED_SIZE}.tsv\", mode=\"w\") as file:\n",
" file.writelines(output)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "modelowanie-jezyka",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.16"
},
"orig_nbformat": 4
},
"nbformat": 4,
"nbformat_minor": 2
}