152 KiB
152 KiB
import torch
import lzma
from itertools import islice
import regex as re
import sys
from torchtext.vocab import build_vocab_from_iterator
from torch import nn
from torch.utils.data import IterableDataset
import itertools
from torch.utils.data import DataLoader
# torch.cuda.is_available()
# torch.cuda.device_count()
# torch.cuda.current_device()
# torch.cuda.device(0)
# torch.cuda.get_device_name(0)
def get_words_from_line(line):
    """Lazily tokenize one raw text line, framed by sentence markers.

    Cleaning steps, in order: trailing whitespace is stripped, each match of
    the first pattern (two literal backslashes followed by ``n``) becomes a
    space, each non-letter character that is directly followed by a space is
    replaced together with that space by a single space, and the result is
    lower-cased.

    Args:
        line: One line of raw text.

    Yields:
        ``'<s>'``, then every whitespace-separated token of the cleaned
        line, then ``'</s>'``.
    """
    cleaned = re.sub(r'\\\\n', ' ', line.rstrip())
    cleaned = re.sub(r'[^a-zA-Z] ', ' ', cleaned).lower()
    yield '<s>'
    yield from cleaned.split()
    yield '</s>'
def get_word_lines_from_file(file_name):
    """Stream an xz-compressed UTF-8 text file as per-line token generators.

    The file stays open for as long as the returned generator is being
    consumed and is closed when it is exhausted or closed.

    Args:
        file_name: Path of the ``.xz`` file to read.

    Yields:
        One ``get_words_from_line`` generator per line, in file order.
    """
    with lzma.open(file_name, mode="rt", encoding='utf8') as text_stream:
        yield from map(get_words_from_line, text_stream)
# Upper bound on the number of vocabulary entries (passed as max_tokens below).
vocab_size = 10_000
# Build the vocabulary by streaming tokenized lines from the compressed
# training file; '<unk>' is registered as a special token.
vocab = build_vocab_from_iterator(
get_word_lines_from_file("train/in.tsv.xz"),
max_tokens = vocab_size,
specials = ['<unk>'])
# Tokens that are not in the vocabulary are looked up as '<unk>'.
vocab.set_default_index(vocab['<unk>'])
[1;31m---------------------------------------------------------------------------[0m [1;31mKeyboardInterrupt[0m Traceback (most recent call last) Cell [1;32mIn[80], line 18[0m [0;32m 14[0m [39myield[39;00m get_words_from_line(line) [0;32m 16[0m vocab_size [39m=[39m [39m10_000[39m [1;32m---> 18[0m vocab [39m=[39m build_vocab_from_iterator( [0;32m 19[0m get_word_lines_from_file([39m"[39;49m[39mtrain/in.tsv.xz[39;49m[39m"[39;49m), [0;32m 20[0m max_tokens [39m=[39;49m vocab_size, [0;32m 21[0m specials [39m=[39;49m [[39m'[39;49m[39m<unk>[39;49m[39m'[39;49m]) [0;32m 23[0m vocab[39m.[39mset_default_index(vocab[[39m'[39m[39m<unk>[39m[39m'[39m]) File [1;32mc:\PROGRAMY\Anaconda3\envs\scweet\lib\site-packages\torchtext\vocab\vocab_factory.py:99[0m, in [0;36mbuild_vocab_from_iterator[1;34m(iterator, min_freq, specials, special_first, max_tokens)[0m [0;32m 97[0m counter [39m=[39m Counter() [0;32m 98[0m [39mfor[39;00m tokens [39min[39;00m iterator: [1;32m---> 99[0m counter[39m.[39;49mupdate(tokens) [0;32m 101[0m specials [39m=[39m specials [39mor[39;00m [] [0;32m 103[0m [39m# First sort by descending frequency, then lexicographically[39;00m File [1;32mc:\PROGRAMY\Anaconda3\envs\scweet\lib\collections\__init__.py:637[0m, in [0;36mCounter.update[1;34m(self, iterable, **kwds)[0m [0;32m 635[0m [39msuper[39m(Counter, [39mself[39m)[39m.[39mupdate(iterable) [39m# fast path when counter is empty[39;00m [0;32m 636[0m [39melse[39;00m: [1;32m--> 637[0m _count_elements([39mself[39;49m, iterable) [0;32m 638[0m [39mif[39;00m kwds: [0;32m 639[0m [39mself[39m[39m.[39mupdate(kwds) Cell [1;32mIn[80], line 4[0m, in [0;36mget_words_from_line[1;34m(line)[0m [0;32m 2[0m line [39m=[39m line[39m.[39mrstrip() [0;32m 3[0m line [39m=[39m re[39m.[39msub([39mr[39m[39m'[39m[39m\\\\[39;00m[39mn[39m[39m'[39m, [39m'[39m[39m [39m[39m'[39m, line) [1;32m----> 4[0m line [39m=[39m re[39m.[39;49msub([39mr[39;49m[39m'[39;49m[39m[^a-zA-Z] [39;49m[39m'[39;49m, [39m'[39;49m[39m [39;49m[39m'[39;49m, line) [0;32m 5[0m line 
[39m=[39m line[39m.[39mlower() [0;32m 6[0m [39myield[39;00m [39m'[39m[39m<s>[39m[39m'[39m File [1;32mc:\PROGRAMY\Anaconda3\envs\scweet\lib\site-packages\regex\regex.py:278[0m, in [0;36msub[1;34m(pattern, repl, string, count, flags, pos, endpos, concurrent, timeout, ignore_unused, **kwargs)[0m [0;32m 272[0m [39m[39m[39m"""Return the string obtained by replacing the leftmost (or rightmost with a[39;00m [0;32m 273[0m [39mreverse pattern) non-overlapping occurrences of the pattern in string by the[39;00m [0;32m 274[0m [39mreplacement repl. repl can be either a string or a callable; if a string,[39;00m [0;32m 275[0m [39mbackslash escapes in it are processed; if a callable, it's passed the match[39;00m [0;32m 276[0m [39mobject and must return a replacement string to be used."""[39;00m [0;32m 277[0m pat [39m=[39m _compile(pattern, flags, ignore_unused, kwargs, [39mTrue[39;00m) [1;32m--> 278[0m [39mreturn[39;00m pat[39m.[39;49msub(repl, string, count, pos, endpos, concurrent, timeout) [1;31mKeyboardInterrupt[0m:
def look_ahead_iterator(gen):
    """Yield every pair of consecutive items from an iterable.

    For input ``a, b, c`` this yields ``(a, b)`` and then ``(b, c)``.
    Inputs with fewer than two items yield nothing.

    The first item is pulled from the iterator up front instead of using
    ``None`` as a "no previous item" marker, so items that are themselves
    ``None`` are paired like any other value.

    Args:
        gen: Any iterable (it is consumed lazily, one item at a time).

    Yields:
        Tuples ``(previous_item, current_item)``.
    """
    iterator = iter(gen)
    try:
        prev = next(iterator)
    except StopIteration:
        # Empty input: there is nothing to pair.
        return
    for item in iterator:
        yield (prev, item)
        prev = item
class Bigrams(IterableDataset):
    """Iterable dataset of consecutive token-index pairs from a text file.

    Token look-up goes through the module-level ``vocab`` object;
    ``vocabulary_size`` is only stored on the instance.
    """

    def __init__(self, text_file, vocabulary_size):
        """Remember the file to read and bind the shared vocabulary.

        Args:
            text_file: Path handed to ``get_word_lines_from_file``.
            vocabulary_size: Stored as ``self.vocabulary_size``.
        """
        self.text_file = text_file
        self.vocabulary_size = vocabulary_size
        # The module-level vocabulary object itself is used, not a copy, so
        # setting its default index here affects every other user of it.
        self.vocab = vocab
        unk_index = self.vocab['<unk>']
        self.vocab.set_default_index(unk_index)

    def __iter__(self):
        """Return a fresh iterator over (previous index, next index) pairs."""
        token_stream = itertools.chain.from_iterable(
            get_word_lines_from_file(self.text_file))
        index_stream = (self.vocab[token] for token in token_stream)
        return look_ahead_iterator(index_stream)
# Dataset that streams bigram index pairs from the compressed training file.
train_dataset = Bigrams("train/in.tsv.xz", vocab_size)
# Smoke test: pull a single batch of 10 pairs; the value is not stored, so it
# is only visible when evaluated in an interactive session.
next(iter(DataLoader(train_dataset, batch_size=10)))
[tensor([ 33, 0, 226, 35, 0, 6421, 6420, 219, 5781, 1]), tensor([ 0, 226, 35, 0, 6421, 6420, 219, 5781, 1, 113])]
# Width of the token embedding vectors.
embed_size = 100


class SimpleBigramNeuralLanguageModel(nn.Module):
    """Feed-forward bigram language model.

    Maps token indices to a probability distribution over the vocabulary:
    embedding -> Linear(1000) -> ReLU -> Linear(500) -> ReLU ->
    Linear(vocabulary_size) -> softmax.
    """

    def __init__(self, vocabulary_size, embedding_size):
        """Build the layer stack.

        Args:
            vocabulary_size: Number of embedding rows and of output classes.
            embedding_size: Dimensionality of each embedding vector.
        """
        super().__init__()
        self.model = nn.Sequential(
            nn.Embedding(vocabulary_size, embedding_size),
            nn.Linear(embedding_size, 1000),
            nn.ReLU(),
            nn.Linear(1000, 500),
            nn.ReLU(),
            nn.Linear(500, vocabulary_size),
            # Normalize over the vocabulary axis explicitly. Without `dim`,
            # PyTorch warns and picks the axis from the input rank, which is
            # axis 0 (not the vocabulary axis) for 3-D outputs.
            nn.Softmax(dim=-1),
        )

    def forward(self, x):
        """Return next-token probabilities for the token indices in ``x``.

        Args:
            x: Integer tensor of token indices.

        Returns:
            Tensor shaped like ``x`` with one extra trailing axis of length
            ``vocabulary_size`` whose entries sum to 1.
        """
        return self.model(x)
# Sanity check on a freshly initialized (untrained) model.
model = SimpleBigramNeuralLanguageModel(vocab_size, embed_size)
# Unknown tokens are looked up as '<unk>'.
vocab.set_default_index(vocab['<unk>'])
# Convert the single token 'is' into a one-element tensor of indices.
ixs = torch.tensor(vocab.forward(['is']))
out = model(ixs)
# Model output for the input 'is' at the vocabulary index of 'is' itself;
# the value is not stored, so it is only shown in an interactive session.
out[0][vocab['is']]
tensor(8.4503e-05, grad_fn=<SelectBackward0>)
# Training run for the bigram model: one pass over the streamed dataset with
# Adam and negative log-likelihood loss.

# Per-step losses, stored as detached tensors so that no autograd graph is
# kept alive by this list.
loss_track = []

device = 'cpu'
model = SimpleBigramNeuralLanguageModel(vocab_size, embed_size).to(device)
data = DataLoader(train_dataset, batch_size=6000)
optimizer = torch.optim.Adam(model.parameters())
# The model outputs probabilities, so the log is taken before NLLLoss.
criterion = torch.nn.NLLLoss()

# Loss of the previous step; starts above any expected loss so the first
# step never counts as an increase.
last_loss = 1_000
# Number of steps whose loss was higher than the step before (never reset).
trigger_count = 0

model.train()
step = 0
for x, y in data:
    x = x.to(device)
    y = y.to(device)
    optimizer.zero_grad()
    ypredicted = model(x)
    # NOTE(review): log of a softmax output can underflow to -inf for very
    # small probabilities; a log-softmax output would be more stable.
    loss = criterion(torch.log(ypredicted), y)
    # Detached copy for logging and bookkeeping: keeping the graph-attached
    # tensor across iterations would hold on to autograd history.
    loss_value = loss.detach()
    if step % 100 == 0:
        print(step, loss_value)
    step += 1
    loss.backward()
    optimizer.step()
    if loss_value > last_loss:
        trigger_count += 1
        print(trigger_count, 'LOSS DIFF:', loss_value, last_loss)
    # Stop once the loss has gone up 1000 times in total.
    if trigger_count >= 1_000:
        break
    loss_track.append(loss_value)
    last_loss = loss_value
0 tensor(9.2249, grad_fn=<NllLossBackward0>) 1 LOSS DIFF: tensor(6.9568, grad_fn=<NllLossBackward0>) tensor(6.9539, grad_fn=<NllLossBackward0>) 2 LOSS DIFF: tensor(6.5283, grad_fn=<NllLossBackward0>) tensor(6.3437, grad_fn=<NllLossBackward0>) 3 LOSS DIFF: tensor(6.4010, grad_fn=<NllLossBackward0>) tensor(6.3773, grad_fn=<NllLossBackward0>) 4 LOSS DIFF: tensor(6.4818, grad_fn=<NllLossBackward0>) tensor(6.4010, grad_fn=<NllLossBackward0>) 5 LOSS DIFF: tensor(6.4520, grad_fn=<NllLossBackward0>) tensor(6.3898, grad_fn=<NllLossBackward0>) 6 LOSS DIFF: tensor(6.2989, grad_fn=<NllLossBackward0>) tensor(6.2184, grad_fn=<NllLossBackward0>) 7 LOSS DIFF: tensor(6.3109, grad_fn=<NllLossBackward0>) tensor(6.2989, grad_fn=<NllLossBackward0>) 8 LOSS DIFF: tensor(6.3028, grad_fn=<NllLossBackward0>) tensor(6.2805, grad_fn=<NllLossBackward0>) 9 LOSS DIFF: tensor(6.3590, grad_fn=<NllLossBackward0>) tensor(6.3028, grad_fn=<NllLossBackward0>) 10 LOSS DIFF: tensor(6.1484, grad_fn=<NllLossBackward0>) tensor(6.1278, grad_fn=<NllLossBackward0>) 11 LOSS DIFF: tensor(6.2458, grad_fn=<NllLossBackward0>) tensor(6.0779, grad_fn=<NllLossBackward0>) 12 LOSS DIFF: tensor(6.3209, grad_fn=<NllLossBackward0>) tensor(6.2458, grad_fn=<NllLossBackward0>) 13 LOSS DIFF: tensor(6.2801, grad_fn=<NllLossBackward0>) tensor(6.1436, grad_fn=<NllLossBackward0>) 14 LOSS DIFF: tensor(6.1245, grad_fn=<NllLossBackward0>) tensor(6.0657, grad_fn=<NllLossBackward0>) 15 LOSS DIFF: tensor(6.2682, grad_fn=<NllLossBackward0>) tensor(6.0906, grad_fn=<NllLossBackward0>) 16 LOSS DIFF: tensor(6.0394, grad_fn=<NllLossBackward0>) tensor(6.0062, grad_fn=<NllLossBackward0>) 17 LOSS DIFF: tensor(6.1070, grad_fn=<NllLossBackward0>) tensor(6.0394, grad_fn=<NllLossBackward0>) 18 LOSS DIFF: tensor(6.2271, grad_fn=<NllLossBackward0>) tensor(6.1070, grad_fn=<NllLossBackward0>) 19 LOSS DIFF: tensor(6.0964, grad_fn=<NllLossBackward0>) tensor(6.0577, grad_fn=<NllLossBackward0>) 20 LOSS DIFF: tensor(6.0909, grad_fn=<NllLossBackward0>) 
tensor(6.0436, grad_fn=<NllLossBackward0>) 21 LOSS DIFF: tensor(6.0210, grad_fn=<NllLossBackward0>) tensor(6.0016, grad_fn=<NllLossBackward0>) 22 LOSS DIFF: tensor(6.0296, grad_fn=<NllLossBackward0>) tensor(6.0210, grad_fn=<NllLossBackward0>) 23 LOSS DIFF: tensor(6.1812, grad_fn=<NllLossBackward0>) tensor(6.0296, grad_fn=<NllLossBackward0>) 24 LOSS DIFF: tensor(6.1665, grad_fn=<NllLossBackward0>) tensor(6.0736, grad_fn=<NllLossBackward0>) 25 LOSS DIFF: tensor(6.0107, grad_fn=<NllLossBackward0>) tensor(5.9340, grad_fn=<NllLossBackward0>) 26 LOSS DIFF: tensor(5.9806, grad_fn=<NllLossBackward0>) tensor(5.9473, grad_fn=<NllLossBackward0>) 27 LOSS DIFF: tensor(5.9364, grad_fn=<NllLossBackward0>) tensor(5.8515, grad_fn=<NllLossBackward0>) 28 LOSS DIFF: tensor(5.9202, grad_fn=<NllLossBackward0>) tensor(5.9180, grad_fn=<NllLossBackward0>) 29 LOSS DIFF: tensor(6.0357, grad_fn=<NllLossBackward0>) tensor(5.8964, grad_fn=<NllLossBackward0>) 30 LOSS DIFF: tensor(6.1189, grad_fn=<NllLossBackward0>) tensor(5.9309, grad_fn=<NllLossBackward0>) 31 LOSS DIFF: tensor(6.0280, grad_fn=<NllLossBackward0>) tensor(5.8488, grad_fn=<NllLossBackward0>) 32 LOSS DIFF: tensor(6.1555, grad_fn=<NllLossBackward0>) tensor(6.0280, grad_fn=<NllLossBackward0>) 33 LOSS DIFF: tensor(6.0389, grad_fn=<NllLossBackward0>) tensor(5.9000, grad_fn=<NllLossBackward0>) 34 LOSS DIFF: tensor(5.8367, grad_fn=<NllLossBackward0>) tensor(5.7437, grad_fn=<NllLossBackward0>) 35 LOSS DIFF: tensor(5.9835, grad_fn=<NllLossBackward0>) tensor(5.8367, grad_fn=<NllLossBackward0>) 36 LOSS DIFF: tensor(5.9613, grad_fn=<NllLossBackward0>) tensor(5.7643, grad_fn=<NllLossBackward0>) 37 LOSS DIFF: tensor(6.0189, grad_fn=<NllLossBackward0>) tensor(5.9613, grad_fn=<NllLossBackward0>) 38 LOSS DIFF: tensor(5.9064, grad_fn=<NllLossBackward0>) tensor(5.8300, grad_fn=<NllLossBackward0>) 39 LOSS DIFF: tensor(5.9395, grad_fn=<NllLossBackward0>) tensor(5.8984, grad_fn=<NllLossBackward0>) 40 LOSS DIFF: tensor(5.9919, grad_fn=<NllLossBackward0>) 
tensor(5.9395, grad_fn=<NllLossBackward0>) 41 LOSS DIFF: tensor(5.8834, grad_fn=<NllLossBackward0>) tensor(5.8792, grad_fn=<NllLossBackward0>) 42 LOSS DIFF: tensor(5.7971, grad_fn=<NllLossBackward0>) tensor(5.7641, grad_fn=<NllLossBackward0>) 43 LOSS DIFF: tensor(5.8632, grad_fn=<NllLossBackward0>) tensor(5.7971, grad_fn=<NllLossBackward0>) 44 LOSS DIFF: tensor(5.8988, grad_fn=<NllLossBackward0>) tensor(5.8632, grad_fn=<NllLossBackward0>) 45 LOSS DIFF: tensor(5.9258, grad_fn=<NllLossBackward0>) tensor(5.8670, grad_fn=<NllLossBackward0>) 100 tensor(5.8536, grad_fn=<NllLossBackward0>) 46 LOSS DIFF: tensor(5.8536, grad_fn=<NllLossBackward0>) tensor(5.8226, grad_fn=<NllLossBackward0>) 47 LOSS DIFF: tensor(5.8648, grad_fn=<NllLossBackward0>) tensor(5.8536, grad_fn=<NllLossBackward0>) 48 LOSS DIFF: tensor(6.0083, grad_fn=<NllLossBackward0>) tensor(5.8648, grad_fn=<NllLossBackward0>) 49 LOSS DIFF: tensor(5.8324, grad_fn=<NllLossBackward0>) tensor(5.7953, grad_fn=<NllLossBackward0>) 50 LOSS DIFF: tensor(5.9055, grad_fn=<NllLossBackward0>) tensor(5.8324, grad_fn=<NllLossBackward0>) 51 LOSS DIFF: tensor(5.9507, grad_fn=<NllLossBackward0>) tensor(5.7720, grad_fn=<NllLossBackward0>) 52 LOSS DIFF: tensor(5.8892, grad_fn=<NllLossBackward0>) tensor(5.7376, grad_fn=<NllLossBackward0>) 53 LOSS DIFF: tensor(5.8218, grad_fn=<NllLossBackward0>) tensor(5.6474, grad_fn=<NllLossBackward0>) 54 LOSS DIFF: tensor(5.8381, grad_fn=<NllLossBackward0>) tensor(5.8218, grad_fn=<NllLossBackward0>) 55 LOSS DIFF: tensor(5.9608, grad_fn=<NllLossBackward0>) tensor(5.8381, grad_fn=<NllLossBackward0>) 56 LOSS DIFF: tensor(5.9855, grad_fn=<NllLossBackward0>) tensor(5.9496, grad_fn=<NllLossBackward0>) 57 LOSS DIFF: tensor(5.9235, grad_fn=<NllLossBackward0>) tensor(5.7299, grad_fn=<NllLossBackward0>) 58 LOSS DIFF: tensor(5.9411, grad_fn=<NllLossBackward0>) tensor(5.7029, grad_fn=<NllLossBackward0>) 59 LOSS DIFF: tensor(5.8516, grad_fn=<NllLossBackward0>) tensor(5.7566, grad_fn=<NllLossBackward0>) 60 LOSS 
DIFF: tensor(5.8243, grad_fn=<NllLossBackward0>) tensor(5.6658, grad_fn=<NllLossBackward0>) 61 LOSS DIFF: tensor(5.8496, grad_fn=<NllLossBackward0>) tensor(5.7968, grad_fn=<NllLossBackward0>) 62 LOSS DIFF: tensor(5.7651, grad_fn=<NllLossBackward0>) tensor(5.6680, grad_fn=<NllLossBackward0>) 63 LOSS DIFF: tensor(5.8133, grad_fn=<NllLossBackward0>) tensor(5.7651, grad_fn=<NllLossBackward0>) 64 LOSS DIFF: tensor(5.8699, grad_fn=<NllLossBackward0>) tensor(5.4926, grad_fn=<NllLossBackward0>) 65 LOSS DIFF: tensor(5.7983, grad_fn=<NllLossBackward0>) tensor(5.7203, grad_fn=<NllLossBackward0>) 66 LOSS DIFF: tensor(5.8621, grad_fn=<NllLossBackward0>) tensor(5.4968, grad_fn=<NllLossBackward0>) 67 LOSS DIFF: tensor(5.8183, grad_fn=<NllLossBackward0>) tensor(5.6879, grad_fn=<NllLossBackward0>) 68 LOSS DIFF: tensor(5.7855, grad_fn=<NllLossBackward0>) tensor(5.7245, grad_fn=<NllLossBackward0>) 69 LOSS DIFF: tensor(5.7728, grad_fn=<NllLossBackward0>) tensor(5.6484, grad_fn=<NllLossBackward0>) 70 LOSS DIFF: tensor(5.7415, grad_fn=<NllLossBackward0>) tensor(5.5859, grad_fn=<NllLossBackward0>) 71 LOSS DIFF: tensor(5.7307, grad_fn=<NllLossBackward0>) tensor(5.6239, grad_fn=<NllLossBackward0>) 72 LOSS DIFF: tensor(5.7754, grad_fn=<NllLossBackward0>) tensor(5.6253, grad_fn=<NllLossBackward0>) 73 LOSS DIFF: tensor(5.8733, grad_fn=<NllLossBackward0>) tensor(5.5662, grad_fn=<NllLossBackward0>) 74 LOSS DIFF: tensor(5.7932, grad_fn=<NllLossBackward0>) tensor(5.7448, grad_fn=<NllLossBackward0>) 75 LOSS DIFF: tensor(5.7643, grad_fn=<NllLossBackward0>) tensor(5.6964, grad_fn=<NllLossBackward0>) 76 LOSS DIFF: tensor(5.6395, grad_fn=<NllLossBackward0>) tensor(5.6045, grad_fn=<NllLossBackward0>) 77 LOSS DIFF: tensor(5.7189, grad_fn=<NllLossBackward0>) tensor(5.6395, grad_fn=<NllLossBackward0>) 78 LOSS DIFF: tensor(5.7524, grad_fn=<NllLossBackward0>) tensor(5.5841, grad_fn=<NllLossBackward0>) 79 LOSS DIFF: tensor(5.7829, grad_fn=<NllLossBackward0>) tensor(5.5593, grad_fn=<NllLossBackward0>) 80 LOSS 
DIFF: tensor(5.8024, grad_fn=<NllLossBackward0>) tensor(5.7829, grad_fn=<NllLossBackward0>) 81 LOSS DIFF: tensor(5.8275, grad_fn=<NllLossBackward0>) tensor(5.7907, grad_fn=<NllLossBackward0>) 82 LOSS DIFF: tensor(5.6191, grad_fn=<NllLossBackward0>) tensor(5.5317, grad_fn=<NllLossBackward0>) 83 LOSS DIFF: tensor(5.7328, grad_fn=<NllLossBackward0>) tensor(5.6191, grad_fn=<NllLossBackward0>) 84 LOSS DIFF: tensor(5.7513, grad_fn=<NllLossBackward0>) tensor(5.6999, grad_fn=<NllLossBackward0>) 85 LOSS DIFF: tensor(5.7847, grad_fn=<NllLossBackward0>) tensor(5.7513, grad_fn=<NllLossBackward0>) 86 LOSS DIFF: tensor(5.7548, grad_fn=<NllLossBackward0>) tensor(5.6437, grad_fn=<NllLossBackward0>) 87 LOSS DIFF: tensor(5.7529, grad_fn=<NllLossBackward0>) tensor(5.7198, grad_fn=<NllLossBackward0>) 88 LOSS DIFF: tensor(5.7664, grad_fn=<NllLossBackward0>) tensor(5.5831, grad_fn=<NllLossBackward0>) 89 LOSS DIFF: tensor(5.7668, grad_fn=<NllLossBackward0>) tensor(5.6415, grad_fn=<NllLossBackward0>) 90 LOSS DIFF: tensor(5.7174, grad_fn=<NllLossBackward0>) tensor(5.6232, grad_fn=<NllLossBackward0>) 91 LOSS DIFF: tensor(5.7451, grad_fn=<NllLossBackward0>) tensor(5.6730, grad_fn=<NllLossBackward0>) 92 LOSS DIFF: tensor(5.7578, grad_fn=<NllLossBackward0>) tensor(5.7451, grad_fn=<NllLossBackward0>) 93 LOSS DIFF: tensor(5.6858, grad_fn=<NllLossBackward0>) tensor(5.4322, grad_fn=<NllLossBackward0>) 94 LOSS DIFF: tensor(5.7738, grad_fn=<NllLossBackward0>) tensor(5.6858, grad_fn=<NllLossBackward0>) 200 tensor(5.7337, grad_fn=<NllLossBackward0>) 95 LOSS DIFF: tensor(5.7337, grad_fn=<NllLossBackward0>) tensor(5.6356, grad_fn=<NllLossBackward0>) 96 LOSS DIFF: tensor(5.6635, grad_fn=<NllLossBackward0>) tensor(5.5954, grad_fn=<NllLossBackward0>) 97 LOSS DIFF: tensor(5.6635, grad_fn=<NllLossBackward0>) tensor(5.6516, grad_fn=<NllLossBackward0>) 98 LOSS DIFF: tensor(5.8410, grad_fn=<NllLossBackward0>) tensor(5.6141, grad_fn=<NllLossBackward0>) 99 LOSS DIFF: tensor(5.7671, grad_fn=<NllLossBackward0>) 
tensor(5.6264, grad_fn=<NllLossBackward0>) 100 LOSS DIFF: tensor(5.6642, grad_fn=<NllLossBackward0>) tensor(5.6263, grad_fn=<NllLossBackward0>) 101 LOSS DIFF: tensor(5.7031, grad_fn=<NllLossBackward0>) tensor(5.6022, grad_fn=<NllLossBackward0>) 102 LOSS DIFF: tensor(5.7371, grad_fn=<NllLossBackward0>) tensor(5.7031, grad_fn=<NllLossBackward0>) 103 LOSS DIFF: tensor(5.6638, grad_fn=<NllLossBackward0>) tensor(5.6220, grad_fn=<NllLossBackward0>) 104 LOSS DIFF: tensor(5.6687, grad_fn=<NllLossBackward0>) tensor(5.6638, grad_fn=<NllLossBackward0>) 105 LOSS DIFF: tensor(5.7376, grad_fn=<NllLossBackward0>) tensor(5.6687, grad_fn=<NllLossBackward0>) 106 LOSS DIFF: tensor(5.7511, grad_fn=<NllLossBackward0>) tensor(5.7249, grad_fn=<NllLossBackward0>) 107 LOSS DIFF: tensor(5.6811, grad_fn=<NllLossBackward0>) tensor(5.6714, grad_fn=<NllLossBackward0>) 108 LOSS DIFF: tensor(5.7101, grad_fn=<NllLossBackward0>) tensor(5.5892, grad_fn=<NllLossBackward0>) 109 LOSS DIFF: tensor(5.6188, grad_fn=<NllLossBackward0>) tensor(5.5320, grad_fn=<NllLossBackward0>) 110 LOSS DIFF: tensor(5.6656, grad_fn=<NllLossBackward0>) tensor(5.6188, grad_fn=<NllLossBackward0>) 111 LOSS DIFF: tensor(5.6711, grad_fn=<NllLossBackward0>) tensor(5.5220, grad_fn=<NllLossBackward0>) 112 LOSS DIFF: tensor(5.7719, grad_fn=<NllLossBackward0>) tensor(5.6711, grad_fn=<NllLossBackward0>) 113 LOSS DIFF: tensor(5.7275, grad_fn=<NllLossBackward0>) tensor(5.6023, grad_fn=<NllLossBackward0>) 114 LOSS DIFF: tensor(5.7216, grad_fn=<NllLossBackward0>) tensor(5.6046, grad_fn=<NllLossBackward0>) 115 LOSS DIFF: tensor(5.6189, grad_fn=<NllLossBackward0>) tensor(5.5715, grad_fn=<NllLossBackward0>) 116 LOSS DIFF: tensor(5.6879, grad_fn=<NllLossBackward0>) tensor(5.6189, grad_fn=<NllLossBackward0>) 117 LOSS DIFF: tensor(5.7076, grad_fn=<NllLossBackward0>) tensor(5.6879, grad_fn=<NllLossBackward0>) 118 LOSS DIFF: tensor(5.6123, grad_fn=<NllLossBackward0>) tensor(5.5496, grad_fn=<NllLossBackward0>) 119 LOSS DIFF: tensor(5.6219, 
grad_fn=<NllLossBackward0>) tensor(5.6123, grad_fn=<NllLossBackward0>) 120 LOSS DIFF: tensor(5.6567, grad_fn=<NllLossBackward0>) tensor(5.4889, grad_fn=<NllLossBackward0>) 121 LOSS DIFF: tensor(5.7262, grad_fn=<NllLossBackward0>) tensor(5.6334, grad_fn=<NllLossBackward0>) 122 LOSS DIFF: tensor(5.7325, grad_fn=<NllLossBackward0>) tensor(5.6450, grad_fn=<NllLossBackward0>) 123 LOSS DIFF: tensor(5.7161, grad_fn=<NllLossBackward0>) tensor(5.5794, grad_fn=<NllLossBackward0>) 124 LOSS DIFF: tensor(5.5623, grad_fn=<NllLossBackward0>) tensor(5.5361, grad_fn=<NllLossBackward0>) 125 LOSS DIFF: tensor(5.5797, grad_fn=<NllLossBackward0>) tensor(5.5623, grad_fn=<NllLossBackward0>) 126 LOSS DIFF: tensor(5.6225, grad_fn=<NllLossBackward0>) tensor(5.5797, grad_fn=<NllLossBackward0>) 127 LOSS DIFF: tensor(5.5912, grad_fn=<NllLossBackward0>) tensor(5.5347, grad_fn=<NllLossBackward0>) 128 LOSS DIFF: tensor(5.6655, grad_fn=<NllLossBackward0>) tensor(5.5912, grad_fn=<NllLossBackward0>) 129 LOSS DIFF: tensor(5.6695, grad_fn=<NllLossBackward0>) tensor(5.6655, grad_fn=<NllLossBackward0>) 130 LOSS DIFF: tensor(5.7027, grad_fn=<NllLossBackward0>) tensor(5.6695, grad_fn=<NllLossBackward0>) 131 LOSS DIFF: tensor(5.6836, grad_fn=<NllLossBackward0>) tensor(5.5821, grad_fn=<NllLossBackward0>) 132 LOSS DIFF: tensor(5.5875, grad_fn=<NllLossBackward0>) tensor(5.5289, grad_fn=<NllLossBackward0>) 133 LOSS DIFF: tensor(5.6111, grad_fn=<NllLossBackward0>) tensor(5.4911, grad_fn=<NllLossBackward0>) 134 LOSS DIFF: tensor(5.6462, grad_fn=<NllLossBackward0>) tensor(5.6111, grad_fn=<NllLossBackward0>) 135 LOSS DIFF: tensor(5.4761, grad_fn=<NllLossBackward0>) tensor(5.3862, grad_fn=<NllLossBackward0>) 136 LOSS DIFF: tensor(5.5751, grad_fn=<NllLossBackward0>) tensor(5.4761, grad_fn=<NllLossBackward0>) 137 LOSS DIFF: tensor(5.5107, grad_fn=<NllLossBackward0>) tensor(5.3580, grad_fn=<NllLossBackward0>) 138 LOSS DIFF: tensor(5.5294, grad_fn=<NllLossBackward0>) tensor(5.5032, grad_fn=<NllLossBackward0>) 139 LOSS 
DIFF: tensor(5.8044, grad_fn=<NllLossBackward0>) tensor(5.5294, grad_fn=<NllLossBackward0>) 140 LOSS DIFF: tensor(5.5610, grad_fn=<NllLossBackward0>) tensor(5.4624, grad_fn=<NllLossBackward0>) 141 LOSS DIFF: tensor(5.6199, grad_fn=<NllLossBackward0>) tensor(5.5610, grad_fn=<NllLossBackward0>) 142 LOSS DIFF: tensor(5.6073, grad_fn=<NllLossBackward0>) tensor(5.5645, grad_fn=<NllLossBackward0>) 143 LOSS DIFF: tensor(5.8155, grad_fn=<NllLossBackward0>) tensor(5.6073, grad_fn=<NllLossBackward0>) 144 LOSS DIFF: tensor(5.6119, grad_fn=<NllLossBackward0>) tensor(5.5148, grad_fn=<NllLossBackward0>) 145 LOSS DIFF: tensor(5.6557, grad_fn=<NllLossBackward0>) tensor(5.5193, grad_fn=<NllLossBackward0>) 300 tensor(5.5923, grad_fn=<NllLossBackward0>) 146 LOSS DIFF: tensor(5.6352, grad_fn=<NllLossBackward0>) tensor(5.5923, grad_fn=<NllLossBackward0>) 147 LOSS DIFF: tensor(5.6034, grad_fn=<NllLossBackward0>) tensor(5.4999, grad_fn=<NllLossBackward0>) 148 LOSS DIFF: tensor(5.6058, grad_fn=<NllLossBackward0>) tensor(5.6034, grad_fn=<NllLossBackward0>) 149 LOSS DIFF: tensor(5.6262, grad_fn=<NllLossBackward0>) tensor(5.5992, grad_fn=<NllLossBackward0>) 150 LOSS DIFF: tensor(5.6428, grad_fn=<NllLossBackward0>) tensor(5.5092, grad_fn=<NllLossBackward0>) 151 LOSS DIFF: tensor(5.6501, grad_fn=<NllLossBackward0>) tensor(5.5660, grad_fn=<NllLossBackward0>) 152 LOSS DIFF: tensor(5.6203, grad_fn=<NllLossBackward0>) tensor(5.5295, grad_fn=<NllLossBackward0>) 153 LOSS DIFF: tensor(5.6420, grad_fn=<NllLossBackward0>) tensor(5.6203, grad_fn=<NllLossBackward0>) 154 LOSS DIFF: tensor(5.7322, grad_fn=<NllLossBackward0>) tensor(5.4864, grad_fn=<NllLossBackward0>) 155 LOSS DIFF: tensor(5.6117, grad_fn=<NllLossBackward0>) tensor(5.4803, grad_fn=<NllLossBackward0>) 156 LOSS DIFF: tensor(5.5395, grad_fn=<NllLossBackward0>) tensor(5.4970, grad_fn=<NllLossBackward0>) 157 LOSS DIFF: tensor(5.6619, grad_fn=<NllLossBackward0>) tensor(5.5060, grad_fn=<NllLossBackward0>) 158 LOSS DIFF: tensor(5.6368, 
grad_fn=<NllLossBackward0>) tensor(5.5258, grad_fn=<NllLossBackward0>) 159 LOSS DIFF: tensor(5.5889, grad_fn=<NllLossBackward0>) tensor(5.5490, grad_fn=<NllLossBackward0>) 160 LOSS DIFF: tensor(5.6312, grad_fn=<NllLossBackward0>) tensor(5.5038, grad_fn=<NllLossBackward0>) 161 LOSS DIFF: tensor(5.5349, grad_fn=<NllLossBackward0>) tensor(5.5015, grad_fn=<NllLossBackward0>) 162 LOSS DIFF: tensor(5.6371, grad_fn=<NllLossBackward0>) tensor(5.5349, grad_fn=<NllLossBackward0>) 163 LOSS DIFF: tensor(5.6482, grad_fn=<NllLossBackward0>) tensor(5.6371, grad_fn=<NllLossBackward0>) 164 LOSS DIFF: tensor(5.6638, grad_fn=<NllLossBackward0>) tensor(5.6482, grad_fn=<NllLossBackward0>) 165 LOSS DIFF: tensor(5.6737, grad_fn=<NllLossBackward0>) tensor(5.4801, grad_fn=<NllLossBackward0>) 166 LOSS DIFF: tensor(5.4878, grad_fn=<NllLossBackward0>) tensor(5.4866, grad_fn=<NllLossBackward0>) 167 LOSS DIFF: tensor(5.6624, grad_fn=<NllLossBackward0>) tensor(5.4878, grad_fn=<NllLossBackward0>) 168 LOSS DIFF: tensor(5.5738, grad_fn=<NllLossBackward0>) tensor(5.5648, grad_fn=<NllLossBackward0>) 169 LOSS DIFF: tensor(5.5267, grad_fn=<NllLossBackward0>) tensor(5.4309, grad_fn=<NllLossBackward0>) 170 LOSS DIFF: tensor(5.6041, grad_fn=<NllLossBackward0>) tensor(5.3970, grad_fn=<NllLossBackward0>) 171 LOSS DIFF: tensor(5.6640, grad_fn=<NllLossBackward0>) tensor(5.4885, grad_fn=<NllLossBackward0>) 172 LOSS DIFF: tensor(5.6136, grad_fn=<NllLossBackward0>) tensor(5.4977, grad_fn=<NllLossBackward0>) 173 LOSS DIFF: tensor(5.6567, grad_fn=<NllLossBackward0>) tensor(5.5459, grad_fn=<NllLossBackward0>) 174 LOSS DIFF: tensor(5.5721, grad_fn=<NllLossBackward0>) tensor(5.4921, grad_fn=<NllLossBackward0>) 175 LOSS DIFF: tensor(5.5685, grad_fn=<NllLossBackward0>) tensor(5.5363, grad_fn=<NllLossBackward0>) 176 LOSS DIFF: tensor(5.5438, grad_fn=<NllLossBackward0>) tensor(5.4754, grad_fn=<NllLossBackward0>) 177 LOSS DIFF: tensor(5.6087, grad_fn=<NllLossBackward0>) tensor(5.5345, grad_fn=<NllLossBackward0>) 178 LOSS 
DIFF: tensor(5.5624, grad_fn=<NllLossBackward0>) tensor(5.3589, grad_fn=<NllLossBackward0>) 179 LOSS DIFF: tensor(5.6284, grad_fn=<NllLossBackward0>) tensor(5.4887, grad_fn=<NllLossBackward0>) 180 LOSS DIFF: tensor(5.4859, grad_fn=<NllLossBackward0>) tensor(5.4453, grad_fn=<NllLossBackward0>) 181 LOSS DIFF: tensor(5.4949, grad_fn=<NllLossBackward0>) tensor(5.4859, grad_fn=<NllLossBackward0>) 182 LOSS DIFF: tensor(5.5938, grad_fn=<NllLossBackward0>) tensor(5.4949, grad_fn=<NllLossBackward0>) 183 LOSS DIFF: tensor(5.5222, grad_fn=<NllLossBackward0>) tensor(5.4890, grad_fn=<NllLossBackward0>) 184 LOSS DIFF: tensor(5.6673, grad_fn=<NllLossBackward0>) tensor(5.5222, grad_fn=<NllLossBackward0>) 185 LOSS DIFF: tensor(5.6337, grad_fn=<NllLossBackward0>) tensor(5.5833, grad_fn=<NllLossBackward0>) 186 LOSS DIFF: tensor(5.7171, grad_fn=<NllLossBackward0>) tensor(5.6337, grad_fn=<NllLossBackward0>) 187 LOSS DIFF: tensor(5.5721, grad_fn=<NllLossBackward0>) tensor(5.4927, grad_fn=<NllLossBackward0>) 188 LOSS DIFF: tensor(5.5771, grad_fn=<NllLossBackward0>) tensor(5.5721, grad_fn=<NllLossBackward0>) 189 LOSS DIFF: tensor(5.6379, grad_fn=<NllLossBackward0>) tensor(5.5771, grad_fn=<NllLossBackward0>) 190 LOSS DIFF: tensor(5.6032, grad_fn=<NllLossBackward0>) tensor(5.4434, grad_fn=<NllLossBackward0>) 191 LOSS DIFF: tensor(5.5389, grad_fn=<NllLossBackward0>) tensor(5.3454, grad_fn=<NllLossBackward0>) 192 LOSS DIFF: tensor(5.6966, grad_fn=<NllLossBackward0>) tensor(5.4275, grad_fn=<NllLossBackward0>) 193 LOSS DIFF: tensor(5.3675, grad_fn=<NllLossBackward0>) tensor(5.3163, grad_fn=<NllLossBackward0>) 194 LOSS DIFF: tensor(5.4924, grad_fn=<NllLossBackward0>) tensor(5.3675, grad_fn=<NllLossBackward0>) 195 LOSS DIFF: tensor(5.5475, grad_fn=<NllLossBackward0>) tensor(5.4881, grad_fn=<NllLossBackward0>) 196 LOSS DIFF: tensor(5.6223, grad_fn=<NllLossBackward0>) tensor(5.3634, grad_fn=<NllLossBackward0>) 400 tensor(5.5316, grad_fn=<NllLossBackward0>) 197 LOSS DIFF: tensor(5.5377, 
grad_fn=<NllLossBackward0>) tensor(5.4920, grad_fn=<NllLossBackward0>) 198 LOSS DIFF: tensor(5.6185, grad_fn=<NllLossBackward0>) tensor(5.4576, grad_fn=<NllLossBackward0>) 199 LOSS DIFF: tensor(5.4915, grad_fn=<NllLossBackward0>) tensor(5.4151, grad_fn=<NllLossBackward0>) 200 LOSS DIFF: tensor(5.5837, grad_fn=<NllLossBackward0>) tensor(5.4915, grad_fn=<NllLossBackward0>) 201 LOSS DIFF: tensor(5.5875, grad_fn=<NllLossBackward0>) tensor(5.5837, grad_fn=<NllLossBackward0>) 202 LOSS DIFF: tensor(5.5331, grad_fn=<NllLossBackward0>) tensor(5.4873, grad_fn=<NllLossBackward0>) 203 LOSS DIFF: tensor(5.5345, grad_fn=<NllLossBackward0>) tensor(5.3964, grad_fn=<NllLossBackward0>) 204 LOSS DIFF: tensor(5.5764, grad_fn=<NllLossBackward0>) tensor(5.5345, grad_fn=<NllLossBackward0>) 205 LOSS DIFF: tensor(5.6070, grad_fn=<NllLossBackward0>) tensor(5.5764, grad_fn=<NllLossBackward0>) 206 LOSS DIFF: tensor(5.5005, grad_fn=<NllLossBackward0>) tensor(5.3572, grad_fn=<NllLossBackward0>) 207 LOSS DIFF: tensor(5.5520, grad_fn=<NllLossBackward0>) tensor(5.3860, grad_fn=<NllLossBackward0>) 208 LOSS DIFF: tensor(5.5800, grad_fn=<NllLossBackward0>) tensor(5.5520, grad_fn=<NllLossBackward0>) 209 LOSS DIFF: tensor(5.6465, grad_fn=<NllLossBackward0>) tensor(5.5469, grad_fn=<NllLossBackward0>) 210 LOSS DIFF: tensor(5.5691, grad_fn=<NllLossBackward0>) tensor(5.5241, grad_fn=<NllLossBackward0>) 211 LOSS DIFF: tensor(5.7237, grad_fn=<NllLossBackward0>) tensor(5.4803, grad_fn=<NllLossBackward0>) 212 LOSS DIFF: tensor(5.5532, grad_fn=<NllLossBackward0>) tensor(5.5012, grad_fn=<NllLossBackward0>) 213 LOSS DIFF: tensor(5.5011, grad_fn=<NllLossBackward0>) tensor(5.4712, grad_fn=<NllLossBackward0>) 214 LOSS DIFF: tensor(5.5370, grad_fn=<NllLossBackward0>) tensor(5.5011, grad_fn=<NllLossBackward0>) 215 LOSS DIFF: tensor(5.5579, grad_fn=<NllLossBackward0>) tensor(5.4126, grad_fn=<NllLossBackward0>) 216 LOSS DIFF: tensor(5.5109, grad_fn=<NllLossBackward0>) tensor(5.3875, grad_fn=<NllLossBackward0>) 217 LOSS 
DIFF: tensor(5.5403, grad_fn=<NllLossBackward0>) tensor(5.4174, grad_fn=<NllLossBackward0>) 218 LOSS DIFF: tensor(5.5404, grad_fn=<NllLossBackward0>) tensor(5.5403, grad_fn=<NllLossBackward0>) 219 LOSS DIFF: tensor(5.5593, grad_fn=<NllLossBackward0>) tensor(5.5404, grad_fn=<NllLossBackward0>) 220 LOSS DIFF: tensor(5.5262, grad_fn=<NllLossBackward0>) tensor(5.5250, grad_fn=<NllLossBackward0>) 221 LOSS DIFF: tensor(5.4107, grad_fn=<NllLossBackward0>) tensor(5.4092, grad_fn=<NllLossBackward0>) 222 LOSS DIFF: tensor(5.4920, grad_fn=<NllLossBackward0>) tensor(5.3499, grad_fn=<NllLossBackward0>) 223 LOSS DIFF: tensor(5.5064, grad_fn=<NllLossBackward0>) tensor(5.4920, grad_fn=<NllLossBackward0>) 224 LOSS DIFF: tensor(5.5648, grad_fn=<NllLossBackward0>) tensor(5.5064, grad_fn=<NllLossBackward0>) 225 LOSS DIFF: tensor(5.5107, grad_fn=<NllLossBackward0>) tensor(5.3439, grad_fn=<NllLossBackward0>) 226 LOSS DIFF: tensor(5.4968, grad_fn=<NllLossBackward0>) tensor(5.4720, grad_fn=<NllLossBackward0>) 227 LOSS DIFF: tensor(5.5473, grad_fn=<NllLossBackward0>) tensor(5.4854, grad_fn=<NllLossBackward0>) 228 LOSS DIFF: tensor(5.4800, grad_fn=<NllLossBackward0>) tensor(5.3762, grad_fn=<NllLossBackward0>) 229 LOSS DIFF: tensor(5.6251, grad_fn=<NllLossBackward0>) tensor(5.4800, grad_fn=<NllLossBackward0>) 230 LOSS DIFF: tensor(5.6237, grad_fn=<NllLossBackward0>) tensor(5.4478, grad_fn=<NllLossBackward0>) 231 LOSS DIFF: tensor(5.5439, grad_fn=<NllLossBackward0>) tensor(5.4108, grad_fn=<NllLossBackward0>) 232 LOSS DIFF: tensor(5.3186, grad_fn=<NllLossBackward0>) tensor(5.3012, grad_fn=<NllLossBackward0>) 233 LOSS DIFF: tensor(5.5069, grad_fn=<NllLossBackward0>) tensor(5.3186, grad_fn=<NllLossBackward0>) 234 LOSS DIFF: tensor(5.5190, grad_fn=<NllLossBackward0>) tensor(5.5043, grad_fn=<NllLossBackward0>) 235 LOSS DIFF: tensor(5.4706, grad_fn=<NllLossBackward0>) tensor(5.4560, grad_fn=<NllLossBackward0>) 236 LOSS DIFF: tensor(5.5252, grad_fn=<NllLossBackward0>) tensor(5.4706, 
grad_fn=<NllLossBackward0>) 237 LOSS DIFF: tensor(5.4765, grad_fn=<NllLossBackward0>) tensor(5.4103, grad_fn=<NllLossBackward0>) 238 LOSS DIFF: tensor(5.5218, grad_fn=<NllLossBackward0>) tensor(5.4765, grad_fn=<NllLossBackward0>) 239 LOSS DIFF: tensor(5.6028, grad_fn=<NllLossBackward0>) tensor(5.4596, grad_fn=<NllLossBackward0>) 240 LOSS DIFF: tensor(5.5504, grad_fn=<NllLossBackward0>) tensor(5.5021, grad_fn=<NllLossBackward0>) 241 LOSS DIFF: tensor(5.4777, grad_fn=<NllLossBackward0>) tensor(5.4091, grad_fn=<NllLossBackward0>) 242 LOSS DIFF: tensor(5.4404, grad_fn=<NllLossBackward0>) tensor(5.3918, grad_fn=<NllLossBackward0>) 243 LOSS DIFF: tensor(5.5580, grad_fn=<NllLossBackward0>) tensor(5.4404, grad_fn=<NllLossBackward0>) 244 LOSS DIFF: tensor(5.4812, grad_fn=<NllLossBackward0>) tensor(5.4398, grad_fn=<NllLossBackward0>) 500 tensor(5.5214, grad_fn=<NllLossBackward0>) 245 LOSS DIFF: tensor(5.5214, grad_fn=<NllLossBackward0>) tensor(5.4142, grad_fn=<NllLossBackward0>) 246 LOSS DIFF: tensor(5.6153, grad_fn=<NllLossBackward0>) tensor(5.5214, grad_fn=<NllLossBackward0>) 247 LOSS DIFF: tensor(5.4794, grad_fn=<NllLossBackward0>) tensor(5.4672, grad_fn=<NllLossBackward0>) 248 LOSS DIFF: tensor(5.5978, grad_fn=<NllLossBackward0>) tensor(5.4794, grad_fn=<NllLossBackward0>) 249 LOSS DIFF: tensor(5.4549, grad_fn=<NllLossBackward0>) tensor(5.3421, grad_fn=<NllLossBackward0>) 250 LOSS DIFF: tensor(5.4747, grad_fn=<NllLossBackward0>) tensor(5.4549, grad_fn=<NllLossBackward0>) 251 LOSS DIFF: tensor(5.5439, grad_fn=<NllLossBackward0>) tensor(5.3348, grad_fn=<NllLossBackward0>) 252 LOSS DIFF: tensor(5.5953, grad_fn=<NllLossBackward0>) tensor(5.5439, grad_fn=<NllLossBackward0>) 253 LOSS DIFF: tensor(5.5308, grad_fn=<NllLossBackward0>) tensor(5.4385, grad_fn=<NllLossBackward0>) 254 LOSS DIFF: tensor(5.5379, grad_fn=<NllLossBackward0>) tensor(5.4373, grad_fn=<NllLossBackward0>) 255 LOSS DIFF: tensor(5.5022, grad_fn=<NllLossBackward0>) tensor(5.4306, grad_fn=<NllLossBackward0>) 256 
LOSS DIFF: tensor(5.5225, grad_fn=<NllLossBackward0>) tensor(5.4898, grad_fn=<NllLossBackward0>) 257 LOSS DIFF: tensor(5.6141, grad_fn=<NllLossBackward0>) tensor(5.5225, grad_fn=<NllLossBackward0>) 258 LOSS DIFF: tensor(5.4873, grad_fn=<NllLossBackward0>) tensor(5.4444, grad_fn=<NllLossBackward0>) 259 LOSS DIFF: tensor(5.6677, grad_fn=<NllLossBackward0>) tensor(5.4873, grad_fn=<NllLossBackward0>) 260 LOSS DIFF: tensor(5.5404, grad_fn=<NllLossBackward0>) tensor(5.4581, grad_fn=<NllLossBackward0>) 261 LOSS DIFF: tensor(5.5603, grad_fn=<NllLossBackward0>) tensor(5.3583, grad_fn=<NllLossBackward0>) 262 LOSS DIFF: tensor(5.5292, grad_fn=<NllLossBackward0>) tensor(5.2255, grad_fn=<NllLossBackward0>) 263 LOSS DIFF: tensor(5.4456, grad_fn=<NllLossBackward0>) tensor(5.3846, grad_fn=<NllLossBackward0>) 264 LOSS DIFF: tensor(5.4504, grad_fn=<NllLossBackward0>) tensor(5.4456, grad_fn=<NllLossBackward0>) 265 LOSS DIFF: tensor(5.4899, grad_fn=<NllLossBackward0>) tensor(5.3406, grad_fn=<NllLossBackward0>) 266 LOSS DIFF: tensor(5.5023, grad_fn=<NllLossBackward0>) tensor(5.4899, grad_fn=<NllLossBackward0>) 267 LOSS DIFF: tensor(5.3884, grad_fn=<NllLossBackward0>) tensor(5.2800, grad_fn=<NllLossBackward0>) 268 LOSS DIFF: tensor(5.4713, grad_fn=<NllLossBackward0>) tensor(5.3884, grad_fn=<NllLossBackward0>) 269 LOSS DIFF: tensor(5.4810, grad_fn=<NllLossBackward0>) tensor(5.4713, grad_fn=<NllLossBackward0>) 270 LOSS DIFF: tensor(5.3896, grad_fn=<NllLossBackward0>) tensor(5.3593, grad_fn=<NllLossBackward0>) 271 LOSS DIFF: tensor(5.5195, grad_fn=<NllLossBackward0>) tensor(5.3896, grad_fn=<NllLossBackward0>) 272 LOSS DIFF: tensor(5.4173, grad_fn=<NllLossBackward0>) tensor(5.3982, grad_fn=<NllLossBackward0>) 273 LOSS DIFF: tensor(5.5428, grad_fn=<NllLossBackward0>) tensor(5.3779, grad_fn=<NllLossBackward0>) 274 LOSS DIFF: tensor(5.4749, grad_fn=<NllLossBackward0>) tensor(5.4675, grad_fn=<NllLossBackward0>) 275 LOSS DIFF: tensor(5.3978, grad_fn=<NllLossBackward0>) tensor(5.2620, 
grad_fn=<NllLossBackward0>) 276 LOSS DIFF: tensor(5.4689, grad_fn=<NllLossBackward0>) tensor(5.3978, grad_fn=<NllLossBackward0>) 277 LOSS DIFF: tensor(5.4733, grad_fn=<NllLossBackward0>) tensor(5.4689, grad_fn=<NllLossBackward0>) 278 LOSS DIFF: tensor(5.5054, grad_fn=<NllLossBackward0>) tensor(5.4733, grad_fn=<NllLossBackward0>) 279 LOSS DIFF: tensor(5.4809, grad_fn=<NllLossBackward0>) tensor(5.4288, grad_fn=<NllLossBackward0>) 280 LOSS DIFF: tensor(5.5698, grad_fn=<NllLossBackward0>) tensor(5.4809, grad_fn=<NllLossBackward0>) 281 LOSS DIFF: tensor(5.5550, grad_fn=<NllLossBackward0>) tensor(5.4103, grad_fn=<NllLossBackward0>) 282 LOSS DIFF: tensor(5.5803, grad_fn=<NllLossBackward0>) tensor(5.5550, grad_fn=<NllLossBackward0>) 283 LOSS DIFF: tensor(5.5616, grad_fn=<NllLossBackward0>) tensor(5.4858, grad_fn=<NllLossBackward0>) 284 LOSS DIFF: tensor(5.4863, grad_fn=<NllLossBackward0>) tensor(5.3357, grad_fn=<NllLossBackward0>) 285 LOSS DIFF: tensor(5.3506, grad_fn=<NllLossBackward0>) tensor(5.2871, grad_fn=<NllLossBackward0>) 286 LOSS DIFF: tensor(5.6320, grad_fn=<NllLossBackward0>) tensor(5.3506, grad_fn=<NllLossBackward0>) 287 LOSS DIFF: tensor(5.4488, grad_fn=<NllLossBackward0>) tensor(5.4314, grad_fn=<NllLossBackward0>) 288 LOSS DIFF: tensor(5.4596, grad_fn=<NllLossBackward0>) tensor(5.4488, grad_fn=<NllLossBackward0>) 289 LOSS DIFF: tensor(5.5325, grad_fn=<NllLossBackward0>) tensor(5.4596, grad_fn=<NllLossBackward0>) 290 LOSS DIFF: tensor(5.4566, grad_fn=<NllLossBackward0>) tensor(5.2072, grad_fn=<NllLossBackward0>) 291 LOSS DIFF: tensor(5.4784, grad_fn=<NllLossBackward0>) tensor(5.4303, grad_fn=<NllLossBackward0>) 292 LOSS DIFF: tensor(5.4439, grad_fn=<NllLossBackward0>) tensor(5.3270, grad_fn=<NllLossBackward0>) 293 LOSS DIFF: tensor(5.5160, grad_fn=<NllLossBackward0>) tensor(5.4439, grad_fn=<NllLossBackward0>) 294 LOSS DIFF: tensor(5.4134, grad_fn=<NllLossBackward0>) tensor(5.3536, grad_fn=<NllLossBackward0>) 295 LOSS DIFF: tensor(5.4426, 
grad_fn=<NllLossBackward0>) tensor(5.4134, grad_fn=<NllLossBackward0>) 296 LOSS DIFF: tensor(5.3758, grad_fn=<NllLossBackward0>) tensor(5.3700, grad_fn=<NllLossBackward0>) 297 LOSS DIFF: tensor(5.5559, grad_fn=<NllLossBackward0>) tensor(5.3758, grad_fn=<NllLossBackward0>) 600 tensor(5.4824, grad_fn=<NllLossBackward0>) 298 LOSS DIFF: tensor(5.3795, grad_fn=<NllLossBackward0>) tensor(5.3762, grad_fn=<NllLossBackward0>) 299 LOSS DIFF: tensor(5.3878, grad_fn=<NllLossBackward0>) tensor(5.3795, grad_fn=<NllLossBackward0>) 300 LOSS DIFF: tensor(5.4699, grad_fn=<NllLossBackward0>) tensor(5.3878, grad_fn=<NllLossBackward0>) 301 LOSS DIFF: tensor(5.4967, grad_fn=<NllLossBackward0>) tensor(5.4699, grad_fn=<NllLossBackward0>) 302 LOSS DIFF: tensor(5.5724, grad_fn=<NllLossBackward0>) tensor(5.4967, grad_fn=<NllLossBackward0>) 303 LOSS DIFF: tensor(5.4520, grad_fn=<NllLossBackward0>) tensor(5.4072, grad_fn=<NllLossBackward0>) 304 LOSS DIFF: tensor(5.5089, grad_fn=<NllLossBackward0>) tensor(5.4520, grad_fn=<NllLossBackward0>) 305 LOSS DIFF: tensor(5.5398, grad_fn=<NllLossBackward0>) tensor(5.3168, grad_fn=<NllLossBackward0>) 306 LOSS DIFF: tensor(5.3561, grad_fn=<NllLossBackward0>) tensor(5.3058, grad_fn=<NllLossBackward0>) 307 LOSS DIFF: tensor(5.4668, grad_fn=<NllLossBackward0>) tensor(5.3448, grad_fn=<NllLossBackward0>) 308 LOSS DIFF: tensor(5.4964, grad_fn=<NllLossBackward0>) tensor(5.4668, grad_fn=<NllLossBackward0>) 309 LOSS DIFF: tensor(5.4440, grad_fn=<NllLossBackward0>) tensor(5.3221, grad_fn=<NllLossBackward0>) 310 LOSS DIFF: tensor(5.4516, grad_fn=<NllLossBackward0>) tensor(5.4289, grad_fn=<NllLossBackward0>) 311 LOSS DIFF: tensor(5.4969, grad_fn=<NllLossBackward0>) tensor(5.3983, grad_fn=<NllLossBackward0>) 312 LOSS DIFF: tensor(5.4254, grad_fn=<NllLossBackward0>) tensor(5.3790, grad_fn=<NllLossBackward0>) 313 LOSS DIFF: tensor(5.4874, grad_fn=<NllLossBackward0>) tensor(5.4254, grad_fn=<NllLossBackward0>) 314 LOSS DIFF: tensor(5.3839, grad_fn=<NllLossBackward0>) 
tensor(5.3470, grad_fn=<NllLossBackward0>) 315 LOSS DIFF: tensor(5.5822, grad_fn=<NllLossBackward0>) tensor(5.3839, grad_fn=<NllLossBackward0>) 316 LOSS DIFF: tensor(5.4169, grad_fn=<NllLossBackward0>) tensor(5.3044, grad_fn=<NllLossBackward0>) 317 LOSS DIFF: tensor(5.4778, grad_fn=<NllLossBackward0>) tensor(5.4169, grad_fn=<NllLossBackward0>) 318 LOSS DIFF: tensor(5.3589, grad_fn=<NllLossBackward0>) tensor(5.2238, grad_fn=<NllLossBackward0>) 319 LOSS DIFF: tensor(5.3547, grad_fn=<NllLossBackward0>) tensor(5.3184, grad_fn=<NllLossBackward0>) 320 LOSS DIFF: tensor(5.5022, grad_fn=<NllLossBackward0>) tensor(5.3547, grad_fn=<NllLossBackward0>) 321 LOSS DIFF: tensor(5.4749, grad_fn=<NllLossBackward0>) tensor(5.4294, grad_fn=<NllLossBackward0>) 322 LOSS DIFF: tensor(5.3813, grad_fn=<NllLossBackward0>) tensor(5.3557, grad_fn=<NllLossBackward0>) 323 LOSS DIFF: tensor(5.4019, grad_fn=<NllLossBackward0>) tensor(5.3813, grad_fn=<NllLossBackward0>) 324 LOSS DIFF: tensor(5.7250, grad_fn=<NllLossBackward0>) tensor(5.4019, grad_fn=<NllLossBackward0>) 325 LOSS DIFF: tensor(5.4055, grad_fn=<NllLossBackward0>) tensor(5.3304, grad_fn=<NllLossBackward0>) 326 LOSS DIFF: tensor(5.4721, grad_fn=<NllLossBackward0>) tensor(5.4055, grad_fn=<NllLossBackward0>) 327 LOSS DIFF: tensor(5.4590, grad_fn=<NllLossBackward0>) tensor(5.3773, grad_fn=<NllLossBackward0>) 328 LOSS DIFF: tensor(5.6097, grad_fn=<NllLossBackward0>) tensor(5.4590, grad_fn=<NllLossBackward0>) 329 LOSS DIFF: tensor(5.5304, grad_fn=<NllLossBackward0>) tensor(5.2807, grad_fn=<NllLossBackward0>) 330 LOSS DIFF: tensor(5.4286, grad_fn=<NllLossBackward0>) tensor(5.3879, grad_fn=<NllLossBackward0>) 331 LOSS DIFF: tensor(5.4221, grad_fn=<NllLossBackward0>) tensor(5.2779, grad_fn=<NllLossBackward0>) 332 LOSS DIFF: tensor(5.3690, grad_fn=<NllLossBackward0>) tensor(5.3191, grad_fn=<NllLossBackward0>) 333 LOSS DIFF: tensor(5.3814, grad_fn=<NllLossBackward0>) tensor(5.3690, grad_fn=<NllLossBackward0>) 334 LOSS DIFF: tensor(5.4241, 
grad_fn=<NllLossBackward0>) tensor(5.3760, grad_fn=<NllLossBackward0>) 335 LOSS DIFF: tensor(5.4727, grad_fn=<NllLossBackward0>) tensor(5.4241, grad_fn=<NllLossBackward0>) 336 LOSS DIFF: tensor(5.4216, grad_fn=<NllLossBackward0>) tensor(5.3401, grad_fn=<NllLossBackward0>) 337 LOSS DIFF: tensor(5.4938, grad_fn=<NllLossBackward0>) tensor(5.3908, grad_fn=<NllLossBackward0>) 338 LOSS DIFF: tensor(5.4742, grad_fn=<NllLossBackward0>) tensor(5.3384, grad_fn=<NllLossBackward0>) 339 LOSS DIFF: tensor(5.4628, grad_fn=<NllLossBackward0>) tensor(5.2785, grad_fn=<NllLossBackward0>) 340 LOSS DIFF: tensor(5.5419, grad_fn=<NllLossBackward0>) tensor(5.3019, grad_fn=<NllLossBackward0>) 341 LOSS DIFF: tensor(5.4736, grad_fn=<NllLossBackward0>) tensor(5.3646, grad_fn=<NllLossBackward0>) 342 LOSS DIFF: tensor(5.4150, grad_fn=<NllLossBackward0>) tensor(5.3511, grad_fn=<NllLossBackward0>) 343 LOSS DIFF: tensor(5.4531, grad_fn=<NllLossBackward0>) tensor(5.2982, grad_fn=<NllLossBackward0>) 344 LOSS DIFF: tensor(5.4617, grad_fn=<NllLossBackward0>) tensor(5.4531, grad_fn=<NllLossBackward0>) 345 LOSS DIFF: tensor(5.4939, grad_fn=<NllLossBackward0>) tensor(5.4617, grad_fn=<NllLossBackward0>) 346 LOSS DIFF: tensor(5.4178, grad_fn=<NllLossBackward0>) tensor(5.3127, grad_fn=<NllLossBackward0>) 700 tensor(5.7095, grad_fn=<NllLossBackward0>) 347 LOSS DIFF: tensor(5.7095, grad_fn=<NllLossBackward0>) tensor(5.3593, grad_fn=<NllLossBackward0>) 348 LOSS DIFF: tensor(5.4054, grad_fn=<NllLossBackward0>) tensor(5.3883, grad_fn=<NllLossBackward0>) 349 LOSS DIFF: tensor(5.6016, grad_fn=<NllLossBackward0>) tensor(5.4054, grad_fn=<NllLossBackward0>) 350 LOSS DIFF: tensor(5.4695, grad_fn=<NllLossBackward0>) tensor(5.4424, grad_fn=<NllLossBackward0>) 351 LOSS DIFF: tensor(5.5022, grad_fn=<NllLossBackward0>) tensor(5.4695, grad_fn=<NllLossBackward0>) 352 LOSS DIFF: tensor(5.5172, grad_fn=<NllLossBackward0>) tensor(5.4135, grad_fn=<NllLossBackward0>) 353 LOSS DIFF: tensor(5.5003, grad_fn=<NllLossBackward0>) 
tensor(5.3490, grad_fn=<NllLossBackward0>) 354 LOSS DIFF: tensor(5.3198, grad_fn=<NllLossBackward0>) tensor(5.2805, grad_fn=<NllLossBackward0>) 355 LOSS DIFF: tensor(5.3726, grad_fn=<NllLossBackward0>) tensor(5.3198, grad_fn=<NllLossBackward0>) 356 LOSS DIFF: tensor(5.3992, grad_fn=<NllLossBackward0>) tensor(5.3726, grad_fn=<NllLossBackward0>) 357 LOSS DIFF: tensor(5.5122, grad_fn=<NllLossBackward0>) tensor(5.3992, grad_fn=<NllLossBackward0>) 358 LOSS DIFF: tensor(5.6000, grad_fn=<NllLossBackward0>) tensor(5.3476, grad_fn=<NllLossBackward0>) 359 LOSS DIFF: tensor(5.4421, grad_fn=<NllLossBackward0>) tensor(5.3207, grad_fn=<NllLossBackward0>) 360 LOSS DIFF: tensor(5.6211, grad_fn=<NllLossBackward0>) tensor(5.4421, grad_fn=<NllLossBackward0>) 361 LOSS DIFF: tensor(5.3617, grad_fn=<NllLossBackward0>) tensor(5.3425, grad_fn=<NllLossBackward0>) 362 LOSS DIFF: tensor(5.3828, grad_fn=<NllLossBackward0>) tensor(5.3617, grad_fn=<NllLossBackward0>) 363 LOSS DIFF: tensor(5.4569, grad_fn=<NllLossBackward0>) tensor(5.3828, grad_fn=<NllLossBackward0>) 364 LOSS DIFF: tensor(5.4314, grad_fn=<NllLossBackward0>) tensor(5.2452, grad_fn=<NllLossBackward0>) 365 LOSS DIFF: tensor(5.5384, grad_fn=<NllLossBackward0>) tensor(5.4314, grad_fn=<NllLossBackward0>) 366 LOSS DIFF: tensor(5.4293, grad_fn=<NllLossBackward0>) tensor(5.3797, grad_fn=<NllLossBackward0>) 367 LOSS DIFF: tensor(5.4823, grad_fn=<NllLossBackward0>) tensor(5.4289, grad_fn=<NllLossBackward0>) 368 LOSS DIFF: tensor(5.4602, grad_fn=<NllLossBackward0>) tensor(5.3212, grad_fn=<NllLossBackward0>) 369 LOSS DIFF: tensor(5.4459, grad_fn=<NllLossBackward0>) tensor(5.3457, grad_fn=<NllLossBackward0>) 370 LOSS DIFF: tensor(5.5089, grad_fn=<NllLossBackward0>) tensor(5.3548, grad_fn=<NllLossBackward0>) 371 LOSS DIFF: tensor(5.3639, grad_fn=<NllLossBackward0>) tensor(5.2607, grad_fn=<NllLossBackward0>) 372 LOSS DIFF: tensor(5.4079, grad_fn=<NllLossBackward0>) tensor(5.3639, grad_fn=<NllLossBackward0>) 373 LOSS DIFF: tensor(5.5557, 
grad_fn=<NllLossBackward0>) tensor(5.4079, grad_fn=<NllLossBackward0>) 374 LOSS DIFF: tensor(5.3965, grad_fn=<NllLossBackward0>) tensor(5.3427, grad_fn=<NllLossBackward0>) 375 LOSS DIFF: tensor(5.4149, grad_fn=<NllLossBackward0>) tensor(5.3965, grad_fn=<NllLossBackward0>) 376 LOSS DIFF: tensor(5.3285, grad_fn=<NllLossBackward0>) tensor(5.3265, grad_fn=<NllLossBackward0>) 377 LOSS DIFF: tensor(5.3672, grad_fn=<NllLossBackward0>) tensor(5.3285, grad_fn=<NllLossBackward0>) 378 LOSS DIFF: tensor(5.4523, grad_fn=<NllLossBackward0>) tensor(5.3471, grad_fn=<NllLossBackward0>) 379 LOSS DIFF: tensor(5.4315, grad_fn=<NllLossBackward0>) tensor(5.4231, grad_fn=<NllLossBackward0>) 380 LOSS DIFF: tensor(5.5363, grad_fn=<NllLossBackward0>) tensor(5.4315, grad_fn=<NllLossBackward0>) 381 LOSS DIFF: tensor(5.4404, grad_fn=<NllLossBackward0>) tensor(5.4114, grad_fn=<NllLossBackward0>) 382 LOSS DIFF: tensor(5.2667, grad_fn=<NllLossBackward0>) tensor(5.2283, grad_fn=<NllLossBackward0>) 383 LOSS DIFF: tensor(5.3342, grad_fn=<NllLossBackward0>) tensor(5.2667, grad_fn=<NllLossBackward0>) 384 LOSS DIFF: tensor(5.4847, grad_fn=<NllLossBackward0>) tensor(5.3342, grad_fn=<NllLossBackward0>) 385 LOSS DIFF: tensor(5.5349, grad_fn=<NllLossBackward0>) tensor(5.4847, grad_fn=<NllLossBackward0>) 386 LOSS DIFF: tensor(5.4216, grad_fn=<NllLossBackward0>) tensor(5.2991, grad_fn=<NllLossBackward0>) 387 LOSS DIFF: tensor(5.4483, grad_fn=<NllLossBackward0>) tensor(5.3455, grad_fn=<NllLossBackward0>) 388 LOSS DIFF: tensor(5.4229, grad_fn=<NllLossBackward0>) tensor(5.3271, grad_fn=<NllLossBackward0>) 389 LOSS DIFF: tensor(5.5482, grad_fn=<NllLossBackward0>) tensor(5.4229, grad_fn=<NllLossBackward0>) 390 LOSS DIFF: tensor(5.4596, grad_fn=<NllLossBackward0>) tensor(5.3374, grad_fn=<NllLossBackward0>) 391 LOSS DIFF: tensor(5.4694, grad_fn=<NllLossBackward0>) tensor(5.4596, grad_fn=<NllLossBackward0>) 392 LOSS DIFF: tensor(5.4744, grad_fn=<NllLossBackward0>) tensor(5.3277, grad_fn=<NllLossBackward0>) 393 LOSS 
DIFF: tensor(5.4301, grad_fn=<NllLossBackward0>) tensor(5.3380, grad_fn=<NllLossBackward0>) 394 LOSS DIFF: tensor(5.2605, grad_fn=<NllLossBackward0>) tensor(5.2482, grad_fn=<NllLossBackward0>) 395 LOSS DIFF: tensor(5.4596, grad_fn=<NllLossBackward0>) tensor(5.2605, grad_fn=<NllLossBackward0>) 396 LOSS DIFF: tensor(5.3527, grad_fn=<NllLossBackward0>) tensor(5.2774, grad_fn=<NllLossBackward0>) 397 LOSS DIFF: tensor(5.5415, grad_fn=<NllLossBackward0>) tensor(5.3283, grad_fn=<NllLossBackward0>) 398 LOSS DIFF: tensor(5.5558, grad_fn=<NllLossBackward0>) tensor(5.4762, grad_fn=<NllLossBackward0>) 399 LOSS DIFF: tensor(5.3862, grad_fn=<NllLossBackward0>) tensor(5.3796, grad_fn=<NllLossBackward0>) 400 LOSS DIFF: tensor(5.5006, grad_fn=<NllLossBackward0>) tensor(5.2756, grad_fn=<NllLossBackward0>) 401 LOSS DIFF: tensor(5.4776, grad_fn=<NllLossBackward0>) tensor(5.2884, grad_fn=<NllLossBackward0>) 800 tensor(5.4405, grad_fn=<NllLossBackward0>) 402 LOSS DIFF: tensor(5.5078, grad_fn=<NllLossBackward0>) tensor(5.2731, grad_fn=<NllLossBackward0>) 403 LOSS DIFF: tensor(5.4186, grad_fn=<NllLossBackward0>) tensor(5.3394, grad_fn=<NllLossBackward0>) 404 LOSS DIFF: tensor(5.4645, grad_fn=<NllLossBackward0>) tensor(5.4186, grad_fn=<NllLossBackward0>) 405 LOSS DIFF: tensor(5.3991, grad_fn=<NllLossBackward0>) tensor(5.1863, grad_fn=<NllLossBackward0>) 406 LOSS DIFF: tensor(5.4625, grad_fn=<NllLossBackward0>) tensor(5.3991, grad_fn=<NllLossBackward0>) 407 LOSS DIFF: tensor(5.2887, grad_fn=<NllLossBackward0>) tensor(5.2630, grad_fn=<NllLossBackward0>) 408 LOSS DIFF: tensor(5.3613, grad_fn=<NllLossBackward0>) tensor(5.2887, grad_fn=<NllLossBackward0>) 409 LOSS DIFF: tensor(5.4549, grad_fn=<NllLossBackward0>) tensor(5.3613, grad_fn=<NllLossBackward0>) 410 LOSS DIFF: tensor(5.4254, grad_fn=<NllLossBackward0>) tensor(5.3545, grad_fn=<NllLossBackward0>) 411 LOSS DIFF: tensor(5.4779, grad_fn=<NllLossBackward0>) tensor(5.4254, grad_fn=<NllLossBackward0>) 412 LOSS DIFF: tensor(5.4206, 
grad_fn=<NllLossBackward0>) tensor(5.3494, grad_fn=<NllLossBackward0>) 413 LOSS DIFF: tensor(5.4468, grad_fn=<NllLossBackward0>) tensor(5.3558, grad_fn=<NllLossBackward0>) 414 LOSS DIFF: tensor(5.3703, grad_fn=<NllLossBackward0>) tensor(5.3009, grad_fn=<NllLossBackward0>) 415 LOSS DIFF: tensor(5.4129, grad_fn=<NllLossBackward0>) tensor(5.3703, grad_fn=<NllLossBackward0>) 416 LOSS DIFF: tensor(5.4347, grad_fn=<NllLossBackward0>) tensor(5.3186, grad_fn=<NllLossBackward0>) 417 LOSS DIFF: tensor(5.3410, grad_fn=<NllLossBackward0>) tensor(5.2797, grad_fn=<NllLossBackward0>) 418 LOSS DIFF: tensor(5.4206, grad_fn=<NllLossBackward0>) tensor(5.3410, grad_fn=<NllLossBackward0>) 419 LOSS DIFF: tensor(5.3961, grad_fn=<NllLossBackward0>) tensor(5.3201, grad_fn=<NllLossBackward0>) 420 LOSS DIFF: tensor(5.3999, grad_fn=<NllLossBackward0>) tensor(5.3961, grad_fn=<NllLossBackward0>) 421 LOSS DIFF: tensor(5.4644, grad_fn=<NllLossBackward0>) tensor(5.2622, grad_fn=<NllLossBackward0>) 422 LOSS DIFF: tensor(5.3218, grad_fn=<NllLossBackward0>) tensor(5.3111, grad_fn=<NllLossBackward0>) 423 LOSS DIFF: tensor(5.3554, grad_fn=<NllLossBackward0>) tensor(5.3218, grad_fn=<NllLossBackward0>) 424 LOSS DIFF: tensor(5.4028, grad_fn=<NllLossBackward0>) tensor(5.3554, grad_fn=<NllLossBackward0>) 425 LOSS DIFF: tensor(5.3832, grad_fn=<NllLossBackward0>) tensor(5.3375, grad_fn=<NllLossBackward0>) 426 LOSS DIFF: tensor(5.4313, grad_fn=<NllLossBackward0>) tensor(5.3181, grad_fn=<NllLossBackward0>) 427 LOSS DIFF: tensor(5.4721, grad_fn=<NllLossBackward0>) tensor(5.3831, grad_fn=<NllLossBackward0>) 428 LOSS DIFF: tensor(5.3902, grad_fn=<NllLossBackward0>) tensor(5.2394, grad_fn=<NllLossBackward0>) 429 LOSS DIFF: tensor(5.3492, grad_fn=<NllLossBackward0>) tensor(5.3336, grad_fn=<NllLossBackward0>) 430 LOSS DIFF: tensor(5.3523, grad_fn=<NllLossBackward0>) tensor(5.3492, grad_fn=<NllLossBackward0>) 431 LOSS DIFF: tensor(5.4211, grad_fn=<NllLossBackward0>) tensor(5.3486, grad_fn=<NllLossBackward0>) 432 LOSS 
DIFF: tensor(5.4755, grad_fn=<NllLossBackward0>) tensor(5.2288, grad_fn=<NllLossBackward0>) 433 LOSS DIFF: tensor(5.5728, grad_fn=<NllLossBackward0>) tensor(5.4755, grad_fn=<NllLossBackward0>) 434 LOSS DIFF: tensor(5.3855, grad_fn=<NllLossBackward0>) tensor(5.3527, grad_fn=<NllLossBackward0>) 435 LOSS DIFF: tensor(5.4776, grad_fn=<NllLossBackward0>) tensor(5.3855, grad_fn=<NllLossBackward0>) 436 LOSS DIFF: tensor(5.3750, grad_fn=<NllLossBackward0>) tensor(5.3262, grad_fn=<NllLossBackward0>) 437 LOSS DIFF: tensor(5.3902, grad_fn=<NllLossBackward0>) tensor(5.3750, grad_fn=<NllLossBackward0>) 438 LOSS DIFF: tensor(5.3135, grad_fn=<NllLossBackward0>) tensor(5.2863, grad_fn=<NllLossBackward0>) 439 LOSS DIFF: tensor(5.4483, grad_fn=<NllLossBackward0>) tensor(5.3135, grad_fn=<NllLossBackward0>) 440 LOSS DIFF: tensor(5.3201, grad_fn=<NllLossBackward0>) tensor(5.2603, grad_fn=<NllLossBackward0>) 441 LOSS DIFF: tensor(5.3807, grad_fn=<NllLossBackward0>) tensor(5.3201, grad_fn=<NllLossBackward0>) 442 LOSS DIFF: tensor(5.5009, grad_fn=<NllLossBackward0>) tensor(5.2434, grad_fn=<NllLossBackward0>) 443 LOSS DIFF: tensor(5.4282, grad_fn=<NllLossBackward0>) tensor(5.4278, grad_fn=<NllLossBackward0>) 444 LOSS DIFF: tensor(5.3787, grad_fn=<NllLossBackward0>) tensor(5.3128, grad_fn=<NllLossBackward0>) 445 LOSS DIFF: tensor(5.5917, grad_fn=<NllLossBackward0>) tensor(5.3324, grad_fn=<NllLossBackward0>) 446 LOSS DIFF: tensor(5.4186, grad_fn=<NllLossBackward0>) tensor(5.3144, grad_fn=<NllLossBackward0>) 447 LOSS DIFF: tensor(5.4553, grad_fn=<NllLossBackward0>) tensor(5.4186, grad_fn=<NllLossBackward0>) 448 LOSS DIFF: tensor(5.4903, grad_fn=<NllLossBackward0>) tensor(5.4553, grad_fn=<NllLossBackward0>) 449 LOSS DIFF: tensor(5.4295, grad_fn=<NllLossBackward0>) tensor(5.3503, grad_fn=<NllLossBackward0>) 450 LOSS DIFF: tensor(5.3945, grad_fn=<NllLossBackward0>) tensor(5.3607, grad_fn=<NllLossBackward0>) 451 LOSS DIFF: tensor(5.2822, grad_fn=<NllLossBackward0>) tensor(5.2387, 
grad_fn=<NllLossBackward0>) 452 LOSS DIFF: tensor(5.3334, grad_fn=<NllLossBackward0>) tensor(5.2822, grad_fn=<NllLossBackward0>) 453 LOSS DIFF: tensor(5.4073, grad_fn=<NllLossBackward0>) tensor(5.3334, grad_fn=<NllLossBackward0>) 454 LOSS DIFF: tensor(5.3797, grad_fn=<NllLossBackward0>) tensor(5.3469, grad_fn=<NllLossBackward0>) 455 LOSS DIFF: tensor(5.4848, grad_fn=<NllLossBackward0>) tensor(5.2529, grad_fn=<NllLossBackward0>) 900 tensor(5.3078, grad_fn=<NllLossBackward0>) 456 LOSS DIFF: tensor(5.4695, grad_fn=<NllLossBackward0>) tensor(5.3078, grad_fn=<NllLossBackward0>) 457 LOSS DIFF: tensor(5.4369, grad_fn=<NllLossBackward0>) tensor(5.3834, grad_fn=<NllLossBackward0>) 458 LOSS DIFF: tensor(5.4973, grad_fn=<NllLossBackward0>) tensor(5.4369, grad_fn=<NllLossBackward0>) 459 LOSS DIFF: tensor(5.4526, grad_fn=<NllLossBackward0>) tensor(5.3075, grad_fn=<NllLossBackward0>) 460 LOSS DIFF: tensor(5.4022, grad_fn=<NllLossBackward0>) tensor(5.2870, grad_fn=<NllLossBackward0>) 461 LOSS DIFF: tensor(5.3850, grad_fn=<NllLossBackward0>) tensor(5.2879, grad_fn=<NllLossBackward0>) 462 LOSS DIFF: tensor(5.4370, grad_fn=<NllLossBackward0>) tensor(5.3154, grad_fn=<NllLossBackward0>) 463 LOSS DIFF: tensor(5.4111, grad_fn=<NllLossBackward0>) tensor(5.3927, grad_fn=<NllLossBackward0>) 464 LOSS DIFF: tensor(5.4638, grad_fn=<NllLossBackward0>) tensor(5.4111, grad_fn=<NllLossBackward0>) 465 LOSS DIFF: tensor(5.3719, grad_fn=<NllLossBackward0>) tensor(5.3195, grad_fn=<NllLossBackward0>) 466 LOSS DIFF: tensor(5.4880, grad_fn=<NllLossBackward0>) tensor(5.3719, grad_fn=<NllLossBackward0>) 467 LOSS DIFF: tensor(5.4762, grad_fn=<NllLossBackward0>) tensor(5.4186, grad_fn=<NllLossBackward0>) 468 LOSS DIFF: tensor(5.3155, grad_fn=<NllLossBackward0>) tensor(5.2086, grad_fn=<NllLossBackward0>) 469 LOSS DIFF: tensor(5.4985, grad_fn=<NllLossBackward0>) tensor(5.3155, grad_fn=<NllLossBackward0>) 470 LOSS DIFF: tensor(5.4505, grad_fn=<NllLossBackward0>) tensor(5.3731, grad_fn=<NllLossBackward0>) 471 
LOSS DIFF: tensor(5.4291, grad_fn=<NllLossBackward0>) tensor(5.3408, grad_fn=<NllLossBackward0>) 472 LOSS DIFF: tensor(5.3826, grad_fn=<NllLossBackward0>) tensor(5.3232, grad_fn=<NllLossBackward0>) 473 LOSS DIFF: tensor(5.4152, grad_fn=<NllLossBackward0>) tensor(5.3468, grad_fn=<NllLossBackward0>) 474 LOSS DIFF: tensor(5.4983, grad_fn=<NllLossBackward0>) tensor(5.4152, grad_fn=<NllLossBackward0>) 475 LOSS DIFF: tensor(5.5432, grad_fn=<NllLossBackward0>) tensor(5.3502, grad_fn=<NllLossBackward0>) 476 LOSS DIFF: tensor(5.3989, grad_fn=<NllLossBackward0>) tensor(5.3489, grad_fn=<NllLossBackward0>) 477 LOSS DIFF: tensor(5.4624, grad_fn=<NllLossBackward0>) tensor(5.3761, grad_fn=<NllLossBackward0>) 478 LOSS DIFF: tensor(5.4082, grad_fn=<NllLossBackward0>) tensor(5.4043, grad_fn=<NllLossBackward0>) 479 LOSS DIFF: tensor(5.4074, grad_fn=<NllLossBackward0>) tensor(5.3588, grad_fn=<NllLossBackward0>) 480 LOSS DIFF: tensor(5.4588, grad_fn=<NllLossBackward0>) tensor(5.4074, grad_fn=<NllLossBackward0>) 481 LOSS DIFF: tensor(5.3339, grad_fn=<NllLossBackward0>) tensor(5.2172, grad_fn=<NllLossBackward0>) 482 LOSS DIFF: tensor(5.4468, grad_fn=<NllLossBackward0>) tensor(5.3339, grad_fn=<NllLossBackward0>) 483 LOSS DIFF: tensor(5.4736, grad_fn=<NllLossBackward0>) tensor(5.4024, grad_fn=<NllLossBackward0>) 484 LOSS DIFF: tensor(5.3780, grad_fn=<NllLossBackward0>) tensor(5.3095, grad_fn=<NllLossBackward0>) 485 LOSS DIFF: tensor(5.4251, grad_fn=<NllLossBackward0>) tensor(5.3780, grad_fn=<NllLossBackward0>) 486 LOSS DIFF: tensor(5.4035, grad_fn=<NllLossBackward0>) tensor(5.3474, grad_fn=<NllLossBackward0>) 487 LOSS DIFF: tensor(5.3575, grad_fn=<NllLossBackward0>) tensor(5.2837, grad_fn=<NllLossBackward0>) 488 LOSS DIFF: tensor(5.4629, grad_fn=<NllLossBackward0>) tensor(5.3298, grad_fn=<NllLossBackward0>) 489 LOSS DIFF: tensor(5.4593, grad_fn=<NllLossBackward0>) tensor(5.4124, grad_fn=<NllLossBackward0>) 490 LOSS DIFF: tensor(5.4040, grad_fn=<NllLossBackward0>) tensor(5.3532, 
grad_fn=<NllLossBackward0>) 491 LOSS DIFF: tensor(5.4693, grad_fn=<NllLossBackward0>) tensor(5.4040, grad_fn=<NllLossBackward0>) 492 LOSS DIFF: tensor(5.4201, grad_fn=<NllLossBackward0>) tensor(5.3561, grad_fn=<NllLossBackward0>) 493 LOSS DIFF: tensor(5.4786, grad_fn=<NllLossBackward0>) tensor(5.4201, grad_fn=<NllLossBackward0>) 494 LOSS DIFF: tensor(5.3819, grad_fn=<NllLossBackward0>) tensor(5.3108, grad_fn=<NllLossBackward0>) 495 LOSS DIFF: tensor(5.3170, grad_fn=<NllLossBackward0>) tensor(5.3080, grad_fn=<NllLossBackward0>) 496 LOSS DIFF: tensor(5.3305, grad_fn=<NllLossBackward0>) tensor(5.2931, grad_fn=<NllLossBackward0>) 497 LOSS DIFF: tensor(5.3719, grad_fn=<NllLossBackward0>) tensor(5.3305, grad_fn=<NllLossBackward0>) 498 LOSS DIFF: tensor(5.3756, grad_fn=<NllLossBackward0>) tensor(5.3702, grad_fn=<NllLossBackward0>) 499 LOSS DIFF: tensor(5.4073, grad_fn=<NllLossBackward0>) tensor(5.1951, grad_fn=<NllLossBackward0>) 500 LOSS DIFF: tensor(5.4267, grad_fn=<NllLossBackward0>) tensor(5.3957, grad_fn=<NllLossBackward0>) 501 LOSS DIFF: tensor(5.3842, grad_fn=<NllLossBackward0>) tensor(5.3569, grad_fn=<NllLossBackward0>) 502 LOSS DIFF: tensor(5.4202, grad_fn=<NllLossBackward0>) tensor(5.3842, grad_fn=<NllLossBackward0>) 503 LOSS DIFF: tensor(5.3634, grad_fn=<NllLossBackward0>) tensor(5.2962, grad_fn=<NllLossBackward0>) 504 LOSS DIFF: tensor(5.4654, grad_fn=<NllLossBackward0>) tensor(5.3512, grad_fn=<NllLossBackward0>) 1000 tensor(5.4063, grad_fn=<NllLossBackward0>) 505 LOSS DIFF: tensor(5.4063, grad_fn=<NllLossBackward0>) tensor(5.3712, grad_fn=<NllLossBackward0>) 506 LOSS DIFF: tensor(5.3378, grad_fn=<NllLossBackward0>) tensor(5.2547, grad_fn=<NllLossBackward0>) 507 LOSS DIFF: tensor(5.3185, grad_fn=<NllLossBackward0>) tensor(5.2350, grad_fn=<NllLossBackward0>) 508 LOSS DIFF: tensor(5.3049, grad_fn=<NllLossBackward0>) tensor(5.1821, grad_fn=<NllLossBackward0>) 509 LOSS DIFF: tensor(5.4689, grad_fn=<NllLossBackward0>) tensor(5.3049, grad_fn=<NllLossBackward0>) 510 
LOSS DIFF: tensor(5.1437, grad_fn=<NllLossBackward0>) tensor(5.1380, grad_fn=<NllLossBackward0>) 511 LOSS DIFF: tensor(5.3984, grad_fn=<NllLossBackward0>) tensor(5.1437, grad_fn=<NllLossBackward0>) 512 LOSS DIFF: tensor(5.5009, grad_fn=<NllLossBackward0>) tensor(5.2426, grad_fn=<NllLossBackward0>) 513 LOSS DIFF: tensor(5.3734, grad_fn=<NllLossBackward0>) tensor(5.3096, grad_fn=<NllLossBackward0>) 514 LOSS DIFF: tensor(5.3889, grad_fn=<NllLossBackward0>) tensor(5.3734, grad_fn=<NllLossBackward0>) 515 LOSS DIFF: tensor(5.4053, grad_fn=<NllLossBackward0>) tensor(5.3114, grad_fn=<NllLossBackward0>) 516 LOSS DIFF: tensor(5.3912, grad_fn=<NllLossBackward0>) tensor(5.2357, grad_fn=<NllLossBackward0>) 517 LOSS DIFF: tensor(5.4400, grad_fn=<NllLossBackward0>) tensor(5.3115, grad_fn=<NllLossBackward0>) 518 LOSS DIFF: tensor(5.4756, grad_fn=<NllLossBackward0>) tensor(5.2689, grad_fn=<NllLossBackward0>) 519 LOSS DIFF: tensor(5.3111, grad_fn=<NllLossBackward0>) tensor(5.1618, grad_fn=<NllLossBackward0>) 520 LOSS DIFF: tensor(5.3974, grad_fn=<NllLossBackward0>) tensor(5.3030, grad_fn=<NllLossBackward0>) 521 LOSS DIFF: tensor(5.3955, grad_fn=<NllLossBackward0>) tensor(5.2872, grad_fn=<NllLossBackward0>) 522 LOSS DIFF: tensor(5.4712, grad_fn=<NllLossBackward0>) tensor(5.3863, grad_fn=<NllLossBackward0>) 523 LOSS DIFF: tensor(5.4095, grad_fn=<NllLossBackward0>) tensor(5.3686, grad_fn=<NllLossBackward0>) 524 LOSS DIFF: tensor(5.3285, grad_fn=<NllLossBackward0>) tensor(5.2293, grad_fn=<NllLossBackward0>) 525 LOSS DIFF: tensor(5.3468, grad_fn=<NllLossBackward0>) tensor(5.2348, grad_fn=<NllLossBackward0>) 526 LOSS DIFF: tensor(5.3140, grad_fn=<NllLossBackward0>) tensor(5.2460, grad_fn=<NllLossBackward0>) 527 LOSS DIFF: tensor(5.3772, grad_fn=<NllLossBackward0>) tensor(5.3140, grad_fn=<NllLossBackward0>) 528 LOSS DIFF: tensor(5.3576, grad_fn=<NllLossBackward0>) tensor(5.3363, grad_fn=<NllLossBackward0>) 529 LOSS DIFF: tensor(5.2631, grad_fn=<NllLossBackward0>) tensor(5.2239, 
grad_fn=<NllLossBackward0>) 530 LOSS DIFF: tensor(5.4207, grad_fn=<NllLossBackward0>) tensor(5.2631, grad_fn=<NllLossBackward0>) 531 LOSS DIFF: tensor(5.4238, grad_fn=<NllLossBackward0>) tensor(5.2798, grad_fn=<NllLossBackward0>) 532 LOSS DIFF: tensor(5.4496, grad_fn=<NllLossBackward0>) tensor(5.2819, grad_fn=<NllLossBackward0>) 533 LOSS DIFF: tensor(5.2788, grad_fn=<NllLossBackward0>) tensor(5.2125, grad_fn=<NllLossBackward0>) 534 LOSS DIFF: tensor(5.3159, grad_fn=<NllLossBackward0>) tensor(5.2788, grad_fn=<NllLossBackward0>) 535 LOSS DIFF: tensor(5.3200, grad_fn=<NllLossBackward0>) tensor(5.3159, grad_fn=<NllLossBackward0>) 536 LOSS DIFF: tensor(5.3934, grad_fn=<NllLossBackward0>) tensor(5.3087, grad_fn=<NllLossBackward0>) 537 LOSS DIFF: tensor(5.2843, grad_fn=<NllLossBackward0>) tensor(5.2815, grad_fn=<NllLossBackward0>) 538 LOSS DIFF: tensor(5.5309, grad_fn=<NllLossBackward0>) tensor(5.2377, grad_fn=<NllLossBackward0>) 539 LOSS DIFF: tensor(5.4258, grad_fn=<NllLossBackward0>) tensor(5.3734, grad_fn=<NllLossBackward0>) 540 LOSS DIFF: tensor(5.4562, grad_fn=<NllLossBackward0>) tensor(5.2893, grad_fn=<NllLossBackward0>) 541 LOSS DIFF: tensor(5.3672, grad_fn=<NllLossBackward0>) tensor(5.3331, grad_fn=<NllLossBackward0>) 542 LOSS DIFF: tensor(5.3475, grad_fn=<NllLossBackward0>) tensor(5.3409, grad_fn=<NllLossBackward0>) 543 LOSS DIFF: tensor(5.3826, grad_fn=<NllLossBackward0>) tensor(5.3475, grad_fn=<NllLossBackward0>) 544 LOSS DIFF: tensor(5.4529, grad_fn=<NllLossBackward0>) tensor(5.3826, grad_fn=<NllLossBackward0>) 545 LOSS DIFF: tensor(5.4554, grad_fn=<NllLossBackward0>) tensor(5.3758, grad_fn=<NllLossBackward0>) 546 LOSS DIFF: tensor(5.3725, grad_fn=<NllLossBackward0>) tensor(5.2762, grad_fn=<NllLossBackward0>) 547 LOSS DIFF: tensor(5.3809, grad_fn=<NllLossBackward0>) tensor(5.3140, grad_fn=<NllLossBackward0>) 548 LOSS DIFF: tensor(5.4411, grad_fn=<NllLossBackward0>) tensor(5.3809, grad_fn=<NllLossBackward0>) 1100 tensor(5.2577, grad_fn=<NllLossBackward0>) 549 
LOSS DIFF: tensor(5.3207, grad_fn=<NllLossBackward0>) tensor(5.2233, grad_fn=<NllLossBackward0>) 550 LOSS DIFF: tensor(5.3287, grad_fn=<NllLossBackward0>) tensor(5.3207, grad_fn=<NllLossBackward0>) 551 LOSS DIFF: tensor(5.4455, grad_fn=<NllLossBackward0>) tensor(5.3140, grad_fn=<NllLossBackward0>) 552 LOSS DIFF: tensor(5.3970, grad_fn=<NllLossBackward0>) tensor(5.3160, grad_fn=<NllLossBackward0>) 553 LOSS DIFF: tensor(5.4958, grad_fn=<NllLossBackward0>) tensor(5.3970, grad_fn=<NllLossBackward0>) 554 LOSS DIFF: tensor(5.4289, grad_fn=<NllLossBackward0>) tensor(5.3781, grad_fn=<NllLossBackward0>) 555 LOSS DIFF: tensor(5.3988, grad_fn=<NllLossBackward0>) tensor(5.2830, grad_fn=<NllLossBackward0>) 556 LOSS DIFF: tensor(5.3452, grad_fn=<NllLossBackward0>) tensor(5.3121, grad_fn=<NllLossBackward0>) 557 LOSS DIFF: tensor(5.3707, grad_fn=<NllLossBackward0>) tensor(5.3452, grad_fn=<NllLossBackward0>) 558 LOSS DIFF: tensor(5.4004, grad_fn=<NllLossBackward0>) tensor(5.3490, grad_fn=<NllLossBackward0>) 559 LOSS DIFF: tensor(5.3442, grad_fn=<NllLossBackward0>) tensor(5.2255, grad_fn=<NllLossBackward0>) 560 LOSS DIFF: tensor(5.3311, grad_fn=<NllLossBackward0>) tensor(5.3145, grad_fn=<NllLossBackward0>) 561 LOSS DIFF: tensor(5.4662, grad_fn=<NllLossBackward0>) tensor(5.3171, grad_fn=<NllLossBackward0>) 562 LOSS DIFF: tensor(5.3376, grad_fn=<NllLossBackward0>) tensor(5.3006, grad_fn=<NllLossBackward0>) 563 LOSS DIFF: tensor(5.3617, grad_fn=<NllLossBackward0>) tensor(5.3376, grad_fn=<NllLossBackward0>) 564 LOSS DIFF: tensor(5.3627, grad_fn=<NllLossBackward0>) tensor(5.3617, grad_fn=<NllLossBackward0>) 565 LOSS DIFF: tensor(5.3169, grad_fn=<NllLossBackward0>) tensor(5.2494, grad_fn=<NllLossBackward0>) 566 LOSS DIFF: tensor(5.3391, grad_fn=<NllLossBackward0>) tensor(5.2797, grad_fn=<NllLossBackward0>) 567 LOSS DIFF: tensor(5.3793, grad_fn=<NllLossBackward0>) tensor(5.3391, grad_fn=<NllLossBackward0>) 568 LOSS DIFF: tensor(5.3983, grad_fn=<NllLossBackward0>) tensor(5.3793, 
grad_fn=<NllLossBackward0>) 569 LOSS DIFF: tensor(5.3797, grad_fn=<NllLossBackward0>) tensor(5.1963, grad_fn=<NllLossBackward0>) 570 LOSS DIFF: tensor(5.3978, grad_fn=<NllLossBackward0>) tensor(5.3797, grad_fn=<NllLossBackward0>) 571 LOSS DIFF: tensor(5.4648, grad_fn=<NllLossBackward0>) tensor(5.2794, grad_fn=<NllLossBackward0>) 572 LOSS DIFF: tensor(5.3364, grad_fn=<NllLossBackward0>) tensor(5.3139, grad_fn=<NllLossBackward0>) 573 LOSS DIFF: tensor(5.3724, grad_fn=<NllLossBackward0>) tensor(5.3364, grad_fn=<NllLossBackward0>) 574 LOSS DIFF: tensor(5.4125, grad_fn=<NllLossBackward0>) tensor(5.3724, grad_fn=<NllLossBackward0>) 575 LOSS DIFF: tensor(5.4216, grad_fn=<NllLossBackward0>) tensor(5.3249, grad_fn=<NllLossBackward0>) 576 LOSS DIFF: tensor(5.3209, grad_fn=<NllLossBackward0>) tensor(5.2087, grad_fn=<NllLossBackward0>) 577 LOSS DIFF: tensor(5.2730, grad_fn=<NllLossBackward0>) tensor(5.2515, grad_fn=<NllLossBackward0>) 578 LOSS DIFF: tensor(5.3871, grad_fn=<NllLossBackward0>) tensor(5.2537, grad_fn=<NllLossBackward0>) 579 LOSS DIFF: tensor(5.2357, grad_fn=<NllLossBackward0>) tensor(5.1883, grad_fn=<NllLossBackward0>) 580 LOSS DIFF: tensor(5.4435, grad_fn=<NllLossBackward0>) tensor(5.2357, grad_fn=<NllLossBackward0>) 581 LOSS DIFF: tensor(5.3116, grad_fn=<NllLossBackward0>) tensor(5.2408, grad_fn=<NllLossBackward0>) 582 LOSS DIFF: tensor(5.4295, grad_fn=<NllLossBackward0>) tensor(5.3116, grad_fn=<NllLossBackward0>) 583 LOSS DIFF: tensor(5.3725, grad_fn=<NllLossBackward0>) tensor(5.2704, grad_fn=<NllLossBackward0>) 584 LOSS DIFF: tensor(5.3951, grad_fn=<NllLossBackward0>) tensor(5.3211, grad_fn=<NllLossBackward0>) 585 LOSS DIFF: tensor(5.4080, grad_fn=<NllLossBackward0>) tensor(5.3951, grad_fn=<NllLossBackward0>) 586 LOSS DIFF: tensor(5.3569, grad_fn=<NllLossBackward0>) tensor(5.2900, grad_fn=<NllLossBackward0>) 587 LOSS DIFF: tensor(5.3004, grad_fn=<NllLossBackward0>) tensor(5.2806, grad_fn=<NllLossBackward0>) 588 LOSS DIFF: tensor(5.3874, 
grad_fn=<NllLossBackward0>) tensor(5.3004, grad_fn=<NllLossBackward0>) 589 LOSS DIFF: tensor(5.4849, grad_fn=<NllLossBackward0>) tensor(5.2921, grad_fn=<NllLossBackward0>) 590 LOSS DIFF: tensor(5.2856, grad_fn=<NllLossBackward0>) tensor(5.2661, grad_fn=<NllLossBackward0>) 591 LOSS DIFF: tensor(5.4242, grad_fn=<NllLossBackward0>) tensor(5.2856, grad_fn=<NllLossBackward0>) 592 LOSS DIFF: tensor(5.2910, grad_fn=<NllLossBackward0>) tensor(5.1762, grad_fn=<NllLossBackward0>) 593 LOSS DIFF: tensor(5.3048, grad_fn=<NllLossBackward0>) tensor(5.1369, grad_fn=<NllLossBackward0>) 594 LOSS DIFF: tensor(5.3170, grad_fn=<NllLossBackward0>) tensor(5.3048, grad_fn=<NllLossBackward0>) 595 LOSS DIFF: tensor(5.4164, grad_fn=<NllLossBackward0>) tensor(5.3170, grad_fn=<NllLossBackward0>) 1200 tensor(5.2414, grad_fn=<NllLossBackward0>) 596 LOSS DIFF: tensor(5.4063, grad_fn=<NllLossBackward0>) tensor(5.2414, grad_fn=<NllLossBackward0>) 597 LOSS DIFF: tensor(5.3547, grad_fn=<NllLossBackward0>) tensor(5.2150, grad_fn=<NllLossBackward0>) 598 LOSS DIFF: tensor(5.2713, grad_fn=<NllLossBackward0>) tensor(5.2182, grad_fn=<NllLossBackward0>) 599 LOSS DIFF: tensor(5.2934, grad_fn=<NllLossBackward0>) tensor(5.2713, grad_fn=<NllLossBackward0>) 600 LOSS DIFF: tensor(5.3680, grad_fn=<NllLossBackward0>) tensor(5.2934, grad_fn=<NllLossBackward0>) 601 LOSS DIFF: tensor(5.3810, grad_fn=<NllLossBackward0>) tensor(5.2937, grad_fn=<NllLossBackward0>) 602 LOSS DIFF: tensor(5.2992, grad_fn=<NllLossBackward0>) tensor(5.2390, grad_fn=<NllLossBackward0>) 603 LOSS DIFF: tensor(5.3592, grad_fn=<NllLossBackward0>) tensor(5.2325, grad_fn=<NllLossBackward0>) 604 LOSS DIFF: tensor(5.4165, grad_fn=<NllLossBackward0>) tensor(5.2317, grad_fn=<NllLossBackward0>) 605 LOSS DIFF: tensor(5.5033, grad_fn=<NllLossBackward0>) tensor(5.4165, grad_fn=<NllLossBackward0>) 606 LOSS DIFF: tensor(5.4137, grad_fn=<NllLossBackward0>) tensor(5.1996, grad_fn=<NllLossBackward0>) 607 LOSS DIFF: tensor(5.5262, grad_fn=<NllLossBackward0>) 
tensor(5.4137, grad_fn=<NllLossBackward0>) 608 LOSS DIFF: tensor(5.3964, grad_fn=<NllLossBackward0>) tensor(5.3314, grad_fn=<NllLossBackward0>) 609 LOSS DIFF: tensor(5.3722, grad_fn=<NllLossBackward0>) tensor(5.3268, grad_fn=<NllLossBackward0>) 610 LOSS DIFF: tensor(5.3378, grad_fn=<NllLossBackward0>) tensor(5.3186, grad_fn=<NllLossBackward0>) 611 LOSS DIFF: tensor(5.4699, grad_fn=<NllLossBackward0>) tensor(5.3378, grad_fn=<NllLossBackward0>) 612 LOSS DIFF: tensor(5.4191, grad_fn=<NllLossBackward0>) tensor(5.3715, grad_fn=<NllLossBackward0>) 613 LOSS DIFF: tensor(5.3107, grad_fn=<NllLossBackward0>) tensor(5.2864, grad_fn=<NllLossBackward0>) 614 LOSS DIFF: tensor(5.3746, grad_fn=<NllLossBackward0>) tensor(5.2844, grad_fn=<NllLossBackward0>) 615 LOSS DIFF: tensor(5.4486, grad_fn=<NllLossBackward0>) tensor(5.3746, grad_fn=<NllLossBackward0>) 616 LOSS DIFF: tensor(5.4732, grad_fn=<NllLossBackward0>) tensor(5.4486, grad_fn=<NllLossBackward0>) 617 LOSS DIFF: tensor(5.3487, grad_fn=<NllLossBackward0>) tensor(5.2559, grad_fn=<NllLossBackward0>) 618 LOSS DIFF: tensor(5.3737, grad_fn=<NllLossBackward0>) tensor(5.3487, grad_fn=<NllLossBackward0>) 619 LOSS DIFF: tensor(5.3524, grad_fn=<NllLossBackward0>) tensor(5.3056, grad_fn=<NllLossBackward0>) 620 LOSS DIFF: tensor(5.4119, grad_fn=<NllLossBackward0>) tensor(5.3524, grad_fn=<NllLossBackward0>) 621 LOSS DIFF: tensor(5.3877, grad_fn=<NllLossBackward0>) tensor(5.3544, grad_fn=<NllLossBackward0>) 622 LOSS DIFF: tensor(5.3305, grad_fn=<NllLossBackward0>) tensor(5.3165, grad_fn=<NllLossBackward0>) 623 LOSS DIFF: tensor(5.4056, grad_fn=<NllLossBackward0>) tensor(5.3305, grad_fn=<NllLossBackward0>) 624 LOSS DIFF: tensor(5.3550, grad_fn=<NllLossBackward0>) tensor(5.3069, grad_fn=<NllLossBackward0>) 625 LOSS DIFF: tensor(5.3018, grad_fn=<NllLossBackward0>) tensor(5.2306, grad_fn=<NllLossBackward0>) 626 LOSS DIFF: tensor(5.3613, grad_fn=<NllLossBackward0>) tensor(5.3018, grad_fn=<NllLossBackward0>) 627 LOSS DIFF: tensor(5.3056, 
grad_fn=<NllLossBackward0>) tensor(5.2849, grad_fn=<NllLossBackward0>) 628 LOSS DIFF: tensor(5.4281, grad_fn=<NllLossBackward0>) tensor(5.1398, grad_fn=<NllLossBackward0>) 629 LOSS DIFF: tensor(5.3037, grad_fn=<NllLossBackward0>) tensor(5.2343, grad_fn=<NllLossBackward0>) 630 LOSS DIFF: tensor(5.3630, grad_fn=<NllLossBackward0>) tensor(5.2993, grad_fn=<NllLossBackward0>) 631 LOSS DIFF: tensor(5.3922, grad_fn=<NllLossBackward0>) tensor(5.3630, grad_fn=<NllLossBackward0>) 632 LOSS DIFF: tensor(5.3583, grad_fn=<NllLossBackward0>) tensor(5.2346, grad_fn=<NllLossBackward0>) 633 LOSS DIFF: tensor(5.3638, grad_fn=<NllLossBackward0>) tensor(5.3486, grad_fn=<NllLossBackward0>) 634 LOSS DIFF: tensor(5.2703, grad_fn=<NllLossBackward0>) tensor(5.2605, grad_fn=<NllLossBackward0>) 635 LOSS DIFF: tensor(5.3341, grad_fn=<NllLossBackward0>) tensor(5.2703, grad_fn=<NllLossBackward0>) 636 LOSS DIFF: tensor(5.3615, grad_fn=<NllLossBackward0>) tensor(5.3341, grad_fn=<NllLossBackward0>) 637 LOSS DIFF: tensor(5.3735, grad_fn=<NllLossBackward0>) tensor(5.3225, grad_fn=<NllLossBackward0>) 638 LOSS DIFF: tensor(5.3535, grad_fn=<NllLossBackward0>) tensor(5.2765, grad_fn=<NllLossBackward0>) 639 LOSS DIFF: tensor(5.4068, grad_fn=<NllLossBackward0>) tensor(5.3535, grad_fn=<NllLossBackward0>) 640 LOSS DIFF: tensor(5.3669, grad_fn=<NllLossBackward0>) tensor(5.2441, grad_fn=<NllLossBackward0>) 641 LOSS DIFF: tensor(5.3348, grad_fn=<NllLossBackward0>) tensor(5.2892, grad_fn=<NllLossBackward0>) 642 LOSS DIFF: tensor(5.4134, grad_fn=<NllLossBackward0>) tensor(5.3348, grad_fn=<NllLossBackward0>) 643 LOSS DIFF: tensor(5.3649, grad_fn=<NllLossBackward0>) tensor(5.3365, grad_fn=<NllLossBackward0>) 644 LOSS DIFF: tensor(5.3606, grad_fn=<NllLossBackward0>) tensor(5.2532, grad_fn=<NllLossBackward0>) 645 LOSS DIFF: tensor(5.3622, grad_fn=<NllLossBackward0>) tensor(5.2414, grad_fn=<NllLossBackward0>) 646 LOSS DIFF: tensor(5.3985, grad_fn=<NllLossBackward0>) tensor(5.3297, grad_fn=<NllLossBackward0>) 1300 
tensor(5.2993, grad_fn=<NllLossBackward0>) 647 LOSS DIFF: tensor(5.2993, grad_fn=<NllLossBackward0>) tensor(5.2568, grad_fn=<NllLossBackward0>) 648 LOSS DIFF: tensor(5.3153, grad_fn=<NllLossBackward0>) tensor(5.2993, grad_fn=<NllLossBackward0>) 649 LOSS DIFF: tensor(5.3619, grad_fn=<NllLossBackward0>) tensor(5.2734, grad_fn=<NllLossBackward0>) 650 LOSS DIFF: tensor(5.4052, grad_fn=<NllLossBackward0>) tensor(5.2523, grad_fn=<NllLossBackward0>) 651 LOSS DIFF: tensor(5.3573, grad_fn=<NllLossBackward0>) tensor(5.3209, grad_fn=<NllLossBackward0>) 652 LOSS DIFF: tensor(5.2472, grad_fn=<NllLossBackward0>) tensor(5.2427, grad_fn=<NllLossBackward0>) 653 LOSS DIFF: tensor(5.4110, grad_fn=<NllLossBackward0>) tensor(5.2472, grad_fn=<NllLossBackward0>) 654 LOSS DIFF: tensor(5.2660, grad_fn=<NllLossBackward0>) tensor(5.2397, grad_fn=<NllLossBackward0>) 655 LOSS DIFF: tensor(5.3451, grad_fn=<NllLossBackward0>) tensor(5.2660, grad_fn=<NllLossBackward0>) 656 LOSS DIFF: tensor(5.2828, grad_fn=<NllLossBackward0>) tensor(5.1689, grad_fn=<NllLossBackward0>) 657 LOSS DIFF: tensor(5.3989, grad_fn=<NllLossBackward0>) tensor(5.2828, grad_fn=<NllLossBackward0>) 658 LOSS DIFF: tensor(5.3128, grad_fn=<NllLossBackward0>) tensor(5.2708, grad_fn=<NllLossBackward0>) 659 LOSS DIFF: tensor(5.2602, grad_fn=<NllLossBackward0>) tensor(5.2357, grad_fn=<NllLossBackward0>) 660 LOSS DIFF: tensor(5.3591, grad_fn=<NllLossBackward0>) tensor(5.2602, grad_fn=<NllLossBackward0>) 661 LOSS DIFF: tensor(5.4472, grad_fn=<NllLossBackward0>) tensor(5.2953, grad_fn=<NllLossBackward0>) 662 LOSS DIFF: tensor(5.2631, grad_fn=<NllLossBackward0>) tensor(5.1217, grad_fn=<NllLossBackward0>) 663 LOSS DIFF: tensor(5.3468, grad_fn=<NllLossBackward0>) tensor(5.2631, grad_fn=<NllLossBackward0>) 664 LOSS DIFF: tensor(5.3112, grad_fn=<NllLossBackward0>) tensor(5.1798, grad_fn=<NllLossBackward0>) 665 LOSS DIFF: tensor(5.4536, grad_fn=<NllLossBackward0>) tensor(5.3112, grad_fn=<NllLossBackward0>) 666 LOSS DIFF: tensor(5.2946, 
grad_fn=<NllLossBackward0>) tensor(5.2031, grad_fn=<NllLossBackward0>) 667 LOSS DIFF: tensor(5.3658, grad_fn=<NllLossBackward0>) tensor(5.2946, grad_fn=<NllLossBackward0>) 668 LOSS DIFF: tensor(5.3176, grad_fn=<NllLossBackward0>) tensor(5.3126, grad_fn=<NllLossBackward0>) 669 LOSS DIFF: tensor(5.3397, grad_fn=<NllLossBackward0>) tensor(5.2761, grad_fn=<NllLossBackward0>) 670 LOSS DIFF: tensor(5.3414, grad_fn=<NllLossBackward0>) tensor(5.1992, grad_fn=<NllLossBackward0>) 671 LOSS DIFF: tensor(5.3593, grad_fn=<NllLossBackward0>) tensor(5.2940, grad_fn=<NllLossBackward0>) 672 LOSS DIFF: tensor(5.3734, grad_fn=<NllLossBackward0>) tensor(5.3593, grad_fn=<NllLossBackward0>) 673 LOSS DIFF: tensor(5.3879, grad_fn=<NllLossBackward0>) tensor(5.3734, grad_fn=<NllLossBackward0>) 674 LOSS DIFF: tensor(5.4095, grad_fn=<NllLossBackward0>) tensor(5.3879, grad_fn=<NllLossBackward0>) 675 LOSS DIFF: tensor(5.3731, grad_fn=<NllLossBackward0>) tensor(5.3149, grad_fn=<NllLossBackward0>) 676 LOSS DIFF: tensor(5.3762, grad_fn=<NllLossBackward0>) tensor(5.2030, grad_fn=<NllLossBackward0>) 677 LOSS DIFF: tensor(5.3640, grad_fn=<NllLossBackward0>) tensor(5.2093, grad_fn=<NllLossBackward0>) 678 LOSS DIFF: tensor(5.3913, grad_fn=<NllLossBackward0>) tensor(5.3640, grad_fn=<NllLossBackward0>) 679 LOSS DIFF: tensor(5.3979, grad_fn=<NllLossBackward0>) tensor(5.3913, grad_fn=<NllLossBackward0>) 680 LOSS DIFF: tensor(5.3584, grad_fn=<NllLossBackward0>) tensor(5.2680, grad_fn=<NllLossBackward0>) 681 LOSS DIFF: tensor(5.3767, grad_fn=<NllLossBackward0>) tensor(5.3584, grad_fn=<NllLossBackward0>) 682 LOSS DIFF: tensor(5.3828, grad_fn=<NllLossBackward0>) tensor(5.2542, grad_fn=<NllLossBackward0>) 683 LOSS DIFF: tensor(5.3277, grad_fn=<NllLossBackward0>) tensor(5.2771, grad_fn=<NllLossBackward0>) 684 LOSS DIFF: tensor(5.2910, grad_fn=<NllLossBackward0>) tensor(5.2756, grad_fn=<NllLossBackward0>) 685 LOSS DIFF: tensor(5.3150, grad_fn=<NllLossBackward0>) tensor(5.2910, grad_fn=<NllLossBackward0>) 686 LOSS 
DIFF: tensor(5.3208, grad_fn=<NllLossBackward0>) tensor(5.3150, grad_fn=<NllLossBackward0>) 687 LOSS DIFF: tensor(5.4099, grad_fn=<NllLossBackward0>) tensor(5.1751, grad_fn=<NllLossBackward0>) 688 LOSS DIFF: tensor(5.3103, grad_fn=<NllLossBackward0>) tensor(5.1557, grad_fn=<NllLossBackward0>) 689 LOSS DIFF: tensor(5.2464, grad_fn=<NllLossBackward0>) tensor(5.2038, grad_fn=<NllLossBackward0>) 690 LOSS DIFF: tensor(5.4148, grad_fn=<NllLossBackward0>) tensor(5.2464, grad_fn=<NllLossBackward0>) 691 LOSS DIFF: tensor(5.3898, grad_fn=<NllLossBackward0>) tensor(5.1863, grad_fn=<NllLossBackward0>) 692 LOSS DIFF: tensor(5.3926, grad_fn=<NllLossBackward0>) tensor(5.3898, grad_fn=<NllLossBackward0>) 693 LOSS DIFF: tensor(5.3975, grad_fn=<NllLossBackward0>) tensor(5.2156, grad_fn=<NllLossBackward0>) 694 LOSS DIFF: tensor(5.2680, grad_fn=<NllLossBackward0>) tensor(5.2367, grad_fn=<NllLossBackward0>) 695 LOSS DIFF: tensor(5.4590, grad_fn=<NllLossBackward0>) tensor(5.1675, grad_fn=<NllLossBackward0>) 696 LOSS DIFF: tensor(5.3168, grad_fn=<NllLossBackward0>) tensor(5.2447, grad_fn=<NllLossBackward0>) 697 LOSS DIFF: tensor(5.3581, grad_fn=<NllLossBackward0>) tensor(5.2256, grad_fn=<NllLossBackward0>) 698 LOSS DIFF: tensor(5.3668, grad_fn=<NllLossBackward0>) tensor(5.3399, grad_fn=<NllLossBackward0>) 1400 tensor(5.4240, grad_fn=<NllLossBackward0>) 699 LOSS DIFF: tensor(5.4240, grad_fn=<NllLossBackward0>) tensor(5.2860, grad_fn=<NllLossBackward0>) 700 LOSS DIFF: tensor(5.4507, grad_fn=<NllLossBackward0>) tensor(5.2273, grad_fn=<NllLossBackward0>) 701 LOSS DIFF: tensor(5.3034, grad_fn=<NllLossBackward0>) tensor(5.2823, grad_fn=<NllLossBackward0>) 702 LOSS DIFF: tensor(5.3641, grad_fn=<NllLossBackward0>) tensor(5.2678, grad_fn=<NllLossBackward0>) 703 LOSS DIFF: tensor(5.3712, grad_fn=<NllLossBackward0>) tensor(5.3641, grad_fn=<NllLossBackward0>) 704 LOSS DIFF: tensor(5.3199, grad_fn=<NllLossBackward0>) tensor(5.2634, grad_fn=<NllLossBackward0>) 705 LOSS DIFF: tensor(5.2937, 
grad_fn=<NllLossBackward0>) tensor(5.2929, grad_fn=<NllLossBackward0>) 706 LOSS DIFF: tensor(5.4281, grad_fn=<NllLossBackward0>) tensor(5.2937, grad_fn=<NllLossBackward0>) 707 LOSS DIFF: tensor(5.3490, grad_fn=<NllLossBackward0>) tensor(5.2559, grad_fn=<NllLossBackward0>) 708 LOSS DIFF: tensor(5.2956, grad_fn=<NllLossBackward0>) tensor(5.2263, grad_fn=<NllLossBackward0>) 709 LOSS DIFF: tensor(5.3573, grad_fn=<NllLossBackward0>) tensor(5.2956, grad_fn=<NllLossBackward0>) 710 LOSS DIFF: tensor(5.2388, grad_fn=<NllLossBackward0>) tensor(5.1368, grad_fn=<NllLossBackward0>) 711 LOSS DIFF: tensor(5.4568, grad_fn=<NllLossBackward0>) tensor(5.2388, grad_fn=<NllLossBackward0>) 712 LOSS DIFF: tensor(5.3657, grad_fn=<NllLossBackward0>) tensor(5.2206, grad_fn=<NllLossBackward0>) 713 LOSS DIFF: tensor(5.3937, grad_fn=<NllLossBackward0>) tensor(5.3657, grad_fn=<NllLossBackward0>) 714 LOSS DIFF: tensor(5.3151, grad_fn=<NllLossBackward0>) tensor(5.2181, grad_fn=<NllLossBackward0>) 715 LOSS DIFF: tensor(5.3477, grad_fn=<NllLossBackward0>) tensor(5.3151, grad_fn=<NllLossBackward0>) 716 LOSS DIFF: tensor(5.3319, grad_fn=<NllLossBackward0>) tensor(5.2977, grad_fn=<NllLossBackward0>) 717 LOSS DIFF: tensor(5.2638, grad_fn=<NllLossBackward0>) tensor(5.1780, grad_fn=<NllLossBackward0>) 718 LOSS DIFF: tensor(5.2669, grad_fn=<NllLossBackward0>) tensor(5.2638, grad_fn=<NllLossBackward0>) 719 LOSS DIFF: tensor(5.2977, grad_fn=<NllLossBackward0>) tensor(5.2669, grad_fn=<NllLossBackward0>) 720 LOSS DIFF: tensor(5.4203, grad_fn=<NllLossBackward0>) tensor(5.2977, grad_fn=<NllLossBackward0>) 721 LOSS DIFF: tensor(5.3931, grad_fn=<NllLossBackward0>) tensor(5.3073, grad_fn=<NllLossBackward0>) 722 LOSS DIFF: tensor(5.2668, grad_fn=<NllLossBackward0>) tensor(5.2528, grad_fn=<NllLossBackward0>) 723 LOSS DIFF: tensor(5.2713, grad_fn=<NllLossBackward0>) tensor(5.2102, grad_fn=<NllLossBackward0>) 724 LOSS DIFF: tensor(5.4657, grad_fn=<NllLossBackward0>) tensor(5.2713, grad_fn=<NllLossBackward0>) 725 LOSS 
DIFF: tensor(5.3160, grad_fn=<NllLossBackward0>) tensor(5.2097, grad_fn=<NllLossBackward0>) 726 LOSS DIFF: tensor(5.2945, grad_fn=<NllLossBackward0>) tensor(5.2223, grad_fn=<NllLossBackward0>) 727 LOSS DIFF: tensor(5.2871, grad_fn=<NllLossBackward0>) tensor(5.2417, grad_fn=<NllLossBackward0>) 728 LOSS DIFF: tensor(5.3049, grad_fn=<NllLossBackward0>) tensor(5.2871, grad_fn=<NllLossBackward0>) 729 LOSS DIFF: tensor(5.2566, grad_fn=<NllLossBackward0>) tensor(5.2405, grad_fn=<NllLossBackward0>) 730 LOSS DIFF: tensor(5.3831, grad_fn=<NllLossBackward0>) tensor(5.2566, grad_fn=<NllLossBackward0>) 731 LOSS DIFF: tensor(5.3322, grad_fn=<NllLossBackward0>) tensor(5.2234, grad_fn=<NllLossBackward0>) 732 LOSS DIFF: tensor(5.3731, grad_fn=<NllLossBackward0>) tensor(5.2365, grad_fn=<NllLossBackward0>) 733 LOSS DIFF: tensor(5.4400, grad_fn=<NllLossBackward0>) tensor(5.3731, grad_fn=<NllLossBackward0>) 734 LOSS DIFF: tensor(5.4715, grad_fn=<NllLossBackward0>) tensor(5.3013, grad_fn=<NllLossBackward0>) 735 LOSS DIFF: tensor(5.4422, grad_fn=<NllLossBackward0>) tensor(5.4010, grad_fn=<NllLossBackward0>) 736 LOSS DIFF: tensor(5.2298, grad_fn=<NllLossBackward0>) tensor(5.2163, grad_fn=<NllLossBackward0>) 737 LOSS DIFF: tensor(5.2493, grad_fn=<NllLossBackward0>) tensor(5.2298, grad_fn=<NllLossBackward0>) 738 LOSS DIFF: tensor(5.2958, grad_fn=<NllLossBackward0>) tensor(5.2493, grad_fn=<NllLossBackward0>) 739 LOSS DIFF: tensor(5.4094, grad_fn=<NllLossBackward0>) tensor(5.2502, grad_fn=<NllLossBackward0>) 740 LOSS DIFF: tensor(5.2576, grad_fn=<NllLossBackward0>) tensor(5.2305, grad_fn=<NllLossBackward0>) 741 LOSS DIFF: tensor(5.3885, grad_fn=<NllLossBackward0>) tensor(5.2576, grad_fn=<NllLossBackward0>) 742 LOSS DIFF: tensor(5.3493, grad_fn=<NllLossBackward0>) tensor(5.3387, grad_fn=<NllLossBackward0>) 743 LOSS DIFF: tensor(5.2640, grad_fn=<NllLossBackward0>) tensor(5.1842, grad_fn=<NllLossBackward0>) 744 LOSS DIFF: tensor(5.3568, grad_fn=<NllLossBackward0>) tensor(5.2640, 
grad_fn=<NllLossBackward0>) 745 LOSS DIFF: tensor(5.4262, grad_fn=<NllLossBackward0>) tensor(5.3232, grad_fn=<NllLossBackward0>) 746 LOSS DIFF: tensor(5.3020, grad_fn=<NllLossBackward0>) tensor(5.2816, grad_fn=<NllLossBackward0>) 1500 tensor(5.1988, grad_fn=<NllLossBackward0>) 747 LOSS DIFF: tensor(5.2921, grad_fn=<NllLossBackward0>) tensor(5.1988, grad_fn=<NllLossBackward0>) 748 LOSS DIFF: tensor(5.3279, grad_fn=<NllLossBackward0>) tensor(5.2921, grad_fn=<NllLossBackward0>) 749 LOSS DIFF: tensor(5.3318, grad_fn=<NllLossBackward0>) tensor(5.0392, grad_fn=<NllLossBackward0>) 750 LOSS DIFF: tensor(5.4100, grad_fn=<NllLossBackward0>) tensor(5.1959, grad_fn=<NllLossBackward0>) 751 LOSS DIFF: tensor(5.2634, grad_fn=<NllLossBackward0>) tensor(5.2334, grad_fn=<NllLossBackward0>) 752 LOSS DIFF: tensor(5.2761, grad_fn=<NllLossBackward0>) tensor(5.2634, grad_fn=<NllLossBackward0>) 753 LOSS DIFF: tensor(5.3743, grad_fn=<NllLossBackward0>) tensor(5.2761, grad_fn=<NllLossBackward0>) 754 LOSS DIFF: tensor(5.4399, grad_fn=<NllLossBackward0>) tensor(5.2495, grad_fn=<NllLossBackward0>) 755 LOSS DIFF: tensor(5.3723, grad_fn=<NllLossBackward0>) tensor(5.2125, grad_fn=<NllLossBackward0>) 756 LOSS DIFF: tensor(5.4313, grad_fn=<NllLossBackward0>) tensor(5.2310, grad_fn=<NllLossBackward0>) 757 LOSS DIFF: tensor(5.3316, grad_fn=<NllLossBackward0>) tensor(5.2243, grad_fn=<NllLossBackward0>) 758 LOSS DIFF: tensor(5.3435, grad_fn=<NllLossBackward0>) tensor(5.3128, grad_fn=<NllLossBackward0>) 759 LOSS DIFF: tensor(5.3396, grad_fn=<NllLossBackward0>) tensor(5.1988, grad_fn=<NllLossBackward0>) 760 LOSS DIFF: tensor(5.3344, grad_fn=<NllLossBackward0>) tensor(5.2798, grad_fn=<NllLossBackward0>) 761 LOSS DIFF: tensor(5.3503, grad_fn=<NllLossBackward0>) tensor(5.2845, grad_fn=<NllLossBackward0>) 762 LOSS DIFF: tensor(5.3522, grad_fn=<NllLossBackward0>) tensor(5.3503, grad_fn=<NllLossBackward0>) 763 LOSS DIFF: tensor(5.2487, grad_fn=<NllLossBackward0>) tensor(5.2103, grad_fn=<NllLossBackward0>) 764 
LOSS DIFF: tensor(5.3914, grad_fn=<NllLossBackward0>) tensor(5.2487, grad_fn=<NllLossBackward0>) 765 LOSS DIFF: tensor(5.3346, grad_fn=<NllLossBackward0>) tensor(5.3265, grad_fn=<NllLossBackward0>) 766 LOSS DIFF: tensor(5.3932, grad_fn=<NllLossBackward0>) tensor(5.2668, grad_fn=<NllLossBackward0>) 767 LOSS DIFF: tensor(5.3308, grad_fn=<NllLossBackward0>) tensor(5.2136, grad_fn=<NllLossBackward0>) 768 LOSS DIFF: tensor(5.2342, grad_fn=<NllLossBackward0>) tensor(5.1842, grad_fn=<NllLossBackward0>) 769 LOSS DIFF: tensor(5.2779, grad_fn=<NllLossBackward0>) tensor(5.2342, grad_fn=<NllLossBackward0>) 770 LOSS DIFF: tensor(5.3309, grad_fn=<NllLossBackward0>) tensor(5.2779, grad_fn=<NllLossBackward0>) 771 LOSS DIFF: tensor(5.2772, grad_fn=<NllLossBackward0>) tensor(5.2208, grad_fn=<NllLossBackward0>) 772 LOSS DIFF: tensor(5.2998, grad_fn=<NllLossBackward0>) tensor(5.2772, grad_fn=<NllLossBackward0>) 773 LOSS DIFF: tensor(5.3198, grad_fn=<NllLossBackward0>) tensor(5.2998, grad_fn=<NllLossBackward0>) 774 LOSS DIFF: tensor(5.4071, grad_fn=<NllLossBackward0>) tensor(5.2555, grad_fn=<NllLossBackward0>) 775 LOSS DIFF: tensor(5.3407, grad_fn=<NllLossBackward0>) tensor(5.2137, grad_fn=<NllLossBackward0>) 776 LOSS DIFF: tensor(5.3168, grad_fn=<NllLossBackward0>) tensor(5.1123, grad_fn=<NllLossBackward0>) 777 LOSS DIFF: tensor(5.3270, grad_fn=<NllLossBackward0>) tensor(5.3168, grad_fn=<NllLossBackward0>) 778 LOSS DIFF: tensor(5.2770, grad_fn=<NllLossBackward0>) tensor(5.1605, grad_fn=<NllLossBackward0>) 779 LOSS DIFF: tensor(5.3174, grad_fn=<NllLossBackward0>) tensor(5.2770, grad_fn=<NllLossBackward0>) 780 LOSS DIFF: tensor(5.5412, grad_fn=<NllLossBackward0>) tensor(5.2626, grad_fn=<NllLossBackward0>) 781 LOSS DIFF: tensor(5.3245, grad_fn=<NllLossBackward0>) tensor(5.2973, grad_fn=<NllLossBackward0>) 782 LOSS DIFF: tensor(5.2911, grad_fn=<NllLossBackward0>) tensor(5.2910, grad_fn=<NllLossBackward0>) 783 LOSS DIFF: tensor(5.3198, grad_fn=<NllLossBackward0>) tensor(5.2911, 
grad_fn=<NllLossBackward0>) 784 LOSS DIFF: tensor(5.2661, grad_fn=<NllLossBackward0>) tensor(5.2297, grad_fn=<NllLossBackward0>) 785 LOSS DIFF: tensor(5.3086, grad_fn=<NllLossBackward0>) tensor(5.2661, grad_fn=<NllLossBackward0>) 786 LOSS DIFF: tensor(5.3143, grad_fn=<NllLossBackward0>) tensor(5.3086, grad_fn=<NllLossBackward0>) 787 LOSS DIFF: tensor(5.3467, grad_fn=<NllLossBackward0>) tensor(5.3143, grad_fn=<NllLossBackward0>) 788 LOSS DIFF: tensor(5.3771, grad_fn=<NllLossBackward0>) tensor(5.3003, grad_fn=<NllLossBackward0>) 789 LOSS DIFF: tensor(5.2802, grad_fn=<NllLossBackward0>) tensor(5.2619, grad_fn=<NllLossBackward0>) 790 LOSS DIFF: tensor(5.3205, grad_fn=<NllLossBackward0>) tensor(5.2489, grad_fn=<NllLossBackward0>) 791 LOSS DIFF: tensor(5.3028, grad_fn=<NllLossBackward0>) tensor(5.1770, grad_fn=<NllLossBackward0>) 792 LOSS DIFF: tensor(5.3130, grad_fn=<NllLossBackward0>) tensor(5.3028, grad_fn=<NllLossBackward0>) 793 LOSS DIFF: tensor(5.2011, grad_fn=<NllLossBackward0>) tensor(5.0365, grad_fn=<NllLossBackward0>) 794 LOSS DIFF: tensor(5.2648, grad_fn=<NllLossBackward0>) tensor(5.2011, grad_fn=<NllLossBackward0>) 795 LOSS DIFF: tensor(5.3135, grad_fn=<NllLossBackward0>) tensor(5.2648, grad_fn=<NllLossBackward0>) 796 LOSS DIFF: tensor(5.3958, grad_fn=<NllLossBackward0>) tensor(5.3135, grad_fn=<NllLossBackward0>) 797 LOSS DIFF: tensor(5.3604, grad_fn=<NllLossBackward0>) tensor(5.1652, grad_fn=<NllLossBackward0>) 1600 tensor(5.3680, grad_fn=<NllLossBackward0>) 798 LOSS DIFF: tensor(5.3680, grad_fn=<NllLossBackward0>) tensor(5.2941, grad_fn=<NllLossBackward0>) 799 LOSS DIFF: tensor(5.2164, grad_fn=<NllLossBackward0>) tensor(5.1485, grad_fn=<NllLossBackward0>) 800 LOSS DIFF: tensor(5.3943, grad_fn=<NllLossBackward0>) tensor(5.2164, grad_fn=<NllLossBackward0>) 801 LOSS DIFF: tensor(5.2456, grad_fn=<NllLossBackward0>) tensor(5.1408, grad_fn=<NllLossBackward0>) 802 LOSS DIFF: tensor(5.2624, grad_fn=<NllLossBackward0>) tensor(5.2268, grad_fn=<NllLossBackward0>) 803 
LOSS DIFF: tensor(5.3054, grad_fn=<NllLossBackward0>) tensor(5.1765, grad_fn=<NllLossBackward0>) 804 LOSS DIFF: tensor(5.3530, grad_fn=<NllLossBackward0>) tensor(5.3054, grad_fn=<NllLossBackward0>) 805 LOSS DIFF: tensor(5.3219, grad_fn=<NllLossBackward0>) tensor(5.2960, grad_fn=<NllLossBackward0>) 806 LOSS DIFF: tensor(5.3445, grad_fn=<NllLossBackward0>) tensor(5.2025, grad_fn=<NllLossBackward0>) 807 LOSS DIFF: tensor(5.4269, grad_fn=<NllLossBackward0>) tensor(5.2403, grad_fn=<NllLossBackward0>) 808 LOSS DIFF: tensor(5.3550, grad_fn=<NllLossBackward0>) tensor(5.2981, grad_fn=<NllLossBackward0>) 809 LOSS DIFF: tensor(5.2882, grad_fn=<NllLossBackward0>) tensor(5.2592, grad_fn=<NllLossBackward0>) 810 LOSS DIFF: tensor(5.3459, grad_fn=<NllLossBackward0>) tensor(5.2882, grad_fn=<NllLossBackward0>) 811 LOSS DIFF: tensor(5.3961, grad_fn=<NllLossBackward0>) tensor(5.2398, grad_fn=<NllLossBackward0>) 812 LOSS DIFF: tensor(5.3464, grad_fn=<NllLossBackward0>) tensor(5.2061, grad_fn=<NllLossBackward0>) 813 LOSS DIFF: tensor(5.4667, grad_fn=<NllLossBackward0>) tensor(5.3051, grad_fn=<NllLossBackward0>) 814 LOSS DIFF: tensor(5.3144, grad_fn=<NllLossBackward0>) tensor(5.2452, grad_fn=<NllLossBackward0>) 815 LOSS DIFF: tensor(5.3118, grad_fn=<NllLossBackward0>) tensor(5.1809, grad_fn=<NllLossBackward0>) 816 LOSS DIFF: tensor(5.2670, grad_fn=<NllLossBackward0>) tensor(5.2661, grad_fn=<NllLossBackward0>) 817 LOSS DIFF: tensor(5.2897, grad_fn=<NllLossBackward0>) tensor(5.2135, grad_fn=<NllLossBackward0>) 818 LOSS DIFF: tensor(5.3138, grad_fn=<NllLossBackward0>) tensor(5.2798, grad_fn=<NllLossBackward0>) 819 LOSS DIFF: tensor(5.3730, grad_fn=<NllLossBackward0>) tensor(5.3138, grad_fn=<NllLossBackward0>) 820 LOSS DIFF: tensor(5.3392, grad_fn=<NllLossBackward0>) tensor(5.3115, grad_fn=<NllLossBackward0>) 821 LOSS DIFF: tensor(5.3534, grad_fn=<NllLossBackward0>) tensor(5.2959, grad_fn=<NllLossBackward0>) 822 LOSS DIFF: tensor(5.3893, grad_fn=<NllLossBackward0>) tensor(5.3500, 
grad_fn=<NllLossBackward0>) 823 LOSS DIFF: tensor(5.2580, grad_fn=<NllLossBackward0>) tensor(5.1436, grad_fn=<NllLossBackward0>) 824 LOSS DIFF: tensor(5.2688, grad_fn=<NllLossBackward0>) tensor(5.2580, grad_fn=<NllLossBackward0>) 825 LOSS DIFF: tensor(5.3212, grad_fn=<NllLossBackward0>) tensor(5.2688, grad_fn=<NllLossBackward0>) 826 LOSS DIFF: tensor(5.3839, grad_fn=<NllLossBackward0>) tensor(5.2897, grad_fn=<NllLossBackward0>) 827 LOSS DIFF: tensor(5.3353, grad_fn=<NllLossBackward0>) tensor(5.2536, grad_fn=<NllLossBackward0>) 828 LOSS DIFF: tensor(5.2735, grad_fn=<NllLossBackward0>) tensor(5.2156, grad_fn=<NllLossBackward0>) 829 LOSS DIFF: tensor(5.3446, grad_fn=<NllLossBackward0>) tensor(5.2735, grad_fn=<NllLossBackward0>) 830 LOSS DIFF: tensor(5.3156, grad_fn=<NllLossBackward0>) tensor(5.2965, grad_fn=<NllLossBackward0>) 831 LOSS DIFF: tensor(5.3263, grad_fn=<NllLossBackward0>) tensor(5.2847, grad_fn=<NllLossBackward0>) 832 LOSS DIFF: tensor(5.2776, grad_fn=<NllLossBackward0>) tensor(5.2448, grad_fn=<NllLossBackward0>) 833 LOSS DIFF: tensor(5.3394, grad_fn=<NllLossBackward0>) tensor(5.2776, grad_fn=<NllLossBackward0>) 834 LOSS DIFF: tensor(5.3633, grad_fn=<NllLossBackward0>) tensor(5.2746, grad_fn=<NllLossBackward0>) 835 LOSS DIFF: tensor(5.2726, grad_fn=<NllLossBackward0>) tensor(5.2409, grad_fn=<NllLossBackward0>) 836 LOSS DIFF: tensor(5.2986, grad_fn=<NllLossBackward0>) tensor(5.2726, grad_fn=<NllLossBackward0>) 837 LOSS DIFF: tensor(5.2534, grad_fn=<NllLossBackward0>) tensor(5.1774, grad_fn=<NllLossBackward0>) 838 LOSS DIFF: tensor(5.3111, grad_fn=<NllLossBackward0>) tensor(5.2534, grad_fn=<NllLossBackward0>) 839 LOSS DIFF: tensor(5.3127, grad_fn=<NllLossBackward0>) tensor(5.3111, grad_fn=<NllLossBackward0>) 840 LOSS DIFF: tensor(5.4215, grad_fn=<NllLossBackward0>) tensor(5.2348, grad_fn=<NllLossBackward0>) 841 LOSS DIFF: tensor(5.2974, grad_fn=<NllLossBackward0>) tensor(5.1407, grad_fn=<NllLossBackward0>) 842 LOSS DIFF: tensor(5.3341, 
grad_fn=<NllLossBackward0>) tensor(5.2498, grad_fn=<NllLossBackward0>) 843 LOSS DIFF: tensor(5.3087, grad_fn=<NllLossBackward0>) tensor(5.2148, grad_fn=<NllLossBackward0>) 844 LOSS DIFF: tensor(5.2507, grad_fn=<NllLossBackward0>) tensor(5.1230, grad_fn=<NllLossBackward0>) 1700 tensor(5.3550, grad_fn=<NllLossBackward0>) 845 LOSS DIFF: tensor(5.3550, grad_fn=<NllLossBackward0>) tensor(5.2507, grad_fn=<NllLossBackward0>) 846 LOSS DIFF: tensor(5.3766, grad_fn=<NllLossBackward0>) tensor(5.3550, grad_fn=<NllLossBackward0>) 847 LOSS DIFF: tensor(5.2487, grad_fn=<NllLossBackward0>) tensor(5.2300, grad_fn=<NllLossBackward0>) 848 LOSS DIFF: tensor(5.3142, grad_fn=<NllLossBackward0>) tensor(5.2487, grad_fn=<NllLossBackward0>) 849 LOSS DIFF: tensor(5.3734, grad_fn=<NllLossBackward0>) tensor(5.2986, grad_fn=<NllLossBackward0>) 850 LOSS DIFF: tensor(5.2452, grad_fn=<NllLossBackward0>) tensor(5.1219, grad_fn=<NllLossBackward0>) 851 LOSS DIFF: tensor(5.2957, grad_fn=<NllLossBackward0>) tensor(5.2452, grad_fn=<NllLossBackward0>) 852 LOSS DIFF: tensor(5.2852, grad_fn=<NllLossBackward0>) tensor(5.2758, grad_fn=<NllLossBackward0>) 853 LOSS DIFF: tensor(5.3498, grad_fn=<NllLossBackward0>) tensor(5.2852, grad_fn=<NllLossBackward0>) 854 LOSS DIFF: tensor(5.4008, grad_fn=<NllLossBackward0>) tensor(5.3498, grad_fn=<NllLossBackward0>) 855 LOSS DIFF: tensor(5.2165, grad_fn=<NllLossBackward0>) tensor(5.1128, grad_fn=<NllLossBackward0>) 856 LOSS DIFF: tensor(5.2850, grad_fn=<NllLossBackward0>) tensor(5.2165, grad_fn=<NllLossBackward0>) 857 LOSS DIFF: tensor(5.3881, grad_fn=<NllLossBackward0>) tensor(5.2850, grad_fn=<NllLossBackward0>) 858 LOSS DIFF: tensor(5.2249, grad_fn=<NllLossBackward0>) tensor(5.2228, grad_fn=<NllLossBackward0>) 859 LOSS DIFF: tensor(5.2559, grad_fn=<NllLossBackward0>) tensor(5.2249, grad_fn=<NllLossBackward0>) 860 LOSS DIFF: tensor(5.2867, grad_fn=<NllLossBackward0>) tensor(5.2559, grad_fn=<NllLossBackward0>) 861 LOSS DIFF: tensor(5.4387, grad_fn=<NllLossBackward0>) 
tensor(5.2314, grad_fn=<NllLossBackward0>) 862 LOSS DIFF: tensor(5.2867, grad_fn=<NllLossBackward0>) tensor(5.2233, grad_fn=<NllLossBackward0>) 863 LOSS DIFF: tensor(5.3220, grad_fn=<NllLossBackward0>) tensor(5.2867, grad_fn=<NllLossBackward0>) 864 LOSS DIFF: tensor(5.2581, grad_fn=<NllLossBackward0>) tensor(5.2269, grad_fn=<NllLossBackward0>) 865 LOSS DIFF: tensor(5.2703, grad_fn=<NllLossBackward0>) tensor(5.2581, grad_fn=<NllLossBackward0>) 866 LOSS DIFF: tensor(5.2300, grad_fn=<NllLossBackward0>) tensor(5.1481, grad_fn=<NllLossBackward0>) 867 LOSS DIFF: tensor(5.2460, grad_fn=<NllLossBackward0>) tensor(5.2300, grad_fn=<NllLossBackward0>) 868 LOSS DIFF: tensor(5.3260, grad_fn=<NllLossBackward0>) tensor(5.2460, grad_fn=<NllLossBackward0>) 869 LOSS DIFF: tensor(5.2582, grad_fn=<NllLossBackward0>) tensor(5.1454, grad_fn=<NllLossBackward0>) 870 LOSS DIFF: tensor(5.3153, grad_fn=<NllLossBackward0>) tensor(5.2582, grad_fn=<NllLossBackward0>) 871 LOSS DIFF: tensor(5.2967, grad_fn=<NllLossBackward0>) tensor(5.0807, grad_fn=<NllLossBackward0>) 872 LOSS DIFF: tensor(5.3636, grad_fn=<NllLossBackward0>) tensor(5.2188, grad_fn=<NllLossBackward0>) 873 LOSS DIFF: tensor(5.3807, grad_fn=<NllLossBackward0>) tensor(5.3636, grad_fn=<NllLossBackward0>) 874 LOSS DIFF: tensor(5.3318, grad_fn=<NllLossBackward0>) tensor(5.2364, grad_fn=<NllLossBackward0>) 875 LOSS DIFF: tensor(5.3220, grad_fn=<NllLossBackward0>) tensor(5.2170, grad_fn=<NllLossBackward0>) 876 LOSS DIFF: tensor(5.2753, grad_fn=<NllLossBackward0>) tensor(5.1677, grad_fn=<NllLossBackward0>) 877 LOSS DIFF: tensor(5.3142, grad_fn=<NllLossBackward0>) tensor(5.2753, grad_fn=<NllLossBackward0>) 878 LOSS DIFF: tensor(5.3142, grad_fn=<NllLossBackward0>) tensor(5.1974, grad_fn=<NllLossBackward0>) 879 LOSS DIFF: tensor(5.1746, grad_fn=<NllLossBackward0>) tensor(5.0885, grad_fn=<NllLossBackward0>) 880 LOSS DIFF: tensor(5.3789, grad_fn=<NllLossBackward0>) tensor(5.1746, grad_fn=<NllLossBackward0>) 881 LOSS DIFF: tensor(5.3057, 
grad_fn=<NllLossBackward0>) tensor(5.2196, grad_fn=<NllLossBackward0>) 882 LOSS DIFF: tensor(5.2886, grad_fn=<NllLossBackward0>) tensor(5.2158, grad_fn=<NllLossBackward0>) 883 LOSS DIFF: tensor(5.3288, grad_fn=<NllLossBackward0>) tensor(5.2491, grad_fn=<NllLossBackward0>) 884 LOSS DIFF: tensor(5.4903, grad_fn=<NllLossBackward0>) tensor(5.3288, grad_fn=<NllLossBackward0>) 885 LOSS DIFF: tensor(5.4034, grad_fn=<NllLossBackward0>) tensor(5.2798, grad_fn=<NllLossBackward0>) 886 LOSS DIFF: tensor(5.3601, grad_fn=<NllLossBackward0>) tensor(5.1771, grad_fn=<NllLossBackward0>) 887 LOSS DIFF: tensor(5.2809, grad_fn=<NllLossBackward0>) tensor(5.1809, grad_fn=<NllLossBackward0>) 888 LOSS DIFF: tensor(5.3620, grad_fn=<NllLossBackward0>) tensor(5.2748, grad_fn=<NllLossBackward0>) 889 LOSS DIFF: tensor(5.3855, grad_fn=<NllLossBackward0>) tensor(5.2573, grad_fn=<NllLossBackward0>) 890 LOSS DIFF: tensor(5.3124, grad_fn=<NllLossBackward0>) tensor(5.2379, grad_fn=<NllLossBackward0>) 891 LOSS DIFF: tensor(5.3192, grad_fn=<NllLossBackward0>) tensor(5.3124, grad_fn=<NllLossBackward0>) 892 LOSS DIFF: tensor(5.3423, grad_fn=<NllLossBackward0>) tensor(5.3192, grad_fn=<NllLossBackward0>) 893 LOSS DIFF: tensor(5.4086, grad_fn=<NllLossBackward0>) tensor(5.1976, grad_fn=<NllLossBackward0>) 894 LOSS DIFF: tensor(5.3156, grad_fn=<NllLossBackward0>) tensor(5.2619, grad_fn=<NllLossBackward0>) 895 LOSS DIFF: tensor(5.3277, grad_fn=<NllLossBackward0>) tensor(5.3156, grad_fn=<NllLossBackward0>) 896 LOSS DIFF: tensor(5.2352, grad_fn=<NllLossBackward0>) tensor(5.2142, grad_fn=<NllLossBackward0>) 897 LOSS DIFF: tensor(5.3471, grad_fn=<NllLossBackward0>) tensor(5.2059, grad_fn=<NllLossBackward0>) 898 LOSS DIFF: tensor(5.2658, grad_fn=<NllLossBackward0>) tensor(5.1801, grad_fn=<NllLossBackward0>) 1800 tensor(5.4171, grad_fn=<NllLossBackward0>) 899 LOSS DIFF: tensor(5.4171, grad_fn=<NllLossBackward0>) tensor(5.2658, grad_fn=<NllLossBackward0>) 900 LOSS DIFF: tensor(5.3919, grad_fn=<NllLossBackward0>) 
tensor(5.2872, grad_fn=<NllLossBackward0>) 901 LOSS DIFF: tensor(5.2667, grad_fn=<NllLossBackward0>) tensor(5.1940, grad_fn=<NllLossBackward0>) 902 LOSS DIFF: tensor(5.3631, grad_fn=<NllLossBackward0>) tensor(5.2667, grad_fn=<NllLossBackward0>) 903 LOSS DIFF: tensor(5.3693, grad_fn=<NllLossBackward0>) tensor(5.2566, grad_fn=<NllLossBackward0>) 904 LOSS DIFF: tensor(5.3239, grad_fn=<NllLossBackward0>) tensor(5.2152, grad_fn=<NllLossBackward0>) 905 LOSS DIFF: tensor(5.3641, grad_fn=<NllLossBackward0>) tensor(5.3239, grad_fn=<NllLossBackward0>) 906 LOSS DIFF: tensor(5.2443, grad_fn=<NllLossBackward0>) tensor(5.1951, grad_fn=<NllLossBackward0>) 907 LOSS DIFF: tensor(5.4277, grad_fn=<NllLossBackward0>) tensor(5.1634, grad_fn=<NllLossBackward0>) 908 LOSS DIFF: tensor(5.2730, grad_fn=<NllLossBackward0>) tensor(5.0604, grad_fn=<NllLossBackward0>) 909 LOSS DIFF: tensor(5.2867, grad_fn=<NllLossBackward0>) tensor(5.2566, grad_fn=<NllLossBackward0>) 910 LOSS DIFF: tensor(5.4127, grad_fn=<NllLossBackward0>) tensor(5.2155, grad_fn=<NllLossBackward0>) 911 LOSS DIFF: tensor(5.3634, grad_fn=<NllLossBackward0>) tensor(5.3211, grad_fn=<NllLossBackward0>) 912 LOSS DIFF: tensor(5.2831, grad_fn=<NllLossBackward0>) tensor(5.2335, grad_fn=<NllLossBackward0>) 913 LOSS DIFF: tensor(5.2755, grad_fn=<NllLossBackward0>) tensor(5.2735, grad_fn=<NllLossBackward0>) 914 LOSS DIFF: tensor(5.2826, grad_fn=<NllLossBackward0>) tensor(5.2755, grad_fn=<NllLossBackward0>) 915 LOSS DIFF: tensor(5.3887, grad_fn=<NllLossBackward0>) tensor(5.0861, grad_fn=<NllLossBackward0>) 916 LOSS DIFF: tensor(5.3065, grad_fn=<NllLossBackward0>) tensor(5.2729, grad_fn=<NllLossBackward0>) 917 LOSS DIFF: tensor(5.2632, grad_fn=<NllLossBackward0>) tensor(5.1560, grad_fn=<NllLossBackward0>) 918 LOSS DIFF: tensor(5.2920, grad_fn=<NllLossBackward0>) tensor(5.1884, grad_fn=<NllLossBackward0>) 919 LOSS DIFF: tensor(5.3229, grad_fn=<NllLossBackward0>) tensor(5.2920, grad_fn=<NllLossBackward0>) 920 LOSS DIFF: tensor(5.2855, 
grad_fn=<NllLossBackward0>) tensor(5.1965, grad_fn=<NllLossBackward0>) 921 LOSS DIFF: tensor(5.3634, grad_fn=<NllLossBackward0>) tensor(5.2855, grad_fn=<NllLossBackward0>) 922 LOSS DIFF: tensor(5.3724, grad_fn=<NllLossBackward0>) tensor(5.0690, grad_fn=<NllLossBackward0>) 923 LOSS DIFF: tensor(5.2805, grad_fn=<NllLossBackward0>) tensor(5.2636, grad_fn=<NllLossBackward0>) 924 LOSS DIFF: tensor(5.2306, grad_fn=<NllLossBackward0>) tensor(5.0033, grad_fn=<NllLossBackward0>) 925 LOSS DIFF: tensor(5.2542, grad_fn=<NllLossBackward0>) tensor(5.2243, grad_fn=<NllLossBackward0>) 926 LOSS DIFF: tensor(5.3378, grad_fn=<NllLossBackward0>) tensor(5.2542, grad_fn=<NllLossBackward0>) 927 LOSS DIFF: tensor(5.2164, grad_fn=<NllLossBackward0>) tensor(5.1267, grad_fn=<NllLossBackward0>) 928 LOSS DIFF: tensor(5.3090, grad_fn=<NllLossBackward0>) tensor(5.2164, grad_fn=<NllLossBackward0>) 929 LOSS DIFF: tensor(5.3777, grad_fn=<NllLossBackward0>) tensor(5.3090, grad_fn=<NllLossBackward0>) 930 LOSS DIFF: tensor(5.2597, grad_fn=<NllLossBackward0>) tensor(5.2556, grad_fn=<NllLossBackward0>) 931 LOSS DIFF: tensor(5.4438, grad_fn=<NllLossBackward0>) tensor(5.2080, grad_fn=<NllLossBackward0>) 932 LOSS DIFF: tensor(5.2762, grad_fn=<NllLossBackward0>) tensor(5.2386, grad_fn=<NllLossBackward0>) 933 LOSS DIFF: tensor(5.3475, grad_fn=<NllLossBackward0>) tensor(5.1511, grad_fn=<NllLossBackward0>) 934 LOSS DIFF: tensor(5.3897, grad_fn=<NllLossBackward0>) tensor(5.3475, grad_fn=<NllLossBackward0>) 935 LOSS DIFF: tensor(5.2932, grad_fn=<NllLossBackward0>) tensor(5.1943, grad_fn=<NllLossBackward0>) 936 LOSS DIFF: tensor(5.3678, grad_fn=<NllLossBackward0>) tensor(5.2932, grad_fn=<NllLossBackward0>) 937 LOSS DIFF: tensor(5.3282, grad_fn=<NllLossBackward0>) tensor(5.2433, grad_fn=<NllLossBackward0>) 938 LOSS DIFF: tensor(5.3416, grad_fn=<NllLossBackward0>) tensor(5.3282, grad_fn=<NllLossBackward0>) 939 LOSS DIFF: tensor(5.2709, grad_fn=<NllLossBackward0>) tensor(5.1789, grad_fn=<NllLossBackward0>) 940 LOSS 
DIFF: tensor(5.3140, grad_fn=<NllLossBackward0>) tensor(5.2709, grad_fn=<NllLossBackward0>) 941 LOSS DIFF: tensor(5.2993, grad_fn=<NllLossBackward0>) tensor(5.2861, grad_fn=<NllLossBackward0>) 942 LOSS DIFF: tensor(5.1903, grad_fn=<NllLossBackward0>) tensor(5.1216, grad_fn=<NllLossBackward0>) 943 LOSS DIFF: tensor(5.2935, grad_fn=<NllLossBackward0>) tensor(5.1903, grad_fn=<NllLossBackward0>) 944 LOSS DIFF: tensor(5.2984, grad_fn=<NllLossBackward0>) tensor(5.2935, grad_fn=<NllLossBackward0>) 945 LOSS DIFF: tensor(5.3579, grad_fn=<NllLossBackward0>) tensor(5.2984, grad_fn=<NllLossBackward0>) 946 LOSS DIFF: tensor(5.2808, grad_fn=<NllLossBackward0>) tensor(5.1785, grad_fn=<NllLossBackward0>) 947 LOSS DIFF: tensor(5.2995, grad_fn=<NllLossBackward0>) tensor(5.2629, grad_fn=<NllLossBackward0>) 948 LOSS DIFF: tensor(5.3437, grad_fn=<NllLossBackward0>) tensor(5.2995, grad_fn=<NllLossBackward0>) 949 LOSS DIFF: tensor(5.3592, grad_fn=<NllLossBackward0>) tensor(5.3437, grad_fn=<NllLossBackward0>) 950 LOSS DIFF: tensor(5.4155, grad_fn=<NllLossBackward0>) tensor(5.3592, grad_fn=<NllLossBackward0>) 951 LOSS DIFF: tensor(5.3014, grad_fn=<NllLossBackward0>) tensor(5.2301, grad_fn=<NllLossBackward0>) 1900 tensor(5.3040, grad_fn=<NllLossBackward0>) 952 LOSS DIFF: tensor(5.3040, grad_fn=<NllLossBackward0>) tensor(5.2344, grad_fn=<NllLossBackward0>) 953 LOSS DIFF: tensor(5.2827, grad_fn=<NllLossBackward0>) tensor(5.2677, grad_fn=<NllLossBackward0>) 954 LOSS DIFF: tensor(5.3628, grad_fn=<NllLossBackward0>) tensor(5.2827, grad_fn=<NllLossBackward0>) 955 LOSS DIFF: tensor(5.2943, grad_fn=<NllLossBackward0>) tensor(5.2210, grad_fn=<NllLossBackward0>) 956 LOSS DIFF: tensor(5.1808, grad_fn=<NllLossBackward0>) tensor(5.1610, grad_fn=<NllLossBackward0>) 957 LOSS DIFF: tensor(5.3546, grad_fn=<NllLossBackward0>) tensor(5.1808, grad_fn=<NllLossBackward0>) 958 LOSS DIFF: tensor(5.1927, grad_fn=<NllLossBackward0>) tensor(5.1525, grad_fn=<NllLossBackward0>) 959 LOSS DIFF: tensor(5.3402, 
grad_fn=<NllLossBackward0>) tensor(5.1927, grad_fn=<NllLossBackward0>) 960 LOSS DIFF: tensor(5.3660, grad_fn=<NllLossBackward0>) tensor(5.2197, grad_fn=<NllLossBackward0>) 961 LOSS DIFF: tensor(5.3701, grad_fn=<NllLossBackward0>) tensor(5.3660, grad_fn=<NllLossBackward0>) 962 LOSS DIFF: tensor(5.1755, grad_fn=<NllLossBackward0>) tensor(5.1572, grad_fn=<NllLossBackward0>) 963 LOSS DIFF: tensor(5.2423, grad_fn=<NllLossBackward0>) tensor(5.1755, grad_fn=<NllLossBackward0>) 964 LOSS DIFF: tensor(5.4032, grad_fn=<NllLossBackward0>) tensor(5.2423, grad_fn=<NllLossBackward0>) 965 LOSS DIFF: tensor(5.3041, grad_fn=<NllLossBackward0>) tensor(5.1882, grad_fn=<NllLossBackward0>) 966 LOSS DIFF: tensor(5.3328, grad_fn=<NllLossBackward0>) tensor(5.3041, grad_fn=<NllLossBackward0>) 967 LOSS DIFF: tensor(5.1994, grad_fn=<NllLossBackward0>) tensor(5.1086, grad_fn=<NllLossBackward0>) 968 LOSS DIFF: tensor(5.2771, grad_fn=<NllLossBackward0>) tensor(5.1994, grad_fn=<NllLossBackward0>) 969 LOSS DIFF: tensor(5.3016, grad_fn=<NllLossBackward0>) tensor(5.2771, grad_fn=<NllLossBackward0>) 970 LOSS DIFF: tensor(5.3162, grad_fn=<NllLossBackward0>) tensor(5.3016, grad_fn=<NllLossBackward0>) 971 LOSS DIFF: tensor(5.3276, grad_fn=<NllLossBackward0>) tensor(5.2404, grad_fn=<NllLossBackward0>) 972 LOSS DIFF: tensor(5.3335, grad_fn=<NllLossBackward0>) tensor(5.3276, grad_fn=<NllLossBackward0>) 973 LOSS DIFF: tensor(5.3803, grad_fn=<NllLossBackward0>) tensor(5.2597, grad_fn=<NllLossBackward0>) 974 LOSS DIFF: tensor(5.2477, grad_fn=<NllLossBackward0>) tensor(5.1569, grad_fn=<NllLossBackward0>) 975 LOSS DIFF: tensor(5.3720, grad_fn=<NllLossBackward0>) tensor(5.2477, grad_fn=<NllLossBackward0>) 976 LOSS DIFF: tensor(5.3752, grad_fn=<NllLossBackward0>) tensor(5.3720, grad_fn=<NllLossBackward0>) 977 LOSS DIFF: tensor(5.2881, grad_fn=<NllLossBackward0>) tensor(5.2406, grad_fn=<NllLossBackward0>) 978 LOSS DIFF: tensor(5.4561, grad_fn=<NllLossBackward0>) tensor(5.2564, grad_fn=<NllLossBackward0>) 979 LOSS 
DIFF: tensor(5.3796, grad_fn=<NllLossBackward0>) tensor(5.3418, grad_fn=<NllLossBackward0>) 980 LOSS DIFF: tensor(5.2454, grad_fn=<NllLossBackward0>) tensor(5.2276, grad_fn=<NllLossBackward0>) 981 LOSS DIFF: tensor(5.3129, grad_fn=<NllLossBackward0>) tensor(5.2454, grad_fn=<NllLossBackward0>) 982 LOSS DIFF: tensor(5.3334, grad_fn=<NllLossBackward0>) tensor(5.3129, grad_fn=<NllLossBackward0>) 983 LOSS DIFF: tensor(5.3955, grad_fn=<NllLossBackward0>) tensor(5.3334, grad_fn=<NllLossBackward0>) 984 LOSS DIFF: tensor(5.4304, grad_fn=<NllLossBackward0>) tensor(5.2307, grad_fn=<NllLossBackward0>) 985 LOSS DIFF: tensor(5.3111, grad_fn=<NllLossBackward0>) tensor(5.1737, grad_fn=<NllLossBackward0>) 986 LOSS DIFF: tensor(5.3549, grad_fn=<NllLossBackward0>) tensor(5.3111, grad_fn=<NllLossBackward0>) 987 LOSS DIFF: tensor(5.3662, grad_fn=<NllLossBackward0>) tensor(5.2584, grad_fn=<NllLossBackward0>) 988 LOSS DIFF: tensor(5.3705, grad_fn=<NllLossBackward0>) tensor(5.1949, grad_fn=<NllLossBackward0>) 989 LOSS DIFF: tensor(5.2877, grad_fn=<NllLossBackward0>) tensor(5.2517, grad_fn=<NllLossBackward0>) 990 LOSS DIFF: tensor(5.2987, grad_fn=<NllLossBackward0>) tensor(5.2175, grad_fn=<NllLossBackward0>) 991 LOSS DIFF: tensor(5.3813, grad_fn=<NllLossBackward0>) tensor(5.1823, grad_fn=<NllLossBackward0>) 992 LOSS DIFF: tensor(5.3100, grad_fn=<NllLossBackward0>) tensor(5.2477, grad_fn=<NllLossBackward0>) 993 LOSS DIFF: tensor(5.3208, grad_fn=<NllLossBackward0>) tensor(5.1584, grad_fn=<NllLossBackward0>) 994 LOSS DIFF: tensor(5.3709, grad_fn=<NllLossBackward0>) tensor(5.3208, grad_fn=<NllLossBackward0>) 995 LOSS DIFF: tensor(5.2744, grad_fn=<NllLossBackward0>) tensor(5.1538, grad_fn=<NllLossBackward0>) 996 LOSS DIFF: tensor(5.2920, grad_fn=<NllLossBackward0>) tensor(5.2744, grad_fn=<NllLossBackward0>) 997 LOSS DIFF: tensor(5.3297, grad_fn=<NllLossBackward0>) tensor(5.2446, grad_fn=<NllLossBackward0>) 998 LOSS DIFF: tensor(5.3818, grad_fn=<NllLossBackward0>) tensor(5.3297, 
grad_fn=<NllLossBackward0>) 999 LOSS DIFF: tensor(5.2615, grad_fn=<NllLossBackward0>) tensor(5.1173, grad_fn=<NllLossBackward0>) 1000 LOSS DIFF: tensor(5.3420, grad_fn=<NllLossBackward0>) tensor(5.2615, grad_fn=<NllLossBackward0>)
import matplotlib.pyplot as plt

# Autograd-tracked tensors cannot be converted to NumPy directly, so each
# recorded loss is detached from its graph before the conversion.
loss_track2 = [recorded_loss.detach().numpy() for recorded_loss in loss_track]

# Draw the training-loss curve: one point per recorded loss value.
plt.plot(loss_track2)
plt.show()
# Persist the trained weights, then rebuild the model from the checkpoint so
# everything below runs against exactly what was written to disk.
torch.save(model.state_dict(), 'model.bin')

device = 'cpu'
model = SimpleBigramNeuralLanguageModel(vocab_size, embed_size).to(device)
# map_location keeps the checkpoint loadable on a machine that lacks the
# device the tensors were saved from (e.g. a CUDA-trained model on CPU).
model.load_state_dict(torch.load('model.bin', map_location=device))
model.eval()

# Sanity check: the ten highest-scoring continuations of the word "he".
ixs = torch.tensor(vocab.forward(['he'])).to(device)
with torch.no_grad():
    # Inference only: skip building the autograd graph.
    out = model(ixs)
top = torch.topk(out[0], 10)
top_indices = top.indices.tolist()
top_probs = top.values.tolist()
top_words = vocab.lookup_tokens(top_indices)
# Left as a bare expression so the notebook displays it as the cell result.
list(zip(top_words, top_indices, top_probs))
c:\PROGRAMY\Anaconda3\envs\scweet\lib\site-packages\torch\nn\modules\container.py:217: UserWarning: Implicit dimension choice for softmax has been deprecated. Change the call to include dim=X as an argument. input = module(input)
[('<unk>', 0, 0.1108938604593277), ('was', 12, 0.0792110487818718), ('had', 37, 0.07402306795120239), ('is', 8, 0.04529397189617157), ('has', 39, 0.03909718990325928), ('would', 48, 0.038855526596307755), ('said', 43, 0.022579118609428406), ('will', 27, 0.02008220925927162), ('went', 251, 0.013605386018753052), ('did', 151, 0.013007525354623795)]
def prediction(word: str) -> str:
    """Format the model's top-5 continuations of ``word`` as one output line.

    The word is mapped to its vocabulary index, passed through ``model``, and
    the five highest-scoring entries of the first output row are selected.
    The result is a space-separated list of ``token:score`` pairs that always
    ends with a nameless ``:score`` entry:

    * if ``<unk>`` is among the five, it is removed from its position and its
      score is appended as the nameless entry;
    * otherwise the last (lowest-scoring) candidate loses its token and keeps
      only its score.

    Args:
        word: The preceding word to condition on.

    Returns:
        A string such as ``"was:0.08 had:0.07 is:0.05 has:0.04 :0.11"``.
    """
    ixs = torch.tensor(vocab.forward([word])).to(device)
    with torch.no_grad():
        # Inference only: no autograd graph is needed for the scores.
        out = model(ixs)
    top = torch.topk(out[0], 5)
    candidates = list(
        zip(vocab.lookup_tokens(top.indices.tolist()), top.values.tolist())
    )

    # Locate <unk> up front instead of mutating the list while iterating it.
    unk_position = next(
        (i for i, (token, _) in enumerate(candidates) if token == '<unk>'),
        None,
    )
    if unk_position is None:
        # No <unk> among the candidates: the last one gives up its token.
        _, nameless_score = candidates.pop()
    else:
        _, nameless_score = candidates.pop(unk_position)
    candidates.append(('', nameless_score))

    return ' '.join(f'{token}:{score}' for token, score in candidates)
def create_outputs(folder_name):
    """Write one prediction line per input row to ``<folder_name>/out.tsv``.

    Reads ``<folder_name>/in.tsv.xz`` line by line, takes the last
    whitespace-separated token of the column at index 6 (presumably the left
    context of the gap -- confirm against the dataset layout), and writes the
    result of ``prediction`` for that token.

    Args:
        folder_name: Directory containing ``in.tsv.xz``; ``out.tsv`` is
            created (or overwritten) in the same directory.

    Raises:
        IndexError: If a row has fewer than seven tab-separated columns.
    """
    print(f'Creating outputs in {folder_name}')
    with lzma.open(f'{folder_name}/in.tsv.xz', mode='rt', encoding='utf-8') as fid, \
            open(f'{folder_name}/out.tsv', 'w', encoding='utf-8', newline='\n') as f:
        for line in fid:
            separated = line.split('\t')
            # Literal backslash-n sequences in the text act as word separators.
            context_words = separated[6].replace(r'\n', ' ').split()
            if context_words:
                # The training tokenizer lowercases its input, so the lookup
                # word must be lowercased too or it falls back to <unk>.
                prefix = context_words[-1].lower()
            else:
                # Empty context: condition on the start-of-line marker that
                # the training tokenizer emits, instead of raising IndexError.
                prefix = '<s>'
            output_line = prediction(prefix)
            f.write(output_line + '\n')
# Produce prediction files for both evaluation splits, dev first.
for split_dir in ('dev-0', 'test-A'):
    create_outputs(split_dir)
Creating outputs in dev-0 Creating outputs in test-A