challenging-america-word-ga.../nb_nn.ipynb
2023-04-28 00:39:34 +02:00

152 KiB
Raw Blame History

import torch
import lzma
from itertools import islice
import regex as re
import sys
from torchtext.vocab import build_vocab_from_iterator
from torch import nn
from torch.utils.data import IterableDataset
import itertools
from torch.utils.data import DataLoader
# torch.cuda.is_available()
# torch.cuda.device_count()
# torch.cuda.current_device()
# torch.cuda.device(0)
# torch.cuda.get_device_name(0)
def get_words_from_line(line):
    """Tokenize one raw text line into lowercase whitespace-separated words.

    Trailing whitespace is stripped, escaped-newline markers (one or more
    literal backslashes followed by the letter n) are turned into spaces,
    and every non-letter character that is directly followed by a space is
    replaced, together with that space, by a single space.

    Args:
        line: One line of raw text.

    Yields:
        The marker '<s>', then each lowercase token, then the marker '</s>'.
    """
    line = line.rstrip()
    # The earlier pattern r'\\\\n' required TWO literal backslashes before the
    # "n", so a singly-escaped newline marker was never replaced and stayed
    # glued to the neighbouring words. r'\\+n' accepts one or more backslashes,
    # which still covers the doubled form.
    line = re.sub(r'\\+n', ' ', line)
    line = re.sub(r'[^a-zA-Z] ', ' ', line)
    line = line.lower()
    yield '<s>'
    yield from line.split()
    yield '</s>'

def get_word_lines_from_file(file_name):
    """Lazily yield one token generator per line of an xz-compressed file.

    The file is opened in text mode as UTF-8 and stays open for as long as
    the returned generator is being consumed.

    Args:
        file_name: Path passed to ``lzma.open``.

    Yields:
        For every line, the generator returned by ``get_words_from_line``.
    """
    with lzma.open(file_name, mode="rt", encoding='utf8') as compressed:
        # map() is lazy, so lines are still read one at a time on demand.
        yield from map(get_words_from_line, compressed)

# Value handed to torchtext as ``max_tokens`` and reused later as the model's
# vocabulary dimension.
vocab_size = 10_000

# Build the vocabulary in one streaming pass over the training file.
vocab = build_vocab_from_iterator(
    iterator=get_word_lines_from_file("train/in.tsv.xz"),
    specials=['<unk>'],
    max_tokens=vocab_size,
)

# Tokens missing from the vocabulary resolve to the '<unk>' index.
vocab.set_default_index(vocab['<unk>'])
---------------------------------------------------------------------------
KeyboardInterrupt                         Traceback (most recent call last)
Cell In[80], line 18
     14       yield get_words_from_line(line)
     16 vocab_size = 10_000
---> 18 vocab = build_vocab_from_iterator(
     19     get_word_lines_from_file("train/in.tsv.xz"),
     20     max_tokens = vocab_size,
     21     specials = ['<unk>'])
     23 vocab.set_default_index(vocab['<unk>'])

File c:\PROGRAMY\Anaconda3\envs\scweet\lib\site-packages\torchtext\vocab\vocab_factory.py:99, in build_vocab_from_iterator(iterator, min_freq, specials, special_first, max_tokens)
     97 counter = Counter()
     98 for tokens in iterator:
---> 99     counter.update(tokens)
    101 specials = specials or []
    103 # First sort by descending frequency, then lexicographically

File c:\PROGRAMY\Anaconda3\envs\scweet\lib\collections\__init__.py:637, in Counter.update(self, iterable, **kwds)
    635             super(Counter, self).update(iterable) # fast path when counter is empty
    636     else:
--> 637         _count_elements(self, iterable)
    638 if kwds:
    639     self.update(kwds)

Cell In[80], line 4, in get_words_from_line(line)
      2 line = line.rstrip()
      3 line = re.sub(r'\\\\n', ' ', line)
----> 4 line = re.sub(r'[^a-zA-Z] ', ' ', line)
      5 line = line.lower()
      6 yield '<s>'

File c:\PROGRAMY\Anaconda3\envs\scweet\lib\site-packages\regex\regex.py:278, in sub(pattern, repl, string, count, flags, pos, endpos, concurrent, timeout, ignore_unused, **kwargs)
    272 """Return the string obtained by replacing the leftmost (or rightmost with a
    273 reverse pattern) non-overlapping occurrences of the pattern in string by the
    274 replacement repl. repl can be either a string or a callable; if a string,
    275 backslash escapes in it are processed; if a callable, it's passed the match
    276 object and must return a replacement string to be used."""
    277 pat = _compile(pattern, flags, ignore_unused, kwargs, True)
--> 278 return pat.sub(repl, string, count, pos, endpos, concurrent, timeout)

KeyboardInterrupt: 
def look_ahead_iterator(gen):
    """Yield consecutive ``(previous, current)`` pairs from an iterable.

    For the items ``a, b, c`` this yields ``(a, b)`` and ``(b, c)``. An
    iterable with fewer than two items yields nothing.

    Args:
        gen: Any iterable.

    Yields:
        2-tuples of neighbouring items, in order.
    """
    iterator = iter(gen)
    # Prime with the first item instead of using None as a "nothing seen yet"
    # marker; the marker approach silently dropped every pair whose first
    # element was itself None.
    try:
        prev = next(iterator)
    except StopIteration:
        return
    for item in iterator:
        yield (prev, item)
        prev = item

class Bigrams(IterableDataset):
    """Iterable dataset of consecutive token-index pairs read from a text file.

    Args:
        text_file: Path passed to ``get_word_lines_from_file`` on every
            iteration.
        vocabulary_size: Stored on the instance as ``vocabulary_size``.
        vocabulary: Object supporting ``vocabulary[token]`` and
            ``set_default_index``. When omitted, the module-level ``vocab``
            is used, which is what the constructor always did before this
            parameter existed.
    """

    def __init__(self, text_file, vocabulary_size, vocabulary=None):
        super().__init__()
        # Accept the vocabulary explicitly so the dataset does not silently
        # depend on whatever the global ``vocab`` happens to be.
        self.vocab = vocab if vocabulary is None else vocabulary
        self.vocab.set_default_index(self.vocab['<unk>'])
        self.vocabulary_size = vocabulary_size
        self.text_file = text_file

    def __iter__(self):
        """Return an iterator over ``(index, next_index)`` pairs.

        All lines of the file are flattened into one token stream, so pairs
        also span the '</s>' / '<s>' boundary between consecutive lines.
        """
        tokens = itertools.chain.from_iterable(
            get_word_lines_from_file(self.text_file))
        return look_ahead_iterator(self.vocab[token] for token in tokens)

# Bigram dataset streamed from the compressed training file.
train_dataset = Bigrams(text_file="train/in.tsv.xz", vocabulary_size=vocab_size)
# Peek at the first batch of 10 pairs to check the pipeline end to end.
next(iter(DataLoader(dataset=train_dataset, batch_size=10)))
[tensor([  33,    0,  226,   35,    0, 6421, 6420,  219, 5781,    1]),
 tensor([   0,  226,   35,    0, 6421, 6420,  219, 5781,    1,  113])]
embed_size = 100


class SimpleBigramNeuralLanguageModel(nn.Module):
    """Feed-forward bigram model mapping token indices to a distribution.

    Layer stack: Embedding -> Linear(1000) -> ReLU -> Linear(500) -> ReLU
    -> Linear(vocabulary_size) -> Softmax over the last dimension.

    Args:
        vocabulary_size: Number of embedding rows and size of the output
            distribution.
        embedding_size: Width of the embedding vectors.
    """

    def __init__(self, vocabulary_size, embedding_size):
        super().__init__()
        self.model = nn.Sequential(
            nn.Embedding(vocabulary_size, embedding_size),
            nn.Linear(embedding_size, 1000),
            nn.ReLU(),
            nn.Linear(1000, 500),
            nn.ReLU(),
            nn.Linear(500, vocabulary_size),
            # Explicit dim: the implicit choice is deprecated, emits a
            # warning, and normalises over dim 0 for 3-D outputs instead of
            # over the vocabulary axis.
            nn.Softmax(dim=-1),
        )

    def forward(self, x):
        """Return the softmax output of the layer stack for index tensor ``x``.

        The result has the shape of ``x`` with one extra trailing dimension
        of size ``vocabulary_size``; values along it sum to 1.
        """
        return self.model(x)

# Freshly initialised (untrained) model, used only for the check below.
model = SimpleBigramNeuralLanguageModel(
    vocabulary_size=vocab_size,
    embedding_size=embed_size,
)

# Make sure out-of-vocabulary lookups resolve to the '<unk>' index.
vocab.set_default_index(vocab['<unk>'])
# Run a single-token batch through the model.
ixs = torch.tensor(vocab.forward(['is']))
out = model(ixs)
# Model output at the index of 'is', given 'is' as the input token.
out[0][vocab['is']]
tensor(8.4503e-05, grad_fn=<SelectBackward0>)
# History of per-step loss values (detached from the autograd graph).
loss_track = []

device = 'cpu'
model = SimpleBigramNeuralLanguageModel(vocab_size, embed_size).to(device)
data = DataLoader(train_dataset, batch_size=6000)
optimizer = torch.optim.Adam(model.parameters())
criterion = torch.nn.NLLLoss()

# Sentinel "previous loss" so the very first step never counts as a rise.
last_loss = 1_000
# Number of steps whose loss was higher than the step before.
trigger_count = 0

model.train()
step = 0
for x, y in data:
    x = x.to(device)
    y = y.to(device)
    optimizer.zero_grad()
    ypredicted = model(x)
    # The model emits softmax probabilities, so take the log before NLLLoss.
    # NOTE(review): log of a probability that underflows to 0 is -inf;
    # consider a log-softmax output if that is ever observed.
    loss = criterion(torch.log(ypredicted), y)
    # Keep only a detached copy for logging and bookkeeping: storing the
    # attached tensor in loss_track / last_loss would keep a reference to
    # every step's autograd graph for the lifetime of the list.
    loss_value = loss.detach()
    if step % 100 == 0:
        print(step, loss_value)
    step += 1
    loss.backward()
    optimizer.step()

    if loss_value > last_loss:
        trigger_count += 1
        print(trigger_count, 'LOSS DIFF:', loss_value, last_loss)

    # Stop once the loss has risen step-over-step 1000 times in total.
    if trigger_count >= 1_000:
        break

    loss_track.append(loss_value)
    last_loss = loss_value
0 tensor(9.2249, grad_fn=<NllLossBackward0>)
1 LOSS DIFF: tensor(6.9568, grad_fn=<NllLossBackward0>) tensor(6.9539, grad_fn=<NllLossBackward0>)
2 LOSS DIFF: tensor(6.5283, grad_fn=<NllLossBackward0>) tensor(6.3437, grad_fn=<NllLossBackward0>)
3 LOSS DIFF: tensor(6.4010, grad_fn=<NllLossBackward0>) tensor(6.3773, grad_fn=<NllLossBackward0>)
4 LOSS DIFF: tensor(6.4818, grad_fn=<NllLossBackward0>) tensor(6.4010, grad_fn=<NllLossBackward0>)
5 LOSS DIFF: tensor(6.4520, grad_fn=<NllLossBackward0>) tensor(6.3898, grad_fn=<NllLossBackward0>)
6 LOSS DIFF: tensor(6.2989, grad_fn=<NllLossBackward0>) tensor(6.2184, grad_fn=<NllLossBackward0>)
7 LOSS DIFF: tensor(6.3109, grad_fn=<NllLossBackward0>) tensor(6.2989, grad_fn=<NllLossBackward0>)
8 LOSS DIFF: tensor(6.3028, grad_fn=<NllLossBackward0>) tensor(6.2805, grad_fn=<NllLossBackward0>)
9 LOSS DIFF: tensor(6.3590, grad_fn=<NllLossBackward0>) tensor(6.3028, grad_fn=<NllLossBackward0>)
10 LOSS DIFF: tensor(6.1484, grad_fn=<NllLossBackward0>) tensor(6.1278, grad_fn=<NllLossBackward0>)
11 LOSS DIFF: tensor(6.2458, grad_fn=<NllLossBackward0>) tensor(6.0779, grad_fn=<NllLossBackward0>)
12 LOSS DIFF: tensor(6.3209, grad_fn=<NllLossBackward0>) tensor(6.2458, grad_fn=<NllLossBackward0>)
13 LOSS DIFF: tensor(6.2801, grad_fn=<NllLossBackward0>) tensor(6.1436, grad_fn=<NllLossBackward0>)
14 LOSS DIFF: tensor(6.1245, grad_fn=<NllLossBackward0>) tensor(6.0657, grad_fn=<NllLossBackward0>)
15 LOSS DIFF: tensor(6.2682, grad_fn=<NllLossBackward0>) tensor(6.0906, grad_fn=<NllLossBackward0>)
16 LOSS DIFF: tensor(6.0394, grad_fn=<NllLossBackward0>) tensor(6.0062, grad_fn=<NllLossBackward0>)
17 LOSS DIFF: tensor(6.1070, grad_fn=<NllLossBackward0>) tensor(6.0394, grad_fn=<NllLossBackward0>)
18 LOSS DIFF: tensor(6.2271, grad_fn=<NllLossBackward0>) tensor(6.1070, grad_fn=<NllLossBackward0>)
19 LOSS DIFF: tensor(6.0964, grad_fn=<NllLossBackward0>) tensor(6.0577, grad_fn=<NllLossBackward0>)
20 LOSS DIFF: tensor(6.0909, grad_fn=<NllLossBackward0>) tensor(6.0436, grad_fn=<NllLossBackward0>)
21 LOSS DIFF: tensor(6.0210, grad_fn=<NllLossBackward0>) tensor(6.0016, grad_fn=<NllLossBackward0>)
22 LOSS DIFF: tensor(6.0296, grad_fn=<NllLossBackward0>) tensor(6.0210, grad_fn=<NllLossBackward0>)
23 LOSS DIFF: tensor(6.1812, grad_fn=<NllLossBackward0>) tensor(6.0296, grad_fn=<NllLossBackward0>)
24 LOSS DIFF: tensor(6.1665, grad_fn=<NllLossBackward0>) tensor(6.0736, grad_fn=<NllLossBackward0>)
25 LOSS DIFF: tensor(6.0107, grad_fn=<NllLossBackward0>) tensor(5.9340, grad_fn=<NllLossBackward0>)
26 LOSS DIFF: tensor(5.9806, grad_fn=<NllLossBackward0>) tensor(5.9473, grad_fn=<NllLossBackward0>)
27 LOSS DIFF: tensor(5.9364, grad_fn=<NllLossBackward0>) tensor(5.8515, grad_fn=<NllLossBackward0>)
28 LOSS DIFF: tensor(5.9202, grad_fn=<NllLossBackward0>) tensor(5.9180, grad_fn=<NllLossBackward0>)
29 LOSS DIFF: tensor(6.0357, grad_fn=<NllLossBackward0>) tensor(5.8964, grad_fn=<NllLossBackward0>)
30 LOSS DIFF: tensor(6.1189, grad_fn=<NllLossBackward0>) tensor(5.9309, grad_fn=<NllLossBackward0>)
31 LOSS DIFF: tensor(6.0280, grad_fn=<NllLossBackward0>) tensor(5.8488, grad_fn=<NllLossBackward0>)
32 LOSS DIFF: tensor(6.1555, grad_fn=<NllLossBackward0>) tensor(6.0280, grad_fn=<NllLossBackward0>)
33 LOSS DIFF: tensor(6.0389, grad_fn=<NllLossBackward0>) tensor(5.9000, grad_fn=<NllLossBackward0>)
34 LOSS DIFF: tensor(5.8367, grad_fn=<NllLossBackward0>) tensor(5.7437, grad_fn=<NllLossBackward0>)
35 LOSS DIFF: tensor(5.9835, grad_fn=<NllLossBackward0>) tensor(5.8367, grad_fn=<NllLossBackward0>)
36 LOSS DIFF: tensor(5.9613, grad_fn=<NllLossBackward0>) tensor(5.7643, grad_fn=<NllLossBackward0>)
37 LOSS DIFF: tensor(6.0189, grad_fn=<NllLossBackward0>) tensor(5.9613, grad_fn=<NllLossBackward0>)
38 LOSS DIFF: tensor(5.9064, grad_fn=<NllLossBackward0>) tensor(5.8300, grad_fn=<NllLossBackward0>)
39 LOSS DIFF: tensor(5.9395, grad_fn=<NllLossBackward0>) tensor(5.8984, grad_fn=<NllLossBackward0>)
40 LOSS DIFF: tensor(5.9919, grad_fn=<NllLossBackward0>) tensor(5.9395, grad_fn=<NllLossBackward0>)
41 LOSS DIFF: tensor(5.8834, grad_fn=<NllLossBackward0>) tensor(5.8792, grad_fn=<NllLossBackward0>)
42 LOSS DIFF: tensor(5.7971, grad_fn=<NllLossBackward0>) tensor(5.7641, grad_fn=<NllLossBackward0>)
43 LOSS DIFF: tensor(5.8632, grad_fn=<NllLossBackward0>) tensor(5.7971, grad_fn=<NllLossBackward0>)
44 LOSS DIFF: tensor(5.8988, grad_fn=<NllLossBackward0>) tensor(5.8632, grad_fn=<NllLossBackward0>)
45 LOSS DIFF: tensor(5.9258, grad_fn=<NllLossBackward0>) tensor(5.8670, grad_fn=<NllLossBackward0>)
100 tensor(5.8536, grad_fn=<NllLossBackward0>)
46 LOSS DIFF: tensor(5.8536, grad_fn=<NllLossBackward0>) tensor(5.8226, grad_fn=<NllLossBackward0>)
47 LOSS DIFF: tensor(5.8648, grad_fn=<NllLossBackward0>) tensor(5.8536, grad_fn=<NllLossBackward0>)
48 LOSS DIFF: tensor(6.0083, grad_fn=<NllLossBackward0>) tensor(5.8648, grad_fn=<NllLossBackward0>)
49 LOSS DIFF: tensor(5.8324, grad_fn=<NllLossBackward0>) tensor(5.7953, grad_fn=<NllLossBackward0>)
50 LOSS DIFF: tensor(5.9055, grad_fn=<NllLossBackward0>) tensor(5.8324, grad_fn=<NllLossBackward0>)
51 LOSS DIFF: tensor(5.9507, grad_fn=<NllLossBackward0>) tensor(5.7720, grad_fn=<NllLossBackward0>)
52 LOSS DIFF: tensor(5.8892, grad_fn=<NllLossBackward0>) tensor(5.7376, grad_fn=<NllLossBackward0>)
53 LOSS DIFF: tensor(5.8218, grad_fn=<NllLossBackward0>) tensor(5.6474, grad_fn=<NllLossBackward0>)
54 LOSS DIFF: tensor(5.8381, grad_fn=<NllLossBackward0>) tensor(5.8218, grad_fn=<NllLossBackward0>)
55 LOSS DIFF: tensor(5.9608, grad_fn=<NllLossBackward0>) tensor(5.8381, grad_fn=<NllLossBackward0>)
56 LOSS DIFF: tensor(5.9855, grad_fn=<NllLossBackward0>) tensor(5.9496, grad_fn=<NllLossBackward0>)
57 LOSS DIFF: tensor(5.9235, grad_fn=<NllLossBackward0>) tensor(5.7299, grad_fn=<NllLossBackward0>)
58 LOSS DIFF: tensor(5.9411, grad_fn=<NllLossBackward0>) tensor(5.7029, grad_fn=<NllLossBackward0>)
59 LOSS DIFF: tensor(5.8516, grad_fn=<NllLossBackward0>) tensor(5.7566, grad_fn=<NllLossBackward0>)
60 LOSS DIFF: tensor(5.8243, grad_fn=<NllLossBackward0>) tensor(5.6658, grad_fn=<NllLossBackward0>)
61 LOSS DIFF: tensor(5.8496, grad_fn=<NllLossBackward0>) tensor(5.7968, grad_fn=<NllLossBackward0>)
62 LOSS DIFF: tensor(5.7651, grad_fn=<NllLossBackward0>) tensor(5.6680, grad_fn=<NllLossBackward0>)
63 LOSS DIFF: tensor(5.8133, grad_fn=<NllLossBackward0>) tensor(5.7651, grad_fn=<NllLossBackward0>)
64 LOSS DIFF: tensor(5.8699, grad_fn=<NllLossBackward0>) tensor(5.4926, grad_fn=<NllLossBackward0>)
65 LOSS DIFF: tensor(5.7983, grad_fn=<NllLossBackward0>) tensor(5.7203, grad_fn=<NllLossBackward0>)
66 LOSS DIFF: tensor(5.8621, grad_fn=<NllLossBackward0>) tensor(5.4968, grad_fn=<NllLossBackward0>)
67 LOSS DIFF: tensor(5.8183, grad_fn=<NllLossBackward0>) tensor(5.6879, grad_fn=<NllLossBackward0>)
68 LOSS DIFF: tensor(5.7855, grad_fn=<NllLossBackward0>) tensor(5.7245, grad_fn=<NllLossBackward0>)
69 LOSS DIFF: tensor(5.7728, grad_fn=<NllLossBackward0>) tensor(5.6484, grad_fn=<NllLossBackward0>)
70 LOSS DIFF: tensor(5.7415, grad_fn=<NllLossBackward0>) tensor(5.5859, grad_fn=<NllLossBackward0>)
71 LOSS DIFF: tensor(5.7307, grad_fn=<NllLossBackward0>) tensor(5.6239, grad_fn=<NllLossBackward0>)
72 LOSS DIFF: tensor(5.7754, grad_fn=<NllLossBackward0>) tensor(5.6253, grad_fn=<NllLossBackward0>)
73 LOSS DIFF: tensor(5.8733, grad_fn=<NllLossBackward0>) tensor(5.5662, grad_fn=<NllLossBackward0>)
74 LOSS DIFF: tensor(5.7932, grad_fn=<NllLossBackward0>) tensor(5.7448, grad_fn=<NllLossBackward0>)
75 LOSS DIFF: tensor(5.7643, grad_fn=<NllLossBackward0>) tensor(5.6964, grad_fn=<NllLossBackward0>)
76 LOSS DIFF: tensor(5.6395, grad_fn=<NllLossBackward0>) tensor(5.6045, grad_fn=<NllLossBackward0>)
77 LOSS DIFF: tensor(5.7189, grad_fn=<NllLossBackward0>) tensor(5.6395, grad_fn=<NllLossBackward0>)
78 LOSS DIFF: tensor(5.7524, grad_fn=<NllLossBackward0>) tensor(5.5841, grad_fn=<NllLossBackward0>)
79 LOSS DIFF: tensor(5.7829, grad_fn=<NllLossBackward0>) tensor(5.5593, grad_fn=<NllLossBackward0>)
80 LOSS DIFF: tensor(5.8024, grad_fn=<NllLossBackward0>) tensor(5.7829, grad_fn=<NllLossBackward0>)
81 LOSS DIFF: tensor(5.8275, grad_fn=<NllLossBackward0>) tensor(5.7907, grad_fn=<NllLossBackward0>)
82 LOSS DIFF: tensor(5.6191, grad_fn=<NllLossBackward0>) tensor(5.5317, grad_fn=<NllLossBackward0>)
83 LOSS DIFF: tensor(5.7328, grad_fn=<NllLossBackward0>) tensor(5.6191, grad_fn=<NllLossBackward0>)
84 LOSS DIFF: tensor(5.7513, grad_fn=<NllLossBackward0>) tensor(5.6999, grad_fn=<NllLossBackward0>)
85 LOSS DIFF: tensor(5.7847, grad_fn=<NllLossBackward0>) tensor(5.7513, grad_fn=<NllLossBackward0>)
86 LOSS DIFF: tensor(5.7548, grad_fn=<NllLossBackward0>) tensor(5.6437, grad_fn=<NllLossBackward0>)
87 LOSS DIFF: tensor(5.7529, grad_fn=<NllLossBackward0>) tensor(5.7198, grad_fn=<NllLossBackward0>)
88 LOSS DIFF: tensor(5.7664, grad_fn=<NllLossBackward0>) tensor(5.5831, grad_fn=<NllLossBackward0>)
89 LOSS DIFF: tensor(5.7668, grad_fn=<NllLossBackward0>) tensor(5.6415, grad_fn=<NllLossBackward0>)
90 LOSS DIFF: tensor(5.7174, grad_fn=<NllLossBackward0>) tensor(5.6232, grad_fn=<NllLossBackward0>)
91 LOSS DIFF: tensor(5.7451, grad_fn=<NllLossBackward0>) tensor(5.6730, grad_fn=<NllLossBackward0>)
92 LOSS DIFF: tensor(5.7578, grad_fn=<NllLossBackward0>) tensor(5.7451, grad_fn=<NllLossBackward0>)
93 LOSS DIFF: tensor(5.6858, grad_fn=<NllLossBackward0>) tensor(5.4322, grad_fn=<NllLossBackward0>)
94 LOSS DIFF: tensor(5.7738, grad_fn=<NllLossBackward0>) tensor(5.6858, grad_fn=<NllLossBackward0>)
200 tensor(5.7337, grad_fn=<NllLossBackward0>)
95 LOSS DIFF: tensor(5.7337, grad_fn=<NllLossBackward0>) tensor(5.6356, grad_fn=<NllLossBackward0>)
96 LOSS DIFF: tensor(5.6635, grad_fn=<NllLossBackward0>) tensor(5.5954, grad_fn=<NllLossBackward0>)
97 LOSS DIFF: tensor(5.6635, grad_fn=<NllLossBackward0>) tensor(5.6516, grad_fn=<NllLossBackward0>)
98 LOSS DIFF: tensor(5.8410, grad_fn=<NllLossBackward0>) tensor(5.6141, grad_fn=<NllLossBackward0>)
99 LOSS DIFF: tensor(5.7671, grad_fn=<NllLossBackward0>) tensor(5.6264, grad_fn=<NllLossBackward0>)
100 LOSS DIFF: tensor(5.6642, grad_fn=<NllLossBackward0>) tensor(5.6263, grad_fn=<NllLossBackward0>)
101 LOSS DIFF: tensor(5.7031, grad_fn=<NllLossBackward0>) tensor(5.6022, grad_fn=<NllLossBackward0>)
102 LOSS DIFF: tensor(5.7371, grad_fn=<NllLossBackward0>) tensor(5.7031, grad_fn=<NllLossBackward0>)
103 LOSS DIFF: tensor(5.6638, grad_fn=<NllLossBackward0>) tensor(5.6220, grad_fn=<NllLossBackward0>)
104 LOSS DIFF: tensor(5.6687, grad_fn=<NllLossBackward0>) tensor(5.6638, grad_fn=<NllLossBackward0>)
105 LOSS DIFF: tensor(5.7376, grad_fn=<NllLossBackward0>) tensor(5.6687, grad_fn=<NllLossBackward0>)
106 LOSS DIFF: tensor(5.7511, grad_fn=<NllLossBackward0>) tensor(5.7249, grad_fn=<NllLossBackward0>)
107 LOSS DIFF: tensor(5.6811, grad_fn=<NllLossBackward0>) tensor(5.6714, grad_fn=<NllLossBackward0>)
108 LOSS DIFF: tensor(5.7101, grad_fn=<NllLossBackward0>) tensor(5.5892, grad_fn=<NllLossBackward0>)
109 LOSS DIFF: tensor(5.6188, grad_fn=<NllLossBackward0>) tensor(5.5320, grad_fn=<NllLossBackward0>)
110 LOSS DIFF: tensor(5.6656, grad_fn=<NllLossBackward0>) tensor(5.6188, grad_fn=<NllLossBackward0>)
111 LOSS DIFF: tensor(5.6711, grad_fn=<NllLossBackward0>) tensor(5.5220, grad_fn=<NllLossBackward0>)
112 LOSS DIFF: tensor(5.7719, grad_fn=<NllLossBackward0>) tensor(5.6711, grad_fn=<NllLossBackward0>)
113 LOSS DIFF: tensor(5.7275, grad_fn=<NllLossBackward0>) tensor(5.6023, grad_fn=<NllLossBackward0>)
114 LOSS DIFF: tensor(5.7216, grad_fn=<NllLossBackward0>) tensor(5.6046, grad_fn=<NllLossBackward0>)
115 LOSS DIFF: tensor(5.6189, grad_fn=<NllLossBackward0>) tensor(5.5715, grad_fn=<NllLossBackward0>)
116 LOSS DIFF: tensor(5.6879, grad_fn=<NllLossBackward0>) tensor(5.6189, grad_fn=<NllLossBackward0>)
117 LOSS DIFF: tensor(5.7076, grad_fn=<NllLossBackward0>) tensor(5.6879, grad_fn=<NllLossBackward0>)
118 LOSS DIFF: tensor(5.6123, grad_fn=<NllLossBackward0>) tensor(5.5496, grad_fn=<NllLossBackward0>)
119 LOSS DIFF: tensor(5.6219, grad_fn=<NllLossBackward0>) tensor(5.6123, grad_fn=<NllLossBackward0>)
120 LOSS DIFF: tensor(5.6567, grad_fn=<NllLossBackward0>) tensor(5.4889, grad_fn=<NllLossBackward0>)
121 LOSS DIFF: tensor(5.7262, grad_fn=<NllLossBackward0>) tensor(5.6334, grad_fn=<NllLossBackward0>)
122 LOSS DIFF: tensor(5.7325, grad_fn=<NllLossBackward0>) tensor(5.6450, grad_fn=<NllLossBackward0>)
123 LOSS DIFF: tensor(5.7161, grad_fn=<NllLossBackward0>) tensor(5.5794, grad_fn=<NllLossBackward0>)
124 LOSS DIFF: tensor(5.5623, grad_fn=<NllLossBackward0>) tensor(5.5361, grad_fn=<NllLossBackward0>)
125 LOSS DIFF: tensor(5.5797, grad_fn=<NllLossBackward0>) tensor(5.5623, grad_fn=<NllLossBackward0>)
126 LOSS DIFF: tensor(5.6225, grad_fn=<NllLossBackward0>) tensor(5.5797, grad_fn=<NllLossBackward0>)
127 LOSS DIFF: tensor(5.5912, grad_fn=<NllLossBackward0>) tensor(5.5347, grad_fn=<NllLossBackward0>)
128 LOSS DIFF: tensor(5.6655, grad_fn=<NllLossBackward0>) tensor(5.5912, grad_fn=<NllLossBackward0>)
129 LOSS DIFF: tensor(5.6695, grad_fn=<NllLossBackward0>) tensor(5.6655, grad_fn=<NllLossBackward0>)
130 LOSS DIFF: tensor(5.7027, grad_fn=<NllLossBackward0>) tensor(5.6695, grad_fn=<NllLossBackward0>)
131 LOSS DIFF: tensor(5.6836, grad_fn=<NllLossBackward0>) tensor(5.5821, grad_fn=<NllLossBackward0>)
132 LOSS DIFF: tensor(5.5875, grad_fn=<NllLossBackward0>) tensor(5.5289, grad_fn=<NllLossBackward0>)
133 LOSS DIFF: tensor(5.6111, grad_fn=<NllLossBackward0>) tensor(5.4911, grad_fn=<NllLossBackward0>)
134 LOSS DIFF: tensor(5.6462, grad_fn=<NllLossBackward0>) tensor(5.6111, grad_fn=<NllLossBackward0>)
135 LOSS DIFF: tensor(5.4761, grad_fn=<NllLossBackward0>) tensor(5.3862, grad_fn=<NllLossBackward0>)
136 LOSS DIFF: tensor(5.5751, grad_fn=<NllLossBackward0>) tensor(5.4761, grad_fn=<NllLossBackward0>)
137 LOSS DIFF: tensor(5.5107, grad_fn=<NllLossBackward0>) tensor(5.3580, grad_fn=<NllLossBackward0>)
138 LOSS DIFF: tensor(5.5294, grad_fn=<NllLossBackward0>) tensor(5.5032, grad_fn=<NllLossBackward0>)
139 LOSS DIFF: tensor(5.8044, grad_fn=<NllLossBackward0>) tensor(5.5294, grad_fn=<NllLossBackward0>)
140 LOSS DIFF: tensor(5.5610, grad_fn=<NllLossBackward0>) tensor(5.4624, grad_fn=<NllLossBackward0>)
141 LOSS DIFF: tensor(5.6199, grad_fn=<NllLossBackward0>) tensor(5.5610, grad_fn=<NllLossBackward0>)
142 LOSS DIFF: tensor(5.6073, grad_fn=<NllLossBackward0>) tensor(5.5645, grad_fn=<NllLossBackward0>)
143 LOSS DIFF: tensor(5.8155, grad_fn=<NllLossBackward0>) tensor(5.6073, grad_fn=<NllLossBackward0>)
144 LOSS DIFF: tensor(5.6119, grad_fn=<NllLossBackward0>) tensor(5.5148, grad_fn=<NllLossBackward0>)
145 LOSS DIFF: tensor(5.6557, grad_fn=<NllLossBackward0>) tensor(5.5193, grad_fn=<NllLossBackward0>)
300 tensor(5.5923, grad_fn=<NllLossBackward0>)
146 LOSS DIFF: tensor(5.6352, grad_fn=<NllLossBackward0>) tensor(5.5923, grad_fn=<NllLossBackward0>)
147 LOSS DIFF: tensor(5.6034, grad_fn=<NllLossBackward0>) tensor(5.4999, grad_fn=<NllLossBackward0>)
148 LOSS DIFF: tensor(5.6058, grad_fn=<NllLossBackward0>) tensor(5.6034, grad_fn=<NllLossBackward0>)
149 LOSS DIFF: tensor(5.6262, grad_fn=<NllLossBackward0>) tensor(5.5992, grad_fn=<NllLossBackward0>)
150 LOSS DIFF: tensor(5.6428, grad_fn=<NllLossBackward0>) tensor(5.5092, grad_fn=<NllLossBackward0>)
151 LOSS DIFF: tensor(5.6501, grad_fn=<NllLossBackward0>) tensor(5.5660, grad_fn=<NllLossBackward0>)
152 LOSS DIFF: tensor(5.6203, grad_fn=<NllLossBackward0>) tensor(5.5295, grad_fn=<NllLossBackward0>)
153 LOSS DIFF: tensor(5.6420, grad_fn=<NllLossBackward0>) tensor(5.6203, grad_fn=<NllLossBackward0>)
154 LOSS DIFF: tensor(5.7322, grad_fn=<NllLossBackward0>) tensor(5.4864, grad_fn=<NllLossBackward0>)
155 LOSS DIFF: tensor(5.6117, grad_fn=<NllLossBackward0>) tensor(5.4803, grad_fn=<NllLossBackward0>)
156 LOSS DIFF: tensor(5.5395, grad_fn=<NllLossBackward0>) tensor(5.4970, grad_fn=<NllLossBackward0>)
157 LOSS DIFF: tensor(5.6619, grad_fn=<NllLossBackward0>) tensor(5.5060, grad_fn=<NllLossBackward0>)
158 LOSS DIFF: tensor(5.6368, grad_fn=<NllLossBackward0>) tensor(5.5258, grad_fn=<NllLossBackward0>)
159 LOSS DIFF: tensor(5.5889, grad_fn=<NllLossBackward0>) tensor(5.5490, grad_fn=<NllLossBackward0>)
160 LOSS DIFF: tensor(5.6312, grad_fn=<NllLossBackward0>) tensor(5.5038, grad_fn=<NllLossBackward0>)
161 LOSS DIFF: tensor(5.5349, grad_fn=<NllLossBackward0>) tensor(5.5015, grad_fn=<NllLossBackward0>)
162 LOSS DIFF: tensor(5.6371, grad_fn=<NllLossBackward0>) tensor(5.5349, grad_fn=<NllLossBackward0>)
163 LOSS DIFF: tensor(5.6482, grad_fn=<NllLossBackward0>) tensor(5.6371, grad_fn=<NllLossBackward0>)
164 LOSS DIFF: tensor(5.6638, grad_fn=<NllLossBackward0>) tensor(5.6482, grad_fn=<NllLossBackward0>)
165 LOSS DIFF: tensor(5.6737, grad_fn=<NllLossBackward0>) tensor(5.4801, grad_fn=<NllLossBackward0>)
166 LOSS DIFF: tensor(5.4878, grad_fn=<NllLossBackward0>) tensor(5.4866, grad_fn=<NllLossBackward0>)
167 LOSS DIFF: tensor(5.6624, grad_fn=<NllLossBackward0>) tensor(5.4878, grad_fn=<NllLossBackward0>)
168 LOSS DIFF: tensor(5.5738, grad_fn=<NllLossBackward0>) tensor(5.5648, grad_fn=<NllLossBackward0>)
169 LOSS DIFF: tensor(5.5267, grad_fn=<NllLossBackward0>) tensor(5.4309, grad_fn=<NllLossBackward0>)
170 LOSS DIFF: tensor(5.6041, grad_fn=<NllLossBackward0>) tensor(5.3970, grad_fn=<NllLossBackward0>)
171 LOSS DIFF: tensor(5.6640, grad_fn=<NllLossBackward0>) tensor(5.4885, grad_fn=<NllLossBackward0>)
172 LOSS DIFF: tensor(5.6136, grad_fn=<NllLossBackward0>) tensor(5.4977, grad_fn=<NllLossBackward0>)
173 LOSS DIFF: tensor(5.6567, grad_fn=<NllLossBackward0>) tensor(5.5459, grad_fn=<NllLossBackward0>)
174 LOSS DIFF: tensor(5.5721, grad_fn=<NllLossBackward0>) tensor(5.4921, grad_fn=<NllLossBackward0>)
175 LOSS DIFF: tensor(5.5685, grad_fn=<NllLossBackward0>) tensor(5.5363, grad_fn=<NllLossBackward0>)
176 LOSS DIFF: tensor(5.5438, grad_fn=<NllLossBackward0>) tensor(5.4754, grad_fn=<NllLossBackward0>)
177 LOSS DIFF: tensor(5.6087, grad_fn=<NllLossBackward0>) tensor(5.5345, grad_fn=<NllLossBackward0>)
178 LOSS DIFF: tensor(5.5624, grad_fn=<NllLossBackward0>) tensor(5.3589, grad_fn=<NllLossBackward0>)
179 LOSS DIFF: tensor(5.6284, grad_fn=<NllLossBackward0>) tensor(5.4887, grad_fn=<NllLossBackward0>)
180 LOSS DIFF: tensor(5.4859, grad_fn=<NllLossBackward0>) tensor(5.4453, grad_fn=<NllLossBackward0>)
181 LOSS DIFF: tensor(5.4949, grad_fn=<NllLossBackward0>) tensor(5.4859, grad_fn=<NllLossBackward0>)
182 LOSS DIFF: tensor(5.5938, grad_fn=<NllLossBackward0>) tensor(5.4949, grad_fn=<NllLossBackward0>)
183 LOSS DIFF: tensor(5.5222, grad_fn=<NllLossBackward0>) tensor(5.4890, grad_fn=<NllLossBackward0>)
184 LOSS DIFF: tensor(5.6673, grad_fn=<NllLossBackward0>) tensor(5.5222, grad_fn=<NllLossBackward0>)
185 LOSS DIFF: tensor(5.6337, grad_fn=<NllLossBackward0>) tensor(5.5833, grad_fn=<NllLossBackward0>)
186 LOSS DIFF: tensor(5.7171, grad_fn=<NllLossBackward0>) tensor(5.6337, grad_fn=<NllLossBackward0>)
187 LOSS DIFF: tensor(5.5721, grad_fn=<NllLossBackward0>) tensor(5.4927, grad_fn=<NllLossBackward0>)
188 LOSS DIFF: tensor(5.5771, grad_fn=<NllLossBackward0>) tensor(5.5721, grad_fn=<NllLossBackward0>)
189 LOSS DIFF: tensor(5.6379, grad_fn=<NllLossBackward0>) tensor(5.5771, grad_fn=<NllLossBackward0>)
190 LOSS DIFF: tensor(5.6032, grad_fn=<NllLossBackward0>) tensor(5.4434, grad_fn=<NllLossBackward0>)
191 LOSS DIFF: tensor(5.5389, grad_fn=<NllLossBackward0>) tensor(5.3454, grad_fn=<NllLossBackward0>)
192 LOSS DIFF: tensor(5.6966, grad_fn=<NllLossBackward0>) tensor(5.4275, grad_fn=<NllLossBackward0>)
193 LOSS DIFF: tensor(5.3675, grad_fn=<NllLossBackward0>) tensor(5.3163, grad_fn=<NllLossBackward0>)
194 LOSS DIFF: tensor(5.4924, grad_fn=<NllLossBackward0>) tensor(5.3675, grad_fn=<NllLossBackward0>)
195 LOSS DIFF: tensor(5.5475, grad_fn=<NllLossBackward0>) tensor(5.4881, grad_fn=<NllLossBackward0>)
196 LOSS DIFF: tensor(5.6223, grad_fn=<NllLossBackward0>) tensor(5.3634, grad_fn=<NllLossBackward0>)
400 tensor(5.5316, grad_fn=<NllLossBackward0>)
197 LOSS DIFF: tensor(5.5377, grad_fn=<NllLossBackward0>) tensor(5.4920, grad_fn=<NllLossBackward0>)
198 LOSS DIFF: tensor(5.6185, grad_fn=<NllLossBackward0>) tensor(5.4576, grad_fn=<NllLossBackward0>)
199 LOSS DIFF: tensor(5.4915, grad_fn=<NllLossBackward0>) tensor(5.4151, grad_fn=<NllLossBackward0>)
200 LOSS DIFF: tensor(5.5837, grad_fn=<NllLossBackward0>) tensor(5.4915, grad_fn=<NllLossBackward0>)
201 LOSS DIFF: tensor(5.5875, grad_fn=<NllLossBackward0>) tensor(5.5837, grad_fn=<NllLossBackward0>)
202 LOSS DIFF: tensor(5.5331, grad_fn=<NllLossBackward0>) tensor(5.4873, grad_fn=<NllLossBackward0>)
203 LOSS DIFF: tensor(5.5345, grad_fn=<NllLossBackward0>) tensor(5.3964, grad_fn=<NllLossBackward0>)
204 LOSS DIFF: tensor(5.5764, grad_fn=<NllLossBackward0>) tensor(5.5345, grad_fn=<NllLossBackward0>)
205 LOSS DIFF: tensor(5.6070, grad_fn=<NllLossBackward0>) tensor(5.5764, grad_fn=<NllLossBackward0>)
206 LOSS DIFF: tensor(5.5005, grad_fn=<NllLossBackward0>) tensor(5.3572, grad_fn=<NllLossBackward0>)
207 LOSS DIFF: tensor(5.5520, grad_fn=<NllLossBackward0>) tensor(5.3860, grad_fn=<NllLossBackward0>)
208 LOSS DIFF: tensor(5.5800, grad_fn=<NllLossBackward0>) tensor(5.5520, grad_fn=<NllLossBackward0>)
209 LOSS DIFF: tensor(5.6465, grad_fn=<NllLossBackward0>) tensor(5.5469, grad_fn=<NllLossBackward0>)
210 LOSS DIFF: tensor(5.5691, grad_fn=<NllLossBackward0>) tensor(5.5241, grad_fn=<NllLossBackward0>)
211 LOSS DIFF: tensor(5.7237, grad_fn=<NllLossBackward0>) tensor(5.4803, grad_fn=<NllLossBackward0>)
212 LOSS DIFF: tensor(5.5532, grad_fn=<NllLossBackward0>) tensor(5.5012, grad_fn=<NllLossBackward0>)
213 LOSS DIFF: tensor(5.5011, grad_fn=<NllLossBackward0>) tensor(5.4712, grad_fn=<NllLossBackward0>)
214 LOSS DIFF: tensor(5.5370, grad_fn=<NllLossBackward0>) tensor(5.5011, grad_fn=<NllLossBackward0>)
215 LOSS DIFF: tensor(5.5579, grad_fn=<NllLossBackward0>) tensor(5.4126, grad_fn=<NllLossBackward0>)
216 LOSS DIFF: tensor(5.5109, grad_fn=<NllLossBackward0>) tensor(5.3875, grad_fn=<NllLossBackward0>)
217 LOSS DIFF: tensor(5.5403, grad_fn=<NllLossBackward0>) tensor(5.4174, grad_fn=<NllLossBackward0>)
218 LOSS DIFF: tensor(5.5404, grad_fn=<NllLossBackward0>) tensor(5.5403, grad_fn=<NllLossBackward0>)
219 LOSS DIFF: tensor(5.5593, grad_fn=<NllLossBackward0>) tensor(5.5404, grad_fn=<NllLossBackward0>)
220 LOSS DIFF: tensor(5.5262, grad_fn=<NllLossBackward0>) tensor(5.5250, grad_fn=<NllLossBackward0>)
221 LOSS DIFF: tensor(5.4107, grad_fn=<NllLossBackward0>) tensor(5.4092, grad_fn=<NllLossBackward0>)
222 LOSS DIFF: tensor(5.4920, grad_fn=<NllLossBackward0>) tensor(5.3499, grad_fn=<NllLossBackward0>)
223 LOSS DIFF: tensor(5.5064, grad_fn=<NllLossBackward0>) tensor(5.4920, grad_fn=<NllLossBackward0>)
224 LOSS DIFF: tensor(5.5648, grad_fn=<NllLossBackward0>) tensor(5.5064, grad_fn=<NllLossBackward0>)
225 LOSS DIFF: tensor(5.5107, grad_fn=<NllLossBackward0>) tensor(5.3439, grad_fn=<NllLossBackward0>)
226 LOSS DIFF: tensor(5.4968, grad_fn=<NllLossBackward0>) tensor(5.4720, grad_fn=<NllLossBackward0>)
227 LOSS DIFF: tensor(5.5473, grad_fn=<NllLossBackward0>) tensor(5.4854, grad_fn=<NllLossBackward0>)
228 LOSS DIFF: tensor(5.4800, grad_fn=<NllLossBackward0>) tensor(5.3762, grad_fn=<NllLossBackward0>)
229 LOSS DIFF: tensor(5.6251, grad_fn=<NllLossBackward0>) tensor(5.4800, grad_fn=<NllLossBackward0>)
230 LOSS DIFF: tensor(5.6237, grad_fn=<NllLossBackward0>) tensor(5.4478, grad_fn=<NllLossBackward0>)
231 LOSS DIFF: tensor(5.5439, grad_fn=<NllLossBackward0>) tensor(5.4108, grad_fn=<NllLossBackward0>)
232 LOSS DIFF: tensor(5.3186, grad_fn=<NllLossBackward0>) tensor(5.3012, grad_fn=<NllLossBackward0>)
233 LOSS DIFF: tensor(5.5069, grad_fn=<NllLossBackward0>) tensor(5.3186, grad_fn=<NllLossBackward0>)
234 LOSS DIFF: tensor(5.5190, grad_fn=<NllLossBackward0>) tensor(5.5043, grad_fn=<NllLossBackward0>)
235 LOSS DIFF: tensor(5.4706, grad_fn=<NllLossBackward0>) tensor(5.4560, grad_fn=<NllLossBackward0>)
236 LOSS DIFF: tensor(5.5252, grad_fn=<NllLossBackward0>) tensor(5.4706, grad_fn=<NllLossBackward0>)
237 LOSS DIFF: tensor(5.4765, grad_fn=<NllLossBackward0>) tensor(5.4103, grad_fn=<NllLossBackward0>)
238 LOSS DIFF: tensor(5.5218, grad_fn=<NllLossBackward0>) tensor(5.4765, grad_fn=<NllLossBackward0>)
239 LOSS DIFF: tensor(5.6028, grad_fn=<NllLossBackward0>) tensor(5.4596, grad_fn=<NllLossBackward0>)
240 LOSS DIFF: tensor(5.5504, grad_fn=<NllLossBackward0>) tensor(5.5021, grad_fn=<NllLossBackward0>)
241 LOSS DIFF: tensor(5.4777, grad_fn=<NllLossBackward0>) tensor(5.4091, grad_fn=<NllLossBackward0>)
242 LOSS DIFF: tensor(5.4404, grad_fn=<NllLossBackward0>) tensor(5.3918, grad_fn=<NllLossBackward0>)
243 LOSS DIFF: tensor(5.5580, grad_fn=<NllLossBackward0>) tensor(5.4404, grad_fn=<NllLossBackward0>)
244 LOSS DIFF: tensor(5.4812, grad_fn=<NllLossBackward0>) tensor(5.4398, grad_fn=<NllLossBackward0>)
500 tensor(5.5214, grad_fn=<NllLossBackward0>)
245 LOSS DIFF: tensor(5.5214, grad_fn=<NllLossBackward0>) tensor(5.4142, grad_fn=<NllLossBackward0>)
246 LOSS DIFF: tensor(5.6153, grad_fn=<NllLossBackward0>) tensor(5.5214, grad_fn=<NllLossBackward0>)
247 LOSS DIFF: tensor(5.4794, grad_fn=<NllLossBackward0>) tensor(5.4672, grad_fn=<NllLossBackward0>)
248 LOSS DIFF: tensor(5.5978, grad_fn=<NllLossBackward0>) tensor(5.4794, grad_fn=<NllLossBackward0>)
249 LOSS DIFF: tensor(5.4549, grad_fn=<NllLossBackward0>) tensor(5.3421, grad_fn=<NllLossBackward0>)
250 LOSS DIFF: tensor(5.4747, grad_fn=<NllLossBackward0>) tensor(5.4549, grad_fn=<NllLossBackward0>)
251 LOSS DIFF: tensor(5.5439, grad_fn=<NllLossBackward0>) tensor(5.3348, grad_fn=<NllLossBackward0>)
252 LOSS DIFF: tensor(5.5953, grad_fn=<NllLossBackward0>) tensor(5.5439, grad_fn=<NllLossBackward0>)
253 LOSS DIFF: tensor(5.5308, grad_fn=<NllLossBackward0>) tensor(5.4385, grad_fn=<NllLossBackward0>)
254 LOSS DIFF: tensor(5.5379, grad_fn=<NllLossBackward0>) tensor(5.4373, grad_fn=<NllLossBackward0>)
255 LOSS DIFF: tensor(5.5022, grad_fn=<NllLossBackward0>) tensor(5.4306, grad_fn=<NllLossBackward0>)
256 LOSS DIFF: tensor(5.5225, grad_fn=<NllLossBackward0>) tensor(5.4898, grad_fn=<NllLossBackward0>)
257 LOSS DIFF: tensor(5.6141, grad_fn=<NllLossBackward0>) tensor(5.5225, grad_fn=<NllLossBackward0>)
258 LOSS DIFF: tensor(5.4873, grad_fn=<NllLossBackward0>) tensor(5.4444, grad_fn=<NllLossBackward0>)
259 LOSS DIFF: tensor(5.6677, grad_fn=<NllLossBackward0>) tensor(5.4873, grad_fn=<NllLossBackward0>)
260 LOSS DIFF: tensor(5.5404, grad_fn=<NllLossBackward0>) tensor(5.4581, grad_fn=<NllLossBackward0>)
261 LOSS DIFF: tensor(5.5603, grad_fn=<NllLossBackward0>) tensor(5.3583, grad_fn=<NllLossBackward0>)
262 LOSS DIFF: tensor(5.5292, grad_fn=<NllLossBackward0>) tensor(5.2255, grad_fn=<NllLossBackward0>)
263 LOSS DIFF: tensor(5.4456, grad_fn=<NllLossBackward0>) tensor(5.3846, grad_fn=<NllLossBackward0>)
264 LOSS DIFF: tensor(5.4504, grad_fn=<NllLossBackward0>) tensor(5.4456, grad_fn=<NllLossBackward0>)
265 LOSS DIFF: tensor(5.4899, grad_fn=<NllLossBackward0>) tensor(5.3406, grad_fn=<NllLossBackward0>)
266 LOSS DIFF: tensor(5.5023, grad_fn=<NllLossBackward0>) tensor(5.4899, grad_fn=<NllLossBackward0>)
267 LOSS DIFF: tensor(5.3884, grad_fn=<NllLossBackward0>) tensor(5.2800, grad_fn=<NllLossBackward0>)
268 LOSS DIFF: tensor(5.4713, grad_fn=<NllLossBackward0>) tensor(5.3884, grad_fn=<NllLossBackward0>)
269 LOSS DIFF: tensor(5.4810, grad_fn=<NllLossBackward0>) tensor(5.4713, grad_fn=<NllLossBackward0>)
270 LOSS DIFF: tensor(5.3896, grad_fn=<NllLossBackward0>) tensor(5.3593, grad_fn=<NllLossBackward0>)
271 LOSS DIFF: tensor(5.5195, grad_fn=<NllLossBackward0>) tensor(5.3896, grad_fn=<NllLossBackward0>)
272 LOSS DIFF: tensor(5.4173, grad_fn=<NllLossBackward0>) tensor(5.3982, grad_fn=<NllLossBackward0>)
273 LOSS DIFF: tensor(5.5428, grad_fn=<NllLossBackward0>) tensor(5.3779, grad_fn=<NllLossBackward0>)
274 LOSS DIFF: tensor(5.4749, grad_fn=<NllLossBackward0>) tensor(5.4675, grad_fn=<NllLossBackward0>)
275 LOSS DIFF: tensor(5.3978, grad_fn=<NllLossBackward0>) tensor(5.2620, grad_fn=<NllLossBackward0>)
276 LOSS DIFF: tensor(5.4689, grad_fn=<NllLossBackward0>) tensor(5.3978, grad_fn=<NllLossBackward0>)
277 LOSS DIFF: tensor(5.4733, grad_fn=<NllLossBackward0>) tensor(5.4689, grad_fn=<NllLossBackward0>)
278 LOSS DIFF: tensor(5.5054, grad_fn=<NllLossBackward0>) tensor(5.4733, grad_fn=<NllLossBackward0>)
279 LOSS DIFF: tensor(5.4809, grad_fn=<NllLossBackward0>) tensor(5.4288, grad_fn=<NllLossBackward0>)
280 LOSS DIFF: tensor(5.5698, grad_fn=<NllLossBackward0>) tensor(5.4809, grad_fn=<NllLossBackward0>)
281 LOSS DIFF: tensor(5.5550, grad_fn=<NllLossBackward0>) tensor(5.4103, grad_fn=<NllLossBackward0>)
282 LOSS DIFF: tensor(5.5803, grad_fn=<NllLossBackward0>) tensor(5.5550, grad_fn=<NllLossBackward0>)
283 LOSS DIFF: tensor(5.5616, grad_fn=<NllLossBackward0>) tensor(5.4858, grad_fn=<NllLossBackward0>)
284 LOSS DIFF: tensor(5.4863, grad_fn=<NllLossBackward0>) tensor(5.3357, grad_fn=<NllLossBackward0>)
285 LOSS DIFF: tensor(5.3506, grad_fn=<NllLossBackward0>) tensor(5.2871, grad_fn=<NllLossBackward0>)
286 LOSS DIFF: tensor(5.6320, grad_fn=<NllLossBackward0>) tensor(5.3506, grad_fn=<NllLossBackward0>)
287 LOSS DIFF: tensor(5.4488, grad_fn=<NllLossBackward0>) tensor(5.4314, grad_fn=<NllLossBackward0>)
288 LOSS DIFF: tensor(5.4596, grad_fn=<NllLossBackward0>) tensor(5.4488, grad_fn=<NllLossBackward0>)
289 LOSS DIFF: tensor(5.5325, grad_fn=<NllLossBackward0>) tensor(5.4596, grad_fn=<NllLossBackward0>)
290 LOSS DIFF: tensor(5.4566, grad_fn=<NllLossBackward0>) tensor(5.2072, grad_fn=<NllLossBackward0>)
291 LOSS DIFF: tensor(5.4784, grad_fn=<NllLossBackward0>) tensor(5.4303, grad_fn=<NllLossBackward0>)
292 LOSS DIFF: tensor(5.4439, grad_fn=<NllLossBackward0>) tensor(5.3270, grad_fn=<NllLossBackward0>)
293 LOSS DIFF: tensor(5.5160, grad_fn=<NllLossBackward0>) tensor(5.4439, grad_fn=<NllLossBackward0>)
294 LOSS DIFF: tensor(5.4134, grad_fn=<NllLossBackward0>) tensor(5.3536, grad_fn=<NllLossBackward0>)
295 LOSS DIFF: tensor(5.4426, grad_fn=<NllLossBackward0>) tensor(5.4134, grad_fn=<NllLossBackward0>)
296 LOSS DIFF: tensor(5.3758, grad_fn=<NllLossBackward0>) tensor(5.3700, grad_fn=<NllLossBackward0>)
297 LOSS DIFF: tensor(5.5559, grad_fn=<NllLossBackward0>) tensor(5.3758, grad_fn=<NllLossBackward0>)
600 tensor(5.4824, grad_fn=<NllLossBackward0>)
298 LOSS DIFF: tensor(5.3795, grad_fn=<NllLossBackward0>) tensor(5.3762, grad_fn=<NllLossBackward0>)
299 LOSS DIFF: tensor(5.3878, grad_fn=<NllLossBackward0>) tensor(5.3795, grad_fn=<NllLossBackward0>)
300 LOSS DIFF: tensor(5.4699, grad_fn=<NllLossBackward0>) tensor(5.3878, grad_fn=<NllLossBackward0>)
301 LOSS DIFF: tensor(5.4967, grad_fn=<NllLossBackward0>) tensor(5.4699, grad_fn=<NllLossBackward0>)
302 LOSS DIFF: tensor(5.5724, grad_fn=<NllLossBackward0>) tensor(5.4967, grad_fn=<NllLossBackward0>)
303 LOSS DIFF: tensor(5.4520, grad_fn=<NllLossBackward0>) tensor(5.4072, grad_fn=<NllLossBackward0>)
304 LOSS DIFF: tensor(5.5089, grad_fn=<NllLossBackward0>) tensor(5.4520, grad_fn=<NllLossBackward0>)
305 LOSS DIFF: tensor(5.5398, grad_fn=<NllLossBackward0>) tensor(5.3168, grad_fn=<NllLossBackward0>)
306 LOSS DIFF: tensor(5.3561, grad_fn=<NllLossBackward0>) tensor(5.3058, grad_fn=<NllLossBackward0>)
307 LOSS DIFF: tensor(5.4668, grad_fn=<NllLossBackward0>) tensor(5.3448, grad_fn=<NllLossBackward0>)
308 LOSS DIFF: tensor(5.4964, grad_fn=<NllLossBackward0>) tensor(5.4668, grad_fn=<NllLossBackward0>)
309 LOSS DIFF: tensor(5.4440, grad_fn=<NllLossBackward0>) tensor(5.3221, grad_fn=<NllLossBackward0>)
310 LOSS DIFF: tensor(5.4516, grad_fn=<NllLossBackward0>) tensor(5.4289, grad_fn=<NllLossBackward0>)
311 LOSS DIFF: tensor(5.4969, grad_fn=<NllLossBackward0>) tensor(5.3983, grad_fn=<NllLossBackward0>)
312 LOSS DIFF: tensor(5.4254, grad_fn=<NllLossBackward0>) tensor(5.3790, grad_fn=<NllLossBackward0>)
313 LOSS DIFF: tensor(5.4874, grad_fn=<NllLossBackward0>) tensor(5.4254, grad_fn=<NllLossBackward0>)
314 LOSS DIFF: tensor(5.3839, grad_fn=<NllLossBackward0>) tensor(5.3470, grad_fn=<NllLossBackward0>)
315 LOSS DIFF: tensor(5.5822, grad_fn=<NllLossBackward0>) tensor(5.3839, grad_fn=<NllLossBackward0>)
316 LOSS DIFF: tensor(5.4169, grad_fn=<NllLossBackward0>) tensor(5.3044, grad_fn=<NllLossBackward0>)
317 LOSS DIFF: tensor(5.4778, grad_fn=<NllLossBackward0>) tensor(5.4169, grad_fn=<NllLossBackward0>)
318 LOSS DIFF: tensor(5.3589, grad_fn=<NllLossBackward0>) tensor(5.2238, grad_fn=<NllLossBackward0>)
319 LOSS DIFF: tensor(5.3547, grad_fn=<NllLossBackward0>) tensor(5.3184, grad_fn=<NllLossBackward0>)
320 LOSS DIFF: tensor(5.5022, grad_fn=<NllLossBackward0>) tensor(5.3547, grad_fn=<NllLossBackward0>)
321 LOSS DIFF: tensor(5.4749, grad_fn=<NllLossBackward0>) tensor(5.4294, grad_fn=<NllLossBackward0>)
322 LOSS DIFF: tensor(5.3813, grad_fn=<NllLossBackward0>) tensor(5.3557, grad_fn=<NllLossBackward0>)
323 LOSS DIFF: tensor(5.4019, grad_fn=<NllLossBackward0>) tensor(5.3813, grad_fn=<NllLossBackward0>)
324 LOSS DIFF: tensor(5.7250, grad_fn=<NllLossBackward0>) tensor(5.4019, grad_fn=<NllLossBackward0>)
325 LOSS DIFF: tensor(5.4055, grad_fn=<NllLossBackward0>) tensor(5.3304, grad_fn=<NllLossBackward0>)
326 LOSS DIFF: tensor(5.4721, grad_fn=<NllLossBackward0>) tensor(5.4055, grad_fn=<NllLossBackward0>)
327 LOSS DIFF: tensor(5.4590, grad_fn=<NllLossBackward0>) tensor(5.3773, grad_fn=<NllLossBackward0>)
328 LOSS DIFF: tensor(5.6097, grad_fn=<NllLossBackward0>) tensor(5.4590, grad_fn=<NllLossBackward0>)
329 LOSS DIFF: tensor(5.5304, grad_fn=<NllLossBackward0>) tensor(5.2807, grad_fn=<NllLossBackward0>)
330 LOSS DIFF: tensor(5.4286, grad_fn=<NllLossBackward0>) tensor(5.3879, grad_fn=<NllLossBackward0>)
331 LOSS DIFF: tensor(5.4221, grad_fn=<NllLossBackward0>) tensor(5.2779, grad_fn=<NllLossBackward0>)
332 LOSS DIFF: tensor(5.3690, grad_fn=<NllLossBackward0>) tensor(5.3191, grad_fn=<NllLossBackward0>)
333 LOSS DIFF: tensor(5.3814, grad_fn=<NllLossBackward0>) tensor(5.3690, grad_fn=<NllLossBackward0>)
334 LOSS DIFF: tensor(5.4241, grad_fn=<NllLossBackward0>) tensor(5.3760, grad_fn=<NllLossBackward0>)
335 LOSS DIFF: tensor(5.4727, grad_fn=<NllLossBackward0>) tensor(5.4241, grad_fn=<NllLossBackward0>)
336 LOSS DIFF: tensor(5.4216, grad_fn=<NllLossBackward0>) tensor(5.3401, grad_fn=<NllLossBackward0>)
337 LOSS DIFF: tensor(5.4938, grad_fn=<NllLossBackward0>) tensor(5.3908, grad_fn=<NllLossBackward0>)
338 LOSS DIFF: tensor(5.4742, grad_fn=<NllLossBackward0>) tensor(5.3384, grad_fn=<NllLossBackward0>)
339 LOSS DIFF: tensor(5.4628, grad_fn=<NllLossBackward0>) tensor(5.2785, grad_fn=<NllLossBackward0>)
340 LOSS DIFF: tensor(5.5419, grad_fn=<NllLossBackward0>) tensor(5.3019, grad_fn=<NllLossBackward0>)
341 LOSS DIFF: tensor(5.4736, grad_fn=<NllLossBackward0>) tensor(5.3646, grad_fn=<NllLossBackward0>)
342 LOSS DIFF: tensor(5.4150, grad_fn=<NllLossBackward0>) tensor(5.3511, grad_fn=<NllLossBackward0>)
343 LOSS DIFF: tensor(5.4531, grad_fn=<NllLossBackward0>) tensor(5.2982, grad_fn=<NllLossBackward0>)
344 LOSS DIFF: tensor(5.4617, grad_fn=<NllLossBackward0>) tensor(5.4531, grad_fn=<NllLossBackward0>)
345 LOSS DIFF: tensor(5.4939, grad_fn=<NllLossBackward0>) tensor(5.4617, grad_fn=<NllLossBackward0>)
346 LOSS DIFF: tensor(5.4178, grad_fn=<NllLossBackward0>) tensor(5.3127, grad_fn=<NllLossBackward0>)
700 tensor(5.7095, grad_fn=<NllLossBackward0>)
347 LOSS DIFF: tensor(5.7095, grad_fn=<NllLossBackward0>) tensor(5.3593, grad_fn=<NllLossBackward0>)
348 LOSS DIFF: tensor(5.4054, grad_fn=<NllLossBackward0>) tensor(5.3883, grad_fn=<NllLossBackward0>)
349 LOSS DIFF: tensor(5.6016, grad_fn=<NllLossBackward0>) tensor(5.4054, grad_fn=<NllLossBackward0>)
350 LOSS DIFF: tensor(5.4695, grad_fn=<NllLossBackward0>) tensor(5.4424, grad_fn=<NllLossBackward0>)
351 LOSS DIFF: tensor(5.5022, grad_fn=<NllLossBackward0>) tensor(5.4695, grad_fn=<NllLossBackward0>)
352 LOSS DIFF: tensor(5.5172, grad_fn=<NllLossBackward0>) tensor(5.4135, grad_fn=<NllLossBackward0>)
353 LOSS DIFF: tensor(5.5003, grad_fn=<NllLossBackward0>) tensor(5.3490, grad_fn=<NllLossBackward0>)
354 LOSS DIFF: tensor(5.3198, grad_fn=<NllLossBackward0>) tensor(5.2805, grad_fn=<NllLossBackward0>)
355 LOSS DIFF: tensor(5.3726, grad_fn=<NllLossBackward0>) tensor(5.3198, grad_fn=<NllLossBackward0>)
356 LOSS DIFF: tensor(5.3992, grad_fn=<NllLossBackward0>) tensor(5.3726, grad_fn=<NllLossBackward0>)
357 LOSS DIFF: tensor(5.5122, grad_fn=<NllLossBackward0>) tensor(5.3992, grad_fn=<NllLossBackward0>)
358 LOSS DIFF: tensor(5.6000, grad_fn=<NllLossBackward0>) tensor(5.3476, grad_fn=<NllLossBackward0>)
359 LOSS DIFF: tensor(5.4421, grad_fn=<NllLossBackward0>) tensor(5.3207, grad_fn=<NllLossBackward0>)
360 LOSS DIFF: tensor(5.6211, grad_fn=<NllLossBackward0>) tensor(5.4421, grad_fn=<NllLossBackward0>)
361 LOSS DIFF: tensor(5.3617, grad_fn=<NllLossBackward0>) tensor(5.3425, grad_fn=<NllLossBackward0>)
362 LOSS DIFF: tensor(5.3828, grad_fn=<NllLossBackward0>) tensor(5.3617, grad_fn=<NllLossBackward0>)
363 LOSS DIFF: tensor(5.4569, grad_fn=<NllLossBackward0>) tensor(5.3828, grad_fn=<NllLossBackward0>)
364 LOSS DIFF: tensor(5.4314, grad_fn=<NllLossBackward0>) tensor(5.2452, grad_fn=<NllLossBackward0>)
365 LOSS DIFF: tensor(5.5384, grad_fn=<NllLossBackward0>) tensor(5.4314, grad_fn=<NllLossBackward0>)
366 LOSS DIFF: tensor(5.4293, grad_fn=<NllLossBackward0>) tensor(5.3797, grad_fn=<NllLossBackward0>)
367 LOSS DIFF: tensor(5.4823, grad_fn=<NllLossBackward0>) tensor(5.4289, grad_fn=<NllLossBackward0>)
368 LOSS DIFF: tensor(5.4602, grad_fn=<NllLossBackward0>) tensor(5.3212, grad_fn=<NllLossBackward0>)
369 LOSS DIFF: tensor(5.4459, grad_fn=<NllLossBackward0>) tensor(5.3457, grad_fn=<NllLossBackward0>)
370 LOSS DIFF: tensor(5.5089, grad_fn=<NllLossBackward0>) tensor(5.3548, grad_fn=<NllLossBackward0>)
371 LOSS DIFF: tensor(5.3639, grad_fn=<NllLossBackward0>) tensor(5.2607, grad_fn=<NllLossBackward0>)
372 LOSS DIFF: tensor(5.4079, grad_fn=<NllLossBackward0>) tensor(5.3639, grad_fn=<NllLossBackward0>)
373 LOSS DIFF: tensor(5.5557, grad_fn=<NllLossBackward0>) tensor(5.4079, grad_fn=<NllLossBackward0>)
374 LOSS DIFF: tensor(5.3965, grad_fn=<NllLossBackward0>) tensor(5.3427, grad_fn=<NllLossBackward0>)
375 LOSS DIFF: tensor(5.4149, grad_fn=<NllLossBackward0>) tensor(5.3965, grad_fn=<NllLossBackward0>)
376 LOSS DIFF: tensor(5.3285, grad_fn=<NllLossBackward0>) tensor(5.3265, grad_fn=<NllLossBackward0>)
377 LOSS DIFF: tensor(5.3672, grad_fn=<NllLossBackward0>) tensor(5.3285, grad_fn=<NllLossBackward0>)
378 LOSS DIFF: tensor(5.4523, grad_fn=<NllLossBackward0>) tensor(5.3471, grad_fn=<NllLossBackward0>)
379 LOSS DIFF: tensor(5.4315, grad_fn=<NllLossBackward0>) tensor(5.4231, grad_fn=<NllLossBackward0>)
380 LOSS DIFF: tensor(5.5363, grad_fn=<NllLossBackward0>) tensor(5.4315, grad_fn=<NllLossBackward0>)
381 LOSS DIFF: tensor(5.4404, grad_fn=<NllLossBackward0>) tensor(5.4114, grad_fn=<NllLossBackward0>)
382 LOSS DIFF: tensor(5.2667, grad_fn=<NllLossBackward0>) tensor(5.2283, grad_fn=<NllLossBackward0>)
383 LOSS DIFF: tensor(5.3342, grad_fn=<NllLossBackward0>) tensor(5.2667, grad_fn=<NllLossBackward0>)
384 LOSS DIFF: tensor(5.4847, grad_fn=<NllLossBackward0>) tensor(5.3342, grad_fn=<NllLossBackward0>)
385 LOSS DIFF: tensor(5.5349, grad_fn=<NllLossBackward0>) tensor(5.4847, grad_fn=<NllLossBackward0>)
386 LOSS DIFF: tensor(5.4216, grad_fn=<NllLossBackward0>) tensor(5.2991, grad_fn=<NllLossBackward0>)
387 LOSS DIFF: tensor(5.4483, grad_fn=<NllLossBackward0>) tensor(5.3455, grad_fn=<NllLossBackward0>)
388 LOSS DIFF: tensor(5.4229, grad_fn=<NllLossBackward0>) tensor(5.3271, grad_fn=<NllLossBackward0>)
389 LOSS DIFF: tensor(5.5482, grad_fn=<NllLossBackward0>) tensor(5.4229, grad_fn=<NllLossBackward0>)
390 LOSS DIFF: tensor(5.4596, grad_fn=<NllLossBackward0>) tensor(5.3374, grad_fn=<NllLossBackward0>)
391 LOSS DIFF: tensor(5.4694, grad_fn=<NllLossBackward0>) tensor(5.4596, grad_fn=<NllLossBackward0>)
392 LOSS DIFF: tensor(5.4744, grad_fn=<NllLossBackward0>) tensor(5.3277, grad_fn=<NllLossBackward0>)
393 LOSS DIFF: tensor(5.4301, grad_fn=<NllLossBackward0>) tensor(5.3380, grad_fn=<NllLossBackward0>)
394 LOSS DIFF: tensor(5.2605, grad_fn=<NllLossBackward0>) tensor(5.2482, grad_fn=<NllLossBackward0>)
395 LOSS DIFF: tensor(5.4596, grad_fn=<NllLossBackward0>) tensor(5.2605, grad_fn=<NllLossBackward0>)
396 LOSS DIFF: tensor(5.3527, grad_fn=<NllLossBackward0>) tensor(5.2774, grad_fn=<NllLossBackward0>)
397 LOSS DIFF: tensor(5.5415, grad_fn=<NllLossBackward0>) tensor(5.3283, grad_fn=<NllLossBackward0>)
398 LOSS DIFF: tensor(5.5558, grad_fn=<NllLossBackward0>) tensor(5.4762, grad_fn=<NllLossBackward0>)
399 LOSS DIFF: tensor(5.3862, grad_fn=<NllLossBackward0>) tensor(5.3796, grad_fn=<NllLossBackward0>)
400 LOSS DIFF: tensor(5.5006, grad_fn=<NllLossBackward0>) tensor(5.2756, grad_fn=<NllLossBackward0>)
401 LOSS DIFF: tensor(5.4776, grad_fn=<NllLossBackward0>) tensor(5.2884, grad_fn=<NllLossBackward0>)
800 tensor(5.4405, grad_fn=<NllLossBackward0>)
402 LOSS DIFF: tensor(5.5078, grad_fn=<NllLossBackward0>) tensor(5.2731, grad_fn=<NllLossBackward0>)
403 LOSS DIFF: tensor(5.4186, grad_fn=<NllLossBackward0>) tensor(5.3394, grad_fn=<NllLossBackward0>)
404 LOSS DIFF: tensor(5.4645, grad_fn=<NllLossBackward0>) tensor(5.4186, grad_fn=<NllLossBackward0>)
405 LOSS DIFF: tensor(5.3991, grad_fn=<NllLossBackward0>) tensor(5.1863, grad_fn=<NllLossBackward0>)
406 LOSS DIFF: tensor(5.4625, grad_fn=<NllLossBackward0>) tensor(5.3991, grad_fn=<NllLossBackward0>)
407 LOSS DIFF: tensor(5.2887, grad_fn=<NllLossBackward0>) tensor(5.2630, grad_fn=<NllLossBackward0>)
408 LOSS DIFF: tensor(5.3613, grad_fn=<NllLossBackward0>) tensor(5.2887, grad_fn=<NllLossBackward0>)
409 LOSS DIFF: tensor(5.4549, grad_fn=<NllLossBackward0>) tensor(5.3613, grad_fn=<NllLossBackward0>)
410 LOSS DIFF: tensor(5.4254, grad_fn=<NllLossBackward0>) tensor(5.3545, grad_fn=<NllLossBackward0>)
411 LOSS DIFF: tensor(5.4779, grad_fn=<NllLossBackward0>) tensor(5.4254, grad_fn=<NllLossBackward0>)
412 LOSS DIFF: tensor(5.4206, grad_fn=<NllLossBackward0>) tensor(5.3494, grad_fn=<NllLossBackward0>)
413 LOSS DIFF: tensor(5.4468, grad_fn=<NllLossBackward0>) tensor(5.3558, grad_fn=<NllLossBackward0>)
414 LOSS DIFF: tensor(5.3703, grad_fn=<NllLossBackward0>) tensor(5.3009, grad_fn=<NllLossBackward0>)
415 LOSS DIFF: tensor(5.4129, grad_fn=<NllLossBackward0>) tensor(5.3703, grad_fn=<NllLossBackward0>)
416 LOSS DIFF: tensor(5.4347, grad_fn=<NllLossBackward0>) tensor(5.3186, grad_fn=<NllLossBackward0>)
417 LOSS DIFF: tensor(5.3410, grad_fn=<NllLossBackward0>) tensor(5.2797, grad_fn=<NllLossBackward0>)
418 LOSS DIFF: tensor(5.4206, grad_fn=<NllLossBackward0>) tensor(5.3410, grad_fn=<NllLossBackward0>)
419 LOSS DIFF: tensor(5.3961, grad_fn=<NllLossBackward0>) tensor(5.3201, grad_fn=<NllLossBackward0>)
420 LOSS DIFF: tensor(5.3999, grad_fn=<NllLossBackward0>) tensor(5.3961, grad_fn=<NllLossBackward0>)
421 LOSS DIFF: tensor(5.4644, grad_fn=<NllLossBackward0>) tensor(5.2622, grad_fn=<NllLossBackward0>)
422 LOSS DIFF: tensor(5.3218, grad_fn=<NllLossBackward0>) tensor(5.3111, grad_fn=<NllLossBackward0>)
423 LOSS DIFF: tensor(5.3554, grad_fn=<NllLossBackward0>) tensor(5.3218, grad_fn=<NllLossBackward0>)
424 LOSS DIFF: tensor(5.4028, grad_fn=<NllLossBackward0>) tensor(5.3554, grad_fn=<NllLossBackward0>)
425 LOSS DIFF: tensor(5.3832, grad_fn=<NllLossBackward0>) tensor(5.3375, grad_fn=<NllLossBackward0>)
426 LOSS DIFF: tensor(5.4313, grad_fn=<NllLossBackward0>) tensor(5.3181, grad_fn=<NllLossBackward0>)
427 LOSS DIFF: tensor(5.4721, grad_fn=<NllLossBackward0>) tensor(5.3831, grad_fn=<NllLossBackward0>)
428 LOSS DIFF: tensor(5.3902, grad_fn=<NllLossBackward0>) tensor(5.2394, grad_fn=<NllLossBackward0>)
429 LOSS DIFF: tensor(5.3492, grad_fn=<NllLossBackward0>) tensor(5.3336, grad_fn=<NllLossBackward0>)
430 LOSS DIFF: tensor(5.3523, grad_fn=<NllLossBackward0>) tensor(5.3492, grad_fn=<NllLossBackward0>)
431 LOSS DIFF: tensor(5.4211, grad_fn=<NllLossBackward0>) tensor(5.3486, grad_fn=<NllLossBackward0>)
432 LOSS DIFF: tensor(5.4755, grad_fn=<NllLossBackward0>) tensor(5.2288, grad_fn=<NllLossBackward0>)
433 LOSS DIFF: tensor(5.5728, grad_fn=<NllLossBackward0>) tensor(5.4755, grad_fn=<NllLossBackward0>)
434 LOSS DIFF: tensor(5.3855, grad_fn=<NllLossBackward0>) tensor(5.3527, grad_fn=<NllLossBackward0>)
435 LOSS DIFF: tensor(5.4776, grad_fn=<NllLossBackward0>) tensor(5.3855, grad_fn=<NllLossBackward0>)
436 LOSS DIFF: tensor(5.3750, grad_fn=<NllLossBackward0>) tensor(5.3262, grad_fn=<NllLossBackward0>)
437 LOSS DIFF: tensor(5.3902, grad_fn=<NllLossBackward0>) tensor(5.3750, grad_fn=<NllLossBackward0>)
438 LOSS DIFF: tensor(5.3135, grad_fn=<NllLossBackward0>) tensor(5.2863, grad_fn=<NllLossBackward0>)
439 LOSS DIFF: tensor(5.4483, grad_fn=<NllLossBackward0>) tensor(5.3135, grad_fn=<NllLossBackward0>)
440 LOSS DIFF: tensor(5.3201, grad_fn=<NllLossBackward0>) tensor(5.2603, grad_fn=<NllLossBackward0>)
441 LOSS DIFF: tensor(5.3807, grad_fn=<NllLossBackward0>) tensor(5.3201, grad_fn=<NllLossBackward0>)
442 LOSS DIFF: tensor(5.5009, grad_fn=<NllLossBackward0>) tensor(5.2434, grad_fn=<NllLossBackward0>)
443 LOSS DIFF: tensor(5.4282, grad_fn=<NllLossBackward0>) tensor(5.4278, grad_fn=<NllLossBackward0>)
444 LOSS DIFF: tensor(5.3787, grad_fn=<NllLossBackward0>) tensor(5.3128, grad_fn=<NllLossBackward0>)
445 LOSS DIFF: tensor(5.5917, grad_fn=<NllLossBackward0>) tensor(5.3324, grad_fn=<NllLossBackward0>)
446 LOSS DIFF: tensor(5.4186, grad_fn=<NllLossBackward0>) tensor(5.3144, grad_fn=<NllLossBackward0>)
447 LOSS DIFF: tensor(5.4553, grad_fn=<NllLossBackward0>) tensor(5.4186, grad_fn=<NllLossBackward0>)
448 LOSS DIFF: tensor(5.4903, grad_fn=<NllLossBackward0>) tensor(5.4553, grad_fn=<NllLossBackward0>)
449 LOSS DIFF: tensor(5.4295, grad_fn=<NllLossBackward0>) tensor(5.3503, grad_fn=<NllLossBackward0>)
450 LOSS DIFF: tensor(5.3945, grad_fn=<NllLossBackward0>) tensor(5.3607, grad_fn=<NllLossBackward0>)
451 LOSS DIFF: tensor(5.2822, grad_fn=<NllLossBackward0>) tensor(5.2387, grad_fn=<NllLossBackward0>)
452 LOSS DIFF: tensor(5.3334, grad_fn=<NllLossBackward0>) tensor(5.2822, grad_fn=<NllLossBackward0>)
453 LOSS DIFF: tensor(5.4073, grad_fn=<NllLossBackward0>) tensor(5.3334, grad_fn=<NllLossBackward0>)
454 LOSS DIFF: tensor(5.3797, grad_fn=<NllLossBackward0>) tensor(5.3469, grad_fn=<NllLossBackward0>)
455 LOSS DIFF: tensor(5.4848, grad_fn=<NllLossBackward0>) tensor(5.2529, grad_fn=<NllLossBackward0>)
900 tensor(5.3078, grad_fn=<NllLossBackward0>)
456 LOSS DIFF: tensor(5.4695, grad_fn=<NllLossBackward0>) tensor(5.3078, grad_fn=<NllLossBackward0>)
457 LOSS DIFF: tensor(5.4369, grad_fn=<NllLossBackward0>) tensor(5.3834, grad_fn=<NllLossBackward0>)
458 LOSS DIFF: tensor(5.4973, grad_fn=<NllLossBackward0>) tensor(5.4369, grad_fn=<NllLossBackward0>)
459 LOSS DIFF: tensor(5.4526, grad_fn=<NllLossBackward0>) tensor(5.3075, grad_fn=<NllLossBackward0>)
460 LOSS DIFF: tensor(5.4022, grad_fn=<NllLossBackward0>) tensor(5.2870, grad_fn=<NllLossBackward0>)
461 LOSS DIFF: tensor(5.3850, grad_fn=<NllLossBackward0>) tensor(5.2879, grad_fn=<NllLossBackward0>)
462 LOSS DIFF: tensor(5.4370, grad_fn=<NllLossBackward0>) tensor(5.3154, grad_fn=<NllLossBackward0>)
463 LOSS DIFF: tensor(5.4111, grad_fn=<NllLossBackward0>) tensor(5.3927, grad_fn=<NllLossBackward0>)
464 LOSS DIFF: tensor(5.4638, grad_fn=<NllLossBackward0>) tensor(5.4111, grad_fn=<NllLossBackward0>)
465 LOSS DIFF: tensor(5.3719, grad_fn=<NllLossBackward0>) tensor(5.3195, grad_fn=<NllLossBackward0>)
466 LOSS DIFF: tensor(5.4880, grad_fn=<NllLossBackward0>) tensor(5.3719, grad_fn=<NllLossBackward0>)
467 LOSS DIFF: tensor(5.4762, grad_fn=<NllLossBackward0>) tensor(5.4186, grad_fn=<NllLossBackward0>)
468 LOSS DIFF: tensor(5.3155, grad_fn=<NllLossBackward0>) tensor(5.2086, grad_fn=<NllLossBackward0>)
469 LOSS DIFF: tensor(5.4985, grad_fn=<NllLossBackward0>) tensor(5.3155, grad_fn=<NllLossBackward0>)
470 LOSS DIFF: tensor(5.4505, grad_fn=<NllLossBackward0>) tensor(5.3731, grad_fn=<NllLossBackward0>)
471 LOSS DIFF: tensor(5.4291, grad_fn=<NllLossBackward0>) tensor(5.3408, grad_fn=<NllLossBackward0>)
472 LOSS DIFF: tensor(5.3826, grad_fn=<NllLossBackward0>) tensor(5.3232, grad_fn=<NllLossBackward0>)
473 LOSS DIFF: tensor(5.4152, grad_fn=<NllLossBackward0>) tensor(5.3468, grad_fn=<NllLossBackward0>)
474 LOSS DIFF: tensor(5.4983, grad_fn=<NllLossBackward0>) tensor(5.4152, grad_fn=<NllLossBackward0>)
475 LOSS DIFF: tensor(5.5432, grad_fn=<NllLossBackward0>) tensor(5.3502, grad_fn=<NllLossBackward0>)
476 LOSS DIFF: tensor(5.3989, grad_fn=<NllLossBackward0>) tensor(5.3489, grad_fn=<NllLossBackward0>)
477 LOSS DIFF: tensor(5.4624, grad_fn=<NllLossBackward0>) tensor(5.3761, grad_fn=<NllLossBackward0>)
478 LOSS DIFF: tensor(5.4082, grad_fn=<NllLossBackward0>) tensor(5.4043, grad_fn=<NllLossBackward0>)
479 LOSS DIFF: tensor(5.4074, grad_fn=<NllLossBackward0>) tensor(5.3588, grad_fn=<NllLossBackward0>)
480 LOSS DIFF: tensor(5.4588, grad_fn=<NllLossBackward0>) tensor(5.4074, grad_fn=<NllLossBackward0>)
481 LOSS DIFF: tensor(5.3339, grad_fn=<NllLossBackward0>) tensor(5.2172, grad_fn=<NllLossBackward0>)
482 LOSS DIFF: tensor(5.4468, grad_fn=<NllLossBackward0>) tensor(5.3339, grad_fn=<NllLossBackward0>)
483 LOSS DIFF: tensor(5.4736, grad_fn=<NllLossBackward0>) tensor(5.4024, grad_fn=<NllLossBackward0>)
484 LOSS DIFF: tensor(5.3780, grad_fn=<NllLossBackward0>) tensor(5.3095, grad_fn=<NllLossBackward0>)
485 LOSS DIFF: tensor(5.4251, grad_fn=<NllLossBackward0>) tensor(5.3780, grad_fn=<NllLossBackward0>)
486 LOSS DIFF: tensor(5.4035, grad_fn=<NllLossBackward0>) tensor(5.3474, grad_fn=<NllLossBackward0>)
487 LOSS DIFF: tensor(5.3575, grad_fn=<NllLossBackward0>) tensor(5.2837, grad_fn=<NllLossBackward0>)
488 LOSS DIFF: tensor(5.4629, grad_fn=<NllLossBackward0>) tensor(5.3298, grad_fn=<NllLossBackward0>)
489 LOSS DIFF: tensor(5.4593, grad_fn=<NllLossBackward0>) tensor(5.4124, grad_fn=<NllLossBackward0>)
490 LOSS DIFF: tensor(5.4040, grad_fn=<NllLossBackward0>) tensor(5.3532, grad_fn=<NllLossBackward0>)
491 LOSS DIFF: tensor(5.4693, grad_fn=<NllLossBackward0>) tensor(5.4040, grad_fn=<NllLossBackward0>)
492 LOSS DIFF: tensor(5.4201, grad_fn=<NllLossBackward0>) tensor(5.3561, grad_fn=<NllLossBackward0>)
493 LOSS DIFF: tensor(5.4786, grad_fn=<NllLossBackward0>) tensor(5.4201, grad_fn=<NllLossBackward0>)
494 LOSS DIFF: tensor(5.3819, grad_fn=<NllLossBackward0>) tensor(5.3108, grad_fn=<NllLossBackward0>)
495 LOSS DIFF: tensor(5.3170, grad_fn=<NllLossBackward0>) tensor(5.3080, grad_fn=<NllLossBackward0>)
496 LOSS DIFF: tensor(5.3305, grad_fn=<NllLossBackward0>) tensor(5.2931, grad_fn=<NllLossBackward0>)
497 LOSS DIFF: tensor(5.3719, grad_fn=<NllLossBackward0>) tensor(5.3305, grad_fn=<NllLossBackward0>)
498 LOSS DIFF: tensor(5.3756, grad_fn=<NllLossBackward0>) tensor(5.3702, grad_fn=<NllLossBackward0>)
499 LOSS DIFF: tensor(5.4073, grad_fn=<NllLossBackward0>) tensor(5.1951, grad_fn=<NllLossBackward0>)
500 LOSS DIFF: tensor(5.4267, grad_fn=<NllLossBackward0>) tensor(5.3957, grad_fn=<NllLossBackward0>)
501 LOSS DIFF: tensor(5.3842, grad_fn=<NllLossBackward0>) tensor(5.3569, grad_fn=<NllLossBackward0>)
502 LOSS DIFF: tensor(5.4202, grad_fn=<NllLossBackward0>) tensor(5.3842, grad_fn=<NllLossBackward0>)
503 LOSS DIFF: tensor(5.3634, grad_fn=<NllLossBackward0>) tensor(5.2962, grad_fn=<NllLossBackward0>)
504 LOSS DIFF: tensor(5.4654, grad_fn=<NllLossBackward0>) tensor(5.3512, grad_fn=<NllLossBackward0>)
1000 tensor(5.4063, grad_fn=<NllLossBackward0>)
505 LOSS DIFF: tensor(5.4063, grad_fn=<NllLossBackward0>) tensor(5.3712, grad_fn=<NllLossBackward0>)
506 LOSS DIFF: tensor(5.3378, grad_fn=<NllLossBackward0>) tensor(5.2547, grad_fn=<NllLossBackward0>)
507 LOSS DIFF: tensor(5.3185, grad_fn=<NllLossBackward0>) tensor(5.2350, grad_fn=<NllLossBackward0>)
508 LOSS DIFF: tensor(5.3049, grad_fn=<NllLossBackward0>) tensor(5.1821, grad_fn=<NllLossBackward0>)
509 LOSS DIFF: tensor(5.4689, grad_fn=<NllLossBackward0>) tensor(5.3049, grad_fn=<NllLossBackward0>)
510 LOSS DIFF: tensor(5.1437, grad_fn=<NllLossBackward0>) tensor(5.1380, grad_fn=<NllLossBackward0>)
511 LOSS DIFF: tensor(5.3984, grad_fn=<NllLossBackward0>) tensor(5.1437, grad_fn=<NllLossBackward0>)
512 LOSS DIFF: tensor(5.5009, grad_fn=<NllLossBackward0>) tensor(5.2426, grad_fn=<NllLossBackward0>)
513 LOSS DIFF: tensor(5.3734, grad_fn=<NllLossBackward0>) tensor(5.3096, grad_fn=<NllLossBackward0>)
514 LOSS DIFF: tensor(5.3889, grad_fn=<NllLossBackward0>) tensor(5.3734, grad_fn=<NllLossBackward0>)
515 LOSS DIFF: tensor(5.4053, grad_fn=<NllLossBackward0>) tensor(5.3114, grad_fn=<NllLossBackward0>)
516 LOSS DIFF: tensor(5.3912, grad_fn=<NllLossBackward0>) tensor(5.2357, grad_fn=<NllLossBackward0>)
517 LOSS DIFF: tensor(5.4400, grad_fn=<NllLossBackward0>) tensor(5.3115, grad_fn=<NllLossBackward0>)
518 LOSS DIFF: tensor(5.4756, grad_fn=<NllLossBackward0>) tensor(5.2689, grad_fn=<NllLossBackward0>)
519 LOSS DIFF: tensor(5.3111, grad_fn=<NllLossBackward0>) tensor(5.1618, grad_fn=<NllLossBackward0>)
520 LOSS DIFF: tensor(5.3974, grad_fn=<NllLossBackward0>) tensor(5.3030, grad_fn=<NllLossBackward0>)
521 LOSS DIFF: tensor(5.3955, grad_fn=<NllLossBackward0>) tensor(5.2872, grad_fn=<NllLossBackward0>)
522 LOSS DIFF: tensor(5.4712, grad_fn=<NllLossBackward0>) tensor(5.3863, grad_fn=<NllLossBackward0>)
523 LOSS DIFF: tensor(5.4095, grad_fn=<NllLossBackward0>) tensor(5.3686, grad_fn=<NllLossBackward0>)
524 LOSS DIFF: tensor(5.3285, grad_fn=<NllLossBackward0>) tensor(5.2293, grad_fn=<NllLossBackward0>)
525 LOSS DIFF: tensor(5.3468, grad_fn=<NllLossBackward0>) tensor(5.2348, grad_fn=<NllLossBackward0>)
526 LOSS DIFF: tensor(5.3140, grad_fn=<NllLossBackward0>) tensor(5.2460, grad_fn=<NllLossBackward0>)
527 LOSS DIFF: tensor(5.3772, grad_fn=<NllLossBackward0>) tensor(5.3140, grad_fn=<NllLossBackward0>)
528 LOSS DIFF: tensor(5.3576, grad_fn=<NllLossBackward0>) tensor(5.3363, grad_fn=<NllLossBackward0>)
529 LOSS DIFF: tensor(5.2631, grad_fn=<NllLossBackward0>) tensor(5.2239, grad_fn=<NllLossBackward0>)
530 LOSS DIFF: tensor(5.4207, grad_fn=<NllLossBackward0>) tensor(5.2631, grad_fn=<NllLossBackward0>)
531 LOSS DIFF: tensor(5.4238, grad_fn=<NllLossBackward0>) tensor(5.2798, grad_fn=<NllLossBackward0>)
532 LOSS DIFF: tensor(5.4496, grad_fn=<NllLossBackward0>) tensor(5.2819, grad_fn=<NllLossBackward0>)
533 LOSS DIFF: tensor(5.2788, grad_fn=<NllLossBackward0>) tensor(5.2125, grad_fn=<NllLossBackward0>)
534 LOSS DIFF: tensor(5.3159, grad_fn=<NllLossBackward0>) tensor(5.2788, grad_fn=<NllLossBackward0>)
535 LOSS DIFF: tensor(5.3200, grad_fn=<NllLossBackward0>) tensor(5.3159, grad_fn=<NllLossBackward0>)
536 LOSS DIFF: tensor(5.3934, grad_fn=<NllLossBackward0>) tensor(5.3087, grad_fn=<NllLossBackward0>)
537 LOSS DIFF: tensor(5.2843, grad_fn=<NllLossBackward0>) tensor(5.2815, grad_fn=<NllLossBackward0>)
538 LOSS DIFF: tensor(5.5309, grad_fn=<NllLossBackward0>) tensor(5.2377, grad_fn=<NllLossBackward0>)
539 LOSS DIFF: tensor(5.4258, grad_fn=<NllLossBackward0>) tensor(5.3734, grad_fn=<NllLossBackward0>)
540 LOSS DIFF: tensor(5.4562, grad_fn=<NllLossBackward0>) tensor(5.2893, grad_fn=<NllLossBackward0>)
541 LOSS DIFF: tensor(5.3672, grad_fn=<NllLossBackward0>) tensor(5.3331, grad_fn=<NllLossBackward0>)
542 LOSS DIFF: tensor(5.3475, grad_fn=<NllLossBackward0>) tensor(5.3409, grad_fn=<NllLossBackward0>)
543 LOSS DIFF: tensor(5.3826, grad_fn=<NllLossBackward0>) tensor(5.3475, grad_fn=<NllLossBackward0>)
544 LOSS DIFF: tensor(5.4529, grad_fn=<NllLossBackward0>) tensor(5.3826, grad_fn=<NllLossBackward0>)
545 LOSS DIFF: tensor(5.4554, grad_fn=<NllLossBackward0>) tensor(5.3758, grad_fn=<NllLossBackward0>)
546 LOSS DIFF: tensor(5.3725, grad_fn=<NllLossBackward0>) tensor(5.2762, grad_fn=<NllLossBackward0>)
547 LOSS DIFF: tensor(5.3809, grad_fn=<NllLossBackward0>) tensor(5.3140, grad_fn=<NllLossBackward0>)
548 LOSS DIFF: tensor(5.4411, grad_fn=<NllLossBackward0>) tensor(5.3809, grad_fn=<NllLossBackward0>)
1100 tensor(5.2577, grad_fn=<NllLossBackward0>)
549 LOSS DIFF: tensor(5.3207, grad_fn=<NllLossBackward0>) tensor(5.2233, grad_fn=<NllLossBackward0>)
550 LOSS DIFF: tensor(5.3287, grad_fn=<NllLossBackward0>) tensor(5.3207, grad_fn=<NllLossBackward0>)
551 LOSS DIFF: tensor(5.4455, grad_fn=<NllLossBackward0>) tensor(5.3140, grad_fn=<NllLossBackward0>)
552 LOSS DIFF: tensor(5.3970, grad_fn=<NllLossBackward0>) tensor(5.3160, grad_fn=<NllLossBackward0>)
553 LOSS DIFF: tensor(5.4958, grad_fn=<NllLossBackward0>) tensor(5.3970, grad_fn=<NllLossBackward0>)
554 LOSS DIFF: tensor(5.4289, grad_fn=<NllLossBackward0>) tensor(5.3781, grad_fn=<NllLossBackward0>)
555 LOSS DIFF: tensor(5.3988, grad_fn=<NllLossBackward0>) tensor(5.2830, grad_fn=<NllLossBackward0>)
556 LOSS DIFF: tensor(5.3452, grad_fn=<NllLossBackward0>) tensor(5.3121, grad_fn=<NllLossBackward0>)
557 LOSS DIFF: tensor(5.3707, grad_fn=<NllLossBackward0>) tensor(5.3452, grad_fn=<NllLossBackward0>)
558 LOSS DIFF: tensor(5.4004, grad_fn=<NllLossBackward0>) tensor(5.3490, grad_fn=<NllLossBackward0>)
559 LOSS DIFF: tensor(5.3442, grad_fn=<NllLossBackward0>) tensor(5.2255, grad_fn=<NllLossBackward0>)
560 LOSS DIFF: tensor(5.3311, grad_fn=<NllLossBackward0>) tensor(5.3145, grad_fn=<NllLossBackward0>)
561 LOSS DIFF: tensor(5.4662, grad_fn=<NllLossBackward0>) tensor(5.3171, grad_fn=<NllLossBackward0>)
562 LOSS DIFF: tensor(5.3376, grad_fn=<NllLossBackward0>) tensor(5.3006, grad_fn=<NllLossBackward0>)
563 LOSS DIFF: tensor(5.3617, grad_fn=<NllLossBackward0>) tensor(5.3376, grad_fn=<NllLossBackward0>)
564 LOSS DIFF: tensor(5.3627, grad_fn=<NllLossBackward0>) tensor(5.3617, grad_fn=<NllLossBackward0>)
565 LOSS DIFF: tensor(5.3169, grad_fn=<NllLossBackward0>) tensor(5.2494, grad_fn=<NllLossBackward0>)
566 LOSS DIFF: tensor(5.3391, grad_fn=<NllLossBackward0>) tensor(5.2797, grad_fn=<NllLossBackward0>)
567 LOSS DIFF: tensor(5.3793, grad_fn=<NllLossBackward0>) tensor(5.3391, grad_fn=<NllLossBackward0>)
568 LOSS DIFF: tensor(5.3983, grad_fn=<NllLossBackward0>) tensor(5.3793, grad_fn=<NllLossBackward0>)
569 LOSS DIFF: tensor(5.3797, grad_fn=<NllLossBackward0>) tensor(5.1963, grad_fn=<NllLossBackward0>)
570 LOSS DIFF: tensor(5.3978, grad_fn=<NllLossBackward0>) tensor(5.3797, grad_fn=<NllLossBackward0>)
571 LOSS DIFF: tensor(5.4648, grad_fn=<NllLossBackward0>) tensor(5.2794, grad_fn=<NllLossBackward0>)
572 LOSS DIFF: tensor(5.3364, grad_fn=<NllLossBackward0>) tensor(5.3139, grad_fn=<NllLossBackward0>)
573 LOSS DIFF: tensor(5.3724, grad_fn=<NllLossBackward0>) tensor(5.3364, grad_fn=<NllLossBackward0>)
574 LOSS DIFF: tensor(5.4125, grad_fn=<NllLossBackward0>) tensor(5.3724, grad_fn=<NllLossBackward0>)
575 LOSS DIFF: tensor(5.4216, grad_fn=<NllLossBackward0>) tensor(5.3249, grad_fn=<NllLossBackward0>)
576 LOSS DIFF: tensor(5.3209, grad_fn=<NllLossBackward0>) tensor(5.2087, grad_fn=<NllLossBackward0>)
577 LOSS DIFF: tensor(5.2730, grad_fn=<NllLossBackward0>) tensor(5.2515, grad_fn=<NllLossBackward0>)
578 LOSS DIFF: tensor(5.3871, grad_fn=<NllLossBackward0>) tensor(5.2537, grad_fn=<NllLossBackward0>)
579 LOSS DIFF: tensor(5.2357, grad_fn=<NllLossBackward0>) tensor(5.1883, grad_fn=<NllLossBackward0>)
580 LOSS DIFF: tensor(5.4435, grad_fn=<NllLossBackward0>) tensor(5.2357, grad_fn=<NllLossBackward0>)
581 LOSS DIFF: tensor(5.3116, grad_fn=<NllLossBackward0>) tensor(5.2408, grad_fn=<NllLossBackward0>)
582 LOSS DIFF: tensor(5.4295, grad_fn=<NllLossBackward0>) tensor(5.3116, grad_fn=<NllLossBackward0>)
583 LOSS DIFF: tensor(5.3725, grad_fn=<NllLossBackward0>) tensor(5.2704, grad_fn=<NllLossBackward0>)
584 LOSS DIFF: tensor(5.3951, grad_fn=<NllLossBackward0>) tensor(5.3211, grad_fn=<NllLossBackward0>)
585 LOSS DIFF: tensor(5.4080, grad_fn=<NllLossBackward0>) tensor(5.3951, grad_fn=<NllLossBackward0>)
586 LOSS DIFF: tensor(5.3569, grad_fn=<NllLossBackward0>) tensor(5.2900, grad_fn=<NllLossBackward0>)
587 LOSS DIFF: tensor(5.3004, grad_fn=<NllLossBackward0>) tensor(5.2806, grad_fn=<NllLossBackward0>)
588 LOSS DIFF: tensor(5.3874, grad_fn=<NllLossBackward0>) tensor(5.3004, grad_fn=<NllLossBackward0>)
589 LOSS DIFF: tensor(5.4849, grad_fn=<NllLossBackward0>) tensor(5.2921, grad_fn=<NllLossBackward0>)
590 LOSS DIFF: tensor(5.2856, grad_fn=<NllLossBackward0>) tensor(5.2661, grad_fn=<NllLossBackward0>)
591 LOSS DIFF: tensor(5.4242, grad_fn=<NllLossBackward0>) tensor(5.2856, grad_fn=<NllLossBackward0>)
592 LOSS DIFF: tensor(5.2910, grad_fn=<NllLossBackward0>) tensor(5.1762, grad_fn=<NllLossBackward0>)
593 LOSS DIFF: tensor(5.3048, grad_fn=<NllLossBackward0>) tensor(5.1369, grad_fn=<NllLossBackward0>)
594 LOSS DIFF: tensor(5.3170, grad_fn=<NllLossBackward0>) tensor(5.3048, grad_fn=<NllLossBackward0>)
595 LOSS DIFF: tensor(5.4164, grad_fn=<NllLossBackward0>) tensor(5.3170, grad_fn=<NllLossBackward0>)
1200 tensor(5.2414, grad_fn=<NllLossBackward0>)
596 LOSS DIFF: tensor(5.4063, grad_fn=<NllLossBackward0>) tensor(5.2414, grad_fn=<NllLossBackward0>)
597 LOSS DIFF: tensor(5.3547, grad_fn=<NllLossBackward0>) tensor(5.2150, grad_fn=<NllLossBackward0>)
598 LOSS DIFF: tensor(5.2713, grad_fn=<NllLossBackward0>) tensor(5.2182, grad_fn=<NllLossBackward0>)
599 LOSS DIFF: tensor(5.2934, grad_fn=<NllLossBackward0>) tensor(5.2713, grad_fn=<NllLossBackward0>)
600 LOSS DIFF: tensor(5.3680, grad_fn=<NllLossBackward0>) tensor(5.2934, grad_fn=<NllLossBackward0>)
601 LOSS DIFF: tensor(5.3810, grad_fn=<NllLossBackward0>) tensor(5.2937, grad_fn=<NllLossBackward0>)
602 LOSS DIFF: tensor(5.2992, grad_fn=<NllLossBackward0>) tensor(5.2390, grad_fn=<NllLossBackward0>)
603 LOSS DIFF: tensor(5.3592, grad_fn=<NllLossBackward0>) tensor(5.2325, grad_fn=<NllLossBackward0>)
604 LOSS DIFF: tensor(5.4165, grad_fn=<NllLossBackward0>) tensor(5.2317, grad_fn=<NllLossBackward0>)
605 LOSS DIFF: tensor(5.5033, grad_fn=<NllLossBackward0>) tensor(5.4165, grad_fn=<NllLossBackward0>)
606 LOSS DIFF: tensor(5.4137, grad_fn=<NllLossBackward0>) tensor(5.1996, grad_fn=<NllLossBackward0>)
607 LOSS DIFF: tensor(5.5262, grad_fn=<NllLossBackward0>) tensor(5.4137, grad_fn=<NllLossBackward0>)
608 LOSS DIFF: tensor(5.3964, grad_fn=<NllLossBackward0>) tensor(5.3314, grad_fn=<NllLossBackward0>)
609 LOSS DIFF: tensor(5.3722, grad_fn=<NllLossBackward0>) tensor(5.3268, grad_fn=<NllLossBackward0>)
610 LOSS DIFF: tensor(5.3378, grad_fn=<NllLossBackward0>) tensor(5.3186, grad_fn=<NllLossBackward0>)
611 LOSS DIFF: tensor(5.4699, grad_fn=<NllLossBackward0>) tensor(5.3378, grad_fn=<NllLossBackward0>)
612 LOSS DIFF: tensor(5.4191, grad_fn=<NllLossBackward0>) tensor(5.3715, grad_fn=<NllLossBackward0>)
613 LOSS DIFF: tensor(5.3107, grad_fn=<NllLossBackward0>) tensor(5.2864, grad_fn=<NllLossBackward0>)
614 LOSS DIFF: tensor(5.3746, grad_fn=<NllLossBackward0>) tensor(5.2844, grad_fn=<NllLossBackward0>)
615 LOSS DIFF: tensor(5.4486, grad_fn=<NllLossBackward0>) tensor(5.3746, grad_fn=<NllLossBackward0>)
616 LOSS DIFF: tensor(5.4732, grad_fn=<NllLossBackward0>) tensor(5.4486, grad_fn=<NllLossBackward0>)
617 LOSS DIFF: tensor(5.3487, grad_fn=<NllLossBackward0>) tensor(5.2559, grad_fn=<NllLossBackward0>)
618 LOSS DIFF: tensor(5.3737, grad_fn=<NllLossBackward0>) tensor(5.3487, grad_fn=<NllLossBackward0>)
619 LOSS DIFF: tensor(5.3524, grad_fn=<NllLossBackward0>) tensor(5.3056, grad_fn=<NllLossBackward0>)
620 LOSS DIFF: tensor(5.4119, grad_fn=<NllLossBackward0>) tensor(5.3524, grad_fn=<NllLossBackward0>)
621 LOSS DIFF: tensor(5.3877, grad_fn=<NllLossBackward0>) tensor(5.3544, grad_fn=<NllLossBackward0>)
622 LOSS DIFF: tensor(5.3305, grad_fn=<NllLossBackward0>) tensor(5.3165, grad_fn=<NllLossBackward0>)
623 LOSS DIFF: tensor(5.4056, grad_fn=<NllLossBackward0>) tensor(5.3305, grad_fn=<NllLossBackward0>)
624 LOSS DIFF: tensor(5.3550, grad_fn=<NllLossBackward0>) tensor(5.3069, grad_fn=<NllLossBackward0>)
625 LOSS DIFF: tensor(5.3018, grad_fn=<NllLossBackward0>) tensor(5.2306, grad_fn=<NllLossBackward0>)
626 LOSS DIFF: tensor(5.3613, grad_fn=<NllLossBackward0>) tensor(5.3018, grad_fn=<NllLossBackward0>)
627 LOSS DIFF: tensor(5.3056, grad_fn=<NllLossBackward0>) tensor(5.2849, grad_fn=<NllLossBackward0>)
628 LOSS DIFF: tensor(5.4281, grad_fn=<NllLossBackward0>) tensor(5.1398, grad_fn=<NllLossBackward0>)
629 LOSS DIFF: tensor(5.3037, grad_fn=<NllLossBackward0>) tensor(5.2343, grad_fn=<NllLossBackward0>)
630 LOSS DIFF: tensor(5.3630, grad_fn=<NllLossBackward0>) tensor(5.2993, grad_fn=<NllLossBackward0>)
631 LOSS DIFF: tensor(5.3922, grad_fn=<NllLossBackward0>) tensor(5.3630, grad_fn=<NllLossBackward0>)
632 LOSS DIFF: tensor(5.3583, grad_fn=<NllLossBackward0>) tensor(5.2346, grad_fn=<NllLossBackward0>)
633 LOSS DIFF: tensor(5.3638, grad_fn=<NllLossBackward0>) tensor(5.3486, grad_fn=<NllLossBackward0>)
634 LOSS DIFF: tensor(5.2703, grad_fn=<NllLossBackward0>) tensor(5.2605, grad_fn=<NllLossBackward0>)
635 LOSS DIFF: tensor(5.3341, grad_fn=<NllLossBackward0>) tensor(5.2703, grad_fn=<NllLossBackward0>)
636 LOSS DIFF: tensor(5.3615, grad_fn=<NllLossBackward0>) tensor(5.3341, grad_fn=<NllLossBackward0>)
637 LOSS DIFF: tensor(5.3735, grad_fn=<NllLossBackward0>) tensor(5.3225, grad_fn=<NllLossBackward0>)
638 LOSS DIFF: tensor(5.3535, grad_fn=<NllLossBackward0>) tensor(5.2765, grad_fn=<NllLossBackward0>)
639 LOSS DIFF: tensor(5.4068, grad_fn=<NllLossBackward0>) tensor(5.3535, grad_fn=<NllLossBackward0>)
640 LOSS DIFF: tensor(5.3669, grad_fn=<NllLossBackward0>) tensor(5.2441, grad_fn=<NllLossBackward0>)
641 LOSS DIFF: tensor(5.3348, grad_fn=<NllLossBackward0>) tensor(5.2892, grad_fn=<NllLossBackward0>)
642 LOSS DIFF: tensor(5.4134, grad_fn=<NllLossBackward0>) tensor(5.3348, grad_fn=<NllLossBackward0>)
643 LOSS DIFF: tensor(5.3649, grad_fn=<NllLossBackward0>) tensor(5.3365, grad_fn=<NllLossBackward0>)
644 LOSS DIFF: tensor(5.3606, grad_fn=<NllLossBackward0>) tensor(5.2532, grad_fn=<NllLossBackward0>)
645 LOSS DIFF: tensor(5.3622, grad_fn=<NllLossBackward0>) tensor(5.2414, grad_fn=<NllLossBackward0>)
646 LOSS DIFF: tensor(5.3985, grad_fn=<NllLossBackward0>) tensor(5.3297, grad_fn=<NllLossBackward0>)
1300 tensor(5.2993, grad_fn=<NllLossBackward0>)
647 LOSS DIFF: tensor(5.2993, grad_fn=<NllLossBackward0>) tensor(5.2568, grad_fn=<NllLossBackward0>)
648 LOSS DIFF: tensor(5.3153, grad_fn=<NllLossBackward0>) tensor(5.2993, grad_fn=<NllLossBackward0>)
649 LOSS DIFF: tensor(5.3619, grad_fn=<NllLossBackward0>) tensor(5.2734, grad_fn=<NllLossBackward0>)
650 LOSS DIFF: tensor(5.4052, grad_fn=<NllLossBackward0>) tensor(5.2523, grad_fn=<NllLossBackward0>)
651 LOSS DIFF: tensor(5.3573, grad_fn=<NllLossBackward0>) tensor(5.3209, grad_fn=<NllLossBackward0>)
652 LOSS DIFF: tensor(5.2472, grad_fn=<NllLossBackward0>) tensor(5.2427, grad_fn=<NllLossBackward0>)
653 LOSS DIFF: tensor(5.4110, grad_fn=<NllLossBackward0>) tensor(5.2472, grad_fn=<NllLossBackward0>)
654 LOSS DIFF: tensor(5.2660, grad_fn=<NllLossBackward0>) tensor(5.2397, grad_fn=<NllLossBackward0>)
655 LOSS DIFF: tensor(5.3451, grad_fn=<NllLossBackward0>) tensor(5.2660, grad_fn=<NllLossBackward0>)
656 LOSS DIFF: tensor(5.2828, grad_fn=<NllLossBackward0>) tensor(5.1689, grad_fn=<NllLossBackward0>)
657 LOSS DIFF: tensor(5.3989, grad_fn=<NllLossBackward0>) tensor(5.2828, grad_fn=<NllLossBackward0>)
658 LOSS DIFF: tensor(5.3128, grad_fn=<NllLossBackward0>) tensor(5.2708, grad_fn=<NllLossBackward0>)
659 LOSS DIFF: tensor(5.2602, grad_fn=<NllLossBackward0>) tensor(5.2357, grad_fn=<NllLossBackward0>)
660 LOSS DIFF: tensor(5.3591, grad_fn=<NllLossBackward0>) tensor(5.2602, grad_fn=<NllLossBackward0>)
661 LOSS DIFF: tensor(5.4472, grad_fn=<NllLossBackward0>) tensor(5.2953, grad_fn=<NllLossBackward0>)
662 LOSS DIFF: tensor(5.2631, grad_fn=<NllLossBackward0>) tensor(5.1217, grad_fn=<NllLossBackward0>)
663 LOSS DIFF: tensor(5.3468, grad_fn=<NllLossBackward0>) tensor(5.2631, grad_fn=<NllLossBackward0>)
664 LOSS DIFF: tensor(5.3112, grad_fn=<NllLossBackward0>) tensor(5.1798, grad_fn=<NllLossBackward0>)
665 LOSS DIFF: tensor(5.4536, grad_fn=<NllLossBackward0>) tensor(5.3112, grad_fn=<NllLossBackward0>)
666 LOSS DIFF: tensor(5.2946, grad_fn=<NllLossBackward0>) tensor(5.2031, grad_fn=<NllLossBackward0>)
667 LOSS DIFF: tensor(5.3658, grad_fn=<NllLossBackward0>) tensor(5.2946, grad_fn=<NllLossBackward0>)
668 LOSS DIFF: tensor(5.3176, grad_fn=<NllLossBackward0>) tensor(5.3126, grad_fn=<NllLossBackward0>)
669 LOSS DIFF: tensor(5.3397, grad_fn=<NllLossBackward0>) tensor(5.2761, grad_fn=<NllLossBackward0>)
670 LOSS DIFF: tensor(5.3414, grad_fn=<NllLossBackward0>) tensor(5.1992, grad_fn=<NllLossBackward0>)
671 LOSS DIFF: tensor(5.3593, grad_fn=<NllLossBackward0>) tensor(5.2940, grad_fn=<NllLossBackward0>)
672 LOSS DIFF: tensor(5.3734, grad_fn=<NllLossBackward0>) tensor(5.3593, grad_fn=<NllLossBackward0>)
673 LOSS DIFF: tensor(5.3879, grad_fn=<NllLossBackward0>) tensor(5.3734, grad_fn=<NllLossBackward0>)
674 LOSS DIFF: tensor(5.4095, grad_fn=<NllLossBackward0>) tensor(5.3879, grad_fn=<NllLossBackward0>)
675 LOSS DIFF: tensor(5.3731, grad_fn=<NllLossBackward0>) tensor(5.3149, grad_fn=<NllLossBackward0>)
676 LOSS DIFF: tensor(5.3762, grad_fn=<NllLossBackward0>) tensor(5.2030, grad_fn=<NllLossBackward0>)
677 LOSS DIFF: tensor(5.3640, grad_fn=<NllLossBackward0>) tensor(5.2093, grad_fn=<NllLossBackward0>)
678 LOSS DIFF: tensor(5.3913, grad_fn=<NllLossBackward0>) tensor(5.3640, grad_fn=<NllLossBackward0>)
679 LOSS DIFF: tensor(5.3979, grad_fn=<NllLossBackward0>) tensor(5.3913, grad_fn=<NllLossBackward0>)
680 LOSS DIFF: tensor(5.3584, grad_fn=<NllLossBackward0>) tensor(5.2680, grad_fn=<NllLossBackward0>)
681 LOSS DIFF: tensor(5.3767, grad_fn=<NllLossBackward0>) tensor(5.3584, grad_fn=<NllLossBackward0>)
682 LOSS DIFF: tensor(5.3828, grad_fn=<NllLossBackward0>) tensor(5.2542, grad_fn=<NllLossBackward0>)
683 LOSS DIFF: tensor(5.3277, grad_fn=<NllLossBackward0>) tensor(5.2771, grad_fn=<NllLossBackward0>)
684 LOSS DIFF: tensor(5.2910, grad_fn=<NllLossBackward0>) tensor(5.2756, grad_fn=<NllLossBackward0>)
685 LOSS DIFF: tensor(5.3150, grad_fn=<NllLossBackward0>) tensor(5.2910, grad_fn=<NllLossBackward0>)
686 LOSS DIFF: tensor(5.3208, grad_fn=<NllLossBackward0>) tensor(5.3150, grad_fn=<NllLossBackward0>)
687 LOSS DIFF: tensor(5.4099, grad_fn=<NllLossBackward0>) tensor(5.1751, grad_fn=<NllLossBackward0>)
688 LOSS DIFF: tensor(5.3103, grad_fn=<NllLossBackward0>) tensor(5.1557, grad_fn=<NllLossBackward0>)
689 LOSS DIFF: tensor(5.2464, grad_fn=<NllLossBackward0>) tensor(5.2038, grad_fn=<NllLossBackward0>)
690 LOSS DIFF: tensor(5.4148, grad_fn=<NllLossBackward0>) tensor(5.2464, grad_fn=<NllLossBackward0>)
691 LOSS DIFF: tensor(5.3898, grad_fn=<NllLossBackward0>) tensor(5.1863, grad_fn=<NllLossBackward0>)
692 LOSS DIFF: tensor(5.3926, grad_fn=<NllLossBackward0>) tensor(5.3898, grad_fn=<NllLossBackward0>)
693 LOSS DIFF: tensor(5.3975, grad_fn=<NllLossBackward0>) tensor(5.2156, grad_fn=<NllLossBackward0>)
694 LOSS DIFF: tensor(5.2680, grad_fn=<NllLossBackward0>) tensor(5.2367, grad_fn=<NllLossBackward0>)
695 LOSS DIFF: tensor(5.4590, grad_fn=<NllLossBackward0>) tensor(5.1675, grad_fn=<NllLossBackward0>)
696 LOSS DIFF: tensor(5.3168, grad_fn=<NllLossBackward0>) tensor(5.2447, grad_fn=<NllLossBackward0>)
697 LOSS DIFF: tensor(5.3581, grad_fn=<NllLossBackward0>) tensor(5.2256, grad_fn=<NllLossBackward0>)
698 LOSS DIFF: tensor(5.3668, grad_fn=<NllLossBackward0>) tensor(5.3399, grad_fn=<NllLossBackward0>)
1400 tensor(5.4240, grad_fn=<NllLossBackward0>)
699 LOSS DIFF: tensor(5.4240, grad_fn=<NllLossBackward0>) tensor(5.2860, grad_fn=<NllLossBackward0>)
700 LOSS DIFF: tensor(5.4507, grad_fn=<NllLossBackward0>) tensor(5.2273, grad_fn=<NllLossBackward0>)
701 LOSS DIFF: tensor(5.3034, grad_fn=<NllLossBackward0>) tensor(5.2823, grad_fn=<NllLossBackward0>)
702 LOSS DIFF: tensor(5.3641, grad_fn=<NllLossBackward0>) tensor(5.2678, grad_fn=<NllLossBackward0>)
703 LOSS DIFF: tensor(5.3712, grad_fn=<NllLossBackward0>) tensor(5.3641, grad_fn=<NllLossBackward0>)
704 LOSS DIFF: tensor(5.3199, grad_fn=<NllLossBackward0>) tensor(5.2634, grad_fn=<NllLossBackward0>)
705 LOSS DIFF: tensor(5.2937, grad_fn=<NllLossBackward0>) tensor(5.2929, grad_fn=<NllLossBackward0>)
706 LOSS DIFF: tensor(5.4281, grad_fn=<NllLossBackward0>) tensor(5.2937, grad_fn=<NllLossBackward0>)
707 LOSS DIFF: tensor(5.3490, grad_fn=<NllLossBackward0>) tensor(5.2559, grad_fn=<NllLossBackward0>)
708 LOSS DIFF: tensor(5.2956, grad_fn=<NllLossBackward0>) tensor(5.2263, grad_fn=<NllLossBackward0>)
709 LOSS DIFF: tensor(5.3573, grad_fn=<NllLossBackward0>) tensor(5.2956, grad_fn=<NllLossBackward0>)
710 LOSS DIFF: tensor(5.2388, grad_fn=<NllLossBackward0>) tensor(5.1368, grad_fn=<NllLossBackward0>)
711 LOSS DIFF: tensor(5.4568, grad_fn=<NllLossBackward0>) tensor(5.2388, grad_fn=<NllLossBackward0>)
712 LOSS DIFF: tensor(5.3657, grad_fn=<NllLossBackward0>) tensor(5.2206, grad_fn=<NllLossBackward0>)
713 LOSS DIFF: tensor(5.3937, grad_fn=<NllLossBackward0>) tensor(5.3657, grad_fn=<NllLossBackward0>)
714 LOSS DIFF: tensor(5.3151, grad_fn=<NllLossBackward0>) tensor(5.2181, grad_fn=<NllLossBackward0>)
715 LOSS DIFF: tensor(5.3477, grad_fn=<NllLossBackward0>) tensor(5.3151, grad_fn=<NllLossBackward0>)
716 LOSS DIFF: tensor(5.3319, grad_fn=<NllLossBackward0>) tensor(5.2977, grad_fn=<NllLossBackward0>)
717 LOSS DIFF: tensor(5.2638, grad_fn=<NllLossBackward0>) tensor(5.1780, grad_fn=<NllLossBackward0>)
718 LOSS DIFF: tensor(5.2669, grad_fn=<NllLossBackward0>) tensor(5.2638, grad_fn=<NllLossBackward0>)
719 LOSS DIFF: tensor(5.2977, grad_fn=<NllLossBackward0>) tensor(5.2669, grad_fn=<NllLossBackward0>)
720 LOSS DIFF: tensor(5.4203, grad_fn=<NllLossBackward0>) tensor(5.2977, grad_fn=<NllLossBackward0>)
721 LOSS DIFF: tensor(5.3931, grad_fn=<NllLossBackward0>) tensor(5.3073, grad_fn=<NllLossBackward0>)
722 LOSS DIFF: tensor(5.2668, grad_fn=<NllLossBackward0>) tensor(5.2528, grad_fn=<NllLossBackward0>)
723 LOSS DIFF: tensor(5.2713, grad_fn=<NllLossBackward0>) tensor(5.2102, grad_fn=<NllLossBackward0>)
724 LOSS DIFF: tensor(5.4657, grad_fn=<NllLossBackward0>) tensor(5.2713, grad_fn=<NllLossBackward0>)
725 LOSS DIFF: tensor(5.3160, grad_fn=<NllLossBackward0>) tensor(5.2097, grad_fn=<NllLossBackward0>)
726 LOSS DIFF: tensor(5.2945, grad_fn=<NllLossBackward0>) tensor(5.2223, grad_fn=<NllLossBackward0>)
727 LOSS DIFF: tensor(5.2871, grad_fn=<NllLossBackward0>) tensor(5.2417, grad_fn=<NllLossBackward0>)
728 LOSS DIFF: tensor(5.3049, grad_fn=<NllLossBackward0>) tensor(5.2871, grad_fn=<NllLossBackward0>)
729 LOSS DIFF: tensor(5.2566, grad_fn=<NllLossBackward0>) tensor(5.2405, grad_fn=<NllLossBackward0>)
730 LOSS DIFF: tensor(5.3831, grad_fn=<NllLossBackward0>) tensor(5.2566, grad_fn=<NllLossBackward0>)
731 LOSS DIFF: tensor(5.3322, grad_fn=<NllLossBackward0>) tensor(5.2234, grad_fn=<NllLossBackward0>)
732 LOSS DIFF: tensor(5.3731, grad_fn=<NllLossBackward0>) tensor(5.2365, grad_fn=<NllLossBackward0>)
733 LOSS DIFF: tensor(5.4400, grad_fn=<NllLossBackward0>) tensor(5.3731, grad_fn=<NllLossBackward0>)
734 LOSS DIFF: tensor(5.4715, grad_fn=<NllLossBackward0>) tensor(5.3013, grad_fn=<NllLossBackward0>)
735 LOSS DIFF: tensor(5.4422, grad_fn=<NllLossBackward0>) tensor(5.4010, grad_fn=<NllLossBackward0>)
736 LOSS DIFF: tensor(5.2298, grad_fn=<NllLossBackward0>) tensor(5.2163, grad_fn=<NllLossBackward0>)
737 LOSS DIFF: tensor(5.2493, grad_fn=<NllLossBackward0>) tensor(5.2298, grad_fn=<NllLossBackward0>)
738 LOSS DIFF: tensor(5.2958, grad_fn=<NllLossBackward0>) tensor(5.2493, grad_fn=<NllLossBackward0>)
739 LOSS DIFF: tensor(5.4094, grad_fn=<NllLossBackward0>) tensor(5.2502, grad_fn=<NllLossBackward0>)
740 LOSS DIFF: tensor(5.2576, grad_fn=<NllLossBackward0>) tensor(5.2305, grad_fn=<NllLossBackward0>)
741 LOSS DIFF: tensor(5.3885, grad_fn=<NllLossBackward0>) tensor(5.2576, grad_fn=<NllLossBackward0>)
742 LOSS DIFF: tensor(5.3493, grad_fn=<NllLossBackward0>) tensor(5.3387, grad_fn=<NllLossBackward0>)
743 LOSS DIFF: tensor(5.2640, grad_fn=<NllLossBackward0>) tensor(5.1842, grad_fn=<NllLossBackward0>)
744 LOSS DIFF: tensor(5.3568, grad_fn=<NllLossBackward0>) tensor(5.2640, grad_fn=<NllLossBackward0>)
745 LOSS DIFF: tensor(5.4262, grad_fn=<NllLossBackward0>) tensor(5.3232, grad_fn=<NllLossBackward0>)
746 LOSS DIFF: tensor(5.3020, grad_fn=<NllLossBackward0>) tensor(5.2816, grad_fn=<NllLossBackward0>)
1500 tensor(5.1988, grad_fn=<NllLossBackward0>)
747 LOSS DIFF: tensor(5.2921, grad_fn=<NllLossBackward0>) tensor(5.1988, grad_fn=<NllLossBackward0>)
748 LOSS DIFF: tensor(5.3279, grad_fn=<NllLossBackward0>) tensor(5.2921, grad_fn=<NllLossBackward0>)
749 LOSS DIFF: tensor(5.3318, grad_fn=<NllLossBackward0>) tensor(5.0392, grad_fn=<NllLossBackward0>)
750 LOSS DIFF: tensor(5.4100, grad_fn=<NllLossBackward0>) tensor(5.1959, grad_fn=<NllLossBackward0>)
751 LOSS DIFF: tensor(5.2634, grad_fn=<NllLossBackward0>) tensor(5.2334, grad_fn=<NllLossBackward0>)
752 LOSS DIFF: tensor(5.2761, grad_fn=<NllLossBackward0>) tensor(5.2634, grad_fn=<NllLossBackward0>)
753 LOSS DIFF: tensor(5.3743, grad_fn=<NllLossBackward0>) tensor(5.2761, grad_fn=<NllLossBackward0>)
754 LOSS DIFF: tensor(5.4399, grad_fn=<NllLossBackward0>) tensor(5.2495, grad_fn=<NllLossBackward0>)
755 LOSS DIFF: tensor(5.3723, grad_fn=<NllLossBackward0>) tensor(5.2125, grad_fn=<NllLossBackward0>)
756 LOSS DIFF: tensor(5.4313, grad_fn=<NllLossBackward0>) tensor(5.2310, grad_fn=<NllLossBackward0>)
757 LOSS DIFF: tensor(5.3316, grad_fn=<NllLossBackward0>) tensor(5.2243, grad_fn=<NllLossBackward0>)
758 LOSS DIFF: tensor(5.3435, grad_fn=<NllLossBackward0>) tensor(5.3128, grad_fn=<NllLossBackward0>)
759 LOSS DIFF: tensor(5.3396, grad_fn=<NllLossBackward0>) tensor(5.1988, grad_fn=<NllLossBackward0>)
760 LOSS DIFF: tensor(5.3344, grad_fn=<NllLossBackward0>) tensor(5.2798, grad_fn=<NllLossBackward0>)
761 LOSS DIFF: tensor(5.3503, grad_fn=<NllLossBackward0>) tensor(5.2845, grad_fn=<NllLossBackward0>)
762 LOSS DIFF: tensor(5.3522, grad_fn=<NllLossBackward0>) tensor(5.3503, grad_fn=<NllLossBackward0>)
763 LOSS DIFF: tensor(5.2487, grad_fn=<NllLossBackward0>) tensor(5.2103, grad_fn=<NllLossBackward0>)
764 LOSS DIFF: tensor(5.3914, grad_fn=<NllLossBackward0>) tensor(5.2487, grad_fn=<NllLossBackward0>)
765 LOSS DIFF: tensor(5.3346, grad_fn=<NllLossBackward0>) tensor(5.3265, grad_fn=<NllLossBackward0>)
766 LOSS DIFF: tensor(5.3932, grad_fn=<NllLossBackward0>) tensor(5.2668, grad_fn=<NllLossBackward0>)
767 LOSS DIFF: tensor(5.3308, grad_fn=<NllLossBackward0>) tensor(5.2136, grad_fn=<NllLossBackward0>)
768 LOSS DIFF: tensor(5.2342, grad_fn=<NllLossBackward0>) tensor(5.1842, grad_fn=<NllLossBackward0>)
769 LOSS DIFF: tensor(5.2779, grad_fn=<NllLossBackward0>) tensor(5.2342, grad_fn=<NllLossBackward0>)
770 LOSS DIFF: tensor(5.3309, grad_fn=<NllLossBackward0>) tensor(5.2779, grad_fn=<NllLossBackward0>)
771 LOSS DIFF: tensor(5.2772, grad_fn=<NllLossBackward0>) tensor(5.2208, grad_fn=<NllLossBackward0>)
772 LOSS DIFF: tensor(5.2998, grad_fn=<NllLossBackward0>) tensor(5.2772, grad_fn=<NllLossBackward0>)
773 LOSS DIFF: tensor(5.3198, grad_fn=<NllLossBackward0>) tensor(5.2998, grad_fn=<NllLossBackward0>)
774 LOSS DIFF: tensor(5.4071, grad_fn=<NllLossBackward0>) tensor(5.2555, grad_fn=<NllLossBackward0>)
775 LOSS DIFF: tensor(5.3407, grad_fn=<NllLossBackward0>) tensor(5.2137, grad_fn=<NllLossBackward0>)
776 LOSS DIFF: tensor(5.3168, grad_fn=<NllLossBackward0>) tensor(5.1123, grad_fn=<NllLossBackward0>)
777 LOSS DIFF: tensor(5.3270, grad_fn=<NllLossBackward0>) tensor(5.3168, grad_fn=<NllLossBackward0>)
778 LOSS DIFF: tensor(5.2770, grad_fn=<NllLossBackward0>) tensor(5.1605, grad_fn=<NllLossBackward0>)
779 LOSS DIFF: tensor(5.3174, grad_fn=<NllLossBackward0>) tensor(5.2770, grad_fn=<NllLossBackward0>)
780 LOSS DIFF: tensor(5.5412, grad_fn=<NllLossBackward0>) tensor(5.2626, grad_fn=<NllLossBackward0>)
781 LOSS DIFF: tensor(5.3245, grad_fn=<NllLossBackward0>) tensor(5.2973, grad_fn=<NllLossBackward0>)
782 LOSS DIFF: tensor(5.2911, grad_fn=<NllLossBackward0>) tensor(5.2910, grad_fn=<NllLossBackward0>)
783 LOSS DIFF: tensor(5.3198, grad_fn=<NllLossBackward0>) tensor(5.2911, grad_fn=<NllLossBackward0>)
784 LOSS DIFF: tensor(5.2661, grad_fn=<NllLossBackward0>) tensor(5.2297, grad_fn=<NllLossBackward0>)
785 LOSS DIFF: tensor(5.3086, grad_fn=<NllLossBackward0>) tensor(5.2661, grad_fn=<NllLossBackward0>)
786 LOSS DIFF: tensor(5.3143, grad_fn=<NllLossBackward0>) tensor(5.3086, grad_fn=<NllLossBackward0>)
787 LOSS DIFF: tensor(5.3467, grad_fn=<NllLossBackward0>) tensor(5.3143, grad_fn=<NllLossBackward0>)
788 LOSS DIFF: tensor(5.3771, grad_fn=<NllLossBackward0>) tensor(5.3003, grad_fn=<NllLossBackward0>)
789 LOSS DIFF: tensor(5.2802, grad_fn=<NllLossBackward0>) tensor(5.2619, grad_fn=<NllLossBackward0>)
790 LOSS DIFF: tensor(5.3205, grad_fn=<NllLossBackward0>) tensor(5.2489, grad_fn=<NllLossBackward0>)
791 LOSS DIFF: tensor(5.3028, grad_fn=<NllLossBackward0>) tensor(5.1770, grad_fn=<NllLossBackward0>)
792 LOSS DIFF: tensor(5.3130, grad_fn=<NllLossBackward0>) tensor(5.3028, grad_fn=<NllLossBackward0>)
793 LOSS DIFF: tensor(5.2011, grad_fn=<NllLossBackward0>) tensor(5.0365, grad_fn=<NllLossBackward0>)
794 LOSS DIFF: tensor(5.2648, grad_fn=<NllLossBackward0>) tensor(5.2011, grad_fn=<NllLossBackward0>)
795 LOSS DIFF: tensor(5.3135, grad_fn=<NllLossBackward0>) tensor(5.2648, grad_fn=<NllLossBackward0>)
796 LOSS DIFF: tensor(5.3958, grad_fn=<NllLossBackward0>) tensor(5.3135, grad_fn=<NllLossBackward0>)
797 LOSS DIFF: tensor(5.3604, grad_fn=<NllLossBackward0>) tensor(5.1652, grad_fn=<NllLossBackward0>)
1600 tensor(5.3680, grad_fn=<NllLossBackward0>)
798 LOSS DIFF: tensor(5.3680, grad_fn=<NllLossBackward0>) tensor(5.2941, grad_fn=<NllLossBackward0>)
799 LOSS DIFF: tensor(5.2164, grad_fn=<NllLossBackward0>) tensor(5.1485, grad_fn=<NllLossBackward0>)
800 LOSS DIFF: tensor(5.3943, grad_fn=<NllLossBackward0>) tensor(5.2164, grad_fn=<NllLossBackward0>)
801 LOSS DIFF: tensor(5.2456, grad_fn=<NllLossBackward0>) tensor(5.1408, grad_fn=<NllLossBackward0>)
802 LOSS DIFF: tensor(5.2624, grad_fn=<NllLossBackward0>) tensor(5.2268, grad_fn=<NllLossBackward0>)
803 LOSS DIFF: tensor(5.3054, grad_fn=<NllLossBackward0>) tensor(5.1765, grad_fn=<NllLossBackward0>)
804 LOSS DIFF: tensor(5.3530, grad_fn=<NllLossBackward0>) tensor(5.3054, grad_fn=<NllLossBackward0>)
805 LOSS DIFF: tensor(5.3219, grad_fn=<NllLossBackward0>) tensor(5.2960, grad_fn=<NllLossBackward0>)
806 LOSS DIFF: tensor(5.3445, grad_fn=<NllLossBackward0>) tensor(5.2025, grad_fn=<NllLossBackward0>)
807 LOSS DIFF: tensor(5.4269, grad_fn=<NllLossBackward0>) tensor(5.2403, grad_fn=<NllLossBackward0>)
808 LOSS DIFF: tensor(5.3550, grad_fn=<NllLossBackward0>) tensor(5.2981, grad_fn=<NllLossBackward0>)
809 LOSS DIFF: tensor(5.2882, grad_fn=<NllLossBackward0>) tensor(5.2592, grad_fn=<NllLossBackward0>)
810 LOSS DIFF: tensor(5.3459, grad_fn=<NllLossBackward0>) tensor(5.2882, grad_fn=<NllLossBackward0>)
811 LOSS DIFF: tensor(5.3961, grad_fn=<NllLossBackward0>) tensor(5.2398, grad_fn=<NllLossBackward0>)
812 LOSS DIFF: tensor(5.3464, grad_fn=<NllLossBackward0>) tensor(5.2061, grad_fn=<NllLossBackward0>)
813 LOSS DIFF: tensor(5.4667, grad_fn=<NllLossBackward0>) tensor(5.3051, grad_fn=<NllLossBackward0>)
814 LOSS DIFF: tensor(5.3144, grad_fn=<NllLossBackward0>) tensor(5.2452, grad_fn=<NllLossBackward0>)
815 LOSS DIFF: tensor(5.3118, grad_fn=<NllLossBackward0>) tensor(5.1809, grad_fn=<NllLossBackward0>)
816 LOSS DIFF: tensor(5.2670, grad_fn=<NllLossBackward0>) tensor(5.2661, grad_fn=<NllLossBackward0>)
817 LOSS DIFF: tensor(5.2897, grad_fn=<NllLossBackward0>) tensor(5.2135, grad_fn=<NllLossBackward0>)
818 LOSS DIFF: tensor(5.3138, grad_fn=<NllLossBackward0>) tensor(5.2798, grad_fn=<NllLossBackward0>)
819 LOSS DIFF: tensor(5.3730, grad_fn=<NllLossBackward0>) tensor(5.3138, grad_fn=<NllLossBackward0>)
820 LOSS DIFF: tensor(5.3392, grad_fn=<NllLossBackward0>) tensor(5.3115, grad_fn=<NllLossBackward0>)
821 LOSS DIFF: tensor(5.3534, grad_fn=<NllLossBackward0>) tensor(5.2959, grad_fn=<NllLossBackward0>)
822 LOSS DIFF: tensor(5.3893, grad_fn=<NllLossBackward0>) tensor(5.3500, grad_fn=<NllLossBackward0>)
823 LOSS DIFF: tensor(5.2580, grad_fn=<NllLossBackward0>) tensor(5.1436, grad_fn=<NllLossBackward0>)
824 LOSS DIFF: tensor(5.2688, grad_fn=<NllLossBackward0>) tensor(5.2580, grad_fn=<NllLossBackward0>)
825 LOSS DIFF: tensor(5.3212, grad_fn=<NllLossBackward0>) tensor(5.2688, grad_fn=<NllLossBackward0>)
826 LOSS DIFF: tensor(5.3839, grad_fn=<NllLossBackward0>) tensor(5.2897, grad_fn=<NllLossBackward0>)
827 LOSS DIFF: tensor(5.3353, grad_fn=<NllLossBackward0>) tensor(5.2536, grad_fn=<NllLossBackward0>)
828 LOSS DIFF: tensor(5.2735, grad_fn=<NllLossBackward0>) tensor(5.2156, grad_fn=<NllLossBackward0>)
829 LOSS DIFF: tensor(5.3446, grad_fn=<NllLossBackward0>) tensor(5.2735, grad_fn=<NllLossBackward0>)
830 LOSS DIFF: tensor(5.3156, grad_fn=<NllLossBackward0>) tensor(5.2965, grad_fn=<NllLossBackward0>)
831 LOSS DIFF: tensor(5.3263, grad_fn=<NllLossBackward0>) tensor(5.2847, grad_fn=<NllLossBackward0>)
832 LOSS DIFF: tensor(5.2776, grad_fn=<NllLossBackward0>) tensor(5.2448, grad_fn=<NllLossBackward0>)
833 LOSS DIFF: tensor(5.3394, grad_fn=<NllLossBackward0>) tensor(5.2776, grad_fn=<NllLossBackward0>)
834 LOSS DIFF: tensor(5.3633, grad_fn=<NllLossBackward0>) tensor(5.2746, grad_fn=<NllLossBackward0>)
835 LOSS DIFF: tensor(5.2726, grad_fn=<NllLossBackward0>) tensor(5.2409, grad_fn=<NllLossBackward0>)
836 LOSS DIFF: tensor(5.2986, grad_fn=<NllLossBackward0>) tensor(5.2726, grad_fn=<NllLossBackward0>)
837 LOSS DIFF: tensor(5.2534, grad_fn=<NllLossBackward0>) tensor(5.1774, grad_fn=<NllLossBackward0>)
838 LOSS DIFF: tensor(5.3111, grad_fn=<NllLossBackward0>) tensor(5.2534, grad_fn=<NllLossBackward0>)
839 LOSS DIFF: tensor(5.3127, grad_fn=<NllLossBackward0>) tensor(5.3111, grad_fn=<NllLossBackward0>)
840 LOSS DIFF: tensor(5.4215, grad_fn=<NllLossBackward0>) tensor(5.2348, grad_fn=<NllLossBackward0>)
841 LOSS DIFF: tensor(5.2974, grad_fn=<NllLossBackward0>) tensor(5.1407, grad_fn=<NllLossBackward0>)
842 LOSS DIFF: tensor(5.3341, grad_fn=<NllLossBackward0>) tensor(5.2498, grad_fn=<NllLossBackward0>)
843 LOSS DIFF: tensor(5.3087, grad_fn=<NllLossBackward0>) tensor(5.2148, grad_fn=<NllLossBackward0>)
844 LOSS DIFF: tensor(5.2507, grad_fn=<NllLossBackward0>) tensor(5.1230, grad_fn=<NllLossBackward0>)
1700 tensor(5.3550, grad_fn=<NllLossBackward0>)
845 LOSS DIFF: tensor(5.3550, grad_fn=<NllLossBackward0>) tensor(5.2507, grad_fn=<NllLossBackward0>)
846 LOSS DIFF: tensor(5.3766, grad_fn=<NllLossBackward0>) tensor(5.3550, grad_fn=<NllLossBackward0>)
847 LOSS DIFF: tensor(5.2487, grad_fn=<NllLossBackward0>) tensor(5.2300, grad_fn=<NllLossBackward0>)
848 LOSS DIFF: tensor(5.3142, grad_fn=<NllLossBackward0>) tensor(5.2487, grad_fn=<NllLossBackward0>)
849 LOSS DIFF: tensor(5.3734, grad_fn=<NllLossBackward0>) tensor(5.2986, grad_fn=<NllLossBackward0>)
850 LOSS DIFF: tensor(5.2452, grad_fn=<NllLossBackward0>) tensor(5.1219, grad_fn=<NllLossBackward0>)
851 LOSS DIFF: tensor(5.2957, grad_fn=<NllLossBackward0>) tensor(5.2452, grad_fn=<NllLossBackward0>)
852 LOSS DIFF: tensor(5.2852, grad_fn=<NllLossBackward0>) tensor(5.2758, grad_fn=<NllLossBackward0>)
853 LOSS DIFF: tensor(5.3498, grad_fn=<NllLossBackward0>) tensor(5.2852, grad_fn=<NllLossBackward0>)
854 LOSS DIFF: tensor(5.4008, grad_fn=<NllLossBackward0>) tensor(5.3498, grad_fn=<NllLossBackward0>)
855 LOSS DIFF: tensor(5.2165, grad_fn=<NllLossBackward0>) tensor(5.1128, grad_fn=<NllLossBackward0>)
856 LOSS DIFF: tensor(5.2850, grad_fn=<NllLossBackward0>) tensor(5.2165, grad_fn=<NllLossBackward0>)
857 LOSS DIFF: tensor(5.3881, grad_fn=<NllLossBackward0>) tensor(5.2850, grad_fn=<NllLossBackward0>)
858 LOSS DIFF: tensor(5.2249, grad_fn=<NllLossBackward0>) tensor(5.2228, grad_fn=<NllLossBackward0>)
859 LOSS DIFF: tensor(5.2559, grad_fn=<NllLossBackward0>) tensor(5.2249, grad_fn=<NllLossBackward0>)
860 LOSS DIFF: tensor(5.2867, grad_fn=<NllLossBackward0>) tensor(5.2559, grad_fn=<NllLossBackward0>)
861 LOSS DIFF: tensor(5.4387, grad_fn=<NllLossBackward0>) tensor(5.2314, grad_fn=<NllLossBackward0>)
862 LOSS DIFF: tensor(5.2867, grad_fn=<NllLossBackward0>) tensor(5.2233, grad_fn=<NllLossBackward0>)
863 LOSS DIFF: tensor(5.3220, grad_fn=<NllLossBackward0>) tensor(5.2867, grad_fn=<NllLossBackward0>)
864 LOSS DIFF: tensor(5.2581, grad_fn=<NllLossBackward0>) tensor(5.2269, grad_fn=<NllLossBackward0>)
865 LOSS DIFF: tensor(5.2703, grad_fn=<NllLossBackward0>) tensor(5.2581, grad_fn=<NllLossBackward0>)
866 LOSS DIFF: tensor(5.2300, grad_fn=<NllLossBackward0>) tensor(5.1481, grad_fn=<NllLossBackward0>)
867 LOSS DIFF: tensor(5.2460, grad_fn=<NllLossBackward0>) tensor(5.2300, grad_fn=<NllLossBackward0>)
868 LOSS DIFF: tensor(5.3260, grad_fn=<NllLossBackward0>) tensor(5.2460, grad_fn=<NllLossBackward0>)
869 LOSS DIFF: tensor(5.2582, grad_fn=<NllLossBackward0>) tensor(5.1454, grad_fn=<NllLossBackward0>)
870 LOSS DIFF: tensor(5.3153, grad_fn=<NllLossBackward0>) tensor(5.2582, grad_fn=<NllLossBackward0>)
871 LOSS DIFF: tensor(5.2967, grad_fn=<NllLossBackward0>) tensor(5.0807, grad_fn=<NllLossBackward0>)
872 LOSS DIFF: tensor(5.3636, grad_fn=<NllLossBackward0>) tensor(5.2188, grad_fn=<NllLossBackward0>)
873 LOSS DIFF: tensor(5.3807, grad_fn=<NllLossBackward0>) tensor(5.3636, grad_fn=<NllLossBackward0>)
874 LOSS DIFF: tensor(5.3318, grad_fn=<NllLossBackward0>) tensor(5.2364, grad_fn=<NllLossBackward0>)
875 LOSS DIFF: tensor(5.3220, grad_fn=<NllLossBackward0>) tensor(5.2170, grad_fn=<NllLossBackward0>)
876 LOSS DIFF: tensor(5.2753, grad_fn=<NllLossBackward0>) tensor(5.1677, grad_fn=<NllLossBackward0>)
877 LOSS DIFF: tensor(5.3142, grad_fn=<NllLossBackward0>) tensor(5.2753, grad_fn=<NllLossBackward0>)
878 LOSS DIFF: tensor(5.3142, grad_fn=<NllLossBackward0>) tensor(5.1974, grad_fn=<NllLossBackward0>)
879 LOSS DIFF: tensor(5.1746, grad_fn=<NllLossBackward0>) tensor(5.0885, grad_fn=<NllLossBackward0>)
880 LOSS DIFF: tensor(5.3789, grad_fn=<NllLossBackward0>) tensor(5.1746, grad_fn=<NllLossBackward0>)
881 LOSS DIFF: tensor(5.3057, grad_fn=<NllLossBackward0>) tensor(5.2196, grad_fn=<NllLossBackward0>)
882 LOSS DIFF: tensor(5.2886, grad_fn=<NllLossBackward0>) tensor(5.2158, grad_fn=<NllLossBackward0>)
883 LOSS DIFF: tensor(5.3288, grad_fn=<NllLossBackward0>) tensor(5.2491, grad_fn=<NllLossBackward0>)
884 LOSS DIFF: tensor(5.4903, grad_fn=<NllLossBackward0>) tensor(5.3288, grad_fn=<NllLossBackward0>)
885 LOSS DIFF: tensor(5.4034, grad_fn=<NllLossBackward0>) tensor(5.2798, grad_fn=<NllLossBackward0>)
886 LOSS DIFF: tensor(5.3601, grad_fn=<NllLossBackward0>) tensor(5.1771, grad_fn=<NllLossBackward0>)
887 LOSS DIFF: tensor(5.2809, grad_fn=<NllLossBackward0>) tensor(5.1809, grad_fn=<NllLossBackward0>)
888 LOSS DIFF: tensor(5.3620, grad_fn=<NllLossBackward0>) tensor(5.2748, grad_fn=<NllLossBackward0>)
889 LOSS DIFF: tensor(5.3855, grad_fn=<NllLossBackward0>) tensor(5.2573, grad_fn=<NllLossBackward0>)
890 LOSS DIFF: tensor(5.3124, grad_fn=<NllLossBackward0>) tensor(5.2379, grad_fn=<NllLossBackward0>)
891 LOSS DIFF: tensor(5.3192, grad_fn=<NllLossBackward0>) tensor(5.3124, grad_fn=<NllLossBackward0>)
892 LOSS DIFF: tensor(5.3423, grad_fn=<NllLossBackward0>) tensor(5.3192, grad_fn=<NllLossBackward0>)
893 LOSS DIFF: tensor(5.4086, grad_fn=<NllLossBackward0>) tensor(5.1976, grad_fn=<NllLossBackward0>)
894 LOSS DIFF: tensor(5.3156, grad_fn=<NllLossBackward0>) tensor(5.2619, grad_fn=<NllLossBackward0>)
895 LOSS DIFF: tensor(5.3277, grad_fn=<NllLossBackward0>) tensor(5.3156, grad_fn=<NllLossBackward0>)
896 LOSS DIFF: tensor(5.2352, grad_fn=<NllLossBackward0>) tensor(5.2142, grad_fn=<NllLossBackward0>)
897 LOSS DIFF: tensor(5.3471, grad_fn=<NllLossBackward0>) tensor(5.2059, grad_fn=<NllLossBackward0>)
898 LOSS DIFF: tensor(5.2658, grad_fn=<NllLossBackward0>) tensor(5.1801, grad_fn=<NllLossBackward0>)
1800 tensor(5.4171, grad_fn=<NllLossBackward0>)
899 LOSS DIFF: tensor(5.4171, grad_fn=<NllLossBackward0>) tensor(5.2658, grad_fn=<NllLossBackward0>)
900 LOSS DIFF: tensor(5.3919, grad_fn=<NllLossBackward0>) tensor(5.2872, grad_fn=<NllLossBackward0>)
901 LOSS DIFF: tensor(5.2667, grad_fn=<NllLossBackward0>) tensor(5.1940, grad_fn=<NllLossBackward0>)
902 LOSS DIFF: tensor(5.3631, grad_fn=<NllLossBackward0>) tensor(5.2667, grad_fn=<NllLossBackward0>)
903 LOSS DIFF: tensor(5.3693, grad_fn=<NllLossBackward0>) tensor(5.2566, grad_fn=<NllLossBackward0>)
904 LOSS DIFF: tensor(5.3239, grad_fn=<NllLossBackward0>) tensor(5.2152, grad_fn=<NllLossBackward0>)
905 LOSS DIFF: tensor(5.3641, grad_fn=<NllLossBackward0>) tensor(5.3239, grad_fn=<NllLossBackward0>)
906 LOSS DIFF: tensor(5.2443, grad_fn=<NllLossBackward0>) tensor(5.1951, grad_fn=<NllLossBackward0>)
907 LOSS DIFF: tensor(5.4277, grad_fn=<NllLossBackward0>) tensor(5.1634, grad_fn=<NllLossBackward0>)
908 LOSS DIFF: tensor(5.2730, grad_fn=<NllLossBackward0>) tensor(5.0604, grad_fn=<NllLossBackward0>)
909 LOSS DIFF: tensor(5.2867, grad_fn=<NllLossBackward0>) tensor(5.2566, grad_fn=<NllLossBackward0>)
910 LOSS DIFF: tensor(5.4127, grad_fn=<NllLossBackward0>) tensor(5.2155, grad_fn=<NllLossBackward0>)
911 LOSS DIFF: tensor(5.3634, grad_fn=<NllLossBackward0>) tensor(5.3211, grad_fn=<NllLossBackward0>)
912 LOSS DIFF: tensor(5.2831, grad_fn=<NllLossBackward0>) tensor(5.2335, grad_fn=<NllLossBackward0>)
913 LOSS DIFF: tensor(5.2755, grad_fn=<NllLossBackward0>) tensor(5.2735, grad_fn=<NllLossBackward0>)
914 LOSS DIFF: tensor(5.2826, grad_fn=<NllLossBackward0>) tensor(5.2755, grad_fn=<NllLossBackward0>)
915 LOSS DIFF: tensor(5.3887, grad_fn=<NllLossBackward0>) tensor(5.0861, grad_fn=<NllLossBackward0>)
916 LOSS DIFF: tensor(5.3065, grad_fn=<NllLossBackward0>) tensor(5.2729, grad_fn=<NllLossBackward0>)
917 LOSS DIFF: tensor(5.2632, grad_fn=<NllLossBackward0>) tensor(5.1560, grad_fn=<NllLossBackward0>)
918 LOSS DIFF: tensor(5.2920, grad_fn=<NllLossBackward0>) tensor(5.1884, grad_fn=<NllLossBackward0>)
919 LOSS DIFF: tensor(5.3229, grad_fn=<NllLossBackward0>) tensor(5.2920, grad_fn=<NllLossBackward0>)
920 LOSS DIFF: tensor(5.2855, grad_fn=<NllLossBackward0>) tensor(5.1965, grad_fn=<NllLossBackward0>)
921 LOSS DIFF: tensor(5.3634, grad_fn=<NllLossBackward0>) tensor(5.2855, grad_fn=<NllLossBackward0>)
922 LOSS DIFF: tensor(5.3724, grad_fn=<NllLossBackward0>) tensor(5.0690, grad_fn=<NllLossBackward0>)
923 LOSS DIFF: tensor(5.2805, grad_fn=<NllLossBackward0>) tensor(5.2636, grad_fn=<NllLossBackward0>)
924 LOSS DIFF: tensor(5.2306, grad_fn=<NllLossBackward0>) tensor(5.0033, grad_fn=<NllLossBackward0>)
925 LOSS DIFF: tensor(5.2542, grad_fn=<NllLossBackward0>) tensor(5.2243, grad_fn=<NllLossBackward0>)
926 LOSS DIFF: tensor(5.3378, grad_fn=<NllLossBackward0>) tensor(5.2542, grad_fn=<NllLossBackward0>)
927 LOSS DIFF: tensor(5.2164, grad_fn=<NllLossBackward0>) tensor(5.1267, grad_fn=<NllLossBackward0>)
928 LOSS DIFF: tensor(5.3090, grad_fn=<NllLossBackward0>) tensor(5.2164, grad_fn=<NllLossBackward0>)
929 LOSS DIFF: tensor(5.3777, grad_fn=<NllLossBackward0>) tensor(5.3090, grad_fn=<NllLossBackward0>)
930 LOSS DIFF: tensor(5.2597, grad_fn=<NllLossBackward0>) tensor(5.2556, grad_fn=<NllLossBackward0>)
931 LOSS DIFF: tensor(5.4438, grad_fn=<NllLossBackward0>) tensor(5.2080, grad_fn=<NllLossBackward0>)
932 LOSS DIFF: tensor(5.2762, grad_fn=<NllLossBackward0>) tensor(5.2386, grad_fn=<NllLossBackward0>)
933 LOSS DIFF: tensor(5.3475, grad_fn=<NllLossBackward0>) tensor(5.1511, grad_fn=<NllLossBackward0>)
934 LOSS DIFF: tensor(5.3897, grad_fn=<NllLossBackward0>) tensor(5.3475, grad_fn=<NllLossBackward0>)
935 LOSS DIFF: tensor(5.2932, grad_fn=<NllLossBackward0>) tensor(5.1943, grad_fn=<NllLossBackward0>)
936 LOSS DIFF: tensor(5.3678, grad_fn=<NllLossBackward0>) tensor(5.2932, grad_fn=<NllLossBackward0>)
937 LOSS DIFF: tensor(5.3282, grad_fn=<NllLossBackward0>) tensor(5.2433, grad_fn=<NllLossBackward0>)
938 LOSS DIFF: tensor(5.3416, grad_fn=<NllLossBackward0>) tensor(5.3282, grad_fn=<NllLossBackward0>)
939 LOSS DIFF: tensor(5.2709, grad_fn=<NllLossBackward0>) tensor(5.1789, grad_fn=<NllLossBackward0>)
940 LOSS DIFF: tensor(5.3140, grad_fn=<NllLossBackward0>) tensor(5.2709, grad_fn=<NllLossBackward0>)
941 LOSS DIFF: tensor(5.2993, grad_fn=<NllLossBackward0>) tensor(5.2861, grad_fn=<NllLossBackward0>)
942 LOSS DIFF: tensor(5.1903, grad_fn=<NllLossBackward0>) tensor(5.1216, grad_fn=<NllLossBackward0>)
943 LOSS DIFF: tensor(5.2935, grad_fn=<NllLossBackward0>) tensor(5.1903, grad_fn=<NllLossBackward0>)
944 LOSS DIFF: tensor(5.2984, grad_fn=<NllLossBackward0>) tensor(5.2935, grad_fn=<NllLossBackward0>)
945 LOSS DIFF: tensor(5.3579, grad_fn=<NllLossBackward0>) tensor(5.2984, grad_fn=<NllLossBackward0>)
946 LOSS DIFF: tensor(5.2808, grad_fn=<NllLossBackward0>) tensor(5.1785, grad_fn=<NllLossBackward0>)
947 LOSS DIFF: tensor(5.2995, grad_fn=<NllLossBackward0>) tensor(5.2629, grad_fn=<NllLossBackward0>)
948 LOSS DIFF: tensor(5.3437, grad_fn=<NllLossBackward0>) tensor(5.2995, grad_fn=<NllLossBackward0>)
949 LOSS DIFF: tensor(5.3592, grad_fn=<NllLossBackward0>) tensor(5.3437, grad_fn=<NllLossBackward0>)
950 LOSS DIFF: tensor(5.4155, grad_fn=<NllLossBackward0>) tensor(5.3592, grad_fn=<NllLossBackward0>)
951 LOSS DIFF: tensor(5.3014, grad_fn=<NllLossBackward0>) tensor(5.2301, grad_fn=<NllLossBackward0>)
1900 tensor(5.3040, grad_fn=<NllLossBackward0>)
952 LOSS DIFF: tensor(5.3040, grad_fn=<NllLossBackward0>) tensor(5.2344, grad_fn=<NllLossBackward0>)
953 LOSS DIFF: tensor(5.2827, grad_fn=<NllLossBackward0>) tensor(5.2677, grad_fn=<NllLossBackward0>)
954 LOSS DIFF: tensor(5.3628, grad_fn=<NllLossBackward0>) tensor(5.2827, grad_fn=<NllLossBackward0>)
955 LOSS DIFF: tensor(5.2943, grad_fn=<NllLossBackward0>) tensor(5.2210, grad_fn=<NllLossBackward0>)
956 LOSS DIFF: tensor(5.1808, grad_fn=<NllLossBackward0>) tensor(5.1610, grad_fn=<NllLossBackward0>)
957 LOSS DIFF: tensor(5.3546, grad_fn=<NllLossBackward0>) tensor(5.1808, grad_fn=<NllLossBackward0>)
958 LOSS DIFF: tensor(5.1927, grad_fn=<NllLossBackward0>) tensor(5.1525, grad_fn=<NllLossBackward0>)
959 LOSS DIFF: tensor(5.3402, grad_fn=<NllLossBackward0>) tensor(5.1927, grad_fn=<NllLossBackward0>)
960 LOSS DIFF: tensor(5.3660, grad_fn=<NllLossBackward0>) tensor(5.2197, grad_fn=<NllLossBackward0>)
961 LOSS DIFF: tensor(5.3701, grad_fn=<NllLossBackward0>) tensor(5.3660, grad_fn=<NllLossBackward0>)
962 LOSS DIFF: tensor(5.1755, grad_fn=<NllLossBackward0>) tensor(5.1572, grad_fn=<NllLossBackward0>)
963 LOSS DIFF: tensor(5.2423, grad_fn=<NllLossBackward0>) tensor(5.1755, grad_fn=<NllLossBackward0>)
964 LOSS DIFF: tensor(5.4032, grad_fn=<NllLossBackward0>) tensor(5.2423, grad_fn=<NllLossBackward0>)
965 LOSS DIFF: tensor(5.3041, grad_fn=<NllLossBackward0>) tensor(5.1882, grad_fn=<NllLossBackward0>)
966 LOSS DIFF: tensor(5.3328, grad_fn=<NllLossBackward0>) tensor(5.3041, grad_fn=<NllLossBackward0>)
967 LOSS DIFF: tensor(5.1994, grad_fn=<NllLossBackward0>) tensor(5.1086, grad_fn=<NllLossBackward0>)
968 LOSS DIFF: tensor(5.2771, grad_fn=<NllLossBackward0>) tensor(5.1994, grad_fn=<NllLossBackward0>)
969 LOSS DIFF: tensor(5.3016, grad_fn=<NllLossBackward0>) tensor(5.2771, grad_fn=<NllLossBackward0>)
970 LOSS DIFF: tensor(5.3162, grad_fn=<NllLossBackward0>) tensor(5.3016, grad_fn=<NllLossBackward0>)
971 LOSS DIFF: tensor(5.3276, grad_fn=<NllLossBackward0>) tensor(5.2404, grad_fn=<NllLossBackward0>)
972 LOSS DIFF: tensor(5.3335, grad_fn=<NllLossBackward0>) tensor(5.3276, grad_fn=<NllLossBackward0>)
973 LOSS DIFF: tensor(5.3803, grad_fn=<NllLossBackward0>) tensor(5.2597, grad_fn=<NllLossBackward0>)
974 LOSS DIFF: tensor(5.2477, grad_fn=<NllLossBackward0>) tensor(5.1569, grad_fn=<NllLossBackward0>)
975 LOSS DIFF: tensor(5.3720, grad_fn=<NllLossBackward0>) tensor(5.2477, grad_fn=<NllLossBackward0>)
976 LOSS DIFF: tensor(5.3752, grad_fn=<NllLossBackward0>) tensor(5.3720, grad_fn=<NllLossBackward0>)
977 LOSS DIFF: tensor(5.2881, grad_fn=<NllLossBackward0>) tensor(5.2406, grad_fn=<NllLossBackward0>)
978 LOSS DIFF: tensor(5.4561, grad_fn=<NllLossBackward0>) tensor(5.2564, grad_fn=<NllLossBackward0>)
979 LOSS DIFF: tensor(5.3796, grad_fn=<NllLossBackward0>) tensor(5.3418, grad_fn=<NllLossBackward0>)
980 LOSS DIFF: tensor(5.2454, grad_fn=<NllLossBackward0>) tensor(5.2276, grad_fn=<NllLossBackward0>)
981 LOSS DIFF: tensor(5.3129, grad_fn=<NllLossBackward0>) tensor(5.2454, grad_fn=<NllLossBackward0>)
982 LOSS DIFF: tensor(5.3334, grad_fn=<NllLossBackward0>) tensor(5.3129, grad_fn=<NllLossBackward0>)
983 LOSS DIFF: tensor(5.3955, grad_fn=<NllLossBackward0>) tensor(5.3334, grad_fn=<NllLossBackward0>)
984 LOSS DIFF: tensor(5.4304, grad_fn=<NllLossBackward0>) tensor(5.2307, grad_fn=<NllLossBackward0>)
985 LOSS DIFF: tensor(5.3111, grad_fn=<NllLossBackward0>) tensor(5.1737, grad_fn=<NllLossBackward0>)
986 LOSS DIFF: tensor(5.3549, grad_fn=<NllLossBackward0>) tensor(5.3111, grad_fn=<NllLossBackward0>)
987 LOSS DIFF: tensor(5.3662, grad_fn=<NllLossBackward0>) tensor(5.2584, grad_fn=<NllLossBackward0>)
988 LOSS DIFF: tensor(5.3705, grad_fn=<NllLossBackward0>) tensor(5.1949, grad_fn=<NllLossBackward0>)
989 LOSS DIFF: tensor(5.2877, grad_fn=<NllLossBackward0>) tensor(5.2517, grad_fn=<NllLossBackward0>)
990 LOSS DIFF: tensor(5.2987, grad_fn=<NllLossBackward0>) tensor(5.2175, grad_fn=<NllLossBackward0>)
991 LOSS DIFF: tensor(5.3813, grad_fn=<NllLossBackward0>) tensor(5.1823, grad_fn=<NllLossBackward0>)
992 LOSS DIFF: tensor(5.3100, grad_fn=<NllLossBackward0>) tensor(5.2477, grad_fn=<NllLossBackward0>)
993 LOSS DIFF: tensor(5.3208, grad_fn=<NllLossBackward0>) tensor(5.1584, grad_fn=<NllLossBackward0>)
994 LOSS DIFF: tensor(5.3709, grad_fn=<NllLossBackward0>) tensor(5.3208, grad_fn=<NllLossBackward0>)
995 LOSS DIFF: tensor(5.2744, grad_fn=<NllLossBackward0>) tensor(5.1538, grad_fn=<NllLossBackward0>)
996 LOSS DIFF: tensor(5.2920, grad_fn=<NllLossBackward0>) tensor(5.2744, grad_fn=<NllLossBackward0>)
997 LOSS DIFF: tensor(5.3297, grad_fn=<NllLossBackward0>) tensor(5.2446, grad_fn=<NllLossBackward0>)
998 LOSS DIFF: tensor(5.3818, grad_fn=<NllLossBackward0>) tensor(5.3297, grad_fn=<NllLossBackward0>)
999 LOSS DIFF: tensor(5.2615, grad_fn=<NllLossBackward0>) tensor(5.1173, grad_fn=<NllLossBackward0>)
1000 LOSS DIFF: tensor(5.3420, grad_fn=<NllLossBackward0>) tensor(5.2615, grad_fn=<NllLossBackward0>)
import matplotlib.pyplot as plt

# Turn the recorded loss tensors into plain NumPy values for plotting.
# detach() drops the autograd graph; cpu() is a no-op for CPU tensors but is
# required before numpy() if the losses were recorded on a CUDA device.
loss_track2 = [t.detach().cpu().numpy() for t in loss_track]

# Plot the recorded loss values in the order they were appended.
plt.plot(loss_track2)
plt.show()
# Persist the trained weights so they can be reloaded later.
torch.save(model.state_dict(), 'model.bin')

# Rebuild the model on the CPU and restore the saved weights.
# map_location makes the load succeed even if the checkpoint was written
# from tensors living on another device (e.g. a GPU run).
device = 'cpu'
model = SimpleBigramNeuralLanguageModel(vocab_size, embed_size).to(device)
model.load_state_dict(torch.load('model.bin', map_location=device))
model.eval()

# Sanity check: the ten highest-scoring continuations of the word 'he'.
ixs = torch.tensor(vocab.forward(['he'])).to(device)

# Inference only, so skip autograd bookkeeping.
with torch.no_grad():
    out = model(ixs)
top = torch.topk(out[0], 10)
top_indices = top.indices.tolist()
top_probs = top.values.tolist()
top_words = vocab.lookup_tokens(top_indices)
# Left as the final bare expression so the notebook displays the result.
list(zip(top_words, top_indices, top_probs))
c:\PROGRAMY\Anaconda3\envs\scweet\lib\site-packages\torch\nn\modules\container.py:217: UserWarning: Implicit dimension choice for softmax has been deprecated. Change the call to include dim=X as an argument.
  input = module(input)
[('<unk>', 0, 0.1108938604593277),
 ('was', 12, 0.0792110487818718),
 ('had', 37, 0.07402306795120239),
 ('is', 8, 0.04529397189617157),
 ('has', 39, 0.03909718990325928),
 ('would', 48, 0.038855526596307755),
 ('said', 43, 0.022579118609428406),
 ('will', 27, 0.02008220925927162),
 ('went', 251, 0.013605386018753052),
 ('did', 151, 0.013007525354623795)]
def prediction(word: str) -> str:
    """Format the model's top five continuations of *word* as one line.

    The word is mapped to its vocabulary index, passed through the
    module-level ``model``, and the five highest-scoring entries are rendered
    as space-separated ``token:score`` pairs.  The last pair always has an
    empty token (``:score``):

    * if ``<unk>`` is among the five, it is moved to the end and its token
      is blanked;
    * otherwise the last (lowest-scoring) of the five has its token blanked.

    NOTE(review): the empty token is presumably the catch-all entry expected
    by the output format -- confirm against the task description.

    Relies on the module-level ``vocab``, ``model`` and ``device``.

    Args:
        word: The context word to continue.

    Returns:
        A single line such as ``was:0.08 had:0.07 is:0.05 has:0.04 :0.11``.
    """
    ixs = torch.tensor(vocab.forward([word])).to(device)
    # Inference only: no need to build an autograd graph.
    with torch.no_grad():
        out = model(ixs)
    top = torch.topk(out[0], 5)
    top_words = vocab.lookup_tokens(top.indices.tolist())
    top_probs = top.values.tolist()
    pairs = list(zip(top_words, top_probs))

    # Locate '<unk>' first, then edit the list, so it is never mutated while
    # being iterated.
    unk_position = next(
        (i for i, (token, _) in enumerate(pairs) if token == '<unk>'),
        None,
    )
    if unk_position is None:
        # No '<unk>' among the candidates: blank the token of the last one.
        pairs[-1] = ('', pairs[-1][1])
    else:
        # Move '<unk>' to the end and blank its token, keeping its score.
        _, unk_score = pairs.pop(unk_position)
        pairs.append(('', unk_score))

    return ' '.join(f'{token}:{score}' for token, score in pairs)
def create_outputs(folder_name):
    """Write one prediction line per input row to ``<folder_name>/out.tsv``.

    Reads ``<folder_name>/in.tsv.xz`` (xz-compressed, UTF-8, tab-separated).
    For each row, the column at index 6 has its literal backslash-n sequences
    replaced by spaces, and its last whitespace-separated token is passed to
    ``prediction``.  The returned string is written as one line of the output
    file, so the output stays line-aligned with the input.

    If that column contains no tokens (empty or whitespace only), the
    ``<s>`` marker is used as the context word instead of raising, so that a
    line is still produced for the row.

    Args:
        folder_name: Directory containing ``in.tsv.xz``; ``out.tsv`` is
            created (or overwritten) in the same directory.

    Raises:
        IndexError: If a row has fewer than seven tab-separated columns.
    """
    print(f'Creating outputs in {folder_name}')
    with lzma.open(f'{folder_name}/in.tsv.xz', mode='rt', encoding='utf-8') as fid, \
            open(f'{folder_name}/out.tsv', 'w', encoding='utf-8', newline='\n') as f:
        for line in fid:
            separated = line.split('\t')
            context_tokens = separated[6].replace(r'\n', ' ').split()
            # An empty context has no last word; fall back to the
            # sentence-start marker rather than crashing mid-file.
            prefix = context_tokens[-1] if context_tokens else '<s>'
            output_line = prediction(prefix)
            f.write(output_line + '\n')
# Generate the prediction files for both evaluation splits, in this order.
for split_dir in ('dev-0', 'test-A'):
    create_outputs(split_dir)
Creating outputs in dev-0
Creating outputs in test-A