challenging-america-word-ga.../nn_trigram.ipynb

import torch
import lzma
from itertools import islice
import re
import sys
from torchtext.vocab import build_vocab_from_iterator
from torch import nn
from torch.utils.data import IterableDataset, DataLoader
import itertools
import matplotlib.pyplot as plt
VOCAB_SIZE = 10_000
EMBED_SIZE = 400
def get_words_from_line(line):
  line = line.rstrip()
  line = line.split("\t")
  # the two rightmost TSV fields hold the text to the left and to the right of the gap
  text = line[-2] + " " + line[-1]
  # drop escaped newline markers and keep only letters and spaces
  text = re.sub(r"\\\\+n", " ", text)
  text = re.sub('[^A-Za-z ]+', '', text)
  for t in text.split():
    yield t

def get_word_lines_from_file(file_name):
  with lzma.open(file_name, encoding='utf8', mode="rt") as fh:
    for line in fh:
       yield get_words_from_line(line)

vocab = build_vocab_from_iterator(
    get_word_lines_from_file("train/in.tsv.xz"),
    max_tokens = VOCAB_SIZE,
    specials = ['<unk>'])
---------------------------------------------------------------------------
KeyboardInterrupt                         Traceback (most recent call last)
d:\studia\challenging-america-word-gap-prediction\nn_trigram.ipynb Cell 3 in <cell line: 17>()
     13        yield get_words_from_line(line)
     15 vocab_size = 1_000
---> 17 vocab = build_vocab_from_iterator(
     18     get_word_lines_from_file("train/in.tsv.xz"),
     19     max_tokens = VOCAB_SIZE,
     20     specials = ['<unk>'])

File c:\PROGRAMY\Anaconda3\envs\modelowanie-jezyka\lib\site-packages\torchtext\vocab\vocab_factory.py:98, in build_vocab_from_iterator(iterator, min_freq, specials, special_first, max_tokens)
     97 counter = Counter()
---> 98 for tokens in iterator:
     99     counter.update(tokens)

d:\studia\challenging-america-word-gap-prediction\nn_trigram.ipynb Cell 3 in get_word_lines_from_file(file_name)
     10 def get_word_lines_from_file(file_name):
     11   with lzma.open(file_name, encoding='utf8', mode="rt") as fh:
---> 12     for line in fh:
     13        yield get_words_from_line(line)

File c:\PROGRAMY\Anaconda3\envs\modelowanie-jezyka\lib\lzma.py:212, in LZMAFile.read1(self, size)
    210 if size < 0:
    211     size = io.DEFAULT_BUFFER_SIZE
--> 212 return self._buffer.read1(size)

File c:\PROGRAMY\Anaconda3\envs\modelowanie-jezyka\lib\_compression.py:68, in DecompressReader.readinto(self, b)
     66 def readinto(self, b):
     67     with memoryview(b) as view, view.cast("B") as byte_view:
---> 68         data = self.read(len(byte_view))
     69         byte_view[:len(data)] = data
     70     return len(data)

File c:\PROGRAMY\Anaconda3\envs\modelowanie-jezyka\lib\_compression.py:103, in DecompressReader.read(self, size)
    101     else:
    102         rawblock = b""
--> 103     data = self._decompressor.decompress(rawblock, size)
    104 if data:
    105     break

KeyboardInterrupt: 
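
# Quick sanity check (illustrative only, not from the original run): the
# special token is inserted first, so '<unk>' maps to index 0 and frequent
# words get the smallest indices.
# vocab['<unk>'], vocab['the']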
def look_ahead_iterator(gen):
   first = None
   second = None
   for item in gen:
      if first is not None and second is not None:
         # the middle word is the target, its two neighbours are the context
         yield ((first, item), second)
      first = second
      second = item
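
# Minimal illustration (added, not part of the original run): the iterator
# turns a token stream into ((left, right), middle) pairs, so each word is
# predicted from its two neighbours -- exactly the word-gap setting.
list(look_ahead_iterator(iter([1, 2, 3, 4])))
# -> [((1, 3), 2), ((2, 4), 3)]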

class Trigrams(IterableDataset):
  def __init__(self, text_file, vocabulary_size):
      # reuse the vocabulary built above; out-of-vocabulary tokens map to '<unk>'
      self.vocab = vocab
      self.vocab.set_default_index(self.vocab['<unk>'])
      self.vocabulary_size = vocabulary_size
      self.text_file = text_file

  def __iter__(self):
     # stream ((left, right), middle) index triples over the whole training file
     return look_ahead_iterator(
         (self.vocab[t] for t in itertools.chain.from_iterable(get_word_lines_from_file(self.text_file))))

train_dataset = Trigrams("train/in.tsv.xz", VOCAB_SIZE)
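# For orientation (an assumption based on PyTorch's default collation): each
# batch drawn from the DataLoader below is a pair (x, y) with
# x = [left_ids, right_ids] and y = middle_ids, all tensors of length batch_size.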
class TrigramNNModel(nn.Module):
  def __init__(self, VOCAB_SIZE, EMBED_SIZE):
      super(TrigramNNModel, self).__init__()
      self.embeddings = nn.Embedding(VOCAB_SIZE, EMBED_SIZE)
      self.hidden_layer = nn.Linear(EMBED_SIZE*2, 1200)
      self.output_layer = nn.Linear(1200, VOCAB_SIZE)
      # dim=1 normalizes over the vocabulary axis and avoids the
      # "implicit dimension choice for softmax" deprecation warning
      self.softmax = nn.Softmax(dim=1)

  def forward(self, x):
      # x is a pair (left-context ids, right-context ids)
      emb_2 = self.embeddings(x[0])
      emb_1 = self.embeddings(x[1])
      x = torch.cat([emb_2, emb_1], dim=1)
      x = self.hidden_layer(x)
      x = self.output_layer(x)
      x = self.softmax(x)
      return x

model = TrigramNNModel(VOCAB_SIZE, EMBED_SIZE)
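
# Illustrative shape check (added here, not in the original notebook): three
# (left, right) index pairs go in, three distributions over the vocabulary come out.
_left = torch.randint(0, VOCAB_SIZE, (3,))
_right = torch.randint(0, VOCAB_SIZE, (3,))
print(model((_left, _right)).shape)  # torch.Size([3, 10000])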

vocab.set_default_index(vocab['<unk>'])
device = 'cpu'
model = TrigramNNModel(VOCAB_SIZE, EMBED_SIZE).to(device)
data = DataLoader(train_dataset, batch_size=2_000)
optimizer = torch.optim.Adam(model.parameters())
criterion = torch.nn.NLLLoss()

loss_track = []
last_loss = 1_000
trigger_count = 0

model.train()
step = 0
for x, y in data:
   x[0] = x[0].to(device)
   x[1] = x[1].to(device)
   y = y.to(device)
   optimizer.zero_grad()
   ypredicted = model(x)
   # the model outputs probabilities, so take the log before applying NLLLoss
   loss = criterion(torch.log(ypredicted), y)
   if step % 100 == 0:
      print(step, loss)
   step += 1
   loss.backward()
   optimizer.step()

   # crude early stopping: count the batches whose loss exceeds the previous
   # one and stop once that has happened 500 times
   if loss > last_loss:
      trigger_count += 1
      print(trigger_count, 'LOSS DIFF:', loss, last_loss)

   if trigger_count >= 500:
      break

   loss_track.append(loss)
   last_loss = loss
0 tensor(9.2713, grad_fn=<NllLossBackward0>)
1 LOSS DIFF: tensor(8.2370, grad_fn=<NllLossBackward0>) tensor(8.2154, grad_fn=<NllLossBackward0>)
2 LOSS DIFF: tensor(8.0085, grad_fn=<NllLossBackward0>) tensor(7.9711, grad_fn=<NllLossBackward0>)
3 LOSS DIFF: tensor(8.0149, grad_fn=<NllLossBackward0>) tensor(8.0085, grad_fn=<NllLossBackward0>)
4 LOSS DIFF: tensor(7.5328, grad_fn=<NllLossBackward0>) tensor(7.4404, grad_fn=<NllLossBackward0>)
5 LOSS DIFF: tensor(7.5367, grad_fn=<NllLossBackward0>) tensor(7.5328, grad_fn=<NllLossBackward0>)
6 LOSS DIFF: tensor(7.6733, grad_fn=<NllLossBackward0>) tensor(7.5367, grad_fn=<NllLossBackward0>)
7 LOSS DIFF: tensor(7.4703, grad_fn=<NllLossBackward0>) tensor(7.3663, grad_fn=<NllLossBackward0>)
8 LOSS DIFF: tensor(7.2923, grad_fn=<NllLossBackward0>) tensor(7.1224, grad_fn=<NllLossBackward0>)
9 LOSS DIFF: tensor(7.2912, grad_fn=<NllLossBackward0>) tensor(7.0721, grad_fn=<NllLossBackward0>)
10 LOSS DIFF: tensor(7.4529, grad_fn=<NllLossBackward0>) tensor(7.0255, grad_fn=<NllLossBackward0>)
11 LOSS DIFF: tensor(7.2017, grad_fn=<NllLossBackward0>) tensor(7.0108, grad_fn=<NllLossBackward0>)
12 LOSS DIFF: tensor(7.0689, grad_fn=<NllLossBackward0>) tensor(6.7964, grad_fn=<NllLossBackward0>)
13 LOSS DIFF: tensor(7.1870, grad_fn=<NllLossBackward0>) tensor(6.7505, grad_fn=<NllLossBackward0>)
14 LOSS DIFF: tensor(7.0149, grad_fn=<NllLossBackward0>) tensor(6.7360, grad_fn=<NllLossBackward0>)
15 LOSS DIFF: tensor(7.0185, grad_fn=<NllLossBackward0>) tensor(6.5064, grad_fn=<NllLossBackward0>)
16 LOSS DIFF: tensor(6.6809, grad_fn=<NllLossBackward0>) tensor(6.6315, grad_fn=<NllLossBackward0>)
17 LOSS DIFF: tensor(6.6161, grad_fn=<NllLossBackward0>) tensor(6.5363, grad_fn=<NllLossBackward0>)
18 LOSS DIFF: tensor(6.6186, grad_fn=<NllLossBackward0>) tensor(6.4474, grad_fn=<NllLossBackward0>)
19 LOSS DIFF: tensor(6.7242, grad_fn=<NllLossBackward0>) tensor(6.6186, grad_fn=<NllLossBackward0>)
20 LOSS DIFF: tensor(6.8363, grad_fn=<NllLossBackward0>) tensor(6.4740, grad_fn=<NllLossBackward0>)
21 LOSS DIFF: tensor(6.4746, grad_fn=<NllLossBackward0>) tensor(6.3583, grad_fn=<NllLossBackward0>)
22 LOSS DIFF: tensor(6.2821, grad_fn=<NllLossBackward0>) tensor(6.2621, grad_fn=<NllLossBackward0>)
23 LOSS DIFF: tensor(6.5530, grad_fn=<NllLossBackward0>) tensor(6.2821, grad_fn=<NllLossBackward0>)
24 LOSS DIFF: tensor(6.3082, grad_fn=<NllLossBackward0>) tensor(6.1749, grad_fn=<NllLossBackward0>)
25 LOSS DIFF: tensor(6.3215, grad_fn=<NllLossBackward0>) tensor(6.0069, grad_fn=<NllLossBackward0>)
26 LOSS DIFF: tensor(6.3455, grad_fn=<NllLossBackward0>) tensor(6.1887, grad_fn=<NllLossBackward0>)
27 LOSS DIFF: tensor(6.0695, grad_fn=<NllLossBackward0>) tensor(6.0053, grad_fn=<NllLossBackward0>)
28 LOSS DIFF: tensor(6.2298, grad_fn=<NllLossBackward0>) tensor(6.0553, grad_fn=<NllLossBackward0>)
29 LOSS DIFF: tensor(6.2879, grad_fn=<NllLossBackward0>) tensor(6.2298, grad_fn=<NllLossBackward0>)
30 LOSS DIFF: tensor(5.8552, grad_fn=<NllLossBackward0>) tensor(5.7972, grad_fn=<NllLossBackward0>)
31 LOSS DIFF: tensor(5.8884, grad_fn=<NllLossBackward0>) tensor(5.8552, grad_fn=<NllLossBackward0>)
32 LOSS DIFF: tensor(6.0852, grad_fn=<NllLossBackward0>) tensor(5.8884, grad_fn=<NllLossBackward0>)
33 LOSS DIFF: tensor(6.2040, grad_fn=<NllLossBackward0>) tensor(6.0852, grad_fn=<NllLossBackward0>)
34 LOSS DIFF: tensor(6.1036, grad_fn=<NllLossBackward0>) tensor(5.9439, grad_fn=<NllLossBackward0>)
35 LOSS DIFF: tensor(6.0782, grad_fn=<NllLossBackward0>) tensor(5.9413, grad_fn=<NllLossBackward0>)
36 LOSS DIFF: tensor(5.9607, grad_fn=<NllLossBackward0>) tensor(5.7949, grad_fn=<NllLossBackward0>)
37 LOSS DIFF: tensor(6.0354, grad_fn=<NllLossBackward0>) tensor(5.9607, grad_fn=<NllLossBackward0>)
38 LOSS DIFF: tensor(6.2669, grad_fn=<NllLossBackward0>) tensor(6.0243, grad_fn=<NllLossBackward0>)
39 LOSS DIFF: tensor(5.8678, grad_fn=<NllLossBackward0>) tensor(5.6556, grad_fn=<NllLossBackward0>)
40 LOSS DIFF: tensor(6.0265, grad_fn=<NllLossBackward0>) tensor(5.8678, grad_fn=<NllLossBackward0>)
41 LOSS DIFF: tensor(6.1147, grad_fn=<NllLossBackward0>) tensor(5.8050, grad_fn=<NllLossBackward0>)
100 tensor(5.8244, grad_fn=<NllLossBackward0>)
42 LOSS DIFF: tensor(5.8244, grad_fn=<NllLossBackward0>) tensor(5.7412, grad_fn=<NllLossBackward0>)
43 LOSS DIFF: tensor(5.9226, grad_fn=<NllLossBackward0>) tensor(5.8244, grad_fn=<NllLossBackward0>)
44 LOSS DIFF: tensor(5.9487, grad_fn=<NllLossBackward0>) tensor(5.9226, grad_fn=<NllLossBackward0>)
45 LOSS DIFF: tensor(5.8844, grad_fn=<NllLossBackward0>) tensor(5.3183, grad_fn=<NllLossBackward0>)
46 LOSS DIFF: tensor(6.0141, grad_fn=<NllLossBackward0>) tensor(5.8844, grad_fn=<NllLossBackward0>)
47 LOSS DIFF: tensor(6.1782, grad_fn=<NllLossBackward0>) tensor(5.8340, grad_fn=<NllLossBackward0>)
48 LOSS DIFF: tensor(5.8840, grad_fn=<NllLossBackward0>) tensor(5.7920, grad_fn=<NllLossBackward0>)
49 LOSS DIFF: tensor(5.7265, grad_fn=<NllLossBackward0>) tensor(5.6177, grad_fn=<NllLossBackward0>)
50 LOSS DIFF: tensor(5.9389, grad_fn=<NllLossBackward0>) tensor(5.7265, grad_fn=<NllLossBackward0>)
51 LOSS DIFF: tensor(5.6946, grad_fn=<NllLossBackward0>) tensor(5.6487, grad_fn=<NllLossBackward0>)
52 LOSS DIFF: tensor(5.8837, grad_fn=<NllLossBackward0>) tensor(5.6946, grad_fn=<NllLossBackward0>)
53 LOSS DIFF: tensor(5.9090, grad_fn=<NllLossBackward0>) tensor(5.8837, grad_fn=<NllLossBackward0>)
54 LOSS DIFF: tensor(5.9914, grad_fn=<NllLossBackward0>) tensor(5.9090, grad_fn=<NllLossBackward0>)
55 LOSS DIFF: tensor(5.8042, grad_fn=<NllLossBackward0>) tensor(5.7994, grad_fn=<NllLossBackward0>)
56 LOSS DIFF: tensor(5.9282, grad_fn=<NllLossBackward0>) tensor(5.8042, grad_fn=<NllLossBackward0>)
57 LOSS DIFF: tensor(5.9366, grad_fn=<NllLossBackward0>) tensor(5.7254, grad_fn=<NllLossBackward0>)
58 LOSS DIFF: tensor(5.7995, grad_fn=<NllLossBackward0>) tensor(5.7486, grad_fn=<NllLossBackward0>)
59 LOSS DIFF: tensor(5.6361, grad_fn=<NllLossBackward0>) tensor(5.5307, grad_fn=<NllLossBackward0>)
60 LOSS DIFF: tensor(5.7078, grad_fn=<NllLossBackward0>) tensor(5.6361, grad_fn=<NllLossBackward0>)
61 LOSS DIFF: tensor(5.7592, grad_fn=<NllLossBackward0>) tensor(5.7078, grad_fn=<NllLossBackward0>)
62 LOSS DIFF: tensor(5.7625, grad_fn=<NllLossBackward0>) tensor(5.5981, grad_fn=<NllLossBackward0>)
63 LOSS DIFF: tensor(5.8389, grad_fn=<NllLossBackward0>) tensor(5.7625, grad_fn=<NllLossBackward0>)
64 LOSS DIFF: tensor(5.7739, grad_fn=<NllLossBackward0>) tensor(5.7312, grad_fn=<NllLossBackward0>)
65 LOSS DIFF: tensor(5.9031, grad_fn=<NllLossBackward0>) tensor(5.6170, grad_fn=<NllLossBackward0>)
66 LOSS DIFF: tensor(5.7173, grad_fn=<NllLossBackward0>) tensor(5.5232, grad_fn=<NllLossBackward0>)
67 LOSS DIFF: tensor(5.7408, grad_fn=<NllLossBackward0>) tensor(5.7173, grad_fn=<NllLossBackward0>)
68 LOSS DIFF: tensor(5.8191, grad_fn=<NllLossBackward0>) tensor(5.7408, grad_fn=<NllLossBackward0>)
69 LOSS DIFF: tensor(6.0318, grad_fn=<NllLossBackward0>) tensor(5.8191, grad_fn=<NllLossBackward0>)
70 LOSS DIFF: tensor(5.6656, grad_fn=<NllLossBackward0>) tensor(5.5086, grad_fn=<NllLossBackward0>)
71 LOSS DIFF: tensor(5.7288, grad_fn=<NllLossBackward0>) tensor(5.6656, grad_fn=<NllLossBackward0>)
72 LOSS DIFF: tensor(6.0700, grad_fn=<NllLossBackward0>) tensor(5.7288, grad_fn=<NllLossBackward0>)
73 LOSS DIFF: tensor(5.8114, grad_fn=<NllLossBackward0>) tensor(5.5442, grad_fn=<NllLossBackward0>)
74 LOSS DIFF: tensor(5.8363, grad_fn=<NllLossBackward0>) tensor(5.5099, grad_fn=<NllLossBackward0>)
75 LOSS DIFF: tensor(5.8545, grad_fn=<NllLossBackward0>) tensor(5.8363, grad_fn=<NllLossBackward0>)
76 LOSS DIFF: tensor(5.9820, grad_fn=<NllLossBackward0>) tensor(5.8545, grad_fn=<NllLossBackward0>)
77 LOSS DIFF: tensor(5.8431, grad_fn=<NllLossBackward0>) tensor(5.7144, grad_fn=<NllLossBackward0>)
78 LOSS DIFF: tensor(5.9114, grad_fn=<NllLossBackward0>) tensor(5.8431, grad_fn=<NllLossBackward0>)
79 LOSS DIFF: tensor(5.8020, grad_fn=<NllLossBackward0>) tensor(5.4449, grad_fn=<NllLossBackward0>)
80 LOSS DIFF: tensor(5.8973, grad_fn=<NllLossBackward0>) tensor(5.5983, grad_fn=<NllLossBackward0>)
81 LOSS DIFF: tensor(5.6962, grad_fn=<NllLossBackward0>) tensor(5.6396, grad_fn=<NllLossBackward0>)
82 LOSS DIFF: tensor(5.6928, grad_fn=<NllLossBackward0>) tensor(5.5821, grad_fn=<NllLossBackward0>)
83 LOSS DIFF: tensor(5.7957, grad_fn=<NllLossBackward0>) tensor(5.6928, grad_fn=<NllLossBackward0>)
84 LOSS DIFF: tensor(5.5650, grad_fn=<NllLossBackward0>) tensor(5.5055, grad_fn=<NllLossBackward0>)
85 LOSS DIFF: tensor(5.6884, grad_fn=<NllLossBackward0>) tensor(5.5650, grad_fn=<NllLossBackward0>)
86 LOSS DIFF: tensor(5.7350, grad_fn=<NllLossBackward0>) tensor(5.6884, grad_fn=<NllLossBackward0>)
87 LOSS DIFF: tensor(5.6654, grad_fn=<NllLossBackward0>) tensor(5.5815, grad_fn=<NllLossBackward0>)
88 LOSS DIFF: tensor(5.7693, grad_fn=<NllLossBackward0>) tensor(5.3977, grad_fn=<NllLossBackward0>)
89 LOSS DIFF: tensor(5.5829, grad_fn=<NllLossBackward0>) tensor(5.5628, grad_fn=<NllLossBackward0>)
90 LOSS DIFF: tensor(5.8661, grad_fn=<NllLossBackward0>) tensor(5.5829, grad_fn=<NllLossBackward0>)
91 LOSS DIFF: tensor(5.4884, grad_fn=<NllLossBackward0>) tensor(5.4546, grad_fn=<NllLossBackward0>)
92 LOSS DIFF: tensor(5.6575, grad_fn=<NllLossBackward0>) tensor(5.4884, grad_fn=<NllLossBackward0>)
93 LOSS DIFF: tensor(5.8113, grad_fn=<NllLossBackward0>) tensor(5.6575, grad_fn=<NllLossBackward0>)
94 LOSS DIFF: tensor(5.6923, grad_fn=<NllLossBackward0>) tensor(5.5077, grad_fn=<NllLossBackward0>)
95 LOSS DIFF: tensor(5.7196, grad_fn=<NllLossBackward0>) tensor(5.6923, grad_fn=<NllLossBackward0>)
96 LOSS DIFF: tensor(5.6317, grad_fn=<NllLossBackward0>) tensor(5.6262, grad_fn=<NllLossBackward0>)
97 LOSS DIFF: tensor(5.7707, grad_fn=<NllLossBackward0>) tensor(5.6099, grad_fn=<NllLossBackward0>)
200 tensor(5.4212, grad_fn=<NllLossBackward0>)
98 LOSS DIFF: tensor(5.5956, grad_fn=<NllLossBackward0>) tensor(5.4212, grad_fn=<NllLossBackward0>)
99 LOSS DIFF: tensor(5.7422, grad_fn=<NllLossBackward0>) tensor(5.5956, grad_fn=<NllLossBackward0>)
100 LOSS DIFF: tensor(5.8166, grad_fn=<NllLossBackward0>) tensor(5.7422, grad_fn=<NllLossBackward0>)
101 LOSS DIFF: tensor(5.8615, grad_fn=<NllLossBackward0>) tensor(5.8166, grad_fn=<NllLossBackward0>)
102 LOSS DIFF: tensor(5.9617, grad_fn=<NllLossBackward0>) tensor(5.8615, grad_fn=<NllLossBackward0>)
103 LOSS DIFF: tensor(5.9847, grad_fn=<NllLossBackward0>) tensor(5.9617, grad_fn=<NllLossBackward0>)
104 LOSS DIFF: tensor(5.8443, grad_fn=<NllLossBackward0>) tensor(5.6014, grad_fn=<NllLossBackward0>)
105 LOSS DIFF: tensor(5.7755, grad_fn=<NllLossBackward0>) tensor(5.7413, grad_fn=<NllLossBackward0>)
106 LOSS DIFF: tensor(6.0574, grad_fn=<NllLossBackward0>) tensor(5.6690, grad_fn=<NllLossBackward0>)
107 LOSS DIFF: tensor(5.4708, grad_fn=<NllLossBackward0>) tensor(5.4460, grad_fn=<NllLossBackward0>)
108 LOSS DIFF: tensor(5.6402, grad_fn=<NllLossBackward0>) tensor(5.4708, grad_fn=<NllLossBackward0>)
109 LOSS DIFF: tensor(5.7016, grad_fn=<NllLossBackward0>) tensor(5.6402, grad_fn=<NllLossBackward0>)
110 LOSS DIFF: tensor(5.5643, grad_fn=<NllLossBackward0>) tensor(5.4158, grad_fn=<NllLossBackward0>)
111 LOSS DIFF: tensor(5.6958, grad_fn=<NllLossBackward0>) tensor(5.3094, grad_fn=<NllLossBackward0>)
112 LOSS DIFF: tensor(5.8296, grad_fn=<NllLossBackward0>) tensor(5.4617, grad_fn=<NllLossBackward0>)
113 LOSS DIFF: tensor(5.6992, grad_fn=<NllLossBackward0>) tensor(5.5483, grad_fn=<NllLossBackward0>)
114 LOSS DIFF: tensor(5.4980, grad_fn=<NllLossBackward0>) tensor(5.4310, grad_fn=<NllLossBackward0>)
115 LOSS DIFF: tensor(5.4942, grad_fn=<NllLossBackward0>) tensor(5.3832, grad_fn=<NllLossBackward0>)
116 LOSS DIFF: tensor(5.6928, grad_fn=<NllLossBackward0>) tensor(5.4942, grad_fn=<NllLossBackward0>)
117 LOSS DIFF: tensor(5.6334, grad_fn=<NllLossBackward0>) tensor(5.5606, grad_fn=<NllLossBackward0>)
118 LOSS DIFF: tensor(5.7307, grad_fn=<NllLossBackward0>) tensor(5.5210, grad_fn=<NllLossBackward0>)
119 LOSS DIFF: tensor(5.5673, grad_fn=<NllLossBackward0>) tensor(5.5488, grad_fn=<NllLossBackward0>)
120 LOSS DIFF: tensor(6.0060, grad_fn=<NllLossBackward0>) tensor(5.4800, grad_fn=<NllLossBackward0>)
121 LOSS DIFF: tensor(5.5278, grad_fn=<NllLossBackward0>) tensor(5.1856, grad_fn=<NllLossBackward0>)
122 LOSS DIFF: tensor(5.5388, grad_fn=<NllLossBackward0>) tensor(5.5278, grad_fn=<NllLossBackward0>)
123 LOSS DIFF: tensor(5.6835, grad_fn=<NllLossBackward0>) tensor(5.5388, grad_fn=<NllLossBackward0>)
124 LOSS DIFF: tensor(5.6808, grad_fn=<NllLossBackward0>) tensor(5.5417, grad_fn=<NllLossBackward0>)
125 LOSS DIFF: tensor(5.8665, grad_fn=<NllLossBackward0>) tensor(5.5828, grad_fn=<NllLossBackward0>)
126 LOSS DIFF: tensor(5.7710, grad_fn=<NllLossBackward0>) tensor(5.5468, grad_fn=<NllLossBackward0>)
127 LOSS DIFF: tensor(5.6604, grad_fn=<NllLossBackward0>) tensor(5.6368, grad_fn=<NllLossBackward0>)
128 LOSS DIFF: tensor(5.5983, grad_fn=<NllLossBackward0>) tensor(5.5213, grad_fn=<NllLossBackward0>)
129 LOSS DIFF: tensor(5.6943, grad_fn=<NllLossBackward0>) tensor(5.4842, grad_fn=<NllLossBackward0>)
130 LOSS DIFF: tensor(5.5073, grad_fn=<NllLossBackward0>) tensor(5.4259, grad_fn=<NllLossBackward0>)
131 LOSS DIFF: tensor(5.5320, grad_fn=<NllLossBackward0>) tensor(5.5073, grad_fn=<NllLossBackward0>)
132 LOSS DIFF: tensor(5.6082, grad_fn=<NllLossBackward0>) tensor(5.4292, grad_fn=<NllLossBackward0>)
133 LOSS DIFF: tensor(5.6768, grad_fn=<NllLossBackward0>) tensor(5.4724, grad_fn=<NllLossBackward0>)
134 LOSS DIFF: tensor(5.5272, grad_fn=<NllLossBackward0>) tensor(5.5222, grad_fn=<NllLossBackward0>)
135 LOSS DIFF: tensor(5.5190, grad_fn=<NllLossBackward0>) tensor(5.5016, grad_fn=<NllLossBackward0>)
136 LOSS DIFF: tensor(5.6560, grad_fn=<NllLossBackward0>) tensor(5.5190, grad_fn=<NllLossBackward0>)
137 LOSS DIFF: tensor(5.6775, grad_fn=<NllLossBackward0>) tensor(5.6560, grad_fn=<NllLossBackward0>)
138 LOSS DIFF: tensor(5.6694, grad_fn=<NllLossBackward0>) tensor(5.6686, grad_fn=<NllLossBackward0>)
139 LOSS DIFF: tensor(5.5788, grad_fn=<NllLossBackward0>) tensor(5.2768, grad_fn=<NllLossBackward0>)
140 LOSS DIFF: tensor(5.3935, grad_fn=<NllLossBackward0>) tensor(5.3774, grad_fn=<NllLossBackward0>)
141 LOSS DIFF: tensor(5.6068, grad_fn=<NllLossBackward0>) tensor(5.3935, grad_fn=<NllLossBackward0>)
142 LOSS DIFF: tensor(5.6336, grad_fn=<NllLossBackward0>) tensor(5.6068, grad_fn=<NllLossBackward0>)
143 LOSS DIFF: tensor(5.7687, grad_fn=<NllLossBackward0>) tensor(5.5630, grad_fn=<NllLossBackward0>)
144 LOSS DIFF: tensor(5.7539, grad_fn=<NllLossBackward0>) tensor(5.6827, grad_fn=<NllLossBackward0>)
145 LOSS DIFF: tensor(5.7485, grad_fn=<NllLossBackward0>) tensor(5.6277, grad_fn=<NllLossBackward0>)
300 tensor(5.8304, grad_fn=<NllLossBackward0>)
146 LOSS DIFF: tensor(5.8304, grad_fn=<NllLossBackward0>) tensor(5.5549, grad_fn=<NllLossBackward0>)
147 LOSS DIFF: tensor(5.5819, grad_fn=<NllLossBackward0>) tensor(5.4616, grad_fn=<NllLossBackward0>)
148 LOSS DIFF: tensor(5.6154, grad_fn=<NllLossBackward0>) tensor(5.5819, grad_fn=<NllLossBackward0>)
149 LOSS DIFF: tensor(5.7859, grad_fn=<NllLossBackward0>) tensor(5.3329, grad_fn=<NllLossBackward0>)
150 LOSS DIFF: tensor(5.5458, grad_fn=<NllLossBackward0>) tensor(5.5438, grad_fn=<NllLossBackward0>)
151 LOSS DIFF: tensor(5.7121, grad_fn=<NllLossBackward0>) tensor(5.5458, grad_fn=<NllLossBackward0>)
152 LOSS DIFF: tensor(5.6329, grad_fn=<NllLossBackward0>) tensor(5.2700, grad_fn=<NllLossBackward0>)
153 LOSS DIFF: tensor(5.6739, grad_fn=<NllLossBackward0>) tensor(5.3680, grad_fn=<NllLossBackward0>)
154 LOSS DIFF: tensor(5.7045, grad_fn=<NllLossBackward0>) tensor(5.6739, grad_fn=<NllLossBackward0>)
155 LOSS DIFF: tensor(5.5067, grad_fn=<NllLossBackward0>) tensor(5.2978, grad_fn=<NllLossBackward0>)
156 LOSS DIFF: tensor(5.5102, grad_fn=<NllLossBackward0>) tensor(5.5067, grad_fn=<NllLossBackward0>)
157 LOSS DIFF: tensor(5.5956, grad_fn=<NllLossBackward0>) tensor(5.4116, grad_fn=<NllLossBackward0>)
158 LOSS DIFF: tensor(5.5993, grad_fn=<NllLossBackward0>) tensor(5.4012, grad_fn=<NllLossBackward0>)
159 LOSS DIFF: tensor(5.6150, grad_fn=<NllLossBackward0>) tensor(5.3476, grad_fn=<NllLossBackward0>)
160 LOSS DIFF: tensor(5.4375, grad_fn=<NllLossBackward0>) tensor(5.4351, grad_fn=<NllLossBackward0>)
161 LOSS DIFF: tensor(5.7052, grad_fn=<NllLossBackward0>) tensor(5.4375, grad_fn=<NllLossBackward0>)
162 LOSS DIFF: tensor(5.7059, grad_fn=<NllLossBackward0>) tensor(5.5050, grad_fn=<NllLossBackward0>)
163 LOSS DIFF: tensor(5.7356, grad_fn=<NllLossBackward0>) tensor(5.5716, grad_fn=<NllLossBackward0>)
164 LOSS DIFF: tensor(5.7517, grad_fn=<NllLossBackward0>) tensor(5.5423, grad_fn=<NllLossBackward0>)
165 LOSS DIFF: tensor(5.7358, grad_fn=<NllLossBackward0>) tensor(5.4403, grad_fn=<NllLossBackward0>)
166 LOSS DIFF: tensor(5.6180, grad_fn=<NllLossBackward0>) tensor(5.4437, grad_fn=<NllLossBackward0>)
167 LOSS DIFF: tensor(5.5725, grad_fn=<NllLossBackward0>) tensor(5.2734, grad_fn=<NllLossBackward0>)
168 LOSS DIFF: tensor(5.8849, grad_fn=<NllLossBackward0>) tensor(5.3810, grad_fn=<NllLossBackward0>)
169 LOSS DIFF: tensor(5.5414, grad_fn=<NllLossBackward0>) tensor(5.5272, grad_fn=<NllLossBackward0>)
170 LOSS DIFF: tensor(5.5738, grad_fn=<NllLossBackward0>) tensor(5.3898, grad_fn=<NllLossBackward0>)
171 LOSS DIFF: tensor(5.7096, grad_fn=<NllLossBackward0>) tensor(5.2583, grad_fn=<NllLossBackward0>)
172 LOSS DIFF: tensor(5.7039, grad_fn=<NllLossBackward0>) tensor(5.6133, grad_fn=<NllLossBackward0>)
173 LOSS DIFF: tensor(5.5324, grad_fn=<NllLossBackward0>) tensor(5.5068, grad_fn=<NllLossBackward0>)
174 LOSS DIFF: tensor(5.5902, grad_fn=<NllLossBackward0>) tensor(5.4034, grad_fn=<NllLossBackward0>)
175 LOSS DIFF: tensor(5.5912, grad_fn=<NllLossBackward0>) tensor(5.5902, grad_fn=<NllLossBackward0>)
176 LOSS DIFF: tensor(5.7047, grad_fn=<NllLossBackward0>) tensor(5.5912, grad_fn=<NllLossBackward0>)
177 LOSS DIFF: tensor(5.6506, grad_fn=<NllLossBackward0>) tensor(5.4474, grad_fn=<NllLossBackward0>)
178 LOSS DIFF: tensor(5.5547, grad_fn=<NllLossBackward0>) tensor(5.5172, grad_fn=<NllLossBackward0>)
179 LOSS DIFF: tensor(5.5271, grad_fn=<NllLossBackward0>) tensor(5.2485, grad_fn=<NllLossBackward0>)
180 LOSS DIFF: tensor(5.5400, grad_fn=<NllLossBackward0>) tensor(5.4519, grad_fn=<NllLossBackward0>)
181 LOSS DIFF: tensor(5.6702, grad_fn=<NllLossBackward0>) tensor(5.5037, grad_fn=<NllLossBackward0>)
182 LOSS DIFF: tensor(5.5462, grad_fn=<NllLossBackward0>) tensor(5.4319, grad_fn=<NllLossBackward0>)
183 LOSS DIFF: tensor(5.5346, grad_fn=<NllLossBackward0>) tensor(5.4046, grad_fn=<NllLossBackward0>)
184 LOSS DIFF: tensor(5.5779, grad_fn=<NllLossBackward0>) tensor(5.5096, grad_fn=<NllLossBackward0>)
185 LOSS DIFF: tensor(5.5979, grad_fn=<NllLossBackward0>) tensor(5.4310, grad_fn=<NllLossBackward0>)
186 LOSS DIFF: tensor(5.4231, grad_fn=<NllLossBackward0>) tensor(5.2371, grad_fn=<NllLossBackward0>)
187 LOSS DIFF: tensor(5.6120, grad_fn=<NllLossBackward0>) tensor(5.4231, grad_fn=<NllLossBackward0>)
188 LOSS DIFF: tensor(5.4934, grad_fn=<NllLossBackward0>) tensor(5.1333, grad_fn=<NllLossBackward0>)
189 LOSS DIFF: tensor(5.5445, grad_fn=<NllLossBackward0>) tensor(5.2967, grad_fn=<NllLossBackward0>)
190 LOSS DIFF: tensor(5.5506, grad_fn=<NllLossBackward0>) tensor(5.5445, grad_fn=<NllLossBackward0>)
191 LOSS DIFF: tensor(5.6374, grad_fn=<NllLossBackward0>) tensor(5.5506, grad_fn=<NllLossBackward0>)
400 tensor(5.5743, grad_fn=<NllLossBackward0>)
192 LOSS DIFF: tensor(5.6050, grad_fn=<NllLossBackward0>) tensor(5.5743, grad_fn=<NllLossBackward0>)
193 LOSS DIFF: tensor(5.5826, grad_fn=<NllLossBackward0>) tensor(5.3787, grad_fn=<NllLossBackward0>)
194 LOSS DIFF: tensor(5.5223, grad_fn=<NllLossBackward0>) tensor(5.3267, grad_fn=<NllLossBackward0>)
195 LOSS DIFF: tensor(5.4600, grad_fn=<NllLossBackward0>) tensor(5.4485, grad_fn=<NllLossBackward0>)
196 LOSS DIFF: tensor(5.5178, grad_fn=<NllLossBackward0>) tensor(5.4600, grad_fn=<NllLossBackward0>)
197 LOSS DIFF: tensor(5.5514, grad_fn=<NllLossBackward0>) tensor(5.2249, grad_fn=<NllLossBackward0>)
198 LOSS DIFF: tensor(5.5651, grad_fn=<NllLossBackward0>) tensor(5.4807, grad_fn=<NllLossBackward0>)
199 LOSS DIFF: tensor(5.4252, grad_fn=<NllLossBackward0>) tensor(5.1542, grad_fn=<NllLossBackward0>)
200 LOSS DIFF: tensor(5.6503, grad_fn=<NllLossBackward0>) tensor(5.4252, grad_fn=<NllLossBackward0>)
201 LOSS DIFF: tensor(5.5460, grad_fn=<NllLossBackward0>) tensor(5.3643, grad_fn=<NllLossBackward0>)
202 LOSS DIFF: tensor(5.7145, grad_fn=<NllLossBackward0>) tensor(5.4959, grad_fn=<NllLossBackward0>)
203 LOSS DIFF: tensor(5.4506, grad_fn=<NllLossBackward0>) tensor(5.4382, grad_fn=<NllLossBackward0>)
204 LOSS DIFF: tensor(5.5514, grad_fn=<NllLossBackward0>) tensor(5.4506, grad_fn=<NllLossBackward0>)
205 LOSS DIFF: tensor(5.5680, grad_fn=<NllLossBackward0>) tensor(5.5468, grad_fn=<NllLossBackward0>)
206 LOSS DIFF: tensor(5.5970, grad_fn=<NllLossBackward0>) tensor(5.5680, grad_fn=<NllLossBackward0>)
207 LOSS DIFF: tensor(5.6742, grad_fn=<NllLossBackward0>) tensor(5.5970, grad_fn=<NllLossBackward0>)
208 LOSS DIFF: tensor(5.5306, grad_fn=<NllLossBackward0>) tensor(5.2061, grad_fn=<NllLossBackward0>)
209 LOSS DIFF: tensor(5.7571, grad_fn=<NllLossBackward0>) tensor(5.5306, grad_fn=<NllLossBackward0>)
210 LOSS DIFF: tensor(5.6525, grad_fn=<NllLossBackward0>) tensor(5.3833, grad_fn=<NllLossBackward0>)
211 LOSS DIFF: tensor(5.5354, grad_fn=<NllLossBackward0>) tensor(5.3948, grad_fn=<NllLossBackward0>)
212 LOSS DIFF: tensor(5.5960, grad_fn=<NllLossBackward0>) tensor(5.5354, grad_fn=<NllLossBackward0>)
213 LOSS DIFF: tensor(5.7113, grad_fn=<NllLossBackward0>) tensor(5.5470, grad_fn=<NllLossBackward0>)
214 LOSS DIFF: tensor(5.4059, grad_fn=<NllLossBackward0>) tensor(5.3649, grad_fn=<NllLossBackward0>)
215 LOSS DIFF: tensor(5.4863, grad_fn=<NllLossBackward0>) tensor(5.4004, grad_fn=<NllLossBackward0>)
216 LOSS DIFF: tensor(5.5381, grad_fn=<NllLossBackward0>) tensor(5.4863, grad_fn=<NllLossBackward0>)
217 LOSS DIFF: tensor(5.3652, grad_fn=<NllLossBackward0>) tensor(5.3540, grad_fn=<NllLossBackward0>)
218 LOSS DIFF: tensor(5.3894, grad_fn=<NllLossBackward0>) tensor(5.1646, grad_fn=<NllLossBackward0>)
219 LOSS DIFF: tensor(5.6803, grad_fn=<NllLossBackward0>) tensor(5.3894, grad_fn=<NllLossBackward0>)
220 LOSS DIFF: tensor(5.6113, grad_fn=<NllLossBackward0>) tensor(5.4769, grad_fn=<NllLossBackward0>)
221 LOSS DIFF: tensor(5.6813, grad_fn=<NllLossBackward0>) tensor(5.2015, grad_fn=<NllLossBackward0>)
222 LOSS DIFF: tensor(5.3458, grad_fn=<NllLossBackward0>) tensor(5.2679, grad_fn=<NllLossBackward0>)
223 LOSS DIFF: tensor(5.2445, grad_fn=<NllLossBackward0>) tensor(5.1445, grad_fn=<NllLossBackward0>)
224 LOSS DIFF: tensor(5.6649, grad_fn=<NllLossBackward0>) tensor(5.2441, grad_fn=<NllLossBackward0>)
225 LOSS DIFF: tensor(5.8539, grad_fn=<NllLossBackward0>) tensor(5.6026, grad_fn=<NllLossBackward0>)
226 LOSS DIFF: tensor(5.4560, grad_fn=<NllLossBackward0>) tensor(5.4208, grad_fn=<NllLossBackward0>)
227 LOSS DIFF: tensor(5.5729, grad_fn=<NllLossBackward0>) tensor(5.4560, grad_fn=<NllLossBackward0>)
228 LOSS DIFF: tensor(5.5996, grad_fn=<NllLossBackward0>) tensor(5.3175, grad_fn=<NllLossBackward0>)
229 LOSS DIFF: tensor(5.6685, grad_fn=<NllLossBackward0>) tensor(5.2451, grad_fn=<NllLossBackward0>)
230 LOSS DIFF: tensor(5.5938, grad_fn=<NllLossBackward0>) tensor(5.4874, grad_fn=<NllLossBackward0>)
231 LOSS DIFF: tensor(5.6228, grad_fn=<NllLossBackward0>) tensor(5.2840, grad_fn=<NllLossBackward0>)
232 LOSS DIFF: tensor(5.3415, grad_fn=<NllLossBackward0>) tensor(5.3339, grad_fn=<NllLossBackward0>)
233 LOSS DIFF: tensor(5.3861, grad_fn=<NllLossBackward0>) tensor(5.1807, grad_fn=<NllLossBackward0>)
234 LOSS DIFF: tensor(5.4093, grad_fn=<NllLossBackward0>) tensor(5.3861, grad_fn=<NllLossBackward0>)
235 LOSS DIFF: tensor(5.6085, grad_fn=<NllLossBackward0>) tensor(5.4093, grad_fn=<NllLossBackward0>)
236 LOSS DIFF: tensor(5.3475, grad_fn=<NllLossBackward0>) tensor(5.1380, grad_fn=<NllLossBackward0>)
237 LOSS DIFF: tensor(5.6542, grad_fn=<NllLossBackward0>) tensor(5.3475, grad_fn=<NllLossBackward0>)
238 LOSS DIFF: tensor(5.6034, grad_fn=<NllLossBackward0>) tensor(5.2396, grad_fn=<NllLossBackward0>)
239 LOSS DIFF: tensor(5.5599, grad_fn=<NllLossBackward0>) tensor(5.2510, grad_fn=<NllLossBackward0>)
240 LOSS DIFF: tensor(5.4534, grad_fn=<NllLossBackward0>) tensor(5.3629, grad_fn=<NllLossBackward0>)
500 tensor(5.5447, grad_fn=<NllLossBackward0>)
241 LOSS DIFF: tensor(5.5447, grad_fn=<NllLossBackward0>) tensor(5.4534, grad_fn=<NllLossBackward0>)
242 LOSS DIFF: tensor(5.4929, grad_fn=<NllLossBackward0>) tensor(5.3445, grad_fn=<NllLossBackward0>)
243 LOSS DIFF: tensor(5.4963, grad_fn=<NllLossBackward0>) tensor(5.3411, grad_fn=<NllLossBackward0>)
244 LOSS DIFF: tensor(5.3306, grad_fn=<NllLossBackward0>) tensor(5.1341, grad_fn=<NllLossBackward0>)
245 LOSS DIFF: tensor(5.3853, grad_fn=<NllLossBackward0>) tensor(5.3306, grad_fn=<NllLossBackward0>)
246 LOSS DIFF: tensor(5.5949, grad_fn=<NllLossBackward0>) tensor(5.3853, grad_fn=<NllLossBackward0>)
247 LOSS DIFF: tensor(5.5202, grad_fn=<NllLossBackward0>) tensor(5.2283, grad_fn=<NllLossBackward0>)
248 LOSS DIFF: tensor(5.5862, grad_fn=<NllLossBackward0>) tensor(5.5202, grad_fn=<NllLossBackward0>)
249 LOSS DIFF: tensor(5.5425, grad_fn=<NllLossBackward0>) tensor(5.2707, grad_fn=<NllLossBackward0>)
250 LOSS DIFF: tensor(5.6233, grad_fn=<NllLossBackward0>) tensor(5.2300, grad_fn=<NllLossBackward0>)
251 LOSS DIFF: tensor(5.4803, grad_fn=<NllLossBackward0>) tensor(5.3777, grad_fn=<NllLossBackward0>)
252 LOSS DIFF: tensor(5.6414, grad_fn=<NllLossBackward0>) tensor(5.3601, grad_fn=<NllLossBackward0>)
253 LOSS DIFF: tensor(5.2371, grad_fn=<NllLossBackward0>) tensor(5.2364, grad_fn=<NllLossBackward0>)
254 LOSS DIFF: tensor(5.3186, grad_fn=<NllLossBackward0>) tensor(5.2371, grad_fn=<NllLossBackward0>)
255 LOSS DIFF: tensor(5.6731, grad_fn=<NllLossBackward0>) tensor(5.3186, grad_fn=<NllLossBackward0>)
256 LOSS DIFF: tensor(5.5774, grad_fn=<NllLossBackward0>) tensor(5.5003, grad_fn=<NllLossBackward0>)
257 LOSS DIFF: tensor(5.6139, grad_fn=<NllLossBackward0>) tensor(5.0909, grad_fn=<NllLossBackward0>)
258 LOSS DIFF: tensor(5.4975, grad_fn=<NllLossBackward0>) tensor(5.3252, grad_fn=<NllLossBackward0>)
259 LOSS DIFF: tensor(5.1695, grad_fn=<NllLossBackward0>) tensor(5.1682, grad_fn=<NllLossBackward0>)
260 LOSS DIFF: tensor(5.4441, grad_fn=<NllLossBackward0>) tensor(5.1695, grad_fn=<NllLossBackward0>)
261 LOSS DIFF: tensor(5.5408, grad_fn=<NllLossBackward0>) tensor(5.4441, grad_fn=<NllLossBackward0>)
262 LOSS DIFF: tensor(5.5618, grad_fn=<NllLossBackward0>) tensor(5.5408, grad_fn=<NllLossBackward0>)
263 LOSS DIFF: tensor(5.5545, grad_fn=<NllLossBackward0>) tensor(5.5457, grad_fn=<NllLossBackward0>)
264 LOSS DIFF: tensor(5.6082, grad_fn=<NllLossBackward0>) tensor(5.5545, grad_fn=<NllLossBackward0>)
265 LOSS DIFF: tensor(5.3351, grad_fn=<NllLossBackward0>) tensor(5.3258, grad_fn=<NllLossBackward0>)
266 LOSS DIFF: tensor(5.5028, grad_fn=<NllLossBackward0>) tensor(5.3351, grad_fn=<NllLossBackward0>)
267 LOSS DIFF: tensor(5.4873, grad_fn=<NllLossBackward0>) tensor(5.3415, grad_fn=<NllLossBackward0>)
268 LOSS DIFF: tensor(5.5458, grad_fn=<NllLossBackward0>) tensor(5.4873, grad_fn=<NllLossBackward0>)
269 LOSS DIFF: tensor(5.3706, grad_fn=<NllLossBackward0>) tensor(5.3371, grad_fn=<NllLossBackward0>)
270 LOSS DIFF: tensor(5.5207, grad_fn=<NllLossBackward0>) tensor(5.3706, grad_fn=<NllLossBackward0>)
271 LOSS DIFF: tensor(5.4275, grad_fn=<NllLossBackward0>) tensor(5.3686, grad_fn=<NllLossBackward0>)
272 LOSS DIFF: tensor(5.5256, grad_fn=<NllLossBackward0>) tensor(5.4275, grad_fn=<NllLossBackward0>)
273 LOSS DIFF: tensor(5.3044, grad_fn=<NllLossBackward0>) tensor(5.1722, grad_fn=<NllLossBackward0>)
274 LOSS DIFF: tensor(5.1798, grad_fn=<NllLossBackward0>) tensor(5.0866, grad_fn=<NllLossBackward0>)
275 LOSS DIFF: tensor(5.5159, grad_fn=<NllLossBackward0>) tensor(5.1798, grad_fn=<NllLossBackward0>)
276 LOSS DIFF: tensor(5.3755, grad_fn=<NllLossBackward0>) tensor(5.3404, grad_fn=<NllLossBackward0>)
277 LOSS DIFF: tensor(5.3817, grad_fn=<NllLossBackward0>) tensor(5.3755, grad_fn=<NllLossBackward0>)
278 LOSS DIFF: tensor(5.5214, grad_fn=<NllLossBackward0>) tensor(5.3817, grad_fn=<NllLossBackward0>)
279 LOSS DIFF: tensor(5.4231, grad_fn=<NllLossBackward0>) tensor(5.4104, grad_fn=<NllLossBackward0>)
280 LOSS DIFF: tensor(5.7068, grad_fn=<NllLossBackward0>) tensor(5.4231, grad_fn=<NllLossBackward0>)
281 LOSS DIFF: tensor(5.6217, grad_fn=<NllLossBackward0>) tensor(5.3672, grad_fn=<NllLossBackward0>)
282 LOSS DIFF: tensor(5.5297, grad_fn=<NllLossBackward0>) tensor(5.2592, grad_fn=<NllLossBackward0>)
283 LOSS DIFF: tensor(5.4354, grad_fn=<NllLossBackward0>) tensor(5.1583, grad_fn=<NllLossBackward0>)
284 LOSS DIFF: tensor(5.3529, grad_fn=<NllLossBackward0>) tensor(5.3227, grad_fn=<NllLossBackward0>)
285 LOSS DIFF: tensor(5.5201, grad_fn=<NllLossBackward0>) tensor(5.3529, grad_fn=<NllLossBackward0>)
286 LOSS DIFF: tensor(5.3654, grad_fn=<NllLossBackward0>) tensor(5.3083, grad_fn=<NllLossBackward0>)
287 LOSS DIFF: tensor(5.3719, grad_fn=<NllLossBackward0>) tensor(5.3654, grad_fn=<NllLossBackward0>)
288 LOSS DIFF: tensor(5.7598, grad_fn=<NllLossBackward0>) tensor(5.3256, grad_fn=<NllLossBackward0>)
289 LOSS DIFF: tensor(5.4723, grad_fn=<NllLossBackward0>) tensor(5.3773, grad_fn=<NllLossBackward0>)
600 tensor(5.1854, grad_fn=<NllLossBackward0>)
290 LOSS DIFF: tensor(5.2626, grad_fn=<NllLossBackward0>) tensor(5.1854, grad_fn=<NllLossBackward0>)
291 LOSS DIFF: tensor(5.3265, grad_fn=<NllLossBackward0>) tensor(5.2626, grad_fn=<NllLossBackward0>)
292 LOSS DIFF: tensor(5.3546, grad_fn=<NllLossBackward0>) tensor(5.3265, grad_fn=<NllLossBackward0>)
293 LOSS DIFF: tensor(5.4134, grad_fn=<NllLossBackward0>) tensor(5.3546, grad_fn=<NllLossBackward0>)
294 LOSS DIFF: tensor(5.3317, grad_fn=<NllLossBackward0>) tensor(5.3061, grad_fn=<NllLossBackward0>)
295 LOSS DIFF: tensor(5.5886, grad_fn=<NllLossBackward0>) tensor(5.3317, grad_fn=<NllLossBackward0>)
296 LOSS DIFF: tensor(5.2714, grad_fn=<NllLossBackward0>) tensor(5.2538, grad_fn=<NllLossBackward0>)
297 LOSS DIFF: tensor(5.4437, grad_fn=<NllLossBackward0>) tensor(5.2699, grad_fn=<NllLossBackward0>)
298 LOSS DIFF: tensor(5.4026, grad_fn=<NllLossBackward0>) tensor(5.3539, grad_fn=<NllLossBackward0>)
299 LOSS DIFF: tensor(5.5344, grad_fn=<NllLossBackward0>) tensor(5.4026, grad_fn=<NllLossBackward0>)
300 LOSS DIFF: tensor(5.2724, grad_fn=<NllLossBackward0>) tensor(5.1554, grad_fn=<NllLossBackward0>)
301 LOSS DIFF: tensor(5.4204, grad_fn=<NllLossBackward0>) tensor(5.2614, grad_fn=<NllLossBackward0>)
302 LOSS DIFF: tensor(5.5588, grad_fn=<NllLossBackward0>) tensor(5.4204, grad_fn=<NllLossBackward0>)
303 LOSS DIFF: tensor(5.4821, grad_fn=<NllLossBackward0>) tensor(5.2939, grad_fn=<NllLossBackward0>)
304 LOSS DIFF: tensor(5.5529, grad_fn=<NllLossBackward0>) tensor(5.4821, grad_fn=<NllLossBackward0>)
305 LOSS DIFF: tensor(5.5659, grad_fn=<NllLossBackward0>) tensor(5.5529, grad_fn=<NllLossBackward0>)
306 LOSS DIFF: tensor(5.3128, grad_fn=<NllLossBackward0>) tensor(5.1975, grad_fn=<NllLossBackward0>)
307 LOSS DIFF: tensor(5.4044, grad_fn=<NllLossBackward0>) tensor(5.2514, grad_fn=<NllLossBackward0>)
308 LOSS DIFF: tensor(5.5461, grad_fn=<NllLossBackward0>) tensor(5.4044, grad_fn=<NllLossBackward0>)
309 LOSS DIFF: tensor(5.4835, grad_fn=<NllLossBackward0>) tensor(5.4153, grad_fn=<NllLossBackward0>)
310 LOSS DIFF: tensor(5.4990, grad_fn=<NllLossBackward0>) tensor(5.3391, grad_fn=<NllLossBackward0>)
311 LOSS DIFF: tensor(5.5111, grad_fn=<NllLossBackward0>) tensor(5.4990, grad_fn=<NllLossBackward0>)
312 LOSS DIFF: tensor(5.4828, grad_fn=<NllLossBackward0>) tensor(5.3784, grad_fn=<NllLossBackward0>)
313 LOSS DIFF: tensor(5.4165, grad_fn=<NllLossBackward0>) tensor(5.0706, grad_fn=<NllLossBackward0>)
314 LOSS DIFF: tensor(5.5142, grad_fn=<NllLossBackward0>) tensor(5.4165, grad_fn=<NllLossBackward0>)
315 LOSS DIFF: tensor(5.3397, grad_fn=<NllLossBackward0>) tensor(5.1207, grad_fn=<NllLossBackward0>)
316 LOSS DIFF: tensor(5.6205, grad_fn=<NllLossBackward0>) tensor(5.3397, grad_fn=<NllLossBackward0>)
317 LOSS DIFF: tensor(5.4190, grad_fn=<NllLossBackward0>) tensor(5.3573, grad_fn=<NllLossBackward0>)
318 LOSS DIFF: tensor(5.2788, grad_fn=<NllLossBackward0>) tensor(5.2728, grad_fn=<NllLossBackward0>)
319 LOSS DIFF: tensor(5.3070, grad_fn=<NllLossBackward0>) tensor(5.2788, grad_fn=<NllLossBackward0>)
320 LOSS DIFF: tensor(5.5223, grad_fn=<NllLossBackward0>) tensor(5.3070, grad_fn=<NllLossBackward0>)
321 LOSS DIFF: tensor(5.3895, grad_fn=<NllLossBackward0>) tensor(5.2946, grad_fn=<NllLossBackward0>)
322 LOSS DIFF: tensor(5.6954, grad_fn=<NllLossBackward0>) tensor(5.2766, grad_fn=<NllLossBackward0>)
323 LOSS DIFF: tensor(5.3206, grad_fn=<NllLossBackward0>) tensor(5.2566, grad_fn=<NllLossBackward0>)
324 LOSS DIFF: tensor(5.4333, grad_fn=<NllLossBackward0>) tensor(5.1247, grad_fn=<NllLossBackward0>)
325 LOSS DIFF: tensor(5.5108, grad_fn=<NllLossBackward0>) tensor(5.2871, grad_fn=<NllLossBackward0>)
326 LOSS DIFF: tensor(5.3659, grad_fn=<NllLossBackward0>) tensor(5.2939, grad_fn=<NllLossBackward0>)
327 LOSS DIFF: tensor(5.4602, grad_fn=<NllLossBackward0>) tensor(5.2214, grad_fn=<NllLossBackward0>)
328 LOSS DIFF: tensor(5.1405, grad_fn=<NllLossBackward0>) tensor(4.9549, grad_fn=<NllLossBackward0>)
329 LOSS DIFF: tensor(5.4136, grad_fn=<NllLossBackward0>) tensor(4.9053, grad_fn=<NllLossBackward0>)
330 LOSS DIFF: tensor(5.7120, grad_fn=<NllLossBackward0>) tensor(5.2294, grad_fn=<NllLossBackward0>)
331 LOSS DIFF: tensor(5.4775, grad_fn=<NllLossBackward0>) tensor(5.3224, grad_fn=<NllLossBackward0>)
332 LOSS DIFF: tensor(5.2917, grad_fn=<NllLossBackward0>) tensor(5.1672, grad_fn=<NllLossBackward0>)
333 LOSS DIFF: tensor(5.3209, grad_fn=<NllLossBackward0>) tensor(5.2917, grad_fn=<NllLossBackward0>)
334 LOSS DIFF: tensor(5.3745, grad_fn=<NllLossBackward0>) tensor(5.3209, grad_fn=<NllLossBackward0>)
335 LOSS DIFF: tensor(5.4889, grad_fn=<NllLossBackward0>) tensor(5.3172, grad_fn=<NllLossBackward0>)
336 LOSS DIFF: tensor(5.3614, grad_fn=<NllLossBackward0>) tensor(5.2868, grad_fn=<NllLossBackward0>)
337 LOSS DIFF: tensor(5.4456, grad_fn=<NllLossBackward0>) tensor(5.3614, grad_fn=<NllLossBackward0>)
338 LOSS DIFF: tensor(5.3012, grad_fn=<NllLossBackward0>) tensor(5.2641, grad_fn=<NllLossBackward0>)
339 LOSS DIFF: tensor(5.5309, grad_fn=<NllLossBackward0>) tensor(5.3012, grad_fn=<NllLossBackward0>)
340 LOSS DIFF: tensor(5.2953, grad_fn=<NllLossBackward0>) tensor(5.1931, grad_fn=<NllLossBackward0>)
341 LOSS DIFF: tensor(5.3908, grad_fn=<NllLossBackward0>) tensor(5.2953, grad_fn=<NllLossBackward0>)
342 LOSS DIFF: tensor(5.5060, grad_fn=<NllLossBackward0>) tensor(5.1682, grad_fn=<NllLossBackward0>)
700 tensor(5.1404, grad_fn=<NllLossBackward0>)
343 LOSS DIFF: tensor(5.3184, grad_fn=<NllLossBackward0>) tensor(4.8281, grad_fn=<NllLossBackward0>)
344 LOSS DIFF: tensor(5.4549, grad_fn=<NllLossBackward0>) tensor(5.3184, grad_fn=<NllLossBackward0>)
345 LOSS DIFF: tensor(5.4196, grad_fn=<NllLossBackward0>) tensor(5.4127, grad_fn=<NllLossBackward0>)
346 LOSS DIFF: tensor(5.4480, grad_fn=<NllLossBackward0>) tensor(5.4196, grad_fn=<NllLossBackward0>)
347 LOSS DIFF: tensor(5.5778, grad_fn=<NllLossBackward0>) tensor(5.3616, grad_fn=<NllLossBackward0>)
348 LOSS DIFF: tensor(5.2266, grad_fn=<NllLossBackward0>) tensor(5.1052, grad_fn=<NllLossBackward0>)
349 LOSS DIFF: tensor(5.4058, grad_fn=<NllLossBackward0>) tensor(5.2266, grad_fn=<NllLossBackward0>)
350 LOSS DIFF: tensor(5.2772, grad_fn=<NllLossBackward0>) tensor(5.1653, grad_fn=<NllLossBackward0>)
351 LOSS DIFF: tensor(5.3236, grad_fn=<NllLossBackward0>) tensor(5.2772, grad_fn=<NllLossBackward0>)
352 LOSS DIFF: tensor(5.3818, grad_fn=<NllLossBackward0>) tensor(5.3236, grad_fn=<NllLossBackward0>)
353 LOSS DIFF: tensor(5.1957, grad_fn=<NllLossBackward0>) tensor(5.1122, grad_fn=<NllLossBackward0>)
354 LOSS DIFF: tensor(5.2754, grad_fn=<NllLossBackward0>) tensor(5.1957, grad_fn=<NllLossBackward0>)
355 LOSS DIFF: tensor(5.4069, grad_fn=<NllLossBackward0>) tensor(5.2754, grad_fn=<NllLossBackward0>)
356 LOSS DIFF: tensor(5.3361, grad_fn=<NllLossBackward0>) tensor(5.1708, grad_fn=<NllLossBackward0>)
357 LOSS DIFF: tensor(5.5310, grad_fn=<NllLossBackward0>) tensor(5.2320, grad_fn=<NllLossBackward0>)
358 LOSS DIFF: tensor(5.5582, grad_fn=<NllLossBackward0>) tensor(5.3281, grad_fn=<NllLossBackward0>)
359 LOSS DIFF: tensor(5.4403, grad_fn=<NllLossBackward0>) tensor(5.0958, grad_fn=<NllLossBackward0>)
360 LOSS DIFF: tensor(5.3855, grad_fn=<NllLossBackward0>) tensor(5.3547, grad_fn=<NllLossBackward0>)
361 LOSS DIFF: tensor(5.4341, grad_fn=<NllLossBackward0>) tensor(5.3628, grad_fn=<NllLossBackward0>)
362 LOSS DIFF: tensor(5.4064, grad_fn=<NllLossBackward0>) tensor(5.3641, grad_fn=<NllLossBackward0>)
363 LOSS DIFF: tensor(5.4232, grad_fn=<NllLossBackward0>) tensor(5.4064, grad_fn=<NllLossBackward0>)
364 LOSS DIFF: tensor(5.4929, grad_fn=<NllLossBackward0>) tensor(5.2922, grad_fn=<NllLossBackward0>)
365 LOSS DIFF: tensor(5.2788, grad_fn=<NllLossBackward0>) tensor(5.1483, grad_fn=<NllLossBackward0>)
366 LOSS DIFF: tensor(5.3894, grad_fn=<NllLossBackward0>) tensor(5.1464, grad_fn=<NllLossBackward0>)
367 LOSS DIFF: tensor(5.5410, grad_fn=<NllLossBackward0>) tensor(5.3032, grad_fn=<NllLossBackward0>)
368 LOSS DIFF: tensor(5.4745, grad_fn=<NllLossBackward0>) tensor(5.3954, grad_fn=<NllLossBackward0>)
369 LOSS DIFF: tensor(5.4002, grad_fn=<NllLossBackward0>) tensor(5.2852, grad_fn=<NllLossBackward0>)
370 LOSS DIFF: tensor(5.5121, grad_fn=<NllLossBackward0>) tensor(5.1010, grad_fn=<NllLossBackward0>)
371 LOSS DIFF: tensor(5.1770, grad_fn=<NllLossBackward0>) tensor(4.9924, grad_fn=<NllLossBackward0>)
372 LOSS DIFF: tensor(5.2602, grad_fn=<NllLossBackward0>) tensor(5.0630, grad_fn=<NllLossBackward0>)
373 LOSS DIFF: tensor(5.1854, grad_fn=<NllLossBackward0>) tensor(5.1847, grad_fn=<NllLossBackward0>)
374 LOSS DIFF: tensor(5.4752, grad_fn=<NllLossBackward0>) tensor(5.1854, grad_fn=<NllLossBackward0>)
375 LOSS DIFF: tensor(5.3940, grad_fn=<NllLossBackward0>) tensor(4.9471, grad_fn=<NllLossBackward0>)
376 LOSS DIFF: tensor(5.4444, grad_fn=<NllLossBackward0>) tensor(5.3940, grad_fn=<NllLossBackward0>)
377 LOSS DIFF: tensor(5.2639, grad_fn=<NllLossBackward0>) tensor(5.2434, grad_fn=<NllLossBackward0>)
378 LOSS DIFF: tensor(5.5010, grad_fn=<NllLossBackward0>) tensor(5.2639, grad_fn=<NllLossBackward0>)
379 LOSS DIFF: tensor(5.3871, grad_fn=<NllLossBackward0>) tensor(5.2697, grad_fn=<NllLossBackward0>)
380 LOSS DIFF: tensor(5.5319, grad_fn=<NllLossBackward0>) tensor(5.2951, grad_fn=<NllLossBackward0>)
381 LOSS DIFF: tensor(5.2672, grad_fn=<NllLossBackward0>) tensor(5.0885, grad_fn=<NllLossBackward0>)
382 LOSS DIFF: tensor(5.3262, grad_fn=<NllLossBackward0>) tensor(5.2672, grad_fn=<NllLossBackward0>)
383 LOSS DIFF: tensor(5.4015, grad_fn=<NllLossBackward0>) tensor(5.3262, grad_fn=<NllLossBackward0>)
384 LOSS DIFF: tensor(5.2618, grad_fn=<NllLossBackward0>) tensor(5.2335, grad_fn=<NllLossBackward0>)
385 LOSS DIFF: tensor(5.3040, grad_fn=<NllLossBackward0>) tensor(5.2618, grad_fn=<NllLossBackward0>)
386 LOSS DIFF: tensor(5.2459, grad_fn=<NllLossBackward0>) tensor(5.0806, grad_fn=<NllLossBackward0>)
387 LOSS DIFF: tensor(5.3756, grad_fn=<NllLossBackward0>) tensor(5.2459, grad_fn=<NllLossBackward0>)
388 LOSS DIFF: tensor(5.3504, grad_fn=<NllLossBackward0>) tensor(5.1054, grad_fn=<NllLossBackward0>)
389 LOSS DIFF: tensor(5.2258, grad_fn=<NllLossBackward0>) tensor(5.1519, grad_fn=<NllLossBackward0>)
390 LOSS DIFF: tensor(5.2802, grad_fn=<NllLossBackward0>) tensor(5.2258, grad_fn=<NllLossBackward0>)
391 LOSS DIFF: tensor(5.3461, grad_fn=<NllLossBackward0>) tensor(5.2802, grad_fn=<NllLossBackward0>)
392 LOSS DIFF: tensor(5.3227, grad_fn=<NllLossBackward0>) tensor(5.2572, grad_fn=<NllLossBackward0>)
800 tensor(5.1938, grad_fn=<NllLossBackward0>)
393 LOSS DIFF: tensor(5.4509, grad_fn=<NllLossBackward0>) tensor(5.1938, grad_fn=<NllLossBackward0>)
394 LOSS DIFF: tensor(5.1965, grad_fn=<NllLossBackward0>) tensor(5.1726, grad_fn=<NllLossBackward0>)
395 LOSS DIFF: tensor(5.3317, grad_fn=<NllLossBackward0>) tensor(5.1965, grad_fn=<NllLossBackward0>)
396 LOSS DIFF: tensor(5.2442, grad_fn=<NllLossBackward0>) tensor(5.0167, grad_fn=<NllLossBackward0>)
397 LOSS DIFF: tensor(5.2592, grad_fn=<NllLossBackward0>) tensor(5.2442, grad_fn=<NllLossBackward0>)
398 LOSS DIFF: tensor(5.2272, grad_fn=<NllLossBackward0>) tensor(5.1738, grad_fn=<NllLossBackward0>)
399 LOSS DIFF: tensor(5.2863, grad_fn=<NllLossBackward0>) tensor(5.2272, grad_fn=<NllLossBackward0>)
400 LOSS DIFF: tensor(5.3143, grad_fn=<NllLossBackward0>) tensor(5.2863, grad_fn=<NllLossBackward0>)
401 LOSS DIFF: tensor(5.0616, grad_fn=<NllLossBackward0>) tensor(5.0013, grad_fn=<NllLossBackward0>)
402 LOSS DIFF: tensor(5.4039, grad_fn=<NllLossBackward0>) tensor(5.0616, grad_fn=<NllLossBackward0>)
403 LOSS DIFF: tensor(5.3913, grad_fn=<NllLossBackward0>) tensor(4.9984, grad_fn=<NllLossBackward0>)
404 LOSS DIFF: tensor(5.2658, grad_fn=<NllLossBackward0>) tensor(5.2179, grad_fn=<NllLossBackward0>)
405 LOSS DIFF: tensor(5.2846, grad_fn=<NllLossBackward0>) tensor(5.2658, grad_fn=<NllLossBackward0>)
406 LOSS DIFF: tensor(5.3590, grad_fn=<NllLossBackward0>) tensor(5.2846, grad_fn=<NllLossBackward0>)
407 LOSS DIFF: tensor(5.4706, grad_fn=<NllLossBackward0>) tensor(5.0496, grad_fn=<NllLossBackward0>)
408 LOSS DIFF: tensor(5.6955, grad_fn=<NllLossBackward0>) tensor(5.4706, grad_fn=<NllLossBackward0>)
409 LOSS DIFF: tensor(5.4540, grad_fn=<NllLossBackward0>) tensor(4.9054, grad_fn=<NllLossBackward0>)
410 LOSS DIFF: tensor(5.1788, grad_fn=<NllLossBackward0>) tensor(5.0048, grad_fn=<NllLossBackward0>)
411 LOSS DIFF: tensor(5.2213, grad_fn=<NllLossBackward0>) tensor(5.1788, grad_fn=<NllLossBackward0>)
412 LOSS DIFF: tensor(5.2282, grad_fn=<NllLossBackward0>) tensor(5.2213, grad_fn=<NllLossBackward0>)
413 LOSS DIFF: tensor(5.4138, grad_fn=<NllLossBackward0>) tensor(5.1972, grad_fn=<NllLossBackward0>)
414 LOSS DIFF: tensor(5.3300, grad_fn=<NllLossBackward0>) tensor(4.9654, grad_fn=<NllLossBackward0>)
415 LOSS DIFF: tensor(5.0692, grad_fn=<NllLossBackward0>) tensor(4.9775, grad_fn=<NllLossBackward0>)
416 LOSS DIFF: tensor(5.1780, grad_fn=<NllLossBackward0>) tensor(5.0692, grad_fn=<NllLossBackward0>)
417 LOSS DIFF: tensor(5.4131, grad_fn=<NllLossBackward0>) tensor(5.1780, grad_fn=<NllLossBackward0>)
418 LOSS DIFF: tensor(5.5625, grad_fn=<NllLossBackward0>) tensor(5.4131, grad_fn=<NllLossBackward0>)
419 LOSS DIFF: tensor(5.1862, grad_fn=<NllLossBackward0>) tensor(5.1502, grad_fn=<NllLossBackward0>)
420 LOSS DIFF: tensor(5.2858, grad_fn=<NllLossBackward0>) tensor(5.1862, grad_fn=<NllLossBackward0>)
421 LOSS DIFF: tensor(5.2607, grad_fn=<NllLossBackward0>) tensor(5.2394, grad_fn=<NllLossBackward0>)
422 LOSS DIFF: tensor(5.4085, grad_fn=<NllLossBackward0>) tensor(5.2607, grad_fn=<NllLossBackward0>)
423 LOSS DIFF: tensor(5.3268, grad_fn=<NllLossBackward0>) tensor(5.3040, grad_fn=<NllLossBackward0>)
424 LOSS DIFF: tensor(5.4477, grad_fn=<NllLossBackward0>) tensor(5.3268, grad_fn=<NllLossBackward0>)
425 LOSS DIFF: tensor(5.3032, grad_fn=<NllLossBackward0>) tensor(5.2228, grad_fn=<NllLossBackward0>)
426 LOSS DIFF: tensor(5.4339, grad_fn=<NllLossBackward0>) tensor(5.2517, grad_fn=<NllLossBackward0>)
427 LOSS DIFF: tensor(5.3693, grad_fn=<NllLossBackward0>) tensor(5.0677, grad_fn=<NllLossBackward0>)
428 LOSS DIFF: tensor(5.2379, grad_fn=<NllLossBackward0>) tensor(5.2100, grad_fn=<NllLossBackward0>)
429 LOSS DIFF: tensor(5.2541, grad_fn=<NllLossBackward0>) tensor(5.2379, grad_fn=<NllLossBackward0>)
430 LOSS DIFF: tensor(5.2259, grad_fn=<NllLossBackward0>) tensor(5.1291, grad_fn=<NllLossBackward0>)
431 LOSS DIFF: tensor(5.2455, grad_fn=<NllLossBackward0>) tensor(5.1523, grad_fn=<NllLossBackward0>)
432 LOSS DIFF: tensor(5.3854, grad_fn=<NllLossBackward0>) tensor(5.2147, grad_fn=<NllLossBackward0>)
433 LOSS DIFF: tensor(5.2580, grad_fn=<NllLossBackward0>) tensor(5.1674, grad_fn=<NllLossBackward0>)
434 LOSS DIFF: tensor(5.3666, grad_fn=<NllLossBackward0>) tensor(5.2580, grad_fn=<NllLossBackward0>)
435 LOSS DIFF: tensor(5.3990, grad_fn=<NllLossBackward0>) tensor(5.2895, grad_fn=<NllLossBackward0>)
436 LOSS DIFF: tensor(5.4095, grad_fn=<NllLossBackward0>) tensor(5.2050, grad_fn=<NllLossBackward0>)
437 LOSS DIFF: tensor(5.3580, grad_fn=<NllLossBackward0>) tensor(5.1551, grad_fn=<NllLossBackward0>)
438 LOSS DIFF: tensor(5.5038, grad_fn=<NllLossBackward0>) tensor(5.2894, grad_fn=<NllLossBackward0>)
439 LOSS DIFF: tensor(5.3097, grad_fn=<NllLossBackward0>) tensor(5.1047, grad_fn=<NllLossBackward0>)
440 LOSS DIFF: tensor(5.4076, grad_fn=<NllLossBackward0>) tensor(5.3097, grad_fn=<NllLossBackward0>)
441 LOSS DIFF: tensor(5.3938, grad_fn=<NllLossBackward0>) tensor(5.2490, grad_fn=<NllLossBackward0>)
442 LOSS DIFF: tensor(5.6185, grad_fn=<NllLossBackward0>) tensor(5.3873, grad_fn=<NllLossBackward0>)
900 tensor(5.2894, grad_fn=<NllLossBackward0>)
443 LOSS DIFF: tensor(5.2605, grad_fn=<NllLossBackward0>) tensor(5.0513, grad_fn=<NllLossBackward0>)
444 LOSS DIFF: tensor(5.5549, grad_fn=<NllLossBackward0>) tensor(5.2605, grad_fn=<NllLossBackward0>)
445 LOSS DIFF: tensor(5.1775, grad_fn=<NllLossBackward0>) tensor(5.1379, grad_fn=<NllLossBackward0>)
446 LOSS DIFF: tensor(5.3998, grad_fn=<NllLossBackward0>) tensor(5.1775, grad_fn=<NllLossBackward0>)
447 LOSS DIFF: tensor(5.4069, grad_fn=<NllLossBackward0>) tensor(5.3169, grad_fn=<NllLossBackward0>)
448 LOSS DIFF: tensor(5.2558, grad_fn=<NllLossBackward0>) tensor(4.9919, grad_fn=<NllLossBackward0>)
449 LOSS DIFF: tensor(5.4139, grad_fn=<NllLossBackward0>) tensor(5.2558, grad_fn=<NllLossBackward0>)
450 LOSS DIFF: tensor(5.4725, grad_fn=<NllLossBackward0>) tensor(5.4139, grad_fn=<NllLossBackward0>)
451 LOSS DIFF: tensor(5.3004, grad_fn=<NllLossBackward0>) tensor(5.1489, grad_fn=<NllLossBackward0>)
452 LOSS DIFF: tensor(5.3943, grad_fn=<NllLossBackward0>) tensor(5.3004, grad_fn=<NllLossBackward0>)
453 LOSS DIFF: tensor(5.2652, grad_fn=<NllLossBackward0>) tensor(5.0230, grad_fn=<NllLossBackward0>)
454 LOSS DIFF: tensor(5.3982, grad_fn=<NllLossBackward0>) tensor(5.2229, grad_fn=<NllLossBackward0>)
455 LOSS DIFF: tensor(5.4184, grad_fn=<NllLossBackward0>) tensor(5.2137, grad_fn=<NllLossBackward0>)
456 LOSS DIFF: tensor(5.6858, grad_fn=<NllLossBackward0>) tensor(5.1474, grad_fn=<NllLossBackward0>)
457 LOSS DIFF: tensor(5.3886, grad_fn=<NllLossBackward0>) tensor(5.1649, grad_fn=<NllLossBackward0>)
458 LOSS DIFF: tensor(5.3129, grad_fn=<NllLossBackward0>) tensor(5.2705, grad_fn=<NllLossBackward0>)
459 LOSS DIFF: tensor(5.4430, grad_fn=<NllLossBackward0>) tensor(5.0307, grad_fn=<NllLossBackward0>)
460 LOSS DIFF: tensor(5.4555, grad_fn=<NllLossBackward0>) tensor(5.3132, grad_fn=<NllLossBackward0>)
461 LOSS DIFF: tensor(5.2490, grad_fn=<NllLossBackward0>) tensor(4.9971, grad_fn=<NllLossBackward0>)
462 LOSS DIFF: tensor(5.4743, grad_fn=<NllLossBackward0>) tensor(5.1878, grad_fn=<NllLossBackward0>)
463 LOSS DIFF: tensor(5.2897, grad_fn=<NllLossBackward0>) tensor(4.9685, grad_fn=<NllLossBackward0>)
464 LOSS DIFF: tensor(5.3322, grad_fn=<NllLossBackward0>) tensor(5.1790, grad_fn=<NllLossBackward0>)
465 LOSS DIFF: tensor(5.2013, grad_fn=<NllLossBackward0>) tensor(5.0778, grad_fn=<NllLossBackward0>)
466 LOSS DIFF: tensor(5.2347, grad_fn=<NllLossBackward0>) tensor(5.0395, grad_fn=<NllLossBackward0>)
467 LOSS DIFF: tensor(5.2472, grad_fn=<NllLossBackward0>) tensor(5.2347, grad_fn=<NllLossBackward0>)
468 LOSS DIFF: tensor(5.3672, grad_fn=<NllLossBackward0>) tensor(5.1695, grad_fn=<NllLossBackward0>)
469 LOSS DIFF: tensor(5.3892, grad_fn=<NllLossBackward0>) tensor(5.3672, grad_fn=<NllLossBackward0>)
470 LOSS DIFF: tensor(5.1295, grad_fn=<NllLossBackward0>) tensor(5.1241, grad_fn=<NllLossBackward0>)
471 LOSS DIFF: tensor(5.2935, grad_fn=<NllLossBackward0>) tensor(5.1295, grad_fn=<NllLossBackward0>)
472 LOSS DIFF: tensor(5.4916, grad_fn=<NllLossBackward0>) tensor(5.2935, grad_fn=<NllLossBackward0>)
473 LOSS DIFF: tensor(5.2570, grad_fn=<NllLossBackward0>) tensor(5.0166, grad_fn=<NllLossBackward0>)
474 LOSS DIFF: tensor(5.3124, grad_fn=<NllLossBackward0>) tensor(5.1387, grad_fn=<NllLossBackward0>)
475 LOSS DIFF: tensor(5.2445, grad_fn=<NllLossBackward0>) tensor(5.1581, grad_fn=<NllLossBackward0>)
476 LOSS DIFF: tensor(5.4986, grad_fn=<NllLossBackward0>) tensor(5.2445, grad_fn=<NllLossBackward0>)
477 LOSS DIFF: tensor(5.2073, grad_fn=<NllLossBackward0>) tensor(5.1772, grad_fn=<NllLossBackward0>)
478 LOSS DIFF: tensor(5.2213, grad_fn=<NllLossBackward0>) tensor(5.0682, grad_fn=<NllLossBackward0>)
479 LOSS DIFF: tensor(5.2317, grad_fn=<NllLossBackward0>) tensor(5.2213, grad_fn=<NllLossBackward0>)
480 LOSS DIFF: tensor(5.2169, grad_fn=<NllLossBackward0>) tensor(4.8229, grad_fn=<NllLossBackward0>)
481 LOSS DIFF: tensor(5.4192, grad_fn=<NllLossBackward0>) tensor(5.2169, grad_fn=<NllLossBackward0>)
482 LOSS DIFF: tensor(5.3481, grad_fn=<NllLossBackward0>) tensor(5.1884, grad_fn=<NllLossBackward0>)
483 LOSS DIFF: tensor(5.4329, grad_fn=<NllLossBackward0>) tensor(5.3481, grad_fn=<NllLossBackward0>)
484 LOSS DIFF: tensor(5.1482, grad_fn=<NllLossBackward0>) tensor(4.8979, grad_fn=<NllLossBackward0>)
485 LOSS DIFF: tensor(5.3562, grad_fn=<NllLossBackward0>) tensor(5.1482, grad_fn=<NllLossBackward0>)
486 LOSS DIFF: tensor(5.5739, grad_fn=<NllLossBackward0>) tensor(5.3562, grad_fn=<NllLossBackward0>)
487 LOSS DIFF: tensor(5.0749, grad_fn=<NllLossBackward0>) tensor(4.9742, grad_fn=<NllLossBackward0>)
488 LOSS DIFF: tensor(5.2301, grad_fn=<NllLossBackward0>) tensor(5.0749, grad_fn=<NllLossBackward0>)
489 LOSS DIFF: tensor(5.4543, grad_fn=<NllLossBackward0>) tensor(5.2301, grad_fn=<NllLossBackward0>)
490 LOSS DIFF: tensor(5.2210, grad_fn=<NllLossBackward0>) tensor(4.9663, grad_fn=<NllLossBackward0>)
491 LOSS DIFF: tensor(5.3469, grad_fn=<NllLossBackward0>) tensor(5.2210, grad_fn=<NllLossBackward0>)
1000 tensor(5.4116, grad_fn=<NllLossBackward0>)
492 LOSS DIFF: tensor(5.4116, grad_fn=<NllLossBackward0>) tensor(5.2156, grad_fn=<NllLossBackward0>)
493 LOSS DIFF: tensor(5.1600, grad_fn=<NllLossBackward0>) tensor(4.9976, grad_fn=<NllLossBackward0>)
494 LOSS DIFF: tensor(5.2190, grad_fn=<NllLossBackward0>) tensor(5.1102, grad_fn=<NllLossBackward0>)
495 LOSS DIFF: tensor(5.1974, grad_fn=<NllLossBackward0>) tensor(5.0123, grad_fn=<NllLossBackward0>)
496 LOSS DIFF: tensor(5.3085, grad_fn=<NllLossBackward0>) tensor(5.1974, grad_fn=<NllLossBackward0>)
497 LOSS DIFF: tensor(5.3090, grad_fn=<NllLossBackward0>) tensor(5.3085, grad_fn=<NllLossBackward0>)
498 LOSS DIFF: tensor(5.3978, grad_fn=<NllLossBackward0>) tensor(5.0467, grad_fn=<NllLossBackward0>)
499 LOSS DIFF: tensor(5.3369, grad_fn=<NllLossBackward0>) tensor(5.0919, grad_fn=<NllLossBackward0>)
500 LOSS DIFF: tensor(5.3036, grad_fn=<NllLossBackward0>) tensor(5.2151, grad_fn=<NllLossBackward0>)
plt.plot([t.detach().numpy() for t in loss_track])
plt.show()
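# Hedged extra (not in the original notebook): since the loss is a mean negative
# log-likelihood, exp(loss) is the per-batch perplexity, which is often easier
# to read than the raw loss curve.
plt.plot([torch.exp(t.detach()).item() for t in loss_track])
plt.ylabel('perplexity')
plt.show()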
torch.save(model.state_dict(), f'model_trigram-EMBED_SIZE={EMBED_SIZE}.bin')
vocab_unique = set(vocab.get_stoi().keys())
output = []
with lzma.open("dev-0/in.tsv.xz", encoding='utf8', mode="rt") as file:
    for line in file:
        line = line.split("\t")

        first_word = re.sub(r"\\\\+n", " ", line[-2]).split()[-1]
        first_word = re.sub('[^A-Za-z]+', '', first_word)

        second_word = re.sub(r"\\\\+n", " ", line[-1]).split()[0]
        second_word = re.sub('[^A-Za-z]+', '', second_word)

        # back off to '<unk>' when a neighbouring word is out of vocabulary
        if first_word not in vocab_unique:
            first_word = "<unk>"
        if second_word not in vocab_unique:
            second_word = "<unk>"

        input_tokens = torch.tensor([vocab.forward([first_word]), vocab.forward([second_word])]).to(device)
        out = model(input_tokens)

        top = torch.topk(out[0], 10)
        top_indices = top.indices.tolist()
        top_probs = top.values.tolist()
        unk_bonus = 1 - sum(top_probs)
        top_words = vocab.lookup_tokens(top_indices)
        top_zipped = list(zip(top_words, top_probs))

        res = ""
        for w, p in top_zipped:
            if w == "<unk>":
                res += f":{(p + unk_bonus):.4f} "
            else:
                res += f"{w}:{p:.4f} "
        
        res = res[:-1]
        res += "\n"
        output.append(res)

with open(f"dev-0/out-EMBED_SIZE={EMBED_SIZE}.tsv", mode="w") as file:
    file.writelines(output)
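# For reference (derived from the code above): each output line lists the ten
# most probable words as space-separated "word:prob" pairs; when '<unk>' is
# among them it is written as ":prob" with the remaining probability mass
# folded into it.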
model.eval()

output = []
with lzma.open("test-A/in.tsv.xz", encoding='utf8', mode="rt") as file:
    for line in file:
        line = line.split("\t")

        first_word = re.sub(r"\\\\+n", " ", line[-2]).split()[-1]
        first_word = re.sub('[^A-Za-z]+', '', first_word)

        second_word = re.sub(r"\\\\+n", " ", line[-1]).split()[0]
        second_word = re.sub('[^A-Za-z]+', '', second_word)

        # back off to '<unk>' when a neighbouring word is out of vocabulary
        if first_word not in vocab_unique:
            first_word = "<unk>"
        if second_word not in vocab_unique:
            second_word = "<unk>"

        input_tokens = torch.tensor([vocab.forward([first_word]), vocab.forward([second_word])]).to(device)
        out = model(input_tokens)

        top = torch.topk(out[0], 10)
        top_indices = top.indices.tolist()
        top_probs = top.values.tolist()
        unk_bonus = 1 - sum(top_probs)
        top_words = vocab.lookup_tokens(top_indices)
        top_zipped = list(zip(top_words, top_probs))

        res = ""
        for w, p in top_zipped:
            if w == "<unk>":
                res += f":{(p + unk_bonus):.4f} "
            else:
                res += f"{w}:{p:.4f} "
        
        res = res[:-1]
        res += "\n"
        output.append(res)

with open(f"test-A/out-EMBED_SIZE={EMBED_SIZE}.tsv", mode="w") as file:
    file.writelines(output)