103 KiB
103 KiB
import torch
import lzma
from itertools import islice
import re
import sys
from torchtext.vocab import build_vocab_from_iterator
from torch import nn
from torch.utils.data import IterableDataset, DataLoader
import itertools
import matplotlib.pyplot as plt
# Upper bound on the vocabulary size (passed as `max_tokens` when the vocab is
# built); also used as the embedding-table and output-layer size of the model.
VOCAB_SIZE = 10_000
# Dimensionality of each word-embedding vector.
EMBED_SIZE = 400
def get_words_from_line(line):
    """Yield the purely alphabetic words found in the last two TSV fields.

    The line is stripped of trailing whitespace and split on tabs; the
    second-to-last and last fields are joined with a single space.  Runs of
    two or more backslashes followed by ``n`` are turned into a space, then
    every character that is not an ASCII letter or a space is dropped.

    Raises:
        IndexError: on first iteration, if the line has fewer than two
            tab-separated fields.
    """
    columns = line.rstrip().split("\t")
    joined = " ".join((columns[-2], columns[-1]))
    # Escaped newline markers become word separators before filtering.
    joined = re.sub(r"\\\\+n", " ", joined)
    letters_only = re.sub('[^A-Za-z ]+', '', joined)
    yield from letters_only.split()
def get_word_lines_from_file(file_name):
    """Lazily yield one word generator per line of an xz-compressed file.

    The file is opened in UTF-8 text mode and stays open for as long as the
    returned generator is being consumed.  Each yielded item is the
    (unevaluated) generator returned by ``get_words_from_line`` for that line.
    """
    with lzma.open(file_name, mode="rt", encoding="utf8") as compressed:
        yield from map(get_words_from_line, compressed)
# Build the vocabulary from the training file, capped at VOCAB_SIZE tokens via
# `max_tokens`, with '<unk>' registered as a special token.
vocab = build_vocab_from_iterator(
    iterator=get_word_lines_from_file("train/in.tsv.xz"),
    specials=['<unk>'],
    max_tokens=VOCAB_SIZE,
)
---------------------------------------------------------------------------
KeyboardInterrupt                         Traceback (most recent call last)
d:\studia\challenging-america-word-gap-prediction\nn_trigram.ipynb Cell 3 in <cell line: 17>()
     13         yield get_words_from_line(line)
     15 vocab_size = 1_000
---> 17 vocab = build_vocab_from_iterator(
     18     get_word_lines_from_file("train/in.tsv.xz"),
     19     max_tokens = VOCAB_SIZE,
     20     specials = ['<unk>'])

File c:\PROGRAMY\Anaconda3\envs\modelowanie-jezyka\lib\site-packages\torchtext\vocab\vocab_factory.py:98, in build_vocab_from_iterator(iterator, min_freq, specials, special_first, max_tokens)
     72 """
     73 Build a Vocab from an iterator.
     74
   (...)
     94     >>> vocab = build_vocab_from_iterator(yield_tokens(file_path), specials=["<unk>"])
     95 """
     97 counter = Counter()
---> 98 for tokens in iterator:
     99     counter.update(tokens)
    101 specials = specials or []

d:\studia\challenging-america-word-gap-prediction\nn_trigram.ipynb Cell 3 in get_word_lines_from_file(file_name)
     10 def get_word_lines_from_file(file_name):
     11     with lzma.open(file_name, encoding='utf8', mode="rt") as fh:
---> 12         for line in fh:
     13             yield get_words_from_line(line)

File c:\PROGRAMY\Anaconda3\envs\modelowanie-jezyka\lib\lzma.py:212, in LZMAFile.read1(self, size)
    210 if size < 0:
    211     size = io.DEFAULT_BUFFER_SIZE
--> 212 return self._buffer.read1(size)

File c:\PROGRAMY\Anaconda3\envs\modelowanie-jezyka\lib\_compression.py:68, in DecompressReader.readinto(self, b)
     66 def readinto(self, b):
     67     with memoryview(b) as view, view.cast("B") as byte_view:
---> 68         data = self.read(len(byte_view))
     69     byte_view[:len(data)] = data
     70     return len(data)

File c:\PROGRAMY\Anaconda3\envs\modelowanie-jezyka\lib\_compression.py:103, in DecompressReader.read(self, size)
    101 else:
    102     rawblock = b""
--> 103 data = self._decompressor.decompress(rawblock, size)
    104 if data:
    105     break

KeyboardInterrupt:
def look_ahead_iterator(gen):
    """Yield ``((left, right), middle)`` for every window of three items.

    The two outer items of each consecutive triple form the context and the
    middle item is the target.  Nothing is yielded until three items have
    been seen; a window whose left or middle slot is ``None`` is skipped.
    """
    left = middle = None
    for right in gen:
        if not (left is None or middle is None):
            yield (left, right), middle
        # Slide the window forward by one item.
        left, middle = middle, right
class Trigrams(IterableDataset):
    """Iterable dataset streaming trigram samples from a compressed text file.

    Iterating yields whatever ``look_ahead_iterator`` produces over the
    vocabulary indices of the file's words: ``((left_id, right_id), middle_id)``.
    Words from all lines are chained into one stream, so windows can span
    line boundaries.

    Args:
        text_file: Path handed to ``get_word_lines_from_file``.
        vocabulary_size: Vocabulary size recorded on the instance as
            ``vocabulary_size`` (previously ignored in favour of the global
            ``VOCAB_SIZE``).
        vocabulary: Optional vocabulary object supporting ``[]`` lookup and
            ``set_default_index``.  Defaults to the module-level ``vocab``.
    """

    def __init__(self, text_file, vocabulary_size, vocabulary=None):
        # Fall back to the module-level vocabulary for existing callers.
        self.vocab = vocab if vocabulary is None else vocabulary
        # Unknown words resolve to the '<unk>' index instead of raising.
        self.vocab.set_default_index(self.vocab['<unk>'])
        self.vocabulary_size = vocabulary_size
        self.text_file = text_file

    def __iter__(self):
        words = itertools.chain.from_iterable(
            get_word_lines_from_file(self.text_file)
        )
        return look_ahead_iterator(self.vocab[word] for word in words)
# Streaming dataset over the training file.
train_dataset = Trigrams(
    text_file="train/in.tsv.xz",
    vocabulary_size=VOCAB_SIZE,
)
class TrigramNNModel(nn.Module):
    """Feed-forward network scoring every vocabulary entry from two word ids.

    The two context words are embedded, concatenated, passed through a
    1200-unit linear layer and a linear output layer, and normalised with
    softmax.

    Args:
        VOCAB_SIZE: Number of rows in the embedding table and size of the
            output distribution.
        EMBED_SIZE: Dimensionality of each word embedding.
    """

    def __init__(self, VOCAB_SIZE, EMBED_SIZE):
        super().__init__()
        self.embeddings = nn.Embedding(VOCAB_SIZE, EMBED_SIZE)
        self.hidden_layer = nn.Linear(EMBED_SIZE * 2, 1200)
        self.output_layer = nn.Linear(1200, VOCAB_SIZE)
        # Explicit `dim` avoids the "implicit dimension choice" deprecation
        # warning; dim=1 is the axis the implicit rule picks for the 2-D
        # tensor built in forward().
        self.softmax = nn.Softmax(dim=1)

    def forward(self, x):
        """Return softmax probabilities for a pair of index tensors.

        Args:
            x: Indexable pair of integer tensors; ``x[0]`` and ``x[1]`` are
                embedded and concatenated along dim 1, so each embedding
                result must be 2-D (presumably ``(batch,)`` index tensors —
                TODO confirm against the DataLoader collation).

        Returns:
            Tensor whose rows (dim 1) sum to 1.
        """
        emb_2 = self.embeddings(x[0])
        emb_1 = self.embeddings(x[1])
        x = torch.cat([emb_2, emb_1], dim=1)
        # NOTE(review): there is no non-linearity between the two linear
        # layers, so together they are a single affine map; left as-is to
        # keep the model's behaviour unchanged.
        x = self.hidden_layer(x)
        x = self.output_layer(x)
        return self.softmax(x)
# Training script: one streaming pass over `train_dataset` with Adam and
# NLL loss on log-probabilities.
#
# The earlier `TrigramNNModel(vocab_size, embed_size)` line was removed: those
# lowercase names are not defined in this file, and the model it built was
# immediately replaced by the one created below.

# Unknown words resolve to the '<unk>' index instead of raising.
vocab.set_default_index(vocab['<unk>'])

device = 'cpu'
model = TrigramNNModel(VOCAB_SIZE, EMBED_SIZE).to(device)
data = DataLoader(train_dataset, batch_size=2_000)
optimizer = torch.optim.Adam(model.parameters())
criterion = torch.nn.NLLLoss()

loss_track = []      # per-step loss values as plain floats
last_loss = 1_000    # loss of the previous step; large sentinel for step 0
trigger_count = 0    # number of steps whose loss exceeded the previous step's

model.train()
step = 0
for x, y in data:
    x[0] = x[0].to(device)
    x[1] = x[1].to(device)
    y = y.to(device)

    optimizer.zero_grad()
    ypredicted = model(x)
    # The model returns probabilities, NLLLoss expects log-probabilities.
    loss = criterion(torch.log(ypredicted), y)
    # Keep a plain float: storing the tensor itself would keep autograd
    # state referenced by `loss_track` / `last_loss`.
    loss_value = loss.item()

    if step % 100 == 0:
        print(step, loss_value)
    step += 1

    loss.backward()
    optimizer.step()

    # Stop once the loss has gone up (relative to the previous batch)
    # 500 times in total; the counter is never reset.
    if loss_value > last_loss:
        trigger_count += 1
        print(trigger_count, 'LOSS DIFF:', loss_value, last_loss)
    if trigger_count >= 500:
        break

    loss_track.append(loss_value)
    last_loss = loss_value
C:\Users\micha\AppData\Local\Temp\ipykernel_14016\2809838665.py:15: UserWarning: Implicit dimension choice for softmax has been deprecated. Change the call to include dim=X as an argument. x = self.softmax(x)
0 tensor(9.2713, grad_fn=<NllLossBackward0>) 1 LOSS DIFF: tensor(8.2370, grad_fn=<NllLossBackward0>) tensor(8.2154, grad_fn=<NllLossBackward0>) 2 LOSS DIFF: tensor(8.0085, grad_fn=<NllLossBackward0>) tensor(7.9711, grad_fn=<NllLossBackward0>) 3 LOSS DIFF: tensor(8.0149, grad_fn=<NllLossBackward0>) tensor(8.0085, grad_fn=<NllLossBackward0>) 4 LOSS DIFF: tensor(7.5328, grad_fn=<NllLossBackward0>) tensor(7.4404, grad_fn=<NllLossBackward0>) 5 LOSS DIFF: tensor(7.5367, grad_fn=<NllLossBackward0>) tensor(7.5328, grad_fn=<NllLossBackward0>) 6 LOSS DIFF: tensor(7.6733, grad_fn=<NllLossBackward0>) tensor(7.5367, grad_fn=<NllLossBackward0>) 7 LOSS DIFF: tensor(7.4703, grad_fn=<NllLossBackward0>) tensor(7.3663, grad_fn=<NllLossBackward0>) 8 LOSS DIFF: tensor(7.2923, grad_fn=<NllLossBackward0>) tensor(7.1224, grad_fn=<NllLossBackward0>) 9 LOSS DIFF: tensor(7.2912, grad_fn=<NllLossBackward0>) tensor(7.0721, grad_fn=<NllLossBackward0>) 10 LOSS DIFF: tensor(7.4529, grad_fn=<NllLossBackward0>) tensor(7.0255, grad_fn=<NllLossBackward0>) 11 LOSS DIFF: tensor(7.2017, grad_fn=<NllLossBackward0>) tensor(7.0108, grad_fn=<NllLossBackward0>) 12 LOSS DIFF: tensor(7.0689, grad_fn=<NllLossBackward0>) tensor(6.7964, grad_fn=<NllLossBackward0>) 13 LOSS DIFF: tensor(7.1870, grad_fn=<NllLossBackward0>) tensor(6.7505, grad_fn=<NllLossBackward0>) 14 LOSS DIFF: tensor(7.0149, grad_fn=<NllLossBackward0>) tensor(6.7360, grad_fn=<NllLossBackward0>) 15 LOSS DIFF: tensor(7.0185, grad_fn=<NllLossBackward0>) tensor(6.5064, grad_fn=<NllLossBackward0>) 16 LOSS DIFF: tensor(6.6809, grad_fn=<NllLossBackward0>) tensor(6.6315, grad_fn=<NllLossBackward0>) 17 LOSS DIFF: tensor(6.6161, grad_fn=<NllLossBackward0>) tensor(6.5363, grad_fn=<NllLossBackward0>) 18 LOSS DIFF: tensor(6.6186, grad_fn=<NllLossBackward0>) tensor(6.4474, grad_fn=<NllLossBackward0>) 19 LOSS DIFF: tensor(6.7242, grad_fn=<NllLossBackward0>) tensor(6.6186, grad_fn=<NllLossBackward0>) 20 LOSS DIFF: tensor(6.8363, grad_fn=<NllLossBackward0>) 
tensor(6.4740, grad_fn=<NllLossBackward0>) 21 LOSS DIFF: tensor(6.4746, grad_fn=<NllLossBackward0>) tensor(6.3583, grad_fn=<NllLossBackward0>) 22 LOSS DIFF: tensor(6.2821, grad_fn=<NllLossBackward0>) tensor(6.2621, grad_fn=<NllLossBackward0>) 23 LOSS DIFF: tensor(6.5530, grad_fn=<NllLossBackward0>) tensor(6.2821, grad_fn=<NllLossBackward0>) 24 LOSS DIFF: tensor(6.3082, grad_fn=<NllLossBackward0>) tensor(6.1749, grad_fn=<NllLossBackward0>) 25 LOSS DIFF: tensor(6.3215, grad_fn=<NllLossBackward0>) tensor(6.0069, grad_fn=<NllLossBackward0>) 26 LOSS DIFF: tensor(6.3455, grad_fn=<NllLossBackward0>) tensor(6.1887, grad_fn=<NllLossBackward0>) 27 LOSS DIFF: tensor(6.0695, grad_fn=<NllLossBackward0>) tensor(6.0053, grad_fn=<NllLossBackward0>) 28 LOSS DIFF: tensor(6.2298, grad_fn=<NllLossBackward0>) tensor(6.0553, grad_fn=<NllLossBackward0>) 29 LOSS DIFF: tensor(6.2879, grad_fn=<NllLossBackward0>) tensor(6.2298, grad_fn=<NllLossBackward0>) 30 LOSS DIFF: tensor(5.8552, grad_fn=<NllLossBackward0>) tensor(5.7972, grad_fn=<NllLossBackward0>) 31 LOSS DIFF: tensor(5.8884, grad_fn=<NllLossBackward0>) tensor(5.8552, grad_fn=<NllLossBackward0>) 32 LOSS DIFF: tensor(6.0852, grad_fn=<NllLossBackward0>) tensor(5.8884, grad_fn=<NllLossBackward0>) 33 LOSS DIFF: tensor(6.2040, grad_fn=<NllLossBackward0>) tensor(6.0852, grad_fn=<NllLossBackward0>) 34 LOSS DIFF: tensor(6.1036, grad_fn=<NllLossBackward0>) tensor(5.9439, grad_fn=<NllLossBackward0>) 35 LOSS DIFF: tensor(6.0782, grad_fn=<NllLossBackward0>) tensor(5.9413, grad_fn=<NllLossBackward0>) 36 LOSS DIFF: tensor(5.9607, grad_fn=<NllLossBackward0>) tensor(5.7949, grad_fn=<NllLossBackward0>) 37 LOSS DIFF: tensor(6.0354, grad_fn=<NllLossBackward0>) tensor(5.9607, grad_fn=<NllLossBackward0>) 38 LOSS DIFF: tensor(6.2669, grad_fn=<NllLossBackward0>) tensor(6.0243, grad_fn=<NllLossBackward0>) 39 LOSS DIFF: tensor(5.8678, grad_fn=<NllLossBackward0>) tensor(5.6556, grad_fn=<NllLossBackward0>) 40 LOSS DIFF: tensor(6.0265, grad_fn=<NllLossBackward0>) 
tensor(5.8678, grad_fn=<NllLossBackward0>) 41 LOSS DIFF: tensor(6.1147, grad_fn=<NllLossBackward0>) tensor(5.8050, grad_fn=<NllLossBackward0>) 100 tensor(5.8244, grad_fn=<NllLossBackward0>) 42 LOSS DIFF: tensor(5.8244, grad_fn=<NllLossBackward0>) tensor(5.7412, grad_fn=<NllLossBackward0>) 43 LOSS DIFF: tensor(5.9226, grad_fn=<NllLossBackward0>) tensor(5.8244, grad_fn=<NllLossBackward0>) 44 LOSS DIFF: tensor(5.9487, grad_fn=<NllLossBackward0>) tensor(5.9226, grad_fn=<NllLossBackward0>) 45 LOSS DIFF: tensor(5.8844, grad_fn=<NllLossBackward0>) tensor(5.3183, grad_fn=<NllLossBackward0>) 46 LOSS DIFF: tensor(6.0141, grad_fn=<NllLossBackward0>) tensor(5.8844, grad_fn=<NllLossBackward0>) 47 LOSS DIFF: tensor(6.1782, grad_fn=<NllLossBackward0>) tensor(5.8340, grad_fn=<NllLossBackward0>) 48 LOSS DIFF: tensor(5.8840, grad_fn=<NllLossBackward0>) tensor(5.7920, grad_fn=<NllLossBackward0>) 49 LOSS DIFF: tensor(5.7265, grad_fn=<NllLossBackward0>) tensor(5.6177, grad_fn=<NllLossBackward0>) 50 LOSS DIFF: tensor(5.9389, grad_fn=<NllLossBackward0>) tensor(5.7265, grad_fn=<NllLossBackward0>) 51 LOSS DIFF: tensor(5.6946, grad_fn=<NllLossBackward0>) tensor(5.6487, grad_fn=<NllLossBackward0>) 52 LOSS DIFF: tensor(5.8837, grad_fn=<NllLossBackward0>) tensor(5.6946, grad_fn=<NllLossBackward0>) 53 LOSS DIFF: tensor(5.9090, grad_fn=<NllLossBackward0>) tensor(5.8837, grad_fn=<NllLossBackward0>) 54 LOSS DIFF: tensor(5.9914, grad_fn=<NllLossBackward0>) tensor(5.9090, grad_fn=<NllLossBackward0>) 55 LOSS DIFF: tensor(5.8042, grad_fn=<NllLossBackward0>) tensor(5.7994, grad_fn=<NllLossBackward0>) 56 LOSS DIFF: tensor(5.9282, grad_fn=<NllLossBackward0>) tensor(5.8042, grad_fn=<NllLossBackward0>) 57 LOSS DIFF: tensor(5.9366, grad_fn=<NllLossBackward0>) tensor(5.7254, grad_fn=<NllLossBackward0>) 58 LOSS DIFF: tensor(5.7995, grad_fn=<NllLossBackward0>) tensor(5.7486, grad_fn=<NllLossBackward0>) 59 LOSS DIFF: tensor(5.6361, grad_fn=<NllLossBackward0>) tensor(5.5307, grad_fn=<NllLossBackward0>) 60 LOSS 
DIFF: tensor(5.7078, grad_fn=<NllLossBackward0>) tensor(5.6361, grad_fn=<NllLossBackward0>) 61 LOSS DIFF: tensor(5.7592, grad_fn=<NllLossBackward0>) tensor(5.7078, grad_fn=<NllLossBackward0>) 62 LOSS DIFF: tensor(5.7625, grad_fn=<NllLossBackward0>) tensor(5.5981, grad_fn=<NllLossBackward0>) 63 LOSS DIFF: tensor(5.8389, grad_fn=<NllLossBackward0>) tensor(5.7625, grad_fn=<NllLossBackward0>) 64 LOSS DIFF: tensor(5.7739, grad_fn=<NllLossBackward0>) tensor(5.7312, grad_fn=<NllLossBackward0>) 65 LOSS DIFF: tensor(5.9031, grad_fn=<NllLossBackward0>) tensor(5.6170, grad_fn=<NllLossBackward0>) 66 LOSS DIFF: tensor(5.7173, grad_fn=<NllLossBackward0>) tensor(5.5232, grad_fn=<NllLossBackward0>) 67 LOSS DIFF: tensor(5.7408, grad_fn=<NllLossBackward0>) tensor(5.7173, grad_fn=<NllLossBackward0>) 68 LOSS DIFF: tensor(5.8191, grad_fn=<NllLossBackward0>) tensor(5.7408, grad_fn=<NllLossBackward0>) 69 LOSS DIFF: tensor(6.0318, grad_fn=<NllLossBackward0>) tensor(5.8191, grad_fn=<NllLossBackward0>) 70 LOSS DIFF: tensor(5.6656, grad_fn=<NllLossBackward0>) tensor(5.5086, grad_fn=<NllLossBackward0>) 71 LOSS DIFF: tensor(5.7288, grad_fn=<NllLossBackward0>) tensor(5.6656, grad_fn=<NllLossBackward0>) 72 LOSS DIFF: tensor(6.0700, grad_fn=<NllLossBackward0>) tensor(5.7288, grad_fn=<NllLossBackward0>) 73 LOSS DIFF: tensor(5.8114, grad_fn=<NllLossBackward0>) tensor(5.5442, grad_fn=<NllLossBackward0>) 74 LOSS DIFF: tensor(5.8363, grad_fn=<NllLossBackward0>) tensor(5.5099, grad_fn=<NllLossBackward0>) 75 LOSS DIFF: tensor(5.8545, grad_fn=<NllLossBackward0>) tensor(5.8363, grad_fn=<NllLossBackward0>) 76 LOSS DIFF: tensor(5.9820, grad_fn=<NllLossBackward0>) tensor(5.8545, grad_fn=<NllLossBackward0>) 77 LOSS DIFF: tensor(5.8431, grad_fn=<NllLossBackward0>) tensor(5.7144, grad_fn=<NllLossBackward0>) 78 LOSS DIFF: tensor(5.9114, grad_fn=<NllLossBackward0>) tensor(5.8431, grad_fn=<NllLossBackward0>) 79 LOSS DIFF: tensor(5.8020, grad_fn=<NllLossBackward0>) tensor(5.4449, grad_fn=<NllLossBackward0>) 80 LOSS 
DIFF: tensor(5.8973, grad_fn=<NllLossBackward0>) tensor(5.5983, grad_fn=<NllLossBackward0>) 81 LOSS DIFF: tensor(5.6962, grad_fn=<NllLossBackward0>) tensor(5.6396, grad_fn=<NllLossBackward0>) 82 LOSS DIFF: tensor(5.6928, grad_fn=<NllLossBackward0>) tensor(5.5821, grad_fn=<NllLossBackward0>) 83 LOSS DIFF: tensor(5.7957, grad_fn=<NllLossBackward0>) tensor(5.6928, grad_fn=<NllLossBackward0>) 84 LOSS DIFF: tensor(5.5650, grad_fn=<NllLossBackward0>) tensor(5.5055, grad_fn=<NllLossBackward0>) 85 LOSS DIFF: tensor(5.6884, grad_fn=<NllLossBackward0>) tensor(5.5650, grad_fn=<NllLossBackward0>) 86 LOSS DIFF: tensor(5.7350, grad_fn=<NllLossBackward0>) tensor(5.6884, grad_fn=<NllLossBackward0>) 87 LOSS DIFF: tensor(5.6654, grad_fn=<NllLossBackward0>) tensor(5.5815, grad_fn=<NllLossBackward0>) 88 LOSS DIFF: tensor(5.7693, grad_fn=<NllLossBackward0>) tensor(5.3977, grad_fn=<NllLossBackward0>) 89 LOSS DIFF: tensor(5.5829, grad_fn=<NllLossBackward0>) tensor(5.5628, grad_fn=<NllLossBackward0>) 90 LOSS DIFF: tensor(5.8661, grad_fn=<NllLossBackward0>) tensor(5.5829, grad_fn=<NllLossBackward0>) 91 LOSS DIFF: tensor(5.4884, grad_fn=<NllLossBackward0>) tensor(5.4546, grad_fn=<NllLossBackward0>) 92 LOSS DIFF: tensor(5.6575, grad_fn=<NllLossBackward0>) tensor(5.4884, grad_fn=<NllLossBackward0>) 93 LOSS DIFF: tensor(5.8113, grad_fn=<NllLossBackward0>) tensor(5.6575, grad_fn=<NllLossBackward0>) 94 LOSS DIFF: tensor(5.6923, grad_fn=<NllLossBackward0>) tensor(5.5077, grad_fn=<NllLossBackward0>) 95 LOSS DIFF: tensor(5.7196, grad_fn=<NllLossBackward0>) tensor(5.6923, grad_fn=<NllLossBackward0>) 96 LOSS DIFF: tensor(5.6317, grad_fn=<NllLossBackward0>) tensor(5.6262, grad_fn=<NllLossBackward0>) 97 LOSS DIFF: tensor(5.7707, grad_fn=<NllLossBackward0>) tensor(5.6099, grad_fn=<NllLossBackward0>) 200 tensor(5.4212, grad_fn=<NllLossBackward0>) 98 LOSS DIFF: tensor(5.5956, grad_fn=<NllLossBackward0>) tensor(5.4212, grad_fn=<NllLossBackward0>) 99 LOSS DIFF: tensor(5.7422, grad_fn=<NllLossBackward0>) 
tensor(5.5956, grad_fn=<NllLossBackward0>) 100 LOSS DIFF: tensor(5.8166, grad_fn=<NllLossBackward0>) tensor(5.7422, grad_fn=<NllLossBackward0>) 101 LOSS DIFF: tensor(5.8615, grad_fn=<NllLossBackward0>) tensor(5.8166, grad_fn=<NllLossBackward0>) 102 LOSS DIFF: tensor(5.9617, grad_fn=<NllLossBackward0>) tensor(5.8615, grad_fn=<NllLossBackward0>) 103 LOSS DIFF: tensor(5.9847, grad_fn=<NllLossBackward0>) tensor(5.9617, grad_fn=<NllLossBackward0>) 104 LOSS DIFF: tensor(5.8443, grad_fn=<NllLossBackward0>) tensor(5.6014, grad_fn=<NllLossBackward0>) 105 LOSS DIFF: tensor(5.7755, grad_fn=<NllLossBackward0>) tensor(5.7413, grad_fn=<NllLossBackward0>) 106 LOSS DIFF: tensor(6.0574, grad_fn=<NllLossBackward0>) tensor(5.6690, grad_fn=<NllLossBackward0>) 107 LOSS DIFF: tensor(5.4708, grad_fn=<NllLossBackward0>) tensor(5.4460, grad_fn=<NllLossBackward0>) 108 LOSS DIFF: tensor(5.6402, grad_fn=<NllLossBackward0>) tensor(5.4708, grad_fn=<NllLossBackward0>) 109 LOSS DIFF: tensor(5.7016, grad_fn=<NllLossBackward0>) tensor(5.6402, grad_fn=<NllLossBackward0>) 110 LOSS DIFF: tensor(5.5643, grad_fn=<NllLossBackward0>) tensor(5.4158, grad_fn=<NllLossBackward0>) 111 LOSS DIFF: tensor(5.6958, grad_fn=<NllLossBackward0>) tensor(5.3094, grad_fn=<NllLossBackward0>) 112 LOSS DIFF: tensor(5.8296, grad_fn=<NllLossBackward0>) tensor(5.4617, grad_fn=<NllLossBackward0>) 113 LOSS DIFF: tensor(5.6992, grad_fn=<NllLossBackward0>) tensor(5.5483, grad_fn=<NllLossBackward0>) 114 LOSS DIFF: tensor(5.4980, grad_fn=<NllLossBackward0>) tensor(5.4310, grad_fn=<NllLossBackward0>) 115 LOSS DIFF: tensor(5.4942, grad_fn=<NllLossBackward0>) tensor(5.3832, grad_fn=<NllLossBackward0>) 116 LOSS DIFF: tensor(5.6928, grad_fn=<NllLossBackward0>) tensor(5.4942, grad_fn=<NllLossBackward0>) 117 LOSS DIFF: tensor(5.6334, grad_fn=<NllLossBackward0>) tensor(5.5606, grad_fn=<NllLossBackward0>) 118 LOSS DIFF: tensor(5.7307, grad_fn=<NllLossBackward0>) tensor(5.5210, grad_fn=<NllLossBackward0>) 119 LOSS DIFF: tensor(5.5673, 
grad_fn=<NllLossBackward0>) tensor(5.5488, grad_fn=<NllLossBackward0>) 120 LOSS DIFF: tensor(6.0060, grad_fn=<NllLossBackward0>) tensor(5.4800, grad_fn=<NllLossBackward0>) 121 LOSS DIFF: tensor(5.5278, grad_fn=<NllLossBackward0>) tensor(5.1856, grad_fn=<NllLossBackward0>) 122 LOSS DIFF: tensor(5.5388, grad_fn=<NllLossBackward0>) tensor(5.5278, grad_fn=<NllLossBackward0>) 123 LOSS DIFF: tensor(5.6835, grad_fn=<NllLossBackward0>) tensor(5.5388, grad_fn=<NllLossBackward0>) 124 LOSS DIFF: tensor(5.6808, grad_fn=<NllLossBackward0>) tensor(5.5417, grad_fn=<NllLossBackward0>) 125 LOSS DIFF: tensor(5.8665, grad_fn=<NllLossBackward0>) tensor(5.5828, grad_fn=<NllLossBackward0>) 126 LOSS DIFF: tensor(5.7710, grad_fn=<NllLossBackward0>) tensor(5.5468, grad_fn=<NllLossBackward0>) 127 LOSS DIFF: tensor(5.6604, grad_fn=<NllLossBackward0>) tensor(5.6368, grad_fn=<NllLossBackward0>) 128 LOSS DIFF: tensor(5.5983, grad_fn=<NllLossBackward0>) tensor(5.5213, grad_fn=<NllLossBackward0>) 129 LOSS DIFF: tensor(5.6943, grad_fn=<NllLossBackward0>) tensor(5.4842, grad_fn=<NllLossBackward0>) 130 LOSS DIFF: tensor(5.5073, grad_fn=<NllLossBackward0>) tensor(5.4259, grad_fn=<NllLossBackward0>) 131 LOSS DIFF: tensor(5.5320, grad_fn=<NllLossBackward0>) tensor(5.5073, grad_fn=<NllLossBackward0>) 132 LOSS DIFF: tensor(5.6082, grad_fn=<NllLossBackward0>) tensor(5.4292, grad_fn=<NllLossBackward0>) 133 LOSS DIFF: tensor(5.6768, grad_fn=<NllLossBackward0>) tensor(5.4724, grad_fn=<NllLossBackward0>) 134 LOSS DIFF: tensor(5.5272, grad_fn=<NllLossBackward0>) tensor(5.5222, grad_fn=<NllLossBackward0>) 135 LOSS DIFF: tensor(5.5190, grad_fn=<NllLossBackward0>) tensor(5.5016, grad_fn=<NllLossBackward0>) 136 LOSS DIFF: tensor(5.6560, grad_fn=<NllLossBackward0>) tensor(5.5190, grad_fn=<NllLossBackward0>) 137 LOSS DIFF: tensor(5.6775, grad_fn=<NllLossBackward0>) tensor(5.6560, grad_fn=<NllLossBackward0>) 138 LOSS DIFF: tensor(5.6694, grad_fn=<NllLossBackward0>) tensor(5.6686, grad_fn=<NllLossBackward0>) 139 LOSS 
DIFF: tensor(5.5788, grad_fn=<NllLossBackward0>) tensor(5.2768, grad_fn=<NllLossBackward0>) 140 LOSS DIFF: tensor(5.3935, grad_fn=<NllLossBackward0>) tensor(5.3774, grad_fn=<NllLossBackward0>) 141 LOSS DIFF: tensor(5.6068, grad_fn=<NllLossBackward0>) tensor(5.3935, grad_fn=<NllLossBackward0>) 142 LOSS DIFF: tensor(5.6336, grad_fn=<NllLossBackward0>) tensor(5.6068, grad_fn=<NllLossBackward0>) 143 LOSS DIFF: tensor(5.7687, grad_fn=<NllLossBackward0>) tensor(5.5630, grad_fn=<NllLossBackward0>) 144 LOSS DIFF: tensor(5.7539, grad_fn=<NllLossBackward0>) tensor(5.6827, grad_fn=<NllLossBackward0>) 145 LOSS DIFF: tensor(5.7485, grad_fn=<NllLossBackward0>) tensor(5.6277, grad_fn=<NllLossBackward0>) 300 tensor(5.8304, grad_fn=<NllLossBackward0>) 146 LOSS DIFF: tensor(5.8304, grad_fn=<NllLossBackward0>) tensor(5.5549, grad_fn=<NllLossBackward0>) 147 LOSS DIFF: tensor(5.5819, grad_fn=<NllLossBackward0>) tensor(5.4616, grad_fn=<NllLossBackward0>) 148 LOSS DIFF: tensor(5.6154, grad_fn=<NllLossBackward0>) tensor(5.5819, grad_fn=<NllLossBackward0>) 149 LOSS DIFF: tensor(5.7859, grad_fn=<NllLossBackward0>) tensor(5.3329, grad_fn=<NllLossBackward0>) 150 LOSS DIFF: tensor(5.5458, grad_fn=<NllLossBackward0>) tensor(5.5438, grad_fn=<NllLossBackward0>) 151 LOSS DIFF: tensor(5.7121, grad_fn=<NllLossBackward0>) tensor(5.5458, grad_fn=<NllLossBackward0>) 152 LOSS DIFF: tensor(5.6329, grad_fn=<NllLossBackward0>) tensor(5.2700, grad_fn=<NllLossBackward0>) 153 LOSS DIFF: tensor(5.6739, grad_fn=<NllLossBackward0>) tensor(5.3680, grad_fn=<NllLossBackward0>) 154 LOSS DIFF: tensor(5.7045, grad_fn=<NllLossBackward0>) tensor(5.6739, grad_fn=<NllLossBackward0>) 155 LOSS DIFF: tensor(5.5067, grad_fn=<NllLossBackward0>) tensor(5.2978, grad_fn=<NllLossBackward0>) 156 LOSS DIFF: tensor(5.5102, grad_fn=<NllLossBackward0>) tensor(5.5067, grad_fn=<NllLossBackward0>) 157 LOSS DIFF: tensor(5.5956, grad_fn=<NllLossBackward0>) tensor(5.4116, grad_fn=<NllLossBackward0>) 158 LOSS DIFF: tensor(5.5993, 
grad_fn=<NllLossBackward0>) tensor(5.4012, grad_fn=<NllLossBackward0>) 159 LOSS DIFF: tensor(5.6150, grad_fn=<NllLossBackward0>) tensor(5.3476, grad_fn=<NllLossBackward0>) 160 LOSS DIFF: tensor(5.4375, grad_fn=<NllLossBackward0>) tensor(5.4351, grad_fn=<NllLossBackward0>) 161 LOSS DIFF: tensor(5.7052, grad_fn=<NllLossBackward0>) tensor(5.4375, grad_fn=<NllLossBackward0>) 162 LOSS DIFF: tensor(5.7059, grad_fn=<NllLossBackward0>) tensor(5.5050, grad_fn=<NllLossBackward0>) 163 LOSS DIFF: tensor(5.7356, grad_fn=<NllLossBackward0>) tensor(5.5716, grad_fn=<NllLossBackward0>) 164 LOSS DIFF: tensor(5.7517, grad_fn=<NllLossBackward0>) tensor(5.5423, grad_fn=<NllLossBackward0>) 165 LOSS DIFF: tensor(5.7358, grad_fn=<NllLossBackward0>) tensor(5.4403, grad_fn=<NllLossBackward0>) 166 LOSS DIFF: tensor(5.6180, grad_fn=<NllLossBackward0>) tensor(5.4437, grad_fn=<NllLossBackward0>) 167 LOSS DIFF: tensor(5.5725, grad_fn=<NllLossBackward0>) tensor(5.2734, grad_fn=<NllLossBackward0>) 168 LOSS DIFF: tensor(5.8849, grad_fn=<NllLossBackward0>) tensor(5.3810, grad_fn=<NllLossBackward0>) 169 LOSS DIFF: tensor(5.5414, grad_fn=<NllLossBackward0>) tensor(5.5272, grad_fn=<NllLossBackward0>) 170 LOSS DIFF: tensor(5.5738, grad_fn=<NllLossBackward0>) tensor(5.3898, grad_fn=<NllLossBackward0>) 171 LOSS DIFF: tensor(5.7096, grad_fn=<NllLossBackward0>) tensor(5.2583, grad_fn=<NllLossBackward0>) 172 LOSS DIFF: tensor(5.7039, grad_fn=<NllLossBackward0>) tensor(5.6133, grad_fn=<NllLossBackward0>) 173 LOSS DIFF: tensor(5.5324, grad_fn=<NllLossBackward0>) tensor(5.5068, grad_fn=<NllLossBackward0>) 174 LOSS DIFF: tensor(5.5902, grad_fn=<NllLossBackward0>) tensor(5.4034, grad_fn=<NllLossBackward0>) 175 LOSS DIFF: tensor(5.5912, grad_fn=<NllLossBackward0>) tensor(5.5902, grad_fn=<NllLossBackward0>) 176 LOSS DIFF: tensor(5.7047, grad_fn=<NllLossBackward0>) tensor(5.5912, grad_fn=<NllLossBackward0>) 177 LOSS DIFF: tensor(5.6506, grad_fn=<NllLossBackward0>) tensor(5.4474, grad_fn=<NllLossBackward0>) 178 LOSS 
DIFF: tensor(5.5547, grad_fn=<NllLossBackward0>) tensor(5.5172, grad_fn=<NllLossBackward0>) 179 LOSS DIFF: tensor(5.5271, grad_fn=<NllLossBackward0>) tensor(5.2485, grad_fn=<NllLossBackward0>) 180 LOSS DIFF: tensor(5.5400, grad_fn=<NllLossBackward0>) tensor(5.4519, grad_fn=<NllLossBackward0>) 181 LOSS DIFF: tensor(5.6702, grad_fn=<NllLossBackward0>) tensor(5.5037, grad_fn=<NllLossBackward0>) 182 LOSS DIFF: tensor(5.5462, grad_fn=<NllLossBackward0>) tensor(5.4319, grad_fn=<NllLossBackward0>) 183 LOSS DIFF: tensor(5.5346, grad_fn=<NllLossBackward0>) tensor(5.4046, grad_fn=<NllLossBackward0>) 184 LOSS DIFF: tensor(5.5779, grad_fn=<NllLossBackward0>) tensor(5.5096, grad_fn=<NllLossBackward0>) 185 LOSS DIFF: tensor(5.5979, grad_fn=<NllLossBackward0>) tensor(5.4310, grad_fn=<NllLossBackward0>) 186 LOSS DIFF: tensor(5.4231, grad_fn=<NllLossBackward0>) tensor(5.2371, grad_fn=<NllLossBackward0>) 187 LOSS DIFF: tensor(5.6120, grad_fn=<NllLossBackward0>) tensor(5.4231, grad_fn=<NllLossBackward0>) 188 LOSS DIFF: tensor(5.4934, grad_fn=<NllLossBackward0>) tensor(5.1333, grad_fn=<NllLossBackward0>) 189 LOSS DIFF: tensor(5.5445, grad_fn=<NllLossBackward0>) tensor(5.2967, grad_fn=<NllLossBackward0>) 190 LOSS DIFF: tensor(5.5506, grad_fn=<NllLossBackward0>) tensor(5.5445, grad_fn=<NllLossBackward0>) 191 LOSS DIFF: tensor(5.6374, grad_fn=<NllLossBackward0>) tensor(5.5506, grad_fn=<NllLossBackward0>) 400 tensor(5.5743, grad_fn=<NllLossBackward0>) 192 LOSS DIFF: tensor(5.6050, grad_fn=<NllLossBackward0>) tensor(5.5743, grad_fn=<NllLossBackward0>) 193 LOSS DIFF: tensor(5.5826, grad_fn=<NllLossBackward0>) tensor(5.3787, grad_fn=<NllLossBackward0>) 194 LOSS DIFF: tensor(5.5223, grad_fn=<NllLossBackward0>) tensor(5.3267, grad_fn=<NllLossBackward0>) 195 LOSS DIFF: tensor(5.4600, grad_fn=<NllLossBackward0>) tensor(5.4485, grad_fn=<NllLossBackward0>) 196 LOSS DIFF: tensor(5.5178, grad_fn=<NllLossBackward0>) tensor(5.4600, grad_fn=<NllLossBackward0>) 197 LOSS DIFF: tensor(5.5514, 
grad_fn=<NllLossBackward0>) tensor(5.2249, grad_fn=<NllLossBackward0>) 198 LOSS DIFF: tensor(5.5651, grad_fn=<NllLossBackward0>) tensor(5.4807, grad_fn=<NllLossBackward0>) 199 LOSS DIFF: tensor(5.4252, grad_fn=<NllLossBackward0>) tensor(5.1542, grad_fn=<NllLossBackward0>) 200 LOSS DIFF: tensor(5.6503, grad_fn=<NllLossBackward0>) tensor(5.4252, grad_fn=<NllLossBackward0>) 201 LOSS DIFF: tensor(5.5460, grad_fn=<NllLossBackward0>) tensor(5.3643, grad_fn=<NllLossBackward0>) 202 LOSS DIFF: tensor(5.7145, grad_fn=<NllLossBackward0>) tensor(5.4959, grad_fn=<NllLossBackward0>) 203 LOSS DIFF: tensor(5.4506, grad_fn=<NllLossBackward0>) tensor(5.4382, grad_fn=<NllLossBackward0>) 204 LOSS DIFF: tensor(5.5514, grad_fn=<NllLossBackward0>) tensor(5.4506, grad_fn=<NllLossBackward0>) 205 LOSS DIFF: tensor(5.5680, grad_fn=<NllLossBackward0>) tensor(5.5468, grad_fn=<NllLossBackward0>) 206 LOSS DIFF: tensor(5.5970, grad_fn=<NllLossBackward0>) tensor(5.5680, grad_fn=<NllLossBackward0>) 207 LOSS DIFF: tensor(5.6742, grad_fn=<NllLossBackward0>) tensor(5.5970, grad_fn=<NllLossBackward0>) 208 LOSS DIFF: tensor(5.5306, grad_fn=<NllLossBackward0>) tensor(5.2061, grad_fn=<NllLossBackward0>) 209 LOSS DIFF: tensor(5.7571, grad_fn=<NllLossBackward0>) tensor(5.5306, grad_fn=<NllLossBackward0>) 210 LOSS DIFF: tensor(5.6525, grad_fn=<NllLossBackward0>) tensor(5.3833, grad_fn=<NllLossBackward0>) 211 LOSS DIFF: tensor(5.5354, grad_fn=<NllLossBackward0>) tensor(5.3948, grad_fn=<NllLossBackward0>) 212 LOSS DIFF: tensor(5.5960, grad_fn=<NllLossBackward0>) tensor(5.5354, grad_fn=<NllLossBackward0>) 213 LOSS DIFF: tensor(5.7113, grad_fn=<NllLossBackward0>) tensor(5.5470, grad_fn=<NllLossBackward0>) 214 LOSS DIFF: tensor(5.4059, grad_fn=<NllLossBackward0>) tensor(5.3649, grad_fn=<NllLossBackward0>) 215 LOSS DIFF: tensor(5.4863, grad_fn=<NllLossBackward0>) tensor(5.4004, grad_fn=<NllLossBackward0>) 216 LOSS DIFF: tensor(5.5381, grad_fn=<NllLossBackward0>) tensor(5.4863, grad_fn=<NllLossBackward0>) 217 LOSS 
DIFF: tensor(5.3652, grad_fn=<NllLossBackward0>) tensor(5.3540, grad_fn=<NllLossBackward0>) 218 LOSS DIFF: tensor(5.3894, grad_fn=<NllLossBackward0>) tensor(5.1646, grad_fn=<NllLossBackward0>) 219 LOSS DIFF: tensor(5.6803, grad_fn=<NllLossBackward0>) tensor(5.3894, grad_fn=<NllLossBackward0>) 220 LOSS DIFF: tensor(5.6113, grad_fn=<NllLossBackward0>) tensor(5.4769, grad_fn=<NllLossBackward0>) 221 LOSS DIFF: tensor(5.6813, grad_fn=<NllLossBackward0>) tensor(5.2015, grad_fn=<NllLossBackward0>) 222 LOSS DIFF: tensor(5.3458, grad_fn=<NllLossBackward0>) tensor(5.2679, grad_fn=<NllLossBackward0>) 223 LOSS DIFF: tensor(5.2445, grad_fn=<NllLossBackward0>) tensor(5.1445, grad_fn=<NllLossBackward0>) 224 LOSS DIFF: tensor(5.6649, grad_fn=<NllLossBackward0>) tensor(5.2441, grad_fn=<NllLossBackward0>) 225 LOSS DIFF: tensor(5.8539, grad_fn=<NllLossBackward0>) tensor(5.6026, grad_fn=<NllLossBackward0>) 226 LOSS DIFF: tensor(5.4560, grad_fn=<NllLossBackward0>) tensor(5.4208, grad_fn=<NllLossBackward0>) 227 LOSS DIFF: tensor(5.5729, grad_fn=<NllLossBackward0>) tensor(5.4560, grad_fn=<NllLossBackward0>) 228 LOSS DIFF: tensor(5.5996, grad_fn=<NllLossBackward0>) tensor(5.3175, grad_fn=<NllLossBackward0>) 229 LOSS DIFF: tensor(5.6685, grad_fn=<NllLossBackward0>) tensor(5.2451, grad_fn=<NllLossBackward0>) 230 LOSS DIFF: tensor(5.5938, grad_fn=<NllLossBackward0>) tensor(5.4874, grad_fn=<NllLossBackward0>) 231 LOSS DIFF: tensor(5.6228, grad_fn=<NllLossBackward0>) tensor(5.2840, grad_fn=<NllLossBackward0>) 232 LOSS DIFF: tensor(5.3415, grad_fn=<NllLossBackward0>) tensor(5.3339, grad_fn=<NllLossBackward0>) 233 LOSS DIFF: tensor(5.3861, grad_fn=<NllLossBackward0>) tensor(5.1807, grad_fn=<NllLossBackward0>) 234 LOSS DIFF: tensor(5.4093, grad_fn=<NllLossBackward0>) tensor(5.3861, grad_fn=<NllLossBackward0>) 235 LOSS DIFF: tensor(5.6085, grad_fn=<NllLossBackward0>) tensor(5.4093, grad_fn=<NllLossBackward0>) 236 LOSS DIFF: tensor(5.3475, grad_fn=<NllLossBackward0>) tensor(5.1380, 
grad_fn=<NllLossBackward0>) 237 LOSS DIFF: tensor(5.6542, grad_fn=<NllLossBackward0>) tensor(5.3475, grad_fn=<NllLossBackward0>) 238 LOSS DIFF: tensor(5.6034, grad_fn=<NllLossBackward0>) tensor(5.2396, grad_fn=<NllLossBackward0>) 239 LOSS DIFF: tensor(5.5599, grad_fn=<NllLossBackward0>) tensor(5.2510, grad_fn=<NllLossBackward0>) 240 LOSS DIFF: tensor(5.4534, grad_fn=<NllLossBackward0>) tensor(5.3629, grad_fn=<NllLossBackward0>) 500 tensor(5.5447, grad_fn=<NllLossBackward0>) 241 LOSS DIFF: tensor(5.5447, grad_fn=<NllLossBackward0>) tensor(5.4534, grad_fn=<NllLossBackward0>) 242 LOSS DIFF: tensor(5.4929, grad_fn=<NllLossBackward0>) tensor(5.3445, grad_fn=<NllLossBackward0>) 243 LOSS DIFF: tensor(5.4963, grad_fn=<NllLossBackward0>) tensor(5.3411, grad_fn=<NllLossBackward0>) 244 LOSS DIFF: tensor(5.3306, grad_fn=<NllLossBackward0>) tensor(5.1341, grad_fn=<NllLossBackward0>) 245 LOSS DIFF: tensor(5.3853, grad_fn=<NllLossBackward0>) tensor(5.3306, grad_fn=<NllLossBackward0>) 246 LOSS DIFF: tensor(5.5949, grad_fn=<NllLossBackward0>) tensor(5.3853, grad_fn=<NllLossBackward0>) 247 LOSS DIFF: tensor(5.5202, grad_fn=<NllLossBackward0>) tensor(5.2283, grad_fn=<NllLossBackward0>) 248 LOSS DIFF: tensor(5.5862, grad_fn=<NllLossBackward0>) tensor(5.5202, grad_fn=<NllLossBackward0>) 249 LOSS DIFF: tensor(5.5425, grad_fn=<NllLossBackward0>) tensor(5.2707, grad_fn=<NllLossBackward0>) 250 LOSS DIFF: tensor(5.6233, grad_fn=<NllLossBackward0>) tensor(5.2300, grad_fn=<NllLossBackward0>) 251 LOSS DIFF: tensor(5.4803, grad_fn=<NllLossBackward0>) tensor(5.3777, grad_fn=<NllLossBackward0>) 252 LOSS DIFF: tensor(5.6414, grad_fn=<NllLossBackward0>) tensor(5.3601, grad_fn=<NllLossBackward0>) 253 LOSS DIFF: tensor(5.2371, grad_fn=<NllLossBackward0>) tensor(5.2364, grad_fn=<NllLossBackward0>) 254 LOSS DIFF: tensor(5.3186, grad_fn=<NllLossBackward0>) tensor(5.2371, grad_fn=<NllLossBackward0>) 255 LOSS DIFF: tensor(5.6731, grad_fn=<NllLossBackward0>) tensor(5.3186, grad_fn=<NllLossBackward0>) 256 
LOSS DIFF: tensor(5.5774, grad_fn=<NllLossBackward0>) tensor(5.5003, grad_fn=<NllLossBackward0>) 257 LOSS DIFF: tensor(5.6139, grad_fn=<NllLossBackward0>) tensor(5.0909, grad_fn=<NllLossBackward0>) 258 LOSS DIFF: tensor(5.4975, grad_fn=<NllLossBackward0>) tensor(5.3252, grad_fn=<NllLossBackward0>) 259 LOSS DIFF: tensor(5.1695, grad_fn=<NllLossBackward0>) tensor(5.1682, grad_fn=<NllLossBackward0>) 260 LOSS DIFF: tensor(5.4441, grad_fn=<NllLossBackward0>) tensor(5.1695, grad_fn=<NllLossBackward0>) 261 LOSS DIFF: tensor(5.5408, grad_fn=<NllLossBackward0>) tensor(5.4441, grad_fn=<NllLossBackward0>) 262 LOSS DIFF: tensor(5.5618, grad_fn=<NllLossBackward0>) tensor(5.5408, grad_fn=<NllLossBackward0>) 263 LOSS DIFF: tensor(5.5545, grad_fn=<NllLossBackward0>) tensor(5.5457, grad_fn=<NllLossBackward0>) 264 LOSS DIFF: tensor(5.6082, grad_fn=<NllLossBackward0>) tensor(5.5545, grad_fn=<NllLossBackward0>) 265 LOSS DIFF: tensor(5.3351, grad_fn=<NllLossBackward0>) tensor(5.3258, grad_fn=<NllLossBackward0>) 266 LOSS DIFF: tensor(5.5028, grad_fn=<NllLossBackward0>) tensor(5.3351, grad_fn=<NllLossBackward0>) 267 LOSS DIFF: tensor(5.4873, grad_fn=<NllLossBackward0>) tensor(5.3415, grad_fn=<NllLossBackward0>) 268 LOSS DIFF: tensor(5.5458, grad_fn=<NllLossBackward0>) tensor(5.4873, grad_fn=<NllLossBackward0>) 269 LOSS DIFF: tensor(5.3706, grad_fn=<NllLossBackward0>) tensor(5.3371, grad_fn=<NllLossBackward0>) 270 LOSS DIFF: tensor(5.5207, grad_fn=<NllLossBackward0>) tensor(5.3706, grad_fn=<NllLossBackward0>) 271 LOSS DIFF: tensor(5.4275, grad_fn=<NllLossBackward0>) tensor(5.3686, grad_fn=<NllLossBackward0>) 272 LOSS DIFF: tensor(5.5256, grad_fn=<NllLossBackward0>) tensor(5.4275, grad_fn=<NllLossBackward0>) 273 LOSS DIFF: tensor(5.3044, grad_fn=<NllLossBackward0>) tensor(5.1722, grad_fn=<NllLossBackward0>) 274 LOSS DIFF: tensor(5.1798, grad_fn=<NllLossBackward0>) tensor(5.0866, grad_fn=<NllLossBackward0>) 275 LOSS DIFF: tensor(5.5159, grad_fn=<NllLossBackward0>) tensor(5.1798, 
grad_fn=<NllLossBackward0>) 276 LOSS DIFF: tensor(5.3755, grad_fn=<NllLossBackward0>) tensor(5.3404, grad_fn=<NllLossBackward0>) 277 LOSS DIFF: tensor(5.3817, grad_fn=<NllLossBackward0>) tensor(5.3755, grad_fn=<NllLossBackward0>) 278 LOSS DIFF: tensor(5.5214, grad_fn=<NllLossBackward0>) tensor(5.3817, grad_fn=<NllLossBackward0>) 279 LOSS DIFF: tensor(5.4231, grad_fn=<NllLossBackward0>) tensor(5.4104, grad_fn=<NllLossBackward0>) 280 LOSS DIFF: tensor(5.7068, grad_fn=<NllLossBackward0>) tensor(5.4231, grad_fn=<NllLossBackward0>) 281 LOSS DIFF: tensor(5.6217, grad_fn=<NllLossBackward0>) tensor(5.3672, grad_fn=<NllLossBackward0>) 282 LOSS DIFF: tensor(5.5297, grad_fn=<NllLossBackward0>) tensor(5.2592, grad_fn=<NllLossBackward0>) 283 LOSS DIFF: tensor(5.4354, grad_fn=<NllLossBackward0>) tensor(5.1583, grad_fn=<NllLossBackward0>) 284 LOSS DIFF: tensor(5.3529, grad_fn=<NllLossBackward0>) tensor(5.3227, grad_fn=<NllLossBackward0>) 285 LOSS DIFF: tensor(5.5201, grad_fn=<NllLossBackward0>) tensor(5.3529, grad_fn=<NllLossBackward0>) 286 LOSS DIFF: tensor(5.3654, grad_fn=<NllLossBackward0>) tensor(5.3083, grad_fn=<NllLossBackward0>) 287 LOSS DIFF: tensor(5.3719, grad_fn=<NllLossBackward0>) tensor(5.3654, grad_fn=<NllLossBackward0>) 288 LOSS DIFF: tensor(5.7598, grad_fn=<NllLossBackward0>) tensor(5.3256, grad_fn=<NllLossBackward0>) 289 LOSS DIFF: tensor(5.4723, grad_fn=<NllLossBackward0>) tensor(5.3773, grad_fn=<NllLossBackward0>) 600 tensor(5.1854, grad_fn=<NllLossBackward0>) 290 LOSS DIFF: tensor(5.2626, grad_fn=<NllLossBackward0>) tensor(5.1854, grad_fn=<NllLossBackward0>) 291 LOSS DIFF: tensor(5.3265, grad_fn=<NllLossBackward0>) tensor(5.2626, grad_fn=<NllLossBackward0>) 292 LOSS DIFF: tensor(5.3546, grad_fn=<NllLossBackward0>) tensor(5.3265, grad_fn=<NllLossBackward0>) 293 LOSS DIFF: tensor(5.4134, grad_fn=<NllLossBackward0>) tensor(5.3546, grad_fn=<NllLossBackward0>) 294 LOSS DIFF: tensor(5.3317, grad_fn=<NllLossBackward0>) tensor(5.3061, grad_fn=<NllLossBackward0>) 295 
LOSS DIFF: tensor(5.5886, grad_fn=<NllLossBackward0>) tensor(5.3317, grad_fn=<NllLossBackward0>) 296 LOSS DIFF: tensor(5.2714, grad_fn=<NllLossBackward0>) tensor(5.2538, grad_fn=<NllLossBackward0>) 297 LOSS DIFF: tensor(5.4437, grad_fn=<NllLossBackward0>) tensor(5.2699, grad_fn=<NllLossBackward0>) 298 LOSS DIFF: tensor(5.4026, grad_fn=<NllLossBackward0>) tensor(5.3539, grad_fn=<NllLossBackward0>) 299 LOSS DIFF: tensor(5.5344, grad_fn=<NllLossBackward0>) tensor(5.4026, grad_fn=<NllLossBackward0>) 300 LOSS DIFF: tensor(5.2724, grad_fn=<NllLossBackward0>) tensor(5.1554, grad_fn=<NllLossBackward0>) 301 LOSS DIFF: tensor(5.4204, grad_fn=<NllLossBackward0>) tensor(5.2614, grad_fn=<NllLossBackward0>) 302 LOSS DIFF: tensor(5.5588, grad_fn=<NllLossBackward0>) tensor(5.4204, grad_fn=<NllLossBackward0>) 303 LOSS DIFF: tensor(5.4821, grad_fn=<NllLossBackward0>) tensor(5.2939, grad_fn=<NllLossBackward0>) 304 LOSS DIFF: tensor(5.5529, grad_fn=<NllLossBackward0>) tensor(5.4821, grad_fn=<NllLossBackward0>) 305 LOSS DIFF: tensor(5.5659, grad_fn=<NllLossBackward0>) tensor(5.5529, grad_fn=<NllLossBackward0>) 306 LOSS DIFF: tensor(5.3128, grad_fn=<NllLossBackward0>) tensor(5.1975, grad_fn=<NllLossBackward0>) 307 LOSS DIFF: tensor(5.4044, grad_fn=<NllLossBackward0>) tensor(5.2514, grad_fn=<NllLossBackward0>) 308 LOSS DIFF: tensor(5.5461, grad_fn=<NllLossBackward0>) tensor(5.4044, grad_fn=<NllLossBackward0>) 309 LOSS DIFF: tensor(5.4835, grad_fn=<NllLossBackward0>) tensor(5.4153, grad_fn=<NllLossBackward0>) 310 LOSS DIFF: tensor(5.4990, grad_fn=<NllLossBackward0>) tensor(5.3391, grad_fn=<NllLossBackward0>) 311 LOSS DIFF: tensor(5.5111, grad_fn=<NllLossBackward0>) tensor(5.4990, grad_fn=<NllLossBackward0>) 312 LOSS DIFF: tensor(5.4828, grad_fn=<NllLossBackward0>) tensor(5.3784, grad_fn=<NllLossBackward0>) 313 LOSS DIFF: tensor(5.4165, grad_fn=<NllLossBackward0>) tensor(5.0706, grad_fn=<NllLossBackward0>) 314 LOSS DIFF: tensor(5.5142, grad_fn=<NllLossBackward0>) tensor(5.4165, 
grad_fn=<NllLossBackward0>) 315 LOSS DIFF: tensor(5.3397, grad_fn=<NllLossBackward0>) tensor(5.1207, grad_fn=<NllLossBackward0>) 316 LOSS DIFF: tensor(5.6205, grad_fn=<NllLossBackward0>) tensor(5.3397, grad_fn=<NllLossBackward0>) 317 LOSS DIFF: tensor(5.4190, grad_fn=<NllLossBackward0>) tensor(5.3573, grad_fn=<NllLossBackward0>) 318 LOSS DIFF: tensor(5.2788, grad_fn=<NllLossBackward0>) tensor(5.2728, grad_fn=<NllLossBackward0>) 319 LOSS DIFF: tensor(5.3070, grad_fn=<NllLossBackward0>) tensor(5.2788, grad_fn=<NllLossBackward0>) 320 LOSS DIFF: tensor(5.5223, grad_fn=<NllLossBackward0>) tensor(5.3070, grad_fn=<NllLossBackward0>) 321 LOSS DIFF: tensor(5.3895, grad_fn=<NllLossBackward0>) tensor(5.2946, grad_fn=<NllLossBackward0>) 322 LOSS DIFF: tensor(5.6954, grad_fn=<NllLossBackward0>) tensor(5.2766, grad_fn=<NllLossBackward0>) 323 LOSS DIFF: tensor(5.3206, grad_fn=<NllLossBackward0>) tensor(5.2566, grad_fn=<NllLossBackward0>) 324 LOSS DIFF: tensor(5.4333, grad_fn=<NllLossBackward0>) tensor(5.1247, grad_fn=<NllLossBackward0>) 325 LOSS DIFF: tensor(5.5108, grad_fn=<NllLossBackward0>) tensor(5.2871, grad_fn=<NllLossBackward0>) 326 LOSS DIFF: tensor(5.3659, grad_fn=<NllLossBackward0>) tensor(5.2939, grad_fn=<NllLossBackward0>) 327 LOSS DIFF: tensor(5.4602, grad_fn=<NllLossBackward0>) tensor(5.2214, grad_fn=<NllLossBackward0>) 328 LOSS DIFF: tensor(5.1405, grad_fn=<NllLossBackward0>) tensor(4.9549, grad_fn=<NllLossBackward0>) 329 LOSS DIFF: tensor(5.4136, grad_fn=<NllLossBackward0>) tensor(4.9053, grad_fn=<NllLossBackward0>) 330 LOSS DIFF: tensor(5.7120, grad_fn=<NllLossBackward0>) tensor(5.2294, grad_fn=<NllLossBackward0>) 331 LOSS DIFF: tensor(5.4775, grad_fn=<NllLossBackward0>) tensor(5.3224, grad_fn=<NllLossBackward0>) 332 LOSS DIFF: tensor(5.2917, grad_fn=<NllLossBackward0>) tensor(5.1672, grad_fn=<NllLossBackward0>) 333 LOSS DIFF: tensor(5.3209, grad_fn=<NllLossBackward0>) tensor(5.2917, grad_fn=<NllLossBackward0>) 334 LOSS DIFF: tensor(5.3745, 
grad_fn=<NllLossBackward0>) tensor(5.3209, grad_fn=<NllLossBackward0>) 335 LOSS DIFF: tensor(5.4889, grad_fn=<NllLossBackward0>) tensor(5.3172, grad_fn=<NllLossBackward0>) 336 LOSS DIFF: tensor(5.3614, grad_fn=<NllLossBackward0>) tensor(5.2868, grad_fn=<NllLossBackward0>) 337 LOSS DIFF: tensor(5.4456, grad_fn=<NllLossBackward0>) tensor(5.3614, grad_fn=<NllLossBackward0>) 338 LOSS DIFF: tensor(5.3012, grad_fn=<NllLossBackward0>) tensor(5.2641, grad_fn=<NllLossBackward0>) 339 LOSS DIFF: tensor(5.5309, grad_fn=<NllLossBackward0>) tensor(5.3012, grad_fn=<NllLossBackward0>) 340 LOSS DIFF: tensor(5.2953, grad_fn=<NllLossBackward0>) tensor(5.1931, grad_fn=<NllLossBackward0>) 341 LOSS DIFF: tensor(5.3908, grad_fn=<NllLossBackward0>) tensor(5.2953, grad_fn=<NllLossBackward0>) 342 LOSS DIFF: tensor(5.5060, grad_fn=<NllLossBackward0>) tensor(5.1682, grad_fn=<NllLossBackward0>) 700 tensor(5.1404, grad_fn=<NllLossBackward0>) 343 LOSS DIFF: tensor(5.3184, grad_fn=<NllLossBackward0>) tensor(4.8281, grad_fn=<NllLossBackward0>) 344 LOSS DIFF: tensor(5.4549, grad_fn=<NllLossBackward0>) tensor(5.3184, grad_fn=<NllLossBackward0>) 345 LOSS DIFF: tensor(5.4196, grad_fn=<NllLossBackward0>) tensor(5.4127, grad_fn=<NllLossBackward0>) 346 LOSS DIFF: tensor(5.4480, grad_fn=<NllLossBackward0>) tensor(5.4196, grad_fn=<NllLossBackward0>) 347 LOSS DIFF: tensor(5.5778, grad_fn=<NllLossBackward0>) tensor(5.3616, grad_fn=<NllLossBackward0>) 348 LOSS DIFF: tensor(5.2266, grad_fn=<NllLossBackward0>) tensor(5.1052, grad_fn=<NllLossBackward0>) 349 LOSS DIFF: tensor(5.4058, grad_fn=<NllLossBackward0>) tensor(5.2266, grad_fn=<NllLossBackward0>) 350 LOSS DIFF: tensor(5.2772, grad_fn=<NllLossBackward0>) tensor(5.1653, grad_fn=<NllLossBackward0>) 351 LOSS DIFF: tensor(5.3236, grad_fn=<NllLossBackward0>) tensor(5.2772, grad_fn=<NllLossBackward0>) 352 LOSS DIFF: tensor(5.3818, grad_fn=<NllLossBackward0>) tensor(5.3236, grad_fn=<NllLossBackward0>) 353 LOSS DIFF: tensor(5.1957, grad_fn=<NllLossBackward0>) 
tensor(5.1122, grad_fn=<NllLossBackward0>) 354 LOSS DIFF: tensor(5.2754, grad_fn=<NllLossBackward0>) tensor(5.1957, grad_fn=<NllLossBackward0>) 355 LOSS DIFF: tensor(5.4069, grad_fn=<NllLossBackward0>) tensor(5.2754, grad_fn=<NllLossBackward0>) 356 LOSS DIFF: tensor(5.3361, grad_fn=<NllLossBackward0>) tensor(5.1708, grad_fn=<NllLossBackward0>) 357 LOSS DIFF: tensor(5.5310, grad_fn=<NllLossBackward0>) tensor(5.2320, grad_fn=<NllLossBackward0>) 358 LOSS DIFF: tensor(5.5582, grad_fn=<NllLossBackward0>) tensor(5.3281, grad_fn=<NllLossBackward0>) 359 LOSS DIFF: tensor(5.4403, grad_fn=<NllLossBackward0>) tensor(5.0958, grad_fn=<NllLossBackward0>) 360 LOSS DIFF: tensor(5.3855, grad_fn=<NllLossBackward0>) tensor(5.3547, grad_fn=<NllLossBackward0>) 361 LOSS DIFF: tensor(5.4341, grad_fn=<NllLossBackward0>) tensor(5.3628, grad_fn=<NllLossBackward0>) 362 LOSS DIFF: tensor(5.4064, grad_fn=<NllLossBackward0>) tensor(5.3641, grad_fn=<NllLossBackward0>) 363 LOSS DIFF: tensor(5.4232, grad_fn=<NllLossBackward0>) tensor(5.4064, grad_fn=<NllLossBackward0>) 364 LOSS DIFF: tensor(5.4929, grad_fn=<NllLossBackward0>) tensor(5.2922, grad_fn=<NllLossBackward0>) 365 LOSS DIFF: tensor(5.2788, grad_fn=<NllLossBackward0>) tensor(5.1483, grad_fn=<NllLossBackward0>) 366 LOSS DIFF: tensor(5.3894, grad_fn=<NllLossBackward0>) tensor(5.1464, grad_fn=<NllLossBackward0>) 367 LOSS DIFF: tensor(5.5410, grad_fn=<NllLossBackward0>) tensor(5.3032, grad_fn=<NllLossBackward0>) 368 LOSS DIFF: tensor(5.4745, grad_fn=<NllLossBackward0>) tensor(5.3954, grad_fn=<NllLossBackward0>) 369 LOSS DIFF: tensor(5.4002, grad_fn=<NllLossBackward0>) tensor(5.2852, grad_fn=<NllLossBackward0>) 370 LOSS DIFF: tensor(5.5121, grad_fn=<NllLossBackward0>) tensor(5.1010, grad_fn=<NllLossBackward0>) 371 LOSS DIFF: tensor(5.1770, grad_fn=<NllLossBackward0>) tensor(4.9924, grad_fn=<NllLossBackward0>) 372 LOSS DIFF: tensor(5.2602, grad_fn=<NllLossBackward0>) tensor(5.0630, grad_fn=<NllLossBackward0>) 373 LOSS DIFF: tensor(5.1854, 
grad_fn=<NllLossBackward0>) tensor(5.1847, grad_fn=<NllLossBackward0>) 374 LOSS DIFF: tensor(5.4752, grad_fn=<NllLossBackward0>) tensor(5.1854, grad_fn=<NllLossBackward0>) 375 LOSS DIFF: tensor(5.3940, grad_fn=<NllLossBackward0>) tensor(4.9471, grad_fn=<NllLossBackward0>) 376 LOSS DIFF: tensor(5.4444, grad_fn=<NllLossBackward0>) tensor(5.3940, grad_fn=<NllLossBackward0>) 377 LOSS DIFF: tensor(5.2639, grad_fn=<NllLossBackward0>) tensor(5.2434, grad_fn=<NllLossBackward0>) 378 LOSS DIFF: tensor(5.5010, grad_fn=<NllLossBackward0>) tensor(5.2639, grad_fn=<NllLossBackward0>) 379 LOSS DIFF: tensor(5.3871, grad_fn=<NllLossBackward0>) tensor(5.2697, grad_fn=<NllLossBackward0>) 380 LOSS DIFF: tensor(5.5319, grad_fn=<NllLossBackward0>) tensor(5.2951, grad_fn=<NllLossBackward0>) 381 LOSS DIFF: tensor(5.2672, grad_fn=<NllLossBackward0>) tensor(5.0885, grad_fn=<NllLossBackward0>) 382 LOSS DIFF: tensor(5.3262, grad_fn=<NllLossBackward0>) tensor(5.2672, grad_fn=<NllLossBackward0>) 383 LOSS DIFF: tensor(5.4015, grad_fn=<NllLossBackward0>) tensor(5.3262, grad_fn=<NllLossBackward0>) 384 LOSS DIFF: tensor(5.2618, grad_fn=<NllLossBackward0>) tensor(5.2335, grad_fn=<NllLossBackward0>) 385 LOSS DIFF: tensor(5.3040, grad_fn=<NllLossBackward0>) tensor(5.2618, grad_fn=<NllLossBackward0>) 386 LOSS DIFF: tensor(5.2459, grad_fn=<NllLossBackward0>) tensor(5.0806, grad_fn=<NllLossBackward0>) 387 LOSS DIFF: tensor(5.3756, grad_fn=<NllLossBackward0>) tensor(5.2459, grad_fn=<NllLossBackward0>) 388 LOSS DIFF: tensor(5.3504, grad_fn=<NllLossBackward0>) tensor(5.1054, grad_fn=<NllLossBackward0>) 389 LOSS DIFF: tensor(5.2258, grad_fn=<NllLossBackward0>) tensor(5.1519, grad_fn=<NllLossBackward0>) 390 LOSS DIFF: tensor(5.2802, grad_fn=<NllLossBackward0>) tensor(5.2258, grad_fn=<NllLossBackward0>) 391 LOSS DIFF: tensor(5.3461, grad_fn=<NllLossBackward0>) tensor(5.2802, grad_fn=<NllLossBackward0>) 392 LOSS DIFF: tensor(5.3227, grad_fn=<NllLossBackward0>) tensor(5.2572, grad_fn=<NllLossBackward0>) 800 
tensor(5.1938, grad_fn=<NllLossBackward0>) 393 LOSS DIFF: tensor(5.4509, grad_fn=<NllLossBackward0>) tensor(5.1938, grad_fn=<NllLossBackward0>) 394 LOSS DIFF: tensor(5.1965, grad_fn=<NllLossBackward0>) tensor(5.1726, grad_fn=<NllLossBackward0>) 395 LOSS DIFF: tensor(5.3317, grad_fn=<NllLossBackward0>) tensor(5.1965, grad_fn=<NllLossBackward0>) 396 LOSS DIFF: tensor(5.2442, grad_fn=<NllLossBackward0>) tensor(5.0167, grad_fn=<NllLossBackward0>) 397 LOSS DIFF: tensor(5.2592, grad_fn=<NllLossBackward0>) tensor(5.2442, grad_fn=<NllLossBackward0>) 398 LOSS DIFF: tensor(5.2272, grad_fn=<NllLossBackward0>) tensor(5.1738, grad_fn=<NllLossBackward0>) 399 LOSS DIFF: tensor(5.2863, grad_fn=<NllLossBackward0>) tensor(5.2272, grad_fn=<NllLossBackward0>) 400 LOSS DIFF: tensor(5.3143, grad_fn=<NllLossBackward0>) tensor(5.2863, grad_fn=<NllLossBackward0>) 401 LOSS DIFF: tensor(5.0616, grad_fn=<NllLossBackward0>) tensor(5.0013, grad_fn=<NllLossBackward0>) 402 LOSS DIFF: tensor(5.4039, grad_fn=<NllLossBackward0>) tensor(5.0616, grad_fn=<NllLossBackward0>) 403 LOSS DIFF: tensor(5.3913, grad_fn=<NllLossBackward0>) tensor(4.9984, grad_fn=<NllLossBackward0>) 404 LOSS DIFF: tensor(5.2658, grad_fn=<NllLossBackward0>) tensor(5.2179, grad_fn=<NllLossBackward0>) 405 LOSS DIFF: tensor(5.2846, grad_fn=<NllLossBackward0>) tensor(5.2658, grad_fn=<NllLossBackward0>) 406 LOSS DIFF: tensor(5.3590, grad_fn=<NllLossBackward0>) tensor(5.2846, grad_fn=<NllLossBackward0>) 407 LOSS DIFF: tensor(5.4706, grad_fn=<NllLossBackward0>) tensor(5.0496, grad_fn=<NllLossBackward0>) 408 LOSS DIFF: tensor(5.6955, grad_fn=<NllLossBackward0>) tensor(5.4706, grad_fn=<NllLossBackward0>) 409 LOSS DIFF: tensor(5.4540, grad_fn=<NllLossBackward0>) tensor(4.9054, grad_fn=<NllLossBackward0>) 410 LOSS DIFF: tensor(5.1788, grad_fn=<NllLossBackward0>) tensor(5.0048, grad_fn=<NllLossBackward0>) 411 LOSS DIFF: tensor(5.2213, grad_fn=<NllLossBackward0>) tensor(5.1788, grad_fn=<NllLossBackward0>) 412 LOSS DIFF: tensor(5.2282, 
grad_fn=<NllLossBackward0>) tensor(5.2213, grad_fn=<NllLossBackward0>) 413 LOSS DIFF: tensor(5.4138, grad_fn=<NllLossBackward0>) tensor(5.1972, grad_fn=<NllLossBackward0>) 414 LOSS DIFF: tensor(5.3300, grad_fn=<NllLossBackward0>) tensor(4.9654, grad_fn=<NllLossBackward0>) 415 LOSS DIFF: tensor(5.0692, grad_fn=<NllLossBackward0>) tensor(4.9775, grad_fn=<NllLossBackward0>) 416 LOSS DIFF: tensor(5.1780, grad_fn=<NllLossBackward0>) tensor(5.0692, grad_fn=<NllLossBackward0>) 417 LOSS DIFF: tensor(5.4131, grad_fn=<NllLossBackward0>) tensor(5.1780, grad_fn=<NllLossBackward0>) 418 LOSS DIFF: tensor(5.5625, grad_fn=<NllLossBackward0>) tensor(5.4131, grad_fn=<NllLossBackward0>) 419 LOSS DIFF: tensor(5.1862, grad_fn=<NllLossBackward0>) tensor(5.1502, grad_fn=<NllLossBackward0>) 420 LOSS DIFF: tensor(5.2858, grad_fn=<NllLossBackward0>) tensor(5.1862, grad_fn=<NllLossBackward0>) 421 LOSS DIFF: tensor(5.2607, grad_fn=<NllLossBackward0>) tensor(5.2394, grad_fn=<NllLossBackward0>) 422 LOSS DIFF: tensor(5.4085, grad_fn=<NllLossBackward0>) tensor(5.2607, grad_fn=<NllLossBackward0>) 423 LOSS DIFF: tensor(5.3268, grad_fn=<NllLossBackward0>) tensor(5.3040, grad_fn=<NllLossBackward0>) 424 LOSS DIFF: tensor(5.4477, grad_fn=<NllLossBackward0>) tensor(5.3268, grad_fn=<NllLossBackward0>) 425 LOSS DIFF: tensor(5.3032, grad_fn=<NllLossBackward0>) tensor(5.2228, grad_fn=<NllLossBackward0>) 426 LOSS DIFF: tensor(5.4339, grad_fn=<NllLossBackward0>) tensor(5.2517, grad_fn=<NllLossBackward0>) 427 LOSS DIFF: tensor(5.3693, grad_fn=<NllLossBackward0>) tensor(5.0677, grad_fn=<NllLossBackward0>) 428 LOSS DIFF: tensor(5.2379, grad_fn=<NllLossBackward0>) tensor(5.2100, grad_fn=<NllLossBackward0>) 429 LOSS DIFF: tensor(5.2541, grad_fn=<NllLossBackward0>) tensor(5.2379, grad_fn=<NllLossBackward0>) 430 LOSS DIFF: tensor(5.2259, grad_fn=<NllLossBackward0>) tensor(5.1291, grad_fn=<NllLossBackward0>) 431 LOSS DIFF: tensor(5.2455, grad_fn=<NllLossBackward0>) tensor(5.1523, grad_fn=<NllLossBackward0>) 432 LOSS 
DIFF: tensor(5.3854, grad_fn=<NllLossBackward0>) tensor(5.2147, grad_fn=<NllLossBackward0>) 433 LOSS DIFF: tensor(5.2580, grad_fn=<NllLossBackward0>) tensor(5.1674, grad_fn=<NllLossBackward0>) 434 LOSS DIFF: tensor(5.3666, grad_fn=<NllLossBackward0>) tensor(5.2580, grad_fn=<NllLossBackward0>) 435 LOSS DIFF: tensor(5.3990, grad_fn=<NllLossBackward0>) tensor(5.2895, grad_fn=<NllLossBackward0>) 436 LOSS DIFF: tensor(5.4095, grad_fn=<NllLossBackward0>) tensor(5.2050, grad_fn=<NllLossBackward0>) 437 LOSS DIFF: tensor(5.3580, grad_fn=<NllLossBackward0>) tensor(5.1551, grad_fn=<NllLossBackward0>) 438 LOSS DIFF: tensor(5.5038, grad_fn=<NllLossBackward0>) tensor(5.2894, grad_fn=<NllLossBackward0>) 439 LOSS DIFF: tensor(5.3097, grad_fn=<NllLossBackward0>) tensor(5.1047, grad_fn=<NllLossBackward0>) 440 LOSS DIFF: tensor(5.4076, grad_fn=<NllLossBackward0>) tensor(5.3097, grad_fn=<NllLossBackward0>) 441 LOSS DIFF: tensor(5.3938, grad_fn=<NllLossBackward0>) tensor(5.2490, grad_fn=<NllLossBackward0>) 442 LOSS DIFF: tensor(5.6185, grad_fn=<NllLossBackward0>) tensor(5.3873, grad_fn=<NllLossBackward0>) 900 tensor(5.2894, grad_fn=<NllLossBackward0>) 443 LOSS DIFF: tensor(5.2605, grad_fn=<NllLossBackward0>) tensor(5.0513, grad_fn=<NllLossBackward0>) 444 LOSS DIFF: tensor(5.5549, grad_fn=<NllLossBackward0>) tensor(5.2605, grad_fn=<NllLossBackward0>) 445 LOSS DIFF: tensor(5.1775, grad_fn=<NllLossBackward0>) tensor(5.1379, grad_fn=<NllLossBackward0>) 446 LOSS DIFF: tensor(5.3998, grad_fn=<NllLossBackward0>) tensor(5.1775, grad_fn=<NllLossBackward0>) 447 LOSS DIFF: tensor(5.4069, grad_fn=<NllLossBackward0>) tensor(5.3169, grad_fn=<NllLossBackward0>) 448 LOSS DIFF: tensor(5.2558, grad_fn=<NllLossBackward0>) tensor(4.9919, grad_fn=<NllLossBackward0>) 449 LOSS DIFF: tensor(5.4139, grad_fn=<NllLossBackward0>) tensor(5.2558, grad_fn=<NllLossBackward0>) 450 LOSS DIFF: tensor(5.4725, grad_fn=<NllLossBackward0>) tensor(5.4139, grad_fn=<NllLossBackward0>) 451 LOSS DIFF: tensor(5.3004, 
grad_fn=<NllLossBackward0>) tensor(5.1489, grad_fn=<NllLossBackward0>) 452 LOSS DIFF: tensor(5.3943, grad_fn=<NllLossBackward0>) tensor(5.3004, grad_fn=<NllLossBackward0>) 453 LOSS DIFF: tensor(5.2652, grad_fn=<NllLossBackward0>) tensor(5.0230, grad_fn=<NllLossBackward0>) 454 LOSS DIFF: tensor(5.3982, grad_fn=<NllLossBackward0>) tensor(5.2229, grad_fn=<NllLossBackward0>) 455 LOSS DIFF: tensor(5.4184, grad_fn=<NllLossBackward0>) tensor(5.2137, grad_fn=<NllLossBackward0>) 456 LOSS DIFF: tensor(5.6858, grad_fn=<NllLossBackward0>) tensor(5.1474, grad_fn=<NllLossBackward0>) 457 LOSS DIFF: tensor(5.3886, grad_fn=<NllLossBackward0>) tensor(5.1649, grad_fn=<NllLossBackward0>) 458 LOSS DIFF: tensor(5.3129, grad_fn=<NllLossBackward0>) tensor(5.2705, grad_fn=<NllLossBackward0>) 459 LOSS DIFF: tensor(5.4430, grad_fn=<NllLossBackward0>) tensor(5.0307, grad_fn=<NllLossBackward0>) 460 LOSS DIFF: tensor(5.4555, grad_fn=<NllLossBackward0>) tensor(5.3132, grad_fn=<NllLossBackward0>) 461 LOSS DIFF: tensor(5.2490, grad_fn=<NllLossBackward0>) tensor(4.9971, grad_fn=<NllLossBackward0>) 462 LOSS DIFF: tensor(5.4743, grad_fn=<NllLossBackward0>) tensor(5.1878, grad_fn=<NllLossBackward0>) 463 LOSS DIFF: tensor(5.2897, grad_fn=<NllLossBackward0>) tensor(4.9685, grad_fn=<NllLossBackward0>) 464 LOSS DIFF: tensor(5.3322, grad_fn=<NllLossBackward0>) tensor(5.1790, grad_fn=<NllLossBackward0>) 465 LOSS DIFF: tensor(5.2013, grad_fn=<NllLossBackward0>) tensor(5.0778, grad_fn=<NllLossBackward0>) 466 LOSS DIFF: tensor(5.2347, grad_fn=<NllLossBackward0>) tensor(5.0395, grad_fn=<NllLossBackward0>) 467 LOSS DIFF: tensor(5.2472, grad_fn=<NllLossBackward0>) tensor(5.2347, grad_fn=<NllLossBackward0>) 468 LOSS DIFF: tensor(5.3672, grad_fn=<NllLossBackward0>) tensor(5.1695, grad_fn=<NllLossBackward0>) 469 LOSS DIFF: tensor(5.3892, grad_fn=<NllLossBackward0>) tensor(5.3672, grad_fn=<NllLossBackward0>) 470 LOSS DIFF: tensor(5.1295, grad_fn=<NllLossBackward0>) tensor(5.1241, grad_fn=<NllLossBackward0>) 471 LOSS 
DIFF: tensor(5.2935, grad_fn=<NllLossBackward0>) tensor(5.1295, grad_fn=<NllLossBackward0>) 472 LOSS DIFF: tensor(5.4916, grad_fn=<NllLossBackward0>) tensor(5.2935, grad_fn=<NllLossBackward0>) 473 LOSS DIFF: tensor(5.2570, grad_fn=<NllLossBackward0>) tensor(5.0166, grad_fn=<NllLossBackward0>) 474 LOSS DIFF: tensor(5.3124, grad_fn=<NllLossBackward0>) tensor(5.1387, grad_fn=<NllLossBackward0>) 475 LOSS DIFF: tensor(5.2445, grad_fn=<NllLossBackward0>) tensor(5.1581, grad_fn=<NllLossBackward0>) 476 LOSS DIFF: tensor(5.4986, grad_fn=<NllLossBackward0>) tensor(5.2445, grad_fn=<NllLossBackward0>) 477 LOSS DIFF: tensor(5.2073, grad_fn=<NllLossBackward0>) tensor(5.1772, grad_fn=<NllLossBackward0>) 478 LOSS DIFF: tensor(5.2213, grad_fn=<NllLossBackward0>) tensor(5.0682, grad_fn=<NllLossBackward0>) 479 LOSS DIFF: tensor(5.2317, grad_fn=<NllLossBackward0>) tensor(5.2213, grad_fn=<NllLossBackward0>) 480 LOSS DIFF: tensor(5.2169, grad_fn=<NllLossBackward0>) tensor(4.8229, grad_fn=<NllLossBackward0>) 481 LOSS DIFF: tensor(5.4192, grad_fn=<NllLossBackward0>) tensor(5.2169, grad_fn=<NllLossBackward0>) 482 LOSS DIFF: tensor(5.3481, grad_fn=<NllLossBackward0>) tensor(5.1884, grad_fn=<NllLossBackward0>) 483 LOSS DIFF: tensor(5.4329, grad_fn=<NllLossBackward0>) tensor(5.3481, grad_fn=<NllLossBackward0>) 484 LOSS DIFF: tensor(5.1482, grad_fn=<NllLossBackward0>) tensor(4.8979, grad_fn=<NllLossBackward0>) 485 LOSS DIFF: tensor(5.3562, grad_fn=<NllLossBackward0>) tensor(5.1482, grad_fn=<NllLossBackward0>) 486 LOSS DIFF: tensor(5.5739, grad_fn=<NllLossBackward0>) tensor(5.3562, grad_fn=<NllLossBackward0>) 487 LOSS DIFF: tensor(5.0749, grad_fn=<NllLossBackward0>) tensor(4.9742, grad_fn=<NllLossBackward0>) 488 LOSS DIFF: tensor(5.2301, grad_fn=<NllLossBackward0>) tensor(5.0749, grad_fn=<NllLossBackward0>) 489 LOSS DIFF: tensor(5.4543, grad_fn=<NllLossBackward0>) tensor(5.2301, grad_fn=<NllLossBackward0>) 490 LOSS DIFF: tensor(5.2210, grad_fn=<NllLossBackward0>) tensor(4.9663, 
grad_fn=<NllLossBackward0>) 491 LOSS DIFF: tensor(5.3469, grad_fn=<NllLossBackward0>) tensor(5.2210, grad_fn=<NllLossBackward0>) 1000 tensor(5.4116, grad_fn=<NllLossBackward0>) 492 LOSS DIFF: tensor(5.4116, grad_fn=<NllLossBackward0>) tensor(5.2156, grad_fn=<NllLossBackward0>) 493 LOSS DIFF: tensor(5.1600, grad_fn=<NllLossBackward0>) tensor(4.9976, grad_fn=<NllLossBackward0>) 494 LOSS DIFF: tensor(5.2190, grad_fn=<NllLossBackward0>) tensor(5.1102, grad_fn=<NllLossBackward0>) 495 LOSS DIFF: tensor(5.1974, grad_fn=<NllLossBackward0>) tensor(5.0123, grad_fn=<NllLossBackward0>) 496 LOSS DIFF: tensor(5.3085, grad_fn=<NllLossBackward0>) tensor(5.1974, grad_fn=<NllLossBackward0>) 497 LOSS DIFF: tensor(5.3090, grad_fn=<NllLossBackward0>) tensor(5.3085, grad_fn=<NllLossBackward0>) 498 LOSS DIFF: tensor(5.3978, grad_fn=<NllLossBackward0>) tensor(5.0467, grad_fn=<NllLossBackward0>) 499 LOSS DIFF: tensor(5.3369, grad_fn=<NllLossBackward0>) tensor(5.0919, grad_fn=<NllLossBackward0>) 500 LOSS DIFF: tensor(5.3036, grad_fn=<NllLossBackward0>) tensor(5.2151, grad_fn=<NllLossBackward0>)
# Plot the recorded training losses, then persist the trained weights.
# Each loss is detached from the autograd graph and moved to host memory
# first: Tensor.numpy() only works on CPU tensors, so without .cpu() this
# cell fails whenever training ran on a CUDA device.
plt.plot([t.detach().cpu().numpy() for t in loss_track])
plt.show()
torch.save(model.state_dict(), f'model_trigram-EMBED_SIZE={EMBED_SIZE}.bin')
# Write top-10 gap predictions for the dev-0 split.
# Each input row is tab-separated; the last two fields are the left and
# right context of the gap. The model is queried with the word just before
# the gap and the word just after it.
vocab_unique = set(vocab.get_stoi())
# Switch to inference mode (matters if the model has dropout/batch-norm
# layers; the model definition is not visible from this cell).
model.eval()
output = []
with lzma.open("dev-0/in.tsv.xz", encoding='utf8', mode="rt") as fh, torch.no_grad():
    for line in fh:
        fields = line.split("\t")
        left_tokens = re.sub(r"\\\\+n", " ", fields[-2]).split()
        right_tokens = re.sub(r"\\\\+n", " ", fields[-1]).split()
        # An empty context field yields no tokens; fall back to "" so it is
        # mapped to <unk> below instead of raising IndexError.
        first_word = re.sub('[^A-Za-z]+', '', left_tokens[-1]) if left_tokens else ""
        second_word = re.sub('[^A-Za-z]+', '', right_tokens[0]) if right_tokens else ""
        # Replace out-of-vocabulary words with the <unk> token itself, so the
        # lookup never depends on a default index being configured.
        if first_word not in vocab_unique:
            first_word = "<unk>"
        if second_word not in vocab_unique:
            second_word = "<unk>"
        input_tokens = torch.tensor([vocab.forward([first_word]), vocab.forward([second_word])]).to(device)
        out = model(input_tokens)
        top = torch.topk(out[0], 10)
        top_probs = top.values.tolist()
        top_words = vocab.lookup_tokens(top.indices.tolist())
        # Probability mass not covered by the top-10 goes to the wildcard
        # entry (":p"). Assumes the model outputs probabilities - TODO confirm.
        # Clamp guards against a tiny negative value from float rounding.
        rest_mass = max(1 - sum(top_probs), 0.0)
        entries = []
        for w, p in zip(top_words, top_probs):
            if w == "<unk>":
                # <unk> is reported through the wildcard, not as a word.
                rest_mass += p
            else:
                entries.append(f"{w}:{p:.4f}")
        # Always emit the wildcard, even when <unk> is not in the top-10.
        entries.append(f":{rest_mass:.4f}")
        output.append(" ".join(entries) + "\n")
with open(f"dev-0/out-EMBED_SIZE={EMBED_SIZE}.tsv", mode="w", encoding="utf8") as fh:
    fh.writelines(output)
C:\Users\micha\AppData\Local\Temp\ipykernel_14016\2809838665.py:15: UserWarning: Implicit dimension choice for softmax has been deprecated. Change the call to include dim=X as an argument. x = self.softmax(x)
# Write top-10 gap predictions for the test-A split.
# Each input row is tab-separated; the last two fields are the left and
# right context of the gap. The model is queried with the word just before
# the gap and the word just after it.
# NOTE: relies on `vocab_unique` (set of vocabulary tokens) built in an
# earlier cell.
model.eval()
output = []
with lzma.open("test-A/in.tsv.xz", encoding='utf8', mode="rt") as fh, torch.no_grad():
    for line in fh:
        fields = line.split("\t")
        left_tokens = re.sub(r"\\\\+n", " ", fields[-2]).split()
        right_tokens = re.sub(r"\\\\+n", " ", fields[-1]).split()
        # An empty context field yields no tokens; fall back to "" so it is
        # mapped to <unk> below instead of raising IndexError.
        first_word = re.sub('[^A-Za-z]+', '', left_tokens[-1]) if left_tokens else ""
        second_word = re.sub('[^A-Za-z]+', '', right_tokens[0]) if right_tokens else ""
        # Replace out-of-vocabulary words with the <unk> token itself, so the
        # lookup never depends on a default index being configured.
        if first_word not in vocab_unique:
            first_word = "<unk>"
        if second_word not in vocab_unique:
            second_word = "<unk>"
        input_tokens = torch.tensor([vocab.forward([first_word]), vocab.forward([second_word])]).to(device)
        out = model(input_tokens)
        top = torch.topk(out[0], 10)
        top_probs = top.values.tolist()
        top_words = vocab.lookup_tokens(top.indices.tolist())
        # Probability mass not covered by the top-10 goes to the wildcard
        # entry (":p"). Assumes the model outputs probabilities - TODO confirm.
        # Clamp guards against a tiny negative value from float rounding.
        rest_mass = max(1 - sum(top_probs), 0.0)
        entries = []
        for w, p in zip(top_words, top_probs):
            if w == "<unk>":
                # <unk> is reported through the wildcard, not as a word.
                rest_mass += p
            else:
                entries.append(f"{w}:{p:.4f}")
        # Always emit the wildcard, even when <unk> is not in the top-10.
        entries.append(f":{rest_mass:.4f}")
        output.append(" ".join(entries) + "\n")
with open(f"test-A/out-EMBED_SIZE={EMBED_SIZE}.tsv", mode="w", encoding="utf8") as fh:
    fh.writelines(output)
C:\Users\micha\AppData\Local\Temp\ipykernel_14016\2809838665.py:15: UserWarning: Implicit dimension choice for softmax has been deprecated. Change the call to include dim=X as an argument. x = self.softmax(x)