challenging-america-word-ga.../Zad_7.ipynb

260 KiB

Connect to Google Drive (running on Colab)

# Mount Google Drive inside the Colab runtime at /content/drive.
from google.colab import drive
drive.mount('/content/drive')
Mounted at /content/drive
!mkdir moj7
%cd drive
/content/drive
%cd MyDrive
/content/drive/MyDrive
%cd moj7
/content/drive/MyDrive/moj7
!pwd
/content/drive/MyDrive/moj7

Preprocess

import re
# Dataset paths, relative to the current working directory.
train_file ='train/in.tsv.xz'
test_file = 'test-A/in.tsv.xz'
# NOTE(review): presumably where test-set predictions are written; it is not
# used in the visible part of the notebook -- confirm against later cells.
out_file = 'test-A/out.tsv'

def preprocess(line):
    """Clean one raw TSV line.

    The line is first passed through ``replace_endline`` and the result is
    then handed to ``get_rid_of_header``; the output of the latter is
    returned.
    """
    return get_rid_of_header(replace_endline(line))

def get_rid_of_header(line):
    """Drop the first six tab-separated fields and return the remaining text.

    Any fields after the sixth are joined with a single space so that the
    last word of one field and the first word of the next stay separate
    tokens (joining with an empty string would fuse them into one non-word).

    Args:
        line: one tab-separated record as a string.

    Returns:
        The text of fields 7 and onward separated by single spaces, or an
        empty string when the line has fewer than seven fields.
    """
    text_fields = line.split('\t')[6:]
    return " ".join(text_fields)
    
def replace_endline(line):
    """Return ``line`` with escaped line breaks and backslashes blanked out.

    Every literal two-character sequence "backslash followed by n", and
    every other run of one or more backslashes, is replaced by one space.
    """
    escaped_break_or_backslashes = "\\\\n|\\\\+"
    return re.sub(escaped_break_or_backslashes, " ", line)
from itertools import islice
import regex as re
import sys
from torchtext.vocab import build_vocab_from_iterator
import lzma
import pickle



def get_words_from_line(line):
    """Yield the tokens of one line wrapped in sentence markers.

    Produces '<s>', then each piece obtained by splitting the preprocessed,
    right-stripped line on single spaces, and finally '</s>'.
    """
    stripped = line.rstrip()
    yield '<s>'
    yield from preprocess(stripped).split(' ')
    yield '</s>'


def get_word_lines_from_file(file_name):
    """Yield one token generator per line of an xz-compressed text file.

    Each line is decoded as UTF-8 and handed to ``get_words_from_line``.
    The running line count is printed after every 1000th line as a
    progress indicator.
    """
    with lzma.open(file_name, 'r') as archive:
        for line_no, raw_line in enumerate(archive, start=1):
            if line_no % 1000 == 0:
                print(line_no)
            yield get_words_from_line(raw_line.decode('utf-8'))
# Upper bound on the vocabulary size (passed as max_tokens below).
vocab_size = 20000

# Build the vocabulary from the token stream of the training file;
# '<unk>' is registered as a special token.
vocab = build_vocab_from_iterator(
    get_word_lines_from_file(train_file),
    max_tokens = vocab_size,
    specials = ['<unk>'])

1000
2000
3000
4000
5000
6000
7000
8000
9000
10000
11000
12000
13000
14000
15000
16000
17000
18000
19000
20000
21000
22000
23000
24000
25000
26000
27000
28000
29000
30000
31000
32000
33000
34000
35000
36000
37000
38000
39000
40000
41000
42000
43000
44000
45000
46000
47000
48000
49000
50000
51000
52000
53000
54000
55000
56000
57000
58000
59000
60000
61000
62000
63000
64000
65000
66000
67000
68000
69000
70000
71000
72000
73000
74000
75000
76000
77000
78000
79000
80000
81000
82000
83000
84000
85000
86000
87000
88000
89000
90000
91000
92000
93000
94000
95000
96000
97000
98000
99000
100000
101000
102000
103000
104000
105000
106000
107000
108000
109000
110000
111000
112000
113000
114000
115000
116000
117000
118000
119000
120000
121000
122000
123000
124000
125000
126000
127000
128000
129000
130000
131000
132000
133000
134000
135000
136000
137000
138000
139000
140000
141000
142000
143000
144000
145000
146000
147000
148000
149000
150000
151000
152000
153000
154000
155000
156000
157000
158000
159000
160000
161000
162000
163000
164000
165000
166000
167000
168000
169000
170000
171000
172000
173000
174000
175000
176000
177000
178000
179000
180000
181000
182000
183000
184000
185000
186000
187000
188000
189000
190000
191000
192000
193000
194000
195000
196000
197000
198000
199000
200000
201000
202000
203000
204000
205000
206000
207000
208000
209000
210000
211000
212000
213000
214000
215000
216000
217000
218000
219000
220000
221000
222000
223000
224000
225000
226000
227000
228000
229000
230000
231000
232000
233000
234000
235000
236000
237000
238000
239000
240000
241000
242000
243000
244000
245000
246000
247000
248000
249000
250000
251000
252000
253000
254000
255000
256000
257000
258000
259000
260000
261000
262000
263000
264000
265000
266000
267000
268000
269000
270000
271000
272000
273000
274000
275000
276000
277000
278000
279000
280000
281000
282000
283000
284000
285000
286000
287000
288000
289000
290000
291000
292000
293000
294000
295000
296000
297000
298000
299000
300000
301000
302000
303000
304000
305000
306000
307000
308000
309000
310000
311000
312000
313000
314000
315000
316000
317000
318000
319000
320000
321000
322000
323000
324000
325000
326000
327000
328000
329000
330000
331000
332000
333000
334000
335000
336000
337000
338000
339000
340000
341000
342000
343000
344000
345000
346000
347000
348000
349000
350000
351000
352000
353000
354000
355000
356000
357000
358000
359000
360000
361000
362000
363000
364000
365000
366000
367000
368000
369000
370000
371000
372000
373000
374000
375000
376000
377000
378000
379000
380000
381000
382000
383000
384000
385000
386000
387000
388000
389000
390000
391000
392000
393000
394000
395000
396000
397000
398000
399000
400000
401000
402000
403000
404000
405000
406000
407000
408000
409000
410000
411000
412000
413000
414000
415000
416000
417000
418000
419000
420000
421000
422000
423000
424000
425000
426000
427000
428000
429000
430000
431000
432000
# Spot check: look up the index assigned to the token 'no'.
vocab['no']
50
# Persist the vocabulary object to 'filename.pickle' in the working directory.
with open('filename.pickle', 'wb') as handle:
    pickle.dump(vocab, handle, protocol=pickle.HIGHEST_PROTOCOL)

Create NN

from torch import nn
import torch
import pickle
# Embedding dimensionality passed to the Bigram model.
embed_size = 150

class Bigram(nn.Module):
    """Neural bigram model: embedding -> linear layer -> softmax.

    Args:
        vocabulary_size: number of embedding rows and width of the output
            distribution.
        embedding_size: dimensionality of the token embeddings.
    """

    def __init__(self, vocabulary_size, embedding_size):
        super().__init__()
        self.model = nn.Sequential(
            nn.Embedding(vocabulary_size, embedding_size),
            nn.Linear(embedding_size, vocabulary_size),
            # Normalise explicitly over the last (vocabulary) axis. Omitting
            # ``dim`` is deprecated, emits a UserWarning on every forward
            # pass, and would pick axis 0 for 3-D activations.
            nn.Softmax(dim=-1),
        )

    def forward(self, x):
        """Return probabilities over the vocabulary for token indices ``x``.

        The output has the shape of ``x`` with one trailing axis of size
        ``vocabulary_size`` appended; values along that axis sum to 1.
        """
        return self.model(x)

# Instantiate the model with the configured vocabulary and embedding sizes.
model = Bigram(vocab_size, embed_size)

# Tokens missing from the vocabulary resolve to the index of '<unk>'.
vocab.set_default_index(vocab['<unk>'])
# Quick check: convert a single token into a tensor holding its index.
res = torch.tensor(vocab.forward(['order']))
print(res)
tensor([215])
from torch.utils.data import IterableDataset
import itertools

def look_ahead_iterator(gen):
    """Yield ``(previous, current)`` pairs of consecutive items from ``gen``.

    Nothing is yielded for the very first item. An item equal to ``None``
    is treated as "no previous item", so the element following it starts a
    new pair instead of being paired with it.
    """
    previous = None
    for current in gen:
        if previous is None:
            # No usable predecessor yet: remember this item and move on.
            previous = current
            continue
        yield (previous, current)
        previous = current

class Bigrams(IterableDataset):
    """Iterable dataset of consecutive token-index pairs from a text file.

    Args:
        text_file: path handed to ``get_word_lines_from_file``.
        vocabulary_size: ``max_tokens`` used when the vocabulary is built
            here; also stored on the instance.
        vocab: optional pre-built vocabulary. When given, it is reused
            instead of making another full pass over ``text_file`` to
            rebuild it. Its default index is set to the index of '<unk>'.
            When omitted, the vocabulary is built exactly as before.
    """

    def __init__(self, text_file, vocabulary_size, vocab=None):
        if vocab is None:
            vocab = build_vocab_from_iterator(
                get_word_lines_from_file(text_file),
                max_tokens=vocabulary_size,
                specials=['<unk>'])
        self.vocab = vocab
        # Out-of-vocabulary tokens resolve to the '<unk>' index.
        self.vocab.set_default_index(self.vocab['<unk>'])
        self.vocabulary_size = vocabulary_size
        self.text_file = text_file

    def __iter__(self):
        """Return an iterator of (previous_index, next_index) pairs.

        All lines are flattened into one token stream before pairing, so
        pairs also span line boundaries ('</s>' followed by '<s>').
        """
        token_stream = itertools.chain.from_iterable(
            get_word_lines_from_file(self.text_file))
        return look_ahead_iterator(self.vocab[token] for token in token_stream)


# Create the training dataset over the training file (the printed counters
# below are the progress output of reading that file).
train_dataset = Bigrams(train_file, vocab_size)
1000
2000
3000
4000
5000
6000
7000
8000
9000
10000
11000
12000
13000
14000
15000
16000
17000
18000
19000
20000
21000
22000
23000
24000
25000
26000
27000
28000
29000
30000
31000
32000
33000
34000
35000
36000
37000
38000
39000
40000
41000
42000
43000
44000
45000
46000
47000
48000
49000
50000
51000
52000
53000
54000
55000
56000
57000
58000
59000
60000
61000
62000
63000
64000
65000
66000
67000
68000
69000
70000
71000
72000
73000
74000
75000
76000
77000
78000
79000
80000
81000
82000
83000
84000
85000
86000
87000
88000
89000
90000
91000
92000
93000
94000
95000
96000
97000
98000
99000
100000
101000
102000
103000
104000
105000
106000
107000
108000
109000
110000
111000
112000
113000
114000
115000
116000
117000
118000
119000
120000
121000
122000
123000
124000
125000
126000
127000
128000
129000
130000
131000
132000
133000
134000
135000
136000
137000
138000
139000
140000
141000
142000
143000
144000
145000
146000
147000
148000
149000
150000
151000
152000
153000
154000
155000
156000
157000
158000
159000
160000
161000
162000
163000
164000
165000
166000
167000
168000
169000
170000
171000
172000
173000
174000
175000
176000
177000
178000
179000
180000
181000
182000
183000
184000
185000
186000
187000
188000
189000
190000
191000
192000
193000
194000
195000
196000
197000
198000
199000
200000
201000
202000
203000
204000
205000
206000
207000
208000
209000
210000
211000
212000
213000
214000
215000
216000
217000
218000
219000
220000
221000
222000
223000
224000
225000
226000
227000
228000
229000
230000
231000
232000
233000
234000
235000
236000
237000
238000
239000
240000
241000
242000
243000
244000
245000
246000
247000
248000
249000
250000
251000
252000
253000
254000
255000
256000
257000
258000
259000
260000
261000
262000
263000
264000
265000
266000
267000
268000
269000
270000
271000
272000
273000
274000
275000
276000
277000
278000
279000
280000
281000
282000
283000
284000
285000
286000
287000
288000
289000
290000
291000
292000
293000
294000
295000
296000
297000
298000
299000
300000
301000
302000
303000
304000
305000
306000
307000
308000
309000
310000
311000
312000
313000
314000
315000
316000
317000
318000
319000
320000
321000
322000
323000
324000
325000
326000
327000
328000
329000
330000
331000
332000
333000
334000
335000
336000
337000
338000
339000
340000
341000
342000
343000
344000
345000
346000
347000
348000
349000
350000
351000
352000
353000
354000
355000
356000
357000
358000
359000
360000
361000
362000
363000
364000
365000
366000
367000
368000
369000
370000
371000
372000
373000
374000
375000
376000
377000
378000
379000
380000
381000
382000
383000
384000
385000
386000
387000
388000
389000
390000
391000
392000
393000
394000
395000
396000
397000
398000
399000
400000
401000
402000
403000
404000
405000
406000
407000
408000
409000
410000
411000
412000
413000
414000
415000
416000
417000
418000
419000
420000
421000
422000
423000
424000
425000
426000
427000
428000
429000
430000
431000
432000
from torch.utils.data import DataLoader

# Peek at the first batch: five (previous, next) token-index pairs.
next(iter(DataLoader(train_dataset, batch_size=5)))
[tensor([  23,  191, 5791,    1,  112]),
 tensor([ 191, 5791,    1,  112,  159])]

Train

# Train on the GPU when one is available, otherwise on the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Training hyper-parameters.
num_epochs = 2
batch_size = 1000
log_every = 100  # report the loss once per this many optimisation steps

model = Bigram(vocab_size, embed_size).to(device)
data = DataLoader(train_dataset, batch_size=batch_size)
optimizer = torch.optim.Adam(model.parameters())
criterion = torch.nn.NLLLoss()

for i in range(num_epochs):
    print('epoch: =', i)
    model.train()
    step = 0
    for x, y in data:
        x = x.to(device)
        y = y.to(device)
        optimizer.zero_grad()
        ypredicted = model(x)
        # The model outputs probabilities while NLLLoss expects
        # log-probabilities, hence the explicit log.
        loss = criterion(torch.log(ypredicted), y)
        if step % log_every == 0:
            # Log the plain Python float rather than the tensor repr
            # (which carries device and grad_fn noise).
            print(step, loss.item())
        step += 1
        loss.backward()
        optimizer.step()
    # Checkpoint the weights at the end of every epoch.
    torch.save(model.state_dict(), 'model.bin')
epoch: = 0
/usr/local/lib/python3.10/dist-packages/torch/nn/modules/container.py:217: UserWarning: Implicit dimension choice for softmax has been deprecated. Change the call to include dim=X as an argument.
  input = module(input)
0 tensor(10.3037, device='cuda:0', grad_fn=<NllLossBackward0>)
100 tensor(8.7506, device='cuda:0', grad_fn=<NllLossBackward0>)
200 tensor(7.8141, device='cuda:0', grad_fn=<NllLossBackward0>)
1000
300 tensor(7.4218, device='cuda:0', grad_fn=<NllLossBackward0>)
400 tensor(7.1627, device='cuda:0', grad_fn=<NllLossBackward0>)
500 tensor(6.7964, device='cuda:0', grad_fn=<NllLossBackward0>)
2000
600 tensor(6.4704, device='cuda:0', grad_fn=<NllLossBackward0>)
700 tensor(6.3798, device='cuda:0', grad_fn=<NllLossBackward0>)
800 tensor(6.2849, device='cuda:0', grad_fn=<NllLossBackward0>)
3000
900 tensor(6.3975, device='cuda:0', grad_fn=<NllLossBackward0>)
1000 tensor(6.0096, device='cuda:0', grad_fn=<NllLossBackward0>)
1100 tensor(5.7434, device='cuda:0', grad_fn=<NllLossBackward0>)
4000
1200 tensor(5.9602, device='cuda:0', grad_fn=<NllLossBackward0>)
1300 tensor(6.1623, device='cuda:0', grad_fn=<NllLossBackward0>)
1400 tensor(6.1647, device='cuda:0', grad_fn=<NllLossBackward0>)
5000
1500 tensor(6.1010, device='cuda:0', grad_fn=<NllLossBackward0>)
1600 tensor(6.0634, device='cuda:0', grad_fn=<NllLossBackward0>)
6000
1700 tensor(5.9149, device='cuda:0', grad_fn=<NllLossBackward0>)
1800 tensor(5.7918, device='cuda:0', grad_fn=<NllLossBackward0>)
1900 tensor(5.6739, device='cuda:0', grad_fn=<NllLossBackward0>)
7000
2000 tensor(5.5298, device='cuda:0', grad_fn=<NllLossBackward0>)
2100 tensor(5.8011, device='cuda:0', grad_fn=<NllLossBackward0>)
2200 tensor(5.4338, device='cuda:0', grad_fn=<NllLossBackward0>)
8000
2300 tensor(5.7522, device='cuda:0', grad_fn=<NllLossBackward0>)
2400 tensor(5.0313, device='cuda:0', grad_fn=<NllLossBackward0>)
2500 tensor(5.7116, device='cuda:0', grad_fn=<NllLossBackward0>)
9000
2600 tensor(5.2706, device='cuda:0', grad_fn=<NllLossBackward0>)
2700 tensor(5.6324, device='cuda:0', grad_fn=<NllLossBackward0>)
2800 tensor(5.0710, device='cuda:0', grad_fn=<NllLossBackward0>)
10000
2900 tensor(5.5921, device='cuda:0', grad_fn=<NllLossBackward0>)
3000 tensor(5.4808, device='cuda:0', grad_fn=<NllLossBackward0>)
11000
3100 tensor(5.3611, device='cuda:0', grad_fn=<NllLossBackward0>)
3200 tensor(5.6228, device='cuda:0', grad_fn=<NllLossBackward0>)
3300 tensor(5.4286, device='cuda:0', grad_fn=<NllLossBackward0>)
12000
3400 tensor(5.3550, device='cuda:0', grad_fn=<NllLossBackward0>)
3500 tensor(5.4032, device='cuda:0', grad_fn=<NllLossBackward0>)
3600 tensor(5.1070, device='cuda:0', grad_fn=<NllLossBackward0>)
13000
3700 tensor(5.4506, device='cuda:0', grad_fn=<NllLossBackward0>)
3800 tensor(5.4622, device='cuda:0', grad_fn=<NllLossBackward0>)
3900 tensor(5.4984, device='cuda:0', grad_fn=<NllLossBackward0>)
14000
4000 tensor(5.1740, device='cuda:0', grad_fn=<NllLossBackward0>)
4100 tensor(5.6064, device='cuda:0', grad_fn=<NllLossBackward0>)
4200 tensor(5.0705, device='cuda:0', grad_fn=<NllLossBackward0>)
15000
4300 tensor(5.5181, device='cuda:0', grad_fn=<NllLossBackward0>)
4400 tensor(5.2919, device='cuda:0', grad_fn=<NllLossBackward0>)
16000
4500 tensor(5.5021, device='cuda:0', grad_fn=<NllLossBackward0>)
4600 tensor(5.5308, device='cuda:0', grad_fn=<NllLossBackward0>)
4700 tensor(5.4699, device='cuda:0', grad_fn=<NllLossBackward0>)
17000
4800 tensor(5.2686, device='cuda:0', grad_fn=<NllLossBackward0>)
4900 tensor(5.4776, device='cuda:0', grad_fn=<NllLossBackward0>)
5000 tensor(5.5061, device='cuda:0', grad_fn=<NllLossBackward0>)
18000
5100 tensor(5.3180, device='cuda:0', grad_fn=<NllLossBackward0>)
5200 tensor(5.5524, device='cuda:0', grad_fn=<NllLossBackward0>)
5300 tensor(5.3481, device='cuda:0', grad_fn=<NllLossBackward0>)
19000
5400 tensor(5.2153, device='cuda:0', grad_fn=<NllLossBackward0>)
5500 tensor(5.4478, device='cuda:0', grad_fn=<NllLossBackward0>)
20000
5600 tensor(5.3441, device='cuda:0', grad_fn=<NllLossBackward0>)
5700 tensor(5.3958, device='cuda:0', grad_fn=<NllLossBackward0>)
5800 tensor(5.8945, device='cuda:0', grad_fn=<NllLossBackward0>)
21000
5900 tensor(5.5684, device='cuda:0', grad_fn=<NllLossBackward0>)
6000 tensor(5.5715, device='cuda:0', grad_fn=<NllLossBackward0>)
6100 tensor(5.2367, device='cuda:0', grad_fn=<NllLossBackward0>)
22000
6200 tensor(5.6976, device='cuda:0', grad_fn=<NllLossBackward0>)
6300 tensor(5.5367, device='cuda:0', grad_fn=<NllLossBackward0>)
6400 tensor(5.3024, device='cuda:0', grad_fn=<NllLossBackward0>)
23000
6500 tensor(5.3010, device='cuda:0', grad_fn=<NllLossBackward0>)
6600 tensor(6.0962, device='cuda:0', grad_fn=<NllLossBackward0>)
6700 tensor(5.0961, device='cuda:0', grad_fn=<NllLossBackward0>)
24000
6800 tensor(5.1091, device='cuda:0', grad_fn=<NllLossBackward0>)
6900 tensor(5.4123, device='cuda:0', grad_fn=<NllLossBackward0>)
25000
7000 tensor(5.3128, device='cuda:0', grad_fn=<NllLossBackward0>)
7100 tensor(5.3416, device='cuda:0', grad_fn=<NllLossBackward0>)
7200 tensor(5.4973, device='cuda:0', grad_fn=<NllLossBackward0>)
26000
7300 tensor(5.4418, device='cuda:0', grad_fn=<NllLossBackward0>)
7400 tensor(5.2171, device='cuda:0', grad_fn=<NllLossBackward0>)
7500 tensor(5.6509, device='cuda:0', grad_fn=<NllLossBackward0>)
27000
7600 tensor(5.0550, device='cuda:0', grad_fn=<NllLossBackward0>)
7700 tensor(5.4937, device='cuda:0', grad_fn=<NllLossBackward0>)
7800 tensor(5.9218, device='cuda:0', grad_fn=<NllLossBackward0>)
28000
7900 tensor(5.2853, device='cuda:0', grad_fn=<NllLossBackward0>)
8000 tensor(5.3146, device='cuda:0', grad_fn=<NllLossBackward0>)
8100 tensor(4.8552, device='cuda:0', grad_fn=<NllLossBackward0>)
29000
8200 tensor(5.3389, device='cuda:0', grad_fn=<NllLossBackward0>)
8300 tensor(5.2421, device='cuda:0', grad_fn=<NllLossBackward0>)
30000
8400 tensor(5.2460, device='cuda:0', grad_fn=<NllLossBackward0>)
8500 tensor(5.0331, device='cuda:0', grad_fn=<NllLossBackward0>)
8600 tensor(5.0050, device='cuda:0', grad_fn=<NllLossBackward0>)
31000
8700 tensor(5.3844, device='cuda:0', grad_fn=<NllLossBackward0>)
8800 tensor(5.4491, device='cuda:0', grad_fn=<NllLossBackward0>)
8900 tensor(5.6790, device='cuda:0', grad_fn=<NllLossBackward0>)
32000
9000 tensor(5.1118, device='cuda:0', grad_fn=<NllLossBackward0>)
9100 tensor(5.3567, device='cuda:0', grad_fn=<NllLossBackward0>)
9200 tensor(5.4141, device='cuda:0', grad_fn=<NllLossBackward0>)
33000
9300 tensor(5.3085, device='cuda:0', grad_fn=<NllLossBackward0>)
9400 tensor(5.2808, device='cuda:0', grad_fn=<NllLossBackward0>)
34000
9500 tensor(5.0931, device='cuda:0', grad_fn=<NllLossBackward0>)
9600 tensor(5.1090, device='cuda:0', grad_fn=<NllLossBackward0>)
9700 tensor(5.2519, device='cuda:0', grad_fn=<NllLossBackward0>)
35000
9800 tensor(5.3852, device='cuda:0', grad_fn=<NllLossBackward0>)
9900 tensor(5.0943, device='cuda:0', grad_fn=<NllLossBackward0>)
10000 tensor(5.4690, device='cuda:0', grad_fn=<NllLossBackward0>)
36000
10100 tensor(5.4348, device='cuda:0', grad_fn=<NllLossBackward0>)
10200 tensor(5.3262, device='cuda:0', grad_fn=<NllLossBackward0>)
10300 tensor(5.4878, device='cuda:0', grad_fn=<NllLossBackward0>)
37000
10400 tensor(5.2384, device='cuda:0', grad_fn=<NllLossBackward0>)
10500 tensor(5.2151, device='cuda:0', grad_fn=<NllLossBackward0>)
10600 tensor(4.8722, device='cuda:0', grad_fn=<NllLossBackward0>)
38000
10700 tensor(5.4325, device='cuda:0', grad_fn=<NllLossBackward0>)
10800 tensor(4.8699, device='cuda:0', grad_fn=<NllLossBackward0>)
39000
10900 tensor(5.3448, device='cuda:0', grad_fn=<NllLossBackward0>)
11000 tensor(5.1358, device='cuda:0', grad_fn=<NllLossBackward0>)
11100 tensor(5.0432, device='cuda:0', grad_fn=<NllLossBackward0>)
40000
11200 tensor(5.4062, device='cuda:0', grad_fn=<NllLossBackward0>)
11300 tensor(5.4040, device='cuda:0', grad_fn=<NllLossBackward0>)
11400 tensor(5.5312, device='cuda:0', grad_fn=<NllLossBackward0>)
41000
11500 tensor(5.4374, device='cuda:0', grad_fn=<NllLossBackward0>)
11600 tensor(5.0998, device='cuda:0', grad_fn=<NllLossBackward0>)
11700 tensor(5.4217, device='cuda:0', grad_fn=<NllLossBackward0>)
42000
11800 tensor(5.5747, device='cuda:0', grad_fn=<NllLossBackward0>)
11900 tensor(5.0467, device='cuda:0', grad_fn=<NllLossBackward0>)
12000 tensor(5.4270, device='cuda:0', grad_fn=<NllLossBackward0>)
43000
12100 tensor(5.2043, device='cuda:0', grad_fn=<NllLossBackward0>)
12200 tensor(5.2369, device='cuda:0', grad_fn=<NllLossBackward0>)
44000
12300 tensor(5.4465, device='cuda:0', grad_fn=<NllLossBackward0>)
12400 tensor(4.9839, device='cuda:0', grad_fn=<NllLossBackward0>)
12500 tensor(5.3214, device='cuda:0', grad_fn=<NllLossBackward0>)
45000
12600 tensor(5.1928, device='cuda:0', grad_fn=<NllLossBackward0>)
12700 tensor(4.9646, device='cuda:0', grad_fn=<NllLossBackward0>)
12800 tensor(5.3325, device='cuda:0', grad_fn=<NllLossBackward0>)
46000
12900 tensor(5.4429, device='cuda:0', grad_fn=<NllLossBackward0>)
13000 tensor(5.0652, device='cuda:0', grad_fn=<NllLossBackward0>)
13100 tensor(5.3126, device='cuda:0', grad_fn=<NllLossBackward0>)
47000
13200 tensor(5.4124, device='cuda:0', grad_fn=<NllLossBackward0>)
13300 tensor(5.5385, device='cuda:0', grad_fn=<NllLossBackward0>)
13400 tensor(5.0986, device='cuda:0', grad_fn=<NllLossBackward0>)
48000
13500 tensor(5.2693, device='cuda:0', grad_fn=<NllLossBackward0>)
13600 tensor(5.2136, device='cuda:0', grad_fn=<NllLossBackward0>)
49000
13700 tensor(5.5169, device='cuda:0', grad_fn=<NllLossBackward0>)
13800 tensor(5.1840, device='cuda:0', grad_fn=<NllLossBackward0>)
13900 tensor(5.2700, device='cuda:0', grad_fn=<NllLossBackward0>)
50000
14000 tensor(5.2077, device='cuda:0', grad_fn=<NllLossBackward0>)
14100 tensor(5.3791, device='cuda:0', grad_fn=<NllLossBackward0>)
14200 tensor(5.4008, device='cuda:0', grad_fn=<NllLossBackward0>)
51000
14300 tensor(5.3506, device='cuda:0', grad_fn=<NllLossBackward0>)
14400 tensor(4.7662, device='cuda:0', grad_fn=<NllLossBackward0>)
14500 tensor(4.9474, device='cuda:0', grad_fn=<NllLossBackward0>)
52000
14600 tensor(5.0245, device='cuda:0', grad_fn=<NllLossBackward0>)
14700 tensor(5.3977, device='cuda:0', grad_fn=<NllLossBackward0>)
14800 tensor(4.9653, device='cuda:0', grad_fn=<NllLossBackward0>)
53000
14900 tensor(4.8947, device='cuda:0', grad_fn=<NllLossBackward0>)
15000 tensor(5.3548, device='cuda:0', grad_fn=<NllLossBackward0>)
54000
15100 tensor(4.7244, device='cuda:0', grad_fn=<NllLossBackward0>)
15200 tensor(4.9752, device='cuda:0', grad_fn=<NllLossBackward0>)
15300 tensor(5.3929, device='cuda:0', grad_fn=<NllLossBackward0>)
55000
15400 tensor(5.3096, device='cuda:0', grad_fn=<NllLossBackward0>)
15500 tensor(5.1247, device='cuda:0', grad_fn=<NllLossBackward0>)
15600 tensor(5.2753, device='cuda:0', grad_fn=<NllLossBackward0>)
56000
15700 tensor(5.2373, device='cuda:0', grad_fn=<NllLossBackward0>)
15800 tensor(4.9997, device='cuda:0', grad_fn=<NllLossBackward0>)
15900 tensor(5.1718, device='cuda:0', grad_fn=<NllLossBackward0>)
57000
16000 tensor(5.5952, device='cuda:0', grad_fn=<NllLossBackward0>)
16100 tensor(5.3699, device='cuda:0', grad_fn=<NllLossBackward0>)
16200 tensor(5.0923, device='cuda:0', grad_fn=<NllLossBackward0>)
58000
16300 tensor(4.9985, device='cuda:0', grad_fn=<NllLossBackward0>)
16400 tensor(5.3076, device='cuda:0', grad_fn=<NllLossBackward0>)
59000
16500 tensor(5.1994, device='cuda:0', grad_fn=<NllLossBackward0>)
16600 tensor(5.3672, device='cuda:0', grad_fn=<NllLossBackward0>)
16700 tensor(5.2054, device='cuda:0', grad_fn=<NllLossBackward0>)
60000
16800 tensor(5.3379, device='cuda:0', grad_fn=<NllLossBackward0>)
16900 tensor(5.2785, device='cuda:0', grad_fn=<NllLossBackward0>)
17000 tensor(5.2590, device='cuda:0', grad_fn=<NllLossBackward0>)
61000
17100 tensor(5.3564, device='cuda:0', grad_fn=<NllLossBackward0>)
17200 tensor(5.3598, device='cuda:0', grad_fn=<NllLossBackward0>)
17300 tensor(4.7786, device='cuda:0', grad_fn=<NllLossBackward0>)
62000
17400 tensor(5.2639, device='cuda:0', grad_fn=<NllLossBackward0>)
17500 tensor(5.2037, device='cuda:0', grad_fn=<NllLossBackward0>)
17600 tensor(5.1158, device='cuda:0', grad_fn=<NllLossBackward0>)
63000
17700 tensor(4.9831, device='cuda:0', grad_fn=<NllLossBackward0>)
17800 tensor(4.8950, device='cuda:0', grad_fn=<NllLossBackward0>)
64000
17900 tensor(5.0928, device='cuda:0', grad_fn=<NllLossBackward0>)
18000 tensor(5.3423, device='cuda:0', grad_fn=<NllLossBackward0>)
18100 tensor(5.1760, device='cuda:0', grad_fn=<NllLossBackward0>)
65000
18200 tensor(5.2021, device='cuda:0', grad_fn=<NllLossBackward0>)
18300 tensor(5.1306, device='cuda:0', grad_fn=<NllLossBackward0>)
18400 tensor(5.1199, device='cuda:0', grad_fn=<NllLossBackward0>)
66000
18500 tensor(5.2082, device='cuda:0', grad_fn=<NllLossBackward0>)
18600 tensor(5.3290, device='cuda:0', grad_fn=<NllLossBackward0>)
18700 tensor(5.2257, device='cuda:0', grad_fn=<NllLossBackward0>)
67000
18800 tensor(4.9107, device='cuda:0', grad_fn=<NllLossBackward0>)
18900 tensor(5.3400, device='cuda:0', grad_fn=<NllLossBackward0>)
68000
19000 tensor(5.1366, device='cuda:0', grad_fn=<NllLossBackward0>)
19100 tensor(5.1199, device='cuda:0', grad_fn=<NllLossBackward0>)
19200 tensor(5.2202, device='cuda:0', grad_fn=<NllLossBackward0>)
69000
19300 tensor(5.2236, device='cuda:0', grad_fn=<NllLossBackward0>)
19400 tensor(5.2953, device='cuda:0', grad_fn=<NllLossBackward0>)
19500 tensor(5.1308, device='cuda:0', grad_fn=<NllLossBackward0>)
70000
19600 tensor(5.3578, device='cuda:0', grad_fn=<NllLossBackward0>)
19700 tensor(5.1600, device='cuda:0', grad_fn=<NllLossBackward0>)
19800 tensor(4.6220, device='cuda:0', grad_fn=<NllLossBackward0>)
71000
19900 tensor(5.3731, device='cuda:0', grad_fn=<NllLossBackward0>)
20000 tensor(4.9936, device='cuda:0', grad_fn=<NllLossBackward0>)
20100 tensor(5.0817, device='cuda:0', grad_fn=<NllLossBackward0>)
72000
20200 tensor(5.1613, device='cuda:0', grad_fn=<NllLossBackward0>)
20300 tensor(5.3877, device='cuda:0', grad_fn=<NllLossBackward0>)
73000
20400 tensor(5.4114, device='cuda:0', grad_fn=<NllLossBackward0>)
20500 tensor(5.2609, device='cuda:0', grad_fn=<NllLossBackward0>)
20600 tensor(5.1378, device='cuda:0', grad_fn=<NllLossBackward0>)
74000
20700 tensor(5.0799, device='cuda:0', grad_fn=<NllLossBackward0>)
20800 tensor(5.3615, device='cuda:0', grad_fn=<NllLossBackward0>)
20900 tensor(5.3365, device='cuda:0', grad_fn=<NllLossBackward0>)
75000
21000 tensor(4.9244, device='cuda:0', grad_fn=<NllLossBackward0>)
21100 tensor(5.5084, device='cuda:0', grad_fn=<NllLossBackward0>)
21200 tensor(4.8769, device='cuda:0', grad_fn=<NllLossBackward0>)
76000
21300 tensor(5.3414, device='cuda:0', grad_fn=<NllLossBackward0>)
21400 tensor(5.0648, device='cuda:0', grad_fn=<NllLossBackward0>)
21500 tensor(5.0594, device='cuda:0', grad_fn=<NllLossBackward0>)
77000
21600 tensor(5.2537, device='cuda:0', grad_fn=<NllLossBackward0>)
21700 tensor(5.1834, device='cuda:0', grad_fn=<NllLossBackward0>)
21800 tensor(4.8151, device='cuda:0', grad_fn=<NllLossBackward0>)
78000
21900 tensor(5.3335, device='cuda:0', grad_fn=<NllLossBackward0>)
22000 tensor(4.9580, device='cuda:0', grad_fn=<NllLossBackward0>)
79000
22100 tensor(5.2262, device='cuda:0', grad_fn=<NllLossBackward0>)
22200 tensor(5.1946, device='cuda:0', grad_fn=<NllLossBackward0>)
22300 tensor(5.2404, device='cuda:0', grad_fn=<NllLossBackward0>)
80000
22400 tensor(4.9491, device='cuda:0', grad_fn=<NllLossBackward0>)
22500 tensor(4.6901, device='cuda:0', grad_fn=<NllLossBackward0>)
22600 tensor(5.1937, device='cuda:0', grad_fn=<NllLossBackward0>)
81000
22700 tensor(4.9937, device='cuda:0', grad_fn=<NllLossBackward0>)
22800 tensor(5.1401, device='cuda:0', grad_fn=<NllLossBackward0>)
22900 tensor(5.0599, device='cuda:0', grad_fn=<NllLossBackward0>)
82000
23000 tensor(5.4315, device='cuda:0', grad_fn=<NllLossBackward0>)
23100 tensor(5.1854, device='cuda:0', grad_fn=<NllLossBackward0>)
83000
23200 tensor(5.1033, device='cuda:0', grad_fn=<NllLossBackward0>)
23300 tensor(5.2352, device='cuda:0', grad_fn=<NllLossBackward0>)
23400 tensor(5.2004, device='cuda:0', grad_fn=<NllLossBackward0>)
84000
23500 tensor(5.0866, device='cuda:0', grad_fn=<NllLossBackward0>)
23600 tensor(5.2372, device='cuda:0', grad_fn=<NllLossBackward0>)
23700 tensor(5.4711, device='cuda:0', grad_fn=<NllLossBackward0>)
85000
23800 tensor(5.4030, device='cuda:0', grad_fn=<NllLossBackward0>)
23900 tensor(5.3589, device='cuda:0', grad_fn=<NllLossBackward0>)
24000 tensor(5.1646, device='cuda:0', grad_fn=<NllLossBackward0>)
86000
24100 tensor(5.4865, device='cuda:0', grad_fn=<NllLossBackward0>)
24200 tensor(5.3663, device='cuda:0', grad_fn=<NllLossBackward0>)
24300 tensor(5.1760, device='cuda:0', grad_fn=<NllLossBackward0>)
87000
24400 tensor(5.2950, device='cuda:0', grad_fn=<NllLossBackward0>)
24500 tensor(5.0376, device='cuda:0', grad_fn=<NllLossBackward0>)
88000
24600 tensor(5.1229, device='cuda:0', grad_fn=<NllLossBackward0>)
24700 tensor(5.3261, device='cuda:0', grad_fn=<NllLossBackward0>)
24800 tensor(5.3953, device='cuda:0', grad_fn=<NllLossBackward0>)
89000
24900 tensor(5.2734, device='cuda:0', grad_fn=<NllLossBackward0>)
25000 tensor(5.5544, device='cuda:0', grad_fn=<NllLossBackward0>)
25100 tensor(5.1847, device='cuda:0', grad_fn=<NllLossBackward0>)
90000
25200 tensor(5.2963, device='cuda:0', grad_fn=<NllLossBackward0>)
25300 tensor(5.2350, device='cuda:0', grad_fn=<NllLossBackward0>)
25400 tensor(5.1483, device='cuda:0', grad_fn=<NllLossBackward0>)
91000
25500 tensor(5.0571, device='cuda:0', grad_fn=<NllLossBackward0>)
25600 tensor(4.8022, device='cuda:0', grad_fn=<NllLossBackward0>)
25700 tensor(5.2115, device='cuda:0', grad_fn=<NllLossBackward0>)
92000
25800 tensor(5.1675, device='cuda:0', grad_fn=<NllLossBackward0>)
25900 tensor(5.2786, device='cuda:0', grad_fn=<NllLossBackward0>)
93000
26000 tensor(4.8893, device='cuda:0', grad_fn=<NllLossBackward0>)
26100 tensor(5.3295, device='cuda:0', grad_fn=<NllLossBackward0>)
26200 tensor(4.9849, device='cuda:0', grad_fn=<NllLossBackward0>)
94000
26300 tensor(5.2289, device='cuda:0', grad_fn=<NllLossBackward0>)
26400 tensor(5.1430, device='cuda:0', grad_fn=<NllLossBackward0>)
26500 tensor(5.1389, device='cuda:0', grad_fn=<NllLossBackward0>)
95000
26600 tensor(5.3339, device='cuda:0', grad_fn=<NllLossBackward0>)
26700 tensor(5.1904, device='cuda:0', grad_fn=<NllLossBackward0>)
26800 tensor(5.0073, device='cuda:0', grad_fn=<NllLossBackward0>)
96000
26900 tensor(5.1798, device='cuda:0', grad_fn=<NllLossBackward0>)
27000 tensor(4.9300, device='cuda:0', grad_fn=<NllLossBackward0>)
27100 tensor(5.1331, device='cuda:0', grad_fn=<NllLossBackward0>)
97000
27200 tensor(5.1750, device='cuda:0', grad_fn=<NllLossBackward0>)
27300 tensor(5.1801, device='cuda:0', grad_fn=<NllLossBackward0>)
98000
27400 tensor(5.0430, device='cuda:0', grad_fn=<NllLossBackward0>)
27500 tensor(5.3107, device='cuda:0', grad_fn=<NllLossBackward0>)
27600 tensor(5.2876, device='cuda:0', grad_fn=<NllLossBackward0>)
99000
27700 tensor(5.2763, device='cuda:0', grad_fn=<NllLossBackward0>)
27800 tensor(5.0737, device='cuda:0', grad_fn=<NllLossBackward0>)
27900 tensor(5.0593, device='cuda:0', grad_fn=<NllLossBackward0>)
100000
28000 tensor(5.4758, device='cuda:0', grad_fn=<NllLossBackward0>)
28100 tensor(5.4440, device='cuda:0', grad_fn=<NllLossBackward0>)
28200 tensor(5.0862, device='cuda:0', grad_fn=<NllLossBackward0>)
101000
28300 tensor(5.2405, device='cuda:0', grad_fn=<NllLossBackward0>)
28400 tensor(4.8600, device='cuda:0', grad_fn=<NllLossBackward0>)
28500 tensor(5.2056, device='cuda:0', grad_fn=<NllLossBackward0>)
102000
28600 tensor(5.6196, device='cuda:0', grad_fn=<NllLossBackward0>)
28700 tensor(5.2235, device='cuda:0', grad_fn=<NllLossBackward0>)
103000
28800 tensor(5.2124, device='cuda:0', grad_fn=<NllLossBackward0>)
28900 tensor(4.8836, device='cuda:0', grad_fn=<NllLossBackward0>)
29000 tensor(4.9913, device='cuda:0', grad_fn=<NllLossBackward0>)
104000
29100 tensor(5.2689, device='cuda:0', grad_fn=<NllLossBackward0>)
29200 tensor(4.9352, device='cuda:0', grad_fn=<NllLossBackward0>)
29300 tensor(5.3143, device='cuda:0', grad_fn=<NllLossBackward0>)
105000
29400 tensor(5.2134, device='cuda:0', grad_fn=<NllLossBackward0>)
29500 tensor(5.0963, device='cuda:0', grad_fn=<NllLossBackward0>)
29600 tensor(5.4291, device='cuda:0', grad_fn=<NllLossBackward0>)
106000
29700 tensor(4.8093, device='cuda:0', grad_fn=<NllLossBackward0>)
29800 tensor(4.9271, device='cuda:0', grad_fn=<NllLossBackward0>)
29900 tensor(5.0479, device='cuda:0', grad_fn=<NllLossBackward0>)
107000
30000 tensor(5.2494, device='cuda:0', grad_fn=<NllLossBackward0>)
30100 tensor(5.2027, device='cuda:0', grad_fn=<NllLossBackward0>)
108000
30200 tensor(5.2714, device='cuda:0', grad_fn=<NllLossBackward0>)
30300 tensor(4.9223, device='cuda:0', grad_fn=<NllLossBackward0>)
30400 tensor(4.8816, device='cuda:0', grad_fn=<NllLossBackward0>)
109000
30500 tensor(4.9169, device='cuda:0', grad_fn=<NllLossBackward0>)
30600 tensor(5.1237, device='cuda:0', grad_fn=<NllLossBackward0>)
30700 tensor(5.1190, device='cuda:0', grad_fn=<NllLossBackward0>)
110000
30800 tensor(5.2271, device='cuda:0', grad_fn=<NllLossBackward0>)
30900 tensor(5.0194, device='cuda:0', grad_fn=<NllLossBackward0>)
31000 tensor(5.2287, device='cuda:0', grad_fn=<NllLossBackward0>)
111000
31100 tensor(5.3456, device='cuda:0', grad_fn=<NllLossBackward0>)
31200 tensor(5.0911, device='cuda:0', grad_fn=<NllLossBackward0>)
31300 tensor(5.3793, device='cuda:0', grad_fn=<NllLossBackward0>)
112000
31400 tensor(5.0410, device='cuda:0', grad_fn=<NllLossBackward0>)
31500 tensor(5.0014, device='cuda:0', grad_fn=<NllLossBackward0>)
113000
31600 tensor(4.9323, device='cuda:0', grad_fn=<NllLossBackward0>)
31700 tensor(5.3869, device='cuda:0', grad_fn=<NllLossBackward0>)
31800 tensor(4.8621, device='cuda:0', grad_fn=<NllLossBackward0>)
114000
31900 tensor(5.1184, device='cuda:0', grad_fn=<NllLossBackward0>)
32000 tensor(5.2873, device='cuda:0', grad_fn=<NllLossBackward0>)
32100 tensor(4.8723, device='cuda:0', grad_fn=<NllLossBackward0>)
115000
32200 tensor(5.4421, device='cuda:0', grad_fn=<NllLossBackward0>)
32300 tensor(5.0732, device='cuda:0', grad_fn=<NllLossBackward0>)
32400 tensor(5.0461, device='cuda:0', grad_fn=<NllLossBackward0>)
116000
32500 tensor(5.3476, device='cuda:0', grad_fn=<NllLossBackward0>)
32600 tensor(5.1149, device='cuda:0', grad_fn=<NllLossBackward0>)
117000
32700 tensor(5.2261, device='cuda:0', grad_fn=<NllLossBackward0>)
32800 tensor(5.1622, device='cuda:0', grad_fn=<NllLossBackward0>)
32900 tensor(5.0429, device='cuda:0', grad_fn=<NllLossBackward0>)
118000
33000 tensor(5.1329, device='cuda:0', grad_fn=<NllLossBackward0>)
33100 tensor(5.2284, device='cuda:0', grad_fn=<NllLossBackward0>)
33200 tensor(5.1794, device='cuda:0', grad_fn=<NllLossBackward0>)
119000
33300 tensor(4.8535, device='cuda:0', grad_fn=<NllLossBackward0>)
33400 tensor(4.7420, device='cuda:0', grad_fn=<NllLossBackward0>)
33500 tensor(5.4625, device='cuda:0', grad_fn=<NllLossBackward0>)
120000
33600 tensor(4.9975, device='cuda:0', grad_fn=<NllLossBackward0>)
33700 tensor(5.2172, device='cuda:0', grad_fn=<NllLossBackward0>)
33800 tensor(5.1135, device='cuda:0', grad_fn=<NllLossBackward0>)
121000
33900 tensor(5.1974, device='cuda:0', grad_fn=<NllLossBackward0>)
34000 tensor(5.0866, device='cuda:0', grad_fn=<NllLossBackward0>)
122000
34100 tensor(5.1738, device='cuda:0', grad_fn=<NllLossBackward0>)
34200 tensor(5.2338, device='cuda:0', grad_fn=<NllLossBackward0>)
34300 tensor(5.3866, device='cuda:0', grad_fn=<NllLossBackward0>)
123000
34400 tensor(5.2937, device='cuda:0', grad_fn=<NllLossBackward0>)
34500 tensor(5.4265, device='cuda:0', grad_fn=<NllLossBackward0>)
34600 tensor(5.0077, device='cuda:0', grad_fn=<NllLossBackward0>)
124000
34700 tensor(5.3335, device='cuda:0', grad_fn=<NllLossBackward0>)
34800 tensor(4.9879, device='cuda:0', grad_fn=<NllLossBackward0>)
34900 tensor(5.3270, device='cuda:0', grad_fn=<NllLossBackward0>)
125000
35000 tensor(5.2056, device='cuda:0', grad_fn=<NllLossBackward0>)
35100 tensor(5.1341, device='cuda:0', grad_fn=<NllLossBackward0>)
35200 tensor(5.2629, device='cuda:0', grad_fn=<NllLossBackward0>)
126000
35300 tensor(5.0193, device='cuda:0', grad_fn=<NllLossBackward0>)
35400 tensor(5.2414, device='cuda:0', grad_fn=<NllLossBackward0>)
127000
35500 tensor(5.3116, device='cuda:0', grad_fn=<NllLossBackward0>)
35600 tensor(5.3802, device='cuda:0', grad_fn=<NllLossBackward0>)
35700 tensor(5.5126, device='cuda:0', grad_fn=<NllLossBackward0>)
128000
35800 tensor(5.2014, device='cuda:0', grad_fn=<NllLossBackward0>)
35900 tensor(5.2182, device='cuda:0', grad_fn=<NllLossBackward0>)
36000 tensor(5.2556, device='cuda:0', grad_fn=<NllLossBackward0>)
129000
36100 tensor(5.2955, device='cuda:0', grad_fn=<NllLossBackward0>)
36200 tensor(5.2068, device='cuda:0', grad_fn=<NllLossBackward0>)
36300 tensor(5.4835, device='cuda:0', grad_fn=<NllLossBackward0>)
130000
36400 tensor(5.2844, device='cuda:0', grad_fn=<NllLossBackward0>)
36500 tensor(5.4568, device='cuda:0', grad_fn=<NllLossBackward0>)
131000
36600 tensor(4.9546, device='cuda:0', grad_fn=<NllLossBackward0>)
36700 tensor(5.5436, device='cuda:0', grad_fn=<NllLossBackward0>)
36800 tensor(5.4078, device='cuda:0', grad_fn=<NllLossBackward0>)
132000
36900 tensor(4.9905, device='cuda:0', grad_fn=<NllLossBackward0>)
37000 tensor(5.2532, device='cuda:0', grad_fn=<NllLossBackward0>)
37100 tensor(4.9246, device='cuda:0', grad_fn=<NllLossBackward0>)
133000
37200 tensor(4.8595, device='cuda:0', grad_fn=<NllLossBackward0>)
37300 tensor(4.9436, device='cuda:0', grad_fn=<NllLossBackward0>)
37400 tensor(4.9957, device='cuda:0', grad_fn=<NllLossBackward0>)
134000
37500 tensor(4.8686, device='cuda:0', grad_fn=<NllLossBackward0>)
37600 tensor(4.8965, device='cuda:0', grad_fn=<NllLossBackward0>)
37700 tensor(5.1955, device='cuda:0', grad_fn=<NllLossBackward0>)
135000
37800 tensor(5.2660, device='cuda:0', grad_fn=<NllLossBackward0>)
37900 tensor(5.3396, device='cuda:0', grad_fn=<NllLossBackward0>)
136000
38000 tensor(5.0582, device='cuda:0', grad_fn=<NllLossBackward0>)
38100 tensor(5.2351, device='cuda:0', grad_fn=<NllLossBackward0>)
38200 tensor(4.9177, device='cuda:0', grad_fn=<NllLossBackward0>)
137000
38300 tensor(5.3007, device='cuda:0', grad_fn=<NllLossBackward0>)
38400 tensor(4.9814, device='cuda:0', grad_fn=<NllLossBackward0>)
38500 tensor(5.2672, device='cuda:0', grad_fn=<NllLossBackward0>)
138000
38600 tensor(4.8161, device='cuda:0', grad_fn=<NllLossBackward0>)
38700 tensor(5.1975, device='cuda:0', grad_fn=<NllLossBackward0>)
38800 tensor(4.9423, device='cuda:0', grad_fn=<NllLossBackward0>)
139000
38900 tensor(5.0342, device='cuda:0', grad_fn=<NllLossBackward0>)
39000 tensor(5.2703, device='cuda:0', grad_fn=<NllLossBackward0>)
39100 tensor(4.9539, device='cuda:0', grad_fn=<NllLossBackward0>)
140000
39200 tensor(5.4193, device='cuda:0', grad_fn=<NllLossBackward0>)
39300 tensor(5.1109, device='cuda:0', grad_fn=<NllLossBackward0>)
141000
39400 tensor(5.0914, device='cuda:0', grad_fn=<NllLossBackward0>)
39500 tensor(5.3126, device='cuda:0', grad_fn=<NllLossBackward0>)
39600 tensor(4.9927, device='cuda:0', grad_fn=<NllLossBackward0>)
142000
39700 tensor(5.3725, device='cuda:0', grad_fn=<NllLossBackward0>)
39800 tensor(5.2926, device='cuda:0', grad_fn=<NllLossBackward0>)
39900 tensor(4.8999, device='cuda:0', grad_fn=<NllLossBackward0>)
143000
40000 tensor(5.0333, device='cuda:0', grad_fn=<NllLossBackward0>)
40100 tensor(5.1059, device='cuda:0', grad_fn=<NllLossBackward0>)
40200 tensor(5.1685, device='cuda:0', grad_fn=<NllLossBackward0>)
144000
40300 tensor(5.1375, device='cuda:0', grad_fn=<NllLossBackward0>)
40400 tensor(5.3952, device='cuda:0', grad_fn=<NllLossBackward0>)
40500 tensor(4.8467, device='cuda:0', grad_fn=<NllLossBackward0>)
145000
40600 tensor(5.4346, device='cuda:0', grad_fn=<NllLossBackward0>)
40700 tensor(5.2852, device='cuda:0', grad_fn=<NllLossBackward0>)
146000
40800 tensor(5.0129, device='cuda:0', grad_fn=<NllLossBackward0>)
40900 tensor(5.0657, device='cuda:0', grad_fn=<NllLossBackward0>)
41000 tensor(5.1874, device='cuda:0', grad_fn=<NllLossBackward0>)
147000
41100 tensor(5.0094, device='cuda:0', grad_fn=<NllLossBackward0>)
41200 tensor(5.2361, device='cuda:0', grad_fn=<NllLossBackward0>)
41300 tensor(4.8607, device='cuda:0', grad_fn=<NllLossBackward0>)
148000
41400 tensor(4.9584, device='cuda:0', grad_fn=<NllLossBackward0>)
41500 tensor(5.3754, device='cuda:0', grad_fn=<NllLossBackward0>)
41600 tensor(4.9372, device='cuda:0', grad_fn=<NllLossBackward0>)
149000
41700 tensor(5.2398, device='cuda:0', grad_fn=<NllLossBackward0>)
41800 tensor(5.1772, device='cuda:0', grad_fn=<NllLossBackward0>)
41900 tensor(5.1525, device='cuda:0', grad_fn=<NllLossBackward0>)
150000
42000 tensor(5.2827, device='cuda:0', grad_fn=<NllLossBackward0>)
42100 tensor(4.6855, device='cuda:0', grad_fn=<NllLossBackward0>)
151000
42200 tensor(5.0215, device='cuda:0', grad_fn=<NllLossBackward0>)
42300 tensor(5.1695, device='cuda:0', grad_fn=<NllLossBackward0>)
42400 tensor(5.0091, device='cuda:0', grad_fn=<NllLossBackward0>)
152000
42500 tensor(5.1274, device='cuda:0', grad_fn=<NllLossBackward0>)
42600 tensor(4.6771, device='cuda:0', grad_fn=<NllLossBackward0>)
42700 tensor(5.3160, device='cuda:0', grad_fn=<NllLossBackward0>)
153000
42800 tensor(5.2484, device='cuda:0', grad_fn=<NllLossBackward0>)
42900 tensor(5.0599, device='cuda:0', grad_fn=<NllLossBackward0>)
43000 tensor(5.2023, device='cuda:0', grad_fn=<NllLossBackward0>)
154000
43100 tensor(4.9059, device='cuda:0', grad_fn=<NllLossBackward0>)
43200 tensor(5.0727, device='cuda:0', grad_fn=<NllLossBackward0>)
43300 tensor(5.1986, device='cuda:0', grad_fn=<NllLossBackward0>)
155000
43400 tensor(5.1041, device='cuda:0', grad_fn=<NllLossBackward0>)
43500 tensor(5.4856, device='cuda:0', grad_fn=<NllLossBackward0>)
156000
43600 tensor(5.3947, device='cuda:0', grad_fn=<NllLossBackward0>)
43700 tensor(5.0493, device='cuda:0', grad_fn=<NllLossBackward0>)
43800 tensor(5.1703, device='cuda:0', grad_fn=<NllLossBackward0>)
157000
43900 tensor(4.9899, device='cuda:0', grad_fn=<NllLossBackward0>)
44000 tensor(4.8695, device='cuda:0', grad_fn=<NllLossBackward0>)
44100 tensor(5.3520, device='cuda:0', grad_fn=<NllLossBackward0>)
158000
44200 tensor(4.9379, device='cuda:0', grad_fn=<NllLossBackward0>)
44300 tensor(5.4677, device='cuda:0', grad_fn=<NllLossBackward0>)
44400 tensor(5.0739, device='cuda:0', grad_fn=<NllLossBackward0>)
159000
44500 tensor(4.9127, device='cuda:0', grad_fn=<NllLossBackward0>)
44600 tensor(5.0247, device='cuda:0', grad_fn=<NllLossBackward0>)
160000
44700 tensor(4.9641, device='cuda:0', grad_fn=<NllLossBackward0>)
44800 tensor(5.2538, device='cuda:0', grad_fn=<NllLossBackward0>)
44900 tensor(5.0657, device='cuda:0', grad_fn=<NllLossBackward0>)
161000
45000 tensor(4.9174, device='cuda:0', grad_fn=<NllLossBackward0>)
45100 tensor(5.2311, device='cuda:0', grad_fn=<NllLossBackward0>)
45200 tensor(5.0617, device='cuda:0', grad_fn=<NllLossBackward0>)
162000
45300 tensor(5.2111, device='cuda:0', grad_fn=<NllLossBackward0>)
45400 tensor(4.6137, device='cuda:0', grad_fn=<NllLossBackward0>)
45500 tensor(4.9275, device='cuda:0', grad_fn=<NllLossBackward0>)
163000
45600 tensor(5.2501, device='cuda:0', grad_fn=<NllLossBackward0>)
45700 tensor(5.2526, device='cuda:0', grad_fn=<NllLossBackward0>)
45800 tensor(5.1342, device='cuda:0', grad_fn=<NllLossBackward0>)
164000
45900 tensor(4.9400, device='cuda:0', grad_fn=<NllLossBackward0>)
46000 tensor(5.1521, device='cuda:0', grad_fn=<NllLossBackward0>)
165000
46100 tensor(5.0033, device='cuda:0', grad_fn=<NllLossBackward0>)
46200 tensor(4.9825, device='cuda:0', grad_fn=<NllLossBackward0>)
46300 tensor(5.3210, device='cuda:0', grad_fn=<NllLossBackward0>)
166000
46400 tensor(5.3635, device='cuda:0', grad_fn=<NllLossBackward0>)
46500 tensor(4.9169, device='cuda:0', grad_fn=<NllLossBackward0>)
46600 tensor(4.9497, device='cuda:0', grad_fn=<NllLossBackward0>)
167000
46700 tensor(5.2150, device='cuda:0', grad_fn=<NllLossBackward0>)
46800 tensor(4.8570, device='cuda:0', grad_fn=<NllLossBackward0>)
46900 tensor(4.9026, device='cuda:0', grad_fn=<NllLossBackward0>)
168000
47000 tensor(5.1486, device='cuda:0', grad_fn=<NllLossBackward0>)
47100 tensor(5.4026, device='cuda:0', grad_fn=<NllLossBackward0>)
47200 tensor(5.1028, device='cuda:0', grad_fn=<NllLossBackward0>)
169000
47300 tensor(5.2411, device='cuda:0', grad_fn=<NllLossBackward0>)
47400 tensor(4.9942, device='cuda:0', grad_fn=<NllLossBackward0>)
170000
47500 tensor(5.1984, device='cuda:0', grad_fn=<NllLossBackward0>)
47600 tensor(5.1100, device='cuda:0', grad_fn=<NllLossBackward0>)
47700 tensor(5.1202, device='cuda:0', grad_fn=<NllLossBackward0>)
171000
47800 tensor(5.2505, device='cuda:0', grad_fn=<NllLossBackward0>)
47900 tensor(5.2392, device='cuda:0', grad_fn=<NllLossBackward0>)
48000 tensor(4.9209, device='cuda:0', grad_fn=<NllLossBackward0>)
172000
48100 tensor(5.3051, device='cuda:0', grad_fn=<NllLossBackward0>)
48200 tensor(5.1233, device='cuda:0', grad_fn=<NllLossBackward0>)
48300 tensor(5.2021, device='cuda:0', grad_fn=<NllLossBackward0>)
173000
48400 tensor(4.8006, device='cuda:0', grad_fn=<NllLossBackward0>)
48500 tensor(5.0496, device='cuda:0', grad_fn=<NllLossBackward0>)
48600 tensor(5.1875, device='cuda:0', grad_fn=<NllLossBackward0>)
174000
48700 tensor(5.1499, device='cuda:0', grad_fn=<NllLossBackward0>)
48800 tensor(4.9676, device='cuda:0', grad_fn=<NllLossBackward0>)
175000
48900 tensor(5.2299, device='cuda:0', grad_fn=<NllLossBackward0>)
49000 tensor(5.0078, device='cuda:0', grad_fn=<NllLossBackward0>)
49100 tensor(5.1948, device='cuda:0', grad_fn=<NllLossBackward0>)
176000
49200 tensor(5.2544, device='cuda:0', grad_fn=<NllLossBackward0>)
49300 tensor(5.0308, device='cuda:0', grad_fn=<NllLossBackward0>)
49400 tensor(5.2188, device='cuda:0', grad_fn=<NllLossBackward0>)
177000
49500 tensor(5.3279, device='cuda:0', grad_fn=<NllLossBackward0>)
49600 tensor(4.9758, device='cuda:0', grad_fn=<NllLossBackward0>)
49700 tensor(5.0799, device='cuda:0', grad_fn=<NllLossBackward0>)
178000
49800 tensor(5.0324, device='cuda:0', grad_fn=<NllLossBackward0>)
49900 tensor(5.2124, device='cuda:0', grad_fn=<NllLossBackward0>)
50000 tensor(5.1213, device='cuda:0', grad_fn=<NllLossBackward0>)
179000
50100 tensor(5.2672, device='cuda:0', grad_fn=<NllLossBackward0>)
50200 tensor(4.9766, device='cuda:0', grad_fn=<NllLossBackward0>)
180000
50300 tensor(5.2499, device='cuda:0', grad_fn=<NllLossBackward0>)
50400 tensor(4.9455, device='cuda:0', grad_fn=<NllLossBackward0>)
50500 tensor(5.2477, device='cuda:0', grad_fn=<NllLossBackward0>)
181000
50600 tensor(5.0518, device='cuda:0', grad_fn=<NllLossBackward0>)
50700 tensor(4.9514, device='cuda:0', grad_fn=<NllLossBackward0>)
50800 tensor(4.8579, device='cuda:0', grad_fn=<NllLossBackward0>)
182000
50900 tensor(5.1602, device='cuda:0', grad_fn=<NllLossBackward0>)
51000 tensor(5.2193, device='cuda:0', grad_fn=<NllLossBackward0>)
51100 tensor(5.0982, device='cuda:0', grad_fn=<NllLossBackward0>)
183000
51200 tensor(5.2880, device='cuda:0', grad_fn=<NllLossBackward0>)
51300 tensor(4.8375, device='cuda:0', grad_fn=<NllLossBackward0>)
51400 tensor(5.1995, device='cuda:0', grad_fn=<NllLossBackward0>)
184000
51500 tensor(5.1672, device='cuda:0', grad_fn=<NllLossBackward0>)
51600 tensor(5.2276, device='cuda:0', grad_fn=<NllLossBackward0>)
185000
51700 tensor(5.0882, device='cuda:0', grad_fn=<NllLossBackward0>)
51800 tensor(5.2304, device='cuda:0', grad_fn=<NllLossBackward0>)
51900 tensor(5.0226, device='cuda:0', grad_fn=<NllLossBackward0>)
186000
52000 tensor(5.1911, device='cuda:0', grad_fn=<NllLossBackward0>)
52100 tensor(5.3534, device='cuda:0', grad_fn=<NllLossBackward0>)
52200 tensor(5.1108, device='cuda:0', grad_fn=<NllLossBackward0>)
187000
52300 tensor(5.0774, device='cuda:0', grad_fn=<NllLossBackward0>)
52400 tensor(5.1188, device='cuda:0', grad_fn=<NllLossBackward0>)
52500 tensor(5.1506, device='cuda:0', grad_fn=<NllLossBackward0>)
188000
52600 tensor(5.0832, device='cuda:0', grad_fn=<NllLossBackward0>)
52700 tensor(5.2014, device='cuda:0', grad_fn=<NllLossBackward0>)
189000
52800 tensor(5.2696, device='cuda:0', grad_fn=<NllLossBackward0>)
52900 tensor(5.0779, device='cuda:0', grad_fn=<NllLossBackward0>)
53000 tensor(5.0554, device='cuda:0', grad_fn=<NllLossBackward0>)
190000
53100 tensor(5.0068, device='cuda:0', grad_fn=<NllLossBackward0>)
53200 tensor(4.7833, device='cuda:0', grad_fn=<NllLossBackward0>)
53300 tensor(5.2709, device='cuda:0', grad_fn=<NllLossBackward0>)
191000
53400 tensor(5.1926, device='cuda:0', grad_fn=<NllLossBackward0>)
53500 tensor(5.0873, device='cuda:0', grad_fn=<NllLossBackward0>)
53600 tensor(5.1936, device='cuda:0', grad_fn=<NllLossBackward0>)
192000
53700 tensor(5.1878, device='cuda:0', grad_fn=<NllLossBackward0>)
53800 tensor(4.8307, device='cuda:0', grad_fn=<NllLossBackward0>)
53900 tensor(4.8449, device='cuda:0', grad_fn=<NllLossBackward0>)
193000
54000 tensor(5.3690, device='cuda:0', grad_fn=<NllLossBackward0>)
54100 tensor(4.4637, device='cuda:0', grad_fn=<NllLossBackward0>)
194000
54200 tensor(5.2497, device='cuda:0', grad_fn=<NllLossBackward0>)
54300 tensor(5.0490, device='cuda:0', grad_fn=<NllLossBackward0>)
54400 tensor(4.7058, device='cuda:0', grad_fn=<NllLossBackward0>)
195000
54500 tensor(5.2924, device='cuda:0', grad_fn=<NllLossBackward0>)
54600 tensor(5.3235, device='cuda:0', grad_fn=<NllLossBackward0>)
54700 tensor(5.0440, device='cuda:0', grad_fn=<NllLossBackward0>)
196000
54800 tensor(5.2241, device='cuda:0', grad_fn=<NllLossBackward0>)
54900 tensor(4.9492, device='cuda:0', grad_fn=<NllLossBackward0>)
55000 tensor(5.0754, device='cuda:0', grad_fn=<NllLossBackward0>)
197000
55100 tensor(5.1074, device='cuda:0', grad_fn=<NllLossBackward0>)
55200 tensor(5.2155, device='cuda:0', grad_fn=<NllLossBackward0>)
55300 tensor(5.1454, device='cuda:0', grad_fn=<NllLossBackward0>)
198000
55400 tensor(5.2931, device='cuda:0', grad_fn=<NllLossBackward0>)
55500 tensor(5.3043, device='cuda:0', grad_fn=<NllLossBackward0>)
199000
55600 tensor(4.8684, device='cuda:0', grad_fn=<NllLossBackward0>)
55700 tensor(5.0869, device='cuda:0', grad_fn=<NllLossBackward0>)
55800 tensor(4.6445, device='cuda:0', grad_fn=<NllLossBackward0>)
200000
55900 tensor(5.1047, device='cuda:0', grad_fn=<NllLossBackward0>)
56000 tensor(4.9902, device='cuda:0', grad_fn=<NllLossBackward0>)
56100 tensor(5.2816, device='cuda:0', grad_fn=<NllLossBackward0>)
201000
56200 tensor(5.3301, device='cuda:0', grad_fn=<NllLossBackward0>)
56300 tensor(5.1437, device='cuda:0', grad_fn=<NllLossBackward0>)
56400 tensor(5.2254, device='cuda:0', grad_fn=<NllLossBackward0>)
202000
56500 tensor(5.2339, device='cuda:0', grad_fn=<NllLossBackward0>)
56600 tensor(4.9340, device='cuda:0', grad_fn=<NllLossBackward0>)
56700 tensor(5.0758, device='cuda:0', grad_fn=<NllLossBackward0>)
203000
56800 tensor(5.5707, device='cuda:0', grad_fn=<NllLossBackward0>)
56900 tensor(4.9705, device='cuda:0', grad_fn=<NllLossBackward0>)
204000
57000 tensor(5.3555, device='cuda:0', grad_fn=<NllLossBackward0>)
57100 tensor(5.1206, device='cuda:0', grad_fn=<NllLossBackward0>)
57200 tensor(4.9509, device='cuda:0', grad_fn=<NllLossBackward0>)
205000
57300 tensor(5.0644, device='cuda:0', grad_fn=<NllLossBackward0>)
57400 tensor(5.2678, device='cuda:0', grad_fn=<NllLossBackward0>)
57500 tensor(4.9110, device='cuda:0', grad_fn=<NllLossBackward0>)
206000
57600 tensor(4.7788, device='cuda:0', grad_fn=<NllLossBackward0>)
57700 tensor(4.9975, device='cuda:0', grad_fn=<NllLossBackward0>)
57800 tensor(5.3311, device='cuda:0', grad_fn=<NllLossBackward0>)
207000
57900 tensor(5.2600, device='cuda:0', grad_fn=<NllLossBackward0>)
58000 tensor(5.0070, device='cuda:0', grad_fn=<NllLossBackward0>)
208000
58100 tensor(5.3885, device='cuda:0', grad_fn=<NllLossBackward0>)
58200 tensor(5.4209, device='cuda:0', grad_fn=<NllLossBackward0>)
58300 tensor(4.9688, device='cuda:0', grad_fn=<NllLossBackward0>)
209000
58400 tensor(4.8874, device='cuda:0', grad_fn=<NllLossBackward0>)
58500 tensor(4.9976, device='cuda:0', grad_fn=<NllLossBackward0>)
58600 tensor(5.4753, device='cuda:0', grad_fn=<NllLossBackward0>)
210000
58700 tensor(4.9572, device='cuda:0', grad_fn=<NllLossBackward0>)
58800 tensor(5.1014, device='cuda:0', grad_fn=<NllLossBackward0>)
58900 tensor(5.2920, device='cuda:0', grad_fn=<NllLossBackward0>)
211000
59000 tensor(5.2007, device='cuda:0', grad_fn=<NllLossBackward0>)
59100 tensor(5.0399, device='cuda:0', grad_fn=<NllLossBackward0>)
59200 tensor(5.1078, device='cuda:0', grad_fn=<NllLossBackward0>)
212000
59300 tensor(4.9478, device='cuda:0', grad_fn=<NllLossBackward0>)
59400 tensor(5.1439, device='cuda:0', grad_fn=<NllLossBackward0>)
213000
59500 tensor(5.0825, device='cuda:0', grad_fn=<NllLossBackward0>)
59600 tensor(5.0066, device='cuda:0', grad_fn=<NllLossBackward0>)
59700 tensor(4.8766, device='cuda:0', grad_fn=<NllLossBackward0>)
214000
59800 tensor(5.0725, device='cuda:0', grad_fn=<NllLossBackward0>)
59900 tensor(5.1101, device='cuda:0', grad_fn=<NllLossBackward0>)
60000 tensor(5.1072, device='cuda:0', grad_fn=<NllLossBackward0>)
215000
60100 tensor(5.1821, device='cuda:0', grad_fn=<NllLossBackward0>)
60200 tensor(5.2384, device='cuda:0', grad_fn=<NllLossBackward0>)
60300 tensor(5.0115, device='cuda:0', grad_fn=<NllLossBackward0>)
216000
60400 tensor(5.1308, device='cuda:0', grad_fn=<NllLossBackward0>)
60500 tensor(5.1859, device='cuda:0', grad_fn=<NllLossBackward0>)
60600 tensor(5.0918, device='cuda:0', grad_fn=<NllLossBackward0>)
217000
60700 tensor(5.0108, device='cuda:0', grad_fn=<NllLossBackward0>)
60800 tensor(5.1547, device='cuda:0', grad_fn=<NllLossBackward0>)
218000
60900 tensor(5.2653, device='cuda:0', grad_fn=<NllLossBackward0>)
61000 tensor(5.2235, device='cuda:0', grad_fn=<NllLossBackward0>)
61100 tensor(5.1075, device='cuda:0', grad_fn=<NllLossBackward0>)
219000
61200 tensor(4.8467, device='cuda:0', grad_fn=<NllLossBackward0>)
61300 tensor(5.0838, device='cuda:0', grad_fn=<NllLossBackward0>)
61400 tensor(5.2590, device='cuda:0', grad_fn=<NllLossBackward0>)
220000
61500 tensor(5.1743, device='cuda:0', grad_fn=<NllLossBackward0>)
61600 tensor(4.9954, device='cuda:0', grad_fn=<NllLossBackward0>)
61700 tensor(4.9919, device='cuda:0', grad_fn=<NllLossBackward0>)
221000
61800 tensor(5.0254, device='cuda:0', grad_fn=<NllLossBackward0>)
61900 tensor(5.0908, device='cuda:0', grad_fn=<NllLossBackward0>)
62000 tensor(5.2753, device='cuda:0', grad_fn=<NllLossBackward0>)
222000
62100 tensor(4.8669, device='cuda:0', grad_fn=<NllLossBackward0>)
62200 tensor(4.9641, device='cuda:0', grad_fn=<NllLossBackward0>)
223000
62300 tensor(5.0582, device='cuda:0', grad_fn=<NllLossBackward0>)
62400 tensor(4.9853, device='cuda:0', grad_fn=<NllLossBackward0>)
62500 tensor(5.0922, device='cuda:0', grad_fn=<NllLossBackward0>)
224000
62600 tensor(4.9824, device='cuda:0', grad_fn=<NllLossBackward0>)
62700 tensor(5.1782, device='cuda:0', grad_fn=<NllLossBackward0>)
62800 tensor(4.8954, device='cuda:0', grad_fn=<NllLossBackward0>)
225000
62900 tensor(5.3762, device='cuda:0', grad_fn=<NllLossBackward0>)
63000 tensor(5.2436, device='cuda:0', grad_fn=<NllLossBackward0>)
63100 tensor(5.3535, device='cuda:0', grad_fn=<NllLossBackward0>)
226000
63200 tensor(5.1772, device='cuda:0', grad_fn=<NllLossBackward0>)
63300 tensor(4.9112, device='cuda:0', grad_fn=<NllLossBackward0>)
63400 tensor(5.0962, device='cuda:0', grad_fn=<NllLossBackward0>)
227000
63500 tensor(4.9657, device='cuda:0', grad_fn=<NllLossBackward0>)
63600 tensor(5.0014, device='cuda:0', grad_fn=<NllLossBackward0>)
228000
63700 tensor(5.0686, device='cuda:0', grad_fn=<NllLossBackward0>)
63800 tensor(5.3235, device='cuda:0', grad_fn=<NllLossBackward0>)
63900 tensor(5.3906, device='cuda:0', grad_fn=<NllLossBackward0>)
229000
64000 tensor(4.9716, device='cuda:0', grad_fn=<NllLossBackward0>)
64100 tensor(5.2290, device='cuda:0', grad_fn=<NllLossBackward0>)
64200 tensor(5.1423, device='cuda:0', grad_fn=<NllLossBackward0>)
230000
64300 tensor(5.1310, device='cuda:0', grad_fn=<NllLossBackward0>)
64400 tensor(4.9695, device='cuda:0', grad_fn=<NllLossBackward0>)
64500 tensor(4.9604, device='cuda:0', grad_fn=<NllLossBackward0>)
231000
64600 tensor(4.9843, device='cuda:0', grad_fn=<NllLossBackward0>)
64700 tensor(5.4459, device='cuda:0', grad_fn=<NllLossBackward0>)
64800 tensor(5.1474, device='cuda:0', grad_fn=<NllLossBackward0>)
232000
64900 tensor(5.2715, device='cuda:0', grad_fn=<NllLossBackward0>)
65000 tensor(5.2314, device='cuda:0', grad_fn=<NllLossBackward0>)
233000
65100 tensor(4.7774, device='cuda:0', grad_fn=<NllLossBackward0>)
65200 tensor(5.3146, device='cuda:0', grad_fn=<NllLossBackward0>)
65300 tensor(5.1604, device='cuda:0', grad_fn=<NllLossBackward0>)
234000
65400 tensor(5.0736, device='cuda:0', grad_fn=<NllLossBackward0>)
65500 tensor(5.1152, device='cuda:0', grad_fn=<NllLossBackward0>)
65600 tensor(5.1409, device='cuda:0', grad_fn=<NllLossBackward0>)
235000
65700 tensor(4.9068, device='cuda:0', grad_fn=<NllLossBackward0>)
65800 tensor(4.9353, device='cuda:0', grad_fn=<NllLossBackward0>)
65900 tensor(5.1612, device='cuda:0', grad_fn=<NllLossBackward0>)
236000
66000 tensor(5.1866, device='cuda:0', grad_fn=<NllLossBackward0>)
66100 tensor(4.8965, device='cuda:0', grad_fn=<NllLossBackward0>)
237000
66200 tensor(4.9215, device='cuda:0', grad_fn=<NllLossBackward0>)
66300 tensor(4.8641, device='cuda:0', grad_fn=<NllLossBackward0>)
66400 tensor(5.1550, device='cuda:0', grad_fn=<NllLossBackward0>)
238000
66500 tensor(4.8722, device='cuda:0', grad_fn=<NllLossBackward0>)
66600 tensor(5.0268, device='cuda:0', grad_fn=<NllLossBackward0>)
66700 tensor(4.9948, device='cuda:0', grad_fn=<NllLossBackward0>)
239000
66800 tensor(5.0565, device='cuda:0', grad_fn=<NllLossBackward0>)
66900 tensor(5.2030, device='cuda:0', grad_fn=<NllLossBackward0>)
67000 tensor(5.2199, device='cuda:0', grad_fn=<NllLossBackward0>)
240000
67100 tensor(5.0560, device='cuda:0', grad_fn=<NllLossBackward0>)
67200 tensor(5.4027, device='cuda:0', grad_fn=<NllLossBackward0>)
67300 tensor(5.0631, device='cuda:0', grad_fn=<NllLossBackward0>)
241000
67400 tensor(5.2687, device='cuda:0', grad_fn=<NllLossBackward0>)
67500 tensor(5.0728, device='cuda:0', grad_fn=<NllLossBackward0>)
242000
67600 tensor(5.1050, device='cuda:0', grad_fn=<NllLossBackward0>)
67700 tensor(5.0495, device='cuda:0', grad_fn=<NllLossBackward0>)
67800 tensor(5.2014, device='cuda:0', grad_fn=<NllLossBackward0>)
243000
67900 tensor(4.9060, device='cuda:0', grad_fn=<NllLossBackward0>)
68000 tensor(4.9703, device='cuda:0', grad_fn=<NllLossBackward0>)
68100 tensor(4.8457, device='cuda:0', grad_fn=<NllLossBackward0>)
244000
68200 tensor(5.1298, device='cuda:0', grad_fn=<NllLossBackward0>)
68300 tensor(5.0272, device='cuda:0', grad_fn=<NllLossBackward0>)
68400 tensor(5.1238, device='cuda:0', grad_fn=<NllLossBackward0>)
245000
68500 tensor(5.1778, device='cuda:0', grad_fn=<NllLossBackward0>)
68600 tensor(5.0997, device='cuda:0', grad_fn=<NllLossBackward0>)
68700 tensor(4.7577, device='cuda:0', grad_fn=<NllLossBackward0>)
246000
68800 tensor(5.1663, device='cuda:0', grad_fn=<NllLossBackward0>)
68900 tensor(4.8385, device='cuda:0', grad_fn=<NllLossBackward0>)
247000
69000 tensor(5.0810, device='cuda:0', grad_fn=<NllLossBackward0>)
69100 tensor(4.8565, device='cuda:0', grad_fn=<NllLossBackward0>)
69200 tensor(4.8088, device='cuda:0', grad_fn=<NllLossBackward0>)
248000
69300 tensor(5.2732, device='cuda:0', grad_fn=<NllLossBackward0>)
69400 tensor(5.1549, device='cuda:0', grad_fn=<NllLossBackward0>)
69500 tensor(5.1323, device='cuda:0', grad_fn=<NllLossBackward0>)
249000
69600 tensor(5.0379, device='cuda:0', grad_fn=<NllLossBackward0>)
69700 tensor(5.3768, device='cuda:0', grad_fn=<NllLossBackward0>)
69800 tensor(4.9056, device='cuda:0', grad_fn=<NllLossBackward0>)
250000
69900 tensor(5.4631, device='cuda:0', grad_fn=<NllLossBackward0>)
70000 tensor(5.1071, device='cuda:0', grad_fn=<NllLossBackward0>)
70100 tensor(5.0137, device='cuda:0', grad_fn=<NllLossBackward0>)
251000
70200 tensor(5.1769, device='cuda:0', grad_fn=<NllLossBackward0>)
70300 tensor(4.8435, device='cuda:0', grad_fn=<NllLossBackward0>)
252000
70400 tensor(5.2020, device='cuda:0', grad_fn=<NllLossBackward0>)
70500 tensor(5.1395, device='cuda:0', grad_fn=<NllLossBackward0>)
70600 tensor(5.3541, device='cuda:0', grad_fn=<NllLossBackward0>)
253000
70700 tensor(5.3092, device='cuda:0', grad_fn=<NllLossBackward0>)
70800 tensor(5.4793, device='cuda:0', grad_fn=<NllLossBackward0>)
70900 tensor(5.2799, device='cuda:0', grad_fn=<NllLossBackward0>)
254000
71000 tensor(5.0872, device='cuda:0', grad_fn=<NllLossBackward0>)
71100 tensor(5.1728, device='cuda:0', grad_fn=<NllLossBackward0>)
71200 tensor(5.2739, device='cuda:0', grad_fn=<NllLossBackward0>)
255000
71300 tensor(4.8960, device='cuda:0', grad_fn=<NllLossBackward0>)
71400 tensor(5.0185, device='cuda:0', grad_fn=<NllLossBackward0>)
71500 tensor(5.0367, device='cuda:0', grad_fn=<NllLossBackward0>)
256000
71600 tensor(5.1992, device='cuda:0', grad_fn=<NllLossBackward0>)
71700 tensor(5.2508, device='cuda:0', grad_fn=<NllLossBackward0>)
257000
71800 tensor(5.3864, device='cuda:0', grad_fn=<NllLossBackward0>)
71900 tensor(5.0381, device='cuda:0', grad_fn=<NllLossBackward0>)
72000 tensor(5.4758, device='cuda:0', grad_fn=<NllLossBackward0>)
258000
72100 tensor(5.1863, device='cuda:0', grad_fn=<NllLossBackward0>)
72200 tensor(5.0258, device='cuda:0', grad_fn=<NllLossBackward0>)
72300 tensor(5.1253, device='cuda:0', grad_fn=<NllLossBackward0>)
259000
72400 tensor(5.0412, device='cuda:0', grad_fn=<NllLossBackward0>)
72500 tensor(5.1304, device='cuda:0', grad_fn=<NllLossBackward0>)
72600 tensor(5.1833, device='cuda:0', grad_fn=<NllLossBackward0>)
260000
72700 tensor(4.8666, device='cuda:0', grad_fn=<NllLossBackward0>)
72800 tensor(5.3088, device='cuda:0', grad_fn=<NllLossBackward0>)
72900 tensor(5.0482, device='cuda:0', grad_fn=<NllLossBackward0>)
261000
73000 tensor(5.2591, device='cuda:0', grad_fn=<NllLossBackward0>)
73100 tensor(4.9459, device='cuda:0', grad_fn=<NllLossBackward0>)
262000
73200 tensor(5.0086, device='cuda:0', grad_fn=<NllLossBackward0>)
73300 tensor(4.9859, device='cuda:0', grad_fn=<NllLossBackward0>)
73400 tensor(5.2442, device='cuda:0', grad_fn=<NllLossBackward0>)
263000
73500 tensor(5.0674, device='cuda:0', grad_fn=<NllLossBackward0>)
73600 tensor(5.0003, device='cuda:0', grad_fn=<NllLossBackward0>)
73700 tensor(5.1476, device='cuda:0', grad_fn=<NllLossBackward0>)
264000
73800 tensor(5.1845, device='cuda:0', grad_fn=<NllLossBackward0>)
73900 tensor(5.1606, device='cuda:0', grad_fn=<NllLossBackward0>)
74000 tensor(5.3307, device='cuda:0', grad_fn=<NllLossBackward0>)
265000
74100 tensor(5.0139, device='cuda:0', grad_fn=<NllLossBackward0>)
74200 tensor(5.0479, device='cuda:0', grad_fn=<NllLossBackward0>)
74300 tensor(5.0041, device='cuda:0', grad_fn=<NllLossBackward0>)
266000
74400 tensor(5.1568, device='cuda:0', grad_fn=<NllLossBackward0>)
74500 tensor(4.7452, device='cuda:0', grad_fn=<NllLossBackward0>)
267000
74600 tensor(5.2662, device='cuda:0', grad_fn=<NllLossBackward0>)
74700 tensor(5.3145, device='cuda:0', grad_fn=<NllLossBackward0>)
74800 tensor(5.0200, device='cuda:0', grad_fn=<NllLossBackward0>)
268000
74900 tensor(4.8437, device='cuda:0', grad_fn=<NllLossBackward0>)
75000 tensor(5.1458, device='cuda:0', grad_fn=<NllLossBackward0>)
75100 tensor(5.3639, device='cuda:0', grad_fn=<NllLossBackward0>)
269000
75200 tensor(5.0922, device='cuda:0', grad_fn=<NllLossBackward0>)
75300 tensor(5.3002, device='cuda:0', grad_fn=<NllLossBackward0>)
75400 tensor(5.0810, device='cuda:0', grad_fn=<NllLossBackward0>)
270000
75500 tensor(5.3591, device='cuda:0', grad_fn=<NllLossBackward0>)
75600 tensor(5.0206, device='cuda:0', grad_fn=<NllLossBackward0>)
271000
75700 tensor(5.0759, device='cuda:0', grad_fn=<NllLossBackward0>)
75800 tensor(5.0702, device='cuda:0', grad_fn=<NllLossBackward0>)
75900 tensor(5.1349, device='cuda:0', grad_fn=<NllLossBackward0>)
272000
76000 tensor(4.8964, device='cuda:0', grad_fn=<NllLossBackward0>)
76100 tensor(4.8618, device='cuda:0', grad_fn=<NllLossBackward0>)
76200 tensor(4.9370, device='cuda:0', grad_fn=<NllLossBackward0>)
273000
76300 tensor(5.0666, device='cuda:0', grad_fn=<NllLossBackward0>)
76400 tensor(4.9261, device='cuda:0', grad_fn=<NllLossBackward0>)
76500 tensor(5.0910, device='cuda:0', grad_fn=<NllLossBackward0>)
274000
76600 tensor(5.1985, device='cuda:0', grad_fn=<NllLossBackward0>)
76700 tensor(4.8590, device='cuda:0', grad_fn=<NllLossBackward0>)
76800 tensor(4.7918, device='cuda:0', grad_fn=<NllLossBackward0>)
275000
76900 tensor(5.1386, device='cuda:0', grad_fn=<NllLossBackward0>)
77000 tensor(5.0233, device='cuda:0', grad_fn=<NllLossBackward0>)
276000
77100 tensor(5.3827, device='cuda:0', grad_fn=<NllLossBackward0>)
77200 tensor(5.0204, device='cuda:0', grad_fn=<NllLossBackward0>)
77300 tensor(4.9805, device='cuda:0', grad_fn=<NllLossBackward0>)
277000
77400 tensor(5.0943, device='cuda:0', grad_fn=<NllLossBackward0>)
77500 tensor(4.7828, device='cuda:0', grad_fn=<NllLossBackward0>)
77600 tensor(5.1103, device='cuda:0', grad_fn=<NllLossBackward0>)
278000
77700 tensor(5.2850, device='cuda:0', grad_fn=<NllLossBackward0>)
77800 tensor(4.8699, device='cuda:0', grad_fn=<NllLossBackward0>)
77900 tensor(4.7829, device='cuda:0', grad_fn=<NllLossBackward0>)
279000
78000 tensor(5.2236, device='cuda:0', grad_fn=<NllLossBackward0>)
78100 tensor(4.8891, device='cuda:0', grad_fn=<NllLossBackward0>)
78200 tensor(5.1773, device='cuda:0', grad_fn=<NllLossBackward0>)
280000
78300 tensor(5.0470, device='cuda:0', grad_fn=<NllLossBackward0>)
78400 tensor(5.0570, device='cuda:0', grad_fn=<NllLossBackward0>)
281000
78500 tensor(5.4172, device='cuda:0', grad_fn=<NllLossBackward0>)
78600 tensor(4.8807, device='cuda:0', grad_fn=<NllLossBackward0>)
78700 tensor(5.3329, device='cuda:0', grad_fn=<NllLossBackward0>)
282000
78800 tensor(5.2173, device='cuda:0', grad_fn=<NllLossBackward0>)
78900 tensor(5.4813, device='cuda:0', grad_fn=<NllLossBackward0>)
79000 tensor(4.9973, device='cuda:0', grad_fn=<NllLossBackward0>)
283000
79100 tensor(4.7734, device='cuda:0', grad_fn=<NllLossBackward0>)
79200 tensor(4.8357, device='cuda:0', grad_fn=<NllLossBackward0>)
79300 tensor(5.3520, device='cuda:0', grad_fn=<NllLossBackward0>)
284000
79400 tensor(5.1467, device='cuda:0', grad_fn=<NllLossBackward0>)
79500 tensor(5.0294, device='cuda:0', grad_fn=<NllLossBackward0>)
79600 tensor(5.6394, device='cuda:0', grad_fn=<NllLossBackward0>)
285000
79700 tensor(4.7860, device='cuda:0', grad_fn=<NllLossBackward0>)
79800 tensor(5.0916, device='cuda:0', grad_fn=<NllLossBackward0>)
286000
79900 tensor(5.0640, device='cuda:0', grad_fn=<NllLossBackward0>)
80000 tensor(5.2131, device='cuda:0', grad_fn=<NllLossBackward0>)
80100 tensor(5.2826, device='cuda:0', grad_fn=<NllLossBackward0>)
287000
80200 tensor(5.3061, device='cuda:0', grad_fn=<NllLossBackward0>)
80300 tensor(5.2656, device='cuda:0', grad_fn=<NllLossBackward0>)
80400 tensor(5.3741, device='cuda:0', grad_fn=<NllLossBackward0>)
288000
80500 tensor(5.0930, device='cuda:0', grad_fn=<NllLossBackward0>)
80600 tensor(5.2616, device='cuda:0', grad_fn=<NllLossBackward0>)
80700 tensor(5.3157, device='cuda:0', grad_fn=<NllLossBackward0>)
289000
80800 tensor(4.8360, device='cuda:0', grad_fn=<NllLossBackward0>)
80900 tensor(5.2270, device='cuda:0', grad_fn=<NllLossBackward0>)
290000
81000 tensor(5.1687, device='cuda:0', grad_fn=<NllLossBackward0>)
81100 tensor(5.1558, device='cuda:0', grad_fn=<NllLossBackward0>)
81200 tensor(4.9090, device='cuda:0', grad_fn=<NllLossBackward0>)
291000
81300 tensor(4.9953, device='cuda:0', grad_fn=<NllLossBackward0>)
81400 tensor(5.1353, device='cuda:0', grad_fn=<NllLossBackward0>)
81500 tensor(5.1045, device='cuda:0', grad_fn=<NllLossBackward0>)
292000
81600 tensor(5.1020, device='cuda:0', grad_fn=<NllLossBackward0>)
81700 tensor(5.2123, device='cuda:0', grad_fn=<NllLossBackward0>)
81800 tensor(5.0393, device='cuda:0', grad_fn=<NllLossBackward0>)
293000
81900 tensor(5.2712, device='cuda:0', grad_fn=<NllLossBackward0>)
82000 tensor(5.0969, device='cuda:0', grad_fn=<NllLossBackward0>)
82100 tensor(5.1216, device='cuda:0', grad_fn=<NllLossBackward0>)
294000
82200 tensor(5.0557, device='cuda:0', grad_fn=<NllLossBackward0>)
82300 tensor(4.9278, device='cuda:0', grad_fn=<NllLossBackward0>)
295000
82400 tensor(5.1703, device='cuda:0', grad_fn=<NllLossBackward0>)
82500 tensor(4.8852, device='cuda:0', grad_fn=<NllLossBackward0>)
82600 tensor(5.0861, device='cuda:0', grad_fn=<NllLossBackward0>)
296000
82700 tensor(5.1290, device='cuda:0', grad_fn=<NllLossBackward0>)
82800 tensor(5.1089, device='cuda:0', grad_fn=<NllLossBackward0>)
82900 tensor(4.7936, device='cuda:0', grad_fn=<NllLossBackward0>)
297000
83000 tensor(5.1975, device='cuda:0', grad_fn=<NllLossBackward0>)
83100 tensor(5.0460, device='cuda:0', grad_fn=<NllLossBackward0>)
83200 tensor(5.2952, device='cuda:0', grad_fn=<NllLossBackward0>)
298000
83300 tensor(4.9863, device='cuda:0', grad_fn=<NllLossBackward0>)
83400 tensor(5.0724, device='cuda:0', grad_fn=<NllLossBackward0>)
83500 tensor(5.0200, device='cuda:0', grad_fn=<NllLossBackward0>)
299000
83600 tensor(4.9496, device='cuda:0', grad_fn=<NllLossBackward0>)
83700 tensor(5.1706, device='cuda:0', grad_fn=<NllLossBackward0>)
300000
83800 tensor(4.8822, device='cuda:0', grad_fn=<NllLossBackward0>)
83900 tensor(4.9673, device='cuda:0', grad_fn=<NllLossBackward0>)
84000 tensor(5.1188, device='cuda:0', grad_fn=<NllLossBackward0>)
301000
84100 tensor(5.4809, device='cuda:0', grad_fn=<NllLossBackward0>)
84200 tensor(4.6303, device='cuda:0', grad_fn=<NllLossBackward0>)
84300 tensor(5.0608, device='cuda:0', grad_fn=<NllLossBackward0>)
302000
84400 tensor(4.8835, device='cuda:0', grad_fn=<NllLossBackward0>)
84500 tensor(5.0710, device='cuda:0', grad_fn=<NllLossBackward0>)
84600 tensor(5.2347, device='cuda:0', grad_fn=<NllLossBackward0>)
303000
84700 tensor(4.9578, device='cuda:0', grad_fn=<NllLossBackward0>)
84800 tensor(4.9840, device='cuda:0', grad_fn=<NllLossBackward0>)
84900 tensor(5.2946, device='cuda:0', grad_fn=<NllLossBackward0>)
304000
85000 tensor(5.1757, device='cuda:0', grad_fn=<NllLossBackward0>)
85100 tensor(5.0449, device='cuda:0', grad_fn=<NllLossBackward0>)
305000
85200 tensor(5.0524, device='cuda:0', grad_fn=<NllLossBackward0>)
85300 tensor(5.3156, device='cuda:0', grad_fn=<NllLossBackward0>)
85400 tensor(5.2982, device='cuda:0', grad_fn=<NllLossBackward0>)
306000
85500 tensor(4.9904, device='cuda:0', grad_fn=<NllLossBackward0>)
85600 tensor(5.1111, device='cuda:0', grad_fn=<NllLossBackward0>)
85700 tensor(5.1132, device='cuda:0', grad_fn=<NllLossBackward0>)
307000
85800 tensor(5.1134, device='cuda:0', grad_fn=<NllLossBackward0>)
85900 tensor(5.3567, device='cuda:0', grad_fn=<NllLossBackward0>)
86000 tensor(5.1374, device='cuda:0', grad_fn=<NllLossBackward0>)
308000
86100 tensor(4.8926, device='cuda:0', grad_fn=<NllLossBackward0>)
86200 tensor(5.0359, device='cuda:0', grad_fn=<NllLossBackward0>)
86300 tensor(5.0061, device='cuda:0', grad_fn=<NllLossBackward0>)
309000
86400 tensor(5.0968, device='cuda:0', grad_fn=<NllLossBackward0>)
86500 tensor(5.1935, device='cuda:0', grad_fn=<NllLossBackward0>)
310000
86600 tensor(5.0132, device='cuda:0', grad_fn=<NllLossBackward0>)
86700 tensor(4.9140, device='cuda:0', grad_fn=<NllLossBackward0>)
86800 tensor(4.9166, device='cuda:0', grad_fn=<NllLossBackward0>)
311000
86900 tensor(4.8179, device='cuda:0', grad_fn=<NllLossBackward0>)
87000 tensor(5.3136, device='cuda:0', grad_fn=<NllLossBackward0>)
87100 tensor(5.1485, device='cuda:0', grad_fn=<NllLossBackward0>)
312000
87200 tensor(5.1635, device='cuda:0', grad_fn=<NllLossBackward0>)
87300 tensor(5.0104, device='cuda:0', grad_fn=<NllLossBackward0>)
87400 tensor(5.0568, device='cuda:0', grad_fn=<NllLossBackward0>)
313000
87500 tensor(5.1699, device='cuda:0', grad_fn=<NllLossBackward0>)
87600 tensor(5.2978, device='cuda:0', grad_fn=<NllLossBackward0>)
87700 tensor(4.9475, device='cuda:0', grad_fn=<NllLossBackward0>)
314000
87800 tensor(4.8997, device='cuda:0', grad_fn=<NllLossBackward0>)
87900 tensor(5.1651, device='cuda:0', grad_fn=<NllLossBackward0>)
315000
88000 tensor(4.6506, device='cuda:0', grad_fn=<NllLossBackward0>)
88100 tensor(5.0203, device='cuda:0', grad_fn=<NllLossBackward0>)
88200 tensor(5.2400, device='cuda:0', grad_fn=<NllLossBackward0>)
316000
88300 tensor(5.1478, device='cuda:0', grad_fn=<NllLossBackward0>)
88400 tensor(5.2234, device='cuda:0', grad_fn=<NllLossBackward0>)
88500 tensor(4.8418, device='cuda:0', grad_fn=<NllLossBackward0>)
317000
88600 tensor(4.8617, device='cuda:0', grad_fn=<NllLossBackward0>)
88700 tensor(4.8720, device='cuda:0', grad_fn=<NllLossBackward0>)
88800 tensor(4.8572, device='cuda:0', grad_fn=<NllLossBackward0>)
318000
88900 tensor(4.7781, device='cuda:0', grad_fn=<NllLossBackward0>)
89000 tensor(4.5953, device='cuda:0', grad_fn=<NllLossBackward0>)
89100 tensor(5.1780, device='cuda:0', grad_fn=<NllLossBackward0>)
319000
89200 tensor(4.9773, device='cuda:0', grad_fn=<NllLossBackward0>)
89300 tensor(5.3703, device='cuda:0', grad_fn=<NllLossBackward0>)
320000
89400 tensor(4.7298, device='cuda:0', grad_fn=<NllLossBackward0>)
89500 tensor(5.0713, device='cuda:0', grad_fn=<NllLossBackward0>)
89600 tensor(5.3035, device='cuda:0', grad_fn=<NllLossBackward0>)
321000
89700 tensor(4.8603, device='cuda:0', grad_fn=<NllLossBackward0>)
89800 tensor(5.0780, device='cuda:0', grad_fn=<NllLossBackward0>)
89900 tensor(5.0922, device='cuda:0', grad_fn=<NllLossBackward0>)
322000
90000 tensor(5.0946, device='cuda:0', grad_fn=<NllLossBackward0>)
90100 tensor(4.9839, device='cuda:0', grad_fn=<NllLossBackward0>)
90200 tensor(5.0898, device='cuda:0', grad_fn=<NllLossBackward0>)
323000
90300 tensor(5.0876, device='cuda:0', grad_fn=<NllLossBackward0>)
90400 tensor(4.7618, device='cuda:0', grad_fn=<NllLossBackward0>)
90500 tensor(4.9550, device='cuda:0', grad_fn=<NllLossBackward0>)
324000
90600 tensor(5.2409, device='cuda:0', grad_fn=<NllLossBackward0>)
90700 tensor(4.8453, device='cuda:0', grad_fn=<NllLossBackward0>)
325000
90800 tensor(5.0906, device='cuda:0', grad_fn=<NllLossBackward0>)
90900 tensor(5.1866, device='cuda:0', grad_fn=<NllLossBackward0>)
91000 tensor(5.1850, device='cuda:0', grad_fn=<NllLossBackward0>)
326000
91100 tensor(4.7448, device='cuda:0', grad_fn=<NllLossBackward0>)
91200 tensor(5.2125, device='cuda:0', grad_fn=<NllLossBackward0>)
91300 tensor(5.3846, device='cuda:0', grad_fn=<NllLossBackward0>)
327000
91400 tensor(4.7534, device='cuda:0', grad_fn=<NllLossBackward0>)
91500 tensor(4.9951, device='cuda:0', grad_fn=<NllLossBackward0>)
91600 tensor(5.2443, device='cuda:0', grad_fn=<NllLossBackward0>)
328000
91700 tensor(5.0912, device='cuda:0', grad_fn=<NllLossBackward0>)
91800 tensor(5.2768, device='cuda:0', grad_fn=<NllLossBackward0>)
329000
91900 tensor(5.0739, device='cuda:0', grad_fn=<NllLossBackward0>)
92000 tensor(5.1095, device='cuda:0', grad_fn=<NllLossBackward0>)
92100 tensor(4.9860, device='cuda:0', grad_fn=<NllLossBackward0>)
330000
92200 tensor(4.6185, device='cuda:0', grad_fn=<NllLossBackward0>)
92300 tensor(5.1181, device='cuda:0', grad_fn=<NllLossBackward0>)
92400 tensor(5.1625, device='cuda:0', grad_fn=<NllLossBackward0>)
331000
92500 tensor(5.2200, device='cuda:0', grad_fn=<NllLossBackward0>)
92600 tensor(5.0955, device='cuda:0', grad_fn=<NllLossBackward0>)
92700 tensor(4.8214, device='cuda:0', grad_fn=<NllLossBackward0>)
332000
92800 tensor(4.8953, device='cuda:0', grad_fn=<NllLossBackward0>)
92900 tensor(5.4247, device='cuda:0', grad_fn=<NllLossBackward0>)
93000 tensor(5.1056, device='cuda:0', grad_fn=<NllLossBackward0>)
333000
93100 tensor(5.0311, device='cuda:0', grad_fn=<NllLossBackward0>)
93200 tensor(5.2195, device='cuda:0', grad_fn=<NllLossBackward0>)
334000
93300 tensor(5.1673, device='cuda:0', grad_fn=<NllLossBackward0>)
93400 tensor(5.2275, device='cuda:0', grad_fn=<NllLossBackward0>)
93500 tensor(4.9526, device='cuda:0', grad_fn=<NllLossBackward0>)
335000
93600 tensor(5.0896, device='cuda:0', grad_fn=<NllLossBackward0>)
93700 tensor(5.1565, device='cuda:0', grad_fn=<NllLossBackward0>)
93800 tensor(4.9091, device='cuda:0', grad_fn=<NllLossBackward0>)
336000
93900 tensor(5.1251, device='cuda:0', grad_fn=<NllLossBackward0>)
94000 tensor(5.0433, device='cuda:0', grad_fn=<NllLossBackward0>)
94100 tensor(4.9108, device='cuda:0', grad_fn=<NllLossBackward0>)
337000
94200 tensor(5.1725, device='cuda:0', grad_fn=<NllLossBackward0>)
94300 tensor(5.0600, device='cuda:0', grad_fn=<NllLossBackward0>)
94400 tensor(5.0994, device='cuda:0', grad_fn=<NllLossBackward0>)
338000
94500 tensor(4.9514, device='cuda:0', grad_fn=<NllLossBackward0>)
94600 tensor(5.2426, device='cuda:0', grad_fn=<NllLossBackward0>)
339000
94700 tensor(4.7794, device='cuda:0', grad_fn=<NllLossBackward0>)
94800 tensor(5.2669, device='cuda:0', grad_fn=<NllLossBackward0>)
94900 tensor(5.2703, device='cuda:0', grad_fn=<NllLossBackward0>)
340000
95000 tensor(5.3905, device='cuda:0', grad_fn=<NllLossBackward0>)
95100 tensor(5.0774, device='cuda:0', grad_fn=<NllLossBackward0>)
95200 tensor(4.9328, device='cuda:0', grad_fn=<NllLossBackward0>)
341000
95300 tensor(5.2937, device='cuda:0', grad_fn=<NllLossBackward0>)
95400 tensor(4.7542, device='cuda:0', grad_fn=<NllLossBackward0>)
95500 tensor(4.9919, device='cuda:0', grad_fn=<NllLossBackward0>)
342000
95600 tensor(4.9873, device='cuda:0', grad_fn=<NllLossBackward0>)
95700 tensor(5.1582, device='cuda:0', grad_fn=<NllLossBackward0>)
95800 tensor(5.0927, device='cuda:0', grad_fn=<NllLossBackward0>)
343000
95900 tensor(4.8471, device='cuda:0', grad_fn=<NllLossBackward0>)
96000 tensor(5.1951, device='cuda:0', grad_fn=<NllLossBackward0>)
344000
96100 tensor(4.9727, device='cuda:0', grad_fn=<NllLossBackward0>)
96200 tensor(4.9915, device='cuda:0', grad_fn=<NllLossBackward0>)
96300 tensor(5.0515, device='cuda:0', grad_fn=<NllLossBackward0>)
345000
96400 tensor(5.1081, device='cuda:0', grad_fn=<NllLossBackward0>)
96500 tensor(5.3060, device='cuda:0', grad_fn=<NllLossBackward0>)
96600 tensor(5.0907, device='cuda:0', grad_fn=<NllLossBackward0>)
346000
96700 tensor(5.2215, device='cuda:0', grad_fn=<NllLossBackward0>)
96800 tensor(5.1097, device='cuda:0', grad_fn=<NllLossBackward0>)
96900 tensor(5.2757, device='cuda:0', grad_fn=<NllLossBackward0>)
347000
97000 tensor(4.9539, device='cuda:0', grad_fn=<NllLossBackward0>)
97100 tensor(5.0553, device='cuda:0', grad_fn=<NllLossBackward0>)
97200 tensor(5.1827, device='cuda:0', grad_fn=<NllLossBackward0>)
348000
97300 tensor(5.0123, device='cuda:0', grad_fn=<NllLossBackward0>)
97400 tensor(5.0624, device='cuda:0', grad_fn=<NllLossBackward0>)
349000
97500 tensor(5.2006, device='cuda:0', grad_fn=<NllLossBackward0>)
97600 tensor(4.9950, device='cuda:0', grad_fn=<NllLossBackward0>)
97700 tensor(5.0549, device='cuda:0', grad_fn=<NllLossBackward0>)
350000
97800 tensor(5.0475, device='cuda:0', grad_fn=<NllLossBackward0>)
97900 tensor(4.8108, device='cuda:0', grad_fn=<NllLossBackward0>)
98000 tensor(4.9221, device='cuda:0', grad_fn=<NllLossBackward0>)
351000
98100 tensor(5.2524, device='cuda:0', grad_fn=<NllLossBackward0>)
98200 tensor(4.9745, device='cuda:0', grad_fn=<NllLossBackward0>)
98300 tensor(5.1369, device='cuda:0', grad_fn=<NllLossBackward0>)
352000
98400 tensor(5.2040, device='cuda:0', grad_fn=<NllLossBackward0>)
98500 tensor(5.2696, device='cuda:0', grad_fn=<NllLossBackward0>)
353000
98600 tensor(5.3089, device='cuda:0', grad_fn=<NllLossBackward0>)
98700 tensor(5.0973, device='cuda:0', grad_fn=<NllLossBackward0>)
98800 tensor(5.0995, device='cuda:0', grad_fn=<NllLossBackward0>)
354000
98900 tensor(5.1192, device='cuda:0', grad_fn=<NllLossBackward0>)
99000 tensor(5.2984, device='cuda:0', grad_fn=<NllLossBackward0>)
99100 tensor(5.1856, device='cuda:0', grad_fn=<NllLossBackward0>)
355000
99200 tensor(5.1883, device='cuda:0', grad_fn=<NllLossBackward0>)
99300 tensor(4.9773, device='cuda:0', grad_fn=<NllLossBackward0>)
99400 tensor(5.2897, device='cuda:0', grad_fn=<NllLossBackward0>)
356000
99500 tensor(5.3124, device='cuda:0', grad_fn=<NllLossBackward0>)
99600 tensor(5.0386, device='cuda:0', grad_fn=<NllLossBackward0>)
99700 tensor(4.8906, device='cuda:0', grad_fn=<NllLossBackward0>)
357000
99800 tensor(5.1151, device='cuda:0', grad_fn=<NllLossBackward0>)
99900 tensor(5.0333, device='cuda:0', grad_fn=<NllLossBackward0>)
100000 tensor(4.9526, device='cuda:0', grad_fn=<NllLossBackward0>)
358000
100100 tensor(5.4919, device='cuda:0', grad_fn=<NllLossBackward0>)
100200 tensor(4.7812, device='cuda:0', grad_fn=<NllLossBackward0>)
359000
100300 tensor(5.0547, device='cuda:0', grad_fn=<NllLossBackward0>)
100400 tensor(5.0233, device='cuda:0', grad_fn=<NllLossBackward0>)
100500 tensor(4.8942, device='cuda:0', grad_fn=<NllLossBackward0>)
360000
100600 tensor(4.9477, device='cuda:0', grad_fn=<NllLossBackward0>)
100700 tensor(4.9277, device='cuda:0', grad_fn=<NllLossBackward0>)
100800 tensor(5.2499, device='cuda:0', grad_fn=<NllLossBackward0>)
361000
100900 tensor(5.2162, device='cuda:0', grad_fn=<NllLossBackward0>)
101000 tensor(5.0550, device='cuda:0', grad_fn=<NllLossBackward0>)
101100 tensor(5.0957, device='cuda:0', grad_fn=<NllLossBackward0>)
362000
101200 tensor(5.0617, device='cuda:0', grad_fn=<NllLossBackward0>)
101300 tensor(5.3101, device='cuda:0', grad_fn=<NllLossBackward0>)
363000
101400 tensor(4.8387, device='cuda:0', grad_fn=<NllLossBackward0>)
101500 tensor(5.0914, device='cuda:0', grad_fn=<NllLossBackward0>)
101600 tensor(5.3198, device='cuda:0', grad_fn=<NllLossBackward0>)
364000
101700 tensor(4.9333, device='cuda:0', grad_fn=<NllLossBackward0>)
101800 tensor(4.7367, device='cuda:0', grad_fn=<NllLossBackward0>)
101900 tensor(4.9608, device='cuda:0', grad_fn=<NllLossBackward0>)
365000
102000 tensor(5.2333, device='cuda:0', grad_fn=<NllLossBackward0>)
102100 tensor(5.1444, device='cuda:0', grad_fn=<NllLossBackward0>)
102200 tensor(5.4328, device='cuda:0', grad_fn=<NllLossBackward0>)
366000
102300 tensor(5.2760, device='cuda:0', grad_fn=<NllLossBackward0>)
102400 tensor(5.2229, device='cuda:0', grad_fn=<NllLossBackward0>)
102500 tensor(5.2043, device='cuda:0', grad_fn=<NllLossBackward0>)
367000
102600 tensor(5.1444, device='cuda:0', grad_fn=<NllLossBackward0>)
102700 tensor(5.2240, device='cuda:0', grad_fn=<NllLossBackward0>)
368000
102800 tensor(5.1234, device='cuda:0', grad_fn=<NllLossBackward0>)
102900 tensor(4.8963, device='cuda:0', grad_fn=<NllLossBackward0>)
103000 tensor(5.0684, device='cuda:0', grad_fn=<NllLossBackward0>)
369000
103100 tensor(5.3707, device='cuda:0', grad_fn=<NllLossBackward0>)
103200 tensor(4.9859, device='cuda:0', grad_fn=<NllLossBackward0>)
103300 tensor(5.0649, device='cuda:0', grad_fn=<NllLossBackward0>)
370000
103400 tensor(5.0871, device='cuda:0', grad_fn=<NllLossBackward0>)
103500 tensor(4.9837, device='cuda:0', grad_fn=<NllLossBackward0>)
103600 tensor(5.1120, device='cuda:0', grad_fn=<NllLossBackward0>)
371000
103700 tensor(5.2086, device='cuda:0', grad_fn=<NllLossBackward0>)
103800 tensor(5.0741, device='cuda:0', grad_fn=<NllLossBackward0>)
103900 tensor(4.9891, device='cuda:0', grad_fn=<NllLossBackward0>)
372000
104000 tensor(4.9637, device='cuda:0', grad_fn=<NllLossBackward0>)
104100 tensor(5.0033, device='cuda:0', grad_fn=<NllLossBackward0>)
373000
104200 tensor(5.1105, device='cuda:0', grad_fn=<NllLossBackward0>)
104300 tensor(4.9057, device='cuda:0', grad_fn=<NllLossBackward0>)
104400 tensor(5.2394, device='cuda:0', grad_fn=<NllLossBackward0>)
374000
104500 tensor(5.1488, device='cuda:0', grad_fn=<NllLossBackward0>)
104600 tensor(5.0034, device='cuda:0', grad_fn=<NllLossBackward0>)
104700 tensor(5.0169, device='cuda:0', grad_fn=<NllLossBackward0>)
375000
104800 tensor(5.5932, device='cuda:0', grad_fn=<NllLossBackward0>)
104900 tensor(5.0594, device='cuda:0', grad_fn=<NllLossBackward0>)
105000 tensor(5.1818, device='cuda:0', grad_fn=<NllLossBackward0>)
376000
105100 tensor(5.1265, device='cuda:0', grad_fn=<NllLossBackward0>)
105200 tensor(5.2194, device='cuda:0', grad_fn=<NllLossBackward0>)
105300 tensor(5.1688, device='cuda:0', grad_fn=<NllLossBackward0>)
377000
105400 tensor(5.3924, device='cuda:0', grad_fn=<NllLossBackward0>)
105500 tensor(5.2369, device='cuda:0', grad_fn=<NllLossBackward0>)
378000
105600 tensor(5.0005, device='cuda:0', grad_fn=<NllLossBackward0>)
105700 tensor(4.9490, device='cuda:0', grad_fn=<NllLossBackward0>)
105800 tensor(4.9947, device='cuda:0', grad_fn=<NllLossBackward0>)
379000
105900 tensor(5.1400, device='cuda:0', grad_fn=<NllLossBackward0>)
106000 tensor(5.1712, device='cuda:0', grad_fn=<NllLossBackward0>)
106100 tensor(5.3889, device='cuda:0', grad_fn=<NllLossBackward0>)
380000
106200 tensor(5.0640, device='cuda:0', grad_fn=<NllLossBackward0>)
106300 tensor(5.1624, device='cuda:0', grad_fn=<NllLossBackward0>)
106400 tensor(4.9938, device='cuda:0', grad_fn=<NllLossBackward0>)
381000
106500 tensor(5.2659, device='cuda:0', grad_fn=<NllLossBackward0>)
106600 tensor(4.9500, device='cuda:0', grad_fn=<NllLossBackward0>)
106700 tensor(5.2001, device='cuda:0', grad_fn=<NllLossBackward0>)
382000
106800 tensor(4.9776, device='cuda:0', grad_fn=<NllLossBackward0>)
106900 tensor(5.2684, device='cuda:0', grad_fn=<NllLossBackward0>)
383000
107000 tensor(4.9327, device='cuda:0', grad_fn=<NllLossBackward0>)
107100 tensor(5.1739, device='cuda:0', grad_fn=<NllLossBackward0>)
107200 tensor(5.1641, device='cuda:0', grad_fn=<NllLossBackward0>)
384000
107300 tensor(4.9207, device='cuda:0', grad_fn=<NllLossBackward0>)
107400 tensor(5.1112, device='cuda:0', grad_fn=<NllLossBackward0>)
107500 tensor(4.8839, device='cuda:0', grad_fn=<NllLossBackward0>)
385000
107600 tensor(4.9063, device='cuda:0', grad_fn=<NllLossBackward0>)
107700 tensor(5.1218, device='cuda:0', grad_fn=<NllLossBackward0>)
107800 tensor(5.1329, device='cuda:0', grad_fn=<NllLossBackward0>)
386000
107900 tensor(5.3598, device='cuda:0', grad_fn=<NllLossBackward0>)
108000 tensor(5.1047, device='cuda:0', grad_fn=<NllLossBackward0>)
108100 tensor(5.0709, device='cuda:0', grad_fn=<NllLossBackward0>)
387000
108200 tensor(5.0716, device='cuda:0', grad_fn=<NllLossBackward0>)
108300 tensor(5.0979, device='cuda:0', grad_fn=<NllLossBackward0>)
388000
108400 tensor(5.2054, device='cuda:0', grad_fn=<NllLossBackward0>)
108500 tensor(4.8856, device='cuda:0', grad_fn=<NllLossBackward0>)
108600 tensor(5.0763, device='cuda:0', grad_fn=<NllLossBackward0>)
389000
108700 tensor(4.6029, device='cuda:0', grad_fn=<NllLossBackward0>)
108800 tensor(4.9980, device='cuda:0', grad_fn=<NllLossBackward0>)
108900 tensor(4.7433, device='cuda:0', grad_fn=<NllLossBackward0>)
390000
109000 tensor(5.1747, device='cuda:0', grad_fn=<NllLossBackward0>)
109100 tensor(4.7590, device='cuda:0', grad_fn=<NllLossBackward0>)
109200 tensor(4.9555, device='cuda:0', grad_fn=<NllLossBackward0>)
391000
109300 tensor(4.6271, device='cuda:0', grad_fn=<NllLossBackward0>)
109400 tensor(4.9270, device='cuda:0', grad_fn=<NllLossBackward0>)
109500 tensor(4.8670, device='cuda:0', grad_fn=<NllLossBackward0>)
392000
109600 tensor(5.0673, device='cuda:0', grad_fn=<NllLossBackward0>)
109700 tensor(5.4325, device='cuda:0', grad_fn=<NllLossBackward0>)
393000
109800 tensor(4.7563, device='cuda:0', grad_fn=<NllLossBackward0>)
109900 tensor(4.9688, device='cuda:0', grad_fn=<NllLossBackward0>)
110000 tensor(5.0599, device='cuda:0', grad_fn=<NllLossBackward0>)
394000
110100 tensor(4.9771, device='cuda:0', grad_fn=<NllLossBackward0>)
110200 tensor(5.3440, device='cuda:0', grad_fn=<NllLossBackward0>)
110300 tensor(5.2067, device='cuda:0', grad_fn=<NllLossBackward0>)
395000
110400 tensor(4.6912, device='cuda:0', grad_fn=<NllLossBackward0>)
110500 tensor(5.1135, device='cuda:0', grad_fn=<NllLossBackward0>)
110600 tensor(4.9512, device='cuda:0', grad_fn=<NllLossBackward0>)
396000
110700 tensor(4.9049, device='cuda:0', grad_fn=<NllLossBackward0>)
110800 tensor(5.3380, device='cuda:0', grad_fn=<NllLossBackward0>)
110900 tensor(4.9982, device='cuda:0', grad_fn=<NllLossBackward0>)
397000
111000 tensor(5.1906, device='cuda:0', grad_fn=<NllLossBackward0>)
111100 tensor(4.9450, device='cuda:0', grad_fn=<NllLossBackward0>)
398000
111200 tensor(4.9843, device='cuda:0', grad_fn=<NllLossBackward0>)
111300 tensor(5.0615, device='cuda:0', grad_fn=<NllLossBackward0>)
111400 tensor(5.1630, device='cuda:0', grad_fn=<NllLossBackward0>)
399000
111500 tensor(5.2631, device='cuda:0', grad_fn=<NllLossBackward0>)
111600 tensor(5.2726, device='cuda:0', grad_fn=<NllLossBackward0>)
111700 tensor(4.8465, device='cuda:0', grad_fn=<NllLossBackward0>)
400000
111800 tensor(5.0638, device='cuda:0', grad_fn=<NllLossBackward0>)
111900 tensor(5.1322, device='cuda:0', grad_fn=<NllLossBackward0>)
112000 tensor(5.0956, device='cuda:0', grad_fn=<NllLossBackward0>)
401000
112100 tensor(4.6850, device='cuda:0', grad_fn=<NllLossBackward0>)
112200 tensor(5.0941, device='cuda:0', grad_fn=<NllLossBackward0>)
402000
112300 tensor(5.2451, device='cuda:0', grad_fn=<NllLossBackward0>)
112400 tensor(5.2617, device='cuda:0', grad_fn=<NllLossBackward0>)
112500 tensor(4.8904, device='cuda:0', grad_fn=<NllLossBackward0>)
403000
112600 tensor(4.9779, device='cuda:0', grad_fn=<NllLossBackward0>)
112700 tensor(4.9773, device='cuda:0', grad_fn=<NllLossBackward0>)
112800 tensor(4.6058, device='cuda:0', grad_fn=<NllLossBackward0>)
404000
112900 tensor(4.8841, device='cuda:0', grad_fn=<NllLossBackward0>)
113000 tensor(5.1203, device='cuda:0', grad_fn=<NllLossBackward0>)
113100 tensor(5.0650, device='cuda:0', grad_fn=<NllLossBackward0>)
405000
113200 tensor(4.6302, device='cuda:0', grad_fn=<NllLossBackward0>)
113300 tensor(4.9626, device='cuda:0', grad_fn=<NllLossBackward0>)
113400 tensor(5.0637, device='cuda:0', grad_fn=<NllLossBackward0>)
406000
113500 tensor(5.3193, device='cuda:0', grad_fn=<NllLossBackward0>)
113600 tensor(5.2732, device='cuda:0', grad_fn=<NllLossBackward0>)
407000
113700 tensor(4.8750, device='cuda:0', grad_fn=<NllLossBackward0>)
113800 tensor(5.2875, device='cuda:0', grad_fn=<NllLossBackward0>)
113900 tensor(5.0677, device='cuda:0', grad_fn=<NllLossBackward0>)
408000
114000 tensor(4.7401, device='cuda:0', grad_fn=<NllLossBackward0>)
114100 tensor(4.8493, device='cuda:0', grad_fn=<NllLossBackward0>)
114200 tensor(4.8203, device='cuda:0', grad_fn=<NllLossBackward0>)
409000
114300 tensor(4.6964, device='cuda:0', grad_fn=<NllLossBackward0>)
114400 tensor(4.9045, device='cuda:0', grad_fn=<NllLossBackward0>)
114500 tensor(4.9766, device='cuda:0', grad_fn=<NllLossBackward0>)
410000
114600 tensor(5.2010, device='cuda:0', grad_fn=<NllLossBackward0>)
114700 tensor(5.1521, device='cuda:0', grad_fn=<NllLossBackward0>)
114800 tensor(5.0945, device='cuda:0', grad_fn=<NllLossBackward0>)
411000
114900 tensor(5.0311, device='cuda:0', grad_fn=<NllLossBackward0>)
115000 tensor(5.0057, device='cuda:0', grad_fn=<NllLossBackward0>)
412000
115100 tensor(5.1129, device='cuda:0', grad_fn=<NllLossBackward0>)
115200 tensor(5.0684, device='cuda:0', grad_fn=<NllLossBackward0>)
115300 tensor(5.1183, device='cuda:0', grad_fn=<NllLossBackward0>)
413000
115400 tensor(5.3141, device='cuda:0', grad_fn=<NllLossBackward0>)
115500 tensor(5.3232, device='cuda:0', grad_fn=<NllLossBackward0>)
115600 tensor(5.1170, device='cuda:0', grad_fn=<NllLossBackward0>)
414000
115700 tensor(5.0023, device='cuda:0', grad_fn=<NllLossBackward0>)
115800 tensor(5.1229, device='cuda:0', grad_fn=<NllLossBackward0>)
115900 tensor(5.1923, device='cuda:0', grad_fn=<NllLossBackward0>)
415000
116000 tensor(5.1286, device='cuda:0', grad_fn=<NllLossBackward0>)
116100 tensor(4.8629, device='cuda:0', grad_fn=<NllLossBackward0>)
116200 tensor(5.2643, device='cuda:0', grad_fn=<NllLossBackward0>)
416000
116300 tensor(4.4037, device='cuda:0', grad_fn=<NllLossBackward0>)
116400 tensor(5.1548, device='cuda:0', grad_fn=<NllLossBackward0>)
417000
116500 tensor(5.0993, device='cuda:0', grad_fn=<NllLossBackward0>)
116600 tensor(5.1907, device='cuda:0', grad_fn=<NllLossBackward0>)
116700 tensor(5.0096, device='cuda:0', grad_fn=<NllLossBackward0>)
418000
116800 tensor(4.8711, device='cuda:0', grad_fn=<NllLossBackward0>)
116900 tensor(5.1060, device='cuda:0', grad_fn=<NllLossBackward0>)
117000 tensor(5.0866, device='cuda:0', grad_fn=<NllLossBackward0>)
419000
117100 tensor(5.1294, device='cuda:0', grad_fn=<NllLossBackward0>)
117200 tensor(4.9697, device='cuda:0', grad_fn=<NllLossBackward0>)
117300 tensor(5.3102, device='cuda:0', grad_fn=<NllLossBackward0>)
420000
117400 tensor(5.3132, device='cuda:0', grad_fn=<NllLossBackward0>)
117500 tensor(4.8965, device='cuda:0', grad_fn=<NllLossBackward0>)
421000
117600 tensor(5.2093, device='cuda:0', grad_fn=<NllLossBackward0>)
117700 tensor(5.1284, device='cuda:0', grad_fn=<NllLossBackward0>)
117800 tensor(4.9976, device='cuda:0', grad_fn=<NllLossBackward0>)
422000
117900 tensor(5.1066, device='cuda:0', grad_fn=<NllLossBackward0>)
118000 tensor(5.1549, device='cuda:0', grad_fn=<NllLossBackward0>)
118100 tensor(4.9796, device='cuda:0', grad_fn=<NllLossBackward0>)
423000
118200 tensor(5.0112, device='cuda:0', grad_fn=<NllLossBackward0>)
118300 tensor(5.0054, device='cuda:0', grad_fn=<NllLossBackward0>)
118400 tensor(5.1148, device='cuda:0', grad_fn=<NllLossBackward0>)
424000
118500 tensor(5.0216, device='cuda:0', grad_fn=<NllLossBackward0>)
118600 tensor(5.1426, device='cuda:0', grad_fn=<NllLossBackward0>)
118700 tensor(5.1632, device='cuda:0', grad_fn=<NllLossBackward0>)
425000
118800 tensor(5.1688, device='cuda:0', grad_fn=<NllLossBackward0>)
118900 tensor(5.2083, device='cuda:0', grad_fn=<NllLossBackward0>)
426000
119000 tensor(4.7053, device='cuda:0', grad_fn=<NllLossBackward0>)
119100 tensor(4.9990, device='cuda:0', grad_fn=<NllLossBackward0>)
119200 tensor(5.3487, device='cuda:0', grad_fn=<NllLossBackward0>)
427000
119300 tensor(4.9779, device='cuda:0', grad_fn=<NllLossBackward0>)
119400 tensor(5.0351, device='cuda:0', grad_fn=<NllLossBackward0>)
119500 tensor(5.0373, device='cuda:0', grad_fn=<NllLossBackward0>)
428000
119600 tensor(5.3674, device='cuda:0', grad_fn=<NllLossBackward0>)
119700 tensor(4.9741, device='cuda:0', grad_fn=<NllLossBackward0>)
119800 tensor(5.0577, device='cuda:0', grad_fn=<NllLossBackward0>)
429000
119900 tensor(4.8425, device='cuda:0', grad_fn=<NllLossBackward0>)
120000 tensor(4.9597, device='cuda:0', grad_fn=<NllLossBackward0>)
120100 tensor(4.9721, device='cuda:0', grad_fn=<NllLossBackward0>)
430000
120200 tensor(5.1214, device='cuda:0', grad_fn=<NllLossBackward0>)
120300 tensor(5.0225, device='cuda:0', grad_fn=<NllLossBackward0>)
431000
120400 tensor(5.3908, device='cuda:0', grad_fn=<NllLossBackward0>)
120500 tensor(4.9955, device='cuda:0', grad_fn=<NllLossBackward0>)
120600 tensor(5.2588, device='cuda:0', grad_fn=<NllLossBackward0>)
432000
epoch: = 1
0 tensor(5.2093, device='cuda:0', grad_fn=<NllLossBackward0>)
100 tensor(5.3814, device='cuda:0', grad_fn=<NllLossBackward0>)
200 tensor(5.1460, device='cuda:0', grad_fn=<NllLossBackward0>)
1000
300 tensor(5.3199, device='cuda:0', grad_fn=<NllLossBackward0>)
400 tensor(5.1499, device='cuda:0', grad_fn=<NllLossBackward0>)
500 tensor(5.0303, device='cuda:0', grad_fn=<NllLossBackward0>)
2000
600 tensor(5.0671, device='cuda:0', grad_fn=<NllLossBackward0>)
700 tensor(5.0000, device='cuda:0', grad_fn=<NllLossBackward0>)
800 tensor(4.9863, device='cuda:0', grad_fn=<NllLossBackward0>)
3000
900 tensor(5.0532, device='cuda:0', grad_fn=<NllLossBackward0>)
1000 tensor(5.0634, device='cuda:0', grad_fn=<NllLossBackward0>)
1100 tensor(4.8666, device='cuda:0', grad_fn=<NllLossBackward0>)
4000
1200 tensor(4.8362, device='cuda:0', grad_fn=<NllLossBackward0>)
1300 tensor(5.1104, device='cuda:0', grad_fn=<NllLossBackward0>)
1400 tensor(5.2784, device='cuda:0', grad_fn=<NllLossBackward0>)
5000
1500 tensor(5.3135, device='cuda:0', grad_fn=<NllLossBackward0>)
1600 tensor(5.1400, device='cuda:0', grad_fn=<NllLossBackward0>)
6000
1700 tensor(5.2427, device='cuda:0', grad_fn=<NllLossBackward0>)
1800 tensor(5.2361, device='cuda:0', grad_fn=<NllLossBackward0>)
1900 tensor(5.0860, device='cuda:0', grad_fn=<NllLossBackward0>)
7000
2000 tensor(4.7754, device='cuda:0', grad_fn=<NllLossBackward0>)
2100 tensor(5.2257, device='cuda:0', grad_fn=<NllLossBackward0>)
2200 tensor(4.9228, device='cuda:0', grad_fn=<NllLossBackward0>)
8000
2300 tensor(5.2087, device='cuda:0', grad_fn=<NllLossBackward0>)
2400 tensor(4.6009, device='cuda:0', grad_fn=<NllLossBackward0>)
2500 tensor(5.0485, device='cuda:0', grad_fn=<NllLossBackward0>)
9000
2600 tensor(4.8077, device='cuda:0', grad_fn=<NllLossBackward0>)
2700 tensor(5.1105, device='cuda:0', grad_fn=<NllLossBackward0>)
2800 tensor(4.6892, device='cuda:0', grad_fn=<NllLossBackward0>)
10000
2900 tensor(4.9577, device='cuda:0', grad_fn=<NllLossBackward0>)
3000 tensor(5.0162, device='cuda:0', grad_fn=<NllLossBackward0>)
11000
3100 tensor(4.9243, device='cuda:0', grad_fn=<NllLossBackward0>)
3200 tensor(5.0965, device='cuda:0', grad_fn=<NllLossBackward0>)
3300 tensor(4.8072, device='cuda:0', grad_fn=<NllLossBackward0>)
12000
3400 tensor(5.0026, device='cuda:0', grad_fn=<NllLossBackward0>)
3500 tensor(4.8778, device='cuda:0', grad_fn=<NllLossBackward0>)
3600 tensor(4.7804, device='cuda:0', grad_fn=<NllLossBackward0>)
13000
3700 tensor(5.0536, device='cuda:0', grad_fn=<NllLossBackward0>)
3800 tensor(5.0348, device='cuda:0', grad_fn=<NllLossBackward0>)
3900 tensor(5.0932, device='cuda:0', grad_fn=<NllLossBackward0>)
14000
4000 tensor(4.8247, device='cuda:0', grad_fn=<NllLossBackward0>)
4100 tensor(5.2288, device='cuda:0', grad_fn=<NllLossBackward0>)
4200 tensor(4.7091, device='cuda:0', grad_fn=<NllLossBackward0>)
15000
4300 tensor(5.1466, device='cuda:0', grad_fn=<NllLossBackward0>)
4400 tensor(4.8608, device='cuda:0', grad_fn=<NllLossBackward0>)
16000
4500 tensor(5.0831, device='cuda:0', grad_fn=<NllLossBackward0>)
4600 tensor(5.1534, device='cuda:0', grad_fn=<NllLossBackward0>)
4700 tensor(5.1433, device='cuda:0', grad_fn=<NllLossBackward0>)
17000
4800 tensor(5.0023, device='cuda:0', grad_fn=<NllLossBackward0>)
4900 tensor(5.1541, device='cuda:0', grad_fn=<NllLossBackward0>)
5000 tensor(5.2215, device='cuda:0', grad_fn=<NllLossBackward0>)
18000
5100 tensor(5.0103, device='cuda:0', grad_fn=<NllLossBackward0>)
5200 tensor(5.2232, device='cuda:0', grad_fn=<NllLossBackward0>)
5300 tensor(5.0542, device='cuda:0', grad_fn=<NllLossBackward0>)
19000
5400 tensor(4.9754, device='cuda:0', grad_fn=<NllLossBackward0>)
5500 tensor(5.1588, device='cuda:0', grad_fn=<NllLossBackward0>)
20000
5600 tensor(4.9750, device='cuda:0', grad_fn=<NllLossBackward0>)
5700 tensor(5.0814, device='cuda:0', grad_fn=<NllLossBackward0>)
5800 tensor(5.4650, device='cuda:0', grad_fn=<NllLossBackward0>)
21000
5900 tensor(5.2366, device='cuda:0', grad_fn=<NllLossBackward0>)
6000 tensor(5.2223, device='cuda:0', grad_fn=<NllLossBackward0>)
6100 tensor(4.9001, device='cuda:0', grad_fn=<NllLossBackward0>)
22000
6200 tensor(5.3510, device='cuda:0', grad_fn=<NllLossBackward0>)
6300 tensor(5.2366, device='cuda:0', grad_fn=<NllLossBackward0>)
6400 tensor(5.0379, device='cuda:0', grad_fn=<NllLossBackward0>)
23000
6500 tensor(5.0265, device='cuda:0', grad_fn=<NllLossBackward0>)
6600 tensor(5.5000, device='cuda:0', grad_fn=<NllLossBackward0>)
6700 tensor(4.8587, device='cuda:0', grad_fn=<NllLossBackward0>)
24000
6800 tensor(4.8712, device='cuda:0', grad_fn=<NllLossBackward0>)
6900 tensor(5.1405, device='cuda:0', grad_fn=<NllLossBackward0>)
25000
7000 tensor(5.0429, device='cuda:0', grad_fn=<NllLossBackward0>)
7100 tensor(5.1420, device='cuda:0', grad_fn=<NllLossBackward0>)
7200 tensor(5.2794, device='cuda:0', grad_fn=<NllLossBackward0>)
26000
7300 tensor(5.2271, device='cuda:0', grad_fn=<NllLossBackward0>)
7400 tensor(4.9754, device='cuda:0', grad_fn=<NllLossBackward0>)
7500 tensor(5.3659, device='cuda:0', grad_fn=<NllLossBackward0>)
27000
7600 tensor(4.7902, device='cuda:0', grad_fn=<NllLossBackward0>)
7700 tensor(5.2327, device='cuda:0', grad_fn=<NllLossBackward0>)
7800 tensor(5.5528, device='cuda:0', grad_fn=<NllLossBackward0>)
28000
7900 tensor(5.0152, device='cuda:0', grad_fn=<NllLossBackward0>)
8000 tensor(5.1026, device='cuda:0', grad_fn=<NllLossBackward0>)
8100 tensor(4.6152, device='cuda:0', grad_fn=<NllLossBackward0>)
29000
8200 tensor(5.0953, device='cuda:0', grad_fn=<NllLossBackward0>)
8300 tensor(5.0372, device='cuda:0', grad_fn=<NllLossBackward0>)
30000
8400 tensor(5.0211, device='cuda:0', grad_fn=<NllLossBackward0>)
8500 tensor(4.7475, device='cuda:0', grad_fn=<NllLossBackward0>)
8600 tensor(4.8443, device='cuda:0', grad_fn=<NllLossBackward0>)
31000
8700 tensor(5.1471, device='cuda:0', grad_fn=<NllLossBackward0>)
8800 tensor(5.1762, device='cuda:0', grad_fn=<NllLossBackward0>)
8900 tensor(5.4800, device='cuda:0', grad_fn=<NllLossBackward0>)
32000
9000 tensor(4.9010, device='cuda:0', grad_fn=<NllLossBackward0>)
9100 tensor(5.1490, device='cuda:0', grad_fn=<NllLossBackward0>)
9200 tensor(5.2119, device='cuda:0', grad_fn=<NllLossBackward0>)
33000
9300 tensor(5.0652, device='cuda:0', grad_fn=<NllLossBackward0>)
9400 tensor(5.0831, device='cuda:0', grad_fn=<NllLossBackward0>)
34000
9500 tensor(4.9114, device='cuda:0', grad_fn=<NllLossBackward0>)
9600 tensor(4.9139, device='cuda:0', grad_fn=<NllLossBackward0>)
9700 tensor(5.0307, device='cuda:0', grad_fn=<NllLossBackward0>)
35000
9800 tensor(5.1592, device='cuda:0', grad_fn=<NllLossBackward0>)
9900 tensor(4.9091, device='cuda:0', grad_fn=<NllLossBackward0>)
10000 tensor(5.2556, device='cuda:0', grad_fn=<NllLossBackward0>)
36000
10100 tensor(5.2265, device='cuda:0', grad_fn=<NllLossBackward0>)
10200 tensor(5.0764, device='cuda:0', grad_fn=<NllLossBackward0>)
10300 tensor(5.2197, device='cuda:0', grad_fn=<NllLossBackward0>)
37000
10400 tensor(4.9872, device='cuda:0', grad_fn=<NllLossBackward0>)
10500 tensor(5.0030, device='cuda:0', grad_fn=<NllLossBackward0>)
10600 tensor(4.7093, device='cuda:0', grad_fn=<NllLossBackward0>)
38000
10700 tensor(5.1943, device='cuda:0', grad_fn=<NllLossBackward0>)
10800 tensor(4.6028, device='cuda:0', grad_fn=<NllLossBackward0>)
39000
10900 tensor(5.1688, device='cuda:0', grad_fn=<NllLossBackward0>)
11000 tensor(4.9071, device='cuda:0', grad_fn=<NllLossBackward0>)
11100 tensor(4.7852, device='cuda:0', grad_fn=<NllLossBackward0>)
40000
11200 tensor(5.1724, device='cuda:0', grad_fn=<NllLossBackward0>)
11300 tensor(5.1048, device='cuda:0', grad_fn=<NllLossBackward0>)
11400 tensor(5.3444, device='cuda:0', grad_fn=<NllLossBackward0>)
41000
11500 tensor(5.2196, device='cuda:0', grad_fn=<NllLossBackward0>)
11600 tensor(4.8939, device='cuda:0', grad_fn=<NllLossBackward0>)
11700 tensor(5.2069, device='cuda:0', grad_fn=<NllLossBackward0>)
42000
11800 tensor(5.3687, device='cuda:0', grad_fn=<NllLossBackward0>)
11900 tensor(4.9033, device='cuda:0', grad_fn=<NllLossBackward0>)
12000 tensor(5.1747, device='cuda:0', grad_fn=<NllLossBackward0>)
43000
12100 tensor(5.0138, device='cuda:0', grad_fn=<NllLossBackward0>)
12200 tensor(5.0538, device='cuda:0', grad_fn=<NllLossBackward0>)
44000
12300 tensor(5.2248, device='cuda:0', grad_fn=<NllLossBackward0>)
12400 tensor(4.8343, device='cuda:0', grad_fn=<NllLossBackward0>)
12500 tensor(5.0820, device='cuda:0', grad_fn=<NllLossBackward0>)
45000
12600 tensor(5.0378, device='cuda:0', grad_fn=<NllLossBackward0>)
12700 tensor(4.7855, device='cuda:0', grad_fn=<NllLossBackward0>)
12800 tensor(5.1147, device='cuda:0', grad_fn=<NllLossBackward0>)
46000
12900 tensor(5.2598, device='cuda:0', grad_fn=<NllLossBackward0>)
13000 tensor(4.8743, device='cuda:0', grad_fn=<NllLossBackward0>)
13100 tensor(5.1137, device='cuda:0', grad_fn=<NllLossBackward0>)
47000
13200 tensor(5.2270, device='cuda:0', grad_fn=<NllLossBackward0>)
13300 tensor(5.3398, device='cuda:0', grad_fn=<NllLossBackward0>)
13400 tensor(4.7988, device='cuda:0', grad_fn=<NllLossBackward0>)
48000
13500 tensor(5.1067, device='cuda:0', grad_fn=<NllLossBackward0>)
13600 tensor(5.0730, device='cuda:0', grad_fn=<NllLossBackward0>)
49000
13700 tensor(5.1172, device='cuda:0', grad_fn=<NllLossBackward0>)
13800 tensor(4.9974, device='cuda:0', grad_fn=<NllLossBackward0>)
13900 tensor(5.0809, device='cuda:0', grad_fn=<NllLossBackward0>)
50000
14000 tensor(5.0515, device='cuda:0', grad_fn=<NllLossBackward0>)
14100 tensor(5.2082, device='cuda:0', grad_fn=<NllLossBackward0>)
14200 tensor(5.2196, device='cuda:0', grad_fn=<NllLossBackward0>)
51000
14300 tensor(5.1963, device='cuda:0', grad_fn=<NllLossBackward0>)
14400 tensor(4.6420, device='cuda:0', grad_fn=<NllLossBackward0>)
14500 tensor(4.7768, device='cuda:0', grad_fn=<NllLossBackward0>)
52000
14600 tensor(4.8853, device='cuda:0', grad_fn=<NllLossBackward0>)
14700 tensor(5.2161, device='cuda:0', grad_fn=<NllLossBackward0>)
14800 tensor(4.8348, device='cuda:0', grad_fn=<NllLossBackward0>)
53000
14900 tensor(4.7723, device='cuda:0', grad_fn=<NllLossBackward0>)
15000 tensor(5.1951, device='cuda:0', grad_fn=<NllLossBackward0>)
54000
15100 tensor(4.5943, device='cuda:0', grad_fn=<NllLossBackward0>)
15200 tensor(4.8785, device='cuda:0', grad_fn=<NllLossBackward0>)
15300 tensor(5.2401, device='cuda:0', grad_fn=<NllLossBackward0>)
55000
15400 tensor(5.1345, device='cuda:0', grad_fn=<NllLossBackward0>)
15500 tensor(4.9845, device='cuda:0', grad_fn=<NllLossBackward0>)
15600 tensor(5.0955, device='cuda:0', grad_fn=<NllLossBackward0>)
56000
15700 tensor(5.0750, device='cuda:0', grad_fn=<NllLossBackward0>)
15800 tensor(4.8953, device='cuda:0', grad_fn=<NllLossBackward0>)
15900 tensor(5.0173, device='cuda:0', grad_fn=<NllLossBackward0>)
57000
16000 tensor(5.3809, device='cuda:0', grad_fn=<NllLossBackward0>)
16100 tensor(5.2108, device='cuda:0', grad_fn=<NllLossBackward0>)
16200 tensor(4.9123, device='cuda:0', grad_fn=<NllLossBackward0>)
58000
16300 tensor(4.8491, device='cuda:0', grad_fn=<NllLossBackward0>)
16400 tensor(5.1661, device='cuda:0', grad_fn=<NllLossBackward0>)
59000
16500 tensor(5.0995, device='cuda:0', grad_fn=<NllLossBackward0>)
16600 tensor(5.1714, device='cuda:0', grad_fn=<NllLossBackward0>)
16700 tensor(5.0571, device='cuda:0', grad_fn=<NllLossBackward0>)
60000
16800 tensor(5.1657, device='cuda:0', grad_fn=<NllLossBackward0>)
16900 tensor(5.1391, device='cuda:0', grad_fn=<NllLossBackward0>)
17000 tensor(5.1067, device='cuda:0', grad_fn=<NllLossBackward0>)
61000
17100 tensor(5.1680, device='cuda:0', grad_fn=<NllLossBackward0>)
17200 tensor(5.2227, device='cuda:0', grad_fn=<NllLossBackward0>)
17300 tensor(4.6891, device='cuda:0', grad_fn=<NllLossBackward0>)
62000
17400 tensor(5.0639, device='cuda:0', grad_fn=<NllLossBackward0>)
17500 tensor(5.0507, device='cuda:0', grad_fn=<NllLossBackward0>)
17600 tensor(4.9053, device='cuda:0', grad_fn=<NllLossBackward0>)
63000
17700 tensor(4.8750, device='cuda:0', grad_fn=<NllLossBackward0>)
17800 tensor(4.7771, device='cuda:0', grad_fn=<NllLossBackward0>)
64000
17900 tensor(4.9931, device='cuda:0', grad_fn=<NllLossBackward0>)
18000 tensor(5.1721, device='cuda:0', grad_fn=<NllLossBackward0>)
18100 tensor(5.0419, device='cuda:0', grad_fn=<NllLossBackward0>)
65000
18200 tensor(5.0401, device='cuda:0', grad_fn=<NllLossBackward0>)
18300 tensor(4.9477, device='cuda:0', grad_fn=<NllLossBackward0>)
18400 tensor(4.9341, device='cuda:0', grad_fn=<NllLossBackward0>)
66000
18500 tensor(5.0645, device='cuda:0', grad_fn=<NllLossBackward0>)
18600 tensor(5.1702, device='cuda:0', grad_fn=<NllLossBackward0>)
18700 tensor(5.0680, device='cuda:0', grad_fn=<NllLossBackward0>)
67000
18800 tensor(4.7324, device='cuda:0', grad_fn=<NllLossBackward0>)
18900 tensor(5.1745, device='cuda:0', grad_fn=<NllLossBackward0>)
68000
19000 tensor(5.0148, device='cuda:0', grad_fn=<NllLossBackward0>)
19100 tensor(4.9552, device='cuda:0', grad_fn=<NllLossBackward0>)
19200 tensor(5.0249, device='cuda:0', grad_fn=<NllLossBackward0>)
69000
19300 tensor(5.0709, device='cuda:0', grad_fn=<NllLossBackward0>)
19400 tensor(5.1439, device='cuda:0', grad_fn=<NllLossBackward0>)
19500 tensor(4.9734, device='cuda:0', grad_fn=<NllLossBackward0>)
70000
19600 tensor(5.2309, device='cuda:0', grad_fn=<NllLossBackward0>)
19700 tensor(5.0292, device='cuda:0', grad_fn=<NllLossBackward0>)
19800 tensor(4.4443, device='cuda:0', grad_fn=<NllLossBackward0>)
71000
19900 tensor(5.2148, device='cuda:0', grad_fn=<NllLossBackward0>)
20000 tensor(4.8955, device='cuda:0', grad_fn=<NllLossBackward0>)
20100 tensor(4.9404, device='cuda:0', grad_fn=<NllLossBackward0>)
72000
20200 tensor(5.0525, device='cuda:0', grad_fn=<NllLossBackward0>)
20300 tensor(5.2610, device='cuda:0', grad_fn=<NllLossBackward0>)
73000
20400 tensor(5.2233, device='cuda:0', grad_fn=<NllLossBackward0>)
20500 tensor(5.1364, device='cuda:0', grad_fn=<NllLossBackward0>)
20600 tensor(4.9917, device='cuda:0', grad_fn=<NllLossBackward0>)
74000
20700 tensor(4.9608, device='cuda:0', grad_fn=<NllLossBackward0>)
20800 tensor(5.1992, device='cuda:0', grad_fn=<NllLossBackward0>)
20900 tensor(5.1686, device='cuda:0', grad_fn=<NllLossBackward0>)
75000
21000 tensor(4.8483, device='cuda:0', grad_fn=<NllLossBackward0>)
21100 tensor(5.4130, device='cuda:0', grad_fn=<NllLossBackward0>)
21200 tensor(4.7093, device='cuda:0', grad_fn=<NllLossBackward0>)
76000
21300 tensor(5.2148, device='cuda:0', grad_fn=<NllLossBackward0>)
21400 tensor(4.9624, device='cuda:0', grad_fn=<NllLossBackward0>)
21500 tensor(4.9292, device='cuda:0', grad_fn=<NllLossBackward0>)
77000
21600 tensor(5.1661, device='cuda:0', grad_fn=<NllLossBackward0>)
21700 tensor(5.0725, device='cuda:0', grad_fn=<NllLossBackward0>)
21800 tensor(4.7123, device='cuda:0', grad_fn=<NllLossBackward0>)
78000
21900 tensor(5.2135, device='cuda:0', grad_fn=<NllLossBackward0>)
22000 tensor(4.8017, device='cuda:0', grad_fn=<NllLossBackward0>)
79000
22100 tensor(5.1210, device='cuda:0', grad_fn=<NllLossBackward0>)
22200 tensor(5.0554, device='cuda:0', grad_fn=<NllLossBackward0>)
22300 tensor(5.0997, device='cuda:0', grad_fn=<NllLossBackward0>)
80000
22400 tensor(4.8383, device='cuda:0', grad_fn=<NllLossBackward0>)
22500 tensor(4.5534, device='cuda:0', grad_fn=<NllLossBackward0>)
22600 tensor(5.1061, device='cuda:0', grad_fn=<NllLossBackward0>)
81000
22700 tensor(4.9188, device='cuda:0', grad_fn=<NllLossBackward0>)
22800 tensor(5.0239, device='cuda:0', grad_fn=<NllLossBackward0>)
22900 tensor(4.9186, device='cuda:0', grad_fn=<NllLossBackward0>)
82000
23000 tensor(5.3097, device='cuda:0', grad_fn=<NllLossBackward0>)
23100 tensor(5.0750, device='cuda:0', grad_fn=<NllLossBackward0>)
83000
23200 tensor(4.9890, device='cuda:0', grad_fn=<NllLossBackward0>)
23300 tensor(5.1193, device='cuda:0', grad_fn=<NllLossBackward0>)
23400 tensor(5.0708, device='cuda:0', grad_fn=<NllLossBackward0>)
84000
23500 tensor(4.9966, device='cuda:0', grad_fn=<NllLossBackward0>)
23600 tensor(5.1132, device='cuda:0', grad_fn=<NllLossBackward0>)
23700 tensor(5.3187, device='cuda:0', grad_fn=<NllLossBackward0>)
85000
23800 tensor(5.2937, device='cuda:0', grad_fn=<NllLossBackward0>)
23900 tensor(5.2049, device='cuda:0', grad_fn=<NllLossBackward0>)
24000 tensor(5.0551, device='cuda:0', grad_fn=<NllLossBackward0>)
86000
24100 tensor(5.3820, device='cuda:0', grad_fn=<NllLossBackward0>)
24200 tensor(5.2243, device='cuda:0', grad_fn=<NllLossBackward0>)
24300 tensor(5.0407, device='cuda:0', grad_fn=<NllLossBackward0>)
87000
24400 tensor(5.1378, device='cuda:0', grad_fn=<NllLossBackward0>)
24500 tensor(4.8982, device='cuda:0', grad_fn=<NllLossBackward0>)
88000
24600 tensor(5.0213, device='cuda:0', grad_fn=<NllLossBackward0>)
24700 tensor(5.1981, device='cuda:0', grad_fn=<NllLossBackward0>)
24800 tensor(5.2277, device='cuda:0', grad_fn=<NllLossBackward0>)
89000
24900 tensor(5.1798, device='cuda:0', grad_fn=<NllLossBackward0>)
25000 tensor(5.4307, device='cuda:0', grad_fn=<NllLossBackward0>)
25100 tensor(5.0697, device='cuda:0', grad_fn=<NllLossBackward0>)
90000
25200 tensor(5.1689, device='cuda:0', grad_fn=<NllLossBackward0>)
25300 tensor(5.1092, device='cuda:0', grad_fn=<NllLossBackward0>)
25400 tensor(5.0354, device='cuda:0', grad_fn=<NllLossBackward0>)
91000
25500 tensor(4.9247, device='cuda:0', grad_fn=<NllLossBackward0>)
25600 tensor(4.6927, device='cuda:0', grad_fn=<NllLossBackward0>)
25700 tensor(5.0795, device='cuda:0', grad_fn=<NllLossBackward0>)
92000
25800 tensor(5.0327, device='cuda:0', grad_fn=<NllLossBackward0>)
25900 tensor(5.1922, device='cuda:0', grad_fn=<NllLossBackward0>)
93000
26000 tensor(4.7922, device='cuda:0', grad_fn=<NllLossBackward0>)
26100 tensor(5.2135, device='cuda:0', grad_fn=<NllLossBackward0>)
26200 tensor(4.8716, device='cuda:0', grad_fn=<NllLossBackward0>)
94000
26300 tensor(5.1197, device='cuda:0', grad_fn=<NllLossBackward0>)
26400 tensor(5.0293, device='cuda:0', grad_fn=<NllLossBackward0>)
26500 tensor(5.0354, device='cuda:0', grad_fn=<NllLossBackward0>)
95000
26600 tensor(5.2529, device='cuda:0', grad_fn=<NllLossBackward0>)
26700 tensor(5.0883, device='cuda:0', grad_fn=<NllLossBackward0>)
26800 tensor(4.8198, device='cuda:0', grad_fn=<NllLossBackward0>)
96000
26900 tensor(5.0282, device='cuda:0', grad_fn=<NllLossBackward0>)
27000 tensor(4.7931, device='cuda:0', grad_fn=<NllLossBackward0>)
27100 tensor(5.0120, device='cuda:0', grad_fn=<NllLossBackward0>)
97000
27200 tensor(5.0787, device='cuda:0', grad_fn=<NllLossBackward0>)
27300 tensor(5.0908, device='cuda:0', grad_fn=<NllLossBackward0>)
98000
27400 tensor(4.9436, device='cuda:0', grad_fn=<NllLossBackward0>)
27500 tensor(5.1993, device='cuda:0', grad_fn=<NllLossBackward0>)
27600 tensor(5.1542, device='cuda:0', grad_fn=<NllLossBackward0>)
99000
27700 tensor(5.1619, device='cuda:0', grad_fn=<NllLossBackward0>)
27800 tensor(4.9495, device='cuda:0', grad_fn=<NllLossBackward0>)
27900 tensor(4.9496, device='cuda:0', grad_fn=<NllLossBackward0>)
100000
28000 tensor(5.3698, device='cuda:0', grad_fn=<NllLossBackward0>)
28100 tensor(5.3364, device='cuda:0', grad_fn=<NllLossBackward0>)
28200 tensor(4.9353, device='cuda:0', grad_fn=<NllLossBackward0>)
101000
28300 tensor(5.1149, device='cuda:0', grad_fn=<NllLossBackward0>)
28400 tensor(4.7524, device='cuda:0', grad_fn=<NllLossBackward0>)
28500 tensor(5.0890, device='cuda:0', grad_fn=<NllLossBackward0>)
102000
28600 tensor(5.5074, device='cuda:0', grad_fn=<NllLossBackward0>)
28700 tensor(5.1043, device='cuda:0', grad_fn=<NllLossBackward0>)
103000
28800 tensor(5.0942, device='cuda:0', grad_fn=<NllLossBackward0>)
28900 tensor(4.7643, device='cuda:0', grad_fn=<NllLossBackward0>)
29000 tensor(4.9018, device='cuda:0', grad_fn=<NllLossBackward0>)
104000
29100 tensor(5.1374, device='cuda:0', grad_fn=<NllLossBackward0>)
29200 tensor(4.8248, device='cuda:0', grad_fn=<NllLossBackward0>)
29300 tensor(5.1806, device='cuda:0', grad_fn=<NllLossBackward0>)
105000
29400 tensor(5.1401, device='cuda:0', grad_fn=<NllLossBackward0>)
29500 tensor(4.9909, device='cuda:0', grad_fn=<NllLossBackward0>)
29600 tensor(5.3300, device='cuda:0', grad_fn=<NllLossBackward0>)
106000
29700 tensor(4.7358, device='cuda:0', grad_fn=<NllLossBackward0>)
29800 tensor(4.8462, device='cuda:0', grad_fn=<NllLossBackward0>)
29900 tensor(4.9805, device='cuda:0', grad_fn=<NllLossBackward0>)
107000
30000 tensor(5.1522, device='cuda:0', grad_fn=<NllLossBackward0>)
30100 tensor(5.0940, device='cuda:0', grad_fn=<NllLossBackward0>)
108000
30200 tensor(5.1651, device='cuda:0', grad_fn=<NllLossBackward0>)
30300 tensor(4.8124, device='cuda:0', grad_fn=<NllLossBackward0>)
30400 tensor(4.7609, device='cuda:0', grad_fn=<NllLossBackward0>)
109000
30500 tensor(4.8187, device='cuda:0', grad_fn=<NllLossBackward0>)
30600 tensor(4.9865, device='cuda:0', grad_fn=<NllLossBackward0>)
30700 tensor(5.0208, device='cuda:0', grad_fn=<NllLossBackward0>)
110000
30800 tensor(5.0904, device='cuda:0', grad_fn=<NllLossBackward0>)
30900 tensor(4.9448, device='cuda:0', grad_fn=<NllLossBackward0>)
31000 tensor(5.1119, device='cuda:0', grad_fn=<NllLossBackward0>)
111000
31100 tensor(5.1994, device='cuda:0', grad_fn=<NllLossBackward0>)
31200 tensor(5.0280, device='cuda:0', grad_fn=<NllLossBackward0>)
31300 tensor(5.2946, device='cuda:0', grad_fn=<NllLossBackward0>)
112000
31400 tensor(4.9557, device='cuda:0', grad_fn=<NllLossBackward0>)
31500 tensor(4.9296, device='cuda:0', grad_fn=<NllLossBackward0>)
113000
31600 tensor(4.8751, device='cuda:0', grad_fn=<NllLossBackward0>)
31700 tensor(5.3086, device='cuda:0', grad_fn=<NllLossBackward0>)
31800 tensor(4.7567, device='cuda:0', grad_fn=<NllLossBackward0>)
114000
31900 tensor(4.9941, device='cuda:0', grad_fn=<NllLossBackward0>)
32000 tensor(5.2035, device='cuda:0', grad_fn=<NllLossBackward0>)
32100 tensor(4.8145, device='cuda:0', grad_fn=<NllLossBackward0>)
115000
32200 tensor(5.3565, device='cuda:0', grad_fn=<NllLossBackward0>)
32300 tensor(4.9674, device='cuda:0', grad_fn=<NllLossBackward0>)
32400 tensor(4.9422, device='cuda:0', grad_fn=<NllLossBackward0>)
116000
32500 tensor(5.2489, device='cuda:0', grad_fn=<NllLossBackward0>)
32600 tensor(5.0207, device='cuda:0', grad_fn=<NllLossBackward0>)
117000
32700 tensor(5.1569, device='cuda:0', grad_fn=<NllLossBackward0>)
32800 tensor(5.0727, device='cuda:0', grad_fn=<NllLossBackward0>)
32900 tensor(4.9706, device='cuda:0', grad_fn=<NllLossBackward0>)
118000
33000 tensor(5.0288, device='cuda:0', grad_fn=<NllLossBackward0>)
33100 tensor(5.1344, device='cuda:0', grad_fn=<NllLossBackward0>)
33200 tensor(5.0739, device='cuda:0', grad_fn=<NllLossBackward0>)
119000
33300 tensor(4.7502, device='cuda:0', grad_fn=<NllLossBackward0>)
33400 tensor(4.6710, device='cuda:0', grad_fn=<NllLossBackward0>)
33500 tensor(5.3873, device='cuda:0', grad_fn=<NllLossBackward0>)
120000
33600 tensor(4.9222, device='cuda:0', grad_fn=<NllLossBackward0>)
33700 tensor(5.1264, device='cuda:0', grad_fn=<NllLossBackward0>)
33800 tensor(4.9766, device='cuda:0', grad_fn=<NllLossBackward0>)
121000
33900 tensor(5.0954, device='cuda:0', grad_fn=<NllLossBackward0>)
34000 tensor(5.0210, device='cuda:0', grad_fn=<NllLossBackward0>)
122000
34100 tensor(5.0876, device='cuda:0', grad_fn=<NllLossBackward0>)
34200 tensor(5.1302, device='cuda:0', grad_fn=<NllLossBackward0>)
34300 tensor(5.2983, device='cuda:0', grad_fn=<NllLossBackward0>)
123000
34400 tensor(5.2121, device='cuda:0', grad_fn=<NllLossBackward0>)
34500 tensor(5.3244, device='cuda:0', grad_fn=<NllLossBackward0>)
34600 tensor(4.9485, device='cuda:0', grad_fn=<NllLossBackward0>)
124000
34700 tensor(5.2090, device='cuda:0', grad_fn=<NllLossBackward0>)
34800 tensor(4.9158, device='cuda:0', grad_fn=<NllLossBackward0>)
34900 tensor(5.2443, device='cuda:0', grad_fn=<NllLossBackward0>)
125000
35000 tensor(5.1179, device='cuda:0', grad_fn=<NllLossBackward0>)
35100 tensor(5.0717, device='cuda:0', grad_fn=<NllLossBackward0>)
35200 tensor(5.1812, device='cuda:0', grad_fn=<NllLossBackward0>)
126000
35300 tensor(4.9081, device='cuda:0', grad_fn=<NllLossBackward0>)
35400 tensor(5.1653, device='cuda:0', grad_fn=<NllLossBackward0>)
127000
35500 tensor(5.1505, device='cuda:0', grad_fn=<NllLossBackward0>)
35600 tensor(5.2750, device='cuda:0', grad_fn=<NllLossBackward0>)
35700 tensor(5.4164, device='cuda:0', grad_fn=<NllLossBackward0>)
128000
35800 tensor(5.1043, device='cuda:0', grad_fn=<NllLossBackward0>)
35900 tensor(5.1016, device='cuda:0', grad_fn=<NllLossBackward0>)
36000 tensor(5.1461, device='cuda:0', grad_fn=<NllLossBackward0>)
129000
36100 tensor(5.2056, device='cuda:0', grad_fn=<NllLossBackward0>)
36200 tensor(5.1355, device='cuda:0', grad_fn=<NllLossBackward0>)
36300 tensor(5.3760, device='cuda:0', grad_fn=<NllLossBackward0>)
130000
36400 tensor(5.2054, device='cuda:0', grad_fn=<NllLossBackward0>)
36500 tensor(5.3691, device='cuda:0', grad_fn=<NllLossBackward0>)
131000
36600 tensor(4.8645, device='cuda:0', grad_fn=<NllLossBackward0>)
36700 tensor(5.4200, device='cuda:0', grad_fn=<NllLossBackward0>)
36800 tensor(5.2951, device='cuda:0', grad_fn=<NllLossBackward0>)
132000
36900 tensor(4.9001, device='cuda:0', grad_fn=<NllLossBackward0>)
37000 tensor(5.1626, device='cuda:0', grad_fn=<NllLossBackward0>)
37100 tensor(4.8412, device='cuda:0', grad_fn=<NllLossBackward0>)
133000
37200 tensor(4.7379, device='cuda:0', grad_fn=<NllLossBackward0>)
37300 tensor(4.8720, device='cuda:0', grad_fn=<NllLossBackward0>)
37400 tensor(4.8967, device='cuda:0', grad_fn=<NllLossBackward0>)
134000
37500 tensor(4.7772, device='cuda:0', grad_fn=<NllLossBackward0>)
37600 tensor(4.8302, device='cuda:0', grad_fn=<NllLossBackward0>)
37700 tensor(5.1201, device='cuda:0', grad_fn=<NllLossBackward0>)
135000
37800 tensor(5.1669, device='cuda:0', grad_fn=<NllLossBackward0>)
37900 tensor(5.2367, device='cuda:0', grad_fn=<NllLossBackward0>)
136000
38000 tensor(4.9588, device='cuda:0', grad_fn=<NllLossBackward0>)
38100 tensor(5.1460, device='cuda:0', grad_fn=<NllLossBackward0>)
38200 tensor(4.8495, device='cuda:0', grad_fn=<NllLossBackward0>)
137000
38300 tensor(5.2402, device='cuda:0', grad_fn=<NllLossBackward0>)
38400 tensor(4.9173, device='cuda:0', grad_fn=<NllLossBackward0>)
38500 tensor(5.1901, device='cuda:0', grad_fn=<NllLossBackward0>)
138000
38600 tensor(4.7848, device='cuda:0', grad_fn=<NllLossBackward0>)
38700 tensor(5.0920, device='cuda:0', grad_fn=<NllLossBackward0>)
38800 tensor(4.8625, device='cuda:0', grad_fn=<NllLossBackward0>)
139000
38900 tensor(4.9405, device='cuda:0', grad_fn=<NllLossBackward0>)
39000 tensor(5.1992, device='cuda:0', grad_fn=<NllLossBackward0>)
39100 tensor(4.8887, device='cuda:0', grad_fn=<NllLossBackward0>)
140000
39200 tensor(5.3122, device='cuda:0', grad_fn=<NllLossBackward0>)
39300 tensor(5.0380, device='cuda:0', grad_fn=<NllLossBackward0>)
141000
39400 tensor(4.9989, device='cuda:0', grad_fn=<NllLossBackward0>)
39500 tensor(5.1872, device='cuda:0', grad_fn=<NllLossBackward0>)
39600 tensor(4.9061, device='cuda:0', grad_fn=<NllLossBackward0>)
142000
39700 tensor(5.2725, device='cuda:0', grad_fn=<NllLossBackward0>)
39800 tensor(5.1865, device='cuda:0', grad_fn=<NllLossBackward0>)
39900 tensor(4.8060, device='cuda:0', grad_fn=<NllLossBackward0>)
143000
40000 tensor(4.9448, device='cuda:0', grad_fn=<NllLossBackward0>)
40100 tensor(5.0130, device='cuda:0', grad_fn=<NllLossBackward0>)
40200 tensor(5.0753, device='cuda:0', grad_fn=<NllLossBackward0>)
144000
40300 tensor(5.0743, device='cuda:0', grad_fn=<NllLossBackward0>)
40400 tensor(5.3216, device='cuda:0', grad_fn=<NllLossBackward0>)
40500 tensor(4.7829, device='cuda:0', grad_fn=<NllLossBackward0>)
145000
40600 tensor(5.3467, device='cuda:0', grad_fn=<NllLossBackward0>)
40700 tensor(5.2229, device='cuda:0', grad_fn=<NllLossBackward0>)
146000
40800 tensor(4.9148, device='cuda:0', grad_fn=<NllLossBackward0>)
40900 tensor(4.9961, device='cuda:0', grad_fn=<NllLossBackward0>)
41000 tensor(5.1233, device='cuda:0', grad_fn=<NllLossBackward0>)
147000
41100 tensor(4.9480, device='cuda:0', grad_fn=<NllLossBackward0>)
41200 tensor(5.1455, device='cuda:0', grad_fn=<NllLossBackward0>)
41300 tensor(4.7624, device='cuda:0', grad_fn=<NllLossBackward0>)
148000
41400 tensor(4.8317, device='cuda:0', grad_fn=<NllLossBackward0>)
41500 tensor(5.3050, device='cuda:0', grad_fn=<NllLossBackward0>)
41600 tensor(4.8164, device='cuda:0', grad_fn=<NllLossBackward0>)
149000
41700 tensor(5.1542, device='cuda:0', grad_fn=<NllLossBackward0>)
41800 tensor(5.0889, device='cuda:0', grad_fn=<NllLossBackward0>)
41900 tensor(5.0844, device='cuda:0', grad_fn=<NllLossBackward0>)
150000
42000 tensor(5.2015, device='cuda:0', grad_fn=<NllLossBackward0>)
42100 tensor(4.6232, device='cuda:0', grad_fn=<NllLossBackward0>)
151000
42200 tensor(4.9591, device='cuda:0', grad_fn=<NllLossBackward0>)
42300 tensor(5.0888, device='cuda:0', grad_fn=<NllLossBackward0>)
42400 tensor(4.9506, device='cuda:0', grad_fn=<NllLossBackward0>)
152000
42500 tensor(5.0405, device='cuda:0', grad_fn=<NllLossBackward0>)
42600 tensor(4.5948, device='cuda:0', grad_fn=<NllLossBackward0>)
42700 tensor(5.2494, device='cuda:0', grad_fn=<NllLossBackward0>)
153000
42800 tensor(5.1611, device='cuda:0', grad_fn=<NllLossBackward0>)
42900 tensor(4.9517, device='cuda:0', grad_fn=<NllLossBackward0>)
43000 tensor(5.1292, device='cuda:0', grad_fn=<NllLossBackward0>)
154000
43100 tensor(4.8461, device='cuda:0', grad_fn=<NllLossBackward0>)
43200 tensor(5.0209, device='cuda:0', grad_fn=<NllLossBackward0>)
43300 tensor(5.1240, device='cuda:0', grad_fn=<NllLossBackward0>)
155000
43400 tensor(5.0400, device='cuda:0', grad_fn=<NllLossBackward0>)
43500 tensor(5.3999, device='cuda:0', grad_fn=<NllLossBackward0>)
156000
43600 tensor(5.3041, device='cuda:0', grad_fn=<NllLossBackward0>)
43700 tensor(4.9734, device='cuda:0', grad_fn=<NllLossBackward0>)
43800 tensor(5.1028, device='cuda:0', grad_fn=<NllLossBackward0>)
157000
43900 tensor(4.9373, device='cuda:0', grad_fn=<NllLossBackward0>)
44000 tensor(4.7666, device='cuda:0', grad_fn=<NllLossBackward0>)
44100 tensor(5.2840, device='cuda:0', grad_fn=<NllLossBackward0>)
158000
44200 tensor(4.8623, device='cuda:0', grad_fn=<NllLossBackward0>)
44300 tensor(5.3904, device='cuda:0', grad_fn=<NllLossBackward0>)
44400 tensor(4.9841, device='cuda:0', grad_fn=<NllLossBackward0>)
159000
44500 tensor(4.8345, device='cuda:0', grad_fn=<NllLossBackward0>)
44600 tensor(4.9179, device='cuda:0', grad_fn=<NllLossBackward0>)
160000
44700 tensor(4.8921, device='cuda:0', grad_fn=<NllLossBackward0>)
44800 tensor(5.1909, device='cuda:0', grad_fn=<NllLossBackward0>)
44900 tensor(5.0083, device='cuda:0', grad_fn=<NllLossBackward0>)
161000
45000 tensor(4.8715, device='cuda:0', grad_fn=<NllLossBackward0>)
45100 tensor(5.1750, device='cuda:0', grad_fn=<NllLossBackward0>)
45200 tensor(4.9847, device='cuda:0', grad_fn=<NllLossBackward0>)
162000
45300 tensor(5.1416, device='cuda:0', grad_fn=<NllLossBackward0>)
45400 tensor(4.5706, device='cuda:0', grad_fn=<NllLossBackward0>)
45500 tensor(4.8476, device='cuda:0', grad_fn=<NllLossBackward0>)
163000
45600 tensor(5.1753, device='cuda:0', grad_fn=<NllLossBackward0>)
45700 tensor(5.1638, device='cuda:0', grad_fn=<NllLossBackward0>)
45800 tensor(5.0620, device='cuda:0', grad_fn=<NllLossBackward0>)
164000
45900 tensor(4.8623, device='cuda:0', grad_fn=<NllLossBackward0>)
46000 tensor(5.0958, device='cuda:0', grad_fn=<NllLossBackward0>)
165000
46100 tensor(4.9271, device='cuda:0', grad_fn=<NllLossBackward0>)
46200 tensor(4.8955, device='cuda:0', grad_fn=<NllLossBackward0>)
46300 tensor(5.2268, device='cuda:0', grad_fn=<NllLossBackward0>)
166000
46400 tensor(5.2585, device='cuda:0', grad_fn=<NllLossBackward0>)
46500 tensor(4.8509, device='cuda:0', grad_fn=<NllLossBackward0>)
46600 tensor(4.8656, device='cuda:0', grad_fn=<NllLossBackward0>)
167000
46700 tensor(5.1203, device='cuda:0', grad_fn=<NllLossBackward0>)
46800 tensor(4.7754, device='cuda:0', grad_fn=<NllLossBackward0>)
46900 tensor(4.8504, device='cuda:0', grad_fn=<NllLossBackward0>)
168000
47000 tensor(5.1042, device='cuda:0', grad_fn=<NllLossBackward0>)
47100 tensor(5.3206, device='cuda:0', grad_fn=<NllLossBackward0>)
47200 tensor(5.0201, device='cuda:0', grad_fn=<NllLossBackward0>)
169000
47300 tensor(5.1387, device='cuda:0', grad_fn=<NllLossBackward0>)
47400 tensor(4.9378, device='cuda:0', grad_fn=<NllLossBackward0>)
170000
47500 tensor(5.1120, device='cuda:0', grad_fn=<NllLossBackward0>)
47600 tensor(5.0451, device='cuda:0', grad_fn=<NllLossBackward0>)
47700 tensor(5.0578, device='cuda:0', grad_fn=<NllLossBackward0>)
171000
47800 tensor(5.1534, device='cuda:0', grad_fn=<NllLossBackward0>)
47900 tensor(5.1646, device='cuda:0', grad_fn=<NllLossBackward0>)
48000 tensor(4.8708, device='cuda:0', grad_fn=<NllLossBackward0>)
172000
48100 tensor(5.2243, device='cuda:0', grad_fn=<NllLossBackward0>)
48200 tensor(5.0376, device='cuda:0', grad_fn=<NllLossBackward0>)
48300 tensor(5.1148, device='cuda:0', grad_fn=<NllLossBackward0>)
173000
48400 tensor(4.7304, device='cuda:0', grad_fn=<NllLossBackward0>)
48500 tensor(4.9864, device='cuda:0', grad_fn=<NllLossBackward0>)
48600 tensor(5.1053, device='cuda:0', grad_fn=<NllLossBackward0>)
174000
48700 tensor(5.0854, device='cuda:0', grad_fn=<NllLossBackward0>)
48800 tensor(4.8705, device='cuda:0', grad_fn=<NllLossBackward0>)
175000
48900 tensor(5.1415, device='cuda:0', grad_fn=<NllLossBackward0>)
49000 tensor(4.9185, device='cuda:0', grad_fn=<NllLossBackward0>)
49100 tensor(5.1249, device='cuda:0', grad_fn=<NllLossBackward0>)
176000
49200 tensor(5.1852, device='cuda:0', grad_fn=<NllLossBackward0>)
49300 tensor(4.9728, device='cuda:0', grad_fn=<NllLossBackward0>)
49400 tensor(5.1541, device='cuda:0', grad_fn=<NllLossBackward0>)
177000
49500 tensor(5.2388, device='cuda:0', grad_fn=<NllLossBackward0>)
49600 tensor(4.8876, device='cuda:0', grad_fn=<NllLossBackward0>)
49700 tensor(5.0066, device='cuda:0', grad_fn=<NllLossBackward0>)
178000
49800 tensor(4.9596, device='cuda:0', grad_fn=<NllLossBackward0>)
49900 tensor(5.1618, device='cuda:0', grad_fn=<NllLossBackward0>)
50000 tensor(5.0436, device='cuda:0', grad_fn=<NllLossBackward0>)
179000
50100 tensor(5.1911, device='cuda:0', grad_fn=<NllLossBackward0>)
50200 tensor(4.9044, device='cuda:0', grad_fn=<NllLossBackward0>)
180000
50300 tensor(5.1698, device='cuda:0', grad_fn=<NllLossBackward0>)
50400 tensor(4.8785, device='cuda:0', grad_fn=<NllLossBackward0>)
50500 tensor(5.1889, device='cuda:0', grad_fn=<NllLossBackward0>)
181000
50600 tensor(4.9799, device='cuda:0', grad_fn=<NllLossBackward0>)
50700 tensor(4.8507, device='cuda:0', grad_fn=<NllLossBackward0>)
50800 tensor(4.7525, device='cuda:0', grad_fn=<NllLossBackward0>)
182000
50900 tensor(5.0997, device='cuda:0', grad_fn=<NllLossBackward0>)
51000 tensor(5.1681, device='cuda:0', grad_fn=<NllLossBackward0>)
51100 tensor(5.0502, device='cuda:0', grad_fn=<NllLossBackward0>)
183000
51200 tensor(5.2121, device='cuda:0', grad_fn=<NllLossBackward0>)
51300 tensor(4.7797, device='cuda:0', grad_fn=<NllLossBackward0>)
51400 tensor(5.1379, device='cuda:0', grad_fn=<NllLossBackward0>)
184000
51500 tensor(5.0826, device='cuda:0', grad_fn=<NllLossBackward0>)
51600 tensor(5.1736, device='cuda:0', grad_fn=<NllLossBackward0>)
185000
51700 tensor(5.0336, device='cuda:0', grad_fn=<NllLossBackward0>)
51800 tensor(5.1728, device='cuda:0', grad_fn=<NllLossBackward0>)
51900 tensor(4.9617, device='cuda:0', grad_fn=<NllLossBackward0>)
186000
52000 tensor(5.1423, device='cuda:0', grad_fn=<NllLossBackward0>)
52100 tensor(5.2641, device='cuda:0', grad_fn=<NllLossBackward0>)
52200 tensor(5.0587, device='cuda:0', grad_fn=<NllLossBackward0>)
187000
52300 tensor(4.9935, device='cuda:0', grad_fn=<NllLossBackward0>)
52400 tensor(5.0743, device='cuda:0', grad_fn=<NllLossBackward0>)
52500 tensor(5.0771, device='cuda:0', grad_fn=<NllLossBackward0>)
188000
52600 tensor(5.0109, device='cuda:0', grad_fn=<NllLossBackward0>)
52700 tensor(5.1416, device='cuda:0', grad_fn=<NllLossBackward0>)
189000
52800 tensor(5.1943, device='cuda:0', grad_fn=<NllLossBackward0>)
52900 tensor(5.0125, device='cuda:0', grad_fn=<NllLossBackward0>)
53000 tensor(4.9937, device='cuda:0', grad_fn=<NllLossBackward0>)
190000
53100 tensor(4.9477, device='cuda:0', grad_fn=<NllLossBackward0>)
53200 tensor(4.7327, device='cuda:0', grad_fn=<NllLossBackward0>)
53300 tensor(5.2251, device='cuda:0', grad_fn=<NllLossBackward0>)
191000
53400 tensor(5.1211, device='cuda:0', grad_fn=<NllLossBackward0>)
53500 tensor(5.0256, device='cuda:0', grad_fn=<NllLossBackward0>)
53600 tensor(5.1351, device='cuda:0', grad_fn=<NllLossBackward0>)
192000
53700 tensor(5.1121, device='cuda:0', grad_fn=<NllLossBackward0>)
53800 tensor(4.7721, device='cuda:0', grad_fn=<NllLossBackward0>)
53900 tensor(4.7813, device='cuda:0', grad_fn=<NllLossBackward0>)
193000
54000 tensor(5.2923, device='cuda:0', grad_fn=<NllLossBackward0>)
54100 tensor(4.4067, device='cuda:0', grad_fn=<NllLossBackward0>)
194000
54200 tensor(5.1834, device='cuda:0', grad_fn=<NllLossBackward0>)
54300 tensor(4.9698, device='cuda:0', grad_fn=<NllLossBackward0>)
54400 tensor(4.6432, device='cuda:0', grad_fn=<NllLossBackward0>)
195000
54500 tensor(5.2431, device='cuda:0', grad_fn=<NllLossBackward0>)
54600 tensor(5.2523, device='cuda:0', grad_fn=<NllLossBackward0>)
54700 tensor(4.9880, device='cuda:0', grad_fn=<NllLossBackward0>)
196000
54800 tensor(5.1705, device='cuda:0', grad_fn=<NllLossBackward0>)
54900 tensor(4.8544, device='cuda:0', grad_fn=<NllLossBackward0>)
55000 tensor(5.0054, device='cuda:0', grad_fn=<NllLossBackward0>)
197000
55100 tensor(5.0684, device='cuda:0', grad_fn=<NllLossBackward0>)
55200 tensor(5.1112, device='cuda:0', grad_fn=<NllLossBackward0>)
55300 tensor(5.0787, device='cuda:0', grad_fn=<NllLossBackward0>)
198000
55400 tensor(5.2173, device='cuda:0', grad_fn=<NllLossBackward0>)
55500 tensor(5.2430, device='cuda:0', grad_fn=<NllLossBackward0>)
199000
55600 tensor(4.8266, device='cuda:0', grad_fn=<NllLossBackward0>)
55700 tensor(5.0219, device='cuda:0', grad_fn=<NllLossBackward0>)
55800 tensor(4.5834, device='cuda:0', grad_fn=<NllLossBackward0>)
200000
55900 tensor(5.0455, device='cuda:0', grad_fn=<NllLossBackward0>)
56000 tensor(4.9394, device='cuda:0', grad_fn=<NllLossBackward0>)
56100 tensor(5.2400, device='cuda:0', grad_fn=<NllLossBackward0>)
201000
56200 tensor(5.2727, device='cuda:0', grad_fn=<NllLossBackward0>)
56300 tensor(5.0803, device='cuda:0', grad_fn=<NllLossBackward0>)
56400 tensor(5.1270, device='cuda:0', grad_fn=<NllLossBackward0>)
202000
56500 tensor(5.1758, device='cuda:0', grad_fn=<NllLossBackward0>)
56600 tensor(4.8928, device='cuda:0', grad_fn=<NllLossBackward0>)
56700 tensor(5.0119, device='cuda:0', grad_fn=<NllLossBackward0>)
203000
56800 tensor(5.5221, device='cuda:0', grad_fn=<NllLossBackward0>)
56900 tensor(4.8963, device='cuda:0', grad_fn=<NllLossBackward0>)
204000
57000 tensor(5.2899, device='cuda:0', grad_fn=<NllLossBackward0>)
57100 tensor(5.0742, device='cuda:0', grad_fn=<NllLossBackward0>)
57200 tensor(4.8940, device='cuda:0', grad_fn=<NllLossBackward0>)
205000
57300 tensor(5.0177, device='cuda:0', grad_fn=<NllLossBackward0>)
57400 tensor(5.2162, device='cuda:0', grad_fn=<NllLossBackward0>)
57500 tensor(4.8549, device='cuda:0', grad_fn=<NllLossBackward0>)
206000
57600 tensor(4.7256, device='cuda:0', grad_fn=<NllLossBackward0>)
57700 tensor(4.9188, device='cuda:0', grad_fn=<NllLossBackward0>)
57800 tensor(5.2799, device='cuda:0', grad_fn=<NllLossBackward0>)
207000
57900 tensor(5.2005, device='cuda:0', grad_fn=<NllLossBackward0>)
58000 tensor(4.9628, device='cuda:0', grad_fn=<NllLossBackward0>)
208000
58100 tensor(5.3107, device='cuda:0', grad_fn=<NllLossBackward0>)
58200 tensor(5.3371, device='cuda:0', grad_fn=<NllLossBackward0>)
58300 tensor(4.9034, device='cuda:0', grad_fn=<NllLossBackward0>)
209000
58400 tensor(4.8145, device='cuda:0', grad_fn=<NllLossBackward0>)
58500 tensor(4.9395, device='cuda:0', grad_fn=<NllLossBackward0>)
58600 tensor(5.3965, device='cuda:0', grad_fn=<NllLossBackward0>)
210000
58700 tensor(4.9055, device='cuda:0', grad_fn=<NllLossBackward0>)
58800 tensor(5.0192, device='cuda:0', grad_fn=<NllLossBackward0>)
58900 tensor(5.2319, device='cuda:0', grad_fn=<NllLossBackward0>)
211000
59000 tensor(5.1120, device='cuda:0', grad_fn=<NllLossBackward0>)
59100 tensor(4.9841, device='cuda:0', grad_fn=<NllLossBackward0>)
59200 tensor(5.0341, device='cuda:0', grad_fn=<NllLossBackward0>)
212000
59300 tensor(4.8836, device='cuda:0', grad_fn=<NllLossBackward0>)
59400 tensor(5.0700, device='cuda:0', grad_fn=<NllLossBackward0>)
213000
59500 tensor(5.0094, device='cuda:0', grad_fn=<NllLossBackward0>)
59600 tensor(4.9719, device='cuda:0', grad_fn=<NllLossBackward0>)
59700 tensor(4.8263, device='cuda:0', grad_fn=<NllLossBackward0>)
214000
59800 tensor(5.0422, device='cuda:0', grad_fn=<NllLossBackward0>)
59900 tensor(5.0391, device='cuda:0', grad_fn=<NllLossBackward0>)
60000 tensor(5.0464, device='cuda:0', grad_fn=<NllLossBackward0>)
215000
60100 tensor(5.1302, device='cuda:0', grad_fn=<NllLossBackward0>)
60200 tensor(5.1667, device='cuda:0', grad_fn=<NllLossBackward0>)
60300 tensor(4.9755, device='cuda:0', grad_fn=<NllLossBackward0>)
216000
60400 tensor(5.0662, device='cuda:0', grad_fn=<NllLossBackward0>)
60500 tensor(5.1330, device='cuda:0', grad_fn=<NllLossBackward0>)
60600 tensor(5.0362, device='cuda:0', grad_fn=<NllLossBackward0>)
217000
60700 tensor(4.9462, device='cuda:0', grad_fn=<NllLossBackward0>)
60800 tensor(5.1028, device='cuda:0', grad_fn=<NllLossBackward0>)
218000
60900 tensor(5.1816, device='cuda:0', grad_fn=<NllLossBackward0>)
61000 tensor(5.1451, device='cuda:0', grad_fn=<NllLossBackward0>)
61100 tensor(5.0389, device='cuda:0', grad_fn=<NllLossBackward0>)
219000
61200 tensor(4.7783, device='cuda:0', grad_fn=<NllLossBackward0>)
61300 tensor(5.0208, device='cuda:0', grad_fn=<NllLossBackward0>)
61400 tensor(5.2014, device='cuda:0', grad_fn=<NllLossBackward0>)
220000
61500 tensor(5.0953, device='cuda:0', grad_fn=<NllLossBackward0>)
61600 tensor(4.9255, device='cuda:0', grad_fn=<NllLossBackward0>)
61700 tensor(4.9145, device='cuda:0', grad_fn=<NllLossBackward0>)
221000
61800 tensor(4.9751, device='cuda:0', grad_fn=<NllLossBackward0>)
61900 tensor(5.0497, device='cuda:0', grad_fn=<NllLossBackward0>)
62000 tensor(5.2065, device='cuda:0', grad_fn=<NllLossBackward0>)
222000
62100 tensor(4.8016, device='cuda:0', grad_fn=<NllLossBackward0>)
62200 tensor(4.9131, device='cuda:0', grad_fn=<NllLossBackward0>)
223000
62300 tensor(4.9816, device='cuda:0', grad_fn=<NllLossBackward0>)
62400 tensor(4.9049, device='cuda:0', grad_fn=<NllLossBackward0>)
62500 tensor(5.0251, device='cuda:0', grad_fn=<NllLossBackward0>)
224000
62600 tensor(4.9373, device='cuda:0', grad_fn=<NllLossBackward0>)
62700 tensor(5.0984, device='cuda:0', grad_fn=<NllLossBackward0>)
62800 tensor(4.8395, device='cuda:0', grad_fn=<NllLossBackward0>)
225000
62900 tensor(5.3108, device='cuda:0', grad_fn=<NllLossBackward0>)
63000 tensor(5.1682, device='cuda:0', grad_fn=<NllLossBackward0>)
63100 tensor(5.2454, device='cuda:0', grad_fn=<NllLossBackward0>)
226000
63200 tensor(5.1091, device='cuda:0', grad_fn=<NllLossBackward0>)
63300 tensor(4.8577, device='cuda:0', grad_fn=<NllLossBackward0>)
63400 tensor(5.0426, device='cuda:0', grad_fn=<NllLossBackward0>)
227000
63500 tensor(4.9117, device='cuda:0', grad_fn=<NllLossBackward0>)
63600 tensor(4.9555, device='cuda:0', grad_fn=<NllLossBackward0>)
228000
63700 tensor(4.9914, device='cuda:0', grad_fn=<NllLossBackward0>)
63800 tensor(5.2633, device='cuda:0', grad_fn=<NllLossBackward0>)
63900 tensor(5.3451, device='cuda:0', grad_fn=<NllLossBackward0>)
229000
64000 tensor(4.9019, device='cuda:0', grad_fn=<NllLossBackward0>)
64100 tensor(5.1581, device='cuda:0', grad_fn=<NllLossBackward0>)
64200 tensor(5.0796, device='cuda:0', grad_fn=<NllLossBackward0>)
230000
64300 tensor(5.0816, device='cuda:0', grad_fn=<NllLossBackward0>)
64400 tensor(4.9050, device='cuda:0', grad_fn=<NllLossBackward0>)
64500 tensor(4.9127, device='cuda:0', grad_fn=<NllLossBackward0>)
231000
64600 tensor(4.9467, device='cuda:0', grad_fn=<NllLossBackward0>)
64700 tensor(5.3998, device='cuda:0', grad_fn=<NllLossBackward0>)
64800 tensor(5.0784, device='cuda:0', grad_fn=<NllLossBackward0>)
232000
64900 tensor(5.2337, device='cuda:0', grad_fn=<NllLossBackward0>)
65000 tensor(5.1862, device='cuda:0', grad_fn=<NllLossBackward0>)
233000
65100 tensor(4.7268, device='cuda:0', grad_fn=<NllLossBackward0>)
65200 tensor(5.2401, device='cuda:0', grad_fn=<NllLossBackward0>)
65300 tensor(5.0779, device='cuda:0', grad_fn=<NllLossBackward0>)
234000
65400 tensor(5.0225, device='cuda:0', grad_fn=<NllLossBackward0>)
65500 tensor(5.0218, device='cuda:0', grad_fn=<NllLossBackward0>)
65600 tensor(5.1051, device='cuda:0', grad_fn=<NllLossBackward0>)
235000
65700 tensor(4.8759, device='cuda:0', grad_fn=<NllLossBackward0>)
65800 tensor(4.9007, device='cuda:0', grad_fn=<NllLossBackward0>)
65900 tensor(5.1119, device='cuda:0', grad_fn=<NllLossBackward0>)
236000
66000 tensor(5.1512, device='cuda:0', grad_fn=<NllLossBackward0>)
66100 tensor(4.8563, device='cuda:0', grad_fn=<NllLossBackward0>)
237000
66200 tensor(4.8676, device='cuda:0', grad_fn=<NllLossBackward0>)
66300 tensor(4.8076, device='cuda:0', grad_fn=<NllLossBackward0>)
66400 tensor(5.0870, device='cuda:0', grad_fn=<NllLossBackward0>)
238000
66500 tensor(4.8053, device='cuda:0', grad_fn=<NllLossBackward0>)
66600 tensor(4.9573, device='cuda:0', grad_fn=<NllLossBackward0>)
66700 tensor(4.9532, device='cuda:0', grad_fn=<NllLossBackward0>)
239000
66800 tensor(5.0142, device='cuda:0', grad_fn=<NllLossBackward0>)
66900 tensor(5.1463, device='cuda:0', grad_fn=<NllLossBackward0>)
67000 tensor(5.1613, device='cuda:0', grad_fn=<NllLossBackward0>)
240000
67100 tensor(5.0125, device='cuda:0', grad_fn=<NllLossBackward0>)
67200 tensor(5.3587, device='cuda:0', grad_fn=<NllLossBackward0>)
67300 tensor(4.9968, device='cuda:0', grad_fn=<NllLossBackward0>)
241000
67400 tensor(5.1922, device='cuda:0', grad_fn=<NllLossBackward0>)
67500 tensor(5.0287, device='cuda:0', grad_fn=<NllLossBackward0>)
242000
67600 tensor(5.0606, device='cuda:0', grad_fn=<NllLossBackward0>)
67700 tensor(5.0029, device='cuda:0', grad_fn=<NllLossBackward0>)
67800 tensor(5.1391, device='cuda:0', grad_fn=<NllLossBackward0>)
243000
67900 tensor(4.8602, device='cuda:0', grad_fn=<NllLossBackward0>)
68000 tensor(4.9282, device='cuda:0', grad_fn=<NllLossBackward0>)
68100 tensor(4.7940, device='cuda:0', grad_fn=<NllLossBackward0>)
244000
68200 tensor(5.0718, device='cuda:0', grad_fn=<NllLossBackward0>)
68300 tensor(4.9727, device='cuda:0', grad_fn=<NllLossBackward0>)
68400 tensor(5.0790, device='cuda:0', grad_fn=<NllLossBackward0>)
245000
68500 tensor(5.0921, device='cuda:0', grad_fn=<NllLossBackward0>)
68600 tensor(5.0522, device='cuda:0', grad_fn=<NllLossBackward0>)
68700 tensor(4.7111, device='cuda:0', grad_fn=<NllLossBackward0>)
246000
68800 tensor(5.1136, device='cuda:0', grad_fn=<NllLossBackward0>)
68900 tensor(4.7838, device='cuda:0', grad_fn=<NllLossBackward0>)
247000
69000 tensor(5.0396, device='cuda:0', grad_fn=<NllLossBackward0>)
69100 tensor(4.8118, device='cuda:0', grad_fn=<NllLossBackward0>)
69200 tensor(4.7698, device='cuda:0', grad_fn=<NllLossBackward0>)
248000
69300 tensor(5.2231, device='cuda:0', grad_fn=<NllLossBackward0>)
69400 tensor(5.1113, device='cuda:0', grad_fn=<NllLossBackward0>)
69500 tensor(5.0799, device='cuda:0', grad_fn=<NllLossBackward0>)
249000
69600 tensor(5.0015, device='cuda:0', grad_fn=<NllLossBackward0>)
69700 tensor(5.3189, device='cuda:0', grad_fn=<NllLossBackward0>)
69800 tensor(4.8597, device='cuda:0', grad_fn=<NllLossBackward0>)
250000
69900 tensor(5.4064, device='cuda:0', grad_fn=<NllLossBackward0>)
70000 tensor(5.0562, device='cuda:0', grad_fn=<NllLossBackward0>)
70100 tensor(4.9788, device='cuda:0', grad_fn=<NllLossBackward0>)
251000
70200 tensor(5.1238, device='cuda:0', grad_fn=<NllLossBackward0>)
70300 tensor(4.7954, device='cuda:0', grad_fn=<NllLossBackward0>)
252000
70400 tensor(5.1417, device='cuda:0', grad_fn=<NllLossBackward0>)
70500 tensor(5.0851, device='cuda:0', grad_fn=<NllLossBackward0>)
70600 tensor(5.3102, device='cuda:0', grad_fn=<NllLossBackward0>)
253000
70700 tensor(5.2187, device='cuda:0', grad_fn=<NllLossBackward0>)
70800 tensor(5.4180, device='cuda:0', grad_fn=<NllLossBackward0>)
70900 tensor(5.2464, device='cuda:0', grad_fn=<NllLossBackward0>)
254000
71000 tensor(5.0232, device='cuda:0', grad_fn=<NllLossBackward0>)
71100 tensor(5.0971, device='cuda:0', grad_fn=<NllLossBackward0>)
71200 tensor(5.2289, device='cuda:0', grad_fn=<NllLossBackward0>)
255000
71300 tensor(4.8301, device='cuda:0', grad_fn=<NllLossBackward0>)
71400 tensor(4.9594, device='cuda:0', grad_fn=<NllLossBackward0>)
71500 tensor(4.9527, device='cuda:0', grad_fn=<NllLossBackward0>)
256000
71600 tensor(5.1395, device='cuda:0', grad_fn=<NllLossBackward0>)
71700 tensor(5.2029, device='cuda:0', grad_fn=<NllLossBackward0>)
257000
71800 tensor(5.2958, device='cuda:0', grad_fn=<NllLossBackward0>)
71900 tensor(4.9839, device='cuda:0', grad_fn=<NllLossBackward0>)
72000 tensor(5.4251, device='cuda:0', grad_fn=<NllLossBackward0>)
258000
72100 tensor(5.1480, device='cuda:0', grad_fn=<NllLossBackward0>)
72200 tensor(4.9666, device='cuda:0', grad_fn=<NllLossBackward0>)
72300 tensor(5.0831, device='cuda:0', grad_fn=<NllLossBackward0>)
259000
72400 tensor(4.9905, device='cuda:0', grad_fn=<NllLossBackward0>)
72500 tensor(5.0817, device='cuda:0', grad_fn=<NllLossBackward0>)
72600 tensor(5.1276, device='cuda:0', grad_fn=<NllLossBackward0>)
260000
72700 tensor(4.8184, device='cuda:0', grad_fn=<NllLossBackward0>)
72800 tensor(5.2583, device='cuda:0', grad_fn=<NllLossBackward0>)
72900 tensor(4.9964, device='cuda:0', grad_fn=<NllLossBackward0>)
261000
73000 tensor(5.1877, device='cuda:0', grad_fn=<NllLossBackward0>)
73100 tensor(4.8954, device='cuda:0', grad_fn=<NllLossBackward0>)
262000
73200 tensor(4.9722, device='cuda:0', grad_fn=<NllLossBackward0>)
73300 tensor(4.9351, device='cuda:0', grad_fn=<NllLossBackward0>)
73400 tensor(5.2009, device='cuda:0', grad_fn=<NllLossBackward0>)
263000
73500 tensor(5.0204, device='cuda:0', grad_fn=<NllLossBackward0>)
73600 tensor(4.9348, device='cuda:0', grad_fn=<NllLossBackward0>)
73700 tensor(5.0774, device='cuda:0', grad_fn=<NllLossBackward0>)
264000
73800 tensor(5.1100, device='cuda:0', grad_fn=<NllLossBackward0>)
73900 tensor(5.1179, device='cuda:0', grad_fn=<NllLossBackward0>)
74000 tensor(5.2795, device='cuda:0', grad_fn=<NllLossBackward0>)
265000
74100 tensor(4.9642, device='cuda:0', grad_fn=<NllLossBackward0>)
74200 tensor(4.9902, device='cuda:0', grad_fn=<NllLossBackward0>)
74300 tensor(4.9494, device='cuda:0', grad_fn=<NllLossBackward0>)
266000
74400 tensor(5.0772, device='cuda:0', grad_fn=<NllLossBackward0>)
74500 tensor(4.6984, device='cuda:0', grad_fn=<NllLossBackward0>)
267000
74600 tensor(5.2123, device='cuda:0', grad_fn=<NllLossBackward0>)
74700 tensor(5.2695, device='cuda:0', grad_fn=<NllLossBackward0>)
74800 tensor(4.9678, device='cuda:0', grad_fn=<NllLossBackward0>)
268000
74900 tensor(4.7941, device='cuda:0', grad_fn=<NllLossBackward0>)
75000 tensor(5.0980, device='cuda:0', grad_fn=<NllLossBackward0>)
75100 tensor(5.2916, device='cuda:0', grad_fn=<NllLossBackward0>)
269000
75200 tensor(5.0355, device='cuda:0', grad_fn=<NllLossBackward0>)
75300 tensor(5.2583, device='cuda:0', grad_fn=<NllLossBackward0>)
75400 tensor(5.0347, device='cuda:0', grad_fn=<NllLossBackward0>)
270000
75500 tensor(5.3012, device='cuda:0', grad_fn=<NllLossBackward0>)
75600 tensor(4.9714, device='cuda:0', grad_fn=<NllLossBackward0>)
271000
75700 tensor(5.0178, device='cuda:0', grad_fn=<NllLossBackward0>)
75800 tensor(4.9899, device='cuda:0', grad_fn=<NllLossBackward0>)
75900 tensor(5.0695, device='cuda:0', grad_fn=<NllLossBackward0>)
272000
76000 tensor(4.8497, device='cuda:0', grad_fn=<NllLossBackward0>)
76100 tensor(4.8143, device='cuda:0', grad_fn=<NllLossBackward0>)
76200 tensor(4.8724, device='cuda:0', grad_fn=<NllLossBackward0>)
273000
76300 tensor(5.0268, device='cuda:0', grad_fn=<NllLossBackward0>)
76400 tensor(4.8836, device='cuda:0', grad_fn=<NllLossBackward0>)
76500 tensor(5.0496, device='cuda:0', grad_fn=<NllLossBackward0>)
274000
76600 tensor(5.1337, device='cuda:0', grad_fn=<NllLossBackward0>)
76700 tensor(4.8128, device='cuda:0', grad_fn=<NllLossBackward0>)
76800 tensor(4.7389, device='cuda:0', grad_fn=<NllLossBackward0>)
275000
76900 tensor(5.0926, device='cuda:0', grad_fn=<NllLossBackward0>)
77000 tensor(4.9780, device='cuda:0', grad_fn=<NllLossBackward0>)
276000
77100 tensor(5.3241, device='cuda:0', grad_fn=<NllLossBackward0>)
77200 tensor(4.9880, device='cuda:0', grad_fn=<NllLossBackward0>)
77300 tensor(4.9250, device='cuda:0', grad_fn=<NllLossBackward0>)
277000
77400 tensor(5.0480, device='cuda:0', grad_fn=<NllLossBackward0>)
77500 tensor(4.7045, device='cuda:0', grad_fn=<NllLossBackward0>)
77600 tensor(5.0635, device='cuda:0', grad_fn=<NllLossBackward0>)
278000
77700 tensor(5.2122, device='cuda:0', grad_fn=<NllLossBackward0>)
77800 tensor(4.8158, device='cuda:0', grad_fn=<NllLossBackward0>)
77900 tensor(4.7201, device='cuda:0', grad_fn=<NllLossBackward0>)
279000
78000 tensor(5.1809, device='cuda:0', grad_fn=<NllLossBackward0>)
78100 tensor(4.8416, device='cuda:0', grad_fn=<NllLossBackward0>)
78200 tensor(5.1227, device='cuda:0', grad_fn=<NllLossBackward0>)
280000
78300 tensor(5.0000, device='cuda:0', grad_fn=<NllLossBackward0>)
78400 tensor(5.0232, device='cuda:0', grad_fn=<NllLossBackward0>)
281000
78500 tensor(5.3672, device='cuda:0', grad_fn=<NllLossBackward0>)
78600 tensor(4.8266, device='cuda:0', grad_fn=<NllLossBackward0>)
78700 tensor(5.2883, device='cuda:0', grad_fn=<NllLossBackward0>)
282000
78800 tensor(5.1556, device='cuda:0', grad_fn=<NllLossBackward0>)
78900 tensor(5.3901, device='cuda:0', grad_fn=<NllLossBackward0>)
79000 tensor(4.9368, device='cuda:0', grad_fn=<NllLossBackward0>)
283000
79100 tensor(4.7262, device='cuda:0', grad_fn=<NllLossBackward0>)
79200 tensor(4.8004, device='cuda:0', grad_fn=<NllLossBackward0>)
79300 tensor(5.2785, device='cuda:0', grad_fn=<NllLossBackward0>)
284000
79400 tensor(5.0932, device='cuda:0', grad_fn=<NllLossBackward0>)
79500 tensor(4.9747, device='cuda:0', grad_fn=<NllLossBackward0>)
79600 tensor(5.5676, device='cuda:0', grad_fn=<NllLossBackward0>)
285000
79700 tensor(4.7581, device='cuda:0', grad_fn=<NllLossBackward0>)
79800 tensor(5.0427, device='cuda:0', grad_fn=<NllLossBackward0>)
286000
79900 tensor(5.0180, device='cuda:0', grad_fn=<NllLossBackward0>)
80000 tensor(5.1512, device='cuda:0', grad_fn=<NllLossBackward0>)
80100 tensor(5.2409, device='cuda:0', grad_fn=<NllLossBackward0>)
287000
80200 tensor(5.2435, device='cuda:0', grad_fn=<NllLossBackward0>)
80300 tensor(5.2283, device='cuda:0', grad_fn=<NllLossBackward0>)
80400 tensor(5.3199, device='cuda:0', grad_fn=<NllLossBackward0>)
288000
80500 tensor(5.0696, device='cuda:0', grad_fn=<NllLossBackward0>)
80600 tensor(5.2085, device='cuda:0', grad_fn=<NllLossBackward0>)
80700 tensor(5.2603, device='cuda:0', grad_fn=<NllLossBackward0>)
289000
80800 tensor(4.7770, device='cuda:0', grad_fn=<NllLossBackward0>)
80900 tensor(5.1645, device='cuda:0', grad_fn=<NllLossBackward0>)
290000
81000 tensor(5.1184, device='cuda:0', grad_fn=<NllLossBackward0>)
81100 tensor(5.1388, device='cuda:0', grad_fn=<NllLossBackward0>)
81200 tensor(4.8794, device='cuda:0', grad_fn=<NllLossBackward0>)
291000
81300 tensor(4.9514, device='cuda:0', grad_fn=<NllLossBackward0>)
81400 tensor(5.0975, device='cuda:0', grad_fn=<NllLossBackward0>)
81500 tensor(5.0673, device='cuda:0', grad_fn=<NllLossBackward0>)
292000
81600 tensor(5.0619, device='cuda:0', grad_fn=<NllLossBackward0>)
81700 tensor(5.1551, device='cuda:0', grad_fn=<NllLossBackward0>)
81800 tensor(5.0009, device='cuda:0', grad_fn=<NllLossBackward0>)
293000
81900 tensor(5.2308, device='cuda:0', grad_fn=<NllLossBackward0>)
82000 tensor(5.0541, device='cuda:0', grad_fn=<NllLossBackward0>)
82100 tensor(5.0817, device='cuda:0', grad_fn=<NllLossBackward0>)
294000
82200 tensor(5.0208, device='cuda:0', grad_fn=<NllLossBackward0>)
82300 tensor(4.8939, device='cuda:0', grad_fn=<NllLossBackward0>)
295000
82400 tensor(5.1285, device='cuda:0', grad_fn=<NllLossBackward0>)
82500 tensor(4.8489, device='cuda:0', grad_fn=<NllLossBackward0>)
82600 tensor(5.0365, device='cuda:0', grad_fn=<NllLossBackward0>)
296000
82700 tensor(5.0726, device='cuda:0', grad_fn=<NllLossBackward0>)
82800 tensor(5.0563, device='cuda:0', grad_fn=<NllLossBackward0>)
82900 tensor(4.7439, device='cuda:0', grad_fn=<NllLossBackward0>)
297000
83000 tensor(5.1423, device='cuda:0', grad_fn=<NllLossBackward0>)
83100 tensor(5.0052, device='cuda:0', grad_fn=<NllLossBackward0>)
83200 tensor(5.2631, device='cuda:0', grad_fn=<NllLossBackward0>)
298000
83300 tensor(4.9471, device='cuda:0', grad_fn=<NllLossBackward0>)
83400 tensor(5.0234, device='cuda:0', grad_fn=<NllLossBackward0>)
83500 tensor(4.9797, device='cuda:0', grad_fn=<NllLossBackward0>)
299000
83600 tensor(4.9047, device='cuda:0', grad_fn=<NllLossBackward0>)
83700 tensor(5.1383, device='cuda:0', grad_fn=<NllLossBackward0>)
300000
83800 tensor(4.8484, device='cuda:0', grad_fn=<NllLossBackward0>)
83900 tensor(4.9279, device='cuda:0', grad_fn=<NllLossBackward0>)
84000 tensor(5.0825, device='cuda:0', grad_fn=<NllLossBackward0>)
301000
84100 tensor(5.4376, device='cuda:0', grad_fn=<NllLossBackward0>)
84200 tensor(4.5984, device='cuda:0', grad_fn=<NllLossBackward0>)
84300 tensor(5.0169, device='cuda:0', grad_fn=<NllLossBackward0>)
302000
84400 tensor(4.8503, device='cuda:0', grad_fn=<NllLossBackward0>)
84500 tensor(5.0322, device='cuda:0', grad_fn=<NllLossBackward0>)
84600 tensor(5.2033, device='cuda:0', grad_fn=<NllLossBackward0>)
303000
84700 tensor(4.9052, device='cuda:0', grad_fn=<NllLossBackward0>)
84800 tensor(4.9265, device='cuda:0', grad_fn=<NllLossBackward0>)
84900 tensor(5.2419, device='cuda:0', grad_fn=<NllLossBackward0>)
304000
85000 tensor(5.1251, device='cuda:0', grad_fn=<NllLossBackward0>)
85100 tensor(5.0005, device='cuda:0', grad_fn=<NllLossBackward0>)
305000
85200 tensor(5.0029, device='cuda:0', grad_fn=<NllLossBackward0>)
85300 tensor(5.2717, device='cuda:0', grad_fn=<NllLossBackward0>)
85400 tensor(5.2511, device='cuda:0', grad_fn=<NllLossBackward0>)
306000
85500 tensor(4.9489, device='cuda:0', grad_fn=<NllLossBackward0>)
85600 tensor(5.0751, device='cuda:0', grad_fn=<NllLossBackward0>)
85700 tensor(5.0649, device='cuda:0', grad_fn=<NllLossBackward0>)
307000
85800 tensor(5.0660, device='cuda:0', grad_fn=<NllLossBackward0>)
85900 tensor(5.3243, device='cuda:0', grad_fn=<NllLossBackward0>)
86000 tensor(5.0756, device='cuda:0', grad_fn=<NllLossBackward0>)
308000
86100 tensor(4.8535, device='cuda:0', grad_fn=<NllLossBackward0>)
86200 tensor(4.9982, device='cuda:0', grad_fn=<NllLossBackward0>)
86300 tensor(4.9693, device='cuda:0', grad_fn=<NllLossBackward0>)
309000
86400 tensor(5.0700, device='cuda:0', grad_fn=<NllLossBackward0>)
86500 tensor(5.1470, device='cuda:0', grad_fn=<NllLossBackward0>)
310000
86600 tensor(4.9735, device='cuda:0', grad_fn=<NllLossBackward0>)
86700 tensor(4.8743, device='cuda:0', grad_fn=<NllLossBackward0>)
86800 tensor(4.8816, device='cuda:0', grad_fn=<NllLossBackward0>)
311000
86900 tensor(4.7666, device='cuda:0', grad_fn=<NllLossBackward0>)
87000 tensor(5.2600, device='cuda:0', grad_fn=<NllLossBackward0>)
87100 tensor(5.0591, device='cuda:0', grad_fn=<NllLossBackward0>)
312000
87200 tensor(5.1044, device='cuda:0', grad_fn=<NllLossBackward0>)
87300 tensor(4.9555, device='cuda:0', grad_fn=<NllLossBackward0>)
87400 tensor(5.0028, device='cuda:0', grad_fn=<NllLossBackward0>)
313000
87500 tensor(5.1317, device='cuda:0', grad_fn=<NllLossBackward0>)
87600 tensor(5.2621, device='cuda:0', grad_fn=<NllLossBackward0>)
87700 tensor(4.8937, device='cuda:0', grad_fn=<NllLossBackward0>)
314000
87800 tensor(4.8559, device='cuda:0', grad_fn=<NllLossBackward0>)
87900 tensor(5.1339, device='cuda:0', grad_fn=<NllLossBackward0>)
315000
88000 tensor(4.6132, device='cuda:0', grad_fn=<NllLossBackward0>)
88100 tensor(4.9682, device='cuda:0', grad_fn=<NllLossBackward0>)
88200 tensor(5.1715, device='cuda:0', grad_fn=<NllLossBackward0>)
316000
88300 tensor(5.1112, device='cuda:0', grad_fn=<NllLossBackward0>)
88400 tensor(5.1811, device='cuda:0', grad_fn=<NllLossBackward0>)
88500 tensor(4.8046, device='cuda:0', grad_fn=<NllLossBackward0>)
317000
88600 tensor(4.8210, device='cuda:0', grad_fn=<NllLossBackward0>)
88700 tensor(4.8294, device='cuda:0', grad_fn=<NllLossBackward0>)
88800 tensor(4.8198, device='cuda:0', grad_fn=<NllLossBackward0>)
318000
88900 tensor(4.7496, device='cuda:0', grad_fn=<NllLossBackward0>)
89000 tensor(4.5654, device='cuda:0', grad_fn=<NllLossBackward0>)
89100 tensor(5.1261, device='cuda:0', grad_fn=<NllLossBackward0>)
319000
89200 tensor(4.9374, device='cuda:0', grad_fn=<NllLossBackward0>)
89300 tensor(5.3199, device='cuda:0', grad_fn=<NllLossBackward0>)
320000
89400 tensor(4.6848, device='cuda:0', grad_fn=<NllLossBackward0>)
89500 tensor(5.0381, device='cuda:0', grad_fn=<NllLossBackward0>)
89600 tensor(5.2632, device='cuda:0', grad_fn=<NllLossBackward0>)
321000
89700 tensor(4.8221, device='cuda:0', grad_fn=<NllLossBackward0>)
89800 tensor(5.0413, device='cuda:0', grad_fn=<NllLossBackward0>)
89900 tensor(5.0402, device='cuda:0', grad_fn=<NllLossBackward0>)
322000
90000 tensor(5.0519, device='cuda:0', grad_fn=<NllLossBackward0>)
90100 tensor(4.9362, device='cuda:0', grad_fn=<NllLossBackward0>)
90200 tensor(5.0257, device='cuda:0', grad_fn=<NllLossBackward0>)
323000
90300 tensor(5.0438, device='cuda:0', grad_fn=<NllLossBackward0>)
90400 tensor(4.7173, device='cuda:0', grad_fn=<NllLossBackward0>)
90500 tensor(4.9121, device='cuda:0', grad_fn=<NllLossBackward0>)
324000
90600 tensor(5.1965, device='cuda:0', grad_fn=<NllLossBackward0>)
90700 tensor(4.8207, device='cuda:0', grad_fn=<NllLossBackward0>)
325000
90800 tensor(5.0536, device='cuda:0', grad_fn=<NllLossBackward0>)
90900 tensor(5.1080, device='cuda:0', grad_fn=<NllLossBackward0>)
91000 tensor(5.1328, device='cuda:0', grad_fn=<NllLossBackward0>)
326000
91100 tensor(4.7033, device='cuda:0', grad_fn=<NllLossBackward0>)
91200 tensor(5.1643, device='cuda:0', grad_fn=<NllLossBackward0>)
91300 tensor(5.3472, device='cuda:0', grad_fn=<NllLossBackward0>)
327000
91400 tensor(4.7267, device='cuda:0', grad_fn=<NllLossBackward0>)
91500 tensor(4.9552, device='cuda:0', grad_fn=<NllLossBackward0>)
91600 tensor(5.2054, device='cuda:0', grad_fn=<NllLossBackward0>)
328000
91700 tensor(5.0471, device='cuda:0', grad_fn=<NllLossBackward0>)
91800 tensor(5.2265, device='cuda:0', grad_fn=<NllLossBackward0>)
329000
91900 tensor(5.0523, device='cuda:0', grad_fn=<NllLossBackward0>)
92000 tensor(5.0726, device='cuda:0', grad_fn=<NllLossBackward0>)
92100 tensor(4.9322, device='cuda:0', grad_fn=<NllLossBackward0>)
330000
92200 tensor(4.5777, device='cuda:0', grad_fn=<NllLossBackward0>)
92300 tensor(5.0606, device='cuda:0', grad_fn=<NllLossBackward0>)
92400 tensor(5.1319, device='cuda:0', grad_fn=<NllLossBackward0>)
331000
92500 tensor(5.1785, device='cuda:0', grad_fn=<NllLossBackward0>)
92600 tensor(5.0610, device='cuda:0', grad_fn=<NllLossBackward0>)
92700 tensor(4.7718, device='cuda:0', grad_fn=<NllLossBackward0>)
332000
92800 tensor(4.8492, device='cuda:0', grad_fn=<NllLossBackward0>)
92900 tensor(5.3673, device='cuda:0', grad_fn=<NllLossBackward0>)
93000 tensor(5.0656, device='cuda:0', grad_fn=<NllLossBackward0>)
333000
93100 tensor(4.9975, device='cuda:0', grad_fn=<NllLossBackward0>)
93200 tensor(5.1829, device='cuda:0', grad_fn=<NllLossBackward0>)
334000
93300 tensor(5.1131, device='cuda:0', grad_fn=<NllLossBackward0>)
93400 tensor(5.1585, device='cuda:0', grad_fn=<NllLossBackward0>)
93500 tensor(4.9141, device='cuda:0', grad_fn=<NllLossBackward0>)
335000
93600 tensor(5.0526, device='cuda:0', grad_fn=<NllLossBackward0>)
93700 tensor(5.1195, device='cuda:0', grad_fn=<NllLossBackward0>)
93800 tensor(4.8696, device='cuda:0', grad_fn=<NllLossBackward0>)
336000
93900 tensor(5.0752, device='cuda:0', grad_fn=<NllLossBackward0>)
94000 tensor(4.9943, device='cuda:0', grad_fn=<NllLossBackward0>)
94100 tensor(4.8657, device='cuda:0', grad_fn=<NllLossBackward0>)
337000
94200 tensor(5.1230, device='cuda:0', grad_fn=<NllLossBackward0>)
94300 tensor(5.0292, device='cuda:0', grad_fn=<NllLossBackward0>)
94400 tensor(5.0633, device='cuda:0', grad_fn=<NllLossBackward0>)
338000
94500 tensor(4.9025, device='cuda:0', grad_fn=<NllLossBackward0>)
94600 tensor(5.1955, device='cuda:0', grad_fn=<NllLossBackward0>)
339000
94700 tensor(4.7546, device='cuda:0', grad_fn=<NllLossBackward0>)
94800 tensor(5.2260, device='cuda:0', grad_fn=<NllLossBackward0>)
94900 tensor(5.2268, device='cuda:0', grad_fn=<NllLossBackward0>)
340000
95000 tensor(5.3426, device='cuda:0', grad_fn=<NllLossBackward0>)
95100 tensor(5.0509, device='cuda:0', grad_fn=<NllLossBackward0>)
95200 tensor(4.8831, device='cuda:0', grad_fn=<NllLossBackward0>)
341000
95300 tensor(5.2464, device='cuda:0', grad_fn=<NllLossBackward0>)
95400 tensor(4.7030, device='cuda:0', grad_fn=<NllLossBackward0>)
95500 tensor(4.9484, device='cuda:0', grad_fn=<NllLossBackward0>)
342000
95600 tensor(4.9381, device='cuda:0', grad_fn=<NllLossBackward0>)
95700 tensor(5.1246, device='cuda:0', grad_fn=<NllLossBackward0>)
95800 tensor(5.0476, device='cuda:0', grad_fn=<NllLossBackward0>)
343000
95900 tensor(4.8117, device='cuda:0', grad_fn=<NllLossBackward0>)
96000 tensor(5.1602, device='cuda:0', grad_fn=<NllLossBackward0>)
344000
96100 tensor(4.9363, device='cuda:0', grad_fn=<NllLossBackward0>)
96200 tensor(4.9611, device='cuda:0', grad_fn=<NllLossBackward0>)
96300 tensor(5.0053, device='cuda:0', grad_fn=<NllLossBackward0>)
345000
96400 tensor(5.0561, device='cuda:0', grad_fn=<NllLossBackward0>)
96500 tensor(5.2501, device='cuda:0', grad_fn=<NllLossBackward0>)
96600 tensor(5.0376, device='cuda:0', grad_fn=<NllLossBackward0>)
346000
96700 tensor(5.1798, device='cuda:0', grad_fn=<NllLossBackward0>)
96800 tensor(5.0684, device='cuda:0', grad_fn=<NllLossBackward0>)
96900 tensor(5.2261, device='cuda:0', grad_fn=<NllLossBackward0>)
347000
97000 tensor(4.9142, device='cuda:0', grad_fn=<NllLossBackward0>)
97100 tensor(5.0132, device='cuda:0', grad_fn=<NllLossBackward0>)
97200 tensor(5.1487, device='cuda:0', grad_fn=<NllLossBackward0>)
348000
97300 tensor(4.9732, device='cuda:0', grad_fn=<NllLossBackward0>)
97400 tensor(5.0108, device='cuda:0', grad_fn=<NllLossBackward0>)
349000
97500 tensor(5.1571, device='cuda:0', grad_fn=<NllLossBackward0>)
97600 tensor(4.9469, device='cuda:0', grad_fn=<NllLossBackward0>)
97700 tensor(5.0162, device='cuda:0', grad_fn=<NllLossBackward0>)
350000
97800 tensor(4.9770, device='cuda:0', grad_fn=<NllLossBackward0>)
97900 tensor(4.7823, device='cuda:0', grad_fn=<NllLossBackward0>)
98000 tensor(4.8846, device='cuda:0', grad_fn=<NllLossBackward0>)
351000
98100 tensor(5.2296, device='cuda:0', grad_fn=<NllLossBackward0>)
98200 tensor(4.9268, device='cuda:0', grad_fn=<NllLossBackward0>)
98300 tensor(5.0850, device='cuda:0', grad_fn=<NllLossBackward0>)
352000
98400 tensor(5.1678, device='cuda:0', grad_fn=<NllLossBackward0>)
98500 tensor(5.2236, device='cuda:0', grad_fn=<NllLossBackward0>)
353000
98600 tensor(5.2631, device='cuda:0', grad_fn=<NllLossBackward0>)
98700 tensor(5.0668, device='cuda:0', grad_fn=<NllLossBackward0>)
98800 tensor(5.0674, device='cuda:0', grad_fn=<NllLossBackward0>)
354000
98900 tensor(5.0695, device='cuda:0', grad_fn=<NllLossBackward0>)
99000 tensor(5.2623, device='cuda:0', grad_fn=<NllLossBackward0>)
99100 tensor(5.1354, device='cuda:0', grad_fn=<NllLossBackward0>)
355000
99200 tensor(5.1557, device='cuda:0', grad_fn=<NllLossBackward0>)
99300 tensor(4.9458, device='cuda:0', grad_fn=<NllLossBackward0>)
99400 tensor(5.2565, device='cuda:0', grad_fn=<NllLossBackward0>)
356000
99500 tensor(5.2732, device='cuda:0', grad_fn=<NllLossBackward0>)
99600 tensor(5.0011, device='cuda:0', grad_fn=<NllLossBackward0>)
99700 tensor(4.8497, device='cuda:0', grad_fn=<NllLossBackward0>)
357000
99800 tensor(5.0679, device='cuda:0', grad_fn=<NllLossBackward0>)
99900 tensor(5.0005, device='cuda:0', grad_fn=<NllLossBackward0>)
100000 tensor(4.9254, device='cuda:0', grad_fn=<NllLossBackward0>)
358000
100100 tensor(5.4405, device='cuda:0', grad_fn=<NllLossBackward0>)
100200 tensor(4.7585, device='cuda:0', grad_fn=<NllLossBackward0>)
359000
100300 tensor(5.0083, device='cuda:0', grad_fn=<NllLossBackward0>)
100400 tensor(4.9893, device='cuda:0', grad_fn=<NllLossBackward0>)
100500 tensor(4.8576, device='cuda:0', grad_fn=<NllLossBackward0>)
360000
100600 tensor(4.8928, device='cuda:0', grad_fn=<NllLossBackward0>)
100700 tensor(4.8947, device='cuda:0', grad_fn=<NllLossBackward0>)
100800 tensor(5.2061, device='cuda:0', grad_fn=<NllLossBackward0>)
361000
100900 tensor(5.1799, device='cuda:0', grad_fn=<NllLossBackward0>)
101000 tensor(5.0292, device='cuda:0', grad_fn=<NllLossBackward0>)
101100 tensor(5.0604, device='cuda:0', grad_fn=<NllLossBackward0>)
362000
101200 tensor(5.0287, device='cuda:0', grad_fn=<NllLossBackward0>)
101300 tensor(5.2627, device='cuda:0', grad_fn=<NllLossBackward0>)
363000
101400 tensor(4.8027, device='cuda:0', grad_fn=<NllLossBackward0>)
101500 tensor(5.0308, device='cuda:0', grad_fn=<NllLossBackward0>)
101600 tensor(5.2625, device='cuda:0', grad_fn=<NllLossBackward0>)
364000
101700 tensor(4.9086, device='cuda:0', grad_fn=<NllLossBackward0>)
101800 tensor(4.7064, device='cuda:0', grad_fn=<NllLossBackward0>)
101900 tensor(4.9304, device='cuda:0', grad_fn=<NllLossBackward0>)
365000
102000 tensor(5.1991, device='cuda:0', grad_fn=<NllLossBackward0>)
102100 tensor(5.1194, device='cuda:0', grad_fn=<NllLossBackward0>)
102200 tensor(5.3738, device='cuda:0', grad_fn=<NllLossBackward0>)
366000
102300 tensor(5.2404, device='cuda:0', grad_fn=<NllLossBackward0>)
102400 tensor(5.1821, device='cuda:0', grad_fn=<NllLossBackward0>)
102500 tensor(5.1676, device='cuda:0', grad_fn=<NllLossBackward0>)
367000
102600 tensor(5.1029, device='cuda:0', grad_fn=<NllLossBackward0>)
102700 tensor(5.1796, device='cuda:0', grad_fn=<NllLossBackward0>)
368000
102800 tensor(5.0834, device='cuda:0', grad_fn=<NllLossBackward0>)
102900 tensor(4.8583, device='cuda:0', grad_fn=<NllLossBackward0>)
103000 tensor(5.0342, device='cuda:0', grad_fn=<NllLossBackward0>)
369000
103100 tensor(5.3343, device='cuda:0', grad_fn=<NllLossBackward0>)
103200 tensor(4.9536, device='cuda:0', grad_fn=<NllLossBackward0>)
103300 tensor(5.0340, device='cuda:0', grad_fn=<NllLossBackward0>)
370000
103400 tensor(5.0580, device='cuda:0', grad_fn=<NllLossBackward0>)
103500 tensor(4.9467, device='cuda:0', grad_fn=<NllLossBackward0>)
103600 tensor(5.0845, device='cuda:0', grad_fn=<NllLossBackward0>)
371000
103700 tensor(5.1711, device='cuda:0', grad_fn=<NllLossBackward0>)
103800 tensor(5.0389, device='cuda:0', grad_fn=<NllLossBackward0>)
103900 tensor(4.9456, device='cuda:0', grad_fn=<NllLossBackward0>)
372000
104000 tensor(4.9239, device='cuda:0', grad_fn=<NllLossBackward0>)
104100 tensor(4.9678, device='cuda:0', grad_fn=<NllLossBackward0>)
373000
104200 tensor(5.0781, device='cuda:0', grad_fn=<NllLossBackward0>)
104300 tensor(4.8800, device='cuda:0', grad_fn=<NllLossBackward0>)
104400 tensor(5.2081, device='cuda:0', grad_fn=<NllLossBackward0>)
374000
104500 tensor(5.1255, device='cuda:0', grad_fn=<NllLossBackward0>)
104600 tensor(4.9805, device='cuda:0', grad_fn=<NllLossBackward0>)
104700 tensor(4.9825, device='cuda:0', grad_fn=<NllLossBackward0>)
375000
104800 tensor(5.5062, device='cuda:0', grad_fn=<NllLossBackward0>)
104900 tensor(5.0209, device='cuda:0', grad_fn=<NllLossBackward0>)
105000 tensor(5.1373, device='cuda:0', grad_fn=<NllLossBackward0>)
376000
105100 tensor(5.0935, device='cuda:0', grad_fn=<NllLossBackward0>)
105200 tensor(5.1819, device='cuda:0', grad_fn=<NllLossBackward0>)
105300 tensor(5.1386, device='cuda:0', grad_fn=<NllLossBackward0>)
377000
105400 tensor(5.3493, device='cuda:0', grad_fn=<NllLossBackward0>)
105500 tensor(5.2102, device='cuda:0', grad_fn=<NllLossBackward0>)
378000
105600 tensor(4.9484, device='cuda:0', grad_fn=<NllLossBackward0>)
105700 tensor(4.9119, device='cuda:0', grad_fn=<NllLossBackward0>)
105800 tensor(4.9584, device='cuda:0', grad_fn=<NllLossBackward0>)
379000
105900 tensor(5.1010, device='cuda:0', grad_fn=<NllLossBackward0>)
106000 tensor(5.1442, device='cuda:0', grad_fn=<NllLossBackward0>)
106100 tensor(5.3454, device='cuda:0', grad_fn=<NllLossBackward0>)
380000
106200 tensor(5.0330, device='cuda:0', grad_fn=<NllLossBackward0>)
106300 tensor(5.1396, device='cuda:0', grad_fn=<NllLossBackward0>)
106400 tensor(4.9693, device='cuda:0', grad_fn=<NllLossBackward0>)
381000
106500 tensor(5.2242, device='cuda:0', grad_fn=<NllLossBackward0>)
106600 tensor(4.9223, device='cuda:0', grad_fn=<NllLossBackward0>)
106700 tensor(5.1467, device='cuda:0', grad_fn=<NllLossBackward0>)
382000
106800 tensor(4.9459, device='cuda:0', grad_fn=<NllLossBackward0>)
106900 tensor(5.2412, device='cuda:0', grad_fn=<NllLossBackward0>)
383000
107000 tensor(4.9030, device='cuda:0', grad_fn=<NllLossBackward0>)
107100 tensor(5.1487, device='cuda:0', grad_fn=<NllLossBackward0>)
107200 tensor(5.1303, device='cuda:0', grad_fn=<NllLossBackward0>)
384000
107300 tensor(4.8918, device='cuda:0', grad_fn=<NllLossBackward0>)
107400 tensor(5.0686, device='cuda:0', grad_fn=<NllLossBackward0>)
107500 tensor(4.8140, device='cuda:0', grad_fn=<NllLossBackward0>)
385000
107600 tensor(4.8693, device='cuda:0', grad_fn=<NllLossBackward0>)
107700 tensor(5.0879, device='cuda:0', grad_fn=<NllLossBackward0>)
107800 tensor(5.0939, device='cuda:0', grad_fn=<NllLossBackward0>)
386000
107900 tensor(5.3087, device='cuda:0', grad_fn=<NllLossBackward0>)
108000 tensor(5.0659, device='cuda:0', grad_fn=<NllLossBackward0>)
108100 tensor(5.0273, device='cuda:0', grad_fn=<NllLossBackward0>)
387000
108200 tensor(5.0357, device='cuda:0', grad_fn=<NllLossBackward0>)
108300 tensor(5.0666, device='cuda:0', grad_fn=<NllLossBackward0>)
388000
108400 tensor(5.1699, device='cuda:0', grad_fn=<NllLossBackward0>)
108500 tensor(4.8493, device='cuda:0', grad_fn=<NllLossBackward0>)
108600 tensor(5.0399, device='cuda:0', grad_fn=<NllLossBackward0>)
389000
108700 tensor(4.5729, device='cuda:0', grad_fn=<NllLossBackward0>)
108800 tensor(4.9686, device='cuda:0', grad_fn=<NllLossBackward0>)
108900 tensor(4.7163, device='cuda:0', grad_fn=<NllLossBackward0>)
390000
109000 tensor(5.1307, device='cuda:0', grad_fn=<NllLossBackward0>)
109100 tensor(4.7140, device='cuda:0', grad_fn=<NllLossBackward0>)
109200 tensor(4.9282, device='cuda:0', grad_fn=<NllLossBackward0>)
391000
109300 tensor(4.5945, device='cuda:0', grad_fn=<NllLossBackward0>)
109400 tensor(4.9045, device='cuda:0', grad_fn=<NllLossBackward0>)
109500 tensor(4.8345, device='cuda:0', grad_fn=<NllLossBackward0>)
392000
109600 tensor(5.0347, device='cuda:0', grad_fn=<NllLossBackward0>)
109700 tensor(5.3972, device='cuda:0', grad_fn=<NllLossBackward0>)
393000
109800 tensor(4.7386, device='cuda:0', grad_fn=<NllLossBackward0>)
109900 tensor(4.9312, device='cuda:0', grad_fn=<NllLossBackward0>)
110000 tensor(5.0107, device='cuda:0', grad_fn=<NllLossBackward0>)
394000
110100 tensor(4.9484, device='cuda:0', grad_fn=<NllLossBackward0>)
110200 tensor(5.3088, device='cuda:0', grad_fn=<NllLossBackward0>)
110300 tensor(5.1758, device='cuda:0', grad_fn=<NllLossBackward0>)
395000
110400 tensor(4.6608, device='cuda:0', grad_fn=<NllLossBackward0>)
110500 tensor(5.0811, device='cuda:0', grad_fn=<NllLossBackward0>)
110600 tensor(4.9037, device='cuda:0', grad_fn=<NllLossBackward0>)
396000
110700 tensor(4.8701, device='cuda:0', grad_fn=<NllLossBackward0>)
110800 tensor(5.3046, device='cuda:0', grad_fn=<NllLossBackward0>)
110900 tensor(4.9689, device='cuda:0', grad_fn=<NllLossBackward0>)
397000
111000 tensor(5.1637, device='cuda:0', grad_fn=<NllLossBackward0>)
111100 tensor(4.9131, device='cuda:0', grad_fn=<NllLossBackward0>)
398000
111200 tensor(4.9308, device='cuda:0', grad_fn=<NllLossBackward0>)
111300 tensor(5.0290, device='cuda:0', grad_fn=<NllLossBackward0>)
111400 tensor(5.1249, device='cuda:0', grad_fn=<NllLossBackward0>)
399000
111500 tensor(5.2130, device='cuda:0', grad_fn=<NllLossBackward0>)
111600 tensor(5.2382, device='cuda:0', grad_fn=<NllLossBackward0>)
111700 tensor(4.8168, device='cuda:0', grad_fn=<NllLossBackward0>)
400000
111800 tensor(5.0276, device='cuda:0', grad_fn=<NllLossBackward0>)
111900 tensor(5.1121, device='cuda:0', grad_fn=<NllLossBackward0>)
112000 tensor(5.0463, device='cuda:0', grad_fn=<NllLossBackward0>)
401000
112100 tensor(4.6612, device='cuda:0', grad_fn=<NllLossBackward0>)
112200 tensor(5.0632, device='cuda:0', grad_fn=<NllLossBackward0>)
402000
112300 tensor(5.2173, device='cuda:0', grad_fn=<NllLossBackward0>)
112400 tensor(5.2271, device='cuda:0', grad_fn=<NllLossBackward0>)
112500 tensor(4.8618, device='cuda:0', grad_fn=<NllLossBackward0>)
403000
112600 tensor(4.9399, device='cuda:0', grad_fn=<NllLossBackward0>)
112700 tensor(4.9514, device='cuda:0', grad_fn=<NllLossBackward0>)
112800 tensor(4.5700, device='cuda:0', grad_fn=<NllLossBackward0>)
404000
112900 tensor(4.8508, device='cuda:0', grad_fn=<NllLossBackward0>)
113000 tensor(5.0784, device='cuda:0', grad_fn=<NllLossBackward0>)
113100 tensor(5.0205, device='cuda:0', grad_fn=<NllLossBackward0>)
405000
113200 tensor(4.5998, device='cuda:0', grad_fn=<NllLossBackward0>)
113300 tensor(4.9371, device='cuda:0', grad_fn=<NllLossBackward0>)
113400 tensor(5.0241, device='cuda:0', grad_fn=<NllLossBackward0>)
406000
113500 tensor(5.2931, device='cuda:0', grad_fn=<NllLossBackward0>)
113600 tensor(5.2488, device='cuda:0', grad_fn=<NllLossBackward0>)
407000
113700 tensor(4.8483, device='cuda:0', grad_fn=<NllLossBackward0>)
113800 tensor(5.2471, device='cuda:0', grad_fn=<NllLossBackward0>)
113900 tensor(5.0391, device='cuda:0', grad_fn=<NllLossBackward0>)
408000
114000 tensor(4.6978, device='cuda:0', grad_fn=<NllLossBackward0>)
114100 tensor(4.8170, device='cuda:0', grad_fn=<NllLossBackward0>)
114200 tensor(4.7943, device='cuda:0', grad_fn=<NllLossBackward0>)
409000
114300 tensor(4.6678, device='cuda:0', grad_fn=<NllLossBackward0>)
114400 tensor(4.8687, device='cuda:0', grad_fn=<NllLossBackward0>)
114500 tensor(4.9471, device='cuda:0', grad_fn=<NllLossBackward0>)
410000
114600 tensor(5.1608, device='cuda:0', grad_fn=<NllLossBackward0>)
114700 tensor(5.1240, device='cuda:0', grad_fn=<NllLossBackward0>)
114800 tensor(5.0586, device='cuda:0', grad_fn=<NllLossBackward0>)
411000
114900 tensor(5.0095, device='cuda:0', grad_fn=<NllLossBackward0>)
115000 tensor(4.9676, device='cuda:0', grad_fn=<NllLossBackward0>)
412000
115100 tensor(5.0771, device='cuda:0', grad_fn=<NllLossBackward0>)
115200 tensor(5.0396, device='cuda:0', grad_fn=<NllLossBackward0>)
115300 tensor(5.0663, device='cuda:0', grad_fn=<NllLossBackward0>)
413000
115400 tensor(5.2931, device='cuda:0', grad_fn=<NllLossBackward0>)
115500 tensor(5.2901, device='cuda:0', grad_fn=<NllLossBackward0>)
115600 tensor(5.0953, device='cuda:0', grad_fn=<NllLossBackward0>)
414000
115700 tensor(4.9706, device='cuda:0', grad_fn=<NllLossBackward0>)
115800 tensor(5.0768, device='cuda:0', grad_fn=<NllLossBackward0>)
115900 tensor(5.1668, device='cuda:0', grad_fn=<NllLossBackward0>)
415000
116000 tensor(5.0913, device='cuda:0', grad_fn=<NllLossBackward0>)
116100 tensor(4.8294, device='cuda:0', grad_fn=<NllLossBackward0>)
116200 tensor(5.2189, device='cuda:0', grad_fn=<NllLossBackward0>)
416000
116300 tensor(4.3633, device='cuda:0', grad_fn=<NllLossBackward0>)
116400 tensor(5.1168, device='cuda:0', grad_fn=<NllLossBackward0>)
417000
116500 tensor(5.0554, device='cuda:0', grad_fn=<NllLossBackward0>)
116600 tensor(5.1477, device='cuda:0', grad_fn=<NllLossBackward0>)
116700 tensor(4.9884, device='cuda:0', grad_fn=<NllLossBackward0>)
418000
116800 tensor(4.8381, device='cuda:0', grad_fn=<NllLossBackward0>)
116900 tensor(5.0709, device='cuda:0', grad_fn=<NllLossBackward0>)
117000 tensor(5.0518, device='cuda:0', grad_fn=<NllLossBackward0>)
419000
117100 tensor(5.0967, device='cuda:0', grad_fn=<NllLossBackward0>)
117200 tensor(4.9430, device='cuda:0', grad_fn=<NllLossBackward0>)
117300 tensor(5.2903, device='cuda:0', grad_fn=<NllLossBackward0>)
420000
117400 tensor(5.2733, device='cuda:0', grad_fn=<NllLossBackward0>)
117500 tensor(4.8548, device='cuda:0', grad_fn=<NllLossBackward0>)
421000
117600 tensor(5.1768, device='cuda:0', grad_fn=<NllLossBackward0>)
117700 tensor(5.1013, device='cuda:0', grad_fn=<NllLossBackward0>)
117800 tensor(4.9804, device='cuda:0', grad_fn=<NllLossBackward0>)
422000
117900 tensor(5.0671, device='cuda:0', grad_fn=<NllLossBackward0>)
118000 tensor(5.1332, device='cuda:0', grad_fn=<NllLossBackward0>)
118100 tensor(4.9436, device='cuda:0', grad_fn=<NllLossBackward0>)
423000
118200 tensor(4.9859, device='cuda:0', grad_fn=<NllLossBackward0>)
118300 tensor(4.9755, device='cuda:0', grad_fn=<NllLossBackward0>)
118400 tensor(5.0902, device='cuda:0', grad_fn=<NllLossBackward0>)
424000
118500 tensor(4.9845, device='cuda:0', grad_fn=<NllLossBackward0>)
118600 tensor(5.1098, device='cuda:0', grad_fn=<NllLossBackward0>)
118700 tensor(5.1192, device='cuda:0', grad_fn=<NllLossBackward0>)
425000
118800 tensor(5.1445, device='cuda:0', grad_fn=<NllLossBackward0>)
118900 tensor(5.1731, device='cuda:0', grad_fn=<NllLossBackward0>)
426000
119000 tensor(4.6613, device='cuda:0', grad_fn=<NllLossBackward0>)
119100 tensor(4.9640, device='cuda:0', grad_fn=<NllLossBackward0>)
119200 tensor(5.2998, device='cuda:0', grad_fn=<NllLossBackward0>)
427000
119300 tensor(4.9427, device='cuda:0', grad_fn=<NllLossBackward0>)
119400 tensor(4.9993, device='cuda:0', grad_fn=<NllLossBackward0>)
119500 tensor(5.0117, device='cuda:0', grad_fn=<NllLossBackward0>)
428000
119600 tensor(5.3324, device='cuda:0', grad_fn=<NllLossBackward0>)
119700 tensor(4.9375, device='cuda:0', grad_fn=<NllLossBackward0>)
119800 tensor(5.0221, device='cuda:0', grad_fn=<NllLossBackward0>)
429000
119900 tensor(4.7975, device='cuda:0', grad_fn=<NllLossBackward0>)
120000 tensor(4.9253, device='cuda:0', grad_fn=<NllLossBackward0>)
120100 tensor(4.9377, device='cuda:0', grad_fn=<NllLossBackward0>)
430000
120200 tensor(5.0909, device='cuda:0', grad_fn=<NllLossBackward0>)
120300 tensor(4.9946, device='cuda:0', grad_fn=<NllLossBackward0>)
431000
120400 tensor(5.3612, device='cuda:0', grad_fn=<NllLossBackward0>)
120500 tensor(4.9678, device='cuda:0', grad_fn=<NllLossBackward0>)
120600 tensor(5.2217, device='cuda:0', grad_fn=<NllLossBackward0>)
432000

Eval


# Rebuild the network and restore the trained weights from disk.
model = Bigram(vocab_size, embed_size).to(device)
# map_location remaps the stored tensors onto the current device, so a
# checkpoint saved on a GPU still loads on a CPU-only runtime.
model.load_state_dict(torch.load('model.bin', map_location=device))
# Switch to inference mode (affects layers such as dropout, if present).
model.eval()

# Encode the context word 'for' as a tensor of vocabulary ids.
res = torch.tensor(vocab.forward(['for'])).to(device)

out = model(res)
# Keep the 10 highest-scoring entries of the first (only) output row and
# translate their ids back into tokens.
top = torch.topk(out[0], 10)
top_indices = top.indices.tolist()
top_probs = top.values.tolist()
top_words = vocab.lookup_tokens(top_indices)
list(zip(top_words, top_indices, top_probs))
/usr/local/lib/python3.10/dist-packages/torch/nn/modules/container.py:217: UserWarning: Implicit dimension choice for softmax has been deprecated. Change the call to include dim=X as an argument.
  input = module(input)
[('<unk>', 0, 0.23156249523162842),
 ('the', 1, 0.2045561522245407),
 ('a', 5, 0.0636623203754425),
 ('his', 20, 0.012841351330280304),
 ('their', 40, 0.012044394388794899),
 ('this', 28, 0.011758995242416859),
 ('tho', 33, 0.010536346584558487),
 ('some', 77, 0.008259670808911324),
 ('any', 49, 0.007337945979088545),
 ('an', 38, 0.007214350625872612)]
# Use the vocabulary object carried by the training dataset for lookups.
vocab = train_dataset.vocab

# Encode the context word 'wait' and run it through the model.
res = torch.tensor(vocab.forward(['wait'])).to(device)
out = model(res)

# Take the 20 highest-scoring entries of the first output row and map the
# ids back to tokens; the final expression is what the notebook displays.
top = torch.topk(out[0], 20)
top_indices, top_probs = top.indices.tolist(), top.values.tolist()
top_words = vocab.lookup_tokens(top_indices)
list(zip(top_words, top_indices, top_probs))
[('until', 145, 0.17603398859500885),
 ('for', 9, 0.16400693356990814),
 ('<unk>', 0, 0.15191353857517242),
 ('on', 15, 0.04624223709106445),
 ('till', 677, 0.035729214549064636),
 ('a', 5, 0.03367603197693825),
 ('to', 4, 0.029361305758357048),
 ('upon', 59, 0.01995147578418255),
 ('and', 3, 0.01906605064868927),
 ('in', 6, 0.013167516328394413),
 ('at', 14, 0.011669990606606007),
 ('the', 1, 0.010971800424158573),
 ('of', 2, 0.005925077944993973),
 ('with', 16, 0.0055325529538095),
 ('In', 32, 0.004919056314975023),
 ('until\\\\nthe', 5509, 0.004719363059848547),
 ('tor', 532, 0.004647853318601847),
 ('for\\\\nthe', 389, 0.004400868900120258),
 ('two', 74, 0.0043497709557414055),
 ('patiently', 14401, 0.004239553119987249)]
# Cosine similarity computed along dim=1, i.e. one score per matrix row.
cos = nn.CosineSimilarity(dim=1, eps=1e-6)

# Weight matrix of the model's first sub-module
# (presumably the embedding layer -- confirm against the Bigram definition).
embeddings = model.model[0].weight

# Compare the row for 'take' against every row of the matrix.
vec = embeddings[vocab['take']]
similarities = cos(vec, embeddings)

# Ten most similar rows, translated back to tokens for display.
top = torch.topk(similarities, 10)
top_indices, top_probs = top.indices.tolist(), top.values.tolist()
top_words = vocab.lookup_tokens(top_indices)
list(zip(top_words, top_indices, top_probs))
[('take', 152, 1.0000001192092896),
 ('took', 248, 0.8024641275405884),
 ('taking', 538, 0.775847852230072),
 ('takes', 1108, 0.7307196855545044),
 ('to\\\\ntake', 3165, 0.7301462888717651),
 ('taken', 180, 0.6280043125152588),
 ('will\\\\ntake', 11101, 0.6152595281600952),
 ('tako', 6281, 0.5979241132736206),
 ('have\\\\ntaken', 15483, 0.5244049429893494),
 ('Take', 5203, 0.5183135867118835)]
# Use the vocabulary object carried by the training dataset for lookups.
vocab = train_dataset.vocab

# Encode the context word 'take' and run it through the model.
res = torch.tensor(vocab.forward(['take'])).to(device)
out = model(res)

# Take the 20 highest-scoring entries of the first output row and map the
# ids back to tokens; the final expression is what the notebook displays.
top = torch.topk(out[0], 20)
top_indices, top_probs = top.indices.tolist(), top.values.tolist()
top_words = vocab.lookup_tokens(top_indices)
list(zip(top_words, top_indices, top_probs))
[('<unk>', 0, 0.18006379902362823),
 ('the', 1, 0.11970410495996475),
 ('a', 5, 0.07213426381349564),
 ('care', 611, 0.027887443080544472),
 ('up', 69, 0.027840441092848778),
 ('it', 17, 0.023985734209418297),
 ('place', 159, 0.020694952458143234),
 ('advantage', 1458, 0.015635941177606583),
 ('his', 20, 0.014868981204926968),
 ('part', 131, 0.013506578281521797),
 ('an', 38, 0.013118326663970947),
 ('their', 40, 0.010849231854081154),
 ('hold', 478, 0.010717789642512798),
 ('them', 72, 0.010186631232500076),
 ('to', 4, 0.009746687486767769),
 ('this', 28, 0.009519988670945168),
 ('any', 49, 0.009436620399355888),
 ('her', 53, 0.008774512447416782),
 ('him', 70, 0.008407332003116608),
 ('all', 34, 0.007673078216612339)]

Create files for geval

def get_values(presc_word, model, vocab, k=20):
    """Return the model's top-``k`` outputs for the word preceding a gap.

    Args:
        presc_word: The context word, looked up through ``vocab.forward``.
        model: Callable taking a tensor of vocabulary ids; the first row of
            its output is ranked (presumably next-word probabilities --
            confirm against the model definition).
        vocab: Vocabulary providing ``forward`` (tokens -> ids) and
            ``lookup_tokens`` (ids -> tokens).
        k: How many of the highest-scoring entries to return. Defaults to
            20, the value that was previously hard-coded.

    Returns:
        A list of ``(token, score)`` tuples, highest score first.
    """
    ixs = torch.tensor(vocab.forward([presc_word])).to(device)
    # Pure inference: skip building the autograd graph.
    with torch.no_grad():
        out = model(ixs)
    top = torch.topk(out[0], k)
    top_words = vocab.lookup_tokens(top.indices.tolist())
    return list(zip(top_words, top.values.tolist()))

def last_word(text):
    """Return everything after the final space in *text*.

    If *text* contains no space the whole string is returned; if it ends
    with a space the result is the empty string.
    """
    # rpartition yields ('', '', text) when no space is present, so the
    # third element is the answer in every case.
    _, _, tail = text.rpartition(' ')
    return tail

def first_word(text):
    """Return everything before the first space in *text*.

    If *text* contains no space the whole string is returned; if it starts
    with a space the result is the empty string.

    The previous index loop stopped one character early, so a string
    without any space lost its final character (``"abc"`` gave ``"ab"``).
    """
    # partition yields (text, '', '') when no space is present, so the
    # first element is the answer in every case.
    head, _, _ = text.partition(' ')
    return head

def sum_prob(dic):
    """Turn a token -> probability mapping into a list with leftover mass.

    Every entry except ``'<unk>'`` is kept, in the mapping's iteration
    order, as a ``(token, value)`` tuple. A final ``('<unk>', remainder)``
    tuple is appended, where ``remainder`` is ``1`` minus the sum of the
    kept values, i.e. the probability mass not assigned to a kept token.

    Unlike the earlier version, the caller's mapping is left untouched:
    ``'<unk>'`` is skipped rather than deleted from ``dic``.

    Args:
        dic: Mapping from token to a value convertible with ``float``.

    Returns:
        List of ``(token, value)`` tuples ending with the ``'<unk>'`` entry.
    """
    tab = [(key, val) for key, val in dic.items() if key != '<unk>']
    probsum = sum(float(val) for _, val in tab)
    tab.append(('<unk>', 1 - probsum))
    return tab

def format_to(dic):
    """Serialise a token -> probability mapping as one output line.

    The line consists of tab-terminated ``token:value`` fields followed by
    a final ``:value`` field (empty token) carrying the leftover mass
    computed by ``sum_prob``, and ends with a newline.
    """
    # sum_prob always appends the leftover entry last, so split it off.
    *word_probs, remainder = sum_prob(dic)
    pieces = [f"{word}:{prob}\t" for word, prob in word_probs]
    pieces.append(f":{remainder[1]}\n")
    return "".join(pieces)
# Collect, for every test line, the last word of the first text column
# after the six leading metadata fields.
# The encoding is given explicitly so decoding does not depend on the
# runtime's locale, matching the UTF-8 decode used for the training data.
with lzma.open(test_file, 'rt', encoding='utf-8') as file:
    predict_words = []
    for line in file:
        split = line.split('\t')[6:]
        predict_words.append(last_word(split[0]))

# The input file is no longer needed, so it is closed before inference.
vocab = train_dataset.vocab
results = []
# Pure inference: no autograd graph is required.
with torch.no_grad():
    for presc_word in predict_words:
        results.append(dict(get_values(presc_word, model, vocab)))

# One formatted prediction line per test line, in input order.
with open(out_file, 'w', encoding='utf-8') as outfile:
    outfile.writelines(format_to(elem) for elem in results)