challenging-america-word-ga.../zad7.ipynb

!pip install torchtext
Defaulting to user installation because normal site-packages is not writeable
Collecting torchtext
  Downloading torchtext-0.15.2-cp310-cp310-manylinux1_x86_64.whl (2.0 MB)
Collecting tqdm
  Using cached tqdm-4.65.0-py3-none-any.whl (77 kB)
Requirement already satisfied: numpy in /home/gedin/.local/lib/python3.10/site-packages (from torchtext) (1.24.3)
Collecting torchdata==0.6.1
  Downloading torchdata-0.6.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (4.6 MB)
Requirement already satisfied: requests in /usr/lib/python3/dist-packages (from torchtext) (2.25.1)
Collecting torch==2.0.1
  Downloading torch-2.0.1-cp310-cp310-manylinux1_x86_64.whl (619.9 MB)
Collecting sympy
  Downloading sympy-1.12-py3-none-any.whl (5.7 MB)
Collecting nvidia-cudnn-cu11==8.5.0.96
  Using cached nvidia_cudnn_cu11-8.5.0.96-2-py3-none-manylinux1_x86_64.whl (557.1 MB)
Collecting nvidia-cuda-cupti-cu11==11.7.101
  Using cached nvidia_cuda_cupti_cu11-11.7.101-py3-none-manylinux1_x86_64.whl (11.8 MB)
Collecting nvidia-cusparse-cu11==11.7.4.91
  Using cached nvidia_cusparse_cu11-11.7.4.91-py3-none-manylinux1_x86_64.whl (173.2 MB)
Collecting networkx
  Using cached networkx-3.1-py3-none-any.whl (2.1 MB)
Collecting nvidia-cufft-cu11==10.9.0.58
  Using cached nvidia_cufft_cu11-10.9.0.58-py3-none-manylinux1_x86_64.whl (168.4 MB)
Collecting filelock
  Downloading filelock-3.12.0-py3-none-any.whl (10 kB)
Collecting nvidia-cuda-runtime-cu11==11.7.99
  Using cached nvidia_cuda_runtime_cu11-11.7.99-py3-none-manylinux1_x86_64.whl (849 kB)
Collecting triton==2.0.0
  Downloading triton-2.0.0-1-cp310-cp310-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (63.3 MB)
Collecting nvidia-cusolver-cu11==11.4.0.1
  Using cached nvidia_cusolver_cu11-11.4.0.1-2-py3-none-manylinux1_x86_64.whl (102.6 MB)
Requirement already satisfied: jinja2 in /home/gedin/.local/lib/python3.10/site-packages (from torch==2.0.1->torchtext) (3.1.2)
Collecting nvidia-cublas-cu11==11.10.3.66
  Using cached nvidia_cublas_cu11-11.10.3.66-py3-none-manylinux1_x86_64.whl (317.1 MB)
Collecting typing-extensions
  Downloading typing_extensions-4.6.3-py3-none-any.whl (31 kB)
Collecting nvidia-nccl-cu11==2.14.3
  Using cached nvidia_nccl_cu11-2.14.3-py3-none-manylinux1_x86_64.whl (177.1 MB)
Collecting nvidia-cuda-nvrtc-cu11==11.7.99
  Using cached nvidia_cuda_nvrtc_cu11-11.7.99-2-py3-none-manylinux1_x86_64.whl (21.0 MB)
Collecting nvidia-curand-cu11==10.2.10.91
  Using cached nvidia_curand_cu11-10.2.10.91-py3-none-manylinux1_x86_64.whl (54.6 MB)
Collecting nvidia-nvtx-cu11==11.7.91
  Using cached nvidia_nvtx_cu11-11.7.91-py3-none-manylinux1_x86_64.whl (98 kB)
Requirement already satisfied: urllib3>=1.25 in /usr/lib/python3/dist-packages (from torchdata==0.6.1->torchtext) (1.26.5)
Requirement already satisfied: wheel in /usr/lib/python3/dist-packages (from nvidia-cublas-cu11==11.10.3.66->torch==2.0.1->torchtext) (0.37.1)
Requirement already satisfied: setuptools in /usr/lib/python3/dist-packages (from nvidia-cublas-cu11==11.10.3.66->torch==2.0.1->torchtext) (59.6.0)
Collecting lit
  Downloading lit-16.0.5.tar.gz (138 kB)
  Preparing metadata (setup.py) ... done
Collecting cmake
  Using cached cmake-3.26.3-py2.py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (24.0 MB)
Requirement already satisfied: MarkupSafe>=2.0 in /usr/lib/python3/dist-packages (from jinja2->torch==2.0.1->torchtext) (2.0.1)
Collecting mpmath>=0.19
  Using cached mpmath-1.3.0-py3-none-any.whl (536 kB)
Building wheels for collected packages: lit
  Building wheel for lit (setup.py) ... done
  Created wheel for lit: filename=lit-16.0.5-py3-none-any.whl size=88192 sha256=f6c57a31a147cbfe0af3d6bf4b856390ad14c28a9ddb38c8044ec29331b35c26
  Stored in directory: /home/gedin/.cache/pip/wheels/eb/02/84/d82f0b1a6098209edf7e3607be6cc592ebbc015a8a3127c68d
Successfully built lit
Installing collected packages: mpmath, lit, cmake, typing-extensions, tqdm, sympy, nvidia-nvtx-cu11, nvidia-nccl-cu11, nvidia-cusparse-cu11, nvidia-curand-cu11, nvidia-cufft-cu11, nvidia-cuda-runtime-cu11, nvidia-cuda-nvrtc-cu11, nvidia-cuda-cupti-cu11, nvidia-cublas-cu11, networkx, filelock, nvidia-cusolver-cu11, nvidia-cudnn-cu11, triton, torch, torchdata, torchtext
Successfully installed cmake-3.26.3 filelock-3.12.0 lit-16.0.5 mpmath-1.3.0 networkx-3.1 nvidia-cublas-cu11-11.10.3.66 nvidia-cuda-cupti-cu11-11.7.101 nvidia-cuda-nvrtc-cu11-11.7.99 nvidia-cuda-runtime-cu11-11.7.99 nvidia-cudnn-cu11-8.5.0.96 nvidia-cufft-cu11-10.9.0.58 nvidia-curand-cu11-10.2.10.91 nvidia-cusolver-cu11-11.4.0.1 nvidia-cusparse-cu11-11.7.4.91 nvidia-nccl-cu11-2.14.3 nvidia-nvtx-cu11-11.7.91 sympy-1.12 torch-2.0.1 torchdata-0.6.1 torchtext-0.15.2 tqdm-4.65.0 triton-2.0.0 typing-extensions-4.6.3
train_file ='train/in.tsv.xz'
test_file = 'dev-0/in.tsv.xz'
out_file = 'dev-0/out.tsv'
from itertools import islice
import itertools
import sys
import lzma
import pickle
import re
import gc
import torch
from torch import nn
from torch.utils.data import IterableDataset, DataLoader
from torchtext.vocab import build_vocab_from_iterator

embed_size = 300
device = 'cuda'
vocab_size = 25000
batch_s = 3200
learning_rate = 0.0001
epochs = 4
k = 20  # number of top-probability words kept per prediction
wildcard_minweight = 0.1
### preprocessing
def preprocess(line):
    line = get_rid_of_header(line)
    line = replace_endline(line)
    return line

def get_rid_of_header(line):
    # keep only the text columns of the TSV line (the first six columns are metadata)
    line = line.split('\t')[6:]
    return "".join(line)

def replace_endline(line):
    # line breaks are stored as escaped "\n" sequences in the corpus; turn them into spaces
    line = line.replace("\\\\n", " ")
    return line


def get_last_word(text):
    """Return the last word of a string."""
    last_word = ""
    for i in range(len(text)-1, -1, -1):
        if text[i] == ' ':
            return last_word[::-1].rstrip()
        else:
            last_word += text[i]
    return last_word[::-1].rstrip()

def get_first_word(text):
    """Return the first word of a string."""
    word = ""
    for i in range(len(text)):
        if text[i] == ' ':
            return word
        else:
            word += text[i]
    return word
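
These helpers are presumably used later to grab the word adjacent to the gap; a tiny made-up illustration:

print(get_last_word("she sat on the"))    # 'the'
print(get_first_word("mat by the fire"))  # 'mat'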


def get_words_from_line(line):
    line = line.rstrip()
    yield '<s>'
    line = preprocess(line)
    for t in line.split(' '):
        yield t
    yield '</s>'


def get_word_lines_from_file(file_name):
    n = 0
    with lzma.open(file_name, 'r') as fh:
        for line in fh:
            n+=1
            if n%1000==0:
                print(n)
            yield get_words_from_line(line.decode('utf-8'))
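
As a quick illustration, a hypothetical TSV line pushed through get_words_from_line (the column layout and the sample text below are made up):

sample = "id\ttitle\tyear\tmonth\tday\tsource\tthe cat sat on the mat"
print(list(get_words_from_line(sample)))
# ['<s>', 'the', 'cat', 'sat', 'on', 'the', 'mat', '</s>']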
vocab = build_vocab_from_iterator(
    get_word_lines_from_file(train_file),
    max_tokens = vocab_size,
    specials = ['<unk>'])

with open('filename.pickle', 'wb') as handle:
    pickle.dump(vocab, handle, protocol=pickle.HIGHEST_PROTOCOL)
1000
2000
... (progress counter printed every 1000 corpus lines) ...
432000
vocab.lookup_tokens([0, 1, 2, 10, 2000])
['<unk>', 'the', 'of', 'was', 'ladies']

Network definition

We implement our simple neural network using the PyTorch framework.

class SimpleBigramNeuralLanguageModel(nn.Module):
  def __init__(self, vocabulary_size, embedding_size):
      super(SimpleBigramNeuralLanguageModel, self).__init__()
      self.model = nn.Sequential(
          nn.Embedding(vocabulary_size, embedding_size),  # previous-word index -> dense vector
          nn.Linear(embedding_size, vocabulary_size),     # dense vector -> scores over the vocabulary
          nn.Softmax(dim=1)                               # scores -> probability distribution per row
      )

  def forward(self, x):
      return self.model(x)
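
A quick sanity check of the module above (a sketch; it assumes a CUDA device is available, per device = 'cuda' set earlier):

model = SimpleBigramNeuralLanguageModel(vocab_size, embed_size).to(device)
x = torch.randint(0, vocab_size, (batch_s,), device=device)  # a batch of previous-word indices
probs = model(x)                                             # shape (batch_s, vocab_size)
print(probs.shape, probs.sum(dim=1)[:3])                     # each row sums to ~1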

with open('filename.pickle','rb') as handle:
    vocab = pickle.load(handle)

vocab.set_default_index(vocab['<unk>'])
help(vocab)
Help on Vocab in module torchtext.vocab.vocab object:

class Vocab(torch.nn.modules.module.Module)
 |  Vocab(vocab) -> None
 |  
 |  Base class for all neural network modules.
 |  
 |  Your models should also subclass this class.
 |  
 |  Modules can also contain other Modules, allowing to nest them in
 |  a tree structure. You can assign the submodules as regular attributes::
 |  
 |      import torch.nn as nn
 |      import torch.nn.functional as F
 |  
 |      class Model(nn.Module):
 |          def __init__(self):
 |              super().__init__()
 |              self.conv1 = nn.Conv2d(1, 20, 5)
 |              self.conv2 = nn.Conv2d(20, 20, 5)
 |  
 |          def forward(self, x):
 |              x = F.relu(self.conv1(x))
 |              return F.relu(self.conv2(x))
 |  
 |  Submodules assigned in this way will be registered, and will have their
 |  parameters converted too when you call :meth:`to`, etc.
 |  
 |  .. note::
 |      As per the example above, an ``__init__()`` call to the parent class
 |      must be made before assignment on the child.
 |  
 |  :ivar training: Boolean represents whether this module is in training or
 |                  evaluation mode.
 |  :vartype training: bool
 |  
 |  Method resolution order:
 |      Vocab
 |      torch.nn.modules.module.Module
 |      builtins.object
 |  
 |  Methods defined here:
 |  
 |  __contains__(self, token: str) -> bool
 |      Args:
 |          token: The token for which to check the membership.
 |      
 |      Returns:
 |          Whether the token is member of vocab or not.
 |  
 |  __getitem__(self, token: str) -> int
 |      Args:
 |          token: The token used to lookup the corresponding index.
 |      
 |      Returns:
 |          The index corresponding to the associated token.
 |  
 |  __init__(self, vocab) -> None
 |      Initializes internal Module state, shared by both nn.Module and ScriptModule.
 |  
 |  __len__(self) -> int
 |      Returns:
 |          The length of the vocab.
 |  
 |  __prepare_scriptable__(self)
 |      Return a JITable Vocab.
 |  
 |  append_token(self, token: str) -> None
 |      Args:
 |          token: The token used to lookup the corresponding index.
 |      
 |      Raises:
 |          RuntimeError: If `token` already exists in the vocab
 |  
 |  forward(self, tokens: List[str]) -> List[int]
 |      Calls the `lookup_indices` method
 |      
 |      Args:
 |          tokens: a list of tokens used to lookup their corresponding `indices`.
 |      
 |      Returns:
 |          The indices associated with a list of `tokens`.
 |  
 |  get_default_index(self) -> Union[int, NoneType]
 |      Returns:
 |          Value of default index if it is set.
 |  
 |  get_itos(self) -> List[str]
 |      Returns:
 |          List mapping indices to tokens.
 |  
 |  get_stoi(self) -> Dict[str, int]
 |      Returns:
 |          Dictionary mapping tokens to indices.
 |  
 |  insert_token(self, token: str, index: int) -> None
 |      Args:
 |          token: The token used to lookup the corresponding index.
 |          index: The index corresponding to the associated token.
 |      Raises:
 |          RuntimeError: If `index` is not in range [0, Vocab.size()] or if `token` already exists in the vocab.
 |  
 |  lookup_indices(self, tokens: List[str]) -> List[int]
 |      Args:
 |          tokens: the tokens used to lookup their corresponding `indices`.
 |      
 |      Returns:
 |          The 'indices` associated with `tokens`.
 |  
 |  lookup_token(self, index: int) -> str
 |      Args:
 |          index: The index corresponding to the associated token.
 |      
 |      Returns:
 |          token: The token used to lookup the corresponding index.
 |      
 |      Raises:
 |          RuntimeError: If `index` not in range [0, itos.size()).
 |  
 |  lookup_tokens(self, indices: List[int]) -> List[str]
 |      Args:
 |          indices: The `indices` used to lookup their corresponding `tokens`.
 |      
 |      Returns:
 |          The `tokens` associated with `indices`.
 |      
 |      Raises:
 |          RuntimeError: If an index within `indices` is not in range [0, itos.size()).
 |  
 |  set_default_index(self, index: Union[int, NoneType]) -> None
 |      Args:
 |          index: Value of default index. This index will be returned when OOV token is queried.
 |  
 |  ----------------------------------------------------------------------
 |  Readonly properties defined here:
 |  
 |  is_jitable
 |  
 |  ----------------------------------------------------------------------
 |  Data and other attributes defined here:
 |  
 |  __jit_unused_properties__ = ['is_jitable']
 |  
 |  ----------------------------------------------------------------------
 |  Methods inherited from torch.nn.modules.module.Module:
 |  
 |  __call__ = _call_impl(self, *args, **kwargs)
 |  
 |  __delattr__(self, name)
 |      Implement delattr(self, name).
 |  
 |  __dir__(self)
 |      Default dir() implementation.
 |  
 |  __getattr__(self, name: str) -> Union[torch.Tensor, ForwardRef('Module')]
 |  
 |  __repr__(self)
 |      Return repr(self).
 |  
 |  __setattr__(self, name: str, value: Union[torch.Tensor, ForwardRef('Module')]) -> None
 |      Implement setattr(self, name, value).
 |  
 |  __setstate__(self, state)
 |  
 |  add_module(self, name: str, module: Union[ForwardRef('Module'), NoneType]) -> None
 |      Adds a child module to the current module.
 |      
 |      The module can be accessed as an attribute using the given name.
 |      
 |      Args:
 |          name (str): name of the child module. The child module can be
 |              accessed from this module using the given name
 |          module (Module): child module to be added to the module.
 |  
 |  apply(self: ~T, fn: Callable[[ForwardRef('Module')], NoneType]) -> ~T
 |      Applies ``fn`` recursively to every submodule (as returned by ``.children()``)
 |      as well as self. Typical use includes initializing the parameters of a model
 |      (see also :ref:`nn-init-doc`).
 |      
 |      Args:
 |          fn (:class:`Module` -> None): function to be applied to each submodule
 |      
 |      Returns:
 |          Module: self
 |      
 |      Example::
 |      
 |          >>> @torch.no_grad()
 |          >>> def init_weights(m):
 |          >>>     print(m)
 |          >>>     if type(m) == nn.Linear:
 |          >>>         m.weight.fill_(1.0)
 |          >>>         print(m.weight)
 |          >>> net = nn.Sequential(nn.Linear(2, 2), nn.Linear(2, 2))
 |          >>> net.apply(init_weights)
 |          Linear(in_features=2, out_features=2, bias=True)
 |          Parameter containing:
 |          tensor([[1., 1.],
 |                  [1., 1.]], requires_grad=True)
 |          Linear(in_features=2, out_features=2, bias=True)
 |          Parameter containing:
 |          tensor([[1., 1.],
 |                  [1., 1.]], requires_grad=True)
 |          Sequential(
 |            (0): Linear(in_features=2, out_features=2, bias=True)
 |            (1): Linear(in_features=2, out_features=2, bias=True)
 |          )
 |  
 |  bfloat16(self: ~T) -> ~T
 |      Casts all floating point parameters and buffers to ``bfloat16`` datatype.
 |      
 |      .. note::
 |          This method modifies the module in-place.
 |      
 |      Returns:
 |          Module: self
 |  
 |  buffers(self, recurse: bool = True) -> Iterator[torch.Tensor]
 |      Returns an iterator over module buffers.
 |      
 |      Args:
 |          recurse (bool): if True, then yields buffers of this module
 |              and all submodules. Otherwise, yields only buffers that
 |              are direct members of this module.
 |      
 |      Yields:
 |          torch.Tensor: module buffer
 |      
 |      Example::
 |      
 |          >>> # xdoctest: +SKIP("undefined vars")
 |          >>> for buf in model.buffers():
 |          >>>     print(type(buf), buf.size())
 |          <class 'torch.Tensor'> (20L,)
 |          <class 'torch.Tensor'> (20L, 1L, 5L, 5L)
 |  
 |  children(self) -> Iterator[ForwardRef('Module')]
 |      Returns an iterator over immediate children modules.
 |      
 |      Yields:
 |          Module: a child module
 |  
 |  cpu(self: ~T) -> ~T
 |      Moves all model parameters and buffers to the CPU.
 |      
 |      .. note::
 |          This method modifies the module in-place.
 |      
 |      Returns:
 |          Module: self
 |  
 |  cuda(self: ~T, device: Union[int, torch.device, NoneType] = None) -> ~T
 |      Moves all model parameters and buffers to the GPU.
 |      
 |      This also makes associated parameters and buffers different objects. So
 |      it should be called before constructing optimizer if the module will
 |      live on GPU while being optimized.
 |      
 |      .. note::
 |          This method modifies the module in-place.
 |      
 |      Args:
 |          device (int, optional): if specified, all parameters will be
 |              copied to that device
 |      
 |      Returns:
 |          Module: self
 |  
 |  double(self: ~T) -> ~T
 |      Casts all floating point parameters and buffers to ``double`` datatype.
 |      
 |      .. note::
 |          This method modifies the module in-place.
 |      
 |      Returns:
 |          Module: self
 |  
 |  eval(self: ~T) -> ~T
 |      Sets the module in evaluation mode.
 |      
 |      This has any effect only on certain modules. See documentations of
 |      particular modules for details of their behaviors in training/evaluation
 |      mode, if they are affected, e.g. :class:`Dropout`, :class:`BatchNorm`,
 |      etc.
 |      
 |      This is equivalent with :meth:`self.train(False) <torch.nn.Module.train>`.
 |      
 |      See :ref:`locally-disable-grad-doc` for a comparison between
 |      `.eval()` and several similar mechanisms that may be confused with it.
 |      
 |      Returns:
 |          Module: self
 |  
 |  extra_repr(self) -> str
 |      Set the extra representation of the module
 |      
 |      To print customized extra information, you should re-implement
 |      this method in your own modules. Both single-line and multi-line
 |      strings are acceptable.
 |  
 |  float(self: ~T) -> ~T
 |      Casts all floating point parameters and buffers to ``float`` datatype.
 |      
 |      .. note::
 |          This method modifies the module in-place.
 |      
 |      Returns:
 |          Module: self
 |  
 |  get_buffer(self, target: str) -> 'Tensor'
 |      Returns the buffer given by ``target`` if it exists,
 |      otherwise throws an error.
 |      
 |      See the docstring for ``get_submodule`` for a more detailed
 |      explanation of this method's functionality as well as how to
 |      correctly specify ``target``.
 |      
 |      Args:
 |          target: The fully-qualified string name of the buffer
 |              to look for. (See ``get_submodule`` for how to specify a
 |              fully-qualified string.)
 |      
 |      Returns:
 |          torch.Tensor: The buffer referenced by ``target``
 |      
 |      Raises:
 |          AttributeError: If the target string references an invalid
 |              path or resolves to something that is not a
 |              buffer
 |  
 |  get_extra_state(self) -> Any
 |      Returns any extra state to include in the module's state_dict.
 |      Implement this and a corresponding :func:`set_extra_state` for your module
 |      if you need to store extra state. This function is called when building the
 |      module's `state_dict()`.
 |      
 |      Note that extra state should be picklable to ensure working serialization
 |      of the state_dict. We only provide provide backwards compatibility guarantees
 |      for serializing Tensors; other objects may break backwards compatibility if
 |      their serialized pickled form changes.
 |      
 |      Returns:
 |          object: Any extra state to store in the module's state_dict
 |  
 |  get_parameter(self, target: str) -> 'Parameter'
 |      Returns the parameter given by ``target`` if it exists,
 |      otherwise throws an error.
 |      
 |      See the docstring for ``get_submodule`` for a more detailed
 |      explanation of this method's functionality as well as how to
 |      correctly specify ``target``.
 |      
 |      Args:
 |          target: The fully-qualified string name of the Parameter
 |              to look for. (See ``get_submodule`` for how to specify a
 |              fully-qualified string.)
 |      
 |      Returns:
 |          torch.nn.Parameter: The Parameter referenced by ``target``
 |      
 |      Raises:
 |          AttributeError: If the target string references an invalid
 |              path or resolves to something that is not an
 |              ``nn.Parameter``
 |  
 |  get_submodule(self, target: str) -> 'Module'
 |      Returns the submodule given by ``target`` if it exists,
 |      otherwise throws an error.
 |      
 |      For example, let's say you have an ``nn.Module`` ``A`` that
 |      looks like this:
 |      
 |      .. code-block:: text
 |      
 |          A(
 |              (net_b): Module(
 |                  (net_c): Module(
 |                      (conv): Conv2d(16, 33, kernel_size=(3, 3), stride=(2, 2))
 |                  )
 |                  (linear): Linear(in_features=100, out_features=200, bias=True)
 |              )
 |          )
 |      
 |      (The diagram shows an ``nn.Module`` ``A``. ``A`` has a nested
 |      submodule ``net_b``, which itself has two submodules ``net_c``
 |      and ``linear``. ``net_c`` then has a submodule ``conv``.)
 |      
 |      To check whether or not we have the ``linear`` submodule, we
 |      would call ``get_submodule("net_b.linear")``. To check whether
 |      we have the ``conv`` submodule, we would call
 |      ``get_submodule("net_b.net_c.conv")``.
 |      
 |      The runtime of ``get_submodule`` is bounded by the degree
 |      of module nesting in ``target``. A query against
 |      ``named_modules`` achieves the same result, but it is O(N) in
 |      the number of transitive modules. So, for a simple check to see
 |      if some submodule exists, ``get_submodule`` should always be
 |      used.
 |      
 |      Args:
 |          target: The fully-qualified string name of the submodule
 |              to look for. (See above example for how to specify a
 |              fully-qualified string.)
 |      
 |      Returns:
 |          torch.nn.Module: The submodule referenced by ``target``
 |      
 |      Raises:
 |          AttributeError: If the target string references an invalid
 |              path or resolves to something that is not an
 |              ``nn.Module``
 |  
 |  half(self: ~T) -> ~T
 |      Casts all floating point parameters and buffers to ``half`` datatype.
 |      
 |      .. note::
 |          This method modifies the module in-place.
 |      
 |      Returns:
 |          Module: self
 |  
 |  ipu(self: ~T, device: Union[int, torch.device, NoneType] = None) -> ~T
 |      Moves all model parameters and buffers to the IPU.
 |      
 |      This also makes associated parameters and buffers different objects. So
 |      it should be called before constructing optimizer if the module will
 |      live on IPU while being optimized.
 |      
 |      .. note::
 |          This method modifies the module in-place.
 |      
 |      Arguments:
 |          device (int, optional): if specified, all parameters will be
 |              copied to that device
 |      
 |      Returns:
 |          Module: self
 |  
 |  load_state_dict(self, state_dict: Mapping[str, Any], strict: bool = True)
 |      Copies parameters and buffers from :attr:`state_dict` into
 |      this module and its descendants. If :attr:`strict` is ``True``, then
 |      the keys of :attr:`state_dict` must exactly match the keys returned
 |      by this module's :meth:`~torch.nn.Module.state_dict` function.
 |      
 |      Args:
 |          state_dict (dict): a dict containing parameters and
 |              persistent buffers.
 |          strict (bool, optional): whether to strictly enforce that the keys
 |              in :attr:`state_dict` match the keys returned by this module's
 |              :meth:`~torch.nn.Module.state_dict` function. Default: ``True``
 |      
 |      Returns:
 |          ``NamedTuple`` with ``missing_keys`` and ``unexpected_keys`` fields:
 |              * **missing_keys** is a list of str containing the missing keys
 |              * **unexpected_keys** is a list of str containing the unexpected keys
 |      
 |      Note:
 |          If a parameter or buffer is registered as ``None`` and its corresponding key
 |          exists in :attr:`state_dict`, :meth:`load_state_dict` will raise a
 |          ``RuntimeError``.
 |  
 |  modules(self) -> Iterator[ForwardRef('Module')]
 |      Returns an iterator over all modules in the network.
 |      
 |      Yields:
 |          Module: a module in the network
 |      
 |      Note:
 |          Duplicate modules are returned only once. In the following
 |          example, ``l`` will be returned only once.
 |      
 |      Example::
 |      
 |          >>> l = nn.Linear(2, 2)
 |          >>> net = nn.Sequential(l, l)
 |          >>> for idx, m in enumerate(net.modules()):
 |          ...     print(idx, '->', m)
 |      
 |          0 -> Sequential(
 |            (0): Linear(in_features=2, out_features=2, bias=True)
 |            (1): Linear(in_features=2, out_features=2, bias=True)
 |          )
 |          1 -> Linear(in_features=2, out_features=2, bias=True)
 |  
 |  named_buffers(self, prefix: str = '', recurse: bool = True, remove_duplicate: bool = True) -> Iterator[Tuple[str, torch.Tensor]]
 |      Returns an iterator over module buffers, yielding both the
 |      name of the buffer as well as the buffer itself.
 |      
 |      Args:
 |          prefix (str): prefix to prepend to all buffer names.
 |          recurse (bool, optional): if True, then yields buffers of this module
 |              and all submodules. Otherwise, yields only buffers that
 |              are direct members of this module. Defaults to True.
 |          remove_duplicate (bool, optional): whether to remove the duplicated buffers in the result. Defaults to True.
 |      
 |      Yields:
 |          (str, torch.Tensor): Tuple containing the name and buffer
 |      
 |      Example::
 |      
 |          >>> # xdoctest: +SKIP("undefined vars")
 |          >>> for name, buf in self.named_buffers():
 |          >>>     if name in ['running_var']:
 |          >>>         print(buf.size())
 |  
 |  named_children(self) -> Iterator[Tuple[str, ForwardRef('Module')]]
 |      Returns an iterator over immediate children modules, yielding both
 |      the name of the module as well as the module itself.
 |      
 |      Yields:
 |          (str, Module): Tuple containing a name and child module
 |      
 |      Example::
 |      
 |          >>> # xdoctest: +SKIP("undefined vars")
 |          >>> for name, module in model.named_children():
 |          >>>     if name in ['conv4', 'conv5']:
 |          >>>         print(module)
 |  
 |  named_modules(self, memo: Union[Set[ForwardRef('Module')], NoneType] = None, prefix: str = '', remove_duplicate: bool = True)
 |      Returns an iterator over all modules in the network, yielding
 |      both the name of the module as well as the module itself.
 |      
 |      Args:
 |          memo: a memo to store the set of modules already added to the result
 |          prefix: a prefix that will be added to the name of the module
 |          remove_duplicate: whether to remove the duplicated module instances in the result
 |              or not
 |      
 |      Yields:
 |          (str, Module): Tuple of name and module
 |      
 |      Note:
 |          Duplicate modules are returned only once. In the following
 |          example, ``l`` will be returned only once.
 |      
 |      Example::
 |      
 |          >>> l = nn.Linear(2, 2)
 |          >>> net = nn.Sequential(l, l)
 |          >>> for idx, m in enumerate(net.named_modules()):
 |          ...     print(idx, '->', m)
 |      
 |          0 -> ('', Sequential(
 |            (0): Linear(in_features=2, out_features=2, bias=True)
 |            (1): Linear(in_features=2, out_features=2, bias=True)
 |          ))
 |          1 -> ('0', Linear(in_features=2, out_features=2, bias=True))
 |  
 |  named_parameters(self, prefix: str = '', recurse: bool = True, remove_duplicate: bool = True) -> Iterator[Tuple[str, torch.nn.parameter.Parameter]]
 |      Returns an iterator over module parameters, yielding both the
 |      name of the parameter as well as the parameter itself.
 |      
 |      Args:
 |          prefix (str): prefix to prepend to all parameter names.
 |          recurse (bool): if True, then yields parameters of this module
 |              and all submodules. Otherwise, yields only parameters that
 |              are direct members of this module.
 |          remove_duplicate (bool, optional): whether to remove the duplicated
 |              parameters in the result. Defaults to True.
 |      
 |      Yields:
 |          (str, Parameter): Tuple containing the name and parameter
 |      
 |      Example::
 |      
 |          >>> # xdoctest: +SKIP("undefined vars")
 |          >>> for name, param in self.named_parameters():
 |          >>>     if name in ['bias']:
 |          >>>         print(param.size())
 |  
 |  parameters(self, recurse: bool = True) -> Iterator[torch.nn.parameter.Parameter]
 |      Returns an iterator over module parameters.
 |      
 |      This is typically passed to an optimizer.
 |      
 |      Args:
 |          recurse (bool): if True, then yields parameters of this module
 |              and all submodules. Otherwise, yields only parameters that
 |              are direct members of this module.
 |      
 |      Yields:
 |          Parameter: module parameter
 |      
 |      Example::
 |      
 |          >>> # xdoctest: +SKIP("undefined vars")
 |          >>> for param in model.parameters():
 |          >>>     print(type(param), param.size())
 |          <class 'torch.Tensor'> (20L,)
 |          <class 'torch.Tensor'> (20L, 1L, 5L, 5L)
 |  
 |  register_backward_hook(self, hook: Callable[[ForwardRef('Module'), Union[Tuple[torch.Tensor, ...], torch.Tensor], Union[Tuple[torch.Tensor, ...], torch.Tensor]], Union[NoneType, Tuple[torch.Tensor, ...], torch.Tensor]]) -> torch.utils.hooks.RemovableHandle
 |      Registers a backward hook on the module.
 |      
 |      This function is deprecated in favor of :meth:`~torch.nn.Module.register_full_backward_hook` and
 |      the behavior of this function will change in future versions.
 |      
 |      Returns:
 |          :class:`torch.utils.hooks.RemovableHandle`:
 |              a handle that can be used to remove the added hook by calling
 |              ``handle.remove()``
 |  
 |  register_buffer(self, name: str, tensor: Union[torch.Tensor, NoneType], persistent: bool = True) -> None
 |      Adds a buffer to the module.
 |      
 |      This is typically used to register a buffer that should not to be
 |      considered a model parameter. For example, BatchNorm's ``running_mean``
 |      is not a parameter, but is part of the module's state. Buffers, by
 |      default, are persistent and will be saved alongside parameters. This
 |      behavior can be changed by setting :attr:`persistent` to ``False``. The
 |      only difference between a persistent buffer and a non-persistent buffer
 |      is that the latter will not be a part of this module's
 |      :attr:`state_dict`.
 |      
 |      Buffers can be accessed as attributes using given names.
 |      
 |      Args:
 |          name (str): name of the buffer. The buffer can be accessed
 |              from this module using the given name
 |          tensor (Tensor or None): buffer to be registered. If ``None``, then operations
 |              that run on buffers, such as :attr:`cuda`, are ignored. If ``None``,
 |              the buffer is **not** included in the module's :attr:`state_dict`.
 |          persistent (bool): whether the buffer is part of this module's
 |              :attr:`state_dict`.
 |      
 |      Example::
 |      
 |          >>> # xdoctest: +SKIP("undefined vars")
 |          >>> self.register_buffer('running_mean', torch.zeros(num_features))
 |  
 |  register_forward_hook(self, hook: Union[Callable[[~T, Tuple[Any, ...], Any], Union[Any, NoneType]], Callable[[~T, Tuple[Any, ...], Dict[str, Any], Any], Union[Any, NoneType]]], *, prepend: bool = False, with_kwargs: bool = False) -> torch.utils.hooks.RemovableHandle
 |      Registers a forward hook on the module.
 |      
 |      The hook will be called every time after :func:`forward` has computed an output.
 |      
 |      If ``with_kwargs`` is ``False`` or not specified, the input contains only
 |      the positional arguments given to the module. Keyword arguments won't be
 |      passed to the hooks and only to the ``forward``. The hook can modify the
 |      output. It can modify the input inplace but it will not have effect on
 |      forward since this is called after :func:`forward` is called. The hook
 |      should have the following signature::
 |      
 |          hook(module, args, output) -> None or modified output
 |      
 |      If ``with_kwargs`` is ``True``, the forward hook will be passed the
 |      ``kwargs`` given to the forward function and be expected to return the
 |      output possibly modified. The hook should have the following signature::
 |      
 |          hook(module, args, kwargs, output) -> None or modified output
 |      
 |      Args:
 |          hook (Callable): The user defined hook to be registered.
 |          prepend (bool): If ``True``, the provided ``hook`` will be fired
 |              before all existing ``forward`` hooks on this
 |              :class:`torch.nn.modules.Module`. Otherwise, the provided
 |              ``hook`` will be fired after all existing ``forward`` hooks on
 |              this :class:`torch.nn.modules.Module`. Note that global
 |              ``forward`` hooks registered with
 |              :func:`register_module_forward_hook` will fire before all hooks
 |              registered by this method.
 |              Default: ``False``
 |          with_kwargs (bool): If ``True``, the ``hook`` will be passed the
 |              kwargs given to the forward function.
 |              Default: ``False``
 |      
 |      Returns:
 |          :class:`torch.utils.hooks.RemovableHandle`:
 |              a handle that can be used to remove the added hook by calling
 |              ``handle.remove()``
 |  
 |  register_forward_pre_hook(self, hook: Union[Callable[[~T, Tuple[Any, ...]], Union[Any, NoneType]], Callable[[~T, Tuple[Any, ...], Dict[str, Any]], Union[Tuple[Any, Dict[str, Any]], NoneType]]], *, prepend: bool = False, with_kwargs: bool = False) -> torch.utils.hooks.RemovableHandle
 |      Registers a forward pre-hook on the module.
 |      
 |      The hook will be called every time before :func:`forward` is invoked.
 |      
 |      
 |      If ``with_kwargs`` is false or not specified, the input contains only
 |      the positional arguments given to the module. Keyword arguments won't be
 |      passed to the hooks and only to the ``forward``. The hook can modify the
 |      input. User can either return a tuple or a single modified value in the
 |      hook. We will wrap the value into a tuple if a single value is returned
 |      (unless that value is already a tuple). The hook should have the
 |      following signature::
 |      
 |          hook(module, args) -> None or modified input
 |      
 |      If ``with_kwargs`` is true, the forward pre-hook will be passed the
 |      kwargs given to the forward function. And if the hook modifies the
 |      input, both the args and kwargs should be returned. The hook should have
 |      the following signature::
 |      
 |          hook(module, args, kwargs) -> None or a tuple of modified input and kwargs
 |      
 |      Args:
 |          hook (Callable): The user defined hook to be registered.
 |          prepend (bool): If true, the provided ``hook`` will be fired before
 |              all existing ``forward_pre`` hooks on this
 |              :class:`torch.nn.modules.Module`. Otherwise, the provided
 |              ``hook`` will be fired after all existing ``forward_pre`` hooks
 |              on this :class:`torch.nn.modules.Module`. Note that global
 |              ``forward_pre`` hooks registered with
 |              :func:`register_module_forward_pre_hook` will fire before all
 |              hooks registered by this method.
 |              Default: ``False``
 |          with_kwargs (bool): If true, the ``hook`` will be passed the kwargs
 |              given to the forward function.
 |              Default: ``False``
 |      
 |      Returns:
 |          :class:`torch.utils.hooks.RemovableHandle`:
 |              a handle that can be used to remove the added hook by calling
 |              ``handle.remove()``
 |  
 |  register_full_backward_hook(self, hook: Callable[[ForwardRef('Module'), Union[Tuple[torch.Tensor, ...], torch.Tensor], Union[Tuple[torch.Tensor, ...], torch.Tensor]], Union[NoneType, Tuple[torch.Tensor, ...], torch.Tensor]], prepend: bool = False) -> torch.utils.hooks.RemovableHandle
 |      Registers a backward hook on the module.
 |      
 |      The hook will be called every time the gradients with respect to a module
 |      are computed, i.e. the hook will execute if and only if the gradients with
 |      respect to module outputs are computed. The hook should have the following
 |      signature::
 |      
 |          hook(module, grad_input, grad_output) -> tuple(Tensor) or None
 |      
 |      The :attr:`grad_input` and :attr:`grad_output` are tuples that contain the gradients
 |      with respect to the inputs and outputs respectively. The hook should
 |      not modify its arguments, but it can optionally return a new gradient with
 |      respect to the input that will be used in place of :attr:`grad_input` in
 |      subsequent computations. :attr:`grad_input` will only correspond to the inputs given
 |      as positional arguments and all kwarg arguments are ignored. Entries
 |      in :attr:`grad_input` and :attr:`grad_output` will be ``None`` for all non-Tensor
 |      arguments.
 |      
 |      For technical reasons, when this hook is applied to a Module, its forward function will
 |      receive a view of each Tensor passed to the Module. Similarly the caller will receive a view
 |      of each Tensor returned by the Module's forward function.
 |      
 |      .. warning ::
 |          Modifying inputs or outputs inplace is not allowed when using backward hooks and
 |          will raise an error.
 |      
 |      Args:
 |          hook (Callable): The user-defined hook to be registered.
 |          prepend (bool): If true, the provided ``hook`` will be fired before
 |              all existing ``backward`` hooks on this
 |              :class:`torch.nn.modules.Module`. Otherwise, the provided
 |              ``hook`` will be fired after all existing ``backward`` hooks on
 |              this :class:`torch.nn.modules.Module`. Note that global
 |              ``backward`` hooks registered with
 |              :func:`register_module_full_backward_hook` will fire before
 |              all hooks registered by this method.
 |      
 |      Returns:
 |          :class:`torch.utils.hooks.RemovableHandle`:
 |              a handle that can be used to remove the added hook by calling
 |              ``handle.remove()``
 |  
 |  register_full_backward_pre_hook(self, hook: Callable[[ForwardRef('Module'), Union[Tuple[torch.Tensor, ...], torch.Tensor]], Union[NoneType, Tuple[torch.Tensor, ...], torch.Tensor]], prepend: bool = False) -> torch.utils.hooks.RemovableHandle
 |      Registers a backward pre-hook on the module.
 |      
 |      The hook will be called every time the gradients for the module are computed.
 |      The hook should have the following signature::
 |      
 |          hook(module, grad_output) -> Tensor or None
 |      
 |      The :attr:`grad_output` is a tuple. The hook should
 |      not modify its arguments, but it can optionally return a new gradient with
 |      respect to the output that will be used in place of :attr:`grad_output` in
 |      subsequent computations. Entries in :attr:`grad_output` will be ``None`` for
 |      all non-Tensor arguments.
 |      
 |      For technical reasons, when this hook is applied to a Module, its forward function will
 |      receive a view of each Tensor passed to the Module. Similarly the caller will receive a view
 |      of each Tensor returned by the Module's forward function.
 |      
 |      .. warning ::
 |          Modifying inputs inplace is not allowed when using backward hooks and
 |          will raise an error.
 |      
 |      Args:
 |          hook (Callable): The user-defined hook to be registered.
 |          prepend (bool): If true, the provided ``hook`` will be fired before
 |              all existing ``backward_pre`` hooks on this
 |              :class:`torch.nn.modules.Module`. Otherwise, the provided
 |              ``hook`` will be fired after all existing ``backward_pre`` hooks
 |              on this :class:`torch.nn.modules.Module`. Note that global
 |              ``backward_pre`` hooks registered with
 |              :func:`register_module_full_backward_pre_hook` will fire before
 |              all hooks registered by this method.
 |      
 |      Returns:
 |          :class:`torch.utils.hooks.RemovableHandle`:
 |              a handle that can be used to remove the added hook by calling
 |              ``handle.remove()``
 |  
 |  register_load_state_dict_post_hook(self, hook)
 |      Registers a post hook to be run after module's ``load_state_dict``
 |      is called.
 |      
 |      It should have the following signature::
 |          hook(module, incompatible_keys) -> None
 |      
 |      The ``module`` argument is the current module that this hook is registered
 |      on, and the ``incompatible_keys`` argument is a ``NamedTuple`` consisting
 |      of attributes ``missing_keys`` and ``unexpected_keys``. ``missing_keys``
 |      is a ``list`` of ``str`` containing the missing keys and
 |      ``unexpected_keys`` is a ``list`` of ``str`` containing the unexpected keys.
 |      
 |      The given incompatible_keys can be modified inplace if needed.
 |      
 |      Note that the checks performed when calling :func:`load_state_dict` with
 |      ``strict=True`` are affected by modifications the hook makes to
 |      ``missing_keys`` or ``unexpected_keys``, as expected. Additions to either
 |      set of keys will result in an error being thrown when ``strict=True``, and
 |      clearing out both missing and unexpected keys will avoid an error.
 |      
 |      Returns:
 |          :class:`torch.utils.hooks.RemovableHandle`:
 |              a handle that can be used to remove the added hook by calling
 |              ``handle.remove()``
 |  
 |  register_module(self, name: str, module: Union[ForwardRef('Module'), NoneType]) -> None
 |      Alias for :func:`add_module`.
 |  
 |  register_parameter(self, name: str, param: Union[torch.nn.parameter.Parameter, NoneType]) -> None
 |      Adds a parameter to the module.
 |      
 |      The parameter can be accessed as an attribute using given name.
 |      
 |      Args:
 |          name (str): name of the parameter. The parameter can be accessed
 |              from this module using the given name
 |          param (Parameter or None): parameter to be added to the module. If
 |              ``None``, then operations that run on parameters, such as :attr:`cuda`,
 |              are ignored. If ``None``, the parameter is **not** included in the
 |              module's :attr:`state_dict`.
 |  
 |  register_state_dict_pre_hook(self, hook)
 |      These hooks will be called with arguments: ``self``, ``prefix``,
 |      and ``keep_vars`` before calling ``state_dict`` on ``self``. The registered
 |      hooks can be used to perform pre-processing before the ``state_dict``
 |      call is made.
 |  
 |  requires_grad_(self: ~T, requires_grad: bool = True) -> ~T
 |      Change if autograd should record operations on parameters in this
 |      module.
 |      
 |      This method sets the parameters' :attr:`requires_grad` attributes
 |      in-place.
 |      
 |      This method is helpful for freezing part of the module for finetuning
 |      or training parts of a model individually (e.g., GAN training).
 |      
 |      See :ref:`locally-disable-grad-doc` for a comparison between
 |      `.requires_grad_()` and several similar mechanisms that may be confused with it.
 |      
 |      Args:
 |          requires_grad (bool): whether autograd should record operations on
 |                                parameters in this module. Default: ``True``.
 |      
 |      Returns:
 |          Module: self
 |  
 |  set_extra_state(self, state: Any)
 |      This function is called from :func:`load_state_dict` to handle any extra state
 |      found within the `state_dict`. Implement this function and a corresponding
 |      :func:`get_extra_state` for your module if you need to store extra state within its
 |      `state_dict`.
 |      
 |      Args:
 |          state (dict): Extra state from the `state_dict`
 |  
 |  share_memory(self: ~T) -> ~T
 |      See :meth:`torch.Tensor.share_memory_`
 |  
 |  state_dict(self, *args, destination=None, prefix='', keep_vars=False)
 |      Returns a dictionary containing references to the whole state of the module.
 |      
 |      Both parameters and persistent buffers (e.g. running averages) are
 |      included. Keys are corresponding parameter and buffer names.
 |      Parameters and buffers set to ``None`` are not included.
 |      
 |      .. note::
 |          The returned object is a shallow copy. It contains references
 |          to the module's parameters and buffers.
 |      
 |      .. warning::
 |          Currently ``state_dict()`` also accepts positional arguments for
 |          ``destination``, ``prefix`` and ``keep_vars`` in order. However,
 |          this is being deprecated and keyword arguments will be enforced in
 |          future releases.
 |      
 |      .. warning::
 |          Please avoid the use of argument ``destination`` as it is not
 |          designed for end-users.
 |      
 |      Args:
 |          destination (dict, optional): If provided, the state of module will
 |              be updated into the dict and the same object is returned.
 |              Otherwise, an ``OrderedDict`` will be created and returned.
 |              Default: ``None``.
 |          prefix (str, optional): a prefix added to parameter and buffer
 |              names to compose the keys in state_dict. Default: ``''``.
 |          keep_vars (bool, optional): by default the :class:`~torch.Tensor` s
 |              returned in the state dict are detached from autograd. If it's
 |              set to ``True``, detaching will not be performed.
 |              Default: ``False``.
 |      
 |      Returns:
 |          dict:
 |              a dictionary containing a whole state of the module
 |      
 |      Example::
 |      
 |          >>> # xdoctest: +SKIP("undefined vars")
 |          >>> module.state_dict().keys()
 |          ['bias', 'weight']
 |  
 |  to(self, *args, **kwargs)
 |      Moves and/or casts the parameters and buffers.
 |      
 |      This can be called as
 |      
 |      .. function:: to(device=None, dtype=None, non_blocking=False)
 |         :noindex:
 |      
 |      .. function:: to(dtype, non_blocking=False)
 |         :noindex:
 |      
 |      .. function:: to(tensor, non_blocking=False)
 |         :noindex:
 |      
 |      .. function:: to(memory_format=torch.channels_last)
 |         :noindex:
 |      
 |      Its signature is similar to :meth:`torch.Tensor.to`, but only accepts
 |      floating point or complex :attr:`dtype`\ s. In addition, this method will
 |      only cast the floating point or complex parameters and buffers to :attr:`dtype`
 |      (if given). The integral parameters and buffers will be moved
 |      :attr:`device`, if that is given, but with dtypes unchanged. When
 |      :attr:`non_blocking` is set, it tries to convert/move asynchronously
 |      with respect to the host if possible, e.g., moving CPU Tensors with
 |      pinned memory to CUDA devices.
 |      
 |      See below for examples.
 |      
 |      .. note::
 |          This method modifies the module in-place.
 |      
 |      Args:
 |          device (:class:`torch.device`): the desired device of the parameters
 |              and buffers in this module
 |          dtype (:class:`torch.dtype`): the desired floating point or complex dtype of
 |              the parameters and buffers in this module
 |          tensor (torch.Tensor): Tensor whose dtype and device are the desired
 |              dtype and device for all parameters and buffers in this module
 |          memory_format (:class:`torch.memory_format`): the desired memory
 |              format for 4D parameters and buffers in this module (keyword
 |              only argument)
 |      
 |      Returns:
 |          Module: self
 |      
 |      Examples::
 |      
 |          >>> # xdoctest: +IGNORE_WANT("non-deterministic")
 |          >>> linear = nn.Linear(2, 2)
 |          >>> linear.weight
 |          Parameter containing:
 |          tensor([[ 0.1913, -0.3420],
 |                  [-0.5113, -0.2325]])
 |          >>> linear.to(torch.double)
 |          Linear(in_features=2, out_features=2, bias=True)
 |          >>> linear.weight
 |          Parameter containing:
 |          tensor([[ 0.1913, -0.3420],
 |                  [-0.5113, -0.2325]], dtype=torch.float64)
 |          >>> # xdoctest: +REQUIRES(env:TORCH_DOCTEST_CUDA1)
 |          >>> gpu1 = torch.device("cuda:1")
 |          >>> linear.to(gpu1, dtype=torch.half, non_blocking=True)
 |          Linear(in_features=2, out_features=2, bias=True)
 |          >>> linear.weight
 |          Parameter containing:
 |          tensor([[ 0.1914, -0.3420],
 |                  [-0.5112, -0.2324]], dtype=torch.float16, device='cuda:1')
 |          >>> cpu = torch.device("cpu")
 |          >>> linear.to(cpu)
 |          Linear(in_features=2, out_features=2, bias=True)
 |          >>> linear.weight
 |          Parameter containing:
 |          tensor([[ 0.1914, -0.3420],
 |                  [-0.5112, -0.2324]], dtype=torch.float16)
 |      
 |          >>> linear = nn.Linear(2, 2, bias=None).to(torch.cdouble)
 |          >>> linear.weight
 |          Parameter containing:
 |          tensor([[ 0.3741+0.j,  0.2382+0.j],
 |                  [ 0.5593+0.j, -0.4443+0.j]], dtype=torch.complex128)
 |          >>> linear(torch.ones(3, 2, dtype=torch.cdouble))
 |          tensor([[0.6122+0.j, 0.1150+0.j],
 |                  [0.6122+0.j, 0.1150+0.j],
 |                  [0.6122+0.j, 0.1150+0.j]], dtype=torch.complex128)
 |  
 |  to_empty(self: ~T, *, device: Union[str, torch.device]) -> ~T
 |      Moves the parameters and buffers to the specified device without copying storage.
 |      
 |      Args:
 |          device (:class:`torch.device`): The desired device of the parameters
 |              and buffers in this module.
 |      
 |      Returns:
 |          Module: self
 |  
 |  train(self: ~T, mode: bool = True) -> ~T
 |      Sets the module in training mode.
 |      
 |      This has any effect only on certain modules. See documentations of
 |      particular modules for details of their behaviors in training/evaluation
 |      mode, if they are affected, e.g. :class:`Dropout`, :class:`BatchNorm`,
 |      etc.
 |      
 |      Args:
 |          mode (bool): whether to set training mode (``True``) or evaluation
 |                       mode (``False``). Default: ``True``.
 |      
 |      Returns:
 |          Module: self
 |  
 |  type(self: ~T, dst_type: Union[torch.dtype, str]) -> ~T
 |      Casts all parameters and buffers to :attr:`dst_type`.
 |      
 |      .. note::
 |          This method modifies the module in-place.
 |      
 |      Args:
 |          dst_type (type or string): the desired type
 |      
 |      Returns:
 |          Module: self
 |  
 |  xpu(self: ~T, device: Union[int, torch.device, NoneType] = None) -> ~T
 |      Moves all model parameters and buffers to the XPU.
 |      
 |      This also makes associated parameters and buffers different objects. So
 |      it should be called before constructing optimizer if the module will
 |      live on XPU while being optimized.
 |      
 |      .. note::
 |          This method modifies the module in-place.
 |      
 |      Arguments:
 |          device (int, optional): if specified, all parameters will be
 |              copied to that device
 |      
 |      Returns:
 |          Module: self
 |  
 |  zero_grad(self, set_to_none: bool = True) -> None
 |      Sets gradients of all model parameters to zero. See similar function
 |      under :class:`torch.optim.Optimizer` for more context.
 |      
 |      Args:
 |          set_to_none (bool): instead of setting to zero, set the grads to None.
 |              See :meth:`torch.optim.Optimizer.zero_grad` for details.
 |  
 |  ----------------------------------------------------------------------
 |  Data descriptors inherited from torch.nn.modules.module.Module:
 |  
 |  __dict__
 |      dictionary for instance variables (if defined)
 |  
 |  __weakref__
 |      list of weak references to the object (if defined)
 |  
 |  ----------------------------------------------------------------------
 |  Data and other attributes inherited from torch.nn.modules.module.Module:
 |  
 |  T_destination = ~T_destination
 |  
 |  __annotations__ = {'__call__': typing.Callable[..., typing.Any], '_bac...
 |  
 |  call_super_init = False
 |  
 |  dump_patches = False

def look_ahead_iterator(gen):
    # Turn a stream of token ids into a stream of (previous, next) pairs, i.e. bigrams.
    prev = None
    for item in gen:
        if prev is not None:
            yield (prev, item)
        prev = item

class Bigrams(IterableDataset):
    def __init__(self, text_file, vocabulary_size):
        # Build the vocabulary from the training file; out-of-vocabulary words
        # are mapped to the special '<unk>' token.
        self.vocab = build_vocab_from_iterator(
            get_word_lines_from_file(text_file),
            max_tokens=vocabulary_size,
            specials=['<unk>'])
        self.vocab.set_default_index(self.vocab['<unk>'])
        self.vocabulary_size = vocabulary_size
        self.text_file = text_file

    def __iter__(self):
        # Stream (previous word id, next word id) pairs over the whole corpus.
        return look_ahead_iterator(
            (self.vocab[t] for t in itertools.chain.from_iterable(get_word_lines_from_file(self.text_file))))

train_dataset = Bigrams(train_file, vocab_size)
[vocabulary-building progress output: the line counter is printed every 1000 corpus lines, from 1000 up to 432000]
print(train_dataset)
<__main__.Bigrams object at 0x7fdd26d23940>
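The print above only shows the object's repr; to confirm that the dataset really yields (previous word, next word) index pairs, one can peek at the first few items. A minimal sanity-check sketch (not a cell from the original notebook; it assumes the train_dataset and torchtext vocabulary built above):

import itertools

# Take the first few (previous, next) index pairs and map them back to words.
for prev_idx, next_idx in itertools.islice(iter(train_dataset), 5):
    prev_word, next_word = train_dataset.vocab.lookup_tokens([prev_idx, next_idx])
    print((prev_idx, next_idx), '->', (prev_word, next_word))

Note that iterating the dataset re-reads the training file, so the per-1000-line counter will print again.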
torch.cuda.memory_summary(device=None, abbreviated=False)
[CUDA memory summary, device 0: 1 CUDA OOM, 1 cudaMalloc retry; allocated memory 699613 KiB current / 1903 MiB peak; GPU reserved memory 710656 KiB current / 1918 MiB peak; non-releasable memory ≈11 MiB current / ≈19 MiB peak; 10 active allocations]
import os
# Note: PYTORCH_CUDA_ALLOC_CONF is normally read when the CUDA caching allocator is
# first initialised, so setting it here, after CUDA has already been used, may have no effect.
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:256"
device = 'cuda'
model = SimpleBigramNeuralLanguageModel(vocab_size, embed_size).to(device)
data = DataLoader(train_dataset, batch_size=batch_s)
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
criterion = torch.nn.NLLLoss()
torch.cuda.empty_cache()
gc.collect()

# Resume from the previously saved weights and continue training.
model.load_state_dict(torch.load('model-bigram_final.bin'))
for i in range(1, epochs+1):
    print('epoch: =', i)
    model.train()
    step = 0
    for x, y in data:  # x: previous word id, y: word to predict
        x = x.to(device)
        y = y.to(device)
        optimizer.zero_grad()
        ypredicted = model(x)  # softmax distribution over the next word
        loss = criterion(torch.log(ypredicted), y)
        if step % 100 == 0:
            print(step, loss)
        step += 1
        loss.backward()
        optimizer.step()
    torch.save(model.state_dict(), f'model-bigram_2nd-run{i}.bin')
torch.save(model.state_dict(), 'model-bigram_final.bin')
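Once the run below finishes and the weights are saved, the model can be queried for next-word predictions. A rough inference sketch (an illustration rather than a cell from the notebook; it assumes SimpleBigramNeuralLanguageModel ends in a softmax, as the torch.log/NLLLoss pairing above implies, and top_next_words is a hypothetical helper name):

def top_next_words(model, vocab, prev_word, k=5):
    # Return the k most probable next words after `prev_word` under the bigram model.
    model.eval()
    with torch.no_grad():
        x = torch.tensor([vocab[prev_word]], device=device)
        probs = model(x)[0]          # softmax distribution over the vocabulary
        values, indices = probs.topk(k)
    return list(zip(vocab.lookup_tokens(indices.tolist()), values.tolist()))

# e.g. top_next_words(model, train_dataset.vocab, 'the')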
epoch: = 1
/home/gedin/.local/lib/python3.8/site-packages/torch/nn/modules/container.py:217: UserWarning: Implicit dimension choice for softmax has been deprecated. Change the call to include dim=X as an argument.
  input = module(input)
[epoch 1 training log, steps 0-43600: the NLL loss, printed every 100 steps, falls from about 6.0 at step 0 to roughly 5.3-5.5 by step 43600, interleaved with the per-1000-line corpus counter running up to 432000]
epoch: = 2
[epoch 2 training log, steps 0-27300: the loss, printed every 100 steps, fluctuates between roughly 5.1 and 5.7, interleaved with the same per-1000-line corpus counter]
271000
27400 tensor(5.4334, device='cuda:0', grad_fn=<NllLossBackward0>)
272000
27500 tensor(5.3266, device='cuda:0', grad_fn=<NllLossBackward0>)
273000
27600 tensor(5.6412, device='cuda:0', grad_fn=<NllLossBackward0>)
274000
27700 tensor(5.4420, device='cuda:0', grad_fn=<NllLossBackward0>)
275000
27800 tensor(5.4381, device='cuda:0', grad_fn=<NllLossBackward0>)
276000
27900 tensor(5.5550, device='cuda:0', grad_fn=<NllLossBackward0>)
277000
28000 tensor(5.4154, device='cuda:0', grad_fn=<NllLossBackward0>)
278000
28100 tensor(5.3823, device='cuda:0', grad_fn=<NllLossBackward0>)
279000
28200 tensor(5.5344, device='cuda:0', grad_fn=<NllLossBackward0>)
280000
28300 tensor(5.1615, device='cuda:0', grad_fn=<NllLossBackward0>)
281000
28400 tensor(5.6069, device='cuda:0', grad_fn=<NllLossBackward0>)
282000
28500 tensor(5.4426, device='cuda:0', grad_fn=<NllLossBackward0>)
283000
28600 tensor(5.3672, device='cuda:0', grad_fn=<NllLossBackward0>)
284000
28700 tensor(5.5133, device='cuda:0', grad_fn=<NllLossBackward0>)
285000
28800 tensor(5.5556, device='cuda:0', grad_fn=<NllLossBackward0>)
286000
28900 tensor(5.4294, device='cuda:0', grad_fn=<NllLossBackward0>)
287000
29000 tensor(5.3359, device='cuda:0', grad_fn=<NllLossBackward0>)
288000
29100 tensor(5.0951, device='cuda:0', grad_fn=<NllLossBackward0>)
289000
29200 tensor(5.2511, device='cuda:0', grad_fn=<NllLossBackward0>)
290000
29300 tensor(5.5364, device='cuda:0', grad_fn=<NllLossBackward0>)
291000
29400 tensor(5.6708, device='cuda:0', grad_fn=<NllLossBackward0>)
292000
29500 tensor(5.4371, device='cuda:0', grad_fn=<NllLossBackward0>)
293000
29600 tensor(5.2942, device='cuda:0', grad_fn=<NllLossBackward0>)
294000
29700 tensor(5.4637, device='cuda:0', grad_fn=<NllLossBackward0>)
295000
29800 tensor(5.2914, device='cuda:0', grad_fn=<NllLossBackward0>)
296000
29900 tensor(5.5562, device='cuda:0', grad_fn=<NllLossBackward0>)
297000
30000 tensor(5.2833, device='cuda:0', grad_fn=<NllLossBackward0>)
298000
30100 tensor(5.3481, device='cuda:0', grad_fn=<NllLossBackward0>)
299000
30200 tensor(5.3122, device='cuda:0', grad_fn=<NllLossBackward0>)
300000
30300 tensor(5.4103, device='cuda:0', grad_fn=<NllLossBackward0>)
301000
30400 tensor(5.2480, device='cuda:0', grad_fn=<NllLossBackward0>)
302000
30500 tensor(5.4258, device='cuda:0', grad_fn=<NllLossBackward0>)
30600 tensor(5.3835, device='cuda:0', grad_fn=<NllLossBackward0>)
303000
30700 tensor(5.4193, device='cuda:0', grad_fn=<NllLossBackward0>)
304000
30800 tensor(5.4438, device='cuda:0', grad_fn=<NllLossBackward0>)
305000
30900 tensor(5.4518, device='cuda:0', grad_fn=<NllLossBackward0>)
306000
31000 tensor(5.5607, device='cuda:0', grad_fn=<NllLossBackward0>)
307000
31100 tensor(5.2059, device='cuda:0', grad_fn=<NllLossBackward0>)
308000
31200 tensor(5.2571, device='cuda:0', grad_fn=<NllLossBackward0>)
309000
31300 tensor(5.5208, device='cuda:0', grad_fn=<NllLossBackward0>)
310000
31400 tensor(5.3061, device='cuda:0', grad_fn=<NllLossBackward0>)
311000
31500 tensor(5.4834, device='cuda:0', grad_fn=<NllLossBackward0>)
312000
31600 tensor(5.4653, device='cuda:0', grad_fn=<NllLossBackward0>)
313000
31700 tensor(5.4308, device='cuda:0', grad_fn=<NllLossBackward0>)
314000
31800 tensor(5.5400, device='cuda:0', grad_fn=<NllLossBackward0>)
315000
31900 tensor(5.1536, device='cuda:0', grad_fn=<NllLossBackward0>)
316000
32000 tensor(5.3460, device='cuda:0', grad_fn=<NllLossBackward0>)
317000
32100 tensor(5.2300, device='cuda:0', grad_fn=<NllLossBackward0>)
318000
32200 tensor(5.5511, device='cuda:0', grad_fn=<NllLossBackward0>)
319000
32300 tensor(5.5391, device='cuda:0', grad_fn=<NllLossBackward0>)
320000
32400 tensor(5.5157, device='cuda:0', grad_fn=<NllLossBackward0>)
321000
32500 tensor(5.3336, device='cuda:0', grad_fn=<NllLossBackward0>)
322000
32600 tensor(5.4475, device='cuda:0', grad_fn=<NllLossBackward0>)
323000
32700 tensor(5.3894, device='cuda:0', grad_fn=<NllLossBackward0>)
324000
32800 tensor(5.6022, device='cuda:0', grad_fn=<NllLossBackward0>)
325000
32900 tensor(5.4663, device='cuda:0', grad_fn=<NllLossBackward0>)
326000
33000 tensor(5.2387, device='cuda:0', grad_fn=<NllLossBackward0>)
327000
33100 tensor(5.4446, device='cuda:0', grad_fn=<NllLossBackward0>)
328000
33200 tensor(5.5450, device='cuda:0', grad_fn=<NllLossBackward0>)
329000
33300 tensor(5.3179, device='cuda:0', grad_fn=<NllLossBackward0>)
330000
33400 tensor(5.5905, device='cuda:0', grad_fn=<NllLossBackward0>)
331000
33500 tensor(5.4066, device='cuda:0', grad_fn=<NllLossBackward0>)
332000
33600 tensor(5.3542, device='cuda:0', grad_fn=<NllLossBackward0>)
333000
33700 tensor(5.4097, device='cuda:0', grad_fn=<NllLossBackward0>)
334000
33800 tensor(5.4912, device='cuda:0', grad_fn=<NllLossBackward0>)
335000
33900 tensor(5.2358, device='cuda:0', grad_fn=<NllLossBackward0>)
336000
34000 tensor(5.4470, device='cuda:0', grad_fn=<NllLossBackward0>)
337000
34100 tensor(5.4207, device='cuda:0', grad_fn=<NllLossBackward0>)
338000
34200 tensor(5.4651, device='cuda:0', grad_fn=<NllLossBackward0>)
339000
34300 tensor(5.2545, device='cuda:0', grad_fn=<NllLossBackward0>)
340000
34400 tensor(5.7106, device='cuda:0', grad_fn=<NllLossBackward0>)
341000
34500 tensor(5.5699, device='cuda:0', grad_fn=<NllLossBackward0>)
342000
34600 tensor(5.4638, device='cuda:0', grad_fn=<NllLossBackward0>)
343000
34700 tensor(5.5382, device='cuda:0', grad_fn=<NllLossBackward0>)
344000
34800 tensor(5.5603, device='cuda:0', grad_fn=<NllLossBackward0>)
345000
34900 tensor(5.6072, device='cuda:0', grad_fn=<NllLossBackward0>)
346000
35000 tensor(5.6037, device='cuda:0', grad_fn=<NllLossBackward0>)
347000
35100 tensor(5.4069, device='cuda:0', grad_fn=<NllLossBackward0>)
348000
35200 tensor(5.3398, device='cuda:0', grad_fn=<NllLossBackward0>)
349000
35300 tensor(5.5607, device='cuda:0', grad_fn=<NllLossBackward0>)
350000
35400 tensor(5.2068, device='cuda:0', grad_fn=<NllLossBackward0>)
351000
35500 tensor(5.3112, device='cuda:0', grad_fn=<NllLossBackward0>)
352000
35600 tensor(5.4126, device='cuda:0', grad_fn=<NllLossBackward0>)
353000
35700 tensor(5.3091, device='cuda:0', grad_fn=<NllLossBackward0>)
354000
35800 tensor(5.4252, device='cuda:0', grad_fn=<NllLossBackward0>)
355000
35900 tensor(5.3956, device='cuda:0', grad_fn=<NllLossBackward0>)
356000
36000 tensor(5.1705, device='cuda:0', grad_fn=<NllLossBackward0>)
357000
36100 tensor(5.5497, device='cuda:0', grad_fn=<NllLossBackward0>)
358000
36200 tensor(5.4066, device='cuda:0', grad_fn=<NllLossBackward0>)
359000
36300 tensor(5.6858, device='cuda:0', grad_fn=<NllLossBackward0>)
360000
36400 tensor(5.3812, device='cuda:0', grad_fn=<NllLossBackward0>)
361000
36500 tensor(5.3990, device='cuda:0', grad_fn=<NllLossBackward0>)
362000
36600 tensor(5.4302, device='cuda:0', grad_fn=<NllLossBackward0>)
363000
36700 tensor(5.2253, device='cuda:0', grad_fn=<NllLossBackward0>)
364000
36800 tensor(5.3347, device='cuda:0', grad_fn=<NllLossBackward0>)
365000
36900 tensor(5.4426, device='cuda:0', grad_fn=<NllLossBackward0>)
366000
37000 tensor(5.3419, device='cuda:0', grad_fn=<NllLossBackward0>)
367000
37100 tensor(5.3579, device='cuda:0', grad_fn=<NllLossBackward0>)
368000
37200 tensor(5.4332, device='cuda:0', grad_fn=<NllLossBackward0>)
369000
37300 tensor(5.3362, device='cuda:0', grad_fn=<NllLossBackward0>)
370000
37400 tensor(5.7100, device='cuda:0', grad_fn=<NllLossBackward0>)
371000
37500 tensor(5.3742, device='cuda:0', grad_fn=<NllLossBackward0>)
372000
37600 tensor(5.3615, device='cuda:0', grad_fn=<NllLossBackward0>)
373000
37700 tensor(5.5402, device='cuda:0', grad_fn=<NllLossBackward0>)
374000
37800 tensor(5.3734, device='cuda:0', grad_fn=<NllLossBackward0>)
375000
37900 tensor(5.3621, device='cuda:0', grad_fn=<NllLossBackward0>)
376000
38000 tensor(5.4380, device='cuda:0', grad_fn=<NllLossBackward0>)
377000
38100 tensor(5.4513, device='cuda:0', grad_fn=<NllLossBackward0>)
378000
38200 tensor(5.4554, device='cuda:0', grad_fn=<NllLossBackward0>)
379000
38300 tensor(5.3735, device='cuda:0', grad_fn=<NllLossBackward0>)
380000
38400 tensor(5.4297, device='cuda:0', grad_fn=<NllLossBackward0>)
381000
38500 tensor(5.4561, device='cuda:0', grad_fn=<NllLossBackward0>)
382000
38600 tensor(5.4118, device='cuda:0', grad_fn=<NllLossBackward0>)
383000
38700 tensor(5.3996, device='cuda:0', grad_fn=<NllLossBackward0>)
384000
38800 tensor(5.4825, device='cuda:0', grad_fn=<NllLossBackward0>)
385000
38900 tensor(5.5692, device='cuda:0', grad_fn=<NllLossBackward0>)
386000
39000 tensor(5.2573, device='cuda:0', grad_fn=<NllLossBackward0>)
387000
39100 tensor(5.4847, device='cuda:0', grad_fn=<NllLossBackward0>)
388000
39200 tensor(5.5802, device='cuda:0', grad_fn=<NllLossBackward0>)
389000
39300 tensor(5.3968, device='cuda:0', grad_fn=<NllLossBackward0>)
390000
39400 tensor(5.4666, device='cuda:0', grad_fn=<NllLossBackward0>)
391000
39500 tensor(5.5847, device='cuda:0', grad_fn=<NllLossBackward0>)
392000
39600 tensor(5.2648, device='cuda:0', grad_fn=<NllLossBackward0>)
393000
39700 tensor(5.2423, device='cuda:0', grad_fn=<NllLossBackward0>)
394000
39800 tensor(5.3731, device='cuda:0', grad_fn=<NllLossBackward0>)
395000
39900 tensor(5.2014, device='cuda:0', grad_fn=<NllLossBackward0>)
396000
40000 tensor(5.2903, device='cuda:0', grad_fn=<NllLossBackward0>)
397000
40100 tensor(5.3712, device='cuda:0', grad_fn=<NllLossBackward0>)
398000
40200 tensor(5.3557, device='cuda:0', grad_fn=<NllLossBackward0>)
399000
40300 tensor(5.4151, device='cuda:0', grad_fn=<NllLossBackward0>)
40400 tensor(5.4358, device='cuda:0', grad_fn=<NllLossBackward0>)
400000
40500 tensor(5.3498, device='cuda:0', grad_fn=<NllLossBackward0>)
401000
40600 tensor(5.4152, device='cuda:0', grad_fn=<NllLossBackward0>)
402000
40700 tensor(5.4551, device='cuda:0', grad_fn=<NllLossBackward0>)
403000
40800 tensor(5.4138, device='cuda:0', grad_fn=<NllLossBackward0>)
404000
40900 tensor(5.3628, device='cuda:0', grad_fn=<NllLossBackward0>)
405000
41000 tensor(5.4124, device='cuda:0', grad_fn=<NllLossBackward0>)
406000
41100 tensor(5.3750, device='cuda:0', grad_fn=<NllLossBackward0>)
407000
41200 tensor(5.2687, device='cuda:0', grad_fn=<NllLossBackward0>)
408000
41300 tensor(5.3987, device='cuda:0', grad_fn=<NllLossBackward0>)
409000
41400 tensor(5.2976, device='cuda:0', grad_fn=<NllLossBackward0>)
410000
41500 tensor(5.4418, device='cuda:0', grad_fn=<NllLossBackward0>)
411000
41600 tensor(5.3558, device='cuda:0', grad_fn=<NllLossBackward0>)
412000
41700 tensor(5.3767, device='cuda:0', grad_fn=<NllLossBackward0>)
413000
41800 tensor(5.3836, device='cuda:0', grad_fn=<NllLossBackward0>)
414000
41900 tensor(5.3904, device='cuda:0', grad_fn=<NllLossBackward0>)
415000
42000 tensor(5.5445, device='cuda:0', grad_fn=<NllLossBackward0>)
416000
42100 tensor(5.2890, device='cuda:0', grad_fn=<NllLossBackward0>)
417000
42200 tensor(5.3691, device='cuda:0', grad_fn=<NllLossBackward0>)
418000
42300 tensor(5.4364, device='cuda:0', grad_fn=<NllLossBackward0>)
419000
42400 tensor(5.2507, device='cuda:0', grad_fn=<NllLossBackward0>)
420000
42500 tensor(5.4215, device='cuda:0', grad_fn=<NllLossBackward0>)
421000
42600 tensor(5.2136, device='cuda:0', grad_fn=<NllLossBackward0>)
422000
42700 tensor(5.5296, device='cuda:0', grad_fn=<NllLossBackward0>)
423000
42800 tensor(5.4544, device='cuda:0', grad_fn=<NllLossBackward0>)
424000
42900 tensor(5.3009, device='cuda:0', grad_fn=<NllLossBackward0>)
425000
43000 tensor(5.4403, device='cuda:0', grad_fn=<NllLossBackward0>)
426000
43100 tensor(5.4384, device='cuda:0', grad_fn=<NllLossBackward0>)
427000
43200 tensor(5.2520, device='cuda:0', grad_fn=<NllLossBackward0>)
428000
43300 tensor(5.2945, device='cuda:0', grad_fn=<NllLossBackward0>)
429000
43400 tensor(5.4455, device='cuda:0', grad_fn=<NllLossBackward0>)
430000
43500 tensor(5.1633, device='cuda:0', grad_fn=<NllLossBackward0>)
431000
43600 tensor(5.3649, device='cuda:0', grad_fn=<NllLossBackward0>)
432000
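The raw cell output above (and below) interleaves two counters: every 100 batches the batch index is printed together with the current NLL loss tensor, and every 1,000 processed examples the running example count is printed on its own line; the epoch lines mark epoch boundaries. A minimal sketch of a loop with that logging pattern follows; the function name train_epochs, the batches iterable, and the 100/1,000 intervals are assumptions inferred from the log, not the notebook's actual training cell.

import torch

# Hypothetical sketch: a training loop whose logging matches the shape of the
# output above -- batch index plus current NLL loss every 100 batches, and a
# running example counter every 1000 examples.
def train_epochs(model, optimizer, batches, device='cuda:0', epochs=(2, 3, 4)):
    criterion = torch.nn.NLLLoss()
    for epoch in epochs:
        print('epoch:', epoch)
        seen = 0
        for step, (x, y) in enumerate(batches):
            x, y = x.to(device), y.to(device)
            optimizer.zero_grad()
            # model(x) is assumed to return probabilities, so take the log for NLLLoss
            loss = criterion(torch.log(model(x)), y)
            loss.backward()
            optimizer.step()
            if step % 100 == 0:
                print(step, loss)   # e.g. "17600 tensor(5.3426, device='cuda:0', grad_fn=<NllLossBackward0>)"
            seen += len(x)
            if seen % 1000 == 0:
                print(seen)         # e.g. "175000"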
epoch: 3
0 tensor(5.3427, device='cuda:0', grad_fn=<NllLossBackward0>)
1000
100 tensor(5.4180, device='cuda:0', grad_fn=<NllLossBackward0>)
200 tensor(5.2939, device='cuda:0', grad_fn=<NllLossBackward0>)
2000
300 tensor(5.3083, device='cuda:0', grad_fn=<NllLossBackward0>)
3000
400 tensor(5.3086, device='cuda:0', grad_fn=<NllLossBackward0>)
4000
500 tensor(5.4733, device='cuda:0', grad_fn=<NllLossBackward0>)
5000
600 tensor(5.2627, device='cuda:0', grad_fn=<NllLossBackward0>)
6000
700 tensor(5.5664, device='cuda:0', grad_fn=<NllLossBackward0>)
7000
800 tensor(5.1641, device='cuda:0', grad_fn=<NllLossBackward0>)
8000
900 tensor(5.4272, device='cuda:0', grad_fn=<NllLossBackward0>)
9000
1000 tensor(5.2926, device='cuda:0', grad_fn=<NllLossBackward0>)
10000
1100 tensor(5.4848, device='cuda:0', grad_fn=<NllLossBackward0>)
11000
1200 tensor(5.5283, device='cuda:0', grad_fn=<NllLossBackward0>)
12000
1300 tensor(5.4635, device='cuda:0', grad_fn=<NllLossBackward0>)
13000
1400 tensor(5.4590, device='cuda:0', grad_fn=<NllLossBackward0>)
14000
1500 tensor(5.5386, device='cuda:0', grad_fn=<NllLossBackward0>)
15000
1600 tensor(5.2150, device='cuda:0', grad_fn=<NllLossBackward0>)
16000
1700 tensor(5.3116, device='cuda:0', grad_fn=<NllLossBackward0>)
17000
1800 tensor(5.3130, device='cuda:0', grad_fn=<NllLossBackward0>)
18000
1900 tensor(5.2889, device='cuda:0', grad_fn=<NllLossBackward0>)
19000
2000 tensor(5.3574, device='cuda:0', grad_fn=<NllLossBackward0>)
20000
2100 tensor(5.4860, device='cuda:0', grad_fn=<NllLossBackward0>)
21000
2200 tensor(5.3206, device='cuda:0', grad_fn=<NllLossBackward0>)
22000
2300 tensor(5.3447, device='cuda:0', grad_fn=<NllLossBackward0>)
23000
2400 tensor(5.3333, device='cuda:0', grad_fn=<NllLossBackward0>)
24000
2500 tensor(5.3822, device='cuda:0', grad_fn=<NllLossBackward0>)
25000
2600 tensor(5.4039, device='cuda:0', grad_fn=<NllLossBackward0>)
26000
2700 tensor(5.4280, device='cuda:0', grad_fn=<NllLossBackward0>)
27000
2800 tensor(5.4575, device='cuda:0', grad_fn=<NllLossBackward0>)
28000
2900 tensor(5.5878, device='cuda:0', grad_fn=<NllLossBackward0>)
29000
3000 tensor(5.3311, device='cuda:0', grad_fn=<NllLossBackward0>)
30000
3100 tensor(5.4103, device='cuda:0', grad_fn=<NllLossBackward0>)
31000
3200 tensor(5.4323, device='cuda:0', grad_fn=<NllLossBackward0>)
32000
3300 tensor(5.3521, device='cuda:0', grad_fn=<NllLossBackward0>)
33000
3400 tensor(5.2512, device='cuda:0', grad_fn=<NllLossBackward0>)
34000
3500 tensor(5.3813, device='cuda:0', grad_fn=<NllLossBackward0>)
35000
3600 tensor(5.4000, device='cuda:0', grad_fn=<NllLossBackward0>)
36000
3700 tensor(5.3312, device='cuda:0', grad_fn=<NllLossBackward0>)
37000
3800 tensor(5.3553, device='cuda:0', grad_fn=<NllLossBackward0>)
38000
3900 tensor(5.2275, device='cuda:0', grad_fn=<NllLossBackward0>)
39000
4000 tensor(5.2883, device='cuda:0', grad_fn=<NllLossBackward0>)
40000
4100 tensor(5.4294, device='cuda:0', grad_fn=<NllLossBackward0>)
41000
4200 tensor(5.4801, device='cuda:0', grad_fn=<NllLossBackward0>)
42000
4300 tensor(5.3863, device='cuda:0', grad_fn=<NllLossBackward0>)
43000
4400 tensor(5.4470, device='cuda:0', grad_fn=<NllLossBackward0>)
44000
4500 tensor(5.2610, device='cuda:0', grad_fn=<NllLossBackward0>)
45000
4600 tensor(5.5962, device='cuda:0', grad_fn=<NllLossBackward0>)
46000
4700 tensor(5.3029, device='cuda:0', grad_fn=<NllLossBackward0>)
47000
4800 tensor(5.4265, device='cuda:0', grad_fn=<NllLossBackward0>)
48000
4900 tensor(5.4823, device='cuda:0', grad_fn=<NllLossBackward0>)
49000
5000 tensor(5.4749, device='cuda:0', grad_fn=<NllLossBackward0>)
50000
5100 tensor(5.5356, device='cuda:0', grad_fn=<NllLossBackward0>)
51000
5200 tensor(5.5513, device='cuda:0', grad_fn=<NllLossBackward0>)
52000
5300 tensor(5.5476, device='cuda:0', grad_fn=<NllLossBackward0>)
53000
5400 tensor(5.4039, device='cuda:0', grad_fn=<NllLossBackward0>)
54000
5500 tensor(5.5156, device='cuda:0', grad_fn=<NllLossBackward0>)
55000
5600 tensor(5.2975, device='cuda:0', grad_fn=<NllLossBackward0>)
56000
5700 tensor(5.5492, device='cuda:0', grad_fn=<NllLossBackward0>)
57000
5800 tensor(5.5379, device='cuda:0', grad_fn=<NllLossBackward0>)
58000
5900 tensor(5.4874, device='cuda:0', grad_fn=<NllLossBackward0>)
59000
6000 tensor(5.3808, device='cuda:0', grad_fn=<NllLossBackward0>)
60000
6100 tensor(5.3932, device='cuda:0', grad_fn=<NllLossBackward0>)
61000
6200 tensor(5.5657, device='cuda:0', grad_fn=<NllLossBackward0>)
62000
6300 tensor(5.4233, device='cuda:0', grad_fn=<NllLossBackward0>)
63000
6400 tensor(5.3438, device='cuda:0', grad_fn=<NllLossBackward0>)
64000
6500 tensor(5.2002, device='cuda:0', grad_fn=<NllLossBackward0>)
65000
6600 tensor(5.3774, device='cuda:0', grad_fn=<NllLossBackward0>)
66000
6700 tensor(5.3193, device='cuda:0', grad_fn=<NllLossBackward0>)
67000
6800 tensor(5.5394, device='cuda:0', grad_fn=<NllLossBackward0>)
68000
6900 tensor(5.5196, device='cuda:0', grad_fn=<NllLossBackward0>)
69000
7000 tensor(5.4282, device='cuda:0', grad_fn=<NllLossBackward0>)
70000
7100 tensor(5.2296, device='cuda:0', grad_fn=<NllLossBackward0>)
71000
7200 tensor(5.3175, device='cuda:0', grad_fn=<NllLossBackward0>)
72000
7300 tensor(5.5642, device='cuda:0', grad_fn=<NllLossBackward0>)
73000
7400 tensor(5.3784, device='cuda:0', grad_fn=<NllLossBackward0>)
74000
7500 tensor(5.2475, device='cuda:0', grad_fn=<NllLossBackward0>)
75000
7600 tensor(5.3194, device='cuda:0', grad_fn=<NllLossBackward0>)
76000
7700 tensor(5.3934, device='cuda:0', grad_fn=<NllLossBackward0>)
77000
7800 tensor(5.5041, device='cuda:0', grad_fn=<NllLossBackward0>)
78000
7900 tensor(5.1814, device='cuda:0', grad_fn=<NllLossBackward0>)
79000
8000 tensor(5.2426, device='cuda:0', grad_fn=<NllLossBackward0>)
80000
8100 tensor(5.4104, device='cuda:0', grad_fn=<NllLossBackward0>)
81000
8200 tensor(5.4198, device='cuda:0', grad_fn=<NllLossBackward0>)
82000
8300 tensor(5.3854, device='cuda:0', grad_fn=<NllLossBackward0>)
83000
8400 tensor(5.5128, device='cuda:0', grad_fn=<NllLossBackward0>)
84000
8500 tensor(5.4898, device='cuda:0', grad_fn=<NllLossBackward0>)
85000
8600 tensor(5.4943, device='cuda:0', grad_fn=<NllLossBackward0>)
86000
8700 tensor(5.6012, device='cuda:0', grad_fn=<NllLossBackward0>)
87000
8800 tensor(5.4790, device='cuda:0', grad_fn=<NllLossBackward0>)
88000
8900 tensor(5.3312, device='cuda:0', grad_fn=<NllLossBackward0>)
89000
9000 tensor(5.4456, device='cuda:0', grad_fn=<NllLossBackward0>)
90000
9100 tensor(5.4537, device='cuda:0', grad_fn=<NllLossBackward0>)
91000
9200 tensor(5.3643, device='cuda:0', grad_fn=<NllLossBackward0>)
92000
9300 tensor(5.4085, device='cuda:0', grad_fn=<NllLossBackward0>)
93000
9400 tensor(5.2527, device='cuda:0', grad_fn=<NllLossBackward0>)
94000
9500 tensor(5.3289, device='cuda:0', grad_fn=<NllLossBackward0>)
95000
9600 tensor(5.4516, device='cuda:0', grad_fn=<NllLossBackward0>)
96000
9700 tensor(5.3881, device='cuda:0', grad_fn=<NllLossBackward0>)
97000
9800 tensor(5.4321, device='cuda:0', grad_fn=<NllLossBackward0>)
9900 tensor(5.2532, device='cuda:0', grad_fn=<NllLossBackward0>)
98000
10000 tensor(5.4727, device='cuda:0', grad_fn=<NllLossBackward0>)
99000
10100 tensor(5.3607, device='cuda:0', grad_fn=<NllLossBackward0>)
100000
10200 tensor(5.2989, device='cuda:0', grad_fn=<NllLossBackward0>)
101000
10300 tensor(5.4168, device='cuda:0', grad_fn=<NllLossBackward0>)
102000
10400 tensor(5.4272, device='cuda:0', grad_fn=<NllLossBackward0>)
103000
10500 tensor(5.4838, device='cuda:0', grad_fn=<NllLossBackward0>)
104000
10600 tensor(5.5675, device='cuda:0', grad_fn=<NllLossBackward0>)
105000
10700 tensor(5.4027, device='cuda:0', grad_fn=<NllLossBackward0>)
106000
10800 tensor(5.4252, device='cuda:0', grad_fn=<NllLossBackward0>)
107000
10900 tensor(5.3408, device='cuda:0', grad_fn=<NllLossBackward0>)
108000
11000 tensor(5.5754, device='cuda:0', grad_fn=<NllLossBackward0>)
109000
11100 tensor(5.1920, device='cuda:0', grad_fn=<NllLossBackward0>)
110000
11200 tensor(5.3604, device='cuda:0', grad_fn=<NllLossBackward0>)
111000
11300 tensor(5.3836, device='cuda:0', grad_fn=<NllLossBackward0>)
112000
11400 tensor(5.3330, device='cuda:0', grad_fn=<NllLossBackward0>)
113000
11500 tensor(5.4023, device='cuda:0', grad_fn=<NllLossBackward0>)
114000
11600 tensor(5.3923, device='cuda:0', grad_fn=<NllLossBackward0>)
115000
11700 tensor(5.3145, device='cuda:0', grad_fn=<NllLossBackward0>)
116000
11800 tensor(5.5174, device='cuda:0', grad_fn=<NllLossBackward0>)
117000
11900 tensor(5.3522, device='cuda:0', grad_fn=<NllLossBackward0>)
118000
12000 tensor(5.4232, device='cuda:0', grad_fn=<NllLossBackward0>)
119000
12100 tensor(5.4382, device='cuda:0', grad_fn=<NllLossBackward0>)
120000
12200 tensor(5.4488, device='cuda:0', grad_fn=<NllLossBackward0>)
121000
12300 tensor(5.5409, device='cuda:0', grad_fn=<NllLossBackward0>)
122000
12400 tensor(5.4200, device='cuda:0', grad_fn=<NllLossBackward0>)
123000
12500 tensor(5.3292, device='cuda:0', grad_fn=<NllLossBackward0>)
124000
12600 tensor(5.3788, device='cuda:0', grad_fn=<NllLossBackward0>)
125000
12700 tensor(5.3116, device='cuda:0', grad_fn=<NllLossBackward0>)
126000
12800 tensor(5.4948, device='cuda:0', grad_fn=<NllLossBackward0>)
127000
12900 tensor(5.3557, device='cuda:0', grad_fn=<NllLossBackward0>)
128000
13000 tensor(5.1732, device='cuda:0', grad_fn=<NllLossBackward0>)
129000
13100 tensor(5.3782, device='cuda:0', grad_fn=<NllLossBackward0>)
130000
13200 tensor(5.4178, device='cuda:0', grad_fn=<NllLossBackward0>)
131000
13300 tensor(5.2929, device='cuda:0', grad_fn=<NllLossBackward0>)
132000
13400 tensor(5.3806, device='cuda:0', grad_fn=<NllLossBackward0>)
133000
13500 tensor(5.3394, device='cuda:0', grad_fn=<NllLossBackward0>)
134000
13600 tensor(5.4191, device='cuda:0', grad_fn=<NllLossBackward0>)
135000
13700 tensor(5.3856, device='cuda:0', grad_fn=<NllLossBackward0>)
136000
13800 tensor(5.3839, device='cuda:0', grad_fn=<NllLossBackward0>)
137000
13900 tensor(5.2391, device='cuda:0', grad_fn=<NllLossBackward0>)
138000
14000 tensor(5.4865, device='cuda:0', grad_fn=<NllLossBackward0>)
139000
14100 tensor(5.1952, device='cuda:0', grad_fn=<NllLossBackward0>)
140000
14200 tensor(5.4670, device='cuda:0', grad_fn=<NllLossBackward0>)
141000
14300 tensor(5.4385, device='cuda:0', grad_fn=<NllLossBackward0>)
142000
14400 tensor(5.3347, device='cuda:0', grad_fn=<NllLossBackward0>)
143000
14500 tensor(5.4370, device='cuda:0', grad_fn=<NllLossBackward0>)
144000
14600 tensor(5.4695, device='cuda:0', grad_fn=<NllLossBackward0>)
145000
14700 tensor(5.3453, device='cuda:0', grad_fn=<NllLossBackward0>)
146000
14800 tensor(5.7928, device='cuda:0', grad_fn=<NllLossBackward0>)
147000
14900 tensor(5.4451, device='cuda:0', grad_fn=<NllLossBackward0>)
148000
15000 tensor(5.3087, device='cuda:0', grad_fn=<NllLossBackward0>)
149000
15100 tensor(5.5241, device='cuda:0', grad_fn=<NllLossBackward0>)
150000
15200 tensor(5.3894, device='cuda:0', grad_fn=<NllLossBackward0>)
151000
15300 tensor(5.3809, device='cuda:0', grad_fn=<NllLossBackward0>)
152000
15400 tensor(5.2696, device='cuda:0', grad_fn=<NllLossBackward0>)
153000
15500 tensor(5.4343, device='cuda:0', grad_fn=<NllLossBackward0>)
154000
15600 tensor(5.4322, device='cuda:0', grad_fn=<NllLossBackward0>)
155000
15700 tensor(5.3296, device='cuda:0', grad_fn=<NllLossBackward0>)
156000
15800 tensor(5.2456, device='cuda:0', grad_fn=<NllLossBackward0>)
157000
15900 tensor(5.3806, device='cuda:0', grad_fn=<NllLossBackward0>)
158000
16000 tensor(5.2008, device='cuda:0', grad_fn=<NllLossBackward0>)
159000
16100 tensor(5.2489, device='cuda:0', grad_fn=<NllLossBackward0>)
160000
16200 tensor(5.5902, device='cuda:0', grad_fn=<NllLossBackward0>)
161000
16300 tensor(5.4159, device='cuda:0', grad_fn=<NllLossBackward0>)
162000
16400 tensor(5.3966, device='cuda:0', grad_fn=<NllLossBackward0>)
163000
16500 tensor(5.5113, device='cuda:0', grad_fn=<NllLossBackward0>)
164000
16600 tensor(5.3599, device='cuda:0', grad_fn=<NllLossBackward0>)
165000
16700 tensor(5.3372, device='cuda:0', grad_fn=<NllLossBackward0>)
166000
16800 tensor(5.4158, device='cuda:0', grad_fn=<NllLossBackward0>)
167000
16900 tensor(5.1788, device='cuda:0', grad_fn=<NllLossBackward0>)
168000
17000 tensor(5.4497, device='cuda:0', grad_fn=<NllLossBackward0>)
169000
17100 tensor(5.2981, device='cuda:0', grad_fn=<NllLossBackward0>)
170000
17200 tensor(5.4330, device='cuda:0', grad_fn=<NllLossBackward0>)
171000
17300 tensor(5.4495, device='cuda:0', grad_fn=<NllLossBackward0>)
172000
17400 tensor(5.2431, device='cuda:0', grad_fn=<NllLossBackward0>)
173000
17500 tensor(5.2652, device='cuda:0', grad_fn=<NllLossBackward0>)
174000
17600 tensor(5.3007, device='cuda:0', grad_fn=<NllLossBackward0>)
175000
17700 tensor(5.2852, device='cuda:0', grad_fn=<NllLossBackward0>)
176000
17800 tensor(5.3431, device='cuda:0', grad_fn=<NllLossBackward0>)
177000
17900 tensor(5.2395, device='cuda:0', grad_fn=<NllLossBackward0>)
178000
18000 tensor(5.4841, device='cuda:0', grad_fn=<NllLossBackward0>)
179000
18100 tensor(5.4218, device='cuda:0', grad_fn=<NllLossBackward0>)
180000
18200 tensor(5.3397, device='cuda:0', grad_fn=<NllLossBackward0>)
181000
18300 tensor(5.3426, device='cuda:0', grad_fn=<NllLossBackward0>)
182000
18400 tensor(5.3654, device='cuda:0', grad_fn=<NllLossBackward0>)
183000
18500 tensor(5.3484, device='cuda:0', grad_fn=<NllLossBackward0>)
184000
18600 tensor(5.5509, device='cuda:0', grad_fn=<NllLossBackward0>)
185000
18700 tensor(5.3702, device='cuda:0', grad_fn=<NllLossBackward0>)
186000
18800 tensor(5.5361, device='cuda:0', grad_fn=<NllLossBackward0>)
187000
18900 tensor(5.4132, device='cuda:0', grad_fn=<NllLossBackward0>)
188000
19000 tensor(5.4235, device='cuda:0', grad_fn=<NllLossBackward0>)
189000
19100 tensor(5.5318, device='cuda:0', grad_fn=<NllLossBackward0>)
190000
19200 tensor(5.4136, device='cuda:0', grad_fn=<NllLossBackward0>)
191000
19300 tensor(5.5053, device='cuda:0', grad_fn=<NllLossBackward0>)
192000
19400 tensor(5.3472, device='cuda:0', grad_fn=<NllLossBackward0>)
193000
19500 tensor(5.3511, device='cuda:0', grad_fn=<NllLossBackward0>)
194000
19600 tensor(5.3861, device='cuda:0', grad_fn=<NllLossBackward0>)
195000
19700 tensor(5.4345, device='cuda:0', grad_fn=<NllLossBackward0>)
196000
19800 tensor(5.3067, device='cuda:0', grad_fn=<NllLossBackward0>)
197000
19900 tensor(5.3079, device='cuda:0', grad_fn=<NllLossBackward0>)
198000
20000 tensor(5.3268, device='cuda:0', grad_fn=<NllLossBackward0>)
199000
20100 tensor(5.2668, device='cuda:0', grad_fn=<NllLossBackward0>)
200000
20200 tensor(5.1998, device='cuda:0', grad_fn=<NllLossBackward0>)
201000
20300 tensor(5.3105, device='cuda:0', grad_fn=<NllLossBackward0>)
20400 tensor(5.3584, device='cuda:0', grad_fn=<NllLossBackward0>)
202000
20500 tensor(5.3580, device='cuda:0', grad_fn=<NllLossBackward0>)
203000
20600 tensor(5.5528, device='cuda:0', grad_fn=<NllLossBackward0>)
204000
20700 tensor(5.3871, device='cuda:0', grad_fn=<NllLossBackward0>)
205000
20800 tensor(5.2208, device='cuda:0', grad_fn=<NllLossBackward0>)
206000
20900 tensor(5.5007, device='cuda:0', grad_fn=<NllLossBackward0>)
207000
21000 tensor(5.3396, device='cuda:0', grad_fn=<NllLossBackward0>)
208000
21100 tensor(5.3407, device='cuda:0', grad_fn=<NllLossBackward0>)
209000
21200 tensor(5.2243, device='cuda:0', grad_fn=<NllLossBackward0>)
210000
21300 tensor(5.4206, device='cuda:0', grad_fn=<NllLossBackward0>)
211000
21400 tensor(5.4574, device='cuda:0', grad_fn=<NllLossBackward0>)
212000
21500 tensor(5.2328, device='cuda:0', grad_fn=<NllLossBackward0>)
213000
21600 tensor(5.2233, device='cuda:0', grad_fn=<NllLossBackward0>)
214000
21700 tensor(5.2152, device='cuda:0', grad_fn=<NllLossBackward0>)
215000
21800 tensor(5.3497, device='cuda:0', grad_fn=<NllLossBackward0>)
216000
21900 tensor(5.3425, device='cuda:0', grad_fn=<NllLossBackward0>)
217000
22000 tensor(5.3277, device='cuda:0', grad_fn=<NllLossBackward0>)
218000
22100 tensor(5.2012, device='cuda:0', grad_fn=<NllLossBackward0>)
219000
22200 tensor(5.0736, device='cuda:0', grad_fn=<NllLossBackward0>)
220000
22300 tensor(5.5070, device='cuda:0', grad_fn=<NllLossBackward0>)
221000
22400 tensor(5.2190, device='cuda:0', grad_fn=<NllLossBackward0>)
222000
22500 tensor(5.2434, device='cuda:0', grad_fn=<NllLossBackward0>)
223000
22600 tensor(5.4325, device='cuda:0', grad_fn=<NllLossBackward0>)
224000
22700 tensor(5.1909, device='cuda:0', grad_fn=<NllLossBackward0>)
225000
22800 tensor(5.4576, device='cuda:0', grad_fn=<NllLossBackward0>)
226000
22900 tensor(5.5069, device='cuda:0', grad_fn=<NllLossBackward0>)
227000
23000 tensor(5.4041, device='cuda:0', grad_fn=<NllLossBackward0>)
228000
23100 tensor(5.3908, device='cuda:0', grad_fn=<NllLossBackward0>)
229000
23200 tensor(5.3866, device='cuda:0', grad_fn=<NllLossBackward0>)
230000
23300 tensor(5.4714, device='cuda:0', grad_fn=<NllLossBackward0>)
231000
23400 tensor(5.4781, device='cuda:0', grad_fn=<NllLossBackward0>)
232000
23500 tensor(5.3154, device='cuda:0', grad_fn=<NllLossBackward0>)
233000
23600 tensor(5.2854, device='cuda:0', grad_fn=<NllLossBackward0>)
234000
23700 tensor(5.3050, device='cuda:0', grad_fn=<NllLossBackward0>)
235000
23800 tensor(5.1721, device='cuda:0', grad_fn=<NllLossBackward0>)
236000
23900 tensor(5.2637, device='cuda:0', grad_fn=<NllLossBackward0>)
237000
24000 tensor(5.2519, device='cuda:0', grad_fn=<NllLossBackward0>)
238000
24100 tensor(5.3407, device='cuda:0', grad_fn=<NllLossBackward0>)
239000
24200 tensor(5.5137, device='cuda:0', grad_fn=<NllLossBackward0>)
240000
24300 tensor(5.4080, device='cuda:0', grad_fn=<NllLossBackward0>)
241000
24400 tensor(5.5379, device='cuda:0', grad_fn=<NllLossBackward0>)
242000
24500 tensor(5.3255, device='cuda:0', grad_fn=<NllLossBackward0>)
243000
24600 tensor(5.4515, device='cuda:0', grad_fn=<NllLossBackward0>)
244000
24700 tensor(5.3535, device='cuda:0', grad_fn=<NllLossBackward0>)
245000
24800 tensor(5.3935, device='cuda:0', grad_fn=<NllLossBackward0>)
246000
24900 tensor(5.4553, device='cuda:0', grad_fn=<NllLossBackward0>)
247000
25000 tensor(5.4708, device='cuda:0', grad_fn=<NllLossBackward0>)
248000
25100 tensor(5.3920, device='cuda:0', grad_fn=<NllLossBackward0>)
249000
25200 tensor(5.4083, device='cuda:0', grad_fn=<NllLossBackward0>)
250000
25300 tensor(5.4332, device='cuda:0', grad_fn=<NllLossBackward0>)
251000
25400 tensor(5.4136, device='cuda:0', grad_fn=<NllLossBackward0>)
252000
25500 tensor(5.3147, device='cuda:0', grad_fn=<NllLossBackward0>)
253000
25600 tensor(5.5860, device='cuda:0', grad_fn=<NllLossBackward0>)
254000
25700 tensor(5.3490, device='cuda:0', grad_fn=<NllLossBackward0>)
255000
25800 tensor(5.4464, device='cuda:0', grad_fn=<NllLossBackward0>)
256000
25900 tensor(5.3857, device='cuda:0', grad_fn=<NllLossBackward0>)
257000
26000 tensor(5.3893, device='cuda:0', grad_fn=<NllLossBackward0>)
258000
26100 tensor(5.3041, device='cuda:0', grad_fn=<NllLossBackward0>)
259000
26200 tensor(5.2321, device='cuda:0', grad_fn=<NllLossBackward0>)
260000
26300 tensor(5.4289, device='cuda:0', grad_fn=<NllLossBackward0>)
261000
26400 tensor(5.4663, device='cuda:0', grad_fn=<NllLossBackward0>)
262000
26500 tensor(5.1922, device='cuda:0', grad_fn=<NllLossBackward0>)
263000
26600 tensor(5.5283, device='cuda:0', grad_fn=<NllLossBackward0>)
264000
26700 tensor(5.3933, device='cuda:0', grad_fn=<NllLossBackward0>)
265000
26800 tensor(5.5680, device='cuda:0', grad_fn=<NllLossBackward0>)
266000
26900 tensor(5.3281, device='cuda:0', grad_fn=<NllLossBackward0>)
267000
27000 tensor(5.2408, device='cuda:0', grad_fn=<NllLossBackward0>)
268000
27100 tensor(5.2671, device='cuda:0', grad_fn=<NllLossBackward0>)
269000
27200 tensor(5.3099, device='cuda:0', grad_fn=<NllLossBackward0>)
270000
27300 tensor(5.5049, device='cuda:0', grad_fn=<NllLossBackward0>)
271000
27400 tensor(5.3850, device='cuda:0', grad_fn=<NllLossBackward0>)
272000
27500 tensor(5.2843, device='cuda:0', grad_fn=<NllLossBackward0>)
273000
27600 tensor(5.5777, device='cuda:0', grad_fn=<NllLossBackward0>)
274000
27700 tensor(5.4017, device='cuda:0', grad_fn=<NllLossBackward0>)
275000
27800 tensor(5.3994, device='cuda:0', grad_fn=<NllLossBackward0>)
276000
27900 tensor(5.5128, device='cuda:0', grad_fn=<NllLossBackward0>)
277000
28000 tensor(5.3708, device='cuda:0', grad_fn=<NllLossBackward0>)
278000
28100 tensor(5.3382, device='cuda:0', grad_fn=<NllLossBackward0>)
279000
28200 tensor(5.4996, device='cuda:0', grad_fn=<NllLossBackward0>)
280000
28300 tensor(5.1214, device='cuda:0', grad_fn=<NllLossBackward0>)
281000
28400 tensor(5.5647, device='cuda:0', grad_fn=<NllLossBackward0>)
282000
28500 tensor(5.3959, device='cuda:0', grad_fn=<NllLossBackward0>)
283000
28600 tensor(5.3312, device='cuda:0', grad_fn=<NllLossBackward0>)
284000
28700 tensor(5.4663, device='cuda:0', grad_fn=<NllLossBackward0>)
285000
28800 tensor(5.5155, device='cuda:0', grad_fn=<NllLossBackward0>)
286000
28900 tensor(5.3872, device='cuda:0', grad_fn=<NllLossBackward0>)
287000
29000 tensor(5.3017, device='cuda:0', grad_fn=<NllLossBackward0>)
288000
29100 tensor(5.0583, device='cuda:0', grad_fn=<NllLossBackward0>)
289000
29200 tensor(5.2099, device='cuda:0', grad_fn=<NllLossBackward0>)
290000
29300 tensor(5.4934, device='cuda:0', grad_fn=<NllLossBackward0>)
291000
29400 tensor(5.6202, device='cuda:0', grad_fn=<NllLossBackward0>)
292000
29500 tensor(5.4016, device='cuda:0', grad_fn=<NllLossBackward0>)
293000
29600 tensor(5.2601, device='cuda:0', grad_fn=<NllLossBackward0>)
294000
29700 tensor(5.4038, device='cuda:0', grad_fn=<NllLossBackward0>)
295000
29800 tensor(5.2475, device='cuda:0', grad_fn=<NllLossBackward0>)
296000
29900 tensor(5.4960, device='cuda:0', grad_fn=<NllLossBackward0>)
297000
30000 tensor(5.2438, device='cuda:0', grad_fn=<NllLossBackward0>)
298000
30100 tensor(5.3221, device='cuda:0', grad_fn=<NllLossBackward0>)
299000
30200 tensor(5.2686, device='cuda:0', grad_fn=<NllLossBackward0>)
300000
30300 tensor(5.3735, device='cuda:0', grad_fn=<NllLossBackward0>)
301000
30400 tensor(5.2057, device='cuda:0', grad_fn=<NllLossBackward0>)
302000
30500 tensor(5.3767, device='cuda:0', grad_fn=<NllLossBackward0>)
30600 tensor(5.3515, device='cuda:0', grad_fn=<NllLossBackward0>)
303000
30700 tensor(5.3841, device='cuda:0', grad_fn=<NllLossBackward0>)
304000
30800 tensor(5.3889, device='cuda:0', grad_fn=<NllLossBackward0>)
305000
30900 tensor(5.4117, device='cuda:0', grad_fn=<NllLossBackward0>)
306000
31000 tensor(5.5205, device='cuda:0', grad_fn=<NllLossBackward0>)
307000
31100 tensor(5.1742, device='cuda:0', grad_fn=<NllLossBackward0>)
308000
31200 tensor(5.2173, device='cuda:0', grad_fn=<NllLossBackward0>)
309000
31300 tensor(5.4785, device='cuda:0', grad_fn=<NllLossBackward0>)
310000
31400 tensor(5.2577, device='cuda:0', grad_fn=<NllLossBackward0>)
311000
31500 tensor(5.4429, device='cuda:0', grad_fn=<NllLossBackward0>)
312000
31600 tensor(5.4289, device='cuda:0', grad_fn=<NllLossBackward0>)
313000
31700 tensor(5.3961, device='cuda:0', grad_fn=<NllLossBackward0>)
314000
31800 tensor(5.4999, device='cuda:0', grad_fn=<NllLossBackward0>)
315000
31900 tensor(5.1248, device='cuda:0', grad_fn=<NllLossBackward0>)
316000
32000 tensor(5.3122, device='cuda:0', grad_fn=<NllLossBackward0>)
317000
32100 tensor(5.1931, device='cuda:0', grad_fn=<NllLossBackward0>)
318000
32200 tensor(5.5096, device='cuda:0', grad_fn=<NllLossBackward0>)
319000
32300 tensor(5.4973, device='cuda:0', grad_fn=<NllLossBackward0>)
320000
32400 tensor(5.4742, device='cuda:0', grad_fn=<NllLossBackward0>)
321000
32500 tensor(5.2964, device='cuda:0', grad_fn=<NllLossBackward0>)
322000
32600 tensor(5.4063, device='cuda:0', grad_fn=<NllLossBackward0>)
323000
32700 tensor(5.3369, device='cuda:0', grad_fn=<NllLossBackward0>)
324000
32800 tensor(5.5636, device='cuda:0', grad_fn=<NllLossBackward0>)
325000
32900 tensor(5.4245, device='cuda:0', grad_fn=<NllLossBackward0>)
326000
33000 tensor(5.2032, device='cuda:0', grad_fn=<NllLossBackward0>)
327000
33100 tensor(5.4095, device='cuda:0', grad_fn=<NllLossBackward0>)
328000
33200 tensor(5.5071, device='cuda:0', grad_fn=<NllLossBackward0>)
329000
33300 tensor(5.2729, device='cuda:0', grad_fn=<NllLossBackward0>)
330000
33400 tensor(5.5492, device='cuda:0', grad_fn=<NllLossBackward0>)
331000
33500 tensor(5.3701, device='cuda:0', grad_fn=<NllLossBackward0>)
332000
33600 tensor(5.3223, device='cuda:0', grad_fn=<NllLossBackward0>)
333000
33700 tensor(5.3725, device='cuda:0', grad_fn=<NllLossBackward0>)
334000
33800 tensor(5.4572, device='cuda:0', grad_fn=<NllLossBackward0>)
335000
33900 tensor(5.1889, device='cuda:0', grad_fn=<NllLossBackward0>)
336000
34000 tensor(5.4090, device='cuda:0', grad_fn=<NllLossBackward0>)
337000
34100 tensor(5.3798, device='cuda:0', grad_fn=<NllLossBackward0>)
338000
34200 tensor(5.4259, device='cuda:0', grad_fn=<NllLossBackward0>)
339000
34300 tensor(5.2132, device='cuda:0', grad_fn=<NllLossBackward0>)
340000
34400 tensor(5.6692, device='cuda:0', grad_fn=<NllLossBackward0>)
341000
34500 tensor(5.5324, device='cuda:0', grad_fn=<NllLossBackward0>)
342000
34600 tensor(5.4271, device='cuda:0', grad_fn=<NllLossBackward0>)
343000
34700 tensor(5.4978, device='cuda:0', grad_fn=<NllLossBackward0>)
344000
34800 tensor(5.5230, device='cuda:0', grad_fn=<NllLossBackward0>)
345000
34900 tensor(5.5652, device='cuda:0', grad_fn=<NllLossBackward0>)
346000
35000 tensor(5.5478, device='cuda:0', grad_fn=<NllLossBackward0>)
347000
35100 tensor(5.3700, device='cuda:0', grad_fn=<NllLossBackward0>)
348000
35200 tensor(5.2958, device='cuda:0', grad_fn=<NllLossBackward0>)
349000
35300 tensor(5.5219, device='cuda:0', grad_fn=<NllLossBackward0>)
350000
35400 tensor(5.1702, device='cuda:0', grad_fn=<NllLossBackward0>)
351000
35500 tensor(5.2604, device='cuda:0', grad_fn=<NllLossBackward0>)
352000
35600 tensor(5.3821, device='cuda:0', grad_fn=<NllLossBackward0>)
353000
35700 tensor(5.2551, device='cuda:0', grad_fn=<NllLossBackward0>)
354000
35800 tensor(5.3840, device='cuda:0', grad_fn=<NllLossBackward0>)
355000
35900 tensor(5.3635, device='cuda:0', grad_fn=<NllLossBackward0>)
356000
36000 tensor(5.1400, device='cuda:0', grad_fn=<NllLossBackward0>)
357000
36100 tensor(5.5134, device='cuda:0', grad_fn=<NllLossBackward0>)
358000
36200 tensor(5.3632, device='cuda:0', grad_fn=<NllLossBackward0>)
359000
36300 tensor(5.6461, device='cuda:0', grad_fn=<NllLossBackward0>)
360000
36400 tensor(5.3415, device='cuda:0', grad_fn=<NllLossBackward0>)
361000
36500 tensor(5.3659, device='cuda:0', grad_fn=<NllLossBackward0>)
362000
36600 tensor(5.3874, device='cuda:0', grad_fn=<NllLossBackward0>)
363000
36700 tensor(5.1886, device='cuda:0', grad_fn=<NllLossBackward0>)
364000
36800 tensor(5.2958, device='cuda:0', grad_fn=<NllLossBackward0>)
365000
36900 tensor(5.4094, device='cuda:0', grad_fn=<NllLossBackward0>)
366000
37000 tensor(5.3023, device='cuda:0', grad_fn=<NllLossBackward0>)
367000
37100 tensor(5.3287, device='cuda:0', grad_fn=<NllLossBackward0>)
368000
37200 tensor(5.3996, device='cuda:0', grad_fn=<NllLossBackward0>)
369000
37300 tensor(5.3001, device='cuda:0', grad_fn=<NllLossBackward0>)
370000
37400 tensor(5.6516, device='cuda:0', grad_fn=<NllLossBackward0>)
371000
37500 tensor(5.3366, device='cuda:0', grad_fn=<NllLossBackward0>)
372000
37600 tensor(5.3282, device='cuda:0', grad_fn=<NllLossBackward0>)
373000
37700 tensor(5.5061, device='cuda:0', grad_fn=<NllLossBackward0>)
374000
37800 tensor(5.3408, device='cuda:0', grad_fn=<NllLossBackward0>)
375000
37900 tensor(5.3203, device='cuda:0', grad_fn=<NllLossBackward0>)
376000
38000 tensor(5.3996, device='cuda:0', grad_fn=<NllLossBackward0>)
377000
38100 tensor(5.4133, device='cuda:0', grad_fn=<NllLossBackward0>)
378000
38200 tensor(5.4262, device='cuda:0', grad_fn=<NllLossBackward0>)
379000
38300 tensor(5.3305, device='cuda:0', grad_fn=<NllLossBackward0>)
380000
38400 tensor(5.3983, device='cuda:0', grad_fn=<NllLossBackward0>)
381000
38500 tensor(5.4246, device='cuda:0', grad_fn=<NllLossBackward0>)
382000
38600 tensor(5.3713, device='cuda:0', grad_fn=<NllLossBackward0>)
383000
38700 tensor(5.3634, device='cuda:0', grad_fn=<NllLossBackward0>)
384000
38800 tensor(5.4504, device='cuda:0', grad_fn=<NllLossBackward0>)
385000
38900 tensor(5.5273, device='cuda:0', grad_fn=<NllLossBackward0>)
386000
39000 tensor(5.2229, device='cuda:0', grad_fn=<NllLossBackward0>)
387000
39100 tensor(5.4503, device='cuda:0', grad_fn=<NllLossBackward0>)
388000
39200 tensor(5.5406, device='cuda:0', grad_fn=<NllLossBackward0>)
389000
39300 tensor(5.3640, device='cuda:0', grad_fn=<NllLossBackward0>)
390000
39400 tensor(5.4311, device='cuda:0', grad_fn=<NllLossBackward0>)
391000
39500 tensor(5.5292, device='cuda:0', grad_fn=<NllLossBackward0>)
392000
39600 tensor(5.2217, device='cuda:0', grad_fn=<NllLossBackward0>)
393000
39700 tensor(5.2121, device='cuda:0', grad_fn=<NllLossBackward0>)
394000
39800 tensor(5.3415, device='cuda:0', grad_fn=<NllLossBackward0>)
395000
39900 tensor(5.1605, device='cuda:0', grad_fn=<NllLossBackward0>)
396000
40000 tensor(5.2472, device='cuda:0', grad_fn=<NllLossBackward0>)
397000
40100 tensor(5.3351, device='cuda:0', grad_fn=<NllLossBackward0>)
398000
40200 tensor(5.3198, device='cuda:0', grad_fn=<NllLossBackward0>)
399000
40300 tensor(5.3862, device='cuda:0', grad_fn=<NllLossBackward0>)
40400 tensor(5.3946, device='cuda:0', grad_fn=<NllLossBackward0>)
400000
40500 tensor(5.3120, device='cuda:0', grad_fn=<NllLossBackward0>)
401000
40600 tensor(5.3741, device='cuda:0', grad_fn=<NllLossBackward0>)
402000
40700 tensor(5.4199, device='cuda:0', grad_fn=<NllLossBackward0>)
403000
40800 tensor(5.3702, device='cuda:0', grad_fn=<NllLossBackward0>)
404000
40900 tensor(5.3212, device='cuda:0', grad_fn=<NllLossBackward0>)
405000
41000 tensor(5.3683, device='cuda:0', grad_fn=<NllLossBackward0>)
406000
41100 tensor(5.3491, device='cuda:0', grad_fn=<NllLossBackward0>)
407000
41200 tensor(5.2400, device='cuda:0', grad_fn=<NllLossBackward0>)
408000
41300 tensor(5.3728, device='cuda:0', grad_fn=<NllLossBackward0>)
409000
41400 tensor(5.2643, device='cuda:0', grad_fn=<NllLossBackward0>)
410000
41500 tensor(5.4064, device='cuda:0', grad_fn=<NllLossBackward0>)
411000
41600 tensor(5.3238, device='cuda:0', grad_fn=<NllLossBackward0>)
412000
41700 tensor(5.3469, device='cuda:0', grad_fn=<NllLossBackward0>)
413000
41800 tensor(5.3432, device='cuda:0', grad_fn=<NllLossBackward0>)
414000
41900 tensor(5.3521, device='cuda:0', grad_fn=<NllLossBackward0>)
415000
42000 tensor(5.5087, device='cuda:0', grad_fn=<NllLossBackward0>)
416000
42100 tensor(5.2556, device='cuda:0', grad_fn=<NllLossBackward0>)
417000
42200 tensor(5.3407, device='cuda:0', grad_fn=<NllLossBackward0>)
418000
42300 tensor(5.4058, device='cuda:0', grad_fn=<NllLossBackward0>)
419000
42400 tensor(5.2231, device='cuda:0', grad_fn=<NllLossBackward0>)
420000
42500 tensor(5.3912, device='cuda:0', grad_fn=<NllLossBackward0>)
421000
42600 tensor(5.1878, device='cuda:0', grad_fn=<NllLossBackward0>)
422000
42700 tensor(5.4955, device='cuda:0', grad_fn=<NllLossBackward0>)
423000
42800 tensor(5.4193, device='cuda:0', grad_fn=<NllLossBackward0>)
424000
42900 tensor(5.2662, device='cuda:0', grad_fn=<NllLossBackward0>)
425000
43000 tensor(5.4093, device='cuda:0', grad_fn=<NllLossBackward0>)
426000
43100 tensor(5.4089, device='cuda:0', grad_fn=<NllLossBackward0>)
427000
43200 tensor(5.2223, device='cuda:0', grad_fn=<NllLossBackward0>)
428000
43300 tensor(5.2456, device='cuda:0', grad_fn=<NllLossBackward0>)
429000
43400 tensor(5.4129, device='cuda:0', grad_fn=<NllLossBackward0>)
430000
43500 tensor(5.1283, device='cuda:0', grad_fn=<NllLossBackward0>)
431000
43600 tensor(5.3275, device='cuda:0', grad_fn=<NllLossBackward0>)
432000
epoch: 4
0 tensor(5.3172, device='cuda:0', grad_fn=<NllLossBackward0>)
1000
100 tensor(5.3864, device='cuda:0', grad_fn=<NllLossBackward0>)
200 tensor(5.2618, device='cuda:0', grad_fn=<NllLossBackward0>)
2000
300 tensor(5.2652, device='cuda:0', grad_fn=<NllLossBackward0>)
3000
400 tensor(5.2749, device='cuda:0', grad_fn=<NllLossBackward0>)
4000
500 tensor(5.4347, device='cuda:0', grad_fn=<NllLossBackward0>)
5000
600 tensor(5.2271, device='cuda:0', grad_fn=<NllLossBackward0>)
6000
700 tensor(5.5396, device='cuda:0', grad_fn=<NllLossBackward0>)
7000
800 tensor(5.1379, device='cuda:0', grad_fn=<NllLossBackward0>)
8000
900 tensor(5.3861, device='cuda:0', grad_fn=<NllLossBackward0>)
9000
1000 tensor(5.2629, device='cuda:0', grad_fn=<NllLossBackward0>)
10000
1100 tensor(5.4575, device='cuda:0', grad_fn=<NllLossBackward0>)
11000
1200 tensor(5.4936, device='cuda:0', grad_fn=<NllLossBackward0>)
12000
1300 tensor(5.4281, device='cuda:0', grad_fn=<NllLossBackward0>)
13000
1400 tensor(5.4186, device='cuda:0', grad_fn=<NllLossBackward0>)
14000
1500 tensor(5.5070, device='cuda:0', grad_fn=<NllLossBackward0>)
15000
1600 tensor(5.1769, device='cuda:0', grad_fn=<NllLossBackward0>)
16000
1700 tensor(5.2856, device='cuda:0', grad_fn=<NllLossBackward0>)
17000
1800 tensor(5.2827, device='cuda:0', grad_fn=<NllLossBackward0>)
18000
1900 tensor(5.2544, device='cuda:0', grad_fn=<NllLossBackward0>)
19000
2000 tensor(5.3218, device='cuda:0', grad_fn=<NllLossBackward0>)
20000
2100 tensor(5.4549, device='cuda:0', grad_fn=<NllLossBackward0>)
21000
2200 tensor(5.2864, device='cuda:0', grad_fn=<NllLossBackward0>)
22000
2300 tensor(5.3145, device='cuda:0', grad_fn=<NllLossBackward0>)
23000
2400 tensor(5.2987, device='cuda:0', grad_fn=<NllLossBackward0>)
24000
2500 tensor(5.3498, device='cuda:0', grad_fn=<NllLossBackward0>)
25000
2600 tensor(5.3730, device='cuda:0', grad_fn=<NllLossBackward0>)
26000
2700 tensor(5.4017, device='cuda:0', grad_fn=<NllLossBackward0>)
27000
2800 tensor(5.4255, device='cuda:0', grad_fn=<NllLossBackward0>)
28000
2900 tensor(5.5475, device='cuda:0', grad_fn=<NllLossBackward0>)
29000
3000 tensor(5.2988, device='cuda:0', grad_fn=<NllLossBackward0>)
30000
3100 tensor(5.3753, device='cuda:0', grad_fn=<NllLossBackward0>)
31000
3200 tensor(5.4049, device='cuda:0', grad_fn=<NllLossBackward0>)
32000
3300 tensor(5.3206, device='cuda:0', grad_fn=<NllLossBackward0>)
33000
3400 tensor(5.2159, device='cuda:0', grad_fn=<NllLossBackward0>)
34000
3500 tensor(5.3423, device='cuda:0', grad_fn=<NllLossBackward0>)
35000
3600 tensor(5.3717, device='cuda:0', grad_fn=<NllLossBackward0>)
36000
3700 tensor(5.3042, device='cuda:0', grad_fn=<NllLossBackward0>)
37000
3800 tensor(5.3258, device='cuda:0', grad_fn=<NllLossBackward0>)
38000
3900 tensor(5.1989, device='cuda:0', grad_fn=<NllLossBackward0>)
39000
4000 tensor(5.2650, device='cuda:0', grad_fn=<NllLossBackward0>)
40000
4100 tensor(5.3953, device='cuda:0', grad_fn=<NllLossBackward0>)
41000
4200 tensor(5.4542, device='cuda:0', grad_fn=<NllLossBackward0>)
42000
4300 tensor(5.3466, device='cuda:0', grad_fn=<NllLossBackward0>)
43000
4400 tensor(5.4222, device='cuda:0', grad_fn=<NllLossBackward0>)
44000
4500 tensor(5.2254, device='cuda:0', grad_fn=<NllLossBackward0>)
45000
4600 tensor(5.5610, device='cuda:0', grad_fn=<NllLossBackward0>)
46000
4700 tensor(5.2753, device='cuda:0', grad_fn=<NllLossBackward0>)
47000
4800 tensor(5.4028, device='cuda:0', grad_fn=<NllLossBackward0>)
48000
4900 tensor(5.4516, device='cuda:0', grad_fn=<NllLossBackward0>)
49000
5000 tensor(5.4464, device='cuda:0', grad_fn=<NllLossBackward0>)
50000
5100 tensor(5.5018, device='cuda:0', grad_fn=<NllLossBackward0>)
51000
5200 tensor(5.5194, device='cuda:0', grad_fn=<NllLossBackward0>)
52000
5300 tensor(5.5077, device='cuda:0', grad_fn=<NllLossBackward0>)
53000
5400 tensor(5.3746, device='cuda:0', grad_fn=<NllLossBackward0>)
54000
5500 tensor(5.4847, device='cuda:0', grad_fn=<NllLossBackward0>)
55000
5600 tensor(5.2664, device='cuda:0', grad_fn=<NllLossBackward0>)
56000
5700 tensor(5.5265, device='cuda:0', grad_fn=<NllLossBackward0>)
57000
5800 tensor(5.5101, device='cuda:0', grad_fn=<NllLossBackward0>)
58000
5900 tensor(5.4513, device='cuda:0', grad_fn=<NllLossBackward0>)
59000
6000 tensor(5.3554, device='cuda:0', grad_fn=<NllLossBackward0>)
60000
6100 tensor(5.3616, device='cuda:0', grad_fn=<NllLossBackward0>)
61000
6200 tensor(5.5360, device='cuda:0', grad_fn=<NllLossBackward0>)
62000
6300 tensor(5.3952, device='cuda:0', grad_fn=<NllLossBackward0>)
63000
6400 tensor(5.3132, device='cuda:0', grad_fn=<NllLossBackward0>)
64000
6500 tensor(5.1732, device='cuda:0', grad_fn=<NllLossBackward0>)
65000
6600 tensor(5.3505, device='cuda:0', grad_fn=<NllLossBackward0>)
66000
6700 tensor(5.2919, device='cuda:0', grad_fn=<NllLossBackward0>)
67000
6800 tensor(5.5064, device='cuda:0', grad_fn=<NllLossBackward0>)
68000
6900 tensor(5.4881, device='cuda:0', grad_fn=<NllLossBackward0>)
69000
7000 tensor(5.3978, device='cuda:0', grad_fn=<NllLossBackward0>)
70000
7100 tensor(5.2030, device='cuda:0', grad_fn=<NllLossBackward0>)
71000
7200 tensor(5.2738, device='cuda:0', grad_fn=<NllLossBackward0>)
72000
7300 tensor(5.5317, device='cuda:0', grad_fn=<NllLossBackward0>)
73000
7400 tensor(5.3487, device='cuda:0', grad_fn=<NllLossBackward0>)
74000
7500 tensor(5.2133, device='cuda:0', grad_fn=<NllLossBackward0>)
75000
7600 tensor(5.2878, device='cuda:0', grad_fn=<NllLossBackward0>)
76000
7700 tensor(5.3644, device='cuda:0', grad_fn=<NllLossBackward0>)
77000
7800 tensor(5.4711, device='cuda:0', grad_fn=<NllLossBackward0>)
78000
7900 tensor(5.1445, device='cuda:0', grad_fn=<NllLossBackward0>)
79000
8000 tensor(5.2138, device='cuda:0', grad_fn=<NllLossBackward0>)
80000
8100 tensor(5.3741, device='cuda:0', grad_fn=<NllLossBackward0>)
81000
8200 tensor(5.3893, device='cuda:0', grad_fn=<NllLossBackward0>)
82000
8300 tensor(5.3492, device='cuda:0', grad_fn=<NllLossBackward0>)
83000
8400 tensor(5.4797, device='cuda:0', grad_fn=<NllLossBackward0>)
84000
8500 tensor(5.4501, device='cuda:0', grad_fn=<NllLossBackward0>)
85000
8600 tensor(5.4600, device='cuda:0', grad_fn=<NllLossBackward0>)
86000
[Training log truncated: the loop prints the step index and the current cross-entropy loss every 100 steps, interleaved with a running example counter. Between step 8700 and step 43600 the loss oscillates roughly between 5.0 and 5.8 with no clear downward trend (e.g. 5.58 at step 8700, 5.14 at 13000, 5.05 at 22200, 5.30 at 43600).]
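For orientation, a cross-entropy of about 5.3 nats corresponds to a perplexity of roughly exp(5.3) ≈ 200. A small sketch of that conversion (not part of the original training loop) could be:

import math

# Hypothetical helper: convert the logged cross-entropy (in nats) to perplexity.
def perplexity(nll: float) -> float:
    return math.exp(nll)

print(perplexity(5.3))  # ~200 - the model still spreads its probability mass widely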
device = 'cuda'
torch.cuda.empty_cache()
# Reload the trained bigram model from disk and switch it to inference mode.
model = SimpleBigramNeuralLanguageModel(vocab_size, embed_size).to(device)
model.load_state_dict(torch.load('model-bigram_final.bin'))
model.eval()

# Predict the 10 most probable successors of the word 'will'.
ixs = torch.tensor(vocab.forward(['will'])).to(device)

out = model(ixs)
top = torch.topk(out[0], 10)
top_indices = top.indices.tolist()
top_probs = top.values.tolist()
top_words = vocab.lookup_tokens(top_indices)
list(zip(top_words, top_indices, top_probs))
[('be', 11, 0.2570849657058716),
 ('<unk>', 0, 0.07411641627550125),
 ('not', 22, 0.05940083786845207),
 ('have', 28, 0.02751326560974121),
 ('bo', 167, 0.014936885796487331),
 ('make', 116, 0.013943656347692013),
 ('give', 193, 0.011286991648375988),
 ('take', 153, 0.011171611957252026),
 ('do', 86, 0.010088067501783371),
 ('he', 20, 0.009703895077109337)]
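SimpleBigramNeuralLanguageModel and vocab are defined in earlier cells of the notebook. A minimal sketch consistent with how the model is used here (the first layer of model.model is the embedding matrix accessed further below, and the forward pass returns a softmax distribution over the vocabulary) might look like this; it is an assumption, not the author's exact definition:

import torch
from torch import nn

class SimpleBigramNeuralLanguageModel(nn.Module):
    # Sketch only - the real definition appears earlier in the notebook.
    def __init__(self, vocabulary_size, embedding_size):
        super().__init__()
        self.model = nn.Sequential(
            nn.Embedding(vocabulary_size, embedding_size),  # model.model[0].weight is used later
            nn.Linear(embedding_size, vocabulary_size),
            nn.Softmax(dim=1),                              # outputs are already probabilities
        )

    def forward(self, x):
        return self.model(x)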
# Same query for the fragment 'cerned.', which is likely out of vocabulary.
vocab = train_dataset.vocab
ixs = torch.tensor(vocab.forward(['cerned.'])).to(device)

out = model(ixs)
top = torch.topk(out[0], 10)
top_indices = top.indices.tolist()
top_probs = top.values.tolist()
top_words = vocab.lookup_tokens(top_indices)
list(zip(top_words, top_indices, top_probs))
[('<unk>', 0, 0.19996878504753113),
 ('and', 3, 0.05288130044937134),
 ('of', 2, 0.042051784694194794),
 ('the', 1, 0.026572922244668007),
 ('to', 4, 0.022689413279294968),
 ('in', 6, 0.015904497355222702),
 ('The', 17, 0.012827681377530098),
 ('a', 5, 0.00961760152131319),
 ('for', 8, 0.008938422426581383),
 ('</s>', 32, 0.00840282253921032)]
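The fragment 'cerned.' is almost certainly not in the vocabulary, so vocab maps it to the default '<unk>' index and the model falls back to generic high-frequency continuations, as seen above. A quick check (hypothetical, not in the original notebook) would be:

# If 'cerned.' is out of vocabulary, vocab.forward maps it to the default index,
# which explains the generic predictions above.
print(vocab.forward(['cerned.']))                        # e.g. [0]
print(vocab.lookup_tokens(vocab.forward(['cerned.'])))   # e.g. ['<unk>']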
# Find the words whose embeddings are closest (by cosine similarity) to the embedding of 'cerned.'.
cos = nn.CosineSimilarity(dim=1, eps=1e-6)

embeddings = model.model[0].weight  # embedding matrix: first layer of the Sequential

vec = embeddings[vocab['cerned.']]

similarities = cos(vec, embeddings)

top = torch.topk(similarities, 10)

top_indices = top.indices.tolist()
top_probs = top.values.tolist()
top_words = vocab.lookup_tokens(top_indices)
list(zip(top_words, top_indices, top_probs))
[('<unk>', 0, 1.0),
 ('particular,', 14538, 0.24527804553508759),
 ('revolution.', 20446, 0.23776617646217346),
 ('Territory.', 14189, 0.23417341709136963),
 ('or-', 2261, 0.22888363897800446),
 ('3', 479, 0.2288265973329544),
 ('speak.', 13722, 0.2252315878868103),
 ('attend.', 19397, 0.22110989689826965),
 ('say,', 1455, 0.22106117010116577),
 ('Lee.', 15326, 0.21764159202575684)]
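The same nearest-neighbour lookup can be wrapped in a small helper. This is a sketch based on the cell above (most_similar is not defined anywhere in the notebook) and relies on model, vocab, nn and torch already being in scope:

def most_similar(word, k=10):
    # Cosine similarity between the embedding of `word` and every row of the embedding matrix.
    cos = nn.CosineSimilarity(dim=1, eps=1e-6)
    embeddings = model.model[0].weight
    similarities = cos(embeddings[vocab[word]], embeddings)
    top = torch.topk(similarities, k)
    return list(zip(vocab.lookup_tokens(top.indices.tolist()), top.values.tolist()))

most_similar('will')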
def get_values_from_model(presc_word, model, vocab, k):
    # Return the k most probable successors of presc_word as (word, probability) pairs.
    ixs = torch.tensor(vocab.forward([presc_word])).to(device)
    out = model(ixs)
    top = torch.topk(out[0], k)
    top_indices = top.indices.tolist()
    top_probs = top.values.tolist()
    top_words = vocab.lookup_tokens(top_indices)
    return list(zip(top_words, top_probs))

def gonito_format(dic):
    # Serialize the predictions as tab-separated word:probability pairs,
    # ending with ':<remaining mass>' for the wildcard.
    tab = summarize_probs_unk(dic)
    result = ''
    for element in tab[:-1]:
        result += str(element[0]) + ':' + str(element[1]) + '\t'
    result += ':' + str(tab[-1][1]) + '\n'
    return result

def summarize_probs_unk(dic):
    # Renormalize the probabilities and move '<unk>' (the wildcard) to the end of the list.
    if '<unk>' in dic.keys():
        probsum = sum(float(val) for key, val in dic.items())
        for key in dic:
            dic[key] = dic[key] / probsum
        wildcard = dic['<unk>']
        del dic['<unk>']
        tab = [(key, val) for key, val in dic.items()]
        tab.append(('<unk>', wildcard))
    else:
        probsum = sum(float(val) for key, val in dic.items())
        for key in dic:
            # wildcard_minweight is defined earlier in the notebook; the larger denominator
            # reserves some probability mass for the wildcard.
            dic[key] = dic[key] / (probsum * (1 + wildcard_minweight))
        tab = [(key, val) for key, val in dic.items()]
        tab.append(('<unk>', 1 - sum([val for val in dic.values()])))
    return tab
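For example, on a toy prediction dictionary (the values below are invented purely for illustration), the helpers renormalize the probabilities and serialize the line as follows:

# Hypothetical example (values made up):
example = {'be': 0.25, 'not': 0.06, '<unk>': 0.07}
print(summarize_probs_unk(dict(example)))
# -> [('be', 0.657...), ('not', 0.157...), ('<unk>', 0.184...)]
print(gonito_format(dict(example)))
# -> 'be:0.657...\tnot:0.157...\t:0.184...\n'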

model.load_state_dict(torch.load('model-bigram_final.bin'))
<All keys matched successfully>

# Generate predictions for the test set: for every line, take the last word of the left
# context and write the model's top-k continuations in the gonito format.
with lzma.open(test_file, 'rt') as file:
    predict_words = []
    results = []
    for line in file:
        line = preprocess(line)  # keep only the relevant text fields
        split = line.split('\t')
        predict_words.append(get_last_word(split[0]))  # last word of the left context
    vocab = train_dataset.vocab
    for presc_word in predict_words:
        results.append(dict(get_values_from_model(presc_word, model, vocab, k=k)))
    with open(out_file, 'w') as outfile:
        for elem in results:
            outfile.write(gonito_format(elem))
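To verify the result, one could read back the first emitted line (a hypothetical sanity check, not in the original notebook); each line should contain tab-separated word:probability pairs ending with the wildcard mass after a bare colon, as produced by gonito_format:

# Hypothetical sanity check: inspect the first prediction line written to out_file.
with open(out_file) as f:
    print(f.readline().rstrip('\n'))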