!pip install torchtext
Defaulting to user installation because normal site-packages is not writeable
Collecting torchtext
  Downloading torchtext-0.15.2-cp310-cp310-manylinux1_x86_64.whl (2.0 MB)
...
Successfully installed cmake-3.26.3 filelock-3.12.0 lit-16.0.5 mpmath-1.3.0 networkx-3.1 nvidia-cublas-cu11-11.10.3.66 nvidia-cuda-cupti-cu11-11.7.101 nvidia-cuda-nvrtc-cu11-11.7.99 nvidia-cuda-runtime-cu11-11.7.99 nvidia-cudnn-cu11-8.5.0.96 nvidia-cufft-cu11-10.9.0.58 nvidia-curand-cu11-10.2.10.91 nvidia-cusolver-cu11-11.4.0.1 nvidia-cusparse-cu11-11.7.4.91 nvidia-nccl-cu11-2.14.3 nvidia-nvtx-cu11-11.7.91 sympy-1.12 torch-2.0.1 torchdata-0.6.1 torchtext-0.15.2 tqdm-4.65.0 triton-2.0.0 typing-extensions-4.6.3
train_file = 'train/in.tsv.xz'
test_file = 'dev-0/in.tsv.xz'
out_file = 'dev-0/out.tsv'
import gc
import itertools
import lzma
import pickle
import re
import sys
from itertools import islice

import torch
from torch import nn
from torch.utils.data import DataLoader, IterableDataset
from torchtext.vocab import build_vocab_from_iterator
embed_size = 300
device = 'cuda'
vocab_size = 25000
batch_s = 3200
learning_rate = 0.0001
epochs = 4
k = 20  # top k words
wildcard_minweight = 0.1
### preprocessing
def preprocess(line):
    line = get_rid_of_header(line)
    line = replace_endline(line)
    return line

def get_rid_of_header(line):
    # keep only the text columns; the first six TSV fields are metadata
    line = line.split('\t')[6:]
    return "".join(line)

def replace_endline(line):
    # the corpus encodes line breaks as literal escape sequences
    line = line.replace("\\\\n", " ")
    return line
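As a quick sanity check: preprocess drops the first six tab-separated metadata fields and keeps the text. A minimal sketch with a made-up row (real rows come from train/in.tsv.xz):
sample = 'id\tyear1\tyear2\tx\ty\tz\tthe cat sat on the mat'  # hypothetical row
preprocess(sample)  # -> 'the cat sat on the mat'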
def get_last_word(text):
    """Return the last word of a string."""
    return text.rstrip().split(' ')[-1]

def get_first_word(text):
    """Return the first word of a string."""
    return text.split(' ', 1)[0]
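Quick examples of the two helpers:
get_first_word('the cat sat')  # -> 'the'
get_last_word('the cat sat')   # -> 'sat'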
def get_words_from_line(line):
    line = line.rstrip()
    yield '<s>'
    line = preprocess(line)
    for t in line.split(' '):
        yield t
    yield '</s>'

def get_word_lines_from_file(file_name):
    n = 0
    with lzma.open(file_name, 'r') as fh:
        for line in fh:
            n += 1
            if n % 1000 == 0:
                print(n)
            yield get_words_from_line(line.decode('utf-8'))
vocab = build_vocab_from_iterator(
    get_word_lines_from_file(train_file),
    max_tokens = vocab_size,
    specials = ['<unk>'])

with open('filename.pickle', 'wb') as handle:
    pickle.dump(vocab, handle, protocol=pickle.HIGHEST_PROTOCOL)
1000 2000 3000 4000 5000 ... 431000 432000
vocab.lookup_tokens([0, 1, 2, 10, 2000])
['<unk>', 'the', 'of', 'was', 'ladies']
Network definition
We implement our simple neural network using the PyTorch framework.
class SimpleBigramNeuralLanguageModel(nn.Module):
    def __init__(self, vocabulary_size, embedding_size):
        super(SimpleBigramNeuralLanguageModel, self).__init__()
        self.model = nn.Sequential(
            nn.Embedding(vocabulary_size, embedding_size),
            nn.Linear(embedding_size, vocabulary_size),
            nn.Softmax(dim=1)  # explicit dim avoids the Softmax deprecation warning
        )

    def forward(self, x):
        return self.model(x)
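The model maps a batch of previous-word indices to a probability distribution over the whole vocabulary. A minimal shape check, assuming the vocab built above and run on CPU before anything is moved to the GPU:
m = SimpleBigramNeuralLanguageModel(vocab_size, embed_size)
xs = torch.tensor([vocab['the'], vocab['of']])  # batch of two previous words
probs = m(xs)
probs.shape       # -> torch.Size([2, 25000])
probs.sum(dim=1)  # -> tensor([1.0000, 1.0000], ...), each row is a distribution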
with open('filename.pickle', 'rb') as handle:
    vocab = pickle.load(handle)
vocab.set_default_index(vocab['<unk>'])
help(vocab)
Help on Vocab in module torchtext.vocab.vocab object: class Vocab(torch.nn.modules.module.Module) ... (full help() docstring dump elided)
def look_ahead_iterator(gen):
    # yield consecutive (previous, current) pairs from a stream of tokens
    prev = None
    for item in gen:
        if prev is not None:
            yield (prev, item)
        prev = item
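It turns a stream of tokens into consecutive bigram pairs:
list(look_ahead_iterator(iter([1, 2, 3, 4])))  # -> [(1, 2), (2, 3), (3, 4)]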
class Bigrams(IterableDataset):
    def __init__(self, text_file, vocabulary_size):
        self.vocab = build_vocab_from_iterator(
            get_word_lines_from_file(text_file),
            max_tokens = vocabulary_size,
            specials = ['<unk>'])
        self.vocab.set_default_index(self.vocab['<unk>'])
        self.vocabulary_size = vocabulary_size
        self.text_file = text_file

    def __iter__(self):
        return look_ahead_iterator(
            (self.vocab[t] for t in itertools.chain.from_iterable(get_word_lines_from_file(self.text_file))))
train_dataset = Bigrams(train_file, vocab_size)
1000 2000 3000 4000 5000 ... 431000 432000
print(train_dataset)
<__main__.Bigrams object at 0x7fdd26d23940>
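The dataset streams (previous word, next word) index pairs; the first few can be peeked at with islice (iterating re-reads the compressed file, so the progress printout starts again). The concrete indices below are illustrative only, they depend on the corpus:
list(islice(iter(train_dataset), 3))  # e.g. [(0, 42), (42, 7), (7, 13)]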
torch.cuda.memory_summary(device=None, abbreviated=False)
|===========================================================================|
|                  PyTorch CUDA memory summary, device ID 0                 |
|---------------------------------------------------------------------------|
|            CUDA OOMs: 1            |        cudaMalloc retries: 1         |
|===========================================================================|
|        Metric         | Cur Usage  | Peak Usage | Tot Alloc  | Tot Freed  |
|---------------------------------------------------------------------------|
| Allocated memory      | 699613 KiB |   1903 MiB |   3735 MiB |   3052 MiB |
| Active memory         | 699613 KiB |   1903 MiB |   3735 MiB |   3052 MiB |
| Requested memory      | 699611 KiB |   1903 MiB |   3735 MiB |   3052 MiB |
| GPU reserved memory   | 710656 KiB |   1918 MiB |   1918 MiB |   1224 MiB |
| Non-releasable memory |  11043 KiB |  19364 KiB |  28939 KiB |  17896 KiB |
| Allocations           |         10 |         17 |         38 |         28 |
| Active allocs         |         10 |         17 |         38 |         28 |
| GPU reserved segments |          5 |          7 |          7 |          2 |
| Non-releasable allocs |          6 |          8 |         20 |         14 |
| Oversize allocations  |          0 |          0 |          0 |          0 |
| Oversize GPU segments |          0 |          0 |          0 |          0 |
|===========================================================================|
(per-pool breakdowns elided)
import os
# note: this setting only takes effect if it is read before the first CUDA allocation
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:256"
device = 'cuda'
model = SimpleBigramNeuralLanguageModel(vocab_size, embed_size).to(device)
data = DataLoader(train_dataset, batch_size=batch_s)
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
criterion = torch.nn.NLLLoss()
torch.cuda.empty_cache()
gc.collect()
# resume from the checkpoint saved by a previous run
model.load_state_dict(torch.load('model-bigram_final.bin'))
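NLLLoss expects log-probabilities, which is why the training loop below wraps the model output in torch.log. A numerically safer alternative sketch (not what was run here) keeps nn.LogSoftmax(dim=1) inside the model and drops the explicit log:
model_logsoftmax = nn.Sequential(
    nn.Embedding(vocab_size, embed_size),
    nn.Linear(embed_size, vocab_size),
    nn.LogSoftmax(dim=1),
)
# loss = criterion(model_logsoftmax(x), y)  # no torch.log needed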
for i in range(1, epochs+1):
    print('epoch:', i)
    model.train()
    step = 0
    for x, y in data:  # x: previous word, y: following word
        x = x.to(device)
        y = y.to(device)
        optimizer.zero_grad()
        ypredicted = model(x)  # probability distribution over the next word
        loss = criterion(torch.log(ypredicted), y)  # NLLLoss over log-probabilities
        if step % 100 == 0:
            print(step, loss)
        step += 1
        loss.backward()
        optimizer.step()
    torch.save(model.state_dict(), f'model-bigram_2nd-run{i}.bin')
torch.save(model.state_dict(), 'model-bigram_final.bin')
epoch: 1
0 tensor(5.9599, device='cuda:0', grad_fn=<NllLossBackward0>)
100 tensor(6.1015, device='cuda:0', grad_fn=<NllLossBackward0>)
200 tensor(5.9708, device='cuda:0', grad_fn=<NllLossBackward0>)
...
14700 tensor(5.5748, device='cuda:0', grad_fn=<NllLossBackward0>)
14800 tensor(6.0184, device='cuda:0', grad_fn=<NllLossBackward0>)
(loss drifts from roughly 6.0 down to roughly 5.6 over the first 14,800 steps; the interleaved round numbers in the raw log are the line-counter printout from get_word_lines_from_file)
14900 tensor(5.6781, device='cuda:0', grad_fn=<NllLossBackward0>) 148000 15000 tensor(5.6038, device='cuda:0', grad_fn=<NllLossBackward0>) 149000 15100 tensor(5.7875, device='cuda:0', grad_fn=<NllLossBackward0>) 150000 15200 tensor(5.6485, device='cuda:0', grad_fn=<NllLossBackward0>) 151000 15300 tensor(5.5927, device='cuda:0', grad_fn=<NllLossBackward0>) 152000 15400 tensor(5.5156, device='cuda:0', grad_fn=<NllLossBackward0>) 153000 15500 tensor(5.6556, device='cuda:0', grad_fn=<NllLossBackward0>) 154000 15600 tensor(5.6485, device='cuda:0', grad_fn=<NllLossBackward0>) 155000 15700 tensor(5.5904, device='cuda:0', grad_fn=<NllLossBackward0>) 156000 15800 tensor(5.4613, device='cuda:0', grad_fn=<NllLossBackward0>) 157000 15900 tensor(5.6254, device='cuda:0', grad_fn=<NllLossBackward0>) 158000 16000 tensor(5.4349, device='cuda:0', grad_fn=<NllLossBackward0>) 159000 16100 tensor(5.5205, device='cuda:0', grad_fn=<NllLossBackward0>) 160000 16200 tensor(5.8051, device='cuda:0', grad_fn=<NllLossBackward0>) 161000 16300 tensor(5.6452, device='cuda:0', grad_fn=<NllLossBackward0>) 162000 16400 tensor(5.6071, device='cuda:0', grad_fn=<NllLossBackward0>) 163000 16500 tensor(5.7237, device='cuda:0', grad_fn=<NllLossBackward0>) 164000 16600 tensor(5.5771, device='cuda:0', grad_fn=<NllLossBackward0>) 165000 16700 tensor(5.5355, device='cuda:0', grad_fn=<NllLossBackward0>) 166000 16800 tensor(5.6363, device='cuda:0', grad_fn=<NllLossBackward0>) 167000 16900 tensor(5.3746, device='cuda:0', grad_fn=<NllLossBackward0>) 168000 17000 tensor(5.6707, device='cuda:0', grad_fn=<NllLossBackward0>) 169000 17100 tensor(5.5359, device='cuda:0', grad_fn=<NllLossBackward0>) 170000 17200 tensor(5.6118, device='cuda:0', grad_fn=<NllLossBackward0>) 171000 17300 tensor(5.6740, device='cuda:0', grad_fn=<NllLossBackward0>) 172000 17400 tensor(5.4438, device='cuda:0', grad_fn=<NllLossBackward0>) 173000 17500 tensor(5.5001, device='cuda:0', grad_fn=<NllLossBackward0>) 174000 17600 tensor(5.4953, device='cuda:0', grad_fn=<NllLossBackward0>) 175000 17700 tensor(5.5398, device='cuda:0', grad_fn=<NllLossBackward0>) 176000 17800 tensor(5.6053, device='cuda:0', grad_fn=<NllLossBackward0>) 177000 17900 tensor(5.4726, device='cuda:0', grad_fn=<NllLossBackward0>) 178000 18000 tensor(5.6747, device='cuda:0', grad_fn=<NllLossBackward0>) 179000 18100 tensor(5.6238, device='cuda:0', grad_fn=<NllLossBackward0>) 180000 18200 tensor(5.5469, device='cuda:0', grad_fn=<NllLossBackward0>) 181000 18300 tensor(5.5299, device='cuda:0', grad_fn=<NllLossBackward0>) 182000 18400 tensor(5.6323, device='cuda:0', grad_fn=<NllLossBackward0>) 183000 18500 tensor(5.5893, device='cuda:0', grad_fn=<NllLossBackward0>) 184000 18600 tensor(5.7452, device='cuda:0', grad_fn=<NllLossBackward0>) 185000 18700 tensor(5.5576, device='cuda:0', grad_fn=<NllLossBackward0>) 186000 18800 tensor(5.7439, device='cuda:0', grad_fn=<NllLossBackward0>) 187000 18900 tensor(5.6106, device='cuda:0', grad_fn=<NllLossBackward0>) 188000 19000 tensor(5.6647, device='cuda:0', grad_fn=<NllLossBackward0>) 189000 19100 tensor(5.7728, device='cuda:0', grad_fn=<NllLossBackward0>) 190000 19200 tensor(5.6169, device='cuda:0', grad_fn=<NllLossBackward0>) 191000 19300 tensor(5.7852, device='cuda:0', grad_fn=<NllLossBackward0>) 192000 19400 tensor(5.5627, device='cuda:0', grad_fn=<NllLossBackward0>) 193000 19500 tensor(5.5682, device='cuda:0', grad_fn=<NllLossBackward0>) 194000 19600 tensor(5.5978, device='cuda:0', grad_fn=<NllLossBackward0>) 195000 19700 tensor(5.6453, device='cuda:0', 
grad_fn=<NllLossBackward0>) 196000 19800 tensor(5.4786, device='cuda:0', grad_fn=<NllLossBackward0>) 197000 19900 tensor(5.4894, device='cuda:0', grad_fn=<NllLossBackward0>) 198000 20000 tensor(5.4999, device='cuda:0', grad_fn=<NllLossBackward0>) 199000 20100 tensor(5.4881, device='cuda:0', grad_fn=<NllLossBackward0>) 200000 20200 tensor(5.3915, device='cuda:0', grad_fn=<NllLossBackward0>) 201000 20300 tensor(5.5216, device='cuda:0', grad_fn=<NllLossBackward0>) 20400 tensor(5.5761, device='cuda:0', grad_fn=<NllLossBackward0>) 202000 20500 tensor(5.5586, device='cuda:0', grad_fn=<NllLossBackward0>) 203000 20600 tensor(5.7870, device='cuda:0', grad_fn=<NllLossBackward0>) 204000 20700 tensor(5.5776, device='cuda:0', grad_fn=<NllLossBackward0>) 205000 20800 tensor(5.4417, device='cuda:0', grad_fn=<NllLossBackward0>) 206000 20900 tensor(5.7186, device='cuda:0', grad_fn=<NllLossBackward0>) 207000 21000 tensor(5.5415, device='cuda:0', grad_fn=<NllLossBackward0>) 208000 21100 tensor(5.5141, device='cuda:0', grad_fn=<NllLossBackward0>) 209000 21200 tensor(5.4401, device='cuda:0', grad_fn=<NllLossBackward0>) 210000 21300 tensor(5.6511, device='cuda:0', grad_fn=<NllLossBackward0>) 211000 21400 tensor(5.6474, device='cuda:0', grad_fn=<NllLossBackward0>) 212000 21500 tensor(5.3946, device='cuda:0', grad_fn=<NllLossBackward0>) 213000 21600 tensor(5.3958, device='cuda:0', grad_fn=<NllLossBackward0>) 214000 21700 tensor(5.4040, device='cuda:0', grad_fn=<NllLossBackward0>) 215000 21800 tensor(5.5745, device='cuda:0', grad_fn=<NllLossBackward0>) 216000 21900 tensor(5.4996, device='cuda:0', grad_fn=<NllLossBackward0>) 217000 22000 tensor(5.5234, device='cuda:0', grad_fn=<NllLossBackward0>) 218000 22100 tensor(5.3870, device='cuda:0', grad_fn=<NllLossBackward0>) 219000 22200 tensor(5.2661, device='cuda:0', grad_fn=<NllLossBackward0>) 220000 22300 tensor(5.7031, device='cuda:0', grad_fn=<NllLossBackward0>) 221000 22400 tensor(5.3633, device='cuda:0', grad_fn=<NllLossBackward0>) 222000 22500 tensor(5.4404, device='cuda:0', grad_fn=<NllLossBackward0>) 223000 22600 tensor(5.5951, device='cuda:0', grad_fn=<NllLossBackward0>) 224000 22700 tensor(5.3901, device='cuda:0', grad_fn=<NllLossBackward0>) 225000 22800 tensor(5.6404, device='cuda:0', grad_fn=<NllLossBackward0>) 226000 22900 tensor(5.6646, device='cuda:0', grad_fn=<NllLossBackward0>) 227000 23000 tensor(5.5949, device='cuda:0', grad_fn=<NllLossBackward0>) 228000 23100 tensor(5.5284, device='cuda:0', grad_fn=<NllLossBackward0>) 229000 23200 tensor(5.5617, device='cuda:0', grad_fn=<NllLossBackward0>) 230000 23300 tensor(5.6426, device='cuda:0', grad_fn=<NllLossBackward0>) 231000 23400 tensor(5.7283, device='cuda:0', grad_fn=<NllLossBackward0>) 232000 23500 tensor(5.4558, device='cuda:0', grad_fn=<NllLossBackward0>) 233000 23600 tensor(5.4600, device='cuda:0', grad_fn=<NllLossBackward0>) 234000 23700 tensor(5.4961, device='cuda:0', grad_fn=<NllLossBackward0>) 235000 23800 tensor(5.3373, device='cuda:0', grad_fn=<NllLossBackward0>) 236000 23900 tensor(5.4470, device='cuda:0', grad_fn=<NllLossBackward0>) 237000 24000 tensor(5.4346, device='cuda:0', grad_fn=<NllLossBackward0>) 238000 24100 tensor(5.5112, device='cuda:0', grad_fn=<NllLossBackward0>) 239000 24200 tensor(5.6918, device='cuda:0', grad_fn=<NllLossBackward0>) 240000 24300 tensor(5.6115, device='cuda:0', grad_fn=<NllLossBackward0>) 241000 24400 tensor(5.7404, device='cuda:0', grad_fn=<NllLossBackward0>) 242000 24500 tensor(5.4982, device='cuda:0', grad_fn=<NllLossBackward0>) 243000 24600 tensor(5.6136, 
device='cuda:0', grad_fn=<NllLossBackward0>) 244000 24700 tensor(5.5225, device='cuda:0', grad_fn=<NllLossBackward0>) 245000 24800 tensor(5.5563, device='cuda:0', grad_fn=<NllLossBackward0>) 246000 24900 tensor(5.6283, device='cuda:0', grad_fn=<NllLossBackward0>) 247000 25000 tensor(5.6176, device='cuda:0', grad_fn=<NllLossBackward0>) 248000 25100 tensor(5.5795, device='cuda:0', grad_fn=<NllLossBackward0>) 249000 25200 tensor(5.5831, device='cuda:0', grad_fn=<NllLossBackward0>) 250000 25300 tensor(5.5894, device='cuda:0', grad_fn=<NllLossBackward0>) 251000 25400 tensor(5.5670, device='cuda:0', grad_fn=<NllLossBackward0>) 252000 25500 tensor(5.5016, device='cuda:0', grad_fn=<NllLossBackward0>) 253000 25600 tensor(5.7909, device='cuda:0', grad_fn=<NllLossBackward0>) 254000 25700 tensor(5.5229, device='cuda:0', grad_fn=<NllLossBackward0>) 255000 25800 tensor(5.6035, device='cuda:0', grad_fn=<NllLossBackward0>) 256000 25900 tensor(5.5293, device='cuda:0', grad_fn=<NllLossBackward0>) 257000 26000 tensor(5.5553, device='cuda:0', grad_fn=<NllLossBackward0>) 258000 26100 tensor(5.4476, device='cuda:0', grad_fn=<NllLossBackward0>) 259000 26200 tensor(5.3721, device='cuda:0', grad_fn=<NllLossBackward0>) 260000 26300 tensor(5.6142, device='cuda:0', grad_fn=<NllLossBackward0>) 261000 26400 tensor(5.6202, device='cuda:0', grad_fn=<NllLossBackward0>) 262000 26500 tensor(5.3529, device='cuda:0', grad_fn=<NllLossBackward0>) 263000 26600 tensor(5.7148, device='cuda:0', grad_fn=<NllLossBackward0>) 264000 26700 tensor(5.5755, device='cuda:0', grad_fn=<NllLossBackward0>) 265000 26800 tensor(5.7480, device='cuda:0', grad_fn=<NllLossBackward0>) 266000 26900 tensor(5.5025, device='cuda:0', grad_fn=<NllLossBackward0>) 267000 27000 tensor(5.4017, device='cuda:0', grad_fn=<NllLossBackward0>) 268000 27100 tensor(5.3996, device='cuda:0', grad_fn=<NllLossBackward0>) 269000 27200 tensor(5.4862, device='cuda:0', grad_fn=<NllLossBackward0>) 270000 27300 tensor(5.6392, device='cuda:0', grad_fn=<NllLossBackward0>) 271000 27400 tensor(5.5634, device='cuda:0', grad_fn=<NllLossBackward0>) 272000 27500 tensor(5.4420, device='cuda:0', grad_fn=<NllLossBackward0>) 273000 27600 tensor(5.7835, device='cuda:0', grad_fn=<NllLossBackward0>) 274000 27700 tensor(5.5555, device='cuda:0', grad_fn=<NllLossBackward0>) 275000 27800 tensor(5.5381, device='cuda:0', grad_fn=<NllLossBackward0>) 276000 27900 tensor(5.6515, device='cuda:0', grad_fn=<NllLossBackward0>) 277000 28000 tensor(5.5254, device='cuda:0', grad_fn=<NllLossBackward0>) 278000 28100 tensor(5.4929, device='cuda:0', grad_fn=<NllLossBackward0>) 279000 28200 tensor(5.6218, device='cuda:0', grad_fn=<NllLossBackward0>) 280000 28300 tensor(5.2878, device='cuda:0', grad_fn=<NllLossBackward0>) 281000 28400 tensor(5.7112, device='cuda:0', grad_fn=<NllLossBackward0>) 282000 28500 tensor(5.5490, device='cuda:0', grad_fn=<NllLossBackward0>) 283000 28600 tensor(5.4572, device='cuda:0', grad_fn=<NllLossBackward0>) 284000 28700 tensor(5.6349, device='cuda:0', grad_fn=<NllLossBackward0>) 285000 28800 tensor(5.6607, device='cuda:0', grad_fn=<NllLossBackward0>) 286000 28900 tensor(5.5422, device='cuda:0', grad_fn=<NllLossBackward0>) 287000 29000 tensor(5.4277, device='cuda:0', grad_fn=<NllLossBackward0>) 288000 29100 tensor(5.1870, device='cuda:0', grad_fn=<NllLossBackward0>) 289000 29200 tensor(5.3593, device='cuda:0', grad_fn=<NllLossBackward0>) 290000 29300 tensor(5.6512, device='cuda:0', grad_fn=<NllLossBackward0>) 291000 29400 tensor(5.8051, device='cuda:0', grad_fn=<NllLossBackward0>) 
292000 29500 tensor(5.5308, device='cuda:0', grad_fn=<NllLossBackward0>) 293000 29600 tensor(5.3791, device='cuda:0', grad_fn=<NllLossBackward0>) 294000 29700 tensor(5.6108, device='cuda:0', grad_fn=<NllLossBackward0>) 295000 29800 tensor(5.4015, device='cuda:0', grad_fn=<NllLossBackward0>) 296000 29900 tensor(5.6953, device='cuda:0', grad_fn=<NllLossBackward0>) 297000 30000 tensor(5.3925, device='cuda:0', grad_fn=<NllLossBackward0>) 298000 30100 tensor(5.4241, device='cuda:0', grad_fn=<NllLossBackward0>) 299000 30200 tensor(5.4216, device='cuda:0', grad_fn=<NllLossBackward0>) 300000 30300 tensor(5.5074, device='cuda:0', grad_fn=<NllLossBackward0>) 301000 30400 tensor(5.3631, device='cuda:0', grad_fn=<NllLossBackward0>) 302000 30500 tensor(5.5690, device='cuda:0', grad_fn=<NllLossBackward0>) 30600 tensor(5.4734, device='cuda:0', grad_fn=<NllLossBackward0>) 303000 30700 tensor(5.5061, device='cuda:0', grad_fn=<NllLossBackward0>) 304000 30800 tensor(5.5709, device='cuda:0', grad_fn=<NllLossBackward0>) 305000 30900 tensor(5.5478, device='cuda:0', grad_fn=<NllLossBackward0>) 306000 31000 tensor(5.6687, device='cuda:0', grad_fn=<NllLossBackward0>) 307000 31100 tensor(5.2899, device='cuda:0', grad_fn=<NllLossBackward0>) 308000 31200 tensor(5.3663, device='cuda:0', grad_fn=<NllLossBackward0>) 309000 31300 tensor(5.6274, device='cuda:0', grad_fn=<NllLossBackward0>) 310000 31400 tensor(5.4358, device='cuda:0', grad_fn=<NllLossBackward0>) 311000 31500 tensor(5.5738, device='cuda:0', grad_fn=<NllLossBackward0>) 312000 31600 tensor(5.5612, device='cuda:0', grad_fn=<NllLossBackward0>) 313000 31700 tensor(5.5104, device='cuda:0', grad_fn=<NllLossBackward0>) 314000 31800 tensor(5.6343, device='cuda:0', grad_fn=<NllLossBackward0>) 315000 31900 tensor(5.2243, device='cuda:0', grad_fn=<NllLossBackward0>) 316000 32000 tensor(5.4320, device='cuda:0', grad_fn=<NllLossBackward0>) 317000 32100 tensor(5.3344, device='cuda:0', grad_fn=<NllLossBackward0>) 318000 32200 tensor(5.6543, device='cuda:0', grad_fn=<NllLossBackward0>) 319000 32300 tensor(5.6512, device='cuda:0', grad_fn=<NllLossBackward0>) 320000 32400 tensor(5.6237, device='cuda:0', grad_fn=<NllLossBackward0>) 321000 32500 tensor(5.4246, device='cuda:0', grad_fn=<NllLossBackward0>) 322000 32600 tensor(5.5469, device='cuda:0', grad_fn=<NllLossBackward0>) 323000 32700 tensor(5.5338, device='cuda:0', grad_fn=<NllLossBackward0>) 324000 32800 tensor(5.6954, device='cuda:0', grad_fn=<NllLossBackward0>) 325000 32900 tensor(5.5754, device='cuda:0', grad_fn=<NllLossBackward0>) 326000 33000 tensor(5.3334, device='cuda:0', grad_fn=<NllLossBackward0>) 327000 33100 tensor(5.5284, device='cuda:0', grad_fn=<NllLossBackward0>) 328000 33200 tensor(5.6350, device='cuda:0', grad_fn=<NllLossBackward0>) 329000 33300 tensor(5.4312, device='cuda:0', grad_fn=<NllLossBackward0>) 330000 33400 tensor(5.6854, device='cuda:0', grad_fn=<NllLossBackward0>) 331000 33500 tensor(5.4921, device='cuda:0', grad_fn=<NllLossBackward0>) 332000 33600 tensor(5.4345, device='cuda:0', grad_fn=<NllLossBackward0>) 333000 33700 tensor(5.4950, device='cuda:0', grad_fn=<NllLossBackward0>) 334000 33800 tensor(5.5757, device='cuda:0', grad_fn=<NllLossBackward0>) 335000 33900 tensor(5.3466, device='cuda:0', grad_fn=<NllLossBackward0>) 336000 34000 tensor(5.5373, device='cuda:0', grad_fn=<NllLossBackward0>) 337000 34100 tensor(5.5144, device='cuda:0', grad_fn=<NllLossBackward0>) 338000 34200 tensor(5.5543, device='cuda:0', grad_fn=<NllLossBackward0>) 339000 34300 tensor(5.3564, device='cuda:0', 
grad_fn=<NllLossBackward0>) 340000 34400 tensor(5.8091, device='cuda:0', grad_fn=<NllLossBackward0>) 341000 34500 tensor(5.6699, device='cuda:0', grad_fn=<NllLossBackward0>) 342000 34600 tensor(5.5536, device='cuda:0', grad_fn=<NllLossBackward0>) 343000 34700 tensor(5.6261, device='cuda:0', grad_fn=<NllLossBackward0>) 344000 34800 tensor(5.6504, device='cuda:0', grad_fn=<NllLossBackward0>) 345000 34900 tensor(5.7067, device='cuda:0', grad_fn=<NllLossBackward0>) 346000 35000 tensor(5.7307, device='cuda:0', grad_fn=<NllLossBackward0>) 347000 35100 tensor(5.4831, device='cuda:0', grad_fn=<NllLossBackward0>) 348000 35200 tensor(5.4367, device='cuda:0', grad_fn=<NllLossBackward0>) 349000 35300 tensor(5.6503, device='cuda:0', grad_fn=<NllLossBackward0>) 350000 35400 tensor(5.2892, device='cuda:0', grad_fn=<NllLossBackward0>) 351000 35500 tensor(5.4198, device='cuda:0', grad_fn=<NllLossBackward0>) 352000 35600 tensor(5.4870, device='cuda:0', grad_fn=<NllLossBackward0>) 353000 35700 tensor(5.4489, device='cuda:0', grad_fn=<NllLossBackward0>) 354000 35800 tensor(5.5170, device='cuda:0', grad_fn=<NllLossBackward0>) 355000 35900 tensor(5.4699, device='cuda:0', grad_fn=<NllLossBackward0>) 356000 36000 tensor(5.2451, device='cuda:0', grad_fn=<NllLossBackward0>) 357000 36100 tensor(5.6311, device='cuda:0', grad_fn=<NllLossBackward0>) 358000 36200 tensor(5.5157, device='cuda:0', grad_fn=<NllLossBackward0>) 359000 36300 tensor(5.7751, device='cuda:0', grad_fn=<NllLossBackward0>) 360000 36400 tensor(5.4740, device='cuda:0', grad_fn=<NllLossBackward0>) 361000 36500 tensor(5.4746, device='cuda:0', grad_fn=<NllLossBackward0>) 362000 36600 tensor(5.5244, device='cuda:0', grad_fn=<NllLossBackward0>) 363000 36700 tensor(5.3037, device='cuda:0', grad_fn=<NllLossBackward0>) 364000 36800 tensor(5.4238, device='cuda:0', grad_fn=<NllLossBackward0>) 365000 36900 tensor(5.5203, device='cuda:0', grad_fn=<NllLossBackward0>) 366000 37000 tensor(5.4431, device='cuda:0', grad_fn=<NllLossBackward0>) 367000 37100 tensor(5.4286, device='cuda:0', grad_fn=<NllLossBackward0>) 368000 37200 tensor(5.5108, device='cuda:0', grad_fn=<NllLossBackward0>) 369000 37300 tensor(5.4229, device='cuda:0', grad_fn=<NllLossBackward0>) 370000 37400 tensor(5.8406, device='cuda:0', grad_fn=<NllLossBackward0>) 371000 37500 tensor(5.4602, device='cuda:0', grad_fn=<NllLossBackward0>) 372000 37600 tensor(5.4417, device='cuda:0', grad_fn=<NllLossBackward0>) 373000 37700 tensor(5.6200, device='cuda:0', grad_fn=<NllLossBackward0>) 374000 37800 tensor(5.4527, device='cuda:0', grad_fn=<NllLossBackward0>) 375000 37900 tensor(5.4631, device='cuda:0', grad_fn=<NllLossBackward0>) 376000 38000 tensor(5.5196, device='cuda:0', grad_fn=<NllLossBackward0>) 377000 38100 tensor(5.5436, device='cuda:0', grad_fn=<NllLossBackward0>) 378000 38200 tensor(5.5269, device='cuda:0', grad_fn=<NllLossBackward0>) 379000 38300 tensor(5.4716, device='cuda:0', grad_fn=<NllLossBackward0>) 380000 38400 tensor(5.5081, device='cuda:0', grad_fn=<NllLossBackward0>) 381000 38500 tensor(5.5249, device='cuda:0', grad_fn=<NllLossBackward0>) 382000 38600 tensor(5.5018, device='cuda:0', grad_fn=<NllLossBackward0>) 383000 38700 tensor(5.4845, device='cuda:0', grad_fn=<NllLossBackward0>) 384000 38800 tensor(5.5505, device='cuda:0', grad_fn=<NllLossBackward0>) 385000 38900 tensor(5.6658, device='cuda:0', grad_fn=<NllLossBackward0>) 386000 39000 tensor(5.3333, device='cuda:0', grad_fn=<NllLossBackward0>) 387000 39100 tensor(5.5598, device='cuda:0', grad_fn=<NllLossBackward0>) 388000 39200 
tensor(5.6624, device='cuda:0', grad_fn=<NllLossBackward0>) 389000 39300 tensor(5.4714, device='cuda:0', grad_fn=<NllLossBackward0>) 390000 39400 tensor(5.5470, device='cuda:0', grad_fn=<NllLossBackward0>) 391000 39500 tensor(5.6905, device='cuda:0', grad_fn=<NllLossBackward0>) 392000 39600 tensor(5.3592, device='cuda:0', grad_fn=<NllLossBackward0>) 393000 39700 tensor(5.3170, device='cuda:0', grad_fn=<NllLossBackward0>) 394000 39800 tensor(5.4491, device='cuda:0', grad_fn=<NllLossBackward0>) 395000 39900 tensor(5.2872, device='cuda:0', grad_fn=<NllLossBackward0>) 396000 40000 tensor(5.3865, device='cuda:0', grad_fn=<NllLossBackward0>) 397000 40100 tensor(5.4536, device='cuda:0', grad_fn=<NllLossBackward0>) 398000 40200 tensor(5.4382, device='cuda:0', grad_fn=<NllLossBackward0>) 399000 40300 tensor(5.4819, device='cuda:0', grad_fn=<NllLossBackward0>) 40400 tensor(5.5250, device='cuda:0', grad_fn=<NllLossBackward0>) 400000 40500 tensor(5.4396, device='cuda:0', grad_fn=<NllLossBackward0>) 401000 40600 tensor(5.5062, device='cuda:0', grad_fn=<NllLossBackward0>) 402000 40700 tensor(5.5362, device='cuda:0', grad_fn=<NllLossBackward0>) 403000 40800 tensor(5.5015, device='cuda:0', grad_fn=<NllLossBackward0>) 404000 40900 tensor(5.4610, device='cuda:0', grad_fn=<NllLossBackward0>) 405000 41000 tensor(5.5083, device='cuda:0', grad_fn=<NllLossBackward0>) 406000 41100 tensor(5.4346, device='cuda:0', grad_fn=<NllLossBackward0>) 407000 41200 tensor(5.3340, device='cuda:0', grad_fn=<NllLossBackward0>) 408000 41300 tensor(5.4608, device='cuda:0', grad_fn=<NllLossBackward0>) 409000 41400 tensor(5.3758, device='cuda:0', grad_fn=<NllLossBackward0>) 410000 41500 tensor(5.5160, device='cuda:0', grad_fn=<NllLossBackward0>) 411000 41600 tensor(5.4290, device='cuda:0', grad_fn=<NllLossBackward0>) 412000 41700 tensor(5.4426, device='cuda:0', grad_fn=<NllLossBackward0>) 413000 41800 tensor(5.4764, device='cuda:0', grad_fn=<NllLossBackward0>) 414000 41900 tensor(5.4730, device='cuda:0', grad_fn=<NllLossBackward0>) 415000 42000 tensor(5.6150, device='cuda:0', grad_fn=<NllLossBackward0>) 416000 42100 tensor(5.3622, device='cuda:0', grad_fn=<NllLossBackward0>) 417000 42200 tensor(5.4380, device='cuda:0', grad_fn=<NllLossBackward0>) 418000 42300 tensor(5.5031, device='cuda:0', grad_fn=<NllLossBackward0>) 419000 42400 tensor(5.3124, device='cuda:0', grad_fn=<NllLossBackward0>) 420000 42500 tensor(5.4812, device='cuda:0', grad_fn=<NllLossBackward0>) 421000 42600 tensor(5.2723, device='cuda:0', grad_fn=<NllLossBackward0>) 422000 42700 tensor(5.5998, device='cuda:0', grad_fn=<NllLossBackward0>) 423000 42800 tensor(5.5254, device='cuda:0', grad_fn=<NllLossBackward0>) 424000 42900 tensor(5.3716, device='cuda:0', grad_fn=<NllLossBackward0>) 425000 43000 tensor(5.5020, device='cuda:0', grad_fn=<NllLossBackward0>) 426000 43100 tensor(5.5091, device='cuda:0', grad_fn=<NllLossBackward0>) 427000 43200 tensor(5.3182, device='cuda:0', grad_fn=<NllLossBackward0>) 428000 43300 tensor(5.4001, device='cuda:0', grad_fn=<NllLossBackward0>) 429000 43400 tensor(5.5150, device='cuda:0', grad_fn=<NllLossBackward0>) 430000 43500 tensor(5.2440, device='cuda:0', grad_fn=<NllLossBackward0>) 431000 43600 tensor(5.4439, device='cuda:0', grad_fn=<NllLossBackward0>) 432000 epoch: = 2 0 tensor(5.3953, device='cuda:0', grad_fn=<NllLossBackward0>) 1000 100 tensor(5.4847, device='cuda:0', grad_fn=<NllLossBackward0>) 200 tensor(5.3626, device='cuda:0', grad_fn=<NllLossBackward0>) 2000 300 tensor(5.4127, device='cuda:0', grad_fn=<NllLossBackward0>) 3000 
400 tensor(5.3734, device='cuda:0', grad_fn=<NllLossBackward0>) 4000 500 tensor(5.5564, device='cuda:0', grad_fn=<NllLossBackward0>) 5000 600 tensor(5.3391, device='cuda:0', grad_fn=<NllLossBackward0>) 6000 700 tensor(5.6198, device='cuda:0', grad_fn=<NllLossBackward0>) 7000 800 tensor(5.2255, device='cuda:0', grad_fn=<NllLossBackward0>) 8000 900 tensor(5.5161, device='cuda:0', grad_fn=<NllLossBackward0>) 9000 1000 tensor(5.3517, device='cuda:0', grad_fn=<NllLossBackward0>) 10000 1100 tensor(5.5420, device='cuda:0', grad_fn=<NllLossBackward0>) 11000 1200 tensor(5.6031, device='cuda:0', grad_fn=<NllLossBackward0>) 12000 1300 tensor(5.5343, device='cuda:0', grad_fn=<NllLossBackward0>) 13000 1400 tensor(5.5547, device='cuda:0', grad_fn=<NllLossBackward0>) 14000 1500 tensor(5.6080, device='cuda:0', grad_fn=<NllLossBackward0>) 15000 1600 tensor(5.2940, device='cuda:0', grad_fn=<NllLossBackward0>) 16000 1700 tensor(5.3671, device='cuda:0', grad_fn=<NllLossBackward0>) 17000 1800 tensor(5.3777, device='cuda:0', grad_fn=<NllLossBackward0>) 18000 1900 tensor(5.3593, device='cuda:0', grad_fn=<NllLossBackward0>) 19000 2000 tensor(5.4348, device='cuda:0', grad_fn=<NllLossBackward0>) 20000 2100 tensor(5.5513, device='cuda:0', grad_fn=<NllLossBackward0>) 21000 2200 tensor(5.3939, device='cuda:0', grad_fn=<NllLossBackward0>) 22000 2300 tensor(5.4063, device='cuda:0', grad_fn=<NllLossBackward0>) 23000 2400 tensor(5.4092, device='cuda:0', grad_fn=<NllLossBackward0>) 24000 2500 tensor(5.4460, device='cuda:0', grad_fn=<NllLossBackward0>) 25000 2600 tensor(5.4738, device='cuda:0', grad_fn=<NllLossBackward0>) 26000 2700 tensor(5.4848, device='cuda:0', grad_fn=<NllLossBackward0>) 27000 2800 tensor(5.5244, device='cuda:0', grad_fn=<NllLossBackward0>) 28000 2900 tensor(5.6711, device='cuda:0', grad_fn=<NllLossBackward0>) 29000 3000 tensor(5.4024, device='cuda:0', grad_fn=<NllLossBackward0>) 30000 3100 tensor(5.4842, device='cuda:0', grad_fn=<NllLossBackward0>) 31000 3200 tensor(5.4863, device='cuda:0', grad_fn=<NllLossBackward0>) 32000 3300 tensor(5.4114, device='cuda:0', grad_fn=<NllLossBackward0>) 33000 3400 tensor(5.3231, device='cuda:0', grad_fn=<NllLossBackward0>) 34000 3500 tensor(5.4598, device='cuda:0', grad_fn=<NllLossBackward0>) 35000 3600 tensor(5.4579, device='cuda:0', grad_fn=<NllLossBackward0>) 36000 3700 tensor(5.3890, device='cuda:0', grad_fn=<NllLossBackward0>) 37000 3800 tensor(5.4162, device='cuda:0', grad_fn=<NllLossBackward0>) 38000 3900 tensor(5.2854, device='cuda:0', grad_fn=<NllLossBackward0>) 39000 4000 tensor(5.3370, device='cuda:0', grad_fn=<NllLossBackward0>) 40000 4100 tensor(5.5078, device='cuda:0', grad_fn=<NllLossBackward0>) 41000 4200 tensor(5.5341, device='cuda:0', grad_fn=<NllLossBackward0>) 42000 4300 tensor(5.4704, device='cuda:0', grad_fn=<NllLossBackward0>) 43000 4400 tensor(5.4990, device='cuda:0', grad_fn=<NllLossBackward0>) 44000 4500 tensor(5.3300, device='cuda:0', grad_fn=<NllLossBackward0>) 45000 4600 tensor(5.6674, device='cuda:0', grad_fn=<NllLossBackward0>) 46000 4700 tensor(5.3622, device='cuda:0', grad_fn=<NllLossBackward0>) 47000 4800 tensor(5.4762, device='cuda:0', grad_fn=<NllLossBackward0>) 48000 4900 tensor(5.5403, device='cuda:0', grad_fn=<NllLossBackward0>) 49000 5000 tensor(5.5359, device='cuda:0', grad_fn=<NllLossBackward0>) 50000 5100 tensor(5.6058, device='cuda:0', grad_fn=<NllLossBackward0>) 51000 5200 tensor(5.6209, device='cuda:0', grad_fn=<NllLossBackward0>) 52000 5300 tensor(5.6273, device='cuda:0', grad_fn=<NllLossBackward0>) 53000 5400 
tensor(5.4695, device='cuda:0', grad_fn=<NllLossBackward0>) 54000 5500 tensor(5.5771, device='cuda:0', grad_fn=<NllLossBackward0>) 55000 5600 tensor(5.3552, device='cuda:0', grad_fn=<NllLossBackward0>) 56000 5700 tensor(5.5957, device='cuda:0', grad_fn=<NllLossBackward0>) 57000 5800 tensor(5.5952, device='cuda:0', grad_fn=<NllLossBackward0>) 58000 5900 tensor(5.5643, device='cuda:0', grad_fn=<NllLossBackward0>) 59000 6000 tensor(5.4346, device='cuda:0', grad_fn=<NllLossBackward0>) 60000 6100 tensor(5.4620, device='cuda:0', grad_fn=<NllLossBackward0>) 61000 6200 tensor(5.6256, device='cuda:0', grad_fn=<NllLossBackward0>) 62000 6300 tensor(5.4832, device='cuda:0', grad_fn=<NllLossBackward0>) 63000 6400 tensor(5.4063, device='cuda:0', grad_fn=<NllLossBackward0>) 64000 6500 tensor(5.2587, device='cuda:0', grad_fn=<NllLossBackward0>) 65000 6600 tensor(5.4320, device='cuda:0', grad_fn=<NllLossBackward0>) 66000 6700 tensor(5.3770, device='cuda:0', grad_fn=<NllLossBackward0>) 67000 6800 tensor(5.6077, device='cuda:0', grad_fn=<NllLossBackward0>) 68000 6900 tensor(5.5788, device='cuda:0', grad_fn=<NllLossBackward0>) 69000 7000 tensor(5.4929, device='cuda:0', grad_fn=<NllLossBackward0>) 70000 7100 tensor(5.2828, device='cuda:0', grad_fn=<NllLossBackward0>) 71000 7200 tensor(5.3992, device='cuda:0', grad_fn=<NllLossBackward0>) 72000 7300 tensor(5.6273, device='cuda:0', grad_fn=<NllLossBackward0>) 73000 7400 tensor(5.4385, device='cuda:0', grad_fn=<NllLossBackward0>) 74000 7500 tensor(5.3176, device='cuda:0', grad_fn=<NllLossBackward0>) 75000 7600 tensor(5.3834, device='cuda:0', grad_fn=<NllLossBackward0>) 76000 7700 tensor(5.4532, device='cuda:0', grad_fn=<NllLossBackward0>) 77000 7800 tensor(5.5669, device='cuda:0', grad_fn=<NllLossBackward0>) 78000 7900 tensor(5.2508, device='cuda:0', grad_fn=<NllLossBackward0>) 79000 8000 tensor(5.3027, device='cuda:0', grad_fn=<NllLossBackward0>) 80000 8100 tensor(5.4813, device='cuda:0', grad_fn=<NllLossBackward0>) 81000 8200 tensor(5.4822, device='cuda:0', grad_fn=<NllLossBackward0>) 82000 8300 tensor(5.4510, device='cuda:0', grad_fn=<NllLossBackward0>) 83000 8400 tensor(5.5712, device='cuda:0', grad_fn=<NllLossBackward0>) 84000 8500 tensor(5.5634, device='cuda:0', grad_fn=<NllLossBackward0>) 85000 8600 tensor(5.5616, device='cuda:0', grad_fn=<NllLossBackward0>) 86000 8700 tensor(5.6568, device='cuda:0', grad_fn=<NllLossBackward0>) 87000 8800 tensor(5.5397, device='cuda:0', grad_fn=<NllLossBackward0>) 88000 8900 tensor(5.3852, device='cuda:0', grad_fn=<NllLossBackward0>) 89000 9000 tensor(5.5022, device='cuda:0', grad_fn=<NllLossBackward0>) 90000 9100 tensor(5.5088, device='cuda:0', grad_fn=<NllLossBackward0>) 91000 9200 tensor(5.4214, device='cuda:0', grad_fn=<NllLossBackward0>) 92000 9300 tensor(5.4641, device='cuda:0', grad_fn=<NllLossBackward0>) 93000 9400 tensor(5.3085, device='cuda:0', grad_fn=<NllLossBackward0>) 94000 9500 tensor(5.3852, device='cuda:0', grad_fn=<NllLossBackward0>) 95000 9600 tensor(5.5097, device='cuda:0', grad_fn=<NllLossBackward0>) 96000 9700 tensor(5.4373, device='cuda:0', grad_fn=<NllLossBackward0>) 97000 9800 tensor(5.4786, device='cuda:0', grad_fn=<NllLossBackward0>) 9900 tensor(5.3198, device='cuda:0', grad_fn=<NllLossBackward0>) 98000 10000 tensor(5.5310, device='cuda:0', grad_fn=<NllLossBackward0>) 99000 10100 tensor(5.4341, device='cuda:0', grad_fn=<NllLossBackward0>) 100000 10200 tensor(5.3571, device='cuda:0', grad_fn=<NllLossBackward0>) 101000 10300 tensor(5.4712, device='cuda:0', grad_fn=<NllLossBackward0>) 102000 10400 
tensor(5.4810, device='cuda:0', grad_fn=<NllLossBackward0>) 103000 10500 tensor(5.5463, device='cuda:0', grad_fn=<NllLossBackward0>) 104000 10600 tensor(5.6233, device='cuda:0', grad_fn=<NllLossBackward0>) 105000 10700 tensor(5.4678, device='cuda:0', grad_fn=<NllLossBackward0>) 106000 10800 tensor(5.5040, device='cuda:0', grad_fn=<NllLossBackward0>) 107000 10900 tensor(5.3963, device='cuda:0', grad_fn=<NllLossBackward0>) 108000 11000 tensor(5.6295, device='cuda:0', grad_fn=<NllLossBackward0>) 109000 11100 tensor(5.2378, device='cuda:0', grad_fn=<NllLossBackward0>) 110000 11200 tensor(5.4184, device='cuda:0', grad_fn=<NllLossBackward0>) 111000 11300 tensor(5.4404, device='cuda:0', grad_fn=<NllLossBackward0>) 112000 11400 tensor(5.3875, device='cuda:0', grad_fn=<NllLossBackward0>) 113000 11500 tensor(5.4523, device='cuda:0', grad_fn=<NllLossBackward0>) 114000 11600 tensor(5.4418, device='cuda:0', grad_fn=<NllLossBackward0>) 115000 11700 tensor(5.3604, device='cuda:0', grad_fn=<NllLossBackward0>) 116000 11800 tensor(5.5647, device='cuda:0', grad_fn=<NllLossBackward0>) 117000 11900 tensor(5.3936, device='cuda:0', grad_fn=<NllLossBackward0>) 118000 12000 tensor(5.4823, device='cuda:0', grad_fn=<NllLossBackward0>) 119000 12100 tensor(5.5069, device='cuda:0', grad_fn=<NllLossBackward0>) 120000 12200 tensor(5.4983, device='cuda:0', grad_fn=<NllLossBackward0>) 121000 12300 tensor(5.6030, device='cuda:0', grad_fn=<NllLossBackward0>) 122000 12400 tensor(5.4763, device='cuda:0', grad_fn=<NllLossBackward0>) 123000 12500 tensor(5.3718, device='cuda:0', grad_fn=<NllLossBackward0>) 124000 12600 tensor(5.4416, device='cuda:0', grad_fn=<NllLossBackward0>) 125000 12700 tensor(5.3554, device='cuda:0', grad_fn=<NllLossBackward0>) 126000 12800 tensor(5.5392, device='cuda:0', grad_fn=<NllLossBackward0>) 127000 12900 tensor(5.4164, device='cuda:0', grad_fn=<NllLossBackward0>) 128000 13000 tensor(5.2286, device='cuda:0', grad_fn=<NllLossBackward0>) 129000 13100 tensor(5.4288, device='cuda:0', grad_fn=<NllLossBackward0>) 130000 13200 tensor(5.4770, device='cuda:0', grad_fn=<NllLossBackward0>) 131000 13300 tensor(5.3352, device='cuda:0', grad_fn=<NllLossBackward0>) 132000 13400 tensor(5.4349, device='cuda:0', grad_fn=<NllLossBackward0>) 133000 13500 tensor(5.3860, device='cuda:0', grad_fn=<NllLossBackward0>) 134000 13600 tensor(5.4648, device='cuda:0', grad_fn=<NllLossBackward0>) 135000 13700 tensor(5.4444, device='cuda:0', grad_fn=<NllLossBackward0>) 136000 13800 tensor(5.4320, device='cuda:0', grad_fn=<NllLossBackward0>) 137000 13900 tensor(5.2935, device='cuda:0', grad_fn=<NllLossBackward0>) 138000 14000 tensor(5.5387, device='cuda:0', grad_fn=<NllLossBackward0>) 139000 14100 tensor(5.2424, device='cuda:0', grad_fn=<NllLossBackward0>) 140000 14200 tensor(5.5177, device='cuda:0', grad_fn=<NllLossBackward0>) 141000 14300 tensor(5.4831, device='cuda:0', grad_fn=<NllLossBackward0>) 142000 14400 tensor(5.3877, device='cuda:0', grad_fn=<NllLossBackward0>) 143000 14500 tensor(5.4919, device='cuda:0', grad_fn=<NllLossBackward0>) 144000 14600 tensor(5.5253, device='cuda:0', grad_fn=<NllLossBackward0>) 145000 14700 tensor(5.3948, device='cuda:0', grad_fn=<NllLossBackward0>) 146000 14800 tensor(5.8442, device='cuda:0', grad_fn=<NllLossBackward0>) 147000 14900 tensor(5.4967, device='cuda:0', grad_fn=<NllLossBackward0>) 148000 15000 tensor(5.3788, device='cuda:0', grad_fn=<NllLossBackward0>) 149000 15100 tensor(5.5832, device='cuda:0', grad_fn=<NllLossBackward0>) 150000 15200 tensor(5.4482, device='cuda:0', 
grad_fn=<NllLossBackward0>) 151000 15300 tensor(5.4260, device='cuda:0', grad_fn=<NllLossBackward0>) 152000 15400 tensor(5.3273, device='cuda:0', grad_fn=<NllLossBackward0>) 153000 15500 tensor(5.4840, device='cuda:0', grad_fn=<NllLossBackward0>) 154000 15600 tensor(5.4851, device='cuda:0', grad_fn=<NllLossBackward0>) 155000 15700 tensor(5.3871, device='cuda:0', grad_fn=<NllLossBackward0>) 156000 15800 tensor(5.2933, device='cuda:0', grad_fn=<NllLossBackward0>) 157000 15900 tensor(5.4374, device='cuda:0', grad_fn=<NllLossBackward0>) 158000 16000 tensor(5.2555, device='cuda:0', grad_fn=<NllLossBackward0>) 159000 16100 tensor(5.3127, device='cuda:0', grad_fn=<NllLossBackward0>) 160000 16200 tensor(5.6423, device='cuda:0', grad_fn=<NllLossBackward0>) 161000 16300 tensor(5.4702, device='cuda:0', grad_fn=<NllLossBackward0>) 162000 16400 tensor(5.4419, device='cuda:0', grad_fn=<NllLossBackward0>) 163000 16500 tensor(5.5640, device='cuda:0', grad_fn=<NllLossBackward0>) 164000 16600 tensor(5.4099, device='cuda:0', grad_fn=<NllLossBackward0>) 165000 16700 tensor(5.3822, device='cuda:0', grad_fn=<NllLossBackward0>) 166000 16800 tensor(5.4643, device='cuda:0', grad_fn=<NllLossBackward0>) 167000 16900 tensor(5.2234, device='cuda:0', grad_fn=<NllLossBackward0>) 168000 17000 tensor(5.5021, device='cuda:0', grad_fn=<NllLossBackward0>) 169000 17100 tensor(5.3524, device='cuda:0', grad_fn=<NllLossBackward0>) 170000 17200 tensor(5.4725, device='cuda:0', grad_fn=<NllLossBackward0>) 171000 17300 tensor(5.5034, device='cuda:0', grad_fn=<NllLossBackward0>) 172000 17400 tensor(5.2911, device='cuda:0', grad_fn=<NllLossBackward0>) 173000 17500 tensor(5.3147, device='cuda:0', grad_fn=<NllLossBackward0>) 174000 17600 tensor(5.3426, device='cuda:0', grad_fn=<NllLossBackward0>) 175000 17700 tensor(5.3414, device='cuda:0', grad_fn=<NllLossBackward0>) 176000 17800 tensor(5.3991, device='cuda:0', grad_fn=<NllLossBackward0>) 177000 17900 tensor(5.2936, device='cuda:0', grad_fn=<NllLossBackward0>) 178000 18000 tensor(5.5238, device='cuda:0', grad_fn=<NllLossBackward0>) 179000 18100 tensor(5.4684, device='cuda:0', grad_fn=<NllLossBackward0>) 180000 18200 tensor(5.3916, device='cuda:0', grad_fn=<NllLossBackward0>) 181000 18300 tensor(5.3888, device='cuda:0', grad_fn=<NllLossBackward0>) 182000 18400 tensor(5.4299, device='cuda:0', grad_fn=<NllLossBackward0>) 183000 18500 tensor(5.4103, device='cuda:0', grad_fn=<NllLossBackward0>) 184000 18600 tensor(5.5980, device='cuda:0', grad_fn=<NllLossBackward0>) 185000 18700 tensor(5.4135, device='cuda:0', grad_fn=<NllLossBackward0>) 186000 18800 tensor(5.5855, device='cuda:0', grad_fn=<NllLossBackward0>) 187000 18900 tensor(5.4583, device='cuda:0', grad_fn=<NllLossBackward0>) 188000 19000 tensor(5.4854, device='cuda:0', grad_fn=<NllLossBackward0>) 189000 19100 tensor(5.5879, device='cuda:0', grad_fn=<NllLossBackward0>) 190000 19200 tensor(5.4675, device='cuda:0', grad_fn=<NllLossBackward0>) 191000 19300 tensor(5.5741, device='cuda:0', grad_fn=<NllLossBackward0>) 192000 19400 tensor(5.3977, device='cuda:0', grad_fn=<NllLossBackward0>) 193000 19500 tensor(5.4042, device='cuda:0', grad_fn=<NllLossBackward0>) 194000 19600 tensor(5.4364, device='cuda:0', grad_fn=<NllLossBackward0>) 195000 19700 tensor(5.4868, device='cuda:0', grad_fn=<NllLossBackward0>) 196000 19800 tensor(5.3476, device='cuda:0', grad_fn=<NllLossBackward0>) 197000 19900 tensor(5.3553, device='cuda:0', grad_fn=<NllLossBackward0>) 198000 20000 tensor(5.3707, device='cuda:0', grad_fn=<NllLossBackward0>) 199000 20100 
tensor(5.3226, device='cuda:0', grad_fn=<NllLossBackward0>) 200000 20200 tensor(5.2488, device='cuda:0', grad_fn=<NllLossBackward0>) 201000 20300 tensor(5.3648, device='cuda:0', grad_fn=<NllLossBackward0>) 20400 tensor(5.4156, device='cuda:0', grad_fn=<NllLossBackward0>) 202000 20500 tensor(5.4102, device='cuda:0', grad_fn=<NllLossBackward0>) 203000 20600 tensor(5.6109, device='cuda:0', grad_fn=<NllLossBackward0>) 204000 20700 tensor(5.4335, device='cuda:0', grad_fn=<NllLossBackward0>) 205000 20800 tensor(5.2795, device='cuda:0', grad_fn=<NllLossBackward0>) 206000 20900 tensor(5.5609, device='cuda:0', grad_fn=<NllLossBackward0>) 207000 21000 tensor(5.3918, device='cuda:0', grad_fn=<NllLossBackward0>) 208000 21100 tensor(5.3831, device='cuda:0', grad_fn=<NllLossBackward0>) 209000 21200 tensor(5.2790, device='cuda:0', grad_fn=<NllLossBackward0>) 210000 21300 tensor(5.4710, device='cuda:0', grad_fn=<NllLossBackward0>) 211000 21400 tensor(5.5050, device='cuda:0', grad_fn=<NllLossBackward0>) 212000 21500 tensor(5.2692, device='cuda:0', grad_fn=<NllLossBackward0>) 213000 21600 tensor(5.2668, device='cuda:0', grad_fn=<NllLossBackward0>) 214000 21700 tensor(5.2633, device='cuda:0', grad_fn=<NllLossBackward0>) 215000 21800 tensor(5.4067, device='cuda:0', grad_fn=<NllLossBackward0>) 216000 21900 tensor(5.3829, device='cuda:0', grad_fn=<NllLossBackward0>) 217000 22000 tensor(5.3773, device='cuda:0', grad_fn=<NllLossBackward0>) 218000 22100 tensor(5.2472, device='cuda:0', grad_fn=<NllLossBackward0>) 219000 22200 tensor(5.1171, device='cuda:0', grad_fn=<NllLossBackward0>) 220000 22300 tensor(5.5545, device='cuda:0', grad_fn=<NllLossBackward0>) 221000 22400 tensor(5.2499, device='cuda:0', grad_fn=<NllLossBackward0>) 222000 22500 tensor(5.2943, device='cuda:0', grad_fn=<NllLossBackward0>) 223000 22600 tensor(5.4748, device='cuda:0', grad_fn=<NllLossBackward0>) 224000 22700 tensor(5.2436, device='cuda:0', grad_fn=<NllLossBackward0>) 225000 22800 tensor(5.5053, device='cuda:0', grad_fn=<NllLossBackward0>) 226000 22900 tensor(5.5519, device='cuda:0', grad_fn=<NllLossBackward0>) 227000 23000 tensor(5.4541, device='cuda:0', grad_fn=<NllLossBackward0>) 228000 23100 tensor(5.4279, device='cuda:0', grad_fn=<NllLossBackward0>) 229000 23200 tensor(5.4286, device='cuda:0', grad_fn=<NllLossBackward0>) 230000 23300 tensor(5.5179, device='cuda:0', grad_fn=<NllLossBackward0>) 231000 23400 tensor(5.5355, device='cuda:0', grad_fn=<NllLossBackward0>) 232000 23500 tensor(5.3505, device='cuda:0', grad_fn=<NllLossBackward0>) 233000 23600 tensor(5.3313, device='cuda:0', grad_fn=<NllLossBackward0>) 234000 23700 tensor(5.3509, device='cuda:0', grad_fn=<NllLossBackward0>) 235000 23800 tensor(5.2170, device='cuda:0', grad_fn=<NllLossBackward0>) 236000 23900 tensor(5.3101, device='cuda:0', grad_fn=<NllLossBackward0>) 237000 24000 tensor(5.2962, device='cuda:0', grad_fn=<NllLossBackward0>) 238000 24100 tensor(5.3882, device='cuda:0', grad_fn=<NllLossBackward0>) 239000 24200 tensor(5.5633, device='cuda:0', grad_fn=<NllLossBackward0>) 240000 24300 tensor(5.4595, device='cuda:0', grad_fn=<NllLossBackward0>) 241000 24400 tensor(5.5932, device='cuda:0', grad_fn=<NllLossBackward0>) 242000 24500 tensor(5.3717, device='cuda:0', grad_fn=<NllLossBackward0>) 243000 24600 tensor(5.4943, device='cuda:0', grad_fn=<NllLossBackward0>) 244000 24700 tensor(5.3985, device='cuda:0', grad_fn=<NllLossBackward0>) 245000 24800 tensor(5.4347, device='cuda:0', grad_fn=<NllLossBackward0>) 246000 24900 tensor(5.5008, device='cuda:0', 
grad_fn=<NllLossBackward0>) 247000 25000 tensor(5.5100, device='cuda:0', grad_fn=<NllLossBackward0>) 248000 25100 tensor(5.4427, device='cuda:0', grad_fn=<NllLossBackward0>) 249000 25200 tensor(5.4508, device='cuda:0', grad_fn=<NllLossBackward0>) 250000 25300 tensor(5.4724, device='cuda:0', grad_fn=<NllLossBackward0>) 251000 25400 tensor(5.4525, device='cuda:0', grad_fn=<NllLossBackward0>) 252000 25500 tensor(5.3620, device='cuda:0', grad_fn=<NllLossBackward0>) 253000 25600 tensor(5.6446, device='cuda:0', grad_fn=<NllLossBackward0>) 254000 25700 tensor(5.3966, device='cuda:0', grad_fn=<NllLossBackward0>) 255000 25800 tensor(5.4889, device='cuda:0', grad_fn=<NllLossBackward0>) 256000 25900 tensor(5.4251, device='cuda:0', grad_fn=<NllLossBackward0>) 257000 26000 tensor(5.4346, device='cuda:0', grad_fn=<NllLossBackward0>) 258000 26100 tensor(5.3395, device='cuda:0', grad_fn=<NllLossBackward0>) 259000 26200 tensor(5.2695, device='cuda:0', grad_fn=<NllLossBackward0>) 260000 26300 tensor(5.4767, device='cuda:0', grad_fn=<NllLossBackward0>) 261000 26400 tensor(5.5083, device='cuda:0', grad_fn=<NllLossBackward0>) 262000 26500 tensor(5.2347, device='cuda:0', grad_fn=<NllLossBackward0>) 263000 26600 tensor(5.5761, device='cuda:0', grad_fn=<NllLossBackward0>) 264000 26700 tensor(5.4402, device='cuda:0', grad_fn=<NllLossBackward0>) 265000 26800 tensor(5.6173, device='cuda:0', grad_fn=<NllLossBackward0>) 266000 26900 tensor(5.3775, device='cuda:0', grad_fn=<NllLossBackward0>) 267000 27000 tensor(5.2863, device='cuda:0', grad_fn=<NllLossBackward0>) 268000 27100 tensor(5.3007, device='cuda:0', grad_fn=<NllLossBackward0>) 269000 27200 tensor(5.3551, device='cuda:0', grad_fn=<NllLossBackward0>) 270000 27300 tensor(5.5439, device='cuda:0', grad_fn=<NllLossBackward0>) 271000 27400 tensor(5.4334, device='cuda:0', grad_fn=<NllLossBackward0>) 272000 27500 tensor(5.3266, device='cuda:0', grad_fn=<NllLossBackward0>) 273000 27600 tensor(5.6412, device='cuda:0', grad_fn=<NllLossBackward0>) 274000 27700 tensor(5.4420, device='cuda:0', grad_fn=<NllLossBackward0>) 275000 27800 tensor(5.4381, device='cuda:0', grad_fn=<NllLossBackward0>) 276000 27900 tensor(5.5550, device='cuda:0', grad_fn=<NllLossBackward0>) 277000 28000 tensor(5.4154, device='cuda:0', grad_fn=<NllLossBackward0>) 278000 28100 tensor(5.3823, device='cuda:0', grad_fn=<NllLossBackward0>) 279000 28200 tensor(5.5344, device='cuda:0', grad_fn=<NllLossBackward0>) 280000 28300 tensor(5.1615, device='cuda:0', grad_fn=<NllLossBackward0>) 281000 28400 tensor(5.6069, device='cuda:0', grad_fn=<NllLossBackward0>) 282000 28500 tensor(5.4426, device='cuda:0', grad_fn=<NllLossBackward0>) 283000 28600 tensor(5.3672, device='cuda:0', grad_fn=<NllLossBackward0>) 284000 28700 tensor(5.5133, device='cuda:0', grad_fn=<NllLossBackward0>) 285000 28800 tensor(5.5556, device='cuda:0', grad_fn=<NllLossBackward0>) 286000 28900 tensor(5.4294, device='cuda:0', grad_fn=<NllLossBackward0>) 287000 29000 tensor(5.3359, device='cuda:0', grad_fn=<NllLossBackward0>) 288000 29100 tensor(5.0951, device='cuda:0', grad_fn=<NllLossBackward0>) 289000 29200 tensor(5.2511, device='cuda:0', grad_fn=<NllLossBackward0>) 290000 29300 tensor(5.5364, device='cuda:0', grad_fn=<NllLossBackward0>) 291000 29400 tensor(5.6708, device='cuda:0', grad_fn=<NllLossBackward0>) 292000 29500 tensor(5.4371, device='cuda:0', grad_fn=<NllLossBackward0>) 293000 29600 tensor(5.2942, device='cuda:0', grad_fn=<NllLossBackward0>) 294000 29700 tensor(5.4637, device='cuda:0', grad_fn=<NllLossBackward0>) 295000 29800 
tensor(5.2914, device='cuda:0', grad_fn=<NllLossBackward0>) 296000 29900 tensor(5.5562, device='cuda:0', grad_fn=<NllLossBackward0>) 297000 30000 tensor(5.2833, device='cuda:0', grad_fn=<NllLossBackward0>) 298000 30100 tensor(5.3481, device='cuda:0', grad_fn=<NllLossBackward0>) 299000 30200 tensor(5.3122, device='cuda:0', grad_fn=<NllLossBackward0>) 300000 30300 tensor(5.4103, device='cuda:0', grad_fn=<NllLossBackward0>) 301000 30400 tensor(5.2480, device='cuda:0', grad_fn=<NllLossBackward0>) 302000 30500 tensor(5.4258, device='cuda:0', grad_fn=<NllLossBackward0>) 30600 tensor(5.3835, device='cuda:0', grad_fn=<NllLossBackward0>) 303000 30700 tensor(5.4193, device='cuda:0', grad_fn=<NllLossBackward0>) 304000 30800 tensor(5.4438, device='cuda:0', grad_fn=<NllLossBackward0>) 305000 30900 tensor(5.4518, device='cuda:0', grad_fn=<NllLossBackward0>) 306000 31000 tensor(5.5607, device='cuda:0', grad_fn=<NllLossBackward0>) 307000 31100 tensor(5.2059, device='cuda:0', grad_fn=<NllLossBackward0>) 308000 31200 tensor(5.2571, device='cuda:0', grad_fn=<NllLossBackward0>) 309000 31300 tensor(5.5208, device='cuda:0', grad_fn=<NllLossBackward0>) 310000 31400 tensor(5.3061, device='cuda:0', grad_fn=<NllLossBackward0>) 311000 31500 tensor(5.4834, device='cuda:0', grad_fn=<NllLossBackward0>) 312000 31600 tensor(5.4653, device='cuda:0', grad_fn=<NllLossBackward0>) 313000 31700 tensor(5.4308, device='cuda:0', grad_fn=<NllLossBackward0>) 314000 31800 tensor(5.5400, device='cuda:0', grad_fn=<NllLossBackward0>) 315000 31900 tensor(5.1536, device='cuda:0', grad_fn=<NllLossBackward0>) 316000 32000 tensor(5.3460, device='cuda:0', grad_fn=<NllLossBackward0>) 317000 32100 tensor(5.2300, device='cuda:0', grad_fn=<NllLossBackward0>) 318000 32200 tensor(5.5511, device='cuda:0', grad_fn=<NllLossBackward0>) 319000 32300 tensor(5.5391, device='cuda:0', grad_fn=<NllLossBackward0>) 320000 32400 tensor(5.5157, device='cuda:0', grad_fn=<NllLossBackward0>) 321000 32500 tensor(5.3336, device='cuda:0', grad_fn=<NllLossBackward0>) 322000 32600 tensor(5.4475, device='cuda:0', grad_fn=<NllLossBackward0>) 323000 32700 tensor(5.3894, device='cuda:0', grad_fn=<NllLossBackward0>) 324000 32800 tensor(5.6022, device='cuda:0', grad_fn=<NllLossBackward0>) 325000 32900 tensor(5.4663, device='cuda:0', grad_fn=<NllLossBackward0>) 326000 33000 tensor(5.2387, device='cuda:0', grad_fn=<NllLossBackward0>) 327000 33100 tensor(5.4446, device='cuda:0', grad_fn=<NllLossBackward0>) 328000 33200 tensor(5.5450, device='cuda:0', grad_fn=<NllLossBackward0>) 329000 33300 tensor(5.3179, device='cuda:0', grad_fn=<NllLossBackward0>) 330000 33400 tensor(5.5905, device='cuda:0', grad_fn=<NllLossBackward0>) 331000 33500 tensor(5.4066, device='cuda:0', grad_fn=<NllLossBackward0>) 332000 33600 tensor(5.3542, device='cuda:0', grad_fn=<NllLossBackward0>) 333000 33700 tensor(5.4097, device='cuda:0', grad_fn=<NllLossBackward0>) 334000 33800 tensor(5.4912, device='cuda:0', grad_fn=<NllLossBackward0>) 335000 33900 tensor(5.2358, device='cuda:0', grad_fn=<NllLossBackward0>) 336000 34000 tensor(5.4470, device='cuda:0', grad_fn=<NllLossBackward0>) 337000 34100 tensor(5.4207, device='cuda:0', grad_fn=<NllLossBackward0>) 338000 34200 tensor(5.4651, device='cuda:0', grad_fn=<NllLossBackward0>) 339000 34300 tensor(5.2545, device='cuda:0', grad_fn=<NllLossBackward0>) 340000 34400 tensor(5.7106, device='cuda:0', grad_fn=<NllLossBackward0>) 341000 34500 tensor(5.5699, device='cuda:0', grad_fn=<NllLossBackward0>) 342000 34600 tensor(5.4638, device='cuda:0', 
grad_fn=<NllLossBackward0>) 343000 34700 tensor(5.5382, device='cuda:0', grad_fn=<NllLossBackward0>) 344000 ...
[training log truncated: the per-batch cross-entropy loss, printed every 100 batches, continues through the rest of this epoch and through "epoch: = 3" and "epoch: = 4"; the values fluctuate between roughly 5.04 and 5.79 with no clear downward trend]
... 43400 tensor(5.3933, device='cuda:0', grad_fn=<NllLossBackward0>) 430000 43500 tensor(5.1078, device='cuda:0', grad_fn=<NllLossBackward0>) 431000 43600 tensor(5.3045, device='cuda:0', grad_fn=<NllLossBackward0>) 432000
device = 'cuda'
torch.cuda.empty_cache()
# Reload the trained weights and switch the model to inference mode
model = SimpleBigramNeuralLanguageModel(vocab_size, embed_size).to(device)
model.load_state_dict(torch.load('model-bigram_final.bin'))
model.eval()
# Top 10 most probable successors of 'will'
ixs = torch.tensor(vocab.forward(['will'])).to(device)
out = model(ixs)
top = torch.topk(out[0], 10)
top_indices = top.indices.tolist()
top_probs = top.values.tolist()
top_words = vocab.lookup_tokens(top_indices)
list(zip(top_words, top_indices, top_probs))
[('be', 11, 0.2570849657058716), ('<unk>', 0, 0.07411641627550125), ('not', 22, 0.05940083786845207), ('have', 28, 0.02751326560974121), ('bo', 167, 0.014936885796487331), ('make', 116, 0.013943656347692013), ('give', 193, 0.011286991648375988), ('take', 153, 0.011171611957252026), ('do', 86, 0.010088067501783371), ('he', 20, 0.009703895077109337)]
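The values in the third column are non-negative and sum to well below one, which is consistent with the model ending in a softmax layer. A quick sanity check under that assumption (a minimal sketch reusing model, vocab, and device from above):
# Sanity check (assumes the model's final layer is a softmax):
# the full distribution over the vocabulary should sum to ~1.0
with torch.no_grad():
    dist = model(torch.tensor(vocab.forward(['will'])).to(device))
print(dist[0].sum().item())  # expected: close to 1.0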
vocab = train_dataset.vocab
# The same top-10 query for 'cerned.', likely the tail of a word split by
# hyphenation at a line break in the corpus (e.g. 'con-cerned.')
ixs = torch.tensor(vocab.forward(['cerned.'])).to(device)
out = model(ixs)
top = torch.topk(out[0], 10)
top_indices = top.indices.tolist()
top_probs = top.values.tolist()
top_words = vocab.lookup_tokens(top_indices)
list(zip(top_words, top_indices, top_probs))
[('<unk>', 0, 0.19996878504753113), ('and', 3, 0.05288130044937134), ('of', 2, 0.042051784694194794), ('the', 1, 0.026572922244668007), ('to', 4, 0.022689413279294968), ('in', 6, 0.015904497355222702), ('The', 17, 0.012827681377530098), ('a', 5, 0.00961760152131319), ('for', 8, 0.008938422426581383), ('</s>', 32, 0.00840282253921032)]
# Cosine similarity between the embedding of 'cerned.' and every other
# embedding; the embedding matrix is the weight of the model's first layer
cos = nn.CosineSimilarity(dim=1, eps=1e-6)
embeddings = model.model[0].weight
vec = embeddings[vocab['cerned.']]
similarities = cos(vec, embeddings)  # vec broadcasts against (vocab_size, embed_size)
top = torch.topk(similarities, 10)
top_indices = top.indices.tolist()
top_probs = top.values.tolist()
top_words = vocab.lookup_tokens(top_indices)
list(zip(top_words, top_indices, top_probs))
[('<unk>', 0, 1.0), ('particular,', 14538, 0.24527804553508759), ('revolution.', 20446, 0.23776617646217346), ('Territory.', 14189, 0.23417341709136963), ('or-', 2261, 0.22888363897800446), ('3', 479, 0.2288265973329544), ('speak.', 13722, 0.2252315878868103), ('attend.', 19397, 0.22110989689826965), ('say,', 1455, 0.22106117010116577), ('Lee.', 15326, 0.21764159202575684)]
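The similarity of exactly 1.0 with '<unk>' shows that 'cerned.' is out of vocabulary and maps to index 0, so both this list and the prediction list above are really those of the <unk> embedding. For comparison, a minimal sketch of the same nearest-neighbour query for an in-vocabulary word ('will' is just an example), reusing cos and embeddings from above:
# Nearest neighbours of an in-vocabulary word in embedding space
vec = embeddings[vocab['will']]
similarities = cos(vec, embeddings)
top = torch.topk(similarities, 10)
list(zip(vocab.lookup_tokens(top.indices.tolist()), top.values.tolist()))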
def get_values_from_model(presc_word, model, vocab, k):
    # Return the k most probable successors of presc_word as (word, prob) pairs
    ixs = torch.tensor(vocab.forward([presc_word])).to(device)
    out = model(ixs)
    top = torch.topk(out[0], k)
    top_indices = top.indices.tolist()
    top_probs = top.values.tolist()
    top_words = vocab.lookup_tokens(top_indices)
    return list(zip(top_words, top_probs))
def gonito_format(dic):
    # Render a (word -> probability) dict as one line of the gonito
    # submission format: tab-separated word:prob pairs, with a final bare
    # ':prob' field carrying the <unk> (wildcard) mass
    tab = summarize_probs_unk(dic)
    result = ''
    for element in tab[:-1]:
        result += str(element[0]) + ':' + str(element[1]) + '\t'
    result += ':' + str(tab[-1][1]) + '\n'
    return result
def summarize_probs_unk(dic):
    # Normalize the probabilities so they sum to 1 and move the <unk> mass
    # to the end of the list, where it becomes the wildcard entry
    if '<unk>' in dic.keys():
        probsum = sum(float(val) for key, val in dic.items())
        for key in dic:
            dic[key] = dic[key] / probsum  # renormalize; <unk> supplies the wildcard mass
        wildcard = dic['<unk>']
        del dic['<unk>']
        tab = [(key, val) for key, val in dic.items()]
        tab.append(('<unk>', wildcard))
    else:
        # No <unk> among the predictions: scale everything down by
        # (1 + wildcard_minweight), a global set earlier, so the wildcard
        # keeps some probability mass (plus, because it is in the denominator)
        probsum = sum(float(val) for key, val in dic.items())
        for key in dic:
            dic[key] = dic[key] / (probsum * (1 + wildcard_minweight))
        tab = [(key, val) for key, val in dic.items()]
        tab.append(('<unk>', 1 - sum(val for val in dic.values())))
    return tab
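To illustrate how the two helpers interact, here is a toy example with made-up probabilities (hypothetical values, not model output): the dict is renormalized, the <unk> mass moves to the trailing bare ':' field, and the result is one tab-separated line per test item.
# Hypothetical input: three candidate words with raw probabilities
example = {'be': 0.25, 'not': 0.06, '<unk>': 0.07}
print(gonito_format(example))
# -> be:0.657...<TAB>not:0.157...<TAB>:0.184...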
model.load_state_dict(torch.load('model-bigram_final.bin'))
<All keys matched successfully>
# Run the model over the test set: for each line, take the last word of the
# left context and write its predicted distribution in the gonito format
with lzma.open(test_file, 'rt') as file:
    predict_words = []
    results = []
    for line in file:
        line = preprocess(line)  # keep only the relevant fields
        split = line.split('\t')
        predict_words.append(get_last_word(split[0]))
    vocab = train_dataset.vocab
    for presc_word in predict_words:
        results.append(dict(get_values_from_model(presc_word, model, vocab, k=k)))
    with open(out_file, 'w') as outfile:
        for elem in results:
            outfile.write(gonito_format(elem))
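As a final check, it may be worth verifying that every line written to out_file parses back and that its probability mass does not exceed 1 (a minimal sketch, assuming out_file as above):
# Validate the submission file: each tab-separated field ends in ':prob',
# and the probabilities on each line should sum to at most 1.0
with open(out_file) as f:
    for i, line in enumerate(f):
        fields = line.rstrip('\n').split('\t')
        total = sum(float(field.split(':')[-1]) for field in fields)
        assert total <= 1.0 + 1e-6, f'line {i}: mass {total} > 1'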