6219 lines
261 KiB
Plaintext
6219 lines
261 KiB
Plaintext
{
|
|
"cells": [
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 7,
|
|
"metadata": {
|
|
"collapsed": true
|
|
},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"Defaulting to user installation because normal site-packages is not writeable\n",
|
|
"Collecting torchtext\n",
|
|
" Downloading torchtext-0.15.2-cp310-cp310-manylinux1_x86_64.whl (2.0 MB)\n",
|
|
"\u001b[2K \u001b[38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m2.0/2.0 MB\u001b[0m \u001b[31m1.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m[36m0:00:01\u001b[0m[36m0:00:01\u001b[0m:01\u001b[0m\n",
|
|
"\u001b[?25hCollecting tqdm\n",
|
|
" Using cached tqdm-4.65.0-py3-none-any.whl (77 kB)\n",
|
|
"Requirement already satisfied: numpy in /home/gedin/.local/lib/python3.10/site-packages (from torchtext) (1.24.3)\n",
|
|
"Collecting torchdata==0.6.1\n",
|
|
" Downloading torchdata-0.6.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (4.6 MB)\n",
|
|
"\u001b[2K \u001b[38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m4.6/4.6 MB\u001b[0m \u001b[31m1.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0mm eta \u001b[36m0:00:01\u001b[0m[36m0:00:01\u001b[0m\n",
|
|
"\u001b[?25hRequirement already satisfied: requests in /usr/lib/python3/dist-packages (from torchtext) (2.25.1)\n",
|
|
"Collecting torch==2.0.1\n",
|
|
" Downloading torch-2.0.1-cp310-cp310-manylinux1_x86_64.whl (619.9 MB)\n",
|
|
"\u001b[2K \u001b[38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m619.9/619.9 MB\u001b[0m \u001b[31m1.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0mm eta \u001b[36m0:00:01\u001b[0m[36m0:00:09\u001b[0m\n",
|
|
"\u001b[?25hCollecting sympy\n",
|
|
" Downloading sympy-1.12-py3-none-any.whl (5.7 MB)\n",
|
|
"\u001b[2K \u001b[38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m5.7/5.7 MB\u001b[0m \u001b[31m1.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0mm eta \u001b[36m0:00:01\u001b[0m[36m0:00:01\u001b[0m\n",
|
|
"\u001b[?25hCollecting nvidia-cudnn-cu11==8.5.0.96\n",
|
|
" Using cached nvidia_cudnn_cu11-8.5.0.96-2-py3-none-manylinux1_x86_64.whl (557.1 MB)\n",
|
|
"Collecting nvidia-cuda-cupti-cu11==11.7.101\n",
|
|
" Using cached nvidia_cuda_cupti_cu11-11.7.101-py3-none-manylinux1_x86_64.whl (11.8 MB)\n",
|
|
"Collecting nvidia-cusparse-cu11==11.7.4.91\n",
|
|
" Using cached nvidia_cusparse_cu11-11.7.4.91-py3-none-manylinux1_x86_64.whl (173.2 MB)\n",
|
|
"Collecting networkx\n",
|
|
" Using cached networkx-3.1-py3-none-any.whl (2.1 MB)\n",
|
|
"Collecting nvidia-cufft-cu11==10.9.0.58\n",
|
|
" Using cached nvidia_cufft_cu11-10.9.0.58-py3-none-manylinux1_x86_64.whl (168.4 MB)\n",
|
|
"Collecting filelock\n",
|
|
" Downloading filelock-3.12.0-py3-none-any.whl (10 kB)\n",
|
|
"Collecting nvidia-cuda-runtime-cu11==11.7.99\n",
|
|
" Using cached nvidia_cuda_runtime_cu11-11.7.99-py3-none-manylinux1_x86_64.whl (849 kB)\n",
|
|
"Collecting triton==2.0.0\n",
|
|
" Downloading triton-2.0.0-1-cp310-cp310-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (63.3 MB)\n",
|
|
"\u001b[2K \u001b[38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m63.3/63.3 MB\u001b[0m \u001b[31m1.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0mm eta \u001b[36m0:00:01\u001b[0m[36m0:00:02\u001b[0m\n",
|
|
"\u001b[?25hCollecting nvidia-cusolver-cu11==11.4.0.1\n",
|
|
" Using cached nvidia_cusolver_cu11-11.4.0.1-2-py3-none-manylinux1_x86_64.whl (102.6 MB)\n",
|
|
"Requirement already satisfied: jinja2 in /home/gedin/.local/lib/python3.10/site-packages (from torch==2.0.1->torchtext) (3.1.2)\n",
|
|
"Collecting nvidia-cublas-cu11==11.10.3.66\n",
|
|
" Using cached nvidia_cublas_cu11-11.10.3.66-py3-none-manylinux1_x86_64.whl (317.1 MB)\n",
|
|
"Collecting typing-extensions\n",
|
|
" Downloading typing_extensions-4.6.3-py3-none-any.whl (31 kB)\n",
|
|
"Collecting nvidia-nccl-cu11==2.14.3\n",
|
|
" Using cached nvidia_nccl_cu11-2.14.3-py3-none-manylinux1_x86_64.whl (177.1 MB)\n",
|
|
"Collecting nvidia-cuda-nvrtc-cu11==11.7.99\n",
|
|
" Using cached nvidia_cuda_nvrtc_cu11-11.7.99-2-py3-none-manylinux1_x86_64.whl (21.0 MB)\n",
|
|
"Collecting nvidia-curand-cu11==10.2.10.91\n",
|
|
" Using cached nvidia_curand_cu11-10.2.10.91-py3-none-manylinux1_x86_64.whl (54.6 MB)\n",
|
|
"Collecting nvidia-nvtx-cu11==11.7.91\n",
|
|
" Using cached nvidia_nvtx_cu11-11.7.91-py3-none-manylinux1_x86_64.whl (98 kB)\n",
|
|
"Requirement already satisfied: urllib3>=1.25 in /usr/lib/python3/dist-packages (from torchdata==0.6.1->torchtext) (1.26.5)\n",
|
|
"Requirement already satisfied: wheel in /usr/lib/python3/dist-packages (from nvidia-cublas-cu11==11.10.3.66->torch==2.0.1->torchtext) (0.37.1)\n",
|
|
"Requirement already satisfied: setuptools in /usr/lib/python3/dist-packages (from nvidia-cublas-cu11==11.10.3.66->torch==2.0.1->torchtext) (59.6.0)\n",
|
|
"Collecting lit\n",
|
|
" Downloading lit-16.0.5.tar.gz (138 kB)\n",
|
|
"\u001b[2K \u001b[38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m138.0/138.0 KB\u001b[0m \u001b[31m1.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m[31m1.6 MB/s\u001b[0m eta \u001b[36m0:00:01\u001b[0m\n",
|
|
"\u001b[?25h Preparing metadata (setup.py) ... \u001b[?25ldone\n",
|
|
"\u001b[?25hCollecting cmake\n",
|
|
" Using cached cmake-3.26.3-py2.py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (24.0 MB)\n",
|
|
"Requirement already satisfied: MarkupSafe>=2.0 in /usr/lib/python3/dist-packages (from jinja2->torch==2.0.1->torchtext) (2.0.1)\n",
|
|
"Collecting mpmath>=0.19\n",
|
|
" Using cached mpmath-1.3.0-py3-none-any.whl (536 kB)\n",
|
|
"Building wheels for collected packages: lit\n",
|
|
" Building wheel for lit (setup.py) ... \u001b[?25ldone\n",
|
|
"\u001b[?25h Created wheel for lit: filename=lit-16.0.5-py3-none-any.whl size=88192 sha256=f6c57a31a147cbfe0af3d6bf4b856390ad14c28a9ddb38c8044ec29331b35c26\n",
|
|
" Stored in directory: /home/gedin/.cache/pip/wheels/eb/02/84/d82f0b1a6098209edf7e3607be6cc592ebbc015a8a3127c68d\n",
|
|
"Successfully built lit\n",
|
|
"Installing collected packages: mpmath, lit, cmake, typing-extensions, tqdm, sympy, nvidia-nvtx-cu11, nvidia-nccl-cu11, nvidia-cusparse-cu11, nvidia-curand-cu11, nvidia-cufft-cu11, nvidia-cuda-runtime-cu11, nvidia-cuda-nvrtc-cu11, nvidia-cuda-cupti-cu11, nvidia-cublas-cu11, networkx, filelock, nvidia-cusolver-cu11, nvidia-cudnn-cu11, triton, torch, torchdata, torchtext\n",
|
|
"Successfully installed cmake-3.26.3 filelock-3.12.0 lit-16.0.5 mpmath-1.3.0 networkx-3.1 nvidia-cublas-cu11-11.10.3.66 nvidia-cuda-cupti-cu11-11.7.101 nvidia-cuda-nvrtc-cu11-11.7.99 nvidia-cuda-runtime-cu11-11.7.99 nvidia-cudnn-cu11-8.5.0.96 nvidia-cufft-cu11-10.9.0.58 nvidia-curand-cu11-10.2.10.91 nvidia-cusolver-cu11-11.4.0.1 nvidia-cusparse-cu11-11.7.4.91 nvidia-nccl-cu11-2.14.3 nvidia-nvtx-cu11-11.7.91 sympy-1.12 torch-2.0.1 torchdata-0.6.1 torchtext-0.15.2 tqdm-4.65.0 triton-2.0.0 typing-extensions-4.6.3\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"!pip install torchtext"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 25,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"train_file ='train/in.tsv.xz'\n",
|
|
"test_file = 'dev-0/in.tsv.xz'\n",
|
|
"out_file = 'dev-0/out.tsv'"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 8,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"from itertools import islice\n",
|
|
"import regex as re\n",
|
|
"import sys\n",
|
|
"from torchtext.vocab import build_vocab_from_iterator\n",
|
|
"import lzma\n",
|
|
"import pickle\n",
|
|
"import re\n",
|
|
"import torch\n",
|
|
"from torch import nn\n",
|
|
"from torch.utils.data import IterableDataset\n",
|
|
"import itertools\n",
|
|
"from torch.utils.data import DataLoader\n",
|
|
"import gc"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 23,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"embed_size = 300\n",
|
|
"device = 'cuda'\n",
|
|
"vocab_size = 25000\n",
|
|
"batch_s = 3200\n",
|
|
"learning_rate = 0.0001\n",
|
|
"epochs = 4\n",
|
|
"k = 20 #top k words\n",
|
|
"wildcard_minweight = 0.1"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 10,
|
|
"metadata": {
|
|
"scrolled": true
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"###preprocessing\n",
|
|
"def preprocess(line):\n",
|
|
" line = get_rid_of_header(line)\n",
|
|
" line = replace_endline(line)\n",
|
|
" return line\n",
|
|
"\n",
|
|
"def get_rid_of_header(line):\n",
|
|
" line = line.split('\\t')[6:]\n",
|
|
" return \"\".join(line)\n",
|
|
" \n",
|
|
"def replace_endline(line):\n",
|
|
" line = line.replace(\"\\\\n\", \" \")\n",
|
|
" return line\n",
|
|
"\n",
|
|
"\n",
|
|
"def get_last_word(text):\n",
|
|
" \"\"\"Return the last word of a string.\"\"\"\n",
|
|
" last_word = \"\"\n",
|
|
" for i in range(len(text)-1, -1, -1):\n",
|
|
" if text[i] == ' ':\n",
|
|
" return last_word[::-1].rstrip()\n",
|
|
" else:\n",
|
|
" last_word += text[i]\n",
|
|
" return last_word[::-1].rstrip()\n",
|
|
"\n",
|
|
"def get_first_word(text):\n",
|
|
" \"\"\"Return the first word of a string.\"\"\"\n",
|
|
" word = \"\"\n",
|
|
" for i in range(len(text)-1):\n",
|
|
" if text[i] == ' ':\n",
|
|
" return word\n",
|
|
" else:\n",
|
|
" word += text[i]\n",
|
|
" return word\n",
|
|
"\n",
|
|
"\n",
|
|
"def get_words_from_line(line):\n",
|
|
" line = line.rstrip()\n",
|
|
" yield '<s>'\n",
|
|
" line = preprocess(line)\n",
|
|
" for t in line.split(' '):\n",
|
|
" yield t\n",
|
|
" yield '</s>'\n",
|
|
"\n",
|
|
"\n",
|
|
"def get_word_lines_from_file(file_name):\n",
|
|
" n = 0\n",
|
|
" with lzma.open(file_name, 'r') as fh:\n",
|
|
" for line in fh:\n",
|
|
" n+=1\n",
|
|
" if n%1000==0:\n",
|
|
" print(n)\n",
|
|
" yield get_words_from_line(line.decode('utf-8'))"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 11,
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"1000\n",
|
|
"2000\n",
|
|
"3000\n",
|
|
"4000\n",
|
|
"5000\n",
|
|
"6000\n",
|
|
"7000\n",
|
|
"8000\n",
|
|
"9000\n",
|
|
"10000\n",
|
|
"11000\n",
|
|
"12000\n",
|
|
"13000\n",
|
|
"14000\n",
|
|
"15000\n",
|
|
"16000\n",
|
|
"17000\n",
|
|
"18000\n",
|
|
"19000\n",
|
|
"20000\n",
|
|
"21000\n",
|
|
"22000\n",
|
|
"23000\n",
|
|
"24000\n",
|
|
"25000\n",
|
|
"26000\n",
|
|
"27000\n",
|
|
"28000\n",
|
|
"29000\n",
|
|
"30000\n",
|
|
"31000\n",
|
|
"32000\n",
|
|
"33000\n",
|
|
"34000\n",
|
|
"35000\n",
|
|
"36000\n",
|
|
"37000\n",
|
|
"38000\n",
|
|
"39000\n",
|
|
"40000\n",
|
|
"41000\n",
|
|
"42000\n",
|
|
"43000\n",
|
|
"44000\n",
|
|
"45000\n",
|
|
"46000\n",
|
|
"47000\n",
|
|
"48000\n",
|
|
"49000\n",
|
|
"50000\n",
|
|
"51000\n",
|
|
"52000\n",
|
|
"53000\n",
|
|
"54000\n",
|
|
"55000\n",
|
|
"56000\n",
|
|
"57000\n",
|
|
"58000\n",
|
|
"59000\n",
|
|
"60000\n",
|
|
"61000\n",
|
|
"62000\n",
|
|
"63000\n",
|
|
"64000\n",
|
|
"65000\n",
|
|
"66000\n",
|
|
"67000\n",
|
|
"68000\n",
|
|
"69000\n",
|
|
"70000\n",
|
|
"71000\n",
|
|
"72000\n",
|
|
"73000\n",
|
|
"74000\n",
|
|
"75000\n",
|
|
"76000\n",
|
|
"77000\n",
|
|
"78000\n",
|
|
"79000\n",
|
|
"80000\n",
|
|
"81000\n",
|
|
"82000\n",
|
|
"83000\n",
|
|
"84000\n",
|
|
"85000\n",
|
|
"86000\n",
|
|
"87000\n",
|
|
"88000\n",
|
|
"89000\n",
|
|
"90000\n",
|
|
"91000\n",
|
|
"92000\n",
|
|
"93000\n",
|
|
"94000\n",
|
|
"95000\n",
|
|
"96000\n",
|
|
"97000\n",
|
|
"98000\n",
|
|
"99000\n",
|
|
"100000\n",
|
|
"101000\n",
|
|
"102000\n",
|
|
"103000\n",
|
|
"104000\n",
|
|
"105000\n",
|
|
"106000\n",
|
|
"107000\n",
|
|
"108000\n",
|
|
"109000\n",
|
|
"110000\n",
|
|
"111000\n",
|
|
"112000\n",
|
|
"113000\n",
|
|
"114000\n",
|
|
"115000\n",
|
|
"116000\n",
|
|
"117000\n",
|
|
"118000\n",
|
|
"119000\n",
|
|
"120000\n",
|
|
"121000\n",
|
|
"122000\n",
|
|
"123000\n",
|
|
"124000\n",
|
|
"125000\n",
|
|
"126000\n",
|
|
"127000\n",
|
|
"128000\n",
|
|
"129000\n",
|
|
"130000\n",
|
|
"131000\n",
|
|
"132000\n",
|
|
"133000\n",
|
|
"134000\n",
|
|
"135000\n",
|
|
"136000\n",
|
|
"137000\n",
|
|
"138000\n",
|
|
"139000\n",
|
|
"140000\n",
|
|
"141000\n",
|
|
"142000\n",
|
|
"143000\n",
|
|
"144000\n",
|
|
"145000\n",
|
|
"146000\n",
|
|
"147000\n",
|
|
"148000\n",
|
|
"149000\n",
|
|
"150000\n",
|
|
"151000\n",
|
|
"152000\n",
|
|
"153000\n",
|
|
"154000\n",
|
|
"155000\n",
|
|
"156000\n",
|
|
"157000\n",
|
|
"158000\n",
|
|
"159000\n",
|
|
"160000\n",
|
|
"161000\n",
|
|
"162000\n",
|
|
"163000\n",
|
|
"164000\n",
|
|
"165000\n",
|
|
"166000\n",
|
|
"167000\n",
|
|
"168000\n",
|
|
"169000\n",
|
|
"170000\n",
|
|
"171000\n",
|
|
"172000\n",
|
|
"173000\n",
|
|
"174000\n",
|
|
"175000\n",
|
|
"176000\n",
|
|
"177000\n",
|
|
"178000\n",
|
|
"179000\n",
|
|
"180000\n",
|
|
"181000\n",
|
|
"182000\n",
|
|
"183000\n",
|
|
"184000\n",
|
|
"185000\n",
|
|
"186000\n",
|
|
"187000\n",
|
|
"188000\n",
|
|
"189000\n",
|
|
"190000\n",
|
|
"191000\n",
|
|
"192000\n",
|
|
"193000\n",
|
|
"194000\n",
|
|
"195000\n",
|
|
"196000\n",
|
|
"197000\n",
|
|
"198000\n",
|
|
"199000\n",
|
|
"200000\n",
|
|
"201000\n",
|
|
"202000\n",
|
|
"203000\n",
|
|
"204000\n",
|
|
"205000\n",
|
|
"206000\n",
|
|
"207000\n",
|
|
"208000\n",
|
|
"209000\n",
|
|
"210000\n",
|
|
"211000\n",
|
|
"212000\n",
|
|
"213000\n",
|
|
"214000\n",
|
|
"215000\n",
|
|
"216000\n",
|
|
"217000\n",
|
|
"218000\n",
|
|
"219000\n",
|
|
"220000\n",
|
|
"221000\n",
|
|
"222000\n",
|
|
"223000\n",
|
|
"224000\n",
|
|
"225000\n",
|
|
"226000\n",
|
|
"227000\n",
|
|
"228000\n",
|
|
"229000\n",
|
|
"230000\n",
|
|
"231000\n",
|
|
"232000\n",
|
|
"233000\n",
|
|
"234000\n",
|
|
"235000\n",
|
|
"236000\n",
|
|
"237000\n",
|
|
"238000\n",
|
|
"239000\n",
|
|
"240000\n",
|
|
"241000\n",
|
|
"242000\n",
|
|
"243000\n",
|
|
"244000\n",
|
|
"245000\n",
|
|
"246000\n",
|
|
"247000\n",
|
|
"248000\n",
|
|
"249000\n",
|
|
"250000\n",
|
|
"251000\n",
|
|
"252000\n",
|
|
"253000\n",
|
|
"254000\n",
|
|
"255000\n",
|
|
"256000\n",
|
|
"257000\n",
|
|
"258000\n",
|
|
"259000\n",
|
|
"260000\n",
|
|
"261000\n",
|
|
"262000\n",
|
|
"263000\n",
|
|
"264000\n",
|
|
"265000\n",
|
|
"266000\n",
|
|
"267000\n",
|
|
"268000\n",
|
|
"269000\n",
|
|
"270000\n",
|
|
"271000\n",
|
|
"272000\n",
|
|
"273000\n",
|
|
"274000\n",
|
|
"275000\n",
|
|
"276000\n",
|
|
"277000\n",
|
|
"278000\n",
|
|
"279000\n",
|
|
"280000\n",
|
|
"281000\n",
|
|
"282000\n",
|
|
"283000\n",
|
|
"284000\n",
|
|
"285000\n",
|
|
"286000\n",
|
|
"287000\n",
|
|
"288000\n",
|
|
"289000\n",
|
|
"290000\n",
|
|
"291000\n",
|
|
"292000\n",
|
|
"293000\n",
|
|
"294000\n",
|
|
"295000\n",
|
|
"296000\n",
|
|
"297000\n",
|
|
"298000\n",
|
|
"299000\n",
|
|
"300000\n",
|
|
"301000\n",
|
|
"302000\n",
|
|
"303000\n",
|
|
"304000\n",
|
|
"305000\n",
|
|
"306000\n",
|
|
"307000\n",
|
|
"308000\n",
|
|
"309000\n",
|
|
"310000\n",
|
|
"311000\n",
|
|
"312000\n",
|
|
"313000\n",
|
|
"314000\n",
|
|
"315000\n",
|
|
"316000\n",
|
|
"317000\n",
|
|
"318000\n",
|
|
"319000\n",
|
|
"320000\n",
|
|
"321000\n",
|
|
"322000\n",
|
|
"323000\n",
|
|
"324000\n",
|
|
"325000\n",
|
|
"326000\n",
|
|
"327000\n",
|
|
"328000\n",
|
|
"329000\n",
|
|
"330000\n",
|
|
"331000\n",
|
|
"332000\n",
|
|
"333000\n",
|
|
"334000\n",
|
|
"335000\n",
|
|
"336000\n",
|
|
"337000\n",
|
|
"338000\n",
|
|
"339000\n",
|
|
"340000\n",
|
|
"341000\n",
|
|
"342000\n",
|
|
"343000\n",
|
|
"344000\n",
|
|
"345000\n",
|
|
"346000\n",
|
|
"347000\n",
|
|
"348000\n",
|
|
"349000\n",
|
|
"350000\n",
|
|
"351000\n",
|
|
"352000\n",
|
|
"353000\n",
|
|
"354000\n",
|
|
"355000\n",
|
|
"356000\n",
|
|
"357000\n",
|
|
"358000\n",
|
|
"359000\n",
|
|
"360000\n",
|
|
"361000\n",
|
|
"362000\n",
|
|
"363000\n",
|
|
"364000\n",
|
|
"365000\n",
|
|
"366000\n",
|
|
"367000\n",
|
|
"368000\n",
|
|
"369000\n",
|
|
"370000\n",
|
|
"371000\n",
|
|
"372000\n",
|
|
"373000\n",
|
|
"374000\n",
|
|
"375000\n",
|
|
"376000\n",
|
|
"377000\n",
|
|
"378000\n",
|
|
"379000\n",
|
|
"380000\n",
|
|
"381000\n",
|
|
"382000\n",
|
|
"383000\n",
|
|
"384000\n",
|
|
"385000\n",
|
|
"386000\n",
|
|
"387000\n",
|
|
"388000\n",
|
|
"389000\n",
|
|
"390000\n",
|
|
"391000\n",
|
|
"392000\n",
|
|
"393000\n",
|
|
"394000\n",
|
|
"395000\n",
|
|
"396000\n",
|
|
"397000\n",
|
|
"398000\n",
|
|
"399000\n",
|
|
"400000\n",
|
|
"401000\n",
|
|
"402000\n",
|
|
"403000\n",
|
|
"404000\n",
|
|
"405000\n",
|
|
"406000\n",
|
|
"407000\n",
|
|
"408000\n",
|
|
"409000\n",
|
|
"410000\n",
|
|
"411000\n",
|
|
"412000\n",
|
|
"413000\n",
|
|
"414000\n",
|
|
"415000\n",
|
|
"416000\n",
|
|
"417000\n",
|
|
"418000\n",
|
|
"419000\n",
|
|
"420000\n",
|
|
"421000\n",
|
|
"422000\n",
|
|
"423000\n",
|
|
"424000\n",
|
|
"425000\n",
|
|
"426000\n",
|
|
"427000\n",
|
|
"428000\n",
|
|
"429000\n",
|
|
"430000\n",
|
|
"431000\n",
|
|
"432000\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"vocab = build_vocab_from_iterator(\n",
|
|
" get_word_lines_from_file(train_file),\n",
|
|
" max_tokens = vocab_size,\n",
|
|
" specials = ['<unk>'])\n",
|
|
"\n",
|
|
"with open('filename.pickle', 'wb') as handle:\n",
|
|
" pickle.dump(vocab, handle, protocol=pickle.HIGHEST_PROTOCOL)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 12,
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"data": {
|
|
"text/plain": [
|
|
"['<unk>', 'the', 'of', 'was', 'ladies']"
|
|
]
|
|
},
|
|
"execution_count": 12,
|
|
"metadata": {},
|
|
"output_type": "execute_result"
|
|
}
|
|
],
|
|
"source": [
|
|
"vocab.lookup_tokens([0, 1, 2, 10, 2000])"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"#### Definicja sieci\n",
|
|
"\n"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"Naszą prostą sieć neuronową zaimplementujemy używając frameworku PyTorch.\n",
|
|
"\n"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 13,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"class SimpleBigramNeuralLanguageModel(nn.Module):\n",
|
|
" def __init__(self, vocabulary_size, embedding_size):\n",
|
|
" super(SimpleBigramNeuralLanguageModel, self).__init__()\n",
|
|
" self.model = nn.Sequential(\n",
|
|
" nn.Embedding(vocabulary_size, embedding_size),\n",
|
|
" nn.Linear(embedding_size, vocabulary_size),\n",
|
|
" nn.Softmax()\n",
|
|
" )\n",
|
|
" \n",
|
|
" def forward(self, x):\n",
|
|
" return self.model(x)\n",
|
|
"\n",
|
|
"with open('filename.pickle','rb') as handle:\n",
|
|
" vocab = pickle.load(handle)\n",
|
|
"\n",
|
|
"vocab.set_default_index(vocab['<unk>'])"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 6,
|
|
"metadata": {
|
|
"collapsed": true
|
|
},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"Help on Vocab in module torchtext.vocab.vocab object:\n",
|
|
"\n",
|
|
"class Vocab(torch.nn.modules.module.Module)\n",
|
|
" | Vocab(vocab) -> None\n",
|
|
" | \n",
|
|
" | Base class for all neural network modules.\n",
|
|
" | \n",
|
|
" | Your models should also subclass this class.\n",
|
|
" | \n",
|
|
" | Modules can also contain other Modules, allowing to nest them in\n",
|
|
" | a tree structure. You can assign the submodules as regular attributes::\n",
|
|
" | \n",
|
|
" | import torch.nn as nn\n",
|
|
" | import torch.nn.functional as F\n",
|
|
" | \n",
|
|
" | class Model(nn.Module):\n",
|
|
" | def __init__(self):\n",
|
|
" | super().__init__()\n",
|
|
" | self.conv1 = nn.Conv2d(1, 20, 5)\n",
|
|
" | self.conv2 = nn.Conv2d(20, 20, 5)\n",
|
|
" | \n",
|
|
" | def forward(self, x):\n",
|
|
" | x = F.relu(self.conv1(x))\n",
|
|
" | return F.relu(self.conv2(x))\n",
|
|
" | \n",
|
|
" | Submodules assigned in this way will be registered, and will have their\n",
|
|
" | parameters converted too when you call :meth:`to`, etc.\n",
|
|
" | \n",
|
|
" | .. note::\n",
|
|
" | As per the example above, an ``__init__()`` call to the parent class\n",
|
|
" | must be made before assignment on the child.\n",
|
|
" | \n",
|
|
" | :ivar training: Boolean represents whether this module is in training or\n",
|
|
" | evaluation mode.\n",
|
|
" | :vartype training: bool\n",
|
|
" | \n",
|
|
" | Method resolution order:\n",
|
|
" | Vocab\n",
|
|
" | torch.nn.modules.module.Module\n",
|
|
" | builtins.object\n",
|
|
" | \n",
|
|
" | Methods defined here:\n",
|
|
" | \n",
|
|
" | __contains__(self, token: str) -> bool\n",
|
|
" | Args:\n",
|
|
" | token: The token for which to check the membership.\n",
|
|
" | \n",
|
|
" | Returns:\n",
|
|
" | Whether the token is member of vocab or not.\n",
|
|
" | \n",
|
|
" | __getitem__(self, token: str) -> int\n",
|
|
" | Args:\n",
|
|
" | token: The token used to lookup the corresponding index.\n",
|
|
" | \n",
|
|
" | Returns:\n",
|
|
" | The index corresponding to the associated token.\n",
|
|
" | \n",
|
|
" | __init__(self, vocab) -> None\n",
|
|
" | Initializes internal Module state, shared by both nn.Module and ScriptModule.\n",
|
|
" | \n",
|
|
" | __len__(self) -> int\n",
|
|
" | Returns:\n",
|
|
" | The length of the vocab.\n",
|
|
" | \n",
|
|
" | __prepare_scriptable__(self)\n",
|
|
" | Return a JITable Vocab.\n",
|
|
" | \n",
|
|
" | append_token(self, token: str) -> None\n",
|
|
" | Args:\n",
|
|
" | token: The token used to lookup the corresponding index.\n",
|
|
" | \n",
|
|
" | Raises:\n",
|
|
" | RuntimeError: If `token` already exists in the vocab\n",
|
|
" | \n",
|
|
" | forward(self, tokens: List[str]) -> List[int]\n",
|
|
" | Calls the `lookup_indices` method\n",
|
|
" | \n",
|
|
" | Args:\n",
|
|
" | tokens: a list of tokens used to lookup their corresponding `indices`.\n",
|
|
" | \n",
|
|
" | Returns:\n",
|
|
" | The indices associated with a list of `tokens`.\n",
|
|
" | \n",
|
|
" | get_default_index(self) -> Union[int, NoneType]\n",
|
|
" | Returns:\n",
|
|
" | Value of default index if it is set.\n",
|
|
" | \n",
|
|
" | get_itos(self) -> List[str]\n",
|
|
" | Returns:\n",
|
|
" | List mapping indices to tokens.\n",
|
|
" | \n",
|
|
" | get_stoi(self) -> Dict[str, int]\n",
|
|
" | Returns:\n",
|
|
" | Dictionary mapping tokens to indices.\n",
|
|
" | \n",
|
|
" | insert_token(self, token: str, index: int) -> None\n",
|
|
" | Args:\n",
|
|
" | token: The token used to lookup the corresponding index.\n",
|
|
" | index: The index corresponding to the associated token.\n",
|
|
" | Raises:\n",
|
|
" | RuntimeError: If `index` is not in range [0, Vocab.size()] or if `token` already exists in the vocab.\n",
|
|
" | \n",
|
|
" | lookup_indices(self, tokens: List[str]) -> List[int]\n",
|
|
" | Args:\n",
|
|
" | tokens: the tokens used to lookup their corresponding `indices`.\n",
|
|
" | \n",
|
|
" | Returns:\n",
|
|
" | The 'indices` associated with `tokens`.\n",
|
|
" | \n",
|
|
" | lookup_token(self, index: int) -> str\n",
|
|
" | Args:\n",
|
|
" | index: The index corresponding to the associated token.\n",
|
|
" | \n",
|
|
" | Returns:\n",
|
|
" | token: The token used to lookup the corresponding index.\n",
|
|
" | \n",
|
|
" | Raises:\n",
|
|
" | RuntimeError: If `index` not in range [0, itos.size()).\n",
|
|
" | \n",
|
|
" | lookup_tokens(self, indices: List[int]) -> List[str]\n",
|
|
" | Args:\n",
|
|
" | indices: The `indices` used to lookup their corresponding`tokens`.\n",
|
|
" | \n",
|
|
" | Returns:\n",
|
|
" | The `tokens` associated with `indices`.\n",
|
|
" | \n",
|
|
" | Raises:\n",
|
|
" | RuntimeError: If an index within `indices` is not int range [0, itos.size()).\n",
|
|
" | \n",
|
|
" | set_default_index(self, index: Union[int, NoneType]) -> None\n",
|
|
" | Args:\n",
|
|
" | index: Value of default index. This index will be returned when OOV token is queried.\n",
|
|
" | \n",
|
|
" | ----------------------------------------------------------------------\n",
|
|
" | Readonly properties defined here:\n",
|
|
" | \n",
|
|
" | is_jitable\n",
|
|
" | \n",
|
|
" | ----------------------------------------------------------------------\n",
|
|
" | Data and other attributes defined here:\n",
|
|
" | \n",
|
|
" | __jit_unused_properties__ = ['is_jitable']\n",
|
|
" | \n",
|
|
" | ----------------------------------------------------------------------\n",
|
|
" | Methods inherited from torch.nn.modules.module.Module:\n",
|
|
" | \n",
|
|
" | __call__ = _call_impl(self, *args, **kwargs)\n",
|
|
" | \n",
|
|
" | __delattr__(self, name)\n",
|
|
" | Implement delattr(self, name).\n",
|
|
" | \n",
|
|
" | __dir__(self)\n",
|
|
" | Default dir() implementation.\n",
|
|
" | \n",
|
|
" | __getattr__(self, name: str) -> Union[torch.Tensor, ForwardRef('Module')]\n",
|
|
" | \n",
|
|
" | __repr__(self)\n",
|
|
" | Return repr(self).\n",
|
|
" | \n",
|
|
" | __setattr__(self, name: str, value: Union[torch.Tensor, ForwardRef('Module')]) -> None\n",
|
|
" | Implement setattr(self, name, value).\n",
|
|
" | \n",
|
|
" | __setstate__(self, state)\n",
|
|
" | \n",
|
|
" | add_module(self, name: str, module: Union[ForwardRef('Module'), NoneType]) -> None\n",
|
|
" | Adds a child module to the current module.\n",
|
|
" | \n",
|
|
" | The module can be accessed as an attribute using the given name.\n",
|
|
" | \n",
|
|
" | Args:\n",
|
|
" | name (str): name of the child module. The child module can be\n",
|
|
" | accessed from this module using the given name\n",
|
|
" | module (Module): child module to be added to the module.\n",
|
|
" | \n",
|
|
" | apply(self: ~T, fn: Callable[[ForwardRef('Module')], NoneType]) -> ~T\n",
|
|
" | Applies ``fn`` recursively to every submodule (as returned by ``.children()``)\n",
|
|
" | as well as self. Typical use includes initializing the parameters of a model\n",
|
|
" | (see also :ref:`nn-init-doc`).\n",
|
|
" | \n",
|
|
" | Args:\n",
|
|
" | fn (:class:`Module` -> None): function to be applied to each submodule\n",
|
|
" | \n",
|
|
" | Returns:\n",
|
|
" | Module: self\n",
|
|
" | \n",
|
|
" | Example::\n",
|
|
" | \n",
|
|
" | >>> @torch.no_grad()\n",
|
|
" | >>> def init_weights(m):\n",
|
|
" | >>> print(m)\n",
|
|
" | >>> if type(m) == nn.Linear:\n",
|
|
" | >>> m.weight.fill_(1.0)\n",
|
|
" | >>> print(m.weight)\n",
|
|
" | >>> net = nn.Sequential(nn.Linear(2, 2), nn.Linear(2, 2))\n",
|
|
" | >>> net.apply(init_weights)\n",
|
|
" | Linear(in_features=2, out_features=2, bias=True)\n",
|
|
" | Parameter containing:\n",
|
|
" | tensor([[1., 1.],\n",
|
|
" | [1., 1.]], requires_grad=True)\n",
|
|
" | Linear(in_features=2, out_features=2, bias=True)\n",
|
|
" | Parameter containing:\n",
|
|
" | tensor([[1., 1.],\n",
|
|
" | [1., 1.]], requires_grad=True)\n",
|
|
" | Sequential(\n",
|
|
" | (0): Linear(in_features=2, out_features=2, bias=True)\n",
|
|
" | (1): Linear(in_features=2, out_features=2, bias=True)\n",
|
|
" | )\n",
|
|
" | \n",
|
|
" | bfloat16(self: ~T) -> ~T\n",
|
|
" | Casts all floating point parameters and buffers to ``bfloat16`` datatype.\n",
|
|
" | \n",
|
|
" | .. note::\n",
|
|
" | This method modifies the module in-place.\n",
|
|
" | \n",
|
|
" | Returns:\n",
|
|
" | Module: self\n",
|
|
" | \n",
|
|
" | buffers(self, recurse: bool = True) -> Iterator[torch.Tensor]\n",
|
|
" | Returns an iterator over module buffers.\n",
|
|
" | \n",
|
|
" | Args:\n",
|
|
" | recurse (bool): if True, then yields buffers of this module\n",
|
|
" | and all submodules. Otherwise, yields only buffers that\n",
|
|
" | are direct members of this module.\n",
|
|
" | \n",
|
|
" | Yields:\n",
|
|
" | torch.Tensor: module buffer\n",
|
|
" | \n",
|
|
" | Example::\n",
|
|
" | \n",
|
|
" | >>> # xdoctest: +SKIP(\"undefined vars\")\n",
|
|
" | >>> for buf in model.buffers():\n",
|
|
" | >>> print(type(buf), buf.size())\n",
|
|
" | <class 'torch.Tensor'> (20L,)\n",
|
|
" | <class 'torch.Tensor'> (20L, 1L, 5L, 5L)\n",
|
|
" | \n",
|
|
" | children(self) -> Iterator[ForwardRef('Module')]\n",
|
|
" | Returns an iterator over immediate children modules.\n",
|
|
" | \n",
|
|
" | Yields:\n",
|
|
" | Module: a child module\n",
|
|
" | \n",
|
|
" | cpu(self: ~T) -> ~T\n",
|
|
" | Moves all model parameters and buffers to the CPU.\n",
|
|
" | \n",
|
|
" | .. note::\n",
|
|
" | This method modifies the module in-place.\n",
|
|
" | \n",
|
|
" | Returns:\n",
|
|
" | Module: self\n",
|
|
" | \n",
|
|
" | cuda(self: ~T, device: Union[int, torch.device, NoneType] = None) -> ~T\n",
|
|
" | Moves all model parameters and buffers to the GPU.\n",
|
|
" | \n",
|
|
" | This also makes associated parameters and buffers different objects. So\n",
|
|
" | it should be called before constructing optimizer if the module will\n",
|
|
" | live on GPU while being optimized.\n",
|
|
" | \n",
|
|
" | .. note::\n",
|
|
" | This method modifies the module in-place.\n",
|
|
" | \n",
|
|
" | Args:\n",
|
|
" | device (int, optional): if specified, all parameters will be\n",
|
|
" | copied to that device\n",
|
|
" | \n",
|
|
" | Returns:\n",
|
|
" | Module: self\n",
|
|
" | \n",
|
|
" | double(self: ~T) -> ~T\n",
|
|
" | Casts all floating point parameters and buffers to ``double`` datatype.\n",
|
|
" | \n",
|
|
" | .. note::\n",
|
|
" | This method modifies the module in-place.\n",
|
|
" | \n",
|
|
" | Returns:\n",
|
|
" | Module: self\n",
|
|
" | \n",
|
|
" | eval(self: ~T) -> ~T\n",
|
|
" | Sets the module in evaluation mode.\n",
|
|
" | \n",
|
|
" | This has any effect only on certain modules. See documentations of\n",
|
|
" | particular modules for details of their behaviors in training/evaluation\n",
|
|
" | mode, if they are affected, e.g. :class:`Dropout`, :class:`BatchNorm`,\n",
|
|
" | etc.\n",
|
|
" | \n",
|
|
" | This is equivalent with :meth:`self.train(False) <torch.nn.Module.train>`.\n",
|
|
" | \n",
|
|
" | See :ref:`locally-disable-grad-doc` for a comparison between\n",
|
|
" | `.eval()` and several similar mechanisms that may be confused with it.\n",
|
|
" | \n",
|
|
" | Returns:\n",
|
|
" | Module: self\n",
|
|
" | \n",
|
|
" | extra_repr(self) -> str\n",
|
|
" | Set the extra representation of the module\n",
|
|
" | \n",
|
|
" | To print customized extra information, you should re-implement\n",
|
|
" | this method in your own modules. Both single-line and multi-line\n",
|
|
" | strings are acceptable.\n",
|
|
" | \n",
|
|
" | float(self: ~T) -> ~T\n",
|
|
" | Casts all floating point parameters and buffers to ``float`` datatype.\n",
|
|
" | \n",
|
|
" | .. note::\n",
|
|
" | This method modifies the module in-place.\n",
|
|
" | \n",
|
|
" | Returns:\n",
|
|
" | Module: self\n",
|
|
" | \n",
|
|
" | get_buffer(self, target: str) -> 'Tensor'\n",
|
|
" | Returns the buffer given by ``target`` if it exists,\n",
|
|
" | otherwise throws an error.\n",
|
|
" | \n",
|
|
" | See the docstring for ``get_submodule`` for a more detailed\n",
|
|
" | explanation of this method's functionality as well as how to\n",
|
|
" | correctly specify ``target``.\n",
|
|
" | \n",
|
|
" | Args:\n",
|
|
" | target: The fully-qualified string name of the buffer\n",
|
|
" | to look for. (See ``get_submodule`` for how to specify a\n",
|
|
" | fully-qualified string.)\n",
|
|
" | \n",
|
|
" | Returns:\n",
|
|
" | torch.Tensor: The buffer referenced by ``target``\n",
|
|
" | \n",
|
|
" | Raises:\n",
|
|
" | AttributeError: If the target string references an invalid\n",
|
|
" | path or resolves to something that is not a\n",
|
|
" | buffer\n",
|
|
" | \n",
|
|
" | get_extra_state(self) -> Any\n",
|
|
" | Returns any extra state to include in the module's state_dict.\n",
|
|
" | Implement this and a corresponding :func:`set_extra_state` for your module\n",
|
|
" | if you need to store extra state. This function is called when building the\n",
|
|
" | module's `state_dict()`.\n",
|
|
" | \n",
|
|
" | Note that extra state should be picklable to ensure working serialization\n",
|
|
" | of the state_dict. We only provide provide backwards compatibility guarantees\n",
|
|
" | for serializing Tensors; other objects may break backwards compatibility if\n",
|
|
" | their serialized pickled form changes.\n",
|
|
" | \n",
|
|
" | Returns:\n",
|
|
" | object: Any extra state to store in the module's state_dict\n",
|
|
" | \n",
|
|
" | get_parameter(self, target: str) -> 'Parameter'\n",
|
|
" | Returns the parameter given by ``target`` if it exists,\n",
|
|
" | otherwise throws an error.\n",
|
|
" | \n",
|
|
" | See the docstring for ``get_submodule`` for a more detailed\n",
|
|
" | explanation of this method's functionality as well as how to\n",
|
|
" | correctly specify ``target``.\n",
|
|
" | \n",
|
|
" | Args:\n",
|
|
" | target: The fully-qualified string name of the Parameter\n",
|
|
" | to look for. (See ``get_submodule`` for how to specify a\n",
|
|
" | fully-qualified string.)\n",
|
|
" | \n",
|
|
" | Returns:\n",
|
|
" | torch.nn.Parameter: The Parameter referenced by ``target``\n",
|
|
" | \n",
|
|
" | Raises:\n",
|
|
" | AttributeError: If the target string references an invalid\n",
|
|
" | path or resolves to something that is not an\n",
|
|
" | ``nn.Parameter``\n",
|
|
" | \n",
|
|
" | get_submodule(self, target: str) -> 'Module'\n",
|
|
" | Returns the submodule given by ``target`` if it exists,\n",
|
|
" | otherwise throws an error.\n",
|
|
" | \n",
|
|
" | For example, let's say you have an ``nn.Module`` ``A`` that\n",
|
|
" | looks like this:\n",
|
|
" | \n",
|
|
" | .. code-block:: text\n",
|
|
" | \n",
|
|
" | A(\n",
|
|
" | (net_b): Module(\n",
|
|
" | (net_c): Module(\n",
|
|
" | (conv): Conv2d(16, 33, kernel_size=(3, 3), stride=(2, 2))\n",
|
|
" | )\n",
|
|
" | (linear): Linear(in_features=100, out_features=200, bias=True)\n",
|
|
" | )\n",
|
|
" | )\n",
|
|
" | \n",
|
|
" | (The diagram shows an ``nn.Module`` ``A``. ``A`` has a nested\n",
|
|
" | submodule ``net_b``, which itself has two submodules ``net_c``\n",
|
|
" | and ``linear``. ``net_c`` then has a submodule ``conv``.)\n",
|
|
" | \n",
|
|
" | To check whether or not we have the ``linear`` submodule, we\n",
|
|
" | would call ``get_submodule(\"net_b.linear\")``. To check whether\n",
|
|
" | we have the ``conv`` submodule, we would call\n",
|
|
" | ``get_submodule(\"net_b.net_c.conv\")``.\n",
|
|
" | \n",
|
|
" | The runtime of ``get_submodule`` is bounded by the degree\n",
|
|
" | of module nesting in ``target``. A query against\n",
|
|
" | ``named_modules`` achieves the same result, but it is O(N) in\n",
|
|
" | the number of transitive modules. So, for a simple check to see\n",
|
|
" | if some submodule exists, ``get_submodule`` should always be\n",
|
|
" | used.\n",
|
|
" | \n",
|
|
" | Args:\n",
|
|
" | target: The fully-qualified string name of the submodule\n",
|
|
" | to look for. (See above example for how to specify a\n",
|
|
" | fully-qualified string.)\n",
|
|
" | \n",
|
|
" | Returns:\n",
|
|
" | torch.nn.Module: The submodule referenced by ``target``\n",
|
|
" | \n",
|
|
" | Raises:\n",
|
|
" | AttributeError: If the target string references an invalid\n",
|
|
" | path or resolves to something that is not an\n",
|
|
" | ``nn.Module``\n",
|
|
" | \n",
|
|
" | half(self: ~T) -> ~T\n",
|
|
" | Casts all floating point parameters and buffers to ``half`` datatype.\n",
|
|
" | \n",
|
|
" | .. note::\n",
|
|
" | This method modifies the module in-place.\n",
|
|
" | \n",
|
|
" | Returns:\n",
|
|
" | Module: self\n",
|
|
" | \n",
|
|
" | ipu(self: ~T, device: Union[int, torch.device, NoneType] = None) -> ~T\n",
|
|
" | Moves all model parameters and buffers to the IPU.\n",
|
|
" | \n",
|
|
" | This also makes associated parameters and buffers different objects. So\n",
|
|
" | it should be called before constructing optimizer if the module will\n",
|
|
" | live on IPU while being optimized.\n",
|
|
" | \n",
|
|
" | .. note::\n",
|
|
" | This method modifies the module in-place.\n",
|
|
" | \n",
|
|
" | Arguments:\n",
|
|
" | device (int, optional): if specified, all parameters will be\n",
|
|
" | copied to that device\n",
|
|
" | \n",
|
|
" | Returns:\n",
|
|
" | Module: self\n",
|
|
" | \n",
|
|
" | load_state_dict(self, state_dict: Mapping[str, Any], strict: bool = True)\n",
|
|
" | Copies parameters and buffers from :attr:`state_dict` into\n",
|
|
" | this module and its descendants. If :attr:`strict` is ``True``, then\n",
|
|
" | the keys of :attr:`state_dict` must exactly match the keys returned\n",
|
|
" | by this module's :meth:`~torch.nn.Module.state_dict` function.\n",
|
|
" | \n",
|
|
" | Args:\n",
|
|
" | state_dict (dict): a dict containing parameters and\n",
|
|
" | persistent buffers.\n",
|
|
" | strict (bool, optional): whether to strictly enforce that the keys\n",
|
|
" | in :attr:`state_dict` match the keys returned by this module's\n",
|
|
" | :meth:`~torch.nn.Module.state_dict` function. Default: ``True``\n",
|
|
" | \n",
|
|
" | Returns:\n",
|
|
" | ``NamedTuple`` with ``missing_keys`` and ``unexpected_keys`` fields:\n",
|
|
" | * **missing_keys** is a list of str containing the missing keys\n",
|
|
" | * **unexpected_keys** is a list of str containing the unexpected keys\n",
|
|
" | \n",
|
|
" | Note:\n",
|
|
" | If a parameter or buffer is registered as ``None`` and its corresponding key\n",
|
|
" | exists in :attr:`state_dict`, :meth:`load_state_dict` will raise a\n",
|
|
" | ``RuntimeError``.\n",
|
|
" | \n",
|
|
" | modules(self) -> Iterator[ForwardRef('Module')]\n",
|
|
" | Returns an iterator over all modules in the network.\n",
|
|
" | \n",
|
|
" | Yields:\n",
|
|
" | Module: a module in the network\n",
|
|
" | \n",
|
|
" | Note:\n",
|
|
" | Duplicate modules are returned only once. In the following\n",
|
|
" | example, ``l`` will be returned only once.\n",
|
|
" | \n",
|
|
" | Example::\n",
|
|
" | \n",
|
|
" | >>> l = nn.Linear(2, 2)\n",
|
|
" | >>> net = nn.Sequential(l, l)\n",
|
|
" | >>> for idx, m in enumerate(net.modules()):\n",
|
|
" | ... print(idx, '->', m)\n",
|
|
" | \n",
|
|
" | 0 -> Sequential(\n",
|
|
" | (0): Linear(in_features=2, out_features=2, bias=True)\n",
|
|
" | (1): Linear(in_features=2, out_features=2, bias=True)\n",
|
|
" | )\n",
|
|
" | 1 -> Linear(in_features=2, out_features=2, bias=True)\n",
|
|
" | \n",
|
|
" | named_buffers(self, prefix: str = '', recurse: bool = True, remove_duplicate: bool = True) -> Iterator[Tuple[str, torch.Tensor]]\n",
|
|
" | Returns an iterator over module buffers, yielding both the\n",
|
|
" | name of the buffer as well as the buffer itself.\n",
|
|
" | \n",
|
|
" | Args:\n",
|
|
" | prefix (str): prefix to prepend to all buffer names.\n",
|
|
" | recurse (bool, optional): if True, then yields buffers of this module\n",
|
|
" | and all submodules. Otherwise, yields only buffers that\n",
|
|
" | are direct members of this module. Defaults to True.\n",
|
|
" | remove_duplicate (bool, optional): whether to remove the duplicated buffers in the result. Defaults to True.\n",
|
|
" | \n",
|
|
" | Yields:\n",
|
|
" | (str, torch.Tensor): Tuple containing the name and buffer\n",
|
|
" | \n",
|
|
" | Example::\n",
|
|
" | \n",
|
|
" | >>> # xdoctest: +SKIP(\"undefined vars\")\n",
|
|
" | >>> for name, buf in self.named_buffers():\n",
|
|
" | >>> if name in ['running_var']:\n",
|
|
" | >>> print(buf.size())\n",
|
|
" | \n",
|
|
" | named_children(self) -> Iterator[Tuple[str, ForwardRef('Module')]]\n",
|
|
" | Returns an iterator over immediate children modules, yielding both\n",
|
|
" | the name of the module as well as the module itself.\n",
|
|
" | \n",
|
|
" | Yields:\n",
|
|
" | (str, Module): Tuple containing a name and child module\n",
|
|
" | \n",
|
|
" | Example::\n",
|
|
" | \n",
|
|
" | >>> # xdoctest: +SKIP(\"undefined vars\")\n",
|
|
" | >>> for name, module in model.named_children():\n",
|
|
" | >>> if name in ['conv4', 'conv5']:\n",
|
|
" | >>> print(module)\n",
|
|
" | \n",
|
|
" | named_modules(self, memo: Union[Set[ForwardRef('Module')], NoneType] = None, prefix: str = '', remove_duplicate: bool = True)\n",
|
|
" | Returns an iterator over all modules in the network, yielding\n",
|
|
" | both the name of the module as well as the module itself.\n",
|
|
" | \n",
|
|
" | Args:\n",
|
|
" | memo: a memo to store the set of modules already added to the result\n",
|
|
" | prefix: a prefix that will be added to the name of the module\n",
|
|
" | remove_duplicate: whether to remove the duplicated module instances in the result\n",
|
|
" | or not\n",
|
|
" | \n",
|
|
" | Yields:\n",
|
|
" | (str, Module): Tuple of name and module\n",
|
|
" | \n",
|
|
" | Note:\n",
|
|
" | Duplicate modules are returned only once. In the following\n",
|
|
" | example, ``l`` will be returned only once.\n",
|
|
" | \n",
|
|
" | Example::\n",
|
|
" | \n",
|
|
" | >>> l = nn.Linear(2, 2)\n",
|
|
" | >>> net = nn.Sequential(l, l)\n",
|
|
" | >>> for idx, m in enumerate(net.named_modules()):\n",
|
|
" | ... print(idx, '->', m)\n",
|
|
" | \n",
|
|
" | 0 -> ('', Sequential(\n",
|
|
" | (0): Linear(in_features=2, out_features=2, bias=True)\n",
|
|
" | (1): Linear(in_features=2, out_features=2, bias=True)\n",
|
|
" | ))\n",
|
|
" | 1 -> ('0', Linear(in_features=2, out_features=2, bias=True))\n",
|
|
" | \n",
|
|
" | named_parameters(self, prefix: str = '', recurse: bool = True, remove_duplicate: bool = True) -> Iterator[Tuple[str, torch.nn.parameter.Parameter]]\n",
|
|
" | Returns an iterator over module parameters, yielding both the\n",
|
|
" | name of the parameter as well as the parameter itself.\n",
|
|
" | \n",
|
|
" | Args:\n",
|
|
" | prefix (str): prefix to prepend to all parameter names.\n",
|
|
" | recurse (bool): if True, then yields parameters of this module\n",
|
|
" | and all submodules. Otherwise, yields only parameters that\n",
|
|
" | are direct members of this module.\n",
|
|
" | remove_duplicate (bool, optional): whether to remove the duplicated\n",
|
|
" | parameters in the result. Defaults to True.\n",
|
|
" | \n",
|
|
" | Yields:\n",
|
|
" | (str, Parameter): Tuple containing the name and parameter\n",
|
|
" | \n",
|
|
" | Example::\n",
|
|
" | \n",
|
|
" | >>> # xdoctest: +SKIP(\"undefined vars\")\n",
|
|
" | >>> for name, param in self.named_parameters():\n",
|
|
" | >>> if name in ['bias']:\n",
|
|
" | >>> print(param.size())\n",
|
|
" | \n",
|
|
" | parameters(self, recurse: bool = True) -> Iterator[torch.nn.parameter.Parameter]\n",
|
|
" | Returns an iterator over module parameters.\n",
|
|
" | \n",
|
|
" | This is typically passed to an optimizer.\n",
|
|
" | \n",
|
|
" | Args:\n",
|
|
" | recurse (bool): if True, then yields parameters of this module\n",
|
|
" | and all submodules. Otherwise, yields only parameters that\n",
|
|
" | are direct members of this module.\n",
|
|
" | \n",
|
|
" | Yields:\n",
|
|
" | Parameter: module parameter\n",
|
|
" | \n",
|
|
" | Example::\n",
|
|
" | \n",
|
|
" | >>> # xdoctest: +SKIP(\"undefined vars\")\n",
|
|
" | >>> for param in model.parameters():\n",
|
|
" | >>> print(type(param), param.size())\n",
|
|
" | <class 'torch.Tensor'> (20L,)\n",
|
|
" | <class 'torch.Tensor'> (20L, 1L, 5L, 5L)\n",
|
|
" | \n",
|
|
" | register_backward_hook(self, hook: Callable[[ForwardRef('Module'), Union[Tuple[torch.Tensor, ...], torch.Tensor], Union[Tuple[torch.Tensor, ...], torch.Tensor]], Union[NoneType, Tuple[torch.Tensor, ...], torch.Tensor]]) -> torch.utils.hooks.RemovableHandle\n",
|
|
" | Registers a backward hook on the module.\n",
|
|
" | \n",
|
|
" | This function is deprecated in favor of :meth:`~torch.nn.Module.register_full_backward_hook` and\n",
|
|
" | the behavior of this function will change in future versions.\n",
|
|
" | \n",
|
|
" | Returns:\n",
|
|
" | :class:`torch.utils.hooks.RemovableHandle`:\n",
|
|
" | a handle that can be used to remove the added hook by calling\n",
|
|
" | ``handle.remove()``\n",
|
|
" | \n",
|
|
" | register_buffer(self, name: str, tensor: Union[torch.Tensor, NoneType], persistent: bool = True) -> None\n",
|
|
" | Adds a buffer to the module.\n",
|
|
" | \n",
|
|
" |      This is typically used to register a buffer that should not be\n",
|
|
" | considered a model parameter. For example, BatchNorm's ``running_mean``\n",
|
|
" | is not a parameter, but is part of the module's state. Buffers, by\n",
|
|
" | default, are persistent and will be saved alongside parameters. This\n",
|
|
" | behavior can be changed by setting :attr:`persistent` to ``False``. The\n",
|
|
" | only difference between a persistent buffer and a non-persistent buffer\n",
|
|
" | is that the latter will not be a part of this module's\n",
|
|
" | :attr:`state_dict`.\n",
|
|
" | \n",
|
|
" | Buffers can be accessed as attributes using given names.\n",
|
|
" | \n",
|
|
" | Args:\n",
|
|
" | name (str): name of the buffer. The buffer can be accessed\n",
|
|
" | from this module using the given name\n",
|
|
" | tensor (Tensor or None): buffer to be registered. If ``None``, then operations\n",
|
|
" | that run on buffers, such as :attr:`cuda`, are ignored. If ``None``,\n",
|
|
" | the buffer is **not** included in the module's :attr:`state_dict`.\n",
|
|
" | persistent (bool): whether the buffer is part of this module's\n",
|
|
" | :attr:`state_dict`.\n",
|
|
" | \n",
|
|
" | Example::\n",
|
|
" | \n",
|
|
" | >>> # xdoctest: +SKIP(\"undefined vars\")\n",
|
|
" | >>> self.register_buffer('running_mean', torch.zeros(num_features))\n",
|
|
" | \n",
|
|
" | register_forward_hook(self, hook: Union[Callable[[~T, Tuple[Any, ...], Any], Union[Any, NoneType]], Callable[[~T, Tuple[Any, ...], Dict[str, Any], Any], Union[Any, NoneType]]], *, prepend: bool = False, with_kwargs: bool = False) -> torch.utils.hooks.RemovableHandle\n",
|
|
" | Registers a forward hook on the module.\n",
|
|
" | \n",
|
|
" | The hook will be called every time after :func:`forward` has computed an output.\n",
|
|
" | \n",
|
|
" | If ``with_kwargs`` is ``False`` or not specified, the input contains only\n",
|
|
" | the positional arguments given to the module. Keyword arguments won't be\n",
|
|
" | passed to the hooks and only to the ``forward``. The hook can modify the\n",
|
|
" | output. It can modify the input inplace but it will not have effect on\n",
|
|
" | forward since this is called after :func:`forward` is called. The hook\n",
|
|
" | should have the following signature::\n",
|
|
" | \n",
|
|
" | hook(module, args, output) -> None or modified output\n",
|
|
" | \n",
|
|
" | If ``with_kwargs`` is ``True``, the forward hook will be passed the\n",
|
|
" | ``kwargs`` given to the forward function and be expected to return the\n",
|
|
" | output possibly modified. The hook should have the following signature::\n",
|
|
" | \n",
|
|
" | hook(module, args, kwargs, output) -> None or modified output\n",
|
|
" | \n",
|
|
" | Args:\n",
|
|
" | hook (Callable): The user defined hook to be registered.\n",
|
|
" | prepend (bool): If ``True``, the provided ``hook`` will be fired\n",
|
|
" | before all existing ``forward`` hooks on this\n",
|
|
" | :class:`torch.nn.modules.Module`. Otherwise, the provided\n",
|
|
" | ``hook`` will be fired after all existing ``forward`` hooks on\n",
|
|
" | this :class:`torch.nn.modules.Module`. Note that global\n",
|
|
" | ``forward`` hooks registered with\n",
|
|
" | :func:`register_module_forward_hook` will fire before all hooks\n",
|
|
" | registered by this method.\n",
|
|
" | Default: ``False``\n",
|
|
" | with_kwargs (bool): If ``True``, the ``hook`` will be passed the\n",
|
|
" | kwargs given to the forward function.\n",
|
|
" | Default: ``False``\n",
|
|
" | \n",
|
|
" | Returns:\n",
|
|
" | :class:`torch.utils.hooks.RemovableHandle`:\n",
|
|
" | a handle that can be used to remove the added hook by calling\n",
|
|
" | ``handle.remove()``\n",
|
|
" | \n",
|
|
" | register_forward_pre_hook(self, hook: Union[Callable[[~T, Tuple[Any, ...]], Union[Any, NoneType]], Callable[[~T, Tuple[Any, ...], Dict[str, Any]], Union[Tuple[Any, Dict[str, Any]], NoneType]]], *, prepend: bool = False, with_kwargs: bool = False) -> torch.utils.hooks.RemovableHandle\n",
|
|
" | Registers a forward pre-hook on the module.\n",
|
|
" | \n",
|
|
" | The hook will be called every time before :func:`forward` is invoked.\n",
|
|
" | \n",
|
|
" | \n",
|
|
" | If ``with_kwargs`` is false or not specified, the input contains only\n",
|
|
" | the positional arguments given to the module. Keyword arguments won't be\n",
|
|
" | passed to the hooks and only to the ``forward``. The hook can modify the\n",
|
|
" | input. User can either return a tuple or a single modified value in the\n",
|
|
" | hook. We will wrap the value into a tuple if a single value is returned\n",
|
|
" | (unless that value is already a tuple). The hook should have the\n",
|
|
" | following signature::\n",
|
|
" | \n",
|
|
" | hook(module, args) -> None or modified input\n",
|
|
" | \n",
|
|
" | If ``with_kwargs`` is true, the forward pre-hook will be passed the\n",
|
|
" | kwargs given to the forward function. And if the hook modifies the\n",
|
|
" | input, both the args and kwargs should be returned. The hook should have\n",
|
|
" | the following signature::\n",
|
|
" | \n",
|
|
" | hook(module, args, kwargs) -> None or a tuple of modified input and kwargs\n",
|
|
" | \n",
|
|
" | Args:\n",
|
|
" | hook (Callable): The user defined hook to be registered.\n",
|
|
" | prepend (bool): If true, the provided ``hook`` will be fired before\n",
|
|
" | all existing ``forward_pre`` hooks on this\n",
|
|
" | :class:`torch.nn.modules.Module`. Otherwise, the provided\n",
|
|
" | ``hook`` will be fired after all existing ``forward_pre`` hooks\n",
|
|
" | on this :class:`torch.nn.modules.Module`. Note that global\n",
|
|
" | ``forward_pre`` hooks registered with\n",
|
|
" | :func:`register_module_forward_pre_hook` will fire before all\n",
|
|
" | hooks registered by this method.\n",
|
|
" | Default: ``False``\n",
|
|
" | with_kwargs (bool): If true, the ``hook`` will be passed the kwargs\n",
|
|
" | given to the forward function.\n",
|
|
" | Default: ``False``\n",
|
|
" | \n",
|
|
" | Returns:\n",
|
|
" | :class:`torch.utils.hooks.RemovableHandle`:\n",
|
|
" | a handle that can be used to remove the added hook by calling\n",
|
|
" | ``handle.remove()``\n",
|
|
" | \n",
|
|
" | register_full_backward_hook(self, hook: Callable[[ForwardRef('Module'), Union[Tuple[torch.Tensor, ...], torch.Tensor], Union[Tuple[torch.Tensor, ...], torch.Tensor]], Union[NoneType, Tuple[torch.Tensor, ...], torch.Tensor]], prepend: bool = False) -> torch.utils.hooks.RemovableHandle\n",
|
|
" | Registers a backward hook on the module.\n",
|
|
" | \n",
|
|
" | The hook will be called every time the gradients with respect to a module\n",
|
|
" | are computed, i.e. the hook will execute if and only if the gradients with\n",
|
|
" | respect to module outputs are computed. The hook should have the following\n",
|
|
" | signature::\n",
|
|
" | \n",
|
|
" | hook(module, grad_input, grad_output) -> tuple(Tensor) or None\n",
|
|
" | \n",
|
|
" | The :attr:`grad_input` and :attr:`grad_output` are tuples that contain the gradients\n",
|
|
" | with respect to the inputs and outputs respectively. The hook should\n",
|
|
" | not modify its arguments, but it can optionally return a new gradient with\n",
|
|
" | respect to the input that will be used in place of :attr:`grad_input` in\n",
|
|
" | subsequent computations. :attr:`grad_input` will only correspond to the inputs given\n",
|
|
" | as positional arguments and all kwarg arguments are ignored. Entries\n",
|
|
" | in :attr:`grad_input` and :attr:`grad_output` will be ``None`` for all non-Tensor\n",
|
|
" | arguments.\n",
|
|
" | \n",
|
|
" | For technical reasons, when this hook is applied to a Module, its forward function will\n",
|
|
" | receive a view of each Tensor passed to the Module. Similarly the caller will receive a view\n",
|
|
" | of each Tensor returned by the Module's forward function.\n",
|
|
" | \n",
|
|
" | .. warning ::\n",
|
|
" | Modifying inputs or outputs inplace is not allowed when using backward hooks and\n",
|
|
" | will raise an error.\n",
|
|
" | \n",
|
|
" | Args:\n",
|
|
" | hook (Callable): The user-defined hook to be registered.\n",
|
|
" | prepend (bool): If true, the provided ``hook`` will be fired before\n",
|
|
" | all existing ``backward`` hooks on this\n",
|
|
" | :class:`torch.nn.modules.Module`. Otherwise, the provided\n",
|
|
" | ``hook`` will be fired after all existing ``backward`` hooks on\n",
|
|
" | this :class:`torch.nn.modules.Module`. Note that global\n",
|
|
" | ``backward`` hooks registered with\n",
|
|
" | :func:`register_module_full_backward_hook` will fire before\n",
|
|
" | all hooks registered by this method.\n",
|
|
" | \n",
|
|
" | Returns:\n",
|
|
" | :class:`torch.utils.hooks.RemovableHandle`:\n",
|
|
" | a handle that can be used to remove the added hook by calling\n",
|
|
" | ``handle.remove()``\n",
|
|
" | \n",
|
|
" | register_full_backward_pre_hook(self, hook: Callable[[ForwardRef('Module'), Union[Tuple[torch.Tensor, ...], torch.Tensor]], Union[NoneType, Tuple[torch.Tensor, ...], torch.Tensor]], prepend: bool = False) -> torch.utils.hooks.RemovableHandle\n",
|
|
" | Registers a backward pre-hook on the module.\n",
|
|
" | \n",
|
|
" | The hook will be called every time the gradients for the module are computed.\n",
|
|
" | The hook should have the following signature::\n",
|
|
" | \n",
|
|
" | hook(module, grad_output) -> Tensor or None\n",
|
|
" | \n",
|
|
" | The :attr:`grad_output` is a tuple. The hook should\n",
|
|
" | not modify its arguments, but it can optionally return a new gradient with\n",
|
|
" | respect to the output that will be used in place of :attr:`grad_output` in\n",
|
|
" | subsequent computations. Entries in :attr:`grad_output` will be ``None`` for\n",
|
|
" | all non-Tensor arguments.\n",
|
|
" | \n",
|
|
" | For technical reasons, when this hook is applied to a Module, its forward function will\n",
|
|
" | receive a view of each Tensor passed to the Module. Similarly the caller will receive a view\n",
|
|
" | of each Tensor returned by the Module's forward function.\n",
|
|
" | \n",
|
|
" | .. warning ::\n",
|
|
" | Modifying inputs inplace is not allowed when using backward hooks and\n",
|
|
" | will raise an error.\n",
|
|
" | \n",
|
|
" | Args:\n",
|
|
" | hook (Callable): The user-defined hook to be registered.\n",
|
|
" | prepend (bool): If true, the provided ``hook`` will be fired before\n",
|
|
" | all existing ``backward_pre`` hooks on this\n",
|
|
" | :class:`torch.nn.modules.Module`. Otherwise, the provided\n",
|
|
" | ``hook`` will be fired after all existing ``backward_pre`` hooks\n",
|
|
" | on this :class:`torch.nn.modules.Module`. Note that global\n",
|
|
" | ``backward_pre`` hooks registered with\n",
|
|
" | :func:`register_module_full_backward_pre_hook` will fire before\n",
|
|
" | all hooks registered by this method.\n",
|
|
" | \n",
|
|
" | Returns:\n",
|
|
" | :class:`torch.utils.hooks.RemovableHandle`:\n",
|
|
" | a handle that can be used to remove the added hook by calling\n",
|
|
" | ``handle.remove()``\n",
|
|
" | \n",
|
|
" | register_load_state_dict_post_hook(self, hook)\n",
|
|
" | Registers a post hook to be run after module's ``load_state_dict``\n",
|
|
" | is called.\n",
|
|
" | \n",
|
|
" | It should have the following signature::\n",
|
|
" | hook(module, incompatible_keys) -> None\n",
|
|
" | \n",
|
|
" | The ``module`` argument is the current module that this hook is registered\n",
|
|
" | on, and the ``incompatible_keys`` argument is a ``NamedTuple`` consisting\n",
|
|
" | of attributes ``missing_keys`` and ``unexpected_keys``. ``missing_keys``\n",
|
|
" | is a ``list`` of ``str`` containing the missing keys and\n",
|
|
" | ``unexpected_keys`` is a ``list`` of ``str`` containing the unexpected keys.\n",
|
|
" | \n",
|
|
" | The given incompatible_keys can be modified inplace if needed.\n",
|
|
" | \n",
|
|
" | Note that the checks performed when calling :func:`load_state_dict` with\n",
|
|
" | ``strict=True`` are affected by modifications the hook makes to\n",
|
|
" | ``missing_keys`` or ``unexpected_keys``, as expected. Additions to either\n",
|
|
" | set of keys will result in an error being thrown when ``strict=True``, and\n",
|
|
" | clearing out both missing and unexpected keys will avoid an error.\n",
|
|
" | \n",
|
|
" | Returns:\n",
|
|
" | :class:`torch.utils.hooks.RemovableHandle`:\n",
|
|
" | a handle that can be used to remove the added hook by calling\n",
|
|
" | ``handle.remove()``\n",
|
|
" | \n",
|
|
" | register_module(self, name: str, module: Union[ForwardRef('Module'), NoneType]) -> None\n",
|
|
" | Alias for :func:`add_module`.\n",
|
|
" | \n",
|
|
" | register_parameter(self, name: str, param: Union[torch.nn.parameter.Parameter, NoneType]) -> None\n",
|
|
" | Adds a parameter to the module.\n",
|
|
" | \n",
|
|
" | The parameter can be accessed as an attribute using given name.\n",
|
|
" | \n",
|
|
" | Args:\n",
|
|
" | name (str): name of the parameter. The parameter can be accessed\n",
|
|
" | from this module using the given name\n",
|
|
" | param (Parameter or None): parameter to be added to the module. If\n",
|
|
" | ``None``, then operations that run on parameters, such as :attr:`cuda`,\n",
|
|
" | are ignored. If ``None``, the parameter is **not** included in the\n",
|
|
" | module's :attr:`state_dict`.\n",
|
|
" | \n",
|
|
" | register_state_dict_pre_hook(self, hook)\n",
|
|
" | These hooks will be called with arguments: ``self``, ``prefix``,\n",
|
|
" | and ``keep_vars`` before calling ``state_dict`` on ``self``. The registered\n",
|
|
" | hooks can be used to perform pre-processing before the ``state_dict``\n",
|
|
" | call is made.\n",
|
|
" | \n",
|
|
" | requires_grad_(self: ~T, requires_grad: bool = True) -> ~T\n",
|
|
" | Change if autograd should record operations on parameters in this\n",
|
|
" | module.\n",
|
|
" | \n",
|
|
" | This method sets the parameters' :attr:`requires_grad` attributes\n",
|
|
" | in-place.\n",
|
|
" | \n",
|
|
" | This method is helpful for freezing part of the module for finetuning\n",
|
|
" | or training parts of a model individually (e.g., GAN training).\n",
|
|
" | \n",
|
|
" | See :ref:`locally-disable-grad-doc` for a comparison between\n",
|
|
" | `.requires_grad_()` and several similar mechanisms that may be confused with it.\n",
|
|
" | \n",
|
|
" | Args:\n",
|
|
" | requires_grad (bool): whether autograd should record operations on\n",
|
|
" | parameters in this module. Default: ``True``.\n",
|
|
" | \n",
|
|
" | Returns:\n",
|
|
" | Module: self\n",
|
|
" | \n",
|
|
" | set_extra_state(self, state: Any)\n",
|
|
" | This function is called from :func:`load_state_dict` to handle any extra state\n",
|
|
" | found within the `state_dict`. Implement this function and a corresponding\n",
|
|
" | :func:`get_extra_state` for your module if you need to store extra state within its\n",
|
|
" | `state_dict`.\n",
|
|
" | \n",
|
|
" | Args:\n",
|
|
" | state (dict): Extra state from the `state_dict`\n",
|
|
" | \n",
|
|
" | share_memory(self: ~T) -> ~T\n",
|
|
" | See :meth:`torch.Tensor.share_memory_`\n",
|
|
" | \n",
|
|
" | state_dict(self, *args, destination=None, prefix='', keep_vars=False)\n",
|
|
" | Returns a dictionary containing references to the whole state of the module.\n",
|
|
" | \n",
|
|
" | Both parameters and persistent buffers (e.g. running averages) are\n",
|
|
" | included. Keys are corresponding parameter and buffer names.\n",
|
|
" | Parameters and buffers set to ``None`` are not included.\n",
|
|
" | \n",
|
|
" | .. note::\n",
|
|
" | The returned object is a shallow copy. It contains references\n",
|
|
" | to the module's parameters and buffers.\n",
|
|
" | \n",
|
|
" | .. warning::\n",
|
|
" | Currently ``state_dict()`` also accepts positional arguments for\n",
|
|
" | ``destination``, ``prefix`` and ``keep_vars`` in order. However,\n",
|
|
" | this is being deprecated and keyword arguments will be enforced in\n",
|
|
" | future releases.\n",
|
|
" | \n",
|
|
" | .. warning::\n",
|
|
" | Please avoid the use of argument ``destination`` as it is not\n",
|
|
" | designed for end-users.\n",
|
|
" | \n",
|
|
" | Args:\n",
|
|
" | destination (dict, optional): If provided, the state of module will\n",
|
|
" | be updated into the dict and the same object is returned.\n",
|
|
" | Otherwise, an ``OrderedDict`` will be created and returned.\n",
|
|
" | Default: ``None``.\n",
|
|
" | prefix (str, optional): a prefix added to parameter and buffer\n",
|
|
" | names to compose the keys in state_dict. Default: ``''``.\n",
|
|
" | keep_vars (bool, optional): by default the :class:`~torch.Tensor` s\n",
|
|
" | returned in the state dict are detached from autograd. If it's\n",
|
|
" | set to ``True``, detaching will not be performed.\n",
|
|
" | Default: ``False``.\n",
|
|
" | \n",
|
|
" | Returns:\n",
|
|
" | dict:\n",
|
|
" | a dictionary containing a whole state of the module\n",
|
|
" | \n",
|
|
" | Example::\n",
|
|
" | \n",
|
|
" | >>> # xdoctest: +SKIP(\"undefined vars\")\n",
|
|
" | >>> module.state_dict().keys()\n",
|
|
" | ['bias', 'weight']\n",
|
|
" | \n",
|
|
" | to(self, *args, **kwargs)\n",
|
|
" | Moves and/or casts the parameters and buffers.\n",
|
|
" | \n",
|
|
" | This can be called as\n",
|
|
" | \n",
|
|
" | .. function:: to(device=None, dtype=None, non_blocking=False)\n",
|
|
" | :noindex:\n",
|
|
" | \n",
|
|
" | .. function:: to(dtype, non_blocking=False)\n",
|
|
" | :noindex:\n",
|
|
" | \n",
|
|
" | .. function:: to(tensor, non_blocking=False)\n",
|
|
" | :noindex:\n",
|
|
" | \n",
|
|
" | .. function:: to(memory_format=torch.channels_last)\n",
|
|
" | :noindex:\n",
|
|
" | \n",
|
|
" | Its signature is similar to :meth:`torch.Tensor.to`, but only accepts\n",
|
|
" | floating point or complex :attr:`dtype`\\ s. In addition, this method will\n",
|
|
" | only cast the floating point or complex parameters and buffers to :attr:`dtype`\n",
|
|
" | (if given). The integral parameters and buffers will be moved\n",
|
|
" | :attr:`device`, if that is given, but with dtypes unchanged. When\n",
|
|
" | :attr:`non_blocking` is set, it tries to convert/move asynchronously\n",
|
|
" | with respect to the host if possible, e.g., moving CPU Tensors with\n",
|
|
" | pinned memory to CUDA devices.\n",
|
|
" | \n",
|
|
" | See below for examples.\n",
|
|
" | \n",
|
|
" | .. note::\n",
|
|
" | This method modifies the module in-place.\n",
|
|
" | \n",
|
|
" | Args:\n",
|
|
" | device (:class:`torch.device`): the desired device of the parameters\n",
|
|
" | and buffers in this module\n",
|
|
" | dtype (:class:`torch.dtype`): the desired floating point or complex dtype of\n",
|
|
" | the parameters and buffers in this module\n",
|
|
" | tensor (torch.Tensor): Tensor whose dtype and device are the desired\n",
|
|
" | dtype and device for all parameters and buffers in this module\n",
|
|
" | memory_format (:class:`torch.memory_format`): the desired memory\n",
|
|
" | format for 4D parameters and buffers in this module (keyword\n",
|
|
" | only argument)\n",
|
|
" | \n",
|
|
" | Returns:\n",
|
|
" | Module: self\n",
|
|
" | \n",
|
|
" | Examples::\n",
|
|
" | \n",
|
|
" | >>> # xdoctest: +IGNORE_WANT(\"non-deterministic\")\n",
|
|
" | >>> linear = nn.Linear(2, 2)\n",
|
|
" | >>> linear.weight\n",
|
|
" | Parameter containing:\n",
|
|
" | tensor([[ 0.1913, -0.3420],\n",
|
|
" | [-0.5113, -0.2325]])\n",
|
|
" | >>> linear.to(torch.double)\n",
|
|
" | Linear(in_features=2, out_features=2, bias=True)\n",
|
|
" | >>> linear.weight\n",
|
|
" | Parameter containing:\n",
|
|
" | tensor([[ 0.1913, -0.3420],\n",
|
|
" | [-0.5113, -0.2325]], dtype=torch.float64)\n",
|
|
" | >>> # xdoctest: +REQUIRES(env:TORCH_DOCTEST_CUDA1)\n",
|
|
" | >>> gpu1 = torch.device(\"cuda:1\")\n",
|
|
" | >>> linear.to(gpu1, dtype=torch.half, non_blocking=True)\n",
|
|
" | Linear(in_features=2, out_features=2, bias=True)\n",
|
|
" | >>> linear.weight\n",
|
|
" | Parameter containing:\n",
|
|
" | tensor([[ 0.1914, -0.3420],\n",
|
|
" | [-0.5112, -0.2324]], dtype=torch.float16, device='cuda:1')\n",
|
|
" | >>> cpu = torch.device(\"cpu\")\n",
|
|
" | >>> linear.to(cpu)\n",
|
|
" | Linear(in_features=2, out_features=2, bias=True)\n",
|
|
" | >>> linear.weight\n",
|
|
" | Parameter containing:\n",
|
|
" | tensor([[ 0.1914, -0.3420],\n",
|
|
" | [-0.5112, -0.2324]], dtype=torch.float16)\n",
|
|
" | \n",
|
|
" | >>> linear = nn.Linear(2, 2, bias=None).to(torch.cdouble)\n",
|
|
" | >>> linear.weight\n",
|
|
" | Parameter containing:\n",
|
|
" | tensor([[ 0.3741+0.j, 0.2382+0.j],\n",
|
|
" | [ 0.5593+0.j, -0.4443+0.j]], dtype=torch.complex128)\n",
|
|
" | >>> linear(torch.ones(3, 2, dtype=torch.cdouble))\n",
|
|
" | tensor([[0.6122+0.j, 0.1150+0.j],\n",
|
|
" | [0.6122+0.j, 0.1150+0.j],\n",
|
|
" | [0.6122+0.j, 0.1150+0.j]], dtype=torch.complex128)\n",
|
|
" | \n",
|
|
" | to_empty(self: ~T, *, device: Union[str, torch.device]) -> ~T\n",
|
|
" | Moves the parameters and buffers to the specified device without copying storage.\n",
|
|
" | \n",
|
|
" | Args:\n",
|
|
" | device (:class:`torch.device`): The desired device of the parameters\n",
|
|
" | and buffers in this module.\n",
|
|
" | \n",
|
|
" | Returns:\n",
|
|
" | Module: self\n",
|
|
" | \n",
|
|
" | train(self: ~T, mode: bool = True) -> ~T\n",
|
|
" | Sets the module in training mode.\n",
|
|
" | \n",
|
|
" | This has any effect only on certain modules. See documentations of\n",
|
|
" | particular modules for details of their behaviors in training/evaluation\n",
|
|
" | mode, if they are affected, e.g. :class:`Dropout`, :class:`BatchNorm`,\n",
|
|
" | etc.\n",
|
|
" | \n",
|
|
" | Args:\n",
|
|
" | mode (bool): whether to set training mode (``True``) or evaluation\n",
|
|
" | mode (``False``). Default: ``True``.\n",
|
|
" | \n",
|
|
" | Returns:\n",
|
|
" | Module: self\n",
|
|
" | \n",
|
|
" | type(self: ~T, dst_type: Union[torch.dtype, str]) -> ~T\n",
|
|
" | Casts all parameters and buffers to :attr:`dst_type`.\n",
|
|
" | \n",
|
|
" | .. note::\n",
|
|
" | This method modifies the module in-place.\n",
|
|
" | \n",
|
|
" | Args:\n",
|
|
" | dst_type (type or string): the desired type\n",
|
|
" | \n",
|
|
" | Returns:\n",
|
|
" | Module: self\n",
|
|
" | \n",
|
|
" | xpu(self: ~T, device: Union[int, torch.device, NoneType] = None) -> ~T\n",
|
|
" | Moves all model parameters and buffers to the XPU.\n",
|
|
" | \n",
|
|
" | This also makes associated parameters and buffers different objects. So\n",
|
|
" | it should be called before constructing optimizer if the module will\n",
|
|
" | live on XPU while being optimized.\n",
|
|
" | \n",
|
|
" | .. note::\n",
|
|
" | This method modifies the module in-place.\n",
|
|
" | \n",
|
|
" | Arguments:\n",
|
|
" | device (int, optional): if specified, all parameters will be\n",
|
|
" | copied to that device\n",
|
|
" | \n",
|
|
" | Returns:\n",
|
|
" | Module: self\n",
|
|
" | \n",
|
|
" | zero_grad(self, set_to_none: bool = True) -> None\n",
|
|
" | Sets gradients of all model parameters to zero. See similar function\n",
|
|
" | under :class:`torch.optim.Optimizer` for more context.\n",
|
|
" | \n",
|
|
" | Args:\n",
|
|
" | set_to_none (bool): instead of setting to zero, set the grads to None.\n",
|
|
" | See :meth:`torch.optim.Optimizer.zero_grad` for details.\n",
|
|
" | \n",
|
|
" | ----------------------------------------------------------------------\n",
|
|
" | Data descriptors inherited from torch.nn.modules.module.Module:\n",
|
|
" | \n",
|
|
" | __dict__\n",
|
|
" | dictionary for instance variables (if defined)\n",
|
|
" | \n",
|
|
" | __weakref__\n",
|
|
" | list of weak references to the object (if defined)\n",
|
|
" | \n",
|
|
" | ----------------------------------------------------------------------\n",
|
|
" | Data and other attributes inherited from torch.nn.modules.module.Module:\n",
|
|
" | \n",
|
|
" | T_destination = ~T_destination\n",
|
|
" | \n",
|
|
" | __annotations__ = {'__call__': typing.Callable[..., typing.Any], '_bac...\n",
|
|
" | \n",
|
|
" | call_super_init = False\n",
|
|
" | \n",
|
|
" | dump_patches = False\n",
|
|
"\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"help(vocab)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 18,
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"1000\n",
|
|
"2000\n",
|
|
"3000\n",
|
|
"4000\n",
|
|
"5000\n",
|
|
"6000\n",
|
|
"7000\n",
|
|
"8000\n",
|
|
"9000\n",
|
|
"10000\n",
|
|
"11000\n",
|
|
"12000\n",
|
|
"13000\n",
|
|
"14000\n",
|
|
"15000\n",
|
|
"16000\n",
|
|
"17000\n",
|
|
"18000\n",
|
|
"19000\n",
|
|
"20000\n",
|
|
"21000\n",
|
|
"22000\n",
|
|
"23000\n",
|
|
"24000\n",
|
|
"25000\n",
|
|
"26000\n",
|
|
"27000\n",
|
|
"28000\n",
|
|
"29000\n",
|
|
"30000\n",
|
|
"31000\n",
|
|
"32000\n",
|
|
"33000\n",
|
|
"34000\n",
|
|
"35000\n",
|
|
"36000\n",
|
|
"37000\n",
|
|
"38000\n",
|
|
"39000\n",
|
|
"40000\n",
|
|
"41000\n",
|
|
"42000\n",
|
|
"43000\n",
|
|
"44000\n",
|
|
"45000\n",
|
|
"46000\n",
|
|
"47000\n",
|
|
"48000\n",
|
|
"49000\n",
|
|
"50000\n",
|
|
"51000\n",
|
|
"52000\n",
|
|
"53000\n",
|
|
"54000\n",
|
|
"55000\n",
|
|
"56000\n",
|
|
"57000\n",
|
|
"58000\n",
|
|
"59000\n",
|
|
"60000\n",
|
|
"61000\n",
|
|
"62000\n",
|
|
"63000\n",
|
|
"64000\n",
|
|
"65000\n",
|
|
"66000\n",
|
|
"67000\n",
|
|
"68000\n",
|
|
"69000\n",
|
|
"70000\n",
|
|
"71000\n",
|
|
"72000\n",
|
|
"73000\n",
|
|
"74000\n",
|
|
"75000\n",
|
|
"76000\n",
|
|
"77000\n",
|
|
"78000\n",
|
|
"79000\n",
|
|
"80000\n",
|
|
"81000\n",
|
|
"82000\n",
|
|
"83000\n",
|
|
"84000\n",
|
|
"85000\n",
|
|
"86000\n",
|
|
"87000\n",
|
|
"88000\n",
|
|
"89000\n",
|
|
"90000\n",
|
|
"91000\n",
|
|
"92000\n",
|
|
"93000\n",
|
|
"94000\n",
|
|
"95000\n",
|
|
"96000\n",
|
|
"97000\n",
|
|
"98000\n",
|
|
"99000\n",
|
|
"100000\n",
|
|
"101000\n",
|
|
"102000\n",
|
|
"103000\n",
|
|
"104000\n",
|
|
"105000\n",
|
|
"106000\n",
|
|
"107000\n",
|
|
"108000\n",
|
|
"109000\n",
|
|
"110000\n",
|
|
"111000\n",
|
|
"112000\n",
|
|
"113000\n",
|
|
"114000\n",
|
|
"115000\n",
|
|
"116000\n",
|
|
"117000\n",
|
|
"118000\n",
|
|
"119000\n",
|
|
"120000\n",
|
|
"121000\n",
|
|
"122000\n",
|
|
"123000\n",
|
|
"124000\n",
|
|
"125000\n",
|
|
"126000\n",
|
|
"127000\n",
|
|
"128000\n",
|
|
"129000\n",
|
|
"130000\n",
|
|
"131000\n",
|
|
"132000\n",
|
|
"133000\n",
|
|
"134000\n",
|
|
"135000\n",
|
|
"136000\n",
|
|
"137000\n",
|
|
"138000\n",
|
|
"139000\n",
|
|
"140000\n",
|
|
"141000\n",
|
|
"142000\n",
|
|
"143000\n",
|
|
"144000\n",
|
|
"145000\n",
|
|
"146000\n",
|
|
"147000\n",
|
|
"148000\n",
|
|
"149000\n",
|
|
"150000\n",
|
|
"151000\n",
|
|
"152000\n",
|
|
"153000\n",
|
|
"154000\n",
|
|
"155000\n",
|
|
"156000\n",
|
|
"157000\n",
|
|
"158000\n",
|
|
"159000\n",
|
|
"160000\n",
|
|
"161000\n",
|
|
"162000\n",
|
|
"163000\n",
|
|
"164000\n",
|
|
"165000\n",
|
|
"166000\n",
|
|
"167000\n",
|
|
"168000\n",
|
|
"169000\n",
|
|
"170000\n",
|
|
"171000\n",
|
|
"172000\n",
|
|
"173000\n",
|
|
"174000\n",
|
|
"175000\n",
|
|
"176000\n",
|
|
"177000\n",
|
|
"178000\n",
|
|
"179000\n",
|
|
"180000\n",
|
|
"181000\n",
|
|
"182000\n",
|
|
"183000\n",
|
|
"184000\n",
|
|
"185000\n",
|
|
"186000\n",
|
|
"187000\n",
|
|
"188000\n",
|
|
"189000\n",
|
|
"190000\n",
|
|
"191000\n",
|
|
"192000\n",
|
|
"193000\n",
|
|
"194000\n",
|
|
"195000\n",
|
|
"196000\n",
|
|
"197000\n",
|
|
"198000\n",
|
|
"199000\n",
|
|
"200000\n",
|
|
"201000\n",
|
|
"202000\n",
|
|
"203000\n",
|
|
"204000\n",
|
|
"205000\n",
|
|
"206000\n",
|
|
"207000\n",
|
|
"208000\n",
|
|
"209000\n",
|
|
"210000\n",
|
|
"211000\n",
|
|
"212000\n",
|
|
"213000\n",
|
|
"214000\n",
|
|
"215000\n",
|
|
"216000\n",
|
|
"217000\n",
|
|
"218000\n",
|
|
"219000\n",
|
|
"220000\n",
|
|
"221000\n",
|
|
"222000\n",
|
|
"223000\n",
|
|
"224000\n",
|
|
"225000\n",
|
|
"226000\n",
|
|
"227000\n",
|
|
"228000\n",
|
|
"229000\n",
|
|
"230000\n",
|
|
"231000\n",
|
|
"232000\n",
|
|
"233000\n",
|
|
"234000\n",
|
|
"235000\n",
|
|
"236000\n",
|
|
"237000\n",
|
|
"238000\n",
|
|
"239000\n",
|
|
"240000\n",
|
|
"241000\n",
|
|
"242000\n",
|
|
"243000\n",
|
|
"244000\n",
|
|
"245000\n",
|
|
"246000\n",
|
|
"247000\n",
|
|
"248000\n",
|
|
"249000\n",
|
|
"250000\n",
|
|
"251000\n",
|
|
"252000\n",
|
|
"253000\n",
|
|
"254000\n",
|
|
"255000\n",
|
|
"256000\n",
|
|
"257000\n",
|
|
"258000\n",
|
|
"259000\n",
|
|
"260000\n",
|
|
"261000\n",
|
|
"262000\n",
|
|
"263000\n",
|
|
"264000\n",
|
|
"265000\n",
|
|
"266000\n",
|
|
"267000\n",
|
|
"268000\n",
|
|
"269000\n",
|
|
"270000\n",
|
|
"271000\n",
|
|
"272000\n",
|
|
"273000\n",
|
|
"274000\n",
|
|
"275000\n",
|
|
"276000\n",
|
|
"277000\n",
|
|
"278000\n",
|
|
"279000\n",
|
|
"280000\n",
|
|
"281000\n",
|
|
"282000\n",
|
|
"283000\n",
|
|
"284000\n",
|
|
"285000\n",
|
|
"286000\n",
|
|
"287000\n",
|
|
"288000\n",
|
|
"289000\n",
|
|
"290000\n",
|
|
"291000\n",
|
|
"292000\n",
|
|
"293000\n",
|
|
"294000\n",
|
|
"295000\n",
|
|
"296000\n",
|
|
"297000\n",
|
|
"298000\n",
|
|
"299000\n",
|
|
"300000\n",
|
|
"301000\n",
|
|
"302000\n",
|
|
"303000\n",
|
|
"304000\n",
|
|
"305000\n",
|
|
"306000\n",
|
|
"307000\n",
|
|
"308000\n",
|
|
"309000\n",
|
|
"310000\n",
|
|
"311000\n",
|
|
"312000\n",
|
|
"313000\n",
|
|
"314000\n",
|
|
"315000\n",
|
|
"316000\n",
|
|
"317000\n",
|
|
"318000\n",
|
|
"319000\n",
|
|
"320000\n",
|
|
"321000\n",
|
|
"322000\n",
|
|
"323000\n",
|
|
"324000\n",
|
|
"325000\n",
|
|
"326000\n",
|
|
"327000\n",
|
|
"328000\n",
|
|
"329000\n",
|
|
"330000\n",
|
|
"331000\n",
|
|
"332000\n",
|
|
"333000\n",
|
|
"334000\n",
|
|
"335000\n",
|
|
"336000\n",
|
|
"337000\n",
|
|
"338000\n",
|
|
"339000\n",
|
|
"340000\n",
|
|
"341000\n",
|
|
"342000\n",
|
|
"343000\n",
|
|
"344000\n",
|
|
"345000\n",
|
|
"346000\n",
|
|
"347000\n",
|
|
"348000\n",
|
|
"349000\n",
|
|
"350000\n",
|
|
"351000\n",
|
|
"352000\n",
|
|
"353000\n",
|
|
"354000\n",
|
|
"355000\n",
|
|
"356000\n",
|
|
"357000\n",
|
|
"358000\n",
|
|
"359000\n",
|
|
"360000\n",
|
|
"361000\n",
|
|
"362000\n",
|
|
"363000\n",
|
|
"364000\n",
|
|
"365000\n",
|
|
"366000\n",
|
|
"367000\n",
|
|
"368000\n",
|
|
"369000\n",
|
|
"370000\n",
|
|
"371000\n",
|
|
"372000\n",
|
|
"373000\n",
|
|
"374000\n",
|
|
"375000\n",
|
|
"376000\n",
|
|
"377000\n",
|
|
"378000\n",
|
|
"379000\n",
|
|
"380000\n",
|
|
"381000\n",
|
|
"382000\n",
|
|
"383000\n",
|
|
"384000\n",
|
|
"385000\n",
|
|
"386000\n",
|
|
"387000\n",
|
|
"388000\n",
|
|
"389000\n",
|
|
"390000\n",
|
|
"391000\n",
|
|
"392000\n",
|
|
"393000\n",
|
|
"394000\n",
|
|
"395000\n",
|
|
"396000\n",
|
|
"397000\n",
|
|
"398000\n",
|
|
"399000\n",
|
|
"400000\n",
|
|
"401000\n",
|
|
"402000\n",
|
|
"403000\n",
|
|
"404000\n",
|
|
"405000\n",
|
|
"406000\n",
|
|
"407000\n",
|
|
"408000\n",
|
|
"409000\n",
|
|
"410000\n",
|
|
"411000\n",
|
|
"412000\n",
|
|
"413000\n",
|
|
"414000\n",
|
|
"415000\n",
|
|
"416000\n",
|
|
"417000\n",
|
|
"418000\n",
|
|
"419000\n",
|
|
"420000\n",
|
|
"421000\n",
|
|
"422000\n",
|
|
"423000\n",
|
|
"424000\n",
|
|
"425000\n",
|
|
"426000\n",
|
|
"427000\n",
|
|
"428000\n",
|
|
"429000\n",
|
|
"430000\n",
|
|
"431000\n",
|
|
"432000\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"def look_ahead_iterator(gen):\n",
|
|
"    \"\"\"Yield overlapping (previous, current) pairs from an iterable.\n",
|
|
"\n",
|
|
"    For items a, b, c the pairs are (a, b) and (b, c); an input with\n",
|
|
"    fewer than two items yields nothing.\n",
|
|
"\n",
|
|
"    Args:\n",
|
|
"        gen: any iterable; it is consumed lazily, one item at a time.\n",
|
|
"\n",
|
|
"    Yields:\n",
|
|
"        2-tuples of consecutive items.\n",
|
|
"    \"\"\"\n",
|
|
"    # A private sentinel marks 'no previous item yet', so a genuine None\n",
|
|
"    # in the stream is treated as data instead of dropping a pair.\n",
|
|
"    missing = object()\n",
|
|
"    prev = missing\n",
|
|
"    for item in gen:\n",
|
|
"        if prev is not missing:\n",
|
|
"            yield (prev, item)\n",
|
|
"        prev = item\n",
|
|
"\n",
|
|
"class Bigrams(IterableDataset):\n",
|
|
"    \"\"\"Iterable dataset of consecutive token-id pairs read from a text file.\"\"\"\n",
|
|
"\n",
|
|
"    def __init__(self, text_file, vocabulary_size):\n",
|
|
"        \"\"\"Build the vocabulary from text_file and remember the settings.\n",
|
|
"\n",
|
|
"        Args:\n",
|
|
"            text_file: passed to get_word_lines_from_file, once here and\n",
|
|
"                again on every iteration.\n",
|
|
"            vocabulary_size: forwarded to build_vocab_from_iterator as max_tokens.\n",
|
|
"        \"\"\"\n",
|
|
"        tokenized_lines = get_word_lines_from_file(text_file)\n",
|
|
"        self.vocab = build_vocab_from_iterator(\n",
|
|
"            tokenized_lines,\n",
|
|
"            max_tokens=vocabulary_size,\n",
|
|
"            specials=['<unk>'],\n",
|
|
"        )\n",
|
|
"        # Tokens missing from the vocabulary resolve to the '<unk>' index.\n",
|
|
"        unk_index = self.vocab['<unk>']\n",
|
|
"        self.vocab.set_default_index(unk_index)\n",
|
|
"        self.vocabulary_size = vocabulary_size\n",
|
|
"        self.text_file = text_file\n",
|
|
"\n",
|
|
"    def __iter__(self):\n",
|
|
"        \"\"\"Return a fresh iterator of (previous_id, current_id) pairs.\n",
|
|
"\n",
|
|
"        The lines are flattened into one token stream first, so pairs\n",
|
|
"        also span line boundaries.\n",
|
|
"        \"\"\"\n",
|
|
"        token_stream = itertools.chain.from_iterable(\n",
|
|
"            get_word_lines_from_file(self.text_file))\n",
|
|
"        id_stream = (self.vocab[token] for token in token_stream)\n",
|
|
"        return look_ahead_iterator(id_stream)\n",
|
|
"\n",
|
|
"train_dataset = Bigrams(train_file, vocab_size)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 15,
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"<__main__.Bigrams object at 0x7fdd26d23940>\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"print(train_dataset)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 17,
|
|
"metadata": {
|
|
"collapsed": true
|
|
},
|
|
"outputs": [
|
|
{
|
|
"data": {
|
|
"text/plain": [
|
|
"'|===========================================================================|\\n| PyTorch CUDA memory summary, device ID 0 |\\n|---------------------------------------------------------------------------|\\n| CUDA OOMs: 1 | cudaMalloc retries: 1 |\\n|===========================================================================|\\n| Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed |\\n|---------------------------------------------------------------------------|\\n| Allocated memory | 699613 KiB | 1903 MiB | 3735 MiB | 3052 MiB |\\n| from large pool | 699414 KiB | 1903 MiB | 3734 MiB | 3051 MiB |\\n| from small pool | 199 KiB | 1 MiB | 1 MiB | 1 MiB |\\n|---------------------------------------------------------------------------|\\n| Active memory | 699613 KiB | 1903 MiB | 3735 MiB | 3052 MiB |\\n| from large pool | 699414 KiB | 1903 MiB | 3734 MiB | 3051 MiB |\\n| from small pool | 199 KiB | 1 MiB | 1 MiB | 1 MiB |\\n|---------------------------------------------------------------------------|\\n| Requested memory | 699611 KiB | 1903 MiB | 3735 MiB | 3052 MiB |\\n| from large pool | 699413 KiB | 1903 MiB | 3734 MiB | 3051 MiB |\\n| from small pool | 197 KiB | 1 MiB | 1 MiB | 1 MiB |\\n|---------------------------------------------------------------------------|\\n| GPU reserved memory | 710656 KiB | 1918 MiB | 1918 MiB | 1224 MiB |\\n| from large pool | 708608 KiB | 1916 MiB | 1916 MiB | 1224 MiB |\\n| from small pool | 2048 KiB | 2 MiB | 2 MiB | 0 MiB |\\n|---------------------------------------------------------------------------|\\n| Non-releasable memory | 11043 KiB | 19364 KiB | 28939 KiB | 17896 KiB |\\n| from large pool | 9194 KiB | 17514 KiB | 25954 KiB | 16760 KiB |\\n| from small pool | 1849 KiB | 1950 KiB | 2985 KiB | 1136 KiB |\\n|---------------------------------------------------------------------------|\\n| Allocations | 10 | 17 | 38 | 28 |\\n| from large pool | 5 | 7 | 10 | 5 |\\n| from small pool | 5 | 11 | 28 | 23 
|\\n|---------------------------------------------------------------------------|\\n| Active allocs | 10 | 17 | 38 | 28 |\\n| from large pool | 5 | 7 | 10 | 5 |\\n| from small pool | 5 | 11 | 28 | 23 |\\n|---------------------------------------------------------------------------|\\n| GPU reserved segments | 5 | 7 | 7 | 2 |\\n| from large pool | 4 | 6 | 6 | 2 |\\n| from small pool | 1 | 1 | 1 | 0 |\\n|---------------------------------------------------------------------------|\\n| Non-releasable allocs | 6 | 8 | 20 | 14 |\\n| from large pool | 4 | 6 | 9 | 5 |\\n| from small pool | 2 | 3 | 11 | 9 |\\n|---------------------------------------------------------------------------|\\n| Oversize allocations | 0 | 0 | 0 | 0 |\\n|---------------------------------------------------------------------------|\\n| Oversize GPU segments | 0 | 0 | 0 | 0 |\\n|===========================================================================|\\n'"
|
|
]
|
|
},
|
|
"execution_count": 17,
|
|
"metadata": {},
|
|
"output_type": "execute_result"
|
|
}
|
|
],
|
|
"source": [
|
|
"torch.cuda.memory_summary(device=None, abbreviated=False)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 14,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"import os\n",
|
|
"\n",
|
|
"# Allocator tuning: limit the size (in MB) of blocks the CUDA caching allocator may split.\n",
|
|
"# NOTE(review): PyTorch appears to read this variable when CUDA is first initialised,\n",
|
|
"# so this cell likely has to run before any CUDA call - confirm the cell order.\n",
|
|
"os.environ.update({\"PYTORCH_CUDA_ALLOC_CONF\": \"max_split_size_mb:256\"})"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 15,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"# Use the GPU when one is visible; otherwise fall back to the CPU so this cell does not fail.\n",
|
|
"device = 'cuda' if torch.cuda.is_available() else 'cpu'\n",
|
|
"model = SimpleBigramNeuralLanguageModel(vocab_size, embed_size).to(device)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 13,
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"epoch: = 1\n"
|
|
]
|
|
},
|
|
{
|
|
"name": "stderr",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"/home/gedin/.local/lib/python3.8/site-packages/torch/nn/modules/container.py:217: UserWarning: Implicit dimension choice for softmax has been deprecated. Change the call to include dim=X as an argument.\n",
|
|
" input = module(input)\n"
|
|
]
|
|
},
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"0 tensor(5.9599, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"1000\n",
|
|
"100 tensor(6.1015, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"200 tensor(5.9708, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"2000\n",
|
|
"300 tensor(6.2176, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"3000\n",
|
|
"400 tensor(5.9401, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"4000\n",
|
|
"500 tensor(6.2084, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"5000\n",
|
|
"600 tensor(5.9736, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"6000\n",
|
|
"700 tensor(6.1423, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"7000\n",
|
|
"800 tensor(5.7344, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"8000\n",
|
|
"900 tensor(6.0950, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"9000\n",
|
|
"1000 tensor(5.8473, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"10000\n",
|
|
"1100 tensor(6.0612, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"11000\n",
|
|
"1200 tensor(6.1509, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"12000\n",
|
|
"1300 tensor(6.0760, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"13000\n",
|
|
"1400 tensor(6.2047, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"14000\n",
|
|
"1500 tensor(6.1186, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"15000\n",
|
|
"1600 tensor(5.8722, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"16000\n",
|
|
"1700 tensor(5.8741, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"17000\n",
|
|
"1800 tensor(5.8971, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"18000\n",
|
|
"1900 tensor(5.8521, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"19000\n",
|
|
"2000 tensor(5.9434, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"20000\n",
|
|
"2100 tensor(6.0348, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"21000\n",
|
|
"2200 tensor(5.8840, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"22000\n",
|
|
"2300 tensor(5.8641, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"23000\n",
|
|
"2400 tensor(5.9068, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"24000\n",
|
|
"2500 tensor(5.9170, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"25000\n",
|
|
"2600 tensor(5.9812, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"26000\n",
|
|
"2700 tensor(5.8985, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"27000\n",
|
|
"2800 tensor(6.0008, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"28000\n",
|
|
"2900 tensor(6.1230, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"29000\n",
|
|
"3000 tensor(5.8770, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"30000\n",
|
|
"3100 tensor(5.9268, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"31000\n",
|
|
"3200 tensor(5.8530, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"32000\n",
|
|
"3300 tensor(5.8436, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"33000\n",
|
|
"3400 tensor(5.7692, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"34000\n",
|
|
"3500 tensor(5.8909, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"35000\n",
|
|
"3600 tensor(5.8325, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"36000\n",
|
|
"3700 tensor(5.8082, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"37000\n",
|
|
"3800 tensor(5.8106, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"38000\n",
|
|
"3900 tensor(5.6382, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"39000\n",
|
|
"4000 tensor(5.6596, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"40000\n",
|
|
"4100 tensor(5.9587, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"41000\n",
|
|
"4200 tensor(5.8862, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"42000\n",
|
|
"4300 tensor(5.9541, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"43000\n",
|
|
"4400 tensor(5.8681, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"44000\n",
|
|
"4500 tensor(5.6963, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"45000\n",
|
|
"4600 tensor(6.0707, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"46000\n",
|
|
"4700 tensor(5.7091, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"47000\n",
|
|
"4800 tensor(5.8139, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"48000\n",
|
|
"4900 tensor(5.8696, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"49000\n",
|
|
"5000 tensor(5.8844, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"50000\n",
|
|
"5100 tensor(5.9806, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"51000\n",
|
|
"5200 tensor(6.0075, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"52000\n",
|
|
"5300 tensor(6.0588, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"53000\n",
|
|
"5400 tensor(5.8456, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"54000\n",
|
|
"5500 tensor(5.9166, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"55000\n",
|
|
"5600 tensor(5.6528, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"56000\n",
|
|
"5700 tensor(5.8988, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"57000\n",
|
|
"5800 tensor(5.9132, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"58000\n",
|
|
"5900 tensor(5.9460, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"59000\n",
|
|
"6000 tensor(5.7543, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"60000\n",
|
|
"6100 tensor(5.8256, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"61000\n",
|
|
"6200 tensor(5.9448, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"62000\n",
|
|
"6300 tensor(5.7601, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"63000\n",
|
|
"6400 tensor(5.7091, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"64000\n",
|
|
"6500 tensor(5.5621, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"65000\n",
|
|
"6600 tensor(5.7094, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"66000\n",
|
|
"6700 tensor(5.6785, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"67000\n",
|
|
"6800 tensor(5.9249, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"68000\n",
|
|
"6900 tensor(5.8775, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"69000\n",
|
|
"7000 tensor(5.8075, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"70000\n",
|
|
"7100 tensor(5.5748, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"71000\n",
|
|
"7200 tensor(5.7217, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"72000\n",
|
|
"7300 tensor(5.9124, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"73000\n",
|
|
"7400 tensor(5.7197, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"74000\n",
|
|
"7500 tensor(5.6429, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"75000\n",
|
|
"7600 tensor(5.6847, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"76000\n",
|
|
"7700 tensor(5.7197, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"77000\n",
|
|
"7800 tensor(5.8559, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"78000\n",
|
|
"7900 tensor(5.5600, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"79000\n",
|
|
"8000 tensor(5.6288, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"80000\n",
|
|
"8100 tensor(5.7767, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"81000\n",
|
|
"8200 tensor(5.8037, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"82000\n",
|
|
"8300 tensor(5.7344, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"83000\n",
|
|
"8400 tensor(5.8092, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"84000\n",
|
|
"8500 tensor(5.8847, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"85000\n",
|
|
"8600 tensor(5.8754, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"86000\n",
|
|
"8700 tensor(5.9227, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"87000\n",
|
|
"8800 tensor(5.8028, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"88000\n",
|
|
"8900 tensor(5.6476, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"89000\n",
|
|
"9000 tensor(5.7656, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"90000\n",
|
|
"9100 tensor(5.7805, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"91000\n",
|
|
"9200 tensor(5.6879, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"92000\n",
|
|
"9300 tensor(5.7098, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"93000\n",
|
|
"9400 tensor(5.5631, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"94000\n",
|
|
"9500 tensor(5.6497, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"95000\n",
|
|
"9600 tensor(5.7500, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"96000\n",
|
|
"9700 tensor(5.6607, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"97000\n",
|
|
"9800 tensor(5.7196, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"9900 tensor(5.5987, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"98000\n",
|
|
"10000 tensor(5.7795, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"99000\n",
|
|
"10100 tensor(5.6980, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"100000\n",
|
|
"10200 tensor(5.6093, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"101000\n",
|
|
"10300 tensor(5.6792, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"102000\n",
|
|
"10400 tensor(5.7035, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"103000\n",
|
|
"10500 tensor(5.8282, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"104000\n",
|
|
"10600 tensor(5.8605, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"105000\n",
|
|
"10700 tensor(5.7354, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"106000\n",
|
|
"10800 tensor(5.8034, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"107000\n",
|
|
"10900 tensor(5.6194, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"108000\n",
|
|
"11000 tensor(5.8502, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"109000\n",
|
|
"11100 tensor(5.4406, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"110000\n",
|
|
"11200 tensor(5.6379, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"111000\n",
|
|
"11300 tensor(5.6668, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"112000\n",
|
|
"11400 tensor(5.6140, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"113000\n",
|
|
"11500 tensor(5.6565, device='cuda:0', grad_fn=<NllLossBackward0>)\n"
|
|
]
|
|
},
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"114000\n",
|
|
"11600 tensor(5.6308, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"115000\n",
|
|
"11700 tensor(5.5680, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"116000\n",
|
|
"11800 tensor(5.7604, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"117000\n",
|
|
"11900 tensor(5.5792, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"118000\n",
|
|
"12000 tensor(5.7329, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"119000\n",
|
|
"12100 tensor(5.7726, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"120000\n",
|
|
"12200 tensor(5.7151, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"121000\n",
|
|
"12300 tensor(5.8561, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"122000\n",
|
|
"12400 tensor(5.6791, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"123000\n",
|
|
"12500 tensor(5.5574, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"124000\n",
|
|
"12600 tensor(5.6817, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"125000\n",
|
|
"12700 tensor(5.5375, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"126000\n",
|
|
"12800 tensor(5.7270, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"127000\n",
|
|
"12900 tensor(5.6252, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"128000\n",
|
|
"13000 tensor(5.4536, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"129000\n",
|
|
"13100 tensor(5.6091, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"130000\n",
|
|
"13200 tensor(5.7324, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"131000\n",
|
|
"13300 tensor(5.5253, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"132000\n",
|
|
"13400 tensor(5.6491, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"133000\n",
|
|
"13500 tensor(5.5728, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"134000\n",
|
|
"13600 tensor(5.6632, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"135000\n",
|
|
"13700 tensor(5.6678, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"136000\n",
|
|
"13800 tensor(5.6112, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"137000\n",
|
|
"13900 tensor(5.4884, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"138000\n",
|
|
"14000 tensor(5.7304, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"139000\n",
|
|
"14100 tensor(5.4326, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"140000\n",
|
|
"14200 tensor(5.7188, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"141000\n",
|
|
"14300 tensor(5.6519, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"142000\n",
|
|
"14400 tensor(5.5892, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"143000\n",
|
|
"14500 tensor(5.7225, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"144000\n",
|
|
"14600 tensor(5.7216, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"145000\n",
|
|
"14700 tensor(5.5748, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"146000\n",
|
|
"14800 tensor(6.0184, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"147000\n",
|
|
"14900 tensor(5.6781, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"148000\n",
|
|
"15000 tensor(5.6038, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"149000\n",
|
|
"15100 tensor(5.7875, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"150000\n",
|
|
"15200 tensor(5.6485, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"151000\n",
|
|
"15300 tensor(5.5927, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"152000\n",
|
|
"15400 tensor(5.5156, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"153000\n",
|
|
"15500 tensor(5.6556, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"154000\n",
|
|
"15600 tensor(5.6485, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"155000\n",
|
|
"15700 tensor(5.5904, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"156000\n",
|
|
"15800 tensor(5.4613, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"157000\n",
|
|
"15900 tensor(5.6254, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"158000\n",
|
|
"16000 tensor(5.4349, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"159000\n",
|
|
"16100 tensor(5.5205, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"160000\n",
|
|
"16200 tensor(5.8051, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"161000\n",
|
|
"16300 tensor(5.6452, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"162000\n",
|
|
"16400 tensor(5.6071, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"163000\n",
|
|
"16500 tensor(5.7237, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"164000\n",
|
|
"16600 tensor(5.5771, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"165000\n",
|
|
"16700 tensor(5.5355, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"166000\n",
|
|
"16800 tensor(5.6363, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"167000\n",
|
|
"16900 tensor(5.3746, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"168000\n",
|
|
"17000 tensor(5.6707, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"169000\n",
|
|
"17100 tensor(5.5359, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"170000\n",
|
|
"17200 tensor(5.6118, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"171000\n",
|
|
"17300 tensor(5.6740, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"172000\n",
|
|
"17400 tensor(5.4438, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"173000\n",
|
|
"17500 tensor(5.5001, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"174000\n",
|
|
"17600 tensor(5.4953, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"175000\n",
|
|
"17700 tensor(5.5398, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"176000\n",
|
|
"17800 tensor(5.6053, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"177000\n",
|
|
"17900 tensor(5.4726, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"178000\n",
|
|
"18000 tensor(5.6747, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"179000\n",
|
|
"18100 tensor(5.6238, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"180000\n",
|
|
"18200 tensor(5.5469, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"181000\n",
|
|
"18300 tensor(5.5299, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"182000\n",
|
|
"18400 tensor(5.6323, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"183000\n",
|
|
"18500 tensor(5.5893, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"184000\n",
|
|
"18600 tensor(5.7452, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"185000\n",
|
|
"18700 tensor(5.5576, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"186000\n",
|
|
"18800 tensor(5.7439, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"187000\n",
|
|
"18900 tensor(5.6106, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"188000\n",
|
|
"19000 tensor(5.6647, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"189000\n",
|
|
"19100 tensor(5.7728, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"190000\n",
|
|
"19200 tensor(5.6169, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"191000\n",
|
|
"19300 tensor(5.7852, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"192000\n",
|
|
"19400 tensor(5.5627, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"193000\n",
|
|
"19500 tensor(5.5682, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"194000\n",
|
|
"19600 tensor(5.5978, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"195000\n",
|
|
"19700 tensor(5.6453, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"196000\n",
|
|
"19800 tensor(5.4786, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"197000\n",
|
|
"19900 tensor(5.4894, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"198000\n",
|
|
"20000 tensor(5.4999, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"199000\n",
|
|
"20100 tensor(5.4881, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"200000\n",
|
|
"20200 tensor(5.3915, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"201000\n",
|
|
"20300 tensor(5.5216, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"20400 tensor(5.5761, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"202000\n",
|
|
"20500 tensor(5.5586, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"203000\n",
|
|
"20600 tensor(5.7870, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"204000\n",
|
|
"20700 tensor(5.5776, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"205000\n",
|
|
"20800 tensor(5.4417, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"206000\n",
|
|
"20900 tensor(5.7186, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"207000\n",
|
|
"21000 tensor(5.5415, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"208000\n",
|
|
"21100 tensor(5.5141, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"209000\n",
|
|
"21200 tensor(5.4401, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"210000\n",
|
|
"21300 tensor(5.6511, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"211000\n",
|
|
"21400 tensor(5.6474, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"212000\n",
|
|
"21500 tensor(5.3946, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"213000\n",
|
|
"21600 tensor(5.3958, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"214000\n",
|
|
"21700 tensor(5.4040, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"215000\n",
|
|
"21800 tensor(5.5745, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"216000\n",
|
|
"21900 tensor(5.4996, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"217000\n",
|
|
"22000 tensor(5.5234, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"218000\n",
|
|
"22100 tensor(5.3870, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"219000\n",
|
|
"22200 tensor(5.2661, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"220000\n",
|
|
"22300 tensor(5.7031, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"221000\n",
|
|
"22400 tensor(5.3633, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"222000\n",
|
|
"22500 tensor(5.4404, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"223000\n",
|
|
"22600 tensor(5.5951, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"224000\n",
|
|
"22700 tensor(5.3901, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"225000\n",
|
|
"22800 tensor(5.6404, device='cuda:0', grad_fn=<NllLossBackward0>)\n"
|
|
]
|
|
},
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"226000\n",
|
|
"22900 tensor(5.6646, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"227000\n",
|
|
"23000 tensor(5.5949, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"228000\n",
|
|
"23100 tensor(5.5284, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"229000\n",
|
|
"23200 tensor(5.5617, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"230000\n",
|
|
"23300 tensor(5.6426, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"231000\n",
|
|
"23400 tensor(5.7283, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"232000\n",
|
|
"23500 tensor(5.4558, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"233000\n",
|
|
"23600 tensor(5.4600, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"234000\n",
|
|
"23700 tensor(5.4961, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"235000\n",
|
|
"23800 tensor(5.3373, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"236000\n",
|
|
"23900 tensor(5.4470, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"237000\n",
|
|
"24000 tensor(5.4346, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"238000\n",
|
|
"24100 tensor(5.5112, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"239000\n",
|
|
"24200 tensor(5.6918, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"240000\n",
|
|
"24300 tensor(5.6115, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"241000\n",
|
|
"24400 tensor(5.7404, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"242000\n",
|
|
"24500 tensor(5.4982, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"243000\n",
|
|
"24600 tensor(5.6136, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"244000\n",
|
|
"24700 tensor(5.5225, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"245000\n",
|
|
"24800 tensor(5.5563, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"246000\n",
|
|
"24900 tensor(5.6283, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"247000\n",
|
|
"25000 tensor(5.6176, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"248000\n",
|
|
"25100 tensor(5.5795, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"249000\n",
|
|
"25200 tensor(5.5831, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"250000\n",
|
|
"25300 tensor(5.5894, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"251000\n",
|
|
"25400 tensor(5.5670, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"252000\n",
|
|
"25500 tensor(5.5016, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"253000\n",
|
|
"25600 tensor(5.7909, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"254000\n",
|
|
"25700 tensor(5.5229, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"255000\n",
|
|
"25800 tensor(5.6035, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"256000\n",
|
|
"25900 tensor(5.5293, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"257000\n",
|
|
"26000 tensor(5.5553, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"258000\n",
|
|
"26100 tensor(5.4476, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"259000\n",
|
|
"26200 tensor(5.3721, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"260000\n",
|
|
"26300 tensor(5.6142, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"261000\n",
|
|
"26400 tensor(5.6202, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"262000\n",
|
|
"26500 tensor(5.3529, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"263000\n",
|
|
"26600 tensor(5.7148, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"264000\n",
|
|
"26700 tensor(5.5755, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"265000\n",
|
|
"26800 tensor(5.7480, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"266000\n",
|
|
"26900 tensor(5.5025, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"267000\n",
|
|
"27000 tensor(5.4017, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"268000\n",
|
|
"27100 tensor(5.3996, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"269000\n",
|
|
"27200 tensor(5.4862, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"270000\n",
|
|
"27300 tensor(5.6392, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"271000\n",
|
|
"27400 tensor(5.5634, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"272000\n",
|
|
"27500 tensor(5.4420, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"273000\n",
|
|
"27600 tensor(5.7835, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"274000\n",
|
|
"27700 tensor(5.5555, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"275000\n",
|
|
"27800 tensor(5.5381, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"276000\n",
|
|
"27900 tensor(5.6515, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"277000\n",
|
|
"28000 tensor(5.5254, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"278000\n",
|
|
"28100 tensor(5.4929, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"279000\n",
|
|
"28200 tensor(5.6218, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"280000\n",
|
|
"28300 tensor(5.2878, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"281000\n",
|
|
"28400 tensor(5.7112, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"282000\n",
|
|
"28500 tensor(5.5490, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"283000\n",
|
|
"28600 tensor(5.4572, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"284000\n",
|
|
"28700 tensor(5.6349, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"285000\n",
|
|
"28800 tensor(5.6607, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"286000\n",
|
|
"28900 tensor(5.5422, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"287000\n",
|
|
"29000 tensor(5.4277, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"288000\n",
|
|
"29100 tensor(5.1870, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"289000\n",
|
|
"29200 tensor(5.3593, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"290000\n",
|
|
"29300 tensor(5.6512, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"291000\n",
|
|
"29400 tensor(5.8051, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"292000\n",
|
|
"29500 tensor(5.5308, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"293000\n",
|
|
"29600 tensor(5.3791, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"294000\n",
|
|
"29700 tensor(5.6108, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"295000\n",
|
|
"29800 tensor(5.4015, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"296000\n",
|
|
"29900 tensor(5.6953, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"297000\n",
|
|
"30000 tensor(5.3925, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"298000\n",
|
|
"30100 tensor(5.4241, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"299000\n",
|
|
"30200 tensor(5.4216, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"300000\n",
|
|
"30300 tensor(5.5074, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"301000\n",
|
|
"30400 tensor(5.3631, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"302000\n",
|
|
"30500 tensor(5.5690, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"30600 tensor(5.4734, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"303000\n",
|
|
"30700 tensor(5.5061, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"304000\n",
|
|
"30800 tensor(5.5709, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"305000\n",
|
|
"30900 tensor(5.5478, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"306000\n",
|
|
"31000 tensor(5.6687, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"307000\n",
|
|
"31100 tensor(5.2899, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"308000\n",
|
|
"31200 tensor(5.3663, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"309000\n",
|
|
"31300 tensor(5.6274, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"310000\n",
|
|
"31400 tensor(5.4358, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"311000\n",
|
|
"31500 tensor(5.5738, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"312000\n",
|
|
"31600 tensor(5.5612, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"313000\n",
|
|
"31700 tensor(5.5104, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"314000\n",
|
|
"31800 tensor(5.6343, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"315000\n",
|
|
"31900 tensor(5.2243, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"316000\n",
|
|
"32000 tensor(5.4320, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"317000\n",
|
|
"32100 tensor(5.3344, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"318000\n",
|
|
"32200 tensor(5.6543, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"319000\n",
|
|
"32300 tensor(5.6512, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"320000\n",
|
|
"32400 tensor(5.6237, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"321000\n",
|
|
"32500 tensor(5.4246, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"322000\n",
|
|
"32600 tensor(5.5469, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"323000\n",
|
|
"32700 tensor(5.5338, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"324000\n",
|
|
"32800 tensor(5.6954, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"325000\n",
|
|
"32900 tensor(5.5754, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"326000\n",
|
|
"33000 tensor(5.3334, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"327000\n",
|
|
"33100 tensor(5.5284, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"328000\n",
|
|
"33200 tensor(5.6350, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"329000\n",
|
|
"33300 tensor(5.4312, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"330000\n",
|
|
"33400 tensor(5.6854, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"331000\n",
|
|
"33500 tensor(5.4921, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"332000\n",
|
|
"33600 tensor(5.4345, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"333000\n",
|
|
"33700 tensor(5.4950, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"334000\n",
|
|
"33800 tensor(5.5757, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"335000\n",
|
|
"33900 tensor(5.3466, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"336000\n",
|
|
"34000 tensor(5.5373, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"337000\n",
|
|
"34100 tensor(5.5144, device='cuda:0', grad_fn=<NllLossBackward0>)\n"
|
|
]
|
|
},
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"338000\n",
|
|
"34200 tensor(5.5543, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"339000\n",
|
|
"34300 tensor(5.3564, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"340000\n",
|
|
"34400 tensor(5.8091, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"341000\n",
|
|
"34500 tensor(5.6699, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"342000\n",
|
|
"34600 tensor(5.5536, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"343000\n",
|
|
"34700 tensor(5.6261, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"344000\n",
|
|
"34800 tensor(5.6504, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"345000\n",
|
|
"34900 tensor(5.7067, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"346000\n",
|
|
"35000 tensor(5.7307, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"347000\n",
|
|
"35100 tensor(5.4831, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"348000\n",
|
|
"35200 tensor(5.4367, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"349000\n",
|
|
"35300 tensor(5.6503, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"350000\n",
|
|
"35400 tensor(5.2892, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"351000\n",
|
|
"35500 tensor(5.4198, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"352000\n",
|
|
"35600 tensor(5.4870, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"353000\n",
|
|
"35700 tensor(5.4489, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"354000\n",
|
|
"35800 tensor(5.5170, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"355000\n",
|
|
"35900 tensor(5.4699, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"356000\n",
|
|
"36000 tensor(5.2451, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"357000\n",
|
|
"36100 tensor(5.6311, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"358000\n",
|
|
"36200 tensor(5.5157, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"359000\n",
|
|
"36300 tensor(5.7751, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"360000\n",
|
|
"36400 tensor(5.4740, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"361000\n",
|
|
"36500 tensor(5.4746, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"362000\n",
|
|
"36600 tensor(5.5244, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"363000\n",
|
|
"36700 tensor(5.3037, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"364000\n",
|
|
"36800 tensor(5.4238, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"365000\n",
|
|
"36900 tensor(5.5203, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"366000\n",
|
|
"37000 tensor(5.4431, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"367000\n",
|
|
"37100 tensor(5.4286, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"368000\n",
|
|
"37200 tensor(5.5108, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"369000\n",
|
|
"37300 tensor(5.4229, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"370000\n",
|
|
"37400 tensor(5.8406, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"371000\n",
|
|
"37500 tensor(5.4602, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"372000\n",
|
|
"37600 tensor(5.4417, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"373000\n",
|
|
"37700 tensor(5.6200, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"374000\n",
|
|
"37800 tensor(5.4527, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"375000\n",
|
|
"37900 tensor(5.4631, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"376000\n",
|
|
"38000 tensor(5.5196, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"377000\n",
|
|
"38100 tensor(5.5436, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"378000\n",
|
|
"38200 tensor(5.5269, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"379000\n",
|
|
"38300 tensor(5.4716, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"380000\n",
|
|
"38400 tensor(5.5081, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"381000\n",
|
|
"38500 tensor(5.5249, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"382000\n",
|
|
"38600 tensor(5.5018, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"383000\n",
|
|
"38700 tensor(5.4845, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"384000\n",
|
|
"38800 tensor(5.5505, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"385000\n",
|
|
"38900 tensor(5.6658, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"386000\n",
|
|
"39000 tensor(5.3333, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"387000\n",
|
|
"39100 tensor(5.5598, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"388000\n",
|
|
"39200 tensor(5.6624, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"389000\n",
|
|
"39300 tensor(5.4714, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"390000\n",
|
|
"39400 tensor(5.5470, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"391000\n",
|
|
"39500 tensor(5.6905, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"392000\n",
|
|
"39600 tensor(5.3592, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"393000\n",
|
|
"39700 tensor(5.3170, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"394000\n",
|
|
"39800 tensor(5.4491, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"395000\n",
|
|
"39900 tensor(5.2872, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"396000\n",
|
|
"40000 tensor(5.3865, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"397000\n",
|
|
"40100 tensor(5.4536, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"398000\n",
|
|
"40200 tensor(5.4382, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"399000\n",
|
|
"40300 tensor(5.4819, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"40400 tensor(5.5250, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"400000\n",
|
|
"40500 tensor(5.4396, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"401000\n",
|
|
"40600 tensor(5.5062, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"402000\n",
|
|
"40700 tensor(5.5362, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"403000\n",
|
|
"40800 tensor(5.5015, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"404000\n",
|
|
"40900 tensor(5.4610, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"405000\n",
|
|
"41000 tensor(5.5083, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"406000\n",
|
|
"41100 tensor(5.4346, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"407000\n",
|
|
"41200 tensor(5.3340, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"408000\n",
|
|
"41300 tensor(5.4608, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"409000\n",
|
|
"41400 tensor(5.3758, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"410000\n",
|
|
"41500 tensor(5.5160, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"411000\n",
|
|
"41600 tensor(5.4290, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"412000\n",
|
|
"41700 tensor(5.4426, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"413000\n",
|
|
"41800 tensor(5.4764, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"414000\n",
|
|
"41900 tensor(5.4730, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"415000\n",
|
|
"42000 tensor(5.6150, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"416000\n",
|
|
"42100 tensor(5.3622, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"417000\n",
|
|
"42200 tensor(5.4380, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"418000\n",
|
|
"42300 tensor(5.5031, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"419000\n",
|
|
"42400 tensor(5.3124, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"420000\n",
|
|
"42500 tensor(5.4812, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"421000\n",
|
|
"42600 tensor(5.2723, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"422000\n",
|
|
"42700 tensor(5.5998, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"423000\n",
|
|
"42800 tensor(5.5254, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"424000\n",
|
|
"42900 tensor(5.3716, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"425000\n",
|
|
"43000 tensor(5.5020, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"426000\n",
|
|
"43100 tensor(5.5091, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"427000\n",
|
|
"43200 tensor(5.3182, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"428000\n",
|
|
"43300 tensor(5.4001, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"429000\n",
|
|
"43400 tensor(5.5150, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"430000\n",
|
|
"43500 tensor(5.2440, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"431000\n",
|
|
"43600 tensor(5.4439, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"432000\n",
|
|
"epoch: = 2\n",
|
|
"0 tensor(5.3953, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"1000\n",
|
|
"100 tensor(5.4847, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"200 tensor(5.3626, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"2000\n",
|
|
"300 tensor(5.4127, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"3000\n",
|
|
"400 tensor(5.3734, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"4000\n",
|
|
"500 tensor(5.5564, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"5000\n",
|
|
"600 tensor(5.3391, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"6000\n",
|
|
"700 tensor(5.6198, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"7000\n",
|
|
"800 tensor(5.2255, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"8000\n",
|
|
"900 tensor(5.5161, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"9000\n",
|
|
"1000 tensor(5.3517, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"10000\n",
|
|
"1100 tensor(5.5420, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"11000\n",
|
|
"1200 tensor(5.6031, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"12000\n",
|
|
"1300 tensor(5.5343, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"13000\n",
|
|
"1400 tensor(5.5547, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"14000\n",
|
|
"1500 tensor(5.6080, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"15000\n",
|
|
"1600 tensor(5.2940, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"16000\n",
|
|
"1700 tensor(5.3671, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"17000\n"
|
|
]
|
|
},
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"1800 tensor(5.3777, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"18000\n",
|
|
"1900 tensor(5.3593, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"19000\n",
|
|
"2000 tensor(5.4348, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"20000\n",
|
|
"2100 tensor(5.5513, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"21000\n",
|
|
"2200 tensor(5.3939, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"22000\n",
|
|
"2300 tensor(5.4063, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"23000\n",
|
|
"2400 tensor(5.4092, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"24000\n",
|
|
"2500 tensor(5.4460, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"25000\n",
|
|
"2600 tensor(5.4738, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"26000\n",
|
|
"2700 tensor(5.4848, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"27000\n",
|
|
"2800 tensor(5.5244, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"28000\n",
|
|
"2900 tensor(5.6711, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"29000\n",
|
|
"3000 tensor(5.4024, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"30000\n",
|
|
"3100 tensor(5.4842, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"31000\n",
|
|
"3200 tensor(5.4863, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"32000\n",
|
|
"3300 tensor(5.4114, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"33000\n",
|
|
"3400 tensor(5.3231, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"34000\n",
|
|
"3500 tensor(5.4598, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"35000\n",
|
|
"3600 tensor(5.4579, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"36000\n",
|
|
"3700 tensor(5.3890, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"37000\n",
|
|
"3800 tensor(5.4162, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"38000\n",
|
|
"3900 tensor(5.2854, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"39000\n",
|
|
"4000 tensor(5.3370, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"40000\n",
|
|
"4100 tensor(5.5078, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"41000\n",
|
|
"4200 tensor(5.5341, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"42000\n",
|
|
"4300 tensor(5.4704, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"43000\n",
|
|
"4400 tensor(5.4990, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"44000\n",
|
|
"4500 tensor(5.3300, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"45000\n",
|
|
"4600 tensor(5.6674, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"46000\n",
|
|
"4700 tensor(5.3622, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"47000\n",
|
|
"4800 tensor(5.4762, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"48000\n",
|
|
"4900 tensor(5.5403, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"49000\n",
|
|
"5000 tensor(5.5359, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"50000\n",
|
|
"5100 tensor(5.6058, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"51000\n",
|
|
"5200 tensor(5.6209, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"52000\n",
|
|
"5300 tensor(5.6273, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"53000\n",
|
|
"5400 tensor(5.4695, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"54000\n",
|
|
"5500 tensor(5.5771, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"55000\n",
|
|
"5600 tensor(5.3552, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"56000\n",
|
|
"5700 tensor(5.5957, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"57000\n",
|
|
"5800 tensor(5.5952, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"58000\n",
|
|
"5900 tensor(5.5643, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"59000\n",
|
|
"6000 tensor(5.4346, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"60000\n",
|
|
"6100 tensor(5.4620, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"61000\n",
|
|
"6200 tensor(5.6256, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"62000\n",
|
|
"6300 tensor(5.4832, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"63000\n",
|
|
"6400 tensor(5.4063, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"64000\n",
|
|
"6500 tensor(5.2587, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"65000\n",
|
|
"6600 tensor(5.4320, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"66000\n",
|
|
"6700 tensor(5.3770, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"67000\n",
|
|
"6800 tensor(5.6077, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"68000\n",
|
|
"6900 tensor(5.5788, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"69000\n",
|
|
"7000 tensor(5.4929, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"70000\n",
|
|
"7100 tensor(5.2828, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"71000\n",
|
|
"7200 tensor(5.3992, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"72000\n",
|
|
"7300 tensor(5.6273, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"73000\n",
|
|
"7400 tensor(5.4385, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"74000\n",
|
|
"7500 tensor(5.3176, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"75000\n",
|
|
"7600 tensor(5.3834, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"76000\n",
|
|
"7700 tensor(5.4532, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"77000\n",
|
|
"7800 tensor(5.5669, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"78000\n",
|
|
"7900 tensor(5.2508, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"79000\n",
|
|
"8000 tensor(5.3027, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"80000\n",
|
|
"8100 tensor(5.4813, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"81000\n",
|
|
"8200 tensor(5.4822, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"82000\n",
|
|
"8300 tensor(5.4510, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"83000\n",
|
|
"8400 tensor(5.5712, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"84000\n",
|
|
"8500 tensor(5.5634, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"85000\n",
|
|
"8600 tensor(5.5616, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"86000\n",
|
|
"8700 tensor(5.6568, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"87000\n",
|
|
"8800 tensor(5.5397, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"88000\n",
|
|
"8900 tensor(5.3852, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"89000\n",
|
|
"9000 tensor(5.5022, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"90000\n",
|
|
"9100 tensor(5.5088, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"91000\n",
|
|
"9200 tensor(5.4214, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"92000\n",
|
|
"9300 tensor(5.4641, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"93000\n",
|
|
"9400 tensor(5.3085, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"94000\n",
|
|
"9500 tensor(5.3852, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"95000\n",
|
|
"9600 tensor(5.5097, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"96000\n",
|
|
"9700 tensor(5.4373, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"97000\n",
|
|
"9800 tensor(5.4786, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"9900 tensor(5.3198, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"98000\n",
|
|
"10000 tensor(5.5310, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"99000\n",
|
|
"10100 tensor(5.4341, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"100000\n",
|
|
"10200 tensor(5.3571, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"101000\n",
|
|
"10300 tensor(5.4712, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"102000\n",
|
|
"10400 tensor(5.4810, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"103000\n",
|
|
"10500 tensor(5.5463, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"104000\n",
|
|
"10600 tensor(5.6233, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"105000\n",
|
|
"10700 tensor(5.4678, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"106000\n",
|
|
"10800 tensor(5.5040, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"107000\n",
|
|
"10900 tensor(5.3963, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"108000\n",
|
|
"11000 tensor(5.6295, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"109000\n",
|
|
"11100 tensor(5.2378, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"110000\n",
|
|
"11200 tensor(5.4184, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"111000\n",
|
|
"11300 tensor(5.4404, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"112000\n",
|
|
"11400 tensor(5.3875, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"113000\n",
|
|
"11500 tensor(5.4523, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"114000\n",
|
|
"11600 tensor(5.4418, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"115000\n",
|
|
"11700 tensor(5.3604, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"116000\n",
|
|
"11800 tensor(5.5647, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"117000\n",
|
|
"11900 tensor(5.3936, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"118000\n",
|
|
"12000 tensor(5.4823, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"119000\n",
|
|
"12100 tensor(5.5069, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"120000\n",
|
|
"12200 tensor(5.4983, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"121000\n",
|
|
"12300 tensor(5.6030, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"122000\n",
|
|
"12400 tensor(5.4763, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"123000\n",
|
|
"12500 tensor(5.3718, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"124000\n",
|
|
"12600 tensor(5.4416, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"125000\n",
|
|
"12700 tensor(5.3554, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"126000\n",
|
|
"12800 tensor(5.5392, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"127000\n",
|
|
"12900 tensor(5.4164, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"128000\n",
|
|
"13000 tensor(5.2286, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"129000\n",
|
|
"13100 tensor(5.4288, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"130000\n",
|
|
"13200 tensor(5.4770, device='cuda:0', grad_fn=<NllLossBackward0>)\n"
|
|
]
|
|
},
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"131000\n",
|
|
"13300 tensor(5.3352, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"132000\n",
|
|
"13400 tensor(5.4349, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"133000\n",
|
|
"13500 tensor(5.3860, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"134000\n",
|
|
"13600 tensor(5.4648, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"135000\n",
|
|
"13700 tensor(5.4444, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"136000\n",
|
|
"13800 tensor(5.4320, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"137000\n",
|
|
"13900 tensor(5.2935, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"138000\n",
|
|
"14000 tensor(5.5387, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"139000\n",
|
|
"14100 tensor(5.2424, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"140000\n",
|
|
"14200 tensor(5.5177, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"141000\n",
|
|
"14300 tensor(5.4831, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"142000\n",
|
|
"14400 tensor(5.3877, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"143000\n",
|
|
"14500 tensor(5.4919, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"144000\n",
|
|
"14600 tensor(5.5253, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"145000\n",
|
|
"14700 tensor(5.3948, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"146000\n",
|
|
"14800 tensor(5.8442, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"147000\n",
|
|
"14900 tensor(5.4967, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"148000\n",
|
|
"15000 tensor(5.3788, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"149000\n",
|
|
"15100 tensor(5.5832, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"150000\n",
|
|
"15200 tensor(5.4482, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"151000\n",
|
|
"15300 tensor(5.4260, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"152000\n",
|
|
"15400 tensor(5.3273, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"153000\n",
|
|
"15500 tensor(5.4840, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"154000\n",
|
|
"15600 tensor(5.4851, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"155000\n",
|
|
"15700 tensor(5.3871, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"156000\n",
|
|
"15800 tensor(5.2933, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"157000\n",
|
|
"15900 tensor(5.4374, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"158000\n",
|
|
"16000 tensor(5.2555, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"159000\n",
|
|
"16100 tensor(5.3127, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"160000\n",
|
|
"16200 tensor(5.6423, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"161000\n",
|
|
"16300 tensor(5.4702, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"162000\n",
|
|
"16400 tensor(5.4419, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"163000\n",
|
|
"16500 tensor(5.5640, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"164000\n",
|
|
"16600 tensor(5.4099, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"165000\n",
|
|
"16700 tensor(5.3822, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"166000\n",
|
|
"16800 tensor(5.4643, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"167000\n",
|
|
"16900 tensor(5.2234, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"168000\n",
|
|
"17000 tensor(5.5021, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"169000\n",
|
|
"17100 tensor(5.3524, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"170000\n",
|
|
"17200 tensor(5.4725, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"171000\n",
|
|
"17300 tensor(5.5034, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"172000\n",
|
|
"17400 tensor(5.2911, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"173000\n",
|
|
"17500 tensor(5.3147, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"174000\n",
|
|
"17600 tensor(5.3426, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"175000\n",
|
|
"17700 tensor(5.3414, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"176000\n",
|
|
"17800 tensor(5.3991, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"177000\n",
|
|
"17900 tensor(5.2936, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"178000\n",
|
|
"18000 tensor(5.5238, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"179000\n",
|
|
"18100 tensor(5.4684, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"180000\n",
|
|
"18200 tensor(5.3916, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"181000\n",
|
|
"18300 tensor(5.3888, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"182000\n",
|
|
"18400 tensor(5.4299, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"183000\n",
|
|
"18500 tensor(5.4103, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"184000\n",
|
|
"18600 tensor(5.5980, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"185000\n",
|
|
"18700 tensor(5.4135, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"186000\n",
|
|
"18800 tensor(5.5855, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"187000\n",
|
|
"18900 tensor(5.4583, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"188000\n",
|
|
"19000 tensor(5.4854, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"189000\n",
|
|
"19100 tensor(5.5879, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"190000\n",
|
|
"19200 tensor(5.4675, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"191000\n",
|
|
"19300 tensor(5.5741, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"192000\n",
|
|
"19400 tensor(5.3977, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"193000\n",
|
|
"19500 tensor(5.4042, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"194000\n",
|
|
"19600 tensor(5.4364, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"195000\n",
|
|
"19700 tensor(5.4868, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"196000\n",
|
|
"19800 tensor(5.3476, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"197000\n",
|
|
"19900 tensor(5.3553, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"198000\n",
|
|
"20000 tensor(5.3707, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"199000\n",
|
|
"20100 tensor(5.3226, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"200000\n",
|
|
"20200 tensor(5.2488, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"201000\n",
|
|
"20300 tensor(5.3648, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"20400 tensor(5.4156, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"202000\n",
|
|
"20500 tensor(5.4102, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"203000\n",
|
|
"20600 tensor(5.6109, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"204000\n",
|
|
"20700 tensor(5.4335, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"205000\n",
|
|
"20800 tensor(5.2795, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"206000\n",
|
|
"20900 tensor(5.5609, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"207000\n",
|
|
"21000 tensor(5.3918, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"208000\n",
|
|
"21100 tensor(5.3831, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"209000\n",
|
|
"21200 tensor(5.2790, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"210000\n",
|
|
"21300 tensor(5.4710, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"211000\n",
|
|
"21400 tensor(5.5050, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"212000\n",
|
|
"21500 tensor(5.2692, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"213000\n",
|
|
"21600 tensor(5.2668, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"214000\n",
|
|
"21700 tensor(5.2633, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"215000\n",
|
|
"21800 tensor(5.4067, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"216000\n",
|
|
"21900 tensor(5.3829, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"217000\n",
|
|
"22000 tensor(5.3773, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"218000\n",
|
|
"22100 tensor(5.2472, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"219000\n",
|
|
"22200 tensor(5.1171, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"220000\n",
|
|
"22300 tensor(5.5545, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"221000\n",
|
|
"22400 tensor(5.2499, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"222000\n",
|
|
"22500 tensor(5.2943, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"223000\n",
|
|
"22600 tensor(5.4748, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"224000\n",
|
|
"22700 tensor(5.2436, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"225000\n",
|
|
"22800 tensor(5.5053, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"226000\n",
|
|
"22900 tensor(5.5519, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"227000\n",
|
|
"23000 tensor(5.4541, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"228000\n",
|
|
"23100 tensor(5.4279, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"229000\n",
|
|
"23200 tensor(5.4286, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"230000\n",
|
|
"23300 tensor(5.5179, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"231000\n",
|
|
"23400 tensor(5.5355, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"232000\n",
|
|
"23500 tensor(5.3505, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"233000\n",
|
|
"23600 tensor(5.3313, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"234000\n",
|
|
"23700 tensor(5.3509, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"235000\n",
|
|
"23800 tensor(5.2170, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"236000\n",
|
|
"23900 tensor(5.3101, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"237000\n",
|
|
"24000 tensor(5.2962, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"238000\n",
|
|
"24100 tensor(5.3882, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"239000\n",
|
|
"24200 tensor(5.5633, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"240000\n",
|
|
"24300 tensor(5.4595, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"241000\n",
|
|
"24400 tensor(5.5932, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"242000\n",
|
|
"24500 tensor(5.3717, device='cuda:0', grad_fn=<NllLossBackward0>)\n"
|
|
]
|
|
},
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"243000\n",
|
|
"24600 tensor(5.4943, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"244000\n",
|
|
"24700 tensor(5.3985, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"245000\n",
|
|
"24800 tensor(5.4347, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"246000\n",
|
|
"24900 tensor(5.5008, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"247000\n",
|
|
"25000 tensor(5.5100, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"248000\n",
|
|
"25100 tensor(5.4427, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"249000\n",
|
|
"25200 tensor(5.4508, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"250000\n",
|
|
"25300 tensor(5.4724, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"251000\n",
|
|
"25400 tensor(5.4525, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"252000\n",
|
|
"25500 tensor(5.3620, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"253000\n",
|
|
"25600 tensor(5.6446, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"254000\n",
|
|
"25700 tensor(5.3966, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"255000\n",
|
|
"25800 tensor(5.4889, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"256000\n",
|
|
"25900 tensor(5.4251, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"257000\n",
|
|
"26000 tensor(5.4346, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"258000\n",
|
|
"26100 tensor(5.3395, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"259000\n",
|
|
"26200 tensor(5.2695, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"260000\n",
|
|
"26300 tensor(5.4767, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"261000\n",
|
|
"26400 tensor(5.5083, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"262000\n",
|
|
"26500 tensor(5.2347, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"263000\n",
|
|
"26600 tensor(5.5761, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"264000\n",
|
|
"26700 tensor(5.4402, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"265000\n",
|
|
"26800 tensor(5.6173, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"266000\n",
|
|
"26900 tensor(5.3775, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"267000\n",
|
|
"27000 tensor(5.2863, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"268000\n",
|
|
"27100 tensor(5.3007, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"269000\n",
|
|
"27200 tensor(5.3551, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"270000\n",
|
|
"27300 tensor(5.5439, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"271000\n",
|
|
"27400 tensor(5.4334, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"272000\n",
|
|
"27500 tensor(5.3266, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"273000\n",
|
|
"27600 tensor(5.6412, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"274000\n",
|
|
"27700 tensor(5.4420, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"275000\n",
|
|
"27800 tensor(5.4381, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"276000\n",
|
|
"27900 tensor(5.5550, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"277000\n",
|
|
"28000 tensor(5.4154, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"278000\n",
|
|
"28100 tensor(5.3823, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"279000\n",
|
|
"28200 tensor(5.5344, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"280000\n",
|
|
"28300 tensor(5.1615, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"281000\n",
|
|
"28400 tensor(5.6069, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"282000\n",
|
|
"28500 tensor(5.4426, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"283000\n",
|
|
"28600 tensor(5.3672, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"284000\n",
|
|
"28700 tensor(5.5133, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"285000\n",
|
|
"28800 tensor(5.5556, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"286000\n",
|
|
"28900 tensor(5.4294, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"287000\n",
|
|
"29000 tensor(5.3359, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"288000\n",
|
|
"29100 tensor(5.0951, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"289000\n",
|
|
"29200 tensor(5.2511, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"290000\n",
|
|
"29300 tensor(5.5364, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"291000\n",
|
|
"29400 tensor(5.6708, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"292000\n",
|
|
"29500 tensor(5.4371, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"293000\n",
|
|
"29600 tensor(5.2942, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"294000\n",
|
|
"29700 tensor(5.4637, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"295000\n",
|
|
"29800 tensor(5.2914, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"296000\n",
|
|
"29900 tensor(5.5562, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"297000\n",
|
|
"30000 tensor(5.2833, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"298000\n",
|
|
"30100 tensor(5.3481, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"299000\n",
|
|
"30200 tensor(5.3122, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"300000\n",
|
|
"30300 tensor(5.4103, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"301000\n",
|
|
"30400 tensor(5.2480, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"302000\n",
|
|
"30500 tensor(5.4258, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"30600 tensor(5.3835, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"303000\n",
|
|
"30700 tensor(5.4193, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"304000\n",
|
|
"30800 tensor(5.4438, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"305000\n",
|
|
"30900 tensor(5.4518, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"306000\n",
|
|
"31000 tensor(5.5607, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"307000\n",
|
|
"31100 tensor(5.2059, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"308000\n",
|
|
"31200 tensor(5.2571, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"309000\n",
|
|
"31300 tensor(5.5208, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"310000\n",
|
|
"31400 tensor(5.3061, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"311000\n",
|
|
"31500 tensor(5.4834, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"312000\n",
|
|
"31600 tensor(5.4653, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"313000\n",
|
|
"31700 tensor(5.4308, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"314000\n",
|
|
"31800 tensor(5.5400, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"315000\n",
|
|
"31900 tensor(5.1536, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"316000\n",
|
|
"32000 tensor(5.3460, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"317000\n",
|
|
"32100 tensor(5.2300, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"318000\n",
|
|
"32200 tensor(5.5511, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"319000\n",
|
|
"32300 tensor(5.5391, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"320000\n",
|
|
"32400 tensor(5.5157, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"321000\n",
|
|
"32500 tensor(5.3336, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"322000\n",
|
|
"32600 tensor(5.4475, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"323000\n",
|
|
"32700 tensor(5.3894, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"324000\n",
|
|
"32800 tensor(5.6022, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"325000\n",
|
|
"32900 tensor(5.4663, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"326000\n",
|
|
"33000 tensor(5.2387, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"327000\n",
|
|
"33100 tensor(5.4446, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"328000\n",
|
|
"33200 tensor(5.5450, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"329000\n",
|
|
"33300 tensor(5.3179, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"330000\n",
|
|
"33400 tensor(5.5905, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"331000\n",
|
|
"33500 tensor(5.4066, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"332000\n",
|
|
"33600 tensor(5.3542, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"333000\n",
|
|
"33700 tensor(5.4097, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"334000\n",
|
|
"33800 tensor(5.4912, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"335000\n",
|
|
"33900 tensor(5.2358, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"336000\n",
|
|
"34000 tensor(5.4470, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"337000\n",
|
|
"34100 tensor(5.4207, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"338000\n",
|
|
"34200 tensor(5.4651, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"339000\n",
|
|
"34300 tensor(5.2545, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"340000\n",
|
|
"34400 tensor(5.7106, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"341000\n",
|
|
"34500 tensor(5.5699, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"342000\n",
|
|
"34600 tensor(5.4638, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"343000\n",
|
|
"34700 tensor(5.5382, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"344000\n",
|
|
"34800 tensor(5.5603, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"345000\n",
|
|
"34900 tensor(5.6072, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"346000\n",
|
|
"35000 tensor(5.6037, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"347000\n",
|
|
"35100 tensor(5.4069, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"348000\n",
|
|
"35200 tensor(5.3398, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"349000\n",
|
|
"35300 tensor(5.5607, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"350000\n",
|
|
"35400 tensor(5.2068, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"351000\n",
|
|
"35500 tensor(5.3112, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"352000\n",
|
|
"35600 tensor(5.4126, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"353000\n",
|
|
"35700 tensor(5.3091, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"354000\n",
|
|
"35800 tensor(5.4252, device='cuda:0', grad_fn=<NllLossBackward0>)\n"
|
|
]
|
|
},
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"355000\n",
|
|
"35900 tensor(5.3956, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"356000\n",
|
|
"36000 tensor(5.1705, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"357000\n",
|
|
"36100 tensor(5.5497, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"358000\n",
|
|
"36200 tensor(5.4066, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"359000\n",
|
|
"36300 tensor(5.6858, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"360000\n",
|
|
"36400 tensor(5.3812, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"361000\n",
|
|
"36500 tensor(5.3990, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"362000\n",
|
|
"36600 tensor(5.4302, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"363000\n",
|
|
"36700 tensor(5.2253, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"364000\n",
|
|
"36800 tensor(5.3347, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"365000\n",
|
|
"36900 tensor(5.4426, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"366000\n",
|
|
"37000 tensor(5.3419, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"367000\n",
|
|
"37100 tensor(5.3579, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"368000\n",
|
|
"37200 tensor(5.4332, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"369000\n",
|
|
"37300 tensor(5.3362, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"370000\n",
|
|
"37400 tensor(5.7100, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"371000\n",
|
|
"37500 tensor(5.3742, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"372000\n",
|
|
"37600 tensor(5.3615, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"373000\n",
|
|
"37700 tensor(5.5402, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"374000\n",
|
|
"37800 tensor(5.3734, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"375000\n",
|
|
"37900 tensor(5.3621, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"376000\n",
|
|
"38000 tensor(5.4380, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"377000\n",
|
|
"38100 tensor(5.4513, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"378000\n",
|
|
"38200 tensor(5.4554, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"379000\n",
|
|
"38300 tensor(5.3735, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"380000\n",
|
|
"38400 tensor(5.4297, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"381000\n",
|
|
"38500 tensor(5.4561, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"382000\n",
|
|
"38600 tensor(5.4118, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"383000\n",
|
|
"38700 tensor(5.3996, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"384000\n",
|
|
"38800 tensor(5.4825, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"385000\n",
|
|
"38900 tensor(5.5692, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"386000\n",
|
|
"39000 tensor(5.2573, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"387000\n",
|
|
"39100 tensor(5.4847, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"388000\n",
|
|
"39200 tensor(5.5802, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"389000\n",
|
|
"39300 tensor(5.3968, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"390000\n",
|
|
"39400 tensor(5.4666, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"391000\n",
|
|
"39500 tensor(5.5847, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"392000\n",
|
|
"39600 tensor(5.2648, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"393000\n",
|
|
"39700 tensor(5.2423, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"394000\n",
|
|
"39800 tensor(5.3731, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"395000\n",
|
|
"39900 tensor(5.2014, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"396000\n",
|
|
"40000 tensor(5.2903, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"397000\n",
|
|
"40100 tensor(5.3712, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"398000\n",
|
|
"40200 tensor(5.3557, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"399000\n",
|
|
"40300 tensor(5.4151, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"40400 tensor(5.4358, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"400000\n",
|
|
"40500 tensor(5.3498, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"401000\n",
|
|
"40600 tensor(5.4152, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"402000\n",
|
|
"40700 tensor(5.4551, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"403000\n",
|
|
"40800 tensor(5.4138, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"404000\n",
|
|
"40900 tensor(5.3628, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"405000\n",
|
|
"41000 tensor(5.4124, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"406000\n",
|
|
"41100 tensor(5.3750, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"407000\n",
|
|
"41200 tensor(5.2687, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"408000\n",
|
|
"41300 tensor(5.3987, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"409000\n",
|
|
"41400 tensor(5.2976, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"410000\n",
|
|
"41500 tensor(5.4418, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"411000\n",
|
|
"41600 tensor(5.3558, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"412000\n",
|
|
"41700 tensor(5.3767, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"413000\n",
|
|
"41800 tensor(5.3836, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"414000\n",
|
|
"41900 tensor(5.3904, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"415000\n",
|
|
"42000 tensor(5.5445, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"416000\n",
|
|
"42100 tensor(5.2890, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"417000\n",
|
|
"42200 tensor(5.3691, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"418000\n",
|
|
"42300 tensor(5.4364, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"419000\n",
|
|
"42400 tensor(5.2507, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"420000\n",
|
|
"42500 tensor(5.4215, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"421000\n",
|
|
"42600 tensor(5.2136, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"422000\n",
|
|
"42700 tensor(5.5296, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"423000\n",
|
|
"42800 tensor(5.4544, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"424000\n",
|
|
"42900 tensor(5.3009, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"425000\n",
|
|
"43000 tensor(5.4403, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"426000\n",
|
|
"43100 tensor(5.4384, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"427000\n",
|
|
"43200 tensor(5.2520, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"428000\n",
|
|
"43300 tensor(5.2945, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"429000\n",
|
|
"43400 tensor(5.4455, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"430000\n",
|
|
"43500 tensor(5.1633, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"431000\n",
|
|
"43600 tensor(5.3649, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"432000\n",
|
|
"epoch: = 3\n",
|
|
"0 tensor(5.3427, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"1000\n",
|
|
"100 tensor(5.4180, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"200 tensor(5.2939, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"2000\n",
|
|
"300 tensor(5.3083, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"3000\n",
|
|
"400 tensor(5.3086, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"4000\n",
|
|
"500 tensor(5.4733, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"5000\n",
|
|
"600 tensor(5.2627, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"6000\n",
|
|
"700 tensor(5.5664, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"7000\n",
|
|
"800 tensor(5.1641, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"8000\n",
|
|
"900 tensor(5.4272, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"9000\n",
|
|
"1000 tensor(5.2926, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"10000\n",
|
|
"1100 tensor(5.4848, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"11000\n",
|
|
"1200 tensor(5.5283, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"12000\n",
|
|
"1300 tensor(5.4635, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"13000\n",
|
|
"1400 tensor(5.4590, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"14000\n",
|
|
"1500 tensor(5.5386, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"15000\n",
|
|
"1600 tensor(5.2150, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"16000\n",
|
|
"1700 tensor(5.3116, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"17000\n",
|
|
"1800 tensor(5.3130, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"18000\n",
|
|
"1900 tensor(5.2889, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"19000\n",
|
|
"2000 tensor(5.3574, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"20000\n",
|
|
"2100 tensor(5.4860, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"21000\n",
|
|
"2200 tensor(5.3206, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"22000\n",
|
|
"2300 tensor(5.3447, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"23000\n",
|
|
"2400 tensor(5.3333, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"24000\n",
|
|
"2500 tensor(5.3822, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"25000\n",
|
|
"2600 tensor(5.4039, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"26000\n",
|
|
"2700 tensor(5.4280, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"27000\n",
|
|
"2800 tensor(5.4575, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"28000\n",
|
|
"2900 tensor(5.5878, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"29000\n",
|
|
"3000 tensor(5.3311, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"30000\n",
|
|
"3100 tensor(5.4103, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"31000\n",
|
|
"3200 tensor(5.4323, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"32000\n",
|
|
"3300 tensor(5.3521, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"33000\n",
|
|
"3400 tensor(5.2512, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"34000\n",
|
|
"3500 tensor(5.3813, device='cuda:0', grad_fn=<NllLossBackward0>)\n"
|
|
]
|
|
},
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"35000\n",
|
|
"3600 tensor(5.4000, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"36000\n",
|
|
"3700 tensor(5.3312, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"37000\n",
|
|
"3800 tensor(5.3553, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"38000\n",
|
|
"3900 tensor(5.2275, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"39000\n",
|
|
"4000 tensor(5.2883, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"40000\n",
|
|
"4100 tensor(5.4294, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"41000\n",
|
|
"4200 tensor(5.4801, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"42000\n",
|
|
"4300 tensor(5.3863, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"43000\n",
|
|
"4400 tensor(5.4470, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"44000\n",
|
|
"4500 tensor(5.2610, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"45000\n",
|
|
"4600 tensor(5.5962, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"46000\n",
|
|
"4700 tensor(5.3029, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"47000\n",
|
|
"4800 tensor(5.4265, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"48000\n",
|
|
"4900 tensor(5.4823, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"49000\n",
|
|
"5000 tensor(5.4749, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"50000\n",
|
|
"5100 tensor(5.5356, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"51000\n",
|
|
"5200 tensor(5.5513, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"52000\n",
|
|
"5300 tensor(5.5476, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"53000\n",
|
|
"5400 tensor(5.4039, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"54000\n",
|
|
"5500 tensor(5.5156, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"55000\n",
|
|
"5600 tensor(5.2975, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"56000\n",
|
|
"5700 tensor(5.5492, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"57000\n",
|
|
"5800 tensor(5.5379, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"58000\n",
|
|
"5900 tensor(5.4874, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"59000\n",
|
|
"6000 tensor(5.3808, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"60000\n",
|
|
"6100 tensor(5.3932, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"61000\n",
|
|
"6200 tensor(5.5657, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"62000\n",
|
|
"6300 tensor(5.4233, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"63000\n",
|
|
"6400 tensor(5.3438, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"64000\n",
|
|
"6500 tensor(5.2002, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"65000\n",
|
|
"6600 tensor(5.3774, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"66000\n",
|
|
"6700 tensor(5.3193, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"67000\n",
|
|
"6800 tensor(5.5394, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"68000\n",
|
|
"6900 tensor(5.5196, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"69000\n",
|
|
"7000 tensor(5.4282, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"70000\n",
|
|
"7100 tensor(5.2296, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"71000\n",
|
|
"7200 tensor(5.3175, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"72000\n",
|
|
"7300 tensor(5.5642, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"73000\n",
|
|
"7400 tensor(5.3784, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"74000\n",
|
|
"7500 tensor(5.2475, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"75000\n",
|
|
"7600 tensor(5.3194, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"76000\n",
|
|
"7700 tensor(5.3934, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"77000\n",
|
|
"7800 tensor(5.5041, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"78000\n",
|
|
"7900 tensor(5.1814, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"79000\n",
|
|
"8000 tensor(5.2426, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"80000\n",
|
|
"8100 tensor(5.4104, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"81000\n",
|
|
"8200 tensor(5.4198, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"82000\n",
|
|
"8300 tensor(5.3854, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"83000\n",
|
|
"8400 tensor(5.5128, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"84000\n",
|
|
"8500 tensor(5.4898, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"85000\n",
|
|
"8600 tensor(5.4943, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"86000\n",
|
|
"8700 tensor(5.6012, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"87000\n",
|
|
"8800 tensor(5.4790, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"88000\n",
|
|
"8900 tensor(5.3312, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"89000\n",
|
|
"9000 tensor(5.4456, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"90000\n",
|
|
"9100 tensor(5.4537, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"91000\n",
|
|
"9200 tensor(5.3643, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"92000\n",
|
|
"9300 tensor(5.4085, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"93000\n",
|
|
"9400 tensor(5.2527, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"94000\n",
|
|
"9500 tensor(5.3289, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"95000\n",
|
|
"9600 tensor(5.4516, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"96000\n",
|
|
"9700 tensor(5.3881, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"97000\n",
|
|
"9800 tensor(5.4321, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"9900 tensor(5.2532, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"98000\n",
|
|
"10000 tensor(5.4727, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"99000\n",
|
|
"10100 tensor(5.3607, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"100000\n",
|
|
"10200 tensor(5.2989, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"101000\n",
|
|
"10300 tensor(5.4168, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"102000\n",
|
|
"10400 tensor(5.4272, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"103000\n",
|
|
"10500 tensor(5.4838, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"104000\n",
|
|
"10600 tensor(5.5675, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"105000\n",
|
|
"10700 tensor(5.4027, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"106000\n",
|
|
"10800 tensor(5.4252, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"107000\n",
|
|
"10900 tensor(5.3408, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"108000\n",
|
|
"11000 tensor(5.5754, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"109000\n",
|
|
"11100 tensor(5.1920, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"110000\n",
|
|
"11200 tensor(5.3604, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"111000\n",
|
|
"11300 tensor(5.3836, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"112000\n",
|
|
"11400 tensor(5.3330, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"113000\n",
|
|
"11500 tensor(5.4023, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"114000\n",
|
|
"11600 tensor(5.3923, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"115000\n",
|
|
"11700 tensor(5.3145, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"116000\n",
|
|
"11800 tensor(5.5174, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"117000\n",
|
|
"11900 tensor(5.3522, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"118000\n",
|
|
"12000 tensor(5.4232, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"119000\n",
|
|
"12100 tensor(5.4382, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"120000\n",
|
|
"12200 tensor(5.4488, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"121000\n",
|
|
"12300 tensor(5.5409, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"122000\n",
|
|
"12400 tensor(5.4200, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"123000\n",
|
|
"12500 tensor(5.3292, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"124000\n",
|
|
"12600 tensor(5.3788, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"125000\n",
|
|
"12700 tensor(5.3116, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"126000\n",
|
|
"12800 tensor(5.4948, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"127000\n",
|
|
"12900 tensor(5.3557, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"128000\n",
|
|
"13000 tensor(5.1732, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"129000\n",
|
|
"13100 tensor(5.3782, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"130000\n",
|
|
"13200 tensor(5.4178, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"131000\n",
|
|
"13300 tensor(5.2929, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"132000\n",
|
|
"13400 tensor(5.3806, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"133000\n",
|
|
"13500 tensor(5.3394, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"134000\n",
|
|
"13600 tensor(5.4191, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"135000\n",
|
|
"13700 tensor(5.3856, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"136000\n",
|
|
"13800 tensor(5.3839, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"137000\n",
|
|
"13900 tensor(5.2391, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"138000\n",
|
|
"14000 tensor(5.4865, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"139000\n",
|
|
"14100 tensor(5.1952, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"140000\n",
|
|
"14200 tensor(5.4670, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"141000\n",
|
|
"14300 tensor(5.4385, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"142000\n",
|
|
"14400 tensor(5.3347, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"143000\n",
|
|
"14500 tensor(5.4370, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"144000\n",
|
|
"14600 tensor(5.4695, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"145000\n",
|
|
"14700 tensor(5.3453, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"146000\n",
|
|
"14800 tensor(5.7928, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"147000\n",
|
|
"14900 tensor(5.4451, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"148000\n"
|
|
]
|
|
},
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"15000 tensor(5.3087, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"149000\n",
|
|
"15100 tensor(5.5241, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"150000\n",
|
|
"15200 tensor(5.3894, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"151000\n",
|
|
"15300 tensor(5.3809, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"152000\n",
|
|
"15400 tensor(5.2696, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"153000\n",
|
|
"15500 tensor(5.4343, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"154000\n",
|
|
"15600 tensor(5.4322, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"155000\n",
|
|
"15700 tensor(5.3296, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"156000\n",
|
|
"15800 tensor(5.2456, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"157000\n",
|
|
"15900 tensor(5.3806, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"158000\n",
|
|
"16000 tensor(5.2008, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"159000\n",
|
|
"16100 tensor(5.2489, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"160000\n",
|
|
"16200 tensor(5.5902, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"161000\n",
|
|
"16300 tensor(5.4159, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"162000\n",
|
|
"16400 tensor(5.3966, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"163000\n",
|
|
"16500 tensor(5.5113, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"164000\n",
|
|
"16600 tensor(5.3599, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"165000\n",
|
|
"16700 tensor(5.3372, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"166000\n",
|
|
"16800 tensor(5.4158, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"167000\n",
|
|
"16900 tensor(5.1788, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"168000\n",
|
|
"17000 tensor(5.4497, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"169000\n",
|
|
"17100 tensor(5.2981, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"170000\n",
|
|
"17200 tensor(5.4330, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"171000\n",
|
|
"17300 tensor(5.4495, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"172000\n",
|
|
"17400 tensor(5.2431, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"173000\n",
|
|
"17500 tensor(5.2652, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"174000\n",
|
|
"17600 tensor(5.3007, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"175000\n",
|
|
"17700 tensor(5.2852, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"176000\n",
|
|
"17800 tensor(5.3431, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"177000\n",
|
|
"17900 tensor(5.2395, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"178000\n",
|
|
"18000 tensor(5.4841, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"179000\n",
|
|
"18100 tensor(5.4218, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"180000\n",
|
|
"18200 tensor(5.3397, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"181000\n",
|
|
"18300 tensor(5.3426, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"182000\n",
|
|
"18400 tensor(5.3654, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"183000\n",
|
|
"18500 tensor(5.3484, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"184000\n",
|
|
"18600 tensor(5.5509, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"185000\n",
|
|
"18700 tensor(5.3702, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"186000\n",
|
|
"18800 tensor(5.5361, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"187000\n",
|
|
"18900 tensor(5.4132, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"188000\n",
|
|
"19000 tensor(5.4235, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"189000\n",
|
|
"19100 tensor(5.5318, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"190000\n",
|
|
"19200 tensor(5.4136, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"191000\n",
|
|
"19300 tensor(5.5053, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"192000\n",
|
|
"19400 tensor(5.3472, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"193000\n",
|
|
"19500 tensor(5.3511, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"194000\n",
|
|
"19600 tensor(5.3861, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"195000\n",
|
|
"19700 tensor(5.4345, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"196000\n",
|
|
"19800 tensor(5.3067, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"197000\n",
|
|
"19900 tensor(5.3079, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"198000\n",
|
|
"20000 tensor(5.3268, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"199000\n",
|
|
"20100 tensor(5.2668, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"200000\n",
|
|
"20200 tensor(5.1998, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"201000\n",
|
|
"20300 tensor(5.3105, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"20400 tensor(5.3584, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"202000\n",
|
|
"20500 tensor(5.3580, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"203000\n",
|
|
"20600 tensor(5.5528, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"204000\n",
|
|
"20700 tensor(5.3871, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"205000\n",
|
|
"20800 tensor(5.2208, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"206000\n",
|
|
"20900 tensor(5.5007, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"207000\n",
|
|
"21000 tensor(5.3396, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"208000\n",
|
|
"21100 tensor(5.3407, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"209000\n",
|
|
"21200 tensor(5.2243, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"210000\n",
|
|
"21300 tensor(5.4206, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"211000\n",
|
|
"21400 tensor(5.4574, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"212000\n",
|
|
"21500 tensor(5.2328, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"213000\n",
|
|
"21600 tensor(5.2233, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"214000\n",
|
|
"21700 tensor(5.2152, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"215000\n",
|
|
"21800 tensor(5.3497, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"216000\n",
|
|
"21900 tensor(5.3425, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"217000\n",
|
|
"22000 tensor(5.3277, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"218000\n",
|
|
"22100 tensor(5.2012, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"219000\n",
|
|
"22200 tensor(5.0736, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"220000\n",
|
|
"22300 tensor(5.5070, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"221000\n",
|
|
"22400 tensor(5.2190, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"222000\n",
|
|
"22500 tensor(5.2434, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"223000\n",
|
|
"22600 tensor(5.4325, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"224000\n",
|
|
"22700 tensor(5.1909, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"225000\n",
|
|
"22800 tensor(5.4576, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"226000\n",
|
|
"22900 tensor(5.5069, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"227000\n",
|
|
"23000 tensor(5.4041, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"228000\n",
|
|
"23100 tensor(5.3908, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"229000\n",
|
|
"23200 tensor(5.3866, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"230000\n",
|
|
"23300 tensor(5.4714, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"231000\n",
|
|
"23400 tensor(5.4781, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"232000\n",
|
|
"23500 tensor(5.3154, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"233000\n",
|
|
"23600 tensor(5.2854, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"234000\n",
|
|
"23700 tensor(5.3050, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"235000\n",
|
|
"23800 tensor(5.1721, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"236000\n",
|
|
"23900 tensor(5.2637, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"237000\n",
|
|
"24000 tensor(5.2519, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"238000\n",
|
|
"24100 tensor(5.3407, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"239000\n",
|
|
"24200 tensor(5.5137, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"240000\n",
|
|
"24300 tensor(5.4080, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"241000\n",
|
|
"24400 tensor(5.5379, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"242000\n",
|
|
"24500 tensor(5.3255, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"243000\n",
|
|
"24600 tensor(5.4515, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"244000\n",
|
|
"24700 tensor(5.3535, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"245000\n",
|
|
"24800 tensor(5.3935, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"246000\n",
|
|
"24900 tensor(5.4553, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"247000\n",
|
|
"25000 tensor(5.4708, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"248000\n",
|
|
"25100 tensor(5.3920, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"249000\n",
|
|
"25200 tensor(5.4083, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"250000\n",
|
|
"25300 tensor(5.4332, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"251000\n",
|
|
"25400 tensor(5.4136, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"252000\n",
|
|
"25500 tensor(5.3147, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"253000\n",
|
|
"25600 tensor(5.5860, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"254000\n",
|
|
"25700 tensor(5.3490, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"255000\n",
|
|
"25800 tensor(5.4464, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"256000\n",
|
|
"25900 tensor(5.3857, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"257000\n",
|
|
"26000 tensor(5.3893, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"258000\n",
|
|
"26100 tensor(5.3041, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"259000\n",
|
|
"26200 tensor(5.2321, device='cuda:0', grad_fn=<NllLossBackward0>)\n"
|
|
]
|
|
},
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"260000\n",
|
|
"26300 tensor(5.4289, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"261000\n",
|
|
"26400 tensor(5.4663, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"262000\n",
|
|
"26500 tensor(5.1922, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"263000\n",
|
|
"26600 tensor(5.5283, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"264000\n",
|
|
"26700 tensor(5.3933, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"265000\n",
|
|
"26800 tensor(5.5680, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"266000\n",
|
|
"26900 tensor(5.3281, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"267000\n",
|
|
"27000 tensor(5.2408, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"268000\n",
|
|
"27100 tensor(5.2671, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"269000\n",
|
|
"27200 tensor(5.3099, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"270000\n",
|
|
"27300 tensor(5.5049, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"271000\n",
|
|
"27400 tensor(5.3850, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"272000\n",
|
|
"27500 tensor(5.2843, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"273000\n",
|
|
"27600 tensor(5.5777, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"274000\n",
|
|
"27700 tensor(5.4017, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"275000\n",
|
|
"27800 tensor(5.3994, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"276000\n",
|
|
"27900 tensor(5.5128, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"277000\n",
|
|
"28000 tensor(5.3708, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"278000\n",
|
|
"28100 tensor(5.3382, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"279000\n",
|
|
"28200 tensor(5.4996, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"280000\n",
|
|
"28300 tensor(5.1214, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"281000\n",
|
|
"28400 tensor(5.5647, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"282000\n",
|
|
"28500 tensor(5.3959, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"283000\n",
|
|
"28600 tensor(5.3312, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"284000\n",
|
|
"28700 tensor(5.4663, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"285000\n",
|
|
"28800 tensor(5.5155, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"286000\n",
|
|
"28900 tensor(5.3872, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"287000\n",
|
|
"29000 tensor(5.3017, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"288000\n",
|
|
"29100 tensor(5.0583, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"289000\n",
|
|
"29200 tensor(5.2099, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"290000\n",
|
|
"29300 tensor(5.4934, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"291000\n",
|
|
"29400 tensor(5.6202, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"292000\n",
|
|
"29500 tensor(5.4016, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"293000\n",
|
|
"29600 tensor(5.2601, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"294000\n",
|
|
"29700 tensor(5.4038, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"295000\n",
|
|
"29800 tensor(5.2475, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"296000\n",
|
|
"29900 tensor(5.4960, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"297000\n",
|
|
"30000 tensor(5.2438, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"298000\n",
|
|
"30100 tensor(5.3221, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"299000\n",
|
|
"30200 tensor(5.2686, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"300000\n",
|
|
"30300 tensor(5.3735, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"301000\n",
|
|
"30400 tensor(5.2057, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"302000\n",
|
|
"30500 tensor(5.3767, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"30600 tensor(5.3515, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"303000\n",
|
|
"30700 tensor(5.3841, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"304000\n",
|
|
"30800 tensor(5.3889, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"305000\n",
|
|
"30900 tensor(5.4117, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"306000\n",
|
|
"31000 tensor(5.5205, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"307000\n",
|
|
"31100 tensor(5.1742, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"308000\n",
|
|
"31200 tensor(5.2173, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"309000\n",
|
|
"31300 tensor(5.4785, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"310000\n",
|
|
"31400 tensor(5.2577, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"311000\n",
|
|
"31500 tensor(5.4429, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"312000\n",
|
|
"31600 tensor(5.4289, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"313000\n",
|
|
"31700 tensor(5.3961, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"314000\n",
|
|
"31800 tensor(5.4999, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"315000\n",
|
|
"31900 tensor(5.1248, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"316000\n",
|
|
"32000 tensor(5.3122, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"317000\n",
|
|
"32100 tensor(5.1931, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"318000\n",
|
|
"32200 tensor(5.5096, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"319000\n",
|
|
"32300 tensor(5.4973, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"320000\n",
|
|
"32400 tensor(5.4742, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"321000\n",
|
|
"32500 tensor(5.2964, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"322000\n",
|
|
"32600 tensor(5.4063, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"323000\n",
|
|
"32700 tensor(5.3369, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"324000\n",
|
|
"32800 tensor(5.5636, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"325000\n",
|
|
"32900 tensor(5.4245, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"326000\n",
|
|
"33000 tensor(5.2032, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"327000\n",
|
|
"33100 tensor(5.4095, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"328000\n",
|
|
"33200 tensor(5.5071, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"329000\n",
|
|
"33300 tensor(5.2729, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"330000\n",
|
|
"33400 tensor(5.5492, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"331000\n",
|
|
"33500 tensor(5.3701, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"332000\n",
|
|
"33600 tensor(5.3223, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"333000\n",
|
|
"33700 tensor(5.3725, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"334000\n",
|
|
"33800 tensor(5.4572, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"335000\n",
|
|
"33900 tensor(5.1889, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"336000\n",
|
|
"34000 tensor(5.4090, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"337000\n",
|
|
"34100 tensor(5.3798, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"338000\n",
|
|
"34200 tensor(5.4259, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"339000\n",
|
|
"34300 tensor(5.2132, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"340000\n",
|
|
"34400 tensor(5.6692, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"341000\n",
|
|
"34500 tensor(5.5324, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"342000\n",
|
|
"34600 tensor(5.4271, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"343000\n",
|
|
"34700 tensor(5.4978, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"344000\n",
|
|
"34800 tensor(5.5230, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"345000\n",
|
|
"34900 tensor(5.5652, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"346000\n",
|
|
"35000 tensor(5.5478, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"347000\n",
|
|
"35100 tensor(5.3700, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"348000\n",
|
|
"35200 tensor(5.2958, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"349000\n",
|
|
"35300 tensor(5.5219, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"350000\n",
|
|
"35400 tensor(5.1702, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"351000\n",
|
|
"35500 tensor(5.2604, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"352000\n",
|
|
"35600 tensor(5.3821, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"353000\n",
|
|
"35700 tensor(5.2551, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"354000\n",
|
|
"35800 tensor(5.3840, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"355000\n",
|
|
"35900 tensor(5.3635, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"356000\n",
|
|
"36000 tensor(5.1400, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"357000\n",
|
|
"36100 tensor(5.5134, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"358000\n",
|
|
"36200 tensor(5.3632, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"359000\n",
|
|
"36300 tensor(5.6461, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"360000\n",
|
|
"36400 tensor(5.3415, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"361000\n",
|
|
"36500 tensor(5.3659, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"362000\n",
|
|
"36600 tensor(5.3874, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"363000\n",
|
|
"36700 tensor(5.1886, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"364000\n",
|
|
"36800 tensor(5.2958, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"365000\n",
|
|
"36900 tensor(5.4094, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"366000\n",
|
|
"37000 tensor(5.3023, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"367000\n",
|
|
"37100 tensor(5.3287, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"368000\n",
|
|
"37200 tensor(5.3996, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"369000\n",
|
|
"37300 tensor(5.3001, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"370000\n",
|
|
"37400 tensor(5.6516, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"371000\n",
|
|
"37500 tensor(5.3366, device='cuda:0', grad_fn=<NllLossBackward0>)\n"
|
|
]
|
|
},
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"372000\n",
|
|
"37600 tensor(5.3282, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"373000\n",
|
|
"37700 tensor(5.5061, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"374000\n",
|
|
"37800 tensor(5.3408, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"375000\n",
|
|
"37900 tensor(5.3203, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"376000\n",
|
|
"38000 tensor(5.3996, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"377000\n",
|
|
"38100 tensor(5.4133, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"378000\n",
|
|
"38200 tensor(5.4262, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"379000\n",
|
|
"38300 tensor(5.3305, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"380000\n",
|
|
"38400 tensor(5.3983, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"381000\n",
|
|
"38500 tensor(5.4246, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"382000\n",
|
|
"38600 tensor(5.3713, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"383000\n",
|
|
"38700 tensor(5.3634, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"384000\n",
|
|
"38800 tensor(5.4504, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"385000\n",
|
|
"38900 tensor(5.5273, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"386000\n",
|
|
"39000 tensor(5.2229, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"387000\n",
|
|
"39100 tensor(5.4503, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"388000\n",
|
|
"39200 tensor(5.5406, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"389000\n",
|
|
"39300 tensor(5.3640, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"390000\n",
|
|
"39400 tensor(5.4311, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"391000\n",
|
|
"39500 tensor(5.5292, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"392000\n",
|
|
"39600 tensor(5.2217, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"393000\n",
|
|
"39700 tensor(5.2121, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"394000\n",
|
|
"39800 tensor(5.3415, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"395000\n",
|
|
"39900 tensor(5.1605, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"396000\n",
|
|
"40000 tensor(5.2472, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"397000\n",
|
|
"40100 tensor(5.3351, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"398000\n",
|
|
"40200 tensor(5.3198, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"399000\n",
|
|
"40300 tensor(5.3862, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"40400 tensor(5.3946, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"400000\n",
|
|
"40500 tensor(5.3120, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"401000\n",
|
|
"40600 tensor(5.3741, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"402000\n",
|
|
"40700 tensor(5.4199, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"403000\n",
|
|
"40800 tensor(5.3702, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"404000\n",
|
|
"40900 tensor(5.3212, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"405000\n",
|
|
"41000 tensor(5.3683, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"406000\n",
|
|
"41100 tensor(5.3491, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"407000\n",
|
|
"41200 tensor(5.2400, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"408000\n",
|
|
"41300 tensor(5.3728, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"409000\n",
|
|
"41400 tensor(5.2643, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"410000\n",
|
|
"41500 tensor(5.4064, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"411000\n",
|
|
"41600 tensor(5.3238, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"412000\n",
|
|
"41700 tensor(5.3469, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"413000\n",
|
|
"41800 tensor(5.3432, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"414000\n",
|
|
"41900 tensor(5.3521, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"415000\n",
|
|
"42000 tensor(5.5087, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"416000\n",
|
|
"42100 tensor(5.2556, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"417000\n",
|
|
"42200 tensor(5.3407, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"418000\n",
|
|
"42300 tensor(5.4058, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"419000\n",
|
|
"42400 tensor(5.2231, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"420000\n",
|
|
"42500 tensor(5.3912, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"421000\n",
|
|
"42600 tensor(5.1878, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"422000\n",
|
|
"42700 tensor(5.4955, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"423000\n",
|
|
"42800 tensor(5.4193, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"424000\n",
|
|
"42900 tensor(5.2662, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"425000\n",
|
|
"43000 tensor(5.4093, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"426000\n",
|
|
"43100 tensor(5.4089, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"427000\n",
|
|
"43200 tensor(5.2223, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"428000\n",
|
|
"43300 tensor(5.2456, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"429000\n",
|
|
"43400 tensor(5.4129, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"430000\n",
|
|
"43500 tensor(5.1283, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"431000\n",
|
|
"43600 tensor(5.3275, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"432000\n",
|
|
"epoch: = 4\n",
|
|
"0 tensor(5.3172, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"1000\n",
|
|
"100 tensor(5.3864, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"200 tensor(5.2618, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"2000\n",
|
|
"300 tensor(5.2652, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"3000\n",
|
|
"400 tensor(5.2749, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"4000\n",
|
|
"500 tensor(5.4347, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"5000\n",
|
|
"600 tensor(5.2271, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"6000\n",
|
|
"700 tensor(5.5396, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"7000\n",
|
|
"800 tensor(5.1379, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"8000\n",
|
|
"900 tensor(5.3861, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"9000\n",
|
|
"1000 tensor(5.2629, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"10000\n",
|
|
"1100 tensor(5.4575, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"11000\n",
|
|
"1200 tensor(5.4936, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"12000\n",
|
|
"1300 tensor(5.4281, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"13000\n",
|
|
"1400 tensor(5.4186, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"14000\n",
|
|
"1500 tensor(5.5070, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"15000\n",
|
|
"1600 tensor(5.1769, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"16000\n",
|
|
"1700 tensor(5.2856, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"17000\n",
|
|
"1800 tensor(5.2827, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"18000\n",
|
|
"1900 tensor(5.2544, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"19000\n",
|
|
"2000 tensor(5.3218, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"20000\n",
|
|
"2100 tensor(5.4549, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"21000\n",
|
|
"2200 tensor(5.2864, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"22000\n",
|
|
"2300 tensor(5.3145, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"23000\n",
|
|
"2400 tensor(5.2987, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"24000\n",
|
|
"2500 tensor(5.3498, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"25000\n",
|
|
"2600 tensor(5.3730, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"26000\n",
|
|
"2700 tensor(5.4017, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"27000\n",
|
|
"2800 tensor(5.4255, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"28000\n",
|
|
"2900 tensor(5.5475, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"29000\n",
|
|
"3000 tensor(5.2988, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"30000\n",
|
|
"3100 tensor(5.3753, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"31000\n",
|
|
"3200 tensor(5.4049, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"32000\n",
|
|
"3300 tensor(5.3206, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"33000\n",
|
|
"3400 tensor(5.2159, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"34000\n",
|
|
"3500 tensor(5.3423, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"35000\n",
|
|
"3600 tensor(5.3717, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"36000\n",
|
|
"3700 tensor(5.3042, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"37000\n",
|
|
"3800 tensor(5.3258, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"38000\n",
|
|
"3900 tensor(5.1989, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"39000\n",
|
|
"4000 tensor(5.2650, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"40000\n",
|
|
"4100 tensor(5.3953, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"41000\n",
|
|
"4200 tensor(5.4542, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"42000\n",
|
|
"4300 tensor(5.3466, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"43000\n",
|
|
"4400 tensor(5.4222, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"44000\n",
|
|
"4500 tensor(5.2254, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"45000\n",
|
|
"4600 tensor(5.5610, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"46000\n",
|
|
"4700 tensor(5.2753, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"47000\n",
|
|
"4800 tensor(5.4028, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"48000\n",
|
|
"4900 tensor(5.4516, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"49000\n",
|
|
"5000 tensor(5.4464, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"50000\n",
|
|
"5100 tensor(5.5018, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"51000\n",
|
|
"5200 tensor(5.5194, device='cuda:0', grad_fn=<NllLossBackward0>)\n"
|
|
]
|
|
},
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"52000\n",
|
|
"5300 tensor(5.5077, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"53000\n",
|
|
"5400 tensor(5.3746, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"54000\n",
|
|
"5500 tensor(5.4847, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"55000\n",
|
|
"5600 tensor(5.2664, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"56000\n",
|
|
"5700 tensor(5.5265, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"57000\n",
|
|
"5800 tensor(5.5101, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"58000\n",
|
|
"5900 tensor(5.4513, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"59000\n",
|
|
"6000 tensor(5.3554, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"60000\n",
|
|
"6100 tensor(5.3616, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"61000\n",
|
|
"6200 tensor(5.5360, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"62000\n",
|
|
"6300 tensor(5.3952, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"63000\n",
|
|
"6400 tensor(5.3132, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"64000\n",
|
|
"6500 tensor(5.1732, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"65000\n",
|
|
"6600 tensor(5.3505, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"66000\n",
|
|
"6700 tensor(5.2919, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"67000\n",
|
|
"6800 tensor(5.5064, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"68000\n",
|
|
"6900 tensor(5.4881, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"69000\n",
|
|
"7000 tensor(5.3978, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"70000\n",
|
|
"7100 tensor(5.2030, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"71000\n",
|
|
"7200 tensor(5.2738, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"72000\n",
|
|
"7300 tensor(5.5317, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"73000\n",
|
|
"7400 tensor(5.3487, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"74000\n",
|
|
"7500 tensor(5.2133, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"75000\n",
|
|
"7600 tensor(5.2878, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"76000\n",
|
|
"7700 tensor(5.3644, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"77000\n",
|
|
"7800 tensor(5.4711, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"78000\n",
|
|
"7900 tensor(5.1445, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"79000\n",
|
|
"8000 tensor(5.2138, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"80000\n",
|
|
"8100 tensor(5.3741, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"81000\n",
|
|
"8200 tensor(5.3893, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"82000\n",
|
|
"8300 tensor(5.3492, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"83000\n",
|
|
"8400 tensor(5.4797, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"84000\n",
|
|
"8500 tensor(5.4501, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"85000\n",
|
|
"8600 tensor(5.4600, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"86000\n",
|
|
"8700 tensor(5.5758, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"87000\n",
|
|
"8800 tensor(5.4493, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"88000\n",
|
|
"8900 tensor(5.3035, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"89000\n",
|
|
"9000 tensor(5.4164, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"90000\n",
|
|
"9100 tensor(5.4273, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"91000\n",
|
|
"9200 tensor(5.3343, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"92000\n",
|
|
"9300 tensor(5.3797, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"93000\n",
|
|
"9400 tensor(5.2260, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"94000\n",
|
|
"9500 tensor(5.3006, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"95000\n",
|
|
"9600 tensor(5.4211, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"96000\n",
|
|
"9700 tensor(5.3615, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"97000\n",
|
|
"9800 tensor(5.4089, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"9900 tensor(5.2200, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"98000\n",
|
|
"10000 tensor(5.4428, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"99000\n",
|
|
"10100 tensor(5.3219, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"100000\n",
|
|
"10200 tensor(5.2692, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"101000\n",
|
|
"10300 tensor(5.3854, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"102000\n",
|
|
"10400 tensor(5.3984, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"103000\n",
|
|
"10500 tensor(5.4516, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"104000\n",
|
|
"10600 tensor(5.5380, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"105000\n",
|
|
"10700 tensor(5.3724, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"106000\n",
|
|
"10800 tensor(5.3862, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"107000\n",
|
|
"10900 tensor(5.3102, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"108000\n",
|
|
"11000 tensor(5.5487, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"109000\n",
|
|
"11100 tensor(5.1684, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"110000\n",
|
|
"11200 tensor(5.3303, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"111000\n",
|
|
"11300 tensor(5.3537, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"112000\n",
|
|
"11400 tensor(5.3064, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"113000\n",
|
|
"11500 tensor(5.3775, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"114000\n",
|
|
"11600 tensor(5.3649, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"115000\n",
|
|
"11700 tensor(5.2920, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"116000\n",
|
|
"11800 tensor(5.4908, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"117000\n",
|
|
"11900 tensor(5.3293, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"118000\n",
|
|
"12000 tensor(5.3926, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"119000\n",
|
|
"12100 tensor(5.4045, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"120000\n",
|
|
"12200 tensor(5.4246, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"121000\n",
|
|
"12300 tensor(5.5096, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"122000\n",
|
|
"12400 tensor(5.3884, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"123000\n",
|
|
"12500 tensor(5.3057, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"124000\n",
|
|
"12600 tensor(5.3466, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"125000\n",
|
|
"12700 tensor(5.2898, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"126000\n",
|
|
"12800 tensor(5.4714, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"127000\n",
|
|
"12900 tensor(5.3255, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"128000\n",
|
|
"13000 tensor(5.1438, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"129000\n",
|
|
"13100 tensor(5.3498, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"130000\n",
|
|
"13200 tensor(5.3890, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"131000\n",
|
|
"13300 tensor(5.2710, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"132000\n",
|
|
"13400 tensor(5.3541, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"133000\n",
|
|
"13500 tensor(5.3156, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"134000\n",
|
|
"13600 tensor(5.3957, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"135000\n",
|
|
"13700 tensor(5.3548, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"136000\n",
|
|
"13800 tensor(5.3577, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"137000\n",
|
|
"13900 tensor(5.2122, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"138000\n",
|
|
"14000 tensor(5.4587, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"139000\n",
|
|
"14100 tensor(5.1704, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"140000\n",
|
|
"14200 tensor(5.4419, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"141000\n",
|
|
"14300 tensor(5.4142, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"142000\n",
|
|
"14400 tensor(5.3058, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"143000\n",
|
|
"14500 tensor(5.4082, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"144000\n",
|
|
"14600 tensor(5.4414, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"145000\n",
|
|
"14700 tensor(5.3177, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"146000\n",
|
|
"14800 tensor(5.7665, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"147000\n",
|
|
"14900 tensor(5.4171, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"148000\n",
|
|
"15000 tensor(5.2698, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"149000\n",
|
|
"15100 tensor(5.4915, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"150000\n",
|
|
"15200 tensor(5.3576, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"151000\n",
|
|
"15300 tensor(5.3567, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"152000\n",
|
|
"15400 tensor(5.2379, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"153000\n",
|
|
"15500 tensor(5.4092, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"154000\n",
|
|
"15600 tensor(5.4042, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"155000\n",
|
|
"15700 tensor(5.3017, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"156000\n",
|
|
"15800 tensor(5.2188, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"157000\n",
|
|
"15900 tensor(5.3497, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"158000\n",
|
|
"16000 tensor(5.1718, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"159000\n",
|
|
"16100 tensor(5.2145, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"160000\n",
|
|
"16200 tensor(5.5591, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"161000\n",
|
|
"16300 tensor(5.3864, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"162000\n",
|
|
"16400 tensor(5.3719, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"163000\n",
|
|
"16500 tensor(5.4842, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"164000\n",
|
|
"16600 tensor(5.3329, device='cuda:0', grad_fn=<NllLossBackward0>)\n"
|
|
]
|
|
},
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"165000\n",
|
|
"16700 tensor(5.3130, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"166000\n",
|
|
"16800 tensor(5.3903, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"167000\n",
|
|
"16900 tensor(5.1551, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"168000\n",
|
|
"17000 tensor(5.4229, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"169000\n",
|
|
"17100 tensor(5.2686, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"170000\n",
|
|
"17200 tensor(5.4099, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"171000\n",
|
|
"17300 tensor(5.4198, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"172000\n",
|
|
"17400 tensor(5.2162, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"173000\n",
|
|
"17500 tensor(5.2385, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"174000\n",
|
|
"17600 tensor(5.2786, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"175000\n",
|
|
"17700 tensor(5.2576, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"176000\n",
|
|
"17800 tensor(5.3158, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"177000\n",
|
|
"17900 tensor(5.2105, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"178000\n",
|
|
"18000 tensor(5.4627, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"179000\n",
|
|
"18100 tensor(5.3966, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"180000\n",
|
|
"18200 tensor(5.3108, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"181000\n",
|
|
"18300 tensor(5.3148, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"182000\n",
|
|
"18400 tensor(5.3321, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"183000\n",
|
|
"18500 tensor(5.3171, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"184000\n",
|
|
"18600 tensor(5.5247, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"185000\n",
|
|
"18700 tensor(5.3469, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"186000\n",
|
|
"18800 tensor(5.5092, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"187000\n",
|
|
"18900 tensor(5.3902, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"188000\n",
|
|
"19000 tensor(5.3904, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"189000\n",
|
|
"19100 tensor(5.5019, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"190000\n",
|
|
"19200 tensor(5.3838, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"191000\n",
|
|
"19300 tensor(5.4674, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"192000\n",
|
|
"19400 tensor(5.3223, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"193000\n",
|
|
"19500 tensor(5.3235, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"194000\n",
|
|
"19600 tensor(5.3589, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"195000\n",
|
|
"19700 tensor(5.4063, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"196000\n",
|
|
"19800 tensor(5.2838, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"197000\n",
|
|
"19900 tensor(5.2807, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"198000\n",
|
|
"20000 tensor(5.3038, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"199000\n",
|
|
"20100 tensor(5.2397, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"200000\n",
|
|
"20200 tensor(5.1723, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"201000\n",
|
|
"20300 tensor(5.2827, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"20400 tensor(5.3245, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"202000\n",
|
|
"20500 tensor(5.3303, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"203000\n",
|
|
"20600 tensor(5.5211, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"204000\n",
|
|
"20700 tensor(5.3629, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"205000\n",
|
|
"20800 tensor(5.1882, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"206000\n",
|
|
"20900 tensor(5.4671, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"207000\n",
|
|
"21000 tensor(5.3110, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"208000\n",
|
|
"21100 tensor(5.3181, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"209000\n",
|
|
"21200 tensor(5.1968, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"210000\n",
|
|
"21300 tensor(5.3940, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"211000\n",
|
|
"21400 tensor(5.4308, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"212000\n",
|
|
"21500 tensor(5.2127, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"213000\n",
|
|
"21600 tensor(5.2003, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"214000\n",
|
|
"21700 tensor(5.1881, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"215000\n",
|
|
"21800 tensor(5.3180, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"216000\n",
|
|
"21900 tensor(5.3197, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"217000\n",
|
|
"22000 tensor(5.3005, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"218000\n",
|
|
"22100 tensor(5.1776, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"219000\n",
|
|
"22200 tensor(5.0509, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"220000\n",
|
|
"22300 tensor(5.4807, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"221000\n",
|
|
"22400 tensor(5.2040, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"222000\n",
|
|
"22500 tensor(5.2161, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"223000\n",
|
|
"22600 tensor(5.4083, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"224000\n",
|
|
"22700 tensor(5.1619, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"225000\n",
|
|
"22800 tensor(5.4301, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"226000\n",
|
|
"22900 tensor(5.4791, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"227000\n",
|
|
"23000 tensor(5.3785, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"228000\n",
|
|
"23100 tensor(5.3705, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"229000\n",
|
|
"23200 tensor(5.3633, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"230000\n",
|
|
"23300 tensor(5.4443, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"231000\n",
|
|
"23400 tensor(5.4496, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"232000\n",
|
|
"23500 tensor(5.2961, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"233000\n",
|
|
"23600 tensor(5.2603, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"234000\n",
|
|
"23700 tensor(5.2793, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"235000\n",
|
|
"23800 tensor(5.1461, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"236000\n",
|
|
"23900 tensor(5.2376, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"237000\n",
|
|
"24000 tensor(5.2269, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"238000\n",
|
|
"24100 tensor(5.3154, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"239000\n",
|
|
"24200 tensor(5.4852, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"240000\n",
|
|
"24300 tensor(5.3785, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"241000\n",
|
|
"24400 tensor(5.5053, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"242000\n",
|
|
"24500 tensor(5.2987, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"243000\n",
|
|
"24600 tensor(5.4275, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"244000\n",
|
|
"24700 tensor(5.3283, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"245000\n",
|
|
"24800 tensor(5.3707, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"246000\n",
|
|
"24900 tensor(5.4294, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"247000\n",
|
|
"25000 tensor(5.4479, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"248000\n",
|
|
"25100 tensor(5.3629, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"249000\n",
|
|
"25200 tensor(5.3849, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"250000\n",
|
|
"25300 tensor(5.4124, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"251000\n",
|
|
"25400 tensor(5.3932, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"252000\n",
|
|
"25500 tensor(5.2893, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"253000\n",
|
|
"25600 tensor(5.5512, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"254000\n",
|
|
"25700 tensor(5.3227, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"255000\n",
|
|
"25800 tensor(5.4217, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"256000\n",
|
|
"25900 tensor(5.3637, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"257000\n",
|
|
"26000 tensor(5.3632, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"258000\n",
|
|
"26100 tensor(5.2841, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"259000\n",
|
|
"26200 tensor(5.2107, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"260000\n",
|
|
"26300 tensor(5.4024, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"261000\n",
|
|
"26400 tensor(5.4410, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"262000\n",
|
|
"26500 tensor(5.1685, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"263000\n",
|
|
"26600 tensor(5.5023, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"264000\n",
|
|
"26700 tensor(5.3654, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"265000\n",
|
|
"26800 tensor(5.5407, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"266000\n",
|
|
"26900 tensor(5.3000, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"267000\n",
|
|
"27000 tensor(5.2141, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"268000\n",
|
|
"27100 tensor(5.2490, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"269000\n",
|
|
"27200 tensor(5.2850, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"270000\n",
|
|
"27300 tensor(5.4811, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"271000\n",
|
|
"27400 tensor(5.3561, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"272000\n",
|
|
"27500 tensor(5.2602, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"273000\n",
|
|
"27600 tensor(5.5429, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"274000\n",
|
|
"27700 tensor(5.3794, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"275000\n",
|
|
"27800 tensor(5.3792, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"276000\n",
|
|
"27900 tensor(5.4873, device='cuda:0', grad_fn=<NllLossBackward0>)\n"
|
|
]
|
|
},
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"277000\n",
|
|
"28000 tensor(5.3454, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"278000\n",
|
|
"28100 tensor(5.3113, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"279000\n",
|
|
"28200 tensor(5.4785, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"280000\n",
|
|
"28300 tensor(5.1013, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"281000\n",
|
|
"28400 tensor(5.5403, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"282000\n",
|
|
"28500 tensor(5.3676, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"283000\n",
|
|
"28600 tensor(5.3108, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"284000\n",
|
|
"28700 tensor(5.4403, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"285000\n",
|
|
"28800 tensor(5.4926, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"286000\n",
|
|
"28900 tensor(5.3638, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"287000\n",
|
|
"29000 tensor(5.2819, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"288000\n",
|
|
"29100 tensor(5.0362, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"289000\n",
|
|
"29200 tensor(5.1871, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"290000\n",
|
|
"29300 tensor(5.4697, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"291000\n",
|
|
"29400 tensor(5.5909, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"292000\n",
|
|
"29500 tensor(5.3807, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"293000\n",
|
|
"29600 tensor(5.2398, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"294000\n",
|
|
"29700 tensor(5.3690, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"295000\n",
|
|
"29800 tensor(5.2220, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"296000\n",
|
|
"29900 tensor(5.4597, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"297000\n",
|
|
"30000 tensor(5.2205, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"298000\n",
|
|
"30100 tensor(5.3061, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"299000\n",
|
|
"30200 tensor(5.2432, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"300000\n",
|
|
"30300 tensor(5.3527, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"301000\n",
|
|
"30400 tensor(5.1823, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"302000\n",
|
|
"30500 tensor(5.3526, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"30600 tensor(5.3318, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"303000\n",
|
|
"30700 tensor(5.3634, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"304000\n",
|
|
"30800 tensor(5.3571, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"305000\n",
|
|
"30900 tensor(5.3875, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"306000\n",
|
|
"31000 tensor(5.4983, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"307000\n",
|
|
"31100 tensor(5.1554, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"308000\n",
|
|
"31200 tensor(5.1952, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"309000\n",
|
|
"31300 tensor(5.4546, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"310000\n",
|
|
"31400 tensor(5.2307, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"311000\n",
|
|
"31500 tensor(5.4188, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"312000\n",
|
|
"31600 tensor(5.4085, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"313000\n",
|
|
"31700 tensor(5.3744, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"314000\n",
|
|
"31800 tensor(5.4766, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"315000\n",
|
|
"31900 tensor(5.1062, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"316000\n",
|
|
"32000 tensor(5.2924, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"317000\n",
|
|
"32100 tensor(5.1728, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"318000\n",
|
|
"32200 tensor(5.4863, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"319000\n",
|
|
"32300 tensor(5.4748, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"320000\n",
|
|
"32400 tensor(5.4518, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"321000\n",
|
|
"32500 tensor(5.2752, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"322000\n",
|
|
"32600 tensor(5.3822, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"323000\n",
|
|
"32700 tensor(5.3088, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"324000\n",
|
|
"32800 tensor(5.5403, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"325000\n",
|
|
"32900 tensor(5.4000, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"326000\n",
|
|
"33000 tensor(5.1837, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"327000\n",
|
|
"33100 tensor(5.3888, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"328000\n",
|
|
"33200 tensor(5.4849, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"329000\n",
|
|
"33300 tensor(5.2471, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"330000\n",
|
|
"33400 tensor(5.5246, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"331000\n",
|
|
"33500 tensor(5.3479, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"332000\n",
|
|
"33600 tensor(5.3043, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"333000\n",
|
|
"33700 tensor(5.3487, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"334000\n",
|
|
"33800 tensor(5.4368, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"335000\n",
|
|
"33900 tensor(5.1620, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"336000\n",
|
|
"34000 tensor(5.3873, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"337000\n",
|
|
"34100 tensor(5.3545, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"338000\n",
|
|
"34200 tensor(5.4001, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"339000\n",
|
|
"34300 tensor(5.1902, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"340000\n",
|
|
"34400 tensor(5.6453, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"341000\n",
|
|
"34500 tensor(5.5124, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"342000\n",
|
|
"34600 tensor(5.4069, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"343000\n",
|
|
"34700 tensor(5.4734, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"344000\n",
|
|
"34800 tensor(5.5014, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"345000\n",
|
|
"34900 tensor(5.5412, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"346000\n",
|
|
"35000 tensor(5.5132, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"347000\n",
|
|
"35100 tensor(5.3455, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"348000\n",
|
|
"35200 tensor(5.2694, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"349000\n",
|
|
"35300 tensor(5.4988, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"350000\n",
|
|
"35400 tensor(5.1485, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"351000\n",
|
|
"35500 tensor(5.2299, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"352000\n",
|
|
"35600 tensor(5.3643, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"353000\n",
|
|
"35700 tensor(5.2247, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"354000\n",
|
|
"35800 tensor(5.3615, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"355000\n",
|
|
"35900 tensor(5.3453, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"356000\n",
|
|
"36000 tensor(5.1217, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"357000\n",
|
|
"36100 tensor(5.4909, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"358000\n",
|
|
"36200 tensor(5.3382, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"359000\n",
|
|
"36300 tensor(5.6225, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"360000\n",
|
|
"36400 tensor(5.3167, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"361000\n",
|
|
"36500 tensor(5.3458, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"362000\n",
|
|
"36600 tensor(5.3608, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"363000\n",
|
|
"36700 tensor(5.1660, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"364000\n",
|
|
"36800 tensor(5.2737, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"365000\n",
|
|
"36900 tensor(5.3883, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"366000\n",
|
|
"37000 tensor(5.2783, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"367000\n",
|
|
"37100 tensor(5.3110, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"368000\n",
|
|
"37200 tensor(5.3794, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"369000\n",
|
|
"37300 tensor(5.2802, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"370000\n",
|
|
"37400 tensor(5.6133, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"371000\n",
|
|
"37500 tensor(5.3138, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"372000\n",
|
|
"37600 tensor(5.3083, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"373000\n",
|
|
"37700 tensor(5.4860, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"374000\n",
|
|
"37800 tensor(5.3216, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"375000\n",
|
|
"37900 tensor(5.2969, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"376000\n",
|
|
"38000 tensor(5.3759, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"377000\n",
|
|
"38100 tensor(5.3914, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"378000\n",
|
|
"38200 tensor(5.4089, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"379000\n",
|
|
"38300 tensor(5.3068, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"380000\n",
|
|
"38400 tensor(5.3798, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"381000\n",
|
|
"38500 tensor(5.4051, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"382000\n",
|
|
"38600 tensor(5.3471, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"383000\n",
|
|
"38700 tensor(5.3415, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"384000\n",
|
|
"38800 tensor(5.4310, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"385000\n",
|
|
"38900 tensor(5.5029, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"386000\n",
|
|
"39000 tensor(5.2021, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"387000\n",
|
|
"39100 tensor(5.4283, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"388000\n",
|
|
"39200 tensor(5.5158, device='cuda:0', grad_fn=<NllLossBackward0>)\n"
|
|
]
|
|
},
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"389000\n",
|
|
"39300 tensor(5.3452, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"390000\n",
|
|
"39400 tensor(5.4111, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"391000\n",
|
|
"39500 tensor(5.4969, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"392000\n",
|
|
"39600 tensor(5.1952, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"393000\n",
|
|
"39700 tensor(5.1946, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"394000\n",
|
|
"39800 tensor(5.3234, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"395000\n",
|
|
"39900 tensor(5.1354, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"396000\n",
|
|
"40000 tensor(5.2210, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"397000\n",
|
|
"40100 tensor(5.3133, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"398000\n",
|
|
"40200 tensor(5.2990, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"399000\n",
|
|
"40300 tensor(5.3684, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"40400 tensor(5.3700, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"400000\n",
|
|
"40500 tensor(5.2911, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"401000\n",
|
|
"40600 tensor(5.3497, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"402000\n",
|
|
"40700 tensor(5.3981, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"403000\n",
|
|
"40800 tensor(5.3436, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"404000\n",
|
|
"40900 tensor(5.2978, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"405000\n",
|
|
"41000 tensor(5.3420, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"406000\n",
|
|
"41100 tensor(5.3342, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"407000\n",
|
|
"41200 tensor(5.2226, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"408000\n",
|
|
"41300 tensor(5.3573, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"409000\n",
|
|
"41400 tensor(5.2448, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"410000\n",
|
|
"41500 tensor(5.3863, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"411000\n",
|
|
"41600 tensor(5.3051, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"412000\n",
|
|
"41700 tensor(5.3294, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"413000\n",
|
|
"41800 tensor(5.3191, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"414000\n",
|
|
"41900 tensor(5.3289, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"415000\n",
|
|
"42000 tensor(5.4860, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"416000\n",
|
|
"42100 tensor(5.2358, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"417000\n",
|
|
"42200 tensor(5.3253, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"418000\n",
|
|
"42300 tensor(5.3869, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"419000\n",
|
|
"42400 tensor(5.2062, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"420000\n",
|
|
"42500 tensor(5.3712, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"421000\n",
|
|
"42600 tensor(5.1718, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"422000\n",
|
|
"42700 tensor(5.4735, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"423000\n",
|
|
"42800 tensor(5.3973, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"424000\n",
|
|
"42900 tensor(5.2447, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"425000\n",
|
|
"43000 tensor(5.3896, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"426000\n",
|
|
"43100 tensor(5.3916, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"427000\n",
|
|
"43200 tensor(5.2044, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"428000\n",
|
|
"43300 tensor(5.2167, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"429000\n",
|
|
"43400 tensor(5.3933, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"430000\n",
|
|
"43500 tensor(5.1078, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"431000\n",
|
|
"43600 tensor(5.3045, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
|
|
"432000\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"# Resume training from the last saved checkpoint; one extra checkpoint is written per epoch.\n",
|
|
"data = DataLoader(train_dataset, batch_size=batch_s)\n",
|
|
"optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)\n",
|
|
"criterion = torch.nn.NLLLoss()\n",
|
|
"torch.cuda.empty_cache()\n",
|
|
"gc.collect()\n",
|
|
"\n",
|
|
"# map_location keeps the load working when the checkpoint was saved from another device.\n",
|
|
"model.load_state_dict(torch.load('model-bigram_final.bin', map_location=device))\n",
|
|
"for i in range(1, epochs + 1):\n",
|
|
"    print('epoch: =', i)\n",
|
|
"    model.train()\n",
|
|
"    for step, (x, y) in enumerate(data):\n",
|
|
"        x = x.to(device)\n",
|
|
"        y = y.to(device)\n",
|
|
"        optimizer.zero_grad()\n",
|
|
"        ypredicted = model(x)\n",
|
|
"        # NLLLoss expects log-probabilities, so the model output goes through torch.log first.\n",
|
|
"        loss = criterion(torch.log(ypredicted), y)\n",
|
|
"        if step % 100 == 0:\n",
|
|
"            # .item() logs a plain float instead of the full tensor repr (device, grad_fn).\n",
|
|
"            print(step, loss.item())\n",
|
|
"        loss.backward()\n",
|
|
"        optimizer.step()\n",
|
|
"    torch.save(model.state_dict(), f'model-bigram_2nd-run{i}.bin')\n",
|
|
"torch.save(model.state_dict(), 'model-bigram_final.bin')"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 17,
|
|
"metadata": {
|
|
"collapsed": true
|
|
},
|
|
"outputs": [
|
|
{
|
|
"data": {
|
|
"text/plain": [
|
|
"[('be', 11, 0.2570849657058716),\n",
|
|
" ('<unk>', 0, 0.07411641627550125),\n",
|
|
" ('not', 22, 0.05940083786845207),\n",
|
|
" ('have', 28, 0.02751326560974121),\n",
|
|
" ('bo', 167, 0.014936885796487331),\n",
|
|
" ('make', 116, 0.013943656347692013),\n",
|
|
" ('give', 193, 0.011286991648375988),\n",
|
|
" ('take', 153, 0.011171611957252026),\n",
|
|
" ('do', 86, 0.010088067501783371),\n",
|
|
" ('he', 20, 0.009703895077109337)]"
|
|
]
|
|
},
|
|
"execution_count": 17,
|
|
"metadata": {},
|
|
"output_type": "execute_result"
|
|
}
|
|
],
|
|
"source": [
|
|
"# Fall back to the CPU when no GPU is present so the cell still runs on a fresh machine.\n",
|
|
"device = 'cuda' if torch.cuda.is_available() else 'cpu'\n",
|
|
"torch.cuda.empty_cache()\n",
|
|
"model = SimpleBigramNeuralLanguageModel(vocab_size, embed_size).to(device)\n",
|
|
"model.load_state_dict(torch.load('model-bigram_final.bin', map_location=device))\n",
|
|
"model.eval()\n",
|
|
"\n",
|
|
"ixs = torch.tensor(vocab.forward(['will'])).to(device)\n",
|
|
"\n",
|
|
"# Inference only: skip autograd bookkeeping.\n",
|
|
"with torch.no_grad():\n",
|
|
"    out = model(ixs)\n",
|
|
"top = torch.topk(out[0], 10)\n",
|
|
"top_indices = top.indices.tolist()\n",
|
|
"top_probs = top.values.tolist()\n",
|
|
"top_words = vocab.lookup_tokens(top_indices)\n",
|
|
"list(zip(top_words, top_indices, top_probs))"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 34,
|
|
"metadata": {
|
|
"collapsed": true
|
|
},
|
|
"outputs": [
|
|
{
|
|
"data": {
|
|
"text/plain": [
|
|
"[('<unk>', 0, 0.19996878504753113),\n",
|
|
" ('and', 3, 0.05288130044937134),\n",
|
|
" ('of', 2, 0.042051784694194794),\n",
|
|
" ('the', 1, 0.026572922244668007),\n",
|
|
" ('to', 4, 0.022689413279294968),\n",
|
|
" ('in', 6, 0.015904497355222702),\n",
|
|
" ('The', 17, 0.012827681377530098),\n",
|
|
" ('a', 5, 0.00961760152131319),\n",
|
|
" ('for', 8, 0.008938422426581383),\n",
|
|
" ('</s>', 32, 0.00840282253921032)]"
|
|
]
|
|
},
|
|
"execution_count": 34,
|
|
"metadata": {},
|
|
"output_type": "execute_result"
|
|
}
|
|
],
|
|
"source": [
|
|
"vocab = train_dataset.vocab\n",
|
|
"ixs = torch.tensor(vocab.forward(['cerned.'])).to(device)\n",
|
|
"\n",
|
|
"# Inference only: skip autograd bookkeeping.\n",
|
|
"with torch.no_grad():\n",
|
|
"    out = model(ixs)\n",
|
|
"top = torch.topk(out[0], 10)\n",
|
|
"top_indices = top.indices.tolist()\n",
|
|
"top_probs = top.values.tolist()\n",
|
|
"top_words = vocab.lookup_tokens(top_indices)\n",
|
|
"list(zip(top_words, top_indices, top_probs))"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 33,
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"data": {
|
|
"text/plain": [
|
|
"[('<unk>', 0, 1.0),\n",
|
|
" ('particular,', 14538, 0.24527804553508759),\n",
|
|
" ('revolution.', 20446, 0.23776617646217346),\n",
|
|
" ('Territory.', 14189, 0.23417341709136963),\n",
|
|
" ('or-', 2261, 0.22888363897800446),\n",
|
|
" ('3', 479, 0.2288265973329544),\n",
|
|
" ('speak.', 13722, 0.2252315878868103),\n",
|
|
" ('attend.', 19397, 0.22110989689826965),\n",
|
|
" ('say,', 1455, 0.22106117010116577),\n",
|
|
" ('Lee.', 15326, 0.21764159202575684)]"
|
|
]
|
|
},
|
|
"execution_count": 33,
|
|
"metadata": {},
|
|
"output_type": "execute_result"
|
|
}
|
|
],
|
|
"source": [
|
|
"cos = nn.CosineSimilarity(dim=1, eps=1e-6)\n",
|
|
"\n",
|
|
"# Detach: this is analysis only, so no gradients should be tracked on the embedding weights.\n",
|
|
"embeddings = model.model[0].weight.detach()\n",
|
|
"\n",
|
|
"vec = embeddings[vocab['cerned.']]\n",
|
|
"\n",
|
|
"similarities = cos(vec, embeddings)\n",
|
|
"\n",
|
|
"# The query row is itself part of `embeddings`, so it always comes back first with similarity 1.0.\n",
|
|
"top = torch.topk(similarities, 10)\n",
|
|
"\n",
|
|
"top_indices = top.indices.tolist()\n",
|
|
"top_probs = top.values.tolist()  # cosine similarities, not probabilities\n",
|
|
"top_words = vocab.lookup_tokens(top_indices)\n",
|
|
"list(zip(top_words, top_indices, top_probs))"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {
|
|
"scrolled": true
|
|
},
|
|
"outputs": [],
|
|
"source": []
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 30,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"def get_values_from_model(presc_word, model, vocab, k):\n",
|
|
"    \"\"\"Return the k highest-scoring next tokens for presc_word.\n",
|
|
"\n",
|
|
"    The word is mapped to an index with vocab.forward, fed to model on the\n",
|
|
"    notebook-level `device`, and the top-k entries of the first output row\n",
|
|
"    are returned as a list of (token, score) pairs.\n",
|
|
"    \"\"\"\n",
|
|
"    ixs = torch.tensor(vocab.forward([presc_word])).to(device)\n",
|
|
"    # Inference only: without no_grad every call would build an autograd graph.\n",
|
|
"    with torch.no_grad():\n",
|
|
"        out = model(ixs)\n",
|
|
"    top = torch.topk(out[0], k)\n",
|
|
"    top_indices = top.indices.tolist()\n",
|
|
"    top_probs = top.values.tolist()\n",
|
|
"    top_words = vocab.lookup_tokens(top_indices)\n",
|
|
"    return list(zip(top_words, top_probs))\n",
|
|
"\n",
|
|
"def gonito_format(dic):\n",
|
|
"    \"\"\"Render one prediction as a single output line.\n",
|
|
"\n",
|
|
"    The entries from summarize_probs_unk are written as tab-separated\n",
|
|
"    `token:prob` fields; the final entry is written without its token\n",
|
|
"    (`:prob`) and the line ends with a newline.\n",
|
|
"    \"\"\"\n",
|
|
"    tab = summarize_probs_unk(dic)\n",
|
|
"    fields = [str(token) + ':' + str(prob) for token, prob in tab[:-1]]\n",
|
|
"    fields.append(':' + str(tab[-1][1]))\n",
|
|
"    return '\\t'.join(fields) + '\\n'\n",
|
|
"\n",
|
|
"def summarize_probs_unk(dic):\n",
|
|
"    \"\"\"Normalise a token -> probability mapping and put the wildcard mass last.\n",
|
|
"\n",
|
|
"    Returns a list of (token, probability) pairs whose final entry is\n",
|
|
"    ('<unk>', wildcard). If '<unk>' is present, all values are divided by\n",
|
|
"    their sum and the '<unk>' share becomes the wildcard. Otherwise values\n",
|
|
"    are divided by sum * (1 + wildcard_minweight) (a notebook-level global)\n",
|
|
"    and the wildcard receives the remaining mass.\n",
|
|
"    The input mapping is left unchanged.\n",
|
|
"    \"\"\"\n",
|
|
"    probs = dict(dic)  # copy, so the caller's dict is neither rescaled nor stripped of '<unk>'\n",
|
|
"    probsum = sum(float(val) for val in probs.values())\n",
|
|
"    if '<unk>' in probs:\n",
|
|
"        wildcard = probs.pop('<unk>') / probsum\n",
|
|
"        tab = [(key, val / probsum) for key, val in probs.items()]\n",
|
|
"    else:\n",
|
|
"        # The extra (1 + wildcard_minweight) factor in the denominator reserves mass for the wildcard.\n",
|
|
"        scale = probsum * (1 + wildcard_minweight)\n",
|
|
"        tab = [(key, val / scale) for key, val in probs.items()]\n",
|
|
"        wildcard = 1 - sum(val for _, val in tab)\n",
|
|
"    tab.append(('<unk>', wildcard))\n",
|
|
"    return tab\n",
|
|
"\n"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 31,
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"data": {
|
|
"text/plain": [
|
|
"<All keys matched successfully>"
|
|
]
|
|
},
|
|
"execution_count": 31,
|
|
"metadata": {},
|
|
"output_type": "execute_result"
|
|
}
|
|
],
|
|
"source": [
|
|
"model.load_state_dict(torch.load('model-bigram_final.bin', map_location=device))"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 32,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"\n",
|
|
"# Collect the last left-context word of every test line; the input is closed before inference starts.\n",
|
|
"# NOTE(review): assumes the challenge files are UTF-8 - confirm.\n",
|
|
"predict_words = []\n",
|
|
"with lzma.open(test_file, 'rt', encoding='utf-8') as file:\n",
|
|
"    for line in file:\n",
|
|
"        line = preprocess(line)  # get only relevant\n",
|
|
"        split = line.split('\\t')\n",
|
|
"        predict_words.append(get_last_word(split[0]))\n",
|
|
"\n",
|
|
"vocab = train_dataset.vocab\n",
|
|
"results = [\n",
|
|
"    dict(get_values_from_model(presc_word, model, vocab, k=k))\n",
|
|
"    for presc_word in predict_words\n",
|
|
"]\n",
|
|
"\n",
|
|
"with open(out_file, 'w', encoding='utf-8') as outfile:\n",
|
|
"    for elem in results:\n",
|
|
"        outfile.write(gonito_format(elem))\n",
|
|
"\n"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": []
|
|
}
|
|
],
|
|
"metadata": {
|
|
"kernelspec": {
|
|
"display_name": "Python 3 (ipykernel)",
|
|
"language": "python",
|
|
"name": "python3"
|
|
},
|
|
"language_info": {
|
|
"codemirror_mode": {
|
|
"name": "ipython",
|
|
"version": 3
|
|
},
|
|
"file_extension": ".py",
|
|
"mimetype": "text/x-python",
|
|
"name": "python",
|
|
"nbconvert_exporter": "python",
|
|
"pygments_lexer": "ipython3",
|
|
"version": "3.10.6"
|
|
},
|
|
"org": null
|
|
},
|
|
"nbformat": 4,
|
|
"nbformat_minor": 1
|
|
}
|