challenging-america-word-ga.../zad7neural_networks.ipynb

{
"cells": [
{
"cell_type": "markdown",
"metadata": {
"collapsed": false,
"id": "8Iy6jV8cXBuT"
},
"source": [
"## Imports"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"collapsed": true,
"id": "vLUNBqCuXBuV",
"pycharm": {
"is_executing": true
}
},
"outputs": [],
"source": [
"import itertools\n",
"import lzma\n",
"\n",
"import regex as re\n",
"import torch\n",
"from torch import nn\n",
"from torch.utils.data import IterableDataset, DataLoader\n",
"from torchtext.vocab import build_vocab_from_iterator\n",
"from google.colab import drive"
]
},
{
"cell_type": "markdown",
"metadata": {
"collapsed": false,
"id": "y8M2LxjXXBuY"
},
"source": [
"## Definitions"
]
},
{
"cell_type": "markdown",
"metadata": {
"collapsed": false,
"id": "wMM1C4pKXBuY"
},
"source": [
"### Functions"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"id": "VYFHWbTlXBuZ"
},
"outputs": [],
"source": [
"def clean_text(line: str):\n",
" # Preprocessing\n",
" separated = line.split('\\t')\n",
" prefix = separated[6].replace(r'\\n', ' ').replace('\\\\n', ' ').replace(' ', ' ').replace('.', '').replace(',', '').replace('?', '').replace('!', '').replace('(', '').replace(')', '').replace(';', '').replace(':', '').replace('\"', '').replace(\"'\", '').replace('-', ' ').replace(' ', ' ')\n",
" suffix = separated[7].replace(r'\\n', ' ').replace('\\\\n', ' ').replace(' ', ' ').replace('.', '').replace(',', '').replace('?', '').replace('!', '').replace('(', '').replace(')', '').replace(';', '').replace(':', '').replace('\"', '').replace(\"'\", '').replace('-', ' ').replace(' ', ' ')\n",
" return prefix + ' ' + suffix"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"id": "qycsWH4gXBua"
},
"outputs": [],
"source": [
"def get_words_from_line(line):\n",
" line = clean_text(line)\n",
" for word in line.split():\n",
" yield word"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"id": "S3JF1_zWXBua"
},
"outputs": [],
"source": [
"def get_word_lines_from_file(file_name):\n",
" with lzma.open(file_name, mode='rt', encoding='utf-8') as fid:\n",
" for line in fid:\n",
" yield get_words_from_line(line)"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"id": "-20wlI9hXBub"
},
"outputs": [],
"source": [
"def look_ahead_iterator(gen):\n",
" prev = None\n",
" for item in gen:\n",
" if prev is not None:\n",
" yield (prev, item)\n",
" prev = item"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {
"id": "jL5ZrQGMXBub"
},
"outputs": [],
"source": [
"def prediction(word: str) -> str:\n",
" ixs = torch.tensor(vocab.forward([word])).to(device)\n",
" out = model(ixs)\n",
" top = torch.topk(out[0], 5)\n",
" top_indices = top.indices.tolist()\n",
" top_probs = top.values.tolist()\n",
" top_words = vocab.lookup_tokens(top_indices)\n",
" zipped = list(zip(top_words, top_probs))\n",
" for index, element in enumerate(zipped):\n",
" unk = None\n",
" if '<unk>' in element:\n",
" unk = zipped.pop(index)\n",
" zipped.append(('', unk[1]))\n",
" break\n",
" if unk is None:\n",
" zipped[-1] = ('', zipped[-1][1])\n",
" return ' '.join([f'{x[0]}:{x[1]}' for x in zipped])"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {
"id": "KByjDByYXBuc"
},
"outputs": [],
"source": [
"def save_outs(folder_name):\n",
" print(f'Creating outputs in {folder_name}')\n",
" with lzma.open(f'/content/drive/MyDrive/Colab Notebooks/{folder_name}/in.tsv.xz', mode='rt', encoding='utf-8') as fid:\n",
" with open(f'/content/drive/MyDrive/Colab Notebooks/{folder_name}/out.tsv', 'w', encoding='utf-8', newline='\\n') as f:\n",
" for line in fid:\n",
" separated = line.split('\\t')\n",
" prefix = separated[6].replace(r'\\n', ' ').split()[-1]\n",
" output_line = prediction(prefix)\n",
" f.write(output_line + '\\n')"
]
},
{
"cell_type": "markdown",
"metadata": {
"collapsed": false,
"id": "dHW2X57NXBud"
},
"source": [
"### Classes"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"class Bigrams(IterableDataset):\n",
" def __init__(self, text_file, vocabulary_size):\n",
" self.vocab = build_vocab_from_iterator(\n",
" get_word_lines_from_file(text_file),\n",
" max_tokens=vocabulary_size,\n",
" specials=['<unk>'])\n",
" self.vocab.set_default_index(self.vocab['<unk>'])\n",
" self.vocabulary_size = vocabulary_size\n",
" self.text_file = text_file\n",
"\n",
" def __iter__(self):\n",
" return look_ahead_iterator(\n",
" (self.vocab[t] for t in itertools.chain.from_iterable(get_word_lines_from_file(self.text_file))))"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "XQD2jLnOXBue"
},
"outputs": [],
"source": [
"class SimpleBigramNeuralLanguageModel(nn.Module):\n",
" def __init__(self, vocabulary_size, embedding_size):\n",
" super(SimpleBigramNeuralLanguageModel, self).__init__()\n",
" self.model = nn.Sequential(\n",
" nn.Embedding(vocabulary_size, embedding_size),\n",
" nn.Linear(embedding_size, vocabulary_size),\n",
" nn.Softmax()\n",
" )\n",
"\n",
" def forward(self, x):\n",
" return self.model(x)"
]
},
{
"cell_type": "markdown",
"metadata": {
"collapsed": false,
"id": "Mvodzlq6XBuf"
},
"source": [
"## Training"
]
},
{
"cell_type": "markdown",
"metadata": {
"collapsed": false,
"id": "zUDc1k5cXBuf"
},
"source": [
"### Params"
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {
"id": "ndnatbe3XBug"
},
"outputs": [],
"source": [
"vocab_size = 10000\n",
"embed_size = 100\n",
"batch_size = 2000\n",
"device = 'cuda'\n",
"path_to_train = '/content/drive/MyDrive/Colab Notebooks/train/in.tsv.xz'\n",
"path_to_model = 'modelneural_bigram.bin'"
]
},
{
"cell_type": "markdown",
"metadata": {
"collapsed": false,
"id": "7wF-1JG-XBug"
},
"source": [
"### Colab"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "Sf4dvmOPXBuh",
"outputId": "3ac75e94-6acd-4906-e9c0-5a5bbe099566"
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Mounted at /content/drive\n",
"/content/drive/MyDrive\n"
]
}
],
"source": [
"drive.mount('/content/drive')\n",
"%cd /content/drive/MyDrive/"
]
},
{
"cell_type": "markdown",
"metadata": {
"collapsed": false,
"id": "aeSaf6vvXBuh"
},
"source": [
"### Run"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {
"id": "dzWDCLo0XBuh"
},
"outputs": [],
"source": [
"vocab = build_vocab_from_iterator(\n",
" get_word_lines_from_file(path_to_train),\n",
" max_tokens=vocab_size,\n",
" specials=['<unk>']\n",
")\n",
"\n",
"vocab.set_default_index(vocab['<unk>'])"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {
"id": "FRo29Q3bXBui"
},
"outputs": [],
"source": [
"train_dataset = Bigrams(path_to_train, vocab_size)"
]
},
{
"cell_type": "code",
"execution_count": 21,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "mYxBeXjwXBui",
"outputId": "ebd5218f-6a5b-49ec-a2da-e478d63fe50d"
},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/usr/local/lib/python3.10/dist-packages/torch/nn/modules/container.py:217: UserWarning: Implicit dimension choice for softmax has been deprecated. Change the call to include dim=X as an argument.\n",
" input = module(input)\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"0 tensor(9.4517, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"100 tensor(7.9341, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"200 tensor(7.1452, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"300 tensor(6.7956, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"400 tensor(6.4127, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"500 tensor(6.3407, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"600 tensor(6.2125, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"700 tensor(5.7817, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"800 tensor(5.7309, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"900 tensor(5.7419, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"1000 tensor(5.7372, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"1100 tensor(5.2804, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"1200 tensor(5.4610, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"1300 tensor(5.6610, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"1400 tensor(5.3070, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"1500 tensor(4.9666, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"1600 tensor(5.2102, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"1700 tensor(5.4919, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"1800 tensor(5.1968, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"1900 tensor(5.3336, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"2000 tensor(5.2387, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"2100 tensor(5.2247, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"2200 tensor(5.2544, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"2300 tensor(5.3343, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"2400 tensor(5.3077, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"2500 tensor(5.1209, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"2600 tensor(5.3806, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"2700 tensor(5.2865, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"2800 tensor(5.2625, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"2900 tensor(5.2476, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"3000 tensor(5.2663, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"3100 tensor(5.0200, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"3200 tensor(5.2324, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"3300 tensor(5.1963, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"3400 tensor(5.1108, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"3500 tensor(5.1499, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"3600 tensor(5.3241, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"3700 tensor(5.1977, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"3800 tensor(5.1466, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"3900 tensor(5.2557, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"4000 tensor(5.0468, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"4100 tensor(5.1882, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"4200 tensor(5.0748, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"4300 tensor(4.9577, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"4400 tensor(4.8100, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"4500 tensor(5.0355, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"4600 tensor(5.1247, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"4700 tensor(5.0516, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"4800 tensor(4.9036, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"4900 tensor(5.0096, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"5000 tensor(5.2085, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"5100 tensor(5.0944, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"5200 tensor(5.1592, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"5300 tensor(5.2019, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"5400 tensor(5.2048, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"5500 tensor(5.0499, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"5600 tensor(5.0369, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"5700 tensor(5.2581, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"5800 tensor(5.0312, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"5900 tensor(5.0513, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"6000 tensor(5.2384, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"6100 tensor(5.0257, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"6200 tensor(5.1156, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"6300 tensor(4.9953, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"6400 tensor(5.2028, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"6500 tensor(4.8426, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"6600 tensor(5.0661, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"6700 tensor(5.0976, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"6800 tensor(4.9180, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"6900 tensor(4.9928, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"7000 tensor(5.1889, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"7100 tensor(4.9612, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"7200 tensor(5.1408, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"7300 tensor(5.0562, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"7400 tensor(4.8779, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"7500 tensor(5.0490, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"7600 tensor(5.0678, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"7700 tensor(4.9938, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"7800 tensor(5.0301, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"7900 tensor(5.2542, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"8000 tensor(4.8772, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"8100 tensor(5.0953, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"8200 tensor(5.0217, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"8300 tensor(5.0107, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"8400 tensor(5.0733, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"8500 tensor(4.5262, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"8600 tensor(5.0271, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"8700 tensor(4.6307, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"8800 tensor(4.9917, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"8900 tensor(5.1940, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"9000 tensor(5.0302, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"9100 tensor(5.0956, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"9200 tensor(5.0438, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"9300 tensor(5.0134, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"9400 tensor(5.2201, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"9500 tensor(4.8876, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"9600 tensor(5.1474, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"9700 tensor(5.0169, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"9800 tensor(5.0743, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"9900 tensor(4.9008, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"10000 tensor(5.1381, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"10100 tensor(5.0524, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"10200 tensor(5.0369, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"10300 tensor(5.0595, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"10400 tensor(5.0138, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"10500 tensor(5.0164, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"10600 tensor(4.9153, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"10700 tensor(4.9971, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"10800 tensor(5.0200, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"10900 tensor(4.9631, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"11000 tensor(4.9385, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"11100 tensor(4.9851, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"11200 tensor(5.0681, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"11300 tensor(5.1261, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"11400 tensor(5.0098, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"11500 tensor(5.1261, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"11600 tensor(5.1213, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"11700 tensor(5.0265, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"11800 tensor(4.7047, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"11900 tensor(5.1954, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"12000 tensor(5.0850, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"12100 tensor(4.9762, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"12200 tensor(5.0162, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"12300 tensor(4.9834, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"12400 tensor(4.8953, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"12500 tensor(5.0389, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"12600 tensor(4.9266, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"12700 tensor(5.0132, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"12800 tensor(5.1777, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"12900 tensor(4.8290, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"13000 tensor(5.0639, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"13100 tensor(5.0565, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"13200 tensor(5.0222, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"13300 tensor(5.2150, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"13400 tensor(4.9393, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"13500 tensor(5.0270, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"13600 tensor(4.9520, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"13700 tensor(4.9845, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"13800 tensor(4.8543, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"13900 tensor(4.8892, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"14000 tensor(4.9802, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"14100 tensor(4.9833, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"14200 tensor(4.9348, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"14300 tensor(4.9561, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"14400 tensor(5.0198, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"14500 tensor(4.9878, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"14600 tensor(4.7517, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"14700 tensor(4.9452, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"14800 tensor(4.8229, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"14900 tensor(5.1425, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"15000 tensor(4.9122, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"15100 tensor(4.8217, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"15200 tensor(4.8604, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"15300 tensor(5.1151, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"15400 tensor(4.9545, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"15500 tensor(5.0922, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"15600 tensor(4.7891, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"15700 tensor(4.6318, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"15800 tensor(4.9540, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"15900 tensor(4.7681, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"16000 tensor(4.9602, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"16100 tensor(4.9705, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"16200 tensor(4.8296, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"16300 tensor(5.0188, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"16400 tensor(5.1062, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"16500 tensor(5.2549, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"16600 tensor(5.1164, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"16700 tensor(4.9399, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"16800 tensor(5.1161, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"16900 tensor(4.9115, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"17000 tensor(4.7572, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"17100 tensor(4.9667, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"17200 tensor(4.7463, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"17300 tensor(4.9038, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"17400 tensor(4.9859, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"17500 tensor(5.0652, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"17600 tensor(4.6641, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"17700 tensor(4.9265, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"17800 tensor(5.0095, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"17900 tensor(5.1090, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"18000 tensor(4.9015, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"18100 tensor(4.9997, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"18200 tensor(4.8359, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"18300 tensor(4.7353, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"18400 tensor(4.9657, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"18500 tensor(4.9856, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"18600 tensor(5.0571, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"18700 tensor(4.8566, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"18800 tensor(4.9819, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"18900 tensor(4.9809, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"19000 tensor(5.0202, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"19100 tensor(5.1329, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"19200 tensor(5.0460, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"19300 tensor(4.9174, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"19400 tensor(5.1266, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"19500 tensor(4.8903, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"19600 tensor(5.0548, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"19700 tensor(4.9530, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"19800 tensor(4.9296, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"19900 tensor(4.9925, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"20000 tensor(4.9181, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"20100 tensor(4.9487, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"20200 tensor(5.0580, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"20300 tensor(5.1110, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"20400 tensor(4.8053, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"20500 tensor(4.7658, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"20600 tensor(4.7387, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"20700 tensor(4.9779, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"20800 tensor(4.8901, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"20900 tensor(4.9092, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"21000 tensor(5.2856, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"21100 tensor(4.9803, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"21200 tensor(4.6889, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"21300 tensor(4.8434, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"21400 tensor(4.7451, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"21500 tensor(4.9406, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"21600 tensor(4.8431, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"21700 tensor(4.9932, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"21800 tensor(4.6696, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"21900 tensor(4.8091, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"22000 tensor(4.7533, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"22100 tensor(4.6842, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"22200 tensor(4.8844, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"22300 tensor(5.1038, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"22400 tensor(4.9929, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"22500 tensor(5.0109, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"22600 tensor(4.8278, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"22700 tensor(4.8597, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"22800 tensor(5.0256, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"22900 tensor(4.4663, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"23000 tensor(4.6069, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"23100 tensor(5.0816, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"23200 tensor(4.9038, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"23300 tensor(4.9284, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"23400 tensor(5.0439, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"23500 tensor(4.9640, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"23600 tensor(5.0096, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"23700 tensor(4.9700, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"23800 tensor(4.9461, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"23900 tensor(4.8171, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"24000 tensor(4.9529, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"24100 tensor(4.8525, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"24200 tensor(5.0488, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"24300 tensor(4.9206, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"24400 tensor(5.0900, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"24500 tensor(4.9484, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"24600 tensor(4.8962, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"24700 tensor(4.8884, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"24800 tensor(5.1541, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"24900 tensor(4.9803, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"25000 tensor(4.4473, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"25100 tensor(4.7330, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"25200 tensor(5.0709, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"25300 tensor(4.7139, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"25400 tensor(4.8961, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"25500 tensor(4.9459, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"25600 tensor(4.8840, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"25700 tensor(4.7792, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"25800 tensor(4.9212, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"25900 tensor(4.7168, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"26000 tensor(4.7903, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"26100 tensor(4.9544, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"26200 tensor(4.8421, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"26300 tensor(4.8085, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"26400 tensor(4.7129, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"26500 tensor(5.0808, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"26600 tensor(4.8222, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"26700 tensor(4.7982, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"26800 tensor(4.8482, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"26900 tensor(5.0815, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"27000 tensor(4.9754, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"27100 tensor(5.0156, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"27200 tensor(4.7985, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"27300 tensor(4.6372, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"27400 tensor(4.5098, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"27500 tensor(5.0427, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"27600 tensor(4.9139, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"27700 tensor(4.8924, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"27800 tensor(4.9972, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"27900 tensor(5.0452, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"28000 tensor(4.5323, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"28100 tensor(4.8945, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"28200 tensor(4.8096, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"28300 tensor(5.1238, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"28400 tensor(4.9879, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"28500 tensor(4.9505, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"28600 tensor(4.7750, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"28700 tensor(5.0738, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"28800 tensor(4.9318, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"28900 tensor(5.0403, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"29000 tensor(4.9072, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"29100 tensor(4.9822, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"29200 tensor(4.8701, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"29300 tensor(4.8883, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"29400 tensor(4.8906, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"29500 tensor(5.0658, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"29600 tensor(4.7604, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"29700 tensor(5.0792, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"29800 tensor(4.9074, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"29900 tensor(4.8845, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"30000 tensor(5.1969, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"30100 tensor(4.9648, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"30200 tensor(4.9086, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"30300 tensor(4.9708, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"30400 tensor(4.9155, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"30500 tensor(4.9404, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"30600 tensor(5.0224, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"30700 tensor(5.0298, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"30800 tensor(4.9557, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"30900 tensor(4.9653, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"31000 tensor(4.8938, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"31100 tensor(4.6689, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"31200 tensor(4.9757, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"31300 tensor(4.8805, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"31400 tensor(4.9969, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"31500 tensor(4.8262, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"31600 tensor(4.5519, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"31700 tensor(4.9185, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"31800 tensor(4.9190, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"31900 tensor(4.8702, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"32000 tensor(4.9346, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"32100 tensor(4.8963, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"32200 tensor(4.9017, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"32300 tensor(4.9595, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"32400 tensor(4.8125, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"32500 tensor(4.9593, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"32600 tensor(5.0663, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"32700 tensor(4.9644, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"32800 tensor(4.8500, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"32900 tensor(5.0070, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"33000 tensor(4.8131, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"33100 tensor(5.0183, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"33200 tensor(4.8692, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"33300 tensor(4.9145, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"33400 tensor(5.0221, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"33500 tensor(4.9636, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"33600 tensor(4.8758, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"33700 tensor(4.8713, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"33800 tensor(4.7325, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"33900 tensor(4.9829, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"34000 tensor(4.7823, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"34100 tensor(4.9773, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"34200 tensor(4.9638, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"34300 tensor(5.0311, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"34400 tensor(4.9491, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"34500 tensor(4.9527, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"34600 tensor(4.7559, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"34700 tensor(4.9602, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"34800 tensor(5.0363, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"34900 tensor(4.9509, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"35000 tensor(4.8740, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"35100 tensor(4.8790, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"35200 tensor(4.7886, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"35300 tensor(4.9939, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"35400 tensor(4.8046, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"35500 tensor(5.0125, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"35600 tensor(4.8254, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"35700 tensor(4.5858, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"35800 tensor(5.0067, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"35900 tensor(5.0505, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"36000 tensor(4.9909, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"36100 tensor(4.8610, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"36200 tensor(4.9135, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"36300 tensor(5.0409, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"36400 tensor(4.8932, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"36500 tensor(4.8384, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"36600 tensor(4.8262, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"36700 tensor(4.8363, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"36800 tensor(4.9260, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"36900 tensor(4.7176, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"37000 tensor(4.8836, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"37100 tensor(4.7659, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"37200 tensor(5.0418, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"37300 tensor(4.7165, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"37400 tensor(4.7707, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"37500 tensor(4.9404, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"37600 tensor(4.7666, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"37700 tensor(5.0086, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"37800 tensor(4.8929, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"37900 tensor(5.0537, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"38000 tensor(4.8494, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"38100 tensor(5.1193, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"38200 tensor(4.9035, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"38300 tensor(4.7574, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"38400 tensor(4.9181, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"38500 tensor(5.0186, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"38600 tensor(5.0224, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"38700 tensor(4.6032, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"38800 tensor(5.1368, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"38900 tensor(4.9394, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"39000 tensor(4.7891, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"39100 tensor(4.9718, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"39200 tensor(4.9599, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"39300 tensor(4.8518, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"39400 tensor(4.7832, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"39500 tensor(4.9827, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"39600 tensor(5.0733, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"39700 tensor(4.8859, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"39800 tensor(4.9722, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"39900 tensor(5.0568, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"40000 tensor(4.8251, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"40100 tensor(4.8720, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"40200 tensor(5.3066, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"40300 tensor(4.9435, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"40400 tensor(4.9634, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"40500 tensor(4.8406, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"40600 tensor(4.8050, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"40700 tensor(4.6578, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"40800 tensor(4.8490, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"40900 tensor(5.1542, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"41000 tensor(4.8509, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"41100 tensor(4.8082, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"41200 tensor(4.8444, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"41300 tensor(5.1602, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"41400 tensor(4.7235, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"41500 tensor(5.0334, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"41600 tensor(5.0500, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"41700 tensor(5.0378, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"41800 tensor(4.7989, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"41900 tensor(4.9342, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"42000 tensor(4.9981, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"42100 tensor(4.6723, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"42200 tensor(4.9382, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"42300 tensor(4.9237, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"42400 tensor(4.9302, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"42500 tensor(4.8494, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"42600 tensor(4.9942, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"42700 tensor(4.9581, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"42800 tensor(4.8044, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"42900 tensor(5.0890, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"43000 tensor(4.9422, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"43100 tensor(5.0014, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"43200 tensor(4.9001, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"43300 tensor(4.9133, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"43400 tensor(4.8836, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"43500 tensor(4.8232, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"43600 tensor(4.8052, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"43700 tensor(5.0304, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"43800 tensor(5.0834, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"43900 tensor(4.8242, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"44000 tensor(4.8126, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"44100 tensor(4.7836, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"44200 tensor(5.0763, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"44300 tensor(5.0682, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"44400 tensor(4.8869, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"44500 tensor(4.8527, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"44600 tensor(4.8439, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"44700 tensor(4.9127, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"44800 tensor(4.9628, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"44900 tensor(5.0566, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"45000 tensor(5.0596, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"45100 tensor(5.1187, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"45200 tensor(5.0824, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"45300 tensor(4.8433, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"45400 tensor(4.7299, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"45500 tensor(5.1722, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"45600 tensor(4.7867, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"45700 tensor(4.9631, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"45800 tensor(4.6216, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"45900 tensor(4.9601, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"46000 tensor(4.9055, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"46100 tensor(5.0517, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"46200 tensor(5.0099, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"46300 tensor(4.8178, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"46400 tensor(4.9317, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"46500 tensor(4.8770, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"46600 tensor(4.9668, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"46700 tensor(5.1287, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"46800 tensor(4.9050, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"46900 tensor(4.9622, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"47000 tensor(4.6818, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"47100 tensor(4.8780, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"47200 tensor(4.9493, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"47300 tensor(4.7958, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"47400 tensor(4.5415, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"47500 tensor(5.0651, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"47600 tensor(4.9692, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"47700 tensor(4.8536, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"47800 tensor(4.7306, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"47900 tensor(5.1795, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"48000 tensor(4.9196, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"48100 tensor(5.1446, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"48200 tensor(4.9810, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"48300 tensor(4.9688, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"48400 tensor(5.0246, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"48500 tensor(4.7523, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"48600 tensor(4.7716, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"48700 tensor(4.8938, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"48800 tensor(4.9324, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"48900 tensor(4.9811, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"49000 tensor(4.8818, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"49100 tensor(4.9871, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"49200 tensor(4.8498, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"49300 tensor(4.8027, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"49400 tensor(5.0199, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"49500 tensor(4.9790, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"49600 tensor(5.0995, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"49700 tensor(4.8989, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"49800 tensor(4.8903, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"49900 tensor(4.6744, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"50000 tensor(4.9403, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"50100 tensor(4.7815, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"50200 tensor(4.8617, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"50300 tensor(4.4559, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"50400 tensor(5.0322, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"50500 tensor(4.6867, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"50600 tensor(4.9644, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"50700 tensor(5.0631, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"50800 tensor(4.7992, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"50900 tensor(4.9346, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"51000 tensor(4.6487, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"51100 tensor(4.8758, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"51200 tensor(5.0734, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"51300 tensor(4.8078, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"51400 tensor(4.7628, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"51500 tensor(4.8508, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"51600 tensor(4.8231, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"51700 tensor(5.0122, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"51800 tensor(4.8941, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"51900 tensor(5.0284, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"52000 tensor(4.9158, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"52100 tensor(4.8752, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"52200 tensor(4.7020, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"52300 tensor(4.6001, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"52400 tensor(4.7898, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"52500 tensor(4.8255, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"52600 tensor(4.7331, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"52700 tensor(4.8546, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"52800 tensor(4.9418, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"52900 tensor(4.7536, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"53000 tensor(4.9609, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"53100 tensor(5.0644, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"53200 tensor(4.8919, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"53300 tensor(4.7840, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"53400 tensor(4.8539, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"53500 tensor(4.8023, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"53600 tensor(4.9810, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"53700 tensor(4.9946, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"53800 tensor(4.3504, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"53900 tensor(4.8656, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"54000 tensor(5.0103, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"54100 tensor(4.8503, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"54200 tensor(4.9970, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"54300 tensor(4.5719, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"54400 tensor(4.7891, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"54500 tensor(4.8968, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"54600 tensor(5.0036, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"54700 tensor(4.9487, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"54800 tensor(4.8477, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"54900 tensor(4.9253, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"55000 tensor(4.9079, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"55100 tensor(4.9499, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"55200 tensor(5.0510, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"55300 tensor(4.9320, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"55400 tensor(4.5737, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"55500 tensor(4.7703, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"55600 tensor(5.0166, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"55700 tensor(4.9049, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"55800 tensor(4.7355, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"55900 tensor(4.5776, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"56000 tensor(4.9919, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"56100 tensor(4.8629, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"56200 tensor(5.0123, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"56300 tensor(4.3110, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"56400 tensor(4.8950, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"56500 tensor(4.8415, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"56600 tensor(4.7285, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"56700 tensor(4.8401, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"56800 tensor(4.7972, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"56900 tensor(4.7398, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"57000 tensor(5.1683, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"57100 tensor(4.9399, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"57200 tensor(4.9609, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"57300 tensor(4.9818, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"57400 tensor(4.9719, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"57500 tensor(4.8724, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"57600 tensor(4.9824, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"57700 tensor(5.0357, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"57800 tensor(5.0542, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"57900 tensor(4.8753, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"58000 tensor(4.7773, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"58100 tensor(4.7864, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"58200 tensor(4.8033, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"58300 tensor(4.9997, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"58400 tensor(4.9701, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"58500 tensor(4.8920, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"58600 tensor(4.9408, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"58700 tensor(5.1013, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"58800 tensor(4.8176, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"58900 tensor(4.7466, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"59000 tensor(4.9146, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"59100 tensor(4.8151, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"59200 tensor(4.9928, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"59300 tensor(5.0274, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"59400 tensor(4.7727, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"59500 tensor(5.0648, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"59600 tensor(4.9982, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"59700 tensor(4.8934, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"59800 tensor(4.8285, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"59900 tensor(4.8039, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"60000 tensor(4.9090, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"60100 tensor(4.6927, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"60200 tensor(4.8922, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"60300 tensor(4.8804, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"60400 tensor(4.9676, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"60500 tensor(4.7234, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"60600 tensor(4.9174, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"60700 tensor(4.9062, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"60800 tensor(5.0811, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"60900 tensor(5.1713, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"61000 tensor(4.9471, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"61100 tensor(4.8106, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"61200 tensor(4.8666, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"61300 tensor(4.8624, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"61400 tensor(4.5771, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"61500 tensor(4.8186, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"61600 tensor(4.7787, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"61700 tensor(4.9245, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"61800 tensor(5.0268, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"61900 tensor(5.2582, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"62000 tensor(4.8309, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"62100 tensor(4.9982, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"62200 tensor(4.8859, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"62300 tensor(4.5051, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"62400 tensor(4.6767, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"62500 tensor(4.7197, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"62600 tensor(4.6625, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"62700 tensor(4.6548, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"62800 tensor(4.7307, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"62900 tensor(4.9550, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"63000 tensor(4.5528, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"63100 tensor(4.8676, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"63200 tensor(4.9302, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"63300 tensor(4.8878, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"63400 tensor(4.9172, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"63500 tensor(4.7881, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"63600 tensor(4.8712, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"63700 tensor(4.9398, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"63800 tensor(4.9999, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"63900 tensor(4.8581, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"64000 tensor(4.6726, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"64100 tensor(5.0308, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"64200 tensor(4.7130, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"64300 tensor(4.9586, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"64400 tensor(4.9456, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"64500 tensor(4.8030, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"64600 tensor(4.9885, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"64700 tensor(4.9439, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"64800 tensor(4.6348, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"64900 tensor(4.8772, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"65000 tensor(4.9567, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"65100 tensor(4.9036, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"65200 tensor(4.7526, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"65300 tensor(4.9206, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"65400 tensor(4.8406, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"65500 tensor(4.5461, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"65600 tensor(4.9647, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"65700 tensor(4.9128, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"65800 tensor(4.8554, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"65900 tensor(4.8749, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"66000 tensor(5.1345, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"66100 tensor(4.6254, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"66200 tensor(4.9932, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"66300 tensor(4.5778, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"66400 tensor(4.7925, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"66500 tensor(4.9761, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"66600 tensor(4.9166, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"66700 tensor(4.8186, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"66800 tensor(4.9063, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"66900 tensor(4.9770, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"67000 tensor(4.8087, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"67100 tensor(4.7366, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"67200 tensor(5.0656, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"67300 tensor(4.9718, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"67400 tensor(4.8172, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"67500 tensor(4.9368, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"67600 tensor(4.9278, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"67700 tensor(4.8133, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"67800 tensor(4.9486, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"67900 tensor(4.8521, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"68000 tensor(4.9510, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"68100 tensor(4.8939, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"68200 tensor(4.8088, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"68300 tensor(4.9821, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"68400 tensor(5.1750, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"68500 tensor(4.6476, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"68600 tensor(4.8567, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"68700 tensor(4.8663, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"68800 tensor(5.0268, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"68900 tensor(4.8717, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"69000 tensor(4.9166, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"69100 tensor(4.9094, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"69200 tensor(4.7433, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"69300 tensor(4.5366, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"69400 tensor(5.0260, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"69500 tensor(4.7304, device='cuda:0', grad_fn=<NllLossBackward0>)\n"
]
}
],
"source": [
"model = SimpleBigramNeuralLanguageModel(vocab_size, embed_size).to(device)\n",
"data = DataLoader(train_dataset, batch_size=batch_size)\n",
"optimizer = torch.optim.Adam(model.parameters())\n",
"criterion = torch.nn.NLLLoss()\n",
"\n",
"model.train()\n",
"step = 0\n",
"for x, y in data:\n",
" x = x.to(device)\n",
" y = y.to(device)\n",
" optimizer.zero_grad()\n",
" ypredicted = model(x)\n",
" loss = criterion(torch.log(ypredicted), y)\n",
" if step % 100 == 0:\n",
" print(step, loss)\n",
" step += 1\n",
" loss.backward()\n",
" optimizer.step()"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "wfLtxqN6gFCw",
"outputId": "1be9876e-eb88-4ed0-a40e-3546aa6c5ad4"
},
"outputs": [
{
"data": {
"text/plain": [
"True"
]
},
"execution_count": 16,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"import torch\n",
"torch.cuda.is_available()"
]
},
{
"cell_type": "code",
"execution_count": 22,
"metadata": {
"id": "bp60AtU0XBuj"
},
"outputs": [],
"source": [
"torch.save(model.state_dict(), path_to_model)"
]
},
{
"cell_type": "code",
"execution_count": 23,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "BwN-Q2sFXBuj",
"outputId": "a444be6d-bfb3-4235-c48c-41ba6cbfeec1"
},
"outputs": [
{
"data": {
"text/plain": [
"SimpleBigramNeuralLanguageModel(\n",
" (model): Sequential(\n",
" (0): Embedding(10000, 100)\n",
" (1): Linear(in_features=100, out_features=10000, bias=True)\n",
" (2): Softmax(dim=None)\n",
" )\n",
")"
]
},
"execution_count": 23,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"model = SimpleBigramNeuralLanguageModel(vocab_size, embed_size).to(device)\n",
"model.load_state_dict(torch.load(path_to_model))\n",
"model.eval()"
]
},
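{
"cell_type": "markdown",
"metadata": {},
"source": [
"A quick sanity check on the reloaded model: print its top five continuations of a sample previous word. The word `the` is only an example input; any token from the vocabulary works."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# illustrative check: top-5 candidates in word:probability format for an example previous word\n",
"print(prediction('the'))"
]
},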
{
"cell_type": "code",
"execution_count": 29,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "QVBhjgB1XBuk",
"outputId": "ee63bb8b-57c8-40fb-94fe-cd00e0fa82b8"
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Creating outputs in dev-0\n"
]
}
],
"source": [
"save_outs('dev-0')"
]
},
{
"cell_type": "code",
"execution_count": 30,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "5BglgEAxXBuk",
"outputId": "4fda63a1-94d8-4daa-dbd7-d6a640e57f40"
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Creating outputs in test-A\n"
]
}
],
"source": [
"save_outs('test-A')"
]
}
],
"metadata": {
"accelerator": "GPU",
"colab": {
"provenance": []
},
"gpuClass": "standard",
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 2
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython2",
"version": "2.7.6"
}
},
"nbformat": 4,
"nbformat_minor": 0
}