challenging-america-word-ga.../zad7neural_networks.ipynb
2023-04-28 09:55:14 +02:00

70 KiB

Imports

import itertools
import lzma

import regex as re
import torch
from torch import nn
from torch.utils.data import IterableDataset, DataLoader
from torchtext.vocab import build_vocab_from_iterator
from google.colab import drive

Definitions

Functions

# Deletes . , ? ! ( ) ; : " ' and turns hyphens into spaces in a single pass.
_PUNCTUATION_TABLE = str.maketrans('-', ' ', '.,?!();:"\'')


def clean_text(line: str):
    """Normalise columns 6 and 7 of a tab-separated row and join them with a space.

    Each column has literal ``\\n`` escape sequences replaced by spaces, the
    punctuation characters ``. , ? ! ( ) ; : " '`` removed, and hyphens turned
    into spaces. Double spaces are collapsed once before and once after the
    punctuation step (a single, non-repeating pass each time).

    Raises:
        IndexError: if the row has fewer than eight tab-separated columns.
    """
    def _normalise(field):
        # Escaped newlines first, then one pass of double-space collapsing.
        field = field.replace(r'\n', ' ').replace('\\\\n', ' ').replace('  ', ' ')
        # Punctuation removal may create new double spaces; collapse once more.
        return field.translate(_PUNCTUATION_TABLE).replace('  ', ' ')

    columns = line.split('\t')
    return ' '.join((_normalise(columns[6]), _normalise(columns[7])))
def get_words_from_line(line):
    """Lazily yield the whitespace-separated tokens of ``clean_text(line)``."""
    yield from clean_text(line).split()
def get_word_lines_from_file(file_name):
    with lzma.open(file_name, mode='rt', encoding='utf-8') as fid:
        for line in fid:
            yield get_words_from_line(line)
def look_ahead_iterator(gen):
    """Yield ``(previous, current)`` tuples for consecutive items of ``gen``.

    ``None`` doubles as the "no previous item yet" marker, so a pair is never
    emitted with ``None`` as its first element.
    """
    previous = None
    for current in gen:
        if previous is None:
            # Nothing to pair with yet; just remember this item.
            previous = current
            continue
        yield (previous, current)
        previous = current
def prediction(word: str) -> str:
    """Return the model's top-5 next-token guesses for ``word`` as one output line.

    Relies on the module-level ``vocab``, ``model`` and ``device``.

    Args:
        word: The token to condition on; it is looked up through ``vocab``.

    Returns:
        Space-separated ``token:probability`` pairs. The last pair always has an
        empty token (``:probability``): if ``<unk>`` is among the top five, its
        entry is moved to the end and blanked; otherwise the fifth-ranked token
        is blanked in place.
    """
    ixs = torch.tensor(vocab.forward([word])).to(device)
    # Inference only: no need to record an autograd graph for every query.
    with torch.no_grad():
        out = model(ixs)
    top = torch.topk(out[0], 5)
    candidates = list(zip(vocab.lookup_tokens(top.indices.tolist()),
                          top.values.tolist()))

    # Initialised before the loop so the check below is always well-defined,
    # even when the candidate list is empty.
    unk = None
    for index, (token, _) in enumerate(candidates):
        if token == '<unk>':
            unk = candidates.pop(index)
            candidates.append(('', unk[1]))
            break
    if unk is None and candidates:
        candidates[-1] = ('', candidates[-1][1])
    return ' '.join(f'{token}:{prob}' for token, prob in candidates)
def save_outs(folder_name):
    """Write one prediction line per row of ``<folder_name>/in.tsv.xz`` to ``out.tsv``.

    Both files live under ``/content/drive/MyDrive/Colab Notebooks/<folder_name>/``.
    For every input row, the last word of column 6 is passed to ``prediction``
    and the returned line is written to the output file.

    Args:
        folder_name: Name of the sub-folder holding ``in.tsv.xz``.

    Raises:
        IndexError: if a row has fewer than seven tab-separated columns.
    """
    print(f'Creating outputs in {folder_name}')
    base = f'/content/drive/MyDrive/Colab Notebooks/{folder_name}'
    with lzma.open(f'{base}/in.tsv.xz', mode='rt', encoding='utf-8') as fid, \
            open(f'{base}/out.tsv', 'w', encoding='utf-8', newline='\n') as f:
        for line in fid:
            separated = line.split('\t')
            # Apply the same normalisation the training text went through, so a
            # context ending in e.g. "word," is looked up as "word". clean_text
            # expects a full row and joins columns 6 and 7, so hand it a row
            # whose only non-empty column is the context.
            context_row = '\t'.join([''] * 6 + [separated[6], ''])
            context_words = clean_text(context_row).split()
            # An empty context used to raise IndexError; fall back to an empty
            # token and let the vocabulary lookup decide how to map it.
            prefix = context_words[-1] if context_words else ''
            output_line = prediction(prefix)
            f.write(output_line + '\n')

Classes

class Bigrams(IterableDataset):
    """Iterable dataset of consecutive token-id pairs read from a compressed corpus.

    On construction a vocabulary of at most ``vocabulary_size`` entries
    (including the ``<unk>`` special) is built from ``text_file``; unknown
    tokens map to the ``<unk>`` index.
    """

    def __init__(self, text_file, vocabulary_size):
        self.text_file = text_file
        self.vocabulary_size = vocabulary_size

        vocabulary = build_vocab_from_iterator(
            get_word_lines_from_file(text_file),
            max_tokens=vocabulary_size,
            specials=['<unk>'])
        # Out-of-vocabulary lookups resolve to the <unk> index.
        vocabulary.set_default_index(vocabulary['<unk>'])
        self.vocab = vocabulary

    def __iter__(self):
        # All lines are flattened into one token stream, so pairs are formed
        # across line boundaries as well.
        token_stream = itertools.chain.from_iterable(
            get_word_lines_from_file(self.text_file))
        token_ids = (self.vocab[token] for token in token_stream)
        return look_ahead_iterator(token_ids)
class SimpleBigramNeuralLanguageModel(nn.Module):
    """Embedding -> linear -> softmax model over a fixed vocabulary.

    Args:
        vocabulary_size: Number of rows in the embedding table and number of
            output classes.
        embedding_size: Dimensionality of the embedding vectors.
    """

    def __init__(self, vocabulary_size, embedding_size):
        super().__init__()
        self.model = nn.Sequential(
            nn.Embedding(vocabulary_size, embedding_size),
            nn.Linear(embedding_size, vocabulary_size),
            # Normalise over the vocabulary axis explicitly; relying on the
            # implicit dimension is deprecated and emits a UserWarning.
            nn.Softmax(dim=-1),
        )

    def forward(self, x):
        """Return probabilities over the vocabulary for each index in ``x``."""
        return self.model(x)

Training

Params

# Hyper-parameters and paths used by the cells below.
vocab_size = 10000   # maximum number of vocabulary entries (including <unk>)
embed_size = 100     # embedding dimensionality
batch_size = 2000    # bigram pairs per training batch
# Prefer the GPU, but keep the notebook runnable on a CPU-only runtime.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
path_to_train = '/content/drive/MyDrive/Colab Notebooks/train/in.tsv.xz'
path_to_model = 'modelneural_bigram.bin'

Colab

# Mount Google Drive at /content/drive (uses google.colab.drive).
drive.mount('/content/drive')
# IPython magic: change the notebook's working directory to the Drive folder.
%cd /content/drive/MyDrive/
Mounted at /content/drive
/content/drive/MyDrive

Run

# Bigrams builds the vocabulary (with <unk> as default index) in its
# constructor; reuse that instance instead of scanning the compressed corpus
# a second time to build an identical one.
train_dataset = Bigrams(path_to_train, vocab_size)
vocab = train_dataset.vocab

model = SimpleBigramNeuralLanguageModel(vocab_size, embed_size).to(device)
data = DataLoader(train_dataset, batch_size=batch_size)
optimizer = torch.optim.Adam(model.parameters())
# The model outputs probabilities, so the loss is fed their logarithm.
criterion = torch.nn.NLLLoss()

# Single pass over the bigram stream, logging the batch loss every 100 steps.
model.train()
step = 0
for x, y in data:
    x, y = x.to(device), y.to(device)

    optimizer.zero_grad()
    ypredicted = model(x)
    # The model emits probabilities; NLLLoss expects log-probabilities.
    loss = criterion(torch.log(ypredicted), y)
    loss.backward()
    optimizer.step()

    if not step % 100:
        print(step, loss)
    step += 1
/usr/local/lib/python3.10/dist-packages/torch/nn/modules/container.py:217: UserWarning: Implicit dimension choice for softmax has been deprecated. Change the call to include dim=X as an argument.
  input = module(input)
0 tensor(9.4517, device='cuda:0', grad_fn=<NllLossBackward0>)
100 tensor(7.9341, device='cuda:0', grad_fn=<NllLossBackward0>)
200 tensor(7.1452, device='cuda:0', grad_fn=<NllLossBackward0>)
300 tensor(6.7956, device='cuda:0', grad_fn=<NllLossBackward0>)
400 tensor(6.4127, device='cuda:0', grad_fn=<NllLossBackward0>)
500 tensor(6.3407, device='cuda:0', grad_fn=<NllLossBackward0>)
600 tensor(6.2125, device='cuda:0', grad_fn=<NllLossBackward0>)
700 tensor(5.7817, device='cuda:0', grad_fn=<NllLossBackward0>)
800 tensor(5.7309, device='cuda:0', grad_fn=<NllLossBackward0>)
900 tensor(5.7419, device='cuda:0', grad_fn=<NllLossBackward0>)
1000 tensor(5.7372, device='cuda:0', grad_fn=<NllLossBackward0>)
1100 tensor(5.2804, device='cuda:0', grad_fn=<NllLossBackward0>)
1200 tensor(5.4610, device='cuda:0', grad_fn=<NllLossBackward0>)
1300 tensor(5.6610, device='cuda:0', grad_fn=<NllLossBackward0>)
1400 tensor(5.3070, device='cuda:0', grad_fn=<NllLossBackward0>)
1500 tensor(4.9666, device='cuda:0', grad_fn=<NllLossBackward0>)
1600 tensor(5.2102, device='cuda:0', grad_fn=<NllLossBackward0>)
1700 tensor(5.4919, device='cuda:0', grad_fn=<NllLossBackward0>)
1800 tensor(5.1968, device='cuda:0', grad_fn=<NllLossBackward0>)
1900 tensor(5.3336, device='cuda:0', grad_fn=<NllLossBackward0>)
2000 tensor(5.2387, device='cuda:0', grad_fn=<NllLossBackward0>)
2100 tensor(5.2247, device='cuda:0', grad_fn=<NllLossBackward0>)
2200 tensor(5.2544, device='cuda:0', grad_fn=<NllLossBackward0>)
2300 tensor(5.3343, device='cuda:0', grad_fn=<NllLossBackward0>)
2400 tensor(5.3077, device='cuda:0', grad_fn=<NllLossBackward0>)
2500 tensor(5.1209, device='cuda:0', grad_fn=<NllLossBackward0>)
2600 tensor(5.3806, device='cuda:0', grad_fn=<NllLossBackward0>)
2700 tensor(5.2865, device='cuda:0', grad_fn=<NllLossBackward0>)
2800 tensor(5.2625, device='cuda:0', grad_fn=<NllLossBackward0>)
2900 tensor(5.2476, device='cuda:0', grad_fn=<NllLossBackward0>)
3000 tensor(5.2663, device='cuda:0', grad_fn=<NllLossBackward0>)
3100 tensor(5.0200, device='cuda:0', grad_fn=<NllLossBackward0>)
3200 tensor(5.2324, device='cuda:0', grad_fn=<NllLossBackward0>)
3300 tensor(5.1963, device='cuda:0', grad_fn=<NllLossBackward0>)
3400 tensor(5.1108, device='cuda:0', grad_fn=<NllLossBackward0>)
3500 tensor(5.1499, device='cuda:0', grad_fn=<NllLossBackward0>)
3600 tensor(5.3241, device='cuda:0', grad_fn=<NllLossBackward0>)
3700 tensor(5.1977, device='cuda:0', grad_fn=<NllLossBackward0>)
3800 tensor(5.1466, device='cuda:0', grad_fn=<NllLossBackward0>)
3900 tensor(5.2557, device='cuda:0', grad_fn=<NllLossBackward0>)
4000 tensor(5.0468, device='cuda:0', grad_fn=<NllLossBackward0>)
4100 tensor(5.1882, device='cuda:0', grad_fn=<NllLossBackward0>)
4200 tensor(5.0748, device='cuda:0', grad_fn=<NllLossBackward0>)
4300 tensor(4.9577, device='cuda:0', grad_fn=<NllLossBackward0>)
4400 tensor(4.8100, device='cuda:0', grad_fn=<NllLossBackward0>)
4500 tensor(5.0355, device='cuda:0', grad_fn=<NllLossBackward0>)
4600 tensor(5.1247, device='cuda:0', grad_fn=<NllLossBackward0>)
4700 tensor(5.0516, device='cuda:0', grad_fn=<NllLossBackward0>)
4800 tensor(4.9036, device='cuda:0', grad_fn=<NllLossBackward0>)
4900 tensor(5.0096, device='cuda:0', grad_fn=<NllLossBackward0>)
5000 tensor(5.2085, device='cuda:0', grad_fn=<NllLossBackward0>)
5100 tensor(5.0944, device='cuda:0', grad_fn=<NllLossBackward0>)
5200 tensor(5.1592, device='cuda:0', grad_fn=<NllLossBackward0>)
5300 tensor(5.2019, device='cuda:0', grad_fn=<NllLossBackward0>)
5400 tensor(5.2048, device='cuda:0', grad_fn=<NllLossBackward0>)
5500 tensor(5.0499, device='cuda:0', grad_fn=<NllLossBackward0>)
5600 tensor(5.0369, device='cuda:0', grad_fn=<NllLossBackward0>)
5700 tensor(5.2581, device='cuda:0', grad_fn=<NllLossBackward0>)
5800 tensor(5.0312, device='cuda:0', grad_fn=<NllLossBackward0>)
5900 tensor(5.0513, device='cuda:0', grad_fn=<NllLossBackward0>)
6000 tensor(5.2384, device='cuda:0', grad_fn=<NllLossBackward0>)
6100 tensor(5.0257, device='cuda:0', grad_fn=<NllLossBackward0>)
6200 tensor(5.1156, device='cuda:0', grad_fn=<NllLossBackward0>)
6300 tensor(4.9953, device='cuda:0', grad_fn=<NllLossBackward0>)
6400 tensor(5.2028, device='cuda:0', grad_fn=<NllLossBackward0>)
6500 tensor(4.8426, device='cuda:0', grad_fn=<NllLossBackward0>)
6600 tensor(5.0661, device='cuda:0', grad_fn=<NllLossBackward0>)
6700 tensor(5.0976, device='cuda:0', grad_fn=<NllLossBackward0>)
6800 tensor(4.9180, device='cuda:0', grad_fn=<NllLossBackward0>)
6900 tensor(4.9928, device='cuda:0', grad_fn=<NllLossBackward0>)
7000 tensor(5.1889, device='cuda:0', grad_fn=<NllLossBackward0>)
7100 tensor(4.9612, device='cuda:0', grad_fn=<NllLossBackward0>)
7200 tensor(5.1408, device='cuda:0', grad_fn=<NllLossBackward0>)
7300 tensor(5.0562, device='cuda:0', grad_fn=<NllLossBackward0>)
7400 tensor(4.8779, device='cuda:0', grad_fn=<NllLossBackward0>)
7500 tensor(5.0490, device='cuda:0', grad_fn=<NllLossBackward0>)
7600 tensor(5.0678, device='cuda:0', grad_fn=<NllLossBackward0>)
7700 tensor(4.9938, device='cuda:0', grad_fn=<NllLossBackward0>)
7800 tensor(5.0301, device='cuda:0', grad_fn=<NllLossBackward0>)
7900 tensor(5.2542, device='cuda:0', grad_fn=<NllLossBackward0>)
8000 tensor(4.8772, device='cuda:0', grad_fn=<NllLossBackward0>)
8100 tensor(5.0953, device='cuda:0', grad_fn=<NllLossBackward0>)
8200 tensor(5.0217, device='cuda:0', grad_fn=<NllLossBackward0>)
8300 tensor(5.0107, device='cuda:0', grad_fn=<NllLossBackward0>)
8400 tensor(5.0733, device='cuda:0', grad_fn=<NllLossBackward0>)
8500 tensor(4.5262, device='cuda:0', grad_fn=<NllLossBackward0>)
8600 tensor(5.0271, device='cuda:0', grad_fn=<NllLossBackward0>)
8700 tensor(4.6307, device='cuda:0', grad_fn=<NllLossBackward0>)
8800 tensor(4.9917, device='cuda:0', grad_fn=<NllLossBackward0>)
8900 tensor(5.1940, device='cuda:0', grad_fn=<NllLossBackward0>)
9000 tensor(5.0302, device='cuda:0', grad_fn=<NllLossBackward0>)
9100 tensor(5.0956, device='cuda:0', grad_fn=<NllLossBackward0>)
9200 tensor(5.0438, device='cuda:0', grad_fn=<NllLossBackward0>)
9300 tensor(5.0134, device='cuda:0', grad_fn=<NllLossBackward0>)
9400 tensor(5.2201, device='cuda:0', grad_fn=<NllLossBackward0>)
9500 tensor(4.8876, device='cuda:0', grad_fn=<NllLossBackward0>)
9600 tensor(5.1474, device='cuda:0', grad_fn=<NllLossBackward0>)
9700 tensor(5.0169, device='cuda:0', grad_fn=<NllLossBackward0>)
9800 tensor(5.0743, device='cuda:0', grad_fn=<NllLossBackward0>)
9900 tensor(4.9008, device='cuda:0', grad_fn=<NllLossBackward0>)
10000 tensor(5.1381, device='cuda:0', grad_fn=<NllLossBackward0>)
10100 tensor(5.0524, device='cuda:0', grad_fn=<NllLossBackward0>)
10200 tensor(5.0369, device='cuda:0', grad_fn=<NllLossBackward0>)
10300 tensor(5.0595, device='cuda:0', grad_fn=<NllLossBackward0>)
10400 tensor(5.0138, device='cuda:0', grad_fn=<NllLossBackward0>)
10500 tensor(5.0164, device='cuda:0', grad_fn=<NllLossBackward0>)
10600 tensor(4.9153, device='cuda:0', grad_fn=<NllLossBackward0>)
10700 tensor(4.9971, device='cuda:0', grad_fn=<NllLossBackward0>)
10800 tensor(5.0200, device='cuda:0', grad_fn=<NllLossBackward0>)
10900 tensor(4.9631, device='cuda:0', grad_fn=<NllLossBackward0>)
11000 tensor(4.9385, device='cuda:0', grad_fn=<NllLossBackward0>)
11100 tensor(4.9851, device='cuda:0', grad_fn=<NllLossBackward0>)
11200 tensor(5.0681, device='cuda:0', grad_fn=<NllLossBackward0>)
11300 tensor(5.1261, device='cuda:0', grad_fn=<NllLossBackward0>)
11400 tensor(5.0098, device='cuda:0', grad_fn=<NllLossBackward0>)
11500 tensor(5.1261, device='cuda:0', grad_fn=<NllLossBackward0>)
11600 tensor(5.1213, device='cuda:0', grad_fn=<NllLossBackward0>)
11700 tensor(5.0265, device='cuda:0', grad_fn=<NllLossBackward0>)
11800 tensor(4.7047, device='cuda:0', grad_fn=<NllLossBackward0>)
11900 tensor(5.1954, device='cuda:0', grad_fn=<NllLossBackward0>)
12000 tensor(5.0850, device='cuda:0', grad_fn=<NllLossBackward0>)
12100 tensor(4.9762, device='cuda:0', grad_fn=<NllLossBackward0>)
12200 tensor(5.0162, device='cuda:0', grad_fn=<NllLossBackward0>)
12300 tensor(4.9834, device='cuda:0', grad_fn=<NllLossBackward0>)
12400 tensor(4.8953, device='cuda:0', grad_fn=<NllLossBackward0>)
12500 tensor(5.0389, device='cuda:0', grad_fn=<NllLossBackward0>)
12600 tensor(4.9266, device='cuda:0', grad_fn=<NllLossBackward0>)
12700 tensor(5.0132, device='cuda:0', grad_fn=<NllLossBackward0>)
12800 tensor(5.1777, device='cuda:0', grad_fn=<NllLossBackward0>)
12900 tensor(4.8290, device='cuda:0', grad_fn=<NllLossBackward0>)
13000 tensor(5.0639, device='cuda:0', grad_fn=<NllLossBackward0>)
13100 tensor(5.0565, device='cuda:0', grad_fn=<NllLossBackward0>)
13200 tensor(5.0222, device='cuda:0', grad_fn=<NllLossBackward0>)
13300 tensor(5.2150, device='cuda:0', grad_fn=<NllLossBackward0>)
13400 tensor(4.9393, device='cuda:0', grad_fn=<NllLossBackward0>)
13500 tensor(5.0270, device='cuda:0', grad_fn=<NllLossBackward0>)
13600 tensor(4.9520, device='cuda:0', grad_fn=<NllLossBackward0>)
13700 tensor(4.9845, device='cuda:0', grad_fn=<NllLossBackward0>)
13800 tensor(4.8543, device='cuda:0', grad_fn=<NllLossBackward0>)
13900 tensor(4.8892, device='cuda:0', grad_fn=<NllLossBackward0>)
14000 tensor(4.9802, device='cuda:0', grad_fn=<NllLossBackward0>)
14100 tensor(4.9833, device='cuda:0', grad_fn=<NllLossBackward0>)
14200 tensor(4.9348, device='cuda:0', grad_fn=<NllLossBackward0>)
14300 tensor(4.9561, device='cuda:0', grad_fn=<NllLossBackward0>)
14400 tensor(5.0198, device='cuda:0', grad_fn=<NllLossBackward0>)
14500 tensor(4.9878, device='cuda:0', grad_fn=<NllLossBackward0>)
14600 tensor(4.7517, device='cuda:0', grad_fn=<NllLossBackward0>)
14700 tensor(4.9452, device='cuda:0', grad_fn=<NllLossBackward0>)
14800 tensor(4.8229, device='cuda:0', grad_fn=<NllLossBackward0>)
14900 tensor(5.1425, device='cuda:0', grad_fn=<NllLossBackward0>)
15000 tensor(4.9122, device='cuda:0', grad_fn=<NllLossBackward0>)
15100 tensor(4.8217, device='cuda:0', grad_fn=<NllLossBackward0>)
15200 tensor(4.8604, device='cuda:0', grad_fn=<NllLossBackward0>)
15300 tensor(5.1151, device='cuda:0', grad_fn=<NllLossBackward0>)
15400 tensor(4.9545, device='cuda:0', grad_fn=<NllLossBackward0>)
15500 tensor(5.0922, device='cuda:0', grad_fn=<NllLossBackward0>)
15600 tensor(4.7891, device='cuda:0', grad_fn=<NllLossBackward0>)
15700 tensor(4.6318, device='cuda:0', grad_fn=<NllLossBackward0>)
15800 tensor(4.9540, device='cuda:0', grad_fn=<NllLossBackward0>)
15900 tensor(4.7681, device='cuda:0', grad_fn=<NllLossBackward0>)
16000 tensor(4.9602, device='cuda:0', grad_fn=<NllLossBackward0>)
16100 tensor(4.9705, device='cuda:0', grad_fn=<NllLossBackward0>)
16200 tensor(4.8296, device='cuda:0', grad_fn=<NllLossBackward0>)
16300 tensor(5.0188, device='cuda:0', grad_fn=<NllLossBackward0>)
16400 tensor(5.1062, device='cuda:0', grad_fn=<NllLossBackward0>)
16500 tensor(5.2549, device='cuda:0', grad_fn=<NllLossBackward0>)
16600 tensor(5.1164, device='cuda:0', grad_fn=<NllLossBackward0>)
16700 tensor(4.9399, device='cuda:0', grad_fn=<NllLossBackward0>)
16800 tensor(5.1161, device='cuda:0', grad_fn=<NllLossBackward0>)
16900 tensor(4.9115, device='cuda:0', grad_fn=<NllLossBackward0>)
17000 tensor(4.7572, device='cuda:0', grad_fn=<NllLossBackward0>)
17100 tensor(4.9667, device='cuda:0', grad_fn=<NllLossBackward0>)
17200 tensor(4.7463, device='cuda:0', grad_fn=<NllLossBackward0>)
17300 tensor(4.9038, device='cuda:0', grad_fn=<NllLossBackward0>)
17400 tensor(4.9859, device='cuda:0', grad_fn=<NllLossBackward0>)
17500 tensor(5.0652, device='cuda:0', grad_fn=<NllLossBackward0>)
17600 tensor(4.6641, device='cuda:0', grad_fn=<NllLossBackward0>)
17700 tensor(4.9265, device='cuda:0', grad_fn=<NllLossBackward0>)
17800 tensor(5.0095, device='cuda:0', grad_fn=<NllLossBackward0>)
17900 tensor(5.1090, device='cuda:0', grad_fn=<NllLossBackward0>)
18000 tensor(4.9015, device='cuda:0', grad_fn=<NllLossBackward0>)
18100 tensor(4.9997, device='cuda:0', grad_fn=<NllLossBackward0>)
18200 tensor(4.8359, device='cuda:0', grad_fn=<NllLossBackward0>)
18300 tensor(4.7353, device='cuda:0', grad_fn=<NllLossBackward0>)
18400 tensor(4.9657, device='cuda:0', grad_fn=<NllLossBackward0>)
18500 tensor(4.9856, device='cuda:0', grad_fn=<NllLossBackward0>)
18600 tensor(5.0571, device='cuda:0', grad_fn=<NllLossBackward0>)
18700 tensor(4.8566, device='cuda:0', grad_fn=<NllLossBackward0>)
18800 tensor(4.9819, device='cuda:0', grad_fn=<NllLossBackward0>)
18900 tensor(4.9809, device='cuda:0', grad_fn=<NllLossBackward0>)
19000 tensor(5.0202, device='cuda:0', grad_fn=<NllLossBackward0>)
19100 tensor(5.1329, device='cuda:0', grad_fn=<NllLossBackward0>)
19200 tensor(5.0460, device='cuda:0', grad_fn=<NllLossBackward0>)
19300 tensor(4.9174, device='cuda:0', grad_fn=<NllLossBackward0>)
19400 tensor(5.1266, device='cuda:0', grad_fn=<NllLossBackward0>)
19500 tensor(4.8903, device='cuda:0', grad_fn=<NllLossBackward0>)
19600 tensor(5.0548, device='cuda:0', grad_fn=<NllLossBackward0>)
19700 tensor(4.9530, device='cuda:0', grad_fn=<NllLossBackward0>)
19800 tensor(4.9296, device='cuda:0', grad_fn=<NllLossBackward0>)
19900 tensor(4.9925, device='cuda:0', grad_fn=<NllLossBackward0>)
20000 tensor(4.9181, device='cuda:0', grad_fn=<NllLossBackward0>)
20100 tensor(4.9487, device='cuda:0', grad_fn=<NllLossBackward0>)
20200 tensor(5.0580, device='cuda:0', grad_fn=<NllLossBackward0>)
20300 tensor(5.1110, device='cuda:0', grad_fn=<NllLossBackward0>)
20400 tensor(4.8053, device='cuda:0', grad_fn=<NllLossBackward0>)
20500 tensor(4.7658, device='cuda:0', grad_fn=<NllLossBackward0>)
20600 tensor(4.7387, device='cuda:0', grad_fn=<NllLossBackward0>)
20700 tensor(4.9779, device='cuda:0', grad_fn=<NllLossBackward0>)
20800 tensor(4.8901, device='cuda:0', grad_fn=<NllLossBackward0>)
20900 tensor(4.9092, device='cuda:0', grad_fn=<NllLossBackward0>)
21000 tensor(5.2856, device='cuda:0', grad_fn=<NllLossBackward0>)
21100 tensor(4.9803, device='cuda:0', grad_fn=<NllLossBackward0>)
21200 tensor(4.6889, device='cuda:0', grad_fn=<NllLossBackward0>)
21300 tensor(4.8434, device='cuda:0', grad_fn=<NllLossBackward0>)
21400 tensor(4.7451, device='cuda:0', grad_fn=<NllLossBackward0>)
21500 tensor(4.9406, device='cuda:0', grad_fn=<NllLossBackward0>)
21600 tensor(4.8431, device='cuda:0', grad_fn=<NllLossBackward0>)
21700 tensor(4.9932, device='cuda:0', grad_fn=<NllLossBackward0>)
21800 tensor(4.6696, device='cuda:0', grad_fn=<NllLossBackward0>)
21900 tensor(4.8091, device='cuda:0', grad_fn=<NllLossBackward0>)
22000 tensor(4.7533, device='cuda:0', grad_fn=<NllLossBackward0>)
22100 tensor(4.6842, device='cuda:0', grad_fn=<NllLossBackward0>)
22200 tensor(4.8844, device='cuda:0', grad_fn=<NllLossBackward0>)
22300 tensor(5.1038, device='cuda:0', grad_fn=<NllLossBackward0>)
22400 tensor(4.9929, device='cuda:0', grad_fn=<NllLossBackward0>)
22500 tensor(5.0109, device='cuda:0', grad_fn=<NllLossBackward0>)
22600 tensor(4.8278, device='cuda:0', grad_fn=<NllLossBackward0>)
22700 tensor(4.8597, device='cuda:0', grad_fn=<NllLossBackward0>)
22800 tensor(5.0256, device='cuda:0', grad_fn=<NllLossBackward0>)
22900 tensor(4.4663, device='cuda:0', grad_fn=<NllLossBackward0>)
23000 tensor(4.6069, device='cuda:0', grad_fn=<NllLossBackward0>)
23100 tensor(5.0816, device='cuda:0', grad_fn=<NllLossBackward0>)
23200 tensor(4.9038, device='cuda:0', grad_fn=<NllLossBackward0>)
23300 tensor(4.9284, device='cuda:0', grad_fn=<NllLossBackward0>)
23400 tensor(5.0439, device='cuda:0', grad_fn=<NllLossBackward0>)
23500 tensor(4.9640, device='cuda:0', grad_fn=<NllLossBackward0>)
23600 tensor(5.0096, device='cuda:0', grad_fn=<NllLossBackward0>)
23700 tensor(4.9700, device='cuda:0', grad_fn=<NllLossBackward0>)
23800 tensor(4.9461, device='cuda:0', grad_fn=<NllLossBackward0>)
23900 tensor(4.8171, device='cuda:0', grad_fn=<NllLossBackward0>)
24000 tensor(4.9529, device='cuda:0', grad_fn=<NllLossBackward0>)
24100 tensor(4.8525, device='cuda:0', grad_fn=<NllLossBackward0>)
24200 tensor(5.0488, device='cuda:0', grad_fn=<NllLossBackward0>)
24300 tensor(4.9206, device='cuda:0', grad_fn=<NllLossBackward0>)
24400 tensor(5.0900, device='cuda:0', grad_fn=<NllLossBackward0>)
24500 tensor(4.9484, device='cuda:0', grad_fn=<NllLossBackward0>)
24600 tensor(4.8962, device='cuda:0', grad_fn=<NllLossBackward0>)
24700 tensor(4.8884, device='cuda:0', grad_fn=<NllLossBackward0>)
24800 tensor(5.1541, device='cuda:0', grad_fn=<NllLossBackward0>)
24900 tensor(4.9803, device='cuda:0', grad_fn=<NllLossBackward0>)
25000 tensor(4.4473, device='cuda:0', grad_fn=<NllLossBackward0>)
25100 tensor(4.7330, device='cuda:0', grad_fn=<NllLossBackward0>)
25200 tensor(5.0709, device='cuda:0', grad_fn=<NllLossBackward0>)
25300 tensor(4.7139, device='cuda:0', grad_fn=<NllLossBackward0>)
25400 tensor(4.8961, device='cuda:0', grad_fn=<NllLossBackward0>)
25500 tensor(4.9459, device='cuda:0', grad_fn=<NllLossBackward0>)
25600 tensor(4.8840, device='cuda:0', grad_fn=<NllLossBackward0>)
25700 tensor(4.7792, device='cuda:0', grad_fn=<NllLossBackward0>)
25800 tensor(4.9212, device='cuda:0', grad_fn=<NllLossBackward0>)
25900 tensor(4.7168, device='cuda:0', grad_fn=<NllLossBackward0>)
26000 tensor(4.7903, device='cuda:0', grad_fn=<NllLossBackward0>)
26100 tensor(4.9544, device='cuda:0', grad_fn=<NllLossBackward0>)
26200 tensor(4.8421, device='cuda:0', grad_fn=<NllLossBackward0>)
26300 tensor(4.8085, device='cuda:0', grad_fn=<NllLossBackward0>)
26400 tensor(4.7129, device='cuda:0', grad_fn=<NllLossBackward0>)
26500 tensor(5.0808, device='cuda:0', grad_fn=<NllLossBackward0>)
26600 tensor(4.8222, device='cuda:0', grad_fn=<NllLossBackward0>)
26700 tensor(4.7982, device='cuda:0', grad_fn=<NllLossBackward0>)
26800 tensor(4.8482, device='cuda:0', grad_fn=<NllLossBackward0>)
26900 tensor(5.0815, device='cuda:0', grad_fn=<NllLossBackward0>)
27000 tensor(4.9754, device='cuda:0', grad_fn=<NllLossBackward0>)
27100 tensor(5.0156, device='cuda:0', grad_fn=<NllLossBackward0>)
27200 tensor(4.7985, device='cuda:0', grad_fn=<NllLossBackward0>)
27300 tensor(4.6372, device='cuda:0', grad_fn=<NllLossBackward0>)
27400 tensor(4.5098, device='cuda:0', grad_fn=<NllLossBackward0>)
27500 tensor(5.0427, device='cuda:0', grad_fn=<NllLossBackward0>)
27600 tensor(4.9139, device='cuda:0', grad_fn=<NllLossBackward0>)
27700 tensor(4.8924, device='cuda:0', grad_fn=<NllLossBackward0>)
27800 tensor(4.9972, device='cuda:0', grad_fn=<NllLossBackward0>)
27900 tensor(5.0452, device='cuda:0', grad_fn=<NllLossBackward0>)
28000 tensor(4.5323, device='cuda:0', grad_fn=<NllLossBackward0>)
28100 tensor(4.8945, device='cuda:0', grad_fn=<NllLossBackward0>)
28200 tensor(4.8096, device='cuda:0', grad_fn=<NllLossBackward0>)
28300 tensor(5.1238, device='cuda:0', grad_fn=<NllLossBackward0>)
28400 tensor(4.9879, device='cuda:0', grad_fn=<NllLossBackward0>)
28500 tensor(4.9505, device='cuda:0', grad_fn=<NllLossBackward0>)
28600 tensor(4.7750, device='cuda:0', grad_fn=<NllLossBackward0>)
28700 tensor(5.0738, device='cuda:0', grad_fn=<NllLossBackward0>)
28800 tensor(4.9318, device='cuda:0', grad_fn=<NllLossBackward0>)
28900 tensor(5.0403, device='cuda:0', grad_fn=<NllLossBackward0>)
29000 tensor(4.9072, device='cuda:0', grad_fn=<NllLossBackward0>)
29100 tensor(4.9822, device='cuda:0', grad_fn=<NllLossBackward0>)
29200 tensor(4.8701, device='cuda:0', grad_fn=<NllLossBackward0>)
29300 tensor(4.8883, device='cuda:0', grad_fn=<NllLossBackward0>)
29400 tensor(4.8906, device='cuda:0', grad_fn=<NllLossBackward0>)
29500 tensor(5.0658, device='cuda:0', grad_fn=<NllLossBackward0>)
29600 tensor(4.7604, device='cuda:0', grad_fn=<NllLossBackward0>)
29700 tensor(5.0792, device='cuda:0', grad_fn=<NllLossBackward0>)
29800 tensor(4.9074, device='cuda:0', grad_fn=<NllLossBackward0>)
29900 tensor(4.8845, device='cuda:0', grad_fn=<NllLossBackward0>)
30000 tensor(5.1969, device='cuda:0', grad_fn=<NllLossBackward0>)
30100 tensor(4.9648, device='cuda:0', grad_fn=<NllLossBackward0>)
30200 tensor(4.9086, device='cuda:0', grad_fn=<NllLossBackward0>)
30300 tensor(4.9708, device='cuda:0', grad_fn=<NllLossBackward0>)
30400 tensor(4.9155, device='cuda:0', grad_fn=<NllLossBackward0>)
30500 tensor(4.9404, device='cuda:0', grad_fn=<NllLossBackward0>)
30600 tensor(5.0224, device='cuda:0', grad_fn=<NllLossBackward0>)
30700 tensor(5.0298, device='cuda:0', grad_fn=<NllLossBackward0>)
30800 tensor(4.9557, device='cuda:0', grad_fn=<NllLossBackward0>)
30900 tensor(4.9653, device='cuda:0', grad_fn=<NllLossBackward0>)
31000 tensor(4.8938, device='cuda:0', grad_fn=<NllLossBackward0>)
31100 tensor(4.6689, device='cuda:0', grad_fn=<NllLossBackward0>)
31200 tensor(4.9757, device='cuda:0', grad_fn=<NllLossBackward0>)
31300 tensor(4.8805, device='cuda:0', grad_fn=<NllLossBackward0>)
31400 tensor(4.9969, device='cuda:0', grad_fn=<NllLossBackward0>)
31500 tensor(4.8262, device='cuda:0', grad_fn=<NllLossBackward0>)
31600 tensor(4.5519, device='cuda:0', grad_fn=<NllLossBackward0>)
31700 tensor(4.9185, device='cuda:0', grad_fn=<NllLossBackward0>)
31800 tensor(4.9190, device='cuda:0', grad_fn=<NllLossBackward0>)
31900 tensor(4.8702, device='cuda:0', grad_fn=<NllLossBackward0>)
32000 tensor(4.9346, device='cuda:0', grad_fn=<NllLossBackward0>)
32100 tensor(4.8963, device='cuda:0', grad_fn=<NllLossBackward0>)
32200 tensor(4.9017, device='cuda:0', grad_fn=<NllLossBackward0>)
32300 tensor(4.9595, device='cuda:0', grad_fn=<NllLossBackward0>)
32400 tensor(4.8125, device='cuda:0', grad_fn=<NllLossBackward0>)
32500 tensor(4.9593, device='cuda:0', grad_fn=<NllLossBackward0>)
32600 tensor(5.0663, device='cuda:0', grad_fn=<NllLossBackward0>)
32700 tensor(4.9644, device='cuda:0', grad_fn=<NllLossBackward0>)
32800 tensor(4.8500, device='cuda:0', grad_fn=<NllLossBackward0>)
32900 tensor(5.0070, device='cuda:0', grad_fn=<NllLossBackward0>)
33000 tensor(4.8131, device='cuda:0', grad_fn=<NllLossBackward0>)
33100 tensor(5.0183, device='cuda:0', grad_fn=<NllLossBackward0>)
33200 tensor(4.8692, device='cuda:0', grad_fn=<NllLossBackward0>)
33300 tensor(4.9145, device='cuda:0', grad_fn=<NllLossBackward0>)
33400 tensor(5.0221, device='cuda:0', grad_fn=<NllLossBackward0>)
33500 tensor(4.9636, device='cuda:0', grad_fn=<NllLossBackward0>)
33600 tensor(4.8758, device='cuda:0', grad_fn=<NllLossBackward0>)
33700 tensor(4.8713, device='cuda:0', grad_fn=<NllLossBackward0>)
33800 tensor(4.7325, device='cuda:0', grad_fn=<NllLossBackward0>)
33900 tensor(4.9829, device='cuda:0', grad_fn=<NllLossBackward0>)
34000 tensor(4.7823, device='cuda:0', grad_fn=<NllLossBackward0>)
34100 tensor(4.9773, device='cuda:0', grad_fn=<NllLossBackward0>)
34200 tensor(4.9638, device='cuda:0', grad_fn=<NllLossBackward0>)
34300 tensor(5.0311, device='cuda:0', grad_fn=<NllLossBackward0>)
34400 tensor(4.9491, device='cuda:0', grad_fn=<NllLossBackward0>)
34500 tensor(4.9527, device='cuda:0', grad_fn=<NllLossBackward0>)
34600 tensor(4.7559, device='cuda:0', grad_fn=<NllLossBackward0>)
34700 tensor(4.9602, device='cuda:0', grad_fn=<NllLossBackward0>)
34800 tensor(5.0363, device='cuda:0', grad_fn=<NllLossBackward0>)
34900 tensor(4.9509, device='cuda:0', grad_fn=<NllLossBackward0>)
35000 tensor(4.8740, device='cuda:0', grad_fn=<NllLossBackward0>)
35100 tensor(4.8790, device='cuda:0', grad_fn=<NllLossBackward0>)
35200 tensor(4.7886, device='cuda:0', grad_fn=<NllLossBackward0>)
35300 tensor(4.9939, device='cuda:0', grad_fn=<NllLossBackward0>)
35400 tensor(4.8046, device='cuda:0', grad_fn=<NllLossBackward0>)
35500 tensor(5.0125, device='cuda:0', grad_fn=<NllLossBackward0>)
35600 tensor(4.8254, device='cuda:0', grad_fn=<NllLossBackward0>)
35700 tensor(4.5858, device='cuda:0', grad_fn=<NllLossBackward0>)
35800 tensor(5.0067, device='cuda:0', grad_fn=<NllLossBackward0>)
35900 tensor(5.0505, device='cuda:0', grad_fn=<NllLossBackward0>)
36000 tensor(4.9909, device='cuda:0', grad_fn=<NllLossBackward0>)
36100 tensor(4.8610, device='cuda:0', grad_fn=<NllLossBackward0>)
36200 tensor(4.9135, device='cuda:0', grad_fn=<NllLossBackward0>)
36300 tensor(5.0409, device='cuda:0', grad_fn=<NllLossBackward0>)
36400 tensor(4.8932, device='cuda:0', grad_fn=<NllLossBackward0>)
36500 tensor(4.8384, device='cuda:0', grad_fn=<NllLossBackward0>)
36600 tensor(4.8262, device='cuda:0', grad_fn=<NllLossBackward0>)
36700 tensor(4.8363, device='cuda:0', grad_fn=<NllLossBackward0>)
36800 tensor(4.9260, device='cuda:0', grad_fn=<NllLossBackward0>)
36900 tensor(4.7176, device='cuda:0', grad_fn=<NllLossBackward0>)
37000 tensor(4.8836, device='cuda:0', grad_fn=<NllLossBackward0>)
37100 tensor(4.7659, device='cuda:0', grad_fn=<NllLossBackward0>)
37200 tensor(5.0418, device='cuda:0', grad_fn=<NllLossBackward0>)
37300 tensor(4.7165, device='cuda:0', grad_fn=<NllLossBackward0>)
37400 tensor(4.7707, device='cuda:0', grad_fn=<NllLossBackward0>)
37500 tensor(4.9404, device='cuda:0', grad_fn=<NllLossBackward0>)
37600 tensor(4.7666, device='cuda:0', grad_fn=<NllLossBackward0>)
37700 tensor(5.0086, device='cuda:0', grad_fn=<NllLossBackward0>)
37800 tensor(4.8929, device='cuda:0', grad_fn=<NllLossBackward0>)
37900 tensor(5.0537, device='cuda:0', grad_fn=<NllLossBackward0>)
38000 tensor(4.8494, device='cuda:0', grad_fn=<NllLossBackward0>)
38100 tensor(5.1193, device='cuda:0', grad_fn=<NllLossBackward0>)
38200 tensor(4.9035, device='cuda:0', grad_fn=<NllLossBackward0>)
38300 tensor(4.7574, device='cuda:0', grad_fn=<NllLossBackward0>)
38400 tensor(4.9181, device='cuda:0', grad_fn=<NllLossBackward0>)
38500 tensor(5.0186, device='cuda:0', grad_fn=<NllLossBackward0>)
38600 tensor(5.0224, device='cuda:0', grad_fn=<NllLossBackward0>)
38700 tensor(4.6032, device='cuda:0', grad_fn=<NllLossBackward0>)
38800 tensor(5.1368, device='cuda:0', grad_fn=<NllLossBackward0>)
38900 tensor(4.9394, device='cuda:0', grad_fn=<NllLossBackward0>)
39000 tensor(4.7891, device='cuda:0', grad_fn=<NllLossBackward0>)
39100 tensor(4.9718, device='cuda:0', grad_fn=<NllLossBackward0>)
39200 tensor(4.9599, device='cuda:0', grad_fn=<NllLossBackward0>)
39300 tensor(4.8518, device='cuda:0', grad_fn=<NllLossBackward0>)
39400 tensor(4.7832, device='cuda:0', grad_fn=<NllLossBackward0>)
39500 tensor(4.9827, device='cuda:0', grad_fn=<NllLossBackward0>)
39600 tensor(5.0733, device='cuda:0', grad_fn=<NllLossBackward0>)
39700 tensor(4.8859, device='cuda:0', grad_fn=<NllLossBackward0>)
39800 tensor(4.9722, device='cuda:0', grad_fn=<NllLossBackward0>)
39900 tensor(5.0568, device='cuda:0', grad_fn=<NllLossBackward0>)
40000 tensor(4.8251, device='cuda:0', grad_fn=<NllLossBackward0>)
40100 tensor(4.8720, device='cuda:0', grad_fn=<NllLossBackward0>)
40200 tensor(5.3066, device='cuda:0', grad_fn=<NllLossBackward0>)
40300 tensor(4.9435, device='cuda:0', grad_fn=<NllLossBackward0>)
40400 tensor(4.9634, device='cuda:0', grad_fn=<NllLossBackward0>)
40500 tensor(4.8406, device='cuda:0', grad_fn=<NllLossBackward0>)
40600 tensor(4.8050, device='cuda:0', grad_fn=<NllLossBackward0>)
40700 tensor(4.6578, device='cuda:0', grad_fn=<NllLossBackward0>)
40800 tensor(4.8490, device='cuda:0', grad_fn=<NllLossBackward0>)
40900 tensor(5.1542, device='cuda:0', grad_fn=<NllLossBackward0>)
41000 tensor(4.8509, device='cuda:0', grad_fn=<NllLossBackward0>)
41100 tensor(4.8082, device='cuda:0', grad_fn=<NllLossBackward0>)
41200 tensor(4.8444, device='cuda:0', grad_fn=<NllLossBackward0>)
41300 tensor(5.1602, device='cuda:0', grad_fn=<NllLossBackward0>)
41400 tensor(4.7235, device='cuda:0', grad_fn=<NllLossBackward0>)
41500 tensor(5.0334, device='cuda:0', grad_fn=<NllLossBackward0>)
41600 tensor(5.0500, device='cuda:0', grad_fn=<NllLossBackward0>)
41700 tensor(5.0378, device='cuda:0', grad_fn=<NllLossBackward0>)
41800 tensor(4.7989, device='cuda:0', grad_fn=<NllLossBackward0>)
41900 tensor(4.9342, device='cuda:0', grad_fn=<NllLossBackward0>)
42000 tensor(4.9981, device='cuda:0', grad_fn=<NllLossBackward0>)
42100 tensor(4.6723, device='cuda:0', grad_fn=<NllLossBackward0>)
42200 tensor(4.9382, device='cuda:0', grad_fn=<NllLossBackward0>)
42300 tensor(4.9237, device='cuda:0', grad_fn=<NllLossBackward0>)
42400 tensor(4.9302, device='cuda:0', grad_fn=<NllLossBackward0>)
42500 tensor(4.8494, device='cuda:0', grad_fn=<NllLossBackward0>)
42600 tensor(4.9942, device='cuda:0', grad_fn=<NllLossBackward0>)
42700 tensor(4.9581, device='cuda:0', grad_fn=<NllLossBackward0>)
42800 tensor(4.8044, device='cuda:0', grad_fn=<NllLossBackward0>)
42900 tensor(5.0890, device='cuda:0', grad_fn=<NllLossBackward0>)
43000 tensor(4.9422, device='cuda:0', grad_fn=<NllLossBackward0>)
43100 tensor(5.0014, device='cuda:0', grad_fn=<NllLossBackward0>)
43200 tensor(4.9001, device='cuda:0', grad_fn=<NllLossBackward0>)
43300 tensor(4.9133, device='cuda:0', grad_fn=<NllLossBackward0>)
43400 tensor(4.8836, device='cuda:0', grad_fn=<NllLossBackward0>)
43500 tensor(4.8232, device='cuda:0', grad_fn=<NllLossBackward0>)
43600 tensor(4.8052, device='cuda:0', grad_fn=<NllLossBackward0>)
43700 tensor(5.0304, device='cuda:0', grad_fn=<NllLossBackward0>)
43800 tensor(5.0834, device='cuda:0', grad_fn=<NllLossBackward0>)
43900 tensor(4.8242, device='cuda:0', grad_fn=<NllLossBackward0>)
44000 tensor(4.8126, device='cuda:0', grad_fn=<NllLossBackward0>)
44100 tensor(4.7836, device='cuda:0', grad_fn=<NllLossBackward0>)
44200 tensor(5.0763, device='cuda:0', grad_fn=<NllLossBackward0>)
44300 tensor(5.0682, device='cuda:0', grad_fn=<NllLossBackward0>)
44400 tensor(4.8869, device='cuda:0', grad_fn=<NllLossBackward0>)
44500 tensor(4.8527, device='cuda:0', grad_fn=<NllLossBackward0>)
44600 tensor(4.8439, device='cuda:0', grad_fn=<NllLossBackward0>)
44700 tensor(4.9127, device='cuda:0', grad_fn=<NllLossBackward0>)
44800 tensor(4.9628, device='cuda:0', grad_fn=<NllLossBackward0>)
44900 tensor(5.0566, device='cuda:0', grad_fn=<NllLossBackward0>)
45000 tensor(5.0596, device='cuda:0', grad_fn=<NllLossBackward0>)
45100 tensor(5.1187, device='cuda:0', grad_fn=<NllLossBackward0>)
45200 tensor(5.0824, device='cuda:0', grad_fn=<NllLossBackward0>)
45300 tensor(4.8433, device='cuda:0', grad_fn=<NllLossBackward0>)
45400 tensor(4.7299, device='cuda:0', grad_fn=<NllLossBackward0>)
45500 tensor(5.1722, device='cuda:0', grad_fn=<NllLossBackward0>)
45600 tensor(4.7867, device='cuda:0', grad_fn=<NllLossBackward0>)
45700 tensor(4.9631, device='cuda:0', grad_fn=<NllLossBackward0>)
45800 tensor(4.6216, device='cuda:0', grad_fn=<NllLossBackward0>)
45900 tensor(4.9601, device='cuda:0', grad_fn=<NllLossBackward0>)
46000 tensor(4.9055, device='cuda:0', grad_fn=<NllLossBackward0>)
46100 tensor(5.0517, device='cuda:0', grad_fn=<NllLossBackward0>)
46200 tensor(5.0099, device='cuda:0', grad_fn=<NllLossBackward0>)
46300 tensor(4.8178, device='cuda:0', grad_fn=<NllLossBackward0>)
46400 tensor(4.9317, device='cuda:0', grad_fn=<NllLossBackward0>)
46500 tensor(4.8770, device='cuda:0', grad_fn=<NllLossBackward0>)
46600 tensor(4.9668, device='cuda:0', grad_fn=<NllLossBackward0>)
46700 tensor(5.1287, device='cuda:0', grad_fn=<NllLossBackward0>)
46800 tensor(4.9050, device='cuda:0', grad_fn=<NllLossBackward0>)
46900 tensor(4.9622, device='cuda:0', grad_fn=<NllLossBackward0>)
47000 tensor(4.6818, device='cuda:0', grad_fn=<NllLossBackward0>)
47100 tensor(4.8780, device='cuda:0', grad_fn=<NllLossBackward0>)
47200 tensor(4.9493, device='cuda:0', grad_fn=<NllLossBackward0>)
47300 tensor(4.7958, device='cuda:0', grad_fn=<NllLossBackward0>)
47400 tensor(4.5415, device='cuda:0', grad_fn=<NllLossBackward0>)
47500 tensor(5.0651, device='cuda:0', grad_fn=<NllLossBackward0>)
47600 tensor(4.9692, device='cuda:0', grad_fn=<NllLossBackward0>)
47700 tensor(4.8536, device='cuda:0', grad_fn=<NllLossBackward0>)
47800 tensor(4.7306, device='cuda:0', grad_fn=<NllLossBackward0>)
47900 tensor(5.1795, device='cuda:0', grad_fn=<NllLossBackward0>)
48000 tensor(4.9196, device='cuda:0', grad_fn=<NllLossBackward0>)
48100 tensor(5.1446, device='cuda:0', grad_fn=<NllLossBackward0>)
48200 tensor(4.9810, device='cuda:0', grad_fn=<NllLossBackward0>)
48300 tensor(4.9688, device='cuda:0', grad_fn=<NllLossBackward0>)
48400 tensor(5.0246, device='cuda:0', grad_fn=<NllLossBackward0>)
48500 tensor(4.7523, device='cuda:0', grad_fn=<NllLossBackward0>)
48600 tensor(4.7716, device='cuda:0', grad_fn=<NllLossBackward0>)
48700 tensor(4.8938, device='cuda:0', grad_fn=<NllLossBackward0>)
48800 tensor(4.9324, device='cuda:0', grad_fn=<NllLossBackward0>)
48900 tensor(4.9811, device='cuda:0', grad_fn=<NllLossBackward0>)
49000 tensor(4.8818, device='cuda:0', grad_fn=<NllLossBackward0>)
49100 tensor(4.9871, device='cuda:0', grad_fn=<NllLossBackward0>)
49200 tensor(4.8498, device='cuda:0', grad_fn=<NllLossBackward0>)
49300 tensor(4.8027, device='cuda:0', grad_fn=<NllLossBackward0>)
49400 tensor(5.0199, device='cuda:0', grad_fn=<NllLossBackward0>)
49500 tensor(4.9790, device='cuda:0', grad_fn=<NllLossBackward0>)
49600 tensor(5.0995, device='cuda:0', grad_fn=<NllLossBackward0>)
49700 tensor(4.8989, device='cuda:0', grad_fn=<NllLossBackward0>)
49800 tensor(4.8903, device='cuda:0', grad_fn=<NllLossBackward0>)
49900 tensor(4.6744, device='cuda:0', grad_fn=<NllLossBackward0>)
50000 tensor(4.9403, device='cuda:0', grad_fn=<NllLossBackward0>)
50100 tensor(4.7815, device='cuda:0', grad_fn=<NllLossBackward0>)
50200 tensor(4.8617, device='cuda:0', grad_fn=<NllLossBackward0>)
50300 tensor(4.4559, device='cuda:0', grad_fn=<NllLossBackward0>)
50400 tensor(5.0322, device='cuda:0', grad_fn=<NllLossBackward0>)
50500 tensor(4.6867, device='cuda:0', grad_fn=<NllLossBackward0>)
50600 tensor(4.9644, device='cuda:0', grad_fn=<NllLossBackward0>)
50700 tensor(5.0631, device='cuda:0', grad_fn=<NllLossBackward0>)
50800 tensor(4.7992, device='cuda:0', grad_fn=<NllLossBackward0>)
50900 tensor(4.9346, device='cuda:0', grad_fn=<NllLossBackward0>)
51000 tensor(4.6487, device='cuda:0', grad_fn=<NllLossBackward0>)
51100 tensor(4.8758, device='cuda:0', grad_fn=<NllLossBackward0>)
51200 tensor(5.0734, device='cuda:0', grad_fn=<NllLossBackward0>)
51300 tensor(4.8078, device='cuda:0', grad_fn=<NllLossBackward0>)
51400 tensor(4.7628, device='cuda:0', grad_fn=<NllLossBackward0>)
51500 tensor(4.8508, device='cuda:0', grad_fn=<NllLossBackward0>)
51600 tensor(4.8231, device='cuda:0', grad_fn=<NllLossBackward0>)
51700 tensor(5.0122, device='cuda:0', grad_fn=<NllLossBackward0>)
51800 tensor(4.8941, device='cuda:0', grad_fn=<NllLossBackward0>)
51900 tensor(5.0284, device='cuda:0', grad_fn=<NllLossBackward0>)
52000 tensor(4.9158, device='cuda:0', grad_fn=<NllLossBackward0>)
52100 tensor(4.8752, device='cuda:0', grad_fn=<NllLossBackward0>)
52200 tensor(4.7020, device='cuda:0', grad_fn=<NllLossBackward0>)
52300 tensor(4.6001, device='cuda:0', grad_fn=<NllLossBackward0>)
52400 tensor(4.7898, device='cuda:0', grad_fn=<NllLossBackward0>)
52500 tensor(4.8255, device='cuda:0', grad_fn=<NllLossBackward0>)
52600 tensor(4.7331, device='cuda:0', grad_fn=<NllLossBackward0>)
52700 tensor(4.8546, device='cuda:0', grad_fn=<NllLossBackward0>)
52800 tensor(4.9418, device='cuda:0', grad_fn=<NllLossBackward0>)
52900 tensor(4.7536, device='cuda:0', grad_fn=<NllLossBackward0>)
53000 tensor(4.9609, device='cuda:0', grad_fn=<NllLossBackward0>)
53100 tensor(5.0644, device='cuda:0', grad_fn=<NllLossBackward0>)
53200 tensor(4.8919, device='cuda:0', grad_fn=<NllLossBackward0>)
53300 tensor(4.7840, device='cuda:0', grad_fn=<NllLossBackward0>)
53400 tensor(4.8539, device='cuda:0', grad_fn=<NllLossBackward0>)
53500 tensor(4.8023, device='cuda:0', grad_fn=<NllLossBackward0>)
53600 tensor(4.9810, device='cuda:0', grad_fn=<NllLossBackward0>)
53700 tensor(4.9946, device='cuda:0', grad_fn=<NllLossBackward0>)
53800 tensor(4.3504, device='cuda:0', grad_fn=<NllLossBackward0>)
53900 tensor(4.8656, device='cuda:0', grad_fn=<NllLossBackward0>)
54000 tensor(5.0103, device='cuda:0', grad_fn=<NllLossBackward0>)
54100 tensor(4.8503, device='cuda:0', grad_fn=<NllLossBackward0>)
54200 tensor(4.9970, device='cuda:0', grad_fn=<NllLossBackward0>)
54300 tensor(4.5719, device='cuda:0', grad_fn=<NllLossBackward0>)
54400 tensor(4.7891, device='cuda:0', grad_fn=<NllLossBackward0>)
54500 tensor(4.8968, device='cuda:0', grad_fn=<NllLossBackward0>)
54600 tensor(5.0036, device='cuda:0', grad_fn=<NllLossBackward0>)
54700 tensor(4.9487, device='cuda:0', grad_fn=<NllLossBackward0>)
54800 tensor(4.8477, device='cuda:0', grad_fn=<NllLossBackward0>)
54900 tensor(4.9253, device='cuda:0', grad_fn=<NllLossBackward0>)
55000 tensor(4.9079, device='cuda:0', grad_fn=<NllLossBackward0>)
55100 tensor(4.9499, device='cuda:0', grad_fn=<NllLossBackward0>)
55200 tensor(5.0510, device='cuda:0', grad_fn=<NllLossBackward0>)
55300 tensor(4.9320, device='cuda:0', grad_fn=<NllLossBackward0>)
55400 tensor(4.5737, device='cuda:0', grad_fn=<NllLossBackward0>)
55500 tensor(4.7703, device='cuda:0', grad_fn=<NllLossBackward0>)
55600 tensor(5.0166, device='cuda:0', grad_fn=<NllLossBackward0>)
55700 tensor(4.9049, device='cuda:0', grad_fn=<NllLossBackward0>)
55800 tensor(4.7355, device='cuda:0', grad_fn=<NllLossBackward0>)
55900 tensor(4.5776, device='cuda:0', grad_fn=<NllLossBackward0>)
56000 tensor(4.9919, device='cuda:0', grad_fn=<NllLossBackward0>)
56100 tensor(4.8629, device='cuda:0', grad_fn=<NllLossBackward0>)
56200 tensor(5.0123, device='cuda:0', grad_fn=<NllLossBackward0>)
56300 tensor(4.3110, device='cuda:0', grad_fn=<NllLossBackward0>)
56400 tensor(4.8950, device='cuda:0', grad_fn=<NllLossBackward0>)
56500 tensor(4.8415, device='cuda:0', grad_fn=<NllLossBackward0>)
56600 tensor(4.7285, device='cuda:0', grad_fn=<NllLossBackward0>)
56700 tensor(4.8401, device='cuda:0', grad_fn=<NllLossBackward0>)
56800 tensor(4.7972, device='cuda:0', grad_fn=<NllLossBackward0>)
56900 tensor(4.7398, device='cuda:0', grad_fn=<NllLossBackward0>)
57000 tensor(5.1683, device='cuda:0', grad_fn=<NllLossBackward0>)
57100 tensor(4.9399, device='cuda:0', grad_fn=<NllLossBackward0>)
57200 tensor(4.9609, device='cuda:0', grad_fn=<NllLossBackward0>)
57300 tensor(4.9818, device='cuda:0', grad_fn=<NllLossBackward0>)
57400 tensor(4.9719, device='cuda:0', grad_fn=<NllLossBackward0>)
57500 tensor(4.8724, device='cuda:0', grad_fn=<NllLossBackward0>)
57600 tensor(4.9824, device='cuda:0', grad_fn=<NllLossBackward0>)
57700 tensor(5.0357, device='cuda:0', grad_fn=<NllLossBackward0>)
57800 tensor(5.0542, device='cuda:0', grad_fn=<NllLossBackward0>)
57900 tensor(4.8753, device='cuda:0', grad_fn=<NllLossBackward0>)
58000 tensor(4.7773, device='cuda:0', grad_fn=<NllLossBackward0>)
58100 tensor(4.7864, device='cuda:0', grad_fn=<NllLossBackward0>)
58200 tensor(4.8033, device='cuda:0', grad_fn=<NllLossBackward0>)
58300 tensor(4.9997, device='cuda:0', grad_fn=<NllLossBackward0>)
58400 tensor(4.9701, device='cuda:0', grad_fn=<NllLossBackward0>)
58500 tensor(4.8920, device='cuda:0', grad_fn=<NllLossBackward0>)
58600 tensor(4.9408, device='cuda:0', grad_fn=<NllLossBackward0>)
58700 tensor(5.1013, device='cuda:0', grad_fn=<NllLossBackward0>)
58800 tensor(4.8176, device='cuda:0', grad_fn=<NllLossBackward0>)
58900 tensor(4.7466, device='cuda:0', grad_fn=<NllLossBackward0>)
59000 tensor(4.9146, device='cuda:0', grad_fn=<NllLossBackward0>)
59100 tensor(4.8151, device='cuda:0', grad_fn=<NllLossBackward0>)
59200 tensor(4.9928, device='cuda:0', grad_fn=<NllLossBackward0>)
59300 tensor(5.0274, device='cuda:0', grad_fn=<NllLossBackward0>)
59400 tensor(4.7727, device='cuda:0', grad_fn=<NllLossBackward0>)
59500 tensor(5.0648, device='cuda:0', grad_fn=<NllLossBackward0>)
59600 tensor(4.9982, device='cuda:0', grad_fn=<NllLossBackward0>)
59700 tensor(4.8934, device='cuda:0', grad_fn=<NllLossBackward0>)
59800 tensor(4.8285, device='cuda:0', grad_fn=<NllLossBackward0>)
59900 tensor(4.8039, device='cuda:0', grad_fn=<NllLossBackward0>)
60000 tensor(4.9090, device='cuda:0', grad_fn=<NllLossBackward0>)
60100 tensor(4.6927, device='cuda:0', grad_fn=<NllLossBackward0>)
60200 tensor(4.8922, device='cuda:0', grad_fn=<NllLossBackward0>)
60300 tensor(4.8804, device='cuda:0', grad_fn=<NllLossBackward0>)
60400 tensor(4.9676, device='cuda:0', grad_fn=<NllLossBackward0>)
60500 tensor(4.7234, device='cuda:0', grad_fn=<NllLossBackward0>)
60600 tensor(4.9174, device='cuda:0', grad_fn=<NllLossBackward0>)
60700 tensor(4.9062, device='cuda:0', grad_fn=<NllLossBackward0>)
60800 tensor(5.0811, device='cuda:0', grad_fn=<NllLossBackward0>)
60900 tensor(5.1713, device='cuda:0', grad_fn=<NllLossBackward0>)
61000 tensor(4.9471, device='cuda:0', grad_fn=<NllLossBackward0>)
61100 tensor(4.8106, device='cuda:0', grad_fn=<NllLossBackward0>)
61200 tensor(4.8666, device='cuda:0', grad_fn=<NllLossBackward0>)
61300 tensor(4.8624, device='cuda:0', grad_fn=<NllLossBackward0>)
61400 tensor(4.5771, device='cuda:0', grad_fn=<NllLossBackward0>)
61500 tensor(4.8186, device='cuda:0', grad_fn=<NllLossBackward0>)
61600 tensor(4.7787, device='cuda:0', grad_fn=<NllLossBackward0>)
61700 tensor(4.9245, device='cuda:0', grad_fn=<NllLossBackward0>)
61800 tensor(5.0268, device='cuda:0', grad_fn=<NllLossBackward0>)
61900 tensor(5.2582, device='cuda:0', grad_fn=<NllLossBackward0>)
62000 tensor(4.8309, device='cuda:0', grad_fn=<NllLossBackward0>)
62100 tensor(4.9982, device='cuda:0', grad_fn=<NllLossBackward0>)
62200 tensor(4.8859, device='cuda:0', grad_fn=<NllLossBackward0>)
62300 tensor(4.5051, device='cuda:0', grad_fn=<NllLossBackward0>)
62400 tensor(4.6767, device='cuda:0', grad_fn=<NllLossBackward0>)
62500 tensor(4.7197, device='cuda:0', grad_fn=<NllLossBackward0>)
62600 tensor(4.6625, device='cuda:0', grad_fn=<NllLossBackward0>)
62700 tensor(4.6548, device='cuda:0', grad_fn=<NllLossBackward0>)
62800 tensor(4.7307, device='cuda:0', grad_fn=<NllLossBackward0>)
62900 tensor(4.9550, device='cuda:0', grad_fn=<NllLossBackward0>)
63000 tensor(4.5528, device='cuda:0', grad_fn=<NllLossBackward0>)
63100 tensor(4.8676, device='cuda:0', grad_fn=<NllLossBackward0>)
63200 tensor(4.9302, device='cuda:0', grad_fn=<NllLossBackward0>)
63300 tensor(4.8878, device='cuda:0', grad_fn=<NllLossBackward0>)
63400 tensor(4.9172, device='cuda:0', grad_fn=<NllLossBackward0>)
63500 tensor(4.7881, device='cuda:0', grad_fn=<NllLossBackward0>)
63600 tensor(4.8712, device='cuda:0', grad_fn=<NllLossBackward0>)
63700 tensor(4.9398, device='cuda:0', grad_fn=<NllLossBackward0>)
63800 tensor(4.9999, device='cuda:0', grad_fn=<NllLossBackward0>)
63900 tensor(4.8581, device='cuda:0', grad_fn=<NllLossBackward0>)
64000 tensor(4.6726, device='cuda:0', grad_fn=<NllLossBackward0>)
64100 tensor(5.0308, device='cuda:0', grad_fn=<NllLossBackward0>)
64200 tensor(4.7130, device='cuda:0', grad_fn=<NllLossBackward0>)
64300 tensor(4.9586, device='cuda:0', grad_fn=<NllLossBackward0>)
64400 tensor(4.9456, device='cuda:0', grad_fn=<NllLossBackward0>)
64500 tensor(4.8030, device='cuda:0', grad_fn=<NllLossBackward0>)
64600 tensor(4.9885, device='cuda:0', grad_fn=<NllLossBackward0>)
64700 tensor(4.9439, device='cuda:0', grad_fn=<NllLossBackward0>)
64800 tensor(4.6348, device='cuda:0', grad_fn=<NllLossBackward0>)
64900 tensor(4.8772, device='cuda:0', grad_fn=<NllLossBackward0>)
65000 tensor(4.9567, device='cuda:0', grad_fn=<NllLossBackward0>)
65100 tensor(4.9036, device='cuda:0', grad_fn=<NllLossBackward0>)
65200 tensor(4.7526, device='cuda:0', grad_fn=<NllLossBackward0>)
65300 tensor(4.9206, device='cuda:0', grad_fn=<NllLossBackward0>)
65400 tensor(4.8406, device='cuda:0', grad_fn=<NllLossBackward0>)
65500 tensor(4.5461, device='cuda:0', grad_fn=<NllLossBackward0>)
65600 tensor(4.9647, device='cuda:0', grad_fn=<NllLossBackward0>)
65700 tensor(4.9128, device='cuda:0', grad_fn=<NllLossBackward0>)
65800 tensor(4.8554, device='cuda:0', grad_fn=<NllLossBackward0>)
65900 tensor(4.8749, device='cuda:0', grad_fn=<NllLossBackward0>)
66000 tensor(5.1345, device='cuda:0', grad_fn=<NllLossBackward0>)
66100 tensor(4.6254, device='cuda:0', grad_fn=<NllLossBackward0>)
66200 tensor(4.9932, device='cuda:0', grad_fn=<NllLossBackward0>)
66300 tensor(4.5778, device='cuda:0', grad_fn=<NllLossBackward0>)
66400 tensor(4.7925, device='cuda:0', grad_fn=<NllLossBackward0>)
66500 tensor(4.9761, device='cuda:0', grad_fn=<NllLossBackward0>)
66600 tensor(4.9166, device='cuda:0', grad_fn=<NllLossBackward0>)
66700 tensor(4.8186, device='cuda:0', grad_fn=<NllLossBackward0>)
66800 tensor(4.9063, device='cuda:0', grad_fn=<NllLossBackward0>)
66900 tensor(4.9770, device='cuda:0', grad_fn=<NllLossBackward0>)
67000 tensor(4.8087, device='cuda:0', grad_fn=<NllLossBackward0>)
67100 tensor(4.7366, device='cuda:0', grad_fn=<NllLossBackward0>)
67200 tensor(5.0656, device='cuda:0', grad_fn=<NllLossBackward0>)
67300 tensor(4.9718, device='cuda:0', grad_fn=<NllLossBackward0>)
67400 tensor(4.8172, device='cuda:0', grad_fn=<NllLossBackward0>)
67500 tensor(4.9368, device='cuda:0', grad_fn=<NllLossBackward0>)
67600 tensor(4.9278, device='cuda:0', grad_fn=<NllLossBackward0>)
67700 tensor(4.8133, device='cuda:0', grad_fn=<NllLossBackward0>)
67800 tensor(4.9486, device='cuda:0', grad_fn=<NllLossBackward0>)
67900 tensor(4.8521, device='cuda:0', grad_fn=<NllLossBackward0>)
68000 tensor(4.9510, device='cuda:0', grad_fn=<NllLossBackward0>)
68100 tensor(4.8939, device='cuda:0', grad_fn=<NllLossBackward0>)
68200 tensor(4.8088, device='cuda:0', grad_fn=<NllLossBackward0>)
68300 tensor(4.9821, device='cuda:0', grad_fn=<NllLossBackward0>)
68400 tensor(5.1750, device='cuda:0', grad_fn=<NllLossBackward0>)
68500 tensor(4.6476, device='cuda:0', grad_fn=<NllLossBackward0>)
68600 tensor(4.8567, device='cuda:0', grad_fn=<NllLossBackward0>)
68700 tensor(4.8663, device='cuda:0', grad_fn=<NllLossBackward0>)
68800 tensor(5.0268, device='cuda:0', grad_fn=<NllLossBackward0>)
68900 tensor(4.8717, device='cuda:0', grad_fn=<NllLossBackward0>)
69000 tensor(4.9166, device='cuda:0', grad_fn=<NllLossBackward0>)
69100 tensor(4.9094, device='cuda:0', grad_fn=<NllLossBackward0>)
69200 tensor(4.7433, device='cuda:0', grad_fn=<NllLossBackward0>)
69300 tensor(4.5366, device='cuda:0', grad_fn=<NllLossBackward0>)
69400 tensor(5.0260, device='cuda:0', grad_fn=<NllLossBackward0>)
69500 tensor(4.7304, device='cuda:0', grad_fn=<NllLossBackward0>)
import torch
# Sanity check: report whether PyTorch can see a CUDA device in this runtime.
torch.cuda.is_available()
True
# Persist only the learned parameters (the state_dict), not the module object.
torch.save(model.state_dict(), path_to_model)

# Rebuild the architecture from scratch and restore the saved weights into it.
model = SimpleBigramNeuralLanguageModel(vocab_size, embed_size).to(device)
# map_location remaps the stored tensors onto `device`, so a checkpoint that
# was written from a GPU still loads when this cell is re-run on a CPU-only
# runtime (without it torch.load tries to restore to the original device).
model.load_state_dict(torch.load(path_to_model, map_location=device))
# Switch the module to evaluation mode before it is used for inference.
model.eval()
SimpleBigramNeuralLanguageModel(
  (model): Sequential(
    (0): Embedding(10000, 100)
    (1): Linear(in_features=100, out_features=10000, bias=True)
    (2): Softmax(dim=None)
  )
)
# Run the notebook's save_outs helper for the 'dev-0' split.
# NOTE(review): presumably writes the model's predictions for that directory —
# confirm against the save_outs definition earlier in the notebook.
save_outs('dev-0')
Creating outputs in dev-0
# Run the notebook's save_outs helper for the 'test-A' split.
# NOTE(review): presumably writes the model's predictions for that directory —
# confirm against the save_outs definition earlier in the notebook.
save_outs('test-A')
Creating outputs in test-A