challenging-america-word-ga.../07_bigram_regression.ipynb


Exercise 1

Train a simple bigram language model based on logistic regression (as presented in the lecture).

from itertools import islice
import regex as re
import sys
from torchtext.vocab import build_vocab_from_iterator


def get_words_from_line(line):
    """Tokenize a line into lowercased word/punctuation tokens, wrapped in sentence markers."""
    line = line.rstrip()
    yield '<s>'
    for m in re.finditer(r'[\p{L}0-9\*]+|\p{P}+', line):
        yield m.group(0).lower()
    yield '</s>'


def get_word_lines_from_file(file_name):
    """Yield a token generator for every line of the file."""
    with open(file_name, 'r', encoding='utf-8') as fh:
        for line in fh:
            yield get_words_from_line(line)
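A quick sanity check of the tokenizer on a made-up sentence (illustrative only):

list(get_words_from_line('Welcome to the show!'))
# ['<s>', 'welcome', 'to', 'the', 'show', '!', '</s>']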

vocab_size = 20000

vocab = build_vocab_from_iterator(
    get_word_lines_from_file('test-A/in.tsv'),
    max_tokens = vocab_size,
    specials = ['<unk>'])
vocab['welcome']
3798
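The mapping can be inverted with lookup_token (the same Vocab API whose lookup_tokens variant is used later for the top-k predictions):

vocab.lookup_token(3798)
# 'welcome'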
from torch import nn
import torch

embed_size = 100

class SimpleBigramNeuralLanguageModel(nn.Module):
    def __init__(self, vocabulary_size, embedding_size):
        super().__init__()
        # previous-word embedding -> linear projection -> softmax distribution over the next word
        self.model = nn.Sequential(
            nn.Embedding(vocabulary_size, embedding_size),
            nn.Linear(embedding_size, vocabulary_size),
            nn.Softmax(dim=-1)
        )

    def forward(self, x):
        return self.model(x)

model = SimpleBigramNeuralLanguageModel(vocab_size, embed_size)
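The model is small: a 20000×100 embedding matrix, a 100×20000 output matrix, and a 20000-dimensional bias, about 4.02M parameters in total. This can be checked directly:

sum(p.numel() for p in model.parameters())
# 20000*100 + 100*20000 + 20000 = 4020000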

vocab.set_default_index(vocab['<unk>'])
ixs = torch.tensor(vocab.forward(['welcome']))
out = model(ixs)
out[0][vocab['to']]
tensor(5.5038e-05, grad_fn=<SelectBackward0>)
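Before training the softmax output is close to uniform, so every probability is roughly 1/vocab_size, which matches the value above:

1 / vocab_size
# 5e-05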
from torch.utils.data import IterableDataset
import itertools

def look_ahead_iterator(gen):
    """Turn a stream of tokens into consecutive (previous, current) pairs."""
    prev = None
    for item in gen:
        if prev is not None:
            yield (prev, item)
        prev = item
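On a toy sequence this yields consecutive (previous, current) pairs:

list(look_ahead_iterator([1, 2, 3, 4]))
# [(1, 2), (2, 3), (3, 4)]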

class Bigrams(IterableDataset):
    def __init__(self, text_file, vocabulary_size):
        self.vocab = build_vocab_from_iterator(
            get_word_lines_from_file(text_file),
            max_tokens=vocabulary_size,
            specials=['<unk>'])
        self.vocab.set_default_index(self.vocab['<unk>'])
        self.vocabulary_size = vocabulary_size
        self.text_file = text_file

    def __iter__(self):
        # flatten the corpus into one token stream, map tokens to indices, pair each with its successor
        return look_ahead_iterator(
            (self.vocab[t] for t in itertools.chain.from_iterable(get_word_lines_from_file(self.text_file))))
train_dataset = Bigrams('test-A/in.tsv', vocab_size)
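A few training pairs can be previewed with islice (the exact indices depend on the corpus):

list(islice(iter(train_dataset), 5))
# first five (previous, current) index pairs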
from torch.utils.data import DataLoader

device = 'cpu'  # change to 'cuda' if a GPU is available
model = SimpleBigramNeuralLanguageModel(vocab_size, embed_size).to(device)
data = DataLoader(train_dataset, batch_size=5000)
optimizer = torch.optim.Adam(model.parameters())
criterion = torch.nn.NLLLoss()

model.train()
step = 0
for x, y in data:
    x = x.to(device)
    y = y.to(device)
    optimizer.zero_grad()
    ypredicted = model(x)
    # the model already outputs probabilities, so take the log before NLLLoss
    loss = criterion(torch.log(ypredicted), y)
    if step % 100 == 0:
        print(step, loss)
    step += 1
    loss.backward()
    optimizer.step()

torch.save(model.state_dict(), 'model1.bin')
0 tensor(10.0928, grad_fn=<NllLossBackward0>)
100 tensor(8.4572, grad_fn=<NllLossBackward0>)
200 tensor(7.6165, grad_fn=<NllLossBackward0>)
300 tensor(6.9356, grad_fn=<NllLossBackward0>)
400 tensor(6.5687, grad_fn=<NllLossBackward0>)
500 tensor(6.2197, grad_fn=<NllLossBackward0>)
KeyboardInterrupt (training was interrupted manually after ~500 steps)
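The printed cross-entropy can be read as perplexity: the last value (~6.22) corresponds to exp(6.22) ≈ 500 equally likely next words on average. A quick way to report it after training:

import math
print('perplexity:', math.exp(loss.item()))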
device = 'cpu'  # change to 'cuda' if a GPU is available
model = SimpleBigramNeuralLanguageModel(vocab_size, embed_size).to(device)
# model.load_state_dict(torch.load('model1.bin'))  # uncomment to evaluate the trained weights
model.eval()

ixs = torch.tensor(vocab.forward(['welcome'])).to(device)

out = model(ixs)
top = torch.topk(out[0], 10)
top_indices = top.indices.tolist()
top_probs = top.values.tolist()
top_words = vocab.lookup_tokens(top_indices)
list(zip(top_words, top_indices, top_probs))
[('liquid', 6933, 0.0004737793351523578),
 ('bia', 5842, 0.00043268679291941226),
 ('sole', 6386, 0.0004295798426028341),
 ('nmeant', 17711, 0.00034942160709761083),
 ('savs', 16709, 0.00034736539237201214),
 ('striving', 12414, 0.0003441996523179114),
 ('nol', 2640, 0.00032789510441944003),
 ('imposing', 8457, 0.0003199590719304979),
 ('hound', 17348, 0.00031824613688513637),
 ('?"\\\\', 4294, 0.0003141215711366385)]
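Since the load_state_dict line above is commented out, these predictions come from a freshly initialised model and are essentially uniform noise; after loading model1.bin the list should be dominated by plausible continuations of 'welcome'. To pick only the single most likely next word:

best = torch.argmax(out[0]).item()
vocab.lookup_token(best)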