challenging-america-word-ga.../run.ipynb
Łukasz Jędyk a138439024 solution v2

import torch
from torch import nn

vocab_size = 20000
embed_size = 100

class SimpleTrigramNeuralLanguageModel(nn.Module):
    """Despite the name, this is a bigram model: it predicts the next
    word from a single previous word (embedding -> linear -> softmax)."""

    def __init__(self, vocabulary_size, embedding_size):
        super(SimpleTrigramNeuralLanguageModel, self).__init__()
        self.embedding = nn.Embedding(vocabulary_size, embedding_size)
        self.linear = nn.Linear(embedding_size, vocabulary_size)

    def forward(self, x):
        x = self.embedding(x)        # (batch,) -> (batch, embedding_size)
        x = self.linear(x)           # (batch, embedding_size) -> (batch, vocabulary_size)
        x = torch.softmax(x, dim=1)  # probabilities over the vocabulary
        return x
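A quick shape check (a sketch, not part of the original notebook): a batch of previous-word indices comes back as one probability distribution over the vocabulary per example.

m = SimpleTrigramNeuralLanguageModel(vocab_size, embed_size)
dummy = torch.randint(0, vocab_size, (4,))  # a batch of 4 previous-word indices
out = m(dummy)
print(out.shape)       # torch.Size([4, 20000])
print(out.sum(dim=1))  # each row sums to ~1.0 (softmax probabilities)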
import regex as re
from itertools import chain
from torchtext.vocab import build_vocab_from_iterator
from torch.utils.data import IterableDataset

def get_words_from_line(line):
    """Tokenize a line into lowercased word and punctuation tokens,
    wrapped in <s> ... </s> sentence markers."""
    line = line.rstrip()
    yield '<s>'
    for m in re.finditer(r'[\p{L}0-9\*]+|\p{P}+', line):
        yield m.group(0).lower()
    yield '</s>'

def get_word_lines_from_file(file_name):
    """Yield one token generator per line of the corpus file."""
    with open(file_name, 'r') as fh:
        for line in fh:
            yield get_words_from_line(line)
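An illustrative check (not in the original notebook): the tokenizer lowercases the text, separates words from punctuation, and adds the sentence markers.

print(list(get_words_from_line('Hello, world!')))
# ['<s>', 'hello', ',', 'world', '!', '</s>']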
            
def look_ahead_iterator(gen):
    """Turn a token stream t1, t2, t3, ... into consecutive pairs
    (t1, t2), (t2, t3), ... used as (context, target) examples."""
    prev = None
    for item in gen:
        if prev is not None:
            yield (prev, item)
        prev = item
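A quick check of the pairing helper (illustrative only):

print(list(look_ahead_iterator(iter([1, 2, 3, 4]))))
# [(1, 2), (2, 3), (3, 4)]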
class Bigrams(IterableDataset):
    def __init__(self, text_file, vocabulary_size):
        # Build the vocabulary from the corpus, capped at vocabulary_size,
        # with <unk> as the fallback for out-of-vocabulary tokens.
        self.vocab = build_vocab_from_iterator(
            get_word_lines_from_file(text_file),
            max_tokens=vocabulary_size,
            specials=['<unk>']
        )
        self.vocab.set_default_index(self.vocab['<unk>'])
        self.vocabulary_size = vocabulary_size
        self.text_file = text_file

    def __iter__(self):
        # Stream (previous word, next word) index pairs over the whole file.
        token_ids = (self.vocab[t] for t in chain.from_iterable(
            get_word_lines_from_file(self.text_file)))
        return look_ahead_iterator(token_ids)
from torch.utils.data import DataLoader

device = 'cpu'
train_dataset = Bigrams('europarl.txt', vocab_size)
model = SimpleTrigramNeuralLanguageModel(vocab_size, embed_size).to(device)
data = DataLoader(train_dataset, batch_size=2000)
optimizer = torch.optim.Adam(model.parameters())
# The model outputs probabilities, so NLLLoss is applied to their log below;
# log_softmax in the model plus NLLLoss would be the numerically stabler equivalent.
criterion = torch.nn.NLLLoss()
step = 0

for epoch in range(1):
    model.train()
    for x, y in data:  # x: previous-word indices, y: next-word indices
        x = x.to(device)
        y = y.to(device)
        optimizer.zero_grad()
        outputs = model(x)
        loss = criterion(torch.log(outputs), y)
        if step % 100 == 0:
            print(step, loss)
        step += 1
        loss.backward()
        optimizer.step()

torch.save(model.state_dict(), 'model/model1.bin')
0 tensor(10.0424, grad_fn=<NllLossBackward0>)
100 tensor(7.9016, grad_fn=<NllLossBackward0>)
200 tensor(7.1964, grad_fn=<NllLossBackward0>)
300 tensor(6.5661, grad_fn=<NllLossBackward0>)
400 tensor(6.4146, grad_fn=<NllLossBackward0>)
500 tensor(5.8718, grad_fn=<NllLossBackward0>)
---------------------------------------------------------------------------
KeyboardInterrupt                         Traceback (most recent call last)
<ipython-input-8-06724a2e87a4> in <module>
      7         y = y.to(device)
      8         optimizer.zero_grad()
----> 9         outputs = model(x)
     10         loss = criterion(torch.log(outputs), y)
     11         if step % 100 == 0:

~\AppData\Roaming\Python\Python39\site-packages\torch\nn\modules\module.py in _call_impl(self, *input, **kwargs)
   1108         if not (self._backward_hooks or self._forward_hooks or self._forward_pre_hooks or _global_backward_hooks
   1109                 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1110             return forward_call(*input, **kwargs)
   1111         # Do not call functions when jit is used
   1112         full_backward_hooks, non_full_backward_hooks = [], []

<ipython-input-2-4f6f391f0eb8> in forward(self, x)
     10     def forward(self, x):
     11         x = self.embedding(x)
---> 12         x = self.linear(x)
     13         x = torch.softmax(x, dim=1)
     14         return x

~\AppData\Roaming\Python\Python39\site-packages\torch\nn\modules\module.py in _call_impl(self, *input, **kwargs)
   1108         if not (self._backward_hooks or self._forward_hooks or self._forward_pre_hooks or _global_backward_hooks
   1109                 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1110             return forward_call(*input, **kwargs)
   1111         # Do not call functions when jit is used
   1112         full_backward_hooks, non_full_backward_hooks = [], []

~\AppData\Roaming\Python\Python39\site-packages\torch\nn\modules\linear.py in forward(self, input)
    101 
    102     def forward(self, input: Tensor) -> Tensor:
--> 103         return F.linear(input, self.weight, self.bias)
    104 
    105     def extra_repr(self) -> str:

KeyboardInterrupt:
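
A possible next step, sketched here and not part of the original notebook: reload the saved weights and query the model for the most probable next words after a given context word. top_next_words is a hypothetical helper, and it assumes train_dataset from the cells above is still in memory.

model = SimpleTrigramNeuralLanguageModel(vocab_size, embed_size).to(device)
model.load_state_dict(torch.load('model/model1.bin'))
model.eval()

def top_next_words(word, k=5):
    # Hypothetical helper: map the word to its index, run one forward
    # pass, and return the k most probable successors with their probabilities.
    ix = torch.tensor([train_dataset.vocab[word]]).to(device)
    with torch.no_grad():
        probs = model(ix)[0]
    top = torch.topk(probs, k)
    return [(train_dataset.vocab.lookup_token(i), p.item())
            for i, p in zip(top.indices.tolist(), top.values)]

print(top_next_words('the'))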