In [1]:
from torchtext.vocab import build_vocab_from_iterator
from nltk.tokenize import word_tokenize


def get_words_from_line(line):
  line = line.split('\t')
  left_context = line[2]
  right_context = line[3]
  line = ' '.join([left_context, right_context])
  tokenized = word_tokenize(line)
  for word in tokenized:
     yield word.lower()

def get_context_from_line(line):
  line = line.split('\t')
  left_context = line[2]
  right_context = line[3]
  left_context = [cleanString(t) for t in word_tokenize(left_context)]
  right_context = [cleanString(t) for t in word_tokenize(right_context)]
  yield left_context[-2:] + right_context[:2]

def cleanString(value):
  value = value.strip('')
  value = value.strip('\n')
  value = value.strip('\t')
  value = value.lower()
  return value

def get_word_lines_from_file(file_name):
  with open(file_name, 'r') as fh:
    for line in fh:
       yield get_words_from_line(line)

vocab_size = 20000

vocab = build_vocab_from_iterator(
    get_word_lines_from_file('dev-0/in.tsv'),
    max_tokens = vocab_size,
    specials = ['<unk>'])

vocab.set_default_index(vocab['<unk>'])

In [5]:
from torch import nn
import torch

embed_size = 100
context_size = 4

class SimpleNgramNeuralLanguageModel(nn.Module):
  def __init__(self, vocabulary_size, embedding_size, context_size):
      super(SimpleNgramNeuralLanguageModel, self).__init__()
      self.embedding = nn.Embedding(vocabulary_size, embedding_size)
      self.fn1 = nn.Linear(embedding_size * context_size, vocabulary_size)
      self.fn2 = nn.Linear(vocabulary_size, vocabulary_size)
      self.out = nn.Softmax(dim=0)

  def forward(self, x):
      x = torch.as_tensor([torch.cat((self.embedding(val[0]), self.embedding(val[1]), self.embedding(val[2]), self.embedding(val[3]))) for val in x])
      x = self.fn1(x)
      x = self.fn2(x)
      x = self.out(x)
      return x

model = SimpleNgramNeuralLanguageModel(vocab_size, embed_size, context_size)

In [9]:
from torch.utils.data import IterableDataset, DataLoader

class Ngrams(IterableDataset):
  def __init__(self, text_file, vocabulary_size, expected_file):
      self.vocab = build_vocab_from_iterator(
         get_word_lines_from_file(text_file),
         max_tokens = vocabulary_size,
         specials = ['<unk>'])
      self.vocab.set_default_index(self.vocab['<unk>'])
      self.vocabulary_size = vocabulary_size
      self.text_file = text_file
      self.expected_file = expected_file

  def __iter__(self):
     with open(self.text_file, 'r') as inData:
        with open(self.expected_file, 'r') as expData:
         for lineIn, lineExp in zip(inData, expData):
            yield torch.as_tensor([self.vocab[t] for chunk in get_context_from_line(lineIn) for t in chunk]), torch.as_tensor(self.vocab[cleanString(lineExp)])

train_dataset = Ngrams('dev-0/in.tsv', vocab_size, 'dev-0/expected.tsv')

In [10]:
device = 'cuda'
model = SimpleNgramNeuralLanguageModel(vocab_size, embed_size, context_size).to(device)
data = DataLoader(train_dataset, batch_size=1)
optimizer = torch.optim.Adam(model.parameters())
criterion = torch.nn.NLLLoss()
   
model.train()
step = 0
for x, y in data:
   print(x)
   print(y)
   x = x.to(device)
   y = y.to(device)
   optimizer.zero_grad()
   ypredicted = model(x)
   print(ypredicted)
   loss = criterion(torch.log(ypredicted), y)
   if step % 100 == 0:
      print(step, loss)
   step += 1
   loss.backward()
   optimizer.step()
   break
   
#  torch.save(model.state_dict(), 'model1.bin')

#Policzmy najbardziej prawdopodobne kontynuację dla zadanego słowa:



tensor([[239, 466, 232,   1]])
tensor([2])


RuntimeError: CUDA error: no kernel image is available for execution on the device
CUDA kernel errors might be asynchronously reported at some other API call,so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.