import itertools
import lzma
import pickle

import torch
import tqdm
from torch import nn
from torch.utils.data import DataLoader, IterableDataset

vocabulary_size = 20000

# Load the vocabulary built beforehand (a torchtext Vocab object).
with open('vocabulary.pickle', 'rb') as handle:
    vocab = pickle.load(handle)


def look_ahead_iterator(gen):
    """Turn a stream of tokens into a stream of (previous token, current token) bigrams."""
    prev = None
    for item in gen:
        if prev is not None:
            yield (prev, item)
        prev = item


def get_words_from_line(line):
    """Yield the tokens of a line, wrapped in sentence-boundary markers."""
    line = line.rstrip()
    yield '<s>'
    for t in line.split(' '):
        yield t
    yield '</s>'


def get_word_lines_from_file(file_name):
    """Yield a token generator for every line of an xz-compressed text file."""
    with lzma.open(file_name, 'r') as fh:
        for line in fh:
            yield get_words_from_line(line.decode('utf-8'))


class Bigrams(IterableDataset):
    """Iterable dataset streaming (previous word id, current word id) pairs from a text file."""

    def __init__(self, text_file, vocabulary_size):
        self.vocab = vocab
        # Map out-of-vocabulary words to the index of the unknown token.
        self.vocab.set_default_index(self.vocab['<unk>'])
        self.vocabulary_size = vocabulary_size
        self.text_file = text_file

    def __iter__(self):
        return look_ahead_iterator(
            (self.vocab[t]
             for t in itertools.chain.from_iterable(get_word_lines_from_file(self.text_file))))


train_dataset = Bigrams('train/in.tsv.xz', vocabulary_size)
# print(next(iter(train_dataset)))
# print(vocab.lookup_tokens([23, 0]))

embed_size = 100


class SimpleBigramNeuralLanguageModel(nn.Module):
    """Bigram model: embed the previous word, project to vocabulary size, softmax to probabilities."""

    def __init__(self, vocabulary_size, embedding_size):
        super().__init__()
        self.model = nn.Sequential(
            nn.Embedding(vocabulary_size, embedding_size),
            nn.Linear(embedding_size, vocabulary_size),
            nn.Softmax(dim=1)  # normalize over the vocabulary dimension
        )

    def forward(self, x):
        return self.model(x)


device = 'cuda'
model = SimpleBigramNeuralLanguageModel(vocabulary_size, embed_size).to(device)
data = DataLoader(train_dataset, batch_size=500)
optimizer = torch.optim.Adam(model.parameters())
# The model outputs probabilities, so they are logged before NLLLoss
# (equivalent to cross-entropy; a LogSoftmax output would be more numerically stable).
criterion = torch.nn.NLLLoss()

model.train()
step = 0
for x, y in tqdm.tqdm(data):
    x = x.to(device)
    y = y.to(device)
    optimizer.zero_grad()
    ypredicted = model(x)
    loss = criterion(torch.log(ypredicted), y)
    if step % 100 == 0:
        print(step, loss)
    if step > 5000:
        break
    step += 1
    loss.backward()
    optimizer.step()

torch.save(model.state_dict(), 'model1.bin')

# Reload the trained weights and query the model for the most likely successors of 'that'.
model = SimpleBigramNeuralLanguageModel(vocabulary_size, embed_size).to(device)
model.load_state_dict(torch.load('model1.bin'))
model.eval()

ixs = torch.tensor(vocab.forward(['that'])).to(device)
out = model(ixs)
top = torch.topk(out[0], 10)
top_indices = top.indices.tolist()
top_probs = top.values.tolist()
top_words = vocab.lookup_tokens(top_indices)
print(list(zip(top_words, top_indices, top_probs)))
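
# The inference snippet above can be wrapped in a small helper so any context word
# can be queried the same way. This is a minimal sketch, not part of the original
# script; the name `predict_top_k` is made up for illustration.
def predict_top_k(word, k=10):
    """Return the k most probable next words after `word`, with their probabilities."""
    with torch.no_grad():
        ixs = torch.tensor(vocab.forward([word])).to(device)
        probs = model(ixs)
    top = torch.topk(probs[0], k)
    words = vocab.lookup_tokens(top.indices.tolist())
    return list(zip(words, top.values.tolist()))

# Example usage:
# print(predict_top_k('that', 5))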