import itertools
import lzma

import regex as re
import torch
from torch import nn
from torch.utils.data import DataLoader, IterableDataset
from torchtext.vocab import build_vocab_from_iterator


def get_words_from_line(line):
    """Tokenize a single line into lowercased words/punctuation, wrapped in sentence markers."""
    line = line.rstrip()
    yield '<s>'
    for m in re.finditer(r'[\p{L}0-9\*]+|\p{P}+', line):
        yield m.group(0).lower()
    yield '</s>'


def get_words_lines_from_file(file_path):
    """Read the xz-compressed TSV and yield a token generator per line (columns 6 and 7 hold the text)."""
    with lzma.open(file_path, mode='rt') as file:
        for index, line in enumerate(file):
            text = line.split('\t')
            cleaned = ' '.join([text[6], text[7]]).replace('\\n', ' ').replace('\n', '').lower()
            cleaned = re.sub(r"[^\w\d'\s]+", '', re.sub(' +', ' ', cleaned))
            yield get_words_from_line(cleaned)
            if index == 50000:
                break


vocab_size = 20000
embed_size = 100

vocab = build_vocab_from_iterator(
    get_words_lines_from_file('train/in.tsv.xz'),
    max_tokens=vocab_size,
    specials=['<unk>'])
vocab.set_default_index(vocab['<unk>'])


class SimpleBigramNeuralLanguageModel(nn.Module):
    """Bigram language model: embed the previous word, project to the vocabulary, normalize with softmax."""

    def __init__(self, vocabulary_size, embedding_size):
        super().__init__()
        self.model = nn.Sequential(
            nn.Embedding(vocabulary_size, embedding_size),
            nn.Linear(embedding_size, vocabulary_size),
            nn.Softmax(dim=1)
        )

    def forward(self, x):
        return self.model(x)


def look_ahead_iterator(gen):
    """Turn a stream of tokens into a stream of (previous, current) pairs."""
    prev = None
    for item in gen:
        if prev is not None:
            yield prev, item
        prev = item


class Bigrams(IterableDataset):
    """Iterable dataset of (previous-word-id, current-word-id) pairs built from the training file."""

    def __init__(self, text_file, vocabulary_size):
        self.vocab = build_vocab_from_iterator(
            get_words_lines_from_file(text_file),
            max_tokens=vocabulary_size,
            specials=['<unk>'])
        self.vocab.set_default_index(self.vocab['<unk>'])
        self.vocabulary_size = vocabulary_size
        self.text_file = text_file

    def __iter__(self):
        return look_ahead_iterator(
            (self.vocab[t] for t in itertools.chain.from_iterable(get_words_lines_from_file(self.text_file))))


def train():
    batch_size = 22000
    train_dataset = Bigrams('train/in.tsv.xz', vocab_size)
    device = 'cuda'
    model = SimpleBigramNeuralLanguageModel(vocab_size, embed_size).to(device)
    train_data_loader = DataLoader(train_dataset, batch_size=batch_size)
    optimizer = torch.optim.Adam(model.parameters())
    # The model outputs probabilities (softmax), so NLLLoss is applied to their logarithm.
    criterion = torch.nn.NLLLoss()
    model.train()
    step = 0
    for x, y in train_data_loader:
        # Transfer data to the GPU
        x = x.to(device)
        y = y.to(device)
        # Clear the gradients
        optimizer.zero_grad()
        # Forward pass
        ypredicted = model(x)
        # Compute the loss
        loss = criterion(torch.log(ypredicted), y)
        if step % 100 == 0:
            print(step, loss)
        step += 1
        # Calculate gradients and update the weights
        loss.backward()
        optimizer.step()
    torch.save(model.state_dict(), 'model1.bin')


def predict():
    """Print the 10 most probable words following 'for' according to the trained model."""
    device = 'cuda'
    model = SimpleBigramNeuralLanguageModel(vocab_size, embed_size).to(device)
    model.load_state_dict(torch.load('model1.bin'))
    model.eval()
    ixs = torch.tensor(vocab.forward(['for'])).to(device)
    out = model(ixs)
    top = torch.topk(out[0], 10)
    top_indices = top.indices.tolist()
    top_probs = top.values.tolist()
    top_words = vocab.lookup_tokens(top_indices)
    print(list(zip(top_words, top_indices, top_probs)))


def similar():
    """Print the 10 words whose embeddings are closest (cosine similarity) to the embedding of 'went'."""
    device = 'cuda'
    model = SimpleBigramNeuralLanguageModel(vocab_size, embed_size).to(device)
    model.load_state_dict(torch.load('model1.bin'))
    model.eval()
    cos = nn.CosineSimilarity(dim=1, eps=1e-6)
    embeddings = model.model[0].weight
    vec = embeddings[vocab['went']]
    similarities = cos(vec, embeddings)
    top = torch.topk(similarities, 10)
    top_indices = top.indices.tolist()
    top_probs = top.values.tolist()
    top_words = vocab.lookup_tokens(top_indices)
    print(list(zip(top_words, top_indices, top_probs)))


if __name__ == "__main__":
    # train()
    predict()
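

# --- Illustrative addition (not part of the original script) ---
# A minimal sketch of how the trained bigram model could be used to extend a
# prompt word by repeatedly feeding it its own most probable prediction.
# It assumes 'model1.bin' has already been produced by train() and reuses the
# module-level `vocab`; the helper name `generate` is hypothetical.
def generate(prompt_word='for', length=10):
    device = 'cuda'
    model = SimpleBigramNeuralLanguageModel(vocab_size, embed_size).to(device)
    model.load_state_dict(torch.load('model1.bin'))
    model.eval()
    words = [prompt_word]
    with torch.no_grad():
        for _ in range(length):
            # Look up the id of the last generated word and predict the next-word distribution.
            ixs = torch.tensor(vocab.forward([words[-1]])).to(device)
            out = model(ixs)
            # Greedy choice: take the single most probable next word.
            next_index = torch.argmax(out[0]).item()
            words.append(vocab.lookup_tokens([next_index])[0])
    return ' '.join(words)
# Example (run manually, e.g. from a REPL): print(generate('for', 10))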