from torchtext.vocab import build_vocab_from_iterator
from torch import nn
from torch.utils.data import IterableDataset, DataLoader
from nltk.tokenize import RegexpTokenizer
import regex as re
import itertools
import lzma
import torch

# def get_words_from_line(file_path):
#     for index, line in enumerate(get_lines_from_file(file)):
#         yield '<s>'
#         for m in re.finditer(r'[\p{L}0-9\*]+|\p{P}+', line):
#             yield m.group(0).lower()
#         yield '</s>'
#         if index == 10000:
#             break

tokenizer = RegexpTokenizer(r"\w+")


def read_file_6(file):
    """Yield the cleaned-up text of column 6 (left context) from a TSV file."""
    for line in file:
        text = line.split("\t")
        yield re.sub(r"[^\w\d'\s]+", '',
                     re.sub(' +', ' ', text[6].replace("\\n", " ").replace("\n", "").lower()))


def get_words_from_line(line):
    """Tokenize a single line, wrapping it in sentence-boundary markers."""
    line = line.rstrip()
    yield '<s>'
    for m in re.finditer(r'[\p{L}0-9\*]+|\p{P}+', line):
        yield m.group(0).lower()
    yield '</s>'


def get_words_lines_from_file(file_path):
    """Yield a token generator for every line of an xz-compressed TSV file (columns 6 and 7 joined)."""
    with lzma.open(file_path, mode='rt') as file:
        for index, line in enumerate(file):
            text = line.split("\t")
            yield get_words_from_line(
                re.sub(r"[^\w\d'\s]+", '',
                       re.sub(' +', ' ',
                              ' '.join([text[6], text[7]]).replace("\\n", " ").replace("\n", "").lower())))
            # if index == 1000:
            #     break


vocab_size = 30000

vocab = build_vocab_from_iterator(
    get_words_lines_from_file('train/in.tsv.xz'),
    max_tokens=vocab_size,
    specials=['<unk>'])
vocab.set_default_index(vocab['<unk>'])
# vocab = None
embed_size = 100


class SimpleBigramNeuralLanguageModel(nn.Module):
    """Bigram model: embed the previous word, project to the vocabulary, normalize with softmax."""

    def __init__(self, vocabulary_size, embedding_size):
        super(SimpleBigramNeuralLanguageModel, self).__init__()
        self.model = nn.Sequential(
            nn.Embedding(vocabulary_size, embedding_size),
            nn.Linear(embedding_size, vocabulary_size),
            nn.Softmax(dim=-1)
        )

    def forward(self, x):
        return self.model(x)


def look_ahead_iterator(gen):
    """Turn a stream of tokens into a stream of (previous, current) pairs."""
    prev = None
    for item in gen:
        if prev is not None:
            yield prev, item
        prev = item


class Bigrams(IterableDataset):
    """Iterable dataset of (previous word id, current word id) pairs read from the training file."""

    def __init__(self, text_file, vocabulary_size):
        self.vocab = build_vocab_from_iterator(
            get_words_lines_from_file(text_file),
            max_tokens=vocabulary_size,
            specials=['<unk>'])
        self.vocab.set_default_index(self.vocab['<unk>'])
        self.vocabulary_size = vocabulary_size
        self.text_file = text_file

    def __iter__(self):
        return look_ahead_iterator(
            (self.vocab[t] for t in itertools.chain.from_iterable(get_words_lines_from_file(self.text_file))))


def train():
    batch_size = 15000
    train_dataset = Bigrams('train/in.tsv.xz', vocab_size)
    device = 'cuda'
    model = SimpleBigramNeuralLanguageModel(vocab_size, embed_size).to(device)
    train_data_loader = DataLoader(train_dataset, batch_size=batch_size)
    optimizer = torch.optim.Adam(model.parameters())
    criterion = torch.nn.NLLLoss()
    model.train()
    step = 0
    for x, y in train_data_loader:
        # Transfer data to the GPU
        x = x.to(device)
        y = y.to(device)
        # Clear the gradients
        optimizer.zero_grad()
        # Forward pass
        ypredicted = model(x)
        # Compute the loss (NLLLoss expects log-probabilities, hence the log of the softmax output)
        loss = criterion(torch.log(ypredicted), y)
        if step % 100 == 0:
            print(step, loss)
        step += 1
        # Calculate gradients
        loss.backward()
        # Update weights
        optimizer.step()
    print(step)
    torch.save(model.state_dict(), 'model1.bin')


def predict(word):
    """Return the top-8 next-word predictions for `word` in the `word:prob ... :rest` output format."""
    device = 'cuda'
    model = SimpleBigramNeuralLanguageModel(vocab_size, embed_size).to(device)
    model.load_state_dict(torch.load('model1.bin'))
    model.eval()
    ixs = torch.tensor(vocab.forward([word])).to(device)
    out = model(ixs)
    top = torch.topk(out[0], 8)
    top_indices = top.indices.tolist()
    top_probs = top.values.tolist()
    top_words = vocab.lookup_tokens(top_indices)
    str_predictions = ""
    lht = 1.0
    for pred_word in zip(top_words, top_indices, top_probs):
        if lht - pred_word[2] >= 0:
            str_predictions += f"{pred_word[0]}:{pred_word[2]} "
            lht -= pred_word[2]
    if lht != 1.0:
        # Assign the remaining probability mass to "any other word"
        str_predictions += f":{lht}"
    return str_predictions


def similar():
    """Print the 10 words whose embeddings are most similar to the embedding of 'went'."""
    device = 'cuda'
    model = SimpleBigramNeuralLanguageModel(vocab_size, embed_size).to(device)
    model.load_state_dict(torch.load('model1.bin'))
    model.eval()
    cos = nn.CosineSimilarity(dim=1, eps=1e-6)
    embeddings = model.model[0].weight
    vec = embeddings[vocab['went']]
    similarities = cos(vec, embeddings)
    top = torch.topk(similarities, 10)
    top_indices = top.indices.tolist()
    top_probs = top.values.tolist()
    top_words = vocab.lookup_tokens(top_indices)
    print(list(zip(top_words, top_indices, top_probs)))


def generate_outputs(input_file, output_file):
    """Write one prediction line per input line; fall back to a fixed distribution for very short contexts."""
    with open(output_file, 'w') as outputf:
        with lzma.open(input_file, mode='rt') as file:
            for index, text in enumerate(read_file_6(file)):
                tokens = tokenizer.tokenize(text)
                if len(tokens) < 4:
                    prediction = 'the:0.2 be:0.2 to:0.2 of:0.1 and:0.1 a:0.1 :0.1'
                else:
                    prediction = predict(tokens[-1])
                outputf.write(prediction + '\n')


if __name__ == "__main__":
    # train()
    # predict()
    # generate_outputs("dev-0/in.tsv.xz", "dev-0/out.tsv")
    generate_outputs("test-A/in.tsv.xz", "test-A/out.tsv")

    # count_words = 0
    # for i in get_words_lines_from_file('train/in.tsv.xz'):
    #     for j in i:
    #         count_words += 1
    # print(count_words)