diff --git a/run.py b/run.py
index 519f042..7632b42 100644
--- a/run.py
+++ b/run.py
@@ -1,98 +1,163 @@
-import pandas as pd
-import csv
-from collections import Counter, defaultdict
-from nltk.tokenize import RegexpTokenizer
-from nltk import trigrams
+from itertools import islice
 import regex as re
+import sys
+from torchtext.vocab import build_vocab_from_iterator
 import lzma
-import kenlm
-from math import log10
-from english_words import english_words_set
+from torch import nn
+import torch
+from torch.utils.data import IterableDataset
+import itertools
+from torch.utils.data import DataLoader
+import numpy as np


-class WordPred:
-    def __init__(self):
-        self.tokenizer = RegexpTokenizer(r"\w+")
-        # self.model = defaultdict(lambda: defaultdict(lambda: 0))
-        self.model = kenlm.Model("model.binary")
-        self.words = set()
+# def get_words_from_line(file_path):
+#     for index, line in enumerate(get_lines_from_file(file)):
+#         yield '<s>'
+#         for m in re.finditer(r'[\p{L}0-9\*]+|\p{P}+', line):
+#             yield m.group(0).lower()
+#         yield '</s>'
+#         if index == 10000:
+#             break

-    def read_file(self, file):
-        for line in file:
+
+def get_words_from_line(line):
+    line = line.rstrip()
+    yield '<s>'
+    for m in re.finditer(r'[\p{L}0-9\*]+|\p{P}+', line):
+        yield m.group(0).lower()
+    yield '</s>'
+
+
+def get_words_lines_from_file(file_path):
+    with lzma.open(file_path, mode='rt') as file:
+        for index, line in enumerate(file):
             text = line.split("\t")
-            yield re.sub(r"[^\w\d'\s]+", '',
-                         re.sub(' +', ' ', ' '.join([text[6], text[7]]).replace("\\n", " ").replace("\n", "").lower()))
-
-    def read_file_7(self, file):
-        for line in file:
-            text = line.split("\t")
-            yield re.sub(r"[^\w\d'\s]+", '', re.sub(' +', ' ', text[7].replace("\\n", " ").replace("\n", "").lower()))
-
-    def fill_words(self, file_path, output_file):
-        with open(output_file, 'w') as out:
-            with lzma.open(file_path, mode='rt') as file:
-                for text in self.read_file(file):
-                    for mword in text.split(" "):
-                        if mword not in self.words:
-                            out.write(mword + "\n")
-                            self.words.add(mword)
-
-    def read_words(self, file_path):
-        with open(file_path, 'r') as fin:
-            for word in fin.readlines():
-                word = word.replace("\n", "")
-                if word:
-                    self.words.add(word)
+            yield get_words_from_line(re.sub(r"[^\w\d'\s]+", '', re.sub(' +', ' ', ' '.join([text[6], text[7]]).replace("\\n", " ").replace("\n", "").lower())))
+            if index == 50000:
+                break

-    def create_train_file(self, file_path, output_path, rows=10000):
-        with open(output_path, 'w') as outputfile:
-            with lzma.open(file_path, mode='rt') as file:
-                for index, text in enumerate(self.read_file(file)):
-                    outputfile.write(text)
-                    if index == rows:
-                        break
-            outputfile.close()
+vocab_size = 20000

-    def generate_outputs(self, input_file, output_file):
-        with open(output_file, 'w') as outputf:
-            with lzma.open(input_file, mode='rt') as file:
-                for index, text in enumerate(self.read_file_7(file)):
-                    tokens = self.tokenizer.tokenize(text)
-                    if len(tokens) < 4:
-                        prediction = 'the:0.2 be:0.2 to:0.2 of:0.1 and:0.1 a:0.1 :0.1'
-                    else:
-                        prediction = wp.predict_probs(tokens[0], tokens[1])
-                    outputf.write(prediction + '\n')
+vocab = build_vocab_from_iterator(
+    get_words_lines_from_file('train/in.tsv.xz'),
+    max_tokens=vocab_size,
+    specials=['<unk>'])

-    def predict_probs(self, word1, word2):
-        preds = []
-        for word in english_words_set:
-            sentence = word1 + ' ' + word + ' ' + word2
-            words_score = self.model.score(sentence, bos=False, eos=False)
+vocab.set_default_index(vocab['<unk>'])
+# vocab=None

-            if len(preds) < 12:
-                preds.append((word, words_score))
-            else:
-                min_score = preds[0]
-                for score in preds:
-                    if min_score[1] > score[1]:
-                        min_score = score
-                if min_score[1] < words_score:
-                    preds.remove(min_score)
-                    preds.append((word, words_score))
-        probs = sorted(preds, key=lambda sc: sc[1], reverse=True)
-        str_prediction = ''
-        for word, prob in probs:
-            str_prediction += f'{word}:{prob} '
-        str_prediction += f':{log10(0.99)}'
+embed_size = 100
+
+
+class SimpleBigramNeuralLanguageModel(nn.Module):
+    def __init__(self, vocabulary_size, embedding_size):
+        super(SimpleBigramNeuralLanguageModel, self).__init__()
+        self.model = nn.Sequential(
+            nn.Embedding(vocabulary_size, embedding_size),
+            nn.Linear(embedding_size, vocabulary_size),
+            nn.Softmax()
+        )
+
+    def forward(self, x):
+        return self.model(x)
+
+
+def look_ahead_iterator(gen):
+    prev = None
+    for item in gen:
+        if prev is not None:
+            yield (prev, item)
+        prev = item
+
+
+class Bigrams(IterableDataset):
+    def __init__(self, text_file, vocabulary_size):
+        self.vocab = build_vocab_from_iterator(
+            get_words_lines_from_file(text_file),
+            max_tokens=vocabulary_size,
+            specials=['<unk>'])
+        self.vocab.set_default_index(self.vocab['<unk>'])
+        self.vocabulary_size = vocabulary_size
+        self.text_file = text_file
+
+    def __iter__(self):
+        return look_ahead_iterator(
+            (self.vocab[t] for t in itertools.chain.from_iterable(get_words_lines_from_file(self.text_file))))
+
+
+def train():
+    batch_size = 22000
+
+    train_dataset = Bigrams('train/in.tsv.xz', vocab_size)
+
+    device = 'cuda'
+    model = SimpleBigramNeuralLanguageModel(vocab_size, embed_size).to(device)
+    train_data_loader = DataLoader(train_dataset, batch_size=batch_size)
+    optimizer = torch.optim.Adam(model.parameters())
+    criterion = torch.nn.NLLLoss()
+
+    model.train()
+    step = 0
+    for x, y in train_data_loader:
+        # Transfer Data to GPU
+        x = x.to(device)
+        y = y.to(device)
+        # Clear the gradients
+        optimizer.zero_grad()
+        # Forward Pass
+        ypredicted = model(x)
+        # Find the Loss
+        loss = criterion(torch.log(ypredicted), y)
+        if step % 100 == 0:
+            print(step, loss)
+        step += 1
+        # Calculate gradients
+        loss.backward()
+        # Update Weights
+        optimizer.step()
+    torch.save(model.state_dict(), 'model1.bin')
+
+
+def predict():
+    device = 'cuda'
+    model = SimpleBigramNeuralLanguageModel(vocab_size, embed_size).to(device)
+    model.load_state_dict(torch.load('model1.bin'))
+    model.eval()
+
+    ixs = torch.tensor(vocab.forward(['for'])).to(device)
+
+    out = model(ixs)
+    top = torch.topk(out[0], 10)
+    top_indices = top.indices.tolist()
+    top_probs = top.values.tolist()
+    top_words = vocab.lookup_tokens(top_indices)
+    print(list(zip(top_words, top_indices, top_probs)))
+
+
+def similar():
+    device = 'cuda'
+    model = SimpleBigramNeuralLanguageModel(vocab_size, embed_size).to(device)
+    model.load_state_dict(torch.load('model1.bin'))
+    model.eval()
+
+    cos = nn.CosineSimilarity(dim=1, eps=1e-6)
+
+    embeddings = model.model[0].weight
+
+    vec = embeddings[vocab['went']]
+
+    similarities = cos(vec, embeddings)
+
+    top = torch.topk(similarities, 10)
+
+    top_indices = top.indices.tolist()
+    top_probs = top.values.tolist()
+    top_words = vocab.lookup_tokens(top_indices)
+    print(list(zip(top_words, top_indices, top_probs)))

-        return str_prediction


 if __name__ == "__main__":
-    wp = WordPred()
-    # wp.create_train_file("train/in.tsv.xz", "train/in.txt")
-    # wp.fill_words("train/in.tsv.xz", "words.txt")
-    # wp.read_words("words.txt")
-    wp.generate_outputs("dev-0/in.tsv.xz", "dev-0/out3.tsv")
-    wp.generate_outputs("test-A/in.tsv.xz", "test-A/out3.tsv")
+    # train()
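
The removed generate_outputs/predict_probs code above wrote one prediction line per input row in the form 'word:prob word:prob ... :rest' (see its fallback string 'the:0.2 be:0.2 to:0.2 of:0.1 and:0.1 a:0.1 :0.1'), whereas the new predict() only prints the top-10 continuations of a single word. Below is a minimal sketch of how the neural model's top-k output could be folded back into that line format; it assumes the model and vocab objects built in run.py, and the helper name format_prediction_line is hypothetical, not part of the repository.

import torch

def format_prediction_line(model, vocab, previous_word, device='cuda', k=10):
    # Index of the previous word; out-of-vocabulary words fall back to <unk>.
    ixs = torch.tensor(vocab.forward([previous_word])).to(device)
    probs = model(ixs)                       # Softmax output over the vocabulary
    top = torch.topk(probs[0], k)
    top_probs = top.values.tolist()
    top_words = vocab.lookup_tokens(top.indices.tolist())
    rest = max(1.0 - sum(top_probs), 1e-6)   # leftover mass for the bare ':rest' entry
    return ' '.join(f'{w}:{p}' for w, p in zip(top_words, top_probs)) + f' :{rest}'

# Hypothetical usage, mirroring predict() above:
# print(format_prediction_line(model, vocab, 'for'))
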
+    predict()
diff --git a/run_neu_val.py b/run_neu_val.py
new file mode 100644
index 0000000..7f9ffde
--- /dev/null
+++ b/run_neu_val.py
@@ -0,0 +1,196 @@
+from itertools import islice
+import regex as re
+import sys
+from torchtext.vocab import build_vocab_from_iterator
+import lzma
+from torch import nn
+import torch
+from torch.utils.data import IterableDataset
+import itertools
+from torch.utils.data import DataLoader
+import numpy as np
+
+
+# def get_words_from_line(file_path):
+#     for index, line in enumerate(get_lines_from_file(file)):
+#         yield '<s>'
+#         for m in re.finditer(r'[\p{L}0-9\*]+|\p{P}+', line):
+#             yield m.group(0).lower()
+#         yield '</s>'
+#         if index == 10000:
+#             break
+
+
+def get_words_from_line(line):
+    line = line.rstrip()
+    yield '<s>'
+    for m in re.finditer(r'[\p{L}0-9\*]+|\p{P}+', line):
+        yield m.group(0).lower()
+    yield '</s>'
+
+
+def get_words_lines_from_file(file_path):
+    with lzma.open(file_path, mode='rt') as file:
+        for index, line in enumerate(file):
+            text = line.split("\t")
+            yield get_words_from_line(re.sub(r"[^\w\d'\s]+", '', re.sub(' +', ' ', ' '.join([text[6], text[7]]).replace("\\n", " ").replace("\n", "").lower())))
+            if index == 50000:
+                break
+
+
+vocab_size = 220
+
+# vocab = build_vocab_from_iterator(
+#     get_words_lines_from_file('train/in.tsv.xz'),
+#     max_tokens=vocab_size,
+#     specials=['<unk>'])
+#
+# vocab.set_default_index(vocab['<unk>'])
+vocab=None
+
+embed_size = 100
+
+
+class SimpleBigramNeuralLanguageModel(nn.Module):
+    def __init__(self, vocabulary_size, embedding_size):
+        super(SimpleBigramNeuralLanguageModel, self).__init__()
+        self.model = nn.Sequential(
+            nn.Embedding(vocabulary_size, embedding_size),
+            nn.Linear(embedding_size, vocabulary_size),
+            nn.Softmax()
+        )
+
+    def forward(self, x):
+        return self.model(x)
+
+
+def look_ahead_iterator(gen):
+    prev = None
+    for item in gen:
+        if prev is not None:
+            yield (prev, item)
+        prev = item
+
+
+class Bigrams(IterableDataset):
+    def __init__(self, text_file, vocabulary_size):
+        self.vocab = build_vocab_from_iterator(
+            get_words_lines_from_file(text_file),
+            max_tokens=vocabulary_size,
+            specials=['<unk>'])
+        self.vocab.set_default_index(self.vocab['<unk>'])
+        self.vocabulary_size = vocabulary_size
+        self.text_file = text_file
+
+    def __iter__(self):
+        return look_ahead_iterator(
+            (self.vocab[t] for t in itertools.chain.from_iterable(get_words_lines_from_file(self.text_file))))
+
+
+def train():
+
+    batch_size = 100000
+    epochs = 5
+
+    train_dataset = Bigrams('train/in.tsv.xz', vocab_size)
+    valid_dataset = Bigrams('dev-0/in.tsv.xz', vocab_size)
+
+    device = 'cuda'
+    model = SimpleBigramNeuralLanguageModel(vocab_size, embed_size).to(device)
+    train_data_loader = DataLoader(train_dataset, batch_size=batch_size)
+    optimizer = torch.optim.Adam(model.parameters())
+    criterion = torch.nn.NLLLoss()
+
+    valid_data_loader = DataLoader(valid_dataset, batch_size=batch_size)
+
+    model.train()
+    train_loss = 0.0
+    min_valid_loss = np.inf
+    for e in range(epochs):
+        step = 0
+        for x, y in train_data_loader:
+            # Transfer Data to GPU
+            x = x.to(device)
+            y = y.to(device)
+            # Clear the gradients
+            optimizer.zero_grad()
+            # Forward Pass
+            ypredicted = model(x)
+            # Find the Loss
+            loss = criterion(torch.log(ypredicted), y)
+            if step % 100 == 0:
+                print(step, loss)
+            step += 1
+            # Calculate gradients
+            loss.backward()
+            # Update Weights
+            optimizer.step()
+            # Calculate Loss
+            train_loss += loss.item()
+
+        # Validate
+        model.eval()
+        valid_loss = 0.0
+        for x, y in valid_data_loader:
+            # Transfer Data to GPU
+            x = x.to(device)
+            y = y.to(device)
+            # Forward Pass
+            target = model(x)
+            # Find the Loss
+            loss = criterion(target, y)
+            # Calculate Loss
+            valid_loss += loss.item()
+
+        print(f'Epoch {e + 1} \t\t '
+              f'Training Loss: {train_loss} \t\t '
+              f'Validation Loss: {valid_loss}')
+
+        if min_valid_loss > valid_loss:
+            print(f'Validation Loss Decreased({min_valid_loss:.6f}--->{valid_loss:.6f}) \t Saving The Model')
+            min_valid_loss = valid_loss
+            # Saving State Dict
+            torch.save(model.state_dict(), 'model1.bin')
+
+
+def predict():
+    device = 'cuda'
+    model = SimpleBigramNeuralLanguageModel(vocab_size, embed_size).to(device)
+    model.load_state_dict(torch.load('model1.bin'))
+    model.eval()
+
+    ixs = torch.tensor(vocab.forward(['for'])).to(device)
+
+    out = model(ixs)
+    top = torch.topk(out[0], 10)
+    top_indices = top.indices.tolist()
+    top_probs = top.values.tolist()
+    top_words = vocab.lookup_tokens(top_indices)
+    print(list(zip(top_words, top_indices, top_probs)))
+
+
+def similar():
+    device = 'cuda'
+    model = SimpleBigramNeuralLanguageModel(vocab_size, embed_size).to(device)
+    model.load_state_dict(torch.load('model1.bin'))
+    model.eval()
+
+    cos = nn.CosineSimilarity(dim=1, eps=1e-6)
+
+    embeddings = model.model[0].weight
+
+    vec = embeddings[vocab['went']]
+
+    similarities = cos(vec, embeddings)
+
+    top = torch.topk(similarities, 10)
+
+    top_indices = top.indices.tolist()
+    top_probs = top.values.tolist()
+    top_words = vocab.lookup_tokens(top_indices)
+    print(list(zip(top_words, top_indices, top_probs)))
+
+
+if __name__ == "__main__":
+    train()
+    # predict()
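
One thing worth noting about run_neu_val.py: the training loop feeds torch.log(ypredicted) to nn.NLLLoss, which expects log-probabilities, while the validation loop passes the raw Softmax output to the same criterion, so the printed training and validation losses are not on the same scale. A small self-contained sketch (hypothetical tensors, not the project's data) of the difference, and of the equivalent LogSoftmax formulation, assuming the model keeps its final nn.Softmax layer:

import torch
from torch import nn

# Hypothetical shapes: a batch of 4 contexts, a vocabulary of 10 words.
logits = torch.randn(4, 10)
targets = torch.tensor([1, 0, 3, 9])
criterion = nn.NLLLoss()

probs = nn.Softmax(dim=1)(logits)                             # what the model above outputs
loss_training_style = criterion(torch.log(probs), targets)    # as in the training loop
loss_validation_as_written = criterion(probs, targets)        # as in the validation loop

# Equivalent to the training formulation and numerically more stable:
log_probs = nn.LogSoftmax(dim=1)(logits)
loss_logsoftmax = criterion(log_probs, targets)

print(loss_training_style.item(), loss_logsoftmax.item())     # these two agree
print(loss_validation_as_written.item())                      # different scale, typically negative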