Neural bigram with/out validation.

2022-05-07 00:08:59 +02:00 · 2022-05-07 00:08:59 +02:00 · fd03c9369f
commit fd03c9369f
parent 43036240f0
2 changed files with 345 additions and 84 deletions
--- a/run.py
+++ b/run.py
@ -1,98 +1,163 @@
-import pandas as pd
+from itertools import islice
 import csv
 from collections import Counter, defaultdict
 from nltk.tokenize import RegexpTokenizer
 from nltk import trigrams
 import regex as re
 import sys
 from torchtext.vocab import build_vocab_from_iterator
 import lzma
-import kenlm
+from torch import nn
-from math import log10
+import torch
-from english_words import english_words_set
+from torch.utils.data import IterableDataset
 import itertools
 from torch.utils.data import DataLoader
 import numpy as np
 class WordPred:
-    def __init__(self):
+# def get_words_from_line(file_path):
-        self.tokenizer = RegexpTokenizer(r"\w+")
+#     for index, line in enumerate(get_lines_from_file(file)):
-        # self.model = defaultdict(lambda: defaultdict(lambda: 0))
+#         yield '<s>'
-        self.model = kenlm.Model("model.binary")
+#         for m in re.finditer(r'[\p{L}0-9\*]+|\p{P}+', line):
-        self.words = set()
+#             yield m.group(0).lower()
 #         yield '</s>'
 #         if index == 10000:
 #             break
-    def read_file(self, file):
+
-        for line in file:
+def get_words_from_line(line):
    line = line.rstrip()
    yield '<s>'
    for m in re.finditer(r'[\p{L}0-9\*]+|\p{P}+', line):
        yield m.group(0).lower()
    yield '</s>'
 def get_words_lines_from_file(file_path):
    with lzma.open(file_path, mode='rt') as file:
        for index, line in enumerate(file):
            text = line.split("\t")
-            yield re.sub(r"[^\w\d'\s]+", '',
+            yield get_words_from_line(re.sub(r"[^\w\d'\s]+", '', re.sub(' +', ' ', ' '.join([text[6], text[7]]).replace("\\n", " ").replace("\n", "").lower())))
-                         re.sub(' +', ' ', ' '.join([text[6], text[7]]).replace("\\n", " ").replace("\n", "").lower()))
+            if index == 50000:
-
+                break
    def read_file_7(self, file):
        for line in file:
            text = line.split("\t")
            yield re.sub(r"[^\w\d'\s]+", '', re.sub(' +', ' ', text[7].replace("\\n", " ").replace("\n", "").lower()))
    def fill_words(self, file_path, output_file):
        with open(output_file, 'w') as out:
            with lzma.open(file_path, mode='rt') as file:
                for text in self.read_file(file):
                    for mword in text.split(" "):
                        if mword not in self.words:
                            out.write(mword + "\n")
                        self.words.add(mword)
    def read_words(self, file_path):
        with open(file_path, 'r') as fin:
            for word in fin.readlines():
                word = word.replace("\n", "")
                if word:
                    self.words.add(word)
-    def create_train_file(self, file_path, output_path, rows=10000):
+vocab_size = 20000
        with open(output_path, 'w') as outputfile:
            with lzma.open(file_path, mode='rt') as file:
                for index, text in enumerate(self.read_file(file)):
                    outputfile.write(text)
                    if index == rows:
                        break
                outputfile.close()
-    def generate_outputs(self, input_file, output_file):
+vocab = build_vocab_from_iterator(
-        with open(output_file, 'w') as outputf:
+    get_words_lines_from_file('train/in.tsv.xz'),
-            with lzma.open(input_file, mode='rt') as file:
+    max_tokens=vocab_size,
-                for index, text in enumerate(self.read_file_7(file)):
+    specials=['<unk>'])
                    tokens = self.tokenizer.tokenize(text)
                    if len(tokens) < 4:
                        prediction = 'the:0.2 be:0.2 to:0.2 of:0.1 and:0.1 a:0.1 :0.1'
                    else:
                        prediction = wp.predict_probs(tokens[0], tokens[1])
                    outputf.write(prediction + '\n')
-    def predict_probs(self, word1, word2):
+vocab.set_default_index(vocab['<unk>'])
-        preds = []
+# vocab=None
        for word in english_words_set:
            sentence = word1 + ' ' + word + ' ' + word2
            words_score = self.model.score(sentence, bos=False, eos=False)
-            if len(preds) < 12:
+embed_size = 100
-                preds.append((word, words_score))
+
-            else:
+
-                min_score = preds[0]
+class SimpleBigramNeuralLanguageModel(nn.Module):
-                for score in preds:
+    def __init__(self, vocabulary_size, embedding_size):
-                    if min_score[1] > score[1]:
+        super(SimpleBigramNeuralLanguageModel, self).__init__()
-                        min_score = score
+        self.model = nn.Sequential(
-                if min_score[1] < words_score:
+            nn.Embedding(vocabulary_size, embedding_size),
-                    preds.remove(min_score)
+            nn.Linear(embedding_size, vocabulary_size),
-                    preds.append((word, words_score))
+            nn.Softmax()
-        probs = sorted(preds, key=lambda sc: sc[1], reverse=True)
+        )
-        str_prediction = ''
+
-        for word, prob in probs:
+    def forward(self, x):
-            str_prediction += f'{word}:{prob} '
+        return self.model(x)
-        str_prediction += f':{log10(0.99)}'
+
 def look_ahead_iterator(gen):
    """Yield (previous, current) pairs for consecutive items of ``gen``.

    An item is only used as the first element of a pair when it is not None;
    an input with fewer than two items yields nothing.
    """
    previous = None
    for current in gen:
        if previous is None:
            # Nothing usable to pair with yet: just remember this item.
            previous = current
            continue
        yield previous, current
        previous = current
 class Bigrams(IterableDataset):
    """Iterable dataset of consecutive token-id pairs read from ``text_file``.

    The vocabulary is built from the same file when the dataset is created,
    limited to ``vocabulary_size`` entries, with '<unk>' as the default index.
    """
    def __init__(self, text_file, vocabulary_size):
        self.text_file = text_file
        self.vocabulary_size = vocabulary_size
        vocabulary = build_vocab_from_iterator(
            get_words_lines_from_file(text_file),
            max_tokens=vocabulary_size,
            specials=['<unk>'])
        unknown_index = vocabulary['<unk>']
        vocabulary.set_default_index(unknown_index)
        self.vocab = vocabulary
    def __iter__(self):
        # Flatten the per-line token generators into one stream, map each
        # token to its id, then pair every id with its successor.
        token_stream = itertools.chain.from_iterable(get_words_lines_from_file(self.text_file))
        id_stream = (self.vocab[token] for token in token_stream)
        return look_ahead_iterator(id_stream)
 def train():
    """Run one pass over 'train/in.tsv.xz' and save the weights to 'model1.bin'.

    Progress (step number and current loss) is printed every 100 batches.
    """
    batch_size = 22000
    train_dataset = Bigrams('train/in.tsv.xz', vocab_size)
    device = 'cuda'
    model = SimpleBigramNeuralLanguageModel(vocab_size, embed_size).to(device)
    train_data_loader = DataLoader(train_dataset, batch_size=batch_size)
    optimizer = torch.optim.Adam(model.parameters())
    criterion = torch.nn.NLLLoss()
    model.train()
    for step, (x, y) in enumerate(train_data_loader):
        # Move the batch to the device the model lives on.
        x, y = x.to(device), y.to(device)
        optimizer.zero_grad()
        # The criterion is applied to the logarithm of the model output.
        loss = criterion(torch.log(model(x)), y)
        if step % 100 == 0:
            print(step, loss)
        loss.backward()
        optimizer.step()
    torch.save(model.state_dict(), 'model1.bin')
 def predict():
    """Load 'model1.bin' and print the ten highest-scoring successors of 'for'.

    Each printed entry is a (token, vocabulary index, model output) triple.
    """
    device = 'cuda'
    model = SimpleBigramNeuralLanguageModel(vocab_size, embed_size).to(device)
    state = torch.load('model1.bin')
    model.load_state_dict(state)
    model.eval()
    token_ids = vocab.forward(['for'])
    scores = model(torch.tensor(token_ids).to(device))
    # scores has one row per input token; only a single token was given.
    best = torch.topk(scores[0], 10)
    indices = best.indices.tolist()
    values = best.values.tolist()
    words = vocab.lookup_tokens(indices)
    print(list(zip(words, indices, values)))
 def similar():
    """Load 'model1.bin' and print the ten embeddings most similar to that of 'went'.

    Similarity is the cosine between rows of the embedding matrix; each printed
    entry is a (token, vocabulary index, similarity) triple.
    """
    device = 'cuda'
    model = SimpleBigramNeuralLanguageModel(vocab_size, embed_size).to(device)
    model.load_state_dict(torch.load('model1.bin'))
    model.eval()
    # First layer of the sequential stack is the embedding table.
    embeddings = model.model[0].weight
    query = embeddings[vocab['went']]
    cosine = nn.CosineSimilarity(dim=1, eps=1e-6)
    nearest = torch.topk(cosine(query, embeddings), 10)
    indices = nearest.indices.tolist()
    scores = nearest.values.tolist()
    words = vocab.lookup_tokens(indices)
    print(list(zip(words, indices, scores)))
        return str_prediction
 if __name__ == "__main__":
-    wp = WordPred()
+    # train()
-    # wp.create_train_file("train/in.tsv.xz", "train/in.txt")
+    predict()
    # wp.fill_words("train/in.tsv.xz", "words.txt")
    # wp.read_words("words.txt")
    wp.generate_outputs("dev-0/in.tsv.xz", "dev-0/out3.tsv")
    wp.generate_outputs("test-A/in.tsv.xz", "test-A/out3.tsv")
--- a/run_neu_val.py
+++ b/run_neu_val.py
@ -0,0 +1,196 @@
 from itertools import islice
 import regex as re
 import sys
 from torchtext.vocab import build_vocab_from_iterator
 import lzma
 from torch import nn
 import torch
 from torch.utils.data import IterableDataset
 import itertools
 from torch.utils.data import DataLoader
 import numpy as np
 # def get_words_from_line(file_path):
 #     for index, line in enumerate(get_lines_from_file(file)):
 #         yield '<s>'
 #         for m in re.finditer(r'[\p{L}0-9\*]+|\p{P}+', line):
 #             yield m.group(0).lower()
 #         yield '</s>'
 #         if index == 10000:
 #             break
 def get_words_from_line(line):
    """Yield the lower-cased tokens of ``line`` wrapped in '<s>' and '</s>'.

    A token is either a run of letters, digits and '*' or a run of
    punctuation, as matched by the pattern below.
    """
    stripped = line.rstrip()
    yield '<s>'
    yield from (
        match.group(0).lower()
        for match in re.finditer(r'[\p{L}0-9\*]+|\p{P}+', stripped)
    )
    yield '</s>'
 def get_words_lines_from_file(file_path):
    """Yield one token generator per line of the xz-compressed TSV ``file_path``.

    Columns 6 and 7 of each row are joined, lower-cased and cleaned before
    tokenisation. Reading stops after the line with index 50000.
    """
    with lzma.open(file_path, mode='rt') as file:
        for line_no, raw_line in enumerate(file):
            columns = raw_line.split("\t")
            joined = ' '.join([columns[6], columns[7]])
            # Literal "\n" sequences become spaces; real newlines are dropped.
            joined = joined.replace("\\n", " ").replace("\n", "").lower()
            squeezed = re.sub(' +', ' ', joined)
            cleaned = re.sub(r"[^\w\d'\s]+", '', squeezed)
            yield get_words_from_line(cleaned)
            if line_no == 50000:
                break
 # Vocabulary size; train() passes it to Bigrams and to the model constructor.
 vocab_size = 220
 # Module-level vocabulary construction is disabled in this file; the block is
 # kept for reference.
 # vocab = build_vocab_from_iterator(
 #     get_words_lines_from_file('train/in.tsv.xz'),
 #     max_tokens=vocab_size,
 #     specials=['<unk>'])
 #
 # vocab.set_default_index(vocab['<unk>'])
 # predict() and similar() read this global; None means no module-level
 # vocabulary has been built.
 vocab=None
 # Embedding dimension passed to SimpleBigramNeuralLanguageModel.
 embed_size = 100
 class SimpleBigramNeuralLanguageModel(nn.Module):
    """Bigram language model: embedding -> linear -> softmax over the vocabulary.

    Args:
        vocabulary_size: number of distinct token ids (input and output size).
        embedding_size: dimension of the token embeddings.

    ``forward`` takes a tensor of token ids and returns, for every id, a
    probability distribution over the vocabulary along the last axis.
    """
    def __init__(self, vocabulary_size, embedding_size):
        super().__init__()
        self.model = nn.Sequential(
            nn.Embedding(vocabulary_size, embedding_size),
            nn.Linear(embedding_size, vocabulary_size),
            # Normalise explicitly over the vocabulary axis. Without ``dim``
            # PyTorch warns and guesses the axis from the input rank, which
            # picks axis 0 for 3-D activations.
            nn.Softmax(dim=-1)
        )
    def forward(self, x):
        """Return next-token probabilities for the token ids in ``x``."""
        return self.model(x)
 def look_ahead_iterator(gen):
    """Yield (previous, current) pairs for consecutive items of ``gen``.

    An item is only used as the first element of a pair when it is not None;
    an input with fewer than two items yields nothing.
    """
    previous = None
    for current in gen:
        if previous is None:
            # Nothing usable to pair with yet: just remember this item.
            previous = current
            continue
        yield previous, current
        previous = current
 class Bigrams(IterableDataset):
    """Iterable dataset of consecutive token-id pairs read from ``text_file``.

    The vocabulary is built from the same file when the dataset is created,
    limited to ``vocabulary_size`` entries, with '<unk>' as the default index.
    """
    def __init__(self, text_file, vocabulary_size):
        self.text_file = text_file
        self.vocabulary_size = vocabulary_size
        vocabulary = build_vocab_from_iterator(
            get_words_lines_from_file(text_file),
            max_tokens=vocabulary_size,
            specials=['<unk>'])
        unknown_index = vocabulary['<unk>']
        vocabulary.set_default_index(unknown_index)
        self.vocab = vocabulary
    def __iter__(self):
        # Flatten the per-line token generators into one stream, map each
        # token to its id, then pair every id with its successor.
        token_stream = itertools.chain.from_iterable(get_words_lines_from_file(self.text_file))
        id_stream = (self.vocab[token] for token in token_stream)
        return look_ahead_iterator(id_stream)
 def train():
    """Train the bigram model with per-epoch validation and keep the best weights.

    Reads 'train/in.tsv.xz' for training and 'dev-0/in.tsv.xz' for validation,
    runs ``epochs`` passes over the training data and writes the model state to
    'model1.bin' whenever the summed validation loss drops below the best value
    seen so far. Uses CUDA when available and falls back to the CPU otherwise.
    """
    batch_size = 100000
    epochs = 5
    train_dataset = Bigrams('train/in.tsv.xz', vocab_size)
    valid_dataset = Bigrams('dev-0/in.tsv.xz', vocab_size)
    # Bigrams builds a vocabulary from whatever file it is given. The model is
    # trained on ids from the training vocabulary, so the validation stream
    # must be encoded with that same mapping for its loss to be meaningful.
    valid_dataset.vocab = train_dataset.vocab
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    model = SimpleBigramNeuralLanguageModel(vocab_size, embed_size).to(device)
    train_data_loader = DataLoader(train_dataset, batch_size=batch_size)
    optimizer = torch.optim.Adam(model.parameters())
    criterion = torch.nn.NLLLoss()
    valid_data_loader = DataLoader(valid_dataset, batch_size=batch_size)
    min_valid_loss = np.inf
    for e in range(epochs):
        # Validation switches the model to eval mode, so re-enable training
        # mode at the start of every epoch, not just the first one.
        model.train()
        # Reset per epoch so the reported training loss is not cumulative.
        train_loss = 0.0
        step = 0
        for x, y in train_data_loader:
            # Transfer Data to the training device
            x = x.to(device)
            y = y.to(device)
            # Clear the gradients
            optimizer.zero_grad()
            # Forward Pass
            ypredicted = model(x)
            # The model outputs probabilities; NLLLoss expects log-probabilities.
            loss = criterion(torch.log(ypredicted), y)
            if step % 100 == 0:
                print(step, loss)
            step += 1
            # Calculate gradients
            loss.backward()
            # Update Weights
            optimizer.step()
            # Accumulate the epoch's training loss
            train_loss += loss.item()
        # Validate
        model.eval()
        valid_loss = 0.0
        # No gradients are needed for evaluation.
        with torch.no_grad():
            for x, y in valid_data_loader:
                x = x.to(device)
                y = y.to(device)
                ypredicted = model(x)
                # Same log transform as in training so both losses are comparable.
                loss = criterion(torch.log(ypredicted), y)
                valid_loss += loss.item()
        print(f'Epoch {e + 1} \t\t '
              f'Training Loss: {train_loss} \t\t '
              f'Validation Loss: {valid_loss}')
        if min_valid_loss > valid_loss:
            print(f'Validation Loss Decreased({min_valid_loss:.6f}--->{valid_loss:.6f}) \t Saving The Model')
            min_valid_loss = valid_loss
            # Saving State Dict
            torch.save(model.state_dict(), 'model1.bin')
 def predict():
    """Load 'model1.bin' and print the ten highest-scoring successors of 'for'.

    Each printed entry is a (token, vocabulary index, probability) triple.
    Uses CUDA when available and falls back to the CPU otherwise.
    """
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    # The module-level ``vocab`` may be None; in that case rebuild the
    # vocabulary the same way train() does, so the ids match the saved model.
    vocabulary = vocab if vocab is not None else Bigrams('train/in.tsv.xz', vocab_size).vocab
    model = SimpleBigramNeuralLanguageModel(vocab_size, embed_size).to(device)
    # map_location lets weights saved on a GPU be loaded on a CPU-only machine.
    model.load_state_dict(torch.load('model1.bin', map_location=device))
    model.eval()
    # Inference only: skip building the autograd graph.
    with torch.no_grad():
        ixs = torch.tensor(vocabulary.forward(['for'])).to(device)
        out = model(ixs)
    # out has one row per input token; only a single token was given.
    top = torch.topk(out[0], 10)
    top_indices = top.indices.tolist()
    top_probs = top.values.tolist()
    top_words = vocabulary.lookup_tokens(top_indices)
    print(list(zip(top_words, top_indices, top_probs)))
 def similar():
    """Load 'model1.bin' and print the ten embeddings most similar to that of 'went'.

    Similarity is the cosine between rows of the embedding matrix; each printed
    entry is a (token, vocabulary index, similarity) triple. Uses CUDA when
    available and falls back to the CPU otherwise.
    """
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    # The module-level ``vocab`` may be None; in that case rebuild the
    # vocabulary the same way train() does, so the ids match the saved model.
    vocabulary = vocab if vocab is not None else Bigrams('train/in.tsv.xz', vocab_size).vocab
    model = SimpleBigramNeuralLanguageModel(vocab_size, embed_size).to(device)
    # map_location lets weights saved on a GPU be loaded on a CPU-only machine.
    model.load_state_dict(torch.load('model1.bin', map_location=device))
    model.eval()
    cos = nn.CosineSimilarity(dim=1, eps=1e-6)
    # Inference only: skip building the autograd graph.
    with torch.no_grad():
        # First layer of the sequential stack is the embedding table.
        embeddings = model.model[0].weight
        vec = embeddings[vocabulary['went']]
        similarities = cos(vec, embeddings)
        top = torch.topk(similarities, 10)
    top_indices = top.indices.tolist()
    top_probs = top.values.tolist()
    top_words = vocabulary.lookup_tokens(top_indices)
    print(list(zip(top_words, top_indices, top_probs)))
 if __name__ == "__main__":
    # Script entry point: run training; the prediction demo is switched off.
    train()
    # predict()