From fd03c9369f09620941b6bd9db6e994a4ff924b10 Mon Sep 17 00:00:00 2001
From: Jan Nowak <95jan.nowak@gmail.com>
Date: Sat, 7 May 2022 00:08:59 +0200
Subject: [PATCH] Neural bigram language model, with and without validation.
---
run.py | 233 +++++++++++++++++++++++++++++++------------------
run_neu_val.py | 196 +++++++++++++++++++++++++++++++++++++++++
2 files changed, 345 insertions(+), 84 deletions(-)
create mode 100644 run_neu_val.py
diff --git a/run.py b/run.py
index 519f042..7632b42 100644
--- a/run.py
+++ b/run.py
@@ -1,98 +1,163 @@
-import pandas as pd
-import csv
-from collections import Counter, defaultdict
-from nltk.tokenize import RegexpTokenizer
-from nltk import trigrams
+from itertools import islice
import regex as re
+import sys
+from torchtext.vocab import build_vocab_from_iterator
import lzma
-import kenlm
-from math import log10
-from english_words import english_words_set
+from torch import nn
+import torch
+from torch.utils.data import IterableDataset
+import itertools
+from torch.utils.data import DataLoader
+import numpy as np
-class WordPred:
- def __init__(self):
- self.tokenizer = RegexpTokenizer(r"\w+")
- # self.model = defaultdict(lambda: defaultdict(lambda: 0))
- self.model = kenlm.Model("model.binary")
- self.words = set()
+# def get_words_from_line(file_path):
+# for index, line in enumerate(get_lines_from_file(file)):
+# yield '<s>'
+# for m in re.finditer(r'[\p{L}0-9\*]+|\p{P}+', line):
+# yield m.group(0).lower()
+# yield '</s>'
+# if index == 10000:
+# break
- def read_file(self, file):
- for line in file:
+
+def get_words_from_line(line):
+ line = line.rstrip()
+ yield '<s>'
+ for m in re.finditer(r'[\p{L}0-9\*]+|\p{P}+', line):
+ yield m.group(0).lower()
+ yield '</s>'
+
+
+def get_words_lines_from_file(file_path):
+ with lzma.open(file_path, mode='rt') as file:
+ for index, line in enumerate(file):
text = line.split("\t")
- yield re.sub(r"[^\w\d'\s]+", '',
- re.sub(' +', ' ', ' '.join([text[6], text[7]]).replace("\\n", " ").replace("\n", "").lower()))
-
- def read_file_7(self, file):
- for line in file:
- text = line.split("\t")
- yield re.sub(r"[^\w\d'\s]+", '', re.sub(' +', ' ', text[7].replace("\\n", " ").replace("\n", "").lower()))
-
- def fill_words(self, file_path, output_file):
- with open(output_file, 'w') as out:
- with lzma.open(file_path, mode='rt') as file:
- for text in self.read_file(file):
- for mword in text.split(" "):
- if mword not in self.words:
- out.write(mword + "\n")
- self.words.add(mword)
-
- def read_words(self, file_path):
- with open(file_path, 'r') as fin:
- for word in fin.readlines():
- word = word.replace("\n", "")
- if word:
- self.words.add(word)
+ yield get_words_from_line(re.sub(r"[^\w\d'\s]+", '', re.sub(' +', ' ', ' '.join([text[6], text[7]]).replace("\\n", " ").replace("\n", "").lower())))
+ if index == 50000:
+ break
- def create_train_file(self, file_path, output_path, rows=10000):
- with open(output_path, 'w') as outputfile:
- with lzma.open(file_path, mode='rt') as file:
- for index, text in enumerate(self.read_file(file)):
- outputfile.write(text)
- if index == rows:
- break
- outputfile.close()
+vocab_size = 20000
- def generate_outputs(self, input_file, output_file):
- with open(output_file, 'w') as outputf:
- with lzma.open(input_file, mode='rt') as file:
- for index, text in enumerate(self.read_file_7(file)):
- tokens = self.tokenizer.tokenize(text)
- if len(tokens) < 4:
- prediction = 'the:0.2 be:0.2 to:0.2 of:0.1 and:0.1 a:0.1 :0.1'
- else:
- prediction = wp.predict_probs(tokens[0], tokens[1])
- outputf.write(prediction + '\n')
+vocab = build_vocab_from_iterator(
+ get_words_lines_from_file('train/in.tsv.xz'),
+ max_tokens=vocab_size,
+ specials=['<unk>'])
- def predict_probs(self, word1, word2):
- preds = []
- for word in english_words_set:
- sentence = word1 + ' ' + word + ' ' + word2
- words_score = self.model.score(sentence, bos=False, eos=False)
+vocab.set_default_index(vocab['<unk>'])
+# vocab=None
- if len(preds) < 12:
- preds.append((word, words_score))
- else:
- min_score = preds[0]
- for score in preds:
- if min_score[1] > score[1]:
- min_score = score
- if min_score[1] < words_score:
- preds.remove(min_score)
- preds.append((word, words_score))
- probs = sorted(preds, key=lambda sc: sc[1], reverse=True)
- str_prediction = ''
- for word, prob in probs:
- str_prediction += f'{word}:{prob} '
- str_prediction += f':{log10(0.99)}'
+embed_size = 100
+
+
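+# Simple bigram LM: embed the previous word, project the embedding back to
+# vocabulary size, and apply a softmax so the output is a probability
+# distribution over the next word.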
+class SimpleBigramNeuralLanguageModel(nn.Module):
+ def __init__(self, vocabulary_size, embedding_size):
+ super(SimpleBigramNeuralLanguageModel, self).__init__()
+ self.model = nn.Sequential(
+ nn.Embedding(vocabulary_size, embedding_size),
+ nn.Linear(embedding_size, vocabulary_size),
+ nn.Softmax(dim=1)
+ )
+
+ def forward(self, x):
+ return self.model(x)
+
+
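+# Turns a stream of tokens into a stream of consecutive (previous, current)
+# pairs, i.e. the bigrams the model is trained on.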
+def look_ahead_iterator(gen):
+ prev = None
+ for item in gen:
+ if prev is not None:
+ yield (prev, item)
+ prev = item
+
+
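+# IterableDataset that streams bigram index pairs straight from the
+# compressed corpus, so the data never has to fit in memory at once.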
+class Bigrams(IterableDataset):
+ def __init__(self, text_file, vocabulary_size):
+ self.vocab = build_vocab_from_iterator(
+ get_words_lines_from_file(text_file),
+ max_tokens=vocabulary_size,
+ specials=['<unk>'])
+ self.vocab.set_default_index(self.vocab['<unk>'])
+ self.vocabulary_size = vocabulary_size
+ self.text_file = text_file
+
+ def __iter__(self):
+ return look_ahead_iterator(
+ (self.vocab[t] for t in itertools.chain.from_iterable(get_words_lines_from_file(self.text_file))))
+
+
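+# Fits the bigram model with Adam; NLLLoss applied to the log of the softmax
+# output is equivalent to cross-entropy over the vocabulary.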
+def train():
+ batch_size = 22000
+
+ train_dataset = Bigrams('train/in.tsv.xz', vocab_size)
+
+ device = 'cuda'
+ model = SimpleBigramNeuralLanguageModel(vocab_size, embed_size).to(device)
+ train_data_loader = DataLoader(train_dataset, batch_size=batch_size)
+ optimizer = torch.optim.Adam(model.parameters())
+ criterion = torch.nn.NLLLoss()
+
+ model.train()
+ step = 0
+ for x, y in train_data_loader:
+ # Transfer Data to GPU
+ x = x.to(device)
+ y = y.to(device)
+ # Clear the gradients
+ optimizer.zero_grad()
+ # Forward Pass
+ ypredicted = model(x)
+ # Find the Loss
+ loss = criterion(torch.log(ypredicted), y)
+ if step % 100 == 0:
+ print(step, loss)
+ step += 1
+ # Calculate gradients
+ loss.backward()
+ # Update Weights
+ optimizer.step()
+ torch.save(model.state_dict(), 'model1.bin')
+
+
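+# Loads the saved model and prints the 10 most probable successors of the
+# word 'for'.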
+def predict():
+ device = 'cuda'
+ model = SimpleBigramNeuralLanguageModel(vocab_size, embed_size).to(device)
+ model.load_state_dict(torch.load('model1.bin'))
+ model.eval()
+
+ ixs = torch.tensor(vocab.forward(['for'])).to(device)
+
+ out = model(ixs)
+ top = torch.topk(out[0], 10)
+ top_indices = top.indices.tolist()
+ top_probs = top.values.tolist()
+ top_words = vocab.lookup_tokens(top_indices)
+ print(list(zip(top_words, top_indices, top_probs)))
+
+
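+# Loads the saved model and prints the 10 words whose embeddings are most
+# cosine-similar to the embedding of 'went'.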
+def similar():
+ device = 'cuda'
+ model = SimpleBigramNeuralLanguageModel(vocab_size, embed_size).to(device)
+ model.load_state_dict(torch.load('model1.bin'))
+ model.eval()
+
+ cos = nn.CosineSimilarity(dim=1, eps=1e-6)
+
+ embeddings = model.model[0].weight
+
+ vec = embeddings[vocab['went']]
+
+ similarities = cos(vec, embeddings)
+
+ top = torch.topk(similarities, 10)
+
+ top_indices = top.indices.tolist()
+ top_probs = top.values.tolist()
+ top_words = vocab.lookup_tokens(top_indices)
+ print(list(zip(top_words, top_indices, top_probs)))
- return str_prediction
if __name__ == "__main__":
- wp = WordPred()
- # wp.create_train_file("train/in.tsv.xz", "train/in.txt")
- # wp.fill_words("train/in.tsv.xz", "words.txt")
- # wp.read_words("words.txt")
- wp.generate_outputs("dev-0/in.tsv.xz", "dev-0/out3.tsv")
- wp.generate_outputs("test-A/in.tsv.xz", "test-A/out3.tsv")
+ # train()
+ predict()
diff --git a/run_neu_val.py b/run_neu_val.py
new file mode 100644
index 0000000..7f9ffde
--- /dev/null
+++ b/run_neu_val.py
@@ -0,0 +1,196 @@
+from itertools import islice
+import regex as re
+import sys
+from torchtext.vocab import build_vocab_from_iterator
+import lzma
+from torch import nn
+import torch
+from torch.utils.data import IterableDataset
+import itertools
+from torch.utils.data import DataLoader
+import numpy as np
+
+
+# def get_words_from_line(file_path):
+# for index, line in enumerate(get_lines_from_file(file)):
+# yield '<s>'
+# for m in re.finditer(r'[\p{L}0-9\*]+|\p{P}+', line):
+# yield m.group(0).lower()
+# yield '</s>'
+# if index == 10000:
+# break
+
+
+def get_words_from_line(line):
+ line = line.rstrip()
+ yield '<s>'
+ for m in re.finditer(r'[\p{L}0-9\*]+|\p{P}+', line):
+ yield m.group(0).lower()
+ yield '</s>'
+
+
+def get_words_lines_from_file(file_path):
+ with lzma.open(file_path, mode='rt') as file:
+ for index, line in enumerate(file):
+ text = line.split("\t")
+ yield get_words_from_line(re.sub(r"[^\w\d'\s]+", '', re.sub(' +', ' ', ' '.join([text[6], text[7]]).replace("\\n", " ").replace("\n", "").lower())))
+ if index == 50000:
+ break
+
+
+vocab_size = 220
+
+# vocab = build_vocab_from_iterator(
+# get_words_lines_from_file('train/in.tsv.xz'),
+# max_tokens=vocab_size,
+# specials=['<unk>'])
+#
+# vocab.set_default_index(vocab['<unk>'])
+vocab=None
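+# The module-level vocab is only needed by predict() and similar();
+# train() builds its own vocabulary inside the Bigrams dataset.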
+
+embed_size = 100
+
+
+class SimpleBigramNeuralLanguageModel(nn.Module):
+ def __init__(self, vocabulary_size, embedding_size):
+ super(SimpleBigramNeuralLanguageModel, self).__init__()
+ self.model = nn.Sequential(
+ nn.Embedding(vocabulary_size, embedding_size),
+ nn.Linear(embedding_size, vocabulary_size),
+ nn.Softmax(dim=1)
+ )
+
+ def forward(self, x):
+ return self.model(x)
+
+
+def look_ahead_iterator(gen):
+ prev = None
+ for item in gen:
+ if prev is not None:
+ yield (prev, item)
+ prev = item
+
+
+class Bigrams(IterableDataset):
+ def __init__(self, text_file, vocabulary_size):
+ self.vocab = build_vocab_from_iterator(
+ get_words_lines_from_file(text_file),
+ max_tokens=vocabulary_size,
+ specials=['<unk>'])
+ self.vocab.set_default_index(self.vocab['<unk>'])
+ self.vocabulary_size = vocabulary_size
+ self.text_file = text_file
+
+ def __iter__(self):
+ return look_ahead_iterator(
+ (self.vocab[t] for t in itertools.chain.from_iterable(get_words_lines_from_file(self.text_file))))
+
+
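+# Same training loop as in run.py, but with a validation pass over dev-0
+# after every epoch; the model is checkpointed whenever the validation loss
+# improves.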
+def train():
+
+ batch_size = 100000
+ epochs = 5
+
+ train_dataset = Bigrams('train/in.tsv.xz', vocab_size)
+ valid_dataset = Bigrams('dev-0/in.tsv.xz', vocab_size)
+
+ device = 'cuda'
+ model = SimpleBigramNeuralLanguageModel(vocab_size, embed_size).to(device)
+ train_data_loader = DataLoader(train_dataset, batch_size=batch_size)
+ optimizer = torch.optim.Adam(model.parameters())
+ criterion = torch.nn.NLLLoss()
+
+ valid_data_loader = DataLoader(valid_dataset, batch_size=batch_size)
+
+ min_valid_loss = np.inf
+ for e in range(epochs):
+ # Reset to training mode and zero the running loss at the start of each epoch
+ model.train()
+ train_loss = 0.0
+ step = 0
+ for x, y in train_data_loader:
+ # Transfer Data to GPU
+ x = x.to(device)
+ y = y.to(device)
+ # Clear the gradients
+ optimizer.zero_grad()
+ # Forward Pass
+ ypredicted = model(x)
+ # Find the Loss
+ loss = criterion(torch.log(ypredicted), y)
+ if step % 100 == 0:
+ print(step, loss)
+ step += 1
+ # Calculate gradients
+ loss.backward()
+ # Update Weights
+ optimizer.step()
+ # Calculate Loss
+ train_loss += loss.item()
+
+ # Validate
+ model.eval()
+ valid_loss = 0.0
+ for x, y in valid_data_loader:
+ # Transfer Data to GPU
+ x = x.to(device)
+ y = y.to(device)
+ # Forward Pass
+ ypredicted = model(x)
+ # Find the Loss (log of the softmax output, as in training)
+ loss = criterion(torch.log(ypredicted), y)
+ # Calculate Loss
+ valid_loss += loss.item()
+
+ print(f'Epoch {e + 1} \t\t '
+ f'Training Loss: {train_loss} \t\t '
+ f'Validation Loss: {valid_loss}')
+
+ if min_valid_loss > valid_loss:
+ print(f'Validation Loss Decreased({min_valid_loss:.6f}--->{valid_loss:.6f}) \t Saving The Model')
+ min_valid_loss = valid_loss
+ # Saving State Dict
+ torch.save(model.state_dict(), 'model1.bin')
+
+
+def predict():
+ device = 'cuda'
+ model = SimpleBigramNeuralLanguageModel(vocab_size, embed_size).to(device)
+ model.load_state_dict(torch.load('model1.bin'))
+ model.eval()
+
+ ixs = torch.tensor(vocab.forward(['for'])).to(device)
+
+ out = model(ixs)
+ top = torch.topk(out[0], 10)
+ top_indices = top.indices.tolist()
+ top_probs = top.values.tolist()
+ top_words = vocab.lookup_tokens(top_indices)
+ print(list(zip(top_words, top_indices, top_probs)))
+
+
+def similar():
+ device = 'cuda'
+ model = SimpleBigramNeuralLanguageModel(vocab_size, embed_size).to(device)
+ model.load_state_dict(torch.load('model1.bin'))
+ model.eval()
+
+ cos = nn.CosineSimilarity(dim=1, eps=1e-6)
+
+ embeddings = model.model[0].weight
+
+ vec = embeddings[vocab['went']]
+
+ similarities = cos(vec, embeddings)
+
+ top = torch.topk(similarities, 10)
+
+ top_indices = top.indices.tolist()
+ top_probs = top.values.tolist()
+ top_words = vocab.lookup_tokens(top_indices)
+ print(list(zip(top_words, top_indices, top_probs)))
+
+
+if __name__ == "__main__":
+ train()
+ # predict()