From fd03c9369f09620941b6bd9db6e994a4ff924b10 Mon Sep 17 00:00:00 2001
From: Jan Nowak <95jan.nowak@gmail.com>
Date: Sat, 7 May 2022 00:08:59 +0200
Subject: [PATCH] Neural bigram language model, with and without validation.
---
run.py | 233 +++++++++++++++++++++++++++++++------------------
run_neu_val.py | 196 +++++++++++++++++++++++++++++++++++++++++
2 files changed, 345 insertions(+), 84 deletions(-)
create mode 100644 run_neu_val.py
diff --git a/run.py b/run.py
index 519f042..7632b42 100644
--- a/run.py
+++ b/run.py
@@ -1,98 +1,163 @@
-import pandas as pd
-import csv
-from collections import Counter, defaultdict
-from nltk.tokenize import RegexpTokenizer
-from nltk import trigrams
+from itertools import islice
import regex as re
+import sys
+from torchtext.vocab import build_vocab_from_iterator
import lzma
-import kenlm
-from math import log10
-from english_words import english_words_set
+from torch import nn
+import torch
+from torch.utils.data import IterableDataset
+import itertools
+from torch.utils.data import DataLoader
+import numpy as np
-class WordPred:
- def __init__(self):
- self.tokenizer = RegexpTokenizer(r"\w+")
- # self.model = defaultdict(lambda: defaultdict(lambda: 0))
- self.model = kenlm.Model("model.binary")
- self.words = set()
+# def get_words_from_line(file_path):
+# for index, line in enumerate(get_lines_from_file(file)):
+# yield '<s>'
+# for m in re.finditer(r'[\p{L}0-9\*]+|\p{P}+', line):
+# yield m.group(0).lower()
+# yield '</s>'
+# if index == 10000:
+# break
- def read_file(self, file):
- for line in file:
+
+def get_words_from_line(line):
+ line = line.rstrip()
+ yield '<s>'
+ for m in re.finditer(r'[\p{L}0-9\*]+|\p{P}+', line):
+ yield m.group(0).lower()
+ yield '</s>'
+
+
+def get_words_lines_from_file(file_path):
+ with lzma.open(file_path, mode='rt') as file:
+ for index, line in enumerate(file):
text = line.split("\t")
- yield re.sub(r"[^\w\d'\s]+", '',
- re.sub(' +', ' ', ' '.join([text[6], text[7]]).replace("\\n", " ").replace("\n", "").lower()))
-
- def read_file_7(self, file):
- for line in file:
- text = line.split("\t")
- yield re.sub(r"[^\w\d'\s]+", '', re.sub(' +', ' ', text[7].replace("\\n", " ").replace("\n", "").lower()))
-
- def fill_words(self, file_path, output_file):
- with open(output_file, 'w') as out:
- with lzma.open(file_path, mode='rt') as file:
- for text in self.read_file(file):
- for mword in text.split(" "):
- if mword not in self.words:
- out.write(mword + "\n")
- self.words.add(mword)
-
- def read_words(self, file_path):
- with open(file_path, 'r') as fin:
- for word in fin.readlines():
- word = word.replace("\n", "")
- if word:
- self.words.add(word)
+ yield get_words_from_line(re.sub(r"[^\w\d'\s]+", '', re.sub(' +', ' ', ' '.join([text[6], text[7]]).replace("\\n", " ").replace("\n", "").lower())))
+ if index == 50000:
+ break
- def create_train_file(self, file_path, output_path, rows=10000):
- with open(output_path, 'w') as outputfile:
- with lzma.open(file_path, mode='rt') as file:
- for index, text in enumerate(self.read_file(file)):
- outputfile.write(text)
- if index == rows:
- break
- outputfile.close()
+vocab_size = 20000
- def generate_outputs(self, input_file, output_file):
- with open(output_file, 'w') as outputf:
- with lzma.open(input_file, mode='rt') as file:
- for index, text in enumerate(self.read_file_7(file)):
- tokens = self.tokenizer.tokenize(text)
- if len(tokens) < 4:
- prediction = 'the:0.2 be:0.2 to:0.2 of:0.1 and:0.1 a:0.1 :0.1'
- else:
- prediction = wp.predict_probs(tokens[0], tokens[1])
- outputf.write(prediction + '\n')
+vocab = build_vocab_from_iterator(
+ get_words_lines_from_file('train/in.tsv.xz'),
+ max_tokens=vocab_size,
+ specials=['<unk>'])
- def predict_probs(self, word1, word2):
- preds = []
- for word in english_words_set:
- sentence = word1 + ' ' + word + ' ' + word2
- words_score = self.model.score(sentence, bos=False, eos=False)
+vocab.set_default_index(vocab['<unk>'])
+# vocab=None
- if len(preds) < 12:
- preds.append((word, words_score))
- else:
- min_score = preds[0]
- for score in preds:
- if min_score[1] > score[1]:
- min_score = score
- if min_score[1] < words_score:
- preds.remove(min_score)
- preds.append((word, words_score))
- probs = sorted(preds, key=lambda sc: sc[1], reverse=True)
- str_prediction = ''
- for word, prob in probs:
- str_prediction += f'{word}:{prob} '
- str_prediction += f':{log10(0.99)}'
+embed_size = 100
+
+
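+# Simple bigram LM: embed the previous word, project the embedding back to
+# vocabulary size, and apply a softmax so the output is a probability
+# distribution over the next word.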
+class SimpleBigramNeuralLanguageModel(nn.Module):
+ def __init__(self, vocabulary_size, embedding_size):
+ super(SimpleBigramNeuralLanguageModel, self).__init__()
+ self.model = nn.Sequential(
+ nn.Embedding(vocabulary_size, embedding_size),
+ nn.Linear(embedding_size, vocabulary_size),
+ nn.Softmax(dim=1)
+ )
+
+ def forward(self, x):
+ return self.model(x)
+
+
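+# Turns a stream of tokens into a stream of consecutive (previous, current)
+# pairs, i.e. the bigrams the model is trained on.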
+def look_ahead_iterator(gen):
+ prev = None
+ for item in gen:
+ if prev is not None:
+ yield (prev, item)
+ prev = item
+
+
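+# IterableDataset that streams bigram index pairs straight from the
+# compressed corpus, so the data never has to fit in memory at once.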
+class Bigrams(IterableDataset):
+ def __init__(self, text_file, vocabulary_size):
+ self.vocab = build_vocab_from_iterator(
+ get_words_lines_from_file(text_file),
+ max_tokens=vocabulary_size,
+ specials=['<unk>'])
+ self.vocab.set_default_index(self.vocab['<unk>'])
+ self.vocabulary_size = vocabulary_size
+ self.text_file = text_file
+
+ def __iter__(self):
+ return look_ahead_iterator(
+ (self.vocab[t] for t in itertools.chain.from_iterable(get_words_lines_from_file(self.text_file))))
+
+
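+# Fits the bigram model with Adam; NLLLoss applied to the log of the softmax
+# output is equivalent to cross-entropy over the vocabulary.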
+def train():
+ batch_size = 22000
+
+ train_dataset = Bigrams('train/in.tsv.xz', vocab_size)
+
+ device = 'cuda'
+ model = SimpleBigramNeuralLanguageModel(vocab_size, embed_size).to(device)
+ train_data_loader = DataLoader(train_dataset, batch_size=batch_size)
+ optimizer = torch.optim.Adam(model.parameters())
+ criterion = torch.nn.NLLLoss()
+
+ model.train()
+ step = 0
+ for x, y in train_data_loader:
+ # Transfer Data to GPU
+ x = x.to(device)
+ y = y.to(device)
+ # Clear the gradients
+ optimizer.zero_grad()
+ # Forward Pass
+ ypredicted = model(x)
+ # Find the Loss
+ loss = criterion(torch.log(ypredicted), y)
+ if step % 100 == 0:
+ print(step, loss)
+ step += 1
+ # Calculate gradients
+ loss.backward()
+ # Update Weights
+ optimizer.step()
+ torch.save(model.state_dict(), 'model1.bin')
+
+
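+# Loads the saved model and prints the 10 most probable successors of the
+# word 'for'.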
+def predict():
+ device = 'cuda'
+ model = SimpleBigramNeuralLanguageModel(vocab_size, embed_size).to(device)
+ model.load_state_dict(torch.load('model1.bin'))
+ model.eval()
+
+ ixs = torch.tensor(vocab.forward(['for'])).to(device)
+
+ out = model(ixs)
+ top = torch.topk(out[0], 10)
+ top_indices = top.indices.tolist()
+ top_probs = top.values.tolist()
+ top_words = vocab.lookup_tokens(top_indices)
+ print(list(zip(top_words, top_indices, top_probs)))
+
+
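+# Loads the saved model and prints the 10 words whose embeddings are most
+# cosine-similar to the embedding of 'went'.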
+def similar():
+ device = 'cuda'
+ model = SimpleBigramNeuralLanguageModel(vocab_size, embed_size).to(device)
+ model.load_state_dict(torch.load('model1.bin'))
+ model.eval()
+
+ cos = nn.CosineSimilarity(dim=1, eps=1e-6)
+
+ embeddings = model.model[0].weight
+
+ vec = embeddings[vocab['went']]
+
+ similarities = cos(vec, embeddings)
+
+ top = torch.topk(similarities, 10)
+
+ top_indices = top.indices.tolist()
+ top_probs = top.values.tolist()
+ top_words = vocab.lookup_tokens(top_indices)
+ print(list(zip(top_words, top_indices, top_probs)))
- return str_prediction
if __name__ == "__main__":
- wp = WordPred()
- # wp.create_train_file("train/in.tsv.xz", "train/in.txt")
- # wp.fill_words("train/in.tsv.xz", "words.txt")
- # wp.read_words("words.txt")
- wp.generate_outputs("dev-0/in.tsv.xz", "dev-0/out3.tsv")
- wp.generate_outputs("test-A/in.tsv.xz", "test-A/out3.tsv")
+ # train()
+ predict()
diff --git a/run_neu_val.py b/run_neu_val.py
new file mode 100644
index 0000000..7f9ffde
--- /dev/null
+++ b/run_neu_val.py
@@ -0,0 +1,196 @@
+from itertools import islice
+import regex as re
+import sys
+from torchtext.vocab import build_vocab_from_iterator
+import lzma
+from torch import nn
+import torch
+from torch.utils.data import IterableDataset
+import itertools
+from torch.utils.data import DataLoader
+import numpy as np
+
+
+# def get_words_from_line(file_path):
+# for index, line in enumerate(get_lines_from_file(file)):
+# yield '<s>'
+# for m in re.finditer(r'[\p{L}0-9\*]+|\p{P}+', line):
+# yield m.group(0).lower()
+# yield '</s>'
+# if index == 10000:
+# break
+
+
+def get_words_from_line(line):
+ line = line.rstrip()
+ yield '<s>'
+ for m in re.finditer(r'[\p{L}0-9\*]+|\p{P}+', line):
+ yield m.group(0).lower()
+ yield '</s>'
+
+
+def get_words_lines_from_file(file_path):
+ with lzma.open(file_path, mode='rt') as file:
+ for index, line in enumerate(file):
+ text = line.split("\t")
+ yield get_words_from_line(re.sub(r"[^\w\d'\s]+", '', re.sub(' +', ' ', ' '.join([text[6], text[7]]).replace("\\n", " ").replace("\n", "").lower())))
+ if index == 50000:
+ break
+
+
+vocab_size = 220
+
+# vocab = build_vocab_from_iterator(
+# get_words_lines_from_file('train/in.tsv.xz'),
+# max_tokens=vocab_size,
+# specials=['<unk>'])
+#
+# vocab.set_default_index(vocab['<unk>'])
+vocab=None
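+# The module-level vocab is only needed by predict() and similar();
+# train() builds its own vocabulary inside the Bigrams dataset.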
+
+embed_size = 100
+
+
+class SimpleBigramNeuralLanguageModel(nn.Module):
+ def __init__(self, vocabulary_size, embedding_size):
+ super(SimpleBigramNeuralLanguageModel, self).__init__()
+ self.model = nn.Sequential(
+ nn.Embedding(vocabulary_size, embedding_size),
+ nn.Linear(embedding_size, vocabulary_size),
+ nn.Softmax(dim=1)
+ )
+
+ def forward(self, x):
+ return self.model(x)
+
+
+def look_ahead_iterator(gen):
+ prev = None
+ for item in gen:
+ if prev is not None:
+ yield (prev, item)
+ prev = item
+
+
+class Bigrams(IterableDataset):
+ def __init__(self, text_file, vocabulary_size):
+ self.vocab = build_vocab_from_iterator(
+ get_words_lines_from_file(text_file),
+ max_tokens=vocabulary_size,
+ specials=['<unk>'])
+ self.vocab.set_default_index(self.vocab['<unk>'])
+ self.vocabulary_size = vocabulary_size
+ self.text_file = text_file
+
+ def __iter__(self):
+ return look_ahead_iterator(
+ (self.vocab[t] for t in itertools.chain.from_iterable(get_words_lines_from_file(self.text_file))))
+
+
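+# Same training loop as in run.py, but with a validation pass over dev-0
+# after every epoch; the model is checkpointed whenever the validation loss
+# improves.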
+def train():
+
+ batch_size = 100000
+ epochs = 5
+
+ train_dataset = Bigrams('train/in.tsv.xz', vocab_size)
+ valid_dataset = Bigrams('dev-0/in.tsv.xz', vocab_size)
+
+ device = 'cuda'
+ model = SimpleBigramNeuralLanguageModel(vocab_size, embed_size).to(device)
+ train_data_loader = DataLoader(train_dataset, batch_size=batch_size)
+ optimizer = torch.optim.Adam(model.parameters())
+ criterion = torch.nn.NLLLoss()
+
+ valid_data_loader = DataLoader(valid_dataset, batch_size=batch_size)
+
+ min_valid_loss = np.inf
+ for e in range(epochs):
+ # Reset to training mode and zero the running loss at the start of each epoch
+ model.train()
+ train_loss = 0.0
+ step = 0
+ for x, y in train_data_loader:
+ # Transfer Data to GPU
+ x = x.to(device)
+ y = y.to(device)
+ # Clear the gradients
+ optimizer.zero_grad()
+ # Forward Pass
+ ypredicted = model(x)
+ # Find the Loss
+ loss = criterion(torch.log(ypredicted), y)
+ if step % 100 == 0:
+ print(step, loss)
+ step += 1
+ # Calculate gradients
+ loss.backward()
+ # Update Weights
+ optimizer.step()
+ # Calculate Loss
+ train_loss += loss.item()
+
+ # Validate
+ model.eval()
+ valid_loss = 0.0
+ for x, y in valid_data_loader:
+ # Transfer Data to GPU
+ x = x.to(device)
+ y = y.to(device)
+ # Forward Pass
+ ypredicted = model(x)
+ # Find the Loss (log of the softmax output, as in training)
+ loss = criterion(torch.log(ypredicted), y)
+ # Calculate Loss
+ valid_loss += loss.item()
+
+ print(f'Epoch {e + 1} \t\t '
+ f'Training Loss: {train_loss} \t\t '
+ f'Validation Loss: {valid_loss}')
+
+ if min_valid_loss > valid_loss:
+ print(f'Validation Loss Decreased({min_valid_loss:.6f}--->{valid_loss:.6f}) \t Saving The Model')
+ min_valid_loss = valid_loss
+ # Saving State Dict
+ torch.save(model.state_dict(), 'model1.bin')
+
+
+def predict():
+ device = 'cuda'
+ model = SimpleBigramNeuralLanguageModel(vocab_size, embed_size).to(device)
+ model.load_state_dict(torch.load('model1.bin'))
+ model.eval()
+
+ ixs = torch.tensor(vocab.forward(['for'])).to(device)
+
+ out = model(ixs)
+ top = torch.topk(out[0], 10)
+ top_indices = top.indices.tolist()
+ top_probs = top.values.tolist()
+ top_words = vocab.lookup_tokens(top_indices)
+ print(list(zip(top_words, top_indices, top_probs)))
+
+
+def similar():
+ device = 'cuda'
+ model = SimpleBigramNeuralLanguageModel(vocab_size, embed_size).to(device)
+ model.load_state_dict(torch.load('model1.bin'))
+ model.eval()
+
+ cos = nn.CosineSimilarity(dim=1, eps=1e-6)
+
+ embeddings = model.model[0].weight
+
+ vec = embeddings[vocab['went']]
+
+ similarities = cos(vec, embeddings)
+
+ top = torch.topk(similarities, 10)
+
+ top_indices = top.indices.tolist()
+ top_probs = top.values.tolist()
+ top_words = vocab.lookup_tokens(top_indices)
+ print(list(zip(top_words, top_indices, top_probs)))
+
+
+if __name__ == "__main__":
+ train()
+ # predict()