Neural bigram with/out validation.

Jan Nowak 2022-05-07 00:08:59 +02:00
parent 43036240f0
commit fd03c9369f
2 changed files with 345 additions and 84 deletions

run.py

@@ -1,98 +1,163 @@
import pandas as pd
import csv
from collections import Counter, defaultdict
from nltk.tokenize import RegexpTokenizer
from nltk import trigrams
from itertools import islice
import regex as re
import sys
from torchtext.vocab import build_vocab_from_iterator
import lzma
import kenlm
from math import log10
from english_words import english_words_set
from torch import nn
import torch
from torch.utils.data import IterableDataset
import itertools
from torch.utils.data import DataLoader
import numpy as np

class WordPred:

    def __init__(self):
        self.tokenizer = RegexpTokenizer(r"\w+")
        # self.model = defaultdict(lambda: defaultdict(lambda: 0))
        self.model = kenlm.Model("model.binary")
        self.words = set()

    # def get_words_from_line(file_path):
    #     for index, line in enumerate(get_lines_from_file(file)):
    #         yield '<s>'
    #         for m in re.finditer(r'[\p{L}0-9\*]+|\p{P}+', line):
    #             yield m.group(0).lower()
    #         yield '</s>'
    #         if index == 10000:
    #             break

    def read_file(self, file):
        for line in file:
            text = line.split("\t")
            yield re.sub(r"[^\w\d'\s]+", '',
                         re.sub(' +', ' ', ' '.join([text[6], text[7]]).replace("\\n", " ").replace("\n", "").lower()))

    def read_file_7(self, file):
        for line in file:
            text = line.split("\t")
            yield re.sub(r"[^\w\d'\s]+", '', re.sub(' +', ' ', text[7].replace("\\n", " ").replace("\n", "").lower()))

    def fill_words(self, file_path, output_file):
        with open(output_file, 'w') as out:
            with lzma.open(file_path, mode='rt') as file:
                for text in self.read_file(file):
                    for mword in text.split(" "):
                        if mword not in self.words:
                            out.write(mword + "\n")
                            self.words.add(mword)

    def read_words(self, file_path):
        with open(file_path, 'r') as fin:
            for word in fin.readlines():
                word = word.replace("\n", "")
                if word:
                    self.words.add(word)

    def create_train_file(self, file_path, output_path, rows=10000):
        with open(output_path, 'w') as outputfile:
            with lzma.open(file_path, mode='rt') as file:
                for index, text in enumerate(self.read_file(file)):
                    outputfile.write(text)
                    if index == rows:
                        break
            outputfile.close()

    def generate_outputs(self, input_file, output_file):
        with open(output_file, 'w') as outputf:
            with lzma.open(input_file, mode='rt') as file:
                for index, text in enumerate(self.read_file_7(file)):
                    tokens = self.tokenizer.tokenize(text)
                    if len(tokens) < 4:
                        prediction = 'the:0.2 be:0.2 to:0.2 of:0.1 and:0.1 a:0.1 :0.1'
                    else:
                        prediction = self.predict_probs(tokens[0], tokens[1])
                    outputf.write(prediction + '\n')

    def predict_probs(self, word1, word2):
        preds = []
        for word in english_words_set:
            sentence = word1 + ' ' + word + ' ' + word2
            words_score = self.model.score(sentence, bos=False, eos=False)
            if len(preds) < 12:
                preds.append((word, words_score))
            else:
                min_score = preds[0]
                for score in preds:
                    if min_score[1] > score[1]:
                        min_score = score
                if min_score[1] < words_score:
                    preds.remove(min_score)
                    preds.append((word, words_score))
        probs = sorted(preds, key=lambda sc: sc[1], reverse=True)
        str_prediction = ''
        for word, prob in probs:
            str_prediction += f'{word}:{prob} '
        str_prediction += f':{log10(0.99)}'
        return str_prediction


def get_words_from_line(line):
    line = line.rstrip()
    yield '<s>'
    for m in re.finditer(r'[\p{L}0-9\*]+|\p{P}+', line):
        yield m.group(0).lower()
    yield '</s>'


def get_words_lines_from_file(file_path):
    with lzma.open(file_path, mode='rt') as file:
        for index, line in enumerate(file):
            text = line.split("\t")
            yield get_words_from_line(re.sub(r"[^\w\d'\s]+", '', re.sub(' +', ' ', ' '.join([text[6], text[7]]).replace("\\n", " ").replace("\n", "").lower())))
            if index == 50000:
                break


vocab_size = 20000

vocab = build_vocab_from_iterator(
    get_words_lines_from_file('train/in.tsv.xz'),
    max_tokens=vocab_size,
    specials=['<unk>'])
vocab.set_default_index(vocab['<unk>'])
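# build_vocab_from_iterator assigns integer ids to at most vocab_size of the most
# frequent tokens streamed from the training file; set_default_index makes every
# out-of-vocabulary token map to the id of '<unk>'.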
# vocab=None
embed_size = 100
class SimpleBigramNeuralLanguageModel(nn.Module):
    def __init__(self, vocabulary_size, embedding_size):
        super(SimpleBigramNeuralLanguageModel, self).__init__()
        self.model = nn.Sequential(
            nn.Embedding(vocabulary_size, embedding_size),
            nn.Linear(embedding_size, vocabulary_size),
            nn.Softmax()
        )

    def forward(self, x):
        return self.model(x)
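# Note: nn.Softmax() is applied here without an explicit dim, and the training loop
# below takes torch.log of the output before NLLLoss. A numerically safer variant
# (an alternative suggestion, not part of this commit) would end the stack with
# nn.LogSoftmax(dim=1) and feed its output to NLLLoss directly.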
def look_ahead_iterator(gen):
    prev = None
    for item in gen:
        if prev is not None:
            yield (prev, item)
        prev = item
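# Illustration (hypothetical input): for the token stream '<s>', 'the', 'cat', '</s>'
# this yields ('<s>', 'the'), ('the', 'cat'), ('cat', '</s>'), i.e. consecutive
# (context, target) bigram pairs.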
class Bigrams(IterableDataset):
    def __init__(self, text_file, vocabulary_size):
        self.vocab = build_vocab_from_iterator(
            get_words_lines_from_file(text_file),
            max_tokens=vocabulary_size,
            specials=['<unk>'])
        self.vocab.set_default_index(self.vocab['<unk>'])
        self.vocabulary_size = vocabulary_size
        self.text_file = text_file

    def __iter__(self):
        return look_ahead_iterator(
            (self.vocab[t] for t in itertools.chain.from_iterable(get_words_lines_from_file(self.text_file))))
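# Bigrams streams the corpus: tokens are flattened across lines, mapped to vocabulary
# ids on the fly, and paired up by look_ahead_iterator, so the whole training file is
# never held in memory. DataLoader then batches these (x, y) id pairs into tensors.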
def train():
    batch_size = 22000
    train_dataset = Bigrams('train/in.tsv.xz', vocab_size)
    device = 'cuda'
    model = SimpleBigramNeuralLanguageModel(vocab_size, embed_size).to(device)
    train_data_loader = DataLoader(train_dataset, batch_size=batch_size)
    optimizer = torch.optim.Adam(model.parameters())
    criterion = torch.nn.NLLLoss()
    model.train()
    step = 0
    for x, y in train_data_loader:
        # Transfer Data to GPU
        x = x.to(device)
        y = y.to(device)
        # Clear the gradients
        optimizer.zero_grad()
        # Forward Pass
        ypredicted = model(x)
        # Find the Loss
        loss = criterion(torch.log(ypredicted), y)
        if step % 100 == 0:
            print(step, loss)
        step += 1
        # Calculate gradients
        loss.backward()
        # Update Weights
        optimizer.step()
    torch.save(model.state_dict(), 'model1.bin')
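# run.py trains in a single pass over the training stream and always saves the final
# weights; run_neu_val.py (added in the same commit) wraps this loop in epochs and
# only checkpoints when the dev-0 validation loss improves.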
def predict():
    device = 'cuda'
    model = SimpleBigramNeuralLanguageModel(vocab_size, embed_size).to(device)
    model.load_state_dict(torch.load('model1.bin'))
    model.eval()
    ixs = torch.tensor(vocab.forward(['for'])).to(device)
    out = model(ixs)
    top = torch.topk(out[0], 10)
    top_indices = top.indices.tolist()
    top_probs = top.values.tolist()
    top_words = vocab.lookup_tokens(top_indices)
    print(list(zip(top_words, top_indices, top_probs)))
def similar():
    device = 'cuda'
    model = SimpleBigramNeuralLanguageModel(vocab_size, embed_size).to(device)
    model.load_state_dict(torch.load('model1.bin'))
    model.eval()
    cos = nn.CosineSimilarity(dim=1, eps=1e-6)
    embeddings = model.model[0].weight
    vec = embeddings[vocab['went']]
    similarities = cos(vec, embeddings)
    top = torch.topk(similarities, 10)
    top_indices = top.indices.tolist()
    top_probs = top.values.tolist()
    top_words = vocab.lookup_tokens(top_indices)
    print(list(zip(top_words, top_indices, top_probs)))
if __name__ == "__main__":
    wp = WordPred()
    # wp.create_train_file("train/in.tsv.xz", "train/in.txt")
    # wp.fill_words("train/in.tsv.xz", "words.txt")
    # wp.read_words("words.txt")
    wp.generate_outputs("dev-0/in.tsv.xz", "dev-0/out3.tsv")
    wp.generate_outputs("test-A/in.tsv.xz", "test-A/out3.tsv")
    # train()
    predict()

run_neu_val.py (new file)

@@ -0,0 +1,196 @@
from itertools import islice
import regex as re
import sys
from torchtext.vocab import build_vocab_from_iterator
import lzma
from torch import nn
import torch
from torch.utils.data import IterableDataset
import itertools
from torch.utils.data import DataLoader
import numpy as np

# def get_words_from_line(file_path):
#     for index, line in enumerate(get_lines_from_file(file)):
#         yield '<s>'
#         for m in re.finditer(r'[\p{L}0-9\*]+|\p{P}+', line):
#             yield m.group(0).lower()
#         yield '</s>'
#         if index == 10000:
#             break


def get_words_from_line(line):
    line = line.rstrip()
    yield '<s>'
    for m in re.finditer(r'[\p{L}0-9\*]+|\p{P}+', line):
        yield m.group(0).lower()
    yield '</s>'


def get_words_lines_from_file(file_path):
    with lzma.open(file_path, mode='rt') as file:
        for index, line in enumerate(file):
            text = line.split("\t")
            yield get_words_from_line(re.sub(r"[^\w\d'\s]+", '', re.sub(' +', ' ', ' '.join([text[6], text[7]]).replace("\\n", " ").replace("\n", "").lower())))
            if index == 50000:
                break
vocab_size = 220
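# Note: vocab_size here is far smaller than the 20000 tokens used in run.py.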
# vocab = build_vocab_from_iterator(
# get_words_lines_from_file('train/in.tsv.xz'),
# max_tokens=vocab_size,
# specials=['<unk>'])
#
# vocab.set_default_index(vocab['<unk>'])
vocab=None
embed_size = 100
class SimpleBigramNeuralLanguageModel(nn.Module):
    def __init__(self, vocabulary_size, embedding_size):
        super(SimpleBigramNeuralLanguageModel, self).__init__()
        self.model = nn.Sequential(
            nn.Embedding(vocabulary_size, embedding_size),
            nn.Linear(embedding_size, vocabulary_size),
            nn.Softmax()
        )

    def forward(self, x):
        return self.model(x)
def look_ahead_iterator(gen):
    prev = None
    for item in gen:
        if prev is not None:
            yield (prev, item)
        prev = item
class Bigrams(IterableDataset):
    def __init__(self, text_file, vocabulary_size):
        self.vocab = build_vocab_from_iterator(
            get_words_lines_from_file(text_file),
            max_tokens=vocabulary_size,
            specials=['<unk>'])
        self.vocab.set_default_index(self.vocab['<unk>'])
        self.vocabulary_size = vocabulary_size
        self.text_file = text_file

    def __iter__(self):
        return look_ahead_iterator(
            (self.vocab[t] for t in itertools.chain.from_iterable(get_words_lines_from_file(self.text_file))))
def train():
    batch_size = 100000
    epochs = 5
    train_dataset = Bigrams('train/in.tsv.xz', vocab_size)
    valid_dataset = Bigrams('dev-0/in.tsv.xz', vocab_size)
    device = 'cuda'
    model = SimpleBigramNeuralLanguageModel(vocab_size, embed_size).to(device)
    train_data_loader = DataLoader(train_dataset, batch_size=batch_size)
    optimizer = torch.optim.Adam(model.parameters())
    criterion = torch.nn.NLLLoss()
    valid_data_loader = DataLoader(valid_dataset, batch_size=batch_size)
    model.train()
    train_loss = 0.0
    min_valid_loss = np.inf
    for e in range(epochs):
        step = 0
        for x, y in train_data_loader:
            # Transfer Data to GPU
            x = x.to(device)
            y = y.to(device)
            # Clear the gradients
            optimizer.zero_grad()
            # Forward Pass
            ypredicted = model(x)
            # Find the Loss
            loss = criterion(torch.log(ypredicted), y)
            if step % 100 == 0:
                print(step, loss)
            step += 1
            # Calculate gradients
            loss.backward()
            # Update Weights
            optimizer.step()
            # Calculate Loss
            train_loss += loss.item()
        # Validate
        model.eval()
        valid_loss = 0.0
        for x, y in valid_data_loader:
            # Transfer Data to GPU
            x = x.to(device)
            y = y.to(device)
            # Forward Pass
            target = model(x)
            # Find the Loss (NLLLoss expects log-probabilities, as in the training loop)
            loss = criterion(torch.log(target), y)
            # Calculate Loss
            valid_loss += loss.item()
        print(f'Epoch {e + 1} \t\t '
              f'Training Loss: {train_loss} \t\t '
              f'Validation Loss: {valid_loss}')
        if min_valid_loss > valid_loss:
            print(f'Validation Loss Decreased({min_valid_loss:.6f}--->{valid_loss:.6f}) \t Saving The Model')
            min_valid_loss = valid_loss
            # Saving State Dict
            torch.save(model.state_dict(), 'model1.bin')
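# The model is checkpointed to model1.bin only when the dev-0 loss improves, so the
# saved weights correspond to the best validation epoch rather than the last one.
# A possible refinement (not in this script) would be to wrap the validation loop in
# torch.no_grad() so no autograd graph is built during evaluation.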
def predict():
    device = 'cuda'
    model = SimpleBigramNeuralLanguageModel(vocab_size, embed_size).to(device)
    model.load_state_dict(torch.load('model1.bin'))
    model.eval()
    ixs = torch.tensor(vocab.forward(['for'])).to(device)
    out = model(ixs)
    top = torch.topk(out[0], 10)
    top_indices = top.indices.tolist()
    top_probs = top.values.tolist()
    top_words = vocab.lookup_tokens(top_indices)
    print(list(zip(top_words, top_indices, top_probs)))
def similar():
    device = 'cuda'
    model = SimpleBigramNeuralLanguageModel(vocab_size, embed_size).to(device)
    model.load_state_dict(torch.load('model1.bin'))
    model.eval()
    cos = nn.CosineSimilarity(dim=1, eps=1e-6)
    embeddings = model.model[0].weight
    vec = embeddings[vocab['went']]
    similarities = cos(vec, embeddings)
    top = torch.topk(similarities, 10)
    top_indices = top.indices.tolist()
    top_probs = top.values.tolist()
    top_words = vocab.lookup_tokens(top_indices)
    print(list(zip(top_words, top_indices, top_probs)))
if __name__ == "__main__":
    train()
    # predict()