bigram neural

s444417 2023-04-26 13:00:18 +00:00
parent 798d04eb15
commit 501182a0f1
5 changed files with 18203 additions and 0 deletions

@@ -0,0 +1,131 @@
import pickle
from torch.utils.data import IterableDataset
import itertools
from torch import nn
import torch
import lzma
from torch.utils.data import DataLoader
import pandas as pd
import tqdm
import regex as re
from nltk import word_tokenize
import csv
import nltk
vocabulary_size = 20000
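# Fallback prediction used when there is no usable context: a fixed distribution over
# the most frequent English words, with the trailing ':0.2125' covering all other words.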
most_common_en_word = "the:0.4 be:0.2 to:0.1 of:0.05 and:0.025 a:0.0125 :0.2125"
nltk.download("punkt")
vocab = None
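# Load the vocabulary pickled by bigram-neural/train.py.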
with open('vocabulary.pickle', 'rb') as handle:
    vocab = pickle.load(handle)

def look_ahead_iterator(gen):
    prev = None
    for item in gen:
        if prev is not None:
            yield (prev, item)
        prev = item

def get_words_from_line(line):
    line = line.rstrip()
    yield '<s>'
    for t in line.split(' '):
        yield t
    yield '</s>'

def get_word_lines_from_file(file_name):
    with lzma.open(file_name, 'r') as fh:
        for line in fh:
            yield get_words_from_line(line.decode('utf-8'))

class Bigrams(IterableDataset):
    def __init__(self, text_file, vocabulary_size):
        self.vocab = vocab
        self.vocab.set_default_index(self.vocab['<unk>'])
        self.vocabulary_size = vocabulary_size
        self.text_file = text_file

    def __iter__(self):
        return look_ahead_iterator(
            (self.vocab[t] for t in itertools.chain.from_iterable(get_word_lines_from_file(self.text_file))))
train_dataset = Bigrams('train/in.tsv.xz', vocabulary_size)
# print(next(iter(train_dataset)))
#
# print(vocab.lookup_tokens([23, 0]))
embed_size = 100
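# Bigram model: embed the previous word, project to vocabulary size and apply softmax
# to get a probability distribution over the next word.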
class SimpleBigramNeuralLanguageModel(nn.Module):
    def __init__(self, vocabulary_size, embedding_size):
        super(SimpleBigramNeuralLanguageModel, self).__init__()
        self.model = nn.Sequential(
            nn.Embedding(vocabulary_size, embedding_size),
            nn.Linear(embedding_size, vocabulary_size),
            nn.Softmax(dim=1)
        )

    def forward(self, x):
        return self.model(x)
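# Note: this assumes a CUDA-capable GPU; switch to 'cpu' to run inference without one.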
device = 'cuda'
model = SimpleBigramNeuralLanguageModel(vocabulary_size, embed_size).to(device)
model.load_state_dict(torch.load('model1.bin'))
model.eval()
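# Predict the most likely next words for word1 and format them as 'word:prob ... :rest'.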
def predict_probs(word1):
    ixs = torch.tensor(vocab.forward([word1])).to(device)
    out = model(ixs)
    top = torch.topk(out[0], 10)
    top_indices = top.indices.tolist()
    top_probs = top.values.tolist()
    top_words = vocab.lookup_tokens(top_indices)
    result_model = list(zip(top_words, top_indices, top_probs))
    n_best = 5  # keep the top 5 predictions
    # Remove any <unk> tokens from the predictions and remember their probability mass
    unk_prob = 0
    new_predictions = []
    for pred in result_model:
        if pred[0] == '<unk>':
            unk_prob = pred[2]
        else:
            new_predictions.append(pred)
    # topk already returns the values sorted, so just take the first n
    top_n = new_predictions[:n_best]
    # Format the predictions as a 'word:prob' string
    output_str = ''
    for pred in top_n:
        output_str += pred[0] + ':' + str(round(pred[2], 3)) + ' '
    output_str += ':{}'.format(round(1 - sum([pred[2] for pred in top_n]) - unk_prob, 3))
    return output_str
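# Lowercase the text, remove escaped line breaks and strip punctuation before tokenizing.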
def prepare_text(text):
    text = text.lower().replace("-\\n", "").replace("\\n", " ")
    text = re.sub(r"\p{P}", "", text)
    return text
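# For each row of the input file, tokenize the left context (column 6) and predict the
# next word from its last token; rows with too little context get the fixed fallback.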
def predict_file(file):
    data = pd.read_csv(f'{file}/in.tsv.xz', sep='\t', on_bad_lines='skip', header=None, quoting=csv.QUOTE_NONE)
    with open(f'{file}/out.tsv', 'w', encoding='utf-8') as file_out:
        for _, row in tqdm.tqdm(data.iterrows()):
            before = word_tokenize(prepare_text(str(row[6])))
            if len(before) < 2:
                prediction = most_common_en_word
            else:
                prediction = predict_probs(before[-1])
            file_out.write(prediction + '\n')
predict_file('dev-0')
predict_file('test-A')

35 bigram-neural/train.py Normal file

@@ -0,0 +1,35 @@
from itertools import islice
import regex as re
import sys
from torchtext.vocab import build_vocab_from_iterator
import lzma
import pickle
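# Build a 20k-token vocabulary from the first 100k lines of the training corpus
# and pickle it for the training and prediction scripts.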
def get_words_from_line(line):
    line = line.rstrip()
    yield '<s>'
    for t in line.split(' '):
        yield t
    yield '</s>'
n_size = 100000
def get_word_lines_from_file(file_name):
    with lzma.open(file_name, 'r') as fh:
        n = 0
        for line in fh:
            n += 1
            yield get_words_from_line(line.decode('utf-8'))
            if n == n_size:
                break
vocab_size = 20000
vocab = build_vocab_from_iterator(
    get_word_lines_from_file('train/in.tsv.xz'),
    max_tokens=vocab_size,
    specials=['<unk>'])

with open('vocabulary.pickle', 'wb') as handle:
    pickle.dump(vocab, handle, protocol=pickle.HIGHEST_PROTOCOL)
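# Quick sanity check: look up the index assigned to a known word.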
vocab['human']

104 bigram-neural/train2.py Normal file

@@ -0,0 +1,104 @@
import pickle
from torch.utils.data import IterableDataset
import itertools
from torch import nn
import torch
import lzma
from torch.utils.data import DataLoader
import tqdm
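# Training script: stream (previous word, next word) pairs from the training corpus,
# fit the embedding + linear bigram model and save the weights to model1.bin.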
vocabulary_size = 20000
vocab = None
with open('vocabulary.pickle', 'rb') as handle:
    vocab = pickle.load(handle)

def look_ahead_iterator(gen):
    prev = None
    for item in gen:
        if prev is not None:
            yield (prev, item)
        prev = item

def get_words_from_line(line):
    line = line.rstrip()
    yield '<s>'
    for t in line.split(' '):
        yield t
    yield '</s>'

def get_word_lines_from_file(file_name):
    with lzma.open(file_name, 'r') as fh:
        for line in fh:
            yield get_words_from_line(line.decode('utf-8'))

class Bigrams(IterableDataset):
    def __init__(self, text_file, vocabulary_size):
        self.vocab = vocab
        self.vocab.set_default_index(self.vocab['<unk>'])
        self.vocabulary_size = vocabulary_size
        self.text_file = text_file

    def __iter__(self):
        return look_ahead_iterator(
            (self.vocab[t] for t in itertools.chain.from_iterable(get_word_lines_from_file(self.text_file))))
train_dataset = Bigrams('train/in.tsv.xz', vocabulary_size)
# print(next(iter(train_dataset)))
#
# print(vocab.lookup_tokens([23, 0]))
embed_size = 100
class SimpleBigramNeuralLanguageModel(nn.Module):
    def __init__(self, vocabulary_size, embedding_size):
        super(SimpleBigramNeuralLanguageModel, self).__init__()
        self.model = nn.Sequential(
            nn.Embedding(vocabulary_size, embedding_size),
            nn.Linear(embedding_size, vocabulary_size),
            nn.Softmax(dim=1)
        )

    def forward(self, x):
        return self.model(x)
device = 'cuda'
model = SimpleBigramNeuralLanguageModel(vocabulary_size, embed_size).to(device)
data = DataLoader(train_dataset, batch_size=500)
optimizer = torch.optim.Adam(model.parameters())
criterion = torch.nn.NLLLoss()
model.train()
step = 0
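# Train for a limited number of steps (the loop breaks after step 5000), using NLLLoss
# on the log of the softmax outputs.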
for x, y in tqdm.tqdm(data):
    x = x.to(device)
    y = y.to(device)
    optimizer.zero_grad()
    ypredicted = model(x)
    loss = criterion(torch.log(ypredicted), y)
    if step % 100 == 0:
        print(step, loss)
    if step > 5000:
        break
    step += 1
    loss.backward()
    optimizer.step()
torch.save(model.state_dict(), 'model1.bin')
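# Reload the saved weights and print the top-10 continuations of 'that' as a smoke test.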
device = 'cuda'
model = SimpleBigramNeuralLanguageModel(vocabulary_size, embed_size).to(device)
model.load_state_dict(torch.load('model1.bin'))
model.eval()
ixs = torch.tensor(vocab.forward(['that'])).to(device)
out = model(ixs)
top = torch.topk(out[0], 10)
top_indices = top.indices.tolist()
top_probs = top.values.tolist()
top_words = vocab.lookup_tokens(top_indices)
print(list(zip(top_words, top_indices, top_probs)))

10519 dev-0/out.tsv Normal file
File diff suppressed because it is too large

7414 test-A/out.tsv Normal file
File diff suppressed because it is too large