nn bigram

This commit is contained in:
Bartosz Karwacki 2022-05-08 17:23:26 +02:00
parent 20cb6b9e97
commit 3349d6ee6b
4 changed files with 18059 additions and 18043 deletions

File diff suppressed because it is too large Load Diff

BIN
nn_model.bin Normal file

Binary file not shown.

204
run2.py
View File

@ -1,76 +1,53 @@
import csv
import itertools
import lzma
from os.path import exists
import pandas as pd
import regex as re
import torch
from nltk.tokenize import RegexpTokenizer
from torch import nn
from torch.utils.data import DataLoader, IterableDataset
from torch.utils.data import DataLoader
from torchtext.vocab import build_vocab_from_iterator
# Tokenizer that splits text into runs of word characters.
tokenizer = RegexpTokenizer(r"\w+")

# Training data: the xz-compressed TSV input and the TSV of expected values.
IN_INPUT_PATH = "train/in.tsv.xz"
IN_OUTPUT_PATH = "train/expected.tsv"

# Model / training hyper-parameters.  Each name used to be assigned twice in a
# row; only the last assignment was ever observable, so only that one is kept.
VOCAB_SIZE = 30000  # max_tokens of the vocabulary and the model's vocabulary size
EMBED_SIZE = 150  # embedding_size passed to the model
BATCH_SIZE = 8000  # batch_size passed to the training DataLoader

# Directories that hold an ``in.tsv.xz`` to predict for; ``out.tsv`` is written
# next to it.
DEV_PATH = "dev-0/"
TEST_PATH = "test-A/"

# Device every tensor and the model are moved to.
DEVICE = "cpu"
def read_file(file):
    """Yield one cleaned text per line of the tab-separated *file*.

    The text is taken from column index 6 of each row.  Escaped newlines
    (the two characters ``\\n``) become spaces, real newlines are removed,
    the text is lower-cased, runs of spaces are collapsed and every
    character that is not a word character, apostrophe or whitespace is
    dropped.

    Exactly one string is yielded per input line, so a consumer that writes
    one output line per yielded text stays aligned with the input.  Rows
    with fewer than seven columns yield an empty string instead of raising
    ``IndexError``.

    Args:
        file: An iterable of text lines (for example an open text file).

    Yields:
        The cleaned text of each row, or ``""`` for a malformed row.
    """
    for line in file:
        text = line.split("\t")
        if len(text) < 7:
            # Malformed row: keep the one-output-per-input contract.
            yield ""
            continue
        raw = text[6].replace("\\n", " ").replace("\n", "").lower()
        collapsed = re.sub(" +", " ", raw)
        yield re.sub(r"[^\w\d'\s]+", "", collapsed)
def clean(text):
    """Return *text* lower-cased, with escaped newlines and punctuation removed.

    The value is first converted with ``str``.  A hyphen directly followed by
    an escaped newline (the two characters ``\\n``) is deleted outright,
    presumably to re-join a word that was split across lines; every remaining
    escaped newline becomes a single space.  Finally every character matching
    ``\\p{P}`` is stripped.

    Note that ``\\p{P}`` is only understood by the ``regex`` package, which
    this file imports under the name ``re``.
    """
    lowered = str(text).lower()
    rejoined = lowered.replace("-\\n", "")
    spaced = rejoined.replace("\\n", " ")
    return re.sub(r"\p{P}", "", spaced)
def get_words_from_line(line, specials=True):
    """Yield the lower-cased tokens of *line*.

    A token is either a run of letters, digits and ``*`` characters, or a run
    of punctuation characters.  Trailing whitespace of *line* is ignored.

    The pattern relies on ``\\p{L}`` / ``\\p{P}``, which require the ``regex``
    package (imported in this file as ``re``).

    Args:
        line: The text to tokenize.
        specials: When true, ``"<s>"`` is yielded before the first token and
            ``"</s>"`` after the last one.

    Yields:
        Tokens in order of appearance.
    """
    line = line.rstrip()
    if specials:
        yield "<s>"
    for m in re.finditer(r"[\p{L}0-9\*]+|\p{P}+", line):
        yield m.group(0).lower()
    if specials:
        yield "</s>"


def get_words(line):
    """Tokenize *line* with sentence markers; the older name for this operation."""
    return get_words_from_line(line)
def get_line(file_path):
    """Yield a token generator for every row of an xz-compressed TSV file.

    For each row the columns at index 6 and 7 are joined with a space, escaped
    newlines become spaces, real newlines are removed, the text is
    lower-cased, runs of spaces are collapsed and every character that is not
    a word character, apostrophe or whitespace is dropped.  The result is
    handed to ``get_words``.

    Args:
        file_path: Path of the ``.xz`` file, opened in text mode.

    Yields:
        Whatever ``get_words`` returns for the cleaned text of each row.
    """
    with lzma.open(file_path, mode="rt") as file:
        for line in file:
            text = line.split("\t")
            joined = " ".join([text[6], text[7]])
            flattened = joined.replace("\\n", " ").replace("\n", "").lower()
            single_spaced = re.sub(" +", " ", flattened)
            yield get_words(re.sub(r"[^\w\d'\s]+", "", single_spaced))
def buidl_vocab():
    """Build the vocabulary from ``train/in.tsv.xz``.

    The vocabulary holds at most ``VOCAB_SIZE`` entries including the special
    ``"<unk>"`` token, whose index is also installed as the default index for
    lookups of unknown tokens.

    Returns:
        The vocabulary object produced by ``build_vocab_from_iterator``.
    """
    token_lines = get_line("train/in.tsv.xz")
    built = build_vocab_from_iterator(
        token_lines, max_tokens=VOCAB_SIZE, specials=["<unk>"]
    )
    unk_index = built["<unk>"]
    built.set_default_index(unk_index)
    return built
def get_word_lines_from_data(d):
    """Lazily turn every line of *d* into its token generator.

    Each yielded item is the (not yet consumed) generator returned by
    ``get_words_from_line`` for one line, called with that function's default
    arguments.
    """
    yield from (get_words_from_line(entry) for entry in d)
def look_ahead_iterator(gen):
    """Yield every pair of consecutive items of *gen*.

    For the items ``a, b, c`` this yields ``(a, b)`` and then ``(b, c)``.
    An input with fewer than two items yields nothing.

    The previous item is tracked through an explicit iterator rather than a
    ``None`` sentinel, so each pair is produced exactly once and items that
    happen to be ``None`` are paired like any other value.

    Args:
        gen: Any iterable.

    Yields:
        2-tuples ``(previous_item, item)``.
    """
    items = iter(gen)
    try:
        previous = next(items)
    except StopIteration:
        return
    for item in items:
        yield (previous, item)
        previous = item
class SimpleBigramNeuralLanguageModel(nn.Module):
class SimpleBigramNeuralLanguageModel(torch.nn.Module):
def __init__(self, vocabulary_size, embedding_size):
super(SimpleBigramNeuralLanguageModel, self).__init__()
self.model = nn.Sequential(
@ -83,89 +60,128 @@ class SimpleBigramNeuralLanguageModel(nn.Module):
return self.model(x)
class Bigrams(torch.utils.data.IterableDataset):
    """Iterable dataset of consecutive token-index pairs taken from *data*.

    The vocabulary is built once, in the constructor, from the tokens of
    every line of *data*.  Iterating the dataset tokenizes the lines again,
    so *data* must support being iterated more than once.

    All lines are chained into a single token stream before pairing, so pairs
    also span line boundaries (each line is tokenized with its ``<s>`` and
    ``</s>`` markers).
    """

    def __init__(self, data, vocabulary_size):
        """Build the vocabulary for *data*.

        Args:
            data: A re-iterable collection of text lines.
            vocabulary_size: Maximum number of vocabulary entries, including
                the special ``"<unk>"`` token.
        """
        self.vocab = build_vocab_from_iterator(
            get_word_lines_from_data(data),
            max_tokens=vocabulary_size,
            specials=["<unk>"],
        )
        # Tokens outside the vocabulary map to the index of "<unk>".
        self.vocab.set_default_index(self.vocab["<unk>"])
        self.vocabulary_size = vocabulary_size
        self.data = data

    def __iter__(self):
        """Return an iterator over ``(index, next_index)`` pairs."""
        token_ids = (
            self.vocab[t]
            for t in itertools.chain.from_iterable(
                get_word_lines_from_data(self.data)
            )
        )
        return look_ahead_iterator(token_ids)
# Build the module-level vocabulary as soon as this file is executed.
vocab = buidl_vocab()
def get_dataset():
    """Load the training text and wrap it in a ``Bigrams`` dataset.

    At most 200000 rows are read from ``IN_INPUT_PATH`` (columns 6 and 7) and
    from ``IN_OUTPUT_PATH`` (column 0).  Each training line is column 6, then
    column 0, then column 7 -- presumably left context, expected word and
    right context; confirm against the data description.

    The three parts are joined with single spaces so the words at the seams
    are not fused into one token, and rows where any part is missing are
    dropped instead of being turned into the literal text "nan".

    NOTE(review): both files are read with ``on_bad_lines="skip"``; if a row
    is skipped in only one of them, the remaining rows are paired by position
    and may no longer correspond.

    Returns:
        A ``Bigrams`` dataset over the cleaned lines, limited to
        ``VOCAB_SIZE`` vocabulary entries.
    """
    read_options = {
        "sep": "\t",
        "header": None,
        "quoting": csv.QUOTE_NONE,
        "nrows": 200000,
        "on_bad_lines": "skip",
        "encoding": "UTF-8",
    }
    contexts = pd.read_csv(IN_INPUT_PATH, **read_options)[[6, 7]]
    expected = pd.read_csv(IN_OUTPUT_PATH, **read_options)[0]

    # Columns of the combined frame are labelled 6, 7 and 0.
    rows = pd.concat([contexts, expected], axis=1).dropna()
    sentences = (
        rows[6].astype(str) + " " + rows[0].astype(str) + " " + rows[7].astype(str)
    )
    return Bigrams(sentences.apply(clean), VOCAB_SIZE)
# Training data shared by the model and the vocabulary; loaded on execution.
dataset = get_dataset()


def get_model():
    """Return the bigram model, training it only when no saved weights exist.

    If ``nn_model.bin`` exists its state dict is loaded, mapped onto
    ``DEVICE`` so weights saved on another device can still be read.
    Otherwise the model is trained for two passes over ``dataset`` with Adam
    and ``NLLLoss`` and the resulting state dict is written to
    ``nn_model.bin``.

    Returns:
        The model on ``DEVICE``, switched to evaluation mode.
    """
    model = SimpleBigramNeuralLanguageModel(VOCAB_SIZE, EMBED_SIZE).to(DEVICE)

    if exists("nn_model.bin"):
        model.load_state_dict(torch.load("nn_model.bin", map_location=DEVICE))
        model.eval()
        return model

    data = DataLoader(dataset, batch_size=BATCH_SIZE)
    optimizer = torch.optim.Adam(model.parameters())
    criterion = torch.nn.NLLLoss()
    model.train()
    step = 0
    for _ in range(2):
        for x, y in data:
            x = x.to(DEVICE)
            y = y.to(DEVICE)
            optimizer.zero_grad()
            y_predicted = model(x)
            # NLLLoss expects log-probabilities; this assumes the model's
            # forward pass returns probabilities -- TODO confirm against the
            # model definition.
            loss = criterion(torch.log(y_predicted), y)
            if step % 100 == 0:
                print(step, loss)
            step += 1
            loss.backward()
            optimizer.step()

    torch.save(model.state_dict(), "nn_model.bin")
    model.eval()
    return model
# Vocabulary and model used by predict(); both are created on execution.
vocab = dataset.vocab
model = get_model()


def predict(ws):
    """Return the eight most probable next words after the first word of *ws*.

    Args:
        ws: A list of words.  All of them are passed through the model, but
            only the output row of the first one is used.

    Returns:
        A string of ``"word:probability "`` entries (each followed by a
        space), ordered from most to least probable.
    """
    ixs = torch.tensor(vocab.forward(ws)).to(DEVICE)
    # Inference only: no gradients are needed, so skip autograd bookkeeping.
    with torch.no_grad():
        out = model(ixs)
    top = torch.topk(out[0], 8)
    top_words = vocab.lookup_tokens(top.indices.tolist())
    top_probs = top.values.tolist()
    return "".join(f"{word}:{prob} " for word, prob in zip(top_words, top_probs))
def predict_input(file):
    """Write next-word predictions for ``<file>/in.tsv.xz`` to ``<file>/out.tsv``.

    Column 6 of every input row is cleaned and tokenized without sentence
    markers; the last token is the word the prediction is conditioned on.
    Rows that contain no token at all receive a fixed fallback distribution.

    NOTE(review): the input is read with ``on_bad_lines="skip"``, so a
    skipped row produces no output line -- confirm this is acceptable for
    whatever consumes ``out.tsv``.

    Args:
        file: Directory (or path prefix) containing ``in.tsv.xz``.
    """
    fallback = "the:0.2 be:0.2 to:0.2 of:0.1 and:0.1 a:0.1 :0.1"

    X_test = pd.read_csv(
        f"{file}/in.tsv.xz",
        sep="\t",
        header=None,
        quoting=csv.QUOTE_NONE,
        on_bad_lines="skip",
        encoding="UTF-8",
    )[6]
    # clean() always returns a str, so the rows need no second cleaning pass.
    X_test = X_test.apply(clean)

    with open(f"{file}/out.tsv", "w+", encoding="UTF-8") as f:
        for row in X_test:
            # Exhaust the token stream to keep only its final token.
            before = None
            for before in get_words_from_line(row, False):
                pass
            if before is None:
                pred_str = fallback
            else:
                pred_str = predict([before])
            f.write(pred_str.strip() + "\n")
if __name__ == "__main__":
    # The model is obtained at module level via get_model(); the script entry
    # point only has to write predictions for the two evaluation sets.
    predict_input(DEV_PATH)
    predict_input(TEST_PATH)

File diff suppressed because it is too large Load Diff