nn bigram

Bartosz Karwacki 2022-05-08 17:23:26 +02:00
parent 20cb6b9e97
commit 3349d6ee6b
4 changed files with 18059 additions and 18043 deletions

File diff suppressed because it is too large.

nn_model.bin (new file; binary content not shown)

run2.py (236 changed lines)

@@ -1,76 +1,53 @@
+import csv
 import itertools
-import lzma
+from os.path import exists
 
+import pandas as pd
 import regex as re
 import torch
-from nltk.tokenize import RegexpTokenizer
 from torch import nn
-from torch.utils.data import DataLoader, IterableDataset
+from torch.utils.data import DataLoader
 from torchtext.vocab import build_vocab_from_iterator
 
-VOCAB_SIZE = 40000
-EMBED_SIZE = 100
-DEVICE = "cuda"
-
-tokenizer = RegexpTokenizer(r"\w+")
+IN_INPUT_PATH = "train/in.tsv.xz"
+IN_OUTPUT_PATH = "train/expected.tsv"
+VOCAB_SIZE = 30000
+EMBED_SIZE = 150
+BATCH_SIZE = 8000
+DEV_PATH = "dev-0/"
+TEST_PATH = "test-A/"
+DEVICE = "cpu"
 
-def read_file(file):
-    for line in file:
-        text = line.split("\t")
-        yield re.sub(
-            r"[^\w\d'\s]+",
-            "",
-            re.sub(" +", " ", text[6].replace("\\n", " ").replace("\n", "").lower()),
-        )
+def clean(text):
+    text = str(text).lower().replace("-\\n", "").replace("\\n", " ")
+    return re.sub(r"\p{P}", "", text)
 
-def get_words(line):
+def get_words_from_line(line, specials=True):
     line = line.rstrip()
-    yield "<s>"
+    if specials:
+        yield "<s>"
     for m in re.finditer(r"[\p{L}0-9\*]+|\p{P}+", line):
         yield m.group(0).lower()
-    yield "</s>"
+    if specials:
+        yield "</s>"
 
-def get_line(file_path):
-    with lzma.open(file_path, mode="rt") as file:
-        for _, line in enumerate(file):
-            text = line.split("\t")
-            yield get_words(
-                re.sub(
-                    r"[^\w\d'\s]+",
-                    "",
-                    re.sub(
-                        " +",
-                        " ",
-                        " ".join([text[6], text[7]])
-                        .replace("\\n", " ")
-                        .replace("\n", "")
-                        .lower(),
-                    ),
-                )
-            )
-
-
-def buidl_vocab():
-    vocab = build_vocab_from_iterator(
-        get_line("train/in.tsv.xz"), max_tokens=VOCAB_SIZE, specials=["<unk>"]
-    )
-    vocab.set_default_index(vocab["<unk>"])
-    return vocab
+def get_word_lines_from_data(d):
+    for line in d:
+        yield get_words_from_line(line)
 
 def look_ahead_iterator(gen):
-    prev = None
+    w1 = None
     for item in gen:
-        if prev is not None:
-            yield (prev, item)
-        prev = item
+        if w1 is not None:
+            yield (w1, item)
+        w1 = item
 
-class SimpleBigramNeuralLanguageModel(nn.Module):
+class SimpleBigramNeuralLanguageModel(torch.nn.Module):
     def __init__(self, vocabulary_size, embedding_size):
         super(SimpleBigramNeuralLanguageModel, self).__init__()
         self.model = nn.Sequential(
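
The body of nn.Sequential and the forward method are unchanged by this commit, so they fall between the two hunks and are not shown. As a hedged sketch only, assuming the conventional layout of a bigram model of this kind rather than quoting the committed file, the hidden part would look roughly like this:

    # Illustrative sketch, not the committed code: embed the context word,
    # project back to the vocabulary, and emit a probability distribution.
    self.model = nn.Sequential(
        nn.Embedding(vocabulary_size, embedding_size),
        nn.Linear(embedding_size, vocabulary_size),
        nn.Softmax(dim=1),
    )

This shape is consistent with the training loop below, which applies torch.log to the model output before NLLLoss, i.e. the model is expected to return normalized probabilities.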
@@ -83,89 +60,128 @@ class SimpleBigramNeuralLanguageModel(nn.Module):
         return self.model(x)
 
-class Bigrams(IterableDataset):
-    def __init__(self, text_file, vocabulary_size):
+class Bigrams(torch.utils.data.IterableDataset):
+    def __init__(self, data, vocabulary_size):
         self.vocab = build_vocab_from_iterator(
-            get_line(text_file), max_tokens=vocabulary_size, specials=["<unk>"]
+            get_word_lines_from_data(data),
+            max_tokens=vocabulary_size,
+            specials=["<unk>"],
         )
         self.vocab.set_default_index(self.vocab["<unk>"])
         self.vocabulary_size = vocabulary_size
-        self.text_file = text_file
+        self.data = data
 
     def __iter__(self):
         return look_ahead_iterator(
             (
                 self.vocab[t]
-                for t in itertools.chain.from_iterable(get_line(self.text_file))
+                for t in itertools.chain.from_iterable(
+                    get_word_lines_from_data(self.data)
+                )
             )
         )
 
-vocab = buidl_vocab()
-
-
-def train():
-    batch_size = 10000
-    train_dataset = Bigrams("train/in.tsv.xz", VOCAB_SIZE)
-    device = "cuda"
-    model = SimpleBigramNeuralLanguageModel(VOCAB_SIZE, EMBED_SIZE).to(device)
-    train_data_loader = DataLoader(train_dataset, batch_size=batch_size)
-    optimizer = torch.optim.Adam(model.parameters())
-    criterion = torch.nn.NLLLoss()
-    model.train()
-    step = 0
-    for x, y in train_data_loader:
-        x = x.to(device)
-        y = y.to(device)
-        optimizer.zero_grad()
-        ypredicted = model(x)
-        loss = criterion(torch.log(ypredicted), y)
-        if step % 100 == 0:
-            print(step, loss)
-        step += 1
-        loss.backward()
-        optimizer.step()
-    torch.save(model.state_dict(), "model1.bin")
-
-
-def predict(word, model):
-    ixs = torch.tensor(vocab.forward([word])).to(DEVICE)
+def get_dataset():
+    X_train = pd.read_csv(
+        IN_INPUT_PATH,
+        sep="\t",
+        header=None,
+        quoting=csv.QUOTE_NONE,
+        nrows=200000,
+        on_bad_lines="skip",
+        encoding="UTF-8",
+    )
+    Y_train = pd.read_csv(
+        IN_OUTPUT_PATH,
+        sep="\t",
+        header=None,
+        quoting=csv.QUOTE_NONE,
+        nrows=200000,
+        on_bad_lines="skip",
+        encoding="UTF-8",
+    )
+    X_train = X_train[[6, 7]]
+    X_train = pd.concat([X_train, Y_train], axis=1)
+    X_train = X_train[6] + X_train[0] + X_train[7]
+    X_train = X_train.apply(clean)
+    return Bigrams(X_train, VOCAB_SIZE)
+
+
+dataset = get_dataset()
+
+
+def get_model():
+    model = SimpleBigramNeuralLanguageModel(VOCAB_SIZE, EMBED_SIZE).to(DEVICE)
+    if not exists("nn_model.bin"):
+        data = DataLoader(dataset, batch_size=BATCH_SIZE)
+        optimizer = torch.optim.Adam(model.parameters())
+        criterion = torch.nn.NLLLoss()
+        model.train()
+        step = 0
+        for i in range(2):
+            for x, y in data:
+                x = x.to(DEVICE)
+                y = y.to(DEVICE)
+                optimizer.zero_grad()
+                y_predicted = model(x)
+                loss = criterion(torch.log(y_predicted), y)
+                if step % 100 == 0:
+                    print(step, loss)
+                step += 1
+                loss.backward()
+                optimizer.step()
+        torch.save(model.state_dict(), "nn_model.bin")
+    else:
+        model.load_state_dict(torch.load("nn_model.bin"))
+    return model
+
+
+vocab = dataset.vocab
+model = get_model()
+
+
+def predict(ws):
+    ixs = torch.tensor(vocab.forward(ws)).to(DEVICE)
     out = model(ixs)
     top = torch.topk(out[0], 8)
     top_indices = top.indices.tolist()
     top_probs = top.values.tolist()
     top_words = vocab.lookup_tokens(top_indices)
-    str_predictions = ""
-    lht = 1.0
-    for pred_word in list(zip(top_words, top_indices, top_probs)):
-        if lht - pred_word[2] >= 0:
-            str_predictions += f"{pred_word[0]}:{pred_word[2]} "
-            lht -= pred_word[2]
-    if lht != 1.0:
-        str_predictions += f":{lht}"
-    return str_predictions
-
-
-def generate_predictions(input_file, output_file, model):
-    with open(output_file, "w") as outputf:
-        with lzma.open(input_file, mode="rt") as file:
-            for _, text in enumerate(read_file(file)):
-                tokens = tokenizer.tokenize(text)
-                if len(tokens) < 4:
-                    prediction = "the:0.2 be:0.2 to:0.2 of:0.1 and:0.1 a:0.1 :0.1"
-                else:
-                    prediction = predict(tokens[-1], model)
-                outputf.write(prediction + "\n")
-
-
-if __name__ == "__main__":
-    train()
-    model = SimpleBigramNeuralLanguageModel(VOCAB_SIZE, EMBED_SIZE).to(DEVICE)
-    model.load_state_dict(torch.load("model1.bin"))
-    model.eval()
-    generate_predictions("dev-0/in.tsv.xz", "dev-0/out.tsv", model)
-    generate_predictions("test-A/in.tsv.xz", "test-A/out.tsv", model)
+    pred_str = ""
+    for word, prob in list(zip(top_words, top_probs)):
+        pred_str += f"{word}:{prob} "
+    return pred_str
+
+
+def predict_input(file):
+    X_test = pd.read_csv(
+        f"{file}/in.tsv.xz",
+        sep="\t",
+        header=None,
+        quoting=csv.QUOTE_NONE,
+        on_bad_lines="skip",
+        encoding="UTF-8",
+    )[6]
+    X_test = X_test.apply(clean)
+    with open(f"{file}/out.tsv", "w+", encoding="UTF-8") as f:
+        for row in X_test:
+            before = None
+            for before in get_words_from_line(clean(str(row)), False):
+                pass
+            before = [before]
+            if len(before) < 1:
+                pred_str = "a:0.2 the:0.2 to:0.2 of:0.1 and:0.1 of:0.1 :0.1"
+            else:
+                pred_str = predict(before)
+            pred_str = pred_str.strip()
+            f.write(pred_str + "\n")
+
+
+predict_input(DEV_PATH)
+predict_input(TEST_PATH)

File diff suppressed because it is too large.