# challenging-america-word-ga.../run2.py
# Simple neural bigram language model.

import csv
import itertools
from os.path import exists
import pandas as pd
import regex as re
import torch
from torch import nn
from torch.utils.data import DataLoader
from torchtext.vocab import build_vocab_from_iterator
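
# Data paths and model/training hyperparameters.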
IN_INPUT_PATH = "train/in.tsv.xz"
IN_OUTPUT_PATH = "train/expected.tsv"
VOCAB_SIZE = 30000
EMBED_SIZE = 150
BATCH_SIZE = 8000
DEV_PATH = "dev-0/"
TEST_PATH = "test-A/"
DEVICE = "cpu"

def clean(text):
    # Lower-case the text, drop the escaped line-break markers left in the
    # TSV fields and strip all Unicode punctuation.
    text = str(text).lower().replace("-\\n", "").replace("\\n", " ")
    return re.sub(r"\p{P}", "", text)

def get_words_from_line(line, specials=True):
    # Yield lower-cased tokens from a line, optionally wrapped in <s>/</s> markers.
    line = line.rstrip()
    if specials:
        yield "<s>"
    for m in re.finditer(r"[\p{L}0-9\*]+|\p{P}+", line):
        yield m.group(0).lower()
    if specials:
        yield "</s>"

def get_word_lines_from_data(d):
    # Tokenize every line of the data series.
    for line in d:
        yield get_words_from_line(line)

def look_ahead_iterator(gen):
    # Turn a token stream into consecutive (previous_word, current_word) pairs.
    w1 = None
    for item in gen:
        if w1 is not None:
            yield (w1, item)
        w1 = item

class SimpleBigramNeuralLanguageModel(torch.nn.Module):
    def __init__(self, vocabulary_size, embedding_size):
        super(SimpleBigramNeuralLanguageModel, self).__init__()
        # Embed the previous word and map it directly to a probability
        # distribution over the next word.
        self.model = nn.Sequential(
            nn.Embedding(vocabulary_size, embedding_size),
            nn.Linear(embedding_size, vocabulary_size),
            nn.Softmax(dim=1),
        )

    def forward(self, x):
        return self.model(x)

class Bigrams(torch.utils.data.IterableDataset):
    def __init__(self, data, vocabulary_size):
        # Build the vocabulary from the training data; unknown words map to <unk>.
        self.vocab = build_vocab_from_iterator(
            get_word_lines_from_data(data),
            max_tokens=vocabulary_size,
            specials=["<unk>"],
        )
        self.vocab.set_default_index(self.vocab["<unk>"])
        self.vocabulary_size = vocabulary_size
        self.data = data

    def __iter__(self):
        # Stream (previous_word_id, current_word_id) pairs over the whole corpus.
        return look_ahead_iterator(
            (
                self.vocab[t]
                for t in itertools.chain.from_iterable(
                    get_word_lines_from_data(self.data)
                )
            )
        )

def get_dataset():
    # Columns 6 and 7 of in.tsv hold the left and right context of the gap;
    # expected.tsv holds the word that fills it.
    X_train = pd.read_csv(
        IN_INPUT_PATH,
        sep="\t",
        header=None,
        quoting=csv.QUOTE_NONE,
        nrows=200000,
        on_bad_lines="skip",
        encoding="UTF-8",
    )
    Y_train = pd.read_csv(
        IN_OUTPUT_PATH,
        sep="\t",
        header=None,
        quoting=csv.QUOTE_NONE,
        nrows=200000,
        on_bad_lines="skip",
        encoding="UTF-8",
    )
    X_train = X_train[[6, 7]]
    X_train = pd.concat([X_train, Y_train], axis=1)
    # Reassemble each example as: left context + expected word + right context.
    X_train = X_train[6] + " " + X_train[0] + " " + X_train[7]
    X_train = X_train.apply(clean)
    return Bigrams(X_train, VOCAB_SIZE)
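
# Build the dataset once; it supplies both the vocabulary and the training pairs.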
dataset = get_dataset()

def get_model():
    # Train the bigram model if no checkpoint exists, otherwise load it from disk.
    model = SimpleBigramNeuralLanguageModel(VOCAB_SIZE, EMBED_SIZE).to(DEVICE)
    if not exists("nn_model.bin"):
        data = DataLoader(dataset, batch_size=BATCH_SIZE)
        optimizer = torch.optim.Adam(model.parameters())
        criterion = torch.nn.NLLLoss()
        model.train()
        step = 0
        # Two passes over the training pairs.
        for i in range(2):
            for x, y in data:
                x = x.to(DEVICE)
                y = y.to(DEVICE)
                optimizer.zero_grad()
                y_predicted = model(x)
                # The model outputs softmax probabilities, so take the log for NLLLoss.
                loss = criterion(torch.log(y_predicted), y)
                if step % 100 == 0:
                    print(step, loss)
                step += 1
                loss.backward()
                optimizer.step()
        torch.save(model.state_dict(), "nn_model.bin")
    else:
        model.load_state_dict(torch.load("nn_model.bin"))
    return model
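
# Reuse the vocabulary built by the dataset and train or load the bigram model.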
vocab = dataset.vocab
model = get_model()

def predict(ws):
    # Return the top-8 next-word candidates for the given context word(s)
    # as "word:probability" pairs.
    ixs = torch.tensor(vocab.forward(ws)).to(DEVICE)
    out = model(ixs)
    top = torch.topk(out[0], 8)
    top_indices = top.indices.tolist()
    top_probs = top.values.tolist()
    top_words = vocab.lookup_tokens(top_indices)
    pred_str = ""
    for word, prob in zip(top_words, top_probs):
        pred_str += f"{word}:{prob} "
    return pred_str

def predict_input(file):
    # Read the left-context column, take its last word and predict the word
    # that follows; write one line of "word:prob" pairs per row to out.tsv.
    X_test = pd.read_csv(
        f"{file}/in.tsv.xz",
        sep="\t",
        header=None,
        quoting=csv.QUOTE_NONE,
        on_bad_lines="skip",
        encoding="UTF-8",
    )[6]
    X_test = X_test.apply(clean)
    with open(f"{file}/out.tsv", "w+", encoding="UTF-8") as f:
        for row in X_test:
            before = None
            for before in get_words_from_line(clean(str(row)), False):
                pass  # keep only the last token of the left context
            if before is None:
                # No usable context: fall back to a fixed distribution of frequent words.
                pred_str = "a:0.2 the:0.2 to:0.2 of:0.1 and:0.1 of:0.1 :0.1"
            else:
                pred_str = predict([before])
            pred_str = pred_str.strip()
            f.write(pred_str + "\n")
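
# Write predictions for the dev and test splits.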
predict_input(DEV_PATH)
predict_input(TEST_PATH)