challenging-america-word-ga.../run2.py

import itertools
import lzma

import regex as re
import torch
from nltk.tokenize import RegexpTokenizer
from torch import nn
from torch.utils.data import DataLoader, IterableDataset
from torchtext.vocab import build_vocab_from_iterator

VOCAB_SIZE = 40000
EMBED_SIZE = 100
DEVICE = "cuda"

tokenizer = RegexpTokenizer(r"\w+")
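

# Each input record is a tab-separated line; read_file yields the text field
# (column 6, presumably the left context) lowercased, whitespace-normalized and
# stripped of punctuation.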
def read_file(file):
    for line in file:
        text = line.split("\t")
        yield re.sub(
            r"[^\w\d'\s]+",
            "",
            re.sub(" +", " ", text[6].replace("\\n", " ").replace("\n", "").lower()),
        )
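

# get_words tokenizes a cleaned line into lowercased word / punctuation tokens
# wrapped in <s> ... </s> markers; get_line streams these token iterators from an
# xz-compressed TSV file, joining the column 6 and column 7 text fields
# (presumably the left and right context).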
def get_words(line):
    line = line.rstrip()
    yield "<s>"
    for m in re.finditer(r"[\p{L}0-9\*]+|\p{P}+", line):
        yield m.group(0).lower()
    yield "</s>"


def get_line(file_path):
    with lzma.open(file_path, mode="rt") as file:
        for line in file:
            text = line.split("\t")
            yield get_words(
                re.sub(
                    r"[^\w\d'\s]+",
                    "",
                    re.sub(
                        " +",
                        " ",
                        " ".join([text[6], text[7]])
                        .replace("\\n", " ")
                        .replace("\n", "")
                        .lower(),
                    ),
                )
            )
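

# Build a 40k-token vocabulary from the training file; out-of-vocabulary tokens
# fall back to <unk>.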
def build_vocab():
    vocab = build_vocab_from_iterator(
        get_line("train/in.tsv.xz"), max_tokens=VOCAB_SIZE, specials=["<unk>"]
    )
    vocab.set_default_index(vocab["<unk>"])
    return vocab
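

# Turn a stream of tokens into a stream of (previous token, current token) bigrams.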
def look_ahead_iterator(gen):
    prev = None
    for item in gen:
        if prev is not None:
            yield (prev, item)
        prev = item
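

# Bigram language model: embed the previous word, project the embedding back to
# vocabulary size and apply softmax to get a distribution over the next word.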
class SimpleBigramNeuralLanguageModel(nn.Module):
    def __init__(self, vocabulary_size, embedding_size):
        super().__init__()
        self.model = nn.Sequential(
            nn.Embedding(vocabulary_size, embedding_size),
            nn.Linear(embedding_size, vocabulary_size),
            nn.Softmax(dim=1),  # explicit dim avoids the implicit-dim deprecation warning
        )

    def forward(self, x):
        return self.model(x)
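

# IterableDataset that streams (previous word id, next word id) pairs over the
# whole training corpus.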
class Bigrams(IterableDataset):
    def __init__(self, text_file, vocabulary_size):
        self.vocab = build_vocab_from_iterator(
            get_line(text_file), max_tokens=vocabulary_size, specials=["<unk>"]
        )
        self.vocab.set_default_index(self.vocab["<unk>"])
        self.vocabulary_size = vocabulary_size
        self.text_file = text_file

    def __iter__(self):
        return look_ahead_iterator(
            (
                self.vocab[t]
                for t in itertools.chain.from_iterable(get_line(self.text_file))
            )
        )


vocab = build_vocab()
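

# Train the bigram model with Adam and NLLLoss over the log of the softmax outputs,
# then save the weights to model1.bin.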
def train():
    batch_size = 10000
    train_dataset = Bigrams("train/in.tsv.xz", VOCAB_SIZE)
    model = SimpleBigramNeuralLanguageModel(VOCAB_SIZE, EMBED_SIZE).to(DEVICE)
    train_data_loader = DataLoader(train_dataset, batch_size=batch_size)
    optimizer = torch.optim.Adam(model.parameters())
    criterion = torch.nn.NLLLoss()
    model.train()
    step = 0
    for x, y in train_data_loader:
        x = x.to(DEVICE)
        y = y.to(DEVICE)
        optimizer.zero_grad()
        ypredicted = model(x)
        # The model outputs softmax probabilities, so take the log before NLLLoss.
        loss = criterion(torch.log(ypredicted), y)
        if step % 100 == 0:
            print(step, loss)
        step += 1
        loss.backward()
        optimizer.step()
    torch.save(model.state_dict(), "model1.bin")
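

# Given the last word of the left context, return the 8 most probable next words as
# "word:prob" pairs; whatever probability mass is left over is emitted as ":<rest>".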
def predict(word, model):
    ixs = torch.tensor(vocab.forward([word])).to(DEVICE)
    out = model(ixs)
    top = torch.topk(out[0], 8)
    top_indices = top.indices.tolist()
    top_probs = top.values.tolist()
    top_words = vocab.lookup_tokens(top_indices)
    str_predictions = ""
    lht = 1.0
    for pred_word in zip(top_words, top_indices, top_probs):
        if lht - pred_word[2] >= 0:
            str_predictions += f"{pred_word[0]}:{pred_word[2]} "
            lht -= pred_word[2]
    if lht != 1.0:
        str_predictions += f":{lht}"
    return str_predictions
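

# Write one prediction line per input record; very short contexts get a fixed
# fallback distribution instead of a model prediction.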
def generate_predictions(input_file, output_file, model):
    with open(output_file, "w") as outputf:
        with lzma.open(input_file, mode="rt") as file:
            for text in read_file(file):
                tokens = tokenizer.tokenize(text)
                if len(tokens) < 4:
                    prediction = "the:0.2 be:0.2 to:0.2 of:0.1 and:0.1 a:0.1 :0.1"
                else:
                    prediction = predict(tokens[-1], model)
                outputf.write(prediction + "\n")
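

# Train, reload the saved weights and generate outputs for the dev-0 and test-A splits.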
if __name__ == "__main__":
    train()
    model = SimpleBigramNeuralLanguageModel(VOCAB_SIZE, EMBED_SIZE).to(DEVICE)
    model.load_state_dict(torch.load("model1.bin"))
    model.eval()
    generate_predictions("dev-0/in.tsv.xz", "dev-0/out.tsv", model)
    generate_predictions("test-A/in.tsv.xz", "test-A/out.tsv", model)