import itertools
import lzma
import pickle

import torch
from torch import nn
from torch.utils.data import DataLoader, IterableDataset
from torchtext.vocab import build_vocab_from_iterator
from tqdm import tqdm
def simple_preprocess(line):
    # Replace the literal '\n' markers embedded in the text with spaces.
    return line.replace(r'\n', ' ')
def get_words_from_line(line):
    line = line.strip()
    line = simple_preprocess(line)
    yield '<s>'
    for t in line.split():
        yield t
    yield '</s>'
def get_word_lines_from_file(file_name, n_size=-1):
    with lzma.open(file_name, 'r') as fh:
        n = 0
        for line in fh:
            n += 1
            yield get_words_from_line(line.decode('utf-8'))
            if n == n_size:
                break
def look_ahead_iterator(gen):
    prev = None
    for item in gen:
        if prev is not None:
            yield prev, item
        prev = item
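A quick illustration (not part of the original notebook) of what look_ahead_iterator yields: consecutive (previous, current) token pairs, which become the bigram training examples.

# Illustrative only: pair up a toy token stream.
toy_tokens = iter(['<s>', 'the', 'cat', 'sat', '</s>'])
print(list(look_ahead_iterator(toy_tokens)))
# [('<s>', 'the'), ('the', 'cat'), ('cat', 'sat'), ('sat', '</s>')]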
def build_vocab(file, vocab_size):
    try:
        # Reuse a cached vocabulary if one was pickled on a previous run.
        with open(f'bigram_nn_vocab_{vocab_size}.pickle', 'rb') as handle:
            vocab = pickle.load(handle)
    except FileNotFoundError:
        vocab = build_vocab_from_iterator(
            get_word_lines_from_file(file),
            max_tokens=vocab_size,
            specials=['<unk>'])
        with open(f'bigram_nn_vocab_{vocab_size}.pickle', 'wb') as handle:
            pickle.dump(vocab, handle, protocol=pickle.HIGHEST_PROTOCOL)
    return vocab
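A small usage sketch (illustrative; the example tokens are made up) of the torchtext Vocab object returned above: it maps tokens to indices, falls back to <unk> once a default index is set, and maps indices back to tokens with lookup_tokens.

vocab = build_vocab('challenging-america-word-gap-prediction/train/in.tsv.xz', 20000)
vocab.set_default_index(vocab['<unk>'])
print(vocab['<s>'])                    # index assigned to the sentence-start marker
print(vocab['qwertyuiop'])             # out-of-vocabulary token -> <unk> index
print(vocab.lookup_tokens([0, 1, 2]))  # reverse mapping: indices -> tokens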
class Bigrams(IterableDataset):
    def __init__(self, text_file, vocabulary_size):
        # Build (or load the cached) vocabulary for this corpus rather than relying on a global.
        self.vocab = build_vocab(text_file, vocabulary_size)
        self.vocab.set_default_index(self.vocab['<unk>'])
        self.vocabulary_size = vocabulary_size
        self.text_file = text_file

    def __iter__(self):
        return look_ahead_iterator(
            (self.vocab[t] for t in itertools.chain.from_iterable(
                get_word_lines_from_file(self.text_file))))
class SimpleBigramNeuralLanguageModel(nn.Module):
    def __init__(self, vocabulary_size, embedding_size):
        super(SimpleBigramNeuralLanguageModel, self).__init__()
        self.model = nn.Sequential(
            nn.Embedding(vocabulary_size, embedding_size),
            nn.Linear(embedding_size, vocabulary_size),
            nn.Softmax(dim=1)
        )

    def forward(self, x):
        return self.model(x)
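Because the network ends in nn.Softmax, the training cell below has to call torch.log before nn.NLLLoss. An equivalent but numerically more stable variant (a sketch with a hypothetical class name, not the notebook's original model) ends in nn.LogSoftmax, so its output can be passed to NLLLoss directly.

class LogSoftmaxBigramModel(nn.Module):
    # Sketch: same layers, but emits log-probabilities, so the loss needs no explicit torch.log.
    def __init__(self, vocabulary_size, embedding_size):
        super().__init__()
        self.model = nn.Sequential(
            nn.Embedding(vocabulary_size, embedding_size),
            nn.Linear(embedding_size, vocabulary_size),
            nn.LogSoftmax(dim=1),
        )

    def forward(self, x):
        return self.model(x)

With this variant the prediction code would take torch.exp of the output (or run topk on the log-probabilities, which ranks identically).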
max_steps = -1  # -1 disables the step limit (step never equals -1)
vocab_size = 20000
embed_size = 150
batch_size = 5000
learning_rate = 0.001
vocab = build_vocab('challenging-america-word-gap-prediction/train/in.tsv.xz', vocab_size)
train_dataset = Bigrams('challenging-america-word-gap-prediction/train/in.tsv.xz', vocab_size)
if torch.cuda.is_available():
    device = 'cuda'
else:
    raise RuntimeError('CUDA device required; no GPU was found')
model = SimpleBigramNeuralLanguageModel(vocab_size, embed_size).to(device)
data = DataLoader(train_dataset, batch_size=batch_size)
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
criterion = torch.nn.NLLLoss()
model.train()
step = 0
for x, y in data:
    x = x.to(device)
    y = y.to(device)
    optimizer.zero_grad()
    y_predicted = model(x)
    # NLLLoss expects log-probabilities, hence the explicit torch.log on the softmax output.
    loss = criterion(torch.log(y_predicted), y)
    if step % 1000 == 0:
        print(f'steps: {step}, loss: {loss.item()}')
        if step != 0:
            torch.save(model.state_dict(),
                       f'bigram_nn_model_steps-{step}_vocab-{vocab_size}_embed-{embed_size}_batch-{batch_size}.bin')
    if step == max_steps:
        break
    step += 1
    loss.backward()
    optimizer.step()
steps: 0, loss: 10.091094017028809
steps: 1000, loss: 5.73332405090332
steps: 2000, loss: 5.655370712280273
steps: 3000, loss: 5.457630634307861
steps: 4000, loss: 5.38517427444458
steps: 5000, loss: 5.467936992645264
steps: 6000, loss: 5.372152328491211
steps: 7000, loss: 5.272013187408447
steps: 8000, loss: 5.439966201782227
steps: 9000, loss: 5.268238544464111
steps: 10000, loss: 5.1395182609558105
steps: 11000, loss: 5.2558159828186035
steps: 12000, loss: 5.263617515563965
KeyboardInterrupt in loss.backward(): training was stopped manually after about 12000 steps.
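As a rough sanity check (an illustration, not output from the original run), the per-token cross-entropy printed above maps to a training perplexity of exp(loss):

import math
print(math.exp(5.26))  # ~192: approximate training perplexity at a loss of about 5.26

This is the perplexity on training batches, not the dev-set score reported by geval further below.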
vocab_size = 20000
embed_size = 150
batch_size = 5000
vocab = build_vocab('challenging-america-word-gap-prediction/train/in.tsv.xz', vocab_size)
vocab.set_default_index(vocab['<unk>'])
topk = 5
preds = []
device = 'cuda'
model = SimpleBigramNeuralLanguageModel(vocab_size, embed_size).to(device)
model.load_state_dict(torch.load('bigram_nn_model_steps-10000_vocab-20000_embed-150_batch-5000.bin'))
model.eval()
for path in ['challenging-america-word-gap-prediction/dev-0', 'challenging-america-word-gap-prediction/test-A']:
    with lzma.open(f'{path}/in.tsv.xz', 'r') as fh, open(f'{path}/out.tsv', 'w', encoding='utf-8') as f_out:
        for line in fh:
            # Take the last word of the left context (second-to-last tab-separated field).
            previous_word = simple_preprocess(line.decode('utf-8').split('\t')[-2].strip()).split()[-1]
            ixs = torch.tensor(vocab.forward([previous_word])).to(device)
            with torch.no_grad():  # inference only; no gradients needed
                out = model(ixs)
            top = torch.topk(out[0], topk)
            top_indices = top.indices.tolist()
            top_probs = top.values.tolist()
            top_words = vocab.lookup_tokens(top_indices)
            top_zipped = zip(top_words, top_probs)
            pred = ''
            total_prob = 0
            for word, prob in top_zipped:
                if word != '<unk>':
                    pred += f'{word}:{prob} '
                    total_prob += prob
            unk_prob = 1 - total_prob
            pred += f':{unk_prob}'
            f_out.write(pred + '\n')
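Each line of out.tsv lists the top candidate words with their probabilities, followed by a bare ":prob" entry that assigns the remaining probability mass to all other words. A quick way to inspect the result (illustrative only; the example values in the comment are made up):

# Illustrative only: look at the first prediction line written for dev-0.
with open('challenging-america-word-gap-prediction/dev-0/out.tsv', encoding='utf-8') as f:
    print(f.readline().strip())
# e.g. the:0.21 of:0.12 and:0.08 to:0.05 :0.54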
%cd challenging-america-word-gap-prediction/
!./geval --test-name dev-0
%cd ../
/home/ked/PycharmProjects/mj9/challenging-america-word-gap-prediction
394.97
/home/ked/PycharmProjects/mj9