challenging-america-word-ga.../solution.ipynb

import itertools
import lzma
import pickle

import torch
from torch import nn
from torch.utils.data import DataLoader, IterableDataset
from torchtext.vocab import build_vocab_from_iterator
from tqdm import tqdm

def simple_preprocess(line):
    # The corpus stores line breaks as the literal two-character sequence '\n';
    # replace that marker with a space so it does not glue neighbouring words together.
    return line.replace(r'\n', ' ')

def get_words_from_line(line):
    line = line.strip()
    line = simple_preprocess(line)
    yield '<s>'
    for t in line.split():
        yield t
    yield '</s>'

def get_word_lines_from_file(file_name, n_size=-1):
    # Yield one token generator per line; n_size > 0 limits how many lines are read.
    with lzma.open(file_name, 'r') as fh:
        n = 0
        for line in fh:
            n += 1
            yield get_words_from_line(line.decode('utf-8'))
            if n == n_size:
                break

def look_ahead_iterator(gen):
    # Turn a token stream into consecutive (previous, current) pairs, i.e. bigrams.
    prev = None
    for item in gen:
        if prev is not None:
            yield prev, item
        prev = item
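As a quick illustration (not part of the pipeline), the iterator pairs each token with its successor:

# Toy example; the real pipeline feeds vocabulary indices rather than strings.
list(look_ahead_iterator(iter(['<s>', 'the', 'cat', 'sat', '</s>'])))
# [('<s>', 'the'), ('the', 'cat'), ('cat', 'sat'), ('sat', '</s>')]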

def build_vocab(file, vocab_size):
    # Build the vocabulary once and cache it on disk; later runs load the pickle.
    try:
        with open(f'bigram_nn_vocab_{vocab_size}.pickle', 'rb') as handle:
            vocab = pickle.load(handle)
    except FileNotFoundError:
        vocab = build_vocab_from_iterator(
            get_word_lines_from_file(file),
            max_tokens=vocab_size,
            specials=['<unk>'])
        with open(f'bigram_nn_vocab_{vocab_size}.pickle', 'wb') as handle:
            pickle.dump(vocab, handle, protocol=pickle.HIGHEST_PROTOCOL)
    return vocab
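A quick sanity check on the cached vocabulary can look like the sketch below (the exact indices depend on the training data):

# Sketch: inspect the cached vocabulary; '<unk>' is the only special token, so it gets index 0.
vocab = build_vocab('challenging-america-word-gap-prediction/train/in.tsv.xz', 20000)
vocab.set_default_index(vocab['<unk>'])
print(len(vocab))                # at most 20000
print(vocab['the'])              # index of a frequent word
print(vocab.lookup_tokens([0]))  # ['<unk>']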

class Bigrams(IterableDataset):
    def __init__(self, text_file, vocabulary_size):
        # Build (or load the cached) vocabulary for this file instead of relying on a global.
        self.vocab = build_vocab(text_file, vocabulary_size)
        self.vocab.set_default_index(self.vocab['<unk>'])
        self.vocabulary_size = vocabulary_size
        self.text_file = text_file

    def __iter__(self):
        # Stream (previous word, current word) index pairs over the whole file.
        return look_ahead_iterator(
            (self.vocab[t] for t in itertools.chain.from_iterable(get_word_lines_from_file(self.text_file))))

class SimpleBigramNeuralLanguageModel(nn.Module):
    def __init__(self, vocabulary_size, embedding_size):
        super(SimpleBigramNeuralLanguageModel, self).__init__()
        # Embed the previous word, project back to vocabulary size, normalize with softmax.
        self.model = nn.Sequential(
            nn.Embedding(vocabulary_size, embedding_size),
            nn.Linear(embedding_size, vocabulary_size),
            nn.Softmax(dim=1)
        )

    def forward(self, x):
        # Returns P(next word | previous word) for each element of the batch x.
        return self.model(x)
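Since the training loop below applies torch.log to the softmax output before NLLLoss, a numerically more stable variant (a sketch, not the model used in this notebook) would emit log-probabilities directly:

# Sketch of an equivalent formulation with LogSoftmax; NLLLoss can then consume
# the output directly, avoiding torch.log(softmax(x)) during training.
class LogSoftmaxBigramLM(nn.Module):
    def __init__(self, vocabulary_size, embedding_size):
        super().__init__()
        self.model = nn.Sequential(
            nn.Embedding(vocabulary_size, embedding_size),
            nn.Linear(embedding_size, vocabulary_size),
            nn.LogSoftmax(dim=1),
        )

    def forward(self, x):
        return self.model(x)  # log P(next word | previous word)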
max_steps = -1  # -1: no step limit, train until the data runs out or is interrupted
vocab_size = 20000
embed_size = 150
batch_size = 5000
learning_rate = 0.001
vocab = build_vocab('challenging-america-word-gap-prediction/train/in.tsv.xz', vocab_size)
train_dataset = Bigrams('challenging-america-word-gap-prediction/train/in.tsv.xz', vocab_size)
if torch.cuda.is_available():
    device = 'cuda'
else:
    raise RuntimeError('CUDA device required for training')
model = SimpleBigramNeuralLanguageModel(vocab_size, embed_size).to(device)
data = DataLoader(train_dataset, batch_size=batch_size)
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
criterion = torch.nn.NLLLoss()

model.train()
step = 0
for x, y in data:
    # x: previous-word indices, y: target next-word indices
    x = x.to(device)
    y = y.to(device)
    optimizer.zero_grad()
    y_predicted = model(x)
    # The model outputs probabilities, so take the log before NLLLoss.
    loss = criterion(torch.log(y_predicted), y)
    if step % 1000 == 0:
        print(f'steps: {step}, loss: {loss.item()}')
        if step != 0:
            # Checkpoint every 1000 steps (skipping step 0).
            torch.save(model.state_dict(), f'bigram_nn_model_steps-{step}_vocab-{vocab_size}_embed-{embed_size}_batch-{batch_size}.bin')
    if step == max_steps:
        break
    step += 1
    loss.backward()
    optimizer.step()
/home/ked/PycharmProjects/mj9/venv/lib/python3.10/site-packages/torch/nn/modules/container.py:217: UserWarning: Implicit dimension choice for softmax has been deprecated. Change the call to include dim=X as an argument.
  input = module(input)
steps: 0, loss: 10.091094017028809
steps: 1000, loss: 5.73332405090332
steps: 2000, loss: 5.655370712280273
steps: 3000, loss: 5.457630634307861
steps: 4000, loss: 5.38517427444458
steps: 5000, loss: 5.467936992645264
steps: 6000, loss: 5.372152328491211
steps: 7000, loss: 5.272013187408447
steps: 8000, loss: 5.439966201782227
steps: 9000, loss: 5.268238544464111
steps: 10000, loss: 5.1395182609558105
steps: 11000, loss: 5.2558159828186035
steps: 12000, loss: 5.263617515563965
---------------------------------------------------------------------------
KeyboardInterrupt                         Traceback (most recent call last)
Cell In[4], line 31
     29   break
     30 step += 1
---> 31 loss.backward()
     32 optimizer.step()

File ~/PycharmProjects/mj9/venv/lib/python3.10/site-packages/torch/_tensor.py:487, in Tensor.backward(self, gradient, retain_graph, create_graph, inputs)
    477 if has_torch_function_unary(self):
    478     return handle_torch_function(
    479         Tensor.backward,
    480         (self,),
   (...)
    485         inputs=inputs,
    486     )
--> 487 torch.autograd.backward(
    488     self, gradient, retain_graph, create_graph, inputs=inputs
    489 )

File ~/PycharmProjects/mj9/venv/lib/python3.10/site-packages/torch/autograd/__init__.py:200, in backward(tensors, grad_tensors, retain_graph, create_graph, grad_variables, inputs)
    195     retain_graph = create_graph
    197 # The reason we repeat same the comment below is that
    198 # some Python versions print out the first line of a multi-line function
    199 # calls in the traceback and some print out the last line
--> 200 Variable._execution_engine.run_backward(  # Calls into the C++ engine to run the backward pass
    201     tensors, grad_tensors_, retain_graph, create_graph, inputs,
    202     allow_unreachable=True, accumulate_grad=True)

KeyboardInterrupt: 
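For a rough sense of scale: the printed loss is the average negative log-likelihood per target token, so exp(loss) gives an in-sample perplexity estimate (geval scores dev-0 with its own metric further below).

import math
# Rough in-sample perplexity from the last printed training loss (illustrative):
print(math.exp(5.2636))  # roughly 193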
vocab_size = 20000
embed_size = 150
batch_size = 5000
vocab = build_vocab('challenging-america-word-gap-prediction/train/in.tsv.xz', vocab_size)
vocab.set_default_index(vocab['<unk>'])
topk = 5
preds = []
device = 'cuda'
model = SimpleBigramNeuralLanguageModel(vocab_size, embed_size).to(device)
model.load_state_dict(torch.load('bigram_nn_model_steps-10000_vocab-20000_embed-150_batch-5000.bin'))
model.eval()
for path in ['challenging-america-word-gap-prediction/dev-0', 'challenging-america-word-gap-prediction/test-A']:
    with lzma.open(f'{path}/in.tsv.xz', 'r') as fh, open(f'{path}/out.tsv', 'w', encoding='utf-8') as f_out:
        for line in fh:
            # The left context is the second-to-last TSV column; condition on its last word.
            previous_word = simple_preprocess(line.decode('utf-8').split('\t')[-2].strip()).split()[-1]
            ixs = torch.tensor(vocab.forward([previous_word])).to(device)
            with torch.no_grad():
                out = model(ixs)
            # Keep the top-k most probable next words, skipping '<unk>'.
            top = torch.topk(out[0], topk)
            top_indices = top.indices.tolist()
            top_probs = top.values.tolist()
            top_words = vocab.lookup_tokens(top_indices)
            top_zipped = zip(top_words, top_probs)
            pred = ''
            total_prob = 0
            for word, prob in top_zipped:
                if word != '<unk>':
                    pred += f'{word}:{prob} '
                    total_prob += prob
            # Assign the remaining probability mass to the trailing catch-all ':' entry.
            unk_prob = 1 - total_prob
            pred += f':{unk_prob}'
            f_out.write(pred + '\n')
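For reference, a sketch of the output format written above (the numbers are made up):

# Each out.tsv line is a sparse distribution over candidate gap words, e.g.:
#   the:0.21 of:0.11 and:0.08 to:0.05 :0.55
# The trailing ':<mass>' entry assigns the leftover probability to all other words,
# which is the format geval expects for this task.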
%cd challenging-america-word-gap-prediction/
!./geval --test-name dev-0
%cd ../
/home/ked/PycharmProjects/mj9/challenging-america-word-gap-prediction
394.97
/home/ked/PycharmProjects/mj9