# challenging-america-word-ga.../run.py

from itertools import islice
import regex as re
import sys
from torchtext.vocab import build_vocab_from_iterator
import lzma
from torch import nn
import torch
from torch.utils.data import IterableDataset
import itertools
from torch.utils.data import DataLoader
import numpy as np


# def get_words_from_line(file_path):
#     for index, line in enumerate(get_lines_from_file(file)):
#         yield '<s>'
#         for m in re.finditer(r'[\p{L}0-9\*]+|\p{P}+', line):
#             yield m.group(0).lower()
#         yield '</s>'
#         if index == 10000:
#             break


def get_words_from_line(line):
    """Tokenise one line of text into lower-cased tokens with sentence markers.

    Trailing whitespace is stripped first. The stream always starts with
    ``'<s>'`` and ends with ``'</s>'``; in between come the lower-cased
    matches of the token pattern: runs of Unicode letters, ASCII digits and
    asterisks, or runs of punctuation characters.

    The ``\\p{...}`` classes in the pattern rely on the third-party ``regex``
    package, which this file imports under the name ``re``.

    Args:
        line: the text to tokenise.

    Yields:
        ``'<s>'``, each token in order of appearance, then ``'</s>'``.
    """
    stripped = line.rstrip()
    yield '<s>'
    yield from (
        match.group(0).lower()
        for match in re.finditer(r'[\p{L}0-9\*]+|\p{P}+', stripped)
    )
    yield '</s>'


def get_words_lines_from_file(file_path, max_lines=50001):
    """Yield one lazy token stream per row of an xz-compressed TSV file.

    Every row is split on tabs and columns 6 and 7 (zero-based) are joined
    with a single space. The joined text is then cleaned, in this order:

    1. every literal two-character sequence backslash + ``n`` becomes a space
       and real newline characters are removed;
    2. the text is lower-cased;
    3. runs of spaces are collapsed into one space;
    4. every run of characters that are not word characters, digits,
       apostrophes or whitespace is deleted.

    The cleaned text is handed to ``get_words_from_line``.

    Args:
        file_path: path of the ``.xz`` compressed, tab-separated input file.
        max_lines: maximum number of leading rows to read. The default of
            50001 matches the previously hard-coded cut-off (stop after the
            row with index 50000). Pass ``None`` to read the whole file.

    Yields:
        For each row read, the token generator returned by
        ``get_words_from_line``.

    Raises:
        IndexError: if a row has fewer than eight tab-separated columns.
    """
    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's locale default encoding.
    with lzma.open(file_path, mode='rt', encoding='utf-8') as file:
        # islice stops after max_lines rows without reading any further.
        for line in islice(file, max_lines):
            columns = line.split("\t")
            text = ' '.join([columns[6], columns[7]])
            text = text.replace("\\n", " ").replace("\n", "").lower()
            text = re.sub(' +', ' ', text)
            text = re.sub(r"[^\w\d'\s]+", '', text)
            yield get_words_from_line(text)


# Hyper-parameters shared by the model, the dataset and the scripts below.
vocab_size = 20000
embed_size = 100

# Module-level vocabulary, built from the training corpus when this module is
# loaded; its size is capped through `max_tokens`.
vocab = build_vocab_from_iterator(
    get_words_lines_from_file('train/in.tsv.xz'),
    specials=['<unk>'],
    max_tokens=vocab_size,
)

# Use the index of '<unk>' as the vocabulary's default index.
vocab.set_default_index(vocab['<unk>'])
# vocab=None


class SimpleBigramNeuralLanguageModel(nn.Module):
    """Bigram language model: embedding -> linear layer -> softmax.

    Given the index of the previous token, the model outputs a probability
    distribution over the whole vocabulary for the next token.

    Args:
        vocabulary_size: number of rows of the embedding table and width of
            the output distribution.
        embedding_size: dimensionality of each token embedding.
    """

    def __init__(self, vocabulary_size, embedding_size):
        super().__init__()
        # The layers stay inside one nn.Sequential stored as `self.model`:
        # saved state-dict keys ('model.0.weight', ...) and the embedding
        # access `model.model[0]` used elsewhere in this file depend on it.
        self.model = nn.Sequential(
            nn.Embedding(vocabulary_size, embedding_size),
            nn.Linear(embedding_size, vocabulary_size),
            # Normalise explicitly over the last (vocabulary) axis. Without
            # `dim`, PyTorch emits a deprecation warning and picks axis 0 for
            # 3-D activations, which would normalise across the batch.
            nn.Softmax(dim=-1),
        )

    def forward(self, x):
        """Return next-token probabilities for the token indices in *x*.

        The result has the shape of *x* plus one trailing axis of length
        ``vocabulary_size`` that sums to 1.
        """
        return self.model(x)


def look_ahead_iterator(gen):
    """Yield consecutive overlapping pairs ``(previous, current)`` from *gen*.

    For the input ``a, b, c`` this yields ``(a, b)`` and ``(b, c)``. Inputs
    with fewer than two items yield nothing.

    Args:
        gen: any iterable; it is consumed lazily, one item at a time.

    Yields:
        2-tuples of adjacent items, in input order.
    """
    iterator = iter(gen)
    # Fetch the first item explicitly instead of using None as a "no previous
    # item yet" marker, so that None values inside the stream are paired like
    # any other item.
    try:
        prev = next(iterator)
    except StopIteration:
        return
    for item in iterator:
        yield (prev, item)
        prev = item


class Bigrams(IterableDataset):
    """Iterable dataset of ``(previous_token_id, next_token_id)`` pairs.

    On construction a vocabulary is built from *text_file*; each iteration
    re-reads the file, maps every token to its vocabulary index and emits
    the adjacent index pairs of the resulting stream. The stream is one
    continuous chain over all lines, so pairs also span line boundaries.

    Args:
        text_file: path handed to ``get_words_lines_from_file``.
        vocabulary_size: value passed as ``max_tokens`` when building the
            vocabulary.
    """

    def __init__(self, text_file, vocabulary_size):
        self.text_file = text_file
        self.vocabulary_size = vocabulary_size

        built_vocab = build_vocab_from_iterator(
            get_words_lines_from_file(text_file),
            max_tokens=vocabulary_size,
            specials=['<unk>'])
        # Use the index of '<unk>' as the vocabulary's default index.
        built_vocab.set_default_index(built_vocab['<unk>'])
        self.vocab = built_vocab

    def __iter__(self):
        token_lines = get_words_lines_from_file(self.text_file)

        def token_ids():
            # Flatten the per-line token streams into one stream of indices.
            for line_tokens in token_lines:
                for token in line_tokens:
                    yield self.vocab[token]

        return look_ahead_iterator(token_ids())


def train():
    """Train the bigram model on ``train/in.tsv.xz`` and save its weights.

    Makes a single pass over the bigram pairs produced by ``Bigrams`` in
    batches of 22000, optimising with Adam against the negative
    log-likelihood of the next token. The loss is printed every 100 steps
    and the final state dict is written to ``model1.bin``.

    The dataset builds its own vocabulary from the same file and with the
    same size limit as the module-level ``vocab``.
    """
    batch_size = 22000

    train_dataset = Bigrams('train/in.tsv.xz', vocab_size)

    # Prefer the GPU, but fall back to the CPU so the script still runs on
    # machines without CUDA instead of failing on `.to('cuda')`.
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    model = SimpleBigramNeuralLanguageModel(vocab_size, embed_size).to(device)
    train_data_loader = DataLoader(train_dataset, batch_size=batch_size)
    optimizer = torch.optim.Adam(model.parameters())
    criterion = torch.nn.NLLLoss()

    model.train()
    step = 0
    for x, y in train_data_loader:
        # Move the batch to the selected device
        x = x.to(device)
        y = y.to(device)
        # Clear the gradients
        optimizer.zero_grad()
        # Forward pass: the model returns probabilities
        ypredicted = model(x)
        # NLLLoss expects log-probabilities, hence the explicit log.
        # NOTE(review): log() of a probability that underflowed to 0 is -inf;
        # consider a log-softmax output if the loss ever becomes inf/nan.
        loss = criterion(torch.log(ypredicted), y)
        if step % 100 == 0:
            print(step, loss)
        step += 1
        # Calculate gradients
        loss.backward()
        # Update weights
        optimizer.step()
    torch.save(model.state_dict(), 'model1.bin')


def predict(word='for', k=10):
    """Print the *k* most probable next tokens after *word*.

    Loads the weights saved in ``model1.bin``, looks *word* up in the
    module-level ``vocab`` and prints a list of
    ``(token, vocabulary index, probability)`` tuples, most probable first.

    Args:
        word: the preceding token to condition on (default ``'for'``).
        k: how many candidates to print (default 10).
    """
    # Prefer the GPU, but fall back to the CPU when CUDA is unavailable.
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    model = SimpleBigramNeuralLanguageModel(vocab_size, embed_size).to(device)
    # map_location lets a checkpoint saved from a GPU run be loaded on a
    # CPU-only machine.
    model.load_state_dict(torch.load('model1.bin', map_location=device))
    model.eval()

    ixs = torch.tensor(vocab.forward([word])).to(device)

    # Inference only: no autograd graph is needed.
    with torch.no_grad():
        out = model(ixs)
    top = torch.topk(out[0], k)
    top_indices = top.indices.tolist()
    top_probs = top.values.tolist()
    top_words = vocab.lookup_tokens(top_indices)
    print(list(zip(top_words, top_indices, top_probs)))


def similar(word='went', k=10):
    """Print the *k* tokens whose embeddings are closest to that of *word*.

    Loads the weights saved in ``model1.bin``, takes the embedding table
    (the first layer of the model) and ranks every vocabulary entry by
    cosine similarity to the embedding of *word*. Prints a list of
    ``(token, vocabulary index, similarity)`` tuples, most similar first.

    Args:
        word: the query token, looked up in the module-level ``vocab``
            (default ``'went'``).
        k: how many neighbours to print (default 10).
    """
    # Prefer the GPU, but fall back to the CPU when CUDA is unavailable.
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    model = SimpleBigramNeuralLanguageModel(vocab_size, embed_size).to(device)
    # map_location lets a checkpoint saved from a GPU run be loaded on a
    # CPU-only machine.
    model.load_state_dict(torch.load('model1.bin', map_location=device))
    model.eval()

    cos = nn.CosineSimilarity(dim=1, eps=1e-6)

    # Inference only: no autograd graph is needed.
    with torch.no_grad():
        embeddings = model.model[0].weight

        vec = embeddings[vocab[word]]

        # The single query vector is compared against every embedding row.
        similarities = cos(vec, embeddings)

        top = torch.topk(similarities, k)

    top_indices = top.indices.tolist()
    top_probs = top.values.tolist()
    top_words = vocab.lookup_tokens(top_indices)
    print(list(zip(top_words, top_indices, top_probs)))


if __name__ == "__main__":
    # Script entry point. Training is currently disabled; uncomment the next
    # line to train and write 'model1.bin', the file that predict() loads.
    # train()
    predict()