challenging-america-word-ga.../run.py

from itertools import islice
import regex as re
import sys
from torchtext.vocab import build_vocab_from_iterator
import lzma
from torch import nn
import torch
from torch.utils.data import IterableDataset
import itertools
from torch.utils.data import DataLoader
import numpy as np


# def get_words_from_line(file_path):
#     for index, line in enumerate(get_lines_from_file(file)):
#         yield '<s>'
#         for m in re.finditer(r'[\p{L}0-9\*]+|\p{P}+', line):
#             yield m.group(0).lower()
#         yield '</s>'
#         if index == 10000:
#             break


def get_words_from_line(line):
    """Lazily tokenize one line of text, wrapped in sentence markers.

    Yields '<s>' first, then every lower-cased match of the token pattern
    (a run of letters/digits/asterisks, or a run of punctuation) found in
    the line with trailing whitespace removed, and finally '</s>'.

    NOTE: the ``\\p{...}`` classes in the pattern need the ``regex`` package,
    which this file imports under the name ``re``.
    """
    stripped = line.rstrip()
    yield '<s>'
    yield from (
        match.group(0).lower()
        for match in re.finditer(r'[\p{L}0-9\*]+|\p{P}+', stripped)
    )
    yield '</s>'


def get_words_lines_from_file(file_path):
    """Yield one token generator per line of an xz-compressed TSV file.

    For every line, the tab-separated fields at indices 6 and 7 are joined
    with a space (presumably the two text-context columns of the data set --
    verify against the data description), cleaned up and handed to
    get_words_from_line().

    Reading stops after the line with index 50000 has been yielded, i.e. at
    most 50001 lines are consumed.
    """
    with lzma.open(file_path, mode='rt') as file:
        for index, line in enumerate(file):
            columns = line.split("\t")
            joined = ' '.join([columns[6], columns[7]])
            # Literal backslash-n sequences become spaces, real newlines vanish.
            flattened = joined.replace("\\n", " ").replace("\n", "").lower()
            # Collapse runs of spaces before stripping unwanted characters.
            squeezed = re.sub(' +', ' ', flattened)
            # Keep only word characters, digits, apostrophes and whitespace.
            cleaned = re.sub(r"[^\w\d'\s]+", '', squeezed)
            yield get_words_from_line(cleaned)
            if index == 50000:
                break


# Upper bound on the number of vocabulary entries (passed as max_tokens below);
# also used as the model's vocabulary dimension in train(), predict() and
# similar().
vocab_size = 20000

# Module-level vocabulary, built from the training file every time this module
# is imported or run. '<unk>' is registered as a special token.
vocab = build_vocab_from_iterator(
    get_words_lines_from_file('train/in.tsv.xz'),
    max_tokens=vocab_size,
    specials=['<unk>'])

# Make '<unk>' the default index, i.e. the index used for tokens that are not
# in the vocabulary.
vocab.set_default_index(vocab['<unk>'])
# vocab=None

# Size of the token embedding vectors handed to the model constructor.
embed_size = 100


class SimpleBigramNeuralLanguageModel(nn.Module):
    """Bigram language model: embedding -> linear layer -> softmax.

    Given token indices, the model returns for each of them a probability
    distribution over the vocabulary (the last dimension of the output).

    Args:
        vocabulary_size: number of distinct token indices; also the size of
            the output distribution.
        embedding_size: dimensionality of the token embeddings.
    """

    def __init__(self, vocabulary_size, embedding_size):
        super(SimpleBigramNeuralLanguageModel, self).__init__()
        # The layer order is part of the contract: saved checkpoints use the
        # keys 'model.0.*' / 'model.1.*' and callers read model.model[0].
        self.model = nn.Sequential(
            nn.Embedding(vocabulary_size, embedding_size),
            nn.Linear(embedding_size, vocabulary_size),
            # Normalise explicitly over the vocabulary axis. Without `dim`,
            # PyTorch falls back to a deprecated implicit choice that warns
            # and picks the wrong axis for inputs with extra dimensions.
            nn.Softmax(dim=-1)
        )

    def forward(self, x):
        """Return next-token probabilities for the token indices in ``x``."""
        return self.model(x)


def look_ahead_iterator(gen):
    """Yield consecutive overlapping pairs ``(previous, current)`` from ``gen``.

    For the items a, b, c this yields (a, b) and (b, c). Inputs with fewer
    than two items yield nothing.

    The first item is pulled from the iterator up front instead of using
    ``None`` as a "no previous item yet" marker, so streams that legitimately
    contain ``None`` are paired correctly as well.
    """
    iterator = iter(gen)
    try:
        prev = next(iterator)
    except StopIteration:
        # Empty input: there is nothing to pair.
        return
    for item in iterator:
        yield (prev, item)
        prev = item


class Bigrams(IterableDataset):
    """Iterable dataset of consecutive token-index pairs read from a text file.

    The vocabulary is built from ``text_file`` on construction, capped at
    ``vocabulary_size`` entries, with '<unk>' as the default index.
    """

    def __init__(self, text_file, vocabulary_size):
        self.text_file = text_file
        self.vocabulary_size = vocabulary_size

        token_lines = get_words_lines_from_file(text_file)
        built_vocab = build_vocab_from_iterator(
            token_lines,
            max_tokens=vocabulary_size,
            specials=['<unk>'])
        built_vocab.set_default_index(built_vocab['<unk>'])
        self.vocab = built_vocab

    def __iter__(self):
        """Return an iterator over (previous_index, next_index) pairs."""
        # Flatten the per-line token generators into one token stream.
        tokens = itertools.chain.from_iterable(
            get_words_lines_from_file(self.text_file))
        token_ids = (self.vocab[token] for token in tokens)
        return look_ahead_iterator(token_ids)


def train():
    """Train the bigram model on 'train/in.tsv.xz' and save it to 'model1.bin'.

    Makes a single pass over the bigram stream produced by Bigrams, optimising
    with Adam and the negative log-likelihood loss, and prints the loss every
    100 steps. The trained weights (state_dict) are written to 'model1.bin'.
    """
    batch_size = 22000

    train_dataset = Bigrams('train/in.tsv.xz', vocab_size)

    # Use the GPU when present; fall back to the CPU instead of crashing on
    # machines without CUDA.
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    model = SimpleBigramNeuralLanguageModel(vocab_size, embed_size).to(device)
    train_data_loader = DataLoader(train_dataset, batch_size=batch_size)
    optimizer = torch.optim.Adam(model.parameters())
    criterion = torch.nn.NLLLoss()

    model.train()
    for step, (x, y) in enumerate(train_data_loader):
        # Move the batch to the device the model lives on.
        x = x.to(device)
        y = y.to(device)
        # Clear gradients left over from the previous step.
        optimizer.zero_grad()
        # Forward pass: probabilities over the vocabulary for each input token.
        ypredicted = model(x)
        # NLLLoss expects log-probabilities, the model outputs probabilities.
        loss = criterion(torch.log(ypredicted), y)
        if step % 100 == 0:
            print(step, loss)
        # Back-propagate and update the weights.
        loss.backward()
        optimizer.step()
    torch.save(model.state_dict(), 'model1.bin')


def predict():
    """Print the 10 most probable successors of the word 'for'.

    Loads the weights from 'model1.bin', runs the model on the vocabulary
    index of 'for' and prints a list of (word, index, probability) tuples.
    """
    # Use the GPU when present; fall back to the CPU instead of crashing on
    # machines without CUDA.
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    model = SimpleBigramNeuralLanguageModel(vocab_size, embed_size).to(device)
    # map_location lets a checkpoint saved on a GPU be loaded on a CPU-only host.
    model.load_state_dict(torch.load('model1.bin', map_location=device))
    model.eval()

    ixs = torch.tensor(vocab.forward(['for'])).to(device)

    # Inference only: no gradient bookkeeping needed.
    with torch.no_grad():
        out = model(ixs)
    top = torch.topk(out[0], 10)
    top_indices = top.indices.tolist()
    top_probs = top.values.tolist()
    top_words = vocab.lookup_tokens(top_indices)
    print(list(zip(top_words, top_indices, top_probs)))


def similar():
    """Print the 10 vocabulary entries whose embeddings are closest to 'went'.

    Loads the weights from 'model1.bin', compares the embedding of 'went' with
    every row of the embedding matrix by cosine similarity and prints a list
    of (word, index, similarity) tuples.
    """
    # Use the GPU when present; fall back to the CPU instead of crashing on
    # machines without CUDA.
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    model = SimpleBigramNeuralLanguageModel(vocab_size, embed_size).to(device)
    # map_location lets a checkpoint saved on a GPU be loaded on a CPU-only host.
    model.load_state_dict(torch.load('model1.bin', map_location=device))
    model.eval()

    cos = nn.CosineSimilarity(dim=1, eps=1e-6)

    # Inference only: no gradient bookkeeping needed.
    with torch.no_grad():
        # The first layer of the sequential model is the embedding table.
        embeddings = model.model[0].weight

        vec = embeddings[vocab['went']]

        similarities = cos(vec, embeddings)

        top = torch.topk(similarities, 10)

    top_indices = top.indices.tolist()
    top_probs = top.values.tolist()
    top_words = vocab.lookup_tokens(top_indices)
    print(list(zip(top_words, top_indices, top_probs)))


if __name__ == "__main__":
    # Training is switched off here; predict() loads the weights from
    # 'model1.bin', the file that train() writes.
    # train()
    predict()
Neural bigram with/out validation. 2022-05-07 00:08:59 +02:00			`from itertools import islice`
Add yeild file read. 2022-04-04 18:31:33 +02:00			`import regex as re`
Neural bigram with/out validation. 2022-05-07 00:08:59 +02:00			`import sys`
			`from torchtext.vocab import build_vocab_from_iterator`
Add yeild file read. 2022-04-04 18:31:33 +02:00			`import lzma`
Neural bigram with/out validation. 2022-05-07 00:08:59 +02:00			`from torch import nn`
			`import torch`
			`from torch.utils.data import IterableDataset`
			`import itertools`
			`from torch.utils.data import DataLoader`
			`import numpy as np`
Tworzenie bigramow i trigramow metoda 1. Funkcje yield. 2022-03-26 00:16:16 +01:00

Neural bigram with/out validation. 2022-05-07 00:08:59 +02:00			`# def get_words_from_line(file_path):`
			`# for index, line in enumerate(get_lines_from_file(file)):`
			`# yield '<s>'`
			`# for m in re.finditer(r'[\p{L}0-9\*]+\|\p{P}+', line):`
			`# yield m.group(0).lower()`
			`# yield '</s>'`
			`# if index == 10000:`
			`# break`


			`def get_words_from_line(line):`
			`line = line.rstrip()`
			`yield '<s>'`
			`for m in re.finditer(r'[\p{L}0-9\*]+\|\p{P}+', line):`
			`yield m.group(0).lower()`
			`yield '</s>'`
kenlm 2022-04-25 16:58:55 +02:00

Neural bigram with/out validation. 2022-05-07 00:08:59 +02:00			`def get_words_lines_from_file(file_path):`
			`with lzma.open(file_path, mode='rt') as file:`
			`for index, line in enumerate(file):`
kenlm 2022-04-25 16:58:55 +02:00			`text = line.split("\t")`
Neural bigram with/out validation. 2022-05-07 00:08:59 +02:00			`yield get_words_from_line(re.sub(r"[^\w\d'\s]+", '', re.sub(' +', ' ', ' '.join([text[6], text[7]]).replace("\\n", " ").replace("\n", "").lower())))`
			`if index == 50000:`
			`break`


			`vocab_size = 20000`

			`vocab = build_vocab_from_iterator(`
			`get_words_lines_from_file('train/in.tsv.xz'),`
			`max_tokens=vocab_size,`
			`specials=['<unk>'])`

			`vocab.set_default_index(vocab['<unk>'])`
			`# vocab=None`

			`embed_size = 100`


			`class SimpleBigramNeuralLanguageModel(nn.Module):`
			`def __init__(self, vocabulary_size, embedding_size):`
			`super(SimpleBigramNeuralLanguageModel, self).__init__()`
			`self.model = nn.Sequential(`
			`nn.Embedding(vocabulary_size, embedding_size),`
			`nn.Linear(embedding_size, vocabulary_size),`
			`nn.Softmax()`
			`)`

			`def forward(self, x):`
			`return self.model(x)`


			`def look_ahead_iterator(gen):`
			`prev = None`
			`for item in gen:`
			`if prev is not None:`
			`yield (prev, item)`
			`prev = item`


			`class Bigrams(IterableDataset):`
			`def __init__(self, text_file, vocabulary_size):`
			`self.vocab = build_vocab_from_iterator(`
			`get_words_lines_from_file(text_file),`
			`max_tokens=vocabulary_size,`
			`specials=['<unk>'])`
			`self.vocab.set_default_index(self.vocab['<unk>'])`
			`self.vocabulary_size = vocabulary_size`
			`self.text_file = text_file`

			`def __iter__(self):`
			`return look_ahead_iterator(`
			`(self.vocab[t] for t in itertools.chain.from_iterable(get_words_lines_from_file(self.text_file))))`


			`def train():`
			`batch_size = 22000`

			`train_dataset = Bigrams('train/in.tsv.xz', vocab_size)`

			`device = 'cuda'`
			`model = SimpleBigramNeuralLanguageModel(vocab_size, embed_size).to(device)`
			`train_data_loader = DataLoader(train_dataset, batch_size=batch_size)`
			`optimizer = torch.optim.Adam(model.parameters())`
			`criterion = torch.nn.NLLLoss()`

			`model.train()`
			`step = 0`
			`for x, y in train_data_loader:`
			`# Transfer Data to GPU`
			`x = x.to(device)`
			`y = y.to(device)`
			`# Clear the gradients`
			`optimizer.zero_grad()`
			`# Forward Pass`
			`ypredicted = model(x)`
			`# Find the Loss`
			`loss = criterion(torch.log(ypredicted), y)`
			`if step % 100 == 0:`
			`print(step, loss)`
			`step += 1`
			`# Calculate gradients`
			`loss.backward()`
			`# Update Weights`
			`optimizer.step()`
			`torch.save(model.state_dict(), 'model1.bin')`


			`def predict():`
			`device = 'cuda'`
			`model = SimpleBigramNeuralLanguageModel(vocab_size, embed_size).to(device)`
			`model.load_state_dict(torch.load('model1.bin'))`
			`model.eval()`

			`ixs = torch.tensor(vocab.forward(['for'])).to(device)`

			`out = model(ixs)`
			`top = torch.topk(out[0], 10)`
			`top_indices = top.indices.tolist()`
			`top_probs = top.values.tolist()`
			`top_words = vocab.lookup_tokens(top_indices)`
			`print(list(zip(top_words, top_indices, top_probs)))`


			`def similar():`
			`device = 'cuda'`
			`model = SimpleBigramNeuralLanguageModel(vocab_size, embed_size).to(device)`
			`model.load_state_dict(torch.load('model1.bin'))`
			`model.eval()`

			`cos = nn.CosineSimilarity(dim=1, eps=1e-6)`

			`embeddings = model.model[0].weight`

			`vec = embeddings[vocab['went']]`

			`similarities = cos(vec, embeddings)`

			`top = torch.topk(similarities, 10)`

			`top_indices = top.indices.tolist()`
			`top_probs = top.values.tolist()`
			`top_words = vocab.lookup_tokens(top_indices)`
			`print(list(zip(top_words, top_indices, top_probs)))`

Zrobione szukanie leftcontext dla dokalnie wystepujacych dwoch slow. 2022-04-03 17:43:04 +02:00
kenlm 2022-04-25 16:58:55 +02:00			`if __name__ == "__main__":`
Neural bigram with/out validation. 2022-05-07 00:08:59 +02:00			`# train()`
			`predict()`