challenging-america-word-ga.../run.py

from itertools import islice
import regex as re
import sys
from torchtext.vocab import build_vocab_from_iterator
import lzma
from torch import nn
import torch
from torch.utils.data import IterableDataset
import itertools
from torch.utils.data import DataLoader
import numpy as np
from nltk.tokenize import RegexpTokenizer
from nltk import trigrams


# def get_words_from_line(file_path):
#     for index, line in enumerate(get_lines_from_file(file)):
#         yield '<s>'
#         for m in re.finditer(r'[\p{L}0-9\*]+|\p{P}+', line):
#             yield m.group(0).lower()
#         yield '</s>'
#         if index == 10000:
#             break

# Shared tokenizer built from the pattern r"\w+"; generate_outputs uses it to
# split the cleaned left-context text into tokens.
tokenizer = RegexpTokenizer(r"\w+")

def read_file_6(file):
    """Yield the cleaned text of column 6 for each line of a TSV stream.

    Args:
        file: Iterable of tab-separated text lines.

    Yields:
        The seventh column (index 6) of each line, cleaned in this order:
        literal ``\\n`` sequences become spaces, real newlines are dropped,
        the text is lower-cased, runs of spaces are collapsed, and finally
        every character that is not a word character, apostrophe or
        whitespace is removed.

    Raises:
        IndexError: If a line has fewer than seven columns.
    """
    for raw_line in file:
        columns = raw_line.split("\t")
        cleaned = columns[6].replace("\\n", " ")
        cleaned = cleaned.replace("\n", "")
        cleaned = cleaned.lower()
        # Spaces are collapsed *before* punctuation is stripped, so removing
        # a free-standing punctuation mark can leave a double space behind.
        cleaned = re.sub(' +', ' ', cleaned)
        cleaned = re.sub(r"[^\w\d'\s]+", '', cleaned)
        yield cleaned


def get_words_from_line(line):
    """Yield the tokens of one line wrapped in sentence markers.

    Args:
        line: Text of a single line; trailing whitespace is ignored.

    Yields:
        ``'<s>'``, then every lower-cased match of the token pattern (a run
        of letters, digits or ``*``, or a run of punctuation), then
        ``'</s>'``.
    """
    stripped = line.rstrip()
    yield '<s>'
    yield from (
        found.group(0).lower()
        for found in re.finditer(r'[\p{L}0-9\*]+|\p{P}+', stripped)
    )
    yield '</s>'


def get_words_lines_from_file(file_path):
    """Yield one token generator per line of an xz-compressed TSV file.

    Columns 6 and 7 of every line are joined with a space and cleaned
    (literal ``\\n`` sequences become spaces, real newlines are dropped, the
    text is lower-cased, runs of spaces are collapsed, then everything that
    is not a word character, apostrophe or whitespace is removed). The
    cleaned text is handed to ``get_words_from_line``.

    Args:
        file_path: Path to the ``.xz`` compressed, tab-separated input file.

    Yields:
        For each line, the generator returned by ``get_words_from_line``.

    Raises:
        IndexError: If a line has fewer than eight columns.
    """
    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's default locale encoding.
    with lzma.open(file_path, mode='rt', encoding='utf-8') as file:
        for line in file:
            columns = line.split("\t")
            text = ' '.join([columns[6], columns[7]])
            text = text.replace("\\n", " ").replace("\n", "").lower()
            # Keep the original order: collapse spaces first, then strip
            # punctuation.
            text = re.sub(' +', ' ', text)
            text = re.sub(r"[^\w\d'\s]+", '', text)
            yield get_words_from_line(text)


# Value passed as max_tokens when building the vocabulary and as the
# vocabulary_size of the model.
vocab_size = 30000

# Module-level vocabulary, built when this module is executed by streaming
# the tokens of 'train/in.tsv.xz'. '<unk>' is registered as a special token.
vocab = build_vocab_from_iterator(
    get_words_lines_from_file('train/in.tsv.xz'),
    max_tokens=vocab_size,
    specials=['<unk>'])

# Use the index of '<unk>' as the vocabulary's default index.
vocab.set_default_index(vocab['<unk>'])
# vocab=None

# Embedding dimension passed to SimpleBigramNeuralLanguageModel.
embed_size = 100


class SimpleBigramNeuralLanguageModel(nn.Module):
    """Bigram language model: embedding -> linear -> softmax.

    Given indices of previous tokens, the model returns a probability
    distribution over the vocabulary for the next token.

    Args:
        vocabulary_size: Number of entries in the vocabulary; used as both
            the embedding table size and the output dimension.
        embedding_size: Dimension of the token embeddings.
    """

    def __init__(self, vocabulary_size, embedding_size):
        super().__init__()
        self.model = nn.Sequential(
            nn.Embedding(vocabulary_size, embedding_size),
            nn.Linear(embedding_size, vocabulary_size),
            # Normalise over the vocabulary axis explicitly. Without ``dim``
            # PyTorch warns and picks the axis from the input rank, which is
            # not the vocabulary axis for every input shape.
            nn.Softmax(dim=-1),
        )

    def forward(self, x):
        """Return next-token probabilities for the token indices in ``x``.

        The output has the shape of ``x`` with one extra trailing axis of
        size ``vocabulary_size`` that sums to 1.
        """
        return self.model(x)


def look_ahead_iterator(gen):
    """Yield ``(previous, current)`` pairs of consecutive items from ``gen``.

    ``None`` is used internally to mean "no previous item yet", so an item
    that is ``None`` never appears as the first element of a pair.

    Args:
        gen: Any iterable.

    Yields:
        Tuples of two neighbouring items, in input order.
    """
    previous = None
    for current in gen:
        if previous is None:
            # Nothing to pair with yet; remember this item and move on.
            previous = current
            continue
        yield previous, current
        previous = current


class Bigrams(IterableDataset):
    """Iterable dataset of consecutive token-index pairs from a text file.

    The constructor builds its own vocabulary from ``text_file``. Iterating
    re-reads the file, maps every token to its vocabulary index and yields
    neighbouring index pairs over the flattened token stream of all lines.

    Args:
        text_file: Path passed to ``get_words_lines_from_file``.
        vocabulary_size: Passed as ``max_tokens`` when building the
            vocabulary.
    """

    def __init__(self, text_file, vocabulary_size):
        self.text_file = text_file
        self.vocabulary_size = vocabulary_size

        built_vocab = build_vocab_from_iterator(
            get_words_lines_from_file(text_file),
            max_tokens=vocabulary_size,
            specials=['<unk>'])
        # Use the index of '<unk>' as the default index.
        built_vocab.set_default_index(built_vocab['<unk>'])
        self.vocab = built_vocab

    def __iter__(self):
        """Return an iterator of ``(previous_index, next_index)`` pairs."""
        token_stream = itertools.chain.from_iterable(
            get_words_lines_from_file(self.text_file))
        index_stream = (self.vocab[token] for token in token_stream)
        return look_ahead_iterator(index_stream)


def train():
    """Train the bigram model for one pass and save it to 'model1.bin'.

    Builds a ``Bigrams`` dataset from 'train/in.tsv.xz', trains a
    ``SimpleBigramNeuralLanguageModel`` with Adam and negative
    log-likelihood loss, prints the step number and loss every 100 steps,
    prints the total number of steps at the end, and writes the model's
    state dict to 'model1.bin'.
    """
    batch_size = 15000

    train_dataset = Bigrams('train/in.tsv.xz', vocab_size)

    # Prefer the GPU, but fall back to the CPU instead of crashing when CUDA
    # is not available.
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    model = SimpleBigramNeuralLanguageModel(vocab_size, embed_size).to(device)
    train_data_loader = DataLoader(train_dataset, batch_size=batch_size)
    optimizer = torch.optim.Adam(model.parameters())
    criterion = torch.nn.NLLLoss()

    model.train()
    step = 0
    for x, y in train_data_loader:
        # x holds previous-token indices, y the indices of the next tokens.
        x = x.to(device)
        y = y.to(device)
        # Clear gradients left over from the previous step.
        optimizer.zero_grad()
        # The model outputs probabilities; NLLLoss expects log-probabilities.
        ypredicted = model(x)
        loss = criterion(torch.log(ypredicted), y)
        if step % 100 == 0:
            # Print the scalar value rather than the tensor repr.
            print(step, loss.item())
        step += 1
        loss.backward()
        optimizer.step()
    print(step)
    torch.save(model.state_dict(), 'model1.bin')


# Holds the model and device used by predict() so the weights are read from
# disk only once per process.
_PREDICT_MODEL_CACHE = {}


def _load_predict_model():
    """Return the ``(model, device)`` pair for ``predict``, loading it once."""
    if 'model' not in _PREDICT_MODEL_CACHE:
        # Prefer the GPU, but fall back to the CPU when CUDA is unavailable.
        device = 'cuda' if torch.cuda.is_available() else 'cpu'
        model = SimpleBigramNeuralLanguageModel(vocab_size, embed_size).to(device)
        # map_location lets weights saved on one device load on another.
        model.load_state_dict(torch.load('model1.bin', map_location=device))
        model.eval()
        _PREDICT_MODEL_CACHE['model'] = model
        _PREDICT_MODEL_CACHE['device'] = device
    return _PREDICT_MODEL_CACHE['model'], _PREDICT_MODEL_CACHE['device']


def predict(word):
    """Return the formatted top-8 next-word predictions for ``word``.

    The model stored in 'model1.bin' is loaded on the first call and reused
    afterwards, so changes to that file are not picked up within the same
    process.

    Args:
        word: The previous token; it is mapped to an index via ``vocab``.

    Returns:
        A string of ``"token:probability "`` entries for the eight most
        probable next tokens, followed by ``":remaining_mass"`` where the
        remaining mass is 1 minus the sum of the listed probabilities. An
        entry is skipped if adding it would push the listed total above 1.
    """
    model, device = _load_predict_model()

    ixs = torch.tensor(vocab.forward([word])).to(device)

    # Inference only: no gradients are needed.
    with torch.no_grad():
        out = model(ixs)
    top = torch.topk(out[0], 8)
    top_probs = top.values.tolist()
    top_words = vocab.lookup_tokens(top.indices.tolist())

    parts = []
    lht = 1.0  # probability mass not yet assigned to a listed token
    for pred_word, prob in zip(top_words, top_probs):
        if lht - prob >= 0:
            parts.append(f"{pred_word}:{prob} ")
            lht -= prob
    if lht != 1.0:
        parts.append(f":{lht}")
    return "".join(parts)


def similar():
    """Print the 10 vocabulary entries whose embeddings are closest to 'went'.

    Loads the model from 'model1.bin', computes the cosine similarity
    between the embedding of 'went' and every row of the embedding matrix,
    and prints a list of ``(token, index, similarity)`` tuples for the ten
    highest similarities.
    """
    # Prefer the GPU, but fall back to the CPU when CUDA is unavailable.
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    model = SimpleBigramNeuralLanguageModel(vocab_size, embed_size).to(device)
    # map_location lets weights saved on one device load on another.
    model.load_state_dict(torch.load('model1.bin', map_location=device))
    model.eval()

    cos = nn.CosineSimilarity(dim=1, eps=1e-6)

    # Pure lookup: gradients are not needed.
    with torch.no_grad():
        embeddings = model.model[0].weight
        vec = embeddings[vocab['went']]
        similarities = cos(vec, embeddings)
        top = torch.topk(similarities, 10)

    top_indices = top.indices.tolist()
    top_probs = top.values.tolist()
    top_words = vocab.lookup_tokens(top_indices)
    print(list(zip(top_words, top_indices, top_probs)))


def generate_outputs(input_file, output_file):
    """Write one prediction line per input line.

    For every line of the xz-compressed TSV ``input_file``, the cleaned text
    of column 6 is tokenized. Lines with fewer than four tokens get a fixed
    fallback distribution; otherwise the last token is passed to
    ``predict``.

    Args:
        input_file: Path to the ``.xz`` compressed, tab-separated input.
        output_file: Path of the text file to write; it is overwritten.
    """
    # Both files are handled explicitly as UTF-8 so the result does not
    # depend on the platform's default locale encoding.
    with open(output_file, 'w', encoding='utf-8') as outputf, \
            lzma.open(input_file, mode='rt', encoding='utf-8') as file:
        for text in read_file_6(file):
            tokens = tokenizer.tokenize(text)
            if len(tokens) < 4:
                # Too little context: emit a fixed fallback distribution.
                prediction = 'the:0.2 be:0.2 to:0.2 of:0.1 and:0.1 a:0.1 :0.1'
            else:
                prediction = predict(tokens[-1])
            outputf.write(prediction + '\n')


# Script entry point. Only the test-A prediction run is active; the other
# steps are kept as comments and can be re-enabled by hand.
if __name__ == "__main__":
    # train()
    # predict()
    # generate_outputs("dev-0/in.tsv.xz", "dev-0/out.tsv")
    generate_outputs("test-A/in.tsv.xz", "test-A/out.tsv")
    # Disabled helper that counts all tokens in the training file:
    # count_words = 0
    # for i in get_words_lines_from_file('train/in.tsv.xz'):
    #     for j in i:
    #         count_words += 1
    # print(count_words)
Neural bigram with/out validation. 2022-05-07 00:08:59 +02:00			`from itertools import islice`
Add yeild file read. 2022-04-04 18:31:33 +02:00			`import regex as re`
Neural bigram with/out validation. 2022-05-07 00:08:59 +02:00			`import sys`
			`from torchtext.vocab import build_vocab_from_iterator`
Add yeild file read. 2022-04-04 18:31:33 +02:00			`import lzma`
Neural bigram with/out validation. 2022-05-07 00:08:59 +02:00			`from torch import nn`
			`import torch`
			`from torch.utils.data import IterableDataset`
			`import itertools`
			`from torch.utils.data import DataLoader`
			`import numpy as np`
Bigram neural finish. 2022-05-07 14:53:24 +02:00			`from nltk.tokenize import RegexpTokenizer`
			`from nltk import trigrams`
Tworzenie bigramow i trigramow metoda 1. Funkcje yield. 2022-03-26 00:16:16 +01:00

Neural bigram with/out validation. 2022-05-07 00:08:59 +02:00			`# def get_words_from_line(file_path):`
			`# for index, line in enumerate(get_lines_from_file(file)):`
			`# yield '<s>'`
			`# for m in re.finditer(r'[\p{L}0-9\*]+\|\p{P}+', line):`
			`# yield m.group(0).lower()`
			`# yield '</s>'`
			`# if index == 10000:`
			`# break`

Bigram neural finish. 2022-05-07 14:53:24 +02:00			`tokenizer = RegexpTokenizer(r"\w+")`

			`def read_file_6(file):`
			`for line in file:`
			`text = line.split("\t")`
			`yield re.sub(r"[^\w\d'\s]+", '', re.sub(' +', ' ', text[6].replace("\\n", " ").replace("\n", "").lower()))`

Neural bigram with/out validation. 2022-05-07 00:08:59 +02:00
			`def get_words_from_line(line):`
			`line = line.rstrip()`
			`yield '<s>'`
			`for m in re.finditer(r'[\p{L}0-9\*]+\|\p{P}+', line):`
			`yield m.group(0).lower()`
			`yield '</s>'`
kenlm 2022-04-25 16:58:55 +02:00

Neural bigram with/out validation. 2022-05-07 00:08:59 +02:00			`def get_words_lines_from_file(file_path):`
			`with lzma.open(file_path, mode='rt') as file:`
			`for index, line in enumerate(file):`
kenlm 2022-04-25 16:58:55 +02:00			`text = line.split("\t")`
Neural bigram with/out validation. 2022-05-07 00:08:59 +02:00			`yield get_words_from_line(re.sub(r"[^\w\d'\s]+", '', re.sub(' +', ' ', ' '.join([text[6], text[7]]).replace("\\n", " ").replace("\n", "").lower())))`
Bigram neural finish. 2022-05-07 14:53:24 +02:00			`# if index == 1000:`
			`# break`
Neural bigram with/out validation. 2022-05-07 00:08:59 +02:00

Bigram neural finish. 2022-05-07 14:53:24 +02:00			`vocab_size = 30000`
Neural bigram with/out validation. 2022-05-07 00:08:59 +02:00
			`vocab = build_vocab_from_iterator(`
			`get_words_lines_from_file('train/in.tsv.xz'),`
			`max_tokens=vocab_size,`
			`specials=['<unk>'])`

			`vocab.set_default_index(vocab['<unk>'])`
			`# vocab=None`

			`embed_size = 100`


			`class SimpleBigramNeuralLanguageModel(nn.Module):`
			`def __init__(self, vocabulary_size, embedding_size):`
			`super(SimpleBigramNeuralLanguageModel, self).__init__()`
			`self.model = nn.Sequential(`
			`nn.Embedding(vocabulary_size, embedding_size),`
			`nn.Linear(embedding_size, vocabulary_size),`
			`nn.Softmax()`
			`)`

			`def forward(self, x):`
			`return self.model(x)`


			`def look_ahead_iterator(gen):`
			`prev = None`
			`for item in gen:`
			`if prev is not None:`
			`yield (prev, item)`
			`prev = item`


			`class Bigrams(IterableDataset):`
			`def __init__(self, text_file, vocabulary_size):`
			`self.vocab = build_vocab_from_iterator(`
			`get_words_lines_from_file(text_file),`
			`max_tokens=vocabulary_size,`
			`specials=['<unk>'])`
			`self.vocab.set_default_index(self.vocab['<unk>'])`
			`self.vocabulary_size = vocabulary_size`
			`self.text_file = text_file`

			`def __iter__(self):`
			`return look_ahead_iterator(`
			`(self.vocab[t] for t in itertools.chain.from_iterable(get_words_lines_from_file(self.text_file))))`


			`def train():`
Bigram neural finish. 2022-05-07 14:53:24 +02:00			`batch_size = 15000`
Neural bigram with/out validation. 2022-05-07 00:08:59 +02:00
			`train_dataset = Bigrams('train/in.tsv.xz', vocab_size)`

			`device = 'cuda'`
			`model = SimpleBigramNeuralLanguageModel(vocab_size, embed_size).to(device)`
			`train_data_loader = DataLoader(train_dataset, batch_size=batch_size)`
			`optimizer = torch.optim.Adam(model.parameters())`
			`criterion = torch.nn.NLLLoss()`

			`model.train()`
			`step = 0`
			`for x, y in train_data_loader:`
			`# Transfer Data to GPU`
			`x = x.to(device)`
			`y = y.to(device)`
			`# Clear the gradients`
			`optimizer.zero_grad()`
			`# Forward Pass`
			`ypredicted = model(x)`
			`# Find the Loss`
			`loss = criterion(torch.log(ypredicted), y)`
			`if step % 100 == 0:`
			`print(step, loss)`
			`step += 1`
			`# Calculate gradients`
			`loss.backward()`
			`# Update Weights`
			`optimizer.step()`
Bigram neural finish. 2022-05-07 14:53:24 +02:00			`print(step)`
Neural bigram with/out validation. 2022-05-07 00:08:59 +02:00			`torch.save(model.state_dict(), 'model1.bin')`


Bigram neural finish. 2022-05-07 14:53:24 +02:00			`def predict(word):`
Neural bigram with/out validation. 2022-05-07 00:08:59 +02:00			`device = 'cuda'`
			`model = SimpleBigramNeuralLanguageModel(vocab_size, embed_size).to(device)`
			`model.load_state_dict(torch.load('model1.bin'))`
			`model.eval()`

Bigram neural finish. 2022-05-07 14:53:24 +02:00			`ixs = torch.tensor(vocab.forward([word])).to(device)`
Neural bigram with/out validation. 2022-05-07 00:08:59 +02:00
			`out = model(ixs)`
Bigram neural finish. 2022-05-07 14:53:24 +02:00			`top = torch.topk(out[0], 8)`
Neural bigram with/out validation. 2022-05-07 00:08:59 +02:00			`top_indices = top.indices.tolist()`
			`top_probs = top.values.tolist()`
			`top_words = vocab.lookup_tokens(top_indices)`
Bigram neural finish. 2022-05-07 14:53:24 +02:00			`str_predictions = ""`
			`lht = 1.0`
			`for pred_word in list(zip(top_words, top_indices, top_probs)):`
			`if lht - pred_word[2] >= 0:`
			`str_predictions += f"{pred_word[0]}:{pred_word[2]} "`
			`lht -= pred_word[2]`
			`if lht != 1.0:`
			`str_predictions += f":{lht}"`
			`return str_predictions`
Neural bigram with/out validation. 2022-05-07 00:08:59 +02:00

			`def similar():`
			`device = 'cuda'`
			`model = SimpleBigramNeuralLanguageModel(vocab_size, embed_size).to(device)`
			`model.load_state_dict(torch.load('model1.bin'))`
			`model.eval()`

			`cos = nn.CosineSimilarity(dim=1, eps=1e-6)`

			`embeddings = model.model[0].weight`

			`vec = embeddings[vocab['went']]`

			`similarities = cos(vec, embeddings)`

			`top = torch.topk(similarities, 10)`

			`top_indices = top.indices.tolist()`
			`top_probs = top.values.tolist()`
			`top_words = vocab.lookup_tokens(top_indices)`
			`print(list(zip(top_words, top_indices, top_probs)))`

Zrobione szukanie leftcontext dla dokalnie wystepujacych dwoch slow. 2022-04-03 17:43:04 +02:00
Bigram neural finish. 2022-05-07 14:53:24 +02:00			`def generate_outputs(input_file, output_file):`
			`with open(output_file, 'w') as outputf:`
			`with lzma.open(input_file, mode='rt') as file:`
			`for index, text in enumerate(read_file_6(file)):`
			`tokens = tokenizer.tokenize(text)`
			`if len(tokens) < 4:`
			`prediction = 'the:0.2 be:0.2 to:0.2 of:0.1 and:0.1 a:0.1 :0.1'`
			`else:`
			`prediction = predict(tokens[-1])`
			`outputf.write(prediction + '\n')`


kenlm 2022-04-25 16:58:55 +02:00			`if __name__ == "__main__":`
Neural bigram with/out validation. 2022-05-07 00:08:59 +02:00			`# train()`
Bigram neural finish. 2022-05-07 14:53:24 +02:00			`# predict()`
			`# generate_outputs("dev-0/in.tsv.xz", "dev-0/out.tsv")`
			`generate_outputs("test-A/in.tsv.xz", "test-A/out.tsv")`
			`# count_words = 0`
			`# for i in get_words_lines_from_file('train/in.tsv.xz'):`
			`# for j in i:`
			`# count_words += 1`
			`# print(count_words)`