Paweł Skórzewski b568122036 Lab. 8 i 9
2024-05-15 12:20:09 +02:00

60 KiB
Modelowanie języka laboratoria

15 maja 2024

9. Model neuronowy rekurencyjny

import re
from collections import Counter

import numpy as np
import torch
from torch import nn, optim
# The module path was lost in the notebook export ("from import DataLoader");
# DataLoader lives in torch.utils.data.
from torch.utils.data import DataLoader

# Device on which the model and all tensors in this notebook are placed.
device = 'cpu'
# IPython/Jupyter shell escape (not valid in a plain Python script):
# concatenates every file matching potop-* into potop.txt, the file that
# Dataset.load() opens.
!cat potop-* > potop.txt
class Dataset(torch.utils.data.Dataset):
    """Sliding-window next-word dataset built from a plain-text corpus.

    The corpus is lower-cased, stripped of every character except Polish
    lowercase letters and spaces, and split into words. Each item is a pair
    of index tensors of length ``sequence_length``: the input window and the
    same window shifted one word to the right (the prediction targets).

    NOTE(review): the class header, the ``__init__`` signature and the body
    of ``__getitem__`` were truncated in the exported notebook. They are
    reconstructed here from the attribute assignments, the ``Dataset(5)``
    call and the sample item printed in the notebook -- confirm against the
    original notebook.

    Attributes:
        sequence_length: number of words in every input/target window.
        path: location of the corpus file.
        words: the corpus as a list of word strings.
        uniq_words: vocabulary sorted by descending frequency.
        index_to_word: mapping from vocabulary index to word.
        word_to_index: mapping from word to vocabulary index.
        words_indexes: the corpus encoded as vocabulary indices.
    """

    def __init__(self, sequence_length, path='potop.txt'):
        """Load the corpus and build the vocabulary.

        Args:
            sequence_length: length of the windows returned by indexing.
            path: corpus file to read; defaults to the file produced by
                the notebook's ``cat`` cell.
        """
        self.sequence_length = sequence_length
        self.path = path
        self.words = self.load()
        self.uniq_words = self.get_uniq_words()

        self.index_to_word = dict(enumerate(self.uniq_words))
        self.word_to_index = {word: index for index, word in enumerate(self.uniq_words)}

        self.words_indexes = [self.word_to_index[w] for w in self.words]

    def load(self):
        """Read the corpus file and return it as a list of normalised words."""
        # Explicit encoding: the text contains Polish diacritics, so relying
        # on the platform default encoding could corrupt or fail the read.
        with open(self.path, 'r', encoding='utf-8') as f_in:
            lines = [line.rstrip() for line in f_in if line.strip()]
        text = ' '.join(lines).lower()
        # Keep only Polish lowercase letters and spaces.
        text = re.sub('[^a-ząćęłńóśźż ]', '', text)
        # NOTE(review): splitting on a single space keeps empty-string tokens
        # wherever removed punctuation leaves two adjacent spaces. Left as-is
        # so vocabulary indices match the notebook's recorded outputs.
        return text.split(' ')

    def get_uniq_words(self):
        """Return the distinct words, most frequent first."""
        word_counts = Counter(self.words)
        return sorted(word_counts, key=word_counts.get, reverse=True)

    def __len__(self):
        """Return the number of full windows that have a following target word."""
        return len(self.words_indexes) - self.sequence_length

    def __getitem__(self, index):
        """Return ``(inputs, targets)`` for the window starting at ``index``.

        ``targets`` is ``inputs`` shifted by one word, so ``targets[k]`` is
        the word that follows ``inputs[k]`` in the corpus.
        """
        end = index + self.sequence_length
        return (
            torch.tensor(self.words_indexes[index:end]),
            torch.tensor(self.words_indexes[index + 1:end + 1]),
        )
# Build the dataset with windows of 5 words.
dataset = Dataset(5)
# Notebook output. The cell that produced it is missing from this export
# (presumably a single dataset item -- TODO confirm). It is a pair of index
# tensors in which the second is the first shifted by one position.
(tensor([  551,    18,    17,   255, 10748]),
 tensor([   18,    17,   255, 10748,    34]))
# Decode the first tensor's indices back to words (output on the next line).
[dataset.index_to_word[x] for x in [   551,    18,    17,   255, 10748]]
['patrzył', 'tak', 'jak', 'człowiek', 'zbudzony']
# Decode the second tensor's indices back to words (output on the next line).
[dataset.index_to_word[x] for x in [   18,    17,   255, 10748,    34]]
['tak', 'jak', 'człowiek', 'zbudzony', 'ze']
# A 1 x 5 tensor holding the same indices, used below as sample model input.
input_tensor = torch.tensor([[ 551,    18,    17,   255, 10748]], dtype=torch.int32).to(device)
#input_tensor = torch.tensor([[ 551,    18]], dtype=torch.int32).to(device)
class Model(nn.Module):
    """Word-level language model: embedding -> stacked LSTM -> linear layer.

    NOTE(review): the argument lists of ``nn.Embedding(...)`` and
    ``nn.LSTM(...)`` were truncated in the exported notebook. They are
    reconstructed from the attributes defined in ``__init__``; any further
    options the original passed (e.g. dropout) are unknown and not set here.

    The LSTM keeps PyTorch's default ``batch_first=False``, so for a 2-D
    index tensor of shape ``(A, B)`` the LSTM treats ``A`` as the time axis
    and ``B`` as the batch axis. The recurrent state therefore has shape
    ``(num_layers, B, lstm_size)``.
    """

    def __init__(self, vocab_size):
        """Create the layers.

        Args:
            vocab_size: number of distinct words; sets both the number of
                embedding rows and the size of the output logits.
        """
        super().__init__()
        self.lstm_size = 128
        self.embedding_dim = 128
        self.num_layers = 3

        self.embedding = nn.Embedding(
            num_embeddings=vocab_size,
            embedding_dim=self.embedding_dim,
        )
        self.lstm = nn.LSTM(
            input_size=self.embedding_dim,
            hidden_size=self.lstm_size,
            num_layers=self.num_layers,
        )
        self.fc = nn.Linear(self.lstm_size, vocab_size)

    def forward(self, x, prev_state=None):
        """Compute next-word logits for every position of ``x``.

        Args:
            x: integer tensor of word indices.
            prev_state: optional ``(h, c)`` tuple to start the LSTM from;
                ``None`` lets the LSTM start from zeros.

        Returns:
            ``(logits, state)`` where ``logits`` has the shape of ``x`` plus
            a trailing ``vocab_size`` axis and ``state`` is the LSTM's final
            ``(h, c)`` tuple.
        """
        embed = self.embedding(x)
        output, state = self.lstm(embed, prev_state)
        logits = self.fc(output)
        return logits, state

    def init_state(self, sequence_length):
        """Return a zero ``(h, c)`` state for the LSTM.

        Args:
            sequence_length: size of the state's middle axis. Because the
                LSTM is not ``batch_first``, this must equal the second
                dimension of the index tensor later passed to ``forward``.

        Returns:
            Two zero tensors of shape
            ``(num_layers, sequence_length, lstm_size)``.
        """
        shape = (self.num_layers, sequence_length, self.lstm_size)
        # Allocate next to the model's own weights (same device and dtype)
        # instead of relying on a module-level ``device`` global.
        reference = self.fc.weight
        return reference.new_zeros(shape), reference.new_zeros(shape)
# NOTE(review): len(dataset) is the number of training windows, not the
# vocabulary size. That is why the last dimension of the shape printed
# below is 1187998. The later cell uses len(dataset.uniq_words) instead.
model = Model(len(dataset)).to(device)
# Forward pass on the sample input without an explicit initial state.
y_pred, (state_h, state_c) = model(input_tensor)
# Notebook output (truncated in the export): the logits tensor, followed by
# its shape.
tensor([[[ 0.0046, -0.0113,  0.0313,  ...,  0.0198, -0.0312,  0.0223],
         [ 0.0039, -0.0110,  0.0303,  ...,  0.0213, -0.0302,  0.0230],
         [ 0.0029, -0.0133,  0.0265,  ...,  0.0204, -0.0297,  0.0219],
         [ 0.0010, -0.0120,  0.0282,  ...,  0.0241, -0.0314,  0.0241],
         [ 0.0038, -0.0106,  0.0346,  ...,  0.0230, -0.0333,  0.0232]]],
torch.Size([1, 5, 1187998])
def train(dataset, model, max_epochs, batch_size):
    """Train ``model`` on ``dataset`` with Adam and cross-entropy loss.

    Progress is printed after every batch as a dict with the epoch, the
    batch number, the number of batches and the current loss.

    NOTE(review): in the exported notebook the ``x =`` / ``y =`` assignments
    were truncated and the gradient/optimizer calls were missing, so the
    loop computed a loss but never updated the model. They are restored
    here as the usual zero_grad / backward / step sequence.

    Args:
        dataset: map-style dataset yielding ``(inputs, targets)`` index
            tensors.
        model: module whose call returns ``(logits, state)``, with logits
            shaped like the inputs plus a trailing vocabulary axis.
        max_epochs: number of passes over the dataset.
        batch_size: number of items per batch.
    """
    # Move batches to wherever the model's weights live.
    model_device = next(model.parameters()).device

    dataloader = DataLoader(dataset, batch_size=batch_size)
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=0.001)
    num_batches = len(dataloader)

    model.train()
    for epoch in range(max_epochs):
        for batch, (x, y) in enumerate(dataloader):
            x = x.to(model_device)
            y = y.to(model_device)

            optimizer.zero_grad()
            y_pred, _ = model(x)
            # CrossEntropyLoss expects the class axis second, so move the
            # vocabulary axis from last to second place.
            loss = criterion(y_pred.transpose(1, 2), y)
            loss.backward()
            optimizer.step()

            print({ 'epoch': epoch, 'update in batch': batch, '/' : num_batches, 'loss': loss.item() })
# Re-create the model, this time sized by the number of distinct words.
model = Model(vocab_size = len(dataset.uniq_words)).to(device)
# Train for 1 epoch with batches of 64 items.
train(dataset, model, 1, 64)
def predict(dataset, model, text, next_words=5):
    """Extend ``text`` by sampling ``next_words`` words from the model.

    NOTE(review): in the exported notebook the sampled ``word_index`` was
    never used, so the function returned the prompt unchanged. The notebook's
    recorded output shows the prompt followed by generated words, so the
    missing append is restored.

    Args:
        dataset: object exposing ``word_to_index`` and ``index_to_word``.
        model: module with ``init_state(n)`` whose call
            ``model(x, (h, c))`` returns ``(logits, (h, c))``.
        text: space-separated prompt; every word must be in the vocabulary.
        next_words: how many words to generate.

    Returns:
        The prompt words followed by the generated words, as a list.

    Raises:
        KeyError: if a prompt word is missing from ``dataset.word_to_index``.
    """
    words = text.split(' ')
    state_h, state_c = model.init_state(len(words))

    # Sample in eval mode and without gradient tracking; the previous mode
    # is restored afterwards so calling this between training runs is safe.
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            for i in range(next_words):
                # The window words[i:] keeps a constant length (one word is
                # appended per step), which keeps it consistent with the
                # state size chosen by init_state(len(words)) above.
                x = torch.tensor(
                    [[dataset.word_to_index[w] for w in words[i:]]],
                    device=state_h.device,
                )
                y_pred, (state_h, state_c) = model(x, (state_h, state_c))

                last_word_logits = y_pred[0][-1]
                p = torch.nn.functional.softmax(last_word_logits, dim=0).detach().cpu().numpy()
                word_index = np.random.choice(len(last_word_logits), p=p)
                words.append(dataset.index_to_word[int(word_index)])
    finally:
        model.train(was_training)

    return words
# Generate a continuation of a two-word prompt; the next line is the
# notebook's recorded output (prompt words followed by 5 sampled words).
predict(dataset, model, 'kmicic szedł')
['kmicic', 'szedł', 'zwycięzco', 'po', 'do', 'zlituj', 'i']


Stworzyć rekurencyjną sieć neuronową GRU dla zadania Challenging America word-gap prediction. Wymogi takie jak zawsze; zadanie jest widoczne na Gonito.

Punktacja: 100 punktów

Deadline: 29 maja 2024 przed zajęciami