## Modelowanie języka – laboratoria
### 15 maja 2024
# 9. Model neuronowy rekurencyjny

In [21]:
import torch
from torch import nn, optim
from torch.utils.data import DataLoader
import numpy as np
from collections import Counter
import re

In [22]:
device = 'cpu'

In [23]:
! wget https://wolnelektury.pl/media/book/txt/potop-tom-pierwszy.txt
! wget https://wolnelektury.pl/media/book/txt/potop-tom-drugi.txt
! wget https://wolnelektury.pl/media/book/txt/potop-tom-trzeci.txt

--2024-05-15 11:45:39--  https://wolnelektury.pl/media/book/txt/potop-tom-pierwszy.txt
Resolving wolnelektury.pl (wolnelektury.pl)... 51.83.143.148, 2001:41d0:602:3294::
Connecting to wolnelektury.pl (wolnelektury.pl)|51.83.143.148|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 877885 (857K) [text/plain]
Saving to: ‘potop-tom-pierwszy.txt.1’


2024-05-15 11:45:41 (436 KB/s) - ‘potop-tom-pierwszy.txt.1’ saved [877885/877885]

--2024-05-15 11:45:41--  https://wolnelektury.pl/media/book/txt/potop-tom-drugi.txt
Resolving wolnelektury.pl (wolnelektury.pl)... 51.83.143.148, 2001:41d0:602:3294::
Connecting to wolnelektury.pl (wolnelektury.pl)|51.83.143.148|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1087789 (1.0M) [text/plain]
Saving to: ‘potop-tom-drugi.txt.1’


2024-05-15 11:45:45 (296 KB/s) - ‘potop-tom-drugi.txt.1’ saved [1087789/1087789]

--2024-05-15 11:45:45--  https://wolnelektury.pl/media/book/txt/potop-tom-trzeci.txt
Resolving

In [24]:
!cat potop-* > potop.txt

In [25]:
class Dataset(torch.utils.data.Dataset):
    def __init__(
            self,
            sequence_length,
    ):
        self.sequence_length = sequence_length
        self.words = self.load()
        self.uniq_words = self.get_uniq_words()

        self.index_to_word = {index: word for index, word in enumerate(self.uniq_words)}
        self.word_to_index = {word: index for index, word in enumerate(self.uniq_words)}

        self.words_indexes = [self.word_to_index[w] for w in self.words]

    def load(self):
        with open('potop.txt', 'r') as f_in:
            text = [x.rstrip() for x in f_in.readlines() if x.strip()]
            text = ' '.join(text).lower()
            text = re.sub('[^a-ząćęłńóśźż ]', '', text) 
            text = text.split(' ')
        return text
    
    
    def get_uniq_words(self):
        word_counts = Counter(self.words)
        return sorted(word_counts, key=word_counts.get, reverse=True)

    def __len__(self):
        return len(self.words_indexes) - self.sequence_length

    def __getitem__(self, index):
        return (
            torch.tensor(self.words_indexes[index:index+self.sequence_length]),
            torch.tensor(self.words_indexes[index+1:index+self.sequence_length+1]),
        )

In [26]:
dataset = Dataset(5)

In [27]:
dataset[200]

(tensor([  551,    18,    17,   255, 10747]),
 tensor([   18,    17,   255, 10747,    34]))

In [28]:
[dataset.index_to_word[x] for x in [   551,    18,    17,   255, 10748]]

['patrzył', 'tak', 'jak', 'człowiek', 'opatrywać']

In [29]:
[dataset.index_to_word[x] for x in [   18,    17,   255, 10748,    34]]

['tak', 'jak', 'człowiek', 'opatrywać', 'ze']

In [30]:
input_tensor = torch.tensor([[ 551,    18,    17,   255, 10748]], dtype=torch.int32).to(device)

In [31]:
#input_tensor = torch.tensor([[ 551,    18]], dtype=torch.int32).to(device)

In [32]:
class Model(nn.Module):
    def __init__(self, vocab_size):
        super(Model, self).__init__()
        self.lstm_size = 128
        self.embedding_dim = 128
        self.num_layers = 3

        self.embedding = nn.Embedding(
            num_embeddings=vocab_size,
            embedding_dim=self.embedding_dim,
        )
        self.lstm = nn.LSTM(
            input_size=self.lstm_size,
            hidden_size=self.lstm_size,
            num_layers=self.num_layers,
            dropout=0.2,
        )
        self.fc = nn.Linear(self.lstm_size, vocab_size)

    def forward(self, x, prev_state = None):
        embed = self.embedding(x)
        output, state = self.lstm(embed, prev_state)
        logits = self.fc(output)
        return logits, state

    def init_state(self, sequence_length):
        return (torch.zeros(self.num_layers, sequence_length, self.lstm_size).to(device),
                torch.zeros(self.num_layers, sequence_length, self.lstm_size).to(device))

In [33]:
model = Model(len(dataset)).to(device)

In [34]:
y_pred, (state_h, state_c) = model(input_tensor)

In [35]:
y_pred

tensor([[[ 0.0406, -0.0111, -0.0615,  ...,  0.0028, -0.0004, -0.0235],
         [ 0.0365, -0.0116, -0.0621,  ...,  0.0004, -0.0014, -0.0253],
         [ 0.0421, -0.0122, -0.0622,  ...,  0.0014, -0.0018, -0.0220],
         [ 0.0377, -0.0089, -0.0609,  ...,  0.0024, -0.0007, -0.0245],
         [ 0.0392, -0.0122, -0.0557,  ...,  0.0008,  0.0007, -0.0215]]],
       grad_fn=<ViewBackward0>)

In [36]:
y_pred.shape

torch.Size([1, 5, 791991])

In [37]:
def train(dataset, model, max_epochs, batch_size):
    model.train()

    dataloader = DataLoader(dataset, batch_size=batch_size)
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=0.001)

    for epoch in range(max_epochs):
        for batch, (x, y) in enumerate(dataloader):
            optimizer.zero_grad()
            x = x.to(device)
            y = y.to(device)

            y_pred, (state_h, state_c) = model(x)
            loss = criterion(y_pred.transpose(1, 2), y)

            loss.backward()
            optimizer.step()

            print({ 'epoch': epoch, 'update in batch': batch, '/' : len(dataloader), 'loss': loss.item() })


In [38]:
model = Model(vocab_size = len(dataset.uniq_words)).to(device)
train(dataset, model, 1, 64)

{'epoch': 0, 'update in batch': 0, '/': 12375, 'loss': 10.710318565368652}
{'epoch': 0, 'update in batch': 1, '/': 12375, 'loss': 10.693680763244629}
{'epoch': 0, 'update in batch': 2, '/': 12375, 'loss': 10.70435619354248}
{'epoch': 0, 'update in batch': 3, '/': 12375, 'loss': 10.694655418395996}
{'epoch': 0, 'update in batch': 4, '/': 12375, 'loss': 10.684280395507812}
{'epoch': 0, 'update in batch': 5, '/': 12375, 'loss': 10.6857271194458}
{'epoch': 0, 'update in batch': 6, '/': 12375, 'loss': 10.668371200561523}
{'epoch': 0, 'update in batch': 7, '/': 12375, 'loss': 10.683968544006348}
{'epoch': 0, 'update in batch': 8, '/': 12375, 'loss': 10.646549224853516}
{'epoch': 0, 'update in batch': 9, '/': 12375, 'loss': 10.62016487121582}
{'epoch': 0, 'update in batch': 10, '/': 12375, 'loss': 10.522863388061523}
{'epoch': 0, 'update in batch': 11, '/': 12375, 'loss': 10.469602584838867}
{'epoch': 0, 'update in batch': 12, '/': 12375, 'loss': 10.407214164733887}
{'epoch': 0, 'update in ba

In [19]:
def predict(dataset, model, text, next_words=5):
    model.eval()
    words = text.split(' ')
    state_h, state_c = model.init_state(len(words))

    for i in range(0, next_words):
        x = torch.tensor([[dataset.word_to_index[w] for w in words[i:]]]).to(device)
        y_pred, (state_h, state_c) = model(x, (state_h, state_c))

        last_word_logits = y_pred[0][-1]
        p = torch.nn.functional.softmax(last_word_logits, dim=0).detach().cpu().numpy()
        word_index = np.random.choice(len(last_word_logits), p=p)
        words.append(dataset.index_to_word[word_index])

    return words

In [22]:
predict(dataset, model, 'kmicic szedł')

['kmicic', 'szedł', 'zwycięzco', 'po', 'do', 'zlituj', 'i']

### ZADANIE

Stworzyć sieć rekurencyjną GRU dla Challenging America word-gap prediction. Wymogi takie jak zawsze, zadanie widoczne na Gonito:

https://gonito.csi.wmi.amu.edu.pl/challenge/challenging-america-word-gap-prediction

Punktacja: **100 punktów**

Deadline: **29 maja 2024** przed zajęciami