ISI-lstm-lm/lstm - ODPOWIEDZI.ipynb

imports

from gensim.utils import tokenize
import numpy as np
import torch
from tqdm.notebook import tqdm
/media/kuba/ssdsam/anaconda3/lib/python3.8/site-packages/gensim/similarities/__init__.py:15: UserWarning: The gensim.similarities.levenshtein submodule is disabled, because the optional Levenshtein package <https://pypi.org/project/python-Levenshtein/> is unavailable. Install Levenhstein (e.g. `pip install python-Levenshtein`) to suppress this warning.
  warnings.warn(msg)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
#device = 'cpu'
print('Using {} device'.format(device))
Using cuda device
device
device(type='cuda')

dataset preparation

pan_tadeusz_path_train= '/home/kuba/Syncthing/przedmioty/2020-02/ISI/zajecia9_ngramowy_model_jDDezykowy/pan-tadeusz-train.txt'
pan_tadeusz_path_valid= '/home/kuba/Syncthing/przedmioty/2020-02/ISI/zajecia9_ngramowy_model_jDDezykowy/pan-tadeusz-test.txt'
corpora_train = open(pan_tadeusz_path_train).read()
corpora_train_tokenized = list(tokenize(corpora_train,lowercase = True))
vocab_itos = sorted(set(corpora_train_tokenized))
len(vocab_itos)
16598
vocab_itos = vocab_itos[:15005]  # truncate the sorted vocabulary to 15005 entries
# reserve the last four slots for the special tokens (the real tokens at these indices are overwritten)
vocab_itos[15001] = "<UNK>"
vocab_itos[15002] = "<BOS>"
vocab_itos[15003] = "<EOS>"
vocab_itos[15004] = "<PAD>"
len(vocab_itos)
15005
vocab_stoi = dict()
for i, token in enumerate(vocab_itos):
    vocab_stoi[token] = i
NGRAMS = 5
def get_token_id(dataset):
    # prefix with NGRAMS-1 <PAD> tokens and a <BOS> so the first real token has a full left context
    token_ids = [vocab_stoi['<PAD>']] * (NGRAMS-1) + [vocab_stoi['<BOS>']]
    for token in dataset:
        try:
            token_ids.append(vocab_stoi[token])
        except KeyError:
            # out-of-vocabulary tokens are mapped to <UNK>
            token_ids.append(vocab_stoi['<UNK>'])
    token_ids.append(vocab_stoi['<EOS>'])
    return token_ids
train_ids = get_token_id(corpora_train_tokenized)
train_ids[:30]
[15004,
 15004,
 15004,
 15004,
 15002,
 7,
 5002,
 7247,
 11955,
 1432,
 7018,
 14739,
 5506,
 4696,
 4276,
 7505,
 2642,
 8477,
 7259,
 10870,
 10530,
 7506,
 12968,
 7997,
 1911,
 12479,
 11129,
 13069,
 11797,
 5819]
def get_samples(dataset):
    # slide an NGRAMS-long window over the id sequence
    samples = []
    for i in range(len(dataset)-NGRAMS):
        samples.append(dataset[i:i+NGRAMS])
    return samples
train_ids = get_samples(train_ids)
train_ids = torch.tensor(train_ids, device = device)
train_ids[:30]
tensor([[15004, 15004, 15004, 15004, 15002],
        [15004, 15004, 15004, 15002,     7],
        [15004, 15004, 15002,     7,  5002],
        [15004, 15002,     7,  5002,  7247],
        [15002,     7,  5002,  7247, 11955],
        [    7,  5002,  7247, 11955,  1432],
        [ 5002,  7247, 11955,  1432,  7018],
        [ 7247, 11955,  1432,  7018, 14739],
        [11955,  1432,  7018, 14739,  5506],
        [ 1432,  7018, 14739,  5506,  4696],
        [ 7018, 14739,  5506,  4696,  4276],
        [14739,  5506,  4696,  4276,  7505],
        [ 5506,  4696,  4276,  7505,  2642],
        [ 4696,  4276,  7505,  2642,  8477],
        [ 4276,  7505,  2642,  8477,  7259],
        [ 7505,  2642,  8477,  7259, 10870],
        [ 2642,  8477,  7259, 10870, 10530],
        [ 8477,  7259, 10870, 10530,  7506],
        [ 7259, 10870, 10530,  7506, 12968],
        [10870, 10530,  7506, 12968,  7997],
        [10530,  7506, 12968,  7997,  1911],
        [ 7506, 12968,  7997,  1911, 12479],
        [12968,  7997,  1911, 12479, 11129],
        [ 7997,  1911, 12479, 11129, 13069],
        [ 1911, 12479, 11129, 13069, 11797],
        [12479, 11129, 13069, 11797,  5819],
        [11129, 13069, 11797,  5819,  6268],
        [13069, 11797,  5819,  6268,  2807],
        [11797,  5819,  6268,  2807,  7831],
        [ 5819,  6268,  2807,  7831, 12893]], device='cuda:0')
train_ids.shape
torch.Size([57022, 5])
corpora_valid = open(pan_tadeusz_path_valid).read()
corpora_valid_tokenized = list(tokenize(corpora_valid,lowercase = True))
valid_ids = get_token_id(corpora_valid_tokenized)
valid_ids = torch.tensor(get_samples(valid_ids), dtype = torch.long, device = device)

model

# https://pytorch.org/docs/stable/generated/torch.nn.LSTM.html
class LSTM(torch.nn.Module):

    def __init__(self):
        super(LSTM, self).__init__()
        self.emb = torch.nn.Embedding(len(vocab_itos), 100)         # token embeddings of size 100
        self.rec = torch.nn.LSTM(100, 256, 1, batch_first = True)   # single-layer LSTM with 256 hidden units
        self.fc1 = torch.nn.Linear(256, len(vocab_itos))            # projects the hidden state to vocabulary logits
        #self.dropout = torch.nn.Dropout(0.5)

    def forward(self, x):
        emb = self.emb(x)                    # (batch, seq_len, 100)
        #emb = self.dropout(emb)
        output, (h_n, c_n) = self.rec(emb)   # h_n: (1, batch, 256), the last hidden state
        hidden = h_n.squeeze(0)              # (batch, 256)
        out = self.fc1(hidden)               # (batch, vocab_size) logits
        #out = self.dropout(out)
        return out
lm = LSTM().to(device)
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(lm.parameters(),lr=0.0001)
BATCH_SIZE = 128
EPOCHS = 15
def get_ppl(dataset_ids):
    lm.eval()

    batches = 0
    loss_sum = 0
    acc_score = 0

    for i in range(0, len(dataset_ids)-BATCH_SIZE+1, BATCH_SIZE):
        X = dataset_ids[i:i+BATCH_SIZE,:NGRAMS-1]   # first NGRAMS-1 tokens form the context
        Y = dataset_ids[i:i+BATCH_SIZE,NGRAMS-1]    # last token is the prediction target
        predictions = lm(X)

        # baseline: equally distributed predictions
        # predictions = torch.zeros_like(predictions)

        loss = criterion(predictions,Y)

        loss_sum += loss.item()
        batches += 1

    # perplexity = exp of the mean cross-entropy over batches
    return np.exp(loss_sum / batches)
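For reference, get_ppl computes perplexity as the exponential of the mean cross-entropy over batches (CrossEntropyLoss already averages over the examples inside a batch); roughly, in LaTeX notation:

\mathrm{PPL} \approx \exp\!\Bigl(\frac{1}{B}\sum_{b=1}^{B}\mathcal{L}_b\Bigr), \qquad \mathcal{L}_b = -\frac{1}{|b|}\sum_{(x,\,y)\in b}\log p_\theta(y \mid x)

where B is the number of batches and p_\theta(y \mid x) is the model probability of the target token y given its NGRAMS-1 token context x.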
history_ppl_train  = []
history_ppl_valid  = []
for epoch in range(EPOCHS):

    batches = 0
    loss_sum = 0
    acc_score = 0
    lm.train()
    #for i in range(0, len(train_ids)-BATCH_SIZE+1, BATCH_SIZE):
    for i in tqdm(range(0, len(train_ids)-BATCH_SIZE+1, BATCH_SIZE)):
        X = train_ids[i:i+BATCH_SIZE,:NGRAMS-1]
        Y = train_ids[i:i+BATCH_SIZE,NGRAMS-1]
        predictions = lm(X)
        loss = criterion(predictions,Y)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        loss_sum += loss.item()
        batches += 1
        
    ppl_train = get_ppl(train_ids)
    ppl_valid = get_ppl(valid_ids)
    
    history_ppl_train.append(ppl_train)
    history_ppl_valid.append(ppl_valid)
    
    print('epoch: ', epoch)
    print('train ppl: ', ppl_train)
    print('valid ppl: ', ppl_valid)
    print()
epoch:  0
train ppl:  2296.6914856482526
valid ppl:  528.9542436139727

epoch:  1
train ppl:  2093.302103954666
valid ppl:  514.4726844027333

epoch:  2
train ppl:  2014.09679023559
valid ppl:  510.12146471773366

epoch:  3
train ppl:  1939.0594855086504
valid ppl:  509.1060151440451

epoch:  4
train ppl:  1854.4566511885196
valid ppl:  510.02244291272973

epoch:  5
train ppl:  1755.030202547313
valid ppl:  508.494174178397

epoch:  6
train ppl:  1646.180912657662
valid ppl:  506.06383737670035

epoch:  7
train ppl:  1533.0501876139222
valid ppl:  504.08067276707567

epoch:  8
train ppl:  1420.680717507558
valid ppl:  502.6906095632547

epoch:  9
train ppl:  1311.1083504083306
valid ppl:  503.5230045363773

epoch:  10
train ppl:  1203.498635587493
valid ppl:  505.7599916969862

epoch:  11
train ppl:  1100.0681613054269
valid ppl:  507.6071195979723

epoch:  12
train ppl:  1003.217414775517
valid ppl:  510.07952767103245

epoch:  13
train ppl:  912.2987798296267
valid ppl:  512.8275727599236

epoch:  14
train ppl:  826.911431868259
valid ppl:  516.1525759633064

model parameters

list(lm.parameters())
[Parameter containing:
 tensor([[ 0.4949, -1.2472, -0.7167,  ..., -0.0801, -2.1905,  0.9790],
         [ 0.2070,  1.7394,  0.8255,  ..., -0.5796, -0.3776,  0.8831],
         [-1.4559,  1.1073, -0.4904,  ..., -1.2919, -2.2661, -0.5476],
         ...,
         [-0.3706,  0.2133,  0.0484,  ..., -0.5792, -0.5769, -0.6941],
         [ 0.5502, -0.1212, -2.0879,  ...,  0.6764, -0.5961, -0.6282],
         [ 0.8362,  0.2193, -0.0807,  ...,  2.7741, -0.2589,  0.3310]],
        device='cuda:0', requires_grad=True),
 Parameter containing:
 tensor([[ 0.0252, -0.0744,  0.0817,  ..., -0.0559,  0.0896,  0.0208],
         [ 0.0423,  0.0329, -0.0610,  ..., -0.0009,  0.0169, -0.0361],
         [ 0.0507, -0.0838,  0.0520,  ...,  0.0395,  0.0067,  0.0173],
         ...,
         [ 0.0669,  0.0430, -0.0306,  ...,  0.0096,  0.0619, -0.0992],
         [-0.0153, -0.0888,  0.0580,  ..., -0.0433,  0.0399, -0.0494],
         [-0.0067, -0.0053, -0.0242,  ...,  0.0017, -0.0306, -0.0972]],
        device='cuda:0', requires_grad=True),
 Parameter containing:
 tensor([[ 0.0212, -0.0425, -0.0329,  ..., -0.0206,  0.0839,  0.0286],
         [ 0.0952,  0.0298, -0.1211,  ..., -0.0468, -0.0233, -0.0620],
         [ 0.0108,  0.0422, -0.0492,  ..., -0.0288, -0.0231,  0.0078],
         ...,
         [ 0.0253,  0.0154, -0.0765,  ..., -0.0025,  0.0057, -0.0408],
         [ 0.0892, -0.0928, -0.1039,  ..., -0.1531, -0.0011,  0.0180],
         [ 0.1341,  0.0666, -0.0548,  ..., -0.0573, -0.0376, -0.0813]],
        device='cuda:0', requires_grad=True),
 Parameter containing:
 tensor([0.1115, 0.0423, 0.1438,  ..., 0.0361, 0.0297, 0.0956], device='cuda:0',
        requires_grad=True),
 Parameter containing:
 tensor([0.0786, 0.1201, 0.0857,  ..., 0.1177, 0.1319, 0.0886], device='cuda:0',
        requires_grad=True),
 Parameter containing:
 tensor([[-0.0158,  0.0236, -0.0958,  ..., -0.0906, -0.0678,  0.0057],
         [-0.0871, -0.0788,  0.1217,  ..., -0.0231, -0.0102,  0.0220],
         [ 0.0265, -0.0680, -0.0219,  ..., -0.0520, -0.0565,  0.0628],
         ...,
         [-0.0618,  0.0232,  0.0898,  ...,  0.1069, -0.0112,  0.0103],
         [-0.0489,  0.0708,  0.0546,  ...,  0.1186, -0.0987,  0.1411],
         [-0.0764,  0.0463,  0.0947,  ...,  0.1104, -0.0312,  0.1118]],
        device='cuda:0', requires_grad=True),
 Parameter containing:
 tensor([ 0.0299, -0.0551, -0.0323,  ..., -0.0371, -0.0297, -0.0157],
        device='cuda:0', requires_grad=True)]

learning curves

history_ppl_valid
[528.9542436139727,
 514.4726844027333,
 510.12146471773366,
 509.1060151440451,
 510.02244291272973,
 508.494174178397,
 506.06383737670035,
 504.08067276707567,
 502.6906095632547,
 503.5230045363773,
 505.7599916969862,
 507.6071195979723,
 510.07952767103245,
 512.8275727599236,
 516.1525759633064]
import matplotlib.pyplot as plt
plt.plot(np.arange(len(history_ppl_train)), history_ppl_train, history_ppl_valid)
[<matplotlib.lines.Line2D at 0x7f02842a99a0>,
 <matplotlib.lines.Line2D at 0x7f02842a9a90>]
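A labeled version of the same learning-curve plot may be easier to read; a minimal sketch using standard matplotlib calls, relying only on the history_ppl_train / history_ppl_valid lists defined above:

import matplotlib.pyplot as plt
import numpy as np

epochs = np.arange(len(history_ppl_train))
plt.plot(epochs, history_ppl_train, label='train ppl')
plt.plot(epochs, history_ppl_valid, label='valid ppl')
plt.xlabel('epoch')
plt.ylabel('perplexity')
plt.legend()
plt.show()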

Inference

'Gości innych nie widział oprócz spółleśników'
'Gości innych nie widział oprócz spółleśników'
tokenized = list(tokenize('Gości innych nie widział oprócz spółleśników',lowercase = True))
#tokenized = tokenized[-NGRAMS  :-1 ]
tokenized
['gości', 'innych', 'nie', 'widział', 'oprócz', 'spółleśników']
ids = []
for word in tokenized:
    if word in vocab_stoi:
        ids.append(vocab_stoi[word])
    else:
        ids.append(vocab_stoi['<UNK>'])
ids
[2671, 3168, 5873, 13240, 6938, 15001]
lm.eval()
LSTM(
  (emb): Embedding(15005, 100)
  (rec): LSTM(100, 256, batch_first=True)
  (fc1): Linear(in_features=256, out_features=15005, bias=True)
)
ids = torch.tensor(ids, dtype = torch.long, device = device)
ids
tensor([ 2671,  3168,  5873, 13240,  6938, 15001], device='cuda:0')
preds= lm(ids.unsqueeze(0))
torch.argmax(torch.softmax(preds,1),1).item()
15001
torch.max(torch.softmax(preds,1),1)
torch.return_types.max(
values=tensor([0.1419], device='cuda:0', grad_fn=<MaxBackward0>),
indices=tensor([15001], device='cuda:0'))
vocab_itos[torch.argmax(torch.softmax(preds,1),1).item()]
'<UNK>'

TASK: TEXT GENERATION

Write a text-generating function that, for a given fragment, generates a continuation. Generation should work as follows: draw one token at random from the 10 most probable tokens, but it must not be a special token (UNK, BOS, EOS, PAD). (A function-style sketch is given after the generated output below.)

generate a text 30 tokens long

text generation

tokenized = list(tokenize('Pan Tadeusz', lowercase = True))
tokenized
['pan', 'tadeusz']
ids = []
for word in tokenized:
    if word in vocab_stoi:
        ids.append(vocab_stoi[word])
    else:
        ids.append(vocab_stoi['<UNK>'])
ids = torch.tensor([ids], dtype = torch.long, device = device)
candidates_number = 10
for i in range(30):
    preds = lm(ids)
    # indices of the 10 most probable next tokens
    candidates = torch.topk(torch.softmax(preds,1),candidates_number)[1][0].cpu().numpy()
    candidate = 15001
    # redraw until the sampled index is a regular token (special tokens have ids above 15000)
    while candidate > 15000:
        candidate = candidates[np.random.randint(candidates_number)]
    print(vocab_itos[candidate])
    ids = torch.cat((ids, torch.tensor([[candidate]], device = device)), 1)
a
nie
ma
i
na
nim
na
w
tył
i
tak
w
tył
tylko
i
z
nim
na
litwie
a
tak
z
góry
w
górę
na
nie
a
tak
z
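For completeness, a minimal sketch that wraps the sampling loop above into the reusable function the task asks for. The name generate_text and its parameters are hypothetical; the top-10 / non-special-token sampling mirrors the loop above:

def generate_text(prompt, length=30, candidates_number=10):
    # tokenize the prompt and map tokens to ids (<UNK> for out-of-vocabulary words)
    tokens = list(tokenize(prompt, lowercase=True))
    ids = [vocab_stoi.get(t, vocab_stoi['<UNK>']) for t in tokens]
    ids = torch.tensor([ids], dtype=torch.long, device=device)

    lm.eval()
    generated = []
    with torch.no_grad():
        for _ in range(length):
            preds = lm(ids)
            # indices of the candidates_number most probable next tokens
            candidates = torch.topk(torch.softmax(preds, 1), candidates_number)[1][0].cpu().numpy()
            candidate = vocab_stoi['<UNK>']
            # redraw until the sampled token is not one of the special tokens
            while vocab_itos[candidate] in ('<UNK>', '<BOS>', '<EOS>', '<PAD>'):
                candidate = candidates[np.random.randint(candidates_number)]
            generated.append(vocab_itos[candidate])
            ids = torch.cat((ids, torch.tensor([[candidate]], device=device)), 1)
    return ' '.join(generated)

print(generate_text('Pan Tadeusz'))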