imports
from gensim.utils import tokenize
import numpy as np
import torch
from tqdm.notebook import tqdm
/media/kuba/ssdsam/anaconda3/lib/python3.8/site-packages/gensim/similarities/__init__.py:15: UserWarning: The gensim.similarities.levenshtein submodule is disabled, because the optional Levenshtein package <https://pypi.org/project/python-Levenshtein/> is unavailable. Install Levenhstein (e.g. `pip install python-Levenshtein`) to suppress this warning. warnings.warn(msg)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
#device = 'cpu'
print('Using {} device'.format(device))
Using cuda device
device
device(type='cuda')
dataset preparation
pan_tadeusz_path_train = '/home/kuba/Syncthing/przedmioty/2020-02/ISI/zajecia9_ngramowy_model_jezykowy/pan-tadeusz-train.txt'
pan_tadeusz_path_valid = '/home/kuba/Syncthing/przedmioty/2020-02/ISI/zajecia9_ngramowy_model_jezykowy/pan-tadeusz-test.txt'
corpora_train = open(pan_tadeusz_path_train).read()
corpora_train_tokenized = list(tokenize(corpora_train,lowercase = True))
vocab_itos = sorted(set(corpora_train_tokenized))
len(vocab_itos)
16598
# keep the first 15001 corpus tokens and append four special tokens (ids 15001-15004)
vocab_itos = vocab_itos[:15001] + ["<UNK>", "<BOS>", "<EOS>", "<PAD>"]
len(vocab_itos)
15005
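Note that vocab_itos is sorted alphabetically, so the cutoff above drops words by alphabetical position rather than by frequency. A minimal frequency-based alternative, as a sketch with collections.Counter (hypothetical; not what the recorded ids below were produced with):
from collections import Counter

# keep the 15001 most common tokens, then append the special tokens as above
counts = Counter(corpora_train_tokenized)
freq_vocab_itos = [w for w, _ in counts.most_common(15001)] + ["<UNK>", "<BOS>", "<EOS>", "<PAD>"]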
# inverse mapping: token -> id
vocab_stoi = {token: i for i, token in enumerate(vocab_itos)}
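A quick sanity check that the two mappings are inverses (the 'xyzzy' probe is assumed to be out of vocabulary):
assert vocab_stoi[vocab_itos[0]] == 0
# unknown words have no entry in vocab_stoi; get_token_id below falls back to <UNK>
vocab_stoi.get('xyzzy', vocab_stoi['<UNK>'])  # -> 15001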
NGRAMS = 5
def get_token_id(dataset):
    # prepend (NGRAMS-1) padding tokens and a <BOS> marker, append <EOS>
    token_ids = [vocab_stoi['<PAD>']] * (NGRAMS - 1) + [vocab_stoi['<BOS>']]
    for token in dataset:
        # map out-of-vocabulary tokens to <UNK>
        token_ids.append(vocab_stoi.get(token, vocab_stoi['<UNK>']))
    token_ids.append(vocab_stoi['<EOS>'])
    return token_ids
train_ids = get_token_id(corpora_train_tokenized)
train_ids[:30]
[15004, 15004, 15004, 15004, 15002, 7, 5002, 7247, 11955, 1432, 7018, 14739, 5506, 4696, 4276, 7505, 2642, 8477, 7259, 10870, 10530, 7506, 12968, 7997, 1911, 12479, 11129, 13069, 11797, 5819]
def get_samples(dataset):
    # slide a window of NGRAMS tokens over the corpus
    samples = []
    for i in range(len(dataset) - NGRAMS):
        samples.append(dataset[i:i + NGRAMS])
    return samples
train_ids = get_samples(train_ids)
train_ids = torch.tensor(train_ids, device = device)
train_ids[:30]
tensor([[15004, 15004, 15004, 15004, 15002], [15004, 15004, 15004, 15002, 7], [15004, 15004, 15002, 7, 5002], [15004, 15002, 7, 5002, 7247], [15002, 7, 5002, 7247, 11955], [ 7, 5002, 7247, 11955, 1432], [ 5002, 7247, 11955, 1432, 7018], [ 7247, 11955, 1432, 7018, 14739], [11955, 1432, 7018, 14739, 5506], [ 1432, 7018, 14739, 5506, 4696], [ 7018, 14739, 5506, 4696, 4276], [14739, 5506, 4696, 4276, 7505], [ 5506, 4696, 4276, 7505, 2642], [ 4696, 4276, 7505, 2642, 8477], [ 4276, 7505, 2642, 8477, 7259], [ 7505, 2642, 8477, 7259, 10870], [ 2642, 8477, 7259, 10870, 10530], [ 8477, 7259, 10870, 10530, 7506], [ 7259, 10870, 10530, 7506, 12968], [10870, 10530, 7506, 12968, 7997], [10530, 7506, 12968, 7997, 1911], [ 7506, 12968, 7997, 1911, 12479], [12968, 7997, 1911, 12479, 11129], [ 7997, 1911, 12479, 11129, 13069], [ 1911, 12479, 11129, 13069, 11797], [12479, 11129, 13069, 11797, 5819], [11129, 13069, 11797, 5819, 6268], [13069, 11797, 5819, 6268, 2807], [11797, 5819, 6268, 2807, 7831], [ 5819, 6268, 2807, 7831, 12893]], device='cuda:0')
train_ids.shape
torch.Size([57022, 5])
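The same sliding windows can also be built in one call with Tensor.unfold; a sketch (note unfold yields len - NGRAMS + 1 windows, one more than get_samples, whose range stops one window short):
# hypothetical one-liner alternative to get_samples
all_ids = torch.tensor(get_token_id(corpora_train_tokenized), device=device)
all_ids.unfold(0, NGRAMS, 1).shape  # torch.Size([57023, 5]): one extra window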
corpora_valid = open(pan_tadeusz_path_valid).read()
corpora_valid_tokenized = list(tokenize(corpora_valid,lowercase = True))
valid_ids = get_token_id(corpora_valid_tokenized)
valid_ids = torch.tensor(get_samples(valid_ids), dtype = torch.long, device = device)
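The manual batch slicing used below could also be expressed with a DataLoader; a hypothetical sketch (shuffle=True is an addition, the loops below iterate in corpus order):
from torch.utils.data import TensorDataset, DataLoader

train_ds = TensorDataset(train_ids[:, :NGRAMS - 1], train_ids[:, NGRAMS - 1])
train_dl = DataLoader(train_ds, batch_size=128, shuffle=True)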
model
# https://pytorch.org/docs/stable/generated/torch.nn.LSTM.html
class LSTM(torch.nn.Module):
    def __init__(self):
        super(LSTM, self).__init__()
        self.emb = torch.nn.Embedding(len(vocab_itos), 100)
        self.rec = torch.nn.LSTM(100, 256, 1, batch_first=True)
        self.fc1 = torch.nn.Linear(256, len(vocab_itos))
        #self.dropout = torch.nn.Dropout(0.5)

    def forward(self, x):
        emb = self.emb(x)                   # (batch, seq, 100)
        #emb = self.dropout(emb)
        output, (h_n, c_n) = self.rec(emb)  # h_n: (1, batch, 256)
        hidden = h_n.squeeze(0)             # final hidden state per sequence
        out = self.fc1(hidden)              # logits over the vocabulary
        #out = self.dropout(out)
        return out
lm = LSTM().to(device)
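A quick size check: with the shapes above (15005×100 embedding, a single 100→256 LSTM layer, 256→15005 output layer) the model has roughly 5.7M trainable parameters:
sum(p.numel() for p in lm.parameters() if p.requires_grad)  # ≈ 5.72M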
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(lm.parameters(),lr=0.0001)
BATCH_SIZE = 128
EPOCHS = 15
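Perplexity below is the exponential of the mean cross-entropy: CrossEntropyLoss returns the average negative log-likelihood of the target token, so np.exp(loss_sum / batches) is exp of the mean NLL. As a reference point, a model that spreads probability uniformly over the vocabulary (which is what the commented-out torch.zeros_like line in get_ppl checks) scores a perplexity equal to the vocabulary size:
np.exp(np.log(len(vocab_itos)))  # uniform baseline: 15005.0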
def get_ppl(dataset_ids):
    lm.eval()
    batches = 0
    loss_sum = 0
    with torch.no_grad():
        for i in range(0, len(dataset_ids) - BATCH_SIZE + 1, BATCH_SIZE):
            X = dataset_ids[i:i + BATCH_SIZE, :NGRAMS - 1]  # context tokens
            Y = dataset_ids[i:i + BATCH_SIZE, NGRAMS - 1]   # token to predict
            predictions = lm(X)
            # uniform-distribution baseline:
            # predictions = torch.zeros_like(predictions)
            loss = criterion(predictions, Y)
            loss_sum += loss.item()
            batches += 1
    # perplexity = exp(mean cross-entropy)
    return np.exp(loss_sum / batches)
history_ppl_train = []
history_ppl_valid = []
for epoch in range(EPOCHS):
    batches = 0
    loss_sum = 0
    lm.train()
    for i in tqdm(range(0, len(train_ids) - BATCH_SIZE + 1, BATCH_SIZE)):
        X = train_ids[i:i + BATCH_SIZE, :NGRAMS - 1]
        Y = train_ids[i:i + BATCH_SIZE, NGRAMS - 1]
        predictions = lm(X)
        loss = criterion(predictions, Y)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        loss_sum += loss.item()
        batches += 1
    ppl_train = get_ppl(train_ids)
    ppl_valid = get_ppl(valid_ids)
    history_ppl_train.append(ppl_train)
    history_ppl_valid.append(ppl_valid)
    print(f'epoch: {epoch}  train ppl: {ppl_train}  valid ppl: {ppl_valid}')
epoch: 0  train ppl: 2296.6914856482526  valid ppl: 528.9542436139727
epoch: 1  train ppl: 2093.302103954666  valid ppl: 514.4726844027333
epoch: 2  train ppl: 2014.09679023559  valid ppl: 510.12146471773366
epoch: 3  train ppl: 1939.0594855086504  valid ppl: 509.1060151440451
epoch: 4  train ppl: 1854.4566511885196  valid ppl: 510.02244291272973
epoch: 5  train ppl: 1755.030202547313  valid ppl: 508.494174178397
epoch: 6  train ppl: 1646.180912657662  valid ppl: 506.06383737670035
epoch: 7  train ppl: 1533.0501876139222  valid ppl: 504.08067276707567
epoch: 8  train ppl: 1420.680717507558  valid ppl: 502.6906095632547
epoch: 9  train ppl: 1311.1083504083306  valid ppl: 503.5230045363773
epoch: 10  train ppl: 1203.498635587493  valid ppl: 505.7599916969862
epoch: 11  train ppl: 1100.0681613054269  valid ppl: 507.6071195979723
epoch: 12  train ppl: 1003.217414775517  valid ppl: 510.07952767103245
epoch: 13  train ppl: 912.2987798296267  valid ppl: 512.8275727599236
epoch: 14  train ppl: 826.911431868259  valid ppl: 516.1525759633064
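Validation perplexity bottoms out around epoch 8 (≈502.7) and rises afterwards while training perplexity keeps falling, i.e. the model starts to overfit. A hypothetical variant of the loop that keeps the best checkpoint (the file name is an assumption):
best_ppl = float('inf')
for epoch in range(EPOCHS):
    # ... training steps as above ...
    ppl_valid = get_ppl(valid_ids)
    if ppl_valid < best_ppl:
        best_ppl = ppl_valid
        torch.save(lm.state_dict(), 'lm_best.pt')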
model parameters
list(lm.parameters())
[Parameter containing: tensor([[ 0.4949, -1.2472, -0.7167, ..., -0.0801, -2.1905, 0.9790], [ 0.2070, 1.7394, 0.8255, ..., -0.5796, -0.3776, 0.8831], [-1.4559, 1.1073, -0.4904, ..., -1.2919, -2.2661, -0.5476], ..., [-0.3706, 0.2133, 0.0484, ..., -0.5792, -0.5769, -0.6941], [ 0.5502, -0.1212, -2.0879, ..., 0.6764, -0.5961, -0.6282], [ 0.8362, 0.2193, -0.0807, ..., 2.7741, -0.2589, 0.3310]], device='cuda:0', requires_grad=True), Parameter containing: tensor([[ 0.0252, -0.0744, 0.0817, ..., -0.0559, 0.0896, 0.0208], [ 0.0423, 0.0329, -0.0610, ..., -0.0009, 0.0169, -0.0361], [ 0.0507, -0.0838, 0.0520, ..., 0.0395, 0.0067, 0.0173], ..., [ 0.0669, 0.0430, -0.0306, ..., 0.0096, 0.0619, -0.0992], [-0.0153, -0.0888, 0.0580, ..., -0.0433, 0.0399, -0.0494], [-0.0067, -0.0053, -0.0242, ..., 0.0017, -0.0306, -0.0972]], device='cuda:0', requires_grad=True), Parameter containing: tensor([[ 0.0212, -0.0425, -0.0329, ..., -0.0206, 0.0839, 0.0286], [ 0.0952, 0.0298, -0.1211, ..., -0.0468, -0.0233, -0.0620], [ 0.0108, 0.0422, -0.0492, ..., -0.0288, -0.0231, 0.0078], ..., [ 0.0253, 0.0154, -0.0765, ..., -0.0025, 0.0057, -0.0408], [ 0.0892, -0.0928, -0.1039, ..., -0.1531, -0.0011, 0.0180], [ 0.1341, 0.0666, -0.0548, ..., -0.0573, -0.0376, -0.0813]], device='cuda:0', requires_grad=True), Parameter containing: tensor([0.1115, 0.0423, 0.1438, ..., 0.0361, 0.0297, 0.0956], device='cuda:0', requires_grad=True), Parameter containing: tensor([0.0786, 0.1201, 0.0857, ..., 0.1177, 0.1319, 0.0886], device='cuda:0', requires_grad=True), Parameter containing: tensor([[-0.0158, 0.0236, -0.0958, ..., -0.0906, -0.0678, 0.0057], [-0.0871, -0.0788, 0.1217, ..., -0.0231, -0.0102, 0.0220], [ 0.0265, -0.0680, -0.0219, ..., -0.0520, -0.0565, 0.0628], ..., [-0.0618, 0.0232, 0.0898, ..., 0.1069, -0.0112, 0.0103], [-0.0489, 0.0708, 0.0546, ..., 0.1186, -0.0987, 0.1411], [-0.0764, 0.0463, 0.0947, ..., 0.1104, -0.0312, 0.1118]], device='cuda:0', requires_grad=True), Parameter containing: tensor([ 0.0299, -0.0551, -0.0323, ..., -0.0371, -0.0297, -0.0157], device='cuda:0', requires_grad=True)]
learning curves
history_ppl_valid
[528.9542436139727, 514.4726844027333, 510.12146471773366, 509.1060151440451, 510.02244291272973, 508.494174178397, 506.06383737670035, 504.08067276707567, 502.6906095632547, 503.5230045363773, 505.7599916969862, 507.6071195979723, 510.07952767103245, 512.8275727599236, 516.1525759633064]
import matplotlib.pyplot as plt
plt.plot(np.arange(len(history_ppl_train)), history_ppl_train, history_ppl_valid)
[<matplotlib.lines.Line2D at 0x7f02842a99a0>, <matplotlib.lines.Line2D at 0x7f02842a9a90>]
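The call above draws both curves, but without labels; a labeled version of the same plot:
epochs = np.arange(len(history_ppl_train))
plt.plot(epochs, history_ppl_train, label='train ppl')
plt.plot(epochs, history_ppl_valid, label='valid ppl')
plt.xlabel('epoch')
plt.ylabel('perplexity')
plt.legend()
plt.show()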
Inference
'Gości innych nie widział oprócz spółleśników'
'Gości innych nie widział oprócz spółleśników'
tokenized = list(tokenize('Gości innych nie widział oprócz spółleśników',lowercase = True))
#tokenized = tokenized[-NGRAMS :-1 ]
tokenized
['gości', 'innych', 'nie', 'widział', 'oprócz', 'spółleśników']
ids = []
for word in tokenized:
    if word in vocab_stoi:
        ids.append(vocab_stoi[word])
    else:
        # out-of-vocabulary words map to <UNK>
        ids.append(vocab_stoi['<UNK>'])
ids
[2671, 3168, 5873, 13240, 6938, 15001]
lm.eval()
LSTM( (emb): Embedding(15005, 100) (rec): LSTM(100, 256, batch_first=True) (fc1): Linear(in_features=256, out_features=15005, bias=True) )
ids = torch.tensor(ids, dtype = torch.long, device = device)
ids
tensor([ 2671, 3168, 5873, 13240, 6938, 15001], device='cuda:0')
preds= lm(ids.unsqueeze(0))
torch.argmax(torch.softmax(preds,1),1).item()
15001
torch.max(torch.softmax(preds,1),1)
torch.return_types.max( values=tensor([0.1419], device='cuda:0', grad_fn=<MaxBackward0>), indices=tensor([15001], device='cuda:0'))
vocab_itos[torch.argmax(torch.softmax(preds,1),1).item()]
'<UNK>'
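The single most probable continuation here is <UNK> itself. For readable output, the special tokens (ids 15001-15004) can be zeroed out before taking the argmax; a sketch:
with torch.no_grad():
    probs = torch.softmax(preds, 1)
    probs[:, 15001:] = 0  # mask <UNK>, <BOS>, <EOS>, <PAD>
vocab_itos[torch.argmax(probs, 1).item()]  # best non-special prediction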
TASK: TEXT GENERATION
Write a text-generating function that, given an input fragment, generates a continuation. Generation should work as follows: draw one token at random from the 10 most probable tokens, but it must not be a special token (UNK, BOS, EOS, PAD).
Generate a text 30 tokens long.
text generation
tokenized = list(tokenize('Pan Tadeusz', lowercase = True))
tokenized
['pan', 'tadeusz']
ids = []
for word in tokenized:
    if word in vocab_stoi:
        ids.append(vocab_stoi[word])
    else:
        ids.append(vocab_stoi['<UNK>'])
ids = torch.tensor([ids], dtype=torch.long, device=device)
candidates_number = 10
for i in range(30):
    preds = lm(ids)
    # ids of the 10 most probable next tokens
    candidates = torch.topk(torch.softmax(preds, 1), candidates_number)[1][0].cpu().numpy()
    # resample until a non-special token is drawn (ids > 15000 are special)
    candidate = 15001
    while candidate > 15000:
        candidate = candidates[np.random.randint(candidates_number)]
    print(vocab_itos[candidate])
    # append the sampled token and feed the extended context back in
    ids = torch.cat((ids, torch.tensor([[candidate]], device=device)), 1)
a nie ma i na nim na w tył i tak w tył tylko i z nim na litwie a tak z góry w górę na nie a tak z
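The loop above can be wrapped into a reusable helper that returns the generated string instead of printing token by token; a sketch (the names generate, length and k are ours):
def generate(prompt, length=30, k=10):
    # encode the prompt, mapping unknown words to <UNK>
    tokens = list(tokenize(prompt, lowercase=True))
    ids = [vocab_stoi.get(w, vocab_stoi['<UNK>']) for w in tokens]
    ids = torch.tensor([ids], dtype=torch.long, device=device)
    out = []
    lm.eval()
    with torch.no_grad():
        for _ in range(length):
            preds = lm(ids)
            candidates = torch.topk(torch.softmax(preds, 1), k)[1][0].cpu().numpy()
            candidate = 15001
            while candidate > 15000:  # reject special tokens
                candidate = candidates[np.random.randint(k)]
            out.append(vocab_itos[candidate])
            ids = torch.cat((ids, torch.tensor([[candidate]], device=device)), 1)
    return ' '.join(out)

generate('Pan Tadeusz')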