add char lstm language model and translation model
This commit is contained in:
parent 80333aca0a
commit 311683235d
135842  eng-fra.txt  Normal file
File diff suppressed because it is too large
98  pytorch11.py  Normal file
@@ -0,0 +1,98 @@
#!/usr/bin/python3

# character-level language model, after
# https://pytorch.org/tutorials/beginner/nlp/word_embeddings_tutorial.html

import torch
from torch import nn, optim

history_length = 32                   # number of preceding characters fed to the model
history_encoded = [ord('\n')] * history_length
nb_of_char_codes = 128                # plain ASCII only
embedding_size = 30
step = 1000                           # report / sample every `step` characters

device = torch.device('cpu')

f = open('shakespeare.txt')

def char_source():
    # yield the ASCII code of every character of the corpus, one at a time
    for line in f:
        for c in line:
            c_code = ord(c)
            if c_code < nb_of_char_codes:
                yield c_code


class EncoderRNN(nn.Module):
    def __init__(self, input_size, hidden_size, embedding_size):
        super(EncoderRNN, self).__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size

        self.embedding = nn.Embedding(input_size, embedding_size)
        # the GRU state doubles as the per-character score vector, so its size
        # is the vocabulary size; batch_first matches the (batch, seq, features)
        # layout used below
        self.gru = nn.GRU(embedding_size, nb_of_char_codes, batch_first=True)
        self.softmax = nn.LogSoftmax(dim=-1)

    def forward(self, input, hidden):
        embedded = self.embedding(input)
        output, hidden = self.gru(embedded, hidden)
        output = self.softmax(output)
        return output, hidden

    def initHidden(self):
        # (num_layers, batch, hidden) for a single sequence
        return torch.zeros(1, 1, nb_of_char_codes, device=device)

    def generate(self, n, encoder_hidden):
        # seed the history with a fixed prompt and sample n characters
        t = (" " * 200 + "To be or not to be")[-history_length:]
        history = [ord(c) for c in t]

        with torch.no_grad():
            for _ in range(n):
                x = torch.tensor(history, dtype=torch.long, device=device)
                x = x.unsqueeze(0)
                y = model(x, encoder_hidden)[0][:, -1, :][0]
                y = torch.exp(y)
                # keep only the two most likely characters and sample between them
                best = sorted(range(nb_of_char_codes), key=lambda i: -y[i])[0:2]
                yb = torch.tensor([(float(y[ix]) if ix in best else 0.0) for ix in range(nb_of_char_codes)])
                c = torch.multinomial(yb, 1)[0].item()

                t += chr(c)

                history.pop(0)
                history.append(c)

        print(t)


model = EncoderRNN(nb_of_char_codes, history_length, embedding_size).to(device)
criterion = nn.NLLLoss().to(device)
optimizer = optim.Adam(model.parameters())


counter = 0
losses = []

for c in char_source():
    # predict the next character c from the preceding history_length characters
    x = torch.tensor(history_encoded, dtype=torch.long, device=device)
    model.zero_grad()
    x = x.unsqueeze(0)
    encoder_hidden = model.initHidden()
    y = model(x, encoder_hidden)[0][:, -1, :]

    loss = criterion(y, torch.tensor([c]).to(device))
    losses += [loss.item()]
    if len(losses) > step:
        losses.pop(0)

    counter += 1

    if counter % step == 0:
        avg_loss = sum(losses) / len(losses)
        print(f"{counter}: {loss.item()} {avg_loss}")
        model.generate(200, encoder_hidden)

    loss.backward()
    optimizer.step()

    history_encoded.pop(0)
    history_encoded.append(c)
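The commit message says "char lstm", but pytorch11.py builds its recurrent layer from nn.GRU. Below is a minimal sketch of what the LSTM variant of that layer could look like, assuming the same vocabulary-sized hidden state as above; the class name CharLSTM is made up for illustration and is not part of this commit. The main difference is that an LSTM carries a (hidden, cell) state pair instead of a single hidden tensor.

# hypothetical LSTM counterpart of EncoderRNN in pytorch11.py (illustration only)
import torch
from torch import nn

class CharLSTM(nn.Module):
    def __init__(self, input_size, embedding_size, nb_of_char_codes):
        super(CharLSTM, self).__init__()
        self.nb_of_char_codes = nb_of_char_codes
        self.embedding = nn.Embedding(input_size, embedding_size)
        self.lstm = nn.LSTM(embedding_size, nb_of_char_codes, batch_first=True)
        self.softmax = nn.LogSoftmax(dim=-1)

    def forward(self, input, hidden):
        output, hidden = self.lstm(self.embedding(input), hidden)
        return self.softmax(output), hidden

    def initHidden(self):
        # an LSTM needs both a hidden state and a cell state
        h0 = torch.zeros(1, 1, self.nb_of_char_codes)
        c0 = torch.zeros(1, 1, self.nb_of_char_codes)
        return (h0, c0)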
122  pytorch12.py  Normal file
@@ -0,0 +1,122 @@
#!/usr/bin/python3

# character-level sequence-to-sequence translation, after
# https://pytorch.org/tutorials/intermediate/seq2seq_translation_tutorial.html

import torch
from torch import nn, optim

nb_of_char_codes = 128 + 2   # ASCII plus the two special tokens below
SOS_token_id = 128           # start of sentence
EOS_token_id = 129           # end of sentence

hidden_size = 32
step = 200                   # report every `step` sentence pairs

device = torch.device('cpu')

f = open('eng-fra.txt')

def char_source():
    # each line holds an English sentence and its French translation, tab-separated
    for line in f:
        s, t = line.rstrip('\n').split('\t')
        s_list = []
        t_list = []

        for c in s:
            c_code = ord(c)
            if c_code < SOS_token_id:   # plain ASCII only; 128/129 are reserved
                s_list.append(c_code)

        for c in t:
            c_code = ord(c)
            if c_code < SOS_token_id:
                t_list.append(c_code)

        yield s_list, t_list


class EncoderRNN(nn.Module):
    def __init__(self, input_size, hidden_size):
        super(EncoderRNN, self).__init__()
        self.hidden_size = hidden_size

        self.embedding = nn.Embedding(input_size, hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size)

    def forward(self, input, hidden):
        embedded = self.embedding(input)
        output, hidden = self.gru(embedded, hidden)
        return output, hidden

    def initHidden(self):
        return torch.zeros(1, 1, self.hidden_size, device=device)


class DecoderRNN(nn.Module):
    def __init__(self, hidden_size, output_size):
        super(DecoderRNN, self).__init__()
        self.hidden_size = hidden_size

        self.embedding = nn.Embedding(output_size, hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size)
        self.out = nn.Linear(hidden_size, output_size)
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, input, hidden):
        output = self.embedding(input)
        output = torch.nn.functional.relu(output)
        output, hidden = self.gru(output, hidden)
        output = self.softmax(self.out(output[0]))
        return output, hidden


encoder = EncoderRNN(nb_of_char_codes, hidden_size).to(device)
decoder = DecoderRNN(hidden_size, nb_of_char_codes).to(device)
criterion = nn.NLLLoss().to(device)
optimizer = optim.Adam(list(encoder.parameters()) + list(decoder.parameters()))

counter = 0
losses = []

for s, t in char_source():
    counter += 1
    encoder.zero_grad()
    decoder.zero_grad()
    x = torch.tensor(s, dtype=torch.long, device=device)
    encoder_hidden = encoder.initHidden()
    # encoder outputs are collected per input character (e.g. for a later
    # attention decoder); only the final hidden state is used below
    encoder_output = torch.zeros(x.shape[0], hidden_size, device=device)
    for i in range(x.shape[0]):
        output, encoder_hidden = encoder(x[i].unsqueeze(0).unsqueeze(0), encoder_hidden)
        encoder_output[i] = output[0, 0]

    # the decoder starts from the encoder's final hidden state and an SOS token
    decoder_hidden = encoder_hidden
    decoder_input = torch.tensor([[SOS_token_id]], device=device)

    t.append(EOS_token_id)
    y = torch.tensor(t, dtype=torch.long, device=device)
    loss = 0
    output_string = ''
    for di in range(y.shape[0]):
        decoder_output, decoder_hidden = decoder(
            decoder_input, decoder_hidden)
        topv, topi = decoder_output.topk(1)
        decoder_input = topi.detach()  # feed the prediction back in, detached from history

        loss += criterion(decoder_output, y[di].unsqueeze(0))
        if topi.item() == EOS_token_id:
            break
        output_string += chr(topi.item())

    losses.append(loss.item())
    if counter % step == 0:
        avg_loss = sum(losses) / len(losses)
        print(f"{counter}: {avg_loss}")
        losses = []
        print('IN :\t', ''.join(chr(a) for a in s))
        print('EXP:\t', ''.join(chr(a) for a in t if a < SOS_token_id))
        print('OUT:\t', output_string)

    loss.backward()
    optimizer.step()
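The decoding loop above always feeds the decoder its own previous prediction (decoder_input = topi.detach()). The seq2seq tutorial linked at the top of pytorch12.py also mixes in teacher forcing, where the ground-truth character is fed back instead. Below is a minimal sketch of that variant of the inner loop, assuming decoder, criterion, y, decoder_hidden, device, SOS_token_id and EOS_token_id are as defined above; the teacher_forcing_ratio value is illustrative, not taken from this commit.

# teacher-forcing variant of the decoding loop (sketch, not part of this commit)
import random

teacher_forcing_ratio = 0.5  # illustrative value

use_teacher_forcing = random.random() < teacher_forcing_ratio
decoder_input = torch.tensor([[SOS_token_id]], device=device)
loss = 0
for di in range(y.shape[0]):
    decoder_output, decoder_hidden = decoder(decoder_input, decoder_hidden)
    loss += criterion(decoder_output, y[di].unsqueeze(0))
    if use_teacher_forcing:
        # feed the ground-truth character back in
        decoder_input = y[di].unsqueeze(0).unsqueeze(0)
    else:
        # feed the model's own prediction back in, as in the loop above
        topv, topi = decoder_output.topk(1)
        decoder_input = topi.detach()
        if topi.item() == EOS_token_id:
            break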
149542  shakespeare.txt  Normal file
File diff suppressed because it is too large