In [2]:
from __future__ import unicode_literals, print_function, division
from io import open
import unicodedata
import string
import re
import random

import torch
import torch.nn as nn
from torch import optim
import torch.nn.functional as F

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [3]:
SOS_token = 0
EOS_token = 1


class Lang:
 def __init__(self, name):
 self.name = name
 self.word2index = {}
 self.word2count = {}
 self.index2word = {0: "SOS", 1: "EOS"}
 self.n_words = 2 # Count SOS and EOS

 def addSentence(self, sentence):
 for word in sentence.split(' '):
 self.addWord(word)

 def addWord(self, word):
 if word not in self.word2index:
 self.word2index[word] = self.n_words
 self.word2count[word] = 1
 self.index2word[self.n_words] = word
 self.n_words += 1
 else:
 self.word2count[word] += 1

In [4]:
# Turn a Unicode string to plain ASCII, thanks to
# https://stackoverflow.com/a/518232/2809427
def unicodeToAscii(s):
 return ''.join(
 c for c in unicodedata.normalize('NFD', s)
 if unicodedata.category(c) != 'Mn'
 )

# Lowercase, trim, and remove non-letter characters


def normalizeString(s):
 s = unicodeToAscii(s.lower().strip())
 s = re.sub(r"([.!?])", r" \1", s)
 s = re.sub(r"[^a-zA-Z.!?]+", r" ", s)
 return s

In [5]:
def readLangs():
 print("Reading lines...")

 # Read the file and split into lines
 linesIn = open('in_40k.tsv').read().strip().split('\n')[:100]
 linesOut = open('exp_40k.tsv').read().strip().split('\n')[:100]
 # Split every line into pairs and normalize
 pairs = [[normalizeString(a),normalizeString(b)] for a,b in zip(linesIn,linesOut)]

 print(pairs)

 # Reverse pairs, make Lang instances
 # pairs = [list(reversed(p)) for p in pairs]
 input_lang = Lang('in')
 output_lang = Lang('out')
 return input_lang, output_lang, pairs

In [6]:
MAX_LENGTH = 80

def filterPair(p):
 #print(p)
 return len(p[0].split(' ')) < MAX_LENGTH and \
 len(p[1].split(' ')) < MAX_LENGTH


def filterPairs(pairs):
 return [pair for pair in pairs if filterPair(pair)]

In [8]:
def prepareData(lang1, lang2, reverse=False):
 input_lang, output_lang, pairs = readLangs()
 print("Read %s sentence pairs" % len(pairs))
 pairs = filterPairs(pairs)
 print("Trimmed to %s sentence pairs" % len(pairs))
 print("Counting words...")
 for pair in pairs:
 input_lang.addSentence(pair[0])
 output_lang.addSentence(pair[1])
 print("Counted words:")
 print(input_lang.name, input_lang.n_words)
 print(output_lang.name, output_lang.n_words)
 return input_lang, output_lang, pairs


input_lang, output_lang, pairs = prepareData('pl', 'en', True)
#print(random.choice(pairs))

Reading lines...
[['naprawde wazne jest by wzrost gospodarczy nie powodowa automatycznie proporcjonalnego zwiekszonego zuzycia energii .', 'it is really important that growth should not automatically generate a proportionate rise in energy consumption .'], [' bg pani przewodniczaca panie premierze ! rok bedzie pierwszym w ktorym unii europejskiej beda przewodzic dwa kraje z europy srodkowej i wschodniej wegry oraz polska .', ' bg madam president prime minister will be the first year in which the european union will be headed by two countries from central and eastern europe hungary and poland .'], ['w dodatku odsetek ludzi w wieku ponad lat wzrosnie z w roku do w roku .', 'in addition the proportion of people aged over will rise from . in to . in .'], ['na pismie . sv w sprawozdaniu stwierdzono ze w wiekszosci panstw cz onkowskich spo eczenstwo starzeje sie co obciazy systemy zabezpieczenia spo ecznego i systemy emerytalne .', 'in writing . sv this report observes that in most member st

In [9]:
input_lang.n_words

1155

In [10]:
class EncoderRNN(nn.Module):
 def __init__(self, input_size, hidden_size):
 super(EncoderRNN, self).__init__()
 self.hidden_size = hidden_size

 self.embedding = nn.Embedding(input_size, hidden_size)
 self.gru = nn.GRU(hidden_size, hidden_size)

 def forward(self, input, hidden):
 embedded = self.embedding(input).view(1, 1, -1)
 output = embedded
 output, hidden = self.gru(output, hidden)
 return output, hidden

 def initHidden(self):
 return torch.zeros(1, 1, self.hidden_size, device=device)

In [11]:
class AttnDecoderRNN(nn.Module):
 def __init__(self, hidden_size, output_size, dropout_p=0.1, max_length=MAX_LENGTH):
 super(AttnDecoderRNN, self).__init__()
 self.hidden_size = hidden_size
 self.output_size = output_size
 self.dropout_p = dropout_p
 self.max_length = max_length

 self.embedding = nn.Embedding(self.output_size, self.hidden_size)
 self.attn = nn.Linear(self.hidden_size * 2, self.max_length)
 self.attn_combine = nn.Linear(self.hidden_size * 2, self.hidden_size)
 self.dropout = nn.Dropout(self.dropout_p)
 self.gru = nn.GRU(self.hidden_size, self.hidden_size)
 self.out = nn.Linear(self.hidden_size, self.output_size)

 def forward(self, input, hidden, encoder_outputs):
 embedded = self.embedding(input).view(1, 1, -1)
 embedded = self.dropout(embedded)

 attn_weights = F.softmax(
 self.attn(torch.cat((embedded[0], hidden[0]), 1)), dim=1)
 attn_applied = torch.bmm(attn_weights.unsqueeze(0),
 encoder_outputs.unsqueeze(0))

 output = torch.cat((embedded[0], attn_applied[0]), 1)
 output = self.attn_combine(output).unsqueeze(0)

 output = F.relu(output)
 output, hidden = self.gru(output, hidden)

 output = F.log_softmax(self.out(output[0]), dim=1)
 return output, hidden, attn_weights

 def initHidden(self):
 return torch.zeros(1, 1, self.hidden_size, device=device)

In [22]:
def indexesFromSentence(lang, sentence):
 res = []
 for word in sentence.split(' '):
 if word not in lang.word2index:
 res.append(random.randrange(len(lang.word2index)))
 else:
 res.append(lang.word2index[word])
 return res


def tensorFromSentence(lang, sentence):
 indexes = indexesFromSentence(lang, sentence)
 indexes.append(EOS_token)
 return torch.tensor(indexes, dtype=torch.long, device=device).view(-1, 1)


def tensorsFromPair(pair):
 input_tensor = tensorFromSentence(input_lang, pair[0])
 target_tensor = tensorFromSentence(output_lang, pair[1])
 return (input_tensor, target_tensor)

In [13]:
teacher_forcing_ratio = 0.5


def train(input_tensor, target_tensor, encoder, decoder, encoder_optimizer, decoder_optimizer, criterion, max_length=MAX_LENGTH):
 encoder_hidden = encoder.initHidden()

 encoder_optimizer.zero_grad()
 decoder_optimizer.zero_grad()

 input_length = input_tensor.size(0)
 target_length = target_tensor.size(0)

 encoder_outputs = torch.zeros(max_length, encoder.hidden_size, device=device)

 loss = 0

 for ei in range(input_length):
 encoder_output, encoder_hidden = encoder(
 input_tensor[ei], encoder_hidden)
 encoder_outputs[ei] = encoder_output[0, 0]

 decoder_input = torch.tensor([[SOS_token]], device=device)

 decoder_hidden = encoder_hidden

 use_teacher_forcing = True if random.random() < teacher_forcing_ratio else False

 if use_teacher_forcing:
 # Teacher forcing: Feed the target as the next input
 for di in range(target_length):
 decoder_output, decoder_hidden, decoder_attention = decoder(
 decoder_input, decoder_hidden, encoder_outputs)
 loss += criterion(decoder_output, target_tensor[di])
 decoder_input = target_tensor[di] # Teacher forcing

 else:
 # Without teacher forcing: use its own predictions as the next input
 for di in range(target_length):
 decoder_output, decoder_hidden, decoder_attention = decoder(
 decoder_input, decoder_hidden, encoder_outputs)
 topv, topi = decoder_output.topk(1)
 decoder_input = topi.squeeze().detach() # detach from history as input

 loss += criterion(decoder_output, target_tensor[di])
 if decoder_input.item() == EOS_token:
 break

 loss.backward()

 encoder_optimizer.step()
 decoder_optimizer.step()

 return loss.item() / target_length

In [14]:
import time
import math


def asMinutes(s):
 m = math.floor(s / 60)
 s -= m * 60
 return '%dm %ds' % (m, s)


def timeSince(since, percent):
 now = time.time()
 s = now - since
 es = s / (percent)
 rs = es - s
 return '%s (- %s)' % (asMinutes(s), asMinutes(rs))

In [15]:
def trainIters(encoder, decoder, n_iters, print_every=1000, plot_every=100, learning_rate=0.01):
 start = time.time()
 plot_losses = []
 print_loss_total = 0 # Reset every print_every
 plot_loss_total = 0 # Reset every plot_every

 encoder_optimizer = optim.SGD(encoder.parameters(), lr=learning_rate)
 decoder_optimizer = optim.SGD(decoder.parameters(), lr=learning_rate)
 training_pairs = [tensorsFromPair(random.choice(pairs))
 for i in range(n_iters)]
 criterion = nn.NLLLoss()

 for iter in range(1, n_iters + 1):
 training_pair = training_pairs[iter - 1]
 input_tensor = training_pair[0]
 target_tensor = training_pair[1]

 loss = train(input_tensor, target_tensor, encoder,
 decoder, encoder_optimizer, decoder_optimizer, criterion)
 print_loss_total += loss
 plot_loss_total += loss

 if iter % print_every == 0:
 print_loss_avg = print_loss_total / print_every
 print_loss_total = 0
 print('%s (%d %d%%) %.4f' % (timeSince(start, iter / n_iters),
 iter, iter / n_iters * 100, print_loss_avg))

 if iter % plot_every == 0:
 plot_loss_avg = plot_loss_total / plot_every
 plot_losses.append(plot_loss_avg)
 plot_loss_total = 0

 showPlot(plot_losses)

In [16]:
def evaluate(encoder, decoder, sentence, max_length=MAX_LENGTH):
 with torch.no_grad():
 input_tensor = tensorFromSentence(input_lang, sentence)
 input_length = input_tensor.size()[0]
 encoder_hidden = encoder.initHidden()

 encoder_outputs = torch.zeros(max_length, encoder.hidden_size, device=device)

 for ei in range(input_length):
 encoder_output, encoder_hidden = encoder(input_tensor[ei],
 encoder_hidden)
 encoder_outputs[ei] += encoder_output[0, 0]

 decoder_input = torch.tensor([[SOS_token]], device=device) # SOS

 decoder_hidden = encoder_hidden

 decoded_words = []
 decoder_attentions = torch.zeros(max_length, max_length)

 for di in range(max_length):
 decoder_output, decoder_hidden, decoder_attention = decoder(
 decoder_input, decoder_hidden, encoder_outputs)
 decoder_attentions[di] = decoder_attention.data
 topv, topi = decoder_output.data.topk(1)
 if topi.item() == EOS_token:
 decoded_words.append('')
 break
 else:
 decoded_words.append(output_lang.index2word[topi.item()])

 decoder_input = topi.squeeze().detach()

 return decoded_words, decoder_attentions[:di + 1]

In [None]:
hidden_size = 256
encoder1 = EncoderRNN(input_lang.n_words, hidden_size).to(device)
attn_decoder1 = AttnDecoderRNN(hidden_size, output_lang.n_words, dropout_p=0.1).to(device)

trainIters(encoder1, attn_decoder1, 25000, print_every=20)

In [None]:
evaluate(encoder1, attn_decoder1, "Co tam u ciebie")

In [36]:
def evaluateAndShow(input_sentence):
 output_words = evaluate(
 encoder1, attn_decoder1, input_sentence)
 return " ".join(output_words[0])

In [37]:
evaluateAndShow("Co tam u cbie")

'let us be able to live up to it because these are real problems and real people and we have to deal with them now . '

In [38]:
temp = open('in.tsv', 'r').readlines()
data = []
for sent in temp:
 data.append(sent.replace('\n',''))

f=open('out.tsv','w+')
for sent in data:
 f.write(evaluateAndShow(sent).replace('', '') + '\n')

f.close()