This commit is contained in:
SzyGra 2021-01-27 03:54:11 +01:00
parent e5d8b26718
commit 2e7c5b13c0
12 changed files with 1267588 additions and 2030 deletions

2
.gitignore vendored Normal file
View File

@ -0,0 +1,2 @@
*.dict
data

File diff suppressed because it is too large Load Diff

25
lang.py Normal file
View File

@ -0,0 +1,25 @@
from nltk.tokenize import RegexpTokenizer
# Reserved vocabulary indices for the sequence delimiters.
SOS_token = 0  # start-of-sentence marker (decoder's first input)
EOS_token = 1  # end-of-sentence marker (appended to every encoded sentence)
# Word-level tokenizer: \w+ keeps alphanumeric runs and drops punctuation.
tokenizer = RegexpTokenizer(r'\w+')
class Lang:
    """Vocabulary for one language: word<->index maps plus word frequencies.

    Indices 0 and 1 are pre-assigned to the SOS and EOS markers.
    """

    def __init__(self, name):
        self.name = name
        self.word2index = {}
        self.word2count = {}
        self.index2word = {0: "SOS", 1: "EOS"}
        self.n_words = 2  # SOS and EOS already occupy slots 0 and 1

    def addSentence(self, sentence):
        """Register every token of *sentence* in the vocabulary."""
        for token in tokenizer.tokenize(sentence):
            self.addWord(token)

    def addWord(self, word):
        """Add *word* if unseen, otherwise bump its frequency count."""
        if word in self.word2index:
            self.word2count[word] += 1
        else:
            idx = self.n_words
            self.word2index[word] = idx
            self.word2count[word] = 1
            self.index2word[idx] = word
            self.n_words = idx + 1

78
lstm_model.py Normal file
View File

@ -0,0 +1,78 @@
import torch
from torch import nn
# BUGFIX: the hard-coded 'cuda' made the module unusable on CPU-only hosts;
# fall back to CPU, mirroring the device selection already used in prepare.py.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
import torch.nn.functional as F
import torch.nn.init as init
from lang import SOS_token, EOS_token
class EncoderRNN(nn.Module):
    """Single-layer LSTM encoder consuming one word index per forward step."""

    def __init__(self, input_size, hidden_size):
        super(EncoderRNN, self).__init__()
        self.hidden_size = hidden_size
        self.embedding = nn.Embedding(input_size, hidden_size)
        self.lstm = nn.LSTM(hidden_size, hidden_size)

    def forward(self, input, hidden):
        """Embed one token and advance the LSTM state by one step."""
        step = self.embedding(input).view(1, 1, -1)
        step, hidden = self.lstm(step, hidden)
        return step, hidden

    def initHidden(self):
        """Zero (h0, c0) pair for the LSTM state."""
        shape = (1, 1, self.hidden_size)
        return (torch.zeros(*shape, device=device),
                torch.zeros(*shape, device=device))
class DecoderRNN(nn.Module):
    """Plain (attention-free) LSTM decoder producing log-probabilities."""

    def __init__(self, hidden_size, output_size):
        super(DecoderRNN, self).__init__()
        self.hidden_size = hidden_size
        self.embedding = nn.Embedding(output_size, hidden_size)
        self.lstm = nn.LSTM(hidden_size, hidden_size)
        self.out = nn.Linear(hidden_size, output_size)
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, input, hidden):
        """Decode one step: embed, ReLU, LSTM, then log-softmax over vocab."""
        step = F.relu(self.embedding(input).view(1, 1, -1))
        step, hidden = self.lstm(step, hidden)
        logprobs = self.softmax(self.out(step[0]))
        return logprobs, hidden

    def initHidden(self):
        """Zero (h0, c0) pair for the LSTM state."""
        shape = (1, 1, self.hidden_size)
        return (torch.zeros(*shape, device=device),
                torch.zeros(*shape, device=device))
class AttnDecoderRNN(nn.Module):
    """LSTM decoder with attention over the encoder output matrix.

    Follows the PyTorch seq2seq tutorial layout: attention weights are
    computed from the current embedding concatenated with the hidden state,
    then applied (via bmm) to the (max_length, hidden_size) encoder outputs.
    """

    def __init__(self, hidden_size, output_size, dropout_p=0.1, max_length=300):
        super(AttnDecoderRNN, self).__init__()
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.dropout_p = dropout_p
        self.max_length = max_length
        self.embedding = nn.Embedding(self.output_size, self.hidden_size)
        # BUGFIX: attn consumes [embedded ; h], which is 2*hidden_size wide —
        # the original declared Linear(hidden_size, max_length) and crashed.
        self.attn = nn.Linear(self.hidden_size * 2, self.max_length)
        self.attn_combine = nn.Linear(self.hidden_size * 2, self.hidden_size)
        self.dropout = nn.Dropout(self.dropout_p)
        self.lstm = nn.LSTM(self.hidden_size, self.hidden_size)
        self.out = nn.Linear(self.hidden_size, self.output_size)

    def forward(self, input, hidden, encoder_outputs):
        """One decode step.

        input: (1, 1) token index; hidden: (h, c) LSTM state tuple;
        encoder_outputs: (max_length, hidden_size) matrix.
        Returns (log-probs (1, output_size), new hidden, attn weights (1, max_length)).
        """
        embedded = self.embedding(input).view(1, 1, -1)
        embedded = self.dropout(embedded)
        # BUGFIX: concatenate the 2-D slices (1, H) + (1, H) -> (1, 2H).
        # The original concatenated 3-D tensors along dim 1, producing a
        # (1, 2, H) tensor that made the subsequent bmm fail.
        attn_weights = F.softmax(
            self.attn(torch.cat((embedded[0], hidden[0][0]), 1)), dim=1)
        attn_applied = torch.bmm(attn_weights.unsqueeze(0),
                                 encoder_outputs.unsqueeze(0))
        output = torch.cat((embedded[0], attn_applied[0]), 1)
        output = self.attn_combine(output).unsqueeze(0)
        output = F.relu(output)
        output, hidden = self.lstm(output, hidden)
        output = F.log_softmax(self.out(output[0]), dim=1)
        # BUGFIX: removed the debug print(output.shape, hidden.shape) —
        # `hidden` is an (h, c) tuple with no .shape, so the print raised.
        return output, hidden, attn_weights

    def initHidden(self):
        """Zero (h0, c0) pair on the same device as the module's weights."""
        dev = next(self.parameters()).device
        return (torch.zeros(1, 1, self.hidden_size, device=dev),
                torch.zeros(1, 1, self.hidden_size, device=dev))

128
model_train.py Normal file
View File

@ -0,0 +1,128 @@
from lang import SOS_token, EOS_token
import torch
import random
import math
import time
from torch import nn, optim
import torch
from lang import EOS_token, tokenizer
import pickle
# Maximum sentence length (in tokens); must match the value used in prepare.py.
MAX_LENGTH = 300
# BUGFIX: hard-coded 'cuda' crashed on CPU-only machines; fall back to CPU
# (consistent with the device selection in prepare.py).
device = 'cuda' if torch.cuda.is_available() else 'cpu'
# Probability of feeding the ground-truth token back into the decoder.
teacher_forcing_ratio = 0.5


def _load_pickle(path):
    """Deserialize one pickled object from *path*."""
    with open(path, 'rb') as fh:
        return pickle.load(fh)


# Corpus pairs and the two vocabularies produced by prepare.py.
pairs = _load_pickle('data/pairs.pkl')
input_lang = _load_pickle('data/pl_lang.pkl')
output_lang = _load_pickle('data/en_lang.pkl')
def indexesFromSentence(lang, sentence):
    """Map *sentence* to known-word indices, silently dropping OOV tokens."""
    known = lang.word2index
    return [known[tok] for tok in tokenizer.tokenize(sentence) if tok in known]
def tensorFromSentence(lang, sentence):
    """Encode *sentence* as a (len, 1) long tensor of indices ending in EOS."""
    idx = indexesFromSentence(lang, sentence) + [EOS_token]
    return torch.tensor(idx, dtype=torch.long, device=device).view(-1, 1)
def tensorsFromPair(pair):
    """Turn a (source, target) sentence pair into a pair of index tensors."""
    src, tgt = pair[0], pair[1]
    return (tensorFromSentence(input_lang, src),
            tensorFromSentence(output_lang, tgt))
def asMinutes(s):
    """Format *s* seconds as 'Xm Ys'."""
    minutes, seconds = divmod(s, 60)
    return '%dm %ds' % (minutes, seconds)


def timeSince(since, percent):
    """Elapsed time since *since* plus an ETA extrapolated from *percent* done."""
    elapsed = time.time() - since
    estimated_total = elapsed / percent
    return '%s (- %s)' % (asMinutes(elapsed), asMinutes(estimated_total - elapsed))
def train(input_tensor, target_tensor, encoder, decoder, encoder_optimizer, decoder_optimizer, criterion, max_length=MAX_LENGTH):
    """Train on one (input, target) pair and return the mean per-token loss.

    input_tensor / target_tensor: (seq_len, 1) long tensors of word indices.
    Steps both optimizers before returning.
    """
    encoder_hidden = encoder.initHidden()
    encoder_optimizer.zero_grad()
    decoder_optimizer.zero_grad()
    input_length = input_tensor.size(0)
    target_length = target_tensor.size(0)
    # BUGFIX: the buffer must be (max_length, hidden_size). The original
    # allocated (max_length, max_length, hidden_size) and stored the scalar
    # encoder_output[0, 0, 0] per step, which both wasted memory and broke
    # the attention decoder's bmm over encoder_outputs.
    encoder_outputs = torch.zeros(max_length, encoder.hidden_size, device=device)
    loss = 0
    for ei in range(input_length):
        encoder_output, encoder_hidden = encoder(
            input_tensor[ei], encoder_hidden)
        encoder_outputs[ei] = encoder_output[0, 0]
    decoder_input = torch.tensor([[SOS_token]], device=device)
    decoder_hidden = encoder_hidden
    use_teacher_forcing = True if random.random() < teacher_forcing_ratio else False
    if use_teacher_forcing:
        # Teacher forcing: feed the ground-truth token as the next input.
        for di in range(target_length):
            decoder_output, decoder_hidden, decoder_attention = decoder(
                decoder_input, decoder_hidden, encoder_outputs)
            loss += criterion(decoder_output, target_tensor[di])
            decoder_input = target_tensor[di]
    else:
        # Free running: feed the model's own greedy prediction back in.
        for di in range(target_length):
            decoder_output, decoder_hidden, decoder_attention = decoder(
                decoder_input, decoder_hidden, encoder_outputs)
            topv, topi = decoder_output.topk(1)
            decoder_input = topi.squeeze().detach()  # detach from the graph
            loss += criterion(decoder_output, target_tensor[di])
            if decoder_input.item() == EOS_token:
                break
    loss.backward()
    encoder_optimizer.step()
    decoder_optimizer.step()
    return loss.item() / target_length
def trainIters(encoder, decoder, n_iters, print_every=10, plot_every=100, learning_rate=0.01):
    """Train on *n_iters* random corpus pairs and save both state dicts.

    Logs the running average loss every *print_every* iterations.
    """
    start = time.time()
    print_loss_total = 0  # reset every print_every
    plot_loss_total = 0   # reset every plot_every
    encoder_optimizer = optim.Adam(encoder.parameters(), lr=learning_rate)
    decoder_optimizer = optim.Adam(decoder.parameters(), lr=learning_rate)
    training_pairs = [tensorsFromPair(random.choice(pairs))
                      for _ in range(n_iters)]
    criterion = nn.NLLLoss()
    for step in range(1, n_iters + 1):
        input_tensor, target_tensor = training_pairs[step - 1]
        loss = train(input_tensor, target_tensor, encoder,
                     decoder, encoder_optimizer, decoder_optimizer, criterion)
        print_loss_total += loss
        plot_loss_total += loss
        if step % print_every == 0:
            print_loss_avg = print_loss_total / print_every
            print_loss_total = 0
            print('%s (%d %d%%) %.4f' % (timeSince(start, step / n_iters),
                                         step, step / n_iters * 100, print_loss_avg))
    torch.save(encoder.state_dict(), 'encoder.dict')
    torch.save(decoder.state_dict(), 'decoder.dict')

58
predict.py Normal file
View File

@ -0,0 +1,58 @@
from model_train import tensorFromSentence, SOS_token, MAX_LENGTH, device, EOS_token
import pickle
from lstm_model import EncoderRNN, DecoderRNN
import sys
import torch
def _load(path):
    """Deserialize one pickled object from *path*."""
    with open(path, 'rb') as fh:
        return pickle.load(fh)


# Vocabularies produced by prepare.py.
input_lang = _load('data/pl_lang.pkl')
output_lang = _load('data/en_lang.pkl')
def evaluate(encoder, decoder, sentence, max_length=MAX_LENGTH):
    """Greedily decode *sentence*; return the list of output-language words.

    Runs under torch.no_grad(); stops at EOS or after *max_length* steps.
    """
    with torch.no_grad():
        input_tensor = tensorFromSentence(input_lang, sentence)
        input_length = input_tensor.size()[0]
        encoder_hidden = encoder.initHidden()
        encoder_outputs = torch.zeros(max_length, encoder.hidden_size, device=device)
        # Encode the whole input, collecting one output vector per token.
        for ei in range(input_length):
            encoder_output, encoder_hidden = encoder(input_tensor[ei],
                                                     encoder_hidden)
            encoder_outputs[ei] = encoder_output[0, 0]
        # CLEANUP: removed the unused `loss = 0` and `decoder_attentions`
        # locals the original carried over from the training loop.
        decoder_input = torch.tensor([[SOS_token]], dtype=torch.long, device=device).view(-1, 1)  # SOS
        decoder_hidden = encoder_hidden
        decoded_words = []
        # Greedy decoding: pick the argmax token each step.
        for di in range(max_length):
            decoder_output, decoder_hidden = decoder(
                decoder_input, decoder_hidden)
            topv, topi = decoder_output.data.topk(1)
            if topi.item() == EOS_token:
                break
            decoded_words.append(output_lang.index2word[topi.item()])
            decoder_input = topi.squeeze().detach()
        return decoded_words
# Rebuild the (attention-free) encoder/decoder pair at training width and
# restore the saved weights.
hidden_size = 256
encoder = EncoderRNN(input_lang.n_words, hidden_size).to(device)
decoder = DecoderRNN(hidden_size, output_lang.n_words).to(device)
encoder.load_state_dict(torch.load('encoder.dict'))
decoder.load_state_dict(torch.load('decoder.dict'))
# Translate every stdin line greedily and print the joined words.
for raw in sys.stdin:
    dec_words = evaluate(encoder, decoder, raw.rstrip(), MAX_LENGTH)
    print(' '.join(dec_words))

90
prepare.py Normal file
View File

@ -0,0 +1,90 @@
from __future__ import unicode_literals, print_function, division
from io import open
import unicodedata
import string
import re
import random
import pickle
import torch
import torch.nn as nn
from torch import optim
import torch.nn.functional as F
from lang import *
# Prefer GPU when available, otherwise fall back to CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Upper bound (in tokens) on sentence length; longer pairs are filtered out.
MAX_LENGTH = 300
# Turn a Unicode string to plain ASCII, thanks to
# https://stackoverflow.com/a/518232/2809427
def unicodeToAscii(s):
    """Strip combining marks: NFD-decompose, then drop category-'Mn' chars."""
    decomposed = unicodedata.normalize('NFD', s)
    return ''.join(ch for ch in decomposed
                   if unicodedata.category(ch) != 'Mn')
# Keep only pairs where both sides are shorter than MAX_LENGTH tokens
def filterPair(p):
    """True when both sentences of the pair tokenize to < MAX_LENGTH words."""
    src_len = len(tokenizer.tokenize(p[0]))
    tgt_len = len(tokenizer.tokenize(p[1]))
    return src_len < MAX_LENGTH and tgt_len < MAX_LENGTH
def filterPairs(pairs):
    """Drop every pair that fails the length filter."""
    return list(filter(filterPair, pairs))
def normalizeString(s):
    """Insert a space before each '.', '!' or '?' so punctuation tokenizes apart."""
    return re.sub(r"([.!?])", r" \1", s)
def readLangs(lang1, lang2, reverse=False):
    """Load the parallel corpus and build Lang objects plus sentence pairs.

    Reads train/in.tsv (source) and train/expected.tsv (target) line by
    line; each pair is stored [target, source] before the optional reversal.
    """
    print("Reading lines...")

    def _read_lines(path):
        with open(path, 'r', encoding='utf-8') as fh:
            return [ln.rstrip() for ln in fh]

    lines_pl = _read_lines('train/in.tsv')
    lines_en = _read_lines('train/expected.tsv')
    # Normalize both sides of every aligned line pair.
    pairs = [[normalizeString(e), normalizeString(p)]
             for p, e in zip(lines_pl, lines_en)]
    if reverse:
        pairs = [list(reversed(pr)) for pr in pairs]
        input_lang = Lang(lang2)
        output_lang = Lang(lang1)
    else:
        input_lang = Lang(lang1)
        output_lang = Lang(lang2)
    return input_lang, output_lang, pairs
def prepareData(lang1, lang2, reverse=False):
    """Build vocabularies and length-filtered sentence pairs for the corpus."""
    input_lang, output_lang, pairs = readLangs(lang1, lang2, reverse)
    print("Read %s sentence pairs" % len(pairs))
    pairs = filterPairs(pairs)
    print("Trimmed to %s sentence pairs" % len(pairs))
    print("Counting words...")
    # Populate both vocabularies from the surviving pairs.
    for src, tgt in pairs:
        input_lang.addSentence(src)
        output_lang.addSentence(tgt)
    print("Counted words:")
    print(input_lang.name, input_lang.n_words)
    print(output_lang.name, output_lang.n_words)
    return input_lang, output_lang, pairs
input_lang, output_lang, pairs = prepareData('pl', 'eng', True)
print(random.choice(pairs))
# Persist everything for model_train.py / predict.py to load.
for path, obj in (('data/pairs.pkl', pairs),
                  ('data/pl_lang.pkl', input_lang),
                  ('data/en_lang.pkl', output_lang)):
    with open(path, 'wb+') as fh:
        pickle.dump(obj, fh, protocol=pickle.HIGHEST_PROTOCOL)

View File

@ -1,30 +0,0 @@
# -*- coding: utf-8 -*-
from transformers import MarianTokenizer, MarianMTModel
import sys
from typing import List
from numba import jit
def count():
    """Read all of stdin into a {line_number: stripped_line} dict.

    BUGFIX: dropped the numba @jit decorator — this is pure-Python dict and
    stdin work that numba cannot compile in nopython mode, so the decorator
    only triggered the deprecated object-mode fallback with zero speedup.
    """
    return {doc_id: line.rstrip() for doc_id, line in enumerate(sys.stdin)}
def translate(data):
    """Translate each stored line with the global MarianMT model and print it.

    Relies on the module-level `tok` and `model` set up in __main__.
    """
    for doc_id in data.keys():
        batch = tok.prepare_seq2seq_batch(src_texts=[data[doc_id]])
        generated = model.generate(**batch)
        # NOTE: the original bound this to a local named `translate`,
        # shadowing the function itself; renamed for clarity.
        decoded = tok.batch_decode(generated, skip_special_tokens=True)
        print(decoded[0])
if __name__ == "__main__":
    # Polish -> English MarianMT checkpoint from the Helsinki-NLP hub.
    src = 'pl'
    trg = 'en'
    mname = f'Helsinki-NLP/opus-mt-{src}-{trg}'
    model = MarianMTModel.from_pretrained(mname)
    tok = MarianTokenizer.from_pretrained(mname)
    # Slurp stdin, then translate line by line.
    translate(count())

File diff suppressed because it is too large Load Diff

7
train.py Normal file
View File

@ -0,0 +1,7 @@
from lstm_model import EncoderRNN, DecoderRNN, AttnDecoderRNN
from model_train import *
# Model width shared by the encoder embedding/LSTM and the decoder.
hidden_size = 256
encoder1 = EncoderRNN(input_lang.n_words, hidden_size).to(device)
attn_decoder1 = AttnDecoderRNN(hidden_size, output_lang.n_words).to(device)
# Train on 10k random pairs, reporting average loss every 100 iterations.
trainIters(encoder1, attn_decoder1, 10000, print_every=100)

632600
train/expected.tsv Normal file

File diff suppressed because it is too large Load Diff

632600
train/in.tsv Normal file

File diff suppressed because it is too large Load Diff