This commit is contained in:
SzyGra 2021-01-27 03:54:11 +01:00
parent e5d8b26718
commit 2e7c5b13c0
12 changed files with 1267588 additions and 2030 deletions

2
.gitignore vendored Normal file
View File

@ -0,0 +1,2 @@
*.dict
data

File diff suppressed because it is too large Load Diff

25
lang.py Normal file
View File

@ -0,0 +1,25 @@
from nltk.tokenize import RegexpTokenizer
# Reserved vocabulary indices for the sequence delimiters.
SOS_token = 0  # start-of-sentence marker (decoder's first input)
EOS_token = 1  # end-of-sentence marker (appended to every encoded sentence)
# Word-level tokenizer: \w+ keeps alphanumeric runs and drops punctuation.
tokenizer = RegexpTokenizer(r'\w+')
class Lang:
    """Vocabulary for one language: word<->index maps plus word frequencies.

    Indices 0 and 1 are pre-assigned to the SOS and EOS markers.
    """

    def __init__(self, name):
        self.name = name
        self.word2index = {}
        self.word2count = {}
        self.index2word = {0: "SOS", 1: "EOS"}
        self.n_words = 2  # SOS and EOS already occupy slots 0 and 1

    def addSentence(self, sentence):
        """Register every token of *sentence* in the vocabulary."""
        for token in tokenizer.tokenize(sentence):
            self.addWord(token)

    def addWord(self, word):
        """Add *word* if unseen, otherwise bump its frequency count."""
        if word in self.word2index:
            self.word2count[word] += 1
        else:
            idx = self.n_words
            self.word2index[word] = idx
            self.word2count[word] = 1
            self.index2word[idx] = word
            self.n_words = idx + 1

78
lstm_model.py Normal file
View File

@ -0,0 +1,78 @@
import torch
from torch import nn
# BUGFIX: the hard-coded 'cuda' made the module unusable on CPU-only hosts;
# fall back to CPU, mirroring the device selection already used in prepare.py.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
import torch.nn.functional as F
import torch.nn.init as init
from lang import SOS_token, EOS_token
class EncoderRNN(nn.Module):
    """Single-layer LSTM encoder consuming one word index per forward step."""

    def __init__(self, input_size, hidden_size):
        super(EncoderRNN, self).__init__()
        self.hidden_size = hidden_size
        self.embedding = nn.Embedding(input_size, hidden_size)
        self.lstm = nn.LSTM(hidden_size, hidden_size)

    def forward(self, input, hidden):
        """Embed one token and advance the LSTM state by one step."""
        step = self.embedding(input).view(1, 1, -1)
        step, hidden = self.lstm(step, hidden)
        return step, hidden

    def initHidden(self):
        """Zero (h0, c0) pair for the LSTM state."""
        shape = (1, 1, self.hidden_size)
        return (torch.zeros(*shape, device=device),
                torch.zeros(*shape, device=device))
class DecoderRNN(nn.Module):
    """Plain (attention-free) LSTM decoder producing log-probabilities."""

    def __init__(self, hidden_size, output_size):
        super(DecoderRNN, self).__init__()
        self.hidden_size = hidden_size
        self.embedding = nn.Embedding(output_size, hidden_size)
        self.lstm = nn.LSTM(hidden_size, hidden_size)
        self.out = nn.Linear(hidden_size, output_size)
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, input, hidden):
        """Decode one step: embed, ReLU, LSTM, then log-softmax over vocab."""
        step = F.relu(self.embedding(input).view(1, 1, -1))
        step, hidden = self.lstm(step, hidden)
        logprobs = self.softmax(self.out(step[0]))
        return logprobs, hidden

    def initHidden(self):
        """Zero (h0, c0) pair for the LSTM state."""
        shape = (1, 1, self.hidden_size)
        return (torch.zeros(*shape, device=device),
                torch.zeros(*shape, device=device))
class AttnDecoderRNN(nn.Module):
    """LSTM decoder with attention over the encoder output matrix.

    Follows the PyTorch seq2seq tutorial layout: attention weights are
    computed from the current embedding concatenated with the hidden state,
    then applied (via bmm) to the (max_length, hidden_size) encoder outputs.
    """

    def __init__(self, hidden_size, output_size, dropout_p=0.1, max_length=300):
        super(AttnDecoderRNN, self).__init__()
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.dropout_p = dropout_p
        self.max_length = max_length
        self.embedding = nn.Embedding(self.output_size, self.hidden_size)
        # BUGFIX: attn consumes [embedded ; h], which is 2*hidden_size wide —
        # the original declared Linear(hidden_size, max_length) and crashed.
        self.attn = nn.Linear(self.hidden_size * 2, self.max_length)
        self.attn_combine = nn.Linear(self.hidden_size * 2, self.hidden_size)
        self.dropout = nn.Dropout(self.dropout_p)
        self.lstm = nn.LSTM(self.hidden_size, self.hidden_size)
        self.out = nn.Linear(self.hidden_size, self.output_size)

    def forward(self, input, hidden, encoder_outputs):
        """One decode step.

        input: (1, 1) token index; hidden: (h, c) LSTM state tuple;
        encoder_outputs: (max_length, hidden_size) matrix.
        Returns (log-probs (1, output_size), new hidden, attn weights (1, max_length)).
        """
        embedded = self.embedding(input).view(1, 1, -1)
        embedded = self.dropout(embedded)
        # BUGFIX: concatenate the 2-D slices (1, H) + (1, H) -> (1, 2H).
        # The original concatenated 3-D tensors along dim 1, producing a
        # (1, 2, H) tensor that made the subsequent bmm fail.
        attn_weights = F.softmax(
            self.attn(torch.cat((embedded[0], hidden[0][0]), 1)), dim=1)
        attn_applied = torch.bmm(attn_weights.unsqueeze(0),
                                 encoder_outputs.unsqueeze(0))
        output = torch.cat((embedded[0], attn_applied[0]), 1)
        output = self.attn_combine(output).unsqueeze(0)
        output = F.relu(output)
        output, hidden = self.lstm(output, hidden)
        output = F.log_softmax(self.out(output[0]), dim=1)
        # BUGFIX: removed the debug print(output.shape, hidden.shape) —
        # `hidden` is an (h, c) tuple with no .shape, so the print raised.
        return output, hidden, attn_weights

    def initHidden(self):
        """Zero (h0, c0) pair on the same device as the module's weights."""
        dev = next(self.parameters()).device
        return (torch.zeros(1, 1, self.hidden_size, device=dev),
                torch.zeros(1, 1, self.hidden_size, device=dev))

128
model_train.py Normal file
View File

@ -0,0 +1,128 @@
from lang import SOS_token, EOS_token
import torch
import random
import math
import time
from torch import nn, optim
import torch
from lang import EOS_token, tokenizer
import pickle
# Maximum sentence length (in tokens); must match the value used in prepare.py.
MAX_LENGTH = 300
# BUGFIX: hard-coded 'cuda' crashed on CPU-only machines; fall back to CPU
# (consistent with the device selection in prepare.py).
device = 'cuda' if torch.cuda.is_available() else 'cpu'
# Probability of feeding the ground-truth token back into the decoder.
teacher_forcing_ratio = 0.5


def _load_pickle(path):
    """Deserialize one pickled object from *path*."""
    with open(path, 'rb') as fh:
        return pickle.load(fh)


# Corpus pairs and the two vocabularies produced by prepare.py.
pairs = _load_pickle('data/pairs.pkl')
input_lang = _load_pickle('data/pl_lang.pkl')
output_lang = _load_pickle('data/en_lang.pkl')
def indexesFromSentence(lang, sentence):
    """Map *sentence* to known-word indices, silently dropping OOV tokens."""
    known = lang.word2index
    return [known[tok] for tok in tokenizer.tokenize(sentence) if tok in known]
def tensorFromSentence(lang, sentence):
    """Encode *sentence* as a (len, 1) long tensor of indices ending in EOS."""
    idx = indexesFromSentence(lang, sentence) + [EOS_token]
    return torch.tensor(idx, dtype=torch.long, device=device).view(-1, 1)
def tensorsFromPair(pair):
    """Turn a (source, target) sentence pair into a pair of index tensors."""
    src, tgt = pair[0], pair[1]
    return (tensorFromSentence(input_lang, src),
            tensorFromSentence(output_lang, tgt))
def asMinutes(s):
    """Format *s* seconds as 'Xm Ys'."""
    minutes, seconds = divmod(s, 60)
    return '%dm %ds' % (minutes, seconds)


def timeSince(since, percent):
    """Elapsed time since *since* plus an ETA extrapolated from *percent* done."""
    elapsed = time.time() - since
    estimated_total = elapsed / percent
    return '%s (- %s)' % (asMinutes(elapsed), asMinutes(estimated_total - elapsed))
def train(input_tensor, target_tensor, encoder, decoder, encoder_optimizer, decoder_optimizer, criterion, max_length=MAX_LENGTH):
    """Train on one (input, target) pair and return the mean per-token loss.

    input_tensor / target_tensor: (seq_len, 1) long tensors of word indices.
    Steps both optimizers before returning.
    """
    encoder_hidden = encoder.initHidden()
    encoder_optimizer.zero_grad()
    decoder_optimizer.zero_grad()
    input_length = input_tensor.size(0)
    target_length = target_tensor.size(0)
    # BUGFIX: the buffer must be (max_length, hidden_size). The original
    # allocated (max_length, max_length, hidden_size) and stored the scalar
    # encoder_output[0, 0, 0] per step, which both wasted memory and broke
    # the attention decoder's bmm over encoder_outputs.
    encoder_outputs = torch.zeros(max_length, encoder.hidden_size, device=device)
    loss = 0
    for ei in range(input_length):
        encoder_output, encoder_hidden = encoder(
            input_tensor[ei], encoder_hidden)
        encoder_outputs[ei] = encoder_output[0, 0]
    decoder_input = torch.tensor([[SOS_token]], device=device)
    decoder_hidden = encoder_hidden
    use_teacher_forcing = True if random.random() < teacher_forcing_ratio else False
    if use_teacher_forcing:
        # Teacher forcing: feed the ground-truth token as the next input.
        for di in range(target_length):
            decoder_output, decoder_hidden, decoder_attention = decoder(
                decoder_input, decoder_hidden, encoder_outputs)
            loss += criterion(decoder_output, target_tensor[di])
            decoder_input = target_tensor[di]
    else:
        # Free running: feed the model's own greedy prediction back in.
        for di in range(target_length):
            decoder_output, decoder_hidden, decoder_attention = decoder(
                decoder_input, decoder_hidden, encoder_outputs)
            topv, topi = decoder_output.topk(1)
            decoder_input = topi.squeeze().detach()  # detach from the graph
            loss += criterion(decoder_output, target_tensor[di])
            if decoder_input.item() == EOS_token:
                break
    loss.backward()
    encoder_optimizer.step()
    decoder_optimizer.step()
    return loss.item() / target_length
def trainIters(encoder, decoder, n_iters, print_every=10, plot_every=100, learning_rate=0.01):
    """Train on *n_iters* random corpus pairs and save both state dicts.

    Logs the running average loss every *print_every* iterations.
    """
    start = time.time()
    print_loss_total = 0  # reset every print_every
    plot_loss_total = 0   # reset every plot_every
    encoder_optimizer = optim.Adam(encoder.parameters(), lr=learning_rate)
    decoder_optimizer = optim.Adam(decoder.parameters(), lr=learning_rate)
    training_pairs = [tensorsFromPair(random.choice(pairs))
                      for _ in range(n_iters)]
    criterion = nn.NLLLoss()
    for step in range(1, n_iters + 1):
        input_tensor, target_tensor = training_pairs[step - 1]
        loss = train(input_tensor, target_tensor, encoder,
                     decoder, encoder_optimizer, decoder_optimizer, criterion)
        print_loss_total += loss
        plot_loss_total += loss
        if step % print_every == 0:
            print_loss_avg = print_loss_total / print_every
            print_loss_total = 0
            print('%s (%d %d%%) %.4f' % (timeSince(start, step / n_iters),
                                         step, step / n_iters * 100, print_loss_avg))
    torch.save(encoder.state_dict(), 'encoder.dict')
    torch.save(decoder.state_dict(), 'decoder.dict')

58
predict.py Normal file
View File

@ -0,0 +1,58 @@
from model_train import tensorFromSentence, SOS_token, MAX_LENGTH, device, EOS_token
import pickle
from lstm_model import EncoderRNN, DecoderRNN
import sys
import torch
def _load(path):
    """Deserialize one pickled object from *path*."""
    with open(path, 'rb') as fh:
        return pickle.load(fh)


# Vocabularies produced by prepare.py.
input_lang = _load('data/pl_lang.pkl')
output_lang = _load('data/en_lang.pkl')
def evaluate(encoder, decoder, sentence, max_length=MAX_LENGTH):
    """Greedily decode *sentence*; return the list of output-language words.

    Runs under torch.no_grad(); stops at EOS or after *max_length* steps.
    """
    with torch.no_grad():
        input_tensor = tensorFromSentence(input_lang, sentence)
        input_length = input_tensor.size()[0]
        encoder_hidden = encoder.initHidden()
        encoder_outputs = torch.zeros(max_length, encoder.hidden_size, device=device)
        # Encode the whole input, collecting one output vector per token.
        for ei in range(input_length):
            encoder_output, encoder_hidden = encoder(input_tensor[ei],
                                                     encoder_hidden)
            encoder_outputs[ei] = encoder_output[0, 0]
        # CLEANUP: removed the unused `loss = 0` and `decoder_attentions`
        # locals the original carried over from the training loop.
        decoder_input = torch.tensor([[SOS_token]], dtype=torch.long, device=device).view(-1, 1)  # SOS
        decoder_hidden = encoder_hidden
        decoded_words = []
        # Greedy decoding: pick the argmax token each step.
        for di in range(max_length):
            decoder_output, decoder_hidden = decoder(
                decoder_input, decoder_hidden)
            topv, topi = decoder_output.data.topk(1)
            if topi.item() == EOS_token:
                break
            decoded_words.append(output_lang.index2word[topi.item()])
            decoder_input = topi.squeeze().detach()
        return decoded_words
# Rebuild the (attention-free) encoder/decoder pair at training width and
# restore the saved weights.
hidden_size = 256
encoder = EncoderRNN(input_lang.n_words, hidden_size).to(device)
decoder = DecoderRNN(hidden_size, output_lang.n_words).to(device)
encoder.load_state_dict(torch.load('encoder.dict'))
decoder.load_state_dict(torch.load('decoder.dict'))
# Translate every stdin line greedily and print the joined words.
for raw in sys.stdin:
    dec_words = evaluate(encoder, decoder, raw.rstrip(), MAX_LENGTH)
    print(' '.join(dec_words))

90
prepare.py Normal file
View File

@ -0,0 +1,90 @@
from __future__ import unicode_literals, print_function, division
from io import open
import unicodedata
import string
import re
import random
import pickle
import torch
import torch.nn as nn
from torch import optim
import torch.nn.functional as F
from lang import *
# Prefer GPU when available, otherwise fall back to CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Upper bound (in tokens) on sentence length; longer pairs are filtered out.
MAX_LENGTH = 300
# Turn a Unicode string to plain ASCII, thanks to
# https://stackoverflow.com/a/518232/2809427
def unicodeToAscii(s):
    """Strip combining marks: NFD-decompose, then drop category-'Mn' chars."""
    decomposed = unicodedata.normalize('NFD', s)
    return ''.join(ch for ch in decomposed
                   if unicodedata.category(ch) != 'Mn')
# Keep only pairs where both sides are shorter than MAX_LENGTH tokens
def filterPair(p):
    """True when both sentences of the pair tokenize to < MAX_LENGTH words."""
    src_len = len(tokenizer.tokenize(p[0]))
    tgt_len = len(tokenizer.tokenize(p[1]))
    return src_len < MAX_LENGTH and tgt_len < MAX_LENGTH
def filterPairs(pairs):
    """Drop every pair that fails the length filter."""
    return list(filter(filterPair, pairs))
def normalizeString(s):
    """Insert a space before each '.', '!' or '?' so punctuation tokenizes apart."""
    return re.sub(r"([.!?])", r" \1", s)
def readLangs(lang1, lang2, reverse=False):
    """Load the parallel corpus and build Lang objects plus sentence pairs.

    Reads train/in.tsv (source) and train/expected.tsv (target) line by
    line; each pair is stored [target, source] before the optional reversal.
    """
    print("Reading lines...")

    def _read_lines(path):
        with open(path, 'r', encoding='utf-8') as fh:
            return [ln.rstrip() for ln in fh]

    lines_pl = _read_lines('train/in.tsv')
    lines_en = _read_lines('train/expected.tsv')
    # Normalize both sides of every aligned line pair.
    pairs = [[normalizeString(e), normalizeString(p)]
             for p, e in zip(lines_pl, lines_en)]
    if reverse:
        pairs = [list(reversed(pr)) for pr in pairs]
        input_lang = Lang(lang2)
        output_lang = Lang(lang1)
    else:
        input_lang = Lang(lang1)
        output_lang = Lang(lang2)
    return input_lang, output_lang, pairs
def prepareData(lang1, lang2, reverse=False):
    """Build vocabularies and length-filtered sentence pairs for the corpus."""
    input_lang, output_lang, pairs = readLangs(lang1, lang2, reverse)
    print("Read %s sentence pairs" % len(pairs))
    pairs = filterPairs(pairs)
    print("Trimmed to %s sentence pairs" % len(pairs))
    print("Counting words...")
    # Populate both vocabularies from the surviving pairs.
    for src, tgt in pairs:
        input_lang.addSentence(src)
        output_lang.addSentence(tgt)
    print("Counted words:")
    print(input_lang.name, input_lang.n_words)
    print(output_lang.name, output_lang.n_words)
    return input_lang, output_lang, pairs
input_lang, output_lang, pairs = prepareData('pl', 'eng', True)
print(random.choice(pairs))
# Persist everything for model_train.py / predict.py to load.
for path, obj in (('data/pairs.pkl', pairs),
                  ('data/pl_lang.pkl', input_lang),
                  ('data/en_lang.pkl', output_lang)):
    with open(path, 'wb+') as fh:
        pickle.dump(obj, fh, protocol=pickle.HIGHEST_PROTOCOL)

View File

@ -1,30 +0,0 @@
# -*- coding: utf-8 -*-
from transformers import MarianTokenizer, MarianMTModel
import sys
from typing import List
from numba import jit
def count():
    """Read all of stdin into a {line_number: stripped_line} dict.

    BUGFIX: dropped the numba @jit decorator — this is pure-Python dict and
    stdin work that numba cannot compile in nopython mode, so the decorator
    only triggered the deprecated object-mode fallback with zero speedup.
    """
    return {doc_id: line.rstrip() for doc_id, line in enumerate(sys.stdin)}
def translate(data):
    """Translate each stored line with the global MarianMT model and print it.

    Relies on the module-level `tok` and `model` set up in __main__.
    """
    for doc_id in data.keys():
        batch = tok.prepare_seq2seq_batch(src_texts=[data[doc_id]])
        generated = model.generate(**batch)
        # NOTE: the original bound this to a local named `translate`,
        # shadowing the function itself; renamed for clarity.
        decoded = tok.batch_decode(generated, skip_special_tokens=True)
        print(decoded[0])
if __name__ == "__main__":
    # Polish -> English MarianMT checkpoint from the Helsinki-NLP hub.
    src = 'pl'
    trg = 'en'
    mname = f'Helsinki-NLP/opus-mt-{src}-{trg}'
    model = MarianMTModel.from_pretrained(mname)
    tok = MarianTokenizer.from_pretrained(mname)
    # Slurp stdin, then translate line by line.
    translate(count())

File diff suppressed because it is too large Load Diff

7
train.py Normal file
View File

@ -0,0 +1,7 @@
from lstm_model import EncoderRNN, DecoderRNN, AttnDecoderRNN
from model_train import *
# Model width shared by the encoder embedding/LSTM and the decoder.
hidden_size = 256
encoder1 = EncoderRNN(input_lang.n_words, hidden_size).to(device)
attn_decoder1 = AttnDecoderRNN(hidden_size, output_lang.n_words).to(device)
# Train on 10k random pairs, reporting average loss every 100 iterations.
trainIters(encoder1, attn_decoder1, 10000, print_every=100)

632600
train/expected.tsv Normal file

File diff suppressed because it is too large Load Diff

632600
train/in.tsv Normal file

File diff suppressed because it is too large Load Diff