#!/usr/bin/env python
# coding: utf-8

# In[2]:

import torch
from torch import nn, optim
from torch.utils.data import DataLoader
import numpy as np
from collections import Counter
import re
import lzma
import csv

# In[3]:

device = 'cuda' if torch.cuda.is_available() else 'cpu'

# In[4]:

class Dataset(torch.utils.data.Dataset):
    def __init__(self, sequence_length):
        self.sequence_length = sequence_length
        self.words = self.load()
        self.uniq_words = self.get_uniq_words()

        self.index_to_word = {index: word for index, word in enumerate(self.uniq_words)}
        self.word_to_index = {word: index for index, word in enumerate(self.uniq_words)}

        self.words_indexes = [self.word_to_index[w] for w in self.words]

    def load(self):
        # Left context is column 6, right context is column 7. The corpus
        # encodes newlines as escaped backslash sequences, so strip both
        # variants before tokenizing.
        data = lzma.open('train/in.tsv.xz').read().decode('UTF-8').split('\n')
        data = [line.split('\t') for line in data][:-1]
        data = [[i[6].replace('\\\\n', ' '), i[7].replace('\\\\n', ' ')] for i in data]

        # The gold middle words come from the parallel expected.tsv file.
        words = []
        with open('train/expected.tsv') as file:
            tsv_file = csv.reader(file, delimiter="\t")
            for line in tsv_file:
                words.append(line[0])

        # Stitch each example back together as "left gold right"; only the
        # first 5000 examples are used to keep training time manageable.
        text = []
        # for i in range(len(data) - 1):
        for i in range(5000):
            t = data[i][0] + ' ' + words[i] + ' ' + data[i][1] + ' '
            text.append(t.replace('\\n', ' '))

        text = ' '.join(text).lower()
        text = re.sub('[^a-z ]', '', text)
        return text.split(' ')

    def get_uniq_words(self):
        word_counts = Counter(self.words)
        return sorted(word_counts, key=word_counts.get, reverse=True)

    def __len__(self):
        return len(self.words_indexes) - self.sequence_length

    def __getitem__(self, index):
        # (input window, the same window shifted one word to the right)
        return (
            torch.tensor(self.words_indexes[index:index + self.sequence_length]),
            torch.tensor(self.words_indexes[index + 1:index + self.sequence_length + 1]),
        )

# In[5]:

dataset = Dataset(5)

# In[6]:

dataset[200]

# In[7]:

[dataset.index_to_word[x] for x in [0, 231, 19, 98, 189]]

# In[8]:

[dataset.index_to_word[x] for x in [231, 19, 98, 189, 5]]

# In[9]:

input_tensor = torch.tensor([[0, 231, 19, 98, 189]], dtype=torch.long).to(device)

# In[ ]:

class Model(nn.Module):
    def __init__(self, vocab_size):
        super(Model, self).__init__()
        self.lstm_size = 128
        self.embedding_dim = 128
        self.num_layers = 3

        self.embedding = nn.Embedding(
            num_embeddings=vocab_size,
            embedding_dim=self.embedding_dim,
        )
        # batch_first=True so the LSTM sees (batch, seq, features), matching
        # the (batch, seq) index tensors produced by the DataLoader.
        self.lstm = nn.LSTM(
            input_size=self.embedding_dim,
            hidden_size=self.lstm_size,
            num_layers=self.num_layers,
            dropout=0.2,
            batch_first=True,
        )
        self.fc = nn.Linear(self.lstm_size, vocab_size)

    def forward(self, x, prev_state=None):
        embed = self.embedding(x)
        output, state = self.lstm(embed, prev_state)
        logits = self.fc(output)
        return logits, state

    def init_state(self, batch_size):
        # With batch_first=True the hidden state is still (layers, batch, hidden).
        return (torch.zeros(self.num_layers, batch_size, self.lstm_size).to(device),
                torch.zeros(self.num_layers, batch_size, self.lstm_size).to(device))

# In[ ]:

# Quick shape check. The vocabulary size, not the number of training windows,
# determines the embedding and output dimensions.
model = Model(len(dataset.uniq_words)).to(device)

# In[ ]:

y_pred, state_h = model(input_tensor)

# In[ ]:

y_pred

# In[ ]:

y_pred.shape

# In[ ]:

def train(dataset, model, max_epochs, batch_size):
    model.train()

    dataloader = DataLoader(dataset, batch_size=batch_size)
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=0.001)

    for epoch in range(max_epochs):
        for batch, (x, y) in enumerate(dataloader):
            optimizer.zero_grad()
            x = x.to(device)
            y = y.to(device)

            y_pred, state_h = model(x)
            # CrossEntropyLoss expects (batch, classes, seq), so move the
            # vocabulary dimension in front of the sequence dimension.
            loss = criterion(y_pred.transpose(1, 2), y)

            loss.backward()
            optimizer.step()

            print({'epoch': epoch, 'batch': batch, 'of': len(dataloader), 'loss': loss.item()})

# In[ ]:

model = Model(vocab_size=len(dataset.uniq_words)).to(device)
train(dataset, model, 1, 64)
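# In[ ]:

# Not part of the original pipeline: a minimal sketch of free-running text
# generation with the model trained above, sampling one word at a time from
# the softmax distribution and carrying the LSTM state between steps. The
# function name and the seed prompt are illustrative, not from the source.
def generate(dataset, model, seed_text, length=20):
    model.eval()
    words = seed_text.split(' ')
    x = torch.tensor([[dataset.word_to_index[w] for w in words]]).to(device)
    state = None
    with torch.no_grad():
        for _ in range(length):
            logits, state = model(x, state)
            p = torch.nn.functional.softmax(logits[0, -1], dim=0).cpu().numpy().astype('float64')
            p /= p.sum()  # renormalize so np.random.choice accepts the vector
            idx = int(np.random.choice(len(p), p=p))
            words.append(dataset.index_to_word[idx])
            # Feed only the newly sampled word; the state carries the context.
            x = torch.tensor([[idx]]).to(device)
    return ' '.join(words)

generate(dataset, model, 'it is a')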
# In[ ]:

def predict(dataset, model, text, next_words=5):
    model.eval()

    words = text.split(' ')
    state_h = model.init_state(1)

    res = []
    x = torch.tensor([[dataset.word_to_index[w] for w in words]]).to(device)
    y_pred, state_h = model(x, state_h)

    # Softmax over the logits at the last position gives the distribution
    # over the next word; keep the next_words most probable candidates.
    last_word_logits = y_pred[0][-1]
    p = torch.nn.functional.softmax(last_word_logits, dim=0).detach().cpu().numpy()
    tmp = sorted(zip(p, range(len(p))), reverse=True)[:next_words]
    for w in tmp:
        res.append((dataset.index_to_word[w[1]], w[0]))
    return res


def predict2(dataset, dataset2, model, model2, text, text2, next_words=5):
    # Ensemble of the forward model (reads the left context) and the reversed
    # model (reads the right context backwards); their next-word distributions
    # are averaged.
    model.eval()
    model2.eval()

    words = text.split(' ')
    words2 = text2.split(' ')
    words2.reverse()

    state_h = model.init_state(1)
    state_h_2 = model2.init_state(1)

    res = []
    # Each model uses its own dataset's vocabulary mapping.
    x = torch.tensor([[dataset.word_to_index[w] for w in words]]).to(device)
    x2 = torch.tensor([[dataset2.word_to_index[w] for w in words2]]).to(device)
    y_pred, state_h = model(x, state_h)
    y_pred_2, state_h_2 = model2(x2, state_h_2)

    last_word_logits = y_pred[0][-1]
    last_word_logits_2 = y_pred_2[0][-1]
    p = torch.nn.functional.softmax(last_word_logits, dim=0).detach().cpu().numpy()
    p2 = torch.nn.functional.softmax(last_word_logits_2, dim=0).detach().cpu().numpy()

    # The two vocabularies contain the same words but may order them
    # differently, so align p2 to the forward dataset's indices before
    # averaging the two distributions.
    p2_aligned = [p2[dataset2.word_to_index[dataset.index_to_word[j]]] for j in range(len(p))]
    p_mean = [(g + h) / 2 for g, h in zip(p, p2_aligned)]

    tmp = sorted(zip(p_mean, range(len(p_mean))), reverse=True)[:next_words]
    for w in tmp:
        res.append((dataset.index_to_word[w[1]], w[0]))
    return res

# In[ ]:

predict(dataset, model, 'it is a')

# In[69]:

dev_data = lzma.open('dev-0/in.tsv.xz').read().decode('UTF-8').split('\n')
dev_data = [line.split('\t') for line in dev_data][:-1]
dev_data1 = [re.sub('[^a-z ]', '', i[6].replace('\\n', ' ').lower()).strip() for i in dev_data]
dev_data2 = [re.sub('[^a-z ]', '', i[7].replace('\\n', ' ').lower()).strip() for i in dev_data]

# In[23]:

dev_data[0]

# In[54]:

print(predict(dataset, model, ' '.join(dev_data1[9].split()[-1:])))

# In[66]:

class ReversedDataset(Dataset):
    # Same corpus as Dataset, but with the word order reversed, so a model
    # trained on it predicts the word *before* a given context.
    def load(self):
        text = super().load()
        text.reverse()
        return text

# In[67]:

dataset_2 = ReversedDataset(5)
input_tensor_2 = torch.tensor([[0, 231, 19, 98, 189]], dtype=torch.long).to(device)
model_2 = Model(len(dataset_2.uniq_words)).to(device)
y_pred_2, state_h_2 = model_2(input_tensor_2)

model_2 = Model(vocab_size=len(dataset_2.uniq_words)).to(device)
train(dataset_2, model_2, 1, 64)
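# In[ ]:

# Not in the original notebook: a quick check of the assumption behind the
# index re-alignment in predict2. The forward and reversed corpora have
# identical word frequencies, but the frequency sort breaks ties by first
# occurrence, so the two vocabularies can assign different indices to the
# same word.
mismatches = sum(dataset.word_to_index[w] != dataset_2.word_to_index[w]
                 for w in dataset.uniq_words)
print(f'words whose index differs between the two vocabularies: {mismatches}')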
# In[96]:

# Write the dev predictions in the challenge output format: "word:prob" pairs
# plus a remainder mass. Contexts containing out-of-vocabulary words fall
# back to the uninformative line ":1".
n = 2
with open("dev-0/out.tsv", "w") as f:
    for i in range(len(dev_data1)):
        d1 = dev_data1[i]
        d2 = dev_data2[i]
        try:
            tmp = predict2(dataset, dataset_2, model, model_2,
                           ' '.join(d1.split()[-n:]), ' '.join(d2.split()[:n]))
            f.write(' '.join([f'{word}:{prob}' for word, prob in tmp]) + ' :0.3\n')
        except KeyError:
            f.write(':1\n')

# In[95]:

len(dev_data1)

# In[93]:

test_data = lzma.open('test-A/in.tsv.xz').read().decode('UTF-8').split('\n')
test_data = [line.split('\t') for line in test_data][:-1]
test_data1 = [re.sub('[^a-z ]', '', i[6].replace('\\n', ' ').lower()).strip() for i in test_data]
test_data2 = [re.sub('[^a-z ]', '', i[7].replace('\\n', ' ').lower()).strip() for i in test_data]

n = 2
with open("test-A/out.tsv", "w") as f:
    for i in range(len(test_data1)):
        d1 = test_data1[i]
        d2 = test_data2[i]
        try:
            tmp = predict2(dataset, dataset_2, model, model_2,
                           ' '.join(d1.split()[-n:]), ' '.join(d2.split()[:n]))
            f.write(' '.join([f'{word}:{prob}' for word, prob in tmp]) + ' :0.3\n')
        except KeyError:
            f.write(':1\n')
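# In[ ]:

# Not in the original notebook: a rough top-5 hit rate against the dev gold
# words, assuming a dev-0/expected.tsv file with one gold word per line
# (mirroring train/expected.tsv). This is only a sanity check, not the
# challenge's actual perplexity-based metric.
with open('dev-0/expected.tsv') as file:
    expected = [line.strip().lower() for line in file]

hits = total = 0
for i in range(len(dev_data1)):
    try:
        tmp = predict2(dataset, dataset_2, model, model_2,
                       ' '.join(dev_data1[i].split()[-n:]),
                       ' '.join(dev_data2[i].split()[:n]))
    except KeyError:
        continue  # skip contexts with out-of-vocabulary words
    total += 1
    hits += expected[i] in [word for word, prob in tmp]
print(f'top-5 hit rate on dev-0 (OOV contexts skipped): {hits / max(total, 1):.3f}')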