In [6]:
import itertools
import lzma
import numpy as np
import regex as re
import torch
from torch import nn
from torch.utils.data import IterableDataset, DataLoader
from torchtext.vocab import build_vocab_from_iterator

In [7]:
vocab_size = 10000
embed_size = 200
hidden_size = 100
batch_size = 1000
device = 'cuda'
path_to_train = 'train/in.tsv.xz'
path_to_model = 'model1.bin'

def clean_line(line: str):
    # Preprocessing
    separated = line.split('\t')
    prefix = separated[6].replace(r'\n', ' ')
    suffix = separated[7].replace(r'\n', ' ')
    return prefix + ' ' + suffix

def get_words_from_line(line):
    line = clean_line(line)
    for word in line.split():
        yield word

def get_word_lines_from_file(file_name):
    with lzma.open(file_name, mode='rt', encoding='utf-8') as fid:
        for line in fid:
            yield get_words_from_line(line)

def double_look_ahead_iterator(gen):
    prev_prev = None
    prev = None
    for item in gen:
        if prev_prev is not None:
            yield np.asarray((prev_prev, prev, item))
        prev_prev = prev
        prev = item

def prediction(words, model, top) -> str:
    words_tensor = [train_dataset.vocab.forward([word]) for word in words]
    ixs = torch.tensor(words_tensor).view(-1).to(device)
    out = model(ixs)
    top = torch.topk(out[0], top)
    top_indices = top.indices.tolist()
    top_probs = top.values.tolist()
    top_words = vocab.lookup_tokens(top_indices)
    zipped = list(zip(top_words, top_probs))
    for index, element in enumerate(zipped):
        unk = None
        if '<unk>' in element:
            unk = zipped.pop(index)
            zipped.append(('', unk[1]))
            break
    if unk is None:
        zipped[-1] = ('', zipped[-1][1])
    return ' '.join([f'{x[0]}:{x[1]:.5f}' for x in zipped])

def create_outputs(folder_name, model, top):
    print(f'Creating outputs in {folder_name}')
    with lzma.open(f'{folder_name}/in.tsv.xz', mode='rt', encoding='utf-8') as fid:
        with open(f'{folder_name}/out-top={top}.tsv', 'w', encoding='utf-8', newline='\n') as f:
            for line in fid:
                separated = line.split('\t')
                prefix = separated[6].replace(r'\n', ' ').split()[-2:]
                output_line = prediction(prefix, model, top)
                f.write(output_line + '\n')

def train_model(lr):
    model = SimpleTrigramNeuralLanguageModel(vocab_size, embed_size, hidden_size).to(device)
    data = DataLoader(train_dataset, batch_size=batch_size)
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    criterion = torch.nn.NLLLoss()

    model.train()
    step = 0
    for batch in data:
        x = batch[:, :2]
        y = batch[:, 2]
        x = x.to(device)
        y = y.to(device)
        optimizer.zero_grad()
        ypredicted = model(x)
        loss = criterion(torch.log(ypredicted), y)
        if step % 100 == 0:
            print(step, loss)
        step += 1
        loss.backward()

        torch.nn.utils.clip_grad_norm_(model.parameters(), 10)


        optimizer.step()

    torch.save(model.state_dict(), path_to_model)

def with_hyperparams():
      train_model(lr=0.0001)
      model = SimpleTrigramNeuralLanguageModel(vocab_size, embed_size, hidden_size).to(device)
      model.load_state_dict(torch.load(path_to_model))
      model.eval()
      for top in [200, 400, 600]:
          create_outputs('dev-0', model, top)
          create_outputs('test-A', model, top)

class Trigrams(IterableDataset):
    def __init__(self, text_file, vocabulary_size):
        self.vocab = build_vocab_from_iterator(
            get_word_lines_from_file(text_file),
            max_tokens=vocabulary_size,
            specials=['<unk>'])
        self.vocab.set_default_index(self.vocab['<unk>'])
        self.vocabulary_size = vocabulary_size
        self.text_file = text_file

    def __iter__(self):
        return double_look_ahead_iterator(
            (self.vocab[t] for t in itertools.chain.from_iterable(get_word_lines_from_file(self.text_file))))


class SimpleTrigramNeuralLanguageModel(nn.Module):
    def __init__(self, vocabulary_size, embedding_size, hidden_size):
        super(SimpleTrigramNeuralLanguageModel, self).__init__()
        self.embedding_size = embedding_size
        self.embedding = nn.Embedding(vocabulary_size, embedding_size)
        self.lin1 = nn.Linear(2 * embedding_size, hidden_size)
        self.rel = nn.ReLU()
        self.lin2 = nn.Linear(hidden_size, vocabulary_size)
        self.sm = nn.Softmax()

    def forward(self, x):
        x = self.embedding(x).view((-1, 2 * self.embedding_size))
        x = self.lin1(x)
        x = self.rel(x)
        x = self.lin2(x)
        return self.sm(x)

In [8]:
from google.colab import drive

drive.mount('/content/drive',force_remount=True)
%cd /content/drive/MyDrive/modelowanie_jezyka

Mounted at /content/drive
/content/drive/MyDrive/modelowanie_jezyka


In [9]:
vocab = build_vocab_from_iterator(
    get_word_lines_from_file(path_to_train),
    max_tokens=vocab_size,
    specials=['<unk>']
)

vocab.set_default_index(vocab['<unk>'])
train_dataset = Trigrams(path_to_train, vocab_size)

In [11]:
with_hyperparams()

  return self.sm(x)


0 tensor(9.2570, device='cuda:0', grad_fn=<NllLossBackward0>)
100 tensor(8.3136, device='cuda:0', grad_fn=<NllLossBackward0>)
200 tensor(7.4358, device='cuda:0', grad_fn=<NllLossBackward0>)
300 tensor(7.2475, device='cuda:0', grad_fn=<NllLossBackward0>)
400 tensor(6.7645, device='cuda:0', grad_fn=<NllLossBackward0>)
500 tensor(6.5050, device='cuda:0', grad_fn=<NllLossBackward0>)
600 tensor(6.4014, device='cuda:0', grad_fn=<NllLossBackward0>)
700 tensor(6.6077, device='cuda:0', grad_fn=<NllLossBackward0>)
800 tensor(6.3927, device='cuda:0', grad_fn=<NllLossBackward0>)
900 tensor(6.0547, device='cuda:0', grad_fn=<NllLossBackward0>)
1000 tensor(6.1434, device='cuda:0', grad_fn=<NllLossBackward0>)
1100 tensor(5.8979, device='cuda:0', grad_fn=<NllLossBackward0>)
1200 tensor(6.1095, device='cuda:0', grad_fn=<NllLossBackward0>)
1300 tensor(6.1998, device='cuda:0', grad_fn=<NllLossBackward0>)
1400 tensor(5.9146, device='cuda:0', grad_fn=<NllLossBackward0>)
1500 tensor(5.7813, device='cuda:0', 