## Imports

In [2]:
import itertools
import lzma

import regex as re
import torch
from torch import nn
from torch.utils.data import IterableDataset, DataLoader
from torchtext.vocab import build_vocab_from_iterator
from google.colab import drive

## Definitions

### Functions

In [1]:
def clean_text(line: str):
 # Preprocessing
 separated = line.split('\t')
 prefix = separated[6].replace(r'\n', ' ').replace('\\n', ' ').replace(' ', ' ').replace('.', '').replace(',', '').replace('?', '').replace('!', '').replace('(', '').replace(')', '').replace(';', '').replace(':', '').replace('"', '').replace("'", '').replace('-', ' ').replace(' ', ' ')
 suffix = separated[7].replace(r'\n', ' ').replace('\\n', ' ').replace(' ', ' ').replace('.', '').replace(',', '').replace('?', '').replace('!', '').replace('(', '').replace(')', '').replace(';', '').replace(':', '').replace('"', '').replace("'", '').replace('-', ' ').replace(' ', ' ')
 return prefix + ' ' + suffix

In [2]:
def get_words_from_line(line):
 line = clean_text(line)
 for word in line.split():
 yield word

In [3]:
def get_word_lines_from_file(file_name):
 with lzma.open(file_name, mode='rt', encoding='utf-8') as fid:
 for line in fid:
 yield get_words_from_line(line)

In [4]:
def look_ahead_iterator(gen):
 prev = None
 for item in gen:
 if prev is not None:
 yield (prev, item)
 prev = item

In [5]:
def prediction(word: str) -> str:
 ixs = torch.tensor(vocab.forward([word])).to(device)
 out = model(ixs)
 top = torch.topk(out[0], 5)
 top_indices = top.indices.tolist()
 top_probs = top.values.tolist()
 top_words = vocab.lookup_tokens(top_indices)
 zipped = list(zip(top_words, top_probs))
 for index, element in enumerate(zipped):
 unk = None
 if '' in element:
 unk = zipped.pop(index)
 zipped.append(('', unk[1]))
 break
 if unk is None:
 zipped[-1] = ('', zipped[-1][1])
 return ' '.join([f'{x[0]}:{x[1]}' for x in zipped])

In [6]:
def save_outs(folder_name):
 print(f'Creating outputs in {folder_name}')
 with lzma.open(f'/content/drive/MyDrive/Colab Notebooks/{folder_name}/in.tsv.xz', mode='rt', encoding='utf-8') as fid:
 with open(f'/content/drive/MyDrive/Colab Notebooks/{folder_name}/out.tsv', 'w', encoding='utf-8', newline='\n') as f:
 for line in fid:
 separated = line.split('\t')
 prefix = separated[6].replace(r'\n', ' ').split()[-1]
 output_line = prediction(prefix)
 f.write(output_line + '\n')

### Classes

In [None]:
class Bigrams(IterableDataset):
 def __init__(self, text_file, vocabulary_size):
 self.vocab = build_vocab_from_iterator(
 get_word_lines_from_file(text_file),
 max_tokens=vocabulary_size,
 specials=[''])
 self.vocab.set_default_index(self.vocab[''])
 self.vocabulary_size = vocabulary_size
 self.text_file = text_file

 def __iter__(self):
 return look_ahead_iterator(
 (self.vocab[t] for t in itertools.chain.from_iterable(get_word_lines_from_file(self.text_file))))

In [None]:
class SimpleBigramNeuralLanguageModel(nn.Module):
 def __init__(self, vocabulary_size, embedding_size):
 super(SimpleBigramNeuralLanguageModel, self).__init__()
 self.model = nn.Sequential(
 nn.Embedding(vocabulary_size, embedding_size),
 nn.Linear(embedding_size, vocabulary_size),
 nn.Softmax()
 )

 def forward(self, x):
 return self.model(x)

## Training

### Params

In [20]:
vocab_size = 10000
embed_size = 100
batch_size = 2000
device = 'cuda'
path_to_train = '/content/drive/MyDrive/Colab Notebooks/train/in.tsv.xz'
path_to_model = 'modelneural_bigram.bin'

### Colab

In [14]:
drive.mount('/content/drive')
%cd /content/drive/MyDrive/

Mounted at /content/drive
/content/drive/MyDrive


### Run

In [17]:
vocab = build_vocab_from_iterator(
 get_word_lines_from_file(path_to_train),
 max_tokens=vocab_size,
 specials=['']
)

vocab.set_default_index(vocab[''])

In [18]:
train_dataset = Bigrams(path_to_train, vocab_size)

In [21]:
model = SimpleBigramNeuralLanguageModel(vocab_size, embed_size).to(device)
data = DataLoader(train_dataset, batch_size=batch_size)
optimizer = torch.optim.Adam(model.parameters())
criterion = torch.nn.NLLLoss()

model.train()
step = 0
for x, y in data:
 x = x.to(device)
 y = y.to(device)
 optimizer.zero_grad()
 ypredicted = model(x)
 loss = criterion(torch.log(ypredicted), y)
 if step % 100 == 0:
 print(step, loss)
 step += 1
 loss.backward()
 optimizer.step()

 input = module(input)


0 tensor(9.4517, device='cuda:0', grad_fn=)
100 tensor(7.9341, device='cuda:0', grad_fn=)
200 tensor(7.1452, device='cuda:0', grad_fn=)
300 tensor(6.7956, device='cuda:0', grad_fn=)
400 tensor(6.4127, device='cuda:0', grad_fn=)
500 tensor(6.3407, device='cuda:0', grad_fn=)
600 tensor(6.2125, device='cuda:0', grad_fn=)
700 tensor(5.7817, device='cuda:0', grad_fn=)
800 tensor(5.7309, device='cuda:0', grad_fn=)
900 tensor(5.7419, device='cuda:0', grad_fn=)
1000 tensor(5.7372, device='cuda:0', grad_fn=)
1100 tensor(5.2804, device='cuda:0', grad_fn=)
1200 tensor(5.4610, device='cuda:0', grad_fn=)
1300 tensor(5.6610, device='cuda:0', grad_fn=)
1400 tensor(5.3070, device='cuda:0', grad_fn=)
1500 tensor(4.9666, device='cuda:0', grad_fn=)
1600 tensor(5.2102, device='cuda:0', grad_fn=)
1700 tensor(5.4919, device='cuda:0', grad_fn=)
1800 tensor(5.1968, device='cuda:0', grad_fn=)
1900 tensor(5.3336, device='cuda:0', grad_fn=)
2000 tensor(5.2387, device='cuda:0', grad_fn=)
2100 tensor(5.2247, devic

In [16]:
import torch
torch.cuda.is_available()

True

In [22]:
torch.save(model.state_dict(), path_to_model)

In [23]:
model = SimpleBigramNeuralLanguageModel(vocab_size, embed_size).to(device)
model.load_state_dict(torch.load(path_to_model))
model.eval()

SimpleBigramNeuralLanguageModel(
 (model): Sequential(
 (0): Embedding(10000, 100)
 (1): Linear(in_features=100, out_features=10000, bias=True)
 (2): Softmax(dim=None)
 )
)

In [29]:
save_outs('dev-0')

Creating outputs in dev-0


In [30]:
save_outs('test-A')

Creating outputs in test-A
