import itertools
import lzma

import numpy as np
import torch
from torch import nn
from torch.utils.data import IterableDataset, DataLoader
from torchtext.vocab import build_vocab_from_iterator


def clean_line(line):
    # Each TSV row stores the text before the gap in column 6 and the text
    # after the gap in column 7; join the two into a single string of words.
    separated = line.split('\t')
    prefix = separated[6].replace(r'\n', ' ')
    suffix = separated[7].replace(r'\n', ' ')
    return prefix + ' ' + suffix


def get_words_from_line(line):
    line = clean_line(line)
    for word in line.split():
        yield word


def get_word_lines_from_file(file_name):
    with lzma.open(file_name, mode='rt', encoding='utf-8') as fid:
        for line in fid:
            yield get_words_from_line(line)


def n_look_ahead_iterator(n, gen):
    # Slide a window of n consecutive items over the stream and yield each
    # window as a NumPy array, oldest item first.
    prevs = [None] * (n - 1)
    for item in gen:
        if prevs[-1] is not None:
            ngram = prevs[::-1]
            ngram.append(item)
            yield np.asarray(ngram)
        prevs.insert(0, item)
        prevs = prevs[:n - 1]


class Ngrams(IterableDataset):
    def __init__(self, text_file: str, context_size: int, vocabulary_size: int):
        self.vocab = build_vocab_from_iterator(
            get_word_lines_from_file(text_file),
            max_tokens=vocabulary_size,
            specials=['<unk>']
        )
        self.vocab.set_default_index(self.vocab['<unk>'])
        self.vocabulary_size = vocabulary_size
        self.text_file = text_file
        self.ngram_size = context_size + 1

    def __iter__(self):
        return n_look_ahead_iterator(
            self.ngram_size,
            (self.vocab[t] for t in itertools.chain.from_iterable(get_word_lines_from_file(self.text_file)))
        )


class NgramWithBagLM(nn.Module):
    # Predicts the gap word from context_size surrounding tokens: the last
    # smaller_context_size input positions get position-specific embeddings,
    # the remaining positions are averaged into a single bag-of-words vector.
    def __init__(self, smaller_context_size, context_size, embedding_size, vocabulary_size, hidden_size):
        super().__init__()
        self.smaller_context_size = smaller_context_size
        self.context_size = context_size
        self.embedding_size = embedding_size
        self.embedding = nn.Embedding(vocabulary_size, embedding_size)
        self.bag_embedding = nn.Embedding(vocabulary_size, embedding_size)
        self.lin1 = nn.Linear((smaller_context_size + 1) * embedding_size, hidden_size)
        self.rel = nn.ReLU()
        self.lin2 = nn.Linear(hidden_size, vocabulary_size)
        self.sm = nn.Softmax(dim=1)

    def forward(self, words):
        # Position-specific embeddings for the last smaller_context_size words.
        smaller_context_embed = [
            self.embedding(words[:, i])
            for i in range(self.context_size - self.smaller_context_size, self.context_size)
        ]
        smaller_context_embed = torch.cat(smaller_context_embed, dim=-1)
        # Order-insensitive averaged embedding for the remaining words.
        bag_embed = [
            self.bag_embedding(words[:, i])
            for i in range(self.context_size - self.smaller_context_size)
        ]
        bag_embed = torch.mean(torch.stack(bag_embed), dim=0)
        x = torch.cat((bag_embed, smaller_context_embed), dim=-1)
        x = self.lin1(x)
        x = self.rel(x)
        x = self.lin2(x)
        return self.sm(x)


def train_model():
    model = NgramWithBagLM(
        smaller_context, context_size, embed_size, vocab_size, hidden_size
    ).to(device)
    data = DataLoader(train_dataset, batch_size=batch_size)
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    criterion = torch.nn.NLLLoss()
    model.train()
    step = 0
    for batch in data:
        # Each row holds left_ctx + right_ctx + 1 consecutive token ids with
        # the gap word in the middle (column left_ctx); that column is the
        # target. The input is rearranged so that the words nearest the gap
        # (smaller_left_ctx before it, smaller_right_ctx after it) come last,
        # matching the model's position-specific slots, while the remaining,
        # more distant words go first, into the bag.
        y = batch[:, left_ctx]
        near = torch.cat((batch[:, left_ctx - smaller_left_ctx:left_ctx],
                          batch[:, left_ctx + 1:left_ctx + 1 + smaller_right_ctx]), dim=1)
        far = torch.cat((batch[:, :left_ctx - smaller_left_ctx],
                         batch[:, left_ctx + 1 + smaller_right_ctx:]), dim=1)
        x = torch.cat((far, near), dim=1)
        x = x.to(device)
        y = y.type(torch.LongTensor)
        y = y.to(device)
        optimizer.zero_grad()
        ypredicted = model(x)
        # The model outputs probabilities, so take the log before NLLLoss.
        loss = criterion(torch.log(ypredicted), y)
        if torch.isnan(loss):
            raise Exception("loss is nan")
        if step % 1000 == 0:
            print(step, loss)
        step += 1
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 5)
        optimizer.step()
    torch.save(model.state_dict(), path_to_model)

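
# A minimal shape check for NgramWithBagLM (an illustrative sketch, not part
# of the training flow; the numbers mirror the configuration set below):
#
#   _model = NgramWithBagLM(8, 40, 20, 250, 10)
#   _probs = _model(torch.randint(0, 250, (2, 40)))
#   assert _probs.shape == (2, 250)                          # one distribution per row
#   assert torch.allclose(_probs.sum(dim=1), torch.ones(2))  # softmax rows sum to 1
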

def prediction(model, words: list, top=500) -> str:
    ixs = torch.tensor(train_dataset.vocab.forward(words)).view(1, -1).to(device)
    with torch.no_grad():
        out = model(ixs)
    # Never ask for more candidates than the vocabulary actually contains.
    k = min(top, out.size(1))
    top_preds = torch.topk(out[0], k)
    top_indices = top_preds.indices.tolist()
    top_probs = top_preds.values.tolist()
    top_words = train_dataset.vocab.lookup_tokens(top_indices)
    zipped = list(zip(top_words, top_probs))
    # The probability assigned to '<unk>' is reported under an empty word at
    # the end of the line; if '<unk>' is not among the candidates, the last
    # candidate is replaced by an empty word instead.
    unk = None
    for index, element in enumerate(zipped):
        if '<unk>' in element:
            unk = zipped.pop(index)
            zipped.append(('', unk[1]))
            break
    if unk is None:
        zipped[-1] = ('', zipped[-1][1])
    return ' '.join([f'{x[0]}:{x[1]}' for x in zipped])


device = 'cuda' if torch.cuda.is_available() else 'cpu'

vocab_size = 250
left_ctx = 20
right_ctx = 20
smaller_left_ctx = 5
smaller_right_ctx = 3
context_size = left_ctx + right_ctx                      # full window around the gap
smaller_context = smaller_left_ctx + smaller_right_ctx   # position-specific slots
embed_size = 20
hidden_size = 10
batch_size = 4000
lr = 0.0001
path_to_train = 'train/in.tsv.xz'
path_to_model = 'model3.bin'

train_dataset = Ngrams(path_to_train, left_ctx + right_ctx, vocab_size)
train_model()

model = NgramWithBagLM(
    smaller_context, context_size, embed_size, vocab_size, hidden_size
).to(device)
model.load_state_dict(torch.load(path_to_model))
model.eval()

folder_name = 'dev-0'
top = 500
print(f'Creating outputs in {folder_name}')
with lzma.open(f'{folder_name}/in.tsv.xz', mode='rt', encoding='utf-8') as fid:
    with open(f'{folder_name}/out-top={top}.tsv', 'w', encoding='utf-8', newline='\n') as f:
        for line in fid:
            separated = line.split('\t')
            prefix = separated[6].replace(r'\n', ' ').split()[-left_ctx:]
            suffix = separated[7].replace(r'\n', ' ').split()[:right_ctx]
            # Pad short contexts with '<unk>' so the model always receives
            # exactly left_ctx + right_ctx tokens.
            prefix = ['<unk>'] * (left_ctx - len(prefix)) + prefix
            suffix = suffix + ['<unk>'] * (right_ctx - len(suffix))
            # Arrange the words the same way as in training: distant words
            # first (the bag), the words nearest the gap last.
            words = (prefix[:-smaller_left_ctx] + suffix[smaller_right_ctx:]
                     + prefix[-smaller_left_ctx:] + suffix[:smaller_right_ctx])
            output_line = prediction(model, words, top)
            f.write(output_line + '\n')
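
# Each line written above has the format produced by prediction(), e.g.
# (illustrative values only):
#
#   the:0.0812 of:0.0473 and:0.0391 ... :0.0017
#
# The trailing pair with an empty word carries the probability the model
# assigned to '<unk>' (or, if '<unk>' was not among the top candidates, the
# probability of the last candidate), so every line ends with a bare ':prob'.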