import itertools
import lzma

import numpy as np
import torch
from torch import nn
from torch.utils.data import IterableDataset, DataLoader
from torchtext.vocab import build_vocab_from_iterator


def clean_line(line):
    # Preprocessing: columns 6 and 7 of the TSV hold the text before and after the gap.
    separated = line.split('\t')
    prefix = separated[6].replace(r'\n', ' ')
    suffix = separated[7].replace(r'\n', ' ')
    return prefix + ' ' + suffix


def get_words_from_line(line):
    line = clean_line(line)
    for word in line.split():
        yield word


def get_word_lines_from_file(file_name):
    with lzma.open(file_name, mode='rt', encoding='utf-8') as fid:
        for line in fid:
            yield get_words_from_line(line)


def n_look_ahead_iterator(n, gen):
    # Slide a window of size n over the token stream and yield every n-gram
    # as a numpy array, oldest token first, current token last.
    prevs = [None for _ in range(n)]
    for item in gen:
        if prevs[-1] is not None:
            ngram = prevs[::-1]
            ngram.append(item)
            yield np.asarray(ngram)
        prevs.insert(0, item)
        prevs = prevs[:n - 1]


class Ngrams(IterableDataset):
    def __init__(self, text_file: str, context_size: int, vocabulary_size: int):
        self.vocab = build_vocab_from_iterator(
            get_word_lines_from_file(text_file),
            max_tokens=vocabulary_size,
            specials=['<unk>'],
        )
        self.vocab.set_default_index(self.vocab['<unk>'])
        self.vocabulary_size = vocabulary_size
        self.text_file = text_file
        self.ngram_size = context_size + 1

    def __iter__(self):
        return n_look_ahead_iterator(
            self.ngram_size,
            (self.vocab[t] for t in itertools.chain.from_iterable(
                get_word_lines_from_file(self.text_file)))
        )


class NgramWithBagLM(nn.Module):
    def __init__(
        self,
        left_smaller_context_size,
        left_context_size,
        right_smaller_context_size,
        right_context_size,
        embedding_size,
        vocabulary_size,
        hidden_size,
        # second_hidden_size,
    ):
        super().__init__()
        self.left_smaller_context_size = left_smaller_context_size
        self.left_context_size = left_context_size
        self.right_smaller_context_size = right_smaller_context_size
        self.right_context_size = right_context_size
        self.embedding_size = embedding_size
        self.embedding = nn.Embedding(vocabulary_size, embedding_size)
        self.bag_embedding = nn.Embedding(vocabulary_size, embedding_size)
        # Input: concatenated embeddings of the inner context plus one averaged bag embedding.
        self.lin1 = nn.Linear(
            (left_smaller_context_size + right_smaller_context_size + 1) * embedding_size,
            hidden_size,
        )
        self.rel = nn.ReLU()
        # self.lin2 = nn.Linear(hidden_size, second_hidden_size)
        # self.lin3 = nn.Linear(second_hidden_size, vocabulary_size)
        self.lin2 = nn.Linear(hidden_size, vocabulary_size)
        self.sm = nn.Softmax(dim=1)

    def forward(self, words):
        # Concatenate embeddings of the inner (n-gram) context, skipping the gap position.
        smaller_context_embed = [
            self.embedding(words[:, i])
            for i in range(
                self.left_context_size - self.left_smaller_context_size,
                self.left_context_size + self.right_smaller_context_size + 1,
            )
            if i != self.left_context_size
        ]
        smaller_context_embed = torch.cat(smaller_context_embed, dim=-1)

        # Bag of words: average the embeddings of all remaining (outer) context positions.
        # The window has left + right + 1 positions; positions inside the inner window
        # (including the gap itself) are excluded.
        bag_embed = [
            self.bag_embedding(words[:, i])
            for i in range(
                self.left_context_size + self.right_context_size + 1
            )
            if i not in range(
                self.left_context_size - self.left_smaller_context_size,
                self.left_context_size + self.right_smaller_context_size + 1,
            )
        ]
        bag_embed = torch.mean(torch.stack(bag_embed), dim=0)

        x = torch.cat((bag_embed, smaller_context_embed), dim=-1)
        x = self.lin1(x)
        x = self.rel(x)
        x = self.lin2(x)
        # x = self.rel(x)
        # x = self.lin3(x)
        return self.sm(x)


def train_model():
    model = NgramWithBagLM(
        ngram_left_ctx,
        bagging_left_ctx,
        ngram_right_ctx,
        bagging_right_ctx,
        embed_size,
        vocab_size,
        hidden_size,
        # second_hidden_size,
    ).to(device)
    data = DataLoader(train_dataset, batch_size=batch_size)
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    criterion = torch.nn.NLLLoss()

    model.train()
    step = 0
    for batch in data:
        # x = batch[:, :context_size]
        x = batch
        y = batch[:, bagging_left_ctx]
        x = x.to(device)
        y = y.type(torch.LongTensor)
        y = y.to(device)
        optimizer.zero_grad()
        ypredicted = model(x)
        # NLLLoss expects log-probabilities, hence the log of the softmax output.
        loss = criterion(torch.log(ypredicted), y)
        if torch.isnan(loss):
            raise Exception("loss is nan")
        if step % 1000 == 0:
            print(step, loss)
        step += 1
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 5)
        optimizer.step()
        torch.save(model.state_dict(), "checkpoint.bin")  # cond?
    torch.save(model.state_dict(), path_to_model)


def prediction(model, words: list, top=500) -> str:
    ixs = torch.tensor(train_dataset.vocab.forward(words)).view(-1).to(device)
    with torch.no_grad():
        out = model(ixs.view(1, -1))
    top_k = torch.topk(out[0], top)
    top_indices = top_k.indices.tolist()
    top_probs = top_k.values.tolist()
    top_words = train_dataset.vocab.lookup_tokens(top_indices)
    zipped = list(zip(top_words, top_probs))
    # The output format expects a wildcard entry with an empty key that collects the
    # remaining probability mass: reuse the <unk> probability for it, or sacrifice the
    # last candidate if <unk> did not make it into the top-k.
    unk = None
    for index, element in enumerate(zipped):
        if '<unk>' in element:
            unk = zipped.pop(index)
            zipped.append(('', unk[1]))
            break
    if unk is None:
        zipped[-1] = ('', zipped[-1][1])
    return ' '.join([f'{x[0]}:{x[1]}' for x in zipped])


device = 'cuda'
vocab_size = 25000
# context_size = 40
bagging_left_ctx = 25
bagging_right_ctx = 25
# smaller_context = 8
ngram_left_ctx = 7
ngram_right_ctx = 3
embed_size = 300
hidden_size = 150
# second_hidden_size = 100
batch_size = 4000
lr = 0.0001
path_to_train = 'train/in.tsv.xz'
path_to_model = 'model4.bin'

train_dataset = Ngrams(path_to_train, bagging_left_ctx + bagging_right_ctx, vocab_size)

# train_model()

model = NgramWithBagLM(
    ngram_left_ctx,
    bagging_left_ctx,
    ngram_right_ctx,
    bagging_right_ctx,
    embed_size,
    vocab_size,
    hidden_size,
    # second_hidden_size
).to(device)
model.load_state_dict(torch.load(path_to_model))
model.eval()

# folder_name = 'dev-0'
folder_name = 'test-A'
top = 3000

print(f'Creating outputs in {folder_name}')
with lzma.open(f'{folder_name}/in.tsv.xz', mode='rt', encoding='utf-8') as fid:
    with open(f'{folder_name}/out-top={top}.tsv', 'w', encoding='utf-8', newline='\n') as f:
        for line in fid:
            separated = line.split('\t')
            prefix = separated[6].replace(r'\n', ' ').split()[-bagging_left_ctx:]
            suffix = separated[7].replace(r'\n', ' ').split()[:bagging_right_ctx]
            # Pad short contexts so the model always sees a full window;
            # the '<unk>' filler maps to the unknown-word embedding.
            prefix = ['<unk>'] * (bagging_left_ctx - len(prefix)) + prefix
            suffix = suffix + ['<unk>'] * (bagging_right_ctx - len(suffix))
            words = prefix + ["padding"] + suffix
            output_line = prediction(model, words, top)
            f.write(output_line + '\n')