#!/usr/bin/env python
# coding: utf-8

# In[1]:


get_ipython().system('git clone --single-branch git://gonito.net/challenging-america-word-gap-prediction -b master')


# In[2]:


from torch import device as dev

device = dev("cuda")


# In[3]:


import lzma


def read_xz_file(fname):
    with lzma.open(fname, mode='rt', encoding='utf-8') as f:
        return [line.strip() for line in f.readlines()]


def read_file(fname):
    with open(fname, mode='rt', encoding='utf-8') as f:
        return [line.strip() for line in f.readlines()]


def get_contexts(input_text):
    # The TSV fields contain literal "\n" sequences; replace them with spaces
    # before splitting the line into its tab-separated fields.
    all_fields = input_text.replace(r'\n', ' ').split('\t')
    return {'left': all_fields[6], 'right': all_fields[7]}


def compose_sentences(raw_input, labels):
    # Rebuild full sentences by putting the expected gap word (the label)
    # back between its left and right contexts.
    result = []
    for line, label in zip(raw_input, labels):
        context = get_contexts(line)
        result.append(f'{context["left"]} {label} {context["right"]}')
    return result


# In[4]:


train_input_raw = read_xz_file('challenging-america-word-gap-prediction/train/in.tsv.xz')
train_labels = read_file('challenging-america-word-gap-prediction/train/expected.tsv')
train_sentences = compose_sentences(train_input_raw, train_labels)


# In[5]:


from torchtext.data import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
from torch import save as save_model


def tokenize_dataset(lines, tokenizer):
    for line in lines:
        yield tokenizer(line)


vocabulary_max_size = 16384
unknown_token = '<0>'

tokenizer = get_tokenizer('basic_english')
vocabulary = build_vocab_from_iterator(
    tokenize_dataset(train_sentences, tokenizer),
    specials=[unknown_token],
    max_tokens=vocabulary_max_size
)
vocabulary.set_default_index(vocabulary[unknown_token])
save_model(vocabulary, 'vocabulary.pth')


# In[6]:


from torch import LongTensor


class TrigramDataset:
    """For every position in a sentence, yields (previous word, next word)
    as the input and the word in between as the target."""

    def __init__(self, lines, vocab, tokenizer, unknown_token):
        self.unknown_token = unknown_token
        self.vocab = vocab
        self.tokenizer = tokenizer
        self.lines = lines

    def __getitem__(self, idx):
        x = []
        y = []
        sentence = [self.vocab[token] for token in self.tokenizer(self.lines[idx])]
        for pos, _ in enumerate(sentence):
            prev_token = sentence[pos - 1] if pos > 0 else self.vocab[self.unknown_token]
            current_token = sentence[pos]
            next_token = sentence[pos + 1] if pos < len(sentence) - 1 else self.vocab[self.unknown_token]
            x.append([prev_token, next_token])
            y.append([current_token])
        return LongTensor(x), LongTensor(y)

    def __len__(self):
        return len(self.lines)


# In[7]:


train_dataset = TrigramDataset(train_sentences, vocabulary, tokenizer, unknown_token)


# In[8]:


from torch import nn


class LanguageModel(nn.Module):
    grams_count = 3

    def __init__(self, vocabulary_size, embedding_size, hidden_size):
        super(LanguageModel, self).__init__()
        self.embedding_size = embedding_size
        self.embedding = nn.Embedding(vocabulary_size, embedding_size)
        self.layers = nn.Sequential(
            nn.Linear((self.grams_count - 1) * embedding_size, hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size, vocabulary_size),
            nn.Softmax(dim=1)
        )

    def forward(self, x):
        # Concatenate the embeddings of the (previous, next) pair and map them
        # to a probability distribution over the vocabulary.
        x = self.embedding(x).view((-1, (self.grams_count - 1) * self.embedding_size))
        return self.layers(x)
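# In[ ]:


# A minimal shape sanity check (a sketch only; demo_model and demo_x are
# illustrative names, not used by the rest of the script). A batch of four
# (previous, next) token-id pairs should yield four probability distributions
# over the vocabulary, each summing to ~1 because of the final Softmax.
from torch import randint

demo_model = LanguageModel(len(vocabulary), embedding_size=64, hidden_size=32)
demo_x = randint(0, len(vocabulary), (4, 2))  # four trigrams: (prev, next) ids
demo_out = demo_model(demo_x)
print(demo_out.shape)       # (4, vocabulary size)
print(demo_out.sum(dim=1))  # each row sums to ~1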
# In[12]:


from torch.optim import Adam
from torch import log
from tqdm import tqdm


def train(model, dataset, output_file, epochs):
    optimizer = Adam(model.parameters(), lr=0.00007)
    # The model ends with a Softmax, so NLLLoss is applied to the log of its
    # output; gradients are clipped and the step is skipped when they overflow.
    criterion = nn.NLLLoss()
    model.to(device)
    model.train()
    for epoch in range(epochs):
        bar = tqdm(enumerate(dataset), total=len(dataset))
        for i, (x, y) in bar:
            x = x.to(device)
            y = y.to(device)
            optimizer.zero_grad()
            ypredicted = model(x)
            loss = criterion(log(ypredicted), y[:, 0])
            if not i % 100:
                bar.set_description(f'Epoch: {epoch}, Loss: {loss}, Batch: {i}')
            loss.backward()
            try:
                nn.utils.clip_grad_norm_(model.parameters(), 5, error_if_nonfinite=True)
                optimizer.step()
            except RuntimeError:
                print("Grad overflow")
    save_model(model.state_dict(), output_file)


# In[13]:


embedding_size = 256
model = LanguageModel(len(vocabulary), embedding_size, 128)


# In[ ]:


train(model, train_dataset, 'test_model', 5)


# In[28]:


embedding_size = 256
model_512 = LanguageModel(len(vocabulary), embedding_size, 512)
train(model_512, train_dataset, 'test_model_512', 5)


# In[ ]:


# Rebuild the 128-unit architecture; its trained weights can be restored
# from 'test_model' below instead of retraining.
embedding_size = 256
model = LanguageModel(len(vocabulary), embedding_size, 128)


# In[18]:


dev_input_raw = read_xz_file('challenging-america-word-gap-prediction/dev-0/in.tsv.xz')
dev_contexts = [get_contexts(t) for t in dev_input_raw]
test_input_raw = read_xz_file('challenging-america-word-gap-prediction/test-A/in.tsv.xz')
test_contexts = [get_contexts(t) for t in test_input_raw]


# In[15]:


from torch import load as load_model

vocabulary = load_model('vocabulary.pth')
tokenizer = get_tokenizer('basic_english')
# model.load_state_dict(load_model('test_model'))
model.eval()


# In[16]:


from torch import LongTensor, topk
from tqdm import tqdm


def predict_words(dataset, tokenizer, vocab, model):
    preds = []
    for entry in tqdm(dataset):
        tokenized_left = tokenizer(entry['left'])
        tokenized_right = tokenizer(entry['right'])
        # Input: [last word of the left context, first word of the right context];
        # fall back to the unknown token when a context is empty.
        left_id = vocab[tokenized_left[-1]] if tokenized_left else vocab[unknown_token]
        right_id = vocab[tokenized_right[0]] if tokenized_right else vocab[unknown_token]
        src = LongTensor([left_id, right_id]).to(device)
        output = model(src)
        top = topk(output[0], 50)
        probs, tokens = top.values.tolist(), vocab.lookup_tokens(top.indices.tolist())
        current_output = ''
        accumulated_probability = 0
        for prob, token in zip(probs, tokens):
            accumulated_probability += prob
            current_output += f'{token.strip()}:{prob} '
        # The remaining probability mass goes to the "any other word" entry.
        current_output += f':{1 - accumulated_probability}'
        preds.append(current_output)
    return preds


# In[24]:


preds = predict_words(dev_contexts, tokenizer, vocabulary, model)


# In[25]:


with open('challenging-america-word-gap-prediction/dev-0/out-hidden_size=128.tsv', 'w') as f:
    f.writelines(line + '\n' for line in preds)


# In[26]:


test_preds = predict_words(test_contexts, tokenizer, vocabulary, model)
with open('challenging-america-word-gap-prediction/test-A/out-hidden_size=128.tsv', 'w') as f:
    f.writelines(line + '\n' for line in test_preds)


# In[29]:


# model_512.load_state_dict(load_model('test_model_512'))
model_512.eval()

preds_512 = predict_words(dev_contexts, tokenizer, vocabulary, model_512)
with open('challenging-america-word-gap-prediction/dev-0/out-hidden_size=512.tsv', 'w') as f:
    f.writelines(line + '\n' for line in preds_512)

test_preds_512 = predict_words(test_contexts, tokenizer, vocabulary, model_512)
with open('challenging-america-word-gap-prediction/test-A/out-hidden_size=512.tsv', 'w') as f:
    f.writelines(line + '\n' for line in test_preds_512)
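# In[ ]:


# A rough local check of the dev-0 predictions (a sketch; parse_prediction_line
# and the mean-probability score are illustrative only, not the challenge's
# official metric). It assumes dev-0/expected.tsv holds the gold gap words,
# parses the "token:prob ... :rest" lines written above, and reports the mean
# probability assigned to the gold word.
def parse_prediction_line(line):
    probs = {}
    rest = 0.0
    for piece in line.split():
        token, _, prob = piece.rpartition(':')
        if token:
            probs[token] = float(prob)
        else:
            rest = float(prob)  # leftover mass for "any other word"
    return probs, rest


dev_expected = read_file('challenging-america-word-gap-prediction/dev-0/expected.tsv')
dev_out = read_file('challenging-america-word-gap-prediction/dev-0/out-hidden_size=128.tsv')

gold_probs = [
    parse_prediction_line(line)[0].get(gold.lower(), 0.0)
    for gold, line in zip(dev_expected, dev_out)
]
print(f'Mean probability assigned to the gold word: {sum(gold_probs) / len(gold_probs):.4f}')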