8_1
This commit is contained in:
parent
eb63850925
commit
af0c257a0e
10519
dev-0/out-hidden_size=128.tsv
Normal file
10519
dev-0/out-hidden_size=128.tsv
Normal file
File diff suppressed because it is too large
Load Diff
10519
dev-0/out-hidden_size=512.tsv
Normal file
10519
dev-0/out-hidden_size=512.tsv
Normal file
File diff suppressed because it is too large
Load Diff
21038
dev-0/out.tsv
21038
dev-0/out.tsv
File diff suppressed because it is too large
Load Diff
14
gonito.yml
Normal file
14
gonito.yml
Normal file
@ -0,0 +1,14 @@
|
||||
description: self-made trigram model with center context
|
||||
tags:
|
||||
- neural-network
|
||||
- n-gram
|
||||
- self-made
|
||||
params:
|
||||
embedding_size: 256
|
||||
lr: 0.00007
|
||||
hidden_size: 128
|
||||
epochs: 5
|
||||
vocab_size: 16384
|
||||
links:
|
||||
- title: "repository"
|
||||
url: "https://git.wmi.amu.edu.pl/s444501/challenging-america-word-gap-prediction.git"
|
276
run-8_1.py
Normal file
276
run-8_1.py
Normal file
@ -0,0 +1,276 @@
|
||||
#!/usr/bin/env python
|
||||
# coding: utf-8
|
||||
|
||||
# In[1]:
|
||||
|
||||
|
||||
# Fetch the challenge data. `get_ipython()` only exists inside IPython/Jupyter,
# so running this file with a plain `python` interpreter raised NameError here;
# subprocess works in both environments.
import subprocess

# check=False keeps the notebook `!` semantics: a failed clone (for example
# because the directory already exists) does not abort the script.
subprocess.run(
    [
        'git', 'clone', '--single-branch',
        'git://gonito.net/challenging-america-word-gap-prediction',
        '-b', 'master',
    ],
    check=False,
)
|
||||
|
||||
|
||||
# In[2]:
|
||||
|
||||
|
||||
from torch import device as dev
|
||||
|
||||
from torch import cuda

# Prefer the GPU, but fall back to the CPU so the script still runs on
# machines without CUDA instead of failing on the first `.to(device)`.
device = dev("cuda" if cuda.is_available() else "cpu")
|
||||
|
||||
|
||||
# In[3]:
|
||||
|
||||
|
||||
import lzma
|
||||
def read_xz_file(fname):
    """Return the lines of the xz-compressed UTF-8 text file ``fname``.

    Each line is stripped of leading and trailing whitespace (including the
    line terminator); empty lines are kept as empty strings.
    """
    with lzma.open(fname, mode='rt', encoding='utf-8') as stream:
        stripped_lines = list(map(str.strip, stream))
    return stripped_lines
|
||||
|
||||
|
||||
def read_file(fname):
    """Return the whitespace-stripped lines of the UTF-8 text file ``fname``.

    Empty lines are preserved as empty strings so that the result stays
    aligned line-for-line with the file.
    """
    with open(fname, mode='rt', encoding='utf-8') as stream:
        stripped_lines = [raw_line.strip() for raw_line in stream]
    return stripped_lines
|
||||
|
||||
|
||||
def get_contexts(input_text):
    """Extract the left and right context columns from one TSV row.

    Every literal two-character sequence ``\\n`` (backslash + ``n``) in the row
    is replaced by a space first; the row is then split on tabs and the columns
    at index 6 and 7 are returned as ``{'left': ..., 'right': ...}``.

    Raises:
        IndexError: if the row has fewer than eight tab-separated columns.
    """
    columns = input_text.replace(r'\n', ' ').split('\t')
    left_context = columns[6]
    right_context = columns[7]
    return {'left': left_context, 'right': right_context}
|
||||
|
||||
|
||||
def compose_sentences(raw_input, labels):
    """Build one training sentence per row: left context, label, right context.

    Args:
        raw_input: iterable of raw TSV rows accepted by ``get_contexts``.
        labels: iterable of labels, paired with ``raw_input`` by position.

    Returns:
        A list of strings ``"<left> <label> <right>"``. Pairing uses ``zip``,
        so the result is as long as the shorter of the two inputs.
    """
    sentences = []
    for raw_line, label in zip(raw_input, labels):
        context = get_contexts(raw_line)
        # The label (not the whole raw row) belongs between the two contexts;
        # the previous code interpolated the raw row and never used the label.
        sentences.append(f'{context["left"]} {label} {context["right"]}')
    return sentences
|
||||
|
||||
|
||||
# In[4]:
|
||||
|
||||
|
||||
# Raw training rows, one stripped TSV line per entry.
train_input_raw = read_xz_file('challenging-america-word-gap-prediction/train/in.tsv.xz')
# Lines of expected.tsv — presumably the expected gap word for each training row; confirm against the challenge data.
train_labels = read_file('challenging-america-word-gap-prediction/train/expected.tsv')
# One composed sentence per (row, label) pair; used below to build the vocabulary and the dataset.
train_sentences = compose_sentences(train_input_raw, train_labels)
|
||||
|
||||
|
||||
# In[5]:
|
||||
|
||||
|
||||
from torchtext.data import get_tokenizer
|
||||
from torchtext.vocab import build_vocab_from_iterator
|
||||
|
||||
from torch import save as save_model
|
||||
|
||||
def tokenize_dataset(lines, tokenizer):
    """Lazily yield ``tokenizer(line)`` for every line in ``lines``, in order."""
    yield from map(tokenizer, lines)
|
||||
|
||||
# Upper bound on the vocabulary size, passed to build_vocab_from_iterator as max_tokens.
vocabulary_max_size = 16384
# Special token registered in the vocabulary; its index is also used as the default index below.
unknown_token = '<0>'
tokenizer = get_tokenizer('basic_english')
# Build the vocabulary from the tokenized training sentences.
vocabulary = build_vocab_from_iterator(
    tokenize_dataset(train_sentences, tokenizer),
    specials=[unknown_token],
    max_tokens=vocabulary_max_size
)
# Tokens that are not in the vocabulary map to the index of unknown_token.
vocabulary.set_default_index(vocabulary[unknown_token])
# Persist the vocabulary; it is read back from 'vocabulary.pth' before prediction.
save_model(vocabulary, 'vocabulary.pth')
|
||||
|
||||
|
||||
# In[6]:
|
||||
|
||||
|
||||
from torch import LongTensor
|
||||
|
||||
|
||||
class TrigramDataset:
    """Index-addressable dataset of (neighbours -> centre token) samples.

    Item ``idx`` tokenizes ``lines[idx]``, maps every token through ``vocab``
    and, for each position, pairs the ids of the previous and the next token
    with the id of the token at that position. Positions without a previous
    or next token use the id of ``unknown_token`` instead.
    """

    def __init__(self, lines, vocab, tokenizer, unknown_token):
        self.lines = lines
        self.vocab = vocab
        self.tokenizer = tokenizer
        self.unknown_token = unknown_token

    def __getitem__(self, idx):
        """Return ``(x, y)`` LongTensors for line ``idx``.

        ``x`` holds one ``[previous_id, next_id]`` row per token and ``y`` one
        ``[token_id]`` row per token. A line without tokens gives two empty
        tensors.
        """
        token_ids = [self.vocab[tok] for tok in self.tokenizer(self.lines[idx])]
        if not token_ids:
            # Nothing to pad: mirror the empty-list result without touching the vocab.
            return LongTensor([]), LongTensor([])
        boundary_id = self.vocab[self.unknown_token]
        padded = [boundary_id, *token_ids, boundary_id]
        # padded[i] is the left neighbour and padded[i + 2] the right neighbour of token i.
        neighbours = [[left, right] for left, right in zip(padded, padded[2:])]
        centres = [[tok] for tok in token_ids]
        return LongTensor(neighbours), LongTensor(centres)

    def __len__(self):
        """Number of lines in the dataset."""
        return len(self.lines)
|
||||
|
||||
|
||||
# In[7]:
|
||||
|
||||
|
||||
# Dataset over the composed training sentences; indexing it tokenizes one
# sentence and returns its (neighbour ids, centre id) tensors.
train_dataset = TrigramDataset(train_sentences, vocabulary, tokenizer, unknown_token)
|
||||
|
||||
|
||||
# In[8]:
|
||||
|
||||
|
||||
from torch import nn
|
||||
|
||||
|
||||
class LanguageModel(nn.Module):
    """Feed-forward model predicting a token from its two neighbours.

    The ids of the ``grams_count - 1`` context tokens are embedded, the
    embeddings are concatenated, and the result goes through
    Linear -> ReLU -> Linear -> Softmax. The output is therefore a probability
    distribution over the vocabulary for every input row.
    """

    # Size of the n-gram window; the model consumes grams_count - 1 context tokens.
    grams_count = 3

    def __init__(self, vocabulary_size, embedding_size, hidden_size):
        super().__init__()
        self.embedding_size = embedding_size
        self.embedding = nn.Embedding(vocabulary_size, embedding_size)
        context_width = (self.grams_count - 1) * embedding_size
        self.layers = nn.Sequential(
            nn.Linear(context_width, hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size, vocabulary_size),
            nn.Softmax(dim=1),
        )

    def forward(self, x):
        """Map context-token ids ``x`` to per-row vocabulary probabilities.

        The embedded input is flattened to rows of
        ``(grams_count - 1) * embedding_size`` values, so ``x`` may be a single
        context pair or a batch of pairs.
        """
        context_width = (self.grams_count - 1) * self.embedding_size
        embedded = self.embedding(x)
        flattened = embedded.view(-1, context_width)
        return self.layers(flattened)
|
||||
|
||||
|
||||
# In[12]:
|
||||
|
||||
|
||||
from torch.optim import Adam
|
||||
from torch import log
|
||||
from tqdm import tqdm
|
||||
|
||||
|
||||
def train(model, dataset, output_file, epochs, lr=0.00007):
    """Train ``model`` on ``dataset`` and save its state dict to ``output_file``.

    Args:
        model: module whose forward pass returns probabilities (their log is
            taken before the NLL loss is computed).
        dataset: object supporting ``len()`` and integer indexing that yields
            ``(x, y)`` tensor pairs; ``y[:, 0]`` is used as the target vector.
        output_file: path the state dict is written to after every epoch, so
            the file always holds the weights of the last finished epoch.
        epochs: number of passes over ``dataset``.
        lr: Adam learning rate (defaults to the previously hard-coded value).
    """
    optimizer = Adam(model.parameters(), lr=lr)
    criterion = nn.NLLLoss()
    model.to(device)
    model.train()
    for epoch in range(epochs):
        for i, (x, y) in (bar := tqdm(enumerate(dataset), total=len(dataset))):
            x = x.to(device)
            y = y.to(device)
            optimizer.zero_grad()
            ypredicted = model(x)
            # NLLLoss expects log-probabilities; the model outputs probabilities.
            loss = criterion(log(ypredicted), y[:, 0])
            if not i % 100:
                bar.set_description(f'Epoch: {epoch}, Loss: {loss}, Batch: {i}')
            loss.backward()
            try:
                # error_if_nonfinite=True raises RuntimeError on a NaN/inf gradient norm.
                nn.utils.clip_grad_norm_(model.parameters(), 5, error_if_nonfinite=True)
            except RuntimeError:
                # Skip the update for this sample. Only the clipping call is
                # guarded, so unrelated errors from optimizer.step() are no
                # longer mislabelled as gradient overflow.
                print("Grad overflow")
                continue
            optimizer.step()

        save_model(model.state_dict(), output_file)
|
||||
|
||||
|
||||
# In[13]:
|
||||
|
||||
|
||||
# Model with a 128-unit hidden layer; the vocabulary size is taken from the built vocabulary.
embedding_size = 256
model = LanguageModel(len(vocabulary), embedding_size, 128)

# In[ ]:

# Train for 5 epochs; the state dict is saved to 'test_model'.
train(model, train_dataset, 'test_model', 5)

# In[28]:

# Same architecture with a 512-unit hidden layer, saved to 'test_model_512'.
embedding_size = 256
model_512 = LanguageModel(len(vocabulary), embedding_size, 512)
train(model_512, train_dataset, 'test_model_512', 5)

# In[ ]:

# NOTE(review): this rebinds `model` to a fresh, untrained 128-unit instance.
# When the file runs top to bottom, the instance trained above is no longer
# reachable through `model`; its weights remain only in the 'test_model' file.
embedding_size = 256
model = LanguageModel(len(vocabulary), embedding_size, 128)
|
||||
|
||||
|
||||
# In[18]:
|
||||
|
||||
|
||||
# Left/right context dictionaries for every row of the dev-0 split.
dev_input_raw = read_xz_file('challenging-america-word-gap-prediction/dev-0/in.tsv.xz')
dev_contexts = [get_contexts(t) for t in dev_input_raw]
# Same for the test-A split. Despite its name, test_sentences holds context
# dictionaries ({'left': ..., 'right': ...}), not composed sentences.
test_input_raw = read_xz_file('challenging-america-word-gap-prediction/test-A/in.tsv.xz')
test_sentences = [get_contexts(t) for t in test_input_raw]
|
||||
|
||||
|
||||
# In[15]:
|
||||
|
||||
|
||||
from torch import load as load_model
|
||||
|
||||
# Restore the vocabulary saved during preprocessing and recreate the tokenizer.
vocabulary = load_model('vocabulary.pth')
tokenizer = get_tokenizer('basic_english')

# 'test_model' contains the state dict written by train(), not a pickled
# module, so it has to be loaded *into* an existing LanguageModel instance.
# Without this step `model` may still be the freshly constructed, untrained
# instance, which has also never been moved to `device`.
model.load_state_dict(load_model('test_model', map_location=device))
model.to(device)
model.eval()
|
||||
|
||||
|
||||
# In[16]:
|
||||
|
||||
|
||||
from torch import LongTensor, topk, log
|
||||
from tqdm import tqdm
|
||||
|
||||
|
||||
def predict_words(dataset, tokenizer, vocab, model):
    """Predict the gap word for every context entry in ``dataset``.

    Args:
        dataset: iterable of mappings with ``'left'`` and ``'right'`` text.
        tokenizer: callable turning a string into a list of tokens.
        vocab: vocabulary supporting ``vocab[token]``, ``lookup_tokens`` and
            (only needed for empty contexts) ``get_default_index``.
        model: module returning per-row vocabulary probabilities.

    Returns:
        One string per entry of the form ``"tok:prob tok:prob ... :rest"``,
        listing up to 50 most probable tokens followed by the probability mass
        not covered by them.
    """
    from torch import no_grad

    def context_index(tokens, position):
        # An empty context has no neighbouring word; use the vocabulary's
        # default index instead of raising IndexError.
        return vocab[tokens[position]] if tokens else vocab.get_default_index()

    preds = []
    # Inference only: skip building the autograd graph.
    with no_grad():
        for entry in tqdm(dataset):
            tokenized_left = tokenizer(entry['left'])
            tokenized_right = tokenizer(entry['right'])
            # [last word from left context, y, first word from right context]
            src = LongTensor([
                context_index(tokenized_left, -1),
                context_index(tokenized_right, 0),
            ]).to(device)
            output = model(src)
            # Never request more candidates than the model has outputs.
            top = topk(output[0], min(50, output.shape[-1]))
            probs = top.values.tolist()
            tokens = vocab.lookup_tokens(top.indices.tolist())
            parts = [f'{token.strip()}:{prob}' for prob, token in zip(probs, tokens)]
            # Remaining probability mass for all words that were not listed.
            parts.append(f':{1 - sum(probs)}')
            preds.append(' '.join(parts))
    return preds
|
||||
|
||||
|
||||
# In[24]:
|
||||
|
||||
|
||||
# Predictions of the 128-unit model for the dev-0 split.
preds = predict_words(dev_contexts, tokenizer, vocabulary, model)

# In[25]:

# Write with an explicit UTF-8 encoding, matching how the inputs are read;
# the platform default encoding may not be able to represent every token.
with open('challenging-america-word-gap-prediction/dev-0/out-hidden_size=128.tsv', 'w', encoding='utf-8') as f:
    f.writelines(line + '\n' for line in preds)

# In[26]:

# Predictions of the 128-unit model for the test-A split.
test_preds = predict_words(test_sentences, tokenizer, vocabulary, model)
with open('challenging-america-word-gap-prediction/test-A/out-hidden_size=128.tsv', 'w', encoding='utf-8') as f:
    f.writelines(line + '\n' for line in test_preds)
|
||||
|
||||
|
||||
# In[29]:
|
||||
|
||||
|
||||
# Evaluate the 512-unit model that is still held in memory from training.
model_512.eval()

# Each file must receive the 512-unit model's own predictions; the previous
# code wrote the 128-unit lists (`preds` / `test_preds`) into these files.
preds_512 = predict_words(dev_contexts, tokenizer, vocabulary, model_512)
with open('challenging-america-word-gap-prediction/dev-0/out-hidden_size=512.tsv', 'w', encoding='utf-8') as f:
    f.writelines(line + '\n' for line in preds_512)

test_preds_512 = predict_words(test_sentences, tokenizer, vocabulary, model_512)
with open('challenging-america-word-gap-prediction/test-A/out-hidden_size=512.tsv', 'w', encoding='utf-8') as f:
    f.writelines(line + '\n' for line in test_preds_512)
|
||||
|
7414
test-A/out-hidden_size=128.tsv
Normal file
7414
test-A/out-hidden_size=128.tsv
Normal file
File diff suppressed because it is too large
Load Diff
7414
test-A/out-hidden_size=512.tsv
Normal file
7414
test-A/out-hidden_size=512.tsv
Normal file
File diff suppressed because it is too large
Load Diff
14828
test-A/out.tsv
14828
test-A/out.tsv
File diff suppressed because it is too large
Load Diff
Loading…
Reference in New Issue
Block a user