all done
commit cef5c90195

@@ -0,0 +1,8 @@
*~
*.swp
*.bak
*.pyc
*.o
.DS_Store
.token

@@ -0,0 +1,30 @@
from torchtext.vocab import build_vocab_from_iterator
import lzma
import pickle

import scripts


def get_word_lines_from_file(file_name):
    """Yield the tokens of each line of an xz-compressed file."""
    counter = 0
    with lzma.open(file_name, 'r') as fh:
        for line in fh:
            counter += 1
            # if counter == 10000:
            #     break
            line = line.decode("utf-8")
            yield scripts.get_words_from_line(line)


vocab_size = scripts.vocab_size

# Build the vocabulary over the training corpus, capped at vocab_size,
# with '<unk>' reserved for out-of-vocabulary tokens.
vocab = build_vocab_from_iterator(
    get_word_lines_from_file('train/in.tsv.xz'),
    max_tokens=vocab_size,
    specials=['<unk>'])

with open("vocab.pickle", 'wb') as handle:
    pickle.dump(vocab, handle)
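For reference, a minimal sketch (not part of this commit) of reading the pickled vocabulary back and querying it; it assumes only the vocab.pickle written above:

import pickle

# Load the torchtext Vocab object serialized by the script above.
with open("vocab.pickle", 'rb') as handle:
    vocab = pickle.load(handle)

# Map out-of-vocabulary tokens to '<unk>' before any lookups.
vocab.set_default_index(vocab['<unk>'])

print(len(vocab))                      # number of kept tokens (at most scripts.vocab_size)
print(vocab.forward(['the']))          # token -> index
print(vocab.lookup_tokens([0, 1, 2]))  # index -> token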
8 file diffs suppressed (too large); 1 binary file not shown.

@@ -0,0 +1,14 @@
description: Class 8
tags:
  - trigram
  - neural-network
  - hidden-layer
  - hyperparameters
params:
  epochs: 1,2,3,4
  learning-rate: 0.0001,0.00001
  batch-size: 6400,12800
  training-set: 100000-lines
links:
  - title: "Git WMI"
    url: "https://git.wmi.amu.edu.pl/s444463/neural_word_gap"
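The comma-separated values under params describe a small grid of training runs. A hedged sketch of expanding that grid (the variable names here are illustrative, not from the repo):

from itertools import product

# Value lists copied from the params block above.
epochs_options = [1, 2, 3, 4]
learning_rates = [0.0001, 0.00001]
batch_sizes = [6400, 12800]

for epochs, lr, batch_size in product(epochs_options, learning_rates, batch_sizes):
    print(f"run: epochs={epochs} lr={lr} batch-size={batch_size}")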

@@ -0,0 +1,107 @@
from torch import nn
import torch

import lzma
import regex as re
import pickle
import scripts
import os

os.environ["CUDA_VISIBLE_DEVICES"] = "1"


class SimpleTrigramNeuralLanguageModel(nn.Module):
    def __init__(self, vocabulary_size, embedding_size):
        super(SimpleTrigramNeuralLanguageModel, self).__init__()
        self.embedings = nn.Embedding(vocabulary_size, embedding_size)
        self.linear = nn.Linear(embedding_size * 2, vocabulary_size)

        self.linear_first_layer = nn.Linear(embedding_size * 2, embedding_size * 2)
        self.relu = nn.ReLU()
        self.softmax = nn.Softmax(dim=1)

        # self.model = nn.Sequential(
        #     nn.Embedding(vocabulary_size, embedding_size),
        #     nn.Linear(embedding_size, vocabulary_size),
        #     nn.Softmax()
        # )

    def forward(self, x):
        # x is a pair of index tensors: the word before and the word after the gap.
        emb_1 = self.embedings(x[0])
        emb_2 = self.embedings(x[1])

        first_layer = self.linear_first_layer(torch.cat((emb_1, emb_2), dim=1))
        after_relu = self.relu(first_layer)
        concated = self.linear(after_relu)

        y = self.softmax(concated)

        return y


vocab_size = scripts.vocab_size
embed_size = 100
device = 'cuda'

model = SimpleTrigramNeuralLanguageModel(vocab_size, embed_size).to(device)

model.load_state_dict(torch.load('batch_model_epoch_0.bin'))
model.eval()

with open("vocab.pickle", 'rb') as handle:
    vocab = pickle.load(handle)
vocab.set_default_index(vocab['<unk>'])


with lzma.open('dev-0/in.tsv.xz', 'rb') as file:
    for line in file:
        line = line.decode('utf-8')
        line = line.rstrip()
        # line = line.lower()
        # Strip escaped newline markers left in the raw text.
        line = line.replace("\\\\n", ' ')

        # The last two TSV fields hold the left and right context of the gap.
        line_splitted = line.split('\t')[-2:]

        prev_word = list(scripts.get_words_from_line(line_splitted[0]))[-1]
        next_word = list(scripts.get_words_from_line(line_splitted[1]))[0]

        x = torch.tensor(vocab.forward([prev_word]))
        z = torch.tensor(vocab.forward([next_word]))
        x = x.to(device)
        z = z.to(device)
        ypredicted = model([x, z])

        try:
            top = torch.topk(ypredicted[0], 128)
        except RuntimeError:
            print(ypredicted[0])
            raise
        top_indices = top.indices.tolist()
        top_probs = top.values.tolist()
        top_words = vocab.lookup_tokens(top_indices)

        string_to_print = ''
        sum_probs = 0

        for w, p in zip(top_words, top_probs):
            if '<unk>' in w:
                continue
            if re.search(r'\p{L}+', w):
                string_to_print += f"{w}:{p} "
                sum_probs += p
        if string_to_print == '':
            # Fallback distribution when no alphabetic candidate survives.
            print("the:0.2 a:0.3 :0.5")
            continue
        unknown_prob = 1 - sum_probs
        string_to_print += f":{unknown_prob}"

        print(string_to_print)
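Each dev-0 input line thus produces one output line of word:probability pairs over the top-128 candidates, with the leftover probability mass assigned to the empty (unknown) token, e.g. (values illustrative only):

the:0.102 a:0.047 of:0.031 :0.82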
7 file diffs suppressed (too large); 1 binary file not shown.

@@ -0,0 +1,124 @@
from torch import nn
import torch
from torch.utils.data import IterableDataset, DataLoader

import itertools
import lzma
import pickle
import scripts


def look_ahead_iterator(gen):
    """Turn a stream of tokens into a stream of (prev, current, next) trigrams."""
    prev = None
    current = None
    for next_token in gen:
        if prev is not None and current is not None:
            yield (prev, current, next_token)
        prev = current
        current = next_token


def get_word_lines_from_file(file_name):
    counter = 0
    with lzma.open(file_name, 'r') as fh:
        for line in fh:
            counter += 1
            if counter == 100000:
                break
            line = line.decode("utf-8")
            yield scripts.get_words_from_line(line)


class Trigrams(IterableDataset):
    def load_vocab(self):
        with open("vocab.pickle", 'rb') as handle:
            vocab = pickle.load(handle)
        return vocab

    def __init__(self, text_file, vocabulary_size):
        self.vocab = self.load_vocab()
        self.vocab.set_default_index(self.vocab['<unk>'])
        self.vocabulary_size = vocabulary_size
        self.text_file = text_file

    def __iter__(self):
        return look_ahead_iterator(
            (self.vocab[t] for t in itertools.chain.from_iterable(
                get_word_lines_from_file(self.text_file))))


vocab_size = scripts.vocab_size

train_dataset = Trigrams('train/in.tsv.xz', vocab_size)


# === training

embed_size = 100


class SimpleTrigramNeuralLanguageModel(nn.Module):
    def __init__(self, vocabulary_size, embedding_size):
        super(SimpleTrigramNeuralLanguageModel, self).__init__()
        self.embedings = nn.Embedding(vocabulary_size, embedding_size)
        self.linear = nn.Linear(embedding_size * 2, vocabulary_size)

        self.linear_first_layer = nn.Linear(embedding_size * 2, embedding_size * 2)
        self.relu = nn.ReLU()
        self.softmax = nn.Softmax(dim=1)

        # self.model = nn.Sequential(
        #     nn.Embedding(vocabulary_size, embedding_size),
        #     nn.Linear(embedding_size, vocabulary_size),
        #     nn.Softmax()
        # )

    def forward(self, x):
        # x is a pair of index tensors: the word before and the word after the gap.
        emb_1 = self.embedings(x[0])
        emb_2 = self.embedings(x[1])

        first_layer = self.linear_first_layer(torch.cat((emb_1, emb_2), dim=1))
        after_relu = self.relu(first_layer)
        concated = self.linear(after_relu)

        y = self.softmax(concated)

        return y


vocab = train_dataset.vocab

device = 'cuda'
model = SimpleTrigramNeuralLanguageModel(vocab_size, embed_size).to(device)
data = DataLoader(train_dataset, batch_size=12800)
optimizer = torch.optim.Adam(model.parameters(), lr=scripts.learning_rate)
criterion = torch.nn.NLLLoss()

model.train()
step = 0
epochs = 4
for i in range(epochs):
    for x, y, z in data:
        # x and z are the outer words of the trigram; y is the middle word to predict.
        x = x.to(device)
        y = y.to(device)
        z = z.to(device)
        optimizer.zero_grad()
        ypredicted = model([x, z])
        loss = criterion(torch.log(ypredicted), y)
        if step % 2000 == 0:
            print(step, loss)
            # torch.save(model.state_dict(), f'model1_{step}.bin')
        step += 1
        loss.backward()
        optimizer.step()
    torch.save(model.state_dict(), f'batch_model_epoch_{i}.bin')
    print(step, loss, f'model_epoch_{i}.bin')

torch.save(model.state_dict(), 'model_tri1.bin')
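One caveat about the loss above: torch.log of a Softmax output can underflow to -inf for near-zero probabilities. A numerically safer, mathematically equivalent formulation (a sketch, not the code in this commit) keeps the same NLLLoss but uses LogSoftmax:

import torch
from torch import nn

log_softmax = nn.LogSoftmax(dim=1)
criterion = nn.NLLLoss()

# Stand-ins for the final linear layer's output and the middle-word targets.
logits = torch.randn(4, 10)
targets = torch.tensor([1, 2, 3, 0])

# Equivalent to criterion(torch.log(softmax(logits)), targets), but stable.
loss = criterion(log_softmax(logits), targets)
print(loss)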