zad 10 done

This commit is contained in:
Mikolaj 2023-06-04 17:07:15 +02:00
parent 6b714b7556
commit 961dbf8b99
9 changed files with 18338 additions and 18511 deletions

View File

@ -1,30 +0,0 @@
from itertools import islice
import regex as re
import sys
from torchtext.vocab import build_vocab_from_iterator
import lzma
import scripts
def get_word_lines_from_file(file_name):
    """Stream the words of an xz-compressed UTF-8 text file, line by line.

    For every line of ``file_name`` the decoded text is handed to
    ``scripts.get_words_from_line`` and that helper's result is yielded.
    """
    with lzma.open(file_name, 'r') as compressed:
        for raw_line in compressed:
            yield scripts.get_words_from_line(raw_line.decode("utf-8"))
# Build the vocabulary from the training text and persist it for the other
# scripts, which load it back from "vocab.pickle".
import pickle

vocab_size = scripts.vocab_size
training_word_lines = get_word_lines_from_file('train/in.tsv.xz')
vocab = build_vocab_from_iterator(
    training_word_lines,
    max_tokens=vocab_size,
    specials=['<unk>'],
)
with open("vocab.pickle", 'wb') as handle:
    pickle.dump(vocab, handle)

File diff suppressed because it is too large Load Diff

104
generator.py Normal file
View File

@ -0,0 +1,104 @@
import torch
from torch import nn, optim
from torch.utils.data import DataLoader
import numpy as np
from collections import Counter
import string
import lzma
import pdb
import copy
from torch.utils.data import IterableDataset
import itertools
import lzma
import regex as re
import pickle
import string
import pdb
import utils
import os
# Expose only GPU 0 to this process; set before any CUDA work is done.
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
device = 'cuda'
vocab_size = utils.vocab_size

# NOTE: unpickling can execute arbitrary code - only load a vocab.pickle
# that was produced locally.
with open("vocab.pickle", 'rb') as handle:
    vocab = pickle.load(handle)
# Words missing from the vocabulary map to the '<unk>' index.
unk_index = vocab['<unk>']
vocab.set_default_index(unk_index)
class Model(nn.Module):
    """Single-layer bidirectional LSTM language model over vocabulary indices.

    Attribute names (``embedding``, ``lstm``, ``fc``) are part of the saved
    ``state_dict`` layout and must not be renamed.

    NOTE(review): the LSTM is bidirectional, so each position's output also
    depends on later tokens of the input sequence - confirm this is intended
    for next-word prediction.
    """

    def __init__(self, vocab_size):
        """Create the layers for a vocabulary of ``vocab_size`` entries."""
        super().__init__()
        self.lstm_size = 150
        self.embedding_dim = 200
        self.num_layers = 1
        self.embedding = nn.Embedding(
            num_embeddings=vocab_size,
            embedding_dim=self.embedding_dim,
        )
        self.lstm = nn.LSTM(
            input_size=self.embedding_dim,
            hidden_size=self.lstm_size,
            num_layers=self.num_layers,
            batch_first=True,
            bidirectional=True,
        )
        # Forward and backward outputs are concatenated, hence lstm_size * 2.
        self.fc = nn.Linear(self.lstm_size * 2, vocab_size)

    def forward(self, x, prev_state=None):
        """Return ``(logits, state)`` for a batch of index sequences.

        ``x`` holds vocabulary indices laid out batch-first (the LSTM is built
        with ``batch_first=True``); ``prev_state`` is an optional ``(h, c)``
        pair, with ``None`` letting the LSTM start from zeros.
        """
        embed = self.embedding(x)
        output, state = self.lstm(embed, prev_state)
        logits = self.fc(output)
        return logits, state

    def init_state(self, sequence_length):
        """Return a zero ``(h, c)`` pair on the same device as the model.

        NOTE: despite its name, ``sequence_length`` sizes the second dimension
        of the state tensors, which ``nn.LSTM`` treats as the batch size.
        """
        # Use the model's own device instead of a module-level global so the
        # state always matches the weights (GPU or CPU).
        state_device = self.embedding.weight.device
        shape = (self.num_layers * 2, sequence_length, self.lstm_size)
        return (torch.zeros(shape, device=state_device),
                torch.zeros(shape, device=state_device))
model = Model(vocab_size=vocab_size).to(device)
# map_location makes loading independent of the device the checkpoint was
# saved from.
model.load_state_dict(torch.load('lstm_step_10000.bin', map_location=device))
model.eval()
def predict(model, text_splitted):
    """Sample a next word for the tokenised prompt ``text_splitted``.

    The words are mapped to vocabulary indices and run through ``model`` as a
    single sequence. The 10 most probable tokens at the last position are
    looked up, '<unk>' is dropped if present, and one of the remaining tokens
    is chosen uniformly at random (the probabilities are not used as weights).
    """
    model.eval()
    x = torch.tensor([[vocab[w] for w in text_splitted]]).to(device)
    # Inference only: no autograd bookkeeping is needed.
    with torch.no_grad():
        initial_state = model.init_state(x.size(0))
        y_pred, _ = model(x, initial_state)
    last_word_logits = y_pred[0][-1]
    p = torch.nn.functional.softmax(last_word_logits, dim=0)
    top = torch.topk(p, 10)
    top_words = vocab.lookup_tokens(top.indices.tolist())
    if '<unk>' in top_words:
        top_words.remove('<unk>')
    return np.random.choice(top_words)
prompts = [
    'These, and a thousand other means, by which the wealth of a nation may be greatly increase',
    'Pants, coat and vest of the latest styles, are provided. Whenever the fires need coaling,',
    'Mr. Deddrick intends to clothe it and\ngive it as nearly as possible a likeness'
]
for p in prompts:
    # Generate 10 words, feeding every sampled word back into the context so
    # each prediction is conditioned on the words generated so far (the prompt
    # alone was previously passed unchanged on every iteration).
    context = p.split()
    answer = ''
    for _ in range(10):
        # predict() returns a numpy string; convert it for the vocab lookup.
        next_word = str(predict(model, context))
        context.append(next_word)
        answer += next_word + ' '
    print('Prompt: ', p)
    print('Answer: ', answer)
# Sample output kept from the original file (presumably produced before the
# generated words were fed back into the context):
# Prompt: These, and a thousand other means, by which the wealth of a nation may be greatly increase
# Answer: as the of as and to in to for in
# Prompt: Pants, coat and vest of the latest styles, are provided. Whenever the fires need coaling,
# Answer: in that The a the of the to the for
# Prompt: Mr. Deddrick intends to clothe it and
# give it as nearly as possible a likeness
# Answer: and of\nthe for man in of\nthe and of man of

View File

@ -1,107 +1,118 @@
from torch import nn
import torch import torch
from torch import nn, optim
from torch.utils.data import DataLoader
import numpy as np
from collections import Counter
import string
import lzma
import pdb
import copy
from torch.utils.data import IterableDataset from torch.utils.data import IterableDataset
import itertools import itertools
import lzma import lzma
import regex as re import regex as re
import pickle import pickle
import scripts import string
import pdb
import utils
import os import os
os.environ["CUDA_VISIBLE_DEVICES"] = "1" os.environ["CUDA_VISIBLE_DEVICES"] = "0"
class SimpleTrigramNeuralLanguageModel(nn.Module):
def __init__(self, vocabulary_size, embedding_size):
super(SimpleTrigramNeuralLanguageModel, self).__init__()
self.embedings = nn.Embedding(vocabulary_size, embedding_size)
self.linear = nn.Linear(embedding_size*2, vocabulary_size)
self.linear_first_layer = nn.Linear(embedding_size*2, embedding_size*2)
self.relu = nn.ReLU()
self.softmax = nn.Softmax()
# self.model = nn.Sequential(
# nn.Embedding(vocabulary_size, embedding_size),
# nn.Linear(embedding_size, vocabulary_size),
# nn.Softmax()
# )
def forward(self, x):
emb_1 = self.embedings(x[0])
emb_2 = self.embedings(x[1])
first_layer = self.linear_first_layer(torch.cat((emb_1, emb_2), dim=1))
after_relu = self.relu(first_layer)
concated = self.linear(after_relu)
y = self.softmax(concated)
return y
vocab_size = scripts.vocab_size
embed_size = 100
device = 'cuda' device = 'cuda'
model = SimpleTrigramNeuralLanguageModel(vocab_size, embed_size).to(device) vocab_size = utils.vocab_size
model.load_state_dict(torch.load('batch_model_epoch_0.bin'))
model.eval()
with open("vocab.pickle", 'rb') as handle: with open("vocab.pickle", 'rb') as handle:
vocab = pickle.load(handle) vocab = pickle.load( handle)
vocab.set_default_index(vocab['<unk>']) vocab.set_default_index(vocab['<unk>'])
class Model(nn.Module):
def __init__(self, vocab_size):
super(Model, self).__init__()
self.lstm_size = 150
self.embedding_dim = 200
self.num_layers = 1
self.embedding = nn.Embedding(
num_embeddings=vocab_size,
embedding_dim=self.embedding_dim,
)
self.lstm = nn.LSTM(
input_size=self.embedding_dim,
hidden_size=self.lstm_size,
num_layers=self.num_layers,
batch_first=True,
bidirectional=True,
# dropout=0.2,
)
self.fc = nn.Linear(self.lstm_size*2, vocab_size)
def forward(self, x, prev_state = None):
embed = self.embedding(x)
output, state = self.lstm(embed, prev_state)
logits = self.fc(output)
return logits, state
def init_state(self, sequence_length):
return (torch.zeros(self.num_layers*2, sequence_length, self.lstm_size).to(device),
torch.zeros(self.num_layers*2, sequence_length, self.lstm_size).to(device))
step = 0 model = Model(vocab_size = vocab_size).to(device)
model.load_state_dict(torch.load('lstm_step_10000.bin'))
model.eval()
def predict(model, text_splitted):
model.eval()
words = text_splitted
x = torch.tensor([[vocab[w] for w in words]]).to(device)
state_h, state_c = model.init_state(x.size()[0])
y_pred, (state_h, state_c) = model(x, (state_h, state_c))
with lzma.open('dev-0/in.tsv.xz', 'rb') as file: last_word_logits = y_pred[0][-1]
for line in file: p = torch.nn.functional.softmax(last_word_logits, dim=0)
line = line.decode('utf-8')
line = line.rstrip()
# line = line.lower()
line = line.replace("\\\\n", ' ')
top = torch.topk(p, 64)
line_splitted = line.split('\t')[-2:]
prev = list(scripts.get_words_from_line(line_splitted[0]))[-1]
next = list(scripts.get_words_from_line(line_splitted[1]))[0]
# prev = line[0].split(' ')[-1]
# next = line[1].split(' ')[0]
x = torch.tensor(vocab.forward([prev]))
z = torch.tensor(vocab.forward([next]))
x = x.to(device)
z = z.to(device)
ypredicted = model([x, z])
try:
top = torch.topk(ypredicted[0], 128)
except:
print(ypredicted[0])
raise Exception('aa')
top_indices = top.indices.tolist() top_indices = top.indices.tolist()
top_probs = top.values.tolist() top_probs = top.values.tolist()
top_words = vocab.lookup_tokens(top_indices) top_words = vocab.lookup_tokens(top_indices)
return top_words, top_probs
inference_result = []
with lzma.open(f'test-A/in.tsv.xz', 'r') as file:
for line in file:
line = line.decode("utf-8")
line = line.rstrip()
line = line.translate(str.maketrans('', '', string.punctuation))
line_splitted_by_tab = line.split('\t')
left_context = line_splitted_by_tab[-2]
left_context_splitted = list(utils.get_words_from_line(left_context))
top_words, top_probs = predict(model, left_context_splitted)
string_to_print = '' string_to_print = ''
sum_probs = 0
sum_probs = 0
for w, p in zip(top_words, top_probs): for w, p in zip(top_words, top_probs):
# print(top_words)
if '<unk>' in w: if '<unk>' in w:
continue continue
if re.search(r'\p{L}+', w):
string_to_print += f"{w}:{p} " string_to_print += f"{w}:{p} "
sum_probs += p sum_probs += p
if string_to_print == '': if string_to_print == '':
print(f"the:0.2 a:0.3 :0.5") inference_result.append("the:0.2 a:0.3 :0.5")
continue continue
unknow_prob = 1 - sum_probs unknow_prob = 1 - sum_probs
string_to_print += f":{unknow_prob}" string_to_print += f":{unknow_prob}"
print(string_to_print) inference_result.append(string_to_print)
with open('test-A/out.tsv', 'w') as f:
for line in inference_result:
f.write(line+'\n')
print('All done')

189
lstm.py Normal file
View File

@ -0,0 +1,189 @@
import torch
from torch import nn, optim
from torch.utils.data import DataLoader
import numpy as np
from collections import Counter
import string
import lzma
import pdb
import copy
from torch.utils.data import IterableDataset
import itertools
import lzma
import regex as re
import pickle
import string
import pdb
import utils
import os
# Expose only GPU 1 to this process; set before any CUDA work is done.
os.environ["CUDA_VISIBLE_DEVICES"] = "1"
device = 'cuda'

# NOTE: unpickling can execute arbitrary code - only load a vocab.pickle
# that was produced locally.
with open("vocab.pickle", 'rb') as handle:
    vocab = pickle.load(handle)
# Words missing from the vocabulary map to the '<unk>' index.
unk_index = vocab['<unk>']
vocab.set_default_index(unk_index)
def get_word_lines_from_file(file_name, seq_len=10):
    """Yield ``(input, target)`` index tensors for every window of a text file.

    Each line of the xz-compressed UTF-8 file is split with
    ``utils.get_words_from_line`` and mapped through the module-level
    ``vocab``. For every start offset within a line, the input is ``seq_len``
    consecutive indices and the target is the same window shifted right by
    one. Windows never span two lines, and lines with at most ``seq_len``
    words produce nothing.

    Args:
        file_name: path to the ``.xz`` file.
        seq_len: window length; defaults to the previously hard-coded 10.
    """
    with lzma.open(file_name, 'r') as fh:
        for line in fh:
            words = utils.get_words_from_line(line.decode("utf-8"))
            vocab_line = [vocab[t] for t in words]
            for i in range(len(vocab_line) - seq_len):
                # seq_len + 1 indices: all but the last are the input,
                # all but the first are the target.
                window = vocab_line[i:i + seq_len + 1]
                yield torch.tensor(window[:-1]), torch.tensor(window[1:])
class Grams_10(IterableDataset):
    """Iterable dataset streaming the samples of ``get_word_lines_from_file``."""

    def __init__(self, text_file, vocab):
        """Remember the source file and make ``vocab`` fall back to '<unk>'."""
        unk_index = vocab['<unk>']
        vocab.set_default_index(unk_index)
        self.vocab = vocab
        self.text_file = text_file

    def __iter__(self):
        """Start a fresh pass over ``self.text_file``."""
        return get_word_lines_from_file(self.text_file)
# Vocabulary size shared through the utils module.
vocab_size = utils.vocab_size
# Streaming dataset over the compressed training text.
train_dataset = Grams_10('train/in.tsv.xz', vocab)
# Samples per batch; passed to the DataLoader further down.
BATCH_SIZE = 1024
class Model(nn.Module):
    """Single-layer bidirectional LSTM language model over vocabulary indices.

    Attribute names (``embedding``, ``lstm``, ``fc``) are part of the saved
    ``state_dict`` layout and must not be renamed.

    NOTE(review): the LSTM is bidirectional, so each position's output also
    depends on later tokens of the input, while the training targets are the
    input shifted by one - confirm this is intended.
    """

    def __init__(self, vocab_size):
        """Create the layers for a vocabulary of ``vocab_size`` entries."""
        super().__init__()
        self.lstm_size = 150
        self.embedding_dim = 200
        self.num_layers = 1
        self.embedding = nn.Embedding(
            num_embeddings=vocab_size,
            embedding_dim=self.embedding_dim,
        )
        self.lstm = nn.LSTM(
            input_size=self.embedding_dim,
            hidden_size=self.lstm_size,
            num_layers=self.num_layers,
            batch_first=True,
            bidirectional=True,
        )
        # Forward and backward outputs are concatenated, hence lstm_size * 2.
        self.fc = nn.Linear(self.lstm_size * 2, vocab_size)

    def forward(self, x, prev_state=None):
        """Return ``(logits, state)`` for a batch of index sequences.

        ``x`` holds vocabulary indices laid out batch-first (the LSTM is built
        with ``batch_first=True``); ``prev_state`` is an optional ``(h, c)``
        pair, with ``None`` letting the LSTM start from zeros.
        """
        embed = self.embedding(x)
        output, state = self.lstm(embed, prev_state)
        logits = self.fc(output)
        return logits, state

    def init_state(self, sequence_length):
        """Return a zero ``(h, c)`` pair on the same device as the model.

        NOTE: despite its name, ``sequence_length`` sizes the second dimension
        of the state tensors, which ``nn.LSTM`` treats as the batch size.
        """
        # Use the model's own device instead of a module-level global so the
        # state always matches the weights (GPU or CPU).
        state_device = self.embedding.weight.device
        shape = (self.num_layers * 2, sequence_length, self.lstm_size)
        return (torch.zeros(shape, device=state_device),
                torch.zeros(shape, device=state_device))
def train(dataloader, model, max_epochs):
    """Train ``model`` on ``dataloader`` for ``max_epochs`` passes.

    Uses Adam (lr=0.01) with cross-entropy loss. The loss is printed every
    500 steps; every 5000 steps it is printed again and a step checkpoint is
    saved. The step count restarts at every epoch, and an epoch checkpoint is
    saved after each pass.
    """
    model.train()
    loss_fn = nn.CrossEntropyLoss()
    adam = optim.Adam(model.parameters(), lr=0.01)
    for epoch in range(max_epochs):
        for step, (x, y) in enumerate(dataloader, start=1):
            x, y = x.to(device), y.to(device)
            adam.zero_grad()
            y_pred, _ = model(x)
            # Swap the last two axes so the per-token class scores sit on
            # dimension 1, where CrossEntropyLoss expects them.
            loss = loss_fn(y_pred.transpose(1, 2), y)
            loss.backward()
            adam.step()
            if step % 500 == 0:
                print({'epoch': epoch, 'step': step, 'loss': loss.item()})
            if step % 5000 == 0:
                print({'epoch': epoch, 'step': step, 'loss': loss.item()})
                torch.save(model.state_dict(), f'lstm_step_{step}.bin')
        torch.save(model.state_dict(), f'lstm_epoch_{epoch}.bin')
# Startup message (Polish, roughly "Hello, starting training").
print('Halko zaczynamy trenowanie')
# Fresh, untrained model on the configured device.
model = Model(vocab_size = vocab_size).to(device)
# Batches the streaming training dataset.
dataset = DataLoader(train_dataset, batch_size=BATCH_SIZE)
# Train for a single epoch.
train(dataset, model, 1)
# Final weights, in addition to the checkpoints written inside train().
torch.save(model.state_dict(), f'lstm.bin')
# def predict(model, text_splitted):
# model.eval()
# words = text_splitted
# x = torch.tensor([[vocab[w] for w in words]]).to(device)
# state_h, state_c = model.init_state(x.size()[0])
# y_pred, (state_h, state_c) = model(x, (state_h, state_c))
# last_word_logits = y_pred[0][-1]
# p = torch.nn.functional.softmax(last_word_logits, dim=0)
# top = torch.topk(p, 64)
# top_indices = top.indices.tolist()
# top_probs = top.values.tolist()
# top_words = vocab.lookup_tokens(top_indices)
# return top_words, top_probs
# print('Halko zaczynamy predykcje')
# inference_result = []
# with lzma.open(f'dev-0/in.tsv.xz', 'r') as file:
# for line in file:
# line = line.decode("utf-8")
# line = line.rstrip()
# line = line.translate(str.maketrans('', '', string.punctuation))
# line_splitted_by_tab = line.split('\t')
# left_context = line_splitted_by_tab[-2]
# left_context_splitted = list(utils.get_words_from_line(left_context))
# top_words, top_probs = predict(model, left_context_splitted)
# string_to_print = ''
# sum_probs = 0
# for w, p in zip(top_words, top_probs):
# # print(top_words)
# if '<unk>' in w:
# continue
# string_to_print += f"{w}:{p} "
# sum_probs += p
# if string_to_print == '':
# inference_result.append("the:0.2 a:0.3 :0.5")
# continue
# unknow_prob = 1 - sum_probs
# string_to_print += f":{unknow_prob}"
# inference_result.append(string_to_print)
# with open('dev-0/out.tsv', 'w') as f:
# for line in inference_result:
# f.write(line+'\n')
print('All done')

File diff suppressed because it is too large Load Diff

124
train.py
View File

@ -1,124 +0,0 @@
from torch import nn
import torch
from torch.utils.data import IterableDataset
import itertools
import lzma
import regex as re
import pickle
import scripts
def look_ahead_iterator(gen):
    """Yield every run of three consecutive items of ``gen`` as a tuple.

    For items ``a, b, c, d`` this yields ``(a, b, c)`` then ``(b, c, d)``.
    Fewer than three items yield nothing. Unlike a ``None``-sentinel
    implementation, items that are themselves ``None`` are handled like any
    other value, and the builtin ``next`` is not shadowed.
    """
    prev_items, current_items, next_items = itertools.tee(gen, 3)
    # Offset the second copy by one item and the third by two, so zipping the
    # three copies produces the sliding window.
    next(current_items, None)
    next(next_items, None)
    next(next_items, None)
    yield from zip(prev_items, current_items, next_items)
def get_word_lines_from_file(file_name):
    """Stream the words of the first 99,999 lines of an xz-compressed file.

    Every decoded line is handed to ``scripts.get_words_from_line`` and that
    helper's result is yielded; reading stops before the 100,000th line.
    """
    with lzma.open(file_name, 'r') as compressed:
        for raw_line in itertools.islice(compressed, 99999):
            yield scripts.get_words_from_line(raw_line.decode("utf-8"))
class Trigrams(IterableDataset):
    """Iterable dataset of vocabulary-index trigrams read from a text file."""

    def load_vocab(self):
        """Return the vocabulary unpickled from "vocab.pickle"."""
        with open("vocab.pickle", 'rb') as handle:
            return pickle.load(handle)

    def __init__(self, text_file, vocabulary_size):
        """Load the vocabulary (unknown words map to '<unk>') and store the arguments."""
        loaded_vocab = self.load_vocab()
        loaded_vocab.set_default_index(loaded_vocab['<unk>'])
        self.vocab = loaded_vocab
        self.vocabulary_size = vocabulary_size
        self.text_file = text_file

    def __iter__(self):
        """Return an iterator of trigrams over the whole file's word stream."""
        word_lines = get_word_lines_from_file(self.text_file)
        # Lines are chained together, so trigrams may span line boundaries.
        token_ids = (self.vocab[t] for t in itertools.chain.from_iterable(word_lines))
        return look_ahead_iterator(token_ids)
# Vocabulary size comes from the shared scripts module.
vocab_size = scripts.vocab_size
# Streaming trigram dataset over the compressed training text.
train_dataset = Trigrams('train/in.tsv.xz', vocab_size)
#=== training
from torch import nn
import torch
from torch.utils.data import DataLoader
# Embedding dimension passed to the model constructor below.
embed_size = 100
class SimpleTrigramNeuralLanguageModel(nn.Module):
    """Feed-forward model producing a vocabulary distribution from two words.

    The two input words are embedded with a shared table, concatenated, passed
    through one hidden ReLU layer and projected onto the vocabulary. Attribute
    names (including the ``embedings`` spelling) are part of the saved
    ``state_dict`` layout and must not be renamed.
    """

    def __init__(self, vocabulary_size, embedding_size):
        """Create the layers for the given vocabulary and embedding sizes."""
        super().__init__()
        self.embedings = nn.Embedding(vocabulary_size, embedding_size)
        self.linear = nn.Linear(embedding_size * 2, vocabulary_size)
        self.linear_first_layer = nn.Linear(embedding_size * 2, embedding_size * 2)
        self.relu = nn.ReLU()
        # Explicit dim: the implicit choice is deprecated. dim=1 is what the
        # implicit rule picked for the 2-D tensor produced in forward().
        self.softmax = nn.Softmax(dim=1)

    def forward(self, x):
        """Return probabilities over the vocabulary for each batch element.

        ``x`` is a pair ``(first, second)`` of index tensors; their embeddings
        are concatenated along dimension 1.
        """
        emb_1 = self.embedings(x[0])
        emb_2 = self.embedings(x[1])
        first_layer = self.linear_first_layer(torch.cat((emb_1, emb_2), dim=1))
        after_relu = self.relu(first_layer)
        concated = self.linear(after_relu)
        return self.softmax(concated)
vocab = train_dataset.vocab
device = 'cuda'
# Build the model once, directly on the target device (it was previously
# constructed twice, with the first instance thrown away).
model = SimpleTrigramNeuralLanguageModel(vocab_size, embed_size).to(device)
data = DataLoader(train_dataset, batch_size=12800)
optimizer = torch.optim.Adam(model.parameters(), lr=scripts.learning_rate)
# The model outputs probabilities, so NLLLoss is applied to their log below.
criterion = torch.nn.NLLLoss()
model.train()
step = 0
epochs = 4
for i in range(epochs):
    # Each batch is a trigram (x, y, z): the middle word y is predicted from
    # its neighbours x and z.
    for x, y, z in data:
        x = x.to(device)
        y = y.to(device)
        z = z.to(device)
        optimizer.zero_grad()
        ypredicted = model([x, z])
        loss = criterion(torch.log(ypredicted), y)
        if step % 2000 == 0:
            print(step, loss)
        step += 1
        loss.backward()
        optimizer.step()
    # Save one checkpoint per epoch and log the name actually written.
    checkpoint_name = f'batch_model_epoch_{i}.bin'
    torch.save(model.state_dict(), checkpoint_name)
    print(step, loss, checkpoint_name)
torch.save(model.state_dict(), 'model_tri1.bin')

25
utils.py Normal file
View File

@ -0,0 +1,25 @@
import regex as re
import string
from torch import nn
import torch
from torch.utils.data import DataLoader
from torch.utils.data import IterableDataset
import itertools
import lzma
import regex as re
import pickle
import string
def get_words_from_line(line):
    """Yield the whitespace-separated tokens of ``line`` in order.

    Leading and trailing whitespace is ignored; no sentence boundary markers
    are emitted.
    """
    yield from line.strip().split()
# Vocabulary size, read by the other scripts as utils.vocab_size.
vocab_size = 20000
# Torch device name. NOTE(review): hard-coded to CUDA - confirm a GPU is
# always available where this module is used.
device = 'cuda'

View File

@ -1,348 +0,0 @@
from torch import nn
import torch
from torch.utils.data import DataLoader
import copy
from torch.utils.data import IterableDataset
import itertools
import lzma
import regex as re
import pickle
import scripts
import string
import pdb
import utils
def divide_chunks(l, n):
    """Yield consecutive slices of ``l`` holding ``n`` items each.

    The final slice is shorter when ``len(l)`` is not a multiple of ``n``.
    """
    yield from (l[offset:offset + n] for offset in range(0, len(l), n))
# NOTE: unpickling can execute arbitrary code - only load a vocab.pickle
# that was produced locally.
with open("vocab.pickle", 'rb') as handle:
    vocab = pickle.load(handle)
# Words missing from the vocabulary map to the '<unk>' index.
unk_index = vocab['<unk>']
vocab.set_default_index(unk_index)
def look_ahead_iterator(gen):
    """Yield consecutive, non-overlapping lists of 11 items from ``gen``.

    The training loop slices each chunk into 10 context items plus one
    target. A trailing remainder shorter than 11 items is dropped. The
    previous modulo-counter logic discarded the first 12 items of the stream
    before emitting its first chunk; every chunk is now emitted from the
    start.
    """
    chunk_len = 11
    seq = []
    for item in gen:
        seq.append(item)
        if len(seq) == chunk_len:
            yield seq
            seq = []
def get_word_lines_from_file(file_name):
    """Stream the words of an xz-compressed UTF-8 text file, line by line.

    For every line of ``file_name`` the decoded text is handed to
    ``scripts.get_words_from_line`` and that helper's result is yielded.
    """
    with lzma.open(file_name, 'r') as compressed:
        for raw_line in compressed:
            yield scripts.get_words_from_line(raw_line.decode("utf-8"))
class Grams_10(IterableDataset):
    """Iterable dataset of fixed-size index chunks read from a text file."""

    def load_vocab(self):
        """Return the vocabulary unpickled from "vocab.pickle"."""
        with open("vocab.pickle", 'rb') as handle:
            return pickle.load(handle)

    def __init__(self, text_file, vocab):
        """Remember the source file and make ``vocab`` fall back to '<unk>'."""
        unk_index = vocab['<unk>']
        vocab.set_default_index(unk_index)
        self.vocab = vocab
        self.text_file = text_file

    def __iter__(self):
        """Return the chunk iterator over the whole file's word stream."""
        word_lines = get_word_lines_from_file(self.text_file)
        # Lines are chained together, so chunks may span line boundaries.
        token_ids = (self.vocab[t] for t in itertools.chain.from_iterable(word_lines))
        return look_ahead_iterator(token_ids)
# Vocabulary size comes from the shared scripts module.
vocab_size = scripts.vocab_size
# Streaming dataset over the compressed training text.
train_dataset = Grams_10('train/in.tsv.xz', vocab)
# Samples per training batch.
BATCH_SIZE = 2048
train_data = DataLoader(train_dataset, batch_size=BATCH_SIZE)
# Directory names of the training and validation splits.
PREFIX_TRAIN = 'train'
PREFIX_VALID = 'dev-0'
# NOTE(review): not referenced anywhere in the visible code - confirm it can go.
BATCHES = []
# def read_train_file(folder_prefix, vocab):
# dataset_x = []
# dataset_y = []
# counter_lines = 0
# seq_len = 10
# with lzma.open(f'{folder_prefix}/in.tsv.xz', 'r') as train, open(f'{folder_prefix}/expected.tsv', 'r') as expected:
# for t_line, e_line in zip(train, expected):
# t_line = t_line.decode("utf-8")
# t_line = t_line.rstrip()
# e_line = e_line.rstrip()
# t_line = t_line.translate(str.maketrans('', '', string.punctuation))
# t_line_splitted_by_tab = t_line.split('\t')
# # t_line_cleared = t_line_splitted_by_tab[-2] + ' ' + e_line + ' ' + t_line_splitted_by_tab[-1]
# whole_line = t_line_splitted_by_tab[-2] + ' ' + e_line + ' ' + t_line_splitted_by_tab[-1]
# whole_line_splitted = list(scripts.get_words_from_line(whole_line))
# whole_lines_splitted = divide_chunks(whole_line_splitted, 11)
# for chunk_line in whole_line_splitted:
# left_context_splitted = chunk_line[0:10]
# seq_x = []
# for i in range(seq_len):
# index = -1 - i
# if len(left_context_splitted) < i + 1:
# seq_x.insert(0, '<empty>')
# else:
# seq_x.insert(0, left_context_splitted[-1 -i])
# left_vocabed = [vocab[t] for t in seq_x]
# dataset_x.append(left_vocabed )
# dataset_y.append([vocab[chunk_line[10]]])
# counter_lines+=1
# # if counter_lines > 20000:
# # break
# return dataset_x, dataset_y
def read_dev_file(folder_prefix, vocab):
    """Read a labelled split into ``(contexts, targets)`` lists of indices.

    Lines of ``<folder_prefix>/in.tsv.xz`` are paired with lines of
    ``<folder_prefix>/expected.tsv``. From each input line punctuation is
    removed and the second-to-last tab-separated field is taken as the left
    context. Its last 10 words, left-padded with '<empty>' when shorter, are
    mapped through ``vocab``. Each target is a one-element list holding the
    index of the stripped expected line.
    """
    seq_len = 10
    strip_punctuation = str.maketrans('', '', string.punctuation)
    dataset_x, dataset_y = [], []
    with lzma.open(f'{folder_prefix}/in.tsv.xz', 'r') as train, open(f'{folder_prefix}/expected.tsv', 'r') as expected:
        for t_line, e_line in zip(train, expected):
            cleaned = t_line.decode("utf-8").rstrip().translate(strip_punctuation)
            left_context = cleaned.split('\t')[-2]
            words = list(scripts.get_words_from_line(left_context))
            tail = words[-seq_len:]
            seq_x = ['<empty>'] * (seq_len - len(tail)) + tail
            dataset_x.append([vocab[t] for t in seq_x])
            dataset_y.append([vocab[e_line.rstrip()]])
    return dataset_x, dataset_y
def read_test_file(folder_prefix, vocab):
    """Read an unlabelled split into a list of context index sequences.

    From each line of ``<folder_prefix>/in.tsv.xz`` punctuation is removed and
    the second-to-last tab-separated field is taken as the left context. Its
    last 10 words, left-padded with '<empty>' when shorter, are mapped through
    ``vocab``.
    """
    seq_len = 10
    strip_punctuation = str.maketrans('', '', string.punctuation)
    dataset_x = []
    with lzma.open(f'{folder_prefix}/in.tsv.xz', 'r') as train:
        for t_line in train:
            cleaned = t_line.decode("utf-8").rstrip().translate(strip_punctuation)
            left_context = cleaned.split('\t')[-2]
            words = list(scripts.get_words_from_line(left_context))
            tail = words[-seq_len:]
            seq_x = ['<empty>'] * (seq_len - len(tail)) + tail
            dataset_x.append([vocab[t] for t in seq_x])
    return dataset_x
# train_set_x, train_set_y = read_file(PREFIX_TRAIN, vocab)
# Load the validation contexts/targets and the test contexts into memory.
dev_set_x, dev_set_y = read_dev_file(PREFIX_VALID, vocab)
test_set_x = read_test_file('test-A', vocab)
# train_data_x = DataLoader(train_set_x, batch_size=4048)
# train_data_y = DataLoader(train_set_y, batch_size=4048)
# train_data_x = DataLoader(train_set_x, batch_size=4048)
# train_data_y = DataLoader(train_set_y, batch_size=4048)
# One example per batch; inputs and targets are iterated in lockstep later.
dev_data_x = DataLoader(dev_set_x, batch_size=1)
dev_data_y = DataLoader(dev_set_y, batch_size=1)
# Rebinds test_set_x from the raw list to its DataLoader.
test_set_x = DataLoader(test_set_x, batch_size=1)
# pdb.set_trace()
device = utils.device
model = utils.LanguageModel(scripts.vocab_size, utils.embed_size).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=utils.learning_rate)
# Applied to the log of the model output in the training loop.
criterion = torch.nn.NLLLoss()
model.train()
# Global batch counter across all epochs.
step = 0
# Starts below any reachable accuracy so the first epoch always counts as best.
last_best_acc = -1
epochs = 3
for epoch in range(epochs):
    model.train()
    for batch in train_data:
        # Each batch holds 11 positions: the first 10 are the context and the
        # last one is the word to predict.
        x = [i.to(device) for i in batch[:10]]
        y = batch[10].to(device)
        optimizer.zero_grad()
        ypredicted = model(x)
        loss = criterion(torch.log(ypredicted), y)
        if step % 10000 == 0:
            print('Step: ', step, loss)
        step += 1
        loss.backward()
        optimizer.step()
    # evaluation
    model.eval()
    # Counts dev examples whose target is among the 64 highest-scored words
    # (the name says 50, the cut-off actually used is 64).
    top_50_true = 0
    # No gradients are needed while scoring the dev set.
    with torch.no_grad():
        for d_x, d_y in zip(dev_data_x, dev_data_y):
            d_x = [i.to(device) for i in d_x]
            ypredicted = model(d_x)
            top = torch.topk(ypredicted[0], 64)
            top_indices = top.indices.tolist()
            if d_y[0] in top_indices:
                top_50_true += 1
    my_acc = top_50_true / len(dev_data_y)
    print('My_accuracy: ', my_acc, ", epoch: ", epoch)
    if my_acc > last_best_acc:
        print('NEW BEST -- My_accuracy: ', my_acc, ", epoch: ", epoch)
        last_best_acc = my_acc
        # Keep an in-memory copy and an on-disk checkpoint of the best epoch.
        best_model = copy.deepcopy(model)
        torch.save(model.state_dict(), f'model_last_best_.bin')
    if epoch % 15 == 0:
        print('Epoch: ', epoch, step, loss)
# inference
def _format_prediction(top_words, top_probs):
    """Format one output line as ``word:prob ... :remaining_mass``.

    Tokens containing '<unk>' are skipped; a fixed fallback line is returned
    when nothing is left.
    """
    string_to_print = ''
    sum_probs = 0
    for w, p in zip(top_words, top_probs):
        if '<unk>' in w:
            continue
        string_to_print += f"{w}:{p} "
        sum_probs += p
    if string_to_print == '':
        return "the:0.2 a:0.3 :0.5"
    # Whatever probability mass is not listed goes to the empty-word slot.
    unknow_prob = 1 - sum_probs
    return string_to_print + f":{unknow_prob}"


def _write_predictions(inference_model, data_x, top_k, out_path):
    """Predict the ``top_k`` words for each batch of ``data_x`` and write one line per batch."""
    inference_model.eval()
    inference_result = []
    # No gradients are needed for inference.
    with torch.no_grad():
        for d_x in data_x:
            d_x = [i.to(device) for i in d_x]
            ypredicted = inference_model(d_x)
            top = torch.topk(ypredicted[0], top_k)
            top_words = vocab.lookup_tokens(top.indices.tolist())
            inference_result.append(_format_prediction(top_words, top.values.tolist()))
    with open(out_path, 'w') as f:
        for line in inference_result:
            f.write(line + '\n')


# Predict with the best epoch's model (tracked during training) rather than
# with whatever the last epoch produced.
print('inference')
_write_predictions(best_model, dev_data_x, 10, 'dev-0/out.tsv')
print('inference test')
_write_predictions(best_model, test_set_x, 64, 'test-A/out.tsv')
print('All done')