train_file = 'train/in.tsv.xz'
test_file = 'dev-0/in.tsv.xz'
out_file = 'dev-0/out.tsv'
import itertools
import lzma
import pickle

import torch
import yaml
from torch import nn
from torch.utils.data import DataLoader, IterableDataset
from torchtext.vocab import build_vocab_from_iterator
# initial hyperparameter defaults (superseded below by config/params.yaml)
epochs = 3
embed_size = 200
device = 'cuda'
vocab_size = 30000
batch_s = 1600
learning_rate = 0.01
k = 20  # top k words returned per context
wildcard_minweight = 0.01
params = {
    'epochs': 3,
    'embed_size': 100,
    'device': 'cuda',
    'vocab_size': 30000,
    'batch_size': 3200,
    'learning_rate': 0.0001,
    'k': 15,  # top k words
    'wildcard_minweight': 0.01,
}
params = yaml.safe_load(open('config/params.yaml'))  # safe_load needs no explicit Loader
# from here on, the code should read hyperparameters via params['epochs'] etc.
params
{'epochs': 3, 'embed_size': 100, 'device': 'cuda', 'vocab_size': 30000, 'batch_size': 3200, 'learning_rate': 0.0001, 'k': 15, 'wildcard_minweight': 0.01}
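# The loader above expects config/params.yaml to exist. A minimal sketch for
# creating it from the defaults (writing the file this way is an assumption,
# not part of the original pipeline):
with open('config/params.yaml', 'w') as fh:
    yaml.safe_dump({
        'epochs': 3,
        'embed_size': 100,
        'device': 'cuda',
        'vocab_size': 30000,
        'batch_size': 3200,
        'learning_rate': 0.0001,
        'k': 15,
        'wildcard_minweight': 0.01,
    }, fh)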
def get_words_from_line(line):
    """Yield the tokens of a line, wrapped in <s> ... </s> sentence markers.

    Relies on preprocess(), defined in the preprocessing section below;
    that cell has to be run first.
    """
    line = preprocess(line.rstrip())
    yield '<s>'
    for t in line.split(' '):
        yield t
    yield '</s>'
def get_word_lines_from_file(file_name):
    n = 0
    with lzma.open(file_name, 'r') as fh:
        for line in fh:
            n += 1
            if n % 1000 == 0:
                print(n)  # crude progress indicator, one tick per 1000 lines
            yield get_words_from_line(line.decode('utf-8'))
def look_ahead_iterator(gen):
    """Slide a window of size 3 over a token stream, yielding trigrams."""
    prev2 = None
    prev1 = None
    for item in gen:
        if prev2 is not None and prev1 is not None:
            yield (prev2, prev1, item)
        prev2 = prev1
        prev1 = item
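# Quick illustration (not part of the original run): the iterator turns a
# token stream into consecutive trigrams.
list(look_ahead_iterator(iter(['<s>', 'the', 'cat', 'sat', '</s>'])))
# [('<s>', 'the', 'cat'), ('the', 'cat', 'sat'), ('cat', 'sat', '</s>')]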
class Trigrams(IterableDataset):
def __init__(self, text_file, vocabulary_size):
self.vocab = build_vocab_from_iterator(
get_word_lines_from_file(text_file),
max_tokens = vocabulary_size,
specials = ['<unk>'])
self.vocab.set_default_index(self.vocab['<unk>'])
self.vocabulary_size = vocabulary_size
self.text_file = text_file
def __iter__(self):
return look_ahead_iterator(
(self.vocab[t] for t in itertools.chain.from_iterable(get_word_lines_from_file(self.text_file))))
vocab = build_vocab_from_iterator(
get_word_lines_from_file(train_file),
max_tokens = params['vocab_size'],
specials = ['<unk>'])
# cache the vocabulary so it does not have to be rebuilt on every run
with open('filename.pickle', 'wb') as handle:
    pickle.dump(vocab, handle, protocol=pickle.HIGHEST_PROTOCOL)
with open('filename.pickle', 'rb') as handle:
    vocab = pickle.load(handle)
train_dataset = Trigrams(train_file, params['vocab_size'])
data = DataLoader(train_dataset, batch_size=params['batch_size'])  # batches of (word, next, 2nd-next) index triples
class SimpleTrigramNeuralLanguageModel(nn.Module):
    def __init__(self, vocabulary_size, embedding_size):
        super(SimpleTrigramNeuralLanguageModel, self).__init__()
        # one embedding vector of embedding_size entries per vocabulary word
        self.embeddings = nn.Embedding(vocabulary_size, embedding_size)
        # hidden layer over the two concatenated context embeddings
        self.linear_matrix_2 = nn.Linear(embedding_size * 2, embedding_size * 2)
        # output layer projecting the hidden state onto the vocabulary
        self.linear = nn.Linear(2 * embedding_size, vocabulary_size)
        self.relu = nn.ReLU()
        self.softmax = nn.Softmax(dim=1)  # explicit dim avoids the deprecation warning

    def forward(self, x):
        # x is a pair of index batches: the first and the second word
        # following the word to be predicted
        emb_first = self.embeddings(x[0])
        emb_second = self.embeddings(x[1])
        hidden = self.linear_matrix_2(torch.cat((emb_first, emb_second), dim=1))
        activated = self.relu(hidden)
        logits = self.linear(activated)
        out = self.softmax(logits)
        return out
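# Shape sanity check (illustrative, not part of the original run): a batch of
# four context pairs should yield a (4, vocab_size) matrix of row probabilities.
_m = SimpleTrigramNeuralLanguageModel(params['vocab_size'], params['embed_size'])
_out = _m([torch.zeros(4, dtype=torch.long), torch.ones(4, dtype=torch.long)])
print(_out.shape)        # torch.Size([4, 30000])
print(_out.sum(dim=1))   # each row sums to ~1 because of the softmax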
import gc
torch.cuda.empty_cache()  # release cached GPU memory between runs
gc.collect()
device = 'cuda'
model = SimpleTrigramNeuralLanguageModel(params['vocab_size'], params['embed_size']).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=params['learning_rate'])
criterion = torch.nn.NLLLoss()
torch.autograd.set_detect_anomaly(True)  # debugging aid; slows training noticeably
model.load_state_dict(torch.load('model-tri-2following-40000.bin'))  # resume from checkpoint
for i in range(params['epochs']):
    print('epoch:', i)
    model.train()
    step = 0
    for x, y, z in data:  # (word, 1st following word, 2nd following word)
        x = x.to(device)
        y = y.to(device)
        z = z.to(device)
        optimizer.zero_grad()
        ypredicted = model([y, z])  # predict x from its two following words
        # clamp before log: the softmax can underflow to exactly 0, and log(0)
        # is the source of the NaN reported by LogBackward0 below
        loss = criterion(torch.log(ypredicted.clamp(min=1e-9)), x)  # x is the word to predict
        if step % 100 == 0:
            print(step, loss)
        step += 1
        # if step % 10000 == 0:
        #     torch.save(model.state_dict(), f'model-tri-2following-{step}.bin')
        loss.backward()
        optimizer.step()
    # torch.save(model.state_dict(), f'model-tri-2following-{i}.bin')
# torch.save(model.state_dict(), f'model-tri-2following-final.bin')
epoch: 0
0 tensor(5.3414, device='cuda:0', grad_fn=<NllLossBackward0>)
100 tensor(5.4870, device='cuda:0', grad_fn=<NllLossBackward0>)
200 tensor(5.3542, device='cuda:0', grad_fn=<NllLossBackward0>)
300 tensor(5.3792, device='cuda:0', grad_fn=<NllLossBackward0>)
400 tensor(5.5982, device='cuda:0', grad_fn=<NllLossBackward0>)
500 tensor(5.4045, device='cuda:0', grad_fn=<NllLossBackward0>)
600 tensor(5.5620, device='cuda:0', grad_fn=<NllLossBackward0>)
700 tensor(5.5428, device='cuda:0', grad_fn=<NllLossBackward0>)
800 tensor(5.3684, device='cuda:0', grad_fn=<NllLossBackward0>)
900 tensor(5.4198, device='cuda:0', grad_fn=<NllLossBackward0>)
1000 tensor(5.4100, device='cuda:0', grad_fn=<NllLossBackward0>)
1100 tensor(5.4554, device='cuda:0', grad_fn=<NllLossBackward0>)
1200 tensor(5.5284, device='cuda:0', grad_fn=<NllLossBackward0>)
1300 tensor(5.5495, device='cuda:0', grad_fn=<NllLossBackward0>)
# The original run aborted here: with anomaly detection enabled, backward()
# raised RuntimeError: Function 'LogBackward0' returned nan values in its 0th
# output, traced to loss = criterion(torch.log(ypredicted), x), i.e. log(0)
# from an underflowing softmax. The clamp added in the loop above avoids this.
torch.save(model.state_dict(), 'model-tri-2following-final.bin')
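# A numerically safer alternative (sketch, assuming the model were changed to
# return raw logits instead of softmax probabilities): nn.CrossEntropyLoss
# applies log-softmax internally via the log-sum-exp trick, so it cannot hit
# log(0) the way torch.log(softmax(...)) can.
stable_criterion = torch.nn.CrossEntropyLoss()
# loss = stable_criterion(logits, x)  # logits: (batch, vocab_size), x: (batch,)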
def get_first_word(text):
    """Return the first space-delimited word of a string."""
    word = ""
    for ch in text:
        if ch == ' ':
            break
        word += ch
    return word.rstrip()
def get_values_from_model(context: list, model, vocab, k=10):
    """Return the k most probable words (with probabilities) for a context pair."""
    words = [vocab.forward([word]) for word in context]
    ixs = torch.tensor(words).to(device)
    out = model(ixs)
    top = torch.topk(out[0], k)
    top_indices = top.indices.tolist()
    top_probs = top.values.tolist()
    top_words = vocab.lookup_tokens(top_indices)
    return list(zip(top_words, top_probs))
def summarize_probs_unk(dic, const_wildcard=True):
    '''
    dic: dictionary of word -> probability returned by the model
    returns: list of (word, probability) pairs, with '<unk>' specifically as
    the last element

    With const_wildcard=True (or when '<unk>' is absent), the known words are
    renormalized to 1 - wildcard_minweight and '<unk>' receives the constant
    wildcard_minweight. Otherwise '<unk>' keeps its model probability and the
    whole distribution is renormalized to sum to 1.
    '''
    if const_wildcard or '<unk>' not in dic.keys():
        if '<unk>' in dic.keys():
            del dic['<unk>']
        probsum = sum(float(val) for key, val in dic.items())
        for key in dic:
            dic[key] = dic[key] / probsum * (1 - wildcard_minweight)  # leave room for the wildcard
        tab = [(key, val) for key, val in dic.items()]
        tab.append(('<unk>', wildcard_minweight))
    else:
        probsum = sum(float(val) for key, val in dic.items())
        for key in dic:
            dic[key] = dic[key] / probsum  # renormalize; '<unk>' keeps its own share
        wildcard_value = dic['<unk>']
        del dic['<unk>']
        tab = [(key, val) for key, val in dic.items()]
        tab.append(('<unk>', wildcard_value))
    return tab
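# Worked example (illustrative): with wildcard_minweight = 0.01, the known
# words are rescaled to sum to 0.99 and '<unk>' gets the constant remainder.
summarize_probs_unk({'the': 0.4, 'a': 0.1, '<unk>': 0.5}, const_wildcard=True)
# [('the', 0.792), ('a', 0.198), ('<unk>', 0.01)]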
def gonito_format(dic, const_wildcard=True):
    # serialize the distribution for gonito: tab-separated word:prob pairs,
    # closed by a ':prob' wildcard entry for unseen words
    tab = summarize_probs_unk(dic, const_wildcard)
    result = ''
    for element in tab[:-1]:
        result += str(element[0]) + ':' + str(element[1]) + '\t'
    result += ':' + str(tab[-1][1]) + '\n'
    return result
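# The resulting line for the example above (up to floating-point noise):
gonito_format({'the': 0.4, 'a': 0.1, '<unk>': 0.5})
# 'the:0.792\ta:0.198\t:0.01\n'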
### preprocessing
def preprocess(line):
    line = get_rid_of_header(line)
    line = replace_endline(line)
    return line

def get_rid_of_header(line):
    # drop the first six tab-separated metadata columns, keep the text fields
    line = line.split('\t')[6:]
    return " ".join(line)

def replace_endline(line):
    # the TSV stores newlines as literal escapes; turn them into spaces
    line = line.replace("\\\\n", " ")
    return line
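# Illustrative run (the column layout is inferred from the slicing above:
# six metadata columns, then the text fields; the sample line is made up):
preprocess('id\tyear\tpublication\tplace\tx\ty\tleft context\tright context')
# -> 'left context right context'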
# sanity check: which words does the model rank highest before the context 'of the'?
ixs = torch.tensor([vocab.forward(['of']), vocab.forward(['the'])]).to(device)
out = model(ixs)
top = torch.topk(out[0], 10)
top_indices = top.indices.tolist()
top_probs = top.values.tolist()
top_words = vocab.lookup_tokens(top_indices)
list(zip(top_words, top_indices, top_probs))
[('<unk>', 0, 0.12663832306861877), ('one', 43, 0.02672259509563446), ('part', 146, 0.015497211366891861), ('out', 63, 0.012386629357933998), ('some', 76, 0.008164796978235245), ('members', 426, 0.00799479242414236), ('side', 238, 0.007780702318996191), ('portion', 634, 0.005733700469136238), ('office', 282, 0.0053163678385317326), ('member', 712, 0.005126394797116518)]
model.load_state_dict(torch.load('model-tri-2following-final.bin'))
<All keys matched successfully>
with lzma.open(test_file, 'rt') as file:
    predict_words = []
    results = []
    for line in file:
        line = replace_endline(line)
        line = line.split('\t')[6:]  # keep only the text fields
        context = line[1].rstrip().split(" ")[:2]  # first two words of the right context
        predict_words.append(context)
vocab = train_dataset.vocab
for context_words in predict_words:
    results.append(dict(get_values_from_model(context_words, model, vocab, k=10)))
with open(out_file, 'w') as outfile:
    for elem in results:
        outfile.write(gonito_format(elem, const_wildcard=False))