challenging-america-word-ga.../zad8.ipynb

train_file = 'train/in.tsv.xz'
test_file = 'dev-0/in.tsv.xz'
out_file = 'dev-0/out.tsv'
import itertools
import lzma
import pickle

import regex as re
import torch
import yaml
from torch import nn
from torch.utils.data import DataLoader, IterableDataset
from torchtext.vocab import build_vocab_from_iterator
# Default hyperparameters (most are overridden by config/params.yaml below).
epochs = 3
embed_size = 200
device = 'cuda'
vocab_size = 30000
batch_s = 1600
learning_rate = 0.01
k = 20  # top k words
wildcard_minweight = 0.01
params = {
'epochs': 3,
'embed_size': 100,
'device': 'cuda',
'vocab_size': 30000,
'batch_size': 3200,
'learning_rate': 0.0001,
'k': 15, #top k words
'wildcard_minweight': 0.01
}
# Load the actual hyperparameters from config/params.yaml; the rest of the code
# then reads them via params['epochs'], params['batch_size'], etc.
with open('config/params.yaml') as f:
    params = yaml.safe_load(f)
params
{'epochs': 3,
 'embed_size': 100,
 'device': 'cuda',
 'vocab_size': 30000,
 'batch_size': 3200,
 'learning_rate': 0.0001,
 'k': 15,
 'wildcard_minweight': 0.01}
def get_words_from_line(line):
    line = line.rstrip()
    yield '<s>'
    line = preprocess(line)  # preprocess() is defined further down in this notebook
    for t in line.split(' '):
        yield t
    yield '</s>'


def get_word_lines_from_file(file_name):
    n = 0
    with lzma.open(file_name, 'r') as fh:
        for line in fh:
            n += 1
            if n % 1000 == 0:
                print(n)  # progress indicator
            yield get_words_from_line(line.decode('utf-8'))
def look_ahead_iterator(gen):
    # Yields consecutive triples (w_i, w_{i+1}, w_{i+2}) from a flat token stream.
    prev2 = None
    prev1 = None
    for item in gen:
        if prev2 is not None and prev1 is not None:
            yield (prev2, prev1, item)
        prev2 = prev1
        prev1 = item
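# Illustrative check of look_ahead_iterator (hypothetical tokens, not taken from
# the corpus): it turns a flat stream into sliding trigram windows.
list(look_ahead_iterator(['<s>', 'the', 'quick', 'brown', '</s>']))
# [('<s>', 'the', 'quick'), ('the', 'quick', 'brown'), ('quick', 'brown', '</s>')]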

class Trigrams(IterableDataset):
    def __init__(self, text_file, vocabulary_size):
        self.vocab = build_vocab_from_iterator(
            get_word_lines_from_file(text_file),
            max_tokens=vocabulary_size,
            specials=['<unk>'])
        self.vocab.set_default_index(self.vocab['<unk>'])
        self.vocabulary_size = vocabulary_size
        self.text_file = text_file

    def __iter__(self):
        return look_ahead_iterator(
            (self.vocab[t] for t in itertools.chain.from_iterable(get_word_lines_from_file(self.text_file))))

vocab = build_vocab_from_iterator(
    get_word_lines_from_file(train_file),
    max_tokens = params['vocab_size'],
    specials = ['<unk>'])
1000
2000
[... progress printed every 1000 lines ...]
432000
with open('filename.pickle', 'wb') as handle:
    pickle.dump(vocab, handle, protocol=pickle.HIGHEST_PROTOCOL)
with open('filename.pickle','rb') as handle:
    vocab = pickle.load(handle)
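# Quick sanity check of the reloaded vocabulary (illustrative; the exact indices
# depend on the corpus): map a few tokens to ids and ids back to tokens.
print(vocab['<unk>'], vocab['the'])
print(vocab.lookup_tokens([0, 1, 2, 3, 4]))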
    
train_dataset = Trigrams(train_file, params['vocab_size'])
1000
2000
[... progress printed every 1000 lines ...]
432000
data = DataLoader(train_dataset, batch_size=params['batch_size']) #load data 
class SimpleTrigramNeuralLanguageModel(nn.Module):
  def __init__(self, vocabulary_size, embedding_size):
      super(SimpleTrigramNeuralLanguageModel, self).__init__()
      # One embedding vector with embedding_size entries per vocabulary word.
      self.embeddings = nn.Embedding(vocabulary_size, embedding_size)
      # self.linear maps the concatenated context representation to vocabulary scores.
      self.linear = nn.Linear(2*embedding_size, vocabulary_size)
      # self.linear_matrix_2 is a hidden layer applied to the concatenated embeddings.
      self.linear_matrix_2 = nn.Linear(embedding_size*2, embedding_size*2)
      self.relu = nn.ReLU()
      self.softmax = nn.Softmax(dim=1)  # explicit dim avoids the deprecation warning

  def forward(self, x):  # x = [following word, 2nd following word] index tensors
      emb_left = self.embeddings(x[0])
      emb_right = self.embeddings(x[1])
      # Two embedding vectors, one for each of the two right-context words.
      first_layer_size_2 = self.linear_matrix_2(torch.cat((emb_left, emb_right), dim=1))
      first_relu = self.relu(first_layer_size_2)
      concated = self.linear(first_relu)
      out = self.softmax(concated)
      return out
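# Shape sanity check with random indices (illustrative only): a batch of 4
# two-word contexts should yield one probability row per example.
_m = SimpleTrigramNeuralLanguageModel(params['vocab_size'], params['embed_size'])
_ctx = [torch.randint(0, params['vocab_size'], (4,)), torch.randint(0, params['vocab_size'], (4,))]
_out = _m(_ctx)
print(_out.shape)       # expected: torch.Size([4, vocab_size])
print(_out.sum(dim=1))  # softmax rows sum to (approximately) 1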
import gc
torch.cuda.empty_cache()
gc.collect()
0
device = 'cuda'
model = SimpleTrigramNeuralLanguageModel(params['vocab_size'], params['embed_size']).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=params['learning_rate'])
criterion = torch.nn.NLLLoss()
torch.autograd.set_detect_anomaly(True)
model.load_state_dict(torch.load(f'model-tri-2following-40000.bin'))
for i in range(params['epochs']):
    print('epoch: =', i)
    model.train()
    step = 0
    for x, y, z in data:  # word, following, 2nd_following words
        x = x.to(device)
        y = y.to(device)
        z = z.to(device)
        optimizer.zero_grad()
        ypredicted = model([y, z])  # predict the word from its two following words
        # clamp keeps log() away from exact zeros, which otherwise produce NaN gradients
        loss = criterion(torch.log(ypredicted.clamp(min=1e-9)), x)  # x is the word to predict
        if step % 100 == 0:
            print(step, loss)
        step += 1
#        if step % 10000 == 0:
#            torch.save(model.state_dict(), f'model-tri-2following-{step}.bin')
        loss.backward()
        optimizer.step()
#     torch.save(model.state_dict(), f'model-tri-2following-{i}.bin')
# torch.save(model.state_dict(), f'model-tri-2following-final.bin')
epoch: = 0
0 tensor(5.3414, device='cuda:0', grad_fn=<NllLossBackward0>)
/tmp/ipykernel_37433/606935597.py:22: UserWarning: Implicit dimension choice for softmax has been deprecated. Change the call to include dim=X as an argument.
  out = self.softmax(concated)
1000
100 tensor(5.4870, device='cuda:0', grad_fn=<NllLossBackward0>)
200 tensor(5.3542, device='cuda:0', grad_fn=<NllLossBackward0>)
2000
300 tensor(5.3792, device='cuda:0', grad_fn=<NllLossBackward0>)
3000
400 tensor(5.5982, device='cuda:0', grad_fn=<NllLossBackward0>)
4000
500 tensor(5.4045, device='cuda:0', grad_fn=<NllLossBackward0>)
5000
600 tensor(5.5620, device='cuda:0', grad_fn=<NllLossBackward0>)
6000
700 tensor(5.5428, device='cuda:0', grad_fn=<NllLossBackward0>)
7000
800 tensor(5.3684, device='cuda:0', grad_fn=<NllLossBackward0>)
8000
900 tensor(5.4198, device='cuda:0', grad_fn=<NllLossBackward0>)
9000
1000 tensor(5.4100, device='cuda:0', grad_fn=<NllLossBackward0>)
10000
1100 tensor(5.4554, device='cuda:0', grad_fn=<NllLossBackward0>)
11000
1200 tensor(5.5284, device='cuda:0', grad_fn=<NllLossBackward0>)
12000
1300 tensor(5.5495, device='cuda:0', grad_fn=<NllLossBackward0>)
/home/gedin/.local/lib/python3.10/site-packages/torch/autograd/__init__.py:200: UserWarning: Error detected in LogBackward0. Traceback of forward call that caused the error:
  File "/usr/lib/python3.10/runpy.py", line 196, in _run_module_as_main
    return _run_code(code, main_globals, None,
  File "/usr/lib/python3.10/runpy.py", line 86, in _run_code
    exec(code, run_globals)
  File "/home/gedin/.local/lib/python3.10/site-packages/ipykernel_launcher.py", line 17, in <module>
    app.launch_new_instance()
  File "/home/gedin/.local/lib/python3.10/site-packages/traitlets/config/application.py", line 1043, in launch_instance
    app.start()
  File "/home/gedin/.local/lib/python3.10/site-packages/ipykernel/kernelapp.py", line 725, in start
    self.io_loop.start()
  File "/home/gedin/.local/lib/python3.10/site-packages/tornado/platform/asyncio.py", line 195, in start
    self.asyncio_loop.run_forever()
  File "/usr/lib/python3.10/asyncio/base_events.py", line 600, in run_forever
    self._run_once()
  File "/usr/lib/python3.10/asyncio/base_events.py", line 1896, in _run_once
    handle._run()
  File "/usr/lib/python3.10/asyncio/events.py", line 80, in _run
    self._context.run(self._callback, *self._args)
  File "/home/gedin/.local/lib/python3.10/site-packages/ipykernel/kernelbase.py", line 513, in dispatch_queue
    await self.process_one()
  File "/home/gedin/.local/lib/python3.10/site-packages/ipykernel/kernelbase.py", line 502, in process_one
    await dispatch(*args)
  File "/home/gedin/.local/lib/python3.10/site-packages/ipykernel/kernelbase.py", line 409, in dispatch_shell
    await result
  File "/home/gedin/.local/lib/python3.10/site-packages/ipykernel/kernelbase.py", line 729, in execute_request
    reply_content = await reply_content
  File "/home/gedin/.local/lib/python3.10/site-packages/ipykernel/ipkernel.py", line 422, in do_execute
    res = shell.run_cell(
  File "/home/gedin/.local/lib/python3.10/site-packages/ipykernel/zmqshell.py", line 540, in run_cell
    return super().run_cell(*args, **kwargs)
  File "/home/gedin/.local/lib/python3.10/site-packages/IPython/core/interactiveshell.py", line 3009, in run_cell
    result = self._run_cell(
  File "/home/gedin/.local/lib/python3.10/site-packages/IPython/core/interactiveshell.py", line 3064, in _run_cell
    result = runner(coro)
  File "/home/gedin/.local/lib/python3.10/site-packages/IPython/core/async_helpers.py", line 129, in _pseudo_sync_runner
    coro.send(None)
  File "/home/gedin/.local/lib/python3.10/site-packages/IPython/core/interactiveshell.py", line 3269, in run_cell_async
    has_raised = await self.run_ast_nodes(code_ast.body, cell_name,
  File "/home/gedin/.local/lib/python3.10/site-packages/IPython/core/interactiveshell.py", line 3448, in run_ast_nodes
    if await self.run_code(code, result, async_=asy):
  File "/home/gedin/.local/lib/python3.10/site-packages/IPython/core/interactiveshell.py", line 3508, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "/tmp/ipykernel_37433/1707264841.py", line 13, in <module>
    loss = criterion(torch.log(ypredicted), x) #x is to_predict
 (Triggered internally at ../torch/csrc/autograd/python_anomaly_mode.cpp:114.)
  Variable._execution_engine.run_backward(  # Calls into the C++ engine to run the backward pass
---------------------------------------------------------------------------
RuntimeError                              Traceback (most recent call last)
Cell In[26], line 19
     16        step += 1
     17 #        if step % 10000 == 0:
     18 #            torch.save(model.state_dict(), f'model-tri-2following-{step}.bin')
---> 19        loss.backward()
     20        optimizer.step()
     21 #     torch.save(model.state_dict(), f'model-tri-2following-{i}.bin')    
     22 # torch.save(model.state_dict(), f'model-tri-2following-final.bin')

File ~/.local/lib/python3.10/site-packages/torch/_tensor.py:487, in Tensor.backward(self, gradient, retain_graph, create_graph, inputs)
    477 if has_torch_function_unary(self):
    478     return handle_torch_function(
    479         Tensor.backward,
    480         (self,),
   (...)
    485         inputs=inputs,
    486     )
--> 487 torch.autograd.backward(
    488     self, gradient, retain_graph, create_graph, inputs=inputs
    489 )

File ~/.local/lib/python3.10/site-packages/torch/autograd/__init__.py:200, in backward(tensors, grad_tensors, retain_graph, create_graph, grad_variables, inputs)
    195     retain_graph = create_graph
    197 # The reason we repeat same the comment below is that
    198 # some Python versions print out the first line of a multi-line function
    199 # calls in the traceback and some print out the last line
--> 200 Variable._execution_engine.run_backward(  # Calls into the C++ engine to run the backward pass
    201     tensors, grad_tensors_, retain_graph, create_graph, inputs,
    202     allow_unreachable=True, accumulate_grad=True)

RuntimeError: Function 'LogBackward0' returned nan values in its 0th output.
torch.save(model.state_dict(), f'model-tri-2following-final.bin')
def get_first_word(text):
    """Return the first word of a string."""
    word = ""
    for ch in text:
        if ch == ' ':
            return word.rstrip()
        word += ch
    return word.rstrip()

def get_values_from_model(context: list, model, vocab, k=10):
    words = [vocab.forward([word]) for word in context]
    ixs =  torch.tensor(words).to(device)
    out = model(ixs)
    top = torch.topk(out[0], k)
    top_indices = top.indices.tolist()
    top_probs = top.values.tolist()
    top_words = vocab.lookup_tokens(top_indices)
    return list(zip(top_words, top_probs))

def summarize_probs_unk(dic, const_wildcard=True):
    '''
    dic: dictionary of probabilities returned by the model
    returns: list of (word, probability) pairs, with '<unk>' specifically as the last element
    '''
    if const_wildcard or '<unk>' not in dic.keys():
        if '<unk>' in dic.keys():
            del dic['<unk>']
        probsum = sum(float(val) for key, val in dic.items())
        for key in dic:
            dic[key] = dic[key]/probsum*(1-wildcard_minweight)  # leave some probability mass for the wildcard
        tab = [(key, val) for key, val in dic.items()]
        tab.append(('<unk>', wildcard_minweight))
    else:
        probsum = sum(float(val) for key, val in dic.items())
        for key in dic:
            dic[key] = dic[key]/probsum*(1-wildcard_minweight)  # leave some probability mass for the wildcard
        wildcard_value = dic['<unk>']
        del dic['<unk>']
        tab = [(key, val) for key, val in dic.items()]
        tab.append(('<unk>', wildcard_value))

    return tab

def gonito_format(dic, const_wildcard = True):
    tab = summarize_probs_unk(dic, const_wildcard)
    result = ''
    for element in tab[:-1]:
        result+=str(element[0])+':'+str(element[1])+'\t'
    result+=':'+ str(tab[-1][1]) + '\n'
    return result
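# Illustrative call with made-up probabilities: each output line is a sequence of
# word:probability pairs separated by tabs, ending with the bare ':' wildcard entry.
print(gonito_format({'the': 0.4, 'a': 0.2, '<unk>': 0.1}, const_wildcard=False))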
###preprocessing
def preprocess(line):
    line = get_rid_of_header(line)
    line = replace_endline(line)
    return line

def get_rid_of_header(line):
    line = line.split('\t')[6:]
    return " ".join(line)
    
def replace_endline(line):
    line = line.replace("\\\\n", " ")
    return line
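# Illustrative run on a hypothetical input row (real rows carry several metadata
# columns before the text): columns 0-5 are dropped and escaped newlines become spaces.
_sample = '\t'.join(['id', 'm1', 'm2', 'm3', 'm4', 'm5', 'left context\\\\nsecond line', 'right context'])
print(preprocess(_sample))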

ixs = torch.tensor([vocab.forward(['of']), vocab.forward(['the'])]).to(device)

out = model(ixs)
top = torch.topk(out[0], 10)
top_indices = top.indices.tolist()
top_probs = top.values.tolist()
top_words = vocab.lookup_tokens(top_indices)
list(zip(top_words, top_indices, top_probs))
[('<unk>', 0, 0.12663832306861877),
 ('one', 43, 0.02672259509563446),
 ('part', 146, 0.015497211366891861),
 ('out', 63, 0.012386629357933998),
 ('some', 76, 0.008164796978235245),
 ('members', 426, 0.00799479242414236),
 ('side', 238, 0.007780702318996191),
 ('portion', 634, 0.005733700469136238),
 ('office', 282, 0.0053163678385317326),
 ('member', 712, 0.005126394797116518)]
model.load_state_dict(torch.load(f'model-tri-2following-final.bin'))
<All keys matched successfully>
with lzma.open(test_file, 'rt') as file:
    predict_words = []
    results = []
    for line in file:
        line = replace_endline(line)
        line = line.split('\t')[6:]  # keep only the text columns
        context = line[1].rstrip().split(" ")[:2]  # first two words of the right context
        predict_words.append(context)
    vocab = train_dataset.vocab
    for context_words in predict_words:
        results.append(dict(get_values_from_model(context_words, model, vocab, k=10)))

    with open(out_file, 'w') as outfile:
        for elem in results:
            outfile.write(gonito_format(elem, const_wildcard=False))