122

parent b3b64f5475
commit edec0b1cf5
dev-0/out.tsv    21038   (file diff suppressed because it is too large)
test-A/out.tsv   14828   (file diff suppressed because it is too large)
zad122.py   66   (new file)

@@ -0,0 +1,66 @@
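# zad122.py: gap-filling word prediction with GPT-2. For each input line the
# script scores candidate tokens that could follow the left context and writes
# a 'token:prob ... :rest' probability distribution to out.tsv.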
import torch
from transformers import GPT2LMHeadModel, GPT2Tokenizer
import lzma

# import os
# os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:512"
torch.cuda.empty_cache()
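
# Load GPT-2 onto the GPU; left-side truncation keeps the tokens closest
# to the gap when a context exceeds the model's window.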
top = 50  # number of candidate tokens scored for each gap
model_name = "gpt2"
device = torch.device('cuda')
model = GPT2LMHeadModel.from_pretrained(model_name)
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
tokenizer.truncation_side = 'left'
model.to(device)
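
# dev-0 and test-A each provide a compressed in.tsv.xz whose tab-separated
# columns 6 and 7 hold the text before and after the gap; the hard-coded
# line counts only feed the progress indicator.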
for folder_name in ['dev-0', 'test-A']:
    linecount = 10519 if folder_name == 'dev-0' else 7414
    processed_lines = 0
    f = lzma.open(f'{folder_name}/in.tsv.xz', mode='rt', encoding='utf-8')
    with open(f'{folder_name}/out.tsv', 'w', encoding='utf-8') as file:
        for line in f:
            separated = line.split('\t')
            prefix = separated[6].replace(r'\n', ' ')
            suffix = separated[7].replace(r'\n', ' ')

            # guard against an empty right context
            suffix_words = suffix.split()
            first_next_word = suffix_words[0] if suffix_words else ''
            #prompt = f'{prefix} [TOKEN] {suffix}\n[TOKEN] = '
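
            # One forward pass over the left context: output[0] holds the logits,
            # and softmax over the last position gives the next-token distribution.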
            inputs = tokenizer.encode(prefix, return_tensors="pt", truncation=True).to(device)
            output = model(inputs)
            probs = torch.softmax(output[0][0][-1], dim=0)

            result = ''
            total = 0
            values, indices = probs.topk(top)
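
            # Walk the top candidates, skipping bare punctuation and a few junk
            # subword fragments.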
            for val, idx in zip(values, indices):
                final_val = val.item()
                token = tokenizer.decode([idx]).strip()
                if token in ",<>.?:;\'\"/\\{[]}|_-+=)(&%^*#@!$":
                    continue
                if token in ['ia', 'ix', 'io', 'ik', 'ing']:
                    continue
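
                # Second pass: append the candidate to the prefix and, if the model
                # then ranks the first word of the right context among its top
                # continuations, boost the candidate by that probability.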
                new_prompt = f'{prefix} {token} '
                new_inputs = tokenizer.encode(new_prompt, return_tensors="pt", truncation=True).to(device)
                new_output = model(new_inputs)
                new_probs = torch.softmax(new_output[0][0][-1], dim=0)
                new_values, new_indices = new_probs.topk(top)
                for new_val, new_idx in zip(new_values, new_indices):
                    # strip the leading space GPT-2's BPE puts on word-initial tokens
                    if tokenizer.decode([new_idx]).strip() == first_next_word:
                        final_val += new_val.item()
                        break
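
                # 'total' tracks the probability mass already spent; the leftover
                # is written after a bare ':' as the catch-all remainder.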
                total += val.item()
                result += f'{token}:{final_val} '
            result += f':{1 - total}'

            file.write(result + '\n')
            print(f'\r{folder_name} : {(processed_lines / linecount) * 100:.2f}%', end='')
            processed_lines += 1
            #print(processed_lines)
    f.close()
    print()