challenging-america-word-ga.../run.ipynb

import torch
from transformers import GPT2LMHeadModel, GPT2Tokenizer

# Run on GPU when available, otherwise fall back to CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
torch.__version__, device

# Load the medium GPT-2 checkpoint and its tokenizer.
tokenizer = GPT2Tokenizer.from_pretrained('gpt2-medium')
model = GPT2LMHeadModel.from_pretrained('gpt2-medium')
model.to(device)
model.eval()  # inference only: disable dropout
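# Quick sanity check of the setup above (not part of the prediction pipeline):
# ask the model for its most likely next tokens after a short, made-up prompt.
# The prompt and the top-5 cutoff here are illustrative choices only.
with torch.no_grad():
    demo_ids = tokenizer.encode("The capital of France is", return_tensors="pt").to(device)
    demo_logits = model(demo_ids)[0][0, -1]                 # logits for the next token
    demo_top = torch.softmax(demo_logits, dim=0).topk(5)
    print([tokenizer.decode([idx]) for idx in demo_top.indices])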
import lzma


def file_iterator(file_path):
    """Yield lines from a plain-text or xz-compressed file."""
    if file_path.endswith(".xz"):
        with lzma.open(file_path, mode="r") as fp:
            for line in fp:  # stream line by line instead of reading the whole file
                yield line.decode("utf-8")
    else:
        with open(file_path, "r", encoding="utf-8") as fp:
            for line in fp:
                yield line

def clear_line(line):
    """Lowercase, drop the trailing newline, and remove the escaped '\\n' line breaks used in the raw text."""
    return line.lower().strip("\n").replace("\\\\n", "")
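# A quick illustration of clear_line on a made-up fragment (not taken from the
# corpus): lowercase the text, strip the trailing newline, and delete the
# escaped line-break markers.
clear_line("Nice \\\\nweather today\n")  # -> 'nice weather today'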

K = 20  # number of top candidate tokens written per gap

for folder in ('dev-0', 'test-A'):
    print('Working on file from folder:', folder)
    data_iterator = file_iterator(f'{folder}/in.tsv.xz')
    with open(f'{folder}/out-tr-dec.tsv', 'w', encoding='utf-8') as fp:
        for line in data_iterator:
            # Column 6 of in.tsv holds the left context of the gap.
            left_context = clear_line(line.split('\t')[6])
            inputs = tokenizer.encode(left_context, return_tensors='pt').to(device)
            # GPT-2 attends to at most 1024 positions, so keep only the last
            # 1024 tokens of a very long left context.
            inputs = inputs[:, -1024:]
            with torch.no_grad():  # inference only, no gradients needed
                preds = model(inputs)
            # Logits of the token that would follow the left context,
            # turned into a probability distribution over the vocabulary.
            z_dist = preds[0][0][-1]
            probabilities = torch.softmax(z_dist, dim=0)
            top_k = probabilities.topk(K)
            results = [f'{tokenizer.decode([idx])}:{value}' for value, idx in zip(top_k.values, top_k.indices)]
            # The leftover probability mass goes to the wildcard ':' entry.
            line_to_write = ' '.join(results) + f' :{1 - torch.sum(top_k.values)}\n'
            fp.write(line_to_write)
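# Each line written above consists of K token:probability pairs followed by a
# final ':' entry carrying the probability mass not covered by the top K
# candidates, e.g. (made-up numbers, middle entries shortened with '...'):
#   the:0.21 a:0.08 ... his:0.01 :0.42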