wmt-2020-pl-en/run-transf-dec.ipynb
2023-06-26 19:51:47 +02:00

3.4 KiB

import torch
from transformers import GPT2LMHeadModel, GPT2Tokenizer

import numpy as np

# Run on GPU when available, otherwise fall back to CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Notebook cell output: show the installed torch version and chosen device.
torch.__version__, device
# GPT-2 medium and its matching BPE tokenizer (downloaded/cached by transformers).
tokenizer = GPT2Tokenizer.from_pretrained("gpt2-medium")

model = GPT2LMHeadModel.from_pretrained("gpt2-medium")
model.to(device)
import lzma


def file_iterator(file_path):
    """Yield lines (as str, trailing newline kept) from a text file.

    Files ending in ``.xz`` are transparently decompressed with lzma;
    everything else is opened as plain UTF-8 text.

    Args:
        file_path: path to the input file.

    Yields:
        str: one decoded line at a time.
    """
    print(file_path, file_path.endswith(".xz"))
    if file_path.endswith(".xz"):
        # "rt" + encoding streams decoded lines lazily; the original
        # fp.readlines() loaded the whole decompressed file into memory
        # before yielding anything.
        with lzma.open(file_path, mode="rt", encoding="utf-8") as fp:
            yield from fp
    else:
        with open(file_path, "r", encoding="utf-8") as fp:
            yield from fp


def clear_line(line):
    """Normalize one raw TSV line for the model.

    Lowercases the text, strips leading/trailing newline characters,
    and removes literal escaped-newline sequences left in the data.
    """
    lowered = line.lower()
    trimmed = lowered.strip("\n")
    return trimmed.replace("\\\\n", "")
K = 20  # number of most-probable next tokens written per input line

# For each challenge split, read the left contexts from column 6 of the
# compressed TSV and write "token:prob" pairs for the K most likely next
# tokens, plus the leftover probability mass for the unknown word.
for file_path in ("dev-0", "test-A"):
    print("Working on file from folder:", file_path)
    data_iterator = file_iterator(f"{file_path}/in.tsv.xz")
    with open(f"{file_path}/out-tr-dec.tsv", "w", encoding="utf-8") as fp:
        for line in data_iterator:
            # Column 6 of the TSV holds the left context for the gap.
            left_context = clear_line(line.split("\t")[6])
            inputs = tokenizer.encode(left_context, return_tensors="pt").to(device)
            # Inference only: no_grad avoids building the autograd graph
            # (the original accumulated it for every line, wasting memory).
            with torch.no_grad():
                preds = model(inputs)
            # Logits at the last position -> distribution over next token.
            z_dist = preds[0][0][-1]
            probability_distances = torch.softmax(z_dist, dim=0)
            top_k = probability_distances.topk(K)
            results = [
                f"{tokenizer.decode([idx])}:{value}"
                for value, idx in zip(top_k.values, top_k.indices)
            ]
            # Trailing ":<mass>" entry = probability not covered by the top K.
            line_to_write = " ".join(results) + f" :{1 - torch.sum(top_k.values)}\n"
            fp.write(line_to_write)