# GPT-2 next-word prediction script (exported from a notebook; size markers removed)
import torch
from transformers import GPT2LMHeadModel, GPT2Tokenizer
import numpy as np
# Select GPU when available; all tensors and the model are moved to this device.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load the pretrained GPT-2 medium tokenizer and LM head model.
tokenizer = GPT2Tokenizer.from_pretrained("gpt2-medium")
model = GPT2LMHeadModel.from_pretrained("gpt2-medium")
model.to(device)
# The model is only ever used for inference below — disable dropout.
model.eval()
import lzma
def file_iterator(file_path):
    """Yield lines (as str, newline included) from *file_path*.

    Transparently decompresses files ending in ".xz"; everything else is
    read as plain UTF-8 text.
    """
    if file_path.endswith(".xz"):
        # "rt" streams decompressed text lazily — the original readlines()
        # call loaded the entire decompressed file into memory first.
        with lzma.open(file_path, mode="rt", encoding="utf-8") as fp:
            yield from fp
    else:
        with open(file_path, "r", encoding="utf-8") as fp:
            yield from fp
def clear_line(line):
    """Normalize one raw TSV line: lowercase, trim trailing newlines,
    and drop escaped literal newline markers."""
    lowered = line.lower()
    no_newline = lowered.strip("\n")
    return no_newline.replace("\\\\n", "")
K = 20  # number of top next-token candidates written per input line

for file_path in ("dev-0", "test-A"):
    print("Working on file from folder:", file_path)
    data_iterator = file_iterator(f"{file_path}/in.tsv.xz")
    with open(f"{file_path}/out-tr-dec.tsv", "w", encoding="utf-8") as fp:
        for line in data_iterator:
            # Column 7 (index 6) of the TSV holds the left context.
            left_context = clear_line(line.split("\t")[6])
            inputs = tokenizer.encode(left_context, return_tensors="pt").to(device)
            # Inference only — no_grad avoids building an autograd graph
            # for every line (large, pointless memory overhead).
            with torch.no_grad():
                preds = model(inputs)
            # Logits for the last position of the single sequence in the batch.
            last_logits = preds[0][0][-1]
            probability_distances = torch.softmax(last_logits, dim=0)
            top_k = probability_distances.topk(K)
            # "token:probability" pairs for the K most likely next tokens.
            results = [
                f"{tokenizer.decode([idx])}:{value}"
                for value, idx in zip(top_k.values, top_k.indices)
            ]
            # Trailing ":<mass>" entry assigns the leftover probability
            # (everything outside the top K) to an unnamed catch-all.
            line_to_write = " ".join(results) + f" :{1 - torch.sum(top_k.values)}\n"
            fp.write(line_to_write)