# 3.4 KiB — notebook-export size artifact, commented out so the file parses as Python
import torch
from transformers import GPT2LMHeadModel, GPT2Tokenizer
import numpy as np
# Select GPU when available; all tensors and the model are moved to this device.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Load the pretrained GPT-2 medium tokenizer/model pair for next-token prediction.
tokenizer = GPT2Tokenizer.from_pretrained('gpt2-medium')
model = GPT2LMHeadModel.from_pretrained('gpt2-medium')
model.to(device)
# Inference only: eval() disables dropout so predictions are deterministic.
model.eval()
import lzma
def file_iterator(file_path):
    """Yield lines from *file_path*, transparently decompressing ``.xz`` files.

    Args:
        file_path: Path to a plain-text file or an xz-compressed text file.

    Yields:
        str: One line at a time (trailing newline included), decoded as UTF-8.
    """
    print(file_path, file_path.endswith(".xz"))  # debug trace: which branch runs
    if file_path.endswith(".xz"):
        with lzma.open(file_path, mode="r") as fp:
            # Iterate the file object lazily instead of readlines() so a large
            # compressed corpus is never materialized in memory all at once.
            for line in fp:
                yield line.decode("utf-8")
    else:
        with open(file_path, "r", encoding="utf-8") as fp:
            for line in fp:
                yield line
def clear_line(line):
    """Normalize one raw TSV line: lowercase, trim edge newlines, and drop
    literal ``\\n`` escape sequences left over from the data export."""
    lowered = line.lower()
    trimmed = lowered.strip("\n")
    return trimmed.replace("\\\\n", "")
# Number of top next-token predictions to emit per input line.
K = 20

for file_path in ('dev-0', 'test-A'):
    print('Working on file from folder:', file_path)
    data_iterator = file_iterator(f'{file_path}/in.tsv.xz')
    with open(f'{file_path}/out-tr-dec.tsv', 'w', encoding='utf-8') as fp:
        for line in data_iterator:
            # Column 6 of the TSV holds the left context for the prediction.
            left_context = clear_line(line.split('\t')[6])
            inputs = tokenizer.encode(left_context, return_tensors='pt').to(device)
            # Inference only: no_grad() skips building the autograd graph,
            # which would otherwise accumulate memory on every iteration.
            with torch.no_grad():
                preds = model(inputs)
            # Logits at the last position -> probability distribution over the vocab.
            probability_distances = torch.softmax(preds[0][0][-1], dim=0)
            top_k = probability_distances.topk(K)
            results = [f'{tokenizer.decode([idx])}:{value}' for value, idx in zip(top_k.values, top_k.indices)]
            # Trailing ':<rest>' entry carries the probability mass outside the top K.
            line_to_write = ' '.join(results) + f' :{1 - torch.sum(top_k.values)}\n'
            fp.write(line_to_write)