test_file = 'dev-0/in.tsv.xz'
out_file = 'dev-0/out.tsv'
import torch.nn.functional as F
import torch
import lzma
### preprocessing
def preprocess(line):
    line = get_rid_of_header(line)
    line = replace_endline(line)
    return line

def get_rid_of_header(line):
    line = line.split('\t')[6:]
    return " ".join(line)

def replace_endline(line):
    line = line.replace("\\\\n", " ")
    return line
def get_first_word(text):
    """Return the first word of a string."""
    word = ""
    for i in range(len(text)):
        if text[i] == ' ':
            return word.rstrip()
        else:
            word += text[i]
    return word.rstrip()
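### tiny illustrative check of get_first_word: it returns everything up to the first space
print(get_first_word('head of said'))  # -> head
print(get_first_word('head'))          # -> head (no space in the input)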
def summarize_probs_unk(dic, const_wildcard=True, scale_probs=False, wildcard_minweight=0.01):
    '''
    dic: dictionary of probabilities returned by the model
    returns: list of (word, probability) pairs, with the <unk> wildcard ('') specifically as the last element
    '''
    if not scale_probs:
        if '' in dic.keys():
            del dic['']
        tab = [(key, val) for key, val in dic.items()]
        tab.append(('', 1 - sum([val for val in dic.values()])))
    elif const_wildcard and scale_probs:
        if '' in dic.keys():
            del dic['']
        probsum = sum(float(val) for key, val in dic.items())
        for key in dic:
            dic[key] = dic[key] / (probsum * (1 + wildcard_minweight))
        tab = [(key, val) for key, val in dic.items()]
        tab.append(('', 1 - sum([val for val in dic.values()])))
    else:
        if '' not in dic.keys():
            wildcard_value = wildcard_minweight
        else:
            wildcard_value = dic['']
            del dic['']
        for key in dic:
            dic[key] = dic[key] / (1 - wildcard_value)  # leave some probability mass for the wildcard
        tab = [(key, val) for key, val in dic.items()]
        tab.append(('', 1 - sum([val for val in dic.values()])))
    return tab
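### illustrative check with toy numbers (not model output): the returned list should always sum to 1,
### with the '' wildcard absorbing whatever mass the listed words do not cover
toy = {'head': 0.4, 'heads': 0.2}
print(summarize_probs_unk(dict(toy), const_wildcard=False, scale_probs=False))
# -> roughly [('head', 0.4), ('heads', 0.2), ('', 0.4)]   (wildcard takes the leftover mass, up to float rounding)
print(summarize_probs_unk(dict(toy), const_wildcard=True, scale_probs=True, wildcard_minweight=0.01))
# -> words rescaled so that the wildcard keeps roughly 0.01/(1+0.01) of the total mass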
def gonito_format(tab):
    result = ''
    for element in tab[:-1]:
        result += str(element[0]) + ':' + str(element[1]) + '\t'
    result += ':' + str(tab[-1][1]) + '\n'
    return result
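### example of a single output row (hypothetical probabilities): tab-separated "word:prob" pairs, with the
### trailing ":prob" entry standing for the unnamed wildcard
print(gonito_format([('head', 0.7), ('heads', 0.2), ('', 0.1)]), end='')
# -> "head:0.7\theads:0.2\t:0.1\n"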
!pip install transformers
from transformers import pipeline, set_seed, AutoTokenizer, AutoModel, AutoModelForCausalLM, GPT2Tokenizer
model_name = "gpt2"
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
pt_model = AutoModelForCausalLM.from_pretrained(model_name)
sentence = 'Northeasterly hv the head of said .^corn’s and and the'
encoding = tokenizer(sentence, return_tensors='pt')
output = pt_model(**encoding)
probs = F.softmax(output.logits[0][-1], dim=-1)
top = torch.topk(probs, 10)
top_indices = top.indices.tolist()
top_probs = top.values.tolist()
top_words = [tokenizer.decode(idx).strip() for idx in top_indices]
tokenizer.decode(198)
'\n'
print(top_words)
['head', 'heads', '.', 'same', 'body', 'neck', 'other', 'whole', 'said', 'king']
print(top_indices)
[1182, 6665, 764, 976, 1767, 7393, 584, 2187, 531, 5822]
print(top_probs)
[0.18150091171264648, 0.011893990449607372, 0.011805753223598003, 0.011544686742126942, 0.007725409232079983]
print(list(zip(top_words, top_probs)))
[('head', 0.18150091171264648), ('heads', 0.011893990449607372), ('.', 0.011805753223598003), ('same', 0.011544686742126942), ('body', 0.007725409232079983), ('neck', 0.007723228540271521), ('other', 0.006957209203392267), ('whole', 0.006453146692365408), ('said', 0.004757815971970558), ('king', 0.004543370567262173)]
################# Do the probabilities from get_values_from_model sum up to 1? ##################
asdf = dict(get_values_from_model(sentence, model=pt_model, tokenizer=tokenizer, k=10))
asdf = summarize_probs_unk(asdf)
print([x for x in asdf] ,sum([x[1] for x in asdf]))
[('head', 0.7049117686793325), ('heads', 0.04619378363102533), ('.', 0.045851088608379394), ('same', 0.044837160725262136), ('body', 0.030003881711508234), ('neck', 0.02999541235835104), ('other', 0.027020352671284463), ('whole', 0.02506267877962141), ('said', 0.018478367079292513), ('king', 0.01764550575594297), ('', 0.01000000000000012)] 1.0
def get_values_from_model(context: str, model, tokenizer, k=10):
    encoding = tokenizer(context, return_tensors='pt')
    output = model(**encoding)
    probs = F.softmax(output.logits[0][-1], dim=-1)  # next-token distribution: [0] = single batch item, [-1] = last position
    top = torch.topk(probs, k)
    top_probs = top.values.tolist()
    top_indices = top.indices.tolist()
    top_words = [tokenizer.decode(idx).strip() for idx in top_indices]
    return list(zip(top_words, top_probs))
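### a minimal speed-up sketch (the helper name get_values_from_model_nograd is made up here): wrapping the
### forward pass in torch.no_grad() skips gradient bookkeeping, which is all we need at inference time
def get_values_from_model_nograd(context: str, model, tokenizer, k=10):
    with torch.no_grad():
        output = model(**tokenizer(context, return_tensors='pt'))
    probs = F.softmax(output.logits[0][-1], dim=-1)
    top = torch.topk(probs, k)
    return list(zip([tokenizer.decode(idx).strip() for idx in top.indices.tolist()],
                    top.values.tolist()))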
with lzma.open(test_file, 'rt') as file:
    left_contexts = []
    results = []
    for line in file:
        line = replace_endline(line)
        line = line.split('\t')[6:]  # keep only the text columns
        context = ' '.join(line[0].rstrip().split(" ")[-10:])  # last 10 words of the left context
        left_contexts.append(context)
### get results from the gpt model ###
for left_context in left_contexts:
    results.append(dict(get_values_from_model(left_context, model=pt_model, tokenizer=tokenizer, k=10)))
with open(out_file, 'w') as outfile:
    for elem in results:
        tab = summarize_probs_unk(elem, const_wildcard=False, scale_probs=True, wildcard_minweight=0.01)
        outfile.write(gonito_format(tab))
sentence = 'Northeasterly hv the head of said .^corn’s and and the'
encoding = tokenizer(sentence, return_tensors='pt')
output = pt_model(**encoding)
probs = F.softmax(output.logits[0][-1], dim=-1)  # next-token distribution after the last input token
top = torch.topk(probs, 5)
top_probs = top.values.tolist()
top_indices = top.indices.tolist()
top_words = [tokenizer.decode(idx).strip() for idx in top_indices]
print(list(zip(top_words, top_probs)))
[('head', 0.18150091171264648), ('heads', 0.011893990449607372), ('.', 0.011805753223598003), ('same', 0.011544686742126942), ('body', 0.007725409232079983)]
output.logits.shape
torch.Size([1, 20, 50257])
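### reading the shape: (batch size, number of input tokens, GPT-2 vocabulary size), so output.logits[0][-1]
### above picks the next-token distribution after the last input token; a quick illustrative assertion:
assert output.logits.shape == (1, encoding['input_ids'].shape[1], tokenizer.vocab_size)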
with lzma.open(test_file, 'rt') as file:
    left_contexts = []
    right_contexts = []
    results = []
    i = 0
    for line in file:
        if i > 20:
            break
        line = replace_endline(line)
        line = line.split('\t')[6:]  # keep only the text columns
        l_context = " ".join(line[0].rstrip().split(" ")[-10:])  # last 10 words of the left context
        r_context = " ".join(line[1].rstrip().split(" ")[:5])    # first 5 words of the right context
        left_contexts.append(l_context)
        right_contexts.append(r_context)
        i += 1
left_contexts
['id Seorgc Acorns laud in said Aina and riinninu from', 'true that the Republican par- ty is National in its', 'but here¬ abouts 1 have not seen or heard ot', 'They will not bend or break like the single spiing.', '208 miles long. J. 8 . Con way the Com-', 'They felt that He was the long-looked- for Messiah with', 'in the state that could produce just as good fruit', 'after house and managed to find shelter in the lazarette,', 'hundred buildings put up in the place this season. Mechanics', 'de\xad testation. I no longer loved him, and I felt', 'ol the gentleman, by dis. cussing the constitutionality of these', 'the purest pa- triotism, and tin* most indent devotion to', 'now to i high, will come down, and those too', 'the \'extension of MdKlnleJ" street, and running back to the', 'sir, it is a subject ot peculiar delight to me', 'to affect a miserable scramble for offices in tiiis country.', 'Marj, the eldest, said lo the others: "Lit us pray', 'not enquire—But what sum lie asked, could tempt men well', "hete are 500, or 550 acres ol cleared land, '250", 'petition h is lieeii |ui tty elos ly scru;iiii/e<| and', 'on the high reputation of his record as Chief Mag-']
results = []
for left_context in left_contexts:
    results.append(dict(get_values_from_model(left_context, model=pt_model, tokenizer=tokenizer, k=10)))
for idx, elem in enumerate(results):
    tab = summarize_probs_unk(elem, const_wildcard=True, scale_probs=True, wildcard_minweight=0.01)
    print(idx, " ", gonito_format(tab))
a = dict(get_values_from_model(left_contexts[4], model=pt_model, tokenizer=tokenizer, k=10))
a
{'p': 0.2211313545703888, 'm': 0.12113624811172485, 'mun': 0.07101781666278839, 'mon': 0.05452524498105049, 'mission': 0.05246562883257866, 'mand': 0.03528319671750069, 'pan': 0.02023007906973362, 'mer': 0.017731018364429474, 'pl': 0.015618585981428623, 'ple': 0.01504151988774538}
print(sum(float(y) for x, y in a.items()))
0.624180693179369
b = summarize_probs_unk(a, const_wildcard=True, scale_probs=True, wildcard_minweight=0.1)
print(gonito_format(b))
p:0.3220678024033818 m:0.1764294588459882 mun:0.1034342334152795 mon:0.07941354974589608 mission:0.07641381211021994 mand:0.05138837796498232 pan:0.02946419390001886 mer:0.02582442517073288 pl:0.022747763081629364 ple:0.021907292452780093 :0.09090909090909094
print(sum(float(y) for x, y in b))
1.0
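### a last sanity-check sketch (assumes dev-0/out.tsv was already written by the loop above): every row of
### the submission should be a distribution, i.e. its probabilities should sum to ~1 up to float rounding
with open(out_file) as f:
    for row_number, row in enumerate(f):
        total = sum(float(pair.rsplit(':', 1)[1]) for pair in row.rstrip('\n').split('\t'))
        assert abs(total - 1.0) < 1e-6, f"row {row_number} sums to {total}"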