challenging-america-word-ga.../gpt-2 finetune.ipynb
2023-06-29 18:36:47 +02:00

22 KiB
Raw Blame History

test_file = 'dev-0/in.tsv.xz'
out_file = 'dev-0/out.tsv'
import torch.nn.functional as F
import torch
import lzma
###preprocessing
def preprocess(line):
    """Reduce a raw TSV row to its text columns with line-break markers flattened.

    The row is first passed through ``get_rid_of_header`` (drops the leading
    metadata fields) and the result through ``replace_endline`` (turns the
    escaped line-break marker into a space).
    """
    return replace_endline(get_rid_of_header(line))

def get_rid_of_header(line):
    """Drop the first six tab-separated fields and space-join whatever follows.

    Returns an empty string when the row has six fields or fewer.
    """
    # Split off at most six leading fields; the seventh part is the untouched rest.
    parts = line.split('\t', 6)
    if len(parts) < 7:
        return ""
    # Joining the remaining fields with spaces equals replacing their tabs.
    return parts[6].replace('\t', ' ')
    
def replace_endline(line):
    """Replace every escaped line-break marker in ``line`` with a single space.

    The marker is the three-character sequence backslash, backslash, ``n``;
    real newline characters are left untouched.
    """
    pieces = line.split("\\\\n")
    return " ".join(pieces)

def get_first_word(text):
    """Return the first word of a string.

    The first word is everything before the first space character, with
    trailing whitespace (e.g. a newline) removed. If ``text`` contains no
    space the whole string is the word; if it starts with a space, or is
    empty, the result is an empty string.
    """
    # partition() inspects the whole string, so the final character is kept
    # when there is no space (the previous index loop stopped one short).
    first, _, _ = text.partition(' ')
    return first.rstrip()



def summarize_probs_unk(dic, const_wildcard=True, scale_probs=False, wildcard_minweight=0.01):
    """Turn a word->probability mapping into a list that ends with a wildcard entry.

    dic: dictionary of probabilities returned by the model. An entry under the
        empty-string key, if present, is treated as the wildcard (<unk>) and is
        never emitted as a regular word. The dictionary is not modified.
    const_wildcard: only consulted when ``scale_probs`` is true. If true, the
        word probabilities are divided by ``sum * (1 + wildcard_minweight)``,
        so they sum to ``1 / (1 + wildcard_minweight)``.
        If false, they are divided by ``1 - w`` where ``w`` is the wildcard
        probability found in ``dic`` (or ``wildcard_minweight`` if absent).
    scale_probs: if false, the word probabilities are passed through unchanged.
    wildcard_minweight: weight used as described above.

    returns: list of ``(word, probability)`` tuples in the iteration order of
        ``dic``, followed by ``('', 1 - sum_of_word_probabilities)`` as the
        last element.
    """
    has_wildcard = '' in dic
    # Build a fresh mapping without the wildcard key instead of deleting from
    # (and rescaling inside) the caller's dictionary.
    probs = {key: val for key, val in dic.items() if key != ''}

    if scale_probs:
        if const_wildcard:
            probsum = sum(float(val) for val in probs.values())
            divisor = probsum * (1 + wildcard_minweight)
        else:
            wildcard_value = dic[''] if has_wildcard else wildcard_minweight
            # NOTE(review): dividing by (1 - wildcard_value) enlarges the word
            # probabilities; if they already sum to more than
            # (1 - wildcard_value), the wildcard mass computed below becomes
            # negative. Confirm whether a multiplication was intended.
            divisor = 1 - wildcard_value
        probs = {key: val / divisor for key, val in probs.items()}

    tab = list(probs.items())
    # The wildcard receives whatever probability mass the words leave over.
    tab.append(('', 1 - sum(list(probs.values()))))
    return tab


def gonito_format(tab):
    """Serialise a probability table into one tab-separated output line.

    tab: sequence of (word, probability) entries whose last element is the
        wildcard entry.
    returns: ``word:prob`` fields separated by tabs, then the wildcard written
        as ``:prob`` (its word is ignored), terminated by a newline.
    Raises IndexError when ``tab`` is empty.
    """
    fields = [str(entry[0]) + ':' + str(entry[1]) + '\t' for entry in tab[:-1]]
    # Only the probability of the final (wildcard) entry is written.
    fields.append(':' + str(tab[-1][1]) + '\n')
    return ''.join(fields)
!pip install transformers
from transformers import pipeline, set_seed, AutoTokenizer, AutoModel, AutoModelForCausalLM, GPT2Tokenizer
Defaulting to user installation because normal site-packages is not writeable
Requirement already satisfied: transformers in /home/gedin/.local/lib/python3.10/site-packages (4.30.2)
Requirement already satisfied: requests in /usr/lib/python3/dist-packages (from transformers) (2.25.1)
Requirement already satisfied: tokenizers!=0.11.3,<0.14,>=0.11.1 in /home/gedin/.local/lib/python3.10/site-packages (from transformers) (0.13.3)
Requirement already satisfied: numpy>=1.17 in /home/gedin/.local/lib/python3.10/site-packages (from transformers) (1.24.3)
Requirement already satisfied: pyyaml>=5.1 in /usr/lib/python3/dist-packages (from transformers) (5.4.1)
Requirement already satisfied: tqdm>=4.27 in /home/gedin/.local/lib/python3.10/site-packages (from transformers) (4.65.0)
Requirement already satisfied: safetensors>=0.3.1 in /home/gedin/.local/lib/python3.10/site-packages (from transformers) (0.3.1)
Requirement already satisfied: filelock in /home/gedin/.local/lib/python3.10/site-packages (from transformers) (3.12.0)
Requirement already satisfied: huggingface-hub<1.0,>=0.14.1 in /home/gedin/.local/lib/python3.10/site-packages (from transformers) (0.15.1)
Requirement already satisfied: regex!=2019.12.17 in /home/gedin/.local/lib/python3.10/site-packages (from transformers) (2023.5.5)
Requirement already satisfied: packaging>=20.0 in /home/gedin/.local/lib/python3.10/site-packages (from transformers) (23.1)
Requirement already satisfied: fsspec in /home/gedin/.local/lib/python3.10/site-packages (from huggingface-hub<1.0,>=0.14.1->transformers) (2023.6.0)
Requirement already satisfied: typing-extensions>=3.7.4.3 in /home/gedin/.local/lib/python3.10/site-packages (from huggingface-hub<1.0,>=0.14.1->transformers) (4.6.3)
Defaulting to user installation because normal site-packages is not writeable
Requirement already satisfied: transformers in /home/gedin/.local/lib/python3.10/site-packages (4.30.2)
Requirement already satisfied: tqdm>=4.27 in /home/gedin/.local/lib/python3.10/site-packages (from transformers) (4.65.0)
Requirement already satisfied: regex!=2019.12.17 in /home/gedin/.local/lib/python3.10/site-packages (from transformers) (2023.5.5)
Requirement already satisfied: pyyaml>=5.1 in /usr/lib/python3/dist-packages (from transformers) (5.4.1)
Requirement already satisfied: requests in /usr/lib/python3/dist-packages (from transformers) (2.25.1)
Requirement already satisfied: packaging>=20.0 in /home/gedin/.local/lib/python3.10/site-packages (from transformers) (23.1)
Requirement already satisfied: numpy>=1.17 in /home/gedin/.local/lib/python3.10/site-packages (from transformers) (1.24.3)
Requirement already satisfied: huggingface-hub<1.0,>=0.14.1 in /home/gedin/.local/lib/python3.10/site-packages (from transformers) (0.15.1)
Requirement already satisfied: tokenizers!=0.11.3,<0.14,>=0.11.1 in /home/gedin/.local/lib/python3.10/site-packages (from transformers) (0.13.3)
Requirement already satisfied: safetensors>=0.3.1 in /home/gedin/.local/lib/python3.10/site-packages (from transformers) (0.3.1)
Requirement already satisfied: filelock in /home/gedin/.local/lib/python3.10/site-packages (from transformers) (3.12.0)
Requirement already satisfied: typing-extensions>=3.7.4.3 in /home/gedin/.local/lib/python3.10/site-packages (from huggingface-hub<1.0,>=0.14.1->transformers) (4.6.3)
Requirement already satisfied: fsspec in /home/gedin/.local/lib/python3.10/site-packages (from huggingface-hub<1.0,>=0.14.1->transformers) (2023.6.0)
# Load the pretrained GPT-2 tokenizer and causal language model once; later
# cells reuse `tokenizer` and `pt_model`.
model_name = "gpt2"
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
pt_model = AutoModelForCausalLM.from_pretrained(model_name)
# Sample left context used to inspect the model's next-token prediction.
sentence = 'Northeasterly hv the head of said .^corns and and the'
encoding = tokenizer(sentence, return_tensors='pt')
output = pt_model(**encoding)
# logits[0] picks the single sequence in the batch and [-1] its last position.
# dim is passed explicitly because the implicit choice is deprecated and
# emits a UserWarning.
probs = F.softmax(output.logits[0][-1], dim=-1)
/tmp/ipykernel_9135/861569571.py:1: UserWarning: Implicit dimension choice for softmax has been deprecated. Change the call to include dim=X as an argument.
  probs = F.softmax(output.logits[0][-1])
# Take the 10 highest-probability vocabulary entries for the next token.
top = torch.topk(probs, 10)
top_indices = top.indices.tolist()
top_probs = top.values.tolist()
# Decode every token id to text and strip the whitespace around it.
top_words = [tokenizer.decode(idx).strip() for idx in top_indices] 
# Show which text token id 198 decodes to (the cell result is displayed below).
tokenizer.decode(198)
'\n'
# print(top_indices)
print((top_words))
['head', 'heads', '.', 'same', 'body', 'neck', 'other', 'whole', 'said', 'king']
print(top_indices)
[1182, 6665, 764, 976, 1767, 7393, 584, 2187, 531, 5822]
print(top_probs)
[0.18150091171264648, 0.011893990449607372, 0.011805753223598003, 0.011544686742126942, 0.007725409232079983]
print(list(zip(top_words, top_probs)))
[('head', 0.18150091171264648), ('heads', 0.011893990449607372), ('.', 0.011805753223598003), ('same', 0.011544686742126942), ('body', 0.007725409232079983), ('neck', 0.007723228540271521), ('other', 0.006957209203392267), ('whole', 0.006453146692365408), ('said', 0.004757815971970558), ('king', 0.004543370567262173)]
################# Do the probabilities from get_values_from_model sum up to 1? ##################
# NOTE(review): the recorded output below shows rescaled probabilities with a
# 0.01 wildcard, which the current defaults (scale_probs=False) would not
# produce - presumably it comes from an earlier version of
# summarize_probs_unk; re-run to confirm.
asdf = dict(get_values_from_model(sentence, model=pt_model, tokenizer=tokenizer, k=10))
asdf = summarize_probs_unk(asdf)
# Print the table together with the total of its probabilities.
print([x for x in asdf] ,sum([x[1] for x in asdf]))
[('head', 0.7049117686793325), ('heads', 0.04619378363102533), ('.', 0.045851088608379394), ('same', 0.044837160725262136), ('body', 0.030003881711508234), ('neck', 0.02999541235835104), ('other', 0.027020352671284463), ('whole', 0.02506267877962141), ('said', 0.018478367079292513), ('king', 0.01764550575594297), ('', 0.01000000000000012)] 1.0
/tmp/ipykernel_9135/698839240.py:4: UserWarning: Implicit dimension choice for softmax has been deprecated. Change the call to include dim=X as an argument.
  probs = F.softmax(output.logits[0][-1]) #get the model prediction for the entire sentence ([-1]) no batching ([0])
def get_values_from_model(context: str, model, tokenizer, k=10):
    """Return the ``k`` most probable next tokens after ``context``.

    context: text to continue.
    model: callable taking the tokenizer output as keyword arguments and
        returning an object with a ``logits`` tensor indexed as
        [batch, position, vocabulary].
    tokenizer: callable producing the model inputs (``return_tensors='pt'``)
        and offering ``decode(token_id)``.
    k: number of candidates to return; capped at the vocabulary size.

    returns: list of ``(token_text, probability)`` tuples ordered from most to
        least probable. Token texts are stripped of surrounding whitespace, so
        distinct tokens may map to the same text.
    """
    encoding = tokenizer(context, return_tensors='pt')
    # Pure inference: disable autograd so no graph is built for every call.
    with torch.no_grad():
        output = model(**encoding)
        # [0]: only sequence in the batch, [-1]: prediction after the last token.
        probs = F.softmax(output.logits[0][-1], dim=-1)
        # topk raises when k exceeds the number of entries, so clamp it.
        top = torch.topk(probs, min(k, probs.shape[-1]))
    top_probs = top.values.tolist()
    top_indices = top.indices.tolist()
    top_words = [tokenizer.decode(idx).strip() for idx in top_indices]
    return list(zip(top_words, top_probs))
# Predict the missing word for every row of the compressed input file and
# write one Gonito-formatted line per row to out_file.
with lzma.open(test_file, 'rt') as file:
    # Left context: the last 10 space-separated tokens of the first text
    # column (the 7th tab-separated field) after flattening line-break markers.
    left_contexts = [
        ' '.join(replace_endline(line).split('\t')[6:][0].rstrip().split(" ")[-10:])
        for line in file
    ]
    # Ask the model for its 10 most probable continuations of each context.
    results = [
        dict(get_values_from_model(left_context, model=pt_model, tokenizer=tokenizer, k=10))
        for left_context in left_contexts
    ]
    with open(out_file, 'w') as outfile:
        outfile.writelines(
            gonito_format(
                summarize_probs_unk(elem, const_wildcard=False, scale_probs=True, wildcard_minweight=0.01)
            )
            for elem in results
        )
# Repeat the single-sentence experiment, this time with an explicit softmax
# dimension and only the 5 best candidates.
sentence = 'Northeasterly hv the head of said .^corns and and the'
encoding = tokenizer(sentence, return_tensors='pt')
output = pt_model(**encoding)
probs = F.softmax(output.logits[0][-1], dim = -1) # prediction after the last token ([-1]) of the only sequence in the batch ([0])
top = torch.topk(probs, 5)
top_probs = top.values.tolist()
top_indices =top.indices.tolist()
# Decode every token id to text and strip the whitespace around it.
top_words = [tokenizer.decode(idx).strip() for idx in top_indices] 
print(list(zip(top_words, top_probs)))
[('head', 0.18150091171264648), ('heads', 0.011893990449607372), ('.', 0.011805753223598003), ('same', 0.011544686742126942), ('body', 0.007725409232079983)]
output.logits.shape
torch.Size([1, 20, 50257])
print(list(zip(top_words, top_probs)))
[('head', 0.18150091171264648), ('heads', 0.011893990449607372), ('.', 0.011805753223598003), ('same', 0.011544686742126942), ('body', 0.007725409232079983)]
# Collect the left and right contexts of the first 21 rows for manual inspection.
with lzma.open(test_file, 'rt') as file:
    left_contexts = []
    right_contexts = []
    # results is only initialised here; this cell does not fill it.
    results = []
    i=0
    for line in file:
        # Stop once rows 0..20 have been processed.
        if i >20:
            break
        line = replace_endline(line) #get only relevant
        # Drop the six leading fields; line[0] / line[1] are the two text columns.
        line = line.split('\t')[6:]
        # Last 10 space-separated tokens before the gap.
        l_context = " ".join(line[0].rstrip().split(" ")[-10:])
        # First 5 space-separated tokens after the gap.
        r_context = " ".join(line[1].rstrip().split(" ")[:5])
        left_contexts.append(l_context)
        right_contexts.append(r_context)
        i+=1;
left_contexts
['id Seorgc Acorns laud in said Aina and riinninu from',
 'true that the Republican par- ty is National in its',
 'but here¬ abouts 1 have not seen or heard ot',
 'They will not bend or break like the single spiing.',
 '208 miles long. J. 8 . Con way the Com-',
 'They felt that He was the long-looked- for Messiah with',
 'in the state that could produce just as good fruit',
 'after house and managed to find shelter in the lazarette,',
 'hundred buildings put up in the place this season. Mechanics',
 'de\xad testation. I no longer loved him, and I felt',
 'ol the gentleman, by dis. cussing the constitutionality of these',
 'the purest pa- triotism, and tin* most indent devotion to',
 'now to i high, will come down, and those too',
 'the \'extension of MdKlnleJ" street, and running back to the',
 'sir, it is a subject ot peculiar delight to me',
 'to affect a miserable scramble for offices in tiiis country.',
 'Marj, the eldest, said lo the others: "Lit us pray',
 'not enquire—But what sum lie asked, could tempt men well',
 "hete are 500, or 550 acres ol cleared land, '250",
 'petition h is lieeii |ui tty elos ly scru;iiii/e<| and',
 'on the high reputation of his record as Chief Mag-']
# Run the model on each sampled left context, keeping the top 10 candidates
# per context as a word -> probability dict.
results = []
for left_context in left_contexts:
    results.append(dict(get_values_from_model(left_context, model=pt_model, tokenizer=tokenizer, k=10)))
# Print every prediction in Gonito format, using the constant-wildcard scaling.
for idx, elem in enumerate(results):
    tab = summarize_probs_unk(elem, const_wildcard=True, scale_probs=True, wildcard_minweight=0.01)
    print(idx, "    ", gonito_format(tab))
# Look at the raw top-10 candidates for the fifth sampled context (index 4).
a =dict(get_values_from_model(left_contexts[4], model=pt_model, tokenizer=tokenizer, k=10))
a
{'p': 0.2211313545703888,
 'm': 0.12113624811172485,
 'mun': 0.07101781666278839,
 'mon': 0.05452524498105049,
 'mission': 0.05246562883257866,
 'mand': 0.03528319671750069,
 'pan': 0.02023007906973362,
 'mer': 0.017731018364429474,
 'pl': 0.015618585981428623,
 'ple': 0.01504151988774538}
print(sum(float(y) for x, y in a.items()))
0.624180693179369
# Rescale the candidates with wildcard_minweight=0.1: the words then sum to
# 1 / 1.1 and the wildcard receives the remaining ~0.0909.
b = summarize_probs_unk(a, const_wildcard=True, scale_probs=True, wildcard_minweight=0.1)
print(gonito_format(b))
p:0.3220678024033818	m:0.1764294588459882	mun:0.1034342334152795	mon:0.07941354974589608	mission:0.07641381211021994	mand:0.05138837796498232	pan:0.02946419390001886	mer:0.02582442517073288	pl:0.022747763081629364	ple:0.021907292452780093	:0.09090909090909094

print(sum(float(y) for x, y in b))
1.0