19 KiB
19 KiB
Zadania z laboratoriów 2
Zadanie 1
Znajdź 2 przykłady (słowa, zdania), w których zauważalne są różnice pomiędzy tokenizerami BERT oraz RoBERTa.
from transformers import BertTokenizer, RobertaTokenizer, PreTrainedTokenizerFast, AutoTokenizer
# Load the two pretrained English tokenizers that the exercises below compare.
bert_tokenizer = BertTokenizer.from_pretrained(
    pretrained_model_name_or_path='bert-base-uncased',
)
roberta_tokenizer = RobertaTokenizer.from_pretrained(
    pretrained_model_name_or_path='roberta-base',
)
text_en = 'Marion'  # a given name
# One line per tokenizer, sub-word tokens separated by single spaces.
print(*bert_tokenizer.tokenize(text_en), sep=' ')
print(*roberta_tokenizer.tokenize(text_en), sep=' ')
marion Mar ion
text_en = 'baptist'  # a member of the Baptist church
# One line per tokenizer, sub-word tokens separated by single spaces.
print(*bert_tokenizer.tokenize(text_en), sep=' ')
print(*roberta_tokenizer.tokenize(text_en), sep=' ')
baptist b apt ist
Zadanie 2
Znajdź 2 przykłady (słowa, zdania), dla których wyniki tokenizerów BERT oraz RoBERTa są podobne.
text_en = 'Football'
# One line per tokenizer, sub-word tokens separated by single spaces.
print(*bert_tokenizer.tokenize(text_en), sep=' ')
print(*roberta_tokenizer.tokenize(text_en), sep=' ')
football Football
text_en = 'I like reading.'
# One line per tokenizer, sub-word tokens separated by single spaces.
print(*bert_tokenizer.tokenize(text_en), sep=' ')
print(*roberta_tokenizer.tokenize(text_en), sep=' ')
i like reading . I Ġlike Ġreading .
Zadanie 3
Sprawdź, jak zachowa się tokenizer BERT/RoBERTa na języku innym niż angielski.
text_pl = 'Bardzo lubię informatykę.'
# Tokenization of Polish text using the English-language models' tokenizers.
print(*bert_tokenizer.tokenize(text_pl), sep=' ')
print(*roberta_tokenizer.tokenize(text_pl), sep=' ')
bard ##zo lu ##bie inform ##at ##yk ##e . B ard zo Ġl ubi Ä Ļ Ġinform at yk Ä Ļ .
Zadanie 4
Sprawdź, jak zachowa się tokenizer BERT/RoBERTa na tekście medycznym lub innym tekście specjalistycznym.
# Sentence taken from a medical article.
medical_en = 'When the Excluder, Endurant, and Zenith were pooled the rate of abdominal aortic aneurysm rupture was observed to be significantly higher among patients with the early AFX.'
# One line per tokenizer, sub-word tokens separated by single spaces.
print(*bert_tokenizer.tokenize(medical_en), sep=' ')
print(*roberta_tokenizer.tokenize(medical_en), sep=' ')
when the exclude ##r , end ##ura ##nt , and zenith were poole ##d the rate of abdominal ao ##rti ##c an ##eur ##ys ##m ru ##pt ##ure was observed to be significantly higher among patients with the early af ##x . When Ġthe ĠEx clud er , ĠEnd ur ant , Ġand ĠZen ith Ġwere Ġpooled Ġthe Ġrate Ġof Ġabdominal Ġa ort ic Ġan eur ys m Ġrupture Ġwas Ġobserved Ġto Ġbe Ġsignificantly Ġhigher Ġamong Ġpatients Ġwith Ġthe Ġearly ĠAF X .
Zadanie 5
Wykonaj po 3 przykłady FillMask dla modeli:
- BERT/RoBERTa
- Polish RoBERTa
BERT - angielski
import torch
from torch.nn import functional as F
from transformers import BertForMaskedLM
# English BERT with its masked-language-modelling head; loading it reports the
# checkpoint's next-sentence weights ('cls.seq_relationship.*') as unused.
bert_model = BertForMaskedLM.from_pretrained(
    pretrained_model_name_or_path='bert-base-uncased',
)
Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.bias', 'cls.seq_relationship.weight'] - This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model). - This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
# Fill-mask with English BERT: print the 10 most probable tokens for the masked slot.
inputs_mlm = bert_tokenizer(f'The sky was full of {bert_tokenizer.mask_token}.', return_tensors='pt')
# Reference sentence; passed as labels to the forward pass (the loss is not used below).
labels_mlm = bert_tokenizer("The sky was full of stars.", return_tensors="pt")["input_ids"]
# Inference only, so skip building the autograd graph.
with torch.no_grad():
    outputs_mlm = bert_model(**inputs_mlm, labels=labels_mlm)
# Find the mask in the encoded input instead of hard-coding its position, so the
# index stays correct if the prompt is edited or tokenized differently.
mask_positions = (inputs_mlm['input_ids'][0] == bert_tokenizer.mask_token_id).nonzero(as_tuple=True)[0]
mask_token_idx = mask_positions[0].item()
softmax_mlm = F.softmax(outputs_mlm.logits, dim=-1)
# Probability distribution over the vocabulary at the masked position.
mask_token = softmax_mlm[0, mask_token_idx, :]
top_10 = torch.topk(mask_token, 10, dim=0)
for i, (token_id, prob) in enumerate(zip(top_10.indices.tolist(), top_10.values.tolist())):
    token = bert_tokenizer.decode([token_id])
    print(f'{i:2}\t{token:25}', prob)
0 stars 0.6143525838851929 1 clouds 0.2152138501405716 2 birds 0.008692129515111446 3 blue 0.008089331910014153 4 cloud 0.005828939378261566 5 sunshine 0.005086773540824652 6 light 0.005068401340395212 7 flowers 0.004763070959597826 8 darkness 0.004391019232571125 9 lights 0.004141420125961304
# Fill-mask with English BERT: print the 10 most probable tokens for the masked slot.
inputs_mlm = bert_tokenizer(f'This jacket is a little too {bert_tokenizer.mask_token}.', return_tensors='pt')
# Reference sentence; passed as labels to the forward pass (the loss is not used below).
labels_mlm = bert_tokenizer("This jacket is a little too big.", return_tensors="pt")["input_ids"]
# Inference only, so skip building the autograd graph.
with torch.no_grad():
    outputs_mlm = bert_model(**inputs_mlm, labels=labels_mlm)
# Find the mask in the encoded input instead of hard-coding its position, so the
# index stays correct if the prompt is edited or tokenized differently.
mask_positions = (inputs_mlm['input_ids'][0] == bert_tokenizer.mask_token_id).nonzero(as_tuple=True)[0]
mask_token_idx = mask_positions[0].item()
softmax_mlm = F.softmax(outputs_mlm.logits, dim=-1)
# Probability distribution over the vocabulary at the masked position.
mask_token = softmax_mlm[0, mask_token_idx, :]
top_10 = torch.topk(mask_token, 10, dim=0)
for i, (token_id, prob) in enumerate(zip(top_10.indices.tolist(), top_10.values.tolist())):
    token = bert_tokenizer.decode([token_id])
    print(f'{i:2}\t{token:25}', prob)
0 tight 0.2341388612985611 1 big 0.11350443959236145 2 heavy 0.07258473336696625 3 short 0.05406404659152031 4 long 0.050229042768478394 5 light 0.03884173184633255 6 thin 0.025743598118424416 7 revealing 0.020789707079529762 8 warm 0.01982339844107628 9 small 0.019418802112340927
# Fill-mask with English BERT: print the 10 most probable tokens for the masked slot.
inputs_mlm = bert_tokenizer(f"What's your favorite ice cream {bert_tokenizer.mask_token}?", return_tensors='pt')
# Reference sentence; passed as labels to the forward pass (the loss is not used below).
labels_mlm = bert_tokenizer("What's your favorite ice cream flavor?", return_tensors="pt")["input_ids"]
# Show how the prompt is split (special tokens such as [CLS] are not included here).
print(bert_tokenizer.tokenize(f"What's your favorite ice cream {bert_tokenizer.mask_token}?"))
# Inference only, so skip building the autograd graph.
with torch.no_grad():
    outputs_mlm = bert_model(**inputs_mlm, labels=labels_mlm)
# Find the mask in the encoded input instead of hard-coding its position, so the
# index stays correct if the prompt is edited or tokenized differently.
mask_positions = (inputs_mlm['input_ids'][0] == bert_tokenizer.mask_token_id).nonzero(as_tuple=True)[0]
mask_token_idx = mask_positions[0].item()
softmax_mlm = F.softmax(outputs_mlm.logits, dim=-1)
# Probability distribution over the vocabulary at the masked position.
mask_token = softmax_mlm[0, mask_token_idx, :]
top_10 = torch.topk(mask_token, 10, dim=0)
for i, (token_id, prob) in enumerate(zip(top_10.indices.tolist(), top_10.values.tolist())):
    token = bert_tokenizer.decode([token_id])
    print(f'{i:2}\t{token:25}', prob)
['what', "'", 's', 'your', 'favorite', 'ice', 'cream', '[MASK]', '?'] 0 flavor 0.5929659008979797 1 now 0.014950926415622234 2 line 0.014521223492920399 3 recipe 0.013670633547008038 4 color 0.010578353889286518 5 ? 0.00849001295864582 6 thing 0.00799252837896347 7 please 0.007873623631894588 8 today 0.007739454973489046 9 number 0.007451422978192568
RoBERTa - angielski
from transformers import RobertaForMaskedLM
# English RoBERTa with its masked-language-modelling head.
roberta_model = RobertaForMaskedLM.from_pretrained(
    pretrained_model_name_or_path='roberta-base',
)
# Fill-mask with English RoBERTa: print the 10 most probable tokens for the masked slot.
inputs_mlm = roberta_tokenizer(f'Hand me the {roberta_tokenizer.mask_token}!', return_tensors='pt')
# Reference sentence; passed as labels to the forward pass (the loss is not used below).
labels_mlm = roberta_tokenizer("Hand me the hammer!", return_tensors="pt")["input_ids"]
# Show how the prompt is split (special tokens are not included here).
print(roberta_tokenizer.tokenize(f'Hand me the {roberta_tokenizer.mask_token}!'))
# Inference only, so skip building the autograd graph.
with torch.no_grad():
    outputs_mlm = roberta_model(**inputs_mlm, labels=labels_mlm)
# Find the mask in the encoded input instead of hard-coding its position, so the
# index stays correct if the prompt is edited or tokenized differently.
mask_positions = (inputs_mlm['input_ids'][0] == roberta_tokenizer.mask_token_id).nonzero(as_tuple=True)[0]
mask_token_idx = mask_positions[0].item()
softmax_mlm = F.softmax(outputs_mlm.logits, dim=-1)
# Probability distribution over the vocabulary at the masked position.
mask_token = softmax_mlm[0, mask_token_idx, :]
top_10 = torch.topk(mask_token, 10, dim=0)
for i, (token_id, prob) in enumerate(zip(top_10.indices.tolist(), top_10.values.tolist())):
    token = roberta_tokenizer.decode([token_id])
    print(f'{i:2}\t{token:25}', prob)
['Hand', 'Ġme', 'Ġthe', '<mask>', '!'] 0 keys 0.33524537086486816 1 phone 0.05494626611471176 2 key 0.02826027013361454 3 paper 0.025939658284187317 4 papers 0.01922498270869255 5 reins 0.018558315932750702 6 cup 0.016417579725384712 7 bag 0.015210084617137909 8 coffee 0.014366202056407928 9 gun 0.013706102967262268
RoBERTa - polski
from transformers import AutoModelForMaskedLM
# Polish RoBERTa tokenizer and masked-LM model, both from the same checkpoint.
# Loading the tokenizer through PreTrainedTokenizerFast triggers a class-mismatch
# warning, because the checkpoint declares 'RobertaTokenizer' as its tokenizer class.
polish_roberta_tokenizer = PreTrainedTokenizerFast.from_pretrained(
    pretrained_model_name_or_path='sdadas/polish-roberta-large-v1',
)
polish_roberta_model = AutoModelForMaskedLM.from_pretrained(
    pretrained_model_name_or_path='sdadas/polish-roberta-large-v1',
)
The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. The tokenizer class you load from this checkpoint is 'RobertaTokenizer'. The class this function is called from is 'PreTrainedTokenizerFast'.
# Fill-mask with Polish RoBERTa: print the 10 most probable tokens for the masked slot.
inputs_mlm = polish_roberta_tokenizer(f'Bardzo lubię {polish_roberta_tokenizer.mask_token}.', return_tensors='pt')
# Reference sentence; passed as labels to the forward pass (the loss is not used below).
labels_mlm = polish_roberta_tokenizer("Bardzo lubię czytać.", return_tensors="pt")["input_ids"]
# Show how the prompt is split (special tokens are not included here).
print(polish_roberta_tokenizer.tokenize(f'Bardzo lubię {polish_roberta_tokenizer.mask_token}.'))
# Inference only, so skip building the autograd graph.
with torch.no_grad():
    outputs_mlm = polish_roberta_model(**inputs_mlm, labels=labels_mlm)
# Find the mask in the encoded input instead of hard-coding its position, so the
# index stays correct if the prompt is edited or tokenized differently.
mask_positions = (inputs_mlm['input_ids'][0] == polish_roberta_tokenizer.mask_token_id).nonzero(as_tuple=True)[0]
mask_token_idx = mask_positions[0].item()
softmax_mlm = F.softmax(outputs_mlm.logits, dim=-1)
# Probability distribution over the vocabulary at the masked position.
mask_token = softmax_mlm[0, mask_token_idx, :]
top_10 = torch.topk(mask_token, 10, dim=0)
for i, (token_id, prob) in enumerate(zip(top_10.indices.tolist(), top_10.values.tolist())):
    token = polish_roberta_tokenizer.decode([token_id])
    print(f'{i:2}\t{token:25}', prob)
['Bar', 'dzo', '▁lubię', ' <mask>', '.'] 0 czytać 0.06616953760385513 1 podróżować 0.04533696547150612 2 gotować 0.04076462611556053 3 muzykę 0.039369307458400726 4 koty 0.03558063879609108 5 pisać 0.03538721054792404 6 książki 0.033440858125686646 7 śpiewać 0.02773296646773815 8 sport 0.027220433577895164 9 tańczyć 0.026598699390888214
# Fill-mask with Polish RoBERTa: print the 10 most probable tokens for the masked slot.
inputs_mlm = polish_roberta_tokenizer(f'Zajęcia na uczelni są {polish_roberta_tokenizer.mask_token}.', return_tensors='pt')
# Reference sentence; passed as labels to the forward pass (the loss is not used below).
labels_mlm = polish_roberta_tokenizer("Zajęcia na uczelni są ciekawe.", return_tensors="pt")["input_ids"]
# Show how the prompt is split (special tokens are not included here).
print(polish_roberta_tokenizer.tokenize(f'Zajęcia na uczelni są {polish_roberta_tokenizer.mask_token}.'))
# Inference only, so skip building the autograd graph.
with torch.no_grad():
    outputs_mlm = polish_roberta_model(**inputs_mlm, labels=labels_mlm)
# Find the mask in the encoded input instead of hard-coding its position, so the
# index stays correct if the prompt is edited or tokenized differently.
mask_positions = (inputs_mlm['input_ids'][0] == polish_roberta_tokenizer.mask_token_id).nonzero(as_tuple=True)[0]
mask_token_idx = mask_positions[0].item()
softmax_mlm = F.softmax(outputs_mlm.logits, dim=-1)
# Probability distribution over the vocabulary at the masked position.
mask_token = softmax_mlm[0, mask_token_idx, :]
top_10 = torch.topk(mask_token, 10, dim=0)
for i, (token_id, prob) in enumerate(zip(top_10.indices.tolist(), top_10.values.tolist())):
    token = polish_roberta_tokenizer.decode([token_id])
    print(f'{i:2}\t{token:25}', prob)
['Za', 'jęcia', '▁na', '▁uczelni', '▁są', ' <mask>', '.'] 0 bezpłatne 0.9145433902740479 1 obowiązkowe 0.014430041424930096 2 prowadzone 0.010215427726507187 3 zróżnicowane 0.008744887076318264 4 różnorodne 0.00670977309346199 5 następujące 0.004183280747383833 6 otwarte 0.002896391786634922 7 intensywne 0.002672090893611312 8 realizowane 0.0019869415555149317 9 ok 0.0018993624253198504
# Fill-mask with Polish RoBERTa: print the 10 most probable tokens for the masked slot.
inputs_mlm = polish_roberta_tokenizer(f'Jutro na obiad będzie {polish_roberta_tokenizer.mask_token}.', return_tensors='pt')
# Reference sentence; passed as labels to the forward pass (the loss is not used below).
labels_mlm = polish_roberta_tokenizer("Jutro na obiad będzie ryba.", return_tensors="pt")["input_ids"]
# Show how the prompt is split (special tokens are not included here).
print(polish_roberta_tokenizer.tokenize(f'Jutro na obiad będzie {polish_roberta_tokenizer.mask_token}.'))
# Inference only, so skip building the autograd graph.
with torch.no_grad():
    outputs_mlm = polish_roberta_model(**inputs_mlm, labels=labels_mlm)
# Find the mask in the encoded input instead of hard-coding its position, so the
# index stays correct if the prompt is edited or tokenized differently.
mask_positions = (inputs_mlm['input_ids'][0] == polish_roberta_tokenizer.mask_token_id).nonzero(as_tuple=True)[0]
mask_token_idx = mask_positions[0].item()
softmax_mlm = F.softmax(outputs_mlm.logits, dim=-1)
# Probability distribution over the vocabulary at the masked position.
mask_token = softmax_mlm[0, mask_token_idx, :]
top_10 = torch.topk(mask_token, 10, dim=0)
for i, (token_id, prob) in enumerate(zip(top_10.indices.tolist(), top_10.values.tolist())):
    token = polish_roberta_tokenizer.decode([token_id])
    print(f'{i:2}\t{token:25}', prob)
['Ju', 'tro', '▁na', '▁obiad', '▁będzie', ' <mask>', '.'] 0 ryba 0.27743467688560486 1 mięso 0.1686241328716278 2 ciasto 0.024455789476633072 3 ryż 0.0164520051330328 4 niedziela 0.013327408581972122 5 masło 0.01118378434330225 6 obiad 0.010521633550524712 7 chleb 0.00991259329020977 8 czwartek 0.009901482611894608 9 wino 0.008945722132921219
Zadanie 6
Spróbuj sprawdzić, czy jedno zdanie następuje po drugim.