Przetwarzanie_tekstu/02.ipynb

21 KiB

Zadania z laboratoriów 2

Zadanie 1

Znajdź 2 przykłady (słowa, zdania) gdzie zauważalne są różnice pomiędzy tokenizerem BERT oraz RoBERTa

from transformers import BertTokenizer, RobertaTokenizer, PreTrainedTokenizerFast, AutoTokenizer

# Load the two English tokenizers compared throughout the notebook.
bert_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
roberta_tokenizer = RobertaTokenizer.from_pretrained('roberta-base')

# A given name ("Marion"): BERT's vocabulary keeps it whole, RoBERTa's BPE splits it.
text_en = 'Marion'

for tokenizer in (bert_tokenizer, roberta_tokenizer):
    print(' '.join(tokenizer.tokenize(text_en)))
marion
Mar ion
# 'baptist' — a single token for BERT, split into three pieces by RoBERTa.
text_en = 'baptist'

for tokenizer in (bert_tokenizer, roberta_tokenizer):
    print(' '.join(tokenizer.tokenize(text_en)))
baptist
b apt ist

Zadanie 2

Znajdź 2 przykłady (słowa, zdania) gdzie podobne są wyniki pomiędzy tokenizerem BERT oraz RoBERTa

# 'Football' — both tokenizers keep the word whole (BERT lowercases it).
text_en = 'Football'

for tokenizer in (bert_tokenizer, roberta_tokenizer):
    print(' '.join(tokenizer.tokenize(text_en)))
football
Football
# A short sentence: same word boundaries for both models
# (RoBERTa marks leading spaces with the 'Ġ' prefix).
text_en = 'I like reading.'

for tokenizer in (bert_tokenizer, roberta_tokenizer):
    print(' '.join(tokenizer.tokenize(text_en)))
i like reading .
I Ġlike Ġreading .

Zadanie 3

Sprawdź, jak zachowa się tokenizer BERT/RoBERTa na innym języku niż angielski

# A Polish sentence fed to the English-trained tokenizers.
text_pl = 'Bardzo lubię informatykę.'

# Tokenization with models trained only on English data.
for tokenizer in (bert_tokenizer, roberta_tokenizer):
    print(' '.join(tokenizer.tokenize(text_pl)))
bard ##zo lu ##bie inform ##at ##yk ##e .
B ard zo Ġl ubi Ä Ļ Ġinform at yk Ä Ļ .

Zadanie 4

Sprawdź, jak zachowa się tokenizer BERT/RoBERTy na tekście medycznym lub innym specjalistycznym tekście.

# Sentence taken from a medical article (specialist vocabulary).
medical_en = 'When the Excluder, Endurant, and Zenith were pooled the rate of abdominal aortic aneurysm rupture was observed to be significantly higher among patients with the early AFX.'

for tokenizer in (bert_tokenizer, roberta_tokenizer):
    print(' '.join(tokenizer.tokenize(medical_en)))
when the exclude ##r , end ##ura ##nt , and zenith were poole ##d the rate of abdominal ao ##rti ##c an ##eur ##ys ##m ru ##pt ##ure was observed to be significantly higher among patients with the early af ##x .
When Ġthe ĠEx clud er , ĠEnd ur ant , Ġand ĠZen ith Ġwere Ġpooled Ġthe Ġrate Ġof Ġabdominal Ġa ort ic Ġan eur ys m Ġrupture Ġwas Ġobserved Ġto Ġbe Ġsignificantly Ġhigher Ġamong Ġpatients Ġwith Ġthe Ġearly ĠAF X .

Zadanie 5

Wykonaj po 3 przykłady FillMask dla modeli:

  • BERT/RoBERTa
  • Polish RoBERTa

BERT - angielski

import torch
from torch.nn import functional as F
from transformers import BertForMaskedLM

# Masked-language-model head on top of the same BERT checkpoint used above;
# the "some weights not used" warning is expected for this head swap.
bert_model = BertForMaskedLM.from_pretrained('bert-base-uncased')
Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
# Fill-mask example 1: "The sky was full of [MASK]." (ground truth: "stars")
inputs_mlm = bert_tokenizer(f'The sky was full of {bert_tokenizer.mask_token}.', return_tensors='pt')
labels_mlm = bert_tokenizer("The sky was full of stars.", return_tensors="pt")["input_ids"]

# Inference only — no gradients needed.
with torch.no_grad():
    outputs_mlm = bert_model(**inputs_mlm, labels=labels_mlm)

# Locate [MASK] dynamically instead of hard-coding its position; this stays
# correct if the prompt text changes.
mask_token_idx = (inputs_mlm["input_ids"][0] == bert_tokenizer.mask_token_id).nonzero(as_tuple=True)[0].item()
softmax_mlm = F.softmax(outputs_mlm.logits, dim=-1)

# Probability distribution over the vocabulary at the masked position.
mask_token = softmax_mlm[0, mask_token_idx, :]
top_10 = torch.topk(mask_token, 10, dim=0)

for i, (token_id, prob) in enumerate(zip(top_10.indices, top_10.values)):
    token = bert_tokenizer.decode([token_id])
    print(f'{i:2}\t{token:25}', prob.item())
 0	stars                     0.6143525838851929
 1	clouds                    0.2152138501405716
 2	birds                     0.008692129515111446
 3	blue                      0.008089331910014153
 4	cloud                     0.005828939378261566
 5	sunshine                  0.005086773540824652
 6	light                     0.005068401340395212
 7	flowers                   0.004763070959597826
 8	darkness                  0.004391019232571125
 9	lights                    0.004141420125961304
# Fill-mask example 2: "This jacket is a little too [MASK]." (ground truth: "big")
inputs_mlm = bert_tokenizer(f'This jacket is a little too {bert_tokenizer.mask_token}.', return_tensors='pt')
labels_mlm = bert_tokenizer("This jacket is a little too big.", return_tensors="pt")["input_ids"]

# Inference only — no gradients needed.
with torch.no_grad():
    outputs_mlm = bert_model(**inputs_mlm, labels=labels_mlm)

# Locate [MASK] dynamically instead of hard-coding its position.
mask_token_idx = (inputs_mlm["input_ids"][0] == bert_tokenizer.mask_token_id).nonzero(as_tuple=True)[0].item()
softmax_mlm = F.softmax(outputs_mlm.logits, dim=-1)

mask_token = softmax_mlm[0, mask_token_idx, :]
top_10 = torch.topk(mask_token, 10, dim=0)

for i, (token_id, prob) in enumerate(zip(top_10.indices, top_10.values)):
    token = bert_tokenizer.decode([token_id])
    print(f'{i:2}\t{token:25}', prob.item())
 0	tight                     0.2341388612985611
 1	big                       0.11350443959236145
 2	heavy                     0.07258473336696625
 3	short                     0.05406404659152031
 4	long                      0.050229042768478394
 5	light                     0.03884173184633255
 6	thin                      0.025743598118424416
 7	revealing                 0.020789707079529762
 8	warm                      0.01982339844107628
 9	small                     0.019418802112340927
# Fill-mask example 3: "What's your favorite ice cream [MASK]?" (ground truth: "flavor")
inputs_mlm = bert_tokenizer(f"What's your favorite ice cream {bert_tokenizer.mask_token}?", return_tensors='pt')
labels_mlm = bert_tokenizer("What's your favorite ice cream flavor?", return_tensors="pt")["input_ids"]
print(bert_tokenizer.tokenize(f"What's your favorite ice cream {bert_tokenizer.mask_token}?"))

# Inference only — no gradients needed.
with torch.no_grad():
    outputs_mlm = bert_model(**inputs_mlm, labels=labels_mlm)

# Locate [MASK] dynamically instead of hard-coding its position
# (the apostrophe in "What's" shifts token positions, so hard-coding is fragile).
mask_token_idx = (inputs_mlm["input_ids"][0] == bert_tokenizer.mask_token_id).nonzero(as_tuple=True)[0].item()
softmax_mlm = F.softmax(outputs_mlm.logits, dim=-1)

mask_token = softmax_mlm[0, mask_token_idx, :]
top_10 = torch.topk(mask_token, 10, dim=0)

for i, (token_id, prob) in enumerate(zip(top_10.indices, top_10.values)):
    token = bert_tokenizer.decode([token_id])
    print(f'{i:2}\t{token:25}', prob.item())
['what', "'", 's', 'your', 'favorite', 'ice', 'cream', '[MASK]', '?']
 0	flavor                    0.5929659008979797
 1	now                       0.014950926415622234
 2	line                      0.014521223492920399
 3	recipe                    0.013670633547008038
 4	color                     0.010578353889286518
 5	?                         0.00849001295864582
 6	thing                     0.00799252837896347
 7	please                    0.007873623631894588
 8	today                     0.007739454973489046
 9	number                    0.007451422978192568

RoBERTa - angielski

from transformers import RobertaForMaskedLM

# RoBERTa with its masked-language-model head (same checkpoint as the tokenizer above).
roberta_model = RobertaForMaskedLM.from_pretrained('roberta-base')
# RoBERTa fill-mask: "Hand me the <mask>!" (ground truth: "hammer")
inputs_mlm = roberta_tokenizer(f'Hand me the {roberta_tokenizer.mask_token}!', return_tensors='pt')
labels_mlm = roberta_tokenizer("Hand me the hammer!", return_tensors="pt")["input_ids"]
print(roberta_tokenizer.tokenize(f'Hand me the {roberta_tokenizer.mask_token}!'))

# Inference only — no gradients needed.
with torch.no_grad():
    outputs_mlm = roberta_model(**inputs_mlm, labels=labels_mlm)

# Locate <mask> dynamically instead of hard-coding its position.
mask_token_idx = (inputs_mlm["input_ids"][0] == roberta_tokenizer.mask_token_id).nonzero(as_tuple=True)[0].item()
softmax_mlm = F.softmax(outputs_mlm.logits, dim=-1)

mask_token = softmax_mlm[0, mask_token_idx, :]
top_10 = torch.topk(mask_token, 10, dim=0)

for i, (token_id, prob) in enumerate(zip(top_10.indices, top_10.values)):
    token = roberta_tokenizer.decode([token_id])
    print(f'{i:2}\t{token:25}', prob.item())
['Hand', 'Ġme', 'Ġthe', '<mask>', '!']
 0	 keys                     0.33524537086486816
 1	 phone                    0.05494626611471176
 2	 key                      0.02826027013361454
 3	 paper                    0.025939658284187317
 4	 papers                   0.01922498270869255
 5	 reins                    0.018558315932750702
 6	 cup                      0.016417579725384712
 7	 bag                      0.015210084617137909
 8	 coffee                   0.014366202056407928
 9	 gun                      0.013706102967262268

RoBERTa - polski

from transformers import AutoModelForMaskedLM

# Use AutoTokenizer so the tokenizer class stored in the checkpoint config is
# loaded. Forcing PreTrainedTokenizerFast triggered a class-mismatch warning
# ("The tokenizer class you load ... is 'RobertaTokenizer'") and risks
# unexpected tokenization.
polish_roberta_tokenizer = AutoTokenizer.from_pretrained('sdadas/polish-roberta-large-v1')
polish_roberta_model = AutoModelForMaskedLM.from_pretrained('sdadas/polish-roberta-large-v1')
The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'RobertaTokenizer'. 
The class this function is called from is 'PreTrainedTokenizerFast'.
# Polish fill-mask 1: "Bardzo lubię <mask>." (ground truth: "czytać")
inputs_mlm = polish_roberta_tokenizer(f'Bardzo lubię {polish_roberta_tokenizer.mask_token}.', return_tensors='pt')
labels_mlm = polish_roberta_tokenizer("Bardzo lubię czytać.", return_tensors="pt")["input_ids"]
print(polish_roberta_tokenizer.tokenize(f'Bardzo lubię {polish_roberta_tokenizer.mask_token}.'))

# Inference only — no gradients needed.
with torch.no_grad():
    outputs_mlm = polish_roberta_model(**inputs_mlm, labels=labels_mlm)

# Locate <mask> dynamically instead of hard-coding its position.
mask_token_idx = (inputs_mlm["input_ids"][0] == polish_roberta_tokenizer.mask_token_id).nonzero(as_tuple=True)[0].item()
softmax_mlm = F.softmax(outputs_mlm.logits, dim=-1)

mask_token = softmax_mlm[0, mask_token_idx, :]
top_10 = torch.topk(mask_token, 10, dim=0)

for i, (token_id, prob) in enumerate(zip(top_10.indices, top_10.values)):
    token = polish_roberta_tokenizer.decode([token_id])
    print(f'{i:2}\t{token:25}', prob.item())
['Bar', 'dzo', '▁lubię', ' <mask>', '.']
 0	czytać                    0.06616953760385513
 1	podróżować                0.04533696547150612
 2	gotować                   0.04076462611556053
 3	muzykę                    0.039369307458400726
 4	koty                      0.03558063879609108
 5	pisać                     0.03538721054792404
 6	książki                   0.033440858125686646
 7	śpiewać                   0.02773296646773815
 8	sport                     0.027220433577895164
 9	tańczyć                   0.026598699390888214
# Polish fill-mask 2: "Zajęcia na uczelni są <mask>." (ground truth: "ciekawe")
inputs_mlm = polish_roberta_tokenizer(f'Zajęcia na uczelni są {polish_roberta_tokenizer.mask_token}.', return_tensors='pt')
labels_mlm = polish_roberta_tokenizer("Zajęcia na uczelni są ciekawe.", return_tensors="pt")["input_ids"]
print(polish_roberta_tokenizer.tokenize(f'Zajęcia na uczelni są {polish_roberta_tokenizer.mask_token}.'))

# Inference only — no gradients needed.
with torch.no_grad():
    outputs_mlm = polish_roberta_model(**inputs_mlm, labels=labels_mlm)

# Locate <mask> dynamically instead of hard-coding its position.
mask_token_idx = (inputs_mlm["input_ids"][0] == polish_roberta_tokenizer.mask_token_id).nonzero(as_tuple=True)[0].item()
softmax_mlm = F.softmax(outputs_mlm.logits, dim=-1)

mask_token = softmax_mlm[0, mask_token_idx, :]
top_10 = torch.topk(mask_token, 10, dim=0)

for i, (token_id, prob) in enumerate(zip(top_10.indices, top_10.values)):
    token = polish_roberta_tokenizer.decode([token_id])
    print(f'{i:2}\t{token:25}', prob.item())
['Za', 'jęcia', '▁na', '▁uczelni', '▁są', ' <mask>', '.']
 0	bezpłatne                 0.9145433902740479
 1	obowiązkowe               0.014430041424930096
 2	prowadzone                0.010215427726507187
 3	zróżnicowane              0.008744887076318264
 4	różnorodne                0.00670977309346199
 5	następujące               0.004183280747383833
 6	otwarte                   0.002896391786634922
 7	intensywne                0.002672090893611312
 8	realizowane               0.0019869415555149317
 9	ok                        0.0018993624253198504
# Polish fill-mask 3: "Jutro na obiad będzie <mask>." (ground truth: "ryba")
inputs_mlm = polish_roberta_tokenizer(f'Jutro na obiad będzie {polish_roberta_tokenizer.mask_token}.', return_tensors='pt')
labels_mlm = polish_roberta_tokenizer("Jutro na obiad będzie ryba.", return_tensors="pt")["input_ids"]
print(polish_roberta_tokenizer.tokenize(f'Jutro na obiad będzie {polish_roberta_tokenizer.mask_token}.'))

# Inference only — no gradients needed.
with torch.no_grad():
    outputs_mlm = polish_roberta_model(**inputs_mlm, labels=labels_mlm)

# Locate <mask> dynamically instead of hard-coding its position.
mask_token_idx = (inputs_mlm["input_ids"][0] == polish_roberta_tokenizer.mask_token_id).nonzero(as_tuple=True)[0].item()
softmax_mlm = F.softmax(outputs_mlm.logits, dim=-1)

mask_token = softmax_mlm[0, mask_token_idx, :]
top_10 = torch.topk(mask_token, 10, dim=0)

for i, (token_id, prob) in enumerate(zip(top_10.indices, top_10.values)):
    token = polish_roberta_tokenizer.decode([token_id])
    print(f'{i:2}\t{token:25}', prob.item())
['Ju', 'tro', '▁na', '▁obiad', '▁będzie', ' <mask>', '.']
 0	ryba                      0.27743467688560486
 1	mięso                     0.1686241328716278
 2	ciasto                    0.024455789476633072
 3	ryż                       0.0164520051330328
 4	niedziela                 0.013327408581972122
 5	masło                     0.01118378434330225
 6	obiad                     0.010521633550524712
 7	chleb                     0.00991259329020977
 8	czwartek                  0.009901482611894608
 9	wino                      0.008945722132921219

Zadanie 6

Spróbuj porównać czy jedno zdanie następuje po drugim.

from transformers import BertTokenizer, BertForNextSentencePrediction
import torch

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
model = BertForNextSentencePrediction.from_pretrained("bert-base-uncased")

prompt = "In Italy, pizza served in formal settings, such as at a restaurant, is presented unsliced."
next_sentence = "In other cases pizza may be sliced."
encoding = tokenizer(prompt, next_sentence, return_tensors="pt")

# NSP labels: 0 = sentence B is a continuation of A, 1 = sentence B is random.
# This pair IS consecutive, so the correct label is 0 — the original passed 1,
# which made the computed loss meaningless (the logits are unaffected).
with torch.no_grad():
    outputs = model(**encoding, labels=torch.LongTensor([0]))
logits = outputs.logits

# logits[0, 0] scores "is next", logits[0, 1] scores "random".
sentenceWasRandom = logits[0, 0] < logits[0, 1]
print("Kolejne zdanie jest losowe: " + str(sentenceWasRandom.item()))
Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForNextSentencePrediction: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertForNextSentencePrediction from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForNextSentencePrediction from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Kolejne zdanie jest losowe: False