278 KiB
278 KiB
Instalacja pakietów
!pip install transformers datasets torch
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/ Collecting transformers Downloading transformers-4.26.1-py3-none-any.whl (6.3 MB) [2K [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.3/6.3 MB[0m [31m64.5 MB/s[0m eta [36m0:00:00[0m [?25hCollecting datasets Downloading datasets-2.9.0-py3-none-any.whl (462 kB) [2K [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m462.8/462.8 KB[0m [31m41.5 MB/s[0m eta [36m0:00:00[0m [?25hRequirement already satisfied: torch in /usr/local/lib/python3.8/dist-packages (1.13.1+cu116) Requirement already satisfied: tqdm>=4.27 in /usr/local/lib/python3.8/dist-packages (from transformers) (4.64.1) Requirement already satisfied: numpy>=1.17 in /usr/local/lib/python3.8/dist-packages (from transformers) (1.21.6) Requirement already satisfied: regex!=2019.12.17 in /usr/local/lib/python3.8/dist-packages (from transformers) (2022.6.2) Requirement already satisfied: requests in /usr/local/lib/python3.8/dist-packages (from transformers) (2.25.1) Requirement already satisfied: pyyaml>=5.1 in /usr/local/lib/python3.8/dist-packages (from transformers) (6.0) Collecting huggingface-hub<1.0,>=0.11.0 Downloading huggingface_hub-0.12.1-py3-none-any.whl (190 kB) [2K [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m190.3/190.3 KB[0m [31m24.0 MB/s[0m eta [36m0:00:00[0m [?25hRequirement already satisfied: filelock in /usr/local/lib/python3.8/dist-packages (from transformers) (3.9.0) Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 Downloading tokenizers-0.13.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB) [2K [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[0m [31m101.8 MB/s[0m eta [36m0:00:00[0m [?25hRequirement already satisfied: packaging>=20.0 in /usr/local/lib/python3.8/dist-packages (from transformers) (23.0) Requirement already satisfied: dill<0.3.7 in /usr/local/lib/python3.8/dist-packages (from datasets) (0.3.6) Requirement already satisfied: pyarrow>=6.0.0 in /usr/local/lib/python3.8/dist-packages (from datasets) (9.0.0) Requirement already satisfied: aiohttp in /usr/local/lib/python3.8/dist-packages (from datasets) (3.8.4) Collecting responses<0.19 Downloading responses-0.18.0-py3-none-any.whl (38 kB) Collecting xxhash Downloading xxhash-3.2.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (213 kB) [2K [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m213.0/213.0 KB[0m [31m26.0 MB/s[0m eta [36m0:00:00[0m [?25hCollecting multiprocess Downloading multiprocess-0.70.14-py38-none-any.whl (132 kB) [2K [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m132.0/132.0 KB[0m [31m16.7 MB/s[0m eta [36m0:00:00[0m [?25hRequirement already satisfied: fsspec[http]>=2021.11.1 in /usr/local/lib/python3.8/dist-packages (from datasets) (2023.1.0) Requirement already satisfied: pandas in /usr/local/lib/python3.8/dist-packages (from datasets) (1.3.5) Requirement already satisfied: typing-extensions in /usr/local/lib/python3.8/dist-packages (from torch) (4.5.0) Requirement already satisfied: aiosignal>=1.1.2 in /usr/local/lib/python3.8/dist-packages (from aiohttp->datasets) (1.3.1) Requirement already satisfied: yarl<2.0,>=1.0 in /usr/local/lib/python3.8/dist-packages (from aiohttp->datasets) (1.8.2) Requirement already satisfied: frozenlist>=1.1.1 in /usr/local/lib/python3.8/dist-packages (from aiohttp->datasets) (1.3.3) Requirement already satisfied: multidict<7.0,>=4.5 in /usr/local/lib/python3.8/dist-packages (from aiohttp->datasets) (6.0.4) Requirement already satisfied: async-timeout<5.0,>=4.0.0a3 in /usr/local/lib/python3.8/dist-packages (from aiohttp->datasets) (4.0.2) Requirement already satisfied: attrs>=17.3.0 in /usr/local/lib/python3.8/dist-packages (from aiohttp->datasets) (22.2.0) Requirement already satisfied: charset-normalizer<4.0,>=2.0 in /usr/local/lib/python3.8/dist-packages (from aiohttp->datasets) (3.0.1) Requirement already satisfied: chardet<5,>=3.0.2 in /usr/local/lib/python3.8/dist-packages (from requests->transformers) (4.0.0) Requirement already satisfied: urllib3<1.27,>=1.21.1 in /usr/local/lib/python3.8/dist-packages (from requests->transformers) (1.24.3) Requirement already satisfied: idna<3,>=2.5 in /usr/local/lib/python3.8/dist-packages (from requests->transformers) (2.10) Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.8/dist-packages (from requests->transformers) (2022.12.7) Collecting urllib3<1.27,>=1.21.1 Downloading urllib3-1.26.14-py2.py3-none-any.whl (140 kB) [2K [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m140.6/140.6 KB[0m [31m17.8 MB/s[0m eta [36m0:00:00[0m [?25hRequirement already satisfied: python-dateutil>=2.7.3 in /usr/local/lib/python3.8/dist-packages (from pandas->datasets) (2.8.2) Requirement already satisfied: pytz>=2017.3 in /usr/local/lib/python3.8/dist-packages (from pandas->datasets) (2022.7.1) Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.8/dist-packages (from python-dateutil>=2.7.3->pandas->datasets) (1.15.0) Installing collected packages: tokenizers, xxhash, urllib3, multiprocess, responses, huggingface-hub, transformers, datasets Attempting uninstall: urllib3 Found existing installation: urllib3 1.24.3 Uninstalling urllib3-1.24.3: Successfully uninstalled urllib3-1.24.3 Successfully installed datasets-2.9.0 huggingface-hub-0.12.1 multiprocess-0.70.14 responses-0.18.0 tokenizers-0.13.2 transformers-4.26.1 urllib3-1.26.14 xxhash-3.2.0
Załadowanie pakietów
from datasets import load_dataset
from transformers import BertTokenizer
import torch
from torch.utils.data import TensorDataset, random_split
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
from transformers import get_linear_schedule_with_warmup
import numpy as np
import time
import datetime
import random
Załadowanie datasetu
sms_spam
dataset = load_dataset("sms_spam")
Downloading builder script: 0%| | 0.00/3.21k [00:00<?, ?B/s]
Downloading metadata: 0%| | 0.00/1.69k [00:00<?, ?B/s]
Downloading readme: 0%| | 0.00/4.87k [00:00<?, ?B/s]
Downloading and preparing dataset sms_spam/plain_text to /root/.cache/huggingface/datasets/sms_spam/plain_text/1.0.0/53f051d3b5f62d99d61792c91acefe4f1577ad3e4c216fb0ad39e30b9f20019c...
Downloading data: 0%| | 0.00/203k [00:00<?, ?B/s]
Generating train split: 0%| | 0/5574 [00:00<?, ? examples/s]
Dataset sms_spam downloaded and prepared to /root/.cache/huggingface/datasets/sms_spam/plain_text/1.0.0/53f051d3b5f62d99d61792c91acefe4f1577ad3e4c216fb0ad39e30b9f20019c. Subsequent calls will reuse this data.
0%| | 0/1 [00:00<?, ?it/s]
dataset['train'][0]
{'sms': 'Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...\n', 'label': 0}
Tokenizer BERT
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
Downloading (…)solve/main/vocab.txt: 0%| | 0.00/232k [00:00<?, ?B/s]
Downloading (…)okenizer_config.json: 0%| | 0.00/28.0 [00:00<?, ?B/s]
Downloading (…)lve/main/config.json: 0%| | 0.00/570 [00:00<?, ?B/s]
sms = dataset['train'][0]['sms']
print('Original: ', sms)
print('Tokenized: ', tokenizer.tokenize(sms))
print('Token IDs: ', tokenizer.convert_tokens_to_ids(tokenizer.tokenize(sms)))
Original: Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat... Tokenized: ['go', 'until', 'ju', '##rong', 'point', ',', 'crazy', '.', '.', 'available', 'only', 'in', 'bug', '##is', 'n', 'great', 'world', 'la', 'e', 'buffet', '.', '.', '.', 'ci', '##ne', 'there', 'got', 'amore', 'wat', '.', '.', '.'] Token IDs: [2175, 2127, 18414, 17583, 2391, 1010, 4689, 1012, 1012, 2800, 2069, 1999, 11829, 2483, 1050, 2307, 2088, 2474, 1041, 28305, 1012, 1012, 1012, 25022, 2638, 2045, 2288, 26297, 28194, 1012, 1012, 1012]
Check maximum length of a sentence
max_len = 0
for sentence in dataset['train']:
input_ids = tokenizer.encode(sentence['sms'], add_special_tokens=True)
max_len = max(max_len, len(input_ids))
print('Max sentence length: ', max_len)
Max sentence length: 238
Special tokenization
input_ids = []
attention_masks = []
for sentence in dataset['train']:
encoded_dict = tokenizer.encode_plus(
sentence['sms'],
add_special_tokens = True,
max_length = 240,
padding = 'max_length',
truncation=True,
return_attention_mask = True,
return_tensors = 'pt',
)
input_ids.append(encoded_dict['input_ids'])
attention_masks.append(encoded_dict['attention_mask'])
input_ids = torch.cat(input_ids, dim=0)
attention_masks = torch.cat(attention_masks, dim=0)
labels = torch.tensor([sentence['label'] for sentence in dataset['train']])
print('Original: ', dataset['train'][0])
print('Token IDs:', input_ids[0])
Original: {'sms': 'Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...\n', 'label': 0} Token IDs: tensor([ 101, 2175, 2127, 18414, 17583, 2391, 1010, 4689, 1012, 1012, 2800, 2069, 1999, 11829, 2483, 1050, 2307, 2088, 2474, 1041, 28305, 1012, 1012, 1012, 25022, 2638, 2045, 2288, 26297, 28194, 1012, 1012, 1012, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])
Split dataset
Class balance ratio should be similar to base dataset ratio.
def check_class_balance(dataset):
spam_count = 0.0
not_spam_count = 0.0
for row in dataset:
if row[2].item() == 1:
spam_count += 1.0
else:
not_spam_count += 1.0
return spam_count / not_spam_count
dataset = TensorDataset(input_ids, attention_masks, labels)
print("Spam to not spam messages ratio: {}\n".format(check_class_balance(dataset)))
test_size = 1000
dataset_len = len(dataset)
train_size = int(0.9 * (dataset_len-test_size))
val_size = (dataset_len-test_size) - train_size
test_dataset, train_dataset, val_dataset = random_split(dataset, [test_size, train_size, val_size])
print('{:>5,} test samples'.format(test_size))
print("Ratio: {}\n".format(check_class_balance(test_dataset)))
print('{:>5,} training samples'.format(train_size))
print("Ratio: {}\n".format(check_class_balance(train_dataset)))
print('{:>5,} validation samples'.format(val_size))
print("Ratio: {}\n".format(check_class_balance(val_dataset)))
Spam to not spam messages ratio: 0.15475450590428838 1,000 test samples Ratio: 0.1792452830188679 4,116 training samples Ratio: 0.15100671140939598 458 validation samples Ratio: 0.13647642679900746
Create train and validation loaders
batch_size = 32
train_dataloader = DataLoader(
train_dataset,
sampler = RandomSampler(train_dataset),
batch_size = batch_size
)
validation_dataloader = DataLoader(
val_dataset,
sampler = SequentialSampler(val_dataset),
batch_size = batch_size
)
Device check
if torch.cuda.is_available():
device = torch.device("cuda")
print('There are %d GPU(s) available.' % torch.cuda.device_count())
print('We will use the GPU:', torch.cuda.get_device_name(0))
else:
print('No GPU available, using the CPU instead.')
device = torch.device("cpu")
There are 1 GPU(s) available. We will use the GPU: Tesla T4
Custom BERT for classification
Compared to default BertForSequenceClassification - additional linear layer. https://github.com/huggingface/transformers/blob/bd469c40659ce76c81f69c7726759d249b4aef49/src/transformers/models/bert/modeling_bert.py#L1506
from transformers import BertForSequenceClassification, BertConfig, BertModel
from torch import nn
# in original model - no custom head just linear layer
# self.classifier = nn.Linear(config.hidden_size, config.num_labels)
class BertClassificationHeadCustom(nn.Module):
def __init__(self, config):
super().__init__()
self.dense_1 = nn.Linear(config.hidden_size, config.hidden_size)
self.dense_2 = nn.Linear(config.hidden_size, config.num_labels)
def forward(self, x):
x = self.dense_1(x)
x = torch.relu(x)
x = self.dense_2(x)
x = torch.relu(x)
return x
class BertForSequenceClassificationCustom(BertForSequenceClassification):
def __init__(self, config):
super().__init__(config)
self.num_labels = config.num_labels
self.config = config
self.bert = BertModel(config)
classifier_dropout = (
config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob
)
self.dropout = nn.Dropout(classifier_dropout)
self.classifier = BertClassificationHeadCustom(config)
# Initialize weights and apply final processing
self.post_init()
model = BertForSequenceClassificationCustom.from_pretrained(
"bert-base-uncased",
num_labels = 2,
output_attentions = False,
output_hidden_states = False,
)
model.cuda()
Downloading (…)"pytorch_model.bin";: 0%| | 0.00/440M [00:00<?, ?B/s]
Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassificationCustom: ['cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight'] - This IS expected if you are initializing BertForSequenceClassificationCustom from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model). - This IS NOT expected if you are initializing BertForSequenceClassificationCustom from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model). Some weights of BertForSequenceClassificationCustom were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.dense_1.weight', 'classifier.dense_2.weight', 'classifier.dense_2.bias', 'classifier.dense_1.bias'] You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
BertForSequenceClassificationCustom( (bert): BertModel( (embeddings): BertEmbeddings( (word_embeddings): Embedding(30522, 768, padding_idx=0) (position_embeddings): Embedding(512, 768) (token_type_embeddings): Embedding(2, 768) (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) (encoder): BertEncoder( (layer): ModuleList( (0): BertLayer( (attention): BertAttention( (self): BertSelfAttention( (query): Linear(in_features=768, out_features=768, bias=True) (key): Linear(in_features=768, out_features=768, bias=True) (value): Linear(in_features=768, out_features=768, bias=True) (dropout): Dropout(p=0.1, inplace=False) ) (output): BertSelfOutput( (dense): Linear(in_features=768, out_features=768, bias=True) (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (intermediate): BertIntermediate( (dense): Linear(in_features=768, out_features=3072, bias=True) (intermediate_act_fn): GELUActivation() ) (output): BertOutput( (dense): Linear(in_features=3072, out_features=768, bias=True) (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (1): BertLayer( (attention): BertAttention( (self): BertSelfAttention( (query): Linear(in_features=768, out_features=768, bias=True) (key): Linear(in_features=768, out_features=768, bias=True) (value): Linear(in_features=768, out_features=768, bias=True) (dropout): Dropout(p=0.1, inplace=False) ) (output): BertSelfOutput( (dense): Linear(in_features=768, out_features=768, bias=True) (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (intermediate): BertIntermediate( (dense): Linear(in_features=768, out_features=3072, bias=True) (intermediate_act_fn): GELUActivation() ) (output): BertOutput( (dense): Linear(in_features=3072, out_features=768, bias=True) (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (2): BertLayer( (attention): BertAttention( (self): BertSelfAttention( (query): Linear(in_features=768, out_features=768, bias=True) (key): Linear(in_features=768, out_features=768, bias=True) (value): Linear(in_features=768, out_features=768, bias=True) (dropout): Dropout(p=0.1, inplace=False) ) (output): BertSelfOutput( (dense): Linear(in_features=768, out_features=768, bias=True) (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (intermediate): BertIntermediate( (dense): Linear(in_features=768, out_features=3072, bias=True) (intermediate_act_fn): GELUActivation() ) (output): BertOutput( (dense): Linear(in_features=3072, out_features=768, bias=True) (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (3): BertLayer( (attention): BertAttention( (self): BertSelfAttention( (query): Linear(in_features=768, out_features=768, bias=True) (key): Linear(in_features=768, out_features=768, bias=True) (value): Linear(in_features=768, out_features=768, bias=True) (dropout): Dropout(p=0.1, inplace=False) ) (output): BertSelfOutput( (dense): Linear(in_features=768, out_features=768, bias=True) (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (intermediate): BertIntermediate( (dense): Linear(in_features=768, out_features=3072, bias=True) (intermediate_act_fn): GELUActivation() ) (output): BertOutput( (dense): Linear(in_features=3072, out_features=768, bias=True) (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (4): BertLayer( (attention): BertAttention( (self): BertSelfAttention( (query): Linear(in_features=768, out_features=768, bias=True) (key): Linear(in_features=768, out_features=768, bias=True) (value): Linear(in_features=768, out_features=768, bias=True) (dropout): Dropout(p=0.1, inplace=False) ) (output): BertSelfOutput( (dense): Linear(in_features=768, out_features=768, bias=True) (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (intermediate): BertIntermediate( (dense): Linear(in_features=768, out_features=3072, bias=True) (intermediate_act_fn): GELUActivation() ) (output): BertOutput( (dense): Linear(in_features=3072, out_features=768, bias=True) (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (5): BertLayer( (attention): BertAttention( (self): BertSelfAttention( (query): Linear(in_features=768, out_features=768, bias=True) (key): Linear(in_features=768, out_features=768, bias=True) (value): Linear(in_features=768, out_features=768, bias=True) (dropout): Dropout(p=0.1, inplace=False) ) (output): BertSelfOutput( (dense): Linear(in_features=768, out_features=768, bias=True) (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (intermediate): BertIntermediate( (dense): Linear(in_features=768, out_features=3072, bias=True) (intermediate_act_fn): GELUActivation() ) (output): BertOutput( (dense): Linear(in_features=3072, out_features=768, bias=True) (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (6): BertLayer( (attention): BertAttention( (self): BertSelfAttention( (query): Linear(in_features=768, out_features=768, bias=True) (key): Linear(in_features=768, out_features=768, bias=True) (value): Linear(in_features=768, out_features=768, bias=True) (dropout): Dropout(p=0.1, inplace=False) ) (output): BertSelfOutput( (dense): Linear(in_features=768, out_features=768, bias=True) (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (intermediate): BertIntermediate( (dense): Linear(in_features=768, out_features=3072, bias=True) (intermediate_act_fn): GELUActivation() ) (output): BertOutput( (dense): Linear(in_features=3072, out_features=768, bias=True) (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (7): BertLayer( (attention): BertAttention( (self): BertSelfAttention( (query): Linear(in_features=768, out_features=768, bias=True) (key): Linear(in_features=768, out_features=768, bias=True) (value): Linear(in_features=768, out_features=768, bias=True) (dropout): Dropout(p=0.1, inplace=False) ) (output): BertSelfOutput( (dense): Linear(in_features=768, out_features=768, bias=True) (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (intermediate): BertIntermediate( (dense): Linear(in_features=768, out_features=3072, bias=True) (intermediate_act_fn): GELUActivation() ) (output): BertOutput( (dense): Linear(in_features=3072, out_features=768, bias=True) (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (8): BertLayer( (attention): BertAttention( (self): BertSelfAttention( (query): Linear(in_features=768, out_features=768, bias=True) (key): Linear(in_features=768, out_features=768, bias=True) (value): Linear(in_features=768, out_features=768, bias=True) (dropout): Dropout(p=0.1, inplace=False) ) (output): BertSelfOutput( (dense): Linear(in_features=768, out_features=768, bias=True) (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (intermediate): BertIntermediate( (dense): Linear(in_features=768, out_features=3072, bias=True) (intermediate_act_fn): GELUActivation() ) (output): BertOutput( (dense): Linear(in_features=3072, out_features=768, bias=True) (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (9): BertLayer( (attention): BertAttention( (self): BertSelfAttention( (query): Linear(in_features=768, out_features=768, bias=True) (key): Linear(in_features=768, out_features=768, bias=True) (value): Linear(in_features=768, out_features=768, bias=True) (dropout): Dropout(p=0.1, inplace=False) ) (output): BertSelfOutput( (dense): Linear(in_features=768, out_features=768, bias=True) (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (intermediate): BertIntermediate( (dense): Linear(in_features=768, out_features=3072, bias=True) (intermediate_act_fn): GELUActivation() ) (output): BertOutput( (dense): Linear(in_features=3072, out_features=768, bias=True) (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (10): BertLayer( (attention): BertAttention( (self): BertSelfAttention( (query): Linear(in_features=768, out_features=768, bias=True) (key): Linear(in_features=768, out_features=768, bias=True) (value): Linear(in_features=768, out_features=768, bias=True) (dropout): Dropout(p=0.1, inplace=False) ) (output): BertSelfOutput( (dense): Linear(in_features=768, out_features=768, bias=True) (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (intermediate): BertIntermediate( (dense): Linear(in_features=768, out_features=3072, bias=True) (intermediate_act_fn): GELUActivation() ) (output): BertOutput( (dense): Linear(in_features=3072, out_features=768, bias=True) (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (11): BertLayer( (attention): BertAttention( (self): BertSelfAttention( (query): Linear(in_features=768, out_features=768, bias=True) (key): Linear(in_features=768, out_features=768, bias=True) (value): Linear(in_features=768, out_features=768, bias=True) (dropout): Dropout(p=0.1, inplace=False) ) (output): BertSelfOutput( (dense): Linear(in_features=768, out_features=768, bias=True) (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (intermediate): BertIntermediate( (dense): Linear(in_features=768, out_features=3072, bias=True) (intermediate_act_fn): GELUActivation() ) (output): BertOutput( (dense): Linear(in_features=3072, out_features=768, bias=True) (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) ) ) (pooler): BertPooler( (dense): Linear(in_features=768, out_features=768, bias=True) (activation): Tanh() ) ) (dropout): Dropout(p=0.1, inplace=False) (classifier): BertClassificationHeadCustom( (dense_1): Linear(in_features=768, out_features=768, bias=True) (dense_2): Linear(in_features=768, out_features=2, bias=True) ) )
Init training parameters
optimizer = torch.optim.AdamW(model.parameters(),
lr = 2e-5,
eps = 1e-8
)
epochs = 4
total_steps = len(train_dataloader) * epochs
scheduler = get_linear_schedule_with_warmup(optimizer,
num_warmup_steps = 0,
num_training_steps = total_steps)
Helper functions
def flat_accuracy(preds, labels):
pred_flat = np.argmax(preds, axis=1).flatten()
labels_flat = labels.flatten()
return np.sum(pred_flat == labels_flat) / len(labels_flat)
def format_time(elapsed):
'''
Takes a time in seconds and returns a string hh:mm:ss
'''
elapsed_rounded = int(round((elapsed)))
return str(datetime.timedelta(seconds=elapsed_rounded))
Training
# This training code is based on the `run_glue.py` script here:
# https://github.com/huggingface/transformers/blob/5bfcd0485ece086ebcbed2d008813037968a9e58/examples/run_glue.py#L128
seed_val = 42
random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)
training_stats = []
total_t0 = time.time()
for epoch_i in range(0, epochs):
# ========================================
# Training
# ========================================
print("")
print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
print('Training...')
t0 = time.time()
total_train_loss = 0
model.train()
for step, batch in enumerate(train_dataloader):
if step % 40 == 0 and not step == 0:
elapsed = format_time(time.time() - t0)
print(' Batch {:>5,} of {:>5,}. Elapsed: {:}.'.format(step, len(train_dataloader), elapsed))
b_input_ids = batch[0].to(device)
b_input_mask = batch[1].to(device)
b_labels = batch[2].to(device)
model.zero_grad()
outputs = model(b_input_ids,
token_type_ids=None,
attention_mask=b_input_mask,
labels=b_labels)
loss = outputs['loss']
total_train_loss += loss.item()
loss.backward()
torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
optimizer.step()
scheduler.step()
avg_train_loss = total_train_loss / len(train_dataloader)
training_time = format_time(time.time() - t0)
print("")
print(" Average training loss: {0:.2f}".format(avg_train_loss))
print(" Training epcoh took: {:}".format(training_time))
# ========================================
# Validation
# ========================================
print("")
print("Running Validation...")
t0 = time.time()
model.eval()
total_eval_accuracy = 0
total_eval_loss = 0
nb_eval_steps = 0
for batch in validation_dataloader:
b_input_ids = batch[0].to(device)
b_input_mask = batch[1].to(device)
b_labels = batch[2].to(device)
with torch.no_grad():
outputs = model(b_input_ids,
token_type_ids=None,
attention_mask=b_input_mask,
labels=b_labels)
loss = outputs['loss']
logits = outputs['logits']
total_eval_loss += loss.item()
logits = logits.detach().cpu().numpy()
label_ids = b_labels.to('cpu').numpy()
total_eval_accuracy += flat_accuracy(logits, label_ids)
avg_val_accuracy = total_eval_accuracy / len(validation_dataloader)
print(" Accuracy: {0:.2f}".format(avg_val_accuracy))
avg_val_loss = total_eval_loss / len(validation_dataloader)
validation_time = format_time(time.time() - t0)
print(" Validation Loss: {0:.2f}".format(avg_val_loss))
print(" Validation took: {:}".format(validation_time))
training_stats.append(
{
'epoch': epoch_i + 1,
'Training Loss': avg_train_loss,
'Valid. Loss': avg_val_loss,
'Valid. Accur.': avg_val_accuracy,
'Training Time': training_time,
'Validation Time': validation_time
}
)
print("")
print("Training complete!")
print("Total training took {:} (h:mm:ss)".format(format_time(time.time()-total_t0)))
======== Epoch 1 / 4 ======== Training... Batch 40 of 129. Elapsed: 0:00:48. Batch 80 of 129. Elapsed: 0:01:35. Batch 120 of 129. Elapsed: 0:02:23. Average training loss: 0.13 Training epcoh took: 0:02:34 Running Validation... Accuracy: 0.98 Validation Loss: 0.08 Validation took: 0:00:06 ======== Epoch 2 / 4 ======== Training... Batch 40 of 129. Elapsed: 0:00:49. Batch 80 of 129. Elapsed: 0:01:38. Batch 120 of 129. Elapsed: 0:02:27. Average training loss: 0.03 Training epcoh took: 0:02:38 Running Validation... Accuracy: 0.99 Validation Loss: 0.06 Validation took: 0:00:06 ======== Epoch 3 / 4 ======== Training... Batch 40 of 129. Elapsed: 0:00:49. Batch 80 of 129. Elapsed: 0:01:37. Batch 120 of 129. Elapsed: 0:02:26. Average training loss: 0.01 Training epcoh took: 0:02:37 Running Validation... Accuracy: 0.99 Validation Loss: 0.06 Validation took: 0:00:06 ======== Epoch 4 / 4 ======== Training... Batch 40 of 129. Elapsed: 0:00:49. Batch 80 of 129. Elapsed: 0:01:37. Batch 120 of 129. Elapsed: 0:02:26. Average training loss: 0.01 Training epcoh took: 0:02:36 Running Validation... Accuracy: 0.99 Validation Loss: 0.06 Validation took: 0:00:06 Training complete! Total training took 0:10:50 (h:mm:ss)
Train summary
import pandas as pd
pd.set_option('precision', 2)
df_stats = pd.DataFrame(data=training_stats)
df_stats = df_stats.set_index('epoch')
df_stats
Training Loss | Valid. Loss | Valid. Accur. | Training Time | Validation Time | |
---|---|---|---|---|---|
epoch | |||||
1 | 1.26e-01 | 0.08 | 0.98 | 0:02:34 | 0:00:06 |
2 | 2.58e-02 | 0.06 | 0.99 | 0:02:38 | 0:00:06 |
3 | 1.26e-02 | 0.06 | 0.99 | 0:02:37 | 0:00:06 |
4 | 9.73e-03 | 0.06 | 0.99 | 0:02:36 | 0:00:06 |
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
sns.set(style='darkgrid')
sns.set(font_scale=1.5)
plt.rcParams["figure.figsize"] = (12,6)
plt.plot(df_stats['Training Loss'], 'b-o', label="Training")
plt.plot(df_stats['Valid. Loss'], 'g-o', label="Validation")
plt.title("Training & Validation Loss")
plt.xlabel("Epoch")
plt.ylabel("Loss")
plt.legend()
plt.xticks([1, 2, 3, 4])
plt.show()
Create test loader
prediction_dataloader = DataLoader(
test_dataset,
sampler = SequentialSampler(test_dataset),
batch_size = batch_size
)
Evaluate on test dataset
print('Predicting labels for {:,} test sentences...'.format(len(test_dataset)))
model.eval()
predictions , true_labels = [], []
for batch in prediction_dataloader:
batch = tuple(t.to(device) for t in batch)
b_input_ids, b_input_mask, b_labels = batch
with torch.no_grad():
outputs = model(b_input_ids, token_type_ids=None,
attention_mask=b_input_mask)
logits = outputs['logits']
logits = logits.detach().cpu().numpy()
label_ids = b_labels.to('cpu').numpy()
predictions.append(logits)
true_labels.append(label_ids)
print(' DONE.')
Predicting labels for 1,000 test sentences... DONE.
results_ok = 0
results_false = 0
for idx, true_labels_batch in enumerate(true_labels):
predictions_i = np.argmax(predictions[idx], axis=1).flatten()
for bidx, true_label in enumerate(true_labels_batch):
if true_label == predictions_i[bidx]:
results_ok += 1
else:
results_false += 1
print("Correct predictions: {}, incorrect results: {}, accuracy: {}".format(results_ok, results_false, float(results_ok) / (results_ok + results_false)))
Correct predictions: 995, incorrect results: 5, accuracy: 0.995
MCC Score
from sklearn.metrics import matthews_corrcoef
matthews_set = []
print('Calculating Matthews Corr. Coef. for each batch...')
for i in range(len(true_labels)):
pred_labels_i = np.argmax(predictions[i], axis=1).flatten()
matthews = matthews_corrcoef(true_labels[i], pred_labels_i)
matthews_set.append(matthews)
Calculating Matthews Corr. Coef. for each batch...
ax = sns.barplot(x=list(range(len(matthews_set))), y=matthews_set, ci=None)
plt.title('MCC Score per Batch')
plt.ylabel('MCC Score (-1 to +1)')
plt.xlabel('Batch #')
plt.show()
flat_predictions = np.concatenate(predictions, axis=0)
flat_predictions = np.argmax(flat_predictions, axis=1).flatten()
flat_true_labels = np.concatenate(true_labels, axis=0)
mcc = matthews_corrcoef(flat_true_labels, flat_predictions)
print('Total MCC: %.3f' % mcc)
Total MCC: 0.981
Save model
from google.colab import drive
drive.mount('/content/gdrive/', force_remount=True)
output_dir = '/content/gdrive/My Drive/UAM/Przetwarzanie-tekstu/BERT_custom_model'
print("Saving model to %s" % output_dir)
model_to_save = model.module if hasattr(model, 'module') else model
model_to_save.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)
Mounted at /content/gdrive/ Saving model to /content/gdrive/My Drive/UAM/Przetwarzanie-tekstu/BERT_custom_model
('/content/gdrive/My Drive/UAM/Przetwarzanie-tekstu/BERT_custom_model/tokenizer_config.json', '/content/gdrive/My Drive/UAM/Przetwarzanie-tekstu/BERT_custom_model/special_tokens_map.json', '/content/gdrive/My Drive/UAM/Przetwarzanie-tekstu/BERT_custom_model/vocab.txt', '/content/gdrive/My Drive/UAM/Przetwarzanie-tekstu/BERT_custom_model/added_tokens.json')