288 KiB
288 KiB
Instalacja pakietów
!pip install transformers datasets torch
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/ Collecting transformers Downloading transformers-4.26.1-py3-none-any.whl (6.3 MB) [2K [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.3/6.3 MB[0m [31m71.0 MB/s[0m eta [36m0:00:00[0m [?25hCollecting datasets Downloading datasets-2.9.0-py3-none-any.whl (462 kB) [2K [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m462.8/462.8 KB[0m [31m18.0 MB/s[0m eta [36m0:00:00[0m [?25hRequirement already satisfied: torch in /usr/local/lib/python3.8/dist-packages (1.13.1+cu116) Requirement already satisfied: filelock in /usr/local/lib/python3.8/dist-packages (from transformers) (3.9.0) Requirement already satisfied: pyyaml>=5.1 in /usr/local/lib/python3.8/dist-packages (from transformers) (6.0) Requirement already satisfied: tqdm>=4.27 in /usr/local/lib/python3.8/dist-packages (from transformers) (4.64.1) Collecting huggingface-hub<1.0,>=0.11.0 Downloading huggingface_hub-0.12.0-py3-none-any.whl (190 kB) [2K [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m190.3/190.3 KB[0m [31m24.0 MB/s[0m eta [36m0:00:00[0m [?25hRequirement already satisfied: regex!=2019.12.17 in /usr/local/lib/python3.8/dist-packages (from transformers) (2022.6.2) Requirement already satisfied: requests in /usr/local/lib/python3.8/dist-packages (from transformers) (2.25.1) Requirement already satisfied: packaging>=20.0 in /usr/local/lib/python3.8/dist-packages (from transformers) (23.0) Requirement already satisfied: numpy>=1.17 in /usr/local/lib/python3.8/dist-packages (from transformers) (1.21.6) Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 Downloading tokenizers-0.13.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB) [2K [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[0m [31m105.8 MB/s[0m eta [36m0:00:00[0m [?25hRequirement already satisfied: dill<0.3.7 in /usr/local/lib/python3.8/dist-packages (from datasets) (0.3.6) Requirement already satisfied: pyarrow>=6.0.0 in /usr/local/lib/python3.8/dist-packages (from datasets) (9.0.0) Collecting multiprocess Downloading multiprocess-0.70.14-py38-none-any.whl (132 kB) [2K [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m132.0/132.0 KB[0m [31m13.8 MB/s[0m eta [36m0:00:00[0m [?25hCollecting xxhash Downloading xxhash-3.2.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (213 kB) [2K [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m213.0/213.0 KB[0m [31m24.3 MB/s[0m eta [36m0:00:00[0m [?25hRequirement already satisfied: fsspec[http]>=2021.11.1 in /usr/local/lib/python3.8/dist-packages (from datasets) (2023.1.0) Requirement already satisfied: aiohttp in /usr/local/lib/python3.8/dist-packages (from datasets) (3.8.3) Collecting responses<0.19 Downloading responses-0.18.0-py3-none-any.whl (38 kB) Requirement already satisfied: pandas in /usr/local/lib/python3.8/dist-packages (from datasets) (1.3.5) Requirement already satisfied: typing-extensions in /usr/local/lib/python3.8/dist-packages (from torch) (4.4.0) Requirement already satisfied: aiosignal>=1.1.2 in /usr/local/lib/python3.8/dist-packages (from aiohttp->datasets) (1.3.1) Requirement already satisfied: charset-normalizer<3.0,>=2.0 in /usr/local/lib/python3.8/dist-packages (from aiohttp->datasets) (2.1.1) Requirement already satisfied: attrs>=17.3.0 in /usr/local/lib/python3.8/dist-packages (from aiohttp->datasets) (22.2.0) Requirement already satisfied: yarl<2.0,>=1.0 in /usr/local/lib/python3.8/dist-packages (from aiohttp->datasets) (1.8.2) Requirement already satisfied: frozenlist>=1.1.1 in /usr/local/lib/python3.8/dist-packages (from aiohttp->datasets) (1.3.3) Requirement already satisfied: multidict<7.0,>=4.5 in /usr/local/lib/python3.8/dist-packages (from aiohttp->datasets) (6.0.4) Requirement already satisfied: async-timeout<5.0,>=4.0.0a3 in /usr/local/lib/python3.8/dist-packages (from aiohttp->datasets) (4.0.2) Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.8/dist-packages (from requests->transformers) (2022.12.7) Requirement already satisfied: chardet<5,>=3.0.2 in /usr/local/lib/python3.8/dist-packages (from requests->transformers) (4.0.0) Requirement already satisfied: urllib3<1.27,>=1.21.1 in /usr/local/lib/python3.8/dist-packages (from requests->transformers) (1.24.3) Requirement already satisfied: idna<3,>=2.5 in /usr/local/lib/python3.8/dist-packages (from requests->transformers) (2.10) Collecting urllib3<1.27,>=1.21.1 Downloading urllib3-1.26.14-py2.py3-none-any.whl (140 kB) [2K [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m140.6/140.6 KB[0m [31m13.6 MB/s[0m eta [36m0:00:00[0m [?25hRequirement already satisfied: pytz>=2017.3 in /usr/local/lib/python3.8/dist-packages (from pandas->datasets) (2022.7.1) Requirement already satisfied: python-dateutil>=2.7.3 in /usr/local/lib/python3.8/dist-packages (from pandas->datasets) (2.8.2) Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.8/dist-packages (from python-dateutil>=2.7.3->pandas->datasets) (1.15.0) Installing collected packages: tokenizers, xxhash, urllib3, multiprocess, responses, huggingface-hub, transformers, datasets Attempting uninstall: urllib3 Found existing installation: urllib3 1.24.3 Uninstalling urllib3-1.24.3: Successfully uninstalled urllib3-1.24.3 Successfully installed datasets-2.9.0 huggingface-hub-0.12.0 multiprocess-0.70.14 responses-0.18.0 tokenizers-0.13.2 transformers-4.26.1 urllib3-1.26.14 xxhash-3.2.0
Załadowanie pakietów
from datasets import load_dataset
from transformers import GPT2Tokenizer
import torch
from torch.utils.data import TensorDataset, random_split
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
from transformers import GPT2ForSequenceClassification, GPT2Config
from transformers import get_linear_schedule_with_warmup
import numpy as np
import time
import datetime
import random
Załadowanie datasetu
sms_spam
dataset = load_dataset("sms_spam")
Downloading builder script: 0%| | 0.00/3.21k [00:00<?, ?B/s]
Downloading metadata: 0%| | 0.00/1.69k [00:00<?, ?B/s]
Downloading readme: 0%| | 0.00/4.87k [00:00<?, ?B/s]
Downloading and preparing dataset sms_spam/plain_text to /root/.cache/huggingface/datasets/sms_spam/plain_text/1.0.0/53f051d3b5f62d99d61792c91acefe4f1577ad3e4c216fb0ad39e30b9f20019c...
Downloading data: 0%| | 0.00/203k [00:00<?, ?B/s]
Generating train split: 0%| | 0/5574 [00:00<?, ? examples/s]
Dataset sms_spam downloaded and prepared to /root/.cache/huggingface/datasets/sms_spam/plain_text/1.0.0/53f051d3b5f62d99d61792c91acefe4f1577ad3e4c216fb0ad39e30b9f20019c. Subsequent calls will reuse this data.
0%| | 0/1 [00:00<?, ?it/s]
dataset['train'][0]
{'sms': 'Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...\n', 'label': 0}
Tokenizer GPT2
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
tokenizer.pad_token = tokenizer.eos_token
Downloading (…)olve/main/vocab.json: 0%| | 0.00/1.04M [00:00<?, ?B/s]
Downloading (…)olve/main/merges.txt: 0%| | 0.00/456k [00:00<?, ?B/s]
Downloading (…)lve/main/config.json: 0%| | 0.00/665 [00:00<?, ?B/s]
sms = dataset['train'][0]['sms']
print('Original: ', sms)
print('Tokenized: ', tokenizer.tokenize(sms))
print('Token IDs: ', tokenizer.convert_tokens_to_ids(tokenizer.tokenize(sms)))
Original: Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat... Tokenized: ['Go', 'Ġuntil', 'Ġjur', 'ong', 'Ġpoint', ',', 'Ġcrazy', '..', 'ĠAvailable', 'Ġonly', 'Ġin', 'Ġbug', 'is', 'Ġn', 'Ġgreat', 'Ġworld', 'Ġla', 'Ġe', 'Ġbuffet', '...', 'ĠC', 'ine', 'Ġthere', 'Ġgot', 'Ġam', 'ore', 'Ġwat', '...', 'Ċ'] Token IDs: [5247, 1566, 8174, 506, 966, 11, 7165, 492, 14898, 691, 287, 5434, 271, 299, 1049, 995, 8591, 304, 44703, 986, 327, 500, 612, 1392, 716, 382, 4383, 986, 198]
Check maximum length of a sentence
max_len = 0
for sentence in dataset['train']:
input_ids = tokenizer.encode(sentence['sms'], add_special_tokens=True)
max_len = max(max_len, len(input_ids))
print('Max sentence length: ', max_len)
Max sentence length: 258
Special tokenization
input_ids = []
attention_masks = []
for sentence in dataset['train']:
encoded_dict = tokenizer.encode_plus(
sentence['sms'],
add_special_tokens = True,
max_length = 260,
padding = 'max_length',
truncation=True,
return_attention_mask = True,
return_tensors = 'pt',
)
input_ids.append(encoded_dict['input_ids'])
attention_masks.append(encoded_dict['attention_mask'])
input_ids = torch.cat(input_ids, dim=0)
attention_masks = torch.cat(attention_masks, dim=0)
labels = torch.tensor([sentence['label'] for sentence in dataset['train']])
print('Original: ', dataset['train'][0])
print('Token IDs:', input_ids[0])
Original: {'sms': 'Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...\n', 'label': 0} Token IDs: tensor([ 5247, 1566, 8174, 506, 966, 11, 7165, 492, 14898, 691, 287, 5434, 271, 299, 1049, 995, 8591, 304, 44703, 986, 327, 500, 612, 1392, 716, 382, 4383, 986, 198, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256])
Split dataset
dataset = TensorDataset(input_ids, attention_masks, labels)
test_size = 1000
dataset_len = len(dataset)
train_size = int(0.9 * (dataset_len-test_size))
val_size = (dataset_len-test_size) - train_size
test_dataset, train_dataset, val_dataset = random_split(dataset, [test_size, train_size, val_size])
print('{:>5,} test samples'.format(test_size))
print('{:>5,} training samples'.format(train_size))
print('{:>5,} validation samples'.format(val_size))
1,000 test samples 4,116 training samples 458 validation samples
Create train and validation loaders
batch_size = 8
train_dataloader = DataLoader(
train_dataset,
sampler = RandomSampler(train_dataset),
batch_size = batch_size
)
validation_dataloader = DataLoader(
val_dataset,
sampler = SequentialSampler(val_dataset),
batch_size = batch_size
)
Device check
if torch.cuda.is_available():
device = torch.device("cuda")
print('There are %d GPU(s) available.' % torch.cuda.device_count())
print('We will use the GPU:', torch.cuda.get_device_name(0))
else:
print('No GPU available, using the CPU instead.')
device = torch.device("cpu")
There are 1 GPU(s) available. We will use the GPU: Tesla T4
Load GPT2 model
model = GPT2ForSequenceClassification.from_pretrained(
'gpt2',
num_labels = 2,
)
Downloading (…)"pytorch_model.bin";: 0%| | 0.00/548M [00:00<?, ?B/s]
Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at gpt2 and are newly initialized: ['score.weight'] You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
model.resize_token_embeddings(len(tokenizer))
model.config.pad_token_id = model.config.eos_token_id
model.cuda()
GPT2ForSequenceClassification( (transformer): GPT2Model( (wte): Embedding(50257, 768) (wpe): Embedding(1024, 768) (drop): Dropout(p=0.1, inplace=False) (h): ModuleList( (0): GPT2Block( (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (attn): GPT2Attention( (c_attn): Conv1D() (c_proj): Conv1D() (attn_dropout): Dropout(p=0.1, inplace=False) (resid_dropout): Dropout(p=0.1, inplace=False) ) (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (mlp): GPT2MLP( (c_fc): Conv1D() (c_proj): Conv1D() (act): NewGELUActivation() (dropout): Dropout(p=0.1, inplace=False) ) ) (1): GPT2Block( (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (attn): GPT2Attention( (c_attn): Conv1D() (c_proj): Conv1D() (attn_dropout): Dropout(p=0.1, inplace=False) (resid_dropout): Dropout(p=0.1, inplace=False) ) (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (mlp): GPT2MLP( (c_fc): Conv1D() (c_proj): Conv1D() (act): NewGELUActivation() (dropout): Dropout(p=0.1, inplace=False) ) ) (2): GPT2Block( (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (attn): GPT2Attention( (c_attn): Conv1D() (c_proj): Conv1D() (attn_dropout): Dropout(p=0.1, inplace=False) (resid_dropout): Dropout(p=0.1, inplace=False) ) (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (mlp): GPT2MLP( (c_fc): Conv1D() (c_proj): Conv1D() (act): NewGELUActivation() (dropout): Dropout(p=0.1, inplace=False) ) ) (3): GPT2Block( (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (attn): GPT2Attention( (c_attn): Conv1D() (c_proj): Conv1D() (attn_dropout): Dropout(p=0.1, inplace=False) (resid_dropout): Dropout(p=0.1, inplace=False) ) (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (mlp): GPT2MLP( (c_fc): Conv1D() (c_proj): Conv1D() (act): NewGELUActivation() (dropout): Dropout(p=0.1, inplace=False) ) ) (4): GPT2Block( (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (attn): GPT2Attention( (c_attn): Conv1D() (c_proj): Conv1D() (attn_dropout): Dropout(p=0.1, inplace=False) (resid_dropout): Dropout(p=0.1, inplace=False) ) (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (mlp): GPT2MLP( (c_fc): Conv1D() (c_proj): Conv1D() (act): NewGELUActivation() (dropout): Dropout(p=0.1, inplace=False) ) ) (5): GPT2Block( (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (attn): GPT2Attention( (c_attn): Conv1D() (c_proj): Conv1D() (attn_dropout): Dropout(p=0.1, inplace=False) (resid_dropout): Dropout(p=0.1, inplace=False) ) (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (mlp): GPT2MLP( (c_fc): Conv1D() (c_proj): Conv1D() (act): NewGELUActivation() (dropout): Dropout(p=0.1, inplace=False) ) ) (6): GPT2Block( (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (attn): GPT2Attention( (c_attn): Conv1D() (c_proj): Conv1D() (attn_dropout): Dropout(p=0.1, inplace=False) (resid_dropout): Dropout(p=0.1, inplace=False) ) (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (mlp): GPT2MLP( (c_fc): Conv1D() (c_proj): Conv1D() (act): NewGELUActivation() (dropout): Dropout(p=0.1, inplace=False) ) ) (7): GPT2Block( (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (attn): GPT2Attention( (c_attn): Conv1D() (c_proj): Conv1D() (attn_dropout): Dropout(p=0.1, inplace=False) (resid_dropout): Dropout(p=0.1, inplace=False) ) (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (mlp): GPT2MLP( (c_fc): Conv1D() (c_proj): Conv1D() (act): NewGELUActivation() (dropout): Dropout(p=0.1, inplace=False) ) ) (8): GPT2Block( (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (attn): GPT2Attention( (c_attn): Conv1D() (c_proj): Conv1D() (attn_dropout): Dropout(p=0.1, inplace=False) (resid_dropout): Dropout(p=0.1, inplace=False) ) (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (mlp): GPT2MLP( (c_fc): Conv1D() (c_proj): Conv1D() (act): NewGELUActivation() (dropout): Dropout(p=0.1, inplace=False) ) ) (9): GPT2Block( (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (attn): GPT2Attention( (c_attn): Conv1D() (c_proj): Conv1D() (attn_dropout): Dropout(p=0.1, inplace=False) (resid_dropout): Dropout(p=0.1, inplace=False) ) (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (mlp): GPT2MLP( (c_fc): Conv1D() (c_proj): Conv1D() (act): NewGELUActivation() (dropout): Dropout(p=0.1, inplace=False) ) ) (10): GPT2Block( (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (attn): GPT2Attention( (c_attn): Conv1D() (c_proj): Conv1D() (attn_dropout): Dropout(p=0.1, inplace=False) (resid_dropout): Dropout(p=0.1, inplace=False) ) (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (mlp): GPT2MLP( (c_fc): Conv1D() (c_proj): Conv1D() (act): NewGELUActivation() (dropout): Dropout(p=0.1, inplace=False) ) ) (11): GPT2Block( (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (attn): GPT2Attention( (c_attn): Conv1D() (c_proj): Conv1D() (attn_dropout): Dropout(p=0.1, inplace=False) (resid_dropout): Dropout(p=0.1, inplace=False) ) (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (mlp): GPT2MLP( (c_fc): Conv1D() (c_proj): Conv1D() (act): NewGELUActivation() (dropout): Dropout(p=0.1, inplace=False) ) ) ) (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True) ) (score): Linear(in_features=768, out_features=2, bias=False) )
Init training parameters
optimizer = torch.optim.AdamW(model.parameters(),
lr = 2e-5,
eps = 1e-8
)
epochs = 4
total_steps = len(train_dataloader) * epochs
scheduler = get_linear_schedule_with_warmup(optimizer,
num_warmup_steps = 0,
num_training_steps = total_steps)
Helper functions
def flat_accuracy(preds, labels):
pred_flat = np.argmax(preds, axis=1).flatten()
labels_flat = labels.flatten()
return np.sum(pred_flat == labels_flat) / len(labels_flat)
def format_time(elapsed):
'''
Takes a time in seconds and returns a string hh:mm:ss
'''
elapsed_rounded = int(round((elapsed)))
return str(datetime.timedelta(seconds=elapsed_rounded))
Training
# This training code is based on the `run_glue.py` script here:
# https://github.com/huggingface/transformers/blob/5bfcd0485ece086ebcbed2d008813037968a9e58/examples/run_glue.py#L128
seed_val = 42
random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)
training_stats = []
total_t0 = time.time()
for epoch_i in range(0, epochs):
# ========================================
# Training
# ========================================
print("")
print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
print('Training...')
t0 = time.time()
total_train_loss = 0
model.train()
for step, batch in enumerate(train_dataloader):
if step % 40 == 0 and not step == 0:
elapsed = format_time(time.time() - t0)
print(' Batch {:>5,} of {:>5,}. Elapsed: {:}.'.format(step, len(train_dataloader), elapsed))
b_input_ids = batch[0].to(device)
b_input_mask = batch[1].to(device)
b_labels = batch[2].to(device)
model.zero_grad()
outputs = model(b_input_ids,
token_type_ids=None,
attention_mask=b_input_mask,
labels=b_labels)
loss = outputs['loss']
total_train_loss += loss.item()
loss.backward()
torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
optimizer.step()
scheduler.step()
avg_train_loss = total_train_loss / len(train_dataloader)
training_time = format_time(time.time() - t0)
print("")
print(" Average training loss: {0:.2f}".format(avg_train_loss))
print(" Training epcoh took: {:}".format(training_time))
# ========================================
# Validation
# ========================================
print("")
print("Running Validation...")
t0 = time.time()
model.eval()
total_eval_accuracy = 0
total_eval_loss = 0
nb_eval_steps = 0
for batch in validation_dataloader:
b_input_ids = batch[0].to(device)
b_input_mask = batch[1].to(device)
b_labels = batch[2].to(device)
with torch.no_grad():
outputs = model(b_input_ids,
token_type_ids=None,
attention_mask=b_input_mask,
labels=b_labels)
loss = outputs['loss']
logits = outputs['logits']
total_eval_loss += loss.item()
logits = logits.detach().cpu().numpy()
label_ids = b_labels.to('cpu').numpy()
total_eval_accuracy += flat_accuracy(logits, label_ids)
avg_val_accuracy = total_eval_accuracy / len(validation_dataloader)
print(" Accuracy: {0:.2f}".format(avg_val_accuracy))
avg_val_loss = total_eval_loss / len(validation_dataloader)
validation_time = format_time(time.time() - t0)
print(" Validation Loss: {0:.2f}".format(avg_val_loss))
print(" Validation took: {:}".format(validation_time))
training_stats.append(
{
'epoch': epoch_i + 1,
'Training Loss': avg_train_loss,
'Valid. Loss': avg_val_loss,
'Valid. Accur.': avg_val_accuracy,
'Training Time': training_time,
'Validation Time': validation_time
}
)
print("")
print("Training complete!")
print("Total training took {:} (h:mm:ss)".format(format_time(time.time()-total_t0)))
======== Epoch 1 / 4 ======== Training... Batch 40 of 515. Elapsed: 0:00:19. Batch 80 of 515. Elapsed: 0:00:35. Batch 120 of 515. Elapsed: 0:00:52. Batch 160 of 515. Elapsed: 0:01:08. Batch 200 of 515. Elapsed: 0:01:25. Batch 240 of 515. Elapsed: 0:01:42. Batch 280 of 515. Elapsed: 0:01:58. Batch 320 of 515. Elapsed: 0:02:15. Batch 360 of 515. Elapsed: 0:02:32. Batch 400 of 515. Elapsed: 0:02:49. Batch 440 of 515. Elapsed: 0:03:06. Batch 480 of 515. Elapsed: 0:03:24. Average training loss: 0.14 Training epcoh took: 0:03:38 Running Validation... Accuracy: 0.97 Validation Loss: 0.21 Validation took: 0:00:08 ======== Epoch 2 / 4 ======== Training... Batch 40 of 515. Elapsed: 0:00:17. Batch 80 of 515. Elapsed: 0:00:35. Batch 120 of 515. Elapsed: 0:00:52. Batch 160 of 515. Elapsed: 0:01:09. Batch 200 of 515. Elapsed: 0:01:27. Batch 240 of 515. Elapsed: 0:01:44. Batch 280 of 515. Elapsed: 0:02:01. Batch 320 of 515. Elapsed: 0:02:19. Batch 360 of 515. Elapsed: 0:02:36. Batch 400 of 515. Elapsed: 0:02:54. Batch 440 of 515. Elapsed: 0:03:11. Batch 480 of 515. Elapsed: 0:03:29. Average training loss: 0.04 Training epcoh took: 0:03:44 Running Validation... Accuracy: 0.97 Validation Loss: 0.19 Validation took: 0:00:08 ======== Epoch 3 / 4 ======== Training... Batch 40 of 515. Elapsed: 0:00:17. Batch 80 of 515. Elapsed: 0:00:35. Batch 120 of 515. Elapsed: 0:00:52. Batch 160 of 515. Elapsed: 0:01:10. Batch 200 of 515. Elapsed: 0:01:27. Batch 240 of 515. Elapsed: 0:01:45. Batch 280 of 515. Elapsed: 0:02:02. Batch 320 of 515. Elapsed: 0:02:20. Batch 360 of 515. Elapsed: 0:02:37. Batch 400 of 515. Elapsed: 0:02:55. Batch 440 of 515. Elapsed: 0:03:12. Batch 480 of 515. Elapsed: 0:03:30. Average training loss: 0.03 Training epcoh took: 0:03:45 Running Validation... Accuracy: 0.97 Validation Loss: 0.16 Validation took: 0:00:08 ======== Epoch 4 / 4 ======== Training... Batch 40 of 515. Elapsed: 0:00:17. Batch 80 of 515. Elapsed: 0:00:35. Batch 120 of 515. Elapsed: 0:00:52. Batch 160 of 515. Elapsed: 0:01:10. Batch 200 of 515. Elapsed: 0:01:27. Batch 240 of 515. Elapsed: 0:01:45. Batch 280 of 515. Elapsed: 0:02:02. Batch 320 of 515. Elapsed: 0:02:20. Batch 360 of 515. Elapsed: 0:02:37. Batch 400 of 515. Elapsed: 0:02:55. Batch 440 of 515. Elapsed: 0:03:12. Batch 480 of 515. Elapsed: 0:03:30. Average training loss: 0.01 Training epcoh took: 0:03:45 Running Validation... Accuracy: 0.98 Validation Loss: 0.11 Validation took: 0:00:08 Training complete! Total training took 0:15:24 (h:mm:ss)
Train summary
import pandas as pd
pd.set_option('precision', 2)
df_stats = pd.DataFrame(data=training_stats)
df_stats = df_stats.set_index('epoch')
df_stats
Training Loss | Valid. Loss | Valid. Accur. | Training Time | Validation Time | |
---|---|---|---|---|---|
epoch | |||||
1 | 0.14 | 0.21 | 0.97 | 0:03:38 | 0:00:08 |
2 | 0.04 | 0.19 | 0.97 | 0:03:44 | 0:00:08 |
3 | 0.03 | 0.16 | 0.97 | 0:03:45 | 0:00:08 |
4 | 0.01 | 0.11 | 0.98 | 0:03:45 | 0:00:08 |
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
sns.set(style='darkgrid')
sns.set(font_scale=1.5)
plt.rcParams["figure.figsize"] = (12,6)
plt.plot(df_stats['Training Loss'], 'b-o', label="Training")
plt.plot(df_stats['Valid. Loss'], 'g-o', label="Validation")
plt.title("Training & Validation Loss")
plt.xlabel("Epoch")
plt.ylabel("Loss")
plt.legend()
plt.xticks([1, 2, 3, 4])
plt.show()
Create test loader
prediction_dataloader = DataLoader(
test_dataset,
sampler = SequentialSampler(test_dataset),
batch_size = batch_size
)
Evaluate on test dataset
print('Predicting labels for {:,} test sentences...'.format(len(test_dataset)))
model.eval()
predictions , true_labels = [], []
for batch in prediction_dataloader:
batch = tuple(t.to(device) for t in batch)
b_input_ids, b_input_mask, b_labels = batch
with torch.no_grad():
outputs = model(b_input_ids, token_type_ids=None,
attention_mask=b_input_mask)
logits = outputs['logits']
logits = logits.detach().cpu().numpy()
label_ids = b_labels.to('cpu').numpy()
predictions.append(logits)
true_labels.append(label_ids)
print(' DONE.')
Predicting labels for 1,000 test sentences... DONE.
results_ok = 0
results_false = 0
for idx, true_labels_batch in enumerate(true_labels):
predictions_i = np.argmax(predictions[idx], axis=1).flatten()
for bidx, true_label in enumerate(true_labels_batch):
if true_label == predictions_i[bidx]:
results_ok += 1
else:
results_false += 1
print("Correct predictions: {}, incorrect results: {}, accuracy: {}".format(results_ok, results_false, float(results_ok) / (results_ok + results_false)))
Correct predictions: 990, incorrect results: 10, accuracy: 0.99
MCC Score
from sklearn.metrics import matthews_corrcoef
matthews_set = []
print('Calculating Matthews Corr. Coef. for each batch...')
for i in range(len(true_labels)):
pred_labels_i = np.argmax(predictions[i], axis=1).flatten()
matthews = matthews_corrcoef(true_labels[i], pred_labels_i)
matthews_set.append(matthews)
Calculating Matthews Corr. Coef. for each batch...
ax = sns.barplot(x=list(range(len(matthews_set))), y=matthews_set, ci=None)
plt.title('MCC Score per Batch')
plt.ylabel('MCC Score (-1 to +1)')
plt.xlabel('Batch #')
plt.show()
flat_predictions = np.concatenate(predictions, axis=0)
flat_predictions = np.argmax(flat_predictions, axis=1).flatten()
flat_true_labels = np.concatenate(true_labels, axis=0)
mcc = matthews_corrcoef(flat_true_labels, flat_predictions)
print('Total MCC: %.3f' % mcc)
Total MCC: 0.960
Save model
from google.colab import drive
drive.mount('/content/gdrive/', force_remount=True)
output_dir = '/content/gdrive/My Drive/UAM/Przetwarzanie-tekstu/GPT2_Model'
print("Saving model to %s" % output_dir)
model_to_save = model.module if hasattr(model, 'module') else model
model_to_save.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)
Mounted at /content/gdrive/ Saving model to /content/gdrive/My Drive/UAM/Przetwarzanie-tekstu/GPT2_Model
('/content/gdrive/My Drive/UAM/Przetwarzanie-tekstu/GPT2_Model/tokenizer_config.json', '/content/gdrive/My Drive/UAM/Przetwarzanie-tekstu/GPT2_Model/special_tokens_map.json', '/content/gdrive/My Drive/UAM/Przetwarzanie-tekstu/GPT2_Model/vocab.json', '/content/gdrive/My Drive/UAM/Przetwarzanie-tekstu/GPT2_Model/merges.txt', '/content/gdrive/My Drive/UAM/Przetwarzanie-tekstu/GPT2_Model/added_tokens.json')