uczenie/T5.ipynb

!pip install datasets transformers sentencepiece
Requirement already satisfied: datasets in /usr/local/lib/python3.10/dist-packages (2.16.1)
Requirement already satisfied: transformers in /usr/local/lib/python3.10/dist-packages (4.35.2)
Requirement already satisfied: sentencepiece in /usr/local/lib/python3.10/dist-packages (0.1.99)
Requirement already satisfied: filelock in /usr/local/lib/python3.10/dist-packages (from datasets) (3.13.1)
Requirement already satisfied: numpy>=1.17 in /usr/local/lib/python3.10/dist-packages (from datasets) (1.23.5)
Requirement already satisfied: pyarrow>=8.0.0 in /usr/local/lib/python3.10/dist-packages (from datasets) (10.0.1)
Requirement already satisfied: pyarrow-hotfix in /usr/local/lib/python3.10/dist-packages (from datasets) (0.6)
Requirement already satisfied: dill<0.3.8,>=0.3.0 in /usr/local/lib/python3.10/dist-packages (from datasets) (0.3.7)
Requirement already satisfied: pandas in /usr/local/lib/python3.10/dist-packages (from datasets) (1.5.3)
Requirement already satisfied: requests>=2.19.0 in /usr/local/lib/python3.10/dist-packages (from datasets) (2.31.0)
Requirement already satisfied: tqdm>=4.62.1 in /usr/local/lib/python3.10/dist-packages (from datasets) (4.66.1)
Requirement already satisfied: xxhash in /usr/local/lib/python3.10/dist-packages (from datasets) (3.4.1)
Requirement already satisfied: multiprocess in /usr/local/lib/python3.10/dist-packages (from datasets) (0.70.15)
Requirement already satisfied: fsspec[http]<=2023.10.0,>=2023.1.0 in /usr/local/lib/python3.10/dist-packages (from datasets) (2023.6.0)
Requirement already satisfied: aiohttp in /usr/local/lib/python3.10/dist-packages (from datasets) (3.9.1)
Requirement already satisfied: huggingface-hub>=0.19.4 in /usr/local/lib/python3.10/dist-packages (from datasets) (0.20.2)
Requirement already satisfied: packaging in /usr/local/lib/python3.10/dist-packages (from datasets) (23.2)
Requirement already satisfied: pyyaml>=5.1 in /usr/local/lib/python3.10/dist-packages (from datasets) (6.0.1)
Requirement already satisfied: regex!=2019.12.17 in /usr/local/lib/python3.10/dist-packages (from transformers) (2023.6.3)
Requirement already satisfied: tokenizers<0.19,>=0.14 in /usr/local/lib/python3.10/dist-packages (from transformers) (0.15.0)
Requirement already satisfied: safetensors>=0.3.1 in /usr/local/lib/python3.10/dist-packages (from transformers) (0.4.1)
Requirement already satisfied: attrs>=17.3.0 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets) (23.2.0)
Requirement already satisfied: multidict<7.0,>=4.5 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets) (6.0.4)
Requirement already satisfied: yarl<2.0,>=1.0 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets) (1.9.4)
Requirement already satisfied: frozenlist>=1.1.1 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets) (1.4.1)
Requirement already satisfied: aiosignal>=1.1.2 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets) (1.3.1)
Requirement already satisfied: async-timeout<5.0,>=4.0 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets) (4.0.3)
Requirement already satisfied: typing-extensions>=3.7.4.3 in /usr/local/lib/python3.10/dist-packages (from huggingface-hub>=0.19.4->datasets) (4.5.0)
Requirement already satisfied: charset-normalizer<4,>=2 in /usr/local/lib/python3.10/dist-packages (from requests>=2.19.0->datasets) (3.3.2)
Requirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.10/dist-packages (from requests>=2.19.0->datasets) (3.6)
Requirement already satisfied: urllib3<3,>=1.21.1 in /usr/local/lib/python3.10/dist-packages (from requests>=2.19.0->datasets) (2.0.7)
Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.10/dist-packages (from requests>=2.19.0->datasets) (2023.11.17)
Requirement already satisfied: python-dateutil>=2.8.1 in /usr/local/lib/python3.10/dist-packages (from pandas->datasets) (2.8.2)
Requirement already satisfied: pytz>=2020.1 in /usr/local/lib/python3.10/dist-packages (from pandas->datasets) (2023.3.post1)
Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.10/dist-packages (from python-dateutil>=2.8.1->pandas->datasets) (1.16.0)
from datasets import load_dataset
import torch
from transformers import T5ForConditionalGeneration, T5Tokenizer, TrainingArguments, Trainer
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler, TensorDataset
import random
import time
import numpy as np
import datetime
import sklearn
from tqdm.notebook import tqdm
import os
def load_and_process_dataset():
    dataset = load_dataset("sst2")
    # Note: remove_columns is not in-place, so 'idx' actually remains here and is
    # only dropped later in the map step.
    dataset.remove_columns('idx')
    # The original SST-2 test split is unlabelled: drop it and promote the
    # labelled validation split to serve as the test set.
    del dataset['test']
    dataset['test'] = dataset['validation']
    del dataset['validation']
    # Carve a fresh 1600-example validation split out of the training data.
    split_dataset = dataset['train'].train_test_split(test_size=1600)
    dataset['train'] = split_dataset['train']
    dataset['validation'] = split_dataset['test']
    return dataset
dataset = load_and_process_dataset()
dataset
/usr/local/lib/python3.10/dist-packages/huggingface_hub/utils/_token.py:88: UserWarning: 
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
  warnings.warn(
DatasetDict({
    train: Dataset({
        features: ['idx', 'sentence', 'label'],
        num_rows: 65749
    })
    test: Dataset({
        features: ['idx', 'sentence', 'label'],
        num_rows: 872
    })
    validation: Dataset({
        features: ['idx', 'sentence', 'label'],
        num_rows: 1600
    })
})
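As a quick sanity check (a sketch, not part of the recorded run), the class balance of the new training split can be inspected before the label column is mapped away:
# Sketch: count how many positive (1) and negative (0) examples the train split holds.
from collections import Counter
print(Counter(dataset['train']['label']))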
# Cast each example into T5's text-to-text format: a task prefix on the input,
# and the word "positive"/"negative" as the target.
for key in dataset.keys():
  dataset[key] = dataset[key].map(
      lambda x: {'source_text': f"sentiment-analysis: {x['sentence']}",
                 'target_text': 'positive' if x['label'] else 'negative'}
  ).remove_columns(['idx', 'sentence', 'label'])
Map:   0%|          | 0/65749 [00:00<?, ? examples/s]
Map:   0%|          | 0/1600 [00:00<?, ? examples/s]
train = dataset['train']
validation = dataset['validation']
test = dataset['test']
dataset['train'][0]
{'source_text': 'sentiment-analysis: proficiently enough to trounce its overly comfortable trappings ',
 'target_text': 'positive'}
tokenizer = T5Tokenizer.from_pretrained("t5-small")
model = T5ForConditionalGeneration.from_pretrained("t5-small")
You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thouroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
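Before tokenizing, it helps to confirm how the prompt and the target words come out of the T5 tokenizer; this is a small sketch (not in the original notebook) that motivates the max_length=2 used for targets below.
# Sketch: inspect tokenization of a prompt and of the two target words.
# For T5, </s> has id 1 and the pad token has id 0; "positive"/"negative" should
# each fit in two tokens (one word piece plus </s>).
print(tokenizer("sentiment-analysis: a gorgeous film").input_ids)
print(tokenizer("positive").input_ids)
print(tokenizer("negative").input_ids)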
def tokenize(dataset):
  input_ids = []
  target_ids = []
  attention_masks = []
  for row in dataset:
    # Encode the prompt, padded/truncated to a fixed 86 tokens.
    encoding = tokenizer.encode_plus(
      row['source_text'],
      padding="max_length",
      max_length=86,
      truncation=True,
      add_special_tokens=True,
      return_attention_mask=True,
      return_tensors="pt"
    )
    # The target is a single word ("positive"/"negative"), so 2 tokens
    # (word piece + </s>) suffice.
    target_encoding = tokenizer.encode_plus(
      row['target_text'],
      padding="max_length",
      max_length=2,
      truncation=True,
      add_special_tokens=True,
      return_attention_mask=True,
      return_tensors="pt",
    )
    input_ids.append(encoding['input_ids'])
    target_ids.append(target_encoding['input_ids'])
    attention_masks.append(encoding['attention_mask'])
  # Stack everything into one TensorDataset: (input_ids, attention_mask, target_ids).
  return TensorDataset(torch.cat(input_ids, dim=0), torch.cat(attention_masks, dim=0), torch.cat(target_ids, dim=0))
train_tokenized = tokenize(train)
validation_tokenized = tokenize(validation)
test_tokenized = tokenize(test)
batch_size = 32

# Shuffle training batches; keep validation and test batches in a fixed order.
train_dataloader = DataLoader(
    train_tokenized,
    sampler=RandomSampler(train_tokenized),
    batch_size=batch_size
)

validation_dataloader = DataLoader(
    validation_tokenized,
    sampler=SequentialSampler(validation_tokenized),
    batch_size=batch_size
)

test_dataloader = DataLoader(
    test_tokenized,
    sampler=SequentialSampler(test_tokenized),
    batch_size=batch_size
)
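A quick check (a sketch, not in the recorded run) of what one training batch looks like, matching the unpacking order used in the training loop below:
# One batch = (input_ids, attention_mask, target_ids), as stacked in tokenize().
b_ids, b_mask, b_targets = next(iter(train_dataloader))
print(b_ids.shape, b_mask.shape, b_targets.shape)
# expected: torch.Size([32, 86]) torch.Size([32, 86]) torch.Size([32, 2])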
import logging
import os
import random
import numpy as np
import torch
from torch.optim import AdamW
from torch.utils.data import DataLoader
from tqdm.notebook import tqdm
import sklearn.metrics

class T5Model:
    def __init__(self, model, tokenizer, train_dataloader, val_dataloader, logs_dir_path, seed=42, epochs=3, lr=1e-4, eps=1e-8):
        self.model = model
        self.tokenizer = tokenizer
        self.train_dataloader = train_dataloader
        self.val_dataloader = val_dataloader
        self.seed = seed
        self.epochs = epochs
        self.learning_rate = lr
        self.eps = eps
        self.logs_dir_path = logs_dir_path
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.model = self.model.to(self.device)
        self.optimizer = AdamW(self.model.parameters(), lr=lr, eps=eps)
        self.init_logging()
        self.init_seed()

    def init_seed(self):
        random.seed(self.seed)
        np.random.seed(self.seed)
        torch.manual_seed(self.seed)
        torch.cuda.manual_seed_all(self.seed)

    def init_logging(self):
        if not os.path.exists(self.logs_dir_path):
            os.makedirs(self.logs_dir_path)
        logging.basicConfig(filename=os.path.join(self.logs_dir_path, 'training.log'), level=logging.INFO)

    def compute_metrics(self, target, preds):
        return sklearn.metrics.accuracy_score(target, preds)

    def train(self):
        print(f"Starting training, epochs: {self.epochs}")
        for epoch in range(self.epochs):
            self._train(epoch)
            self.validate(epoch)

    def _train(self, epoch_number):
        print(f"Training epoch: {epoch_number}")
        self.model.train()
        train_accuracy_total = 0
        pbar = tqdm(self.train_dataloader, total=len(self.train_dataloader))
        for step, batch in enumerate(pbar):
            b_input_ids, b_input_mask, b_labels = [b.to(self.device) for b in batch]

            # Teacher-forced forward pass: returns the token-level cross-entropy loss
            # against the "positive"/"negative" targets.
            outputs = self.model(
                input_ids=b_input_ids,
                attention_mask=b_input_mask,
                labels=b_labels
            )

            # Separate free-running generation to read off the predicted label word,
            # used only for the running accuracy metric.
            gen_output = self.model.generate(
                input_ids=b_input_ids,
                attention_mask=b_input_mask,
                max_length=3,
                num_beams=2,
                repetition_penalty=2.5,
                length_penalty=1.0,
                early_stopping=True
            )

            preds = [self.tokenizer.decode(g, skip_special_tokens=True, clean_up_tokenization_spaces=True) for g in gen_output]
            target = [self.tokenizer.decode(t, skip_special_tokens=True, clean_up_tokenization_spaces=True) for t in b_labels]

            loss = outputs[0]
            self.optimizer.zero_grad()
            loss.backward()
            self.optimizer.step()

            accuracy = self.compute_metrics(target, preds)
            train_accuracy_total += accuracy
            pbar.set_description(f"Epoch {epoch_number+1}, step {step}, train loss {loss}, accuracy: {accuracy}")

        print(f"Epoch: {epoch_number+1}, Average train accuracy: {train_accuracy_total/len(self.train_dataloader)}")

    def validate(self, epoch_number):
        print(f"Validation epoch: {epoch_number}")
        self.model.eval()
        val_accuracy_total = 0
        pbar_val = tqdm(self.val_dataloader, total=len(self.val_dataloader))
        for step, batch in enumerate(pbar_val):
            b_input_ids, b_input_mask, b_labels = [b.to(self.device) for b in batch]

            with torch.no_grad():
                outputs = self.model(
                    input_ids=b_input_ids,
                    attention_mask=b_input_mask,
                    labels=b_labels
                )

                loss = outputs[0]

                gen_output = self.model.generate(
                    input_ids=b_input_ids,
                    attention_mask=b_input_mask,
                    max_length=3,
                    num_beams=2,
                    repetition_penalty=2.5,
                    length_penalty=1.0,
                    early_stopping=True
                )

                preds = [self.tokenizer.decode(g, skip_special_tokens=True, clean_up_tokenization_spaces=True) for g in gen_output]
                target = [self.tokenizer.decode(t, skip_special_tokens=True, clean_up_tokenization_spaces=True) for t in b_labels]

                accuracy_val = self.compute_metrics(target, preds)
                val_accuracy_total += accuracy_val
                pbar_val.set_description(f"Epoch {epoch_number+1}, val loss {loss}, accuracy: {accuracy_val}")

        print(f"Epoch: {epoch_number+1}, Average validation accuracy: {val_accuracy_total/len(self.val_dataloader)}")

    def evaluate(self, test_dataloader):
        print("Evaluating on test data")
        self.model.eval()
        total_test_acc = 0
        for batch in tqdm(test_dataloader, total=len(test_dataloader)):
            b_input_ids, b_input_mask, b_labels = [b.to(self.device) for b in batch]

            with torch.no_grad():
                gen_output = self.model.generate(
                    input_ids=b_input_ids,
                    attention_mask=b_input_mask,
                    max_length=3,
                    num_beams=2,
                    repetition_penalty=2.5,
                    length_penalty=1.0,
                    early_stopping=True
                )

                preds = [self.tokenizer.decode(g, skip_special_tokens=True, clean_up_tokenization_spaces=True) for g in gen_output]
                target = [self.tokenizer.decode(t, skip_special_tokens=True, clean_up_tokenization_spaces=True) for t in b_labels]

                total_test_acc += self.compute_metrics(target, preds)

        print("Average test accuracy: ", total_test_acc / len(test_dataloader))
t5model = T5Model(model, tokenizer, train_dataloader, validation_dataloader, "./logs")
t5model.train()
Starting training, epochs: 3
Training epoch: 0
  0%|          | 0/2055 [00:00<?, ?it/s]
Epoch: 1, Average train accuracy: 0.888327685088634
Validation epoch: 0
  0%|          | 0/50 [00:00<?, ?it/s]
Epoch: 1, Average validation accuracy: 0.939375
Training epoch: 1
  0%|          | 0/2055 [00:00<?, ?it/s]
Epoch: 2, Average train accuracy: 0.9262925790754258
Validation epoch: 1
  0%|          | 0/50 [00:00<?, ?it/s]
Epoch: 2, Average validation accuracy: 0.9375
Training epoch: 2
  0%|          | 0/2055 [00:00<?, ?it/s]
Epoch: 3, Average train accuracy: 0.9394616788321168
Validation epoch: 2
  0%|          | 0/50 [00:00<?, ?it/s]
Epoch: 3, Average validation accuracy: 0.950625
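The test split is never scored in the recorded run; the evaluate method defined above can be called on test_dataloader like so (a sketch, no output reproduced here):
# Score the fine-tuned model on the held-out test set (the original SST-2
# validation split); prints the batch-averaged accuracy.
t5model.evaluate(test_dataloader)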
print(model)
T5ForConditionalGeneration(
  (shared): Embedding(32128, 512)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32128, 512)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=512, out_features=512, bias=False)
              (k): Linear(in_features=512, out_features=512, bias=False)
              (v): Linear(in_features=512, out_features=512, bias=False)
              (o): Linear(in_features=512, out_features=512, bias=False)
              (relative_attention_bias): Embedding(32, 8)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseActDense(
              (wi): Linear(in_features=512, out_features=2048, bias=False)
              (wo): Linear(in_features=2048, out_features=512, bias=False)
              (dropout): Dropout(p=0.1, inplace=False)
              (act): ReLU()
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
        )
      )
      (1-5): 5 x T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=512, out_features=512, bias=False)
              (k): Linear(in_features=512, out_features=512, bias=False)
              (v): Linear(in_features=512, out_features=512, bias=False)
              (o): Linear(in_features=512, out_features=512, bias=False)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseActDense(
              (wi): Linear(in_features=512, out_features=2048, bias=False)
              (wo): Linear(in_features=2048, out_features=512, bias=False)
              (dropout): Dropout(p=0.1, inplace=False)
              (act): ReLU()
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
        )
      )
    )
    (final_layer_norm): T5LayerNorm()
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (decoder): T5Stack(
    (embed_tokens): Embedding(32128, 512)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=512, out_features=512, bias=False)
              (k): Linear(in_features=512, out_features=512, bias=False)
              (v): Linear(in_features=512, out_features=512, bias=False)
              (o): Linear(in_features=512, out_features=512, bias=False)
              (relative_attention_bias): Embedding(32, 8)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerCrossAttention(
            (EncDecAttention): T5Attention(
              (q): Linear(in_features=512, out_features=512, bias=False)
              (k): Linear(in_features=512, out_features=512, bias=False)
              (v): Linear(in_features=512, out_features=512, bias=False)
              (o): Linear(in_features=512, out_features=512, bias=False)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (2): T5LayerFF(
            (DenseReluDense): T5DenseActDense(
              (wi): Linear(in_features=512, out_features=2048, bias=False)
              (wo): Linear(in_features=2048, out_features=512, bias=False)
              (dropout): Dropout(p=0.1, inplace=False)
              (act): ReLU()
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
        )
      )
      (1-5): 5 x T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=512, out_features=512, bias=False)
              (k): Linear(in_features=512, out_features=512, bias=False)
              (v): Linear(in_features=512, out_features=512, bias=False)
              (o): Linear(in_features=512, out_features=512, bias=False)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerCrossAttention(
            (EncDecAttention): T5Attention(
              (q): Linear(in_features=512, out_features=512, bias=False)
              (k): Linear(in_features=512, out_features=512, bias=False)
              (v): Linear(in_features=512, out_features=512, bias=False)
              (o): Linear(in_features=512, out_features=512, bias=False)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (2): T5LayerFF(
            (DenseReluDense): T5DenseActDense(
              (wi): Linear(in_features=512, out_features=2048, bias=False)
              (wo): Linear(in_features=2048, out_features=512, bias=False)
              (dropout): Dropout(p=0.1, inplace=False)
              (act): ReLU()
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
        )
      )
    )
    (final_layer_norm): T5LayerNorm()
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (lm_head): Linear(in_features=512, out_features=32128, bias=False)
)
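For completeness, a minimal inference sketch (assumed usage, not part of the original notebook) showing how the fine-tuned model classifies a single sentence:
# Hypothetical helper, not from the original notebook: classify one sentence
# with the same prompt format used during fine-tuning.
def predict_sentiment(sentence):
    device = next(model.parameters()).device
    enc = tokenizer(
        f"sentiment-analysis: {sentence}",
        return_tensors="pt",
        truncation=True,
        max_length=86,
    ).to(device)
    with torch.no_grad():
        out = model.generate(**enc, max_length=3)
    return tokenizer.decode(out[0], skip_special_tokens=True)

print(predict_sentiment("a gorgeous, witty, seductive movie"))  # expected: 'positive'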