empatia-projekt/chatbot_training.ipynb

import locale
locale.getpreferredencoding = lambda: "UTF-8"
!pip install transformers torch accelerate
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Requirement already satisfied: transformers in /usr/local/lib/python3.10/dist-packages (4.30.2)
Requirement already satisfied: torch in /usr/local/lib/python3.10/dist-packages (2.0.1+cu118)
Requirement already satisfied: accelerate in /usr/local/lib/python3.10/dist-packages (0.20.3)
import pandas as pd
from transformers import AutoTokenizer, AutoModelForCausalLM, set_seed

model = AutoModelForCausalLM.from_pretrained('flax-community/papuGaPT2')
tokenizer = AutoTokenizer.from_pretrained('flax-community/papuGaPT2')

# model = AutoModelForCausalLM.from_pretrained('sdadas/polish-gpt2-medium')
# tokenizer = AutoTokenizer.from_pretrained('sdadas/polish-gpt2-medium')

# The GPT-2 tokenizer has no dedicated padding token, so reuse the EOS token for padding.
tokenizer.pad_token = tokenizer.eos_token

Loading the fine-tuning data

We created the data by hand and with the help of ChatGPT.
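
The two CSV files loaded below are assumed to share a simple two-column layout with question and answer columns (as used by the code further down); an illustrative row, taken from the sample printed later in this notebook:

question,answer
powodzenia w szkole.,Dziękuję bardzo.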

from google.colab import drive

drive.mount('/content/gdrive/', force_remount=True)
working_dir = '/content/gdrive/My Drive/empatia/'
Mounted at /content/gdrive/
dialogs_df = pd.read_csv(working_dir + 'data/dialogs.csv')
dialogs2_df = pd.read_csv(working_dir + 'data/dialogs2.csv')

dialogs_df = pd.concat([dialogs_df, dialogs2_df])

texts = 'question: ' + dialogs_df['question'] + "\nanswer: " + dialogs_df['answer']
texts = texts.tolist()

print(texts[10])
question: powodzenia w szkole.
answer: Dziękuję bardzo.
dialogs_df.sample(5)
question answer
405 Szkoda, że nie mogę pracować mniej. Czuję si... Próbowałem tego, czego naprawdę potrzebuję, je...
548 Tak, to było o wiele prostsze. Cieszyliśmy się... życie było proste wtedy nie było! bardzo ładny.
564 Dowiedziałem się więc czegoś, co bardzo mnie z... Moje dziecko wyszło za moimi plecami i wymknęł...
142 moja wina, miałem obowiązki do zrobienia. w porządku.
384 brzmi jakby to była bliska gra. dlatego była to tak świetna gra.

Preprocessing

from torch.utils.data import Dataset, DataLoader, random_split, RandomSampler, SequentialSampler
import torch

# Custom dataset: each dialog is tokenized once up front and padded/truncated
# to a fixed length of 512 tokens.
class KolegaDataset(Dataset):
  def __init__(self, txt_list, tokenizer):
    self.tokenizer = tokenizer
    self.input_ids = []
    self.attn_masks = []

    for txt in txt_list:
      # Pad (with the EOS token set above) or truncate every example to 512 tokens.
      encodings_dict = tokenizer(txt, padding="max_length", truncation=True, max_length=512)
      self.input_ids.append(torch.tensor(encodings_dict['input_ids']))
      self.attn_masks.append(torch.tensor(encodings_dict['attention_mask']))

  def __len__(self):
    return len(self.input_ids)

  def __getitem__(self, idx):
    # Returns (input_ids, attention_mask); the training loop derives labels from input_ids.
    return self.input_ids[idx], self.attn_masks[idx]
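
Because pad_token is the EOS token and the training loop below reuses input_ids as labels, the loss is also computed over the padding positions. A common alternative (a sketch only, not used in this notebook) is to return explicit labels with padding masked to -100, which the Hugging Face causal LM loss ignores:

# Sketch: a variant __getitem__ that masks padding in the labels.
# Positions where attention_mask == 0 become -100, which the
# cross-entropy loss inside GPT2LMHeadModel skips.
def __getitem__(self, idx):
    input_ids = self.input_ids[idx]
    attn_mask = self.attn_masks[idx]
    labels = input_ids.clone()
    labels[attn_mask == 0] = -100
    return input_ids, attn_mask, labels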
dataset = KolegaDataset(texts, tokenizer)

train_size = int(0.9 * len(dataset))
val_size = len(dataset) - train_size

train_dataset, val_dataset = random_split(dataset, [train_size, val_size])

print('Train dataset size: ', train_size)
print('Validation dataset size: ', val_size)
Train dataset size:  1349
Validation dataset size:  150
batch_size = 8

train_dataloader = DataLoader(train_dataset, sampler=RandomSampler(train_dataset), batch_size=batch_size)
validation_dataloader = DataLoader(val_dataset, sampler=SequentialSampler(val_dataset), batch_size=batch_size)
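
An optional sanity check of a single batch; with batch_size = 8 and max_length = 512, both tensors should have shape (8, 512):

sample_input_ids, sample_attn_masks = next(iter(train_dataloader))
print(sample_input_ids.shape, sample_attn_masks.shape)  # expected: torch.Size([8, 512]) torch.Size([8, 512])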

Fine-tuning

# some parameters I cooked up that work reasonably well

epochs = 20
learning_rate = 0.0005
warmup_steps = 1e2
epsilon = 1e-8
from transformers import AdamW, get_linear_schedule_with_warmup

optimizer = AdamW(model.parameters(), lr = learning_rate, eps = epsilon)
/usr/local/lib/python3.10/dist-packages/transformers/optimization.py:411: FutureWarning: This implementation of AdamW is deprecated and will be removed in a future version. Use the PyTorch implementation torch.optim.AdamW instead, or set `no_deprecation_warning=True` to disable this warning
  warnings.warn(
total_steps = len(train_dataloader) * epochs

scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps = warmup_steps, num_training_steps = total_steps)
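
The FutureWarning above recommends the PyTorch optimizer over the deprecated transformers.AdamW; an equivalent drop-in sketch (not used for the run below):

# Same hyperparameters, but with the non-deprecated PyTorch implementation.
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate, eps=epsilon)
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=warmup_steps, num_training_steps=total_steps)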
import datetime
import time
import random

def format_time(elapsed):
    return str(datetime.timedelta(seconds=int(round((elapsed)))))

device = torch.device("cuda")
model.cuda()
GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(51200, 768)
    (wpe): Embedding(2048, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): FastGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=51200, bias=False)
)
total_t0 = time.time()

training_stats = []

model = model.to(device)

for epoch_i in range(0, epochs):

    # ========================================
    #               Training
    # ========================================

    print("")
    print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
    print('Training...')

    t0 = time.time()

    total_train_loss = 0

    model.train()

    for step, batch in enumerate(train_dataloader):

        b_input_ids = batch[0].to(device)
        b_labels = batch[0].to(device)
        b_masks = batch[1].to(device)

        model.zero_grad()

        outputs = model(b_input_ids,
                        labels=b_labels,
                        attention_mask=b_masks,
                        token_type_ids=None)

        loss = outputs[0]

        batch_loss = loss.item()
        total_train_loss += batch_loss

        loss.backward()

        optimizer.step()

        scheduler.step()

    # Calculate the average loss over all of the batches.
    avg_train_loss = total_train_loss / len(train_dataloader)

    # Measure how long this epoch took.
    training_time = format_time(time.time() - t0)

    print("")
    print("  Average training loss: {0:.2f}".format(avg_train_loss))
    print("  Training epoch took: {:}".format(training_time))

    # ========================================
    #               Validation
    # ========================================

    print("")
    print("Running Validation...")

    t0 = time.time()

    model.eval()

    total_eval_loss = 0
    nb_eval_steps = 0

    # Evaluate data for one epoch
    for batch in validation_dataloader:

        b_input_ids = batch[0].to(device)
        b_labels = batch[0].to(device)
        b_masks = batch[1].to(device)

        with torch.no_grad():

            outputs = model(b_input_ids,
                            attention_mask=b_masks,
                            labels=b_labels)

            loss = outputs[0]

        batch_loss = loss.item()
        total_eval_loss += batch_loss

    avg_val_loss = total_eval_loss / len(validation_dataloader)

    validation_time = format_time(time.time() - t0)

    print("  Validation Loss: {0:.2f}".format(avg_val_loss))
    print("  Validation took: {:}".format(validation_time))

    # Record all statistics from this epoch.
    training_stats.append(
        {
            'epoch': epoch_i + 1,
            'Training Loss': avg_train_loss,
            'Valid. Loss': avg_val_loss,
            'Training Time': training_time,
            'Validation Time': validation_time
        }
    )

print("")
print("Training complete!")
print("Total training took {:} (h:mm:ss)".format(format_time(time.time()-total_t0)))
Training log (each training epoch took 0:00:46, each validation pass 0:00:02):

Epoch   Avg. training loss   Validation loss
    1                 0.44              0.15
    2                 0.12              0.11
    3                 0.08              0.09
    4                 0.05              0.08
    5                 0.04              0.08
    6                 0.03              0.08
    7                 0.03              0.08
    8                 0.02              0.08
    9                 0.02              0.07
   10                 0.02              0.07
   11                 0.02              0.07
   12                 0.02              0.07
   13                 0.02              0.07
   14                 0.02              0.07
   15                 0.02              0.07
   16                 0.02              0.07
   17                 0.02              0.07
   18                 0.02              0.07
   19                 0.02              0.07
   20                 0.01              0.08

Training complete!
Total training took 0:16:00 (h:mm:ss)
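
The per-epoch numbers collected in training_stats can also be inspected as a DataFrame, for example:

stats_df = pd.DataFrame(training_stats).set_index('epoch')
print(stats_df[['Training Loss', 'Valid. Loss']])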
model.eval()

input_text = "question: Cześć, byłem dziś w szkole i było źle\nanswer:"
input_ids = tokenizer.encode(input_text, return_tensors='pt')
input_ids = input_ids.to(device)

output = model.generate(input_ids, max_length=100, early_stopping=True)

generated_text = tokenizer.decode(output[0], skip_special_tokens=True)
print(generated_text)
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
question: Cześć, byłem dziś w szkole i było źle
answer: Nie, byłem za młody, ale prawie płakałem
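
The attention-mask/pad-token warning above can be avoided by passing both to generate explicitly; a sketch (the sampling settings are illustrative, not tuned):

encoded = tokenizer(input_text, return_tensors='pt').to(device)
output = model.generate(encoded['input_ids'],
                        attention_mask=encoded['attention_mask'],
                        pad_token_id=tokenizer.eos_token_id,
                        max_length=100,
                        do_sample=True,
                        top_p=0.95)
print(tokenizer.decode(output[0], skip_special_tokens=True))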
model.save_pretrained('/content/gdrive/MyDrive/empatia/model')
tokenizer.save_pretrained('/content/gdrive/MyDrive/empatia/model')
('/content/gdrive/MyDrive/empatia/model/tokenizer_config.json',
 '/content/gdrive/MyDrive/empatia/model/special_tokens_map.json',
 '/content/gdrive/MyDrive/empatia/model/vocab.json',
 '/content/gdrive/MyDrive/empatia/model/merges.txt',
 '/content/gdrive/MyDrive/empatia/model/added_tokens.json',
 '/content/gdrive/MyDrive/empatia/model/tokenizer.json')
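
The saved checkpoint can later be reloaded the same way the base model was, for example:

model = AutoModelForCausalLM.from_pretrained('/content/gdrive/MyDrive/empatia/model')
tokenizer = AutoTokenizer.from_pretrained('/content/gdrive/MyDrive/empatia/model')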