sztuczna-empatia-kaczuszka/project/flan-t5_ubuntu_finetuning.ipynb

Setup

from google.colab import drive
drive.mount('/content/drive')
Mounted at /content/drive
!pip install -Uq transformers bitsandbytes accelerate datasets==2.0.0

Create dataset

from datasets import load_dataset

dataset = load_dataset("sedthh/ubuntu_dialogue_qa", split="train")
WARNING:datasets.builder:Using custom data configuration sedthh--ubuntu_dialogue_qa-8763539f847553a9
Downloading and preparing dataset parquet/sedthh--ubuntu_dialogue_qa to /root/.cache/huggingface/datasets/parquet/sedthh--ubuntu_dialogue_qa-8763539f847553a9/0.0.0/0b6d5799bb726b24ad7fc7be720c170d8e497f575d02d47537de9a5bac074901...
Dataset parquet downloaded and prepared to /root/.cache/huggingface/datasets/parquet/sedthh--ubuntu_dialogue_qa-8763539f847553a9/0.0.0/0b6d5799bb726b24ad7fc7be720c170d8e497f575d02d47537de9a5bac074901. Subsequent calls will reuse this data.
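As a quick sanity check (a sketch, not part of the original run), one record can be printed to confirm the column layout that the preprocessing step below relies on:

# Inspect the columns and one raw example
print(dataset.column_names)  # expected: INSTRUCTION, RESPONSE, SOURCE, METADATA
print(dataset[0]['INSTRUCTION'])
print(dataset[0]['RESPONSE'])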

Define training components

from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

model_name = 'google/flan-t5-large'
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name, device_map='auto')
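Before fine-tuning, the stock checkpoint can be probed with the same prompt prefix that the preprocessing uses. This is a minimal sketch and the question is a made-up example:

# Optional baseline check: generate with the untuned flan-t5-large
prompt = 'Answer the question: How do I check my Ubuntu version?'  # made-up example question
inputs = tokenizer(prompt, return_tensors='pt').to(model.device)
outputs = model.generate(**inputs, max_new_tokens=64)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))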
def preprocess_function(sample):
    # Prepend the task prefix to every question
    inputs = ['Answer the question: ' + item for item in sample['INSTRUCTION']]
    model_inputs = tokenizer(inputs, padding=True, truncation=True)
    # Tokenize the answers as targets
    labels = tokenizer(text_target=sample['RESPONSE'], padding=True, truncation=True)
    # Replace padding token ids in the labels with -100 so they are ignored by the loss
    labels['input_ids'] = [[(l if l != tokenizer.pad_token_id else -100) for l in label] for label in labels['input_ids']]
    model_inputs['labels'] = labels['input_ids']
    return model_inputs

tokenized_dataset = dataset.map(preprocess_function, batched=True, remove_columns=['INSTRUCTION', 'RESPONSE', 'SOURCE', 'METADATA'])
WARNING:datasets.fingerprint:Parameter 'function'=<function preprocess_function at 0x7fe0df129ea0> of the transform datasets.arrow_dataset.Dataset._map_single couldn't be hashed properly, a random hash was used instead. Make sure your transforms and parameters are serializable with pickle or dill for the dataset fingerprinting and caching to work. If you reuse this transform, the caching mechanism will consider it to be different from the previous calls and recompute everything. This warning is only showed once. Subsequent hashing failures won't be showed.
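To verify the mapping did what was intended, a processed example can be decoded back to text (a sketch, not in the original notebook); the -100 positions in the labels are swapped for the pad id only so they can be decoded for inspection:

# Decode one processed example to confirm the prompt prefix and label masking
example = tokenized_dataset[0]
print(tokenizer.decode(example['input_ids'], skip_special_tokens=True))
label_ids = [tok if tok != -100 else tokenizer.pad_token_id for tok in example['labels']]
print(tokenizer.decode(label_ids, skip_special_tokens=True))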
from transformers import DataCollatorForSeq2Seq

data_collator = DataCollatorForSeq2Seq(
    tokenizer,
    model=model
)
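The collator pads inputs and labels per batch at training time and, because the model is passed in, also prepares decoder inputs from the labels. A small sketch of what it produces for the first two examples:

# Inspect the tensor shapes the collator builds for a tiny batch
features = [tokenized_dataset[i] for i in range(2)]
batch = data_collator(features)
print({k: v.shape for k, v in batch.items()})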

Training

from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments

output_dir = 'flan-t5_large_v2'

training_args = Seq2SeqTrainingArguments(
    output_dir=output_dir,
    auto_find_batch_size=True,
    num_train_epochs=3,
    logging_dir=f'{output_dir}/logs',
    save_steps=25000
)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_dataset
)
model.config.use_cache = False  # the key/value cache is only useful for generation, not training
trainer.train()
/usr/local/lib/python3.10/dist-packages/transformers/optimization.py:411: FutureWarning: This implementation of AdamW is deprecated and will be removed in a future version. Use the PyTorch implementation torch.optim.AdamW instead, or set `no_deprecation_warning=True` to disable this warning
  warnings.warn(
You're using a T5TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
[two earlier progress readouts (2/6066 and 2/12132 steps) from runs restarted by auto_find_batch_size at larger batch sizes]
[24261/24261 4:52:22, Epoch 3/3]
Step Training Loss
500 3.320700
1000 3.173000
1500 3.142500
2000 3.070900
2500 3.083600
3000 3.057900
3500 3.005900
4000 3.016900
4500 3.012300
5000 2.957100
5500 2.958700
6000 2.976900
6500 2.959700
7000 2.975100
7500 2.951100
8000 2.945200
8500 2.849600
9000 2.722500
9500 2.854400
10000 2.782700
10500 2.845900
11000 2.803300
11500 2.797800
12000 2.827400
12500 2.757800
13000 2.823600
13500 2.827200
14000 2.832400
14500 2.839800
15000 2.783400
15500 2.774200
16000 2.785500
16500 2.793600
17000 2.801200
17500 2.812800
18000 2.813300
18500 2.804800
19000 2.777800
19500 2.761100
20000 2.832700
20500 2.868300
21000 2.757300
21500 2.763300
22000 2.817900
22500 2.775300
23000 2.796400
23500 2.798700
24000 2.778000

TrainOutput(global_step=24261, training_loss=2.8796593968087287, metrics={'train_runtime': 17543.4268, 'train_samples_per_second': 2.766, 'train_steps_per_second': 1.383, 'total_flos': 3.65470024955904e+16, 'train_loss': 2.8796593968087287, 'epoch': 3.0})
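After training, the fine-tuned weights can be tried with the same generate call as before; the cache is re-enabled since it was turned off for training. A sketch with a made-up question:

# Try the fine-tuned model on a made-up question
model.config.use_cache = True
prompt = 'Answer the question: How do I install a .deb package from the terminal?'
inputs = tokenizer(prompt, return_tensors='pt').to(model.device)
outputs = model.generate(**inputs, max_new_tokens=128)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))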

Save the model

trainer.model.save_pretrained('se_flan-t5_v2')
tokenizer.save_pretrained('se_flan-t5_v2')
('se_flan-t5_v2/tokenizer_config.json',
 'se_flan-t5_v2/special_tokens_map.json',
 'se_flan-t5_v2/tokenizer.json')
!rm -r /content/drive/MyDrive/se; mkdir /content/drive/MyDrive/se
!cp -r se_flan-t5_v2 /content/drive/MyDrive/se
rm: cannot remove '/content/drive/MyDrive/se': No such file or directory
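In a later session the checkpoint can be loaded back from the Drive copy. A minimal sketch, assuming the cp above succeeded and the files live under the path shown:

# Reload the fine-tuned model and tokenizer from Drive
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

saved_dir = '/content/drive/MyDrive/se/se_flan-t5_v2'
tokenizer = AutoTokenizer.from_pretrained(saved_dir)
model = AutoModelForSeq2SeqLM.from_pretrained(saved_dir, device_map='auto')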