# Setup

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
!pip install -Uq transformers bitsandbytes accelerate datasets==2.0.0

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.2/7.2 MB[0m [31m109.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m97.1/97.1 MB[0m [31m9.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m227.6/227.6 kB[0m [31m26.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m325.5/325.5 kB[0m [31m36.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m110.5/110.5 kB[0m [31m14.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m212.5/212.5 kB[0m [31m25.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.3/134.3 kB[0m [31m18.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.0/1.0 MB[0m [31m58.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━

# Create dataset

In [3]:
from datasets import load_dataset

dataset = load_dataset("sedthh/ubuntu_dialogue_qa", split="train")



Downloading and preparing dataset parquet/sedthh--ubuntu_dialogue_qa to /root/.cache/huggingface/datasets/parquet/sedthh--ubuntu_dialogue_qa-8763539f847553a9/0.0.0/0b6d5799bb726b24ad7fc7be720c170d8e497f575d02d47537de9a5bac074901...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/2.16M [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Dataset parquet downloaded and prepared to /root/.cache/huggingface/datasets/parquet/sedthh--ubuntu_dialogue_qa-8763539f847553a9/0.0.0/0b6d5799bb726b24ad7fc7be720c170d8e497f575d02d47537de9a5bac074901. Subsequent calls will reuse this data.


# Define training components

In [4]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

model_name = 'google/flan-t5-large'
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name, device_map='auto')

Downloading (…)okenizer_config.json:   0%|          | 0.00/2.54k [00:00<?, ?B/s]

Downloading spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/662 [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/3.13G [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

In [5]:
def preprocess_function(sample):
    inputs = ['Answer the question: ' + item for item in sample['INSTRUCTION']]
    model_inputs = tokenizer(inputs, padding=True, truncation=True)
    labels = tokenizer(text_target=sample['RESPONSE'], padding=True, truncation=True)
    labels['input_ids'] = [[(l if l != tokenizer.pad_token_id else -100) for l in label] for label in labels['input_ids']]
    model_inputs['labels'] = labels['input_ids']
    return model_inputs

tokenized_dataset = dataset.map(preprocess_function, batched=True, remove_columns=['INSTRUCTION', 'RESPONSE', 'SOURCE', 'METADATA'])



  0%|          | 0/17 [00:00<?, ?ba/s]

In [6]:
from transformers import DataCollatorForSeq2Seq

data_collator = DataCollatorForSeq2Seq(
    tokenizer,
    model=model
)

# Training

In [7]:
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments

output_dir='flan-t5_large_v2'

training_args = Seq2SeqTrainingArguments(
    output_dir=output_dir,
	  auto_find_batch_size=True,
    num_train_epochs=3,
    logging_dir=f'{output_dir}/logs',
    save_steps=25000
)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_dataset
)
model.config.use_cache = False

In [8]:
trainer.train()

You're using a T5TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss


Step,Training Loss


Step,Training Loss
500,3.3207
1000,3.173
1500,3.1425
2000,3.0709
2500,3.0836
3000,3.0579
3500,3.0059
4000,3.0169
4500,3.0123
5000,2.9571


TrainOutput(global_step=24261, training_loss=2.8796593968087287, metrics={'train_runtime': 17543.4268, 'train_samples_per_second': 2.766, 'train_steps_per_second': 1.383, 'total_flos': 3.65470024955904e+16, 'train_loss': 2.8796593968087287, 'epoch': 3.0})

# Save the model

In [9]:
trainer.model.save_pretrained('se_flan-t5_v2')
tokenizer.save_pretrained('se_flan-t5_v2')

('se_flan-t5_v2/tokenizer_config.json',
 'se_flan-t5_v2/special_tokens_map.json',
 'se_flan-t5_v2/tokenizer.json')

In [10]:
!rm -r /content/drive/MyDrive/se; mkdir /content/drive/MyDrive/se
!cp -r se_flan-t5_v2 /content/drive/MyDrive/se

rm: cannot remove '/content/drive/MyDrive/se': No such file or directory
