Setup
from google.colab import drive
drive.mount('/content/drive')
Mounted at /content/drive
!pip install -Uq transformers bitsandbytes accelerate datasets==2.0.0
Create dataset
from datasets import load_dataset
dataset = load_dataset("sedthh/ubuntu_dialogue_qa", split="train")
WARNING:datasets.builder:Using custom data configuration sedthh--ubuntu_dialogue_qa-8763539f847553a9
Downloading and preparing dataset parquet/sedthh--ubuntu_dialogue_qa to /root/.cache/huggingface/datasets/parquet/sedthh--ubuntu_dialogue_qa-8763539f847553a9/0.0.0/0b6d5799bb726b24ad7fc7be720c170d8e497f575d02d47537de9a5bac074901...
Dataset parquet downloaded and prepared to /root/.cache/huggingface/datasets/parquet/sedthh--ubuntu_dialogue_qa-8763539f847553a9/0.0.0/0b6d5799bb726b24ad7fc7be720c170d8e497f575d02d47537de9a5bac074901. Subsequent calls will reuse this data.
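Before tokenizing, it helps to peek at the raw fields. A minimal sanity check, relying only on the INSTRUCTION and RESPONSE columns used below:
print(dataset)                       # expect columns INSTRUCTION, RESPONSE, SOURCE, METADATA
sample = dataset[0]
print(sample['INSTRUCTION'][:200])   # the question text
print(sample['RESPONSE'][:200])      # the reference answer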
Define training components
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
# Load the FLAN-T5 Large checkpoint; device_map='auto' lets Accelerate place it on the available GPU
model_name = 'google/flan-t5-large'
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name, device_map='auto')
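With the base checkpoint loaded, a quick zero-shot generation gives a reference point before fine-tuning. A minimal sketch; the prompt is a made-up example, not drawn from the dataset:
prompt = 'Answer the question: How do I check which Ubuntu version I am running?'  # hypothetical question
inputs = tokenizer(prompt, return_tensors='pt').to(model.device)
outputs = model.generate(**inputs, max_new_tokens=64)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))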
def preprocess_function(sample):
    # Prepend an instruction prefix to each question
    inputs = ['Answer the question: ' + item for item in sample['INSTRUCTION']]
    model_inputs = tokenizer(inputs, padding=True, truncation=True)
    # Tokenize the answers as the targets
    labels = tokenizer(text_target=sample['RESPONSE'], padding=True, truncation=True)
    # Replace padding token ids in the labels with -100 so the loss ignores them
    labels['input_ids'] = [[(l if l != tokenizer.pad_token_id else -100) for l in label] for label in labels['input_ids']]
    model_inputs['labels'] = labels['input_ids']
    return model_inputs
tokenized_dataset = dataset.map(preprocess_function, batched=True, remove_columns=['INSTRUCTION', 'RESPONSE', 'SOURCE', 'METADATA'])
WARNING:datasets.fingerprint:Parameter 'function'=<function preprocess_function at 0x7fe0df129ea0> of the transform datasets.arrow_dataset.Dataset._map_single couldn't be hashed properly, a random hash was used instead. Make sure your transforms and parameters are serializable with pickle or dill for the dataset fingerprinting and caching to work. If you reuse this transform, the caching mechanism will consider it to be different from the previous calls and recompute everything. This warning is only showed once. Subsequent hashing failures won't be showed.
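After the map call the text columns are gone and only token ids remain. A quick check of the resulting features (a sketch; exact lengths depend on the map batch each example landed in):
print(tokenized_dataset)             # expect input_ids, attention_mask, labels
example = tokenized_dataset[0]
print(len(example['input_ids']), len(example['labels']))
print(example['labels'][-5:])        # any padded label positions show up as -100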
from transformers import DataCollatorForSeq2Seq
data_collator = DataCollatorForSeq2Seq(
    tokenizer,
    model=model
)
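DataCollatorForSeq2Seq pads input_ids with the tokenizer's pad token and labels with -100 at batch time, so padded label positions are ignored by the loss; since preprocess_function already masked its per-batch padding, the collator here mostly converts the lists to tensors and evens out any remaining length differences within a batch. A quick sketch of one collated batch, using the first four tokenized examples:
# Collate a small batch by hand to inspect shapes and label masking
features = [tokenized_dataset[i] for i in range(4)]
batch = data_collator(features)
print(batch['input_ids'].shape, batch['labels'].shape)
print((batch['labels'] == -100).sum().item(), 'label positions will be ignored by the loss')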
Training
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments
output_dir='flan-t5_large_v2'
training_args = Seq2SeqTrainingArguments(
    output_dir=output_dir,
    auto_find_batch_size=True,   # halve the batch size on CUDA out-of-memory until training fits
    num_train_epochs=3,
    logging_dir=f'{output_dir}/logs',
    save_steps=25000
)
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_dataset
)
model.config.use_cache = False  # the decoder cache only helps at generation time; disable it for training
trainer.train()
/usr/local/lib/python3.10/dist-packages/transformers/optimization.py:411: FutureWarning: This implementation of AdamW is deprecated and will be removed in a future version. Use the PyTorch implementation torch.optim.AdamW instead, or set `no_deprecation_warning=True` to disable this warning
You're using a T5TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
(Two earlier, superseded progress displays showed totals of 6066 and 12132 steps: auto_find_batch_size halved the batch size twice after running out of GPU memory before settling on the final 24261-step run below.)
[24261/24261 4:52:22, Epoch 3/3]
Step | Training Loss |
---|---|
500 | 3.320700 |
1000 | 3.173000 |
1500 | 3.142500 |
2000 | 3.070900 |
2500 | 3.083600 |
3000 | 3.057900 |
3500 | 3.005900 |
4000 | 3.016900 |
4500 | 3.012300 |
5000 | 2.957100 |
5500 | 2.958700 |
6000 | 2.976900 |
6500 | 2.959700 |
7000 | 2.975100 |
7500 | 2.951100 |
8000 | 2.945200 |
8500 | 2.849600 |
9000 | 2.722500 |
9500 | 2.854400 |
10000 | 2.782700 |
10500 | 2.845900 |
11000 | 2.803300 |
11500 | 2.797800 |
12000 | 2.827400 |
12500 | 2.757800 |
13000 | 2.823600 |
13500 | 2.827200 |
14000 | 2.832400 |
14500 | 2.839800 |
15000 | 2.783400 |
15500 | 2.774200 |
16000 | 2.785500 |
16500 | 2.793600 |
17000 | 2.801200 |
17500 | 2.812800 |
18000 | 2.813300 |
18500 | 2.804800 |
19000 | 2.777800 |
19500 | 2.761100 |
20000 | 2.832700 |
20500 | 2.868300 |
21000 | 2.757300 |
21500 | 2.763300 |
22000 | 2.817900 |
22500 | 2.775300 |
23000 | 2.796400 |
23500 | 2.798700 |
24000 | 2.778000 |
TrainOutput(global_step=24261, training_loss=2.8796593968087287, metrics={'train_runtime': 17543.4268, 'train_samples_per_second': 2.766, 'train_steps_per_second': 1.383, 'total_flos': 3.65470024955904e+16, 'train_loss': 2.8796593968087287, 'epoch': 3.0})
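The reported metrics also let us back out the batch size that auto_find_batch_size settled on; a rough check using only the numbers from the TrainOutput above:
steps_per_epoch = 24261 / 3                                # 8087 optimizer steps per epoch
samples_per_step = 2.766 / 1.383                           # ≈ 2.0, the automatically chosen batch size
approx_num_examples = steps_per_epoch * samples_per_step   # ≈ 16174 training examples
print(steps_per_epoch, samples_per_step, approx_num_examples)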
Save the model
trainer.model.save_pretrained('se_flan-t5_v2')
tokenizer.save_pretrained('se_flan-t5_v2')
('se_flan-t5_v2/tokenizer_config.json', 'se_flan-t5_v2/special_tokens_map.json', 'se_flan-t5_v2/tokenizer.json')
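To confirm the saved files load cleanly, the checkpoint can be reloaded and queried. A minimal sketch with a hypothetical prompt:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
ft_tokenizer = AutoTokenizer.from_pretrained('se_flan-t5_v2')
ft_model = AutoModelForSeq2SeqLM.from_pretrained('se_flan-t5_v2', device_map='auto')
question = 'Answer the question: How do I update packages from the terminal?'  # hypothetical prompt
inputs = ft_tokenizer(question, return_tensors='pt').to(ft_model.device)
print(ft_tokenizer.decode(ft_model.generate(**inputs, max_new_tokens=64)[0], skip_special_tokens=True))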
!rm -rf /content/drive/MyDrive/se; mkdir /content/drive/MyDrive/se
!cp -r se_flan-t5_v2 /content/drive/MyDrive/se