JARVIS/nlg_train.ipynb

from transformers import (
    AutoModelForSeq2SeqLM,
    AutoTokenizer,
    DataCollatorForSeq2Seq,
    Seq2SeqTrainer,
    Seq2SeqTrainingArguments,
    pipeline,
)

from datasets import load_dataset

model_name = "google/umt5-small"
dataset = load_dataset(
    'csv',
    data_files='/kaggle/input/ngl-data/nlg_data.csv',
    split='train',
).train_test_split(test_size=0.1)
dataset
DatasetDict({
    train: Dataset({
        features: ['mr', 'ref'],
        num_rows: 18564
    })
    test: Dataset({
        features: ['mr', 'ref'],
        num_rows: 2063
    })
})
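A quick look at one row confirms the format: each example pairs a meaning representation (mr) such as dish[tatar], price[50] with a Polish reference sentence (ref). A minimal inspection sketch follows; the commented values are illustrative, not actual rows from nlg_data.csv, and passing seed= to train_test_split would additionally make the split reproducible.

# Inspect one training example; the exact values are illustrative.
sample = dataset["train"][0]
print(sample["mr"])   # e.g. "dish[tatar], price[50], ingredient[wolowina]"
print(sample["ref"])  # the Polish sentence verbalising that MR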
tokenizer = AutoTokenizer.from_pretrained(model_name)


def tokenize_samples(samples):
    inputs = [f"generate text: {mr}" for mr in samples["mr"]]

    tokenized_inputs = tokenizer(
        inputs,
        max_length=128,
        padding="max_length",
        truncation=True,
    )

    labels = tokenizer(
        text_target=samples["ref"],
        max_length=128,
        padding="max_length",
        truncation=True,
    )

    labels["input_ids"] = [
        [
            (token_id if token_id != tokenizer.pad_token_id else -100)
            for token_id in label
        ]
        for label in labels["input_ids"]
    ]

    tokenized_inputs["labels"] = labels["input_ids"]
    return tokenized_inputs
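
Replacing pad-token ids with -100 matters because PyTorch's cross-entropy loss ignores targets equal to its ignore_index (-100 by default), so padded label positions contribute nothing to the loss or its gradients. A self-contained check:

import torch

# -100 is the default ignore_index of CrossEntropyLoss, so the masked
# middle position below does not affect the averaged loss.
loss_fn = torch.nn.CrossEntropyLoss()
logits = torch.randn(3, 10)          # 3 target positions, vocabulary of 10
labels = torch.tensor([4, -100, 7])  # middle position is masked padding
print(loss_fn(logits, labels))       # mean over the 2 unmasked positions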


tokenized_dataset = dataset.map(
    tokenize_samples,
    batched=True,
    remove_columns=["mr", "ref"],
)

tokenized_dataset
DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 18564
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 2063
    })
})
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model, label_pad_token_id=-100, pad_to_multiple_of=8)
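Since tokenize_samples already pads everything to max_length, the collator has little padding left to do here; DataCollatorForSeq2Seq earns its keep with dynamic padding (no padding at tokenization time), padding each batch to its longest sequence and, via pad_to_multiple_of=8, to a multiple of 8 for tensor-core-friendly shapes. It also pads labels with label_pad_token_id=-100, consistent with the masking above. A sketch of how it assembles a batch:

# Collate two tokenized examples into a padded batch of tensors.
features = [tokenized_dataset["train"][i] for i in range(2)]
batch = data_collator(features)
print(batch["input_ids"].shape, batch["labels"].shape)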
training_args = Seq2SeqTrainingArguments(
    output_dir="/kaggle/working",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=16,
    predict_with_generate=True,
    learning_rate=5e-5,
    num_train_epochs=3,
    eval_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=1,
    load_best_model_at_end=True,
)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
)
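
With predict_with_generate=True the trainer decodes generated sequences at evaluation time, so it could also report a text-overlap metric alongside the loss. A hedged sketch of a compute_metrics hook using the evaluate library's sacrebleu metric (not part of this run; it would be passed to Seq2SeqTrainer via compute_metrics=compute_metrics):

import numpy as np
import evaluate  # assumes the evaluate and sacrebleu packages are installed

bleu = evaluate.load("sacrebleu")

def compute_metrics(eval_preds):
    preds, labels = eval_preds
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    # Restore pad tokens where labels were masked with -100 before decoding.
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    result = bleu.compute(predictions=decoded_preds,
                          references=[[ref] for ref in decoded_labels])
    return {"bleu": result["score"]}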
trainer.train()
[6963/6963 38:47, Epoch 3/3]
Epoch  Training Loss  Validation Loss
    1       0.732900         0.331611
    2       0.373100         0.246366
    3       0.326900         0.231167

There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight'].
TrainOutput(global_step=6963, training_loss=1.0388871652717377, metrics={'train_runtime': 2359.6292, 'train_samples_per_second': 23.602, 'train_steps_per_second': 2.951, 'total_flos': 7499132383002624.0, 'train_loss': 1.0388871652717377, 'epoch': 3.0})
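The missing-keys notice above is expected for T5-family models: the encoder and decoder embedding matrices are tied to the shared embedding table, so they are not stored separately in the checkpoint and are simply re-tied on load. A quick check, assuming the standard T5/umT5 module layout in transformers:

# Encoder/decoder embeddings are views of the shared embedding matrix.
print(model.encoder.embed_tokens.weight is model.shared.weight)  # expected: True
print(model.decoder.embed_tokens.weight is model.shared.weight)  # expected: True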
nlg = pipeline('summarization', model=model, tokenizer=tokenizer)
nlg('generate text: dish[tatar], price[50], ingredient[wolowina]')[0]['summary_text']
'Nie mamy tatar w menu. Cena wynosi 50. Składnik to owoce.'
nlg('generate text: payment_methods[gotowka], price[150], addresses[ulica Dluga 5]')[0]['summary_text']
'Nie obsługujemy płatności gotowka. Cena wynosi 150. Oczywiście, dostarczymy na ulica Dluga 5.'
nlg('generate text: dish[tiramisu], ingredient[mleko], allergy[laktoza]')[0]['summary_text']
'Nie mamy tiramisu w menu. Składnik mleko jest dostępny. Nie zawiera alergenu laktoza.'
nlg('generate text: time[dziesiata]')[0]['summary_text']
Your max_length is set to 20, but your input_length is only 10. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=5)
'Zamknięte o dziesiata.'
nlg('generate text: dish[spaghetti], ingredient[ser]')[0]['summary_text']
Your max_length is set to 20, but your input_length is only 14. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=7)
'Nie mamy spaghetti w menu. Składnik ser jest dostępny.'
nlg('generate text: dish[pierogi], ingredient[kozi ser]')[0]['summary_text']
Your max_length is set to 20, but your input_length is only 16. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=8)
'Nie mamy pierogi w menu. Składnik to koti ser.'
nlg('generate text: time[23:00], adres[ul Krótka 256]')[0]['summary_text']
'Zamknięte o 23:00. Nie dostarczamy na ulica Krótka 256.'
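The summarization pipeline does drive this Seq2SeqLM head, but it is a slight mismatch for data-to-text generation, hence the recurring max_length warnings above. A text2text-generation pipeline runs the same model without the summarization-specific length heuristics (sketch, reusing the model and tokenizer from this run):

nlg2 = pipeline('text2text-generation', model=model, tokenizer=tokenizer)
nlg2('generate text: dish[spaghetti], ingredient[ser]')[0]['generated_text']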
model.save_pretrained("/kaggle/working")
from kaggle_secrets import UserSecretsClient
user_secrets = UserSecretsClient()
secret_value_0 = user_secrets.get_secret("huggingface-write")
from huggingface_hub import login
login(secret_value_0)
The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /root/.cache/huggingface/token
Login successful
trainer.push_to_hub("filnow/nlg-umt5-pol")
CommitInfo(commit_url='https://huggingface.co/filnow/working/commit/72c855645f38e057804135cb1de549ce045e18ea', commit_message='filnow/nlg-umt5-pol', commit_description='', oid='72c855645f38e057804135cb1de549ce045e18ea', pr_url=None, pr_revision=None, pr_num=None)
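Note that Trainer.push_to_hub takes a commit message as its first positional argument, which is why the commit above landed in the filnow/working repo (named after output_dir) with 'filnow/nlg-umt5-pol' as the commit message; setting hub_model_id in Seq2SeqTrainingArguments is the supported way to target a specific repo. Also, since the trainer was built without a tokenizer argument, only the model and training artifacts were uploaded; pushing the tokenizer as well makes the repo loadable end-to-end (sketch, assuming the filnow/nlg-umt5-pol repo exists and the token has write access):

# Upload the tokenizer files so the hub repo is self-contained.
tokenizer.push_to_hub('filnow/nlg-umt5-pol')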
my_model = AutoModelForSeq2SeqLM.from_pretrained("filnow/nlg-umt5-pol")
my_nlg = pipeline('summarization', model=my_model, tokenizer=tokenizer)
my_nlg('generate text: time[23:00], adres[ul Krótka 256]')[0]['summary_text']
'Zamknięte o 23:00. Nie dostarczamy na ulica Krótka 256.'