from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM


def gen_train():
    """Yield (input, expected) pairs from the dev-A split."""
    with open('dev-A/in.tsv', 'r') as in_file, open('dev-A/expected.tsv') as exp_file:
        for line_1, line_2 in zip(in_file, exp_file):
            line_1 = line_1.rstrip()
            line_1_splitted_by_tab = line_1.split('\t')
            text = line_1_splitted_by_tab[-1]  # the last tab-separated field is the input text
            y_text = line_2.rstrip()
            yield {'x': text, 'y': y_text}


train_dataset = Dataset.from_generator(gen_train)

model_id = "google/flan-t5-base"

# Load the tokenizer of FLAN-T5-base
tokenizer = AutoTokenizer.from_pretrained(model_id)


def preprocess_function(sample, padding="max_length"):
    max_source_length = 100
    max_target_length = 100

    # Tokenize inputs
    inputs = [item for item in sample['x']]
    model_inputs = tokenizer(inputs, max_length=max_source_length, padding=padding, truncation=True)

    # Tokenize targets with the `text_target` keyword argument
    labels = tokenizer(text_target=sample["y"], max_length=max_target_length, padding=padding, truncation=True)

    # If we are padding here, replace all tokenizer.pad_token_id in the labels by -100
    # so that padding is ignored in the loss.
    if padding == "max_length":
        labels["input_ids"] = [
            [(l if l != tokenizer.pad_token_id else -100) for l in label]
            for label in labels["input_ids"]
        ]

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs


tokenized_dataset = train_dataset.map(preprocess_function, batched=True, remove_columns=["x", "y"])
# print(f"Keys of tokenized dataset: {list(tokenized_dataset.features)}")

# Load the model from the Hugging Face Hub
model = AutoModelForSeq2SeqLM.from_pretrained(model_id)

import evaluate
import nltk
import numpy as np
from nltk.tokenize import sent_tokenize

nltk.download("punkt")

# Metric
metric = evaluate.load("rouge")


# Helper function to postprocess text
def postprocess_text(preds, labels):
    preds = [pred.strip() for pred in preds]
    labels = [label.strip() for label in labels]

    # rougeLSum expects a newline after each sentence
    preds = ["\n".join(sent_tokenize(pred)) for pred in preds]
    labels = ["\n".join(sent_tokenize(label)) for label in labels]

    return preds, labels


def compute_metrics(eval_preds):
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)

    # Replace -100 in the labels, as we can't decode them.
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Some simple post-processing
    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)

    result = metric.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)
    result = {k: round(v * 100, 4) for k, v in result.items()}
    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in preds]
    result["gen_len"] = np.mean(prediction_lens)
    return result


from transformers import DataCollatorForSeq2Seq

# Ignore the tokenizer pad token in the loss
label_pad_token_id = -100

# Data collator
data_collator = DataCollatorForSeq2Seq(
    tokenizer,
    model=model,
    label_pad_token_id=label_pad_token_id,
    pad_to_multiple_of=8,
)

from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments

# Define training arguments
training_args = Seq2SeqTrainingArguments(
    output_dir='model',
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    predict_with_generate=True,
    fp16=False,  # overflows with fp16
    learning_rate=5e-5,
    num_train_epochs=5,
    # logging & evaluation strategies
    logging_strategy="steps",
    logging_steps=500,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=2,
    load_best_model_at_end=True,
    # metric_for_best_model="overall_f1",
)

# Create Trainer instance (the same dev-A split serves as both train and eval set here)
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_dataset,
    eval_dataset=tokenized_dataset,
    compute_metrics=compute_metrics,
)

# Start training
trainer.train()
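# --- Optional inference sketch (not part of the original script) ---
# A minimal example of generating predictions with the fine-tuned model after
# training. Since load_best_model_at_end=True, `model` holds the best checkpoint
# once trainer.train() returns. The output path 'dev-A/out.tsv' is an assumption
# for illustration; swap in the in.tsv/out.tsv pair of whichever split you want
# to predict on.
model.eval()
with open('dev-A/in.tsv', 'r') as in_file, open('dev-A/out.tsv', 'w') as out_file:
    for line in in_file:
        text = line.rstrip().split('\t')[-1]  # same field as used for training
        input_ids = tokenizer(
            text, return_tensors='pt', truncation=True, max_length=100
        ).input_ids.to(model.device)
        output_ids = model.generate(input_ids, max_new_tokens=100)
        prediction = tokenizer.decode(output_ids[0], skip_special_tokens=True)
        out_file.write(prediction + '\n')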