import os

import pandas as pd
from datasets import Dataset
from transformers import (
    AutoModelForSeq2SeqLM,
    AutoTokenizer,
    DataCollatorForSeq2Seq,
    Seq2SeqTrainer,
    Seq2SeqTrainingArguments,
)

translated_data_directory = 'translated_data'

# Merge all translated TSV files into a single data set.
dfs = []
# os.listdir() returns entries in arbitrary order; sort them so the resulting
# row order is reproducible between runs and machines.
for file_name in sorted(os.listdir(translated_data_directory)):
    if not file_name.endswith('.tsv'):
        continue
    file_path = os.path.join(translated_data_directory, file_name)
    df = pd.read_csv(file_path, sep='\t')
    # NOTE: despite its name, `df_user` holds the rows whose role is 'system';
    # the 'role' column is dropped because it is constant after filtering.
    df_user = df[df['role'] == 'system'].drop('role', axis=1)
    dfs.append(df_user)

if not dfs:
    # pd.concat([]) would fail with an opaque "No objects to concatenate".
    raise ValueError(
        f"No .tsv files found in {translated_data_directory!r}; nothing to train on."
    )

combined_df = pd.concat(dfs, ignore_index=True)

# A missing 'act' would be formatted into the prompt as a literal placeholder
# and a missing 'value_en' is not a string the tokenizer can encode, so rows
# lacking either field are dropped. The index is reset to a plain RangeIndex so
# that Dataset.from_pandas does not carry the old index along as a column.
rows_before = len(combined_df)
combined_df = combined_df.dropna(subset=['act', 'value_en']).reset_index(drop=True)
rows_dropped = rows_before - len(combined_df)
if rows_dropped:
    print(f"Dropped {rows_dropped} row(s) with missing 'act' or 'value_en'.")
if combined_df.empty:
    raise ValueError(
        f"No usable 'system' rows found in {translated_data_directory!r}; "
        "nothing to train on."
    )

# Prepare the data set for training.
dataset = Dataset.from_pandas(combined_df)

# Load the tokenizer.
model_name = "google/flan-t5-small"
tokenizer = AutoTokenizer.from_pretrained(model_name)


def tokenize_samples(samples):
    """Tokenize a batch of samples for sequence-to-sequence training.

    Args:
        samples: Batch mapping column names to lists of values; the columns
            "act" (model input) and "value_en" (target text) are read.

    Returns:
        The tokenizer output for the prompts, extended with a "labels" entry
        that holds the target token ids with padding positions set to -100.
    """
    inputs = [f"generate text: {act}" for act in samples["act"]]
    # NOTE(review): everything is padded to the full 128 tokens here, so the
    # data collator's dynamic padding has nothing left to pad.
    tokenized_inputs = tokenizer(
        inputs, max_length=128, padding="max_length", truncation=True
    )
    labels = tokenizer(
        samples["value_en"], max_length=128, padding="max_length", truncation=True
    )

    # Replace padding ids by -100, the same label padding value the data
    # collator is configured with, so padded positions are not used as targets.
    pad_id = tokenizer.pad_token_id
    tokenized_inputs["labels"] = [
        [(token_id if token_id != pad_id else -100) for token_id in label]
        for label in labels["input_ids"]
    ]
    return tokenized_inputs


# Tokenize the data.
tokenized_dataset = dataset.map(tokenize_samples, batched=True)

# Load the model.
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

# Data collator configuration.
data_collator = DataCollatorForSeq2Seq(
    tokenizer,
    model=model,
    label_pad_token_id=-100,
    pad_to_multiple_of=8,
)

# Training configuration.
training_args = Seq2SeqTrainingArguments(
    output_dir="./nlg_model",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=16,
    predict_with_generate=True,
    learning_rate=5e-5,
    num_train_epochs=10,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=None,  # Disable checkpoint rotation (keep every checkpoint)
    load_best_model_at_end=True,
)

# Trainer initialisation.
# NOTE(review): the evaluation set is the training set itself, so the "best"
# checkpoint restored by load_best_model_at_end is judged on data the model
# was trained on -- consider a held-out split.
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_dataset,
    eval_dataset=tokenized_dataset,
)

# Train the model.
trainer.train()

# Save the trained model together with its tokenizer.
trainer.save_model("./nlg_model")
tokenizer.save_pretrained("./nlg_model")