import os

import numpy as np
import pandas as pd
import evaluate
from datasets import load_dataset
from transformers import (
    AutoTokenizer,
    AutoModelForTokenClassification,
    TrainingArguments,
    Trainer,
    DataCollatorForTokenClassification,
)

# Load the CoNLL-2003 NER dataset and a cased BERT tokenizer.
dataset = load_dataset("conll2003", trust_remote_code=True)
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
label_list = dataset["train"].features["ner_tags"].feature.names


def tokenize_and_align_labels(examples):
    """Tokenize pre-split words and align NER labels with the subword tokens.

    Special tokens and all but the first subword of each word get the label
    -100, which the loss function ignores.
    """
    # No fixed padding here: DataCollatorForTokenClassification pads each
    # batch dynamically, which is cheaper than padding everything to max_length.
    tokenized_inputs = tokenizer(
        examples["tokens"], truncation=True, is_split_into_words=True
    )
    labels = []
    for i, label in enumerate(examples["ner_tags"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            if word_idx is None:
                label_ids.append(-100)  # special tokens ([CLS], [SEP])
            elif word_idx != previous_word_idx:
                label_ids.append(label[word_idx])  # first subword keeps the label
            else:
                label_ids.append(-100)  # remaining subwords are ignored
            previous_word_idx = word_idx
        labels.append(label_ids)
    tokenized_inputs["labels"] = labels
    return tokenized_inputs


tokenized_datasets = dataset.map(tokenize_and_align_labels, batched=True)
train_dataset = tokenized_datasets["train"]
eval_dataset = tokenized_datasets["validation"]

# Passing id2label/label2id gives the model readable entity names
# (B-PER, I-LOC, ...) instead of the default LABEL_0, LABEL_1, ...
model = AutoModelForTokenClassification.from_pretrained(
    "bert-base-cased",
    num_labels=len(label_list),
    id2label={i: l for i, l in enumerate(label_list)},
    label2id={l: i for i, l in enumerate(label_list)},
)
data_collator = DataCollatorForTokenClassification(tokenizer)

# datasets.load_metric has been removed; the seqeval metric now lives in the
# separate `evaluate` package. Load it once, not on every evaluation call.
seqeval = evaluate.load("seqeval")


def compute_metrics(p):
    """Compute entity-level metrics with seqeval.

    seqeval expects label *strings*, not integer ids, and the -100 positions
    (special tokens, continuation subwords) must be dropped before scoring.
    """
    predictions = np.argmax(p.predictions, axis=2)
    true_predictions = [
        [label_list[pred] for (pred, lab) in zip(prediction, label) if lab != -100]
        for prediction, label in zip(predictions, p.label_ids)
    ]
    true_labels = [
        [label_list[lab] for lab in label if lab != -100]
        for label in p.label_ids
    ]
    results = seqeval.compute(predictions=true_predictions, references=true_labels)
    return {
        "accuracy": results["overall_accuracy"],
        "f1": results["overall_f1"],
    }


training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",  # renamed to eval_strategy in newer transformers releases
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()
results = trainer.evaluate()
print("Evaluation results:", results)

# Predict on the validation set and map ids back to label names,
# again skipping the -100 positions that carry no label.
predictions, labels, _ = trainer.predict(eval_dataset)
predictions = np.argmax(predictions, axis=2)
true_labels = [
    [label_list[l] for l in label if l != -100] for label in labels
]
true_predictions = [
    [label_list[p] for (p, l) in zip(prediction, label) if l != -100]
    for prediction, label in zip(predictions, labels)
]

results_df = pd.DataFrame(
    {
        "tokens": eval_dataset["tokens"],
        "true_labels": true_labels,
        "predicted_labels": true_predictions,
    }
)
os.makedirs("mnt/data", exist_ok=True)  # ensure the output directory exists
results_df.to_csv("mnt/data/ner_results.csv", index=False)
print("NER analysis results saved to 'mnt/data/ner_results.csv'.")
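
# --- Optional follow-up: a minimal inference sketch (not part of the original
# script). It shows how the fine-tuned model could be applied to a new
# sentence with the transformers pipeline API; the example sentence is an
# assumption, and the readable entity names rely on the id2label mapping
# set when the model was created above.
from transformers import pipeline

ner = pipeline(
    "token-classification",
    model=model,
    tokenizer=tokenizer,
    aggregation_strategy="simple",  # merge subword pieces into word-level entities
)
print(ner("John Smith works for Acme Corp in London."))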