import os

import numpy as np
import pandas as pd
import evaluate  # load_metric was removed from datasets; seqeval now lives in the evaluate library
from datasets import load_dataset
from transformers import (
    AutoTokenizer,
    AutoModelForTokenClassification,
    TrainingArguments,
    Trainer,
    DataCollatorForTokenClassification,
)

# Load the CoNLL-2003 dataset (its loading script requires trust_remote_code)
dataset = load_dataset("conll2003", trust_remote_code=True)

# Load the tokenizer
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")

# The NER label names, e.g. O, B-PER, I-PER, B-ORG, ...
label_list = dataset["train"].features["ner_tags"].feature.names

# Tokenize the pre-split words and align the word-level NER tags with the subword tokens
def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(
        examples["tokens"],
        truncation=True,
        is_split_into_words=True,  # inputs are already split into words
    )
    labels = []
    for i, label in enumerate(examples["ner_tags"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)  # map each token to its word
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            if word_idx is None:
                label_ids.append(-100)  # special tokens are ignored by the loss
            elif word_idx != previous_word_idx:
                label_ids.append(label[word_idx])  # label only the first subword of each word
            else:
                label_ids.append(-100)  # ignore the remaining subwords
            previous_word_idx = word_idx
        labels.append(label_ids)
    tokenized_inputs["labels"] = labels
    return tokenized_inputs

# Tokenize the datasets
tokenized_datasets = dataset.map(tokenize_and_align_labels, batched=True)

# Training and evaluation splits
train_dataset = tokenized_datasets["train"]
eval_dataset = tokenized_datasets["validation"]

# Load the model with a token-classification head sized to the label set;
# the id2label/label2id mappings make checkpoints and predictions human-readable
id2label = {i: name for i, name in enumerate(label_list)}
label2id = {name: i for i, name in enumerate(label_list)}
model = AutoModelForTokenClassification.from_pretrained(
    "bert-base-cased",
    num_labels=len(label_list),
    id2label=id2label,
    label2id=label2id,
)

# Data collator pads each batch dynamically and pads labels with -100
data_collator = DataCollatorForTokenClassification(tokenizer)

# seqeval computes entity-level metrics and expects string tags, not IDs
seqeval = evaluate.load("seqeval")

def compute_metrics(p):
    predictions = np.argmax(p.predictions, axis=2)
    # Drop the -100 positions and convert IDs back to tag strings
    true_labels = [
        [label_list[l] for l in label if l != -100] for label in p.label_ids
    ]
    true_predictions = [
        [label_list[pred] for (pred, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, p.label_ids)
    ]
    results = seqeval.compute(predictions=true_predictions, references=true_labels)
    return {
        "accuracy": results["overall_accuracy"],
        "f1": results["overall_f1"],
    }

# Training arguments
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",  # renamed to eval_strategy in recent transformers versions
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
)

# Define the trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

# Train the model
trainer.train()

# Evaluate the model and print the results
results = trainer.evaluate()
print("Evaluation results:", results)

# Predict on the evaluation set
predictions, labels, _ = trainer.predict(eval_dataset)
predictions = np.argmax(predictions, axis=2)

# Convert the predictions and labels back to the original tag strings
true_labels = [[label_list[l] for l in label if l != -100] for label in labels]
true_predictions = [
    [label_list[p] for (p, l) in zip(prediction, label) if l != -100]
    for prediction, label in zip(predictions, labels)
]

# Collect the per-sentence results in a DataFrame
results_df = pd.DataFrame({
    "tokens": eval_dataset["tokens"],
    "true_labels": true_labels,
    "predicted_labels": true_predictions,
})

# Save the results to a CSV file
os.makedirs("mnt/data", exist_ok=True)  # make sure the output directory exists
results_df.to_csv("mnt/data/ner_results.csv", index=False)
print("The NER analysis results have been saved to 'mnt/data/ner_results.csv'.")
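
# --- Optional sanity check (a minimal sketch, not part of the original script) ---
# Runs the fine-tuned model through a token-classification pipeline on one
# arbitrary example sentence. This assumes the script above has just run, so
# `model` and `tokenizer` are still in memory; aggregation_strategy="simple"
# merges subword pieces back into whole entity spans using the id2label mapping
# configured on the model.
from transformers import pipeline

ner = pipeline(
    "token-classification",
    model=model,
    tokenizer=tokenizer,
    aggregation_strategy="simple",
)
print(ner("Hugging Face is based in New York City."))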