From f491aaac21b08f475012300aab23dadace76b19b Mon Sep 17 00:00:00 2001
From: s464967
Date: Tue, 11 Jun 2024 11:38:31 +0200
Subject: [PATCH] small fix

---
 main.py | 24 +++---------------------
 1 file changed, 3 insertions(+), 21 deletions(-)

diff --git a/main.py b/main.py
index 3ef995d..4b38da3 100644
--- a/main.py
+++ b/main.py
@@ -3,27 +3,23 @@ from datasets import load_dataset, load_metric
 from transformers import AutoTokenizer, AutoModelForTokenClassification, TrainingArguments, Trainer, DataCollatorForTokenClassification
 import numpy as np
 
-# Load the CoNLL-2003 dataset with trust_remote_code
 dataset = load_dataset("conll2003", trust_remote_code=True)
 
-# Load the tokenizer
 tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
 
-# Define label list and map labels to IDs
 label_list = dataset['train'].features['ner_tags'].feature.names
 
-# Tokenize and align labels function
 def tokenize_and_align_labels(examples):
     tokenized_inputs = tokenizer(examples['tokens'], truncation=True, padding='max_length', is_split_into_words=True)
     labels = []
     for i, label in enumerate(examples['ner_tags']):
-        word_ids = tokenized_inputs.word_ids(batch_index=i) # Map tokens to their respective word.
+        word_ids = tokenized_inputs.word_ids(batch_index=i)
         previous_word_idx = None
         label_ids = []
-        for word_idx in word_ids: # Set the special tokens to -100.
+        for word_idx in word_ids:
             if word_idx is None:
                 label_ids.append(-100)
-            elif word_idx != previous_word_idx: # Only label the first token of a given word.
+            elif word_idx != previous_word_idx:
                 label_ids.append(label[word_idx])
             else:
                 label_ids.append(-100)
@@ -32,20 +28,15 @@ def tokenize_and_align_labels(examples):
     tokenized_inputs["labels"] = labels
     return tokenized_inputs
 
-# Tokenize the datasets
 tokenized_datasets = dataset.map(tokenize_and_align_labels, batched=True)
 
-# Split the dataset into training and evaluation sets
 train_dataset = tokenized_datasets["train"]
 eval_dataset = tokenized_datasets["validation"]
 
-# Load the model
 model = AutoModelForTokenClassification.from_pretrained("bert-base-cased", num_labels=len(label_list))
 
-# Data collator for token classification
 data_collator = DataCollatorForTokenClassification(tokenizer)
 
-# Training arguments
 training_args = TrainingArguments(
     output_dir='./results',
     evaluation_strategy="epoch",
@@ -56,7 +47,6 @@ training_args = TrainingArguments(
     weight_decay=0.01,
 )
 
-# Define the trainer
 trainer = Trainer(
     model=model,
     args=training_args,
@@ -69,34 +59,26 @@ trainer = Trainer(
     },
 )
 
-# Train the model
 trainer.train()
-
-# Evaluate the model
 results = trainer.evaluate()
 
-# Print the results
 print("Evaluation results:", results)
 
-# Predict on the evaluation set
 predictions, labels, _ = trainer.predict(eval_dataset)
 predictions = np.argmax(predictions, axis=2)
 
-# Convert the predictions and labels to the original tags
 true_labels = [[label_list[l] for l in label if l != -100] for label in labels]
 true_predictions = [
     [label_list[p] for (p, l) in zip(prediction, label) if l != -100]
     for prediction, label in zip(predictions, labels)
 ]
 
-# Create a DataFrame for the results
 results_df = pd.DataFrame({
     'tokens': eval_dataset['tokens'],
     'true_labels': true_labels,
     'predicted_labels': true_predictions
 })
 
-# Save the results to a CSV file
 results_df.to_csv('mnt/data/ner_results.csv', index=False)
 print("The NER analysis results have been saved to 'mnt/data/ner_results.csv'.")
\ No newline at end of file