{ "cells": [ { "cell_type": "code", "execution_count": 17, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "Some weights of the model checkpoint at dbmdz/bert-large-cased-finetuned-conll03-english were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']\n", "- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).\n", "- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Sentence: CRICKET - LEICESTERSHIRE TAKE OVER AT TOP AFTER INNINGS VICTORY . LONDON 1996-08-30 West Indian all-rounder Phil\n", "Tokens: ['L', 'LONDON', 'West', 'Indian', 'Phil']\n", "Labels: ['I-PER', 'I-LOC', 'I-MISC', 'I-MISC', 'I-PER']\n" ] } ], "source": [ "from transformers import pipeline\n", "import pandas as pd\n", "import re\n", "from transformers import pipeline\n", "\n", "ner_pipeline = pipeline(\"ner\", model=\"dbmdz/bert-large-cased-finetuned-conll03-english\")\n", "\n", "input_text = \"CRICKET - LEICESTERSHIRE TAKE OVER AT TOP AFTER INNINGS VICTORY . LONDON 1996-08-30 West Indian all-rounder Phil\"\n", "\n", "def predict_and_combine(text):\n", " ner_results = ner_pipeline(text)\n", " combined_tokens = []\n", " combined_labels = []\n", " current_word = \"\"\n", " current_label = None\n", "\n", " for result in ner_results:\n", " token = result['word']\n", " label = result['entity']\n", " if token.startswith(\"##\"):\n", " current_word += token[2:]\n", " else:\n", " if current_word:\n", " combined_tokens.append(current_word)\n", " combined_labels.append(current_label)\n", " current_word = token\n", " current_label = label\n", "\n", " if current_word:\n", " combined_tokens.append(current_word)\n", " combined_labels.append(current_label)\n", "\n", " return combined_tokens, combined_labels\n", "\n", "tokens, labels = predict_and_combine(input_text)\n", "\n", "print(f\"Sentence: {input_text}\")\n", "print(\"Tokens:\", tokens)\n", "print(\"Labels:\", labels)\n" ] }, { "cell_type": "code", "execution_count": 18, "metadata": {}, "outputs": [], "source": [ "def find_word_starts(text):\n", " indices = [match.start() + 1 for match in re.finditer(r\"\\s\\S\", text)]\n", " if not text[0].isspace():\n", " indices.insert(0, 0)\n", " return sorted(indices)\n", "\n", "def find_word_start(text, index):\n", " while index > 0 and text[index - 1] != \" \":\n", " index -= 1\n", " return index\n", "\n", "def merge_wordpieces(ner_tokens, original_sentence):\n", " results = []\n", " for token in ner_tokens:\n", " if token['word'].startswith(\"##\") and results and token['start'] == results[-1]['end']:\n", " results[-1]['end'] = token['end']\n", " results[-1]['word'] += token['word'][2:]\n", " else:\n", " if results and not original_sentence[token['start'] - 1].isspace():\n", " results[-1]['end'] = token['end']\n", " results[-1]['word'] += token['word']\n", " else:\n", " token['start'] = find_word_start(original_sentence, token['start'])\n", " results.append(token)\n", " \n", " word_start_to_tag = {result['start']: result['entity'] for result in results}\n", " for index in find_word_starts(original_sentence):\n", " if index not in word_start_to_tag:\n", " word_start_to_tag[index] = \"O\"\n", " \n", " return [word_start_to_tag[index] for index in sorted(word_start_to_tag.keys())]\n", "\n", "def predict_and_merge(text):\n", " return ner_pipeline(text)" ] }, { "cell_type": "code", "execution_count": 20, "metadata": {}, "outputs": [], "source": [ "dev_data = pd.read_csv(\"dev-0/in.tsv\", sep=\"\\t\", names=[\"Text\"])\n", "dev_labels = pd.read_csv(\"dev-0/expected.tsv\", sep=\"\\t\", names=[\"Label\"])\n", "\n", "dev_data[\"NER_Results\"] = dev_data[\"Text\"].apply(predict_and_merge)\n", "processed_data = []\n", "\n", "for i, (model_out, raw_sentence) in enumerate(zip(dev_data[\"NER_Results\"], dev_data[\"Text\"])):\n", " merged_tokens = merge_wordpieces(model_out, raw_sentence)\n", " processed_line = \" \".join(merged_tokens)\n", " processed_data.append(processed_line)\n", " \n", " if len(merged_tokens) != len(raw_sentence.split()):\n", " raise AssertionError\n", "\n", "with open(\"dev-0/out_unprocessed.tsv\", \"w\", encoding=\"utf-8\") as f:\n", " for line in processed_data:\n", " f.write(f\"{line}\\n\")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Accuracy: 0.8418625244437885\n" ] } ], "source": [ "from sklearn.metrics import accuracy_score\n", "\n", "with open('dev-0/out.tsv', 'r') as file:\n", " predicted_labels = [line.strip().split()[1:] for line in file]\n", "\n", "with open('dev-0/expected.tsv', 'r') as file:\n", " true_labels = [line.strip().split()[1:] for line in file]\n", "\n", "predicted_labels = [label for sublist in predicted_labels for label in sublist]\n", "true_labels = [label for sublist in true_labels for label in sublist]\n", "\n", "accuracy = accuracy_score(true_labels, predicted_labels)\n", "print(\"Accuracy:\", accuracy)" ] }, { "cell_type": "code", "execution_count": 21, "metadata": {}, "outputs": [], "source": [ "dev_data = pd.read_csv(\"test-A/in.tsv\", sep=\"\\t\", names=[\"Text\"])\n", "\n", "dev_data[\"NER_Results\"] = dev_data[\"Text\"].apply(predict_and_merge)\n", "processed_data = []\n", "\n", "for i, (model_out, raw_sentence) in enumerate(zip(dev_data[\"NER_Results\"], dev_data[\"Text\"])):\n", " merged_tokens = merge_wordpieces(model_out, raw_sentence)\n", " processed_line = \" \".join(merged_tokens)\n", " processed_data.append(processed_line)\n", " \n", " if len(merged_tokens) != len(raw_sentence.split()):\n", " raise AssertionError\n", "\n", "with open(\"test-A/out_unprocessed.tsv\", \"w\", encoding=\"utf-8\") as f:\n", " for line in processed_data:\n", " f.write(f\"{line}\\n\")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.10.14" } }, "nbformat": 4, "nbformat_minor": 2 }