Transformer-Ner/Transformer.ipynb

1112 lines
37 KiB
Plaintext
Raw Permalink Normal View History

2024-06-06 22:10:16 +02:00
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import csv\n",
"import lzma\n",
"\n",
"import pandas as pd\n",
"from transformers import pipeline\n",
"\n",
"# Options shared by every challenge TSV file. QUOTE_NONE makes pandas treat double\n",
"# quotes as ordinary text; with the default quoting a field that starts with '\"'\n",
"# is parsed as a quoted CSV field, which can drop characters or merge rows and so\n",
"# shift the character offsets / token counts the NER mapping relies on.\n",
"TSV_OPTIONS = dict(delimiter='\\t', header=None, quoting=csv.QUOTE_NONE)\n",
"\n",
"# Training data (not used by the zero-shot pipeline below, kept for reference).\n",
"with lzma.open(\"train/train.tsv.xz\", \"rt\", encoding=\"utf-8\") as f:\n",
"    train_data = pd.read_csv(f, **TSV_OPTIONS)\n",
"\n",
"# dev-0: input documents (column 0) and their gold label sequences.\n",
"in_data_dev0 = pd.read_csv('dev-0/in.tsv', **TSV_OPTIONS)\n",
"expected_data_dev0 = pd.read_csv('dev-0/expected.tsv', **TSV_OPTIONS)\n",
"\n",
"# test-A input must come from its own directory, not from dev-0.\n",
"in_data_testA = pd.read_csv('test-A/in.tsv', **TSV_OPTIONS)\n",
"# Gold labels for test-A may not be available locally; fall back to None then.\n",
"try:\n",
"    expected_data_testA = pd.read_csv('test-A/expected.tsv', **TSV_OPTIONS)\n",
"except FileNotFoundError:\n",
"    expected_data_testA = None\n"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"No model was supplied, defaulted to dbmdz/bert-large-cased-finetuned-conll03-english and revision f2482bf (https://huggingface.co/dbmdz/bert-large-cased-finetuned-conll03-english).\n",
"Using a pipeline without specifying a model name and revision in production is not recommended.\n",
"/home/wmi/miniconda3/envs/pbr/lib/python3.12/site-packages/huggingface_hub/file_download.py:1132: FutureWarning: `resume_download` is deprecated and will be removed in version 1.0.0. Downloads always resume when possible. If you want to force a new download, use `force_download=True`.\n",
" warnings.warn(\n",
"Some weights of the model checkpoint at dbmdz/bert-large-cased-finetuned-conll03-english were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']\n",
"- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).\n",
"- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n",
"/home/wmi/miniconda3/envs/pbr/lib/python3.12/site-packages/transformers/pipelines/token_classification.py:168: UserWarning: `grouped_entities` is deprecated and will be removed in version v5.0.0, defaulted to `aggregation_strategy=\"AggregationStrategy.SIMPLE\"` instead.\n",
" warnings.warn(\n"
]
},
{
"data": {
"text/plain": [
"[[{'entity_group': 'ORG',\n",
" 'score': 0.34655723,\n",
" 'word': 'L',\n",
" 'start': 10,\n",
" 'end': 11},\n",
" {'entity_group': 'MISC',\n",
" 'score': 0.38114035,\n",
" 'word': '##A',\n",
" 'start': 26,\n",
" 'end': 27},\n",
" {'entity_group': 'LOC',\n",
" 'score': 0.88899577,\n",
" 'word': 'LONDON',\n",
" 'start': 71,\n",
" 'end': 77},\n",
" {'entity_group': 'MISC',\n",
" 'score': 0.9981057,\n",
" 'word': 'West Indian',\n",
" 'start': 94,\n",
" 'end': 105},\n",
" {'entity_group': 'PER',\n",
" 'score': 0.99973667,\n",
" 'word': 'Phil Simmons',\n",
" 'start': 118,\n",
" 'end': 130},\n",
" {'entity_group': 'ORG',\n",
" 'score': 0.99539727,\n",
" 'word': 'Leicestershire',\n",
" 'start': 161,\n",
" 'end': 175},\n",
" {'entity_group': 'ORG',\n",
" 'score': 0.997735,\n",
" 'word': 'Somerset',\n",
" 'start': 181,\n",
" 'end': 189},\n",
" {'entity_group': 'ORG',\n",
" 'score': 0.9995547,\n",
" 'word': 'Essex',\n",
" 'start': 351,\n",
" 'end': 356},\n",
" {'entity_group': 'ORG',\n",
" 'score': 0.9992822,\n",
" 'word': 'Derbyshire',\n",
" 'start': 359,\n",
" 'end': 369},\n",
" {'entity_group': 'ORG',\n",
" 'score': 0.9993387,\n",
" 'word': 'Surrey',\n",
" 'start': 374,\n",
" 'end': 380},\n",
" {'entity_group': 'ORG',\n",
" 'score': 0.9993369,\n",
" 'word': 'Kent',\n",
" 'start': 412,\n",
" 'end': 416},\n",
" {'entity_group': 'ORG',\n",
" 'score': 0.99557805,\n",
" 'word': 'Nottinghamshire',\n",
" 'start': 476,\n",
" 'end': 491},\n",
" {'entity_group': 'ORG',\n",
" 'score': 0.99760664,\n",
" 'word': 'Somerset',\n",
" 'start': 513,\n",
" 'end': 521},\n",
" {'entity_group': 'LOC',\n",
" 'score': 0.99563384,\n",
" 'word': 'Grace Road',\n",
" 'start': 559,\n",
" 'end': 569},\n",
" {'entity_group': 'ORG',\n",
" 'score': 0.9976046,\n",
" 'word': 'Leicestershire',\n",
" 'start': 572,\n",
" 'end': 586},\n",
" {'entity_group': 'LOC',\n",
" 'score': 0.9994592,\n",
" 'word': 'England',\n",
" 'start': 664,\n",
" 'end': 671},\n",
" {'entity_group': 'PER',\n",
" 'score': 0.9892952,\n",
" 'word': 'Andy Caddick',\n",
" 'start': 680,\n",
" 'end': 692},\n",
" {'entity_group': 'ORG',\n",
" 'score': 0.9989544,\n",
" 'word': 'Somerset',\n",
" 'start': 738,\n",
" 'end': 746},\n",
" {'entity_group': 'PER',\n",
" 'score': 0.9996531,\n",
" 'word': 'Simmons',\n",
" 'start': 796,\n",
" 'end': 803},\n",
" {'entity_group': 'ORG',\n",
" 'score': 0.9993154,\n",
" 'word': 'Essex',\n",
" 'start': 849,\n",
" 'end': 854},\n",
" {'entity_group': 'PER',\n",
" 'score': 0.99977463,\n",
" 'word': 'Nasser Hussain',\n",
" 'start': 911,\n",
" 'end': 925},\n",
" {'entity_group': 'PER',\n",
" 'score': 0.9996419,\n",
" 'word': 'Peter Such',\n",
" 'start': 930,\n",
" 'end': 940},\n",
" {'entity_group': 'ORG',\n",
" 'score': 0.9935796,\n",
" 'word': 'Yorkshire',\n",
" 'start': 986,\n",
" 'end': 995},\n",
" {'entity_group': 'LOC',\n",
" 'score': 0.99555916,\n",
" 'word': 'Headingley',\n",
" 'start': 999,\n",
" 'end': 1009},\n",
" {'entity_group': 'PER',\n",
" 'score': 0.99976915,\n",
" 'word': 'Hussain',\n",
" 'start': 1017,\n",
" 'end': 1024},\n",
" {'entity_group': 'LOC',\n",
" 'score': 0.999605,\n",
" 'word': 'England',\n",
" 'start': 1049,\n",
" 'end': 1056},\n",
" {'entity_group': 'ORG',\n",
" 'score': 0.99933463,\n",
" 'word': 'Essex',\n",
" 'start': 1146,\n",
" 'end': 1151},\n",
" {'entity_group': 'ORG',\n",
" 'score': 0.99699366,\n",
" 'word': 'Yorkshire',\n",
" 'start': 1220,\n",
" 'end': 1229},\n",
" {'entity_group': 'PER',\n",
" 'score': 0.9988619,\n",
" 'word': 'Such',\n",
" 'start': 1286,\n",
" 'end': 1290},\n",
" {'entity_group': 'LOC',\n",
" 'score': 0.9991185,\n",
" 'word': 'Oval',\n",
" 'start': 1425,\n",
" 'end': 1429},\n",
" {'entity_group': 'ORG',\n",
" 'score': 0.9990688,\n",
" 'word': 'Surrey',\n",
" 'start': 1432,\n",
" 'end': 1438},\n",
" {'entity_group': 'PER',\n",
" 'score': 0.9997764,\n",
" 'word': 'Chris Lewis',\n",
" 'start': 1447,\n",
" 'end': 1458},\n",
" {'entity_group': 'LOC',\n",
" 'score': 0.99957126,\n",
" 'word': 'England',\n",
" 'start': 1483,\n",
" 'end': 1490},\n",
" {'entity_group': 'ORG',\n",
" 'score': 0.9987552,\n",
" 'word': 'Warwickshire',\n",
" 'start': 1616,\n",
" 'end': 1628},\n",
" {'entity_group': 'ORG',\n",
" 'score': 0.7200186,\n",
" 'word': 'S',\n",
" 'start': 1633,\n",
" 'end': 1634},\n",
" {'entity_group': 'LOC',\n",
" 'score': 0.9987576,\n",
" 'word': 'England',\n",
" 'start': 1658,\n",
" 'end': 1665},\n",
" {'entity_group': 'PER',\n",
" 'score': 0.9997127,\n",
" 'word': 'Mark Butcher',\n",
" 'start': 1674,\n",
" 'end': 1686},\n",
" {'entity_group': 'ORG',\n",
" 'score': 0.9978284,\n",
" 'word': 'Surrey',\n",
" 'start': 1702,\n",
" 'end': 1708},\n",
" {'entity_group': 'ORG',\n",
" 'score': 0.5290484,\n",
" 'word': 'S',\n",
" 'start': 1753,\n",
" 'end': 1754},\n",
" {'entity_group': 'ORG',\n",
" 'score': 0.9982393,\n",
" 'word': 'Derbyshire',\n",
" 'start': 1756,\n",
" 'end': 1766},\n",
" {'entity_group': 'ORG',\n",
" 'score': 0.999025,\n",
" 'word': 'Worcestershire',\n",
" 'start': 1842,\n",
" 'end': 1856},\n",
" {'entity_group': 'ORG',\n",
" 'score': 0.5895682,\n",
" 'word': 'S',\n",
" 'start': 1955,\n",
" 'end': 1956},\n",
" {'entity_group': 'MISC',\n",
" 'score': 0.9989274,\n",
" 'word': 'Australian',\n",
" 'start': 1958,\n",
" 'end': 1968},\n",
" {'entity_group': 'PER',\n",
" 'score': 0.9996978,\n",
" 'word': 'Tom Moody',\n",
" 'start': 1969,\n",
" 'end': 1978},\n",
" {'entity_group': 'PER',\n",
" 'score': 0.99972975,\n",
" 'word': 'Chris Adams',\n",
" 'start': 1999,\n",
" 'end': 2010},\n",
" {'entity_group': 'PER',\n",
" 'score': 0.9994934,\n",
" 'word': 'Tim O',\n",
" 'start': 2023,\n",
" 'end': 2028},\n",
" {'entity_group': 'PER',\n",
" 'score': 0.5700783,\n",
" 'word': 'Go',\n",
" 'start': 2029,\n",
" 'end': 2031},\n",
" {'entity_group': 'ORG',\n",
" 'score': 0.9984907,\n",
" 'word': 'Derbyshire',\n",
" 'start': 2049,\n",
" 'end': 2059},\n",
" {'entity_group': 'ORG',\n",
" 'score': 0.4096144,\n",
" 'word': 'S',\n",
" 'start': 2103,\n",
" 'end': 2104}],\n",
" [{'entity_group': 'MISC',\n",
" 'score': 0.46102658,\n",
" 'word': '##IC',\n",
" 'start': 2,\n",
" 'end': 4},\n",
" {'entity_group': 'MISC',\n",
" 'score': 0.97070557,\n",
" 'word': 'ENGL',\n",
" 'start': 10,\n",
" 'end': 14},\n",
" {'entity_group': 'MISC',\n",
" 'score': 0.7318856,\n",
" 'word': '##H COUNTY CHAM',\n",
" 'start': 16,\n",
" 'end': 29},\n",
" {'entity_group': 'MISC',\n",
" 'score': 0.66476506,\n",
" 'word': '##H',\n",
" 'start': 34,\n",
" 'end': 35},\n",
" {'entity_group': 'LOC',\n",
" 'score': 0.95440763,\n",
" 'word': 'LONDON',\n",
" 'start': 52,\n",
" 'end': 58},\n",
" {'entity_group': 'MISC',\n",
" 'score': 0.99895835,\n",
" 'word': 'English',\n",
" 'start': 110,\n",
" 'end': 117},\n",
" {'entity_group': 'LOC',\n",
" 'score': 0.99821544,\n",
" 'word': 'Leicester',\n",
" 'start': 163,\n",
" 'end': 172},\n",
" {'entity_group': 'ORG',\n",
" 'score': 0.99364024,\n",
" 'word': 'Leicestershire',\n",
" 'start': 175,\n",
" 'end': 189},\n",
" {'entity_group': 'ORG',\n",
" 'score': 0.99720675,\n",
" 'word': 'Somerset',\n",
" 'start': 195,\n",
" 'end': 203},\n",
" {'entity_group': 'ORG',\n",
" 'score': 0.9979146,\n",
" 'word': 'Somerset',\n",
" 'start': 237,\n",
" 'end': 245},\n",
" {'entity_group': 'PER',\n",
" 'score': 0.96438843,\n",
" 'word': 'P. Simmons',\n",
" 'start': 259,\n",
" 'end': 269},\n",
" {'entity_group': 'ORG',\n",
" 'score': 0.9983664,\n",
" 'word': 'Leicestershire',\n",
" 'start': 279,\n",
" 'end': 293},\n",
" {'entity_group': 'ORG',\n",
" 'score': 0.998063,\n",
" 'word': 'Leicestershire',\n",
" 'start': 305,\n",
" 'end': 319},\n",
" {'entity_group': 'ORG',\n",
" 'score': 0.99934703,\n",
" 'word': 'Somerset',\n",
" 'start': 332,\n",
" 'end': 340},\n",
" {'entity_group': 'LOC',\n",
" 'score': 0.96631926,\n",
" 'word': 'Chester - le - Street',\n",
" 'start': 350,\n",
" 'end': 367},\n",
" {'entity_group': 'ORG',\n",
" 'score': 0.9994879,\n",
" 'word': 'Glamorgan',\n",
" 'start': 370,\n",
" 'end': 379},\n",
" {'entity_group': 'PER',\n",
" 'score': 0.9771094,\n",
" 'word': 'A. Dale',\n",
" 'start': 394,\n",
" 'end': 401},\n",
" {'entity_group': 'PER',\n",
" 'score': 0.95903397,\n",
" 'word': 'H. Morris',\n",
" 'start': 407,\n",
" 'end': 416},\n",
" {'entity_group': 'PER',\n",
" 'score': 0.95786047,\n",
" 'word': 'D. Blenkiron',\n",
" 'start': 422,\n",
" 'end': 434},\n",
" {'entity_group': 'ORG',\n",
" 'score': 0.9991248,\n",
" 'word': 'Durham',\n",
" 'start': 444,\n",
" 'end': 450},\n",
" {'entity_group': 'PER',\n",
" 'score': 0.96577376,\n",
" 'word': 'S. Watkin',\n",
" 'start': 457,\n",
" 'end': 466},\n",
" {'entity_group': 'LOC',\n",
" 'score': 0.99620086,\n",
" 'word': 'Tunbridge Wells',\n",
" 'start': 490,\n",
" 'end': 505},\n",
" {'entity_group': 'ORG',\n",
" 'score': 0.999209,\n",
" 'word': 'Nottinghamshire',\n",
" 'start': 508,\n",
" 'end': 523},\n",
" {'entity_group': 'PER',\n",
" 'score': 0.94234514,\n",
" 'word': 'P. Johnson',\n",
" 'start': 530,\n",
" 'end': 540},\n",
" {'entity_group': 'PER',\n",
" 'score': 0.99028325,\n",
" 'word': 'M. McCague',\n",
" 'start': 546,\n",
" 'end': 556},\n",
" {'entity_group': 'ORG',\n",
" 'score': 0.99935454,\n",
" 'word': 'Kent',\n",
" 'start': 566,\n",
" 'end': 570},\n",
" {'entity_group': 'LOC',\n",
" 'score': 0.9988416,\n",
" 'word': 'London',\n",
" 'start': 584,\n",
" 'end': 590},\n",
" {'entity_group': 'LOC',\n",
" 'score': 0.9966791,\n",
" 'word': 'The Oval',\n",
" 'start': 593,\n",
" 'end': 601},\n",
" {'entity_group': 'ORG',\n",
" 'score': 0.99928087,\n",
" 'word': 'Warwickshire',\n",
" 'start': 606,\n",
" 'end': 618},\n",
" {'entity_group': 'ORG',\n",
" 'score': 0.9995234,\n",
" 'word': 'Surrey',\n",
" 'start': 625,\n",
" 'end': 631},\n",
" {'entity_group': 'PER',\n",
" 'score': 0.9265669,\n",
" 'word': 'C. Lewis',\n",
" 'start': 640,\n",
" 'end': 648},\n",
" {'entity_group': 'PER',\n",
" 'score': 0.98573065,\n",
" 'word': 'M. Butcher',\n",
" 'start': 662,\n",
" 'end': 672},\n",
" {'entity_group': 'PER',\n",
" 'score': 0.99570525,\n",
" 'word': 'G. Kersey',\n",
" 'start': 678,\n",
" 'end': 687},\n",
" {'entity_group': 'PER',\n",
" 'score': 0.98831654,\n",
" 'word': 'J. Ratcliffe',\n",
" 'start': 693,\n",
" 'end': 705},\n",
" {'entity_group': 'PER',\n",
" 'score': 0.9866266,\n",
" 'word': 'D. Bicknell',\n",
" 'start': 711,\n",
" 'end': 722},\n",
" {'entity_group': 'LOC',\n",
" 'score': 0.9714904,\n",
" 'word': 'Hove',\n",
" 'start': 735,\n",
" 'end': 739},\n",
" {'entity_group': 'ORG',\n",
" 'score': 0.99935216,\n",
" 'word': 'Sussex',\n",
" 'start': 742,\n",
" 'end': 748},\n",
" {'entity_group': 'PER',\n",
" 'score': 0.97753423,\n",
" 'word': 'W. Athey',\n",
" 'start': 755,\n",
" 'end': 763},\n",
" {'entity_group': 'PER',\n",
" 'score': 0.9889897,\n",
" 'word': 'V. Drakes',\n",
" 'start': 770,\n",
" 'end': 779},\n",
" {'entity_group': 'PER',\n",
" 'score': 0.9931459,\n",
" 'word': 'I. Austin',\n",
" 'start': 785,\n",
" 'end': 794},\n",
" {'entity_group': 'ORG',\n",
" 'score': 0.99931324,\n",
" 'word': 'Lancashire',\n",
" 'start': 804,\n",
" 'end': 814},\n",
" {'entity_group': 'PER',\n",
" 'score': 0.97633815,\n",
" 'word': 'W. Hegg',\n",
" 'start': 823,\n",
" 'end': 830},\n",
" {'entity_group': 'LOC',\n",
" 'score': 0.9942986,\n",
" 'word': 'Portsmouth',\n",
" 'start': 841,\n",
" 'end': 851},\n",
" {'entity_group': 'ORG',\n",
" 'score': 0.99923646,\n",
" 'word': 'Middlesex',\n",
" 'start': 854,\n",
" 'end': 863},\n",
" {'entity_group': 'PER',\n",
" 'score': 0.97500485,\n",
" 'word': 'J. Pooley',\n",
" 'start': 878,\n",
" 'end': 887},\n",
" {'entity_group': 'PER',\n",
" 'score': 0.97731185,\n",
" 'word': 'M. Ramprakash',\n",
" 'start': 894,\n",
" 'end': 907},\n",
" {'entity_group': 'PER',\n",
" 'score': 0.9795357,\n",
" 'word': 'M. Gatting',\n",
" 'start': 914,\n",
" 'end': 924},\n",
" {'entity_group': 'ORG',\n",
" 'score': 0.99938464,\n",
" 'word': 'Hampshire',\n",
" 'start': 932,\n",
" 'end': 941},\n",
" {'entity_group': 'LOC',\n",
" 'score': 0.9905847,\n",
" 'word': 'Chesterfield',\n",
" 'start': 963,\n",
" 'end': 975},\n",
" {'entity_group': 'ORG',\n",
" 'score': 0.99893564,\n",
" 'word': 'Worcestershire',\n",
" 'start': 978,\n",
" 'end': 992},\n",
" {'entity_group': 'ORG',\n",
" 'score': 0.9993286,\n",
" 'word': 'Derbyshire',\n",
" 'start': 1009,\n",
" 'end': 1019},\n",
" {'entity_group': 'PER',\n",
" 'score': 0.96917397,\n",
" 'word': 'J. Adams',\n",
" 'start': 1026,\n",
" 'end': 1034},\n",
" {'entity_group': 'PER',\n",
" 'score': 0.9931639,\n",
" 'word': \"T. O ' Gorman\",\n",
" 'start': 1041,\n",
" 'end': 1051},\n",
" {'entity_group': 'PER',\n",
" 'score': 0.97878116,\n",
" 'word': 'K. Barnett',\n",
" 'start': 1066,\n",
" 'end': 1076},\n",
" {'entity_group': 'PER',\n",
" 'score': 0.9884977,\n",
" 'word': 'T. Moody',\n",
" 'start': 1082,\n",
" 'end': 1090},\n",
" {'entity_group': 'LOC',\n",
" 'score': 0.99594116,\n",
" 'word': 'Bristol',\n",
" 'start': 1103,\n",
" 'end': 1110},\n",
" {'entity_group': 'ORG',\n",
" 'score': 0.9994455,\n",
" 'word': 'Gloucestershire',\n",
" 'start': 1113,\n",
" 'end': 1128},\n",
" {'entity_group': 'PER',\n",
" 'score': 0.999629,\n",
" 'word': 'J',\n",
" 'start': 1145,\n",
" 'end': 1146},\n",
" {'entity_group': 'PER',\n",
" 'score': 0.9994917,\n",
" 'word': 'Russell',\n",
" 'start': 1148,\n",
" 'end': 1155},\n",
" {'entity_group': 'LOC',\n",
" 'score': 0.94236225,\n",
" 'word': 'Northamptonshire',\n",
" 'start': 1171,\n",
" 'end': 1187},\n",
" {'entity_group': 'PER',\n",
" 'score': 0.99586487,\n",
" 'word': 'K',\n",
" 'start': 1194,\n",
" 'end': 1195},\n",
" {'entity_group': 'PER',\n",
" 'score': 0.9998486,\n",
" 'word': 'Curran',\n",
" 'start': 1197,\n",
" 'end': 1203},\n",
" {'entity_group': 'PER',\n",
" 'score': 0.9988618,\n",
" 'word': 'A',\n",
" 'start': 1209,\n",
" 'end': 1210},\n",
" {'entity_group': 'PER',\n",
" 'score': 0.9765073,\n",
" 'word': 'Smith',\n",
" 'start': 1212,\n",
" 'end': 1217},\n",
" {'entity_group': 'PER',\n",
" 'score': 0.998814,\n",
" 'word': 'S',\n",
" 'start': 1229,\n",
" 'end': 1230}],\n",
" [{'entity_group': 'LOC',\n",
" 'score': 0.96220237,\n",
" 'word': 'LONDON',\n",
" 'start': 39,\n",
" 'end': 45},\n",
" {'entity_group': 'LOC',\n",
" 'score': 0.9997117,\n",
" 'word': 'Australia',\n",
" 'start': 62,\n",
" 'end': 71},\n",
" {'entity_group': 'MISC',\n",
" 'score': 0.9900663,\n",
" 'word': 'Ashes',\n",
" 'start': 88,\n",
" 'end': 93},\n",
" {'entity_group': 'LOC',\n",
" 'score': 0.99974364,\n",
" 'word': 'England',\n",
" 'start': 128,\n",
" 'end': 135},\n",
" {'entity_group': 'ORG',\n",
" 'score': 0.9373318,\n",
" 'word': 'Test and County Cricket Board',\n",
" 'start': 201,\n",
" 'end': 230},\n",
" {'entity_group': 'LOC',\n",
" 'score': 0.9997092,\n",
" 'word': 'Australia',\n",
" 'start': 258,\n",
" 'end': 267},\n",
" {'entity_group': 'MISC',\n",
" 'score': 0.52167755,\n",
" 'word': 'S',\n",
" 'start': 440,\n",
" 'end': 441},\n",
" {'entity_group': 'MISC',\n",
" 'score': 0.99905914,\n",
" 'word': 'English',\n",
" 'start': 443,\n",
" 'end': 450},\n",
" {'entity_group': 'MISC',\n",
" 'score': 0.9493715,\n",
" 'word': 'British',\n",
" 'start': 484,\n",
" 'end': 491},\n",
" {'entity_group': 'ORG',\n",
" 'score': 0.66527855,\n",
" 'word': 'Minor Counties',\n",
" 'start': 551,\n",
" 'end': 565},\n",
" {'entity_group': 'LOC',\n",
" 'score': 0.7435069,\n",
" 'word': 'S',\n",
" 'start': 572,\n",
" 'end': 573},\n",
" {'entity_group': 'LOC',\n",
" 'score': 0.9995771,\n",
" 'word': 'Scotland',\n",
" 'start': 575,\n",
" 'end': 583},\n",
" {'entity_group': 'LOC',\n",
" 'score': 0.99926525,\n",
" 'word': 'London',\n",
" 'start': 639,\n",
" 'end': 645},\n",
" {'entity_group': 'LOC',\n",
" 'score': 0.97566634,\n",
" 'word': \"Lord ' s\",\n",
" 'start': 670,\n",
" 'end': 677},\n",
" {'entity_group': 'ORG',\n",
" 'score': 0.98920923,\n",
" 'word': \"Duke of Norfolk ' s XI\",\n",
" 'start': 692,\n",
" 'end': 713},\n",
" {'entity_group': 'LOC',\n",
" 'score': 0.98959255,\n",
" 'word': 'Arundel',\n",
" 'start': 719,\n",
" 'end': 726},\n",
" {'entity_group': 'ORG',\n",
" 'score': 0.9956369,\n",
" 'word': 'Northampton',\n",
" 'start': 743,\n",
" 'end': 754},\n",
" {'entity_group': 'ORG',\n",
" 'score': 0.9987895,\n",
" 'word': 'Worcestershire',\n",
" 'start': 769,\n",
" 'end': 783},\n",
" {'entity_group': 'ORG',\n",
" 'score': 0.9977308,\n",
" 'word': 'Durham',\n",
" 'start': 798,\n",
" 'end': 804},\n",
" {'entity_group': 'LOC',\n",
" 'score': 0.99277717,\n",
" 'word': 'Headingley',\n",
" 'start': 850,\n",
" 'end': 860},\n",
" {'entity_group': 'LOC',\n",
" 'score': 0.93061477,\n",
" 'word': 'Leeds',\n",
" 'start': 868,\n",
" 'end': 873},\n",
" {'entity_group': 'LOC',\n",
" 'score': 0.9936111,\n",
" 'word': 'The Oval',\n",
" 'start': 922,\n",
" 'end': 930},\n",
" {'entity_group': 'LOC',\n",
" 'score': 0.9980288,\n",
" 'word': 'London',\n",
" 'start': 938,\n",
" 'end': 944},\n",
" {'entity_group': 'LOC',\n",
" 'score': 0.9947086,\n",
" 'word': \"Lord ' s\",\n",
" 'start': 992,\n",
" 'end': 999},\n",
" {'entity_group': 'LOC',\n",
" 'score': 0.99907875,\n",
" 'word': 'London',\n",
" 'start': 1002,\n",
" 'end': 1008},\n",
" {'entity_group': 'ORG',\n",
" 'score': 0.9987871,\n",
" 'word': 'Gloucestershire',\n",
" 'start': 1028,\n",
" 'end': 1043},\n",
" {'entity_group': 'ORG',\n",
" 'score': 0.9982962,\n",
" 'word': 'Sussex',\n",
" 'start': 1047,\n",
" 'end': 1053},\n",
" {'entity_group': 'ORG',\n",
" 'score': 0.9983968,\n",
" 'word': 'Surrey',\n",
" 'start': 1057,\n",
" 'end': 1063},\n",
" {'entity_group': 'ORG',\n",
" 'score': 0.9983108,\n",
" 'word': 'Derbyshire',\n",
" 'start': 1107,\n",
" 'end': 1117},\n",
" {'entity_group': 'LOC',\n",
" 'score': 0.9901608,\n",
" 'word': 'Edgbaston',\n",
" 'start': 1179,\n",
" 'end': 1188},\n",
" {'entity_group': 'LOC',\n",
" 'score': 0.99937767,\n",
" 'word': 'Birmingham',\n",
" 'start': 1191,\n",
" 'end': 1201},\n",
" {'entity_group': 'LOC',\n",
" 'score': 0.5605512,\n",
" 'word': 'S',\n",
" 'start': 1265,\n",
" 'end': 1266},\n",
" {'entity_group': 'ORG',\n",
" 'score': 0.9908428,\n",
" 'word': 'Leicestershire',\n",
" 'start': 1281,\n",
" 'end': 1295},\n",
" {'entity_group': 'LOC',\n",
" 'score': 0.7768232,\n",
" 'word': 'Lord',\n",
" 'start': 1344,\n",
" 'end': 1348},\n",
" {'entity_group': 'MISC',\n",
" 'score': 0.6473316,\n",
" 'word': 'British',\n",
" 'start': 1372,\n",
" 'end': 1379},\n",
" {'entity_group': 'ORG',\n",
" 'score': 0.9781204,\n",
" 'word': 'Oxford',\n",
" 'start': 1398,\n",
" 'end': 1404},\n",
" {'entity_group': 'ORG',\n",
" 'score': 0.99740857,\n",
" 'word': 'Hampshire',\n",
" 'start': 1438,\n",
" 'end': 1447},\n",
" {'entity_group': 'ORG',\n",
" 'score': 0.93818957,\n",
" 'word': 'Old',\n",
" 'start': 1503,\n",
" 'end': 1506},\n",
" {'entity_group': 'LOC',\n",
" 'score': 0.60427374,\n",
" 'word': '##rafford',\n",
" 'start': 1508,\n",
" 'end': 1515},\n",
" {'entity_group': 'LOC',\n",
" 'score': 0.88743263,\n",
" 'word': 'Manchester',\n",
" 'start': 1518,\n",
" 'end': 1528},\n",
" {'entity_group': 'ORG',\n",
" 'score': 0.6398575,\n",
" 'word': 'Minor Counties XI',\n",
" 'start': 1545,\n",
" 'end': 1562},\n",
" {'entity_group': 'LOC',\n",
" 'score': 0.9948643,\n",
" 'word': 'Scotland',\n",
" 'start': 1578,\n",
" 'end': 1586},\n",
" {'entity_group': 'ORG',\n",
" 'score': 0.9956975,\n",
" 'word': 'Glamorgan',\n",
" 'start': 1605,\n",
" 'end': 1614}],\n",
" [{'entity_group': 'PER',\n",
" 'score': 0.64593357,\n",
" 'word': 'SHEARER',\n",
" 'start': 9,\n",
" 'end': 16},\n",
" {'entity_group': 'LOC',\n",
" 'score': 0.9937872,\n",
" 'word': 'ENGLAND',\n",
" 'start': 26,\n",
" 'end': 33},\n",
" {'entity_group': 'LOC',\n",
" 'score': 0.98532915,\n",
" 'word': 'LONDON',\n",
" 'start': 49,\n",
" 'end': 55},\n",
" {'entity_group': 'PER',\n",
" 'score': 0.99965554,\n",
" 'word': 'Alan Shearer',\n",
" 'start': 106,\n",
" 'end': 118},\n",
" {'entity_group': 'LOC',\n",
" 'score': 0.99965096,\n",
" 'word': 'England',\n",
" 'start': 140,\n",
" 'end': 147},\n",
" {'entity_group': 'ORG',\n",
" 'score': 0.99851114,\n",
" 'word': 'Newcastle',\n",
" 'start': 202,\n",
" 'end': 211},\n",
" {'entity_group': 'PER',\n",
" 'score': 0.99972355,\n",
" 'word': 'Tony Adams',\n",
" 'start': 280,\n",
" 'end': 290},\n",
" {'entity_group': 'MISC',\n",
" 'score': 0.99899167,\n",
" 'word': 'European',\n",
" 'start': 321,\n",
" 'end': 329},\n",
" {'entity_group': 'PER',\n",
" 'score': 0.99968123,\n",
" 'word': 'David Platt',\n",
" 'start': 372,\n",
" 'end': 383},\n",
" {'entity_group': 'PER',\n",
" 'score': 0.99977285,\n",
" 'word': 'Adams',\n",
" 'start': 391,\n",
" 'end': 396},\n",
" {'entity_group': 'PER',\n",
" 'score': 0.99800897,\n",
" 'word': 'Platt',\n",
" 'start': 401,\n",
" 'end': 406},\n",
" {'entity_group': 'LOC',\n",
" 'score': 0.99970406,\n",
" 'word': 'England',\n",
" 'start': 438,\n",
" 'end': 445},\n",
" {'entity_group': 'MISC',\n",
" 'score': 0.99456084,\n",
" 'word': 'World Cup',\n",
" 'start': 457,\n",
" 'end': 466},\n",
" {'entity_group': 'LOC',\n",
" 'score': 0.9998385,\n",
" 'word': 'Moldova',\n",
" 'start': 485,\n",
" 'end': 492},\n",
" {'entity_group': 'PER',\n",
" 'score': 0.9992155,\n",
" 'word': 'Shearer',\n",
" 'start': 510,\n",
" 'end': 517},\n",
" {'entity_group': 'PER',\n",
" 'score': 0.9990761,\n",
" 'word': 'Glenn Hoddle',\n",
" 'start': 571,\n",
" 'end': 583},\n",
" {'entity_group': 'ORG',\n",
" 'score': 0.9991636,\n",
" 'word': 'Blackburn',\n",
" 'start': 621,\n",
" 'end': 630},\n",
" {'entity_group': 'ORG',\n",
" 'score': 0.99904674,\n",
" 'word': 'Southampton',\n",
" 'start': 635,\n",
" 'end': 646},\n",
" {'entity_group': 'PER',\n",
" 'score': 0.9996246,\n",
" 'word': 'Alan',\n",
" 'start': 745,\n",
" 'end': 749},\n",
" {'entity_group': 'PER',\n",
" 'score': 0.9964572,\n",
" 'word': 'Hoddle',\n",
" 'start': 777,\n",
" 'end': 783},\n",
" {'entity_group': 'PER',\n",
" 'score': 0.9996996,\n",
" 'word': 'Alan',\n",
" 'start': 873,\n",
" 'end': 877},\n",
" {'entity_group': 'PER',\n",
" 'score': 0.99759614,\n",
" 'word': 'Shearer',\n",
" 'start': 1172,\n",
" 'end': 1179},\n",
" {'entity_group': 'MISC',\n",
" 'score': 0.9978334,\n",
" 'word': 'Euro 96',\n",
" 'start': 1183,\n",
" 'end': 1190},\n",
" {'entity_group': 'PER',\n",
" 'score': 0.99211884,\n",
" 'word': 'Teddy Sheringham',\n",
" 'start': 1208,\n",
" 'end': 1224},\n",
" {'entity_group': 'PER',\n",
" 'score': 0.99672574,\n",
" 'word': 'Shearer',\n",
" 'start': 1313,\n",
" 'end': 1320},\n",
" {'entity_group': 'ORG',\n",
" 'score': 0.9980933,\n",
" 'word': 'Newcastle',\n",
" 'start': 1324,\n",
" 'end': 1333},\n",
" {'entity_group': 'PER',\n",
" 'score': 0.9995302,\n",
" 'word': 'Les Ferdinand',\n",
" 'start': 1344,\n",
" 'end': 1357}],\n",
" [{'entity_group': 'MISC',\n",
" 'score': 0.6943369,\n",
" 'word': 'IN',\n",
" 'start': 13,\n",
" 'end': 15},\n",
" {'entity_group': 'LOC',\n",
" 'score': 0.9792612,\n",
" 'word': 'BELGRADE',\n",
" 'start': 52,\n",
" 'end': 60},\n",
" {'entity_group': 'MISC',\n",
" 'score': 0.6702504,\n",
" 'word': 'S',\n",
" 'start': 106,\n",
" 'end': 107},\n",
" {'entity_group': 'ORG',\n",
" 'score': 0.9988783,\n",
" 'word': 'Red Star',\n",
" 'start': 148,\n",
" 'end': 156},\n",
" {'entity_group': 'LOC',\n",
" 'score': 0.99980396,\n",
" 'word': 'Yugoslavia',\n",
" 'start': 159,\n",
" 'end': 169},\n",
" {'entity_group': 'ORG',\n",
" 'score': 0.99583733,\n",
" 'word': 'Dinamo',\n",
" 'start': 177,\n",
" 'end': 183},\n",
" {'entity_group': 'LOC',\n",
" 'score': 0.99982125,\n",
" 'word': 'Russia',\n",
" 'start': 186,\n",
" 'end': 192}]]"
]
},
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Load the NER pipeline. The checkpoint and revision are pinned to the ones the\n",
"# unpinned default resolved to, and `aggregation_strategy` replaces the deprecated\n",
"# `grouped_entities=True` flag (which was mapped to the \"simple\" strategy).\n",
"ner_pipeline = pipeline(\n",
"    \"ner\",\n",
"    model=\"dbmdz/bert-large-cased-finetuned-conll03-english\",\n",
"    revision=\"f2482bf\",\n",
"    aggregation_strategy=\"simple\",\n",
")\n",
"\n",
"# Run the pipeline over the input documents from in.tsv\n",
"ner_results_dev0 = ner_pipeline(in_data_dev0[0].tolist())\n",
"ner_results_testA = ner_pipeline(in_data_testA[0].tolist())\n",
"\n",
"# Show a sample of the results\n",
"ner_results_dev0[:5]"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Accuracy - dev-0: 94.88%\n"
]
}
],
"source": [
"def _word_spans(sentence):\n",
"    \"\"\"Return the (start, end) character offsets of each whitespace-separated word.\"\"\"\n",
"    spans = []\n",
"    cursor = 0\n",
"    for word in sentence.split():\n",
"        # Words come back in order and only whitespace lies between them, so\n",
"        # searching from the previous word's end finds this word's own occurrence.\n",
"        start = sentence.index(word, cursor)\n",
"        cursor = start + len(word)\n",
"        spans.append((start, cursor))\n",
"    return spans\n",
"\n",
"\n",
"def map_ner_results(ner_results, sentences):\n",
"    \"\"\"Map NER pipeline entities onto per-word B-XXX / I-XXX / O labels.\n",
"\n",
"    Args:\n",
"        ner_results: one list of entity dicts per sentence; each dict is read for\n",
"            its 'entity_group', 'start' and 'end' (character offsets) keys.\n",
"        sentences: the input strings the entities were extracted from, in the\n",
"            same order as `ner_results`.\n",
"\n",
"    Returns:\n",
"        A list with one label list per sentence, holding exactly one label for\n",
"        every whitespace-separated word of that sentence.\n",
"    \"\"\"\n",
"    ner_labels = []\n",
"\n",
"    for sentence, entities in zip(sentences, ner_results):\n",
"        spans = _word_spans(sentence)\n",
"        labels = ['O'] * len(spans)\n",
"\n",
"        for entity in entities:\n",
"            entity_label = entity['entity_group']\n",
"            entity_start = entity['start']\n",
"            entity_end = entity['end']\n",
"            prefix = 'B'\n",
"\n",
"            for idx, (word_start, word_end) in enumerate(spans):\n",
"                if word_start >= entity_end:\n",
"                    break  # spans are ordered; nothing further can overlap\n",
"                if word_end <= entity_start:\n",
"                    continue  # word lies entirely before the entity\n",
"\n",
"                if labels[idx] != 'O':\n",
"                    # The word was already claimed by an earlier entity (typically a\n",
"                    # sub-word fragment of the same mention). Keep its label; when the\n",
"                    # type matches, the remaining words continue that mention.\n",
"                    if labels[idx][2:] == entity_label:\n",
"                        prefix = 'I'\n",
"                    continue\n",
"\n",
"                labels[idx] = f'{prefix}-{entity_label}'\n",
"                prefix = 'I'\n",
"\n",
"        ner_labels.append(labels)\n",
"    return ner_labels\n",
"\n",
"predicted_labels_dev0 = map_ner_results(ner_results_dev0, in_data_dev0[0].tolist())\n",
"# test-A entities must be aligned against the test-A sentences they came from.\n",
"predicted_labels_testA = map_ner_results(ner_results_testA, in_data_testA[0].tolist())\n",
"\n",
"predicted_strings_dev0 = [' '.join(labels) for labels in predicted_labels_dev0]\n",
"predicted_strings_testA = [' '.join(labels) for labels in predicted_labels_testA]\n",
"expected_strings_dev0 = expected_data_dev0[0].tolist()\n",
"\n",
"# Write one space-separated label sequence per input line.\n",
"with open('dev-0/out.tsv', 'w') as f:\n",
"    f.writelines(line + '\\n' for line in predicted_strings_dev0)\n",
"\n",
"with open('test-A/out.tsv', 'w') as f:\n",
"    f.writelines(line + '\\n' for line in predicted_strings_testA)\n",
"\n",
"# Token-level agreement between predictions and the dev-0 gold labels\n",
"correct = 0\n",
"total = 0\n",
"for pred, exp in zip(predicted_strings_dev0, expected_strings_dev0):\n",
"    pred_labels = pred.split()\n",
"    exp_labels = exp.split()\n",
"    correct += sum(p == e for p, e in zip(pred_labels, exp_labels))\n",
"    # Count missing or surplus labels as errors instead of silently truncating;\n",
"    # identical to the pairwise count whenever both sequences have equal length.\n",
"    total += max(len(pred_labels), len(exp_labels))\n",
"\n",
"# Guard against an empty evaluation set.\n",
"accuracy = correct / total if total else 0.0\n",
"print(f\"Accuracy - dev-0: {accuracy:.2%}\")"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.3"
}
},
"nbformat": 4,
"nbformat_minor": 4
}