{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Named-entity recognition with a pretrained Transformers pipeline\n",
    "\n",
    "Runs a Hugging Face `ner` pipeline over `dev-0/in.tsv` and `test-A/in.tsv`, writes the predicted `B-XXX` / `I-XXX` / `O` labels to `out.tsv` in each directory, and reports token-level accuracy on `dev-0`.\n",
    "\n",
    "## Load the data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import lzma\n",
    "\n",
    "import pandas as pd\n",
    "from transformers import pipeline\n",
    "\n",
    "# The training split is loaded but not used by the later cells (the NER model is pretrained).\n",
    "with lzma.open(\"train/train.tsv.xz\", \"rt\", encoding=\"utf-8\") as f:\n",
    "    train_data = pd.read_csv(f, delimiter='\\t', header=None)\n",
    "\n",
    "in_data_dev0 = pd.read_csv('dev-0/in.tsv', delimiter='\\t', header=None)\n",
    "expected_data_dev0 = pd.read_csv('dev-0/expected.tsv', delimiter='\\t', header=None)\n",
    "\n",
    "# Read the test-A inputs from test-A itself (not from dev-0). Gold labels are only\n",
    "# loaded for dev-0, which is the one split this notebook evaluates.\n",
    "in_data_testA = pd.read_csv('test-A/in.tsv', delimiter='\\t', header=None)\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Run the NER pipeline"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "No model was supplied, defaulted to dbmdz/bert-large-cased-finetuned-conll03-english and revision f2482bf (https://huggingface.co/dbmdz/bert-large-cased-finetuned-conll03-english).\n",
      "Using a pipeline without specifying a model name and revision in production is not recommended.\n",
      "/home/wmi/miniconda3/envs/pbr/lib/python3.12/site-packages/huggingface_hub/file_download.py:1132: FutureWarning: `resume_download` is deprecated and will be removed in version 1.0.0. Downloads always resume when possible. If you want to force a new download, use `force_download=True`.\n",
      " warnings.warn(\n",
      "Some weights of the model checkpoint at dbmdz/bert-large-cased-finetuned-conll03-english were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']\n",
      "- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. 
initializing a BertForSequenceClassification model from a BertForPreTraining model).\n", "- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n", "/home/wmi/miniconda3/envs/pbr/lib/python3.12/site-packages/transformers/pipelines/token_classification.py:168: UserWarning: `grouped_entities` is deprecated and will be removed in version v5.0.0, defaulted to `aggregation_strategy=\"AggregationStrategy.SIMPLE\"` instead.\n", " warnings.warn(\n" ] }, { "data": { "text/plain": [ "[[{'entity_group': 'ORG',\n", " 'score': 0.34655723,\n", " 'word': 'L',\n", " 'start': 10,\n", " 'end': 11},\n", " {'entity_group': 'MISC',\n", " 'score': 0.38114035,\n", " 'word': '##A',\n", " 'start': 26,\n", " 'end': 27},\n", " {'entity_group': 'LOC',\n", " 'score': 0.88899577,\n", " 'word': 'LONDON',\n", " 'start': 71,\n", " 'end': 77},\n", " {'entity_group': 'MISC',\n", " 'score': 0.9981057,\n", " 'word': 'West Indian',\n", " 'start': 94,\n", " 'end': 105},\n", " {'entity_group': 'PER',\n", " 'score': 0.99973667,\n", " 'word': 'Phil Simmons',\n", " 'start': 118,\n", " 'end': 130},\n", " {'entity_group': 'ORG',\n", " 'score': 0.99539727,\n", " 'word': 'Leicestershire',\n", " 'start': 161,\n", " 'end': 175},\n", " {'entity_group': 'ORG',\n", " 'score': 0.997735,\n", " 'word': 'Somerset',\n", " 'start': 181,\n", " 'end': 189},\n", " {'entity_group': 'ORG',\n", " 'score': 0.9995547,\n", " 'word': 'Essex',\n", " 'start': 351,\n", " 'end': 356},\n", " {'entity_group': 'ORG',\n", " 'score': 0.9992822,\n", " 'word': 'Derbyshire',\n", " 'start': 359,\n", " 'end': 369},\n", " {'entity_group': 'ORG',\n", " 'score': 0.9993387,\n", " 'word': 'Surrey',\n", " 'start': 374,\n", " 'end': 380},\n", " {'entity_group': 'ORG',\n", " 'score': 0.9993369,\n", " 'word': 'Kent',\n", " 'start': 412,\n", " 'end': 416},\n", " 
{'entity_group': 'ORG',\n", " 'score': 0.99557805,\n", " 'word': 'Nottinghamshire',\n", " 'start': 476,\n", " 'end': 491},\n", " {'entity_group': 'ORG',\n", " 'score': 0.99760664,\n", " 'word': 'Somerset',\n", " 'start': 513,\n", " 'end': 521},\n", " {'entity_group': 'LOC',\n", " 'score': 0.99563384,\n", " 'word': 'Grace Road',\n", " 'start': 559,\n", " 'end': 569},\n", " {'entity_group': 'ORG',\n", " 'score': 0.9976046,\n", " 'word': 'Leicestershire',\n", " 'start': 572,\n", " 'end': 586},\n", " {'entity_group': 'LOC',\n", " 'score': 0.9994592,\n", " 'word': 'England',\n", " 'start': 664,\n", " 'end': 671},\n", " {'entity_group': 'PER',\n", " 'score': 0.9892952,\n", " 'word': 'Andy Caddick',\n", " 'start': 680,\n", " 'end': 692},\n", " {'entity_group': 'ORG',\n", " 'score': 0.9989544,\n", " 'word': 'Somerset',\n", " 'start': 738,\n", " 'end': 746},\n", " {'entity_group': 'PER',\n", " 'score': 0.9996531,\n", " 'word': 'Simmons',\n", " 'start': 796,\n", " 'end': 803},\n", " {'entity_group': 'ORG',\n", " 'score': 0.9993154,\n", " 'word': 'Essex',\n", " 'start': 849,\n", " 'end': 854},\n", " {'entity_group': 'PER',\n", " 'score': 0.99977463,\n", " 'word': 'Nasser Hussain',\n", " 'start': 911,\n", " 'end': 925},\n", " {'entity_group': 'PER',\n", " 'score': 0.9996419,\n", " 'word': 'Peter Such',\n", " 'start': 930,\n", " 'end': 940},\n", " {'entity_group': 'ORG',\n", " 'score': 0.9935796,\n", " 'word': 'Yorkshire',\n", " 'start': 986,\n", " 'end': 995},\n", " {'entity_group': 'LOC',\n", " 'score': 0.99555916,\n", " 'word': 'Headingley',\n", " 'start': 999,\n", " 'end': 1009},\n", " {'entity_group': 'PER',\n", " 'score': 0.99976915,\n", " 'word': 'Hussain',\n", " 'start': 1017,\n", " 'end': 1024},\n", " {'entity_group': 'LOC',\n", " 'score': 0.999605,\n", " 'word': 'England',\n", " 'start': 1049,\n", " 'end': 1056},\n", " {'entity_group': 'ORG',\n", " 'score': 0.99933463,\n", " 'word': 'Essex',\n", " 'start': 1146,\n", " 'end': 1151},\n", " {'entity_group': 'ORG',\n", " 
'score': 0.99699366,\n", " 'word': 'Yorkshire',\n", " 'start': 1220,\n", " 'end': 1229},\n", " {'entity_group': 'PER',\n", " 'score': 0.9988619,\n", " 'word': 'Such',\n", " 'start': 1286,\n", " 'end': 1290},\n", " {'entity_group': 'LOC',\n", " 'score': 0.9991185,\n", " 'word': 'Oval',\n", " 'start': 1425,\n", " 'end': 1429},\n", " {'entity_group': 'ORG',\n", " 'score': 0.9990688,\n", " 'word': 'Surrey',\n", " 'start': 1432,\n", " 'end': 1438},\n", " {'entity_group': 'PER',\n", " 'score': 0.9997764,\n", " 'word': 'Chris Lewis',\n", " 'start': 1447,\n", " 'end': 1458},\n", " {'entity_group': 'LOC',\n", " 'score': 0.99957126,\n", " 'word': 'England',\n", " 'start': 1483,\n", " 'end': 1490},\n", " {'entity_group': 'ORG',\n", " 'score': 0.9987552,\n", " 'word': 'Warwickshire',\n", " 'start': 1616,\n", " 'end': 1628},\n", " {'entity_group': 'ORG',\n", " 'score': 0.7200186,\n", " 'word': 'S',\n", " 'start': 1633,\n", " 'end': 1634},\n", " {'entity_group': 'LOC',\n", " 'score': 0.9987576,\n", " 'word': 'England',\n", " 'start': 1658,\n", " 'end': 1665},\n", " {'entity_group': 'PER',\n", " 'score': 0.9997127,\n", " 'word': 'Mark Butcher',\n", " 'start': 1674,\n", " 'end': 1686},\n", " {'entity_group': 'ORG',\n", " 'score': 0.9978284,\n", " 'word': 'Surrey',\n", " 'start': 1702,\n", " 'end': 1708},\n", " {'entity_group': 'ORG',\n", " 'score': 0.5290484,\n", " 'word': 'S',\n", " 'start': 1753,\n", " 'end': 1754},\n", " {'entity_group': 'ORG',\n", " 'score': 0.9982393,\n", " 'word': 'Derbyshire',\n", " 'start': 1756,\n", " 'end': 1766},\n", " {'entity_group': 'ORG',\n", " 'score': 0.999025,\n", " 'word': 'Worcestershire',\n", " 'start': 1842,\n", " 'end': 1856},\n", " {'entity_group': 'ORG',\n", " 'score': 0.5895682,\n", " 'word': 'S',\n", " 'start': 1955,\n", " 'end': 1956},\n", " {'entity_group': 'MISC',\n", " 'score': 0.9989274,\n", " 'word': 'Australian',\n", " 'start': 1958,\n", " 'end': 1968},\n", " {'entity_group': 'PER',\n", " 'score': 0.9996978,\n", " 'word': 'Tom 
Moody',\n", " 'start': 1969,\n", " 'end': 1978},\n", " {'entity_group': 'PER',\n", " 'score': 0.99972975,\n", " 'word': 'Chris Adams',\n", " 'start': 1999,\n", " 'end': 2010},\n", " {'entity_group': 'PER',\n", " 'score': 0.9994934,\n", " 'word': 'Tim O',\n", " 'start': 2023,\n", " 'end': 2028},\n", " {'entity_group': 'PER',\n", " 'score': 0.5700783,\n", " 'word': 'Go',\n", " 'start': 2029,\n", " 'end': 2031},\n", " {'entity_group': 'ORG',\n", " 'score': 0.9984907,\n", " 'word': 'Derbyshire',\n", " 'start': 2049,\n", " 'end': 2059},\n", " {'entity_group': 'ORG',\n", " 'score': 0.4096144,\n", " 'word': 'S',\n", " 'start': 2103,\n", " 'end': 2104}],\n", " [{'entity_group': 'MISC',\n", " 'score': 0.46102658,\n", " 'word': '##IC',\n", " 'start': 2,\n", " 'end': 4},\n", " {'entity_group': 'MISC',\n", " 'score': 0.97070557,\n", " 'word': 'ENGL',\n", " 'start': 10,\n", " 'end': 14},\n", " {'entity_group': 'MISC',\n", " 'score': 0.7318856,\n", " 'word': '##H COUNTY CHAM',\n", " 'start': 16,\n", " 'end': 29},\n", " {'entity_group': 'MISC',\n", " 'score': 0.66476506,\n", " 'word': '##H',\n", " 'start': 34,\n", " 'end': 35},\n", " {'entity_group': 'LOC',\n", " 'score': 0.95440763,\n", " 'word': 'LONDON',\n", " 'start': 52,\n", " 'end': 58},\n", " {'entity_group': 'MISC',\n", " 'score': 0.99895835,\n", " 'word': 'English',\n", " 'start': 110,\n", " 'end': 117},\n", " {'entity_group': 'LOC',\n", " 'score': 0.99821544,\n", " 'word': 'Leicester',\n", " 'start': 163,\n", " 'end': 172},\n", " {'entity_group': 'ORG',\n", " 'score': 0.99364024,\n", " 'word': 'Leicestershire',\n", " 'start': 175,\n", " 'end': 189},\n", " {'entity_group': 'ORG',\n", " 'score': 0.99720675,\n", " 'word': 'Somerset',\n", " 'start': 195,\n", " 'end': 203},\n", " {'entity_group': 'ORG',\n", " 'score': 0.9979146,\n", " 'word': 'Somerset',\n", " 'start': 237,\n", " 'end': 245},\n", " {'entity_group': 'PER',\n", " 'score': 0.96438843,\n", " 'word': 'P. 
Simmons',\n", " 'start': 259,\n", " 'end': 269},\n", " {'entity_group': 'ORG',\n", " 'score': 0.9983664,\n", " 'word': 'Leicestershire',\n", " 'start': 279,\n", " 'end': 293},\n", " {'entity_group': 'ORG',\n", " 'score': 0.998063,\n", " 'word': 'Leicestershire',\n", " 'start': 305,\n", " 'end': 319},\n", " {'entity_group': 'ORG',\n", " 'score': 0.99934703,\n", " 'word': 'Somerset',\n", " 'start': 332,\n", " 'end': 340},\n", " {'entity_group': 'LOC',\n", " 'score': 0.96631926,\n", " 'word': 'Chester - le - Street',\n", " 'start': 350,\n", " 'end': 367},\n", " {'entity_group': 'ORG',\n", " 'score': 0.9994879,\n", " 'word': 'Glamorgan',\n", " 'start': 370,\n", " 'end': 379},\n", " {'entity_group': 'PER',\n", " 'score': 0.9771094,\n", " 'word': 'A. Dale',\n", " 'start': 394,\n", " 'end': 401},\n", " {'entity_group': 'PER',\n", " 'score': 0.95903397,\n", " 'word': 'H. Morris',\n", " 'start': 407,\n", " 'end': 416},\n", " {'entity_group': 'PER',\n", " 'score': 0.95786047,\n", " 'word': 'D. Blenkiron',\n", " 'start': 422,\n", " 'end': 434},\n", " {'entity_group': 'ORG',\n", " 'score': 0.9991248,\n", " 'word': 'Durham',\n", " 'start': 444,\n", " 'end': 450},\n", " {'entity_group': 'PER',\n", " 'score': 0.96577376,\n", " 'word': 'S. Watkin',\n", " 'start': 457,\n", " 'end': 466},\n", " {'entity_group': 'LOC',\n", " 'score': 0.99620086,\n", " 'word': 'Tunbridge Wells',\n", " 'start': 490,\n", " 'end': 505},\n", " {'entity_group': 'ORG',\n", " 'score': 0.999209,\n", " 'word': 'Nottinghamshire',\n", " 'start': 508,\n", " 'end': 523},\n", " {'entity_group': 'PER',\n", " 'score': 0.94234514,\n", " 'word': 'P. Johnson',\n", " 'start': 530,\n", " 'end': 540},\n", " {'entity_group': 'PER',\n", " 'score': 0.99028325,\n", " 'word': 'M. 
McCague',\n", " 'start': 546,\n", " 'end': 556},\n", " {'entity_group': 'ORG',\n", " 'score': 0.99935454,\n", " 'word': 'Kent',\n", " 'start': 566,\n", " 'end': 570},\n", " {'entity_group': 'LOC',\n", " 'score': 0.9988416,\n", " 'word': 'London',\n", " 'start': 584,\n", " 'end': 590},\n", " {'entity_group': 'LOC',\n", " 'score': 0.9966791,\n", " 'word': 'The Oval',\n", " 'start': 593,\n", " 'end': 601},\n", " {'entity_group': 'ORG',\n", " 'score': 0.99928087,\n", " 'word': 'Warwickshire',\n", " 'start': 606,\n", " 'end': 618},\n", " {'entity_group': 'ORG',\n", " 'score': 0.9995234,\n", " 'word': 'Surrey',\n", " 'start': 625,\n", " 'end': 631},\n", " {'entity_group': 'PER',\n", " 'score': 0.9265669,\n", " 'word': 'C. Lewis',\n", " 'start': 640,\n", " 'end': 648},\n", " {'entity_group': 'PER',\n", " 'score': 0.98573065,\n", " 'word': 'M. Butcher',\n", " 'start': 662,\n", " 'end': 672},\n", " {'entity_group': 'PER',\n", " 'score': 0.99570525,\n", " 'word': 'G. Kersey',\n", " 'start': 678,\n", " 'end': 687},\n", " {'entity_group': 'PER',\n", " 'score': 0.98831654,\n", " 'word': 'J. Ratcliffe',\n", " 'start': 693,\n", " 'end': 705},\n", " {'entity_group': 'PER',\n", " 'score': 0.9866266,\n", " 'word': 'D. Bicknell',\n", " 'start': 711,\n", " 'end': 722},\n", " {'entity_group': 'LOC',\n", " 'score': 0.9714904,\n", " 'word': 'Hove',\n", " 'start': 735,\n", " 'end': 739},\n", " {'entity_group': 'ORG',\n", " 'score': 0.99935216,\n", " 'word': 'Sussex',\n", " 'start': 742,\n", " 'end': 748},\n", " {'entity_group': 'PER',\n", " 'score': 0.97753423,\n", " 'word': 'W. Athey',\n", " 'start': 755,\n", " 'end': 763},\n", " {'entity_group': 'PER',\n", " 'score': 0.9889897,\n", " 'word': 'V. Drakes',\n", " 'start': 770,\n", " 'end': 779},\n", " {'entity_group': 'PER',\n", " 'score': 0.9931459,\n", " 'word': 'I. 
Austin',\n", " 'start': 785,\n", " 'end': 794},\n", " {'entity_group': 'ORG',\n", " 'score': 0.99931324,\n", " 'word': 'Lancashire',\n", " 'start': 804,\n", " 'end': 814},\n", " {'entity_group': 'PER',\n", " 'score': 0.97633815,\n", " 'word': 'W. Hegg',\n", " 'start': 823,\n", " 'end': 830},\n", " {'entity_group': 'LOC',\n", " 'score': 0.9942986,\n", " 'word': 'Portsmouth',\n", " 'start': 841,\n", " 'end': 851},\n", " {'entity_group': 'ORG',\n", " 'score': 0.99923646,\n", " 'word': 'Middlesex',\n", " 'start': 854,\n", " 'end': 863},\n", " {'entity_group': 'PER',\n", " 'score': 0.97500485,\n", " 'word': 'J. Pooley',\n", " 'start': 878,\n", " 'end': 887},\n", " {'entity_group': 'PER',\n", " 'score': 0.97731185,\n", " 'word': 'M. Ramprakash',\n", " 'start': 894,\n", " 'end': 907},\n", " {'entity_group': 'PER',\n", " 'score': 0.9795357,\n", " 'word': 'M. Gatting',\n", " 'start': 914,\n", " 'end': 924},\n", " {'entity_group': 'ORG',\n", " 'score': 0.99938464,\n", " 'word': 'Hampshire',\n", " 'start': 932,\n", " 'end': 941},\n", " {'entity_group': 'LOC',\n", " 'score': 0.9905847,\n", " 'word': 'Chesterfield',\n", " 'start': 963,\n", " 'end': 975},\n", " {'entity_group': 'ORG',\n", " 'score': 0.99893564,\n", " 'word': 'Worcestershire',\n", " 'start': 978,\n", " 'end': 992},\n", " {'entity_group': 'ORG',\n", " 'score': 0.9993286,\n", " 'word': 'Derbyshire',\n", " 'start': 1009,\n", " 'end': 1019},\n", " {'entity_group': 'PER',\n", " 'score': 0.96917397,\n", " 'word': 'J. Adams',\n", " 'start': 1026,\n", " 'end': 1034},\n", " {'entity_group': 'PER',\n", " 'score': 0.9931639,\n", " 'word': \"T. O ' Gorman\",\n", " 'start': 1041,\n", " 'end': 1051},\n", " {'entity_group': 'PER',\n", " 'score': 0.97878116,\n", " 'word': 'K. Barnett',\n", " 'start': 1066,\n", " 'end': 1076},\n", " {'entity_group': 'PER',\n", " 'score': 0.9884977,\n", " 'word': 'T. 
Moody',\n", " 'start': 1082,\n", " 'end': 1090},\n", " {'entity_group': 'LOC',\n", " 'score': 0.99594116,\n", " 'word': 'Bristol',\n", " 'start': 1103,\n", " 'end': 1110},\n", " {'entity_group': 'ORG',\n", " 'score': 0.9994455,\n", " 'word': 'Gloucestershire',\n", " 'start': 1113,\n", " 'end': 1128},\n", " {'entity_group': 'PER',\n", " 'score': 0.999629,\n", " 'word': 'J',\n", " 'start': 1145,\n", " 'end': 1146},\n", " {'entity_group': 'PER',\n", " 'score': 0.9994917,\n", " 'word': 'Russell',\n", " 'start': 1148,\n", " 'end': 1155},\n", " {'entity_group': 'LOC',\n", " 'score': 0.94236225,\n", " 'word': 'Northamptonshire',\n", " 'start': 1171,\n", " 'end': 1187},\n", " {'entity_group': 'PER',\n", " 'score': 0.99586487,\n", " 'word': 'K',\n", " 'start': 1194,\n", " 'end': 1195},\n", " {'entity_group': 'PER',\n", " 'score': 0.9998486,\n", " 'word': 'Curran',\n", " 'start': 1197,\n", " 'end': 1203},\n", " {'entity_group': 'PER',\n", " 'score': 0.9988618,\n", " 'word': 'A',\n", " 'start': 1209,\n", " 'end': 1210},\n", " {'entity_group': 'PER',\n", " 'score': 0.9765073,\n", " 'word': 'Smith',\n", " 'start': 1212,\n", " 'end': 1217},\n", " {'entity_group': 'PER',\n", " 'score': 0.998814,\n", " 'word': 'S',\n", " 'start': 1229,\n", " 'end': 1230}],\n", " [{'entity_group': 'LOC',\n", " 'score': 0.96220237,\n", " 'word': 'LONDON',\n", " 'start': 39,\n", " 'end': 45},\n", " {'entity_group': 'LOC',\n", " 'score': 0.9997117,\n", " 'word': 'Australia',\n", " 'start': 62,\n", " 'end': 71},\n", " {'entity_group': 'MISC',\n", " 'score': 0.9900663,\n", " 'word': 'Ashes',\n", " 'start': 88,\n", " 'end': 93},\n", " {'entity_group': 'LOC',\n", " 'score': 0.99974364,\n", " 'word': 'England',\n", " 'start': 128,\n", " 'end': 135},\n", " {'entity_group': 'ORG',\n", " 'score': 0.9373318,\n", " 'word': 'Test and County Cricket Board',\n", " 'start': 201,\n", " 'end': 230},\n", " {'entity_group': 'LOC',\n", " 'score': 0.9997092,\n", " 'word': 'Australia',\n", " 'start': 258,\n", " 'end': 
267},\n", " {'entity_group': 'MISC',\n", " 'score': 0.52167755,\n", " 'word': 'S',\n", " 'start': 440,\n", " 'end': 441},\n", " {'entity_group': 'MISC',\n", " 'score': 0.99905914,\n", " 'word': 'English',\n", " 'start': 443,\n", " 'end': 450},\n", " {'entity_group': 'MISC',\n", " 'score': 0.9493715,\n", " 'word': 'British',\n", " 'start': 484,\n", " 'end': 491},\n", " {'entity_group': 'ORG',\n", " 'score': 0.66527855,\n", " 'word': 'Minor Counties',\n", " 'start': 551,\n", " 'end': 565},\n", " {'entity_group': 'LOC',\n", " 'score': 0.7435069,\n", " 'word': 'S',\n", " 'start': 572,\n", " 'end': 573},\n", " {'entity_group': 'LOC',\n", " 'score': 0.9995771,\n", " 'word': 'Scotland',\n", " 'start': 575,\n", " 'end': 583},\n", " {'entity_group': 'LOC',\n", " 'score': 0.99926525,\n", " 'word': 'London',\n", " 'start': 639,\n", " 'end': 645},\n", " {'entity_group': 'LOC',\n", " 'score': 0.97566634,\n", " 'word': \"Lord ' s\",\n", " 'start': 670,\n", " 'end': 677},\n", " {'entity_group': 'ORG',\n", " 'score': 0.98920923,\n", " 'word': \"Duke of Norfolk ' s XI\",\n", " 'start': 692,\n", " 'end': 713},\n", " {'entity_group': 'LOC',\n", " 'score': 0.98959255,\n", " 'word': 'Arundel',\n", " 'start': 719,\n", " 'end': 726},\n", " {'entity_group': 'ORG',\n", " 'score': 0.9956369,\n", " 'word': 'Northampton',\n", " 'start': 743,\n", " 'end': 754},\n", " {'entity_group': 'ORG',\n", " 'score': 0.9987895,\n", " 'word': 'Worcestershire',\n", " 'start': 769,\n", " 'end': 783},\n", " {'entity_group': 'ORG',\n", " 'score': 0.9977308,\n", " 'word': 'Durham',\n", " 'start': 798,\n", " 'end': 804},\n", " {'entity_group': 'LOC',\n", " 'score': 0.99277717,\n", " 'word': 'Headingley',\n", " 'start': 850,\n", " 'end': 860},\n", " {'entity_group': 'LOC',\n", " 'score': 0.93061477,\n", " 'word': 'Leeds',\n", " 'start': 868,\n", " 'end': 873},\n", " {'entity_group': 'LOC',\n", " 'score': 0.9936111,\n", " 'word': 'The Oval',\n", " 'start': 922,\n", " 'end': 930},\n", " {'entity_group': 'LOC',\n", 
" 'score': 0.9980288,\n", " 'word': 'London',\n", " 'start': 938,\n", " 'end': 944},\n", " {'entity_group': 'LOC',\n", " 'score': 0.9947086,\n", " 'word': \"Lord ' s\",\n", " 'start': 992,\n", " 'end': 999},\n", " {'entity_group': 'LOC',\n", " 'score': 0.99907875,\n", " 'word': 'London',\n", " 'start': 1002,\n", " 'end': 1008},\n", " {'entity_group': 'ORG',\n", " 'score': 0.9987871,\n", " 'word': 'Gloucestershire',\n", " 'start': 1028,\n", " 'end': 1043},\n", " {'entity_group': 'ORG',\n", " 'score': 0.9982962,\n", " 'word': 'Sussex',\n", " 'start': 1047,\n", " 'end': 1053},\n", " {'entity_group': 'ORG',\n", " 'score': 0.9983968,\n", " 'word': 'Surrey',\n", " 'start': 1057,\n", " 'end': 1063},\n", " {'entity_group': 'ORG',\n", " 'score': 0.9983108,\n", " 'word': 'Derbyshire',\n", " 'start': 1107,\n", " 'end': 1117},\n", " {'entity_group': 'LOC',\n", " 'score': 0.9901608,\n", " 'word': 'Edgbaston',\n", " 'start': 1179,\n", " 'end': 1188},\n", " {'entity_group': 'LOC',\n", " 'score': 0.99937767,\n", " 'word': 'Birmingham',\n", " 'start': 1191,\n", " 'end': 1201},\n", " {'entity_group': 'LOC',\n", " 'score': 0.5605512,\n", " 'word': 'S',\n", " 'start': 1265,\n", " 'end': 1266},\n", " {'entity_group': 'ORG',\n", " 'score': 0.9908428,\n", " 'word': 'Leicestershire',\n", " 'start': 1281,\n", " 'end': 1295},\n", " {'entity_group': 'LOC',\n", " 'score': 0.7768232,\n", " 'word': 'Lord',\n", " 'start': 1344,\n", " 'end': 1348},\n", " {'entity_group': 'MISC',\n", " 'score': 0.6473316,\n", " 'word': 'British',\n", " 'start': 1372,\n", " 'end': 1379},\n", " {'entity_group': 'ORG',\n", " 'score': 0.9781204,\n", " 'word': 'Oxford',\n", " 'start': 1398,\n", " 'end': 1404},\n", " {'entity_group': 'ORG',\n", " 'score': 0.99740857,\n", " 'word': 'Hampshire',\n", " 'start': 1438,\n", " 'end': 1447},\n", " {'entity_group': 'ORG',\n", " 'score': 0.93818957,\n", " 'word': 'Old',\n", " 'start': 1503,\n", " 'end': 1506},\n", " {'entity_group': 'LOC',\n", " 'score': 0.60427374,\n", " 'word': 
'##rafford',\n", " 'start': 1508,\n", " 'end': 1515},\n", " {'entity_group': 'LOC',\n", " 'score': 0.88743263,\n", " 'word': 'Manchester',\n", " 'start': 1518,\n", " 'end': 1528},\n", " {'entity_group': 'ORG',\n", " 'score': 0.6398575,\n", " 'word': 'Minor Counties XI',\n", " 'start': 1545,\n", " 'end': 1562},\n", " {'entity_group': 'LOC',\n", " 'score': 0.9948643,\n", " 'word': 'Scotland',\n", " 'start': 1578,\n", " 'end': 1586},\n", " {'entity_group': 'ORG',\n", " 'score': 0.9956975,\n", " 'word': 'Glamorgan',\n", " 'start': 1605,\n", " 'end': 1614}],\n", " [{'entity_group': 'PER',\n", " 'score': 0.64593357,\n", " 'word': 'SHEARER',\n", " 'start': 9,\n", " 'end': 16},\n", " {'entity_group': 'LOC',\n", " 'score': 0.9937872,\n", " 'word': 'ENGLAND',\n", " 'start': 26,\n", " 'end': 33},\n", " {'entity_group': 'LOC',\n", " 'score': 0.98532915,\n", " 'word': 'LONDON',\n", " 'start': 49,\n", " 'end': 55},\n", " {'entity_group': 'PER',\n", " 'score': 0.99965554,\n", " 'word': 'Alan Shearer',\n", " 'start': 106,\n", " 'end': 118},\n", " {'entity_group': 'LOC',\n", " 'score': 0.99965096,\n", " 'word': 'England',\n", " 'start': 140,\n", " 'end': 147},\n", " {'entity_group': 'ORG',\n", " 'score': 0.99851114,\n", " 'word': 'Newcastle',\n", " 'start': 202,\n", " 'end': 211},\n", " {'entity_group': 'PER',\n", " 'score': 0.99972355,\n", " 'word': 'Tony Adams',\n", " 'start': 280,\n", " 'end': 290},\n", " {'entity_group': 'MISC',\n", " 'score': 0.99899167,\n", " 'word': 'European',\n", " 'start': 321,\n", " 'end': 329},\n", " {'entity_group': 'PER',\n", " 'score': 0.99968123,\n", " 'word': 'David Platt',\n", " 'start': 372,\n", " 'end': 383},\n", " {'entity_group': 'PER',\n", " 'score': 0.99977285,\n", " 'word': 'Adams',\n", " 'start': 391,\n", " 'end': 396},\n", " {'entity_group': 'PER',\n", " 'score': 0.99800897,\n", " 'word': 'Platt',\n", " 'start': 401,\n", " 'end': 406},\n", " {'entity_group': 'LOC',\n", " 'score': 0.99970406,\n", " 'word': 'England',\n", " 'start': 
438,\n", " 'end': 445},\n", " {'entity_group': 'MISC',\n", " 'score': 0.99456084,\n", " 'word': 'World Cup',\n", " 'start': 457,\n", " 'end': 466},\n", " {'entity_group': 'LOC',\n", " 'score': 0.9998385,\n", " 'word': 'Moldova',\n", " 'start': 485,\n", " 'end': 492},\n", " {'entity_group': 'PER',\n", " 'score': 0.9992155,\n", " 'word': 'Shearer',\n", " 'start': 510,\n", " 'end': 517},\n", " {'entity_group': 'PER',\n", " 'score': 0.9990761,\n", " 'word': 'Glenn Hoddle',\n", " 'start': 571,\n", " 'end': 583},\n", " {'entity_group': 'ORG',\n", " 'score': 0.9991636,\n", " 'word': 'Blackburn',\n", " 'start': 621,\n", " 'end': 630},\n", " {'entity_group': 'ORG',\n", " 'score': 0.99904674,\n", " 'word': 'Southampton',\n", " 'start': 635,\n", " 'end': 646},\n", " {'entity_group': 'PER',\n", " 'score': 0.9996246,\n", " 'word': 'Alan',\n", " 'start': 745,\n", " 'end': 749},\n", " {'entity_group': 'PER',\n", " 'score': 0.9964572,\n", " 'word': 'Hoddle',\n", " 'start': 777,\n", " 'end': 783},\n", " {'entity_group': 'PER',\n", " 'score': 0.9996996,\n", " 'word': 'Alan',\n", " 'start': 873,\n", " 'end': 877},\n", " {'entity_group': 'PER',\n", " 'score': 0.99759614,\n", " 'word': 'Shearer',\n", " 'start': 1172,\n", " 'end': 1179},\n", " {'entity_group': 'MISC',\n", " 'score': 0.9978334,\n", " 'word': 'Euro 96',\n", " 'start': 1183,\n", " 'end': 1190},\n", " {'entity_group': 'PER',\n", " 'score': 0.99211884,\n", " 'word': 'Teddy Sheringham',\n", " 'start': 1208,\n", " 'end': 1224},\n", " {'entity_group': 'PER',\n", " 'score': 0.99672574,\n", " 'word': 'Shearer',\n", " 'start': 1313,\n", " 'end': 1320},\n", " {'entity_group': 'ORG',\n", " 'score': 0.9980933,\n", " 'word': 'Newcastle',\n", " 'start': 1324,\n", " 'end': 1333},\n", " {'entity_group': 'PER',\n", " 'score': 0.9995302,\n", " 'word': 'Les Ferdinand',\n", " 'start': 1344,\n", " 'end': 1357}],\n", " [{'entity_group': 'MISC',\n", " 'score': 0.6943369,\n", " 'word': 'IN',\n", " 'start': 13,\n", " 'end': 15},\n", " 
{'entity_group': 'LOC',\n",
       " 'score': 0.9792612,\n",
       " 'word': 'BELGRADE',\n",
       " 'start': 52,\n",
       " 'end': 60},\n",
       " {'entity_group': 'MISC',\n",
       " 'score': 0.6702504,\n",
       " 'word': 'S',\n",
       " 'start': 106,\n",
       " 'end': 107},\n",
       " {'entity_group': 'ORG',\n",
       " 'score': 0.9988783,\n",
       " 'word': 'Red Star',\n",
       " 'start': 148,\n",
       " 'end': 156},\n",
       " {'entity_group': 'LOC',\n",
       " 'score': 0.99980396,\n",
       " 'word': 'Yugoslavia',\n",
       " 'start': 159,\n",
       " 'end': 169},\n",
       " {'entity_group': 'ORG',\n",
       " 'score': 0.99583733,\n",
       " 'word': 'Dinamo',\n",
       " 'start': 177,\n",
       " 'end': 183},\n",
       " {'entity_group': 'LOC',\n",
       " 'score': 0.99982125,\n",
       " 'word': 'Russia',\n",
       " 'start': 186,\n",
       " 'end': 192}]]"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Pin the checkpoint that `pipeline('ner')` selected by default in the recorded run, so a\n",
    "# change of the library default cannot silently change the predictions.\n",
    "NER_MODEL = 'dbmdz/bert-large-cased-finetuned-conll03-english'\n",
    "NER_MODEL_REVISION = 'f2482bf'\n",
    "\n",
    "# `aggregation_strategy='simple'` is the replacement for the deprecated `grouped_entities=True`.\n",
    "ner_pipeline = pipeline(\n",
    "    'ner',\n",
    "    model=NER_MODEL,\n",
    "    revision=NER_MODEL_REVISION,\n",
    "    aggregation_strategy='simple',\n",
    ")\n",
    "\n",
    "# Run the pipeline over column 0 of each in.tsv\n",
    "ner_results_dev0 = ner_pipeline(in_data_dev0[0].tolist())\n",
    "ner_results_testA = ner_pipeline(in_data_testA[0].tolist())\n",
    "\n",
    "# Show a sample of the results\n",
    "ner_results_dev0[:5]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Map entities to per-word BIO labels"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def _word_spans(sentence):\n",
    "    \"\"\"Return the (start, end) character offsets of every whitespace-separated word.\"\"\"\n",
    "    spans = []\n",
    "    cursor = 0\n",
    "    for word in sentence.split():\n",
    "        start = sentence.index(word, cursor)\n",
    "        cursor = start + len(word)\n",
    "        spans.append((start, cursor))\n",
    "    return spans\n",
    "\n",
    "\n",
    "def map_ner_results(ner_results, sentences):\n",
    "    \"\"\"Convert pipeline entities into one B-XXX / I-XXX / O label per word.\n",
    "\n",
    "    Args:\n",
    "        ner_results: for each sentence, a list of entity dicts with the keys\n",
    "            'start' and 'end' (character offsets) and 'entity_group'.\n",
    "        sentences: the input strings the pipeline was run on, in the same order.\n",
    "\n",
    "    Returns:\n",
    "        One list of labels per sentence, holding one label for each\n",
    "        whitespace-separated word of that sentence.\n",
    "    \"\"\"\n",
    "    ner_labels = []\n",
    "\n",
    "    for sentence, entities in zip(sentences, ner_results):\n",
    "        spans = _word_spans(sentence)\n",
    "        labels = ['O'] * len(spans)\n",
    "\n",
    "        for entity in entities:\n",
    "            entity_label = entity['entity_group']\n",
    "            # Every word whose character span overlaps the entity span. Comparing offsets\n",
    "            # (rather than counting the words in the text before the entity) keeps the\n",
    "            # label on the right word when an entity starts in the middle of a word.\n",
    "            covered = [\n",
    "                idx for idx, (word_start, word_end) in enumerate(spans)\n",
    "                if word_start < entity['end'] and entity['start'] < word_end\n",
    "            ]\n",
    "\n",
    "            for position, idx in enumerate(covered):\n",
    "                if labels[idx] != 'O':\n",
    "                    # Word already labelled by an earlier entity: the first label wins.\n",
    "                    continue\n",
    "                # Continue with I- only when the previous covered word carries the same type.\n",
    "                continues_entity = position > 0 and labels[idx - 1][2:] == entity_label\n",
    "                labels[idx] = ('I-' if continues_entity else 'B-') + entity_label\n",
    "\n",
    "        ner_labels.append(labels)\n",
    "    return ner_labels\n",
    "\n",
    "\n",
    "def write_predictions(path, label_lines):\n",
    "    \"\"\"Write each string of `label_lines` to `path`, one per line.\"\"\"\n",
    "    with open(path, 'w', encoding='utf-8') as f:\n",
    "        for line in label_lines:\n",
    "            f.write(line + '\\n')\n",
    "\n",
    "\n",
    "def token_accuracy(predicted_lines, expected_lines):\n",
    "    \"\"\"Return the fraction of gold labels matched by the prediction at the same position.\n",
    "\n",
    "    Gold labels that have no prediction at their position count as errors.\n",
    "    Returns 0.0 when there are no gold labels at all.\n",
    "    \"\"\"\n",
    "    correct = 0\n",
    "    total = 0\n",
    "    for predicted_line, expected_line in zip(predicted_lines, expected_lines):\n",
    "        expected = expected_line.split()\n",
    "        correct += sum(p == e for p, e in zip(predicted_line.split(), expected))\n",
    "        total += len(expected)\n",
    "    return correct / total if total else 0.0"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Predict and write `out.tsv`"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "predicted_labels_dev0 = map_ner_results(ner_results_dev0, in_data_dev0[0].tolist())\n",
    "# The test-A entities must be aligned against the test-A sentences they were computed from.\n",
    "predicted_labels_testA = map_ner_results(ner_results_testA, in_data_testA[0].tolist())\n",
    "\n",
    "predicted_strings_dev0 = [' '.join(labels) for labels in predicted_labels_dev0]\n",
    "predicted_strings_testA = [' '.join(labels) for labels in predicted_labels_testA]\n",
    "\n",
    "write_predictions('dev-0/out.tsv', predicted_strings_dev0)\n",
    "write_predictions('test-A/out.tsv', predicted_strings_testA)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Evaluate on dev-0"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "expected_strings_dev0 = expected_data_dev0[0].tolist()\n",
    "\n",
    "# Compare the predictions with the gold labels, token by token\n",
    "accuracy = token_accuracy(predicted_strings_dev0, expected_strings_dev0)\n",
    "print(f\"Accuracy - dev-0: {accuracy:.2%}\")"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.12.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}