Transformer-Ner/Transformer.ipynb
2024-06-06 22:10:16 +02:00

37 KiB

import pandas as pd
import lzma
from transformers import pipeline

# Load the xz-compressed training set (tab-separated, no header row).
with lzma.open("train/train.tsv.xz", "rt") as f:
    train_data = pd.read_csv(f, delimiter='\t', header=None)

# Development split: input documents and their gold label sequences.
in_data_dev0 = pd.read_csv('dev-0/in.tsv', delimiter='\t', header=None)
expected_data_dev0 = pd.read_csv('dev-0/expected.tsv', delimiter='\t', header=None)

# Test split. The original cell re-read the dev-0 files here, so the
# "test-A" predictions were really dev-0 predictions.
in_data_testA = pd.read_csv('test-A/in.tsv', delimiter='\t', header=None)
try:
    expected_data_testA = pd.read_csv('test-A/expected.tsv', delimiter='\t', header=None)
except FileNotFoundError:
    # NOTE(review): presumably the test split ships without gold labels;
    # keep the name defined so later cells can check for None.
    expected_data_testA = None

# Load the NER pipeline. The model and revision are pinned to the values the
# library reported falling back to, and `aggregation_strategy` replaces the
# deprecated `grouped_entities=True` flag (same "simple" grouping).
ner_pipeline = pipeline(
    "ner",
    model="dbmdz/bert-large-cased-finetuned-conll03-english",
    revision="f2482bf",
    aggregation_strategy="simple",
)

# Run the pipeline over the first column of each in.tsv.
ner_results_dev0 = ner_pipeline(in_data_dev0[0].tolist())
ner_results_testA = ner_pipeline(in_data_testA[0].tolist())

# Show a few sample results.
ner_results_dev0[:5]
No model was supplied, defaulted to dbmdz/bert-large-cased-finetuned-conll03-english and revision f2482bf (https://huggingface.co/dbmdz/bert-large-cased-finetuned-conll03-english).
Using a pipeline without specifying a model name and revision in production is not recommended.
/home/wmi/miniconda3/envs/pbr/lib/python3.12/site-packages/huggingface_hub/file_download.py:1132: FutureWarning: `resume_download` is deprecated and will be removed in version 1.0.0. Downloads always resume when possible. If you want to force a new download, use `force_download=True`.
  warnings.warn(
Some weights of the model checkpoint at dbmdz/bert-large-cased-finetuned-conll03-english were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
/home/wmi/miniconda3/envs/pbr/lib/python3.12/site-packages/transformers/pipelines/token_classification.py:168: UserWarning: `grouped_entities` is deprecated and will be removed in version v5.0.0, defaulted to `aggregation_strategy="AggregationStrategy.SIMPLE"` instead.
  warnings.warn(
[[{'entity_group': 'ORG',
   'score': 0.34655723,
   'word': 'L',
   'start': 10,
   'end': 11},
  {'entity_group': 'MISC',
   'score': 0.38114035,
   'word': '##A',
   'start': 26,
   'end': 27},
  {'entity_group': 'LOC',
   'score': 0.88899577,
   'word': 'LONDON',
   'start': 71,
   'end': 77},
  {'entity_group': 'MISC',
   'score': 0.9981057,
   'word': 'West Indian',
   'start': 94,
   'end': 105},
  {'entity_group': 'PER',
   'score': 0.99973667,
   'word': 'Phil Simmons',
   'start': 118,
   'end': 130},
  {'entity_group': 'ORG',
   'score': 0.99539727,
   'word': 'Leicestershire',
   'start': 161,
   'end': 175},
  {'entity_group': 'ORG',
   'score': 0.997735,
   'word': 'Somerset',
   'start': 181,
   'end': 189},
  {'entity_group': 'ORG',
   'score': 0.9995547,
   'word': 'Essex',
   'start': 351,
   'end': 356},
  {'entity_group': 'ORG',
   'score': 0.9992822,
   'word': 'Derbyshire',
   'start': 359,
   'end': 369},
  {'entity_group': 'ORG',
   'score': 0.9993387,
   'word': 'Surrey',
   'start': 374,
   'end': 380},
  {'entity_group': 'ORG',
   'score': 0.9993369,
   'word': 'Kent',
   'start': 412,
   'end': 416},
  {'entity_group': 'ORG',
   'score': 0.99557805,
   'word': 'Nottinghamshire',
   'start': 476,
   'end': 491},
  {'entity_group': 'ORG',
   'score': 0.99760664,
   'word': 'Somerset',
   'start': 513,
   'end': 521},
  {'entity_group': 'LOC',
   'score': 0.99563384,
   'word': 'Grace Road',
   'start': 559,
   'end': 569},
  {'entity_group': 'ORG',
   'score': 0.9976046,
   'word': 'Leicestershire',
   'start': 572,
   'end': 586},
  {'entity_group': 'LOC',
   'score': 0.9994592,
   'word': 'England',
   'start': 664,
   'end': 671},
  {'entity_group': 'PER',
   'score': 0.9892952,
   'word': 'Andy Caddick',
   'start': 680,
   'end': 692},
  {'entity_group': 'ORG',
   'score': 0.9989544,
   'word': 'Somerset',
   'start': 738,
   'end': 746},
  {'entity_group': 'PER',
   'score': 0.9996531,
   'word': 'Simmons',
   'start': 796,
   'end': 803},
  {'entity_group': 'ORG',
   'score': 0.9993154,
   'word': 'Essex',
   'start': 849,
   'end': 854},
  {'entity_group': 'PER',
   'score': 0.99977463,
   'word': 'Nasser Hussain',
   'start': 911,
   'end': 925},
  {'entity_group': 'PER',
   'score': 0.9996419,
   'word': 'Peter Such',
   'start': 930,
   'end': 940},
  {'entity_group': 'ORG',
   'score': 0.9935796,
   'word': 'Yorkshire',
   'start': 986,
   'end': 995},
  {'entity_group': 'LOC',
   'score': 0.99555916,
   'word': 'Headingley',
   'start': 999,
   'end': 1009},
  {'entity_group': 'PER',
   'score': 0.99976915,
   'word': 'Hussain',
   'start': 1017,
   'end': 1024},
  {'entity_group': 'LOC',
   'score': 0.999605,
   'word': 'England',
   'start': 1049,
   'end': 1056},
  {'entity_group': 'ORG',
   'score': 0.99933463,
   'word': 'Essex',
   'start': 1146,
   'end': 1151},
  {'entity_group': 'ORG',
   'score': 0.99699366,
   'word': 'Yorkshire',
   'start': 1220,
   'end': 1229},
  {'entity_group': 'PER',
   'score': 0.9988619,
   'word': 'Such',
   'start': 1286,
   'end': 1290},
  {'entity_group': 'LOC',
   'score': 0.9991185,
   'word': 'Oval',
   'start': 1425,
   'end': 1429},
  {'entity_group': 'ORG',
   'score': 0.9990688,
   'word': 'Surrey',
   'start': 1432,
   'end': 1438},
  {'entity_group': 'PER',
   'score': 0.9997764,
   'word': 'Chris Lewis',
   'start': 1447,
   'end': 1458},
  {'entity_group': 'LOC',
   'score': 0.99957126,
   'word': 'England',
   'start': 1483,
   'end': 1490},
  {'entity_group': 'ORG',
   'score': 0.9987552,
   'word': 'Warwickshire',
   'start': 1616,
   'end': 1628},
  {'entity_group': 'ORG',
   'score': 0.7200186,
   'word': 'S',
   'start': 1633,
   'end': 1634},
  {'entity_group': 'LOC',
   'score': 0.9987576,
   'word': 'England',
   'start': 1658,
   'end': 1665},
  {'entity_group': 'PER',
   'score': 0.9997127,
   'word': 'Mark Butcher',
   'start': 1674,
   'end': 1686},
  {'entity_group': 'ORG',
   'score': 0.9978284,
   'word': 'Surrey',
   'start': 1702,
   'end': 1708},
  {'entity_group': 'ORG',
   'score': 0.5290484,
   'word': 'S',
   'start': 1753,
   'end': 1754},
  {'entity_group': 'ORG',
   'score': 0.9982393,
   'word': 'Derbyshire',
   'start': 1756,
   'end': 1766},
  {'entity_group': 'ORG',
   'score': 0.999025,
   'word': 'Worcestershire',
   'start': 1842,
   'end': 1856},
  {'entity_group': 'ORG',
   'score': 0.5895682,
   'word': 'S',
   'start': 1955,
   'end': 1956},
  {'entity_group': 'MISC',
   'score': 0.9989274,
   'word': 'Australian',
   'start': 1958,
   'end': 1968},
  {'entity_group': 'PER',
   'score': 0.9996978,
   'word': 'Tom Moody',
   'start': 1969,
   'end': 1978},
  {'entity_group': 'PER',
   'score': 0.99972975,
   'word': 'Chris Adams',
   'start': 1999,
   'end': 2010},
  {'entity_group': 'PER',
   'score': 0.9994934,
   'word': 'Tim O',
   'start': 2023,
   'end': 2028},
  {'entity_group': 'PER',
   'score': 0.5700783,
   'word': 'Go',
   'start': 2029,
   'end': 2031},
  {'entity_group': 'ORG',
   'score': 0.9984907,
   'word': 'Derbyshire',
   'start': 2049,
   'end': 2059},
  {'entity_group': 'ORG',
   'score': 0.4096144,
   'word': 'S',
   'start': 2103,
   'end': 2104}],
 [{'entity_group': 'MISC',
   'score': 0.46102658,
   'word': '##IC',
   'start': 2,
   'end': 4},
  {'entity_group': 'MISC',
   'score': 0.97070557,
   'word': 'ENGL',
   'start': 10,
   'end': 14},
  {'entity_group': 'MISC',
   'score': 0.7318856,
   'word': '##H COUNTY CHAM',
   'start': 16,
   'end': 29},
  {'entity_group': 'MISC',
   'score': 0.66476506,
   'word': '##H',
   'start': 34,
   'end': 35},
  {'entity_group': 'LOC',
   'score': 0.95440763,
   'word': 'LONDON',
   'start': 52,
   'end': 58},
  {'entity_group': 'MISC',
   'score': 0.99895835,
   'word': 'English',
   'start': 110,
   'end': 117},
  {'entity_group': 'LOC',
   'score': 0.99821544,
   'word': 'Leicester',
   'start': 163,
   'end': 172},
  {'entity_group': 'ORG',
   'score': 0.99364024,
   'word': 'Leicestershire',
   'start': 175,
   'end': 189},
  {'entity_group': 'ORG',
   'score': 0.99720675,
   'word': 'Somerset',
   'start': 195,
   'end': 203},
  {'entity_group': 'ORG',
   'score': 0.9979146,
   'word': 'Somerset',
   'start': 237,
   'end': 245},
  {'entity_group': 'PER',
   'score': 0.96438843,
   'word': 'P. Simmons',
   'start': 259,
   'end': 269},
  {'entity_group': 'ORG',
   'score': 0.9983664,
   'word': 'Leicestershire',
   'start': 279,
   'end': 293},
  {'entity_group': 'ORG',
   'score': 0.998063,
   'word': 'Leicestershire',
   'start': 305,
   'end': 319},
  {'entity_group': 'ORG',
   'score': 0.99934703,
   'word': 'Somerset',
   'start': 332,
   'end': 340},
  {'entity_group': 'LOC',
   'score': 0.96631926,
   'word': 'Chester - le - Street',
   'start': 350,
   'end': 367},
  {'entity_group': 'ORG',
   'score': 0.9994879,
   'word': 'Glamorgan',
   'start': 370,
   'end': 379},
  {'entity_group': 'PER',
   'score': 0.9771094,
   'word': 'A. Dale',
   'start': 394,
   'end': 401},
  {'entity_group': 'PER',
   'score': 0.95903397,
   'word': 'H. Morris',
   'start': 407,
   'end': 416},
  {'entity_group': 'PER',
   'score': 0.95786047,
   'word': 'D. Blenkiron',
   'start': 422,
   'end': 434},
  {'entity_group': 'ORG',
   'score': 0.9991248,
   'word': 'Durham',
   'start': 444,
   'end': 450},
  {'entity_group': 'PER',
   'score': 0.96577376,
   'word': 'S. Watkin',
   'start': 457,
   'end': 466},
  {'entity_group': 'LOC',
   'score': 0.99620086,
   'word': 'Tunbridge Wells',
   'start': 490,
   'end': 505},
  {'entity_group': 'ORG',
   'score': 0.999209,
   'word': 'Nottinghamshire',
   'start': 508,
   'end': 523},
  {'entity_group': 'PER',
   'score': 0.94234514,
   'word': 'P. Johnson',
   'start': 530,
   'end': 540},
  {'entity_group': 'PER',
   'score': 0.99028325,
   'word': 'M. McCague',
   'start': 546,
   'end': 556},
  {'entity_group': 'ORG',
   'score': 0.99935454,
   'word': 'Kent',
   'start': 566,
   'end': 570},
  {'entity_group': 'LOC',
   'score': 0.9988416,
   'word': 'London',
   'start': 584,
   'end': 590},
  {'entity_group': 'LOC',
   'score': 0.9966791,
   'word': 'The Oval',
   'start': 593,
   'end': 601},
  {'entity_group': 'ORG',
   'score': 0.99928087,
   'word': 'Warwickshire',
   'start': 606,
   'end': 618},
  {'entity_group': 'ORG',
   'score': 0.9995234,
   'word': 'Surrey',
   'start': 625,
   'end': 631},
  {'entity_group': 'PER',
   'score': 0.9265669,
   'word': 'C. Lewis',
   'start': 640,
   'end': 648},
  {'entity_group': 'PER',
   'score': 0.98573065,
   'word': 'M. Butcher',
   'start': 662,
   'end': 672},
  {'entity_group': 'PER',
   'score': 0.99570525,
   'word': 'G. Kersey',
   'start': 678,
   'end': 687},
  {'entity_group': 'PER',
   'score': 0.98831654,
   'word': 'J. Ratcliffe',
   'start': 693,
   'end': 705},
  {'entity_group': 'PER',
   'score': 0.9866266,
   'word': 'D. Bicknell',
   'start': 711,
   'end': 722},
  {'entity_group': 'LOC',
   'score': 0.9714904,
   'word': 'Hove',
   'start': 735,
   'end': 739},
  {'entity_group': 'ORG',
   'score': 0.99935216,
   'word': 'Sussex',
   'start': 742,
   'end': 748},
  {'entity_group': 'PER',
   'score': 0.97753423,
   'word': 'W. Athey',
   'start': 755,
   'end': 763},
  {'entity_group': 'PER',
   'score': 0.9889897,
   'word': 'V. Drakes',
   'start': 770,
   'end': 779},
  {'entity_group': 'PER',
   'score': 0.9931459,
   'word': 'I. Austin',
   'start': 785,
   'end': 794},
  {'entity_group': 'ORG',
   'score': 0.99931324,
   'word': 'Lancashire',
   'start': 804,
   'end': 814},
  {'entity_group': 'PER',
   'score': 0.97633815,
   'word': 'W. Hegg',
   'start': 823,
   'end': 830},
  {'entity_group': 'LOC',
   'score': 0.9942986,
   'word': 'Portsmouth',
   'start': 841,
   'end': 851},
  {'entity_group': 'ORG',
   'score': 0.99923646,
   'word': 'Middlesex',
   'start': 854,
   'end': 863},
  {'entity_group': 'PER',
   'score': 0.97500485,
   'word': 'J. Pooley',
   'start': 878,
   'end': 887},
  {'entity_group': 'PER',
   'score': 0.97731185,
   'word': 'M. Ramprakash',
   'start': 894,
   'end': 907},
  {'entity_group': 'PER',
   'score': 0.9795357,
   'word': 'M. Gatting',
   'start': 914,
   'end': 924},
  {'entity_group': 'ORG',
   'score': 0.99938464,
   'word': 'Hampshire',
   'start': 932,
   'end': 941},
  {'entity_group': 'LOC',
   'score': 0.9905847,
   'word': 'Chesterfield',
   'start': 963,
   'end': 975},
  {'entity_group': 'ORG',
   'score': 0.99893564,
   'word': 'Worcestershire',
   'start': 978,
   'end': 992},
  {'entity_group': 'ORG',
   'score': 0.9993286,
   'word': 'Derbyshire',
   'start': 1009,
   'end': 1019},
  {'entity_group': 'PER',
   'score': 0.96917397,
   'word': 'J. Adams',
   'start': 1026,
   'end': 1034},
  {'entity_group': 'PER',
   'score': 0.9931639,
   'word': "T. O ' Gorman",
   'start': 1041,
   'end': 1051},
  {'entity_group': 'PER',
   'score': 0.97878116,
   'word': 'K. Barnett',
   'start': 1066,
   'end': 1076},
  {'entity_group': 'PER',
   'score': 0.9884977,
   'word': 'T. Moody',
   'start': 1082,
   'end': 1090},
  {'entity_group': 'LOC',
   'score': 0.99594116,
   'word': 'Bristol',
   'start': 1103,
   'end': 1110},
  {'entity_group': 'ORG',
   'score': 0.9994455,
   'word': 'Gloucestershire',
   'start': 1113,
   'end': 1128},
  {'entity_group': 'PER',
   'score': 0.999629,
   'word': 'J',
   'start': 1145,
   'end': 1146},
  {'entity_group': 'PER',
   'score': 0.9994917,
   'word': 'Russell',
   'start': 1148,
   'end': 1155},
  {'entity_group': 'LOC',
   'score': 0.94236225,
   'word': 'Northamptonshire',
   'start': 1171,
   'end': 1187},
  {'entity_group': 'PER',
   'score': 0.99586487,
   'word': 'K',
   'start': 1194,
   'end': 1195},
  {'entity_group': 'PER',
   'score': 0.9998486,
   'word': 'Curran',
   'start': 1197,
   'end': 1203},
  {'entity_group': 'PER',
   'score': 0.9988618,
   'word': 'A',
   'start': 1209,
   'end': 1210},
  {'entity_group': 'PER',
   'score': 0.9765073,
   'word': 'Smith',
   'start': 1212,
   'end': 1217},
  {'entity_group': 'PER',
   'score': 0.998814,
   'word': 'S',
   'start': 1229,
   'end': 1230}],
 [{'entity_group': 'LOC',
   'score': 0.96220237,
   'word': 'LONDON',
   'start': 39,
   'end': 45},
  {'entity_group': 'LOC',
   'score': 0.9997117,
   'word': 'Australia',
   'start': 62,
   'end': 71},
  {'entity_group': 'MISC',
   'score': 0.9900663,
   'word': 'Ashes',
   'start': 88,
   'end': 93},
  {'entity_group': 'LOC',
   'score': 0.99974364,
   'word': 'England',
   'start': 128,
   'end': 135},
  {'entity_group': 'ORG',
   'score': 0.9373318,
   'word': 'Test and County Cricket Board',
   'start': 201,
   'end': 230},
  {'entity_group': 'LOC',
   'score': 0.9997092,
   'word': 'Australia',
   'start': 258,
   'end': 267},
  {'entity_group': 'MISC',
   'score': 0.52167755,
   'word': 'S',
   'start': 440,
   'end': 441},
  {'entity_group': 'MISC',
   'score': 0.99905914,
   'word': 'English',
   'start': 443,
   'end': 450},
  {'entity_group': 'MISC',
   'score': 0.9493715,
   'word': 'British',
   'start': 484,
   'end': 491},
  {'entity_group': 'ORG',
   'score': 0.66527855,
   'word': 'Minor Counties',
   'start': 551,
   'end': 565},
  {'entity_group': 'LOC',
   'score': 0.7435069,
   'word': 'S',
   'start': 572,
   'end': 573},
  {'entity_group': 'LOC',
   'score': 0.9995771,
   'word': 'Scotland',
   'start': 575,
   'end': 583},
  {'entity_group': 'LOC',
   'score': 0.99926525,
   'word': 'London',
   'start': 639,
   'end': 645},
  {'entity_group': 'LOC',
   'score': 0.97566634,
   'word': "Lord ' s",
   'start': 670,
   'end': 677},
  {'entity_group': 'ORG',
   'score': 0.98920923,
   'word': "Duke of Norfolk ' s XI",
   'start': 692,
   'end': 713},
  {'entity_group': 'LOC',
   'score': 0.98959255,
   'word': 'Arundel',
   'start': 719,
   'end': 726},
  {'entity_group': 'ORG',
   'score': 0.9956369,
   'word': 'Northampton',
   'start': 743,
   'end': 754},
  {'entity_group': 'ORG',
   'score': 0.9987895,
   'word': 'Worcestershire',
   'start': 769,
   'end': 783},
  {'entity_group': 'ORG',
   'score': 0.9977308,
   'word': 'Durham',
   'start': 798,
   'end': 804},
  {'entity_group': 'LOC',
   'score': 0.99277717,
   'word': 'Headingley',
   'start': 850,
   'end': 860},
  {'entity_group': 'LOC',
   'score': 0.93061477,
   'word': 'Leeds',
   'start': 868,
   'end': 873},
  {'entity_group': 'LOC',
   'score': 0.9936111,
   'word': 'The Oval',
   'start': 922,
   'end': 930},
  {'entity_group': 'LOC',
   'score': 0.9980288,
   'word': 'London',
   'start': 938,
   'end': 944},
  {'entity_group': 'LOC',
   'score': 0.9947086,
   'word': "Lord ' s",
   'start': 992,
   'end': 999},
  {'entity_group': 'LOC',
   'score': 0.99907875,
   'word': 'London',
   'start': 1002,
   'end': 1008},
  {'entity_group': 'ORG',
   'score': 0.9987871,
   'word': 'Gloucestershire',
   'start': 1028,
   'end': 1043},
  {'entity_group': 'ORG',
   'score': 0.9982962,
   'word': 'Sussex',
   'start': 1047,
   'end': 1053},
  {'entity_group': 'ORG',
   'score': 0.9983968,
   'word': 'Surrey',
   'start': 1057,
   'end': 1063},
  {'entity_group': 'ORG',
   'score': 0.9983108,
   'word': 'Derbyshire',
   'start': 1107,
   'end': 1117},
  {'entity_group': 'LOC',
   'score': 0.9901608,
   'word': 'Edgbaston',
   'start': 1179,
   'end': 1188},
  {'entity_group': 'LOC',
   'score': 0.99937767,
   'word': 'Birmingham',
   'start': 1191,
   'end': 1201},
  {'entity_group': 'LOC',
   'score': 0.5605512,
   'word': 'S',
   'start': 1265,
   'end': 1266},
  {'entity_group': 'ORG',
   'score': 0.9908428,
   'word': 'Leicestershire',
   'start': 1281,
   'end': 1295},
  {'entity_group': 'LOC',
   'score': 0.7768232,
   'word': 'Lord',
   'start': 1344,
   'end': 1348},
  {'entity_group': 'MISC',
   'score': 0.6473316,
   'word': 'British',
   'start': 1372,
   'end': 1379},
  {'entity_group': 'ORG',
   'score': 0.9781204,
   'word': 'Oxford',
   'start': 1398,
   'end': 1404},
  {'entity_group': 'ORG',
   'score': 0.99740857,
   'word': 'Hampshire',
   'start': 1438,
   'end': 1447},
  {'entity_group': 'ORG',
   'score': 0.93818957,
   'word': 'Old',
   'start': 1503,
   'end': 1506},
  {'entity_group': 'LOC',
   'score': 0.60427374,
   'word': '##rafford',
   'start': 1508,
   'end': 1515},
  {'entity_group': 'LOC',
   'score': 0.88743263,
   'word': 'Manchester',
   'start': 1518,
   'end': 1528},
  {'entity_group': 'ORG',
   'score': 0.6398575,
   'word': 'Minor Counties XI',
   'start': 1545,
   'end': 1562},
  {'entity_group': 'LOC',
   'score': 0.9948643,
   'word': 'Scotland',
   'start': 1578,
   'end': 1586},
  {'entity_group': 'ORG',
   'score': 0.9956975,
   'word': 'Glamorgan',
   'start': 1605,
   'end': 1614}],
 [{'entity_group': 'PER',
   'score': 0.64593357,
   'word': 'SHEARER',
   'start': 9,
   'end': 16},
  {'entity_group': 'LOC',
   'score': 0.9937872,
   'word': 'ENGLAND',
   'start': 26,
   'end': 33},
  {'entity_group': 'LOC',
   'score': 0.98532915,
   'word': 'LONDON',
   'start': 49,
   'end': 55},
  {'entity_group': 'PER',
   'score': 0.99965554,
   'word': 'Alan Shearer',
   'start': 106,
   'end': 118},
  {'entity_group': 'LOC',
   'score': 0.99965096,
   'word': 'England',
   'start': 140,
   'end': 147},
  {'entity_group': 'ORG',
   'score': 0.99851114,
   'word': 'Newcastle',
   'start': 202,
   'end': 211},
  {'entity_group': 'PER',
   'score': 0.99972355,
   'word': 'Tony Adams',
   'start': 280,
   'end': 290},
  {'entity_group': 'MISC',
   'score': 0.99899167,
   'word': 'European',
   'start': 321,
   'end': 329},
  {'entity_group': 'PER',
   'score': 0.99968123,
   'word': 'David Platt',
   'start': 372,
   'end': 383},
  {'entity_group': 'PER',
   'score': 0.99977285,
   'word': 'Adams',
   'start': 391,
   'end': 396},
  {'entity_group': 'PER',
   'score': 0.99800897,
   'word': 'Platt',
   'start': 401,
   'end': 406},
  {'entity_group': 'LOC',
   'score': 0.99970406,
   'word': 'England',
   'start': 438,
   'end': 445},
  {'entity_group': 'MISC',
   'score': 0.99456084,
   'word': 'World Cup',
   'start': 457,
   'end': 466},
  {'entity_group': 'LOC',
   'score': 0.9998385,
   'word': 'Moldova',
   'start': 485,
   'end': 492},
  {'entity_group': 'PER',
   'score': 0.9992155,
   'word': 'Shearer',
   'start': 510,
   'end': 517},
  {'entity_group': 'PER',
   'score': 0.9990761,
   'word': 'Glenn Hoddle',
   'start': 571,
   'end': 583},
  {'entity_group': 'ORG',
   'score': 0.9991636,
   'word': 'Blackburn',
   'start': 621,
   'end': 630},
  {'entity_group': 'ORG',
   'score': 0.99904674,
   'word': 'Southampton',
   'start': 635,
   'end': 646},
  {'entity_group': 'PER',
   'score': 0.9996246,
   'word': 'Alan',
   'start': 745,
   'end': 749},
  {'entity_group': 'PER',
   'score': 0.9964572,
   'word': 'Hoddle',
   'start': 777,
   'end': 783},
  {'entity_group': 'PER',
   'score': 0.9996996,
   'word': 'Alan',
   'start': 873,
   'end': 877},
  {'entity_group': 'PER',
   'score': 0.99759614,
   'word': 'Shearer',
   'start': 1172,
   'end': 1179},
  {'entity_group': 'MISC',
   'score': 0.9978334,
   'word': 'Euro 96',
   'start': 1183,
   'end': 1190},
  {'entity_group': 'PER',
   'score': 0.99211884,
   'word': 'Teddy Sheringham',
   'start': 1208,
   'end': 1224},
  {'entity_group': 'PER',
   'score': 0.99672574,
   'word': 'Shearer',
   'start': 1313,
   'end': 1320},
  {'entity_group': 'ORG',
   'score': 0.9980933,
   'word': 'Newcastle',
   'start': 1324,
   'end': 1333},
  {'entity_group': 'PER',
   'score': 0.9995302,
   'word': 'Les Ferdinand',
   'start': 1344,
   'end': 1357}],
 [{'entity_group': 'MISC',
   'score': 0.6943369,
   'word': 'IN',
   'start': 13,
   'end': 15},
  {'entity_group': 'LOC',
   'score': 0.9792612,
   'word': 'BELGRADE',
   'start': 52,
   'end': 60},
  {'entity_group': 'MISC',
   'score': 0.6702504,
   'word': 'S',
   'start': 106,
   'end': 107},
  {'entity_group': 'ORG',
   'score': 0.9988783,
   'word': 'Red Star',
   'start': 148,
   'end': 156},
  {'entity_group': 'LOC',
   'score': 0.99980396,
   'word': 'Yugoslavia',
   'start': 159,
   'end': 169},
  {'entity_group': 'ORG',
   'score': 0.99583733,
   'word': 'Dinamo',
   'start': 177,
   'end': 183},
  {'entity_group': 'LOC',
   'score': 0.99982125,
   'word': 'Russia',
   'start': 186,
   'end': 192}]]
# Map NER pipeline results onto the B-XXX / I-XXX / O word-level format.
def map_ner_results(ner_results, sentences):
    """Convert character-span entities into one BIO tag per whitespace word.

    Args:
        ner_results: One list of entity dicts per sentence. Each dict must
            provide 'entity_group', 'start' and 'end', where start/end are
            character offsets into the matching sentence.
        sentences: The input strings, in the same order as ``ner_results``.

    Returns:
        A list with one list of tags per sentence; each inner list has
        exactly ``len(sentence.split())`` items.

    A word is tagged when its character span overlaps an entity span. The
    first overlapped word gets ``B-<group>`` and the following ones
    ``I-<group>``. This keeps entities that start in the middle of a word
    (sub-word fragments) attached to the word that contains them instead of
    shifting the tag onto the next word.
    """
    ner_labels = []

    for sentence, entities in zip(sentences, ner_results):
        # Character span of every whitespace-separated word. Searching from
        # `cursor` keeps repeated words mapped to the right occurrence.
        word_spans = []
        cursor = 0
        for word in sentence.split():
            word_start = sentence.index(word, cursor)
            cursor = word_start + len(word)
            word_spans.append((word_start, cursor))

        labels = ['O'] * len(word_spans)

        for entity in entities:
            entity_label = entity['entity_group']
            ent_start = entity['start']
            ent_end = entity['end']
            if ent_end <= ent_start:
                continue  # empty span: nothing to tag

            begin = True
            for i, (word_start, word_end) in enumerate(word_spans):
                if word_end <= ent_start:
                    continue
                if word_start >= ent_end:
                    break
                if labels[i] != 'O':
                    # Word already claimed by an earlier entity: leave it and
                    # continue the span with I- only if the type matches, so
                    # the tag sequence stays valid.
                    begin = labels[i][2:] != entity_label
                    continue
                prefix = 'B' if begin else 'I'
                labels[i] = f'{prefix}-{entity_label}'
                begin = False

        ner_labels.append(labels)
    return ner_labels

# Turn the pipeline output into one tag per word. Each result set is paired
# with the sentences it was computed from (the original paired the test-A
# results with the dev-0 sentences).
predicted_labels_dev0 = map_ner_results(ner_results_dev0, in_data_dev0[0].tolist())
predicted_labels_testA = map_ner_results(ner_results_testA, in_data_testA[0].tolist())

# One space-separated tag line per input document.
predicted_strings_dev0 = [' '.join(labels) for labels in predicted_labels_dev0]
predicted_strings_testA = [' '.join(labels) for labels in predicted_labels_testA]
expected_strings_dev0 = expected_data_dev0[0].tolist()

with open('dev-0/out.tsv', 'w', encoding='utf-8') as f:
    f.writelines(line + '\n' for line in predicted_strings_dev0)

with open('test-A/out.tsv', 'w', encoding='utf-8') as f:
    f.writelines(line + '\n' for line in predicted_strings_testA)

# Token-level agreement with the gold labels on dev-0. Tokens are compared
# position by position; zip stops at the shorter sequence of each pair.
correct = 0
total = 0
for pred, exp in zip(predicted_strings_dev0, expected_strings_dev0):
    pairs = list(zip(pred.split(), exp.split()))
    correct += sum(p == e for p, e in pairs)
    total += len(pairs)

# Guard against an empty dev set instead of raising ZeroDivisionError.
accuracy = correct / total if total else 0.0
print(f"Accuracy - dev-0: {accuracy:.2%}")
Accuracy - dev-0: 94.88%