37 KiB
37 KiB
import pandas as pd
import lzma
from transformers import pipeline
# Load the training split from the xz-compressed TSV.
# (train_data is not referenced again in the visible part of the notebook.)
with lzma.open("train/train.tsv.xz", "rt", encoding="utf-8") as f:
    train_data = pd.read_csv(f, delimiter='\t', header=None)

# dev-0: input texts and their gold label sequences.
in_data_dev0 = pd.read_csv('dev-0/in.tsv', delimiter='\t', header=None)
expected_data_dev0 = pd.read_csv('dev-0/expected.tsv', delimiter='\t', header=None)

# test-A: read the test-A inputs. The original read 'dev-0/in.tsv' here, which
# made the test-A predictions a copy of the dev-0 ones.
in_data_testA = pd.read_csv('test-A/in.tsv', delimiter='\t', header=None)
# NOTE(review): this still loads the dev-0 gold labels under a test-A name.
# It is kept unchanged because it is unknown from here whether
# 'test-A/expected.tsv' exists - confirm, or drop the variable if unused.
expected_data_testA = pd.read_csv('dev-0/expected.tsv', delimiter='\t', header=None)

# Load the NER pipeline. The model and revision are pinned to the values the
# library previously picked by default, and aggregation_strategy="simple"
# replaces the deprecated grouped_entities=True (same grouping behaviour
# according to the deprecation warning emitted by transformers).
ner_pipeline = pipeline(
    "ner",
    model="dbmdz/bert-large-cased-finetuned-conll03-english",
    revision="f2482bf",
    aggregation_strategy="simple",
)

# Run the pipeline over the first column of each in.tsv file.
ner_results_dev0 = ner_pipeline(in_data_dev0[0].tolist())
ner_results_testA = ner_pipeline(in_data_testA[0].tolist())

# Show a few example results.
ner_results_dev0[:5]
No model was supplied, defaulted to dbmdz/bert-large-cased-finetuned-conll03-english and revision f2482bf (https://huggingface.co/dbmdz/bert-large-cased-finetuned-conll03-english). Using a pipeline without specifying a model name and revision in production is not recommended. /home/wmi/miniconda3/envs/pbr/lib/python3.12/site-packages/huggingface_hub/file_download.py:1132: FutureWarning: `resume_download` is deprecated and will be removed in version 1.0.0. Downloads always resume when possible. If you want to force a new download, use `force_download=True`. warnings.warn( Some weights of the model checkpoint at dbmdz/bert-large-cased-finetuned-conll03-english were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight'] - This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model). - This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model). /home/wmi/miniconda3/envs/pbr/lib/python3.12/site-packages/transformers/pipelines/token_classification.py:168: UserWarning: `grouped_entities` is deprecated and will be removed in version v5.0.0, defaulted to `aggregation_strategy="AggregationStrategy.SIMPLE"` instead. warnings.warn(
[[{'entity_group': 'ORG', 'score': 0.34655723, 'word': 'L', 'start': 10, 'end': 11}, {'entity_group': 'MISC', 'score': 0.38114035, 'word': '##A', 'start': 26, 'end': 27}, {'entity_group': 'LOC', 'score': 0.88899577, 'word': 'LONDON', 'start': 71, 'end': 77}, {'entity_group': 'MISC', 'score': 0.9981057, 'word': 'West Indian', 'start': 94, 'end': 105}, {'entity_group': 'PER', 'score': 0.99973667, 'word': 'Phil Simmons', 'start': 118, 'end': 130}, {'entity_group': 'ORG', 'score': 0.99539727, 'word': 'Leicestershire', 'start': 161, 'end': 175}, {'entity_group': 'ORG', 'score': 0.997735, 'word': 'Somerset', 'start': 181, 'end': 189}, {'entity_group': 'ORG', 'score': 0.9995547, 'word': 'Essex', 'start': 351, 'end': 356}, {'entity_group': 'ORG', 'score': 0.9992822, 'word': 'Derbyshire', 'start': 359, 'end': 369}, {'entity_group': 'ORG', 'score': 0.9993387, 'word': 'Surrey', 'start': 374, 'end': 380}, {'entity_group': 'ORG', 'score': 0.9993369, 'word': 'Kent', 'start': 412, 'end': 416}, {'entity_group': 'ORG', 'score': 0.99557805, 'word': 'Nottinghamshire', 'start': 476, 'end': 491}, {'entity_group': 'ORG', 'score': 0.99760664, 'word': 'Somerset', 'start': 513, 'end': 521}, {'entity_group': 'LOC', 'score': 0.99563384, 'word': 'Grace Road', 'start': 559, 'end': 569}, {'entity_group': 'ORG', 'score': 0.9976046, 'word': 'Leicestershire', 'start': 572, 'end': 586}, {'entity_group': 'LOC', 'score': 0.9994592, 'word': 'England', 'start': 664, 'end': 671}, {'entity_group': 'PER', 'score': 0.9892952, 'word': 'Andy Caddick', 'start': 680, 'end': 692}, {'entity_group': 'ORG', 'score': 0.9989544, 'word': 'Somerset', 'start': 738, 'end': 746}, {'entity_group': 'PER', 'score': 0.9996531, 'word': 'Simmons', 'start': 796, 'end': 803}, {'entity_group': 'ORG', 'score': 0.9993154, 'word': 'Essex', 'start': 849, 'end': 854}, {'entity_group': 'PER', 'score': 0.99977463, 'word': 'Nasser Hussain', 'start': 911, 'end': 925}, {'entity_group': 'PER', 'score': 0.9996419, 'word': 'Peter Such', 
'start': 930, 'end': 940}, {'entity_group': 'ORG', 'score': 0.9935796, 'word': 'Yorkshire', 'start': 986, 'end': 995}, {'entity_group': 'LOC', 'score': 0.99555916, 'word': 'Headingley', 'start': 999, 'end': 1009}, {'entity_group': 'PER', 'score': 0.99976915, 'word': 'Hussain', 'start': 1017, 'end': 1024}, {'entity_group': 'LOC', 'score': 0.999605, 'word': 'England', 'start': 1049, 'end': 1056}, {'entity_group': 'ORG', 'score': 0.99933463, 'word': 'Essex', 'start': 1146, 'end': 1151}, {'entity_group': 'ORG', 'score': 0.99699366, 'word': 'Yorkshire', 'start': 1220, 'end': 1229}, {'entity_group': 'PER', 'score': 0.9988619, 'word': 'Such', 'start': 1286, 'end': 1290}, {'entity_group': 'LOC', 'score': 0.9991185, 'word': 'Oval', 'start': 1425, 'end': 1429}, {'entity_group': 'ORG', 'score': 0.9990688, 'word': 'Surrey', 'start': 1432, 'end': 1438}, {'entity_group': 'PER', 'score': 0.9997764, 'word': 'Chris Lewis', 'start': 1447, 'end': 1458}, {'entity_group': 'LOC', 'score': 0.99957126, 'word': 'England', 'start': 1483, 'end': 1490}, {'entity_group': 'ORG', 'score': 0.9987552, 'word': 'Warwickshire', 'start': 1616, 'end': 1628}, {'entity_group': 'ORG', 'score': 0.7200186, 'word': 'S', 'start': 1633, 'end': 1634}, {'entity_group': 'LOC', 'score': 0.9987576, 'word': 'England', 'start': 1658, 'end': 1665}, {'entity_group': 'PER', 'score': 0.9997127, 'word': 'Mark Butcher', 'start': 1674, 'end': 1686}, {'entity_group': 'ORG', 'score': 0.9978284, 'word': 'Surrey', 'start': 1702, 'end': 1708}, {'entity_group': 'ORG', 'score': 0.5290484, 'word': 'S', 'start': 1753, 'end': 1754}, {'entity_group': 'ORG', 'score': 0.9982393, 'word': 'Derbyshire', 'start': 1756, 'end': 1766}, {'entity_group': 'ORG', 'score': 0.999025, 'word': 'Worcestershire', 'start': 1842, 'end': 1856}, {'entity_group': 'ORG', 'score': 0.5895682, 'word': 'S', 'start': 1955, 'end': 1956}, {'entity_group': 'MISC', 'score': 0.9989274, 'word': 'Australian', 'start': 1958, 'end': 1968}, {'entity_group': 'PER', 'score': 
0.9996978, 'word': 'Tom Moody', 'start': 1969, 'end': 1978}, {'entity_group': 'PER', 'score': 0.99972975, 'word': 'Chris Adams', 'start': 1999, 'end': 2010}, {'entity_group': 'PER', 'score': 0.9994934, 'word': 'Tim O', 'start': 2023, 'end': 2028}, {'entity_group': 'PER', 'score': 0.5700783, 'word': 'Go', 'start': 2029, 'end': 2031}, {'entity_group': 'ORG', 'score': 0.9984907, 'word': 'Derbyshire', 'start': 2049, 'end': 2059}, {'entity_group': 'ORG', 'score': 0.4096144, 'word': 'S', 'start': 2103, 'end': 2104}], [{'entity_group': 'MISC', 'score': 0.46102658, 'word': '##IC', 'start': 2, 'end': 4}, {'entity_group': 'MISC', 'score': 0.97070557, 'word': 'ENGL', 'start': 10, 'end': 14}, {'entity_group': 'MISC', 'score': 0.7318856, 'word': '##H COUNTY CHAM', 'start': 16, 'end': 29}, {'entity_group': 'MISC', 'score': 0.66476506, 'word': '##H', 'start': 34, 'end': 35}, {'entity_group': 'LOC', 'score': 0.95440763, 'word': 'LONDON', 'start': 52, 'end': 58}, {'entity_group': 'MISC', 'score': 0.99895835, 'word': 'English', 'start': 110, 'end': 117}, {'entity_group': 'LOC', 'score': 0.99821544, 'word': 'Leicester', 'start': 163, 'end': 172}, {'entity_group': 'ORG', 'score': 0.99364024, 'word': 'Leicestershire', 'start': 175, 'end': 189}, {'entity_group': 'ORG', 'score': 0.99720675, 'word': 'Somerset', 'start': 195, 'end': 203}, {'entity_group': 'ORG', 'score': 0.9979146, 'word': 'Somerset', 'start': 237, 'end': 245}, {'entity_group': 'PER', 'score': 0.96438843, 'word': 'P. 
Simmons', 'start': 259, 'end': 269}, {'entity_group': 'ORG', 'score': 0.9983664, 'word': 'Leicestershire', 'start': 279, 'end': 293}, {'entity_group': 'ORG', 'score': 0.998063, 'word': 'Leicestershire', 'start': 305, 'end': 319}, {'entity_group': 'ORG', 'score': 0.99934703, 'word': 'Somerset', 'start': 332, 'end': 340}, {'entity_group': 'LOC', 'score': 0.96631926, 'word': 'Chester - le - Street', 'start': 350, 'end': 367}, {'entity_group': 'ORG', 'score': 0.9994879, 'word': 'Glamorgan', 'start': 370, 'end': 379}, {'entity_group': 'PER', 'score': 0.9771094, 'word': 'A. Dale', 'start': 394, 'end': 401}, {'entity_group': 'PER', 'score': 0.95903397, 'word': 'H. Morris', 'start': 407, 'end': 416}, {'entity_group': 'PER', 'score': 0.95786047, 'word': 'D. Blenkiron', 'start': 422, 'end': 434}, {'entity_group': 'ORG', 'score': 0.9991248, 'word': 'Durham', 'start': 444, 'end': 450}, {'entity_group': 'PER', 'score': 0.96577376, 'word': 'S. Watkin', 'start': 457, 'end': 466}, {'entity_group': 'LOC', 'score': 0.99620086, 'word': 'Tunbridge Wells', 'start': 490, 'end': 505}, {'entity_group': 'ORG', 'score': 0.999209, 'word': 'Nottinghamshire', 'start': 508, 'end': 523}, {'entity_group': 'PER', 'score': 0.94234514, 'word': 'P. Johnson', 'start': 530, 'end': 540}, {'entity_group': 'PER', 'score': 0.99028325, 'word': 'M. McCague', 'start': 546, 'end': 556}, {'entity_group': 'ORG', 'score': 0.99935454, 'word': 'Kent', 'start': 566, 'end': 570}, {'entity_group': 'LOC', 'score': 0.9988416, 'word': 'London', 'start': 584, 'end': 590}, {'entity_group': 'LOC', 'score': 0.9966791, 'word': 'The Oval', 'start': 593, 'end': 601}, {'entity_group': 'ORG', 'score': 0.99928087, 'word': 'Warwickshire', 'start': 606, 'end': 618}, {'entity_group': 'ORG', 'score': 0.9995234, 'word': 'Surrey', 'start': 625, 'end': 631}, {'entity_group': 'PER', 'score': 0.9265669, 'word': 'C. Lewis', 'start': 640, 'end': 648}, {'entity_group': 'PER', 'score': 0.98573065, 'word': 'M. 
Butcher', 'start': 662, 'end': 672}, {'entity_group': 'PER', 'score': 0.99570525, 'word': 'G. Kersey', 'start': 678, 'end': 687}, {'entity_group': 'PER', 'score': 0.98831654, 'word': 'J. Ratcliffe', 'start': 693, 'end': 705}, {'entity_group': 'PER', 'score': 0.9866266, 'word': 'D. Bicknell', 'start': 711, 'end': 722}, {'entity_group': 'LOC', 'score': 0.9714904, 'word': 'Hove', 'start': 735, 'end': 739}, {'entity_group': 'ORG', 'score': 0.99935216, 'word': 'Sussex', 'start': 742, 'end': 748}, {'entity_group': 'PER', 'score': 0.97753423, 'word': 'W. Athey', 'start': 755, 'end': 763}, {'entity_group': 'PER', 'score': 0.9889897, 'word': 'V. Drakes', 'start': 770, 'end': 779}, {'entity_group': 'PER', 'score': 0.9931459, 'word': 'I. Austin', 'start': 785, 'end': 794}, {'entity_group': 'ORG', 'score': 0.99931324, 'word': 'Lancashire', 'start': 804, 'end': 814}, {'entity_group': 'PER', 'score': 0.97633815, 'word': 'W. Hegg', 'start': 823, 'end': 830}, {'entity_group': 'LOC', 'score': 0.9942986, 'word': 'Portsmouth', 'start': 841, 'end': 851}, {'entity_group': 'ORG', 'score': 0.99923646, 'word': 'Middlesex', 'start': 854, 'end': 863}, {'entity_group': 'PER', 'score': 0.97500485, 'word': 'J. Pooley', 'start': 878, 'end': 887}, {'entity_group': 'PER', 'score': 0.97731185, 'word': 'M. Ramprakash', 'start': 894, 'end': 907}, {'entity_group': 'PER', 'score': 0.9795357, 'word': 'M. Gatting', 'start': 914, 'end': 924}, {'entity_group': 'ORG', 'score': 0.99938464, 'word': 'Hampshire', 'start': 932, 'end': 941}, {'entity_group': 'LOC', 'score': 0.9905847, 'word': 'Chesterfield', 'start': 963, 'end': 975}, {'entity_group': 'ORG', 'score': 0.99893564, 'word': 'Worcestershire', 'start': 978, 'end': 992}, {'entity_group': 'ORG', 'score': 0.9993286, 'word': 'Derbyshire', 'start': 1009, 'end': 1019}, {'entity_group': 'PER', 'score': 0.96917397, 'word': 'J. Adams', 'start': 1026, 'end': 1034}, {'entity_group': 'PER', 'score': 0.9931639, 'word': "T. 
O ' Gorman", 'start': 1041, 'end': 1051}, {'entity_group': 'PER', 'score': 0.97878116, 'word': 'K. Barnett', 'start': 1066, 'end': 1076}, {'entity_group': 'PER', 'score': 0.9884977, 'word': 'T. Moody', 'start': 1082, 'end': 1090}, {'entity_group': 'LOC', 'score': 0.99594116, 'word': 'Bristol', 'start': 1103, 'end': 1110}, {'entity_group': 'ORG', 'score': 0.9994455, 'word': 'Gloucestershire', 'start': 1113, 'end': 1128}, {'entity_group': 'PER', 'score': 0.999629, 'word': 'J', 'start': 1145, 'end': 1146}, {'entity_group': 'PER', 'score': 0.9994917, 'word': 'Russell', 'start': 1148, 'end': 1155}, {'entity_group': 'LOC', 'score': 0.94236225, 'word': 'Northamptonshire', 'start': 1171, 'end': 1187}, {'entity_group': 'PER', 'score': 0.99586487, 'word': 'K', 'start': 1194, 'end': 1195}, {'entity_group': 'PER', 'score': 0.9998486, 'word': 'Curran', 'start': 1197, 'end': 1203}, {'entity_group': 'PER', 'score': 0.9988618, 'word': 'A', 'start': 1209, 'end': 1210}, {'entity_group': 'PER', 'score': 0.9765073, 'word': 'Smith', 'start': 1212, 'end': 1217}, {'entity_group': 'PER', 'score': 0.998814, 'word': 'S', 'start': 1229, 'end': 1230}], [{'entity_group': 'LOC', 'score': 0.96220237, 'word': 'LONDON', 'start': 39, 'end': 45}, {'entity_group': 'LOC', 'score': 0.9997117, 'word': 'Australia', 'start': 62, 'end': 71}, {'entity_group': 'MISC', 'score': 0.9900663, 'word': 'Ashes', 'start': 88, 'end': 93}, {'entity_group': 'LOC', 'score': 0.99974364, 'word': 'England', 'start': 128, 'end': 135}, {'entity_group': 'ORG', 'score': 0.9373318, 'word': 'Test and County Cricket Board', 'start': 201, 'end': 230}, {'entity_group': 'LOC', 'score': 0.9997092, 'word': 'Australia', 'start': 258, 'end': 267}, {'entity_group': 'MISC', 'score': 0.52167755, 'word': 'S', 'start': 440, 'end': 441}, {'entity_group': 'MISC', 'score': 0.99905914, 'word': 'English', 'start': 443, 'end': 450}, {'entity_group': 'MISC', 'score': 0.9493715, 'word': 'British', 'start': 484, 'end': 491}, {'entity_group': 'ORG', 
'score': 0.66527855, 'word': 'Minor Counties', 'start': 551, 'end': 565}, {'entity_group': 'LOC', 'score': 0.7435069, 'word': 'S', 'start': 572, 'end': 573}, {'entity_group': 'LOC', 'score': 0.9995771, 'word': 'Scotland', 'start': 575, 'end': 583}, {'entity_group': 'LOC', 'score': 0.99926525, 'word': 'London', 'start': 639, 'end': 645}, {'entity_group': 'LOC', 'score': 0.97566634, 'word': "Lord ' s", 'start': 670, 'end': 677}, {'entity_group': 'ORG', 'score': 0.98920923, 'word': "Duke of Norfolk ' s XI", 'start': 692, 'end': 713}, {'entity_group': 'LOC', 'score': 0.98959255, 'word': 'Arundel', 'start': 719, 'end': 726}, {'entity_group': 'ORG', 'score': 0.9956369, 'word': 'Northampton', 'start': 743, 'end': 754}, {'entity_group': 'ORG', 'score': 0.9987895, 'word': 'Worcestershire', 'start': 769, 'end': 783}, {'entity_group': 'ORG', 'score': 0.9977308, 'word': 'Durham', 'start': 798, 'end': 804}, {'entity_group': 'LOC', 'score': 0.99277717, 'word': 'Headingley', 'start': 850, 'end': 860}, {'entity_group': 'LOC', 'score': 0.93061477, 'word': 'Leeds', 'start': 868, 'end': 873}, {'entity_group': 'LOC', 'score': 0.9936111, 'word': 'The Oval', 'start': 922, 'end': 930}, {'entity_group': 'LOC', 'score': 0.9980288, 'word': 'London', 'start': 938, 'end': 944}, {'entity_group': 'LOC', 'score': 0.9947086, 'word': "Lord ' s", 'start': 992, 'end': 999}, {'entity_group': 'LOC', 'score': 0.99907875, 'word': 'London', 'start': 1002, 'end': 1008}, {'entity_group': 'ORG', 'score': 0.9987871, 'word': 'Gloucestershire', 'start': 1028, 'end': 1043}, {'entity_group': 'ORG', 'score': 0.9982962, 'word': 'Sussex', 'start': 1047, 'end': 1053}, {'entity_group': 'ORG', 'score': 0.9983968, 'word': 'Surrey', 'start': 1057, 'end': 1063}, {'entity_group': 'ORG', 'score': 0.9983108, 'word': 'Derbyshire', 'start': 1107, 'end': 1117}, {'entity_group': 'LOC', 'score': 0.9901608, 'word': 'Edgbaston', 'start': 1179, 'end': 1188}, {'entity_group': 'LOC', 'score': 0.99937767, 'word': 'Birmingham', 
'start': 1191, 'end': 1201}, {'entity_group': 'LOC', 'score': 0.5605512, 'word': 'S', 'start': 1265, 'end': 1266}, {'entity_group': 'ORG', 'score': 0.9908428, 'word': 'Leicestershire', 'start': 1281, 'end': 1295}, {'entity_group': 'LOC', 'score': 0.7768232, 'word': 'Lord', 'start': 1344, 'end': 1348}, {'entity_group': 'MISC', 'score': 0.6473316, 'word': 'British', 'start': 1372, 'end': 1379}, {'entity_group': 'ORG', 'score': 0.9781204, 'word': 'Oxford', 'start': 1398, 'end': 1404}, {'entity_group': 'ORG', 'score': 0.99740857, 'word': 'Hampshire', 'start': 1438, 'end': 1447}, {'entity_group': 'ORG', 'score': 0.93818957, 'word': 'Old', 'start': 1503, 'end': 1506}, {'entity_group': 'LOC', 'score': 0.60427374, 'word': '##rafford', 'start': 1508, 'end': 1515}, {'entity_group': 'LOC', 'score': 0.88743263, 'word': 'Manchester', 'start': 1518, 'end': 1528}, {'entity_group': 'ORG', 'score': 0.6398575, 'word': 'Minor Counties XI', 'start': 1545, 'end': 1562}, {'entity_group': 'LOC', 'score': 0.9948643, 'word': 'Scotland', 'start': 1578, 'end': 1586}, {'entity_group': 'ORG', 'score': 0.9956975, 'word': 'Glamorgan', 'start': 1605, 'end': 1614}], [{'entity_group': 'PER', 'score': 0.64593357, 'word': 'SHEARER', 'start': 9, 'end': 16}, {'entity_group': 'LOC', 'score': 0.9937872, 'word': 'ENGLAND', 'start': 26, 'end': 33}, {'entity_group': 'LOC', 'score': 0.98532915, 'word': 'LONDON', 'start': 49, 'end': 55}, {'entity_group': 'PER', 'score': 0.99965554, 'word': 'Alan Shearer', 'start': 106, 'end': 118}, {'entity_group': 'LOC', 'score': 0.99965096, 'word': 'England', 'start': 140, 'end': 147}, {'entity_group': 'ORG', 'score': 0.99851114, 'word': 'Newcastle', 'start': 202, 'end': 211}, {'entity_group': 'PER', 'score': 0.99972355, 'word': 'Tony Adams', 'start': 280, 'end': 290}, {'entity_group': 'MISC', 'score': 0.99899167, 'word': 'European', 'start': 321, 'end': 329}, {'entity_group': 'PER', 'score': 0.99968123, 'word': 'David Platt', 'start': 372, 'end': 383}, {'entity_group': 
'PER', 'score': 0.99977285, 'word': 'Adams', 'start': 391, 'end': 396}, {'entity_group': 'PER', 'score': 0.99800897, 'word': 'Platt', 'start': 401, 'end': 406}, {'entity_group': 'LOC', 'score': 0.99970406, 'word': 'England', 'start': 438, 'end': 445}, {'entity_group': 'MISC', 'score': 0.99456084, 'word': 'World Cup', 'start': 457, 'end': 466}, {'entity_group': 'LOC', 'score': 0.9998385, 'word': 'Moldova', 'start': 485, 'end': 492}, {'entity_group': 'PER', 'score': 0.9992155, 'word': 'Shearer', 'start': 510, 'end': 517}, {'entity_group': 'PER', 'score': 0.9990761, 'word': 'Glenn Hoddle', 'start': 571, 'end': 583}, {'entity_group': 'ORG', 'score': 0.9991636, 'word': 'Blackburn', 'start': 621, 'end': 630}, {'entity_group': 'ORG', 'score': 0.99904674, 'word': 'Southampton', 'start': 635, 'end': 646}, {'entity_group': 'PER', 'score': 0.9996246, 'word': 'Alan', 'start': 745, 'end': 749}, {'entity_group': 'PER', 'score': 0.9964572, 'word': 'Hoddle', 'start': 777, 'end': 783}, {'entity_group': 'PER', 'score': 0.9996996, 'word': 'Alan', 'start': 873, 'end': 877}, {'entity_group': 'PER', 'score': 0.99759614, 'word': 'Shearer', 'start': 1172, 'end': 1179}, {'entity_group': 'MISC', 'score': 0.9978334, 'word': 'Euro 96', 'start': 1183, 'end': 1190}, {'entity_group': 'PER', 'score': 0.99211884, 'word': 'Teddy Sheringham', 'start': 1208, 'end': 1224}, {'entity_group': 'PER', 'score': 0.99672574, 'word': 'Shearer', 'start': 1313, 'end': 1320}, {'entity_group': 'ORG', 'score': 0.9980933, 'word': 'Newcastle', 'start': 1324, 'end': 1333}, {'entity_group': 'PER', 'score': 0.9995302, 'word': 'Les Ferdinand', 'start': 1344, 'end': 1357}], [{'entity_group': 'MISC', 'score': 0.6943369, 'word': 'IN', 'start': 13, 'end': 15}, {'entity_group': 'LOC', 'score': 0.9792612, 'word': 'BELGRADE', 'start': 52, 'end': 60}, {'entity_group': 'MISC', 'score': 0.6702504, 'word': 'S', 'start': 106, 'end': 107}, {'entity_group': 'ORG', 'score': 0.9988783, 'word': 'Red Star', 'start': 148, 'end': 156}, 
{'entity_group': 'LOC', 'score': 0.99980396, 'word': 'Yugoslavia', 'start': 159, 'end': 169}, {'entity_group': 'ORG', 'score': 0.99583733, 'word': 'Dinamo', 'start': 177, 'end': 183}, {'entity_group': 'LOC', 'score': 0.99982125, 'word': 'Russia', 'start': 186, 'end': 192}]]
def _word_spans(sentence):
    """Return the (start, end) character offsets of each whitespace-separated word."""
    spans = []
    cursor = 0
    for word in sentence.split():
        # Words come back in order and contain no whitespace, so the first
        # match at or after the cursor is exactly this word's position.
        begin = sentence.index(word, cursor)
        cursor = begin + len(word)
        spans.append((begin, cursor))
    return spans


# Map NER pipeline results onto the B-XXX / I-XXX / O tagging format.
def map_ner_results(ner_results, sentences):
    """Convert character-offset entities into one BIO tag per whitespace word.

    Args:
        ner_results: one list of entity dicts per sentence; each dict must
            provide 'start', 'end' (character offsets into the sentence) and
            'entity_group' (the label written after the 'B-' / 'I-' prefix).
        sentences: the texts the offsets refer to, in the same order.

    Returns:
        A list with one list of tags per sentence; each inner list has as many
        tags as ``sentence.split()`` has words.

    Words are tagged by character overlap with the entity span, so an entity
    that starts or ends in the middle of a word (e.g. a sub-word fragment)
    tags the word that contains it. The previous implementation counted the
    words before the start offset, which shifted such entities onto the
    following word. A word that already carries a tag is never overwritten;
    if that tag has the same label, the remaining words of the current entity
    continue it with 'I-'. Offsets outside the sentence are ignored.
    """
    ner_labels = []
    for sentence, entities in zip(sentences, ner_results):
        spans = _word_spans(sentence)
        labels = ['O'] * len(spans)

        # Lookup table: character position -> index of the word covering it
        # (None for whitespace).
        char_to_word = [None] * len(sentence)
        for word_idx, (begin, stop) in enumerate(spans):
            char_to_word[begin:stop] = [word_idx] * (stop - begin)

        for entity in entities:
            entity_label = entity['entity_group']
            covered = sorted({
                word_idx
                for word_idx in char_to_word[entity['start']:entity['end']]
                if word_idx is not None
            })
            prefix = 'B'
            for word_idx in covered:
                current = labels[word_idx]
                if current != 'O':
                    # Keep the earlier tag. Continue it when the label matches,
                    # otherwise the next free word starts a new entity.
                    prefix = 'I' if current[2:] == entity_label else 'B'
                    continue
                labels[word_idx] = f'{prefix}-{entity_label}'
                prefix = 'I'
        ner_labels.append(labels)
    return ner_labels
# Turn the pipeline output into per-word tags. Each result set must be mapped
# onto the sentences it was computed from: the original passed the dev-0
# sentences for the test-A results.
predicted_labels_dev0 = map_ner_results(ner_results_dev0, in_data_dev0[0].tolist())
predicted_labels_testA = map_ner_results(ner_results_testA, in_data_testA[0].tolist())

# One space-separated tag string per input line.
predicted_strings_dev0 = [' '.join(labels) for labels in predicted_labels_dev0]
predicted_strings_testA = [' '.join(labels) for labels in predicted_labels_testA]
expected_strings_dev0 = expected_data_dev0[0].tolist()

# Write the predictions next to the corresponding inputs, one line per document.
with open('dev-0/out.tsv', 'w', encoding='utf-8') as f:
    f.writelines(line + '\n' for line in predicted_strings_dev0)

with open('test-A/out.tsv', 'w', encoding='utf-8') as f:
    f.writelines(line + '\n' for line in predicted_strings_testA)
# Check how well the predictions agree with the gold labels: token-level
# accuracy over dev-0 (every tag counts, including 'O').
correct = 0
total = 0
length_mismatches = 0
for pred, exp in zip(predicted_strings_dev0, expected_strings_dev0):
    pred_labels = pred.split()
    exp_labels = exp.split()
    if len(pred_labels) != len(exp_labels):
        # zip() below compares only the common prefix; remember that it
        # happened so the reported number is not silently optimistic.
        length_mismatches += 1
    for p, e in zip(pred_labels, exp_labels):
        if p == e:
            correct += 1
        total += 1

# Avoid ZeroDivisionError when there is nothing to compare.
accuracy = correct / total if total else 0.0
if length_mismatches:
    print(f"Warning: {length_mismatches} line(s) differ in token count between "
          "prediction and gold labels; unmatched tokens were ignored")
print(f"Accuracy - dev-0: {accuracy:.2%}")
Accuracy - dev-0: 94.88%