In [1]:
import pandas as pd
import lzma
from transformers import pipeline

with lzma.open("train/train.tsv.xz", "rt") as f:
    train_data = pd.read_csv(f, delimiter='\t', header=None)

in_data_dev0 = pd.read_csv('dev-0/in.tsv', delimiter='\t', header=None)
expected_data_dev0 = pd.read_csv('dev-0/expected.tsv', delimiter='\t', header=None)

in_data_testA = pd.read_csv('dev-0/in.tsv', delimiter='\t', header=None)
expected_data_testA = pd.read_csv('dev-0/expected.tsv', delimiter='\t', header=None)


In [2]:
# Załadowanie pipeline do NER
ner_pipeline = pipeline("ner", grouped_entities=True)

# Przetworzenie danych z in.tsv przy użyciu pipeline
ner_results_dev0 = ner_pipeline(in_data_dev0[0].tolist())
ner_results_testA = ner_pipeline(in_data_testA[0].tolist())

# Wyświetlenie przykładowych wyników
ner_results_dev0[:5]

No model was supplied, defaulted to dbmdz/bert-large-cased-finetuned-conll03-english and revision f2482bf (https://huggingface.co/dbmdz/bert-large-cased-finetuned-conll03-english).
Using a pipeline without specifying a model name and revision in production is not recommended.
Some weights of the model checkpoint at dbmdz/bert-large-cased-finetuned-conll03-english were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


[[{'entity_group': 'ORG',
   'score': 0.34655723,
   'word': 'L',
   'start': 10,
   'end': 11},
  {'entity_group': 'MISC',
   'score': 0.38114035,
   'word': '##A',
   'start': 26,
   'end': 27},
  {'entity_group': 'LOC',
   'score': 0.88899577,
   'word': 'LONDON',
   'start': 71,
   'end': 77},
  {'entity_group': 'MISC',
   'score': 0.9981057,
   'word': 'West Indian',
   'start': 94,
   'end': 105},
  {'entity_group': 'PER',
   'score': 0.99973667,
   'word': 'Phil Simmons',
   'start': 118,
   'end': 130},
  {'entity_group': 'ORG',
   'score': 0.99539727,
   'word': 'Leicestershire',
   'start': 161,
   'end': 175},
  {'entity_group': 'ORG',
   'score': 0.997735,
   'word': 'Somerset',
   'start': 181,
   'end': 189},
  {'entity_group': 'ORG',
   'score': 0.9995547,
   'word': 'Essex',
   'start': 351,
   'end': 356},
  {'entity_group': 'ORG',
   'score': 0.9992822,
   'word': 'Derbyshire',
   'start': 359,
   'end': 369},
  {'entity_group': 'ORG',
   'score': 0.9993387,
   'word'

In [3]:
# Funkcja do mapowania wyników NER na format B-XXX, I-XXX, O
def map_ner_results(ner_results, sentences):
    ner_labels = []

    for sentence, entities in zip(sentences, ner_results):
        words = sentence.split()
        labels = ['O'] * len(words)

        for entity in entities:
            start_idx = entity['start']
            end_idx = entity['end']
            entity_label = entity['entity_group']
            entity_words = sentence[start_idx:end_idx].split()
            start_word_idx = len(sentence[:start_idx].split())
            end_word_idx = start_word_idx + len(entity_words)

            if start_word_idx < len(labels) and end_word_idx <= len(labels):
                labels[start_word_idx] = f'B-{entity_label}'

                for i in range(start_word_idx + 1, end_word_idx):
                    labels[i] = f'I-{entity_label}'

        ner_labels.append(labels)
    return ner_labels

predicted_labels_dev0 = map_ner_results(ner_results_dev0, in_data_dev0[0].tolist())
predicted_labels_testA = map_ner_results(ner_results_testA, in_data_dev0[0].tolist())

predicted_strings_dev0 = [' '.join(labels) for labels in predicted_labels_dev0]
predicted_strings_testA = [' '.join(labels) for labels in predicted_labels_testA]
expected_strings_dev0 = expected_data_dev0[0].tolist()

with open('dev-0/out.tsv', 'w') as f:
    for line in predicted_strings_dev0:
        f.write(line + '\n')

with open('test-A/out.tsv', 'w') as f:
    for line in predicted_strings_testA:
        f.write(line + '\n')

# Sprawdzenie zgodności wyników
correct = 0
total = 0
for pred, exp in zip(predicted_strings_dev0, expected_strings_dev0):
    pred_labels = pred.split()
    exp_labels = exp.split()
    for p, e in zip(pred_labels, exp_labels):
        if p == e:
            correct += 1
        total += 1

accuracy = correct / total
print(f"Accuracy - dev-0: {accuracy:.2%}")

Accuracy - dev-0: 94.88%
