s464953_uczenie_glebokie_tr.../transformer.ipynb

import re

import pandas as pd
from transformers import pipeline

# Token-classification pipeline with a BERT model fine-tuned on CoNLL-2003 NER.
ner_pipeline = pipeline("ner", model="dbmdz/bert-large-cased-finetuned-conll03-english")
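
# Aside: recent transformers releases can merge wordpieces automatically via the
# `aggregation_strategy` argument, which would replace the manual merging below.
# A minimal sketch (assumes a transformers version that supports it):
# grouped_pipeline = pipeline(
#     "ner",
#     model="dbmdz/bert-large-cased-finetuned-conll03-english",
#     aggregation_strategy="simple",
# )
# grouped_pipeline(input_text)  # entries carry 'entity_group' and a merged 'word'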

input_text = "CRICKET - LEICESTERSHIRE TAKE OVER AT TOP AFTER INNINGS VICTORY . </S> LONDON 1996-08-30 </S> West Indian all-rounder Phil"

def predict_and_combine(text):
    """Run NER and merge BERT wordpieces ("##"-prefixed tokens) back into words."""
    ner_results = ner_pipeline(text)
    combined_tokens = []
    combined_labels = []
    current_word = ""
    current_label = None

    for result in ner_results:
        token = result['word']
        label = result['entity']
        if token.startswith("##"):
            # Continuation wordpiece: glue it onto the word being built.
            current_word += token[2:]
        else:
            # A new word starts: flush the previous one first.
            if current_word:
                combined_tokens.append(current_word)
                combined_labels.append(current_label)
            current_word = token
            current_label = label

    # Flush the last word.
    if current_word:
        combined_tokens.append(current_word)
        combined_labels.append(current_label)

    return combined_tokens, combined_labels

tokens, labels = predict_and_combine(input_text)

print(f"Sentence: {input_text}")
print("Tokens:", tokens)
print("Labels:", labels)
Some weights of the model checkpoint at dbmdz/bert-large-cased-finetuned-conll03-english were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Sentence: CRICKET - LEICESTERSHIRE TAKE OVER AT TOP AFTER INNINGS VICTORY . </S> LONDON 1996-08-30 </S> West Indian all-rounder Phil
Tokens: ['L', 'LONDON', 'West', 'Indian', 'Phil']
Labels: ['I-PER', 'I-LOC', 'I-MISC', 'I-MISC', 'I-PER']
def find_word_starts(text):
    # Indices of the first character of every whitespace-separated word.
    indices = [match.start() + 1 for match in re.finditer(r"\s\S", text)]
    if not text[0].isspace():
        indices.insert(0, 0)
    return sorted(indices)

def find_word_start(text, index):
    # Walk left from `index` to the start of the surrounding word.
    while index > 0 and text[index - 1] != " ":
        index -= 1
    return index
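
# Illustrative check of the word-boundary helpers:
print(find_word_starts("West Indian all-rounder"))  # [0, 5, 12]
print(find_word_start("West Indian", 7))            # 5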

def merge_wordpieces(ner_tokens, original_sentence):
    # Merge pipeline tokens back into whole words, then emit one tag per
    # whitespace-separated word of the original sentence ("O" for untagged words).
    results = []
    for token in ner_tokens:
        if token['word'].startswith("##") and results and token['start'] == results[-1]['end']:
            # Continuation wordpiece adjacent to the previous token: extend it.
            results[-1]['end'] = token['end']
            results[-1]['word'] += token['word'][2:]
        else:
            if results and not original_sentence[token['start'] - 1].isspace():
                # Token starts mid-word (no space before it): still part of the previous word.
                results[-1]['end'] = token['end']
                results[-1]['word'] += token['word']
            else:
                # Fresh word: snap its start index back to the word boundary.
                token['start'] = find_word_start(original_sentence, token['start'])
                results.append(token)

    # Map word-start index -> tag; fill every remaining word with "O".
    word_start_to_tag = {result['start']: result['entity'] for result in results}
    for index in find_word_starts(original_sentence):
        if index not in word_start_to_tag:
            word_start_to_tag[index] = "O"

    return [word_start_to_tag[index] for index in sorted(word_start_to_tag.keys())]

def predict_and_merge(text):
    # Run the pipeline only; wordpiece merging happens later via merge_wordpieces.
    return ner_pipeline(text)
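
# Quick sanity check (illustrative): the merged tags should line up one-to-one
# with the whitespace-split words of the sentence.
sample_tags = merge_wordpieces(predict_and_merge(input_text), input_text)
print(len(sample_tags), len(input_text.split()))  # the two counts should match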
dev_data = pd.read_csv("dev-0/in.tsv", sep="\t", names=["Text"])
dev_labels = pd.read_csv("dev-0/expected.tsv", sep="\t", names=["Label"])

dev_data["NER_Results"] = dev_data["Text"].apply(predict_and_merge)
processed_data = []

for model_out, raw_sentence in zip(dev_data["NER_Results"], dev_data["Text"]):
    merged_tokens = merge_wordpieces(model_out, raw_sentence)
    processed_data.append(" ".join(merged_tokens))

    # Every word in the input must receive exactly one tag.
    if len(merged_tokens) != len(raw_sentence.split()):
        raise AssertionError("tag/word count mismatch")

with open("dev-0/out_unprocessed.tsv", "w", encoding="utf-8") as f:
    for line in processed_data:
        f.write(f"{line}\n")
from sklearn.metrics import accuracy_score

# Compare per-token tags; the first field of every line is dropped on both sides.
with open('dev-0/out.tsv', 'r') as file:
    predicted_labels = [line.strip().split()[1:] for line in file]

with open('dev-0/expected.tsv', 'r') as file:
    true_labels = [line.strip().split()[1:] for line in file]

# Flatten the per-sentence lists into one label sequence each.
predicted_labels = [label for sublist in predicted_labels for label in sublist]
true_labels = [label for sublist in true_labels for label in sublist]

accuracy = accuracy_score(true_labels, predicted_labels)
print("Accuracy:", accuracy)
Accuracy: 0.8418625244437885
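
# Token-level accuracy is dominated by the majority "O" tag, so a per-class
# breakdown is more informative. A minimal sketch, reusing the flattened label
# lists from above (assumes scikit-learn is available):
from sklearn.metrics import classification_report
print(classification_report(true_labels, predicted_labels, zero_division=0))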
test_data = pd.read_csv("test-A/in.tsv", sep="\t", names=["Text"])

test_data["NER_Results"] = test_data["Text"].apply(predict_and_merge)
processed_data = []

for model_out, raw_sentence in zip(test_data["NER_Results"], test_data["Text"]):
    merged_tokens = merge_wordpieces(model_out, raw_sentence)
    processed_data.append(" ".join(merged_tokens))

    # Every word in the input must receive exactly one tag.
    if len(merged_tokens) != len(raw_sentence.split()):
        raise AssertionError("tag/word count mismatch")

with open("test-A/out_unprocessed.tsv", "w", encoding="utf-8") as f:
    for line in processed_data:
        f.write(f"{line}\n")