DL_TRANSFORMER/transformer5.ipynb

Imports

from transformers import pipeline
import re
from tqdm import tqdm
import pandas as pd

NER model initialization

nlp = pipeline("ner", model="dbmdz/bert-large-cased-finetuned-conll03-english")
Some weights of the model checkpoint at dbmdz/bert-large-cased-finetuned-conll03-english were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
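
A quick smoke test of the loaded pipeline (the sentence is arbitrary; the call returns a list of dicts with 'entity', 'score', 'index', 'word', 'start' and 'end' keys, as in the full example further below):

nlp("George Washington lived in Virginia.")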

Tokenization methods

def get_word_indices(string_to_search):
    # Find the start index of every whitespace-separated word.
    pattern = r"\s\S"
    matches = re.finditer(pattern, string_to_search)
    indices = [m.start(0) + 1 for m in matches]
    if not string_to_search[0].isspace():
        indices.insert(0, 0)
    return sorted(indices)

def get_word_beginning(string_to_search, letter_index):
    # Walk back from letter_index to the first character of the enclosing word.
    while letter_index > 0 and string_to_search[letter_index - 1] != " ":
        letter_index -= 1
    return letter_index

def wordpiece_tokenization(ner_tokenized, original_sentence):
    word_start_index_to_tag = {}
    formatted_results = []
    previous_tag = "O"

    # Merge WordPiece sub-tokens (e.g. "Hu" + "##gging") back into whole words.
    for result in ner_tokenized:
        word = result["word"].replace("##", "")
        start, end = result["start"], result["start"] + len(word)

        if formatted_results and (original_sentence[result["start"] - 1] != " " or result["word"].startswith("##")):
            formatted_results[-1]["end"] = end
            formatted_results[-1]["word"] += word
        else:
            result["word"] = word
            result["start"] = get_word_beginning(original_sentence, start)
            result["end"] = end
            formatted_results.append(result)

    # Convert the merged entities into IOB tags: B- opens an entity, I- continues it.
    for result in formatted_results:
        start_index = result["start"]
        tag = result["entity"]

        if tag != "O":
            if previous_tag != tag:
                tag = f"B-{tag.split('-')[-1]}"
            else:
                tag = f"I-{tag.split('-')[-1]}"
        word_start_index_to_tag[start_index] = tag
        previous_tag = result["entity"]

    # Every remaining word start (no entity predicted) gets the "O" tag.
    for index in get_word_indices(original_sentence):
        word_start_index_to_tag.setdefault(index, "O")

    return [word_start_index_to_tag[index] for index in sorted(word_start_index_to_tag.keys())]
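
A small sanity check of the two helpers (the sample string is arbitrary):

text = "New York City"
print(get_word_indices(text))       # [0, 4, 9]: the start index of every word
print(get_word_beginning(text, 6))  # 4: index 6 ('r' in "York") maps back to the word start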

Example usage

sequence = "Hugging Face Inc. is a company based in New York City. Its headquarters are in DUMBO, therefore very" \
           "close to the Manhattan Bridge which is visible from the window."
model_out = nlp(sequence)
model_out
[{'entity': 'I-ORG',
  'score': 0.9995635,
  'index': 1,
  'word': 'Hu',
  'start': 0,
  'end': 2},
 {'entity': 'I-ORG',
  'score': 0.99159384,
  'index': 2,
  'word': '##gging',
  'start': 2,
  'end': 7},
 {'entity': 'I-ORG',
  'score': 0.99826705,
  'index': 3,
  'word': 'Face',
  'start': 8,
  'end': 12},
 {'entity': 'I-ORG',
  'score': 0.9994404,
  'index': 4,
  'word': 'Inc',
  'start': 13,
  'end': 16},
 {'entity': 'I-LOC',
  'score': 0.99943465,
  'index': 11,
  'word': 'New',
  'start': 40,
  'end': 43},
 {'entity': 'I-LOC',
  'score': 0.99932706,
  'index': 12,
  'word': 'York',
  'start': 44,
  'end': 48},
 {'entity': 'I-LOC',
  'score': 0.9993864,
  'index': 13,
  'word': 'City',
  'start': 49,
  'end': 53},
 {'entity': 'I-LOC',
  'score': 0.9825622,
  'index': 19,
  'word': 'D',
  'start': 79,
  'end': 80},
 {'entity': 'I-LOC',
  'score': 0.936983,
  'index': 20,
  'word': '##UM',
  'start': 80,
  'end': 82},
 {'entity': 'I-LOC',
  'score': 0.89870995,
  'index': 21,
  'word': '##BO',
  'start': 82,
  'end': 84},
 {'entity': 'I-LOC',
  'score': 0.97582406,
  'index': 29,
  'word': 'Manhattan',
  'start': 113,
  'end': 122},
 {'entity': 'I-LOC',
  'score': 0.99024945,
  'index': 30,
  'word': 'Bridge',
  'start': 123,
  'end': 129}]
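
wordpiece_tokenization merges the sub-token predictions above back into one tag per whitespace-separated word. A quick check (illustrative; the exact tags follow from the model output shown above):

word_tags = wordpiece_tokenization(model_out, sequence)
print(len(word_tags) == len(sequence.split()))  # True: one tag per word
print(word_tags[:3])                            # expected: ['B-ORG', 'I-ORG', 'I-ORG'] for "Hugging", "Face", "Inc."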

File tokenization

def tokenize_file(input_file, output_file):
    # Tag every sentence in input_file and write one line of space-separated tags per sentence.
    with open(input_file, "r", encoding="utf-8") as f:
        original_sentences = f.readlines()

    processed_data = []
    for raw_sentence in tqdm(original_sentences, desc=f"Processing {input_file}"):
        model_out = nlp(raw_sentence.strip())
        word_tokenization = wordpiece_tokenization(model_out, raw_sentence.strip())
        processed_line = " ".join(word_tokenization)
        processed_data.append(processed_line)

    with open(output_file, "w", encoding="utf-8") as f:
        for line in processed_data:
            f.write(f"{line}\n")

Evaluation

tokenize_file("dev-0/in.tsv", "dev-0/out.tsv")
Processing dev-0/in.tsv: 100%|██████████| 215/215 [03:28<00:00,  1.03it/s]
tokenize_file("test-A/in.tsv", "test-A/out.tsv")
Processing test-A/in.tsv: 100%|██████████| 230/230 [03:42<00:00,  1.03it/s]

Label correction

def correct_labels(input_file, output_file):
    # Rewrite raw I- tags to proper IOB: the first tag of every entity span becomes B-.
    df = pd.read_csv(input_file, sep="\t", names=["Text"])

    corrected_lines = []

    for line in df["Text"]:
        tokens = line.split(" ")
        corrected_tokens = []
        previous_token = "O"

        for token in tokens:
            if (
                token == "I-ORG"
                and previous_token != "B-ORG"
                and previous_token != "I-ORG"
            ):
                corrected_tokens.append("B-ORG")
            elif (
                token == "I-PER"
                and previous_token != "B-PER"
                and previous_token != "I-PER"
            ):
                corrected_tokens.append("B-PER")
            elif (
                token == "I-LOC"
                and previous_token != "B-LOC"
                and previous_token != "I-LOC"
            ):
                corrected_tokens.append("B-LOC")
            elif (
                token == "I-MISC"
                and previous_token != "B-MISC"
                and previous_token != "I-MISC"
            ):
                corrected_tokens.append("B-MISC")
            else:
                corrected_tokens.append(token)

            previous_token = token

        corrected_line = " ".join(corrected_tokens)
        corrected_lines.append(corrected_line)

    df["Text"] = corrected_lines
    df.to_csv(output_file, sep="\t", index=False, header=False)
input_file = "test-A/out.tsv"
output_file = "test-A/out.tsv"
correct_labels(input_file, output_file)
input_file = "dev-0/out.tsv"
output_file = "dev-0/out.tsv"
correct_labels(input_file, output_file)
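
The same per-token rule, written compactly so it can be tried on a single line of tags (fix_line is an illustrative helper only, not part of the notebook's pipeline; the input tags are made up):

def fix_line(line):
    fixed, previous = [], "O"
    for token in line.split():
        corrected = token
        # An I-X tag that does not continue an X entity should open one, i.e. become B-X.
        if token.startswith("I-") and previous not in (f"B-{token[2:]}", f"I-{token[2:]}"):
            corrected = f"B-{token[2:]}"
        fixed.append(corrected)
        previous = token
    return " ".join(fixed)

print(fix_line("O I-LOC I-LOC O I-ORG I-ORG"))  # O B-LOC I-LOC O B-ORG I-ORG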

Accuracy calculation

def calculate_accuracy(input_file, expected_file):
    # Re-tag each sentence with the pipeline and compare word-level tags against the gold annotations.
    with open(input_file, "r", encoding="utf-8") as f:
        original_sentences = f.readlines()

    with open(expected_file, "r", encoding="utf-8") as f:
        expected_tags = f.readlines()

    total_tags = 0
    correct_tags = 0

    for raw_sentence, expected_line in tqdm(zip(original_sentences, expected_tags), desc=f"Processing {input_file}", total=len(original_sentences)):
        model_out = nlp(raw_sentence.strip())
        word_tokenization = wordpiece_tokenization(model_out, raw_sentence.strip())
        expected_tags_list = expected_line.strip().split()

        total_tags += len(expected_tags_list)
        correct_tags += sum(p == e for p, e in zip(word_tokenization, expected_tags_list))

    accuracy = correct_tags / total_tags
    print(f"Accuracy: {accuracy:.4f}")

calculate_accuracy("dev-0/in.tsv", "dev-0/expected.tsv")
Processing dev-0/in.tsv: 100%|██████████| 215/215 [03:36<00:00,  1.01s/it]
Accuracy: 0.9236