Imports
from transformers import pipeline
import re
from tqdm import tqdm
import pandas as pd
NER model initialization
nlp = pipeline("ner", model="dbmdz/bert-large-cased-finetuned-conll03-english")
No model was supplied, defaulted to dbmdz/bert-large-cased-finetuned-conll03-english and revision f2482bf (https://huggingface.co/dbmdz/bert-large-cased-finetuned-conll03-english). Using a pipeline without specifying a model name and revision in production is not recommended.
C:\Users\adamw\PycharmProjects\pythonProject\venv\lib\site-packages\huggingface_hub\file_download.py:1132: FutureWarning: `resume_download` is deprecated and will be removed in version 1.0.0. Downloads always resume when possible. If you want to force a new download, use `force_download=True`.
  warnings.warn(
Some weights of the model checkpoint at dbmdz/bert-large-cased-finetuned-conll03-english were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
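For reference, a quick sanity check of the raw pipeline output (the sentence below is made up for illustration): the "ner" pipeline returns one dictionary per WordPiece sub-token, with word, entity, start, and end fields, which is why the merging step implemented next is needed.
sample = nlp("Nikola Tesla moved to Colorado Springs")
for token in sample:
    # One line per sub-token; a word split by WordPiece appears as several "##" pieces.
    print(token["word"], token["entity"], token["start"], token["end"])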
Tokenization helpers
def get_word_indices(string_to_search):
    # Return the indices of characters that start a whitespace-delimited word.
    pattern = r"\s\S"
    matches = re.finditer(pattern, string_to_search)
    indices = [m.start(0) + 1 for m in matches]
    if not string_to_search[0].isspace():
        indices.insert(0, 0)
    return sorted(indices)
def get_word_beginning(string_to_search, letter_index):
    while letter_index > 0 and string_to_search[letter_index - 1] != " ":
        letter_index -= 1
    return letter_index
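A small illustration of the two helpers (the example string is hypothetical):
text = "John lives in Berlin"
print(get_word_indices(text))        # word-start indices: [0, 5, 11, 14]
print(get_word_beginning(text, 16))  # index 16 is inside "Berlin", so 14 is returned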
def wordpiece_tokenization(ner_tokenized, original_sentence):
    word_start_index_to_tag = {}
    formatted_results = []
    previous_tag = "O"
    # Merge WordPiece sub-tokens back into whole words.
    for result in ner_tokenized:
        word = result["word"].replace("##", "")
        start, end = result["start"], result["start"] + len(word)
        # A piece glued to the previous one (no preceding space, or a "##" piece)
        # continues the current word.
        if formatted_results and (original_sentence[result["start"] - 1] != " " or result["word"].startswith("##")):
            formatted_results[-1]["end"] = end
            formatted_results[-1]["word"] += word
        else:
            result["word"] = word
            result["start"] = get_word_beginning(original_sentence, start)
            result["end"] = end
            formatted_results.append(result)
    # Convert the model's tags to BIO: the first tag of an entity span becomes B-,
    # continuations of the same tag become I-.
    for result in formatted_results:
        start_index = result["start"]
        tag = result["entity"]
        if tag != "O":
            if previous_tag != tag:
                tag = f"B-{tag.split('-')[-1]}"
            else:
                tag = f"I-{tag.split('-')[-1]}"
        word_start_index_to_tag[start_index] = tag
        previous_tag = result["entity"]
    # Words the model did not tag default to "O".
    for index in get_word_indices(original_sentence):
        word_start_index_to_tag.setdefault(index, "O")
    return [word_start_index_to_tag[index] for index in sorted(word_start_index_to_tag.keys())]
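Putting the pieces together on a single sentence (a hypothetical example; the exact tags depend on what the model predicts):
sentence = "John lives in Berlin"
print(wordpiece_tokenization(nlp(sentence), sentence))
# expected to look something like: ['B-PER', 'O', 'O', 'B-LOC']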
File tokenization
def tokenize_file(input_file, output_file):
    with open(input_file, "r", encoding="utf-8") as f:
        original_sentences = f.readlines()
    processed_data = []
    for raw_sentence in tqdm(original_sentences, desc=f"Processing {input_file}"):
        model_out = nlp(raw_sentence.strip())
        word_tokenization = wordpiece_tokenization(model_out, raw_sentence.strip())
        processed_line = " ".join(word_tokenization)
        processed_data.append(processed_line)
    with open(output_file, "w", encoding="utf-8") as f:
        for line in processed_data:
            f.write(f"{line}\n")
Evaluation
tokenize_file("dev-0/in.tsv", "dev-0/out.tsv")
Processing dev-0/in.tsv: 100%|██████████| 215/215 [03:28<00:00, 1.03it/s]
tokenize_file("test-A/in.tsv", "test-A/out.tsv")
Processing test-A/in.tsv: 100%|██████████| 230/230 [03:42<00:00, 1.03it/s]
Label correction
def correct_labels(input_file, output_file):
    df = pd.read_csv(input_file, sep="\t", names=["Text"])
    corrected_lines = []
    for line in df["Text"]:
        corrected_tokens = []
        previous_token = "O"
        for token in line.split(" "):
            entity = token[2:]
            # An I- tag that does not continue a span of the same entity type
            # must start a new entity, so promote it to B-.
            if token in ("I-ORG", "I-PER", "I-LOC", "I-MISC") and previous_token not in (f"B-{entity}", f"I-{entity}"):
                corrected_tokens.append(f"B-{entity}")
            else:
                corrected_tokens.append(token)
            previous_token = token
        corrected_lines.append(" ".join(corrected_tokens))
    df["Text"] = corrected_lines
    df.to_csv(output_file, sep="\t", index=False, header=False)
input_file = "test-A/out.tsv"
output_file = "test-A/out.tsv"
correct_labels(input_file, output_file)
input_file = "dev-0/out.tsv"
output_file = "dev-0/out.tsv"
correct_labels(input_file, output_file)
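A quick check of the repair rule on a hand-written tag line (tmp_in.tsv and tmp_out.tsv are hypothetical scratch files, not part of the datasets):
from pathlib import Path
Path("tmp_in.tsv").write_text("O I-ORG I-ORG O I-PER\n", encoding="utf-8")
correct_labels("tmp_in.tsv", "tmp_out.tsv")
print(Path("tmp_out.tsv").read_text(encoding="utf-8").strip())
# expected: O B-ORG I-ORG O B-PER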
Computing accuracy
def calculate_accuracy(input_file, expected_file):
    with open(input_file, "r", encoding="utf-8") as f:
        original_sentences = f.readlines()
    with open(expected_file, "r", encoding="utf-8") as f:
        expected_tags = f.readlines()
    total_tags = 0
    correct_tags = 0
    for raw_sentence, expected_line in tqdm(zip(original_sentences, expected_tags), desc=f"Processing {input_file}", total=len(original_sentences)):
        # Re-run the tagger on each sentence and compare the tags token by token.
        model_out = nlp(raw_sentence.strip())
        word_tokenization = wordpiece_tokenization(model_out, raw_sentence.strip())
        expected_tags_list = expected_line.strip().split()
        total_tags += len(expected_tags_list)
        correct_tags += sum(p == e for p, e in zip(word_tokenization, expected_tags_list))
    accuracy = correct_tags / total_tags
    print(f"Accuracy: {accuracy:.4f}")
calculate_accuracy("dev-0/in.tsv", "dev-0/expected.tsv")
Processing dev-0/in.tsv: 100%|██████████| 215/215 [03:36<00:00, 1.01s/it]
Accuracy: 0.9236