# 7.7 KiB (notebook-export artifact: file-size lines, not code)
# Imports (the duplicated `from transformers import pipeline` was removed).
import re

import pandas as pd
from transformers import pipeline

# Token-classification (NER) pipeline; BERT-large fine-tuned on CoNLL-2003 English.
ner_pipeline = pipeline("ner", model="dbmdz/bert-large-cased-finetuned-conll03-english")

# Sample CoNLL-style input; "</S>" markers are sentence separators kept verbatim
# from the dataset, not HTML.
input_text = "CRICKET - LEICESTERSHIRE TAKE OVER AT TOP AFTER INNINGS VICTORY . </S> LONDON 1996-08-30 </S> West Indian all-rounder Phil"
def predict_and_combine(text):
    """Run NER over *text* and merge WordPiece sub-tokens back into words.

    Returns a pair ``(words, labels)`` where each reassembled word carries
    the entity label of its first sub-token.  Only tokens the pipeline
    actually tagged appear in the output.
    """
    words = []
    labels = []
    pending_word = ""
    pending_label = None
    for prediction in ner_pipeline(text):
        piece = prediction['word']
        tag = prediction['entity']
        if piece.startswith("##"):
            # Continuation piece: glue it onto the word being assembled.
            pending_word += piece[2:]
            continue
        # A fresh word begins — flush the previously assembled one first.
        if pending_word:
            words.append(pending_word)
            labels.append(pending_label)
        pending_word = piece
        pending_label = tag
    if pending_word:
        words.append(pending_word)
        labels.append(pending_label)
    return words, labels
# Demonstrate the sub-token merging on the sample sentence.
tokens, labels = predict_and_combine(input_text)
print(f"Sentence: {input_text}")
for heading, values in (("Tokens:", tokens), ("Labels:", labels)):
    print(heading, values)
# Captured cell output (notebook artifact), kept for reference:
# Some weights of the model checkpoint at dbmdz/bert-large-cased-finetuned-conll03-english were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight'] - This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture.
# Sentence: CRICKET - LEICESTERSHIRE TAKE OVER AT TOP AFTER INNINGS VICTORY . </S> LONDON 1996-08-30 </S> West Indian all-rounder Phil
# Tokens: ['L', 'LONDON', 'West', 'Indian', 'Phil'] Labels: ['I-PER', 'I-LOC', 'I-MISC', 'I-MISC', 'I-PER']
def find_word_starts(text):
    """Return the sorted character indices where whitespace-delimited words begin.

    A word start is index 0 (when the text does not begin with whitespace)
    or any non-whitespace character immediately preceded by whitespace.

    Fix over the original: an empty *text* now returns [] instead of
    raising IndexError on ``text[0]``.
    """
    indices = [match.start() + 1 for match in re.finditer(r"\s\S", text)]
    if text and not text[0].isspace():
        indices.insert(0, 0)
    return sorted(indices)
def find_word_start(text, index):
    """Walk left from *index* to the start of the word containing it.

    Only the literal space character counts as a delimiter here, matching
    the behavior relied on by callers.
    """
    start = index
    while start and text[start - 1] != " ":
        start -= 1
    return start
def merge_wordpieces(ner_tokens, original_sentence):
    """Convert raw pipeline token predictions into one tag per whitespace word.

    WordPiece continuations ("##…" pieces adjacent to the previous token)
    and tokens that abut the previous token with no intervening whitespace
    are folded into the preceding entry; each merged entry keeps the entity
    of its first piece.  Every word of *original_sentence* with no entity
    prediction is tagged "O".  Returns the tags ordered by word start offset.

    Fixes over the original: the caller's token dicts are no longer mutated
    in place, and a token starting at offset 0 can no longer index
    ``original_sentence[-1]`` (wrapping to the last character).
    """
    merged = []
    for raw in ner_tokens:
        token = dict(raw)  # copy so the caller's dicts stay intact
        if token['word'].startswith("##") and merged and token['start'] == merged[-1]['end']:
            # WordPiece continuation glued directly to the previous piece.
            merged[-1]['end'] = token['end']
            merged[-1]['word'] += token['word'][2:]
        elif merged and token['start'] > 0 and not original_sentence[token['start'] - 1].isspace():
            # No whitespace before this token: it belongs to the previous word.
            # (start > 0 guard prevents wrapping to the sentence's last char.)
            merged[-1]['end'] = token['end']
            merged[-1]['word'] += token['word']
        else:
            # New word: snap the start offset back to the word boundary.
            token['start'] = find_word_start(original_sentence, token['start'])
            merged.append(token)
    word_start_to_tag = {entry['start']: entry['entity'] for entry in merged}
    # Fill every word with no prediction with the outside label.
    for start in find_word_starts(original_sentence):
        word_start_to_tag.setdefault(start, "O")
    return [word_start_to_tag[start] for start in sorted(word_start_to_tag)]
def predict_and_merge(text):
    """Run the NER pipeline on *text* and return its raw token predictions.

    Despite the name, no merging happens here — `merge_wordpieces` is
    applied to the result by the callers below.
    """
    predictions = ner_pipeline(text)
    return predictions
# Dev split: one space-separated sentence per line; gold labels in a parallel file.
dev_data = pd.read_csv("dev-0/in.tsv", sep="\t", names=["Text"])
dev_labels = pd.read_csv("dev-0/expected.tsv", sep="\t", names=["Label"])

# Run the NER pipeline on every sentence (slow: one forward pass per row).
dev_data["NER_Results"] = dev_data["Text"].apply(predict_and_merge)

processed_data = []
for i, (model_out, raw_sentence) in enumerate(zip(dev_data["NER_Results"], dev_data["Text"])):
    merged_tokens = merge_wordpieces(model_out, raw_sentence)
    # Sanity check BEFORE appending: exactly one tag per whitespace word.
    # (The original raised a messageless AssertionError after appending.)
    if len(merged_tokens) != len(raw_sentence.split()):
        raise AssertionError(
            f"row {i}: {len(merged_tokens)} tags for {len(raw_sentence.split())} words"
        )
    processed_data.append(" ".join(merged_tokens))

with open("dev-0/out_unprocessed.tsv", "w", encoding="utf-8") as f:
    f.writelines(f"{line}\n" for line in processed_data)
from sklearn.metrics import accuracy_score

# NOTE(review): this reads dev-0/out.tsv while the step above wrote
# out_unprocessed.tsv — presumably a post-processing step produces out.tsv;
# confirm the pipeline between the two files.
# NOTE(review): [1:] drops the first whitespace field of every line —
# verify this matches the file layout (e.g. a leading id/count column).
with open('dev-0/out.tsv', 'r', encoding='utf-8') as file:
    predicted_labels = [line.strip().split()[1:] for line in file]
with open('dev-0/expected.tsv', 'r', encoding='utf-8') as file:
    true_labels = [line.strip().split()[1:] for line in file]

# Flatten the per-sentence label lists into single sequences for token-level accuracy.
predicted_labels = [label for sublist in predicted_labels for label in sublist]
true_labels = [label for sublist in true_labels for label in sublist]

accuracy = accuracy_score(true_labels, predicted_labels)
print("Accuracy:", accuracy)
# Captured cell output (notebook artifact): Accuracy: 0.8418625244437885
# Apply the same prediction + merging procedure to the test split.
# NOTE(review): the variable is still named `dev_data` although it now holds
# test-A data — kept unchanged so any later code referencing it still works.
dev_data = pd.read_csv("test-A/in.tsv", sep="\t", names=["Text"])
dev_data["NER_Results"] = dev_data["Text"].apply(predict_and_merge)

processed_data = []
for i, (model_out, raw_sentence) in enumerate(zip(dev_data["NER_Results"], dev_data["Text"])):
    merged_tokens = merge_wordpieces(model_out, raw_sentence)
    # Validate BEFORE appending, and say which row failed (the original
    # raised a messageless AssertionError after appending the bad row).
    if len(merged_tokens) != len(raw_sentence.split()):
        raise AssertionError(
            f"row {i}: {len(merged_tokens)} tags for {len(raw_sentence.split())} words"
        )
    processed_data.append(" ".join(merged_tokens))

with open("test-A/out_unprocessed.tsv", "w", encoding="utf-8") as f:
    f.writelines(f"{line}\n" for line in processed_data)