import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import warnings
warnings.filterwarnings('ignore')
import torchtext
from torchtext.vocab import vocab
from seqeval.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
from tqdm.notebook import tqdm
from collections import Counter
# Load the data
train_data = pd.read_csv('train/train.tsv', delimiter='\t', header=None)
valid_data_in = pd.read_csv('dev-0/in.tsv', delimiter='\t', header=None)
valid_data_expected = pd.read_csv('dev-0/expected.tsv', delimiter='\t', header=None)
valid_data = pd.concat([valid_data_expected, valid_data_in], axis=1)
test_data = pd.read_csv('test-A/in.tsv', delimiter='\t', header=None)
# Label the columns
train_data.columns = ['ner_tags', 'text']
valid_data.columns = ['ner_tags', 'text']
test_data.columns = ['text']
# Split the text into tokens
train_data['text_tokens'] = train_data['text'].apply(lambda x: x.split())
valid_data['text_tokens'] = valid_data['text'].apply(lambda x: x.split())
test_data['text_tokens'] = test_data['text'].apply(lambda x: x.split())
# Split the NER tags into tokens
train_data['ner_tags_tokens'] = train_data['ner_tags'].apply(lambda x: x.split())
valid_data['ner_tags_tokens'] = valid_data['ner_tags'].apply(lambda x: x.split())
# Method for building the vocabulary from DataFrame dataset
# Special tokens:
# <unk> - unknown token
# <pad> - padding token
# <bos> - beginning of sentence token
# <eos> - end of sentence token
def build_vocab(dataset):
    # Initialize the counter
    counter = Counter()
    # Iterate over the dataset and update the counter
    for idx, document in dataset.iterrows():
        counter.update(document['text_tokens'])
    # Return the vocabulary
    return vocab(counter, specials=['<unk>', '<pad>', '<bos>', '<eos>'])
# Build the vocabulary
v = build_vocab(train_data)
# Mapping from index to token
itos = v.get_itos()
# Set default index for unknown tokens
v.set_default_index(v["<unk>"])
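# Quick sanity check of the vocabulary (a sketch; exact indices of corpus tokens depend on the data,
# only the special-token positions are assumed here from the default special_first=True ordering)
print(len(v))                                          # vocabulary size
print(v['<unk>'], v['<pad>'], v['<bos>'], v['<eos>'])  # expected: 0 1 2 3
print(v['xyz-token-not-in-the-corpus'])                # unknown tokens fall back to the <unk> index (0)
print(itos[:6])                                        # the special tokens followed by corpus tokens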
# Get the unique ner tags
ner_tags = set([tag for tags in train_data['ner_tags_tokens'] for tag in tags])
# Mapping from tag to index (https://huggingface.co/datasets/conll2003)
ner_tag2idx = {'O': 0, 'B-PER': 1, 'I-PER': 2, 'B-ORG': 3, 'I-ORG': 4, 'B-LOC': 5, 'I-LOC': 6, 'B-MISC': 7, 'I-MISC': 8}
# reverse mapping
ner_idx2tag = {idx: tag for tag, idx in ner_tag2idx.items()}
ner_tag2idx
{'O': 0, 'B-PER': 1, 'I-PER': 2, 'B-ORG': 3, 'I-ORG': 4, 'B-LOC': 5, 'I-LOC': 6, 'B-MISC': 7, 'I-MISC': 8}
ner_idx2tag
{0: 'O', 1: 'B-PER', 2: 'I-PER', 3: 'B-ORG', 4: 'I-ORG', 5: 'B-LOC', 6: 'I-LOC', 7: 'B-MISC', 8: 'I-MISC'}
# Method for vectorizing text data using the vocabulary mapping
def text_to_vec(data):
    return [torch.tensor([v['<bos>']] + [v[token] for token in document] + [v['<eos>']], dtype=torch.long) for document in data]
# Method for vectorizing NER tags data using the NER tags mapping
def ner_tags_to_vec(data):
    return [torch.tensor([0] + [ner_tag2idx[tag] for tag in document] + [0], dtype=torch.long) for document in data]
# Vectorize the text data (input)
X_train = text_to_vec(train_data['text_tokens'])
X_dev = text_to_vec(valid_data['text_tokens'])
X_test = text_to_vec(test_data['text_tokens'])
# Vectorize the NER tags data (output, labels)
y_train = ner_tags_to_vec(train_data['ner_tags_tokens'])
y_dev = ner_tags_to_vec(valid_data['ner_tags_tokens'])
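# Optional round-trip check on the first training example (a sketch; relies only on the
# itos and ner_idx2tag mappings defined above)
print([itos[int(idx)] for idx in X_train[0]])         # should start with <bos> and end with <eos>
print([ner_idx2tag[int(idx)] for idx in y_train[0]])  # special positions carry the 'O' tag (index 0)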
# Model definition
class LSTM(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim):
        super(LSTM, self).__init__()
        # Embedding layer
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        # LSTM layer
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
        # Fully connected layer
        self.fc = nn.Linear(hidden_dim, output_dim)
        self.relu = nn.ReLU()

    def forward(self, x):
        # Embedding
        embedding = self.relu(self.embedding(x))
        # LSTM
        output, (hidden, cell) = self.lstm(embedding)
        # Fully connected
        output = self.fc(output)
        return output
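# Shape check with a dummy model and made-up sizes (a sketch, independent of the real vocabulary):
# the model maps (batch, seq_len) token indices to (batch, seq_len, output_dim) tag scores
_check_model = LSTM(vocab_size=100, embedding_dim=8, hidden_dim=16, output_dim=9)
_check_input = torch.randint(0, 100, (1, 12))   # one sentence of 12 token indices
print(_check_model(_check_input).shape)         # expected: torch.Size([1, 12, 9])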
# Seqeval evaluation
def evaluate_model(model, X, y):
    """
    Method for evaluating the model
    :param model: model
    :param X: input data
    :param y: output data (labels)
    :return: dictionary with metrics values
    """
    # No gradients
    with torch.no_grad():
        # Predict the labels
        y_pred = [torch.argmax(model(x.unsqueeze(0)).squeeze(0), 1) for x in X]
        # Convert the label indices to NER tags
        y_pred = [[ner_idx2tag[int(idx)] for idx in seq] for seq in y_pred]
        y_tags = [[ner_idx2tag[int(idx)] for idx in seq] for seq in y]
        # Calculate the metrics
        accuracy = accuracy_score(y_tags, y_pred)
        precision = precision_score(y_tags, y_pred)
        recall = recall_score(y_tags, y_pred)
        f1 = f1_score(y_tags, y_pred)
        return {'accuracy': accuracy, 'precision': precision, 'recall': recall, 'f1': f1}
# Use GPU if available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# Model parameters
vocab_size = len(v)
embedding_dim = 64
hidden_dim = 256
output_dim = len(ner_tags)
epochs = 20
# Seed for reproducibility
torch.manual_seed(1234)
import random
random.seed(1234)
np.random.seed(1234)
# Initialize the model
model = LSTM(vocab_size, embedding_dim, hidden_dim, output_dim)
# Loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters())
# Move training to GPU
model = model.to(device)
X_train = [x.to(device) for x in X_train]
y_train = [y.to(device) for y in y_train]
X_dev = [x.to(device) for x in X_dev]
y_dev = [y.to(device) for y in y_dev]
# Training loop
model.train()
for epoch in range(epochs):
    for idx in tqdm(range(len(X_train))):
        # Zero the gradients
        optimizer.zero_grad()
        # Forward pass
        output = model(X_train[idx].unsqueeze(0))
        # Calculate the loss
        loss = criterion(output.squeeze(0), y_train[idx])
        # Backward pass
        loss.backward()
        # Update the weights
        optimizer.step()
    # Evaluate the model on the dev set
    metrics = evaluate_model(model, X_dev, y_dev)
    print(f'Epoch: {epoch+1}, Accuracy: {metrics["accuracy"]}, Precision: {metrics["precision"]}, Recall: {metrics["recall"]}, F1: {metrics["f1"]}')
Epoch: 1, Accuracy: 0.9545313667936774, Precision: 0.7780607604147717, Recall: 0.7213695395513577, F1: 0.7486434447750743
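# Note: the loop above trains on one sentence at a time (batch size 1). For reference, a mini-batched
# variant is sketched below (an alternative, not what was run here): inputs are padded with the <pad>
# index and labels with -100 so that CrossEntropyLoss(ignore_index=-100) skips the padded positions.
from torch.nn.utils.rnn import pad_sequence

def make_batches(X, y, batch_size=32):
    for i in range(0, len(X), batch_size):
        xb = pad_sequence(X[i:i + batch_size], batch_first=True, padding_value=v['<pad>'])
        yb = pad_sequence(y[i:i + batch_size], batch_first=True, padding_value=-100)
        yield xb, yb

# batched_criterion = nn.CrossEntropyLoss(ignore_index=-100)
# for xb, yb in make_batches(X_train, y_train):
#     optimizer.zero_grad()
#     logits = model(xb)                                              # (batch, seq_len, output_dim)
#     loss = batched_criterion(logits.reshape(-1, output_dim), yb.reshape(-1))
#     loss.backward()
#     optimizer.step()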
evaluate_model(model, X_dev, y_dev)
{'accuracy': 0.9545313667936774, 'precision': 0.7780607604147717, 'recall': 0.7213695395513577, 'f1': 0.7486434447750743}
# Move to CPU
model = model.to('cpu')
X_dev = [x.to('cpu') for x in X_dev]
y_dev = [y.to('cpu') for y in y_dev]
# Predict the labels for the validation and test sets
with torch.no_grad():
    y_dev_pred = [torch.argmax(model(x.unsqueeze(0)).squeeze(0), 1) for x in X_dev]
    y_test_pred = [torch.argmax(model(x.unsqueeze(0)).squeeze(0), 1) for x in X_test]
# Convert the labels to ner tags
y_dev_pred = [[ner_idx2tag[int(idx)] for idx in y] for y in y_dev_pred]
y_test_pred = [[ner_idx2tag[int(idx)] for idx in y] for y in y_test_pred]
# Concatenate predicted labels (skip the special tokens <bos> and <eos>)
y_dev_pred_con = [' '.join(y[1:-1]) for y in y_dev_pred]
y_test_pred_con = [' '.join(y[1:-1]) for y in y_test_pred]
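# Optional: per-entity breakdown on the dev set with seqeval's classification_report (a sketch;
# assumes the first and last predicted tags correspond to <bos>/<eos> and drops them)
print(classification_report(valid_data['ner_tags_tokens'].tolist(), [y[1:-1] for y in y_dev_pred]))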
# Save the predictions (without postprocessing)
pd.DataFrame(y_dev_pred_con).to_csv('dev-0/out-model.tsv', header=False, index=False, sep='\t')
pd.DataFrame(y_test_pred_con).to_csv('test-A/out-model.tsv', header=False, index=False, sep='\t')
# Postprocessing
# Note: the look-behind patterns below are variable-width, which the standard library `re` module
# rejects; the third-party `regex` module (imported under the same name) supports them.
import regex as re
# Regex for finding I-tags that start a sequence (should be B-tags)
def incorrect_I_as_begin_tag(text):
    return re.finditer(r'(?<![BI]-\w+ )I-\w+', text)
# Helper method for replacing I-tags that start a sequence with B-tags
def replace_incorrect_I_as_begin_tag(df):
    # Iterate until no more changes
    i = 0
    while True:
        outer_counter_old = 0
        outer_counter = 0
        print(f"Iteration: {i+1}")
        for idx, row in df.iterrows():
            x = incorrect_I_as_begin_tag(row['ner_tags'])
            inner_counter = 0
            for match in x:
                inner_counter += 1
                hp = list(row['ner_tags'])
                hp[match.start()] = 'B'
                row['ner_tags'] = "".join(hp)
                # Write back explicitly, since iterrows may yield copies rather than views
                df.at[idx, 'ner_tags'] = row['ner_tags']
            outer_counter += inner_counter
        print(f"Changes: {outer_counter - outer_counter_old}")
        i += 1
        if outer_counter_old == outer_counter:
            break
        else:
            outer_counter_old = outer_counter
    return df
# Regex for finding inconsistent I-tags after B-tags (I-tags that are not continuation of B-tags)
def inconsistent_I_after_B(text):
    return re.finditer(r'(?<=B-(\w+) )(?:I-(?!\1)\w+)', text)
# Helper method for fixing inconsistent I-tags after B-tags (converts them to B-tags)
def replace_inconsistent_I_after_B(df):
    # Iterate until no more changes
    i = 0
    while True:
        outer_counter_old = 0
        outer_counter = 0
        print(f"Iteration: {i+1}")
        for idx, row in df.iterrows():
            matches = inconsistent_I_after_B(row['ner_tags'])
            inner_counter = 0
            for match in matches:
                inner_counter += 1
                hp = list(row['ner_tags'])
                hp[match.start()] = 'B'
                row['ner_tags'] = "".join(hp)
                # Write back explicitly, since iterrows may yield copies rather than views
                df.at[idx, 'ner_tags'] = row['ner_tags']
            outer_counter += inner_counter
        print(f"Changes: {outer_counter - outer_counter_old}")
        i += 1
        if outer_counter_old == outer_counter:
            break
        else:
            outer_counter_old = outer_counter
    return df
# Regex for finding inconsistent I-tags after other I-tags (I-tags that are not continuation of the same tag)
def inconsistent_I_after_I(text):
    return re.finditer(r'(?<=I-(\w+) )(?:I-(?!\1)\w+)', text)
# Helper method for fixing inconsistent I-tags after other I-tags (converts them to B-tags)
def replace_inconsistent_I_after_I(df):
    # Iterate until no more changes
    i = 0
    while True:
        outer_counter_old = 0
        outer_counter = 0
        print(f"Iteration: {i+1}")
        for idx, row in df.iterrows():
            matches = inconsistent_I_after_I(row['ner_tags'])
            inner_counter = 0
            for match in matches:
                inner_counter += 1
                hp = list(row['ner_tags'])
                hp[match.start()] = 'B'
                row['ner_tags'] = "".join(hp)
                # Write back explicitly, since iterrows may yield copies rather than views
                df.at[idx, 'ner_tags'] = row['ner_tags']
            outer_counter += inner_counter
        print(f"Changes: {outer_counter - outer_counter_old}")
        i += 1
        if outer_counter_old == outer_counter:
            break
        else:
            outer_counter_old = outer_counter
    return df
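# Tiny demonstration of the three fixes on made-up tag sequences (a sketch, not part of the pipeline)
_demo = pd.DataFrame({'ner_tags': ['O I-PER I-PER O',        # I-PER starting an entity     -> B-PER
                                   'B-ORG I-LOC O',          # mismatched I-LOC after B-ORG -> B-LOC
                                   'B-PER I-PER I-ORG']})    # mismatched I-ORG after I-PER -> B-ORG
_demo = replace_incorrect_I_as_begin_tag(_demo)
_demo = replace_inconsistent_I_after_B(_demo)
_demo = replace_inconsistent_I_after_I(_demo)
print(_demo['ner_tags'].tolist())
# expected: ['O B-PER I-PER O', 'B-ORG B-LOC O', 'B-PER I-PER B-ORG']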
# Load the predictions
out_dev = pd.read_csv('dev-0/out-model.tsv', delimiter='\t', header=None)
out_dev.columns = ['ner_tags']
out_test = pd.read_csv('test-A/out-model.tsv', delimiter='\t', header=None)
out_test.columns = ['ner_tags']
# Postprocessing
out_dev = replace_incorrect_I_as_begin_tag(out_dev)
out_dev = replace_inconsistent_I_after_B(out_dev)
out_dev = replace_inconsistent_I_after_I(out_dev)
out_test = replace_incorrect_I_as_begin_tag(out_test)
out_test = replace_inconsistent_I_after_B(out_test)
out_test = replace_inconsistent_I_after_I(out_test)
Iteration: 1
Changes: 100
Iteration: 2
Changes: 0
Iteration: 1
Changes: 113
Iteration: 2
Changes: 4
Iteration: 3
Changes: 0
Iteration: 1
Changes: 18
Iteration: 2
Changes: 0
Iteration: 1
Changes: 105
Iteration: 2
Changes: 0
Iteration: 1
Changes: 111
Iteration: 2
Changes: 5
Iteration: 3
Changes: 0
Iteration: 1
Changes: 22
Iteration: 2
Changes: 0
# Save the predictions (with postprocessing)
out_dev.to_csv('dev-0/out.tsv', header=False, index=False, sep='\t')
out_test.to_csv('test-A/out.tsv', header=False, index=False, sep='\t')
# Evaluation
in_dev = pd.read_csv('dev-0/expected.tsv', delimiter='\t', header=None)
in_dev.columns = ['ner_tags']
GEVAL F1-BIO (dev): 0.74864