en-ner-conll-2003/lstm.ipynb
2024-05-16 21:28:49 +02:00

28 KiB

import pandas as pd
import numpy as np

import torch
import torch.nn as nn
import torch.optim as optim

import warnings
warnings.filterwarnings('ignore')

import torchtext
from torchtext.vocab import vocab

from seqeval.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report

from tqdm.notebook import tqdm

from collections import Counter
# Load the data
# NOTE(review): the inputs are assumed to be raw, unquoted TSV in which a double
# quote is an ordinary token. With pandas' default quote handling such a token can
# be swallowed as a field delimiter, which silently breaks the token/tag
# alignment, so quote processing is switched off - confirm against the data files.
import csv

train_data = pd.read_csv('train/train.tsv', delimiter='\t', header=None, quoting=csv.QUOTE_NONE)

valid_data_in = pd.read_csv('dev-0/in.tsv', delimiter='\t', header=None, quoting=csv.QUOTE_NONE)
valid_data_expected = pd.read_csv('dev-0/expected.tsv', delimiter='\t', header=None, quoting=csv.QUOTE_NONE)
# Expected tags first, text second - the same column order as the training file
valid_data = pd.concat([valid_data_expected, valid_data_in], axis=1)

test_data = pd.read_csv('test-A/in.tsv', delimiter='\t', header=None, quoting=csv.QUOTE_NONE)

# Label the columns
train_data.columns = ['ner_tags', 'text']
valid_data.columns = ['ner_tags', 'text']
test_data.columns = ['text']

# Split the text into tokens (whitespace separated)
train_data['text_tokens'] = train_data['text'].apply(lambda x: x.split())
valid_data['text_tokens'] = valid_data['text'].apply(lambda x: x.split())
test_data['text_tokens'] = test_data['text'].apply(lambda x: x.split())

# Split the NER tags into tokens (the test set has no tags)
train_data['ner_tags_tokens'] = train_data['ner_tags'].apply(lambda x: x.split())
valid_data['ner_tags_tokens'] = valid_data['ner_tags'].apply(lambda x: x.split())
# Build the vocabulary from a DataFrame dataset
# Special tokens:
# <unk> - unknown token
# <pad> - padding token
# <bos> - beginning of sentence token
# <eos> - end of sentence token
def build_vocab(dataset):
    """
    Count the tokens of every document and wrap the counts in a vocabulary
    :param dataset: DataFrame with a 'text_tokens' column (one iterable of tokens per row)
    :return: result of vocab(...) built from the token counts and the special tokens
    """
    token_counts = Counter()

    # Feed the documents one by one, in row order
    for tokens in dataset['text_tokens']:
        token_counts.update(tokens)

    special_tokens = ['<unk>', '<pad>', '<bos>', '<eos>']
    return vocab(token_counts, specials=special_tokens)
# Vocabulary built from the training documents
v = build_vocab(train_data)
# Index -> token lookup
itos = v.get_itos()
# Tokens missing from the vocabulary resolve to the <unk> index
v.set_default_index(v["<unk>"])
# Unique NER tags that occur in the training data
ner_tags = {tag for tags in train_data['ner_tags_tokens'] for tag in tags}
# Tag -> index mapping (https://huggingface.co/datasets/conll2003)
ner_tag2idx = {
    tag: index
    for index, tag in enumerate(
        ['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'B-MISC', 'I-MISC']
    )
}

# Index -> tag mapping (inverse of the above)
ner_idx2tag = {index: tag for tag, index in ner_tag2idx.items()}
ner_tag2idx
{'O': 0,
 'B-PER': 1,
 'I-PER': 2,
 'B-ORG': 3,
 'I-ORG': 4,
 'B-LOC': 5,
 'I-LOC': 6,
 'B-MISC': 7,
 'I-MISC': 8}
# Display the index -> tag mapping (notebook cell output follows)
ner_idx2tag
{0: 'O',
 1: 'B-PER',
 2: 'I-PER',
 3: 'B-ORG',
 4: 'I-ORG',
 5: 'B-LOC',
 6: 'I-LOC',
 7: 'B-MISC',
 8: 'I-MISC'}
# Vectorize text data using the vocabulary mapping
def text_to_vec(data):
    """
    Turn tokenized documents into index tensors framed by <bos> and <eos>
    :param data: iterable of documents, each an iterable of tokens
    :return: list with one 1-D torch.long tensor per document
    """
    vectors = []
    for document in data:
        indices = [v['<bos>']]
        indices.extend(v[token] for token in document)
        indices.append(v['<eos>'])
        vectors.append(torch.tensor(indices, dtype=torch.long))
    return vectors
# Vectorize NER tags data using the NER tags mapping
def ner_tags_to_vec(data):
    """
    Turn tag sequences into index tensors with a leading and a trailing 0
    :param data: iterable of documents, each an iterable of NER tags
    :return: list with one 1-D torch.long tensor per document
    """
    vectors = []
    for document in data:
        # Index 0 ('O' in ner_tag2idx) is placed at both ends, matching the two
        # extra positions that text_to_vec adds for <bos> and <eos>
        indices = [0, *(ner_tag2idx[tag] for tag in document), 0]
        vectors.append(torch.tensor(indices, dtype=torch.long))
    return vectors
# Vectorize the text data (input)
X_train, X_dev, X_test = (
    text_to_vec(frame['text_tokens'])
    for frame in (train_data, valid_data, test_data)
)
# Vectorize the NER tags data (output, labels)
y_train, y_dev = (
    ner_tags_to_vec(frame['ner_tags_tokens'])
    for frame in (train_data, valid_data)
)
# Model definition
class LSTM(nn.Module):
    """Sequence tagger: embedding -> ReLU -> single LSTM -> linear layer per step."""

    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim):
        """
        :param vocab_size: number of entries in the embedding table
        :param embedding_dim: size of each embedding vector
        :param hidden_dim: size of the LSTM hidden state
        :param output_dim: number of scores produced for every step
        """
        super().__init__()

        # Token index -> dense vector
        self.embedding = nn.Embedding(vocab_size, embedding_dim)

        # Recurrent layer; batched inputs are laid out batch-first
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)

        # Maps each LSTM output step to the output scores
        self.fc = nn.Linear(hidden_dim, output_dim)

        # Non-linearity applied to the embeddings
        self.relu = nn.ReLU()

    def forward(self, x):
        """
        Compute the per-step scores for the given token indices
        :param x: tensor of token indices (callers in this file pass shape (1, seq_len))
        :return: tensor of scores with output_dim values for every step
        """
        embedded = self.embedding(x)
        activated = self.relu(embedded)

        # Only the per-step outputs are used; the final (hidden, cell) state is dropped
        sequence_output, _ = self.lstm(activated)

        return self.fc(sequence_output)
        
# Seqeval evaluation
def evaluate_model(model, X, y):
    """
    Method for evaluating the model

    The model is switched to evaluation mode while predicting and its previous
    training/evaluation mode is restored afterwards, so the function can be
    called in the middle of a training loop.

    :param model: model
    :param X: input data (iterable of 1-D token index tensors)
    :param y: output data (iterable of 1-D tag index tensors, aligned with X)
    :return: dictionary with metrics values
    """
    # Remember the current mode so that it can be restored after predicting
    was_training = model.training
    model.eval()

    try:
        # No gradients
        with torch.no_grad():
            # Predict the labels: one document at a time, as a batch of size 1
            predicted = [torch.argmax(model(x.unsqueeze(0)).squeeze(0), 1) for x in X]
    finally:
        model.train(was_training)

    # Convert the labels to ner tags
    y_pred = [[ner_idx2tag[int(idx)] for idx in sequence] for sequence in predicted]
    y_tags = [[ner_idx2tag[int(idx)] for idx in sequence] for sequence in y]

    # Calculate the metrics
    accuracy = accuracy_score(y_tags, y_pred)
    precision = precision_score(y_tags, y_pred)
    recall = recall_score(y_tags, y_pred)
    f1 = f1_score(y_tags, y_pred)

    return {'accuracy': accuracy, 'precision': precision, 'recall': recall, 'f1': f1}
# Use GPU if available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# Model parameters
vocab_size = len(v)
embedding_dim = 64
hidden_dim = 256
# One output score per index in ner_tag2idx. The labels are encoded through that
# mapping, so sizing the output layer from it (rather than from the tags that
# happen to occur in the training data) keeps every label index in range.
output_dim = len(ner_tag2idx)
epochs = 20
# Seed for reproducibility
torch.manual_seed(1234)

import random
random.seed(1234)

np.random.seed(1234)
# Initialize the model
model = LSTM(vocab_size, embedding_dim, hidden_dim, output_dim)
# Loss function and optimizer (Adam with its default settings)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters())
# Move training to GPU
model = model.to(device)
X_train = [x.to(device) for x in X_train]
y_train = [y.to(device) for y in y_train]
X_dev = [x.to(device) for x in X_dev]
y_dev = [y.to(device) for y in y_dev]
# Training loop
model.train()

for epoch in range(epochs):

    # One optimizer step per training document (batch of size 1)
    for tokens, gold in tqdm(zip(X_train, y_train), total=len(X_train)):
        # Reset the gradients left over from the previous step
        optimizer.zero_grad()

        # Forward pass on a batch holding the single document
        scores = model(tokens.unsqueeze(0))

        # Loss over all positions of the document
        loss = criterion(scores.squeeze(0), gold)

        # Backward pass followed by the weight update
        loss.backward()
        optimizer.step()

    # Dev-set metrics after every epoch
    metrics = evaluate_model(model, X_dev, y_dev)

    print(f'Epoch: {epoch+1}, Accuracy: {metrics["accuracy"]}, Precision: {metrics["precision"]}, Recall: {metrics["recall"]}, F1: {metrics["f1"]}')
  0%|          | 0/945 [00:00<?, ?it/s]
Epoch: 1, Accuracy: 0.9545313667936774, Precision: 0.7780607604147717, Recall: 0.7213695395513577, F1: 0.7486434447750743
# Evaluate the trained model on the dev set once more (notebook cell output follows)
evaluate_model(model, X_dev, y_dev)
{'accuracy': 0.9545313667936774,
 'precision': 0.7780607604147717,
 'recall': 0.7213695395513577,
 'f1': 0.7486434447750743}
# Move to CPU
model = model.to('cpu')
X_dev = [tokens.to('cpu') for tokens in X_dev]
y_dev = [labels.to('cpu') for labels in y_dev]
# Predict the labels for the validation and test sets
with torch.no_grad():
    y_dev_pred = [model(tokens.unsqueeze(0)).squeeze(0).argmax(dim=1) for tokens in X_dev]
    y_test_pred = [model(tokens.unsqueeze(0)).squeeze(0).argmax(dim=1) for tokens in X_test]

# Convert the label indices to ner tags
y_dev_pred = [[ner_idx2tag[int(index)] for index in sequence] for sequence in y_dev_pred]
y_test_pred = [[ner_idx2tag[int(index)] for index in sequence] for sequence in y_test_pred]
# Join the predicted tags of each document, dropping the first and the last
# position (the special tokens <bos> and <eos>)
y_dev_pred_con = [' '.join(tags[1:-1]) for tags in y_dev_pred]
y_test_pred_con = [' '.join(tags[1:-1]) for tags in y_test_pred]
# Save the predictions (without postprocessing)
pd.DataFrame(y_dev_pred_con).to_csv('dev-0/out-model.tsv', header=False, index=False, sep='\t')
pd.DataFrame(y_test_pred_con).to_csv('test-A/out-model.tsv', header=False, index=False, sep='\t')
# Postprocessing
# Finder for I-tags that start a sequence (should be B-tags)
def incorrect_I_as_begin_tag(text):
    """
    Yield a match for every I-tag that does not continue an entity

    An I-<type> tag is reported when it is the first tag of the text or when
    the tag before it is neither a B-tag nor an I-tag (entity types are not
    compared here).

    The previous single-regex version relied on a variable-width look-behind,
    which the standard `re` module rejects, and `re` is not imported in the
    visible part of this file. Walking the whitespace-separated tags needs only
    the standard library.

    :param text: whitespace-separated sequence of tags, e.g. 'O I-PER I-PER'
    :return: iterator of match objects; match.start() is the offset of the
             reported tag's leading 'I' in text
    """
    import re  # local import: `re` is not imported at the top of this file

    tag_pattern = re.compile(r'[BI]-\w+')
    previous = ''

    for token in re.finditer(r'\S+', text):
        current = token.group()
        if (current.startswith('I-')
                and tag_pattern.fullmatch(current)
                and not tag_pattern.fullmatch(previous)):
            yield token
        previous = current

# Helper method for replacing I-tags that start a sequence with B-tags
def replace_incorrect_I_as_begin_tag(df):
    """
    Rewrite every I-tag reported by incorrect_I_as_begin_tag as a B-tag

    Passes over the 'ner_tags' column are repeated until one of them changes
    nothing. The column is rebuilt and assigned back explicitly: writing to the
    rows handed out by DataFrame.iterrows() is not guaranteed to update the
    frame, and if such writes are lost the loop would never terminate.

    :param df: DataFrame with a 'ner_tags' column of tag strings; modified in place
    :return: the same DataFrame
    """
    iteration = 0

    # Iterate until no more changes
    while True:
        iteration += 1
        print(f"Iteration: {iteration}")

        changes = 0
        updated = []

        for tags in df['ner_tags']:
            chars = list(tags)

            # Offsets refer to the unmodified string; swapping one character
            # for another keeps them valid
            for match in incorrect_I_as_begin_tag(tags):
                chars[match.start()] = 'B'
                changes += 1

            updated.append("".join(chars))

        df['ner_tags'] = updated

        print(f"Changes: {changes}")

        if changes == 0:
            break

    return df

# Finder for inconsistent I-tags after B-tags (I-tags that are not continuation of B-tags)
def inconsistent_I_after_B(text):
    """
    Yield a match for every I-tag whose type differs from the B-tag right before it

    The previous single-regex version relied on a variable-width look-behind
    containing a capture group, which the standard `re` module rejects, and
    `re` is not imported in the visible part of this file. Walking the
    whitespace-separated tags needs only the standard library. Entity types
    are compared for exact equality.

    :param text: whitespace-separated sequence of tags, e.g. 'B-PER I-LOC'
    :return: iterator of match objects; match.start() is the offset of the
             reported tag's leading 'I' in text
    """
    import re  # local import: `re` is not imported at the top of this file

    begin_pattern = re.compile(r'B-(\w+)')
    inside_pattern = re.compile(r'I-(\w+)')
    previous = ''

    for token in re.finditer(r'\S+', text):
        current = token.group()
        begin = begin_pattern.fullmatch(previous)
        inside = inside_pattern.fullmatch(current)
        if begin and inside and begin.group(1) != inside.group(1):
            yield token
        previous = current

# Helper method for removing inconsistent I-tags after B-tags
def replace_inconsistent_I_after_B(df):
    """
    Rewrite every I-tag reported by inconsistent_I_after_B as a B-tag

    Passes over the 'ner_tags' column are repeated until one of them changes
    nothing. The column is rebuilt and assigned back explicitly: writing to the
    rows handed out by DataFrame.iterrows() is not guaranteed to update the
    frame, and if such writes are lost the loop would never terminate.

    :param df: DataFrame with a 'ner_tags' column of tag strings; modified in place
    :return: the same DataFrame
    """
    iteration = 0

    # Iterate until no more changes
    while True:
        iteration += 1
        print(f"Iteration: {iteration}")

        changes = 0
        updated = []

        for tags in df['ner_tags']:
            chars = list(tags)

            # Offsets refer to the unmodified string; swapping one character
            # for another keeps them valid
            for match in inconsistent_I_after_B(tags):
                chars[match.start()] = 'B'
                changes += 1

            updated.append("".join(chars))

        df['ner_tags'] = updated

        print(f"Changes: {changes}")

        if changes == 0:
            break

    return df

# Finder for inconsistent I-tags after other I-tags (I-tags that are not continuation of the same tag)
def inconsistent_I_after_I(text):
    """
    Yield a match for every I-tag whose type differs from the I-tag right before it

    The previous single-regex version relied on a variable-width look-behind
    containing a capture group, which the standard `re` module rejects, and
    `re` is not imported in the visible part of this file. Walking the
    whitespace-separated tags needs only the standard library. Entity types
    are compared for exact equality.

    :param text: whitespace-separated sequence of tags, e.g. 'I-PER I-LOC'
    :return: iterator of match objects; match.start() is the offset of the
             reported tag's leading 'I' in text
    """
    import re  # local import: `re` is not imported at the top of this file

    inside_pattern = re.compile(r'I-(\w+)')
    previous = ''

    for token in re.finditer(r'\S+', text):
        current = token.group()
        before = inside_pattern.fullmatch(previous)
        inside = inside_pattern.fullmatch(current)
        if before and inside and before.group(1) != inside.group(1):
            yield token
        previous = current

# Helper method for removing inconsistent I-tags after other I-tags
def replace_inconsistent_I_after_I(df):
    """
    Rewrite every I-tag reported by inconsistent_I_after_I as a B-tag

    Passes over the 'ner_tags' column are repeated until one of them changes
    nothing. The column is rebuilt and assigned back explicitly: writing to the
    rows handed out by DataFrame.iterrows() is not guaranteed to update the
    frame, and if such writes are lost the loop would never terminate.

    :param df: DataFrame with a 'ner_tags' column of tag strings; modified in place
    :return: the same DataFrame
    """
    iteration = 0

    # Iterate until no more changes
    while True:
        iteration += 1
        print(f"Iteration: {iteration}")

        changes = 0
        updated = []

        for tags in df['ner_tags']:
            chars = list(tags)

            # Offsets refer to the unmodified string; swapping one character
            # for another keeps them valid
            for match in inconsistent_I_after_I(tags):
                chars[match.start()] = 'B'
                changes += 1

            updated.append("".join(chars))

        df['ner_tags'] = updated

        print(f"Changes: {changes}")

        if changes == 0:
            break

    return df
# Read the raw model predictions back from disk
out_dev = pd.read_csv('dev-0/out-model.tsv', delimiter='\t', header=None)
out_test = pd.read_csv('test-A/out-model.tsv', delimiter='\t', header=None)

out_dev.columns = ['ner_tags']
out_test.columns = ['ner_tags']
# Postprocessing: the three tag repairs run in this order, first on the whole
# dev set and then on the whole test set
postprocessing_steps = (
    replace_incorrect_I_as_begin_tag,
    replace_inconsistent_I_after_B,
    replace_inconsistent_I_after_I,
)

for step in postprocessing_steps:
    out_dev = step(out_dev)

for step in postprocessing_steps:
    out_test = step(out_test)
Iteration: 1
Changes: 100
Iteration: 2
Changes: 0
Iteration: 1
Changes: 113
Iteration: 2
Changes: 4
Iteration: 3
Changes: 0
Iteration: 1
Changes: 18
Iteration: 2
Changes: 0
Iteration: 1
Changes: 105
Iteration: 2
Changes: 0
Iteration: 1
Changes: 111
Iteration: 2
Changes: 5
Iteration: 3
Changes: 0
Iteration: 1
Changes: 22
Iteration: 2
Changes: 0
# Save the predictions (with postprocessing)
for frame, path in ((out_dev, 'dev-0/out.tsv'), (out_test, 'test-A/out.tsv')):
    frame.to_csv(path, header=False, index=False, sep='\t')
# Evaluation: load the expected dev tags
in_dev = pd.read_csv('dev-0/expected.tsv', delimiter='\t', header=None)
in_dev.columns = ['ner_tags']

GEVAL F1-BIO (dev): 0.74864