import re
import warnings
from collections import Counter

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
import torchtext
from seqeval.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
from torchtext.vocab import vocab
from tqdm.notebook import tqdm

# Read the tab-separated splits; none of the files has a header row
train_data = pd.read_csv('train/train.tsv', sep='\t', header=None)

# The dev split keeps inputs and expected tags in separate files; the tags are
# placed first so the column order is the same as in the training frame
valid_data_in = pd.read_csv('dev-0/in.tsv', sep='\t', header=None)
valid_data_expected = pd.read_csv('dev-0/expected.tsv', sep='\t', header=None)
valid_data = pd.concat([valid_data_expected, valid_data_in], axis=1)

test_data = pd.read_csv('test-A/in.tsv', sep='\t', header=None)

# Name the columns (the test split has no tags)
for frame in (train_data, valid_data):
    frame.columns = ['ner_tags', 'text']
test_data.columns = ['text']

# Whitespace-tokenize the text of every split
for frame in (train_data, valid_data, test_data):
    frame['text_tokens'] = frame['text'].apply(lambda line: line.split())

# Whitespace-tokenize the tag sequences of the labelled splits
for frame in (train_data, valid_data):
    frame['ner_tags_tokens'] = frame['ner_tags'].apply(lambda line: line.split())
# Method for building the vocabulary from DataFrame dataset
# Special tokens:
# <unk> - unknown token
# <pad> - padding token
# <bos> - beginning of sentence token
# <eos> - end of sentence token
def build_vocab(dataset):
    """
    Method for building the vocabulary from the tokenized texts of a dataset
    :param dataset: DataFrame with a 'text_tokens' column (one list of tokens per row)
    :return: torchtext vocabulary of all counted tokens plus the special tokens
             '<unk>', '<pad>', '<bos>' and '<eos>'
    """
    # Count every token occurrence across all documents
    counter = Counter()
    for tokens in dataset['text_tokens']:
        counter.update(tokens)
    # Return the vocabulary
    return vocab(counter, specials=['<unk>', '<pad>', '<bos>', '<eos>'])
# Build the vocabulary
v = build_vocab(train_data)
# Mapping from index to token
itos = v.get_itos()
# Set default index for unknown tokens, so that looking up a token that was not
# seen in the training data yields the '<unk>' index instead of raising an error
v.set_default_index(v['<unk>'])
# Get the unique ner tags
ner_tags = {tag for tags in train_data['ner_tags_tokens'] for tag in tags}
# Tag-to-index mapping; a tag's index is its position in this list
# (label order as in https://huggingface.co/datasets/conll2003)
ner_tag2idx = {
    tag: idx
    for idx, tag in enumerate(
        ['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'B-MISC', 'I-MISC']
    )
}

# Index-to-tag mapping (inverse of ner_tag2idx)
ner_idx2tag = dict(zip(ner_tag2idx.values(), ner_tag2idx.keys()))
# Notebook output (ner_tag2idx):
# {'O': 0,
#  'B-PER': 1,
#  'I-PER': 2,
#  'B-ORG': 3,
#  'I-ORG': 4,
#  'B-LOC': 5,
#  'I-LOC': 6,
#  'B-MISC': 7,
#  'I-MISC': 8}
# Notebook output (ner_idx2tag):
# {0: 'O',
#  1: 'B-PER',
#  2: 'I-PER',
#  3: 'B-ORG',
#  4: 'I-ORG',
#  5: 'B-LOC',
#  6: 'I-LOC',
#  7: 'B-MISC',
#  8: 'I-MISC'}
# Method for vectorizing text data using the vocabulary mapping
def text_to_vec(data):
    """
    Method for converting tokenized documents into tensors of vocabulary indices
    :param data: iterable of documents, each an iterable of tokens
    :return: list with one torch.long tensor per document, holding the '<bos>' index,
             the index of every token and the '<eos>' index
    """
    vectors = []
    for document in data:
        indices = [v['<bos>']]
        indices.extend(v[token] for token in document)
        indices.append(v['<eos>'])
        vectors.append(torch.tensor(indices, dtype=torch.long))
    return vectors
# Method for vectorizing NER tags data using the NER tags mapping
def ner_tags_to_vec(data):
    """
    Method for converting tag sequences into tensors of tag indices
    :param data: iterable of documents, each an iterable of NER tags
    :return: list with one torch.long tensor per document; index 0 is added at both
             ends so the length matches the text vector with its '<bos>'/'<eos>' slots
    """
    vectors = []
    for document in data:
        indices = [0]
        indices.extend(ner_tag2idx[tag] for tag in document)
        indices.append(0)
        vectors.append(torch.tensor(indices, dtype=torch.long))
    return vectors
# Inputs: vocabulary indices of the tokens of every split
X_train, X_dev, X_test = (
    text_to_vec(frame['text_tokens']) for frame in (train_data, valid_data, test_data)
)
# Labels: tag indices of the splits that come with expected tags
y_train, y_dev = (
    ner_tags_to_vec(frame['ner_tags_tokens']) for frame in (train_data, valid_data)
)
# Model definition
class LSTM(nn.Module):
    """
    Sequence tagger: embedding -> ReLU -> LSTM -> linear layer applied at every position

    :param vocab_size: number of rows of the embedding table
    :param embedding_dim: size of a token embedding (input size of the LSTM)
    :param hidden_dim: size of the LSTM hidden state
    :param output_dim: number of scores produced for each position
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim):
        super().__init__()
        self.embedding = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embedding_dim)
        # batch_first=True: the LSTM takes the batch dimension first
        self.lstm = nn.LSTM(input_size=embedding_dim, hidden_size=hidden_dim, batch_first=True)
        self.fc = nn.Linear(in_features=hidden_dim, out_features=output_dim)
        self.relu = nn.ReLU()

    def forward(self, x):
        """
        Compute the scores for every position of the input
        :param x: tensor of token indices
        :return: tensor of scores with output_dim values per position
        """
        embedded = self.embedding(x)
        activated = self.relu(embedded)
        # Only the per-position outputs are used; the final (hidden, cell) state is dropped
        per_position, _ = self.lstm(activated)
        return self.fc(per_position)
# Seqeval evaluation
def evaluate_model(model, X, y):
    """
    Method for evaluating the model
    :param model: model; it is called on each input with a batch dimension of one added
    :param X: input data (iterable of tensors of token indices)
    :param y: output data (iterable of tensors of expected tag indices, aligned with X)
    :return: dictionary with metrics values ('accuracy', 'precision', 'recall', 'f1')
    """
    # No gradients
    with torch.no_grad():
        # Predict the labels: the highest-scoring tag index for every position
        predictions = [torch.argmax(model(x.unsqueeze(0)).squeeze(0), 1) for x in X]
    # Convert the labels to ner tags
    pred_tags = [[ner_idx2tag[int(idx)] for idx in sequence] for sequence in predictions]
    true_tags = [[ner_idx2tag[int(idx)] for idx in sequence] for sequence in y]
    # Calculate the metrics
    return {
        'accuracy': accuracy_score(true_tags, pred_tags),
        'precision': precision_score(true_tags, pred_tags),
        'recall': recall_score(true_tags, pred_tags),
        'f1': f1_score(true_tags, pred_tags),
    }
# Use GPU if available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# Model parameters
vocab_size = len(v)
embedding_dim = 64
hidden_dim = 256
# One output score per entry of the tag mapping, so every label index is a valid
# class even if some tag never occurs in the training data
output_dim = len(ner_tag2idx)
epochs = 20
# Seed for reproducibility

import random

seed = 42  # arbitrary fixed value
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)

# Initialize the model
model = LSTM(vocab_size, embedding_dim, hidden_dim, output_dim)
# Loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters())
# Move training to GPU
model = model.to(device)
X_train = [x.to(device) for x in X_train]
y_train = [y.to(device) for y in y_train]
X_dev = [x.to(device) for x in X_dev]
y_dev = [y.to(device) for y in y_dev]
# Training loop: one optimizer step per training document

for epoch in range(epochs):
    for tokens, tags in tqdm(zip(X_train, y_train), total=len(X_train)):
        # Zero the gradients left over from the previous step
        optimizer.zero_grad()
        # Forward pass (a batch dimension of one is added for the model)
        output = model(tokens.unsqueeze(0))

        # Calculate the loss
        loss = criterion(output.squeeze(0), tags)
        # Backward pass
        loss.backward()
        # Update the weights
        optimizer.step()
    # Evaluate the model on the dev set
    metrics = evaluate_model(model, X_dev, y_dev)
    print(f'Epoch: {epoch+1}, Accuracy: {metrics["accuracy"]}, Precision: {metrics["precision"]}, Recall: {metrics["recall"]}, F1: {metrics["f1"]}')
# Notebook output: Epoch: 1, Accuracy: 0.9545313667936774, Precision: 0.7780607604147717, Recall: 0.7213695395513577, F1: 0.7486434447750743
# Compute the dev-set metrics once more; as a bare expression the returned dictionary
# is not stored or printed here (it is only displayed in an interactive session)
evaluate_model(model, X_dev, y_dev)
# Notebook output:
# {'accuracy': 0.9545313667936774,
#  'precision': 0.7780607604147717,
#  'recall': 0.7213695395513577,
#  'f1': 0.7486434447750743}
# Bring the model and the dev tensors back to the CPU
model = model.cpu()
X_dev = [tensor.cpu() for tensor in X_dev]
y_dev = [tensor.cpu() for tensor in y_dev]
# Predict the highest-scoring tag index for every position of the dev and test inputs
with torch.no_grad():
    y_dev_pred = [model(x.unsqueeze(0)).squeeze(0).argmax(dim=1) for x in X_dev]
    y_test_pred = [model(x.unsqueeze(0)).squeeze(0).argmax(dim=1) for x in X_test]

# Map the predicted indices back to tag names
y_dev_pred = [[ner_idx2tag[int(idx)] for idx in sequence] for sequence in y_dev_pred]
y_test_pred = [[ner_idx2tag[int(idx)] for idx in sequence] for sequence in y_test_pred]
# One space-separated line per document; the first and last positions belong to
# the special tokens <bos> and <eos> and are dropped
y_dev_pred_con = [' '.join(tags[1:-1]) for tags in y_dev_pred]
y_test_pred_con = [' '.join(tags[1:-1]) for tags in y_test_pred]
# Write the raw predictions (no postprocessing applied yet)
pd.DataFrame(y_dev_pred_con).to_csv('dev-0/out-model.tsv', header=False, index=False, sep='\t')
pd.DataFrame(y_test_pred_con).to_csv('test-A/out-model.tsv', header=False, index=False, sep='\t')
# Postprocessing
# The standard-library `re` module rejects variable-width look-behind patterns such as
# (?<![BI]-\w+ ), so the tag sequence is scanned token by token instead.

# A whitespace-separated token of a prediction line
_TOKEN_RE = re.compile(r'\S+')
# A B- or I- tag: group 1 is the prefix, group 2 the entity type
_BIO_TAG_RE = re.compile(r'([BI])-(\w+)')


def _split_tag(token):
    """Return (prefix, entity_type) for a B-/I- tag and (None, None) for any other token."""
    parsed = _BIO_TAG_RE.fullmatch(token)
    return parsed.groups() if parsed else (None, None)


def _offending_I_tags(text, is_offending):
    """
    Yield the match object of every I-tag in `text` that `is_offending` flags
    :param text: space-separated tag sequence
    :param is_offending: callable(prev_prefix, prev_type, entity_type) -> bool, where the
                         first two describe the preceding token ((None, None) if it is
                         not a B-/I- tag or does not exist)
    :return: generator of match objects; match.start() is the position of the 'I'
    """
    prev_prefix = prev_type = None
    for token in _TOKEN_RE.finditer(text):
        prefix, entity_type = _split_tag(token.group())
        if prefix == 'I' and is_offending(prev_prefix, prev_type, entity_type):
            yield token
        prev_prefix, prev_type = prefix, entity_type


def _relabel_until_stable(df, find_matches):
    """
    Turn every I-tag reported by `find_matches` into a B-tag until a pass changes nothing
    :param df: DataFrame with a 'ner_tags' column of space-separated tag sequences
    :param find_matches: callable(text) -> iterable of match objects of offending I-tags
    :return: the same DataFrame, with the 'ner_tags' column updated in place
    """
    iteration = 0
    while True:
        iteration += 1
        print(f"Iteration: {iteration}")
        changes = 0
        fixed_rows = []
        for text in df['ner_tags']:
            # Non-string cells (e.g. NaN from an empty line) are left untouched
            starts = [m.start() for m in find_matches(text)] if isinstance(text, str) else []
            if starts:
                chars = list(text)
                for start in starts:
                    # 'I' -> 'B' keeps the length, so the other positions stay valid
                    chars[start] = 'B'
                text = "".join(chars)
                changes += len(starts)
            fixed_rows.append(text)
        print(f"Changes: {changes}")
        if changes == 0:
            # Every change turns an I-tag into a B-tag, so this point is always reached
            return df
        # Write the whole column back; editing the rows yielded by iterrows()
        # would only modify copies
        df['ner_tags'] = fixed_rows


# Finder for I-tags that start a sequence (should be B-tags)
def incorrect_I_as_begin_tag(text):
    """
    Find I-tags whose preceding token is not a B-/I- tag (or that start the line)
    :param text: space-separated tag sequence
    :return: iterator of match objects, one per offending I-tag
    """
    return _offending_I_tags(
        text, lambda prev_prefix, prev_type, entity_type: prev_prefix is None
    )


# Helper method for replacing I-tags that start a sequence with B-tags
def replace_incorrect_I_as_begin_tag(df):
    """
    Replace I-tags that start a sequence with B-tags
    :param df: DataFrame with a 'ner_tags' column of space-separated tag sequences
    :return: the same DataFrame, modified in place
    """
    return _relabel_until_stable(df, incorrect_I_as_begin_tag)


# Finder for inconsistent I-tags after B-tags (I-tags that are not continuation of B-tags)
def inconsistent_I_after_B(text):
    """
    Find I-tags that directly follow a B-tag of a different entity type
    :param text: space-separated tag sequence
    :return: iterator of match objects, one per offending I-tag
    """
    return _offending_I_tags(
        text,
        lambda prev_prefix, prev_type, entity_type: prev_prefix == 'B' and prev_type != entity_type,
    )


# Helper method for replacing inconsistent I-tags after B-tags with B-tags
def replace_inconsistent_I_after_B(df):
    """
    Replace I-tags that follow a B-tag of a different entity type with B-tags
    :param df: DataFrame with a 'ner_tags' column of space-separated tag sequences
    :return: the same DataFrame, modified in place
    """
    return _relabel_until_stable(df, inconsistent_I_after_B)


# Finder for inconsistent I-tags after other I-tags (I-tags that are not continuation of the same tag)
def inconsistent_I_after_I(text):
    """
    Find I-tags that directly follow an I-tag of a different entity type
    :param text: space-separated tag sequence
    :return: iterator of match objects, one per offending I-tag
    """
    return _offending_I_tags(
        text,
        lambda prev_prefix, prev_type, entity_type: prev_prefix == 'I' and prev_type != entity_type,
    )


# Helper method for replacing inconsistent I-tags after other I-tags with B-tags
def replace_inconsistent_I_after_I(df):
    """
    Replace I-tags that follow an I-tag of a different entity type with B-tags
    :param df: DataFrame with a 'ner_tags' column of space-separated tag sequences
    :return: the same DataFrame, modified in place
    """
    return _relabel_until_stable(df, inconsistent_I_after_I)
# Read the raw predictions back, one tag sequence per row
out_dev = pd.read_csv('dev-0/out-model.tsv', sep='\t', header=None)
out_dev.columns = ['ner_tags']

out_test = pd.read_csv('test-A/out-model.tsv', sep='\t', header=None)
out_test.columns = ['ner_tags']
# Apply the three tag repairs in a fixed order, first to dev and then to test
for fix_tags in (
    replace_incorrect_I_as_begin_tag,
    replace_inconsistent_I_after_B,
    replace_inconsistent_I_after_I,
):
    out_dev = fix_tags(out_dev)

for fix_tags in (
    replace_incorrect_I_as_begin_tag,
    replace_inconsistent_I_after_B,
    replace_inconsistent_I_after_I,
):
    out_test = fix_tags(out_test)
# Write the postprocessed predictions
out_dev.to_csv('dev-0/out.tsv', header=False, index=False, sep='\t')
out_test.to_csv('test-A/out.tsv', header=False, index=False, sep='\t')
# Evaluation: read the expected dev tags (despite the name, `in_dev` holds the
# contents of expected.tsv)
in_dev = pd.read_csv('dev-0/expected.tsv', sep='\t', header=None)
in_dev.columns = ['ner_tags']

# GEVAL F1-BIO (dev): 0.74864