## Transformer

In [1]:
# Necessary imports
import pandas as pd
import numpy as np

import torch

import datasets
from datasets import ClassLabel, Features, Sequence, Value
from transformers import pipeline, AutoModelForTokenClassification, AutoTokenizer, TrainingArguments, Trainer, DataCollatorForTokenClassification

import warnings
warnings.filterwarnings('ignore')

### Prepare data

In [2]:
# Divide train data into sentences and labels
train_data = pd.read_csv('train/train.tsv', sep='\t', header=None)

# Column 0 is written to the labels file and column 1 to the sentences file.
# Rows are joined with '\n', so there is no newline after the last row.
for column, out_path in ((0, 'train/train_labels.tsv'), (1, 'train/train_sentences.tsv')):
    with open(out_path, 'w') as f:
        f.write('\n'.join(train_data[column]))

In [3]:
# Data paths
# Train sentences/labels are the per-column files derived from train/train.tsv.
train_sentences_file = 'train/train_sentences.tsv'
train_labels_file = 'train/train_labels.tsv'
# Validation inputs and the file used as their labels.
val_sentences_file = 'dev-0/in.tsv'
val_labels_file = 'dev-0/expected.tsv'
# Test inputs only; no labels path is defined for this split.
test_sentences_file = 'test-A/in.tsv'

In [4]:
# Method to read tokens and labels from files
def read_sentences_and_labels(sentences_path, labels_path=None):
    """Read whitespace-separated tokens (and optionally tags), one sentence per line.

    Args:
        sentences_path: Path to a text file with one sentence per line.
        labels_path: Optional path to a text file with one tag sequence per line.

    Returns:
        A dict with key 'tokens' (list of token lists) and, only when
        ``labels_path`` is given, key 'ner_tags' (list of tag lists).
    """
    def _read_split_lines(path):
        # Each line is stripped and split on any whitespace; a blank line gives [].
        with open(path, 'r') as handle:
            return [raw_line.strip().split() for raw_line in handle]

    result = {'tokens': _read_split_lines(sentences_path)}
    if labels_path:
        result['ner_tags'] = _read_split_lines(labels_path)
    return result

In [5]:
# Load data
# Each result is a dict of parallel lists: 'tokens' and, when a labels path is given, 'ner_tags'.
# Note: this rebinds train_data, which previously held the raw DataFrame read from train/train.tsv.
train_data = read_sentences_and_labels(train_sentences_file, train_labels_file)
val_data = read_sentences_and_labels(val_sentences_file, val_labels_file)
# No labels file is passed for the test split, so test_data only contains 'tokens'.
test_data = read_sentences_and_labels(test_sentences_file)

In [6]:
# Split long sentences into multiple sentences
def split_long_sentences(data, max_length=128):
    """Cut every sentence longer than ``max_length`` tokens into consecutive fragments.

    Args:
        data: Dict with 'tokens' (list of token lists) and optionally 'ner_tags'
            (list of tag lists parallel to 'tokens').
        max_length: Maximum number of tokens per fragment.

    Returns:
        A tuple ``(new_data, original_sentence_indices, fragment_lengths)``:
        ``new_data`` has the same keys as ``data`` but holds fragments, and the
        two lists give, for every fragment, the index of the sentence it came
        from and its token count.
    """
    has_tags = 'ner_tags' in data
    new_data = {'tokens': [], 'ner_tags': []} if has_tags else {'tokens': []}
    original_sentence_indices = []
    fragment_lengths = []

    for sent_idx, words in enumerate(data['tokens']):
        tags = data['ner_tags'][sent_idx] if has_tags else None

        if len(words) > max_length:
            pieces = [
                (words[start:start + max_length],
                 tags[start:start + max_length] if has_tags else None)
                for start in range(0, len(words), max_length)
            ]
        else:
            # Short (including empty) sentences pass through as one unsliced fragment.
            pieces = [(words, tags)]

        for piece_words, piece_tags in pieces:
            new_data['tokens'].append(piece_words)
            if has_tags:
                new_data['ner_tags'].append(piece_tags)
            original_sentence_indices.append(sent_idx)
            fragment_lengths.append(len(piece_words))

    return new_data, original_sentence_indices, fragment_lengths

In [7]:
# Split long sentences
# Each call returns (split data, source-sentence index per fragment, fragment lengths).
# The source indices are what later allows fragments to be merged back per sentence.
train_data, train_original_sentence_indices, train_fragment_lengths = split_long_sentences(train_data)
val_data, val_original_sentence_indices, val_fragment_lengths = split_long_sentences(val_data)
test_data, test_original_sentence_indices, test_fragment_lengths = split_long_sentences(test_data)

In [8]:
# Convert to datasets
# The dicts of parallel lists become datasets.Dataset objects; test_data only has 'tokens'.
train_dataset = datasets.Dataset.from_dict(train_data)
val_dataset = datasets.Dataset.from_dict(val_data)
test_dataset = datasets.Dataset.from_dict(test_data)

In [9]:
# List of unique ner labels
# The position of each name doubles as its integer class id: tags are encoded through
# ner_tags_feature.str2int and predictions are decoded with unique_labels[...],
# so the order of this list must stay fixed.
unique_labels = ['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'B-MISC', 'I-MISC']

# Create class label
ner_tags_feature = ClassLabel(names=unique_labels)

In [10]:
# Method to convert ner tags to class labels
def convert_to_classlabel(example):
    """Replace the string tags of one example with their integer class ids.

    The example's 'ner_tags' entry is overwritten in place with the ids produced
    by ``ner_tags_feature.str2int`` and the same example object is returned.
    """
    tag_ids = []
    for tag in example['ner_tags']:
        tag_ids.append(ner_tags_feature.str2int(tag))
    example['ner_tags'] = tag_ids

    return example

In [11]:
# Convert ner tags to class labels
# Only the train and validation datasets carry 'ner_tags'; the test dataset is left as is.
train_dataset = train_dataset.map(convert_to_classlabel)
val_dataset = val_dataset.map(convert_to_classlabel)

Map:   0%|          | 0/2149 [00:00<?, ? examples/s]

Map:   0%|          | 0/529 [00:00<?, ? examples/s]

In [12]:
# Define features
# Schema used for casting: 'ner_tags' is a sequence of ClassLabel values and
# 'tokens' is a sequence of strings.
features = Features({
    'ner_tags': Sequence(ner_tags_feature),
    'tokens': Sequence(Value('string'))
})

In [13]:
# Cast dataset
# Apply the schema above to the two labelled datasets (the test dataset has no 'ner_tags').
train_dataset = train_dataset.cast(features)
val_dataset = val_dataset.cast(features)

Casting the dataset:   0%|          | 0/2149 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/529 [00:00<?, ? examples/s]

### Tokenization

In [14]:
# Load tokenizer
# NOTE(review): return_token_type_ids is given the string "token_type_ids" here; a boolean
# looks like what was intended — confirm whether this argument has any effect.
tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-cased", return_token_type_ids="token_type_ids")

In [15]:
# Method to tokenize and align labels
def tokenize_and_align_labels(examples, label_all_tokens=False):
    """Tokenize pre-split words and, when tags are present, align them to the sub-tokens.

    Args:
        examples: Batch with a "tokens" entry (lists of words) and optionally a
            'ner_tags' entry (one label list per example, indexed by word position).
        label_all_tokens: When true, every sub-token of a word receives the word's
            label; otherwise only the first sub-token does and the rest get -100.

    Returns:
        The tokenizer output, with an added "labels" entry when 'ner_tags' exists.
    """
    encoded = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True)

    # Unlabelled batches only need the tokenizer output.
    if 'ner_tags' not in examples:
        return encoded

    all_label_ids = []
    for batch_idx, word_labels in enumerate(examples["ner_tags"]):
        aligned = []
        last_word = None
        for word in encoded.word_ids(batch_index=batch_idx):
            if word is None:
                # Special tokens have no word id; -100 makes the loss ignore them.
                aligned.append(-100)
            elif word != last_word or label_all_tokens:
                # First sub-token of a word (or any sub-token when labelling all).
                aligned.append(word_labels[word])
            else:
                # Continuation sub-token that should not contribute to the loss.
                aligned.append(-100)
            last_word = word
        all_label_ids.append(aligned)

    encoded["labels"] = all_label_ids
    return encoded

In [16]:
# Tokenize and align labels
# batched=True: the function receives several examples at once, matching its per-batch loop.
tokenized_train = train_dataset.map(tokenize_and_align_labels, batched=True)
tokenized_val = val_dataset.map(tokenize_and_align_labels, batched=True)

Map:   0%|          | 0/2149 [00:00<?, ? examples/s]

Map:   0%|          | 0/529 [00:00<?, ? examples/s]

In [17]:
# Tokenize test data
# The test dataset has no 'ner_tags', so only tokenizer outputs are added (no "labels").
tokenized_test = test_dataset.map(tokenize_and_align_labels, batched=True)

Map:   0%|          | 0/504 [00:00<?, ? examples/s]

### Load pre-trained model

In [24]:
# Load model
# NOTE(review): "ner-model" is the directory written by trainer.save_model('ner-model')
# further down, so on a fresh checkout this cell only works if that directory already exists.
# NOTE(review): the commented-out base checkpoint (distilbert-base-uncased) is not the
# checkpoint the tokenizer was loaded from (google-bert/bert-base-cased) — confirm that
# "ner-model" was trained with a vocabulary matching this tokenizer.
# model = AutoModelForTokenClassification.from_pretrained("distilbert-base-uncased", num_labels=len(unique_labels))
model = AutoModelForTokenClassification.from_pretrained("ner-model", num_labels=len(unique_labels))

### Retrain model with prepared data

In [25]:
# Define training arguments
# NOTE(review): "test-ner" is the first positional argument — presumably the output directory; confirm.
# NOTE(review): batch size is 1 for both training and evaluation; confirm this is intentional
# and not a leftover memory workaround.
training_args = TrainingArguments(
    "test-ner",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    num_train_epochs=5,
    weight_decay=0.01,
)

In [26]:
# Define data collator
# Built from the same tokenizer that produced the tokenized datasets.
data_collator = DataCollatorForTokenClassification(tokenizer)

In [27]:
# Define metric to compute
# NOTE(review): load_metric may be unavailable in newer `datasets` releases — confirm
# against the installed version.
metric = datasets.load_metric("seqeval")

In [28]:
# Helper method to compute metrics
def compute_metrics(p):
    """Turn raw model outputs into seqeval precision/recall/F1/accuracy.

    Args:
        p: A pair ``(predictions, labels)``; the class with the highest score along
            axis 2 of ``predictions`` is taken as the predicted label id.

    Returns:
        Dict with the overall "precision", "recall", "f1" and "accuracy" values
        reported by the seqeval metric.
    """
    scores, gold = p
    predicted_ids = np.argmax(scores, axis=2)

    true_predictions = []
    true_labels = []
    for pred_row, gold_row in zip(predicted_ids, gold):
        # Positions labelled -100 (ignored index) are dropped from both sequences.
        kept = [
            (pred_id, gold_id)
            for pred_id, gold_id in zip(pred_row, gold_row)
            if gold_id != -100
        ]
        true_predictions.append([unique_labels[pred_id] for pred_id, _ in kept])
        true_labels.append([unique_labels[gold_id] for _, gold_id in kept])

    results = metric.compute(predictions=true_predictions, references=true_labels)
    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }

In [29]:
# Define trainer
# The tokenized validation set is used as the evaluation set; compute_metrics supplies
# the seqeval-based precision/recall/F1/accuracy.
trainer = Trainer(
    model,
    training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

### Train model

In [27]:
# Train model
# Release cached CUDA memory held by PyTorch before training starts.
torch.cuda.empty_cache()
trainer.train()

Epoch,Training Loss,Validation Loss


TrainOutput(global_step=10745, training_loss=0.16770398169686596, metrics={'train_runtime': 715.5143, 'train_samples_per_second': 15.017, 'train_steps_per_second': 15.017, 'total_flos': 424900907413920.0, 'train_loss': 0.16770398169686596, 'epoch': 5.0})

In [29]:
# Save model
# NOTE: 'ner-model' is the same path the model-loading cell reads from, so this
# overwrites the checkpoint the current run started from.
trainer.save_model('ner-model')

### Evaluate model

In [30]:
# Evaluate
# Runs on the eval dataset given to the Trainer (tokenized_val).
trainer.evaluate()

{'eval_loss': 0.17733338475227356,
 'eval_precision': 0.7404310067545835,
 'eval_recall': 0.7740416946872899,
 'eval_f1': 0.7568633897747822,
 'eval_accuracy': 0.9586372907517319,
 'eval_runtime': 5.8684,
 'eval_samples_per_second': 90.144,
 'eval_steps_per_second': 90.144}

### Predict on validation data

In [167]:
# Preprocess data
def preprocess_data(tokens):
    """Join every token list into a single space-separated sentence string.

    Args:
        tokens: Iterable of token lists.

    Returns:
        List of strings, one per token list (an empty list gives an empty string).
    """
    sentences = []
    for words in tokens:
        sentences.append(" ".join(words))
    return sentences

In [168]:
# Rebuild plain-text sentences from the (already split) token lists.
# NOTE(review): these three variables are not referenced again in the cells shown here —
# confirm they are still needed.
train_sentences = preprocess_data(train_data['tokens'])
val_sentences = preprocess_data(val_data['tokens'])
test_sentences = preprocess_data(test_data['tokens'])

In [169]:
# Align predictions
def align_predictions(predictions, label_ids, sentence_indices, fragment_lengths):
    """Keep, per fragment, only the predictions at positions that carry a real label.

    Args:
        predictions: Score array; the argmax over axis 2 is the predicted class id.
        label_ids: Per-fragment label ids, with -100 marking positions to ignore.
        sentence_indices: Per-fragment source-sentence index. Not read, but it takes
            part in the zip, so only as many fragments as it has entries are processed.
        fragment_lengths: Per-fragment length. Same remark as ``sentence_indices``.

    Returns:
        ``(aligned_preds, aligned_labels)``: two lists of lists with the predicted and
        gold ids at every position whose label is not -100.
    """
    class_ids = np.argmax(predictions, axis=2)
    aligned_preds, aligned_labels = [], []

    for row_pred, row_gold, _sentence_idx, _length in zip(
        class_ids, label_ids, sentence_indices, fragment_lengths
    ):
        kept = [(p, g) for p, g in zip(row_pred, row_gold) if g != -100]
        aligned_preds.append([p for p, _ in kept])
        aligned_labels.append([g for _, g in kept])

    return aligned_preds, aligned_labels

In [170]:
# Predict on validation data
# The result unpacks into (raw prediction scores, label ids, metrics).
predictions_val, label_ids_val, metrics_val = trainer.predict(tokenized_val)

In [171]:
# Align predictions
# Drops the -100 positions so each fragment keeps only its labelled positions.
aligned_preds_val, aligned_labels_val = align_predictions(predictions_val, label_ids_val, val_original_sentence_indices, val_fragment_lengths)

In [172]:
# Concat results based on val_original_sentence_indices
predicted_labels = []
true_labels = []
for i in range(len(aligned_preds_val)):
    # A fragment continues the previous one when both come from the same source sentence.
    continues_previous = (
        i > 0 and val_original_sentence_indices[i] == val_original_sentence_indices[i-1]
    )
    if continues_previous:
        predicted_labels[-1] += aligned_preds_val[i]
        true_labels[-1] += aligned_labels_val[i]
    else:
        predicted_labels.append(aligned_preds_val[i])
        true_labels.append(aligned_labels_val[i])

### Postprocessing

In [174]:
import regex as re

# Postprocessing
# Regex for finding I-tags that start a sequence (should be B-tags)
def incorrect_I_as_begin_tag(text):
    """Iterate over I-tags in ``text`` that are not directly preceded by a B- or I-tag.

    ``text`` is a space-separated tag sequence. The pattern uses a variable-length
    lookbehind, so ``re`` must be the third-party ``regex`` module.
    """
    pattern = r'(?<![BI]-\w+ )I-\w+'
    return re.finditer(pattern, text)

# Helper method for replacing I-tags that start a sequence with B-tags
def replace_incorrect_I_as_begin_tag(df):
    """Rewrite I-tags that open an entity as B-tags, repeating until nothing changes.

    Args:
        df: DataFrame with a 'ner_tags' column holding space-separated tag strings.

    Returns:
        The same DataFrame, with its 'ner_tags' column updated in place.

    The pass number and the number of replacements are printed for every pass.
    Updated strings are written back to the column explicitly, because assigning
    to the row yielded by ``iterrows()`` is not guaranteed to change the frame
    (the row may be a copy), in which case the loop would never terminate.
    """
    iteration = 0

    while True:
        iteration += 1
        print(f"Iteration: {iteration}")

        changes = 0
        updated = []
        for tags in df['ner_tags']:
            chars = list(tags)
            # Swapping one character keeps the length, so match offsets stay valid.
            for match in incorrect_I_as_begin_tag(tags):
                chars[match.start()] = 'B'
                changes += 1
            updated.append("".join(chars))

        print(f"Changes: {changes}")

        if changes == 0:
            break
        df['ner_tags'] = updated

    return df

# Regex for finding inconsistent I-tags after B-tags (I-tags that are not continuation of B-tags)
def inconsistent_I_after_B(text):
    """Iterate over I-tags that directly follow a B-tag of a different entity type.

    ``text`` is a space-separated tag sequence. A match is an I-tag whose type does
    not begin with the type captured from the preceding B-tag. The lookbehind is
    variable-length, so ``re`` must be the third-party ``regex`` module.
    """
    pattern = r'(?<=B-(\w+) )(?:I-(?!\1)\w+)'
    return re.finditer(pattern, text)

# Helper method for converting inconsistent I-tags after B-tags into B-tags
def replace_inconsistent_I_after_B(df):
    """Turn I-tags that follow a B-tag of another type into B-tags, until stable.

    Args:
        df: DataFrame with a 'ner_tags' column holding space-separated tag strings.

    Returns:
        The same DataFrame, with its 'ner_tags' column updated in place.

    The pass number and the number of replacements are printed for every pass.
    Several passes can be needed: a tag promoted to B- may itself be followed by
    an I-tag of another type. Updated strings are written back to the column
    explicitly, because assigning to the row yielded by ``iterrows()`` is not
    guaranteed to change the frame, in which case the loop would never terminate.
    """
    iteration = 0

    while True:
        iteration += 1
        print(f"Iteration: {iteration}")

        changes = 0
        updated = []
        for tags in df['ner_tags']:
            chars = list(tags)
            # Swapping one character keeps the length, so match offsets stay valid.
            for match in inconsistent_I_after_B(tags):
                chars[match.start()] = 'B'
                changes += 1
            updated.append("".join(chars))

        print(f"Changes: {changes}")

        if changes == 0:
            break
        df['ner_tags'] = updated

    return df

# Regex for finding inconsistent I-tags after other I-tags (I-tags that are not continuation of the same tag)
def inconsistent_I_after_I(text):
    """Iterate over I-tags that directly follow an I-tag of a different entity type.

    ``text`` is a space-separated tag sequence. A match is an I-tag whose type does
    not begin with the type captured from the preceding I-tag. The lookbehind is
    variable-length, so ``re`` must be the third-party ``regex`` module.
    """
    pattern = r'(?<=I-(\w+) )(?:I-(?!\1)\w+)'
    return re.finditer(pattern, text)

# Helper method for converting inconsistent I-tags after other I-tags into B-tags
def replace_inconsistent_I_after_I(df):
    """Turn I-tags that follow an I-tag of another type into B-tags, until stable.

    Args:
        df: DataFrame with a 'ner_tags' column holding space-separated tag strings.

    Returns:
        The same DataFrame, with its 'ner_tags' column updated in place.

    The pass number and the number of replacements are printed for every pass.
    Updated strings are written back to the column explicitly, because assigning
    to the row yielded by ``iterrows()`` is not guaranteed to change the frame
    (the row may be a copy), in which case the loop would never terminate.
    """
    iteration = 0

    while True:
        iteration += 1
        print(f"Iteration: {iteration}")

        changes = 0
        updated = []
        for tags in df['ner_tags']:
            chars = list(tags)
            # Swapping one character keeps the length, so match offsets stay valid.
            for match in inconsistent_I_after_I(tags):
                chars[match.start()] = 'B'
                changes += 1
            updated.append("".join(chars))

        print(f"Changes: {changes}")

        if changes == 0:
            break
        df['ner_tags'] = updated

    return df

In [175]:
# Inspect the merged predictions (class ids, one list per original validation sentence).
# NOTE(review): this displays every list in full; consider showing only a small slice.
predicted_labels

[[0,
  0,
  3,
  0,
  0,
  0,
  7,
  0,
  0,
  0,
  0,
  0,
  5,
  0,
  0,
  5,
  8,
  0,
  1,
  2,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  3,
  0,
  3,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  5,
  0,
  3,
  0,
  0,
  0,
  0,
  0,
  0,
  5,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  3,
  0,
  0,
  0,
  0,
  1,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  5,
  6,
  0,
  3,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  5,
  0,
  1,
  2,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  1,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  1,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  1,
  2,
  0,
  1,
  2,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  3,
  0,
  5,
  0,
  0,
  1,
  0,
  0,
  0,
  0,
  5,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,


In [127]:
# Save predictions to .tsv file (line by line)
# One line per merged sentence: class ids are mapped to tag names and joined by spaces.
with open('train/out-transformer.tsv', 'w') as f:
    f.writelines(
        ' '.join(unique_labels[p] for p in sentence_preds) + '\n'
        for sentence_preds in predicted_labels
    )

In [157]:
# Load predictions
# Read the file back as a single-column frame, one tag sequence per row.
# NOTE(review): read_csv skips blank lines by default, so an empty prediction line would
# shift rows relative to true_labels — confirm none occur.
predictions = pd.read_csv('train/out-transformer.tsv', header=None, delimiter='\t')
predictions.columns = ['ner_tags']

In [158]:
# Postprocessing
# Repair BIO consistency in three steps; each call prints its per-pass change counts.
predictions = replace_incorrect_I_as_begin_tag(predictions)
predictions = replace_inconsistent_I_after_B(predictions)
predictions = replace_inconsistent_I_after_I(predictions)

Iteration: 1
Changes: 143
Iteration: 2
Changes: 0
Iteration: 1
Changes: 168
Iteration: 2
Changes: 14
Iteration: 3
Changes: 0
Iteration: 1
Changes: 17
Iteration: 2
Changes: 0


In [159]:
# Save predictions to .tsv file (line by line)
# The postprocessed validation predictions go to dev-0/out.tsv without header or index.
predictions.to_csv('dev-0/out.tsv', header=False, index=False, sep='\t')

In [160]:
from seqeval.metrics import classification_report

# Convert index to label
# Note: despite its name, the 'tokens' column holds tag-name lists in both frames:
# gold ids mapped through unique_labels in df_val, and the postprocessed tag strings
# split on whitespace in predictions.
df_val = pd.DataFrame({'ner_tags': true_labels})
df_val['tokens'] = df_val['ner_tags'].apply(lambda x: [unique_labels[int(i)] for i in x])
predictions['tokens'] = predictions['ner_tags'].apply(lambda x: x.split())

In [165]:
# Classification report
# Gold tag sequences are passed first, postprocessed predictions second.
print(classification_report(df_val['tokens'].tolist(), predictions['tokens'].tolist()))

              precision    recall  f1-score   support

         LOC       0.80      0.86      0.83      1835
        MISC       0.72      0.70      0.71       921
         ORG       0.66      0.69      0.67      1333
         PER       0.74      0.78      0.76      1840

   micro avg       0.74      0.77      0.76      5929
   macro avg       0.73      0.76      0.74      5929
weighted avg       0.74      0.77      0.75      5929


GEVAL F1-BIO (dev): 0.75517