!unzip -q /content/en-ner-conll-2003.zip -d /content/
import os
import pandas as pd
from transformers import pipeline, AutoModelForTokenClassification, BertTokenizer

Declare paths

# Dataset directory; every path below is built relative to it.
data_dir_path = 'en-ner-conll-2003'

# Training split: a single TSV file.
train_path = os.path.join(data_dir_path, 'train', 'train.tsv')

# Development split: input texts, expected labels, and the predictions file.
dev_texts_path, dev_labels_path, dev_predicted_path = (
    os.path.join(data_dir_path, 'dev-0', file_name)
    for file_name in ('in.tsv', 'expected.tsv', 'out.tsv')
)

# Test split: input texts and the predictions file.
test_texts_path, test_predicted_path = (
    os.path.join(data_dir_path, 'test-A', file_name)
    for file_name in ('in.tsv', 'out.tsv')
)

Load files

# All files are tab-separated and have no header row, so column names are
# supplied explicitly.
# NOTE(review): read_csv keeps its default quoting here, so a field that
# starts with a double quote is parsed as a quoted field - confirm the
# inputs never trigger that.

# Training data: column 0 is read as 'label', column 1 as 'text'.
train_data = pd.read_csv(
    train_path,
    sep='\t',
    header=None,
    usecols=[0, 1],
    names=['label', 'text'],
)

# Development inputs: only the first column is kept, as 'text'.
dev_texts_data = pd.read_csv(
    dev_texts_path,
    sep='\t',
    header=None,
    usecols=[0],
    names=['text'],
)

# Development gold labels: only the first column is kept, as 'label'.
dev_labels_data = pd.read_csv(
    dev_labels_path,
    sep='\t',
    header=None,
    usecols=[0],
    names=['label'],
)

# Test inputs: only the first column is kept, as 'text'.
test_texts_data = pd.read_csv(
    test_texts_path,
    sep='\t',
    header=None,
    usecols=[0],
    names=['text'],
)

Create the model, the tokenizer, and the NER pipeline

# Token-classification model loaded from a checkpoint whose name indicates
# it was fine-tuned on CoNLL-03 English.
model = AutoModelForTokenClassification.from_pretrained(
    "dbmdz/bert-large-cased-finetuned-conll03-english",
)

class SpaceTokenizer(BertTokenizer):
    """BERT tokenizer that splits text on whitespace only.

    Each whitespace-separated word becomes exactly one token, so token
    positions can be matched against ``text.split()`` by downstream code.
    """

    def tokenize(self, text, **kwargs):
        """Return the whitespace-separated tokens of ``text``.

        Args:
            text: The string to tokenize.
            **kwargs: Accepted for compatibility with the base-class
                ``tokenize(text, **kwargs)`` signature and ignored, so
                callers that forward extra options do not raise
                ``TypeError``.

        Returns:
            A list of the words in ``text`` as produced by ``str.split()``.
        """
        return text.split()

# Whitespace tokenizer initialised from the "bert-base-cased" checkpoint.
# NOTE(review): the model comes from a bert-large-cased checkpoint while the
# tokenizer comes from bert-base-cased - confirm both share one vocabulary.
tokenizer = SpaceTokenizer.from_pretrained("bert-base-cased")

# "ner" pipeline that ties the model and the tokenizer together.
recognizer = pipeline(task="ner", model=model, tokenizer=tokenizer)
Predict and save results

def predict(X):
    """Run the NER pipeline on every text in ``X``.

    Args:
        X: An iterable of input strings.

    Returns:
        A list with one entry per input text, in the same order. Each entry
        is whatever the module-level ``recognizer`` pipeline returns for
        that text (consumed by ``map_predictions`` as an iterable of dicts
        with ``'index'`` and ``'entity'`` keys).
    """
    return [recognizer(text) for text in X]
def map_predictions(X, predictions):
    """Convert per-text entity predictions into space-joined label strings.

    Args:
        X: An iterable of input strings.
        predictions: An iterable parallel to ``X``; each item is an iterable
            of dicts carrying an ``'index'`` (1-based token position) and an
            ``'entity'`` (label string).

    Returns:
        A list with one string per text: the labels of its
        whitespace-separated words joined by single spaces. Words without a
        prediction are labelled ``'O'``.

    Raises:
        IndexError: If a prediction's ``'index'`` points past the last word.
    """
    results = []
    for text, prediction in zip(X, predictions):
        # Start with every word labelled 'O'; predictions overwrite entries.
        labels = ['O'] * len(text.split())
        for prediction_element in prediction:
            # 'index' is 1-based relative to the words (presumably because
            # position 0 is a special token - TODO confirm), hence the -1.
            labels[prediction_element['index'] - 1] = prediction_element['entity']
        results.append(" ".join(labels))
    return results
def predict_and_save(X, filename):
    """Predict labels for the texts in ``X`` and write them to ``filename``.

    Args:
        X: A DataFrame with a ``'text'`` column holding the input strings.
        filename: Path of the tab-separated output file. One label string is
            written per line, without a header row or index column.

    Returns:
        The single-column DataFrame (``'predicted_label'``) that was
        written, so callers can inspect the predictions.
    """
    texts = X['text']
    predictions = predict(texts)
    Y_predicted = map_predictions(texts, predictions)
    Y_predicted_df = pd.DataFrame(Y_predicted, columns=['predicted_label'])
    # header=False is the documented way to suppress the column-name row.
    Y_predicted_df.to_csv(filename, sep='\t', index=False, header=False)
    return Y_predicted_df
# Run prediction on the development split and write the output file.
dev_predicted = predict_and_save(
    X=dev_texts_data,
    filename=dev_predicted_path,
)

# Run prediction on the test split and write the output file.
test_predicted = predict_and_save(
    X=test_texts_data,
    filename=test_predicted_path,
)