# en-ner-conll-2003/BIO_fixer.py

from tqdm import tqdm

def read(path):
    """Load a file of space-separated labels into a list of token lists.

    Each line of the file becomes one inner list, produced by splitting the
    line on single spaces. Line endings are not stripped, so the last token
    of every line still carries its trailing newline (if the line had one).

    Args:
        path: Location of the file to read.

    Returns:
        A list with one list of string tokens per line of the file; an empty
        list for an empty file.
    """
    # Plain read mode is sufficient; "r+" would additionally demand write
    # permission on a file that is only being read here.
    with open(path, "r") as file:
        return [line.split(sep=" ") for line in file]


def save(content, path):
    """Write rows of tokens to ``path``, one row per line.

    The tokens of a row are converted with ``str`` and joined by single
    spaces. A row whose last token already ends with a newline (the shape
    produced by ``read``) is written as is; any other row gets a newline
    appended so that rows never run into each other. The file is written
    without a newline after the final row.

    Args:
        content: Iterable of rows, each an iterable of tokens.
        path: Location of the file to (over)write.
    """
    lines = []
    for row in content:
        line = " ".join(str(token) for token in row)
        # Only add a line break when the row does not bring its own;
        # blindly trimming/adding characters would corrupt the labels.
        if not line.endswith("\n"):
            line += "\n"
        lines.append(line)

    # Every collected line ends in "\n", so dropping the last character
    # removes exactly the final line break (and is a no-op for no rows).
    text = "".join(lines)[:-1]
    with open(path, "w") as file:
        file.write(text)

_ENTITY_TYPES = ("MISC", "LOC", "ORG", "PER")
_BEGIN_TAGS = frozenset("B-" + entity for entity in _ENTITY_TYPES)
_INSIDE_TAGS = frozenset("I-" + entity for entity in _ENTITY_TYPES)


def _corrected_label(current, previous, following):
    """Return the repaired form of one label given its original neighbours."""
    if current in _BEGIN_TAGS:
        entity = current[2:]
        # A B-X that directly follows B-X or I-X becomes I-X.
        if previous in ("B-" + entity, "I-" + entity):
            return "I-" + entity
        # A B-X directly followed by I-Y (Y != X) takes over the type Y.
        if following in _INSIDE_TAGS and following[2:] != entity:
            return "B-" + following[2:]
        return current
    # An I-X directly after I-Y takes over the type Y.
    if current in _INSIDE_TAGS and previous in _INSIDE_TAGS:
        return previous
    return current


def fix_bio_labels(data_old):
    """Repair inconsistent BIO labels row by row.

    Every row (including the first one) is processed independently. For each
    label the decision is based on its neighbours in the *original* row:

    * ``B-X`` preceded by ``B-X`` or ``I-X`` becomes ``I-X``;
    * otherwise ``B-X`` followed by ``I-Y`` (``Y != X``) becomes ``B-Y``;
    * ``I-X`` preceded by ``I-Y`` becomes ``I-Y``;
    * anything else is copied unchanged.

    The first label of a row has no predecessor and the last one has no
    successor, so rules that need the missing neighbour simply do not apply.
    A trailing line ending attached to a label (as left by ``read`` on the
    last token of a line) is ignored for matching and kept in the output.

    Args:
        data_old: Sequence of rows, each a sequence of label strings.

    Returns:
        A new list of lists with the repaired labels; ``data_old`` is not
        modified.
    """
    # tqdm only provides the progress bar; degrade to plain iteration when
    # it is not installed so the fixer itself keeps working.
    try:
        from tqdm import tqdm as progress
    except ImportError:
        progress = iter

    data_new = []
    for row in progress(data_old):
        # Compare labels without their line ending, but remember the raw
        # token so the ending can be re-attached afterwards.
        cores = [label.rstrip("\r\n") for label in row]
        last = len(cores) - 1
        fixed_row = []
        for j, (label, core) in enumerate(zip(row, cores)):
            previous = cores[j - 1] if j > 0 else None
            following = cores[j + 1] if j < last else None
            fixed = _corrected_label(core, previous, following)
            fixed_row.append(fixed + label[len(core):])
        data_new.append(fixed_row)
    return data_new


if __name__ == "__main__":
    # Repair the predictions in place: the corrected labels are written back
    # to the very file they were loaded from.
    target = "test-A/out.tsv"
    save(fix_bio_labels(read(target)), target)