# en-ner-conll-2003/BIO_fixer.py

from tqdm import tqdm

def read(path):
    """Load a space-separated label file as a list of token lists.

    Every line of the file becomes one list, obtained by splitting the line
    on single space characters. Line endings are not stripped, so the last
    token of each newline-terminated line still carries its ``"\\n"``.

    Args:
        path: Location of the file to read.

    Returns:
        A list with one list of string tokens per line of the file; an empty
        file yields an empty list.
    """
    # Plain read mode: the file is never written here, and "r+" would fail on
    # files the process is only allowed to read.
    with open(path, "r") as file:
        return [line.split(sep=" ") for line in file]


def save(content, path):
    """Write rows of tokens to ``path``, one row per line.

    Tokens of a row are converted with ``str`` and joined by single spaces.
    Rows are separated by a newline; no newline is written after the last
    row. A row whose text already ends in a newline (as rows returned by
    ``read`` do) has exactly that one newline removed before the separator is
    added, so line breaks are never doubled and no label characters are lost.

    Args:
        content: Iterable of rows, each an iterable of tokens.
        path: Location of the file to (over)write.
    """
    lines = []
    for row in content:
        line = " ".join(str(token) for token in row)
        # Only strip a real line ending; blindly cutting the last character
        # would truncate the final label when the row has no newline.
        if line.endswith("\n"):
            line = line[:-1]
        lines.append(line)
    # Build the text first so a conversion error cannot leave a truncated file.
    with open(path, "w") as file:
        file.write("\n".join(lines))


# Entity types whose B-/I- labels are subject to repair.
_ENTITY_TYPES = frozenset(("LOC", "MISC", "ORG", "PER"))


def _parse_label(label):
    """Return ``(prefix, entity)`` for a known B-/I- label, else ``(None, None)``."""
    prefix, sep, entity = label.partition("-")
    if sep and prefix in ("B", "I") and entity in _ENTITY_TYPES:
        return prefix, entity
    return None, None


def _fix_row(row):
    """Return a repaired copy of one label sequence (see ``fix_bio_labels``)."""
    # Compare on the bare label; a trailing line ending (kept by ``read`` on
    # the last token of a line) is re-attached unchanged on output.
    labels = [token.rstrip("\r\n") for token in row]
    parsed = [_parse_label(label) for label in labels]
    outside = (None, None)

    fixed = []
    for j, token in enumerate(row):
        prefix, entity = parsed[j]
        # Neighbours are taken from the ORIGINAL sequence, and only exist
        # inside the row: no wrap-around at the start, no overrun at the end.
        prev_prefix, prev_entity = parsed[j - 1] if j > 0 else outside
        next_prefix, next_entity = parsed[j + 1] if j + 1 < len(row) else outside

        new_label = labels[j]
        if prefix == "B":
            if prev_entity == entity:
                # B-X directly after B-X / I-X continues that entity.
                new_label = "I-" + entity
            elif next_prefix == "I" and next_entity != entity:
                # B-X followed by I-Y: the span really starts a Y entity.
                new_label = "B-" + next_entity
        elif prefix == "I":
            if prev_prefix == "I" and prev_entity != entity:
                # I-X directly after I-Y: keep the type of the running span.
                new_label = "I-" + prev_entity

        fixed.append(new_label + token[len(labels[j]):])
    return fixed


def fix_bio_labels(data_old):
    """Repair inconsistent BIO labels in every row of ``data_old``.

    Rules, applied per token using its neighbours in the original row, for
    entity types LOC, MISC, ORG and PER:

    * ``B-X`` preceded by ``B-X`` or ``I-X`` becomes ``I-X``;
    * otherwise ``B-X`` followed by ``I-Y`` (Y != X) becomes ``B-Y``;
    * ``I-X`` preceded by ``I-Y`` (Y != X) becomes ``I-Y``;
    * every other token is copied unchanged.

    All rows are processed, including the first one. The first token of a row
    has no predecessor and the last token has no successor. A trailing line
    ending on a token is ignored for matching and preserved in the result.

    Args:
        data_old: List of rows, each a list of string labels.

    Returns:
        A new list of new rows; ``data_old`` is not modified.
    """
    # The progress bar is cosmetic: fall back to plain iteration when tqdm is
    # not installed so the label logic itself has no hard dependency on it.
    try:
        from tqdm import tqdm as progress
    except ImportError:
        progress = iter

    return [_fix_row(row) for row in progress(data_old)]


if __name__ == "__main__":
    # Script entry point: load the predicted labels, repair them and write
    # the result back over the same file.
    target = "test-A/out.tsv"
    save(fix_bio_labels(read(target)), target)
CRF and GRU labeling 2021-06-20 22:24:32 +02:00			`from tqdm import tqdm`

			`def read(path):`
			`with open(path, "r+") as file:`
			`data_raw = file.readlines()`
			`data = []`
			`for d in data_raw:`
			`data.append(d.split(sep=" "))`
			`return data`


			`def save(content, path):`
			`with open(path, "w+") as file:`
			`temp_str = ""`
			`for i in content:`
			`for j in i:`
			`temp_str += str(j)`
			`temp_str += " "`
			`temp_str = temp_str[:-1]`
			`#temp_str += "\n"`
			`temp_str = temp_str[:-1]`
			`file.write(temp_str)`


			`def fix_bio_labels(data_old):`
			`data_new = []`
			`for i in tqdm(range(0,len(data_old))):`
			`data_new.append([])`
			`for j in range(0, len(data_old[i])):`
			`if i == 0:`
			`data_new[i].append(data_old[i][j])`
			`continue`
			`# check if B- is good`
			`if data_old[i][j] == 'B-MISC' and (data_old[i][j-1] == 'I-MISC' or data_old[i][j-1] == 'B-MISC'):`
			`data_new[i].append('I-MISC')`
			`elif data_old[i][j] == 'B-MISC' and data_old[i][j+1] == 'I-LOC':`
			`data_new[i].append('B-LOC')`
			`elif data_old[i][j] == 'B-MISC' and data_old[i][j+1] == 'I-ORG':`
			`data_new[i].append('B-ORG')`
			`elif data_old[i][j] == 'B-MISC' and data_old[i][j+1] == 'I-PER':`
			`data_new[i].append('B-PER')`
			`elif data_old[i][j] == 'B-LOC' and (data_old[i][j-1] == 'I-LOC' or data_old[i][j-1] == 'B-LOC'):`
			`data_new[i].append('I-LOC')`
			`elif data_old[i][j] == 'B-LOC' and data_old[i][j+1] == 'I-MISC':`
			`data_new[i].append('B-MISC')`
			`elif data_old[i][j] == 'B-LOC' and data_old[i][j+1] == 'I-ORG':`
			`data_new[i].append('B-ORG')`
			`elif data_old[i][j] == 'B-LOC' and data_old[i][j+1] == 'I-PER':`
			`data_new[i].append('B-PER')`
			`elif data_old[i][j] == 'B-ORG' and (data_old[i][j-1] == 'I-ORG' or data_old[i][j-1] == 'B-ORG'):`
			`data_new[i].append('I-ORG')`
			`elif data_old[i][j] == 'B-ORG' and data_old[i][j+1] == 'I-MISC':`
			`data_new[i].append('B-MISC')`
			`elif data_old[i][j] == 'B-ORG' and data_old[i][j+1] == 'I-LOC':`
			`data_new[i].append('B-LOC')`
			`elif data_old[i][j] == 'B-ORG' and data_old[i][j+1] == 'I-PER':`
			`data_new[i].append('B-PER')`
			`elif data_old[i][j] == 'B-PER' and (data_old[i][j-1] == 'I-PER' or data_old[i][j-1] == 'B-PER'):`
			`data_new[i].append('I-PER')`
			`elif data_old[i][j] == 'B-PER' and data_old[i][j+1] == 'I-MISC':`
			`data_new[i].append('B-MISC')`
			`elif data_old[i][j] == 'B-PER' and data_old[i][j+1] == 'I-LOC':`
			`data_new[i].append('B-LOC')`
			`elif data_old[i][j] == 'B-PER' and data_old[i][j+1] == 'I-ORG':`
			`data_new[i].append('B-ORG')`
			`# check if I- is good`
			`elif (data_old[i][j] == 'I-MISC' or data_old[i][j] == 'I-ORG' or data_old[i][j] == 'I-PER') and data_old[i][j-1] == 'I-LOC':`
			`data_new[i].append('I-LOC')`
			`elif (data_old[i][j] == 'I-LOC' or data_old[i][j] == 'I-ORG' or data_old[i][j] == 'I-PER') and data_old[i][j-1] == 'I-MISC':`
			`data_new[i].append('I-MISC')`
			`elif (data_old[i][j] == 'I-MISC' or data_old[i][j] == 'I-LOC' or data_old[i][j] == 'I-PER') and data_old[i][j-1] == 'I-ORG':`
			`data_new[i].append('I-ORG')`
			`elif (data_old[i][j] == 'I-MISC' or data_old[i][j] == 'I-LOC' or data_old[i][j] == 'I-ORG') and data_old[i][j-1] == 'I-PER':`
			`data_new[i].append('I-PER')`
			`else:`
			`data_new[i].append(data_old[i][j])`
			`return data_new`


			`if __name__ == "__main__":`
			`data = read("test-A/out.tsv")`
			`data = fix_bio_labels(data)`
			`save(data, "test-A/out.tsv")`