# en-ner-conll-2003/BIO_fixer.py
# Repairs inconsistent BIO tags in a space-separated label file.

from tqdm import tqdm
def read(path):
    """Read a label file into a list of rows of space-separated tokens.

    Each line of the file becomes one row, produced by splitting the line on
    single spaces.  The line terminator is NOT stripped, so the last token of
    a row still ends with ``"\\n"`` whenever the line had one.

    Args:
        path: Path of the text file to read.

    Returns:
        A list with one list of string tokens per line of the file.
    """
    # Plain read mode: the file is never written here, and "r+" would fail
    # on read-only files for no benefit.
    with open(path, "r") as file:
        return [line.split(sep=" ") for line in file]
def save(content, path):
    """Write rows of tokens to ``path``, one row per line.

    Tokens of a row are converted with ``str`` and joined by single spaces.
    A row whose last token ends with ``"\\n"`` (as produced by reading lines
    without stripping them) has that single newline removed before the rows
    are joined, so rows with and without a trailing newline are written the
    same way.  The file is written without a trailing newline after the last
    row.

    Args:
        content: Iterable of rows; each row is an iterable of tokens.
        path: Path of the file to (over)write.
    """
    lines = []
    for row in content:
        line = " ".join(str(token) for token in row)
        # Drop the row's own line terminator only when it is really there;
        # blindly cutting the last character would truncate the final label
        # of a row that has no trailing newline.
        if line.endswith("\n"):
            line = line[:-1]
        lines.append(line)
    with open(path, "w") as file:
        file.write("\n".join(lines))
# Entity kinds whose B-/I- tags are repaired.
_ENTITY_KINDS = ("MISC", "LOC", "ORG", "PER")


def _fix_label(prev, cur, nxt):
    """Return the repaired tag for ``cur`` given its neighbouring tags.

    ``prev`` and ``nxt`` are the original (unrepaired) neighbouring tags, or
    ``None`` when ``cur`` is the first / last tag of its row.
    """
    prefix, sep, kind = cur.partition("-")
    if not sep or kind not in _ENTITY_KINDS:
        return cur
    if prefix == "B":
        # A B- tag directly after a tag of the same kind continues that span.
        if prev in ("B-" + kind, "I-" + kind):
            return "I-" + kind
        # A B- tag followed by an I- tag of another kind adopts that kind.
        if (
            nxt is not None
            and nxt != "I-" + kind
            and nxt[:2] == "I-"
            and nxt[2:] in _ENTITY_KINDS
        ):
            return "B-" + nxt[2:]
    elif prefix == "I":
        # An I- tag after an I- tag of another kind adopts the previous kind.
        if (
            prev is not None
            and prev != cur
            and prev[:2] == "I-"
            and prev[2:] in _ENTITY_KINDS
        ):
            return prev
        # An I- tag directly after "O" has to open a span instead.
        if prev == "O":
            return "B-" + kind
    return cur


def fix_bio_labels(data_old):
    """Repair inconsistent BIO tags row by row.

    Every decision is based on the ORIGINAL neighbouring tags of a token, not
    on already-repaired ones.  The rules, applied in this order, are:

    * ``B-X`` preceded by ``B-X`` or ``I-X`` becomes ``I-X``;
    * ``B-X`` followed by ``I-Y`` (``Y != X``) becomes ``B-Y``;
    * ``I-X`` preceded by ``I-Y`` (``Y != X``) becomes ``I-Y``;
    * ``I-X`` preceded by ``O`` becomes ``B-X``;
    * anything else is kept unchanged.

    A trailing ``"\\n"`` on the last token of a row is set aside while the
    rules are applied and re-attached afterwards, so the last tag of a row
    takes part in the repair like any other tag.  The first token of a row has
    no previous tag and the last token has no following tag.

    Args:
        data_old: List of rows, each a list of tag strings.

    Returns:
        A new list of rows with the same shape as ``data_old``.
    """
    data_new = []
    for i, row in enumerate(tqdm(data_old)):
        # The first row is copied through untouched, as before.
        # NOTE(review): this looks like it was meant to guard the first token
        # of a row (j == 0) rather than the first row - confirm before changing.
        if i == 0 or not row:
            data_new.append(list(row))
            continue

        labels = list(row)
        newline = ""
        if labels[-1].endswith("\n"):
            newline = "\n"
            labels[-1] = labels[-1][:-1]

        last = len(labels) - 1
        fixed = [
            _fix_label(
                labels[j - 1] if j > 0 else None,
                cur,
                labels[j + 1] if j < last else None,
            )
            for j, cur in enumerate(labels)
        ]
        fixed[-1] += newline
        data_new.append(fixed)
    return data_new
if __name__ == "__main__":
    # The labels are repaired in place: the same file is read and overwritten.
    out_path = "dev-0/out.tsv"
    data = read(out_path)
    data = fix_bio_labels(data)
    save(data, out_path)