# Forked from kubapok/en-ner-conll-2003 (83 lines, 3.8 KiB, Python).
|
from tqdm import tqdm
|
||
|
|
||
|
def read(path):
    """Read a space-separated label file into a list of token lists.

    Every line of the file becomes one list, produced by splitting the line
    on single spaces. Line endings are not stripped, so the last token of
    each newline-terminated line keeps its trailing newline.

    Args:
        path: Path of the file to read.

    Returns:
        A list containing one list of string tokens per line of the file.
    """
    # Open read-only: "r+" also demands write permission, which this function
    # never needs and which makes reading a read-only file fail.
    with open(path, "r") as file:
        return [line.split(sep=" ") for line in file]
|
||
|
|
||
|
|
||
|
def save(content, path):
    """Write rows of tokens to *path*, joining the tokens with single spaces.

    No line separator is inserted between rows: rows are expected to carry
    their own line ending on their last token. One trailing newline at the
    very end of the output, if present, is removed before writing.

    Args:
        content: Iterable of rows; each row is an iterable of tokens, which
            are converted with ``str`` before being written.
        path: Path of the file to (over)write.
    """
    # Join instead of repeated "+=" so building the text stays linear.
    text = "".join(" ".join(str(token) for token in row) for row in content)
    # Remove only a real trailing newline. Blindly dropping the last character
    # truncated the final label whenever the last row had no line ending, and
    # an empty row used to eat a character from the row before it.
    if text.endswith("\n"):
        text = text[:-1]
    with open(path, "w") as file:
        file.write(text)
|
||
|
|
||
|
|
||
|
_ENTITY_KINDS = frozenset(("MISC", "LOC", "ORG", "PER"))


def _split_line_ending(token):
    """Return ``(label, line_ending)`` for a raw token such as ``"I-LOC\\n"``."""
    label = token.rstrip("\r\n")
    return label, token[len(label):]


def _repair_label(prev, cur, nxt):
    """Return the repaired version of label *cur* given its neighbours.

    *prev* / *nxt* are the neighbouring labels in the same row, or ``None``
    when *cur* is the first / last label of the row.
    """
    prefix, _, kind = cur.partition("-")
    if kind not in _ENTITY_KINDS:
        return cur
    if prefix == "B":
        # A B- tag directly after a tag of the same kind continues that entity.
        if prev in ("I-" + kind, "B-" + kind):
            return "I-" + kind
        # A B- tag followed by an I- tag of another kind adopts that kind.
        if nxt is not None:
            nxt_prefix, _, nxt_kind = nxt.partition("-")
            if nxt_prefix == "I" and nxt_kind in _ENTITY_KINDS and nxt_kind != kind:
                return "B-" + nxt_kind
    elif prefix == "I" and prev is not None:
        # An I- tag directly after an I- tag of another kind adopts that kind.
        prev_prefix, _, prev_kind = prev.partition("-")
        if prev_prefix == "I" and prev_kind in _ENTITY_KINDS and prev_kind != kind:
            return "I-" + prev_kind
    return cur


def fix_bio_labels(data_old):
    """Return a copy of *data_old* with inconsistent B-/I- labels repaired.

    Each row is a list of label strings for the kinds MISC, LOC, ORG and PER
    (any other token is copied unchanged). Every label is checked against its
    neighbours in the *input* row, never against already repaired labels:

    * ``B-X`` preceded by ``B-X`` or ``I-X`` becomes ``I-X``;
    * otherwise ``B-X`` followed by ``I-Y`` (``Y != X``) becomes ``B-Y``;
    * ``I-X`` preceded by ``I-Y`` (``Y != X``) becomes ``I-Y``.

    A line ending attached to a token (``"I-LOC\\n"``) is ignored for the
    comparison and re-attached to the repaired label, so the last label of a
    line takes part in the rules like any other. The first and last label of
    a row simply have no previous / next neighbour.

    Args:
        data_old: List of rows, each a list of string tokens.

    Returns:
        A new list of new rows; *data_old* is not modified.
    """
    data_new = []
    for i, row in enumerate(tqdm(data_old)):
        if i == 0:
            # NOTE(review): the first row has always been copied through
            # without any repair. That may have been meant as "first token of
            # a row" (j == 0) instead; kept as-is until the intent is confirmed.
            data_new.append(list(row))
            continue
        parts = [_split_line_ending(token) for token in row]
        last = len(parts) - 1
        fixed_row = []
        for j, (label, ending) in enumerate(parts):
            # Explicit bounds: row[j - 1] at j == 0 would wrap around to the
            # last token, and row[j + 1] on the last token raises IndexError.
            prev = parts[j - 1][0] if j > 0 else None
            nxt = parts[j + 1][0] if j < last else None
            repaired = _repair_label(prev, label, nxt)
            fixed_row.append(row[j] if repaired == label else repaired + ending)
        data_new.append(fixed_row)
    return data_new
|
||
|
|
||
|
|
||
|
if __name__ == "__main__":
    # Repair the predicted labels for test-A and overwrite the file in place.
    out_path = "test-A/out.tsv"
    save(fix_bio_labels(read(out_path)), out_path)
|