2021-06-20 22:24:32 +02:00
|
|
|
from tqdm import tqdm
|
|
|
|
|
|
|
|
def read(path):
    """Read a space-separated label file into a list of token lists.

    Args:
        path: Path of the text file to read.

    Returns:
        One list per line of the file, containing that line's pieces split
        on single space characters.  The line's trailing newline is not
        stripped, so it stays attached to the last piece of each line.
    """
    # Plain read mode is enough here; "r+" would additionally demand write
    # permission on the file even though nothing is written.
    with open(path, "r") as file:
        return [line.split(sep=" ") for line in file]
|
|
|
|
|
|
|
|
|
|
|
|
def save(content, path):
    """Write rows of labels to *path*, each row joined by single spaces.

    No separator is inserted between rows: the last item of each row is
    expected to still carry its own trailing newline (the form produced by
    ``read``).  One trailing newline at the very end of the output, if
    present, is dropped so the file does not end with a line break.

    Args:
        content: Iterable of rows; each row is an iterable of items, which
            are converted with ``str`` before being written.
        path: Destination file; it is created or truncated.
    """
    # Build the whole text first so the destination is not truncated if
    # converting an item fails, and to avoid quadratic "+=" concatenation.
    text = "".join(" ".join(str(item) for item in row) for row in content)
    # Only remove a real trailing newline.  Slicing off the last character
    # unconditionally would cut into the final label whenever the data does
    # not end with a newline (e.g. a file previously written by this function).
    if text.endswith("\n"):
        text = text[:-1]
    with open(path, "w") as file:
        file.write(text)
|
|
|
|
|
|
|
|
|
|
|
|
# Entity types whose B-/I- tags are subject to correction.
_BIO_ENTITY_TYPES = frozenset({"MISC", "LOC", "ORG", "PER"})


def _split_line_ending(token):
    """Return ``(label, ending)``, where *ending* is any trailing CR/LF of *token*."""
    if not isinstance(token, str):
        return token, ""
    label = token.rstrip("\r\n")
    return label, token[len(label):]


def _parse_tag(label):
    """Return ``(prefix, entity)`` for a B-/I- tag of a known entity type, else ``(None, None)``."""
    if (
        isinstance(label, str)
        and label[:2] in ("B-", "I-")
        and label[2:] in _BIO_ENTITY_TYPES
    ):
        return label[0], label[2:]
    return None, None


def _fix_label(current, previous, following):
    """Return the corrected form of *current* given its neighbouring labels.

    *previous* / *following* are ``None`` when *current* is the first / last
    label of its row.
    """
    prefix, entity = _parse_tag(current)
    if prefix is None:
        # "O" and any unrecognised label are passed through untouched.
        return current
    prev_prefix, prev_entity = _parse_tag(previous)

    if prefix == "B":
        # A B- tag directly after a B-/I- tag of the same type continues it.
        if prev_entity == entity:
            return "I-" + entity
        # A B- tag followed by an I- tag of another type adopts that type.
        next_prefix, next_entity = _parse_tag(following)
        if next_prefix == "I" and next_entity != entity:
            return "B-" + next_entity
        return current

    # prefix == "I"
    # An I- tag after an I- tag of another type adopts the preceding type.
    if prev_prefix == "I" and prev_entity != entity:
        return "I-" + prev_entity
    # An I- tag cannot follow "O": it has to open the entity instead.
    if previous == "O":
        return "B-" + entity
    return current


def fix_bio_labels(data_old):
    """Repair inconsistent BIO tags (MISC, LOC, ORG, PER) row by row.

    Every decision is based on the ORIGINAL neighbouring labels, never on
    already-corrected ones, so a correction does not cascade along the row.

    Rules applied to each label, in this order:
      * ``B-X`` preceded by ``B-X`` or ``I-X``      -> ``I-X``
      * ``B-X`` followed by ``I-Y`` (``Y != X``)    -> ``B-Y``
      * ``I-X`` preceded by ``I-Y`` (``Y != X``)    -> ``I-Y``
      * ``I-X`` preceded by ``O``                   -> ``B-X``
      * anything else is copied unchanged.

    Boundary handling:
      * The first label of a row has no predecessor and the last label has
        no successor; the row is never indexed out of range or wrapped
        around with a negative index.
      * A trailing line ending glued to a label (``read`` leaves ``"\\n"`` on
        the last token of each line) is ignored for matching and re-attached
        to the result, so the last label of a row is corrected as well.
      * Every row, including the first one, is processed.
        NOTE(review): earlier revisions copied row 0 verbatim because of an
        ``i == 0`` guard inside the token loop, which looks like a typo for
        a first-token guard -- confirm no caller relies on row 0 being
        skipped.

    Args:
        data_old: Sequence of rows, each a sequence of label strings.

    Returns:
        A new list of lists with the same shape as *data_old*; the input is
        not modified.
    """
    data_new = []
    for row in tqdm(data_old):
        parts = [_split_line_ending(token) for token in row]
        labels = [label for label, _ in parts]
        last_index = len(labels) - 1

        fixed_row = []
        for j, (label, ending) in enumerate(parts):
            previous = labels[j - 1] if j > 0 else None
            following = labels[j + 1] if j < last_index else None
            fixed = _fix_label(label, previous, following)
            fixed_row.append(fixed + ending if ending else fixed)
        data_new.append(fixed_row)
    return data_new
|
|
|
|
|
|
|
|
|
|
|
|
if __name__ == "__main__":
    # Repair the predicted labels in place: the file is read, corrected and
    # then overwritten with the result.
    save(fix_bio_labels(read("dev-0/out.tsv")), "dev-0/out.tsv")
|