"""Repair inconsistent B-/I- entity labels in a space-separated label file."""

try:
    from tqdm import tqdm
except ImportError:
    # The progress bar is purely cosmetic, so keep working without tqdm.
    def tqdm(iterable, *args, **kwargs):
        """Stand-in for tqdm.tqdm: return *iterable* unchanged."""
        return iterable


# Entity types whose B-/I- labels are repaired; every other token is copied.
_ENTITY_TYPES = ("MISC", "LOC", "ORG", "PER")
_BEGIN_LABELS = frozenset("B-" + kind for kind in _ENTITY_TYPES)
_INSIDE_LABELS = frozenset("I-" + kind for kind in _ENTITY_TYPES)


def read(path):
    """Read a label file into a list of rows.

    Each line of the file becomes one row: the line is split on single
    spaces, after its trailing newline has been removed (otherwise the
    newline stays glued to the last label and that label can never match
    any of the label names compared against in ``fix_bio_labels``).

    Args:
        path: Path of the file to read.

    Returns:
        A list with one list of string tokens per line. An empty line
        yields ``[""]``.
    """
    # Plain "r" is enough: the file is only read, never written, here.
    with open(path, "r", encoding="utf-8") as file:
        return [line.rstrip("\n").split(" ") for line in file]


def save(content, path):
    """Write rows of tokens to *path*, one row per line.

    Tokens of a row are converted with ``str`` and joined by single spaces;
    rows are joined by newlines and no newline is written after the last
    row. A trailing newline still attached to the last token of a row is
    dropped, so rows in that older shape produce the same file.

    Args:
        content: Iterable of rows, each an iterable of tokens.
        path: Path of the file to (over)write.
    """
    # Build the text before opening the file so that a failure while
    # formatting does not leave a truncated file behind.
    lines = [
        " ".join(str(token) for token in row).rstrip("\n")
        for row in content
    ]
    with open(path, "w", encoding="utf-8") as file:
        file.write("\n".join(lines))


def _fix_label(previous, current, following):
    """Return the repaired label for *current* given its two neighbours.

    *previous* / *following* are the original neighbouring labels, or
    ``None`` at the start / end of a row.
    """
    if current in _BEGIN_LABELS:
        kind = current[2:]
        # A B- label right after a B-/I- label of the same type is treated
        # as a continuation of that entity.
        if previous in ("I-" + kind, "B-" + kind):
            return "I-" + kind
        # Otherwise the B- label takes the type of the I- label that
        # follows it (a no-op when the types already agree).
        if following in _INSIDE_LABELS:
            return "B-" + following[2:]
        return current
    # An I- label directly after an I- label takes the type of the
    # preceding one (a no-op when the types already agree).
    if current in _INSIDE_LABELS and previous in _INSIDE_LABELS:
        return previous
    return current


def fix_bio_labels(data_old):
    """Return a copy of *data_old* with inconsistent B-/I- labels repaired.

    Every row is processed independently, and every decision is based on
    the original neighbouring labels, not on already repaired ones. For the
    types MISC, LOC, ORG and PER:

    * ``B-X`` preceded by ``B-X`` or ``I-X`` becomes ``I-X``;
    * otherwise ``B-X`` followed by ``I-Y`` becomes ``B-Y``;
    * ``I-X`` preceded by ``I-Y`` becomes ``I-Y``.

    All other tokens are copied unchanged. The first token of a row has no
    previous neighbour and the last token has no following neighbour, so
    rows of any length (including empty ones) are handled, and the first
    row is repaired like every other row.

    Args:
        data_old: Sequence of rows, each a sequence of label strings.

    Returns:
        A new list of new lists; *data_old* is not modified.
    """
    data_new = []
    for row in tqdm(data_old):
        last_index = len(row) - 1
        fixed_row = []
        for j, current in enumerate(row):
            # Explicit bounds: row[j - 1] at j == 0 would silently wrap
            # around to the end of the row, and row[j + 1] at the last
            # index would raise IndexError.
            previous = row[j - 1] if j > 0 else None
            following = row[j + 1] if j < last_index else None
            fixed_row.append(_fix_label(previous, current, following))
        data_new.append(fixed_row)
    return data_new


if __name__ == "__main__":
    # The labels are repaired in place: the input file is overwritten.
    data = read("test-A/out.tsv")
    data = fix_bio_labels(data)
    save(data, "test-A/out.tsv")