"""Evaluate a trained tagging model on the dev-0 split.

Reads token sequences from ``dev-0/in.tsv`` and gold labels from
``dev-0/expected.tsv``, decodes every document in windows of ``seq_length``
tokens, prints classification metrics, dumps the raw label ids to
``correct.txt`` / ``predicted.txt`` and writes the predicted BIO tags (after
span repair) to ``dev-0/out.tsv``.
"""
import csv
import pickle

import pandas as pd
import torch
from sklearn.metrics import accuracy_score, classification_report, f1_score
from tqdm import tqdm

from model import Model


def process_output(lines):
    """Repair BIO label sequences and join each one into a single string.

    An ``I-`` label that opens a span (first label of the sequence, or right
    after ``O``) is rewritten to the matching ``B-`` label. An ``I-`` label
    that follows another entity label takes over that label's entity type.

    Args:
        lines: iterable of label sequences (one sequence of strings per
            document).

    Returns:
        A list with one string per input sequence, labels separated by a
        single space.
    """
    result = []
    for line in lines:
        last_label = None
        new_line = []
        for label in line:
            if label.startswith("I-"):
                if last_label is None or last_label == "O":
                    label = label.replace("I-", "B-")
                else:
                    # Continue the previous span with its entity type.
                    label = "I-" + last_label[2:]
            last_label = label
            new_line.append(label)
        result.append(" ".join(new_line))
    return result


def data_process(dt):
    """Map each space-separated document to a long tensor of vocabulary ids.

    Tokens are normalised with ``process_token`` and looked up in the
    module-level ``vocab``.
    """
    return [
        torch.tensor(
            [vocab[process_token(token)] for token in document.split(" ")],
            dtype=torch.long,
        )
        for document in dt
    ]


def labels_process(dt):
    """Map each space-separated label string to a long tensor of label ids.

    Labels are looked up in the module-level ``labels_vocab``.
    """
    return [
        torch.tensor(
            [labels_vocab[token] for token in document.split(" ")],
            dtype=torch.long,
        )
        for document in dt
    ]


def process_document(document):
    """Split a document on single spaces and normalise every token."""
    return [process_token(x) for x in document.split(" ")]


def save_file(path, obj):
    """Write the string ``obj`` to ``path``, replacing any existing file."""
    with open(path, "w") as file:
        file.write(obj)


def process_token(token):
    """Return the lower-cased form of ``token``."""
    return token.lower()


# NOTE(review): read_csv applies its default quote handling here; if the TSV
# files can contain bare '"' tokens, rows may be parsed differently from the
# raw lines -- confirm against the data.
data = pd.read_csv("dev-0/in.tsv", sep="\t", names=['data'])
ex_data = pd.read_csv("dev-0/expected.tsv", sep="\t", names=['labels'])
in_data = data['data']
target = ex_data['labels']

num_tags = 9
seq_length = 5
save_path = "dev-0/out.tsv"

# NOTE: pickle.load can execute arbitrary code; only load a vocab.pickle that
# was produced by a trusted run.
with open("vocab.pickle", "rb") as vocab_file:
    vocab = pickle.load(vocab_file)

labels_vocab = {
    'O': 0,
    'B-PER': 1,
    'B-LOC': 2,
    'I-PER': 3,
    'B-MISC': 4,
    'I-MISC': 5,
    'I-LOC': 6,
    'B-ORG': 7,
    'I-ORG': 8,
}
inv_labels_vocab = {v: k for k, v in labels_vocab.items()}

train_tokens_ids = data_process(in_data)
train_labels = labels_process(target)

model = Model(num_tags, seq_length, vocab)
device = torch.device("cuda")
model.to(device)
model.cuda(0)
model.eval()
# eval_mode() is defined by the project's Model class; kept as in the
# original setup sequence.
model.eval_mode()

predicted = []
correct = []

model.load_state_dict(torch.load("model.torch"))

for doc_tokens, doc_labels in tqdm(
    zip(train_tokens_ids, train_labels), total=len(train_tokens_ids)
):
    n_tokens = len(doc_tokens)
    # Walk the document in consecutive windows of seq_length tokens. The last
    # window is simply shorter when the length is not a multiple of
    # seq_length, so the tail is decoded from the tokens that follow the last
    # full window (not from the start of that window again).
    for start in range(0, n_tokens, seq_length):
        end = start + seq_length
        batch_tokens = doc_tokens[start:end].unsqueeze(0)
        predicted_tags = model.decode(batch_tokens.to(device))
        predicted += predicted_tags[0]
        correct += doc_labels[start:end].tolist()

print(classification_report(correct, predicted))
print(accuracy_score(correct, predicted))
print(f1_score(correct, predicted, average="micro"))

save_file("correct.txt", '\n'.join([str(x) for x in correct]))
save_file("predicted.txt", '\n'.join([str(x) for x in predicted]))

# Translate label ids back to their string tags.
predicted = [inv_labels_vocab[x] for x in predicted]

# Number of tokens per input document, used to cut the flat prediction list
# back into one label sequence per document.
slices = [len(x.split(" ")) for x in in_data]

# newline="" lets the csv writer control line endings itself.
with open(save_path, "w", newline="") as save:
    writer = csv.writer(save, delimiter='\t', lineterminator='\n')
    accumulator = 0
    output = []
    for doc_length in slices:
        output.append(predicted[accumulator: accumulator + doc_length])
        accumulator += doc_length
    for line in process_output(output):
        writer.writerow([line])