import csv
import pickle
from collections import Counter, OrderedDict

import numpy as np
import pandas as pd
import torch
from tqdm import tqdm
# aliased so the factory is not shadowed by the built vocabulary below
from torchtext.vocab import vocab as make_vocab
from sklearn.metrics import accuracy_score, f1_score, classification_report

from model import Model


def process_output(lines):
    """Repair BIO sequences: an I-X tag that does not continue a preceding
    entity becomes B-X; an I-Y following an X entity is forced to I-X."""
    result = []
    for line in lines:
        last_label = None
        new_line = []
        for label in line:
            if label != "O" and label[0:2] == "I-":
                if last_label is None or last_label == "O":
                    label = label.replace("I-", "B-")
                else:
                    label = "I-" + last_label[2:]
            last_label = label
            new_line.append(label)
        result.append(" ".join(new_line))
    return result


def process_document(document):
    return [process_token(x) for x in document.split(" ")]


def save_file(path, obj):
    with open(path, "w") as file:
        file.write(obj)


def process_token(token):
    return token.lower()


def build_vocab(dataset):
    counter = Counter()
    for document in dataset:
        counter.update(process_document(document))
    # order tokens by frequency so the most common ones get the lowest ids
    sorted_by_freq = sorted(counter.items(), key=lambda x: x[1], reverse=True)
    v = make_vocab(OrderedDict(sorted_by_freq))
    # unknown tokens fall back to index 0
    v.set_default_index(0)
    return v


def data_process(dt):
    return [
        torch.tensor(
            [vocab[process_token(token)] for token in document.split(" ")],
            dtype=torch.long,
        )
        for document in dt
    ]


def labels_process(dt):
    return [
        torch.tensor(
            [labels_vocab[token] for token in document.split(" ")],
            dtype=torch.long,
        )
        for document in dt
    ]


# mode = "train"
# mode = "eval"
mode = "generate"
save_path = "dev-0/out.tsv"

data = pd.read_csv("dev-0/in.tsv", sep="\t", names=["data"])
ex_data = pd.read_csv("dev-0/expected.tsv", sep="\t", names=["labels"])
in_data = data["data"]
target = ex_data["labels"]

# To train on the training set instead:
# train_target = pd.read_csv("train/train.tsv", sep="\t", names=["labels", "data"])
# target = train_target["labels"]

# To generate predictions for test-A (no gold labels, so use dummy targets):
# test_data = pd.read_csv("test-A/in.tsv", sep="\t", names=["data"])
# in_data = test_data["data"]
# target = list(np.zeros(len(in_data)))

if mode == "train":
    vocab = build_vocab(in_data)
    with open("vocab.pickle", "wb") as file:
        pickle.dump(vocab, file)
    print("Vocab saved")
else:
    with open("vocab.pickle", "rb") as file:
        vocab = pickle.load(file)

labels_vocab = {
    "O": 0,
    "B-PER": 1,
    "B-LOC": 2,
    "I-PER": 3,
    "B-MISC": 4,
    "I-MISC": 5,
    "I-LOC": 6,
    "B-ORG": 7,
    "I-ORG": 8,
}
inv_labels_vocab = {v: k for k, v in labels_vocab.items()}

train_tokens_ids = data_process(in_data)
train_labels = labels_process(target)

num_tags = len(labels_vocab)
NUM_EPOCHS = 5
seq_length = 5

model = Model(num_tags, seq_length, vocab)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

if mode == "train":
    optimizer = torch.optim.Adam(model.parameters())
    for epoch in range(NUM_EPOCHS):
        model.train()
        model.train_mode()
        for i in tqdm(range(len(train_labels))):
            # slide a non-overlapping window of seq_length tokens over the document
            for k in range(0, len(train_tokens_ids[i]) - seq_length + 1, seq_length):
                batch_tokens = train_tokens_ids[i][k:k + seq_length].unsqueeze(0)
                tags = train_labels[i][k:k + seq_length].unsqueeze(1)
                # the model's forward pass returns the loss (negative CRF log-likelihood)
                loss = model(batch_tokens.to(device), tags.to(device))
                loss.backward()
                optimizer.step()
                # a single zero_grad on the optimizer covers all parameters, CRF included
                optimizer.zero_grad()
    torch.save(model.state_dict(), "model.torch")

if mode == "eval" or mode == "generate":
    model.load_state_dict(torch.load("model.torch"))
    model.eval()
    model.eval_mode()
    predicted = []
    correct = []
    for i in tqdm(range(len(train_tokens_ids))):
        for k in range(0, len(train_tokens_ids[i]) - seq_length + 1, seq_length):
            batch_tokens = train_tokens_ids[i][k:k + seq_length].unsqueeze(0)
            tags = train_labels[i][k:k + seq_length].unsqueeze(1)
            predicted_tags = model.decode(batch_tokens.to(device))
            predicted += predicted_tags[0]
            correct += [x[0] for x in tags.numpy().tolist()]
        # decode the tail of the document that is shorter than seq_length
        l = len(train_tokens_ids[i])
        rest = l % seq_length
        if rest != 0:
            batch_tokens = train_tokens_ids[i][l - rest:].unsqueeze(0)
            tags = train_labels[i][l - rest:].unsqueeze(1)
            predicted_tags = model.decode(batch_tokens.to(device))
            predicted += predicted_tags[0]
            correct += [x[0] for x in tags.numpy().tolist()]

    if mode == "eval":
        print(classification_report(correct, predicted))
        print(accuracy_score(correct, predicted))
        print(f1_score(correct, predicted, average="micro"))
        save_file("correct.txt", "\n".join(str(x) for x in correct))
        save_file("predicted.txt", "\n".join(str(x) for x in predicted))

    # map tag ids back to tag names and regroup the flat prediction list by document
    predicted = [inv_labels_vocab[x] for x in predicted]
    slices = [len(x.split(" ")) for x in in_data]
    with open(save_path, "w") as save:
        writer = csv.writer(save, delimiter="\t", lineterminator="\n")
        accumulator = 0
        output = []
        for slice_len in slices:
            output.append(predicted[accumulator:accumulator + slice_len])
            accumulator += slice_len
        for line in process_output(output):
            writer.writerow([line])
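
# For reference only: a minimal sketch of the interface this script expects from
# model.Model (the real implementation lives in model.py and is not shown here).
# The method names and tensor shapes are taken from the calls above; the internals
# (embedding/LSTM sizes, layer choices) are assumptions, not the actual model.
from torchcrf import CRF


class _ReferenceModel(torch.nn.Module):
    def __init__(self, num_tags, seq_length, vocab, emb_dim=100, hidden_dim=256):
        super().__init__()
        self.emb = torch.nn.Embedding(len(vocab), emb_dim)
        self.lstm = torch.nn.LSTM(emb_dim, hidden_dim, batch_first=True)
        self.fc = torch.nn.Linear(hidden_dim, num_tags)
        self.crf = CRF(num_tags)  # pytorch-crf expects (seq_len, batch, num_tags)

    def _emissions(self, tokens):
        # tokens: (1, seq_len) -> emissions: (seq_len, 1, num_tags)
        out, _ = self.lstm(self.emb(tokens))
        return self.fc(out).permute(1, 0, 2)

    def forward(self, tokens, tags):
        # the CRF returns a log-likelihood; negate it to get a loss to minimize
        return -self.crf(self._emissions(tokens), tags)

    def decode(self, tokens):
        # Viterbi decoding; returns a list with one tag-id sequence per batch item
        return self.crf.decode(self._emissions(tokens))

    def train_mode(self):
        pass  # custom hook the script calls alongside .train(); behavior assumed

    def eval_mode(self):
        pass  # custom hook the script calls alongside .eval(); behavior assumed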