"""Generate NER tag predictions for ``dev-0`` with a trained tagging model.

Reads whitespace-tokenised documents from ``dev-0/in.tsv`` and the gold tags
from ``dev-0/expected.tsv``, decodes tags with the weights stored in
``model.torch``, prints token-level metrics and writes one line of
space-separated tags per document to ``dev-0/out.tsv``.
"""
import csv
import pickle

import pandas as pd
import torch

num_tags = 9
seq_length = 5
save_path = "dev-0/out.tsv"

# Object indexed by (lower-cased) token to obtain its id; populated by
# ``main()`` from ``vocab.pickle``.
vocab = None

labels_vocab = {
    'O': 0,
    'B-PER': 1,
    'B-LOC': 2,
    'I-PER': 3,
    'B-MISC': 4,
    'I-MISC': 5,
    'I-LOC': 6,
    'B-ORG': 7,
    'I-ORG': 8,
}

inv_labels_vocab = {v: k for k, v in labels_vocab.items()}


def process_output(lines):
    """Repair ``I-`` tags so every line is a consistent BIO sequence.

    An ``I-X`` tag at the start of a line or directly after ``O`` becomes
    ``B-X``; an ``I-X`` tag following another entity tag inherits that
    entity's type.  All other tags are left untouched.

    Args:
        lines: iterable of tag sequences (one sequence per document).

    Returns:
        A list with one space-joined tag string per input sequence.
    """
    result = []
    for line in lines:
        last_label = None
        new_line = []
        for label in line:
            if label.startswith("I-"):
                if last_label is None or last_label == "O":
                    # An entity cannot start with an inside tag.
                    label = "B-" + label[2:]
                else:
                    # Continue the entity that is currently open.
                    label = "I-" + last_label[2:]
            last_label = label
            new_line.append(label)
        result.append(" ".join(new_line))
    return result


def process_token(token):
    """Normalise a token before vocabulary lookup (lower-casing)."""
    return token.lower()


def process_document(document):
    """Split *document* on single spaces and normalise every token."""
    return [process_token(x) for x in document.split(" ")]


def data_process(dt):
    """Convert documents to a list of ``torch.long`` tensors of token ids.

    Uses the module-level ``vocab``, which must be loaded beforehand.
    """
    return [
        torch.tensor(
            [vocab[process_token(token)] for token in document.split(" ")],
            dtype=torch.long,
        )
        for document in dt
    ]


def labels_process(dt):
    """Convert space-separated tag strings to ``torch.long`` id tensors.

    Raises:
        KeyError: if a tag is not present in ``labels_vocab``.
    """
    return [
        torch.tensor(
            [labels_vocab[token] for token in document.split(" ")],
            dtype=torch.long,
        )
        for document in dt
    ]


def save_file(path, obj):
    """Write the string *obj* to *path*, replacing any existing file."""
    with open(path, "w") as file:
        file.write(obj)


def chunk_bounds(length, size):
    """Yield ``(start, end)`` index pairs covering ``range(length)``.

    Every window holds ``size`` items except possibly the last one, which
    holds the remaining tail.  Nothing is yielded for ``length == 0``.

    Raises:
        ValueError: if *size* is not positive.
    """
    if size <= 0:
        raise ValueError("size must be a positive integer")
    for start in range(0, length, size):
        yield start, min(start + size, length)


def main():
    """Run inference on ``dev-0``, print metrics and write the tag file."""
    global vocab

    # Imported lazily so the pure helpers above stay importable without
    # scikit-learn, tqdm or the project-local model module.
    from sklearn.metrics import accuracy_score, classification_report, f1_score
    from tqdm import tqdm

    from model import Model

    # NOTE(review): pandas' default quoting treats '"' as a quote character
    # and may merge lines containing quote tokens; quoting=csv.QUOTE_NONE
    # might be needed here - confirm it matches how the training data was read.
    data = pd.read_csv("dev-0/in.tsv", sep="\t", names=['data'])
    ex_data = pd.read_csv("dev-0/expected.tsv", sep="\t", names=['labels'])

    in_data = data['data']
    target = ex_data['labels']
    if len(in_data) != len(target):
        raise ValueError(
            f"dev-0/in.tsv has {len(in_data)} documents but "
            f"dev-0/expected.tsv has {len(target)}"
        )

    # SECURITY: pickle can execute arbitrary code; only load a trusted file.
    with open("vocab.pickle", "rb") as file:
        vocab = pickle.load(file)

    tokens_ids = data_process(in_data)
    labels_ids = labels_process(target)

    # Fall back to CPU instead of crashing when no GPU is available.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = Model(num_tags, seq_length, vocab)
    model.load_state_dict(torch.load("model.torch", map_location=device))
    model.to(device)
    model.eval()
    model.eval_mode()

    predicted = []
    correct = []
    with torch.no_grad():  # inference only, no autograd graph needed
        for doc_tokens, doc_labels in tqdm(
            zip(tokens_ids, labels_ids), total=len(tokens_ids)
        ):
            # Decode window by window; the final window carries the real
            # tail of the document (the old code re-read the start of the
            # last full window instead).
            for start, end in chunk_bounds(len(doc_tokens), seq_length):
                batch_tokens = doc_tokens[start:end].unsqueeze(0)
                predicted += model.decode(batch_tokens.to(device))[0]
                correct += doc_labels[start:end].tolist()

    print(classification_report(correct, predicted))
    print(accuracy_score(correct, predicted))
    print(f1_score(correct, predicted, average="micro"))
    save_file("correct.txt", '\n'.join([str(x) for x in correct]))
    save_file("predicted.txt", '\n'.join([str(x) for x in predicted]))

    # Regroup the flat prediction list into one tag sequence per document.
    tag_names = [inv_labels_vocab[tag_id] for tag_id in predicted]
    output = []
    offset = 0
    for doc_tokens in tokens_ids:
        length = len(doc_tokens)
        output.append(tag_names[offset: offset + length])
        offset += length

    with open(save_path, "w", newline="") as save:
        writer = csv.writer(save, delimiter='\t', lineterminator='\n')
        for line in process_output(output):
            writer.writerow([line])


if __name__ == "__main__":
    main()
torch.nn.Embedding(len(vocab.get_itos()), 100) - self.gru = torch.nn.GRU(100, 256, 1, batch_first=True) - self.hidden2tag = torch.nn.Linear(256, 9) - self.crf = CRF(num_tags, batch_first=True) - self.relu = torch.nn.ReLU() - self.fc1 = torch.nn.Linear(1, seq_length) - self.softmax = torch.nn.Softmax(dim=0) - self.sigm = torch.nn.Sigmoid() +nlp = spacy.load("en_core_web_sm") - def forward(self, data, tags): - emb = self.relu(self.emb(data)) - out, h_n = self.gru(emb) - out = self.hidden2tag(out) - out = self.crf(out, tags.T) - return -out - - def decode(self, data): - emb = self.relu(self.emb(data)) - out, h_n = self.gru(emb) - out = self.hidden2tag(out) - out = self.crf.decode(out) - return out - - def train_mode(self): - self.crf.train() - - def eval_mode(self): - self.crf.eval() def process_output(lines): result = [] @@ -64,7 +36,14 @@ def process_output(lines): return result def process_document(document): - return document.split(" ") + return [process_token(x) for x in document.split(" ")] + +def save_file(path, obj): + with open(path, "w") as file: + file.write(obj) + +def process_token(token): + return token.lower() def build_vocab(dataset): counter = Counter() @@ -78,26 +57,28 @@ def build_vocab(dataset): return v def data_process(dt): - return [ torch.tensor([vocab[token] for token in document.split(" ") ], dtype = torch.long) for document in dt] + return [ torch.tensor([vocab[process_token(token)] for token in document.split(" ") ], dtype = torch.long) for document in dt] def labels_process(dt): return [ torch.tensor([labels_vocab[token] for token in document.split(" ") ], dtype = torch.long) for document in dt] # mode = "train" -# mode = "eval" -mode = "generate" +mode = "eval" +# mode = "generate" save_path = "dev-0/out.tsv" -data = pd.read_csv("dev-0/in.tsv", sep="\t", names=['0']) +data = pd.read_csv("dev-0/in.tsv", sep="\t", names=['data']) # data.columns = ["labels", "text"] # train_target = pd.read_csv("train/train.tsv", sep = '\t', names = 
import torch
from torchcrf import CRF


class Model(torch.nn.Module):
    """Sequence tagger: embedding -> ReLU -> GRU -> linear -> CRF.

    Args:
        num_tags: number of distinct tags; sizes both the emission layer
            and the CRF.
        seq_length: only used to size the (currently unused) ``fc1`` layer.
        vocab: vocabulary object exposing ``get_itos()``; its length sets
            the number of embedding rows.
    """

    def __init__(self, num_tags, seq_length, vocab):
        super().__init__()
        self.emb = torch.nn.Embedding(len(vocab.get_itos()), 100)
        self.gru = torch.nn.GRU(100, 256, 1, batch_first=True)
        # Emission size must match the CRF's tag count (was hard-coded to 9,
        # which only worked when num_tags == 9).
        self.hidden2tag = torch.nn.Linear(256, num_tags)
        self.crf = CRF(num_tags, batch_first=True)
        self.relu = torch.nn.ReLU()
        # The layers below are not used by forward()/decode().  ``fc1`` owns
        # parameters, so it is kept to leave the state_dict keys of already
        # saved checkpoints unchanged.
        self.fc1 = torch.nn.Linear(1, seq_length)
        self.softmax = torch.nn.Softmax(dim=0)
        self.sigm = torch.nn.Sigmoid()

    def _emissions(self, data):
        """Return per-token tag scores for a batch of token ids."""
        emb = self.relu(self.emb(data))
        out, _ = self.gru(emb)
        return self.hidden2tag(out)

    def forward(self, data, tags):
        """Return the negated CRF score of *tags* for *data* (training loss).

        *tags* is transposed before being handed to the batch-first CRF, so
        it presumably arrives as (seq_len, batch) - TODO confirm against the
        training loop.
        """
        return -self.crf(self._emissions(data), tags.T)

    def decode(self, data):
        """Return the CRF's decoded tag sequences for a batch of token ids."""
        return self.crf.decode(self._emissions(data))

    def train_mode(self):
        """Put the CRF sub-module into training mode."""
        self.crf.train()

    def eval_mode(self):
        """Put the CRF sub-module into evaluation mode."""
        self.crf.eval()