import pandas as pd
import torch
from tqdm import tqdm
from torchtext.vocab import vocab as vocab_factory
from collections import Counter, OrderedDict
from torchcrf import CRF
import numpy as np  # only needed if the commented-out test-A block below is enabled
from sklearn.metrics import accuracy_score, f1_score, classification_report
import csv
import pickle


class Model(torch.nn.Module):
    def __init__(self, vocab_size, num_tags):
        super().__init__()
        self.emb = torch.nn.Embedding(vocab_size, 100)
        self.gru = torch.nn.GRU(100, 256, 1, batch_first=True)
        self.hidden2tag = torch.nn.Linear(256, num_tags)
        self.crf = CRF(num_tags, batch_first=True)
        self.relu = torch.nn.ReLU()

    def forward(self, data, tags):
        emb = self.relu(self.emb(data))
        out, _ = self.gru(emb)
        emissions = self.hidden2tag(out)
        # torchcrf returns the sequence log-likelihood; negate it to get a loss.
        return -self.crf(emissions, tags)

    def decode(self, data):
        emb = self.relu(self.emb(data))
        out, _ = self.gru(emb)
        emissions = self.hidden2tag(out)
        # Viterbi decoding: one list of tag ids per batch element.
        return self.crf.decode(emissions)


def process_document(document):
    return document.split(" ")


def build_vocab(dataset):
    counter = Counter()
    for document in dataset:
        counter.update(process_document(document))
    sorted_by_freq_tuples = sorted(counter.items(), key=lambda x: x[1], reverse=True)
    ordered_dict = OrderedDict(sorted_by_freq_tuples)
    v = vocab_factory(ordered_dict)
    # Reserve a dedicated <unk> entry so out-of-vocabulary tokens do not
    # collide with a real word's embedding.
    v.insert_token("<unk>", 0)
    v.set_default_index(v["<unk>"])
    return v


def data_process(dt):
    return [torch.tensor([vocab[token] for token in document.split(" ")], dtype=torch.long)
            for document in dt]


def labels_process(dt):
    return [torch.tensor([labels_vocab[token] for token in document.split(" ")], dtype=torch.long)
            for document in dt]


# mode = "train"
mode = "eval"
# mode = "generate"
save_path = "dev-0/out.tsv"

data = pd.read_csv("dev-0/in.tsv", sep="\t", names=["0"])
ex_data = pd.read_csv("dev-0/expected.tsv", sep="\t", names=["labels"])
in_data = data["0"]
target = ex_data["labels"]

# To generate predictions for the test set instead:
# test_data = pd.read_csv("test-A/in.tsv", sep="\t", names=["0"])
# in_data = test_data["0"]
# target = list(np.zeros(len(in_data)))

if mode == "train":
    vocab = build_vocab(in_data)
    with open("vocab.pickle", "wb") as file:
        pickle.dump(vocab, file)
    print("Vocab saved")
else:
    with open("vocab.pickle", "rb") as file:
        vocab = pickle.load(file)

labels_vocab = {
    'O': 0,
    'B-PER': 1,
    'B-LOC': 2,
    'I-PER': 3,
    'B-MISC': 4,
    'I-MISC': 5,
    'I-LOC': 6,
    'B-ORG': 7,
    'I-ORG': 8,
}
inv_labels_vocab = {v: k for k, v in labels_vocab.items()}

train_tokens_ids = data_process(in_data)
train_labels = labels_process(target)

num_tags = len(labels_vocab)
NUM_EPOCHS = 5
seq_length = 5

device = torch.device("cpu")
# device = torch.device("cuda:0")  # uncomment to use a GPU
model = Model(len(vocab), num_tags)
model.to(device)

if mode == "train":
    # The CRF layer supplies the loss itself, so no separate criterion is needed.
    optimizer = torch.optim.Adam(model.parameters())
    for epoch in range(NUM_EPOCHS):
        model.train()
        for i in tqdm(range(len(train_labels))):
            # Split each document into seq_length-sized chunks; the final chunk
            # may be shorter, which both the GRU and the CRF accept.
            for k in range(0, len(train_tokens_ids[i]), seq_length):
                batch_tokens = train_tokens_ids[i][k: k + seq_length].unsqueeze(0)
                tags = train_labels[i][k: k + seq_length].unsqueeze(0)
                optimizer.zero_grad()
                loss = model(batch_tokens.to(device), tags.to(device))
                loss.backward()
                optimizer.step()
    torch.save(model.state_dict(), "model.torch")
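# Optional sanity check (a debugging sketch, not part of the pipeline above;
# it assumes the vocabulary has at least six entries, which any real dataset
# satisfies). Flip the flag to verify the tensor shapes the train and eval
# loops rely on.
RUN_SANITY_CHECK = False
if RUN_SANITY_CHECK:
    toy_tokens = torch.tensor([[1, 2, 3, 4, 5]])  # (batch=1, seq_length) token ids
    toy_tags = torch.tensor([[0, 1, 3, 0, 2]])    # gold tag ids, same shape
    print("toy loss:", model(toy_tokens, toy_tags).item())  # scalar CRF NLL
    print("toy decode:", model.decode(toy_tokens))          # [[five tag ids]]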
if mode == "eval" or mode == "generate":
    model.load_state_dict(torch.load("model.torch", map_location=device))
    model.eval()
    predicted = []
    correct = []
    with torch.no_grad():
        for i in tqdm(range(len(train_tokens_ids))):
            # Same chunking as in training; walking to the end of the document
            # yields exactly one predicted tag per token.
            for k in range(0, len(train_tokens_ids[i]), seq_length):
                batch_tokens = train_tokens_ids[i][k: k + seq_length].unsqueeze(0)
                tags = train_labels[i][k: k + seq_length]
                predicted += model.decode(batch_tokens.to(device))[0]
                correct += tags.tolist()

    if mode == "eval":
        print(classification_report(correct, predicted))
        print(accuracy_score(correct, predicted))
        print(f1_score(correct, predicted, average="weighted"))

    # Map tag ids back to their string labels and write one space-joined tag
    # sequence per input document.
    predicted = [inv_labels_vocab[tag_id] for tag_id in predicted]
    slices = [len(document.split(" ")) for document in in_data]
    with open(save_path, "w", newline="") as save:
        writer = csv.writer(save, delimiter="\t", lineterminator="\n")
        accumulator = 0
        for length in slices:
            writer.writerow([" ".join(predicted[accumulator: accumulator + length])])
            accumulator += length
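# Optional round-trip check (a debugging sketch, assuming out.tsv was just
# written above): every output line should carry exactly one tag per token of
# the corresponding input document.
if mode in ("eval", "generate"):
    with open(save_path) as f_out:
        for document, tags_line in zip(in_data, f_out):
            assert len(document.split(" ")) == len(tags_line.rstrip("\n").split(" "))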