import pandas as pd
import torch
from torchtext.vocab import Vocab  # legacy (Counter-based) torchtext Vocab API
from collections import Counter
from sklearn.metrics import f1_score
from torchcrf import CRF
from tqdm import tqdm


def load_train_data():
    data = pd.read_csv("train/train.tsv.xz", sep='\t', names=['labels', 'document'])
    Y_raw = data['labels'].values
    X_raw = data['document'].values
    return tokenize(X_raw, Y_raw)


def load_test_data():
    data = pd.read_csv("test-A/in.tsv", sep='\t', names=['document'])
    # Split each document into tokens here, so that data_process() iterates
    # over words rather than over the characters of the raw string.
    X = [x.split(sep=" ") for x in data['document'].values]
    return X


def load_dev_data():
    data = pd.read_csv("dev-0/in.tsv", sep='\t', names=['document'])
    X_raw = data['document'].values
    labels_df = pd.read_csv("dev-0/expected.tsv", sep='\t', names=['labels'])
    Y_raw = labels_df['labels'].values
    return tokenize(X_raw, Y_raw)


def build_vocab(dataset):
    counter = Counter()
    for document in dataset:
        counter.update(document)
    return Vocab(counter, specials=['<unk>', '<pad>', '<bos>', '<eos>'])


def build_vocab_BIO(dataset):
    # No specials: the label vocabulary must contain exactly the BIO tags,
    # so that its size matches the CRF's number of tags.
    counter = Counter()
    for document in dataset:
        counter.update(document)
    return Vocab(counter, specials=[])


def tokenize(X_raw, Y_raw):
    X = [x.split(sep=" ") for x in X_raw]
    Y = [y.split(sep=" ") for y in Y_raw]
    return X, Y


def data_process(dt, vocab):
    # Map each token to its vocabulary index; unknown tokens fall back to <unk>.
    return [torch.tensor([vocab[token] for token in document], dtype=torch.long)
            for document in dt]


def data_translate(dt, vocab):
    # Map indices back to their string form.
    return [[vocab.itos[token] for token in document] for document in dt]


def save_predictions(Y_pred_translated, save_path):
    # One document per line, tags separated by spaces.
    with open(save_path, "w") as file:
        file.write("\n".join(" ".join(tags) for tags in Y_pred_translated))


class GRU(torch.nn.Module):
    def __init__(self, doc_vocab_len, tags_number):
        super(GRU, self).__init__()
        self.emb = torch.nn.Embedding(doc_vocab_len, 100)
        self.dropout = torch.nn.Dropout(0.2)
        # Two-layer bidirectional GRU; its output size is 2 * hidden size.
        self.rec = torch.nn.GRU(100, 256, 2, batch_first=True, bidirectional=True)
        self.fc1 = torch.nn.Linear(2 * 256, tags_number)

    def forward(self, x):
        emb = torch.relu(self.emb(x))
        emb = self.dropout(emb)
        gru_output, h_n = self.rec(emb)
        out_weights = self.fc1(gru_output)  # per-token tag emissions
        return out_weights


def train_model(bio, crf, optimizer, device, X, Y, epoch_amount):
    for epoch in range(epoch_amount):
        print("\nepoch: ", epoch + 1, "/", epoch_amount)
        bio.train()
        crf.train()
        for i in tqdm(range(len(Y))):
            batch_tokens = X[i].unsqueeze(0).to(device)  # (1, seq_len)
            batch_tags = Y[i].unsqueeze(1).to(device)    # (seq_len, 1)
            # Reshape emissions from (1, seq_len, num_tags) to
            # (seq_len, 1, num_tags), which the CRF expects with its
            # default batch_first=False.
            emissions = bio(batch_tokens).squeeze(0).unsqueeze(1)
            optimizer.zero_grad()
            loss = -crf(emissions, batch_tags)  # negative log-likelihood
            loss.backward()
            optimizer.step()


def test_model(dataset_tokens, bio, crf, device, BIO_vocab, save_path):
    bio.eval()
    crf.eval()
    Y_pred = []
    for i in tqdm(range(len(dataset_tokens))):
        batch_tokens = dataset_tokens[i].unsqueeze(0).to(device)
        emissions = bio(batch_tokens).squeeze(0).unsqueeze(1)
        Y_pred += [crf.decode(emissions)[0]]
    Y_pred_translated = data_translate(Y_pred, BIO_vocab)
    save_predictions(Y_pred_translated, save_path)


def eval_model(dataset_tokens, dataset_labels, bio, crf, device, BIO_vocab, save_path):
    Y_true = []
    Y_pred = []
    bio.eval()
    crf.eval()
    for i in tqdm(range(len(dataset_labels))):
        batch_tokens = dataset_tokens[i].unsqueeze(0).to(device)
        emissions = bio(batch_tokens).squeeze(0).unsqueeze(1)
        Y_pred += [crf.decode(emissions)[0]]
        Y_true += [dataset_labels[i].tolist()]
    # Token-level micro F1 over the flattened label sequences.
    flat_true = [t for tags in Y_true for t in tags]
    flat_pred = [t for tags in Y_pred for t in tags]
    print("F1 (micro): ", f1_score(flat_true, flat_pred, average='micro'))
    Y_pred_translated = data_translate(Y_pred, BIO_vocab)
    save_predictions(Y_pred_translated, save_path)
    return Y_pred_translated
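
# Illustrative helper (not part of the original pipeline): a quick sanity
# check of the emission reshaping. The GRU runs with batch_first=True, so its
# output is (1, seq_len, num_tags); torchcrf's CRF defaults to
# batch_first=False and expects (seq_len, batch, num_tags), hence the
# squeeze(0)/unsqueeze(1) used throughout.
def _shape_check(bio, crf, device, vocab_len, seq_len=7):
    tokens = torch.randint(0, vocab_len, (1, seq_len)).to(device)  # (1, seq_len)
    emissions = bio(tokens)                                        # (1, seq_len, num_tags)
    emissions = emissions.squeeze(0).unsqueeze(1)                  # (seq_len, 1, num_tags)
    assert len(crf.decode(emissions)[0]) == seq_len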

if __name__ == "__main__":
    BIO_LABELS = ['I-MISC', 'I-LOC', 'I-ORG', 'I-PER',
                  'B-MISC', 'B-LOC', 'B-ORG', 'B-PER', 'O']
    BATCH = 1
    EPOCHS = 5
    BIO_TAGS_AMOUNT = 9

    # set device
    use_cuda = torch.cuda.is_available()
    print("use CUDA: ", use_cuda)
    device = torch.device("cuda" if use_cuda else "cpu")

    # loading and preparing data
    print("Loading data...")
    X, Y = load_train_data()
    vocab = build_vocab(X)
    vocab_BIO = build_vocab_BIO(Y)
    data_tokens = data_process(X, vocab)
    labels_tokens = data_process(Y, vocab_BIO)

    # train model
    print("Training model...")
    bio_model = GRU(len(vocab.itos), BIO_TAGS_AMOUNT).to(device)
    crf = CRF(BIO_TAGS_AMOUNT).to(device)
    params = list(bio_model.parameters()) + list(crf.parameters())
    optimizer = torch.optim.Adam(params)
    train_model(bio_model, crf, optimizer, device, data_tokens, labels_tokens, EPOCHS)

    # evaluate on the dev set
    print("Evaluate model...")
    X_dev, Y_dev_exp = load_dev_data()
    data_tokens_dev = data_process(X_dev, vocab)
    labels_tokens_dev = data_process(Y_dev_exp, vocab_BIO)
    Y_pred = eval_model(data_tokens_dev, labels_tokens_dev, bio_model, crf,
                        device, vocab_BIO, "dev-0/out.tsv")

    # predict on the test set
    X_test = load_test_data()
    data_tokens_test = data_process(X_test, vocab)
    test_model(data_tokens_test, bio_model, crf, device, vocab_BIO, "test-A/out.tsv")
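    # Optional extension (not part of the original script): persist the trained
    # weights so inference can be rerun without retraining. The file names
    # below are placeholders.
    torch.save(bio_model.state_dict(), "bio_model.pt")
    torch.save(crf.state_dict(), "crf.pt")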