# imports
import csv
from collections import Counter

import pandas as pd
import torch
from torchtext.vocab import Vocab


class NERModel(torch.nn.Module):
    """Classifies the middle token of a 3-token window into one of 9 NER tags."""

    def __init__(self):
        super(NERModel, self).__init__()
        # NOTE: 23627 is the hard-coded training vocabulary size;
        # len(vocab) would be a safer choice if the data changes.
        self.emb = torch.nn.Embedding(23627, 200)
        self.fc1 = torch.nn.Linear(600, 9)  # 3 tokens * 200 dims -> 9 labels

    def forward(self, x):
        x = self.emb(x)
        x = x.reshape(600)  # flatten the 3 embedded tokens into one vector
        x = self.fc1(x)
        return x


class NeuralNetworkModel(torch.nn.Module):
    def __init__(self, output_size):
        super(NeuralNetworkModel, self).__init__()
        self.fc1 = torch.nn.Linear(10_000, output_size)
        self.softmax = torch.nn.Softmax(dim=0)

    def forward(self, x):
        x = self.fc1(x)
        x = self.softmax(x)
        return x


def build_vocab(dataset):
    counter = Counter()
    for document in dataset:
        counter.update(document)
    # Legacy torchtext (< 0.10) Vocab; the specials are the usual
    # unknown/padding/beginning/end-of-sequence markers.
    return Vocab(counter, specials=['<unk>', '<pad>', '<bos>', '<eos>'])


def data_process(dt):
    # Wrap every document in <bos>/<eos> markers and map tokens to ids.
    return [
        torch.tensor(
            [vocab['<bos>']] + [vocab[token] for token in document] + [vocab['<eos>']],
            dtype=torch.long,
        )
        for document in dt
    ]


def labels_process(dt):
    # Pad the label sequence with the 'O' tag (index 0) to match <bos>/<eos>.
    return [torch.tensor([0] + document + [0], dtype=torch.long) for document in dt]


LABELS = ['O', 'B-LOC', 'I-LOC', 'B-MISC', 'I-MISC', 'B-ORG', 'I-ORG', 'B-PER', 'I-PER']

# error_bad_lines was removed in pandas 2.0; use on_bad_lines='skip' there.
train = pd.read_csv('./train/train.tsv.xz', error_bad_lines=False, compression='xz',
                    sep='\t', header=None, quoting=csv.QUOTE_NONE)
dev = pd.read_csv('./dev-0/in.tsv', error_bad_lines=False, sep='\t',
                  header=None, quoting=csv.QUOTE_NONE)
test = pd.read_csv('./test-A/in.tsv', error_bad_lines=False, sep='\t',
                   header=None, quoting=csv.QUOTE_NONE)

tags = train[0].apply(lambda x: [LABELS.index(y) for y in x.split()])
tokens = train[1].apply(lambda x: x.split())
dev_tokens = dev[0].apply(lambda x: x.split())
test_tokens = test[0].apply(lambda x: x.split())  # was dev[0], a copy-paste bug

vocab = build_vocab(tokens)

train_labels = labels_process(tags)
train_tokens_ids = data_process(tokens)

print(train_labels[0][:10])
print(train_tokens_ids[0][:10])

ner_model = NERModel()
nn_model = NeuralNetworkModel(len(train_tokens_ids))  # built but unused below

criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(ner_model.parameters())

for epoch in range(2):
    loss_score = 0
    acc_score = 0
    prec_score = 0
    selected_items = 0
    recall_score = 0
    relevant_items = 0
    items_total = 0
    ner_model.train()  # was nn_model.train(); ner_model is the one being optimized
    for i in range(100):
        for j in range(1, len(train_labels[i]) - 1):
            X = train_tokens_ids[i][j - 1:j + 2]  # 3-token window around position j
            Y = train_labels[i][j:j + 1]          # gold tag of the middle token

            Y_predictions = ner_model(X)

            acc_score += int(torch.argmax(Y_predictions) == Y)

            # Precision counts: predicted non-'O' tags, and how many were right.
            if torch.argmax(Y_predictions) != 0:
                selected_items += 1
            if torch.argmax(Y_predictions) != 0 and torch.argmax(Y_predictions) == Y.item():
                prec_score += 1

            # Recall counts: gold non-'O' tags, and how many were recovered.
            if Y.item() != 0:
                relevant_items += 1
            if Y.item() != 0 and torch.argmax(Y_predictions) == Y.item():
                recall_score += 1

            items_total += 1

            optimizer.zero_grad()
            loss = criterion(Y_predictions.unsqueeze(0), Y)
            loss.backward()
            optimizer.step()

            loss_score += loss.item()

    # Guard against division by zero when nothing non-'O' was predicted/present.
    precision = prec_score / selected_items if selected_items else 0.0
    recall = recall_score / relevant_items if relevant_items else 0.0
    f1_score = (2 * precision * recall) / (precision + recall) if (precision + recall) else 0.0

    print('epoch: ', epoch)
    print('loss: ', loss_score / items_total)
    print('acc: ', acc_score / items_total)
    print('prec: ', precision)
    print('recall: ', recall)
    print('f1: ', f1_score)
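

# --- Hypothetical inference sketch (not part of the original script) ---
# The dev/test sets are loaded above but never tagged. A minimal sketch of how
# the trained window classifier could label them, assuming the same
# 3-token-window setup as in training; predict_document and the output path
# './dev-0/out.tsv' are illustrative names, not from the original.
def predict_document(model, token_ids):
    """Tag every real token, skipping the <bos>/<eos> padding positions."""
    model.eval()
    predicted = []
    with torch.no_grad():
        for j in range(1, len(token_ids) - 1):
            window = token_ids[j - 1:j + 2]
            predicted.append(LABELS[torch.argmax(model(window)).item()])
    return predicted


# Out-of-vocabulary dev tokens fall back to <unk> via the legacy Vocab lookup.
dev_tokens_ids = data_process(dev_tokens)
with open('./dev-0/out.tsv', 'w') as f:  # hypothetical output path
    for doc in dev_tokens_ids:
        f.write(' '.join(predict_document(ner_model, doc)) + '\n')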