import torch
import pandas as pd
from torchtext.vocab import Vocab
from collections import Counter

# Load the data; each row holds a space-separated label sequence
# (train only) and a space-separated token sequence.
x_train = pd.read_table('train/train.tsv', sep='\t', header=None)
x_dev = pd.read_table('dev-0/in.tsv', sep='\t', header=None)
y_dev = pd.read_table('dev-0/expected.tsv', sep='\t', header=None)
x_test = pd.read_table('test-A/in.tsv', sep='\t', header=None)

label_list = ['O', 'B-LOC', 'I-LOC', 'B-MISC', 'I-MISC',
              'B-ORG', 'I-ORG', 'B-PER', 'I-PER']

# Map label strings to integer ids and tokenize the documents.
x_train[0] = x_train[0].apply(lambda x: [label_list.index(i) for i in x.split()])
x_train[1] = x_train[1].apply(lambda x: x.split())
x_dev[0] = x_dev[0].apply(lambda x: x.split())
x_test[0] = x_test[0].apply(lambda x: x.split())


def build_vocab(dataset):
    counter = Counter()
    for document in dataset:
        counter.update(document)
    return Vocab(counter, specials=['<unk>', '<pad>', '<bos>', '<eos>'])


vocab = build_vocab(x_train[1])


def data_process(dt):
    # Wrap every document in <bos>/<eos> markers so that a 3-token
    # window exists around the first and last real token.
    return [torch.tensor([vocab['<bos>']]
                         + [vocab[token] for token in document]
                         + [vocab['<eos>']],
                         dtype=torch.long)
            for document in dt]


def labels_process(dt):
    # Pad label sequences with the 'O' id (0) to stay aligned with
    # the <bos>/<eos> markers added in data_process.
    return [torch.tensor([0] + document + [0], dtype=torch.long) for document in dt]


class NERModel(torch.nn.Module):
    def __init__(self):
        super(NERModel, self).__init__()
        # len(vocab) replaces the hardcoded vocabulary size (23627).
        self.emb = torch.nn.Embedding(len(vocab), 200)
        # The input is a window of 3 token embeddings (3 * 200 = 600).
        self.fc1 = torch.nn.Linear(600, 9)

    def forward(self, x):
        x = self.emb(x)
        x = x.reshape(600)  # flatten the 3 x 200 window into one vector
        x = self.fc1(x)
        return x


ner_model = NERModel()
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(ner_model.parameters())

train_labels = labels_process(x_train[0])
train_tokens_ids = data_process(x_train[1])

for epoch in range(2):
    loss_score = 0
    acc_score = 0
    prec_score = 0
    selected_items = 0
    recall_score = 0
    relevant_items = 0
    items_total = 0
    ner_model.train()
    for i in range(len(train_labels)):
        # Slide a 3-token window over the document; position j is the
        # token being classified, j-1 and j+1 are its context.
        for j in range(1, len(train_labels[i]) - 1):
            X = train_tokens_ids[i][j - 1: j + 2]
            Y = train_labels[i][j: j + 1]
            Y_predictions = ner_model(X)

            # Running accuracy / precision / recall counts; class 0
            # ('O') counts as neither selected nor relevant.
            acc_score += int(torch.argmax(Y_predictions) == Y)
            if torch.argmax(Y_predictions) != 0:
                selected_items += 1
                if torch.argmax(Y_predictions) == Y.item():
                    prec_score += 1
            if Y.item() != 0:
                relevant_items += 1
                if torch.argmax(Y_predictions) == Y.item():
                    recall_score += 1
            items_total += 1

            optimizer.zero_grad()
            loss = criterion(Y_predictions.unsqueeze(0), Y)
            loss.backward()
            optimizer.step()
            loss_score += loss.item()

    precision = prec_score / selected_items
    recall = recall_score / relevant_items
    f1_score = (2 * precision * recall) / (precision + recall)
    print('epoch: ', epoch)
    print('loss: ', loss_score / items_total)
    print('acc: ', acc_score / items_total)
    print('prec: ', precision)
    print('recall: ', recall)
    print('f1: ', f1_score)


def predict_labels(tokens_ids):
    # Apply the same 3-token sliding window at inference time and map
    # each argmax class id back to its label string.
    ner_model.eval()
    results = []
    with torch.no_grad():
        for document in tokens_ids:
            line = []
            for j in range(1, len(document) - 1):
                X = document[j - 1: j + 2]
                Y_predictions = ner_model(X)
                line.append(label_list[torch.argmax(Y_predictions).item()])
            results.append(line)
    return results


dev_results = predict_labels(data_process(x_dev[0]))
test_results = predict_labels(data_process(x_test[0]))

# Plain lists have no to_csv; join each prediction line back into a
# space-separated string and write it through a DataFrame.
pd.DataFrame([' '.join(line) for line in dev_results]).to_csv(
    'dev-0/out.tsv', sep='\t', index=False, header=False)
pd.DataFrame([' '.join(line) for line in test_results]).to_csv(
    'test-A/out.tsv', sep='\t', index=False, header=False)
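
# Optional sanity check (a sketch, not part of the original flow): y_dev is
# loaded above but never used. Assuming dev-0/expected.tsv holds
# space-separated gold labels in the same format as train.tsv, token-level
# dev accuracy can be estimated like this:
y_dev_labels = y_dev[0].apply(lambda x: x.split())
correct = 0
total = 0
for predicted, gold in zip(dev_results, y_dev_labels):
    for p, g in zip(predicted, gold):
        correct += int(p == g)
        total += 1
print('dev acc: ', correct / total)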