diff --git a/seq_lab.py b/seq_lab.py
new file mode 100644
index 0000000..cff21d9
--- /dev/null
+++ b/seq_lab.py
@@ -0,0 +1,121 @@
+# Sequence labelling: a window-based NER tagger trained on CoNLL-style TSV data.
+import csv
+from collections import Counter
+
+import pandas as pd
+import torch
+from torchtext.vocab import Vocab  # legacy torchtext Vocab (counter-based constructor)
+
+
+class NERModel(torch.nn.Module):
+    """Classifies the middle token of a 3-token window into one of 9 NER tags."""
+    def __init__(self, vocab_size):
+        super(NERModel, self).__init__()
+        self.emb = torch.nn.Embedding(vocab_size, 200)
+        self.fc1 = torch.nn.Linear(600, 9)
+
+    def forward(self, x):
+        x = self.emb(x)      # (3,) -> (3, 200)
+        x = x.reshape(600)   # flatten the three embeddings into one vector
+        x = self.fc1(x)      # (600,) -> (9,) tag scores
+        return x
+
+
+class NeuralNetworkModel(torch.nn.Module):
+    """Simple linear baseline; instantiated below but not used in the training loop."""
+    def __init__(self, output_size):
+        super(NeuralNetworkModel, self).__init__()
+        self.fc1 = torch.nn.Linear(10_000, output_size)
+        self.softmax = torch.nn.Softmax(dim=0)
+
+    def forward(self, x):
+        x = self.fc1(x)
+        x = self.softmax(x)
+        return x
+
+
+def build_vocab(dataset):
+    counter = Counter()
+    for document in dataset:
+        counter.update(document)
+    return Vocab(counter, specials=['<unk>', '<pad>', '<bos>', '<eos>'])
+
+
+def data_process(dt):
+    # Wrap every document in <bos>/<eos> markers and map tokens to vocabulary ids.
+    return [torch.tensor([vocab['<bos>']] + [vocab[token] for token in document] + [vocab['<eos>']],
+                         dtype=torch.long)
+            for document in dt]
+
+
+def labels_process(dt):
+    # Pad the label sequence with 'O' (index 0) to match the <bos>/<eos> markers.
+    return [torch.tensor([0] + document + [0], dtype=torch.long) for document in dt]
+
+
+LABELS = ['O', 'B-LOC', 'I-LOC', 'B-MISC', 'I-MISC', 'B-ORG', 'I-ORG', 'B-PER', 'I-PER']
+
+# error_bad_lines is removed in pandas >= 2.0 (use on_bad_lines='skip' there).
+train = pd.read_csv('./train/train.tsv.xz', error_bad_lines=False, compression='xz',
+                    sep='\t', header=None, quoting=csv.QUOTE_NONE)
+dev = pd.read_csv('./dev-0/in.tsv', error_bad_lines=False, sep='\t', header=None, quoting=csv.QUOTE_NONE)
+test = pd.read_csv('./test-A/in.tsv', error_bad_lines=False, sep='\t', header=None, quoting=csv.QUOTE_NONE)
+
+tags = train[0].apply(lambda x: [LABELS.index(y) for y in x.split()])
+tokens = train[1].apply(lambda x: x.split())
+dev_tokens = dev[0].apply(lambda x: x.split())
+test_tokens = test[0].apply(lambda x: x.split())
+
+vocab = build_vocab(tokens)
+train_labels = labels_process(tags)
+train_tokens_ids = data_process(tokens)
+
+ner_model = NERModel(len(vocab))  # embedding table sized to the vocabulary
+nn_model = NeuralNetworkModel(len(train_tokens_ids))  # defined but unused below
+criterion = torch.nn.CrossEntropyLoss()
+optimizer = torch.optim.Adam(ner_model.parameters())
+
+for epoch in range(2):
+    loss_score = 0
+    acc_score = 0
+    prec_score = 0
+    selected_items = 0
+    recall_score = 0
+    relevant_items = 0
+    items_total = 0
+    ner_model.train()
+    for i in range(100):  # train on the first 100 documents only
+        for j in range(1, len(train_labels[i]) - 1):
+            X = train_tokens_ids[i][j - 1: j + 2]  # 3-token window around position j
+            Y = train_labels[i][j: j + 1]          # gold tag of the middle token
+            Y_predictions = ner_model(X)
+            acc_score += int(torch.argmax(Y_predictions) == Y)
+
+            # Precision counts: non-'O' predictions, and how many of them are correct.
+            if torch.argmax(Y_predictions) != 0:
+                selected_items += 1
+                if torch.argmax(Y_predictions) == Y.item():
+                    prec_score += 1
+
+            # Recall counts: non-'O' gold tags, and how many of them were found.
+            if Y.item() != 0:
+                relevant_items += 1
+                if torch.argmax(Y_predictions) == Y.item():
+                    recall_score += 1
+            items_total += 1
+
+            optimizer.zero_grad()
+            loss = criterion(Y_predictions.unsqueeze(0), Y)
+            loss.backward()
+            optimizer.step()
+            loss_score += loss.item()
+
+    precision = prec_score / selected_items
+    recall = recall_score / relevant_items
+    f1_score = (2 * precision * recall) / (precision + recall)
+    print('epoch: ', epoch)
+    print('loss: ', loss_score / items_total)
+    print('acc: ', acc_score / items_total)
+    print('prec: ', precision)
+    print('recall: ', recall)
+    print('f1: ', f1_score)
\ No newline at end of file
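
The script loads the dev-0 and test-A inputs but never writes predictions for them. A minimal inference sketch is below, reusing the script's own 3-token window scheme; the helper name predict_document and the out.tsv output paths are illustrative assumptions, not part of the original file. Out-of-vocabulary tokens fall back to the legacy Vocab's default index for <unk>, so unseen dev/test words are handled.

def predict_document(model, token_ids):
    # Predict one tag per real token, skipping the <bos>/<eos> positions.
    model.eval()
    tags = []
    with torch.no_grad():
        for j in range(1, len(token_ids) - 1):
            scores = model(token_ids[j - 1: j + 2])  # same 3-token window as in training
            tags.append(LABELS[torch.argmax(scores).item()])
    return tags

dev_tokens_ids = data_process(dev_tokens)
test_tokens_ids = data_process(test_tokens)

with open('./dev-0/out.tsv', 'w') as f:  # assumed output location
    for ids in dev_tokens_ids:
        f.write(' '.join(predict_document(ner_model, ids)) + '\n')

with open('./test-A/out.tsv', 'w') as f:  # assumed output location
    for ids in test_tokens_ids:
        f.write(' '.join(predict_document(ner_model, ids)) + '\n')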