implement data reading and basic Neural Network

2021-06-08 21:31:19 +02:00 · 2021-06-08 21:31:19 +02:00 · ff95d0bcc7
commit ff95d0bcc7
parent a6bbd87b3b
1 changed files with 102 additions and 0 deletions
--- a/seq_lab.py
+++ b/seq_lab.py
@ -0,0 +1,102 @@
+# imports
+import torch
+import pandas as pd
+import csv
+from torchtext.vocab import Vocab
+from collections import Counter
+
+class NERModel(torch.nn.Module):
+    """Window-based token classifier.
+
+    Embeds a fixed-size window of token ids, concatenates the embeddings
+    into one flat vector and maps it to per-label scores with a single
+    linear layer.
+
+    The defaults reproduce the previously hard-coded configuration
+    (23627-row embedding table, 200-dim embeddings, 3-token window,
+    9 output labels), so ``NERModel()`` builds the same network as before.
+
+    Args:
+        vocab_size: number of rows in the embedding table.
+        embedding_dim: size of each token embedding.
+        window_size: number of token ids passed to one ``forward`` call.
+        num_labels: number of output scores.
+    """
+
+    def __init__(self, vocab_size=23627, embedding_dim=200, window_size=3,
+                 num_labels=9):
+        super(NERModel, self).__init__()
+        # Width of the flattened window; also the linear layer's input size.
+        self.in_features = window_size * embedding_dim
+        self.emb = torch.nn.Embedding(vocab_size, embedding_dim)
+        self.fc1 = torch.nn.Linear(self.in_features, num_labels)
+
+    def forward(self, x):
+        """Return unnormalized label scores of shape ``(num_labels,)``.
+
+        Args:
+            x: long tensor holding ``window_size`` token ids.
+        """
+        x = self.emb(x)
+        # Concatenate the per-token embeddings into one feature vector.
+        x = x.reshape(self.in_features)
+        x = self.fc1(x)
+        return x
+
+class NeuralNetworkModel(torch.nn.Module):
+    """One linear layer over 10,000 input features followed by a softmax
+    taken along dimension 0 of the layer's output.
+
+    Args:
+        output_size: number of output units of the linear layer.
+    """
+
+    def __init__(self, output_size):
+        super().__init__()
+        self.fc1 = torch.nn.Linear(in_features=10_000, out_features=output_size)
+        self.softmax = torch.nn.Softmax(dim=0)
+
+    def forward(self, x):
+        """Apply the linear layer to ``x`` and normalize with the softmax."""
+        return self.softmax(self.fc1(x))
+
+def build_vocab(dataset):
+    """Count every token across the documents in ``dataset`` and wrap the
+    counts in a torchtext ``Vocab`` with four special symbols.
+
+    Args:
+        dataset: iterable of documents, each an iterable of tokens.
+    """
+    special_symbols = ['<unk>', '<pad>', '<bos>', '<eos>']
+    token_counts = Counter()
+    for doc in dataset:
+        token_counts.update(doc)
+    return Vocab(token_counts, specials=special_symbols)
+
+def data_process(dt):
+    """Turn each document in ``dt`` into a long tensor of vocabulary ids,
+    framed by the ``<bos>`` id at the start and the ``<eos>`` id at the end.
+
+    Looks tokens up in the module-level ``vocab``.
+    """
+    encoded = []
+    for document in dt:
+        ids = [vocab['<bos>']]
+        ids.extend(vocab[token] for token in document)
+        ids.append(vocab['<eos>'])
+        encoded.append(torch.tensor(ids, dtype=torch.long))
+    return encoded
+
+def labels_process(dt):
+    """Frame every label list in ``dt`` with a leading and a trailing 0 and
+    return the results as a list of long tensors.
+
+    Args:
+        dt: iterable of label-id lists (one list per document).
+    """
+    framed = []
+    for document in dt:
+        framed.append(torch.tensor([0] + document + [0], dtype=torch.long))
+    return framed
+
+
+# Tag inventory; a tag's position in this list is used as its class id.
+LABELS = ['O','B-LOC', 'I-LOC','B-MISC', 'I-MISC', 'B-ORG', 'I-ORG', 'B-PER', 'I-PER']
+# NOTE(review): `error_bad_lines` was deprecated in pandas 1.3 in favour of
+# `on_bad_lines='skip'`; kept here so the script still runs on older pandas.
+train = pd.read_csv("./train/train.tsv.xz", error_bad_lines=False, compression='xz', sep='\t', header=None, quoting=csv.QUOTE_NONE)
+dev = pd.read_csv('./dev-0/in.tsv', error_bad_lines=False, sep='\t', header=None, quoting=csv.QUOTE_NONE)
+test = pd.read_csv('./test-A/in.tsv', error_bad_lines=False, sep='\t', header=None, quoting=csv.QUOTE_NONE)
+
+# Training file: column 0 holds whitespace-separated tags, column 1 the tokens.
+tags = train[0].apply(lambda x: [LABELS.index(y) for y in x.split()])
+tokens = train[1].apply(lambda x: x.split())
+dev_tokens = dev[0].apply(lambda x: x.split())
+# Fixed: this previously tokenized `dev[0]` again, so the test split loaded
+# above was never used.
+test_tokens = test[0].apply(lambda x: x.split())
+
+# The vocabulary is built from the training tokens only.
+vocab = build_vocab(tokens)
+train_labels = labels_process(tags)
+train_tokens_ids = data_process(tokens)
+
+ner_model = NERModel()
+# Output size equals the number of training documents; this model's
+# parameters are not handed to the optimizer below.
+nn_model = NeuralNetworkModel(len(train_tokens_ids))
+criterion = torch.nn.CrossEntropyLoss()
+optimizer = torch.optim.Adam(ner_model.parameters())
+
+# Train `ner_model` for two epochs on (at most) the first 100 documents,
+# one token at a time, and report loss / accuracy / precision / recall / F1
+# over the non-'O' labels (class id 0 is 'O' in LABELS).
+for epoch in range(2):
+    loss_score = 0
+    acc_score = 0
+    prec_score = 0        # correct predictions among non-'O' predictions
+    selected_items = 0    # tokens predicted as something other than 'O'
+    recall_score = 0      # correct predictions among non-'O' gold labels
+    relevant_items = 0    # tokens whose gold label is not 'O'
+    items_total = 0
+    # Fixed: put the model that is actually optimized into training mode
+    # (the original switched the unrelated `nn_model`).
+    ner_model.train()
+    # Fixed: do not index past the end when fewer than 100 documents exist.
+    for i in range(min(100, len(train_labels))):
+        # Positions 0 and -1 are the framing <bos>/<eos> slots; skip them.
+        for j in range(1, len(train_labels[i]) - 1):
+            # Three-id window: previous, current and next token.
+            X = train_tokens_ids[i][j-1: j+2]
+            Y = train_labels[i][j: j+1]
+            Y_predictions = ner_model(X)
+
+            # Compute the predicted and gold class once per step.
+            predicted = torch.argmax(Y_predictions).item()
+            gold = Y.item()
+            correct = int(predicted == gold)
+            acc_score += correct
+
+            if predicted != 0:
+                selected_items += 1
+                prec_score += correct
+
+            if gold != 0:
+                relevant_items += 1
+                recall_score += correct
+            items_total += 1
+
+            optimizer.zero_grad()
+            # CrossEntropyLoss expects a batch dimension on the scores.
+            loss = criterion(Y_predictions.unsqueeze(0), Y)
+            loss.backward()
+            optimizer.step()
+            loss_score += loss.item()
+
+    # Fixed: fall back to 0.0 instead of raising ZeroDivisionError when a
+    # denominator is empty (e.g. the model predicted 'O' everywhere).
+    precision = prec_score / selected_items if selected_items else 0.0
+    recall = recall_score / relevant_items if relevant_items else 0.0
+    if precision + recall:
+        f1_score = (2 * precision * recall) / (precision + recall)
+    else:
+        f1_score = 0.0
+    seen = max(items_total, 1)
+    print('epoch: ', epoch)
+    print('loss: ', loss_score / seen)
+    print('acc: ', acc_score / seen)
+    print('prec: ', precision)
+    print('recall: : ', recall)
+    print('f1: ', f1_score)