en-ner-conll-2003/main_updated.ipynb

24 KiB

import numpy as np
import gensim
import torch
import pandas as pd

from torchtext.vocab import Vocab
from collections import Counter

import lzma
import re
import itertools
class NeuralNetworkModel(torch.nn.Module):
    """Single-layer softmax classifier over a fixed 10,000-dim input vector.

    FIX: the original ignored ``output_size`` and sized the layer from the
    module-global ``train_tokens_ids``; the only caller passes
    ``len(train_tokens_ids)`` anyway, so using the parameter is
    behavior-compatible and removes the hidden global dependency.
    """

    def __init__(self, output_size):
        super().__init__()
        # 10_000 input features are fixed by the (unseen) feature extractor.
        self.fc1 = torch.nn.Linear(10_000, output_size)
        # dim=0: inputs are single unbatched vectors.
        self.softmax = torch.nn.Softmax(dim=0)

    def forward(self, x):
        x = self.fc1(x)
        return self.softmax(x)
class NERModel(torch.nn.Module):
    """Window-based NER tagger: embed a window of token ids, classify the
    center token's tag.

    Defaults preserve the original hard-coded sizes (vocab 23 627, embedding
    dim 200, 3-token window, 9 tag classes), so ``NERModel()`` behaves
    exactly as before while the sizes are now configurable.
    """

    def __init__(self, vocab_size=23627, embedding_dim=200, window=3, num_tags=9):
        super().__init__()
        self.emb = torch.nn.Embedding(vocab_size, embedding_dim)
        self.fc1 = torch.nn.Linear(window * embedding_dim, num_tags)

    def forward(self, x):
        # x: LongTensor of `window` token ids -> flat (window*dim,) vector.
        x = self.emb(x)
        x = x.reshape(-1)  # was reshape(600); -1 works for any window/dim
        return self.fc1(x)
def get_dataset(path):
    """Read an xz-compressed TSV file and return its rows.

    Each row is the list of tab-separated fields of one line; the empty
    trailing entry produced by the file's final newline is dropped.

    FIX: the original leaked the file handle (``lzma.open(...).read()`` with
    no close) — use a context manager instead.
    """
    with lzma.open(path, 'rt', encoding='UTF-8') as f:
        data = f.read().split('\n')
    return [line.split('\t') for line in data][:-1]

# Load the training corpus: column 0 holds space-separated NER tags,
# column 1 the matching space-separated tokens.
train_data = get_dataset('train.tsv.xz')

ner_tags = [row[0].split() for row in train_data]
tokens = [row[1].split() for row in train_data]

# Assign every distinct tag string an integer id.
ner_tags_set = list(set(itertools.chain(*ner_tags)))
ner_tags_dictionary = {tag: idx for idx, tag in enumerate(ner_tags_set)}

# Replace tag strings with their integer ids, sentence by sentence.
ner_tags = [[ner_tags_dictionary[tag] for tag in sentence]
            for sentence in ner_tags]
def data_preprocessing(data):
    """Numericalize each token document via the module-global ``vocab``,
    wrapping it in <bos>/<eos> markers, as a LongTensor."""
    processed = []
    for document in data:
        ids = [vocab['<bos>']]
        ids.extend(vocab[token] for token in document)
        ids.append(vocab['<eos>'])
        processed.append(torch.tensor(ids, dtype=torch.long))
    return processed

def labels_preprocessing(data):
    """Pad every label sequence with a 0 on each side (matching the
    <bos>/<eos> token padding) and convert it to a LongTensor."""
    padded = []
    for document in data:
        labels = [0, *document, 0]
        padded.append(torch.tensor(labels, dtype=torch.long))
    return padded

def build_vocab(dataset):
    """Count token frequencies over all documents and wrap the counts in a
    (legacy) torchtext ``Vocab``."""
    counter = Counter(itertools.chain.from_iterable(dataset))
    return Vocab(counter)


# Build the vocabulary from the training tokens, then numericalize
# tokens and labels.
vocab = build_vocab(tokens)
train_tokens_ids = data_preprocessing(tokens)
train_labels = labels_preprocessing(ner_tags)
# NOTE(review): nn_model is instantiated but never optimized below — only
# its .train()/.eval() mode flags are toggled.
nn_model = NeuralNetworkModel(len(train_tokens_ids))
# Leftover notebook-cell inspection of a 3-token window.
train_tokens_ids[0][1:4]

ner_model = NERModel()
# Smoke-test the NER model on a single 3-token window.
ner_model(train_tokens_ids[0][1:4])

criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(ner_model.parameters())

# Train the NER model for 2 epochs on the first 100 sentences, sliding a
# 3-token window over each sentence and predicting the center token's tag.
for epoch in range(2):
    loss_score = 0
    acc_score = 0
    prec_score = 0
    selected_items = 0
    recall_score = 0
    relevant_items = 0
    items_total = 0
    # BUG FIX: the original called nn_model.train(); the model actually
    # being optimized here is ner_model.
    ner_model.train()
    for i in range(100):
        # Skip the <bos>/<eos> padding positions at each end.
        for j in range(1, len(train_labels[i]) - 1):

            X = train_tokens_ids[i][j-1: j+2]   # 3-token context window
            Y = train_labels[i][j: j+1]         # gold tag id of center token

            Y_predictions = ner_model(X)

            acc_score += int(torch.argmax(Y_predictions) == Y)

            # Class id 0 is treated as the negative class for P/R below.
            # NOTE(review): tag ids come from iterating a set, so id 0 is
            # not guaranteed to be the "O" tag — confirm before trusting
            # precision/recall/F1.
            if torch.argmax(Y_predictions) != 0:
                selected_items += 1
            if torch.argmax(Y_predictions) != 0 and torch.argmax(Y_predictions) == Y.item():
                prec_score += 1

            if Y.item() != 0:
                relevant_items += 1
            if Y.item() != 0 and torch.argmax(Y_predictions) == Y.item():
                recall_score += 1

            items_total += 1

            optimizer.zero_grad()
            loss = criterion(Y_predictions.unsqueeze(0), Y)
            loss.backward()
            optimizer.step()

            loss_score += loss.item()

    # FIX: guard against division by zero when no non-zero tag was
    # predicted (or present) in the slice being trained on.
    precision = prec_score / selected_items if selected_items else 0.0
    recall = recall_score / relevant_items if relevant_items else 0.0
    f1_score = (2 * precision * recall) / (precision + recall) if precision + recall else 0.0
    display('epoch: ', epoch)
    display('loss: ', loss_score / items_total)
    display('acc: ', acc_score / items_total)
    display('prec: ', precision)
    display('recall: : ', recall)
    display('f1: ', f1_score)
'epoch: '
0
'loss: '
0.5260595670613091
'acc: '
0.8669271431854746
'prec: '
0.868387037208014
'recall: : '
0.8694707649641784
'f1: '
0.868928563179897
'epoch: '
1
'loss: '
0.46469578519580995
'acc: '
0.8826936336474374
'prec: '
0.8842406612180819
'recall: : '
0.885139819736538
'f1: '
0.884690012011457
def process_output(lines):
    """Repair predicted IOB tag sequences into valid IOB2.

    An "I-" tag that starts an entity (first in the line, or preceded by
    "O") is rewritten to "B-"; an "I-" tag inside an entity inherits the
    entity type of the preceding tag. Returns one space-joined tag string
    per input line.

    FIX: removed the dead per-iteration ``x = " ".join(new_line)`` that made
    the inner loop quadratic without ever being used.
    """
    result = []
    for line in lines:
        last_label = None
        new_line = []
        for label in line:
            if label != "O" and label[0:2] == "I-":
                if last_label is None or last_label == "O":
                    label = label.replace('I-', 'B-')
                else:
                    label = "I-" + last_label[2:]
            last_label = label
            new_line.append(label)
        result.append(" ".join(new_line))
    return result
# Read the dev-0 input sentences, tokenized on whitespace.
with open('dev-0/in.tsv', "r", encoding="utf-8") as f:
    dev_0_data = [raw.rstrip().split() for raw in f]

# Read the matching gold tag lines.
with open('dev-0/expected.tsv', "r", encoding="utf-8") as f:
    dev_0_tags = [raw.rstrip().split() for raw in f]

# Convert gold tag strings to their integer ids.
dev_0_tags = [[ner_tags_dictionary[tag] for tag in sentence]
              for sentence in dev_0_tags]

test_tokens_ids = data_preprocessing(dev_0_data)
test_labels = labels_preprocessing(dev_0_tags)
# Evaluate on dev-0: accumulate loss/accuracy and precision/recall over the
# non-zero tag classes, and record the predicted tag id for every token.
result = []

loss_score = 0
acc_score = 0
prec_score = 0
selected_items = 0
recall_score = 0
relevant_items = 0
items_total = 0
# BUG FIX: the model being evaluated is ner_model, not nn_model.
ner_model.eval()

for i in range(len(test_tokens_ids)):
    result.append([])
    # Skip the <bos>/<eos> padding positions.
    for j in range(1, len(test_labels[i]) - 1):

        X = test_tokens_ids[i][j-1: j+2]
        Y = test_labels[i][j: j+1]

        Y_predictions = ner_model(X)

        acc_score += int(torch.argmax(Y_predictions) == Y)

        # Class id 0 is treated as the negative class for P/R below.
        # NOTE(review): tag ids come from iterating a set, so id 0 is not
        # guaranteed to be the "O" tag — confirm before trusting P/R/F1.
        if torch.argmax(Y_predictions) != 0:
            selected_items += 1
        if torch.argmax(Y_predictions) != 0 and torch.argmax(Y_predictions) == Y.item():
            prec_score += 1

        if Y.item() != 0:
            relevant_items += 1
        if Y.item() != 0 and torch.argmax(Y_predictions) == Y.item():
            recall_score += 1

        items_total += 1
        loss = criterion(Y_predictions.unsqueeze(0), Y)
        loss_score += loss.item()

        result[i].append(int(torch.argmax(Y_predictions)))

# FIX: guard against empty denominators.
precision = prec_score / selected_items if selected_items else 0.0
recall = recall_score / relevant_items if relevant_items else 0.0
f1_score = (2 * precision * recall) / (precision + recall) if precision + recall else 0.0
display('loss: ', loss_score / items_total)
display('acc: ', acc_score / items_total)
display('prec: ', precision)
display('recall: : ', recall)
display('f1: ', f1_score)
'loss: '
0.9558752998502382
'acc: '
0.8221458628103122
'prec: '
0.8261110597800979
'recall: : '
0.8253637102134259
'f1: '
0.8257372158959724
def save_to_file(path, results):
    """Write each entry of *results* to *path*, one line per entry.

    FIX: dropped the leftover per-line debug ``print`` and opened the file
    with an explicit UTF-8 encoding.
    """
    with open(path, "w", encoding="utf-8") as f:
        f.writelines(line + "\n" for line in results)
# Invert the tag dictionary: position k of `tmp` is the tag string with
# id k (dict insertion order matches the ids assigned earlier).
tmp = list(ner_tags_dictionary)

# Map predicted tag ids back to tag strings.
tags = [[tmp[idx] for idx in row] for row in result]

processed_tags = process_output(tags)

with open("dev-0/out.tsv", "w") as f:
    for line in processed_tags:
        f.write(line + "\n")

# Re-read the gold dev-0 tags (kept as raw tag strings this time).
with open('dev-0/expected.tsv', "r", encoding="utf-8") as f:
    dev_0_tags = [raw.rstrip() for raw in f]

dev_0_tags = [raw.split() for raw in dev_0_tags]
# Run inference on test-A and write IOB-repaired predictions.
with open('test-A/in.tsv', "r", encoding="utf-8") as file:
    test_data = [line.rstrip() for line in file]

test_data = [i.split() for i in test_data]
test_tokens_ids = data_preprocessing(test_data)
result = []

# Reset the shared metric accumulators (unused in this section, but kept so
# module-level state matches the original script).
loss_score = 0
acc_score = 0
prec_score = 0
selected_items = 0
recall_score = 0
relevant_items = 0
items_total = 0
# BUG FIX: put the model actually used for inference into eval mode
# (the original toggled the unused nn_model).
ner_model.eval()

for i in range(len(test_tokens_ids)):
    result.append([])
    # Skip the <bos>/<eos> padding positions.
    for j in range(1, len(test_tokens_ids[i]) - 1):
        X = test_tokens_ids[i][j-1: j+2]
        with torch.no_grad():  # inference only — no gradients needed
            Y_predictions = ner_model(X)
        result[i].append(int(torch.argmax(Y_predictions)))

# id -> tag string lookup (dict preserves the insertion order of the ids).
tmp = list(ner_tags_dictionary)

tags = [[tmp[idx] for idx in row] for row in result]

processed_tags = process_output(tags)

with open("test-A/out.tsv", "w") as f:
    for line in processed_tags:
        f.write(line + "\n")