en-ner-conll-2003/main.ipynb

import itertools
import lzma

import torch
from collections import Counter
from torchtext.vocab import Vocab  # legacy torchtext Vocab API (torchtext <= 0.9)
class NeuralNetworkModel(torch.nn.Module):
    # Simple dense baseline; it is instantiated below but never trained,
    # so all reported results come from NERModel.

    def __init__(self, output_size):
        super(NeuralNetworkModel, self).__init__()
        self.fc1 = torch.nn.Linear(10_000, output_size)
        self.softmax = torch.nn.Softmax(dim=0)

    def forward(self, x):
        x = self.fc1(x)
        x = self.softmax(x)
        return x
class NERModel(torch.nn.Module):
    # Classifies the middle token of a 3-token window.

    def __init__(self):
        super(NERModel, self).__init__()
        self.emb = torch.nn.Embedding(23627, 200)  # hard-coded vocabulary size
        self.fc1 = torch.nn.Linear(600, 9)         # 3 tokens x 200 dims -> 9 tag logits

    def forward(self, x):
        x = self.emb(x)
        x = x.reshape(600)  # flatten the (3, 200) window into a single vector
        x = self.fc1(x)
        return x
def get_dataset(path):
    # Each line of the xz-compressed TSV holds "tag sequence<TAB>token sequence";
    # the empty string after the final newline is dropped.
    data = lzma.open(path).read().decode('UTF-8').split('\n')
    return [line.split('\t') for line in data][:-1]

train_data = get_dataset('train.tsv.xz')

tokens = []
ner_tags = []

# Column 0 holds the space-separated NER tags, column 1 the tokens.
for line in train_data:
    ner_tags.append(line[0].split())
    tokens.append(line[1].split())
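
A quick sanity check that the two columns line up, i.e. every sentence carries exactly one tag per token:

# Every sentence must have one NER tag per token.
assert all(len(t) == len(n) for t, n in zip(tokens, ner_tags))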

# Build the tag->id mapping. Index 0 must be 'O', because the metric code below
# treats label 0 as "not an entity" and labels_preprocessing pads with 0;
# sorting with 'O' first also makes the mapping deterministic across runs.
ner_tags_set = sorted(set(itertools.chain(*ner_tags)), key=lambda tag: (tag != 'O', tag))

ner_tags_dictionary = {tag: i for i, tag in enumerate(ner_tags_set)}

for i in range(len(ner_tags)):
    ner_tags[i] = [ner_tags_dictionary[tag] for tag in ner_tags[i]]
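
For reference, the mapping can be inspected directly; with the sort above 'O' is guaranteed index 0, while the order of the remaining tags follows the sort key:

print(ner_tags_dictionary)
# e.g. {'O': 0, 'B-LOC': 1, 'B-MISC': 2, ...}  (illustrative ordering, not actual output)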

def data_preprocessing(data):
    # Map each token to its vocabulary id and frame the sentence with <bos>/<eos>.
    return [torch.tensor([vocab['<bos>']] + [vocab[token] for token in document] + [vocab['<eos>']],
                         dtype=torch.long)
            for document in data]

def labels_preprocessing(data):
    # Pad the tag sequence with 0 ('O') on both ends to align with <bos>/<eos>.
    return [torch.tensor([0] + document + [0], dtype=torch.long) for document in data]

def build_vocab(dataset):
    # Count token frequencies and build a legacy torchtext Vocab with the
    # special symbols used by data_preprocessing.
    counter = Counter()
    for document in dataset:
        counter.update(document)
    return Vocab(counter, specials=['<unk>', '<pad>', '<bos>', '<eos>'])
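
The rest of the notebook indexes this vocabulary directly via vocab[token]; with the legacy torchtext Vocab, unseen tokens resolve to the '<unk>' index instead of raising, which a quick check can confirm (the second token below is made up):

demo_vocab = build_vocab([['hello', 'world']])
# '<unk>' is the first entry in the specials list, so OOV tokens map to index 0.
print(demo_vocab['hello'], demo_vocab['token-never-seen'])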


vocab = build_vocab(tokens)
train_tokens_ids = data_preprocessing(tokens)
train_labels = labels_preprocessing(ner_tags)
nn_model = NeuralNetworkModel(len(train_tokens_ids))  # leftover baseline; never trained below
train_tokens_ids[0][1:4]

ner_model = NERModel()
ner_model(train_tokens_ids[0][1:4])  # forward pass on a single 3-token window
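
The call above should return one logit per tag; a minimal shape check under the layer sizes defined in NERModel:

logits = ner_model(train_tokens_ids[0][1:4])
assert logits.shape == (9,)  # 9 CoNLL-2003 tags: 'O' plus B-/I- for PER, ORG, LOC, MISC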

criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(ner_model.parameters())

for epoch in range(2):
    loss_score = 0
    acc_score = 0
    prec_score = 0
    selected_items = 0
    recall_score = 0
    relevant_items = 0
    items_total = 0
    ner_model.train()
    # Train on the first 100 sentences only, one 3-token window at a time.
    for i in range(100):
        for j in range(1, len(train_labels[i]) - 1):

            X = train_tokens_ids[i][j-1: j+2]
            Y = train_labels[i][j: j+1]

            Y_predictions = ner_model(X)
            
            
            acc_score += int(torch.argmax(Y_predictions) == Y)

            # Precision counts: windows predicted as an entity (label != 0, i.e. not 'O').
            if torch.argmax(Y_predictions) != 0:
                selected_items += 1
            if torch.argmax(Y_predictions) != 0 and torch.argmax(Y_predictions) == Y.item():
                prec_score += 1

            # Recall counts: gold entity tokens that were recovered.
            if Y.item() != 0:
                relevant_items += 1
            if Y.item() != 0 and torch.argmax(Y_predictions) == Y.item():
                recall_score += 1

            items_total += 1

            
            optimizer.zero_grad()
            loss = criterion(Y_predictions.unsqueeze(0), Y)
            loss.backward()
            optimizer.step()


            loss_score += loss.item() 
    
    precision = prec_score / selected_items
    recall = recall_score / relevant_items
    f1_score = (2*precision * recall) / (precision + recall)
    display('epoch: ', epoch)
    display('loss: ', loss_score / items_total)
    display('acc: ', acc_score / items_total)
    display('prec: ', precision)
    display('recall: ', recall)
    display('f1: ', f1_score)
'epoch: '
0
'loss: '
0.5382220030078203
'acc: '
0.8581935187313261
'prec: '
0.8677398098465594
'recall: '
0.8674948240165632
'f1: '
0.8676172996376301
'epoch: '
1
'loss: '
0.2793121223593968
'acc: '
0.9241553665823948
'prec: '
0.9306665413180408
'recall: '
0.9316299642386598
'f1: '
0.931148003574284
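
The precision/recall/F1 bookkeeping above is repeated verbatim for the dev set below; a small helper both blocks could share, as a sketch (the function name and the zero-division guards are my own):

def prf1(prec_score, selected_items, recall_score, relevant_items):
    # Precision over predicted entity tokens, recall over gold entity tokens,
    # guarding the empty-denominator edge cases.
    precision = prec_score / selected_items if selected_items else 0.0
    recall = recall_score / relevant_items if relevant_items else 0.0
    f1 = 2 * precision * recall / (precision + recall) if precision + recall else 0.0
    return precision, recall, f1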
with open('dev-0/in.tsv', "r", encoding="utf-8") as f:
    dev_0_data = [line.rstrip() for line in f]
    
dev_0_data = [i.split() for i in dev_0_data]

with open('dev-0/expected.tsv', "r", encoding="utf-8") as f:
    dev_0_tags = [line.rstrip() for line in f]
    
dev_0_tags = [i.split() for i in dev_0_tags]

# Convert the gold tag strings to the ids learned from the training set.
for i in range(len(dev_0_tags)):
    for j in range(len(dev_0_tags[i])):
        dev_0_tags[i][j] = ner_tags_dictionary[dev_0_tags[i][j]]
        
test_tokens_ids = data_preprocessing(dev_0_data)
test_labels = labels_preprocessing(dev_0_tags)
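
Since labels_preprocessing pads each tag sequence with 0 ('O') at both ends, every padded label tensor should match its padded token tensor in length; a quick alignment check:

assert all(len(x) == len(y) for x, y in zip(test_tokens_ids, test_labels))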
result = []

loss_score = 0
acc_score = 0
prec_score = 0
selected_items = 0
recall_score = 0
relevant_items = 0
items_total = 0
ner_model.eval()

for i in range(len(test_tokens_ids)):
    result.append([])
    for j in range(1, len(test_labels[i]) - 1):

        X = test_tokens_ids[i][j-1: j+2]
        Y = test_labels[i][j: j+1]

        Y_predictions = ner_model(X)


        acc_score += int(torch.argmax(Y_predictions) == Y)

        if torch.argmax(Y_predictions) != 0:
            selected_items +=1
        if  torch.argmax(Y_predictions) != 0 and torch.argmax(Y_predictions) == Y.item():
            prec_score += 1

        if  Y.item() != 0:
            relevant_items +=1
        if  Y.item() != 0 and torch.argmax(Y_predictions) == Y.item():
            recall_score += 1

        items_total += 1
        loss = criterion(Y_predictions.unsqueeze(0), Y)
        loss_score += loss.item() 
        
        result[i].append(int(torch.argmax(Y_predictions)))

precision = prec_score / selected_items
recall = recall_score / relevant_items
f1_score = (2*precision * recall) / (precision + recall)
display('loss: ', loss_score / items_total)
display('acc: ', acc_score / items_total)
display('prec: ', precision)
display('recall: ', recall)
display('f1: ', f1_score)
'loss: '
0.7380534848964866
'acc: '
0.846621708531633
'prec: '
0.8595547727017202
'recall: '
0.8640559071729957
'f1: '
0.8617994626787158
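
Both evaluation loops run the model with autograd enabled even though no parameters are updated; wrapping inference in torch.no_grad() skips the graph bookkeeping. A minimal sketch of the pattern on a single window:

# Inference without gradient tracking; same window slicing as the loops above.
with torch.no_grad():
    logits = ner_model(test_tokens_ids[0][0:3])
    predicted_tag_id = int(torch.argmax(logits))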
def save_file(path, data):
    # Write one space-separated tag sequence per line; 'w' rather than 'a'
    # so that re-running the notebook does not append duplicate predictions.
    with open(path, "w", encoding="utf-8") as f:
        for line in data:
            f.write(' '.join(line) + '\n')
# Invert the tag dictionary to map predicted ids back to tag strings.
id_to_tag = {v: k for k, v in ner_tags_dictionary.items()}

tags = [[id_to_tag[idx] for idx in sentence] for sentence in result]

save_file("dev-0/out.tsv", tags)

with open('dev-0/expected.tsv', "r", encoding="utf-8") as f:
    dev_0_tags = [line.rstrip() for line in f]

dev_0_tags = [i.split() for i in dev_0_tags]

# Token-level agreement between the written predictions and the gold tags.
t = 0
total = 0
for i in range(len(tags)):
    for j in range(len(tags[i])):
        total += 1
        if tags[i][j] == dev_0_tags[i][j]:
            t += 1
display('dev-0 token accuracy: ', t / total)
with open('test-A/in.tsv', "r", encoding="utf-8") as file:
    test_data = [line.rstrip() for line in file]
    
test_data = [i.split() for i in test_data]
test_tokens_ids = data_preprocessing(test_data)
result = []

ner_model.eval()  # test-A has no labels, so only predictions are collected

test_tokens_length = len(test_tokens_ids)

for i in range(test_tokens_length):
    result.append([])
    for j in range(1, len(test_tokens_ids[i]) - 1):
        X = test_tokens_ids[i][j-1: j + 2]
        Y_predictions = ner_model(X)
        result[i].append(int(torch.argmax(Y_predictions)))
# Map predicted ids back to tag strings with the inverse mapping built earlier.
tags = [[id_to_tag[idx] for idx in sentence] for sentence in result]

save_file("test-A/out.tsv", tags)