# forked from kubapok/en-ner-conll-2003
# 22 KiB
# 22 KiB
import numpy as np
import gensim
import torch
import pandas as pd
from torchtext.vocab import Vocab
from collections import Counter
import lzma
import re
import itertools
class NeuralNetworkModel(torch.nn.Module):
    """Single linear layer followed by a softmax.

    Args:
        output_size: Number of output units of the linear layer.
        input_size: Number of input features. Defaults to 10_000, the value
            that used to be hard-coded.
    """

    def __init__(self, output_size, input_size=10_000):
        super().__init__()
        # Size the layer from the constructor argument rather than from the
        # module-level ``train_tokens_ids`` list, so the class can be built
        # on its own and the ``output_size`` argument is actually honoured.
        self.fc1 = torch.nn.Linear(input_size, output_size)
        # dim=0 normalises over the first axis of the input, i.e. over the
        # output units when ``forward`` receives a single 1-D feature vector.
        self.softmax = torch.nn.Softmax(dim=0)

    def forward(self, x):
        """Apply the linear layer to ``x`` and softmax the result along dim 0."""
        return self.softmax(self.fc1(x))
class NERModel(torch.nn.Module):
    """Window-based tagger: embed a window of token ids, flatten, classify.

    Args:
        vocab_size: Number of rows in the embedding table.
        embedding_dim: Size of each token embedding.
        context_size: Number of token ids expected per call to ``forward``.
        num_tags: Number of output scores produced by the linear layer.

    The defaults (23627, 200, 3, 9) reproduce the sizes that used to be
    hard-coded, so ``NERModel()`` builds exactly the same architecture.
    """

    def __init__(self, vocab_size=23627, embedding_dim=200, context_size=3,
                 num_tags=9):
        super().__init__()
        # Length of the flattened window fed to the linear layer
        # (3 * 200 = 600 with the defaults).
        self.window_features = context_size * embedding_dim
        self.emb = torch.nn.Embedding(vocab_size, embedding_dim)
        self.fc1 = torch.nn.Linear(self.window_features, num_tags)

    def forward(self, x):
        """Return unnormalised tag scores for one window of token ids.

        ``x`` must hold exactly ``context_size`` ids; the reshape below
        raises otherwise.
        """
        embedded = self.emb(x)
        # Concatenate the per-token embeddings into one flat feature vector.
        flattened = embedded.reshape(self.window_features)
        return self.fc1(flattened)
def get_dataset(path):
    """Read an xz/LZMA-compressed, UTF-8 encoded TSV file.

    Args:
        path: Location of the compressed file.

    Returns:
        A list with one entry per line of the file; each entry is the list
        of tab-separated fields of that line.
    """
    # The context manager closes the decompressor even if reading fails.
    with lzma.open(path) as compressed:
        text = compressed.read().decode('UTF-8')
    lines = text.split('\n')
    # A file that ends with a newline yields one trailing empty string.
    # Drop only that artefact, so a file without a final newline keeps its
    # last row instead of silently losing it.
    if lines and lines[-1] == '':
        lines.pop()
    return [line.split('\t') for line in lines]
# Load the training split. Column 0 of each row holds the space-separated
# tags, column 1 the space-separated tokens.
train_data = get_dataset('train.tsv.xz')
tokens = [row[1].split() for row in train_data]
ner_tags = [row[0].split() for row in train_data]

# Build a reproducible tag -> id mapping. Iterating a bare set of strings
# gives an order that can change from one interpreter run to the next, so
# the ids (and any saved model) would not be stable. Sorting fixes the order.
# The rest of the file pads label sequences with 0 and counts only non-zero
# labels as selected/relevant items, so 'O' is placed first to receive id 0.
# NOTE(review): assumes 'O' is the "outside" tag of this tag set - confirm.
ner_tags_set = sorted(
    set(itertools.chain.from_iterable(ner_tags)),
    key=lambda tag: (tag != 'O', tag),
)
ner_tags_dictionary = {tag: index for index, tag in enumerate(ner_tags_set)}

# Replace every tag string with its integer id.
ner_tags = [
    [ner_tags_dictionary[tag] for tag in document_tags]
    for document_tags in ner_tags
]
def data_preprocessing(data):
    """Encode tokenised documents as tensors of vocabulary ids.

    Every document is wrapped in the ids of '<bos>' and '<eos>', all looked
    up in the module-level ``vocab``.

    Args:
        data: Iterable of documents, each an iterable of tokens.

    Returns:
        A list with one 1-D ``torch.long`` tensor per document.
    """
    encoded_documents = []
    for document in data:
        ids = [vocab['<bos>']]
        ids.extend(vocab[token] for token in document)
        ids.append(vocab['<eos>'])
        encoded_documents.append(torch.tensor(ids, dtype=torch.long))
    return encoded_documents
def labels_preprocessing(data):
    """Turn per-document label id lists into padded tensors.

    A 0 is placed before and after each document's labels, so every tensor
    is two elements longer than the corresponding input list.

    Args:
        data: Iterable of lists of integer label ids.

    Returns:
        A list with one 1-D ``torch.long`` tensor per document.
    """
    padded_documents = []
    for document in data:
        padded = [0] + document + [0]
        padded_documents.append(torch.tensor(padded, dtype=torch.long))
    return padded_documents
def build_vocab(dataset):
    """Count every token in ``dataset`` and wrap the counts in a ``Vocab``.

    Args:
        dataset: Iterable of documents, each an iterable of tokens.

    Returns:
        A ``Vocab`` built from the token frequencies, with the special
        tokens '<unk>', '<pad>', '<bos>' and '<eos>'.
    """
    special_tokens = ['<unk>', '<pad>', '<bos>', '<eos>']
    token_frequencies = Counter()
    for document_tokens in dataset:
        token_frequencies.update(document_tokens)
    return Vocab(token_frequencies, specials=special_tokens)
# Build the vocabulary from the training tokens and encode the training split.
vocab = build_vocab(tokens)
train_tokens_ids = data_preprocessing(tokens)
train_labels = labels_preprocessing(ner_tags)

# This model is not the one being optimised (the optimizer below is built
# from ner_model.parameters()); it is kept so that any code referring to
# ``nn_model`` keeps working.
nn_model = NeuralNetworkModel(len(train_tokens_ids))

ner_model = NERModel()
# Smoke test: push one three-token window through the untrained model so a
# shape mismatch fails here rather than in the middle of training. The output
# is discarded, so no autograd graph is needed.
with torch.no_grad():
    ner_model(train_tokens_ids[0][1:4])

criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(ner_model.parameters())
# Train ner_model for two epochs, one optimiser step per token, and report
# loss / accuracy / precision / recall / F1 over the tokens seen in the epoch.
for epoch in range(2):
    loss_score = 0
    acc_score = 0
    prec_score = 0
    selected_items = 0
    recall_score = 0
    relevant_items = 0
    items_total = 0

    # Put the model that is actually being optimised into training mode.
    ner_model.train()

    # Only the first 100 documents are used; the bound is clamped so a
    # smaller training set does not raise IndexError.
    for i in range(min(100, len(train_labels))):
        # Positions 0 and -1 are the padding added around each document, so
        # j walks over the real tokens only.
        for j in range(1, len(train_labels[i]) - 1):
            X = train_tokens_ids[i][j - 1: j + 2]  # previous, current, next id
            Y = train_labels[i][j: j + 1]          # label of the current token
            Y_predictions = ner_model(X)

            # Compute the arg-max once instead of once per comparison.
            predicted = int(torch.argmax(Y_predictions))
            expected = Y.item()
            is_correct = predicted == expected

            acc_score += int(is_correct)
            # Label 0 is treated as "nothing selected / nothing relevant".
            if predicted != 0:
                selected_items += 1
                if is_correct:
                    prec_score += 1
            if expected != 0:
                relevant_items += 1
                if is_correct:
                    recall_score += 1
            items_total += 1

            optimizer.zero_grad()
            loss = criterion(Y_predictions.unsqueeze(0), Y)
            loss.backward()
            optimizer.step()
            loss_score += loss.item()

    # Guard every ratio so an epoch with no predictions / no relevant labels
    # / no tokens reports 0.0 instead of raising ZeroDivisionError.
    precision = prec_score / selected_items if selected_items else 0.0
    recall = recall_score / relevant_items if relevant_items else 0.0
    if precision + recall:
        f1_score = (2 * precision * recall) / (precision + recall)
    else:
        f1_score = 0.0
    mean_loss = loss_score / items_total if items_total else 0.0
    accuracy = acc_score / items_total if items_total else 0.0

    # ``display`` is not defined in this file; presumably the IPython/Jupyter
    # builtin - NOTE(review): confirm before running as a plain script.
    display('epoch: ', epoch)
    display('loss: ', mean_loss)
    display('acc: ', accuracy)
    display('prec: ', precision)
    display('recall: : ', recall)
    display('f1: ', f1_score)
'epoch: '
0
'loss: '
0.5382220030078203
'acc: '
0.8581935187313261
'prec: '
0.8677398098465594
'recall: : '
0.8674948240165632
'f1: '
0.8676172996376301
'epoch: '
1
'loss: '
0.2793121223593968
'acc: '
0.9241553665823948
'prec: '
0.9306665413180408
'recall: : '
0.9316299642386598
'f1: '
0.931148003574284
# Evaluate ner_model on the dev-0 split and collect its predictions.
with open('dev-0/in.tsv', "r", encoding="utf-8") as f:
    dev_0_data = [line.rstrip().split() for line in f]

with open('dev-0/expected.tsv', "r", encoding="utf-8") as f:
    # Map every expected tag string to its integer id straight away.
    dev_0_tags = [
        [ner_tags_dictionary[tag] for tag in line.rstrip().split()]
        for line in f
    ]

test_tokens_ids = data_preprocessing(dev_0_data)
test_labels = labels_preprocessing(dev_0_tags)

result = []
loss_score = 0
acc_score = 0
prec_score = 0
selected_items = 0
recall_score = 0
relevant_items = 0
items_total = 0

# Switch the model that is actually evaluated to inference mode, and skip
# autograd bookkeeping: no backward pass happens here.
ner_model.eval()
with torch.no_grad():
    for i in range(len(test_tokens_ids)):
        result.append([])
        # Positions 0 and -1 are padding, so j covers the real tokens only.
        for j in range(1, len(test_labels[i]) - 1):
            X = test_tokens_ids[i][j - 1: j + 2]
            Y = test_labels[i][j: j + 1]
            Y_predictions = ner_model(X)

            # Compute the arg-max once instead of once per comparison.
            predicted = int(torch.argmax(Y_predictions))
            expected = Y.item()
            is_correct = predicted == expected

            acc_score += int(is_correct)
            # Label 0 is treated as "nothing selected / nothing relevant".
            if predicted != 0:
                selected_items += 1
                if is_correct:
                    prec_score += 1
            if expected != 0:
                relevant_items += 1
                if is_correct:
                    recall_score += 1
            items_total += 1

            loss = criterion(Y_predictions.unsqueeze(0), Y)
            loss_score += loss.item()
            result[i].append(predicted)

# Guard every ratio so an empty split or a model that never predicts a
# non-zero label reports 0.0 instead of raising ZeroDivisionError.
precision = prec_score / selected_items if selected_items else 0.0
recall = recall_score / relevant_items if relevant_items else 0.0
if precision + recall:
    f1_score = (2 * precision * recall) / (precision + recall)
else:
    f1_score = 0.0
mean_loss = loss_score / items_total if items_total else 0.0
accuracy = acc_score / items_total if items_total else 0.0

# ``display`` is not defined in this file; presumably the IPython/Jupyter
# builtin - NOTE(review): confirm before running as a plain script.
display('loss: ', mean_loss)
display('acc: ', accuracy)
display('prec: ', precision)
display('recall: : ', recall)
display('f1: ', f1_score)
'loss: '
0.7380534848964866
'acc: '
0.846621708531633
'prec: '
0.8595547727017202
'recall: : '
0.8640559071729957
'f1: '
0.8617994626787158
def save_file(path, data):
    """Append ``data`` to the text file at ``path``, one line per item.

    Args:
        path: Destination file; created if it does not exist.
        data: Iterable of sequences of strings. Each sequence is written as
            one line with its elements joined by single spaces.
    """
    # NOTE(review): mode "a" is kept for backward compatibility, which means
    # running the script twice appends a second copy of the predictions.
    # Switch to "w" if the file is meant to be overwritten.
    # The context manager closes the file even if a write fails, and the
    # explicit encoding matches the UTF-8 used when the inputs are read.
    with open(path, "a", encoding="utf-8") as output:
        output.writelines(' '.join(row) + '\n' for row in data)
# Turn the predicted label ids for dev-0 back into tag strings and save them.
# ``tmp`` lists the dictionary's keys in iteration order, so a predicted id is
# used directly as an index into it.
tmp = list(ner_tags_dictionary)
tags = [
    [tmp[label_id] for label_id in sentence_predictions]
    for sentence_predictions in result
]
save_file("dev-0/out.tsv", tags)
# Re-read the expected dev-0 tags as strings and count how many predicted
# tags match them position by position.
with open('dev-0/expected.tsv', "r", encoding="utf-8") as f:
    dev_0_tags = [line.rstrip().split() for line in f]

import math  # not used anywhere in the visible part of the file

# ``t`` ends up holding the number of positions where prediction == expected.
t = sum(
    1
    for row_index, predicted_row in enumerate(tags)
    for col_index, predicted_tag in enumerate(predicted_row)
    if predicted_tag == dev_0_tags[row_index][col_index]
)
# Predict tags for the unlabelled test-A split and write them to disk.
with open('test-A/in.tsv', "r", encoding="utf-8") as file:
    test_data = [line.rstrip().split() for line in file]
test_tokens_ids = data_preprocessing(test_data)

# The metric counters are reset here as before, although this section has no
# gold labels and never updates them.
loss_score = 0
acc_score = 0
prec_score = 0
selected_items = 0
recall_score = 0
relevant_items = 0
items_total = 0

# Switch the model that actually produces the predictions to inference mode,
# and skip autograd bookkeeping: nothing is back-propagated here.
ner_model.eval()
test_tokens_length = len(test_tokens_ids)
result = []
with torch.no_grad():
    for document_ids in test_tokens_ids:
        document_predictions = []
        # Positions 0 and -1 are the '<bos>'/'<eos>' ids added by
        # data_preprocessing, so j covers the real tokens only.
        for j in range(1, len(document_ids) - 1):
            X = document_ids[j - 1: j + 2]
            Y_predictions = ner_model(X)
            document_predictions.append(int(torch.argmax(Y_predictions)))
        result.append(document_predictions)

# ``tmp`` lists the dictionary's keys in iteration order, so a predicted id is
# used directly as an index into it.
tmp = list(ner_tags_dictionary)
result_length = len(result)
tags = [
    [tmp[label_id] for label_id in document_predictions]
    for document_predictions in result
]
save_file("test-A/out.tsv", tags)