forked from kubapok/en-ner-conll-2003
15 KiB
15 KiB
from collections import Counter
import pandas as pd
import numpy as np
import gensim
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
import matplotlib.pyplot as plt
from keras.regularizers import l2
from tqdm.notebook import tqdm
import torch
from torchtext.vocab import vocab
C:\Users\Szpil\anaconda3\envs\py310\lib\site-packages\torchtext\vocab\__init__.py:4: UserWarning: /!\ IMPORTANT WARNING ABOUT TORCHTEXT STATUS /!\ Torchtext is deprecated and the last released version will be 0.18 (this one). You can silence this warning by calling the following at the beginnign of your scripts: `import torchtext; torchtext.disable_torchtext_deprecation_warning()` warnings.warn(torchtext._TORCHTEXT_DEPRECATION_MSG) C:\Users\Szpil\anaconda3\envs\py310\lib\site-packages\torchtext\utils.py:4: UserWarning: /!\ IMPORTANT WARNING ABOUT TORCHTEXT STATUS /!\ Torchtext is deprecated and the last released version will be 0.18 (this one). You can silence this warning by calling the following at the beginnign of your scripts: `import torchtext; torchtext.disable_torchtext_deprecation_warning()` warnings.warn(torchtext._TORCHTEXT_DEPRECATION_MSG)
def build_vocab(dataset):
    """Create a torchtext ``Vocab`` covering every token in *dataset*.

    The four special markers are placed at indices 0-3, ahead of the
    corpus tokens (which follow in first-seen order).
    """
    token_counts = Counter(token for document in dataset for token in document)
    return vocab(token_counts, specials=["<unk>", "<pad>", "<bos>", "<eos>"])
def fit_data_Y(column):
    """Encode whitespace-separated NER label strings as long tensors.

    Each row is mapped through the module-level ``ner_dict`` and framed
    with a leading and trailing 0 ('O' tag) so the label sequence lines up
    with the <bos>/<eos> tokens added by ``fit_data_X``.
    """
    encoded = []
    for row in column:
        tag_ids = [ner_dict[tag] for tag in row.split()]
        encoded.append(torch.tensor([0] + tag_ids + [0], dtype=torch.long))
    return encoded
def fit_data_X(dt):
    """Encode raw sentences as index tensors framed by <bos>/<eos>.

    Tokens are looked up in the module-level vocabulary ``v``; tokens not
    in the vocabulary resolve to its default (<unk>) index.
    """
    bos_id = v["<bos>"]
    eos_id = v["<eos>"]
    tensors = []
    for sentence in dt:
        ids = [bos_id] + [v[token] for token in sentence.split()] + [eos_id]
        tensors.append(torch.tensor(ids, dtype=torch.long))
    return tensors
def predict(X):
    """Run the trained ``lstm`` model over a list of encoded sentences.

    Args:
        X: list of 1-D long tensors as produced by ``fit_data_X`` (each
           framed with <bos>/<eos> indices).

    Returns:
        list[str]: one space-joined tag string per input sentence; the
        predictions for the <bos>/<eos> frame positions are stripped.
    """
    Y_predicted = []
    # Inference only: disabling autograd avoids building gradient graphs,
    # saving memory and time (the original ran with autograd enabled).
    with torch.no_grad():
        for tokens in tqdm(X):
            batch_tokens = tokens.unsqueeze(0)  # add batch dimension
            Y_batch_pred_weights = lstm(batch_tokens).squeeze(0)
            Y_batch_pred = torch.argmax(Y_batch_pred_weights, 1)
            # [1:-1] drops the positions that correspond to <bos>/<eos>.
            Y_processed_pred = " ".join(reversed_ner_dict[item] for item in Y_batch_pred.numpy()[1:-1])
            Y_predicted.append(Y_processed_pred)
    return Y_predicted
def save_to_csv(filename, data):
    """Write *data* to *filename* as a headerless, tab-separated file."""
    frame = pd.DataFrame(data)
    frame.to_csv(filename, sep='\t', index=False, header=None)
Preparing training data
# Reading the train dataset
train_data = pd.read_csv('./train/train.tsv', sep='\t', usecols=[0, 1], header=None, names=['label', 'sentence'])
# Tokenize each sentence with gensim's simple_preprocess for vocab building.
train_X = train_data['sentence'].apply(lambda x: gensim.utils.simple_preprocess(x))
v = build_vocab(train_X)
itos = v.get_itos()
# Sanity check: specials first, then corpus tokens in first-seen order.
print(itos[0:25])
['<unk>', '<pad>', '<bos>', '<eos>', 'eu', 'rejects', 'german', 'call', 'to', 'boycott', 'british', 'lamb', 'peter', 'blackburn', 'brussels', 'the', 'european', 'commission', 'said', 'on', 'thursday', 'it', 'disagreed', 'with', 'advice']
# Out-of-vocabulary tokens resolve to the <unk> index.
v.set_default_index(v["<unk>"])
# Creating a mapping for label to index conversion
ner_dict = {'O': 0, 'B-PER': 1, 'I-PER': 2, 'B-ORG': 3, 'I-ORG': 4, 'B-LOC': 5, 'I-LOC': 6, 'B-MISC': 7, 'I-MISC': 8}
# Inverse mapping used to decode predicted indices back to tag strings.
reversed_ner_dict = {v: k for k, v in ner_dict.items()}
num_tags = len(ner_dict)
# NOTE(review): fit_data_X tokenizes with str.split on the raw sentence,
# while the vocab was built from gensim.simple_preprocess tokens — any
# casing/punctuation mismatch falls back to <unk>. Confirm this is intended.
train_X = fit_data_X(train_data['sentence'])
train_Y = fit_data_Y(train_data['label'])
Preparing dev data
# Dev sentences and their gold labels live in separate single-column TSVs.
dev_texts_data = pd.read_csv('./dev-0/in.tsv', sep='\t', usecols=[0], header=None, names=['sentence'])
dev_labels_data = pd.read_csv('./dev-0/expected.tsv', sep='\t', usecols=[0], header=None, names=['label'])
dev_X = fit_data_X(dev_texts_data['sentence'])
dev_Y = fit_data_Y(dev_labels_data['label'])
Preparing test data
# Test split has no labels; only the sentences are encoded.
test_texts_data = pd.read_csv('./test-A/in.tsv', sep='\t', usecols=[0], header=None, names=['sentence'])
test_X = fit_data_X(test_texts_data['sentence'])
Model implementation
class LSTM(torch.nn.Module):
    """Single-layer unidirectional LSTM tagger.

    Pipeline: embedding (dim 100) -> ReLU -> LSTM (hidden 256, 1 layer,
    batch_first) -> linear projection giving one score per NER tag at
    every token position.
    """

    def __init__(self):
        super().__init__()
        vocab_size = len(v.get_itos())
        self.emb = torch.nn.Embedding(vocab_size, 100)
        self.rec = torch.nn.LSTM(100, 256, 1, batch_first=True)
        self.fc1 = torch.nn.Linear(256, num_tags)

    def forward(self, x):
        # NOTE(review): ReLU applied to raw embeddings zeroes their
        # negative components — unusual, but preserved from the original.
        activated = torch.relu(self.emb(x))
        sequence_out, _ = self.rec(activated)
        return self.fc1(sequence_out)
lstm = LSTM()
# Standard multi-class setup: cross-entropy over per-token tag scores,
# Adam with its default learning rate.
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(lstm.parameters())
def get_accuracy(y_true, y_pred):
    """Return the fraction of positions where prediction matches ground truth.

    Args:
        y_true: sequence of gold labels.
        y_pred: sequence of predicted labels, paired positionally with
            ``y_true`` (extra elements of the longer sequence are ignored,
            matching ``zip`` semantics of the original).

    Returns:
        float accuracy in [0, 1]; 0.0 for empty input (the original raised
        ZeroDivisionError in that case).
    """
    pairs = list(zip(y_pred, y_true))
    if not pairs:
        # Guard: avoid division by zero on empty input.
        return 0.0
    hits = sum(1 for predicted, true in pairs if predicted == true)
    return hits / len(pairs)
def eval_model(dataset_tokens, dataset_labels, model):
    """Compute token-level accuracy of *model* on an encoded dataset.

    Args:
        dataset_tokens: list of 1-D long tensors (from ``fit_data_X``).
        dataset_labels: list of 1-D long tensors (from ``fit_data_Y``),
            positionally aligned with ``dataset_tokens``.
        model: callable mapping a (1, seq_len) token tensor to
            (1, seq_len, num_tags) scores.

    Returns:
        float accuracy over all token positions (via ``get_accuracy``).
    """
    Y_true = []
    Y_pred = []
    # Inference only: disable autograd so evaluation doesn't build
    # gradient graphs (the original ran with autograd enabled).
    with torch.no_grad():
        for i in tqdm(range(len(dataset_labels))):
            batch_tokens = dataset_tokens[i].unsqueeze(0)
            Y_true += list(dataset_labels[i].numpy())
            Y_batch_pred_weights = model(batch_tokens).squeeze(0)
            Y_batch_pred = torch.argmax(Y_batch_pred_weights, 1)
            Y_pred += list(Y_batch_pred.numpy())
    return get_accuracy(Y_true, Y_pred)
Model training
In testing, the model with this data preprocessing reached about 84% accuracy after 3 epochs and then plateaued, so training for more than 3 epochs is redundant.
# Train for 3 epochs (accuracy plateaus after that — see note above runs).
for epoch in range(3):
    lstm.train()
    # Batch size 1: each document is a single training example.
    for i in tqdm(range(len(train_Y))):
        batch_tokens = train_X[i].unsqueeze(0)
        tags = train_Y[i].unsqueeze(1)
        predicted_tags = lstm(batch_tokens)
        optimizer.zero_grad()
        # squeeze back to (seq_len, num_tags) vs (seq_len,) for CrossEntropyLoss.
        loss = criterion(predicted_tags.squeeze(0), tags.squeeze(1))
        loss.backward()
        optimizer.step()
    # Evaluate on the dev split after each epoch.
    lstm.eval()
    accuracy = eval_model(dev_X, dev_Y, lstm)
    print(f"Accuracy: {accuracy}")
0%| | 0/945 [00:00<?, ?it/s]
0%| | 0/215 [00:00<?, ?it/s]
Accuracy: 0.8424276676815372
0%| | 0/945 [00:00<?, ?it/s]
0%| | 0/215 [00:00<?, ?it/s]
Accuracy: 0.8431933784251883
0%| | 0/945 [00:00<?, ?it/s]
0%| | 0/215 [00:00<?, ?it/s]
Accuracy: 0.8448706495779476
Evaluation
# Write dev predictions for scoring against ./dev-0/expected.tsv.
dev_predicted = predict(dev_X)
save_to_csv('./dev-0/out.tsv', dev_predicted)
0%| | 0/215 [00:00<?, ?it/s]
# Write test-split predictions (no gold labels available locally).
test_predicted = predict(test_X)
save_to_csv('./test-A/out.tsv', test_predicted)
0%| | 0/230 [00:00<?, ?it/s]