### Podejście CRF z embeddingami na przykładzie NER

https://pytorch-crf.readthedocs.io/en/stable/

https://www.aclweb.org/anthology/W03-0419.pdf

In [1]:
import numpy as np
import gensim
import torch
import pandas as pd
import seaborn as sns
from sklearn.model_selection import train_test_split

from datasets import load_dataset
from torchtext.vocab import Vocab
from collections import Counter

from sklearn.datasets import fetch_20newsgroups
# https://scikit-learn.org/0.19/datasets/twenty_newsgroups.html

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score

from tqdm.notebook import tqdm

import torch
from torchcrf import CRF



In [2]:
# Load the CoNLL-2003 dataset; the 'train', 'validation' and 'test' splits are used below.
dataset = load_dataset("conll2003")

Reusing dataset conll2003 (/home/kuba/.cache/huggingface/datasets/conll2003/conll2003/1.0.0/40e7cb6bcc374f7c349c83acd1e9352a4f09474eb691f64f364ee62eb65d0ca6)


In [3]:
def build_vocab(dataset):
    """Build a token vocabulary from tokenised documents.

    Args:
        dataset: iterable of documents, each an iterable of token strings.

    Returns:
        A ``Vocab`` built from the token frequencies, with the special tokens
        '<unk>', '<pad>', '<bos>' and '<eos>' registered.
    """
    counter = Counter(token for document in dataset for token in document)
    # The specials must be four distinct tokens: a list of identical empty
    # strings leaves the vocabulary without an '<unk>' entry for unseen words
    # and without separate begin/end markers.
    # NOTE(review): this order is inferred from the encoded sample in the
    # notebook output, which starts with id 2 and ends with id 3 — confirm.
    return Vocab(counter, specials=['<unk>', '<pad>', '<bos>', '<eos>'])

In [4]:
# The vocabulary is built from the training split only.
vocab = build_vocab(dataset['train']['tokens'])

In [5]:
# Preview only the first few tokenised sentences instead of dumping the whole split.
dataset['train']['tokens'][:5]

[['EU', 'rejects', 'German', 'call', 'to', 'boycott', 'British', 'lamb', '.'],
 ['Peter', 'Blackburn'],
 ['BRUSSELS', '1996-08-22'],
 ['The',
 'European',
 'Commission',
 'said',
 'on',
 'Thursday',
 'it',
 'disagreed',
 'with',
 'German',
 'advice',
 'to',
 'consumers',
 'to',
 'shun',
 'British',
 'lamb',
 'until',
 'scientists',
 'determine',
 'whether',
 'mad',
 'cow',
 'disease',
 'can',
 'be',
 'transmitted',
 'to',
 'sheep',
 '.'],
 ['Germany',
 "'s",
 'representative',
 'to',
 'the',
 'European',
 'Union',
 "'s",
 'veterinary',
 'committee',
 'Werner',
 'Zwingmann',
 'said',
 'on',
 'Wednesday',
 'consumers',
 'should',
 'buy',
 'sheepmeat',
 'from',
 'countries',
 'other',
 'than',
 'Britain',
 'until',
 'the',
 'scientific',
 'advice',
 'was',
 'clearer',
 '.'],
 ['"',
 'We',
 'do',
 "n't",
 'support',
 'any',
 'such',
 'recommendation',
 'because',
 'we',
 'do',
 "n't",
 'see',
 'any',
 'grounds',
 'for',
 'it',
 ',',
 '"',
 'the',
 'Commission',
 "'s",
 'chief',
 'spokesman

In [6]:
# Number of entries in the vocabulary's index-to-string table.
len(vocab.itos)

23627

In [7]:
# Look up the id assigned to a sample token.
vocab['on']

15

In [8]:
def data_process(dt):
    """Encode tokenised documents as tensors of vocabulary ids.

    Every document is wrapped with a begin-of-sequence id in front and an
    end-of-sequence id at the back.

    Args:
        dt: iterable of documents, each an iterable of token strings.

    Returns:
        A list with one 1-D ``torch.long`` tensor per document, of length
        ``len(document) + 2``.
    """
    def boundary_id(token):
        # Prefer the named special token. If the vocabulary does not contain
        # it, keep the previous behaviour of looking up the empty string.
        # NOTE(review): relies on the vocabulary exposing a `stoi` mapping
        # (legacy torchtext Vocab) — confirm.
        return vocab[token] if token in vocab.stoi else vocab['']

    # Resolve the boundary ids once rather than once per document.
    bos_id = boundary_id('<bos>')
    eos_id = boundary_id('<eos>')

    return [
        torch.tensor(
            [bos_id, *(vocab[token] for token in document), eos_id],
            dtype=torch.long,
        )
        for document in dt
    ]

In [9]:
def labels_process(dt):
    """Convert per-document tag lists into label tensors.

    A 0 label is added at both ends of every document, so each tensor has
    two more entries than the tag list it was built from.

    Args:
        dt: iterable of lists of integer tag ids.

    Returns:
        A list with one 1-D ``torch.long`` tensor per document.
    """
    label_tensors = []
    for tag_ids in dt:
        padded = [0] + tag_ids + [0]
        label_tensors.append(torch.tensor(padded, dtype=torch.long))
    return label_tensors


In [10]:
# Encode the training sentences as id tensors.
train_tokens_ids = data_process(dataset['train']['tokens'])

In [11]:
# Encode the test sentences with the vocabulary built from the training split.
test_tokens_ids = data_process(dataset['test']['tokens'])

In [12]:
# Encode the validation sentences with the vocabulary built from the training split.
validation_tokens_ids = data_process(dataset['validation']['tokens'])

In [13]:
# NER tag tensors for the training split, extended by one 0 label at each end.
train_labels = labels_process(dataset['train']['ner_tags'])

In [14]:
# NER tag tensors for the validation split, extended by one 0 label at each end.
validation_labels = labels_process(dataset['validation']['ner_tags'])

In [15]:
# NER tag tensors for the test split, extended by one 0 label at each end.
test_labels = labels_process(dataset['test']['ner_tags'])

In [16]:
# Inspect the first encoded training sentence.
train_tokens_ids[0]

tensor([ 2, 966, 22409, 238, 773, 9, 4588, 212, 7686, 4,
 3])

In [17]:
def get_scores(y_true, y_pred):
    """Compute token-level precision, recall and F1 over non-zero labels.

    Label 0 is treated as "no entity". A prediction counts as a true
    positive only when it is non-zero and equal to the gold label.

    Args:
        y_true: sequence of gold label ids.
        y_pred: sequence of predicted label ids, aligned with ``y_true``.
            The sequences are paired with ``zip``, so any surplus items in
            the longer one are ignored.

    Returns:
        A ``(precision, recall, f1)`` tuple of floats. Precision is 1.0 when
        nothing was predicted as non-zero, recall is 1.0 when no gold label
        is non-zero, and F1 is 0.0 when precision and recall are both 0.
    """
    tp = 0
    selected_items = 0   # predictions with a non-zero label
    relevant_items = 0   # gold labels that are non-zero

    for p, t in zip(y_pred, y_true):
        if p > 0:
            selected_items += 1
            if p == t:
                tp += 1
        if t > 0:
            relevant_items += 1

    precision = tp / selected_items if selected_items else 1.0
    recall = tp / relevant_items if relevant_items else 1.0

    total = precision + recall
    f1 = 2 * precision * recall / total if total else 0.0

    return precision, recall, f1

In [29]:
# Number of distinct NER tags: the largest tag id seen in the training split, plus one.
num_tags = 1 + max(max(sentence_tags) for sentence_tags in dataset['train']['ner_tags'])

In [30]:
class FF(torch.nn.Module):
    """Embedding lookup followed by a linear layer that scores every tag.

    Args:
        vocab_size: number of rows in the embedding table. Defaults to the
            size of the notebook-level ``vocab`` so the table cannot drift
            out of sync with the vocabulary.
        emb_dim: embedding dimensionality.
        n_tags: number of output scores per token. Defaults to the
            notebook-level ``num_tags``.
    """

    def __init__(self, vocab_size=None, emb_dim=200, n_tags=None):
        super().__init__()
        # Resolve defaults at construction time so that `FF()` keeps working
        # and always follows the current vocabulary and tag set.
        if vocab_size is None:
            vocab_size = len(vocab.itos)
        if n_tags is None:
            n_tags = num_tags
        self.emb = torch.nn.Embedding(vocab_size, emb_dim)
        self.fc1 = torch.nn.Linear(emb_dim, n_tags)

    def forward(self, x):
        """Map a tensor of token ids to per-token tag scores.

        The output has the shape of ``x`` with a trailing dimension of size
        ``n_tags`` appended.
        """
        x = self.emb(x)
        x = self.fc1(x)
        return x

In [31]:
# Emission network: produces the per-token tag scores that are fed to the CRF.
ff = FF()

In [32]:
# CRF layer over `num_tags` labels; it scores and decodes the emissions produced by `ff`.
model = CRF(num_tags)

In [33]:
# Train the emission network and the CRF jointly with a single Adam optimizer.
params = [*ff.parameters(), *model.parameters()]
optimizer = torch.optim.Adam(params)

In [34]:
def eval_model(dataset_tokens, dataset_labels):
    """Decode every sentence and score the predictions against the gold labels.

    Puts ``ff`` and ``model`` into evaluation mode and leaves them there.

    Args:
        dataset_tokens: list of 1-D id tensors, one per sentence.
        dataset_labels: list of 1-D label tensors aligned with
            ``dataset_tokens``.

    Returns:
        The ``(precision, recall, f1)`` tuple returned by ``get_scores`` over
        all tokens of all sentences.
    """
    Y_true = []
    Y_pred = []
    ff.eval()
    model.eval()
    # Inference only: no gradients are needed, so skip autograd bookkeeping.
    with torch.no_grad():
        for i in tqdm(range(len(dataset_labels))):
            # Insert a singleton dimension at index 1, as in the training loop.
            emissions = ff(dataset_tokens[i]).unsqueeze(1)
            # Only one sequence is decoded at a time, so take the first path.
            Y_pred += model.decode(emissions)[0]
            Y_true += dataset_labels[i].tolist()

    return get_scores(Y_true, Y_pred)
 

In [24]:
NUM_EPOCHS = 5

for epoch in range(NUM_EPOCHS):
    # eval_model() switches both modules to eval mode, so re-enable training
    # mode at the start of every epoch.
    ff.train()
    model.train()
    # One sentence per optimisation step. The index gets its own name so it
    # no longer shadows the epoch counter.
    for idx in tqdm(range(len(train_labels))):
        batch_tokens = train_tokens_ids[idx]
        # Insert a singleton dimension at index 1 on both labels and emissions.
        tags = train_labels[idx].unsqueeze(1)
        emissions = ff(batch_tokens).unsqueeze(1)

        optimizer.zero_grad()
        # NOTE(review): per the pytorch-crf docs the CRF forward pass returns
        # a log-likelihood, so it is negated to get a loss — confirm.
        loss = -model(emissions, tags)
        loss.backward()
        optimizer.step()

    # eval_model() puts both modules into eval mode itself.
    print(eval_model(validation_tokens_ids, validation_labels))

HBox(children=(FloatProgress(value=0.0, max=14041.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=3250.0), HTML(value='')))


(0.7584134615384616, 0.5134255492270138, 0.6123241145075207)


HBox(children=(FloatProgress(value=0.0, max=3453.0), HTML(value='')))


(0.6854983091306942, 0.42480276134122286, 0.5245452469746555)


HBox(children=(FloatProgress(value=0.0, max=14041.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=3250.0), HTML(value='')))


(0.7624765478424015, 0.7085900267348599, 0.734546330883239)


HBox(children=(FloatProgress(value=0.0, max=3453.0), HTML(value='')))


(0.6329741112349808, 0.629930966469428, 0.6314488724127278)


HBox(children=(FloatProgress(value=0.0, max=14041.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=3250.0), HTML(value='')))


(0.7837289753071692, 0.7636870859002673, 0.7735782409042741)


HBox(children=(FloatProgress(value=0.0, max=3453.0), HTML(value='')))


(0.6449022346368715, 0.6830621301775148, 0.663433908045977)


HBox(children=(FloatProgress(value=0.0, max=14041.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=3250.0), HTML(value='')))


(0.7943538861327664, 0.7817040567243985, 0.7879782061046341)


HBox(children=(FloatProgress(value=0.0, max=3453.0), HTML(value='')))


(0.6709526592635885, 0.7075936883629191, 0.6887862242755145)


HBox(children=(FloatProgress(value=0.0, max=14041.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=3250.0), HTML(value='')))


(0.8190561682482688, 0.742415436475648, 0.7788549478690324)


HBox(children=(FloatProgress(value=0.0, max=3453.0), HTML(value='')))


(0.6873089700996677, 0.6375739644970414, 0.6615079618852722)


In [27]:
# Final (precision, recall, f1) on the validation split.
eval_model(validation_tokens_ids, validation_labels)

HBox(children=(FloatProgress(value=0.0, max=3250.0), HTML(value='')))




(0.8190561682482688, 0.742415436475648, 0.7788549478690324)

In [28]:
# Final (precision, recall, f1) on the held-out test split.
eval_model(test_tokens_ids, test_labels)

HBox(children=(FloatProgress(value=0.0, max=3453.0), HTML(value='')))




(0.6873089700996677, 0.6375739644970414, 0.6615079618852722)