# Klasyfikacja wieloklasowa i sequence labelling

https://www.aclweb.org/anthology/W03-0419.pdf

In [1]:
import numpy as np
import gensim
import torch
import pandas as pd
import seaborn as sns
from sklearn.model_selection import train_test_split

from sklearn.datasets import fetch_20newsgroups
# https://scikit-learn.org/0.19/datasets/twenty_newsgroups.html

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score



## Klasyfikacja

### Klasfikacja binarna-  2 klasy

In [2]:
CATEGORIES = ['soc.religion.christian', 'alt.atheism']

In [3]:
newsgroups = fetch_20newsgroups(categories=CATEGORIES)

In [4]:
X =  newsgroups['data']

In [5]:
Y = newsgroups['target']

In [6]:
Y_names = newsgroups['target_names']

In [7]:
X[0:1]

['From: nigel.allen@canrem.com (Nigel Allen)\nSubject: library of congress to host dead sea scroll symposium april 21-22\nLines: 96\n\n\n Library of Congress to Host Dead Sea Scroll Symposium April 21-22\n To: National and Assignment desks, Daybook Editor\n Contact: John Sullivan, 202-707-9216, or Lucy Suddreth, 202-707-9191\n          both of the Library of Congress\n\n   WASHINGTON, April 19  -- A symposium on the Dead Sea \nScrolls will be held at the Library of Congress on Wednesday,\nApril 21, and Thursday, April 22.  The two-day program, cosponsored\nby the library and Baltimore Hebrew University, with additional\nsupport from the Project Judaica Foundation, will be held in the\nlibrary\'s Mumford Room, sixth floor, Madison Building.\n   Seating is limited, and admission to any session of the symposium\nmust be requested in writing (see Note A).\n   The symposium will be held one week before the public opening of a\nmajor exhibition, "Scrolls from the Dead Sea: The Ancient Librar

In [8]:
Y

array([1, 0, 1, ..., 0, 1, 1])

In [9]:
Y_names

['alt.atheism', 'soc.religion.christian']

In [10]:
del CATEGORIES, newsgroups, X, Y, Y_names

### klasyfikacja wieloklasowa

In [11]:
newsgroups_train_dev = fetch_20newsgroups(subset = 'train')
newsgroups_test = fetch_20newsgroups(subset = 'test')

In [12]:
newsgroups_train_dev_text = newsgroups_train_dev['data']
newsgroups_test_text = newsgroups_test['data']

In [13]:
Y_train_dev = newsgroups_train_dev['target']
Y_test = newsgroups_test['target']

In [14]:
newsgroups_train_text, newsgroups_dev_text, Y_train, Y_dev = train_test_split(newsgroups_train_dev_text, Y_train_dev, random_state=42)

In [15]:
Y_names = newsgroups_train_dev['target_names']

In [16]:
Y_train_dev

array([7, 4, 4, ..., 3, 1, 8])

In [17]:
Y_names

['alt.atheism',
 'comp.graphics',
 'comp.os.ms-windows.misc',
 'comp.sys.ibm.pc.hardware',
 'comp.sys.mac.hardware',
 'comp.windows.x',
 'misc.forsale',
 'rec.autos',
 'rec.motorcycles',
 'rec.sport.baseball',
 'rec.sport.hockey',
 'sci.crypt',
 'sci.electronics',
 'sci.med',
 'sci.space',
 'soc.religion.christian',
 'talk.politics.guns',
 'talk.politics.mideast',
 'talk.politics.misc',
 'talk.religion.misc']

**Jaki baseline?**

In [18]:
pd.value_counts(Y_train)

10    464
5     457
9     456
15    449
13    449
2     449
6     448
12    447
1     446
3     445
8     443
14    441
11    439
7     430
17    429
4     421
16    396
0     363
18    328
19    285
dtype: int64

In [19]:
accuracy_score(Y_test, np.ones_like(Y_test) * 10)

0.05297397769516728



**Pytanie** - w jaki sposób stworzyć taki klasyfikator na podstawie tylko wiedzy z poprzednich ćwiczeń?

#### Zadanie - stworzyć klasyfikator regresji logistycznej one vs rest na podstawie tfdif. TFIDF powinien mieć słownik o wielkości 10000

https://scikit-learn.org/stable/modules/generated/sklearn.multiclass.OneVsRestClassifier.html
https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html
https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html

In [20]:
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer

In [21]:
FEAUTERES = 10_000

In [22]:
vectorizer = TfidfVectorizer(max_features=FEAUTERES)
X_train = vectorizer.fit_transform(newsgroups_train_text)

In [23]:
X_dev = vectorizer.transform(newsgroups_dev_text)
X_test = vectorizer.transform(newsgroups_test_text)

In [24]:
clf = OneVsRestClassifier(LogisticRegression()).fit(X_train, Y_train)

In [25]:
clf.predict(X_train[0:1])

array([17])

In [26]:
clf.predict_proba(X_train[0:1])

array([[0.01996353, 0.04403235, 0.03483328, 0.01851026, 0.0341886 ,
        0.02510403, 0.0274924 , 0.03861172, 0.02705295, 0.04672495,
        0.02225957, 0.02924003, 0.02810532, 0.04871973, 0.06882963,
        0.04601515, 0.03421612, 0.31493021, 0.06406168, 0.02710846]])

In [27]:
np.max(clf.predict_proba(X_train[0]))

0.314930213117783

In [28]:
accuracy_score(clf.predict(X_train), Y_train)

0.9558043606364172

In [29]:
accuracy_score(clf.predict(X_dev), Y_dev)

0.8745139625309296

In [30]:
accuracy_score(clf.predict(X_test), Y_test)

0.8027084439723845

### Podejście softmax na tfidif

**Zadanie** Na podstawie poprzednich zajęć stworzyć sieć w pytorch bez warstw ukrytych, z jedną warstwą *output* z funkcją softmax oraz wytrenować sieć. 

Użyć https://pytorch.org/docs/stable/generated/torch.nn.Softmax.html?highlight=softmax

In [31]:
X_train

<8485x10000 sparse matrix of type '<class 'numpy.float64'>'
	with 1115170 stored elements in Compressed Sparse Row format>

In [32]:
class NeuralNetworkModel(torch.nn.Module):

    def __init__(self,FEAUTERES, output_size):
        super(NeuralNetworkModel, self).__init__()
        self.fc1 = torch.nn.Linear(FEAUTERES,OUTPUT_SIZE)
        self.softmax = torch.nn.Softmax(dim=0)
        

    def forward(self, x):
        x = self.fc1(x)
        x = self.softmax(x)
        return x

In [33]:
OUTPUT_SIZE = len(Y_names)

In [34]:
nn_model = NeuralNetworkModel(FEAUTERES, OUTPUT_SIZE)

In [35]:
nn_model(torch.Tensor(X_train[0:3].astype(np.float32).todense()))

tensor([[0.3347, 0.3319, 0.3343, 0.3335, 0.3365, 0.3322, 0.3336, 0.3335, 0.3365,
         0.3299, 0.3329, 0.3343, 0.3303, 0.3343, 0.3338, 0.3325, 0.3327, 0.3330,
         0.3331, 0.3320],
        [0.3317, 0.3330, 0.3296, 0.3341, 0.3297, 0.3341, 0.3311, 0.3346, 0.3334,
         0.3339, 0.3313, 0.3351, 0.3344, 0.3334, 0.3337, 0.3343, 0.3337, 0.3333,
         0.3337, 0.3345],
        [0.3337, 0.3351, 0.3360, 0.3324, 0.3338, 0.3337, 0.3352, 0.3319, 0.3301,
         0.3362, 0.3358, 0.3306, 0.3354, 0.3323, 0.3325, 0.3332, 0.3337, 0.3337,
         0.3331, 0.3336]], grad_fn=<SoftmaxBackward>)

In [36]:
BATCH_SIZE = 5

In [37]:
criterion = torch.nn.NLLLoss()

In [38]:
optimizer = torch.optim.SGD(nn_model.parameters(), lr = 0.2)
#optimizer = torch.optim.Adam(nn_model.parameters())

In [39]:
def get_loss_acc(model, X_dataset, Y_dataset):
    loss_score = 0
    acc_score = 0
    items_total = 0
    model.eval()
    for i in range(0, Y_dataset.shape[0], BATCH_SIZE):
        X = X_dataset[i:i+BATCH_SIZE]
        X = torch.tensor(X.astype(np.float32).todense())
        Y = Y_dataset[i:i+BATCH_SIZE]
        Y = torch.tensor(Y)
        Y_predictions = model(X)
        acc_score += torch.sum(torch.argmax(Y_predictions,dim=1) == Y).item()
        items_total += Y.shape[0] 

        loss = criterion(Y_predictions, Y)

        loss_score += loss.item() * Y.shape[0] 
    return (loss_score / items_total), (acc_score / items_total)

In [40]:
for epoch in range(5):
    loss_score = 0
    acc_score = 0
    items_total = 0
    nn_model.train()
    for i in range(0, Y_train.shape[0], BATCH_SIZE):
        X = X_train[i:i+BATCH_SIZE]
        X = torch.tensor(X.astype(np.float32).todense())
        Y = Y_train[i:i+BATCH_SIZE]

        Y = torch.tensor(Y)
        Y_predictions = nn_model(X)
        acc_score += torch.sum(torch.argmax(Y_predictions,dim=1) == Y).item()
        items_total += Y.shape[0] 

        optimizer.zero_grad()
        loss = criterion(Y_predictions, Y)
        loss.backward()
        optimizer.step()


        loss_score += loss.item() * Y.shape[0]

    
    display(epoch)
    display(get_loss_acc(nn_model, X_train, Y_train))
    display(get_loss_acc(nn_model, X_dev, Y_dev))

0

(-0.21239922212795714, 0.6947554507955215)

(-0.2115392372480927, 0.6376811594202898)

1

(-0.226128276834856, 0.705833824395993)

(-0.22420935996980287, 0.6422764227642277)

2

(-0.24125191222483366, 0.7097230406599883)

(-0.23815746801866158, 0.6489925768822906)

3

(-0.25774253752910187, 0.7105480259281084)

(-0.2533627246411372, 0.6489925768822906)

4

(-0.27545327459213376, 0.7123158515026518)

(-0.26969742850285616, 0.6504065040650406)

In [41]:
X.shape

torch.Size([5, 10000])

In [42]:
newsgroups_train_text

["From: DSHAL@vmd.cso.uiuc.edu\nSubject: Re: Clintons views on Jerusalem\nOrganization: C.C.S.O.\nLines: 10\n\nIt seems that President Clinton can recognize Jerusalem as Israels capitol\nwhile still keeping his diplomatic rear door open by stating that the Parties\nconcerned should decide the city's final status. Even as I endorse Clintons vie\nw (of course), it is definitely a matter to be decided upon by Israel (and\nother participating neighboring contries).\nI see no real conflict in stating both views, nor expect any better from\npoliticians.\n-----\nDavid Shalhevet / dshal@vmd.cso.uiuc.edu / University of Illinois\nDept Anim Sci / 220 PABL / 1201 W. Gregory Dr. / Urbana, IL 61801\n",
 "From: mouse@thunder.mcrcim.mcgill.edu (der Mouse)\nSubject: Re: XWindows always opaque\nKeywords: xwindow, parent-child relation\nOrganization: McGill Research Centre for Intelligent Machines\nLines: 17\n\n> Distribution: comp\n\nPlease don't misuse newsgroup hierarchy names as distributions.\n\nIn

### Podejście softmax z embeddingami na przykładzie NER

In [43]:
# !pip install torchtext
# !pip install datasets

In [44]:
from datasets import load_dataset
from torchtext.vocab import Vocab
from collections import Counter

In [45]:
dataset = load_dataset("conll2003")

Reusing dataset conll2003 (/home/kuba/.cache/huggingface/datasets/conll2003/conll2003/1.0.0/40e7cb6bcc374f7c349c83acd1e9352a4f09474eb691f64f364ee62eb65d0ca6)


In [46]:
def build_vocab(dataset):
    counter = Counter()
    for document in dataset:
        counter.update(document)
    return Vocab(counter, specials=['<unk>', '<pad>', '<bos>', '<eos>'])

In [47]:
vocab = build_vocab(dataset['train']['tokens'])

In [48]:
dataset['train']['tokens']

[['EU', 'rejects', 'German', 'call', 'to', 'boycott', 'British', 'lamb', '.'],
 ['Peter', 'Blackburn'],
 ['BRUSSELS', '1996-08-22'],
 ['The',
  'European',
  'Commission',
  'said',
  'on',
  'Thursday',
  'it',
  'disagreed',
  'with',
  'German',
  'advice',
  'to',
  'consumers',
  'to',
  'shun',
  'British',
  'lamb',
  'until',
  'scientists',
  'determine',
  'whether',
  'mad',
  'cow',
  'disease',
  'can',
  'be',
  'transmitted',
  'to',
  'sheep',
  '.'],
 ['Germany',
  "'s",
  'representative',
  'to',
  'the',
  'European',
  'Union',
  "'s",
  'veterinary',
  'committee',
  'Werner',
  'Zwingmann',
  'said',
  'on',
  'Wednesday',
  'consumers',
  'should',
  'buy',
  'sheepmeat',
  'from',
  'countries',
  'other',
  'than',
  'Britain',
  'until',
  'the',
  'scientific',
  'advice',
  'was',
  'clearer',
  '.'],
 ['"',
  'We',
  'do',
  "n't",
  'support',
  'any',
  'such',
  'recommendation',
  'because',
  'we',
  'do',
  "n't",
  'see',
  'any',
  'grounds',
  'fo

In [49]:
len(vocab.itos)

23627

In [50]:
vocab['on']

15

In [52]:
def data_process(dt):
    return [ torch.tensor([vocab['<bos>']] +[vocab[token]  for token in  document ] + [vocab['<eos>']], dtype = torch.long) for document in dt]


In [53]:
def labels_process(dt):
    return [ torch.tensor([0] + document + [0], dtype = torch.long) for document in dt]


In [54]:
train_tokens_ids = data_process(dataset['train']['tokens'])

In [84]:
test_tokens_ids = data_process(dataset['test']['tokens'])

In [55]:
train_labels = labels_process(dataset['train']['ner_tags'])

In [85]:
test_labels = labels_process(dataset['test']['ner_tags'])

In [56]:
train_tokens_ids[0]

tensor([    2,   966, 22409,   238,   773,     9,  4588,   212,  7686,     4,
            3])

In [57]:
max([max(x) for x in dataset['train']['ner_tags'] ])

8

In [69]:
class NERModel(torch.nn.Module):

    def __init__(self,):
        super(NERModel, self).__init__()
        self.emb = torch.nn.Embedding(23627,200)
        self.fc1 = torch.nn.Linear(600,9)
        #self.softmax = torch.nn.Softmax(dim=0)
        # nie trzeba, bo używamy https://pytorch.org/docs/stable/generated/torch.nn.CrossEntropyLoss.html
        # jako kryterium
        

    def forward(self, x):
        x = self.emb(x)
        x = x.reshape(600) 
        x = self.fc1(x)
        #x = self.softmax(x)
        return x

In [91]:
train_tokens_ids[0][1:4]

tensor([  966, 22409,   238])

In [92]:
ner_model = NERModel()

In [93]:
ner_model(train_tokens_ids[0][1:4])

tensor([-0.6856,  0.8183, -0.4898, -0.3286, -0.2323,  0.4644,  0.3463,  0.3909,
        -0.1110], grad_fn=<AddBackward0>)

In [94]:
criterion = torch.nn.CrossEntropyLoss()

In [95]:
optimizer = torch.optim.Adam(ner_model.parameters())

In [96]:
len(train_labels)

14041

In [97]:
for epoch in range(1):
    loss_score = 0
    acc_score = 0
    prec_score = 0
    selected_items = 0
    recall_score = 0
    relevant_items = 0
    items_total = 0
    nn_model.train()
    for i in range(len(train_labels)):
    #for i in range(100):
        for j in range(1, len(train_labels[i]) - 1):
    
            X = train_tokens_ids[i][j-1: j+2]
            Y = train_labels[i][j: j+1]

            Y_predictions = ner_model(X)
            
            
            acc_score += int(torch.argmax(Y_predictions) == Y)
            
            if torch.argmax(Y_predictions) != 0:
                selected_items +=1
            if  torch.argmax(Y_predictions) != 0 and torch.argmax(Y_predictions) == Y.item():
                prec_score += 1
            
            if  Y.item() != 0:
                relevant_items +=1
            if  Y.item() != 0 and torch.argmax(Y_predictions) == Y.item():
                recall_score += 1
            
            items_total += 1

            
            optimizer.zero_grad()
            loss = criterion(Y_predictions.unsqueeze(0), Y)
            loss.backward()
            optimizer.step()


            loss_score += loss.item() 
    
    precision = prec_score / selected_items
    recall = recall_score / relevant_items
    f1_score = (2*precision * recall) / (precision + recall)
    display('epoch: ', epoch)
    display('loss: ', loss_score / items_total)
    display('acc: ', acc_score / items_total)
    display('prec: ', precision)
    display('recall: : ', recall)
    display('f1: ', f1_score)

KeyboardInterrupt: 

In [98]:
loss_score = 0
acc_score = 0
prec_score = 0
selected_items = 0
recall_score = 0
relevant_items = 0
items_total = 0
nn_model.eval()
for i in range(100):
#for i in range(len(test_labels)):
    for j in range(1, len(test_labels[i]) - 1):

        X = test_tokens_ids[i][j-1: j+2]
        Y = test_labels[i][j: j+1]

        Y_predictions = ner_model(X)


        acc_score += int(torch.argmax(Y_predictions) == Y)

        if torch.argmax(Y_predictions) != 0:
            selected_items +=1
        if  torch.argmax(Y_predictions) != 0 and torch.argmax(Y_predictions) == Y.item():
            prec_score += 1

        if  Y.item() != 0:
            relevant_items +=1
        if  Y.item() != 0 and torch.argmax(Y_predictions) == Y.item():
            recall_score += 1

        items_total += 1


        loss = criterion(Y_predictions.unsqueeze(0), Y)



        loss_score += loss.item() 

precision = prec_score / selected_items
recall = recall_score / relevant_items
f1_score = (2*precision * recall) / (precision + recall)
display('epoch: ', epoch)
display('loss: ', loss_score / items_total)
display('acc: ', acc_score / items_total)
display('prec: ', precision)
display('recall: : ', recall)
display('f1: ', f1_score)

'epoch: '

0

'loss: '

0.5454190555144878

'acc: '

0.8530239099859352

'prec: '

0.5646687697160884

'recall: : '

0.5327380952380952

'f1: '

0.5482388973966309