# Klasyfikacja wieloklasowa i sequence labelling

In [1]:
import numpy as np
import gensim
import torch
import pandas as pd
import seaborn as sns
from sklearn.model_selection import train_test_split

from datasets import load_dataset
from torchtext.vocab import Vocab
from collections import Counter

from sklearn.datasets import fetch_20newsgroups
# https://scikit-learn.org/0.19/datasets/twenty_newsgroups.html

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score



## Klasyfikacja

### Klasfikacja binarna-  2 klasy

In [2]:
CATEGORIES = ['soc.religion.christian', 'alt.atheism']

In [3]:
newsgroups = fetch_20newsgroups(categories=CATEGORIES)

In [4]:
X =  newsgroups['data']

In [5]:
Y = newsgroups['target']

In [6]:
Y_names = newsgroups['target_names']

In [7]:
X[0:1]

['From: nigel.allen@canrem.com (Nigel Allen)\nSubject: library of congress to host dead sea scroll symposium april 21-22\nLines: 96\n\n\n Library of Congress to Host Dead Sea Scroll Symposium April 21-22\n To: National and Assignment desks, Daybook Editor\n Contact: John Sullivan, 202-707-9216, or Lucy Suddreth, 202-707-9191\n          both of the Library of Congress\n\n   WASHINGTON, April 19  -- A symposium on the Dead Sea \nScrolls will be held at the Library of Congress on Wednesday,\nApril 21, and Thursday, April 22.  The two-day program, cosponsored\nby the library and Baltimore Hebrew University, with additional\nsupport from the Project Judaica Foundation, will be held in the\nlibrary\'s Mumford Room, sixth floor, Madison Building.\n   Seating is limited, and admission to any session of the symposium\nmust be requested in writing (see Note A).\n   The symposium will be held one week before the public opening of a\nmajor exhibition, "Scrolls from the Dead Sea: The Ancient Librar

In [8]:
Y

array([1, 0, 1, ..., 0, 1, 1])

In [9]:
Y_names

['alt.atheism', 'soc.religion.christian']

In [10]:
del CATEGORIES, newsgroups, X, Y, Y_names

### klasyfikacja wieloklasowa

In [11]:
newsgroups_train_dev = fetch_20newsgroups(subset = 'train')
newsgroups_test = fetch_20newsgroups(subset = 'test')

In [12]:
newsgroups_train_dev_text = newsgroups_train_dev['data']
newsgroups_test_text = newsgroups_test['data']

In [13]:
Y_train_dev = newsgroups_train_dev['target']
Y_test = newsgroups_test['target']

In [14]:
newsgroups_train_text, newsgroups_dev_text, Y_train, Y_dev = train_test_split(newsgroups_train_dev_text, Y_train_dev, random_state=42)

In [15]:
Y_names = newsgroups_train_dev['target_names']

In [16]:
Y_train_dev

array([7, 4, 4, ..., 3, 1, 8])

In [17]:
Y_names

['alt.atheism',
 'comp.graphics',
 'comp.os.ms-windows.misc',
 'comp.sys.ibm.pc.hardware',
 'comp.sys.mac.hardware',
 'comp.windows.x',
 'misc.forsale',
 'rec.autos',
 'rec.motorcycles',
 'rec.sport.baseball',
 'rec.sport.hockey',
 'sci.crypt',
 'sci.electronics',
 'sci.med',
 'sci.space',
 'soc.religion.christian',
 'talk.politics.guns',
 'talk.politics.mideast',
 'talk.politics.misc',
 'talk.religion.misc']

**Jaki baseline?**

In [18]:
pd.value_counts(Y_train)

10    464
5     457
9     456
15    449
13    449
2     449
6     448
12    447
1     446
3     445
8     443
14    441
11    439
7     430
17    429
4     421
16    396
0     363
18    328
19    285
dtype: int64

In [19]:
accuracy_score(Y_test, np.ones_like(Y_test) * 10)

0.05297397769516728



**Pytanie** - w jaki sposób stworzyć taki klasyfikator na podstawie tylko wiedzy z poprzednich ćwiczeń?

#### Zadanie - stworzyć klasyfikator regresji logistycznej one vs rest na podstawie tfdif. TFIDF powinien mieć słownik o wielkości 10000

https://scikit-learn.org/stable/modules/generated/sklearn.multiclass.OneVsRestClassifier.html
https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html
https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html

In [20]:
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer

### Podejście softmax na tfidif

**Zadanie** Na podstawie poprzednich zajęć stworzyć sieć w pytorch bez warstw ukrytych, z jedną warstwą *output* z funkcją softmax (bez trenowania i ewaluacji sieci)

Użyć https://pytorch.org/docs/stable/generated/torch.nn.Softmax.html?highlight=softmax

In [31]:
X_train

<8485x10000 sparse matrix of type '<class 'numpy.float64'>'
	with 1115170 stored elements in Compressed Sparse Row format>

In [32]:
class NeuralNetworkModel(torch.nn.Module):

    pass

In [33]:
OUTPUT_SIZE = len(Y_names)

In [34]:
nn_model = NeuralNetworkModel(FEAUTERES, OUTPUT_SIZE)

In [35]:
nn_model(torch.Tensor(X_train[0:3].astype(np.float32).todense()))

tensor([[0.3346, 0.3337, 0.3341, 0.3301, 0.3312, 0.3340, 0.3347, 0.3345, 0.3318,
         0.3337, 0.3344, 0.3328, 0.3314, 0.3322, 0.3335, 0.3331, 0.3328, 0.3302,
         0.3336, 0.3331],
        [0.3325, 0.3342, 0.3339, 0.3370, 0.3361, 0.3309, 0.3321, 0.3332, 0.3349,
         0.3324, 0.3330, 0.3345, 0.3353, 0.3368, 0.3326, 0.3331, 0.3329, 0.3335,
         0.3352, 0.3333],
        [0.3329, 0.3321, 0.3319, 0.3329, 0.3327, 0.3350, 0.3332, 0.3323, 0.3333,
         0.3339, 0.3326, 0.3327, 0.3333, 0.3310, 0.3339, 0.3338, 0.3342, 0.3363,
         0.3312, 0.3336]], grad_fn=<SoftmaxBackward>)

In [36]:
BATCH_SIZE = 5

In [37]:
criterion = torch.nn.NLLLoss()

In [38]:
optimizer = torch.optim.SGD(nn_model.parameters(), lr = 0.2)
#optimizer = torch.optim.Adam(nn_model.parameters())

In [39]:
def get_loss_acc(model, X_dataset, Y_dataset):
    loss_score = 0
    acc_score = 0
    items_total = 0
    model.eval()
    for i in range(0, Y_dataset.shape[0], BATCH_SIZE):
        X = X_dataset[i:i+BATCH_SIZE]
        X = torch.tensor(X.astype(np.float32).todense())
        Y = Y_dataset[i:i+BATCH_SIZE]
        Y = torch.tensor(Y)
        Y_predictions = model(X)
        acc_score += torch.sum(torch.argmax(Y_predictions,dim=1) == Y).item()
        items_total += Y.shape[0] 

        loss = criterion(Y_predictions, Y)

        loss_score += loss.item() * Y.shape[0] 
    return (loss_score / items_total), (acc_score / items_total)

In [40]:
for epoch in range(5):
    loss_score = 0
    acc_score = 0
    items_total = 0
    nn_model.train()
    for i in range(0, Y_train.shape[0], BATCH_SIZE):
        X = X_train[i:i+BATCH_SIZE]
        X = torch.tensor(X.astype(np.float32).todense())
        Y = Y_train[i:i+BATCH_SIZE]

        Y = torch.tensor(Y)
        Y_predictions = nn_model(X)
        acc_score += torch.sum(torch.argmax(Y_predictions,dim=1) == Y).item()
        items_total += Y.shape[0] 

        optimizer.zero_grad()
        loss = criterion(Y_predictions, Y)
        loss.backward()
        optimizer.step()


        loss_score += loss.item() * Y.shape[0]

    
    display(epoch)
    display(get_loss_acc(nn_model, X_train, Y_train))
    display(get_loss_acc(nn_model, X_dev, Y_dev))

0

(-0.2123467895692422, 0.7024160282852092)

(-0.2114526037404629, 0.6313184870979145)

1

(-0.2260659699079635, 0.71101944608132)

(-0.22411313908455777, 0.6415694591728526)

2

(-0.2411795081323782, 0.7104301708898055)

(-0.23805134338660897, 0.6451042771297278)

3

(-0.2576608823620016, 0.7103123158515027)

(-0.2532473176549205, 0.6468716861081655)

4

(-0.27536448837204686, 0.7121979964643489)

(-0.2695745413549463, 0.6504065040650406)

In [41]:
X.shape

torch.Size([5, 10000])

In [42]:
newsgroups_train_text

["From: DSHAL@vmd.cso.uiuc.edu\nSubject: Re: Clintons views on Jerusalem\nOrganization: C.C.S.O.\nLines: 10\n\nIt seems that President Clinton can recognize Jerusalem as Israels capitol\nwhile still keeping his diplomatic rear door open by stating that the Parties\nconcerned should decide the city's final status. Even as I endorse Clintons vie\nw (of course), it is definitely a matter to be decided upon by Israel (and\nother participating neighboring contries).\nI see no real conflict in stating both views, nor expect any better from\npoliticians.\n-----\nDavid Shalhevet / dshal@vmd.cso.uiuc.edu / University of Illinois\nDept Anim Sci / 220 PABL / 1201 W. Gregory Dr. / Urbana, IL 61801\n",
 "From: mouse@thunder.mcrcim.mcgill.edu (der Mouse)\nSubject: Re: XWindows always opaque\nKeywords: xwindow, parent-child relation\nOrganization: McGill Research Centre for Intelligent Machines\nLines: 17\n\n> Distribution: comp\n\nPlease don't misuse newsgroup hierarchy names as distributions.\n\nIn

### Podejście softmax z embeddingami na przykładzie NER

In [43]:
# !pip install torchtext
# !pip install datasets

https://www.aclweb.org/anthology/W03-0419.pdf

In [44]:
dataset = load_dataset("conll2003")

Reusing dataset conll2003 (/home/kuba/.cache/huggingface/datasets/conll2003/conll2003/1.0.0/40e7cb6bcc374f7c349c83acd1e9352a4f09474eb691f64f364ee62eb65d0ca6)


In [45]:
def build_vocab(dataset):
    counter = Counter()
    for document in dataset:
        counter.update(document)
    return Vocab(counter, specials=['<unk>', '<pad>', '<bos>', '<eos>'])

In [46]:
vocab = build_vocab(dataset['train']['tokens'])

In [47]:
dataset['train']['tokens']

[['EU', 'rejects', 'German', 'call', 'to', 'boycott', 'British', 'lamb', '.'],
 ['Peter', 'Blackburn'],
 ['BRUSSELS', '1996-08-22'],
 ['The',
  'European',
  'Commission',
  'said',
  'on',
  'Thursday',
  'it',
  'disagreed',
  'with',
  'German',
  'advice',
  'to',
  'consumers',
  'to',
  'shun',
  'British',
  'lamb',
  'until',
  'scientists',
  'determine',
  'whether',
  'mad',
  'cow',
  'disease',
  'can',
  'be',
  'transmitted',
  'to',
  'sheep',
  '.'],
 ['Germany',
  "'s",
  'representative',
  'to',
  'the',
  'European',
  'Union',
  "'s",
  'veterinary',
  'committee',
  'Werner',
  'Zwingmann',
  'said',
  'on',
  'Wednesday',
  'consumers',
  'should',
  'buy',
  'sheepmeat',
  'from',
  'countries',
  'other',
  'than',
  'Britain',
  'until',
  'the',
  'scientific',
  'advice',
  'was',
  'clearer',
  '.'],
 ['"',
  'We',
  'do',
  "n't",
  'support',
  'any',
  'such',
  'recommendation',
  'because',
  'we',
  'do',
  "n't",
  'see',
  'any',
  'grounds',
  'fo

In [48]:
len(vocab.itos)

23627

In [49]:
vocab['on']

15

In [50]:
def data_process(dt):
    return [ torch.tensor([vocab['<bos>']] +[vocab[token]  for token in  document ] + [vocab['<eos>']], dtype = torch.long) for document in dt]


In [51]:
def labels_process(dt):
    return [ torch.tensor([0] + document + [0], dtype = torch.long) for document in dt]


In [52]:
train_tokens_ids = data_process(dataset['train']['tokens'])

In [53]:
test_tokens_ids = data_process(dataset['test']['tokens'])

In [54]:
train_labels = labels_process(dataset['train']['ner_tags'])

In [55]:
test_labels = labels_process(dataset['test']['ner_tags'])

In [56]:
train_tokens_ids[0]

tensor([    2,   966, 22409,   238,   773,     9,  4588,   212,  7686,     4,
            3])

In [57]:
max([max(x) for x in dataset['train']['ner_tags'] ])

8

In [58]:
class NERModel(torch.nn.Module):

    def __init__(self,):
        super(NERModel, self).__init__()
        self.emb = torch.nn.Embedding(23627,200)
        self.fc1 = torch.nn.Linear(600,9)
        #self.softmax = torch.nn.Softmax(dim=0)
        # nie trzeba, bo używamy https://pytorch.org/docs/stable/generated/torch.nn.CrossEntropyLoss.html
        # jako kryterium
        

    def forward(self, x):
        x = self.emb(x)
        x = x.reshape(600) 
        x = self.fc1(x)
        #x = self.softmax(x)
        return x

In [59]:
train_tokens_ids[0][1:4]

tensor([  966, 22409,   238])

In [60]:
ner_model = NERModel()

In [61]:
ner_model(train_tokens_ids[0][1:4])

tensor([ 0.5914,  0.4670,  0.6421, -0.5443,  0.1544,  0.6162,  1.0013, -0.5271,
        -0.2552], grad_fn=<AddBackward0>)

In [62]:
criterion = torch.nn.CrossEntropyLoss()

In [63]:
optimizer = torch.optim.Adam(ner_model.parameters())

In [64]:
len(train_labels)

14041

In [65]:
for epoch in range(2):
    loss_score = 0
    acc_score = 0
    prec_score = 0
    selected_items = 0
    recall_score = 0
    relevant_items = 0
    items_total = 0
    nn_model.train()
    #for i in range(len(train_labels)):
    for i in range(100):
        for j in range(1, len(train_labels[i]) - 1):
    
            X = train_tokens_ids[i][j-1: j+2]
            Y = train_labels[i][j: j+1]

            Y_predictions = ner_model(X)
            
            
            acc_score += int(torch.argmax(Y_predictions) == Y)
            
            if torch.argmax(Y_predictions) != 0:
                selected_items +=1
            if  torch.argmax(Y_predictions) != 0 and torch.argmax(Y_predictions) == Y.item():
                prec_score += 1
            
            if  Y.item() != 0:
                relevant_items +=1
            if  Y.item() != 0 and torch.argmax(Y_predictions) == Y.item():
                recall_score += 1
            
            items_total += 1

            
            optimizer.zero_grad()
            loss = criterion(Y_predictions.unsqueeze(0), Y)
            loss.backward()
            optimizer.step()


            loss_score += loss.item() 
    
    precision = prec_score / selected_items
    recall = recall_score / relevant_items
    f1_score = (2*precision * recall) / (precision + recall)
    display('epoch: ', epoch)
    display('loss: ', loss_score / items_total)
    display('acc: ', acc_score / items_total)
    display('prec: ', precision)
    display('recall: : ', recall)
    display('f1: ', f1_score)

KeyboardInterrupt: 

In [None]:
loss_score = 0
acc_score = 0
prec_score = 0
selected_items = 0
recall_score = 0
relevant_items = 0
items_total = 0
nn_model.eval()
for i in range(100):
#for i in range(len(test_labels)):
    for j in range(1, len(test_labels[i]) - 1):

        X = test_tokens_ids[i][j-1: j+2]
        Y = test_labels[i][j: j+1]

        Y_predictions = ner_model(X)


        acc_score += int(torch.argmax(Y_predictions) == Y)

        if torch.argmax(Y_predictions) != 0:
            selected_items +=1
        if  torch.argmax(Y_predictions) != 0 and torch.argmax(Y_predictions) == Y.item():
            prec_score += 1

        if  Y.item() != 0:
            relevant_items +=1
        if  Y.item() != 0 and torch.argmax(Y_predictions) == Y.item():
            recall_score += 1

        items_total += 1


        loss = criterion(Y_predictions.unsqueeze(0), Y)



        loss_score += loss.item() 

precision = prec_score / selected_items
recall = recall_score / relevant_items
f1_score = (2*precision * recall) / (precision + recall)
display('loss: ', loss_score / items_total)
display('acc: ', acc_score / items_total)
display('prec: ', precision)
display('recall: : ', recall)
display('f1: ', f1_score)

### Zadanie domowe

- sklonować repozytorium https://git.wmi.amu.edu.pl/kubapok/en-ner-conll-2003
- stworzyć klasyfikator bazujący na sieci neuronowej feed forward w pytorchu (można bazować na tym jupyterze lub nie).
- klasyfikator powinien obejmować dodatkowe cechy (np. długość wyrazu, czy wyraz zaczyna się od wielkiej litery, stemmming słowa, czy zawiera cyfrę)
- stworzyć predykcje w plikach dev-0/out.tsv oraz test-A/out.tsv
- wynik fscore sprawdzony za pomocą narzędzia geval (patrz poprzednie zadanie) powinien wynosić conajmniej 0.60
- proszę umieścić predykcję oraz skrypty generujące (w postaci tekstowej a nie jupyter) w repo, a w MS TEAMS umieścić link do swojego repo
termin 08.06, 80 punktów
