# Klasyfikacja wieloklasowa i sequence labelling

In [1]:
import numpy as np
import gensim
import torch
import pandas as pd
import seaborn as sns
from sklearn.model_selection import train_test_split

from sklearn.datasets import fetch_20newsgroups
# https://scikit-learn.org/0.19/datasets/twenty_newsgroups.html

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score



## Klasyfikacja

### Klasfikacja binarna-  2 klasy

In [2]:
CATEGORIES = ['soc.religion.christian', 'alt.atheism']

In [3]:
newsgroups = fetch_20newsgroups(categories=CATEGORIES)

In [4]:
X =  newsgroups['data']

In [5]:
Y = newsgroups['target']

In [6]:
Y_names = newsgroups['target_names']

In [7]:
X[0:1]

['From: nigel.allen@canrem.com (Nigel Allen)\nSubject: library of congress to host dead sea scroll symposium april 21-22\nLines: 96\n\n\n Library of Congress to Host Dead Sea Scroll Symposium April 21-22\n To: National and Assignment desks, Daybook Editor\n Contact: John Sullivan, 202-707-9216, or Lucy Suddreth, 202-707-9191\n          both of the Library of Congress\n\n   WASHINGTON, April 19  -- A symposium on the Dead Sea \nScrolls will be held at the Library of Congress on Wednesday,\nApril 21, and Thursday, April 22.  The two-day program, cosponsored\nby the library and Baltimore Hebrew University, with additional\nsupport from the Project Judaica Foundation, will be held in the\nlibrary\'s Mumford Room, sixth floor, Madison Building.\n   Seating is limited, and admission to any session of the symposium\nmust be requested in writing (see Note A).\n   The symposium will be held one week before the public opening of a\nmajor exhibition, "Scrolls from the Dead Sea: The Ancient Librar

In [8]:
Y

array([1, 0, 1, ..., 0, 1, 1])

In [9]:
Y_names

['alt.atheism', 'soc.religion.christian']

In [10]:
del CATEGORIES, newsgroups, X, Y, Y_names

### klasyfikacja wieloklasowa

In [11]:
newsgroups_train_dev = fetch_20newsgroups(subset = 'train')
newsgroups_test = fetch_20newsgroups(subset = 'test')

In [12]:
newsgroups_train_dev_text = newsgroups_train_dev['data']
newsgroups_test_text = newsgroups_test['data']

In [13]:
Y_train_dev = newsgroups_train_dev['target']
Y_test = newsgroups_test['target']

In [14]:
newsgroups_train_text, newsgroups_dev_text, Y_train, Y_dev = train_test_split(newsgroups_train_dev_text, Y_train_dev, random_state=42)

In [15]:
Y_names = newsgroups_train_dev['target_names']

In [16]:
Y_train_dev

array([7, 4, 4, ..., 3, 1, 8])

In [17]:
Y_names

['alt.atheism',
 'comp.graphics',
 'comp.os.ms-windows.misc',
 'comp.sys.ibm.pc.hardware',
 'comp.sys.mac.hardware',
 'comp.windows.x',
 'misc.forsale',
 'rec.autos',
 'rec.motorcycles',
 'rec.sport.baseball',
 'rec.sport.hockey',
 'sci.crypt',
 'sci.electronics',
 'sci.med',
 'sci.space',
 'soc.religion.christian',
 'talk.politics.guns',
 'talk.politics.mideast',
 'talk.politics.misc',
 'talk.religion.misc']

**Jaki baseline?**

In [18]:
pd.value_counts(Y_train)

10    464
5     457
9     456
15    449
13    449
2     449
6     448
12    447
1     446
3     445
8     443
14    441
11    439
7     430
17    429
4     421
16    396
0     363
18    328
19    285
dtype: int64

In [19]:
accuracy_score(Y_test, np.ones_like(Y_test) * 10)

0.05297397769516728



**Pytanie** - w jaki sposób stworzyć taki klasyfikator na podstawie tylko wiedzy z poprzednich ćwiczeń?

#### Zadanie - stworzyć klasyfikator regresji logistycznej one vs rest na podstawie tfdif. TFIDF powinien mieć słownik o wielkości 10000

https://scikit-learn.org/stable/modules/generated/sklearn.multiclass.OneVsRestClassifier.html
https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html
https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html

In [20]:
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer

In [21]:
FEAUTERES = 10_000

In [23]:
vectorizer = TfidfVectorizer(max_features=FEAUTERES)
X_train = vectorizer.fit_transform(newsgroups_train_text)

In [24]:
X_dev = vectorizer.transform(newsgroups_dev_text)
X_test = vectorizer.transform(newsgroups_test_text)

In [25]:
clf = OneVsRestClassifier(LogisticRegression()).fit(X_train, Y_train)

In [26]:
clf.predict(X_train[0:1])

array([17])

In [27]:
clf.predict_proba(X_train[0:1])

array([[0.01996353, 0.04403235, 0.03483328, 0.01851026, 0.0341886 ,
        0.02510403, 0.0274924 , 0.03861172, 0.02705295, 0.04672495,
        0.02225957, 0.02924003, 0.02810532, 0.04871973, 0.06882963,
        0.04601515, 0.03421612, 0.31493021, 0.06406168, 0.02710846]])

In [28]:
np.max(clf.predict_proba(X_train[0]))

0.314930213117783

In [29]:
accuracy_score(clf.predict(X_train), Y_train)

0.9558043606364172

In [30]:
accuracy_score(clf.predict(X_dev), Y_dev)

0.8745139625309296

In [31]:
accuracy_score(clf.predict(X_test), Y_test)

0.8027084439723845

### Podejście softmax na tfidif

**Zadanie** Na podstawie poprzednich zajęć stworzyć sieć w pytorch bez warstw ukrytych, z jedną warstwą *output* z funkcją softmax oraz wytrenować sieć. 

Użyć https://pytorch.org/docs/stable/generated/torch.nn.Softmax.html?highlight=softmax

In [33]:
X_train

<8485x10000 sparse matrix of type '<class 'numpy.float64'>'
	with 1115170 stored elements in Compressed Sparse Row format>

In [34]:
class NeuralNetworkModel(torch.nn.Module):

    def __init__(self,FEAUTERES, output_size):
        super(NeuralNetworkModel, self).__init__()
        self.fc1 = torch.nn.Linear(FEAUTERES,OUTPUT_SIZE)
        self.softmax = torch.nn.Softmax(dim=0)
        

    def forward(self, x):
        x = self.fc1(x)
        x = self.softmax(x)
        return x

In [35]:
OUTPUT_SIZE = len(Y_names)

In [36]:
nn_model = NeuralNetworkModel(FEAUTERES, OUTPUT_SIZE)

In [37]:
nn_model(torch.Tensor(X_train[0:3].astype(np.float32).todense()))

tensor([[0.3320, 0.3328, 0.3347, 0.3342, 0.3328, 0.3325, 0.3304, 0.3357, 0.3349,
         0.3329, 0.3312, 0.3340, 0.3350, 0.3334, 0.3349, 0.3319, 0.3340, 0.3345,
         0.3326, 0.3322],
        [0.3355, 0.3337, 0.3323, 0.3318, 0.3339, 0.3345, 0.3339, 0.3306, 0.3355,
         0.3343, 0.3336, 0.3328, 0.3331, 0.3309, 0.3318, 0.3349, 0.3337, 0.3318,
         0.3338, 0.3337],
        [0.3325, 0.3335, 0.3331, 0.3340, 0.3333, 0.3330, 0.3357, 0.3337, 0.3296,
         0.3328, 0.3352, 0.3332, 0.3319, 0.3357, 0.3333, 0.3332, 0.3323, 0.3337,
         0.3337, 0.3341]], grad_fn=<SoftmaxBackward>)

In [38]:
BATCH_SIZE = 5

In [39]:
criterion = torch.nn.NLLLoss()

In [40]:
optimizer = torch.optim.SGD(nn_model.parameters(), lr = 0.2)
#optimizer = torch.optim.Adam(nn_model.parameters())

In [41]:
def get_loss_acc(model, X_dataset, Y_dataset):
    loss_score = 0
    acc_score = 0
    items_total = 0
    model.eval()
    for i in range(0, Y_dataset.shape[0], BATCH_SIZE):
        X = X_dataset[i:i+BATCH_SIZE]
        X = torch.tensor(X.astype(np.float32).todense())
        Y = Y_dataset[i:i+BATCH_SIZE]
        Y = torch.tensor(Y)
        Y_predictions = model(X)
        acc_score += torch.sum(torch.argmax(Y_predictions,dim=1) == Y).item()
        items_total += Y.shape[0] 

        loss = criterion(Y_predictions, Y)

        loss_score += loss.item() * Y.shape[0] 
    return (loss_score / items_total), (acc_score / items_total)

In [42]:
for epoch in range(5):
    loss_score = 0
    acc_score = 0
    items_total = 0
    nn_model.train()
    for i in range(0, Y_train.shape[0], BATCH_SIZE):
        X = X_train[i:i+BATCH_SIZE]
        X = torch.tensor(X.astype(np.float32).todense())
        Y = Y_train[i:i+BATCH_SIZE]

        Y = torch.tensor(Y)
        Y_predictions = nn_model(X)
        acc_score += torch.sum(torch.argmax(Y_predictions,dim=1) == Y).item()
        items_total += Y.shape[0] 

        optimizer.zero_grad()
        loss = criterion(Y_predictions, Y)
        loss.backward()
        optimizer.step()


        loss_score += loss.item() * Y.shape[0]

    
    display(epoch)
    display(get_loss_acc(nn_model, X_train, Y_train))
    display(get_loss_acc(nn_model, X_dev, Y_dev))

0

(-0.21234585829496244, 0.6987625220978196)

(-0.21147151558948685, 0.6341463414634146)

1

(-0.22606862114679835, 0.7084266352386565)

(-0.22413486314982903, 0.6489925768822906)

2

(-0.24118578127537604, 0.7084266352386565)

(-0.23807573430674706, 0.6504065040650406)

3

(-0.25767021444030697, 0.7105480259281084)

(-0.25327372663157616, 0.6514669494521033)

4

(-0.27537571211632517, 0.7124337065409546)

(-0.26960182536529037, 0.6532343584305408)

In [None]:
X.shape

In [43]:
newsgroups_train_text

["From: DSHAL@vmd.cso.uiuc.edu\nSubject: Re: Clintons views on Jerusalem\nOrganization: C.C.S.O.\nLines: 10\n\nIt seems that President Clinton can recognize Jerusalem as Israels capitol\nwhile still keeping his diplomatic rear door open by stating that the Parties\nconcerned should decide the city's final status. Even as I endorse Clintons vie\nw (of course), it is definitely a matter to be decided upon by Israel (and\nother participating neighboring contries).\nI see no real conflict in stating both views, nor expect any better from\npoliticians.\n-----\nDavid Shalhevet / dshal@vmd.cso.uiuc.edu / University of Illinois\nDept Anim Sci / 220 PABL / 1201 W. Gregory Dr. / Urbana, IL 61801\n",
 "From: mouse@thunder.mcrcim.mcgill.edu (der Mouse)\nSubject: Re: XWindows always opaque\nKeywords: xwindow, parent-child relation\nOrganization: McGill Research Centre for Intelligent Machines\nLines: 17\n\n> Distribution: comp\n\nPlease don't misuse newsgroup hierarchy names as distributions.\n\nIn

### Podejście softmax z embeddingami

In [None]:
# !pip install torchtext


In [None]:
!pip install spacy

In [45]:
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import Vocab
from collections import Counter


In [46]:
!python -m spacy download en_core_web_sm

[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


In [47]:
tokenizer = get_tokenizer('spacy', language='en')



In [48]:
tokenizer

functools.partial(<function _spacy_tokenize at 0x7fd9f95eec10>, spacy=<spacy.lang.en.English object at 0x7fd9f9bb3ca0>)

In [53]:
tokenizer(newsgroups_train_text[0])

['From',
 ':',
 'DSHAL@vmd.cso.uiuc.edu',
 '\n',
 'Subject',
 ':',
 'Re',
 ':',
 'Clintons',
 'views',
 'on',
 'Jerusalem',
 '\n',
 'Organization',
 ':',
 'C.C.S.O.',
 '\n',
 'Lines',
 ':',
 '10',
 '\n\n',
 'It',
 'seems',
 'that',
 'President',
 'Clinton',
 'can',
 'recognize',
 'Jerusalem',
 'as',
 'Israels',
 'capitol',
 '\n',
 'while',
 'still',
 'keeping',
 'his',
 'diplomatic',
 'rear',
 'door',
 'open',
 'by',
 'stating',
 'that',
 'the',
 'Parties',
 '\n',
 'concerned',
 'should',
 'decide',
 'the',
 'city',
 "'s",
 'final',
 'status',
 '.',
 'Even',
 'as',
 'I',
 'endorse',
 'Clintons',
 'vie',
 '\n',
 'w',
 '(',
 'of',
 'course',
 ')',
 ',',
 'it',
 'is',
 'definitely',
 'a',
 'matter',
 'to',
 'be',
 'decided',
 'upon',
 'by',
 'Israel',
 '(',
 'and',
 '\n',
 'other',
 'participating',
 'neighboring',
 'contries',
 ')',
 '.',
 '\n',
 'I',
 'see',
 'no',
 'real',
 'conflict',
 'in',
 'stating',
 'both',
 'views',
 ',',
 'nor',
 'expect',
 'any',
 'better',
 'from',
 '\n',
 'p

In [51]:
def build_vocab(dataset, tokenizer):
    counter = Counter()
    for document in newsgroups_train_text:
        counter.update(tokenizer(document))
    return Vocab(counter, specials=['<unk>', '<pad>', '<bos>', '<eos>'])

In [52]:
vocab = build_vocab(newsgroups_train_text, tokenizer)

In [57]:
[vocab[token] for token in tokenizer(newsgroups_train_text[0])]

[52,
 9,
 85832,
 4,
 53,
 9,
 76,
 9,
 18791,
 1332,
 31,
 3165,
 4,
 55,
 9,
 82668,
 4,
 54,
 9,
 246,
 17,
 92,
 337,
 26,
 672,
 762,
 62,
 2068,
 3165,
 44,
 23235,
 40764,
 4,
 286,
 224,
 2150,
 130,
 14406,
 2692,
 1461,
 640,
 56,
 5190,
 26,
 7,
 31543,
 4,
 1816,
 134,
 1827,
 7,
 1228,
 38,
 1312,
 2026,
 6,
 1045,
 44,
 21,
 24371,
 18791,
 44123,
 4,
 2027,
 20,
 12,
 312,
 16,
 5,
 27,
 22,
 2458,
 15,
 573,
 11,
 32,
 1251,
 902,
 56,
 412,
 20,
 19,
 4,
 112,
 7824,
 11282,
 58156,
 16,
 6,
 4,
 21,
 160,
 102,
 355,
 2644,
 24,
 5190,
 292,
 1332,
 5,
 1051,
 972,
 93,
 237,
 59,
 4,
 7044,
 6,
 4,
 2303,
 4,
 278,
 116400,
 41,
 131502,
 41,
 101,
 12,
 1231,
 4,
 652,
 48358,
 3638,
 41,
 8123,
 111224,
 41,
 22507,
 1598,
 3908,
 1178,
 41,
 1971,
 5,
 2720,
 29726,
 4]

In [69]:
def data_process(dataset):
    data = []
    for document in dataset:
        tensor_ = torch.tensor([vocab[token] for token in tokenizer(document)],
                            dtype=torch.long)
        data.append(tensor_)
    return data

In [70]:
processed = data_process(newsgroups_train_text)

In [72]:
len(processed)

8485

In [73]:
!pip install datasets

Collecting datasets
  Using cached datasets-1.6.2-py3-none-any.whl (221 kB)
Collecting dill
  Using cached dill-0.3.3-py2.py3-none-any.whl (81 kB)
Collecting pyarrow>=1.0.0<4.0.0
  Using cached pyarrow-4.0.0-cp38-cp38-manylinux2014_x86_64.whl (21.9 MB)
Collecting multiprocess
  Using cached multiprocess-0.70.11.1-py38-none-any.whl (126 kB)
Collecting huggingface-hub<0.1.0
  Using cached huggingface_hub-0.0.8-py3-none-any.whl (34 kB)
Collecting xxhash
  Using cached xxhash-2.0.2-cp38-cp38-manylinux2010_x86_64.whl (243 kB)
Installing collected packages: dill, pyarrow, multiprocess, huggingface-hub, xxhash, datasets
Successfully installed datasets-1.6.2 dill-0.3.3 huggingface-hub-0.0.8 multiprocess-0.70.11.1 pyarrow-4.0.0 xxhash-2.0.2


In [74]:
from datasets import load_dataset

In [76]:
dataset = load_dataset("conll2003")

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=2603.0, style=ProgressStyle(description…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1781.0, style=ProgressStyle(description…


Downloading and preparing dataset conll2003/conll2003 (download: 4.63 MiB, generated: 9.78 MiB, post-processed: Unknown size, total: 14.41 MiB) to /home/kuba/.cache/huggingface/datasets/conll2003/conll2003/1.0.0/40e7cb6bcc374f7c349c83acd1e9352a4f09474eb691f64f364ee62eb65d0ca6...


HBox(children=(FloatProgress(value=0.0, description='Downloading', max=649539.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=162714.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=145897.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=1.0, bar_style='info', layout=Layout(width='20px'), max=1.0), HTML(value=''…

HBox(children=(FloatProgress(value=1.0, bar_style='info', layout=Layout(width='20px'), max=1.0), HTML(value=''…

HBox(children=(FloatProgress(value=1.0, bar_style='info', layout=Layout(width='20px'), max=1.0), HTML(value=''…

Dataset conll2003 downloaded and prepared to /home/kuba/.cache/huggingface/datasets/conll2003/conll2003/1.0.0/40e7cb6bcc374f7c349c83acd1e9352a4f09474eb691f64f364ee62eb65d0ca6. Subsequent calls will reuse this data.


In [80]:
dataset['train']

Dataset({
    features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
    num_rows: 14041
})

In [84]:
dataset['train']['pos_tags']

[[22, 42, 16, 21, 35, 37, 16, 21, 7],
 [22, 22],
 [22, 11],
 [12,
  22,
  22,
  38,
  15,
  22,
  28,
  38,
  15,
  16,
  21,
  35,
  24,
  35,
  37,
  16,
  21,
  15,
  24,
  41,
  15,
  16,
  21,
  21,
  20,
  37,
  40,
  35,
  21,
  7],
 [22,
  27,
  21,
  35,
  12,
  22,
  22,
  27,
  16,
  21,
  22,
  22,
  38,
  15,
  22,
  24,
  20,
  37,
  21,
  15,
  24,
  16,
  15,
  22,
  15,
  12,
  16,
  21,
  38,
  17,
  7],
 [0,
  28,
  41,
  30,
  37,
  12,
  16,
  21,
  15,
  28,
  41,
  30,
  37,
  12,
  24,
  15,
  28,
  6,
  0,
  12,
  22,
  27,
  16,
  21,
  22,
  22,
  14,
  22,
  38,
  12,
  21,
  21,
  7],
 [28,
  38,
  16,
  16,
  21,
  38,
  40,
  10,
  15,
  28,
  38,
  40,
  15,
  21,
  38,
  40,
  28,
  20,
  37,
  40,
  15,
  12,
  22,
  22,
  7],
 [28,
  38,
  12,
  21,
  16,
  21,
  15,
  22,
  22,
  22,
  22,
  22,
  35,
  37,
  21,
  24,
  6,
  24,
  10,
  16,
  24,
  15,
  12,
  21,
  10,
  21,
  21,
  24,
  38,
  12,
  30,
  16,
  10,
  16,
  21,
  35,
  37,
  16,
  