## Uczenie głębokie – przetwarzanie tekstu – laboratoria
# 3. RNN

RNN - rekurencyjna sieć neuronowa \
NEP - oznaczanie jednostek nazewniczych / części mowy

IOB
- O - brak jednostki nazewniczej
- I - kontynuacja jednostki
- B - początek nowej jednostki

Typy jednostek
- LOC - lokalizacja
- PER - osoba
- ORG - organizacja

Jeśli jest to jednoznaczne to nawet na początku jednostki daje się "I" \
"B" używa się zazwyczaj przy sytuacjach w których mogło być to niejednoznaczne

## Zadanie 3

Sklonuj repozytorium https://git.wmi.amu.edu.pl/kubapok/en-ner-conll-2003

Stwórz model *sequence labelling* realizujący zadanie NER, oparty o dowolną rekurencyjną sieć neuronową (możesz wzorować się na przykładzie z zajęć).

W plikach dev-0/out.tsv oraz test-A/out.tsv umieść wyniki predykcji dla dev-0/in.tsv i test-A/in.tsv odpowiednio.
Do ewaluacji wykorzystaj narzędzie GEval (https://gitlab.com/filipg/geval):

    wget https://gonito.net/get/bin/geval
    chmod u+x geval
    ./geval --help

Liczba punktów uzyskanych za zadanie zależy od uzyskanej wartości accuracy na zbiorze `test-A` (wynik zaokrąglony w górę):

    points = math.ceil(accuracy * 7.0)

⚠️ W systemie Moodle proszę załączyć plik `test-A/out.tsv` oraz link do repozytorium z rozwiązaniem zadania.


### Podejście softmax z embeddingami na przykładzie NER

In [155]:
%pip install -v torch torchtext --index-url https://download.pytorch.org/whl/cu118
%pip install -v datasets
%pip install ipywidgets

from collections import Counter
from tqdm import tqdm

import torch
from torchtext.vocab import vocab
from tqdm.notebook import tqdm
import pandas as pd

Using pip 23.1.2 from /usr/local/lib/python3.10/dist-packages/pip (python 3.10)
Looking in indexes: https://download.pytorch.org/whl/cu118
Using pip 23.1.2 from /usr/local/lib/python3.10/dist-packages/pip (python 3.10)


In [156]:
def preprocess_data(train_data):
    tokenized_data = []
    labels = []

    for _, row in train_data.iterrows():
        row_tokens = row["x"].lower().split(" ")
        row_labels = row["y"].split(" ")

        tokenized_data.append(row_tokens)
        labels.append(row_labels)

    return tokenized_data, labels

train_data = pd.read_csv("train/train.tsv", sep="\t", header=None, names=["y", "x"])
training_data, training_labels = preprocess_data(train_data)

print(training_data[1])
print(len(training_data[1]))
print(training_labels[1])
print(len(training_labels[1]))

['rare', 'hendrix', 'song', 'draft', 'sells', 'for', 'almost', '$', '17,000', '.', '</s>', 'london', '1996-08-22', '</s>', 'a', 'rare', 'early', 'handwritten', 'draft', 'of', 'a', 'song', 'by', 'u.s.', 'guitar', 'legend', 'jimi', 'hendrix', 'was', 'sold', 'for', 'almost', '$', '17,000', 'on', 'thursday', 'at', 'an', 'auction', 'of', 'some', 'of', 'the', 'late', 'musician', "'s", 'favourite', 'possessions', '.', '</s>', 'a', 'florida', 'restaurant', 'paid', '10,925', 'pounds', '(', '$', '16,935', ')', 'for', 'the', 'draft', 'of', '"', 'ai', "n't", 'no', 'telling', '"', ',', 'which', 'hendrix', 'penned', 'on', 'a', 'piece', 'of', 'london', 'hotel', 'stationery', 'in', 'late', '1966', '.', '</s>', 'at', 'the', 'end', 'of', 'a', 'january', '1967', 'concert', 'in', 'the', 'english', 'city', 'of', 'nottingham', 'he', 'threw', 'the', 'sheet', 'of', 'paper', 'into', 'the', 'audience', ',', 'where', 'it', 'was', 'retrieved', 'by', 'a', 'fan', '.', '</s>', 'buyers', 'also', 'snapped', 'up', '16'

In [157]:
def preprocess_data(test_data):
    tokenized_data = []

    for _, row in test_data.iterrows():
        row_tokens = row["x"].lower().split(" ")

        tokenized_data.append(row_tokens)

    return tokenized_data

test_data = pd.read_csv("test-A/in.tsv", sep="\t", header=None, names=["x"])
testing_data = preprocess_data(test_data)

print(testing_data[1])
print(len(testing_data[1]))

['rugby', 'union', '-', 'cuttitta', 'back', 'for', 'italy', 'after', 'a', 'year', '.', '</s>', 'rome', '1996-12-06', '</s>', 'italy', 'recalled', 'marcello', 'cuttitta', '</s>', 'on', 'friday', 'for', 'their', 'friendly', 'against', 'scotland', 'at', 'murrayfield', 'more', 'than', 'a', 'year', 'after', 'the', '30-year-old', 'wing', 'announced', 'he', 'was', 'retiring', 'following', 'differences', 'over', 'selection', '.', '</s>', 'cuttitta', ',', 'who', 'trainer', 'george', 'coste', 'said', 'was', 'certain', 'to', 'play', 'on', 'saturday', 'week', ',', 'was', 'named', 'in', 'a', '21-man', 'squad', 'lacking', 'only', 'two', 'of', 'the', 'team', 'beaten', '54-21', 'by', 'england', 'at', 'twickenham', 'last', 'month', '.', '</s>', 'stefano', 'bordon', 'is', 'out', 'through', 'illness', 'and', 'coste', 'said', 'he', 'had', 'dropped', 'back', 'row', 'corrado', 'covi', ',', 'who', 'had', 'been', 'recalled', 'for', 'the', 'england', 'game', 'after', 'five', 'years', 'out', 'of', 'the', 'natio

In [158]:
def preprocess_data(validate_data):
    tokenized_data = []

    for _, row in validate_data.iterrows():
        row_tokens = row["x"].lower().split(" ")

        tokenized_data.append(row_tokens)

    return tokenized_data

validate_data = pd.read_csv("dev-0/in.tsv", sep="\t", header=None, names=["x"])
validation_data = preprocess_data(validate_data)

print(validation_data[1])

['cricket', '-', 'english', 'county', 'championship', 'scores', '.', '</s>', 'london', '1996-08-30', '</s>', 'result', 'and', 'close', 'of', 'play', 'scores', 'in', 'english', 'county', 'championship', 'matches', 'on', 'friday', ':', '</s>', 'leicester', ':', 'leicestershire', 'beat', 'somerset', 'by', 'an', 'innings', 'and', '39', 'runs', '.', '</s>', 'somerset', '83', 'and', '174', '(', 'p.', 'simmons', '4-38', ')', ',', 'leicestershire', '296', '.', '</s>', 'leicestershire', '22', 'points', ',', 'somerset', '4', '.', '</s>', 'chester-le-street', ':', 'glamorgan', '259', 'and', '207', '(', 'a.', 'dale', '69', ',', 'h.', 'morris', '69', ';', 'd.', 'blenkiron', '4-43', ')', ',', 'durham', '114', '(', 's.', 'watkin', '4-28', ')', 'and', '81-3', '.', '</s>', 'tunbridge', 'wells', ':', 'nottinghamshire', '214', '(', 'p.', 'johnson', '84', ';', 'm.', 'mccague', '4-55', ')', ',', 'kent', '108-3', '.', '</s>', 'london', '(', 'the', 'oval', ')', ':', 'warwickshire', '195', ',', 'surrey', '429

In [159]:
def preprocess_labels(validate_labels):
    labels = []

    for _, row in validate_labels.iterrows():
        row_labels = row["y"].split(" ")

        labels.append(row_labels)

    return labels

validate_labels = pd.read_csv("dev-0/expected.tsv", sep="\t", header=None, names=["y"])
validation_labels = preprocess_labels(validate_labels)

print(validation_labels[1])

['O', 'O', 'B-MISC', 'I-MISC', 'I-MISC', 'O', 'O', 'O', 'B-LOC', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-MISC', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-LOC', 'O', 'B-ORG', 'O', 'B-ORG', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-ORG', 'O', 'O', 'O', 'O', 'B-PER', 'I-PER', 'O', 'O', 'O', 'B-ORG', 'O', 'O', 'O', 'B-ORG', 'O', 'O', 'O', 'B-ORG', 'O', 'O', 'O', 'B-LOC', 'O', 'B-ORG', 'O', 'O', 'O', 'O', 'B-PER', 'I-PER', 'O', 'O', 'B-PER', 'I-PER', 'O', 'O', 'B-PER', 'I-PER', 'O', 'O', 'O', 'B-ORG', 'O', 'O', 'B-PER', 'I-PER', 'O', 'O', 'O', 'O', 'O', 'O', 'B-LOC', 'I-LOC', 'O', 'B-ORG', 'O', 'O', 'B-PER', 'I-PER', 'O', 'O', 'B-PER', 'I-PER', 'O', 'O', 'O', 'B-ORG', 'O', 'O', 'O', 'B-LOC', 'O', 'B-LOC', 'I-LOC', 'O', 'O', 'B-ORG', 'O', 'O', 'B-ORG', 'O', 'O', 'B-PER', 'I-PER', 'O', 'O', 'O', 'O', 'B-PER', 'I-PER', 'O', 'O', 'B-PER', 'I-PER', 'O', 'O', 'B-PER', 'I-PER', 'O', 'O', 'B-PER', 'I-PER', 'O', 'O', 'O', 'O', 'B-LOC', 'O', 'B-ORG', 'O', 'O', 'B-PER', 'I-PER', 'O', 'O', 'B-PER'

In [160]:
def create_ner_labels(training_labels):
    ner_labels = {}
    i = 0

    for labels in training_labels:
        for label in labels:
            if label not in ner_labels:
                ner_labels[label] = i
                i += 1

    return ner_labels

def reverse_ner_labels(ner_labels):
    return {v: k for k, v in ner_labels.items()}

NER_labels = create_ner_labels(training_labels)
NER_labels_reverse = reverse_ner_labels(NER_labels)

print(NER_labels)
print(NER_labels_reverse)

{'B-ORG': 0, 'O': 1, 'B-MISC': 2, 'B-PER': 3, 'I-PER': 4, 'B-LOC': 5, 'I-ORG': 6, 'I-MISC': 7, 'I-LOC': 8}
{0: 'B-ORG', 1: 'O', 2: 'B-MISC', 3: 'B-PER', 4: 'I-PER', 5: 'B-LOC', 6: 'I-ORG', 7: 'I-MISC', 8: 'I-LOC'}


In [161]:
def build_vocab(dataset):
    word_counter = Counter()

    for document in dataset:
        word_counter.update(document)

    return vocab(word_counter, specials=["<unk>", "<pad>", "<bos>", "<eos>"])

v = build_vocab(training_data)
itos = v.get_itos()
print(itos)



In [162]:
len(itos)  # liczba różnych tokenów w słowniku

21014

In [163]:
v["rejects"]  # indeks tokenu `on`

5

In [164]:
v["<unk>"]  # indeks nieznanego tokenu

0

In [165]:
v.set_default_index(v["<unk>"])
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
def data_process(dt):
    return [
        torch.tensor(
            [v["<bos>"]] + [v[token] for token in document] + [v["<eos>"]],
            dtype=torch.long,
            device=device
        )
        for document in dt
    ]

def labels_process(dt):
    return [torch.tensor([0] + document + [0], dtype=torch.long, device=device) for document in dt]

def get_NER_tags(labels_array):
    ner_tags = []

    for labels in labels_array:
        ner_tags.append([NER_labels[label] for label in labels])

    return ner_tags

def get_labels(ner_tags_array):
    labels = []

    for ner_tags in ner_tags_array:
        labels.append([NER_labels_reverse[tag] for tag in ner_tags])

    return labels

training_NER_tags = get_NER_tags(training_labels)
training_labels_from_NER_tags = get_labels(training_NER_tags)
print(training_NER_tags[1])
print(training_labels[1])
print(training_labels_from_NER_tags[1])
print(len(training_NER_tags[1]))
print(len(training_labels[1]))

[1, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 5, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 5, 1, 1, 3, 4, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 5, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 7, 7, 7, 1, 1, 1, 3, 1, 1, 1, 1, 1, 5, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 5, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 3, 1, 1, 1, 3, 4, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 3, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
['O', 'B-PER', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-LOC', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-LOC', 'O', 'O', 'B-PER', 'I-PER', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-LOC', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-MISC', 'I-MISC', 'I-MISC', 'I-MISC', 'O', 'O', 'O', 'B-

In [166]:
validation_NER_tags = get_NER_tags(validation_labels)
print(validation_NER_tags[1])
print(validation_labels[1])
print(len(validation_NER_tags[1]))
print(len(validation_labels[1]))

[1, 1, 2, 7, 7, 1, 1, 1, 5, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 5, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 3, 4, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 5, 1, 0, 1, 1, 1, 1, 3, 4, 1, 1, 3, 4, 1, 1, 3, 4, 1, 1, 1, 0, 1, 1, 3, 4, 1, 1, 1, 1, 1, 1, 5, 8, 1, 0, 1, 1, 3, 4, 1, 1, 3, 4, 1, 1, 1, 0, 1, 1, 1, 5, 1, 5, 8, 1, 1, 0, 1, 1, 0, 1, 1, 3, 4, 1, 1, 1, 1, 3, 4, 1, 1, 3, 4, 1, 1, 3, 4, 1, 1, 3, 4, 1, 1, 1, 1, 5, 1, 0, 1, 1, 3, 4, 1, 1, 3, 4, 1, 1, 3, 4, 1, 1, 1, 0, 1, 1, 3, 4, 1, 1, 1, 5, 1, 0, 1, 1, 1, 1, 3, 4, 1, 1, 3, 4, 1, 1, 3, 4, 1, 1, 1, 0, 1, 1, 1, 1, 1, 5, 1, 0, 1, 1, 1, 1, 0, 1, 1, 3, 4, 1, 1, 3, 1, 1, 1, 1, 3, 4, 1, 1, 3, 4, 1, 1, 1, 5, 1, 0, 1, 1, 1, 1, 3, 4, 1, 1, 1, 1, 1, 0, 1, 1, 3, 4, 1, 1, 3, 4, 1, 1, 1, 1]
['O', 'O', 'B-MISC', 'I-MISC', 'I-MISC', 'O', 'O', 'O', 'B-LOC', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-MISC', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-LOC', 'O', 'B-ORG', 'O', 'B-ORG', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-OR

In [167]:
train_tokens_ids = data_process(training_data)
test_tokens_ids = data_process(testing_data)
validation_tokens_ids = data_process(validation_data)
train_labels = labels_process(training_NER_tags)
validation_labels = labels_process(validation_NER_tags)
print(train_tokens_ids[1])
print(training_data[1])

tensor([  2, 257, 258, 259, 260, 261,  72, 262, 263, 264,  12,  13, 265,  17,
         13,  81, 257, 177, 266, 260, 163,  81, 259,  95, 267, 268, 269, 270,
        258,  59, 271,  72, 262, 263, 264,  22,  23, 158, 159, 272, 163, 253,
        163,  18, 273, 274,  42, 275, 276,  12,  13,  81, 277, 278, 279, 280,
        281, 132, 263, 282, 134,  72,  18, 260, 163,  61, 283,  64, 284, 285,
         61,  73, 189, 258, 286,  22,  81, 287, 163, 265, 288, 289, 233, 273,
        290,  12,  13, 158,  18, 291, 163,  81, 292, 293, 294, 233,  18, 295,
        296, 163, 297,  84, 298,  18, 299, 163, 300, 301,  18, 302,  73, 303,
         24,  59, 304,  95,  81, 305,  12,  13, 306, 307, 308, 309, 310,  55,
        311,  91, 312, 313, 309,  72, 272,  95, 258,  42, 314, 315, 316, 317,
         73, 318, 319,  26, 320,  53, 290,   8, 321,  12,  13, 322, 323,  81,
        324, 325,  88, 326, 163, 327, 328, 329, 330,  95, 258,   8, 331, 139,
        332,  73, 189, 159, 333, 334, 335, 336,  72, 337, 281, 1

In [168]:
train_tokens_ids[0]

tensor([  2,   4,   5,   6,   7,   8,   9,  10,  11,  12,  13,  14,  15,  13,
         16,  17,  13,  18,  19,  20,  21,  22,  23,  24,  25,  26,   6,  27,
          8,  28,   8,  29,  10,  11,  30,  31,  32,  33,  34,  35,  36,  37,
         38,  39,   8,  40,  12,  13,  41,  42,  43,   8,  18,  19,  44,  42,
         45,  46,  47,  48,  21,  22,  49,  28,  50,  51,  52,  53,  54,  55,
         56,  57,  30,  18,  58,  27,  59,  60,  12,  13,  61,  62,  63,  64,
         65,  66,  67,  68,  69,  62,  63,  64,  70,  66,  71,  72,  24,  73,
         61,  18,  20,  42,  74,  75,  76,  77,  78,  79,  80,  81,  82,  83,
         12,  13,  84,  21,  85,  58,  86,  59,  87,  88,  89,  24,  59,  90,
         91,  92,  59,  93,  24,  50,  38,  94,  95,  18,  19,  44,  12,  13,
         84,  21,  81,  96,  97,  98,  95,   4,  99, 100, 101, 102,   8, 103,
         40, 104,  73, 105,  88, 106, 107,  53,  18, 108,  88, 109, 110, 111,
         59,  81, 112, 113,  88, 114, 115,   8, 116, 108, 117,  

In [169]:
train_labels[0]

tensor([0, 0, 1, 2, 1, 1, 1, 2, 1, 1, 1, 3, 4, 1, 5, 1, 1, 1, 0, 6, 1, 1, 1, 1,
        1, 1, 2, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        5, 1, 1, 1, 1, 0, 6, 1, 1, 1, 3, 4, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 5,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 0, 1, 1, 1, 3, 4, 4, 4, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 6, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 0, 1, 1, 3, 4, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 3, 1, 2, 1, 1, 1, 1, 5, 1, 5, 1,
        1, 1, 1, 1, 1, 1, 2, 7, 7, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 3, 1, 1, 1, 1,
        1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 3, 4, 4, 1, 1, 1, 3, 1, 1, 0, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 5, 1, 5, 1, 3, 1, 1, 1, 1,
        1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1,

In [170]:
def get_scores(true_labels, predicted_labels):
    correct_predictions = 0
    true_positives = 0
    selected_items = 0
    relevant_items = 0

    for pred, true in zip(predicted_labels, true_labels):
        if pred == true:
            correct_predictions += 1

        if pred > 0 and pred == true:
            true_positives += 1

        if pred > 0:
            selected_items += 1

        if true > 0:
            relevant_items += 1

    precision = true_positives / selected_items if selected_items > 0 else 1.0
    recall = true_positives / relevant_items if relevant_items > 0 else 1.0
    f1_score = 2 * precision * recall / (precision + recall) if precision + recall > 0 else 0.0

    return precision, recall, f1_score

num_tags = max([max(x) for x in training_NER_tags]) + 1
print(num_tags)

9


In [171]:
class LSTM(torch.nn.Module):

    def __init__(self):
        super(LSTM, self).__init__()
        self.emb = torch.nn.Embedding(len(v.get_itos()), 100, device=device)
        self.rec = torch.nn.LSTM(100, 256, 1, batch_first=True, device=device)
        self.fc1 = torch.nn.Linear(256, num_tags, device=device)

    def forward(self, x):
        emb = torch.relu(self.emb(x))
        lstm_output, (h_n, c_n) = self.rec(emb)
        out_weights = self.fc1(lstm_output)
        return out_weights

lstm = LSTM()
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(lstm.parameters())

def eval_model(tokens_dataset, labels_dataset, model):
    true_labels = []
    predicted_labels = []

    for index in tqdm(range(len(labels_dataset)), desc="Evaluating"):
        token_batch = tokens_dataset[index].unsqueeze(0)
        true_batch_labels = labels_dataset[index].cpu().numpy().tolist()
        true_labels.extend(true_batch_labels)

        prediction_weights = model(token_batch).squeeze(0)
        batch_predictions = torch.argmax(prediction_weights, dim=1).cpu().numpy().tolist()
        predicted_labels.extend(batch_predictions)

    return get_scores(true_labels, predicted_labels)

def make_predictions(tokens_dataset, model):
    predictions = []
    total_batches = len(tokens_dataset)

    for index in tqdm(range(total_batches), desc="Predicting"):
        batch_tokens = tokens_dataset[index]

        prediction_weights = model(batch_tokens)
        batch_predictions = torch.argmax(prediction_weights, dim=1)
        predictions.append(batch_predictions.cpu().numpy().tolist())

    return predictions

In [172]:
NUM_EPOCHS = 15

def train_one_epoch(epoch, lstm, train_tokens_ids, train_labels, optimizer, criterion):
    lstm.train()
    for i in tqdm(range(len(train_labels)), desc=f"Epoch {epoch + 1}/{NUM_EPOCHS}"):
        batch_tokens = train_tokens_ids[i].unsqueeze(0)
        tags = train_labels[i].unsqueeze(1)

        predicted_tags = lstm(batch_tokens)

        optimizer.zero_grad()
        loss = criterion(predicted_tags.squeeze(0), tags.squeeze(1))

        loss.backward()
        optimizer.step()

def check_model(validation_tokens_ids, validation_labels, lstm):
    lstm.eval()
    evaluation_result = eval_model(validation_tokens_ids, validation_labels, lstm)
    print(evaluation_result)
    return evaluation_result

for epoch in range(NUM_EPOCHS):
    train_one_epoch(epoch, lstm, train_tokens_ids, train_labels, optimizer, criterion)
    check_model(validation_tokens_ids, validation_labels, lstm)

Epoch 1/15:   0%|          | 0/945 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/215 [00:00<?, ?it/s]

(0.8603764505561622, 0.8756592827004219, 0.8679505969995985)


Epoch 2/15:   0%|          | 0/945 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/215 [00:00<?, ?it/s]

(0.8984811403590711, 0.9059109403254972, 0.902180743797777)


Epoch 3/15:   0%|          | 0/945 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/215 [00:00<?, ?it/s]

(0.9110740768481762, 0.9164971368294154, 0.9137775607557376)


Epoch 4/15:   0%|          | 0/945 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/215 [00:00<?, ?it/s]

(0.9122724713829987, 0.915743670886076, 0.9140047754234899)


Epoch 5/15:   0%|          | 0/945 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/215 [00:00<?, ?it/s]

(0.9147061034275406, 0.916911543098252, 0.9158074954846478)


Epoch 6/15:   0%|          | 0/945 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/215 [00:00<?, ?it/s]

(0.9111211284536354, 0.9137469861362266, 0.9124321680820848)


Epoch 7/15:   0%|          | 0/945 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/215 [00:00<?, ?it/s]

(0.9146077156480278, 0.9141425557564798, 0.9143750765433495)


Epoch 8/15:   0%|          | 0/945 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/215 [00:00<?, ?it/s]

(0.9203531493543164, 0.9209425858951176, 0.9206477732793523)


Epoch 9/15:   0%|          | 0/945 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/215 [00:00<?, ?it/s]

(0.9173549054651491, 0.9185126582278481, 0.9179334167898121)


Epoch 10/15:   0%|          | 0/945 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/215 [00:00<?, ?it/s]

(0.9211875589066918, 0.9205281796262809, 0.9208577512295313)


Epoch 11/15:   0%|          | 0/945 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/215 [00:00<?, ?it/s]

(0.918977786957583, 0.9219032549728752, 0.9204381964361277)


Epoch 12/15:   0%|          | 0/945 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/215 [00:00<?, ?it/s]

(0.9238502754794857, 0.9191342676311031, 0.9214862376658325)


Epoch 13/15:   0%|          | 0/945 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/215 [00:00<?, ?it/s]

(0.9193953708077468, 0.9165724834237492, 0.9179817569543358)


Epoch 14/15:   0%|          | 0/945 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/215 [00:00<?, ?it/s]

(0.9341066844157079, 0.9252561784207354, 0.9296603673596852)


Epoch 15/15:   0%|          | 0/945 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/215 [00:00<?, ?it/s]

(0.9204113149271159, 0.9205846895720313, 0.9204979940858493)


Ewaluacja:

In [173]:
eval_model(validation_tokens_ids, validation_labels, lstm)

Evaluating:   0%|          | 0/215 [00:00<?, ?it/s]

(0.9204113149271159, 0.9205846895720313, 0.9204979940858493)

In [174]:
validation_NER_tags_predicted = make_predictions(validation_tokens_ids, lstm)
print(validation_NER_tags_predicted[1])
validation_labels_predicted = labels_process(validation_NER_tags_predicted)
print(validation_labels_predicted[1])
validation_labels_from_NER_tags = get_labels(validation_NER_tags_predicted)
print(validation_labels_from_NER_tags[1])

Predicting:   0%|          | 0/215 [00:00<?, ?it/s]

[0, 1, 1, 2, 2, 7, 1, 1, 1, 5, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 5, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 2, 6, 6, 1, 1, 0, 6, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 5, 1, 0, 1, 1, 0, 1, 3, 4, 1, 1, 3, 4, 1, 1, 3, 4, 4, 1, 1, 0, 1, 1, 5, 4, 1, 1, 1, 1, 1, 1, 5, 8, 1, 0, 6, 1, 3, 4, 1, 1, 3, 4, 4, 1, 1, 3, 4, 1, 1, 5, 1, 1, 8, 1, 1, 0, 1, 1, 0, 4, 1, 3, 4, 1, 1, 1, 1, 3, 4, 1, 1, 3, 4, 1, 1, 3, 4, 1, 1, 3, 4, 1, 1, 1, 1, 5, 1, 0, 1, 1, 3, 4, 1, 1, 3, 4, 1, 1, 3, 4, 4, 1, 1, 0, 4, 1, 5, 8, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 3, 4, 1, 1, 3, 4, 1, 1, 3, 4, 1, 1, 1, 0, 1, 1, 3, 1, 1, 5, 1, 0, 1, 1, 3, 1, 0, 6, 1, 3, 4, 7, 1, 5, 1, 1, 1, 1, 3, 4, 1, 1, 3, 4, 3, 1, 1, 0, 1, 0, 1, 1, 3, 1, 3, 4, 1, 1, 1, 1, 1, 3, 1, 1, 3, 4, 1, 1, 3, 4, 1, 1, 1, 1, 0]
tensor([0, 0, 1, 1, 2, 2, 7, 1, 1, 1, 5, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1,
        1, 1, 1, 1, 5, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 2, 6,
        6, 1, 1, 0, 6, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 5, 1, 0, 1, 1, 0, 

In [175]:
with open("dev-0/out.tsv", "w") as f:
    for row in validation_labels_from_NER_tags:
        f.write(" ".join(row[1:-1]) + "\n")

testing_NER_tags = make_predictions(test_tokens_ids, lstm)
testing_labels_from_NER_tags = get_labels(testing_NER_tags)
test_labels = labels_process(testing_NER_tags)

Predicting:   0%|          | 0/230 [00:00<?, ?it/s]

In [176]:
eval_model(test_tokens_ids, test_labels, lstm)

Evaluating:   0%|          | 0/230 [00:00<?, ?it/s]

(0.6426245151119132, 0.648245412124129, 0.6454227259469145)

In [177]:
with open("test-A/out.tsv", "w") as f:
    for row in testing_labels_from_NER_tags:
        f.write(" ".join(row[1:-1]) + "\n")

print(testing_data[0])
print(testing_NER_tags[0][1:-1])
print(testing_labels_from_NER_tags[0][1:-1])

['soccer', '-', 'japan', 'get', 'lucky', 'win', ',', 'china', 'in', 'surprise', 'defeat', '.', '</s>', 'nadim', 'ladki', '</s>', 'al-ain', ',', 'united', 'arab', 'emirates', '1996-12-06', '</s>', 'japan', 'began', 'the', 'defence', 'of', 'their', 'asian', 'cup', 'title', 'with', 'a', 'lucky', '2-1', 'win', 'against', 'syria', 'in', 'a', 'group', 'c', 'championship', 'match', 'on', 'friday', '.', '</s>', 'but', 'china', 'saw', 'their', 'luck', 'desert', 'them', 'in', 'the', 'second', 'match', 'of', 'the', 'group', ',', 'crashing', 'to', 'a', 'surprise', '2-0', 'defeat', 'to', 'newcomers', 'uzbekistan', '.', '</s>', 'china', 'controlled', 'most', 'of', 'the', 'match', 'and', 'saw', 'several', 'chances', 'missed', 'until', 'the', '78th', 'minute', 'when', 'uzbek', 'striker', 'igor', 'shkvyrin', 'took', 'advantage', 'of', 'a', 'misdirected', 'defensive', 'header', 'to', 'lob', 'the', 'ball', 'over', 'the', 'advancing', 'chinese', 'keeper', 'and', 'into', 'an', 'empty', 'net', '.', '</s>', 