en-ner-conll-2003/rnn_fras.ipynb at master

s470619/en-ner-conll-2003

Fork 0

forked from kubapok/en-ner-conll-2003

Zofia Fraś 499702ff9c add script

2021-06-22 20:21:17 +02:00

34 KiB

Raw Permalink Blame History

Zadanie domowe

sklonować repozytorium https://git.wmi.amu.edu.pl/kubapok/en-ner-conll-2003
stworzyć model seq labelling bazujący na sieci neuronowej opisanej w punkcie niżej (można bazować na tym jupyterze lub nie).
model sieci to GRU (o dowolnych parametrach) + CRF w PyTorchu, korzystając z modułu CRF z poprzednich zajęć
stworzyć predykcje w plikach dev-0/out.tsv oraz test-A/out.tsv
wynik F-score sprawdzony za pomocą narzędzia geval (patrz poprzednie zadanie) powinien wynosić co najmniej 0.65
proszę umieścić predykcję oraz skrypty generujące (w postaci tekstowej a nie jupyter) w repo, a w MS TEAMS umieścić link do swojego repo termin 22.06, 60 punktów, za najlepszy wynik- 100 punktów

import numpy as np
import torch
from torchtext.vocab import Vocab
from collections import Counter
from tqdm.notebook import tqdm
import lzma
import itertools
from torchcrf import CRF

def read_data(filename):
    """Read an xz-compressed TSV file into a list of rows.

    Args:
        filename: Path to the ``.xz`` file; its content is decoded as UTF-8.

    Returns:
        A list with one entry per line of the file, each entry being the list
        of tab-separated fields found on that line.
    """
    # The context manager closes the archive even if reading or decoding fails.
    with lzma.open(filename) as archive:
        lines = archive.read().decode('UTF-8').split('\n')
    # A trailing newline leaves one empty chunk at the end; drop only that
    # chunk so a file without a trailing newline keeps its last record.
    if lines and lines[-1] == '':
        lines.pop()
    return [line.split('\t') for line in lines]

def data_process(dt):
    """Encode tokenized documents as tensors of vocabulary ids.

    Every document is looked up token by token in the module-level ``vocab``
    and wrapped in the ids of ``<bos>`` and ``<eos>``.

    Args:
        dt: Iterable of documents, each an iterable of token strings.

    Returns:
        A list with one 1-D ``torch.long`` tensor per document.
    """
    encoded_docs = []
    for document in dt:
        ids = [vocab['<bos>']]
        ids.extend(vocab[token] for token in document)
        ids.append(vocab['<eos>'])
        encoded_docs.append(torch.tensor(ids, dtype=torch.long))
    return encoded_docs

def labels_process(dt):
    """Turn per-document label-id lists into padded ``torch.long`` tensors.

    A label id of 0 is added before and after every document, which keeps the
    label sequence two items longer than the input list (the same growth that
    ``data_process`` applies to tokens with ``<bos>``/``<eos>``).

    Args:
        dt: Iterable of documents, each a list of integer label ids.

    Returns:
        A list with one 1-D ``torch.long`` tensor per document.
    """
    padded_docs = []
    for document in dt:
        with_boundaries = [0] + document + [0]
        padded_docs.append(torch.tensor(with_boundaries, dtype=torch.long))
    return padded_docs

def build_vocab(dataset):
    """Build a torchtext ``Vocab`` from the token frequencies of ``dataset``.

    Args:
        dataset: Iterable of documents, each an iterable of tokens.

    Returns:
        A ``Vocab`` built from the token counts, with the special tokens
        ``<unk>``, ``<pad>``, ``<bos>`` and ``<eos>`` passed as ``specials``.
    """
    token_counts = Counter()
    for doc_tokens in dataset:
        token_counts.update(doc_tokens)
    special_tokens = ['<unk>', '<pad>', '<bos>', '<eos>']
    return Vocab(token_counts, specials=special_tokens)

# Each row of the training file carries the space-separated NER tags in
# column 0 and the space-separated tokens in column 1.
train_data = read_data('train/train.tsv.xz')

ner_tags = [row[0].split() for row in train_data]
tokens = [row[1].split() for row in train_data]

vocab = build_vocab(tokens)
train_tokens_ids = data_process(tokens)

# Sorted list of the distinct tag strings; a tag's position is its integer id.
ner_tags_set = sorted(set(itertools.chain(*ner_tags)))
print(ner_tags_set)

train_labels = labels_process(
    [[ner_tags_set.index(tag) for tag in doc_tags] for doc_tags in ner_tags]
)

['B-LOC', 'B-MISC', 'B-ORG', 'B-PER', 'I-LOC', 'I-MISC', 'I-ORG', 'I-PER', 'O']

# Number of label ids (largest id seen in the training labels + 1). Tensor.max
# plus int() gives a plain Python int instead of the 0-dim tensor that the
# builtin max() returns when it iterates over tensor elements.
num_tags = max(int(labels.max()) for labels in train_labels) + 1

class GRU(torch.nn.Module):
    """Bidirectional GRU tagger producing per-token emission scores.

    The pipeline is: embedding -> ReLU -> dropout -> bidirectional GRU ->
    linear projection onto the tag set.

    Args:
        vocab_size: Number of embedding rows. ``None`` (default) uses
            ``len(vocab.itos)`` from the module-level ``vocab``.
        num_tags: Size of the output layer (default 9).
        emb_dim: Embedding dimension (default 100).
        hidden_dim: GRU hidden size per direction (default 256).
        num_layers: Number of stacked GRU layers (default 2).
        dropout: Dropout probability applied to the embeddings (default 0.2).
    """

    def __init__(self, vocab_size=None, num_tags=9, emb_dim=100,
                 hidden_dim=256, num_layers=2, dropout=0.2):
        super().__init__()
        if vocab_size is None:
            vocab_size = len(vocab.itos)
        self.emb = torch.nn.Embedding(vocab_size, emb_dim)
        self.dropout = torch.nn.Dropout(dropout)
        # The callers in this file pass token tensors of shape (seq_len, 1)
        # (built with ``unsqueeze(1)``), i.e. time-major with a batch of one.
        # The recurrent layer must therefore be time-major as well: with
        # batch_first=True it would read the input as seq_len independent
        # sequences of length 1 and never carry state between tokens.
        self.rec = torch.nn.GRU(emb_dim, hidden_dim, num_layers,
                                batch_first=False, bidirectional=True)
        # Both directions are concatenated, hence 2 * hidden_dim inputs.
        self.fc1 = torch.nn.Linear(2 * hidden_dim, num_tags)

    def forward(self, x):
        """Return emission scores for a batch of token ids.

        Args:
            x: Long tensor of token ids shaped ``(seq_len, batch)``.

        Returns:
            Float tensor shaped ``(seq_len, batch, num_tags)``.
        """
        emb = torch.relu(self.emb(x))
        emb = self.dropout(emb)
        gru_output, h_n = self.rec(emb)
        out_weights = self.fc1(gru_output)
        return out_weights

def get_scores(y_true, y_pred):
    """Compute precision, recall and F1 over two flat label-id sequences.

    A label counts as "positive" when its id is greater than 0. A true
    positive is a positive prediction that equals the gold label at the same
    position. The sequences are paired with ``zip``, so extra items of the
    longer one are ignored.

    NOTE(review): with the alphabetically sorted tag list built in this file,
    id 0 is 'B-LOC' and 'O' has a positive id, so "positive" here does not
    mean "entity token" — confirm this is the intended metric.

    Args:
        y_true: Gold label ids.
        y_pred: Predicted label ids.

    Returns:
        Tuple ``(precision, recall, f1)``. Precision is 1.0 when nothing was
        predicted positive, recall is 1.0 when there is no positive gold
        label, and F1 is 0.0 when precision + recall is 0.
    """
    tp = 0
    selected_items = 0
    relevant_items = 0

    for p, t in zip(y_pred, y_true):
        if p > 0:
            selected_items += 1
            if p == t:
                tp += 1
        if t > 0:
            relevant_items += 1

    precision = tp / selected_items if selected_items else 1.0
    recall = tp / relevant_items if relevant_items else 1.0

    if precision + recall == 0.0:
        f1 = 0.0
    else:
        f1 = 2 * precision * recall / (precision + recall)

    return precision, recall, f1

def eval_model(dataset_tokens, dataset_labels, model):
    """Score ``model`` plus the module-level ``crf`` on a labelled dataset.

    The function does not switch train/eval mode; callers are expected to do
    that before calling it.

    Args:
        dataset_tokens: Sequence of 1-D token-id tensors, one per document.
        dataset_labels: Sequence of 1-D label-id tensors aligned with
            ``dataset_tokens``.
        model: Module mapping a ``(seq_len, 1)`` token tensor to emission
            scores. It is now actually used; previously the global ``gru``
            was called regardless of this argument.

    Returns:
        The ``(precision, recall, f1)`` tuple produced by ``get_scores`` over
        all tokens of all documents.
    """
    Y_true = []
    Y_pred = []
    # Inference only: no gradients are needed, so skip autograd bookkeeping.
    with torch.no_grad():
        for i, label_ids in enumerate(tqdm(dataset_labels)):
            batch_tokens = dataset_tokens[i].unsqueeze(1)
            emissions = model(batch_tokens).squeeze(0)
            # decode() returns one tag list per batch item; the batch holds a
            # single document here.
            Y_pred += crf.decode(emissions)[0]
            Y_true += list(label_ids.numpy())
    return get_scores(Y_true, Y_pred)

gru = GRU()
crf = CRF(num_tags)

# The tagger and the CRF are optimized jointly by a single Adam optimizer.
params = list(gru.parameters()) + list(crf.parameters())
optimizer = torch.optim.Adam(params)

NUM_EPOCHS = 20

# NOTE(review): not used by the loop below, which minimizes the negated CRF
# output instead; kept so any other code referring to it keeps working.
criterion = torch.nn.CrossEntropyLoss()

# The epoch and document loops use distinct variable names; previously both
# were called `i`, so the inner loop clobbered the epoch counter.
for epoch in range(NUM_EPOCHS):
    gru.train()
    crf.train()
    for token_ids, label_ids in tqdm(zip(train_tokens_ids, train_labels),
                                     total=len(train_labels)):
        # One document per step, shaped (seq_len, 1).
        batch_tokens = token_ids.unsqueeze(1)
        tags = label_ids.unsqueeze(1)
        optimizer.zero_grad()
        emissions = gru(batch_tokens).squeeze(0)
        loss = -crf(emissions, tags.squeeze(0))
        loss.backward()
        optimizer.step()
    gru.eval()
    crf.eval()
    # Scores are computed on the training set itself after every epoch.
    print(eval_model(train_tokens_ids, train_labels, gru))

HBox(children=(FloatProgress(value=0.0, max=945.0), HTML(value='')))

/home/zosia/.local/lib/python3.8/site-packages/torchcrf/__init__.py:249: UserWarning: where received a uint8 condition tensor. This behavior is deprecated and will be removed in a future version of PyTorch. Use a boolean condition instead. (Triggered internally at  /pytorch/aten/src/ATen/native/TensorCompare.cpp:255.)
  score = torch.where(mask[i].unsqueeze(1), next_score, score)

HBox(children=(FloatProgress(value=0.0, max=945.0), HTML(value='')))

(0.8601941656899232, 0.8751514345303986, 0.8676083403589915)

HBox(children=(FloatProgress(value=0.0, max=945.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=945.0), HTML(value='')))

(0.8815602436292092, 0.8897984198549079, 0.8856601748234387)

HBox(children=(FloatProgress(value=0.0, max=945.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=945.0), HTML(value='')))

(0.9144309250302297, 0.919752763828645, 0.9170841238373373)

HBox(children=(FloatProgress(value=0.0, max=945.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=945.0), HTML(value='')))

(0.9361905528132853, 0.9398110097060626, 0.9379972877369673)

HBox(children=(FloatProgress(value=0.0, max=945.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=945.0), HTML(value='')))

(0.9519541852390448, 0.9547763044748607, 0.9533631563717097)

HBox(children=(FloatProgress(value=0.0, max=945.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=945.0), HTML(value='')))

(0.960722713444972, 0.9632376346282668, 0.961978530336279)

HBox(children=(FloatProgress(value=0.0, max=945.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=945.0), HTML(value='')))

(0.9697570414352719, 0.9714709221947199, 0.9706132252353172)

HBox(children=(FloatProgress(value=0.0, max=945.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=945.0), HTML(value='')))

(0.9760554565110192, 0.9779891394717963, 0.9770213412246582)

HBox(children=(FloatProgress(value=0.0, max=945.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=945.0), HTML(value='')))

(0.9811127302761178, 0.9819703829690195, 0.9815413692723396)

HBox(children=(FloatProgress(value=0.0, max=945.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=945.0), HTML(value='')))

(0.984655071665091, 0.9846831395763159, 0.9846691054206851)

HBox(children=(FloatProgress(value=0.0, max=945.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=945.0), HTML(value='')))

(0.9871442343767067, 0.9875194192515452, 0.9873317911716786)

HBox(children=(FloatProgress(value=0.0, max=945.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=945.0), HTML(value='')))

(0.9893908786272786, 0.9889114292094049, 0.9891510958201069)

HBox(children=(FloatProgress(value=0.0, max=945.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=945.0), HTML(value='')))

(0.9911312527046112, 0.9901989196482444, 0.9906648668174991)

HBox(children=(FloatProgress(value=0.0, max=945.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=945.0), HTML(value='')))

(0.9924332083291745, 0.9919900041332719, 0.9922115567382627)

HBox(children=(FloatProgress(value=0.0, max=945.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=945.0), HTML(value='')))

(0.9930640069977942, 0.9924270857582653, 0.9927454442197611)

HBox(children=(FloatProgress(value=0.0, max=945.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=945.0), HTML(value='')))

(0.9739162872556146, 0.9674801769230403, 0.9706875636048171)

HBox(children=(FloatProgress(value=0.0, max=945.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=945.0), HTML(value='')))

(0.9848088502477955, 0.9837187094689933, 0.9842634780066597)

HBox(children=(FloatProgress(value=0.0, max=945.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=945.0), HTML(value='')))

(0.9808100926458495, 0.9802695653413275, 0.9805397545015183)

HBox(children=(FloatProgress(value=0.0, max=945.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=945.0), HTML(value='')))

(0.9668917478143436, 0.9694090371376854, 0.968148756174055)

HBox(children=(FloatProgress(value=0.0, max=945.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=945.0), HTML(value='')))

(0.9793555195345366, 0.9788157938495013, 0.979085582310423)

dev-0 i test-A

def predict_labels(dataset_tokens, dataset_labels, model):
    """Decode every document, print the scores and return the predictions.

    Args:
        dataset_tokens: Sequence of 1-D token-id tensors, one per document.
        dataset_labels: Sequence of 1-D label-id tensors aligned with
            ``dataset_tokens``.
        model: Module mapping a ``(seq_len, 1)`` token tensor to emission
            scores. It is now actually used; previously the global ``gru``
            was called regardless of this argument.

    Returns:
        A list with one list of predicted label ids per document, as returned
        by the module-level ``crf.decode``.
    """
    # Sanity print: lengths of the first document's tokens and labels.
    print(len(dataset_tokens[0]), len(dataset_labels[0]))
    Y_true = []
    Y_pred = []
    result = []
    # Inference only: no gradients are needed, so skip autograd bookkeeping.
    with torch.no_grad():
        for i, label_ids in enumerate(tqdm(dataset_labels)):
            batch_tokens = dataset_tokens[i].unsqueeze(1)
            emissions = model(batch_tokens).squeeze(0)
            decoded = crf.decode(emissions)[0]
            Y_pred += decoded
            result.append(decoded)
            Y_true += list(label_ids.numpy())
    print(get_scores(Y_true, Y_pred))
    return result

# dev-0 inputs: one document per line, tokens separated by whitespace.
with open('dev-0/in.tsv', "r", encoding="utf-8") as f:
    dev_0_data = [line.rstrip().split() for line in f]
dev_0_tokens_ids = data_process(dev_0_data)

# dev-0 gold tags, converted to ids through their position in ner_tags_set.
with open('dev-0/expected.tsv', "r", encoding="utf-8") as f:
    dev_0_labels = [line.rstrip().split() for line in f]
dev_0_labels = labels_process(
    [[ner_tags_set.index(tag) for tag in doc_tags] for doc_tags in dev_0_labels]
)

tmp = predict_labels(dev_0_tokens_ids, dev_0_labels, gru)

458 458

HBox(children=(FloatProgress(value=0.0, max=215.0), HTML(value='')))

(0.9501477944520237, 0.9535808009736432, 0.9518612023310112)

# Map predicted ids back to tag strings and drop the first and last position
# of every document (the <bos>/<eos> slots).
r = [[ner_tags_set[i] for i in tmp2][1:-1] for tmp2 in tmp]

# Rewrite the B-/I- prefixes so every document is a valid BIO sequence:
# a tag at the start or after 'O' or after a different entity type becomes
# B-, a tag following the same entity type becomes I-.
for doc in r:
    # Guard against empty documents (an empty input line), where doc[0]
    # would raise IndexError.
    if doc and doc[0] != 'O':
        doc[0] = 'B' + doc[0][1:]
    for i in range(len(doc) - 1):
        following = doc[i + 1]
        if following == 'O':
            continue
        if doc[i] != 'O' and doc[i][1:] == following[1:]:
            doc[i + 1] = 'I' + following[1:]
        else:
            doc[i + 1] = 'B' + following[1:]

# Open with "w" rather than "a" so re-running this cell replaces the
# predictions instead of appending a second copy to the file.
with open("dev-0/out.tsv", "w", encoding="utf-8") as f:
    for i in r:
        f.write(' '.join(i) + '\n')

def _repair_bio_tags(doc):
    """Rewrite the B-/I- prefixes of ``doc`` in place into a valid BIO sequence.

    A non-'O' tag becomes ``B-`` at the start of the document, after 'O', or
    after a tag of a different entity type; it becomes ``I-`` when it follows
    a tag of the same entity type. Empty documents are left untouched.
    """
    if doc and doc[0] != 'O':
        doc[0] = 'B' + doc[0][1:]
    for i in range(len(doc) - 1):
        following = doc[i + 1]
        if following == 'O':
            continue
        if doc[i] != 'O' and doc[i][1:] == following[1:]:
            doc[i + 1] = 'I' + following[1:]
        else:
            doc[i + 1] = 'B' + following[1:]


def predict(path, model):
    """Tag ``path/in.tsv`` and write the predictions to ``path/out.tsv``.

    Args:
        path: Directory containing ``in.tsv`` (one whitespace-tokenized
            document per line); ``out.tsv`` is written to the same directory.
        model: Module mapping a ``(seq_len, 1)`` token tensor to emission
            scores. It is now actually used; previously the global ``gru``
            was called regardless of this argument.

    Returns:
        A list with one list of predicted label ids per document, as returned
        by the module-level ``crf.decode`` (first and last positions
        included).
    """
    with open(path + '/in.tsv', "r", encoding="utf-8") as f:
        data = [line.rstrip().split() for line in f]
    tokens_ids = data_process(data)

    result = []
    # Inference only: no gradients are needed, so skip autograd bookkeeping.
    with torch.no_grad():
        for token_ids in tqdm(tokens_ids):
            emissions = model(token_ids.unsqueeze(1)).squeeze(0)
            result.append(crf.decode(emissions)[0])

    # Ids -> tag strings, without the first/last (<bos>/<eos>) positions.
    r = [[ner_tags_set[i] for i in decoded][1:-1] for decoded in result]
    for doc in r:
        _repair_bio_tags(doc)

    # Open with "w" rather than "a": appending would stack a second copy of
    # the predictions onto an existing out.tsv on every run.
    with open(path + "/out.tsv", "w", encoding="utf-8") as f:
        f.writelines(' '.join(doc) + '\n' for doc in r)
    return result

# Run prediction for both evaluation directories; each call writes that
# directory's out.tsv. `result` is overwritten by the second call, so it ends
# up holding only the test-A predictions.
result = predict('dev-0', gru)
result = predict('test-A', gru)

HBox(children=(FloatProgress(value=0.0, max=215.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=230.0), HTML(value='')))

34 KiB Raw Permalink Blame History

Zadanie domowe

dev-0 i test-A

34 KiB

Raw Permalink Blame History