en-ner-conll-2003/sequence_labeling_fras.ipynb
2021-06-08 15:55:00 +02:00

26 KiB

Klasyfikacja wieloklasowa i sequence labelling

import numpy as np
import gensim
import torch
import pandas as pd
import seaborn as sns
from sklearn.model_selection import train_test_split

from datasets import load_dataset
from torchtext.vocab import Vocab
from collections import Counter

from sklearn.datasets import fetch_20newsgroups

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score

Zadanie domowe

  • sklonować repozytorium https://git.wmi.amu.edu.pl/kubapok/en-ner-conll-2003
  • stworzyć klasyfikator bazujący na sieci neuronowej feed forward w pytorchu (można bazować na tym jupyterze lub nie).
  • klasyfikator powinien obejmować dodatkowe cechy (np. długość wyrazu, czy wyraz zaczyna się od wielkiej litery, stemming słowa, czy zawiera cyfrę)
  • stworzyć predykcje w plikach dev-0/out.tsv oraz test-A/out.tsv
  • wynik fscore sprawdzony za pomocą narzędzia geval (patrz poprzednie zadanie) powinien wynosić co najmniej 0.60
  • proszę umieścić predykcję oraz skrypty generujące (w postaci tekstowej a nie jupyter) w repo, a w MS TEAMS umieścić link do swojego repo termin 08.06, 80 punktów

train

import lzma
import re
import itertools
import torch
def read_data(filename):
    """Read a tab-separated, xz-compressed data file.

    Parameters
    ----------
    filename : str
        Path to an .xz file with UTF-8 text, one record per line,
        fields separated by tabs.

    Returns
    -------
    list[list[str]]
        One inner list of tab-split fields per line; the empty record
        produced by the trailing newline is dropped.
    """
    # Bug fix: the original left the lzma file handle open until garbage
    # collection; the context manager closes it deterministically.
    with lzma.open(filename) as f:
        text = f.read().decode('UTF-8')
    # split('\n') leaves an empty string after the final newline -> drop it.
    return [line.split('\t') for line in text.split('\n')[:-1]]

# Load the compressed training set: each record is "<tags>\t<tokens>".
train_data = read_data('train/train.tsv.xz')

# Column 0 holds the space-separated NER tags, column 1 the tokens.
tokens, ner_tags = [], []
for i in train_data:
    ner_tags.append(i[0].split())
    tokens.append(i[1].split())
# Collect the distinct tag strings across all documents.
# NOTE(review): set order is hash/run dependent, so the tag -> id mapping
# below is not stable across runs — verify this does not matter downstream.
ner_tags_set = list(set(itertools.chain(*ner_tags)))
print(ner_tags_set)
['B-PER', 'B-LOC', 'I-LOC', 'B-ORG', 'I-ORG', 'I-MISC', 'O', 'B-MISC', 'I-PER']
# Map every tag string to an integer class id.
ner_tags_dic = {}
for i in range(len(ner_tags_set)):
    ner_tags_dic[ner_tags_set[i]] = i
print(ner_tags_dic)
{'B-PER': 0, 'B-LOC': 1, 'I-LOC': 2, 'B-ORG': 3, 'I-ORG': 4, 'I-MISC': 5, 'O': 6, 'B-MISC': 7, 'I-PER': 8}
# Replace tag strings with their integer ids in-place.
for i in range(len(ner_tags)):
    for j in range(len(ner_tags[i])):
        ner_tags[i][j] = ner_tags_dic[ner_tags[i][j]]
def data_process(dt):
    """Convert each token document into a long tensor of vocabulary ids,
    framed by <bos>/<eos> markers (uses the module-level ``vocab``)."""
    processed = []
    for document in dt:
        ids = [vocab['<bos>']]
        ids.extend(vocab[token] for token in document)
        ids.append(vocab['<eos>'])
        processed.append(torch.tensor(ids, dtype=torch.long))
    return processed
def labels_process(dt):
    """Wrap every label sequence with a padding label (0) at both ends
    and return one long tensor per document."""
    wrapped = []
    for document in dt:
        wrapped.append(torch.tensor([0, *document, 0], dtype=torch.long))
    return wrapped
def build_vocab(dataset):
    """Count token frequencies over all documents and build a torchtext
    Vocab with the standard special symbols prepended."""
    frequencies = Counter(itertools.chain.from_iterable(dataset))
    return Vocab(frequencies, specials=['<unk>', '<pad>', '<bos>', '<eos>'])
# Build the vocabulary from the training tokens, then convert token and
# label sequences to padded tensor form.
vocab = build_vocab(tokens)
train_tokens_ids = data_process(tokens)
train_labels = labels_process(ner_tags)
# Inspect the first encoded document (notebook cell output follows).
train_tokens_ids[0]
tensor([    2,   967, 22410,   239,   774,    10,  4588,   213,  7687,     5,
            4,   740,  2091,     4,  1388,   138,     4,    22,   231,   460,
           17,    16,    70,    39, 10855,    28,   239,  4552,    10,  2621,
           10, 22766,   213,  7687,   425,  4100,  2178,   514,  1897,  2010,
          663,   295,    43, 11848,    10,  2056,     5,     4,   118,    18,
         3489,    10,     7,   231,   494,    18,  3107,  1089, 10434, 10494,
           17,    16,    75,  2621,   264,   893, 11638,    30,   547,   128,
          116,   126,   425,     7,  2717,  4552,    23, 19846,     5,     4,
           15,   121,   172,   202,   348,   217,   584,  7880,   159,   103,
          172,   202,   847,   217,  3987,    19,    39,     6,    15,     7,
          460,    18,   451,   179, 17516,  1380,  2632, 17769,    91,    11,
          241,  3909,     5,     4,    86,    17,   724,  2717,  2464,    23,
         3071,    14,   201,    39,    23,   340,    29,   804,    23,   991,
           39,   264,    43,   566,    31,     7,   231,   494,     5,     4,
           86,    17,    11,  2444,    72,   224,    31,   967,  6654,  3178,
         5219,  3683,    10,   639,  2056, 10634,     6, 11710,    14,  4861,
        10782,    30,     7,   814,    14,  2949,  1146,  3915,    23,    11,
         3993,  3508,    14, 22123,  1358,    10,  5997,   814,   944,     5,
            4,  3683,  1651, 15772,  1549,    46,   730,    30,   126,    14,
          134,    29,   107,  7686,   938,  2056,   119,   807,  8919, 10229,
         9189,    12,  2088,    13,    55,  1897,  2010,   663,     5,     4,
          111,  3683,   415,    10,  3494,    40,  2444,    46,     7,   967,
           18,  2731,  3107,  1089,     6, 21529,  2949,   944,   142,     6,
         2047,   201,   584,   804,    23,  5890,    34,   145,    23,   139,
           11,  4112,  1285,    10,   814,   944,     5,     4,  1846,  6654,
          148, 17056,   484, 17738,    37,   249,   600,  3683,    27,    44,
          967,  1445,  1759,   115,   236,     8,  5706, 23399,  7280,   184,
           15,  1870, 20842,     5,    15,     4,     5,     4,  4444,   134,
           14,   126,  3338,  3683,    18,  2444,     5,     4,    22,   967,
           18,  2717,  3107,    14, 21666, 10734,    57,   283,    10, 11507,
            7,   391,   274,   166,   224,    14,   382, 11515,    10,     7,
          909,  3107,   142,     5,     4, 10166,    45,   666,    53,   757,
           10,   807, 11615,     6,    11,  7350,   663,  1055,    10,  2088,
           61,    32,   836,    10,    45,    53,  8050,    10,  2006,   184,
         1351,  4615,  2949,  3541,     5,     4,   213,  1269,   980,    16,
           70,   145,    23,   217,  2394,    10,   814,   944,    30,    58,
         2056,     6,    50,  2184,  1438,    29,   239,    78,  4552,    10,
         2621,    10,  1612,   213,  7687,   649,  5874,  2621,   684,   587,
            5,     4,    15,  1990,   103,    45,    10,    43,  2991, 19735,
            8,    32,   843,   128,   547,    57,   432,    10,   259,   118,
           18,   276,     6,    15, 10431,   265,  9239,   115,   494,    12,
        17439,    13,   860,   448,  1129,  1401,    17,    16,  8822,   994,
            5,     4,  2798,    38,   628,  1623,    10,  5997,   711,   944,
           46,  1618,  2387,  7394,     9,   637,    46,    11,   213,   409,
         6109,  7636,   119,   807,    44,  3425,  1055,    10,  1897,  2010,
          663,    31,  2983, 10768,  2369,     5,     4,   118,  4693,  8565,
         2056,    30,   126,    72,    68,     6,   866,   245,     8,   609,
         1886,     5,     4,    87,   746,     9,  8525,   253,     8,   213,
         7751,     6,   108,    92,    67,     8,  1210,  1886,     5,     4,
            3])
class NeuralNetworkModel(torch.nn.Module):
    """Single-layer feed-forward classifier over a 10,000-dim input.

    Bug fix: the original accepted ``output_size`` but ignored it,
    hard-coding the output width to the module-level global
    ``len(train_tokens_ids)``.  The parameter is now used; the single
    caller already passes ``len(train_tokens_ids)``, so behaviour is
    unchanged for it.
    """

    def __init__(self, output_size):
        super().__init__()
        # 10,000 input features -> one score per output class.
        self.fc1 = torch.nn.Linear(10_000, output_size)
        # dim=0 because the model is applied to unbatched 1-D inputs.
        self.softmax = torch.nn.Softmax(dim=0)

    def forward(self, x):
        """Return class probabilities (softmax over dim 0) for input x."""
        x = self.fc1(x)
        x = self.softmax(x)
        return x
class NERModel(torch.nn.Module):
    """Window-based NER classifier: embeds a fixed-size context window of
    token ids and maps the concatenated embeddings to per-tag scores
    (raw logits — CrossEntropyLoss applies the softmax).

    Generalization: vocabulary size, embedding width, window length and
    tag count were hard-coded (23627 / 200 / 3 / 9); they are now
    parameters with the original values as defaults, so ``NERModel()``
    behaves exactly as before.
    """

    def __init__(self, vocab_size=23627, emb_dim=200, window=3, num_tags=9):
        super().__init__()
        self.emb = torch.nn.Embedding(vocab_size, emb_dim)
        # window * emb_dim concatenated embedding features -> tag logits.
        self.fc1 = torch.nn.Linear(window * emb_dim, num_tags)

    def forward(self, x):
        """Score one window: x is a 1-D tensor of `window` token ids."""
        x = self.emb(x)
        # Flatten (window, emb_dim) -> (window * emb_dim,); reshape(-1)
        # replaces the hard-coded reshape(600).
        x = x.reshape(-1)
        x = self.fc1(x)
        return x
# NOTE(review): nn_model is instantiated but never trained or used for
# prediction below; only ner_model is. Possibly leftover scaffolding.
nn_model = NeuralNetworkModel(len(train_tokens_ids))
train_tokens_ids[0][1:4]
tensor([  967, 22410,   239])
ner_model = NERModel()
# Smoke-test: score one 3-token window (raw logits for 9 tags).
ner_model(train_tokens_ids[0][1:4])
tensor([ 0.7428,  1.0342, -0.5970,  0.1479,  0.4966,  0.8864,  0.0432, -0.0845,
         0.2145], grad_fn=<AddBackward0>)
# CrossEntropyLoss expects raw logits; Adam with default hyperparameters.
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(ner_model.parameters())
len(train_labels)
945
# Train ner_model for 2 epochs over the first 100 documents, one 3-token
# window at a time (batch size 1), tracking loss / accuracy / P / R / F1.
for epoch in range(2):
    loss_score = 0
    acc_score = 0
    prec_score = 0
    selected_items = 0
    recall_score = 0
    relevant_items = 0
    items_total = 0
    # NOTE(review): .train() is called on nn_model, but the model being
    # optimized below is ner_model — probably should be ner_model.train().
    nn_model.train()
    for i in range(100):
        # Skip the <bos>/<eos> padding positions at both ends.
        for j in range(1, len(train_labels[i]) - 1):

            # X: window of 3 token ids centred on position j; Y: gold tag id.
            X = train_tokens_ids[i][j-1: j+2]
            Y = train_labels[i][j: j+1]

            Y_predictions = ner_model(X)
            
            
            acc_score += int(torch.argmax(Y_predictions) == Y)
            
            # Precision/recall treat class id 0 as the "negative" class.
            # NOTE(review): id 0 is whichever tag landed first in the
            # (set-ordered) ner_tags_set — B-PER in the shown run, not 'O'.
            # Verify this is the intended negative class.
            if torch.argmax(Y_predictions) != 0:
                selected_items +=1
            if  torch.argmax(Y_predictions) != 0 and torch.argmax(Y_predictions) == Y.item():
                prec_score += 1
            
            if  Y.item() != 0:
                relevant_items +=1
            if  Y.item() != 0 and torch.argmax(Y_predictions) == Y.item():
                recall_score += 1
            
            items_total += 1

            
            # One optimisation step per window.
            optimizer.zero_grad()
            loss = criterion(Y_predictions.unsqueeze(0), Y)
            loss.backward()
            optimizer.step()


            loss_score += loss.item() 
    
    # Epoch summary (display() is the Jupyter helper).
    precision = prec_score / selected_items
    recall = recall_score / relevant_items
    f1_score = (2*precision * recall) / (precision + recall)
    display('epoch: ', epoch)
    display('loss: ', loss_score / items_total)
    display('acc: ', acc_score / items_total)
    display('prec: ', precision)
    display('recall: : ', recall)
    display('f1: ', f1_score)
'epoch: '
0
'loss: '
0.5410224926585327
'acc: '
0.856768558951965
'prec: '
0.8666126186274977
'recall: : '
0.868891651525294
'f1: '
0.8677506386839527
'epoch: '
1
'loss: '
0.28820573237663566
'acc: '
0.923373937025971
'prec: '
0.9287656853857531
'recall: : '
0.9307640814765229
'f1: '
0.9297638096147876

dev-0

# Load dev-0 inputs and gold labels (plain TSV, not xz-compressed).
with open('dev-0/in.tsv', "r", encoding="utf-8") as f:
    dev_0_data = [line.rstrip() for line in f]
    
dev_0_data = [i.split() for i in dev_0_data]
with open('dev-0/expected.tsv', "r", encoding="utf-8") as f:
    dev_0_tags = [line.rstrip() for line in f]
    
dev_0_tags = [i.split() for i in dev_0_tags]
# Convert gold tag strings to the integer ids built from the training set.
# NOTE(review): a dev tag unseen in training would raise KeyError here.
for i in range(len(dev_0_tags)):
    for j in range(len(dev_0_tags[i])):
        dev_0_tags[i][j] = ner_tags_dic[dev_0_tags[i][j]]
test_tokens_ids = data_process(dev_0_data)
test_labels = labels_process(dev_0_tags)
# Evaluate ner_model on dev-0, collecting one prediction per token.
result = []

loss_score = 0
acc_score = 0
prec_score = 0
selected_items = 0
recall_score = 0
relevant_items = 0
items_total = 0
# NOTE(review): .eval() is called on nn_model, but the evaluated model is
# ner_model — probably should be ner_model.eval().
nn_model.eval()
for i in range(len(test_tokens_ids)):
    result.append([])
    # Skip the <bos>/<eos> padding positions.
    for j in range(1, len(test_labels[i]) - 1):

        X = test_tokens_ids[i][j-1: j+2]
        Y = test_labels[i][j: j+1]

        Y_predictions = ner_model(X)


        acc_score += int(torch.argmax(Y_predictions) == Y)

        # Precision/recall treat class id 0 as the negative class
        # (same caveat as in training: id 0 is set-order dependent).
        if torch.argmax(Y_predictions) != 0:
            selected_items +=1
        if  torch.argmax(Y_predictions) != 0 and torch.argmax(Y_predictions) == Y.item():
            prec_score += 1

        if  Y.item() != 0:
            relevant_items +=1
        if  Y.item() != 0 and torch.argmax(Y_predictions) == Y.item():
            recall_score += 1

        items_total += 1
        loss = criterion(Y_predictions.unsqueeze(0), Y)
        loss_score += loss.item() 
        
        # Record the predicted tag id for this token.
        result[i].append(int(torch.argmax(Y_predictions)))

precision = prec_score / selected_items
recall = recall_score / relevant_items
f1_score = (2*precision * recall) / (precision + recall)
display('loss: ', loss_score / items_total)
display('acc: ', acc_score / items_total)
display('prec: ', precision)
display('recall: : ', recall)
display('f1: ', f1_score)
'loss: '
0.7757424341984906
'acc: '
0.8510501460833134
'prec: '
0.8772459727385378
'recall: : '
0.8616800745516441
'f1: '
0.8693933550163583
# Map predicted class ids back to tag strings.  dicts preserve insertion
# order (Python 3.7+), so iterating ner_tags_dic yields the tag strings
# in id order, making tmp[id] the inverse of the tag -> id mapping.
tags = []
tmp = []
for i in ner_tags_dic:
    tmp.append(i)

for i in range(len(result)):
    tags.append([])
    for j in range(len(result[i])):
        tags[i].append(tmp[result[i][j]])
# Bug fix: the file was opened in append mode ("a"), so re-running the
# notebook stacked duplicate prediction lines; "w" overwrites.  The
# context manager also guarantees the handle is closed.
with open("dev-0/out.tsv", "w") as f:
    for i in tags:
        f.write(' '.join(i) + '\n')
# Sanity check: recompute token-level accuracy of tags against expected.tsv.
with open('dev-0/expected.tsv', "r", encoding="utf-8") as f:
    dev_0_tags = [line.rstrip() for line in f]
    
dev_0_tags = [i.split() for i in dev_0_tags]
import math  # NOTE(review): unused import
t = 0
for i in range(len(tags)):
    for j in range(len(tags[i])):
        if tags[i][j] == dev_0_tags[i][j]:
            t += 1
# Matching tokens divided by total predicted tokens.
print(t/len(list((itertools.chain(*tags)))))
0.8510501460833134

test

# Load test-A inputs (no gold labels available) and predict tag ids.
with open('test-A/in.tsv', "r", encoding="utf-8") as f:
    test_data = [line.rstrip() for line in f]
    
test_data = [i.split() for i in test_data]
test_tokens_ids = data_process(test_data)
result = []

# Metric accumulators are initialised but unused here (no labels to score).
loss_score = 0
acc_score = 0
prec_score = 0
selected_items = 0
recall_score = 0
relevant_items = 0
items_total = 0
# NOTE(review): .eval() is called on nn_model, but predictions come from
# ner_model — probably should be ner_model.eval().
nn_model.eval()
for i in range(len(test_tokens_ids)):
    result.append([])
    # Skip the <bos>/<eos> padding positions.
    for j in range(1, len(test_tokens_ids[i]) - 1):

        X = test_tokens_ids[i][j-1: j+2]

        Y_predictions = ner_model(X)
        result[i].append(int(torch.argmax(Y_predictions)))
# Map predicted class ids back to tag strings (dict insertion order ==
# id order, so tmp[id] inverts the tag -> id mapping).
tags = []
tmp = []
for i in ner_tags_dic:
    tmp.append(i)

for i in range(len(result)):
    tags.append([])
    for j in range(len(result[i])):
        tags[i].append(tmp[result[i][j]])
# Bug fix: open in "w" instead of "a" so re-runs overwrite rather than
# append duplicate predictions; the context manager closes the handle.
with open("test-A/out.tsv", "w") as f:
    for i in tags:
        f.write(' '.join(i) + '\n')