en-ner-conll-2003/seq_labeling.py.ipynb
2021-06-07 12:41:08 +02:00

15 KiB
Raw Permalink Blame History

import pandas as pd
import numpy as np
import os.path
import shutil
import torch
import pandas as pd
from sklearn.model_selection import train_test_split
from torchtext.vocab import Vocab
from collections import Counter
import csv

# File the trained weights are cached in; training is skipped when it exists.
model_path = "seq_labeling.model"

# Unpack the xz-compressed training set once; later runs reuse the extracted file.
if not os.path.isfile('train/train.tsv'):
    import lzma
    with lzma.open('train/train.tsv.xz', 'rb') as f_in, \
            open('train/train.tsv', 'wb') as f_out:
        shutil.copyfileobj(f_in, f_out)

# IOB tag inventory; a tag's position in this list is its integer class id.
labels = ['O','B-LOC', 'I-LOC','B-MISC', 'I-MISC', 'B-ORG', 'I-ORG', 'B-PER', 'I-PER']

# quoting=QUOTE_NONE: the file is plain tab-separated text, so a literal '"'
# token must not be interpreted as a CSV quote character. process_dataset reads
# its input as raw lines, and training data has to be tokenised the same way,
# otherwise tokens and labels of a row can get out of step.
data = pd.read_csv(
    'train/train.tsv',
    sep='\t',
    names=['iob', 'tokens'],
    quoting=csv.QUOTE_NONE,
)
# Column "iob": space-separated tags -> list of class ids.
data["iob"] = data["iob"].apply(lambda x: [labels.index(y) for y in x.split()])
# Column "tokens": space-separated text -> list of tokens.
data["tokens"] = data["tokens"].apply(lambda x: x.split())
data
iob tokens
0 [5, 0, 3, 0, 0, 0, 3, 0, 0, 0, 7, 8, 0, 1, 0, ... [EU, rejects, German, call, to, boycott, Briti...
1 [0, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, ... [Rare, Hendrix, song, draft, sells, for, almos...
2 [1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, ... [China, says, Taiwan, spoils, atmosphere, for,...
3 [1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, ... [China, says, time, right, for, Taiwan, talks,...
4 [3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, ... [German, July, car, registrations, up, 14.2, p...
... ... ...
940 [0, 0, 7, 0, 0, 0, 0, 0, 0, 0, 7, 8, 0, 1, 0, ... [CYCLING, -, BALLANGER, KEEPS, SPRINT, TITLE, ...
941 [0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, ... [CYCLING, -, WORLD, TRACK, CHAMPIONSHIP, RESUL...
942 [0, 0, 3, 0, 7, 0, 5, 0, 0, 1, 0, 1, 0, 0, 3, ... [SOCCER, -, FRENCH, DEFENDER, KOMBOUARE, JOINS...
943 [0, 0, 1, 2, 3, 4, 0, 0, 0, 0, 1, 0, 1, 0, 0, ... [MOTORCYCLING, -, SAN, MARINO, GRAND, PRIX, PR...
944 [0, 0, 3, 4, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, ... [GOLF, -, BRITISH, MASTERS, THIRD, ROUND, SCOR...

945 rows × 2 columns

def build_vocab(dataset):
    """Build a torchtext ``Vocab`` from an iterable of token sequences.

    Every token of every document in ``dataset`` is counted, and the counts
    are handed to ``Vocab`` together with the four special symbols
    ``<unk>``, ``<pad>``, ``<bos>`` and ``<eos>``.
    """
    # The commented-out data_process variant further down additionally looks up
    # '<alpha>' / '<notalpha>'; those are deliberately not registered here.
    token_counts = Counter(token for document in dataset for token in document)
    special_symbols = ['<unk>', '<pad>', '<bos>', '<eos>']
    return Vocab(token_counts, specials=special_symbols)
# Vocabulary over the training tokens; data_process uses it to map tokens to ids.
vocab = build_vocab(data['tokens'])
def labels_process(dt):
    """Turn lists of label ids into ``torch.long`` tensors with boundary padding.

    Each document (a list of ints) gets a 0 prepended and appended, so the
    label sequence has the same length as the token sequence produced by
    ``data_process`` (which adds ``<bos>`` / ``<eos>``). Returns a list with
    one 1-D tensor per document.
    """
    boundary = [0]  # class id 0 is used for the two artificial boundary positions
    tensors = []
    for document in dt:
        padded = boundary + document + boundary
        tensors.append(torch.tensor(padded, dtype=torch.long))
    return tensors

def data_process(dt):
    """Encode tokenised documents as ``torch.long`` id tensors.

    Every token is looked up in the module-level ``vocab``; the resulting id
    sequence is wrapped in the ``<bos>`` and ``<eos>`` ids. Returns a list
    with one 1-D tensor per document.
    """
    encoded = []
    for document in dt:
        ids = [vocab['<bos>']]
        ids.extend(vocab[token] for token in document)
        ids.append(vocab['<eos>'])
        encoded.append(torch.tensor(ids, dtype=torch.long))
    return encoded

# def data_process(dt):
#     result = []
#     for document in dt:
#         sentence = [vocab['<bos>'],vocab['<alpha>']]
#         for token in document:
#             sentence += [vocab[token]]
#             sentence += [vocab['<alpha>'] if token.isalpha() else vocab['<notalpha>']]
#         sentence += [vocab['<eos>'],vocab['<alpha>']]
#         result.append(torch.tensor(sentence, dtype = torch.long))
#     return result
len(vocab.itos)
23628
class NERModel(torch.nn.Module):
    """Window classifier: embed a fixed-size window of token ids, concatenate
    the embeddings and map them to label logits with one linear layer.

    Args:
        vocab_size: number of rows in the embedding table (default 23629).
        embedding_dim: size of each token embedding (default 200).
        window_size: number of token ids in one input window. The default 6
            gives the original 6 * 200 = 1200 input features.
        num_labels: number of output classes (default 9).

    The defaults reproduce the original hard-coded layer shapes, and the
    sub-module names ``emb`` / ``fc1`` are unchanged, so existing state dicts
    keep loading.

    NOTE(review): the training loop in this file feeds 3-id windows
    (``[j-1: j+2]``), which does not match the default ``window_size`` of 6;
    confirm which configuration is intended and pass ``window_size``
    accordingly.
    """

    def __init__(self, vocab_size=23629, embedding_dim=200, window_size=6, num_labels=9):
        super().__init__()
        self.emb = torch.nn.Embedding(vocab_size, embedding_dim)
        self.fc1 = torch.nn.Linear(window_size * embedding_dim, num_labels)

    def forward(self, x):
        """Return the 1-D logits vector for one window of token ids ``x``."""
        x = self.emb(x)
        # Flatten (window_size, embedding_dim) into the single feature vector
        # the linear layer was sized for.
        x = x.reshape(self.fc1.in_features)
        x = self.fc1(x)
        return x
# class NERModel(torch.nn.Module):
#     def __init__(self,):
#         super(NERModel, self).__init__()
#         #self.emb = torch.nn.Embedding(23629,200)
#         self.emb = torch.nn.Embedding(23628,200)
#         self.fc1 = torch.nn.Linear(600,9)       

#     def forward(self, x):
#         x = self.emb(x)
#         x = x.reshape(600) 
#         x = self.fc1(x)
#         return x
# Prefer the first CUDA device, but fall back to the CPU when CUDA is not
# available so the script still runs on machines without a GPU. The name
# `device_gpu` is kept because the training loop and process_dataset use it.
device_gpu = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
device_cpu = torch.device("cpu")

ner_model = NERModel().to(device_gpu)

# Multi-class loss over the label logits; Adam with its default settings.
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(ner_model.parameters())

# Training targets and inputs as per-document id tensors (both padded with
# one extra position at each end).
train_labels = labels_process(data['iob'])
train_tokens_ids = data_process(data['tokens'])
# Train for 5 epochs and cache the weights, unless a cached model file already
# exists, in which case the weights are loaded instead.
if not os.path.isfile(model_path):
    for epoch in range(5):
        # Running counters for this epoch's accuracy / precision / recall,
        # where "positive" means any label id other than 0.
        acc_score = 0
        prec_score = 0
        selected_items = 0
        recall_score = 0
        relevant_items = 0
        items_total = 0
        ner_model.train()
        for token_ids, token_labels in zip(train_tokens_ids, train_labels):
            # Skip the two boundary positions added by the preprocessing.
            for j in range(1, len(token_labels) - 1):
                # Window of the previous, current and next token id.
                # NOTE(review): NERModel in this file is sized for 1200 input
                # features (6 ids x 200 dims) while this window holds 3 ids;
                # confirm which configuration is intended.
                X = token_ids[j-1: j+2].to(device_gpu)

                Y = token_labels[j: j+1].to(device_gpu)
                Y_predictions = ner_model(X)

                # Compute the prediction once and compare plain ints.
                predicted = torch.argmax(Y_predictions).item()
                expected = Y.item()

                acc_score += int(predicted == expected)
                if predicted != 0:
                    selected_items += 1
                    if predicted == expected:
                        prec_score += 1
                if expected != 0:
                    relevant_items += 1
                    if predicted == expected:
                        recall_score += 1

                items_total += 1
                optimizer.zero_grad()
                # CrossEntropyLoss expects (batch, classes) logits: add a batch dim.
                loss = criterion(Y_predictions.unsqueeze(0), Y)
                loss.backward()
                optimizer.step()

        # Guard every ratio: an epoch in which the model never predicts a
        # non-zero label (or no item was processed) must not abort training
        # with ZeroDivisionError.
        precision = prec_score / selected_items if selected_items else 0.0
        recall = recall_score / relevant_items if relevant_items else 0.0
        if precision + recall:
            f1_score = (2*precision * recall) / (precision + recall)
        else:
            f1_score = 0.0
        accuracy = acc_score / items_total if items_total else 0.0
        print(f'epoch: {epoch}')
        print(f'f1: {f1_score}')
        print(f'acc: {accuracy}')
    torch.save(ner_model.state_dict(), model_path)
else:
    # map_location lets weights saved on another device load onto the device
    # the model currently lives on.
    ner_model.load_state_dict(torch.load(model_path, map_location=device_gpu))
def process(model, x):
    """Return the label name with the highest score for one input window.

    ``model(x)`` is evaluated and the index of its largest output is looked
    up in the module-level ``labels`` list.
    """
    best_index = torch.argmax(model(x))
    return labels[best_index]

def process_dataset(model, path, left=2, right=4, step=2):
    """Predict labels for every line of a whitespace-tokenised text file.

    Each line of ``path`` is split on whitespace, encoded with
    ``data_process`` and scanned with a sliding window; every window is moved
    to ``device_gpu`` and classified with ``process``.

    Args:
        model: callable scoring one window of token ids.
        path: text file with one document per line.
        left: number of ids taken before the window's anchor position.
        right: number of ids taken from the anchor position onwards
            (anchor included).
        step: stride between consecutive anchor positions.

    Returns:
        A list with one list of label names per input line.

    The defaults reproduce the original hard-coded behaviour (window
    ``[j-2: j+4]``, stride 2). NOTE(review): the training loop in this file
    uses the window ``[j-1: j+2]`` with stride 1, i.e. ``left=1, right=2,
    step=1``, which yields one label per token; confirm which configuration
    matches the model being used and pass these arguments accordingly.
    """
    with open(path, 'r') as f:
        X = [line.split() for line in f]
    data_tokens_ids = data_process(X)
    results = []
    for token_ids in data_tokens_ids:
        line_results = []
        # The last anchor is chosen so that the window never runs past the
        # end of the id sequence.
        for j in range(left, len(token_ids) - right + 1, step):
            x = token_ids[j - left: j + right].to(device_gpu)
            line_results.append(process(model, x))
        results.append(line_results)
    return results

# Post-process the model output (for cases where B- and I- tags do not refer to the same label)
def process_output(lines):
    """Repair inconsistent IOB tag sequences and join each one into a string.

    For every ``I-`` tag:
      * if it starts the line or follows ``O``, it is turned into a ``B-`` tag;
      * otherwise its entity type is replaced by the type of the preceding
        (already repaired) tag.
    All other tags are kept as they are. Returns one space-joined string per
    input line.
    """
    def _repair(tags):
        previous = None
        fixed = []
        for tag in tags:
            if tag.startswith("I-"):
                if previous in (None, "O"):
                    tag = tag.replace('I-', 'B-')
                else:
                    tag = "I-" + previous[2:]
            previous = tag
            fixed.append(tag)
        return " ".join(fixed)

    return [_repair(tags) for tags in lines]
                    
            
# Label the dev-0 split, repair the IOB sequences and write one line of tags
# per input line.
results = process_dataset(ner_model,"dev-0/in.tsv")
file_content = process_output(results)
with open("dev-0/out.tsv", "w") as f:
    f.writelines(line + "\n" for line in file_content)
# results = process_dataset(ner_model,"test-A/in.tsv")
# file_content = [' '.join(x) for x in results]
# with open("test-A/out.tsv", "w") as f:
#     for line in file_content:
#         print(line)
#         #f.write(line + "\n")