forked from kubapok/en-ner-conll-2003
341 lines
7.4 KiB
Python
341 lines
7.4 KiB
Python
|
#!/usr/bin/env python
|
||
|
# coding: utf-8
|
||
|
|
||
|
# ## Zadanie domowe
|
||
|
#
|
||
|
#
|
||
|
# - sklonować repozytorium https://git.wmi.amu.edu.pl/kubapok/en-ner-conll-2003
|
||
|
# - stworzyć model seq labelling bazujący na sieci neuronowej opisanej w punkcie niżej (można bazować na tym jupyterze lub nie).
|
||
|
# - model sieci to GRU (o dowolnych parametrach) + CRF w pytorchu, korzystając z modułu CRF z poprzednich zajęć
# - stworzyć predykcje w plikach dev-0/out.tsv oraz test-A/out.tsv
|
||
|
# - wynik fscore sprawdzony za pomocą narzędzia geval (patrz poprzednie zadanie) powinien wynosić co najmniej 0.65
|
||
|
# - proszę umieścić predykcję oraz skrypty generujące (w postaci tekstowej a nie jupyter) w repo, a w MS TEAMS umieścić link do swojego repo
|
||
|
# termin 22.06, 60 punktów, za najlepszy wynik- 100 punktów
|
||
|
#
|
||
|
|
||
|
# In[2]:
|
||
|
|
||
|
|
||
|
import numpy as np
|
||
|
import torch
|
||
|
from torchtext.vocab import Vocab
|
||
|
from collections import Counter
|
||
|
from tqdm.notebook import tqdm
|
||
|
import lzma
|
||
|
import itertools
|
||
|
from torchcrf import CRF
|
||
|
|
||
|
|
||
|
# In[3]:
|
||
|
|
||
|
|
||
|
def read_data(filename):
    """Read an xz-compressed, tab-separated file into a list of rows.

    Args:
        filename: Path of the ``.xz`` file; its content is decoded as UTF-8.

    Returns:
        A list with one entry per line of the file, each entry being the
        list of tab-separated fields of that line.
    """
    # The context manager closes the archive even when reading/decoding fails.
    with lzma.open(filename) as archive:
        lines = archive.read().decode('UTF-8').split('\n')
    # A trailing newline yields one empty final element. Drop only that
    # element, so a file without a final newline keeps its last record.
    if lines and lines[-1] == '':
        lines.pop()
    return [line.split('\t') for line in lines]
|
||
|
|
||
|
|
||
|
# In[4]:
|
||
|
|
||
|
|
||
|
def data_process(dt):
    """Convert tokenized documents into tensors of vocabulary ids.

    Every document is wrapped in the ``<bos>`` and ``<eos>`` markers. All
    ids are looked up in the module-level ``vocab``.

    Args:
        dt: Iterable of documents, each an iterable of tokens.

    Returns:
        A list holding one ``torch.long`` tensor per document.
    """
    encoded = []
    for document in dt:
        ids = [vocab['<bos>']]
        ids.extend(vocab[token] for token in document)
        ids.append(vocab['<eos>'])
        encoded.append(torch.tensor(ids, dtype=torch.long))
    return encoded
|
||
|
|
||
|
|
||
|
# In[5]:
|
||
|
|
||
|
|
||
|
def labels_process(dt):
    """Convert per-document label id lists into padded tensors.

    A label ``0`` is put before and after every document, which lines the
    labels up with token sequences that carry one extra element at each end.

    Args:
        dt: Iterable of documents, each a list of integer label ids.

    Returns:
        A list holding one ``torch.long`` tensor per document.
    """
    padded = []
    for document in dt:
        with_borders = [0] + document + [0]
        padded.append(torch.tensor(with_borders, dtype=torch.long))
    return padded
|
||
|
|
||
|
|
||
|
# In[6]:
|
||
|
|
||
|
|
||
|
def build_vocab(dataset):
    """Build a vocabulary from tokenized documents.

    Args:
        dataset: Iterable of documents, each an iterable of tokens.

    Returns:
        A ``Vocab`` created from the token frequencies, with the special
        entries ``<unk>``, ``<pad>``, ``<bos>`` and ``<eos>``.
    """
    # Count every token occurrence across all documents in a single pass.
    token_counts = Counter(token for document in dataset for token in document)
    special_tokens = ['<unk>', '<pad>', '<bos>', '<eos>']
    return Vocab(token_counts, specials=special_tokens)
|
||
|
|
||
|
|
||
|
# In[7]:
|
||
|
|
||
|
|
||
|
# Load the training split. Column 0 of every row holds the space-separated
# NER tags of a document, column 1 its space-separated tokens.
train_data = read_data('train/train.tsv.xz')

ner_tags = [row[0].split() for row in train_data]
tokens = [row[1].split() for row in train_data]


# In[8]:


# Vocabulary built from the training tokens.
vocab = build_vocab(tokens)


# In[9]:


# Training documents as tensors of vocabulary ids.
train_tokens_ids = data_process(tokens)
|
||
|
|
||
|
|
||
|
# In[10]:
|
||
|
|
||
|
|
||
|
# Sorted list of the distinct tags seen in training; the position of a tag
# in this list is its integer id.
ner_tags_set = sorted(set(itertools.chain.from_iterable(ner_tags)))
print(ner_tags_set)

# A dictionary lookup replaces list.index(), which rescans the tag list for
# every single token.
tag_to_id = {tag: tag_id for tag_id, tag in enumerate(ner_tags_set)}
train_labels = labels_process([[tag_to_id[tag] for tag in doc] for doc in ner_tags])


# In[11]:


# Every tag in ner_tags_set occurs in the training labels, so the class
# count is the length of that list. This gives a plain int instead of the
# 0-d tensor produced by max() over label tensors. Label 0 is always present
# because labels_process pads with it, hence the lower bound of 1.
num_tags = max(1, len(ner_tags_set))
|
||
|
|
||
|
|
||
|
# In[12]:
|
||
|
|
||
|
|
||
|
class GRU(torch.nn.Module):
    """Bidirectional GRU that produces per-token emission scores.

    Args:
        vocab_size: Number of rows of the embedding table. When omitted, the
            size of the module-level ``vocab`` is used.
        num_classes: Number of scores produced per token. When omitted, the
            module-level ``num_tags`` is used so the output always matches
            the tag inventory instead of a hard-coded 9.
        emb_dim: Embedding dimension.
        hidden_dim: Hidden size of each GRU direction.
        num_layers: Number of stacked GRU layers.
        dropout: Dropout probability applied to the embeddings.
    """

    def __init__(self, vocab_size=None, num_classes=None, emb_dim=100,
                 hidden_dim=256, num_layers=2, dropout=0.2):
        super().__init__()
        if vocab_size is None:
            vocab_size = len(vocab.itos)
        if num_classes is None:
            # int() also accepts a 0-d tensor.
            num_classes = int(num_tags)
        self.emb = torch.nn.Embedding(vocab_size, emb_dim)
        self.dropout = torch.nn.Dropout(dropout)
        # The callers in this file pass one document as a (seq_len, 1)
        # tensor (unsqueeze(1) on a 1-D id tensor), i.e. sequence-first.
        # With batch_first=True the GRU would read that as seq_len separate
        # sequences of length 1 and never see any context.
        self.rec = torch.nn.GRU(emb_dim, hidden_dim, num_layers,
                                batch_first=False, bidirectional=True)
        # Both directions are concatenated, hence 2 * hidden_dim inputs.
        self.fc1 = torch.nn.Linear(2 * hidden_dim, num_classes)

    def forward(self, x):
        """Return emission scores for the token ids in *x*.

        Args:
            x: Integer tensor of token ids laid out as (seq_len, batch).

        Returns:
            Tensor of shape (seq_len, batch, num_classes).
        """
        emb = torch.relu(self.emb(x))
        emb = self.dropout(emb)
        # The final hidden state is not needed, only the per-step outputs.
        gru_output, _ = self.rec(emb)
        return self.fc1(gru_output)
|
||
|
|
||
|
|
||
|
# In[13]:
|
||
|
|
||
|
|
||
|
def get_scores(y_true, y_pred):
    """Compute precision, recall and F1 over flat label sequences.

    Labels greater than 0 count as positive; label 0 is the negative class.
    A prediction is a true positive when it is positive and equals the
    expected label at the same position.

    NOTE(review): elsewhere in this file label ids are positions in a sorted
    tag list, so id 0 is not necessarily the 'O' tag; confirm that treating
    0 as the negative class is intended.

    Args:
        y_true: Expected labels.
        y_pred: Predicted labels, paired position by position with y_true.

    Returns:
        A ``(precision, recall, f1)`` tuple. Precision is 1.0 when nothing
        was predicted positive, recall is 1.0 when nothing is expected to be
        positive, and F1 is 0.0 when precision + recall is zero.
    """
    true_positives = 0
    predicted_positives = 0
    expected_positives = 0

    for predicted, expected in zip(y_pred, y_true):
        if predicted > 0:
            predicted_positives += 1
            if predicted == expected:
                true_positives += 1
        if expected > 0:
            expected_positives += 1

    precision = true_positives / predicted_positives if predicted_positives != 0 else 1.0
    recall = true_positives / expected_positives if expected_positives != 0 else 1.0

    total = precision + recall
    f1 = 0.0 if total == 0.0 else 2 * precision * recall / total

    return precision, recall, f1
|
||
|
|
||
|
|
||
|
# In[14]:
|
||
|
|
||
|
|
||
|
def eval_model(dataset_tokens, dataset_labels, model):
    """Score *model* on a labelled dataset.

    Emissions come from *model* (the argument is now actually used; before,
    the global ``gru`` was called regardless) and are decoded with the
    module-level ``crf``.

    Args:
        dataset_tokens: Sequence of token-id tensors, one per document.
        dataset_labels: Sequence of label tensors aligned with
            dataset_tokens.
        model: Callable mapping a (seq_len, 1) id tensor to emissions.

    Returns:
        The ``(precision, recall, f1)`` tuple produced by ``get_scores``.
    """
    Y_true = []
    Y_pred = []
    # Inference only: skip building the autograd graph.
    with torch.no_grad():
        for token_ids, labels in tqdm(zip(dataset_tokens, dataset_labels),
                                      total=len(dataset_labels)):
            # One document becomes a batch of size 1, sequence-first.
            emissions = model(token_ids.unsqueeze(1))
            # decode() returns one tag list per batch element.
            Y_pred += crf.decode(emissions)[0]
            Y_true += labels.tolist()
    return get_scores(Y_true, Y_pred)
|
||
|
|
||
|
|
||
|
# In[15]:
|
||
|
|
||
|
|
||
|
# Emission network and the CRF layer that scores whole tag sequences.
gru = GRU()
crf = CRF(num_tags)


# In[16]:


# A single optimizer updates the parameters of both modules.
params = [*gru.parameters(), *crf.parameters()]
optimizer = torch.optim.Adam(params)


# In[17]:


NUM_EPOCHS = 20


# In[18]:


# NOTE(review): the training loop visible in this file uses the negated CRF
# output as its loss and never references `criterion`; confirm whether it
# is still needed.
criterion = torch.nn.CrossEntropyLoss()
|
||
|
|
||
|
|
||
|
# In[19]:
|
||
|
|
||
|
|
||
|
# Train the GRU and the CRF jointly, one document per optimization step.
# The epoch and document loops use distinct variables; previously both were
# named `i`, so the inner loop shadowed the epoch counter.
for epoch in range(NUM_EPOCHS):
    gru.train()
    crf.train()
    for token_ids, labels in tqdm(zip(train_tokens_ids, train_labels),
                                  total=len(train_labels)):
        optimizer.zero_grad()
        # One document becomes a batch of size 1, sequence-first.
        emissions = gru(token_ids.unsqueeze(1))
        # The CRF output is negated to obtain a loss to minimize.
        loss = -crf(emissions, labels.unsqueeze(1))
        loss.backward()
        optimizer.step()
    # Switch off dropout before measuring the scores for this epoch.
    gru.eval()
    crf.eval()
    print(eval_model(train_tokens_ids, train_labels, gru))
|
||
|
|
||
|
|
||
|
# ## dev-0 i test-A
|
||
|
|
||
|
# In[20]:
|
||
|
|
||
|
|
||
|
def predict_labels(dataset_tokens, dataset_labels, model):
    """Predict labels for a labelled dataset and print the scores.

    Emissions come from *model* (the argument is now actually used; before,
    the global ``gru`` was called regardless) and are decoded with the
    module-level ``crf``.

    Args:
        dataset_tokens: Sequence of token-id tensors, one per document.
        dataset_labels: Sequence of label tensors aligned with
            dataset_tokens.
        model: Callable mapping a (seq_len, 1) id tensor to emissions.

    Returns:
        A list with one list of predicted label ids per document.
    """
    # Sanity print of the first document's lengths; skipped for an empty
    # dataset, where indexing [0] would raise IndexError.
    if len(dataset_tokens) > 0 and len(dataset_labels) > 0:
        print(len(dataset_tokens[0]), len(dataset_labels[0]))
    Y_true = []
    Y_pred = []
    result = []
    # Inference only: skip building the autograd graph.
    with torch.no_grad():
        for token_ids, labels in tqdm(zip(dataset_tokens, dataset_labels),
                                      total=len(dataset_labels)):
            emissions = model(token_ids.unsqueeze(1))
            # decode() returns one tag list per batch element.
            decoded = crf.decode(emissions)[0]
            Y_pred += decoded
            result.append(decoded)
            Y_true += labels.tolist()
    print(get_scores(Y_true, Y_pred))
    return result
|
||
|
|
||
|
|
||
|
# In[21]:
|
||
|
|
||
|
|
||
|
# Tokenized dev-0 input: one whitespace-split document per line.
with open('dev-0/in.tsv', "r", encoding="utf-8") as f:
    dev_0_data = [line.split() for line in f]

dev_0_tokens_ids = data_process(dev_0_data)


# In[22]:


# Expected dev-0 tags, converted to label ids via their position in
# ner_tags_set.
with open('dev-0/expected.tsv', "r", encoding="utf-8") as f:
    dev_0_labels = [line.split() for line in f]

dev_0_labels = labels_process(
    [[ner_tags_set.index(tag) for tag in doc] for doc in dev_0_labels]
)
|
||
|
|
||
|
|
||
|
# In[23]:
|
||
|
|
||
|
|
||
|
# Predicted label ids for every dev-0 document.
tmp = predict_labels(dev_0_tokens_ids, dev_0_labels, gru)


# In[24]:


# Map ids back to tag strings and drop the first and last position of each
# document (the positions added around every document during preprocessing).
r = [[ner_tags_set[tag_id] for tag_id in doc_ids][1:-1] for doc_ids in tmp]
|
||
|
|
||
|
|
||
|
# In[25]:
|
||
|
|
||
|
|
||
|
# Rewrite the B-/I- prefixes in place so that every document is a consistent
# sequence: a tag gets 'B' at the start of a document, after 'O', or after a
# tag of a different entity type, and 'I' when it continues the same type.
# Iterating with enumerate also copes with empty documents, for which
# doc[0] used to raise IndexError.
for doc in r:
    previous_entity = None  # None: document start or previous tag was 'O'
    for position, tag in enumerate(doc):
        if tag == 'O':
            previous_entity = None
            continue
        entity = tag[1:]  # e.g. '-PER'
        doc[position] = ('I' if entity == previous_entity else 'B') + entity
        previous_entity = entity
|
||
|
|
||
|
|
||
|
# In[26]:
|
||
|
|
||
|
|
||
|
# Write one line of space-separated tags per document. The file is opened
# with "w" instead of "a" so that re-running the script replaces the
# predictions rather than appending a second copy, and the context manager
# closes it even if a write fails.
with open("dev-0/out.tsv", "w", encoding="utf-8") as f:
    f.writelines(' '.join(doc) + '\n' for doc in r)
|
||
|
|
||
|
|
||
|
# In[27]:
|
||
|
|
||
|
|
||
|
def _fix_bio_prefixes(doc):
    """Rewrite the B-/I- prefixes of the tag list *doc* in place.

    A tag gets 'B' at the start of the document, after 'O', or after a tag
    of a different entity type, and 'I' when it continues the same type.
    Empty documents are left untouched.
    """
    previous_entity = None  # None: document start or previous tag was 'O'
    for position, tag in enumerate(doc):
        if tag == 'O':
            previous_entity = None
            continue
        entity = tag[1:]  # e.g. '-PER'
        doc[position] = ('I' if entity == previous_entity else 'B') + entity
        previous_entity = entity


def predict(path, model):
    """Tag ``<path>/in.tsv`` and write the result to ``<path>/out.tsv``.

    Emissions come from *model* (the argument is now actually used; before,
    the global ``gru`` was called regardless) and are decoded with the
    module-level ``crf``. The output file is overwritten, so calling this
    repeatedly does not accumulate duplicate predictions.

    Args:
        path: Directory containing ``in.tsv``; ``out.tsv`` is written there.
        model: Callable mapping a (seq_len, 1) id tensor to emissions.

    Returns:
        A list with one list of predicted label ids per input line,
        including the first and last position that are dropped from the
        written output.
    """
    with open(path + '/in.tsv', "r", encoding="utf-8") as f:
        data = [line.split() for line in f]
    tokens_ids = data_process(data)

    result = []
    # Inference only: skip building the autograd graph.
    with torch.no_grad():
        for token_ids in tqdm(tokens_ids):
            emissions = model(token_ids.unsqueeze(1))
            # decode() returns one tag list per batch element.
            result.append(crf.decode(emissions)[0])

    # Map ids to tag strings and drop the first and last position of each
    # document before repairing the prefixes.
    tagged = [[ner_tags_set[tag_id] for tag_id in doc_ids][1:-1] for doc_ids in result]
    for doc in tagged:
        _fix_bio_prefixes(doc)

    with open(path + "/out.tsv", "w", encoding="utf-8") as f:
        f.writelines(' '.join(doc) + '\n' for doc in tagged)
    return result
|
||
|
|
||
|
# Produce out.tsv for both evaluation splits; `result` ends up holding the
# predictions of the last split processed.
for split_dir in ('dev-0', 'test-A'):
    result = predict(split_dir, gru)
|
||
|
|