#!/usr/bin/env python
# coding: utf-8

# ## Homework assignment
#
# - clone the repository https://git.wmi.amu.edu.pl/kubapok/en-ner-conll-2003
# - build a sequence-labelling model based on the neural network described in
#   the next point (you may base it on this notebook or not).
# - the network is a GRU (with arbitrary parameters) + CRF in PyTorch, using
#   the CRF module from the previous classes
# - produce predictions in the files dev-0/out.tsv and test-A/out.tsv
# - the F-score, checked with the geval tool (see the previous assignment),
#   should be at least 0.65
# - put the predictions and the generating scripts (as plain text, not
#   Jupyter) in the repo, and post a link to your repo on MS Teams
#
# deadline 22.06, 60 points; 100 points for the best result

# In[2]:


import numpy as np
import torch

from torchtext.vocab import Vocab
from collections import Counter
from tqdm.notebook import tqdm
import lzma
import itertools
from torchcrf import CRF


# In[3]:


def read_data(filename):
    """Read an xz-compressed TSV file and return its rows.

    Args:
        filename: Path to the ``.xz`` file.

    Returns:
        A list with one entry per line; each entry is the list of the line's
        tab-separated fields. The piece after the final newline is dropped.
    """
    # The context manager closes the compressed stream even if decoding fails.
    with lzma.open(filename) as handle:
        raw = handle.read()
    lines = raw.decode('UTF-8').split('\n')
    return [line.split('\t') for line in lines][:-1]


# In[4]:


def data_process(dt):
    """Convert tokenised documents into tensors of vocabulary ids.

    Every document is wrapped in the ``<bos>`` / ``<eos>`` ids, so each
    returned tensor is two elements longer than its document. Reads the
    module-level ``vocab``.
    """
    bos_id = vocab['<bos>']
    eos_id = vocab['<eos>']
    return [
        torch.tensor(
            [bos_id] + [vocab[token] for token in document] + [eos_id],
            dtype=torch.long,
        )
        for document in dt
    ]


# In[5]:


def labels_process(dt):
    """Convert per-document label-id lists into tensors.

    Label ``0`` is added at both ends so the labels line up with the
    boundary positions that ``data_process`` adds to the tokens.
    """
    return [torch.tensor([0] + document + [0], dtype=torch.long) for document in dt]


# In[6]:


def build_vocab(dataset):
    """Build a torchtext vocabulary from an iterable of tokenised documents."""
    counter = Counter()
    for document in dataset:
        counter.update(document)
    # The special tokens have to be distinct strings, and '<unk>' must be among
    # them: the legacy torchtext Vocab falls back to the '<unk>' entry for
    # tokens it has not seen, which happens on the dev/test data.
    # NOTE(review): the source had four empty strings here (and in
    # data_process), presumably markup stripped during export - confirm names.
    return Vocab(counter, specials=['<unk>', '<pad>', '<bos>', '<eos>'])


# In[7]:


train_data = read_data('train/train.tsv.xz')

tokens, ner_tags = [], []
for row in train_data:
    # Column 0 holds the space-separated tags, column 1 the tokens.
    ner_tags.append(row[0].split())
    tokens.append(row[1].split())


# In[8]:


vocab = build_vocab(tokens)


# In[9]:


train_tokens_ids = data_process(tokens)


# In[10]:


# Sorted list of distinct tags; a tag's position in this list is its label id.
ner_tags_set = sorted(set(itertools.chain(*ner_tags)))
print(ner_tags_set)
train_labels = labels_process(
    [[ner_tags_set.index(token) for token in doc] for doc in ner_tags]
)


# In[11]:
# Number of distinct label ids, as a plain int (not a 0-d tensor) so it can be
# used directly as a layer size.
num_tags = max(int(doc_labels.max()) for doc_labels in train_labels) + 1


# In[12]:


class GRU(torch.nn.Module):
    """Two-layer bidirectional GRU that produces per-token emission scores."""

    def __init__(self, num_labels=None):
        """Build the layers.

        Args:
            num_labels: Size of the output layer. Defaults to the module-level
                ``num_tags`` so the emissions always match the CRF.
        """
        super().__init__()
        if num_labels is None:
            num_labels = num_tags
        self.emb = torch.nn.Embedding(len(vocab.itos), 100)
        self.dropout = torch.nn.Dropout(0.2)
        # Callers pass ids shaped (seq_len, 1), i.e. sequence-first. With
        # batch_first=True the GRU would read that as seq_len independent
        # sequences of length 1 and never carry state between tokens.
        self.rec = torch.nn.GRU(100, 256, 2, batch_first=False, bidirectional=True)
        self.fc1 = torch.nn.Linear(2 * 256, num_labels)

    def forward(self, x):
        """Return emission scores for token ids laid out as (seq_len, batch).

        The result has shape (seq_len, batch, num_labels), which is the
        sequence-first layout the CRF is called with in this script.
        """
        emb = torch.relu(self.emb(x))
        emb = self.dropout(emb)
        gru_output, _ = self.rec(emb)
        return self.fc1(gru_output)


# In[13]:


def get_scores(y_true, y_pred):
    """Compute precision, recall and F1 over flat label-id sequences.

    Label ``0`` is treated as the negative class; every other id counts as a
    selected / relevant item, and a true positive requires an exact id match.
    NOTE(review): ids come from the sorted tag list, so 0 may not be the
    outside tag 'O' - confirm this metric is only meant as a rough monitor.

    Returns:
        ``(precision, recall, f1)``. Precision and recall are 1.0 when their
        denominator is zero; F1 is 0.0 when both are zero.
    """
    tp = 0
    selected_items = 0
    relevant_items = 0
    for p, t in zip(y_pred, y_true):
        if p > 0:
            selected_items += 1
            if p == t:
                tp += 1
        if t > 0:
            relevant_items += 1

    precision = tp / selected_items if selected_items else 1.0
    recall = tp / relevant_items if relevant_items else 1.0
    if precision + recall == 0.0:
        f1 = 0.0
    else:
        f1 = 2 * precision * recall / (precision + recall)
    return precision, recall, f1


# In[14]:


def eval_model(dataset_tokens, dataset_labels, model):
    """Decode every document with ``model`` + the global ``crf`` and score it.

    Args:
        dataset_tokens: Sequence of 1-D token-id tensors.
        dataset_labels: Sequence of 1-D label-id tensors, aligned with tokens.
        model: Emission model called on ids shaped (seq_len, 1).

    Returns:
        The ``(precision, recall, f1)`` tuple from ``get_scores``.
    """
    Y_true = []
    Y_pred = []
    # No gradients are needed for evaluation; skipping them saves memory.
    with torch.no_grad():
        for doc_tokens, doc_labels in tqdm(
            zip(dataset_tokens, dataset_labels), total=len(dataset_labels)
        ):
            emissions = model(doc_tokens.unsqueeze(1))
            Y_pred += crf.decode(emissions)[0]
            Y_true += doc_labels.tolist()
    return get_scores(Y_true, Y_pred)


# In[15]:


gru = GRU()
crf = CRF(num_tags)


# In[16]:


# The GRU and the CRF transition parameters are optimised jointly.
params = list(gru.parameters()) + list(crf.parameters())
optimizer = torch.optim.Adam(params)


# In[17]:


NUM_EPOCHS = 20


# In[18]:


# Not used by the training loop below (the loss is the CRF negative
# log-likelihood); kept so the module-level name stays available.
criterion = torch.nn.CrossEntropyLoss()


# In[19]:


for epoch in range(NUM_EPOCHS):
    gru.train()
    crf.train()
    # One document per step; unsqueeze(1) adds the batch dimension of size 1.
    for doc_tokens, doc_labels in tqdm(
        zip(train_tokens_ids, train_labels), total=len(train_labels)
    ):
        optimizer.zero_grad()
        emissions = gru(doc_tokens.unsqueeze(1))
        # The CRF returns the log-likelihood; negate it to get a loss.
        loss = -crf(emissions, doc_labels.unsqueeze(1))
        loss.backward()
        optimizer.step()

    gru.eval()
    crf.eval()
    print(eval_model(train_tokens_ids, train_labels, gru))
# ## dev-0 and test-A

# In[20]:


def _decode_documents(dataset_tokens, model):
    """Return the CRF-decoded label ids (one list per document)."""
    decoded = []
    # Inference only: no autograd graph is needed.
    with torch.no_grad():
        for doc_tokens in tqdm(dataset_tokens):
            # unsqueeze(1) adds a batch dimension of size 1 after the sequence.
            emissions = model(doc_tokens.unsqueeze(1))
            decoded.append(crf.decode(emissions)[0])
    return decoded


def _ids_to_tags(decoded):
    """Map label ids to tag strings, dropping the first and last position."""
    return [[ner_tags_set[label_id] for label_id in doc][1:-1] for doc in decoded]


def _fix_bio_prefixes(doc):
    """Rewrite tag prefixes in place so the tags form a consistent B/I chain.

    A non-'O' tag becomes 'B...' when it starts the document, follows 'O', or
    follows a tag with a different suffix; it becomes 'I...' when it follows a
    tag with the same suffix. Empty documents are left untouched.
    """
    if not doc:
        return doc
    if doc[0] != 'O':
        doc[0] = 'B' + doc[0][1:]
    for i in range(len(doc) - 1):
        following = doc[i + 1]
        if following == 'O':
            continue
        # doc[i] is read after any rewrite done in the previous iteration.
        if doc[i] != 'O' and doc[i][1:] == following[1:]:
            doc[i + 1] = 'I' + following[1:]
        else:
            doc[i + 1] = 'B' + following[1:]
    return doc


def _write_predictions(out_path, tagged_docs):
    """Write one space-joined line of tags per document to ``out_path``."""
    # "w" rather than "a": appending duplicated the predictions whenever the
    # same output file was written more than once.
    with open(out_path, "w", encoding="utf-8") as out_file:
        out_file.writelines(' '.join(doc) + '\n' for doc in tagged_docs)


def predict_labels(dataset_tokens, dataset_labels, model):
    """Decode a labelled dataset, print its scores and return the label ids.

    Args:
        dataset_tokens: Sequence of 1-D token-id tensors.
        dataset_labels: Sequence of 1-D label-id tensors aligned with tokens.
        model: Emission model called on ids shaped (seq_len, 1).

    Returns:
        A list with one list of predicted label ids per document (boundary
        positions included).
    """
    print(len(dataset_tokens[0]), len(dataset_labels[0]))
    # Only documents that have gold labels are decoded.
    result = _decode_documents(dataset_tokens[:len(dataset_labels)], model)
    Y_pred = [label_id for doc in result for label_id in doc]
    Y_true = [label_id for doc_labels in dataset_labels for label_id in doc_labels.tolist()]
    print(get_scores(Y_true, Y_pred))
    return result


# In[21]:


with open('dev-0/in.tsv', "r", encoding="utf-8") as f:
    dev_0_data = [line.rstrip().split() for line in f]
dev_0_tokens_ids = data_process(dev_0_data)


# In[22]:


with open('dev-0/expected.tsv', "r", encoding="utf-8") as f:
    dev_0_labels = [line.rstrip().split() for line in f]
dev_0_labels = labels_process(
    [[ner_tags_set.index(token) for token in doc] for doc in dev_0_labels]
)


# In[23]:


tmp = predict_labels(dev_0_tokens_ids, dev_0_labels, gru)


# In[24]:


r = _ids_to_tags(tmp)


# In[25]:


for doc in r:
    _fix_bio_prefixes(doc)


# In[26]:


_write_predictions("dev-0/out.tsv", r)


# In[27]:


def predict(path, model):
    """Tag ``path + '/in.tsv'`` and write the result to ``path + '/out.tsv'``.

    Args:
        path: Directory that contains ``in.tsv``; ``out.tsv`` is written there.
        model: Emission model called on ids shaped (seq_len, 1).

    Returns:
        A list with one list of predicted label ids per input line (boundary
        positions included).
    """
    with open(path + '/in.tsv', "r", encoding="utf-8") as f:
        data = [line.rstrip().split() for line in f]
    tokens_ids = data_process(data)

    result = _decode_documents(tokens_ids, model)

    tagged = _ids_to_tags(result)
    for doc in tagged:
        _fix_bio_prefixes(doc)
    _write_predictions(path + "/out.tsv", tagged)
    return result


result = predict('dev-0', gru)
result = predict('test-A', gru)