add script

2021-06-22 20:21:17 +02:00 · 2021-06-22 20:21:17 +02:00 · 499702ff9c
commit 499702ff9c
parent 994d1d6515
2 changed files with 1743 additions and 0 deletions
--- a/rnn_fras.ipynb
+++ b/rnn_fras.ipynb
--- a/rnn_fras.py
+++ b/rnn_fras.py
@ -0,0 +1,340 @@
+#!/usr/bin/env python
+# coding: utf-8
+
+# ## Zadanie domowe
+# 
+# 
+# - sklonować repozytorium https://git.wmi.amu.edu.pl/kubapok/en-ner-conll-2003
+# - stworzyć model seq labelling bazujący na sieci neuronowej opisanej w punkcie niżej (można bazować na tym jupyterze lub nie).
+# - model sieci to GRU (o dowolnych parametrach) + CRF w pytorchu korzystając z modułu CRF z poprzednich zajęć
+# - stworzyć predykcje w plikach dev-0/out.tsv oraz test-A/out.tsv
+# - wynik fscore sprawdzony za pomocą narzędzia geval (patrz poprzednie zadanie) powinien wynosić co najmniej 0.65
+# - proszę umieścić predykcję oraz skrypty generujące (w postaci tekstowej a nie jupyter) w repo, a w MS TEAMS umieścić link do swojego repo
+# termin 22.06, 60 punktów, za najlepszy wynik- 100 punktów
+#  
+
+# In[2]:
+
+
+import numpy as np
+import torch
+from torchtext.vocab import Vocab
+from collections import Counter
+from tqdm.notebook import tqdm
+import lzma
+import itertools
+from torchcrf import CRF
+
+
+# In[3]:
+
+
+def read_data(filename):
+    """Read an xz-compressed, tab-separated file into a list of rows.
+
+    Args:
+        filename: Path to the ``.xz`` file; its content is decoded as UTF-8.
+
+    Returns:
+        A list with one entry per line, each entry being the list of
+        tab-separated fields of that line.
+    """
+    # The context manager closes the archive even if reading/decoding fails.
+    with lzma.open(filename) as archive:
+        lines = archive.read().decode('UTF-8').split('\n')
+    # A file ending in '\n' yields one trailing empty string; drop only that,
+    # so the last record survives when the final newline is missing.
+    if lines and lines[-1] == '':
+        lines.pop()
+    return [line.split('\t') for line in lines]
+
+
+# In[4]:
+
+
+def data_process(dt):
+    """Turn tokenised documents into tensors of vocabulary ids.
+
+    Every document is wrapped in the ``<bos>`` / ``<eos>`` ids looked up in
+    the module-level ``vocab``.
+
+    Args:
+        dt: Iterable of documents, each an iterable of token strings.
+
+    Returns:
+        A list with one 1-D ``torch.long`` tensor per document.
+    """
+    encoded = []
+    for document in dt:
+        ids = [vocab['<bos>']]
+        ids.extend(vocab[token] for token in document)
+        ids.append(vocab['<eos>'])
+        encoded.append(torch.tensor(ids, dtype=torch.long))
+    return encoded
+
+
+# In[5]:
+
+
+def labels_process(dt):
+    """Convert per-document label-id lists into ``torch.long`` tensors.
+
+    A ``0`` is placed before and after each document's labels, matching the
+    extra first/last positions that ``data_process`` adds to the tokens.
+
+    Args:
+        dt: Iterable of documents, each a list of integer label ids.
+
+    Returns:
+        A list with one 1-D ``torch.long`` tensor per document.
+    """
+    tensors = []
+    for document in dt:
+        padded = [0] + document + [0]
+        tensors.append(torch.tensor(padded, dtype=torch.long))
+    return tensors
+
+
+# In[6]:
+
+
+def build_vocab(dataset):
+    """Build a torchtext ``Vocab`` from the token frequencies in ``dataset``.
+
+    Args:
+        dataset: Iterable of documents, each an iterable of tokens.
+
+    Returns:
+        A ``Vocab`` built from the token counts with the specials
+        ``<unk>``, ``<pad>``, ``<bos>`` and ``<eos>``.
+    """
+    # Flatten all documents into one token stream and count in a single pass.
+    token_counts = Counter(itertools.chain.from_iterable(dataset))
+    return Vocab(token_counts, specials=['<unk>', '<pad>', '<bos>', '<eos>'])
+
+
+# In[7]:
+
+
+train_data = read_data('train/train.tsv.xz')
+
+# Column 0 of every row holds the space-separated NER tags, column 1 the
+# space-separated tokens.
+ner_tags = [row[0].split() for row in train_data]
+tokens = [row[1].split() for row in train_data]
+
+
+# In[8]:
+
+
+# Vocabulary is built from the training tokens only.
+vocab = build_vocab(tokens)
+
+
+# In[9]:
+
+
+# One id tensor per training document, including the <bos>/<eos> positions.
+train_tokens_ids = data_process(tokens)
+
+
+# In[10]:
+
+
+# Sorted list of the distinct tag strings seen in training; a tag's position
+# in this list is its integer label id.
+ner_tags_set = sorted(set(itertools.chain.from_iterable(ner_tags)))
+print(ner_tags_set)
+# NOTE(review): labels_process pads with id 0 and get_scores treats id 0 as
+# the negative class, but after sorting id 0 is simply the alphabetically
+# first tag, which is 'O' only if no other tag sorts before it - confirm
+# this is intended.
+train_labels = labels_process([[ner_tags_set.index(token) for token in doc] for doc in ner_tags])
+
+
+# In[11]:
+
+
+# Every label id indexes ner_tags_set, so the number of tags is its length.
+# Taking len() keeps num_tags a plain int instead of a 0-dim tensor and
+# avoids walking every label tensor element by element.
+num_tags = len(ner_tags_set)
+
+
+# In[12]:
+
+
+class GRU(torch.nn.Module):
+    """Bidirectional GRU tagger that produces per-token emission scores.
+
+    The input is a tensor of token ids shaped ``(seq_len, batch)`` - the
+    layout this script builds with ``unsqueeze(1)`` - and the output is
+    shaped ``(seq_len, batch, num_tags)``.
+    """
+
+    def __init__(self, vocab_size=None, embedding_dim=100, hidden_size=256,
+                 num_layers=2, dropout=0.2, num_tags=9):
+        """Create the embedding, dropout, GRU and output layers.
+
+        Args:
+            vocab_size: Number of rows in the embedding table; defaults to
+                the size of the module-level ``vocab``.
+            embedding_dim: Size of each token embedding.
+            hidden_size: Hidden size of the GRU, per direction.
+            num_layers: Number of stacked GRU layers.
+            dropout: Dropout probability applied to the embeddings.
+            num_tags: Number of emission scores produced per token.
+        """
+        super().__init__()
+        if vocab_size is None:
+            vocab_size = len(vocab.itos)
+        self.emb = torch.nn.Embedding(vocab_size, embedding_dim)
+        self.dropout = torch.nn.Dropout(dropout)
+        # batch_first=False: the ids arrive as (seq_len, batch), so the
+        # recurrence has to run along dim 0. With batch_first=True each token
+        # would be handled as a separate length-1 sequence with no context.
+        self.rec = torch.nn.GRU(embedding_dim, hidden_size, num_layers,
+                                batch_first=False, bidirectional=True)
+        # Both directions are concatenated, hence 2 * hidden_size inputs.
+        self.fc1 = torch.nn.Linear(2 * hidden_size, num_tags)
+
+    def forward(self, x):
+        """Return emission scores for the token ids in ``x``.
+
+        Args:
+            x: ``torch.long`` tensor of token ids shaped ``(seq_len, batch)``.
+
+        Returns:
+            Tensor shaped ``(seq_len, batch, num_tags)``.
+        """
+        emb = torch.relu(self.emb(x))
+        emb = self.dropout(emb)
+        gru_output, _ = self.rec(emb)
+        out_weights = self.fc1(gru_output)
+        return out_weights
+
+
+# In[13]:
+
+
+def get_scores(y_true, y_pred):
+    """Compute token-level precision, recall and F1 with label 0 as negative.
+
+    Labels are compared pairwise; pairs beyond the shorter sequence are
+    ignored.
+
+    Args:
+        y_true: Sequence of gold label ids.
+        y_pred: Sequence of predicted label ids.
+
+    Returns:
+        Tuple ``(precision, recall, f1)``. Precision is 1.0 when nothing
+        non-zero was predicted, recall is 1.0 when no gold label is non-zero,
+        and F1 is 0.0 when precision and recall are both zero.
+    """
+    pairs = list(zip(y_pred, y_true))
+    true_positives = sum(1 for pred, gold in pairs if pred > 0 and pred == gold)
+    predicted_positives = sum(1 for pred, _ in pairs if pred > 0)
+    gold_positives = sum(1 for _, gold in pairs if gold > 0)
+
+    precision = true_positives / predicted_positives if predicted_positives else 1.0
+    recall = true_positives / gold_positives if gold_positives else 1.0
+
+    if precision + recall == 0.0:
+        return precision, recall, 0.0
+    f1 = 2 * precision * recall / (precision + recall)
+    return precision, recall, f1
+
+
+# In[14]:
+
+
+def eval_model(dataset_tokens, dataset_labels, model):
+    """Score ``model`` combined with the module-level ``crf`` on a dataset.
+
+    Args:
+        dataset_tokens: List of 1-D token-id tensors, one per document.
+        dataset_labels: List of 1-D label-id tensors aligned with
+            ``dataset_tokens``.
+        model: Module mapping ``(seq_len, 1)`` token ids to emission scores.
+
+    Returns:
+        The ``(precision, recall, f1)`` tuple from ``get_scores`` computed
+        over all positions of all documents.
+    """
+    Y_true = []
+    Y_pred = []
+    # Inference only, so skip building autograd graphs.
+    with torch.no_grad():
+        for token_ids, labels in tqdm(zip(dataset_tokens, dataset_labels),
+                                      total=len(dataset_labels)):
+            # Use the model that was passed in rather than a global one.
+            emissions = model(token_ids.unsqueeze(1))
+            # One document per call, so take the first decoded sequence.
+            Y_pred += crf.decode(emissions)[0]
+            Y_true += labels.tolist()
+    return get_scores(Y_true, Y_pred)
+
+
+# In[15]:
+
+
+# Emission network and the CRF layer that scores/decodes tag sequences.
+gru = GRU()
+crf = CRF(num_tags)
+
+
+# In[16]:
+
+
+# A single optimizer updates the parameters of both modules.
+params = [*gru.parameters(), *crf.parameters()]
+optimizer = torch.optim.Adam(params)
+
+
+# In[17]:
+
+
+NUM_EPOCHS = 20
+
+
+# In[18]:
+
+
+# NOTE(review): criterion is not referenced by the training loop in this
+# script, which uses -crf(...) as its loss - confirm whether it is needed.
+criterion = torch.nn.CrossEntropyLoss()
+
+
+# In[19]:
+
+
+for epoch in range(NUM_EPOCHS):
+    # Switch both modules to training mode for the parameter updates.
+    gru.train()
+    crf.train()
+    # One document per optimisation step.
+    for token_ids, labels in tqdm(zip(train_tokens_ids, train_labels),
+                                  total=len(train_labels)):
+        token_column = token_ids.unsqueeze(1)
+        tag_column = labels.unsqueeze(1)
+        emissions = gru(token_column).squeeze(0)
+        optimizer.zero_grad()
+        # The CRF output is negated to obtain the quantity being minimised.
+        loss = -crf(emissions, tag_column.squeeze(0))
+        loss.backward()
+        optimizer.step()
+    # Back to evaluation mode before scoring on the training set.
+    gru.eval()
+    crf.eval()
+    print(eval_model(train_tokens_ids, train_labels, gru))
+
+
+# ## dev-0 i test-A
+
+# In[20]:
+
+
+def predict_labels(dataset_tokens, dataset_labels, model):
+    """Decode label ids for every document and print token-level scores.
+
+    Args:
+        dataset_tokens: List of 1-D token-id tensors, one per document.
+        dataset_labels: List of 1-D label-id tensors aligned with
+            ``dataset_tokens``.
+        model: Module mapping ``(seq_len, 1)`` token ids to emission scores.
+
+    Returns:
+        A list with one list of decoded label ids per document, covering
+        every input position (including the first and last one).
+    """
+    # Debug output: lengths of the first document; skipped for empty input.
+    if dataset_tokens and dataset_labels:
+        print(len(dataset_tokens[0]), len(dataset_labels[0]))
+    Y_true = []
+    Y_pred = []
+    result = []
+    # Inference only, so skip building autograd graphs.
+    with torch.no_grad():
+        for token_ids, labels in tqdm(zip(dataset_tokens, dataset_labels),
+                                      total=len(dataset_labels)):
+            # Use the model that was passed in rather than a global one.
+            emissions = model(token_ids.unsqueeze(1))
+            # One document per call, so take the first decoded sequence.
+            decoded = crf.decode(emissions)[0]
+            Y_pred += decoded
+            result.append(decoded)
+            Y_true += labels.tolist()
+    print(get_scores(Y_true, Y_pred))
+    return result
+
+
+# In[21]:
+
+
+# dev-0 inputs: one document per line, tokens separated by whitespace.
+with open('dev-0/in.tsv', "r", encoding="utf-8") as f:
+    dev_0_data = [line.rstrip().split() for line in f]
+
+dev_0_tokens_ids = data_process(dev_0_data)
+
+
+# In[22]:
+
+
+# dev-0 gold tags, one whitespace-separated tag sequence per line.
+with open('dev-0/expected.tsv', "r", encoding="utf-8") as f:
+    dev_0_labels = [line.rstrip().split() for line in f]
+
+# Map tag strings to their ids (position in ner_tags_set) and pad like the
+# training labels.
+dev_0_labels = labels_process([[ner_tags_set.index(tag) for tag in doc] for doc in dev_0_labels])
+
+
+# In[23]:
+
+
+tmp = predict_labels(dev_0_tokens_ids, dev_0_labels, gru)
+
+
+# In[24]:
+
+
+# Map ids back to tag strings and drop the first and last position of each
+# document (the ones added around the tokens by data_process).
+r = [[ner_tags_set[i] for i in tmp2][1:-1] for tmp2 in tmp]
+
+
+# In[25]:
+
+
+# Normalise the B-/I- prefixes in place: a non-'O' tag becomes I-<type> when
+# the previous tag is non-'O' with the same type, otherwise B-<type>.
+# Walking with a running "previous" tag also handles empty documents, which
+# indexing doc[0] directly would crash on.
+for doc in r:
+    previous = 'O'
+    for pos, tag in enumerate(doc):
+        if tag != 'O':
+            continues_entity = previous != 'O' and previous[1:] == tag[1:]
+            doc[pos] = ('I' if continues_entity else 'B') + tag[1:]
+        previous = doc[pos]
+
+
+# In[26]:
+
+
+# Open with "w" so that repeated runs do not keep accumulating old
+# predictions in the file; the context manager closes it even on error.
+with open("dev-0/out.tsv", "w", encoding="utf-8") as f:
+    f.writelines(' '.join(i) + '\n' for i in r)
+
+
+# In[27]:
+
+
+def _repair_bio_tags(doc):
+    """Normalise the B-/I- prefixes of ``doc`` (a list of tags) in place."""
+    previous = 'O'
+    for pos, tag in enumerate(doc):
+        if tag != 'O':
+            # Same type as the preceding non-'O' tag continues the entity.
+            continues_entity = previous != 'O' and previous[1:] == tag[1:]
+            doc[pos] = ('I' if continues_entity else 'B') + tag[1:]
+        previous = doc[pos]
+
+
+def predict(path, model):
+    """Tag ``<path>/in.tsv`` and write the predictions to ``<path>/out.tsv``.
+
+    The input has one whitespace-tokenised document per line. The output
+    file is overwritten and gets one line of space-separated tags per
+    document, with B-/I- prefixes normalised.
+
+    Args:
+        path: Directory containing ``in.tsv``.
+        model: Module mapping ``(seq_len, 1)`` token ids to emission scores;
+            decoding uses the module-level ``crf``.
+
+    Returns:
+        A list with one list of decoded label ids per document, covering
+        every input position (including the first and last one).
+    """
+    with open(path + '/in.tsv', "r", encoding="utf-8") as f:
+        data = [line.rstrip().split() for line in f]
+    tokens_ids = data_process(data)
+
+    result = []
+    # Inference only, so skip building autograd graphs.
+    with torch.no_grad():
+        for token_ids in tqdm(tokens_ids):
+            # Use the model that was passed in rather than a global one.
+            emissions = model(token_ids.unsqueeze(1))
+            # One document per call, so take the first decoded sequence.
+            result.append(crf.decode(emissions)[0])
+
+    # Map ids back to tag strings and drop the first and last position of
+    # each document (the ones added around the tokens by data_process).
+    r = [[ner_tags_set[tag_id] for tag_id in doc][1:-1] for doc in result]
+    for doc in r:
+        _repair_bio_tags(doc)
+
+    # "w" rather than "a": appending would duplicate predictions already in
+    # the file from an earlier write or an earlier run.
+    with open(path + "/out.tsv", "w", encoding="utf-8") as f:
+        f.writelines(' '.join(doc) + '\n' for doc in r)
+    return result
+
+# Produce out.tsv for both evaluation splits; `result` keeps the ids decoded
+# for the last split processed.
+for split_dir in ('dev-0', 'test-A'):
+    result = predict(split_dir, gru)
+