11

2022-06-07 14:56:08 +02:00 · 2022-06-07 14:56:08 +02:00 · 3d85ca4084
commit 3d85ca4084
parent 4080dfa194
2 changed files with 1826 additions and 0 deletions
--- a/cw/11_NER_RNN.ipynb
+++ b/cw/11_NER_RNN.ipynb
@ -0,0 +1,779 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "![Logo 1](https://git.wmi.amu.edu.pl/AITech/Szablon/raw/branch/master/Logotyp_AITech1.jpg)\n",
+    "<div class=\"alert alert-block alert-info\">\n",
+    "<h1> Ekstrakcja informacji </h1>\n",
+    "<h2> 11. <i>NER RNN</i>  [ćwiczenia]</h2> \n",
+    "<h3> Jakub Pokrywka (2021)</h3>\n",
+    "</div>\n",
+    "\n",
+    "![Logo 2](https://git.wmi.amu.edu.pl/AITech/Szablon/raw/branch/master/Logotyp_AITech2.jpg)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Podejście softmax z embeddingami na przykładzie NER"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import numpy as np\n",
+    "import gensim\n",
+    "import torch\n",
+    "import pandas as pd\n",
+    "import seaborn as sns\n",
+    "from sklearn.model_selection import train_test_split\n",
+    "\n",
+    "from datasets import load_dataset\n",
+    "import torchtext\n",
+    "#from torchtext.vocab import vocab\n",
+    "from collections import Counter\n",
+    "\n",
+    "from sklearn.datasets import fetch_20newsgroups\n",
+    "# https://scikit-learn.org/0.19/datasets/twenty_newsgroups.html\n",
+    "\n",
+    "from sklearn.feature_extraction.text import TfidfVectorizer\n",
+    "from sklearn.metrics import accuracy_score\n",
+    "\n",
+    "from tqdm.notebook import tqdm\n",
+    "\n",
+    "import torch"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {
+    "scrolled": true
+   },
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Reusing dataset conll2003 (/home/kuba/.cache/huggingface/datasets/conll2003/conll2003/1.0.0/63f4ebd1bcb7148b1644497336fd74643d4ce70123334431a3c053b7ee4e96ee)\n"
+     ]
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "7c9a8ca324914c40b7606ab8cd487df2",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "  0%|          | 0/3 [00:00<?, ?it/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    }
+   ],
+   "source": [
+    "dataset = load_dataset(\"conll2003\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def build_vocab(dataset):\n",
+    "    counter = Counter()\n",
+    "    for document in dataset:\n",
+    "        counter.update(document)\n",
+    "    vocab = torchtext.vocab.vocab(counter, specials=['<unk>', '<pad>', '<bos>', '<eos>'])\n",
+    "    vocab.set_default_index(0)\n",
+    "    return vocab"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "vocab = build_vocab(dataset['train']['tokens'])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "21"
+      ]
+     },
+     "execution_count": 5,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "vocab['on']"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def data_process(dt):\n",
+    "    return [ torch.tensor([vocab['<bos>']] +[vocab[token]  for token in  document ] + [vocab['<eos>']], dtype = torch.long) for document in dt]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def labels_process(dt):\n",
+    "    return [ torch.tensor([0] + document + [0], dtype = torch.long) for document in dt]\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "train_tokens_ids = data_process(dataset['train']['tokens'])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "test_tokens_ids = data_process(dataset['test']['tokens'])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 10,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "validation_tokens_ids =  data_process(dataset['validation']['tokens'])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 11,
+   "metadata": {
+    "scrolled": true
+   },
+   "outputs": [],
+   "source": [
+    "train_labels = labels_process(dataset['train']['ner_tags'])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 12,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "validation_labels = labels_process(dataset['validation']['ner_tags'])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 13,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "test_labels = labels_process(dataset['test']['ner_tags'])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 14,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "tensor([ 2,  4,  5,  6,  7,  8,  9, 10, 11, 12,  3])"
+      ]
+     },
+     "execution_count": 14,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "train_tokens_ids[0]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 15,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "{'id': '0',\n",
+       " 'tokens': ['EU',\n",
+       "  'rejects',\n",
+       "  'German',\n",
+       "  'call',\n",
+       "  'to',\n",
+       "  'boycott',\n",
+       "  'British',\n",
+       "  'lamb',\n",
+       "  '.'],\n",
+       " 'pos_tags': [22, 42, 16, 21, 35, 37, 16, 21, 7],\n",
+       " 'chunk_tags': [11, 21, 11, 12, 21, 22, 11, 12, 0],\n",
+       " 'ner_tags': [3, 0, 7, 0, 0, 0, 7, 0, 0]}"
+      ]
+     },
+     "execution_count": 15,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "dataset['train'][0]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 16,
+   "metadata": {
+    "scrolled": true
+   },
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "tensor([0, 3, 0, 7, 0, 0, 0, 7, 0, 0, 0])"
+      ]
+     },
+     "execution_count": 16,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "train_labels[0]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 17,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def get_scores(y_true, y_pred):\n",
+    "    acc_score = 0\n",
+    "    tp = 0\n",
+    "    fp = 0\n",
+    "    selected_items = 0\n",
+    "    relevant_items = 0 \n",
+    "\n",
+    "    for p,t in zip(y_pred, y_true):\n",
+    "        if p == t:\n",
+    "            acc_score +=1\n",
+    "\n",
+    "        if p > 0 and p == t:\n",
+    "            tp +=1\n",
+    "\n",
+    "        if p > 0:\n",
+    "            selected_items += 1\n",
+    "\n",
+    "        if t > 0 :\n",
+    "            relevant_items +=1\n",
+    "\n",
+    "            \n",
+    "            \n",
+    "    if selected_items == 0:\n",
+    "        precision = 1.0\n",
+    "    else:\n",
+    "        precision = tp / selected_items\n",
+    "        \n",
+    "            \n",
+    "    if relevant_items == 0:\n",
+    "        recall = 1.0\n",
+    "    else:\n",
+    "        recall = tp / relevant_items\n",
+    "    \n",
+    "    \n",
+    "    if precision + recall == 0.0 :\n",
+    "        f1 = 0.0\n",
+    "    else:\n",
+    "        f1 = 2* precision * recall  / (precision + recall)\n",
+    "\n",
+    "    return precision, recall, f1"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 18,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "num_tags = max([max(x) for x in dataset['train']['ner_tags'] if x]) + 1 "
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 19,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "class LSTM(torch.nn.Module):\n",
+    "\n",
+    "    def __init__(self):\n",
+    "        super(LSTM, self).__init__()\n",
+    "        self.emb = torch.nn.Embedding(len(vocab.get_itos()),100)\n",
+    "        self.rec = torch.nn.LSTM(100, 256, 1, batch_first = True)\n",
+    "        self.fc1 = torch.nn.Linear( 256 , 9)\n",
+    "\n",
+    "    def forward(self, x):\n",
+    "        emb = torch.relu(self.emb(x))\n",
+    "        \n",
+    "        lstm_output, (h_n, c_n) = self.rec(emb)\n",
+    "        \n",
+    "        out_weights = self.fc1(lstm_output)\n",
+    "\n",
+    "        return out_weights"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 20,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "lstm = LSTM()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 21,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "criterion = torch.nn.CrossEntropyLoss()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 22,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "optimizer = torch.optim.Adam(lstm.parameters())"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 23,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def eval_model(dataset_tokens, dataset_labels, model):\n",
+    "    Y_true = []\n",
+    "    Y_pred = []\n",
+    "    for i in tqdm(range(len(dataset_labels))):\n",
+    "        batch_tokens = dataset_tokens[i].unsqueeze(0)\n",
+    "        tags = list(dataset_labels[i].numpy())\n",
+    "        Y_true += tags\n",
+    "        \n",
+    "        Y_batch_pred_weights = model(batch_tokens).squeeze(0)\n",
+    "        Y_batch_pred = torch.argmax(Y_batch_pred_weights,1)\n",
+    "        Y_pred += list(Y_batch_pred.numpy())\n",
+    "        \n",
+    "\n",
+    "    return get_scores(Y_true, Y_pred)\n",
+    "        "
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 24,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "NUM_EPOCHS = 5"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 25,
+   "metadata": {
+    "scrolled": true
+   },
+   "outputs": [
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "59e268fa2b29414fb6306ec4ee44d51f",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "  0%|          | 0/500 [00:00<?, ?it/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "77f4b857b41143429af8391023430e23",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "  0%|          | 0/3251 [00:00<?, ?it/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "(0.2310126582278481, 0.02545623619667558, 0.04585907234844519)\n"
+     ]
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "558d39ff9ab34f458e4d64f24028fe50",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "  0%|          | 0/500 [00:00<?, ?it/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "3094de37bef4484a87ed4789bfc85bdc",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "  0%|          | 0/3251 [00:00<?, ?it/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "(0.22903453136011276, 0.15111007787980937, 0.1820855802227047)\n"
+     ]
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "8baf610abdb04715924dba6109782efd",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "  0%|          | 0/500 [00:00<?, ?it/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "204b0274b9ea42caa10d8d05838ed035",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "  0%|          | 0/3251 [00:00<?, ?it/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "(0.22289679098005205, 0.20911310008136696, 0.21578505457598657)\n"
+     ]
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "fc6e663b99614e0e8c2382ef93a6402f",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "  0%|          | 0/500 [00:00<?, ?it/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "b963be2045a7494499c309693632e506",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "  0%|          | 0/3251 [00:00<?, ?it/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "(0.2553244180287271, 0.23968383122166687, 0.2472570297979495)\n"
+     ]
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "9e941358d44949c5a0f147f2287bf226",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "  0%|          | 0/500 [00:00<?, ?it/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "2363bcab950947b8bff899cd01f4ec0a",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "  0%|          | 0/3251 [00:00<?, ?it/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "(0.26687507236308905, 0.2679297919330466, 0.26740139211136893)\n"
+     ]
+    }
+   ],
+   "source": [
+    "for i in range(NUM_EPOCHS):\n",
+    "    lstm.train()\n",
+    "    for i in tqdm(range(500)):\n",
+    "    #for i in tqdm(range(len(train_labels))):\n",
+    "        batch_tokens = train_tokens_ids[i].unsqueeze(0)\n",
+    "        tags = train_labels[i].unsqueeze(1)\n",
+    "        \n",
+    "        \n",
+    "        predicted_tags = lstm(batch_tokens)\n",
+    "\n",
+    "        \n",
+    "        optimizer.zero_grad()\n",
+    "        loss  = criterion(predicted_tags.squeeze(0),tags.squeeze(1))\n",
+    "        \n",
+    "        loss.backward()\n",
+    "        optimizer.step()\n",
+    "        \n",
+    "    lstm.eval()\n",
+    "    print(eval_model(validation_tokens_ids, validation_labels, lstm))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 26,
+   "metadata": {
+    "scrolled": true
+   },
+   "outputs": [
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "11001c61092a4fd89efd1e155f6b0682",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "  0%|          | 0/3251 [00:00<?, ?it/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "text/plain": [
+       "(0.26687507236308905, 0.2679297919330466, 0.26740139211136893)"
+      ]
+     },
+     "execution_count": 26,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "eval_model(validation_tokens_ids, validation_labels, lstm)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 27,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "90336a538c2443608d45e094cc62e916",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "  0%|          | 0/3454 [00:00<?, ?it/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "text/plain": [
+       "(0.2493934363427404, 0.24075443786982248, 0.24499780467916954)"
+      ]
+     },
+     "execution_count": 27,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "eval_model(test_tokens_ids, test_labels, lstm)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 28,
+   "metadata": {
+    "scrolled": true
+   },
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "14042"
+      ]
+     },
+     "execution_count": 28,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "len(train_tokens_ids)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## pytania\n",
+    "\n",
+    "- co zrobić, żeby trenować na batchach o rozmiarze > 1?\n",
+    "- co zrobić, żeby sieć uwzględniała następne tokeny, a nie tylko poprzednie?\n",
+    "- w jaki sposób wykorzystać taką sieć do zadania zwykłej klasyfikacji?"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Zadanie na zajęcia (20 minut)\n",
+    "\n",
+    "Zmodyfikować sieć tak, żeby używała dwuwarstwowej, dwukierunkowej warstwy GRU oraz dropoutu. Dropout ma być nałożony na embeddingi.\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Zadanie domowe\n",
+    "\n",
+    "\n",
+    "- stworzyć model seq labelling bazujący na sieci neuronowej opisanej w punkcie niżej (można bazować na tym jupyterze lub nie).\n",
+    "- model sieci to GRU (o dowolnych parametrach) + CRF w PyTorchu, z wykorzystaniem modułu CRF z poprzednich zajęć\n",
+    "- stworzyć predykcje w plikach dev-0/out.tsv oraz test-A/out.tsv\n",
+    "- wynik fscore sprawdzony za pomocą narzędzia geval (patrz poprzednie zadanie) powinien wynosić co najmniej 0.65\n",
+    "- termin: 22.06, 60 punktów; za najlepszy wynik — 100 punktów\n",
+    " "
+   ]
+  }
+ ],
+ "metadata": {
+  "author": "Jakub Pokrywka",
+  "email": "kubapok@wmi.amu.edu.pl",
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "lang": "pl",
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.10.4"
+  },
+  "subtitle": "11.NER RNN[ćwiczenia]",
+  "title": "Ekstrakcja informacji",
+  "year": "2021"
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}
--- a/cw/11_NER_RNN_ODPOWIEDZI.ipynb
+++ b/cw/11_NER_RNN_ODPOWIEDZI.ipynb