From 55f0bea16b5a20d7a9945fc1c38c8f278d3c1578 Mon Sep 17 00:00:00 2001 From: Jakub Pokrywka Date: Wed, 1 Jun 2022 09:50:08 +0200 Subject: [PATCH] 10 --- cw/10_CRF.ipynb | 755 ++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 755 insertions(+) create mode 100644 cw/10_CRF.ipynb diff --git a/cw/10_CRF.ipynb b/cw/10_CRF.ipynb new file mode 100644 index 0000000..62f8d51 --- /dev/null +++ b/cw/10_CRF.ipynb @@ -0,0 +1,755 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "![Logo 1](https://git.wmi.amu.edu.pl/AITech/Szablon/raw/branch/master/Logotyp_AITech1.jpg)\n", + "
\n", + "

Ekstrakcja informacji

\n", + "

10. CRF [ćwiczenia]

\n", + "

Jakub Pokrywka (2021)

\n", + "
\n", + "\n", + "![Logo 2](https://git.wmi.amu.edu.pl/AITech/Szablon/raw/branch/master/Logotyp_AITech2.jpg)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Podejście softmax z embeddingami na przykładzie NER" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "scrolled": true + }, + "source": [ + "https://pytorch-crf.readthedocs.io/en/stable/" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "https://www.aclweb.org/anthology/W03-0419.pdf" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Requirement already satisfied: pytorch-crf in /home/kuba/anaconda3/envs/zajeciaei/lib/python3.10/site-packages (0.7.2)\r\n" + ] + } + ], + "source": [ + "!pip install pytorch-crf" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import numpy as np\n", + "import gensim\n", + "import torch\n", + "import pandas as pd\n", + "import seaborn as sns\n", + "import torchtext\n", + "from sklearn.model_selection import train_test_split\n", + "\n", + "from datasets import load_dataset\n", + "from torchtext.vocab import Vocab\n", + "from collections import Counter\n", + "\n", + "from sklearn.datasets import fetch_20newsgroups\n", + "# https://scikit-learn.org/0.19/datasets/twenty_newsgroups.html\n", + "\n", + "from sklearn.feature_extraction.text import TfidfVectorizer\n", + "from sklearn.metrics import accuracy_score\n", + "\n", + "from tqdm.notebook import tqdm\n", + "\n", + "import torch\n", + "from torchcrf import CRF" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": { + "scrolled": false + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Reusing dataset conll2003 (/home/kuba/.cache/huggingface/datasets/conll2003/conll2003/1.0.0/63f4ebd1bcb7148b1644497336fd74643d4ce70123334431a3c053b7ee4e96ee)\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "60fb8337cb5b4ab28969b9e1d60a851c", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + " 0%| | 0/3 [00:00', '', '', ''])\n", + " vocab.set_default_index(0)\n", + " return vocab" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "vocab = build_vocab(dataset['train']['tokens'])" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "21" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "vocab['on']" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "def data_process(dt):\n", + " return [ torch.tensor([vocab['']] +[vocab[token] for token in document ] + [vocab['']], dtype = torch.long) for document in dt]" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [], + "source": [ + "def labels_process(dt):\n", + " return [ torch.tensor([0] + document + [0], dtype = torch.long) for document in dt]\n" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [], + "source": [ + "train_tokens_ids = data_process(dataset['train']['tokens'])" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [], + "source": [ + "test_tokens_ids = data_process(dataset['test']['tokens'])" + ] + }, + { 
+ "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [], + "source": [ + "validation_tokens_ids = data_process(dataset['validation']['tokens'])" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "train_labels = labels_process(dataset['train']['ner_tags'])" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [], + "source": [ + "validation_labels = labels_process(dataset['validation']['ner_tags'])" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [], + "source": [ + "test_labels = labels_process(dataset['test']['ner_tags'])" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "tensor([ 2, 4, 5, 6, 7, 8, 9, 10, 11, 12, 3])" + ] + }, + "execution_count": 14, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "train_tokens_ids[0]" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [], + "source": [ + "def get_scores(y_true, y_pred):\n", + " acc_score = 0\n", + " tp = 0\n", + " fp = 0\n", + " selected_items = 0\n", + " relevant_items = 0 \n", + "\n", + " for p,t in zip(y_pred, y_true):\n", + " if p == t:\n", + " acc_score +=1\n", + "\n", + " if p > 0 and p == t:\n", + " tp +=1\n", + "\n", + " if p > 0:\n", + " selected_items += 1\n", + "\n", + " if t > 0 :\n", + " relevant_items +=1\n", + "\n", + " \n", + " \n", + " if selected_items == 0:\n", + " precision = 1.0\n", + " else:\n", + " precision = tp / selected_items\n", + " \n", + " \n", + " if relevant_items == 0:\n", + " recall = 1.0\n", + " else:\n", + " recall = tp / relevant_items\n", + " \n", + " \n", + " if precision + recall == 0.0 :\n", + " f1 = 0.0\n", + " else:\n", + " f1 = 2* precision * recall / (precision + recall)\n", + "\n", + " return precision, recall, f1" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [], + "source": [ + "num_tags = max([max(x) for x in dataset['train']['ner_tags'] if x]) + 1 " + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [], + "source": [ + "class FF(torch.nn.Module):\n", + "\n", + " def __init__(self,):\n", + " super(FF, self).__init__()\n", + " self.emb = torch.nn.Embedding(23627,200)\n", + " self.fc1 = torch.nn.Linear(200,num_tags)\n", + " \n", + "\n", + " def forward(self, x):\n", + " x = self.emb(x)\n", + " x = self.fc1(x)\n", + " return x" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [], + "source": [ + "ff = FF()" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "crf = CRF(num_tags)" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": {}, + "outputs": [], + "source": [ + "params = list(ff.parameters()) + list(crf.parameters())\n", + "\n", + "optimizer = torch.optim.Adam(params)" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": {}, + "outputs": [], + "source": [ + "def eval_model(dataset_tokens, dataset_labels):\n", + " Y_true = []\n", + " Y_pred = []\n", + " ff.eval()\n", + " crf.eval()\n", + " for i in tqdm(range(len(dataset_labels))):\n", + " batch_tokens = dataset_tokens[i]\n", + " tags = list(dataset_labels[i].numpy())\n", + " emissions = ff(batch_tokens).unsqueeze(1)\n", + " Y_pred += 
crf.decode(emissions)[0]\n", + " Y_true += tags\n", + "\n", + " return get_scores(Y_true, Y_pred)\n", + " " + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": {}, + "outputs": [], + "source": [ + "NUM_EPOCHS = 4" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "scrolled": false + }, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "fc9748f7f63c47fea274592f4dba2c73", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + " 0%| | 0/14042 [00:00 \u001b[0;32m/tmp/ipykernel_306568/4048919537.py\u001b[0m(12)\u001b[0;36m\u001b[0;34m()\u001b[0m\n", + "\u001b[0;32m 10 \u001b[0;31m \u001b[0mloss\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m-\u001b[0m\u001b[0mcrf\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0memissions\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0mtags\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0m\u001b[0;32m 11 \u001b[0;31m \u001b[0;32mimport\u001b[0m \u001b[0mpdb\u001b[0m\u001b[0;34m;\u001b[0m \u001b[0mpdb\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mset_trace\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0m\u001b[0;32m---> 12 \u001b[0;31m \u001b[0mloss\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mbackward\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0m\u001b[0;32m 13 \u001b[0;31m \u001b[0moptimizer\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mstep\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0m\u001b[0;32m 14 \u001b[0;31m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0m\n", + "ipdb> batch_tokens\n", + "tensor([ 2, 4, 5, 6, 7, 8, 9, 10, 11, 12, 3])\n", + "ipdb> tags.shape\n", + "torch.Size([11, 1])\n", + "ipdb> tags\n", + "tensor([[0],\n", + " [3],\n", + " [0],\n", + " [7],\n", + " [0],\n", + " [0],\n", + " [0],\n", + " [7],\n", + " [0],\n", + " [0],\n", + " [0]])\n" + ] + } + ], + "source": [ + "for i in range(NUM_EPOCHS):\n", + " ff.train()\n", + " crf.train()\n", + " for i in tqdm(range(len(train_labels))):\n", + " batch_tokens = train_tokens_ids[i]\n", + " tags = train_labels[i].unsqueeze(1)\n", + " emissions = ff(batch_tokens).unsqueeze(1)\n", + "\n", + " optimizer.zero_grad()\n", + " loss = -crf(emissions,tags)\n", + " import pdb; pdb.set_trace()\n", + " loss.backward()\n", + " optimizer.step()\n", + " \n", + " ff.eval()\n", + " crf.eval()\n", + " print(eval_model(validation_tokens_ids, validation_labels))" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['T_destination',\n", + " '__annotations__',\n", + " '__call__',\n", + " '__class__',\n", + " '__delattr__',\n", + " '__dict__',\n", + " '__dir__',\n", + " '__doc__',\n", + " '__eq__',\n", + " '__format__',\n", + " '__ge__',\n", + " '__getattr__',\n", + " '__getattribute__',\n", + " '__gt__',\n", + " '__hash__',\n", + " '__init__',\n", + " '__init_subclass__',\n", + " '__le__',\n", + " '__lt__',\n", + " '__module__',\n", + " '__ne__',\n", + " '__new__',\n", + " '__reduce__',\n", + " '__reduce_ex__',\n", + " '__repr__',\n", + " '__setattr__',\n", + " '__setstate__',\n", + " '__sizeof__',\n", + " '__str__',\n", + " '__subclasshook__',\n", + " '__weakref__',\n", + " '_apply',\n", + " '_backward_hooks',\n", + " '_buffers',\n", + " '_call_impl',\n", + " '_compute_normalizer',\n", + " '_compute_score',\n", + 
" '_forward_hooks',\n", + " '_forward_pre_hooks',\n", + " '_get_backward_hooks',\n", + " '_get_name',\n", + " '_is_full_backward_hook',\n", + " '_load_from_state_dict',\n", + " '_load_state_dict_pre_hooks',\n", + " '_maybe_warn_non_full_backward_hook',\n", + " '_modules',\n", + " '_named_members',\n", + " '_non_persistent_buffers_set',\n", + " '_parameters',\n", + " '_register_load_state_dict_pre_hook',\n", + " '_register_state_dict_hook',\n", + " '_replicate_for_data_parallel',\n", + " '_save_to_state_dict',\n", + " '_slow_forward',\n", + " '_state_dict_hooks',\n", + " '_validate',\n", + " '_version',\n", + " '_viterbi_decode',\n", + " 'add_module',\n", + " 'apply',\n", + " 'batch_first',\n", + " 'bfloat16',\n", + " 'buffers',\n", + " 'children',\n", + " 'cpu',\n", + " 'cuda',\n", + " 'decode',\n", + " 'double',\n", + " 'dump_patches',\n", + " 'end_transitions',\n", + " 'eval',\n", + " 'extra_repr',\n", + " 'float',\n", + " 'forward',\n", + " 'get_buffer',\n", + " 'get_extra_state',\n", + " 'get_parameter',\n", + " 'get_submodule',\n", + " 'half',\n", + " 'load_state_dict',\n", + " 'modules',\n", + " 'named_buffers',\n", + " 'named_children',\n", + " 'named_modules',\n", + " 'named_parameters',\n", + " 'num_tags',\n", + " 'parameters',\n", + " 'register_backward_hook',\n", + " 'register_buffer',\n", + " 'register_forward_hook',\n", + " 'register_forward_pre_hook',\n", + " 'register_full_backward_hook',\n", + " 'register_module',\n", + " 'register_parameter',\n", + " 'requires_grad_',\n", + " 'reset_parameters',\n", + " 'set_extra_state',\n", + " 'share_memory',\n", + " 'start_transitions',\n", + " 'state_dict',\n", + " 'to',\n", + " 'to_empty',\n", + " 'train',\n", + " 'training',\n", + " 'transitions',\n", + " 'type',\n", + " 'xpu',\n", + " 'zero_grad']" + ] + }, + "execution_count": 26, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "dir(crf)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 34, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Parameter containing:\n", + "tensor([[ 0.1427, 0.0082, -0.0852, -0.0714, -0.0514, 0.0753, 0.0389, 0.0018,\n", + " -0.0806],\n", + " [-0.0809, -0.0508, 0.0520, -0.0619, 0.0181, -0.0729, -0.1430, -0.1055,\n", + " 0.0384],\n", + " [-0.0011, -0.1476, 0.0425, -0.0081, -0.1181, -0.0098, -0.0567, 0.0311,\n", + " -0.0696],\n", + " [-0.0443, -0.0741, 0.0463, -0.0967, -0.0403, -0.0243, 0.0098, -0.0063,\n", + " -0.0811],\n", + " [ 0.0632, -0.1175, -0.0992, 0.0198, 0.0310, -0.0059, 0.0191, -0.1303,\n", + " -0.1423],\n", + " [ 0.0029, 0.0296, 0.0152, -0.0418, -0.1068, -0.0920, -0.0380, 0.0461,\n", + " 0.0167],\n", + " [-0.1167, -0.0559, -0.0428, -0.0115, -0.1006, -0.1511, 0.0035, -0.0273,\n", + " -0.1201],\n", + " [-0.0378, 0.0481, -0.1474, -0.0154, 0.0347, -0.0392, -0.0755, -0.1227,\n", + " 0.0448],\n", + " [-0.0383, -0.0402, 0.0054, 0.0145, -0.1353, -0.0460, 0.0257, -0.0322,\n", + " 0.0023]], requires_grad=True)" + ] + }, + "execution_count": 34, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "crf.transitions" + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[Parameter containing:\n", + " tensor([-0.0432, -0.1150, -0.1045, -0.0779, -0.0858, 0.0287, -0.1437, -0.1446,\n", + " 0.0335], requires_grad=True),\n", + " Parameter containing:\n", + " tensor([ 0.0838, -0.0097, -0.1136, 0.0010, -0.1177, 0.0225, 0.0292, -0.0837,\n", + " -0.1063], requires_grad=True),\n", + " Parameter containing:\n", 
+ " tensor([[ 0.1427, 0.0082, -0.0852, -0.0714, -0.0514, 0.0753, 0.0389, 0.0018,\n", + " -0.0806],\n", + " [-0.0809, -0.0508, 0.0520, -0.0619, 0.0181, -0.0729, -0.1430, -0.1055,\n", + " 0.0384],\n", + " [-0.0011, -0.1476, 0.0425, -0.0081, -0.1181, -0.0098, -0.0567, 0.0311,\n", + " -0.0696],\n", + " [-0.0443, -0.0741, 0.0463, -0.0967, -0.0403, -0.0243, 0.0098, -0.0063,\n", + " -0.0811],\n", + " [ 0.0632, -0.1175, -0.0992, 0.0198, 0.0310, -0.0059, 0.0191, -0.1303,\n", + " -0.1423],\n", + " [ 0.0029, 0.0296, 0.0152, -0.0418, -0.1068, -0.0920, -0.0380, 0.0461,\n", + " 0.0167],\n", + " [-0.1167, -0.0559, -0.0428, -0.0115, -0.1006, -0.1511, 0.0035, -0.0273,\n", + " -0.1201],\n", + " [-0.0378, 0.0481, -0.1474, -0.0154, 0.0347, -0.0392, -0.0755, -0.1227,\n", + " 0.0448],\n", + " [-0.0383, -0.0402, 0.0054, 0.0145, -0.1353, -0.0460, 0.0257, -0.0322,\n", + " 0.0023]], requires_grad=True)]" + ] + }, + "execution_count": 31, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "list(crf.parameters())" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "eval_model(validation_tokens_ids, validation_labels)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "eval_model(test_tokens_ids, test_labels)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "len(train_tokens_ids)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Zadanie domowe\n", + "\n", + "- en-ner-conll-2003\n", + "- stworzyć klasyfikator bazujący na sieci neuronowej feed forward w pytorchu + CRF (można bazować na tym jupyterze lub nie).\n", + "- sieć feedforward powinna obejmować aktualne słowo, poprzednie i następne + dodatkowe cechy (np. długość wyrazu, czy wyraz zaczyna się od wielkiej litery, stemmming słowa, czy zawiera cyfrę)\n", + "- stworzyć predykcje w plikach dev-0/out.tsv oraz test-A/out.tsv\n", + "- wynik fscore sprawdzony za pomocą narzędzia geval (patrz poprzednie zadanie) powinien wynosić conajmniej 0.65\n", + "- 60 punktów, za najlepszy wynik- 100 punktów\n" + ] + } + ], + "metadata": { + "author": "Jakub Pokrywka", + "email": "kubapok@wmi.amu.edu.pl", + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "lang": "pl", + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.4" + }, + "subtitle": "10.CRF[ćwiczenia]", + "title": "Ekstrakcja informacji", + "year": "2021" + }, + "nbformat": 4, + "nbformat_minor": 4 +}
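Note on the homework cell above: the assignment asks for a feed-forward tagger that looks at the previous, current and next word plus simple hand-crafted features, with a CRF layer on top. The sketch below is one possible, minimal way to wire that up with pytorch-crf; the class name WindowFF, the chosen features, and all layer sizes are illustrative assumptions, not part of the notebook itself.

import torch
from torchcrf import CRF

class WindowFF(torch.nn.Module):
    """Feed-forward emissions over a [prev, curr, next] word window plus extra features."""
    def __init__(self, vocab_size, num_tags, emb_dim=100, num_extra_feats=3, hidden=128):
        super().__init__()
        self.emb = torch.nn.Embedding(vocab_size, emb_dim)
        # three embeddings (previous, current, next) concatenated with the hand-crafted features
        self.fc1 = torch.nn.Linear(3 * emb_dim + num_extra_feats, hidden)
        self.fc2 = torch.nn.Linear(hidden, num_tags)

    def forward(self, token_ids, extra_feats):
        # token_ids: (seq_len,), extra_feats: (seq_len, num_extra_feats)
        emb = self.emb(token_ids)                  # (seq_len, emb_dim)
        prev = torch.roll(emb, shifts=1, dims=0)   # previous-word embedding (wraps at the edges)
        nxt = torch.roll(emb, shifts=-1, dims=0)   # next-word embedding (wraps at the edges)
        x = torch.cat([prev, emb, nxt, extra_feats], dim=1)
        return self.fc2(torch.relu(self.fc1(x)))   # emissions: (seq_len, num_tags)

def word_features(tokens):
    # Per-token features: starts with a capital letter, contains a digit, scaled word length.
    return torch.tensor(
        [[float(t[:1].isupper()), float(any(c.isdigit() for c in t)), len(t) / 10.0]
         for t in tokens],
        dtype=torch.float)

# Training/decoding pattern, analogous to the loop in the notebook (one sentence per "batch"):
#   model = WindowFF(vocab_size=len(vocab), num_tags=9)   # 9 = number of CoNLL-2003 NER tags
#   crf = CRF(9)
#   optimizer = torch.optim.Adam(list(model.parameters()) + list(crf.parameters()))
#   emissions = model(token_ids, word_features(tokens)).unsqueeze(1)  # (seq_len, 1, num_tags)
#   loss = -crf(emissions, tags.unsqueeze(1))   # negative log-likelihood
#   loss.backward(); optimizer.step()
#   predicted_tags = crf.decode(emissions)[0]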