en-ner-conll-2003/3_RNN — kopia — kopia copy.ipynb

{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Uczenie głębokie przetwarzanie tekstu laboratoria\n",
"# 3. RNN"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"^C\n",
"Note: you may need to restart the kernel to use updated packages.\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Using pip 24.0 from c:\\Python312\\Lib\\site-packages\\pip (python 3.12)\n",
"Looking in indexes: https://download.pytorch.org/whl/cu118\n",
"Collecting torch\n",
" Using cached https://download.pytorch.org/whl/cu118/torch-2.3.0%2Bcu118-cp312-cp312-win_amd64.whl (2673.0 MB)\n",
"Collecting torchtext\n",
" Using cached https://download.pytorch.org/whl/torchtext-0.16.2%2Bcpu-cp312-cp312-win_amd64.whl (1.9 MB)\n",
"Collecting filelock (from torch)\n",
" Using cached https://download.pytorch.org/whl/filelock-3.13.1-py3-none-any.whl (11 kB)\n",
"Collecting typing-extensions>=4.8.0 (from torch)\n",
" Using cached https://download.pytorch.org/whl/typing_extensions-4.9.0-py3-none-any.whl (32 kB)\n",
"Collecting sympy (from torch)\n",
" Using cached https://download.pytorch.org/whl/sympy-1.12-py3-none-any.whl (5.7 MB)\n",
"Collecting networkx (from torch)\n",
" Using cached https://download.pytorch.org/whl/networkx-3.2.1-py3-none-any.whl (1.6 MB)\n",
"Collecting jinja2 (from torch)\n",
" Using cached https://download.pytorch.org/whl/Jinja2-3.1.3-py3-none-any.whl (133 kB)\n",
"Collecting fsspec (from torch)\n",
" Using cached https://download.pytorch.org/whl/fsspec-2024.2.0-py3-none-any.whl (170 kB)\n",
"Collecting mkl<=2021.4.0,>=2021.1.1 (from torch)\n",
" Using cached https://download.pytorch.org/whl/mkl-2021.4.0-py2.py3-none-win_amd64.whl (228.5 MB)\n",
"Collecting tqdm (from torchtext)\n",
" Using cached https://download.pytorch.org/whl/tqdm-4.64.1-py2.py3-none-any.whl (78 kB)\n",
"Collecting requests (from torchtext)\n",
" Using cached https://download.pytorch.org/whl/requests-2.28.1-py3-none-any.whl (62 kB)\n",
"Collecting torch\n",
" Using cached https://download.pytorch.org/whl/cu118/torch-2.2.0%2Bcu118-cp312-cp312-win_amd64.whl (2704.2 MB)\n",
"Collecting numpy (from torchtext)\n",
" Using cached https://download.pytorch.org/whl/numpy-1.26.3-cp312-cp312-win_amd64.whl (15.5 MB)\n",
"Collecting torchdata==0.7.1 (from torchtext)\n",
" Using cached https://download.pytorch.org/whl/torchdata-0.7.1-py3-none-any.whl (184 kB)\n",
"Collecting urllib3>=1.25 (from torchdata==0.7.1->torchtext)\n",
" Using cached https://download.pytorch.org/whl/urllib3-1.26.13-py2.py3-none-any.whl (140 kB)\n",
"Collecting MarkupSafe>=2.0 (from jinja2->torch)\n",
" Using cached https://download.pytorch.org/whl/MarkupSafe-2.1.5-cp312-cp312-win_amd64.whl (17 kB)\n",
"Collecting charset-normalizer<3,>=2 (from requests->torchtext)\n",
" Using cached https://download.pytorch.org/whl/charset_normalizer-2.1.1-py3-none-any.whl (39 kB)\n",
"Collecting idna<4,>=2.5 (from requests->torchtext)\n",
" Using cached https://download.pytorch.org/whl/idna-3.4-py3-none-any.whl (61 kB)\n",
"Collecting certifi>=2017.4.17 (from requests->torchtext)\n",
" Using cached https://download.pytorch.org/whl/certifi-2022.12.7-py3-none-any.whl (155 kB)\n",
"Collecting mpmath>=0.19 (from sympy->torch)\n",
" Using cached https://download.pytorch.org/whl/mpmath-1.3.0-py3-none-any.whl (536 kB)\n",
"Collecting colorama (from tqdm->torchtext)\n",
" Using cached https://download.pytorch.org/whl/colorama-0.4.6-py2.py3-none-any.whl (25 kB)\n",
"Collecting intel-openmp==2021.* (from mkl<=2021.4.0,>=2021.1.1->torch)\n",
" Using cached https://download.pytorch.org/whl/intel_openmp-2021.4.0-py2.py3-none-win_amd64.whl (3.5 MB)\n",
"Collecting tbb==2021.* (from mkl<=2021.4.0,>=2021.1.1->torch)\n",
" Using cached https://download.pytorch.org/whl/tbb-2021.11.0-py3-none-win_amd64.whl (298 kB)\n",
"Installing collected packages: mpmath, urllib3, typing-extensions, sympy, numpy, networkx, MarkupSafe, idna, fsspec, filelock, colorama, charset-normalizer, certifi, tqdm, requests, jinja2, torch, torchdata, torchtext\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"ERROR: Could not install packages due to an OSError.\n",
"Consider using the `--user` option or check the permissions.\n",
"Traceback (most recent call last):\n",
" File \"c:\\Python312\\Lib\\site-packages\\pip\\_internal\\commands\\install.py\", line 452, in run\n",
" installed = install_given_reqs(\n",
" ^^^^^^^^^^^^^^^^^^^\n",
" File \"c:\\Python312\\Lib\\site-packages\\pip\\_internal\\req\\__init__.py\", line 72, in install_given_reqs\n",
" requirement.install(\n",
" File \"c:\\Python312\\Lib\\site-packages\\pip\\_internal\\req\\req_install.py\", line 856, in install\n",
" install_wheel(\n",
" File \"c:\\Python312\\Lib\\site-packages\\pip\\_internal\\operations\\install\\wheel.py\", line 725, in install_wheel\n",
" _install_wheel(\n",
" File \"c:\\Python312\\Lib\\site-packages\\pip\\_internal\\operations\\install\\wheel.py\", line 585, in _install_wheel\n",
" file.save()\n",
" File \"c:\\Python312\\Lib\\site-packages\\pip\\_internal\\operations\\install\\wheel.py\", line 378, in save\n",
" os.unlink(self.dest_path)\n",
"PermissionError: [WinError 5] Odmowa dostępu: 'c:\\\\Python312\\\\Lib\\\\site-packages\\\\numpy\\\\core\\\\_multiarray_tests.cp312-win_amd64.pyd'\n"
]
}
],
"source": [
"%pip install --ignore-installed --force-reinstall -v torch torchtext --index-url https://download.pytorch.org/whl/cu118"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [
{
"ename": "AttributeError",
"evalue": "partially initialized module 'charset_normalizer' has no attribute 'md__mypyc' (most likely due to a circular import)",
"output_type": "error",
"traceback": [
"\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[1;31mModuleNotFoundError\u001b[0m Traceback (most recent call last)",
"File \u001b[1;32mc:\\Python312\\Lib\\site-packages\\requests\\compat.py:11\u001b[0m\n\u001b[0;32m 10\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[1;32m---> 11\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;21;01mchardet\u001b[39;00m\n\u001b[0;32m 12\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mImportError\u001b[39;00m:\n",
"\u001b[1;31mModuleNotFoundError\u001b[0m: No module named 'chardet'",
"\nDuring handling of the above exception, another exception occurred:\n",
"\u001b[1;31mAttributeError\u001b[0m Traceback (most recent call last)",
"Cell \u001b[1;32mIn[1], line 3\u001b[0m\n\u001b[0;32m 1\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mcollections\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m Counter\n\u001b[0;32m 2\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;21;01mtorch\u001b[39;00m\n\u001b[1;32m----> 3\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mtorchtext\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mvocab\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m vocab\n\u001b[0;32m 4\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mtqdm\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m tqdm\n\u001b[0;32m 5\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mipywidgets\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m FloatProgress\n",
"File \u001b[1;32mc:\\Python312\\Lib\\site-packages\\torchtext\\__init__.py:12\u001b[0m\n\u001b[0;32m 8\u001b[0m _TEXT_BUCKET \u001b[38;5;241m=\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mhttps://download.pytorch.org/models/text/\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[0;32m 10\u001b[0m _CACHE_DIR \u001b[38;5;241m=\u001b[39m os\u001b[38;5;241m.\u001b[39mpath\u001b[38;5;241m.\u001b[39mexpanduser(os\u001b[38;5;241m.\u001b[39mpath\u001b[38;5;241m.\u001b[39mjoin(_get_torch_home(), \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mtext\u001b[39m\u001b[38;5;124m\"\u001b[39m))\n\u001b[1;32m---> 12\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01m.\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m data, datasets, prototype, functional, models, nn, transforms, utils, vocab, experimental\n\u001b[0;32m 14\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m 15\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mversion\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m __version__, git_version \u001b[38;5;66;03m# noqa: F401\u001b[39;00m\n",
"File \u001b[1;32mc:\\Python312\\Lib\\site-packages\\torchtext\\datasets\\__init__.py:3\u001b[0m\n\u001b[0;32m 1\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;21;01mimportlib\u001b[39;00m\n\u001b[1;32m----> 3\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mag_news\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m AG_NEWS\n\u001b[0;32m 4\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mamazonreviewfull\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m AmazonReviewFull\n\u001b[0;32m 5\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mamazonreviewpolarity\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m AmazonReviewPolarity\n",
"File \u001b[1;32mc:\\Python312\\Lib\\site-packages\\torchtext\\datasets\\ag_news.py:5\u001b[0m\n\u001b[0;32m 2\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mfunctools\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m partial\n\u001b[0;32m 3\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mtyping\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m Union, Tuple\n\u001b[1;32m----> 5\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mtorchdata\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mdatapipes\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01miter\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m FileOpener, IterableWrapper\n\u001b[0;32m 6\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mtorchtext\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01m_download_hooks\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m HttpReader\n\u001b[0;32m 7\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mtorchtext\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01m_internal\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mmodule_utils\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m is_module_available\n",
"File \u001b[1;32mc:\\Python312\\Lib\\site-packages\\torchdata\\__init__.py:9\u001b[0m\n\u001b[0;32m 1\u001b[0m \u001b[38;5;66;03m# Copyright (c) Meta Platforms, Inc. and affiliates.\u001b[39;00m\n\u001b[0;32m 2\u001b[0m \u001b[38;5;66;03m# All rights reserved.\u001b[39;00m\n\u001b[0;32m 3\u001b[0m \u001b[38;5;66;03m#\u001b[39;00m\n\u001b[0;32m 4\u001b[0m \u001b[38;5;66;03m# This source code is licensed under the BSD-style license found in the\u001b[39;00m\n\u001b[0;32m 5\u001b[0m \u001b[38;5;66;03m# LICENSE file in the root directory of this source tree.\u001b[39;00m\n\u001b[0;32m 7\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mtorchdata\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m _extension \u001b[38;5;66;03m# noqa: F401\u001b[39;00m\n\u001b[1;32m----> 9\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01m.\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m datapipes\n\u001b[0;32m 11\u001b[0m janitor \u001b[38;5;241m=\u001b[39m datapipes\u001b[38;5;241m.\u001b[39mutils\u001b[38;5;241m.\u001b[39mjanitor\n\u001b[0;32m 13\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n",
"File \u001b[1;32mc:\\Python312\\Lib\\site-packages\\torchdata\\datapipes\\__init__.py:9\u001b[0m\n\u001b[0;32m 1\u001b[0m \u001b[38;5;66;03m# Copyright (c) Meta Platforms, Inc. and affiliates.\u001b[39;00m\n\u001b[0;32m 2\u001b[0m \u001b[38;5;66;03m# All rights reserved.\u001b[39;00m\n\u001b[0;32m 3\u001b[0m \u001b[38;5;66;03m#\u001b[39;00m\n\u001b[0;32m 4\u001b[0m \u001b[38;5;66;03m# This source code is licensed under the BSD-style license found in the\u001b[39;00m\n\u001b[0;32m 5\u001b[0m \u001b[38;5;66;03m# LICENSE file in the root directory of this source tree.\u001b[39;00m\n\u001b[0;32m 7\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mtorch\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mutils\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mdata\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m DataChunk, functional_datapipe\n\u001b[1;32m----> 9\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01m.\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;28miter\u001b[39m, \u001b[38;5;28mmap\u001b[39m, utils\n\u001b[0;32m 11\u001b[0m __all__ \u001b[38;5;241m=\u001b[39m [\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mDataChunk\u001b[39m\u001b[38;5;124m\"\u001b[39m, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mfunctional_datapipe\u001b[39m\u001b[38;5;124m\"\u001b[39m, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124miter\u001b[39m\u001b[38;5;124m\"\u001b[39m, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mmap\u001b[39m\u001b[38;5;124m\"\u001b[39m, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mutils\u001b[39m\u001b[38;5;124m\"\u001b[39m]\n",
"File \u001b[1;32mc:\\Python312\\Lib\\site-packages\\torchdata\\datapipes\\iter\\__init__.py:54\u001b[0m\n\u001b[0;32m 46\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mtorchdata\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mdatapipes\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01miter\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mload\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mhuggingface\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m HuggingFaceHubReaderIterDataPipe \u001b[38;5;28;01mas\u001b[39;00m HuggingFaceHubReader\n\u001b[0;32m 48\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mtorchdata\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mdatapipes\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01miter\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mload\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01miopath\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m (\n\u001b[0;32m 49\u001b[0m IoPathFileListerIterDataPipe \u001b[38;5;28;01mas\u001b[39;00m IoPathFileLister,\n\u001b[0;32m 50\u001b[0m IoPathFileOpenerIterDataPipe \u001b[38;5;28;01mas\u001b[39;00m IoPathFileOpener,\n\u001b[0;32m 51\u001b[0m IoPathSaverIterDataPipe \u001b[38;5;28;01mas\u001b[39;00m IoPathSaver,\n\u001b[0;32m 52\u001b[0m )\n\u001b[1;32m---> 54\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mtorchdata\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mdatapipes\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01miter\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mload\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01monline\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m (\n\u001b[0;32m 55\u001b[0m GDriveReaderDataPipe \u001b[38;5;28;01mas\u001b[39;00m GDriveReader,\n\u001b[0;32m 56\u001b[0m HTTPReaderIterDataPipe \u001b[38;5;28;01mas\u001b[39;00m HttpReader,\n\u001b[0;32m 57\u001b[0m OnlineReaderIterDataPipe \u001b[38;5;28;01mas\u001b[39;00m OnlineReader,\n\u001b[0;32m 58\u001b[0m )\n\u001b[0;32m 59\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mtorchdata\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mdatapipes\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01miter\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mload\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01ms3io\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m (\n\u001b[0;32m 60\u001b[0m S3FileListerIterDataPipe \u001b[38;5;28;01mas\u001b[39;00m S3FileLister,\n\u001b[0;32m 61\u001b[0m S3FileLoaderIterDataPipe \u001b[38;5;28;01mas\u001b[39;00m S3FileLoader,\n\u001b[0;32m 62\u001b[0m )\n\u001b[0;32m 63\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mtorchdata\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mdatapipes\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01miter\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mtransform\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mbucketbatcher\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m (\n\u001b[0;32m 64\u001b[0m BucketBatcherIterDataPipe \u001b[38;5;28;01mas\u001b[39;00m BucketBatcher,\n\u001b[0;32m 65\u001b[0m InBatchShufflerIterDataPipe \u001b[38;5;28;01mas\u001b[39;00m InBatchShuffler,\n\u001b[0;32m 66\u001b[0m MaxTokenBucketizerIterDataPipe \u001b[38;5;28;01mas\u001b[39;00m MaxTokenBucketizer,\n\u001b[0;32m 67\u001b[0m )\n",
"File \u001b[1;32mc:\\Python312\\Lib\\site-packages\\torchdata\\datapipes\\iter\\load\\online.py:12\u001b[0m\n\u001b[0;32m 9\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;21;01mwarnings\u001b[39;00m\n\u001b[0;32m 10\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mtyping\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m Any, Dict, Iterator, Optional, Tuple\n\u001b[1;32m---> 12\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;21;01mrequests\u001b[39;00m\n\u001b[0;32m 14\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mtorchdata\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mdatapipes\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m functional_datapipe\n\u001b[0;32m 15\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mtorchdata\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mdatapipes\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01miter\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m IterDataPipe\n",
"File \u001b[1;32mc:\\Python312\\Lib\\site-packages\\requests\\__init__.py:45\u001b[0m\n\u001b[0;32m 41\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;21;01mwarnings\u001b[39;00m\n\u001b[0;32m 43\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;21;01murllib3\u001b[39;00m\n\u001b[1;32m---> 45\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mexceptions\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m RequestsDependencyWarning\n\u001b[0;32m 47\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m 48\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mcharset_normalizer\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m __version__ \u001b[38;5;28;01mas\u001b[39;00m charset_normalizer_version\n",
"File \u001b[1;32mc:\\Python312\\Lib\\site-packages\\requests\\exceptions.py:9\u001b[0m\n\u001b[0;32m 1\u001b[0m \u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[0;32m 2\u001b[0m \u001b[38;5;124;03mrequests.exceptions\u001b[39;00m\n\u001b[0;32m 3\u001b[0m \u001b[38;5;124;03m~~~~~~~~~~~~~~~~~~~\u001b[39;00m\n\u001b[0;32m 4\u001b[0m \n\u001b[0;32m 5\u001b[0m \u001b[38;5;124;03mThis module contains the set of Requests' exceptions.\u001b[39;00m\n\u001b[0;32m 6\u001b[0m \u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[0;32m 7\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01murllib3\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mexceptions\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m HTTPError \u001b[38;5;28;01mas\u001b[39;00m BaseHTTPError\n\u001b[1;32m----> 9\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mcompat\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m JSONDecodeError \u001b[38;5;28;01mas\u001b[39;00m CompatJSONDecodeError\n\u001b[0;32m 12\u001b[0m \u001b[38;5;28;01mclass\u001b[39;00m \u001b[38;5;21;01mRequestException\u001b[39;00m(\u001b[38;5;167;01mIOError\u001b[39;00m):\n\u001b[0;32m 13\u001b[0m \u001b[38;5;250m \u001b[39m\u001b[38;5;124;03m\"\"\"There was an ambiguous exception that occurred while handling your\u001b[39;00m\n\u001b[0;32m 14\u001b[0m \u001b[38;5;124;03m request.\u001b[39;00m\n\u001b[0;32m 15\u001b[0m \u001b[38;5;124;03m \"\"\"\u001b[39;00m\n",
"File \u001b[1;32mc:\\Python312\\Lib\\site-packages\\requests\\compat.py:13\u001b[0m\n\u001b[0;32m 11\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;21;01mchardet\u001b[39;00m\n\u001b[0;32m 12\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mImportError\u001b[39;00m:\n\u001b[1;32m---> 13\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;21;01mcharset_normalizer\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m \u001b[38;5;21;01mchardet\u001b[39;00m\n\u001b[0;32m 15\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;21;01msys\u001b[39;00m\n\u001b[0;32m 17\u001b[0m \u001b[38;5;66;03m# -------\u001b[39;00m\n\u001b[0;32m 18\u001b[0m \u001b[38;5;66;03m# Pythons\u001b[39;00m\n\u001b[0;32m 19\u001b[0m \u001b[38;5;66;03m# -------\u001b[39;00m\n\u001b[0;32m 20\u001b[0m \n\u001b[0;32m 21\u001b[0m \u001b[38;5;66;03m# Syntax sugar.\u001b[39;00m\n",
"File \u001b[1;32mc:\\Python312\\Lib\\site-packages\\charset_normalizer\\__init__.py:24\u001b[0m\n\u001b[0;32m 2\u001b[0m \u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[0;32m 3\u001b[0m \u001b[38;5;124;03mCharset-Normalizer\u001b[39;00m\n\u001b[0;32m 4\u001b[0m \u001b[38;5;124;03m~~~~~~~~~~~~~~\u001b[39;00m\n\u001b[1;32m (...)\u001b[0m\n\u001b[0;32m 20\u001b[0m \u001b[38;5;124;03m:license: MIT, see LICENSE for more details.\u001b[39;00m\n\u001b[0;32m 21\u001b[0m \u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[0;32m 22\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;21;01mlogging\u001b[39;00m\n\u001b[1;32m---> 24\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mapi\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m from_bytes, from_fp, from_path, normalize\n\u001b[0;32m 25\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mlegacy\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m (\n\u001b[0;32m 26\u001b[0m CharsetDetector,\n\u001b[0;32m 27\u001b[0m CharsetDoctor,\n\u001b[1;32m (...)\u001b[0m\n\u001b[0;32m 30\u001b[0m detect,\n\u001b[0;32m 31\u001b[0m )\n\u001b[0;32m 32\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mmodels\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m CharsetMatch, CharsetMatches\n",
"File \u001b[1;32mc:\\Python312\\Lib\\site-packages\\charset_normalizer\\api.py:7\u001b[0m\n\u001b[0;32m 4\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mos\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mpath\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m basename, splitext\n\u001b[0;32m 5\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mtyping\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m Any, BinaryIO, List, Optional, Set\n\u001b[1;32m----> 7\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mcd\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m (\n\u001b[0;32m 8\u001b[0m coherence_ratio,\n\u001b[0;32m 9\u001b[0m encoding_languages,\n\u001b[0;32m 10\u001b[0m mb_encoding_languages,\n\u001b[0;32m 11\u001b[0m merge_coherence_ratios,\n\u001b[0;32m 12\u001b[0m )\n\u001b[0;32m 13\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mconstant\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m IANA_SUPPORTED, TOO_BIG_SEQUENCE, TOO_SMALL_SEQUENCE, TRACE\n\u001b[0;32m 14\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mmd\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m mess_ratio\n",
"File \u001b[1;32mc:\\Python312\\Lib\\site-packages\\charset_normalizer\\cd.py:9\u001b[0m\n\u001b[0;32m 7\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01massets\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m FREQUENCIES\n\u001b[0;32m 8\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mconstant\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m KO_NAMES, LANGUAGE_SUPPORTED_COUNT, TOO_SMALL_SEQUENCE, ZH_NAMES\n\u001b[1;32m----> 9\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mmd\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m is_suspiciously_successive_range\n\u001b[0;32m 10\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mmodels\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m CoherenceMatches\n\u001b[0;32m 11\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mutils\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m (\n\u001b[0;32m 12\u001b[0m is_accentuated,\n\u001b[0;32m 13\u001b[0m is_latin,\n\u001b[1;32m (...)\u001b[0m\n\u001b[0;32m 16\u001b[0m unicode_range,\n\u001b[0;32m 17\u001b[0m )\n",
"\u001b[1;31mAttributeError\u001b[0m: partially initialized module 'charset_normalizer' has no attribute 'md__mypyc' (most likely due to a circular import)"
]
}
],
"source": [
"from collections import Counter\n",
"import torch\n",
"from torchtext.vocab import vocab\n",
"from tqdm import tqdm\n",
"from ipywidgets import FloatProgress\n",
"\n",
"import pandas as pd\n",
"from nltk.tokenize import word_tokenize\n",
"from unidecode import unidecode"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"True"
]
},
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"torch.cuda.is_available()"
]
},
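{
"cell_type": "markdown",
"metadata": {},
"source": [
"Since CUDA is available, a natural next step is to select the device once and reuse it everywhere. A minimal sketch (the `.to(device)` calls are an assumption about later cells, not something this notebook does yet):\n",
"```python\n",
"import torch\n",
"\n",
"# Prefer the GPU when present, otherwise fall back to the CPU.\n",
"device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n",
"\n",
"# The model and every batch must live on the same device, e.g.:\n",
"# lstm = lstm.to(device)\n",
"# batch_tokens = batch_tokens.to(device)\n",
"```"
]
},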
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"wczytano dane treningowe\n",
"O B-PER O O O O O O O O O B-LOC O O O O O O O O O O O B-LOC O O B-PER I-PER O O O O O O O O O O O O O O O O O O O O O O O B-LOC O O O O O O O O O O O O O B-MISC I-MISC I-MISC I-MISC O O O B-PER O O O O O B-LOC O O O O O O O O O O O O O O O O O B-MISC O O B-LOC O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O B-PER O O O B-PER I-PER O O O O O O O O O O O O O O O O O O O O O O O O B-PER O O O O O O O O B-MISC O O O O O O O O O O O O O O O O O O O O O O O O Rare Hendrix song draft sells for almost $ 17,000 . </S> LONDON 1996-08-22 </S> A rare early handwritten draft of a song by U.S. guitar legend Jimi Hendrix was sold for almost $ 17,000 on Thursday at an auction of some of the late musician 's favourite possessions . </S> A Florida restaurant paid 10,925 pounds ( $ 16,935 ) for the draft of \" Ai n't no telling \" , which Hendrix penned on a piece of London hotel stationery in late 1966 . </S> At the end of a January 1967 concert in the English city of Nottingham he threw the sheet of paper into the audience , where it was retrieved by a fan . </S> Buyers also snapped up 16 other items that were put up for auction by Hendrix 's former girlfriend Kathy Etchingham , who lived with him from 1966 to 1969 . </S> They included a black lacquer and mother of pearl inlaid box used by Hendrix to store his drugs , which an anonymous Australian purchaser bought for 5,060 pounds ( $ 7,845 ) . </S> The guitarist died of a drugs overdose in 1970 aged 27 . </S>\n",
"podzielono dane treningowe na słowa\n",
"['rare', 'hendrix', 'song', 'draft', 'sells', 'for', 'almost', '$', '17,000', '.', '</s>', 'london', '1996-08-22', '</s>', 'a', 'rare', 'early', 'handwritten', 'draft', 'of', 'a', 'song', 'by', 'u.s.', 'guitar', 'legend', 'jimi', 'hendrix', 'was', 'sold', 'for', 'almost', '$', '17,000', 'on', 'thursday', 'at', 'an', 'auction', 'of', 'some', 'of', 'the', 'late', 'musician', \"'s\", 'favourite', 'possessions', '.', '</s>', 'a', 'florida', 'restaurant', 'paid', '10,925', 'pounds', '(', '$', '16,935', ')', 'for', 'the', 'draft', 'of', '\"', 'ai', \"n't\", 'no', 'telling', '\"', ',', 'which', 'hendrix', 'penned', 'on', 'a', 'piece', 'of', 'london', 'hotel', 'stationery', 'in', 'late', '1966', '.', '</s>', 'at', 'the', 'end', 'of', 'a', 'january', '1967', 'concert', 'in', 'the', 'english', 'city', 'of', 'nottingham', 'he', 'threw', 'the', 'sheet', 'of', 'paper', 'into', 'the', 'audience', ',', 'where', 'it', 'was', 'retrieved', 'by', 'a', 'fan', '.', '</s>', 'buyers', 'also', 'snapped', 'up', '16', 'other', 'items', 'that', 'were', 'put', 'up', 'for', 'auction', 'by', 'hendrix', \"'s\", 'former', 'girlfriend', 'kathy', 'etchingham', ',', 'who', 'lived', 'with', 'him', 'from', '1966', 'to', '1969', '.', '</s>', 'they', 'included', 'a', 'black', 'lacquer', 'and', 'mother', 'of', 'pearl', 'inlaid', 'box', 'used', 'by', 'hendrix', 'to', 'store', 'his', 'drugs', ',', 'which', 'an', 'anonymous', 'australian', 'purchaser', 'bought', 'for', '5,060', 'pounds', '(', '$', '7,845', ')', '.', '</s>', 'the', 'guitarist', 'died', 'of', 'a', 'drugs', 'overdose', 'in', '1970', 'aged', '27', '.', '</s>']\n"
]
}
],
"source": [
"# odczytaj dane treningowe\n",
"train = pd.read_csv('train/train.tsv', sep='\\t')\n",
"train.columns = [\"y\", \"x\"]\n",
"print(\"wczytano dane treningowe\")\n",
"print(train[\"y\"][0], train[\"x\"][0])\n",
"\n",
"# podziel dane treningowe na słowa\n",
"# https://www.geeksforgeeks.org/python-word-embedding-using-word2vec/\n",
"slowa_train = []\n",
"for tekst in train[\"x\"]:\n",
" pom = []\n",
" for slowo in tekst.split(\" \"):\n",
" #if slowo not in (\"<\",\"/s\",\">\",\"/S\",\"``\"):\n",
" pom.append(slowo.lower())\n",
" slowa_train.append(pom)\n",
"print(\"podzielono dane treningowe na słowa\")\n",
"print(slowa_train[0])"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"wczytano dane testowe dev-0\n",
"CRICKET - ENGLISH COUNTY CHAMPIONSHIP SCORES . </S> LONDON 1996-08-30 </S> Result and close of play scores in English county championship matches on Friday : </S> Leicester : Leicestershire beat Somerset by an innings and 39 runs . </S> Somerset 83 and 174 ( P. Simmons 4-38 ) , Leicestershire 296 . </S> Leicestershire 22 points , Somerset 4 . </S> Chester-le-Street : Glamorgan 259 and 207 ( A. Dale 69 , H. Morris 69 ; D. Blenkiron 4-43 ) , Durham 114 ( S. Watkin 4-28 ) and 81-3 . </S> Tunbridge Wells : Nottinghamshire 214 ( P. Johnson 84 ; M. McCague 4-55 ) , Kent 108-3 . </S> London ( The Oval ) : Warwickshire 195 , Surrey 429-7 ( C. Lewis 80 not out , M. Butcher 70 , G. Kersey 63 , J. Ratcliffe 63 , D. Bicknell 55 ) . </S> Hove : Sussex 363 ( W. Athey 111 , V. Drakes 52 ; I. Austin 4-37 ) , Lancashire 197-8 ( W. Hegg 54 ) </S> Portsmouth : Middlesex 199 and 426 ( J. Pooley 111 , M. Ramprakash 108 , M. Gatting 83 ) , Hampshire 232 and 109-5 . </S> Chesterfield : Worcestershire 238 and 133-5 , Derbyshire 471 ( J. Adams 123 , T.O'Gorman 109 not out , K. Barnett 87 ; T. Moody 6-82 ) </S> Bristol : Gloucestershire 183 and 185-6 ( J. Russell 56 not out ) , Northamptonshire 190 ( K. Curran 52 ; A. Smith 5-68 ) . </S>\n",
"podzielono dane treningowe na słowa\n",
"['cricket', '-', 'english', 'county', 'championship', 'scores', '.', '</s>', 'london', '1996-08-30', '</s>', 'result', 'and', 'close', 'of', 'play', 'scores', 'in', 'english', 'county', 'championship', 'matches', 'on', 'friday', ':', '</s>', 'leicester', ':', 'leicestershire', 'beat', 'somerset', 'by', 'an', 'innings', 'and', '39', 'runs', '.', '</s>', 'somerset', '83', 'and', '174', '(', 'p.', 'simmons', '4-38', ')', ',', 'leicestershire', '296', '.', '</s>', 'leicestershire', '22', 'points', ',', 'somerset', '4', '.', '</s>', 'chester-le-street', ':', 'glamorgan', '259', 'and', '207', '(', 'a.', 'dale', '69', ',', 'h.', 'morris', '69', ';', 'd.', 'blenkiron', '4-43', ')', ',', 'durham', '114', '(', 's.', 'watkin', '4-28', ')', 'and', '81-3', '.', '</s>', 'tunbridge', 'wells', ':', 'nottinghamshire', '214', '(', 'p.', 'johnson', '84', ';', 'm.', 'mccague', '4-55', ')', ',', 'kent', '108-3', '.', '</s>', 'london', '(', 'the', 'oval', ')', ':', 'warwickshire', '195', ',', 'surrey', '429-7', '(', 'c.', 'lewis', '80', 'not', 'out', ',', 'm.', 'butcher', '70', ',', 'g.', 'kersey', '63', ',', 'j.', 'ratcliffe', '63', ',', 'd.', 'bicknell', '55', ')', '.', '</s>', 'hove', ':', 'sussex', '363', '(', 'w.', 'athey', '111', ',', 'v.', 'drakes', '52', ';', 'i.', 'austin', '4-37', ')', ',', 'lancashire', '197-8', '(', 'w.', 'hegg', '54', ')', '</s>', 'portsmouth', ':', 'middlesex', '199', 'and', '426', '(', 'j.', 'pooley', '111', ',', 'm.', 'ramprakash', '108', ',', 'm.', 'gatting', '83', ')', ',', 'hampshire', '232', 'and', '109-5', '.', '</s>', 'chesterfield', ':', 'worcestershire', '238', 'and', '133-5', ',', 'derbyshire', '471', '(', 'j.', 'adams', '123', ',', \"t.o'gorman\", '109', 'not', 'out', ',', 'k.', 'barnett', '87', ';', 't.', 'moody', '6-82', ')', '</s>', 'bristol', ':', 'gloucestershire', '183', 'and', '185-6', '(', 'j.', 'russell', '56', 'not', 'out', ')', ',', 'northamptonshire', '190', '(', 'k.', 'curran', '52', ';', 'a.', 'smith', '5-68', ')', '.', '</s>']\n"
]
}
],
"source": [
"# odczytaj dane testowe dev-0\n",
"test_dev0 = pd.read_csv('dev-0/in.tsv', sep='\\t')\n",
"test_dev0.columns = [\"x\"]\n",
"print(\"wczytano dane testowe dev-0\")\n",
"print(test_dev0[\"x\"][0])\n",
"\n",
"# podziel dane testowe na słowa\n",
"# https://www.geeksforgeeks.org/python-word-embedding-using-word2vec/\n",
"slowa_test_dev0 = []\n",
"for tekst in test_dev0[\"x\"]:\n",
" pom = []\n",
" for slowo in tekst.split(\" \"):\n",
" #if slowo not in (\"<\",\"/s\",\">\",\"/S\",\"``\"):\n",
" pom.append(slowo.lower())\n",
" slowa_test_dev0.append(pom)\n",
"print(\"podzielono dane treningowe na słowa\")\n",
"print(slowa_test_dev0[0])"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"wczytano dane testowe A\n",
"RUGBY UNION - CUTTITTA BACK FOR ITALY AFTER A YEAR . </S> ROME 1996-12-06 </S> Italy recalled Marcello Cuttitta </S> on Friday for their friendly against Scotland at Murrayfield more than a year after the 30-year-old wing announced he was retiring following differences over selection . </S> Cuttitta , who trainer George Coste said was certain to play on Saturday week , was named in a 21-man squad lacking only two of the team beaten 54-21 by England at Twickenham last month . </S> Stefano Bordon is out through illness and Coste said he had dropped back row Corrado Covi , who had been recalled for the England game after five years out of the national team . </S> Cuttitta announced his retirement after the 1995 World Cup , where he took issue with being dropped from the Italy side that faced England in the pool stages . </S> Coste said he had approached the player two months ago about a comeback . </S> \" He ended the World Cup on the wrong note , \" Coste said . </S> \" I thought it would be useful to have him back and he said he would be available . </S> I think now is the right time for him to return . \" </S> Squad : Javier Pertile , Paolo Vaccari , Marcello Cuttitta , Ivan Francescato , Leandro Manteri , Diego Dominguez , Francesco Mazzariol , Alessandro Troncon , Orazio Arancio , Andrea Sgorlon , Massimo Giovanelli , Carlo Checchinato , Walter Cristofoletto , Franco Properzi Curti , Carlo Orlandi , Massimo Cuttitta , Giambatista Croci , Gianluca Guidi , Nicola Mazzucato , Alessandro Moscardi , Andrea Castellani . </S>\n",
"podzielono dane treningowe na słowa\n",
"['rugby', 'union', '-', 'cuttitta', 'back', 'for', 'italy', 'after', 'a', 'year', '.', '</s>', 'rome', '1996-12-06', '</s>', 'italy', 'recalled', 'marcello', 'cuttitta', '</s>', 'on', 'friday', 'for', 'their', 'friendly', 'against', 'scotland', 'at', 'murrayfield', 'more', 'than', 'a', 'year', 'after', 'the', '30-year-old', 'wing', 'announced', 'he', 'was', 'retiring', 'following', 'differences', 'over', 'selection', '.', '</s>', 'cuttitta', ',', 'who', 'trainer', 'george', 'coste', 'said', 'was', 'certain', 'to', 'play', 'on', 'saturday', 'week', ',', 'was', 'named', 'in', 'a', '21-man', 'squad', 'lacking', 'only', 'two', 'of', 'the', 'team', 'beaten', '54-21', 'by', 'england', 'at', 'twickenham', 'last', 'month', '.', '</s>', 'stefano', 'bordon', 'is', 'out', 'through', 'illness', 'and', 'coste', 'said', 'he', 'had', 'dropped', 'back', 'row', 'corrado', 'covi', ',', 'who', 'had', 'been', 'recalled', 'for', 'the', 'england', 'game', 'after', 'five', 'years', 'out', 'of', 'the', 'national', 'team', '.', '</s>', 'cuttitta', 'announced', 'his', 'retirement', 'after', 'the', '1995', 'world', 'cup', ',', 'where', 'he', 'took', 'issue', 'with', 'being', 'dropped', 'from', 'the', 'italy', 'side', 'that', 'faced', 'england', 'in', 'the', 'pool', 'stages', '.', '</s>', 'coste', 'said', 'he', 'had', 'approached', 'the', 'player', 'two', 'months', 'ago', 'about', 'a', 'comeback', '.', '</s>', '\"', 'he', 'ended', 'the', 'world', 'cup', 'on', 'the', 'wrong', 'note', ',', '\"', 'coste', 'said', '.', '</s>', '\"', 'i', 'thought', 'it', 'would', 'be', 'useful', 'to', 'have', 'him', 'back', 'and', 'he', 'said', 'he', 'would', 'be', 'available', '.', '</s>', 'i', 'think', 'now', 'is', 'the', 'right', 'time', 'for', 'him', 'to', 'return', '.', '\"', '</s>', 'squad', ':', 'javier', 'pertile', ',', 'paolo', 'vaccari', ',', 'marcello', 'cuttitta', ',', 'ivan', 'francescato', ',', 'leandro', 'manteri', ',', 'diego', 'dominguez', ',', 'francesco', 'mazzariol', ',', 'alessandro', 'troncon', ',', 'orazio', 'arancio', ',', 'andrea', 'sgorlon', ',', 'massimo', 'giovanelli', ',', 'carlo', 'checchinato', ',', 'walter', 'cristofoletto', ',', 'franco', 'properzi', 'curti', ',', 'carlo', 'orlandi', ',', 'massimo', 'cuttitta', ',', 'giambatista', 'croci', ',', 'gianluca', 'guidi', ',', 'nicola', 'mazzucato', ',', 'alessandro', 'moscardi', ',', 'andrea', 'castellani', '.', '</s>']\n"
]
}
],
"source": [
"# odczytaj dane testowe A\n",
"test_A = pd.read_csv('test-A/in.tsv', sep='\\t')\n",
"test_A.columns = [\"x\"]\n",
"print(\"wczytano dane testowe A\")\n",
"print(test_A[\"x\"][0])\n",
"\n",
"# podziel dane testowe na słowa\n",
"# https://www.geeksforgeeks.org/python-word-embedding-using-word2vec/\n",
"slowa_test_A = []\n",
"for tekst in test_A[\"x\"]:\n",
" pom = []\n",
" for slowo in tekst.split(\" \"):\n",
" #if slowo not in (\"<\",\"/s\",\">\",\"/S\",\"``\"):\n",
" pom.append(slowo.lower())\n",
" slowa_test_A.append(pom)\n",
"print(\"podzielono dane treningowe na słowa\")\n",
"print(slowa_test_A[0])"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def build_vocab(dataset):\n",
" counter = Counter()\n",
" for document in dataset:\n",
" counter.update(document)\n",
" return vocab(counter, specials=[\"<unk>\", \"<pad>\", \"<bos>\", \"<eos>\"])"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"20998\n",
"['<unk>', '<pad>', '<bos>', '<eos>', 'rare', 'hendrix', 'song', 'draft', 'sells', 'for', 'almost', '$', '17,000', '.', '</s>', 'london', '1996-08-22', 'a', 'early', 'handwritten', 'of', 'by', 'u.s.', 'guitar', 'legend', 'jimi', 'was', 'sold', 'on', 'thursday', 'at', 'an', 'auction', 'some', 'the', 'late', 'musician', \"'s\", 'favourite', 'possessions', 'florida', 'restaurant', 'paid', '10,925', 'pounds', '(', '16,935', ')', '\"', 'ai', \"n't\", 'no', 'telling', ',', 'which', 'penned', 'piece', 'hotel', 'stationery', 'in', '1966', 'end', 'january', '1967', 'concert', 'english', 'city', 'nottingham', 'he', 'threw', 'sheet', 'paper', 'into', 'audience', 'where', 'it', 'retrieved', 'fan', 'buyers', 'also', 'snapped', 'up', '16', 'other', 'items', 'that', 'were', 'put', 'former', 'girlfriend', 'kathy', 'etchingham', 'who', 'lived', 'with', 'him', 'from', 'to', '1969', 'they', 'included', 'black', 'lacquer', 'and', 'mother', 'pearl', 'inlaid', 'box', 'used', 'store', 'his', 'drugs', 'anonymous', 'australian', 'purchaser', 'bought', '5,060', '7,845', 'guitarist', 'died', 'overdose', '1970', 'aged', '27', 'china', 'says', 'taiwan', 'spoils', 'atmosphere', 'talks', 'beijing', 'accused', 'taipei', 'spoiling', 'resumption', 'across', 'strait', 'visit', 'ukraine', 'taiwanese', 'vice', 'president', 'lien', 'chan', 'this', 'week', 'infuriated', 'speaking', 'only', 'hours', 'after', 'chinese', 'state', 'media', 'said', 'time', 'right', 'engage', 'political', 'foreign', 'ministry', 'spokesman', 'shen', 'guofang', 'told', 'reuters', ':', 'necessary', 'opening', 'has', 'been', 'disrupted', 'authorities', 'quoted', 'top', 'negotiator', 'tang', 'shubei', 'as', 'visiting', 'group', 'wednesday', 'rivals', 'hold', 'now', 'is', 'two', 'sides', '...', 'hostility', 'overseas', 'edition', 'people', 'daily', 'saying', 'television', 'interview', 'had', 'read', 'reports', 'comments', 'but', 'gave', 'details', 'why', 'considered', 'considers', 'renegade', 'province', 'long', 'opposed', 'all', 'efforts', 'gain', 'greater', 'international', 'recognition', 'rival', 'island', 'should', 'take', 'practical', 'steps', 'towards', 'goal', 'consultations', 'be', 'held', 'set', 'format', 'official', 'xinhua', 'news', 'agency', 'executive', 'chairman', 'association', 'relations', 'straits', 'german', 'july', 'car', 'registrations', '14.2', 'pct', 'yr', '/', 'frankfurt', 'first-time', 'motor', 'vehicles', 'jumped', 'percent', 'year', 'year-earlier', 'period', 'federal', 'office', '356,725', 'new', 'cars', 'registered', '1996', '--', '304,850', 'passenger', '15,613', 'trucks', 'figures', 'represent', '13.6', 'increase', '2.2', 'decline', '1995', 'motor-bike', 'registration', 'rose', '32.7', 'growth', 'partly', 'due', 'increased', 'number', 'germans', 'buying', 'abroad', 'while', 'manufacturers', 'domestic', 'demand', 'weak', 'posted', 'gains', 'numbers', 'volkswagen', 'ag', 'won', '77,719', 'slightly', 'more', 'than', 'quarter', 'total', 'opel', 'together', 'general', 'motors', 'came', 'second', 'place', '49,269', '16.4', 'overall', 'figure', 'third', 'ford', '35,563', 'or', '11.7', 'seat', 'porsche', 'fewer', 'compared', 'last', '3,420', '5522', 'earlier', 'fell', '554', '643', 'greek', 'socialists', 'give', 'green', 'light', 'pm', 'elections', 'athens', 'socialist', 'party', 'bureau', 'prime', 'minister', 'costas', 'simitis', 'call', 'snap', 'its', 'secretary', 'skandalidis', 'reporters', 'going', 'make', 'announcement', 'cabinet', 'meeting', 'later', 'dimitris', 'kontogiannis', 'newsroom', '+301', '3311812-4', 'bayervb', 'sets', 
'c$', '100', 'million', 'six-year', 'bond', 'following', 'announced', 'lead', 'manager', 'toronto', 'dominion', 'borrower', 'bayerische', 'vereinsbank', 'amt', 'mln', 'coupon', '6.625', 'maturity', '24.sep.02', 'type', 'straight', 'iss', 'price', '100.92', 'pay', 'date', '24.sep.96', 'full', 'fees', '1.875', 'reoffer', '99.32', 'spread', '+20', 'bp', 'moody', 'aa1', 'listing', 'lux', 'freq', '=', 's&p', 'denoms', 'k', '1-10-100', 'sale', 'limits', 'us', 'uk', 'ca', 'neg', 'plg', 'crs', 'deflt', 'force', 'maj', 'gov', 'law', 'home
]
}
],
"source": [
"v = build_vocab(slowa_train)\n",
"v.set_default_index(v[\"<unk>\"])\n",
"itos = v.get_itos() # mapowanie indeksów na tokeny\n",
"print(len(itos)) # liczba różnych tokenów w słowniku\n",
"print(itos)"
]
},
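{
"cell_type": "markdown",
"metadata": {},
"source": [
"A quick sanity check of how the vocabulary behaves (a sketch; the concrete indices follow from the printout above): known tokens map to their index, and thanks to `set_default_index` every out-of-vocabulary token falls back to `<unk>`:\n",
"```python\n",
"print(v[\"<unk>\"])         # 0 - the first special token\n",
"print(v[\"hendrix\"])       # 5 - a token seen in the training data\n",
"print(v[\"xyz-not-seen\"])  # 0 - OOV, mapped to the <unk> index\n",
"```"
]
},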
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"{'O': 0, 'B-PER': 1, 'B-LOC': 2, 'I-PER': 3, 'B-MISC': 4, 'I-MISC': 5, 'I-LOC': 6, 'B-ORG': 7, 'I-ORG': 8}\n"
]
}
],
"source": [
"# slownik etykiety - kody etykiet\n",
"etykieta_na_kod = {}\n",
"licznik = 0\n",
"for tekst in train[\"y\"]:\n",
" for etykieta in tekst.split(\" \"):\n",
" if etykieta not in etykieta_na_kod:\n",
" etykieta_na_kod[etykieta] = licznik\n",
" licznik+=1\n",
"print(etykieta_na_kod)"
]
},
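{
"cell_type": "markdown",
"metadata": {},
"source": [
"The reverse mapping, from codes back to tag strings, is what writing predictions out will need. A minimal sketch (the name `kod_na_etykiete` is an assumption, not defined in this notebook):\n",
"```python\n",
"kod_na_etykiete = {kod: etykieta for etykieta, kod in etykieta_na_kod.items()}\n",
"print(kod_na_etykiete[1])  # 'B-PER'\n",
"```"
]
},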
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 1, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 5, 5, 5, 0, 0, 0, 1, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]\n"
]
}
],
"source": [
"# podziel etykiety\n",
"kody_etykiet_train = []\n",
"for tekst in train[\"y\"]:\n",
" pom = []\n",
" for etykieta in tekst.split(\" \"):\n",
" pom.append(etykieta_na_kod[etykieta])\n",
" kody_etykiet_train.append(pom)\n",
"print(kody_etykiet_train[0])"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"O O B-MISC I-MISC I-MISC O O O B-LOC O O O O O O O O O B-MISC O O O O O O O B-LOC O B-ORG O B-ORG O O O O O O O O B-ORG O O O O B-PER I-PER O O O B-ORG O O O B-ORG O O O B-ORG O O O B-LOC O B-ORG O O O O B-PER I-PER O O B-PER I-PER O O B-PER I-PER O O O B-ORG O O B-PER I-PER O O O O O O B-LOC I-LOC O B-ORG O O B-PER I-PER O O B-PER I-PER O O O B-ORG O O O B-LOC O B-LOC I-LOC O O B-ORG O O B-ORG O O B-PER I-PER O O O O B-PER I-PER O O B-PER I-PER O O B-PER I-PER O O B-PER I-PER O O O O B-LOC O B-ORG O O B-PER I-PER O O B-PER I-PER O O B-PER I-PER O O O B-ORG O O B-PER I-PER O O O B-LOC O B-ORG O O O O B-PER I-PER O O B-PER I-PER O O B-PER I-PER O O O B-ORG O O O O O B-LOC O B-ORG O O O O B-ORG O O B-PER I-PER O O B-PER O O O O B-PER I-PER O O B-PER I-PER O O O B-LOC O B-ORG O O O O B-PER I-PER O O O O O B-ORG O O B-PER I-PER O O B-PER I-PER O O O O\n",
"[0, 0, 4, 5, 5, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0, 2, 0, 7, 0, 7, 0, 0, 0, 0, 0, 0, 0, 0, 7, 0, 0, 0, 0, 1, 3, 0, 0, 0, 7, 0, 0, 0, 7, 0, 0, 0, 7, 0, 0, 0, 2, 0, 7, 0, 0, 0, 0, 1, 3, 0, 0, 1, 3, 0, 0, 1, 3, 0, 0, 0, 7, 0, 0, 1, 3, 0, 0, 0, 0, 0, 0, 2, 6, 0, 7, 0, 0, 1, 3, 0, 0, 1, 3, 0, 0, 0, 7, 0, 0, 0, 2, 0, 2, 6, 0, 0, 7, 0, 0, 7, 0, 0, 1, 3, 0, 0, 0, 0, 1, 3, 0, 0, 1, 3, 0, 0, 1, 3, 0, 0, 1, 3, 0, 0, 0, 0, 2, 0, 7, 0, 0, 1, 3, 0, 0, 1, 3, 0, 0, 1, 3, 0, 0, 0, 7, 0, 0, 1, 3, 0, 0, 0, 2, 0, 7, 0, 0, 0, 0, 1, 3, 0, 0, 1, 3, 0, 0, 1, 3, 0, 0, 0, 7, 0, 0, 0, 0, 0, 2, 0, 7, 0, 0, 0, 0, 7, 0, 0, 1, 3, 0, 0, 1, 0, 0, 0, 0, 1, 3, 0, 0, 1, 3, 0, 0, 0, 2, 0, 7, 0, 0, 0, 0, 1, 3, 0, 0, 0, 0, 0, 7, 0, 0, 1, 3, 0, 0, 1, 3, 0, 0, 0, 0]\n"
]
}
],
"source": [
"# odczytaj etykiety dev-0\n",
"labels_dev0 = pd.read_csv('dev-0/expected.tsv', sep='\\t')\n",
"labels_dev0.columns = [\"y\"]\n",
"print(labels_dev0[\"y\"][0])\n",
"\n",
"# podziel etykiety\n",
"kody_etykiet_dev0 = []\n",
"for tekst in labels_dev0[\"y\"]:\n",
" pom = []\n",
" for etykieta in tekst.split(\" \"):\n",
" pom.append(etykieta_na_kod[etykieta])\n",
" kody_etykiet_dev0.append(pom)\n",
"print(kody_etykiet_dev0[0])"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def data_process(dt):\n",
" # Wektoryzacja dokumentów tekstowych.\n",
" return [\n",
" torch.tensor(\n",
" [v[\"<bos>\"]] + [v[token] for token in document] + [v[\"<eos>\"]],\n",
" dtype=torch.long,\n",
" )\n",
" for document in dt\n",
" ]\n",
"\n",
"def labels_process(dt):\n",
" # Wektoryzacja etykiet (NER)\n",
" return [torch.tensor([0] + document + [0], dtype=torch.long) for document in dt]"
]
},
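{
"cell_type": "markdown",
"metadata": {},
"source": [
"A minimal illustration of the two helpers on a toy document (hypothetical input; the token indices follow from the vocabulary above): `data_process` wraps the tokens in `<bos>`/`<eos>`, and `labels_process` pads the label sequence with 0 (the code of `O`) so both tensors stay the same length:\n",
"```python\n",
"doc = [\"rare\", \"hendrix\", \"song\"]\n",
"tags = [0, 1, 0]  # O, B-PER, O\n",
"\n",
"print(data_process([doc])[0])     # tensor([2, 4, 5, 6, 3]) - <bos> ... <eos>\n",
"print(labels_process([tags])[0])  # tensor([0, 0, 1, 0, 0])\n",
"```"
]
},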
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"train_tokens_ids = data_process(slowa_train)\n",
"test_dev0_tokens_ids = data_process(slowa_test_dev0)\n",
"test_A_tokens_ids = data_process(slowa_test_A)\n",
"\n",
"train_labels = labels_process(kody_etykiet_train)\n",
"test_dev0_labels = labels_process(kody_etykiet_dev0)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"944 199\n",
"214 256\n",
"229 283\n",
"tensor([ 2, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,\n",
" 14, 17, 4, 18, 19, 7, 20, 17, 6, 21, 22, 23, 24, 25,\n",
" 5, 26, 27, 9, 10, 11, 12, 28, 29, 30, 31, 32, 20, 33,\n",
" 20, 34, 35, 36, 37, 38, 39, 13, 14, 17, 40, 41, 42, 43,\n",
" 44, 45, 11, 46, 47, 9, 34, 7, 20, 48, 49, 50, 51, 52,\n",
" 48, 53, 54, 5, 55, 28, 17, 56, 20, 15, 57, 58, 59, 35,\n",
" 60, 13, 14, 30, 34, 61, 20, 17, 62, 63, 64, 59, 34, 65,\n",
" 66, 20, 67, 68, 69, 34, 70, 20, 71, 72, 34, 73, 53, 74,\n",
" 75, 26, 76, 21, 17, 77, 13, 14, 78, 79, 80, 81, 82, 83,\n",
" 84, 85, 86, 87, 81, 9, 32, 21, 5, 37, 88, 89, 90, 91,\n",
" 53, 92, 93, 94, 95, 96, 60, 97, 98, 13, 14, 99, 100, 17,\n",
" 101, 102, 103, 104, 20, 105, 106, 107, 108, 21, 5, 97, 109, 110,\n",
" 111, 53, 54, 31, 112, 113, 114, 115, 9, 116, 44, 45, 11, 117,\n",
" 47, 13, 14, 34, 118, 119, 20, 17, 111, 120, 59, 121, 122, 123,\n",
" 13, 14, 3])\n",
"tensor([ 2, 1949, 459, 65, 1950, 1951, 1592, 13, 14, 15,\n",
" 19342, 14, 1793, 103, 1465, 20, 1952, 1592, 59, 65,\n",
" 1950, 1951, 1954, 28, 947, 166, 14, 1992, 166, 1993,\n",
" 1703, 1965, 21, 31, 2038, 103, 3671, 2932, 13, 14,\n",
" 1965, 6226, 103, 16331, 45, 1995, 1996, 0, 47, 53,\n",
" 1993, 0, 13, 14, 1993, 1055, 1330, 53, 1965, 1864,\n",
" 13, 14, 17021, 166, 1991, 19322, 103, 14088, 45, 1977,\n",
" 0, 1620, 53, 10801, 12466, 1620, 1962, 1958, 0, 0,\n",
" 47, 53, 1956, 19326, 45, 1960, 19327, 19328, 47, 103,\n",
" 16667, 13, 14, 19313, 1363, 166, 2012, 0, 45, 1995,\n",
" 2752, 5725, 1962, 1967, 0, 0, 47, 53, 1985, 0,\n",
" 13, 14, 15, 45, 34, 2037, 47, 166, 2020, 14779,\n",
" 53, 2018, 0, 45, 2030, 2059, 5455, 620, 618, 53,\n",
" 1967, 0, 1602, 53, 1963, 0, 1976, 53, 1974, 0,\n",
" 1976, 53, 1958, 0, 3843, 47, 13, 14, 19318, 166,\n",
" 2002, 16329, 45, 2024, 19320, 9379, 53, 2007, 2008, 1979,\n",
" 1962, 2061, 10865, 0, 47, 53, 2034, 0, 45, 2024,\n",
" 0, 2054, 47, 14, 6206, 166, 12584, 11568, 103, 11269,\n",
" 45, 1974, 19334, 9379, 53, 1967, 19335, 1997, 53, 1967,\n",
" 17052, 6226, 47, 53, 2000, 15584, 103, 0, 13, 14,\n",
" 9493, 166, 2026, 10970, 103, 0, 53, 9314, 0, 45,\n",
" 1974, 2717, 0, 53, 0, 6237, 620, 618, 53, 6223,\n",
" 19332, 11058, 1962, 6227, 401, 0, 47, 14, 9488, 166,\n",
" 1972, 19340, 103, 0, 45, 1974, 1975, 4451, 620, 618,\n",
" 47, 53, 2010, 14739, 45, 6223, 6224, 1979, 1962, 1977,\n",
" 4839, 1981, 47, 13, 14, 3])\n",
"tensor([ 2, 6342, 769, 459, 0, 960, 9, 1681, 150, 17,\n",
" 253, 13, 14, 5474, 0, 14, 1681, 3063, 0, 0,\n",
" 14, 28, 947, 9, 701, 7189, 572, 2124, 30, 0,\n",
" 300, 301, 17, 253, 150, 34, 14863, 6363, 371, 68,\n",
" 26, 3333, 370, 3631, 608, 11618, 13, 14, 0, 53,\n",
" 92, 1738, 1753, 0, 154, 26, 3388, 97, 1952, 28,\n",
" 3978, 145, 53, 26, 2116, 59, 17, 0, 2099, 14403,\n",
" 148, 186, 20, 34, 695, 2519, 0, 21, 1208, 30,\n",
" 0, 324, 729, 13, 14, 2725, 0, 185, 618, 863,\n",
" 1521, 103, 0, 154, 68, 197, 2954, 960, 2955, 0,\n",
" 0, 53, 92, 197, 170, 3063, 9, 34, 1208, 2154,\n",
" 150, 1824, 1053, 618, 20, 34, 457, 695, 13, 14,\n",
" 0, 371, 110, 12530, 150, 34, 274, 1593, 1711, 53,\n",
" 74, 68, 596, 452, 94, 1458, 2954, 96, 34, 1681,\n",
" 749, 85, 2517, 1208, 59, 34, 8797, 9174, 13, 14,\n",
" 0, 154, 68, 197, 4705, 34, 6392, 186, 836, 2521,\n",
" 700, 17, 17097, 13, 14, 48, 68, 1240, 34, 1593,\n",
" 1711, 28, 34, 4645, 3370, 53, 48, 0, 154, 13,\n",
" 14, 48, 1500, 1798, 75, 693, 226, 5612, 97, 606,\n",
" 95, 960, 103, 68, 154, 68, 693, 226, 1221, 13,\n",
" 14, 1500, 4604, 184, 185, 34, 156, 155, 9, 95,\n",
" 97, 671, 13, 48, 14, 2099, 166, 2718, 0, 53,\n",
" 0, 16807, 53, 0, 0, 53, 2886, 0, 53, 0,\n",
" 0, 53, 2854, 0, 53, 10959, 0, 53, 11542, 0,\n",
" 53, 0, 0, 53, 2219, 0, 53, 0, 0, 53,\n",
" 4036, 0, 53, 17118, 0, 53, 11460, 0, 0, 53,\n",
" 4036, 0, 53, 0, 0, 53, 0, 0, 53, 9462,\n",
" 0, 53, 13541, 0, 53, 11542, 0, 53, 2219, 0,\n",
" 13, 14, 3])\n"
]
}
],
"source": [
"print(len(train_tokens_ids), len(train_tokens_ids[0]))\n",
"print(len(test_dev0_tokens_ids), len(test_dev0_tokens_ids[0]))\n",
"print(len(test_A_tokens_ids), len(test_A_tokens_ids[0]))\n",
"\n",
"print(train_tokens_ids[0])\n",
"print(test_dev0_tokens_ids[0])\n",
"print(test_A_tokens_ids[0])"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"944 199\n",
"214 256\n",
"tensor([0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
" 2, 0, 0, 1, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
" 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 5, 5, 5, 0, 0,\n",
" 0, 1, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
" 0, 4, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
" 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 3, 0, 0, 0, 0,\n",
" 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,\n",
" 0, 0, 0, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
" 0, 0, 0, 0, 0, 0, 0])\n",
"tensor([0, 0, 0, 4, 5, 5, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 0, 0, 0, 0,\n",
" 0, 0, 0, 2, 0, 7, 0, 7, 0, 0, 0, 0, 0, 0, 0, 0, 7, 0, 0, 0, 0, 1, 3, 0,\n",
" 0, 0, 7, 0, 0, 0, 7, 0, 0, 0, 7, 0, 0, 0, 2, 0, 7, 0, 0, 0, 0, 1, 3, 0,\n",
" 0, 1, 3, 0, 0, 1, 3, 0, 0, 0, 7, 0, 0, 1, 3, 0, 0, 0, 0, 0, 0, 2, 6, 0,\n",
" 7, 0, 0, 1, 3, 0, 0, 1, 3, 0, 0, 0, 7, 0, 0, 0, 2, 0, 2, 6, 0, 0, 7, 0,\n",
" 0, 7, 0, 0, 1, 3, 0, 0, 0, 0, 1, 3, 0, 0, 1, 3, 0, 0, 1, 3, 0, 0, 1, 3,\n",
" 0, 0, 0, 0, 2, 0, 7, 0, 0, 1, 3, 0, 0, 1, 3, 0, 0, 1, 3, 0, 0, 0, 7, 0,\n",
" 0, 1, 3, 0, 0, 0, 2, 0, 7, 0, 0, 0, 0, 1, 3, 0, 0, 1, 3, 0, 0, 1, 3, 0,\n",
" 0, 0, 7, 0, 0, 0, 0, 0, 2, 0, 7, 0, 0, 0, 0, 7, 0, 0, 1, 3, 0, 0, 1, 0,\n",
" 0, 0, 0, 1, 3, 0, 0, 1, 3, 0, 0, 0, 2, 0, 7, 0, 0, 0, 0, 1, 3, 0, 0, 0,\n",
" 0, 0, 7, 0, 0, 1, 3, 0, 0, 1, 3, 0, 0, 0, 0, 0])\n"
]
}
],
"source": [
"print(len(train_labels), len(train_labels[0]))\n",
"print(len(test_dev0_labels), len(test_dev0_labels[0]))\n",
"\n",
"print(train_labels[0])\n",
"print(test_dev0_labels[0])"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def get_scores(y_true, y_pred):\n",
" # Funkcja zwraca precyzję, pokrycie i F1\n",
" acc_score = 0\n",
" tp = 0\n",
" fp = 0\n",
" selected_items = 0\n",
" relevant_items = 0\n",
"\n",
" for p, t in zip(y_pred, y_true):\n",
" if p == t:\n",
" acc_score += 1\n",
"\n",
" if p > 0 and p == t:\n",
" tp += 1\n",
"\n",
" if p > 0:\n",
" selected_items += 1\n",
"\n",
" if t > 0:\n",
" relevant_items += 1\n",
"\n",
" if selected_items == 0:\n",
" precision = 1.0\n",
" else:\n",
" precision = tp / selected_items\n",
"\n",
" if relevant_items == 0:\n",
" recall = 1.0\n",
" else:\n",
" recall = tp / relevant_items\n",
"\n",
" if precision + recall == 0.0:\n",
" f1 = 0.0\n",
" else:\n",
" f1 = 2 * precision * recall / (precision + recall)\n",
"\n",
" return precision, recall, f1"
]
},
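{
"cell_type": "markdown",
"metadata": {},
"source": [
"A toy check of `get_scores` (hypothetical label codes): three tokens are predicted as entities, two of them correctly, and four gold tokens are entities, so precision is 2/3, recall 2/4 and F1 their harmonic mean:\n",
"```python\n",
"y_true = [0, 1, 1, 0, 3, 0, 2]\n",
"y_pred = [0, 1, 0, 0, 3, 4, 0]\n",
"\n",
"precision, recall, f1 = get_scores(y_true, y_pred)\n",
"print(precision, recall, f1)  # 0.666... 0.5 0.571...\n",
"```"
]
},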
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"num_tags = len(etykieta_na_kod.keys())\n",
"\n",
"class LSTM(torch.nn.Module):\n",
"\n",
" def __init__(self):\n",
" super(LSTM, self).__init__()\n",
" self.emb = torch.nn.Embedding(len(v.get_itos()), 100)\n",
" self.rec = torch.nn.LSTM(100, 256, 1, batch_first=True)\n",
" self.fc1 = torch.nn.Linear(256, num_tags)\n",
"\n",
" def forward(self, x):\n",
" emb = torch.relu(self.emb(x))\n",
" lstm_output, (h_n, c_n) = self.rec(emb)\n",
" out_weights = self.fc1(lstm_output)\n",
" return out_weights"
]
},
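{
"cell_type": "markdown",
"metadata": {},
"source": [
"Tensor shapes through the model, traced on a dummy batch (a sketch): the embedding turns `(batch, seq)` token ids into `(batch, seq, 100)` vectors, the LSTM maps those to `(batch, seq, 256)` hidden states, and the linear head produces `(batch, seq, num_tags)` scores, one vector per token:\n",
"```python\n",
"model = LSTM()\n",
"x = torch.randint(0, len(v), (1, 12))  # dummy batch: 1 document, 12 tokens\n",
"print(model(x).shape)  # torch.Size([1, 12, 9]) - num_tags scores per token\n",
"```"
]
},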
{
"cell_type": "code",
2024-05-25 17:47:40 +02:00
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def eval_model(dataset_tokens, dataset_labels, model):\n",
" Y_true = []\n",
" Y_pred = []\n",
" for i in tqdm(range(len(dataset_labels))):\n",
" batch_tokens = dataset_tokens[i].unsqueeze(0)\n",
" tags = list(dataset_labels[i].numpy())\n",
" Y_true += tags\n",
"\n",
" Y_batch_pred_weights = model(batch_tokens).squeeze(0)\n",
" Y_batch_pred = torch.argmax(Y_batch_pred_weights, 1)\n",
" Y_pred += list(Y_batch_pred.numpy())\n",
"\n",
" return get_scores(Y_true, Y_pred)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"lstm = LSTM()\n",
"criterion = torch.nn.CrossEntropyLoss()\n",
"optimizer = torch.optim.Adam(lstm.parameters())\n",
"NUM_EPOCHS = 5"
]
},
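{
"cell_type": "markdown",
"metadata": {},
"source": [
"The loop below trains on one sentence per step, so `CrossEntropyLoss` sees logits of shape `(seq_len, num_tags)` against a target vector of shape `(seq_len,)`. A minimal sketch with made-up tensors, just to illustrate the expected shapes:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Illustration only: random logits and targets for a 7-token sentence\n",
"example_logits = torch.randn(7, num_tags)\n",
"example_targets = torch.randint(0, num_tags, (7,))\n",
"print(criterion(example_logits, example_targets))"
]
},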
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
" 88%|████████▊ | 832/944 [00:37<00:05, 21.94it/s]\n"
]
},
{
"ename": "KeyboardInterrupt",
"evalue": "",
"output_type": "error",
"traceback": [
"\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[1;31mKeyboardInterrupt\u001b[0m Traceback (most recent call last)",
"Cell \u001b[1;32mIn[23], line 13\u001b[0m\n\u001b[0;32m 10\u001b[0m optimizer\u001b[38;5;241m.\u001b[39mzero_grad()\n\u001b[0;32m 11\u001b[0m loss \u001b[38;5;241m=\u001b[39m criterion(predicted_tags\u001b[38;5;241m.\u001b[39msqueeze(\u001b[38;5;241m0\u001b[39m), tags\u001b[38;5;241m.\u001b[39msqueeze(\u001b[38;5;241m1\u001b[39m))\n\u001b[1;32m---> 13\u001b[0m \u001b[43mloss\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mbackward\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 14\u001b[0m optimizer\u001b[38;5;241m.\u001b[39mstep()\n\u001b[0;32m 16\u001b[0m lstm\u001b[38;5;241m.\u001b[39meval()\n",
"File \u001b[1;32mc:\\Users\\Dominik\\Desktop\\Studia\\11.sem1\\en-ner-conll-2003\\.venv\\Lib\\site-packages\\torch\\_tensor.py:525\u001b[0m, in \u001b[0;36mTensor.backward\u001b[1;34m(self, gradient, retain_graph, create_graph, inputs)\u001b[0m\n\u001b[0;32m 515\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m has_torch_function_unary(\u001b[38;5;28mself\u001b[39m):\n\u001b[0;32m 516\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m handle_torch_function(\n\u001b[0;32m 517\u001b[0m Tensor\u001b[38;5;241m.\u001b[39mbackward,\n\u001b[0;32m 518\u001b[0m (\u001b[38;5;28mself\u001b[39m,),\n\u001b[1;32m (...)\u001b[0m\n\u001b[0;32m 523\u001b[0m inputs\u001b[38;5;241m=\u001b[39minputs,\n\u001b[0;32m 524\u001b[0m )\n\u001b[1;32m--> 525\u001b[0m \u001b[43mtorch\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mautograd\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mbackward\u001b[49m\u001b[43m(\u001b[49m\n\u001b[0;32m 526\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mgradient\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mretain_graph\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mcreate_graph\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43minputs\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43minputs\u001b[49m\n\u001b[0;32m 527\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n",
"File \u001b[1;32mc:\\Users\\Dominik\\Desktop\\Studia\\11.sem1\\en-ner-conll-2003\\.venv\\Lib\\site-packages\\torch\\autograd\\__init__.py:267\u001b[0m, in \u001b[0;36mbackward\u001b[1;34m(tensors, grad_tensors, retain_graph, create_graph, grad_variables, inputs)\u001b[0m\n\u001b[0;32m 262\u001b[0m retain_graph \u001b[38;5;241m=\u001b[39m create_graph\n\u001b[0;32m 264\u001b[0m \u001b[38;5;66;03m# The reason we repeat the same comment below is that\u001b[39;00m\n\u001b[0;32m 265\u001b[0m \u001b[38;5;66;03m# some Python versions print out the first line of a multi-line function\u001b[39;00m\n\u001b[0;32m 266\u001b[0m \u001b[38;5;66;03m# calls in the traceback and some print out the last line\u001b[39;00m\n\u001b[1;32m--> 267\u001b[0m \u001b[43m_engine_run_backward\u001b[49m\u001b[43m(\u001b[49m\n\u001b[0;32m 268\u001b[0m \u001b[43m \u001b[49m\u001b[43mtensors\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 269\u001b[0m \u001b[43m \u001b[49m\u001b[43mgrad_tensors_\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 270\u001b[0m \u001b[43m \u001b[49m\u001b[43mretain_graph\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 271\u001b[0m \u001b[43m \u001b[49m\u001b[43mcreate_graph\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 272\u001b[0m \u001b[43m \u001b[49m\u001b[43minputs\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 273\u001b[0m \u001b[43m \u001b[49m\u001b[43mallow_unreachable\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43;01mTrue\u001b[39;49;00m\u001b[43m,\u001b[49m\n\u001b[0;32m 274\u001b[0m \u001b[43m \u001b[49m\u001b[43maccumulate_grad\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43;01mTrue\u001b[39;49;00m\u001b[43m,\u001b[49m\n\u001b[0;32m 275\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n",
"File \u001b[1;32mc:\\Users\\Dominik\\Desktop\\Studia\\11.sem1\\en-ner-conll-2003\\.venv\\Lib\\site-packages\\torch\\autograd\\graph.py:744\u001b[0m, in \u001b[0;36m_engine_run_backward\u001b[1;34m(t_outputs, *args, **kwargs)\u001b[0m\n\u001b[0;32m 742\u001b[0m unregister_hooks \u001b[38;5;241m=\u001b[39m _register_logging_hooks_on_whole_graph(t_outputs)\n\u001b[0;32m 743\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[1;32m--> 744\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mVariable\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_execution_engine\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mrun_backward\u001b[49m\u001b[43m(\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;66;43;03m# Calls into the C++ engine to run the backward pass\u001b[39;49;00m\n\u001b[0;32m 745\u001b[0m \u001b[43m \u001b[49m\u001b[43mt_outputs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\n\u001b[0;32m 746\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m \u001b[38;5;66;03m# Calls into the C++ engine to run the backward pass\u001b[39;00m\n\u001b[0;32m 747\u001b[0m \u001b[38;5;28;01mfinally\u001b[39;00m:\n\u001b[0;32m 748\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m attach_logging_hooks:\n",
"\u001b[1;31mKeyboardInterrupt\u001b[0m: "
]
}
],
"source": [
"for i in range(NUM_EPOCHS):\n",
" lstm.train()\n",
" # for i in tqdm(range(500)):\n",
" for i in tqdm(range(len(train_labels))):\n",
" batch_tokens = train_tokens_ids[i].unsqueeze(0)\n",
" tags = train_labels[i].unsqueeze(1)\n",
"\n",
" predicted_tags = lstm(batch_tokens)\n",
"\n",
" optimizer.zero_grad()\n",
" loss = criterion(predicted_tags.squeeze(0), tags.squeeze(1))\n",
"\n",
" loss.backward()\n",
" optimizer.step()\n",
"\n",
" lstm.eval()"
]
},
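{
"cell_type": "markdown",
"metadata": {},
"source": [
"The run above was stopped with a `KeyboardInterrupt`, so checkpointing after each epoch avoids losing progress. A sketch using the standard `torch.save`/`load_state_dict` pair (the file name is arbitrary):"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Save and restore the tagger weights (file name chosen arbitrarily)\n",
"torch.save(lstm.state_dict(), 'lstm_ner.pt')\n",
"lstm.load_state_dict(torch.load('lstm_ner.pt'))"
]
},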
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"100%|██████████| 214/214 [00:00<00:00, 262.62it/s]\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"(0.6558005752636625, 0.7225352112676057, 0.687552353828112)\n"
]
}
],
"source": [
"print(eval_model(test_dev0_tokens_ids, test_dev0_labels, lstm))"
]
}
],
"metadata": {
"author": "Jakub Pokrywka",
"email": "kubapok@wmi.amu.edu.pl",
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"lang": "pl",
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.3"
},
"subtitle": "11.NER RNN[ćwiczenia]",
"title": "Ekstrakcja informacji",
"year": "2021"
},
"nbformat": 4,
"nbformat_minor": 4
}