en-ner-conll-2003/3_RNN — kopia — kopia copy.ipynb

{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Uczenie głębokie przetwarzanie tekstu laboratoria\n",
"# 3. RNN"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"^C\n",
"Note: you may need to restart the kernel to use updated packages.\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Using pip 24.0 from c:\\Python312\\Lib\\site-packages\\pip (python 3.12)\n",
"Looking in indexes: https://download.pytorch.org/whl/cu118\n",
"Collecting torch\n",
" Using cached https://download.pytorch.org/whl/cu118/torch-2.3.0%2Bcu118-cp312-cp312-win_amd64.whl (2673.0 MB)\n",
"Collecting torchtext\n",
" Using cached https://download.pytorch.org/whl/torchtext-0.16.2%2Bcpu-cp312-cp312-win_amd64.whl (1.9 MB)\n",
"Collecting filelock (from torch)\n",
" Using cached https://download.pytorch.org/whl/filelock-3.13.1-py3-none-any.whl (11 kB)\n",
"Collecting typing-extensions>=4.8.0 (from torch)\n",
" Using cached https://download.pytorch.org/whl/typing_extensions-4.9.0-py3-none-any.whl (32 kB)\n",
"Collecting sympy (from torch)\n",
" Using cached https://download.pytorch.org/whl/sympy-1.12-py3-none-any.whl (5.7 MB)\n",
"Collecting networkx (from torch)\n",
" Using cached https://download.pytorch.org/whl/networkx-3.2.1-py3-none-any.whl (1.6 MB)\n",
"Collecting jinja2 (from torch)\n",
" Using cached https://download.pytorch.org/whl/Jinja2-3.1.3-py3-none-any.whl (133 kB)\n",
"Collecting fsspec (from torch)\n",
" Using cached https://download.pytorch.org/whl/fsspec-2024.2.0-py3-none-any.whl (170 kB)\n",
"Collecting mkl<=2021.4.0,>=2021.1.1 (from torch)\n",
" Using cached https://download.pytorch.org/whl/mkl-2021.4.0-py2.py3-none-win_amd64.whl (228.5 MB)\n",
"Collecting tqdm (from torchtext)\n",
" Using cached https://download.pytorch.org/whl/tqdm-4.64.1-py2.py3-none-any.whl (78 kB)\n",
"Collecting requests (from torchtext)\n",
" Using cached https://download.pytorch.org/whl/requests-2.28.1-py3-none-any.whl (62 kB)\n",
"Collecting torch\n",
" Using cached https://download.pytorch.org/whl/cu118/torch-2.2.0%2Bcu118-cp312-cp312-win_amd64.whl (2704.2 MB)\n",
"Collecting numpy (from torchtext)\n",
" Using cached https://download.pytorch.org/whl/numpy-1.26.3-cp312-cp312-win_amd64.whl (15.5 MB)\n",
"Collecting torchdata==0.7.1 (from torchtext)\n",
" Using cached https://download.pytorch.org/whl/torchdata-0.7.1-py3-none-any.whl (184 kB)\n",
"Collecting urllib3>=1.25 (from torchdata==0.7.1->torchtext)\n",
" Using cached https://download.pytorch.org/whl/urllib3-1.26.13-py2.py3-none-any.whl (140 kB)\n",
"Collecting MarkupSafe>=2.0 (from jinja2->torch)\n",
" Using cached https://download.pytorch.org/whl/MarkupSafe-2.1.5-cp312-cp312-win_amd64.whl (17 kB)\n",
"Collecting charset-normalizer<3,>=2 (from requests->torchtext)\n",
" Using cached https://download.pytorch.org/whl/charset_normalizer-2.1.1-py3-none-any.whl (39 kB)\n",
"Collecting idna<4,>=2.5 (from requests->torchtext)\n",
" Using cached https://download.pytorch.org/whl/idna-3.4-py3-none-any.whl (61 kB)\n",
"Collecting certifi>=2017.4.17 (from requests->torchtext)\n",
" Using cached https://download.pytorch.org/whl/certifi-2022.12.7-py3-none-any.whl (155 kB)\n",
"Collecting mpmath>=0.19 (from sympy->torch)\n",
" Using cached https://download.pytorch.org/whl/mpmath-1.3.0-py3-none-any.whl (536 kB)\n",
"Collecting colorama (from tqdm->torchtext)\n",
" Using cached https://download.pytorch.org/whl/colorama-0.4.6-py2.py3-none-any.whl (25 kB)\n",
"Collecting intel-openmp==2021.* (from mkl<=2021.4.0,>=2021.1.1->torch)\n",
" Using cached https://download.pytorch.org/whl/intel_openmp-2021.4.0-py2.py3-none-win_amd64.whl (3.5 MB)\n",
"Collecting tbb==2021.* (from mkl<=2021.4.0,>=2021.1.1->torch)\n",
" Using cached https://download.pytorch.org/whl/tbb-2021.11.0-py3-none-win_amd64.whl (298 kB)\n",
"Installing collected packages: mpmath, urllib3, typing-extensions, sympy, numpy, networkx, MarkupSafe, idna, fsspec, filelock, colorama, charset-normalizer, certifi, tqdm, requests, jinja2, torch, torchdata, torchtext\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"ERROR: Could not install packages due to an OSError.\n",
"Consider using the `--user` option or check the permissions.\n",
"Traceback (most recent call last):\n",
" File \"c:\\Python312\\Lib\\site-packages\\pip\\_internal\\commands\\install.py\", line 452, in run\n",
" installed = install_given_reqs(\n",
" ^^^^^^^^^^^^^^^^^^^\n",
" File \"c:\\Python312\\Lib\\site-packages\\pip\\_internal\\req\\__init__.py\", line 72, in install_given_reqs\n",
" requirement.install(\n",
" File \"c:\\Python312\\Lib\\site-packages\\pip\\_internal\\req\\req_install.py\", line 856, in install\n",
" install_wheel(\n",
" File \"c:\\Python312\\Lib\\site-packages\\pip\\_internal\\operations\\install\\wheel.py\", line 725, in install_wheel\n",
" _install_wheel(\n",
" File \"c:\\Python312\\Lib\\site-packages\\pip\\_internal\\operations\\install\\wheel.py\", line 585, in _install_wheel\n",
" file.save()\n",
" File \"c:\\Python312\\Lib\\site-packages\\pip\\_internal\\operations\\install\\wheel.py\", line 378, in save\n",
" os.unlink(self.dest_path)\n",
"PermissionError: [WinError 5] Odmowa dostępu: 'c:\\\\Python312\\\\Lib\\\\site-packages\\\\numpy\\\\core\\\\_multiarray_tests.cp312-win_amd64.pyd'\n"
]
}
],
"source": [
"%pip install --ignore-installed --force-reinstall -v torch torchtext --index-url https://download.pytorch.org/whl/cu118"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [
{
"ename": "AttributeError",
"evalue": "partially initialized module 'charset_normalizer' has no attribute 'md__mypyc' (most likely due to a circular import)",
"output_type": "error",
"traceback": [
"\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[1;31mModuleNotFoundError\u001b[0m Traceback (most recent call last)",
"File \u001b[1;32mc:\\Python312\\Lib\\site-packages\\requests\\compat.py:11\u001b[0m\n\u001b[0;32m 10\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[1;32m---> 11\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;21;01mchardet\u001b[39;00m\n\u001b[0;32m 12\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mImportError\u001b[39;00m:\n",
"\u001b[1;31mModuleNotFoundError\u001b[0m: No module named 'chardet'",
"\nDuring handling of the above exception, another exception occurred:\n",
"\u001b[1;31mAttributeError\u001b[0m Traceback (most recent call last)",
"Cell \u001b[1;32mIn[1], line 3\u001b[0m\n\u001b[0;32m 1\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mcollections\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m Counter\n\u001b[0;32m 2\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;21;01mtorch\u001b[39;00m\n\u001b[1;32m----> 3\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mtorchtext\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mvocab\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m vocab\n\u001b[0;32m 4\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mtqdm\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m tqdm\n\u001b[0;32m 5\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mipywidgets\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m FloatProgress\n",
"File \u001b[1;32mc:\\Python312\\Lib\\site-packages\\torchtext\\__init__.py:12\u001b[0m\n\u001b[0;32m 8\u001b[0m _TEXT_BUCKET \u001b[38;5;241m=\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mhttps://download.pytorch.org/models/text/\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[0;32m 10\u001b[0m _CACHE_DIR \u001b[38;5;241m=\u001b[39m os\u001b[38;5;241m.\u001b[39mpath\u001b[38;5;241m.\u001b[39mexpanduser(os\u001b[38;5;241m.\u001b[39mpath\u001b[38;5;241m.\u001b[39mjoin(_get_torch_home(), \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mtext\u001b[39m\u001b[38;5;124m\"\u001b[39m))\n\u001b[1;32m---> 12\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01m.\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m data, datasets, prototype, functional, models, nn, transforms, utils, vocab, experimental\n\u001b[0;32m 14\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m 15\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mversion\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m __version__, git_version \u001b[38;5;66;03m# noqa: F401\u001b[39;00m\n",
"File \u001b[1;32mc:\\Python312\\Lib\\site-packages\\torchtext\\datasets\\__init__.py:3\u001b[0m\n\u001b[0;32m 1\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;21;01mimportlib\u001b[39;00m\n\u001b[1;32m----> 3\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mag_news\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m AG_NEWS\n\u001b[0;32m 4\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mamazonreviewfull\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m AmazonReviewFull\n\u001b[0;32m 5\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mamazonreviewpolarity\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m AmazonReviewPolarity\n",
"File \u001b[1;32mc:\\Python312\\Lib\\site-packages\\torchtext\\datasets\\ag_news.py:5\u001b[0m\n\u001b[0;32m 2\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mfunctools\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m partial\n\u001b[0;32m 3\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mtyping\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m Union, Tuple\n\u001b[1;32m----> 5\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mtorchdata\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mdatapipes\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01miter\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m FileOpener, IterableWrapper\n\u001b[0;32m 6\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mtorchtext\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01m_download_hooks\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m HttpReader\n\u001b[0;32m 7\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mtorchtext\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01m_internal\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mmodule_utils\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m is_module_available\n",
"File \u001b[1;32mc:\\Python312\\Lib\\site-packages\\torchdata\\__init__.py:9\u001b[0m\n\u001b[0;32m 1\u001b[0m \u001b[38;5;66;03m# Copyright (c) Meta Platforms, Inc. and affiliates.\u001b[39;00m\n\u001b[0;32m 2\u001b[0m \u001b[38;5;66;03m# All rights reserved.\u001b[39;00m\n\u001b[0;32m 3\u001b[0m \u001b[38;5;66;03m#\u001b[39;00m\n\u001b[0;32m 4\u001b[0m \u001b[38;5;66;03m# This source code is licensed under the BSD-style license found in the\u001b[39;00m\n\u001b[0;32m 5\u001b[0m \u001b[38;5;66;03m# LICENSE file in the root directory of this source tree.\u001b[39;00m\n\u001b[0;32m 7\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mtorchdata\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m _extension \u001b[38;5;66;03m# noqa: F401\u001b[39;00m\n\u001b[1;32m----> 9\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01m.\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m datapipes\n\u001b[0;32m 11\u001b[0m janitor \u001b[38;5;241m=\u001b[39m datapipes\u001b[38;5;241m.\u001b[39mutils\u001b[38;5;241m.\u001b[39mjanitor\n\u001b[0;32m 13\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n",
"File \u001b[1;32mc:\\Python312\\Lib\\site-packages\\torchdata\\datapipes\\__init__.py:9\u001b[0m\n\u001b[0;32m 1\u001b[0m \u001b[38;5;66;03m# Copyright (c) Meta Platforms, Inc. and affiliates.\u001b[39;00m\n\u001b[0;32m 2\u001b[0m \u001b[38;5;66;03m# All rights reserved.\u001b[39;00m\n\u001b[0;32m 3\u001b[0m \u001b[38;5;66;03m#\u001b[39;00m\n\u001b[0;32m 4\u001b[0m \u001b[38;5;66;03m# This source code is licensed under the BSD-style license found in the\u001b[39;00m\n\u001b[0;32m 5\u001b[0m \u001b[38;5;66;03m# LICENSE file in the root directory of this source tree.\u001b[39;00m\n\u001b[0;32m 7\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mtorch\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mutils\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mdata\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m DataChunk, functional_datapipe\n\u001b[1;32m----> 9\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01m.\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;28miter\u001b[39m, \u001b[38;5;28mmap\u001b[39m, utils\n\u001b[0;32m 11\u001b[0m __all__ \u001b[38;5;241m=\u001b[39m [\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mDataChunk\u001b[39m\u001b[38;5;124m\"\u001b[39m, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mfunctional_datapipe\u001b[39m\u001b[38;5;124m\"\u001b[39m, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124miter\u001b[39m\u001b[38;5;124m\"\u001b[39m, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mmap\u001b[39m\u001b[38;5;124m\"\u001b[39m, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mutils\u001b[39m\u001b[38;5;124m\"\u001b[39m]\n",
"File \u001b[1;32mc:\\Python312\\Lib\\site-packages\\torchdata\\datapipes\\iter\\__init__.py:54\u001b[0m\n\u001b[0;32m 46\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mtorchdata\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mdatapipes\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01miter\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mload\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mhuggingface\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m HuggingFaceHubReaderIterDataPipe \u001b[38;5;28;01mas\u001b[39;00m HuggingFaceHubReader\n\u001b[0;32m 48\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mtorchdata\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mdatapipes\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01miter\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mload\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01miopath\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m (\n\u001b[0;32m 49\u001b[0m IoPathFileListerIterDataPipe \u001b[38;5;28;01mas\u001b[39;00m IoPathFileLister,\n\u001b[0;32m 50\u001b[0m IoPathFileOpenerIterDataPipe \u001b[38;5;28;01mas\u001b[39;00m IoPathFileOpener,\n\u001b[0;32m 51\u001b[0m IoPathSaverIterDataPipe \u001b[38;5;28;01mas\u001b[39;00m IoPathSaver,\n\u001b[0;32m 52\u001b[0m )\n\u001b[1;32m---> 54\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mtorchdata\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mdatapipes\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01miter\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mload\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01monline\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m (\n\u001b[0;32m 55\u001b[0m GDriveReaderDataPipe \u001b[38;5;28;01mas\u001b[39;00m GDriveReader,\n\u001b[0;32m 56\u001b[0m HTTPReaderIterDataPipe \u001b[38;5;28;01mas\u001b[39;00m HttpReader,\n\u001b[0;32m 57\u001b[0m OnlineReaderIterDataPipe \u001b[38;5;28;01mas\u001b[39;00m OnlineReader,\n\u001b[0;32m 58\u001b[0m )\n\u001b[0;32m 59\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mtorchdata\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mdatapipes\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01miter\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mload\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01ms3io\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m (\n\u001b[0;32m 60\u001b[0m S3FileListerIterDataPipe \u001b[38;5;28;01mas\u001b[39;00m S3FileLister,\n\u001b[0;32m 61\u001b[0m S3FileLoaderIterDataPipe \u001b[38;5;28;01mas\u001b[39;00m S3FileLoader,\n\u001b[0;32m 62\u001b[0m )\n\u001b[0;32m 63\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mtorchdata\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mdatapipes\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01miter\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mtransform\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mbucketbatcher\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m (\n\u001b[0;32m 64\u001b[0m BucketBatcherIterDataPipe \u001b[38;5;28;01mas\u001b[39;00m BucketBatcher,\n\u001b[0;32m 65\u001b[0m InBatchShufflerIterDataPipe \u001b[38;5;28;01mas\u001b[39;00m InBatchShuffler,\n\u001b[0;32m 66\u001b[0m MaxTokenBucketizerIterDataPipe \u001b[38;5;28;01mas\u001b[39;00m MaxTokenBucketizer,\n\u001b[0;32m 67\u001b[0m )\n",
"File \u001b[1;32mc:\\Python312\\Lib\\site-packages\\torchdata\\datapipes\\iter\\load\\online.py:12\u001b[0m\n\u001b[0;32m 9\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;21;01mwarnings\u001b[39;00m\n\u001b[0;32m 10\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mtyping\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m Any, Dict, Iterator, Optional, Tuple\n\u001b[1;32m---> 12\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;21;01mrequests\u001b[39;00m\n\u001b[0;32m 14\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mtorchdata\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mdatapipes\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m functional_datapipe\n\u001b[0;32m 15\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mtorchdata\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mdatapipes\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01miter\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m IterDataPipe\n",
"File \u001b[1;32mc:\\Python312\\Lib\\site-packages\\requests\\__init__.py:45\u001b[0m\n\u001b[0;32m 41\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;21;01mwarnings\u001b[39;00m\n\u001b[0;32m 43\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;21;01murllib3\u001b[39;00m\n\u001b[1;32m---> 45\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mexceptions\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m RequestsDependencyWarning\n\u001b[0;32m 47\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m 48\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mcharset_normalizer\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m __version__ \u001b[38;5;28;01mas\u001b[39;00m charset_normalizer_version\n",
"File \u001b[1;32mc:\\Python312\\Lib\\site-packages\\requests\\exceptions.py:9\u001b[0m\n\u001b[0;32m 1\u001b[0m \u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[0;32m 2\u001b[0m \u001b[38;5;124;03mrequests.exceptions\u001b[39;00m\n\u001b[0;32m 3\u001b[0m \u001b[38;5;124;03m~~~~~~~~~~~~~~~~~~~\u001b[39;00m\n\u001b[0;32m 4\u001b[0m \n\u001b[0;32m 5\u001b[0m \u001b[38;5;124;03mThis module contains the set of Requests' exceptions.\u001b[39;00m\n\u001b[0;32m 6\u001b[0m \u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[0;32m 7\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01murllib3\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mexceptions\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m HTTPError \u001b[38;5;28;01mas\u001b[39;00m BaseHTTPError\n\u001b[1;32m----> 9\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mcompat\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m JSONDecodeError \u001b[38;5;28;01mas\u001b[39;00m CompatJSONDecodeError\n\u001b[0;32m 12\u001b[0m \u001b[38;5;28;01mclass\u001b[39;00m \u001b[38;5;21;01mRequestException\u001b[39;00m(\u001b[38;5;167;01mIOError\u001b[39;00m):\n\u001b[0;32m 13\u001b[0m \u001b[38;5;250m \u001b[39m\u001b[38;5;124;03m\"\"\"There was an ambiguous exception that occurred while handling your\u001b[39;00m\n\u001b[0;32m 14\u001b[0m \u001b[38;5;124;03m request.\u001b[39;00m\n\u001b[0;32m 15\u001b[0m \u001b[38;5;124;03m \"\"\"\u001b[39;00m\n",
"File \u001b[1;32mc:\\Python312\\Lib\\site-packages\\requests\\compat.py:13\u001b[0m\n\u001b[0;32m 11\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;21;01mchardet\u001b[39;00m\n\u001b[0;32m 12\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mImportError\u001b[39;00m:\n\u001b[1;32m---> 13\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;21;01mcharset_normalizer\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m \u001b[38;5;21;01mchardet\u001b[39;00m\n\u001b[0;32m 15\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;21;01msys\u001b[39;00m\n\u001b[0;32m 17\u001b[0m \u001b[38;5;66;03m# -------\u001b[39;00m\n\u001b[0;32m 18\u001b[0m \u001b[38;5;66;03m# Pythons\u001b[39;00m\n\u001b[0;32m 19\u001b[0m \u001b[38;5;66;03m# -------\u001b[39;00m\n\u001b[0;32m 20\u001b[0m \n\u001b[0;32m 21\u001b[0m \u001b[38;5;66;03m# Syntax sugar.\u001b[39;00m\n",
"File \u001b[1;32mc:\\Python312\\Lib\\site-packages\\charset_normalizer\\__init__.py:24\u001b[0m\n\u001b[0;32m 2\u001b[0m \u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[0;32m 3\u001b[0m \u001b[38;5;124;03mCharset-Normalizer\u001b[39;00m\n\u001b[0;32m 4\u001b[0m \u001b[38;5;124;03m~~~~~~~~~~~~~~\u001b[39;00m\n\u001b[1;32m (...)\u001b[0m\n\u001b[0;32m 20\u001b[0m \u001b[38;5;124;03m:license: MIT, see LICENSE for more details.\u001b[39;00m\n\u001b[0;32m 21\u001b[0m \u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[0;32m 22\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;21;01mlogging\u001b[39;00m\n\u001b[1;32m---> 24\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mapi\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m from_bytes, from_fp, from_path, normalize\n\u001b[0;32m 25\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mlegacy\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m (\n\u001b[0;32m 26\u001b[0m CharsetDetector,\n\u001b[0;32m 27\u001b[0m CharsetDoctor,\n\u001b[1;32m (...)\u001b[0m\n\u001b[0;32m 30\u001b[0m detect,\n\u001b[0;32m 31\u001b[0m )\n\u001b[0;32m 32\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mmodels\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m CharsetMatch, CharsetMatches\n",
"File \u001b[1;32mc:\\Python312\\Lib\\site-packages\\charset_normalizer\\api.py:7\u001b[0m\n\u001b[0;32m 4\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mos\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mpath\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m basename, splitext\n\u001b[0;32m 5\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mtyping\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m Any, BinaryIO, List, Optional, Set\n\u001b[1;32m----> 7\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mcd\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m (\n\u001b[0;32m 8\u001b[0m coherence_ratio,\n\u001b[0;32m 9\u001b[0m encoding_languages,\n\u001b[0;32m 10\u001b[0m mb_encoding_languages,\n\u001b[0;32m 11\u001b[0m merge_coherence_ratios,\n\u001b[0;32m 12\u001b[0m )\n\u001b[0;32m 13\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mconstant\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m IANA_SUPPORTED, TOO_BIG_SEQUENCE, TOO_SMALL_SEQUENCE, TRACE\n\u001b[0;32m 14\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mmd\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m mess_ratio\n",
"File \u001b[1;32mc:\\Python312\\Lib\\site-packages\\charset_normalizer\\cd.py:9\u001b[0m\n\u001b[0;32m 7\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01massets\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m FREQUENCIES\n\u001b[0;32m 8\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mconstant\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m KO_NAMES, LANGUAGE_SUPPORTED_COUNT, TOO_SMALL_SEQUENCE, ZH_NAMES\n\u001b[1;32m----> 9\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mmd\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m is_suspiciously_successive_range\n\u001b[0;32m 10\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mmodels\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m CoherenceMatches\n\u001b[0;32m 11\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mutils\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m (\n\u001b[0;32m 12\u001b[0m is_accentuated,\n\u001b[0;32m 13\u001b[0m is_latin,\n\u001b[1;32m (...)\u001b[0m\n\u001b[0;32m 16\u001b[0m unicode_range,\n\u001b[0;32m 17\u001b[0m )\n",
"\u001b[1;31mAttributeError\u001b[0m: partially initialized module 'charset_normalizer' has no attribute 'md__mypyc' (most likely due to a circular import)"
]
}
],
"source": [
"from collections import Counter\n",
"import torch\n",
"from torchtext.vocab import vocab\n",
"from tqdm import tqdm\n",
"from ipywidgets import FloatProgress\n",
"\n",
"import pandas as pd\n",
"from nltk.tokenize import word_tokenize\n",
"from unidecode import unidecode"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"True"
]
},
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"torch.cuda.is_available()"
]
},
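{
"cell_type": "markdown",
"metadata": {},
"source": [
"Since CUDA is available, a natural next step is to select the device once and reuse it everywhere. A minimal sketch (the `.to(device)` calls are an assumption about later cells, not something this notebook does yet):\n",
"```python\n",
"import torch\n",
"\n",
"# Prefer the GPU when present, otherwise fall back to the CPU.\n",
"device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n",
"\n",
"# The model and every batch must live on the same device, e.g.:\n",
"# lstm = lstm.to(device)\n",
"# batch_tokens = batch_tokens.to(device)\n",
"```"
]
},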
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"wczytano dane treningowe\n",
"O B-PER O O O O O O O O O B-LOC O O O O O O O O O O O B-LOC O O B-PER I-PER O O O O O O O O O O O O O O O O O O O O O O O B-LOC O O O O O O O O O O O O O B-MISC I-MISC I-MISC I-MISC O O O B-PER O O O O O B-LOC O O O O O O O O O O O O O O O O O B-MISC O O B-LOC O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O B-PER O O O B-PER I-PER O O O O O O O O O O O O O O O O O O O O O O O O B-PER O O O O O O O O B-MISC O O O O O O O O O O O O O O O O O O O O O O O O Rare Hendrix song draft sells for almost $ 17,000 . </S> LONDON 1996-08-22 </S> A rare early handwritten draft of a song by U.S. guitar legend Jimi Hendrix was sold for almost $ 17,000 on Thursday at an auction of some of the late musician 's favourite possessions . </S> A Florida restaurant paid 10,925 pounds ( $ 16,935 ) for the draft of \" Ai n't no telling \" , which Hendrix penned on a piece of London hotel stationery in late 1966 . </S> At the end of a January 1967 concert in the English city of Nottingham he threw the sheet of paper into the audience , where it was retrieved by a fan . </S> Buyers also snapped up 16 other items that were put up for auction by Hendrix 's former girlfriend Kathy Etchingham , who lived with him from 1966 to 1969 . </S> They included a black lacquer and mother of pearl inlaid box used by Hendrix to store his drugs , which an anonymous Australian purchaser bought for 5,060 pounds ( $ 7,845 ) . </S> The guitarist died of a drugs overdose in 1970 aged 27 . </S>\n",
"podzielono dane treningowe na słowa\n",
"['rare', 'hendrix', 'song', 'draft', 'sells', 'for', 'almost', '$', '17,000', '.', '</s>', 'london', '1996-08-22', '</s>', 'a', 'rare', 'early', 'handwritten', 'draft', 'of', 'a', 'song', 'by', 'u.s.', 'guitar', 'legend', 'jimi', 'hendrix', 'was', 'sold', 'for', 'almost', '$', '17,000', 'on', 'thursday', 'at', 'an', 'auction', 'of', 'some', 'of', 'the', 'late', 'musician', \"'s\", 'favourite', 'possessions', '.', '</s>', 'a', 'florida', 'restaurant', 'paid', '10,925', 'pounds', '(', '$', '16,935', ')', 'for', 'the', 'draft', 'of', '\"', 'ai', \"n't\", 'no', 'telling', '\"', ',', 'which', 'hendrix', 'penned', 'on', 'a', 'piece', 'of', 'london', 'hotel', 'stationery', 'in', 'late', '1966', '.', '</s>', 'at', 'the', 'end', 'of', 'a', 'january', '1967', 'concert', 'in', 'the', 'english', 'city', 'of', 'nottingham', 'he', 'threw', 'the', 'sheet', 'of', 'paper', 'into', 'the', 'audience', ',', 'where', 'it', 'was', 'retrieved', 'by', 'a', 'fan', '.', '</s>', 'buyers', 'also', 'snapped', 'up', '16', 'other', 'items', 'that', 'were', 'put', 'up', 'for', 'auction', 'by', 'hendrix', \"'s\", 'former', 'girlfriend', 'kathy', 'etchingham', ',', 'who', 'lived', 'with', 'him', 'from', '1966', 'to', '1969', '.', '</s>', 'they', 'included', 'a', 'black', 'lacquer', 'and', 'mother', 'of', 'pearl', 'inlaid', 'box', 'used', 'by', 'hendrix', 'to', 'store', 'his', 'drugs', ',', 'which', 'an', 'anonymous', 'australian', 'purchaser', 'bought', 'for', '5,060', 'pounds', '(', '$', '7,845', ')', '.', '</s>', 'the', 'guitarist', 'died', 'of', 'a', 'drugs', 'overdose', 'in', '1970', 'aged', '27', '.', '</s>']\n"
]
}
],
"source": [
"# odczytaj dane treningowe\n",
"train = pd.read_csv('train/train.tsv', sep='\\t')\n",
"train.columns = [\"y\", \"x\"]\n",
"print(\"wczytano dane treningowe\")\n",
"print(train[\"y\"][0], train[\"x\"][0])\n",
"\n",
"# podziel dane treningowe na słowa\n",
"# https://www.geeksforgeeks.org/python-word-embedding-using-word2vec/\n",
"slowa_train = []\n",
"for tekst in train[\"x\"]:\n",
" pom = []\n",
" for slowo in tekst.split(\" \"):\n",
" #if slowo not in (\"<\",\"/s\",\">\",\"/S\",\"``\"):\n",
" pom.append(slowo.lower())\n",
" slowa_train.append(pom)\n",
"print(\"podzielono dane treningowe na słowa\")\n",
"print(slowa_train[0])"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"wczytano dane testowe dev-0\n",
"CRICKET - ENGLISH COUNTY CHAMPIONSHIP SCORES . </S> LONDON 1996-08-30 </S> Result and close of play scores in English county championship matches on Friday : </S> Leicester : Leicestershire beat Somerset by an innings and 39 runs . </S> Somerset 83 and 174 ( P. Simmons 4-38 ) , Leicestershire 296 . </S> Leicestershire 22 points , Somerset 4 . </S> Chester-le-Street : Glamorgan 259 and 207 ( A. Dale 69 , H. Morris 69 ; D. Blenkiron 4-43 ) , Durham 114 ( S. Watkin 4-28 ) and 81-3 . </S> Tunbridge Wells : Nottinghamshire 214 ( P. Johnson 84 ; M. McCague 4-55 ) , Kent 108-3 . </S> London ( The Oval ) : Warwickshire 195 , Surrey 429-7 ( C. Lewis 80 not out , M. Butcher 70 , G. Kersey 63 , J. Ratcliffe 63 , D. Bicknell 55 ) . </S> Hove : Sussex 363 ( W. Athey 111 , V. Drakes 52 ; I. Austin 4-37 ) , Lancashire 197-8 ( W. Hegg 54 ) </S> Portsmouth : Middlesex 199 and 426 ( J. Pooley 111 , M. Ramprakash 108 , M. Gatting 83 ) , Hampshire 232 and 109-5 . </S> Chesterfield : Worcestershire 238 and 133-5 , Derbyshire 471 ( J. Adams 123 , T.O'Gorman 109 not out , K. Barnett 87 ; T. Moody 6-82 ) </S> Bristol : Gloucestershire 183 and 185-6 ( J. Russell 56 not out ) , Northamptonshire 190 ( K. Curran 52 ; A. Smith 5-68 ) . </S>\n",
"podzielono dane treningowe na słowa\n",
"['cricket', '-', 'english', 'county', 'championship', 'scores', '.', '</s>', 'london', '1996-08-30', '</s>', 'result', 'and', 'close', 'of', 'play', 'scores', 'in', 'english', 'county', 'championship', 'matches', 'on', 'friday', ':', '</s>', 'leicester', ':', 'leicestershire', 'beat', 'somerset', 'by', 'an', 'innings', 'and', '39', 'runs', '.', '</s>', 'somerset', '83', 'and', '174', '(', 'p.', 'simmons', '4-38', ')', ',', 'leicestershire', '296', '.', '</s>', 'leicestershire', '22', 'points', ',', 'somerset', '4', '.', '</s>', 'chester-le-street', ':', 'glamorgan', '259', 'and', '207', '(', 'a.', 'dale', '69', ',', 'h.', 'morris', '69', ';', 'd.', 'blenkiron', '4-43', ')', ',', 'durham', '114', '(', 's.', 'watkin', '4-28', ')', 'and', '81-3', '.', '</s>', 'tunbridge', 'wells', ':', 'nottinghamshire', '214', '(', 'p.', 'johnson', '84', ';', 'm.', 'mccague', '4-55', ')', ',', 'kent', '108-3', '.', '</s>', 'london', '(', 'the', 'oval', ')', ':', 'warwickshire', '195', ',', 'surrey', '429-7', '(', 'c.', 'lewis', '80', 'not', 'out', ',', 'm.', 'butcher', '70', ',', 'g.', 'kersey', '63', ',', 'j.', 'ratcliffe', '63', ',', 'd.', 'bicknell', '55', ')', '.', '</s>', 'hove', ':', 'sussex', '363', '(', 'w.', 'athey', '111', ',', 'v.', 'drakes', '52', ';', 'i.', 'austin', '4-37', ')', ',', 'lancashire', '197-8', '(', 'w.', 'hegg', '54', ')', '</s>', 'portsmouth', ':', 'middlesex', '199', 'and', '426', '(', 'j.', 'pooley', '111', ',', 'm.', 'ramprakash', '108', ',', 'm.', 'gatting', '83', ')', ',', 'hampshire', '232', 'and', '109-5', '.', '</s>', 'chesterfield', ':', 'worcestershire', '238', 'and', '133-5', ',', 'derbyshire', '471', '(', 'j.', 'adams', '123', ',', \"t.o'gorman\", '109', 'not', 'out', ',', 'k.', 'barnett', '87', ';', 't.', 'moody', '6-82', ')', '</s>', 'bristol', ':', 'gloucestershire', '183', 'and', '185-6', '(', 'j.', 'russell', '56', 'not', 'out', ')', ',', 'northamptonshire', '190', '(', 'k.', 'curran', '52', ';', 'a.', 'smith', '5-68', ')', '.', '</s>']\n"
]
}
],
"source": [
"# odczytaj dane testowe dev-0\n",
"test_dev0 = pd.read_csv('dev-0/in.tsv', sep='\\t')\n",
"test_dev0.columns = [\"x\"]\n",
"print(\"wczytano dane testowe dev-0\")\n",
"print(test_dev0[\"x\"][0])\n",
"\n",
"# podziel dane testowe na słowa\n",
"# https://www.geeksforgeeks.org/python-word-embedding-using-word2vec/\n",
"slowa_test_dev0 = []\n",
"for tekst in test_dev0[\"x\"]:\n",
" pom = []\n",
" for slowo in tekst.split(\" \"):\n",
" #if slowo not in (\"<\",\"/s\",\">\",\"/S\",\"``\"):\n",
" pom.append(slowo.lower())\n",
" slowa_test_dev0.append(pom)\n",
"print(\"podzielono dane treningowe na słowa\")\n",
"print(slowa_test_dev0[0])"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"wczytano dane testowe A\n",
"RUGBY UNION - CUTTITTA BACK FOR ITALY AFTER A YEAR . </S> ROME 1996-12-06 </S> Italy recalled Marcello Cuttitta </S> on Friday for their friendly against Scotland at Murrayfield more than a year after the 30-year-old wing announced he was retiring following differences over selection . </S> Cuttitta , who trainer George Coste said was certain to play on Saturday week , was named in a 21-man squad lacking only two of the team beaten 54-21 by England at Twickenham last month . </S> Stefano Bordon is out through illness and Coste said he had dropped back row Corrado Covi , who had been recalled for the England game after five years out of the national team . </S> Cuttitta announced his retirement after the 1995 World Cup , where he took issue with being dropped from the Italy side that faced England in the pool stages . </S> Coste said he had approached the player two months ago about a comeback . </S> \" He ended the World Cup on the wrong note , \" Coste said . </S> \" I thought it would be useful to have him back and he said he would be available . </S> I think now is the right time for him to return . \" </S> Squad : Javier Pertile , Paolo Vaccari , Marcello Cuttitta , Ivan Francescato , Leandro Manteri , Diego Dominguez , Francesco Mazzariol , Alessandro Troncon , Orazio Arancio , Andrea Sgorlon , Massimo Giovanelli , Carlo Checchinato , Walter Cristofoletto , Franco Properzi Curti , Carlo Orlandi , Massimo Cuttitta , Giambatista Croci , Gianluca Guidi , Nicola Mazzucato , Alessandro Moscardi , Andrea Castellani . </S>\n",
"podzielono dane treningowe na słowa\n",
"['rugby', 'union', '-', 'cuttitta', 'back', 'for', 'italy', 'after', 'a', 'year', '.', '</s>', 'rome', '1996-12-06', '</s>', 'italy', 'recalled', 'marcello', 'cuttitta', '</s>', 'on', 'friday', 'for', 'their', 'friendly', 'against', 'scotland', 'at', 'murrayfield', 'more', 'than', 'a', 'year', 'after', 'the', '30-year-old', 'wing', 'announced', 'he', 'was', 'retiring', 'following', 'differences', 'over', 'selection', '.', '</s>', 'cuttitta', ',', 'who', 'trainer', 'george', 'coste', 'said', 'was', 'certain', 'to', 'play', 'on', 'saturday', 'week', ',', 'was', 'named', 'in', 'a', '21-man', 'squad', 'lacking', 'only', 'two', 'of', 'the', 'team', 'beaten', '54-21', 'by', 'england', 'at', 'twickenham', 'last', 'month', '.', '</s>', 'stefano', 'bordon', 'is', 'out', 'through', 'illness', 'and', 'coste', 'said', 'he', 'had', 'dropped', 'back', 'row', 'corrado', 'covi', ',', 'who', 'had', 'been', 'recalled', 'for', 'the', 'england', 'game', 'after', 'five', 'years', 'out', 'of', 'the', 'national', 'team', '.', '</s>', 'cuttitta', 'announced', 'his', 'retirement', 'after', 'the', '1995', 'world', 'cup', ',', 'where', 'he', 'took', 'issue', 'with', 'being', 'dropped', 'from', 'the', 'italy', 'side', 'that', 'faced', 'england', 'in', 'the', 'pool', 'stages', '.', '</s>', 'coste', 'said', 'he', 'had', 'approached', 'the', 'player', 'two', 'months', 'ago', 'about', 'a', 'comeback', '.', '</s>', '\"', 'he', 'ended', 'the', 'world', 'cup', 'on', 'the', 'wrong', 'note', ',', '\"', 'coste', 'said', '.', '</s>', '\"', 'i', 'thought', 'it', 'would', 'be', 'useful', 'to', 'have', 'him', 'back', 'and', 'he', 'said', 'he', 'would', 'be', 'available', '.', '</s>', 'i', 'think', 'now', 'is', 'the', 'right', 'time', 'for', 'him', 'to', 'return', '.', '\"', '</s>', 'squad', ':', 'javier', 'pertile', ',', 'paolo', 'vaccari', ',', 'marcello', 'cuttitta', ',', 'ivan', 'francescato', ',', 'leandro', 'manteri', ',', 'diego', 'dominguez', ',', 'francesco', 'mazzariol', ',', 'alessandro', 'troncon', ',', 'orazio', 'arancio', ',', 'andrea', 'sgorlon', ',', 'massimo', 'giovanelli', ',', 'carlo', 'checchinato', ',', 'walter', 'cristofoletto', ',', 'franco', 'properzi', 'curti', ',', 'carlo', 'orlandi', ',', 'massimo', 'cuttitta', ',', 'giambatista', 'croci', ',', 'gianluca', 'guidi', ',', 'nicola', 'mazzucato', ',', 'alessandro', 'moscardi', ',', 'andrea', 'castellani', '.', '</s>']\n"
]
}
],
"source": [
"# odczytaj dane testowe A\n",
"test_A = pd.read_csv('test-A/in.tsv', sep='\\t')\n",
"test_A.columns = [\"x\"]\n",
"print(\"wczytano dane testowe A\")\n",
"print(test_A[\"x\"][0])\n",
"\n",
"# podziel dane testowe na słowa\n",
"# https://www.geeksforgeeks.org/python-word-embedding-using-word2vec/\n",
"slowa_test_A = []\n",
"for tekst in test_A[\"x\"]:\n",
" pom = []\n",
" for slowo in tekst.split(\" \"):\n",
" #if slowo not in (\"<\",\"/s\",\">\",\"/S\",\"``\"):\n",
" pom.append(slowo.lower())\n",
" slowa_test_A.append(pom)\n",
"print(\"podzielono dane treningowe na słowa\")\n",
"print(slowa_test_A[0])"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def build_vocab(dataset):\n",
" counter = Counter()\n",
" for document in dataset:\n",
" counter.update(document)\n",
" return vocab(counter, specials=[\"<unk>\", \"<pad>\", \"<bos>\", \"<eos>\"])"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"20998\n",
"['<unk>', '<pad>', '<bos>', '<eos>', 'rare', 'hendrix', 'song', 'draft', 'sells', 'for', 'almost', '$', '17,000', '.', '</s>', 'london', '1996-08-22', 'a', 'early', 'handwritten', 'of', 'by', 'u.s.', 'guitar', 'legend', 'jimi', 'was', 'sold', 'on', 'thursday', 'at', 'an', 'auction', 'some', 'the', 'late', 'musician', \"'s\", 'favourite', 'possessions', 'florida', 'restaurant', 'paid', '10,925', 'pounds', '(', '16,935', ')', '\"', 'ai', \"n't\", 'no', 'telling', ',', 'which', 'penned', 'piece', 'hotel', 'stationery', 'in', '1966', 'end', 'january', '1967', 'concert', 'english', 'city', 'nottingham', 'he', 'threw', 'sheet', 'paper', 'into', 'audience', 'where', 'it', 'retrieved', 'fan', 'buyers', 'also', 'snapped', 'up', '16', 'other', 'items', 'that', 'were', 'put', 'former', 'girlfriend', 'kathy', 'etchingham', 'who', 'lived', 'with', 'him', 'from', 'to', '1969', 'they', 'included', 'black', 'lacquer', 'and', 'mother', 'pearl', 'inlaid', 'box', 'used', 'store', 'his', 'drugs', 'anonymous', 'australian', 'purchaser', 'bought', '5,060', '7,845', 'guitarist', 'died', 'overdose', '1970', 'aged', '27', 'china', 'says', 'taiwan', 'spoils', 'atmosphere', 'talks', 'beijing', 'accused', 'taipei', 'spoiling', 'resumption', 'across', 'strait', 'visit', 'ukraine', 'taiwanese', 'vice', 'president', 'lien', 'chan', 'this', 'week', 'infuriated', 'speaking', 'only', 'hours', 'after', 'chinese', 'state', 'media', 'said', 'time', 'right', 'engage', 'political', 'foreign', 'ministry', 'spokesman', 'shen', 'guofang', 'told', 'reuters', ':', 'necessary', 'opening', 'has', 'been', 'disrupted', 'authorities', 'quoted', 'top', 'negotiator', 'tang', 'shubei', 'as', 'visiting', 'group', 'wednesday', 'rivals', 'hold', 'now', 'is', 'two', 'sides', '...', 'hostility', 'overseas', 'edition', 'people', 'daily', 'saying', 'television', 'interview', 'had', 'read', 'reports', 'comments', 'but', 'gave', 'details', 'why', 'considered', 'considers', 'renegade', 'province', 'long', 'opposed', 'all', 'efforts', 'gain', 'greater', 'international', 'recognition', 'rival', 'island', 'should', 'take', 'practical', 'steps', 'towards', 'goal', 'consultations', 'be', 'held', 'set', 'format', 'official', 'xinhua', 'news', 'agency', 'executive', 'chairman', 'association', 'relations', 'straits', 'german', 'july', 'car', 'registrations', '14.2', 'pct', 'yr', '/', 'frankfurt', 'first-time', 'motor', 'vehicles', 'jumped', 'percent', 'year', 'year-earlier', 'period', 'federal', 'office', '356,725', 'new', 'cars', 'registered', '1996', '--', '304,850', 'passenger', '15,613', 'trucks', 'figures', 'represent', '13.6', 'increase', '2.2', 'decline', '1995', 'motor-bike', 'registration', 'rose', '32.7', 'growth', 'partly', 'due', 'increased', 'number', 'germans', 'buying', 'abroad', 'while', 'manufacturers', 'domestic', 'demand', 'weak', 'posted', 'gains', 'numbers', 'volkswagen', 'ag', 'won', '77,719', 'slightly', 'more', 'than', 'quarter', 'total', 'opel', 'together', 'general', 'motors', 'came', 'second', 'place', '49,269', '16.4', 'overall', 'figure', 'third', 'ford', '35,563', 'or', '11.7', 'seat', 'porsche', 'fewer', 'compared', 'last', '3,420', '5522', 'earlier', 'fell', '554', '643', 'greek', 'socialists', 'give', 'green', 'light', 'pm', 'elections', 'athens', 'socialist', 'party', 'bureau', 'prime', 'minister', 'costas', 'simitis', 'call', 'snap', 'its', 'secretary', 'skandalidis', 'reporters', 'going', 'make', 'announcement', 'cabinet', 'meeting', 'later', 'dimitris', 'kontogiannis', 'newsroom', '+301', '3311812-4', 'bayervb', 'sets', 
'c$', '100', 'million', 'six-year', 'bond', 'following', 'announced', 'lead', 'manager', 'toronto', 'dominion', 'borrower', 'bayerische', 'vereinsbank', 'amt', 'mln', 'coupon', '6.625', 'maturity', '24.sep.02', 'type', 'straight', 'iss', 'price', '100.92', 'pay', 'date', '24.sep.96', 'full', 'fees', '1.875', 'reoffer', '99.32', 'spread', '+20', 'bp', 'moody', 'aa1', 'listing', 'lux', 'freq', '=', 's&p', 'denoms', 'k', '1-10-100', 'sale', 'limits', 'us', 'uk', 'ca', 'neg', 'plg', 'crs', 'deflt', 'force', 'maj', 'gov', 'law', 'home
]
}
],
"source": [
"v = build_vocab(slowa_train)\n",
"v.set_default_index(v[\"<unk>\"])\n",
"itos = v.get_itos() # mapowanie indeksów na tokeny\n",
"print(len(itos)) # liczba różnych tokenów w słowniku\n",
"print(itos)"
]
},
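{
"cell_type": "markdown",
"metadata": {},
"source": [
"A quick sanity check of how the vocabulary behaves (a sketch; the concrete indices follow from the printout above): known tokens map to their index, and thanks to `set_default_index` every out-of-vocabulary token falls back to `<unk>`:\n",
"```python\n",
"print(v[\"<unk>\"])         # 0 - the first special token\n",
"print(v[\"hendrix\"])       # 5 - a token seen in the training data\n",
"print(v[\"xyz-not-seen\"])  # 0 - OOV, mapped to the <unk> index\n",
"```"
]
},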
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"{'O': 0, 'B-PER': 1, 'B-LOC': 2, 'I-PER': 3, 'B-MISC': 4, 'I-MISC': 5, 'I-LOC': 6, 'B-ORG': 7, 'I-ORG': 8}\n"
]
}
],
"source": [
"# slownik etykiety - kody etykiet\n",
"etykieta_na_kod = {}\n",
"licznik = 0\n",
"for tekst in train[\"y\"]:\n",
" for etykieta in tekst.split(\" \"):\n",
" if etykieta not in etykieta_na_kod:\n",
" etykieta_na_kod[etykieta] = licznik\n",
" licznik+=1\n",
"print(etykieta_na_kod)"
]
},
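{
"cell_type": "markdown",
"metadata": {},
"source": [
"The reverse mapping, from codes back to tag strings, is what writing predictions out will need. A minimal sketch (the name `kod_na_etykiete` is an assumption, not defined in this notebook):\n",
"```python\n",
"kod_na_etykiete = {kod: etykieta for etykieta, kod in etykieta_na_kod.items()}\n",
"print(kod_na_etykiete[1])  # 'B-PER'\n",
"```"
]
},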
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 1, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 5, 5, 5, 0, 0, 0, 1, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]\n"
]
}
],
"source": [
"# podziel etykiety\n",
"kody_etykiet_train = []\n",
"for tekst in train[\"y\"]:\n",
" pom = []\n",
" for etykieta in tekst.split(\" \"):\n",
" pom.append(etykieta_na_kod[etykieta])\n",
" kody_etykiet_train.append(pom)\n",
"print(kody_etykiet_train[0])"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"O O B-MISC I-MISC I-MISC O O O B-LOC O O O O O O O O O B-MISC O O O O O O O B-LOC O B-ORG O B-ORG O O O O O O O O B-ORG O O O O B-PER I-PER O O O B-ORG O O O B-ORG O O O B-ORG O O O B-LOC O B-ORG O O O O B-PER I-PER O O B-PER I-PER O O B-PER I-PER O O O B-ORG O O B-PER I-PER O O O O O O B-LOC I-LOC O B-ORG O O B-PER I-PER O O B-PER I-PER O O O B-ORG O O O B-LOC O B-LOC I-LOC O O B-ORG O O B-ORG O O B-PER I-PER O O O O B-PER I-PER O O B-PER I-PER O O B-PER I-PER O O B-PER I-PER O O O O B-LOC O B-ORG O O B-PER I-PER O O B-PER I-PER O O B-PER I-PER O O O B-ORG O O B-PER I-PER O O O B-LOC O B-ORG O O O O B-PER I-PER O O B-PER I-PER O O B-PER I-PER O O O B-ORG O O O O O B-LOC O B-ORG O O O O B-ORG O O B-PER I-PER O O B-PER O O O O B-PER I-PER O O B-PER I-PER O O O B-LOC O B-ORG O O O O B-PER I-PER O O O O O B-ORG O O B-PER I-PER O O B-PER I-PER O O O O\n",
"[0, 0, 4, 5, 5, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0, 2, 0, 7, 0, 7, 0, 0, 0, 0, 0, 0, 0, 0, 7, 0, 0, 0, 0, 1, 3, 0, 0, 0, 7, 0, 0, 0, 7, 0, 0, 0, 7, 0, 0, 0, 2, 0, 7, 0, 0, 0, 0, 1, 3, 0, 0, 1, 3, 0, 0, 1, 3, 0, 0, 0, 7, 0, 0, 1, 3, 0, 0, 0, 0, 0, 0, 2, 6, 0, 7, 0, 0, 1, 3, 0, 0, 1, 3, 0, 0, 0, 7, 0, 0, 0, 2, 0, 2, 6, 0, 0, 7, 0, 0, 7, 0, 0, 1, 3, 0, 0, 0, 0, 1, 3, 0, 0, 1, 3, 0, 0, 1, 3, 0, 0, 1, 3, 0, 0, 0, 0, 2, 0, 7, 0, 0, 1, 3, 0, 0, 1, 3, 0, 0, 1, 3, 0, 0, 0, 7, 0, 0, 1, 3, 0, 0, 0, 2, 0, 7, 0, 0, 0, 0, 1, 3, 0, 0, 1, 3, 0, 0, 1, 3, 0, 0, 0, 7, 0, 0, 0, 0, 0, 2, 0, 7, 0, 0, 0, 0, 7, 0, 0, 1, 3, 0, 0, 1, 0, 0, 0, 0, 1, 3, 0, 0, 1, 3, 0, 0, 0, 2, 0, 7, 0, 0, 0, 0, 1, 3, 0, 0, 0, 0, 0, 7, 0, 0, 1, 3, 0, 0, 1, 3, 0, 0, 0, 0]\n"
]
}
],
"source": [
"# odczytaj etykiety dev-0\n",
"labels_dev0 = pd.read_csv('dev-0/expected.tsv', sep='\\t')\n",
"labels_dev0.columns = [\"y\"]\n",
"print(labels_dev0[\"y\"][0])\n",
"\n",
"# podziel etykiety\n",
"kody_etykiet_dev0 = []\n",
"for tekst in labels_dev0[\"y\"]:\n",
" pom = []\n",
" for etykieta in tekst.split(\" \"):\n",
" pom.append(etykieta_na_kod[etykieta])\n",
" kody_etykiet_dev0.append(pom)\n",
"print(kody_etykiet_dev0[0])"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def data_process(dt):\n",
" # Wektoryzacja dokumentów tekstowych.\n",
" return [\n",
" torch.tensor(\n",
" [v[\"<bos>\"]] + [v[token] for token in document] + [v[\"<eos>\"]],\n",
" dtype=torch.long,\n",
" )\n",
" for document in dt\n",
" ]\n",
"\n",
"def labels_process(dt):\n",
" # Wektoryzacja etykiet (NER)\n",
" return [torch.tensor([0] + document + [0], dtype=torch.long) for document in dt]"
]
},
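{
"cell_type": "markdown",
"metadata": {},
"source": [
"A minimal illustration of the two helpers on a toy document (hypothetical input; the token indices follow from the vocabulary above): `data_process` wraps the tokens in `<bos>`/`<eos>`, and `labels_process` pads the label sequence with 0 (the code of `O`) so both tensors stay the same length:\n",
"```python\n",
"doc = [\"rare\", \"hendrix\", \"song\"]\n",
"tags = [0, 1, 0]  # O, B-PER, O\n",
"\n",
"print(data_process([doc])[0])     # tensor([2, 4, 5, 6, 3]) - <bos> ... <eos>\n",
"print(labels_process([tags])[0])  # tensor([0, 0, 1, 0, 0])\n",
"```"
]
},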
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"train_tokens_ids = data_process(slowa_train)\n",
"test_dev0_tokens_ids = data_process(slowa_test_dev0)\n",
"test_A_tokens_ids = data_process(slowa_test_A)\n",
"\n",
"train_labels = labels_process(kody_etykiet_train)\n",
"test_dev0_labels = labels_process(kody_etykiet_dev0)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"944 199\n",
"214 256\n",
"229 283\n",
"tensor([ 2, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,\n",
" 14, 17, 4, 18, 19, 7, 20, 17, 6, 21, 22, 23, 24, 25,\n",
" 5, 26, 27, 9, 10, 11, 12, 28, 29, 30, 31, 32, 20, 33,\n",
" 20, 34, 35, 36, 37, 38, 39, 13, 14, 17, 40, 41, 42, 43,\n",
" 44, 45, 11, 46, 47, 9, 34, 7, 20, 48, 49, 50, 51, 52,\n",
" 48, 53, 54, 5, 55, 28, 17, 56, 20, 15, 57, 58, 59, 35,\n",
" 60, 13, 14, 30, 34, 61, 20, 17, 62, 63, 64, 59, 34, 65,\n",
" 66, 20, 67, 68, 69, 34, 70, 20, 71, 72, 34, 73, 53, 74,\n",
" 75, 26, 76, 21, 17, 77, 13, 14, 78, 79, 80, 81, 82, 83,\n",
" 84, 85, 86, 87, 81, 9, 32, 21, 5, 37, 88, 89, 90, 91,\n",
" 53, 92, 93, 94, 95, 96, 60, 97, 98, 13, 14, 99, 100, 17,\n",
" 101, 102, 103, 104, 20, 105, 106, 107, 108, 21, 5, 97, 109, 110,\n",
" 111, 53, 54, 31, 112, 113, 114, 115, 9, 116, 44, 45, 11, 117,\n",
" 47, 13, 14, 34, 118, 119, 20, 17, 111, 120, 59, 121, 122, 123,\n",
" 13, 14, 3])\n",
"tensor([ 2, 1949, 459, 65, 1950, 1951, 1592, 13, 14, 15,\n",
" 19342, 14, 1793, 103, 1465, 20, 1952, 1592, 59, 65,\n",
" 1950, 1951, 1954, 28, 947, 166, 14, 1992, 166, 1993,\n",
" 1703, 1965, 21, 31, 2038, 103, 3671, 2932, 13, 14,\n",
" 1965, 6226, 103, 16331, 45, 1995, 1996, 0, 47, 53,\n",
" 1993, 0, 13, 14, 1993, 1055, 1330, 53, 1965, 1864,\n",
" 13, 14, 17021, 166, 1991, 19322, 103, 14088, 45, 1977,\n",
" 0, 1620, 53, 10801, 12466, 1620, 1962, 1958, 0, 0,\n",
" 47, 53, 1956, 19326, 45, 1960, 19327, 19328, 47, 103,\n",
" 16667, 13, 14, 19313, 1363, 166, 2012, 0, 45, 1995,\n",
" 2752, 5725, 1962, 1967, 0, 0, 47, 53, 1985, 0,\n",
" 13, 14, 15, 45, 34, 2037, 47, 166, 2020, 14779,\n",
" 53, 2018, 0, 45, 2030, 2059, 5455, 620, 618, 53,\n",
" 1967, 0, 1602, 53, 1963, 0, 1976, 53, 1974, 0,\n",
" 1976, 53, 1958, 0, 3843, 47, 13, 14, 19318, 166,\n",
" 2002, 16329, 45, 2024, 19320, 9379, 53, 2007, 2008, 1979,\n",
" 1962, 2061, 10865, 0, 47, 53, 2034, 0, 45, 2024,\n",
" 0, 2054, 47, 14, 6206, 166, 12584, 11568, 103, 11269,\n",
" 45, 1974, 19334, 9379, 53, 1967, 19335, 1997, 53, 1967,\n",
" 17052, 6226, 47, 53, 2000, 15584, 103, 0, 13, 14,\n",
" 9493, 166, 2026, 10970, 103, 0, 53, 9314, 0, 45,\n",
" 1974, 2717, 0, 53, 0, 6237, 620, 618, 53, 6223,\n",
" 19332, 11058, 1962, 6227, 401, 0, 47, 14, 9488, 166,\n",
" 1972, 19340, 103, 0, 45, 1974, 1975, 4451, 620, 618,\n",
" 47, 53, 2010, 14739, 45, 6223, 6224, 1979, 1962, 1977,\n",
" 4839, 1981, 47, 13, 14, 3])\n",
"tensor([ 2, 6342, 769, 459, 0, 960, 9, 1681, 150, 17,\n",
" 253, 13, 14, 5474, 0, 14, 1681, 3063, 0, 0,\n",
" 14, 28, 947, 9, 701, 7189, 572, 2124, 30, 0,\n",
" 300, 301, 17, 253, 150, 34, 14863, 6363, 371, 68,\n",
" 26, 3333, 370, 3631, 608, 11618, 13, 14, 0, 53,\n",
" 92, 1738, 1753, 0, 154, 26, 3388, 97, 1952, 28,\n",
" 3978, 145, 53, 26, 2116, 59, 17, 0, 2099, 14403,\n",
" 148, 186, 20, 34, 695, 2519, 0, 21, 1208, 30,\n",
" 0, 324, 729, 13, 14, 2725, 0, 185, 618, 863,\n",
" 1521, 103, 0, 154, 68, 197, 2954, 960, 2955, 0,\n",
" 0, 53, 92, 197, 170, 3063, 9, 34, 1208, 2154,\n",
" 150, 1824, 1053, 618, 20, 34, 457, 695, 13, 14,\n",
" 0, 371, 110, 12530, 150, 34, 274, 1593, 1711, 53,\n",
" 74, 68, 596, 452, 94, 1458, 2954, 96, 34, 1681,\n",
" 749, 85, 2517, 1208, 59, 34, 8797, 9174, 13, 14,\n",
" 0, 154, 68, 197, 4705, 34, 6392, 186, 836, 2521,\n",
" 700, 17, 17097, 13, 14, 48, 68, 1240, 34, 1593,\n",
" 1711, 28, 34, 4645, 3370, 53, 48, 0, 154, 13,\n",
" 14, 48, 1500, 1798, 75, 693, 226, 5612, 97, 606,\n",
" 95, 960, 103, 68, 154, 68, 693, 226, 1221, 13,\n",
" 14, 1500, 4604, 184, 185, 34, 156, 155, 9, 95,\n",
" 97, 671, 13, 48, 14, 2099, 166, 2718, 0, 53,\n",
" 0, 16807, 53, 0, 0, 53, 2886, 0, 53, 0,\n",
" 0, 53, 2854, 0, 53, 10959, 0, 53, 11542, 0,\n",
" 53, 0, 0, 53, 2219, 0, 53, 0, 0, 53,\n",
" 4036, 0, 53, 17118, 0, 53, 11460, 0, 0, 53,\n",
" 4036, 0, 53, 0, 0, 53, 0, 0, 53, 9462,\n",
" 0, 53, 13541, 0, 53, 11542, 0, 53, 2219, 0,\n",
" 13, 14, 3])\n"
]
}
],
"source": [
"print(len(train_tokens_ids), len(train_tokens_ids[0]))\n",
"print(len(test_dev0_tokens_ids), len(test_dev0_tokens_ids[0]))\n",
"print(len(test_A_tokens_ids), len(test_A_tokens_ids[0]))\n",
"\n",
"print(train_tokens_ids[0])\n",
"print(test_dev0_tokens_ids[0])\n",
"print(test_A_tokens_ids[0])"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"944 199\n",
"214 256\n",
"tensor([0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
" 2, 0, 0, 1, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
" 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 5, 5, 5, 0, 0,\n",
" 0, 1, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
" 0, 4, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
" 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 3, 0, 0, 0, 0,\n",
" 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,\n",
" 0, 0, 0, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
" 0, 0, 0, 0, 0, 0, 0])\n",
"tensor([0, 0, 0, 4, 5, 5, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 0, 0, 0, 0,\n",
" 0, 0, 0, 2, 0, 7, 0, 7, 0, 0, 0, 0, 0, 0, 0, 0, 7, 0, 0, 0, 0, 1, 3, 0,\n",
" 0, 0, 7, 0, 0, 0, 7, 0, 0, 0, 7, 0, 0, 0, 2, 0, 7, 0, 0, 0, 0, 1, 3, 0,\n",
" 0, 1, 3, 0, 0, 1, 3, 0, 0, 0, 7, 0, 0, 1, 3, 0, 0, 0, 0, 0, 0, 2, 6, 0,\n",
" 7, 0, 0, 1, 3, 0, 0, 1, 3, 0, 0, 0, 7, 0, 0, 0, 2, 0, 2, 6, 0, 0, 7, 0,\n",
" 0, 7, 0, 0, 1, 3, 0, 0, 0, 0, 1, 3, 0, 0, 1, 3, 0, 0, 1, 3, 0, 0, 1, 3,\n",
" 0, 0, 0, 0, 2, 0, 7, 0, 0, 1, 3, 0, 0, 1, 3, 0, 0, 1, 3, 0, 0, 0, 7, 0,\n",
" 0, 1, 3, 0, 0, 0, 2, 0, 7, 0, 0, 0, 0, 1, 3, 0, 0, 1, 3, 0, 0, 1, 3, 0,\n",
" 0, 0, 7, 0, 0, 0, 0, 0, 2, 0, 7, 0, 0, 0, 0, 7, 0, 0, 1, 3, 0, 0, 1, 0,\n",
" 0, 0, 0, 1, 3, 0, 0, 1, 3, 0, 0, 0, 2, 0, 7, 0, 0, 0, 0, 1, 3, 0, 0, 0,\n",
" 0, 0, 7, 0, 0, 1, 3, 0, 0, 1, 3, 0, 0, 0, 0, 0])\n"
]
}
],
"source": [
"print(len(train_labels), len(train_labels[0]))\n",
"print(len(test_dev0_labels), len(test_dev0_labels[0]))\n",
"\n",
"print(train_labels[0])\n",
"print(test_dev0_labels[0])"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def get_scores(y_true, y_pred):\n",
" # Funkcja zwraca precyzję, pokrycie i F1\n",
" acc_score = 0\n",
" tp = 0\n",
" fp = 0\n",
" selected_items = 0\n",
" relevant_items = 0\n",
"\n",
" for p, t in zip(y_pred, y_true):\n",
" if p == t:\n",
" acc_score += 1\n",
"\n",
" if p > 0 and p == t:\n",
" tp += 1\n",
"\n",
" if p > 0:\n",
" selected_items += 1\n",
"\n",
" if t > 0:\n",
" relevant_items += 1\n",
"\n",
" if selected_items == 0:\n",
" precision = 1.0\n",
" else:\n",
" precision = tp / selected_items\n",
"\n",
" if relevant_items == 0:\n",
" recall = 1.0\n",
" else:\n",
" recall = tp / relevant_items\n",
"\n",
" if precision + recall == 0.0:\n",
" f1 = 0.0\n",
" else:\n",
" f1 = 2 * precision * recall / (precision + recall)\n",
"\n",
" return precision, recall, f1"
]
},
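{
"cell_type": "markdown",
"metadata": {},
"source": [
"A toy check of `get_scores` (hypothetical label codes): three tokens are predicted as entities, two of them correctly, and four gold tokens are entities, so precision is 2/3, recall 2/4 and F1 their harmonic mean:\n",
"```python\n",
"y_true = [0, 1, 1, 0, 3, 0, 2]\n",
"y_pred = [0, 1, 0, 0, 3, 4, 0]\n",
"\n",
"precision, recall, f1 = get_scores(y_true, y_pred)\n",
"print(precision, recall, f1)  # 0.666... 0.5 0.571...\n",
"```"
]
},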
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"num_tags = len(etykieta_na_kod.keys())\n",
"\n",
"class LSTM(torch.nn.Module):\n",
"\n",
" def __init__(self):\n",
" super(LSTM, self).__init__()\n",
" self.emb = torch.nn.Embedding(len(v.get_itos()), 100)\n",
" self.rec = torch.nn.LSTM(100, 256, 1, batch_first=True)\n",
" self.fc1 = torch.nn.Linear(256, num_tags)\n",
"\n",
" def forward(self, x):\n",
" emb = torch.relu(self.emb(x))\n",
" lstm_output, (h_n, c_n) = self.rec(emb)\n",
" out_weights = self.fc1(lstm_output)\n",
" return out_weights"
]
},
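{
"cell_type": "markdown",
"metadata": {},
"source": [
"Tensor shapes through the model, traced on a dummy batch (a sketch): the embedding turns `(batch, seq)` token ids into `(batch, seq, 100)` vectors, the LSTM maps those to `(batch, seq, 256)` hidden states, and the linear head produces `(batch, seq, num_tags)` scores, one vector per token:\n",
"```python\n",
"model = LSTM()\n",
"x = torch.randint(0, len(v), (1, 12))  # dummy batch: 1 document, 12 tokens\n",
"print(model(x).shape)  # torch.Size([1, 12, 9]) - num_tags scores per token\n",
"```"
]
},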
{
"cell_type": "code",
2024-05-25 17:47:40 +02:00
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def eval_model(dataset_tokens, dataset_labels, model):\n",
" Y_true = []\n",
" Y_pred = []\n",
" for i in tqdm(range(len(dataset_labels))):\n",
" batch_tokens = dataset_tokens[i].unsqueeze(0)\n",
" tags = list(dataset_labels[i].numpy())\n",
" Y_true += tags\n",
"\n",
" Y_batch_pred_weights = model(batch_tokens).squeeze(0)\n",
" Y_batch_pred = torch.argmax(Y_batch_pred_weights, 1)\n",
" Y_pred += list(Y_batch_pred.numpy())\n",
"\n",
" return get_scores(Y_true, Y_pred)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"lstm = LSTM()\n",
"criterion = torch.nn.CrossEntropyLoss()\n",
"optimizer = torch.optim.Adam(lstm.parameters())\n",
"NUM_EPOCHS = 5"
]
},
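{
"cell_type": "markdown",
"metadata": {},
"source": [
"The loop below trains on one sentence per step, so `CrossEntropyLoss` sees logits of shape `(seq_len, num_tags)` against a target vector of shape `(seq_len,)`. A minimal sketch with made-up tensors, just to illustrate the expected shapes:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Illustration only: random logits and targets for a 7-token sentence\n",
"example_logits = torch.randn(7, num_tags)\n",
"example_targets = torch.randint(0, num_tags, (7,))\n",
"print(criterion(example_logits, example_targets))"
]
},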
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
" 88%|████████▊ | 832/944 [00:37<00:05, 21.94it/s]\n"
]
},
{
"ename": "KeyboardInterrupt",
"evalue": "",
"output_type": "error",
"traceback": [
"\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[1;31mKeyboardInterrupt\u001b[0m Traceback (most recent call last)",
"Cell \u001b[1;32mIn[23], line 13\u001b[0m\n\u001b[0;32m 10\u001b[0m optimizer\u001b[38;5;241m.\u001b[39mzero_grad()\n\u001b[0;32m 11\u001b[0m loss \u001b[38;5;241m=\u001b[39m criterion(predicted_tags\u001b[38;5;241m.\u001b[39msqueeze(\u001b[38;5;241m0\u001b[39m), tags\u001b[38;5;241m.\u001b[39msqueeze(\u001b[38;5;241m1\u001b[39m))\n\u001b[1;32m---> 13\u001b[0m \u001b[43mloss\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mbackward\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 14\u001b[0m optimizer\u001b[38;5;241m.\u001b[39mstep()\n\u001b[0;32m 16\u001b[0m lstm\u001b[38;5;241m.\u001b[39meval()\n",
"File \u001b[1;32mc:\\Users\\Dominik\\Desktop\\Studia\\11.sem1\\en-ner-conll-2003\\.venv\\Lib\\site-packages\\torch\\_tensor.py:525\u001b[0m, in \u001b[0;36mTensor.backward\u001b[1;34m(self, gradient, retain_graph, create_graph, inputs)\u001b[0m\n\u001b[0;32m 515\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m has_torch_function_unary(\u001b[38;5;28mself\u001b[39m):\n\u001b[0;32m 516\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m handle_torch_function(\n\u001b[0;32m 517\u001b[0m Tensor\u001b[38;5;241m.\u001b[39mbackward,\n\u001b[0;32m 518\u001b[0m (\u001b[38;5;28mself\u001b[39m,),\n\u001b[1;32m (...)\u001b[0m\n\u001b[0;32m 523\u001b[0m inputs\u001b[38;5;241m=\u001b[39minputs,\n\u001b[0;32m 524\u001b[0m )\n\u001b[1;32m--> 525\u001b[0m \u001b[43mtorch\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mautograd\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mbackward\u001b[49m\u001b[43m(\u001b[49m\n\u001b[0;32m 526\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mgradient\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mretain_graph\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mcreate_graph\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43minputs\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43minputs\u001b[49m\n\u001b[0;32m 527\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n",
"File \u001b[1;32mc:\\Users\\Dominik\\Desktop\\Studia\\11.sem1\\en-ner-conll-2003\\.venv\\Lib\\site-packages\\torch\\autograd\\__init__.py:267\u001b[0m, in \u001b[0;36mbackward\u001b[1;34m(tensors, grad_tensors, retain_graph, create_graph, grad_variables, inputs)\u001b[0m\n\u001b[0;32m 262\u001b[0m retain_graph \u001b[38;5;241m=\u001b[39m create_graph\n\u001b[0;32m 264\u001b[0m \u001b[38;5;66;03m# The reason we repeat the same comment below is that\u001b[39;00m\n\u001b[0;32m 265\u001b[0m \u001b[38;5;66;03m# some Python versions print out the first line of a multi-line function\u001b[39;00m\n\u001b[0;32m 266\u001b[0m \u001b[38;5;66;03m# calls in the traceback and some print out the last line\u001b[39;00m\n\u001b[1;32m--> 267\u001b[0m \u001b[43m_engine_run_backward\u001b[49m\u001b[43m(\u001b[49m\n\u001b[0;32m 268\u001b[0m \u001b[43m \u001b[49m\u001b[43mtensors\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 269\u001b[0m \u001b[43m \u001b[49m\u001b[43mgrad_tensors_\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 270\u001b[0m \u001b[43m \u001b[49m\u001b[43mretain_graph\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 271\u001b[0m \u001b[43m \u001b[49m\u001b[43mcreate_graph\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 272\u001b[0m \u001b[43m \u001b[49m\u001b[43minputs\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 273\u001b[0m \u001b[43m \u001b[49m\u001b[43mallow_unreachable\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43;01mTrue\u001b[39;49;00m\u001b[43m,\u001b[49m\n\u001b[0;32m 274\u001b[0m \u001b[43m \u001b[49m\u001b[43maccumulate_grad\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43;01mTrue\u001b[39;49;00m\u001b[43m,\u001b[49m\n\u001b[0;32m 275\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n",
"File \u001b[1;32mc:\\Users\\Dominik\\Desktop\\Studia\\11.sem1\\en-ner-conll-2003\\.venv\\Lib\\site-packages\\torch\\autograd\\graph.py:744\u001b[0m, in \u001b[0;36m_engine_run_backward\u001b[1;34m(t_outputs, *args, **kwargs)\u001b[0m\n\u001b[0;32m 742\u001b[0m unregister_hooks \u001b[38;5;241m=\u001b[39m _register_logging_hooks_on_whole_graph(t_outputs)\n\u001b[0;32m 743\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[1;32m--> 744\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mVariable\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_execution_engine\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mrun_backward\u001b[49m\u001b[43m(\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;66;43;03m# Calls into the C++ engine to run the backward pass\u001b[39;49;00m\n\u001b[0;32m 745\u001b[0m \u001b[43m \u001b[49m\u001b[43mt_outputs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\n\u001b[0;32m 746\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m \u001b[38;5;66;03m# Calls into the C++ engine to run the backward pass\u001b[39;00m\n\u001b[0;32m 747\u001b[0m \u001b[38;5;28;01mfinally\u001b[39;00m:\n\u001b[0;32m 748\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m attach_logging_hooks:\n",
"\u001b[1;31mKeyboardInterrupt\u001b[0m: "
]
}
],
"source": [
"for i in range(NUM_EPOCHS):\n",
" lstm.train()\n",
" # for i in tqdm(range(500)):\n",
" for i in tqdm(range(len(train_labels))):\n",
" batch_tokens = train_tokens_ids[i].unsqueeze(0)\n",
" tags = train_labels[i].unsqueeze(1)\n",
"\n",
" predicted_tags = lstm(batch_tokens)\n",
"\n",
" optimizer.zero_grad()\n",
" loss = criterion(predicted_tags.squeeze(0), tags.squeeze(1))\n",
"\n",
" loss.backward()\n",
" optimizer.step()\n",
"\n",
" lstm.eval()"
]
},
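{
"cell_type": "markdown",
"metadata": {},
"source": [
"The run above was stopped with a `KeyboardInterrupt`, so checkpointing after each epoch avoids losing progress. A sketch using the standard `torch.save`/`load_state_dict` pair (the file name is arbitrary):"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Save and restore the tagger weights (file name chosen arbitrarily)\n",
"torch.save(lstm.state_dict(), 'lstm_ner.pt')\n",
"lstm.load_state_dict(torch.load('lstm_ner.pt'))"
]
},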
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"100%|██████████| 214/214 [00:00<00:00, 262.62it/s]\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"(0.6558005752636625, 0.7225352112676057, 0.687552353828112)\n"
]
}
],
"source": [
"print(eval_model(test_dev0_tokens_ids, test_dev0_labels, lstm))"
]
}
],
"metadata": {
"author": "Jakub Pokrywka",
"email": "kubapok@wmi.amu.edu.pl",
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"lang": "pl",
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.3"
},
"subtitle": "11.NER RNN[ćwiczenia]",
"title": "Ekstrakcja informacji",
"year": "2021"
},
"nbformat": 4,
"nbformat_minor": 4
}