{ "cells": [ { "cell_type": "code", "execution_count": 99, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "z8cJbMghvK3k", "outputId": "09520694-de64-4046-c2a6-639031aa1a10" }, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "Requirement already satisfied: requests in /usr/local/lib/python3.10/dist-packages (2.32.3)\n", "Requirement already satisfied: charset-normalizer<4,>=2 in /usr/local/lib/python3.10/dist-packages (from requests) (3.3.2)\n", "Requirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.10/dist-packages (from requests) (3.7)\n", "Requirement already satisfied: urllib3<3,>=1.21.1 in /usr/local/lib/python3.10/dist-packages (from requests) (2.0.7)\n", "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.10/dist-packages (from requests) (2024.6.2)\n", "Requirement already satisfied: jieba in /usr/local/lib/python3.10/dist-packages (0.42.1)\n", "Requirement already satisfied: pypinyin in /usr/local/lib/python3.10/dist-packages (0.51.0)\n", "Using pip 23.1.2 from /usr/local/lib/python3.10/dist-packages/pip (python 3.10)\n", "Looking in indexes: https://download.pytorch.org/whl/cu118\n", "Requirement already satisfied: torch in /usr/local/lib/python3.10/dist-packages (2.3.1+cu118)\n", "Requirement already satisfied: torchtext in /usr/local/lib/python3.10/dist-packages (0.18.0)\n", "Requirement already satisfied: filelock in /usr/local/lib/python3.10/dist-packages (from torch) (3.14.0)\n", "Requirement already satisfied: typing-extensions>=4.8.0 in /usr/local/lib/python3.10/dist-packages (from torch) (4.12.1)\n", "Requirement already satisfied: sympy in /usr/local/lib/python3.10/dist-packages (from torch) (1.12.1)\n", "Requirement already satisfied: networkx in /usr/local/lib/python3.10/dist-packages (from torch) (3.3)\n", "Requirement already satisfied: jinja2 in /usr/local/lib/python3.10/dist-packages (from torch) (3.1.4)\n", "Requirement already satisfied: fsspec in 
/usr/local/lib/python3.10/dist-packages (from torch) (2023.6.0)\n", "Requirement already satisfied: nvidia-cuda-nvrtc-cu11==11.8.89 in /usr/local/lib/python3.10/dist-packages (from torch) (11.8.89)\n", "Requirement already satisfied: nvidia-cuda-runtime-cu11==11.8.89 in /usr/local/lib/python3.10/dist-packages (from torch) (11.8.89)\n", "Requirement already satisfied: nvidia-cuda-cupti-cu11==11.8.87 in /usr/local/lib/python3.10/dist-packages (from torch) (11.8.87)\n", "Requirement already satisfied: nvidia-cudnn-cu11==8.7.0.84 in /usr/local/lib/python3.10/dist-packages (from torch) (8.7.0.84)\n", "Requirement already satisfied: nvidia-cublas-cu11==11.11.3.6 in /usr/local/lib/python3.10/dist-packages (from torch) (11.11.3.6)\n", "Requirement already satisfied: nvidia-cufft-cu11==10.9.0.58 in /usr/local/lib/python3.10/dist-packages (from torch) (10.9.0.58)\n", "Requirement already satisfied: nvidia-curand-cu11==10.3.0.86 in /usr/local/lib/python3.10/dist-packages (from torch) (10.3.0.86)\n", "Requirement already satisfied: nvidia-cusolver-cu11==11.4.1.48 in /usr/local/lib/python3.10/dist-packages (from torch) (11.4.1.48)\n", "Requirement already satisfied: nvidia-cusparse-cu11==11.7.5.86 in /usr/local/lib/python3.10/dist-packages (from torch) (11.7.5.86)\n", "Requirement already satisfied: nvidia-nccl-cu11==2.20.5 in /usr/local/lib/python3.10/dist-packages (from torch) (2.20.5)\n", "Requirement already satisfied: nvidia-nvtx-cu11==11.8.86 in /usr/local/lib/python3.10/dist-packages (from torch) (11.8.86)\n", "Requirement already satisfied: triton==2.3.1 in /usr/local/lib/python3.10/dist-packages (from torch) (2.3.1)\n", "Requirement already satisfied: tqdm in /usr/local/lib/python3.10/dist-packages (from torchtext) (4.66.4)\n", "Requirement already satisfied: requests in /usr/local/lib/python3.10/dist-packages (from torchtext) (2.32.3)\n", "Requirement already satisfied: numpy in /usr/local/lib/python3.10/dist-packages (from torchtext) (1.25.2)\n", "Requirement already 
satisfied: MarkupSafe>=2.0 in /usr/local/lib/python3.10/dist-packages (from jinja2->torch) (2.1.5)\n", "Requirement already satisfied: charset-normalizer<4,>=2 in /usr/local/lib/python3.10/dist-packages (from requests->torchtext) (3.3.2)\n", "Requirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.10/dist-packages (from requests->torchtext) (3.7)\n", "Requirement already satisfied: urllib3<3,>=1.21.1 in /usr/local/lib/python3.10/dist-packages (from requests->torchtext) (2.0.7)\n", "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.10/dist-packages (from requests->torchtext) (2024.6.2)\n", "Requirement already satisfied: mpmath<1.4.0,>=1.1.0 in /usr/local/lib/python3.10/dist-packages (from sympy->torch) (1.3.0)\n", "Requirement already satisfied: chardet in /usr/local/lib/python3.10/dist-packages (5.2.0)\n", "Requirement already satisfied: transformers in /usr/local/lib/python3.10/dist-packages (4.41.2)\n", "Requirement already satisfied: filelock in /usr/local/lib/python3.10/dist-packages (from transformers) (3.14.0)\n", "Requirement already satisfied: huggingface-hub<1.0,>=0.23.0 in /usr/local/lib/python3.10/dist-packages (from transformers) (0.23.2)\n", "Requirement already satisfied: numpy>=1.17 in /usr/local/lib/python3.10/dist-packages (from transformers) (1.25.2)\n", "Requirement already satisfied: packaging>=20.0 in /usr/local/lib/python3.10/dist-packages (from transformers) (24.0)\n", "Requirement already satisfied: pyyaml>=5.1 in /usr/local/lib/python3.10/dist-packages (from transformers) (6.0.1)\n", "Requirement already satisfied: regex!=2019.12.17 in /usr/local/lib/python3.10/dist-packages (from transformers) (2024.5.15)\n", "Requirement already satisfied: requests in /usr/local/lib/python3.10/dist-packages (from transformers) (2.32.3)\n", "Requirement already satisfied: tokenizers<0.20,>=0.19 in /usr/local/lib/python3.10/dist-packages (from transformers) (0.19.1)\n", "Requirement already satisfied: 
safetensors>=0.4.1 in /usr/local/lib/python3.10/dist-packages (from transformers) (0.4.3)\n", "Requirement already satisfied: tqdm>=4.27 in /usr/local/lib/python3.10/dist-packages (from transformers) (4.66.4)\n", "Requirement already satisfied: fsspec>=2023.5.0 in /usr/local/lib/python3.10/dist-packages (from huggingface-hub<1.0,>=0.23.0->transformers) (2023.6.0)\n", "Requirement already satisfied: typing-extensions>=3.7.4.3 in /usr/local/lib/python3.10/dist-packages (from huggingface-hub<1.0,>=0.23.0->transformers) (4.12.1)\n", "Requirement already satisfied: charset-normalizer<4,>=2 in /usr/local/lib/python3.10/dist-packages (from requests->transformers) (3.3.2)\n", "Requirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.10/dist-packages (from requests->transformers) (3.7)\n", "Requirement already satisfied: urllib3<3,>=1.21.1 in /usr/local/lib/python3.10/dist-packages (from requests->transformers) (2.0.7)\n", "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.10/dist-packages (from requests->transformers) (2024.6.2)\n", "Requirement already satisfied: ipywidgets in /usr/local/lib/python3.10/dist-packages (7.7.1)\n", "Requirement already satisfied: ipykernel>=4.5.1 in /usr/local/lib/python3.10/dist-packages (from ipywidgets) (5.5.6)\n", "Requirement already satisfied: ipython-genutils~=0.2.0 in /usr/local/lib/python3.10/dist-packages (from ipywidgets) (0.2.0)\n", "Requirement already satisfied: traitlets>=4.3.1 in /usr/local/lib/python3.10/dist-packages (from ipywidgets) (5.7.1)\n", "Requirement already satisfied: widgetsnbextension~=3.6.0 in /usr/local/lib/python3.10/dist-packages (from ipywidgets) (3.6.6)\n", "Requirement already satisfied: ipython>=4.0.0 in /usr/local/lib/python3.10/dist-packages (from ipywidgets) (7.34.0)\n", "Requirement already satisfied: jupyterlab-widgets>=1.0.0 in /usr/local/lib/python3.10/dist-packages (from ipywidgets) (3.0.11)\n", "Requirement already satisfied: jupyter-client in 
/usr/local/lib/python3.10/dist-packages (from ipykernel>=4.5.1->ipywidgets) (8.6.2)\n", "Requirement already satisfied: tornado>=4.2 in /usr/local/lib/python3.10/dist-packages (from ipykernel>=4.5.1->ipywidgets) (6.3.3)\n", "Requirement already satisfied: setuptools>=18.5 in /usr/local/lib/python3.10/dist-packages (from ipython>=4.0.0->ipywidgets) (67.7.2)\n", "Requirement already satisfied: jedi>=0.16 in /usr/local/lib/python3.10/dist-packages (from ipython>=4.0.0->ipywidgets) (0.19.1)\n", "Requirement already satisfied: decorator in /usr/local/lib/python3.10/dist-packages (from ipython>=4.0.0->ipywidgets) (4.4.2)\n", "Requirement already satisfied: pickleshare in /usr/local/lib/python3.10/dist-packages (from ipython>=4.0.0->ipywidgets) (0.7.5)\n", "Requirement already satisfied: prompt-toolkit!=3.0.0,!=3.0.1,<3.1.0,>=2.0.0 in /usr/local/lib/python3.10/dist-packages (from ipython>=4.0.0->ipywidgets) (3.0.45)\n", "Requirement already satisfied: pygments in /usr/local/lib/python3.10/dist-packages (from ipython>=4.0.0->ipywidgets) (2.16.1)\n", "Requirement already satisfied: backcall in /usr/local/lib/python3.10/dist-packages (from ipython>=4.0.0->ipywidgets) (0.2.0)\n", "Requirement already satisfied: matplotlib-inline in /usr/local/lib/python3.10/dist-packages (from ipython>=4.0.0->ipywidgets) (0.1.7)\n", "Requirement already satisfied: pexpect>4.3 in /usr/local/lib/python3.10/dist-packages (from ipython>=4.0.0->ipywidgets) (4.9.0)\n", "Requirement already satisfied: notebook>=4.4.1 in /usr/local/lib/python3.10/dist-packages (from widgetsnbextension~=3.6.0->ipywidgets) (6.5.5)\n", "Requirement already satisfied: parso<0.9.0,>=0.8.3 in /usr/local/lib/python3.10/dist-packages (from jedi>=0.16->ipython>=4.0.0->ipywidgets) (0.8.4)\n", "Requirement already satisfied: jinja2 in /usr/local/lib/python3.10/dist-packages (from notebook>=4.4.1->widgetsnbextension~=3.6.0->ipywidgets) (3.1.4)\n", "Requirement already satisfied: pyzmq<25,>=17 in 
/usr/local/lib/python3.10/dist-packages (from notebook>=4.4.1->widgetsnbextension~=3.6.0->ipywidgets) (24.0.1)\n", "Requirement already satisfied: argon2-cffi in /usr/local/lib/python3.10/dist-packages (from notebook>=4.4.1->widgetsnbextension~=3.6.0->ipywidgets) (23.1.0)\n", "Requirement already satisfied: jupyter-core>=4.6.1 in /usr/local/lib/python3.10/dist-packages (from notebook>=4.4.1->widgetsnbextension~=3.6.0->ipywidgets) (5.7.2)\n", "Collecting jupyter-client (from ipykernel>=4.5.1->ipywidgets)\n", " Using cached jupyter_client-7.4.9-py3-none-any.whl (133 kB)\n", "Requirement already satisfied: nbformat in /usr/local/lib/python3.10/dist-packages (from notebook>=4.4.1->widgetsnbextension~=3.6.0->ipywidgets) (5.10.4)\n", "Requirement already satisfied: nbconvert>=5 in /usr/local/lib/python3.10/dist-packages (from notebook>=4.4.1->widgetsnbextension~=3.6.0->ipywidgets) (6.5.4)\n", "Requirement already satisfied: nest-asyncio>=1.5 in /usr/local/lib/python3.10/dist-packages (from notebook>=4.4.1->widgetsnbextension~=3.6.0->ipywidgets) (1.6.0)\n", "Requirement already satisfied: Send2Trash>=1.8.0 in /usr/local/lib/python3.10/dist-packages (from notebook>=4.4.1->widgetsnbextension~=3.6.0->ipywidgets) (1.8.3)\n", "Requirement already satisfied: terminado>=0.8.3 in /usr/local/lib/python3.10/dist-packages (from notebook>=4.4.1->widgetsnbextension~=3.6.0->ipywidgets) (0.18.1)\n", "Requirement already satisfied: prometheus-client in /usr/local/lib/python3.10/dist-packages (from notebook>=4.4.1->widgetsnbextension~=3.6.0->ipywidgets) (0.20.0)\n", "Requirement already satisfied: nbclassic>=0.4.7 in /usr/local/lib/python3.10/dist-packages (from notebook>=4.4.1->widgetsnbextension~=3.6.0->ipywidgets) (1.1.0)\n", "Requirement already satisfied: entrypoints in /usr/local/lib/python3.10/dist-packages (from jupyter-client->ipykernel>=4.5.1->ipywidgets) (0.4)\n", "Requirement already satisfied: python-dateutil>=2.8.2 in /usr/local/lib/python3.10/dist-packages (from 
jupyter-client->ipykernel>=4.5.1->ipywidgets) (2.8.2)\n", "Requirement already satisfied: ptyprocess>=0.5 in /usr/local/lib/python3.10/dist-packages (from pexpect>4.3->ipython>=4.0.0->ipywidgets) (0.7.0)\n", "Requirement already satisfied: wcwidth in /usr/local/lib/python3.10/dist-packages (from prompt-toolkit!=3.0.0,!=3.0.1,<3.1.0,>=2.0.0->ipython>=4.0.0->ipywidgets) (0.2.13)\n", "Requirement already satisfied: platformdirs>=2.5 in /usr/local/lib/python3.10/dist-packages (from jupyter-core>=4.6.1->notebook>=4.4.1->widgetsnbextension~=3.6.0->ipywidgets) (4.2.2)\n", "Requirement already satisfied: notebook-shim>=0.2.3 in /usr/local/lib/python3.10/dist-packages (from nbclassic>=0.4.7->notebook>=4.4.1->widgetsnbextension~=3.6.0->ipywidgets) (0.2.4)\n", "Requirement already satisfied: lxml in /usr/local/lib/python3.10/dist-packages (from nbconvert>=5->notebook>=4.4.1->widgetsnbextension~=3.6.0->ipywidgets) (4.9.4)\n", "Requirement already satisfied: beautifulsoup4 in /usr/local/lib/python3.10/dist-packages (from nbconvert>=5->notebook>=4.4.1->widgetsnbextension~=3.6.0->ipywidgets) (4.12.3)\n", "Requirement already satisfied: bleach in /usr/local/lib/python3.10/dist-packages (from nbconvert>=5->notebook>=4.4.1->widgetsnbextension~=3.6.0->ipywidgets) (6.1.0)\n", "Requirement already satisfied: defusedxml in /usr/local/lib/python3.10/dist-packages (from nbconvert>=5->notebook>=4.4.1->widgetsnbextension~=3.6.0->ipywidgets) (0.7.1)\n", "Requirement already satisfied: jupyterlab-pygments in /usr/local/lib/python3.10/dist-packages (from nbconvert>=5->notebook>=4.4.1->widgetsnbextension~=3.6.0->ipywidgets) (0.3.0)\n", "Requirement already satisfied: MarkupSafe>=2.0 in /usr/local/lib/python3.10/dist-packages (from nbconvert>=5->notebook>=4.4.1->widgetsnbextension~=3.6.0->ipywidgets) (2.1.5)\n", "Requirement already satisfied: mistune<2,>=0.8.1 in /usr/local/lib/python3.10/dist-packages (from nbconvert>=5->notebook>=4.4.1->widgetsnbextension~=3.6.0->ipywidgets) (0.8.4)\n", 
"Requirement already satisfied: nbclient>=0.5.0 in /usr/local/lib/python3.10/dist-packages (from nbconvert>=5->notebook>=4.4.1->widgetsnbextension~=3.6.0->ipywidgets) (0.10.0)\n", "Requirement already satisfied: packaging in /usr/local/lib/python3.10/dist-packages (from nbconvert>=5->notebook>=4.4.1->widgetsnbextension~=3.6.0->ipywidgets) (24.0)\n", "Requirement already satisfied: pandocfilters>=1.4.1 in /usr/local/lib/python3.10/dist-packages (from nbconvert>=5->notebook>=4.4.1->widgetsnbextension~=3.6.0->ipywidgets) (1.5.1)\n", "Requirement already satisfied: tinycss2 in /usr/local/lib/python3.10/dist-packages (from nbconvert>=5->notebook>=4.4.1->widgetsnbextension~=3.6.0->ipywidgets) (1.3.0)\n", "Requirement already satisfied: fastjsonschema>=2.15 in /usr/local/lib/python3.10/dist-packages (from nbformat->notebook>=4.4.1->widgetsnbextension~=3.6.0->ipywidgets) (2.19.1)\n", "Requirement already satisfied: jsonschema>=2.6 in /usr/local/lib/python3.10/dist-packages (from nbformat->notebook>=4.4.1->widgetsnbextension~=3.6.0->ipywidgets) (4.19.2)\n", "Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.10/dist-packages (from python-dateutil>=2.8.2->jupyter-client->ipykernel>=4.5.1->ipywidgets) (1.16.0)\n", "Requirement already satisfied: argon2-cffi-bindings in /usr/local/lib/python3.10/dist-packages (from argon2-cffi->notebook>=4.4.1->widgetsnbextension~=3.6.0->ipywidgets) (21.2.0)\n", "Requirement already satisfied: attrs>=22.2.0 in /usr/local/lib/python3.10/dist-packages (from jsonschema>=2.6->nbformat->notebook>=4.4.1->widgetsnbextension~=3.6.0->ipywidgets) (23.2.0)\n", "Requirement already satisfied: jsonschema-specifications>=2023.03.6 in /usr/local/lib/python3.10/dist-packages (from jsonschema>=2.6->nbformat->notebook>=4.4.1->widgetsnbextension~=3.6.0->ipywidgets) (2023.12.1)\n", "Requirement already satisfied: referencing>=0.28.4 in /usr/local/lib/python3.10/dist-packages (from 
jsonschema>=2.6->nbformat->notebook>=4.4.1->widgetsnbextension~=3.6.0->ipywidgets) (0.35.1)\n", "Requirement already satisfied: rpds-py>=0.7.1 in /usr/local/lib/python3.10/dist-packages (from jsonschema>=2.6->nbformat->notebook>=4.4.1->widgetsnbextension~=3.6.0->ipywidgets) (0.18.1)\n", "Requirement already satisfied: jupyter-server<3,>=1.8 in /usr/local/lib/python3.10/dist-packages (from notebook-shim>=0.2.3->nbclassic>=0.4.7->notebook>=4.4.1->widgetsnbextension~=3.6.0->ipywidgets) (1.24.0)\n", "Requirement already satisfied: cffi>=1.0.1 in /usr/local/lib/python3.10/dist-packages (from argon2-cffi-bindings->argon2-cffi->notebook>=4.4.1->widgetsnbextension~=3.6.0->ipywidgets) (1.16.0)\n", "Requirement already satisfied: soupsieve>1.2 in /usr/local/lib/python3.10/dist-packages (from beautifulsoup4->nbconvert>=5->notebook>=4.4.1->widgetsnbextension~=3.6.0->ipywidgets) (2.5)\n", "Requirement already satisfied: webencodings in /usr/local/lib/python3.10/dist-packages (from bleach->nbconvert>=5->notebook>=4.4.1->widgetsnbextension~=3.6.0->ipywidgets) (0.5.1)\n", "Requirement already satisfied: pycparser in /usr/local/lib/python3.10/dist-packages (from cffi>=1.0.1->argon2-cffi-bindings->argon2-cffi->notebook>=4.4.1->widgetsnbextension~=3.6.0->ipywidgets) (2.22)\n", "Requirement already satisfied: anyio<4,>=3.1.0 in /usr/local/lib/python3.10/dist-packages (from jupyter-server<3,>=1.8->notebook-shim>=0.2.3->nbclassic>=0.4.7->notebook>=4.4.1->widgetsnbextension~=3.6.0->ipywidgets) (3.7.1)\n", "Requirement already satisfied: websocket-client in /usr/local/lib/python3.10/dist-packages (from jupyter-server<3,>=1.8->notebook-shim>=0.2.3->nbclassic>=0.4.7->notebook>=4.4.1->widgetsnbextension~=3.6.0->ipywidgets) (1.8.0)\n", "Requirement already satisfied: idna>=2.8 in /usr/local/lib/python3.10/dist-packages (from anyio<4,>=3.1.0->jupyter-server<3,>=1.8->notebook-shim>=0.2.3->nbclassic>=0.4.7->notebook>=4.4.1->widgetsnbextension~=3.6.0->ipywidgets) (3.7)\n", "Requirement already 
satisfied: sniffio>=1.1 in /usr/local/lib/python3.10/dist-packages (from anyio<4,>=3.1.0->jupyter-server<3,>=1.8->notebook-shim>=0.2.3->nbclassic>=0.4.7->notebook>=4.4.1->widgetsnbextension~=3.6.0->ipywidgets) (1.3.1)\n", "Requirement already satisfied: exceptiongroup in /usr/local/lib/python3.10/dist-packages (from anyio<4,>=3.1.0->jupyter-server<3,>=1.8->notebook-shim>=0.2.3->nbclassic>=0.4.7->notebook>=4.4.1->widgetsnbextension~=3.6.0->ipywidgets) (1.2.1)\n", "Installing collected packages: jupyter-client\n", " Attempting uninstall: jupyter-client\n", " Found existing installation: jupyter_client 8.6.2\n", " Uninstalling jupyter_client-8.6.2:\n", " Successfully uninstalled jupyter_client-8.6.2\n", "Successfully installed jupyter-client-7.4.9\n", "Requirement already satisfied: jupyter_core in /usr/local/lib/python3.10/dist-packages (5.7.2)\n", "Requirement already satisfied: jupyter_client in /usr/local/lib/python3.10/dist-packages (7.4.9)\n", "Collecting jupyter_client\n", " Using cached jupyter_client-8.6.2-py3-none-any.whl (105 kB)\n", "Requirement already satisfied: platformdirs>=2.5 in /usr/local/lib/python3.10/dist-packages (from jupyter_core) (4.2.2)\n", "Requirement already satisfied: traitlets>=5.3 in /usr/local/lib/python3.10/dist-packages (from jupyter_core) (5.7.1)\n", "Requirement already satisfied: python-dateutil>=2.8.2 in /usr/local/lib/python3.10/dist-packages (from jupyter_client) (2.8.2)\n", "Requirement already satisfied: pyzmq>=23.0 in /usr/local/lib/python3.10/dist-packages (from jupyter_client) (24.0.1)\n", "Requirement already satisfied: tornado>=6.2 in /usr/local/lib/python3.10/dist-packages (from jupyter_client) (6.3.3)\n", "Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.10/dist-packages (from python-dateutil>=2.8.2->jupyter_client) (1.16.0)\n", "Installing collected packages: jupyter_client\n", " Attempting uninstall: jupyter_client\n", " Found existing installation: jupyter_client 7.4.9\n", " Uninstalling 
jupyter_client-7.4.9:\n", " Successfully uninstalled jupyter_client-7.4.9\n", "\u001b[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.\n", "notebook 6.5.5 requires jupyter-client<8,>=5.3.4, but you have jupyter-client 8.6.2 which is incompatible.\u001b[0m\u001b[31m\n", "\u001b[0mSuccessfully installed jupyter_client-8.6.2\n", "Requirement already satisfied: pandas in /usr/local/lib/python3.10/dist-packages (2.0.3)\n", "Requirement already satisfied: python-dateutil>=2.8.2 in /usr/local/lib/python3.10/dist-packages (from pandas) (2.8.2)\n", "Requirement already satisfied: pytz>=2020.1 in /usr/local/lib/python3.10/dist-packages (from pandas) (2023.4)\n", "Requirement already satisfied: tzdata>=2022.1 in /usr/local/lib/python3.10/dist-packages (from pandas) (2024.1)\n", "Requirement already satisfied: numpy>=1.21.0 in /usr/local/lib/python3.10/dist-packages (from pandas) (1.25.2)\n", "Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.10/dist-packages (from python-dateutil>=2.8.2->pandas) (1.16.0)\n", "\u001b[31mERROR: Could not find a version that satisfies the requirement re (from versions: none)\u001b[0m\u001b[31m\n", "\u001b[0m\u001b[31mERROR: No matching distribution found for re\u001b[0m\u001b[31m\n", "\u001b[0mRequirement already satisfied: scikit-learn in /usr/local/lib/python3.10/dist-packages (1.2.2)\n", "Requirement already satisfied: numpy>=1.17.3 in /usr/local/lib/python3.10/dist-packages (from scikit-learn) (1.25.2)\n", "Requirement already satisfied: scipy>=1.3.2 in /usr/local/lib/python3.10/dist-packages (from scikit-learn) (1.10.1)\n", "Requirement already satisfied: joblib>=1.1.1 in /usr/local/lib/python3.10/dist-packages (from scikit-learn) (1.4.2)\n", "Requirement already satisfied: threadpoolctl>=2.0.0 in /usr/local/lib/python3.10/dist-packages (from scikit-learn) (3.5.0)\n", "Requirement already 
satisfied: scipy==1.10.1 in /usr/local/lib/python3.10/dist-packages (1.10.1)\n", "Requirement already satisfied: numpy<1.27.0,>=1.19.5 in /usr/local/lib/python3.10/dist-packages (from scipy==1.10.1) (1.25.2)\n", "Requirement already satisfied: gensim in /usr/local/lib/python3.10/dist-packages (4.3.2)\n", "Requirement already satisfied: numpy>=1.18.5 in /usr/local/lib/python3.10/dist-packages (from gensim) (1.25.2)\n", "Requirement already satisfied: scipy>=1.7.0 in /usr/local/lib/python3.10/dist-packages (from gensim) (1.10.1)\n", "Requirement already satisfied: smart-open>=1.8.1 in /usr/local/lib/python3.10/dist-packages (from gensim) (6.4.0)\n" ] } ], "source": [ "%pip install --upgrade requests\n", "%pip install jieba\n", "%pip install pypinyin\n", "%pip install -v torch torchtext --index-url https://download.pytorch.org/whl/cu118\n", "%pip install chardet\n", "%pip install transformers\n", "%pip install ipywidgets\n", "%pip install --upgrade jupyter_core jupyter_client\n", "%pip install pandas\n", "%pip install scikit-learn\n", "%pip install scipy==1.10.1\n", "%pip install gensim" ] }, { "cell_type": "code", "execution_count": 100, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "-fw8K0r8vK3m", "outputId": "7c973fa0-05f5-42c6-cb7e-66b8d0500c6b" }, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "Enabling notebook extension jupyter-js-widgets/extension...\n", "Paths used for configuration of notebook: \n", " \t/root/.jupyter/nbconfig/notebook.json\n", "Paths used for configuration of notebook: \n", " \t\n", " - Validating: \u001b[32mOK\u001b[0m\n", "Paths used for configuration of notebook: \n", " \t/root/.jupyter/nbconfig/notebook.json\n" ] } ], "source": [ "!jupyter nbextension enable --py widgetsnbextension\n", "import jieba\n", "import pypinyin\n", "import torch\n", "from transformers import AutoTokenizer, AutoModel\n", "import pandas\n", "import re\n", "from sklearn.model_selection import 
train_test_split\n", "from sklearn.datasets import load_iris\n", "import numpy" ] }, { "cell_type": "code", "execution_count": 101, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "5EWDs2qIvK3n", "outputId": "a0d96d62-9121-43de-9924-5fa9c67c0a6f" }, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "True\n" ] } ], "source": [ "print(torch.cuda.is_available())\n", "device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")" ] }, { "cell_type": "code", "source": [ "from google.colab import drive\n", "drive.mount('/content/drive')" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "V7yH6_p0xVM4", "outputId": "89c42e07-65d0-4e8b-ecfa-8d0c0c2fa7cf" }, "execution_count": 102, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount(\"/content/drive\", force_remount=True).\n" ] } ] }, { "cell_type": "markdown", "metadata": { "id": "hAfKmY-8vK3o" }, "source": [ "## Normalizacja wejścia - pozbycie się spacji i znaków innych niż chińskie (interpunkcyjnych).\n", "### TODO - przepisać używając słownika znaków chińskich?" 
] }, { "cell_type": "code", "execution_count": 103, "metadata": { "id": "70WYCTvLvK3p" }, "outputs": [], "source": [ "# lower() - male litery\n", "# strip() - bez krancowych znakow niedrukowalnych\n", "# bez znakow interpunkcyjnych\n", "def normalizeString(s):\n", " s = s.lower().strip()\n", " s = re.sub(r\"([.!?])\", r\"\", s)\n", " s = re.sub(r\"([,;:-])\", r\"\", s)\n", " s = re.sub(r\"([。,?”“《》·、!:;π…ㄚ])\", r\"\", s)\n", " s = re.sub(r\"([/])\", r\"\", s)\n", " s = re.sub(r\"(['\\\"])\", r\" \", s)\n", " return s.strip()\n", "\n", "def normalizeChinese(s):\n", " s = normalizeString(s)\n", " pom = \"\"\n", " for c in s:\n", " if c != \" \":\n", " pom+=c\n", " #pom+=\" \"\n", " return pom.strip()" ] }, { "cell_type": "markdown", "metadata": { "id": "AzN_U3XuvK3p" }, "source": [ "## Wczytanie zbioru danych. https://www.kaggle.com/datasets/marquis03/chinese-couplets-dataset" ] }, { "cell_type": "code", "execution_count": 104, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "YMFveDsRvK3q", "outputId": "0f9142d3-94bb-4bd1-d5aa-ad6c0d506a85" }, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "腾飞上铁锐意改革谋发展勇当千里马\n", "和谐南供安全送电保畅通争做领头羊\n" ] } ], "source": [ "fixed_couplets_in = pandas.read_csv(\"fixed_couplets_in.txt\", sep=\"\\t\", names=[\"in\"], header=None)\n", "fixed_couplets_out = pandas.read_csv(\"fixed_couplets_out.txt\", sep=\"\\t\", names=[\"out\"], header=None)\n", "\n", "normalized_fixed_couplets_in=[]\n", "for _ in fixed_couplets_in[\"in\"]:\n", " normalized_fixed_couplets_in.append(normalizeChinese(_))\n", "normalized_fixed_couplets_out=[]\n", "for _ in fixed_couplets_out[\"out\"]:\n", " normalized_fixed_couplets_out.append(normalizeChinese(_))\n", "\n", "print(normalized_fixed_couplets_in[0])\n", "print(normalized_fixed_couplets_out[0])" ] }, { "cell_type": "code", "execution_count": 105, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "lRWgbshhvK3q", "outputId": 
"633b6e24-3892-48e9-fde1-4e82b4d32e57" }, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ " in out\n", "0 腾飞上铁锐意改革谋发展勇当千里马 和谐南供安全送电保畅通争做领头羊\n", "1 风弦未拨心先乱 夜幕已沉梦更闲\n", "2 花梦粘于春袖口 莺声溅落柳枝头\n", "3 晋世文章昌二陆 魏家词赋重三曹\n", "4 一句相思吟岁月 千杯美酒醉风情\n", "... ... ...\n", "744910 半榻诗书盈陋室 一墙字画靓寒庐\n", "744911 借角青山埋姓字 掬壶明月洗尘心\n", "744912 苑内尽天姿锦窠仙髻无双艳 亭前多国色金粉紫檀第一香\n", "744913 浩淼洞庭极目天为界 安闲钓叟静心孰羡鱼\n", "744914 志踏云梯能揽月 坚磨铁棒可成针\n", "\n", "[744915 rows x 2 columns]\n" ] } ], "source": [ "fixed_couplets = pandas.DataFrame(\n", " {\"in\": normalized_fixed_couplets_in,\n", " \"out\": normalized_fixed_couplets_out\n", " }\n", " )\n", "print(fixed_couplets)" ] }, { "cell_type": "markdown", "metadata": { "id": "BaOkCk66vK3r" }, "source": [ "### Odrzucenie 99% danych - więcej niż 1% zajmuje całą pamięć i wywala program.\n", "### Podział danych na zbiór treningowy i testowy." ] }, { "cell_type": "code", "execution_count": 106, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "gZhvQ-9SvK3r", "outputId": "516353b4-84df-4b7c-fcd5-4ff7642cf46e" }, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ " in out\n", "567354 宇高炎暑净 秋爽飒风来\n", "118920 忧乐关天下 安危系一身\n", "738591 一盏相思量寂寞 三分惆怅兑凄凉\n", "509346 孝驻锦绣城喜吕梁歌飞春融三晋千秋画 义圆和谐梦看汾河景瑞水起九州万卷诗\n", "75388 春临八桂海豚舞 福满九州彩凤飞\n", "... ... ...\n", "116492 创中华古老文明当同日月齐辉功垂万代 启黎庶鸿蒙草昧是与山河并寿德颂千秋\n", "91658 纠缠海角指相思何时作罢 浪迹天涯心倦怠哪处归依\n", "101376 特地显英灵化被逢人歌泽渥 配天昭厚德恩深无处不波恬\n", "262048 温暖鹅城展翅奋飞中国梦 祥和蛇岁铺春欢庆小康年\n", "415192 百业一支歌歌伴和风谐雨唱 九江千古梦梦同朗月艳阳圆\n", "\n", "[5959 rows x 2 columns]\n", " in out\n", "274864 林霭渐浓迷古寺 尘烟已远隐青山\n", "222320 自古青天匡正义 而今华夏振雄风\n", "100260 真心请客就该一五一五 假意为情何必我开我开\n", "435928 爱本有心今不见 人如无欲意何求\n", "446991 欲抹闲愁实不易 谁将片语问何求\n", "... ... 
...\n", "213030 万象随缘观自在 一心发愿待君归\n", "299155 春联妙句动心魄 小院雅风入彩光\n", "643294 梅亭吹雪横霜笛 松麓邀云放月筝\n", "628861 红似桃花白似雪 绿如李叶亮如霜\n", "566605 数字双音分两用 联文对句限孤平\n", "\n", "[1490 rows x 2 columns]\n" ] } ], "source": [ "male, duze = train_test_split(fixed_couplets,test_size=0.99,random_state=42)\n", "treningowe, testowe = train_test_split(male,test_size=0.2,random_state=42)\n", "print(treningowe)\n", "print(testowe)" ] }, { "cell_type": "markdown", "metadata": { "id": "2Ek1LXPjvK3r" }, "source": [ "### Przywrócenie numeracji od 0." ] }, { "cell_type": "code", "execution_count": 107, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "Pe3iTklRvK3r", "outputId": "d8378e46-4634-49f0-f6c7-cc415cb2a032" }, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ " in out\n", "0 宇高炎暑净 秋爽飒风来\n", "1 忧乐关天下 安危系一身\n", "2 一盏相思量寂寞 三分惆怅兑凄凉\n", "3 孝驻锦绣城喜吕梁歌飞春融三晋千秋画 义圆和谐梦看汾河景瑞水起九州万卷诗\n", "4 春临八桂海豚舞 福满九州彩凤飞\n", "... ... ...\n", "5954 创中华古老文明当同日月齐辉功垂万代 启黎庶鸿蒙草昧是与山河并寿德颂千秋\n", "5955 纠缠海角指相思何时作罢 浪迹天涯心倦怠哪处归依\n", "5956 特地显英灵化被逢人歌泽渥 配天昭厚德恩深无处不波恬\n", "5957 温暖鹅城展翅奋飞中国梦 祥和蛇岁铺春欢庆小康年\n", "5958 百业一支歌歌伴和风谐雨唱 九江千古梦梦同朗月艳阳圆\n", "\n", "[5959 rows x 2 columns]\n", " in out\n", "0 林霭渐浓迷古寺 尘烟已远隐青山\n", "1 自古青天匡正义 而今华夏振雄风\n", "2 真心请客就该一五一五 假意为情何必我开我开\n", "3 爱本有心今不见 人如无欲意何求\n", "4 欲抹闲愁实不易 谁将片语问何求\n", "... ... ...\n", "1485 万象随缘观自在 一心发愿待君归\n", "1486 春联妙句动心魄 小院雅风入彩光\n", "1487 梅亭吹雪横霜笛 松麓邀云放月筝\n", "1488 红似桃花白似雪 绿如李叶亮如霜\n", "1489 数字双音分两用 联文对句限孤平\n", "\n", "[1490 rows x 2 columns]\n" ] } ], "source": [ "treningowe = treningowe.reset_index(drop=True)\n", "testowe = testowe.reset_index(drop=True)\n", "print(treningowe)\n", "print(testowe)" ] }, { "cell_type": "markdown", "metadata": { "id": "1_w2PXrGvK3s" }, "source": [ "### Pakiet *pypinyin* przewiduje wymowę pinyin dobrze bez potrzeby używania pakietu *jieba*." 
] }, { "cell_type": "code", "execution_count": 108, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "5dnmD3wevK3s", "outputId": "7b3a7727-f0f5-4657-e3b1-e4760c81144d" }, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "春临八桂海豚舞\n", "[['chun1'], ['lin2'], ['ba1'], ['gui4'], ['hai3'], ['tun2'], ['wu3']]\n", "['chun1', 'lin2', 'ba1', 'gui4', 'hai3', 'tun2', 'wu3']\n", "['春临', '八桂', '海豚', '舞']\n", "[['chun1'], ['lin2'], ['ba1'], ['gui4'], ['hai3'], ['tun2'], ['wu3']]\n", "['chun1', 'lin2', 'ba1', 'gui4', 'hai3', 'tun2', 'wu3']\n" ] } ], "source": [ "from pypinyin import pinyin, lazy_pinyin, Style\n", "\n", "zdanie = treningowe[\"in\"][4]\n", "print(zdanie)\n", "print(pinyin(zdanie, style=Style.TONE3, neutral_tone_with_five=True))\n", "print(lazy_pinyin(zdanie, style=Style.TONE3, neutral_tone_with_five=True))\n", "\n", "slowa = list(jieba.cut(zdanie))\n", "print(slowa)\n", "print(pinyin(slowa, style=Style.TONE3, neutral_tone_with_five=True))\n", "print(lazy_pinyin(slowa, style=Style.TONE3, neutral_tone_with_five=True))" ] }, { "cell_type": "markdown", "metadata": { "id": "LghjKuLGvK3s" }, "source": [ "## Podział wymowy pinyin na początki (initials), końcówki (finals) i tony.\n", "### Zamiana w liczby przy pomocy słownika."
] }, { "cell_type": "code", "execution_count": 109, "metadata": { "id": "xj0SAPGsvK3s" }, "outputs": [], "source": [ "from pypinyin.contrib.tone_convert import to_finals, to_initials\n", "# 声母表\n", "_INITIALS=['b','p','m','f','d','t','n','l','g','k','h','j','q','x','zh','ch','sh','r','z','c','s',]\n", "# 声母表,把 y,w 也当作声母\n", "_INITIALS_NOT_STRICT=_INITIALS+['y','w']\n", "# 韵母表\n", "_FINALS=['i','u','ü','a','ia','ua','o','uo','e','ie','üe','ai','uai','ei','uei','ao','iao','ou','iou','an','ian','uan','üan','en','in','uen','ün','ang','iang','uang','eng','ing','ueng','ong','iong','er','ê',]\n", "\n", "slownik_initials = {}\n", "licznik = 1\n", "for indeks_wersu_pierwszego in _INITIALS+[\"\"]:\n", " slownik_initials[indeks_wersu_pierwszego] = licznik\n", " licznik+=1\n", "\n", "slownik_finals = {}\n", "licznik = 1\n", "for indeks_wersu_pierwszego in _FINALS+[\"\"]:\n", " slownik_finals[indeks_wersu_pierwszego] = licznik\n", " licznik+=1\n", "\n", "def poczatek_koniec_ton(zapis_pinyin_3):\n", " poczatek = slownik_initials[to_initials(zapis_pinyin_3)]\n", " koniec = slownik_finals[to_finals(zapis_pinyin_3).replace('v', 'ü')]\n", " ton = int(zapis_pinyin_3[-1])\n", " return poczatek, koniec, ton\n", "\n", "def wymowy_i_tony_zdania(zdanie):\n", " zapis_pinyin_3_zdania = lazy_pinyin(zdanie, style=Style.TONE3, neutral_tone_with_five=True)\n", " poczatki = []\n", " konce =[]\n", " tony = []\n", " # print(zdanie, zapis_pinyin_3_zdania)\n", " for zp3 in zapis_pinyin_3_zdania:\n", " p,k,t = poczatek_koniec_ton(zp3)\n", " poczatki.append(p)\n", " konce.append(k)\n", " tony.append(t)\n", " return poczatki, konce, tony\n", "\n", "def dopasuj_dlugosc_wektora(wektor, dlugosc_wektora):\n", " if len(wektor)>dlugosc_wektora:\n", " wynik = wektor[:dlugosc_wektora]\n", " else:\n", " wynik = numpy.pad(wektor,(0,dlugosc_wektora-len(wektor)), mode='constant', constant_values=0)\n", " return wynik" ] }, { "cell_type": "code", "execution_count": 110, "metadata": { "id": "s4NN04ZpvK3t" }, 
"outputs": [], "source": [ "def poczatki_konce_tony_dla_zdan(zdania, liczba_wejscia):\n", " poczatki_wyn = []\n", " konce_wyn = []\n", " tony_wyn = []\n", "\n", " for zdanie in zdania:\n", " poczatki, konce, tony = wymowy_i_tony_zdania(zdanie)\n", "\n", " poczatki = dopasuj_dlugosc_wektora(poczatki, liczba_wejscia)\n", " konce = dopasuj_dlugosc_wektora(konce, liczba_wejscia)\n", " tony = dopasuj_dlugosc_wektora(tony, liczba_wejscia)\n", "\n", " poczatki_wyn.append(poczatki)\n", " konce_wyn.append(konce)\n", " tony_wyn.append(tony)\n", "\n", " return poczatki_wyn, konce_wyn, tony_wyn" ] }, { "cell_type": "markdown", "metadata": { "id": "fyj5LRN3vK3t" }, "source": [ "### Początki, końcówki i tony wierszy treningowych." ] }, { "cell_type": "code", "execution_count": 111, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "vvooWU0hvK3t", "outputId": "e08da1e6-3955-4e7b-fcda-a4a75231fec9" }, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "5959\n", "宇高炎暑净\n", "['yu3', 'gao1', 'yan2', 'shu3', 'jing4']\n", "秋爽飒风来\n", "['qiu1', 'shuang3', 'sa4', 'feng1', 'lai2']\n", "5959\n", "[22 9 22 17 12 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n", " 0 0 0 0 0 0 0 0 0 0 0]\n", "[ 3 16 21 2 32 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n", " 0 0 0 0 0 0 0 0 0 0 0]\n", "[3 1 2 3 4 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]\n", "[13 17 21 4 8 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n", " 0 0 0 0 0 0 0 0 0 0 0]\n", "[19 30 4 31 12 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n", " 0 0 0 0 0 0 0 0 0 0 0]\n", "[1 3 4 1 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]\n" ] } ], "source": [ "print(len(treningowe[\"in\"]))\n", "print(treningowe[\"in\"][0])\n", "print(lazy_pinyin(treningowe[\"in\"][0], style=Style.TONE3, neutral_tone_with_five=True))\n", "print(treningowe[\"out\"][0])\n", "print(lazy_pinyin(treningowe[\"out\"][0], style=Style.TONE3, neutral_tone_with_five=True))\n", "\n", "liczba_wejscia = 35\n", "\n", 
"poczatki_treningowe_in, konce_treningowe_in, tony_treningowe_in = poczatki_konce_tony_dla_zdan(treningowe[\"in\"], liczba_wejscia)\n", "poczatki_treningowe_out, konce_treningowe_out, tony_treningowe_out = poczatki_konce_tony_dla_zdan(treningowe[\"out\"], liczba_wejscia)\n", "\n", "print(len(poczatki_treningowe_in))\n", "print(poczatki_treningowe_in[0])\n", "print(konce_treningowe_in[0])\n", "print(tony_treningowe_in[0])\n", "print(poczatki_treningowe_out[0])\n", "print(konce_treningowe_out[0])\n", "print(tony_treningowe_out[0])" ] }, { "cell_type": "markdown", "metadata": { "id": "ZQd6eh5bvK3u" }, "source": [ "## Zanurzenia BAAI wierszy treningowych. https://huggingface.co/BAAI/bge-small-zh-v1.5" ] }, { "cell_type": "code", "execution_count": 112, "metadata": { "id": "61vGtuyBvK3u" }, "outputs": [], "source": [ "# # Load model from HuggingFace Hub\n", "# tokenizer = AutoTokenizer.from_pretrained(\"BAAI/bge-small-zh-v1.5\")\n", "# model = AutoModel.from_pretrained(\"BAAI/bge-small-zh-v1.5\")\n", "# model.eval()\n", "\n", "# def zanurzenia_zdan(lista_zdan):\n", "# # Sentences we want sentence embeddings for\n", "# #sentences = [\"样例数据-1样例数据\", \"样例数据-2样例数据\"]\n", "# sentences = lista_zdan\n", "\n", "# # Tokenize sentences\n", "# encoded_input = tokenizer(sentences, padding=True, truncation=True, return_tensors='pt')\n", "# # for s2p(short query to long passage) retrieval task, add an instruction to query (not add instruction for passages)\n", "# # encoded_input = tokenizer([instruction + q for q in queries], padding=True, truncation=True, return_tensors='pt')\n", "\n", "# # Compute token embeddings\n", "# with torch.no_grad():\n", "# model_output = model(**encoded_input)\n", "# # Perform pooling. 
In this case, cls pooling.\n", "# sentence_embeddings = model_output[0][:, 0]\n", "# # normalize embeddings\n", "# sentence_embeddings = torch.nn.functional.normalize(sentence_embeddings, p=2, dim=1)\n", "# # print(\"Sentence embeddings shape:\", sentence_embeddings.shape)\n", "# # print(\"Sentence embeddings:\", sentence_embeddings)\n", "\n", "# return sentence_embeddings\n", "\n", "# def zanurzenie_zdania(zdanie):\n", "# # Tokenize sentences\n", "# encoded_input = tokenizer(zdanie, padding=True, truncation=True, return_tensors='pt')\n", "# # for s2p(short query to long passage) retrieval task, add an instruction to query (not add instruction for passages)\n", "# # encoded_input = tokenizer([instruction + q for q in queries], padding=True, truncation=True, return_tensors='pt')\n", "\n", "# # Compute token embeddings\n", "# with torch.no_grad():\n", "# model_output = model(**encoded_input)\n", "# # Perform pooling. In this case, cls pooling.\n", "# sentence_embedding = model_output[0][:, 0]\n", "# # normalize embeddings\n", "# sentence_embedding = torch.nn.functional.normalize(sentence_embedding, p=2, dim=1)\n", "\n", "# return sentence_embedding" ] }, { "cell_type": "code", "execution_count": 113, "metadata": { "id": "4gt7oqi4vK3u" }, "outputs": [], "source": [ "# treningowe_in_lista = treningowe[\"in\"].tolist()\n", "# treningowe_out_lista = treningowe[\"out\"].tolist()\n", "\n", "# print(len(treningowe_in_lista))\n", "# print(treningowe_in_lista[0])\n", "# print(treningowe_out_lista[0])\n", "\n", "# zanurzenia_treningowe_in = zanurzenia_zdan(treningowe_in_lista)\n", "# zanurzenia_treningowe_out = zanurzenia_zdan(treningowe_out_lista)\n", "\n", "# print(zanurzenia_treningowe_in.shape)\n", "# print(zanurzenia_treningowe_in[0])\n", "# print(zanurzenia_treningowe_out[0])" ] }, { "cell_type": "markdown", "metadata": { "id": "TAfBMDqlvK3u" }, "source": [ "### Tensory - reprezentacje pierwszych wersów wierszy treningowych." 
] }, { "cell_type": "code", "execution_count": 114, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "hV9h84mrvK3v", "outputId": "0fbd174d-e6f3-4a08-892e-0de2ac838f99" }, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "5959\n", "torch.Size([109])\n", "tensor([22, 9, 22, 17, 12, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3,\n", " 16, 21, 2, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 1,\n", " 2, 3, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5, 12, 32,\n", " 4])\n" ] } ], "source": [ "x = []\n", "for indeks_wersu_pierwszego in range(len(poczatki_treningowe_in)):\n", " poczatki = poczatki_treningowe_in\n", " konce = konce_treningowe_in\n", " tony = tony_treningowe_in\n", " niezerowe_poczatki = [p for p in poczatki[indeks_wersu_pierwszego] if p>0.0]\n", " poczatek_ostatniego_znaku = niezerowe_poczatki[len(niezerowe_poczatki)-1]\n", " koniec_ostatniego_znaku = konce[indeks_wersu_pierwszego][len(niezerowe_poczatki)-1]\n", " ton_ostatniego_znaku = tony[indeks_wersu_pierwszego][len(niezerowe_poczatki)-1]\n", " x.append(torch.cat(\n", " (\n", " # zanurzenia_treningowe_in[indeks_wersu_pierwszego],\n", " torch.from_numpy(poczatki_treningowe_in[indeks_wersu_pierwszego]),\n", " torch.from_numpy(konce_treningowe_in[indeks_wersu_pierwszego]),\n", " torch.from_numpy(tony_treningowe_in[indeks_wersu_pierwszego]),\n", " torch.from_numpy(numpy.array([len(niezerowe_poczatki)])),\n", " torch.from_numpy(numpy.array([poczatek_ostatniego_znaku])),\n", " torch.from_numpy(numpy.array([koniec_ostatniego_znaku])),\n", " torch.from_numpy(numpy.array([ton_ostatniego_znaku]))\n", " )\n", " ))\n", "print(len(x))\n", "print(x[0].shape)\n", "print(x[0])" ] }, { "cell_type": "markdown", "metadata": { "id": "wkLMwhjevK3v" }, "source": [ "### Tensory - reprezentacje drugich wersów wierszy 
treningowych." ] }, { "cell_type": "code", "execution_count": 115, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "PqKvzCJavK3v", "outputId": "dbfad29e-8312-49ea-a2fe-891604f8092a" }, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "5959\n", "torch.Size([109])\n", "tensor([13, 17, 21, 4, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 19,\n", " 30, 4, 31, 12, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 3,\n", " 4, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5, 8, 12,\n", " 2])\n" ] } ], "source": [ "y = []\n", "for indeks_wersu_pierwszego in range(len(poczatki_treningowe_out)):\n", " poczatki = poczatki_treningowe_out\n", " konce = konce_treningowe_out\n", " tony = tony_treningowe_out\n", " niezerowe_poczatki = [p for p in poczatki[indeks_wersu_pierwszego] if p>0.0]\n", " poczatek_ostatniego_znaku = niezerowe_poczatki[len(niezerowe_poczatki)-1]\n", " koniec_ostatniego_znaku = konce[indeks_wersu_pierwszego][len(niezerowe_poczatki)-1]\n", " ton_ostatniego_znaku = tony[indeks_wersu_pierwszego][len(niezerowe_poczatki)-1]\n", " y.append(\n", " torch.cat(\n", " (\n", " # zanurzenia_treningowe_out[indeks_wersu_pierwszego],\n", " torch.from_numpy(poczatki_treningowe_out[indeks_wersu_pierwszego]),\n", " torch.from_numpy(konce_treningowe_out[indeks_wersu_pierwszego]),\n", " torch.from_numpy(tony_treningowe_out[indeks_wersu_pierwszego]),\n", " torch.from_numpy(numpy.array([len(niezerowe_poczatki)])),\n", " torch.from_numpy(numpy.array([poczatek_ostatniego_znaku])),\n", " torch.from_numpy(numpy.array([koniec_ostatniego_znaku])),\n", " torch.from_numpy(numpy.array([ton_ostatniego_znaku]))\n", " )\n", " )\n", " )\n", "print(len(y))\n", "print(y[0].shape)\n", "print(y[0])" ] }, { "cell_type": "markdown", "metadata": { "id": "-8hQv7y0vK3w" }, "source": [ "## Wejście do sieci 
neuronowej.\n", "### Odpowiadające sobie wersy i kilka losowo dobranych nieodpowiadających sobie wersów." ] }, { "cell_type": "code", "execution_count": 116, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "G_zK4KIRvK3w", "outputId": "7966a2bc-b13e-4ac1-a0d4-ae6b7a51c181" }, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "29795\n", "tensor([22, 9, 22, 17, 12, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3,\n", " 16, 21, 2, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 1,\n", " 2, 3, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5, 12, 32,\n", " 4, 13, 17, 21, 4, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", " 19, 30, 4, 31, 12, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,\n", " 3, 4, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5, 8,\n", " 12, 2])\n", "29795\n", "1\n" ] } ], "source": [ "from random import sample\n", "X = []\n", "Y = []\n", "for indeks_wersu_drugiego in range(len(x)):\n", " indeksy = sample(range(len(y)), 5)\n", " if indeks_wersu_drugiego not in indeksy:\n", " indeksy[0] = indeks_wersu_drugiego\n", " for k in indeksy:\n", " X.append(\n", " torch.cat(\n", " (x[indeks_wersu_drugiego], y[k])\n", " )\n", " )\n", " if indeks_wersu_drugiego==k:\n", " Y.append(1)\n", " else:\n", " Y.append(0)\n", "\n", "print(len(X))\n", "print(X[0])\n", "print(len(Y))\n", "print(Y[0])" ] }, { "cell_type": "markdown", "metadata": { "id": "4i_4--zBvK3w" }, "source": [ "## Modele sklearn." 
] }, { "cell_type": "code", "execution_count": 117, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 74 }, "id": "WuDfk70LvK3w", "outputId": "35065e9c-8c6b-4837-8018-49cc504f0d90" }, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "MLPClassifier()" ], "text/html": [ "
MLPClassifier()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
MLPClassifier()
MLPRegressor()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
MLPRegressor()