"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m2.0/2.0 MB\u001b[0m \u001b[31m9.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0ma \u001b[36m0:00:01\u001b[0m\n",
"\u001b[?25hRequirement already satisfied: filelock in /home/pawel/.local/lib/python3.10/site-packages (from torch) (3.13.1)\n",
"Requirement already satisfied: fsspec in /home/pawel/.local/lib/python3.10/site-packages (from torch) (2024.2.0)\n",
"Requirement already satisfied: sympy in /home/pawel/.local/lib/python3.10/site-packages (from torch) (1.12)\n",
"Requirement already satisfied: nvidia-cuda-cupti-cu12==12.1.105 in /home/pawel/.local/lib/python3.10/site-packages (from torch) (12.1.105)\n",
"Requirement already satisfied: nvidia-cuda-nvrtc-cu12==12.1.105 in /home/pawel/.local/lib/python3.10/site-packages (from torch) (12.1.105)\n",
"Requirement already satisfied: nvidia-cuda-runtime-cu12==12.1.105 in /home/pawel/.local/lib/python3.10/site-packages (from torch) (12.1.105)\n",
"Requirement already satisfied: typing-extensions>=4.8.0 in /home/pawel/.local/lib/python3.10/site-packages (from torch) (4.10.0)\n",
"Requirement already satisfied: nvidia-cudnn-cu12==8.9.2.26 in /home/pawel/.local/lib/python3.10/site-packages (from torch) (8.9.2.26)\n",
"Requirement already satisfied: nvidia-curand-cu12==10.3.2.106 in /home/pawel/.local/lib/python3.10/site-packages (from torch) (10.3.2.106)\n",
"Requirement already satisfied: nvidia-cusparse-cu12==12.1.0.106 in /home/pawel/.local/lib/python3.10/site-packages (from torch) (12.1.0.106)\n",
"Requirement already satisfied: nvidia-nvtx-cu12==12.1.105 in /home/pawel/.local/lib/python3.10/site-packages (from torch) (12.1.105)\n",
"Requirement already satisfied: triton==2.3.0 in /home/pawel/.local/lib/python3.10/site-packages (from torch) (2.3.0)\n",
"Requirement already satisfied: jinja2 in /home/pawel/.local/lib/python3.10/site-packages (from torch) (3.1.3)\n",
"Requirement already satisfied: nvidia-nccl-cu12==2.20.5 in /home/pawel/.local/lib/python3.10/site-packages (from torch) (2.20.5)\n",
"Requirement already satisfied: nvidia-cublas-cu12==12.1.3.1 in /home/pawel/.local/lib/python3.10/site-packages (from torch) (12.1.3.1)\n",
"Requirement already satisfied: nvidia-cufft-cu12==11.0.2.54 in /home/pawel/.local/lib/python3.10/site-packages (from torch) (11.0.2.54)\n",
"Requirement already satisfied: nvidia-cusolver-cu12==11.4.5.107 in /home/pawel/.local/lib/python3.10/site-packages (from torch) (11.4.5.107)\n",
"Requirement already satisfied: networkx in /home/pawel/.local/lib/python3.10/site-packages (from torch) (3.3)\n",
"Requirement already satisfied: nvidia-nvjitlink-cu12 in /home/pawel/.local/lib/python3.10/site-packages (from nvidia-cusolver-cu12==11.4.5.107->torch) (12.4.127)\n",
"Requirement already satisfied: requests in /home/pawel/.local/lib/python3.10/site-packages (from torchtext) (2.31.0)\n",
"Requirement already satisfied: numpy in /home/pawel/.local/lib/python3.10/site-packages (from torchtext) (1.26.4)\n",
"Requirement already satisfied: tqdm in /home/pawel/.local/lib/python3.10/site-packages (from torchtext) (4.66.2)\n",
"Requirement already satisfied: MarkupSafe>=2.0 in /home/pawel/.local/lib/python3.10/site-packages (from jinja2->torch) (2.1.5)\n",
"Requirement already satisfied: certifi>=2017.4.17 in /home/pawel/.local/lib/python3.10/site-packages (from requests->torchtext) (2024.2.2)\n",
"Requirement already satisfied: idna<4,>=2.5 in /home/pawel/.local/lib/python3.10/site-packages (from requests->torchtext) (3.6)\n",
"Requirement already satisfied: charset-normalizer<4,>=2 in /home/pawel/.local/lib/python3.10/site-packages (from requests->torchtext) (3.3.2)\n",
"Requirement already satisfied: urllib3<3,>=1.21.1 in /home/pawel/.local/lib/python3.10/site-packages (from requests->torchtext) (2.2.1)\n",
"Requirement already satisfied: mpmath>=0.19 in /home/pawel/.local/lib/python3.10/site-packages (from sympy->torch) (1.3.0)\n",
"Installing collected packages: torchtext\n",
"Successfully installed torchtext-0.18.0\n"
]
}
],
"source": [
"!pip install torch torchtext"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [],
"source": [
"from collections import Counter\n",
"\n",
"import torch\n",
"from datasets import load_dataset\n",
"from torchtext.vocab import vocab\n",
"from tqdm.notebook import tqdm"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Wczytujemy zbiór danych `conll2003` (https://huggingface.co/datasets/conll2003), który zawiera teksty oznaczone znacznikami części mowy (*POS tags*): "
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"dataset = load_dataset(\"conll2003\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Poiżej funkcja, która tworzy słownik (https://pytorch.org/text/stable/vocab.html).\n",
"\n",
"Parametr `special` określa symbole specjalne:\n",
"itos = v.get_itos() # mapowanie indeksów na tokeny"
]
},
{
"cell_type": "code",
"execution_count": 26,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"23627"
]
},
"execution_count": 26,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"len(itos) # liczba różnych tokenów w słowniku"
]
},
{
"cell_type": "code",
"execution_count": 27,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"21"
]
},
"execution_count": 27,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"v['on'] # indeks tokenu `on`"
]
},
{
"cell_type": "code",
"execution_count": 31,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0"
]
},
"execution_count": 31,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"v[\"<unk>\"] # indeks nieznanego tokenu"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"W przypadku, gdy w analizowanym tekście znajdzie się token, którego nie ma w słowniku, będzie reprezentowany przez indeks domyślny (*default index*). Ustawiamy, żeby był taki sam, jak indeks „nieznanego tokenu”:"
]
},
{
"cell_type": "code",
"execution_count": 32,
"metadata": {},
"outputs": [],
"source": [
"v.set_default_index(v[\"<unk>\"])"
]
},
{
"cell_type": "code",
"execution_count": 58,
"metadata": {},
"outputs": [],
"source": [
"def data_process(dt):\n",
" # Wektoryzacja dokumentów tekstowych.\n",
" return [ torch.tensor([v['<bos>']] +[v[token] for token in document ] + [v['<eos>']], dtype = torch.long) for document in dt]"