diff --git a/08_vector_representations.ipynb b/08_vector_representations.ipynb
index 71c770a..31b6b9c 100644
--- a/08_vector_representations.ipynb
+++ b/08_vector_representations.ipynb
@@ -19,6 +19,21 @@
     "https://github.com/unslothai/unsloth - a library for efficient fine-tuning of LLMs (ready-made notebooks with code are available on the Colab platform)"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "#### What is a vector?\n",
+    "\n",
+    "A vector is a one-dimensional array of numbers\n",
+    "\n",
+    "[0, 1, 0, 0, 0] - one-hot encoding - only 0/1 values\n",
+    "\n",
+    "[0, 2, 0, 5, 1, 100] - frequency encoding - integers >= 0\n",
+    "\n",
+    "[-1.5, 0.0002, 5000.01] - a dense vector (arbitrary real values)"
+   ]
+  },
   {
    "cell_type": "markdown",
    "metadata": {},
@@ -62,9 +77,18 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 62,
+   "execution_count": 67,
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "c:\\Users\\ryssta\\AppData\\Local\\anaconda3\\Lib\\site-packages\\transformers\\tokenization_utils_base.py:1601: FutureWarning: `clean_up_tokenization_spaces` was not set. It will be set to `True` by default. This behavior will be depracted in transformers v4.45, and will be then set to `False` by default. For more details check this issue: https://github.com/huggingface/transformers/issues/31884\n",
+      "  warnings.warn(\n"
+     ]
+    }
+   ],
    "source": [
     "from transformers import GPT2Tokenizer, GPT2Model\n",
     "import torch\n",
@@ -78,7 +102,23 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 63,
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "[\n",
+    "  [0.1, 0.2, 0.3],   # Ala\n",
+    "  [-0.5, 0.5, 0.9],  # ma\n",
+    "  ...\n",
+    "  # 50254\n",
+    "  ...\n",
+    "  [0.1, -0.1, -0.2]  # in GPT-2 a single vector has 768 values, not 3\n",
+    "]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 78,
    "metadata": {},
    "outputs": [
     {
@@ -86,20 +126,31 @@
      "output_type": "stream",
      "text": [
       "The text 'cat' is converted to token 9246\n",
+      "\n",
+      "Tokenization\n",
       "{'input_ids': [33215], 'attention_mask': [1]}\n",
-      "cat\n"
+      "\n",
+      "Detokenization\n",
+      "computer\n",
+      "\n",
+      "Number of tokens in the vocabulary\n",
+      "50257\n"
      ]
     }
    ],
    "source": [
     "print(\"The text 'cat' is converted to token 9246\")\n",
+    "print(\"\\nTokenization\")\n",
     "print(tokenizer(\"computer\"))\n",
-    "print(tokenizer.decode([9246]))"
+    "print(\"\\nDetokenization\")\n",
+    "print(tokenizer.decode([33215]))\n",
+    "print(\"\\nNumber of tokens in the vocabulary\")\n",
+    "print(len(tokenizer))"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 66,
+   "execution_count": 73,
    "metadata": {},
    "outputs": [
     {
@@ -107,7 +158,11 @@
      "output_type": "stream",
      "text": [
       "Embedding of token 9246\n",
+      "\n",
+      "Size of the embedding (vector)\n",
       "torch.Size([1, 768])\n",
+      "\n",
+      "Embedding values\n",
       "tensor([[-0.0164, -0.0934,  0.2425,  0.1398,  0.0388, -0.2592, -0.2724, -0.1625,\n",
       "          0.1683,  0.0829,  0.0136, -0.2788,  0.1493,  0.1408,  0.0557, -0.3691,\n",
       "          0.2200, -0.0428,  0.2206,  0.0865,  0.1237, -0.1499,  0.1446, -0.1150,\n",
@@ -211,13 +266,15 @@
    "source": [
     "print(\"Embedding of token 9246\")\n",
     "cat_embedding = embedding_layer(torch.LongTensor([9246]))\n",
+    "print(\"\\nSize of the embedding (vector)\")\n",
     "print(cat_embedding.shape)\n",
+    "print(\"\\nEmbedding values\")\n",
     "print(cat_embedding)"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 65,
+   "execution_count": null,
    "metadata": {},
    "outputs": [
     {