diff --git a/1_TFIDF.ipynb b/1_TFIDF.ipynb new file mode 100644 index 0000000..af2cdba --- /dev/null +++ b/1_TFIDF.ipynb @@ -0,0 +1,1572 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Deep learning – text processing – lab\n", + "# 1. TF–IDF" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import numpy as np\n", + "import re" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Document collection" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "documents = ['Ala lubi zwierzęta i ma kota oraz psa!',\n", + " 'Ola lubi zwierzęta oraz ma kota a także chomika!',\n", + " 'I Jan jeździ na rowerze.',\n", + " '2 wojna światowa była wielkim konfliktem zbrojnym',\n", + " 'Tomek lubi psy, ma psa i jeździ na motorze i rowerze.',\n", + " ]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "What do we need?\n", + "\n", + "- We want to turn the texts into collections of words." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### ❔ Questions\n", + "\n", + "- Can we use `document.split(' ')` to tokenize a text?\n", + "- What difficulties might we run into?"
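A quick experiment (a sketch using the first document of the corpus) suggests why a bare `split(' ')` is not enough as a tokenizer: punctuation stays glued to the neighbouring word, and upper/lower case variants of the same word become distinct tokens.

```python
# Naive whitespace splitting keeps punctuation attached to tokens.
document = 'Ala lubi zwierzęta i ma kota oraz psa!'

tokens = document.split(' ')
print(tokens)  # the last token is 'psa!', not 'psa'
```

This is exactly what the preprocessing step addresses by lowercasing and stripping punctuation before splitting.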
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Preprocessing" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "def get_str_cleaned(str_dirty):\n", + " punctuation = '!\"#$%&\\'()*+,-./:;<=>?@[\\\\]^_`{|}~'\n", + " new_str = str_dirty.lower()\n", + " new_str = re.sub(' +', ' ', new_str)\n", + " for char in punctuation:\n", + " new_str = new_str.replace(char,'')\n", + " return new_str" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "sample_document = get_str_cleaned(documents[0])" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'ala lubi zwierzęta i ma kota oraz psa'" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "sample_document" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Tokenization" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "def tokenize_str(document):\n", + " return document.split(' ')" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['ala', 'lubi', 'zwierzęta', 'i', 'ma', 'kota', 'oraz', 'psa']" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "tokenize_str(sample_document)" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [], + "source": [ + "documents_cleaned = [get_str_cleaned(document) for document in documents]" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['ala lubi zwierzęta i ma kota oraz psa',\n", + " 'ola lubi zwierzęta oraz ma kota a także chomika',\n", + " 'i jan jeździ na rowerze',\n", + " '2 wojna światowa była wielkim konfliktem zbrojnym',\n", + " 'tomek lubi psy ma psa i jeździ na motorze i rowerze']" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "documents_cleaned" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [], + "source": [ + "documents_tokenized = [tokenize_str(d) for d in documents_cleaned]" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[['ala', 'lubi', 'zwierzęta', 'i', 'ma', 'kota', 'oraz', 'psa'],\n", + " ['ola', 'lubi', 'zwierzęta', 'oraz', 'ma', 'kota', 'a', 'także', 'chomika'],\n", + " ['i', 'jan', 'jeździ', 'na', 'rowerze'],\n", + " ['2', 'wojna', 'światowa', 'była', 'wielkim', 'konfliktem', 'zbrojnym'],\n", + " ['tomek',\n", + " 'lubi',\n", + " 'psy',\n", + " 'ma',\n", + " 'psa',\n", + " 'i',\n", + " 'jeździ',\n", + " 'na',\n", + " 'motorze',\n", + " 'i',\n", + " 'rowerze']]" + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "documents_tokenized" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### ❔ Questions\n", + "\n", + "- What is the next step towards building TF or TF–IDF vectors?\n", + "- What size will a TF or TF–IDF vector be?"
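One way to answer the size question empirically (a self-contained sketch that redoes the cleaning and tokenization from the cells above): a TF or TF–IDF vector needs one coordinate per distinct word, so its length equals the vocabulary size.

```python
import re

documents = ['Ala lubi zwierzęta i ma kota oraz psa!',
             'Ola lubi zwierzęta oraz ma kota a także chomika!',
             'I Jan jeździ na rowerze.',
             '2 wojna światowa była wielkim konfliktem zbrojnym',
             'Tomek lubi psy, ma psa i jeździ na motorze i rowerze.']

def clean(text):
    # lowercase, collapse runs of spaces, drop ASCII punctuation
    text = re.sub(' +', ' ', text.lower())
    return ''.join(ch for ch in text if ch not in '!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~')

tokenized = [clean(document).split(' ') for document in documents]
vocabulary = sorted({word for document in tokenized for word in document})
print(len(vocabulary))  # 26, so every TF/TF-IDF vector will have 26 entries
```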
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Building the vocabulary" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [], + "source": [ + "vocabulary = []\n", + "for document in documents_tokenized:\n", + " for word in document:\n", + " vocabulary.append(word)\n", + "vocabulary = sorted(set(vocabulary))" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['2',\n", + " 'a',\n", + " 'ala',\n", + " 'była',\n", + " 'chomika',\n", + " 'i',\n", + " 'jan',\n", + " 'jeździ',\n", + " 'konfliktem',\n", + " 'kota',\n", + " 'lubi',\n", + " 'ma',\n", + " 'motorze',\n", + " 'na',\n", + " 'ola',\n", + " 'oraz',\n", + " 'psa',\n", + " 'psy',\n", + " 'rowerze',\n", + " 'także',\n", + " 'tomek',\n", + " 'wielkim',\n", + " 'wojna',\n", + " 'zbrojnym',\n", + " 'zwierzęta',\n", + " 'światowa']" + ] + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "vocabulary" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 📝 Exercise 1 (1 point)\n", + "\n", + "Write a function `word_to_index(word: str)` that, for a given word, returns its one-hot vector as a `numpy.array`.\n", + "\n", + "Assume that the vocabulary is given by the global variable `vocabulary`."
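One possible sketch of the exercise (not the only valid solution, and intentionally minimal; it assumes the global `vocabulary` list, repeated here verbatim so the snippet is self-contained):

```python
import numpy as np

# the 26-word vocabulary built above (assumed global, as the exercise states)
vocabulary = ['2', 'a', 'ala', 'była', 'chomika', 'i', 'jan', 'jeździ',
              'konfliktem', 'kota', 'lubi', 'ma', 'motorze', 'na', 'ola',
              'oraz', 'psa', 'psy', 'rowerze', 'także', 'tomek', 'wielkim',
              'wojna', 'zbrojnym', 'zwierzęta', 'światowa']

def word_to_index(word: str) -> np.ndarray:
    one_hot = np.zeros(len(vocabulary))
    one_hot[vocabulary.index(word)] = 1.0  # raises ValueError for out-of-vocabulary words
    return one_hot

print(word_to_index('psa'))  # a single 1.0 at position 16
```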
+ ] + }, + { + "cell_type": "code", + "execution_count": 66, + "metadata": {}, + "outputs": [], + "source": [ + "def word_to_index(word: str) -> np.array:\n", + " # TODO\n", + " pass" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1.,\n", + " 0., 0., 0., 0., 0., 0., 0., 0., 0.])" + ] + }, + "execution_count": 23, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "word_to_index('psa')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 📝 Exercise 2 (1 point)\n", + "\n", + "Write a function that turns a list of words into a TF vector." + ] + }, + { + "cell_type": "code", + "execution_count": 68, + "metadata": {}, + "outputs": [], + "source": [ + "def tf(document: list) -> np.array:\n", + " # TODO\n", + " pass" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array([0., 0., 1., 0., 0., 1., 0., 0., 0., 1., 1., 1., 0., 0., 0., 1., 1.,\n", + " 0., 0., 0., 0., 0., 0., 0., 1., 0.])" + ] + }, + "execution_count": 26, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "tf(documents_tokenized[0])" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "metadata": {}, + "outputs": [], + "source": [ + "documents_vectorized = list()\n", + "for document in documents_tokenized:\n", + " document_vector = tf(document)\n", + " documents_vectorized.append(document_vector)" + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[array([0., 0., 1., 0., 0., 1., 0., 0., 0., 1., 1., 1., 0., 0., 0., 1., 1.,\n", + " 0., 0., 0., 0., 0., 0., 0., 1., 0.]),\n", + " array([0., 1., 0., 0., 1., 0., 0., 0., 0., 1., 1., 1., 0., 0., 1., 1., 0.,\n", + " 0., 0., 1., 0., 0., 0., 0., 1., 0.]),\n", + " array([0., 0., 0., 0., 0., 1., 1., 1., 0., 0., 0., 0., 0., 1., 0., 0., 0.,\n", + " 0., 1., 0., 0., 0., 0., 0., 0., 0.]),\n", + " array([1., 0., 0., 1., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0.,\n", + " 0., 0., 0., 0., 1., 1., 1., 0., 1.]),\n", + " array([0., 0., 0., 0., 0., 2., 0., 1., 0., 0., 1., 1., 1., 1., 0., 0., 1.,\n", + " 1., 1., 0., 1., 0., 0., 0., 0., 0.])]" + ] + }, + "execution_count": 28, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "documents_vectorized" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## IDF" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array([5. , 5. , 5. , 5. , 5. ,\n", + " 1.66666667, 5. , 2.5 , 5. , 2.5 ,\n", + " 1.66666667, 1.66666667, 5. , 2.5 , 5. ,\n", + " 2.5 , 2.5 , 5. , 2.5 , 5. ,\n", + " 5. , 5. , 5. , 5. , 2.5 ,\n", + " 5. ])" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "idf = np.zeros(len(vocabulary))\n", + "idf = len(documents_vectorized) / np.sum(np.array(documents_vectorized) != 0,axis=0)\n", + "display(idf)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 📝 Exercise 3 (1 point)\n", + "\n", + "Write a function that returns the cosine similarity between two documents given in vectorized form."
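A possible sketch of such a similarity function (one of several equivalent formulations; the zero-vector guard is an extra safety net, since the raw formula would divide by zero for a query containing no known words):

```python
import numpy as np

def similarity(query: np.ndarray, document: np.ndarray) -> float:
    # cosine similarity: dot product divided by the product of the L2 norms
    denominator = np.linalg.norm(query) * np.linalg.norm(document)
    if denominator == 0.0:
        return 0.0  # convention: an all-zero vector is similar to nothing
    return float(np.dot(query, document) / denominator)

# TF vectors of the first two documents of the corpus
doc0 = np.array([0., 0., 1., 0., 0., 1., 0., 0., 0., 1., 1., 1., 0., 0., 0., 1., 1.,
                 0., 0., 0., 0., 0., 0., 0., 1., 0.])
doc1 = np.array([0., 1., 0., 0., 1., 0., 0., 0., 0., 1., 1., 1., 0., 0., 1., 1., 0.,
                 0., 0., 1., 0., 0., 0., 0., 1., 0.])
print(similarity(doc0, doc1))  # ≈ 0.5893 (5 shared words / sqrt(8 * 9))
```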
+ ] + }, + { + "cell_type": "code", + "execution_count": 67, + "metadata": {}, + "outputs": [], + "source": [ + "def similarity(query: np.array, document: np.array) -> float:\n", + " # TODO\n", + " pass" + ] + }, + { + "cell_type": "code", + "execution_count": 32, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'Ala lubi zwierzęta i ma kota oraz psa!'" + ] + }, + "execution_count": 32, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "documents[0]" + ] + }, + { + "cell_type": "code", + "execution_count": 33, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array([0., 0., 1., 0., 0., 1., 0., 0., 0., 1., 1., 1., 0., 0., 0., 1., 1.,\n", + " 0., 0., 0., 0., 0., 0., 0., 1., 0.])" + ] + }, + "execution_count": 33, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "documents_vectorized[0]" + ] + }, + { + "cell_type": "code", + "execution_count": 34, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'Ola lubi zwierzęta oraz ma kota a także chomika!'" + ] + }, + "execution_count": 34, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "documents[1]" + ] + }, + { + "cell_type": "code", + "execution_count": 36, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array([0., 1., 0., 0., 1., 0., 0., 0., 0., 1., 1., 1., 0., 0., 1., 1., 0.,\n", + " 0., 0., 1., 0., 0., 0., 0., 1., 0.])" + ] + }, + "execution_count": 36, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "documents_vectorized[1]" + ] + }, + { + "cell_type": "code", + "execution_count": 38, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0.5892556509887895" + ] + }, + "execution_count": 38, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "similarity(documents_vectorized[0], documents_vectorized[1])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## A simple search engine" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def transform_query(query):\n", + " \"\"\"Clean and tokenize the query, then turn it into a TF vector.\"\"\"\n", + " query_vector = tf(tokenize_str(get_str_cleaned(query)))\n", + " return query_vector" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0.4999999999999999" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "similarity(transform_query('psa kota'), documents_vectorized[0])" + ] + }, + { + "cell_type": "code", + "execution_count": 41, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'Ala lubi zwierzęta i ma kota oraz psa!'" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "0.4999999999999999" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "'Ola lubi zwierzęta oraz ma kota a także chomika!'" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "0.2357022603955158" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "'I Jan jeździ na rowerze.'" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "0.0" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "'2 wojna światowa była wielkim konfliktem zbrojnym'" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "0.0" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "'Tomek lubi psy, ma psa i jeździ na motorze i rowerze.'" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "0.19611613513818402" + ] + }, + "metadata": {}, +
"output_type": "display_data" + } + ], + "source": [ + "query = 'psa kota'\n", + "for i in range(len(documents)):\n", + " display(documents[i])\n", + " display(similarity(transform_query(query), documents_vectorized[i]))" + ] + }, + { + "cell_type": "code", + "execution_count": 42, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'Ala lubi zwierzęta i ma kota oraz psa!'" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "0.0" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "'Ola lubi zwierzęta oraz ma kota a także chomika!'" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "0.0" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "'I Jan jeździ na rowerze.'" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "0.4472135954999579" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "'2 wojna światowa była wielkim konfliktem zbrojnym'" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "0.0" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "'Tomek lubi psy, ma psa i jeździ na motorze i rowerze.'" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "0.2773500981126146" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# this is why cosine similarity needs a denominator\n", + "query = 'rowerze'\n", + "for i in range(len(documents)):\n", + " display(documents[i])\n", + " display(similarity(transform_query(query), documents_vectorized[i]))" + ] + }, + { + "cell_type": "code", + "execution_count": 43, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'Ala lubi zwierzęta i ma kota oraz psa!'" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "0.35355339059327373" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "'Ola lubi zwierzęta oraz ma kota a także chomika!'" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "0.0" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "'I Jan jeździ na rowerze.'" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "0.4472135954999579" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "'2 wojna światowa była wielkim konfliktem zbrojnym'" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "0.0" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "'Tomek lubi psy, ma psa i jeździ na motorze i rowerze.'" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "0.5547001962252291" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# this is why we need term frequency → more occurrences mean a better-matching document\n", + "query = 'i'\n", + "for i in range(len(documents)):\n", + " display(documents[i])\n", + " display(similarity(transform_query(query), documents_vectorized[i]))" + ] + }, + { + "cell_type": "code", + "execution_count": 44, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'Ala lubi zwierzęta i ma kota oraz psa!'" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "0.24999999999999994" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "'Ola lubi zwierzęta oraz ma kota a także chomika!'" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "0.2357022603955158" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "'I Jan jeździ na rowerze.'" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "0.31622776601683794" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "'2 wojna światowa była wielkim konfliktem zbrojnym'" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "0.0" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "'Tomek lubi psy, ma psa i jeździ na motorze i rowerze.'" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "0.39223227027636803" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# this is why we need IDF - so that more informative words get a higher weight\n", + "query = 'i chomika'\n", + "for i in range(len(documents)):\n", + " display(documents[i])\n", + " display(similarity(transform_query(query), documents_vectorized[i]))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Libraries" + ] + }, + { + "cell_type": "code", + "execution_count": 48, + "metadata": {}, + "outputs": [], + "source": [ + "import numpy as np\n", + "import sklearn.metrics\n", + "\n", + "from sklearn.datasets import fetch_20newsgroups\n", + "\n", + "from sklearn.feature_extraction.text import TfidfVectorizer" + ] + }, + { + "cell_type": "code", + "execution_count": 49, + "metadata": {}, + "outputs": [], + "source": [ + "newsgroups = fetch_20newsgroups()['data']" + ] + }, + { + "cell_type": "code", + "execution_count": 50, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "11314" + ] + }, + "execution_count": 50, + "metadata": {}, + "output_type": "execute_result" + } + ], +
"source": [ + "len(newsgroups)" + ] + }, + { + "cell_type": "code", + "execution_count": 51, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "From: lerxst@wam.umd.edu (where's my thing)\n", + "Subject: WHAT car is this!?\n", + "Nntp-Posting-Host: rac3.wam.umd.edu\n", + "Organization: University of Maryland, College Park\n", + "Lines: 15\n", + "\n", + " I was wondering if anyone out there could enlighten me on this car I saw\n", + "the other day. It was a 2-door sports car, looked to be from the late 60s/\n", + "early 70s. It was called a Bricklin. The doors were really small. In addition,\n", + "the front bumper was separate from the rest of the body. This is \n", + "all I know. If anyone can tellme a model name, engine specs, years\n", + "of production, where this car is made, history, or whatever info you\n", + "have on this funky looking car, please e-mail.\n", + "\n", + "Thanks,\n", + "- IL\n", + " ---- brought to you by your neighborhood Lerxst ----\n", + "\n", + "\n", + "\n", + "\n", + "\n" + ] + } + ], + "source": [ + "print(newsgroups[0])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Naive search" + ] + }, + { + "cell_type": "code", + "execution_count": 52, + "metadata": {}, + "outputs": [], + "source": [ + "all_documents = list() \n", + "for document in newsgroups:\n", + " if 'car' in document:\n", + " all_documents.append(document)" + ] + }, + { + "cell_type": "code", + "execution_count": 53, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "From: lerxst@wam.umd.edu (where's my thing)\n", + "Subject: WHAT car is this!?\n", + "Nntp-Posting-Host: rac3.wam.umd.edu\n", + "Organization: University of Maryland, College Park\n", + "Lines: 15\n", + "\n", + " I was wondering if anyone out there could enlighten me on this car I saw\n", + "the other day. It was a 2-door sports car, looked to be from the late 60s/\n", + "early 70s. It was called a Bricklin. The doors were really small. In addition,\n", + "the front bumper was separate from the rest of the body. This is \n", + "all I know. If anyone can tellme a model name, engine specs, years\n", + "of production, where this car is made, history, or whatever info you\n", + "have on this funky looking car, please e-mail.\n", + "\n", + "Thanks,\n", + "- IL\n", + " ---- brought to you by your neighborhood Lerxst ----\n", + "\n", + "\n", + "\n", + "\n", + "\n" + ] + } + ], + "source": [ + "print(all_documents[0])" + ] + }, + { + "cell_type": "code", + "execution_count": 54, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "From: guykuo@carson.u.washington.edu (Guy Kuo)\n", + "Subject: SI Clock Poll - Final Call\n", + "Summary: Final call for SI clock reports\n", + "Keywords: SI,acceleration,clock,upgrade\n", + "Article-I.D.: shelley.1qvfo9INNc3s\n", + "Organization: University of Washington\n", + "Lines: 11\n", + "NNTP-Posting-Host: carson.u.washington.edu\n", + "\n", + "A fair number of brave souls who upgraded their SI clock oscillator have\n", + "shared their experiences for this poll. Please send a brief message detailing\n", + "your experiences with the procedure. Top speed attained, CPU rated speed,\n", + "add on cards and adapters, heat sinks, hour of usage per day, floppy disk\n", + "functionality with 800 and 1.4 m floppies are especially requested.\n", + "\n", + "I will be summarizing in the next two days, so please add to the network\n", + "knowledge base if you have done the clock upgrade and haven't answered this\n", + "poll. Thanks.\n", + "\n", + "Guy Kuo \n", + "\n" + ] + } + ], + "source": [ + "print(all_documents[1])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### ❔ Question\n", + "\n", + "What are the problems with this approach?"
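The core problem can be shown in a few lines (a sketch): `'car' in document` is a raw substring test with no notion of word boundaries or relevance, so the SI-clock post above is retrieved only because of the hostname `carson.u.washington.edu` and the word "cards", not because it has anything to do with cars.

```python
# A substring test fires on any embedded occurrence and misses synonyms.
print('car' in 'add on cards and adapters')        # True, via 'cards'
print('car' in 'carson.u.washington.edu')          # True, via the hostname
print('car' in 'I drive an automobile every day')  # False, despite the topic
```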
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### TF–IDF and cosine distance" + ] + }, + { + "cell_type": "code", + "execution_count": 55, + "metadata": {}, + "outputs": [], + "source": [ + "vectorizer = TfidfVectorizer()" + ] + }, + { + "cell_type": "code", + "execution_count": 56, + "metadata": {}, + "outputs": [], + "source": [ + "document_vectors = vectorizer.fit_transform(newsgroups)" + ] + }, + { + "cell_type": "code", + "execution_count": 57, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "<11314x130107 sparse matrix of type '<class 'numpy.float64'>'\n", + "\twith 1787565 stored elements in Compressed Sparse Row format>" + ] + }, + "execution_count": 57, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "document_vectors" + ] + }, + { + "cell_type": "code", + "execution_count": 58, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "<1x130107 sparse matrix of type '<class 'numpy.float64'>'\n", + "\twith 89 stored elements in Compressed Sparse Row format>" + ] + }, + "execution_count": 58, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "document_vectors[0]" + ] + }, + { + "cell_type": "code", + "execution_count": 59, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "matrix([[0., 0., 0., ..., 0., 0., 0.]])" + ] + }, + "execution_count": 59, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "document_vectors[0].todense()" + ] + }, + { + "cell_type": "code", + "execution_count": 60, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "matrix([[0., 0., 0., ..., 0., 0., 0.],\n", + " [0., 0., 0., ..., 0., 0., 0.],\n", + " [0., 0., 0., ..., 0., 0., 0.],\n", + " [0., 0., 0., ..., 0., 0., 0.]])" + ] + }, + "execution_count": 60, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "document_vectors[0:4].todense()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, +
"outputs": [], + "source": [ + "query_str = 'speed'\n", + "#query_str = 'speed car'\n", + "#query_str = 'spider man'" + ] + }, + { + "cell_type": "code", + "execution_count": 65, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[0.17360013 0.22933014 0.28954818 0.45372239]\n", + "[ 2455 8920 5497 11031]\n", + "From: keiths@spider.co.uk (Keith Smith)\n", + "Subject: win/NT file systems\n", + "Organization: Spider Systems Limited, Edinburgh, UK.\n", + "Lines: 6\n", + "Nntp-Posting-Host: trapdoor.spider.co.uk\n", + "\n", + "OK will some one out there tell me why / how DOS 5\n", + "can read (I havn't tried writing in case it breaks something)\n", + "the Win/NT NTFS file system.\n", + "I thought NTFS was supposed to be better than the FAT system\n", + "\n", + "keith\n", + "\n", + "0.4537223924558256\n", + "----------------------------------------------------------------------------------------------------\n", + "----------------------------------------------------------------------------------------------------\n", + "----------------------------------------------------------------------------------------------------\n", + "From: brandt@cs.unc.edu (Andrew Brandt)\n", + "Subject: Seeking good Alfa Romeo mechanic.\n", + "Organization: The University of North Carolina at Chapel Hill\n", + "Lines: 14\n", + "NNTP-Posting-Host: axon.cs.unc.edu\n", + "Keywords: alfa, romeo, spider, mechanic\n", + "\n", + "I am looking for recommendations for a good (great?) Alfa Romeo\n", + "mechanic in South Jersey or Philadelphia or nearby.\n", + "\n", + "I have a '78 Alfa Spider that needs some engine, tranny, steering work\n", + "done. The body is in quite good shape. The car is awful in cold\n", + "weather, won't start if below freezing (I know, I know, why drive a\n", + "Spider if there's snow on the ground ...). 
It has Bosch *mechanical*\n", + "fuel injection that I am sure needs adjustment.\n", + "\n", + "Any opinions are welcome on what to look for or who to call.\n", + "\n", + "Email or post (to rec.autos), I will summarize if people want.\n", + "\n", + "Thx, Andy (brandt@cs.unc.edu)\n", + "\n", + "0.28954817869991817\n", + "----------------------------------------------------------------------------------------------------\n", + "----------------------------------------------------------------------------------------------------\n", + "----------------------------------------------------------------------------------------------------\n", + "From: michaelr@spider.co.uk (Michael S. A. Robb)\n", + "Subject: Re: Honors Degrees: Do they mean anything?\n", + "Organization: Spider Systems Limited, Edinburgh, UK.\n", + "Lines: 44\n", + "\n", + "In article tkld@cogsci.ed.ac.uk (Kevin Davidson) writes:\n", + ">\n", + ">> In my opinion, a programming degree is still worth having.\n", + ">\n", + "> Yes, but a CS degree is *not* a programming degree. Does anybody know of\n", + ">a computing course where *programming* is taught ? Computer Science is\n", + ">a branch of maths (or the course I did was).\n", + "> I've also done a Software Engineering course - much more practical and likely\n", + ">to be the sort of thing an employer really wants, rather than what they think\n", + ">they want, but also did not teach programming. The ability to program was\n", + ">an entry requirement.\n", + "\n", + "At Robert Gordon University, programming was the main (most time-consuming) \n", + "start of the course. The first two years consisted of five subjects:\n", + "Software Engineering (Pascal/C/UNIX), Computer Engineering (6502/6809/68000 \n", + "assembler), Computer Theory (LISP/Prolog), Mathematics/Statistics and \n", + "Communication Skills (How to pass interviews/intelligence tests and group\n", + "discussions e.g. 
How to survive a helicopter crash in the North Sea).\n", + "The third year (Industrial placement) was spent working for a computer company \n", + "for a year. The company could be anywhere in Europe (there was a special \n", + "Travel Allowance Scheme to cover the visiting costs of professors). \n", + "The fourth year included Operating Systems(C/Modula-2), Software Engineering \n", + "(C/8086 assembler), Real Time Laboratory (C/68000 assembler) and Computing \n", + "Theory (LISP). There were also Group Projects in 2nd and 4th Years, where \n", + "students worked in teams to select their own project or decide to work for an \n", + "outside company (the only disadvantage being that specifications would change \n", + "suddenly).\n", + " \n", + "In the first four years, there was a 50%:50% weighting between courseworks and \n", + "exams for most subjects. However in the Honours year, this was reduced to a \n", + "30%:70% split between an Individual Project and final exams (no coursework \n", + "assessment) - are all Computer Science courses like this?\n", + "\n", + "BTW - we started off with 22 students in our first year and were left with 8 by\n", + "Honours year. Also, every course is tutored separately. Not easy trying\n", + "to sleep when you are in 8 student class :-). \n", + "\n", + "Cheers,\n", + " Michael \n", + "-- \n", + "| Michael S. A. 
Robb | Tel: +44 31 554 9424 | \"..The problem with bolt-on\n", + "| Software Engineer | Fax: +44 31 554 0649 | software is making sure the\n", + "| Spider Systems Limited | E-mail: | bolts are the right size..\"\n", + "| Edinburgh, EH6 5NG | michaelr@spider.co.uk | - Anonymous\n", + "\n", + "0.22933013891071233\n", + "----------------------------------------------------------------------------------------------------\n", + "----------------------------------------------------------------------------------------------------\n", + "----------------------------------------------------------------------------------------------------\n", + "From: jrm@elm.circa.ufl.edu (Jeff Mason)\n", + "Subject: AUCTION: Marvel, DC, Valiant, Image, Dark Horse, etc...\n", + "Organization: Univ. of Florida Psychology Dept.\n", + "Lines: 59\n", + "NNTP-Posting-Host: elm.circa.ufl.edu\n", + "\n", + "I am auctioning off the following comics. These minimum bids are set\n", + "below what I would normally sell them for. Make an offer, and I will\n", + "accept the highest bid after the auction has been completed.\n", + "\n", + "TITLE Minimum/Current \n", + "--------------------------------------------------------------\n", + "Alpha Flight 51 (Jim Lee's first work at Marvel)\t$ 5.00\n", + "Aliens 1 (1st app Aliens in comics, 1st prnt, May 1988)\t$20.00\n", + "Amazing Spider-Man 136 (Intro new Green Goblin) $20.00\n", + "Amazing Spider-Man 238 (1st appearance Hobgoblin)\t$50.00\n", + "Archer and Armstrong 1 (Frank Miller/Smith/Layton)\t$ 7.50\n", + "Avengers 263 (1st appearance X-factor) $ 3.50\n", + "Bloodshot 1 (Chromium cover, BWSmith Cover/Poster)\t$ 5.00\n", + "Daredevil 158 (Frank Miller art begins) $35.00\n", + "Dark Horse Presents 1 (1st app Concrete, 1st printing)\t$ 7.50 \n", + "H.A.R.D. 
Corps 1 \t\t\t\t\t$ 5.00\n", + "Incredible Hulk 324 (1st app Grey Hulk since #1, 1962)\t$ 7.50\n", + "Incredible Hulk 330 (1st McFarlane issue)\t\t$15.00\n", + "Incredible Hulk 331 (Grey Hulk series begins)\t\t$11.20\t\n", + "Incredible Hulk 367 (1st Dale Keown art in Hulk) $15.00\n", + "Incredible Hulk 377 (1st all new hulk, 1st prnt, Keown) $15.00\n", + "Marvel Comics Presents 1 (Wolverine, Silver Surfer) $ 7.50\n", + "Maxx Limited Ashcan (4000 copies exist, blue cover)\t$30.00\n", + "New Mutants 86 (McFarlane cover, 1st app Cable - cameo)\t$10.00\n", + "New Mutants 100 (1st app X-Force) $ 5.00\n", + "New Mutants Annual 5 (1st Liefeld art on New Mutants)\t$10.00\n", + "Omega Men 3 (1st appearance Lobo) $ 7.50\n", + "Omega Men 10 (1st full Lobo story) $ 7.50\n", + "Power Man & Iron Fist 78 (3rd appearance Sabretooth) $25.00\n", + " 84 (4th appearance Sabretooth) $20.00\n", + "Simpsons Comics and Stories 1 (Polybagged special ed.)\t$ 7.50\n", + "Spectacular Spider-Man 147 (1st app New Hobgoblin) $12.50\n", + "Star Trek the Next Generation 1 (Feb 1988, DC mini) $ 7.50\n", + "Star Trek the Next Generation 1 (Oct 1989, DC comics) $ 7.50\n", + "Web of Spider-Man 29 (Hobgoblin, Wolverine appear) $10.00 \n", + "Web of Spider-Man 30 (Origin Rose, Hobgoblin appears) $ 7.50\n", + "Wolverine 10 (Before claws, 1st battle with Sabretooth)\t$15.00\n", + "Wolverine 41 (Sabretooth claims to be Wolverine's dad)\t$ 5.00\n", + "Wolverine 42 (Sabretooth proven not to be his dad)\t$ 3.50\n", + "Wolverine 43 (Sabretooth/Wolverine saga concludes)\t$ 3.00\n", + "Wolverine 1 (1982 mini-series, Miller art)\t\t$20.00\n", + "Wonder Woman 267 (Return of Animal Man) $12.50\n", + "X-Force 1 (Signed by Liefeld, Bagged, X-Force card) $20.00\n", + "X-Force 1 (Signed by Liefeld, Bagged, Shatterstar card) $10.00\n", + "X-Force 1 (Signed by Liefeld, Bagged, Deadpool card) $10.00\n", + "X-Force 1 (Signed by Liefeld, Bagged, Sunspot/Gideon) $10.00\n", + "\n", + "All comics are in near mint to mint 
condition, are bagged in shiny \n",
+ "polypropylene bags, and backed with white acid free boards. Shipping is\n",
+ "$1.50 for one book, $3.00 for more than one book, or free if you order \n",
+ "a large enough amount of stuff. I am willing to haggle.\n",
+ "\n",
+ "I have thousands and thousands of other comics, so please let me know what \n",
+ "you've been looking for, and maybe I can help. Some titles I have posted\n",
+ "here don't list every issue I have of that title, I tried to save space.\n",
+ "-- \n",
+ "Geoffrey R. Mason\t\t|\tjrm@elm.circa.ufl.edu\n",
+ "Department of Psychology\t|\tmason@webb.psych.ufl.edu\n",
+ "University of Florida\t\t|\tprothan@maple.circa.ufl.edu\n",
+ "\n",
+ "0.17360012846950526\n",
+ "----------------------------------------------------------------------------------------------------\n",
+ "----------------------------------------------------------------------------------------------------\n",
+ "----------------------------------------------------------------------------------------------------\n"
+ ]
+ }
+ ],
+ "source": [
+ "# score the query against every document and print the four best matches\n",
+ "query_vector = vectorizer.transform([query_str])\n",
+ "similarities = sklearn.metrics.pairwise.cosine_similarity(query_vector, document_vectors)\n",
+ "print(np.sort(similarities)[0][-4:])\n",
+ "print(similarities.argsort()[0][-4:])\n",
+ "\n",
+ "for i in range(1, 5):\n",
+ "    print(newsgroups[similarities.argsort()[0, -i]])\n",
+ "    print(np.sort(similarities)[0, -i])\n",
+ "    print('-'*100)\n",
+ "    print('-'*100)\n",
+ "    print('-'*100)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## 📝 Task 4 (4 points)\n",
+ "\n",
+ "Pick a text corpus containing at least 10,000 documents (a different one than in this example).\n",
+ "Build a search engine on top of it that uses TF–IDF and cosine similarity to score document relevance. The search engine should return the top few best-matching documents, sorted, together with their scores."
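+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "A minimal sketch of such a search function, mirroring the query cell above. It assumes `vectorizer` and `document_vectors` come from an earlier `TfidfVectorizer` fit over your chosen corpus and that `documents` holds the raw texts — the names are placeholders for illustration, not a complete solution:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import numpy as np\n",
+ "from sklearn.metrics.pairwise import cosine_similarity\n",
+ "\n",
+ "def search(query, vectorizer, document_vectors, documents, top_k=5):\n",
+ "    # embed the query in the same TF-IDF space as the documents\n",
+ "    query_vector = vectorizer.transform([query])\n",
+ "    scores = cosine_similarity(query_vector, document_vectors)[0]\n",
+ "    # indices of the top_k highest-scoring documents, best match first\n",
+ "    best = np.argsort(scores)[::-1][:top_k]\n",
+ "    return [(documents[i], float(scores[i])) for i in best]"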
+ ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.12" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +}