diff --git a/sw_lab1.ipynb b/sw_lab1.ipynb
new file mode 100644
index 0000000..3841ee1
--- /dev/null
+++ b/sw_lab1.ipynb
@@ -0,0 +1,456 @@
+{
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": 75,
+ "id": "0b35a076",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from load_data import get_dataset\n",
+ "import numpy as np\n",
+ "from collections import Counter\n",
+ "from tabulate import tabulate\n",
+ "from statistics import mean"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "elementary-purchase",
+ "metadata": {},
+ "source": [
+ "# Zadanie 1 (4 pkt)\n",
+ "\n",
+ "Napisz kod klasy `KNearestNeighbor` implementującej klasyfikator kNN. Należy zaimplementować następujące metody:\n",
+ " - konstruktor pobierający listę obrazów treningowych (zgodną ze składową 'values' wczytanego słownika) oraz listę ich etykiet,\n",
+ " - metoda `l_p_metric(image1, image2, p)` zwracająca wartość odległości pomiędzy dwoma obrazami, mierzoną normą typu l_p - parametr `p` określa 'potęgę' normy,\n",
+ " - metoda `predict(test_images, k, p)` zwracająca listę prognozowanych etykiet dla obrazów testowych (parametr `test_images`). Parametr drugi określa liczbę przeszukiwanych sąsiadów, natomiast parametr trzeci określa potęgę wybranej metryki,\n",
+ " - metoda `accuracy(test_images, k, p)` zwracająca dokładność klasyfikatora na zbiorze testowym. Parametry drugi i trzeci są takie jak w metodzie `predict()`.\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 76,
+ "id": "great-earthquake",
+ "metadata": {
+ "nbgrader": {
+ "grade": true,
+ "grade_id": "cell-50c8d2866e4d875e",
+ "locked": false,
+ "points": 4,
+ "schema_version": 3,
+ "solution": true,
+ "task": false
+ }
+ },
+ "outputs": [],
+ "source": [
+ "class KNearestNeighbor():\n",
+ "    \"\"\"k-nearest-neighbours classifier comparing images with an l_p metric.\n",
+ "\n",
+ "    Attributes:\n",
+ "        values: training images, one per entry; every image is flattened\n",
+ "            before distances are computed.\n",
+ "        labels: label of each training image, in the same order as values.\n",
+ "    \"\"\"\n",
+ "\n",
+ "    def __init__(self, values, labels):\n",
+ "        self.values = values\n",
+ "        self.labels = labels\n",
+ "\n",
+ "    def l_p_metric(self, image1, image2, p):\n",
+ "        \"\"\"Return the l_p distance between two images of equal shape.\n",
+ "\n",
+ "        Both inputs are converted to float first: subtracting unsigned-integer\n",
+ "        pixel arrays (e.g. uint8) would wrap around instead of going negative\n",
+ "        and silently corrupt the distance.\n",
+ "        \"\"\"\n",
+ "        diff = np.asarray(image1, dtype=float) - np.asarray(image2, dtype=float)\n",
+ "        return np.sum(np.abs(diff) ** p) ** (1 / p)\n",
+ "\n",
+ "    def predict(self, X, K=1, P=1):\n",
+ "        \"\"\"Predict a label for every image in X by majority vote of K neighbours.\n",
+ "\n",
+ "        Args:\n",
+ "            X: iterable of images shaped like the training images.\n",
+ "            K: number of nearest training images that vote.\n",
+ "            P: power of the l_p metric.\n",
+ "\n",
+ "        Returns:\n",
+ "            List with one predicted label per image in X.\n",
+ "\n",
+ "        Raises:\n",
+ "            ValueError: if K < 1 or the training set is empty.\n",
+ "        \"\"\"\n",
+ "        if K < 1:\n",
+ "            raise ValueError('K must be a positive integer')\n",
+ "        if len(self.values) == 0:\n",
+ "            raise ValueError('cannot predict with an empty training set')\n",
+ "\n",
+ "        # One flattened float row per training image, built once for all queries.\n",
+ "        train = np.asarray(self.values, dtype=float)\n",
+ "        train = train.reshape(len(train), -1)\n",
+ "\n",
+ "        predicted = []\n",
+ "        for image in X:\n",
+ "            query = np.asarray(image, dtype=float).reshape(1, -1)\n",
+ "            distances = np.sum(np.abs(train - query) ** P, axis=1) ** (1 / P)\n",
+ "            # A stable sort keeps equally distant neighbours in training order.\n",
+ "            nearest_idx = np.argsort(distances, kind='stable')[:K]\n",
+ "            nearest = [self.labels[i] for i in nearest_idx]\n",
+ "            # most_common() lists equal counts in first-seen order, so a tied\n",
+ "            # vote goes to the label of the closest neighbour.\n",
+ "            predicted.append(Counter(nearest).most_common(1)[0][0])\n",
+ "\n",
+ "        return predicted\n",
+ "\n",
+ "    def accuracy(self, expected, predicted):\n",
+ "        \"\"\"Return the fraction of positions where expected and predicted agree.\"\"\"\n",
+ "        hits = sum(1 for x, y in zip(expected, predicted) if x == y)\n",
+ "        return hits / len(expected)\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "brave-replacement",
+ "metadata": {},
+ "source": [
+ "# Zadanie 2 (2 pkt)\n",
+ "\n",
+ "Napisz kod funkcji `crossValidation(X, y, n=10, k=1, p=1)` wykonującej algorytm kNN z n-krotną walidacją krzyżową."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 77,
+ "id": "entire-advancement",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def crossValidation(X_train, y_train, X_test=None, y_test=None, n=10, k=1, p=1):\n",
+ "    \"\"\"Estimate kNN accuracy with n-fold cross-validation on the training data.\n",
+ "\n",
+ "    The samples are cut, in their given order (no shuffling), into n contiguous\n",
+ "    folds whose sizes differ by at most one. Each fold serves once as the\n",
+ "    validation set while a KNearestNeighbor built from the remaining folds\n",
+ "    predicts it.\n",
+ "\n",
+ "    Args:\n",
+ "        X_train: training images, one per entry.\n",
+ "        y_train: labels matching X_train.\n",
+ "        X_test: unused; accepted only so existing calls keep working.\n",
+ "        y_test: unused; accepted only so existing calls keep working.\n",
+ "        n: number of folds, 2 <= n <= len(X_train).\n",
+ "        k: number of neighbours passed to KNearestNeighbor.predict.\n",
+ "        p: power of the l_p metric passed to KNearestNeighbor.predict.\n",
+ "\n",
+ "    Returns:\n",
+ "        Mean of the n per-fold accuracies.\n",
+ "\n",
+ "    Raises:\n",
+ "        ValueError: if X_train and y_train differ in length or n is out of range.\n",
+ "    \"\"\"\n",
+ "    n_samples = len(X_train)\n",
+ "    if len(y_train) != n_samples:\n",
+ "        raise ValueError('X_train and y_train must have the same length')\n",
+ "    if not 2 <= n <= n_samples:\n",
+ "        raise ValueError('n must satisfy 2 <= n <= len(X_train)')\n",
+ "\n",
+ "    # array_split spreads any remainder over the first folds instead of\n",
+ "    # piling it all onto the last one.\n",
+ "    X_folds = np.array_split(np.asarray(X_train), n)\n",
+ "    y_folds = np.array_split(np.asarray(y_train), n)\n",
+ "\n",
+ "    fold_scores = []\n",
+ "    for i in range(n):\n",
+ "        # Train on every fold except the i-th, validate on the i-th.\n",
+ "        X_fit = np.concatenate(X_folds[:i] + X_folds[i + 1:])\n",
+ "        y_fit = np.concatenate(y_folds[:i] + y_folds[i + 1:])\n",
+ "\n",
+ "        model = KNearestNeighbor(X_fit, y_fit)\n",
+ "        fold_pred = model.predict(X_folds[i], k, p)\n",
+ "        fold_scores.append(model.accuracy(y_folds[i], fold_pred))\n",
+ "\n",
+ "    return mean(fold_scores)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 78,
+ "id": "searching-globe",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ " K P kNN accuracy CrossValidation accuracy\n",
+ "--- --- -------------- --------------------------\n",
+ " 1 1 0.583012 0.658228\n",
+ " 1 2 0.552124 0.617332\n",
+ " 5 1 0.555985 0.574489\n",
+ " 5 2 0.544402 0.565725\n",
+ " 10 1 0.501931 0.523856\n",
+ " 10 2 0.501931 0.534567\n"
+ ]
+ }
+ ],
+ "source": [
+ "# Load the train/test split; new_size=64 is forwarded to get_dataset\n",
+ "# (presumably the image side length - TODO confirm in load_data).\n",
+ "X_train, y_train, X_test, y_test = get_dataset(new_size=64)\n",
+ "\n",
+ "# Classifier built from the whole training split; scored on the test split below.\n",
+ "kNN = KNearestNeighbor(X_train, y_train)\n",
+ "\n",
+ "# Grid of neighbour counts (K) and metric powers (P) to compare.\n",
+ "Ks = [1, 5, 10]\n",
+ "Ps = [1, 2]\n",
+ "\n",
+ "# One table row per (K, P): test-set accuracy, then cross-validation accuracy on the\n",
+ "# training split with n=len(X_train) folds, i.e. one sample per fold (leave-one-out).\n",
+ "accuracy = [ [\n",
+ " k, p, \n",
+ " kNN.accuracy(y_test, kNN.predict(X_test, K=k, P=p)),\n",
+ " crossValidation(X_train, y_train, X_test, y_test, n=len(X_train), k=k, p=p)] for k in Ks for p in Ps ]\n",
+ "\n",
+ "print(tabulate(accuracy, headers=['K', 'P', 'kNN accuracy', 'CrossValidation accuracy']))\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "a85bb37f",
+ "metadata": {},
+ "source": [
+ "# Zadanie 3 (4 pkt)\n",
+ "\n",
+ "Napisz kod klasy `LogisticRegression` implementującej klasyfikator wieloklasowej regresji logistycznej z funkcją `softmax()` (ze standardowymi nazwami dwóch kluczowych funkcji: `fit()`, `predict()`). Zastosuj ten kod do pobranych danych (zbiór walidacyjny losujemy ze zbioru treningowego) - oblicz następujące charakterystyki modelu dla danych walidacyjnych oraz treningowych: dokładność (accuracy), precyzję (precision), czułość (recall) oraz F1 - dla poszczególnych klas oraz globalnie (zob. np. tu).\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 79,
+ "id": "e433be08",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "class LogisticRegression():\n",
+ "    \"\"\"Multi-class logistic regression trained one-vs-rest.\n",
+ "\n",
+ "    fit() trains one binary (sigmoid) model per class with batch gradient\n",
+ "    descent; predict() scores a sample with every model, passes the scores\n",
+ "    through softmax() and returns the most probable class label.\n",
+ "\n",
+ "    Parameter vectors are np.matrix columns, so `*` between a feature array\n",
+ "    and a theta is a matrix product, not an element-wise one.\n",
+ "    \"\"\"\n",
+ "\n",
+ "    def __init__(self, alpha=0.1, eps=10**-4, maxSteps=10000, seed=None):\n",
+ "        \"\"\"Store the training settings used by fit().\n",
+ "\n",
+ "        Args:\n",
+ "            alpha: gradient-descent learning rate.\n",
+ "            eps: training stops once the cost changes by at most eps in a step.\n",
+ "            maxSteps: upper bound on gradient-descent steps per class.\n",
+ "            seed: seed for the random initial thetas; None draws from NumPy's\n",
+ "                global random state, so runs are not reproducible.\n",
+ "        \"\"\"\n",
+ "        self.alpha = alpha\n",
+ "        self.eps = eps\n",
+ "        self.maxSteps = maxSteps\n",
+ "        self.seed = seed\n",
+ "\n",
+ "    def mapY(self, y, cls):\n",
+ "        \"\"\"Return an (m, 1) matrix holding 1.0 where y == cls and 0.0 elsewhere.\"\"\"\n",
+ "        m = len(y)\n",
+ "        yBi = np.matrix(np.zeros(m)).reshape(m, 1)\n",
+ "        yBi[np.asarray(y).ravel() == cls] = 1.\n",
+ "        return yBi\n",
+ "\n",
+ "    def indicatorMatrix(self, y, classes=None):\n",
+ "        \"\"\"Return the (m, k) one-hot matrix of y.\n",
+ "\n",
+ "        Args:\n",
+ "            y: sequence of m labels.\n",
+ "            classes: labels defining the k columns, in column order. Defaults to\n",
+ "                the sorted unique values of y; pass it explicitly when several\n",
+ "                label vectors must share the same columns.\n",
+ "        \"\"\"\n",
+ "        if classes is None:\n",
+ "            classes = np.unique(np.asarray(y))\n",
+ "        m = len(y)\n",
+ "        Y = np.matrix(np.zeros((m, len(classes))))\n",
+ "        for i, cls in enumerate(classes):\n",
+ "            Y[:, i] = self.mapY(y, cls)\n",
+ "        return Y\n",
+ "\n",
+ "    def softmax(self, X):\n",
+ "        \"\"\"Return softmax over all entries of X.\n",
+ "\n",
+ "        The maximum is subtracted before exponentiating; this leaves the result\n",
+ "        unchanged mathematically but keeps exp() from overflowing.\n",
+ "        \"\"\"\n",
+ "        exps = np.exp(X - np.max(X))\n",
+ "        return exps / np.sum(exps)\n",
+ "\n",
+ "    def h(self, theta, X):\n",
+ "        \"\"\"Logistic (sigmoid) hypothesis 1 / (1 + exp(-X * theta)).\"\"\"\n",
+ "        # Clipping the score keeps exp() finite for unnormalised features.\n",
+ "        z = np.clip(X * theta, -500, 500)\n",
+ "        return 1.0 / (1.0 + np.exp(-z))\n",
+ "\n",
+ "    def J(self, h, theta, X, y):\n",
+ "        \"\"\"Cross-entropy cost of hypothesis h for the binary targets y.\"\"\"\n",
+ "        m = len(y)\n",
+ "        # Keep probabilities strictly inside (0, 1): log(0) would turn the cost\n",
+ "        # into inf/NaN and disable the convergence test in GD().\n",
+ "        h_val = np.clip(h(theta, X), 1e-12, 1 - 1e-12)\n",
+ "        s1 = np.multiply(y, np.log(h_val))\n",
+ "        s2 = np.multiply((1 - y), np.log(1 - h_val))\n",
+ "        return -np.sum(s1 + s2, axis=0) / m\n",
+ "\n",
+ "    def dJ(self, h, theta, X, y):\n",
+ "        \"\"\"Gradient of the cross-entropy cost with respect to theta.\"\"\"\n",
+ "        return 1.0 / len(y) * (X.T * (h(theta, X) - y))\n",
+ "\n",
+ "    def GD(self, h, fJ, fdJ, theta, X, y, alpha=0.01, eps=10**-3, maxSteps=10000):\n",
+ "        \"\"\"Minimise fJ by batch gradient descent starting from theta.\n",
+ "\n",
+ "        Args:\n",
+ "            h: hypothesis function, called as h(theta, X).\n",
+ "            fJ: cost function, called as fJ(h, theta, X, y).\n",
+ "            fdJ: gradient function, called as fdJ(h, theta, X, y).\n",
+ "            theta: initial parameter column.\n",
+ "            X: feature rows.\n",
+ "            y: target column.\n",
+ "            alpha: learning rate.\n",
+ "            eps: stop once the cost changes by at most eps between steps.\n",
+ "            maxSteps: stop once more than maxSteps entries were recorded.\n",
+ "\n",
+ "        Returns:\n",
+ "            (theta, errors): the final parameters and the list of recorded\n",
+ "            [cost, theta] pairs, starting with the initial point.\n",
+ "        \"\"\"\n",
+ "        errorCurr = fJ(h, theta, X, y)\n",
+ "        errors = [[errorCurr, theta]]\n",
+ "        while True:\n",
+ "            # Step against the gradient.\n",
+ "            theta = theta - alpha * fdJ(h, theta, X, y)\n",
+ "            # Track the new cost next to the previous one.\n",
+ "            errorCurr, errorPrev = fJ(h, theta, X, y), errorCurr\n",
+ "            # Stop criteria: converged, or step budget used up.\n",
+ "            if abs(errorPrev - errorCurr) <= eps:\n",
+ "                break\n",
+ "            if len(errors) > maxSteps:\n",
+ "                break\n",
+ "            errors.append([errorCurr, theta])\n",
+ "        return theta, errors\n",
+ "\n",
+ "    def trainMaxEnt(self, X, Y):\n",
+ "        \"\"\"Train one binary model per column of the one-hot matrix Y.\n",
+ "\n",
+ "        Returns:\n",
+ "            List with one (n, 1) theta matrix per class, in column order of Y.\n",
+ "        \"\"\"\n",
+ "        n = X.shape[1]\n",
+ "        # A seeded generator makes the random start reproducible; without a\n",
+ "        # seed the global NumPy state is used.\n",
+ "        rng = np.random if self.seed is None else np.random.default_rng(self.seed)\n",
+ "        thetas = []\n",
+ "        for c in range(Y.shape[1]):\n",
+ "            YBi = Y[:, c]\n",
+ "            theta = np.matrix(rng.random(n)).reshape(n, 1)\n",
+ "            # Parameters are fitted for every class separately.\n",
+ "            thetaBest, _ = self.GD(self.h, self.J, self.dJ, theta, X, YBi,\n",
+ "                                   alpha=self.alpha, eps=self.eps,\n",
+ "                                   maxSteps=self.maxSteps)\n",
+ "            thetas.append(thetaBest)\n",
+ "        return thetas\n",
+ "\n",
+ "    def classify(self, thetas, X):\n",
+ "        \"\"\"Return the index (into thetas) of the most probable class for one sample X.\"\"\"\n",
+ "        regs = np.array([(X * theta).item() for theta in thetas])\n",
+ "        probs = self.softmax(regs)\n",
+ "        return np.argmax(probs)\n",
+ "\n",
+ "    def class_score(self, expected, predicted):\n",
+ "        \"\"\"Return (accuracy, precision, recall, f1) for one class.\n",
+ "\n",
+ "        Args:\n",
+ "            expected: 0/1 indicators of the true class membership.\n",
+ "            predicted: 0/1 indicators of the predicted class membership.\n",
+ "\n",
+ "        A ratio whose denominator is zero (e.g. precision when the class is\n",
+ "        never predicted) is reported as 0.0.\n",
+ "        \"\"\"\n",
+ "        pairs = list(zip(expected, predicted))\n",
+ "        tp = sum(1 for exp, pred in pairs if exp == 1.0 and pred == 1.0)\n",
+ "        fp = sum(1 for exp, pred in pairs if exp != 1.0 and pred == 1.0)\n",
+ "        fn = sum(1 for exp, pred in pairs if exp == 1.0 and pred != 1.0)\n",
+ "        hits = sum(1 for exp, pred in pairs if exp == pred)\n",
+ "\n",
+ "        # accuracy = (TP + TN) / (TP + TN + FP + FN)\n",
+ "        accuracy = hits / len(expected) if len(expected) else 0.0\n",
+ "        # precision = TP / (TP + FP): share of predicted positives that are right\n",
+ "        precision = tp / (tp + fp) if tp + fp else 0.0\n",
+ "        # recall = TP / (TP + FN): share of actual positives that were found\n",
+ "        recall = tp / (tp + fn) if tp + fn else 0.0\n",
+ "        f1 = (2 * precision * recall) / (precision + recall) if precision + recall else 0.0\n",
+ "        return accuracy, precision, recall, f1\n",
+ "\n",
+ "    def fit(self, X_train, y_train):\n",
+ "        \"\"\"Train the per-class models; sets self.classes and self.thetas.\"\"\"\n",
+ "        # Sorted unique labels; column i of Y and thetas[i] belong to classes[i].\n",
+ "        self.classes = np.unique(np.asarray(y_train))\n",
+ "        Y = self.indicatorMatrix(y_train, self.classes)\n",
+ "        self.thetas = self.trainMaxEnt(X_train, Y)\n",
+ "\n",
+ "    def predict(self, X_test):\n",
+ "        \"\"\"Return an array with the predicted class label of every row of X_test.\n",
+ "\n",
+ "        If only self.thetas was set by hand (no fit() call, hence no\n",
+ "        self.classes), the argmax indices are returned instead of labels.\n",
+ "        \"\"\"\n",
+ "        indices = np.array([self.classify(self.thetas, x) for x in X_test], dtype=int)\n",
+ "        classes = getattr(self, 'classes', None)\n",
+ "        if classes is None:\n",
+ "            return indices\n",
+ "        return np.asarray(classes)[indices]\n",
+ "\n",
+ "    def score(self, expected, predicted):\n",
+ "        \"\"\"Return per-class and global accuracy, precision, recall and F1.\n",
+ "\n",
+ "        Args:\n",
+ "            expected: true labels.\n",
+ "            predicted: predicted labels, same length as expected.\n",
+ "\n",
+ "        Returns:\n",
+ "            Dict of equally long lists under the keys 'Class', 'Accuracy',\n",
+ "            'Precision', 'Recall' and 'F1': one entry per class label followed\n",
+ "            by a 'Global' entry (overall accuracy, macro-averaged other metrics).\n",
+ "\n",
+ "        Raises:\n",
+ "            ValueError: if the inputs are empty or differ in length.\n",
+ "        \"\"\"\n",
+ "        expected = np.asarray(expected).ravel()\n",
+ "        predicted = np.asarray(predicted).ravel()\n",
+ "        if len(expected) == 0 or len(expected) != len(predicted):\n",
+ "            raise ValueError('expected and predicted must be non-empty and of equal length')\n",
+ "\n",
+ "        score = {\n",
+ "            'Class': [],\n",
+ "            'Accuracy': [],\n",
+ "            'Precision': [],\n",
+ "            'Recall': [],\n",
+ "            'F1': []}\n",
+ "\n",
+ "        # Both one-hot matrices must share their columns, otherwise a class that\n",
+ "        # is never predicted shifts every later column.\n",
+ "        classes = np.unique(np.concatenate((expected, predicted)))\n",
+ "        oh_expected = self.indicatorMatrix(expected, classes).T.tolist()\n",
+ "        oh_predicted = self.indicatorMatrix(predicted, classes).T.tolist()\n",
+ "\n",
+ "        for label, e, p in zip(classes.tolist(), oh_expected, oh_predicted):\n",
+ "            a, prec, rec, f1 = self.class_score(e, p)\n",
+ "            score['Class'].append(label)\n",
+ "            score['Accuracy'].append(a)\n",
+ "            score['Precision'].append(prec)\n",
+ "            score['Recall'].append(rec)\n",
+ "            score['F1'].append(f1)\n",
+ "\n",
+ "        # The means are evaluated before the append, so they cover the\n",
+ "        # per-class rows only (macro average).\n",
+ "        score['Class'].append('Global')\n",
+ "        score['Accuracy'].append(int(np.sum(expected == predicted)) / len(expected))\n",
+ "        score['Precision'].append(np.mean(score['Precision']))\n",
+ "        score['Recall'].append(np.mean(score['Recall']))\n",
+ "        score['F1'].append(np.mean(score['F1']))\n",
+ "\n",
+ "        return score\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 86,
+ "id": "ba36ecbb",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "/var/folders/7c/v61kq2b95dzbt7s47fxy0grm0000gn/T/ipykernel_2525/2624688826.py:30: RuntimeWarning: divide by zero encountered in log\n",
+ " s2 = np.multiply((1 - y), np.log(1 - h_val))\n",
+ "/var/folders/7c/v61kq2b95dzbt7s47fxy0grm0000gn/T/ipykernel_2525/2624688826.py:30: RuntimeWarning: invalid value encountered in multiply\n",
+ " s2 = np.multiply((1 - y), np.log(1 - h_val))\n",
+ "/var/folders/7c/v61kq2b95dzbt7s47fxy0grm0000gn/T/ipykernel_2525/2624688826.py:47: RuntimeWarning: invalid value encountered in subtract\n",
+ " if abs(errorPrev - errorCurr) <= eps:\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Class Accuracy Precision Recall F1\n",
+ "------- ---------- ----------- -------- --------\n",
+ "0 0.96139 0.823529 0.976744 0.893617\n",
+ "1 0.857143 0.557692 0.674419 0.610526\n",
+ "2 0.872587 0.788462 0.650794 0.713043\n",
+ "3 0.861004 0.596154 0.673913 0.632653\n",
+ "4 0.776062 0.557692 0.453125 0.5\n",
+ "Global 0.664093 0.664706 0.685799 0.669968\n"
+ ]
+ }
+ ],
+ "source": [
+ "# Reload the data with new_size=32 (presumably a smaller image side length than\n",
+ "# in the kNN cell - TODO confirm in load_data); this rebinds the X_*/y_* globals.\n",
+ "X_train, y_train, X_test, y_test = get_dataset(new_size=32) \n",
+ "\n",
+ "# Train on the training split.\n",
+ "logreg = LogisticRegression()\n",
+ "logreg.fit(X_train, y_train)\n",
+ "\n",
+ "# Evaluate on the test split only; `predicted` and `logreg` are reused by the next cell.\n",
+ "predicted = logreg.predict(X_test)\n",
+ "score = logreg.score(y_test, predicted)\n",
+ "\n",
+ "# score is a dict of columns, so its keys become the table headers.\n",
+ "print(tabulate(score, headers='keys'))"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "7f8326ba",
+ "metadata": {},
+ "source": [
+ "# Zadanie 4 (1 pkt)\n",
+ "\n",
+ "Oblicz ile danych z poszczególnych klas znajduje się po dodatniej/ujemnej stronie hiperpłaszczyzny klasyfikacyjnej dla danej klasy."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 85,
+ "id": "09f0a567",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ " Klasa Dodatnia strona Ujemna strona\n",
+ "------- ----------------- ---------------\n",
+ " 0 48 211\n",
+ " 1 52 207\n",
+ " 2 65 194\n",
+ " 3 47 212\n",
+ " 4 47 212\n"
+ ]
+ }
+ ],
+ "source": [
+ "# For every class hyperplane theta_c and every true class, count the test samples\n",
+ "# with theta_c^T x > 0 (positive side) and with theta_c^T x <= 0 (negative side).\n",
+ "# Counting argmax predictions instead would not answer the question: a sample can\n",
+ "# lie on the positive side of several hyperplanes, or of none.\n",
+ "features = np.asarray(X_test, dtype=float)\n",
+ "true_labels = np.asarray(y_test).ravel()\n",
+ "\n",
+ "# fit() orders the thetas by the sorted unique training labels.\n",
+ "hyperplane_classes = np.unique(np.asarray(y_train)).tolist()\n",
+ "\n",
+ "hyperplane = []\n",
+ "for hyperplane_class, theta in zip(hyperplane_classes, logreg.thetas):\n",
+ "    scores = features @ np.asarray(theta, dtype=float).ravel()\n",
+ "    on_positive_side = scores > 0\n",
+ "    for label in np.unique(true_labels).tolist():\n",
+ "        members = true_labels == label\n",
+ "        hyperplane.append([\n",
+ "            hyperplane_class,\n",
+ "            label,\n",
+ "            int(np.sum(on_positive_side & members)),\n",
+ "            int(np.sum(~on_positive_side & members))])\n",
+ "\n",
+ "print(tabulate(hyperplane, headers=['Hiperpłaszczyzna klasy', 'Klasa', 'Dodatnia strona', 'Ujemna strona']))"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3.10.5 64-bit",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.10.5"
+ },
+ "latex_envs": {
+ "LaTeX_envs_menu_present": true,
+ "autoclose": false,
+ "autocomplete": true,
+ "bibliofile": "biblio.bib",
+ "cite_by": "apalike",
+ "current_citInitial": 1,
+ "eqLabelWithNumbers": true,
+ "eqNumInitial": 1,
+ "hotkeys": {
+ "equation": "Ctrl-E",
+ "itemize": "Ctrl-I"
+ },
+ "labels_anchors": false,
+ "latex_user_defs": false,
+ "report_style_numbering": false,
+ "user_envs_cfg": false
+ },
+ "toc": {
+ "base_numbering": 1,
+ "nav_menu": {},
+ "number_sections": false,
+ "sideBar": true,
+ "skip_h1_title": false,
+ "title_cell": "Table of Contents",
+ "title_sidebar": "Contents",
+ "toc_cell": false,
+ "toc_position": {},
+ "toc_section_display": true,
+ "toc_window_display": false
+ },
+ "vscode": {
+ "interpreter": {
+ "hash": "7e1998ff7f8aa20ada591c520b972326324e5ea05489af9e422744c7c09f6dad"
+ }
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}