432 lines
15 KiB
Plaintext
432 lines
15 KiB
Plaintext
{
|
|
"cells": [
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 17,
|
|
"id": "0b35a076",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"from load_data import get_dataset\n",
|
|
"import numpy as np\n",
|
|
"from collections import Counter\n",
|
|
"from tabulate import tabulate\n",
|
|
"from statistics import mean"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"id": "elementary-purchase",
|
|
"metadata": {},
|
|
"source": [
|
|
"# Zadanie 1 (4 pkt)\n",
|
|
"\n",
|
|
"Napisz kod klasy <code>KNearestNeighbor</code> implementującej klasyfikator <i>knn</i>. Należy zimplementować następujące metody:\n",
|
|
" - <code>konstruktor</code> pobierający listę obrazów treningowych (zgodną zw składową 'values' wczytanego słownika) oraz listę ich etykiet\n",
|
|
" - metoda <code>l_p_metric(image1, image2, p):</code> zwracająca wartość odległości pomiędzy dwoma obrazami, mierzoną normą typu <i>l_p</i> - parametr <code>p</code> określa 'potęgę' normy\n",
|
|
" - metoda <code>predict(test_images, k,p):</code> zwracająca listę prognozowanych etykiet dla obrazów testowych (parametr <code>test_images</code>). Paramter drugi określa liczbę przeszukiwanych sąsiadów, natomiast paramter trzeci określa potęgę wybranej metryki.\n",
|
|
" - metoda <code>accuracy(test_images, k,p)</code> zwracająca dokładność klasyfikatora na zbiorze testowym. Parametr drugi i trzeci są jak w metodzie <code>predict()</code>\n"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 18,
|
|
"id": "great-earthquake",
|
|
"metadata": {
|
|
"nbgrader": {
|
|
"grade": true,
|
|
"grade_id": "cell-50c8d2866e4d875e",
|
|
"locked": false,
|
|
"points": 4,
|
|
"schema_version": 3,
|
|
"solution": true,
|
|
"task": false
|
|
}
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"class KNearestNeighbor():\n",
|
|
" def __init__(self, values, labels):\n",
|
|
" self.values = values\n",
|
|
" self.labels = labels\n",
|
|
"\n",
|
|
" def l_p_metric(self, image1, image2, p):\n",
|
|
" return np.sum(np.abs(image1 - image2) ** p) ** (1/p)\n",
|
|
"\n",
|
|
" def predict(self, X, K=1, P=1):\n",
|
|
" predicted = []\n",
|
|
"\n",
|
|
" for image in X:\n",
|
|
" metrics = []\n",
|
|
"\n",
|
|
" for value in self.values:\n",
|
|
" metrics.append(self.l_p_metric(image, value, P))\n",
|
|
"\n",
|
|
" mins = sorted(range(len(metrics)), key = lambda sub: metrics[sub])[:K]\n",
|
|
" nearest = [self.labels[x] for x in mins]\n",
|
|
" pred = max(nearest, key=Counter(nearest).get)\n",
|
|
"\n",
|
|
" predicted.append(pred)\n",
|
|
"\n",
|
|
" return predicted\n",
|
|
"\n",
|
|
" def accuracy(self, expected, predicted):\n",
|
|
" return sum(1 for x, y in zip(expected, predicted) if x == y) / len(expected)\n"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"id": "brave-replacement",
|
|
"metadata": {},
|
|
"source": [
|
|
"# Zadanie 2 (2 pkt)\n",
|
|
"\n",
|
|
"Napisz kod funkcji <code>crossValidation(X, y, n = 10, k=1, p=1):</code> obliczającą algorytm <code>kNN</code> z n-krotną walidacją krzyżową."
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 19,
|
|
"id": "entire-advancement",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"def crossValidation(X_train, y_train, X_test, y_test, n=10, k=1, p=1):\n",
|
|
" tab = []\n",
|
|
" X_folds = []\n",
|
|
" y_folds = []\n",
|
|
" \n",
|
|
" f_size = len(X_train)//n\n",
|
|
" index = 0\n",
|
|
" \n",
|
|
" for i in range(n):\n",
|
|
" if i == n-1:\n",
|
|
" X_folds.append(X_train[index:])\n",
|
|
" y_folds.append(y_train[index:])\n",
|
|
" continue\n",
|
|
" X_folds.append(X_train[index : index+f_size])\n",
|
|
" y_folds.append(y_train[index : index+f_size])\n",
|
|
" index = index + f_size\n",
|
|
" \n",
|
|
" for i in range(n):\n",
|
|
" X_train_temp = X_folds[:i] + X_folds[i+1:]\n",
|
|
" X_train = np.concatenate((X_train_temp))\n",
|
|
"\n",
|
|
" y_train_temp = y_folds[:i] + y_folds[i+1:]\n",
|
|
" y_train = np.concatenate((y_train_temp))\n",
|
|
" \n",
|
|
" X_test = X_folds[i]\n",
|
|
" y_test = y_folds[i]\n",
|
|
"\n",
|
|
" Knn = KNearestNeighbor(X_train, y_train)\n",
|
|
" \n",
|
|
" pred = Knn.predict(X_test, k, p)\n",
|
|
" a = Knn.accuracy(y_test, pred)\n",
|
|
" tab.append(a)\n",
|
|
" \n",
|
|
" result = mean(tab)\n",
|
|
" \n",
|
|
" return result"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 20,
|
|
"id": "searching-globe",
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
" K P kNN accuracy CrossValidation accuracy\n",
|
|
"--- --- -------------- --------------------------\n",
|
|
" 1 1 0.583012 0.658228\n",
|
|
" 1 2 0.552124 0.617332\n",
|
|
" 5 1 0.555985 0.574489\n",
|
|
" 5 2 0.544402 0.565725\n",
|
|
" 10 1 0.501931 0.523856\n",
|
|
" 10 2 0.501931 0.534567\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"X_train, y_train, X_test, y_test = get_dataset(new_size=64)\n",
|
|
"\n",
|
|
"kNN = KNearestNeighbor(X_train, y_train)\n",
|
|
"\n",
|
|
"Ks = [1, 5, 10]\n",
|
|
"Ps = [1, 2]\n",
|
|
"\n",
|
|
"accuracy = [ [\n",
|
|
" k, p, \n",
|
|
" kNN.accuracy(y_test, kNN.predict(X_test, K=k, P=p)),\n",
|
|
" crossValidation(X_train, y_train, X_test, y_test, n=len(X_train), k=k, p=p)] for k in Ks for p in Ps ]\n",
|
|
"\n",
|
|
"print(tabulate(accuracy, headers=['K', 'P', 'kNN accuracy', 'CrossValidation accuracy']))\n"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"id": "a85bb37f",
|
|
"metadata": {},
|
|
"source": [
|
|
"# Zadanie 3 (4 pkt)\n",
|
|
"\n",
|
|
"Napisz kod klasy <code>LogisticRegression</code> implementującej klasyfikator <i>wieloklasowej regresji logistycznej</i> z funkcją <code>softmax()</code> (ze standardowymi nazwami dwóch kluczowych funkcji: <i>fit()</i>, <i>predict()</i>). Zastosuj ten kod do pobranych danych (zbiór walidacyjny losujemy ze zbioru treningowego) - oblicz następujące charakterystyki modelu dla danych walidacyjnych oraz treningowych: dokładność (accuracy), precyzję (precision), czułość(recall) oraz F1 - dla poszczególnych klas oraz globalnie (zob. np. <a href=\"https://medium.com/synthesio-engineering/precision-accuracy-and-f1-score-for-multi-label-classification-34ac6bdfb404\">tu</a>).\n"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 21,
|
|
"id": "e433be08",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"class LogisticRegression():\n",
|
|
" def mapY(self, y, cls):\n",
|
|
" m = len(y)\n",
|
|
" yBi = np.matrix(np.zeros(m)).reshape(m, 1)\n",
|
|
" yBi[y == cls] = 1.\n",
|
|
" return yBi\n",
|
|
"\n",
|
|
" def indicatorMatrix(self, y):\n",
|
|
" classes = np.unique(y.tolist())\n",
|
|
" m = len(y)\n",
|
|
" k = len(classes)\n",
|
|
" Y = np.matrix(np.zeros((m, k)))\n",
|
|
" for i, cls in enumerate(classes):\n",
|
|
" Y[:, i] = self.mapY(y, cls)\n",
|
|
" return Y\n",
|
|
" \n",
|
|
" # Zapis macierzowy funkcji softmax\n",
|
|
" def softmax(self, X):\n",
|
|
" return np.exp(X) / np.sum(np.exp(X))\n",
|
|
" \n",
|
|
" # Funkcja regresji logistcznej\n",
|
|
" def h(self, theta, X):\n",
|
|
" return 1.0/(1.0 + np.exp(-X * theta))\n",
|
|
" \n",
|
|
" # Funkcja kosztu dla regresji logistycznej\n",
|
|
" def J(self, h, theta, X, y):\n",
|
|
" m = len(y)\n",
|
|
" h_val = h(theta, X)\n",
|
|
" s1 = np.multiply(y, np.log(h_val))\n",
|
|
" s2 = np.multiply((1 - y), np.log(1 - h_val))\n",
|
|
" return -np.sum(s1 + s2, axis=0) / m\n",
|
|
"\n",
|
|
" # Gradient dla regresji logistycznej\n",
|
|
" def dJ(self, h, theta, X, y):\n",
|
|
" return 1.0 / len(y) * (X.T * (h(theta, X) - y))\n",
|
|
"\n",
|
|
" # Metoda gradientu prostego dla regresji logistycznej\n",
|
|
" def GD(self, h, fJ, fdJ, theta, X, y, alpha=0.01, eps=10**-3, maxSteps=1000):\n",
|
|
" errorCurr = fJ(h, theta, X, y) # fJ -> J, fdJ -> dJ\n",
|
|
" errors = [[errorCurr, theta]]\n",
|
|
" while True:\n",
|
|
" # oblicz nowe theta\n",
|
|
" theta = theta - alpha * fdJ(h, theta, X, y)\n",
|
|
" # raportuj poziom błędu\n",
|
|
" errorCurr, errorPrev = fJ(h, theta, X, y), errorCurr\n",
|
|
" # kryteria stopu\n",
|
|
" if abs(errorPrev - errorCurr) <= eps:\n",
|
|
" break\n",
|
|
" if len(errors) > maxSteps:\n",
|
|
" break\n",
|
|
" errors.append([errorCurr, theta]) \n",
|
|
" return theta, errors\n",
|
|
"\n",
|
|
" def trainMaxEnt(self, X, Y):\n",
|
|
" n = X.shape[1]\n",
|
|
" thetas = []\n",
|
|
" for c in range(Y.shape[1]):\n",
|
|
" YBi = Y[:,c]\n",
|
|
" theta = np.matrix(np.random.random(n)).reshape(n,1)\n",
|
|
" # Macierz parametrów theta obliczona dla każdej klasy osobno.\n",
|
|
" thetaBest, errors = self.GD(self.h, self.J, self.dJ, theta, \n",
|
|
" X, YBi, alpha=0.1, eps=10**-4)\n",
|
|
" thetas.append(thetaBest)\n",
|
|
" return thetas\n",
|
|
"\n",
|
|
" def classify(self, thetas, X):\n",
|
|
" regs = np.array([(X*theta).item() for theta in thetas])\n",
|
|
" probs = self.softmax(regs)\n",
|
|
" result = np.argmax(probs)\n",
|
|
" return result\n",
|
|
"\n",
|
|
" def class_score(self, expected, predicted):\n",
|
|
" # accuracy = TP + TN / FP + FN + TP + TN\n",
|
|
" accuracy = sum(1 for exp, pred in zip(expected, predicted) if exp == pred) / len(expected)\n",
|
|
" # precision = TP / FP + TP\n",
|
|
" precision = sum(\n",
|
|
" 1 for exp, pred in zip(expected, predicted) if exp == 1.0 and pred == 1.0) / sum(\n",
|
|
" 1 for exp, pred in zip(expected, predicted) if exp == 1.0)\n",
|
|
" # recall = TP / FN + TP\n",
|
|
" recall = sum(\n",
|
|
" 1 for exp, pred in zip(expected, predicted) if exp == 1.0 and pred == 1.0) / sum(\n",
|
|
" 1 for exp, pred in zip(expected, predicted) if pred == 1.0)\n",
|
|
" f1 = (2 * precision * recall) / (precision + recall)\n",
|
|
" return accuracy, precision, recall, f1\n",
|
|
"\n",
|
|
" def fit(self, X_train, y_train):\n",
|
|
" Y = self.indicatorMatrix(y_train)\n",
|
|
" self.thetas = self.trainMaxEnt(X_train, Y)\n",
|
|
"\n",
|
|
" def predict(self, X_test):\n",
|
|
" return np.array([self.classify(self.thetas, x) for x in X_test])\n",
|
|
" \n"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 22,
|
|
"id": "ba36ecbb",
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stderr",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"/var/folders/3r/c8tg1h051m18qhsdccdysrt40000gn/T/ipykernel_42943/2006812993.py:30: RuntimeWarning: divide by zero encountered in log\n",
|
|
" s2 = np.multiply((1 - y), np.log(1 - h_val))\n",
|
|
"/var/folders/3r/c8tg1h051m18qhsdccdysrt40000gn/T/ipykernel_42943/2006812993.py:30: RuntimeWarning: invalid value encountered in multiply\n",
|
|
" s2 = np.multiply((1 - y), np.log(1 - h_val))\n",
|
|
"/var/folders/3r/c8tg1h051m18qhsdccdysrt40000gn/T/ipykernel_42943/2006812993.py:47: RuntimeWarning: invalid value encountered in subtract\n",
|
|
" if abs(errorPrev - errorCurr) <= eps:\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"X_train, y_train, X_test, y_test = get_dataset(new_size=64) \n",
|
|
"\n",
|
|
"logreg = LogisticRegression()\n",
|
|
"logreg.fit(X_train, y_train)\n",
|
|
"\n",
|
|
"predicted = logreg.predict(X_test)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 23,
|
|
"id": "af8d9107",
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"data": {
|
|
"text/plain": [
|
|
"0.2857142857142857"
|
|
]
|
|
},
|
|
"execution_count": 23,
|
|
"metadata": {},
|
|
"output_type": "execute_result"
|
|
}
|
|
],
|
|
"source": [
|
|
"from sklearn import metrics\n",
|
|
"\n",
|
|
"metrics.accuracy_score(y_test,predicted)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"id": "7f8326ba",
|
|
"metadata": {},
|
|
"source": [
|
|
"# Zadanie 4 (1 pkt)\n",
|
|
"\n",
|
|
"Oblicz ile danych z poszczególnych klas znajduje się po dodatniej/ujemnej stronie hiperpłaszczyzny klasyfikacyjnej dla danej klasy."
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 24,
|
|
"id": "09f0a567",
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
" Klasa Dodatnia strona Ujemna strona\n",
|
|
"------- ----------------- ---------------\n",
|
|
" 0 23 236\n",
|
|
" 2 229 30\n",
|
|
" 4 7 252\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"one_hot = logreg.indicatorMatrix(predicted)\n",
|
|
"length = len(one_hot)\n",
|
|
"one_hot = one_hot.sum(axis=0).tolist()[0]\n",
|
|
"\n",
|
|
"hyperplane = [\n",
|
|
" [i for i in np.unique(predicted)], \n",
|
|
" [int(x) for x in one_hot],\n",
|
|
" [length - int(x) for x in one_hot]]\n",
|
|
" \n",
|
|
"\n",
|
|
"print(tabulate(np.array(hyperplane).T, headers=['Klasa', 'Dodatnia strona', 'Ujemna strona']))"
|
|
]
|
|
}
|
|
],
|
|
"metadata": {
|
|
"kernelspec": {
|
|
"display_name": "base",
|
|
"language": "python",
|
|
"name": "python3"
|
|
},
|
|
"language_info": {
|
|
"codemirror_mode": {
|
|
"name": "ipython",
|
|
"version": 3
|
|
},
|
|
"file_extension": ".py",
|
|
"mimetype": "text/x-python",
|
|
"name": "python",
|
|
"nbconvert_exporter": "python",
|
|
"pygments_lexer": "ipython3",
|
|
"version": "3.9.13"
|
|
},
|
|
"latex_envs": {
|
|
"LaTeX_envs_menu_present": true,
|
|
"autoclose": false,
|
|
"autocomplete": true,
|
|
"bibliofile": "biblio.bib",
|
|
"cite_by": "apalike",
|
|
"current_citInitial": 1,
|
|
"eqLabelWithNumbers": true,
|
|
"eqNumInitial": 1,
|
|
"hotkeys": {
|
|
"equation": "Ctrl-E",
|
|
"itemize": "Ctrl-I"
|
|
},
|
|
"labels_anchors": false,
|
|
"latex_user_defs": false,
|
|
"report_style_numbering": false,
|
|
"user_envs_cfg": false
|
|
},
|
|
"toc": {
|
|
"base_numbering": 1,
|
|
"nav_menu": {},
|
|
"number_sections": false,
|
|
"sideBar": true,
|
|
"skip_h1_title": false,
|
|
"title_cell": "Table of Contents",
|
|
"title_sidebar": "Contents",
|
|
"toc_cell": false,
|
|
"toc_position": {},
|
|
"toc_section_display": true,
|
|
"toc_window_display": false
|
|
},
|
|
"vscode": {
|
|
"interpreter": {
|
|
"hash": "83181d593ff87630d4a0d7997796f399a1d96b6a7eabd5b16ad531f517f6300b"
|
|
}
|
|
}
|
|
},
|
|
"nbformat": 4,
|
|
"nbformat_minor": 5
|
|
}
|