This commit is contained in:
MatOgr 2022-05-18 16:57:38 +02:00
commit 5a363e62f0

View File

@ -142,13 +142,18 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 2, "execution_count": 20,
"metadata": {}, "metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [], "outputs": [],
"source": [ "source": [
"import pandas as pd\n", "import pandas as pd\n",
"from sklearn.model_selection import train_test_split\n", "from sklearn.model_selection import train_test_split\n",
"from sklearn.metrics import accuracy_score,precision_score,recall_score,f1_score\n", "from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score\n",
"import scipy.stats as stats\n", "import scipy.stats as stats\n",
"import numpy as np\n", "import numpy as np\n",
"import plotly\n", "import plotly\n",
@ -160,17 +165,23 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 3, "execution_count": 21,
"metadata": {}, "metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [], "outputs": [],
"source": [ "source": [
"class NaiveBayes():\n", "class NaiveBayes():\n",
"\n", "\n",
" def __init__(self,classes,className,attribsNames,data):\n", " def __init__(self, classes, className, attribsNames, data):\n",
" self.classes = classes\n", " self.classes = classes\n",
" self.className = className\n", " self.className = className\n",
" self.attribsNames = attribsNames\n", " self.attribsNames = attribsNames\n",
" self.data = data\n", " self.data = data\n",
"\n",
" #przygotowanie prawdopodobienstw wartosci danych cech w zaleznosci od klasy\n", " #przygotowanie prawdopodobienstw wartosci danych cech w zaleznosci od klasy\n",
" def getDictOfAttribProbs(self):\n", " def getDictOfAttribProbs(self):\n",
" dictionaries = {}\n", " dictionaries = {}\n",
@ -189,24 +200,25 @@
" return dictionaries\n", " return dictionaries\n",
"\n", "\n",
" #a priori dla klas\n", " #a priori dla klas\n",
" def classProb(self,class_):\n", " def classProb(self, class_):\n",
" x = len(self.data[self.data[self.className] == class_][self.className])\n", " x = len(self.data[self.data[self.className] == class_][self.className])\n",
" y = len(self.data[self.className])\n", " y = len(self.data[self.className])\n",
" return x / y\n", " return x / y\n",
"\n", "\n",
" #prawdopodobienstwo dla wartosic danej cechy w zaelznosci od klasy\n", " #prawdopodobienstwo dla wartosic danej cechy w zaelznosci od klasy\n",
" def getAttribProbs(self,attrib, value, data, clas, dictProbs):\n", " def getAttribProbs(self, attrib, value, data, clas, dictProbs):\n",
" return dictProbs[clas][attrib].get(value, 1.0 / len(data))\n", " return dictProbs[clas][attrib].get(value, 1.0 / len(data))\n",
"\n", "\n",
" #a posteriori dla danego obiektu\n", " #a posteriori dla danego obiektu\n",
" def getPosteriori(self,attribs, attribsNames, clas, dictProbs):\n", " def getPosteriori(self, attribs, attribsNames, clas, dictProbs):\n",
" dic = {}\n", " dic = {}\n",
" for i in range(len(attribs)):\n", " for i in range(len(attribs)):\n",
" dic[attribsNames[i]] = attribs[i]\n", " dic[attribsNames[i]] = attribs[i]\n",
" sum = 0.0\n", " sum = 0.0\n",
" for key in dic:\n", " for key in dic:\n",
" sum = sum + np.log(NaiveBayes.getAttribProbs(self,key, dic[key], X_train, clas, dictProbs))\n", " sum = sum + np.log(self.getAttribProbs(key, dic[key], X_train, clas, dictProbs))\n",
" return sum + np.log(NaiveBayes.classProb(self,clas))\n", " return sum + np.log(self.classProb(clas))\n",
"\n",
" #predykcja dla danych\n", " #predykcja dla danych\n",
" def predict(self, data, model):\n", " def predict(self, data, model):\n",
" attribNames = data.columns\n", " attribNames = data.columns\n",
@ -214,21 +226,25 @@
" for i in range(len(data)):\n", " for i in range(len(data)):\n",
" probs = {}\n", " probs = {}\n",
" for name in self.classes:\n", " for name in self.classes:\n",
" probs[name] = NaiveBayes.getPosteriori(self,list(data.iloc[i]), list(attribNames),name, model)\n", " probs[name] = self.getPosteriori(list(data.iloc[i]), list(attribNames), name, model)\n",
" keyMax = max(zip(probs.values(), probs.keys()))[1]\n", " keyMax = max(zip(probs.values(), probs.keys()))[1]\n",
" predictions.append(keyMax)\n", " predictions.append(keyMax)\n",
" return predictions\n", " return predictions\n",
" \n", "\n",
" def fitModel(self):\n", " def fitModel(self):\n",
" model = NaiveBayes.getDictOfAttribProbs(self)\n", " probabilities = self.getDictOfAttribProbs()\n",
" return model\n", " return probabilities\n"
" "
] ]
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 4, "execution_count": 22,
"metadata": {}, "metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [], "outputs": [],
"source": [ "source": [
"features = [\n", "features = [\n",
@ -245,8 +261,13 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 5, "execution_count": 23,
"metadata": {}, "metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [ "outputs": [
{ {
"data": { "data": {
@ -457,7 +478,7 @@
"[5 rows x 23 columns]" "[5 rows x 23 columns]"
] ]
}, },
"execution_count": 5, "execution_count": 23,
"metadata": {}, "metadata": {},
"output_type": "execute_result" "output_type": "execute_result"
} }
@ -640,16 +661,19 @@
" },\n", " },\n",
"}\n", "}\n",
"\n", "\n",
"# data = pd.read_csv('mushrooms.csv')\n",
"for key in NAMES_DICT.keys():\n", "for key in NAMES_DICT.keys():\n",
" mushrooms[key] = mushrooms[key].apply(lambda x: NAMES_DICT[key][x])\n", " mushrooms[key] = mushrooms[key].apply(lambda x: NAMES_DICT[key][x])\n",
"mushrooms.head()\n", "mushrooms.head()"
"# .drop(['veil-type'], axis=1)"
] ]
}, },
{ {
"cell_type": "markdown", "cell_type": "markdown",
"metadata": {}, "metadata": {
"collapsed": false,
"pycharm": {
"name": "#%% md\n"
}
},
"source": [ "source": [
"##### Features' distribution \n", "##### Features' distribution \n",
"\n", "\n",
@ -658,8 +682,13 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 6, "execution_count": 24,
"metadata": {}, "metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [ "outputs": [
{ {
"data": { "data": {
@ -2442,7 +2471,7 @@
" labels=[count for count in data[col].value_counts().index],\n", " labels=[count for count in data[col].value_counts().index],\n",
" values=[\n", " values=[\n",
" val for val in data[col].value_counts() * 100 /\n", " val for val in data[col].value_counts() * 100 /\n",
" sum(data[col].value_counts())\n", " sum(data[col].value_counts())\n",
" ],\n", " ],\n",
" name=col), a, b)\n", " name=col), a, b)\n",
" l.append(\n", " l.append(\n",
@ -2469,7 +2498,12 @@
}, },
{ {
"cell_type": "markdown", "cell_type": "markdown",
"metadata": {}, "metadata": {
"collapsed": false,
"pycharm": {
"name": "#%% md\n"
}
},
"source": [ "source": [
"##### Korelacja zmiennych\n", "##### Korelacja zmiennych\n",
"\n", "\n",
@ -2478,8 +2512,13 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 7, "execution_count": 25,
"metadata": {}, "metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [ "outputs": [
{ {
"data": { "data": {
@ -2502,8 +2541,8 @@
" phi2 = chi2 / n\n", " phi2 = chi2 / n\n",
" r, k = confusion_matrix.shape\n", " r, k = confusion_matrix.shape\n",
" phi2corr = max(0, phi2 - ((k - 1) * (r - 1)) / (n - 1))\n", " phi2corr = max(0, phi2 - ((k - 1) * (r - 1)) / (n - 1))\n",
" rcorr = r - ((r - 1)**2) / (n - 1)\n", " rcorr = r - ((r - 1) ** 2) / (n - 1)\n",
" kcorr = k - ((k - 1)**2) / (n - 1)\n", " kcorr = k - ((k - 1) ** 2) / (n - 1)\n",
" return np.sqrt(phi2corr / min((kcorr - 1), (rcorr - 1)))\n", " return np.sqrt(phi2corr / min((kcorr - 1), (rcorr - 1)))\n",
"\n", "\n",
"\n", "\n",
@ -2523,7 +2562,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 8, "execution_count": 26,
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [
{ {
@ -2564,6 +2603,15 @@
" plt.show()\n", " plt.show()\n",
"\n", "\n",
"\n", "\n",
"training_cols = [\n",
" 'odor',\n",
" 'spore-print-color',\n",
" 'gill-color',\n",
" 'ring-type',\n",
" 'stalk-surface-above-ring',\n",
" 'gill-size',\n",
"]\n",
"\n",
"plot_chosen_features(mushrooms,\n", "plot_chosen_features(mushrooms,\n",
" col='odor',\n", " col='odor',\n",
" labels=NAMES_DICT['odor'].values(),\n", " labels=NAMES_DICT['odor'].values(),\n",
@ -2572,7 +2620,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 9, "execution_count": 27,
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [
{ {
@ -2597,7 +2645,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 10, "execution_count": 28,
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [
{ {
@ -2622,7 +2670,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 11, "execution_count": 29,
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [
{ {
@ -2647,7 +2695,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 12, "execution_count": 30,
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [
{ {
@ -2672,7 +2720,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 13, "execution_count": 31,
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [
{ {
@ -2697,8 +2745,13 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 14, "execution_count": 32,
"metadata": {}, "metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [], "outputs": [],
"source": [ "source": [
"X_train, X_test = train_test_split(mushrooms,\n", "X_train, X_test = train_test_split(mushrooms,\n",
@ -2719,8 +2772,13 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 15, "execution_count": 33,
"metadata": {}, "metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [], "outputs": [],
"source": [ "source": [
"X_test_data = X_test[columns]\n", "X_test_data = X_test[columns]\n",
@ -2729,18 +2787,28 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 16, "execution_count": 34,
"metadata": {}, "metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [], "outputs": [],
"source": [ "source": [
"bayModel = NaiveBayes(classValue,className,columns,X_train)\n", "bayModel = NaiveBayes(classValue, className, columns, X_train)\n",
"model = bayModel.fitModel()\n" "model = bayModel.fitModel()\n"
] ]
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 18, "execution_count": 35,
"metadata": {}, "metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [ "outputs": [
{ {
"name": "stdout", "name": "stdout",
@ -2750,50 +2818,51 @@
"acuracy score = 0.9944903581267218\n", "acuracy score = 0.9944903581267218\n",
"\n", "\n",
"accuracy score losowych predykcji\n", "accuracy score losowych predykcji\n",
"acuracy score = 0.49931129476584024\n" "acuracy score = 0.48829201101928377\n"
] ]
} }
], ],
"source": [ "source": [
"pred = bayModel.predict(X_test[columns],model)\n", "pred = bayModel.predict(X_test[columns], model)\n",
"print('accuracy score naiwnego klasyfikatora')\n", "print('accuracy score naiwnego klasyfikatora')\n",
"print(\"acuracy score = \",accuracy_score(list(X_test_results),pred))\n", "print(\"acuracy score = \", accuracy_score(list(X_test_results), pred))\n",
"\n", "\n",
"print('\\naccuracy score losowych predykcji')\n", "print('\\naccuracy score losowych predykcji')\n",
"randomPred = ['poisonous' if random.randint(0,1) == 1 else 'edible' for _ in range(len(list(X_test_results)))]\n", "randomPred = ['poisonous' if random.randint(0, 1) == 1 else 'edible' for _ in range(len(list(X_test_results)))]\n",
"print(\"acuracy score = \",accuracy_score(list(X_test_results),randomPred))" "print(\"acuracy score = \", accuracy_score(list(X_test_results), randomPred))"
] ]
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 19, "execution_count": 36,
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [
{ {
"name": "stdout", "name": "stdout",
"output_type": "stream", "output_type": "stream",
"text": [ "text": [
"Accuracy bez \"odor\"\n", "accuracy score naiwnego klasyfikatora\n",
"acuracy score = 0.8657024793388429\n" "acuracy score = 0.8980716253443526\n"
] ]
} }
], ],
"source": [ "source": [
"without_odor = [\n", "columns_wihtout_odor = [\n",
" 'spore-print-color',\n",
" 'gill-color',\n", " 'gill-color',\n",
" 'ring-type',\n", " 'ring-type',\n",
" 'stalk-surface-above-ring',\n", " 'stalk-surface-above-ring',\n",
" 'gill-size',\n", " 'gill-size',\n",
"]\n", "]\n",
"\n", "\n",
"X_test_data = X_test[without_odor]\n", "X_test_data = X_test[columns_wihtout_odor]\n",
"X_test_results = X_test[className]\n", "X_test_results = X_test[className]\n",
"\n", "\n",
"bayModel = NaiveBayes(classValue,className,without_odor,X_train)\n", "bayModel = NaiveBayes(classValue, className, columns_wihtout_odor, X_train)\n",
"model = bayModel.fitModel()\n", "model = bayModel.fitModel()\n",
"pred = bayModel.predict(X_test[without_odor],model)\n", "pred = bayModel.predict(X_test[columns_wihtout_odor], model)\n",
"print('Accuracy bez \"odor\"')\n", "print('accuracy score naiwnego klasyfikatora')\n",
"print(\"acuracy score = \",accuracy_score(list(X_test_results),pred))\n" "print(\"acuracy score = \", accuracy_score(list(X_test_results), pred))\n"
] ]
} }
], ],