Merge branch 'master' of https://git.wmi.amu.edu.pl/s444452/mpsic_projekt_1_bayes_classifier

2022-05-18 16:57:38 +02:00 · 2022-05-18 16:57:38 +02:00 · 5a363e62f0
parent f533423f31 477156433f
commit 5a363e62f0
1 changed files with 131 additions and 62 deletions
--- a/projekt.ipynb
+++ b/projekt.ipynb
@ -142,13 +142,18 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 2,
-   "metadata": {},
+   "execution_count": 20,
+   "metadata": {
+    "collapsed": false,
+    "pycharm": {
+     "name": "#%%\n"
+    }
+   },
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "from sklearn.model_selection import train_test_split\n",
-    "from sklearn.metrics import accuracy_score,precision_score,recall_score,f1_score\n",
+    "from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score\n",
    "import scipy.stats as stats\n",
    "import numpy as np\n",
    "import plotly\n",
@ -160,17 +165,23 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 3,
-   "metadata": {},
+   "execution_count": 21,
+   "metadata": {
+    "collapsed": false,
+    "pycharm": {
+     "name": "#%%\n"
+    }
+   },
   "outputs": [],
   "source": [
    "class NaiveBayes():\n",
    "\n",
-    "    def __init__(self,classes,className,attribsNames,data):\n",
+    "    def __init__(self, classes, className, attribsNames, data):\n",
    "        self.classes = classes\n",
    "        self.className = className\n",
    "        self.attribsNames = attribsNames\n",
    "        self.data = data\n",
+    "\n",
    "    #przygotowanie prawdopodobienstw wartosci danych cech w zaleznosci od klasy\n",
    "    def getDictOfAttribProbs(self):\n",
    "        dictionaries = {}\n",
@ -189,24 +200,25 @@
    "        return dictionaries\n",
    "\n",
    "    #a priori dla klas\n",
-    "    def classProb(self,class_):\n",
+    "    def classProb(self, class_):\n",
    "        x = len(self.data[self.data[self.className] == class_][self.className])\n",
    "        y = len(self.data[self.className])\n",
    "        return x / y\n",
    "\n",
    "    #prawdopodobienstwo dla wartosci danej cechy w zaleznosci od klasy\n",
-    "    def getAttribProbs(self,attrib, value, data, clas, dictProbs):\n",
+    "    def getAttribProbs(self, attrib, value, data, clas, dictProbs):\n",
    "        return dictProbs[clas][attrib].get(value, 1.0 / len(data))\n",
    "\n",
    "    #a posteriori dla danego obiektu\n",
-    "    def getPosteriori(self,attribs, attribsNames, clas, dictProbs):\n",
+    "    def getPosteriori(self, attribs, attribsNames, clas, dictProbs):\n",
    "        dic = {}\n",
    "        for i in range(len(attribs)):\n",
    "            dic[attribsNames[i]] = attribs[i]\n",
    "        sum = 0.0\n",
    "        for key in dic:\n",
-    "            sum = sum + np.log(NaiveBayes.getAttribProbs(self,key, dic[key], X_train, clas, dictProbs))\n",
-    "        return sum + np.log(NaiveBayes.classProb(self,clas))\n",
+    "            sum = sum + np.log(self.getAttribProbs(key, dic[key], X_train, clas, dictProbs))\n",
+    "        return sum + np.log(self.classProb(clas))\n",
+    "\n",
    "    #predykcja dla danych\n",
    "    def predict(self, data, model):\n",
    "        attribNames = data.columns\n",
@ -214,21 +226,25 @@
    "        for i in range(len(data)):\n",
    "            probs = {}\n",
    "            for name in self.classes:\n",
-    "                probs[name] = NaiveBayes.getPosteriori(self,list(data.iloc[i]), list(attribNames),name, model)\n",
+    "                probs[name] = self.getPosteriori(list(data.iloc[i]), list(attribNames), name, model)\n",
    "            keyMax = max(zip(probs.values(), probs.keys()))[1]\n",
    "            predictions.append(keyMax)\n",
    "        return predictions\n",
-    "    \n",
+    "\n",
    "    def fitModel(self):\n",
-    "        model = NaiveBayes.getDictOfAttribProbs(self)\n",
-    "        return model\n",
-    "        "
+    "        probabilities = self.getDictOfAttribProbs()\n",
+    "        return probabilities\n"
   ]
  },
  {
   "cell_type": "code",
-   "execution_count": 4,
-   "metadata": {},
+   "execution_count": 22,
+   "metadata": {
+    "collapsed": false,
+    "pycharm": {
+     "name": "#%%\n"
+    }
+   },
   "outputs": [],
   "source": [
    "features = [\n",
@ -245,8 +261,13 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 5,
-   "metadata": {},
+   "execution_count": 23,
+   "metadata": {
+    "collapsed": false,
+    "pycharm": {
+     "name": "#%%\n"
+    }
+   },
   "outputs": [
    {
     "data": {
@ -457,7 +478,7 @@
       "[5 rows x 23 columns]"
      ]
     },
-     "execution_count": 5,
+     "execution_count": 23,
     "metadata": {},
     "output_type": "execute_result"
    }
@ -640,16 +661,19 @@
    "    },\n",
    "}\n",
    "\n",
-    "# data = pd.read_csv('mushrooms.csv')\n",
    "for key in NAMES_DICT.keys():\n",
    "    mushrooms[key] = mushrooms[key].apply(lambda x: NAMES_DICT[key][x])\n",
-    "mushrooms.head()\n",
-    "# .drop(['veil-type'], axis=1)"
+    "mushrooms.head()"
   ]
  },
  {
   "cell_type": "markdown",
-   "metadata": {},
+   "metadata": {
+    "collapsed": false,
+    "pycharm": {
+     "name": "#%% md\n"
+    }
+   },
   "source": [
    "##### Features' distribution \n",
    "\n",
@ -658,8 +682,13 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 6,
-   "metadata": {},
+   "execution_count": 24,
+   "metadata": {
+    "collapsed": false,
+    "pycharm": {
+     "name": "#%%\n"
+    }
+   },
   "outputs": [
    {
     "data": {
@ -2442,7 +2471,7 @@
    "                labels=[count for count in data[col].value_counts().index],\n",
    "                values=[\n",
    "                    val for val in data[col].value_counts() * 100 /\n",
-    "                    sum(data[col].value_counts())\n",
+    "                                   sum(data[col].value_counts())\n",
    "                ],\n",
    "                name=col), a, b)\n",
    "        l.append(\n",
@ -2469,7 +2498,12 @@
  },
  {
   "cell_type": "markdown",
-   "metadata": {},
+   "metadata": {
+    "collapsed": false,
+    "pycharm": {
+     "name": "#%% md\n"
+    }
+   },
   "source": [
    "##### Korelacja zmiennych\n",
    "\n",
@ -2478,8 +2512,13 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 7,
-   "metadata": {},
+   "execution_count": 25,
+   "metadata": {
+    "collapsed": false,
+    "pycharm": {
+     "name": "#%%\n"
+    }
+   },
   "outputs": [
    {
     "data": {
@ -2502,8 +2541,8 @@
    "    phi2 = chi2 / n\n",
    "    r, k = confusion_matrix.shape\n",
    "    phi2corr = max(0, phi2 - ((k - 1) * (r - 1)) / (n - 1))\n",
-    "    rcorr = r - ((r - 1)**2) / (n - 1)\n",
-    "    kcorr = k - ((k - 1)**2) / (n - 1)\n",
+    "    rcorr = r - ((r - 1) ** 2) / (n - 1)\n",
+    "    kcorr = k - ((k - 1) ** 2) / (n - 1)\n",
    "    return np.sqrt(phi2corr / min((kcorr - 1), (rcorr - 1)))\n",
    "\n",
    "\n",
@ -2523,7 +2562,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 8,
+   "execution_count": 26,
   "metadata": {},
   "outputs": [
    {
@ -2564,6 +2603,15 @@
    "    plt.show()\n",
    "\n",
    "\n",
+    "training_cols = [\n",
+    "    'odor',\n",
+    "    'spore-print-color',\n",
+    "    'gill-color',\n",
+    "    'ring-type',\n",
+    "    'stalk-surface-above-ring',\n",
+    "    'gill-size',\n",
+    "]\n",
+    "\n",
    "plot_chosen_features(mushrooms,\n",
    "                     col='odor',\n",
    "                     labels=NAMES_DICT['odor'].values(),\n",
@ -2572,7 +2620,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 9,
+   "execution_count": 27,
   "metadata": {},
   "outputs": [
    {
@ -2597,7 +2645,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 10,
+   "execution_count": 28,
   "metadata": {},
   "outputs": [
    {
@ -2622,7 +2670,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 11,
+   "execution_count": 29,
   "metadata": {},
   "outputs": [
    {
@ -2647,7 +2695,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 12,
+   "execution_count": 30,
   "metadata": {},
   "outputs": [
    {
@ -2672,7 +2720,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 13,
+   "execution_count": 31,
   "metadata": {},
   "outputs": [
    {
@ -2697,8 +2745,13 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 14,
-   "metadata": {},
+   "execution_count": 32,
+   "metadata": {
+    "collapsed": false,
+    "pycharm": {
+     "name": "#%%\n"
+    }
+   },
   "outputs": [],
   "source": [
    "X_train, X_test = train_test_split(mushrooms,\n",
@ -2719,8 +2772,13 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 15,
-   "metadata": {},
+   "execution_count": 33,
+   "metadata": {
+    "collapsed": false,
+    "pycharm": {
+     "name": "#%%\n"
+    }
+   },
   "outputs": [],
   "source": [
    "X_test_data = X_test[columns]\n",
@ -2729,18 +2787,28 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 16,
-   "metadata": {},
+   "execution_count": 34,
+   "metadata": {
+    "collapsed": false,
+    "pycharm": {
+     "name": "#%%\n"
+    }
+   },
   "outputs": [],
   "source": [
-    "bayModel = NaiveBayes(classValue,className,columns,X_train)\n",
+    "bayModel = NaiveBayes(classValue, className, columns, X_train)\n",
    "model = bayModel.fitModel()\n"
   ]
  },
  {
   "cell_type": "code",
-   "execution_count": 18,
-   "metadata": {},
+   "execution_count": 35,
+   "metadata": {
+    "collapsed": false,
+    "pycharm": {
+     "name": "#%%\n"
+    }
+   },
   "outputs": [
    {
     "name": "stdout",
@ -2750,50 +2818,51 @@
      "acuracy score =  0.9944903581267218\n",
      "\n",
      "accuracy score losowych predykcji\n",
-      "acuracy score =  0.49931129476584024\n"
+      "acuracy score =  0.48829201101928377\n"
     ]
    }
   ],
   "source": [
-    "pred = bayModel.predict(X_test[columns],model)\n",
+    "pred = bayModel.predict(X_test[columns], model)\n",
    "print('accuracy score naiwnego klasyfikatora')\n",
-    "print(\"acuracy score = \",accuracy_score(list(X_test_results),pred))\n",
+    "print(\"acuracy score = \", accuracy_score(list(X_test_results), pred))\n",
    "\n",
    "print('\\naccuracy score losowych predykcji')\n",
-    "randomPred = ['poisonous' if random.randint(0,1) == 1 else 'edible' for _ in range(len(list(X_test_results)))]\n",
-    "print(\"acuracy score = \",accuracy_score(list(X_test_results),randomPred))"
+    "randomPred = ['poisonous' if random.randint(0, 1) == 1 else 'edible' for _ in range(len(list(X_test_results)))]\n",
+    "print(\"acuracy score = \", accuracy_score(list(X_test_results), randomPred))"
   ]
  },
  {
   "cell_type": "code",
-   "execution_count": 19,
+   "execution_count": 36,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
-      "Accuracy bez \"odor\"\n",
-      "acuracy score =  0.8657024793388429\n"
+      "accuracy score naiwnego klasyfikatora\n",
+      "acuracy score =  0.8980716253443526\n"
     ]
    }
   ],
   "source": [
-    "without_odor = [\n",
+    "columns_wihtout_odor = [\n",
+    "    'spore-print-color',\n",
    "    'gill-color',\n",
    "    'ring-type',\n",
    "    'stalk-surface-above-ring',\n",
    "    'gill-size',\n",
    "]\n",
    "\n",
-    "X_test_data = X_test[without_odor]\n",
+    "X_test_data = X_test[columns_wihtout_odor]\n",
    "X_test_results = X_test[className]\n",
    "\n",
-    "bayModel = NaiveBayes(classValue,className,without_odor,X_train)\n",
+    "bayModel = NaiveBayes(classValue, className, columns_wihtout_odor, X_train)\n",
    "model = bayModel.fitModel()\n",
-    "pred = bayModel.predict(X_test[without_odor],model)\n",
-    "print('Accuracy bez \"odor\"')\n",
-    "print(\"acuracy score = \",accuracy_score(list(X_test_results),pred))\n"
+    "pred = bayModel.predict(X_test[columns_wihtout_odor], model)\n",
+    "print('accuracy score naiwnego klasyfikatora')\n",
+    "print(\"acuracy score = \", accuracy_score(list(X_test_results), pred))\n"
   ]
  }
 ],