moj

2021-05-27 22:13:47 +02:00 · 2021-05-27 22:13:47 +02:00 · 18320ef656
commit 18320ef656
parent 9fcd29016f
1 changed files with 490 additions and 0 deletions
--- a/ow.ipynb
+++ b/ow.ipynb
@ -0,0 +1,490 @@
+{
+ "metadata": {
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.8.5"
+  },
+  "orig_nbformat": 2,
+  "kernelspec": {
+   "name": "python385jvsc74a57bd0916dbcbb3f70747c44a77c7bcd40155683ae19c65e1c03b4aa3499c5328201f1",
+   "display_name": "Python 3.8.5 64-bit"
+  },
+  "metadata": {
+   "interpreter": {
+    "hash": "916dbcbb3f70747c44a77c7bcd40155683ae19c65e1c03b4aa3499c5328201f1"
+   }
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2,
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 133,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import pandas as pd \n",
+    "import numpy as np\n",
+    "from sklearn.model_selection import train_test_split\n",
+    "\n",
+    "def NormalizeData(data, bmi_bins=(0, 21, 28, float('inf'))):\n",
+    "    \"\"\"Return a cleaned, categorical copy of the stroke dataset.\n",
+    "\n",
+    "    Steps applied to the columns that are present:\n",
+    "      * every object (string) column is lower-cased;\n",
+    "      * spaces in 'smoking_status' and hyphens in 'work_type' become '_';\n",
+    "      * 'bmi' is bucketed into the labels 'low' / 'mid' / 'high';\n",
+    "      * 0/1 flags in 'stroke', 'hypertension' and 'heart_disease'\n",
+    "        become 'no' / 'yes';\n",
+    "      * rows that still contain a missing value are dropped.\n",
+    "\n",
+    "    Parameters:\n",
+    "        data: pandas DataFrame to clean. It is not modified.\n",
+    "        bmi_bins: four increasing bin edges for the three BMI labels.\n",
+    "            The top edge defaults to infinity so that BMI values above 40\n",
+    "            fall into 'high' instead of turning into NaN and being removed\n",
+    "            by the final dropna().\n",
+    "\n",
+    "    Returns:\n",
+    "        A new DataFrame; the original index labels of kept rows are preserved.\n",
+    "    \"\"\"\n",
+    "    # Work on a copy so the caller's frame stays intact and the call is repeatable.\n",
+    "    data = data.copy()\n",
+    "\n",
+    "    for col in data.columns:\n",
+    "        if data[col].dtype == object:\n",
+    "            data[col] = data[col].str.lower()\n",
+    "\n",
+    "    if 'smoking_status' in data.columns:\n",
+    "        data['smoking_status'] = data['smoking_status'].str.replace(' ', '_', regex=False)\n",
+    "    if 'work_type' in data.columns:\n",
+    "        data['work_type'] = data['work_type'].str.replace('-', '_', regex=False)\n",
+    "\n",
+    "    if 'bmi' in data.columns:\n",
+    "        labels = ['low', 'mid', 'high']\n",
+    "        data['bmi'] = pd.cut(data['bmi'], bins=list(bmi_bins), labels=labels)\n",
+    "\n",
+    "    # All binary flags share the same 0/1 -> no/yes encoding.\n",
+    "    for col in ('stroke', 'hypertension', 'heart_disease'):\n",
+    "        if col in data.columns:\n",
+    "            data[col] = data[col].replace({1: 'yes', 0: 'no'})\n",
+    "\n",
+    "    return data.dropna()\n",
+    "\n",
+    "data = pd.read_csv(\"healthcare-dataset-stroke-data.csv\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 136,
+   "metadata": {},
+   "outputs": [
+    {
+     "output_type": "execute_result",
+     "data": {
+      "text/plain": [
+       "         id  gender   age hypertension heart_disease ever_married  \\\n",
+       "0      9046    male  67.0           no           yes          yes   \n",
+       "2     31112    male  80.0           no           yes          yes   \n",
+       "3     60182  female  49.0           no            no          yes   \n",
+       "4      1665  female  79.0          yes            no          yes   \n",
+       "5     56669    male  81.0           no            no          yes   \n",
+       "...     ...     ...   ...          ...           ...          ...   \n",
+       "5104  14180  female  13.0           no            no           no   \n",
+       "5106  44873  female  81.0           no            no          yes   \n",
+       "5107  19723  female  35.0           no            no          yes   \n",
+       "5108  37544    male  51.0           no            no          yes   \n",
+       "5109  44679  female  44.0           no            no          yes   \n",
+       "\n",
+       "          work_type Residence_type  avg_glucose_level   bmi   smoking_status  \\\n",
+       "0           private          urban             228.69  high  formerly_smoked   \n",
+       "2           private          rural             105.92  high     never_smoked   \n",
+       "3           private          urban             171.23  high           smokes   \n",
+       "4     self_employed          rural             174.12   mid     never_smoked   \n",
+       "5           private          urban             186.21  high  formerly_smoked   \n",
+       "...             ...            ...                ...   ...              ...   \n",
+       "5104       children          rural             103.08   low          unknown   \n",
+       "5106  self_employed          urban             125.20  high     never_smoked   \n",
+       "5107  self_employed          rural              82.99  high     never_smoked   \n",
+       "5108        private          rural             166.29   mid  formerly_smoked   \n",
+       "5109       govt_job          urban              85.28   mid          unknown   \n",
+       "\n",
+       "     stroke  \n",
+       "0       yes  \n",
+       "2       yes  \n",
+       "3       yes  \n",
+       "4       yes  \n",
+       "5       yes  \n",
+       "...     ...  \n",
+       "5104     no  \n",
+       "5106     no  \n",
+       "5107     no  \n",
+       "5108     no  \n",
+       "5109     no  \n",
+       "\n",
+       "[4501 rows x 12 columns]"
+      ],
+      "text/html": "<div>\n<style scoped>\n    .dataframe tbody tr th:only-of-type {\n        vertical-align: middle;\n    }\n\n    .dataframe tbody tr th {\n        vertical-align: top;\n    }\n\n    .dataframe thead th {\n        text-align: right;\n    }\n</style>\n<table border=\"1\" class=\"dataframe\">\n  <thead>\n    <tr style=\"text-align: right;\">\n      <th></th>\n      <th>id</th>\n      <th>gender</th>\n      <th>age</th>\n      <th>hypertension</th>\n      <th>heart_disease</th>\n      <th>ever_married</th>\n      <th>work_type</th>\n      <th>Residence_type</th>\n      <th>avg_glucose_level</th>\n      <th>bmi</th>\n      <th>smoking_status</th>\n      <th>stroke</th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>0</th>\n      <td>9046</td>\n      <td>male</td>\n      <td>67.0</td>\n      <td>no</td>\n      <td>yes</td>\n      <td>yes</td>\n      <td>private</td>\n      <td>urban</td>\n      <td>228.69</td>\n      <td>high</td>\n      <td>formerly_smoked</td>\n      <td>yes</td>\n    </tr>\n    <tr>\n      <th>2</th>\n      <td>31112</td>\n      <td>male</td>\n      <td>80.0</td>\n      <td>no</td>\n      <td>yes</td>\n      <td>yes</td>\n      <td>private</td>\n      <td>rural</td>\n      <td>105.92</td>\n      <td>high</td>\n      <td>never_smoked</td>\n      <td>yes</td>\n    </tr>\n    <tr>\n      <th>3</th>\n      <td>60182</td>\n      <td>female</td>\n      <td>49.0</td>\n      <td>no</td>\n      <td>no</td>\n      <td>yes</td>\n      <td>private</td>\n      <td>urban</td>\n      <td>171.23</td>\n      <td>high</td>\n      <td>smokes</td>\n      <td>yes</td>\n    </tr>\n    <tr>\n      <th>4</th>\n      <td>1665</td>\n      <td>female</td>\n      <td>79.0</td>\n      <td>yes</td>\n      <td>no</td>\n      <td>yes</td>\n      <td>self_employed</td>\n      <td>rural</td>\n      <td>174.12</td>\n      <td>mid</td>\n      <td>never_smoked</td>\n      <td>yes</td>\n    </tr>\n    <tr>\n      <th>5</th>\n      <td>56669</td>\n      
<td>male</td>\n      <td>81.0</td>\n      <td>no</td>\n      <td>no</td>\n      <td>yes</td>\n      <td>private</td>\n      <td>urban</td>\n      <td>186.21</td>\n      <td>high</td>\n      <td>formerly_smoked</td>\n      <td>yes</td>\n    </tr>\n    <tr>\n      <th>...</th>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n    </tr>\n    <tr>\n      <th>5104</th>\n      <td>14180</td>\n      <td>female</td>\n      <td>13.0</td>\n      <td>no</td>\n      <td>no</td>\n      <td>no</td>\n      <td>children</td>\n      <td>rural</td>\n      <td>103.08</td>\n      <td>low</td>\n      <td>unknown</td>\n      <td>no</td>\n    </tr>\n    <tr>\n      <th>5106</th>\n      <td>44873</td>\n      <td>female</td>\n      <td>81.0</td>\n      <td>no</td>\n      <td>no</td>\n      <td>yes</td>\n      <td>self_employed</td>\n      <td>urban</td>\n      <td>125.20</td>\n      <td>high</td>\n      <td>never_smoked</td>\n      <td>no</td>\n    </tr>\n    <tr>\n      <th>5107</th>\n      <td>19723</td>\n      <td>female</td>\n      <td>35.0</td>\n      <td>no</td>\n      <td>no</td>\n      <td>yes</td>\n      <td>self_employed</td>\n      <td>rural</td>\n      <td>82.99</td>\n      <td>high</td>\n      <td>never_smoked</td>\n      <td>no</td>\n    </tr>\n    <tr>\n      <th>5108</th>\n      <td>37544</td>\n      <td>male</td>\n      <td>51.0</td>\n      <td>no</td>\n      <td>no</td>\n      <td>yes</td>\n      <td>private</td>\n      <td>rural</td>\n      <td>166.29</td>\n      <td>mid</td>\n      <td>formerly_smoked</td>\n      <td>no</td>\n    </tr>\n    <tr>\n      <th>5109</th>\n      <td>44679</td>\n      <td>female</td>\n      <td>44.0</td>\n      <td>no</td>\n      <td>no</td>\n      <td>yes</td>\n      <td>govt_job</td>\n      <td>urban</td>\n      <td>85.28</td>\n      <td>mid</td>\n      
<td>unknown</td>\n      <td>no</td>\n    </tr>\n  </tbody>\n</table>\n<p>4501 rows × 12 columns</p>\n</div>"
+     },
+     "metadata": {},
+     "execution_count": 136
+    }
+   ],
+   "source": [
+    "data"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 135,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "data = NormalizeData(data)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 137,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "data = data[['gender', 'ever_married', 'Residence_type', 'bmi','smoking_status', 'work_type','stroke','hypertension','heart_disease']]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 138,
+   "metadata": {},
+   "outputs": [
+    {
+     "output_type": "execute_result",
+     "data": {
+      "text/plain": [
+       "      gender ever_married Residence_type   bmi   smoking_status  \\\n",
+       "0       male          yes          urban  high  formerly_smoked   \n",
+       "2       male          yes          rural  high     never_smoked   \n",
+       "3     female          yes          urban  high           smokes   \n",
+       "4     female          yes          rural   mid     never_smoked   \n",
+       "5       male          yes          urban  high  formerly_smoked   \n",
+       "...      ...          ...            ...   ...              ...   \n",
+       "5104  female           no          rural   low          unknown   \n",
+       "5106  female          yes          urban  high     never_smoked   \n",
+       "5107  female          yes          rural  high     never_smoked   \n",
+       "5108    male          yes          rural   mid  formerly_smoked   \n",
+       "5109  female          yes          urban   mid          unknown   \n",
+       "\n",
+       "          work_type stroke hypertension heart_disease  \n",
+       "0           private    yes           no           yes  \n",
+       "2           private    yes           no           yes  \n",
+       "3           private    yes           no            no  \n",
+       "4     self_employed    yes          yes            no  \n",
+       "5           private    yes           no            no  \n",
+       "...             ...    ...          ...           ...  \n",
+       "5104       children     no           no            no  \n",
+       "5106  self_employed     no           no            no  \n",
+       "5107  self_employed     no           no            no  \n",
+       "5108        private     no           no            no  \n",
+       "5109       govt_job     no           no            no  \n",
+       "\n",
+       "[4501 rows x 9 columns]"
+      ],
+      "text/html": "<div>\n<style scoped>\n    .dataframe tbody tr th:only-of-type {\n        vertical-align: middle;\n    }\n\n    .dataframe tbody tr th {\n        vertical-align: top;\n    }\n\n    .dataframe thead th {\n        text-align: right;\n    }\n</style>\n<table border=\"1\" class=\"dataframe\">\n  <thead>\n    <tr style=\"text-align: right;\">\n      <th></th>\n      <th>gender</th>\n      <th>ever_married</th>\n      <th>Residence_type</th>\n      <th>bmi</th>\n      <th>smoking_status</th>\n      <th>work_type</th>\n      <th>stroke</th>\n      <th>hypertension</th>\n      <th>heart_disease</th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>0</th>\n      <td>male</td>\n      <td>yes</td>\n      <td>urban</td>\n      <td>high</td>\n      <td>formerly_smoked</td>\n      <td>private</td>\n      <td>yes</td>\n      <td>no</td>\n      <td>yes</td>\n    </tr>\n    <tr>\n      <th>2</th>\n      <td>male</td>\n      <td>yes</td>\n      <td>rural</td>\n      <td>high</td>\n      <td>never_smoked</td>\n      <td>private</td>\n      <td>yes</td>\n      <td>no</td>\n      <td>yes</td>\n    </tr>\n    <tr>\n      <th>3</th>\n      <td>female</td>\n      <td>yes</td>\n      <td>urban</td>\n      <td>high</td>\n      <td>smokes</td>\n      <td>private</td>\n      <td>yes</td>\n      <td>no</td>\n      <td>no</td>\n    </tr>\n    <tr>\n      <th>4</th>\n      <td>female</td>\n      <td>yes</td>\n      <td>rural</td>\n      <td>mid</td>\n      <td>never_smoked</td>\n      <td>self_employed</td>\n      <td>yes</td>\n      <td>yes</td>\n      <td>no</td>\n    </tr>\n    <tr>\n      <th>5</th>\n      <td>male</td>\n      <td>yes</td>\n      <td>urban</td>\n      <td>high</td>\n      <td>formerly_smoked</td>\n      <td>private</td>\n      <td>yes</td>\n      <td>no</td>\n      <td>no</td>\n    </tr>\n    <tr>\n      <th>...</th>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n      
<td>...</td>\n      <td>...</td>\n      <td>...</td>\n    </tr>\n    <tr>\n      <th>5104</th>\n      <td>female</td>\n      <td>no</td>\n      <td>rural</td>\n      <td>low</td>\n      <td>unknown</td>\n      <td>children</td>\n      <td>no</td>\n      <td>no</td>\n      <td>no</td>\n    </tr>\n    <tr>\n      <th>5106</th>\n      <td>female</td>\n      <td>yes</td>\n      <td>urban</td>\n      <td>high</td>\n      <td>never_smoked</td>\n      <td>self_employed</td>\n      <td>no</td>\n      <td>no</td>\n      <td>no</td>\n    </tr>\n    <tr>\n      <th>5107</th>\n      <td>female</td>\n      <td>yes</td>\n      <td>rural</td>\n      <td>high</td>\n      <td>never_smoked</td>\n      <td>self_employed</td>\n      <td>no</td>\n      <td>no</td>\n      <td>no</td>\n    </tr>\n    <tr>\n      <th>5108</th>\n      <td>male</td>\n      <td>yes</td>\n      <td>rural</td>\n      <td>mid</td>\n      <td>formerly_smoked</td>\n      <td>private</td>\n      <td>no</td>\n      <td>no</td>\n      <td>no</td>\n    </tr>\n    <tr>\n      <th>5109</th>\n      <td>female</td>\n      <td>yes</td>\n      <td>urban</td>\n      <td>mid</td>\n      <td>unknown</td>\n      <td>govt_job</td>\n      <td>no</td>\n      <td>no</td>\n      <td>no</td>\n    </tr>\n  </tbody>\n</table>\n<p>4501 rows × 9 columns</p>\n</div>"
+     },
+     "metadata": {},
+     "execution_count": 138
+    }
+   ],
+   "source": [
+    "data"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 95,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "class  NaiveBayes:\n",
+    "\n",
+    "\t\"\"\"\n",
+    "\t\tCategorical Naive Bayes classifier.\n",
+    "\n",
+    "\t\tBayes Theorem:\n",
+    "\n",
+    "\t\t\tP(c|x) = P(x|c) * P(c) / P(x)\n",
+    "\n",
+    "\t\tP(c|x) is the posterior, P(x|c) the likelihood, P(c) the class prior\n",
+    "\t\tand P(x) the predictor prior (evidence). The per-feature terms are\n",
+    "\t\tmultiplied together, i.e. features are treated as independent.\n",
+    "\n",
+    "\t\tFeature values and class labels must be strings: likelihoods are\n",
+    "\t\tstored under keys of the form '<feature value>_<class label>'.\n",
+    "\t\"\"\"\n",
+    "\n",
+    "\tdef __init__(self):\n",
+    "\n",
+    "\t\t\"\"\"\n",
+    "\t\t\tAttributes:\n",
+    "\t\t\t\tlikelihoods: Likelihood of each feature value per class\n",
+    "\t\t\t\tclass_priors: Prior probabilities of classes\n",
+    "\t\t\t\tpred_priors: Prior probabilities of feature values\n",
+    "\t\t\t\tfeatures: Column names of the training features\n",
+    "\t\t\"\"\"\n",
+    "\t\tself.features = []\n",
+    "\t\tself.likelihoods = {}\n",
+    "\t\tself.class_priors = {}\n",
+    "\t\tself.pred_priors = {}\n",
+    "\n",
+    "\t\t# Filled in by fit(); real empty values rather than type objects.\n",
+    "\t\tself.X_train = None\n",
+    "\t\tself.y_train = None\n",
+    "\t\tself.train_size = 0\n",
+    "\t\tself.num_feats = 0\n",
+    "\n",
+    "\tdef fit(self, X, y):\n",
+    "\n",
+    "\t\t\"\"\"\n",
+    "\t\t\tEstimate class priors, likelihoods and predictor priors.\n",
+    "\n",
+    "\t\t\tX: DataFrame of string-valued features.\n",
+    "\t\t\ty: Series of string class labels sharing X's index.\n",
+    "\t\t\"\"\"\n",
+    "\n",
+    "\t\tself.features = list(X.columns)\n",
+    "\t\tself.X_train = X\n",
+    "\t\tself.y_train = y\n",
+    "\t\tself.train_size = X.shape[0]\n",
+    "\t\tself.num_feats = X.shape[1]\n",
+    "\n",
+    "\t\t# Start clean so a second fit() keeps nothing from earlier data.\n",
+    "\t\tself.likelihoods = {}\n",
+    "\t\tself.class_priors = {}\n",
+    "\t\tself.pred_priors = {}\n",
+    "\n",
+    "\t\toutcomes = np.unique(self.y_train)\n",
+    "\n",
+    "\t\tfor feature in self.features:\n",
+    "\t\t\tself.likelihoods[feature] = {}\n",
+    "\t\t\tself.pred_priors[feature] = {}\n",
+    "\n",
+    "\t\t\tfor feat_val in np.unique(self.X_train[feature]):\n",
+    "\t\t\t\tself.pred_priors[feature].update({feat_val: 0})\n",
+    "\n",
+    "\t\t\t\t# Pre-seed every (value, class) pair with 0 so pairs that\n",
+    "\t\t\t\t# never co-occur in the training data still have an entry.\n",
+    "\t\t\t\tfor outcome in outcomes:\n",
+    "\t\t\t\t\tself.likelihoods[feature].update({feat_val+'_'+outcome:0})\n",
+    "\n",
+    "\t\tself._calc_class_prior()\n",
+    "\t\tself._calc_likelihoods()\n",
+    "\t\tself._calc_predictor_prior()\n",
+    "\n",
+    "\tdef _calc_class_prior(self):\n",
+    "\n",
+    "\t\t\"\"\" P(c) - Prior Class Probability \"\"\"\n",
+    "\n",
+    "\t\tfor outcome in np.unique(self.y_train):\n",
+    "\t\t\toutcome_count = (self.y_train == outcome).sum()\n",
+    "\t\t\tself.class_priors[outcome] = outcome_count / self.train_size\n",
+    "\n",
+    "\tdef _calc_likelihoods(self):\n",
+    "\n",
+    "\t\t\"\"\" P(x|c) - Likelihood \"\"\"\n",
+    "\n",
+    "\t\tfor feature in self.features:\n",
+    "\n",
+    "\t\t\tfor outcome in np.unique(self.y_train):\n",
+    "\t\t\t\t# Boolean mask instead of a label list: each row is counted\n",
+    "\t\t\t\t# once even when the index contains repeated labels.\n",
+    "\t\t\t\tin_class = self.y_train == outcome\n",
+    "\t\t\t\toutcome_count = in_class.sum()\n",
+    "\t\t\t\tfeat_likelihood = self.X_train.loc[in_class, feature].value_counts().to_dict()\n",
+    "\n",
+    "\t\t\t\tfor feat_val, count in feat_likelihood.items():\n",
+    "\t\t\t\t\tself.likelihoods[feature][feat_val + '_' + outcome] = count/outcome_count\n",
+    "\n",
+    "\n",
+    "\tdef _calc_predictor_prior(self):\n",
+    "\n",
+    "\t\t\"\"\" P(x) - Evidence \"\"\"\n",
+    "\n",
+    "\t\tfor feature in self.features:\n",
+    "\t\t\tfeat_vals = self.X_train[feature].value_counts().to_dict()\n",
+    "\n",
+    "\t\t\tfor feat_val, count in feat_vals.items():\n",
+    "\t\t\t\tself.pred_priors[feature][feat_val] = count/self.train_size\n",
+    "\n",
+    "\n",
+    "\tdef predict(self, X):\n",
+    "\n",
+    "\t\t\"\"\"\n",
+    "\t\t\tCalculates Posterior probability P(c|x) for every row of X and\n",
+    "\t\t\treturns the most probable class per row as a numpy array.\n",
+    "\n",
+    "\t\t\tX: 2-D array-like; columns follow the order of self.features.\n",
+    "\t\t\tRaises KeyError when a row holds a feature value absent from fit().\n",
+    "\t\t\"\"\"\n",
+    "\n",
+    "\t\tresults = []\n",
+    "\t\tX = np.array(X)\n",
+    "\n",
+    "\t\t# class_priors is filled in np.unique order by fit(), so its keys\n",
+    "\t\t# replace a per-row np.unique(self.y_train) call.\n",
+    "\t\toutcomes = list(self.class_priors)\n",
+    "\n",
+    "\t\tfor query in X:\n",
+    "\t\t\tprobs_outcome = {}\n",
+    "\t\t\tfor outcome in outcomes:\n",
+    "\t\t\t\tprior = self.class_priors[outcome]\n",
+    "\t\t\t\tlikelihood = 1\n",
+    "\t\t\t\tevidence = 1\n",
+    "\n",
+    "\t\t\t\tfor feat, feat_val in zip(self.features, query):\n",
+    "\t\t\t\t\tif feat_val not in self.pred_priors[feat]:\n",
+    "\t\t\t\t\t\traise KeyError('value {!r} of feature {!r} was not seen during fit'.format(feat_val, feat))\n",
+    "\t\t\t\t\tlikelihood *= self.likelihoods[feat][feat_val + '_' + outcome]\n",
+    "\t\t\t\t\tevidence *= self.pred_priors[feat][feat_val]\n",
+    "\n",
+    "\t\t\t\t# A declared but unobserved categorical level has zero\n",
+    "\t\t\t\t# evidence; score it 0 instead of dividing by zero.\n",
+    "\t\t\t\tposterior = (likelihood * prior) / (evidence) if evidence else 0.0\n",
+    "\n",
+    "\t\t\t\tprobs_outcome[outcome] = posterior\n",
+    "\n",
+    "\t\t\tresult = max(probs_outcome, key = lambda x: probs_outcome[x])\n",
+    "\t\t\tresults.append(result)\n",
+    "\n",
+    "\t\treturn np.array(results)\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 99,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def accuracy_score(y_true, y_pred):\n",
+    "\n",
+    "\t\"\"\"\n",
+    "\t\tPercentage of predictions that equal the true label:\n",
+    "\n",
+    "\t\t\tscore = 100 * count(y_pred == y_true) / len(y_true)\n",
+    "\n",
+    "\t\tInputs are compared position by position (any pandas index is\n",
+    "\t\tignored), so lists, arrays and Series can be mixed freely.\n",
+    "\t\tReturns a float rounded to 2 decimals.\n",
+    "\t\tRaises ValueError when the two inputs differ in shape.\n",
+    "\t\"\"\"\n",
+    "\n",
+    "\ty_true = np.asarray(y_true)\n",
+    "\ty_pred = np.asarray(y_pred)\n",
+    "\tif y_true.shape != y_pred.shape:\n",
+    "\t\traise ValueError('y_true and y_pred must have the same shape')\n",
+    "\n",
+    "\thits = np.count_nonzero(y_pred == y_true)\n",
+    "\treturn round(float(hits)/float(len(y_true)) * 100 ,2)\n",
+    "\n",
+    "def pre_processing(df):\n",
+    "\n",
+    "\t\"\"\"\n",
+    "\t\tSplit a frame into features and target.\n",
+    "\n",
+    "\t\tThe last column of df is taken as the target; every other column\n",
+    "\t\tis a feature. Returns the pair (X, y) and leaves df untouched.\n",
+    "\t\"\"\"\n",
+    "\n",
+    "\ttarget_name = df.columns[-1]\n",
+    "\tfeatures = df.drop(columns=[target_name])\n",
+    "\ttarget = df[target_name]\n",
+    "\n",
+    "\treturn features, target"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 139,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Column groups shared by the train and the test partition.\n",
+    "feature_cols = ['gender', 'ever_married', 'Residence_type', 'bmi','smoking_status', 'work_type','hypertension','heart_disease']\n",
+    "target_col = 'stroke'\n",
+    "\n",
+    "# Fixed seed keeps the partition reproducible between runs.\n",
+    "data_train, data_test = train_test_split(data, random_state = 42)\n",
+    "\n",
+    "X_train, Y_train = data_train[feature_cols], data_train[target_col]\n",
+    "X_test, Y_test = data_test[feature_cols], data_test[target_col]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 141,
+   "metadata": {},
+   "outputs": [
+    {
+     "output_type": "stream",
+     "name": "stdout",
+     "text": [
+      "['male' 'female' 'other']\n['yes' 'no']\n['urban' 'rural']\n['high', 'mid', 'low']\nCategories (3, object): ['low' < 'mid' < 'high']\n['formerly_smoked' 'never_smoked' 'smokes' 'unknown']\n['private' 'self_employed' 'govt_job' 'children' 'never_worked']\n['no' 'yes']\n['yes' 'no']\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Print the distinct values of every feature column, one line per column.\n",
+    "for column in ['gender', 'ever_married', 'Residence_type', 'bmi', 'smoking_status', 'work_type', 'hypertension', 'heart_disease']:\n",
+    "    print(pd.unique(data[column]))\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 148,
+   "metadata": {},
+   "outputs": [
+    {
+     "output_type": "stream",
+     "name": "stdout",
+     "text": [
+      "Query 1:- [['male' 'no' 'urban' 'high' 'smokes' 'private' 'yes' 'yes']] ---> ['no']\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Train the classifier on the training partition.\n",
+    "nb_clf = NaiveBayes()\n",
+    "nb_clf.fit(X_train, Y_train)\n",
+    "\n",
+    "# Query 1: one hand-written sample. Values follow the column order of X_train:\n",
+    "# gender, ever_married, Residence_type, bmi, smoking_status, work_type,\n",
+    "# hypertension, heart_disease\n",
+    "query = np.array([['male','no', 'urban', 'high','smokes', 'private','yes','yes']])\n",
+    "print(\"Query 1:- {} ---> {}\".format(query, nb_clf.predict(query)))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 143,
+   "metadata": {},
+   "outputs": [
+    {
+     "output_type": "stream",
+     "name": "stdout",
+     "text": [
+      "Test Accuracy: 94.67\n"
+     ]
+    }
+   ],
+   "source": [
+    "print(\"Test Accuracy: {}\".format(accuracy_score(Y_test, nb_clf.predict(X_test))))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 157,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "lol = nb_clf.predict(X_test)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 158,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "lol =pd.DataFrame(data=lol)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 167,
+   "metadata": {},
+   "outputs": [
+    {
+     "output_type": "execute_result",
+     "data": {
+      "text/plain": [
+       "no     1072\n",
+       "yes      54\n",
+       "Name: stroke, dtype: int64"
+      ]
+     },
+     "metadata": {},
+     "execution_count": 167
+    }
+   ],
+   "source": [
+    "Y_test.value_counts()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 166,
+   "metadata": {},
+   "outputs": [
+    {
+     "output_type": "execute_result",
+     "data": {
+      "text/plain": [
+       "no     1116\n",
+       "yes      10\n",
+       "dtype: int64"
+      ]
+     },
+     "metadata": {},
+     "execution_count": 166
+    }
+   ],
+   "source": [
+    "lol.value_counts()"
+   ]
+  }
+ ]
+}