moj

2021-05-27 22:13:47 +02:00 · 2021-05-27 22:13:47 +02:00 · 18320ef656
commit 18320ef656
parent 9fcd29016f
1 changed files with 490 additions and 0 deletions
--- a/ow.ipynb
+++ b/ow.ipynb
@ -0,0 +1,490 @@
 {
 "metadata": {
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.5"
  },
  "orig_nbformat": 2,
  "kernelspec": {
   "name": "python385jvsc74a57bd0916dbcbb3f70747c44a77c7bcd40155683ae19c65e1c03b4aa3499c5328201f1",
   "display_name": "Python 3.8.5 64-bit"
  },
  "metadata": {
   "interpreter": {
    "hash": "916dbcbb3f70747c44a77c7bcd40155683ae19c65e1c03b4aa3499c5328201f1"
   }
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2,
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 133,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd \n",
    "import numpy as np\n",
    "from sklearn.model_selection import train_test_split\n",
    "\n",
    "def NormalizeData(data):\n",
    "    for col in data.columns:\n",
    "        if data[col].dtype == object: \n",
    "            data[col] = data[col].str.lower()\n",
    "        if col == 'smoking_status':\n",
    "            data[col] = data[col].str.replace(\" \", \"_\")\n",
    "        if col == 'work_type':\n",
    "            data[col] = data[col].str.replace(\"-\", \"_\")\n",
    "        if col == 'bmi':\n",
    "            bins = [0, 21, 28, 40]\n",
    "            labels=['low','mid','high']\n",
    "            data[col] = pd.cut(data[col], bins=bins, labels=labels)\n",
    "        if col == 'stroke':\n",
    "            data[col] = data[col].replace({1: 'yes'})\n",
    "            data[col] = data[col].replace({0: 'no'})\n",
    "        if col == 'hypertension':\n",
    "            data[col] = data[col].replace({1: 'yes'})\n",
    "            data[col] = data[col].replace({0: 'no'})\n",
    "        if col == 'heart_disease':\n",
    "            data[col] = data[col].replace({1: 'yes'})\n",
    "            data[col] = data[col].replace({0: 'no'})\n",
    "    data = data.dropna()\n",
    "    return data\n",
    "\n",
    "data = pd.read_csv(\"healthcare-dataset-stroke-data.csv\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 136,
   "metadata": {},
   "outputs": [
    {
     "output_type": "execute_result",
     "data": {
      "text/plain": [
       "         id  gender   age hypertension heart_disease ever_married  \\\n",
       "0      9046    male  67.0           no           yes          yes   \n",
       "2     31112    male  80.0           no           yes          yes   \n",
       "3     60182  female  49.0           no            no          yes   \n",
       "4      1665  female  79.0          yes            no          yes   \n",
       "5     56669    male  81.0           no            no          yes   \n",
       "...     ...     ...   ...          ...           ...          ...   \n",
       "5104  14180  female  13.0           no            no           no   \n",
       "5106  44873  female  81.0           no            no          yes   \n",
       "5107  19723  female  35.0           no            no          yes   \n",
       "5108  37544    male  51.0           no            no          yes   \n",
       "5109  44679  female  44.0           no            no          yes   \n",
       "\n",
       "          work_type Residence_type  avg_glucose_level   bmi   smoking_status  \\\n",
       "0           private          urban             228.69  high  formerly_smoked   \n",
       "2           private          rural             105.92  high     never_smoked   \n",
       "3           private          urban             171.23  high           smokes   \n",
       "4     self_employed          rural             174.12   mid     never_smoked   \n",
       "5           private          urban             186.21  high  formerly_smoked   \n",
       "...             ...            ...                ...   ...              ...   \n",
       "5104       children          rural             103.08   low          unknown   \n",
       "5106  self_employed          urban             125.20  high     never_smoked   \n",
       "5107  self_employed          rural              82.99  high     never_smoked   \n",
       "5108        private          rural             166.29   mid  formerly_smoked   \n",
       "5109       govt_job          urban              85.28   mid          unknown   \n",
       "\n",
       "     stroke  \n",
       "0       yes  \n",
       "2       yes  \n",
       "3       yes  \n",
       "4       yes  \n",
       "5       yes  \n",
       "...     ...  \n",
       "5104     no  \n",
       "5106     no  \n",
       "5107     no  \n",
       "5108     no  \n",
       "5109     no  \n",
       "\n",
       "[4501 rows x 12 columns]"
      ],
      "text/html": "<div>\n<style scoped>\n    .dataframe tbody tr th:only-of-type {\n        vertical-align: middle;\n    }\n\n    .dataframe tbody tr th {\n        vertical-align: top;\n    }\n\n    .dataframe thead th {\n        text-align: right;\n    }\n</style>\n<table border=\"1\" class=\"dataframe\">\n  <thead>\n    <tr style=\"text-align: right;\">\n      <th></th>\n      <th>id</th>\n      <th>gender</th>\n      <th>age</th>\n      <th>hypertension</th>\n      <th>heart_disease</th>\n      <th>ever_married</th>\n      <th>work_type</th>\n      <th>Residence_type</th>\n      <th>avg_glucose_level</th>\n      <th>bmi</th>\n      <th>smoking_status</th>\n      <th>stroke</th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>0</th>\n      <td>9046</td>\n      <td>male</td>\n      <td>67.0</td>\n      <td>no</td>\n      <td>yes</td>\n      <td>yes</td>\n      <td>private</td>\n      <td>urban</td>\n      <td>228.69</td>\n      <td>high</td>\n      <td>formerly_smoked</td>\n      <td>yes</td>\n    </tr>\n    <tr>\n      <th>2</th>\n      <td>31112</td>\n      <td>male</td>\n      <td>80.0</td>\n      <td>no</td>\n      <td>yes</td>\n      <td>yes</td>\n      <td>private</td>\n      <td>rural</td>\n      <td>105.92</td>\n      <td>high</td>\n      <td>never_smoked</td>\n      <td>yes</td>\n    </tr>\n    <tr>\n      <th>3</th>\n      <td>60182</td>\n      <td>female</td>\n      <td>49.0</td>\n      <td>no</td>\n      <td>no</td>\n      <td>yes</td>\n      <td>private</td>\n      <td>urban</td>\n      <td>171.23</td>\n      <td>high</td>\n      <td>smokes</td>\n      <td>yes</td>\n    </tr>\n    <tr>\n      <th>4</th>\n      <td>1665</td>\n      <td>female</td>\n      <td>79.0</td>\n      <td>yes</td>\n      <td>no</td>\n      <td>yes</td>\n      <td>self_employed</td>\n      <td>rural</td>\n      <td>174.12</td>\n      <td>mid</td>\n      <td>never_smoked</td>\n      <td>yes</td>\n    </tr>\n    <tr>\n      <th>5</th>\n      <td>56669</td>\n      <td>male</td>\n      <td>81.0</td>\n      <td>no</td>\n      <td>no</td>\n      <td>yes</td>\n      <td>private</td>\n      <td>urban</td>\n      <td>186.21</td>\n      <td>high</td>\n      <td>formerly_smoked</td>\n      <td>yes</td>\n    </tr>\n    <tr>\n      <th>...</th>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n    </tr>\n    <tr>\n      <th>5104</th>\n      <td>14180</td>\n      <td>female</td>\n      <td>13.0</td>\n      <td>no</td>\n      <td>no</td>\n      <td>no</td>\n      <td>children</td>\n      <td>rural</td>\n      <td>103.08</td>\n      <td>low</td>\n      <td>unknown</td>\n      <td>no</td>\n    </tr>\n    <tr>\n      <th>5106</th>\n      <td>44873</td>\n      <td>female</td>\n      <td>81.0</td>\n      <td>no</td>\n      <td>no</td>\n      <td>yes</td>\n      <td>self_employed</td>\n      <td>urban</td>\n      <td>125.20</td>\n      <td>high</td>\n      <td>never_smoked</td>\n      <td>no</td>\n    </tr>\n    <tr>\n      <th>5107</th>\n      <td>19723</td>\n      <td>female</td>\n      <td>35.0</td>\n      <td>no</td>\n      <td>no</td>\n      <td>yes</td>\n      <td>self_employed</td>\n      <td>rural</td>\n      <td>82.99</td>\n      <td>high</td>\n      <td>never_smoked</td>\n      <td>no</td>\n    </tr>\n    <tr>\n      <th>5108</th>\n      <td>37544</td>\n      <td>male</td>\n      <td>51.0</td>\n      <td>no</td>\n      <td>no</td>\n      <td>yes</td>\n      <td>private</td>\n      <td>rural</td>\n      <td>166.29</td>\n      <td>mid</td>\n      <td>formerly_smoked</td>\n      <td>no</td>\n    </tr>\n    <tr>\n      <th>5109</th>\n      <td>44679</td>\n      <td>female</td>\n      <td>44.0</td>\n      <td>no</td>\n      <td>no</td>\n      <td>yes</td>\n      <td>govt_job</td>\n      <td>urban</td>\n      <td>85.28</td>\n      <td>mid</td>\n      <td>unknown</td>\n      <td>no</td>\n    </tr>\n  </tbody>\n</table>\n<p>4501 rows × 12 columns</p>\n</div>"
     },
     "metadata": {},
     "execution_count": 136
    }
   ],
   "source": [
    "data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 135,
   "metadata": {},
   "outputs": [],
   "source": [
    "data = NormalizeData(data)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 137,
   "metadata": {},
   "outputs": [],
   "source": [
    "data = data[['gender', 'ever_married', 'Residence_type', 'bmi','smoking_status', 'work_type','stroke','hypertension','heart_disease']]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 138,
   "metadata": {},
   "outputs": [
    {
     "output_type": "execute_result",
     "data": {
      "text/plain": [
       "      gender ever_married Residence_type   bmi   smoking_status  \\\n",
       "0       male          yes          urban  high  formerly_smoked   \n",
       "2       male          yes          rural  high     never_smoked   \n",
       "3     female          yes          urban  high           smokes   \n",
       "4     female          yes          rural   mid     never_smoked   \n",
       "5       male          yes          urban  high  formerly_smoked   \n",
       "...      ...          ...            ...   ...              ...   \n",
       "5104  female           no          rural   low          unknown   \n",
       "5106  female          yes          urban  high     never_smoked   \n",
       "5107  female          yes          rural  high     never_smoked   \n",
       "5108    male          yes          rural   mid  formerly_smoked   \n",
       "5109  female          yes          urban   mid          unknown   \n",
       "\n",
       "          work_type stroke hypertension heart_disease  \n",
       "0           private    yes           no           yes  \n",
       "2           private    yes           no           yes  \n",
       "3           private    yes           no            no  \n",
       "4     self_employed    yes          yes            no  \n",
       "5           private    yes           no            no  \n",
       "...             ...    ...          ...           ...  \n",
       "5104       children     no           no            no  \n",
       "5106  self_employed     no           no            no  \n",
       "5107  self_employed     no           no            no  \n",
       "5108        private     no           no            no  \n",
       "5109       govt_job     no           no            no  \n",
       "\n",
       "[4501 rows x 9 columns]"
      ],
      "text/html": "<div>\n<style scoped>\n    .dataframe tbody tr th:only-of-type {\n        vertical-align: middle;\n    }\n\n    .dataframe tbody tr th {\n        vertical-align: top;\n    }\n\n    .dataframe thead th {\n        text-align: right;\n    }\n</style>\n<table border=\"1\" class=\"dataframe\">\n  <thead>\n    <tr style=\"text-align: right;\">\n      <th></th>\n      <th>gender</th>\n      <th>ever_married</th>\n      <th>Residence_type</th>\n      <th>bmi</th>\n      <th>smoking_status</th>\n      <th>work_type</th>\n      <th>stroke</th>\n      <th>hypertension</th>\n      <th>heart_disease</th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>0</th>\n      <td>male</td>\n      <td>yes</td>\n      <td>urban</td>\n      <td>high</td>\n      <td>formerly_smoked</td>\n      <td>private</td>\n      <td>yes</td>\n      <td>no</td>\n      <td>yes</td>\n    </tr>\n    <tr>\n      <th>2</th>\n      <td>male</td>\n      <td>yes</td>\n      <td>rural</td>\n      <td>high</td>\n      <td>never_smoked</td>\n      <td>private</td>\n      <td>yes</td>\n      <td>no</td>\n      <td>yes</td>\n    </tr>\n    <tr>\n      <th>3</th>\n      <td>female</td>\n      <td>yes</td>\n      <td>urban</td>\n      <td>high</td>\n      <td>smokes</td>\n      <td>private</td>\n      <td>yes</td>\n      <td>no</td>\n      <td>no</td>\n    </tr>\n    <tr>\n      <th>4</th>\n      <td>female</td>\n      <td>yes</td>\n      <td>rural</td>\n      <td>mid</td>\n      <td>never_smoked</td>\n      <td>self_employed</td>\n      <td>yes</td>\n      <td>yes</td>\n      <td>no</td>\n    </tr>\n    <tr>\n      <th>5</th>\n      <td>male</td>\n      <td>yes</td>\n      <td>urban</td>\n      <td>high</td>\n      <td>formerly_smoked</td>\n      <td>private</td>\n      <td>yes</td>\n      <td>no</td>\n      <td>no</td>\n    </tr>\n    <tr>\n      <th>...</th>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n    </tr>\n    <tr>\n      <th>5104</th>\n      <td>female</td>\n      <td>no</td>\n      <td>rural</td>\n      <td>low</td>\n      <td>unknown</td>\n      <td>children</td>\n      <td>no</td>\n      <td>no</td>\n      <td>no</td>\n    </tr>\n    <tr>\n      <th>5106</th>\n      <td>female</td>\n      <td>yes</td>\n      <td>urban</td>\n      <td>high</td>\n      <td>never_smoked</td>\n      <td>self_employed</td>\n      <td>no</td>\n      <td>no</td>\n      <td>no</td>\n    </tr>\n    <tr>\n      <th>5107</th>\n      <td>female</td>\n      <td>yes</td>\n      <td>rural</td>\n      <td>high</td>\n      <td>never_smoked</td>\n      <td>self_employed</td>\n      <td>no</td>\n      <td>no</td>\n      <td>no</td>\n    </tr>\n    <tr>\n      <th>5108</th>\n      <td>male</td>\n      <td>yes</td>\n      <td>rural</td>\n      <td>mid</td>\n      <td>formerly_smoked</td>\n      <td>private</td>\n      <td>no</td>\n      <td>no</td>\n      <td>no</td>\n    </tr>\n    <tr>\n      <th>5109</th>\n      <td>female</td>\n      <td>yes</td>\n      <td>urban</td>\n      <td>mid</td>\n      <td>unknown</td>\n      <td>govt_job</td>\n      <td>no</td>\n      <td>no</td>\n      <td>no</td>\n    </tr>\n  </tbody>\n</table>\n<p>4501 rows × 9 columns</p>\n</div>"
     },
     "metadata": {},
     "execution_count": 138
    }
   ],
   "source": [
    "data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 95,
   "metadata": {},
   "outputs": [],
   "source": [
    "class  NaiveBayes:\n",
    "\n",
    "\t\"\"\"\n",
    "\t\tBayes Theorem:\n",
    "\t\t\t\t\t\t\t\t\t\tLikelihood * Class prior probability\n",
    "\t\t\t\tPosterior Probability = -------------------------------------\n",
    "\t\t\t\t\t\t\t\t\t\t\tPredictor prior probability\n",
    "\t\t\t\t\n",
    "\t\t\t\t\t\t\t  \t\t\t P(x|c) * p(c)\n",
    "\t\t\t\t\t\t\t   P(c|x) = ------------------ \n",
    "\t\t\t\t\t\t\t\t\t\t\t  P(x)\n",
    "\t\"\"\"\n",
    "\n",
    "\tdef __init__(self):\n",
    "\n",
    "\t\t\"\"\"\n",
    "\t\t\tAttributes:\n",
    "\t\t\t\tlikelihoods: Likelihood of each feature per class\n",
    "\t\t\t\tclass_priors: Prior probabilities of classes \n",
    "\t\t\t\tpred_priors: Prior probabilities of features \n",
    "\t\t\t\tfeatures: All features of dataset\n",
    "\t\t\"\"\"\n",
    "\t\tself.features = list\n",
    "\t\tself.likelihoods = {}\n",
    "\t\tself.class_priors = {}\n",
    "\t\tself.pred_priors = {}\n",
    "\n",
    "\t\tself.X_train = np.array\n",
    "\t\tself.y_train = np.array\n",
    "\t\tself.train_size = int\n",
    "\t\tself.num_feats = int\n",
    "\n",
    "\tdef fit(self, X, y):\n",
    "\n",
    "\t\tself.features = list(X.columns)\n",
    "\t\tself.X_train = X\n",
    "\t\tself.y_train = y\n",
    "\t\tself.train_size = X.shape[0]\n",
    "\t\tself.num_feats = X.shape[1]\n",
    "\n",
    "\t\tfor feature in self.features:\n",
    "\t\t\tself.likelihoods[feature] = {}\n",
    "\t\t\tself.pred_priors[feature] = {}\n",
    "\n",
    "\t\t\tfor feat_val in np.unique(self.X_train[feature]):\n",
    "\t\t\t\tself.pred_priors[feature].update({feat_val: 0})\n",
    "\n",
    "\t\t\t\tfor outcome in np.unique(self.y_train):\n",
    "\t\t\t\t\tself.likelihoods[feature].update({feat_val+'_'+outcome:0})\n",
    "\t\t\t\t\tself.class_priors.update({outcome: 0})\n",
    "\n",
    "\t\tself._calc_class_prior()\n",
    "\t\tself._calc_likelihoods()\n",
    "\t\tself._calc_predictor_prior()\n",
    "\n",
    "\tdef _calc_class_prior(self):\n",
    "\n",
    "\t\t\"\"\" P(c) - Prior Class Probability \"\"\"\n",
    "\n",
    "\t\tfor outcome in np.unique(self.y_train):\n",
    "\t\t\toutcome_count = sum(self.y_train == outcome)\n",
    "\t\t\tself.class_priors[outcome] = outcome_count / self.train_size\n",
    "\n",
    "\tdef _calc_likelihoods(self):\n",
    "\n",
    "\t\t\"\"\" P(x|c) - Likelihood \"\"\"\n",
    "\n",
    "\t\tfor feature in self.features:\n",
    "\n",
    "\t\t\tfor outcome in np.unique(self.y_train):\n",
    "\t\t\t\toutcome_count = sum(self.y_train == outcome)\n",
    "\t\t\t\tfeat_likelihood = self.X_train[feature][self.y_train[self.y_train == outcome].index.values.tolist()].value_counts().to_dict()\n",
    "\n",
    "\t\t\t\tfor feat_val, count in feat_likelihood.items():\n",
    "\t\t\t\t\tself.likelihoods[feature][feat_val + '_' + outcome] = count/outcome_count\n",
    "\n",
    "\n",
    "\tdef _calc_predictor_prior(self):\n",
    "\n",
    "\t\t\"\"\" P(x) - Evidence \"\"\"\n",
    "\n",
    "\t\tfor feature in self.features:\n",
    "\t\t\tfeat_vals = self.X_train[feature].value_counts().to_dict()\n",
    "\n",
    "\t\t\tfor feat_val, count in feat_vals.items():\n",
    "\t\t\t\tself.pred_priors[feature][feat_val] = count/self.train_size\n",
    "\n",
    "\n",
    "\tdef predict(self, X):\n",
    "\n",
    "\t\t\"\"\" Calculates Posterior probability P(c|x) \"\"\"\n",
    "\n",
    "\t\tresults = []\n",
    "\t\tX = np.array(X)\n",
    "\n",
    "\t\tfor query in X:\n",
    "\t\t\tprobs_outcome = {}\n",
    "\t\t\tfor outcome in np.unique(self.y_train):\n",
    "\t\t\t\tprior = self.class_priors[outcome]\n",
    "\t\t\t\tlikelihood = 1\n",
    "\t\t\t\tevidence = 1\n",
    "\n",
    "\t\t\t\tfor feat, feat_val in zip(self.features, query):\n",
    "\t\t\t\t\tlikelihood *= self.likelihoods[feat][feat_val + '_' + outcome]\n",
    "\t\t\t\t\tevidence *= self.pred_priors[feat][feat_val]\n",
    "\n",
    "\t\t\t\tposterior = (likelihood * prior) / (evidence)\n",
    "\n",
    "\t\t\t\tprobs_outcome[outcome] = posterior\n",
    "\n",
    "\t\t\tresult = max(probs_outcome, key = lambda x: probs_outcome[x])\n",
    "\t\t\tresults.append(result)\n",
    "\n",
    "\t\treturn np.array(results)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 99,
   "metadata": {},
   "outputs": [],
   "source": [
    "def accuracy_score(y_true, y_pred):\n",
    "\n",
    "\t\"\"\"\tscore = (y_true - y_pred) / len(y_true) \"\"\"\n",
    "\n",
    "\treturn round(float(sum(y_pred == y_true))/float(len(y_true)) * 100 ,2)\n",
    "\n",
    "def pre_processing(df):\n",
    "\n",
    "\t\"\"\" partioning data into features and target \"\"\"\n",
    "\n",
    "\tX = df.drop([df.columns[-1]], axis = 1)\n",
    "\ty = df[df.columns[-1]]\n",
    "\n",
    "\treturn X, y"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 139,
   "metadata": {},
   "outputs": [],
   "source": [
    "data_train, data_test = train_test_split(data, random_state = 42)\n",
    "\n",
    "X_train =data_train[['gender', 'ever_married', 'Residence_type', 'bmi','smoking_status', 'work_type','hypertension','heart_disease']]\n",
    "Y_train = data_train['stroke']\n",
    "\n",
    "X_test =data_test[['gender', 'ever_married', 'Residence_type', 'bmi','smoking_status', 'work_type','hypertension','heart_disease']]\n",
    "Y_test = data_test['stroke']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 141,
   "metadata": {},
   "outputs": [
    {
     "output_type": "stream",
     "name": "stdout",
     "text": [
      "['male' 'female' 'other']\n['yes' 'no']\n['urban' 'rural']\n['high', 'mid', 'low']\nCategories (3, object): ['low' < 'mid' < 'high']\n['formerly_smoked' 'never_smoked' 'smokes' 'unknown']\n['private' 'self_employed' 'govt_job' 'children' 'never_worked']\n['no' 'yes']\n['yes' 'no']\n"
     ]
    }
   ],
   "source": [
    "print(pd.unique(data['gender']))\n",
    "print(pd.unique(data['ever_married']))\n",
    "print(pd.unique(data['Residence_type']))\n",
    "print(pd.unique(data['bmi']))\n",
    "print(pd.unique(data['smoking_status']))\n",
    "print(pd.unique(data['work_type']))\n",
    "print(pd.unique(data['hypertension']))\n",
    "print(pd.unique(data['heart_disease']))\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 148,
   "metadata": {},
   "outputs": [
    {
     "output_type": "stream",
     "name": "stdout",
     "text": [
      "Query 1:- [['male' 'no' 'urban' 'high' 'smokes' 'private' 'yes' 'yes']] ---> ['no']\n"
     ]
    }
   ],
   "source": [
    "# X = data[['gender', 'ever_married', 'Residence_type', 'bmi','smoking_status', 'work_type']]\n",
    "# y = data['stroke']\n",
    "\n",
    "\n",
    "nb_clf = NaiveBayes()\n",
    "nb_clf.fit(X_train, Y_train)\n",
    "\n",
    "#\n",
    "\t\n",
    "#Query 1:\n",
    "query = np.array([['male','no', 'urban', 'high','smokes', 'private','yes','yes']])\n",
    "print(\"Query 1:- {} ---> {}\".format(query, nb_clf.predict(query)))\n",
    "\n",
    "# #Query 2:\n",
    "# query = np.array([['Overcast','Cool', 'Normal', 't']])\n",
    "# print(\"Query 2:- {} ---> {}\".format(query, nb_clf.predict(query)))\n",
    "\n",
    "# #Query 3:\n",
    "# query = np.array([['Sunny','Hot', 'High', 't']])\n",
    "# print(\"Query 3:- {} ---> {}\".format(query, nb_clf.predict(query)))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 143,
   "metadata": {},
   "outputs": [
    {
     "output_type": "stream",
     "name": "stdout",
     "text": [
      "Test Accuracy: 94.67\n"
     ]
    }
   ],
   "source": [
    "print(\"Test Accuracy: {}\".format(accuracy_score(Y_test, nb_clf.predict(X_test))))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 157,
   "metadata": {},
   "outputs": [],
   "source": [
    "lol = nb_clf.predict(X_test)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 158,
   "metadata": {},
   "outputs": [],
   "source": [
    "lol =pd.DataFrame(data=lol)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 167,
   "metadata": {},
   "outputs": [
    {
     "output_type": "execute_result",
     "data": {
      "text/plain": [
       "no     1072\n",
       "yes      54\n",
       "Name: stroke, dtype: int64"
      ]
     },
     "metadata": {},
     "execution_count": 167
    }
   ],
   "source": [
    "Y_test.value_counts()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 166,
   "metadata": {},
   "outputs": [
    {
     "output_type": "execute_result",
     "data": {
      "text/plain": [
       "no     1116\n",
       "yes      10\n",
       "dtype: int64"
      ]
     },
     "metadata": {},
     "execution_count": 166
    }
   ],
   "source": [
    "lol.value_counts()"
   ]
  }
 ]
 }