{ "metadata": { "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.8.5" }, "orig_nbformat": 2, "kernelspec": { "name": "python385jvsc74a57bd0916dbcbb3f70747c44a77c7bcd40155683ae19c65e1c03b4aa3499c5328201f1", "display_name": "Python 3.8.5 64-bit" }, "metadata": { "interpreter": { "hash": "916dbcbb3f70747c44a77c7bcd40155683ae19c65e1c03b4aa3499c5328201f1" } } }, "nbformat": 4, "nbformat_minor": 2, "cells": [ { "cell_type": "code", "execution_count": 133, "metadata": {}, "outputs": [], "source": [ "import pandas as pd \n", "import numpy as np\n", "from sklearn.model_selection import train_test_split\n", "\n", "def NormalizeData(data):\n", " for col in data.columns:\n", " if data[col].dtype == object: \n", " data[col] = data[col].str.lower()\n", " if col == 'smoking_status':\n", " data[col] = data[col].str.replace(\" \", \"_\")\n", " if col == 'work_type':\n", " data[col] = data[col].str.replace(\"-\", \"_\")\n", " if col == 'bmi':\n", " bins = [0, 21, 28, 40]\n", " labels=['low','mid','high']\n", " data[col] = pd.cut(data[col], bins=bins, labels=labels)\n", " if col == 'stroke':\n", " data[col] = data[col].replace({1: 'yes'})\n", " data[col] = data[col].replace({0: 'no'})\n", " if col == 'hypertension':\n", " data[col] = data[col].replace({1: 'yes'})\n", " data[col] = data[col].replace({0: 'no'})\n", " if col == 'heart_disease':\n", " data[col] = data[col].replace({1: 'yes'})\n", " data[col] = data[col].replace({0: 'no'})\n", " data = data.dropna()\n", " return data\n", "\n", "data = pd.read_csv(\"healthcare-dataset-stroke-data.csv\")" ] }, { "cell_type": "code", "execution_count": 136, "metadata": {}, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ " id gender age hypertension heart_disease ever_married \\\n", "0 9046 male 67.0 no yes yes \n", "2 31112 male 80.0 no yes yes \n", "3 60182 female 49.0 no no yes \n", "4 1665 female 79.0 yes no yes \n", "5 56669 male 81.0 no no yes \n", "... ... ... ... ... ... ... \n", "5104 14180 female 13.0 no no no \n", "5106 44873 female 81.0 no no yes \n", "5107 19723 female 35.0 no no yes \n", "5108 37544 male 51.0 no no yes \n", "5109 44679 female 44.0 no no yes \n", "\n", " work_type Residence_type avg_glucose_level bmi smoking_status \\\n", "0 private urban 228.69 high formerly_smoked \n", "2 private rural 105.92 high never_smoked \n", "3 private urban 171.23 high smokes \n", "4 self_employed rural 174.12 mid never_smoked \n", "5 private urban 186.21 high formerly_smoked \n", "... ... ... ... ... ... \n", "5104 children rural 103.08 low unknown \n", "5106 self_employed urban 125.20 high never_smoked \n", "5107 self_employed rural 82.99 high never_smoked \n", "5108 private rural 166.29 mid formerly_smoked \n", "5109 govt_job urban 85.28 mid unknown \n", "\n", " stroke \n", "0 yes \n", "2 yes \n", "3 yes \n", "4 yes \n", "5 yes \n", "... ... \n", "5104 no \n", "5106 no \n", "5107 no \n", "5108 no \n", "5109 no \n", "\n", "[4501 rows x 12 columns]" ], "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
idgenderagehypertensionheart_diseaseever_marriedwork_typeResidence_typeavg_glucose_levelbmismoking_statusstroke
09046male67.0noyesyesprivateurban228.69highformerly_smokedyes
231112male80.0noyesyesprivaterural105.92highnever_smokedyes
360182female49.0nonoyesprivateurban171.23highsmokesyes
41665female79.0yesnoyesself_employedrural174.12midnever_smokedyes
556669male81.0nonoyesprivateurban186.21highformerly_smokedyes
.......................................
510414180female13.0nononochildrenrural103.08lowunknownno
510644873female81.0nonoyesself_employedurban125.20highnever_smokedno
510719723female35.0nonoyesself_employedrural82.99highnever_smokedno
510837544male51.0nonoyesprivaterural166.29midformerly_smokedno
510944679female44.0nonoyesgovt_joburban85.28midunknownno
\n

4501 rows × 12 columns

\n
" }, "metadata": {}, "execution_count": 136 } ], "source": [ "data" ] }, { "cell_type": "code", "execution_count": 135, "metadata": {}, "outputs": [], "source": [ "data = NormalizeData(data)" ] }, { "cell_type": "code", "execution_count": 137, "metadata": {}, "outputs": [], "source": [ "data = data[['gender', 'ever_married', 'Residence_type', 'bmi','smoking_status', 'work_type','stroke','hypertension','heart_disease']]" ] }, { "cell_type": "code", "execution_count": 138, "metadata": {}, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ " gender ever_married Residence_type bmi smoking_status \\\n", "0 male yes urban high formerly_smoked \n", "2 male yes rural high never_smoked \n", "3 female yes urban high smokes \n", "4 female yes rural mid never_smoked \n", "5 male yes urban high formerly_smoked \n", "... ... ... ... ... ... \n", "5104 female no rural low unknown \n", "5106 female yes urban high never_smoked \n", "5107 female yes rural high never_smoked \n", "5108 male yes rural mid formerly_smoked \n", "5109 female yes urban mid unknown \n", "\n", " work_type stroke hypertension heart_disease \n", "0 private yes no yes \n", "2 private yes no yes \n", "3 private yes no no \n", "4 self_employed yes yes no \n", "5 private yes no no \n", "... ... ... ... ... \n", "5104 children no no no \n", "5106 self_employed no no no \n", "5107 self_employed no no no \n", "5108 private no no no \n", "5109 govt_job no no no \n", "\n", "[4501 rows x 9 columns]" ], "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
genderever_marriedResidence_typebmismoking_statuswork_typestrokehypertensionheart_disease
0maleyesurbanhighformerly_smokedprivateyesnoyes
2maleyesruralhighnever_smokedprivateyesnoyes
3femaleyesurbanhighsmokesprivateyesnono
4femaleyesruralmidnever_smokedself_employedyesyesno
5maleyesurbanhighformerly_smokedprivateyesnono
..............................
5104femalenorurallowunknownchildrennonono
5106femaleyesurbanhighnever_smokedself_employednonono
5107femaleyesruralhighnever_smokedself_employednonono
5108maleyesruralmidformerly_smokedprivatenonono
5109femaleyesurbanmidunknowngovt_jobnonono
\n

4501 rows × 9 columns

\n
" }, "metadata": {}, "execution_count": 138 } ], "source": [ "data" ] }, { "cell_type": "code", "execution_count": 95, "metadata": {}, "outputs": [], "source": [ "class NaiveBayes:\n", "\n", "\t\"\"\"\n", "\t\tBayes Theorem:\n", "\t\t\t\t\t\t\t\t\t\tLikelihood * Class prior probability\n", "\t\t\t\tPosterior Probability = -------------------------------------\n", "\t\t\t\t\t\t\t\t\t\t\tPredictor prior probability\n", "\t\t\t\t\n", "\t\t\t\t\t\t\t \t\t\t P(x|c) * p(c)\n", "\t\t\t\t\t\t\t P(c|x) = ------------------ \n", "\t\t\t\t\t\t\t\t\t\t\t P(x)\n", "\t\"\"\"\n", "\n", "\tdef __init__(self):\n", "\n", "\t\t\"\"\"\n", "\t\t\tAttributes:\n", "\t\t\t\tlikelihoods: Likelihood of each feature per class\n", "\t\t\t\tclass_priors: Prior probabilities of classes \n", "\t\t\t\tpred_priors: Prior probabilities of features \n", "\t\t\t\tfeatures: All features of dataset\n", "\t\t\"\"\"\n", "\t\tself.features = list\n", "\t\tself.likelihoods = {}\n", "\t\tself.class_priors = {}\n", "\t\tself.pred_priors = {}\n", "\n", "\t\tself.X_train = np.array\n", "\t\tself.y_train = np.array\n", "\t\tself.train_size = int\n", "\t\tself.num_feats = int\n", "\n", "\tdef fit(self, X, y):\n", "\n", "\t\tself.features = list(X.columns)\n", "\t\tself.X_train = X\n", "\t\tself.y_train = y\n", "\t\tself.train_size = X.shape[0]\n", "\t\tself.num_feats = X.shape[1]\n", "\n", "\t\tfor feature in self.features:\n", "\t\t\tself.likelihoods[feature] = {}\n", "\t\t\tself.pred_priors[feature] = {}\n", "\n", "\t\t\tfor feat_val in np.unique(self.X_train[feature]):\n", "\t\t\t\tself.pred_priors[feature].update({feat_val: 0})\n", "\n", "\t\t\t\tfor outcome in np.unique(self.y_train):\n", "\t\t\t\t\tself.likelihoods[feature].update({feat_val+'_'+outcome:0})\n", "\t\t\t\t\tself.class_priors.update({outcome: 0})\n", "\n", "\t\tself._calc_class_prior()\n", "\t\tself._calc_likelihoods()\n", "\t\tself._calc_predictor_prior()\n", "\n", "\tdef _calc_class_prior(self):\n", "\n", "\t\t\"\"\" P(c) - Prior Class Probability \"\"\"\n", "\n", "\t\tfor outcome in np.unique(self.y_train):\n", "\t\t\toutcome_count = sum(self.y_train == outcome)\n", "\t\t\tself.class_priors[outcome] = outcome_count / self.train_size\n", "\n", "\tdef _calc_likelihoods(self):\n", "\n", "\t\t\"\"\" P(x|c) - Likelihood \"\"\"\n", "\n", "\t\tfor feature in self.features:\n", "\n", "\t\t\tfor outcome in np.unique(self.y_train):\n", "\t\t\t\toutcome_count = sum(self.y_train == outcome)\n", "\t\t\t\tfeat_likelihood = self.X_train[feature][self.y_train[self.y_train == outcome].index.values.tolist()].value_counts().to_dict()\n", "\n", "\t\t\t\tfor feat_val, count in feat_likelihood.items():\n", "\t\t\t\t\tself.likelihoods[feature][feat_val + '_' + outcome] = count/outcome_count\n", "\n", "\n", "\tdef _calc_predictor_prior(self):\n", "\n", "\t\t\"\"\" P(x) - Evidence \"\"\"\n", "\n", "\t\tfor feature in self.features:\n", "\t\t\tfeat_vals = self.X_train[feature].value_counts().to_dict()\n", "\n", "\t\t\tfor feat_val, count in feat_vals.items():\n", "\t\t\t\tself.pred_priors[feature][feat_val] = count/self.train_size\n", "\n", "\n", "\tdef predict(self, X):\n", "\n", "\t\t\"\"\" Calculates Posterior probability P(c|x) \"\"\"\n", "\n", "\t\tresults = []\n", "\t\tX = np.array(X)\n", "\n", "\t\tfor query in X:\n", "\t\t\tprobs_outcome = {}\n", "\t\t\tfor outcome in np.unique(self.y_train):\n", "\t\t\t\tprior = self.class_priors[outcome]\n", "\t\t\t\tlikelihood = 1\n", "\t\t\t\tevidence = 1\n", "\n", "\t\t\t\tfor feat, feat_val in zip(self.features, query):\n", "\t\t\t\t\tlikelihood *= self.likelihoods[feat][feat_val + '_' + outcome]\n", "\t\t\t\t\tevidence *= self.pred_priors[feat][feat_val]\n", "\n", "\t\t\t\tposterior = (likelihood * prior) / (evidence)\n", "\n", "\t\t\t\tprobs_outcome[outcome] = posterior\n", "\n", "\t\t\tresult = max(probs_outcome, key = lambda x: probs_outcome[x])\n", "\t\t\tresults.append(result)\n", "\n", "\t\treturn np.array(results)\n" ] }, { "cell_type": "code", "execution_count": 99, "metadata": {}, "outputs": [], "source": [ "def accuracy_score(y_true, y_pred):\n", "\n", "\t\"\"\"\tscore = (y_true - y_pred) / len(y_true) \"\"\"\n", "\n", "\treturn round(float(sum(y_pred == y_true))/float(len(y_true)) * 100 ,2)\n", "\n", "def pre_processing(df):\n", "\n", "\t\"\"\" partioning data into features and target \"\"\"\n", "\n", "\tX = df.drop([df.columns[-1]], axis = 1)\n", "\ty = df[df.columns[-1]]\n", "\n", "\treturn X, y" ] }, { "cell_type": "code", "execution_count": 139, "metadata": {}, "outputs": [], "source": [ "data_train, data_test = train_test_split(data, random_state = 42)\n", "\n", "X_train =data_train[['gender', 'ever_married', 'Residence_type', 'bmi','smoking_status', 'work_type','hypertension','heart_disease']]\n", "Y_train = data_train['stroke']\n", "\n", "X_test =data_test[['gender', 'ever_married', 'Residence_type', 'bmi','smoking_status', 'work_type','hypertension','heart_disease']]\n", "Y_test = data_test['stroke']" ] }, { "cell_type": "code", "execution_count": 141, "metadata": {}, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "['male' 'female' 'other']\n['yes' 'no']\n['urban' 'rural']\n['high', 'mid', 'low']\nCategories (3, object): ['low' < 'mid' < 'high']\n['formerly_smoked' 'never_smoked' 'smokes' 'unknown']\n['private' 'self_employed' 'govt_job' 'children' 'never_worked']\n['no' 'yes']\n['yes' 'no']\n" ] } ], "source": [ "print(pd.unique(data['gender']))\n", "print(pd.unique(data['ever_married']))\n", "print(pd.unique(data['Residence_type']))\n", "print(pd.unique(data['bmi']))\n", "print(pd.unique(data['smoking_status']))\n", "print(pd.unique(data['work_type']))\n", "print(pd.unique(data['hypertension']))\n", "print(pd.unique(data['heart_disease']))\n" ] }, { "cell_type": "code", "execution_count": 148, "metadata": {}, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "Query 1:- [['male' 'no' 'urban' 'high' 'smokes' 'private' 'yes' 'yes']] ---> ['no']\n" ] } ], "source": [ "# X = data[['gender', 'ever_married', 'Residence_type', 'bmi','smoking_status', 'work_type']]\n", "# y = data['stroke']\n", "\n", "\n", "nb_clf = NaiveBayes()\n", "nb_clf.fit(X_train, Y_train)\n", "\n", "#\n", "\t\n", "#Query 1:\n", "query = np.array([['male','no', 'urban', 'high','smokes', 'private','yes','yes']])\n", "print(\"Query 1:- {} ---> {}\".format(query, nb_clf.predict(query)))\n", "\n", "# #Query 2:\n", "# query = np.array([['Overcast','Cool', 'Normal', 't']])\n", "# print(\"Query 2:- {} ---> {}\".format(query, nb_clf.predict(query)))\n", "\n", "# #Query 3:\n", "# query = np.array([['Sunny','Hot', 'High', 't']])\n", "# print(\"Query 3:- {} ---> {}\".format(query, nb_clf.predict(query)))" ] }, { "cell_type": "code", "execution_count": 143, "metadata": {}, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "Test Accuracy: 94.67\n" ] } ], "source": [ "print(\"Test Accuracy: {}\".format(accuracy_score(Y_test, nb_clf.predict(X_test))))" ] }, { "cell_type": "code", "execution_count": 157, "metadata": {}, "outputs": [], "source": [ "lol = nb_clf.predict(X_test)" ] }, { "cell_type": "code", "execution_count": 158, "metadata": {}, "outputs": [], "source": [ "lol =pd.DataFrame(data=lol)" ] }, { "cell_type": "code", "execution_count": 167, "metadata": {}, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "no 1072\n", "yes 54\n", "Name: stroke, dtype: int64" ] }, "metadata": {}, "execution_count": 167 } ], "source": [ "Y_test.value_counts()" ] }, { "cell_type": "code", "execution_count": 166, "metadata": {}, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "no 1116\n", "yes 10\n", "dtype: int64" ] }, "metadata": {}, "execution_count": 166 } ], "source": [ "lol.value_counts()" ] } ] }