diff --git a/ow.ipynb b/ow.ipynb new file mode 100644 index 0000000..b1c7569 --- /dev/null +++ b/ow.ipynb @@ -0,0 +1,490 @@ +{ + "metadata": { + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.5" + }, + "orig_nbformat": 2, + "kernelspec": { + "name": "python385jvsc74a57bd0916dbcbb3f70747c44a77c7bcd40155683ae19c65e1c03b4aa3499c5328201f1", + "display_name": "Python 3.8.5 64-bit" + }, + "metadata": { + "interpreter": { + "hash": "916dbcbb3f70747c44a77c7bcd40155683ae19c65e1c03b4aa3499c5328201f1" + } + } + }, + "nbformat": 4, + "nbformat_minor": 2, + "cells": [ + { + "cell_type": "code", + "execution_count": 133, + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd \n", + "import numpy as np\n", + "from sklearn.model_selection import train_test_split\n", + "\n", + "def NormalizeData(data):\n", + " for col in data.columns:\n", + " if data[col].dtype == object: \n", + " data[col] = data[col].str.lower()\n", + " if col == 'smoking_status':\n", + " data[col] = data[col].str.replace(\" \", \"_\")\n", + " if col == 'work_type':\n", + " data[col] = data[col].str.replace(\"-\", \"_\")\n", + " if col == 'bmi':\n", + " bins = [0, 21, 28, 40]\n", + " labels=['low','mid','high']\n", + " data[col] = pd.cut(data[col], bins=bins, labels=labels)\n", + " if col == 'stroke':\n", + " data[col] = data[col].replace({1: 'yes'})\n", + " data[col] = data[col].replace({0: 'no'})\n", + " if col == 'hypertension':\n", + " data[col] = data[col].replace({1: 'yes'})\n", + " data[col] = data[col].replace({0: 'no'})\n", + " if col == 'heart_disease':\n", + " data[col] = data[col].replace({1: 'yes'})\n", + " data[col] = data[col].replace({0: 'no'})\n", + " data = data.dropna()\n", + " return data\n", + "\n", + "data = pd.read_csv(\"healthcare-dataset-stroke-data.csv\")" + ] + }, + { + 
"cell_type": "code", + "execution_count": 136, + "metadata": {}, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + " id gender age hypertension heart_disease ever_married \\\n", + "0 9046 male 67.0 no yes yes \n", + "2 31112 male 80.0 no yes yes \n", + "3 60182 female 49.0 no no yes \n", + "4 1665 female 79.0 yes no yes \n", + "5 56669 male 81.0 no no yes \n", + "... ... ... ... ... ... ... \n", + "5104 14180 female 13.0 no no no \n", + "5106 44873 female 81.0 no no yes \n", + "5107 19723 female 35.0 no no yes \n", + "5108 37544 male 51.0 no no yes \n", + "5109 44679 female 44.0 no no yes \n", + "\n", + " work_type Residence_type avg_glucose_level bmi smoking_status \\\n", + "0 private urban 228.69 high formerly_smoked \n", + "2 private rural 105.92 high never_smoked \n", + "3 private urban 171.23 high smokes \n", + "4 self_employed rural 174.12 mid never_smoked \n", + "5 private urban 186.21 high formerly_smoked \n", + "... ... ... ... ... ... \n", + "5104 children rural 103.08 low unknown \n", + "5106 self_employed urban 125.20 high never_smoked \n", + "5107 self_employed rural 82.99 high never_smoked \n", + "5108 private rural 166.29 mid formerly_smoked \n", + "5109 govt_job urban 85.28 mid unknown \n", + "\n", + " stroke \n", + "0 yes \n", + "2 yes \n", + "3 yes \n", + "4 yes \n", + "5 yes \n", + "... ... \n", + "5104 no \n", + "5106 no \n", + "5107 no \n", + "5108 no \n", + "5109 no \n", + "\n", + "[4501 rows x 12 columns]" + ], + "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
idgenderagehypertensionheart_diseaseever_marriedwork_typeResidence_typeavg_glucose_levelbmismoking_statusstroke
09046male67.0noyesyesprivateurban228.69highformerly_smokedyes
231112male80.0noyesyesprivaterural105.92highnever_smokedyes
360182female49.0nonoyesprivateurban171.23highsmokesyes
41665female79.0yesnoyesself_employedrural174.12midnever_smokedyes
556669male81.0nonoyesprivateurban186.21highformerly_smokedyes
.......................................
510414180female13.0nononochildrenrural103.08lowunknownno
510644873female81.0nonoyesself_employedurban125.20highnever_smokedno
510719723female35.0nonoyesself_employedrural82.99highnever_smokedno
510837544male51.0nonoyesprivaterural166.29midformerly_smokedno
510944679female44.0nonoyesgovt_joburban85.28midunknownno
\n

4501 rows × 12 columns

\n
# %%
# Normalise before inspecting. The stored preview was produced from the
# normalised frame, so this cell has to come first for a top-to-bottom
# "Restart & Run All" to reproduce it.
# NOTE: this rebinds `data`; re-load the CSV before re-running this cell if the
# raw frame is needed again.
data = NormalizeData(data)

# %%
# Preview only the first rows instead of dumping the whole frame.
data.head()

# %%
# Keep the categorical predictors plus the `stroke` target; `id` and the
# continuous `age` / `avg_glucose_level` columns are left out.
data = data[['gender', 'ever_married', 'Residence_type', 'bmi','smoking_status', 'work_type','stroke','hypertension','heart_disease']]
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
genderever_marriedResidence_typebmismoking_statuswork_typestrokehypertensionheart_disease
0maleyesurbanhighformerly_smokedprivateyesnoyes
2maleyesruralhighnever_smokedprivateyesnoyes
3femaleyesurbanhighsmokesprivateyesnono
4femaleyesruralmidnever_smokedself_employedyesyesno
5maleyesurbanhighformerly_smokedprivateyesnono
..............................
5104femalenorurallowunknownchildrennonono
5106femaleyesurbanhighnever_smokedself_employednonono
5107femaleyesruralhighnever_smokedself_employednonono
5108maleyesruralmidformerly_smokedprivatenonono
5109femaleyesurbanmidunknowngovt_jobnonono
\n

4501 rows × 9 columns

\n
class NaiveBayes:
    """Naive Bayes classifier for purely categorical features.

    Bayes' theorem:

                                 likelihood * class prior
        posterior probability = --------------------------
                                    predictor prior

                    P(x|c) * P(c)
        P(c|x)  =  ---------------
                        P(x)

    Every feature is treated as conditionally independent given the class,
    so P(x|c) is the product of the per-feature likelihoods and P(x) is
    approximated by the product of the per-feature marginals.

    Attributes:
        alpha: additive (Laplace) smoothing strength used for likelihoods.
        features: feature (column) names seen by the last ``fit``.
        likelihoods: ``{feature: {'<value>_<class>': P(value | class)}}``.
        class_priors: ``{class: P(class)}``.
        pred_priors: ``{feature: {value: P(value)}}``.
        X_train, y_train: the training data exactly as passed to ``fit``.
        train_size: number of training rows.
        num_feats: number of feature columns.
    """

    def __init__(self, alpha=0.0):
        """Create an unfitted classifier.

        Args:
            alpha: additive smoothing applied to the likelihood counts.
                ``0.0`` (the default) gives plain relative frequencies, so a
                (value, class) pair never observed together has likelihood 0
                and vetoes that class. A positive value such as ``1.0``
                keeps those likelihoods strictly positive.
        """
        self.alpha = alpha

        self.features = []
        self.likelihoods = {}
        self.class_priors = {}
        self.pred_priors = {}

        self.X_train = None
        self.y_train = None
        self.train_size = 0
        self.num_feats = 0

    @staticmethod
    def _key(feat_val, outcome):
        """Build the ``'<value>_<class>'`` key used inside ``likelihoods``.

        Formatting (rather than ``+``) lets non-string values and labels be
        used while producing the same keys as before for strings.
        """
        return "{}_{}".format(feat_val, outcome)

    def _labels(self):
        """Return the training labels as a flat array, ignoring any index."""
        return np.asarray(self.y_train).ravel()

    def fit(self, X, y):
        """Estimate priors and likelihoods from the training data.

        Rows of ``X`` and entries of ``y`` are matched by position, not by
        index label. All tables are rebuilt from scratch, so fitting again
        on different data leaves nothing behind from the previous fit.

        Args:
            X: pandas DataFrame, one categorical column per feature.
            y: class labels, one per row of ``X`` (Series, array or list).

        Raises:
            ValueError: if ``X`` and ``y`` have different lengths.
        """
        labels = np.asarray(y).ravel()
        if labels.shape[0] != X.shape[0]:
            raise ValueError(
                "X has {} rows but y has {} labels".format(X.shape[0], labels.shape[0])
            )

        self.features = list(X.columns)
        self.X_train = X
        self.y_train = y
        self.train_size = X.shape[0]
        self.num_feats = X.shape[1]

        classes = np.unique(labels)
        self.class_priors = {outcome: 0 for outcome in classes}
        self.likelihoods = {}
        self.pred_priors = {}

        for feature in self.features:
            feat_vals = np.unique(np.asarray(X[feature]))
            self.pred_priors[feature] = {feat_val: 0 for feat_val in feat_vals}
            self.likelihoods[feature] = {
                self._key(feat_val, outcome): 0
                for feat_val in feat_vals
                for outcome in classes
            }

        self._calc_class_prior()
        self._calc_likelihoods()
        self._calc_predictor_prior()

    def _calc_class_prior(self):
        """P(c) - relative frequency of every class in the training labels."""
        labels = self._labels()

        for outcome in np.unique(labels):
            outcome_count = int(np.count_nonzero(labels == outcome))
            self.class_priors[outcome] = outcome_count / self.train_size

    def _calc_likelihoods(self):
        """P(x|c) - frequency of every feature value within every class.

        With smoothing the estimate is
        ``(count + alpha) / (class_count + alpha * n_values)`` where
        ``n_values`` is the number of distinct training values of the feature.
        """
        labels = self._labels()
        classes = np.unique(labels)

        for feature in self.features:
            column = np.asarray(self.X_train[feature])
            feat_vals = np.unique(column)
            table = self.likelihoods.setdefault(feature, {})

            for outcome in classes:
                # Positional boolean mask: immune to index mismatches or
                # duplicate index labels between X and y.
                in_class = labels == outcome
                outcome_count = int(np.count_nonzero(in_class))
                seen, counts = np.unique(column[in_class], return_counts=True)
                observed = dict(zip(seen, counts))
                denominator = outcome_count + self.alpha * len(feat_vals)

                for feat_val in feat_vals:
                    numerator = observed.get(feat_val, 0) + self.alpha
                    table[self._key(feat_val, outcome)] = numerator / denominator

    def _calc_predictor_prior(self):
        """P(x) - relative frequency of every value of every feature."""
        for feature in self.features:
            column = np.asarray(self.X_train[feature])
            seen, counts = np.unique(column, return_counts=True)
            table = self.pred_priors.setdefault(feature, {})

            for feat_val, count in zip(seen, counts):
                table[feat_val] = count / self.train_size

    def predict(self, X):
        """Return the class with the highest posterior P(c|x) for each row.

        Args:
            X: 2-D array-like or DataFrame whose columns are in the same
                order as the training features. A single 1-D query is
                treated as one row.

        Returns:
            numpy array with one predicted label per row. Ties go to the
            class that sorts first.

        Raises:
            KeyError: if a row contains a feature value that was not present
                in the training data.
        """
        results = []
        queries = np.atleast_2d(np.array(X))
        classes = np.unique(self._labels())  # hoisted: identical for every row

        for query in queries:
            probs_outcome = {}

            for outcome in classes:
                prior = self.class_priors[outcome]
                likelihood = 1
                evidence = 1

                for feat, feat_val in zip(self.features, query):
                    try:
                        likelihood *= self.likelihoods[feat][self._key(feat_val, outcome)]
                        evidence *= self.pred_priors[feat][feat_val]
                    except KeyError:
                        raise KeyError(
                            "value {!r} of feature {!r} was not seen during fit".format(feat_val, feat)
                        ) from None

                # `evidence` is the same for every class, so it rescales the
                # scores without changing which class wins.
                probs_outcome[outcome] = (likelihood * prior) / evidence

            results.append(max(probs_outcome, key=probs_outcome.get))

        return np.array(results)


def accuracy_score(y_true, y_pred):
    """Percentage of predictions that equal the true label.

    score = 100 * (number of positions where y_pred == y_true) / len(y_true),
    rounded to two decimals. Inputs are compared by position, so pandas
    index labels are ignored.

    Raises:
        ValueError: if the two inputs differ in length.
        ZeroDivisionError: if the inputs are empty.
    """
    truth = np.asarray(y_true).ravel()
    guess = np.asarray(y_pred).ravel()
    if truth.shape != guess.shape:
        raise ValueError(
            "y_true has {} entries but y_pred has {}".format(truth.size, guess.size)
        )

    matches = int(np.count_nonzero(truth == guess))
    return round(matches / truth.size * 100, 2)


def pre_processing(df):
    """Partition a frame into features and target.

    The LAST column of ``df`` is taken as the target; every other column is
    a feature. Reorder the columns first if the target is not last.

    Returns:
        (X, y): feature DataFrame and target Series.
    """
    target_name = df.columns[-1]
    X = df.drop([target_name], axis=1)
    y = df[target_name]

    return X, y
# %%
# Remaining category listings (vocabulary available for hand-written queries).
for column in ('smoking_status', 'work_type', 'hypertension', 'heart_disease'):
    print(pd.unique(data[column]))

# %%
nb_clf = NaiveBayes()
nb_clf.fit(X_train, Y_train)

# Query 1: values must follow the column order of X_train
# (gender, ever_married, Residence_type, bmi, smoking_status, work_type,
#  hypertension, heart_disease).
query = np.array([['male','no', 'urban', 'high','smokes', 'private','yes','yes']])
print("Query 1:- {} ---> {}".format(query, nb_clf.predict(query)))

# %%
# Predict once and reuse the result for every metric below.
Y_pred = nb_clf.predict(X_test)
print("Test Accuracy: {}".format(accuracy_score(Y_test, Y_pred)))

# %%
# Class balance of the held-out labels. In the stored run this was 1072 'no'
# vs 54 'yes', so always answering 'no' already scores about 95.2 %, which is
# above the 94.67 % reported for the model. Accuracy alone says little here.
Y_test.value_counts()

# %%
# Distribution of the predicted labels, to compare with the true balance above.
pd.Series(Y_pred, name='predicted').value_counts()