moj
This commit is contained in:
parent
9fcd29016f
commit
18320ef656
490
ow.ipynb
Normal file
490
ow.ipynb
Normal file
@ -0,0 +1,490 @@
|
|||||||
|
{
|
||||||
|
"metadata": {
|
||||||
|
"language_info": {
|
||||||
|
"codemirror_mode": {
|
||||||
|
"name": "ipython",
|
||||||
|
"version": 3
|
||||||
|
},
|
||||||
|
"file_extension": ".py",
|
||||||
|
"mimetype": "text/x-python",
|
||||||
|
"name": "python",
|
||||||
|
"nbconvert_exporter": "python",
|
||||||
|
"pygments_lexer": "ipython3",
|
||||||
|
"version": "3.8.5"
|
||||||
|
},
|
||||||
|
"orig_nbformat": 2,
|
||||||
|
"kernelspec": {
|
||||||
|
"name": "python385jvsc74a57bd0916dbcbb3f70747c44a77c7bcd40155683ae19c65e1c03b4aa3499c5328201f1",
|
||||||
|
"display_name": "Python 3.8.5 64-bit"
|
||||||
|
},
|
||||||
|
"metadata": {
|
||||||
|
"interpreter": {
|
||||||
|
"hash": "916dbcbb3f70747c44a77c7bcd40155683ae19c65e1c03b4aa3499c5328201f1"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"nbformat": 4,
|
||||||
|
"nbformat_minor": 2,
|
||||||
|
"cells": [
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 133,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"import pandas as pd \n",
|
||||||
|
"import numpy as np\n",
|
||||||
|
"from sklearn.model_selection import train_test_split\n",
|
||||||
|
"\n",
|
||||||
def NormalizeData(data, bmi_bins=(0, 21, 28, 40), bmi_labels=('low', 'mid', 'high')):
    """Return a cleaned, fully categorical copy of the stroke dataset.

    The input frame is left untouched; all work happens on a copy.

    Steps applied, column by column:
      * every text column is lower-cased;
      * ``smoking_status``: spaces become underscores;
      * ``work_type``: hyphens become underscores;
      * ``bmi``: numeric values are bucketed with ``pd.cut`` into
        ``bmi_labels`` using the right-inclusive edges in ``bmi_bins``;
      * ``stroke``, ``hypertension``, ``heart_disease``: 1 -> 'yes', 0 -> 'no'.

    Finally every row that contains a missing value is dropped.

    NOTE: ``pd.cut`` yields NaN for values outside the outermost bin edges,
    so with the default bins any row whose BMI is <= 0 or > 40 is removed by
    the final ``dropna``. Pass e.g. ``bmi_bins=(0, 21, 28, float('inf'))``
    to keep high-BMI rows in the top bucket.

    Parameters
    ----------
    data : pandas.DataFrame
        Raw records. Columns other than the ones named above are only
        lower-cased (if textual) and otherwise passed through.
    bmi_bins : sequence of numbers, optional
        Bin edges handed to ``pd.cut`` for the ``bmi`` column.
    bmi_labels : sequence of str, optional
        One label per bin (``len(bmi_bins) - 1`` entries).

    Returns
    -------
    pandas.DataFrame
        A new frame with the transformations applied and incomplete rows
        removed. The original index labels of the surviving rows are kept.
    """
    yes_no = {1: 'yes', 0: 'no'}
    underscore_instead_of = {'smoking_status': " ", 'work_type': "-"}
    binary_flags = ('stroke', 'hypertension', 'heart_disease')

    # Work on a copy so the caller's frame is never modified in place.
    frame = data.copy()

    for col in frame.columns:
        col_dtype = frame[col].dtype
        # Covers classic object columns as well as pandas' dedicated string dtype.
        if col_dtype == object or pd.api.types.is_string_dtype(col_dtype):
            frame[col] = frame[col].str.lower()

        if col in underscore_instead_of:
            frame[col] = frame[col].str.replace(underscore_instead_of[col], "_", regex=False)

        # Only bin raw numbers: an already-binned (categorical) column is left
        # alone, which makes the function safe to run on its own output.
        if col == 'bmi' and pd.api.types.is_numeric_dtype(frame[col]):
            frame[col] = pd.cut(frame[col], bins=list(bmi_bins), labels=list(bmi_labels))

        if col in binary_flags:
            frame[col] = frame[col].replace(yes_no)

    return frame.dropna()
|
||||||
|
"\n",
|
||||||
|
"data = pd.read_csv(\"healthcare-dataset-stroke-data.csv\")"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 136,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"output_type": "execute_result",
|
||||||
|
"data": {
|
||||||
|
"text/plain": [
|
||||||
|
" id gender age hypertension heart_disease ever_married \\\n",
|
||||||
|
"0 9046 male 67.0 no yes yes \n",
|
||||||
|
"2 31112 male 80.0 no yes yes \n",
|
||||||
|
"3 60182 female 49.0 no no yes \n",
|
||||||
|
"4 1665 female 79.0 yes no yes \n",
|
||||||
|
"5 56669 male 81.0 no no yes \n",
|
||||||
|
"... ... ... ... ... ... ... \n",
|
||||||
|
"5104 14180 female 13.0 no no no \n",
|
||||||
|
"5106 44873 female 81.0 no no yes \n",
|
||||||
|
"5107 19723 female 35.0 no no yes \n",
|
||||||
|
"5108 37544 male 51.0 no no yes \n",
|
||||||
|
"5109 44679 female 44.0 no no yes \n",
|
||||||
|
"\n",
|
||||||
|
" work_type Residence_type avg_glucose_level bmi smoking_status \\\n",
|
||||||
|
"0 private urban 228.69 high formerly_smoked \n",
|
||||||
|
"2 private rural 105.92 high never_smoked \n",
|
||||||
|
"3 private urban 171.23 high smokes \n",
|
||||||
|
"4 self_employed rural 174.12 mid never_smoked \n",
|
||||||
|
"5 private urban 186.21 high formerly_smoked \n",
|
||||||
|
"... ... ... ... ... ... \n",
|
||||||
|
"5104 children rural 103.08 low unknown \n",
|
||||||
|
"5106 self_employed urban 125.20 high never_smoked \n",
|
||||||
|
"5107 self_employed rural 82.99 high never_smoked \n",
|
||||||
|
"5108 private rural 166.29 mid formerly_smoked \n",
|
||||||
|
"5109 govt_job urban 85.28 mid unknown \n",
|
||||||
|
"\n",
|
||||||
|
" stroke \n",
|
||||||
|
"0 yes \n",
|
||||||
|
"2 yes \n",
|
||||||
|
"3 yes \n",
|
||||||
|
"4 yes \n",
|
||||||
|
"5 yes \n",
|
||||||
|
"... ... \n",
|
||||||
|
"5104 no \n",
|
||||||
|
"5106 no \n",
|
||||||
|
"5107 no \n",
|
||||||
|
"5108 no \n",
|
||||||
|
"5109 no \n",
|
||||||
|
"\n",
|
||||||
|
"[4501 rows x 12 columns]"
|
||||||
|
],
|
||||||
|
"text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>id</th>\n <th>gender</th>\n <th>age</th>\n <th>hypertension</th>\n <th>heart_disease</th>\n <th>ever_married</th>\n <th>work_type</th>\n <th>Residence_type</th>\n <th>avg_glucose_level</th>\n <th>bmi</th>\n <th>smoking_status</th>\n <th>stroke</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>0</th>\n <td>9046</td>\n <td>male</td>\n <td>67.0</td>\n <td>no</td>\n <td>yes</td>\n <td>yes</td>\n <td>private</td>\n <td>urban</td>\n <td>228.69</td>\n <td>high</td>\n <td>formerly_smoked</td>\n <td>yes</td>\n </tr>\n <tr>\n <th>2</th>\n <td>31112</td>\n <td>male</td>\n <td>80.0</td>\n <td>no</td>\n <td>yes</td>\n <td>yes</td>\n <td>private</td>\n <td>rural</td>\n <td>105.92</td>\n <td>high</td>\n <td>never_smoked</td>\n <td>yes</td>\n </tr>\n <tr>\n <th>3</th>\n <td>60182</td>\n <td>female</td>\n <td>49.0</td>\n <td>no</td>\n <td>no</td>\n <td>yes</td>\n <td>private</td>\n <td>urban</td>\n <td>171.23</td>\n <td>high</td>\n <td>smokes</td>\n <td>yes</td>\n </tr>\n <tr>\n <th>4</th>\n <td>1665</td>\n <td>female</td>\n <td>79.0</td>\n <td>yes</td>\n <td>no</td>\n <td>yes</td>\n <td>self_employed</td>\n <td>rural</td>\n <td>174.12</td>\n <td>mid</td>\n <td>never_smoked</td>\n <td>yes</td>\n </tr>\n <tr>\n <th>5</th>\n <td>56669</td>\n <td>male</td>\n <td>81.0</td>\n <td>no</td>\n <td>no</td>\n <td>yes</td>\n <td>private</td>\n <td>urban</td>\n <td>186.21</td>\n <td>high</td>\n <td>formerly_smoked</td>\n <td>yes</td>\n </tr>\n <tr>\n <th>...</th>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n </tr>\n <tr>\n <th>5104</th>\n 
<td>14180</td>\n <td>female</td>\n <td>13.0</td>\n <td>no</td>\n <td>no</td>\n <td>no</td>\n <td>children</td>\n <td>rural</td>\n <td>103.08</td>\n <td>low</td>\n <td>unknown</td>\n <td>no</td>\n </tr>\n <tr>\n <th>5106</th>\n <td>44873</td>\n <td>female</td>\n <td>81.0</td>\n <td>no</td>\n <td>no</td>\n <td>yes</td>\n <td>self_employed</td>\n <td>urban</td>\n <td>125.20</td>\n <td>high</td>\n <td>never_smoked</td>\n <td>no</td>\n </tr>\n <tr>\n <th>5107</th>\n <td>19723</td>\n <td>female</td>\n <td>35.0</td>\n <td>no</td>\n <td>no</td>\n <td>yes</td>\n <td>self_employed</td>\n <td>rural</td>\n <td>82.99</td>\n <td>high</td>\n <td>never_smoked</td>\n <td>no</td>\n </tr>\n <tr>\n <th>5108</th>\n <td>37544</td>\n <td>male</td>\n <td>51.0</td>\n <td>no</td>\n <td>no</td>\n <td>yes</td>\n <td>private</td>\n <td>rural</td>\n <td>166.29</td>\n <td>mid</td>\n <td>formerly_smoked</td>\n <td>no</td>\n </tr>\n <tr>\n <th>5109</th>\n <td>44679</td>\n <td>female</td>\n <td>44.0</td>\n <td>no</td>\n <td>no</td>\n <td>yes</td>\n <td>govt_job</td>\n <td>urban</td>\n <td>85.28</td>\n <td>mid</td>\n <td>unknown</td>\n <td>no</td>\n </tr>\n </tbody>\n</table>\n<p>4501 rows × 12 columns</p>\n</div>"
|
||||||
|
},
|
||||||
|
"metadata": {},
|
||||||
|
"execution_count": 136
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"data"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 135,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"data = NormalizeData(data)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 137,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"data = data[['gender', 'ever_married', 'Residence_type', 'bmi','smoking_status', 'work_type','stroke','hypertension','heart_disease']]"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 138,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"output_type": "execute_result",
|
||||||
|
"data": {
|
||||||
|
"text/plain": [
|
||||||
|
" gender ever_married Residence_type bmi smoking_status \\\n",
|
||||||
|
"0 male yes urban high formerly_smoked \n",
|
||||||
|
"2 male yes rural high never_smoked \n",
|
||||||
|
"3 female yes urban high smokes \n",
|
||||||
|
"4 female yes rural mid never_smoked \n",
|
||||||
|
"5 male yes urban high formerly_smoked \n",
|
||||||
|
"... ... ... ... ... ... \n",
|
||||||
|
"5104 female no rural low unknown \n",
|
||||||
|
"5106 female yes urban high never_smoked \n",
|
||||||
|
"5107 female yes rural high never_smoked \n",
|
||||||
|
"5108 male yes rural mid formerly_smoked \n",
|
||||||
|
"5109 female yes urban mid unknown \n",
|
||||||
|
"\n",
|
||||||
|
" work_type stroke hypertension heart_disease \n",
|
||||||
|
"0 private yes no yes \n",
|
||||||
|
"2 private yes no yes \n",
|
||||||
|
"3 private yes no no \n",
|
||||||
|
"4 self_employed yes yes no \n",
|
||||||
|
"5 private yes no no \n",
|
||||||
|
"... ... ... ... ... \n",
|
||||||
|
"5104 children no no no \n",
|
||||||
|
"5106 self_employed no no no \n",
|
||||||
|
"5107 self_employed no no no \n",
|
||||||
|
"5108 private no no no \n",
|
||||||
|
"5109 govt_job no no no \n",
|
||||||
|
"\n",
|
||||||
|
"[4501 rows x 9 columns]"
|
||||||
|
],
|
||||||
|
"text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>gender</th>\n <th>ever_married</th>\n <th>Residence_type</th>\n <th>bmi</th>\n <th>smoking_status</th>\n <th>work_type</th>\n <th>stroke</th>\n <th>hypertension</th>\n <th>heart_disease</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>0</th>\n <td>male</td>\n <td>yes</td>\n <td>urban</td>\n <td>high</td>\n <td>formerly_smoked</td>\n <td>private</td>\n <td>yes</td>\n <td>no</td>\n <td>yes</td>\n </tr>\n <tr>\n <th>2</th>\n <td>male</td>\n <td>yes</td>\n <td>rural</td>\n <td>high</td>\n <td>never_smoked</td>\n <td>private</td>\n <td>yes</td>\n <td>no</td>\n <td>yes</td>\n </tr>\n <tr>\n <th>3</th>\n <td>female</td>\n <td>yes</td>\n <td>urban</td>\n <td>high</td>\n <td>smokes</td>\n <td>private</td>\n <td>yes</td>\n <td>no</td>\n <td>no</td>\n </tr>\n <tr>\n <th>4</th>\n <td>female</td>\n <td>yes</td>\n <td>rural</td>\n <td>mid</td>\n <td>never_smoked</td>\n <td>self_employed</td>\n <td>yes</td>\n <td>yes</td>\n <td>no</td>\n </tr>\n <tr>\n <th>5</th>\n <td>male</td>\n <td>yes</td>\n <td>urban</td>\n <td>high</td>\n <td>formerly_smoked</td>\n <td>private</td>\n <td>yes</td>\n <td>no</td>\n <td>no</td>\n </tr>\n <tr>\n <th>...</th>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n </tr>\n <tr>\n <th>5104</th>\n <td>female</td>\n <td>no</td>\n <td>rural</td>\n <td>low</td>\n <td>unknown</td>\n <td>children</td>\n <td>no</td>\n <td>no</td>\n <td>no</td>\n </tr>\n <tr>\n <th>5106</th>\n <td>female</td>\n <td>yes</td>\n <td>urban</td>\n <td>high</td>\n <td>never_smoked</td>\n <td>self_employed</td>\n <td>no</td>\n <td>no</td>\n <td>no</td>\n </tr>\n <tr>\n 
<th>5107</th>\n <td>female</td>\n <td>yes</td>\n <td>rural</td>\n <td>high</td>\n <td>never_smoked</td>\n <td>self_employed</td>\n <td>no</td>\n <td>no</td>\n <td>no</td>\n </tr>\n <tr>\n <th>5108</th>\n <td>male</td>\n <td>yes</td>\n <td>rural</td>\n <td>mid</td>\n <td>formerly_smoked</td>\n <td>private</td>\n <td>no</td>\n <td>no</td>\n <td>no</td>\n </tr>\n <tr>\n <th>5109</th>\n <td>female</td>\n <td>yes</td>\n <td>urban</td>\n <td>mid</td>\n <td>unknown</td>\n <td>govt_job</td>\n <td>no</td>\n <td>no</td>\n <td>no</td>\n </tr>\n </tbody>\n</table>\n<p>4501 rows × 9 columns</p>\n</div>"
|
||||||
|
},
|
||||||
|
"metadata": {},
|
||||||
|
"execution_count": 138
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"data"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 95,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
class NaiveBayes:
    """Naive Bayes classifier for categorical features, built on frequency tables.

    Bayes' theorem:

                                  Likelihood * Class prior probability
        Posterior probability = ----------------------------------------
                                     Predictor prior probability

                      P(x|c) * P(c)
        P(c|x) = -------------------
                         P(x)

    Probabilities are plain relative frequencies (no smoothing), so a
    (feature value, class) pair that never occurs in the training data has
    likelihood 0 and zeroes the posterior of that class.
    """

    def __init__(self):
        """Create an unfitted classifier.

        Attributes:
            features: column names of the training frame, in order
            likelihoods: {feature: {"<value>_<class>": P(value | class)}}
            class_priors: {class: P(class)}
            pred_priors: {feature: {value: P(value)}}
            X_train, y_train: the data passed to ``fit``
            train_size, num_feats: number of training rows / columns
        """
        self.features = []
        self.likelihoods = {}
        self.class_priors = {}
        self.pred_priors = {}

        self.X_train = None
        self.y_train = None
        self.train_size = 0
        self.num_feats = 0

    @staticmethod
    def _key(feat_val, outcome):
        """Build the likelihood-table key for a (feature value, class) pair."""
        # str.format (rather than '+') so numeric values and labels work too.
        return '{}_{}'.format(feat_val, outcome)

    def fit(self, X, y):
        """Estimate class priors, likelihoods and predictor priors.

        Args:
            X: pandas DataFrame of categorical features, one row per sample.
            y: class labels, one per row of ``X`` and in the same row order
               (Series, array or list).

        Any tables left over from a previous ``fit`` are discarded first.
        """
        self.features = list(X.columns)
        self.X_train = X
        self.y_train = y
        self.train_size = X.shape[0]
        self.num_feats = X.shape[1]

        # Start from empty tables so a refit never mixes in stale entries.
        self.likelihoods = {}
        self.class_priors = {}
        self.pred_priors = {}

        outcomes = np.unique(self.y_train)

        # Pre-fill every (value, class) combination with 0 so that pairs
        # absent from the training data still have a table entry.
        for feature in self.features:
            feat_vals = np.unique(self.X_train[feature])
            self.pred_priors[feature] = {feat_val: 0 for feat_val in feat_vals}
            self.likelihoods[feature] = {
                self._key(feat_val, outcome): 0
                for feat_val in feat_vals
                for outcome in outcomes
            }
            for outcome in outcomes:
                self.class_priors[outcome] = 0

        self._calc_class_prior()
        self._calc_likelihoods()
        self._calc_predictor_prior()

    def _calc_class_prior(self):
        """ P(c) - Prior Class Probability """

        labels = np.asarray(self.y_train)
        for outcome in np.unique(labels):
            outcome_count = int(np.count_nonzero(labels == outcome))
            self.class_priors[outcome] = outcome_count / self.train_size

    def _calc_likelihoods(self):
        """ P(x|c) - Likelihood """

        labels = np.asarray(self.y_train)
        for outcome in np.unique(labels):
            # Positional mask: rows of X are matched to labels by order,
            # independent of whatever index X and y happen to carry.
            in_class = labels == outcome
            outcome_count = int(np.count_nonzero(in_class))

            for feature in self.features:
                feat_likelihood = self.X_train[feature][in_class].value_counts().to_dict()

                for feat_val, count in feat_likelihood.items():
                    self.likelihoods[feature][self._key(feat_val, outcome)] = count / outcome_count

    def _calc_predictor_prior(self):
        """ P(x) - Evidence """

        for feature in self.features:
            feat_vals = self.X_train[feature].value_counts().to_dict()

            for feat_val, count in feat_vals.items():
                self.pred_priors[feature][feat_val] = count / self.train_size

    def predict(self, X):
        """Return the most probable class for every query row.

        Calculates the posterior probability P(c|x) of each class and picks
        the largest; ties go to the class that sorts first.

        Args:
            X: 2-D array-like of queries whose columns follow the order of
               the features seen in ``fit``.

        Returns:
            numpy array with one predicted label per query.

        A feature value that did not occur in the training data carries no
        information for any class, so that feature is skipped for the query
        instead of raising ``KeyError``.
        """
        results = []
        X = np.array(X)
        outcomes = np.unique(self.y_train)

        for query in X:
            probs_outcome = {}
            for outcome in outcomes:
                prior = self.class_priors[outcome]
                likelihood = 1
                evidence = 1

                for feat, feat_val in zip(self.features, query):
                    feat_prior = self.pred_priors[feat].get(feat_val)
                    if not feat_prior:
                        # Unseen (or zero-frequency) value: nothing to multiply in.
                        continue
                    likelihood *= self.likelihoods[feat][self._key(feat_val, outcome)]
                    evidence *= feat_prior

                probs_outcome[outcome] = (likelihood * prior) / evidence

            results.append(max(probs_outcome, key=probs_outcome.get))

        return np.array(results)
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 99,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
def accuracy_score(y_true, y_pred):
    """Return the share of correct predictions as a percentage.

        score = 100 * count(y_pred == y_true) / len(y_true)

    Labels are compared position by position, so any pair of equally long
    sequences works (lists, numpy arrays, pandas Series with any index).

    Args:
        y_true: the expected labels.
        y_pred: the predicted labels, in the same order as ``y_true``.

    Returns:
        float in [0, 100], rounded to two decimals.

    Raises:
        ValueError: if the two inputs do not have the same shape.
        ZeroDivisionError: if ``y_true`` is empty.
    """
    expected = np.asarray(y_true)
    predicted = np.asarray(y_pred)
    if expected.shape != predicted.shape:
        raise ValueError(
            "y_true and y_pred must have the same shape, got {} and {}".format(
                expected.shape, predicted.shape
            )
        )

    hits = np.count_nonzero(predicted == expected)
    return round(float(hits) / float(len(expected)) * 100, 2)
|
||||||
|
"\n",
|
||||||
def pre_processing(df, target=None):
    """Partition a DataFrame into features and target.

    Args:
        df: pandas DataFrame holding predictors and the target column.
        target: name of the target column. Defaults to the last column of
            ``df``, which is the historical behaviour of this helper.

    Returns:
        (X, y): ``X`` is ``df`` without the target column, ``y`` is the
        target column. ``df`` itself is not modified.

    Raises:
        IndexError: if ``df`` has no columns and ``target`` is not given.
        KeyError: if ``target`` is not a column of ``df``.
    """
    if target is None:
        target = df.columns[-1]

    X = df.drop(columns=[target])
    y = df[target]

    return X, y
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 139,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
# Split the rows into a training and a test partition; the fixed seed keeps
# the split identical from run to run.
data_train, data_test = train_test_split(data, random_state=42)

# Project both partitions onto the same eight predictor columns ...
X_train, X_test = (
    part[['gender', 'ever_married', 'Residence_type', 'bmi', 'smoking_status', 'work_type', 'hypertension', 'heart_disease']]
    for part in (data_train, data_test)
)

# ... and pull the 'stroke' label out of each of them.
Y_train, Y_test = (part['stroke'] for part in (data_train, data_test))
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 141,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"output_type": "stream",
|
||||||
|
"name": "stdout",
|
||||||
|
"text": [
|
||||||
|
"['male' 'female' 'other']\n['yes' 'no']\n['urban' 'rural']\n['high', 'mid', 'low']\nCategories (3, object): ['low' < 'mid' < 'high']\n['formerly_smoked' 'never_smoked' 'smokes' 'unknown']\n['private' 'self_employed' 'govt_job' 'children' 'never_worked']\n['no' 'yes']\n['yes' 'no']\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
# List the distinct values of every predictor column, one array per line.
print(
    *(
        pd.unique(data[column])
        for column in (
            'gender',
            'ever_married',
            'Residence_type',
            'bmi',
            'smoking_status',
            'work_type',
            'hypertension',
            'heart_disease',
        )
    ),
    sep="\n",
)
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 148,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"output_type": "stream",
|
||||||
|
"name": "stdout",
|
||||||
|
"text": [
|
||||||
|
"Query 1:- [['male' 'no' 'urban' 'high' 'smokes' 'private' 'yes' 'yes']] ---> ['no']\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
# Train the classifier on the training partition.
nb_clf = NaiveBayes()
nb_clf.fit(X_train, Y_train)

# Query 1: a single hand-written patient.
# predict() matches values to features purely by position, so the row is
# assembled from named values in the column order the model was fitted on.
query_by_feature = {
    'gender': 'male',
    'ever_married': 'no',
    'Residence_type': 'urban',
    'bmi': 'high',
    'smoking_status': 'smokes',
    'work_type': 'private',
    'hypertension': 'yes',
    'heart_disease': 'yes',
}
query = np.array([[query_by_feature[name] for name in X_train.columns]])
print("Query 1:- {} ---> {}".format(query, nb_clf.predict(query)))
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 143,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"output_type": "stream",
|
||||||
|
"name": "stdout",
|
||||||
|
"text": [
|
||||||
|
"Test Accuracy: 94.67\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"print(\"Test Accuracy: {}\".format(accuracy_score(Y_test, nb_clf.predict(X_test))))"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 157,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"lol = nb_clf.predict(X_test)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 158,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"lol =pd.DataFrame(data=lol)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 167,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"output_type": "execute_result",
|
||||||
|
"data": {
|
||||||
|
"text/plain": [
|
||||||
|
"no 1072\n",
|
||||||
|
"yes 54\n",
|
||||||
|
"Name: stroke, dtype: int64"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"metadata": {},
|
||||||
|
"execution_count": 167
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"Y_test.value_counts()"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 166,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"output_type": "execute_result",
|
||||||
|
"data": {
|
||||||
|
"text/plain": [
|
||||||
|
"no 1116\n",
|
||||||
|
"yes 10\n",
|
||||||
|
"dtype: int64"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"metadata": {},
|
||||||
|
"execution_count": 166
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"lol.value_counts()"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
]
|
||||||
|
}
|
Loading…
Reference in New Issue
Block a user