This commit is contained in:
s434766 2021-05-27 22:13:47 +02:00
parent 9fcd29016f
commit 18320ef656

490
ow.ipynb Normal file
View File

@ -0,0 +1,490 @@
{
"metadata": {
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.5"
},
"orig_nbformat": 2,
"kernelspec": {
"name": "python385jvsc74a57bd0916dbcbb3f70747c44a77c7bcd40155683ae19c65e1c03b4aa3499c5328201f1",
"display_name": "Python 3.8.5 64-bit"
},
"metadata": {
"interpreter": {
"hash": "916dbcbb3f70747c44a77c7bcd40155683ae19c65e1c03b4aa3499c5328201f1"
}
}
},
"nbformat": 4,
"nbformat_minor": 2,
"cells": [
{
"cell_type": "code",
"execution_count": 133,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd \n",
"import numpy as np\n",
"from sklearn.model_selection import train_test_split\n",
"\n",
"def NormalizeData(data):\n",
" for col in data.columns:\n",
" if data[col].dtype == object: \n",
" data[col] = data[col].str.lower()\n",
" if col == 'smoking_status':\n",
" data[col] = data[col].str.replace(\" \", \"_\")\n",
" if col == 'work_type':\n",
" data[col] = data[col].str.replace(\"-\", \"_\")\n",
" if col == 'bmi':\n",
" bins = [0, 21, 28, 40]\n",
" labels=['low','mid','high']\n",
" data[col] = pd.cut(data[col], bins=bins, labels=labels)\n",
" if col == 'stroke':\n",
" data[col] = data[col].replace({1: 'yes'})\n",
" data[col] = data[col].replace({0: 'no'})\n",
" if col == 'hypertension':\n",
" data[col] = data[col].replace({1: 'yes'})\n",
" data[col] = data[col].replace({0: 'no'})\n",
" if col == 'heart_disease':\n",
" data[col] = data[col].replace({1: 'yes'})\n",
" data[col] = data[col].replace({0: 'no'})\n",
" data = data.dropna()\n",
" return data\n",
"\n",
"data = pd.read_csv(\"healthcare-dataset-stroke-data.csv\")"
]
},
{
"cell_type": "code",
"execution_count": 136,
"metadata": {},
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
" id gender age hypertension heart_disease ever_married \\\n",
"0 9046 male 67.0 no yes yes \n",
"2 31112 male 80.0 no yes yes \n",
"3 60182 female 49.0 no no yes \n",
"4 1665 female 79.0 yes no yes \n",
"5 56669 male 81.0 no no yes \n",
"... ... ... ... ... ... ... \n",
"5104 14180 female 13.0 no no no \n",
"5106 44873 female 81.0 no no yes \n",
"5107 19723 female 35.0 no no yes \n",
"5108 37544 male 51.0 no no yes \n",
"5109 44679 female 44.0 no no yes \n",
"\n",
" work_type Residence_type avg_glucose_level bmi smoking_status \\\n",
"0 private urban 228.69 high formerly_smoked \n",
"2 private rural 105.92 high never_smoked \n",
"3 private urban 171.23 high smokes \n",
"4 self_employed rural 174.12 mid never_smoked \n",
"5 private urban 186.21 high formerly_smoked \n",
"... ... ... ... ... ... \n",
"5104 children rural 103.08 low unknown \n",
"5106 self_employed urban 125.20 high never_smoked \n",
"5107 self_employed rural 82.99 high never_smoked \n",
"5108 private rural 166.29 mid formerly_smoked \n",
"5109 govt_job urban 85.28 mid unknown \n",
"\n",
" stroke \n",
"0 yes \n",
"2 yes \n",
"3 yes \n",
"4 yes \n",
"5 yes \n",
"... ... \n",
"5104 no \n",
"5106 no \n",
"5107 no \n",
"5108 no \n",
"5109 no \n",
"\n",
"[4501 rows x 12 columns]"
],
"text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>id</th>\n <th>gender</th>\n <th>age</th>\n <th>hypertension</th>\n <th>heart_disease</th>\n <th>ever_married</th>\n <th>work_type</th>\n <th>Residence_type</th>\n <th>avg_glucose_level</th>\n <th>bmi</th>\n <th>smoking_status</th>\n <th>stroke</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>0</th>\n <td>9046</td>\n <td>male</td>\n <td>67.0</td>\n <td>no</td>\n <td>yes</td>\n <td>yes</td>\n <td>private</td>\n <td>urban</td>\n <td>228.69</td>\n <td>high</td>\n <td>formerly_smoked</td>\n <td>yes</td>\n </tr>\n <tr>\n <th>2</th>\n <td>31112</td>\n <td>male</td>\n <td>80.0</td>\n <td>no</td>\n <td>yes</td>\n <td>yes</td>\n <td>private</td>\n <td>rural</td>\n <td>105.92</td>\n <td>high</td>\n <td>never_smoked</td>\n <td>yes</td>\n </tr>\n <tr>\n <th>3</th>\n <td>60182</td>\n <td>female</td>\n <td>49.0</td>\n <td>no</td>\n <td>no</td>\n <td>yes</td>\n <td>private</td>\n <td>urban</td>\n <td>171.23</td>\n <td>high</td>\n <td>smokes</td>\n <td>yes</td>\n </tr>\n <tr>\n <th>4</th>\n <td>1665</td>\n <td>female</td>\n <td>79.0</td>\n <td>yes</td>\n <td>no</td>\n <td>yes</td>\n <td>self_employed</td>\n <td>rural</td>\n <td>174.12</td>\n <td>mid</td>\n <td>never_smoked</td>\n <td>yes</td>\n </tr>\n <tr>\n <th>5</th>\n <td>56669</td>\n <td>male</td>\n <td>81.0</td>\n <td>no</td>\n <td>no</td>\n <td>yes</td>\n <td>private</td>\n <td>urban</td>\n <td>186.21</td>\n <td>high</td>\n <td>formerly_smoked</td>\n <td>yes</td>\n </tr>\n <tr>\n <th>...</th>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n </tr>\n <tr>\n <th>5104</th>\n <td>14180</td>\n <td>female</td>\n <td>13.0</td>\n <td>no</td>\n <td>no</td>\n <td>no</td>\n <td>children</td>\n <td>rural</td>\n <td>103.08</td>\n <td>low</td>\n <td>unknown</td>\n <td>no</td>\n </tr>\n <tr>\n <th>5106</th>\n <td>44873</td>\n <td>female</td>\n <td>81.0</td>\n <td>no</td>\n <td>no</td>\n <td>yes</td>\n <td>self_employed</td>\n <td>urban</td>\n <td>125.20</td>\n <td>high</td>\n <td>never_smoked</td>\n <td>no</td>\n </tr>\n <tr>\n <th>5107</th>\n <td>19723</td>\n <td>female</td>\n <td>35.0</td>\n <td>no</td>\n <td>no</td>\n <td>yes</td>\n <td>self_employed</td>\n <td>rural</td>\n <td>82.99</td>\n <td>high</td>\n <td>never_smoked</td>\n <td>no</td>\n </tr>\n <tr>\n <th>5108</th>\n <td>37544</td>\n <td>male</td>\n <td>51.0</td>\n <td>no</td>\n <td>no</td>\n <td>yes</td>\n <td>private</td>\n <td>rural</td>\n <td>166.29</td>\n <td>mid</td>\n <td>formerly_smoked</td>\n <td>no</td>\n </tr>\n <tr>\n <th>5109</th>\n <td>44679</td>\n <td>female</td>\n <td>44.0</td>\n <td>no</td>\n <td>no</td>\n <td>yes</td>\n <td>govt_job</td>\n <td>urban</td>\n <td>85.28</td>\n <td>mid</td>\n <td>unknown</td>\n <td>no</td>\n </tr>\n </tbody>\n</table>\n<p>4501 rows × 12 columns</p>\n</div>"
},
"metadata": {},
"execution_count": 136
}
],
"source": [
"data"
]
},
{
"cell_type": "code",
"execution_count": 135,
"metadata": {},
"outputs": [],
"source": [
"data = NormalizeData(data)"
]
},
{
"cell_type": "code",
"execution_count": 137,
"metadata": {},
"outputs": [],
"source": [
"data = data[['gender', 'ever_married', 'Residence_type', 'bmi','smoking_status', 'work_type','stroke','hypertension','heart_disease']]"
]
},
{
"cell_type": "code",
"execution_count": 138,
"metadata": {},
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
" gender ever_married Residence_type bmi smoking_status \\\n",
"0 male yes urban high formerly_smoked \n",
"2 male yes rural high never_smoked \n",
"3 female yes urban high smokes \n",
"4 female yes rural mid never_smoked \n",
"5 male yes urban high formerly_smoked \n",
"... ... ... ... ... ... \n",
"5104 female no rural low unknown \n",
"5106 female yes urban high never_smoked \n",
"5107 female yes rural high never_smoked \n",
"5108 male yes rural mid formerly_smoked \n",
"5109 female yes urban mid unknown \n",
"\n",
" work_type stroke hypertension heart_disease \n",
"0 private yes no yes \n",
"2 private yes no yes \n",
"3 private yes no no \n",
"4 self_employed yes yes no \n",
"5 private yes no no \n",
"... ... ... ... ... \n",
"5104 children no no no \n",
"5106 self_employed no no no \n",
"5107 self_employed no no no \n",
"5108 private no no no \n",
"5109 govt_job no no no \n",
"\n",
"[4501 rows x 9 columns]"
],
"text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>gender</th>\n <th>ever_married</th>\n <th>Residence_type</th>\n <th>bmi</th>\n <th>smoking_status</th>\n <th>work_type</th>\n <th>stroke</th>\n <th>hypertension</th>\n <th>heart_disease</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>0</th>\n <td>male</td>\n <td>yes</td>\n <td>urban</td>\n <td>high</td>\n <td>formerly_smoked</td>\n <td>private</td>\n <td>yes</td>\n <td>no</td>\n <td>yes</td>\n </tr>\n <tr>\n <th>2</th>\n <td>male</td>\n <td>yes</td>\n <td>rural</td>\n <td>high</td>\n <td>never_smoked</td>\n <td>private</td>\n <td>yes</td>\n <td>no</td>\n <td>yes</td>\n </tr>\n <tr>\n <th>3</th>\n <td>female</td>\n <td>yes</td>\n <td>urban</td>\n <td>high</td>\n <td>smokes</td>\n <td>private</td>\n <td>yes</td>\n <td>no</td>\n <td>no</td>\n </tr>\n <tr>\n <th>4</th>\n <td>female</td>\n <td>yes</td>\n <td>rural</td>\n <td>mid</td>\n <td>never_smoked</td>\n <td>self_employed</td>\n <td>yes</td>\n <td>yes</td>\n <td>no</td>\n </tr>\n <tr>\n <th>5</th>\n <td>male</td>\n <td>yes</td>\n <td>urban</td>\n <td>high</td>\n <td>formerly_smoked</td>\n <td>private</td>\n <td>yes</td>\n <td>no</td>\n <td>no</td>\n </tr>\n <tr>\n <th>...</th>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n </tr>\n <tr>\n <th>5104</th>\n <td>female</td>\n <td>no</td>\n <td>rural</td>\n <td>low</td>\n <td>unknown</td>\n <td>children</td>\n <td>no</td>\n <td>no</td>\n <td>no</td>\n </tr>\n <tr>\n <th>5106</th>\n <td>female</td>\n <td>yes</td>\n <td>urban</td>\n <td>high</td>\n <td>never_smoked</td>\n <td>self_employed</td>\n <td>no</td>\n <td>no</td>\n <td>no</td>\n </tr>\n <tr>\n <th>5107</th>\n <td>female</td>\n <td>yes</td>\n <td>rural</td>\n <td>high</td>\n <td>never_smoked</td>\n <td>self_employed</td>\n <td>no</td>\n <td>no</td>\n <td>no</td>\n </tr>\n <tr>\n <th>5108</th>\n <td>male</td>\n <td>yes</td>\n <td>rural</td>\n <td>mid</td>\n <td>formerly_smoked</td>\n <td>private</td>\n <td>no</td>\n <td>no</td>\n <td>no</td>\n </tr>\n <tr>\n <th>5109</th>\n <td>female</td>\n <td>yes</td>\n <td>urban</td>\n <td>mid</td>\n <td>unknown</td>\n <td>govt_job</td>\n <td>no</td>\n <td>no</td>\n <td>no</td>\n </tr>\n </tbody>\n</table>\n<p>4501 rows × 9 columns</p>\n</div>"
},
"metadata": {},
"execution_count": 138
}
],
"source": [
"data"
]
},
{
"cell_type": "code",
"execution_count": 95,
"metadata": {},
"outputs": [],
"source": [
"class NaiveBayes:\n",
"\n",
"\t\"\"\"\n",
"\t\tBayes Theorem:\n",
"\t\t\t\t\t\t\t\t\t\tLikelihood * Class prior probability\n",
"\t\t\t\tPosterior Probability = -------------------------------------\n",
"\t\t\t\t\t\t\t\t\t\t\tPredictor prior probability\n",
"\t\t\t\t\n",
"\t\t\t\t\t\t\t \t\t\t P(x|c) * p(c)\n",
"\t\t\t\t\t\t\t P(c|x) = ------------------ \n",
"\t\t\t\t\t\t\t\t\t\t\t P(x)\n",
"\t\"\"\"\n",
"\n",
"\tdef __init__(self):\n",
"\n",
"\t\t\"\"\"\n",
"\t\t\tAttributes:\n",
"\t\t\t\tlikelihoods: Likelihood of each feature per class\n",
"\t\t\t\tclass_priors: Prior probabilities of classes \n",
"\t\t\t\tpred_priors: Prior probabilities of features \n",
"\t\t\t\tfeatures: All features of dataset\n",
"\t\t\"\"\"\n",
"\t\tself.features = list\n",
"\t\tself.likelihoods = {}\n",
"\t\tself.class_priors = {}\n",
"\t\tself.pred_priors = {}\n",
"\n",
"\t\tself.X_train = np.array\n",
"\t\tself.y_train = np.array\n",
"\t\tself.train_size = int\n",
"\t\tself.num_feats = int\n",
"\n",
"\tdef fit(self, X, y):\n",
"\n",
"\t\tself.features = list(X.columns)\n",
"\t\tself.X_train = X\n",
"\t\tself.y_train = y\n",
"\t\tself.train_size = X.shape[0]\n",
"\t\tself.num_feats = X.shape[1]\n",
"\n",
"\t\tfor feature in self.features:\n",
"\t\t\tself.likelihoods[feature] = {}\n",
"\t\t\tself.pred_priors[feature] = {}\n",
"\n",
"\t\t\tfor feat_val in np.unique(self.X_train[feature]):\n",
"\t\t\t\tself.pred_priors[feature].update({feat_val: 0})\n",
"\n",
"\t\t\t\tfor outcome in np.unique(self.y_train):\n",
"\t\t\t\t\tself.likelihoods[feature].update({feat_val+'_'+outcome:0})\n",
"\t\t\t\t\tself.class_priors.update({outcome: 0})\n",
"\n",
"\t\tself._calc_class_prior()\n",
"\t\tself._calc_likelihoods()\n",
"\t\tself._calc_predictor_prior()\n",
"\n",
"\tdef _calc_class_prior(self):\n",
"\n",
"\t\t\"\"\" P(c) - Prior Class Probability \"\"\"\n",
"\n",
"\t\tfor outcome in np.unique(self.y_train):\n",
"\t\t\toutcome_count = sum(self.y_train == outcome)\n",
"\t\t\tself.class_priors[outcome] = outcome_count / self.train_size\n",
"\n",
"\tdef _calc_likelihoods(self):\n",
"\n",
"\t\t\"\"\" P(x|c) - Likelihood \"\"\"\n",
"\n",
"\t\tfor feature in self.features:\n",
"\n",
"\t\t\tfor outcome in np.unique(self.y_train):\n",
"\t\t\t\toutcome_count = sum(self.y_train == outcome)\n",
"\t\t\t\tfeat_likelihood = self.X_train[feature][self.y_train[self.y_train == outcome].index.values.tolist()].value_counts().to_dict()\n",
"\n",
"\t\t\t\tfor feat_val, count in feat_likelihood.items():\n",
"\t\t\t\t\tself.likelihoods[feature][feat_val + '_' + outcome] = count/outcome_count\n",
"\n",
"\n",
"\tdef _calc_predictor_prior(self):\n",
"\n",
"\t\t\"\"\" P(x) - Evidence \"\"\"\n",
"\n",
"\t\tfor feature in self.features:\n",
"\t\t\tfeat_vals = self.X_train[feature].value_counts().to_dict()\n",
"\n",
"\t\t\tfor feat_val, count in feat_vals.items():\n",
"\t\t\t\tself.pred_priors[feature][feat_val] = count/self.train_size\n",
"\n",
"\n",
"\tdef predict(self, X):\n",
"\n",
"\t\t\"\"\" Calculates Posterior probability P(c|x) \"\"\"\n",
"\n",
"\t\tresults = []\n",
"\t\tX = np.array(X)\n",
"\n",
"\t\tfor query in X:\n",
"\t\t\tprobs_outcome = {}\n",
"\t\t\tfor outcome in np.unique(self.y_train):\n",
"\t\t\t\tprior = self.class_priors[outcome]\n",
"\t\t\t\tlikelihood = 1\n",
"\t\t\t\tevidence = 1\n",
"\n",
"\t\t\t\tfor feat, feat_val in zip(self.features, query):\n",
"\t\t\t\t\tlikelihood *= self.likelihoods[feat][feat_val + '_' + outcome]\n",
"\t\t\t\t\tevidence *= self.pred_priors[feat][feat_val]\n",
"\n",
"\t\t\t\tposterior = (likelihood * prior) / (evidence)\n",
"\n",
"\t\t\t\tprobs_outcome[outcome] = posterior\n",
"\n",
"\t\t\tresult = max(probs_outcome, key = lambda x: probs_outcome[x])\n",
"\t\t\tresults.append(result)\n",
"\n",
"\t\treturn np.array(results)\n"
]
},
{
"cell_type": "code",
"execution_count": 99,
"metadata": {},
"outputs": [],
"source": [
"def accuracy_score(y_true, y_pred):\n",
"\n",
"\t\"\"\"\tscore = (y_true - y_pred) / len(y_true) \"\"\"\n",
"\n",
"\treturn round(float(sum(y_pred == y_true))/float(len(y_true)) * 100 ,2)\n",
"\n",
"def pre_processing(df):\n",
"\n",
"\t\"\"\" partioning data into features and target \"\"\"\n",
"\n",
"\tX = df.drop([df.columns[-1]], axis = 1)\n",
"\ty = df[df.columns[-1]]\n",
"\n",
"\treturn X, y"
]
},
{
"cell_type": "code",
"execution_count": 139,
"metadata": {},
"outputs": [],
"source": [
"data_train, data_test = train_test_split(data, random_state = 42)\n",
"\n",
"X_train =data_train[['gender', 'ever_married', 'Residence_type', 'bmi','smoking_status', 'work_type','hypertension','heart_disease']]\n",
"Y_train = data_train['stroke']\n",
"\n",
"X_test =data_test[['gender', 'ever_married', 'Residence_type', 'bmi','smoking_status', 'work_type','hypertension','heart_disease']]\n",
"Y_test = data_test['stroke']"
]
},
{
"cell_type": "code",
"execution_count": 141,
"metadata": {},
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"['male' 'female' 'other']\n['yes' 'no']\n['urban' 'rural']\n['high', 'mid', 'low']\nCategories (3, object): ['low' < 'mid' < 'high']\n['formerly_smoked' 'never_smoked' 'smokes' 'unknown']\n['private' 'self_employed' 'govt_job' 'children' 'never_worked']\n['no' 'yes']\n['yes' 'no']\n"
]
}
],
"source": [
"print(pd.unique(data['gender']))\n",
"print(pd.unique(data['ever_married']))\n",
"print(pd.unique(data['Residence_type']))\n",
"print(pd.unique(data['bmi']))\n",
"print(pd.unique(data['smoking_status']))\n",
"print(pd.unique(data['work_type']))\n",
"print(pd.unique(data['hypertension']))\n",
"print(pd.unique(data['heart_disease']))\n"
]
},
{
"cell_type": "code",
"execution_count": 148,
"metadata": {},
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"Query 1:- [['male' 'no' 'urban' 'high' 'smokes' 'private' 'yes' 'yes']] ---> ['no']\n"
]
}
],
"source": [
"# X = data[['gender', 'ever_married', 'Residence_type', 'bmi','smoking_status', 'work_type']]\n",
"# y = data['stroke']\n",
"\n",
"\n",
"nb_clf = NaiveBayes()\n",
"nb_clf.fit(X_train, Y_train)\n",
"\n",
"#\n",
"\t\n",
"#Query 1:\n",
"query = np.array([['male','no', 'urban', 'high','smokes', 'private','yes','yes']])\n",
"print(\"Query 1:- {} ---> {}\".format(query, nb_clf.predict(query)))\n",
"\n",
"# #Query 2:\n",
"# query = np.array([['Overcast','Cool', 'Normal', 't']])\n",
"# print(\"Query 2:- {} ---> {}\".format(query, nb_clf.predict(query)))\n",
"\n",
"# #Query 3:\n",
"# query = np.array([['Sunny','Hot', 'High', 't']])\n",
"# print(\"Query 3:- {} ---> {}\".format(query, nb_clf.predict(query)))"
]
},
{
"cell_type": "code",
"execution_count": 143,
"metadata": {},
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"Test Accuracy: 94.67\n"
]
}
],
"source": [
"print(\"Test Accuracy: {}\".format(accuracy_score(Y_test, nb_clf.predict(X_test))))"
]
},
{
"cell_type": "code",
"execution_count": 157,
"metadata": {},
"outputs": [],
"source": [
"lol = nb_clf.predict(X_test)"
]
},
{
"cell_type": "code",
"execution_count": 158,
"metadata": {},
"outputs": [],
"source": [
"lol =pd.DataFrame(data=lol)"
]
},
{
"cell_type": "code",
"execution_count": 167,
"metadata": {},
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"no 1072\n",
"yes 54\n",
"Name: stroke, dtype: int64"
]
},
"metadata": {},
"execution_count": 167
}
],
"source": [
"Y_test.value_counts()"
]
},
{
"cell_type": "code",
"execution_count": 166,
"metadata": {},
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"no 1116\n",
"yes 10\n",
"dtype: int64"
]
},
"metadata": {},
"execution_count": 166
}
],
"source": [
"lol.value_counts()"
]
}
]
}