23 KiB
23 KiB
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
def NormalizeData(data):
    """Clean and discretize the stroke dataset; returns the cleaned frame.

    Transformations applied column-by-column:
      - every object (string) column is lower-cased;
      - smoking_status: spaces -> underscores (e.g. 'never smoked' -> 'never_smoked');
      - work_type: hyphens -> underscores (e.g. 'self-employed' -> 'self_employed');
      - bmi: binned into 'low' (0, 21], 'mid' (21, 28], 'high' (28, 40];
      - stroke / hypertension / heart_disease: 1/0 -> 'yes'/'no';
      - finally, all rows containing NaN are dropped.

    NOTE(review): pd.cut maps bmi values outside (0, 40] to NaN, so rows
    with bmi > 40 are silently discarded by the dropna() below — confirm
    that excluding those patients is intentional.

    Args:
        data: pandas DataFrame with the raw stroke-dataset columns.
            Mutated in place for the per-column transforms; the dropna()
            result is returned as a new frame.

    Returns:
        The cleaned DataFrame.
    """
    binary_cols = ('stroke', 'hypertension', 'heart_disease')
    for col in data.columns:
        if data[col].dtype == object:
            data[col] = data[col].str.lower()
        if col == 'smoking_status':
            data[col] = data[col].str.replace(" ", "_")
        if col == 'work_type':
            data[col] = data[col].str.replace("-", "_")
        if col == 'bmi':
            data[col] = pd.cut(data[col], bins=[0, 21, 28, 40],
                               labels=['low', 'mid', 'high'])
        if col in binary_cols:
            # One dict-based replace instead of the original's two
            # sequential replace() calls per column, duplicated for
            # all three binary columns.
            data[col] = data[col].replace({1: 'yes', 0: 'no'})
    data = data.dropna()
    return data
# Load the raw Kaggle healthcare stroke dataset (CSV expected in the
# working directory).
data = pd.read_csv("healthcare-dataset-stroke-data.csv")
# Notebook-style cell: displays the dataframe when run interactively.
data
 | id | gender | age | hypertension | heart_disease | ever_married | work_type | Residence_type | avg_glucose_level | bmi | smoking_status | stroke |
---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 9046 | male | 67.0 | no | yes | yes | private | urban | 228.69 | high | formerly_smoked | yes |
2 | 31112 | male | 80.0 | no | yes | yes | private | rural | 105.92 | high | never_smoked | yes |
3 | 60182 | female | 49.0 | no | no | yes | private | urban | 171.23 | high | smokes | yes |
4 | 1665 | female | 79.0 | yes | no | yes | self_employed | rural | 174.12 | mid | never_smoked | yes |
5 | 56669 | male | 81.0 | no | no | yes | private | urban | 186.21 | high | formerly_smoked | yes |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
5104 | 14180 | female | 13.0 | no | no | no | children | rural | 103.08 | low | unknown | no |
5106 | 44873 | female | 81.0 | no | no | yes | self_employed | urban | 125.20 | high | never_smoked | no |
5107 | 19723 | female | 35.0 | no | no | yes | self_employed | rural | 82.99 | high | never_smoked | no |
5108 | 37544 | male | 51.0 | no | no | yes | private | rural | 166.29 | mid | formerly_smoked | no |
5109 | 44679 | female | 44.0 | no | no | yes | govt_job | urban | 85.28 | mid | unknown | no |
4501 rows × 12 columns
# Clean/discretize the raw columns, then keep only the categorical
# features plus the target and the two yes/no medical-history columns
# used by the classifier below (drops id, age, avg_glucose_level).
data = NormalizeData(data)
data = data[['gender', 'ever_married', 'Residence_type', 'bmi','smoking_status', 'work_type','stroke','hypertension','heart_disease']]
# Notebook-style cell: displays the reduced dataframe.
data
 | gender | ever_married | Residence_type | bmi | smoking_status | work_type | stroke | hypertension | heart_disease |
---|---|---|---|---|---|---|---|---|---|
0 | male | yes | urban | high | formerly_smoked | private | yes | no | yes |
2 | male | yes | rural | high | never_smoked | private | yes | no | yes |
3 | female | yes | urban | high | smokes | private | yes | no | no |
4 | female | yes | rural | mid | never_smoked | self_employed | yes | yes | no |
5 | male | yes | urban | high | formerly_smoked | private | yes | no | no |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
5104 | female | no | rural | low | unknown | children | no | no | no |
5106 | female | yes | urban | high | never_smoked | self_employed | no | no | no |
5107 | female | yes | rural | high | never_smoked | self_employed | no | no | no |
5108 | male | yes | rural | mid | formerly_smoked | private | no | no | no |
5109 | female | yes | urban | mid | unknown | govt_job | no | no | no |
4501 rows × 9 columns
class NaiveBayes:
    """Categorical Naive Bayes classifier for string-valued features.

    Bayes Theorem:

                           Likelihood * Class prior probability
        Posterior Probability = -------------------------------------
                               Predictor prior probability

                   P(x|c) * p(c)
        P(c|x) = ------------------
                       P(x)

    Internal lookup keys are built as '<feature_value>_<class_label>',
    so both feature values and class labels must be strings.
    """

    def __init__(self):
        """
        Attributes:
            likelihoods: Likelihood of each feature value per class
            class_priors: Prior probabilities of classes
            pred_priors: Prior probabilities of feature values
            features: All feature (column) names of the dataset
        """
        # BUG FIX: the original assigned the *type objects* themselves
        # (``list``, ``np.array``, ``int``) as placeholders; use real
        # empty/neutral values instead.
        self.features = []
        self.likelihoods = {}
        self.class_priors = {}
        self.pred_priors = {}
        self.X_train = None
        self.y_train = None
        self.train_size = 0
        self.num_feats = 0

    def fit(self, X, y):
        """Estimate priors and likelihoods from the training data.

        Args:
            X: pandas DataFrame of categorical (string) features.
            y: pandas Series of string class labels, index-aligned with X.
        """
        self.features = list(X.columns)
        self.X_train = X
        self.y_train = y
        self.train_size = X.shape[0]
        self.num_feats = X.shape[1]
        # Pre-seed every (feature value, class) combination with 0 so that
        # combinations absent from the training data keep likelihood 0
        # instead of raising KeyError at prediction time.
        for feature in self.features:
            self.likelihoods[feature] = {}
            self.pred_priors[feature] = {}
            for feat_val in np.unique(self.X_train[feature]):
                self.pred_priors[feature][feat_val] = 0
                for outcome in np.unique(self.y_train):
                    self.likelihoods[feature][feat_val + '_' + outcome] = 0
                    self.class_priors[outcome] = 0
        self._calc_class_prior()
        self._calc_likelihoods()
        self._calc_predictor_prior()

    def _calc_class_prior(self):
        """ P(c) - Prior Class Probability """
        for outcome in np.unique(self.y_train):
            outcome_count = sum(self.y_train == outcome)
            self.class_priors[outcome] = outcome_count / self.train_size

    def _calc_likelihoods(self):
        """ P(x|c) - Likelihood of each feature value given each class """
        for feature in self.features:
            for outcome in np.unique(self.y_train):
                outcome_count = sum(self.y_train == outcome)
                # Restrict the feature column to rows of this class, then
                # count how often each value occurs within that class.
                class_rows = self.y_train[self.y_train == outcome].index.values.tolist()
                feat_likelihood = self.X_train[feature][class_rows].value_counts().to_dict()
                for feat_val, count in feat_likelihood.items():
                    self.likelihoods[feature][feat_val + '_' + outcome] = count / outcome_count

    def _calc_predictor_prior(self):
        """ P(x) - Evidence: marginal frequency of each feature value """
        for feature in self.features:
            feat_vals = self.X_train[feature].value_counts().to_dict()
            for feat_val, count in feat_vals.items():
                self.pred_priors[feature][feat_val] = count / self.train_size

    def predict(self, X):
        """Return the most probable class per row of X (posterior argmax).

        Calculates the posterior probability P(c|x) for every class and
        picks the class with the highest score.

        NOTE(review): a feature value never seen during fit() raises
        KeyError here — queries must use values present in the training set.
        """
        results = []
        X = np.array(X)
        for query in X:
            probs_outcome = {}
            for outcome in np.unique(self.y_train):
                prior = self.class_priors[outcome]
                likelihood = 1
                evidence = 1
                for feat, feat_val in zip(self.features, query):
                    likelihood *= self.likelihoods[feat][feat_val + '_' + outcome]
                    evidence *= self.pred_priors[feat][feat_val]
                # The evidence is identical for every class, so dividing by
                # it rescales the scores without changing the argmax.
                probs_outcome[outcome] = (likelihood * prior) / evidence
            results.append(max(probs_outcome, key=probs_outcome.get))
        return np.array(results)
def accuracy_score(y_true, y_pred):
    """Return the percentage of correct predictions, rounded to 2 decimals.

    score = 100 * (number of matches) / len(y_true)

    (BUG FIX: the original docstring claimed ``(y_true - y_pred) /
    len(y_true)``, which is not what the code computes.)

    Args:
        y_true: array-like of true labels (numpy array or pandas Series —
            element-wise ``==`` is required, so plain lists won't work).
        y_pred: array-like of predicted labels, same length.

    Returns:
        float accuracy percentage in [0, 100].
    """
    return round(float(sum(y_pred == y_true)) / float(len(y_true)) * 100, 2)
def pre_processing(df):
    """Partition *df* into a feature matrix and a target vector.

    The last column is treated as the target; every preceding column is a
    feature.

    Args:
        df: pandas DataFrame whose final column is the label.

    Returns:
        Tuple ``(X, y)`` — features DataFrame and target Series.
    """
    target_name = df.columns[-1]
    feature_frame = df.drop(columns=[target_name])
    target_series = df[target_name]
    return feature_frame, target_series
# 75/25 train/test split (sklearn's default test_size) with a fixed seed
# for reproducibility.
data_train, data_test = train_test_split(data, random_state = 42)
# Manually separate the eight categorical features from the 'stroke' target.
X_train =data_train[['gender', 'ever_married', 'Residence_type', 'bmi','smoking_status', 'work_type','hypertension','heart_disease']]
Y_train = data_train['stroke']
X_test =data_test[['gender', 'ever_married', 'Residence_type', 'bmi','smoking_status', 'work_type','hypertension','heart_disease']]
Y_test = data_test['stroke']
# Sanity check: enumerate the categorical levels of every feature/target.
print(pd.unique(data['gender']))
print(pd.unique(data['ever_married']))
print(pd.unique(data['Residence_type']))
print(pd.unique(data['bmi']))
print(pd.unique(data['smoking_status']))
print(pd.unique(data['work_type']))
print(pd.unique(data['hypertension']))
print(pd.unique(data['heart_disease']))
['male' 'female' 'other'] ['yes' 'no'] ['urban' 'rural'] ['high', 'mid', 'low'] Categories (3, object): ['low' < 'mid' < 'high'] ['formerly_smoked' 'never_smoked' 'smokes' 'unknown'] ['private' 'self_employed' 'govt_job' 'children' 'never_worked'] ['no' 'yes'] ['yes' 'no']
# X = data[['gender', 'ever_married', 'Residence_type', 'bmi','smoking_status', 'work_type']]
# y = data['stroke']
# Train the classifier on the eight categorical training features.
nb_clf = NaiveBayes()
nb_clf.fit(X_train, Y_train)
#
#Query 1:
# Single hand-crafted query; values must match the training columns'
# order and levels exactly.
query = np.array([['male','no', 'urban', 'high','smokes', 'private','yes','yes']])
print("Query 1:- {} ---> {}".format(query, nb_clf.predict(query)))
# NOTE(review): the commented-out Query 2/3 below use values from a
# different (weather "play tennis") dataset and would raise KeyError
# against this model — presumably left over from a tutorial; verify
# before re-enabling.
# #Query 2:
# query = np.array([['Overcast','Cool', 'Normal', 't']])
# print("Query 2:- {} ---> {}".format(query, nb_clf.predict(query)))
# #Query 3:
# query = np.array([['Sunny','Hot', 'High', 't']])
# print("Query 3:- {} ---> {}".format(query, nb_clf.predict(query)))
Query 1:- [['male' 'no' 'urban' 'high' 'smokes' 'private' 'yes' 'yes']] ---> ['no']
# Evaluate on the held-out test split.
print("Test Accuracy: {}".format(accuracy_score(Y_test, nb_clf.predict(X_test))))
Test Accuracy: 94.67
# Collect the test-set predictions so their label distribution can be
# compared with the true one (the cell outputs below show the model
# predicts 'no' far more often than 'yes' actually occurs).
lol = nb_clf.predict(X_test)
lol =pd.DataFrame(data=lol)
# True label distribution of the test split.
Y_test.value_counts()
no 1072 yes 54 Name: stroke, dtype: int64
# Predicted label distribution, for comparison with Y_test.value_counts().
lol.value_counts()
no 1116 yes 10 dtype: int64