naiwny_bayes_dyskretny/ow.ipynb
2021-05-27 22:13:47 +02:00

23 KiB
Raw Permalink Blame History

import pandas as pd 
import numpy as np
from sklearn.model_selection import train_test_split

def NormalizeData(data):
    """Normalize the stroke dataset for a discrete Naive Bayes model.

    Transformations applied:
      - lower-case every object (string) column
      - replace spaces with underscores in 'smoking_status'
      - replace hyphens with underscores in 'work_type'
      - bin 'bmi' into 'low' (0, 21], 'mid' (21, 28], 'high' (28, 40]
      - map the 0/1 flags 'stroke', 'hypertension', 'heart_disease' to 'no'/'yes'
      - drop rows with missing values; NOTE this also removes rows with
        bmi > 40, which fall outside the bins and become NaN

    Works on a copy so the caller's DataFrame is left untouched.
    Returns the normalized DataFrame.
    """
    data = data.copy()  # fix: the original mutated the caller's frame in place
    binary_map = {1: 'yes', 0: 'no'}  # shared mapping for all 0/1 flag columns
    for col in data.columns:
        if data[col].dtype == object:
            data[col] = data[col].str.lower()
        if col == 'smoking_status':
            data[col] = data[col].str.replace(" ", "_")
        if col == 'work_type':
            data[col] = data[col].str.replace("-", "_")
        if col == 'bmi':
            # values above 40 become NaN here and are dropped below
            data[col] = pd.cut(data[col], bins=[0, 21, 28, 40],
                               labels=['low', 'mid', 'high'])
        if col in ('stroke', 'hypertension', 'heart_disease'):
            data[col] = data[col].replace(binary_map)
    return data.dropna()

# Load the raw stroke dataset from the working directory and display it.
data = pd.read_csv("healthcare-dataset-stroke-data.csv")
data
id gender age hypertension heart_disease ever_married work_type Residence_type avg_glucose_level bmi smoking_status stroke
0 9046 male 67.0 no yes yes private urban 228.69 high formerly_smoked yes
2 31112 male 80.0 no yes yes private rural 105.92 high never_smoked yes
3 60182 female 49.0 no no yes private urban 171.23 high smokes yes
4 1665 female 79.0 yes no yes self_employed rural 174.12 mid never_smoked yes
5 56669 male 81.0 no no yes private urban 186.21 high formerly_smoked yes
... ... ... ... ... ... ... ... ... ... ... ... ...
5104 14180 female 13.0 no no no children rural 103.08 low unknown no
5106 44873 female 81.0 no no yes self_employed urban 125.20 high never_smoked no
5107 19723 female 35.0 no no yes self_employed rural 82.99 high never_smoked no
5108 37544 male 51.0 no no yes private rural 166.29 mid formerly_smoked no
5109 44679 female 44.0 no no yes govt_job urban 85.28 mid unknown no

4501 rows × 12 columns

# Normalize, then keep only the categorical columns the classifier uses
# (drops id, age, avg_glucose_level).
data = NormalizeData(data)
data = data[['gender', 'ever_married', 'Residence_type', 'bmi','smoking_status', 'work_type','stroke','hypertension','heart_disease']]
data
gender ever_married Residence_type bmi smoking_status work_type stroke hypertension heart_disease
0 male yes urban high formerly_smoked private yes no yes
2 male yes rural high never_smoked private yes no yes
3 female yes urban high smokes private yes no no
4 female yes rural mid never_smoked self_employed yes yes no
5 male yes urban high formerly_smoked private yes no no
... ... ... ... ... ... ... ... ... ...
5104 female no rural low unknown children no no no
5106 female yes urban high never_smoked self_employed no no no
5107 female yes rural high never_smoked self_employed no no no
5108 male yes rural mid formerly_smoked private no no no
5109 female yes urban mid unknown govt_job no no no

4501 rows × 9 columns

class  NaiveBayes:

	"""
	Discrete (categorical) Naive Bayes classifier built on pandas/numpy.

	Bayes theorem:
									Likelihood * Class prior probability
			Posterior Probability = -------------------------------------
										Predictor prior probability

						  			 P(x|c) * P(c)
						   P(c|x) = ------------------
										  P(x)
	"""

	def __init__(self):

		"""
			Attributes:
				likelihoods: P(feature value | class) per feature, keyed '<value>_<class>'
				class_priors: P(class) for each class label
				pred_priors: P(feature value) per feature (the evidence terms)
				features: column names of the training DataFrame
		"""
		# The assignments below store the *types* themselves as cheap
		# placeholders; fit() overwrites them with real values.
		self.features = list
		self.likelihoods = {}
		self.class_priors = {}
		self.pred_priors = {}

		self.X_train = np.array
		self.y_train = np.array
		self.train_size = int
		self.num_feats = int

	def fit(self, X, y):
		"""Learn priors and likelihoods from training data.

		X: pandas DataFrame of categorical (string) feature columns.
		y: pandas Series of class labels (strings), index-aligned with X.
		"""
		self.features = list(X.columns)
		self.X_train = X
		self.y_train = y
		self.train_size = X.shape[0]
		self.num_feats = X.shape[1]

		# Initialise every (feature value, class) likelihood and every
		# evidence entry to 0 so each observed category has a key.
		for feature in self.features:
			self.likelihoods[feature] = {}
			self.pred_priors[feature] = {}

			for feat_val in np.unique(self.X_train[feature]):
				self.pred_priors[feature].update({feat_val: 0})

				for outcome in np.unique(self.y_train):
					self.likelihoods[feature].update({feat_val+'_'+outcome:0})
					self.class_priors.update({outcome: 0})

		# NOTE(review): (value, class) pairs never observed in training keep
		# probability 0 (no Laplace smoothing), which zeroes the whole
		# posterior product in predict() - confirm this is acceptable.
		self._calc_class_prior()
		self._calc_likelihoods()
		self._calc_predictor_prior()

	def _calc_class_prior(self):

		""" P(c) - prior probability of each class: count(class) / N """

		for outcome in np.unique(self.y_train):
			outcome_count = sum(self.y_train == outcome)
			self.class_priors[outcome] = outcome_count / self.train_size

	def _calc_likelihoods(self):

		""" P(x|c) - frequency of each feature value within each class """

		for feature in self.features:

			for outcome in np.unique(self.y_train):
				outcome_count = sum(self.y_train == outcome)
				# Select this feature's values for the rows whose label is
				# `outcome` (via the label Series' index), then count them.
				feat_likelihood = self.X_train[feature][self.y_train[self.y_train == outcome].index.values.tolist()].value_counts().to_dict()

				for feat_val, count in feat_likelihood.items():
					self.likelihoods[feature][feat_val + '_' + outcome] = count/outcome_count


	def _calc_predictor_prior(self):

		""" P(x) - evidence: relative frequency of each feature value """

		for feature in self.features:
			feat_vals = self.X_train[feature].value_counts().to_dict()

			for feat_val, count in feat_vals.items():
				self.pred_priors[feature][feat_val] = count/self.train_size


	def predict(self, X):

		""" Predict the class with the highest posterior P(c|x) per row.

		X: 2-D array-like whose columns follow the order of self.features.
		Returns a numpy array of predicted class labels.
		"""

		results = []
		X = np.array(X)

		for query in X:
			probs_outcome = {}
			for outcome in np.unique(self.y_train):
				prior = self.class_priors[outcome]
				likelihood = 1
				evidence = 1

				# Naive independence assumption: multiply per-feature terms.
				# NOTE(review): a feature value absent from the training data
				# raises KeyError here - queries must use values seen in fit().
				for feat, feat_val in zip(self.features, query):
					likelihood *= self.likelihoods[feat][feat_val + '_' + outcome]
					evidence *= self.pred_priors[feat][feat_val]

				posterior = (likelihood * prior) / (evidence)

				probs_outcome[outcome] = posterior

			result = max(probs_outcome, key = lambda x: probs_outcome[x])
			results.append(result)

		return np.array(results)
def accuracy_score(y_true, y_pred):

	"""Percentage of matching labels: 100 * (#correct / total), rounded to 2 dp.

	Fix: the old docstring claimed `(y_true - y_pred) / len(y_true)`, which
	is not what the code computes.
	"""

	return round(float(sum(y_pred == y_true))/float(len(y_true)) * 100 ,2)

def pre_processing(df):

	"""Split *df* into features (all but the last column) and target (last column)."""

	target_col = df.columns[-1]
	features = df.drop(columns=[target_col])
	target = df[target_col]

	return features, target
# Hold out a test split (sklearn default proportions) with a fixed seed
# so the experiment is reproducible.
data_train, data_test = train_test_split(data, random_state = 42)

X_train =data_train[['gender', 'ever_married', 'Residence_type', 'bmi','smoking_status', 'work_type','hypertension','heart_disease']]
Y_train = data_train['stroke']

X_test =data_test[['gender', 'ever_married', 'Residence_type', 'bmi','smoking_status', 'work_type','hypertension','heart_disease']]
Y_test = data_test['stroke']
# Sanity check: list the category values present in each feature column.
print(pd.unique(data['gender']))
print(pd.unique(data['ever_married']))
print(pd.unique(data['Residence_type']))
print(pd.unique(data['bmi']))
print(pd.unique(data['smoking_status']))
print(pd.unique(data['work_type']))
print(pd.unique(data['hypertension']))
print(pd.unique(data['heart_disease']))
['male' 'female' 'other']
['yes' 'no']
['urban' 'rural']
['high', 'mid', 'low']
Categories (3, object): ['low' < 'mid' < 'high']
['formerly_smoked' 'never_smoked' 'smokes' 'unknown']
['private' 'self_employed' 'govt_job' 'children' 'never_worked']
['no' 'yes']
['yes' 'no']
# Leftover from an earlier experiment (features/target on the full set):
# X = data[['gender', 'ever_married', 'Residence_type', 'bmi','smoking_status', 'work_type']]
# y = data['stroke']


# Train the classifier on the categorical training features.
nb_clf = NaiveBayes()
nb_clf.fit(X_train, Y_train)

#
	
#Query 1: classify a single hand-crafted sample (column order = X_train's).
query = np.array([['male','no', 'urban', 'high','smokes', 'private','yes','yes']])
print("Query 1:- {} ---> {}".format(query, nb_clf.predict(query)))

# Queries 2/3 below reference a different (weather) dataset - presumably
# carried over from the tutorial this notebook was adapted from.
# #Query 2:
# query = np.array([['Overcast','Cool', 'Normal', 't']])
# print("Query 2:- {} ---> {}".format(query, nb_clf.predict(query)))

# #Query 3:
# query = np.array([['Sunny','Hot', 'High', 't']])
# print("Query 3:- {} ---> {}".format(query, nb_clf.predict(query)))
Query 1:- [['male' 'no' 'urban' 'high' 'smokes' 'private' 'yes' 'yes']] ---> ['no']
# Evaluate on the held-out test split.
print("Test Accuracy: {}".format(accuracy_score(Y_test, nb_clf.predict(X_test))))
Test Accuracy: 94.67
# Collect test-set predictions and show the true label distribution
# for comparison with the predicted counts below.
lol = nb_clf.predict(X_test)
lol =pd.DataFrame(data=lol)
Y_test.value_counts()
no     1072
yes      54
Name: stroke, dtype: int64
# Predicted label counts - per the output below, the model predicts 'no'
# almost everywhere on this imbalanced dataset.
lol.value_counts()
no     1116
yes      10
dtype: int64