diff --git a/ow.ipynb b/ow.ipynb
new file mode 100644
index 0000000..b1c7569
--- /dev/null
+++ b/ow.ipynb
@@ -0,0 +1,490 @@
+{
+ "metadata": {
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.8.5"
+ },
+ "orig_nbformat": 2,
+ "kernelspec": {
+ "name": "python385jvsc74a57bd0916dbcbb3f70747c44a77c7bcd40155683ae19c65e1c03b4aa3499c5328201f1",
+ "display_name": "Python 3.8.5 64-bit"
+ },
+ "metadata": {
+ "interpreter": {
+ "hash": "916dbcbb3f70747c44a77c7bcd40155683ae19c65e1c03b4aa3499c5328201f1"
+ }
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2,
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": 133,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import pandas as pd \n",
+ "import numpy as np\n",
+ "from sklearn.model_selection import train_test_split\n",
+ "\n",
+ "def NormalizeData(data):\n",
+ "    \"\"\"Return a cleaned, fully categorical copy of the stroke dataset.\n",
+ "\n",
+ "    Text (object dtype) columns are lower-cased; spaces in ``smoking_status``\n",
+ "    and hyphens in ``work_type`` become underscores; numeric ``bmi`` is binned\n",
+ "    into 'low' (0, 21], 'mid' (21, 28] and 'high' (above 28); the 0/1 flags\n",
+ "    ``stroke``, ``hypertension`` and ``heart_disease`` become 'no'/'yes'.\n",
+ "    Rows that still contain a missing value afterwards are dropped.\n",
+ "\n",
+ "    Parameters\n",
+ "    ----------\n",
+ "    data : pandas.DataFrame\n",
+ "        Raw frame. It is not modified.\n",
+ "\n",
+ "    Returns\n",
+ "    -------\n",
+ "    pandas.DataFrame\n",
+ "        New frame with the same columns and no missing values.\n",
+ "    \"\"\"\n",
+ "    yes_no_columns = ('stroke', 'hypertension', 'heart_disease')\n",
+ "    # The top bin is open-ended: with an upper edge of 40, pd.cut turned every\n",
+ "    # BMI above 40 into NaN and the final dropna() silently discarded those rows.\n",
+ "    bmi_bins = [0, 21, 28, np.inf]\n",
+ "    bmi_labels = ['low', 'mid', 'high']\n",
+ "\n",
+ "    # Work on a copy so the caller's frame is left untouched.\n",
+ "    data = data.copy()\n",
+ "    for col in data.columns:\n",
+ "        if data[col].dtype == object:\n",
+ "            data[col] = data[col].str.lower()\n",
+ "        if col == 'smoking_status':\n",
+ "            data[col] = data[col].str.replace(\" \", \"_\", regex=False)\n",
+ "        elif col == 'work_type':\n",
+ "            data[col] = data[col].str.replace(\"-\", \"_\", regex=False)\n",
+ "        elif col == 'bmi':\n",
+ "            # Only bin raw numbers: an already-binned (categorical) column is\n",
+ "            # left alone, so the function can be applied again without error.\n",
+ "            if pd.api.types.is_numeric_dtype(data[col]):\n",
+ "                data[col] = pd.cut(data[col], bins=bmi_bins, labels=bmi_labels)\n",
+ "        elif col in yes_no_columns:\n",
+ "            data[col] = data[col].replace({1: 'yes', 0: 'no'})\n",
+ "    return data.dropna()\n",
+ "\n",
+ "data = pd.read_csv(\"healthcare-dataset-stroke-data.csv\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 136,
+ "metadata": {},
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ " id gender age hypertension heart_disease ever_married \\\n",
+ "0 9046 male 67.0 no yes yes \n",
+ "2 31112 male 80.0 no yes yes \n",
+ "3 60182 female 49.0 no no yes \n",
+ "4 1665 female 79.0 yes no yes \n",
+ "5 56669 male 81.0 no no yes \n",
+ "... ... ... ... ... ... ... \n",
+ "5104 14180 female 13.0 no no no \n",
+ "5106 44873 female 81.0 no no yes \n",
+ "5107 19723 female 35.0 no no yes \n",
+ "5108 37544 male 51.0 no no yes \n",
+ "5109 44679 female 44.0 no no yes \n",
+ "\n",
+ " work_type Residence_type avg_glucose_level bmi smoking_status \\\n",
+ "0 private urban 228.69 high formerly_smoked \n",
+ "2 private rural 105.92 high never_smoked \n",
+ "3 private urban 171.23 high smokes \n",
+ "4 self_employed rural 174.12 mid never_smoked \n",
+ "5 private urban 186.21 high formerly_smoked \n",
+ "... ... ... ... ... ... \n",
+ "5104 children rural 103.08 low unknown \n",
+ "5106 self_employed urban 125.20 high never_smoked \n",
+ "5107 self_employed rural 82.99 high never_smoked \n",
+ "5108 private rural 166.29 mid formerly_smoked \n",
+ "5109 govt_job urban 85.28 mid unknown \n",
+ "\n",
+ " stroke \n",
+ "0 yes \n",
+ "2 yes \n",
+ "3 yes \n",
+ "4 yes \n",
+ "5 yes \n",
+ "... ... \n",
+ "5104 no \n",
+ "5106 no \n",
+ "5107 no \n",
+ "5108 no \n",
+ "5109 no \n",
+ "\n",
+ "[4501 rows x 12 columns]"
+ ],
+ "text/html": "
\n\n
\n \n \n | \n id | \n gender | \n age | \n hypertension | \n heart_disease | \n ever_married | \n work_type | \n Residence_type | \n avg_glucose_level | \n bmi | \n smoking_status | \n stroke | \n
\n \n \n \n 0 | \n 9046 | \n male | \n 67.0 | \n no | \n yes | \n yes | \n private | \n urban | \n 228.69 | \n high | \n formerly_smoked | \n yes | \n
\n \n 2 | \n 31112 | \n male | \n 80.0 | \n no | \n yes | \n yes | \n private | \n rural | \n 105.92 | \n high | \n never_smoked | \n yes | \n
\n \n 3 | \n 60182 | \n female | \n 49.0 | \n no | \n no | \n yes | \n private | \n urban | \n 171.23 | \n high | \n smokes | \n yes | \n
\n \n 4 | \n 1665 | \n female | \n 79.0 | \n yes | \n no | \n yes | \n self_employed | \n rural | \n 174.12 | \n mid | \n never_smoked | \n yes | \n
\n \n 5 | \n 56669 | \n male | \n 81.0 | \n no | \n no | \n yes | \n private | \n urban | \n 186.21 | \n high | \n formerly_smoked | \n yes | \n
\n \n ... | \n ... | \n ... | \n ... | \n ... | \n ... | \n ... | \n ... | \n ... | \n ... | \n ... | \n ... | \n ... | \n
\n \n 5104 | \n 14180 | \n female | \n 13.0 | \n no | \n no | \n no | \n children | \n rural | \n 103.08 | \n low | \n unknown | \n no | \n
\n \n 5106 | \n 44873 | \n female | \n 81.0 | \n no | \n no | \n yes | \n self_employed | \n urban | \n 125.20 | \n high | \n never_smoked | \n no | \n
\n \n 5107 | \n 19723 | \n female | \n 35.0 | \n no | \n no | \n yes | \n self_employed | \n rural | \n 82.99 | \n high | \n never_smoked | \n no | \n
\n \n 5108 | \n 37544 | \n male | \n 51.0 | \n no | \n no | \n yes | \n private | \n rural | \n 166.29 | \n mid | \n formerly_smoked | \n no | \n
\n \n 5109 | \n 44679 | \n female | \n 44.0 | \n no | \n no | \n yes | \n govt_job | \n urban | \n 85.28 | \n mid | \n unknown | \n no | \n
\n \n
\n
4501 rows × 12 columns
\n
"
+ },
+ "metadata": {},
+ "execution_count": 136
+ }
+ ],
+ "source": [
+ "data"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 135,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "data = NormalizeData(data)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 137,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "data = data[['gender', 'ever_married', 'Residence_type', 'bmi','smoking_status', 'work_type','stroke','hypertension','heart_disease']]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 138,
+ "metadata": {},
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ " gender ever_married Residence_type bmi smoking_status \\\n",
+ "0 male yes urban high formerly_smoked \n",
+ "2 male yes rural high never_smoked \n",
+ "3 female yes urban high smokes \n",
+ "4 female yes rural mid never_smoked \n",
+ "5 male yes urban high formerly_smoked \n",
+ "... ... ... ... ... ... \n",
+ "5104 female no rural low unknown \n",
+ "5106 female yes urban high never_smoked \n",
+ "5107 female yes rural high never_smoked \n",
+ "5108 male yes rural mid formerly_smoked \n",
+ "5109 female yes urban mid unknown \n",
+ "\n",
+ " work_type stroke hypertension heart_disease \n",
+ "0 private yes no yes \n",
+ "2 private yes no yes \n",
+ "3 private yes no no \n",
+ "4 self_employed yes yes no \n",
+ "5 private yes no no \n",
+ "... ... ... ... ... \n",
+ "5104 children no no no \n",
+ "5106 self_employed no no no \n",
+ "5107 self_employed no no no \n",
+ "5108 private no no no \n",
+ "5109 govt_job no no no \n",
+ "\n",
+ "[4501 rows x 9 columns]"
+ ],
+ "text/html": "\n\n
\n \n \n | \n gender | \n ever_married | \n Residence_type | \n bmi | \n smoking_status | \n work_type | \n stroke | \n hypertension | \n heart_disease | \n
\n \n \n \n 0 | \n male | \n yes | \n urban | \n high | \n formerly_smoked | \n private | \n yes | \n no | \n yes | \n
\n \n 2 | \n male | \n yes | \n rural | \n high | \n never_smoked | \n private | \n yes | \n no | \n yes | \n
\n \n 3 | \n female | \n yes | \n urban | \n high | \n smokes | \n private | \n yes | \n no | \n no | \n
\n \n 4 | \n female | \n yes | \n rural | \n mid | \n never_smoked | \n self_employed | \n yes | \n yes | \n no | \n
\n \n 5 | \n male | \n yes | \n urban | \n high | \n formerly_smoked | \n private | \n yes | \n no | \n no | \n
\n \n ... | \n ... | \n ... | \n ... | \n ... | \n ... | \n ... | \n ... | \n ... | \n ... | \n
\n \n 5104 | \n female | \n no | \n rural | \n low | \n unknown | \n children | \n no | \n no | \n no | \n
\n \n 5106 | \n female | \n yes | \n urban | \n high | \n never_smoked | \n self_employed | \n no | \n no | \n no | \n
\n \n 5107 | \n female | \n yes | \n rural | \n high | \n never_smoked | \n self_employed | \n no | \n no | \n no | \n
\n \n 5108 | \n male | \n yes | \n rural | \n mid | \n formerly_smoked | \n private | \n no | \n no | \n no | \n
\n \n 5109 | \n female | \n yes | \n urban | \n mid | \n unknown | \n govt_job | \n no | \n no | \n no | \n
\n \n
\n
4501 rows × 9 columns
\n
"
+ },
+ "metadata": {},
+ "execution_count": 138
+ }
+ ],
+ "source": [
+ "data"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 95,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "class NaiveBayes:\n",
+ "\n",
+ "\t\"\"\"\n",
+ "\tNaive Bayes classifier for categorical (string-valued) features.\n",
+ "\n",
+ "\tBayes' theorem, with c a class and x a feature value:\n",
+ "\n",
+ "\t\tP(c|x) = P(x|c) * P(c) / P(x)\n",
+ "\n",
+ "\tAll probabilities are relative frequencies counted on the training data.\n",
+ "\tThe per-feature terms are multiplied together for each query row.\n",
+ "\t\"\"\"\n",
+ "\n",
+ "\tdef __init__(self, alpha=0.0):\n",
+ "\n",
+ "\t\t\"\"\"\n",
+ "\t\t\tParameters:\n",
+ "\t\t\t\talpha: Additive (Laplace) smoothing added to every (value, class)\n",
+ "\t\t\t\t\tcount. The default 0 gives plain relative frequencies.\n",
+ "\n",
+ "\t\t\tAttributes:\n",
+ "\t\t\t\tlikelihoods: {feature: {value + '_' + class: P(value|class)}}\n",
+ "\t\t\t\tclass_priors: {class: P(class)}\n",
+ "\t\t\t\tpred_priors: {feature: {value: P(value)}}\n",
+ "\t\t\t\tfeatures: Feature (column) names in training order\n",
+ "\n",
+ "\t\t\tRaises:\n",
+ "\t\t\t\tValueError: if alpha is negative\n",
+ "\t\t\"\"\"\n",
+ "\t\tif alpha < 0:\n",
+ "\t\t\traise ValueError('alpha must be non-negative')\n",
+ "\t\tself.alpha = alpha\n",
+ "\n",
+ "\t\tself.features = []\n",
+ "\t\tself.likelihoods = {}\n",
+ "\t\tself.class_priors = {}\n",
+ "\t\tself.pred_priors = {}\n",
+ "\n",
+ "\t\t# Real empty values instead of type objects, so an unfitted model is easy to spot\n",
+ "\t\tself.X_train = None\n",
+ "\t\tself.y_train = None\n",
+ "\t\tself.train_size = 0\n",
+ "\t\tself.num_feats = 0\n",
+ "\n",
+ "\tdef fit(self, X, y):\n",
+ "\n",
+ "\t\t\"\"\"\n",
+ "\t\t\tEstimate class priors, likelihoods and feature-value priors.\n",
+ "\n",
+ "\t\t\tX: DataFrame whose columns hold string categories\n",
+ "\t\t\ty: string class labels, one per row of X (a Series must share X's index)\n",
+ "\t\t\"\"\"\n",
+ "\n",
+ "\t\tself.features = list(X.columns)\n",
+ "\t\tself.X_train = X\n",
+ "\t\tself.y_train = y\n",
+ "\t\tself.train_size = X.shape[0]\n",
+ "\t\tself.num_feats = X.shape[1]\n",
+ "\n",
+ "\t\t# Rebuild the tables from scratch so a second fit() keeps nothing from the first\n",
+ "\t\tself.likelihoods = {}\n",
+ "\t\tself.pred_priors = {}\n",
+ "\t\toutcomes = np.unique(self.y_train)\n",
+ "\t\tself.class_priors = {outcome: 0 for outcome in outcomes}\n",
+ "\n",
+ "\t\tfor feature in self.features:\n",
+ "\t\t\tfeat_vals = np.unique(self.X_train[feature])\n",
+ "\t\t\tself.pred_priors[feature] = {feat_val: 0 for feat_val in feat_vals}\n",
+ "\t\t\tself.likelihoods[feature] = {\n",
+ "\t\t\t\tfeat_val + '_' + outcome: 0\n",
+ "\t\t\t\tfor feat_val in feat_vals\n",
+ "\t\t\t\tfor outcome in outcomes\n",
+ "\t\t\t}\n",
+ "\n",
+ "\t\tself._calc_class_prior()\n",
+ "\t\tself._calc_likelihoods()\n",
+ "\t\tself._calc_predictor_prior()\n",
+ "\n",
+ "\tdef _calc_class_prior(self):\n",
+ "\n",
+ "\t\t\"\"\" P(c) - Prior Class Probability \"\"\"\n",
+ "\n",
+ "\t\tfor outcome in np.unique(self.y_train):\n",
+ "\t\t\toutcome_count = (self.y_train == outcome).sum()\n",
+ "\t\t\tself.class_priors[outcome] = outcome_count / self.train_size\n",
+ "\n",
+ "\tdef _calc_likelihoods(self):\n",
+ "\n",
+ "\t\t\"\"\" P(x|c) - Likelihood, with additive smoothing when alpha > 0 \"\"\"\n",
+ "\n",
+ "\t\tfor feature in self.features:\n",
+ "\t\t\tfeat_vals = np.unique(self.X_train[feature])\n",
+ "\t\t\ttable = self.likelihoods.setdefault(feature, {})\n",
+ "\n",
+ "\t\t\tfor outcome in np.unique(self.y_train):\n",
+ "\t\t\t\tin_class = self.y_train == outcome\n",
+ "\t\t\t\toutcome_count = in_class.sum()\n",
+ "\n",
+ "\t\t\t\t# Start every known value at zero so pairs never seen together are smoothed too\n",
+ "\t\t\t\tcounts = dict.fromkeys(feat_vals, 0)\n",
+ "\t\t\t\tcounts.update(self.X_train.loc[in_class, feature].value_counts().to_dict())\n",
+ "\n",
+ "\t\t\t\tdenominator = outcome_count + self.alpha * len(counts)\n",
+ "\t\t\t\tfor feat_val, count in counts.items():\n",
+ "\t\t\t\t\ttable[feat_val + '_' + outcome] = (count + self.alpha) / denominator\n",
+ "\n",
+ "\n",
+ "\tdef _calc_predictor_prior(self):\n",
+ "\n",
+ "\t\t\"\"\" P(x) - Evidence \"\"\"\n",
+ "\n",
+ "\t\tfor feature in self.features:\n",
+ "\t\t\tfeat_vals = self.X_train[feature].value_counts().to_dict()\n",
+ "\t\t\ttable = self.pred_priors.setdefault(feature, {})\n",
+ "\n",
+ "\t\t\tfor feat_val, count in feat_vals.items():\n",
+ "\t\t\t\ttable[feat_val] = count/self.train_size\n",
+ "\n",
+ "\n",
+ "\tdef predict(self, X):\n",
+ "\n",
+ "\t\t\"\"\"\n",
+ "\t\t\tReturn the class with the highest posterior P(c|x) for each query row.\n",
+ "\n",
+ "\t\t\tX: DataFrame (columns are matched to the training features by name),\n",
+ "\t\t\t\tor a 2-D array-like with values in training column order,\n",
+ "\t\t\t\tor a single 1-D sample.\n",
+ "\n",
+ "\t\t\tRaises KeyError when a query holds a value that was not seen during fit.\n",
+ "\t\t\"\"\"\n",
+ "\n",
+ "\t\tif isinstance(X, pd.DataFrame):\n",
+ "\t\t\t# Select by name so the column order of the query frame does not matter\n",
+ "\t\t\tX = X[self.features]\n",
+ "\t\tX = np.array(X)\n",
+ "\t\tif X.ndim == 1 and X.size > 0:\n",
+ "\t\t\t# A bare 1-D sample is one query, not a sequence of queries\n",
+ "\t\t\tX = X.reshape(1, -1)\n",
+ "\n",
+ "\t\toutcomes = np.unique(self.y_train)\n",
+ "\t\tresults = []\n",
+ "\n",
+ "\t\tfor query in X:\n",
+ "\t\t\tprobs_outcome = {}\n",
+ "\t\t\tfor outcome in outcomes:\n",
+ "\t\t\t\tlikelihood = 1\n",
+ "\t\t\t\tevidence = 1\n",
+ "\n",
+ "\t\t\t\tfor feat, feat_val in zip(self.features, query):\n",
+ "\t\t\t\t\tif feat_val not in self.pred_priors[feat]:\n",
+ "\t\t\t\t\t\traise KeyError('value {!r} of feature {!r} was not seen during fit'.format(feat_val, feat))\n",
+ "\t\t\t\t\tlikelihood *= self.likelihoods[feat][feat_val + '_' + outcome]\n",
+ "\t\t\t\t\tevidence *= self.pred_priors[feat][feat_val]\n",
+ "\n",
+ "\t\t\t\tposterior = likelihood * self.class_priors[outcome]\n",
+ "\t\t\t\t# The evidence is the same for every class, so skipping a zero one keeps the ranking\n",
+ "\t\t\t\tif evidence > 0:\n",
+ "\t\t\t\t\tposterior = posterior / evidence\n",
+ "\n",
+ "\t\t\t\tprobs_outcome[outcome] = posterior\n",
+ "\n",
+ "\t\t\tresults.append(max(probs_outcome, key=probs_outcome.get))\n",
+ "\n",
+ "\t\treturn np.array(results)\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 99,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def accuracy_score(y_true, y_pred):\n",
+ "\n",
+ "\t\"\"\"\n",
+ "\tPercentage of predictions equal to the true labels, rounded to 2 decimals.\n",
+ "\n",
+ "\tscore = 100 * (number of positions where y_pred == y_true) / len(y_true)\n",
+ "\n",
+ "\tBoth inputs are converted to arrays and compared by position, so a pandas\n",
+ "\tindex on either argument is ignored and plain lists are accepted.\n",
+ "\tRaises ValueError when the two inputs do not have the same shape.\n",
+ "\t\"\"\"\n",
+ "\n",
+ "\ty_true = np.asarray(y_true)\n",
+ "\ty_pred = np.asarray(y_pred)\n",
+ "\tif y_true.shape != y_pred.shape:\n",
+ "\t\traise ValueError('y_true and y_pred must have the same shape, got {} and {}'.format(y_true.shape, y_pred.shape))\n",
+ "\n",
+ "\treturn round(float(np.sum(y_pred == y_true))/float(len(y_true)) * 100 ,2)\n",
+ "\n",
+ "def pre_processing(df):\n",
+ "\n",
+ "\t\"\"\" Split a frame into features (every column except the last) and target (the last column) \"\"\"\n",
+ "\n",
+ "\ttarget_name = df.columns[-1]\n",
+ "\tfeatures = df.drop(columns=[target_name])\n",
+ "\ttarget = df[target_name]\n",
+ "\n",
+ "\treturn features, target"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 139,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Feature columns used by the model; 'stroke' is the target\n",
+ "feature_cols = ['gender', 'ever_married', 'Residence_type', 'bmi','smoking_status', 'work_type','hypertension','heart_disease']\n",
+ "\n",
+ "data_train, data_test = train_test_split(data, random_state = 42)\n",
+ "\n",
+ "X_train, Y_train = data_train[feature_cols], data_train['stroke']\n",
+ "X_test, Y_test = data_test[feature_cols], data_test['stroke']"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 141,
+ "metadata": {},
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": [
+ "['male' 'female' 'other']\n['yes' 'no']\n['urban' 'rural']\n['high', 'mid', 'low']\nCategories (3, object): ['low' < 'mid' < 'high']\n['formerly_smoked' 'never_smoked' 'smokes' 'unknown']\n['private' 'self_employed' 'govt_job' 'children' 'never_worked']\n['no' 'yes']\n['yes' 'no']\n"
+ ]
+ }
+ ],
+ "source": [
+ "# Print the distinct categories of every feature column, one column per line\n",
+ "for column in ['gender', 'ever_married', 'Residence_type', 'bmi', 'smoking_status', 'work_type', 'hypertension', 'heart_disease']:\n",
+ "    print(pd.unique(data[column]))\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 148,
+ "metadata": {},
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": [
+ "Query 1:- [['male' 'no' 'urban' 'high' 'smokes' 'private' 'yes' 'yes']] ---> ['no']\n"
+ ]
+ }
+ ],
+ "source": [
+ "# Fit the classifier on the training split\n",
+ "nb_clf = NaiveBayes()\n",
+ "nb_clf.fit(X_train, Y_train)\n",
+ "\n",
+ "# Query 1: one hand-written sample, values in the same order as the X_train columns\n",
+ "query = np.array([['male','no', 'urban', 'high','smokes', 'private','yes','yes']])\n",
+ "prediction = nb_clf.predict(query)\n",
+ "print(\"Query 1:- {} ---> {}\".format(query, prediction))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 143,
+ "metadata": {},
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": [
+ "Test Accuracy: 94.67\n"
+ ]
+ }
+ ],
+ "source": [
+ "print(\"Test Accuracy: {}\".format(accuracy_score(Y_test, nb_clf.predict(X_test))))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 157,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "lol = nb_clf.predict(X_test)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 158,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "lol =pd.DataFrame(data=lol)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 167,
+ "metadata": {},
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ "no 1072\n",
+ "yes 54\n",
+ "Name: stroke, dtype: int64"
+ ]
+ },
+ "metadata": {},
+ "execution_count": 167
+ }
+ ],
+ "source": [
+ "Y_test.value_counts()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 166,
+ "metadata": {},
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ "no 1116\n",
+ "yes 10\n",
+ "dtype: int64"
+ ]
+ },
+ "metadata": {},
+ "execution_count": 166
+ }
+ ],
+ "source": [
+ "lol.value_counts()"
+ ]
+ }
+ ]
+}
\ No newline at end of file