naivebayes

2021-05-29 12:20:28 +02:00 · 2021-05-29 12:20:28 +02:00 · 0517754510
commit 0517754510
2 changed files with 5501 additions and 0 deletions
--- a/healthcare-dataset-stroke-data.csv
+++ b/healthcare-dataset-stroke-data.csv
--- a/main.ipynb
+++ b/main.ipynb
@ -0,0 +1,390 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import pandas as pd \n",
+    "import numpy as np\n",
+    "from sklearn.model_selection import train_test_split\n",
+    "\n",
+    "#Wczytanie i normalizacja danych\n",
+    "def NormalizeData(data):\n",
+    "    for col in data.columns:\n",
+    "        if data[col].dtype == object: \n",
+    "            data[col] = data[col].str.lower()\n",
+    "        if col == 'smoking_status':\n",
+    "            data[col] = data[col].str.replace(\" \", \"_\")\n",
+    "        if col == 'work_type':\n",
+    "            data[col] = data[col].str.replace(\"-\", \"_\")\n",
+    "        if col == 'bmi':\n",
+    "            bins = [0, 21, 28, 40]\n",
+    "            labels=['low','mid','high']\n",
+    "            data[col] = pd.cut(data[col], bins=bins, labels=labels)\n",
+    "        if col == 'age':\n",
+    "            bins = [18, 30, 40, 50, 60, 70, 120]\n",
+    "            labels = ['18-29', '30-39', '40-49', '50-59', '60-69', '70+']\n",
+    "            data[col] = pd.cut(data[col], bins, labels = labels,include_lowest = True)\n",
+    "        if col == 'stroke':\n",
+    "            data[col] = data[col].replace({1: 'yes'})\n",
+    "            data[col] = data[col].replace({0: 'no'})\n",
+    "        if col == 'hypertension':\n",
+    "            data[col] = data[col].replace({1: 'yes'})\n",
+    "            data[col] = data[col].replace({0: 'no'})\n",
+    "        if col == 'heart_disease':\n",
+    "            data[col] = data[col].replace({1: 'yes'})\n",
+    "            data[col] = data[col].replace({0: 'no'})\n",
+    "    data = data.dropna()\n",
+    "    return data\n",
+    "\n",
+    "data = pd.read_csv(\"healthcare-dataset-stroke-data.csv\")\n",
+    "data = NormalizeData(data)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>id</th>\n",
+       "      <th>gender</th>\n",
+       "      <th>age</th>\n",
+       "      <th>hypertension</th>\n",
+       "      <th>heart_disease</th>\n",
+       "      <th>ever_married</th>\n",
+       "      <th>work_type</th>\n",
+       "      <th>Residence_type</th>\n",
+       "      <th>avg_glucose_level</th>\n",
+       "      <th>bmi</th>\n",
+       "      <th>smoking_status</th>\n",
+       "      <th>stroke</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>9046</td>\n",
+       "      <td>male</td>\n",
+       "      <td>60-69</td>\n",
+       "      <td>no</td>\n",
+       "      <td>yes</td>\n",
+       "      <td>yes</td>\n",
+       "      <td>private</td>\n",
+       "      <td>urban</td>\n",
+       "      <td>228.69</td>\n",
+       "      <td>high</td>\n",
+       "      <td>formerly_smoked</td>\n",
+       "      <td>yes</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>31112</td>\n",
+       "      <td>male</td>\n",
+       "      <td>70+</td>\n",
+       "      <td>no</td>\n",
+       "      <td>yes</td>\n",
+       "      <td>yes</td>\n",
+       "      <td>private</td>\n",
+       "      <td>rural</td>\n",
+       "      <td>105.92</td>\n",
+       "      <td>high</td>\n",
+       "      <td>never_smoked</td>\n",
+       "      <td>yes</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>60182</td>\n",
+       "      <td>female</td>\n",
+       "      <td>40-49</td>\n",
+       "      <td>no</td>\n",
+       "      <td>no</td>\n",
+       "      <td>yes</td>\n",
+       "      <td>private</td>\n",
+       "      <td>urban</td>\n",
+       "      <td>171.23</td>\n",
+       "      <td>high</td>\n",
+       "      <td>smokes</td>\n",
+       "      <td>yes</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4</th>\n",
+       "      <td>1665</td>\n",
+       "      <td>female</td>\n",
+       "      <td>70+</td>\n",
+       "      <td>yes</td>\n",
+       "      <td>no</td>\n",
+       "      <td>yes</td>\n",
+       "      <td>self_employed</td>\n",
+       "      <td>rural</td>\n",
+       "      <td>174.12</td>\n",
+       "      <td>mid</td>\n",
+       "      <td>never_smoked</td>\n",
+       "      <td>yes</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>5</th>\n",
+       "      <td>56669</td>\n",
+       "      <td>male</td>\n",
+       "      <td>70+</td>\n",
+       "      <td>no</td>\n",
+       "      <td>no</td>\n",
+       "      <td>yes</td>\n",
+       "      <td>private</td>\n",
+       "      <td>urban</td>\n",
+       "      <td>186.21</td>\n",
+       "      <td>high</td>\n",
+       "      <td>formerly_smoked</td>\n",
+       "      <td>yes</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>...</th>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>5102</th>\n",
+       "      <td>45010</td>\n",
+       "      <td>female</td>\n",
+       "      <td>50-59</td>\n",
+       "      <td>no</td>\n",
+       "      <td>no</td>\n",
+       "      <td>yes</td>\n",
+       "      <td>private</td>\n",
+       "      <td>rural</td>\n",
+       "      <td>77.93</td>\n",
+       "      <td>mid</td>\n",
+       "      <td>never_smoked</td>\n",
+       "      <td>no</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>5106</th>\n",
+       "      <td>44873</td>\n",
+       "      <td>female</td>\n",
+       "      <td>70+</td>\n",
+       "      <td>no</td>\n",
+       "      <td>no</td>\n",
+       "      <td>yes</td>\n",
+       "      <td>self_employed</td>\n",
+       "      <td>urban</td>\n",
+       "      <td>125.20</td>\n",
+       "      <td>high</td>\n",
+       "      <td>never_smoked</td>\n",
+       "      <td>no</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>5107</th>\n",
+       "      <td>19723</td>\n",
+       "      <td>female</td>\n",
+       "      <td>30-39</td>\n",
+       "      <td>no</td>\n",
+       "      <td>no</td>\n",
+       "      <td>yes</td>\n",
+       "      <td>self_employed</td>\n",
+       "      <td>rural</td>\n",
+       "      <td>82.99</td>\n",
+       "      <td>high</td>\n",
+       "      <td>never_smoked</td>\n",
+       "      <td>no</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>5108</th>\n",
+       "      <td>37544</td>\n",
+       "      <td>male</td>\n",
+       "      <td>50-59</td>\n",
+       "      <td>no</td>\n",
+       "      <td>no</td>\n",
+       "      <td>yes</td>\n",
+       "      <td>private</td>\n",
+       "      <td>rural</td>\n",
+       "      <td>166.29</td>\n",
+       "      <td>mid</td>\n",
+       "      <td>formerly_smoked</td>\n",
+       "      <td>no</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>5109</th>\n",
+       "      <td>44679</td>\n",
+       "      <td>female</td>\n",
+       "      <td>40-49</td>\n",
+       "      <td>no</td>\n",
+       "      <td>no</td>\n",
+       "      <td>yes</td>\n",
+       "      <td>govt_job</td>\n",
+       "      <td>urban</td>\n",
+       "      <td>85.28</td>\n",
+       "      <td>mid</td>\n",
+       "      <td>unknown</td>\n",
+       "      <td>no</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "<p>3681 rows × 12 columns</p>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "         id  gender    age hypertension heart_disease ever_married  \\\n",
+       "0      9046    male  60-69           no           yes          yes   \n",
+       "2     31112    male    70+           no           yes          yes   \n",
+       "3     60182  female  40-49           no            no          yes   \n",
+       "4      1665  female    70+          yes            no          yes   \n",
+       "5     56669    male    70+           no            no          yes   \n",
+       "...     ...     ...    ...          ...           ...          ...   \n",
+       "5102  45010  female  50-59           no            no          yes   \n",
+       "5106  44873  female    70+           no            no          yes   \n",
+       "5107  19723  female  30-39           no            no          yes   \n",
+       "5108  37544    male  50-59           no            no          yes   \n",
+       "5109  44679  female  40-49           no            no          yes   \n",
+       "\n",
+       "          work_type Residence_type  avg_glucose_level   bmi   smoking_status  \\\n",
+       "0           private          urban             228.69  high  formerly_smoked   \n",
+       "2           private          rural             105.92  high     never_smoked   \n",
+       "3           private          urban             171.23  high           smokes   \n",
+       "4     self_employed          rural             174.12   mid     never_smoked   \n",
+       "5           private          urban             186.21  high  formerly_smoked   \n",
+       "...             ...            ...                ...   ...              ...   \n",
+       "5102        private          rural              77.93   mid     never_smoked   \n",
+       "5106  self_employed          urban             125.20  high     never_smoked   \n",
+       "5107  self_employed          rural              82.99  high     never_smoked   \n",
+       "5108        private          rural             166.29   mid  formerly_smoked   \n",
+       "5109       govt_job          urban              85.28   mid          unknown   \n",
+       "\n",
+       "     stroke  \n",
+       "0       yes  \n",
+       "2       yes  \n",
+       "3       yes  \n",
+       "4       yes  \n",
+       "5       yes  \n",
+       "...     ...  \n",
+       "5102     no  \n",
+       "5106     no  \n",
+       "5107     no  \n",
+       "5108     no  \n",
+       "5109     no  \n",
+       "\n",
+       "[3681 rows x 12 columns]"
+      ]
+     },
+     "execution_count": 4,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "data\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "#podział danych na treningowy i testowy \n",
+    "data_train, data_test = train_test_split(data, random_state = 42)\n",
+    "\n",
+    "X_train =data_train[['gender', 'age', 'ever_married', 'Residence_type', 'bmi','smoking_status', 'work_type','hypertension','heart_disease']]\n",
+    "Y_train = data_train['stroke']\n",
+    "\n",
+    "#rozdzielenie etykiet i cech\n",
+    "X_test =data_test[['gender', 'age', 'ever_married', 'Residence_type', 'bmi','smoking_status', 'work_type','hypertension','heart_disease']]\n",
+    "Y_test = data_test['stroke']"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "class  NaiveBayes:\n",
+    "    def __init__(self):\n",
+    "        self.features = list\n",
+    "        self.likelihoods = {}\n",
+    "        self.class_priors = {}\n",
+    "        self.pred_priors = {}\n",
+    "\n",
+    "        self.X_train = np.array\n",
+    "        self.y_train = np.array\n",
+    "        self.train_size = int\n",
+    "        self.num_feats = int\n",
+    "        \n",
+    "    def fit(self, x_train, y_train):\n",
+    "\n",
+    "        self.features = list(X.columns)\n",
+    "        self.X_train = x_train\n",
+    "        self.y_train = y_train\n",
+    "        self.train_size = X.shape[0]\n",
+    "        self.num_feats = X.shape[1]\n",
+    "\n",
+    "        for feature in self.features:\n",
+    "            self.likelihoods[feature] = {}\n",
+    "            self.pred_priors[feature] = {}\n",
+    "\n",
+    "            for feat_val in np.unique(self.X_train[feature]):\n",
+    "                self.pred_priors[feature].update({feat_val: 0})\n",
+    "\n",
+    "                for outcome in np.unique(self.y_train):\n",
+    "                    self.likelihoods[feature].update({feat_val+'_'+outcome:0})\n",
+    "                    self.class_priors.update({outcome: 0})\n",
+    "\n",
+    "        self._calc_class_prior()\n",
+    "        self._calc_likelihoods()\n",
+    "        self._calc_predictor_prior()\n"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.8.5"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}