def NormalizeData(data):
    """Clean and discretise the stroke dataset so every feature is categorical.

    Steps applied per column:

    * object-dtype columns are lower-cased;
    * ``smoking_status``: spaces become underscores;
    * ``work_type``: hyphens become underscores;
    * ``bmi``: binned into ``low`` (0, 21], ``mid`` (21, 28], ``high`` (28, 40];
    * ``age``: binned into left-closed decades so the labels match the edges
      (``18-29`` is [18, 30), ..., ``70+`` is [70, inf));
    * ``stroke`` / ``hypertension`` / ``heart_disease``: 1 -> ``'yes'``,
      0 -> ``'no'``.

    Rows that end up with any missing value are dropped. That includes rows
    whose ``bmi`` or ``age`` falls outside the bins above (e.g. age < 18 or
    bmi > 40), not only rows that were missing in the raw data.

    Parameters
    ----------
    data : pandas.DataFrame
        Raw frame. It is NOT modified; a normalised copy is returned.

    Returns
    -------
    pandas.DataFrame
        Normalised copy of ``data`` with incomplete rows removed (original
        index labels are kept).
    """
    # Work on a copy: the previous version wrote into the caller's frame,
    # so re-running the cell on the same object was not idempotent.
    data = data.copy()

    yes_no = {1: 'yes', 0: 'no'}
    binary_columns = ('stroke', 'hypertension', 'heart_disease')

    bmi_bins = [0, 21, 28, 40]
    bmi_labels = ['low', 'mid', 'high']

    # Left-closed bins ([18, 30), [30, 40), ...) so that e.g. age 30 lands in
    # '30-39' rather than '18-29'. The last bin is open-ended to match '70+'.
    age_bins = [18, 30, 40, 50, 60, 70, float('inf')]
    age_labels = ['18-29', '30-39', '40-49', '50-59', '60-69', '70+']

    for col in data.columns:
        if data[col].dtype == object:
            data[col] = data[col].str.lower()
        if col == 'smoking_status':
            data[col] = data[col].str.replace(" ", "_", regex=False)
        if col == 'work_type':
            data[col] = data[col].str.replace("-", "_", regex=False)
        if col == 'bmi':
            # NOTE(review): bmi above 40 falls outside every bin, becomes NaN
            # and is removed by dropna() below - confirm this is intended.
            data[col] = pd.cut(data[col], bins=bmi_bins, labels=bmi_labels)
        if col == 'age':
            data[col] = pd.cut(data[col], bins=age_bins, labels=age_labels, right=False)
        if col in binary_columns:
            data[col] = data[col].replace(yes_no)

    return data.dropna()
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
idgenderagehypertensionheart_diseaseever_marriedwork_typeResidence_typeavg_glucose_levelbmismoking_statusstroke
09046male60-69noyesyesprivateurban228.69highformerly_smokedyes
231112male70+noyesyesprivaterural105.92highnever_smokedyes
360182female40-49nonoyesprivateurban171.23highsmokesyes
41665female70+yesnoyesself_employedrural174.12midnever_smokedyes
556669male70+nonoyesprivateurban186.21highformerly_smokedyes
.......................................
510245010female50-59nonoyesprivaterural77.93midnever_smokedno
510644873female70+nonoyesself_employedurban125.20highnever_smokedno
510719723female30-39nonoyesself_employedrural82.99highnever_smokedno
510837544male50-59nonoyesprivaterural166.29midformerly_smokedno
510944679female40-49nonoyesgovt_joburban85.28midunknownno
\n", - "

3681 rows × 12 columns

\n", - "
" - ], - "text/plain": [ - " id gender age hypertension heart_disease ever_married \\\n", - "0 9046 male 60-69 no yes yes \n", - "2 31112 male 70+ no yes yes \n", - "3 60182 female 40-49 no no yes \n", - "4 1665 female 70+ yes no yes \n", - "5 56669 male 70+ no no yes \n", - "... ... ... ... ... ... ... \n", - "5102 45010 female 50-59 no no yes \n", - "5106 44873 female 70+ no no yes \n", - "5107 19723 female 30-39 no no yes \n", - "5108 37544 male 50-59 no no yes \n", - "5109 44679 female 40-49 no no yes \n", - "\n", - " work_type Residence_type avg_glucose_level bmi smoking_status \\\n", - "0 private urban 228.69 high formerly_smoked \n", - "2 private rural 105.92 high never_smoked \n", - "3 private urban 171.23 high smokes \n", - "4 self_employed rural 174.12 mid never_smoked \n", - "5 private urban 186.21 high formerly_smoked \n", - "... ... ... ... ... ... \n", - "5102 private rural 77.93 mid never_smoked \n", - "5106 self_employed urban 125.20 high never_smoked \n", - "5107 self_employed rural 82.99 high never_smoked \n", - "5108 private rural 166.29 mid formerly_smoked \n", - "5109 govt_job urban 85.28 mid unknown \n", - "\n", - " stroke \n", - "0 yes \n", - "2 yes \n", - "3 yes \n", - "4 yes \n", - "5 yes \n", - "... ... 
class NaiveBayes:
    """Naive Bayes estimator for purely categorical features.

    After ``fit`` the instance holds three probability tables:

    * ``class_priors[outcome]``            -> P(outcome)
    * ``pred_priors[feature][value]``      -> P(feature == value)
    * ``likelihoods[feature]["<value>_<outcome>"]``
                                           -> P(feature == value | outcome)

    Feature/label values are compared with ``==`` and sorted by
    ``numpy.unique``, so inputs are expected to contain no missing values.
    """

    def __init__(self):
        # Real empty values instead of type objects used as placeholders.
        self.features = []
        self.likelihoods = {}
        self.class_priors = {}
        self.pred_priors = {}

        self.X_train = None
        self.y_train = None
        self.train_size = 0
        self.num_feats = 0

    def fit(self, x_train, y_train):
        """Estimate priors and likelihoods from the training data.

        Parameters
        ----------
        x_train : pandas.DataFrame
            One column per categorical feature.
        y_train : pandas.Series or array-like
            Class label for each row of ``x_train``; rows are matched by
            position.

        Raises
        ------
        ValueError
            If ``x_train`` and ``y_train`` have different lengths.
        """
        if len(x_train) != len(y_train):
            raise ValueError("x_train and y_train must have the same number of rows")

        # The original read an undefined name `X` here (NameError).
        self.features = list(x_train.columns)
        self.X_train = x_train
        self.y_train = y_train
        self.train_size = x_train.shape[0]
        self.num_feats = x_train.shape[1]

        # Start from clean tables so that calling fit() twice does not keep
        # entries from the previous training set.
        self.likelihoods = {}
        self.class_priors = {}
        self.pred_priors = {}

        outcomes = np.unique(np.asarray(self.y_train))
        for outcome in outcomes:
            self.class_priors[outcome] = 0

        for feature in self.features:
            self.likelihoods[feature] = {}
            self.pred_priors[feature] = {}

            for feat_val in np.unique(np.asarray(self.X_train[feature])):
                self.pred_priors[feature][feat_val] = 0
                for outcome in outcomes:
                    # Unseen (value, outcome) pairs keep likelihood 0.
                    self.likelihoods[feature][f"{feat_val}_{outcome}"] = 0

        self._calc_class_prior()
        self._calc_likelihoods()
        self._calc_predictor_prior()

    def _calc_class_prior(self):
        """Fill ``class_priors`` with the relative frequency of each label."""
        labels = np.asarray(self.y_train)
        for outcome in np.unique(labels):
            count = int(np.sum(labels == outcome))
            self.class_priors[outcome] = count / self.train_size

    def _calc_likelihoods(self):
        """Fill ``likelihoods`` with P(feature == value | outcome)."""
        labels = np.asarray(self.y_train)
        for feature in self.features:
            values = np.asarray(self.X_train[feature])
            for outcome in np.unique(labels):
                values_in_class = values[labels == outcome]
                class_count = len(values_in_class)
                for feat_val in np.unique(values_in_class):
                    hits = int(np.sum(values_in_class == feat_val))
                    self.likelihoods[feature][f"{feat_val}_{outcome}"] = hits / class_count

    def _calc_predictor_prior(self):
        """Fill ``pred_priors`` with the relative frequency of each feature value."""
        for feature in self.features:
            values = np.asarray(self.X_train[feature])
            for feat_val in np.unique(values):
                count = int(np.sum(values == feat_val))
                self.pred_priors[feature][feat_val] = count / self.train_size