{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "import numpy as np\n", "import seaborn as sns\n", "import plotly.graph_objects as go\n", "from plotly.subplots import make_subplots\n", "from sklearn.model_selection import train_test_split\n", "from sklearn.linear_model import LogisticRegression\n", "from sklearn.linear_model import LinearRegression\n", "from sklearn.naive_bayes import GaussianNB\n", "from sklearn.preprocessing import StandardScaler\n", "from sklearn import metrics" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Przygotowanie danych:" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "Int64Index: 454 entries, 0 to 498\n", "Data columns (total 11 columns):\n", " # Column Non-Null Count Dtype \n", "--- ------ -------------- ----- \n", " 0 gender 454 non-null object \n", " 1 age 454 non-null float64\n", " 2 hypertension 454 non-null int64 \n", " 3 heart_disease 454 non-null int64 \n", " 4 ever_married 454 non-null object \n", " 5 work_type 454 non-null object \n", " 6 Residence_type 454 non-null object \n", " 7 avg_glucose_level 454 non-null float64\n", " 8 bmi 454 non-null float64\n", " 9 smoking_status 454 non-null object \n", " 10 stroke 454 non-null int64 \n", "dtypes: float64(3), int64(3), object(5)\n", "memory usage: 42.6+ KB\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
genderagehypertensionheart_diseaseever_marriedwork_typeResidence_typeavg_glucose_levelbmismoking_statusstroke
0Male67.001YesPrivateUrban228.6936.6formerly smoked1
2Male80.001YesPrivateRural105.9232.5never smoked1
3Female49.000YesPrivateUrban171.2334.4smokes1
4Female79.010YesSelf-employedRural174.1224.0never smoked1
5Male81.000YesPrivateUrban186.2129.0formerly smoked1
....................................
494Female55.000YesPrivateRural111.1939.7formerly smoked0
495Female71.000YesPrivateUrban93.2834.7never smoked0
496Male5.000NochildrenRural122.1935.0Unknown0
497Female14.000NochildrenRural129.5321.3never smoked0
498Female15.000NochildrenRural114.5329.1Unknown0
\n", "

454 rows × 11 columns

\n", "
" ], "text/plain": [ " gender age hypertension heart_disease ever_married work_type \\\n", "0 Male 67.0 0 1 Yes Private \n", "2 Male 80.0 0 1 Yes Private \n", "3 Female 49.0 0 0 Yes Private \n", "4 Female 79.0 1 0 Yes Self-employed \n", "5 Male 81.0 0 0 Yes Private \n", ".. ... ... ... ... ... ... \n", "494 Female 55.0 0 0 Yes Private \n", "495 Female 71.0 0 0 Yes Private \n", "496 Male 5.0 0 0 No children \n", "497 Female 14.0 0 0 No children \n", "498 Female 15.0 0 0 No children \n", "\n", " Residence_type avg_glucose_level bmi smoking_status stroke \n", "0 Urban 228.69 36.6 formerly smoked 1 \n", "2 Rural 105.92 32.5 never smoked 1 \n", "3 Urban 171.23 34.4 smokes 1 \n", "4 Rural 174.12 24.0 never smoked 1 \n", "5 Urban 186.21 29.0 formerly smoked 1 \n", ".. ... ... ... ... ... \n", "494 Rural 111.19 39.7 formerly smoked 0 \n", "495 Urban 93.28 34.7 never smoked 0 \n", "496 Rural 122.19 35.0 Unknown 0 \n", "497 Rural 129.53 21.3 never smoked 0 \n", "498 Rural 114.53 29.1 Unknown 0 \n", "\n", "[454 rows x 11 columns]" ] }, "execution_count": 2, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df = pd.read_csv('data.csv')\n", "del df['id']\n", "df = df[df['gender'].map(lambda x: str(x) != \"Other\")]\n", "df = df.dropna()\n", "df.info()\n", "df" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
agehypertensionheart_diseaseavg_glucose_levelbmistrokegender_Maleever_married_Yeswork_type_Never_workedwork_type_Privatework_type_Self-employedwork_type_childrenResidence_type_Urbansmoking_status_formerly smokedsmoking_status_never smokedsmoking_status_smokes
067.001228.6936.611101001100
280.001105.9232.511101000010
349.000171.2334.410101001001
479.010174.1224.010100100010
581.000186.2129.011101001100
...................................................
49455.000111.1939.700101000100
49571.00093.2834.700101001010
4965.000122.1935.001000010000
49714.000129.5321.300000010010
49815.000114.5329.100000010000
\n", "

454 rows × 16 columns

\n", "
" ], "text/plain": [ " age hypertension heart_disease avg_glucose_level bmi stroke \\\n", "0 67.0 0 1 228.69 36.6 1 \n", "2 80.0 0 1 105.92 32.5 1 \n", "3 49.0 0 0 171.23 34.4 1 \n", "4 79.0 1 0 174.12 24.0 1 \n", "5 81.0 0 0 186.21 29.0 1 \n", ".. ... ... ... ... ... ... \n", "494 55.0 0 0 111.19 39.7 0 \n", "495 71.0 0 0 93.28 34.7 0 \n", "496 5.0 0 0 122.19 35.0 0 \n", "497 14.0 0 0 129.53 21.3 0 \n", "498 15.0 0 0 114.53 29.1 0 \n", "\n", " gender_Male ever_married_Yes work_type_Never_worked work_type_Private \\\n", "0 1 1 0 1 \n", "2 1 1 0 1 \n", "3 0 1 0 1 \n", "4 0 1 0 0 \n", "5 1 1 0 1 \n", ".. ... ... ... ... \n", "494 0 1 0 1 \n", "495 0 1 0 1 \n", "496 1 0 0 0 \n", "497 0 0 0 0 \n", "498 0 0 0 0 \n", "\n", " work_type_Self-employed work_type_children Residence_type_Urban \\\n", "0 0 0 1 \n", "2 0 0 0 \n", "3 0 0 1 \n", "4 1 0 0 \n", "5 0 0 1 \n", ".. ... ... ... \n", "494 0 0 0 \n", "495 0 0 1 \n", "496 0 1 0 \n", "497 0 1 0 \n", "498 0 1 0 \n", "\n", " smoking_status_formerly smoked smoking_status_never smoked \\\n", "0 1 0 \n", "2 0 1 \n", "3 0 0 \n", "4 0 1 \n", "5 1 0 \n", ".. ... ... \n", "494 1 0 \n", "495 0 1 \n", "496 0 0 \n", "497 0 1 \n", "498 0 0 \n", "\n", " smoking_status_smokes \n", "0 0 \n", "2 0 \n", "3 1 \n", "4 0 \n", "5 0 \n", ".. ... \n", "494 0 \n", "495 0 \n", "496 0 \n", "497 0 \n", "498 0 \n", "\n", "[454 rows x 16 columns]" ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df = pd.get_dummies(df, columns=['gender'], drop_first=True)\n", "df = pd.get_dummies(df, columns=['ever_married'], drop_first=True)\n", "df = pd.get_dummies(df, columns=['work_type'], drop_first=True)\n", "df = pd.get_dummies(df, columns=['Residence_type'], drop_first=True)\n", "df = pd.get_dummies(df, columns=['smoking_status'], drop_first=True)\n", "\n", "df" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [], "source": [ "df_train, df_test = train_test_split(df, test_size=0.2, stratify=df['stroke'])" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "0 196\n", "1 167\n", "Name: stroke, dtype: int64" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df_train['stroke'].value_counts(dropna=False)" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "0 49\n", "1 42\n", "Name: stroke, dtype: int64" ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df_test['stroke'].value_counts(dropna=False)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Wizualizacja" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "" ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" }, { "data": { "image/png": "\n", "text/plain": [ "
" ] }, "metadata": { "needs_background": "light" }, "output_type": "display_data" } ], "source": [ "sns.pairplot(df, x_vars=['avg_glucose_level','bmi'], y_vars='age', height=7, aspect=0.7, diag_kind = None)" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "data": { "text/html": [ " \n", " " ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "application/vnd.plotly.v1+json": { "config": { "plotlyServerURL": "https://plot.ly" }, "data": [ { "name": "Smoker", "type": "bar", "x": [ "False", "True" ], "xaxis": "x", "y": [ 77, 377 ], "yaxis": "y" }, { "name": "Male", "type": "bar", "x": [ "False", "True" ], "xaxis": "x2", "y": [ 179, 275 ], "yaxis": "y2" }, { "name": "Urban", "type": "bar", "x": [ "False", "True" ], "xaxis": "x3", "y": [ 217, 237 ], "yaxis": "y3" } ], "layout": { "annotations": [ { "font": { "size": 16 }, "showarrow": false, "text": "Smoker", "x": 0.14444444444444446, "xanchor": "center", "xref": "paper", "y": 1, "yanchor": "bottom", "yref": "paper" }, { "font": { "size": 16 }, "showarrow": false, "text": "Male", "x": 0.5, "xanchor": "center", "xref": "paper", "y": 1, "yanchor": "bottom", "yref": "paper" }, { "font": { "size": 16 }, "showarrow": false, "text": "Urban", "x": 0.8555555555555556, "xanchor": "center", "xref": "paper", "y": 1, "yanchor": "bottom", "yref": "paper" } ], "bargap": 0.05, "barmode": "relative", "height": 400, "template": { "data": { "bar": [ { "error_x": { "color": "#2a3f5f" }, "error_y": { "color": "#2a3f5f" }, "marker": { "line": { "color": "#E5ECF6", "width": 0.5 } }, "type": "bar" } ], "barpolar": [ { "marker": { "line": { "color": "#E5ECF6", "width": 0.5 } }, "type": "barpolar" } ], "carpet": [ { "aaxis": { "endlinecolor": "#2a3f5f", "gridcolor": "white", "linecolor": "white", "minorgridcolor": "white", "startlinecolor": "#2a3f5f" }, "baxis": { "endlinecolor": "#2a3f5f", "gridcolor": "white", "linecolor": "white", "minorgridcolor": "white", "startlinecolor": "#2a3f5f" }, "type": "carpet" } ], "choropleth": [ { "colorbar": { "outlinewidth": 0, "ticks": "" }, "type": "choropleth" } ], "contour": [ { "colorbar": { "outlinewidth": 0, "ticks": "" }, "colorscale": [ [ 0, "#0d0887" ], [ 0.1111111111111111, "#46039f" ], [ 0.2222222222222222, "#7201a8" ], [ 0.3333333333333333, "#9c179e" ], [ 0.4444444444444444, "#bd3786" ], [ 0.5555555555555556, "#d8576b" ], [ 0.6666666666666666, "#ed7953" ], [ 0.7777777777777778, "#fb9f3a" ], [ 0.8888888888888888, "#fdca26" ], [ 1, "#f0f921" ] ], "type": "contour" } ], "contourcarpet": [ { "colorbar": { "outlinewidth": 0, "ticks": "" }, "type": "contourcarpet" } ], "heatmap": [ { "colorbar": { "outlinewidth": 0, "ticks": "" }, "colorscale": [ [ 0, "#0d0887" ], [ 0.1111111111111111, "#46039f" ], [ 0.2222222222222222, "#7201a8" ], [ 0.3333333333333333, "#9c179e" ], [ 0.4444444444444444, "#bd3786" ], [ 0.5555555555555556, "#d8576b" ], [ 0.6666666666666666, "#ed7953" ], [ 0.7777777777777778, "#fb9f3a" ], [ 0.8888888888888888, "#fdca26" ], [ 1, "#f0f921" ] ], "type": "heatmap" } ], "heatmapgl": [ { "colorbar": { "outlinewidth": 0, "ticks": "" }, "colorscale": [ [ 0, "#0d0887" ], [ 0.1111111111111111, "#46039f" ], [ 0.2222222222222222, "#7201a8" ], [ 0.3333333333333333, "#9c179e" ], [ 0.4444444444444444, "#bd3786" ], [ 0.5555555555555556, "#d8576b" ], [ 0.6666666666666666, "#ed7953" ], [ 0.7777777777777778, "#fb9f3a" ], [ 0.8888888888888888, "#fdca26" ], [ 1, "#f0f921" ] ], "type": "heatmapgl" } ], "histogram": [ { "marker": { "colorbar": { "outlinewidth": 0, "ticks": "" } }, "type": "histogram" } ], "histogram2d": [ { "colorbar": { "outlinewidth": 0, "ticks": "" }, "colorscale": [ [ 0, "#0d0887" ], [ 0.1111111111111111, "#46039f" ], [ 0.2222222222222222, "#7201a8" ], [ 0.3333333333333333, "#9c179e" ], [ 0.4444444444444444, "#bd3786" ], [ 0.5555555555555556, "#d8576b" ], [ 0.6666666666666666, "#ed7953" ], [ 0.7777777777777778, "#fb9f3a" ], [ 0.8888888888888888, "#fdca26" ], [ 1, "#f0f921" ] ], "type": "histogram2d" } ], "histogram2dcontour": [ { "colorbar": { "outlinewidth": 0, "ticks": "" }, "colorscale": [ [ 0, "#0d0887" ], [ 0.1111111111111111, "#46039f" ], [ 0.2222222222222222, "#7201a8" ], [ 0.3333333333333333, "#9c179e" ], [ 0.4444444444444444, "#bd3786" ], [ 0.5555555555555556, "#d8576b" ], [ 0.6666666666666666, "#ed7953" ], [ 0.7777777777777778, "#fb9f3a" ], [ 0.8888888888888888, "#fdca26" ], [ 1, "#f0f921" ] ], "type": "histogram2dcontour" } ], "mesh3d": [ { "colorbar": { "outlinewidth": 0, "ticks": "" }, "type": "mesh3d" } ], "parcoords": [ { "line": { "colorbar": { "outlinewidth": 0, "ticks": "" } }, "type": "parcoords" } ], "pie": [ { "automargin": true, "type": "pie" } ], "scatter": [ { "marker": { "colorbar": { "outlinewidth": 0, "ticks": "" } }, "type": "scatter" } ], "scatter3d": [ { "line": { "colorbar": { "outlinewidth": 0, "ticks": "" } }, "marker": { "colorbar": { "outlinewidth": 0, "ticks": "" } }, "type": "scatter3d" } ], "scattercarpet": [ { "marker": { "colorbar": { "outlinewidth": 0, "ticks": "" } }, "type": "scattercarpet" } ], "scattergeo": [ { "marker": { "colorbar": { "outlinewidth": 0, "ticks": "" } }, "type": "scattergeo" } ], "scattergl": [ { "marker": { "colorbar": { "outlinewidth": 0, "ticks": "" } }, "type": "scattergl" } ], "scattermapbox": [ { "marker": { "colorbar": { "outlinewidth": 0, "ticks": "" } }, "type": "scattermapbox" } ], "scatterpolar": [ { "marker": { "colorbar": { "outlinewidth": 0, "ticks": "" } }, "type": "scatterpolar" } ], "scatterpolargl": [ { "marker": { "colorbar": { "outlinewidth": 0, "ticks": "" } }, "type": "scatterpolargl" } ], "scatterternary": [ { "marker": { "colorbar": { "outlinewidth": 0, "ticks": "" } }, "type": "scatterternary" } ], "surface": [ { "colorbar": { "outlinewidth": 0, "ticks": "" }, "colorscale": [ [ 0, "#0d0887" ], [ 0.1111111111111111, "#46039f" ], [ 0.2222222222222222, "#7201a8" ], [ 0.3333333333333333, "#9c179e" ], [ 0.4444444444444444, "#bd3786" ], [ 0.5555555555555556, "#d8576b" ], [ 0.6666666666666666, "#ed7953" ], [ 0.7777777777777778, "#fb9f3a" ], [ 0.8888888888888888, "#fdca26" ], [ 1, "#f0f921" ] ], "type": "surface" } ], "table": [ { "cells": { "fill": { "color": "#EBF0F8" }, "line": { "color": "white" } }, "header": { "fill": { "color": "#C8D4E3" }, "line": { "color": "white" } }, "type": "table" } ] }, "layout": { "annotationdefaults": { "arrowcolor": "#2a3f5f", "arrowhead": 0, "arrowwidth": 1 }, "autotypenumbers": "strict", "coloraxis": { "colorbar": { "outlinewidth": 0, "ticks": "" } }, "colorscale": { "diverging": [ [ 0, "#8e0152" ], [ 0.1, "#c51b7d" ], [ 0.2, "#de77ae" ], [ 0.3, "#f1b6da" ], [ 0.4, "#fde0ef" ], [ 0.5, "#f7f7f7" ], [ 0.6, "#e6f5d0" ], [ 0.7, "#b8e186" ], [ 0.8, "#7fbc41" ], [ 0.9, "#4d9221" ], [ 1, "#276419" ] ], "sequential": [ [ 0, "#0d0887" ], [ 0.1111111111111111, "#46039f" ], [ 0.2222222222222222, "#7201a8" ], [ 0.3333333333333333, "#9c179e" ], [ 0.4444444444444444, "#bd3786" ], [ 0.5555555555555556, "#d8576b" ], [ 0.6666666666666666, "#ed7953" ], [ 0.7777777777777778, "#fb9f3a" ], [ 0.8888888888888888, "#fdca26" ], [ 1, "#f0f921" ] ], "sequentialminus": [ [ 0, "#0d0887" ], [ 0.1111111111111111, "#46039f" ], [ 0.2222222222222222, "#7201a8" ], [ 0.3333333333333333, "#9c179e" ], [ 0.4444444444444444, "#bd3786" ], [ 0.5555555555555556, "#d8576b" ], [ 0.6666666666666666, "#ed7953" ], [ 0.7777777777777778, "#fb9f3a" ], [ 0.8888888888888888, "#fdca26" ], [ 1, "#f0f921" ] ] }, "colorway": [ "#636efa", "#EF553B", "#00cc96", "#ab63fa", "#FFA15A", "#19d3f3", "#FF6692", "#B6E880", "#FF97FF", "#FECB52" ], "font": { "color": "#2a3f5f" }, "geo": { "bgcolor": "white", "lakecolor": "white", "landcolor": "#E5ECF6", "showlakes": true, "showland": true, "subunitcolor": "white" }, "hoverlabel": { "align": "left" }, "hovermode": "closest", "mapbox": { "style": "light" }, "paper_bgcolor": "white", "plot_bgcolor": "#E5ECF6", "polar": { "angularaxis": { "gridcolor": "white", "linecolor": "white", "ticks": "" }, "bgcolor": "#E5ECF6", "radialaxis": { "gridcolor": "white", "linecolor": "white", "ticks": "" } }, "scene": { "xaxis": { "backgroundcolor": "#E5ECF6", "gridcolor": "white", "gridwidth": 2, "linecolor": "white", "showbackground": true, "ticks": "", "zerolinecolor": "white" }, "yaxis": { "backgroundcolor": "#E5ECF6", "gridcolor": "white", "gridwidth": 2, "linecolor": "white", "showbackground": true, "ticks": "", "zerolinecolor": "white" }, "zaxis": { "backgroundcolor": "#E5ECF6", "gridcolor": "white", "gridwidth": 2, "linecolor": "white", "showbackground": true, "ticks": "", "zerolinecolor": "white" } }, "shapedefaults": { "line": { "color": "#2a3f5f" } }, "ternary": { "aaxis": { "gridcolor": "white", "linecolor": "white", "ticks": "" }, "baxis": { "gridcolor": "white", "linecolor": "white", "ticks": "" }, "bgcolor": "#E5ECF6", "caxis": { "gridcolor": "white", "linecolor": "white", "ticks": "" } }, "title": { "x": 0.05 }, "xaxis": { "automargin": true, "gridcolor": "white", "linecolor": "white", "ticks": "", "title": { "standoff": 15 }, "zerolinecolor": "white", "zerolinewidth": 2 }, "yaxis": { "automargin": true, "gridcolor": "white", "linecolor": "white", "ticks": "", "title": { "standoff": 15 }, "zerolinecolor": "white", "zerolinewidth": 2 } } }, "width": 700, "xaxis": { "anchor": "y", "domain": [ 0, 0.2888888888888889 ] }, "xaxis2": { "anchor": "y2", "domain": [ 0.35555555555555557, 0.6444444444444445 ] }, "xaxis3": { "anchor": "y3", "domain": [ 0.7111111111111111, 1 ] }, "yaxis": { "anchor": "x", "domain": [ 0, 1 ] }, "yaxis2": { "anchor": "x2", "domain": [ 0, 1 ] }, "yaxis3": { "anchor": "x3", "domain": [ 0, 1 ] } } }, "text/html": [ "
" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "features = {'Smoker': df['smoking_status_smokes'].values,\n", " 'Male': df['gender_Male'].values,\n", " 'Urban': df['Residence_type_Urban']}\n", "df_vis = pd.DataFrame(features)\n", "\n", "fig = make_subplots(rows=1, cols=3, subplot_titles=('Smoker', 'Male', 'Urban'))\n", "L= len(df_vis)\n", "\n", "cnames = list(df_vis.columns)\n", "for k, name in enumerate(cnames):\n", " n_true = df_vis[name].sum()\n", " fig.add_trace(go.Bar(x=['False', 'True'], y=[n_true, L-n_true], name=name ), 1,k+1)\n", "fig.update_layout(barmode='relative', bargap=0.05, width=700, height=400)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Regresja liniowa" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [], "source": [ "X_cols = list(set(df_train.columns) - {'stroke'})\n", "y_cols = 'stroke'\n", "\n", "X_train_lin = df_train[X_cols]\n", "y_train_lin = df_train[y_cols]\n", "\n", "X_test_lin = df_test[X_cols]\n", "y_test_lin = df_test[y_cols]\n", "\n", "clf_lin = LinearRegression().fit(X_train_lin, y_train_lin)\n", "test_pred_lin = clf_lin.predict(X_test_lin)" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [], "source": [ "def regression_results(y_true, y_pred):\n", "\n", " # Regression metrics\n", " explained_variance=metrics.explained_variance_score(y_true, y_pred)\n", " mean_absolute_error=metrics.mean_absolute_error(y_true, y_pred) \n", " mse=metrics.mean_squared_error(y_true, y_pred) \n", " mean_squared_log_error=metrics.mean_squared_log_error(y_true, y_pred)\n", " median_absolute_error=metrics.median_absolute_error(y_true, y_pred)\n", " r2=metrics.r2_score(y_true, y_pred)\n", "\n", " print('explained_variance: ', round(explained_variance,4)) \n", " print('mean_squared_log_error: ', round(mean_squared_log_error,4))\n", " print('r2: ', round(r2,4))\n", " print('MAE: ', round(mean_absolute_error,4))\n", " print('MSE: ', round(mse,4))\n", " print('RMSE: ', round(np.sqrt(mse),4))" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "explained_variance: 0.0292\n", "mean_squared_log_error: 0.1162\n", "r2: 0.0272\n", "MAE: 0.2418\n", "MSE: 0.2418\n", "RMSE: 0.4917\n" ] } ], "source": [ "test_pred_lin = np.round(np.clip(test_pred_lin, 0, 1))\n", "regression_results(y_test_lin, test_pred_lin)" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " precision recall f1-score support\n", "\n", " 0 0.79 0.76 0.77 49\n", " 1 0.73 0.76 0.74 42\n", "\n", " accuracy 0.76 91\n", " macro avg 0.76 0.76 0.76 91\n", "weighted avg 0.76 0.76 0.76 91\n", "\n" ] } ], "source": [ "print(metrics.classification_report(y_test_lin, test_pred_lin))" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([1., 0., 1., 0., 1., 0., 0., 0., 0., 0., 0., 1., 1., 0., 0., 1., 1.,\n", " 0., 0., 1., 0., 1., 1., 1., 1., 1., 1., 0., 0., 1., 1., 0., 1., 1.,\n", " 0., 1., 0., 0., 0., 0., 1., 0., 0., 1., 0., 1., 1., 0., 0., 1., 1.,\n", " 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 1., 1., 1., 1., 1., 1.,\n", " 1., 1., 1., 1., 1., 1., 0., 0., 1., 1., 0., 0., 1., 0., 0., 0., 1.,\n", " 0., 1., 0., 0., 1., 0.])" ] }, "execution_count": 13, "metadata": {}, "output_type": "execute_result" } ], "source": [ "test_pred_lin" ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1,\n", " 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1,\n", " 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0,\n", " 1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0,\n", " 0, 1, 0], dtype=int64)" ] }, "execution_count": 14, "metadata": {}, "output_type": "execute_result" } ], "source": [ "y_test_lin.values" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Regresja logistyczna" ] }, { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [], "source": [ "num_cols = ['age', 'avg_glucose_level', 'bmi']\n", "cat_cols = list(set(df.columns) - {'stroke', 'age', 'avg_glucose_level', 'bmi'})\n", "\n", "scaler = StandardScaler()\n", "scaler.fit(df_train[num_cols])\n", "\n", "X_num_train = scaler.transform(df_train[num_cols])\n", "X_cat_train = df_train[cat_cols].to_numpy()\n", "X_train = np.hstack((X_num_train, X_cat_train))\n", "y_train = df_train['stroke']\n", "\n", "X_num_test = scaler.transform(df_test[num_cols])\n", "X_cat_test = df_test[cat_cols].to_numpy()\n", "X_test = np.hstack((X_num_test, X_cat_test))\n", "y_test = df_test['stroke']" ] }, { "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "explained_variance: -0.0165\n", "mean_squared_log_error: 0.1214\n", "r2: -0.017\n", "MAE: 0.2527\n", "MSE: 0.2527\n", "RMSE: 0.5027\n" ] } ], "source": [ "clf_log = LogisticRegression()\n", "\n", "clf_log.fit(X_train, y_train)\n", "\n", "test_pred_log = clf_log.predict(X_test)\n", "\n", "regression_results(y_test, test_pred_log)" ] }, { "cell_type": "code", "execution_count": 17, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "Classification Report\n", " precision recall f1-score support\n", "\n", " 0 0.77 0.76 0.76 49\n", " 1 0.72 0.74 0.73 42\n", "\n", " accuracy 0.75 91\n", " macro avg 0.75 0.75 0.75 91\n", "weighted avg 0.75 0.75 0.75 91\n", "\n" ] } ], "source": [ "print('\\nClassification Report')\n", "print(metrics.classification_report(y_test, test_pred_log))" ] }, { "cell_type": "code", "execution_count": 18, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1,\n", " 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1,\n", " 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,\n", " 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0,\n", " 0, 1, 0], dtype=int64)" ] }, "execution_count": 18, "metadata": {}, "output_type": "execute_result" } ], "source": [ "test_pred_log" ] }, { "cell_type": "code", "execution_count": 19, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1,\n", " 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1,\n", " 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0,\n", " 1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0,\n", " 0, 1, 0], dtype=int64)" ] }, "execution_count": 19, "metadata": {}, "output_type": "execute_result" } ], "source": [ "y_test.values" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Naiwny Bayes" ] }, { "cell_type": "code", "execution_count": 20, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "explained_variance: 0.2187\n", "mean_squared_log_error: 0.1003\n", "r2: 0.1599\n", "MAE: 0.2088\n", "MSE: 0.2088\n", "RMSE: 0.4569\n" ] } ], "source": [ "gnb = GaussianNB()\n", "\n", "gnb = gnb.fit(X_train, y_train)\n", "\n", "test_pred_bay = gnb.predict(X_test)\n", "\n", "regression_results(y_test, test_pred_bay)" ] }, { "cell_type": "code", "execution_count": 21, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "Classification Report\n", " precision recall f1-score support\n", "\n", " 0 0.89 0.69 0.78 49\n", " 1 0.72 0.90 0.80 42\n", "\n", " accuracy 0.79 91\n", " macro avg 0.81 0.80 0.79 91\n", "weighted avg 0.81 0.79 0.79 91\n", "\n" ] } ], "source": [ "print('\\nClassification Report')\n", "print(metrics.classification_report(y_test, test_pred_bay))" ] }, { "cell_type": "code", "execution_count": 22, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 1,\n", " 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1,\n", " 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1,\n", " 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1,\n", " 0, 1, 0], dtype=int64)" ] }, "execution_count": 22, "metadata": {}, "output_type": "execute_result" } ], "source": [ "test_pred_bay" ] }, { "cell_type": "code", "execution_count": 23, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1,\n", " 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1,\n", " 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0,\n", " 1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0,\n", " 0, 1, 0], dtype=int64)" ] }, "execution_count": 23, "metadata": {}, "output_type": "execute_result" } ], "source": [ "y_test.values" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.8.5" } }, "nbformat": 4, "nbformat_minor": 4 }