import pandas as pd
import numpy as np
import seaborn as sns
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LinearRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import StandardScaler
from sklearn import metrics
Data preparation:
# Load the data, drop the identifier column, remove the rare
# "Other" gender category, and discard rows with missing values.
df = pd.read_csv('data.csv')
del df['id']
df = df[df['gender'] != "Other"]
df = df.dropna()
df.info()
df
<class 'pandas.core.frame.DataFrame'>
Int64Index: 454 entries, 0 to 498
Data columns (total 11 columns):
 #   Column             Non-Null Count  Dtype
---  ------             --------------  -----
 0   gender             454 non-null    object
 1   age                454 non-null    float64
 2   hypertension       454 non-null    int64
 3   heart_disease      454 non-null    int64
 4   ever_married       454 non-null    object
 5   work_type          454 non-null    object
 6   Residence_type     454 non-null    object
 7   avg_glucose_level  454 non-null    float64
 8   bmi                454 non-null    float64
 9   smoking_status     454 non-null    object
 10  stroke             454 non-null    int64
dtypes: float64(3), int64(3), object(5)
memory usage: 42.6+ KB
 | gender | age | hypertension | heart_disease | ever_married | work_type | Residence_type | avg_glucose_level | bmi | smoking_status | stroke |
---|---|---|---|---|---|---|---|---|---|---|---|
0 | Male | 67.0 | 0 | 1 | Yes | Private | Urban | 228.69 | 36.6 | formerly smoked | 1 |
2 | Male | 80.0 | 0 | 1 | Yes | Private | Rural | 105.92 | 32.5 | never smoked | 1 |
3 | Female | 49.0 | 0 | 0 | Yes | Private | Urban | 171.23 | 34.4 | smokes | 1 |
4 | Female | 79.0 | 1 | 0 | Yes | Self-employed | Rural | 174.12 | 24.0 | never smoked | 1 |
5 | Male | 81.0 | 0 | 0 | Yes | Private | Urban | 186.21 | 29.0 | formerly smoked | 1 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
494 | Female | 55.0 | 0 | 0 | Yes | Private | Rural | 111.19 | 39.7 | formerly smoked | 0 |
495 | Female | 71.0 | 0 | 0 | Yes | Private | Urban | 93.28 | 34.7 | never smoked | 0 |
496 | Male | 5.0 | 0 | 0 | No | children | Rural | 122.19 | 35.0 | Unknown | 0 |
497 | Female | 14.0 | 0 | 0 | No | children | Rural | 129.53 | 21.3 | never smoked | 0 |
498 | Female | 15.0 | 0 | 0 | No | children | Rural | 114.53 | 29.1 | Unknown | 0 |
454 rows × 11 columns
# One-hot encode all categorical columns in a single call, dropping
# the first level of each to avoid perfectly collinear dummies.
df = pd.get_dummies(df,
                    columns=['gender', 'ever_married', 'work_type',
                             'Residence_type', 'smoking_status'],
                    drop_first=True)
df
 | age | hypertension | heart_disease | avg_glucose_level | bmi | stroke | gender_Male | ever_married_Yes | work_type_Never_worked | work_type_Private | work_type_Self-employed | work_type_children | Residence_type_Urban | smoking_status_formerly smoked | smoking_status_never smoked | smoking_status_smokes |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 67.0 | 0 | 1 | 228.69 | 36.6 | 1 | 1 | 1 | 0 | 1 | 0 | 0 | 1 | 1 | 0 | 0 |
2 | 80.0 | 0 | 1 | 105.92 | 32.5 | 1 | 1 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 |
3 | 49.0 | 0 | 0 | 171.23 | 34.4 | 1 | 0 | 1 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 1 |
4 | 79.0 | 1 | 0 | 174.12 | 24.0 | 1 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 |
5 | 81.0 | 0 | 0 | 186.21 | 29.0 | 1 | 1 | 1 | 0 | 1 | 0 | 0 | 1 | 1 | 0 | 0 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
494 | 55.0 | 0 | 0 | 111.19 | 39.7 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 |
495 | 71.0 | 0 | 0 | 93.28 | 34.7 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 1 | 0 | 1 | 0 |
496 | 5.0 | 0 | 0 | 122.19 | 35.0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 |
497 | 14.0 | 0 | 0 | 129.53 | 21.3 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 |
498 | 15.0 | 0 | 0 | 114.53 | 29.1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 |
454 rows × 16 columns
# Stratified 80/20 split keeps the stroke/no-stroke ratio the same
# in both subsets.
df_train, df_test = train_test_split(df, test_size=0.2, stratify=df['stroke'])
df_train['stroke'].value_counts(dropna=False)
0    196
1    167
Name: stroke, dtype: int64
df_test['stroke'].value_counts(dropna=False)
0    49
1    42
Name: stroke, dtype: int64
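A quick sanity check (a minimal sketch, not part of the original notebook) confirms that the stratified split preserves the class proportions; value_counts(normalize=True) reports fractions rather than counts:
# Class fractions should agree across the full data and both splits,
# up to rounding from the integer split sizes.
for name, frame in [('full', df), ('train', df_train), ('test', df_test)]:
    print(name, frame['stroke'].value_counts(normalize=True).round(3).to_dict())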
Visualization
# Scatter plots of age against glucose level and BMI.
sns.pairplot(df, x_vars=['avg_glucose_level', 'bmi'], y_vars='age',
             height=7, aspect=0.7, diag_kind=None)
[Figure: pairplot of age against avg_glucose_level and bmi]
features = {'Smoker': df['smoking_status_smokes'].values,
            'Male': df['gender_Male'].values,
            'Urban': df['Residence_type_Urban'].values}
df_vis = pd.DataFrame(features)

fig = make_subplots(rows=1, cols=3, subplot_titles=('Smoker', 'Male', 'Urban'))
L = len(df_vis)
cnames = list(df_vis.columns)
for k, name in enumerate(cnames):
    # Each dummy column is 0/1, so the sum counts the True cases.
    n_true = df_vis[name].sum()
    # Order the heights to match x: 'False' gets L - n_true, 'True' gets n_true.
    fig.add_trace(go.Bar(x=['False', 'True'], y=[L - n_true, n_true], name=name), 1, k + 1)
fig.update_layout(barmode='relative', bargap=0.05, width=700, height=400)
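As a cross-check on the bar heights (a small sketch, not in the original notebook), the same True/False counts can be read directly from pandas:
# value_counts() on each 0/1 dummy column yields exactly the two
# bar heights shown in the corresponding subplot.
for name in df_vis.columns:
    print(name, df_vis[name].value_counts().to_dict())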
Linear regression
# All columns except the target; note that building the list from a
# set does not guarantee a stable column order across runs.
X_cols = list(set(df_train.columns) - {'stroke'})
y_col = 'stroke'
X_train_lin = df_train[X_cols]
y_train_lin = df_train[y_col]
X_test_lin = df_test[X_cols]
y_test_lin = df_test[y_col]

clf_lin = LinearRegression().fit(X_train_lin, y_train_lin)
test_pred_lin = clf_lin.predict(X_test_lin)
def regression_results(y_true, y_pred):
    # Regression metrics
    explained_variance = metrics.explained_variance_score(y_true, y_pred)
    mean_absolute_error = metrics.mean_absolute_error(y_true, y_pred)
    mse = metrics.mean_squared_error(y_true, y_pred)
    mean_squared_log_error = metrics.mean_squared_log_error(y_true, y_pred)
    median_absolute_error = metrics.median_absolute_error(y_true, y_pred)  # computed but not reported below
    r2 = metrics.r2_score(y_true, y_pred)
    print('explained_variance: ', round(explained_variance, 4))
    print('mean_squared_log_error: ', round(mean_squared_log_error, 4))
    print('r2: ', round(r2, 4))
    print('MAE: ', round(mean_absolute_error, 4))
    print('MSE: ', round(mse, 4))
    print('RMSE: ', round(np.sqrt(mse), 4))
# Clip the raw regression output to [0, 1] and round it to get hard
# 0/1 class labels comparable with the other models.
test_pred_lin = np.round(np.clip(test_pred_lin, 0, 1))
regression_results(y_test_lin, test_pred_lin)
explained_variance:  0.0292
mean_squared_log_error:  0.1162
r2:  0.0272
MAE:  0.2418
MSE:  0.2418
RMSE:  0.4917
print(metrics.classification_report(y_test_lin, test_pred_lin))
              precision    recall  f1-score   support

           0       0.79      0.76      0.77        49
           1       0.73      0.76      0.74        42

    accuracy                           0.76        91
   macro avg       0.76      0.76      0.76        91
weighted avg       0.76      0.76      0.76        91
test_pred_lin
array([1., 0., 1., 0., 1., 0., 0., 0., 0., 0., 0., 1., 1., 0., 0., 1., 1., 0., 0., 1., 0., 1., 1., 1., 1., 1., 1., 0., 0., 1., 1., 0., 1., 1., 0., 1., 0., 0., 0., 0., 1., 0., 0., 1., 0., 1., 1., 0., 0., 1., 1., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 0., 0., 1., 1., 0., 0., 1., 0., 0., 0., 1., 0., 1., 0., 0., 1., 0.])
y_test_lin.values
array([1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0], dtype=int64)
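For a classification-style view of the rounded predictions, a confusion matrix is more direct than the regression metrics above; a minimal sketch using sklearn's metrics.confusion_matrix (not part of the original notebook):
# Rows are true classes (0 = no stroke, 1 = stroke), columns are predictions.
print(metrics.confusion_matrix(y_test_lin, test_pred_lin))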
Logistic regression
# Standardize the numeric features only; the dummy columns are already
# on a 0/1 scale. The scaler is fit on the training split and then
# applied to both splits to avoid test-set leakage.
num_cols = ['age', 'avg_glucose_level', 'bmi']
cat_cols = list(set(df.columns) - {'stroke', 'age', 'avg_glucose_level', 'bmi'})
scaler = StandardScaler()
scaler.fit(df_train[num_cols])
X_num_train = scaler.transform(df_train[num_cols])
X_cat_train = df_train[cat_cols].to_numpy()
X_train = np.hstack((X_num_train, X_cat_train))
y_train = df_train['stroke']
X_num_test = scaler.transform(df_test[num_cols])
X_cat_test = df_test[cat_cols].to_numpy()
X_test = np.hstack((X_num_test, X_cat_test))
y_test = df_test['stroke']
# Logistic regression on the standardized feature matrix.
clf_log = LogisticRegression()
clf_log.fit(X_train, y_train)
test_pred_log = clf_log.predict(X_test)
regression_results(y_test, test_pred_log)
explained_variance:  -0.0165
mean_squared_log_error:  0.1214
r2:  -0.017
MAE:  0.2527
MSE:  0.2527
RMSE:  0.5027
print('\nClassification Report')
print(metrics.classification_report(y_test, test_pred_log))
Classification Report
              precision    recall  f1-score   support

           0       0.77      0.76      0.76        49
           1       0.72      0.74      0.73        42

    accuracy                           0.75        91
   macro avg       0.75      0.75      0.75        91
weighted avg       0.75      0.75      0.75        91
test_pred_log
array([1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0], dtype=int64)
y_test.values
array([1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0], dtype=int64)
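Because logistic regression also produces class probabilities, a threshold-free metric such as ROC AUC usefully complements the report above (a small sketch, not in the original notebook):
# Probability of the positive class (stroke = 1) for each test row.
proba = clf_log.predict_proba(X_test)[:, 1]
print('ROC AUC:', round(metrics.roc_auc_score(y_test, proba), 4))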
Naive Bayes
# Gaussian naive Bayes fitted on the same feature matrix.
gnb = GaussianNB()
gnb = gnb.fit(X_train, y_train)
test_pred_bay = gnb.predict(X_test)
regression_results(y_test, test_pred_bay)
explained_variance:  0.2187
mean_squared_log_error:  0.1003
r2:  0.1599
MAE:  0.2088
MSE:  0.2088
RMSE:  0.4569
print('\nClassification Report')
print(metrics.classification_report(y_test, test_pred_bay))
Classification Report
              precision    recall  f1-score   support

           0       0.89      0.69      0.78        49
           1       0.72      0.90      0.80        42

    accuracy                           0.79        91
   macro avg       0.81      0.80      0.79        91
weighted avg       0.81      0.79      0.79        91
test_pred_bay
array([1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 0], dtype=int64)
y_test.values
array([1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0], dtype=int64)
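To close, the three models can be compared on the same test split with a single accuracy summary (a sketch, not part of the original notebook; the test_pred_* arrays are the ones produced above):
# Test-set accuracy of each fitted model.
for name, pred in [('linear regression (rounded)', test_pred_lin),
                   ('logistic regression', test_pred_log),
                   ('Gaussian naive Bayes', test_pred_bay)]:
    print(f'{name}: {metrics.accuracy_score(y_test, pred):.3f}')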