duma_projekt/Projekt.ipynb

import pandas as pd
import numpy as np
import seaborn as sns
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LinearRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import StandardScaler
from sklearn import metrics

Data preparation:

df = pd.read_csv('data.csv')
df = df.drop(columns=['id'])          # the id column carries no signal
df = df[df['gender'] != 'Other']      # keep only Male/Female rows
df = df.dropna()                      # drop rows with missing values
df.info()
df
<class 'pandas.core.frame.DataFrame'>
Int64Index: 454 entries, 0 to 498
Data columns (total 11 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   gender             454 non-null    object 
 1   age                454 non-null    float64
 2   hypertension       454 non-null    int64  
 3   heart_disease      454 non-null    int64  
 4   ever_married       454 non-null    object 
 5   work_type          454 non-null    object 
 6   Residence_type     454 non-null    object 
 7   avg_glucose_level  454 non-null    float64
 8   bmi                454 non-null    float64
 9   smoking_status     454 non-null    object 
 10  stroke             454 non-null    int64  
dtypes: float64(3), int64(3), object(5)
memory usage: 42.6+ KB
     gender   age  hypertension  heart_disease ever_married      work_type Residence_type  avg_glucose_level   bmi   smoking_status  stroke
0      Male  67.0             0              1          Yes        Private          Urban             228.69  36.6  formerly smoked       1
2      Male  80.0             0              1          Yes        Private          Rural             105.92  32.5     never smoked       1
3    Female  49.0             0              0          Yes        Private          Urban             171.23  34.4           smokes       1
4    Female  79.0             1              0          Yes  Self-employed          Rural             174.12  24.0     never smoked       1
5      Male  81.0             0              0          Yes        Private          Urban             186.21  29.0  formerly smoked       1
..      ...   ...           ...            ...          ...            ...            ...                ...   ...              ...     ...
494  Female  55.0             0              0          Yes        Private          Rural             111.19  39.7  formerly smoked       0
495  Female  71.0             0              0          Yes        Private          Urban              93.28  34.7     never smoked       0
496    Male   5.0             0              0           No       children          Rural             122.19  35.0          Unknown       0
497  Female  14.0             0              0           No       children          Rural             129.53  21.3     never smoked       0
498  Female  15.0             0              0           No       children          Rural             114.53  29.1          Unknown       0

454 rows × 11 columns
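Before encoding, it helps to check the class balance that the later metrics implicitly depend on; a one-line check (from the split counts shown further down, this comes out to 245 non-stroke vs. 209 stroke rows):

df['stroke'].value_counts()  # expected: 0 -> 245, 1 -> 209 (sums of the split counts below)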

df = pd.get_dummies(df, columns=['gender'], drop_first=True)
df = pd.get_dummies(df, columns=['ever_married'], drop_first=True)
df = pd.get_dummies(df, columns=['work_type'], drop_first=True)
df = pd.get_dummies(df, columns=['Residence_type'], drop_first=True)
df = pd.get_dummies(df, columns=['smoking_status'], drop_first=True)

df
      age  hypertension  heart_disease  avg_glucose_level   bmi  stroke  gender_Male  ever_married_Yes  work_type_Never_worked  work_type_Private  work_type_Self-employed  work_type_children  Residence_type_Urban  smoking_status_formerly smoked  smoking_status_never smoked  smoking_status_smokes
0    67.0             0              1             228.69  36.6       1            1                 1                       0                  1                        0                   0                     1                               1                            0                      0
2    80.0             0              1             105.92  32.5       1            1                 1                       0                  1                        0                   0                     0                               0                            1                      0
3    49.0             0              0             171.23  34.4       1            0                 1                       0                  1                        0                   0                     1                               0                            0                      1
4    79.0             1              0             174.12  24.0       1            0                 1                       0                  0                        1                   0                     0                               0                            1                      0
5    81.0             0              0             186.21  29.0       1            1                 1                       0                  1                        0                   0                     1                               1                            0                      0
..    ...           ...            ...                ...   ...     ...          ...               ...                     ...                ...                      ...                 ...                   ...                             ...                          ...                    ...
494  55.0             0              0             111.19  39.7       0            0                 1                       0                  1                        0                   0                     0                               1                            0                      0
495  71.0             0              0              93.28  34.7       0            0                 1                       0                  1                        0                   0                     1                               0                            1                      0
496   5.0             0              0             122.19  35.0       0            1                 0                       0                  0                        0                   1                     0                               0                            0                      0
497  14.0             0              0             129.53  21.3       0            0                 0                       0                  0                        0                   1                     0                               0                            1                      0
498  15.0             0              0             114.53  29.1       0            0                 0                       0                  0                        0                   1                     0                               0                            0                      0

454 rows × 16 columns
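The five get_dummies calls collapse to one, since the columns argument accepts a list; an equivalent one-liner:

df = pd.get_dummies(df,
                    columns=['gender', 'ever_married', 'work_type',
                             'Residence_type', 'smoking_status'],
                    drop_first=True)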

df_train, df_test = train_test_split(df, test_size=0.2, stratify=df['stroke'])
df_train['stroke'].value_counts(dropna=False)
0    196
1    167
Name: stroke, dtype: int64
df_test['stroke'].value_counts(dropna=False)
0    49
1    42
Name: stroke, dtype: int64
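Note that train_test_split is unseeded here, so the split (and every metric below) changes between runs; a reproducible variant, with the seed value an arbitrary choice:

df_train, df_test = train_test_split(df, test_size=0.2, stratify=df['stroke'],
                                     random_state=42)  # seed chosen arbitrarily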

Visualization

sns.pairplot(df, x_vars=['avg_glucose_level', 'bmi'], y_vars='age', height=7, aspect=0.7, diag_kind=None)
[pairplot output: scatter plots of age vs. avg_glucose_level and age vs. bmi]
features = {'Smoker': df['smoking_status_smokes'].values,
            'Male': df['gender_Male'].values,
            'Urban': df['Residence_type_Urban'].values}
df_vis = pd.DataFrame(features)

fig = make_subplots(rows=1, cols=3, subplot_titles=('Smoker', 'Male', 'Urban'))
n_rows = len(df_vis)

cnames = list(df_vis.columns)
for k, name in enumerate(cnames):
    n_true = df_vis[name].sum()
    # pair each count with its matching x label: False first, then True
    fig.add_trace(go.Bar(x=['False', 'True'], y=[n_rows - n_true, n_true], name=name), 1, k + 1)
fig.update_layout(barmode='relative', bargap=0.05, width=700, height=400)
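In a notebook the returned figure renders inline; outside one, an explicit render call is needed (assuming plotly's default renderer):

fig.show()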

Linear regression

X_cols = [c for c in df_train.columns if c != 'stroke']  # deterministic column order
y_col = 'stroke'

X_train_lin = df_train[X_cols]
y_train_lin = df_train[y_col]

X_test_lin = df_test[X_cols]
y_test_lin = df_test[y_col]

clf_lin = LinearRegression().fit(X_train_lin, y_train_lin)
test_pred_lin = clf_lin.predict(X_test_lin)
def regression_results(y_true, y_pred):
    # Standard regression metrics
    explained_variance = metrics.explained_variance_score(y_true, y_pred)
    mean_absolute_error = metrics.mean_absolute_error(y_true, y_pred)
    mse = metrics.mean_squared_error(y_true, y_pred)
    mean_squared_log_error = metrics.mean_squared_log_error(y_true, y_pred)
    r2 = metrics.r2_score(y_true, y_pred)

    print('explained_variance: ', round(explained_variance, 4))
    print('mean_squared_log_error: ', round(mean_squared_log_error, 4))
    print('r2: ', round(r2, 4))
    print('MAE: ', round(mean_absolute_error, 4))
    print('MSE: ', round(mse, 4))
    print('RMSE: ', round(np.sqrt(mse), 4))

# clip the continuous predictions to [0, 1], then round to hard 0/1 labels
test_pred_lin = np.round(np.clip(test_pred_lin, 0, 1))
regression_results(y_test_lin, test_pred_lin)
explained_variance:  0.0292
mean_squared_log_error:  0.1162
r2:  0.0272
MAE:  0.2418
MSE:  0.2418
RMSE:  0.4917
print(metrics.classification_report(y_test_lin, test_pred_lin))
              precision    recall  f1-score   support

           0       0.79      0.76      0.77        49
           1       0.73      0.76      0.74        42

    accuracy                           0.76        91
   macro avg       0.76      0.76      0.76        91
weighted avg       0.76      0.76      0.76        91

test_pred_lin
array([1., 0., 1., 0., 1., 0., 0., 0., 0., 0., 0., 1., 1., 0., 0., 1., 1.,
       0., 0., 1., 0., 1., 1., 1., 1., 1., 1., 0., 0., 1., 1., 0., 1., 1.,
       0., 1., 0., 0., 0., 0., 1., 0., 0., 1., 0., 1., 1., 0., 0., 1., 1.,
       0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 0., 0., 1., 1., 0., 0., 1., 0., 0., 0., 1.,
       0., 1., 0., 0., 1., 0.])
y_test_lin.values
array([1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1,
       1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1,
       0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0,
       1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0,
       0, 1, 0], dtype=int64)
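For a classification view of these rounded predictions, a confusion matrix complements the report above; a minimal sketch using the already-imported metrics module:

# rows: true class, columns: predicted class
print(metrics.confusion_matrix(y_test_lin, test_pred_lin))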

Logistic regression

num_cols = ['age', 'avg_glucose_level', 'bmi']
cat_cols = [c for c in df.columns if c not in num_cols + ['stroke']]  # deterministic order

scaler = StandardScaler()
scaler.fit(df_train[num_cols])  # fit on the training split only, to avoid leakage

X_num_train = scaler.transform(df_train[num_cols])
X_cat_train = df_train[cat_cols].to_numpy()
X_train = np.hstack((X_num_train, X_cat_train))
y_train = df_train['stroke']

X_num_test = scaler.transform(df_test[num_cols])
X_cat_test = df_test[cat_cols].to_numpy()
X_test = np.hstack((X_num_test, X_cat_test))
y_test = df_test['stroke']
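The manual scale-and-hstack assembly above can also be written with sklearn's ColumnTransformer and Pipeline; a sketch of that alternative (not the notebook's approach), reusing the same num_cols/cat_cols:

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

pre = ColumnTransformer([('num', StandardScaler(), num_cols)],
                        remainder='passthrough')  # dummy columns pass through unscaled
pipe = Pipeline([('pre', pre), ('clf', LogisticRegression())])
pipe.fit(df_train[num_cols + cat_cols], df_train['stroke'])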
clf_log = LogisticRegression()

clf_log.fit(X_train, y_train)

test_pred_log = clf_log.predict(X_test)

regression_results(y_test, test_pred_log)
explained_variance:  -0.0165
mean_squared_log_error:  0.1214
r2:  -0.017
MAE:  0.2527
MSE:  0.2527
RMSE:  0.5027
print('\nClassification Report')
print(metrics.classification_report(y_test, test_pred_log))
Classification Report
              precision    recall  f1-score   support

           0       0.77      0.76      0.76        49
           1       0.72      0.74      0.73        42

    accuracy                           0.75        91
   macro avg       0.75      0.75      0.75        91
weighted avg       0.75      0.75      0.75        91

test_pred_log
array([1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1,
       1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1,
       0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0,
       0, 1, 0], dtype=int64)
y_test.values
array([1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1,
       1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1,
       0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0,
       1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0,
       0, 1, 0], dtype=int64)
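Because LogisticRegression exposes class probabilities via predict_proba, a threshold-independent score is easy to add; a minimal sketch:

proba_log = clf_log.predict_proba(X_test)[:, 1]  # predicted P(stroke = 1)
print('ROC AUC:', round(metrics.roc_auc_score(y_test, proba_log), 4))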

Naive Bayes

gnb = GaussianNB()

# GaussianNB assumes per-feature Gaussian likelihoods, a rough fit for the 0/1 dummy columns
gnb = gnb.fit(X_train, y_train)

test_pred_bay = gnb.predict(X_test)

regression_results(y_test, test_pred_bay)
explained_variance:  0.2187
mean_squared_log_error:  0.1003
r2:  0.1599
MAE:  0.2088
MSE:  0.2088
RMSE:  0.4569
print('\nClassification Report')
print(metrics.classification_report(y_test, test_pred_bay))
Classification Report
              precision    recall  f1-score   support

           0       0.89      0.69      0.78        49
           1       0.72      0.90      0.80        42

    accuracy                           0.79        91
   macro avg       0.81      0.80      0.79        91
weighted avg       0.81      0.79      0.79        91

test_pred_bay
array([1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1,
       0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1,
       0, 1, 0], dtype=int64)
y_test.values
array([1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1,
       1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1,
       0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0,
       1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0,
       0, 1, 0], dtype=int64)
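A single 80/20 split leaves the model ranking sensitive to the draw, so a cross-validated comparison is a natural follow-up. A minimal sketch for the two classifiers (LinearRegression is omitted because its outputs need the clip-and-round step; reusing X_train/X_test, which were scaled with a scaler fit on the training split only, is a slight simplification here):

from sklearn.model_selection import cross_val_score

X_all = np.vstack((X_train, X_test))
y_all = np.concatenate((y_train, y_test))

for name, model in [('logistic regression', LogisticRegression()),
                    ('naive Bayes', GaussianNB())]:
    scores = cross_val_score(model, X_all, y_all, cv=5, scoring='accuracy')
    print(name, 'accuracy: %.4f +/- %.4f' % (scores.mean(), scores.std()))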