ium_434766/lab2.ipynb
2021-03-28 20:50:30 +02:00

137 KiB

import kaggle
import os
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split

def downloadCSV():
    """Download and unpack the stroke-prediction dataset into the current directory.

    Authenticates with the Kaggle API, then downloads and unzips the
    ``fedesoriano/stroke-prediction-dataset`` archive into ``'.'``.

    Returns:
        None. The function only has the side effect of placing the
        dataset files on disk.
    """
    kaggle.api.authenticate()
    # The CSV is intentionally not parsed here: nothing is returned, so a
    # DataFrame built at this point would simply be thrown away.
    kaggle.api.dataset_download_files(
        'fedesoriano/stroke-prediction-dataset', path='.', unzip=True
    )

def dropNaN(path='healthcare-dataset-stroke-data.csv'):
    """Load a CSV file and drop every row that contains a missing value.

    Args:
        path: Location of the CSV file to read. Defaults to the
            stroke dataset file name in the current working directory,
            so calling ``dropNaN()`` with no arguments reads that file.

    Returns:
        A DataFrame holding only the rows in which no column is NaN.
    """
    return pd.read_csv(path).dropna()

def NormalizeData(data):
    """Return a cleaned copy of the dataset; the input frame is not modified.

    The following transformations are applied:

    * ``age`` is cast to ``int64`` (fractional ages are truncated).
    * Every text column is lower-cased.
    * Every ``float64`` column is rescaled to the range [0, 1] with a
      ``MinMaxScaler`` fitted on that column.
    * ``ever_married`` is mapped from ``yes``/``no`` to ``1``/``0``; any
      other value becomes NaN.
    * Spaces in ``smoking_status`` and hyphens in ``work_type`` are
      replaced with underscores.

    Args:
        data: DataFrame that contains at least an ``age`` column.

    Returns:
        A new DataFrame with the transformations above applied.
    """
    # astype() returns a new frame, so all later assignments touch the copy.
    data = data.astype({"age": np.int64})
    for col in data.columns:
        dtype = data[col].dtype
        # Text columns are ``object`` in older pandas but may use a dedicated
        # string dtype; both must be lower-cased, otherwise the yes/no
        # mapping below would not match and would yield NaN.
        if dtype == object or pd.api.types.is_string_dtype(dtype):
            data[col] = data[col].str.lower()
        if dtype == np.float64:
            scaler = MinMaxScaler(feature_range=(0, 1))
            scaled = scaler.fit_transform(data[col].values.reshape(-1, 1))
            # The scaler returns shape (n, 1); flatten it to a plain column.
            data[col] = scaled.ravel()
        if col == 'ever_married':
            # Relies on the lower-casing above having already run.
            data[col] = data[col].map({"yes": 1, "no": 0})
        elif col == 'smoking_status':
            data[col] = data[col].str.replace(" ", "_", regex=False)
        elif col == 'work_type':
            data[col] = data[col].str.replace("-", "_", regex=False)
    return data

def saveToCSV(data1,data2,data3):
    """Write the three dataset splits to CSV files in the working directory.

    Args:
        data1: Frame written to ``data_train.csv``.
        data2: Frame written to ``data_test.csv``.
        data3: Frame written to ``data_val.csv``.

    The row index is not included in any of the files.
    """
    outputs = (
        (data1, "data_train.csv"),
        (data2, "data_test.csv"),
        (data3, "data_val.csv"),
    )
    for frame, filename in outputs:
        frame.to_csv(filename, index=False)

def describeDataset(dt, dt2, dv):
    """Print size figures for the dataset splits and summary statistics.

    The CSV file is re-read from disk, so the "whole dataset" figure and
    the ``describe`` table refer to the file contents, not to the frames
    passed in. Each reported size is ``DataFrame.size``, i.e. the number
    of cells (rows times columns), not the number of rows.

    Args:
        dt: Training split.
        dt2: Test split.
        dv: Validation split.
    """
    raw = pd.read_csv('healthcare-dataset-stroke-data.csv')
    report = (
        ("Whole dataset size: ", raw),
        ("Train dataset size: ", dt),
        ("Test dataset size: ", dt2),
        ("Validate dataset size: ", dv),
    )
    for label, frame in report:
        print(label, frame.size)
    print(raw.describe(include='all'))


# Pipeline: fetch the dataset, drop incomplete rows, then clean/scale columns.
downloadCSV()
data = dropNaN()
# NOTE(review): NormalizeData runs before the split below, so the min-max
# scaling is fitted on rows that later end up in the test and validation
# sets - confirm this is acceptable for the downstream experiments.
data = NormalizeData(data)

# First split: hold out 20% of all rows as the test set.
data_train, data_test = train_test_split(data, test_size=0.2, random_state=1)
# Second split: 25% of the remaining 80% becomes validation (0.25 * 0.8 = 0.2).
data_train, data_val = train_test_split(data_train, test_size=0.25, random_state=1) ## Split twice to get 0.6 / 0.2 / 0.2 train / test / validation
# Persist the three splits, then print their sizes and the file's summary.
saveToCSV(data_train,data_test,data_val)
describeDataset(data_train,data_test,data_val)

Whole dataset size:  61320
Train dataset size:  35340
Test dataset size:  11784
Validate dataset size:  11784
                  id  gender          age  hypertension  heart_disease  \
count    5110.000000    5110  5110.000000   5110.000000    5110.000000   
unique           NaN       3          NaN           NaN            NaN   
top              NaN  Female          NaN           NaN            NaN   
freq             NaN    2994          NaN           NaN            NaN   
mean    36517.829354     NaN    43.226614      0.097456       0.054012   
std     21161.721625     NaN    22.612647      0.296607       0.226063   
min        67.000000     NaN     0.080000      0.000000       0.000000   
25%     17741.250000     NaN    25.000000      0.000000       0.000000   
50%     36932.000000     NaN    45.000000      0.000000       0.000000   
75%     54682.000000     NaN    61.000000      0.000000       0.000000   
max     72940.000000     NaN    82.000000      1.000000       1.000000   

       ever_married work_type Residence_type  avg_glucose_level          bmi  \
count          5110      5110           5110        5110.000000  4909.000000   
unique            2         5              2                NaN          NaN   
top             Yes   Private          Urban                NaN          NaN   
freq           3353      2925           2596                NaN          NaN   
mean            NaN       NaN            NaN         106.147677    28.893237   
std             NaN       NaN            NaN          45.283560     7.854067   
min             NaN       NaN            NaN          55.120000    10.300000   
25%             NaN       NaN            NaN          77.245000    23.500000   
50%             NaN       NaN            NaN          91.885000    28.100000   
75%             NaN       NaN            NaN         114.090000    33.100000   
max             NaN       NaN            NaN         271.740000    97.600000   

       smoking_status       stroke  
count            5110  5110.000000  
unique              4          NaN  
top      never smoked          NaN  
freq             1892          NaN  
mean              NaN     0.048728  
std               NaN     0.215320  
min               NaN     0.000000  
25%               NaN     0.000000  
50%               NaN     0.000000  
75%               NaN     0.000000  
max               NaN     1.000000  
# Re-read the CSV from disk (not the cleaned `data` frame) for the charts below.
data_orginal = pd.read_csv('healthcare-dataset-stroke-data.csv')
# Bar chart of the number of rows per `work_type` value.
data_orginal['work_type'].value_counts().plot(kind="bar")
<AxesSubplot:>
# Bar chart of the number of rows per `stroke` value.
data_orginal['stroke'].value_counts().plot(kind="bar")
<AxesSubplot:>
# Bar chart of the number of rows per `heart_disease` value.
data_orginal['heart_disease'].value_counts().plot(kind="bar")
<AxesSubplot:>
# Bar chart of the number of rows per `smoking_status` value.
data_orginal['smoking_status'].value_counts().plot(kind="bar")
<AxesSubplot:>
# Bar chart of the number of rows per `Residence_type` value.
data_orginal['Residence_type'].value_counts().plot(kind="bar")
<AxesSubplot:>