137 KiB
137 KiB
import kaggle
import os
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
def downloadCSV():
    """Fetch the Kaggle stroke-prediction dataset into the working directory.

    Authenticates through the ``kaggle`` API client, downloads and unzips
    ``fedesoriano/stroke-prediction-dataset`` into ``.``, then loads the
    extracted CSV once. Nothing is returned.
    """
    client = kaggle.api
    client.authenticate()
    client.dataset_download_files(
        'fedesoriano/stroke-prediction-dataset',
        path='.',
        unzip=True,
    )
    # NOTE(review): the loaded frame is never used; the read is kept so the
    # call still raises right away if the expected CSV is missing - confirm
    # whether it is needed at all.
    pd.read_csv('healthcare-dataset-stroke-data.csv')
def dropNaN():
    """Load the raw stroke CSV and return it without incomplete rows.

    Returns:
        The frame read from ``healthcare-dataset-stroke-data.csv`` in the
        working directory, with every row that contains at least one missing
        value removed. Surviving rows keep their original index labels.
    """
    raw = pd.read_csv('healthcare-dataset-stroke-data.csv')
    return raw.dropna()
def NormalizeData(data):
    """Return a cleaned copy of *data* ready for splitting.

    Steps, applied column by column in this order:

    1. ``age`` is cast to ``int64`` up front (fractional ages are truncated),
       so it is not treated as a float column below.
    2. Object (string) columns are lower-cased.
    3. ``float64`` columns are rescaled to the range [0, 1] with
       ``MinMaxScaler``.
    4. Column-specific fix-ups: ``ever_married`` yes/no becomes 1/0, spaces in
       ``smoking_status`` and hyphens in ``work_type`` become underscores.

    Args:
        data: Frame containing at least an ``age`` column.

    Returns:
        A new frame; the ``astype`` call copies, so *data* is not modified.
    """
    frame = data.astype({"age": np.int64})
    yes_no_codes = {"yes": 1, "no": 0}

    for name in frame.columns:
        # Lower-casing must happen before the yes/no mapping further down.
        if frame[name].dtype == object:
            frame[name] = frame[name].str.lower()

        if frame[name].dtype == np.float64:
            # The scaler expects a 2-D array: one column, one row per sample.
            column_2d = frame[name].values.reshape(-1, 1)
            frame[name] = MinMaxScaler(feature_range=(0, 1)).fit_transform(column_2d)

        if name == 'ever_married':
            frame[name] = frame[name].map(yes_no_codes)
        elif name == 'smoking_status':
            frame[name] = frame[name].str.replace(" ", "_")
        elif name == 'work_type':
            frame[name] = frame[name].str.replace("-", "_")

    return frame
def saveToCSV(data1, data2, data3):
    """Write the three splits to CSV files in the working directory.

    Args:
        data1: Frame written to ``data_train.csv``.
        data2: Frame written to ``data_test.csv``.
        data3: Frame written to ``data_val.csv``.

    The index is not written to any of the files.
    """
    targets = (
        ("data_train.csv", data1),
        ("data_test.csv", data2),
        ("data_val.csv", data3),
    )
    for filename, frame in targets:
        frame.to_csv(filename, index=False)
def describeDataset(dt, dt2, dv):
    """Print the size of the raw dataset and of each split, then a summary.

    Args:
        dt: Training split.
        dt2: Test split.
        dv: Validation split.

    The raw CSV is re-read from the working directory. The reported "size"
    is the ``.size`` attribute (rows x columns), not the number of rows.
    The final printout is ``describe(include='all')`` of the raw frame.
    """
    whole = pd.read_csv('healthcare-dataset-stroke-data.csv')
    report = (
        ("Whole dataset size: ", whole),
        ("Train dataset size: ", dt),
        ("Test dataset size: ", dt2),
        ("Validate dataset size: ", dv),
    )
    for label, frame in report:
        print(label, frame.size)
    print(whole.describe(include='all'))
# One-time setup: uncomment to fetch the raw CSV (authenticates against the Kaggle API).
# downloadCSV()

# Clean the raw data: drop incomplete rows, then normalize the remaining ones.
# NOTE(review): normalization runs on the full frame before it is split, so
# test/validation rows take part in the scaling - confirm this is intended.
data = NormalizeData(dropNaN())

# Split twice: 80/20 first, then 75/25 of the remainder,
# which yields 60% train, 20% validation and 20% test.
data_train, data_test = train_test_split(data, test_size=0.2, random_state=1)
data_train, data_val = train_test_split(data_train, test_size=0.25, random_state=1)

saveToCSV(data1=data_train, data2=data_test, data3=data_val)
describeDataset(dt=data_train, dt2=data_test, dv=data_val)
Whole dataset size: 61320 Train dataset size: 35340 Test dataset size: 11784 Validate dataset size: 11784 id gender age hypertension heart_disease \ count 5110.000000 5110 5110.000000 5110.000000 5110.000000 unique NaN 3 NaN NaN NaN top NaN Female NaN NaN NaN freq NaN 2994 NaN NaN NaN mean 36517.829354 NaN 43.226614 0.097456 0.054012 std 21161.721625 NaN 22.612647 0.296607 0.226063 min 67.000000 NaN 0.080000 0.000000 0.000000 25% 17741.250000 NaN 25.000000 0.000000 0.000000 50% 36932.000000 NaN 45.000000 0.000000 0.000000 75% 54682.000000 NaN 61.000000 0.000000 0.000000 max 72940.000000 NaN 82.000000 1.000000 1.000000 ever_married work_type Residence_type avg_glucose_level bmi \ count 5110 5110 5110 5110.000000 4909.000000 unique 2 5 2 NaN NaN top Yes Private Urban NaN NaN freq 3353 2925 2596 NaN NaN mean NaN NaN NaN 106.147677 28.893237 std NaN NaN NaN 45.283560 7.854067 min NaN NaN NaN 55.120000 10.300000 25% NaN NaN NaN 77.245000 23.500000 50% NaN NaN NaN 91.885000 28.100000 75% NaN NaN NaN 114.090000 33.100000 max NaN NaN NaN 271.740000 97.600000 smoking_status stroke count 5110 5110.000000 unique 4 NaN top never smoked NaN freq 1892 NaN mean NaN 0.048728 std NaN 0.215320 min NaN 0.000000 25% NaN 0.000000 50% NaN 0.000000 75% NaN 0.000000 max NaN 1.000000
# Distinct work_type labels left in the normalized frame.
data['work_type'].unique()
array(['private', 'self_employed', 'govt_job', 'children', 'never_worked'], dtype=object)
# Reload the CSV as-is so the charts show the raw, un-normalized values.
data_orginal = pd.read_csv('healthcare-dataset-stroke-data.csv')
# Bar chart: number of rows per work_type value.
data_orginal['work_type'].value_counts().plot.bar()
<AxesSubplot:>
# Bar chart: number of rows per stroke value.
data_orginal['stroke'].value_counts().plot.bar()
<AxesSubplot:>
# Bar chart: number of rows per heart_disease value.
data_orginal['heart_disease'].value_counts().plot.bar()
<AxesSubplot:>
# Bar chart: number of rows per smoking_status value.
data_orginal['smoking_status'].value_counts().plot.bar()
<AxesSubplot:>
# Bar chart: number of rows per Residence_type value.
data_orginal['Residence_type'].value_counts().plot.bar()
<AxesSubplot:>