51 lines
1.7 KiB
Python
51 lines
1.7 KiB
Python
|
import os
|
||
|
import numpy as np
|
||
|
import pandas as pd
|
||
|
import wget
|
||
|
from sklearn.preprocessing import MinMaxScaler
|
||
|
from sklearn.model_selection import train_test_split
|
||
|
|
||
|
def downloadCSV(
        url='https://git.wmi.amu.edu.pl/s434766/ium_434766/raw/branch/master/healthcare-dataset-stroke-data.csv',
        out='healthcare-dataset-stroke-data.csv'):
    """Download the stroke dataset CSV and store it at *out*.

    Args:
        url: Location of the CSV file to fetch.
        out: Path the downloaded file should end up at. An existing file at
            this path is replaced by the fresh download.

    The progress bar is disabled (``bar=None``) so nothing is printed while
    downloading.
    """
    saved_as = wget.download(url, out=out, bar=None)
    # wget does not overwrite an existing file: it stores the download under
    # a " (N)"-suffixed name instead and returns that name. Without the move
    # below, a second run would leave the old CSV in place and later steps
    # would silently read stale data.
    if saved_as != out and not os.path.isdir(out):
        os.replace(saved_as, out)
|
||
|
|
||
|
def dropNaN(path='healthcare-dataset-stroke-data.csv'):
    """Load a CSV file and discard every row that contains a missing value.

    Args:
        path: CSV file to read. Defaults to the dataset file name used by
            the rest of this script.

    Returns:
        A pandas DataFrame holding only the complete rows. The original row
        labels are kept, so the index may contain gaps.
    """
    return pd.read_csv(path).dropna()
|
||
|
|
||
|
|
||
|
def NormalizeData(data):
    """Return a normalised copy of the dataset; the input frame is not modified.

    Steps applied:
      * ``age`` is cast to int64 (a fractional age such as 1.32 becomes 1).
      * Every text column is lower-cased.
      * Every floating-point column is min-max scaled to the range [0, 1].
      * ``ever_married`` is mapped from "yes"/"no" to 1/0; any other value
        becomes NaN.
      * Spaces in ``smoking_status`` and hyphens in ``work_type`` are
        replaced by underscores.

    Args:
        data: DataFrame that must contain an ``age`` column without missing
            values. The other named columns are processed only if present.

    Returns:
        A new DataFrame with the transformations above applied.
    """
    dtypes = pd.api.types

    # astype returns a new frame, so the assignments below never touch the
    # caller's DataFrame.
    data = data.astype({"age": np.int64})

    for col in data.columns:
        # Text may be stored as plain ``object`` or as a dedicated pandas
        # string dtype depending on the pandas version; comparing the dtype
        # with ``object`` alone would skip the latter and leave the
        # yes/no mapping below with nothing to match.
        if dtypes.is_object_dtype(data[col]) or dtypes.is_string_dtype(data[col]):
            data[col] = data[col].str.lower()

        if dtypes.is_float_dtype(data[col]):
            # The scaler expects a 2-D (n_samples, 1) array.
            column_2d = data[col].to_numpy(dtype=np.float64).reshape(-1, 1)
            scaler = MinMaxScaler(feature_range=(0, 1))
            # Flatten back to 1-D so the result is assigned as a plain column.
            data[col] = scaler.fit_transform(column_2d).ravel()

        if col == 'ever_married':
            data[col] = data[col].map({"yes": 1, "no": 0})

        # regex=False: the separators are literal characters, and the default
        # of ``regex`` differs between pandas versions.
        if col == 'smoking_status':
            data[col] = data[col].str.replace(" ", "_", regex=False)

        if col == 'work_type':
            data[col] = data[col].str.replace("-", "_", regex=False)

    return data
|
||
|
|
||
|
def saveToCSV(data1,data2,data3):
    """Write the three given data sets to CSV files in the working directory.

    ``data1`` goes to ``data_train.csv``, ``data2`` to ``data_test.csv`` and
    ``data3`` to ``data_val.csv``. The row index is not written.
    """
    outputs = (
        (data1, "data_train.csv"),
        (data2, "data_test.csv"),
        (data3, "data_val.csv"),
    )
    for frame, filename in outputs:
        frame.to_csv(filename, index=False)
|
||
|
|
||
|
|
||
|
|
||
|
# Pipeline: fetch the raw CSV, drop incomplete rows, normalise the columns.
downloadCSV()
data = NormalizeData(dropNaN())

# Two successive splits give a 60% / 20% / 20% train / test / validation
# partition: 20% of all rows are held out for testing first, then 25% of the
# remaining 80% (i.e. 20% of all rows) becomes the validation set.
# NOTE(review): the data is normalised before it is split, so any scaling is
# computed over rows that later land in the test/validation sets - confirm
# this is intended.
data_train, data_test = train_test_split(data, test_size=0.2, random_state=1)
data_train, data_val = train_test_split(data_train, test_size=0.25, random_state=1)

saveToCSV(data_train, data_test, data_val)
|
||
|
|
||
|
|