"""Download the healthcare stroke dataset, clean it and split it.

Running this module downloads the raw CSV, drops rows with missing values,
normalizes the columns and writes ``data_train.csv``, ``data_test.csv`` and
``data_val.csv`` (60% / 20% / 20%) into the current working directory.
"""

import os

import numpy as np
import pandas as pd
import wget
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

# Local file name of the raw dataset; written by downloadCSV, read by dropNaN.
RAW_CSV = 'healthcare-dataset-stroke-data.csv'


def downloadCSV():
    """Download the raw dataset into the working directory as ``RAW_CSV``."""
    url = 'https://git.wmi.amu.edu.pl/s434766/ium_434766/raw/branch/master/healthcare-dataset-stroke-data.csv'
    saved_path = wget.download(url, out=RAW_CSV, bar=None)
    # The wget package does not overwrite an existing target: it stores the
    # new download under a suffixed name (e.g. "name (1).csv") and returns
    # that path. Move it into place so a re-run never reads a stale copy.
    if saved_path and saved_path != RAW_CSV:
        os.replace(saved_path, RAW_CSV)


def dropNaN():
    """Load the downloaded CSV and return it without incomplete rows.

    Returns:
        A DataFrame holding only the rows that have no missing value in
        any column.
    """
    data = pd.read_csv(RAW_CSV)
    return data.dropna()


def NormalizeData(data):
    """Return a normalized copy of the dataset.

    Applied per column:
      * ``age`` is cast to ``int64``;
      * object (string) columns are lower-cased;
      * ``float64`` columns are min-max scaled into [0, 1];
      * ``ever_married`` is mapped from yes/no to 1/0;
      * spaces in ``smoking_status`` and dashes in ``work_type`` are
        replaced with underscores.

    Args:
        data: DataFrame that contains at least an ``age`` column.

    Returns:
        The transformed DataFrame; the caller's frame is not modified
        because ``astype`` produces a new object first.
    """
    # NOTE(review): the integer cast truncates any fractional ages.
    data = data.astype({"age": np.int64})

    for col in data.columns:
        if data[col].dtype == object:
            # Strings to lowercase.
            data[col] = data[col].str.lower()

        if data[col].dtype == np.float64:
            # Floats to values in [0, 1]; the scaler needs a 2-D input.
            column_2d = data[col].values.reshape(-1, 1)
            scaler = MinMaxScaler(feature_range=(0, 1))
            # Flatten the (n, 1) result back to a plain column.
            data[col] = scaler.fit_transform(column_2d).ravel()

        if col == 'ever_married':
            # yes/no to 1/0. Relies on the lowercasing above; any other
            # value becomes NaN.
            data[col] = data[col].map(dict(yes=1, no=0))

        if col == 'smoking_status':
            data[col] = data[col].str.replace(" ", "_")

        if col == 'work_type':
            data[col] = data[col].str.replace("-", "_")

    return data


def saveToCSV(data1, data2, data3):
    """Write the three splits to CSV files without the index column.

    Args:
        data1: Training split, written to ``data_train.csv``.
        data2: Test split, written to ``data_test.csv``.
        data3: Validation split, written to ``data_val.csv``.
    """
    data1.to_csv("data_train.csv", index=False)
    data2.to_csv("data_test.csv", index=False)
    data3.to_csv("data_val.csv", index=False)


# Script body: runs on execution (and on import, as in the original file).
downloadCSV()
data = dropNaN()
# NOTE(review): scaling is fitted on the full dataset before it is split, so
# the test/validation rows influence the scaler's min and max.
data = NormalizeData(data)

# Split twice to get 0.6 / 0.2 / 0.2: first hold out 20% for testing, then
# take 25% of the remaining 80% for validation.
data_train, data_test = train_test_split(data, test_size=0.2, random_state=1)
data_train, data_val = train_test_split(data_train, test_size=0.25, random_state=1)

saveToCSV(data_train, data_test, data_val)