"""Download the heart-failure clinical dataset and write train/valid/test CSVs.

Steps:
1. Authenticate against the Kaggle API and download/unzip the dataset into
   the current directory.
2. Drop rows containing missing values and cast ``age`` to ``int64``.
3. Min-max scale every remaining ``float64`` column into ``[0, 1]``.
4. Randomly split the rows 60% / 20% / 20% and write ``train.csv``,
   ``valid.csv`` and ``test.csv``.
"""
import kaggle
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

kaggle.api.authenticate()
kaggle.api.dataset_download_files(
    'andrewmvd/heart-failure-clinical-data', path='.', unzip=True
)

results = pd.read_csv('heart_failure_clinical_records_dataset.csv')

# Remove rows with missing values. ``dropna`` returns a new frame, so the
# result has to be assigned back; this also has to happen before the integer
# cast below, which would fail on NaN.
results = results.dropna()
results = results.astype({"age": np.int64})

# Scale every float64 column into [0, 1]. MinMaxScaler works per feature, so
# one call over all float columns matches scaling them one at a time.
# NOTE(review): the scaler is fitted on the full dataset before splitting, so
# validation/test statistics leak into the scaling -- confirm this is intended.
float_cols = results.select_dtypes(include=[np.float64]).columns
if len(float_cols) > 0:
    scaler = MinMaxScaler(feature_range=(0, 1))
    results[float_cols] = scaler.fit_transform(results[float_cols])

# Split 60% train / 20% validation / 20% test: hold out 40% of the rows,
# then cut that hold-out in half.
# NOTE(review): no random_state is set, so the split differs on every run.
train, test = train_test_split(results, test_size=1 - 0.6)
valid, test = train_test_split(test, test_size=0.5)

train.to_csv("train.csv", index=False)
valid.to_csv("valid.csv", index=False)
test.to_csv("test.csv", index=False)