34 lines
953 B
Python
34 lines
953 B
Python
import pandas as pd
|
|
from sklearn.model_selection import train_test_split
|
|
from sklearn.preprocessing import MinMaxScaler
|
|
import numpy as np
|
|
import kaggle
|
|
|
|
# Authenticate against the Kaggle API and download the heart-failure dataset
# into the current directory; unzip=True extracts the CSV read below.
kaggle.api.authenticate()
kaggle.api.dataset_download_files(
    'andrewmvd/heart-failure-clinical-data', path='.', unzip=True
)

results = pd.read_csv('heart_failure_clinical_records_dataset.csv')

# Remove rows that contain NaN. DataFrame.dropna() returns a new frame rather
# than modifying in place, so the result has to be assigned back. This must
# happen before the integer cast below, which cannot represent NaN.
results = results.dropna()

# Store age as a 64-bit integer column.
results = results.astype({"age": np.int64})

# Rescale every float64 column to the range [0, 1].
# NOTE(review): the scaler is fitted on the full dataset before the split, so
# the min/max of the validation and test rows influence the training data.
for col in results.columns:
    if results[col].dtype == np.float64:
        # MinMaxScaler expects a 2-D array of shape (n_samples, n_features).
        column_values = results[col].values.reshape(-1, 1)
        scaler = MinMaxScaler(feature_range=(0, 1))
        # Flatten back to 1-D so a plain vector is assigned to the column.
        results[col] = scaler.fit_transform(column_values).ravel()

# Split the dataset 60% / 20% / 20% (6:2:2): first hold out 40% of the rows,
# then divide that hold-out evenly into validation and test sets.
train, holdout = train_test_split(results, test_size=1 - 0.6)
valid, test = train_test_split(holdout, test_size=0.5)

# Write each subset without the DataFrame index column.
train.to_csv("train.csv", index=False)
valid.to_csv("valid.csv", index=False)
test.to_csv("test.csv", index=False)
|