ium_434732/skrypt_download.py

34 lines
953 B
Python
Raw Normal View History

2021-04-10 15:08:18 +02:00
import pandas as pd
from sklearn.model_selection import train_test_split
2021-05-15 15:24:37 +02:00
from sklearn.preprocessing import MinMaxScaler
import numpy as np
2021-04-10 15:08:18 +02:00
import kaggle
# Authenticate against the Kaggle API and download the dataset archive into
# the current directory; unzip=True extracts the CSV that is read below.
kaggle.api.authenticate()
kaggle.api.dataset_download_files(
    'andrewmvd/heart-failure-clinical-data', path='.', unzip=True
)

results = pd.read_csv('heart_failure_clinical_records_dataset.csv')

# Drop rows that contain NaN. DataFrame.dropna() returns a new frame instead
# of modifying the caller, so the result must be assigned back; a bare
# `results.dropna()` leaves `results` untouched.
results = results.dropna()

# Store age as an integer column. Because of this cast, "age" is not float64
# anymore and is therefore skipped by the scaling loop below.
results = results.astype({"age": np.int64})

# Rescale every float64 column to the [0, 1] range.
for col in results.columns:
    if results[col].dtype == np.float64:
        # MinMaxScaler works on 2-D input, hence the single-column reshape.
        column_values = results[col].values.reshape(-1, 1)
        scaler = MinMaxScaler(feature_range=(0, 1))
        results[col] = scaler.fit_transform(column_values)

# Split the data 60/20/20 into train/valid/test: 40% is held out first and
# then halved into the validation and test sets.
# NOTE(review): no random_state is passed, so the split differs between
# runs -- confirm whether reproducible splits are wanted.
train, test = train_test_split(results, test_size=1 - 0.6)
valid, test = train_test_split(test, test_size=0.5)

# Write each subset without the pandas index column.
train.to_csv("train.csv", index=False)
valid.to_csv("valid.csv", index=False)
test.to_csv("test.csv", index=False)