# Download the "Personal Key Indicators of Heart Disease" dataset from Kaggle,
# drop rows with missing values, split it 80/20 into train/test subsets,
# print a few summary statistics and scale the BMI column by the maximum BMI
# observed in the training subset.
import os

import kaggle
import pandas as pd
from sklearn.model_selection import train_test_split

# Authenticate against the Kaggle API before requesting the download.
kaggle.api.authenticate()

# The archive is downloaded and unzipped into the current working directory.
c_directory = os.getcwd()
kaggle.api.dataset_download_files(
    'kamilpytlak/personal-key-indicators-of-heart-disease',
    path=c_directory,
    unzip=True,
)

# Read the CSV from the same directory the download was unpacked into.
dataset = pd.read_csv(os.path.join(c_directory, "heart_2020_cleaned.csv"))
print(dataset.describe(include='all'))

# Drop incomplete rows and show the summary again for comparison.
dataset = dataset.dropna()
print(dataset.describe(include='all'))

# 80/20 split; the fixed random_state keeps the split reproducible.
dataset_train, dataset_test = train_test_split(
    dataset, test_size=.2, train_size=.8, random_state=1
)
# Work on independent copies: the split results are row selections of
# `dataset`, and assigning a column into them below would otherwise trigger
# pandas' SettingWithCopyWarning.
dataset_train = dataset_train.copy()
dataset_test = dataset_test.copy()
print(dataset_train.describe(include='all'))

# Subset sizes (row counts).
print("Wielkości:")
print("Zbiór uczący:", dataset_train.shape[0])
print("Zbiór testowy:", dataset_test.shape[0])
print("Łącznie: ", dataset.shape[0])

# Class frequencies of GenHealth in the full and in the training data.
print(dataset["GenHealth"].value_counts())
print(dataset_train["GenHealth"].value_counts())

# Basic BMI statistics.
print("Średnia BMI -łącznie: ", dataset["BMI"].mean())
print("Odchylenie standardowe BMI - uczący:", dataset_train["BMI"].std())
print("Odchylenie standardowe BMI - łącznie:", dataset["BMI"].std())
# NOTE(review): the median is computed on the test subset although the label
# does not say so -- confirm this is intended.
print("Mediana BMI:", dataset_test["BMI"].median())

# Scale BMI by the training-set maximum. The same divisor is applied to the
# test subset, so test values may exceed 1.0.
max_bmi = dataset_train["BMI"].max()
print(max_bmi)
# Vectorized division replaces the per-row `apply(lambda ...)`.
dataset_train["BMI"] = dataset_train["BMI"] / max_bmi
dataset_test["BMI"] = dataset_test["BMI"] / max_bmi

print(dataset_train["AgeCategory"].value_counts())
print(dataset_train["BMI"])