27 lines
1.0 KiB
Python
27 lines
1.0 KiB
Python
import numpy as np
|
|
import pandas as pd
|
|
from sklearn.model_selection import train_test_split
|
|
|
|
|
|
def main() -> None:
    """Prepare the heart-disease dataset and write train/test CSV splits.

    Reads ``heart_2020_cleaned.csv`` from the current working directory,
    drops rows containing missing values, encodes the categorical columns
    used as features as 0/1 integers, keeps only the columns listed in
    ``feature_names``, and writes a 90/10 train/test split to
    ``training_data.csv`` and ``test_data.csv``.
    """
    feature_names = [
        "BMI",
        "SleepTime",
        "Sex",
        "Diabetic",
        "PhysicalActivity",
        "Smoking",
        "AlcoholDrinking",
        "HeartDisease",
    ]

    dataset = pd.read_csv('heart_2020_cleaned.csv')
    dataset = dataset.dropna()

    # Substring match on purpose: any value that contains "Yes" counts as 1,
    # everything else as 0.
    dataset["Diabetic"] = dataset["Diabetic"].apply(lambda x: int("Yes" in x))

    # Plain yes/no columns: exactly "Yes" -> 1, anything else -> 0.
    # "Smoking" previously skipped the int() conversion and was therefore
    # written out as True/False instead of 1/0 like its sibling columns.
    yes_no_columns = (
        "HeartDisease",
        "PhysicalActivity",
        "Smoking",
        "AlcoholDrinking",
    )
    for column in yes_no_columns:
        dataset[column] = dataset[column].apply(lambda x: int(x == "Yes"))

    # "Female" -> 1, any other value -> 0.
    dataset["Sex"] = dataset["Sex"].apply(lambda x: int(x == "Female"))

    dataset = dataset[feature_names]

    # Fixed random_state keeps the split reproducible between runs.
    dataset_train, dataset_test = train_test_split(
        dataset, test_size=.1, train_size=.9, random_state=1
    )

    # NOTE: to_csv is called with its defaults, so the DataFrame index is
    # written as the first (unnamed) column of both output files.
    dataset_train.to_csv("training_data.csv")
    dataset_test.to_csv("test_data.csv")
|
|
|
|
|
|
# Run the pipeline only when executed as a script, so that importing this
# module does not trigger the CSV reads and writes as a side effect.
if __name__ == "__main__":
    main()
|