Update dane.py
This commit is contained in:
parent
e28768f74c
commit
7896165633
294
dane.py
294
dane.py
@ -1,148 +1,148 @@
|
|||||||
### 1. Pobieranie zbioru danych
|
### 1. Pobieranie zbioru danych
|
||||||
import zipfile
|
import zipfile
|
||||||
with zipfile.ZipFile("personal-key-indicators-of-heart-disease.zip", 'r') as zip_ref:
|
with zipfile.ZipFile("personal-key-indicators-of-heart-disease.zip", 'r') as zip_ref:
|
||||||
zip_ref.extractall("dataset_extracted")
|
zip_ref.extractall("dataset_extracted")
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
# W pobranym zbiorze danych jest kilka podzbiorów więc celowo otwieram ten z NaN, żeby manualnie go oczyścić dla praktyki
|
# W pobranym zbiorze danych jest kilka podzbiorów więc celowo otwieram ten z NaN, żeby manualnie go oczyścić dla praktyki
|
||||||
df = pd.read_csv("dataset_extracted/2022/heart_2022_with_nans.csv")
|
df = pd.read_csv("dataset_extracted/2022/heart_2022_with_nans.csv")
|
||||||
## Przeglądanie nieoczyszczonego datasetu
|
## Przeglądanie nieoczyszczonego datasetu
|
||||||
df.info()
|
df.info()
|
||||||
df.head()
|
df.head()
|
||||||
df.describe()
|
df.describe()
|
||||||
df["HadHeartAttack"].value_counts().plot(kind="pie")
|
df["HadHeartAttack"].value_counts().plot(kind="pie")
|
||||||
df["HadHeartAttack"].value_counts()
|
df["HadHeartAttack"].value_counts()
|
||||||
|
|
||||||
## 2. Podział na podzbiory (train / dev / test - 8:1:1) i oversampling
|
## 2. Podział na podzbiory (train / dev / test - 8:1:1) i oversampling
|
||||||
from sklearn.model_selection import train_test_split
|
from sklearn.model_selection import train_test_split
|
||||||
#Funkcji z sklearn musimy użyć dwukrotnie, bo dzieli tylko na dwa podzbiory
|
#Funkcji z sklearn musimy użyć dwukrotnie, bo dzieli tylko na dwa podzbiory
|
||||||
train, test_and_valid = train_test_split(df, test_size=0.2) #0.8 train, 0.2 test&valid
|
train, test_and_valid = train_test_split(df, test_size=0.2) #0.8 train, 0.2 test&valid
|
||||||
|
|
||||||
test, valid = train_test_split(test_and_valid, test_size=0.5) #0.1 test, 0.1 valid
|
test, valid = train_test_split(test_and_valid, test_size=0.5) #0.1 test, 0.1 valid
|
||||||
train["HadHeartAttack"].value_counts()
|
train["HadHeartAttack"].value_counts()
|
||||||
def oversample(dataset):
    """Return a copy of ``dataset`` with its "Yes" heart-attack rows repeated.

    The rows whose ``HadHeartAttack`` value equals ``"Yes"`` are appended
    ``count("No") // count("Yes")`` additional times after the original rows.

    Args:
        dataset: DataFrame with a ``HadHeartAttack`` column holding the raw
            ``"Yes"`` / ``"No"`` answers. It is not modified.

    Returns:
        A new DataFrame. When at least one copy of the positive rows is
        appended the index is renumbered from 0; otherwise the result is a
        plain copy of ``dataset`` that keeps the original index.
    """
    is_positive = dataset["HadHeartAttack"] == "Yes"
    num_true = int(is_positive.sum())
    num_false = int((dataset["HadHeartAttack"] == "No").sum())

    # No positives (the floor division below would raise ZeroDivisionError)
    # or fewer negatives than positives: there is nothing to append.
    if num_true == 0 or num_false < num_true:
        return dataset.copy()

    num_oversampling_steps = num_false // num_true
    positives = dataset[is_positive]
    # One concat instead of re-copying the growing frame on every iteration.
    return pd.concat(
        [dataset] + [positives] * num_oversampling_steps,
        ignore_index=True,
    )
|
||||||
train = oversample(train)
|
train = oversample(train)
|
||||||
train["HadHeartAttack"].value_counts().plot(kind="pie")
|
train["HadHeartAttack"].value_counts().plot(kind="pie")
|
||||||
test["HadHeartAttack"].value_counts().plot(kind="pie")
|
test["HadHeartAttack"].value_counts().plot(kind="pie")
|
||||||
valid["HadHeartAttack"].value_counts().plot(kind="pie")
|
valid["HadHeartAttack"].value_counts().plot(kind="pie")
|
||||||
df["SmokerStatus"].value_counts().plot(kind="pie")
|
df["SmokerStatus"].value_counts().plot(kind="pie")
|
||||||
df["ECigaretteUsage"].value_counts().plot(kind="pie")
|
df["ECigaretteUsage"].value_counts().plot(kind="pie")
|
||||||
df["CovidPos"].value_counts().plot(kind="pie")
|
df["CovidPos"].value_counts().plot(kind="pie")
|
||||||
## Normalizacja część 1 - zamiana na kolumny liczbowe i kategoryczne
|
## Normalizacja część 1 - zamiana na kolumny liczbowe i kategoryczne
|
||||||
df["Sex"].unique()
|
df["Sex"].unique()
|
||||||
df["GeneralHealth"].unique()
|
df["GeneralHealth"].unique()
|
||||||
health_map = {
|
health_map = {
|
||||||
"Excellent": 5,
|
"Excellent": 5,
|
||||||
"Very good": 4,
|
"Very good": 4,
|
||||||
"Good": 3,
|
"Good": 3,
|
||||||
"Fair": 2,
|
"Fair": 2,
|
||||||
"Poor": 1
|
"Poor": 1
|
||||||
}
|
}
|
||||||
for col in df:
|
for col in df:
|
||||||
print(f"{col}:")
|
print(f"{col}:")
|
||||||
print(df[col].unique())
|
print(df[col].unique())
|
||||||
from collections import defaultdict
|
from collections import defaultdict
|
||||||
# Encoding shared by every plain Yes/No survey answer.
_YES_NO_CODES = {"No": 0, "Yes": 1}

# Columns that are encoded with _YES_NO_CODES.
_YES_NO_COLUMNS = (
    "PhysicalActivities",
    "HadHeartAttack",
    "HadAngina",
    "HadStroke",
    "HadAsthma",
    "HadSkinCancer",
    "HadCOPD",
    "HadDepressiveDisorder",
    "HadKidneyDisease",
    "HadArthritis",
    "DeafOrHardOfHearing",
    "BlindOrVisionDifficulty",
    "DifficultyConcentrating",
    "DifficultyWalking",
    "DifficultyDressingBathing",
    "DifficultyErrands",
    "ChestScan",
    "AlcoholDrinkers",
    "HIVTesting",
    "FluVaxLast12",
    "PneumoVaxEver",
    "HighRiskLastYear",
    "CovidPos",
)

# Multi-level answers and the integer code assigned to each of them.
_MULTI_LEVEL_CODES = {
    "RemovedTeeth": {
        "None of them": 0,
        "1 to 5": 1,
        "6 or more, but not all": 2,
        "All": 3,
    },
    "HadDiabetes": {
        "No": 0,
        "Yes, but only during pregnancy (female)": 1,
        "No, pre-diabetes or borderline diabetes": 2,
        "Yes": 3,
    },
    "SmokerStatus": {
        "Never smoked": 0,
        "Current smoker - now smokes some days": 1,
        "Former smoker": 2,
        "Current smoker - now smokes every day": 3,
    },
    "ECigaretteUsage": {
        "Never used e-cigarettes in my entire life": 0,
        "Not at all (right now)": 1,
        "Use them some days": 2,
        "Use them every day": 3,
    },
}

# Text columns that become categoricals; missing values are replaced with
# "Unknown" first because the later median-based fillna cannot be applied to
# categorical columns.
_UNKNOWN_FILLED_CATEGORY_COLUMNS = (
    "LastCheckupTime",
    "RaceEthnicityCategory",
    "AgeCategory",
)

# Columns that are cast to float.
_FLOAT_COLUMNS = (
    "PhysicalHealthDays",
    "MentalHealthDays",
    "SleepHours",
    "HeightInMeters",
    "WeightInKilograms",
    "BMI",
)


def _encode_tetanus(answer):
    """Map a tetanus answer to 1.0 ("Yes, ..."), 0.0 ("No, ...") or NaN."""
    if not isinstance(answer, str):
        return float("nan")
    if "Yes," in answer:
        return 1.0
    if "No," in answer:
        return 0.0
    return float("nan")


def normalize_dataset(dataset):
    """Convert the raw survey columns of ``dataset`` to numeric/categorical form.

    The frame is modified in place and nothing is returned. Answers that are
    missing, or that are not listed in the relevant code table, become NaN.
    The ``Sex`` column is encoded as 0.0 (Female) / 1.0 (Male) and renamed to
    ``Male``.

    Args:
        dataset: DataFrame holding all of the columns referenced below.

    Raises:
        KeyError: if one of the expected columns is absent.
    """
    # health_map is the module-level table of GeneralHealth answers.
    dataset["GeneralHealth"] = dataset["GeneralHealth"].map(health_map)

    # Text column to numeric column, then rename it after the value coded 1.
    dataset["Sex"] = dataset["Sex"].map({"Female": 0, "Male": 1}).astype(float)
    dataset.rename(columns={"Sex": "Male"}, inplace=True)

    dataset["State"] = dataset["State"].astype("category")
    for column in _UNKNOWN_FILLED_CATEGORY_COLUMNS:
        dataset[column] = dataset[column].fillna("Unknown").astype("category")

    # The converted values must be assigned back: astype returns a new Series.
    for column in _FLOAT_COLUMNS:
        dataset[column] = dataset[column].astype(float)

    for column in _YES_NO_COLUMNS:
        dataset[column] = dataset[column].map(_YES_NO_CODES)

    for column, codes in _MULTI_LEVEL_CODES.items():
        dataset[column] = dataset[column].map(codes)

    dataset["TetanusLast10Tdap"] = dataset["TetanusLast10Tdap"].apply(_encode_tetanus)
|
||||||
test.head()
|
test.head()
|
||||||
normalize_dataset(test)
|
normalize_dataset(test)
|
||||||
test.head()
|
test.head()
|
||||||
test.info()
|
test.info()
|
||||||
normalize_dataset(train)
|
normalize_dataset(train)
|
||||||
normalize_dataset(valid)
|
normalize_dataset(valid)
|
||||||
train.describe()
|
train.describe()
|
||||||
test.describe()
|
test.describe()
|
||||||
valid.describe()
|
valid.describe()
|
||||||
import seaborn as sns
|
import seaborn as sns
|
||||||
sns.set_theme()
|
sns.set_theme()
|
||||||
g = sns.catplot(
|
g = sns.catplot(
|
||||||
data=train, kind="bar",
|
data=train, kind="bar",
|
||||||
x="GeneralHealth", y="WeightInKilograms", hue="HadHeartAttack",
|
x="GeneralHealth", y="WeightInKilograms", hue="HadHeartAttack",
|
||||||
errorbar="sd", palette="dark", alpha=.6, height=6
|
errorbar="sd", palette="dark", alpha=.6, height=6
|
||||||
)
|
)
|
||||||
g.despine(left=True)
|
g.despine(left=True)
|
||||||
g.set_axis_labels("General health index", "Body mass (kg)")
|
g.set_axis_labels("General health index", "Body mass (kg)")
|
||||||
g.legend.set_title("Had heart attack")
|
g.legend.set_title("Had heart attack")
|
||||||
valid.groupby('SmokerStatus', as_index=False)['HadHeartAttack'].mean()
|
valid.groupby('SmokerStatus', as_index=False)['HadHeartAttack'].mean()
|
||||||
valid.groupby('GeneralHealth', as_index=False)['HadHeartAttack'].mean()
|
valid.groupby('GeneralHealth', as_index=False)['HadHeartAttack'].mean()
|
||||||
valid.pivot_table('HadHeartAttack',index='GeneralHealth', columns='SmokerStatus')
|
valid.pivot_table('HadHeartAttack',index='GeneralHealth', columns='SmokerStatus')
|
||||||
## Normalizacja część 2 - Skalowanie kolumn numerycznych do 0-1
|
## Normalizacja część 2 - Skalowanie kolumn numerycznych do 0-1
|
||||||
from sklearn.preprocessing import MinMaxScaler
|
from sklearn.preprocessing import MinMaxScaler
|
||||||
# One scaler shared by all splits: it is fitted on the training split and the
# same fitted ranges are then applied to the other splits.
scaler = MinMaxScaler()


def scale_float_columns(dataset, fit=True):
    """Min-max scale float columns of ``dataset`` in place with ``scaler``.

    Args:
        dataset: DataFrame to scale; its columns are overwritten in place.
        fit: When True (the default, and the original behaviour) the shared
            ``scaler`` is re-fitted on the ``float64`` columns of ``dataset``
            before transforming them. When False the already fitted scaler is
            applied to the columns it was fitted on, so the split is scaled
            with the ranges learned earlier instead of its own.

    Raises:
        RuntimeError: if ``fit`` is False and the scaler has not been fitted
            on a DataFrame yet.
    """
    if fit:
        numerical_columns = list(dataset.select_dtypes(include=['float64']).columns)
        if not numerical_columns:
            # Nothing to scale; fitting on zero features would raise.
            return
        dataset[numerical_columns] = scaler.fit_transform(dataset[numerical_columns])
        return

    if not hasattr(scaler, "feature_names_in_"):
        raise RuntimeError("scale_float_columns(fit=False) called before the scaler was fitted")
    # Reuse exactly the columns seen while fitting, so a split whose dtypes
    # differ slightly is still transformed consistently.
    numerical_columns = list(scaler.feature_names_in_)
    dataset[numerical_columns] = scaler.transform(dataset[numerical_columns])


test.head()

# Fit on the training split only and reuse those ranges for test and valid;
# re-fitting on each split would put every split on a different scale.
scale_float_columns(train)
scale_float_columns(test, fit=False)
scale_float_columns(valid, fit=False)
|
||||||
test.head()
|
test.head()
|
||||||
## 5. Czyszczenie brakujących pól
|
## 5. Czyszczenie brakujących pól
|
||||||
print(df.shape[0])
|
print(df.shape[0])
|
||||||
print(df.shape[0] - df.dropna().shape[0])
|
print(df.shape[0] - df.dropna().shape[0])
|
||||||
test.head()
|
test.head()
|
||||||
|
|
||||||
# The numeric column list is taken from the training split and reused for
# every split.
numeric_columns = train.select_dtypes(include=['number']).columns

# Fill the gaps of each numeric column with that column's own median.
# Passing the Series returned by .median() makes fillna work column by column;
# taking .iloc[0] of it would fill every column with the median of the first
# numeric column only.
for split in (test, train, valid):
    split[numeric_columns] = split[numeric_columns].fillna(split[numeric_columns].median())
|
||||||
|
|
||||||
test.head()
|
test.head()
|
||||||
test["HighRiskLastYear"].value_counts()
|
test["HighRiskLastYear"].value_counts()
|
||||||
test["HighRiskLastYear"].isna().sum()
|
test["HighRiskLastYear"].isna().sum()
|
||||||
test.info()
|
test.info()
|
||||||
train.info()
|
train.info()
|
||||||
valid.info()
|
valid.info()
|
||||||
|
|
||||||
cat_columns = test.select_dtypes(['category']).columns
|
cat_columns = test.select_dtypes(['category']).columns
|
||||||
|
|
||||||
test.to_csv("test.csv")
|
test.to_csv("test.csv")
|
||||||
train.to_csv("train.csv")
|
train.to_csv("train.csv")
|
||||||
valid.to_csv("valid.csv")
|
valid.to_csv("valid.csv")
|
Loading…
Reference in New Issue
Block a user