Compare commits

4 Commits

| Author | SHA1 | Date |
|---|---|---|
| | 913fceecb1 | |
| | 7c37dc816f | |
| | 7896165633 | |
| | e28768f74c | |
dane.py (294 changed lines)
@@ -1,148 +1,148 @@
### 1. Downloading the dataset

import zipfile

with zipfile.ZipFile("personal-key-indicators-of-heart-disease.zip", 'r') as zip_ref:
    zip_ref.extractall("dataset_extracted")

import pandas as pd

# The downloaded dataset contains several subsets, so I deliberately open the one with NaNs in order to clean it by hand for practice
df = pd.read_csv("dataset_extracted/2022/heart_2022_with_nans.csv")

## Exploring the raw dataset

df.info()

df.head()

df.describe()

df["HadHeartAttack"].value_counts().plot(kind="pie")

df["HadHeartAttack"].value_counts()

## 2. Splitting into subsets (train / dev / test, 8:1:1) and oversampling

from sklearn.model_selection import train_test_split

# We have to call the sklearn function twice, because it only splits into two subsets
train, test_and_valid = train_test_split(df, test_size=0.2) # 0.8 train, 0.2 test&valid

test, valid = train_test_split(test_and_valid, test_size=0.5) # 0.1 test, 0.1 valid

train["HadHeartAttack"].value_counts()
def oversample(dataset):
    num_true = len(dataset[dataset["HadHeartAttack"]=="Yes"])
    num_false = len(dataset[dataset["HadHeartAttack"]=="No"])
    num_oversampling_steps = num_false//num_true
    oversampled = dataset.copy()
    for x in range(num_oversampling_steps):
        oversampled = pd.concat([oversampled, dataset[dataset["HadHeartAttack"]=="Yes"]], ignore_index=True)
    return oversampled

train = oversample(train)
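The loop above appends the minority class num_false//num_true times, which only approximately balances the classes. An equivalent sketch (an editor's illustration, not the committed code) that balances them exactly by sampling with replacement:

def oversample_exact(dataset):
    # Draw minority rows with replacement until both classes have equal counts
    minority = dataset[dataset["HadHeartAttack"] == "Yes"]
    majority = dataset[dataset["HadHeartAttack"] == "No"]
    upsampled = minority.sample(n=len(majority), replace=True, random_state=42)
    return pd.concat([majority, upsampled], ignore_index=True)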
train["HadHeartAttack"].value_counts().plot(kind="pie")

test["HadHeartAttack"].value_counts().plot(kind="pie")

valid["HadHeartAttack"].value_counts().plot(kind="pie")

df["SmokerStatus"].value_counts().plot(kind="pie")

df["ECigaretteUsage"].value_counts().plot(kind="pie")

df["CovidPos"].value_counts().plot(kind="pie")

## Normalization part 1 - converting to numeric and categorical columns

df["Sex"].unique()

df["GeneralHealth"].unique()

health_map = {
    "Excellent": 5,
    "Very good": 4,
    "Good": 3,
    "Fair": 2,
    "Poor": 1
}
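For reference, `Series.map` with a plain dict already yields NaN for values missing from the dict, so the `defaultdict` wrapper used in `normalize_dataset` below is mainly defensive. A quick illustration with hypothetical values:

s = pd.Series(["Good", "Excellent", "not-a-category"])
print(s.map(health_map))  # -> 3.0, 5.0, NaN: unmapped values become NaN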
for col in df:
    print(f"{col}:")
    print(df[col].unique())

from collections import defaultdict

def normalize_dataset(dataset):
    dataset["GeneralHealth"] = dataset["GeneralHealth"].map(defaultdict(lambda: float('NaN'), health_map), na_action='ignore')
    dataset["Sex"] = dataset["Sex"].map({"Female":0,"Male":1}).astype(float) # Converting text columns to numeric ones
    dataset.rename(columns={"Sex":"Male"}, inplace=True)
    dataset["State"] = dataset["State"].astype('category')
    dataset["PhysicalHealthDays"] = dataset["PhysicalHealthDays"].astype(float)
    dataset["MentalHealthDays"] = dataset["MentalHealthDays"].astype(float)
    dataset["LastCheckupTime"] = dataset["LastCheckupTime"].fillna("Unknown").astype('category') # Later I fill NaNs with the median, but that does not work on categorical columns, so I do this before the conversion
    dataset["PhysicalActivities"] = dataset["PhysicalActivities"].map({"No":0,"Yes":1})
    dataset["SleepHours"] = dataset["SleepHours"].astype(float)
    dataset["RemovedTeeth"] = dataset["RemovedTeeth"].map(defaultdict(lambda: float('NaN'), {"None of them":0,"1 to 5":1, "6 or more, but not all":2, "All":3}), na_action='ignore')
    dataset["HadHeartAttack"] = dataset["HadHeartAttack"].map({"No":0,"Yes":1})
    dataset["HadAngina"] = dataset["HadAngina"].map({"No":0,"Yes":1})
    dataset["HadStroke"] = dataset["HadStroke"].map({"No":0,"Yes":1})
    dataset["HadAsthma"] = dataset["HadAsthma"].map({"No":0,"Yes":1})
    dataset["HadSkinCancer"] = dataset["HadSkinCancer"].map({"No":0,"Yes":1})
    dataset["HadCOPD"] = dataset["HadCOPD"].map({"No":0,"Yes":1})
    dataset["HadDepressiveDisorder"] = dataset["HadDepressiveDisorder"].map({"No":0,"Yes":1})
    dataset["HadKidneyDisease"] = dataset["HadKidneyDisease"].map({"No":0,"Yes":1})
    dataset["HadArthritis"] = dataset["HadArthritis"].map({"No":0,"Yes":1})
    dataset["HadDiabetes"] = dataset["HadDiabetes"].map({"No":0,"Yes, but only during pregnancy (female)":1,"No, pre-diabetes or borderline diabetes":2,"Yes":3})

    dataset["DeafOrHardOfHearing"] = dataset["DeafOrHardOfHearing"].map({"No":0,"Yes":1})
    dataset["BlindOrVisionDifficulty"] = dataset["BlindOrVisionDifficulty"].map({"No":0,"Yes":1})
    dataset["DifficultyConcentrating"] = dataset["DifficultyConcentrating"].map({"No":0,"Yes":1})
    dataset["DifficultyWalking"] = dataset["DifficultyWalking"].map({"No":0,"Yes":1})
    dataset["DifficultyDressingBathing"] = dataset["DifficultyDressingBathing"].map({"No":0,"Yes":1})
    dataset["DifficultyErrands"] = dataset["DifficultyErrands"].map({"No":0,"Yes":1})
    dataset["SmokerStatus"] = dataset["SmokerStatus"].map({"Never smoked":0,"Current smoker - now smokes some days":1,"Former smoker":2,"Current smoker - now smokes every day":3})
    dataset["ECigaretteUsage"] = dataset["ECigaretteUsage"].map({"Never used e-cigarettes in my entire life":0,"Not at all (right now)":1,"Use them some days":2,"Use them every day":3})
    dataset["ChestScan"] = dataset["ChestScan"].map({"No":0,"Yes":1})
    dataset["RaceEthnicityCategory"] = dataset["RaceEthnicityCategory"].fillna("Unknown").astype('category')
    dataset["AgeCategory"] = dataset["AgeCategory"].fillna("Unknown").astype('category')
    dataset["HeightInMeters"] = dataset["HeightInMeters"].astype(float)
    dataset["WeightInKilograms"] = dataset["WeightInKilograms"].astype(float)
    dataset["BMI"] = dataset["BMI"].astype(float)
    dataset["AlcoholDrinkers"] = dataset["AlcoholDrinkers"].map({"No":0,"Yes":1})
    dataset["HIVTesting"] = dataset["HIVTesting"].map({"No":0,"Yes":1})
    dataset["FluVaxLast12"] = dataset["FluVaxLast12"].map({"No":0,"Yes":1})
    dataset["PneumoVaxEver"] = dataset["PneumoVaxEver"].map({"No":0,"Yes":1})
    dataset["TetanusLast10Tdap"] = dataset["TetanusLast10Tdap"].apply(lambda x: float('NaN') if type(x) != str else 1.0 if 'Yes,' in x else 0.0 if 'No,' in x else float('NaN')) # Any 'Yes, ...' answer -> 1.0, any 'No, ...' answer -> 0.0, anything else -> NaN
    dataset["HighRiskLastYear"] = dataset["HighRiskLastYear"].map({"No":0,"Yes":1})
    dataset["CovidPos"] = dataset["CovidPos"].map({"No":0,"Yes":1})
test.head()

normalize_dataset(test)

test.head()

test.info()

normalize_dataset(train)

normalize_dataset(valid)

train.describe()

test.describe()

valid.describe()

import seaborn as sns

sns.set_theme()

g = sns.catplot(
    data=train, kind="bar",
    x="GeneralHealth", y="WeightInKilograms", hue="HadHeartAttack",
    errorbar="sd", palette="dark", alpha=.6, height=6
)
g.despine(left=True)
g.set_axis_labels("General health index", "Body mass (kg)")
g.legend.set_title("Had heart attack")

valid.groupby('SmokerStatus', as_index=False)['HadHeartAttack'].mean()

valid.groupby('GeneralHealth', as_index=False)['HadHeartAttack'].mean()

valid.pivot_table('HadHeartAttack', index='GeneralHealth', columns='SmokerStatus')
## Normalization part 2 - scaling numeric columns to the 0-1 range

from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()

def scale_float_columns(dataset):
    numerical_columns = list(dataset.select_dtypes(include=['float64']).columns)
    dataset[numerical_columns] = scaler.fit_transform(dataset[numerical_columns])
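Note that `fit_transform` refits the scaler on whichever subset it is given, so train, test, and valid are each scaled by their own min/max. The conventional pattern (a suggested sketch, not what these commits do) fits on the training set only and reuses those bounds:

# Fit the scaler on train, then apply the same bounds to the other splits
numerical_columns = list(train.select_dtypes(include=['float64']).columns)
train[numerical_columns] = scaler.fit_transform(train[numerical_columns])
test[numerical_columns] = scaler.transform(test[numerical_columns])
valid[numerical_columns] = scaler.transform(valid[numerical_columns])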
test.head()

scale_float_columns(test)

scale_float_columns(train)

scale_float_columns(valid)

test.head()

## 5. Cleaning up missing fields

print(df.shape[0])

print(df.shape[0] - df.dropna().shape[0])

test.head()

numeric_columns = train.select_dtypes(include=['number']).columns
test[numeric_columns] = test[numeric_columns].fillna(test[numeric_columns].median().iloc[0])
train[numeric_columns] = train[numeric_columns].fillna(train[numeric_columns].median().iloc[0])
valid[numeric_columns] = valid[numeric_columns].fillna(valid[numeric_columns].median().iloc[0])
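Worth flagging: `.median().iloc[0]` collapses the per-column median Series to the single median of the first numeric column, so every column's NaNs get the same fill value. Passing the Series itself lets `fillna` align on column labels and fill each column with its own median (a suggested refinement, not part of these commits):

# fillna with a Series fills each column using the median matched by column name
test[numeric_columns] = test[numeric_columns].fillna(test[numeric_columns].median())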
test.head()

test["HighRiskLastYear"].value_counts()

test["HighRiskLastYear"].isna().sum()

test.info()

train.info()

valid.info()

cat_columns = test.select_dtypes(['category']).columns

test.to_csv("test.csv")

train.to_csv("train.csv")

valid.to_csv("valid.csv")

Binary file not shown.
environment.yml (new file, 236 lines)
@@ -0,0 +1,236 @@
name: ML
channels:
  - defaults
  - conda-forge
dependencies:
  - _tflow_select=2.3.0=mkl
  - absl-py=2.1.0=py39haa95532_0
  - aiohttp=3.9.5=py39h2bbff1b_0
  - aiosignal=1.2.0=pyhd3eb1b0_0
  - anyio=4.2.0=py39haa95532_0
  - argon2-cffi=21.3.0=pyhd3eb1b0_0
  - argon2-cffi-bindings=21.2.0=py39h2bbff1b_0
  - asttokens=2.0.5=pyhd3eb1b0_0
  - astunparse=1.6.3=py_0
  - async-lru=2.0.4=py39haa95532_0
  - async-timeout=4.0.3=py39haa95532_0
  - attrs=23.1.0=py39haa95532_0
  - babel=2.11.0=py39haa95532_0
  - backcall=0.2.0=pyhd3eb1b0_0
  - beautifulsoup4=4.12.2=py39haa95532_0
  - blas=1.0=mkl
  - bleach=4.1.0=pyhd3eb1b0_0
  - blinker=1.6.2=py39haa95532_0
  - bottleneck=1.3.7=py39h9128911_0
  - brotli=1.0.9=h2bbff1b_8
  - brotli-bin=1.0.9=h2bbff1b_8
  - brotli-python=1.0.9=py39hd77b12b_8
  - bzip2=1.0.8=h2bbff1b_6
  - ca-certificates=2024.3.11=haa95532_0
  - cachetools=5.3.3=py39haa95532_0
  - certifi=2024.2.2=py39haa95532_0
  - cffi=1.16.0=py39h2bbff1b_1
  - charset-normalizer=2.0.4=pyhd3eb1b0_0
  - click=8.1.7=py39haa95532_0
  - colorama=0.4.6=py39haa95532_0
  - comm=0.2.1=py39haa95532_0
  - contourpy=1.2.0=py39h59b6b97_0
  - cryptography=41.0.3=py39h3438e0d_0
  - cycler=0.11.0=pyhd3eb1b0_0
  - debugpy=1.6.7=py39hd77b12b_0
  - decorator=5.1.1=pyhd3eb1b0_0
  - defusedxml=0.7.1=pyhd3eb1b0_0
  - exceptiongroup=1.2.0=py39haa95532_0
  - executing=0.8.3=pyhd3eb1b0_0
  - flatbuffers=2.0.0=h6c2663c_0
  - fonttools=4.51.0=py39h2bbff1b_0
  - freetype=2.12.1=ha860e81_0
  - frozenlist=1.4.0=py39h2bbff1b_0
  - gast=0.4.0=pyhd3eb1b0_0
  - giflib=5.2.1=h8cc25b3_3
  - glib=2.78.4=hd77b12b_0
  - glib-tools=2.78.4=hd77b12b_0
  - google-auth=2.29.0=py39haa95532_0
  - google-auth-oauthlib=0.4.4=pyhd3eb1b0_0
  - google-pasta=0.2.0=pyhd3eb1b0_0
  - grpcio=1.42.0=py39hc60d5dd_0
  - gst-plugins-base=1.18.5=h9e645db_0
  - gstreamer=1.18.5=hd78058f_0
  - h5py=3.11.0=py39hed405ee_0
  - hdf5=1.12.1=h51c971a_3
  - icc_rt=2022.1.0=h6049295_2
  - icu=58.2=ha925a31_3
  - idna=3.7=py39haa95532_0
  - importlib-metadata=7.0.1=py39haa95532_0
  - importlib_metadata=7.0.1=hd3eb1b0_0
  - importlib_resources=6.1.1=py39haa95532_1
  - intel-openmp=2023.1.0=h59b6b97_46320
  - ipykernel=6.28.0=py39haa95532_0
  - ipython=8.15.0=py39haa95532_0
  - ipywidgets=8.1.2=py39haa95532_0
  - jedi=0.18.1=py39haa95532_1
  - jinja2=3.1.3=py39haa95532_0
  - joblib=1.4.0=py39haa95532_0
  - jpeg=9e=h2bbff1b_1
  - json5=0.9.6=pyhd3eb1b0_0
  - jsonschema=4.19.2=py39haa95532_0
  - jsonschema-specifications=2023.7.1=py39haa95532_0
  - jupyter=1.0.0=py39haa95532_9
  - jupyter-lsp=2.2.0=py39haa95532_0
  - jupyter_client=8.6.0=py39haa95532_0
  - jupyter_console=6.6.3=py39haa95532_0
  - jupyter_core=5.5.0=py39haa95532_0
  - jupyter_events=0.8.0=py39haa95532_0
  - jupyter_server=2.10.0=py39haa95532_0
  - jupyter_server_terminals=0.4.4=py39haa95532_1
  - jupyterlab=4.0.11=py39haa95532_0
  - jupyterlab_pygments=0.1.2=py_0
  - jupyterlab_server=2.25.1=py39haa95532_0
  - jupyterlab_widgets=3.0.10=py39haa95532_0
  - keras=2.10.0=py39haa95532_0
  - keras-preprocessing=1.1.2=pyhd3eb1b0_0
  - kiwisolver=1.4.4=py39hd77b12b_0
  - krb5=1.19.4=h5b6d351_0
  - lcms2=2.12=h83e58a3_0
  - lerc=3.0=hd77b12b_0
  - libbrotlicommon=1.0.9=h2bbff1b_8
  - libbrotlidec=1.0.9=h2bbff1b_8
  - libbrotlienc=1.0.9=h2bbff1b_8
  - libclang=14.0.6=default_hb5a9fac_1
  - libclang13=14.0.6=default_h8e68704_1
  - libcurl=8.7.1=h86230a5_0
  - libdeflate=1.17=h2bbff1b_1
  - libffi=3.4.4=hd77b12b_1
  - libglib=2.78.4=ha17d25a_0
  - libiconv=1.16=h2bbff1b_3
  - libogg=1.3.5=h2bbff1b_1
  - libpng=1.6.39=h8cc25b3_0
  - libprotobuf=3.20.3=h23ce68f_0
  - libsodium=1.0.18=h62dcd97_0
  - libssh2=1.10.0=hcd4344a_2
  - libtiff=4.5.1=hd77b12b_0
  - libvorbis=1.3.7=he774522_0
  - libwebp-base=1.3.2=h2bbff1b_0
  - lz4-c=1.9.4=h2bbff1b_1
  - markdown=3.4.1=py39haa95532_0
  - markupsafe=2.1.3=py39h2bbff1b_0
  - matplotlib-base=3.8.4=py39h4ed8f06_0
  - matplotlib-inline=0.1.6=py39haa95532_0
  - mistune=2.0.4=py39haa95532_0
  - mkl=2023.1.0=h6b88ed4_46358
  - mkl-service=2.4.0=py39h2bbff1b_1
  - mkl_fft=1.3.8=py39h2bbff1b_0
  - mkl_random=1.2.4=py39h59b6b97_0
  - multidict=6.0.4=py39h2bbff1b_0
  - nbclient=0.8.0=py39haa95532_0
  - nbconvert=7.10.0=py39haa95532_0
  - nbformat=5.9.2=py39haa95532_0
  - nest-asyncio=1.6.0=py39haa95532_0
  - notebook=7.0.8=py39haa95532_0
  - notebook-shim=0.2.3=py39haa95532_0
  - numexpr=2.8.7=py39h2cd9be0_0
  - numpy=1.26.4=py39h055cbcc_0
  - numpy-base=1.26.4=py39h65a83cf_0
  - oauthlib=3.2.2=py39haa95532_0
  - openjpeg=2.4.0=h4fc8c34_0
  - openssl=1.1.1w=h2bbff1b_0
  - opt_einsum=3.3.0=pyhd3eb1b0_1
  - overrides=7.4.0=py39haa95532_0
  - packaging=23.2=py39haa95532_0
  - pandas=2.2.1=py39h5da7b33_0
  - pandocfilters=1.5.0=pyhd3eb1b0_0
  - parso=0.8.3=pyhd3eb1b0_0
  - pcre2=10.42=h0ff8eda_1
  - pickleshare=0.7.5=pyhd3eb1b0_1003
  - pillow=10.3.0=py39h2bbff1b_0
  - pip=24.0=py39haa95532_0
  - platformdirs=3.10.0=py39haa95532_0
  - ply=3.11=py39haa95532_0
  - prometheus_client=0.14.1=py39haa95532_0
  - prompt-toolkit=3.0.43=py39haa95532_0
  - prompt_toolkit=3.0.43=hd3eb1b0_0
  - protobuf=3.20.3=py39hd77b12b_0
  - psutil=5.9.0=py39h2bbff1b_0
  - pure_eval=0.2.2=pyhd3eb1b0_0
  - pyasn1=0.4.8=pyhd3eb1b0_0
  - pyasn1-modules=0.2.8=py_0
  - pybind11-abi=5=hd3eb1b0_0
  - pycparser=2.21=pyhd3eb1b0_0
  - pygments=2.15.1=py39haa95532_1
  - pyjwt=2.8.0=py39haa95532_0
  - pyopenssl=23.2.0=py39haa95532_0
  - pyparsing=3.0.9=py39haa95532_0
  - pyqt=5.15.10=py39hd77b12b_0
  - pyqt5-sip=12.13.0=py39h2bbff1b_0
  - pysocks=1.7.1=py39haa95532_0
  - python=3.9.18=h6244533_0
  - python-dateutil=2.9.0post0=py39haa95532_0
  - python-fastjsonschema=2.16.2=py39haa95532_0
  - python-flatbuffers=2.0=pyhd3eb1b0_0
  - python-json-logger=2.0.7=py39haa95532_0
  - python-tzdata=2023.3=pyhd3eb1b0_0
  - pytz=2024.1=py39haa95532_0
  - pywin32=305=py39h2bbff1b_0
  - pywinpty=2.0.10=py39h5da7b33_0
  - pyyaml=6.0.1=py39h2bbff1b_0
  - pyzmq=25.1.2=py39hd77b12b_0
  - qt-main=5.15.2=he8e5bd7_8
  - qtconsole=5.5.1=py39haa95532_0
  - qtpy=2.4.1=py39haa95532_0
  - referencing=0.30.2=py39haa95532_0
  - requests=2.31.0=py39haa95532_1
  - requests-oauthlib=1.3.0=py_0
  - rfc3339-validator=0.1.4=py39haa95532_0
  - rfc3986-validator=0.1.1=py39haa95532_0
  - rpds-py=0.10.6=py39h062c2fa_0
  - rsa=4.7.2=pyhd3eb1b0_1
  - scikit-learn=1.4.2=py39h4ed8f06_1
  - scipy=1.13.0=py39h8640f81_0
  - seaborn=0.12.2=py39haa95532_0
  - send2trash=1.8.2=py39haa95532_0
  - setuptools=69.5.1=py39haa95532_0
  - sip=6.7.12=py39hd77b12b_0
  - six=1.16.0=pyhd3eb1b0_1
  - snappy=1.1.10=h6c2663c_1
  - sniffio=1.3.0=py39haa95532_0
  - soupsieve=2.5=py39haa95532_0
  - sqlite=3.45.3=h2bbff1b_0
  - stack_data=0.2.0=pyhd3eb1b0_0
  - tbb=2021.8.0=h59b6b97_0
  - tensorboard=2.10.0=py39haa95532_0
  - tensorboard-data-server=0.6.1=py39haa95532_0
  - tensorboard-plugin-wit=1.8.1=py39haa95532_0
  - tensorflow=2.10.0=mkl_py39ha510bab_0
  - tensorflow-base=2.10.0=mkl_py39h6a7f48e_0
  - tensorflow-estimator=2.10.0=py39haa95532_0
  - termcolor=2.1.0=py39haa95532_0
  - terminado=0.17.1=py39haa95532_0
  - threadpoolctl=2.2.0=pyh0d69192_0
  - tinycss2=1.2.1=py39haa95532_0
  - tomli=2.0.1=py39haa95532_0
  - tornado=6.3.3=py39h2bbff1b_0
  - traitlets=5.7.1=py39haa95532_0
  - typing-extensions=4.11.0=py39haa95532_0
  - typing_extensions=4.11.0=py39haa95532_0
  - tzdata=2024a=h04d1e81_0
  - unicodedata2=15.1.0=py39h2bbff1b_0
  - urllib3=2.2.1=py39haa95532_0
  - vc=14.2=h2eaa2aa_1
  - vs2015_runtime=14.29.30133=h43f2093_3
  - wcwidth=0.2.5=pyhd3eb1b0_0
  - webencodings=0.5.1=py39haa95532_1
  - websocket-client=1.8.0=py39haa95532_0
  - werkzeug=2.3.8=py39haa95532_0
  - wheel=0.43.0=py39haa95532_0
  - widgetsnbextension=4.0.10=py39haa95532_0
  - win_inet_pton=1.1.0=py39haa95532_0
  - winpty=0.4.3=4
  - wrapt=1.14.1=py39h2bbff1b_0
  - xz=5.4.6=h8cc25b3_1
  - yaml=0.2.5=he774522_0
  - yarl=1.9.3=py39h2bbff1b_0
  - zeromq=4.3.5=hd77b12b_0
  - zipp=3.17.0=py39haa95532_0
  - zlib=1.2.13=h8cc25b3_1
  - zstd=1.5.5=hd43e919_2
prefix: C:\Users\Adrian\miniconda3\envs\ML
evaluate.py (new file, 81 lines)
@@ -0,0 +1,81 @@
import pandas as pd

valid = pd.read_csv("valid.csv")

x_columns = ['Male', 'GeneralHealth', 'PhysicalHealthDays', 'MentalHealthDays',
             'PhysicalActivities', 'SleepHours', 'RemovedTeeth',
             'HadAngina', 'HadStroke', 'HadAsthma', 'HadSkinCancer', 'HadCOPD',
             'HadDepressiveDisorder', 'HadKidneyDisease', 'HadArthritis',
             'HadDiabetes', 'DeafOrHardOfHearing', 'BlindOrVisionDifficulty',
             'DifficultyConcentrating', 'DifficultyWalking',
             'DifficultyDressingBathing', 'DifficultyErrands', 'SmokerStatus',
             'ECigaretteUsage', 'ChestScan', 'HeightInMeters', 'WeightInKilograms',
             'BMI', 'AlcoholDrinkers', 'HIVTesting', 'FluVaxLast12', 'PneumoVaxEver',
             'TetanusLast10Tdap', 'HighRiskLastYear', 'CovidPos']
y_column = 'HadHeartAttack'

valid_x = valid[x_columns]
valid_y = valid[y_column]

from tensorflow import keras
model = keras.models.load_model('model.keras')

import numpy as np
predictions = model.predict(valid_x)[:,0]
true_answers = valid_y.to_numpy()
validation_accuracy = np.sum(np.rint(predictions) == true_answers)/len(true_answers)
print(f"Accuracy on the validation set: {validation_accuracy:.2%}")
np.savetxt("predictions.txt", predictions)
np.savetxt("predictions_two_digits.txt", predictions, fmt='%1.2f')

validate_heart_disease_true = valid.loc[valid[y_column]==1]
validate_heart_disease_false = valid.loc[valid[y_column]==0]

from datetime import timezone
import datetime
import json

validate_heart_disease_true_x = validate_heart_disease_true[x_columns]
validate_heart_disease_false_x = validate_heart_disease_false[x_columns]

predictions_for_true = model.predict(validate_heart_disease_true_x)[:,0]
predictions_for_false = model.predict(validate_heart_disease_false_x)[:,0]

true_positives = np.sum(np.rint(predictions_for_true) == np.ones_like(predictions_for_true)).tolist()
true_negatives = np.sum(np.rint(predictions_for_false) == np.zeros_like(predictions_for_false)).tolist()
false_positives = len(predictions_for_false) - true_negatives
false_negatives = len(predictions_for_true) - true_positives
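From these four counts the standard derived metrics follow directly (a small illustrative addition; the max() guards avoid division by zero on degenerate splits):

precision = true_positives / max(true_positives + false_positives, 1)
recall = true_positives / max(true_positives + false_negatives, 1)
f1 = 2 * precision * recall / max(precision + recall, 1e-12)
print(f"Precision: {precision:.2%}, Recall: {recall:.2%}, F1: {f1:.2%}")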
current_datetime = datetime.datetime.now(timezone.utc)
metrics = {"true_positives": true_positives, "true_negatives": true_negatives, "false_positives": false_positives, "false_negatives": false_negatives, "datetime_utc": str(current_datetime)}
history = []
try:
    with open("metrics.json", "r") as f:
        history = json.load(f)
except FileNotFoundError:
    print('No historical metrics found')

history.append(metrics)
with open("metrics.json", "w") as f:
    json.dump(history, f)

import matplotlib.pyplot as plt
true_positives_history = [x["true_positives"] for x in history]
true_negatives_history = [x["true_negatives"] for x in history]
false_positives_history = [x["false_positives"] for x in history]
false_negatives_history = [x["false_negatives"] for x in history]

plt.plot(true_positives_history)
plt.plot(true_negatives_history)
plt.plot(false_positives_history)
plt.plot(false_negatives_history)

plt.legend(["True positives", "True negatives", "False positives", "False negatives"])
plt.xlabel("Build number")
plt.ylabel("Metric value")
plt.title("Model evaluation history")
plt.savefig("metrics.jpg")