diff --git a/Dockerfile b/Dockerfile
new file mode 100644
index 0000000..472bdf0
--- /dev/null
+++ b/Dockerfile
@@ -0,0 +1,6 @@
+FROM python:3
+
+COPY requirements.txt requirements.txt
+RUN pip install -r requirements.txt
+COPY . .
+CMD ["sh", "dataset_download_and_run.sh"]
\ No newline at end of file
diff --git a/dane.py b/dane.py
new file mode 100644
index 0000000..34f43e3
--- /dev/null
+++ b/dane.py
@@ -0,0 +1,142 @@
+## 1. Downloading the dataset
+import zipfile
+with zipfile.ZipFile("personal-key-indicators-of-heart-disease.zip", 'r') as zip_ref:
+    zip_ref.extractall("dataset_extracted")
+import pandas as pd
+# The downloaded dataset contains several sub-datasets; I deliberately open the one with NaNs so that I can clean it manually, for practice
+df = pd.read_csv("dataset_extracted/2022/heart_2022_with_nans.csv")
+### Inspecting the uncleaned dataset
+df.info()
+df.head()
+df.describe()
+df["HadHeartAttack"].value_counts().plot(kind="pie")
+df["HadHeartAttack"].value_counts()
+
+## 2. Splitting into subsets (train / dev / test - 8:1:1) and oversampling
+from sklearn.model_selection import train_test_split
+# The sklearn function has to be used twice, because it only splits into two subsets
+train, test_and_valid = train_test_split(df, test_size=0.2)  # 0.8 train, 0.2 test&valid
+
+test, valid = train_test_split(test_and_valid, test_size=0.5)  # 0.1 test, 0.1 valid
+train["HadHeartAttack"].value_counts()
+def oversample(dataset):
+    # Duplicate the minority ("Yes") class until the two classes are roughly balanced
+    num_true = len(dataset[dataset["HadHeartAttack"] == "Yes"])
+    num_false = len(dataset[dataset["HadHeartAttack"] == "No"])
+    num_oversampling_steps = num_false // num_true
+    oversampled = dataset.copy()
+    for x in range(num_oversampling_steps):
+        oversampled = pd.concat([oversampled, dataset[dataset["HadHeartAttack"] == "Yes"]], ignore_index=True)
+    return oversampled
+train = oversample(train)
+train["HadHeartAttack"].value_counts().plot(kind="pie")
+test["HadHeartAttack"].value_counts().plot(kind="pie")
+valid["HadHeartAttack"].value_counts().plot(kind="pie")
+df["SmokerStatus"].value_counts().plot(kind="pie")
+df["ECigaretteUsage"].value_counts().plot(kind="pie")
+df["CovidPos"].value_counts().plot(kind="pie")
+## 3. Normalization, part 1 - converting to numeric and categorical columns
+df["Sex"].unique()
+df["GeneralHealth"].unique()
+health_map = {
+    "Excellent": 5,
+    "Very good": 4,
+    "Good": 3,
+    "Fair": 2,
+    "Poor": 1
+}
+for col in df:
+    print(f"{col}:")
+    print(df[col].unique())
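+# Quick illustrative sanity check (an aside, not required by the pipeline):
+# Series.map with a plain dict already yields NaN for labels it does not know,
+# and the defaultdict used in normalize_dataset below only makes that fallback explicit
+print(pd.Series(["Good", "Poor", "???"]).map(health_map))  # "???" -> NaN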
dataset["HadAngina"].map({"No":0,"Yes":1}) + dataset["HadStroke"]= dataset["HadStroke"].map({"No":0,"Yes":1}) + dataset["HadAsthma"]= dataset["HadAsthma"].map({"No":0,"Yes":1}) + dataset["HadSkinCancer"]= dataset["HadSkinCancer"].map({"No":0,"Yes":1}) + dataset["HadCOPD"]= dataset["HadCOPD"].map({"No":0,"Yes":1}) + dataset["HadDepressiveDisorder"]= dataset["HadDepressiveDisorder"].map({"No":0,"Yes":1}) + dataset["HadKidneyDisease"]= dataset["HadKidneyDisease"].map({"No":0,"Yes":1}) + dataset["HadArthritis"]= dataset["HadArthritis"].map({"No":0,"Yes":1}) + dataset["HadDiabetes"]= dataset["HadDiabetes"].map({"No":0,"Yes, but only during pregnancy (female)":1,"No, pre-diabetes or borderline diabetes":2,"Yes":3}) + + dataset["DeafOrHardOfHearing"]= dataset["DeafOrHardOfHearing"].map({"No":0,"Yes":1}) + dataset["BlindOrVisionDifficulty"]= dataset["BlindOrVisionDifficulty"].map({"No":0,"Yes":1}) + dataset["DifficultyConcentrating"]= dataset["DifficultyConcentrating"].map({"No":0,"Yes":1}) + dataset["DifficultyWalking"]= dataset["DifficultyWalking"].map({"No":0,"Yes":1}) + dataset["DifficultyDressingBathing"]= dataset["DifficultyDressingBathing"].map({"No":0,"Yes":1}) + dataset["DifficultyErrands"]= dataset["DifficultyErrands"].map({"No":0,"Yes":1}) + dataset["SmokerStatus"]= dataset["SmokerStatus"].map({"Never smoked":0,"Current smoker - now smokes some days":1,"Former smoker":2,"Current smoker - now smokes every day":3}) + dataset["ECigaretteUsage"]= dataset["ECigaretteUsage"].map({"Never used e-cigarettes in my entire life":0,"Not at all (right now)":1,"Use them some days":2,"Use them every day":3}) + dataset["ChestScan"]= dataset["ChestScan"].map({"No":0,"Yes":1}) + dataset["RaceEthnicityCategory"] = dataset["RaceEthnicityCategory"].fillna("Unknown").astype('category') + dataset["AgeCategory"] = dataset["AgeCategory"].fillna("Unknown").astype('category') + dataset["HeightInMeters"] = dataset["HeightInMeters"].astype(float) + dataset["WeightInKilograms"] = dataset["WeightInKilograms"].astype(float) + dataset["BMI"] = dataset["BMI"].astype(float) + dataset["AlcoholDrinkers"]= dataset["AlcoholDrinkers"].map({"No":0,"Yes":1}) + dataset["HIVTesting"]= dataset["HIVTesting"].map({"No":0,"Yes":1}) + dataset["FluVaxLast12"]= dataset["FluVaxLast12"].map({"No":0,"Yes":1}) + dataset["PneumoVaxEver"]= dataset["PneumoVaxEver"].map({"No":0,"Yes":1}) + dataset["TetanusLast10Tdap"]= dataset["TetanusLast10Tdap"].apply(lambda x: float('NaN') if type(x)!=str else 1.0 if 'Yes,' in x else 1.0 if 'No,' in x else float('NaN')) + dataset["HighRiskLastYear"]= dataset["HighRiskLastYear"].map({"No":0,"Yes":1}) + dataset["CovidPos"]= dataset["CovidPos"].map({"No":0,"Yes":1}) +test.head() +normalize_dataset(test) +test.head() +test.info() +normalize_dataset(train) +normalize_dataset(valid) +train.describe() +test.describe() +valid.describe() +import seaborn as sns +sns.set_theme() +g = sns.catplot( + data=train, kind="bar", + x="GeneralHealth", y="WeightInKilograms", hue="HadHeartAttack", + errorbar="sd", palette="dark", alpha=.6, height=6 +) +g.despine(left=True) +g.set_axis_labels("General health index", "Body mass (kg)") +g.legend.set_title("Had heart attack") +valid.groupby('SmokerStatus', as_index=False)['HadHeartAttack'].mean() +valid.groupby('GeneralHealth', as_index=False)['HadHeartAttack'].mean() +valid.pivot_table('HadHeartAttack',index='GeneralHealth', columns='SmokerStatus') +## Normalizacja część 2 - Skalowanie kolumn numerycznych do 0-1 +from sklearn.preprocessing import MinMaxScaler +scaler = MinMaxScaler() 
+# Fit the scaler on the training split only and reuse its min/max for the other
+# splits, so that test/valid information does not leak into the preprocessing
+# and the same column set is scaled in every split
+float_columns = list(train.select_dtypes(include=['float64']).columns)
+scaler.fit(train[float_columns])
+def scale_float_columns(dataset):
+    dataset[float_columns] = scaler.transform(dataset[float_columns])
+test.head()
+scale_float_columns(train)
+scale_float_columns(test)
+scale_float_columns(valid)
+test.head()
+## 5. Cleaning up missing fields
+print(df.shape[0])
+print(df.shape[0] - df.dropna().shape[0])
+test.head()
+
+numeric_columns = train.select_dtypes(include=['number']).columns
+# Fill the remaining NaNs with per-column medians, computed on the training
+# split and reused for test/valid
+train_medians = train[numeric_columns].median()
+train[numeric_columns] = train[numeric_columns].fillna(train_medians)
+test[numeric_columns] = test[numeric_columns].fillna(train_medians)
+valid[numeric_columns] = valid[numeric_columns].fillna(train_medians)
+
+test.head()
+test["HighRiskLastYear"].value_counts()
+test["HighRiskLastYear"].isna().sum()
+test.info()
+train.info()
+valid.info()
diff --git a/dataset_download_and_run.sh b/dataset_download_and_run.sh
new file mode 100644
index 0000000..f9a2b54
--- /dev/null
+++ b/dataset_download_and_run.sh
@@ -0,0 +1,4 @@
+export KAGGLE_USERNAME=${kaggle_username}
+export KAGGLE_KEY=${kaggle_password}
+kaggle datasets download -d kamilpytlak/personal-key-indicators-of-heart-disease
+python ./dane.py
\ No newline at end of file
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..72c4f6d
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,4 @@
+kaggle
+pandas
+scikit-learn
+seaborn
\ No newline at end of file
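For reference, a minimal way to run this pipeline locally (a sketch: the image tag is arbitrary, and real Kaggle API credentials must be supplied through the `kaggle_username` / `kaggle_password` environment variables the entrypoint script reads):

    docker build -t heart-disease .
    docker run -e kaggle_username=<your_kaggle_username> -e kaggle_password=<your_kaggle_api_key> heart-disease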