ium_452487/train.ipynb
2024-04-14 17:30:10 +02:00

18 KiB

import zipfile
# Unpack the dataset archive into a sibling directory so the CSV splits can
# be read with pandas below.
with zipfile.ZipFile("dataset_cleaned.zip", mode='r') as archive:
    archive.extractall(path="dataset_cleaned_extracted")
import pandas as pd
# NOTE(review): the original (Polish) comment here says that the downloaded
# dataset contains several subsets and that the one with NaN values is opened
# on purpose, to clean it manually for practice. The paths below point at the
# "dataset_cleaned_extracted" directory -- confirm the comment still applies.
train = pd.read_csv("dataset_cleaned_extracted/train.csv")
test = pd.read_csv("dataset_cleaned_extracted/test.csv")
valid = pd.read_csv("dataset_cleaned_extracted/valid.csv")
# Names of the columns of the training frame whose dtype is float64.
num_columns = train.select_dtypes(['float64']).columns
print(num_columns)
Index(['Male', 'GeneralHealth', 'PhysicalHealthDays', 'MentalHealthDays',
       'PhysicalActivities', 'SleepHours', 'RemovedTeeth', 'HadHeartAttack',
       'HadAngina', 'HadStroke', 'HadAsthma', 'HadSkinCancer', 'HadCOPD',
       'HadDepressiveDisorder', 'HadKidneyDisease', 'HadArthritis',
       'HadDiabetes', 'DeafOrHardOfHearing', 'BlindOrVisionDifficulty',
       'DifficultyConcentrating', 'DifficultyWalking',
       'DifficultyDressingBathing', 'DifficultyErrands', 'SmokerStatus',
       'ECigaretteUsage', 'ChestScan', 'HeightInMeters', 'WeightInKilograms',
       'BMI', 'AlcoholDrinkers', 'HIVTesting', 'FluVaxLast12', 'PneumoVaxEver',
       'TetanusLast10Tdap', 'HighRiskLastYear', 'CovidPos'],
      dtype='object')
len(num_columns)
36
# Feature columns fed to the model. 'HadHeartAttack' is deliberately absent
# from this list because it is used as the prediction target.
x_columns = [
    "Male",
    "GeneralHealth",
    "PhysicalHealthDays",
    "MentalHealthDays",
    "PhysicalActivities",
    "SleepHours",
    "RemovedTeeth",
    "HadAngina",
    "HadStroke",
    "HadAsthma",
    "HadSkinCancer",
    "HadCOPD",
    "HadDepressiveDisorder",
    "HadKidneyDisease",
    "HadArthritis",
    "HadDiabetes",
    "DeafOrHardOfHearing",
    "BlindOrVisionDifficulty",
    "DifficultyConcentrating",
    "DifficultyWalking",
    "DifficultyDressingBathing",
    "DifficultyErrands",
    "SmokerStatus",
    "ECigaretteUsage",
    "ChestScan",
    "HeightInMeters",
    "WeightInKilograms",
    "BMI",
    "AlcoholDrinkers",
    "HIVTesting",
    "FluVaxLast12",
    "PneumoVaxEver",
    "TetanusLast10Tdap",
    "HighRiskLastYear",
    "CovidPos",
]
print(x_columns)
['Male', 'GeneralHealth', 'PhysicalHealthDays', 'MentalHealthDays', 'PhysicalActivities', 'SleepHours', 'RemovedTeeth', 'HadAngina', 'HadStroke', 'HadAsthma', 'HadSkinCancer', 'HadCOPD', 'HadDepressiveDisorder', 'HadKidneyDisease', 'HadArthritis', 'HadDiabetes', 'DeafOrHardOfHearing', 'BlindOrVisionDifficulty', 'DifficultyConcentrating', 'DifficultyWalking', 'DifficultyDressingBathing', 'DifficultyErrands', 'SmokerStatus', 'ECigaretteUsage', 'ChestScan', 'HeightInMeters', 'WeightInKilograms', 'BMI', 'AlcoholDrinkers', 'HIVTesting', 'FluVaxLast12', 'PneumoVaxEver', 'TetanusLast10Tdap', 'HighRiskLastYear', 'CovidPos']
len(x_columns)
35
# Name of the column the model is trained to predict.
y_column = 'HadHeartAttack'

# Separate the feature matrix from the target for the train and test splits.
train_x, train_y = train[x_columns], train[y_column]
test_x, test_y = test[x_columns], test[y_column]
train.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 676617 entries, 0 to 676616
Data columns (total 41 columns):
 #   Column                     Non-Null Count   Dtype  
---  ------                     --------------   -----  
 0   Unnamed: 0                 676617 non-null  int64  
 1   State                      676617 non-null  object 
 2   Male                       676617 non-null  float64
 3   GeneralHealth              676617 non-null  float64
 4   PhysicalHealthDays         676617 non-null  float64
 5   MentalHealthDays           676617 non-null  float64
 6   LastCheckupTime            676617 non-null  object 
 7   PhysicalActivities         676617 non-null  float64
 8   SleepHours                 676617 non-null  float64
 9   RemovedTeeth               676617 non-null  float64
 10  HadHeartAttack             676617 non-null  float64
 11  HadAngina                  676617 non-null  float64
 12  HadStroke                  676617 non-null  float64
 13  HadAsthma                  676617 non-null  float64
 14  HadSkinCancer              676617 non-null  float64
 15  HadCOPD                    676617 non-null  float64
 16  HadDepressiveDisorder      676617 non-null  float64
 17  HadKidneyDisease           676617 non-null  float64
 18  HadArthritis               676617 non-null  float64
 19  HadDiabetes                676617 non-null  float64
 20  DeafOrHardOfHearing        676617 non-null  float64
 21  BlindOrVisionDifficulty    676617 non-null  float64
 22  DifficultyConcentrating    676617 non-null  float64
 23  DifficultyWalking          676617 non-null  float64
 24  DifficultyDressingBathing  676617 non-null  float64
 25  DifficultyErrands          676617 non-null  float64
 26  SmokerStatus               676617 non-null  float64
 27  ECigaretteUsage            676617 non-null  float64
 28  ChestScan                  676617 non-null  float64
 29  RaceEthnicityCategory      676617 non-null  object 
 30  AgeCategory                676617 non-null  object 
 31  HeightInMeters             676617 non-null  float64
 32  WeightInKilograms          676617 non-null  float64
 33  BMI                        676617 non-null  float64
 34  AlcoholDrinkers            676617 non-null  float64
 35  HIVTesting                 676617 non-null  float64
 36  FluVaxLast12               676617 non-null  float64
 37  PneumoVaxEver              676617 non-null  float64
 38  TetanusLast10Tdap          676617 non-null  float64
 39  HighRiskLastYear           676617 non-null  float64
 40  CovidPos                   676617 non-null  float64
dtypes: float64(36), int64(1), object(4)
memory usage: 211.6+ MB

Definiowanie modelu

import tensorflow as tf
from tensorflow import keras
from keras import layers
from keras.optimizers import Adam
def create_model(input_dim=35, hidden_units=(64, 32), dropout_rate=0.2):
    """Build and compile a feed-forward binary classifier.

    The network is a stack of ``Dense(relu) -> Dropout`` pairs followed by a
    single sigmoid unit. Calling it with no arguments yields the same
    architecture as before (35 inputs, hidden layers of 64 and 32 units,
    dropout 0.2).

    Args:
        input_dim: Number of input features. Pass ``len(x_columns)`` to keep
            the model in sync with the feature list instead of relying on
            the hard-coded default.
        hidden_units: Width of each hidden Dense layer, in order.
        dropout_rate: Dropout rate applied after every hidden layer.

    Returns:
        A compiled ``keras.Model`` using binary cross-entropy loss, the Adam
        optimizer with its default settings, and the accuracy metric.
    """
    inputs = keras.Input(shape=(input_dim,))

    hidden = inputs
    for units in hidden_units:
        hidden = layers.Dense(units, activation="relu")(hidden)
        hidden = layers.Dropout(dropout_rate)(hidden)

    # Single sigmoid unit: the output is a value in [0, 1] per sample.
    output = layers.Dense(1, activation="sigmoid")(hidden)
    model = keras.Model(inputs=inputs, outputs=output)

    model.compile(loss='binary_crossentropy', optimizer=Adam(), metrics=['accuracy'])
    return model

# Build the compiled network and print its layer-by-layer summary.
model = create_model()
model.summary()
Model: "model_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
=================================================================
 input_2 (InputLayer)        [(None, 35)]              0         
                                                                 
 dense_3 (Dense)             (None, 64)                2304      
                                                                 
 dropout_2 (Dropout)         (None, 64)                0         
                                                                 
 dense_4 (Dense)             (None, 32)                2080      
                                                                 
 dropout_3 (Dropout)         (None, 32)                0         
                                                                 
 dense_5 (Dense)             (None, 1)                 33        
                                                                 
=================================================================
Total params: 4,417
Trainable params: 4,417
Non-trainable params: 0
_________________________________________________________________

Trenowanie modelu

# Stop training once val_loss has gone 3 epochs without improving, and roll
# the model back to the weights of the best epoch seen.
callback = keras.callbacks.EarlyStopping(
    monitor='val_loss',
    mode='min',
    patience=3,
    restore_best_weights=True,
)

# NOTE(review): the *test* split is what is passed as validation_data here, so
# it drives early stopping; the split named "valid" is only used later for the
# final accuracy figure. Confirm this naming of the two hold-out sets is
# intended.
# epochs=1000 is only an upper bound; the callback ends training earlier.
history = model.fit(
    train_x,
    train_y,
    validation_data=(test_x, test_y),
    epochs=1000,
    callbacks=[callback],
)
Epoch 1/1000
21145/21145 [==============================] - 21s 963us/step - loss: 0.4343 - accuracy: 0.7991 - val_loss: 0.3911 - val_accuracy: 0.8412
Epoch 2/1000
21145/21145 [==============================] - 20s 957us/step - loss: 0.4262 - accuracy: 0.8043 - val_loss: 0.3980 - val_accuracy: 0.8347
Epoch 3/1000
21145/21145 [==============================] - 20s 959us/step - loss: 0.4227 - accuracy: 0.8057 - val_loss: 0.3904 - val_accuracy: 0.8396
Epoch 4/1000
21145/21145 [==============================] - 20s 950us/step - loss: 0.4202 - accuracy: 0.8073 - val_loss: 0.4032 - val_accuracy: 0.8285
Epoch 5/1000
21145/21145 [==============================] - 20s 962us/step - loss: 0.4184 - accuracy: 0.8083 - val_loss: 0.3639 - val_accuracy: 0.8613
Epoch 6/1000
21145/21145 [==============================] - 20s 965us/step - loss: 0.4172 - accuracy: 0.8086 - val_loss: 0.3897 - val_accuracy: 0.8328
Epoch 7/1000
21145/21145 [==============================] - 20s 954us/step - loss: 0.4155 - accuracy: 0.8094 - val_loss: 0.4143 - val_accuracy: 0.8272
Epoch 8/1000
21145/21145 [==============================] - 21s 970us/step - loss: 0.4145 - accuracy: 0.8102 - val_loss: 0.4026 - val_accuracy: 0.8323

Zapisywanie modelu

# Persist the trained model to disk under the file name "model_v1.keras".
model.save("model_v1.keras")

Testowanie na zbiorze walidacyjnym

import numpy as np

# Features and target of the split that was not passed to model.fit.
valid_x, valid_y = valid[x_columns], valid[y_column]

# model.predict returns one column per output unit; keep the single column
# as a flat vector of sigmoid outputs.
predictions = model.predict(valid_x)[:, 0]
true_answers = valid_y.to_numpy()

# Round each output to the nearest integer to get a hard label, then compare
# with the ground truth element-wise.
is_correct = np.rint(predictions) == true_answers
validation_accuracy = np.sum(is_correct) / len(true_answers)
# NOTE(review): only plain accuracy is reported; if positive cases are rare
# in this split, consider also reporting precision/recall -- TODO confirm.
print(f"Poprawność na zbiorze walidacyjnym: {validation_accuracy:.2%}")
1392/1392 [==============================] - 1s 569us/step
Poprawność na zbiorze walidacyjnym: 86.15%
# Show the first 100 raw model outputs (before rounding to labels).
print(predictions[:100])
[0.08692811 0.12067404 0.31880796 0.64843357 0.15188715 0.06517262
 0.03407578 0.49311596 0.00781232 0.2089161  0.46056542 0.45341685
 0.4294767  0.25619727 0.20345858 0.2302334  0.38631877 0.36519188
 0.04014764 0.23888215 0.27519897 0.08928084 0.05204074 0.42043713
 0.19055638 0.29787344 0.23068897 0.88435644 0.03139259 0.95048493
 0.2457671  0.5858893  0.02678488 0.06240147 0.52132165 0.01431455
 0.02444405 0.07804424 0.11274771 0.12714393 0.35450152 0.01294624
 0.190797   0.07512036 0.48486376 0.06140704 0.9019506  0.08810509
 0.61831665 0.15642735 0.03310075 0.04532438 0.10763614 0.4277772
 0.20325996 0.8980398  0.7491019  0.38502344 0.03970775 0.0401529
 0.03046079 0.10123587 0.04993626 0.05702    0.18049946 0.1223311
 0.731555   0.40104443 0.18443953 0.1265702  0.07467585 0.03895461
 0.35271063 0.38039213 0.4450048  0.03670818 0.05534125 0.91664517
 0.413391   0.12545326 0.11306539 0.4350903  0.48778924 0.40804324
 0.33885244 0.21948677 0.01242744 0.02531701 0.6693964  0.15393472
 0.9307252  0.09181138 0.05571133 0.1261858  0.02687709 0.27069062
 0.22613294 0.20686075 0.47390068 0.40349996]
# Show the same first 100 outputs after rounding to the nearest integer.
print(np.rint(predictions)[:100])
[0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 1. 0. 1. 0. 1. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0.
 1. 0. 0. 0. 0. 0. 0. 1. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 1. 0. 0. 0. 0. 0.
 0. 0. 0. 0.]
# Show the first 100 ground-truth labels for side-by-side comparison.
print(true_answers[:100])
[0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 1. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0.
 0. 0. 0. 0.]