ium_452487/train.ipynb
s452487 f9618bc1a5 Add plots to the training run and reduce the number of epochs
The value 1000 was copied from another project; training ended after ~3 epochs anyway because of early stopping, but I changed it to a more sensible value.
2024-04-20 14:55:06 +02:00

import zipfile
with zipfile.ZipFile("dataset_cleaned.zip", 'r') as zip_ref:
    zip_ref.extractall("dataset_cleaned_extracted")
import pandas as pd
train = pd.read_csv("dataset_cleaned_extracted/train.csv")
test = pd.read_csv("dataset_cleaned_extracted/test.csv")
valid = pd.read_csv("dataset_cleaned_extracted/valid.csv")
num_columns = train.select_dtypes(['float64']).columns
print(num_columns)
Index(['Male', 'GeneralHealth', 'PhysicalHealthDays', 'MentalHealthDays',
       'PhysicalActivities', 'SleepHours', 'RemovedTeeth', 'HadHeartAttack',
       'HadAngina', 'HadStroke', 'HadAsthma', 'HadSkinCancer', 'HadCOPD',
       'HadDepressiveDisorder', 'HadKidneyDisease', 'HadArthritis',
       'HadDiabetes', 'DeafOrHardOfHearing', 'BlindOrVisionDifficulty',
       'DifficultyConcentrating', 'DifficultyWalking',
       'DifficultyDressingBathing', 'DifficultyErrands', 'SmokerStatus',
       'ECigaretteUsage', 'ChestScan', 'HeightInMeters', 'WeightInKilograms',
       'BMI', 'AlcoholDrinkers', 'HIVTesting', 'FluVaxLast12', 'PneumoVaxEver',
       'TetanusLast10Tdap', 'HighRiskLastYear', 'CovidPos'],
      dtype='object')
len(num_columns)
36
x_columns = ['Male', 'GeneralHealth', 'PhysicalHealthDays', 'MentalHealthDays',
       'PhysicalActivities', 'SleepHours', 'RemovedTeeth',
       'HadAngina', 'HadStroke', 'HadAsthma', 'HadSkinCancer', 'HadCOPD',
       'HadDepressiveDisorder', 'HadKidneyDisease', 'HadArthritis',
       'HadDiabetes', 'DeafOrHardOfHearing', 'BlindOrVisionDifficulty',
       'DifficultyConcentrating', 'DifficultyWalking',
       'DifficultyDressingBathing', 'DifficultyErrands', 'SmokerStatus',
       'ECigaretteUsage', 'ChestScan', 'HeightInMeters', 'WeightInKilograms',
       'BMI', 'AlcoholDrinkers', 'HIVTesting', 'FluVaxLast12', 'PneumoVaxEver',
       'TetanusLast10Tdap', 'HighRiskLastYear', 'CovidPos']
print(x_columns)
['Male', 'GeneralHealth', 'PhysicalHealthDays', 'MentalHealthDays', 'PhysicalActivities', 'SleepHours', 'RemovedTeeth', 'HadAngina', 'HadStroke', 'HadAsthma', 'HadSkinCancer', 'HadCOPD', 'HadDepressiveDisorder', 'HadKidneyDisease', 'HadArthritis', 'HadDiabetes', 'DeafOrHardOfHearing', 'BlindOrVisionDifficulty', 'DifficultyConcentrating', 'DifficultyWalking', 'DifficultyDressingBathing', 'DifficultyErrands', 'SmokerStatus', 'ECigaretteUsage', 'ChestScan', 'HeightInMeters', 'WeightInKilograms', 'BMI', 'AlcoholDrinkers', 'HIVTesting', 'FluVaxLast12', 'PneumoVaxEver', 'TetanusLast10Tdap', 'HighRiskLastYear', 'CovidPos']
len(x_columns)
35
y_column = 'HadHeartAttack'
train_x = train[x_columns]
train_y = train[y_column]

test_x = test[x_columns]
test_y = test[y_column]
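The feature list above is just the numeric columns with the target removed; as a sanity check, it could also be derived programmatically (a minimal sketch, not part of the original notebook):

# Derive the features from the numeric columns instead of copying the list by hand.
x_columns_derived = [c for c in num_columns if c != y_column]
assert x_columns_derived == x_columns
assert len(x_columns_derived) == len(num_columns) - 1  # 36 - 1 = 35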
train.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 676617 entries, 0 to 676616
Data columns (total 41 columns):
 #   Column                     Non-Null Count   Dtype  
---  ------                     --------------   -----  
 0   Unnamed: 0                 676617 non-null  int64  
 1   State                      676617 non-null  object 
 2   Male                       676617 non-null  float64
 3   GeneralHealth              676617 non-null  float64
 4   PhysicalHealthDays         676617 non-null  float64
 5   MentalHealthDays           676617 non-null  float64
 6   LastCheckupTime            676617 non-null  object 
 7   PhysicalActivities         676617 non-null  float64
 8   SleepHours                 676617 non-null  float64
 9   RemovedTeeth               676617 non-null  float64
 10  HadHeartAttack             676617 non-null  float64
 11  HadAngina                  676617 non-null  float64
 12  HadStroke                  676617 non-null  float64
 13  HadAsthma                  676617 non-null  float64
 14  HadSkinCancer              676617 non-null  float64
 15  HadCOPD                    676617 non-null  float64
 16  HadDepressiveDisorder      676617 non-null  float64
 17  HadKidneyDisease           676617 non-null  float64
 18  HadArthritis               676617 non-null  float64
 19  HadDiabetes                676617 non-null  float64
 20  DeafOrHardOfHearing        676617 non-null  float64
 21  BlindOrVisionDifficulty    676617 non-null  float64
 22  DifficultyConcentrating    676617 non-null  float64
 23  DifficultyWalking          676617 non-null  float64
 24  DifficultyDressingBathing  676617 non-null  float64
 25  DifficultyErrands          676617 non-null  float64
 26  SmokerStatus               676617 non-null  float64
 27  ECigaretteUsage            676617 non-null  float64
 28  ChestScan                  676617 non-null  float64
 29  RaceEthnicityCategory      676617 non-null  object 
 30  AgeCategory                676617 non-null  object 
 31  HeightInMeters             676617 non-null  float64
 32  WeightInKilograms          676617 non-null  float64
 33  BMI                        676617 non-null  float64
 34  AlcoholDrinkers            676617 non-null  float64
 35  HIVTesting                 676617 non-null  float64
 36  FluVaxLast12               676617 non-null  float64
 37  PneumoVaxEver              676617 non-null  float64
 38  TetanusLast10Tdap          676617 non-null  float64
 39  HighRiskLastYear           676617 non-null  float64
 40  CovidPos                   676617 non-null  float64
dtypes: float64(36), int64(1), object(4)
memory usage: 211.6+ MB

Defining the model

import tensorflow as tf
from tensorflow import keras
from keras import layers
from keras.optimizers import Adam
def create_model():
    inputs = keras.Input(shape=(35,))
    dense1 = layers.Dense(64, activation="relu")(inputs)
    dropout1 = layers.Dropout(0.2)(dense1)
    dense2 = layers.Dense(32, activation="relu")(dropout1)
    dropout2 = layers.Dropout(0.2)(dense2)
    output = layers.Dense(1, activation="sigmoid")(dropout2)
    model = keras.Model(inputs=inputs, outputs=output)

    model.compile(loss='binary_crossentropy', optimizer=Adam(), metrics=['accuracy'])
    return model

model = create_model()
model.summary()
Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
=================================================================
 input_1 (InputLayer)        [(None, 35)]              0         
                                                                 
 dense (Dense)               (None, 64)                2304      
                                                                 
 dropout (Dropout)           (None, 64)                0         
                                                                 
 dense_1 (Dense)             (None, 32)                2080      
                                                                 
 dropout_1 (Dropout)         (None, 32)                0         
                                                                 
 dense_2 (Dense)             (None, 1)                 33        
                                                                 
=================================================================
Total params: 4,417
Trainable params: 4,417
Non-trainable params: 0
_________________________________________________________________
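The parameter counts in the summary are just weights plus biases per Dense layer; a quick arithmetic check:

# Dense layer parameters = inputs * units + units (bias)
assert 35 * 64 + 64 == 2304  # dense
assert 64 * 32 + 32 == 2080  # dense_1
assert 32 * 1 + 1 == 33      # dense_2
assert 2304 + 2080 + 33 == 4417  # total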

Training the model

# Early stopping for regularization
callback = keras.callbacks.EarlyStopping(monitor='val_loss', mode='min', patience=3, restore_best_weights=True)
history = model.fit(train_x, train_y, validation_data=(test_x, test_y), epochs=11, callbacks=[callback])
Epoch 1/11
21145/21145 [==============================] - 22s 994us/step - loss: 0.4334 - accuracy: 0.7998 - val_loss: 0.3714 - val_accuracy: 0.8448
Epoch 2/11
21145/21145 [==============================] - 21s 972us/step - loss: 0.4257 - accuracy: 0.8038 - val_loss: 0.4273 - val_accuracy: 0.8249
Epoch 3/11
21145/21145 [==============================] - 21s 992us/step - loss: 0.4224 - accuracy: 0.8056 - val_loss: 0.4245 - val_accuracy: 0.8219
Epoch 4/11
21145/21145 [==============================] - 20s 962us/step - loss: 0.4201 - accuracy: 0.8074 - val_loss: 0.4108 - val_accuracy: 0.8234

Training history

Early stopping ended the run after 4 of the 11 epochs: the epoch-1 validation loss (0.3714) was not improved upon during the next 3 epochs (patience=3), and restore_best_weights=True restored the epoch-1 weights.

Accuracy
from matplotlib import pyplot as plt
plt.plot(history.history['accuracy'])
plt.plot(history.history['val_accuracy'])
plt.title('model accuracy')
plt.ylabel('accuracy')
plt.xlabel('epoch')
plt.legend(['train', 'test'], loc='upper left')
plt.show()

The validation (test) loss is lower than the training loss because the dropout layers are active only during training and are disabled during evaluation.
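One way to see this effect is to re-evaluate the model on the training data with model.evaluate, which runs in inference mode with dropout disabled; the loss it reports is typically lower than the per-epoch training loss printed by fit (a sketch, not part of the original run):

# Dropout is inactive during evaluate(), so this is the "clean" training loss.
train_loss_eval, train_acc_eval = model.evaluate(train_x, train_y, verbose=0)
print(f"Training loss with dropout disabled: {train_loss_eval:.4f}")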

Loss
plt.plot(history.history['loss'])
plt.plot(history.history['val_loss'])
plt.title('model loss')
plt.ylabel('loss')
plt.xlabel('epoch')
plt.legend(['train', 'test'], loc='upper left')
plt.show()

Saving the model

model.save("model_v1.keras")
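A quick round-trip check that the saved .keras file loads back and reproduces the same predictions (a sketch, not part of the original notebook):

import numpy as np
# Reload the saved model and compare predictions on a small sample.
reloaded = keras.models.load_model("model_v1.keras")
sample = test_x.iloc[:10]
assert np.allclose(model.predict(sample, verbose=0), reloaded.predict(sample, verbose=0))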

Testing on the validation set

valid_x = valid[x_columns]
valid_y = valid[y_column]
import numpy as np
predictions = model.predict(valid_x)[:,0]
true_answers = valid_y.to_numpy()
validation_accuracy = np.sum(np.rint(predictions) == true_answers)/len(true_answers)
print(f"Poprawność na zbiorze walidacyjnym: {validation_accuracy:.2%}")
1392/1392 [==============================] - 1s 569us/step
Accuracy on the validation set: 86.15%
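Because the positive class (HadHeartAttack) appears to be rare in the sample shown below, accuracy alone can look optimistic; precision, recall, and a confusion matrix give a fuller picture (a sketch, assuming scikit-learn is installed):

# Metrics at the default 0.5 threshold; not part of the original notebook.
from sklearn.metrics import classification_report, confusion_matrix
pred_labels = np.rint(predictions)
print(confusion_matrix(true_answers, pred_labels))
print(classification_report(true_answers, pred_labels, digits=3))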
print(predictions[:100])
[0.08692811 0.12067404 0.31880796 0.64843357 0.15188715 0.06517262
 0.03407578 0.49311596 0.00781232 0.2089161  0.46056542 0.45341685
 0.4294767  0.25619727 0.20345858 0.2302334  0.38631877 0.36519188
 0.04014764 0.23888215 0.27519897 0.08928084 0.05204074 0.42043713
 0.19055638 0.29787344 0.23068897 0.88435644 0.03139259 0.95048493
 0.2457671  0.5858893  0.02678488 0.06240147 0.52132165 0.01431455
 0.02444405 0.07804424 0.11274771 0.12714393 0.35450152 0.01294624
 0.190797   0.07512036 0.48486376 0.06140704 0.9019506  0.08810509
 0.61831665 0.15642735 0.03310075 0.04532438 0.10763614 0.4277772
 0.20325996 0.8980398  0.7491019  0.38502344 0.03970775 0.0401529
 0.03046079 0.10123587 0.04993626 0.05702    0.18049946 0.1223311
 0.731555   0.40104443 0.18443953 0.1265702  0.07467585 0.03895461
 0.35271063 0.38039213 0.4450048  0.03670818 0.05534125 0.91664517
 0.413391   0.12545326 0.11306539 0.4350903  0.48778924 0.40804324
 0.33885244 0.21948677 0.01242744 0.02531701 0.6693964  0.15393472
 0.9307252  0.09181138 0.05571133 0.1261858  0.02687709 0.27069062
 0.22613294 0.20686075 0.47390068 0.40349996]
print(np.rint(predictions)[:100])
[0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 1. 0. 1. 0. 1. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0.
 1. 0. 0. 0. 0. 0. 0. 1. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 1. 0. 0. 0. 0. 0.
 0. 0. 0. 0.]
print(true_answers[:100])
[0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 1. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0.
 0. 0. 0. 0.]