s452487
f9618bc1a5
Liczba 1000 była skopiowana z innego projektu; trenowanie i tak kończyło się po ~3 epokach ze względu na early stopping, ale zmieniłem ją na bardziej rozsądną wartość
83 KiB
83 KiB
import zipfile
# Unpack the cleaned dataset archive next to the notebook; the CSV splits
# are read from the extracted directory afterwards.
with zipfile.ZipFile("dataset_cleaned.zip", mode='r') as archive:
    archive.extractall(path="dataset_cleaned_extracted")
import pandas as pd
# Load the three CSV splits in one pass (order: train, test, valid).
train, test, valid = map(
    pd.read_csv,
    (
        "dataset_cleaned_extracted/train.csv",
        "dataset_cleaned_extracted/test.csv",
        "dataset_cleaned_extracted/valid.csv",
    ),
)

# Names of the float64 columns of the training frame.
num_columns = train.select_dtypes(include=['float64']).columns
print(num_columns)
Index(['Male', 'GeneralHealth', 'PhysicalHealthDays', 'MentalHealthDays', 'PhysicalActivities', 'SleepHours', 'RemovedTeeth', 'HadHeartAttack', 'HadAngina', 'HadStroke', 'HadAsthma', 'HadSkinCancer', 'HadCOPD', 'HadDepressiveDisorder', 'HadKidneyDisease', 'HadArthritis', 'HadDiabetes', 'DeafOrHardOfHearing', 'BlindOrVisionDifficulty', 'DifficultyConcentrating', 'DifficultyWalking', 'DifficultyDressingBathing', 'DifficultyErrands', 'SmokerStatus', 'ECigaretteUsage', 'ChestScan', 'HeightInMeters', 'WeightInKilograms', 'BMI', 'AlcoholDrinkers', 'HIVTesting', 'FluVaxLast12', 'PneumoVaxEver', 'TetanusLast10Tdap', 'HighRiskLastYear', 'CovidPos'], dtype='object')
# Notebook display: how many float64 columns the training frame has.
len(num_columns)
36
# Model input features: the float64 columns of the dataset with the
# prediction target 'HadHeartAttack' left out.
x_columns = [
    'Male',
    'GeneralHealth',
    'PhysicalHealthDays',
    'MentalHealthDays',
    'PhysicalActivities',
    'SleepHours',
    'RemovedTeeth',
    'HadAngina',
    'HadStroke',
    'HadAsthma',
    'HadSkinCancer',
    'HadCOPD',
    'HadDepressiveDisorder',
    'HadKidneyDisease',
    'HadArthritis',
    'HadDiabetes',
    'DeafOrHardOfHearing',
    'BlindOrVisionDifficulty',
    'DifficultyConcentrating',
    'DifficultyWalking',
    'DifficultyDressingBathing',
    'DifficultyErrands',
    'SmokerStatus',
    'ECigaretteUsage',
    'ChestScan',
    'HeightInMeters',
    'WeightInKilograms',
    'BMI',
    'AlcoholDrinkers',
    'HIVTesting',
    'FluVaxLast12',
    'PneumoVaxEver',
    'TetanusLast10Tdap',
    'HighRiskLastYear',
    'CovidPos',
]
# Show the selected feature names.
print(x_columns)
['Male', 'GeneralHealth', 'PhysicalHealthDays', 'MentalHealthDays', 'PhysicalActivities', 'SleepHours', 'RemovedTeeth', 'HadAngina', 'HadStroke', 'HadAsthma', 'HadSkinCancer', 'HadCOPD', 'HadDepressiveDisorder', 'HadKidneyDisease', 'HadArthritis', 'HadDiabetes', 'DeafOrHardOfHearing', 'BlindOrVisionDifficulty', 'DifficultyConcentrating', 'DifficultyWalking', 'DifficultyDressingBathing', 'DifficultyErrands', 'SmokerStatus', 'ECigaretteUsage', 'ChestScan', 'HeightInMeters', 'WeightInKilograms', 'BMI', 'AlcoholDrinkers', 'HIVTesting', 'FluVaxLast12', 'PneumoVaxEver', 'TetanusLast10Tdap', 'HighRiskLastYear', 'CovidPos']
# Notebook display: number of input features fed to the model.
len(x_columns)
35
# Column the model learns to predict.
y_column = 'HadHeartAttack'

# Separate each split into its feature matrix and its target vector.
train_x, train_y = train[x_columns], train[y_column]
test_x, test_y = test[x_columns], test[y_column]

# Print the schema of the training frame (dtypes and non-null counts).
train.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 676617 entries, 0 to 676616 Data columns (total 41 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Unnamed: 0 676617 non-null int64 1 State 676617 non-null object 2 Male 676617 non-null float64 3 GeneralHealth 676617 non-null float64 4 PhysicalHealthDays 676617 non-null float64 5 MentalHealthDays 676617 non-null float64 6 LastCheckupTime 676617 non-null object 7 PhysicalActivities 676617 non-null float64 8 SleepHours 676617 non-null float64 9 RemovedTeeth 676617 non-null float64 10 HadHeartAttack 676617 non-null float64 11 HadAngina 676617 non-null float64 12 HadStroke 676617 non-null float64 13 HadAsthma 676617 non-null float64 14 HadSkinCancer 676617 non-null float64 15 HadCOPD 676617 non-null float64 16 HadDepressiveDisorder 676617 non-null float64 17 HadKidneyDisease 676617 non-null float64 18 HadArthritis 676617 non-null float64 19 HadDiabetes 676617 non-null float64 20 DeafOrHardOfHearing 676617 non-null float64 21 BlindOrVisionDifficulty 676617 non-null float64 22 DifficultyConcentrating 676617 non-null float64 23 DifficultyWalking 676617 non-null float64 24 DifficultyDressingBathing 676617 non-null float64 25 DifficultyErrands 676617 non-null float64 26 SmokerStatus 676617 non-null float64 27 ECigaretteUsage 676617 non-null float64 28 ChestScan 676617 non-null float64 29 RaceEthnicityCategory 676617 non-null object 30 AgeCategory 676617 non-null object 31 HeightInMeters 676617 non-null float64 32 WeightInKilograms 676617 non-null float64 33 BMI 676617 non-null float64 34 AlcoholDrinkers 676617 non-null float64 35 HIVTesting 676617 non-null float64 36 FluVaxLast12 676617 non-null float64 37 PneumoVaxEver 676617 non-null float64 38 TetanusLast10Tdap 676617 non-null float64 39 HighRiskLastYear 676617 non-null float64 40 CovidPos 676617 non-null float64 dtypes: float64(36), int64(1), object(4) memory usage: 211.6+ MB
Definiowanie modelu
import tensorflow as tf
from tensorflow import keras
from keras import layers
from keras.optimizers import Adam
def create_model(input_dim: int = 35):
    """Build and compile a small feed-forward binary classifier.

    Architecture: Dense(64, relu) -> Dropout(0.2) -> Dense(32, relu) ->
    Dropout(0.2) -> Dense(1, sigmoid). The model is compiled with binary
    cross-entropy loss, the Adam optimizer with default settings, and
    accuracy as the reported metric.

    Args:
        input_dim: Number of input features per sample. Defaults to 35,
            the value that was previously hard-coded.

    Returns:
        The compiled ``keras.Model``.
    """
    inputs = keras.Input(shape=(input_dim,))
    dense1 = layers.Dense(64, activation="relu")(inputs)
    dropout1 = layers.Dropout(0.2)(dense1)
    dense2 = layers.Dense(32, activation="relu")(dropout1)
    dropout2 = layers.Dropout(0.2)(dense2)
    # Single sigmoid unit: the output is the predicted probability of the
    # positive class.
    output = layers.Dense(1, activation="sigmoid")(dropout2)
    model = keras.Model(inputs=inputs, outputs=output)
    model.compile(loss='binary_crossentropy', optimizer=Adam(), metrics=['accuracy'])
    return model


# Take the input width from the feature list so that the model and the data
# cannot drift apart when features are added or removed.
model = create_model(input_dim=len(x_columns))
model.summary()
Model: "model" _________________________________________________________________ Layer (type) Output Shape Param # ================================================================= input_1 (InputLayer) [(None, 35)] 0 dense (Dense) (None, 64) 2304 dropout (Dropout) (None, 64) 0 dense_1 (Dense) (None, 32) 2080 dropout_1 (Dropout) (None, 32) 0 dense_2 (Dense) (None, 1) 33 ================================================================= Total params: 4,417 Trainable params: 4,417 Non-trainable params: 0 _________________________________________________________________
Trenowanie modelu
# Early stopping as regularization: stop once val_loss has not improved for
# 3 consecutive epochs and roll back to the weights of the best epoch.
# NOTE(review): the `test` split is what drives early stopping here, while
# the `valid` split is used for the final evaluation - confirm that this
# naming is intended.
callback = keras.callbacks.EarlyStopping(
    monitor='val_loss',
    mode='min',
    patience=3,
    restore_best_weights=True,
)
history = model.fit(
    x=train_x,
    y=train_y,
    validation_data=(test_x, test_y),
    epochs=11,
    callbacks=[callback],
)
Epoch 1/11 21145/21145 [==============================] - 22s 994us/step - loss: 0.4334 - accuracy: 0.7998 - val_loss: 0.3714 - val_accuracy: 0.8448 Epoch 2/11 21145/21145 [==============================] - 21s 972us/step - loss: 0.4257 - accuracy: 0.8038 - val_loss: 0.4273 - val_accuracy: 0.8249 Epoch 3/11 21145/21145 [==============================] - 21s 992us/step - loss: 0.4224 - accuracy: 0.8056 - val_loss: 0.4245 - val_accuracy: 0.8219 Epoch 4/11 21145/21145 [==============================] - 20s 962us/step - loss: 0.4201 - accuracy: 0.8074 - val_loss: 0.4108 - val_accuracy: 0.8234
Historia treningu
Accuracy
from matplotlib import pyplot as plt
# Accuracy per epoch on the training data and on the data that was passed
# to fit() as validation_data (labelled 'test' in the legend).
plt.plot(history.history['accuracy'])
plt.plot(history.history['val_accuracy'])
plt.title('model accuracy')
plt.ylabel('accuracy')
plt.xlabel('epoch')
plt.legend(['train', 'test'], loc='upper left')
# Finish the figure explicitly. Without this, when the code runs outside a
# notebook, the next plt.plot() calls would draw onto these same axes.
plt.show()
<matplotlib.legend.Legend at 0x226e7b95760>
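Test loss < Train loss (i analogicznie test accuracy > train accuracy) ze względu na warstwy dropout: dropout jest aktywny tylko podczas trenowania, więc metryki treningowe są liczone na sieci z losowo wyłączoną częścią neuronów, a metryki testowe – na pełnej sieci.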
Loss
# Loss per epoch: first the training curve, then the curve for the data
# that was passed to fit() as validation_data (labelled 'test').
for curve in ('loss', 'val_loss'):
    plt.plot(history.history[curve])
plt.xlabel('epoch')
plt.ylabel('loss')
plt.title('model loss')
plt.legend(['train', 'test'], loc='upper left')
plt.show()
Zapisywanie modelu
# Persist the trained model to model_v1.keras in the working directory.
model.save("model_v1.keras")
Testowanie na zbiorze walidacyjnym
import numpy as np

# Final check on the `valid` split, which was not passed to fit().
valid_x, valid_y = valid[x_columns], valid[y_column]

# The network has a single sigmoid output unit, so predict() yields one
# column; take it as a flat vector of probabilities.
predictions = model.predict(valid_x)[:, 0]
true_answers = valid_y.to_numpy()

# Round the probabilities to 0/1 labels; the fraction of labels that match
# the ground truth is the accuracy.
validation_accuracy = np.mean(np.rint(predictions) == true_answers)
print(f"Poprawność na zbiorze walidacyjnym: {validation_accuracy:.2%}")
1392/1392 [==============================] - 1s 569us/step Poprawność na zbiorze walidacyjnym: 86.15%
# Raw predicted probabilities for the first 100 validation rows.
print(predictions[:100])
[0.08692811 0.12067404 0.31880796 0.64843357 0.15188715 0.06517262 0.03407578 0.49311596 0.00781232 0.2089161 0.46056542 0.45341685 0.4294767 0.25619727 0.20345858 0.2302334 0.38631877 0.36519188 0.04014764 0.23888215 0.27519897 0.08928084 0.05204074 0.42043713 0.19055638 0.29787344 0.23068897 0.88435644 0.03139259 0.95048493 0.2457671 0.5858893 0.02678488 0.06240147 0.52132165 0.01431455 0.02444405 0.07804424 0.11274771 0.12714393 0.35450152 0.01294624 0.190797 0.07512036 0.48486376 0.06140704 0.9019506 0.08810509 0.61831665 0.15642735 0.03310075 0.04532438 0.10763614 0.4277772 0.20325996 0.8980398 0.7491019 0.38502344 0.03970775 0.0401529 0.03046079 0.10123587 0.04993626 0.05702 0.18049946 0.1223311 0.731555 0.40104443 0.18443953 0.1265702 0.07467585 0.03895461 0.35271063 0.38039213 0.4450048 0.03670818 0.05534125 0.91664517 0.413391 0.12545326 0.11306539 0.4350903 0.48778924 0.40804324 0.33885244 0.21948677 0.01242744 0.02531701 0.6693964 0.15393472 0.9307252 0.09181138 0.05571133 0.1261858 0.02687709 0.27069062 0.22613294 0.20686075 0.47390068 0.40349996]
# First 100 predictions rounded to 0/1 labels.
print(np.rint(predictions)[:100])
[0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 1. 0. 1. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 1. 0. 0. 0. 0. 0. 0. 1. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
# Ground-truth labels for the first 100 validation rows.
print(true_answers[:100])
[0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0.]