75 KiB
75 KiB
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from collections import Counter
from datasets import load_dataset
# Load the "mstz/liver" dataset with `datasets.load_dataset` and keep only its
# 'train' split.
dataset = load_dataset("mstz/liver")['train']
Found cached dataset liver (/Users/natalia.szymczyk/.cache/huggingface/datasets/mstz___liver/liver/1.0.0/3115a4001e742dc2c89457a3906d35982a649915f71f35fc5e6d025c786eeacf) 100%|██████████| 1/1 [00:00<00:00, 684.45it/s]
dataset
Dataset({ features: ['age', 'is_male', 'total_bilirubin', 'direct_ribilubin', 'alkaline_phosphotase', 'alamine_aminotransferasi', 'aspartate_aminotransferase', 'total_proteins', 'albumin', 'albumin_to_globulin_ratio', 'class'], num_rows: 583 })
# Convert the loaded dataset object into a pandas DataFrame.
dataset = dataset.to_pandas()
# Hold out 20% of all rows as the test split; the fixed random_state makes the
# split reproducible.
train, test = train_test_split(dataset, test_size=0.2, random_state=42)
# Carve 20% of the remaining rows out as the validation split; `train` is
# rebound to what is left.
train, val = train_test_split(train, test_size=0.2, random_state=42)
# Display the training split.
train
age | is_male | total_bilirubin | direct_ribilubin | alkaline_phosphotase | alamine_aminotransferasi | aspartate_aminotransferase | total_proteins | albumin | albumin_to_globulin_ratio | class | |
---|---|---|---|---|---|---|---|---|---|---|---|
107 | 36 | True | 0.8 | 0.2 | 158 | 29 | 39 | 6.0 | 2.2 | 0.50 | 1 |
33 | 38 | False | 2.6 | 1.2 | 410 | 59 | 57 | 5.6 | 3.0 | 0.80 | 1 |
534 | 39 | True | 1.6 | 0.8 | 230 | 88 | 74 | 8.0 | 4.0 | 1.00 | 1 |
204 | 21 | True | 0.7 | 0.2 | 135 | 27 | 26 | 6.4 | 3.3 | 1.00 | 1 |
48 | 32 | False | 0.6 | 0.1 | 176 | 39 | 28 | 6.0 | 3.0 | 1.00 | 0 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
42 | 42 | True | 6.8 | 3.2 | 630 | 25 | 47 | 6.1 | 2.3 | 0.60 | 1 |
179 | 75 | True | 8.0 | 4.6 | 386 | 30 | 25 | 5.5 | 1.8 | 0.48 | 0 |
430 | 53 | False | 0.7 | 0.1 | 182 | 20 | 33 | 4.8 | 1.9 | 0.60 | 0 |
475 | 38 | True | 2.2 | 1.0 | 310 | 119 | 42 | 7.9 | 4.1 | 1.00 | 1 |
425 | 58 | True | 0.4 | 0.1 | 100 | 59 | 126 | 4.3 | 2.5 | 1.40 | 0 |
372 rows × 11 columns
val
age | is_male | total_bilirubin | direct_ribilubin | alkaline_phosphotase | alamine_aminotransferasi | aspartate_aminotransferase | total_proteins | albumin | albumin_to_globulin_ratio | class | |
---|---|---|---|---|---|---|---|---|---|---|---|
582 | 38 | True | 1.0 | 0.3 | 216 | 21 | 24 | 7.3 | 4.4 | 1.50 | 1 |
453 | 46 | True | 0.7 | 0.2 | 224 | 40 | 23 | 7.1 | 3.0 | 0.70 | 0 |
89 | 60 | True | 4.0 | 1.9 | 238 | 119 | 350 | 7.1 | 3.3 | 0.80 | 0 |
71 | 75 | False | 0.8 | 0.2 | 188 | 20 | 29 | 4.4 | 1.8 | 0.60 | 0 |
124 | 28 | True | 0.6 | 0.1 | 177 | 36 | 29 | 6.9 | 4.1 | 1.40 | 1 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
236 | 22 | True | 0.8 | 0.2 | 300 | 57 | 40 | 7.9 | 3.8 | 0.90 | 1 |
487 | 32 | True | 0.7 | 0.2 | 276 | 102 | 190 | 6.0 | 2.9 | 0.93 | 0 |
27 | 34 | True | 6.2 | 3.0 | 240 | 1680 | 850 | 7.2 | 4.0 | 1.20 | 0 |
307 | 30 | True | 0.8 | 0.2 | 174 | 21 | 47 | 4.6 | 2.3 | 1.00 | 0 |
512 | 56 | True | 1.0 | 0.3 | 195 | 22 | 28 | 5.8 | 2.6 | 0.80 | 1 |
94 rows × 11 columns
test
age | is_male | total_bilirubin | direct_ribilubin | alkaline_phosphotase | alamine_aminotransferasi | aspartate_aminotransferase | total_proteins | albumin | albumin_to_globulin_ratio | class | |
---|---|---|---|---|---|---|---|---|---|---|---|
355 | 19 | True | 1.4 | 0.8 | 178 | 13 | 26 | 8.0 | 4.6 | 1.30 | 1 |
407 | 12 | True | 1.0 | 0.2 | 719 | 157 | 108 | 7.2 | 3.7 | 1.00 | 0 |
90 | 60 | True | 5.7 | 2.8 | 214 | 412 | 850 | 7.3 | 3.2 | 0.78 | 0 |
402 | 42 | False | 0.5 | 0.1 | 162 | 155 | 108 | 8.1 | 4.0 | 0.90 | 0 |
268 | 40 | True | 14.5 | 6.4 | 358 | 50 | 75 | 5.7 | 2.1 | 0.50 | 0 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
516 | 60 | True | 0.9 | 0.3 | 168 | 16 | 24 | 6.7 | 3.0 | 0.80 | 0 |
305 | 54 | False | 1.4 | 0.7 | 195 | 36 | 16 | 7.9 | 3.7 | 0.90 | 1 |
167 | 47 | False | 3.0 | 1.5 | 292 | 64 | 67 | 5.6 | 1.8 | 0.47 | 0 |
312 | 27 | True | 1.3 | 0.6 | 106 | 25 | 54 | 8.5 | 4.8 | NaN | 1 |
329 | 21 | True | 0.7 | 0.2 | 211 | 14 | 23 | 7.3 | 4.1 | 1.20 | 1 |
117 rows × 11 columns
# Columns treated as numerical features (everything except `is_male` and the
# `class` label). The names are spelled exactly as they appear in the dataset
# schema, so they must not be "corrected".
numerical_features = [
    'age',
    'total_bilirubin',
    'direct_ribilubin',
    'alkaline_phosphotase',
    'alamine_aminotransferasi',
    'aspartate_aminotransferase',
    'total_proteins',
    'albumin',
    'albumin_to_globulin_ratio',
]
# Summary statistics of the numerical columns on the training split.
train[numerical_features].describe()
age | total_bilirubin | direct_ribilubin | alkaline_phosphotase | alamine_aminotransferasi | aspartate_aminotransferase | total_proteins | albumin | albumin_to_globulin_ratio | |
---|---|---|---|---|---|---|---|---|---|
count | 372.000000 | 372.000000 | 372.000000 | 372.000000 | 372.000000 | 372.000000 | 372.000000 | 372.000000 | 371.000000 |
mean | 44.680108 | 3.415860 | 1.494355 | 286.473118 | 72.986559 | 110.147849 | 6.500269 | 3.150806 | 0.959515 |
std | 16.054568 | 6.736683 | 2.877245 | 242.459927 | 147.472734 | 306.425153 | 1.100049 | 0.806994 | 0.336514 |
min | 4.000000 | 0.400000 | 0.100000 | 63.000000 | 10.000000 | 11.000000 | 2.700000 | 0.900000 | 0.300000 |
25% | 32.000000 | 0.800000 | 0.200000 | 170.000000 | 24.000000 | 25.000000 | 5.775000 | 2.575000 | 0.700000 |
50% | 45.000000 | 1.000000 | 0.300000 | 205.500000 | 35.000000 | 42.000000 | 6.600000 | 3.100000 | 1.000000 |
75% | 58.000000 | 2.625000 | 1.300000 | 298.000000 | 60.000000 | 86.250000 | 7.200000 | 3.800000 | 1.100000 |
max | 85.000000 | 75.000000 | 19.700000 | 2110.000000 | 1350.000000 | 4929.000000 | 9.600000 | 5.500000 | 2.800000 |
# Print the share of each class label in the training split, as a percentage
# of all training rows. Labels are listed in the order they are first seen.
label_counter = Counter(train['class'])
print("\nRozkład częstości dla klas:")
total_rows = len(train)
for label, count in label_counter.items():
    print(f"{label}: {count / total_rows * 100:.2f}%")
Rozkład częstości dla klas: 1: 30.38% 0: 69.62%
# Min-max scale the numerical columns.
#
# The scaler is fitted on the training split ONLY; test and val are then
# transformed with those same training min/max values. Calling fit_transform
# on each split would rescale every split by its own range, so the same raw
# value would map to different scaled values in train/val/test, and statistics
# of the held-out data would leak into preprocessing.
#
# Consequence: values in test/val that lie outside the training range end up
# slightly below 0 or above 1. That is expected for held-out data.
scaler = MinMaxScaler()
train[numerical_features] = scaler.fit_transform(train[numerical_features])
test[numerical_features] = scaler.transform(test[numerical_features])
val[numerical_features] = scaler.transform(val[numerical_features])
# Display the scaled training split.
train
age | is_male | total_bilirubin | direct_ribilubin | alkaline_phosphotase | alamine_aminotransferasi | aspartate_aminotransferase | total_proteins | albumin | albumin_to_globulin_ratio | class | |
---|---|---|---|---|---|---|---|---|---|---|---|
107 | 0.395062 | True | 0.005362 | 0.005102 | 0.046409 | 0.014179 | 0.005693 | 0.478261 | 0.282609 | 0.080 | 1 |
33 | 0.419753 | False | 0.029491 | 0.056122 | 0.169516 | 0.036567 | 0.009353 | 0.420290 | 0.456522 | 0.200 | 1 |
534 | 0.432099 | True | 0.016086 | 0.035714 | 0.081583 | 0.058209 | 0.012810 | 0.768116 | 0.673913 | 0.280 | 1 |
204 | 0.209877 | True | 0.004021 | 0.005102 | 0.035173 | 0.012687 | 0.003050 | 0.536232 | 0.521739 | 0.280 | 1 |
48 | 0.345679 | False | 0.002681 | 0.000000 | 0.055203 | 0.021642 | 0.003457 | 0.478261 | 0.456522 | 0.280 | 0 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
42 | 0.469136 | True | 0.085791 | 0.158163 | 0.276991 | 0.011194 | 0.007320 | 0.492754 | 0.304348 | 0.120 | 1 |
179 | 0.876543 | True | 0.101877 | 0.229592 | 0.157792 | 0.014925 | 0.002847 | 0.405797 | 0.195652 | 0.072 | 0 |
430 | 0.604938 | False | 0.004021 | 0.000000 | 0.058134 | 0.007463 | 0.004473 | 0.304348 | 0.217391 | 0.120 | 0 |
475 | 0.419753 | True | 0.024129 | 0.045918 | 0.120664 | 0.081343 | 0.006303 | 0.753623 | 0.695652 | 0.280 | 1 |
425 | 0.666667 | True | 0.000000 | 0.000000 | 0.018075 | 0.036567 | 0.023383 | 0.231884 | 0.347826 | 0.440 | 0 |
371 rows × 11 columns
# Remove, in place, every row that contains a missing value from each split
# (same order as before: train, test, val).
for frame in (train, test, val):
    frame.dropna(inplace=True)
# Display the training split after dropping incomplete rows.
train
age | is_male | total_bilirubin | direct_ribilubin | alkaline_phosphotase | alamine_aminotransferasi | aspartate_aminotransferase | total_proteins | albumin | albumin_to_globulin_ratio | class | |
---|---|---|---|---|---|---|---|---|---|---|---|
107 | 0.395062 | True | 0.005362 | 0.005102 | 0.046409 | 0.014179 | 0.005693 | 0.478261 | 0.282609 | 0.080 | 1 |
33 | 0.419753 | False | 0.029491 | 0.056122 | 0.169516 | 0.036567 | 0.009353 | 0.420290 | 0.456522 | 0.200 | 1 |
534 | 0.432099 | True | 0.016086 | 0.035714 | 0.081583 | 0.058209 | 0.012810 | 0.768116 | 0.673913 | 0.280 | 1 |
204 | 0.209877 | True | 0.004021 | 0.005102 | 0.035173 | 0.012687 | 0.003050 | 0.536232 | 0.521739 | 0.280 | 1 |
48 | 0.345679 | False | 0.002681 | 0.000000 | 0.055203 | 0.021642 | 0.003457 | 0.478261 | 0.456522 | 0.280 | 0 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
42 | 0.469136 | True | 0.085791 | 0.158163 | 0.276991 | 0.011194 | 0.007320 | 0.492754 | 0.304348 | 0.120 | 1 |
179 | 0.876543 | True | 0.101877 | 0.229592 | 0.157792 | 0.014925 | 0.002847 | 0.405797 | 0.195652 | 0.072 | 0 |
430 | 0.604938 | False | 0.004021 | 0.000000 | 0.058134 | 0.007463 | 0.004473 | 0.304348 | 0.217391 | 0.120 | 0 |
475 | 0.419753 | True | 0.024129 | 0.045918 | 0.120664 | 0.081343 | 0.006303 | 0.753623 | 0.695652 | 0.280 | 1 |
425 | 0.666667 | True | 0.000000 | 0.000000 | 0.018075 | 0.036567 | 0.023383 | 0.231884 | 0.347826 | 0.440 | 0 |
371 rows × 11 columns
test
age | is_male | total_bilirubin | direct_ribilubin | alkaline_phosphotase | alamine_aminotransferasi | aspartate_aminotransferase | total_proteins | albumin | albumin_to_globulin_ratio | class | |
---|---|---|---|---|---|---|---|---|---|---|---|
355 | 0.174419 | True | 0.030000 | 0.049645 | 0.069831 | 0.002567 | 0.015385 | 0.909091 | 0.947368 | 0.666667 | 1 |
407 | 0.093023 | True | 0.016667 | 0.007092 | 0.436610 | 0.187420 | 0.094231 | 0.763636 | 0.710526 | 0.466667 | 0 |
90 | 0.651163 | True | 0.173333 | 0.191489 | 0.094237 | 0.514763 | 0.807692 | 0.781818 | 0.578947 | 0.320000 | 0 |
402 | 0.441860 | False | 0.000000 | 0.000000 | 0.058983 | 0.184852 | 0.094231 | 0.927273 | 0.789474 | 0.400000 | 0 |
268 | 0.418605 | True | 0.466667 | 0.446809 | 0.191864 | 0.050064 | 0.062500 | 0.490909 | 0.289474 | 0.133333 | 0 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
109 | 0.372093 | True | 0.013333 | 0.000000 | 0.278644 | 0.017972 | 0.023077 | 0.527273 | 0.473684 | 0.400000 | 1 |
516 | 0.651163 | True | 0.013333 | 0.014184 | 0.063051 | 0.006418 | 0.013462 | 0.672727 | 0.526316 | 0.333333 | 0 |
305 | 0.581395 | False | 0.030000 | 0.042553 | 0.081356 | 0.032092 | 0.005769 | 0.890909 | 0.710526 | 0.400000 | 1 |
167 | 0.500000 | False | 0.083333 | 0.099291 | 0.147119 | 0.068036 | 0.054808 | 0.472727 | 0.210526 | 0.113333 | 0 |
329 | 0.197674 | True | 0.006667 | 0.007092 | 0.092203 | 0.003851 | 0.012500 | 0.781818 | 0.815789 | 0.600000 | 1 |
115 rows × 11 columns
val
age | is_male | total_bilirubin | direct_ribilubin | alkaline_phosphotase | alamine_aminotransferasi | aspartate_aminotransferase | total_proteins | albumin | albumin_to_globulin_ratio | class | |
---|---|---|---|---|---|---|---|---|---|---|---|
582 | 0.403226 | True | 0.018100 | 0.017094 | 0.069175 | 0.005528 | 0.004090 | 0.725490 | 0.731707 | 0.846154 | 1 |
453 | 0.532258 | True | 0.004525 | 0.008547 | 0.074029 | 0.015075 | 0.003749 | 0.686275 | 0.390244 | 0.230769 | 0 |
89 | 0.758065 | True | 0.153846 | 0.153846 | 0.082524 | 0.054774 | 0.115201 | 0.686275 | 0.463415 | 0.307692 | 0 |
71 | 1.000000 | False | 0.009050 | 0.008547 | 0.052184 | 0.005025 | 0.005794 | 0.156863 | 0.097561 | 0.153846 | 0 |
124 | 0.241935 | True | 0.000000 | 0.000000 | 0.045510 | 0.013065 | 0.005794 | 0.647059 | 0.658537 | 0.769231 | 1 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
236 | 0.145161 | True | 0.009050 | 0.008547 | 0.120146 | 0.023618 | 0.009543 | 0.843137 | 0.585366 | 0.384615 | 1 |
487 | 0.306452 | True | 0.004525 | 0.008547 | 0.105583 | 0.046231 | 0.060668 | 0.470588 | 0.365854 | 0.407692 | 0 |
27 | 0.338710 | True | 0.253394 | 0.247863 | 0.083738 | 0.839196 | 0.285617 | 0.705882 | 0.634146 | 0.615385 | 0 |
307 | 0.274194 | True | 0.009050 | 0.008547 | 0.043689 | 0.005528 | 0.011929 | 0.196078 | 0.219512 | 0.461538 | 0 |
512 | 0.693548 | True | 0.018100 | 0.017094 | 0.056432 | 0.006030 | 0.005453 | 0.431373 | 0.292683 | 0.307692 | 1 |
93 rows × 11 columns