ium_z487186/lab_02.ipynb

75 KiB
Raw Permalink Blame History

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from collections import Counter
from datasets import load_dataset

# Fetch the "mstz/liver" dataset through the `datasets` library and keep only
# its "train" split (the only split this notebook uses).
liver_splits = load_dataset("mstz/liver")
dataset = liver_splits['train']
Found cached dataset liver (/Users/natalia.szymczyk/.cache/huggingface/datasets/mstz___liver/liver/1.0.0/3115a4001e742dc2c89457a3906d35982a649915f71f35fc5e6d025c786eeacf)
100%|██████████| 1/1 [00:00<00:00, 684.45it/s]
dataset
Dataset({
    features: ['age', 'is_male', 'total_bilirubin', 'direct_ribilubin', 'alkaline_phosphotase', 'alamine_aminotransferasi', 'aspartate_aminotransferase', 'total_proteins', 'albumin', 'albumin_to_globulin_ratio', 'class'],
    num_rows: 583
})
# Switch to a pandas DataFrame, then split twice with a fixed seed:
#   1) hold out 20% of all rows as the test split,
#   2) hold out 20% of the remaining rows as the validation split.
dataset = dataset.to_pandas()
train_val, test = train_test_split(dataset, test_size=0.2, random_state=42)
train, val = train_test_split(train_val, test_size=0.2, random_state=42)
train
age is_male total_bilirubin direct_ribilubin alkaline_phosphotase alamine_aminotransferasi aspartate_aminotransferase total_proteins albumin albumin_to_globulin_ratio class
107 36 True 0.8 0.2 158 29 39 6.0 2.2 0.50 1
33 38 False 2.6 1.2 410 59 57 5.6 3.0 0.80 1
534 39 True 1.6 0.8 230 88 74 8.0 4.0 1.00 1
204 21 True 0.7 0.2 135 27 26 6.4 3.3 1.00 1
48 32 False 0.6 0.1 176 39 28 6.0 3.0 1.00 0
... ... ... ... ... ... ... ... ... ... ... ...
42 42 True 6.8 3.2 630 25 47 6.1 2.3 0.60 1
179 75 True 8.0 4.6 386 30 25 5.5 1.8 0.48 0
430 53 False 0.7 0.1 182 20 33 4.8 1.9 0.60 0
475 38 True 2.2 1.0 310 119 42 7.9 4.1 1.00 1
425 58 True 0.4 0.1 100 59 126 4.3 2.5 1.40 0

372 rows × 11 columns

val
age is_male total_bilirubin direct_ribilubin alkaline_phosphotase alamine_aminotransferasi aspartate_aminotransferase total_proteins albumin albumin_to_globulin_ratio class
582 38 True 1.0 0.3 216 21 24 7.3 4.4 1.50 1
453 46 True 0.7 0.2 224 40 23 7.1 3.0 0.70 0
89 60 True 4.0 1.9 238 119 350 7.1 3.3 0.80 0
71 75 False 0.8 0.2 188 20 29 4.4 1.8 0.60 0
124 28 True 0.6 0.1 177 36 29 6.9 4.1 1.40 1
... ... ... ... ... ... ... ... ... ... ... ...
236 22 True 0.8 0.2 300 57 40 7.9 3.8 0.90 1
487 32 True 0.7 0.2 276 102 190 6.0 2.9 0.93 0
27 34 True 6.2 3.0 240 1680 850 7.2 4.0 1.20 0
307 30 True 0.8 0.2 174 21 47 4.6 2.3 1.00 0
512 56 True 1.0 0.3 195 22 28 5.8 2.6 0.80 1

94 rows × 11 columns

test
age is_male total_bilirubin direct_ribilubin alkaline_phosphotase alamine_aminotransferasi aspartate_aminotransferase total_proteins albumin albumin_to_globulin_ratio class
355 19 True 1.4 0.8 178 13 26 8.0 4.6 1.30 1
407 12 True 1.0 0.2 719 157 108 7.2 3.7 1.00 0
90 60 True 5.7 2.8 214 412 850 7.3 3.2 0.78 0
402 42 False 0.5 0.1 162 155 108 8.1 4.0 0.90 0
268 40 True 14.5 6.4 358 50 75 5.7 2.1 0.50 0
... ... ... ... ... ... ... ... ... ... ... ...
516 60 True 0.9 0.3 168 16 24 6.7 3.0 0.80 0
305 54 False 1.4 0.7 195 36 16 7.9 3.7 0.90 1
167 47 False 3.0 1.5 292 64 67 5.6 1.8 0.47 0
312 27 True 1.3 0.6 106 25 54 8.5 4.8 NaN 1
329 21 True 0.7 0.2 211 14 23 7.3 4.1 1.20 1

117 rows × 11 columns

# Continuous columns of the dataset, i.e. every column except the boolean
# `is_male` flag and the `class` label.
numerical_features = [
    'age',
    'total_bilirubin',
    'direct_ribilubin',
    'alkaline_phosphotase',
    'alamine_aminotransferasi',
    'aspartate_aminotransferase',
    'total_proteins',
    'albumin',
    'albumin_to_globulin_ratio',
]
# Summary statistics of those columns on the training split.
train.loc[:, numerical_features].describe()
age total_bilirubin direct_ribilubin alkaline_phosphotase alamine_aminotransferasi aspartate_aminotransferase total_proteins albumin albumin_to_globulin_ratio
count 372.000000 372.000000 372.000000 372.000000 372.000000 372.000000 372.000000 372.000000 371.000000
mean 44.680108 3.415860 1.494355 286.473118 72.986559 110.147849 6.500269 3.150806 0.959515
std 16.054568 6.736683 2.877245 242.459927 147.472734 306.425153 1.100049 0.806994 0.336514
min 4.000000 0.400000 0.100000 63.000000 10.000000 11.000000 2.700000 0.900000 0.300000
25% 32.000000 0.800000 0.200000 170.000000 24.000000 25.000000 5.775000 2.575000 0.700000
50% 45.000000 1.000000 0.300000 205.500000 35.000000 42.000000 6.600000 3.100000 1.000000
75% 58.000000 2.625000 1.300000 298.000000 60.000000 86.250000 7.200000 3.800000 1.100000
max 85.000000 75.000000 19.700000 2110.000000 1350.000000 4929.000000 9.600000 5.500000 2.800000
# Print the share of each class label in the training split, as a percentage
# of all training rows, in the order the labels are first encountered.
label_counter = Counter(train['class'])
print("\nRozkład częstości dla klas:")
for label, count in label_counter.items():
    print(f"{label}: {count / len(train) * 100:.2f}%")
Rozkład częstości dla klas:
1: 30.38%
0: 69.62%
# Min-max scale the numerical features.
#
# The scaler is fitted on the training split ONLY; the validation and test
# splits are then transformed with the minima/maxima learned from training.
# Calling `fit_transform` on every split would re-fit the scaler each time, so
# the same raw value would map to a different scaled value in each split and
# statistics of the evaluation data would leak into preprocessing.
#
# Consequence: values in `val`/`test` that lie outside the training range end
# up slightly below 0 or above 1, which is expected.
scaler = MinMaxScaler()
train[numerical_features] = scaler.fit_transform(train[numerical_features])
test[numerical_features] = scaler.transform(test[numerical_features])
val[numerical_features] = scaler.transform(val[numerical_features])
train
age is_male total_bilirubin direct_ribilubin alkaline_phosphotase alamine_aminotransferasi aspartate_aminotransferase total_proteins albumin albumin_to_globulin_ratio class
107 0.395062 True 0.005362 0.005102 0.046409 0.014179 0.005693 0.478261 0.282609 0.080 1
33 0.419753 False 0.029491 0.056122 0.169516 0.036567 0.009353 0.420290 0.456522 0.200 1
534 0.432099 True 0.016086 0.035714 0.081583 0.058209 0.012810 0.768116 0.673913 0.280 1
204 0.209877 True 0.004021 0.005102 0.035173 0.012687 0.003050 0.536232 0.521739 0.280 1
48 0.345679 False 0.002681 0.000000 0.055203 0.021642 0.003457 0.478261 0.456522 0.280 0
... ... ... ... ... ... ... ... ... ... ... ...
42 0.469136 True 0.085791 0.158163 0.276991 0.011194 0.007320 0.492754 0.304348 0.120 1
179 0.876543 True 0.101877 0.229592 0.157792 0.014925 0.002847 0.405797 0.195652 0.072 0
430 0.604938 False 0.004021 0.000000 0.058134 0.007463 0.004473 0.304348 0.217391 0.120 0
475 0.419753 True 0.024129 0.045918 0.120664 0.081343 0.006303 0.753623 0.695652 0.280 1
425 0.666667 True 0.000000 0.000000 0.018075 0.036567 0.023383 0.231884 0.347826 0.440 0

371 rows × 11 columns

# Remove, in place, every row that contains a missing value from each of the
# three splits (same order as before: train, test, val).
for frame in (train, test, val):
    frame.dropna(inplace=True)
train
age is_male total_bilirubin direct_ribilubin alkaline_phosphotase alamine_aminotransferasi aspartate_aminotransferase total_proteins albumin albumin_to_globulin_ratio class
107 0.395062 True 0.005362 0.005102 0.046409 0.014179 0.005693 0.478261 0.282609 0.080 1
33 0.419753 False 0.029491 0.056122 0.169516 0.036567 0.009353 0.420290 0.456522 0.200 1
534 0.432099 True 0.016086 0.035714 0.081583 0.058209 0.012810 0.768116 0.673913 0.280 1
204 0.209877 True 0.004021 0.005102 0.035173 0.012687 0.003050 0.536232 0.521739 0.280 1
48 0.345679 False 0.002681 0.000000 0.055203 0.021642 0.003457 0.478261 0.456522 0.280 0
... ... ... ... ... ... ... ... ... ... ... ...
42 0.469136 True 0.085791 0.158163 0.276991 0.011194 0.007320 0.492754 0.304348 0.120 1
179 0.876543 True 0.101877 0.229592 0.157792 0.014925 0.002847 0.405797 0.195652 0.072 0
430 0.604938 False 0.004021 0.000000 0.058134 0.007463 0.004473 0.304348 0.217391 0.120 0
475 0.419753 True 0.024129 0.045918 0.120664 0.081343 0.006303 0.753623 0.695652 0.280 1
425 0.666667 True 0.000000 0.000000 0.018075 0.036567 0.023383 0.231884 0.347826 0.440 0

371 rows × 11 columns

test
age is_male total_bilirubin direct_ribilubin alkaline_phosphotase alamine_aminotransferasi aspartate_aminotransferase total_proteins albumin albumin_to_globulin_ratio class
355 0.174419 True 0.030000 0.049645 0.069831 0.002567 0.015385 0.909091 0.947368 0.666667 1
407 0.093023 True 0.016667 0.007092 0.436610 0.187420 0.094231 0.763636 0.710526 0.466667 0
90 0.651163 True 0.173333 0.191489 0.094237 0.514763 0.807692 0.781818 0.578947 0.320000 0
402 0.441860 False 0.000000 0.000000 0.058983 0.184852 0.094231 0.927273 0.789474 0.400000 0
268 0.418605 True 0.466667 0.446809 0.191864 0.050064 0.062500 0.490909 0.289474 0.133333 0
... ... ... ... ... ... ... ... ... ... ... ...
109 0.372093 True 0.013333 0.000000 0.278644 0.017972 0.023077 0.527273 0.473684 0.400000 1
516 0.651163 True 0.013333 0.014184 0.063051 0.006418 0.013462 0.672727 0.526316 0.333333 0
305 0.581395 False 0.030000 0.042553 0.081356 0.032092 0.005769 0.890909 0.710526 0.400000 1
167 0.500000 False 0.083333 0.099291 0.147119 0.068036 0.054808 0.472727 0.210526 0.113333 0
329 0.197674 True 0.006667 0.007092 0.092203 0.003851 0.012500 0.781818 0.815789 0.600000 1

115 rows × 11 columns

val
age is_male total_bilirubin direct_ribilubin alkaline_phosphotase alamine_aminotransferasi aspartate_aminotransferase total_proteins albumin albumin_to_globulin_ratio class
582 0.403226 True 0.018100 0.017094 0.069175 0.005528 0.004090 0.725490 0.731707 0.846154 1
453 0.532258 True 0.004525 0.008547 0.074029 0.015075 0.003749 0.686275 0.390244 0.230769 0
89 0.758065 True 0.153846 0.153846 0.082524 0.054774 0.115201 0.686275 0.463415 0.307692 0
71 1.000000 False 0.009050 0.008547 0.052184 0.005025 0.005794 0.156863 0.097561 0.153846 0
124 0.241935 True 0.000000 0.000000 0.045510 0.013065 0.005794 0.647059 0.658537 0.769231 1
... ... ... ... ... ... ... ... ... ... ... ...
236 0.145161 True 0.009050 0.008547 0.120146 0.023618 0.009543 0.843137 0.585366 0.384615 1
487 0.306452 True 0.004525 0.008547 0.105583 0.046231 0.060668 0.470588 0.365854 0.407692 0
27 0.338710 True 0.253394 0.247863 0.083738 0.839196 0.285617 0.705882 0.634146 0.615385 0
307 0.274194 True 0.009050 0.008547 0.043689 0.005528 0.011929 0.196078 0.219512 0.461538 0
512 0.693548 True 0.018100 0.017094 0.056432 0.006030 0.005453 0.431373 0.292683 0.307692 1

93 rows × 11 columns