From 99bb6baaa8ef16ed305f3a7ec8905db8925d65f3 Mon Sep 17 00:00:00 2001 From: Natalia Szymczyk Date: Fri, 21 Apr 2023 10:34:30 +0200 Subject: [PATCH] python scripts --- lab_02.ipynb | 2145 ++++++++++++++++++++++++++++++++++++++++++++++++++ main.py | 22 + 2 files changed, 2167 insertions(+) create mode 100644 lab_02.ipynb create mode 100644 main.py diff --git a/lab_02.ipynb b/lab_02.ipynb new file mode 100644 index 0000000..8846e79 --- /dev/null +++ b/lab_02.ipynb @@ -0,0 +1,2145 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 42, + "metadata": {}, + "outputs": [], + "source": [ + "import numpy as np\n", + "import pandas as pd\n", + "from sklearn.model_selection import train_test_split\n", + "from sklearn.preprocessing import MinMaxScaler\n", + "from collections import Counter" + ] + }, + { + "cell_type": "code", + "execution_count": 34, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Found cached dataset liver (/Users/natalia.szymczyk/.cache/huggingface/datasets/mstz___liver/liver/1.0.0/3115a4001e742dc2c89457a3906d35982a649915f71f35fc5e6d025c786eeacf)\n", + "100%|██████████| 1/1 [00:00<00:00, 684.45it/s]\n" + ] + } + ], + "source": [ + "from datasets import load_dataset\n", + "\n", + "dataset = load_dataset(\"mstz/liver\")['train']" + ] + }, + { + "cell_type": "code", + "execution_count": 35, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Dataset({\n", + " features: ['age', 'is_male', 'total_bilirubin', 'direct_ribilubin', 'alkaline_phosphotase', 'alamine_aminotransferasi', 'aspartate_aminotransferase', 'total_proteins', 'albumin', 'albumin_to_globulin_ratio', 'class'],\n", + " num_rows: 583\n", + "})" + ] + }, + "execution_count": 35, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "dataset" + ] + }, + { + "cell_type": "code", + "execution_count": 36, + "metadata": {}, + "outputs": [], + "source": [ + "dataset = dataset.to_pandas()" + ] + }, + { 
+ "cell_type": "code", + "execution_count": 37, + "metadata": {}, + "outputs": [], + "source": [ + "train, test = train_test_split(dataset, test_size=0.2, random_state=42)\n", + "train, val = train_test_split(train, test_size=0.2, random_state=42)" + ] + }, + { + "cell_type": "code", + "execution_count": 48, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
ageis_maletotal_bilirubindirect_ribilubinalkaline_phosphotasealamine_aminotransferasiaspartate_aminotransferasetotal_proteinsalbuminalbumin_to_globulin_ratioclass
10736True0.80.215829396.02.20.501
3338False2.61.241059575.63.00.801
53439True1.60.823088748.04.01.001
20421True0.70.213527266.43.31.001
4832False0.60.117639286.03.01.000
....................................
4242True6.83.263025476.12.30.601
17975True8.04.638630255.51.80.480
43053False0.70.118220334.81.90.600
47538True2.21.0310119427.94.11.001
42558True0.40.1100591264.32.51.400
\n", + "

372 rows × 11 columns

\n", + "
" + ], + "text/plain": [ + " age is_male total_bilirubin direct_ribilubin alkaline_phosphotase \\\n", + "107 36 True 0.8 0.2 158 \n", + "33 38 False 2.6 1.2 410 \n", + "534 39 True 1.6 0.8 230 \n", + "204 21 True 0.7 0.2 135 \n", + "48 32 False 0.6 0.1 176 \n", + ".. ... ... ... ... ... \n", + "42 42 True 6.8 3.2 630 \n", + "179 75 True 8.0 4.6 386 \n", + "430 53 False 0.7 0.1 182 \n", + "475 38 True 2.2 1.0 310 \n", + "425 58 True 0.4 0.1 100 \n", + "\n", + " alamine_aminotransferasi aspartate_aminotransferase total_proteins \\\n", + "107 29 39 6.0 \n", + "33 59 57 5.6 \n", + "534 88 74 8.0 \n", + "204 27 26 6.4 \n", + "48 39 28 6.0 \n", + ".. ... ... ... \n", + "42 25 47 6.1 \n", + "179 30 25 5.5 \n", + "430 20 33 4.8 \n", + "475 119 42 7.9 \n", + "425 59 126 4.3 \n", + "\n", + " albumin albumin_to_globulin_ratio class \n", + "107 2.2 0.50 1 \n", + "33 3.0 0.80 1 \n", + "534 4.0 1.00 1 \n", + "204 3.3 1.00 1 \n", + "48 3.0 1.00 0 \n", + ".. ... ... ... \n", + "42 2.3 0.60 1 \n", + "179 1.8 0.48 0 \n", + "430 1.9 0.60 0 \n", + "475 4.1 1.00 1 \n", + "425 2.5 1.40 0 \n", + "\n", + "[372 rows x 11 columns]" + ] + }, + "execution_count": 48, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "train" + ] + }, + { + "cell_type": "code", + "execution_count": 39, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
ageis_maletotal_bilirubindirect_ribilubinalkaline_phosphotasealamine_aminotransferasiaspartate_aminotransferasetotal_proteinsalbuminalbumin_to_globulin_ratioclass
58238True1.00.321621247.34.41.501
45346True0.70.222440237.13.00.700
8960True4.01.92381193507.13.30.800
7175False0.80.218820294.41.80.600
12428True0.60.117736296.94.11.401
....................................
23622True0.80.230057407.93.80.901
48732True0.70.22761021906.02.90.930
2734True6.23.024016808507.24.01.200
30730True0.80.217421474.62.31.000
51256True1.00.319522285.82.60.801
\n", + "

94 rows × 11 columns

\n", + "
" + ], + "text/plain": [ + " age is_male total_bilirubin direct_ribilubin alkaline_phosphotase \\\n", + "582 38 True 1.0 0.3 216 \n", + "453 46 True 0.7 0.2 224 \n", + "89 60 True 4.0 1.9 238 \n", + "71 75 False 0.8 0.2 188 \n", + "124 28 True 0.6 0.1 177 \n", + ".. ... ... ... ... ... \n", + "236 22 True 0.8 0.2 300 \n", + "487 32 True 0.7 0.2 276 \n", + "27 34 True 6.2 3.0 240 \n", + "307 30 True 0.8 0.2 174 \n", + "512 56 True 1.0 0.3 195 \n", + "\n", + " alamine_aminotransferasi aspartate_aminotransferase total_proteins \\\n", + "582 21 24 7.3 \n", + "453 40 23 7.1 \n", + "89 119 350 7.1 \n", + "71 20 29 4.4 \n", + "124 36 29 6.9 \n", + ".. ... ... ... \n", + "236 57 40 7.9 \n", + "487 102 190 6.0 \n", + "27 1680 850 7.2 \n", + "307 21 47 4.6 \n", + "512 22 28 5.8 \n", + "\n", + " albumin albumin_to_globulin_ratio class \n", + "582 4.4 1.50 1 \n", + "453 3.0 0.70 0 \n", + "89 3.3 0.80 0 \n", + "71 1.8 0.60 0 \n", + "124 4.1 1.40 1 \n", + ".. ... ... ... \n", + "236 3.8 0.90 1 \n", + "487 2.9 0.93 0 \n", + "27 4.0 1.20 0 \n", + "307 2.3 1.00 0 \n", + "512 2.6 0.80 1 \n", + "\n", + "[94 rows x 11 columns]" + ] + }, + "execution_count": 39, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "val" + ] + }, + { + "cell_type": "code", + "execution_count": 40, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
ageis_maletotal_bilirubindirect_ribilubinalkaline_phosphotasealamine_aminotransferasiaspartate_aminotransferasetotal_proteinsalbuminalbumin_to_globulin_ratioclass
35519True1.40.817813268.04.61.301
40712True1.00.27191571087.23.71.000
9060True5.72.82144128507.33.20.780
40242False0.50.11621551088.14.00.900
26840True14.56.435850755.72.10.500
....................................
51660True0.90.316816246.73.00.800
30554False1.40.719536167.93.70.901
16747False3.01.529264675.61.80.470
31227True1.30.610625548.54.8NaN1
32921True0.70.221114237.34.11.201
\n", + "

117 rows × 11 columns

\n", + "
" + ], + "text/plain": [ + " age is_male total_bilirubin direct_ribilubin alkaline_phosphotase \\\n", + "355 19 True 1.4 0.8 178 \n", + "407 12 True 1.0 0.2 719 \n", + "90 60 True 5.7 2.8 214 \n", + "402 42 False 0.5 0.1 162 \n", + "268 40 True 14.5 6.4 358 \n", + ".. ... ... ... ... ... \n", + "516 60 True 0.9 0.3 168 \n", + "305 54 False 1.4 0.7 195 \n", + "167 47 False 3.0 1.5 292 \n", + "312 27 True 1.3 0.6 106 \n", + "329 21 True 0.7 0.2 211 \n", + "\n", + " alamine_aminotransferasi aspartate_aminotransferase total_proteins \\\n", + "355 13 26 8.0 \n", + "407 157 108 7.2 \n", + "90 412 850 7.3 \n", + "402 155 108 8.1 \n", + "268 50 75 5.7 \n", + ".. ... ... ... \n", + "516 16 24 6.7 \n", + "305 36 16 7.9 \n", + "167 64 67 5.6 \n", + "312 25 54 8.5 \n", + "329 14 23 7.3 \n", + "\n", + " albumin albumin_to_globulin_ratio class \n", + "355 4.6 1.30 1 \n", + "407 3.7 1.00 0 \n", + "90 3.2 0.78 0 \n", + "402 4.0 0.90 0 \n", + "268 2.1 0.50 0 \n", + ".. ... ... ... \n", + "516 3.0 0.80 0 \n", + "305 3.7 0.90 1 \n", + "167 1.8 0.47 0 \n", + "312 4.8 NaN 1 \n", + "329 4.1 1.20 1 \n", + "\n", + "[117 rows x 11 columns]" + ] + }, + "execution_count": 40, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "test" + ] + }, + { + "cell_type": "code", + "execution_count": 45, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
agetotal_bilirubindirect_ribilubinalkaline_phosphotasealamine_aminotransferasiaspartate_aminotransferasetotal_proteinsalbuminalbumin_to_globulin_ratio
count372.000000372.000000372.000000372.000000372.000000372.000000372.000000372.000000371.000000
mean44.6801083.4158601.494355286.47311872.986559110.1478496.5002693.1508060.959515
std16.0545686.7366832.877245242.459927147.472734306.4251531.1000490.8069940.336514
min4.0000000.4000000.10000063.00000010.00000011.0000002.7000000.9000000.300000
25%32.0000000.8000000.200000170.00000024.00000025.0000005.7750002.5750000.700000
50%45.0000001.0000000.300000205.50000035.00000042.0000006.6000003.1000001.000000
75%58.0000002.6250001.300000298.00000060.00000086.2500007.2000003.8000001.100000
max85.00000075.00000019.7000002110.0000001350.0000004929.0000009.6000005.5000002.800000
\n", + "
" + ], + "text/plain": [ + " age total_bilirubin direct_ribilubin alkaline_phosphotase \\\n", + "count 372.000000 372.000000 372.000000 372.000000 \n", + "mean 44.680108 3.415860 1.494355 286.473118 \n", + "std 16.054568 6.736683 2.877245 242.459927 \n", + "min 4.000000 0.400000 0.100000 63.000000 \n", + "25% 32.000000 0.800000 0.200000 170.000000 \n", + "50% 45.000000 1.000000 0.300000 205.500000 \n", + "75% 58.000000 2.625000 1.300000 298.000000 \n", + "max 85.000000 75.000000 19.700000 2110.000000 \n", + "\n", + " alamine_aminotransferasi aspartate_aminotransferase total_proteins \\\n", + "count 372.000000 372.000000 372.000000 \n", + "mean 72.986559 110.147849 6.500269 \n", + "std 147.472734 306.425153 1.100049 \n", + "min 10.000000 11.000000 2.700000 \n", + "25% 24.000000 25.000000 5.775000 \n", + "50% 35.000000 42.000000 6.600000 \n", + "75% 60.000000 86.250000 7.200000 \n", + "max 1350.000000 4929.000000 9.600000 \n", + "\n", + " albumin albumin_to_globulin_ratio \n", + "count 372.000000 371.000000 \n", + "mean 3.150806 0.959515 \n", + "std 0.806994 0.336514 \n", + "min 0.900000 0.300000 \n", + "25% 2.575000 0.700000 \n", + "50% 3.100000 1.000000 \n", + "75% 3.800000 1.100000 \n", + "max 5.500000 2.800000 " + ] + }, + "execution_count": 45, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "numerical_features = ['age', 'total_bilirubin', 'direct_ribilubin', 'alkaline_phosphotase',\n", + " 'alamine_aminotransferasi', 'aspartate_aminotransferase', 'total_proteins', 'albumin',\n", + " 'albumin_to_globulin_ratio']\n", + "train[numerical_features].describe()" + ] + }, + { + "cell_type": "code", + "execution_count": 46, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Rozkład częstości dla klas:\n", + "1: 30.38%\n", + "0: 69.62%\n" + ] + } + ], + "source": [ + "label_counter = Counter(train['class'])\n", + "print(\"\\nRozkład częstości dla klas:\")\n", + "for label in 
label_counter.keys():\n", + " print(f\"{label}: {label_counter[label] / len(train) * 100:.2f}%\")" + ] + }, + { + "cell_type": "code", + "execution_count": 55, + "metadata": {}, + "outputs": [], + "source": [ + "scaler = MinMaxScaler()  # min/max are learned from the training split only\n", + "train[numerical_features] = scaler.fit_transform(train[numerical_features])\n", + "test[numerical_features] = scaler.transform(test[numerical_features])  # reuse train statistics: no leakage, same scale as train\n", + "val[numerical_features] = scaler.transform(val[numerical_features])  # reuse train statistics: no leakage, same scale as train" + ] + }, + { + "cell_type": "code", + "execution_count": 56, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
ageis_maletotal_bilirubindirect_ribilubinalkaline_phosphotasealamine_aminotransferasiaspartate_aminotransferasetotal_proteinsalbuminalbumin_to_globulin_ratioclass
1070.395062True0.0053620.0051020.0464090.0141790.0056930.4782610.2826090.0801
330.419753False0.0294910.0561220.1695160.0365670.0093530.4202900.4565220.2001
5340.432099True0.0160860.0357140.0815830.0582090.0128100.7681160.6739130.2801
2040.209877True0.0040210.0051020.0351730.0126870.0030500.5362320.5217390.2801
480.345679False0.0026810.0000000.0552030.0216420.0034570.4782610.4565220.2800
....................................
420.469136True0.0857910.1581630.2769910.0111940.0073200.4927540.3043480.1201
1790.876543True0.1018770.2295920.1577920.0149250.0028470.4057970.1956520.0720
4300.604938False0.0040210.0000000.0581340.0074630.0044730.3043480.2173910.1200
4750.419753True0.0241290.0459180.1206640.0813430.0063030.7536230.6956520.2801
4250.666667True0.0000000.0000000.0180750.0365670.0233830.2318840.3478260.4400
\n", + "

371 rows × 11 columns

\n", + "
" + ], + "text/plain": [ + " age is_male total_bilirubin direct_ribilubin \\\n", + "107 0.395062 True 0.005362 0.005102 \n", + "33 0.419753 False 0.029491 0.056122 \n", + "534 0.432099 True 0.016086 0.035714 \n", + "204 0.209877 True 0.004021 0.005102 \n", + "48 0.345679 False 0.002681 0.000000 \n", + ".. ... ... ... ... \n", + "42 0.469136 True 0.085791 0.158163 \n", + "179 0.876543 True 0.101877 0.229592 \n", + "430 0.604938 False 0.004021 0.000000 \n", + "475 0.419753 True 0.024129 0.045918 \n", + "425 0.666667 True 0.000000 0.000000 \n", + "\n", + " alkaline_phosphotase alamine_aminotransferasi \\\n", + "107 0.046409 0.014179 \n", + "33 0.169516 0.036567 \n", + "534 0.081583 0.058209 \n", + "204 0.035173 0.012687 \n", + "48 0.055203 0.021642 \n", + ".. ... ... \n", + "42 0.276991 0.011194 \n", + "179 0.157792 0.014925 \n", + "430 0.058134 0.007463 \n", + "475 0.120664 0.081343 \n", + "425 0.018075 0.036567 \n", + "\n", + " aspartate_aminotransferase total_proteins albumin \\\n", + "107 0.005693 0.478261 0.282609 \n", + "33 0.009353 0.420290 0.456522 \n", + "534 0.012810 0.768116 0.673913 \n", + "204 0.003050 0.536232 0.521739 \n", + "48 0.003457 0.478261 0.456522 \n", + ".. ... ... ... \n", + "42 0.007320 0.492754 0.304348 \n", + "179 0.002847 0.405797 0.195652 \n", + "430 0.004473 0.304348 0.217391 \n", + "475 0.006303 0.753623 0.695652 \n", + "425 0.023383 0.231884 0.347826 \n", + "\n", + " albumin_to_globulin_ratio class \n", + "107 0.080 1 \n", + "33 0.200 1 \n", + "534 0.280 1 \n", + "204 0.280 1 \n", + "48 0.280 0 \n", + ".. ... ... 
\n", + "42 0.120 1 \n", + "179 0.072 0 \n", + "430 0.120 0 \n", + "475 0.280 1 \n", + "425 0.440 0 \n", + "\n", + "[371 rows x 11 columns]" + ] + }, + "execution_count": 56, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "train" + ] + }, + { + "cell_type": "code", + "execution_count": 58, + "metadata": {}, + "outputs": [], + "source": [ + "train.dropna(inplace=True)\n", + "test.dropna(inplace=True)\n", + "val.dropna(inplace=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 59, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
ageis_maletotal_bilirubindirect_ribilubinalkaline_phosphotasealamine_aminotransferasiaspartate_aminotransferasetotal_proteinsalbuminalbumin_to_globulin_ratioclass
1070.395062True0.0053620.0051020.0464090.0141790.0056930.4782610.2826090.0801
330.419753False0.0294910.0561220.1695160.0365670.0093530.4202900.4565220.2001
5340.432099True0.0160860.0357140.0815830.0582090.0128100.7681160.6739130.2801
2040.209877True0.0040210.0051020.0351730.0126870.0030500.5362320.5217390.2801
480.345679False0.0026810.0000000.0552030.0216420.0034570.4782610.4565220.2800
....................................
420.469136True0.0857910.1581630.2769910.0111940.0073200.4927540.3043480.1201
1790.876543True0.1018770.2295920.1577920.0149250.0028470.4057970.1956520.0720
4300.604938False0.0040210.0000000.0581340.0074630.0044730.3043480.2173910.1200
4750.419753True0.0241290.0459180.1206640.0813430.0063030.7536230.6956520.2801
4250.666667True0.0000000.0000000.0180750.0365670.0233830.2318840.3478260.4400
\n", + "

371 rows × 11 columns

\n", + "
" + ], + "text/plain": [ + " age is_male total_bilirubin direct_ribilubin \\\n", + "107 0.395062 True 0.005362 0.005102 \n", + "33 0.419753 False 0.029491 0.056122 \n", + "534 0.432099 True 0.016086 0.035714 \n", + "204 0.209877 True 0.004021 0.005102 \n", + "48 0.345679 False 0.002681 0.000000 \n", + ".. ... ... ... ... \n", + "42 0.469136 True 0.085791 0.158163 \n", + "179 0.876543 True 0.101877 0.229592 \n", + "430 0.604938 False 0.004021 0.000000 \n", + "475 0.419753 True 0.024129 0.045918 \n", + "425 0.666667 True 0.000000 0.000000 \n", + "\n", + " alkaline_phosphotase alamine_aminotransferasi \\\n", + "107 0.046409 0.014179 \n", + "33 0.169516 0.036567 \n", + "534 0.081583 0.058209 \n", + "204 0.035173 0.012687 \n", + "48 0.055203 0.021642 \n", + ".. ... ... \n", + "42 0.276991 0.011194 \n", + "179 0.157792 0.014925 \n", + "430 0.058134 0.007463 \n", + "475 0.120664 0.081343 \n", + "425 0.018075 0.036567 \n", + "\n", + " aspartate_aminotransferase total_proteins albumin \\\n", + "107 0.005693 0.478261 0.282609 \n", + "33 0.009353 0.420290 0.456522 \n", + "534 0.012810 0.768116 0.673913 \n", + "204 0.003050 0.536232 0.521739 \n", + "48 0.003457 0.478261 0.456522 \n", + ".. ... ... ... \n", + "42 0.007320 0.492754 0.304348 \n", + "179 0.002847 0.405797 0.195652 \n", + "430 0.004473 0.304348 0.217391 \n", + "475 0.006303 0.753623 0.695652 \n", + "425 0.023383 0.231884 0.347826 \n", + "\n", + " albumin_to_globulin_ratio class \n", + "107 0.080 1 \n", + "33 0.200 1 \n", + "534 0.280 1 \n", + "204 0.280 1 \n", + "48 0.280 0 \n", + ".. ... ... \n", + "42 0.120 1 \n", + "179 0.072 0 \n", + "430 0.120 0 \n", + "475 0.280 1 \n", + "425 0.440 0 \n", + "\n", + "[371 rows x 11 columns]" + ] + }, + "execution_count": 59, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "train" + ] + }, + { + "cell_type": "code", + "execution_count": 60, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
ageis_maletotal_bilirubindirect_ribilubinalkaline_phosphotasealamine_aminotransferasiaspartate_aminotransferasetotal_proteinsalbuminalbumin_to_globulin_ratioclass
3550.174419True0.0300000.0496450.0698310.0025670.0153850.9090910.9473680.6666671
4070.093023True0.0166670.0070920.4366100.1874200.0942310.7636360.7105260.4666670
900.651163True0.1733330.1914890.0942370.5147630.8076920.7818180.5789470.3200000
4020.441860False0.0000000.0000000.0589830.1848520.0942310.9272730.7894740.4000000
2680.418605True0.4666670.4468090.1918640.0500640.0625000.4909090.2894740.1333330
....................................
1090.372093True0.0133330.0000000.2786440.0179720.0230770.5272730.4736840.4000001
5160.651163True0.0133330.0141840.0630510.0064180.0134620.6727270.5263160.3333330
3050.581395False0.0300000.0425530.0813560.0320920.0057690.8909090.7105260.4000001
1670.500000False0.0833330.0992910.1471190.0680360.0548080.4727270.2105260.1133330
3290.197674True0.0066670.0070920.0922030.0038510.0125000.7818180.8157890.6000001
\n", + "

115 rows × 11 columns

\n", + "
" + ], + "text/plain": [ + " age is_male total_bilirubin direct_ribilubin \\\n", + "355 0.174419 True 0.030000 0.049645 \n", + "407 0.093023 True 0.016667 0.007092 \n", + "90 0.651163 True 0.173333 0.191489 \n", + "402 0.441860 False 0.000000 0.000000 \n", + "268 0.418605 True 0.466667 0.446809 \n", + ".. ... ... ... ... \n", + "109 0.372093 True 0.013333 0.000000 \n", + "516 0.651163 True 0.013333 0.014184 \n", + "305 0.581395 False 0.030000 0.042553 \n", + "167 0.500000 False 0.083333 0.099291 \n", + "329 0.197674 True 0.006667 0.007092 \n", + "\n", + " alkaline_phosphotase alamine_aminotransferasi \\\n", + "355 0.069831 0.002567 \n", + "407 0.436610 0.187420 \n", + "90 0.094237 0.514763 \n", + "402 0.058983 0.184852 \n", + "268 0.191864 0.050064 \n", + ".. ... ... \n", + "109 0.278644 0.017972 \n", + "516 0.063051 0.006418 \n", + "305 0.081356 0.032092 \n", + "167 0.147119 0.068036 \n", + "329 0.092203 0.003851 \n", + "\n", + " aspartate_aminotransferase total_proteins albumin \\\n", + "355 0.015385 0.909091 0.947368 \n", + "407 0.094231 0.763636 0.710526 \n", + "90 0.807692 0.781818 0.578947 \n", + "402 0.094231 0.927273 0.789474 \n", + "268 0.062500 0.490909 0.289474 \n", + ".. ... ... ... \n", + "109 0.023077 0.527273 0.473684 \n", + "516 0.013462 0.672727 0.526316 \n", + "305 0.005769 0.890909 0.710526 \n", + "167 0.054808 0.472727 0.210526 \n", + "329 0.012500 0.781818 0.815789 \n", + "\n", + " albumin_to_globulin_ratio class \n", + "355 0.666667 1 \n", + "407 0.466667 0 \n", + "90 0.320000 0 \n", + "402 0.400000 0 \n", + "268 0.133333 0 \n", + ".. ... ... \n", + "109 0.400000 1 \n", + "516 0.333333 0 \n", + "305 0.400000 1 \n", + "167 0.113333 0 \n", + "329 0.600000 1 \n", + "\n", + "[115 rows x 11 columns]" + ] + }, + "execution_count": 60, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "test" + ] + }, + { + "cell_type": "code", + "execution_count": 61, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
ageis_maletotal_bilirubindirect_ribilubinalkaline_phosphotasealamine_aminotransferasiaspartate_aminotransferasetotal_proteinsalbuminalbumin_to_globulin_ratioclass
5820.403226True0.0181000.0170940.0691750.0055280.0040900.7254900.7317070.8461541
4530.532258True0.0045250.0085470.0740290.0150750.0037490.6862750.3902440.2307690
890.758065True0.1538460.1538460.0825240.0547740.1152010.6862750.4634150.3076920
711.000000False0.0090500.0085470.0521840.0050250.0057940.1568630.0975610.1538460
1240.241935True0.0000000.0000000.0455100.0130650.0057940.6470590.6585370.7692311
....................................
2360.145161True0.0090500.0085470.1201460.0236180.0095430.8431370.5853660.3846151
4870.306452True0.0045250.0085470.1055830.0462310.0606680.4705880.3658540.4076920
270.338710True0.2533940.2478630.0837380.8391960.2856170.7058820.6341460.6153850
3070.274194True0.0090500.0085470.0436890.0055280.0119290.1960780.2195120.4615380
5120.693548True0.0181000.0170940.0564320.0060300.0054530.4313730.2926830.3076921
\n", + "

93 rows × 11 columns

\n", + "
" + ], + "text/plain": [ + " age is_male total_bilirubin direct_ribilubin \\\n", + "582 0.403226 True 0.018100 0.017094 \n", + "453 0.532258 True 0.004525 0.008547 \n", + "89 0.758065 True 0.153846 0.153846 \n", + "71 1.000000 False 0.009050 0.008547 \n", + "124 0.241935 True 0.000000 0.000000 \n", + ".. ... ... ... ... \n", + "236 0.145161 True 0.009050 0.008547 \n", + "487 0.306452 True 0.004525 0.008547 \n", + "27 0.338710 True 0.253394 0.247863 \n", + "307 0.274194 True 0.009050 0.008547 \n", + "512 0.693548 True 0.018100 0.017094 \n", + "\n", + " alkaline_phosphotase alamine_aminotransferasi \\\n", + "582 0.069175 0.005528 \n", + "453 0.074029 0.015075 \n", + "89 0.082524 0.054774 \n", + "71 0.052184 0.005025 \n", + "124 0.045510 0.013065 \n", + ".. ... ... \n", + "236 0.120146 0.023618 \n", + "487 0.105583 0.046231 \n", + "27 0.083738 0.839196 \n", + "307 0.043689 0.005528 \n", + "512 0.056432 0.006030 \n", + "\n", + " aspartate_aminotransferase total_proteins albumin \\\n", + "582 0.004090 0.725490 0.731707 \n", + "453 0.003749 0.686275 0.390244 \n", + "89 0.115201 0.686275 0.463415 \n", + "71 0.005794 0.156863 0.097561 \n", + "124 0.005794 0.647059 0.658537 \n", + ".. ... ... ... \n", + "236 0.009543 0.843137 0.585366 \n", + "487 0.060668 0.470588 0.365854 \n", + "27 0.285617 0.705882 0.634146 \n", + "307 0.011929 0.196078 0.219512 \n", + "512 0.005453 0.431373 0.292683 \n", + "\n", + " albumin_to_globulin_ratio class \n", + "582 0.846154 1 \n", + "453 0.230769 0 \n", + "89 0.307692 0 \n", + "71 0.153846 0 \n", + "124 0.769231 1 \n", + ".. ... ... 
\n", + "236 0.384615 1 \n", + "487 0.407692 0 \n", + "27 0.615385 0 \n", + "307 0.461538 0 \n", + "512 0.307692 1 \n", + "\n", + "[93 rows x 11 columns]" + ] + }, + "execution_count": 61, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "val" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "ium", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.10" + }, + "orig_nbformat": 4 + }, + "nbformat": 4, + "nbformat_minor": 2 +}
diff --git a/main.py b/main.py
new file mode 100644
index 0000000..471d4f8
--- /dev/null
+++ b/main.py
@@ -0,0 +1,29 @@
+"""Prepare scaled train/validation/test splits of the mstz/liver dataset."""
+from sklearn.model_selection import train_test_split
+from sklearn.preprocessing import MinMaxScaler
+from datasets import load_dataset
+
+dataset = load_dataset("mstz/liver")['train']
+dataset = dataset.to_pandas()
+
+# 80/20 train/test split, then a further 80/20 train/validation split.
+train, test = train_test_split(dataset, test_size=0.2, random_state=42)
+train, val = train_test_split(train, test_size=0.2, random_state=42)
+
+numerical_features = ['age', 'total_bilirubin', 'direct_ribilubin', 'alkaline_phosphotase',
+    'alamine_aminotransferasi', 'aspartate_aminotransferase', 'total_proteins', 'albumin',
+    'albumin_to_globulin_ratio']
+
+# Fit the scaler on the training split only and reuse its min/max for the
+# test and validation splits. Re-fitting on each split would leak held-out
+# statistics and put every split on a different scale. Held-out values may
+# therefore fall slightly outside [0, 1].
+scaler = MinMaxScaler()
+train[numerical_features] = scaler.fit_transform(train[numerical_features])
+test[numerical_features] = scaler.transform(test[numerical_features])
+val[numerical_features] = scaler.transform(val[numerical_features])
+
+# Remove rows that still contain missing values after scaling.
+train.dropna(inplace=True)
+test.dropna(inplace=True)
+val.dropna(inplace=True)