ium_464863/IUM_02.ipynb
Paweł Łączkowski 060d6cb715 IUM_02 - tasks
2024-03-15 20:07:35 +01:00

536 lines
56 KiB
Plaintext
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

{
"cells": [
{
"cell_type": "markdown",
"source": [
"## IUM_02"
],
"metadata": {
"collapsed": false
},
"id": "da5635319c1475f3"
},
{
"cell_type": "markdown",
"source": [
"#### Wymagane zależności"
],
"metadata": {
"collapsed": false
},
"id": "5c88bd65c24cfc75"
},
{
"cell_type": "code",
"execution_count": 19,
"outputs": [],
"source": [
"# Install the required dependencies.\n",
"# `%pip` (instead of `!pip`) guarantees the packages land in the environment\n",
"# of the running kernel, not in whichever `pip` happens to be first on PATH.\n",
"%pip install kaggle pandas scikit-learn"
],
"metadata": {
"collapsed": false,
"ExecuteTime": {
"end_time": "2024-03-15T19:06:25.947706400Z",
"start_time": "2024-03-15T19:06:25.928113300Z"
}
},
"id": "ae6cca2241835fba"
},
{
"cell_type": "markdown",
"source": [
"#### Import bibliotek"
],
"metadata": {
"collapsed": false
},
"id": "ba9581e73648e5c3"
},
{
"cell_type": "code",
"execution_count": 3,
"outputs": [],
"source": [
"# Import bibliotek\n",
"import zipfile\n",
"\n",
"import pandas as pd\n",
"\n",
"from sklearn.model_selection import train_test_split\n",
"from sklearn.preprocessing import MinMaxScaler"
],
"metadata": {
"collapsed": false,
"ExecuteTime": {
"end_time": "2024-03-15T19:05:35.877258600Z",
"start_time": "2024-03-15T19:05:35.821429600Z"
}
},
"id": "5db08fde342b5463"
},
{
"cell_type": "markdown",
"source": [
"#### 1. Pobieranie zbioru danych"
],
"metadata": {
"collapsed": false
},
"id": "2000b14bbb95a446"
},
{
"cell_type": "code",
"execution_count": 4,
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Downloading breast-cancer-wisconsin-data.zip to C:\\Users\\broke\\PycharmProjects\\ium_464863\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"\n",
" 0%| | 0.00/48.6k [00:00<?, ?B/s]\n",
"100%|##########| 48.6k/48.6k [00:00<00:00, 270kB/s]\n",
"100%|##########| 48.6k/48.6k [00:00<00:00, 268kB/s]\n"
]
}
],
"source": [
"# Download the dataset archive from Kaggle into the working directory\n",
"!kaggle datasets download -d uciml/breast-cancer-wisconsin-data\n",
"\n",
"# Unpack the archive with the standard library rather than `tar -xf`:\n",
"# GNU tar (the default on Linux) cannot read .zip files, so the shell\n",
"# command only worked on systems shipping bsdtar (Windows, macOS).\n",
"with zipfile.ZipFile('breast-cancer-wisconsin-data.zip') as archive:\n",
"    archive.extractall()"
],
"metadata": {
"collapsed": false,
"ExecuteTime": {
"end_time": "2024-03-15T19:05:44.855185900Z",
"start_time": "2024-03-15T19:05:35.842341200Z"
}
},
"id": "679faee0f0f27fa1"
},
{
"cell_type": "markdown",
"source": [
"#### 2. Wczytanie danych oraz wstępne przetworzenie"
],
"metadata": {
"collapsed": false
},
"id": "2aa06e9443f948c9"
},
{
"cell_type": "code",
"execution_count": 5,
"outputs": [],
"source": [
"# Read the CSV using the 'id' column as the index, then discard the\n",
"# unneeded/erroneous 'Unnamed: 32' column in the same chain\n",
"df = (\n",
"    pd.read_csv('data.csv', index_col='id')\n",
"    .drop(columns=['Unnamed: 32'])\n",
")"
],
"metadata": {
"collapsed": false,
"ExecuteTime": {
"end_time": "2024-03-15T19:05:44.915248600Z",
"start_time": "2024-03-15T19:05:44.855185900Z"
}
},
"id": "2bc8c4be37c8fa69"
},
{
"cell_type": "code",
"execution_count": 6,
"outputs": [
{
"data": {
"text/plain": "diagnosis 0\nradius_mean 0\ntexture_mean 0\nperimeter_mean 0\narea_mean 0\nsmoothness_mean 0\ncompactness_mean 0\nconcavity_mean 0\nconcave points_mean 0\nsymmetry_mean 0\nfractal_dimension_mean 0\nradius_se 0\ntexture_se 0\nperimeter_se 0\narea_se 0\nsmoothness_se 0\ncompactness_se 0\nconcavity_se 0\nconcave points_se 0\nsymmetry_se 0\nfractal_dimension_se 0\nradius_worst 0\ntexture_worst 0\nperimeter_worst 0\narea_worst 0\nsmoothness_worst 0\ncompactness_worst 0\nconcavity_worst 0\nconcave points_worst 0\nsymmetry_worst 0\nfractal_dimension_worst 0\ndtype: int64"
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Count the missing (NaN) values in every column\n",
"df.isna().sum()"
],
"metadata": {
"collapsed": false,
"ExecuteTime": {
"end_time": "2024-03-15T19:05:44.961022900Z",
"start_time": "2024-03-15T19:05:44.916665400Z"
}
},
"id": "abcf61d13e9b47f1"
},
{
"cell_type": "code",
"execution_count": 7,
"outputs": [
{
"data": {
"text/plain": " diagnosis radius_mean texture_mean perimeter_mean area_mean \\\nid \n842302 M 17.99 10.38 122.80 1001.0 \n842517 M 20.57 17.77 132.90 1326.0 \n84300903 M 19.69 21.25 130.00 1203.0 \n84348301 M 11.42 20.38 77.58 386.1 \n84358402 M 20.29 14.34 135.10 1297.0 \n\n smoothness_mean compactness_mean concavity_mean \\\nid \n842302 0.11840 0.27760 0.3001 \n842517 0.08474 0.07864 0.0869 \n84300903 0.10960 0.15990 0.1974 \n84348301 0.14250 0.28390 0.2414 \n84358402 0.10030 0.13280 0.1980 \n\n concave points_mean symmetry_mean ... radius_worst \\\nid ... \n842302 0.14710 0.2419 ... 25.38 \n842517 0.07017 0.1812 ... 24.99 \n84300903 0.12790 0.2069 ... 23.57 \n84348301 0.10520 0.2597 ... 14.91 \n84358402 0.10430 0.1809 ... 22.54 \n\n texture_worst perimeter_worst area_worst smoothness_worst \\\nid \n842302 17.33 184.60 2019.0 0.1622 \n842517 23.41 158.80 1956.0 0.1238 \n84300903 25.53 152.50 1709.0 0.1444 \n84348301 26.50 98.87 567.7 0.2098 \n84358402 16.67 152.20 1575.0 0.1374 \n\n compactness_worst concavity_worst concave points_worst \\\nid \n842302 0.6656 0.7119 0.2654 \n842517 0.1866 0.2416 0.1860 \n84300903 0.4245 0.4504 0.2430 \n84348301 0.8663 0.6869 0.2575 \n84358402 0.2050 0.4000 0.1625 \n\n symmetry_worst fractal_dimension_worst \nid \n842302 0.4601 0.11890 \n842517 0.2750 0.08902 \n84300903 0.3613 0.08758 \n84348301 0.6638 0.17300 \n84358402 0.2364 0.07678 \n\n[5 rows x 31 columns]",
"text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>diagnosis</th>\n <th>radius_mean</th>\n <th>texture_mean</th>\n <th>perimeter_mean</th>\n <th>area_mean</th>\n <th>smoothness_mean</th>\n <th>compactness_mean</th>\n <th>concavity_mean</th>\n <th>concave points_mean</th>\n <th>symmetry_mean</th>\n <th>...</th>\n <th>radius_worst</th>\n <th>texture_worst</th>\n <th>perimeter_worst</th>\n <th>area_worst</th>\n <th>smoothness_worst</th>\n <th>compactness_worst</th>\n <th>concavity_worst</th>\n <th>concave points_worst</th>\n <th>symmetry_worst</th>\n <th>fractal_dimension_worst</th>\n </tr>\n <tr>\n <th>id</th>\n <th></th>\n <th></th>\n <th></th>\n <th></th>\n <th></th>\n <th></th>\n <th></th>\n <th></th>\n <th></th>\n <th></th>\n <th></th>\n <th></th>\n <th></th>\n <th></th>\n <th></th>\n <th></th>\n <th></th>\n <th></th>\n <th></th>\n <th></th>\n <th></th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>842302</th>\n <td>M</td>\n <td>17.99</td>\n <td>10.38</td>\n <td>122.80</td>\n <td>1001.0</td>\n <td>0.11840</td>\n <td>0.27760</td>\n <td>0.3001</td>\n <td>0.14710</td>\n <td>0.2419</td>\n <td>...</td>\n <td>25.38</td>\n <td>17.33</td>\n <td>184.60</td>\n <td>2019.0</td>\n <td>0.1622</td>\n <td>0.6656</td>\n <td>0.7119</td>\n <td>0.2654</td>\n <td>0.4601</td>\n <td>0.11890</td>\n </tr>\n <tr>\n <th>842517</th>\n <td>M</td>\n <td>20.57</td>\n <td>17.77</td>\n <td>132.90</td>\n <td>1326.0</td>\n <td>0.08474</td>\n <td>0.07864</td>\n <td>0.0869</td>\n <td>0.07017</td>\n <td>0.1812</td>\n <td>...</td>\n <td>24.99</td>\n <td>23.41</td>\n <td>158.80</td>\n <td>1956.0</td>\n <td>0.1238</td>\n <td>0.1866</td>\n <td>0.2416</td>\n <td>0.1860</td>\n <td>0.2750</td>\n <td>0.08902</td>\n </tr>\n <tr>\n 
<th>84300903</th>\n <td>M</td>\n <td>19.69</td>\n <td>21.25</td>\n <td>130.00</td>\n <td>1203.0</td>\n <td>0.10960</td>\n <td>0.15990</td>\n <td>0.1974</td>\n <td>0.12790</td>\n <td>0.2069</td>\n <td>...</td>\n <td>23.57</td>\n <td>25.53</td>\n <td>152.50</td>\n <td>1709.0</td>\n <td>0.1444</td>\n <td>0.4245</td>\n <td>0.4504</td>\n <td>0.2430</td>\n <td>0.3613</td>\n <td>0.08758</td>\n </tr>\n <tr>\n <th>84348301</th>\n <td>M</td>\n <td>11.42</td>\n <td>20.38</td>\n <td>77.58</td>\n <td>386.1</td>\n <td>0.14250</td>\n <td>0.28390</td>\n <td>0.2414</td>\n <td>0.10520</td>\n <td>0.2597</td>\n <td>...</td>\n <td>14.91</td>\n <td>26.50</td>\n <td>98.87</td>\n <td>567.7</td>\n <td>0.2098</td>\n <td>0.8663</td>\n <td>0.6869</td>\n <td>0.2575</td>\n <td>0.6638</td>\n <td>0.17300</td>\n </tr>\n <tr>\n <th>84358402</th>\n <td>M</td>\n <td>20.29</td>\n <td>14.34</td>\n <td>135.10</td>\n <td>1297.0</td>\n <td>0.10030</td>\n <td>0.13280</td>\n <td>0.1980</td>\n <td>0.10430</td>\n <td>0.1809</td>\n <td>...</td>\n <td>22.54</td>\n <td>16.67</td>\n <td>152.20</td>\n <td>1575.0</td>\n <td>0.1374</td>\n <td>0.2050</td>\n <td>0.4000</td>\n <td>0.1625</td>\n <td>0.2364</td>\n <td>0.07678</td>\n </tr>\n </tbody>\n</table>\n<p>5 rows × 31 columns</p>\n</div>"
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Preview the first 5 rows\n",
"df.head(n=5)"
],
"metadata": {
"collapsed": false,
"ExecuteTime": {
"end_time": "2024-03-15T19:05:45.015712300Z",
"start_time": "2024-03-15T19:05:44.947489400Z"
}
},
"id": "db9cfc5e73a4da57"
},
{
"cell_type": "code",
"execution_count": 8,
"outputs": [],
"source": [
"# Min-max normalisation of the feature columns.\n",
"# The scaler is fitted on the training rows only, so the validation and test\n",
"# sets do not leak their min/max into the preprocessing. Training features end\n",
"# up in [0, 1]; validation/test values may fall slightly outside that range.\n",
"feature_cols = df.columns[1:]  # column 0 is the 'diagnosis' label\n",
"\n",
"# Must use the same test_size/random_state as the split in section 3 so that\n",
"# these positions are exactly the rows that end up in df_train.\n",
"train_rows, _held_out_rows = train_test_split(\n",
"    list(range(len(df))), test_size=0.2, random_state=1234\n",
")\n",
"\n",
"scaler = MinMaxScaler()\n",
"scaler.fit(df.iloc[train_rows][feature_cols])\n",
"df[feature_cols] = scaler.transform(df[feature_cols])"
],
"metadata": {
"collapsed": false,
"ExecuteTime": {
"end_time": "2024-03-15T19:05:45.023766200Z",
"start_time": "2024-03-15T19:05:44.995971700Z"
}
},
"id": "f8513c47a4a1f844"
},
{
"cell_type": "markdown",
"source": [
"#### 3. Podział danych na zbiór treningowy, walidacyjny i testowy"
],
"metadata": {
"collapsed": false
},
"id": "7d74496029e594b1"
},
{
"cell_type": "code",
"execution_count": 9,
"outputs": [],
"source": [
"# Two-stage split into training/validation/test sets (80/10/10):\n",
"# first hold out 20% of the rows, then cut that hold-out in half.\n",
"split_seed = 1234\n",
"df_train, df_val_test = train_test_split(df, random_state=split_seed, test_size=0.2)\n",
"df_val, df_test = train_test_split(df_val_test, random_state=split_seed, test_size=0.5)"
],
"metadata": {
"collapsed": false,
"ExecuteTime": {
"end_time": "2024-03-15T19:05:45.111265300Z",
"start_time": "2024-03-15T19:05:45.028046500Z"
}
},
"id": "651b6bf8d1dd8e6d"
},
{
"cell_type": "code",
"execution_count": 10,
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Cały zbiór: 569 wierszy, 31 kolumn\n",
"Zbiór treningowy: 455 wierszy, 31 kolumn\n",
"Zbiór walidacyjny: 57 wierszy, 31 kolumn\n",
"Zbiór testowy: 57 wierszy, 31 kolumn\n"
]
}
],
"source": [
"# Wymiary zbiorów i podzbiorów\n",
"print(f\"Cały zbiór: {df.shape[0]} wierszy, {df.shape[1]} kolumn\")\n",
"print(f\"Zbiór treningowy: {df_train.shape[0]} wierszy, {df_train.shape[1]} kolumn\")\n",
"print(f\"Zbiór walidacyjny: {df_val.shape[0]} wierszy, {df_val.shape[1]} kolumn\")\n",
"print(f\"Zbiór testowy: {df_test.shape[0]} wierszy, {df_test.shape[1]} kolumn\")"
],
"metadata": {
"collapsed": false,
"ExecuteTime": {
"end_time": "2024-03-15T19:05:45.168725700Z",
"start_time": "2024-03-15T19:05:45.058794500Z"
}
},
"id": "97f1bdbc7597c39f"
},
{
"cell_type": "markdown",
"source": [
"#### Statystyki dla cech numerycznych (średnia, odchylenie standardowe, min, max, kwantyle)"
],
"metadata": {
"collapsed": false
},
"id": "9014307b7d26b73f"
},
{
"cell_type": "code",
"execution_count": 11,
"outputs": [
{
"data": {
"text/plain": " radius_mean texture_mean perimeter_mean area_mean smoothness_mean \\\ncount 569.000000 569.000000 569.000000 569.000000 569.000000 \nmean 0.338222 0.323965 0.332935 0.216920 0.394785 \nstd 0.166787 0.145453 0.167915 0.149274 0.126967 \nmin 0.000000 0.000000 0.000000 0.000000 0.000000 \n25% 0.223342 0.218465 0.216847 0.117413 0.304595 \n50% 0.302381 0.308759 0.293345 0.172895 0.390358 \n75% 0.416442 0.408860 0.416765 0.271135 0.475490 \nmax 1.000000 1.000000 1.000000 1.000000 1.000000 \n\n compactness_mean concavity_mean concave points_mean symmetry_mean \\\ncount 569.000000 569.000000 569.000000 569.000000 \nmean 0.260601 0.208058 0.243137 0.379605 \nstd 0.161992 0.186785 0.192857 0.138456 \nmin 0.000000 0.000000 0.000000 0.000000 \n25% 0.139685 0.069260 0.100944 0.282323 \n50% 0.224679 0.144189 0.166501 0.369697 \n75% 0.340531 0.306232 0.367793 0.453030 \nmax 1.000000 1.000000 1.000000 1.000000 \n\n fractal_dimension_mean ... radius_worst texture_worst \\\ncount 569.000000 ... 569.000000 569.000000 \nmean 0.270379 ... 0.296663 0.363998 \nstd 0.148702 ... 0.171940 0.163813 \nmin 0.000000 ... 0.000000 0.000000 \n25% 0.163016 ... 0.180719 0.241471 \n50% 0.243892 ... 0.250445 0.356876 \n75% 0.340354 ... 0.386339 0.471748 \nmax 1.000000 ... 
1.000000 1.000000 \n\n perimeter_worst area_worst smoothness_worst compactness_worst \\\ncount 569.000000 569.000000 569.000000 569.000000 \nmean 0.283138 0.170906 0.404138 0.220212 \nstd 0.167352 0.139932 0.150779 0.152649 \nmin 0.000000 0.000000 0.000000 0.000000 \n25% 0.167837 0.081130 0.300007 0.116337 \n50% 0.235320 0.123206 0.397081 0.179110 \n75% 0.373475 0.220901 0.494156 0.302520 \nmax 1.000000 1.000000 1.000000 1.000000 \n\n concavity_worst concave points_worst symmetry_worst \\\ncount 569.000000 569.000000 569.000000 \nmean 0.217403 0.393836 0.263307 \nstd 0.166633 0.225884 0.121954 \nmin 0.000000 0.000000 0.000000 \n25% 0.091454 0.223127 0.185098 \n50% 0.181070 0.343402 0.247782 \n75% 0.305831 0.554639 0.318155 \nmax 1.000000 1.000000 1.000000 \n\n fractal_dimension_worst \ncount 569.000000 \nmean 0.189596 \nstd 0.118466 \nmin 0.000000 \n25% 0.107700 \n50% 0.163977 \n75% 0.242949 \nmax 1.000000 \n\n[8 rows x 30 columns]",
"text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>radius_mean</th>\n <th>texture_mean</th>\n <th>perimeter_mean</th>\n <th>area_mean</th>\n <th>smoothness_mean</th>\n <th>compactness_mean</th>\n <th>concavity_mean</th>\n <th>concave points_mean</th>\n <th>symmetry_mean</th>\n <th>fractal_dimension_mean</th>\n <th>...</th>\n <th>radius_worst</th>\n <th>texture_worst</th>\n <th>perimeter_worst</th>\n <th>area_worst</th>\n <th>smoothness_worst</th>\n <th>compactness_worst</th>\n <th>concavity_worst</th>\n <th>concave points_worst</th>\n <th>symmetry_worst</th>\n <th>fractal_dimension_worst</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>count</th>\n <td>569.000000</td>\n <td>569.000000</td>\n <td>569.000000</td>\n <td>569.000000</td>\n <td>569.000000</td>\n <td>569.000000</td>\n <td>569.000000</td>\n <td>569.000000</td>\n <td>569.000000</td>\n <td>569.000000</td>\n <td>...</td>\n <td>569.000000</td>\n <td>569.000000</td>\n <td>569.000000</td>\n <td>569.000000</td>\n <td>569.000000</td>\n <td>569.000000</td>\n <td>569.000000</td>\n <td>569.000000</td>\n <td>569.000000</td>\n <td>569.000000</td>\n </tr>\n <tr>\n <th>mean</th>\n <td>0.338222</td>\n <td>0.323965</td>\n <td>0.332935</td>\n <td>0.216920</td>\n <td>0.394785</td>\n <td>0.260601</td>\n <td>0.208058</td>\n <td>0.243137</td>\n <td>0.379605</td>\n <td>0.270379</td>\n <td>...</td>\n <td>0.296663</td>\n <td>0.363998</td>\n <td>0.283138</td>\n <td>0.170906</td>\n <td>0.404138</td>\n <td>0.220212</td>\n <td>0.217403</td>\n <td>0.393836</td>\n <td>0.263307</td>\n <td>0.189596</td>\n </tr>\n <tr>\n <th>std</th>\n <td>0.166787</td>\n <td>0.145453</td>\n <td>0.167915</td>\n <td>0.149274</td>\n <td>0.126967</td>\n <td>0.161992</td>\n 
<td>0.186785</td>\n <td>0.192857</td>\n <td>0.138456</td>\n <td>0.148702</td>\n <td>...</td>\n <td>0.171940</td>\n <td>0.163813</td>\n <td>0.167352</td>\n <td>0.139932</td>\n <td>0.150779</td>\n <td>0.152649</td>\n <td>0.166633</td>\n <td>0.225884</td>\n <td>0.121954</td>\n <td>0.118466</td>\n </tr>\n <tr>\n <th>min</th>\n <td>0.000000</td>\n <td>0.000000</td>\n <td>0.000000</td>\n <td>0.000000</td>\n <td>0.000000</td>\n <td>0.000000</td>\n <td>0.000000</td>\n <td>0.000000</td>\n <td>0.000000</td>\n <td>0.000000</td>\n <td>...</td>\n <td>0.000000</td>\n <td>0.000000</td>\n <td>0.000000</td>\n <td>0.000000</td>\n <td>0.000000</td>\n <td>0.000000</td>\n <td>0.000000</td>\n <td>0.000000</td>\n <td>0.000000</td>\n <td>0.000000</td>\n </tr>\n <tr>\n <th>25%</th>\n <td>0.223342</td>\n <td>0.218465</td>\n <td>0.216847</td>\n <td>0.117413</td>\n <td>0.304595</td>\n <td>0.139685</td>\n <td>0.069260</td>\n <td>0.100944</td>\n <td>0.282323</td>\n <td>0.163016</td>\n <td>...</td>\n <td>0.180719</td>\n <td>0.241471</td>\n <td>0.167837</td>\n <td>0.081130</td>\n <td>0.300007</td>\n <td>0.116337</td>\n <td>0.091454</td>\n <td>0.223127</td>\n <td>0.185098</td>\n <td>0.107700</td>\n </tr>\n <tr>\n <th>50%</th>\n <td>0.302381</td>\n <td>0.308759</td>\n <td>0.293345</td>\n <td>0.172895</td>\n <td>0.390358</td>\n <td>0.224679</td>\n <td>0.144189</td>\n <td>0.166501</td>\n <td>0.369697</td>\n <td>0.243892</td>\n <td>...</td>\n <td>0.250445</td>\n <td>0.356876</td>\n <td>0.235320</td>\n <td>0.123206</td>\n <td>0.397081</td>\n <td>0.179110</td>\n <td>0.181070</td>\n <td>0.343402</td>\n <td>0.247782</td>\n <td>0.163977</td>\n </tr>\n <tr>\n <th>75%</th>\n <td>0.416442</td>\n <td>0.408860</td>\n <td>0.416765</td>\n <td>0.271135</td>\n <td>0.475490</td>\n <td>0.340531</td>\n <td>0.306232</td>\n <td>0.367793</td>\n <td>0.453030</td>\n <td>0.340354</td>\n <td>...</td>\n <td>0.386339</td>\n <td>0.471748</td>\n <td>0.373475</td>\n <td>0.220901</td>\n <td>0.494156</td>\n <td>0.302520</td>\n 
<td>0.305831</td>\n <td>0.554639</td>\n <td>0.318155</td>\n <td>0.242949</td>\n </tr>\n <tr>\n <th>max</th>\n <td>1.000000</td>\n <td>1.000000</td>\n <td>1.000000</td>\n <td>1.000000</td>\n <td>1.000000</td>\n <td>1.000000</td>\n <td>1.000000</td>\n <td>1.000000</td>\n <td>1.000000</td>\n <td>1.000000</td>\n <td>...</td>\n <td>1.000000</td>\n <td>1.000000</td>\n <td>1.000000</td>\n <td>1.000000</td>\n <td>1.000000</td>\n <td>1.000000</td>\n <td>1.000000</td>\n <td>1.000000</td>\n <td>1.000000</td>\n <td>1.000000</td>\n </tr>\n </tbody>\n</table>\n<p>8 rows × 30 columns</p>\n</div>"
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Summary statistics for the full dataset (quartiles spelled out explicitly)\n",
"df.describe(percentiles=[0.25, 0.5, 0.75])"
],
"metadata": {
"collapsed": false,
"ExecuteTime": {
"end_time": "2024-03-15T19:05:45.568275400Z",
"start_time": "2024-03-15T19:05:45.073564100Z"
}
},
"id": "4f0c97b4de052a0c"
},
{
"cell_type": "code",
"execution_count": 12,
"outputs": [
{
"data": {
"text/plain": " radius_mean texture_mean perimeter_mean area_mean smoothness_mean \\\ncount 455.000000 455.000000 455.000000 455.000000 455.000000 \nmean 0.338949 0.326381 0.333644 0.217261 0.395892 \nstd 0.165349 0.145664 0.166246 0.147801 0.126845 \nmin 0.000000 0.022658 0.000000 0.000000 0.000000 \n25% 0.225235 0.219817 0.219128 0.119321 0.304776 \n50% 0.300961 0.310450 0.295833 0.170859 0.389636 \n75% 0.415259 0.411397 0.414346 0.271113 0.476393 \nmax 1.000000 0.815015 1.000000 0.999152 0.831182 \n\n compactness_mean concavity_mean concave points_mean symmetry_mean \\\ncount 455.000000 455.000000 455.000000 455.000000 \nmean 0.260580 0.209922 0.243100 0.381150 \nstd 0.160494 0.187617 0.192341 0.136984 \nmin 0.000000 0.000000 0.000000 0.000000 \n25% 0.136096 0.068030 0.100497 0.283081 \n50% 0.230262 0.145150 0.168191 0.369697 \n75% 0.340991 0.310098 0.357952 0.453030 \nmax 0.895712 1.000000 1.000000 1.000000 \n\n fractal_dimension_mean ... radius_worst texture_worst \\\ncount 455.000000 ... 455.000000 455.000000 \nmean 0.270577 ... 0.297118 0.369023 \nstd 0.147338 ... 0.170654 0.166060 \nmin 0.000000 ... 0.000000 0.012527 \n25% 0.168176 ... 0.182675 0.248801 \n50% 0.241786 ... 0.250445 0.358742 \n75% 0.340354 ... 0.377090 0.481343 \nmax 1.000000 ... 
0.896478 1.000000 \n\n perimeter_worst area_worst smoothness_worst compactness_worst \\\ncount 455.000000 455.000000 455.000000 455.000000 \nmean 0.284164 0.171085 0.407890 0.221950 \nstd 0.166564 0.138560 0.153644 0.156748 \nmin 0.000000 0.000000 0.000000 0.000000 \n25% 0.169929 0.081781 0.303308 0.116871 \n50% 0.235370 0.123206 0.396421 0.181244 \n75% 0.369740 0.209964 0.496467 0.299318 \nmax 0.890931 0.797975 1.000000 1.000000 \n\n concavity_worst concave points_worst symmetry_worst \\\ncount 455.000000 455.000000 455.000000 \nmean 0.221686 0.394672 0.263384 \nstd 0.172039 0.226663 0.119284 \nmin 0.000000 0.000000 0.000000 \n25% 0.092212 0.221753 0.188252 \n50% 0.184505 0.347079 0.248571 \n75% 0.309265 0.558935 0.317465 \nmax 1.000000 1.000000 1.000000 \n\n fractal_dimension_worst \ncount 455.000000 \nmean 0.191552 \nstd 0.122769 \nmin 0.000000 \n25% 0.107110 \n50% 0.164305 \n75% 0.242785 \nmax 1.000000 \n\n[8 rows x 30 columns]",
"text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>radius_mean</th>\n <th>texture_mean</th>\n <th>perimeter_mean</th>\n <th>area_mean</th>\n <th>smoothness_mean</th>\n <th>compactness_mean</th>\n <th>concavity_mean</th>\n <th>concave points_mean</th>\n <th>symmetry_mean</th>\n <th>fractal_dimension_mean</th>\n <th>...</th>\n <th>radius_worst</th>\n <th>texture_worst</th>\n <th>perimeter_worst</th>\n <th>area_worst</th>\n <th>smoothness_worst</th>\n <th>compactness_worst</th>\n <th>concavity_worst</th>\n <th>concave points_worst</th>\n <th>symmetry_worst</th>\n <th>fractal_dimension_worst</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>count</th>\n <td>455.000000</td>\n <td>455.000000</td>\n <td>455.000000</td>\n <td>455.000000</td>\n <td>455.000000</td>\n <td>455.000000</td>\n <td>455.000000</td>\n <td>455.000000</td>\n <td>455.000000</td>\n <td>455.000000</td>\n <td>...</td>\n <td>455.000000</td>\n <td>455.000000</td>\n <td>455.000000</td>\n <td>455.000000</td>\n <td>455.000000</td>\n <td>455.000000</td>\n <td>455.000000</td>\n <td>455.000000</td>\n <td>455.000000</td>\n <td>455.000000</td>\n </tr>\n <tr>\n <th>mean</th>\n <td>0.338949</td>\n <td>0.326381</td>\n <td>0.333644</td>\n <td>0.217261</td>\n <td>0.395892</td>\n <td>0.260580</td>\n <td>0.209922</td>\n <td>0.243100</td>\n <td>0.381150</td>\n <td>0.270577</td>\n <td>...</td>\n <td>0.297118</td>\n <td>0.369023</td>\n <td>0.284164</td>\n <td>0.171085</td>\n <td>0.407890</td>\n <td>0.221950</td>\n <td>0.221686</td>\n <td>0.394672</td>\n <td>0.263384</td>\n <td>0.191552</td>\n </tr>\n <tr>\n <th>std</th>\n <td>0.165349</td>\n <td>0.145664</td>\n <td>0.166246</td>\n <td>0.147801</td>\n <td>0.126845</td>\n <td>0.160494</td>\n 
<td>0.187617</td>\n <td>0.192341</td>\n <td>0.136984</td>\n <td>0.147338</td>\n <td>...</td>\n <td>0.170654</td>\n <td>0.166060</td>\n <td>0.166564</td>\n <td>0.138560</td>\n <td>0.153644</td>\n <td>0.156748</td>\n <td>0.172039</td>\n <td>0.226663</td>\n <td>0.119284</td>\n <td>0.122769</td>\n </tr>\n <tr>\n <th>min</th>\n <td>0.000000</td>\n <td>0.022658</td>\n <td>0.000000</td>\n <td>0.000000</td>\n <td>0.000000</td>\n <td>0.000000</td>\n <td>0.000000</td>\n <td>0.000000</td>\n <td>0.000000</td>\n <td>0.000000</td>\n <td>...</td>\n <td>0.000000</td>\n <td>0.012527</td>\n <td>0.000000</td>\n <td>0.000000</td>\n <td>0.000000</td>\n <td>0.000000</td>\n <td>0.000000</td>\n <td>0.000000</td>\n <td>0.000000</td>\n <td>0.000000</td>\n </tr>\n <tr>\n <th>25%</th>\n <td>0.225235</td>\n <td>0.219817</td>\n <td>0.219128</td>\n <td>0.119321</td>\n <td>0.304776</td>\n <td>0.136096</td>\n <td>0.068030</td>\n <td>0.100497</td>\n <td>0.283081</td>\n <td>0.168176</td>\n <td>...</td>\n <td>0.182675</td>\n <td>0.248801</td>\n <td>0.169929</td>\n <td>0.081781</td>\n <td>0.303308</td>\n <td>0.116871</td>\n <td>0.092212</td>\n <td>0.221753</td>\n <td>0.188252</td>\n <td>0.107110</td>\n </tr>\n <tr>\n <th>50%</th>\n <td>0.300961</td>\n <td>0.310450</td>\n <td>0.295833</td>\n <td>0.170859</td>\n <td>0.389636</td>\n <td>0.230262</td>\n <td>0.145150</td>\n <td>0.168191</td>\n <td>0.369697</td>\n <td>0.241786</td>\n <td>...</td>\n <td>0.250445</td>\n <td>0.358742</td>\n <td>0.235370</td>\n <td>0.123206</td>\n <td>0.396421</td>\n <td>0.181244</td>\n <td>0.184505</td>\n <td>0.347079</td>\n <td>0.248571</td>\n <td>0.164305</td>\n </tr>\n <tr>\n <th>75%</th>\n <td>0.415259</td>\n <td>0.411397</td>\n <td>0.414346</td>\n <td>0.271113</td>\n <td>0.476393</td>\n <td>0.340991</td>\n <td>0.310098</td>\n <td>0.357952</td>\n <td>0.453030</td>\n <td>0.340354</td>\n <td>...</td>\n <td>0.377090</td>\n <td>0.481343</td>\n <td>0.369740</td>\n <td>0.209964</td>\n <td>0.496467</td>\n <td>0.299318</td>\n 
<td>0.309265</td>\n <td>0.558935</td>\n <td>0.317465</td>\n <td>0.242785</td>\n </tr>\n <tr>\n <th>max</th>\n <td>1.000000</td>\n <td>0.815015</td>\n <td>1.000000</td>\n <td>0.999152</td>\n <td>0.831182</td>\n <td>0.895712</td>\n <td>1.000000</td>\n <td>1.000000</td>\n <td>1.000000</td>\n <td>1.000000</td>\n <td>...</td>\n <td>0.896478</td>\n <td>1.000000</td>\n <td>0.890931</td>\n <td>0.797975</td>\n <td>1.000000</td>\n <td>1.000000</td>\n <td>1.000000</td>\n <td>1.000000</td>\n <td>1.000000</td>\n <td>1.000000</td>\n </tr>\n </tbody>\n</table>\n<p>8 rows × 30 columns</p>\n</div>"
},
"execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Summary statistics for the training set (quartiles spelled out explicitly)\n",
"df_train.describe(percentiles=[0.25, 0.5, 0.75])"
],
"metadata": {
"collapsed": false,
"ExecuteTime": {
"end_time": "2024-03-15T19:05:45.578805Z",
"start_time": "2024-03-15T19:05:45.179260600Z"
}
},
"id": "a6594457c1209a45"
},
{
"cell_type": "code",
"execution_count": 13,
"outputs": [
{
"data": {
"text/plain": " radius_mean texture_mean perimeter_mean area_mean smoothness_mean \\\ncount 57.000000 57.000000 57.000000 57.000000 57.000000 \nmean 0.334850 0.315985 0.331800 0.216332 0.399432 \nstd 0.176956 0.119165 0.181708 0.171264 0.144361 \nmin 0.089782 0.106865 0.089489 0.041357 0.167193 \n25% 0.225709 0.226581 0.223205 0.117413 0.293220 \n50% 0.282976 0.290159 0.279110 0.161909 0.396678 \n75% 0.413129 0.389922 0.405017 0.260912 0.471879 \nmax 0.967343 0.623267 0.988943 1.000000 1.000000 \n\n compactness_mean concavity_mean concave points_mean symmetry_mean \\\ncount 57.000000 57.000000 57.000000 57.000000 \nmean 0.279956 0.216035 0.254466 0.387684 \nstd 0.184302 0.208557 0.203610 0.160009 \nmin 0.046500 0.003622 0.027793 0.078283 \n25% 0.165235 0.076406 0.112326 0.282323 \n50% 0.222195 0.127413 0.154026 0.381818 \n75% 0.359242 0.296626 0.371918 0.464141 \nmax 1.000000 0.879569 0.839463 0.932323 \n\n fractal_dimension_mean ... radius_worst texture_worst \\\ncount 57.000000 ... 57.000000 57.000000 \nmean 0.281045 ... 0.294395 0.357428 \nstd 0.161084 ... 0.180500 0.130661 \nmin 0.047810 ... 0.072963 0.116205 \n25% 0.169545 ... 0.180719 0.267058 \n50% 0.258214 ... 0.225187 0.348348 \n75% 0.342249 ... 0.355034 0.463486 \nmax 0.949031 ... 
1.000000 0.619670 \n\n perimeter_worst area_worst smoothness_worst compactness_worst \\\ncount 57.000000 57.000000 57.000000 57.000000 \nmean 0.281028 0.170796 0.391354 0.222054 \nstd 0.179389 0.161083 0.140480 0.133225 \nmin 0.074008 0.028264 0.085320 0.019239 \n25% 0.162757 0.082653 0.301327 0.126233 \n50% 0.228597 0.109836 0.416232 0.182893 \n75% 0.343593 0.197454 0.484911 0.308632 \nmax 1.000000 1.000000 0.786040 0.571557 \n\n concavity_worst concave points_worst symmetry_worst \\\ncount 57.000000 57.000000 57.000000 \nmean 0.209888 0.405999 0.276614 \nstd 0.152149 0.214849 0.143775 \nmin 0.006176 0.095670 0.066233 \n25% 0.107827 0.248179 0.198502 \n50% 0.164537 0.352234 0.234969 \n75% 0.312380 0.517182 0.323674 \nmax 0.613498 0.902062 0.829687 \n\n fractal_dimension_worst \ncount 57.000000 \nmean 0.187256 \nstd 0.103423 \nmin 0.028073 \n25% 0.107700 \n50% 0.155910 \n75% 0.260724 \nmax 0.460186 \n\n[8 rows x 30 columns]",
"text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>radius_mean</th>\n <th>texture_mean</th>\n <th>perimeter_mean</th>\n <th>area_mean</th>\n <th>smoothness_mean</th>\n <th>compactness_mean</th>\n <th>concavity_mean</th>\n <th>concave points_mean</th>\n <th>symmetry_mean</th>\n <th>fractal_dimension_mean</th>\n <th>...</th>\n <th>radius_worst</th>\n <th>texture_worst</th>\n <th>perimeter_worst</th>\n <th>area_worst</th>\n <th>smoothness_worst</th>\n <th>compactness_worst</th>\n <th>concavity_worst</th>\n <th>concave points_worst</th>\n <th>symmetry_worst</th>\n <th>fractal_dimension_worst</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>count</th>\n <td>57.000000</td>\n <td>57.000000</td>\n <td>57.000000</td>\n <td>57.000000</td>\n <td>57.000000</td>\n <td>57.000000</td>\n <td>57.000000</td>\n <td>57.000000</td>\n <td>57.000000</td>\n <td>57.000000</td>\n <td>...</td>\n <td>57.000000</td>\n <td>57.000000</td>\n <td>57.000000</td>\n <td>57.000000</td>\n <td>57.000000</td>\n <td>57.000000</td>\n <td>57.000000</td>\n <td>57.000000</td>\n <td>57.000000</td>\n <td>57.000000</td>\n </tr>\n <tr>\n <th>mean</th>\n <td>0.334850</td>\n <td>0.315985</td>\n <td>0.331800</td>\n <td>0.216332</td>\n <td>0.399432</td>\n <td>0.279956</td>\n <td>0.216035</td>\n <td>0.254466</td>\n <td>0.387684</td>\n <td>0.281045</td>\n <td>...</td>\n <td>0.294395</td>\n <td>0.357428</td>\n <td>0.281028</td>\n <td>0.170796</td>\n <td>0.391354</td>\n <td>0.222054</td>\n <td>0.209888</td>\n <td>0.405999</td>\n <td>0.276614</td>\n <td>0.187256</td>\n </tr>\n <tr>\n <th>std</th>\n <td>0.176956</td>\n <td>0.119165</td>\n <td>0.181708</td>\n <td>0.171264</td>\n <td>0.144361</td>\n <td>0.184302</td>\n <td>0.208557</td>\n 
<td>0.203610</td>\n <td>0.160009</td>\n <td>0.161084</td>\n <td>...</td>\n <td>0.180500</td>\n <td>0.130661</td>\n <td>0.179389</td>\n <td>0.161083</td>\n <td>0.140480</td>\n <td>0.133225</td>\n <td>0.152149</td>\n <td>0.214849</td>\n <td>0.143775</td>\n <td>0.103423</td>\n </tr>\n <tr>\n <th>min</th>\n <td>0.089782</td>\n <td>0.106865</td>\n <td>0.089489</td>\n <td>0.041357</td>\n <td>0.167193</td>\n <td>0.046500</td>\n <td>0.003622</td>\n <td>0.027793</td>\n <td>0.078283</td>\n <td>0.047810</td>\n <td>...</td>\n <td>0.072963</td>\n <td>0.116205</td>\n <td>0.074008</td>\n <td>0.028264</td>\n <td>0.085320</td>\n <td>0.019239</td>\n <td>0.006176</td>\n <td>0.095670</td>\n <td>0.066233</td>\n <td>0.028073</td>\n </tr>\n <tr>\n <th>25%</th>\n <td>0.225709</td>\n <td>0.226581</td>\n <td>0.223205</td>\n <td>0.117413</td>\n <td>0.293220</td>\n <td>0.165235</td>\n <td>0.076406</td>\n <td>0.112326</td>\n <td>0.282323</td>\n <td>0.169545</td>\n <td>...</td>\n <td>0.180719</td>\n <td>0.267058</td>\n <td>0.162757</td>\n <td>0.082653</td>\n <td>0.301327</td>\n <td>0.126233</td>\n <td>0.107827</td>\n <td>0.248179</td>\n <td>0.198502</td>\n <td>0.107700</td>\n </tr>\n <tr>\n <th>50%</th>\n <td>0.282976</td>\n <td>0.290159</td>\n <td>0.279110</td>\n <td>0.161909</td>\n <td>0.396678</td>\n <td>0.222195</td>\n <td>0.127413</td>\n <td>0.154026</td>\n <td>0.381818</td>\n <td>0.258214</td>\n <td>...</td>\n <td>0.225187</td>\n <td>0.348348</td>\n <td>0.228597</td>\n <td>0.109836</td>\n <td>0.416232</td>\n <td>0.182893</td>\n <td>0.164537</td>\n <td>0.352234</td>\n <td>0.234969</td>\n <td>0.155910</td>\n </tr>\n <tr>\n <th>75%</th>\n <td>0.413129</td>\n <td>0.389922</td>\n <td>0.405017</td>\n <td>0.260912</td>\n <td>0.471879</td>\n <td>0.359242</td>\n <td>0.296626</td>\n <td>0.371918</td>\n <td>0.464141</td>\n <td>0.342249</td>\n <td>...</td>\n <td>0.355034</td>\n <td>0.463486</td>\n <td>0.343593</td>\n <td>0.197454</td>\n <td>0.484911</td>\n <td>0.308632</td>\n <td>0.312380</td>\n 
<td>0.517182</td>\n <td>0.323674</td>\n <td>0.260724</td>\n </tr>\n <tr>\n <th>max</th>\n <td>0.967343</td>\n <td>0.623267</td>\n <td>0.988943</td>\n <td>1.000000</td>\n <td>1.000000</td>\n <td>1.000000</td>\n <td>0.879569</td>\n <td>0.839463</td>\n <td>0.932323</td>\n <td>0.949031</td>\n <td>...</td>\n <td>1.000000</td>\n <td>0.619670</td>\n <td>1.000000</td>\n <td>1.000000</td>\n <td>0.786040</td>\n <td>0.571557</td>\n <td>0.613498</td>\n <td>0.902062</td>\n <td>0.829687</td>\n <td>0.460186</td>\n </tr>\n </tbody>\n</table>\n<p>8 rows × 30 columns</p>\n</div>"
},
"execution_count": 13,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Zbiór walidacyjny\n",
"df_val.describe()"
],
"metadata": {
"collapsed": false,
"ExecuteTime": {
"end_time": "2024-03-15T19:05:45.583803900Z",
"start_time": "2024-03-15T19:05:45.287918100Z"
}
},
"id": "5ce8c6c658be60c1"
},
{
"cell_type": "code",
"execution_count": 14,
"outputs": [
{
"data": {
"text/plain": " radius_mean texture_mean perimeter_mean area_mean smoothness_mean \\\ncount 57.000000 57.000000 57.000000 57.000000 57.000000 \nmean 0.335794 0.312663 0.328412 0.214790 0.381297 \nstd 0.170785 0.167714 0.169927 0.139898 0.109590 \nmin 0.075252 0.000000 0.072904 0.033001 0.146249 \n25% 0.192106 0.199188 0.185751 0.097222 0.308658 \n50% 0.315159 0.310450 0.301223 0.180445 0.390990 \n75% 0.486961 0.396348 0.481031 0.329629 0.460143 \nmax 0.692366 1.000000 0.695253 0.535949 0.578406 \n\n compactness_mean concavity_mean concave points_mean symmetry_mean \\\ncount 57.000000 57.000000 57.000000 57.000000 \nmean 0.241410 0.185207 0.232105 0.359197 \nstd 0.150381 0.156133 0.188722 0.127273 \nmin 0.021839 0.002798 0.011948 0.072222 \n25% 0.129471 0.062910 0.095179 0.271717 \n50% 0.203239 0.119845 0.154573 0.361616 \n75% 0.330102 0.261246 0.383996 0.439899 \nmax 0.809214 0.658388 0.776342 0.674242 \n\n fractal_dimension_mean ... radius_worst texture_worst \\\ncount 57.000000 ... 57.000000 57.000000 \nmean 0.258133 ... 0.295299 0.330458 \nstd 0.148563 ... 0.176543 0.173641 \nmin 0.021061 ... 0.054891 0.000000 \n25% 0.139217 ... 0.146211 0.199893 \n50% 0.246420 ... 0.256492 0.325160 \n75% 0.339090 ... 0.429740 0.425640 \nmax 0.839090 ... 
0.667022 0.875533 \n\n perimeter_worst area_worst smoothness_worst compactness_worst \\\ncount 57.000000 57.000000 57.000000 57.000000 \nmean 0.277057 0.169590 0.386967 0.204497 \nstd 0.164054 0.130427 0.137458 0.138294 \nmin 0.047263 0.022046 0.111074 0.015504 \n25% 0.146023 0.064368 0.285478 0.098777 \n50% 0.245530 0.128048 0.396421 0.172318 \n75% 0.410827 0.256046 0.478307 0.280205 \nmax 0.627970 0.467902 0.674437 0.709327 \n\n concavity_worst concave points_worst symmetry_worst \\\ncount 57.000000 57.000000 57.000000 \nmean 0.190725 0.374998 0.249384 \nstd 0.132668 0.233099 0.120215 \nmin 0.002860 0.035808 0.081411 \n25% 0.086981 0.202131 0.159866 \n50% 0.155511 0.292509 0.234772 \n75% 0.275240 0.520619 0.304356 \nmax 0.563339 0.997595 0.622708 \n\n fractal_dimension_worst \ncount 57.000000 \nmean 0.176326 \nstd 0.096130 \nmin 0.001115 \n25% 0.113210 \n50% 0.160042 \n75% 0.231011 \nmax 0.481175 \n\n[8 rows x 30 columns]",
"text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>radius_mean</th>\n <th>texture_mean</th>\n <th>perimeter_mean</th>\n <th>area_mean</th>\n <th>smoothness_mean</th>\n <th>compactness_mean</th>\n <th>concavity_mean</th>\n <th>concave points_mean</th>\n <th>symmetry_mean</th>\n <th>fractal_dimension_mean</th>\n <th>...</th>\n <th>radius_worst</th>\n <th>texture_worst</th>\n <th>perimeter_worst</th>\n <th>area_worst</th>\n <th>smoothness_worst</th>\n <th>compactness_worst</th>\n <th>concavity_worst</th>\n <th>concave points_worst</th>\n <th>symmetry_worst</th>\n <th>fractal_dimension_worst</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>count</th>\n <td>57.000000</td>\n <td>57.000000</td>\n <td>57.000000</td>\n <td>57.000000</td>\n <td>57.000000</td>\n <td>57.000000</td>\n <td>57.000000</td>\n <td>57.000000</td>\n <td>57.000000</td>\n <td>57.000000</td>\n <td>...</td>\n <td>57.000000</td>\n <td>57.000000</td>\n <td>57.000000</td>\n <td>57.000000</td>\n <td>57.000000</td>\n <td>57.000000</td>\n <td>57.000000</td>\n <td>57.000000</td>\n <td>57.000000</td>\n <td>57.000000</td>\n </tr>\n <tr>\n <th>mean</th>\n <td>0.335794</td>\n <td>0.312663</td>\n <td>0.328412</td>\n <td>0.214790</td>\n <td>0.381297</td>\n <td>0.241410</td>\n <td>0.185207</td>\n <td>0.232105</td>\n <td>0.359197</td>\n <td>0.258133</td>\n <td>...</td>\n <td>0.295299</td>\n <td>0.330458</td>\n <td>0.277057</td>\n <td>0.169590</td>\n <td>0.386967</td>\n <td>0.204497</td>\n <td>0.190725</td>\n <td>0.374998</td>\n <td>0.249384</td>\n <td>0.176326</td>\n </tr>\n <tr>\n <th>std</th>\n <td>0.170785</td>\n <td>0.167714</td>\n <td>0.169927</td>\n <td>0.139898</td>\n <td>0.109590</td>\n <td>0.150381</td>\n <td>0.156133</td>\n 
<td>0.188722</td>\n <td>0.127273</td>\n <td>0.148563</td>\n <td>...</td>\n <td>0.176543</td>\n <td>0.173641</td>\n <td>0.164054</td>\n <td>0.130427</td>\n <td>0.137458</td>\n <td>0.138294</td>\n <td>0.132668</td>\n <td>0.233099</td>\n <td>0.120215</td>\n <td>0.096130</td>\n </tr>\n <tr>\n <th>min</th>\n <td>0.075252</td>\n <td>0.000000</td>\n <td>0.072904</td>\n <td>0.033001</td>\n <td>0.146249</td>\n <td>0.021839</td>\n <td>0.002798</td>\n <td>0.011948</td>\n <td>0.072222</td>\n <td>0.021061</td>\n <td>...</td>\n <td>0.054891</td>\n <td>0.000000</td>\n <td>0.047263</td>\n <td>0.022046</td>\n <td>0.111074</td>\n <td>0.015504</td>\n <td>0.002860</td>\n <td>0.035808</td>\n <td>0.081411</td>\n <td>0.001115</td>\n </tr>\n <tr>\n <th>25%</th>\n <td>0.192106</td>\n <td>0.199188</td>\n <td>0.185751</td>\n <td>0.097222</td>\n <td>0.308658</td>\n <td>0.129471</td>\n <td>0.062910</td>\n <td>0.095179</td>\n <td>0.271717</td>\n <td>0.139217</td>\n <td>...</td>\n <td>0.146211</td>\n <td>0.199893</td>\n <td>0.146023</td>\n <td>0.064368</td>\n <td>0.285478</td>\n <td>0.098777</td>\n <td>0.086981</td>\n <td>0.202131</td>\n <td>0.159866</td>\n <td>0.113210</td>\n </tr>\n <tr>\n <th>50%</th>\n <td>0.315159</td>\n <td>0.310450</td>\n <td>0.301223</td>\n <td>0.180445</td>\n <td>0.390990</td>\n <td>0.203239</td>\n <td>0.119845</td>\n <td>0.154573</td>\n <td>0.361616</td>\n <td>0.246420</td>\n <td>...</td>\n <td>0.256492</td>\n <td>0.325160</td>\n <td>0.245530</td>\n <td>0.128048</td>\n <td>0.396421</td>\n <td>0.172318</td>\n <td>0.155511</td>\n <td>0.292509</td>\n <td>0.234772</td>\n <td>0.160042</td>\n </tr>\n <tr>\n <th>75%</th>\n <td>0.486961</td>\n <td>0.396348</td>\n <td>0.481031</td>\n <td>0.329629</td>\n <td>0.460143</td>\n <td>0.330102</td>\n <td>0.261246</td>\n <td>0.383996</td>\n <td>0.439899</td>\n <td>0.339090</td>\n <td>...</td>\n <td>0.429740</td>\n <td>0.425640</td>\n <td>0.410827</td>\n <td>0.256046</td>\n <td>0.478307</td>\n <td>0.280205</td>\n <td>0.275240</td>\n 
<td>0.520619</td>\n <td>0.304356</td>\n <td>0.231011</td>\n </tr>\n <tr>\n <th>max</th>\n <td>0.692366</td>\n <td>1.000000</td>\n <td>0.695253</td>\n <td>0.535949</td>\n <td>0.578406</td>\n <td>0.809214</td>\n <td>0.658388</td>\n <td>0.776342</td>\n <td>0.674242</td>\n <td>0.839090</td>\n <td>...</td>\n <td>0.667022</td>\n <td>0.875533</td>\n <td>0.627970</td>\n <td>0.467902</td>\n <td>0.674437</td>\n <td>0.709327</td>\n <td>0.563339</td>\n <td>0.997595</td>\n <td>0.622708</td>\n <td>0.481175</td>\n </tr>\n </tbody>\n</table>\n<p>8 rows × 30 columns</p>\n</div>"
},
"execution_count": 14,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Zbiór testowy\n",
"df_test.describe()"
],
"metadata": {
"collapsed": false,
"ExecuteTime": {
"end_time": "2024-03-15T19:05:45.588195700Z",
"start_time": "2024-03-15T19:05:45.381735500Z"
}
},
"id": "49acb8e5dfbda89f"
},
{
"cell_type": "markdown",
"source": [
"#### Rozkład klas w zbiorze"
],
"metadata": {
"collapsed": false
},
"id": "1c07bc584a09d6b8"
},
{
"cell_type": "code",
"execution_count": 15,
"outputs": [
{
"data": {
"text/plain": "diagnosis\nB 357\nM 212\nName: count, dtype: int64"
},
"execution_count": 15,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Cały zbiór\n",
"df['diagnosis'].value_counts()"
],
"metadata": {
"collapsed": false,
"ExecuteTime": {
"end_time": "2024-03-15T19:05:45.651344400Z",
"start_time": "2024-03-15T19:05:45.482422700Z"
}
},
"id": "1e655fa9f8e48ff9"
},
{
"cell_type": "code",
"execution_count": 16,
"outputs": [
{
"data": {
"text/plain": "diagnosis\nB 288\nM 167\nName: count, dtype: int64"
},
"execution_count": 16,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Zbiór treningowy\n",
"df_train['diagnosis'].value_counts()"
],
"metadata": {
"collapsed": false,
"ExecuteTime": {
"end_time": "2024-03-15T19:05:45.652345300Z",
"start_time": "2024-03-15T19:05:45.493491600Z"
}
},
"id": "d66c931287444033"
},
{
"cell_type": "code",
"execution_count": 17,
"outputs": [
{
"data": {
"text/plain": "diagnosis\nB 35\nM 22\nName: count, dtype: int64"
},
"execution_count": 17,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Zbiór walidacyjny\n",
"df_val['diagnosis'].value_counts()"
],
"metadata": {
"collapsed": false,
"ExecuteTime": {
"end_time": "2024-03-15T19:05:45.652345300Z",
"start_time": "2024-03-15T19:05:45.508825500Z"
}
},
"id": "e3f1504600da351b"
},
{
"cell_type": "code",
"execution_count": 18,
"outputs": [
{
"data": {
"text/plain": "diagnosis\nB 34\nM 23\nName: count, dtype: int64"
},
"execution_count": 18,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Zbiór testowy\n",
"df_test['diagnosis'].value_counts()"
],
"metadata": {
"collapsed": false,
"ExecuteTime": {
"end_time": "2024-03-15T19:05:45.653709400Z",
"start_time": "2024-03-15T19:05:45.523627900Z"
}
},
"id": "de5b287d4c68a1ec"
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3"
}
},
"nbformat": 4,
"nbformat_minor": 5
}