ium_464863/IUM_02.ipynb

597 lines
60 KiB
Plaintext
Raw Permalink Normal View History

2024-03-15 20:07:35 +01:00
{
"cells": [
{
"cell_type": "markdown",
"source": [
"## IUM_02"
],
"metadata": {
"collapsed": false
},
"id": "da5635319c1475f3"
},
{
"cell_type": "markdown",
"source": [
"#### Wymagane zależności"
],
"metadata": {
"collapsed": false
},
"id": "5c88bd65c24cfc75"
},
{
"cell_type": "code",
"execution_count": 1,
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Requirement already satisfied: kaggle in c:\\users\\broke\\pycharmprojects\\ium_464863\\venv\\lib\\site-packages (1.6.6)\n",
"Requirement already satisfied: python-dateutil in c:\\users\\broke\\pycharmprojects\\ium_464863\\venv\\lib\\site-packages (from kaggle) (2.9.0.post0)\n",
"Requirement already satisfied: requests in c:\\users\\broke\\pycharmprojects\\ium_464863\\venv\\lib\\site-packages (from kaggle) (2.31.0)\n",
"Requirement already satisfied: tqdm in c:\\users\\broke\\pycharmprojects\\ium_464863\\venv\\lib\\site-packages (from kaggle) (4.66.2)\n",
"Requirement already satisfied: certifi in c:\\users\\broke\\pycharmprojects\\ium_464863\\venv\\lib\\site-packages (from kaggle) (2024.2.2)\n",
"Requirement already satisfied: urllib3 in c:\\users\\broke\\pycharmprojects\\ium_464863\\venv\\lib\\site-packages (from kaggle) (2.2.1)\n",
"Requirement already satisfied: bleach in c:\\users\\broke\\pycharmprojects\\ium_464863\\venv\\lib\\site-packages (from kaggle) (6.1.0)\n",
"Requirement already satisfied: six>=1.10 in c:\\users\\broke\\pycharmprojects\\ium_464863\\venv\\lib\\site-packages (from kaggle) (1.16.0)\n",
"Requirement already satisfied: python-slugify in c:\\users\\broke\\pycharmprojects\\ium_464863\\venv\\lib\\site-packages (from kaggle) (8.0.4)\n",
"Requirement already satisfied: webencodings in c:\\users\\broke\\pycharmprojects\\ium_464863\\venv\\lib\\site-packages (from bleach->kaggle) (0.5.1)\n",
"Requirement already satisfied: text-unidecode>=1.3 in c:\\users\\broke\\pycharmprojects\\ium_464863\\venv\\lib\\site-packages (from python-slugify->kaggle) (1.3)\n",
"Requirement already satisfied: idna<4,>=2.5 in c:\\users\\broke\\pycharmprojects\\ium_464863\\venv\\lib\\site-packages (from requests->kaggle) (3.6)\n",
"Requirement already satisfied: charset-normalizer<4,>=2 in c:\\users\\broke\\pycharmprojects\\ium_464863\\venv\\lib\\site-packages (from requests->kaggle) (3.3.2)\n",
"Requirement already satisfied: colorama in c:\\users\\broke\\pycharmprojects\\ium_464863\\venv\\lib\\site-packages (from tqdm->kaggle) (0.4.6)\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"\n",
"[notice] A new release of pip available: 22.3.1 -> 24.0\n",
"[notice] To update, run: python.exe -m pip install --upgrade pip\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Requirement already satisfied: pandas in c:\\users\\broke\\pycharmprojects\\ium_464863\\venv\\lib\\site-packages (2.2.1)\n",
"Requirement already satisfied: python-dateutil>=2.8.2 in c:\\users\\broke\\pycharmprojects\\ium_464863\\venv\\lib\\site-packages (from pandas) (2.9.0.post0)\n",
"Requirement already satisfied: tzdata>=2022.7 in c:\\users\\broke\\pycharmprojects\\ium_464863\\venv\\lib\\site-packages (from pandas) (2024.1)\n",
"Requirement already satisfied: pytz>=2020.1 in c:\\users\\broke\\pycharmprojects\\ium_464863\\venv\\lib\\site-packages (from pandas) (2024.1)\n",
"Requirement already satisfied: numpy<2,>=1.22.4 in c:\\users\\broke\\pycharmprojects\\ium_464863\\venv\\lib\\site-packages (from pandas) (1.26.4)\n",
"Requirement already satisfied: six>=1.5 in c:\\users\\broke\\pycharmprojects\\ium_464863\\venv\\lib\\site-packages (from python-dateutil>=2.8.2->pandas) (1.16.0)\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"\n",
"[notice] A new release of pip available: 22.3.1 -> 24.0\n",
"[notice] To update, run: python.exe -m pip install --upgrade pip\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Requirement already satisfied: scikit-learn in c:\\users\\broke\\pycharmprojects\\ium_464863\\venv\\lib\\site-packages (1.4.1.post1)\n",
"Requirement already satisfied: joblib>=1.2.0 in c:\\users\\broke\\pycharmprojects\\ium_464863\\venv\\lib\\site-packages (from scikit-learn) (1.3.2)\n",
"Requirement already satisfied: numpy<2.0,>=1.19.5 in c:\\users\\broke\\pycharmprojects\\ium_464863\\venv\\lib\\site-packages (from scikit-learn) (1.26.4)\n",
"Requirement already satisfied: threadpoolctl>=2.0.0 in c:\\users\\broke\\pycharmprojects\\ium_464863\\venv\\lib\\site-packages (from scikit-learn) (3.3.0)\n",
"Requirement already satisfied: scipy>=1.6.0 in c:\\users\\broke\\pycharmprojects\\ium_464863\\venv\\lib\\site-packages (from scikit-learn) (1.12.0)\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"\n",
"[notice] A new release of pip available: 22.3.1 -> 24.0\n",
"[notice] To update, run: python.exe -m pip install --upgrade pip\n"
]
}
],
2024-03-15 20:07:35 +01:00
"source": [
"# Instalacja wymaganych zależności\n",
"!pip install kaggle\n",
"!pip install pandas\n",
"!pip install scikit-learn"
],
"metadata": {
"collapsed": false,
"ExecuteTime": {
"end_time": "2024-03-27T11:20:58.698061Z",
"start_time": "2024-03-27T11:20:54.216389900Z"
2024-03-15 20:07:35 +01:00
}
},
"id": "ae6cca2241835fba"
},
{
"cell_type": "markdown",
"source": [
"#### Import bibliotek"
],
"metadata": {
"collapsed": false
},
"id": "ba9581e73648e5c3"
},
{
"cell_type": "code",
"execution_count": 2,
2024-03-15 20:07:35 +01:00
"outputs": [],
"source": [
"# Import bibliotek\n",
"import pandas as pd\n",
"\n",
"from sklearn.model_selection import train_test_split\n",
"from sklearn.preprocessing import MinMaxScaler"
],
"metadata": {
"collapsed": false,
"ExecuteTime": {
"end_time": "2024-03-27T11:20:59.612164600Z",
"start_time": "2024-03-27T11:20:58.699325900Z"
2024-03-15 20:07:35 +01:00
}
},
"id": "5db08fde342b5463"
},
{
"cell_type": "markdown",
"source": [
"#### 1. Pobieranie zbioru danych"
],
"metadata": {
"collapsed": false
},
"id": "2000b14bbb95a446"
},
{
"cell_type": "code",
"execution_count": 3,
2024-03-15 20:07:35 +01:00
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"breast-cancer-wisconsin-data.zip: Skipping, found more recently modified local copy (use --force to force download)\n"
2024-03-15 20:07:35 +01:00
]
}
],
"source": [
"# Pobranie zbioru danych\n",
"!kaggle datasets download -d uciml/breast-cancer-wisconsin-data\n",
"\n",
"# Wypakowanie archiwum\n",
"!tar -xf breast-cancer-wisconsin-data.zip"
],
"metadata": {
"collapsed": false,
"ExecuteTime": {
"end_time": "2024-03-27T11:21:01.293628200Z",
"start_time": "2024-03-27T11:20:59.612164600Z"
2024-03-15 20:07:35 +01:00
}
},
"id": "679faee0f0f27fa1"
},
{
"cell_type": "markdown",
"source": [
"#### 2. Wczytanie danych oraz wstępne przetworzenie"
],
"metadata": {
"collapsed": false
},
"id": "2aa06e9443f948c9"
},
{
"cell_type": "code",
"execution_count": 4,
2024-03-15 20:07:35 +01:00
"outputs": [],
"source": [
"# Wczytanie danych, ustawienie kolumny 'id' jako indeks\n",
"df = pd.read_csv('data.csv', index_col='id')\n",
"\n",
"# Usunięcie niepotrzebnych/błędnych kolumn\n",
"df = df.drop(columns=['Unnamed: 32'])"
],
"metadata": {
"collapsed": false,
"ExecuteTime": {
"end_time": "2024-03-27T11:21:01.312651300Z",
"start_time": "2024-03-27T11:21:01.294632700Z"
2024-03-15 20:07:35 +01:00
}
},
"id": "2bc8c4be37c8fa69"
},
{
"cell_type": "code",
"execution_count": 5,
2024-03-15 20:07:35 +01:00
"outputs": [
{
"data": {
"text/plain": "diagnosis 0\nradius_mean 0\ntexture_mean 0\nperimeter_mean 0\narea_mean 0\nsmoothness_mean 0\ncompactness_mean 0\nconcavity_mean 0\nconcave points_mean 0\nsymmetry_mean 0\nfractal_dimension_mean 0\nradius_se 0\ntexture_se 0\nperimeter_se 0\narea_se 0\nsmoothness_se 0\ncompactness_se 0\nconcavity_se 0\nconcave points_se 0\nsymmetry_se 0\nfractal_dimension_se 0\nradius_worst 0\ntexture_worst 0\nperimeter_worst 0\narea_worst 0\nsmoothness_worst 0\ncompactness_worst 0\nconcavity_worst 0\nconcave points_worst 0\nsymmetry_worst 0\nfractal_dimension_worst 0\ndtype: int64"
},
"execution_count": 5,
2024-03-15 20:07:35 +01:00
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Sprawdzenie czy istnieją wartości brakujące (NaN)\n",
"df.isnull().sum()"
],
"metadata": {
"collapsed": false,
"ExecuteTime": {
"end_time": "2024-03-27T11:21:01.335790100Z",
"start_time": "2024-03-27T11:21:01.311644700Z"
2024-03-15 20:07:35 +01:00
}
},
"id": "abcf61d13e9b47f1"
},
{
"cell_type": "code",
"execution_count": 6,
2024-03-15 20:07:35 +01:00
"outputs": [
{
"data": {
"text/plain": " diagnosis radius_mean texture_mean perimeter_mean area_mean \\\nid \n842302 M 17.99 10.38 122.80 1001.0 \n842517 M 20.57 17.77 132.90 1326.0 \n84300903 M 19.69 21.25 130.00 1203.0 \n84348301 M 11.42 20.38 77.58 386.1 \n84358402 M 20.29 14.34 135.10 1297.0 \n\n smoothness_mean compactness_mean concavity_mean \\\nid \n842302 0.11840 0.27760 0.3001 \n842517 0.08474 0.07864 0.0869 \n84300903 0.10960 0.15990 0.1974 \n84348301 0.14250 0.28390 0.2414 \n84358402 0.10030 0.13280 0.1980 \n\n concave points_mean symmetry_mean ... radius_worst \\\nid ... \n842302 0.14710 0.2419 ... 25.38 \n842517 0.07017 0.1812 ... 24.99 \n84300903 0.12790 0.2069 ... 23.57 \n84348301 0.10520 0.2597 ... 14.91 \n84358402 0.10430 0.1809 ... 22.54 \n\n texture_worst perimeter_worst area_worst smoothness_worst \\\nid \n842302 17.33 184.60 2019.0 0.1622 \n842517 23.41 158.80 1956.0 0.1238 \n84300903 25.53 152.50 1709.0 0.1444 \n84348301 26.50 98.87 567.7 0.2098 \n84358402 16.67 152.20 1575.0 0.1374 \n\n compactness_worst concavity_worst concave points_worst \\\nid \n842302 0.6656 0.7119 0.2654 \n842517 0.1866 0.2416 0.1860 \n84300903 0.4245 0.4504 0.2430 \n84348301 0.8663 0.6869 0.2575 \n84358402 0.2050 0.4000 0.1625 \n\n symmetry_worst fractal_dimension_worst \nid \n842302 0.4601 0.11890 \n842517 0.2750 0.08902 \n84300903 0.3613 0.08758 \n84348301 0.6638 0.17300 \n84358402 0.2364 0.07678 \n\n[5 rows x 31 columns]",
"text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>diagnosis</th>\n <th>radius_mean</th>\n <th>texture_mean</th>\n <th>perimeter_mean</th>\n <th>area_mean</th>\n <th>smoothness_mean</th>\n <th>compactness_mean</th>\n <th>concavity_mean</th>\n <th>concave points_mean</th>\n <th>symmetry_mean</th>\n <th>...</th>\n <th>radius_worst</th>\n <th>texture_worst</th>\n <th>perimeter_worst</th>\n <th>area_worst</th>\n <th>smoothness_worst</th>\n <th>compactness_worst</th>\n <th>concavity_worst</th>\n <th>concave points_worst</th>\n <th>symmetry_worst</th>\n <th>fractal_dimension_worst</th>\n </tr>\n <tr>\n <th>id</th>\n <th></th>\n <th></th>\n <th></th>\n <th></th>\n <th></th>\n <th></th>\n <th></th>\n <th></th>\n <th></th>\n <th></th>\n <th></th>\n <th></th>\n <th></th>\n <th></th>\n <th></th>\n <th></th>\n <th></th>\n <th></th>\n <th></th>\n <th></th>\n <th></th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>842302</th>\n <td>M</td>\n <td>17.99</td>\n <td>10.38</td>\n <td>122.80</td>\n <td>1001.0</td>\n <td>0.11840</td>\n <td>0.27760</td>\n <td>0.3001</td>\n <td>0.14710</td>\n <td>0.2419</td>\n <td>...</td>\n <td>25.38</td>\n <td>17.33</td>\n <td>184.60</td>\n <td>2019.0</td>\n <td>0.1622</td>\n <td>0.6656</td>\n <td>0.7119</td>\n <td>0.2654</td>\n <td>0.4601</td>\n <td>0.11890</td>\n </tr>\n <tr>\n <th>842517</th>\n <td>M</td>\n <td>20.57</td>\n <td>17.77</td>\n <td>132.90</td>\n <td>1326.0</td>\n <td>0.08474</td>\n <td>0.07864</td>\n <td>0.0869</td>\n <td>0.07017</td>\n <td>0.1812</td>\n <td>...</td>\n <td>24.99</td>\n <td>23.41</td>\n <td>158.80</td>\n <td>1956.0</td>\n <td>0.1238</td>\n <td>0.1866</td>\n <td>0.2416</td>\n <td>0.1860</td>\n <td>0.2750</td>\n <td>0.08902</td>\n </tr>\n <tr>\n 
<th>84300903</th>\n <td>M</td>\n <td>19.69</td>\n <td>21.25</td>\n <td>130.00</td>\n <td>1203.0</td>\n <td>0.10960</td>\n <td>0.15990</td>\n <td>0.1974</td>\n <td>0.12790</td>\n <td>0.2069</td>\n <td>...</td>\n <td>23.57</td>\n <td>25.53</td>\n <td>152.50</td>\n <td>1709.0</td>\n <td>0.1444</td>\n <td>0.4245</td>\n <td>0.4504</td>\n <td>0.2430</td>\n <td>0.3613</td>\n <td>0.08758</td>\n </tr>\n <tr>\n <th>84348301</th>\n <td>M</td>\n <td>11.42</td>\n <td>20.38</td>\n <td>77.58</td>\n <td>386.1</td>\n <td>0.14250</td>\n <td>0.28390</td>\n <td>0.2414</td>\n <td>0.10520</td>\n <td>0.2597</td>\n <td>...</td>\n <td>14.91</td>\n <td>26.50</td>\n <td>98.87</td>\n <td>567.7</td>\n <td>0.2098</td>\n <td>0.8663</td>\n <td>0.6869</td>\n <td>0.2575</td>\n <td>0.6638</td>\n <td>0.17300</td>\n </tr>\n <tr>\n <th>84358402</th>\n <td>M</td>\n <td>20.29</td>\n <td>14.34</td>\n <td>135.10</td>\n <td>1297.0</td>\n <td>0.10030</td>\n <td>0.13280</td>\n <td>0.1980</td>\n <td>0.10430</td>\n <td>0.1809</td>\n <td>...</td>\n <td>22.54</td>\n <td>16.67</td>\n <td>152.20</td>\n <td>1575.0</td>\n <td>0.1374</td>\n <td>0.2050</td>\n <td>0.4000</td>\n <td>0.1625</td>\n <td>0.2364</td>\n <td>0.07678</td>\n </tr>\n </tbody>\n</table>\n<p>5 rows × 31 colu
},
"execution_count": 6,
2024-03-15 20:07:35 +01:00
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Wyświetlenie 5 pierwszych wierszy\n",
"df.head()"
],
"metadata": {
"collapsed": false,
"ExecuteTime": {
"end_time": "2024-03-27T11:21:01.360029700Z",
"start_time": "2024-03-27T11:21:01.325157900Z"
2024-03-15 20:07:35 +01:00
}
},
"id": "db9cfc5e73a4da57"
},
{
"cell_type": "code",
"execution_count": 7,
2024-03-15 20:07:35 +01:00
"outputs": [],
"source": [
"# Normalizacja cech do wartości z przedziału [0, 1]\n",
"scaler = MinMaxScaler()\n",
"\n",
"df[df.columns[1:]] = scaler.fit_transform(df[df.columns[1:]])"
],
"metadata": {
"collapsed": false,
"ExecuteTime": {
"end_time": "2024-03-27T11:21:01.385350900Z",
"start_time": "2024-03-27T11:21:01.358019600Z"
2024-03-15 20:07:35 +01:00
}
},
"id": "f8513c47a4a1f844"
},
{
"cell_type": "markdown",
"source": [
"#### 3. Podział danych na zbiór treningowy, walidacyjny i testowy"
],
"metadata": {
"collapsed": false
},
"id": "7d74496029e594b1"
},
{
"cell_type": "code",
"execution_count": 8,
2024-03-15 20:07:35 +01:00
"outputs": [],
"source": [
"# Podział zbioru na zbiór treningowy, walidacyjny i testowy w proporcji 80/10/10\n",
"df_train, df_val_test = train_test_split(df, test_size=0.2, random_state=1234)\n",
"df_val, df_test = train_test_split(df_val_test, test_size=0.5, random_state=1234)"
],
"metadata": {
"collapsed": false,
"ExecuteTime": {
"end_time": "2024-03-27T11:21:01.416540300Z",
"start_time": "2024-03-27T11:21:01.372555800Z"
2024-03-15 20:07:35 +01:00
}
},
"id": "651b6bf8d1dd8e6d"
},
{
"cell_type": "code",
"execution_count": 9,
2024-03-15 20:07:35 +01:00
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Cały zbiór: 569 wierszy, 31 kolumn\n",
"Zbiór treningowy: 455 wierszy, 31 kolumn\n",
"Zbiór walidacyjny: 57 wierszy, 31 kolumn\n",
"Zbiór testowy: 57 wierszy, 31 kolumn\n"
]
}
],
"source": [
"# Wymiary zbiorów i podzbiorów\n",
"print(f\"Cały zbiór: {df.shape[0]} wierszy, {df.shape[1]} kolumn\")\n",
"print(f\"Zbiór treningowy: {df_train.shape[0]} wierszy, {df_train.shape[1]} kolumn\")\n",
"print(f\"Zbiór walidacyjny: {df_val.shape[0]} wierszy, {df_val.shape[1]} kolumn\")\n",
"print(f\"Zbiór testowy: {df_test.shape[0]} wierszy, {df_test.shape[1]} kolumn\")"
],
"metadata": {
"collapsed": false,
"ExecuteTime": {
"end_time": "2024-03-27T11:21:01.539113800Z",
"start_time": "2024-03-27T11:21:01.388350400Z"
2024-03-15 20:07:35 +01:00
}
},
"id": "97f1bdbc7597c39f"
},
{
"cell_type": "markdown",
"source": [
"#### Statystyki dla cech numerycznych (średnia, odchylenie standardowe, min, max, kwantyle)"
],
"metadata": {
"collapsed": false
},
"id": "9014307b7d26b73f"
},
{
"cell_type": "code",
"execution_count": 10,
2024-03-15 20:07:35 +01:00
"outputs": [
{
"data": {
"text/plain": " radius_mean texture_mean perimeter_mean area_mean smoothness_mean \\\ncount 569.000000 569.000000 569.000000 569.000000 569.000000 \nmean 0.338222 0.323965 0.332935 0.216920 0.394785 \nstd 0.166787 0.145453 0.167915 0.149274 0.126967 \nmin 0.000000 0.000000 0.000000 0.000000 0.000000 \n25% 0.223342 0.218465 0.216847 0.117413 0.304595 \n50% 0.302381 0.308759 0.293345 0.172895 0.390358 \n75% 0.416442 0.408860 0.416765 0.271135 0.475490 \nmax 1.000000 1.000000 1.000000 1.000000 1.000000 \n\n compactness_mean concavity_mean concave points_mean symmetry_mean \\\ncount 569.000000 569.000000 569.000000 569.000000 \nmean 0.260601 0.208058 0.243137 0.379605 \nstd 0.161992 0.186785 0.192857 0.138456 \nmin 0.000000 0.000000 0.000000 0.000000 \n25% 0.139685 0.069260 0.100944 0.282323 \n50% 0.224679 0.144189 0.166501 0.369697 \n75% 0.340531 0.306232 0.367793 0.453030 \nmax 1.000000 1.000000 1.000000 1.000000 \n\n fractal_dimension_mean ... radius_worst texture_worst \\\ncount 569.000000 ... 569.000000 569.000000 \nmean 0.270379 ... 0.296663 0.363998 \nstd 0.148702 ... 0.171940 0.163813 \nmin 0.000000 ... 0.000000 0.000000 \n25% 0.163016 ... 0.180719 0.241471 \n50% 0.243892 ... 0.250445 0.356876 \n75% 0.340354 ... 0.386339 0.471748 \nmax 1.000000 ... 
1.000000 1.000000 \n\n perimeter_worst area_worst smoothness_worst compactness_worst \\\ncount 569.000000 569.000000 569.000000 569.000000 \nmean 0.283138 0.170906 0.404138 0.220212 \nstd 0.167352 0.139932 0.150779 0.152649 \nmin 0.000000 0.000000 0.000000 0.000000 \n25% 0.167837 0.081130 0.300007 0.116337 \n50% 0.235320 0.123206 0.397081 0.179110 \n75% 0.373475 0.220901 0.494156 0.302520 \nmax 1.000000 1.000000 1.000000 1.000000 \n\n concavity_worst concave points_worst symmetry_worst \\\ncount 569.000000 569.000000 569.000000 \nmean 0.217403 0.393836 0.263307 \nstd 0.166633 0.225884 0.121954 \nmin 0.000000 0.000000 0.000000 \n25% 0.091454 0.223127 0.185098 \n50% 0.181070 0.343402 0.247782 \n75% 0.305831 0.554639 0.318155 \nmax 1.000000 1.000000 1.000000 \n\n fractal_dimension_worst \ncount 569.000000 \nmean 0.189596 \nstd 0.118466 \nmin 0.000000 \n25% 0.107700 \n50% 0.163977 \n75% 0.242949 \nmax 1.000000 \n\n[8 rows x 30 columns]",
"text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>radius_mean</th>\n <th>texture_mean</th>\n <th>perimeter_mean</th>\n <th>area_mean</th>\n <th>smoothness_mean</th>\n <th>compactness_mean</th>\n <th>concavity_mean</th>\n <th>concave points_mean</th>\n <th>symmetry_mean</th>\n <th>fractal_dimension_mean</th>\n <th>...</th>\n <th>radius_worst</th>\n <th>texture_worst</th>\n <th>perimeter_worst</th>\n <th>area_worst</th>\n <th>smoothness_worst</th>\n <th>compactness_worst</th>\n <th>concavity_worst</th>\n <th>concave points_worst</th>\n <th>symmetry_worst</th>\n <th>fractal_dimension_worst</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>count</th>\n <td>569.000000</td>\n <td>569.000000</td>\n <td>569.000000</td>\n <td>569.000000</td>\n <td>569.000000</td>\n <td>569.000000</td>\n <td>569.000000</td>\n <td>569.000000</td>\n <td>569.000000</td>\n <td>569.000000</td>\n <td>...</td>\n <td>569.000000</td>\n <td>569.000000</td>\n <td>569.000000</td>\n <td>569.000000</td>\n <td>569.000000</td>\n <td>569.000000</td>\n <td>569.000000</td>\n <td>569.000000</td>\n <td>569.000000</td>\n <td>569.000000</td>\n </tr>\n <tr>\n <th>mean</th>\n <td>0.338222</td>\n <td>0.323965</td>\n <td>0.332935</td>\n <td>0.216920</td>\n <td>0.394785</td>\n <td>0.260601</td>\n <td>0.208058</td>\n <td>0.243137</td>\n <td>0.379605</td>\n <td>0.270379</td>\n <td>...</td>\n <td>0.296663</td>\n <td>0.363998</td>\n <td>0.283138</td>\n <td>0.170906</td>\n <td>0.404138</td>\n <td>0.220212</td>\n <td>0.217403</td>\n <td>0.393836</td>\n <td>0.263307</td>\n <td>0.189596</td>\n </tr>\n <tr>\n <th>std</th>\n <td>0.166787</td>\n <td>0.145453</td>\n <td>0.167915</td>\n <td>0.149274</td>\n <td>0.126967</td>\n <td>0.161992</td>\n 
<td>0.186785</td>\n <td>0.192857</td>\n <td>0.138456</td>\n <td>0.148702</td>\n <td>...</td>\n <td>0.171940</td>\n <td>0.163813</td>\n <td>0.167352</td>\n <td>0.139932</td>\n <td>0.150779</td>\n <td>0.152649</td>\n <td>0.166633</td>\n <td>0.225884</td>\n <td>0.121954</td>\n <td>0.118466</td>\n </tr>\n <tr>\n <th>min</th>\n <td>0.000000</td>\n <td>0.000000</td>\n <td>0.000000</td>\n <td>0.000000</td>\n <td>0.000000</td>\n <td>0.000000</td>\n <td>0.000000</td>\n <td>0.000000</td>\n <td>0.000000</td>\n <td>0.000000</td>\n <td>...</td>\n <td>0.000000</td>\n <td>0.000000</td>\n <td>0.000000</td>\n <td>0.000000</td>\n <td>0.000000</td>\n <td>0.000000</td>\n <td>0.000000</td>\n <td>0.000000</td>\n <td>0.000000</td>\n <td>0.000000</td>\n </tr>\n <tr>\n <th>25%</th>\n <td>0.223342</td>\n <td>0.218465</td>\n <td>0.216847</td>\n <td>0.117413</td>\n <td>0.304595</td>\n <td>0.139685</td>\n <td>0.069260</td>\n <td>0.100944</td>\n <td>0.282323</td>\n <td>0.163016</td>\n <td>...</td>\n <td>0.180719</td>\n <td>0.241471</td>\n <td>0.167837</td>\n <td>0.081130</td>\n <td>0.300007</td>\n <td>0.116337</td>\n <td>0.091454</td>\n <td>0.223127</td>\n <td>0.185098</td>\n <td>0.107700</td>\n </tr>\n <tr>\n <th>50%</th>\n <td>0.302381</td>\n <td>0.308759</td>\n <td>0.293345</td>\n <td>0.172895</td>\n <td>0.390358</td>\n <td>0.224679</
},
"execution_count": 10,
2024-03-15 20:07:35 +01:00
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Cały zbiór\n",
"df.describe()"
],
"metadata": {
"collapsed": false,
"ExecuteTime": {
"end_time": "2024-03-27T11:21:01.651383Z",
"start_time": "2024-03-27T11:21:01.403035300Z"
2024-03-15 20:07:35 +01:00
}
},
"id": "4f0c97b4de052a0c"
},
{
"cell_type": "code",
"execution_count": 11,
2024-03-15 20:07:35 +01:00
"outputs": [
{
"data": {
"text/plain": " radius_mean texture_mean perimeter_mean area_mean smoothness_mean \\\ncount 455.000000 455.000000 455.000000 455.000000 455.000000 \nmean 0.338949 0.326381 0.333644 0.217261 0.395892 \nstd 0.165349 0.145664 0.166246 0.147801 0.126845 \nmin 0.000000 0.022658 0.000000 0.000000 0.000000 \n25% 0.225235 0.219817 0.219128 0.119321 0.304776 \n50% 0.300961 0.310450 0.295833 0.170859 0.389636 \n75% 0.415259 0.411397 0.414346 0.271113 0.476393 \nmax 1.000000 0.815015 1.000000 0.999152 0.831182 \n\n compactness_mean concavity_mean concave points_mean symmetry_mean \\\ncount 455.000000 455.000000 455.000000 455.000000 \nmean 0.260580 0.209922 0.243100 0.381150 \nstd 0.160494 0.187617 0.192341 0.136984 \nmin 0.000000 0.000000 0.000000 0.000000 \n25% 0.136096 0.068030 0.100497 0.283081 \n50% 0.230262 0.145150 0.168191 0.369697 \n75% 0.340991 0.310098 0.357952 0.453030 \nmax 0.895712 1.000000 1.000000 1.000000 \n\n fractal_dimension_mean ... radius_worst texture_worst \\\ncount 455.000000 ... 455.000000 455.000000 \nmean 0.270577 ... 0.297118 0.369023 \nstd 0.147338 ... 0.170654 0.166060 \nmin 0.000000 ... 0.000000 0.012527 \n25% 0.168176 ... 0.182675 0.248801 \n50% 0.241786 ... 0.250445 0.358742 \n75% 0.340354 ... 0.377090 0.481343 \nmax 1.000000 ... 
0.896478 1.000000 \n\n perimeter_worst area_worst smoothness_worst compactness_worst \\\ncount 455.000000 455.000000 455.000000 455.000000 \nmean 0.284164 0.171085 0.407890 0.221950 \nstd 0.166564 0.138560 0.153644 0.156748 \nmin 0.000000 0.000000 0.000000 0.000000 \n25% 0.169929 0.081781 0.303308 0.116871 \n50% 0.235370 0.123206 0.396421 0.181244 \n75% 0.369740 0.209964 0.496467 0.299318 \nmax 0.890931 0.797975 1.000000 1.000000 \n\n concavity_worst concave points_worst symmetry_worst \\\ncount 455.000000 455.000000 455.000000 \nmean 0.221686 0.394672 0.263384 \nstd 0.172039 0.226663 0.119284 \nmin 0.000000 0.000000 0.000000 \n25% 0.092212 0.221753 0.188252 \n50% 0.184505 0.347079 0.248571 \n75% 0.309265 0.558935 0.317465 \nmax 1.000000 1.000000 1.000000 \n\n fractal_dimension_worst \ncount 455.000000 \nmean 0.191552 \nstd 0.122769 \nmin 0.000000 \n25% 0.107110 \n50% 0.164305 \n75% 0.242785 \nmax 1.000000 \n\n[8 rows x 30 columns]",
"text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>radius_mean</th>\n <th>texture_mean</th>\n <th>perimeter_mean</th>\n <th>area_mean</th>\n <th>smoothness_mean</th>\n <th>compactness_mean</th>\n <th>concavity_mean</th>\n <th>concave points_mean</th>\n <th>symmetry_mean</th>\n <th>fractal_dimension_mean</th>\n <th>...</th>\n <th>radius_worst</th>\n <th>texture_worst</th>\n <th>perimeter_worst</th>\n <th>area_worst</th>\n <th>smoothness_worst</th>\n <th>compactness_worst</th>\n <th>concavity_worst</th>\n <th>concave points_worst</th>\n <th>symmetry_worst</th>\n <th>fractal_dimension_worst</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>count</th>\n <td>455.000000</td>\n <td>455.000000</td>\n <td>455.000000</td>\n <td>455.000000</td>\n <td>455.000000</td>\n <td>455.000000</td>\n <td>455.000000</td>\n <td>455.000000</td>\n <td>455.000000</td>\n <td>455.000000</td>\n <td>...</td>\n <td>455.000000</td>\n <td>455.000000</td>\n <td>455.000000</td>\n <td>455.000000</td>\n <td>455.000000</td>\n <td>455.000000</td>\n <td>455.000000</td>\n <td>455.000000</td>\n <td>455.000000</td>\n <td>455.000000</td>\n </tr>\n <tr>\n <th>mean</th>\n <td>0.338949</td>\n <td>0.326381</td>\n <td>0.333644</td>\n <td>0.217261</td>\n <td>0.395892</td>\n <td>0.260580</td>\n <td>0.209922</td>\n <td>0.243100</td>\n <td>0.381150</td>\n <td>0.270577</td>\n <td>...</td>\n <td>0.297118</td>\n <td>0.369023</td>\n <td>0.284164</td>\n <td>0.171085</td>\n <td>0.407890</td>\n <td>0.221950</td>\n <td>0.221686</td>\n <td>0.394672</td>\n <td>0.263384</td>\n <td>0.191552</td>\n </tr>\n <tr>\n <th>std</th>\n <td>0.165349</td>\n <td>0.145664</td>\n <td>0.166246</td>\n <td>0.147801</td>\n <td>0.126845</td>\n <td>0.160494</td>\n 
<td>0.187617</td>\n <td>0.192341</td>\n <td>0.136984</td>\n <td>0.147338</td>\n <td>...</td>\n <td>0.170654</td>\n <td>0.166060</td>\n <td>0.166564</td>\n <td>0.138560</td>\n <td>0.153644</td>\n <td>0.156748</td>\n <td>0.172039</td>\n <td>0.226663</td>\n <td>0.119284</td>\n <td>0.122769</td>\n </tr>\n <tr>\n <th>min</th>\n <td>0.000000</td>\n <td>0.022658</td>\n <td>0.000000</td>\n <td>0.000000</td>\n <td>0.000000</td>\n <td>0.000000</td>\n <td>0.000000</td>\n <td>0.000000</td>\n <td>0.000000</td>\n <td>0.000000</td>\n <td>...</td>\n <td>0.000000</td>\n <td>0.012527</td>\n <td>0.000000</td>\n <td>0.000000</td>\n <td>0.000000</td>\n <td>0.000000</td>\n <td>0.000000</td>\n <td>0.000000</td>\n <td>0.000000</td>\n <td>0.000000</td>\n </tr>\n <tr>\n <th>25%</th>\n <td>0.225235</td>\n <td>0.219817</td>\n <td>0.219128</td>\n <td>0.119321</td>\n <td>0.304776</td>\n <td>0.136096</td>\n <td>0.068030</td>\n <td>0.100497</td>\n <td>0.283081</td>\n <td>0.168176</td>\n <td>...</td>\n <td>0.182675</td>\n <td>0.248801</td>\n <td>0.169929</td>\n <td>0.081781</td>\n <td>0.303308</td>\n <td>0.116871</td>\n <td>0.092212</td>\n <td>0.221753</td>\n <td>0.188252</td>\n <td>0.107110</td>\n </tr>\n <tr>\n <th>50%</th>\n <td>0.300961</td>\n <td>0.310450</td>\n <td>0.295833</td>\n <td>0.170859</td>\n <td>0.389636</td>\n <td>0.230262</
},
"execution_count": 11,
2024-03-15 20:07:35 +01:00
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Zbiór treningowy\n",
"df_train.describe()"
],
"metadata": {
"collapsed": false,
"ExecuteTime": {
"end_time": "2024-03-27T11:21:01.652394400Z",
"start_time": "2024-03-27T11:21:01.450048100Z"
2024-03-15 20:07:35 +01:00
}
},
"id": "a6594457c1209a45"
},
{
"cell_type": "code",
"execution_count": 12,
2024-03-15 20:07:35 +01:00
"outputs": [
{
"data": {
"text/plain": " radius_mean texture_mean perimeter_mean area_mean smoothness_mean \\\ncount 57.000000 57.000000 57.000000 57.000000 57.000000 \nmean 0.334850 0.315985 0.331800 0.216332 0.399432 \nstd 0.176956 0.119165 0.181708 0.171264 0.144361 \nmin 0.089782 0.106865 0.089489 0.041357 0.167193 \n25% 0.225709 0.226581 0.223205 0.117413 0.293220 \n50% 0.282976 0.290159 0.279110 0.161909 0.396678 \n75% 0.413129 0.389922 0.405017 0.260912 0.471879 \nmax 0.967343 0.623267 0.988943 1.000000 1.000000 \n\n compactness_mean concavity_mean concave points_mean symmetry_mean \\\ncount 57.000000 57.000000 57.000000 57.000000 \nmean 0.279956 0.216035 0.254466 0.387684 \nstd 0.184302 0.208557 0.203610 0.160009 \nmin 0.046500 0.003622 0.027793 0.078283 \n25% 0.165235 0.076406 0.112326 0.282323 \n50% 0.222195 0.127413 0.154026 0.381818 \n75% 0.359242 0.296626 0.371918 0.464141 \nmax 1.000000 0.879569 0.839463 0.932323 \n\n fractal_dimension_mean ... radius_worst texture_worst \\\ncount 57.000000 ... 57.000000 57.000000 \nmean 0.281045 ... 0.294395 0.357428 \nstd 0.161084 ... 0.180500 0.130661 \nmin 0.047810 ... 0.072963 0.116205 \n25% 0.169545 ... 0.180719 0.267058 \n50% 0.258214 ... 0.225187 0.348348 \n75% 0.342249 ... 0.355034 0.463486 \nmax 0.949031 ... 
1.000000 0.619670 \n\n perimeter_worst area_worst smoothness_worst compactness_worst \\\ncount 57.000000 57.000000 57.000000 57.000000 \nmean 0.281028 0.170796 0.391354 0.222054 \nstd 0.179389 0.161083 0.140480 0.133225 \nmin 0.074008 0.028264 0.085320 0.019239 \n25% 0.162757 0.082653 0.301327 0.126233 \n50% 0.228597 0.109836 0.416232 0.182893 \n75% 0.343593 0.197454 0.484911 0.308632 \nmax 1.000000 1.000000 0.786040 0.571557 \n\n concavity_worst concave points_worst symmetry_worst \\\ncount 57.000000 57.000000 57.000000 \nmean 0.209888 0.405999 0.276614 \nstd 0.152149 0.214849 0.143775 \nmin 0.006176 0.095670 0.066233 \n25% 0.107827 0.248179 0.198502 \n50% 0.164537 0.352234 0.234969 \n75% 0.312380 0.517182 0.323674 \nmax 0.613498 0.902062 0.829687 \n\n fractal_dimension_worst \ncount 57.000000 \nmean 0.187256 \nstd 0.103423 \nmin 0.028073 \n25% 0.107700 \n50% 0.155910 \n75% 0.260724 \nmax 0.460186 \n\n[8 rows x 30 columns]",
"text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>radius_mean</th>\n <th>texture_mean</th>\n <th>perimeter_mean</th>\n <th>area_mean</th>\n <th>smoothness_mean</th>\n <th>compactness_mean</th>\n <th>concavity_mean</th>\n <th>concave points_mean</th>\n <th>symmetry_mean</th>\n <th>fractal_dimension_mean</th>\n <th>...</th>\n <th>radius_worst</th>\n <th>texture_worst</th>\n <th>perimeter_worst</th>\n <th>area_worst</th>\n <th>smoothness_worst</th>\n <th>compactness_worst</th>\n <th>concavity_worst</th>\n <th>concave points_worst</th>\n <th>symmetry_worst</th>\n <th>fractal_dimension_worst</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>count</th>\n <td>57.000000</td>\n <td>57.000000</td>\n <td>57.000000</td>\n <td>57.000000</td>\n <td>57.000000</td>\n <td>57.000000</td>\n <td>57.000000</td>\n <td>57.000000</td>\n <td>57.000000</td>\n <td>57.000000</td>\n <td>...</td>\n <td>57.000000</td>\n <td>57.000000</td>\n <td>57.000000</td>\n <td>57.000000</td>\n <td>57.000000</td>\n <td>57.000000</td>\n <td>57.000000</td>\n <td>57.000000</td>\n <td>57.000000</td>\n <td>57.000000</td>\n </tr>\n <tr>\n <th>mean</th>\n <td>0.334850</td>\n <td>0.315985</td>\n <td>0.331800</td>\n <td>0.216332</td>\n <td>0.399432</td>\n <td>0.279956</td>\n <td>0.216035</td>\n <td>0.254466</td>\n <td>0.387684</td>\n <td>0.281045</td>\n <td>...</td>\n <td>0.294395</td>\n <td>0.357428</td>\n <td>0.281028</td>\n <td>0.170796</td>\n <td>0.391354</td>\n <td>0.222054</td>\n <td>0.209888</td>\n <td>0.405999</td>\n <td>0.276614</td>\n <td>0.187256</td>\n </tr>\n <tr>\n <th>std</th>\n <td>0.176956</td>\n <td>0.119165</td>\n <td>0.181708</td>\n <td>0.171264</td>\n <td>0.144361</td>\n <td>0.184302</td>\n <td>0.208557</td>\n 
<td>0.203610</td>\n <td>0.160009</td>\n <td>0.161084</td>\n <td>...</td>\n <td>0.180500</td>\n <td>0.130661</td>\n <td>0.179389</td>\n <td>0.161083</td>\n <td>0.140480</td>\n <td>0.133225</td>\n <td>0.152149</td>\n <td>0.214849</td>\n <td>0.143775</td>\n <td>0.103423</td>\n </tr>\n <tr>\n <th>min</th>\n <td>0.089782</td>\n <td>0.106865</td>\n <td>0.089489</td>\n <td>0.041357</td>\n <td>0.167193</td>\n <td>0.046500</td>\n <td>0.003622</td>\n <td>0.027793</td>\n <td>0.078283</td>\n <td>0.047810</td>\n <td>...</td>\n <td>0.072963</td>\n <td>0.116205</td>\n <td>0.074008</td>\n <td>0.028264</td>\n <td>0.085320</td>\n <td>0.019239</td>\n <td>0.006176</td>\n <td>0.095670</td>\n <td>0.066233</td>\n <td>0.028073</td>\n </tr>\n <tr>\n <th>25%</th>\n <td>0.225709</td>\n <td>0.226581</td>\n <td>0.223205</td>\n <td>0.117413</td>\n <td>0.293220</td>\n <td>0.165235</td>\n <td>0.076406</td>\n <td>0.112326</td>\n <td>0.282323</td>\n <td>0.169545</td>\n <td>...</td>\n <td>0.180719</td>\n <td>0.267058</td>\n <td>0.162757</td>\n <td>0.082653</td>\n <td>0.301327</td>\n <td>0.126233</td>\n <td>0.107827</td>\n <td>0.248179</td>\n <td>0.198502</td>\n <td>0.107700</td>\n </tr>\n <tr>\n <th>50%</th>\n <td>0.282976</td>\n <td>0.290159</td>\n <td>0.279110</td>\n <td>0.161909</td>\n <td>0.396678</td>\n <td>0.222195</td>\n <td>0.127
},
"execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Zbiór walidacyjny\n",
"df_val.describe()"
],
"metadata": {
"collapsed": false,
"ExecuteTime": {
"end_time": "2024-03-27T11:21:01.654387800Z",
"start_time": "2024-03-27T11:21:01.498448500Z"
}
},
"id": "5ce8c6c658be60c1"
},
{
"cell_type": "code",
"execution_count": 13,
"outputs": [
{
"data": {
"text/plain": " radius_mean texture_mean perimeter_mean area_mean smoothness_mean \\\ncount 57.000000 57.000000 57.000000 57.000000 57.000000 \nmean 0.335794 0.312663 0.328412 0.214790 0.381297 \nstd 0.170785 0.167714 0.169927 0.139898 0.109590 \nmin 0.075252 0.000000 0.072904 0.033001 0.146249 \n25% 0.192106 0.199188 0.185751 0.097222 0.308658 \n50% 0.315159 0.310450 0.301223 0.180445 0.390990 \n75% 0.486961 0.396348 0.481031 0.329629 0.460143 \nmax 0.692366 1.000000 0.695253 0.535949 0.578406 \n\n compactness_mean concavity_mean concave points_mean symmetry_mean \\\ncount 57.000000 57.000000 57.000000 57.000000 \nmean 0.241410 0.185207 0.232105 0.359197 \nstd 0.150381 0.156133 0.188722 0.127273 \nmin 0.021839 0.002798 0.011948 0.072222 \n25% 0.129471 0.062910 0.095179 0.271717 \n50% 0.203239 0.119845 0.154573 0.361616 \n75% 0.330102 0.261246 0.383996 0.439899 \nmax 0.809214 0.658388 0.776342 0.674242 \n\n fractal_dimension_mean ... radius_worst texture_worst \\\ncount 57.000000 ... 57.000000 57.000000 \nmean 0.258133 ... 0.295299 0.330458 \nstd 0.148563 ... 0.176543 0.173641 \nmin 0.021061 ... 0.054891 0.000000 \n25% 0.139217 ... 0.146211 0.199893 \n50% 0.246420 ... 0.256492 0.325160 \n75% 0.339090 ... 0.429740 0.425640 \nmax 0.839090 ... 
0.667022 0.875533 \n\n perimeter_worst area_worst smoothness_worst compactness_worst \\\ncount 57.000000 57.000000 57.000000 57.000000 \nmean 0.277057 0.169590 0.386967 0.204497 \nstd 0.164054 0.130427 0.137458 0.138294 \nmin 0.047263 0.022046 0.111074 0.015504 \n25% 0.146023 0.064368 0.285478 0.098777 \n50% 0.245530 0.128048 0.396421 0.172318 \n75% 0.410827 0.256046 0.478307 0.280205 \nmax 0.627970 0.467902 0.674437 0.709327 \n\n concavity_worst concave points_worst symmetry_worst \\\ncount 57.000000 57.000000 57.000000 \nmean 0.190725 0.374998 0.249384 \nstd 0.132668 0.233099 0.120215 \nmin 0.002860 0.035808 0.081411 \n25% 0.086981 0.202131 0.159866 \n50% 0.155511 0.292509 0.234772 \n75% 0.275240 0.520619 0.304356 \nmax 0.563339 0.997595 0.622708 \n\n fractal_dimension_worst \ncount 57.000000 \nmean 0.176326 \nstd 0.096130 \nmin 0.001115 \n25% 0.113210 \n50% 0.160042 \n75% 0.231011 \nmax 0.481175 \n\n[8 rows x 30 columns]",
"text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>radius_mean</th>\n <th>texture_mean</th>\n <th>perimeter_mean</th>\n <th>area_mean</th>\n <th>smoothness_mean</th>\n <th>compactness_mean</th>\n <th>concavity_mean</th>\n <th>concave points_mean</th>\n <th>symmetry_mean</th>\n <th>fractal_dimension_mean</th>\n <th>...</th>\n <th>radius_worst</th>\n <th>texture_worst</th>\n <th>perimeter_worst</th>\n <th>area_worst</th>\n <th>smoothness_worst</th>\n <th>compactness_worst</th>\n <th>concavity_worst</th>\n <th>concave points_worst</th>\n <th>symmetry_worst</th>\n <th>fractal_dimension_worst</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>count</th>\n <td>57.000000</td>\n <td>57.000000</td>\n <td>57.000000</td>\n <td>57.000000</td>\n <td>57.000000</td>\n <td>57.000000</td>\n <td>57.000000</td>\n <td>57.000000</td>\n <td>57.000000</td>\n <td>57.000000</td>\n <td>...</td>\n <td>57.000000</td>\n <td>57.000000</td>\n <td>57.000000</td>\n <td>57.000000</td>\n <td>57.000000</td>\n <td>57.000000</td>\n <td>57.000000</td>\n <td>57.000000</td>\n <td>57.000000</td>\n <td>57.000000</td>\n </tr>\n <tr>\n <th>mean</th>\n <td>0.335794</td>\n <td>0.312663</td>\n <td>0.328412</td>\n <td>0.214790</td>\n <td>0.381297</td>\n <td>0.241410</td>\n <td>0.185207</td>\n <td>0.232105</td>\n <td>0.359197</td>\n <td>0.258133</td>\n <td>...</td>\n <td>0.295299</td>\n <td>0.330458</td>\n <td>0.277057</td>\n <td>0.169590</td>\n <td>0.386967</td>\n <td>0.204497</td>\n <td>0.190725</td>\n <td>0.374998</td>\n <td>0.249384</td>\n <td>0.176326</td>\n </tr>\n <tr>\n <th>std</th>\n <td>0.170785</td>\n <td>0.167714</td>\n <td>0.169927</td>\n <td>0.139898</td>\n <td>0.109590</td>\n <td>0.150381</td>\n <td>0.156133</td>\n 
<td>0.188722</td>\n <td>0.127273</td>\n <td>0.148563</td>\n <td>...</td>\n <td>0.176543</td>\n <td>0.173641</td>\n <td>0.164054</td>\n <td>0.130427</td>\n <td>0.137458</td>\n <td>0.138294</td>\n <td>0.132668</td>\n <td>0.233099</td>\n <td>0.120215</td>\n <td>0.096130</td>\n </tr>\n <tr>\n <th>min</th>\n <td>0.075252</td>\n <td>0.000000</td>\n <td>0.072904</td>\n <td>0.033001</td>\n <td>0.146249</td>\n <td>0.021839</td>\n <td>0.002798</td>\n <td>0.011948</td>\n <td>0.072222</td>\n <td>0.021061</td>\n <td>...</td>\n <td>0.054891</td>\n <td>0.000000</td>\n <td>0.047263</td>\n <td>0.022046</td>\n <td>0.111074</td>\n <td>0.015504</td>\n <td>0.002860</td>\n <td>0.035808</td>\n <td>0.081411</td>\n <td>0.001115</td>\n </tr>\n <tr>\n <th>25%</th>\n <td>0.192106</td>\n <td>0.199188</td>\n <td>0.185751</td>\n <td>0.097222</td>\n <td>0.308658</td>\n <td>0.129471</td>\n <td>0.062910</td>\n <td>0.095179</td>\n <td>0.271717</td>\n <td>0.139217</td>\n <td>...</td>\n <td>0.146211</td>\n <td>0.199893</td>\n <td>0.146023</td>\n <td>0.064368</td>\n <td>0.285478</td>\n <td>0.098777</td>\n <td>0.086981</td>\n <td>0.202131</td>\n <td>0.159866</td>\n <td>0.113210</td>\n </tr>\n <tr>\n <th>50%</th>\n <td>0.315159</td>\n <td>0.310450</td>\n <td>0.301223</td>\n <td>0.180445</td>\n <td>0.390990</td>\n <td>0.203239</td>\n <td>0.119
},
"execution_count": 13,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Zbiór testowy\n",
"df_test.describe()"
],
"metadata": {
"collapsed": false,
"ExecuteTime": {
"end_time": "2024-03-27T11:21:01.668390300Z",
"start_time": "2024-03-27T11:21:01.544332200Z"
}
},
"id": "49acb8e5dfbda89f"
},
{
"cell_type": "markdown",
"source": [
"#### Rozkład klas w zbiorze"
],
"metadata": {
"collapsed": false
},
"id": "1c07bc584a09d6b8"
},
{
"cell_type": "code",
"execution_count": 14,
"outputs": [
{
"data": {
"text/plain": "diagnosis\nB 357\nM 212\nName: count, dtype: int64"
},
"execution_count": 14,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Cały zbiór\n",
"df['diagnosis'].value_counts()"
],
"metadata": {
"collapsed": false,
"ExecuteTime": {
"end_time": "2024-03-27T11:21:01.668390300Z",
"start_time": "2024-03-27T11:21:01.592136900Z"
}
},
"id": "1e655fa9f8e48ff9"
},
{
"cell_type": "code",
"execution_count": 15,
"outputs": [
{
"data": {
"text/plain": "diagnosis\nB 288\nM 167\nName: count, dtype: int64"
},
"execution_count": 15,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Zbiór treningowy\n",
"df_train['diagnosis'].value_counts()"
],
"metadata": {
"collapsed": false,
"ExecuteTime": {
"end_time": "2024-03-27T11:21:01.704119700Z",
"start_time": "2024-03-27T11:21:01.607147900Z"
}
},
"id": "d66c931287444033"
},
{
"cell_type": "code",
"execution_count": 16,
"outputs": [
{
"data": {
"text/plain": "diagnosis\nB 35\nM 22\nName: count, dtype: int64"
},
"execution_count": 16,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Zbiór walidacyjny\n",
"df_val['diagnosis'].value_counts()"
],
"metadata": {
"collapsed": false,
"ExecuteTime": {
"end_time": "2024-03-27T11:21:01.704119700Z",
"start_time": "2024-03-27T11:21:01.621660400Z"
}
},
"id": "e3f1504600da351b"
},
{
"cell_type": "code",
"execution_count": 17,
"outputs": [
{
"data": {
"text/plain": "diagnosis\nB 34\nM 23\nName: count, dtype: int64"
},
"execution_count": 17,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Zbiór testowy\n",
"df_test['diagnosis'].value_counts()"
],
"metadata": {
"collapsed": false,
"ExecuteTime": {
"end_time": "2024-03-27T11:21:01.705119900Z",
"start_time": "2024-03-27T11:21:01.637872300Z"
}
},
"id": "de5b287d4c68a1ec"
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "2.7.6"
}
},
"nbformat": 4,
"nbformat_minor": 5
}