ium_478855/02_Dane.ipynb
2022-04-03 19:10:49 +02:00

794 lines
36 KiB
Plaintext
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Requirement already satisfied: kaggle in c:\\programy\\anaconda3\\envs\\ium\\lib\\site-packages (1.5.12)\n",
"Requirement already satisfied: tqdm in c:\\programy\\anaconda3\\envs\\ium\\lib\\site-packages (from kaggle) (4.63.0)\n",
"Requirement already satisfied: certifi in c:\\programy\\anaconda3\\envs\\ium\\lib\\site-packages (from kaggle) (2021.10.8)\n",
"Requirement already satisfied: six>=1.10 in c:\\programy\\anaconda3\\envs\\ium\\lib\\site-packages (from kaggle) (1.16.0)\n",
"Requirement already satisfied: requests in c:\\programy\\anaconda3\\envs\\ium\\lib\\site-packages (from kaggle) (2.27.1)\n",
"Requirement already satisfied: python-slugify in c:\\programy\\anaconda3\\envs\\ium\\lib\\site-packages (from kaggle) (6.1.1)\n",
"Requirement already satisfied: urllib3 in c:\\programy\\anaconda3\\envs\\ium\\lib\\site-packages (from kaggle) (1.26.9)\n",
"Requirement already satisfied: python-dateutil in c:\\programy\\anaconda3\\envs\\ium\\lib\\site-packages (from kaggle) (2.8.2)\n",
"Requirement already satisfied: text-unidecode>=1.3 in c:\\programy\\anaconda3\\envs\\ium\\lib\\site-packages (from python-slugify->kaggle) (1.3)\n",
"Requirement already satisfied: charset-normalizer~=2.0.0 in c:\\programy\\anaconda3\\envs\\ium\\lib\\site-packages (from requests->kaggle) (2.0.12)\n",
"Requirement already satisfied: idna<4,>=2.5 in c:\\programy\\anaconda3\\envs\\ium\\lib\\site-packages (from requests->kaggle) (3.3)\n",
"Requirement already satisfied: colorama in c:\\programy\\anaconda3\\envs\\ium\\lib\\site-packages (from tqdm->kaggle) (0.4.4)\n",
"Requirement already satisfied: pandas in c:\\programy\\anaconda3\\envs\\ium\\lib\\site-packages (1.3.5)\n",
"Requirement already satisfied: pytz>=2017.3 in c:\\programy\\anaconda3\\envs\\ium\\lib\\site-packages (from pandas) (2022.1)\n",
"Requirement already satisfied: python-dateutil>=2.7.3 in c:\\programy\\anaconda3\\envs\\ium\\lib\\site-packages (from pandas) (2.8.2)\n",
"Requirement already satisfied: numpy>=1.17.3 in c:\\programy\\anaconda3\\envs\\ium\\lib\\site-packages (from pandas) (1.21.5)\n",
"Requirement already satisfied: six>=1.5 in c:\\programy\\anaconda3\\envs\\ium\\lib\\site-packages (from python-dateutil>=2.7.3->pandas) (1.16.0)\n",
"Requirement already satisfied: seaborn in c:\\programy\\anaconda3\\envs\\ium\\lib\\site-packages (0.11.2)\n",
"Requirement already satisfied: scipy>=1.0 in c:\\programy\\anaconda3\\envs\\ium\\lib\\site-packages (from seaborn) (1.7.3)\n",
"Requirement already satisfied: numpy>=1.15 in c:\\programy\\anaconda3\\envs\\ium\\lib\\site-packages (from seaborn) (1.21.5)\n",
"Requirement already satisfied: matplotlib>=2.2 in c:\\programy\\anaconda3\\envs\\ium\\lib\\site-packages (from seaborn) (3.5.1)\n",
"Requirement already satisfied: pandas>=0.23 in c:\\programy\\anaconda3\\envs\\ium\\lib\\site-packages (from seaborn) (1.3.5)\n",
"Requirement already satisfied: fonttools>=4.22.0 in c:\\programy\\anaconda3\\envs\\ium\\lib\\site-packages (from matplotlib>=2.2->seaborn) (4.31.1)\n",
"Requirement already satisfied: pyparsing>=2.2.1 in c:\\programy\\anaconda3\\envs\\ium\\lib\\site-packages (from matplotlib>=2.2->seaborn) (3.0.7)\n",
"Requirement already satisfied: cycler>=0.10 in c:\\programy\\anaconda3\\envs\\ium\\lib\\site-packages (from matplotlib>=2.2->seaborn) (0.11.0)\n",
"Requirement already satisfied: packaging>=20.0 in c:\\programy\\anaconda3\\envs\\ium\\lib\\site-packages (from matplotlib>=2.2->seaborn) (21.3)\n",
"Requirement already satisfied: python-dateutil>=2.7 in c:\\programy\\anaconda3\\envs\\ium\\lib\\site-packages (from matplotlib>=2.2->seaborn) (2.8.2)\n",
"Requirement already satisfied: pillow>=6.2.0 in c:\\programy\\anaconda3\\envs\\ium\\lib\\site-packages (from matplotlib>=2.2->seaborn) (9.0.1)\n",
"Requirement already satisfied: kiwisolver>=1.0.1 in c:\\programy\\anaconda3\\envs\\ium\\lib\\site-packages (from matplotlib>=2.2->seaborn) (1.4.0)\n",
"Requirement already satisfied: typing-extensions in c:\\programy\\anaconda3\\envs\\ium\\lib\\site-packages (from kiwisolver>=1.0.1->matplotlib>=2.2->seaborn) (4.1.1)\n",
"Requirement already satisfied: pytz>=2017.3 in c:\\programy\\anaconda3\\envs\\ium\\lib\\site-packages (from pandas>=0.23->seaborn) (2022.1)\n",
"Requirement already satisfied: six>=1.5 in c:\\programy\\anaconda3\\envs\\ium\\lib\\site-packages (from python-dateutil>=2.7->matplotlib>=2.2->seaborn) (1.16.0)\n",
"Requirement already satisfied: torch in c:\\programy\\anaconda3\\envs\\ium\\lib\\site-packages (1.11.0)\n",
"Requirement already satisfied: typing-extensions in c:\\programy\\anaconda3\\envs\\ium\\lib\\site-packages (from torch) (4.1.1)\n"
]
}
],
"source": [
"# Install the notebook's dependencies.\n",
"# %pip (unlike !pip) installs into the environment of the running kernel.\n",
"# Versions are pinned to the ones recorded in this notebook's saved output.\n",
"%pip install kaggle==1.5.12\n",
"%pip install pandas==1.3.5\n",
"# matplotlib is the backend of the pandas .plot() call in the class-frequency cell.\n",
"%pip install matplotlib==3.5.1\n",
"%pip install torch==1.11.0\n"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"401 - Unauthorized\n"
]
}
],
"source": [
"# 1 Pobranie zbioru\n",
"!kaggle datasets download -d joniarroba/noshowappointments"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"'unzip' is not recognized as an internal or external command,\n",
"operable program or batch file.\n"
]
}
],
"source": [
"# Extract the downloaded archive with the standard library instead of the\n",
"# external `unzip` program, which the saved output shows is missing on Windows.\n",
"import zipfile\n",
"from pathlib import Path\n",
"\n",
"archive_path = Path(\"noshowappointments.zip\")\n",
"if archive_path.exists():\n",
"    # extractall() overwrites files that already exist, matching `unzip -o`.\n",
"    with zipfile.ZipFile(archive_path) as archive:\n",
"        archive.extractall()\n",
"else:\n",
"    # Do not raise: a CSV placed next to the notebook by hand must still work.\n",
"    print(f\"{archive_path} not found - skipping extraction.\")"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>PatientId</th>\n",
" <th>AppointmentID</th>\n",
" <th>Gender</th>\n",
" <th>ScheduledDay</th>\n",
" <th>AppointmentDay</th>\n",
" <th>Age</th>\n",
" <th>Neighbourhood</th>\n",
" <th>Scholarship</th>\n",
" <th>Hipertension</th>\n",
" <th>Diabetes</th>\n",
" <th>Alcoholism</th>\n",
" <th>Handcap</th>\n",
" <th>SMS_received</th>\n",
" <th>No-show</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>2.987250e+13</td>\n",
" <td>5642903</td>\n",
" <td>F</td>\n",
" <td>2016-04-29T18:38:08Z</td>\n",
" <td>2016-04-29T00:00:00Z</td>\n",
" <td>62</td>\n",
" <td>JARDIM DA PENHA</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>No</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>5.589978e+14</td>\n",
" <td>5642503</td>\n",
" <td>M</td>\n",
" <td>2016-04-29T16:08:27Z</td>\n",
" <td>2016-04-29T00:00:00Z</td>\n",
" <td>56</td>\n",
" <td>JARDIM DA PENHA</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>No</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>4.262962e+12</td>\n",
" <td>5642549</td>\n",
" <td>F</td>\n",
" <td>2016-04-29T16:19:04Z</td>\n",
" <td>2016-04-29T00:00:00Z</td>\n",
" <td>62</td>\n",
" <td>MATA DA PRAIA</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>No</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>8.679512e+11</td>\n",
" <td>5642828</td>\n",
" <td>F</td>\n",
" <td>2016-04-29T17:29:31Z</td>\n",
" <td>2016-04-29T00:00:00Z</td>\n",
" <td>8</td>\n",
" <td>PONTAL DE CAMBURI</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>No</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>8.841186e+12</td>\n",
" <td>5642494</td>\n",
" <td>F</td>\n",
" <td>2016-04-29T16:07:23Z</td>\n",
" <td>2016-04-29T00:00:00Z</td>\n",
" <td>56</td>\n",
" <td>JARDIM DA PENHA</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>No</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>110522</th>\n",
" <td>2.572134e+12</td>\n",
" <td>5651768</td>\n",
" <td>F</td>\n",
" <td>2016-05-03T09:15:35Z</td>\n",
" <td>2016-06-07T00:00:00Z</td>\n",
" <td>56</td>\n",
" <td>MARIA ORTIZ</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>No</td>\n",
" </tr>\n",
" <tr>\n",
" <th>110523</th>\n",
" <td>3.596266e+12</td>\n",
" <td>5650093</td>\n",
" <td>F</td>\n",
" <td>2016-05-03T07:27:33Z</td>\n",
" <td>2016-06-07T00:00:00Z</td>\n",
" <td>51</td>\n",
" <td>MARIA ORTIZ</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>No</td>\n",
" </tr>\n",
" <tr>\n",
" <th>110524</th>\n",
" <td>1.557663e+13</td>\n",
" <td>5630692</td>\n",
" <td>F</td>\n",
" <td>2016-04-27T16:03:52Z</td>\n",
" <td>2016-06-07T00:00:00Z</td>\n",
" <td>21</td>\n",
" <td>MARIA ORTIZ</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>No</td>\n",
" </tr>\n",
" <tr>\n",
" <th>110525</th>\n",
" <td>9.213493e+13</td>\n",
" <td>5630323</td>\n",
" <td>F</td>\n",
" <td>2016-04-27T15:09:23Z</td>\n",
" <td>2016-06-07T00:00:00Z</td>\n",
" <td>38</td>\n",
" <td>MARIA ORTIZ</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>No</td>\n",
" </tr>\n",
" <tr>\n",
" <th>110526</th>\n",
" <td>3.775115e+14</td>\n",
" <td>5629448</td>\n",
" <td>F</td>\n",
" <td>2016-04-27T13:30:56Z</td>\n",
" <td>2016-06-07T00:00:00Z</td>\n",
" <td>54</td>\n",
" <td>MARIA ORTIZ</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>No</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>110527 rows × 14 columns</p>\n",
"</div>"
],
"text/plain": [
" PatientId AppointmentID Gender ScheduledDay \\\n",
"0 2.987250e+13 5642903 F 2016-04-29T18:38:08Z \n",
"1 5.589978e+14 5642503 M 2016-04-29T16:08:27Z \n",
"2 4.262962e+12 5642549 F 2016-04-29T16:19:04Z \n",
"3 8.679512e+11 5642828 F 2016-04-29T17:29:31Z \n",
"4 8.841186e+12 5642494 F 2016-04-29T16:07:23Z \n",
"... ... ... ... ... \n",
"110522 2.572134e+12 5651768 F 2016-05-03T09:15:35Z \n",
"110523 3.596266e+12 5650093 F 2016-05-03T07:27:33Z \n",
"110524 1.557663e+13 5630692 F 2016-04-27T16:03:52Z \n",
"110525 9.213493e+13 5630323 F 2016-04-27T15:09:23Z \n",
"110526 3.775115e+14 5629448 F 2016-04-27T13:30:56Z \n",
"\n",
" AppointmentDay Age Neighbourhood Scholarship \\\n",
"0 2016-04-29T00:00:00Z 62 JARDIM DA PENHA 0 \n",
"1 2016-04-29T00:00:00Z 56 JARDIM DA PENHA 0 \n",
"2 2016-04-29T00:00:00Z 62 MATA DA PRAIA 0 \n",
"3 2016-04-29T00:00:00Z 8 PONTAL DE CAMBURI 0 \n",
"4 2016-04-29T00:00:00Z 56 JARDIM DA PENHA 0 \n",
"... ... ... ... ... \n",
"110522 2016-06-07T00:00:00Z 56 MARIA ORTIZ 0 \n",
"110523 2016-06-07T00:00:00Z 51 MARIA ORTIZ 0 \n",
"110524 2016-06-07T00:00:00Z 21 MARIA ORTIZ 0 \n",
"110525 2016-06-07T00:00:00Z 38 MARIA ORTIZ 0 \n",
"110526 2016-06-07T00:00:00Z 54 MARIA ORTIZ 0 \n",
"\n",
" Hipertension Diabetes Alcoholism Handcap SMS_received No-show \n",
"0 1 0 0 0 0 No \n",
"1 0 0 0 0 0 No \n",
"2 0 0 0 0 0 No \n",
"3 0 0 0 0 0 No \n",
"4 1 1 0 0 0 No \n",
"... ... ... ... ... ... ... \n",
"110522 0 0 0 0 1 No \n",
"110523 0 0 0 0 1 No \n",
"110524 0 0 0 0 1 No \n",
"110525 0 0 0 0 1 No \n",
"110526 0 0 0 0 1 No \n",
"\n",
"[110527 rows x 14 columns]"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"import pandas as pd\n",
"\n",
"# Read the Kaggle appointments CSV into a DataFrame\n",
"no_shows = pd.read_csv(filepath_or_buffer='KaggleV2-May-2016.csv')\n",
"# Leaving the frame as the last expression renders pandas' truncated preview\n",
"no_shows"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"# 2. Train/test split (80% / 20%)\n",
"import torch\n",
"\n",
"# Fixed seed so that the same rows land in each subset on every run.\n",
"SPLIT_SEED = 42\n",
"\n",
"train_size = int(0.8 * len(no_shows))\n",
"test_size = len(no_shows) - train_size\n",
"\n",
"# Split row positions rather than the DataFrame itself: a Subset wrapping a\n",
"# DataFrame fetches items with dataset[i], which pandas treats as a column\n",
"# label, so the elements of such a Subset cannot be read back.\n",
"train_rows, test_rows = torch.utils.data.random_split(\n",
"    range(len(no_shows)),\n",
"    [train_size, test_size],\n",
"    generator=torch.Generator().manual_seed(SPLIT_SEED),\n",
")\n",
"\n",
"# Materialise both subsets as ordinary DataFrames.\n",
"no_shows_train = no_shows.iloc[train_rows.indices]\n",
"no_shows_test = no_shows.iloc[test_rows.indices]\n",
"assert len(no_shows_train) + len(no_shows_test) == len(no_shows)\n",
"\n",
"# NOTE(review): the cleaning and normalisation cells run after this split, so\n",
"# both subsets hold the raw, uncleaned rows - confirm that this is intended."
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Wielkosc zbioru: 110527, podzbiór train: 88421, podzbiór test 22106.\n"
]
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>PatientId</th>\n",
" <th>AppointmentID</th>\n",
" <th>Gender</th>\n",
" <th>ScheduledDay</th>\n",
" <th>AppointmentDay</th>\n",
" <th>Age</th>\n",
" <th>Neighbourhood</th>\n",
" <th>Scholarship</th>\n",
" <th>Hipertension</th>\n",
" <th>Diabetes</th>\n",
" <th>Alcoholism</th>\n",
" <th>Handcap</th>\n",
" <th>SMS_received</th>\n",
" <th>No-show</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>count</th>\n",
" <td>1.105270e+05</td>\n",
" <td>1.105270e+05</td>\n",
" <td>110527</td>\n",
" <td>110527</td>\n",
" <td>110527</td>\n",
" <td>110527.000000</td>\n",
" <td>110527</td>\n",
" <td>110527.000000</td>\n",
" <td>110527.000000</td>\n",
" <td>110527.000000</td>\n",
" <td>110527.000000</td>\n",
" <td>110527.000000</td>\n",
" <td>110527.000000</td>\n",
" <td>110527</td>\n",
" </tr>\n",
" <tr>\n",
" <th>unique</th>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>2</td>\n",
" <td>103549</td>\n",
" <td>27</td>\n",
" <td>NaN</td>\n",
" <td>81</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>top</th>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>F</td>\n",
" <td>2016-05-06T07:09:54Z</td>\n",
" <td>2016-06-06T00:00:00Z</td>\n",
" <td>NaN</td>\n",
" <td>JARDIM CAMBURI</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>No</td>\n",
" </tr>\n",
" <tr>\n",
" <th>freq</th>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>71840</td>\n",
" <td>24</td>\n",
" <td>4692</td>\n",
" <td>NaN</td>\n",
" <td>7717</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>88208</td>\n",
" </tr>\n",
" <tr>\n",
" <th>mean</th>\n",
" <td>1.474963e+14</td>\n",
" <td>5.675305e+06</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>37.088874</td>\n",
" <td>NaN</td>\n",
" <td>0.098266</td>\n",
" <td>0.197246</td>\n",
" <td>0.071865</td>\n",
" <td>0.030400</td>\n",
" <td>0.022248</td>\n",
" <td>0.321026</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>std</th>\n",
" <td>2.560949e+14</td>\n",
" <td>7.129575e+04</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>23.110205</td>\n",
" <td>NaN</td>\n",
" <td>0.297675</td>\n",
" <td>0.397921</td>\n",
" <td>0.258265</td>\n",
" <td>0.171686</td>\n",
" <td>0.161543</td>\n",
" <td>0.466873</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>min</th>\n",
" <td>3.921784e+04</td>\n",
" <td>5.030230e+06</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>-1.000000</td>\n",
" <td>NaN</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>25%</th>\n",
" <td>4.172614e+12</td>\n",
" <td>5.640286e+06</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>18.000000</td>\n",
" <td>NaN</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>50%</th>\n",
" <td>3.173184e+13</td>\n",
" <td>5.680573e+06</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>37.000000</td>\n",
" <td>NaN</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>75%</th>\n",
" <td>9.439172e+13</td>\n",
" <td>5.725524e+06</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>55.000000</td>\n",
" <td>NaN</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>1.000000</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>max</th>\n",
" <td>9.999816e+14</td>\n",
" <td>5.790484e+06</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>115.000000</td>\n",
" <td>NaN</td>\n",
" <td>1.000000</td>\n",
" <td>1.000000</td>\n",
" <td>1.000000</td>\n",
" <td>1.000000</td>\n",
" <td>4.000000</td>\n",
" <td>1.000000</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" PatientId AppointmentID Gender ScheduledDay \\\n",
"count 1.105270e+05 1.105270e+05 110527 110527 \n",
"unique NaN NaN 2 103549 \n",
"top NaN NaN F 2016-05-06T07:09:54Z \n",
"freq NaN NaN 71840 24 \n",
"mean 1.474963e+14 5.675305e+06 NaN NaN \n",
"std 2.560949e+14 7.129575e+04 NaN NaN \n",
"min 3.921784e+04 5.030230e+06 NaN NaN \n",
"25% 4.172614e+12 5.640286e+06 NaN NaN \n",
"50% 3.173184e+13 5.680573e+06 NaN NaN \n",
"75% 9.439172e+13 5.725524e+06 NaN NaN \n",
"max 9.999816e+14 5.790484e+06 NaN NaN \n",
"\n",
" AppointmentDay Age Neighbourhood Scholarship \\\n",
"count 110527 110527.000000 110527 110527.000000 \n",
"unique 27 NaN 81 NaN \n",
"top 2016-06-06T00:00:00Z NaN JARDIM CAMBURI NaN \n",
"freq 4692 NaN 7717 NaN \n",
"mean NaN 37.088874 NaN 0.098266 \n",
"std NaN 23.110205 NaN 0.297675 \n",
"min NaN -1.000000 NaN 0.000000 \n",
"25% NaN 18.000000 NaN 0.000000 \n",
"50% NaN 37.000000 NaN 0.000000 \n",
"75% NaN 55.000000 NaN 0.000000 \n",
"max NaN 115.000000 NaN 1.000000 \n",
"\n",
" Hipertension Diabetes Alcoholism Handcap \\\n",
"count 110527.000000 110527.000000 110527.000000 110527.000000 \n",
"unique NaN NaN NaN NaN \n",
"top NaN NaN NaN NaN \n",
"freq NaN NaN NaN NaN \n",
"mean 0.197246 0.071865 0.030400 0.022248 \n",
"std 0.397921 0.258265 0.171686 0.161543 \n",
"min 0.000000 0.000000 0.000000 0.000000 \n",
"25% 0.000000 0.000000 0.000000 0.000000 \n",
"50% 0.000000 0.000000 0.000000 0.000000 \n",
"75% 0.000000 0.000000 0.000000 0.000000 \n",
"max 1.000000 1.000000 1.000000 4.000000 \n",
"\n",
" SMS_received No-show \n",
"count 110527.000000 110527 \n",
"unique NaN 2 \n",
"top NaN No \n",
"freq NaN 88208 \n",
"mean 0.321026 NaN \n",
"std 0.466873 NaN \n",
"min 0.000000 NaN \n",
"25% 0.000000 NaN \n",
"50% 0.000000 NaN \n",
"75% 1.000000 NaN \n",
"max 1.000000 NaN "
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# 3. Statistics\n",
"# Report the row counts of the full dataset and of the train/test subsets\n",
"size_report = f\"Wielkosc zbioru: {len(no_shows)}, podzbiór train: {train_size}, podzbiór test {test_size}.\"\n",
"print(size_report)\n",
"# Summary statistics for every column, numeric and non-numeric alike\n",
"no_shows.describe(include=\"all\")"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"<AxesSubplot:title={'center':'No-show'}>"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
},
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAYMAAAEOCAYAAABrSnsUAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjUuMSwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/YYfK9AAAACXBIWXMAAAsTAAALEwEAmpwYAAAQnUlEQVR4nO3df6zddX3H8efL1iLC+H3HpK20G3WuskyxQcD4CxYoopYt/gDnaAjaGFFwwWmdm2QoiUaFiUETIiCgszJ0oUJdR1CM04DcikELIjflR1tBLpQfIgoU3vvjfKrHcm/vqW3vufQ8H8nN/X4/38/3nM9J2vu853vOaVNVSJIG23P6vQBJUv8ZA0mSMZAkGQNJEsZAkoQxkCRhDKQdIsl1Sd7Z73VIvTIGGihJ7kxyX5LdusbemeS6Pi5L6jtjoEE0DTi934uQphJjoEH0KeADSfba/ECSI5LcmOTh9v2I8W4kyfOSfDnJA0keavP375pyYJLvJ/lVkv9Nsl/XuW9Ksrqdd12Sv2rjJyf5Zte825P8V9f+2iQv3baHLz2TMdAgGgauAz7QPZhkH+Bq4DxgX+Ac4Ook+45zO4uBPYHZbf67gd90HX87cDLwp8CMTfeX5EXAV4H3A0PACuCbSWYA3wVeleQ5SQ5o5x3ezvtzYHfg5j/6kUvjMAYaVB8F3pdkqGvsOOD2qrqsqjZW1VeBnwFvHOc2nqQTgYOq6qmqWlVVj3Qdv7iqfl5VvwEuB17axt8GXF1V11TVk8CngV2BI6pqDfCrNvfVwErgF0leDLwG+F5VPb3Nj17azPR+L0Dqh6r6aZKrgKXArW34AOCuzabeBcwESPJo1/h84DI6zwqWtUtOXwY+0n7AA9zbNf8xOr/VP+N+qurpJGs33Q+dZwevBQ5q2w/RCcHhbV/a7nxmoEF2JvAufv9D+BfAgZvNeSGwHqCqdu/6uruqnqyqf6+q+cARwBuAk3q43z+4nyShE5X1bWhTDF7Vtr9LJwavwRhoBzEGGlhVNQJ8DTitDa0AXpTk7UmmJ3kbnWcAV411fpLXJfnrJNOAR+hcNurlEs7lwHFJjkryXOAM4HHgB+34d4HXAbtW1Trge8BCOpekbvojHqo0IWOgQXcWsBtAVT1A57f7M4AHgA8Cb6iq+8c598+AK+iE4FY6P8Qvm+gOq+o24B3A54D76bwm8caqeqId/znwKJ0I0F6HWAN8v6qe+qMepTSB+J/bSJJ8ZiBJMgaSJGMgScIYSJJ4Fn/obL/99qs5c+b0exmS9KyxatWq+6tqaKxjz9oYzJkzh+Hh4X4vQ5KeNZJs/gn73/EykSTJGEiSjIEkCWMgScIYSJIwBpIkjIEkCWMgScIYSJJ4Fn8C+dlgztKr+72Encqdnziu30uQdlo+M5AkGQNJkjGQJGEMJEkYA0kSxkCShDGQJGEMJEkYA0kSxkCShDGQJGEMJEkYA0kSxkCShDGQJNFjDJL8U5LVSX6a5KtJnpdkbpIbkowk+VqSGW3uLm1/pB2f03U7H27jtyU5pmt8YRsbSbJ0uz9KSdIWTRiDJDOB04AFVXUwMA04AfgkcG5VHQQ8CJzSTjkFeLCNn9vmkWR+O+8lwELg80mmJZkGnA8cC8wHTmxzJUmTpNfLRNOBXZNMB54P3AMcCVzRjl8CHN+2F7V92vGjkqSNL6uqx6vqDmAEOLR9jVTVmqp6AljW5kqSJsmEMaiq9cCngbvpROBhYBXwUFVtbNPWATPb9kxgbTt3Y5u/b/f4ZueMN/4MSZYkGU4yPDo62svjkyT1oJfLRHvT+U19LnAAsBudyzyTrqouqKoFVbVgaGioH0uQpJ1SL5eJ/ha4o6pGq+pJ4BvAK4G92mUjgFnA+ra9HpgN0I7vCTzQPb7ZOeONS5ImSS8xuBs4LMnz27X/o4BbgO8Ab25zFgNXtu3lbZ92/NtVVW38hPZuo7nAPOCHwI3AvPbupBl0XmRevu0PTZ
LUq+kTTaiqG5JcAfwI2AjcBFwAXA0sS/LxNnZhO+VC4LIkI8AGOj/cqarVSS6nE5KNwKlV9RRAkvcCK+m8U+miqlq9/R6iJGkiE8YAoKrOBM7cbHgNnXcCbT73t8Bbxrmds4GzxxhfAazoZS2SpO3PTyBLkoyBJMkYSJIwBpIkjIEkCWMgScIYSJIwBpIkjIEkCWMgScIYSJIwBpIkjIEkCWMgScIYSJIwBpIkjIEkCWMgScIYSJIwBpIkjIEkCWMgScIYSJIwBpIkjIEkCWMgScIYSJIwBpIkjIEkCWMgScIYSJIwBpIkjIEkCWMgScIYSJIwBpIkjIEkCWMgSaLHGCTZK8kVSX6W5NYkhyfZJ8k1SW5v3/duc5PkvCQjSW5OckjX7Sxu829Psrhr/OVJftLOOS9Jtv9DlSSNp9dnBp8F/qeqXgz8DXArsBS4tqrmAde2fYBjgXntawnwBYAk+wBnAq8ADgXO3BSQNuddXect3LaHJUnaGhPGIMmewKuBCwGq6omqeghYBFzSpl0CHN+2FwGXVsf1wF5JXgAcA1xTVRuq6kHgGmBhO7ZHVV1fVQVc2nVbkqRJ0Mszg7nAKHBxkpuSfDHJbsD+VXVPm3MvsH/bngms7Tp/XRvb0vi6McafIcmSJMNJhkdHR3tYuiSpF73EYDpwCPCFqnoZ8Gt+f0kIgPYbfW3/5f2hqrqgqhZU1YKhoaEdfXeSNDB6icE6YF1V3dD2r6ATh1+2Szy07/e14+uB2V3nz2pjWxqfNca4JGmSTBiDqroXWJvkL9vQUcAtwHJg0zuCFgNXtu3lwEntXUWHAQ+3y0krgaOT7N1eOD4aWNmOPZLksPYuopO6bkuSNAmm9zjvfcBXkswA1gAn0wnJ5UlOAe4C3trmrgBeD4wAj7W5VNWGJB8DbmzzzqqqDW37PcCXgF2Bb7UvSdIk6SkGVfVjYMEYh44aY24Bp45zOxcBF40xPgwc3MtaJEnbn59AliQZA0mSMZAkYQwkSRgDSRLGQJKEMZAkYQwkSRgDSRLGQJKEMZAkYQwkSRgDSRLGQJKEMZAkYQwkSRgDSRLGQJKEMZAkYQwkSRgDSRLGQJKEMZAkYQwkSRgDSRLGQJKEMZAkYQwkSRgDSRLGQJKEMZAkYQwkSRgDSRLGQJKEMZAkYQwkSRgDSRLGQJLEVsQgybQkNyW5qu3PTXJDkpEkX0syo43v0vZH2vE5Xbfx4TZ+W5JjusYXtrGRJEu34+OTJPVga54ZnA7c2rX/SeDcqjoIeBA4pY2fAjzYxs9t80gyHzgBeAmwEPh8C8w04HzgWGA+cGKbK0maJD3FIMks4Djgi20/wJHAFW3KJcDxbXtR26cdP6rNXwQsq6rHq+oOYAQ4tH2NVNWaqnoCWNbmSpImSa/PDP4D+CDwdNvfF3ioqja2/XXAzLY9E1gL0I4/3Ob/bnyzc8Ybf4YkS5IMJxkeHR3tcemSpIlMGIMkbwDuq6pVk7CeLaqqC6pqQVUtGBoa6vdyJGmnMb2HOa8E3pTk9cDzgD2AzwJ7JZnefvufBaxv89cDs4F1SaYDewIPdI1v0n3OeOOSpEkw4TODqvpwVc2qqjl0XgD+dlX9A/Ad4M1t2mLgyra9vO3Tjn+7qqqNn9DebTQXmAf8ELgRmNfenTSj3cfy7fLoJEk96eWZwXg+BCxL8nHgJuDCNn4hcFmSEWADnR/uVNXqJJcDtwAbgVOr6imAJO8FVgLTgIuqavU2rEuStJW2KgZVdR1wXdteQ+edQJvP+S3wlnHOPxs4e4zxFcCKrVmLJGn78RPIkiRjIEkyBpIkjIEkCWMgScIYSJIwBpIkjIEkCWMgScIYSJIwBpIkjIEkCWMgScIYSJIwBpIkjIEkCWMgScIYSJIwBpIkjIEkCWMgScIYSJIwBpIkjIEkCWMgScIYSJIwBpIkjIEkCWMgScIYSJIwBpIkjIEkCWMgScIYSJIwBpIkYHq/FyCpP+YsvbrfS9ip3P
mJ4/q9hG3iMwNJkjGQJPUQgySzk3wnyS1JVic5vY3vk+SaJLe373u38SQ5L8lIkpuTHNJ1W4vb/NuTLO4af3mSn7RzzkuSHfFgJUlj6+WZwUbgjKqaDxwGnJpkPrAUuLaq5gHXtn2AY4F57WsJ8AXoxAM4E3gFcChw5qaAtDnv6jpv4bY/NElSryaMQVXdU1U/atu/Am4FZgKLgEvatEuA49v2IuDS6rge2CvJC4BjgGuqakNVPQhcAyxsx/aoquurqoBLu25LkjQJtuo1gyRzgJcBNwD7V9U97dC9wP5teyawtuu0dW1sS+Prxhgf6/6XJBlOMjw6Oro1S5ckbUHPMUiyO/B14P1V9Uj3sfYbfW3ntT1DVV1QVQuqasHQ0NCOvjtJGhg9xSDJc+mE4CtV9Y02/Mt2iYf2/b42vh6Y3XX6rDa2pfFZY4xLkiZJL+8mCnAhcGtVndN1aDmw6R1Bi4Eru8ZPau8qOgx4uF1OWgkcnWTv9sLx0cDKduyRJIe1+zqp67YkSZOgl08gvxL4R+AnSX7cxv4F+ARweZJTgLuAt7ZjK4DXAyPAY8DJAFW1IcnHgBvbvLOqakPbfg/wJWBX4FvtS5I0SSaMQVX9HzDe+/6PGmN+AaeOc1sXAReNMT4MHDzRWiRJO4afQJYkGQNJkjGQJGEMJEkYA0kSxkCShDGQJGEMJEkYA0kSxkCShDGQJGEMJEkYA0kSxkCShDGQJGEMJEkYA0kSxkCShDGQJGEMJEkYA0kSxkCShDGQJGEMJEkYA0kSxkCShDGQJGEMJEkYA0kSxkCShDGQJGEMJEkYA0kSxkCShDGQJGEMJEkYA0kSxkCSxBSKQZKFSW5LMpJkab/XI0mDZErEIMk04HzgWGA+cGKS+f1dlSQNjikRA+BQYKSq1lTVE8AyYFGf1yRJA2N6vxfQzATWdu2vA16x+aQkS4AlbffRJLdNwtoGwX7A/f1exETyyX6vQH3in8/t58DxDkyVGPSkqi4ALuj3OnY2SYarakG/1yGNxT+fk2OqXCZaD8zu2p/VxiRJk2CqxOBGYF6SuUlmACcAy/u8JkkaGFPiMlFVbUzyXmAlMA24qKpW93lZg8RLb5rK/PM5CVJV/V6DJKnPpsplIklSHxkDSZIxkCQZA0lTTJK/SLJL235tktOS7NXnZe30jMGASjIryX8nGU1yX5KvJ5nV73VJwNeBp5IcROedRLOB/+zvknZ+xmBwXUznsxwvAA4AvtnGpH57uqo2An8HfK6q/pnOn1PtQMZgcA1V1cVVtbF9fQkY6veiJODJJCcCi4Gr2thz+7iegWAMBtcDSd6RZFr7egfwQL8XJQEnA4cDZ1fVHUnmApf1eU07PT90NqCSHAh8js5fugJ+AJxWVXf3dWESkGRX4IVV5b9MPEmMgaQpJckbgU8DM6pqbpKXAmdV1Zv6u7KdmzEYMEk+uoXDVVUfm7TFSGNIsgo4Eriuql7Wxn5aVQf3d2U7tynxD9VpUv16jLHdgFOAfQFjoH57sqoeTtI99nS/FjMojMGAqarPbNpO8ifA6XResFsGfGa886QdLckK4FRgdZK3A9OSzANOo/OalnYg3000gJLsk+TjwM10fiE4pKo+VFX39XlpGmwX0/ln7O8EDgYep/Nhs4fp/NKiHcjXDAZMkk8Bf0/nk53nV9WjfV6S9DtJdgf+DVhI5+2km35AVVWd07eFDQAvEw2eM+j8xvWvwEe6rsuGzl+4Pfq1MAl4gs7rWrsAu/P7GGgHMwYDpqq8NKgpKclC4Bw6/0zKIVX1WJ+XNFC8TCRpSkjyPeDd/pe3/WEMJEm+m0iSZAwkSRgDSRLGQJKEMZAkAf8PP9ePQZsYa28AAAAASUVORK5CYII=",
"text/plain": [
"<Figure size 432x288 with 1 Axes>"
]
},
"metadata": {
"needs_background": "light"
},
"output_type": "display_data"
}
],
"source": [
"# Frequency distribution of the target classes\n",
"no_shows[\"No-show\"].value_counts().plot.bar(title=\"No-show\")"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [],
"source": [
"# Dataset cleaning\n",
"# Drop the rows that carry a negative age\n",
"negative_age = no_shows[\"Age\"] < 0\n",
"no_shows = no_shows.drop(index=negative_age[negative_age].index)\n",
"\n",
"# Drop the rows with an unknown age (depends on the use case)\n",
"# no_shows = no_shows.drop(no_shows[no_shows[\"Age\"] == 0].index)"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [],
"source": [
"# Data normalisation\n",
"\n",
"# Drop the identifier columns PatientId and AppointmentID.\n",
"# errors=\"ignore\" lets the cell be re-run once the columns are already gone.\n",
"no_shows = no_shows.drop(columns=[\"PatientId\", \"AppointmentID\"], errors=\"ignore\")\n",
"\n",
"# Encode the No-show column from Yes/No to 1/0.\n",
"# Map only while raw labels are present: mapping the already encoded 0/1\n",
"# values a second time would turn every entry into NaN.\n",
"if no_shows[\"No-show\"].isin([\"Yes\", \"No\"]).any():\n",
"    no_shows[\"No-show\"] = no_shows[\"No-show\"].map({'Yes': 1, 'No': 0})\n",
"\n",
"# Min-max scale the Age column to the [0, 1] range\n",
"age_min = no_shows[\"Age\"].min()\n",
"age_max = no_shows[\"Age\"].max()\n",
"no_shows[\"Age\"] = (no_shows[\"Age\"] - age_min) / (age_max - age_min)"
]
}
],
"metadata": {
"interpreter": {
"hash": "3c12dc341c1078754dffca0e61bfc548ab04f96cfe0a82a85a936b702c4881ab"
},
"kernelspec": {
"display_name": "Python 3.7.11 ('ium')",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.11"
},
"orig_nbformat": 4
},
"nbformat": 4,
"nbformat_minor": 2
}