ium_478855/02_Dane.ipynb

859 lines
38 KiB
Plaintext
Raw Normal View History

2022-03-20 21:06:58 +01:00
{
"cells": [
{
"cell_type": "code",
2022-03-27 17:07:12 +02:00
"execution_count": 6,
2022-03-20 21:06:58 +01:00
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
2022-03-27 17:07:12 +02:00
"Defaulting to user installation because normal site-packages is not writeable\n",
"Requirement already satisfied: kaggle in \\\\files\\students\\s478855\\.appdata\\python\\python38\\site-packages (1.5.12)\n",
"Requirement already satisfied: tqdm in c:\\software\\python3\\lib\\site-packages (from kaggle) (4.62.1)\n",
"Requirement already satisfied: urllib3 in c:\\software\\python3\\lib\\site-packages (from kaggle) (1.26.6)\n",
"Requirement already satisfied: certifi in c:\\software\\python3\\lib\\site-packages (from kaggle) (2021.5.30)\n",
"Requirement already satisfied: python-dateutil in c:\\software\\python3\\lib\\site-packages (from kaggle) (2.8.2)\n",
"Requirement already satisfied: six>=1.10 in c:\\software\\python3\\lib\\site-packages (from kaggle) (1.15.0)\n",
"Requirement already satisfied: python-slugify in \\\\files\\students\\s478855\\.appdata\\python\\python38\\site-packages (from kaggle) (6.1.1)\n",
"Requirement already satisfied: requests in c:\\software\\python3\\lib\\site-packages (from kaggle) (2.26.0)\n",
"Requirement already satisfied: text-unidecode>=1.3 in \\\\files\\students\\s478855\\.appdata\\python\\python38\\site-packages (from python-slugify->kaggle) (1.3)\n",
"Requirement already satisfied: idna<4,>=2.5 in c:\\software\\python3\\lib\\site-packages (from requests->kaggle) (3.2)\n",
"Requirement already satisfied: charset-normalizer~=2.0.0 in c:\\software\\python3\\lib\\site-packages (from requests->kaggle) (2.0.4)\n",
"Requirement already satisfied: colorama in c:\\software\\python3\\lib\\site-packages (from tqdm->kaggle) (0.4.4)\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"WARNING: You are using pip version 21.2.4; however, version 22.0.4 is available.\n",
"You should consider upgrading via the 'c:\\software\\python3\\python3.exe -m pip install --upgrade pip' command.\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Defaulting to user installation because normal site-packages is not writeable\n",
"Requirement already satisfied: pandas in c:\\software\\python3\\lib\\site-packages (1.3.2)\n",
"Requirement already satisfied: pytz>=2017.3 in c:\\software\\python3\\lib\\site-packages (from pandas) (2021.1)\n",
"Requirement already satisfied: python-dateutil>=2.7.3 in c:\\software\\python3\\lib\\site-packages (from pandas) (2.8.2)\n",
"Requirement already satisfied: numpy>=1.17.3 in c:\\software\\python3\\lib\\site-packages (from pandas) (1.19.5)\n",
"Requirement already satisfied: six>=1.5 in c:\\software\\python3\\lib\\site-packages (from python-dateutil>=2.7.3->pandas) (1.15.0)\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"WARNING: You are using pip version 21.2.4; however, version 22.0.4 is available.\n",
"You should consider upgrading via the 'c:\\software\\python3\\python3.exe -m pip install --upgrade pip' command.\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Defaulting to user installation because normal site-packages is not writeable"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"WARNING: You are using pip version 21.2.4; however, version 22.0.4 is available.\n",
"You should consider upgrading via the 'c:\\software\\python3\\python3.exe -m pip install --upgrade pip' command.\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"Requirement already satisfied: seaborn in \\\\files\\students\\s478855\\.appdata\\python\\python38\\site-packages (0.11.2)\n",
"Requirement already satisfied: numpy>=1.15 in c:\\software\\python3\\lib\\site-packages (from seaborn) (1.19.5)\n",
"Requirement already satisfied: pandas>=0.23 in c:\\software\\python3\\lib\\site-packages (from seaborn) (1.3.2)\n",
"Requirement already satisfied: matplotlib>=2.2 in c:\\software\\python3\\lib\\site-packages (from seaborn) (3.4.3)\n",
"Requirement already satisfied: scipy>=1.0 in c:\\software\\python3\\lib\\site-packages (from seaborn) (1.7.1)\n",
"Requirement already satisfied: python-dateutil>=2.7 in c:\\software\\python3\\lib\\site-packages (from matplotlib>=2.2->seaborn) (2.8.2)\n",
"Requirement already satisfied: pyparsing>=2.2.1 in c:\\software\\python3\\lib\\site-packages (from matplotlib>=2.2->seaborn) (2.4.7)\n",
"Requirement already satisfied: pillow>=6.2.0 in c:\\software\\python3\\lib\\site-packages (from matplotlib>=2.2->seaborn) (8.3.1)\n",
"Requirement already satisfied: kiwisolver>=1.0.1 in c:\\software\\python3\\lib\\site-packages (from matplotlib>=2.2->seaborn) (1.3.1)\n",
"Requirement already satisfied: cycler>=0.10 in c:\\software\\python3\\lib\\site-packages (from matplotlib>=2.2->seaborn) (0.10.0)\n",
"Requirement already satisfied: six in c:\\software\\python3\\lib\\site-packages (from cycler>=0.10->matplotlib>=2.2->seaborn) (1.15.0)\n",
"Requirement already satisfied: pytz>=2017.3 in c:\\software\\python3\\lib\\site-packages (from pandas>=0.23->seaborn) (2021.1)\n",
"Defaulting to user installation because normal site-packages is not writeable\n",
"Collecting torch\n",
" Downloading torch-1.11.0-cp38-cp38-win_amd64.whl (158.0 MB)\n",
"Requirement already satisfied: typing-extensions in c:\\software\\python3\\lib\\site-packages (from torch) (3.7.4.3)\n",
"Installing collected packages: torch\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
" WARNING: The scripts convert-caffe2-to-onnx.exe, convert-onnx-to-caffe2.exe and torchrun.exe are installed in 'j:\\.AppData\\Python\\Python38\\Scripts' which is not on PATH.\n",
" Consider adding this directory to PATH or, if you prefer to suppress this warning, use --no-warn-script-location.\n",
"WARNING: You are using pip version 21.2.4; however, version 22.0.4 is available.\n",
"You should consider upgrading via the 'c:\\software\\python3\\python3.exe -m pip install --upgrade pip' command.\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Successfully installed torch-1.11.0\n"
2022-03-20 21:06:58 +01:00
]
}
],
"source": [
"!pip install kaggle\n",
"!pip install pandas\n",
2022-03-27 17:07:12 +02:00
"!pip install seaborn\n",
"!pip install torch\n"
2022-03-20 21:06:58 +01:00
]
},
{
"cell_type": "code",
2022-03-27 17:07:12 +02:00
"execution_count": 7,
2022-03-20 21:06:58 +01:00
"metadata": {},
"outputs": [
{
2022-03-27 17:07:12 +02:00
"name": "stderr",
2022-03-20 21:06:58 +01:00
"output_type": "stream",
"text": [
2022-03-27 17:07:12 +02:00
"'kaggle' is not recognized as an internal or external command,\n",
"operable program or batch file.\n"
2022-03-20 21:06:58 +01:00
]
}
],
"source": [
"# 1 Pobranie zbioru\n",
"!kaggle datasets download -d joniarroba/noshowappointments"
]
},
{
"cell_type": "code",
2022-03-27 17:07:12 +02:00
"execution_count": 8,
2022-03-20 21:06:58 +01:00
"metadata": {},
"outputs": [
{
2022-03-27 17:07:12 +02:00
"name": "stdout",
2022-03-20 21:06:58 +01:00
"output_type": "stream",
"text": [
2022-03-27 17:07:12 +02:00
"Archive: noshowappointments.zip\n",
" inflating: KaggleV2-May-2016.csv \n"
2022-03-20 21:06:58 +01:00
]
}
],
"source": [
"!unzip -o noshowappointments.zip"
]
},
{
"cell_type": "code",
2022-03-27 17:07:12 +02:00
"execution_count": 9,
2022-03-20 21:06:58 +01:00
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>PatientId</th>\n",
" <th>AppointmentID</th>\n",
" <th>Gender</th>\n",
" <th>ScheduledDay</th>\n",
" <th>AppointmentDay</th>\n",
" <th>Age</th>\n",
" <th>Neighbourhood</th>\n",
" <th>Scholarship</th>\n",
" <th>Hipertension</th>\n",
" <th>Diabetes</th>\n",
" <th>Alcoholism</th>\n",
" <th>Handcap</th>\n",
" <th>SMS_received</th>\n",
" <th>No-show</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>2.987250e+13</td>\n",
" <td>5642903</td>\n",
" <td>F</td>\n",
" <td>2016-04-29T18:38:08Z</td>\n",
" <td>2016-04-29T00:00:00Z</td>\n",
" <td>62</td>\n",
" <td>JARDIM DA PENHA</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>No</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>5.589978e+14</td>\n",
" <td>5642503</td>\n",
" <td>M</td>\n",
" <td>2016-04-29T16:08:27Z</td>\n",
" <td>2016-04-29T00:00:00Z</td>\n",
" <td>56</td>\n",
" <td>JARDIM DA PENHA</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>No</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>4.262962e+12</td>\n",
" <td>5642549</td>\n",
" <td>F</td>\n",
" <td>2016-04-29T16:19:04Z</td>\n",
" <td>2016-04-29T00:00:00Z</td>\n",
" <td>62</td>\n",
" <td>MATA DA PRAIA</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>No</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>8.679512e+11</td>\n",
" <td>5642828</td>\n",
" <td>F</td>\n",
" <td>2016-04-29T17:29:31Z</td>\n",
" <td>2016-04-29T00:00:00Z</td>\n",
" <td>8</td>\n",
" <td>PONTAL DE CAMBURI</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>No</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>8.841186e+12</td>\n",
" <td>5642494</td>\n",
" <td>F</td>\n",
" <td>2016-04-29T16:07:23Z</td>\n",
" <td>2016-04-29T00:00:00Z</td>\n",
" <td>56</td>\n",
" <td>JARDIM DA PENHA</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>No</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>110522</th>\n",
" <td>2.572134e+12</td>\n",
" <td>5651768</td>\n",
" <td>F</td>\n",
" <td>2016-05-03T09:15:35Z</td>\n",
" <td>2016-06-07T00:00:00Z</td>\n",
" <td>56</td>\n",
" <td>MARIA ORTIZ</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>No</td>\n",
" </tr>\n",
" <tr>\n",
" <th>110523</th>\n",
" <td>3.596266e+12</td>\n",
" <td>5650093</td>\n",
" <td>F</td>\n",
" <td>2016-05-03T07:27:33Z</td>\n",
" <td>2016-06-07T00:00:00Z</td>\n",
" <td>51</td>\n",
" <td>MARIA ORTIZ</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>No</td>\n",
" </tr>\n",
" <tr>\n",
" <th>110524</th>\n",
" <td>1.557663e+13</td>\n",
" <td>5630692</td>\n",
" <td>F</td>\n",
" <td>2016-04-27T16:03:52Z</td>\n",
" <td>2016-06-07T00:00:00Z</td>\n",
" <td>21</td>\n",
" <td>MARIA ORTIZ</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>No</td>\n",
" </tr>\n",
" <tr>\n",
" <th>110525</th>\n",
" <td>9.213493e+13</td>\n",
" <td>5630323</td>\n",
" <td>F</td>\n",
" <td>2016-04-27T15:09:23Z</td>\n",
" <td>2016-06-07T00:00:00Z</td>\n",
" <td>38</td>\n",
" <td>MARIA ORTIZ</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>No</td>\n",
" </tr>\n",
" <tr>\n",
" <th>110526</th>\n",
" <td>3.775115e+14</td>\n",
" <td>5629448</td>\n",
" <td>F</td>\n",
" <td>2016-04-27T13:30:56Z</td>\n",
" <td>2016-06-07T00:00:00Z</td>\n",
" <td>54</td>\n",
" <td>MARIA ORTIZ</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>No</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>110527 rows × 14 columns</p>\n",
"</div>"
],
"text/plain": [
" PatientId AppointmentID Gender ScheduledDay \\\n",
"0 2.987250e+13 5642903 F 2016-04-29T18:38:08Z \n",
"1 5.589978e+14 5642503 M 2016-04-29T16:08:27Z \n",
"2 4.262962e+12 5642549 F 2016-04-29T16:19:04Z \n",
"3 8.679512e+11 5642828 F 2016-04-29T17:29:31Z \n",
"4 8.841186e+12 5642494 F 2016-04-29T16:07:23Z \n",
"... ... ... ... ... \n",
"110522 2.572134e+12 5651768 F 2016-05-03T09:15:35Z \n",
"110523 3.596266e+12 5650093 F 2016-05-03T07:27:33Z \n",
"110524 1.557663e+13 5630692 F 2016-04-27T16:03:52Z \n",
"110525 9.213493e+13 5630323 F 2016-04-27T15:09:23Z \n",
"110526 3.775115e+14 5629448 F 2016-04-27T13:30:56Z \n",
"\n",
" AppointmentDay Age Neighbourhood Scholarship \\\n",
"0 2016-04-29T00:00:00Z 62 JARDIM DA PENHA 0 \n",
"1 2016-04-29T00:00:00Z 56 JARDIM DA PENHA 0 \n",
"2 2016-04-29T00:00:00Z 62 MATA DA PRAIA 0 \n",
"3 2016-04-29T00:00:00Z 8 PONTAL DE CAMBURI 0 \n",
"4 2016-04-29T00:00:00Z 56 JARDIM DA PENHA 0 \n",
"... ... ... ... ... \n",
"110522 2016-06-07T00:00:00Z 56 MARIA ORTIZ 0 \n",
"110523 2016-06-07T00:00:00Z 51 MARIA ORTIZ 0 \n",
"110524 2016-06-07T00:00:00Z 21 MARIA ORTIZ 0 \n",
"110525 2016-06-07T00:00:00Z 38 MARIA ORTIZ 0 \n",
"110526 2016-06-07T00:00:00Z 54 MARIA ORTIZ 0 \n",
"\n",
" Hipertension Diabetes Alcoholism Handcap SMS_received No-show \n",
"0 1 0 0 0 0 No \n",
"1 0 0 0 0 0 No \n",
"2 0 0 0 0 0 No \n",
"3 0 0 0 0 0 No \n",
"4 1 1 0 0 0 No \n",
"... ... ... ... ... ... ... \n",
"110522 0 0 0 0 1 No \n",
"110523 0 0 0 0 1 No \n",
"110524 0 0 0 0 1 No \n",
"110525 0 0 0 0 1 No \n",
"110526 0 0 0 0 1 No \n",
"\n",
"[110527 rows x 14 columns]"
]
},
2022-03-27 17:07:12 +02:00
"execution_count": 9,
2022-03-20 21:06:58 +01:00
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"import pandas as pd\n",
"no_shows=pd.read_csv('KaggleV2-May-2016.csv')\n",
"no_shows"
]
},
{
"cell_type": "code",
2022-03-27 17:07:12 +02:00
"execution_count": 10,
2022-03-20 21:06:58 +01:00
"metadata": {},
"outputs": [],
"source": [
"# 2. Podział na train/test\n",
"import torch\n",
"\n",
"train_size = int(0.8 * len(no_shows))\n",
"test_size = (len(no_shows) - train_size)\n",
"no_shows_train, no_shows_test = torch.utils.data.random_split(no_shows, [train_size, test_size])"
]
},
{
"cell_type": "code",
2022-03-27 17:07:12 +02:00
"execution_count": 11,
2022-03-20 21:06:58 +01:00
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Wielkosc zbioru: 110527, podzbiór train: 88421, podzbiór test 22106.\n"
]
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>PatientId</th>\n",
" <th>AppointmentID</th>\n",
" <th>Gender</th>\n",
" <th>ScheduledDay</th>\n",
" <th>AppointmentDay</th>\n",
" <th>Age</th>\n",
" <th>Neighbourhood</th>\n",
" <th>Scholarship</th>\n",
" <th>Hipertension</th>\n",
" <th>Diabetes</th>\n",
" <th>Alcoholism</th>\n",
" <th>Handcap</th>\n",
" <th>SMS_received</th>\n",
" <th>No-show</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>count</th>\n",
" <td>1.105270e+05</td>\n",
" <td>1.105270e+05</td>\n",
" <td>110527</td>\n",
" <td>110527</td>\n",
" <td>110527</td>\n",
" <td>110527.000000</td>\n",
" <td>110527</td>\n",
" <td>110527.000000</td>\n",
" <td>110527.000000</td>\n",
" <td>110527.000000</td>\n",
" <td>110527.000000</td>\n",
" <td>110527.000000</td>\n",
" <td>110527.000000</td>\n",
" <td>110527</td>\n",
" </tr>\n",
" <tr>\n",
" <th>unique</th>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>2</td>\n",
" <td>103549</td>\n",
" <td>27</td>\n",
" <td>NaN</td>\n",
" <td>81</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>top</th>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>F</td>\n",
" <td>2016-05-06T07:09:54Z</td>\n",
" <td>2016-06-06T00:00:00Z</td>\n",
" <td>NaN</td>\n",
" <td>JARDIM CAMBURI</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>No</td>\n",
" </tr>\n",
" <tr>\n",
" <th>freq</th>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>71840</td>\n",
" <td>24</td>\n",
" <td>4692</td>\n",
" <td>NaN</td>\n",
" <td>7717</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>88208</td>\n",
" </tr>\n",
" <tr>\n",
" <th>mean</th>\n",
" <td>1.474963e+14</td>\n",
" <td>5.675305e+06</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>37.088874</td>\n",
" <td>NaN</td>\n",
" <td>0.098266</td>\n",
" <td>0.197246</td>\n",
" <td>0.071865</td>\n",
" <td>0.030400</td>\n",
" <td>0.022248</td>\n",
" <td>0.321026</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>std</th>\n",
" <td>2.560949e+14</td>\n",
" <td>7.129575e+04</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>23.110205</td>\n",
" <td>NaN</td>\n",
" <td>0.297675</td>\n",
" <td>0.397921</td>\n",
" <td>0.258265</td>\n",
" <td>0.171686</td>\n",
" <td>0.161543</td>\n",
" <td>0.466873</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>min</th>\n",
" <td>3.921784e+04</td>\n",
" <td>5.030230e+06</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>-1.000000</td>\n",
" <td>NaN</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>25%</th>\n",
" <td>4.172614e+12</td>\n",
" <td>5.640286e+06</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>18.000000</td>\n",
" <td>NaN</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>50%</th>\n",
" <td>3.173184e+13</td>\n",
" <td>5.680573e+06</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>37.000000</td>\n",
" <td>NaN</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>75%</th>\n",
" <td>9.439172e+13</td>\n",
" <td>5.725524e+06</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>55.000000</td>\n",
" <td>NaN</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>1.000000</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>max</th>\n",
" <td>9.999816e+14</td>\n",
" <td>5.790484e+06</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>115.000000</td>\n",
" <td>NaN</td>\n",
" <td>1.000000</td>\n",
" <td>1.000000</td>\n",
" <td>1.000000</td>\n",
" <td>1.000000</td>\n",
" <td>4.000000</td>\n",
" <td>1.000000</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" PatientId AppointmentID Gender ScheduledDay \\\n",
"count 1.105270e+05 1.105270e+05 110527 110527 \n",
"unique NaN NaN 2 103549 \n",
"top NaN NaN F 2016-05-06T07:09:54Z \n",
"freq NaN NaN 71840 24 \n",
"mean 1.474963e+14 5.675305e+06 NaN NaN \n",
"std 2.560949e+14 7.129575e+04 NaN NaN \n",
"min 3.921784e+04 5.030230e+06 NaN NaN \n",
"25% 4.172614e+12 5.640286e+06 NaN NaN \n",
"50% 3.173184e+13 5.680573e+06 NaN NaN \n",
"75% 9.439172e+13 5.725524e+06 NaN NaN \n",
"max 9.999816e+14 5.790484e+06 NaN NaN \n",
"\n",
" AppointmentDay Age Neighbourhood Scholarship \\\n",
"count 110527 110527.000000 110527 110527.000000 \n",
"unique 27 NaN 81 NaN \n",
"top 2016-06-06T00:00:00Z NaN JARDIM CAMBURI NaN \n",
"freq 4692 NaN 7717 NaN \n",
"mean NaN 37.088874 NaN 0.098266 \n",
"std NaN 23.110205 NaN 0.297675 \n",
"min NaN -1.000000 NaN 0.000000 \n",
"25% NaN 18.000000 NaN 0.000000 \n",
"50% NaN 37.000000 NaN 0.000000 \n",
"75% NaN 55.000000 NaN 0.000000 \n",
"max NaN 115.000000 NaN 1.000000 \n",
"\n",
" Hipertension Diabetes Alcoholism Handcap \\\n",
"count 110527.000000 110527.000000 110527.000000 110527.000000 \n",
"unique NaN NaN NaN NaN \n",
"top NaN NaN NaN NaN \n",
"freq NaN NaN NaN NaN \n",
"mean 0.197246 0.071865 0.030400 0.022248 \n",
"std 0.397921 0.258265 0.171686 0.161543 \n",
"min 0.000000 0.000000 0.000000 0.000000 \n",
"25% 0.000000 0.000000 0.000000 0.000000 \n",
"50% 0.000000 0.000000 0.000000 0.000000 \n",
"75% 0.000000 0.000000 0.000000 0.000000 \n",
"max 1.000000 1.000000 1.000000 4.000000 \n",
"\n",
" SMS_received No-show \n",
"count 110527.000000 110527 \n",
"unique NaN 2 \n",
"top NaN No \n",
"freq NaN 88208 \n",
"mean 0.321026 NaN \n",
"std 0.466873 NaN \n",
"min 0.000000 NaN \n",
"25% 0.000000 NaN \n",
"50% 0.000000 NaN \n",
"75% 1.000000 NaN \n",
"max 1.000000 NaN "
]
},
2022-03-27 17:07:12 +02:00
"execution_count": 11,
2022-03-20 21:06:58 +01:00
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# 3. Statystyki\n",
"# Wielkość zbioru i podzbiorów\n",
"print(f\"Wielkosc zbioru: {len(no_shows)}, podzbiór train: {train_size}, podzbiór test {test_size}.\")\n",
"# Opis parametrów\n",
"no_shows.describe(include='all')"
]
},
{
"cell_type": "code",
2022-03-27 17:07:12 +02:00
"execution_count": 12,
2022-03-20 21:06:58 +01:00
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"<AxesSubplot:title={'center':'No-show'}>"
]
},
2022-03-27 17:07:12 +02:00
"execution_count": 12,
2022-03-20 21:06:58 +01:00
"metadata": {},
"output_type": "execute_result"
},
{
"data": {
2022-03-27 17:07:12 +02:00
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAYMAAAEOCAYAAABrSnsUAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjQuMywgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/MnkTPAAAACXBIWXMAAAsTAAALEwEAmpwYAAAQnUlEQVR4nO3df6zddX3H8efL1iLC+H3HpK20G3WuskyxQcD4CxYoopYt/gDnaAjaGFFwwWmdm2QoiUaFiUETIiCgszJ0oUJdR1CM04DcikELIjflR1tBLpQfIgoU3vvjfKrHcm/vqW3vufQ8H8nN/X4/38/3nM9J2vu853vOaVNVSJIG23P6vQBJUv8ZA0mSMZAkGQNJEsZAkoQxkCRhDKQdIsl1Sd7Z73VIvTIGGihJ7kxyX5LdusbemeS6Pi5L6jtjoEE0DTi934uQphJjoEH0KeADSfba/ECSI5LcmOTh9v2I8W4kyfOSfDnJA0keavP375pyYJLvJ/lVkv9Nsl/XuW9Ksrqdd12Sv2rjJyf5Zte825P8V9f+2iQv3baHLz2TMdAgGgauAz7QPZhkH+Bq4DxgX+Ac4Ook+45zO4uBPYHZbf67gd90HX87cDLwp8CMTfeX5EXAV4H3A0PACuCbSWYA3wVeleQ5SQ5o5x3ezvtzYHfg5j/6kUvjMAYaVB8F3pdkqGvsOOD2qrqsqjZW1VeBnwFvHOc2nqQTgYOq6qmqWlVVj3Qdv7iqfl5VvwEuB17axt8GXF1V11TVk8CngV2BI6pqDfCrNvfVwErgF0leDLwG+F5VPb3Nj17azPR+L0Dqh6r6aZKrgKXArW34AOCuzabeBcwESPJo1/h84DI6zwqWtUtOXwY+0n7AA9zbNf8xOr/VP+N+qurpJGs33Q+dZwevBQ5q2w/RCcHhbV/a7nxmoEF2JvAufv9D+BfAgZvNeSGwHqCqdu/6uruqnqyqf6+q+cARwBuAk3q43z+4nyShE5X1bWhTDF7Vtr9LJwavwRhoBzEGGlhVNQJ8DTitDa0AXpTk7UmmJ3kbnWcAV411fpLXJfnrJNOAR+hcNurlEs7lwHFJjkryXOAM4HHgB+34d4HXAbtW1Trge8BCOpekbvojHqo0IWOgQXcWsBtAVT1A57f7M4AHgA8Cb6iq+8c598+AK+iE4FY6P8Qvm+gOq+o24B3A54D76bwm8caqeqId/znwKJ0I0F6HWAN8v6qe+qMepTSB+J/bSJJ8ZiBJMgaSJGMgScIYSJJ4Fn/obL/99qs5c+b0exmS9KyxatWq+6tqaKxjz9oYzJkzh+Hh4X4vQ5KeNZJs/gn73/EykSTJGEiSjIEkCWMgScIYSJIwBpIkjIEkCWMgScIYSJJ4Fn8C+dlgztKr+72Encqdnziu30uQdlo+M5AkGQNJkjGQJGEMJEkYA0kSxkCShDGQJGEMJEkYA0kSxkCShDGQJGEMJEkYA0kSxkCShDGQJNFjDJL8U5LVSX6a5KtJnpdkbpIbkowk+VqSGW3uLm1/pB2f03U7H27jtyU5pmt8YRsbSbJ0uz9KSdIWTRiDJDOB04AFVXUwMA04AfgkcG5VHQQ8CJzSTjkFeLCNn9vmkWR+O+8lwELg80mmJZkGnA8cC8wHTmxzJUmTpNfLRNOBXZNMB54P3AMcCVzRjl8CHN+2F7V92vGjkqSNL6uqx6vqDmAEOLR9jVTVmqp6AljW5kqSJsmEMaiq9cCngbvpROBhYBXwUFVtbNPWATPb9kxgbTt3Y5u/b/f4ZueMN/4MSZYkGU4yPDo62svjkyT1oJfLRHvT+U19LnAAsBudyzyTrqouqKoFVbVgaGioH0uQpJ1SL5eJ/ha4o6pGq+pJ4BvAK4G92mUjgFnA+ra9HpgN0I7vCTzQPb7ZOeONS5ImSS8xuBs4LMnz27X/o4BbgO8Ab25zFgNXtu3lbZ92/NtVVW38hPZuo7nAPOCHwI3AvPbupBl0XmRevu0PTZLUq+kTTaiqG5JcAfwI2AjcBFwAXA0sS/LxNnZhO+VC4LIkI8AGOj/cqarVSS6nE5KNwKlV9RRAkvcCK+m8U+miqlq9/R6iJGkiE8YAoKrOBM7cbHgNnXcCbT73t8Bbxrmds4GzxxhfAazoZS2SpO3PTyBLkoyBJMkYSJIwBpIkjIEkCWMgScIYSJIwBpIkjIEkCWMgScIYSJIwBpIkjIEkCWMgScIYSJIwBpIkjIEkCWMgScIYSJIwBpIkjIEkCWMgScIYSJIwBpIkjIEkCWMgScIYSJIwBpIkjIEkCWMgScIYSJIwBpIkjIEkCWMgScIYSJIwBpIkjIEkCWMgSaLHGCTZK8kVSX6W5NYkhyfZJ8k1SW5v3/duc5PkvCQjSW5OckjX7Sxu829Psrhr/OVJftLOOS9Jtv9DlSSNp9dnBp8F/qeqXgz8DXArsBS4tqrmAde2fYBjgXntawnwBYAk+wBnAq8ADgXO3BSQNuddXect3LaHJUnaGhPGIMmewKuBCwGq6omqeghYBFzSpl0CHN+2FwGXVsf1wF5JXgAcA1xTVRuq6kHgGmBhO7ZHVV1fVQVc2nVbkqRJ0Mszg7nAKHBxkpuSfDHJbsD+VXVPm3MvsH/bngms7Tp/XRvb0vi6McafIcmSJMNJhkdHR3tYuiSpF73EYDpwCPCFqnoZ8Gt+f0kIgPYbfW3/5f2hqrqgqhZU1YKhoaEdfXeSNDB6icE6YF1V3dD2r6ATh1+2Szy07/e14+uB2V3nz2pjWxqfNca4JGmSTBiDqroXWJvkL9vQUcAtwHJg0zuCFgNXtu3lwEntXUWHAQ+3y0krgaOT7N1eOD4aWNmOPZLksPYuopO6bkuSNAmm9zjvfcBXkswA1gAn0wnJ5UlOAe4C3trmrgBeD4wAj7W5VNWGJB8DbmzzzqqqDW37PcCXgF2Bb7UvSdIk6SkGVfVjYMEYh44aY24Bp45zOxcBF40xPgwc3MtaJEnbn59AliQZA0mSMZAkYQwkSRgDSRLGQJKEMZAkYQwkSRgDSRLGQJKEMZAkYQwkSRgDSRLGQJKEMZAkYQwkSRgDSRLGQJKEMZAkYQwkSRgDSRLGQJKEMZAkYQwkSRgDSRLGQJKEMZAkYQwkSRgDSRLGQJKEMZAkYQwkSRgDSRLGQJKEMZAkYQwkSRgDSRLGQJLEVsQgybQkNyW5qu3PTXJDkpEkX0syo43v0vZH2vE5Xbfx4TZ+W5JjusYXtrGRJEu34+OTJPVga54ZnA7c2rX/SeDcqjoIeBA4pY2fAjzYxs9t80gyHzgBeAmwEPh8C8w04HzgWGA+cGKbK0maJD3FIMks4Djgi20/wJHAFW3KJcDxbXtR26cdP6rNXwQsq6rHq+oOYAQ4tH2NVNWaqnoCWNbmSpImSa/PDP4D+CDwdNvfF3ioqja2/XXAzLY9E1gL0I4/3Ob/bnyzc8Ybf4YkS5IMJxkeHR3tcemSpIlMGIMkbwDuq6pVk7CeLaqqC6pqQVUtGBoa6vdyJGmnMb2HOa8E3pTk9cDzgD2AzwJ7JZnefvufBaxv89cDs4F1SaYDewIPdI1v0n3OeOOSpEkw4TODqvpwVc2qqjl0XgD+dlX9A/Ad4M1t2mLgyra9vO3Tjn+7qqqNn9DebTQXmAf8ELgRmNfenTSj3cfy7fLoJEk96eWZwXg+BCxL8nHgJuDCNn4hcFmSEWADnR/uVNXqJJcDtwAbgVOr6imAJO8FVgLTgIuqavU2rEuStJW2KgZVdR1wXdteQ+edQJvP+S3wlnHOPxs4e4zxFcCKrVmLJGn78RPIkiRjIEkyBpIkjIEkCWMgScIYSJIwBpIkjIEkCWMgScIYSJIwBpIkjIEkCWMgScIYSJIwBpIkjIEkCWMgScIYSJIwBpIkjIEkCWMgScIYSJIwBpIkjIEkCWMgScIYSJIwBpIkjIEkCWMgScIYSJIwBpIkjIEkCWMgScIYSJIwBpIkYHq/FyCpP+YsvbrfS9ip3PmJ4/q9hG3iMwNJkjGQJPUQgySzk3wnyS1JVic5vY3vk+SaJLe373u38SQ5L8lIkpuTHNJ1W4vb/NuTLO4af3mSn7R
2022-03-20 21:06:58 +01:00
"text/plain": [
"<Figure size 432x288 with 1 Axes>"
]
},
"metadata": {
"needs_background": "light"
},
"output_type": "display_data"
}
],
"source": [
"# Rozkład częstości dla klas\n",
"no_shows[\"No-show\"].value_counts().plot(kind=\"bar\", title=\"No-show\")"
]
},
{
"cell_type": "code",
2022-03-27 17:07:12 +02:00
"execution_count": 13,
2022-03-20 21:06:58 +01:00
"metadata": {},
"outputs": [],
"source": [
"# Wyczyszczenie zbioru\n",
"# Usunięcie negatywnego wieku\n",
"no_shows = no_shows.drop(no_shows[no_shows[\"Age\"] < 0].index)\n",
"\n",
"# Usunięcie niewiadomego wieku (zależy od zastosowania)\n",
"# no_shows = no_shows.drop(no_shows[no_shows[\"Age\"] == 0].index)"
]
},
{
"cell_type": "code",
2022-03-27 17:07:12 +02:00
"execution_count": 14,
2022-03-20 21:06:58 +01:00
"metadata": {},
2022-03-27 17:07:12 +02:00
"outputs": [],
2022-03-20 21:06:58 +01:00
"source": [
"# Normalizacja danych\n",
"\n",
"# Usunięcie kolumn PatientId oraz AppointmentID\n",
"no_shows.drop([\"PatientId\", \"AppointmentID\"], inplace=True, axis=1)\n",
"\n",
"# Zmiena wartości kolumny No-show z Yes/No na wartość boolowską\n",
"no_shows[\"No-show\"] = no_shows[\"No-show\"].map({'Yes': 1, 'No': 0})\n",
"\n",
"# Normalizacja kolumny Age\n",
"no_shows[\"Age\"]=(no_shows[\"Age\"]-no_shows[\"Age\"].min())/(no_shows[\"Age\"].max()-no_shows[\"Age\"].min())"
]
}
],
"metadata": {
"interpreter": {
"hash": "3c12dc341c1078754dffca0e61bfc548ab04f96cfe0a82a85a936b702c4881ab"
},
"kernelspec": {
"display_name": "Python 3.7.11 ('ium')",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
2022-03-27 17:07:12 +02:00
"version": "3.8.10"
2022-03-20 21:06:58 +01:00
},
"orig_nbformat": 4
},
"nbformat": 4,
"nbformat_minor": 2
}