ium_452487/dane.ipynb

1710 lines
348 KiB
Plaintext
Raw Normal View History

{
"cells": [
{
"cell_type": "markdown",
"source": [
"## 1. Pobieranie zbioru danych"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 1,
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Collecting kaggle\n",
" Using cached kaggle-1.6.6.tar.gz (84 kB)\n",
"Requirement already satisfied: six>=1.10 in c:\\users\\adrian\\appdata\\roaming\\python\\python39\\site-packages (from kaggle) (1.16.0)\n",
"Requirement already satisfied: certifi in c:\\users\\adrian\\appdata\\roaming\\python\\python39\\site-packages (from kaggle) (2022.6.15)\n",
"Requirement already satisfied: python-dateutil in c:\\users\\adrian\\appdata\\roaming\\python\\python39\\site-packages (from kaggle) (2.8.2)\n",
"Requirement already satisfied: requests in c:\\users\\adrian\\appdata\\roaming\\python\\python39\\site-packages (from kaggle) (2.28.1)\n",
"Requirement already satisfied: tqdm in c:\\users\\adrian\\appdata\\roaming\\python\\python39\\site-packages (from kaggle) (4.64.1)\n",
"Collecting python-slugify\n",
" Using cached python_slugify-8.0.4-py2.py3-none-any.whl (10 kB)\n",
"Requirement already satisfied: urllib3 in c:\\users\\adrian\\appdata\\roaming\\python\\python39\\site-packages (from kaggle) (1.26.11)\n",
"Requirement already satisfied: bleach in c:\\users\\adrian\\miniconda3\\lib\\site-packages (from kaggle) (4.1.0)\n",
"Requirement already satisfied: webencodings in c:\\users\\adrian\\miniconda3\\lib\\site-packages (from bleach->kaggle) (0.5.1)\n",
"Requirement already satisfied: packaging in c:\\users\\adrian\\appdata\\roaming\\python\\python39\\site-packages (from bleach->kaggle) (22.0)\n",
"Collecting text-unidecode>=1.3\n",
" Using cached text_unidecode-1.3-py2.py3-none-any.whl (78 kB)\n",
"Requirement already satisfied: idna<4,>=2.5 in c:\\users\\adrian\\appdata\\roaming\\python\\python39\\site-packages (from requests->kaggle) (2.10)\n",
"Requirement already satisfied: charset-normalizer<3,>=2 in c:\\users\\adrian\\appdata\\roaming\\python\\python39\\site-packages (from requests->kaggle) (2.1.0)\n",
"Requirement already satisfied: colorama in c:\\users\\adrian\\appdata\\roaming\\python\\python39\\site-packages (from tqdm->kaggle) (0.4.5)\n",
"Building wheels for collected packages: kaggle\n",
" Building wheel for kaggle (setup.py): started\n",
" Building wheel for kaggle (setup.py): finished with status 'done'\n",
" Created wheel for kaggle: filename=kaggle-1.6.6-py3-none-any.whl size=111961 sha256=3aa19c7655c19d77b65c2542567b6e34a57813227f1f2df6d0fd84accad6824f\n",
" Stored in directory: c:\\users\\adrian\\appdata\\local\\pip\\cache\\wheels\\46\\aa\\c3\\b3e421522fb5acdd7c366a05c5fc80787615bdeed207e7f79b\n",
"Successfully built kaggle\n",
"Installing collected packages: text-unidecode, python-slugify, kaggle\n",
"Successfully installed kaggle-1.6.6 python-slugify-8.0.4 text-unidecode-1.3\n"
]
}
],
"source": [
"!pip install --user kaggle"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 2,
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Downloading personal-key-indicators-of-heart-disease.zip to C:\\Users\\Adrian\\Desktop\\Semestr 1 (II ST)\\ML\\zadania\n",
"\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"\n",
" 0%| | 0.00/21.4M [00:00<?, ?B/s]\n",
" 5%|4 | 1.00M/21.4M [00:00<00:11, 1.80MB/s]\n",
" 9%|9 | 2.00M/21.4M [00:00<00:05, 3.51MB/s]\n",
" 19%|#8 | 4.00M/21.4M [00:00<00:02, 7.25MB/s]\n",
" 33%|###2 | 7.00M/21.4M [00:00<00:01, 12.0MB/s]\n",
" 51%|#####1 | 11.0M/21.4M [00:01<00:00, 17.9MB/s]\n",
" 65%|######5 | 14.0M/21.4M [00:01<00:00, 20.7MB/s]\n",
" 79%|#######9 | 17.0M/21.4M [00:01<00:00, 20.6MB/s]\n",
" 93%|#########3| 20.0M/21.4M [00:01<00:00, 22.7MB/s]\n",
"100%|##########| 21.4M/21.4M [00:01<00:00, 14.9MB/s]\n"
]
}
],
"source": [
"!kaggle datasets download -d kamilpytlak/personal-key-indicators-of-heart-disease/"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 3,
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"'unzip' is not recognized as an internal or external command,\n",
"operable program or batch file.\n"
]
}
],
"source": [
"#!unzip -o personal-key-indicators-of-heart-disease.zip #nie działa na Windowsie więc korzystam z modułu zipfile"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 4,
"outputs": [],
"source": [
"import zipfile\n",
"with zipfile.ZipFile(\"personal-key-indicators-of-heart-disease.zip\", 'r') as zip_ref:\n",
" zip_ref.extractall(\"dataset_extracted\")"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 1,
"outputs": [],
"source": [
"import pandas as pd\n",
"# W pobranym zbiorze danych jest kilka podzbiorów więc celowo otwieram ten z NaN, żeby manualnie go oczyścić dla praktyki\n",
"df = pd.read_csv(\"dataset_extracted/2022/heart_2022_with_nans.csv\")"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "markdown",
"source": [
"## Przeglądanie nieoczyszczonego datasetu"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 38,
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"<class 'pandas.core.frame.DataFrame'>\n",
"RangeIndex: 445132 entries, 0 to 445131\n",
"Data columns (total 40 columns):\n",
" # Column Non-Null Count Dtype \n",
"--- ------ -------------- ----- \n",
" 0 State 445132 non-null object \n",
" 1 Sex 445132 non-null object \n",
" 2 GeneralHealth 443934 non-null object \n",
" 3 PhysicalHealthDays 434205 non-null float64\n",
" 4 MentalHealthDays 436065 non-null float64\n",
" 5 LastCheckupTime 436824 non-null object \n",
" 6 PhysicalActivities 444039 non-null object \n",
" 7 SleepHours 439679 non-null float64\n",
" 8 RemovedTeeth 433772 non-null object \n",
" 9 HadHeartAttack 442067 non-null object \n",
" 10 HadAngina 440727 non-null object \n",
" 11 HadStroke 443575 non-null object \n",
" 12 HadAsthma 443359 non-null object \n",
" 13 HadSkinCancer 441989 non-null object \n",
" 14 HadCOPD 442913 non-null object \n",
" 15 HadDepressiveDisorder 442320 non-null object \n",
" 16 HadKidneyDisease 443206 non-null object \n",
" 17 HadArthritis 442499 non-null object \n",
" 18 HadDiabetes 444045 non-null object \n",
" 19 DeafOrHardOfHearing 424485 non-null object \n",
" 20 BlindOrVisionDifficulty 423568 non-null object \n",
" 21 DifficultyConcentrating 420892 non-null object \n",
" 22 DifficultyWalking 421120 non-null object \n",
" 23 DifficultyDressingBathing 421217 non-null object \n",
" 24 DifficultyErrands 419476 non-null object \n",
" 25 SmokerStatus 409670 non-null object \n",
" 26 ECigaretteUsage 409472 non-null object \n",
" 27 ChestScan 389086 non-null object \n",
" 28 RaceEthnicityCategory 431075 non-null object \n",
" 29 AgeCategory 436053 non-null object \n",
" 30 HeightInMeters 416480 non-null float64\n",
" 31 WeightInKilograms 403054 non-null float64\n",
" 32 BMI 396326 non-null float64\n",
" 33 AlcoholDrinkers 398558 non-null object \n",
" 34 HIVTesting 379005 non-null object \n",
" 35 FluVaxLast12 398011 non-null object \n",
" 36 PneumoVaxEver 368092 non-null object \n",
" 37 TetanusLast10Tdap 362616 non-null object \n",
" 38 HighRiskLastYear 394509 non-null object \n",
" 39 CovidPos 394368 non-null object \n",
"dtypes: float64(6), object(34)\n",
"memory usage: 135.8+ MB\n"
]
}
],
"source": [
"df.info()"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 39,
"outputs": [
{
"data": {
"text/plain": " State Sex GeneralHealth PhysicalHealthDays MentalHealthDays \\\n0 Alabama Female Very good 0.0 0.0 \n1 Alabama Female Excellent 0.0 0.0 \n2 Alabama Female Very good 2.0 3.0 \n3 Alabama Female Excellent 0.0 0.0 \n4 Alabama Female Fair 2.0 0.0 \n\n LastCheckupTime PhysicalActivities \\\n0 Within past year (anytime less than 12 months ... No \n1 NaN No \n2 Within past year (anytime less than 12 months ... Yes \n3 Within past year (anytime less than 12 months ... Yes \n4 Within past year (anytime less than 12 months ... Yes \n\n SleepHours RemovedTeeth HadHeartAttack ... HeightInMeters \\\n0 8.0 NaN No ... NaN \n1 6.0 NaN No ... 1.60 \n2 5.0 NaN No ... 1.57 \n3 7.0 NaN No ... 1.65 \n4 9.0 NaN No ... 1.57 \n\n WeightInKilograms BMI AlcoholDrinkers HIVTesting FluVaxLast12 \\\n0 NaN NaN No No Yes \n1 68.04 26.57 No No No \n2 63.50 25.61 No No No \n3 63.50 23.30 No No Yes \n4 53.98 21.77 Yes No No \n\n PneumoVaxEver TetanusLast10Tdap \\\n0 No Yes, received tetanus shot but not sure what type \n1 No No, did not receive any tetanus shot in the pa... \n2 No NaN \n3 Yes No, did not receive any tetanus shot in the pa... \n4 Yes No, did not receive any tetanus shot in the pa... \n\n HighRiskLastYear CovidPos \n0 No No \n1 No No \n2 No Yes \n3 No No \n4 No No \n\n[5 rows x 40 columns]",
"text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>State</th>\n <th>Sex</th>\n <th>GeneralHealth</th>\n <th>PhysicalHealthDays</th>\n <th>MentalHealthDays</th>\n <th>LastCheckupTime</th>\n <th>PhysicalActivities</th>\n <th>SleepHours</th>\n <th>RemovedTeeth</th>\n <th>HadHeartAttack</th>\n <th>...</th>\n <th>HeightInMeters</th>\n <th>WeightInKilograms</th>\n <th>BMI</th>\n <th>AlcoholDrinkers</th>\n <th>HIVTesting</th>\n <th>FluVaxLast12</th>\n <th>PneumoVaxEver</th>\n <th>TetanusLast10Tdap</th>\n <th>HighRiskLastYear</th>\n <th>CovidPos</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>0</th>\n <td>Alabama</td>\n <td>Female</td>\n <td>Very good</td>\n <td>0.0</td>\n <td>0.0</td>\n <td>Within past year (anytime less than 12 months ...</td>\n <td>No</td>\n <td>8.0</td>\n <td>NaN</td>\n <td>No</td>\n <td>...</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>No</td>\n <td>No</td>\n <td>Yes</td>\n <td>No</td>\n <td>Yes, received tetanus shot but not sure what type</td>\n <td>No</td>\n <td>No</td>\n </tr>\n <tr>\n <th>1</th>\n <td>Alabama</td>\n <td>Female</td>\n <td>Excellent</td>\n <td>0.0</td>\n <td>0.0</td>\n <td>NaN</td>\n <td>No</td>\n <td>6.0</td>\n <td>NaN</td>\n <td>No</td>\n <td>...</td>\n <td>1.60</td>\n <td>68.04</td>\n <td>26.57</td>\n <td>No</td>\n <td>No</td>\n <td>No</td>\n <td>No</td>\n <td>No, did not receive any tetanus shot in the pa...</td>\n <td>No</td>\n <td>No</td>\n </tr>\n <tr>\n <th>2</th>\n <td>Alabama</td>\n <td>Female</td>\n <td>Very good</td>\n <td>2.0</td>\n <td>3.0</td>\n <td>Within past year (anytime less than 12 months ...</td>\n <td>Yes</td>\n <td>5.0</td>\n <td>NaN</td>\n <td>No</td>\n <td>...</td>\n <td>1.57</td>\n <td>63.50</td>\n <td>25.61</td>\n <td>No</td>\n <td>No</td>\n <td>No</td>\n <td>No</td>\n <td>NaN</td>\n <td>No</td>\n <td>Yes</td>\n </tr>\n <tr>\n <th>3</th>\n <td>Alabama</td>\n <td>Female</td>\n <td>Excellent</td>\n <td>0.0</td>\n <td>0.0</td>\n <td>Within past year (anytime less than 12 months ...</td>\n <td>Yes</td>\n <td>7.0</td>\n <td>NaN</td>\n <td>No</td>\n <td>...</td>\n <td>1.65</td>\n <td>63.50</td>\n <td>23.30</td>\n <td>No</td>\n <td>No</td>\n <td>Yes</td>\n <td>Yes</td>\n <td>No, did not receive any tetanus shot in the pa...</td>\n <td>No</td>\n <td>No</td>\n </tr>\n <tr>\n <th>4</th>\n <td>Alabama</td>\n <td>Female</td>\n <td>Fair</td>\n <td>2.0</td>\n <td>0.0</td>\n <td>Within past year (anytime less than 12 months ...</td>\n <td>Yes</td>\n <td>9.0</td>\n <td>NaN</td>\n <td>No</td>\n <td>...</td>\n <td>1.57</td>\n <td>53.98</td>\n <td>21.77</td>\n <td>Yes</td>\n <td>No</td>\n <td>No</td>\n <td>Yes</td>\n <td>No, did not receive any tetanus shot in the pa...</td>\n <td>No</td>\n <td>No</td>\n </tr>\n </tbody>\n</table>\n<p>5 rows × 40 columns</p>\n</div>"
},
"execution_count": 39,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.head()"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 40,
"outputs": [
{
"data": {
"text/plain": " PhysicalHealthDays MentalHealthDays SleepHours HeightInMeters \\\ncount 434205.000000 436065.000000 439679.000000 416480.000000 \nmean 4.347919 4.382649 7.022983 1.702691 \nstd 8.688912 8.387475 1.502425 0.107177 \nmin 0.000000 0.000000 1.000000 0.910000 \n25% 0.000000 0.000000 6.000000 1.630000 \n50% 0.000000 0.000000 7.000000 1.700000 \n75% 3.000000 5.000000 8.000000 1.780000 \nmax 30.000000 30.000000 24.000000 2.410000 \n\n WeightInKilograms BMI \ncount 403054.000000 396326.000000 \nmean 83.074470 28.529842 \nstd 21.448173 6.554889 \nmin 22.680000 12.020000 \n25% 68.040000 24.130000 \n50% 80.740000 27.440000 \n75% 95.250000 31.750000 \nmax 292.570000 99.640000 ",
"text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>PhysicalHealthDays</th>\n <th>MentalHealthDays</th>\n <th>SleepHours</th>\n <th>HeightInMeters</th>\n <th>WeightInKilograms</th>\n <th>BMI</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>count</th>\n <td>434205.000000</td>\n <td>436065.000000</td>\n <td>439679.000000</td>\n <td>416480.000000</td>\n <td>403054.000000</td>\n <td>396326.000000</td>\n </tr>\n <tr>\n <th>mean</th>\n <td>4.347919</td>\n <td>4.382649</td>\n <td>7.022983</td>\n <td>1.702691</td>\n <td>83.074470</td>\n <td>28.529842</td>\n </tr>\n <tr>\n <th>std</th>\n <td>8.688912</td>\n <td>8.387475</td>\n <td>1.502425</td>\n <td>0.107177</td>\n <td>21.448173</td>\n <td>6.554889</td>\n </tr>\n <tr>\n <th>min</th>\n <td>0.000000</td>\n <td>0.000000</td>\n <td>1.000000</td>\n <td>0.910000</td>\n <td>22.680000</td>\n <td>12.020000</td>\n </tr>\n <tr>\n <th>25%</th>\n <td>0.000000</td>\n <td>0.000000</td>\n <td>6.000000</td>\n <td>1.630000</td>\n <td>68.040000</td>\n <td>24.130000</td>\n </tr>\n <tr>\n <th>50%</th>\n <td>0.000000</td>\n <td>0.000000</td>\n <td>7.000000</td>\n <td>1.700000</td>\n <td>80.740000</td>\n <td>27.440000</td>\n </tr>\n <tr>\n <th>75%</th>\n <td>3.000000</td>\n <td>5.000000</td>\n <td>8.000000</td>\n <td>1.780000</td>\n <td>95.250000</td>\n <td>31.750000</td>\n </tr>\n <tr>\n <th>max</th>\n <td>30.000000</td>\n <td>30.000000</td>\n <td>24.000000</td>\n <td>2.410000</td>\n <td>292.570000</td>\n <td>99.640000</td>\n </tr>\n </tbody>\n</table>\n</div>"
},
"execution_count": 40,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.describe()"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "markdown",
"source": [
"#### Tylko 6 kolumn jest numeryczne na razie więc wiele statystyk nie zostaje wyświetlonych w tym podsumowaniu"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "markdown",
"source": [
"#### Zbiór danych jest niezbalansowany, zmienna którą chcemy przewidzieć w znacznej większości przypadków wynosi 0:"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 41,
"outputs": [
{
"data": {
"text/plain": "<AxesSubplot:ylabel='HadHeartAttack'>"
},
"execution_count": 41,
"metadata": {},
"output_type": "execute_result"
},
{
"data": {
"text/plain": "<Figure size 640x480 with 1 Axes>",
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAZoAAAGFCAYAAADEhjUtAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjQuMywgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/MnkTPAAAACXBIWXMAAA9hAAAPYQGoP6dpAAA7KklEQVR4nO3deXhU5eE98HPvTPY9ZCUkhCUbIBCWsAqC7IoLYhEKCKgogvZr1aq/UoRWsdS2atUWtSIiWhbBBUVQCAjIEgIhYQmELQshCSHrTPa59/7+CEmhIIaZuXNnOZ/n8WlJ4uQQMCfvct9XUBRFARERkUpErQMQEZFzY9EQEZGqWDRERKQqFg0REamKRUNERKpi0RARkapYNEREpCoWDRERqYpFQ0REqmLREBGRqlg0RESkKhYNERGpikVDRESqYtEQEZGqWDRERKQqFg0REamKRUNERKpi0RARkapYNEREpCoWDRERqYpFQ0REqmLREBGRqlg0RESkKhYNERGpikVDRESqYtEQEZGqWDRERKQqFg0REamKRUNERKpi0RARkapYNEREpCoWDRERqYpFQ0REqmLREBGRqlg0RESkKhYNERGpikVDRESqYtEQEZGqWDRERKQqFg0REamKRUNERKpi0RARkapYNEREpCoWDRERqYpFQ0REqtJrHYDIUZgkGVAAURQgioLVXleWFciKAkUBRBHQifz5j5wLi4ZcniTLUBRAJwoQhP8WiKIoMNY2ocJQj8uV9Sivbv6nwlCPiuoGVBjqUVtvav1YBQAUQLny6+a347r3QwC8PfXw93aHn487/K78r7+3O/x93BES6IXQQC8E+nnA3U13TR5JUqDXs4jIsbBoyKVIkgxB+O+I5HJlHXLyK5BbVI2yqjqUXymQiuoGVBobIMuKpnl9PPVoF+CFdgGeCA/2Rmz7AHSJCkBse394ujf/52uSZIiCdUdZRNYkKC0/ehE5GZNJhk7XPEqRZQUXS43IKajEucJKnC2swvnCKtRcGZE4GkEAwoK80al9ADq190fnqAB07RCIkEAvAP+djtPrOPoh7bFoyCk0r3Eo0IkiGpsk5BZV40xBc6GcK6xCXnE1mkyy1jFV5+2pR2ykf2sBxUUHITbSH6IowCTJLB7SBIuGHFbLN05JknEyrwKHT15CRs4lnL1QCY1nvOyKr5cbeseHok9iGFK6RSDA1wOSrEAAON1GNsGiIYfRsl4iigKKLtfg4IliZOSU4tjZy6hvlDRO5zhiI/3RJzEM/ZPCkRQbDJ1O5GiHVMWiIbsmSTJEUYCiACfzyrHvaBEOHC9G0eUaraM5BS8PPW7rGoK+V0Y7IYFe1xQ6kTWwaMjuyLICQQBMkoL07GLsO1qMQydLUF3TqHU0pxcV6ou+iWEY2S8aXToEQpJk6DjSIQuxaMhutEzf5BZV47u95/Hj4QsOuyvMGcRG+mNU/xjcmRIDXy83SLLMh0nJLCwa0lTL6KWhSUJqegG+P5CHsxeqtI5FV9HrRKR0D8fYgbFIjg+FrACigGsebiW6GRYNaaJlSuZUXjm+25eLPZkX0cAFfbsXEuiJkf1iMG5gR4QGeXMTAbUJi4ZspmXqxVjXhG1pefj+QD4KSgxaxyIzCALQo3MIRg+IwdBe7aHXiVfOauMoh67HoiHVtfzUe/TMZWzedx77jxY3H1BJTsHHU4/hfaMxeWQcQq/sWmPh0NVYNKSalumxw6cu4bMtJ3Eqv0LrSKQinSjg9t5RmDI6Hh3C/CDJCnQsHAKLhlTQUjBHci5h9ZaTOJXHgnElggAM6B6BKaMT0JVbpAksGrKilm8omadL8emWk8jOLdc6EmmsV1woZo5PQnzHIG6PdmEsGrJYS8FknWkumBPnWTB0rb6JYXh4Qjd0igpg4bggFg2ZraVgjp29jNVbTuL4uTKtI5GdG9gjEjMnJCE63I+bBlwIi4ZuWcs3iBPny/DJ5mwcY8HQLRAEYFjvKDx6bw/4+Xhww4ALYNHQLZFkBcbaRvxrQxZ+yrqodRxyYF4eevx6bCIm3t65+S4hbhhwWiwaapOWU5Q3783Fqs0nUMszyMhKOkcF4KkHe6NrdCAUReHRNk6IRUO/SFEUFJQY8Y+1GXwWhlQhCsCYgbGYM7E73PUiRzdOhkVDP8skyZBlBZ9uPYmvfjwLiddWksoCfT0w557uGNE3mrvTnAiLhq4jKwpEQcDhU5fwz88zUVJeq3UkcjE9u4ZgwYO9Ed7OGyKn0hwei4auIckyjLVNWL4xC3syudhP2tHrRDwwoiseGp0AQQCn0xwYi4YANO8mEwXgu325WPXtCV44RnYjsp0Pnp7SG907t+NGAQfFoiFIkozqmkb85ZN0PhNDdkkUgCmjEzB1TAKvI3BALBrCwRPFeHNNBqprGrWOQnRTveJC8bsZ/eDjqedUmgNh0bgo6cp9MB99cxxf7TqncRqitgvy88DvZvTjVJoDYdG4IEmWUVZVj9dWHsSZC5VaxyG6ZaIATB2TiCmj4zmV5gBYNC5o/7EivPGfw3y6nxxe7/jmqTRvD06l2TMWjYuQZQWCAKzanI0NO06Df+rkLIL9PfHCzH5Iig3mVJqdYtG4AJMko6FRwrJVB5GRU6p1HCKrE0UB08YkYMroBF4/YIdYNE5OlhXkFVfjlRUHcKmiTus4RKpKTgjFCzP6w9Ndx6k0O8KicWKKouCnzIt44z+H0WiStY5DZBMdwnzxyhODEejrwbKxEywaJ/btT+fx/hdZ4FmY5GqC/T3xyhOD0T7Eh2VjB1g0Tuo/35/EZ1tPaR2DSDM+Xm5Y/OhAxMcEcc1GYywaJ9JyadR7X2Thmz3ntY5DpDl3vYgXZvZHv27hPAVaQywaJyHLChQoeOM/Gfjx8AWt4xDZDVEUsGByL4we0FHrKC6LReMEJFmBJMlYujINh05e0joOkV2aMT4JvxoVr3UMl8SicXCSJKOhScLiD/YjO7dc6zhEdm3i0M6Ye/9trZf7kW2waByYSWq+pGzh8p+QV2zQOg6RQxiWHIXfTu0DQRC4ScBGWDQOyiQ1H4z5+3/9xKuWiW5Rcnwofj9nAPQ6ATqR25/VxqJxQJIk4+LlGvy/f/2ESkOD1nGIHFKPzu3wx8cHQydyZKM2VrmDkSQZFYYGLFy+lyVDZIFj58rw6kcHoECBzJ+3VcWicSCSLKOuUcLC5XtRXl2vdRwih3fo5CX8dfUhQGl+Do3UwaJxELKsQJIUvPz+PhSWGrWOQ+Q09mRexDvrj/CKARWxaByAojQ/jLl0ZRpy8iu0jkPkdH5Iy8cHXx7VOobTYtE4AEEQ8NaaI3wYk0hFX+8+h89TT2sdwymxaBzAR98cx45DBVrHIHJ6qzafwM7DFyDzyHOrYtHYMUVR8NWPZ7FxxxmtoxC5BEUB3lpzGMfOXYYk8w4na2HR2ClZVrD7SCE+3HRM6yhELsUkKXj1ozRcuGSEJLFsrIFFY4ckWcaxc5fxxn8ywB2XRLZXW2/Covf2odLYwJGNFbBo7IwkySgoMeKVFWkw8acpIs2UV9fjjx8egCLzGRtLsWjsiKwoaDTJeGXFAdQ1mLSOQ+TyzhVW4b0vsviMjYVYNHZEFAS88Z/DPCSTyI5s2Z+HHw9f4BSaBVg0dkKWFXyz5xz2HS3SOgoR/Y931h9BSVktNweYiUVjByRJRl5RNT78+rjWUYjoBuobJby6Mg2SrHC9xgwsGo3JcvO6zNKPufhPZM/yiw149/NMrteYgUWjMVEU8NaaDBSXcV2GyN6lphfghwN5PDngFrFoNCTLCjbvPY+fsi5qHYWI2ui9L46isJQPc94KFo1GJElGfokB//6KT/4TOZKGJgmvftQ81c0L09qGRaMBWVFgkmQsXZmGJhN/KiJyNIWlRvxj3RGIXK9pExaNBkRBwFtrj6Doco3WUYjITLsyCvHd3vNcr2kDFo2NSbKCrftzsftIodZRiMhCH246jvLqepbNL2DR2JAsK6g2NvB5GSIn0dAo4Z+fZ0IUOYV2MywaGxJFAe9+nslzzIicyMHsEuw7epG70G6CRWMjkiRj39GLOHC8WOsoRGR
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"df[\"HadHeartAttack\"].value_counts().plot(kind=\"pie\")"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 42,
"outputs": [
{
"data": {
"text/plain": "No 416959\nYes 25108\nName: HadHeartAttack, dtype: int64"
},
"execution_count": 42,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df[\"HadHeartAttack\"].value_counts()"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "markdown",
"source": [
"## 2. Podział na podzbiory (train / dev / test - 8:1:1)) i oversampling"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 43,
"outputs": [],
"source": [
"from sklearn.model_selection import train_test_split\n",
"#Funkcji z sklearn musimy użyć dwukrotnie, bo dzieli tylko na dwa podzbiory\n",
"train, test_and_valid = train_test_split(df, test_size=0.2) #0.8 train, 0.2 test&valid\n",
"\n",
"test, valid = train_test_split(test_and_valid, test_size=0.5) #0.1 test, 0.1 valid"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 44,
"outputs": [
{
"data": {
"text/plain": "No 333641\nYes 20042\nName: HadHeartAttack, dtype: int64"
},
"execution_count": 44,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"train[\"HadHeartAttack\"].value_counts()"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "markdown",
"source": [
"Zbiór treningowy jest nadal niezbalansowany więc zrobię prosty oversampling przez kopiowanie mniejszej klasy aż będą prawie równe"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 45,
"outputs": [],
"source": [
"def oversample(dataset):\n",
" num_true = len(dataset[dataset[\"HadHeartAttack\"]==\"Yes\"])\n",
" num_false = len(dataset[dataset[\"HadHeartAttack\"]==\"No\"])\n",
" num_oversampling_steps = num_false//num_true\n",
" oversampled = dataset.copy()\n",
" for x in range(num_oversampling_steps):\n",
" oversampled = pd.concat([oversampled, dataset[dataset[\"HadHeartAttack\"]==\"Yes\"]], ignore_index=True)\n",
" return oversampled"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 46,
"outputs": [],
"source": [
"train = oversample(train)"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 47,
"outputs": [
{
"data": {
"text/plain": "<AxesSubplot:ylabel='HadHeartAttack'>"
},
"execution_count": 47,
"metadata": {},
"output_type": "execute_result"
},
{
"data": {
"text/plain": "<Figure size 640x480 with 1 Axes>",
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAZoAAAGFCAYAAADEhjUtAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjQuMywgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/MnkTPAAAACXBIWXMAAA9hAAAPYQGoP6dpAAA1TElEQVR4nO3deXxU5aE+8Oc9Z5ZsM1kIIQv7jiwCYlBAFKRURChi1YqioK11QS3VW7XlVqUWt/YnrfaW6hUVqEUQxKWKVRYR2TfZt0AgCSRkneyZmXPO748huawaJjPzzpx5vp8Pn2YzPon0PPMu5z3CMAwDREREQaLIDkBERObGoiEioqBi0RARUVCxaIiIKKhYNEREFFQsGiIiCioWDRERBRWLhoiIgopFQ0REQcWiISKioGLREBFRULFoiIgoqFg0REQUVCwaIiIKKhYNEREFFYuGiIiCikVDRERBxaIhIqKgYtEQEVFQsWiIiCioWDRERBRULBoiIgoqFg0REQUVi4aIiIKKRUNR5emnn0a/fv1w9OjR8z5XXFyM7OxsPPHEExKSEZmXMAzDkB2CKFQqKysxduxYdOzYEfPmzYMQoulzjzzyCHbu3IlPPvkETqdTYkoic+GIhqKK0+nEzJkzsWnTJixatKjp41988QW+/PJLzJo1iyVDFGAsGoo6I0aMwPjx4/HKK6+gpKQE1dXV+MMf/oBJkyZh6NChyMnJwS9+8QsMGDAAw4YNw+OPP47i4uKmfz43Nxf33XcfrrjiCgwYMAD33XcfDhw4IPEnIgpvnDqjqORyuTB27FgMHToUTqcTa9aswbJly1BZWYnx48dj3LhxuOOOO1BXV4fXXnsNhw4dwqeffoq4uDhMnDgRPXv2xP333w+v14uXXnoJubm5+PLLL2X/WERhySI7AJEMiYmJePbZZzFt2jRYrVYsWLAAsbGx+Mc//oH09HTMmDGj6Wtnz56Nq666CsuXL8fEiRNx/PhxDBkyBFlZWbBarZg1axaOHDkCXdehKJwkIDoXi4ai1qhRo9CnTx9kZWXh8ssvBwDs3bsXhw4dwoABA8762oaGBuTk5AAApk+fjlmzZuG9995DdnY2rrnmGtx0000sGaKL4NQZRbXJkycjKysLL774IgDg5z//OXRdxzPPPHPe1zocDqSkpAAASktL8fXXX2P9+vVYs2YNrFYrli1bhtTU1JDmJ4oEfAlGdIZu3bohJycHGRkZ6NChAzp06IDExETMmjULBw8eRGlpKWbOnAmPx4OJEyfilVdewccff4zi4mJs2rRJdnyisMSiITrDpEmTUFVVhSeeeAL79+/H/v37MX36dOzatQvdu3dHYmIiVq9ejRkzZmDfvn3Iy8vDwoULYbVa0adPH9nxicISi4boDO3atcOCBQtQU1ODO+64A3fddResVivmzZuHlJQUWCwWvPnmm1AUBVOmTMHYsWOxbt06vPHGG2jfvr3s+ERhiWs0REQUVBzREBFRULFoiIgoqFg0REQUVCwaIiIKKhYNEREFFYuGiIiCikVDRERBxaIhIqKgYtEQEVFQ8TEBRM2g6wY03YCiCKiKCOj31XUDEICiCCgicN+bKFywaCjqGYavRADAop49yK+qcaO4og5FZTU4VV6H4vI6lLrqUNfghdujw+3R4PZqvre9GjweHZquX/TfZbWocMbb4EywITHejqQE++m3bUhMsCPFGYMkhx2OOBviY61n/bO64SulczMShTuedUZRxevVYbH4LtRuj4ayynoUltbiVHktistrUVxR5/tTXofSijq4vRcvjWBTFQFHvK+EkhwxyGodj06ZiejaNgnt0x2wWVXfz6TpUISAEsCRFlEgsWjItHwjC99Ul6bryD1Zib1Hy3DgWDkOHCtDYWmt7Ih+UwSQkZqATplOdMpMROcsJ7pkJSHZGQPg9JScwdEPhQcWDZlC4/RX44W1rLIee4+WYn9uOQ4cL8ORfJfU0UmoOOKs6JiRiE6ZTnTO8o1+2rZJgKoo8Go6i4ekYNFQxGq8cLo9Gg7nV2BfbuNopRxllfWy44WNWLsFl3drjUG90jC4dwaSHHZougEhwM0HFBIsGooomqZDVRWUuerxzY4CrNt1AgeOlTct5tMP65jhxKBebZDdOx092idDUQRHOxRULBoKa8bpnVaqqqCorNZXLjtP4FBehexophAfa8WA7q19xXNZOhzxNmi6b3OB4GiHAoRFQ2HHMHwL2aqioOBUNb7ZUYBvd55A7slK2dFMTQigS1YSBvVKQ3bvdHRtmwTd8G08YOlQS7BoKCwYhgHD8N20mHvChW92nMC6XSeQf6padrSolZoUg5GD2uPHV3VAWnIcp9fIbywakqrx4pV/qgr/2XgM63aeRFFZ5G47NiMhgN6dWmFUdntc0z8LVovS9KKAqDlYNCSFpusQEFi/6wQ+XXsUu4+Uyo5EzRBrt2DY5ZkYO7QTurRN4iiHmoVFQyGj6TpURYGrugGfrTuK5euPcRtyBOvWLgljh3bCtQPaQlEEBNdy6CJYNBR0jVuSc/IrsHT1YazbeQJejX/tzMIZb8OPstvjpmGdkZoU2/SCgqgRi4aCpnFaZeu+Inyw6hB253B6zMwUAVx5WTom/bgnOmclsnCoCYuGAk7TdcAAVm/Lx9LVh3G8sEp2JAqxq/tm4O4be6FtmgP66ccrUPRi0VDAeDUdAsC/vz2KpasPo9TF9Zdopgjg2oFtcdeYXmidFAsDPPImWrFoqMUa12A27DqJuZ/uwcmSGtmRKIyoisCPsttj0o97IslhB8BNA9GGRUN+a5wSOVrgwhvLdnGLMn0vm0XBmCEdcfuPeiA+xsrptCjCoiG/6LqBypoGzP1kL1ZvywP/FlFzxdhUjL+mC356fTfYrGpAH41N4YlFQ5dE03RouoHFKw7iw69z0ODWZEeiCBUfa8XE67piwrVdoCoCKm/8NC0WDTWLpulQFIEVm49j/uf7eaMlBUyyw44HJvbDkH6Z3KFmUiwa+l6N90LszinBmx/txpECl+xIZFJD+mXg4Z/2R3yshfffmAyLhi5K0w2UV9ZjztKd2LinUHYcigKOOCt+MaEvRlzRjqMbE2HR0Hl0w4AifNNk//hwF+oavLIjUZS5omcaHr19ABITbBzdmACLhs6iaTrqPRr++v52rNt5UnYcimKxdgumjrsMY67uBE03uDstgrFo6Cw7Dhbj1X9t42I/hY2+XVLxqzsGIDUxllNpEYpFQ9A0HYYBzP10Dz5de4T3xFDYsdtU3HVDT/xkeBfousGt0BGGRRPldN1AXlEVXp6/BceLePglhbceHZIx/Y6ByGgVz9FNBGHRRClNN6AIYMmqw/jn8v3warrsSETNYrUoeHBiP/xocAfZUaiZWDRRSNN1VFQ14E8LtvJ8MopYNw7piPtv7gsA3JkW5lg0UWj1tnzMWfIdauq5bZkiW+/OrfC7qdmIs1u4bhPGWDRRQtd9/5n/sWwXPvv2qOQ0RIGTmhSD/773KnTMcHLdJkyxaKKApuvwagZefHcztuwrkh2HKOBsFgXTbuuPEVe0kx2FLoBFY3KapqOq1o3fv7EeR09Uyo5DFFTjr+mM+8b3AQCObsIIi8bENF1HflE1nnlzPR+rTFGjX7dU/PaebMTYVK7bhAkWjUkZhoFt+0/hpflbeFYZRZ02KXH47/sGo22ag0fXhAEWjUn9+9sjeGPZ7qZNAETRxm5T8djtA3BN/yzZUaIei8ZEdMOAAPDWx7vx0ZojsuMQhYU7f9wTPxvdQ3aMqMaiMQlN16FpBl5ZsAUbdvPZMURnuvm6Lrh3XB/ZMaIWi8YENE1HdZ0Hz765AYfzK2THIQpLY67uiId+ejkMw4AQXLcJJRZNhNM0HafK6/C7Od+iuLxOdhyisDbiinb41c8GAOD251Bi0UQwTdNRXFGHJ19fy+fHEDXTkL4Z+M3kQRBCsGxChEUTobyajrLKevzmtW94jwzRJRrUqw1+NzUbiiKgcBot6Hg3UwTyajrKqxrw1OtrWTJEftiyrwiz3tkEwzB4C0AIsGgijKbpcFU34KnXv0FxBddkiPy
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"train[\"HadHeartAttack\"].value_counts().plot(kind=\"pie\")"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 48,
"outputs": [
{
"data": {
"text/plain": "<AxesSubplot:ylabel='HadHeartAttack'>"
},
"execution_count": 48,
"metadata": {},
"output_type": "execute_result"
},
{
"data": {
"text/plain": "<Figure size 640x480 with 1 Axes>",
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAZoAAAGFCAYAAADEhjUtAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjQuMywgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/MnkTPAAAACXBIWXMAAA9hAAAPYQGoP6dpAAA7CklEQVR4nO3dd2BUZaIF8HPvTHovpBBSCJBCQodQpUlHULEgKCi6YgHd57Ph07WLsu6uuJZldQVEdEUEURRBmiBSQkgglITQkpCQRuqkZ+6974+QCBI1zMzNnXJ+f6ySxMkhsDn5yv0+QVEUBURERCoRtQ5ARET2jUVDRESqYtEQEZGqWDRERKQqFg0REamKRUNERKpi0RARkapYNEREpCoWDRERqYpFQ0REqmLREBGRqlg0RESkKhYNERGpikVDRESqYtEQEZGqWDRERKQqFg0REamKRUNERKpi0RARkapYNEREpCoWDRERqYpFQ0REqmLREBGRqlg0RESkKhYNERGpikVDRESqYtEQEZGqWDRERKQqFg0REamKRUNERKpi0RARkapYNEREpCoWDRERqYpFQ0REqmLREBGRqlg0RESkKhYNERGpikVDRESqYtEQEZGqWDRERKQqFg0REamKRUNERKpi0RARkapYNEREpCoWDRERqUqvdQAiW2GUZEABRFGAKAoWe11ZViArCqAAgihAZ8HXJrIGLBpyeC3f6PW6Kwf4iqKgurYJ5YZ6XKysR1llPcoN9Sivamj+p6EBNXVNUBQFCoDm/2n+h6Iol73OL79ueau7ix5eHs7wcneGl4czvN2d4O3hAm8PZwT4uqKTrzv8vFzg7KS7Io8kX52TyNqxaMihSJIMQfhlRHKxog6n8ypwNr8SFyvqUG5oQHlVc4lUVDdAlpU/eEV1ebg5IcDHFQE+rgjyc0dUqDe6d/FFVGdvuDo3/9/XKMkQBcuOsogsSVAu/9GLyI4YJRk6UYAgCJBlBfkl1cjKLce5C5U4k1+JcxeqUFPXpHVMkwgCEOTnjq6dfRAd5o2unX3QvYsvAn3dAPz2KI1ICywasguyokBRFOhEEQ1NErIvVOLU+YrWUsktNKDJKGsdU3UernpEdfZB186/lE9UqDdEUYBRklk8pAkWDdmslm+ckiQjM6ccqZnFSMsqxpm8Cmg842VVPNyc0C+mE/rHBSGpZwh8PF0gyQoEgNNt1CFYNGQzWtZLRFFAwcUaHDxRiLSsEhw7cxH1jZLG6WxHVKg3+scFYVDPYMRH+UMnihztkKpYNGTVJEmGTidClhVk5pRh39ECJB8vxIWLNVpHswtuLnr07h7YOtoJ9HW7otCJLIFFQ1ZHVpqndYySgoMnCrH/WAFSMopgqLXNhXtb0iXIEwPigjBmQDi6dfFtLXoic7BoyGq0TN/kFFRh075s7ErNs9ldYfYgMsQL45MicX1SBDzdnFg6ZDIWDWlKkpufAWlokrAj5Tx+OJCDM3mVWseiy+h1IpISgjFxSBT6xXSCrACiAAgCp9aofVg0pImWn44zs8uweX829hy5gAYu6Fu9QF9XjB0YgUlDItHJz52bCKhdWDTUYVrKpbq2EVuTc/HDgRzkFVdrHYtMIAhAYnQgJgyOwPA+naHXiVAUbiCgtrFoSHUtBZN+ugSbfs7GgeMFMEr8a2cvPFz1GDUgHLeO6Y5Ofu6QZYWFQ1dg0ZBqWgom7WQxPt2SiZM55VpHIhWJooCRfcMwc3wMugR5QZIVnkRNAFg0pIKWgjmc1VwwmdksGEciCMDghBDMHB+L7twiTWDRkAW1fEM5cqoEn27OREZ2mdaRSGN9YzphzqR4xET6QZJl6EQWjiNi0ZDZLl+D+XRzJk6cY8HQlQbEBeHuKT3RNcyHheOAWDRkspaCOXr6IlZvzmDB0B8akhiKuVPiER7sxU0DDoRFQ9dMVhSIgoDjZ0vxyfcZOH62VOtIZEMEAbiubxj+ND0RPp4uLBsHwKKhayLJMmrqmrBs/VH8dDhf6zhkw9xc9Jg1IRY3juzWfJcQNwzYLRYNtUvLNNn3e8/h4+9OoKbeqHUkshNdO3vjkdv7oke4HxRF4dE2dohFQ39IURScL6rGP79I47MwpApBACYOjsS8aQlwcdJxdGNnWDT0m4ySDFlW8OmWTHy96wwkXltJKvP1dMF9NyZidP8u3J1mR1g0dJWW3UCpJ4vx/pdHUFRWq3UkcjC9ewRi4a19ERzgDpFTaTaPRUNXkGQF1bWNWLY+HXuOXNA6DjkwvU7ELWO6447xsRAEcDrNhrFoCEDzKEYQgE17s7Fq0wnUcrGfrERogAcendkXCdEB3Chgo1g0BEmSYahtxJJVKTjGZ2LICokCMHN8LGZNiOV1BDaIRUNIySjCW/9NRVVNo9ZRiH5X7x6BeHrOIHi46jmVZkNYNA5KkmQAwIpvj+Pr3Wc1TkPUfn5eLnhqzkBOpdkQFo0DkmQZFyvq8cbHB3E6r0LrOETXTBSAWRPiMHN8DKfSbACLxgHtP1aAt/6bygV/snn9YjrhyTkD4e7CqTRrxqJxEC27ylZtysC6nafAP3WyF/7ernh67kDER/lzKs1KsWgcgFGS0dAoYcmqg0jLKtE6DpHFiaKAOyfG4fZxMbx+wAqxaOycLCvIKajCqysOoLi8Tus4RKrqHxuEJ+cMhJszz0uzJiwaO6YoCvamX8A/PktFo1HWOg5Rhwjr5InXHhoGX08Xlo2VYNHYse/3nsOy9engWZjkaPy9XfHqg8PQOdCDZWMFWDR26r8/ZOKzLSe1jkGkGQ83J7zwpyGIjfDjmo3GWDR2pOXSqA++OoqNe/gQJpGzXsTTcwdhYM9gngKtIRaNnZBlBQoUvPVZKnal8YplohaiKGDhrX0wfnCk1lEcFovGDkiyAkmSsXhlMg5lFmsdh8gqzZkcj9vHxWgdwyGxaGycJMloaJLwwof7kJnNa5aJfs+066Ix/6ZerdPM1DFYNDbMKMmorm3Es8v2IrfQoHUcIpswsl8Y/ndWfwiCwE0CHYRFY6MkScbFijr8379+5oOYRNeoX0wnPHvvYOh1AnQitz+rjUVjg4ySjIKLNfi/939GRXWD1nGIbFJidABefmAYdCJHNmpjldsYSZJRWd2A55btZckQmeHY2VK8tuIAFCiQ+fO2qlg0NkSSZdQ1Snhu2V6UVdVrHYfI5h3KLMbfVh8ClObn0EgdLBobIcsKJEnBCx/sQ15xtdZxiOzGniMX8O7aw9yFpiIWjQ1o+Ulr8cpkZOVyCzORpW1NzsV/vj6qdQy7xaKxAYIgYOnnaXwYk0hFX+8+i7Xbs7SOYZdYNDZg+cbj2HnovNYxiOzeJ99n4MfUPMg88tyiWDRWTFEUbNh1Gl/9eFrrKEQOQVGAtz9PxbGzFyHJvMPJUlg0VkqWFexOy8fyjce1jkLkUIySgtdWJCOvuBqSxLKxBBaNFZJkBemnL2Lp56ngjkuijldbb8Tz/96HyupGlo0FsGisjCTJOF9kwOKVyTBKbBkirZRV1eOlj/ZD4TM2ZmPRWBFZVtBolPHaigOoazBqHYfI4Z3Nr8S/v0rnMzZmYtFYEVEUsPTzVBSW1modhYgu2bw/B7tS87g5wAwsGishywq+3XMWe9MLtI5CRL/y7trDKCqt5XqNiVg0VkCSZOQUVOGjb7jDjMga1TdKeG1lMiRZ4XqNCVg0GmtZl1n8cTKM/GmJyGrlFhrw/rojXK8xAYtGY6Io4O3P07guQ2QDth88j23JOTw54BqxaDQkywq++/ksfk6/oHUUImqnZeuPIr+ED3NeCxaNRrguQ2SbGpokvLaieaqbF6a1D4tGA7KsoEmS8frHB9Fk5E9FRLYmv6Qa//ziMESu17QLi0YDLesyBaU1WkchIhPtTsvH93vPcb2mHVg0HUySFWzel409R7guQ2TrPtp4HGVV9SybP8Ci6UCyrKCquoEnMhPZiYZGCe9/eQSiyCm038Oi6UCiKOD9dUd4jhmRHTmYUYR9Ry/wObjfwaLpIJIkY9/RC9h/rFDrKERkYf/+6igkSeapAb+BRdMBFKX56f9l649qHYWIVFB
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"test[\"HadHeartAttack\"].value_counts().plot(kind=\"pie\")"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 49,
"outputs": [
{
"data": {
"text/plain": "<AxesSubplot:ylabel='HadHeartAttack'>"
},
"execution_count": 49,
"metadata": {},
"output_type": "execute_result"
},
{
"data": {
"text/plain": "<Figure size 640x480 with 1 Axes>",
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAZoAAAGFCAYAAADEhjUtAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjQuMywgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/MnkTPAAAACXBIWXMAAA9hAAAPYQGoP6dpAAA7KUlEQVR4nO3dd2BV9eE28Oecc7M32YQMVhIIe4SlIHspKg4EBQUrLrQ/X7ViteKoKLWtWGul2gIiDkAoiiIzDJkhEBJGQpiBBDLIvJk395zz/hGTSoka7r0n547n80eVJN48BJon3/NdgqqqKoiIiDQi6h2AiIicG4uGiIg0xaIhIiJNsWiIiEhTLBoiItIUi4aIiDTFoiEiIk2xaIiISFMsGiIi0hSLhoiINMWiISIiTbFoiIhIUywaIiLSFIuGiIg0xaIhIiJNsWiIiEhTLBoiItIUi4aIiDTFoiEiIk2xaIiISFMsGiIi0hSLhoiINMWiISIiTbFoiIhIUywaIiLSFIuGiIg0xaIhIiJNsWiIiEhTLBoiItIUi4aIiDTFoiEiIk2xaIiISFMsGiIi0hSLhoiINMWiISIiTbFoiIhIUywaIiLSFIuGiIg0xaIhIiJNsWiIiEhTLBoiItIUi4aIiDTFoiEiIk2xaIiISFMsGiIi0pRB7wBEjsIsK4AKiKIAURRs9rqKqkJRVEAFBFGAZMPXJrIHLBpyeYqiQlFVGKRrB/iqqsJY04ByYx2ultehtLIOZcY6lFXWo8zY+Ovq2gaoABr/p/kfUFX1J69z/du8PAzw83GHn7c7/H3c4efjDv8f/z040BOhgd4I8vOAu5t0TR5ZVmEw8EEEORYWDbkUWVYgCP8dkVwtr8WZvHKcy69AcXktyirrUGasR1llHSqqTY0jDR35eLkhOMATwQGeCAvyRlykP7p0CERce394ujf+39csKxAF246yiGxJUH/6YxaREzGbFUiSAEEQoCgq8ourcPpSOc7ll+NcfiXOXa5AdW2D3jEtIghAWJA3OrYPQKcof3RsH4AuHQIREugF4OdHaUR6YNGQU1BUFaqqQhJF1DfIuHC5AmfyKnAuvxxn8ytwscCIBrOid0zN+XgaENc+AB3bN5ZP1+hAxEb4QxQFmGWFxUO6YNGQw2r6xikrCk7lluFwdhHSTxXhbF45dH7iZVd8vNzQNz4U/RLDkNw9AgG+HpAVFQLAx23UJlg05DCa5ktEUcCVq9U4dLIA6TnFOH72KupMss7pHEdcpD/6J4ZhQPdwdItrB0kUOdohTbFoyK7JsgJJEqEoKrJzS7H/2BWknijA5avVekdzCl4eBvTuGoJ+CWFITopAcIDXNYVOZAssGrI7yo9/JWVZxaGTBThw/ArSsgphrHHMiXtH0iHMF/0TwzCyfzQ6dwhsLnoia7BoyG40Pb7JvVKJjfsvYNeRPIddFeYMYiP8MDY5FqOTY+Dr5QZZUSCJLB26cSwa0pWsNO4BqW+QkZJ2CVsO5uJsXoXesegnDJKIQUkRGDc4Fn3jQ6GogCgAgsBHa9Q6LBrSRdMjmewLpfh+/wXszbyMek7o272QQE+MHhCD8YNjERrkzUUE1CosGmozTeVirDFh68FcbE29iLyiKr1jkQUEAejZOQRjk2MwrHd7GCQRqsoFBNQyFg1prqlgMs8U47u955F6ogBmmX/tnIWPpwG39I/GXSO7IDTIG4qisnDoGiwa0kxTwaSfKsJnm7NxKrdM70ikIVEUMLxPFKaNjUeHMD/IisqTqAkAi4Y00FQwR3MaCyb7AgvGlQgCMCgpAtPGJqALl0gTWDRkQ03fUDJOF+OzTdnIulCqdyTSWZ/4UMyc0A3xsUFcHu3CWDRktZ/OwXy2KRsnz7Ng6Fr9E8Pw4KTu6BgVwMJxQSwaslhTwRw7cxUrN2WxYOhXDe4RiVmTuiE63I+LBlwIi4ZumKKqEAUBJ86V4NPvs3DiXInekciBCAJwc58oPDylBwJ8PbhgwAWwaOiGyIqC6toGLFl3DD8czdc7DjkwLw8Dpo9LwO3DOzfeJcQFA06LRUOt0vSY7Pt95/HJdydRXWfWOxI5iY7t/fHUvX3QNToIqqryaBsnxKKhX6WqKvKKqvC31elcqkyaEARg/KBYzL4tCR5uEkc3ToZFQz/LLCtQFBWfbc7G17vOQua1laSxQF8PzJmShJH9o7nh04mwaOg6TauBjmQX4h9rM1FYWqN3JHIxvbqEYN49fRAe7A2Rj9IcHouGriErCqpqGvDh2kzszbysdxxyYQZJxF0ju+C+sQkQBPBxmgNj0RCA/94Ls3HfeazYmIUaTvaTnYgM9sFT0/qgR6dgLhRwUCwagiwrMNaYsOjTNBw/yz0xZH9EAbh3bAJmjEuAoqo8WcDBsGgIaVmFePeLI6isNukdhegX9eoaghdmDoSPp4GP0hwIi8ZFybICAFj27Ql8vfuczmmIWi/IzwO/mzkASXyU5jBYNC5IVhRcLa/D258cwpm8cr3jEN0wUQCmj0vEtLHxvNnTAbBoXNDB4wX46xeHOeFPDq9vfCienzkA3h58lGbPWDQuQlFUCALw6fdZ+CrlNPinTs6inb8nXpg1AN3i2vFRmp1i0bgAs6yg3iRj0YpDSM8p1jsOkc2JooD7xyfi3jHxvH7ADrFonJyiqLhwpRJvLjuIorJaveMQaapfQhienzkAXu48L82esGicmKqq2Jd5GX/9/AhMZkXvOERtIirUF398bCgC/TxgYNnYBRaNE/t+33ksWZcJnoVJrqadvyf++NhQtA/x4cjGDrBonNSXW07hs83Zescg0o2PlxsW/GYwEmKCOGejMxaNE2m6NOqj/xzDhj3chEnkbhDxwqyBGNA9nKdA64hF4yQURYUKFe9+kY5dR/L0jkNkN0RRwLy7e2PsoFi9o7gsFo0TkBUVsqxg4fJUHM4u0jsOkV2aObEb7h0Tr3cMl8SicXCyrKC+QcaCj/fzmmWiX3HbzZ0w946ezY+ZqW2waByYWVZQVWPCS0v24WKBUe84RA5hRN8oPDOjHwQIXCTQRlg0DkqWFVwtr8XvP9zLjZhEN6hvfChemjMIBkng3TZtgEXjgMyygoKr1XjxH3tRXlWvdxwih9SjUzBef3QIJFHkyEZjrHIHI8sKKqrq8dKSfSwZIiscP1eCN5elQoUKhT9va4pF40BkRUGdScbLS/ahtLJO7zhEDu9wdhH+vPIwoDbuQyNtsGgchKKokGUVCz7aj7yiKr3jEDmNPRmX8fc1R7kKTUMsGgfQ9JPWwuWpOHWRS5iJbG1r6kX86+tjesdwWiwaByAIAhZ/mc7NmEQa+nr3OazZnqN3DKfEonEAy749gR2HL+kdg8jprdiYhR2HL0Hhkec2xaKxY6qqYv2uM1i344zeUYhcxt9WpeP4uauQZd7hZCssGjulKCp2p+dj6YYTekchcilmWcWby1KRV1zFsrERFo0dkhUVmWeKsfjLI+CKS6K2V1Nnxiv/3I+KKhPLxgZYNHZGlhVcKjTizWWpMMtsGSK9lFbW4dV/7YfKPTZWY9HYEUVRYTIreHPZQdSZZL3jELm885crseQ/mdxjYyUWjR0RRQHvfZmOgpIavaMQ0Y82H8jFziN5kBU+QrMUi8ZOKIqK7/aew97My3pHIaL/8cGaoygsqeF8jYVYNHZAlhXkXqnEv7/hCjMie1RnkvHm8lTIisr5GguwaHTWNC+z8JNUNJj50xKRvbpYYMQHX2VwvsYCLBqdcV6GyHGkpF3CttRcnhxwg1g0OuK8DJHjWbLuGPK5mfOGsGh0Yua8DJFDqm+Qf9znpvDCtFZi0ehAUVQ0cF6GyGHlF1fhb6uPQuR8TauwaHTAeRkix7c7PR/f7zvP+ZpWYNG0MVlRsGn/Bc7LEDmBf284gdLKOpbNr2DRtCFFUWGsNmHZt5yXIXIG9SYZH3yVAVHkI7RfwqJpQ6Io4IOvMlBTZ9Y7ChHZSFpWIfYfuwwzV6H9LBZNG5FlBQeOXcGB4wV6RyEiG/vnf45BlhWeGvAzWDRtQFUbd/9/uC5T7yhEpIGSijp8sjFL7xh2i0XTBgR
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"valid[\"HadHeartAttack\"].value_counts().plot(kind=\"pie\")"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "markdown",
"source": [
"Proporcje osób palących / niepalących w pierwotnym zbiorze danych:"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 50,
"outputs": [
{
"data": {
"text/plain": "<AxesSubplot:ylabel='SmokerStatus'>"
},
"execution_count": 50,
"metadata": {},
"output_type": "execute_result"
},
{
"data": {
"text/plain": "<Figure size 640x480 with 1 Axes>",
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAp0AAAGFCAYAAAC/uEhVAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjQuMywgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/MnkTPAAAACXBIWXMAAA9hAAAPYQGoP6dpAAB1hklEQVR4nO3dd1hTZ8MG8DsJe6goCIi4UHCiAk5w4aqziq1bP0Wtq666tVatWn0ddaF1tlrRtk5ctbbWvUfdiigKspS9Z3LO9wcllYoWIeEEuH/X9V6vJifJnVDh5nnO8xyZKIoiiIiIiIi0SC51ACIiIiIq+Vg6iYiIiEjrWDqJiIiISOtYOomIiIhI61g6iYiIiEjrWDqJiIiISOtYOomIiIhI61g6iYiIiEjrWDqJiIiISOtYOomIiIhI61g6iYiIiEjrWDqJiIiISOtYOomIiIhI61g6iYiIiEjrWDqJiIiISOtYOomIiIhI61g6iYiIiEjrWDqJiIiISOtYOomIiIhI61g6iYiIiEjrWDqJiIiISOtYOomIiIhI61g6iYiIiEjrWDqJiIiISOtYOomIiIhI61g6iYiIiEjrWDqJiIiISOtYOomIiIhI61g6iYiIiEjrWDqJiIiISOtYOomIiIhI61g6iYiIiEjrWDrf4OnpCU9PTyQnJ79136xZszBkyBAJUumW0NBQODk54dq1axp9XicnJxw8eFCjz0lERES6g6XzX8LCwrB8+XKpYxARERGVKCyd/2Jvb49ffvkFly9fljoKERERUYnB0vkvPXv2RIsWLTB37tw8p9lzJCUlYd68eWjevDlcXV0xdOhQ3L9/HwAQEhKC2rVr49y5c7keM3v2bAwYMAAAkJmZiRUrVqBVq1Zo3Lgx+vbti4sXL6qPPXjwIDp27IjFixfD1dUV48aNyzPHuXPn4OXlhYYNG6JFixaYNWsWEhISAADXrl1D3bp18ccff6Bz585wdnbG0KFDERERgcWLF8PNzQ0tWrTAd999l+s5/fz80LNnTzg7O8PT0xMbN26ESqXK8/UDAwPh7u6OGTNmqI85c+YMvLy84OzsjI4dO2LNmjXIzMxUP+bVq1cYO3YsGjdujNatW+Po0aPv/JyJiIioZGDp/BeZTIYlS5YgISEB//vf//I8RhRFjBo1CiEhIdi8eTP27t2LRo0aYcCAAXj06BHs7e3RpEkTHDt2TP2YjIwM/P777/Dy8gKQXUAvXbqElStX4tChQ+jSpQvGjBmDs2fPqh/z8uVLREZGws/PD1OmTHkrR2xsLD7//HP06dMHv/76K3x8fHDjxo1cpweoVCp89913WLlyJXbu3Al/f398/PHH0NfXx759+9C/f3+sWbMGT548AQDs2LED8+bNQ79+/XDkyBFMmjQJ27dvx7Jly956/eDgYAwbNgytW7fGsmXLoFAocP78eUyePBl9+/bFsWPHMH/+fJw4cQLTp08HACiVSowcORJxcXHw9fXF2rVrsX379g//QhEREVHxIpJau3btxHXr1omiKIo///yz6OjoKF64cEEURVGcOXOmOHjwYFEURfHy5cuik5OTGBcXl+vxgwYNEmfOnCmKoigePHhQbNSokZiamiqKoij++uuvYsOGDcWkpCQxKChIdHR0FB89epTr8TNmzFC/xoEDB0RHR0fx8ePH78z76NEj0dHRUTx9+rT6toCAAPVjrl69Kjo6Oopnz55V3z9hwgSxdevWoiAIoiiKYlpamujo6CgePXpUFARBbNmypbhs2bJcr7Njxw6xXr16YmJiohgSEiI6OjqK+/fvF1u3bi3OnTtX/VyiKIoDBgwQFy9enOvxV65cER0dHcWQkBDx/PnzoqOjoxgcHPzW+zhw4MA73ysREREVb3pSl15d1a9fP5w8eRJffvllrhFLAHj48CFEUUS7du1y3Z6ZmYmMjAwAQOfOnfH111/jzz//RPfu3XHkyBF06NABZmZmuHDhAgBg4MCBuR6flZWFMmXK5LqtWrVq78xYp04ddO/eHWPGjIGVlRXc3d3Rtm1bdOzYMddxVatWVf/ZxMQElStXhkwmAwAYGRmps8fGxiI6Ohqurq65Ht+0aVNkZWXh+fPnqFChAgBgwYIFyMrKgq2trfq5AODRo0e4d+8e9u/fr75NFEUA2VPxz549Q9myZVGlSpVc7yMnBxEREZVMLJ3vsXjxYvTo0QNLly7NdbsgCDAzM8tzix8DAwMA2eXuo48+wtGjR+Hh4YELFy5gy5YtAP4pYbt374apqWmux8vluc94+K8ytmrVKowfPx7nz5/H5cuXMX36dLi6umLnzp3qY/T0cn+Z//0aOXJy/ZsgCG89T+/eveHo6Ihly5ahY8eOcHR0VB87cuRI9O7d+63nsbKyQmBgoPr53vTvjERERFSy8JzO96hUqRJmzZqF/fv34+bNm+rbHR0dkZycjKysLFStWlX9v61bt+LPP/9UH9enTx9cunQJfn5+sLS0RPPmzQEAtWrVAgBERUXlevzBgwc/aK/Ku3fv4ptvvkGNGjUwbNgwbNmyBd988w2uXr2KmJiYD36/lpaWsLS0xK1bt3LdfvPmTejr6+canezWrRsGDhyI+vXrY/bs2epFRLVq1cKLFy9yva9Xr15h+fLlSElJQZ06dZCUlISnT5+qnysoKOi9i7aIiIio+GPp/A+ffvopPDw8EBISor6tVatWqFOnDqZMmYKrV68iODgYS5cuxcGDB+Hg4KA+zs3NDba2tli3bh0+/vhj9QhjrVq10K5dO8yfPx+nT59GSEgItm7dis2bN+cqdv/FzMwMe/bswYoVKxAcHIyAgAD8+uuvqFatGiwsLAr0fkeMGAFfX1/s2bMHwcHBOHr0KHx8fNCvXz+Ym5vnOlYul2PRokV48uQJtm3bBgAYNWoUTp48CR8fH7x48QJXrlzB7NmzkZSUBCsrKzRr1gwNGzbEjBkzcOfOHdy/fx8zZsx45+grERERlQyc08yHnGn2HAqFAt9//z1WrFiByZMnIy0tDQ4ODvDx8UGLFi1yPbZ3795Yu3atetV6jtWrV2P16tX46quvkJCQgCpVqmDJkiV5Tku/i4ODA9avXw8fHx/s2bMHcrkczZs3x9atWwtc4ry9vWFgYICdO3fim2++gY2NDUaNGoURI0bkeXytWrUwatQo+Pj4oH379vjoo4+wevVqbN68GZs2bUK5cuXg6emJadOmAcguqps3b8bixYvh7e0NIyMjjB49GmFhYQXKS0RERMWDTHzXiXxERERERBrCOU0iIiIi0jpOrxPlg0oQIAiATAbI5TLI39gmqrBEUYRKEAERkCs0+9xERES6gqWTSj1RFKFSidllUv5P4UtIzkBCcgbikzIQl5yBxJTM7P+98ee0DCWUKgFKlQilSoDqX39WCSL0FHIYGihgqK+AoYECBn//f87fTQz1YWFuiPJljGBRxghW5YxhUcYQZc0Moaf4ZzJCEEQIggiFQpZrb1QiIqLigOd0UqmRUy719LKLnCCIiE5IQ2hkMiKiUxARnYzw6BS8iknF65gUZCrf3k+0qJkZ66N8GSNULG8COytTVLI0Q+WKZqhiY45y5tl7uOaMlCrkLKNERKS7WDqpRPp3wczIVCIwLAEBL+PwLDQBgaHxiIhOyZ7WLqZMjfRgb2OOKtbmqGpTBrWrlUcNu7LQU8ghCCJEUYRCwdO2iYhIN7B0UomQM9qnp5BDqRLwNCQeT4Jj1QUzLCoZpeG/dD2FHNUrlUHtquXhVNUCdWtUgFU5YwCAUiXkmq4nIiIqSiydVGzllChBEPEiPAF/PYnE3afRePwiRiemxnVFOTNDOFa1gLODJdzqWsPOygyimHN+KEsoEREVDZZOKjaEv6fC5XIZwqKS8deTSNx7GoX7z6KRkq6UOF3xYWVhDBeninCtbY3GTlYwMtCDUiXwnFAiItIqlk7SaSpBhFwGiCLwKCgGl+9F4OqDCETFpUkdrURQyGWoXa08XJwqonkDW1SxNodKJUDOAkpERBrG0kk6RyUIkMtkEEXg7tMoXLgThmsPXyExJVPqaCWevbU5WjeyQzu3yrA
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"df[\"SmokerStatus\"].value_counts().plot(kind=\"pie\")"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 51,
"outputs": [
{
"data": {
"text/plain": "<AxesSubplot:ylabel='ECigaretteUsage'>"
},
"execution_count": 51,
"metadata": {},
"output_type": "execute_result"
},
{
"data": {
"text/plain": "<Figure size 640x480 with 1 Axes>",
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAsUAAAGFCAYAAAAVe47pAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjQuMywgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/MnkTPAAAACXBIWXMAAA9hAAAPYQGoP6dpAABzg0lEQVR4nO3dd1iV9fsH8PcZ7CmCMmQoIuAAwQVu0TLNvbVcpb+0zNQsMfeqXDlTMzU1y0xzf0tzlHtmbhFFZQrK3us8z+8P5AQCyn4OnPfrurqSM55zAwd4n8+5n/sjE0VRBBERERGRFpNLXQARERERkdQYiomIiIhI6zEUExEREZHWYygmIiIiIq3HUExEREREWo+hmIiIiIi0HkMxEREREWk9hmIiIiIi0noMxURERESk9RiKiYiIiEjrMRQTERERkdZjKCYiIiIircdQTERERERaj6GYiIiIiLQeQzERERERaT2GYiIiIiLSegzFRERERKT1GIqJiIiISOsxFBMRERGR1mMoJiIiIiKtx1BMRERERFqPoZiIiIiItB5DMRERERFpPYZiIiIiItJ6DMVEREREpPUYiomIiIhI6zEUExEREZHWYygmIiIiIq3HUExEREREWo+hmIiIiIi0HkMxEREREWk9hmIiIiIi0noMxURERESk9RiKiYiIiEjrMRQTERERkdZjKCYiIiIircdQTERERERaj6GYiIiIiLQeQzERERERaT2GYiIiIiLSegzFRERERKT1GIqJiIiISOsxFBMRERGR1mMoJiIiIiKtx1BMRERERFpPKXUBREQkHUEQIYgiRBGQywGFvPzWSgRRhCBUzLGJiMobQzERUTWWrRIgk8mgkMsA5ITgpNRMJKVmIj45AwlJOf9OTMlUX56YkvNfcmoWMrNVxXocGQBdHQWMDHRgbKADIwMdGOnrwMRQF2bGujAz0YOFqT4sTPRhZqwLY0Pdl2pkaCYiaTEUExFVcbmrvUrFf6EyISUD4c+SERKZhIjoZIQ9S0b4s2RExaZCJYgSVpvDQE8JWysj2FkZq/9zsDaBraUR9HRz/jSJogiVSoRCIYNMJpO4YiKq7mSiKEr/25GIiIpFFEWohP8CcEJyBgKexOJxRCLCnucE34joZKSmZ0tcaelZmOq/CMpGqFPLBK6ONeBcxxw6SvmLQC9yVZmIyh1DMRGRBhNEEaIgQqGQQ6USEBSegDuPYhAYEof7IXF4HpcmdYmVQqmQw7mOGdydLNCwbk00rlcTJkY5LRjZKiHfKjkRUWkwFBMRaZjckJeZpUJAcCxuPYzG7UcxCAyOQ2a2IHV5GsO6piHcnSzg5mSBJs6WqFPLGDKZDCqVAAVDMhGVEEMxEZHExBdTGhQKOWIT0nHuVgQu34nE7aBoZKv4K7q4TAx10Ny9Nnwa26CZe23o6Si4ikxExcZQTEQkAeHFyW5yuQxPnibi/M0IXLoTiUfhCRJXVj3oKOXwqG8Jn8Y28G1iAzNjPahUAuRynrRHRIVjKCYiqiQqQYBCLodKEHDnUQwu3HqKy3ci8UxL+oKlIpMBLvY10KqRNdp42MKuljFUggi5DAzIRKTGUExEVMFye1zvPYnF0YtPcPF2JFLSsqQuS2vZWBqhfVM7vOnjiFo1DNliQUQAGIqJiCpEbhBOTs3Escsh+PNSMMKeJUtdFr2kUb2a6NLCHu286kBXKX+x+x5Xj4m0EUMxEVE5UgkC5DIZbjx4jiMXgnHpzlOeLFcF6Osq0K6pHd5uUxfOdcy5ekykhRiKiYjKKHdVOC4pHUcvBOP4lRBExaZKXRaVUv065ujW2gkdvetAqZRDBvYeE2kDhmIiolLKDcOBwXHYdSIQV+9FqadKUNVnpK/EG60c0d/PBWZGumytIKrmGIqJiEooNwzfehiNnX/ex62gaKlLogqko5SjSwsHDOrSAJbmBhAEkeGYqBpiKCYiKqbcPtMrdyOx61gg7ofESV0SVSKFXIYO3nUw5A1X2FgaqUfsEVH1wFBMRPQa2SoBCrkM529GYNfxQDyOSJS6JJKQXAa09rDFkDdd4WhtynBMVE0wFBMRFSFblTNJ4q9/QrHn5AOOVKMCWrjXxtCurnCxr8FwTFTFMRQTEb0kt2f4wq0I/HDoLp7GpEhdEmm4Fg1r44M+TVDLwpCTKoiqKIZiIqIXBFGEXCZDUFg8Nu6/hbuPY6UuiaoQpUKOnu3qYlhXN+go5FBwzjFRlcJQTEQEQBBEJCRnYMuhOzj1bxj4m5FKy9xYD+92c8ObrRwhCCLDMVEVwVBMRFotWyVABuC3vx5i94lApGeqpC6JqglnOzN80M8D7k4WHONGVAUwFBORVsoNKdfuP8N3e28iIpp9w1Qx2ja1xdheTWBuosdgTKTBGIqJSOuoBAEJyZlYt+cGLt2JlLoc0gJ6Ogr061Qfg7s0AAC2VBBpIIZiItIauavDJ6+GYOO+W0hJz5a6JNIydW1NMfWdZqhTy4SrxkQahqGYiLSCSiUgJT0ba379Fxdvc3WYpKNUyDH0TVcM8HOBCJGzjYk0BEMxEVVruWPWLtyKwNrdN5CYkil1SUQAAFeHGvj0nWaobWHIVWMiDcBQTETVVrZKQGaWCut+u4lT18KkLoeoAD0dBUZ0d0ev9s7cEY9IYgzFRFTtiKIImUyGf+8/w8pf/kVsYrrUJRG9kkd9S0wZ5g1zEz0GYyKJMBQTUbWiUglQCSK+338LRy4GS10OUbEZ6Cnxf30ao0tLR/ULOyKqPAzFRFRtqFQCYhLTsWDzJTx5mih1OUSl0qmZPT4e1BRyGUe3EVWmEv20+fn5wc/PD8nJyQWu8/f3x/Dhw8utsKrs0qVLcHV1RVhY5fUwSvGYRfnnn39w9epV9ccPHjzA33//XWGPt2bNGvj5+VXY8TVFXFwcdu/erf54+PDh8Pf3L9fHePlr6erqir179wIAMjMzMXHiRHh4eKBt27YQBKFcH7usRFHEzYfR+GT53wzEVKX99U8opqw8hZjEdKg07OeMqDor8UvQ8PBwLFmypCJqoTLw8vLC2bNnYWNjI3UpGDZsGEJCQtQff/DBB7h161aFPd57772HPXv2VNjxNcWSJUtw8OBB9cdr1qzBjBkzKvQxz549i+7duwMAzpw5g6NHj2L16tXYvXs35BrS9ygIOW927T7xAHO/v4DktCyJKyIquydPEzFx+d+4FvBM6lKItEaJ/6rZ29tj165dOH/+fEXUQ6Wkq6sLKysrKBQKqUupdEZGRrCwsJC6jAr3cqeTubk5TExMKvQxraysoK+vDwBITMxZfe3QoYNGvPgCctolslQCvtp2GT/+cQ8Cm8GoGklJy8KCLZfw09EAAP+9ACSiilHiUNyrVy/4+vpixowZhbZR5EpKSsKsWbPg4+ODZs2aYcSIEerVwtDQULi5ueHUqVP57jN9+nQMHToUQM5btUuXLkW7du3g5eWFQYMG4ezZs+rb7t27F2+88QYWLlyIZs2a4cMPPyy0jrxv/xZ2WVpaGmbMmIE2bdqgSZMm6NOnD/7880/1bUVRxPfff4/OnTvD09MTvXv3zrdaBwBXr17FwIED4eHhgV69eiEgIOB1X0b89ddf6NevHzw8PPDGG29g5cqVyMx89fzUrKwsrFq1Cp06dYKnpyf69euHc+fOASjYPpGWloY5c+agVatW8Pb2xowZM/Dpp5/me7t99+7d6NmzJzw8PNC0aVMMGzYs34qun58fFi9ejO7du6NVq1a4fPnya78erq6uAHK+l/7+/vDz80N4eDjWrl2rbq951XOjON+Tl+V9yz8sLAyurq44evQoBg4ciMaNG8PPzw+7du0q8v65z6VffvkFHTt2hKenJyZOnIioqChMnToVXl5eaN++vXo1+vjx43Bzc0N4eHi+4wwePBiLFy8u9DGK+3zO/X/jxo3Rr18//PPPPwBy2pP27duHy5cvq7/Gedsnivp5CAoKwtixY+Hl5YW2bdvi008/xfPnz4v8Wrws92dlzZo16sdyc3PDmjVrAADXrl3DO++8Aw8PD3Ts2BHz5s175e+F8qRSCYiOT8OUladw/ubTSnl
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"df[\"ECigaretteUsage\"].value_counts().plot(kind=\"pie\")"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "markdown",
"source": [
"### Statystyki covidowe"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 52,
"outputs": [
{
"data": {
"text/plain": "<AxesSubplot:ylabel='CovidPos'>"
},
"execution_count": 52,
"metadata": {},
"output_type": "execute_result"
},
{
"data": {
"text/plain": "<Figure size 640x480 with 1 Axes>",
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAx0AAAGFCAYAAACYM56UAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjQuMywgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/MnkTPAAAACXBIWXMAAA9hAAAPYQGoP6dpAABYjUlEQVR4nO3dd3iT9cLG8ft5ku5FC2XvvfceMpUpAh4HIB4OuMW9QBwHPUfEiYIKqJxXRBRUloAgOOCgyFQQEMoqe0MXLR1J3j96GimzlKZPk3w/1+V1YdImd9u0yZ3fMlwul0sAAAAA4CGm1QEAAAAA+DZKBwAAAACPonQAAAAA8ChKBwAAAACPonQAAAAA8ChKBwAAAACPonQAAAAA8ChKBwAAAACPonQAAAAA8ChKBwAAAACPonQAAAAA8ChKBwAAAACPonQAAAAA8ChKBwAAAACPonQAAAAA8ChKBwAAAACPonQAAAAA8ChKBwAAAACPonQAAAAA8ChKBwAAAACPonQAAAAA8ChKBwAAAACPonQAAAAA8ChKBwAAAACPonQAAAAA8ChKBwAAAACPonQAAAAA8ChKBwAAAACPonQAAAAA8ChKBwAAAACPonQAAAAA8ChKBwAAAACPonQAAAAA8ChKBwAAAACPonQAAAAA8ChKBwAAAACPonQAAAAA8ChKBwAAAACPonQAAAAA8ChKBwAAAACPonQAAAAA8ChKBwAAAACPonQAAAAA8ChKBwAAAACPonQAAAAA8Ci71QEAb+BwOOVySYZpyGYaBXrbTqdLTqdLMiSbacgwCvb2AQAArEbpgN/LedFv2gyZ57zgz8xy6HRSuo4lpOrYqTSdSEhTYkq6zpzNUtrZTKWmZyntbJbOZmQpI9OpjCyHMrOcysh0yCXJbjMVYDMVYDdlt5my28/5t81QgN2msBC7ikUEKzoi6H//Bat4sWBFRwQrMixQdttfg5EOZ3bxOfcyAAAAb2C4XC6X1SGAwpKZ5ZTd9tdowrHTqdp9MFH7jiTrREJ2sTiRmKaTiWeVdCbD4rRSWEiAYiKDVbp4qCqUilCFkhGqXCZS5UuGKzgo+z0Dp9Mlp8vFKAkAACiyKB3wWVkOp3tUIC09S/GHErXrYKLiDycp/lCS9h1NVlp6lsUp8y8mMji7iJQKzy4jZSNVvXwxBQbY5HS65JJLNpNREQAAYD1KB3yCy+WSw+mS3WbK6XQp/nCSNu44ri27T2r3wUQdT0izOmKhsJmGqpSNUp0qMapbJUb1q5ZQsYggSblLGAAAQGGidMArXVgyEvV73HFt3nVSW/ec1Jmz3juCUdBKRoeoTpXiqlM5RvWrFVfFUhGSsqdl2SghAACgEFA6vEyXLl0kSfPnz1d4eHiu60aOHKmDBw/q008/tSKax7lcLjld2e/mHziWrDVbjuiP/5WMVEpGnkWGBap5nVJqVa+0mtUuqaBAO6MgAADAo9i9ygsdPHhQr732ml566SWro3ic05ndiU3T0O6DiVq58ZB+2XRIh06csTiZ90o6k6Ef1u3XD+v2y24z1aB6cbWsW1ptGpRR8agQOZxOmQaL0gEAQMGhdHihChUqaObMmerRo4fatm1rdZwCl/OiV5K27z2l/248pF//OOw36zIKU5bDqd+2H9dv249r8pw/VLlMpFrWK622DcqoWvlicjicTMECAADXjNLhhfr27asNGzZo9OjR+uabby6YZiVJCQkJeuedd/TDDz/o9OnTqlu3rh577DG1atXKgsR5k/MCd+vuU1rx2wH9uvmIElLSrY7lV+IPJyn+cJJmLYtT+ZLh6tqiorq1rKhi4UEUEAAAkG+UDi9kGIb+/e9/68Ybb9S4ceP08ssv57re4XBo2LBhyszM1Ouvv66YmBhNmzZNw4cP14wZM9SwYUOLkl/I4XTKZppKOpOuxav2aumavTpyMtXqWJB04FiKPlm4VZ9++6ea1IxVt5YV1bp+GdlMQy5X9pQ3AACAvKB0eKly5crpmWee0QsvvKDu3burffv27utWrlypLVu26JtvvlHNmjUlSWPGjNEff/yhjz/+WO+8845Vsd1yplBt3HFCi1fFa82WI3I42dOgKHI6XVq/7ZjWbzum8JAAXde0vG5oWZHpVwAAIM8oHV7stttu05IlS/Tcc89pwYIF7svj4uIUERHhLhxS9uhI8+bNtXLlSiuiSvprVCMhJV2LV8Vr6eq9OnaadRreJCUtU4t+3qNFP+9RpdIRurFDVXVtXlGGacjGyAcAALgESoeX+9e//qUbb7xRY8eOdV92qV2QXS6X7PbC/5HnvBu++2CiZi3boTVbj7h3pYL32nskWRO/3Kjp325T7/ZVdGP7qgoNtssluTcCAAAAkCgdXq9s2bIaOXKknnvuOVWoUEFlypRRrVq1lJycrLi4OPdoh8vl0vr161W9evVCy5Zz9sOWPSc1c2mcNu08UWj3jcKTkJKuzxZv01c/7FC3FhV1c+fqio0OdY9sAQAAUDp8wC233KLFixdr5cqVKlOmjNq3b686deroiSee0PPPP6/ixYtr+vTpiouL04svvujxPDllY8O2Y5q5LE5x+057/D5hvfQMhxb+vEff/rJHbRqU1S1da7DuAwAASKJ0+IycaVaSZLPZNHXqVI0bN04jRoxQRkaG6tevr//7v/9T48aNPZbB4XDKMAz9vPGQvvw+TnuPJHvsvlB0OV3Sz5sO6edNh1S/anEN7lFb9auVoHwAAODHDNelFgAAeZTlyN6J6vu1+/Tl9zt0+CSnhSO3xjVjNbR3XVUrX0xOp4vtdgEA8DOUDuRbzpz9jTuO66N5mxV/OMnqSCji2jQoo6G966pMiTBJ2buqAQAA30fpwFXLecgcPnFGU+b+ofXbjlmcCN7ENA3d0KqS7uxVR2HBAYx6AADgBygduCoOp0tp6Vn6dNFWLfl1Lwf6Id9Cguy6uXN1DehcXYZhyM56DwAAfBalA3nicDglSfNW7NKsZXE6czbL4kTwFbHFQnT/zQ3Vom5p1nsAAOCjKB24LKfLJdMwtHbrEU2Z+4eOnEy1OhJ8VNsGZXT/3xopIjSQ080BAPAxlA5cksPp1Jm0LH3w9Uat3HjI6jjwA6HBdv29d131bFNZTpeLwwUBAPARlA5cIGdXqmVr9urj+VuUkpZpdST4mdqVo/XIbU1ULjacHa4AAPABlA7k4nC6lJicrre/2KDf445bHQd+zG4z1L9TdQ26obYMQxwsCACAF6N0QJLcC3i/+zVeH83forR0FoqjaChTIkxPDmqmGhWLMeoBAICXonRADodTSWcyNP6L37RhO2duoOixmYYGda+tW7rWYK0HAABeiNIBrf/zqN6csV7JqazdQNHWsHoJPXVHc0WEBjDdCgAAL0Lp8FMOp0uGpGmLtmr2TzvFowDeIjIsUI8NbKrmdUpZHQUAAOQRpcMPORxOJadmaOwna7V1zymr4wD5cmOHqhp2Yz0ZYpE5AABFHaXDz7hcLm3aeUKvT1+nxJQMq+MA16RquSiNvLOFSsaEcqAgAABFGKXDTzicTpmGoRlLtmnWsjg5+anDRwQH2vTYwKZq06AMu1sBAFBEUTr8gMPh1JmzWRr36Vpt2nHC6jhAgTMMaVD32rr9+lpyuVyUDwAAihhKh49zOJw6fPKMXpiySsdPp1kdB/Cojk3L69Hbmsgwxba6AAAUIZQOH+Z0urRl90n9+z+rdeYsh/3BP9SqFK0XhrdWaLBddhaYAwBQJFA6fNgP6/ZpwqzfleXgRwz/Ehsdohfvaq3yJcMZ8QAAoAigdPiYnPnsny3Zpi++2251HMAyIUF2PXlHM7WoU4o1HgAAWIzS4UOcTpdccundmb/rh3X7rY4DWM40pKF96ql/p+pWRwEAwK9ROnyEw+FURpZT/5q6Wpt2skMVcK7butXUHT3rWB0DAAC/RenwAQ6HUylpmXr2g5+170iy1XGAIqlfx2oa3re+1TEAAPBLlA4vl1M4Rr63UgeOpVgdByjSeraprAf+1oizPAAAKGSUDi/mcDiVnJpdOA4ep3AAedGleQU9clsTyZBMigcAAIWC0uG
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"df[\"CovidPos\"].value_counts().plot(kind=\"pie\")"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "markdown",
"source": [
"## Normalizacja część 1 - zamiana na kolumny liczbowe i kategoryczne"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "markdown",
"source": [
"Kolumny zawierające stan zdrowia i podobne cechy opisane w sposób \"poor/fair/good/excellent\" etc. starałem się zamienić na liczbowe w sposób sensowny, rosnący względem pozytywnego aspektu tego czynnika zdrowotnego. Podobnie z tym jak często dana osoba paliła.\n",
"Część kolumn zamieniłem na kategoryczne\n",
"Kolumnę płci zamieniłem na numeryczną w celu późniejszego wykorzystania przez model, choć mialem wątpliwości co do robienia tego pod względem poprawności politycznej\n"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 53,
"outputs": [
{
"data": {
"text/plain": "array(['Female', 'Male'], dtype=object)"
},
"execution_count": 53,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df[\"Sex\"].unique()"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 54,
"outputs": [
{
"data": {
"text/plain": "array(['Very good', 'Excellent', 'Fair', 'Poor', 'Good', nan],\n dtype=object)"
},
"execution_count": 54,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df[\"GeneralHealth\"].unique()"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 55,
"outputs": [],
"source": [
"health_map = {\n",
" \"Excellent\": 5,\n",
" \"Very good\": 4,\n",
" \"Good\": 3,\n",
" \"Fair\": 2,\n",
" \"Poor\": 1\n",
"}"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 56,
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"State:\n",
"['Alabama' 'Alaska' 'Arizona' 'Arkansas' 'California' 'Colorado'\n",
" 'Connecticut' 'Delaware' 'District of Columbia' 'Florida' 'Georgia'\n",
" 'Hawaii' 'Idaho' 'Illinois' 'Indiana' 'Iowa' 'Kansas' 'Kentucky'\n",
" 'Louisiana' 'Maine' 'Maryland' 'Massachusetts' 'Michigan' 'Minnesota'\n",
" 'Mississippi' 'Missouri' 'Montana' 'Nebraska' 'Nevada' 'New Hampshire'\n",
" 'New Jersey' 'New Mexico' 'New York' 'North Carolina' 'North Dakota'\n",
" 'Ohio' 'Oklahoma' 'Oregon' 'Pennsylvania' 'Rhode Island' 'South Carolina'\n",
" 'South Dakota' 'Tennessee' 'Texas' 'Utah' 'Vermont' 'Virginia'\n",
" 'Washington' 'West Virginia' 'Wisconsin' 'Wyoming' 'Guam' 'Puerto Rico'\n",
" 'Virgin Islands']\n",
"Sex:\n",
"['Female' 'Male']\n",
"GeneralHealth:\n",
"['Very good' 'Excellent' 'Fair' 'Poor' 'Good' nan]\n",
"PhysicalHealthDays:\n",
"[ 0. 2. 1. 8. 5. 30. 4. 23. 14. nan 15. 3. 10. 7. 25. 6. 21. 20.\n",
" 29. 16. 9. 27. 28. 12. 13. 11. 26. 17. 24. 19. 18. 22.]\n",
"MentalHealthDays:\n",
"[ 0. 3. 9. 5. 15. 20. 14. 10. 18. 1. nan 2. 30. 4. 6. 7. 25. 8.\n",
" 22. 29. 27. 21. 12. 28. 16. 13. 26. 17. 11. 23. 19. 24.]\n",
"LastCheckupTime:\n",
"['Within past year (anytime less than 12 months ago)' nan\n",
" 'Within past 2 years (1 year but less than 2 years ago)'\n",
" 'Within past 5 years (2 years but less than 5 years ago)'\n",
" '5 or more years ago']\n",
"PhysicalActivities:\n",
"['No' 'Yes' nan]\n",
"SleepHours:\n",
"[ 8. 6. 5. 7. 9. 4. 10. 1. 12. nan 18. 3. 2. 11. 16. 15. 13. 14.\n",
" 20. 23. 17. 24. 22. 19. 21.]\n",
"RemovedTeeth:\n",
"[nan 'None of them' '1 to 5' '6 or more, but not all' 'All']\n",
"HadHeartAttack:\n",
"['No' 'Yes' nan]\n",
"HadAngina:\n",
"['No' 'Yes' nan]\n",
"HadStroke:\n",
"['No' 'Yes' nan]\n",
"HadAsthma:\n",
"['No' 'Yes' nan]\n",
"HadSkinCancer:\n",
"['No' 'Yes' nan]\n",
"HadCOPD:\n",
"['No' 'Yes' nan]\n",
"HadDepressiveDisorder:\n",
"['No' 'Yes' nan]\n",
"HadKidneyDisease:\n",
"['No' 'Yes' nan]\n",
"HadArthritis:\n",
"['No' 'Yes' nan]\n",
"HadDiabetes:\n",
"['Yes' 'No' 'No, pre-diabetes or borderline diabetes' nan\n",
" 'Yes, but only during pregnancy (female)']\n",
"DeafOrHardOfHearing:\n",
"['No' nan 'Yes']\n",
"BlindOrVisionDifficulty:\n",
"['No' 'Yes' nan]\n",
"DifficultyConcentrating:\n",
"['No' nan 'Yes']\n",
"DifficultyWalking:\n",
"['No' 'Yes' nan]\n",
"DifficultyDressingBathing:\n",
"['No' nan 'Yes']\n",
"DifficultyErrands:\n",
"['No' 'Yes' nan]\n",
"SmokerStatus:\n",
"['Never smoked' 'Current smoker - now smokes some days' 'Former smoker'\n",
" nan 'Current smoker - now smokes every day']\n",
"ECigaretteUsage:\n",
"['Not at all (right now)' 'Never used e-cigarettes in my entire life' nan\n",
" 'Use them every day' 'Use them some days']\n",
"ChestScan:\n",
"['No' 'Yes' nan]\n",
"RaceEthnicityCategory:\n",
"['White only, Non-Hispanic' 'Black only, Non-Hispanic'\n",
" 'Other race only, Non-Hispanic' 'Multiracial, Non-Hispanic' nan\n",
" 'Hispanic']\n",
"AgeCategory:\n",
"['Age 80 or older' 'Age 55 to 59' nan 'Age 40 to 44' 'Age 75 to 79'\n",
" 'Age 70 to 74' 'Age 65 to 69' 'Age 60 to 64' 'Age 50 to 54'\n",
" 'Age 45 to 49' 'Age 35 to 39' 'Age 25 to 29' 'Age 30 to 34'\n",
" 'Age 18 to 24']\n",
"HeightInMeters:\n",
"[ nan 1.6 1.57 1.65 1.8 1.63 1.7 1.68 1.73 1.55 1.93 1.88 1.78 1.85\n",
" 1.75 1.52 1.83 1.91 1.96 1.5 1.45 1.42 1.24 1.47 1.22 1.98 2.03 2.01\n",
" 1.3 1.4 1.35 1.82 1.67 1.76 2.11 1.37 1.64 1.71 2.16 2.26 0.91 2.06\n",
" 1.14 1.74 1.51 1.53 1.69 1.56 1.84 1.9 1.54 1.72 1.87 1.61 1.49 1.59\n",
" 1.58 1.62 1.79 1.46 1.89 2.13 0.99 2.08 2.21 1.32 2.18 1.77 2.36 1.25\n",
" 1.66 1.86 1.95 1.19 1.05 1.48 1.03 1.18 1.81 1.38 1.44 1.07 1.27 1.2\n",
" 1.17 1.04 2.24 1.1 1.43 1.92 2.05 1.12 2.41 2.34 0.97 1.06 1.15 2.29\n",
" 1.16 1.09 0.92 2.07 1. 1.08 1.02 1.33 2. 2.02 1.94 0.95]\n",
"WeightInKilograms:\n",
"[ nan 68.04 63.5 53.98 84.82 62.6 73.48 81.65 74.84 59.42\n",
" 85.28 106.59 71.21 64.41 61.23 90.72 65.77 66.22 80.29 86.18\n",
" 47.63 107.05 57.15 105.23 77.11 56.7 79.38 113.4 102.06 59.87\n",
" 104.33 53.52 61.69 136.08 34.47 99.79 127.01 78.93 95.25 58.97\n",
" 92.08 72.57 83.91 49.9 117.93 71.67 102.97 62.14 83.46 54.43\n",
" 94.35 60.78 117.03 65.32 76.66 88.45 89.81 74.39 68.95 79.83\n",
" 108.41 90.26 55.79 91.63 47.17 78.02 50.8 91.17 84.37 145.15\n",
" 93.89 122.47 48.99 73.94 88.9 80.74 81.19 158.76 97.52 51.71\n",
" 82.55 76.2 68.49 75.3 70.31 63.05 60.33 115.67 86.64 108.86\n",
" 92.53 124.74 43.09 58.51 63.96 92.99 44.45 128.82 98.88 45.36\n",
" 110.68 46.72 58.06 73.03 95.71 131.09 78.47 69.4 85.73 67.59\n",
" 103.87 120.2 88. 54.88 111.58 52.16 77.56 126.55 94.8 123.83\n",
" 89.36 75.75 69.85 112.49 82.1 106.14 57.61 70.76 148.78 96.16\n",
" 67.13 48.08 163.29 109.77 100.7 142.88 64.86 111.13 121.11 55.34\n",
" 101.6 93.44 117.48 120.66 66.68 44.91 132. 107.5 107.95 36.29\n",
" 103.42 87.09 83.01 56.25 96.62 134.26 97.07 34.93 99.34 72.12\n",
" 49.44 122.02 98.43 129.73 181.44 52.62 121.56 110.22 48.53 140.61\n",
" 156.49 116.57 87.54 44. 114.31 31.75 97.98 101.15 112.04 100.24\n",
" 113.85 154.22 118.39 133.81 149.69 41.73 119.75 138.35 151.95 129.27\n",
" 131.54 104.78 132.45 102.51 116.12 40.37 105.69 136.98 195.04 53.07\n",
" 132.9 124.28 112.94 114.76 45.81 119.29 167.83 51.26 172.37 162.39\n",
" 46.27 127.91 123.38 38.56 130.63 143.34 115.21 166.92 135.17 109.32\n",
" 135.62 204.12 127.46 118.84 139.25 126.1 122.92 151.5 133.36 42.64\n",
" 50.35 80. 190.51 37.19 147.87 35.38 144.24 149.23 37.65 86.\n",
" 147.42 281. 165.56 162.84 155.58 70. 137.89 189.6 206.38 148.32\n",
" 42.18 153.77 38.1 90. 176.9 191.87 249.48 67. 95. 82.\n",
" 170.1 62. 40.82 53. 139.71 130.18 100. 165.11 64. 43.54\n",
" 24. 134.72 141.52 125.19 75. 60. 34.02 164.65 30.84 250.\n",
" 58. 76. 73. 112. 74. 55. 200. 54. 66. 72.\n",
" 152.41 39.46 220. 41.28 168.28 188.24 59. 46. 265. 238.14\n",
" 168.74 145. 190. 93. 159.66 78. 50. 185.07 91. 104.\n",
" 165. 183.7 33.57 161.93 68. 125.65 134. 130. 32.21 143.79\n",
" 69. 179.17 63. 105. 210.92 65. 32. 292.57 280. 85.\n",
" 174.63 56. 128.37 87. 39.92 83. 169.64 156.04 177. 121.\n",
" 151.05 89. 146.96 146.06 98. 166.47 36.74 171.46 227.25 29.48\n",
" 190.06 161.03 35.83 226.8 175.09 138.8 240.4 158.3 170.55 61.\n",
" 137.44 145.6 141.07 155.13 52. 120. 57. 77. 27.22 25.4\n",
" 240. 96. 47. 115. 41. 45. 170. 150.59 272.16 26.31\n",
" 48. 39.01 236. 92. 197.31 156. 84. 94. 29.03 49.\n",
" 79. 157.85 192.78 255. 108. 185. 222.26 229.97 180. 81.\n",
" 24.95 71. 26. 107. 101. 208.65 140. 175. 111. 110.\n",
" 141.97 22.68 284.86 136.53 210. 103. 185.97 140.16 146.51 24.49\n",
" 25.85 150. 102. 229.52 23.59 125. 163. 38. 135. 176.45\n",
" 185.52 152.86 232.69 124. 192.32 186.88 118. 160.12 160. 193.68\n",
" 201.85 144.7 184.16 142.43 169. 166.01 32.66 180.53 196.41 51.\n",
" 40. 171.91 195.95 33.11 153.31 159.21 164.2 219.99 215.46 182.34\n",
" 30. 160.57 173.27 158. 213.19 276.24 199.58 175.99 235.87 217.72\n",
" 200.03 230.88 146. 24.04 178.72 150.14 157.4 163.75 191.42 174.18\n",
" 28.58 97. 256.28 205.48 161.48 178.26 179.62 205.02 254.01 154.68\n",
" 209.56 201.4 234.96 177.81 200.49 231.79 227.7 273.52 189.15 173.73\n",
" 183.25 167.38 211.83 223.62 228.61 30.39 197.77 184.61 250.38 181.89\n",
" 31.3 290.3 285. 113. 242.67 231.33 180.08 202.76 176. 188.69\n",
" 206.84 164. 156.94 114. 122. 222. 137. 166. 180.98 272.\n",
" 172.82 274.42 234.51 199.13 244.94 203.21 23.13 265.35 198.22 263.08\n",
" 216.82 154. 169.19 239.04 177.35 210.47 224.98 117. 37. 126.\n",
" 273.06 203.66 252.2 238.59 194.59 187.33 221.35 162. 224.53 23.\n",
" 223.17 187.79 212.73 152. 233.6 193.23 205. 229.06 230. 247.21\n",
" 99. 28.12 230.42 175.54 205.93 171. 26.76 212.28 217. 280.32\n",
" 281.68 248.57 195. 42. 258.55 215. 116. 28. 123. 186.43\n",
" 228.16 119. 219.09 214.55 278.96 182.8 138. 217.27 246.3 189. ]\n",
"BMI:\n",
"[ nan 26.57 25.61 ... 13.51 28.39 48.63]\n",
"AlcoholDrinkers:\n",
"['No' 'Yes' nan]\n",
"HIVTesting:\n",
"['No' 'Yes' nan]\n",
"FluVaxLast12:\n",
"['Yes' 'No' nan]\n",
"PneumoVaxEver:\n",
"['No' 'Yes' nan]\n",
"TetanusLast10Tdap:\n",
"['Yes, received tetanus shot but not sure what type'\n",
" 'No, did not receive any tetanus shot in the past 10 years' nan\n",
" 'Yes, received Tdap' 'Yes, received tetanus shot, but not Tdap']\n",
"HighRiskLastYear:\n",
"['No' nan 'Yes']\n",
"CovidPos:\n",
"['No' 'Yes' nan\n",
" 'Tested positive using home test without a health professional']\n"
]
}
],
"source": [
"for col in df:\n",
" print(f\"{col}:\")\n",
" print(df[col].unique())"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 57,
"outputs": [],
"source": [
"from collections import defaultdict\n",
"def normalize_dataset(dataset):\n",
" dataset[\"GeneralHealth\"] = dataset[\"GeneralHealth\"].map(defaultdict(lambda: float('NaN'), health_map), na_action='ignore')\n",
" dataset[\"Sex\"] = dataset[\"Sex\"].map({\"Female\":0,\"Male\":1}).astype(float) #Zamiana z kolumn tekstowych na numeryczne\n",
" dataset.rename(columns ={\"Sex\":\"Male\"},inplace=True)\n",
" dataset[\"State\"] = dataset[\"State\"].astype('category')\n",
" dataset[\"PhysicalHealthDays\"].astype(float)\n",
" dataset[\"MentalHealthDays\"].astype(float)\n",
" dataset[\"LastCheckupTime\"] = dataset[\"LastCheckupTime\"].fillna(\"Unknown\").astype('category') # Potem korzystam z fillna-->median ale nie działa to na kolumnach kategorycznych więc wykonuję to przed konwersją\n",
" dataset[\"PhysicalActivities\"]= dataset[\"PhysicalActivities\"].map({\"No\":0,\"Yes\":1})\n",
" dataset[\"SleepHours\"].astype(float)\n",
" dataset[\"RemovedTeeth\"] = dataset[\"RemovedTeeth\"].map(defaultdict(lambda: float('NaN'), {\"None of them\":0,\"1 to 5\":1, \"6 or more, but not all\":2, \"All\":3}), na_action='ignore')\n",
" dataset[\"HadHeartAttack\"]= dataset[\"HadHeartAttack\"].map({\"No\":0,\"Yes\":1})\n",
" dataset[\"HadAngina\"]= dataset[\"HadAngina\"].map({\"No\":0,\"Yes\":1})\n",
" dataset[\"HadStroke\"]= dataset[\"HadStroke\"].map({\"No\":0,\"Yes\":1})\n",
" dataset[\"HadAsthma\"]= dataset[\"HadAsthma\"].map({\"No\":0,\"Yes\":1})\n",
" dataset[\"HadSkinCancer\"]= dataset[\"HadSkinCancer\"].map({\"No\":0,\"Yes\":1})\n",
" dataset[\"HadCOPD\"]= dataset[\"HadCOPD\"].map({\"No\":0,\"Yes\":1})\n",
" dataset[\"HadDepressiveDisorder\"]= dataset[\"HadDepressiveDisorder\"].map({\"No\":0,\"Yes\":1})\n",
" dataset[\"HadKidneyDisease\"]= dataset[\"HadKidneyDisease\"].map({\"No\":0,\"Yes\":1})\n",
" dataset[\"HadArthritis\"]= dataset[\"HadArthritis\"].map({\"No\":0,\"Yes\":1})\n",
" dataset[\"HadDiabetes\"]= dataset[\"HadDiabetes\"].map({\"No\":0,\"Yes, but only during pregnancy (female)\":1,\"No, pre-diabetes or borderline diabetes\":2,\"Yes\":3})\n",
"\n",
" dataset[\"DeafOrHardOfHearing\"]= dataset[\"DeafOrHardOfHearing\"].map({\"No\":0,\"Yes\":1})\n",
" dataset[\"BlindOrVisionDifficulty\"]= dataset[\"BlindOrVisionDifficulty\"].map({\"No\":0,\"Yes\":1})\n",
" dataset[\"DifficultyConcentrating\"]= dataset[\"DifficultyConcentrating\"].map({\"No\":0,\"Yes\":1})\n",
" dataset[\"DifficultyWalking\"]= dataset[\"DifficultyWalking\"].map({\"No\":0,\"Yes\":1})\n",
" dataset[\"DifficultyDressingBathing\"]= dataset[\"DifficultyDressingBathing\"].map({\"No\":0,\"Yes\":1})\n",
" dataset[\"DifficultyErrands\"]= dataset[\"DifficultyErrands\"].map({\"No\":0,\"Yes\":1})\n",
" dataset[\"SmokerStatus\"]= dataset[\"SmokerStatus\"].map({\"Never smoked\":0,\"Current smoker - now smokes some days\":1,\"Former smoker\":2,\"Current smoker - now smokes every day\":3})\n",
" dataset[\"ECigaretteUsage\"]= dataset[\"ECigaretteUsage\"].map({\"Never used e-cigarettes in my entire life\":0,\"Not at all (right now)\":1,\"Use them some days\":2,\"Use them every day\":3})\n",
" dataset[\"ChestScan\"]= dataset[\"ChestScan\"].map({\"No\":0,\"Yes\":1})\n",
" dataset[\"RaceEthnicityCategory\"] = dataset[\"RaceEthnicityCategory\"].fillna(\"Unknown\").astype('category')\n",
" dataset[\"AgeCategory\"] = dataset[\"AgeCategory\"].fillna(\"Unknown\").astype('category')\n",
" dataset[\"HeightInMeters\"] = dataset[\"HeightInMeters\"].astype(float)\n",
" dataset[\"WeightInKilograms\"] = dataset[\"WeightInKilograms\"].astype(float)\n",
" dataset[\"BMI\"] = dataset[\"BMI\"].astype(float)\n",
" dataset[\"AlcoholDrinkers\"]= dataset[\"AlcoholDrinkers\"].map({\"No\":0,\"Yes\":1})\n",
" dataset[\"HIVTesting\"]= dataset[\"HIVTesting\"].map({\"No\":0,\"Yes\":1})\n",
" dataset[\"FluVaxLast12\"]= dataset[\"FluVaxLast12\"].map({\"No\":0,\"Yes\":1})\n",
" dataset[\"PneumoVaxEver\"]= dataset[\"PneumoVaxEver\"].map({\"No\":0,\"Yes\":1})\n",
" dataset[\"TetanusLast10Tdap\"]= dataset[\"TetanusLast10Tdap\"].apply(lambda x: float('NaN') if type(x)!=str else 1.0 if 'Yes,' in x else 1.0 if 'No,' in x else float('NaN'))\n",
" dataset[\"HighRiskLastYear\"]= dataset[\"HighRiskLastYear\"].map({\"No\":0,\"Yes\":1})\n",
" dataset[\"CovidPos\"]= dataset[\"CovidPos\"].map({\"No\":0,\"Yes\":1})"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "markdown",
"source": [
"### Zbiór test przed zmianą typu danych"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 58,
"outputs": [
{
"data": {
"text/plain": " State Sex GeneralHealth PhysicalHealthDays \\\n339824 South Dakota Female Good 3.0 \n127927 Kansas Female Good 30.0 \n362523 Utah Male Excellent 0.0 \n183687 Michigan Male Good 0.0 \n191905 Michigan Female Very good 0.0 \n\n MentalHealthDays LastCheckupTime \\\n339824 21.0 Within past year (anytime less than 12 months ... \n127927 0.0 Within past year (anytime less than 12 months ... \n362523 0.0 Within past year (anytime less than 12 months ... \n183687 7.0 Within past year (anytime less than 12 months ... \n191905 0.0 Within past year (anytime less than 12 months ... \n\n PhysicalActivities SleepHours RemovedTeeth HadHeartAttack \\\n339824 Yes 8.0 6 or more, but not all No \n127927 Yes 10.0 1 to 5 No \n362523 Yes 7.0 1 to 5 No \n183687 Yes 8.0 None of them No \n191905 Yes 7.0 None of them No \n\n ... HeightInMeters WeightInKilograms BMI AlcoholDrinkers \\\n339824 ... 1.60 52.16 20.37 No \n127927 ... 1.68 97.52 34.70 No \n362523 ... 1.83 113.85 34.04 No \n183687 ... 1.78 83.91 26.54 Yes \n191905 ... 1.57 68.04 27.44 Yes \n\n HIVTesting FluVaxLast12 PneumoVaxEver \\\n339824 Yes Yes Yes \n127927 No Yes Yes \n362523 No No No \n183687 No Yes Yes \n191905 Yes No Yes \n\n TetanusLast10Tdap HighRiskLastYear \\\n339824 Yes, received Tdap No \n127927 NaN No \n362523 Yes, received Tdap No \n183687 Yes, received Tdap No \n191905 Yes, received tetanus shot but not sure what type No \n\n CovidPos \n339824 No \n127927 No \n362523 Yes \n183687 No \n191905 No \n\n[5 rows x 40 columns]",
"text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>State</th>\n <th>Sex</th>\n <th>GeneralHealth</th>\n <th>PhysicalHealthDays</th>\n <th>MentalHealthDays</th>\n <th>LastCheckupTime</th>\n <th>PhysicalActivities</th>\n <th>SleepHours</th>\n <th>RemovedTeeth</th>\n <th>HadHeartAttack</th>\n <th>...</th>\n <th>HeightInMeters</th>\n <th>WeightInKilograms</th>\n <th>BMI</th>\n <th>AlcoholDrinkers</th>\n <th>HIVTesting</th>\n <th>FluVaxLast12</th>\n <th>PneumoVaxEver</th>\n <th>TetanusLast10Tdap</th>\n <th>HighRiskLastYear</th>\n <th>CovidPos</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>339824</th>\n <td>South Dakota</td>\n <td>Female</td>\n <td>Good</td>\n <td>3.0</td>\n <td>21.0</td>\n <td>Within past year (anytime less than 12 months ...</td>\n <td>Yes</td>\n <td>8.0</td>\n <td>6 or more, but not all</td>\n <td>No</td>\n <td>...</td>\n <td>1.60</td>\n <td>52.16</td>\n <td>20.37</td>\n <td>No</td>\n <td>Yes</td>\n <td>Yes</td>\n <td>Yes</td>\n <td>Yes, received Tdap</td>\n <td>No</td>\n <td>No</td>\n </tr>\n <tr>\n <th>127927</th>\n <td>Kansas</td>\n <td>Female</td>\n <td>Good</td>\n <td>30.0</td>\n <td>0.0</td>\n <td>Within past year (anytime less than 12 months ...</td>\n <td>Yes</td>\n <td>10.0</td>\n <td>1 to 5</td>\n <td>No</td>\n <td>...</td>\n <td>1.68</td>\n <td>97.52</td>\n <td>34.70</td>\n <td>No</td>\n <td>No</td>\n <td>Yes</td>\n <td>Yes</td>\n <td>NaN</td>\n <td>No</td>\n <td>No</td>\n </tr>\n <tr>\n <th>362523</th>\n <td>Utah</td>\n <td>Male</td>\n <td>Excellent</td>\n <td>0.0</td>\n <td>0.0</td>\n <td>Within past year (anytime less than 12 months ...</td>\n <td>Yes</td>\n <td>7.0</td>\n <td>1 to 5</td>\n <td>No</td>\n <td>...</td>\n <td>1.83</td>\n <td>113.85</td>\n <td>34.04</td>\n <td>No</td>\n <td>No</td>\n <td>No</td>\n <td>No</td>\n <td>Yes, received Tdap</td>\n <td>No</td>\n <td>Yes</td>\n </tr>\n <tr>\n <th>183687</th>\n <td>Michigan</td>\n <td>Male</td>\n <td>Good</td>\n <td>0.0</td>\n <td>7.0</td>\n <td>Within past year (anytime less than 12 months ...</td>\n <td>Yes</td>\n <td>8.0</td>\n <td>None of them</td>\n <td>No</td>\n <td>...</td>\n <td>1.78</td>\n <td>83.91</td>\n <td>26.54</td>\n <td>Yes</td>\n <td>No</td>\n <td>Yes</td>\n <td>Yes</td>\n <td>Yes, received Tdap</td>\n <td>No</td>\n <td>No</td>\n </tr>\n <tr>\n <th>191905</th>\n <td>Michigan</td>\n <td>Female</td>\n <td>Very good</td>\n <td>0.0</td>\n <td>0.0</td>\n <td>Within past year (anytime less than 12 months ...</td>\n <td>Yes</td>\n <td>7.0</td>\n <td>None of them</td>\n <td>No</td>\n <td>...</td>\n <td>1.57</td>\n <td>68.04</td>\n <td>27.44</td>\n <td>Yes</td>\n <td>Yes</td>\n <td>No</td>\n <td>Yes</td>\n <td>Yes, received tetanus shot but not sure what type</td>\n <td>No</td>\n <td>No</td>\n </tr>\n </tbody>\n</table>\n<p>5 rows × 40 columns</p>\n</div>"
},
"execution_count": 58,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"test.head()"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "markdown",
"source": [
"## Zbiór test po zmianie typu danych"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 59,
"outputs": [
{
"data": {
"text/plain": " State Male GeneralHealth PhysicalHealthDays \\\n339824 South Dakota 0.0 3.0 3.0 \n127927 Kansas 0.0 3.0 30.0 \n362523 Utah 1.0 5.0 0.0 \n183687 Michigan 1.0 3.0 0.0 \n191905 Michigan 0.0 4.0 0.0 \n\n MentalHealthDays LastCheckupTime \\\n339824 21.0 Within past year (anytime less than 12 months ... \n127927 0.0 Within past year (anytime less than 12 months ... \n362523 0.0 Within past year (anytime less than 12 months ... \n183687 7.0 Within past year (anytime less than 12 months ... \n191905 0.0 Within past year (anytime less than 12 months ... \n\n PhysicalActivities SleepHours RemovedTeeth HadHeartAttack ... \\\n339824 1.0 8.0 2.0 0.0 ... \n127927 1.0 10.0 1.0 0.0 ... \n362523 1.0 7.0 1.0 0.0 ... \n183687 1.0 8.0 0.0 0.0 ... \n191905 1.0 7.0 0.0 0.0 ... \n\n HeightInMeters WeightInKilograms BMI AlcoholDrinkers HIVTesting \\\n339824 1.60 52.16 20.37 0.0 1.0 \n127927 1.68 97.52 34.70 0.0 0.0 \n362523 1.83 113.85 34.04 0.0 0.0 \n183687 1.78 83.91 26.54 1.0 0.0 \n191905 1.57 68.04 27.44 1.0 1.0 \n\n FluVaxLast12 PneumoVaxEver TetanusLast10Tdap HighRiskLastYear \\\n339824 1.0 1.0 1.0 0.0 \n127927 1.0 1.0 NaN 0.0 \n362523 0.0 0.0 1.0 0.0 \n183687 1.0 1.0 1.0 0.0 \n191905 0.0 1.0 1.0 0.0 \n\n CovidPos \n339824 0.0 \n127927 0.0 \n362523 1.0 \n183687 0.0 \n191905 0.0 \n\n[5 rows x 40 columns]",
"text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>State</th>\n <th>Male</th>\n <th>GeneralHealth</th>\n <th>PhysicalHealthDays</th>\n <th>MentalHealthDays</th>\n <th>LastCheckupTime</th>\n <th>PhysicalActivities</th>\n <th>SleepHours</th>\n <th>RemovedTeeth</th>\n <th>HadHeartAttack</th>\n <th>...</th>\n <th>HeightInMeters</th>\n <th>WeightInKilograms</th>\n <th>BMI</th>\n <th>AlcoholDrinkers</th>\n <th>HIVTesting</th>\n <th>FluVaxLast12</th>\n <th>PneumoVaxEver</th>\n <th>TetanusLast10Tdap</th>\n <th>HighRiskLastYear</th>\n <th>CovidPos</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>339824</th>\n <td>South Dakota</td>\n <td>0.0</td>\n <td>3.0</td>\n <td>3.0</td>\n <td>21.0</td>\n <td>Within past year (anytime less than 12 months ...</td>\n <td>1.0</td>\n <td>8.0</td>\n <td>2.0</td>\n <td>0.0</td>\n <td>...</td>\n <td>1.60</td>\n <td>52.16</td>\n <td>20.37</td>\n <td>0.0</td>\n <td>1.0</td>\n <td>1.0</td>\n <td>1.0</td>\n <td>1.0</td>\n <td>0.0</td>\n <td>0.0</td>\n </tr>\n <tr>\n <th>127927</th>\n <td>Kansas</td>\n <td>0.0</td>\n <td>3.0</td>\n <td>30.0</td>\n <td>0.0</td>\n <td>Within past year (anytime less than 12 months ...</td>\n <td>1.0</td>\n <td>10.0</td>\n <td>1.0</td>\n <td>0.0</td>\n <td>...</td>\n <td>1.68</td>\n <td>97.52</td>\n <td>34.70</td>\n <td>0.0</td>\n <td>0.0</td>\n <td>1.0</td>\n <td>1.0</td>\n <td>NaN</td>\n <td>0.0</td>\n <td>0.0</td>\n </tr>\n <tr>\n <th>362523</th>\n <td>Utah</td>\n <td>1.0</td>\n <td>5.0</td>\n <td>0.0</td>\n <td>0.0</td>\n <td>Within past year (anytime less than 12 months ...</td>\n <td>1.0</td>\n <td>7.0</td>\n <td>1.0</td>\n <td>0.0</td>\n <td>...</td>\n <td>1.83</td>\n <td>113.85</td>\n <td>34.04</td>\n <td>0.0</td>\n <td>0.0</td>\n <td>0.0</td>\n <td>0.0</td>\n <td>1.0</td>\n <td>0.0</td>\n <td>1.0</td>\n </tr>\n <tr>\n <th>183687</th>\n <td>Michigan</td>\n <td>1.0</td>\n <td>3.0</td>\n <td>0.0</td>\n <td>7.0</td>\n <td>Within past year (anytime less than 12 months ...</td>\n <td>1.0</td>\n <td>8.0</td>\n <td>0.0</td>\n <td>0.0</td>\n <td>...</td>\n <td>1.78</td>\n <td>83.91</td>\n <td>26.54</td>\n <td>1.0</td>\n <td>0.0</td>\n <td>1.0</td>\n <td>1.0</td>\n <td>1.0</td>\n <td>0.0</td>\n <td>0.0</td>\n </tr>\n <tr>\n <th>191905</th>\n <td>Michigan</td>\n <td>0.0</td>\n <td>4.0</td>\n <td>0.0</td>\n <td>0.0</td>\n <td>Within past year (anytime less than 12 months ...</td>\n <td>1.0</td>\n <td>7.0</td>\n <td>0.0</td>\n <td>0.0</td>\n <td>...</td>\n <td>1.57</td>\n <td>68.04</td>\n <td>27.44</td>\n <td>1.0</td>\n <td>1.0</td>\n <td>0.0</td>\n <td>1.0</td>\n <td>1.0</td>\n <td>0.0</td>\n <td>0.0</td>\n </tr>\n </tbody>\n</table>\n<p>5 rows × 40 columns</p>\n</div>"
},
"execution_count": 59,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"normalize_dataset(test)\n",
"test.head()"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 60,
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"<class 'pandas.core.frame.DataFrame'>\n",
"Int64Index: 44513 entries, 339824 to 52161\n",
"Data columns (total 40 columns):\n",
" # Column Non-Null Count Dtype \n",
"--- ------ -------------- ----- \n",
" 0 State 44513 non-null category\n",
" 1 Male 44513 non-null float64 \n",
" 2 GeneralHealth 44393 non-null float64 \n",
" 3 PhysicalHealthDays 43469 non-null float64 \n",
" 4 MentalHealthDays 43622 non-null float64 \n",
" 5 LastCheckupTime 44513 non-null category\n",
" 6 PhysicalActivities 44408 non-null float64 \n",
" 7 SleepHours 44008 non-null float64 \n",
" 8 RemovedTeeth 43413 non-null float64 \n",
" 9 HadHeartAttack 44182 non-null float64 \n",
" 10 HadAngina 44074 non-null float64 \n",
" 11 HadStroke 44368 non-null float64 \n",
" 12 HadAsthma 44339 non-null float64 \n",
" 13 HadSkinCancer 44184 non-null float64 \n",
" 14 HadCOPD 44299 non-null float64 \n",
" 15 HadDepressiveDisorder 44218 non-null float64 \n",
" 16 HadKidneyDisease 44320 non-null float64 \n",
" 17 HadArthritis 44243 non-null float64 \n",
" 18 HadDiabetes 44411 non-null float64 \n",
" 19 DeafOrHardOfHearing 42485 non-null float64 \n",
" 20 BlindOrVisionDifficulty 42387 non-null float64 \n",
" 21 DifficultyConcentrating 42169 non-null float64 \n",
" 22 DifficultyWalking 42172 non-null float64 \n",
" 23 DifficultyDressingBathing 42182 non-null float64 \n",
" 24 DifficultyErrands 41999 non-null float64 \n",
" 25 SmokerStatus 41005 non-null float64 \n",
" 26 ECigaretteUsage 41003 non-null float64 \n",
" 27 ChestScan 38958 non-null float64 \n",
" 28 RaceEthnicityCategory 44513 non-null category\n",
" 29 AgeCategory 44513 non-null category\n",
" 30 HeightInMeters 41714 non-null float64 \n",
" 31 WeightInKilograms 40397 non-null float64 \n",
" 32 BMI 39724 non-null float64 \n",
" 33 AlcoholDrinkers 39956 non-null float64 \n",
" 34 HIVTesting 38018 non-null float64 \n",
" 35 FluVaxLast12 39886 non-null float64 \n",
" 36 PneumoVaxEver 36860 non-null float64 \n",
" 37 TetanusLast10Tdap 36315 non-null float64 \n",
" 38 HighRiskLastYear 39538 non-null float64 \n",
" 39 CovidPos 38114 non-null float64 \n",
"dtypes: category(4), float64(36)\n",
"memory usage: 12.7 MB\n"
]
}
],
"source": [
"test.info()"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 61,
"outputs": [],
"source": [
"normalize_dataset(train)\n",
"normalize_dataset(valid)"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "markdown",
"source": [
"## Statystyki dla zbiorów po zamianie na kolumny numeryczne\n",
"\n",
"*50. centyl to mediana*"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 62,
"outputs": [
{
"data": {
"text/plain": " Male GeneralHealth PhysicalHealthDays MentalHealthDays \\\ncount 676777.000000 674433.000000 655630.000000 660417.000000 \nmean 0.538139 3.055519 6.737547 4.863972 \nstd 0.498544 1.137862 10.713287 9.115863 \nmin 0.000000 1.000000 0.000000 0.000000 \n25% 0.000000 2.000000 0.000000 0.000000 \n50% 1.000000 3.000000 0.000000 0.000000 \n75% 1.000000 4.000000 10.000000 5.000000 \nmax 1.000000 5.000000 30.000000 30.000000 \n\n PhysicalActivities SleepHours RemovedTeeth HadHeartAttack \\\ncount 674814.000000 665609.000000 654359.000000 674355.000000 \nmean 0.690146 7.032336 0.983081 0.505244 \nstd 0.462434 1.726387 1.019679 0.499973 \nmin 0.000000 1.000000 0.000000 0.000000 \n25% 0.000000 6.000000 0.000000 0.000000 \n50% 1.000000 7.000000 1.000000 1.000000 \n75% 1.000000 8.000000 2.000000 1.000000 \nmax 1.000000 24.000000 3.000000 1.000000 \n\n HadAngina HadStroke ... HeightInMeters WeightInKilograms \\\ncount 657726.000000 672927.000000 ... 637313.000000 619546.000000 \nmean 0.264549 0.117193 ... 1.707193 84.657015 \nstd 0.441093 0.321650 ... 0.108002 21.753692 \nmin 0.000000 0.000000 ... 0.910000 22.680000 \n25% 0.000000 0.000000 ... 1.630000 69.400000 \n50% 0.000000 0.000000 ... 1.700000 81.650000 \n75% 1.000000 0.000000 ... 1.780000 96.160000 \nmax 1.000000 1.000000 ... 2.410000 292.570000 \n\n BMI AlcoholDrinkers HIVTesting FluVaxLast12 \\\ncount 611024.000000 606636.000000 572562.000000 605920.000000 \nmean 28.917363 0.456819 0.325787 0.569879 \nstd 6.607455 0.498132 0.468668 0.495093 \nmin 12.050000 0.000000 0.000000 0.000000 \n25% 24.410000 0.000000 0.000000 0.000000 \n50% 27.890000 0.000000 0.000000 1.000000 \n75% 32.260000 1.000000 1.000000 1.000000 \nmax 97.650000 1.000000 1.000000 1.000000 \n\n PneumoVaxEver TetanusLast10Tdap HighRiskLastYear CovidPos \ncount 570114.000000 554467.0 600540.000000 585150.000000 \nmean 0.527672 1.0 0.035087 0.273055 \nstd 0.499234 0.0 0.183999 0.445529 \nmin 0.000000 1.0 0.000000 0.000000 \n25% 0.000000 1.0 0.000000 0.000000 \n50% 1.000000 1.0 0.000000 0.000000 \n75% 1.000000 1.0 0.000000 1.000000 \nmax 1.000000 1.0 1.000000 1.000000 \n\n[8 rows x 36 columns]",
"text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>Male</th>\n <th>GeneralHealth</th>\n <th>PhysicalHealthDays</th>\n <th>MentalHealthDays</th>\n <th>PhysicalActivities</th>\n <th>SleepHours</th>\n <th>RemovedTeeth</th>\n <th>HadHeartAttack</th>\n <th>HadAngina</th>\n <th>HadStroke</th>\n <th>...</th>\n <th>HeightInMeters</th>\n <th>WeightInKilograms</th>\n <th>BMI</th>\n <th>AlcoholDrinkers</th>\n <th>HIVTesting</th>\n <th>FluVaxLast12</th>\n <th>PneumoVaxEver</th>\n <th>TetanusLast10Tdap</th>\n <th>HighRiskLastYear</th>\n <th>CovidPos</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>count</th>\n <td>676777.000000</td>\n <td>674433.000000</td>\n <td>655630.000000</td>\n <td>660417.000000</td>\n <td>674814.000000</td>\n <td>665609.000000</td>\n <td>654359.000000</td>\n <td>674355.000000</td>\n <td>657726.000000</td>\n <td>672927.000000</td>\n <td>...</td>\n <td>637313.000000</td>\n <td>619546.000000</td>\n <td>611024.000000</td>\n <td>606636.000000</td>\n <td>572562.000000</td>\n <td>605920.000000</td>\n <td>570114.000000</td>\n <td>554467.0</td>\n <td>600540.000000</td>\n <td>585150.000000</td>\n </tr>\n <tr>\n <th>mean</th>\n <td>0.538139</td>\n <td>3.055519</td>\n <td>6.737547</td>\n <td>4.863972</td>\n <td>0.690146</td>\n <td>7.032336</td>\n <td>0.983081</td>\n <td>0.505244</td>\n <td>0.264549</td>\n <td>0.117193</td>\n <td>...</td>\n <td>1.707193</td>\n <td>84.657015</td>\n <td>28.917363</td>\n <td>0.456819</td>\n <td>0.325787</td>\n <td>0.569879</td>\n <td>0.527672</td>\n <td>1.0</td>\n <td>0.035087</td>\n <td>0.273055</td>\n </tr>\n <tr>\n <th>std</th>\n <td>0.498544</td>\n <td>1.137862</td>\n <td>10.713287</td>\n <td>9.115863</td>\n <td>0.462434</td>\n <td>1.726387</td>\n <td>1.019679</td>\n <td>0.499973</td>\n <td>0.441093</td>\n <td>0.321650</td>\n <td>...</td>\n <td>0.108002</td>\n <td>21.753692</td>\n <td>6.607455</td>\n <td>0.498132</td>\n <td>0.468668</td>\n <td>0.495093</td>\n <td>0.499234</td>\n <td>0.0</td>\n <td>0.183999</td>\n <td>0.445529</td>\n </tr>\n <tr>\n <th>min</th>\n <td>0.000000</td>\n <td>1.000000</td>\n <td>0.000000</td>\n <td>0.000000</td>\n <td>0.000000</td>\n <td>1.000000</td>\n <td>0.000000</td>\n <td>0.000000</td>\n <td>0.000000</td>\n <td>0.000000</td>\n <td>...</td>\n <td>0.910000</td>\n <td>22.680000</td>\n <td>12.050000</td>\n <td>0.000000</td>\n <td>0.000000</td>\n <td>0.000000</td>\n <td>0.000000</td>\n <td>1.0</td>\n <td>0.000000</td>\n <td>0.000000</td>\n </tr>\n <tr>\n <th>25%</th>\n <td>0.000000</td>\n <td>2.000000</td>\n <td>0.000000</td>\n <td>0.000000</td>\n <td>0.000000</td>\n <td>6.000000</td>\n <td>0.000000</td>\n <td>0.000000</td>\n <td>0.000000</td>\n <td>0.000000</td>\n <td>...</td>\n <td>1.630000</td>\n <td>69.400000</td>\n <td>24.410000</td>\n <td>0.000000</td>\n <td>0.000000</td>\n <td>0.000000</td>\n <td>0.000000</td>\n <td>1.0</td>\n <td>0.000000</td>\n <td>0.000000</td>\n </tr>\n <tr>\n <th>50%</th>\n <td>1.000000</td>\n <td>3.000000</td>\n <td>0.000000</td>\n <td>0.000000</td>\n <td>1.000000</td>\n <td>7.000000</td>\n
},
"execution_count": 62,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"train.describe()"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 63,
"outputs": [
{
"data": {
"text/plain": " Male GeneralHealth PhysicalHealthDays MentalHealthDays \\\ncount 44513.000000 44393.000000 43469.000000 43622.000000 \nmean 0.471593 3.441511 4.275001 4.298221 \nstd 0.499198 1.050924 8.588663 8.299250 \nmin 0.000000 1.000000 0.000000 0.000000 \n25% 0.000000 3.000000 0.000000 0.000000 \n50% 0.000000 3.000000 0.000000 0.000000 \n75% 1.000000 4.000000 3.000000 4.000000 \nmax 1.000000 5.000000 30.000000 30.000000 \n\n PhysicalActivities SleepHours RemovedTeeth HadHeartAttack \\\ncount 44408.000000 44008.000000 43413.000000 44182.000000 \nmean 0.760021 7.036584 0.685302 0.057241 \nstd 0.427075 1.512667 0.884912 0.232304 \nmin 0.000000 1.000000 0.000000 0.000000 \n25% 1.000000 6.000000 0.000000 0.000000 \n50% 1.000000 7.000000 0.000000 0.000000 \n75% 1.000000 8.000000 1.000000 0.000000 \nmax 1.000000 24.000000 3.000000 1.000000 \n\n HadAngina HadStroke ... HeightInMeters WeightInKilograms \\\ncount 44074.000000 44368.000000 ... 41714.000000 40397.000000 \nmean 0.061056 0.042869 ... 1.703194 83.021746 \nstd 0.239436 0.202563 ... 0.107438 21.551394 \nmin 0.000000 0.000000 ... 0.910000 22.680000 \n25% 0.000000 0.000000 ... 1.630000 68.040000 \n50% 0.000000 0.000000 ... 1.700000 80.740000 \n75% 0.000000 0.000000 ... 1.780000 95.250000 \nmax 1.000000 1.000000 ... 2.340000 290.300000 \n\n BMI AlcoholDrinkers HIVTesting FluVaxLast12 \\\ncount 39724.000000 39956.000000 38018.000000 39886.000000 \nmean 28.512326 0.529532 0.342233 0.527002 \nstd 6.596149 0.499133 0.474463 0.499277 \nmin 12.020000 0.000000 0.000000 0.000000 \n25% 24.030000 0.000000 0.000000 0.000000 \n50% 27.410000 1.000000 0.000000 1.000000 \n75% 31.650000 1.000000 1.000000 1.000000 \nmax 97.650000 1.000000 1.000000 1.000000 \n\n PneumoVaxEver TetanusLast10Tdap HighRiskLastYear CovidPos \ncount 36860.000000 36315.0 39538.000000 38114.000000 \nmean 0.413592 1.0 0.043427 0.289500 \nstd 0.492484 0.0 0.203818 0.453536 \nmin 0.000000 1.0 0.000000 0.000000 \n25% 0.000000 1.0 0.000000 0.000000 \n50% 0.000000 1.0 0.000000 0.000000 \n75% 1.000000 1.0 0.000000 1.000000 \nmax 1.000000 1.0 1.000000 1.000000 \n\n[8 rows x 36 columns]",
"text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>Male</th>\n <th>GeneralHealth</th>\n <th>PhysicalHealthDays</th>\n <th>MentalHealthDays</th>\n <th>PhysicalActivities</th>\n <th>SleepHours</th>\n <th>RemovedTeeth</th>\n <th>HadHeartAttack</th>\n <th>HadAngina</th>\n <th>HadStroke</th>\n <th>...</th>\n <th>HeightInMeters</th>\n <th>WeightInKilograms</th>\n <th>BMI</th>\n <th>AlcoholDrinkers</th>\n <th>HIVTesting</th>\n <th>FluVaxLast12</th>\n <th>PneumoVaxEver</th>\n <th>TetanusLast10Tdap</th>\n <th>HighRiskLastYear</th>\n <th>CovidPos</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>count</th>\n <td>44513.000000</td>\n <td>44393.000000</td>\n <td>43469.000000</td>\n <td>43622.000000</td>\n <td>44408.000000</td>\n <td>44008.000000</td>\n <td>43413.000000</td>\n <td>44182.000000</td>\n <td>44074.000000</td>\n <td>44368.000000</td>\n <td>...</td>\n <td>41714.000000</td>\n <td>40397.000000</td>\n <td>39724.000000</td>\n <td>39956.000000</td>\n <td>38018.000000</td>\n <td>39886.000000</td>\n <td>36860.000000</td>\n <td>36315.0</td>\n <td>39538.000000</td>\n <td>38114.000000</td>\n </tr>\n <tr>\n <th>mean</th>\n <td>0.471593</td>\n <td>3.441511</td>\n <td>4.275001</td>\n <td>4.298221</td>\n <td>0.760021</td>\n <td>7.036584</td>\n <td>0.685302</td>\n <td>0.057241</td>\n <td>0.061056</td>\n <td>0.042869</td>\n <td>...</td>\n <td>1.703194</td>\n <td>83.021746</td>\n <td>28.512326</td>\n <td>0.529532</td>\n <td>0.342233</td>\n <td>0.527002</td>\n <td>0.413592</td>\n <td>1.0</td>\n <td>0.043427</td>\n <td>0.289500</td>\n </tr>\n <tr>\n <th>std</th>\n <td>0.499198</td>\n <td>1.050924</td>\n <td>8.588663</td>\n <td>8.299250</td>\n <td>0.427075</td>\n <td>1.512667</td>\n <td>0.884912</td>\n <td>0.232304</td>\n <td>0.239436</td>\n <td>0.202563</td>\n <td>...</td>\n <td>0.107438</td>\n <td>21.551394</td>\n <td>6.596149</td>\n <td>0.499133</td>\n <td>0.474463</td>\n <td>0.499277</td>\n <td>0.492484</td>\n <td>0.0</td>\n <td>0.203818</td>\n <td>0.453536</td>\n </tr>\n <tr>\n <th>min</th>\n <td>0.000000</td>\n <td>1.000000</td>\n <td>0.000000</td>\n <td>0.000000</td>\n <td>0.000000</td>\n <td>1.000000</td>\n <td>0.000000</td>\n <td>0.000000</td>\n <td>0.000000</td>\n <td>0.000000</td>\n <td>...</td>\n <td>0.910000</td>\n <td>22.680000</td>\n <td>12.020000</td>\n <td>0.000000</td>\n <td>0.000000</td>\n <td>0.000000</td>\n <td>0.000000</td>\n <td>1.0</td>\n <td>0.000000</td>\n <td>0.000000</td>\n </tr>\n <tr>\n <th>25%</th>\n <td>0.000000</td>\n <td>3.000000</td>\n <td>0.000000</td>\n <td>0.000000</td>\n <td>1.000000</td>\n <td>6.000000</td>\n <td>0.000000</td>\n <td>0.000000</td>\n <td>0.000000</td>\n <td>0.000000</td>\n <td>...</td>\n <td>1.630000</td>\n <td>68.040000</td>\n <td>24.030000</td>\n <td>0.000000</td>\n <td>0.000000</td>\n <td>0.000000</td>\n <td>0.000000</td>\n <td>1.0</td>\n <td>0.000000</td>\n <td>0.000000</td>\n </tr>\n <tr>\n <th>50%</th>\n <td>0.000000</td>\n <td>3.000000</td>\n <td>0.000000</td>\n <td>0.000000</td>\n <td>1.000000</td>\n <td>7.000000</td>\n <td>0.000000</td>\n
},
"execution_count": 63,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"test.describe()"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 64,
"outputs": [
{
"data": {
"text/plain": " Male GeneralHealth PhysicalHealthDays MentalHealthDays \\\ncount 44514.000000 44388.000000 43458.000000 43578.000000 \nmean 0.469043 3.434554 4.355470 4.379022 \nstd 0.499046 1.051996 8.718506 8.383576 \nmin 0.000000 1.000000 0.000000 0.000000 \n25% 0.000000 3.000000 0.000000 0.000000 \n50% 0.000000 3.000000 0.000000 0.000000 \n75% 1.000000 4.000000 3.000000 5.000000 \nmax 1.000000 5.000000 30.000000 30.000000 \n\n PhysicalActivities SleepHours RemovedTeeth HadHeartAttack \\\ncount 44401.000000 43966.000000 43360.000000 44202.000000 \nmean 0.758361 7.013010 0.684732 0.057396 \nstd 0.428081 1.491967 0.882396 0.232600 \nmin 0.000000 1.000000 0.000000 0.000000 \n25% 1.000000 6.000000 0.000000 0.000000 \n50% 1.000000 7.000000 0.000000 0.000000 \n75% 1.000000 8.000000 1.000000 0.000000 \nmax 1.000000 24.000000 3.000000 1.000000 \n\n HadAngina HadStroke ... HeightInMeters WeightInKilograms \\\ncount 44031.000000 44344.000000 ... 41677.000000 40327.000000 \nmean 0.058936 0.042463 ... 1.702146 82.981070 \nstd 0.235507 0.201646 ... 0.106978 21.512676 \nmin 0.000000 0.000000 ... 0.910000 22.680000 \n25% 0.000000 0.000000 ... 1.630000 68.040000 \n50% 0.000000 0.000000 ... 1.700000 79.830000 \n75% 0.000000 0.000000 ... 1.780000 95.250000 \nmax 1.000000 1.000000 ... 2.360000 263.080000 \n\n BMI AlcoholDrinkers HIVTesting FluVaxLast12 \\\ncount 39626.000000 39950.000000 38041.000000 39885.000000 \nmean 28.521370 0.527910 0.342525 0.525285 \nstd 6.622255 0.499227 0.474560 0.499367 \nmin 12.160000 0.000000 0.000000 0.000000 \n25% 24.030000 0.000000 0.000000 0.000000 \n50% 27.400000 1.000000 0.000000 1.000000 \n75% 31.750000 1.000000 1.000000 1.000000 \nmax 99.640000 1.000000 1.000000 1.000000 \n\n PneumoVaxEver TetanusLast10Tdap HighRiskLastYear CovidPos \ncount 36926.000000 36250.0 39535.000000 38212.000000 \nmean 0.412771 1.0 0.044846 0.291872 \nstd 0.492339 0.0 0.206969 0.454630 \nmin 0.000000 1.0 0.000000 0.000000 \n25% 0.000000 1.0 0.000000 0.000000 \n50% 0.000000 1.0 0.000000 0.000000 \n75% 1.000000 1.0 0.000000 1.000000 \nmax 1.000000 1.0 1.000000 1.000000 \n\n[8 rows x 36 columns]",
"text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>Male</th>\n <th>GeneralHealth</th>\n <th>PhysicalHealthDays</th>\n <th>MentalHealthDays</th>\n <th>PhysicalActivities</th>\n <th>SleepHours</th>\n <th>RemovedTeeth</th>\n <th>HadHeartAttack</th>\n <th>HadAngina</th>\n <th>HadStroke</th>\n <th>...</th>\n <th>HeightInMeters</th>\n <th>WeightInKilograms</th>\n <th>BMI</th>\n <th>AlcoholDrinkers</th>\n <th>HIVTesting</th>\n <th>FluVaxLast12</th>\n <th>PneumoVaxEver</th>\n <th>TetanusLast10Tdap</th>\n <th>HighRiskLastYear</th>\n <th>CovidPos</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>count</th>\n <td>44514.000000</td>\n <td>44388.000000</td>\n <td>43458.000000</td>\n <td>43578.000000</td>\n <td>44401.000000</td>\n <td>43966.000000</td>\n <td>43360.000000</td>\n <td>44202.000000</td>\n <td>44031.000000</td>\n <td>44344.000000</td>\n <td>...</td>\n <td>41677.000000</td>\n <td>40327.000000</td>\n <td>39626.000000</td>\n <td>39950.000000</td>\n <td>38041.000000</td>\n <td>39885.000000</td>\n <td>36926.000000</td>\n <td>36250.0</td>\n <td>39535.000000</td>\n <td>38212.000000</td>\n </tr>\n <tr>\n <th>mean</th>\n <td>0.469043</td>\n <td>3.434554</td>\n <td>4.355470</td>\n <td>4.379022</td>\n <td>0.758361</td>\n <td>7.013010</td>\n <td>0.684732</td>\n <td>0.057396</td>\n <td>0.058936</td>\n <td>0.042463</td>\n <td>...</td>\n <td>1.702146</td>\n <td>82.981070</td>\n <td>28.521370</td>\n <td>0.527910</td>\n <td>0.342525</td>\n <td>0.525285</td>\n <td>0.412771</td>\n <td>1.0</td>\n <td>0.044846</td>\n <td>0.291872</td>\n </tr>\n <tr>\n <th>std</th>\n <td>0.499046</td>\n <td>1.051996</td>\n <td>8.718506</td>\n <td>8.383576</td>\n <td>0.428081</td>\n <td>1.491967</td>\n <td>0.882396</td>\n <td>0.232600</td>\n <td>0.235507</td>\n <td>0.201646</td>\n <td>...</td>\n <td>0.106978</td>\n <td>21.512676</td>\n <td>6.622255</td>\n <td>0.499227</td>\n <td>0.474560</td>\n <td>0.499367</td>\n <td>0.492339</td>\n <td>0.0</td>\n <td>0.206969</td>\n <td>0.454630</td>\n </tr>\n <tr>\n <th>min</th>\n <td>0.000000</td>\n <td>1.000000</td>\n <td>0.000000</td>\n <td>0.000000</td>\n <td>0.000000</td>\n <td>1.000000</td>\n <td>0.000000</td>\n <td>0.000000</td>\n <td>0.000000</td>\n <td>0.000000</td>\n <td>...</td>\n <td>0.910000</td>\n <td>22.680000</td>\n <td>12.160000</td>\n <td>0.000000</td>\n <td>0.000000</td>\n <td>0.000000</td>\n <td>0.000000</td>\n <td>1.0</td>\n <td>0.000000</td>\n <td>0.000000</td>\n </tr>\n <tr>\n <th>25%</th>\n <td>0.000000</td>\n <td>3.000000</td>\n <td>0.000000</td>\n <td>0.000000</td>\n <td>1.000000</td>\n <td>6.000000</td>\n <td>0.000000</td>\n <td>0.000000</td>\n <td>0.000000</td>\n <td>0.000000</td>\n <td>...</td>\n <td>1.630000</td>\n <td>68.040000</td>\n <td>24.030000</td>\n <td>0.000000</td>\n <td>0.000000</td>\n <td>0.000000</td>\n <td>0.000000</td>\n <td>1.0</td>\n <td>0.000000</td>\n <td>0.000000</td>\n </tr>\n <tr>\n <th>50%</th>\n <td>0.000000</td>\n <td>3.000000</td>\n <td>0.000000</td>\n <td>0.000000</td>\n <td>1.000000</td>\n <td>7.000000</td>\n <td>0.000000</td>\n
},
"execution_count": 64,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"valid.describe()"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "markdown",
"source": [
"#### Wydaje się być korelacja między masą ciała i zawałem:"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 65,
"outputs": [
{
"data": {
"text/plain": "<Figure size 729.847x600 with 1 Axes>",
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAsMAAAJICAYAAACE++lZAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjQuMywgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/MnkTPAAAACXBIWXMAAA9hAAAPYQGoP6dpAABSxklEQVR4nO3dd3xUVf7/8fdMkkkCCRAiJHQUpQgkJEKE/SFCQOBLcaW4ooKAgCAlAlKXjtJWQEpEQFBAQIoUZcUFEUXWlRJE3BVQ0YD00JKQkD7z+4Nl1jGUlEkyk/t6Ph557OTek3M/d46zvHNy7r0mm81mEwAAAGBA5qIuAAAAACgqhGEAAAAYFmEYAAAAhkUYBgAAgGERhgEAAGBYhGEAAAAYFmEYAAAAhkUYBgAAgGERhgEAAGBYLhWGlyxZoh49ejhs2717t7p06aKwsDBFRkZq1qxZSk1Nte9PS0vTlClT1KRJE4WFhenVV1/V1atXC7t0AAAAuCHPoi7gljVr1mjevHlq2LChfVtMTIwGDx6sqKgotW3bVqdOndLEiRMVHx+vGTNmSJImT56smJgYLVy4UBaLRZMmTVJUVJRWr16d51ouXbqe7/MBAABFp1w5/6IuAW6iyGeGL168qAEDBmj27NmqXr26w75169bp0Ucf1YABA1S9enU9/vjjGjZsmLZt26b09HRdvHhRW7du1fjx49WwYUOFhIRo7ty5OnjwoA4fPlw0JwQAAAC3UeRh+IcffpCXl5c+/vhjhYaGOux78cUXNXr0aIdtZrNZGRkZSkpK0qFDhyRJjRs3tu+///77FRQUpIMHDxZ88QAAAHBrRb5MIjIyUpGRkbfd9/DDDzt8n5GRoRUrVqhevXoqW7asLl68qICAAHl7ezu0K1++vC5cuJDnmsxmk8xmU55/HgAAAO6hyMNwTmVmZmrUqFH6+eeftWbNGklSSkqKLBZLtrbe3t5KS0vL87HKli0pk4kwDAAAUNy5RRhOSkrS0KFDdeDAAUVHRyskJESS5OPjo/T09Gzt09LS5Ovrm+fjXb2azMwwAABuLCCgZFGXADfh8mE4Li5O/fr109mzZ7V8+XI1atTIvi84OFjx8fFKT093mCGOi4tTUFBQno9ptdpktdryVTcAAABcX5FfQHc3CQkJ6tmzp65evao1a9Y4BGFJeuSRR2S1Wu0X0klSbGysLl68mK0tAAAA8EcuPTM8Y8YMnT59WsuWLVPZsmV16dIl+76yZcsqKChI7du31/jx4zV9+nT5+vpq0qRJioiIUIMGDYqucAAAALgFlw3DWVlZ2r59uzIyMtSzZ89s+z///HNVrlxZr732mqZPn67BgwdLkpo1a6bx48cXdrkAAABwQyabzcbi2D/gCXQAALg3nkCHnHLpNcMAAABAQSIMAwAAwLAIwwAAADAswjAAAAAMizAMAAAAwyIMAwAAwLAIwwAAADAswjAAAAAMizAMAAAAwyIMAwAAwLAIwwAAADAswjAAAAAMy7OoC4B07txZrVjxjiSpV69+qlixUhFXBAAAYAzMDLuAlSuX6/vvv9P333+nVauWF3U5AAAAhkEYdgFnz562vz5z5vRdWgIAAMCZCMMAAAAwLMIwAAAADIsL6HBPXOAHAACKK2aGcU9c4AcAAIorwjDuiQv8AABAccUyCeAPWBYCAIBxMDMM/AHLQgAAMA7CMPAHLAsBAMA4CMMAAAAwLMIwAAAADIswDAAAAMMiDAMAAMCwCMMAAAAwLMIwAAAADIswDAAAAMMiDAMAAMCwCMMAAAAwLMIwAAAADMuzqAsAkDvnzp3VihXvSJJ69eqnihUrFXFFAAC4L2aGATezcuVyff/9d/r++++0atXyoi4HAAC3RhgG3MzZs6ftr8+cOX2XlgAA4F4IwwAAADAswjAAAAAMizAMAAAAwyIMAwAAwLAIwwAAADAswjAAAAAMizAMAAAAwyIMAwAAwLB4HDMAl8bjpwEABYmZYQAujcdPAwAKEmEYgEvj8dMAgILEMgkAyAeWcQCAe2NmGADygWUcAODeCMMAkA8s4wAA90YYBgAAgGERhgEAAGBYhGEAAAAYFmEYAAAAhkUYBgAAgGERhgEAAGBYhGEAAAAYFmEYAAAAhkUYBgAAgGERhgEAAGBYhGEAAAAYFmEYAAAAhkUYBgAAgGERhgEAAGBYhGEAAAAYlmdRFwAAgDOcO3dWK1a8I0nq1aufKlasVMQVAXAHzAwDAIqFlSuX6/vvv9P333+nVauWF3U5ANwEYRgAUCycPXva/vrMmdN3aQkA/0MYBgAAgGERhgEAAGBYhGEAAAAYFneTAAAATsfdPeAumBkGAABOx9094C5cKgwvWbJEPXr0cNh27Ngxde/eXQ0aNFBkZKRWrVrlsN9qtWrBggV67LHH1KBBA/Xr10+nT3MVMQAARYm7e8BduEwYXrNmjebNm+ew7dq1a+rdu7eqVq2qTZs2adCgQZo9e7Y2bdpkb7No0SKtXbtWr732mtatWyer1aq+ffsqPT29kM8AAAAA7qbI1wxfvHhRkyZN0v79+1W9enWHfRs2bJCXl5emTp0qT09P1ahRQ6dOndLSpUvVpUsXpaen691339WIESPUvHlzSdKbb76pxx57TDt37lSHDh0K/4QAAADgNop8ZviHH36Ql5eXPv74Y4WGhjrsi4mJUUREhDw9/5fZGzdurJMnT+ry5cs6fvy4kpOT1aRJE/v+UqVK6eGHH9bBgwcL7RwAAADgnop8ZjgyMlKRkZG33XfhwgXVrFnTYVv58uUlSefPn9eFCxckSRUqVMjW5tY+AIBzcHcAAMVRkYfhu0lNTZXFYnHY5u3tLUlKS0tTSkqKJN22TUJCQp6PazabZDab8vzzuWUymRxee3oW+YS9A1evz9lc/XxdvT5nc/XzdfX6nGnVqpt3B5Ck999/V+PGTSragv7ASGPhDhgPuAuXDsM+Pj7ZLoRLS0uTJJUoUUI+Pj6SpPT0dPvrW218fX3zfNyyZUs6fIgL2u+Dt9lsUkBAyUI7dk64en3O5urn6+r1OZurn6+r1+dM58+ftb8+d+6My52rkcbCHTAecBcuHYaDg4MVFxfnsO3W90FBQcrMzLRvq1q1qkObWrVq5fm4V68mF+rMsNVqc3h97VpyoR07J1y9Pmdz9fN19fqczdXP19XrcyZXP1dXr89oino8CN/IKZcOw40aNdK6deuUlZUlDw8PSdK+fft0//33KzAwUP7+/vLz89P+/fvtYTgxMVFHjx5V9+7d83xcq9Xm8CEuaDabzeF1Zqa10I6dE65en7O5+vm6en3O5urn6+r1OZOrn6ur12c0jAfchUsv4OnSpYuSkpI0btw4nThxQps3b9aKFSvUv39/STfXCnfv3l2zZ8/W559/ruPHj2vYsGEKDg5W69ati7h6AAAAuDqXnhkODAzUsmXLNG3aNHXq1EnlypXTqFGj1KlTJ3ubqKgoZWZmavz48UpNTVWjRo20fPlyeXl5FWHlAAAAcAcuFYZnzpyZbVtISIjWr19/x5/x8PDQyJEjNXLkyIIsDQAAAMWQSy+TAAAAAAoSYRgAAACGRRgGAACAYRGGAQAAYFiEYQAAABgWYRgAAACGRRgGAACAYRGGAQAAYFiEYQAAABgWYRgAAACGRRgGAACAYRGGAQAAYFiEYQAAABgWYRgAAACGRRgGAACAYRGGAQAAYFieRV0AAADIvXPnzmrFinckSb169VPFipWKuCLAPTEzDACAG1q5crm+//47ff/9d1q1anlRlwO4LcIwAABu6OzZ0/bXZ86cvktLAHdDGAYAAIBhEYYBAABgWIRhAAAAGBZhGAAAAIZFGAYAAIBhEYYBAABgWIRhAAAAGBZhGAAAAIZFGAYAAIBhEYYBAABgWIRhAAAAGBZhGAAAAIZFGAYAAIBhEYYBAABgWIRhAAAAGBZhGAAAAIZFGAYAAIBhEYYBAABgWIRhAAAAGBZhGAAAAIZFGAYAAIBhEYYBAABgWIRhAAAAGBZhGAAAAIblWdQFoOCYTJLJZHJCPyaH12Zz/vuUJJvNJpvNKV0BAAD
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"import seaborn as sns\n",
"sns.set_theme()\n",
"g = sns.catplot(\n",
" data=train, kind=\"bar\",\n",
" x=\"GeneralHealth\", y=\"WeightInKilograms\", hue=\"HadHeartAttack\",\n",
" errorbar=\"sd\", palette=\"dark\", alpha=.6, height=6\n",
")\n",
"g.despine(left=True)\n",
"g.set_axis_labels(\"General health index\", \"Body mass (kg)\")\n",
"g.legend.set_title(\"Had heart attack\")"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "markdown",
"source": [
"### Osoby palące częsciej miały zawał:"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 66,
"outputs": [
{
"data": {
"text/plain": " SmokerStatus HadHeartAttack\n0 0.0 0.037883\n1 1.0 0.072598\n2 2.0 0.088887\n3 3.0 0.090192",
"text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>SmokerStatus</th>\n <th>HadHeartAttack</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>0</th>\n <td>0.0</td>\n <td>0.037883</td>\n </tr>\n <tr>\n <th>1</th>\n <td>1.0</td>\n <td>0.072598</td>\n </tr>\n <tr>\n <th>2</th>\n <td>2.0</td>\n <td>0.088887</td>\n </tr>\n <tr>\n <th>3</th>\n <td>3.0</td>\n <td>0.090192</td>\n </tr>\n </tbody>\n</table>\n</div>"
},
"execution_count": 66,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"valid.groupby('SmokerStatus', as_index=False)['HadHeartAttack'].mean()"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "markdown",
"source": [
"### Osoby z gorszym wskaźnikiem \"GeneralHealth\" w tym zbiorze danych częściej miały zawał:"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 67,
"outputs": [
{
"data": {
"text/plain": " GeneralHealth HadHeartAttack\n0 1.0 0.228411\n1 2.0 0.129270\n2 3.0 0.056693\n3 4.0 0.027336\n4 5.0 0.014743",
"text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>GeneralHealth</th>\n <th>HadHeartAttack</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>0</th>\n <td>1.0</td>\n <td>0.228411</td>\n </tr>\n <tr>\n <th>1</th>\n <td>2.0</td>\n <td>0.129270</td>\n </tr>\n <tr>\n <th>2</th>\n <td>3.0</td>\n <td>0.056693</td>\n </tr>\n <tr>\n <th>3</th>\n <td>4.0</td>\n <td>0.027336</td>\n </tr>\n <tr>\n <th>4</th>\n <td>5.0</td>\n <td>0.014743</td>\n </tr>\n </tbody>\n</table>\n</div>"
},
"execution_count": 67,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"valid.groupby('GeneralHealth', as_index=False)['HadHeartAttack'].mean()"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 68,
"outputs": [
{
"data": {
"text/plain": "SmokerStatus 0.0 1.0 2.0 3.0\nGeneralHealth \n1.0 0.194640 0.310680 0.257100 0.222222\n2.0 0.090772 0.146429 0.184443 0.155059\n3.0 0.039989 0.031068 0.091645 0.060469\n4.0 0.021611 0.032070 0.035265 0.048292\n5.0 0.011078 0.012579 0.026298 0.018315",
"text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th>SmokerStatus</th>\n <th>0.0</th>\n <th>1.0</th>\n <th>2.0</th>\n <th>3.0</th>\n </tr>\n <tr>\n <th>GeneralHealth</th>\n <th></th>\n <th></th>\n <th></th>\n <th></th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>1.0</th>\n <td>0.194640</td>\n <td>0.310680</td>\n <td>0.257100</td>\n <td>0.222222</td>\n </tr>\n <tr>\n <th>2.0</th>\n <td>0.090772</td>\n <td>0.146429</td>\n <td>0.184443</td>\n <td>0.155059</td>\n </tr>\n <tr>\n <th>3.0</th>\n <td>0.039989</td>\n <td>0.031068</td>\n <td>0.091645</td>\n <td>0.060469</td>\n </tr>\n <tr>\n <th>4.0</th>\n <td>0.021611</td>\n <td>0.032070</td>\n <td>0.035265</td>\n <td>0.048292</td>\n </tr>\n <tr>\n <th>5.0</th>\n <td>0.011078</td>\n <td>0.012579</td>\n <td>0.026298</td>\n <td>0.018315</td>\n </tr>\n </tbody>\n</table>\n</div>"
},
"execution_count": 68,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"valid.pivot_table('HadHeartAttack',index='GeneralHealth', columns='SmokerStatus')"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "markdown",
"source": [
"## Normalizacja część 2 - Skalowanie kolumn numerycznych do 0-1"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 69,
"outputs": [],
"source": [
"from sklearn.preprocessing import MinMaxScaler\n",
"scaler = MinMaxScaler()\n",
"def scale_float_columns(dataset):\n",
" numerical_columns = list(dataset.select_dtypes(include=['float64']).columns)\n",
" dataset[numerical_columns] = scaler.fit_transform(dataset[numerical_columns])"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 70,
"outputs": [
{
"data": {
"text/plain": " State Male GeneralHealth PhysicalHealthDays \\\n339824 South Dakota 0.0 3.0 3.0 \n127927 Kansas 0.0 3.0 30.0 \n362523 Utah 1.0 5.0 0.0 \n183687 Michigan 1.0 3.0 0.0 \n191905 Michigan 0.0 4.0 0.0 \n\n MentalHealthDays LastCheckupTime \\\n339824 21.0 Within past year (anytime less than 12 months ... \n127927 0.0 Within past year (anytime less than 12 months ... \n362523 0.0 Within past year (anytime less than 12 months ... \n183687 7.0 Within past year (anytime less than 12 months ... \n191905 0.0 Within past year (anytime less than 12 months ... \n\n PhysicalActivities SleepHours RemovedTeeth HadHeartAttack ... \\\n339824 1.0 8.0 2.0 0.0 ... \n127927 1.0 10.0 1.0 0.0 ... \n362523 1.0 7.0 1.0 0.0 ... \n183687 1.0 8.0 0.0 0.0 ... \n191905 1.0 7.0 0.0 0.0 ... \n\n HeightInMeters WeightInKilograms BMI AlcoholDrinkers HIVTesting \\\n339824 1.60 52.16 20.37 0.0 1.0 \n127927 1.68 97.52 34.70 0.0 0.0 \n362523 1.83 113.85 34.04 0.0 0.0 \n183687 1.78 83.91 26.54 1.0 0.0 \n191905 1.57 68.04 27.44 1.0 1.0 \n\n FluVaxLast12 PneumoVaxEver TetanusLast10Tdap HighRiskLastYear \\\n339824 1.0 1.0 1.0 0.0 \n127927 1.0 1.0 NaN 0.0 \n362523 0.0 0.0 1.0 0.0 \n183687 1.0 1.0 1.0 0.0 \n191905 0.0 1.0 1.0 0.0 \n\n CovidPos \n339824 0.0 \n127927 0.0 \n362523 1.0 \n183687 0.0 \n191905 0.0 \n\n[5 rows x 40 columns]",
"text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>State</th>\n <th>Male</th>\n <th>GeneralHealth</th>\n <th>PhysicalHealthDays</th>\n <th>MentalHealthDays</th>\n <th>LastCheckupTime</th>\n <th>PhysicalActivities</th>\n <th>SleepHours</th>\n <th>RemovedTeeth</th>\n <th>HadHeartAttack</th>\n <th>...</th>\n <th>HeightInMeters</th>\n <th>WeightInKilograms</th>\n <th>BMI</th>\n <th>AlcoholDrinkers</th>\n <th>HIVTesting</th>\n <th>FluVaxLast12</th>\n <th>PneumoVaxEver</th>\n <th>TetanusLast10Tdap</th>\n <th>HighRiskLastYear</th>\n <th>CovidPos</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>339824</th>\n <td>South Dakota</td>\n <td>0.0</td>\n <td>3.0</td>\n <td>3.0</td>\n <td>21.0</td>\n <td>Within past year (anytime less than 12 months ...</td>\n <td>1.0</td>\n <td>8.0</td>\n <td>2.0</td>\n <td>0.0</td>\n <td>...</td>\n <td>1.60</td>\n <td>52.16</td>\n <td>20.37</td>\n <td>0.0</td>\n <td>1.0</td>\n <td>1.0</td>\n <td>1.0</td>\n <td>1.0</td>\n <td>0.0</td>\n <td>0.0</td>\n </tr>\n <tr>\n <th>127927</th>\n <td>Kansas</td>\n <td>0.0</td>\n <td>3.0</td>\n <td>30.0</td>\n <td>0.0</td>\n <td>Within past year (anytime less than 12 months ...</td>\n <td>1.0</td>\n <td>10.0</td>\n <td>1.0</td>\n <td>0.0</td>\n <td>...</td>\n <td>1.68</td>\n <td>97.52</td>\n <td>34.70</td>\n <td>0.0</td>\n <td>0.0</td>\n <td>1.0</td>\n <td>1.0</td>\n <td>NaN</td>\n <td>0.0</td>\n <td>0.0</td>\n </tr>\n <tr>\n <th>362523</th>\n <td>Utah</td>\n <td>1.0</td>\n <td>5.0</td>\n <td>0.0</td>\n <td>0.0</td>\n <td>Within past year (anytime less than 12 months ...</td>\n <td>1.0</td>\n <td>7.0</td>\n <td>1.0</td>\n <td>0.0</td>\n <td>...</td>\n <td>1.83</td>\n <td>113.85</td>\n <td>34.04</td>\n <td>0.0</td>\n <td>0.0</td>\n <td>0.0</td>\n <td>0.0</td>\n <td>1.0</td>\n <td>0.0</td>\n <td>1.0</td>\n </tr>\n <tr>\n <th>183687</th>\n <td>Michigan</td>\n <td>1.0</td>\n <td>3.0</td>\n <td>0.0</td>\n <td>7.0</td>\n <td>Within past year (anytime less than 12 months ...</td>\n <td>1.0</td>\n <td>8.0</td>\n <td>0.0</td>\n <td>0.0</td>\n <td>...</td>\n <td>1.78</td>\n <td>83.91</td>\n <td>26.54</td>\n <td>1.0</td>\n <td>0.0</td>\n <td>1.0</td>\n <td>1.0</td>\n <td>1.0</td>\n <td>0.0</td>\n <td>0.0</td>\n </tr>\n <tr>\n <th>191905</th>\n <td>Michigan</td>\n <td>0.0</td>\n <td>4.0</td>\n <td>0.0</td>\n <td>0.0</td>\n <td>Within past year (anytime less than 12 months ...</td>\n <td>1.0</td>\n <td>7.0</td>\n <td>0.0</td>\n <td>0.0</td>\n <td>...</td>\n <td>1.57</td>\n <td>68.04</td>\n <td>27.44</td>\n <td>1.0</td>\n <td>1.0</td>\n <td>0.0</td>\n <td>1.0</td>\n <td>1.0</td>\n <td>0.0</td>\n <td>0.0</td>\n </tr>\n </tbody>\n</table>\n<p>5 rows × 40 columns</p>\n</div>"
},
"execution_count": 70,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"test.head()"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 71,
"outputs": [
{
"data": {
"text/plain": " State Male GeneralHealth PhysicalHealthDays \\\n339824 South Dakota 0.0 0.50 0.1 \n127927 Kansas 0.0 0.50 1.0 \n362523 Utah 1.0 1.00 0.0 \n183687 Michigan 1.0 0.50 0.0 \n191905 Michigan 0.0 0.75 0.0 \n\n MentalHealthDays LastCheckupTime \\\n339824 0.700000 Within past year (anytime less than 12 months ... \n127927 0.000000 Within past year (anytime less than 12 months ... \n362523 0.000000 Within past year (anytime less than 12 months ... \n183687 0.233333 Within past year (anytime less than 12 months ... \n191905 0.000000 Within past year (anytime less than 12 months ... \n\n PhysicalActivities SleepHours RemovedTeeth HadHeartAttack ... \\\n339824 1.0 0.304348 0.666667 0.0 ... \n127927 1.0 0.391304 0.333333 0.0 ... \n362523 1.0 0.260870 0.333333 0.0 ... \n183687 1.0 0.304348 0.000000 0.0 ... \n191905 1.0 0.260870 0.000000 0.0 ... \n\n HeightInMeters WeightInKilograms BMI AlcoholDrinkers \\\n339824 0.482517 0.110156 0.097513 0.0 \n127927 0.538462 0.279650 0.264860 0.0 \n362523 0.643357 0.340670 0.257153 0.0 \n183687 0.608392 0.228795 0.169567 1.0 \n191905 0.461538 0.169494 0.180077 1.0 \n\n HIVTesting FluVaxLast12 PneumoVaxEver TetanusLast10Tdap \\\n339824 1.0 1.0 1.0 0.0 \n127927 0.0 1.0 1.0 NaN \n362523 0.0 0.0 0.0 0.0 \n183687 0.0 1.0 1.0 0.0 \n191905 1.0 0.0 1.0 0.0 \n\n HighRiskLastYear CovidPos \n339824 0.0 0.0 \n127927 0.0 0.0 \n362523 0.0 1.0 \n183687 0.0 0.0 \n191905 0.0 0.0 \n\n[5 rows x 40 columns]",
"text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>State</th>\n <th>Male</th>\n <th>GeneralHealth</th>\n <th>PhysicalHealthDays</th>\n <th>MentalHealthDays</th>\n <th>LastCheckupTime</th>\n <th>PhysicalActivities</th>\n <th>SleepHours</th>\n <th>RemovedTeeth</th>\n <th>HadHeartAttack</th>\n <th>...</th>\n <th>HeightInMeters</th>\n <th>WeightInKilograms</th>\n <th>BMI</th>\n <th>AlcoholDrinkers</th>\n <th>HIVTesting</th>\n <th>FluVaxLast12</th>\n <th>PneumoVaxEver</th>\n <th>TetanusLast10Tdap</th>\n <th>HighRiskLastYear</th>\n <th>CovidPos</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>339824</th>\n <td>South Dakota</td>\n <td>0.0</td>\n <td>0.50</td>\n <td>0.1</td>\n <td>0.700000</td>\n <td>Within past year (anytime less than 12 months ...</td>\n <td>1.0</td>\n <td>0.304348</td>\n <td>0.666667</td>\n <td>0.0</td>\n <td>...</td>\n <td>0.482517</td>\n <td>0.110156</td>\n <td>0.097513</td>\n <td>0.0</td>\n <td>1.0</td>\n <td>1.0</td>\n <td>1.0</td>\n <td>0.0</td>\n <td>0.0</td>\n <td>0.0</td>\n </tr>\n <tr>\n <th>127927</th>\n <td>Kansas</td>\n <td>0.0</td>\n <td>0.50</td>\n <td>1.0</td>\n <td>0.000000</td>\n <td>Within past year (anytime less than 12 months ...</td>\n <td>1.0</td>\n <td>0.391304</td>\n <td>0.333333</td>\n <td>0.0</td>\n <td>...</td>\n <td>0.538462</td>\n <td>0.279650</td>\n <td>0.264860</td>\n <td>0.0</td>\n <td>0.0</td>\n <td>1.0</td>\n <td>1.0</td>\n <td>NaN</td>\n <td>0.0</td>\n <td>0.0</td>\n </tr>\n <tr>\n <th>362523</th>\n <td>Utah</td>\n <td>1.0</td>\n <td>1.00</td>\n <td>0.0</td>\n <td>0.000000</td>\n <td>Within past year (anytime less than 12 months ...</td>\n <td>1.0</td>\n <td>0.260870</td>\n <td>0.333333</td>\n <td>0.0</td>\n <td>...</td>\n <td>0.643357</td>\n <td>0.340670</td>\n <td>0.257153</td>\n <td>0.0</td>\n <td>0.0</td>\n <td>0.0</td>\n <td>0.0</td>\n <td>0.0</td>\n <td>0.0</td>\n <td>1.0</td>\n </tr>\n <tr>\n <th>183687</th>\n <td>Michigan</td>\n <td>1.0</td>\n <td>0.50</td>\n <td>0.0</td>\n <td>0.233333</td>\n <td>Within past year (anytime less than 12 months ...</td>\n <td>1.0</td>\n <td>0.304348</td>\n <td>0.000000</td>\n <td>0.0</td>\n <td>...</td>\n <td>0.608392</td>\n <td>0.228795</td>\n <td>0.169567</td>\n <td>1.0</td>\n <td>0.0</td>\n <td>1.0</td>\n <td>1.0</td>\n <td>0.0</td>\n <td>0.0</td>\n <td>0.0</td>\n </tr>\n <tr>\n <th>191905</th>\n <td>Michigan</td>\n <td>0.0</td>\n <td>0.75</td>\n <td>0.0</td>\n <td>0.000000</td>\n <td>Within past year (anytime less than 12 months ...</td>\n <td>1.0</td>\n <td>0.260870</td>\n <td>0.000000</td>\n <td>0.0</td>\n <td>...</td>\n <td>0.461538</td>\n <td>0.169494</td>\n <td>0.180077</td>\n <td>1.0</td>\n <td>1.0</td>\n <td>0.0</td>\n <td>1.0</td>\n <td>0.0</td>\n <td>0.0</td>\n <td>0.0</td>\n </tr>\n </tbody>\n</table>\n<p>5 rows × 40 columns</p>\n</div>"
},
"execution_count": 71,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"scale_float_columns(test)\n",
"scale_float_columns(train)\n",
"scale_float_columns(valid)\n",
"test.head()"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "markdown",
"source": [
"## 5. Czyszczenie brakujących pól\n",
"\n",
"Nie możemy użyć .dropna() gdyż większość wierszy ma brakujące wartości:"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 72,
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"445132\n",
"199110\n"
]
}
],
"source": [
"print(df.shape[0])\n",
"print(df.shape[0] - df.dropna().shape[0])"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 73,
"outputs": [
{
"data": {
"text/plain": " State Male GeneralHealth PhysicalHealthDays \\\n339824 South Dakota 0.0 0.50 0.1 \n127927 Kansas 0.0 0.50 1.0 \n362523 Utah 1.0 1.00 0.0 \n183687 Michigan 1.0 0.50 0.0 \n191905 Michigan 0.0 0.75 0.0 \n\n MentalHealthDays LastCheckupTime \\\n339824 0.700000 Within past year (anytime less than 12 months ... \n127927 0.000000 Within past year (anytime less than 12 months ... \n362523 0.000000 Within past year (anytime less than 12 months ... \n183687 0.233333 Within past year (anytime less than 12 months ... \n191905 0.000000 Within past year (anytime less than 12 months ... \n\n PhysicalActivities SleepHours RemovedTeeth HadHeartAttack ... \\\n339824 1.0 0.304348 0.666667 0.0 ... \n127927 1.0 0.391304 0.333333 0.0 ... \n362523 1.0 0.260870 0.333333 0.0 ... \n183687 1.0 0.304348 0.000000 0.0 ... \n191905 1.0 0.260870 0.000000 0.0 ... \n\n HeightInMeters WeightInKilograms BMI AlcoholDrinkers \\\n339824 0.482517 0.110156 0.097513 0.0 \n127927 0.538462 0.279650 0.264860 0.0 \n362523 0.643357 0.340670 0.257153 0.0 \n183687 0.608392 0.228795 0.169567 1.0 \n191905 0.461538 0.169494 0.180077 1.0 \n\n HIVTesting FluVaxLast12 PneumoVaxEver TetanusLast10Tdap \\\n339824 1.0 1.0 1.0 0.0 \n127927 0.0 1.0 1.0 NaN \n362523 0.0 0.0 0.0 0.0 \n183687 0.0 1.0 1.0 0.0 \n191905 1.0 0.0 1.0 0.0 \n\n HighRiskLastYear CovidPos \n339824 0.0 0.0 \n127927 0.0 0.0 \n362523 0.0 1.0 \n183687 0.0 0.0 \n191905 0.0 0.0 \n\n[5 rows x 40 columns]",
"text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>State</th>\n <th>Male</th>\n <th>GeneralHealth</th>\n <th>PhysicalHealthDays</th>\n <th>MentalHealthDays</th>\n <th>LastCheckupTime</th>\n <th>PhysicalActivities</th>\n <th>SleepHours</th>\n <th>RemovedTeeth</th>\n <th>HadHeartAttack</th>\n <th>...</th>\n <th>HeightInMeters</th>\n <th>WeightInKilograms</th>\n <th>BMI</th>\n <th>AlcoholDrinkers</th>\n <th>HIVTesting</th>\n <th>FluVaxLast12</th>\n <th>PneumoVaxEver</th>\n <th>TetanusLast10Tdap</th>\n <th>HighRiskLastYear</th>\n <th>CovidPos</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>339824</th>\n <td>South Dakota</td>\n <td>0.0</td>\n <td>0.50</td>\n <td>0.1</td>\n <td>0.700000</td>\n <td>Within past year (anytime less than 12 months ...</td>\n <td>1.0</td>\n <td>0.304348</td>\n <td>0.666667</td>\n <td>0.0</td>\n <td>...</td>\n <td>0.482517</td>\n <td>0.110156</td>\n <td>0.097513</td>\n <td>0.0</td>\n <td>1.0</td>\n <td>1.0</td>\n <td>1.0</td>\n <td>0.0</td>\n <td>0.0</td>\n <td>0.0</td>\n </tr>\n <tr>\n <th>127927</th>\n <td>Kansas</td>\n <td>0.0</td>\n <td>0.50</td>\n <td>1.0</td>\n <td>0.000000</td>\n <td>Within past year (anytime less than 12 months ...</td>\n <td>1.0</td>\n <td>0.391304</td>\n <td>0.333333</td>\n <td>0.0</td>\n <td>...</td>\n <td>0.538462</td>\n <td>0.279650</td>\n <td>0.264860</td>\n <td>0.0</td>\n <td>0.0</td>\n <td>1.0</td>\n <td>1.0</td>\n <td>NaN</td>\n <td>0.0</td>\n <td>0.0</td>\n </tr>\n <tr>\n <th>362523</th>\n <td>Utah</td>\n <td>1.0</td>\n <td>1.00</td>\n <td>0.0</td>\n <td>0.000000</td>\n <td>Within past year (anytime less than 12 months ...</td>\n <td>1.0</td>\n <td>0.260870</td>\n <td>0.333333</td>\n <td>0.0</td>\n <td>...</td>\n <td>0.643357</td>\n <td>0.340670</td>\n <td>0.257153</td>\n <td>0.0</td>\n <td>0.0</td>\n <td>0.0</td>\n <td>0.0</td>\n <td>0.0</td>\n <td>0.0</td>\n <td>1.0</td>\n </tr>\n <tr>\n <th>183687</th>\n <td>Michigan</td>\n <td>1.0</td>\n <td>0.50</td>\n <td>0.0</td>\n <td>0.233333</td>\n <td>Within past year (anytime less than 12 months ...</td>\n <td>1.0</td>\n <td>0.304348</td>\n <td>0.000000</td>\n <td>0.0</td>\n <td>...</td>\n <td>0.608392</td>\n <td>0.228795</td>\n <td>0.169567</td>\n <td>1.0</td>\n <td>0.0</td>\n <td>1.0</td>\n <td>1.0</td>\n <td>0.0</td>\n <td>0.0</td>\n <td>0.0</td>\n </tr>\n <tr>\n <th>191905</th>\n <td>Michigan</td>\n <td>0.0</td>\n <td>0.75</td>\n <td>0.0</td>\n <td>0.000000</td>\n <td>Within past year (anytime less than 12 months ...</td>\n <td>1.0</td>\n <td>0.260870</td>\n <td>0.000000</td>\n <td>0.0</td>\n <td>...</td>\n <td>0.461538</td>\n <td>0.169494</td>\n <td>0.180077</td>\n <td>1.0</td>\n <td>1.0</td>\n <td>0.0</td>\n <td>1.0</td>\n <td>0.0</td>\n <td>0.0</td>\n <td>0.0</td>\n </tr>\n </tbody>\n</table>\n<p>5 rows × 40 columns</p>\n</div>"
},
"execution_count": 73,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"test.head()"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "markdown",
"source": [
"Uzupełniam brakujące wartości medianą:"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 74,
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"C:\\Users\\Adrian\\AppData\\Local\\Temp\\ipykernel_18732\\896322512.py:4: FutureWarning: Dropping of nuisance columns in DataFrame reductions (with 'numeric_only=None') is deprecated; in a future version this will raise TypeError. Select only valid columns before calling the reduction.\n",
" test.fillna(test.median(),inplace=True)\n",
"C:\\Users\\Adrian\\AppData\\Local\\Temp\\ipykernel_18732\\896322512.py:5: FutureWarning: Dropping of nuisance columns in DataFrame reductions (with 'numeric_only=None') is deprecated; in a future version this will raise TypeError. Select only valid columns before calling the reduction.\n",
" train.fillna(train.median(),inplace=True)\n",
"C:\\Users\\Adrian\\AppData\\Local\\Temp\\ipykernel_18732\\896322512.py:6: FutureWarning: Dropping of nuisance columns in DataFrame reductions (with 'numeric_only=None') is deprecated; in a future version this will raise TypeError. Select only valid columns before calling the reduction.\n",
" valid.fillna(valid.median(),inplace=True)\n"
]
}
],
"source": [
"#test.dropna(inplace=True)\n",
"#train.dropna(inplace=True)\n",
"#valid.dropna(inplace=True)\n",
"test.fillna(test.median(),inplace=True)\n",
"train.fillna(train.median(),inplace=True)\n",
"valid.fillna(valid.median(),inplace=True)"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 75,
"outputs": [
{
"data": {
"text/plain": " State Male GeneralHealth PhysicalHealthDays \\\n339824 South Dakota 0.0 0.50 0.1 \n127927 Kansas 0.0 0.50 1.0 \n362523 Utah 1.0 1.00 0.0 \n183687 Michigan 1.0 0.50 0.0 \n191905 Michigan 0.0 0.75 0.0 \n\n MentalHealthDays LastCheckupTime \\\n339824 0.700000 Within past year (anytime less than 12 months ... \n127927 0.000000 Within past year (anytime less than 12 months ... \n362523 0.000000 Within past year (anytime less than 12 months ... \n183687 0.233333 Within past year (anytime less than 12 months ... \n191905 0.000000 Within past year (anytime less than 12 months ... \n\n PhysicalActivities SleepHours RemovedTeeth HadHeartAttack ... \\\n339824 1.0 0.304348 0.666667 0.0 ... \n127927 1.0 0.391304 0.333333 0.0 ... \n362523 1.0 0.260870 0.333333 0.0 ... \n183687 1.0 0.304348 0.000000 0.0 ... \n191905 1.0 0.260870 0.000000 0.0 ... \n\n HeightInMeters WeightInKilograms BMI AlcoholDrinkers \\\n339824 0.482517 0.110156 0.097513 0.0 \n127927 0.538462 0.279650 0.264860 0.0 \n362523 0.643357 0.340670 0.257153 0.0 \n183687 0.608392 0.228795 0.169567 1.0 \n191905 0.461538 0.169494 0.180077 1.0 \n\n HIVTesting FluVaxLast12 PneumoVaxEver TetanusLast10Tdap \\\n339824 1.0 1.0 1.0 0.0 \n127927 0.0 1.0 1.0 0.0 \n362523 0.0 0.0 0.0 0.0 \n183687 0.0 1.0 1.0 0.0 \n191905 1.0 0.0 1.0 0.0 \n\n HighRiskLastYear CovidPos \n339824 0.0 0.0 \n127927 0.0 0.0 \n362523 0.0 1.0 \n183687 0.0 0.0 \n191905 0.0 0.0 \n\n[5 rows x 40 columns]",
"text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>State</th>\n <th>Male</th>\n <th>GeneralHealth</th>\n <th>PhysicalHealthDays</th>\n <th>MentalHealthDays</th>\n <th>LastCheckupTime</th>\n <th>PhysicalActivities</th>\n <th>SleepHours</th>\n <th>RemovedTeeth</th>\n <th>HadHeartAttack</th>\n <th>...</th>\n <th>HeightInMeters</th>\n <th>WeightInKilograms</th>\n <th>BMI</th>\n <th>AlcoholDrinkers</th>\n <th>HIVTesting</th>\n <th>FluVaxLast12</th>\n <th>PneumoVaxEver</th>\n <th>TetanusLast10Tdap</th>\n <th>HighRiskLastYear</th>\n <th>CovidPos</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>339824</th>\n <td>South Dakota</td>\n <td>0.0</td>\n <td>0.50</td>\n <td>0.1</td>\n <td>0.700000</td>\n <td>Within past year (anytime less than 12 months ...</td>\n <td>1.0</td>\n <td>0.304348</td>\n <td>0.666667</td>\n <td>0.0</td>\n <td>...</td>\n <td>0.482517</td>\n <td>0.110156</td>\n <td>0.097513</td>\n <td>0.0</td>\n <td>1.0</td>\n <td>1.0</td>\n <td>1.0</td>\n <td>0.0</td>\n <td>0.0</td>\n <td>0.0</td>\n </tr>\n <tr>\n <th>127927</th>\n <td>Kansas</td>\n <td>0.0</td>\n <td>0.50</td>\n <td>1.0</td>\n <td>0.000000</td>\n <td>Within past year (anytime less than 12 months ...</td>\n <td>1.0</td>\n <td>0.391304</td>\n <td>0.333333</td>\n <td>0.0</td>\n <td>...</td>\n <td>0.538462</td>\n <td>0.279650</td>\n <td>0.264860</td>\n <td>0.0</td>\n <td>0.0</td>\n <td>1.0</td>\n <td>1.0</td>\n <td>0.0</td>\n <td>0.0</td>\n <td>0.0</td>\n </tr>\n <tr>\n <th>362523</th>\n <td>Utah</td>\n <td>1.0</td>\n <td>1.00</td>\n <td>0.0</td>\n <td>0.000000</td>\n <td>Within past year (anytime less than 12 months ...</td>\n <td>1.0</td>\n <td>0.260870</td>\n <td>0.333333</td>\n <td>0.0</td>\n <td>...</td>\n <td>0.643357</td>\n <td>0.340670</td>\n <td>0.257153</td>\n <td>0.0</td>\n <td>0.0</td>\n <td>0.0</td>\n <td>0.0</td>\n <td>0.0</td>\n <td>0.0</td>\n <td>1.0</td>\n </tr>\n <tr>\n <th>183687</th>\n <td>Michigan</td>\n <td>1.0</td>\n <td>0.50</td>\n <td>0.0</td>\n <td>0.233333</td>\n <td>Within past year (anytime less than 12 months ...</td>\n <td>1.0</td>\n <td>0.304348</td>\n <td>0.000000</td>\n <td>0.0</td>\n <td>...</td>\n <td>0.608392</td>\n <td>0.228795</td>\n <td>0.169567</td>\n <td>1.0</td>\n <td>0.0</td>\n <td>1.0</td>\n <td>1.0</td>\n <td>0.0</td>\n <td>0.0</td>\n <td>0.0</td>\n </tr>\n <tr>\n <th>191905</th>\n <td>Michigan</td>\n <td>0.0</td>\n <td>0.75</td>\n <td>0.0</td>\n <td>0.000000</td>\n <td>Within past year (anytime less than 12 months ...</td>\n <td>1.0</td>\n <td>0.260870</td>\n <td>0.000000</td>\n <td>0.0</td>\n <td>...</td>\n <td>0.461538</td>\n <td>0.169494</td>\n <td>0.180077</td>\n <td>1.0</td>\n <td>1.0</td>\n <td>0.0</td>\n <td>1.0</td>\n <td>0.0</td>\n <td>0.0</td>\n <td>0.0</td>\n </tr>\n </tbody>\n</table>\n<p>5 rows × 40 columns</p>\n</div>"
},
"execution_count": 75,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"test.head()"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "markdown",
"source": [
"Kolumny kategoryczne wypełniłem w czasie normalizacji wartościami \"Unknown\" ponieważ fillna-->median nie działa dla tego typu danych\n",
"(https://stackoverflow.com/questions/49127897/python-pandas-fillna-median-not-working)"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 76,
"outputs": [
{
"data": {
"text/plain": "0.0 42796\n1.0 1717\nName: HighRiskLastYear, dtype: int64"
},
"execution_count": 76,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"test[\"HighRiskLastYear\"].value_counts()"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 77,
"outputs": [
{
"data": {
"text/plain": "0"
},
"execution_count": 77,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"test[\"HighRiskLastYear\"].isna().sum()"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "markdown",
"source": [
"### Brak wartości non-null:"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 78,
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"<class 'pandas.core.frame.DataFrame'>\n",
"Int64Index: 44513 entries, 339824 to 52161\n",
"Data columns (total 40 columns):\n",
" # Column Non-Null Count Dtype \n",
"--- ------ -------------- ----- \n",
" 0 State 44513 non-null category\n",
" 1 Male 44513 non-null float64 \n",
" 2 GeneralHealth 44513 non-null float64 \n",
" 3 PhysicalHealthDays 44513 non-null float64 \n",
" 4 MentalHealthDays 44513 non-null float64 \n",
" 5 LastCheckupTime 44513 non-null category\n",
" 6 PhysicalActivities 44513 non-null float64 \n",
" 7 SleepHours 44513 non-null float64 \n",
" 8 RemovedTeeth 44513 non-null float64 \n",
" 9 HadHeartAttack 44513 non-null float64 \n",
" 10 HadAngina 44513 non-null float64 \n",
" 11 HadStroke 44513 non-null float64 \n",
" 12 HadAsthma 44513 non-null float64 \n",
" 13 HadSkinCancer 44513 non-null float64 \n",
" 14 HadCOPD 44513 non-null float64 \n",
" 15 HadDepressiveDisorder 44513 non-null float64 \n",
" 16 HadKidneyDisease 44513 non-null float64 \n",
" 17 HadArthritis 44513 non-null float64 \n",
" 18 HadDiabetes 44513 non-null float64 \n",
" 19 DeafOrHardOfHearing 44513 non-null float64 \n",
" 20 BlindOrVisionDifficulty 44513 non-null float64 \n",
" 21 DifficultyConcentrating 44513 non-null float64 \n",
" 22 DifficultyWalking 44513 non-null float64 \n",
" 23 DifficultyDressingBathing 44513 non-null float64 \n",
" 24 DifficultyErrands 44513 non-null float64 \n",
" 25 SmokerStatus 44513 non-null float64 \n",
" 26 ECigaretteUsage 44513 non-null float64 \n",
" 27 ChestScan 44513 non-null float64 \n",
" 28 RaceEthnicityCategory 44513 non-null category\n",
" 29 AgeCategory 44513 non-null category\n",
" 30 HeightInMeters 44513 non-null float64 \n",
" 31 WeightInKilograms 44513 non-null float64 \n",
" 32 BMI 44513 non-null float64 \n",
" 33 AlcoholDrinkers 44513 non-null float64 \n",
" 34 HIVTesting 44513 non-null float64 \n",
" 35 FluVaxLast12 44513 non-null float64 \n",
" 36 PneumoVaxEver 44513 non-null float64 \n",
" 37 TetanusLast10Tdap 44513 non-null float64 \n",
" 38 HighRiskLastYear 44513 non-null float64 \n",
" 39 CovidPos 44513 non-null float64 \n",
"dtypes: category(4), float64(36)\n",
"memory usage: 12.7 MB\n"
]
}
],
"source": [
"test.info()"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 79,
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"<class 'pandas.core.frame.DataFrame'>\n",
"RangeIndex: 676777 entries, 0 to 676776\n",
"Data columns (total 40 columns):\n",
" # Column Non-Null Count Dtype \n",
"--- ------ -------------- ----- \n",
" 0 State 676777 non-null category\n",
" 1 Male 676777 non-null float64 \n",
" 2 GeneralHealth 676777 non-null float64 \n",
" 3 PhysicalHealthDays 676777 non-null float64 \n",
" 4 MentalHealthDays 676777 non-null float64 \n",
" 5 LastCheckupTime 676777 non-null category\n",
" 6 PhysicalActivities 676777 non-null float64 \n",
" 7 SleepHours 676777 non-null float64 \n",
" 8 RemovedTeeth 676777 non-null float64 \n",
" 9 HadHeartAttack 676777 non-null float64 \n",
" 10 HadAngina 676777 non-null float64 \n",
" 11 HadStroke 676777 non-null float64 \n",
" 12 HadAsthma 676777 non-null float64 \n",
" 13 HadSkinCancer 676777 non-null float64 \n",
" 14 HadCOPD 676777 non-null float64 \n",
" 15 HadDepressiveDisorder 676777 non-null float64 \n",
" 16 HadKidneyDisease 676777 non-null float64 \n",
" 17 HadArthritis 676777 non-null float64 \n",
" 18 HadDiabetes 676777 non-null float64 \n",
" 19 DeafOrHardOfHearing 676777 non-null float64 \n",
" 20 BlindOrVisionDifficulty 676777 non-null float64 \n",
" 21 DifficultyConcentrating 676777 non-null float64 \n",
" 22 DifficultyWalking 676777 non-null float64 \n",
" 23 DifficultyDressingBathing 676777 non-null float64 \n",
" 24 DifficultyErrands 676777 non-null float64 \n",
" 25 SmokerStatus 676777 non-null float64 \n",
" 26 ECigaretteUsage 676777 non-null float64 \n",
" 27 ChestScan 676777 non-null float64 \n",
" 28 RaceEthnicityCategory 676777 non-null category\n",
" 29 AgeCategory 676777 non-null category\n",
" 30 HeightInMeters 676777 non-null float64 \n",
" 31 WeightInKilograms 676777 non-null float64 \n",
" 32 BMI 676777 non-null float64 \n",
" 33 AlcoholDrinkers 676777 non-null float64 \n",
" 34 HIVTesting 676777 non-null float64 \n",
" 35 FluVaxLast12 676777 non-null float64 \n",
" 36 PneumoVaxEver 676777 non-null float64 \n",
" 37 TetanusLast10Tdap 676777 non-null float64 \n",
" 38 HighRiskLastYear 676777 non-null float64 \n",
" 39 CovidPos 676777 non-null float64 \n",
"dtypes: category(4), float64(36)\n",
"memory usage: 188.5 MB\n"
]
}
],
"source": [
"train.info()"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 80,
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"<class 'pandas.core.frame.DataFrame'>\n",
"Int64Index: 44514 entries, 66965 to 224311\n",
"Data columns (total 40 columns):\n",
" # Column Non-Null Count Dtype \n",
"--- ------ -------------- ----- \n",
" 0 State 44514 non-null category\n",
" 1 Male 44514 non-null float64 \n",
" 2 GeneralHealth 44514 non-null float64 \n",
" 3 PhysicalHealthDays 44514 non-null float64 \n",
" 4 MentalHealthDays 44514 non-null float64 \n",
" 5 LastCheckupTime 44514 non-null category\n",
" 6 PhysicalActivities 44514 non-null float64 \n",
" 7 SleepHours 44514 non-null float64 \n",
" 8 RemovedTeeth 44514 non-null float64 \n",
" 9 HadHeartAttack 44514 non-null float64 \n",
" 10 HadAngina 44514 non-null float64 \n",
" 11 HadStroke 44514 non-null float64 \n",
" 12 HadAsthma 44514 non-null float64 \n",
" 13 HadSkinCancer 44514 non-null float64 \n",
" 14 HadCOPD 44514 non-null float64 \n",
" 15 HadDepressiveDisorder 44514 non-null float64 \n",
" 16 HadKidneyDisease 44514 non-null float64 \n",
" 17 HadArthritis 44514 non-null float64 \n",
" 18 HadDiabetes 44514 non-null float64 \n",
" 19 DeafOrHardOfHearing 44514 non-null float64 \n",
" 20 BlindOrVisionDifficulty 44514 non-null float64 \n",
" 21 DifficultyConcentrating 44514 non-null float64 \n",
" 22 DifficultyWalking 44514 non-null float64 \n",
" 23 DifficultyDressingBathing 44514 non-null float64 \n",
" 24 DifficultyErrands 44514 non-null float64 \n",
" 25 SmokerStatus 44514 non-null float64 \n",
" 26 ECigaretteUsage 44514 non-null float64 \n",
" 27 ChestScan 44514 non-null float64 \n",
" 28 RaceEthnicityCategory 44514 non-null category\n",
" 29 AgeCategory 44514 non-null category\n",
" 30 HeightInMeters 44514 non-null float64 \n",
" 31 WeightInKilograms 44514 non-null float64 \n",
" 32 BMI 44514 non-null float64 \n",
" 33 AlcoholDrinkers 44514 non-null float64 \n",
" 34 HIVTesting 44514 non-null float64 \n",
" 35 FluVaxLast12 44514 non-null float64 \n",
" 36 PneumoVaxEver 44514 non-null float64 \n",
" 37 TetanusLast10Tdap 44514 non-null float64 \n",
" 38 HighRiskLastYear 44514 non-null float64 \n",
" 39 CovidPos 44514 non-null float64 \n",
"dtypes: category(4), float64(36)\n",
"memory usage: 12.7 MB\n"
]
}
],
"source": [
"valid.info()"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": null,
"outputs": [],
"source": [],
"metadata": {
"collapsed": false
}
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 2
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython2",
"version": "2.7.6"
}
},
"nbformat": 4,
"nbformat_minor": 0
}