ium_452487/dane.ipynb
2024-04-14 16:17:38 +02:00

1703 lines
327 KiB
Plaintext
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

{
"cells": [
{
"cell_type": "markdown",
"source": [
"## 1. Pobieranie zbioru danych"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 97,
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Requirement already satisfied: kaggle in c:\\users\\adrian\\appdata\\roaming\\python\\python39\\site-packages (1.6.6)\n",
"Requirement already satisfied: bleach in c:\\users\\adrian\\miniconda3\\lib\\site-packages (from kaggle) (4.1.0)\n",
"Requirement already satisfied: python-slugify in c:\\users\\adrian\\appdata\\roaming\\python\\python39\\site-packages (from kaggle) (8.0.4)\n",
"Requirement already satisfied: python-dateutil in c:\\users\\adrian\\appdata\\roaming\\python\\python39\\site-packages (from kaggle) (2.8.2)\n",
"Requirement already satisfied: tqdm in c:\\users\\adrian\\appdata\\roaming\\python\\python39\\site-packages (from kaggle) (4.64.1)\n",
"Requirement already satisfied: requests in c:\\users\\adrian\\appdata\\roaming\\python\\python39\\site-packages (from kaggle) (2.28.1)\n",
"Requirement already satisfied: certifi in c:\\users\\adrian\\appdata\\roaming\\python\\python39\\site-packages (from kaggle) (2022.6.15)\n",
"Requirement already satisfied: six>=1.10 in c:\\users\\adrian\\appdata\\roaming\\python\\python39\\site-packages (from kaggle) (1.16.0)\n",
"Requirement already satisfied: urllib3 in c:\\users\\adrian\\appdata\\roaming\\python\\python39\\site-packages (from kaggle) (1.26.11)\n",
"Requirement already satisfied: webencodings in c:\\users\\adrian\\miniconda3\\lib\\site-packages (from bleach->kaggle) (0.5.1)\n",
"Requirement already satisfied: packaging in c:\\users\\adrian\\appdata\\roaming\\python\\python39\\site-packages (from bleach->kaggle) (22.0)\n",
"Requirement already satisfied: text-unidecode>=1.3 in c:\\users\\adrian\\appdata\\roaming\\python\\python39\\site-packages (from python-slugify->kaggle) (1.3)\n",
"Requirement already satisfied: idna<4,>=2.5 in c:\\users\\adrian\\appdata\\roaming\\python\\python39\\site-packages (from requests->kaggle) (2.10)\n",
"Requirement already satisfied: charset-normalizer<3,>=2 in c:\\users\\adrian\\appdata\\roaming\\python\\python39\\site-packages (from requests->kaggle) (2.1.0)\n",
"Requirement already satisfied: colorama in c:\\users\\adrian\\appdata\\roaming\\python\\python39\\site-packages (from tqdm->kaggle) (0.4.5)\n"
]
}
],
"source": [
"!pip install --user kaggle"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 98,
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"personal-key-indicators-of-heart-disease.zip: Skipping, found more recently modified local copy (use --force to force download)\n"
]
}
],
"source": [
"!kaggle datasets download -d kamilpytlak/personal-key-indicators-of-heart-disease/"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 99,
"outputs": [],
"source": [
"#!unzip -o personal-key-indicators-of-heart-disease.zip #nie działa na Windowsie więc korzystam z modułu zipfile"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 100,
"outputs": [],
"source": [
"import zipfile\n",
"with zipfile.ZipFile(\"personal-key-indicators-of-heart-disease.zip\", 'r') as zip_ref:\n",
" zip_ref.extractall(\"dataset_extracted\")"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 50,
"outputs": [],
"source": [
"import pandas as pd\n",
"# W pobranym zbiorze danych jest kilka podzbiorów więc celowo otwieram ten z NaN, żeby manualnie go oczyścić dla praktyki\n",
"df = pd.read_csv(\"dataset_extracted/2022/heart_2022_with_nans.csv\")"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "markdown",
"source": [
"## Przeglądanie nieoczyszczonego datasetu"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 51,
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"<class 'pandas.core.frame.DataFrame'>\n",
"RangeIndex: 445132 entries, 0 to 445131\n",
"Data columns (total 40 columns):\n",
" # Column Non-Null Count Dtype \n",
"--- ------ -------------- ----- \n",
" 0 State 445132 non-null object \n",
" 1 Sex 445132 non-null object \n",
" 2 GeneralHealth 443934 non-null object \n",
" 3 PhysicalHealthDays 434205 non-null float64\n",
" 4 MentalHealthDays 436065 non-null float64\n",
" 5 LastCheckupTime 436824 non-null object \n",
" 6 PhysicalActivities 444039 non-null object \n",
" 7 SleepHours 439679 non-null float64\n",
" 8 RemovedTeeth 433772 non-null object \n",
" 9 HadHeartAttack 442067 non-null object \n",
" 10 HadAngina 440727 non-null object \n",
" 11 HadStroke 443575 non-null object \n",
" 12 HadAsthma 443359 non-null object \n",
" 13 HadSkinCancer 441989 non-null object \n",
" 14 HadCOPD 442913 non-null object \n",
" 15 HadDepressiveDisorder 442320 non-null object \n",
" 16 HadKidneyDisease 443206 non-null object \n",
" 17 HadArthritis 442499 non-null object \n",
" 18 HadDiabetes 444045 non-null object \n",
" 19 DeafOrHardOfHearing 424485 non-null object \n",
" 20 BlindOrVisionDifficulty 423568 non-null object \n",
" 21 DifficultyConcentrating 420892 non-null object \n",
" 22 DifficultyWalking 421120 non-null object \n",
" 23 DifficultyDressingBathing 421217 non-null object \n",
" 24 DifficultyErrands 419476 non-null object \n",
" 25 SmokerStatus 409670 non-null object \n",
" 26 ECigaretteUsage 409472 non-null object \n",
" 27 ChestScan 389086 non-null object \n",
" 28 RaceEthnicityCategory 431075 non-null object \n",
" 29 AgeCategory 436053 non-null object \n",
" 30 HeightInMeters 416480 non-null float64\n",
" 31 WeightInKilograms 403054 non-null float64\n",
" 32 BMI 396326 non-null float64\n",
" 33 AlcoholDrinkers 398558 non-null object \n",
" 34 HIVTesting 379005 non-null object \n",
" 35 FluVaxLast12 398011 non-null object \n",
" 36 PneumoVaxEver 368092 non-null object \n",
" 37 TetanusLast10Tdap 362616 non-null object \n",
" 38 HighRiskLastYear 394509 non-null object \n",
" 39 CovidPos 394368 non-null object \n",
"dtypes: float64(6), object(34)\n",
"memory usage: 135.8+ MB\n"
]
}
],
"source": [
"df.info()"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 52,
"outputs": [
{
"data": {
"text/plain": " State Sex GeneralHealth PhysicalHealthDays MentalHealthDays \\\n0 Alabama Female Very good 0.0 0.0 \n1 Alabama Female Excellent 0.0 0.0 \n2 Alabama Female Very good 2.0 3.0 \n3 Alabama Female Excellent 0.0 0.0 \n4 Alabama Female Fair 2.0 0.0 \n\n LastCheckupTime PhysicalActivities \\\n0 Within past year (anytime less than 12 months ... No \n1 NaN No \n2 Within past year (anytime less than 12 months ... Yes \n3 Within past year (anytime less than 12 months ... Yes \n4 Within past year (anytime less than 12 months ... Yes \n\n SleepHours RemovedTeeth HadHeartAttack ... HeightInMeters \\\n0 8.0 NaN No ... NaN \n1 6.0 NaN No ... 1.60 \n2 5.0 NaN No ... 1.57 \n3 7.0 NaN No ... 1.65 \n4 9.0 NaN No ... 1.57 \n\n WeightInKilograms BMI AlcoholDrinkers HIVTesting FluVaxLast12 \\\n0 NaN NaN No No Yes \n1 68.04 26.57 No No No \n2 63.50 25.61 No No No \n3 63.50 23.30 No No Yes \n4 53.98 21.77 Yes No No \n\n PneumoVaxEver TetanusLast10Tdap \\\n0 No Yes, received tetanus shot but not sure what type \n1 No No, did not receive any tetanus shot in the pa... \n2 No NaN \n3 Yes No, did not receive any tetanus shot in the pa... \n4 Yes No, did not receive any tetanus shot in the pa... \n\n HighRiskLastYear CovidPos \n0 No No \n1 No No \n2 No Yes \n3 No No \n4 No No \n\n[5 rows x 40 columns]",
"text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>State</th>\n <th>Sex</th>\n <th>GeneralHealth</th>\n <th>PhysicalHealthDays</th>\n <th>MentalHealthDays</th>\n <th>LastCheckupTime</th>\n <th>PhysicalActivities</th>\n <th>SleepHours</th>\n <th>RemovedTeeth</th>\n <th>HadHeartAttack</th>\n <th>...</th>\n <th>HeightInMeters</th>\n <th>WeightInKilograms</th>\n <th>BMI</th>\n <th>AlcoholDrinkers</th>\n <th>HIVTesting</th>\n <th>FluVaxLast12</th>\n <th>PneumoVaxEver</th>\n <th>TetanusLast10Tdap</th>\n <th>HighRiskLastYear</th>\n <th>CovidPos</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>0</th>\n <td>Alabama</td>\n <td>Female</td>\n <td>Very good</td>\n <td>0.0</td>\n <td>0.0</td>\n <td>Within past year (anytime less than 12 months ...</td>\n <td>No</td>\n <td>8.0</td>\n <td>NaN</td>\n <td>No</td>\n <td>...</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>No</td>\n <td>No</td>\n <td>Yes</td>\n <td>No</td>\n <td>Yes, received tetanus shot but not sure what type</td>\n <td>No</td>\n <td>No</td>\n </tr>\n <tr>\n <th>1</th>\n <td>Alabama</td>\n <td>Female</td>\n <td>Excellent</td>\n <td>0.0</td>\n <td>0.0</td>\n <td>NaN</td>\n <td>No</td>\n <td>6.0</td>\n <td>NaN</td>\n <td>No</td>\n <td>...</td>\n <td>1.60</td>\n <td>68.04</td>\n <td>26.57</td>\n <td>No</td>\n <td>No</td>\n <td>No</td>\n <td>No</td>\n <td>No, did not receive any tetanus shot in the pa...</td>\n <td>No</td>\n <td>No</td>\n </tr>\n <tr>\n <th>2</th>\n <td>Alabama</td>\n <td>Female</td>\n <td>Very good</td>\n <td>2.0</td>\n <td>3.0</td>\n <td>Within past year (anytime less than 12 months ...</td>\n <td>Yes</td>\n <td>5.0</td>\n <td>NaN</td>\n <td>No</td>\n <td>...</td>\n <td>1.57</td>\n <td>63.50</td>\n <td>25.61</td>\n <td>No</td>\n <td>No</td>\n <td>No</td>\n <td>No</td>\n <td>NaN</td>\n <td>No</td>\n <td>Yes</td>\n </tr>\n <tr>\n <th>3</th>\n <td>Alabama</td>\n <td>Female</td>\n <td>Excellent</td>\n <td>0.0</td>\n <td>0.0</td>\n <td>Within past year (anytime less than 12 months ...</td>\n <td>Yes</td>\n <td>7.0</td>\n <td>NaN</td>\n <td>No</td>\n <td>...</td>\n <td>1.65</td>\n <td>63.50</td>\n <td>23.30</td>\n <td>No</td>\n <td>No</td>\n <td>Yes</td>\n <td>Yes</td>\n <td>No, did not receive any tetanus shot in the pa...</td>\n <td>No</td>\n <td>No</td>\n </tr>\n <tr>\n <th>4</th>\n <td>Alabama</td>\n <td>Female</td>\n <td>Fair</td>\n <td>2.0</td>\n <td>0.0</td>\n <td>Within past year (anytime less than 12 months ...</td>\n <td>Yes</td>\n <td>9.0</td>\n <td>NaN</td>\n <td>No</td>\n <td>...</td>\n <td>1.57</td>\n <td>53.98</td>\n <td>21.77</td>\n <td>Yes</td>\n <td>No</td>\n <td>No</td>\n <td>Yes</td>\n <td>No, did not receive any tetanus shot in the pa...</td>\n <td>No</td>\n <td>No</td>\n </tr>\n </tbody>\n</table>\n<p>5 rows × 40 columns</p>\n</div>"
},
"execution_count": 52,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.head()"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 53,
"outputs": [
{
"data": {
"text/plain": " PhysicalHealthDays MentalHealthDays SleepHours HeightInMeters \\\ncount 434205.000000 436065.000000 439679.000000 416480.000000 \nmean 4.347919 4.382649 7.022983 1.702691 \nstd 8.688912 8.387475 1.502425 0.107177 \nmin 0.000000 0.000000 1.000000 0.910000 \n25% 0.000000 0.000000 6.000000 1.630000 \n50% 0.000000 0.000000 7.000000 1.700000 \n75% 3.000000 5.000000 8.000000 1.780000 \nmax 30.000000 30.000000 24.000000 2.410000 \n\n WeightInKilograms BMI \ncount 403054.000000 396326.000000 \nmean 83.074470 28.529842 \nstd 21.448173 6.554889 \nmin 22.680000 12.020000 \n25% 68.040000 24.130000 \n50% 80.740000 27.440000 \n75% 95.250000 31.750000 \nmax 292.570000 99.640000 ",
"text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>PhysicalHealthDays</th>\n <th>MentalHealthDays</th>\n <th>SleepHours</th>\n <th>HeightInMeters</th>\n <th>WeightInKilograms</th>\n <th>BMI</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>count</th>\n <td>434205.000000</td>\n <td>436065.000000</td>\n <td>439679.000000</td>\n <td>416480.000000</td>\n <td>403054.000000</td>\n <td>396326.000000</td>\n </tr>\n <tr>\n <th>mean</th>\n <td>4.347919</td>\n <td>4.382649</td>\n <td>7.022983</td>\n <td>1.702691</td>\n <td>83.074470</td>\n <td>28.529842</td>\n </tr>\n <tr>\n <th>std</th>\n <td>8.688912</td>\n <td>8.387475</td>\n <td>1.502425</td>\n <td>0.107177</td>\n <td>21.448173</td>\n <td>6.554889</td>\n </tr>\n <tr>\n <th>min</th>\n <td>0.000000</td>\n <td>0.000000</td>\n <td>1.000000</td>\n <td>0.910000</td>\n <td>22.680000</td>\n <td>12.020000</td>\n </tr>\n <tr>\n <th>25%</th>\n <td>0.000000</td>\n <td>0.000000</td>\n <td>6.000000</td>\n <td>1.630000</td>\n <td>68.040000</td>\n <td>24.130000</td>\n </tr>\n <tr>\n <th>50%</th>\n <td>0.000000</td>\n <td>0.000000</td>\n <td>7.000000</td>\n <td>1.700000</td>\n <td>80.740000</td>\n <td>27.440000</td>\n </tr>\n <tr>\n <th>75%</th>\n <td>3.000000</td>\n <td>5.000000</td>\n <td>8.000000</td>\n <td>1.780000</td>\n <td>95.250000</td>\n <td>31.750000</td>\n </tr>\n <tr>\n <th>max</th>\n <td>30.000000</td>\n <td>30.000000</td>\n <td>24.000000</td>\n <td>2.410000</td>\n <td>292.570000</td>\n <td>99.640000</td>\n </tr>\n </tbody>\n</table>\n</div>"
},
"execution_count": 53,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.describe()"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "markdown",
"source": [
"#### Tylko 6 kolumn jest numeryczne na razie więc wiele statystyk nie zostaje wyświetlonych w tym podsumowaniu"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "markdown",
"source": [
"#### Zbiór danych jest niezbalansowany, zmienna którą chcemy przewidzieć w znacznej większości przypadków wynosi 0:"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 54,
"outputs": [
{
"data": {
"text/plain": "<AxesSubplot:ylabel='count'>"
},
"execution_count": 54,
"metadata": {},
"output_type": "execute_result"
},
{
"data": {
"text/plain": "<Figure size 640x480 with 1 Axes>",
"image/png": "\n"
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"df[\"HadHeartAttack\"].value_counts().plot(kind=\"pie\")"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 55,
"outputs": [
{
"data": {
"text/plain": "HadHeartAttack\nNo 416959\nYes 25108\nName: count, dtype: int64"
},
"execution_count": 55,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df[\"HadHeartAttack\"].value_counts()"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "markdown",
"source": [
"## 2. Podział na podzbiory (train / dev / test - 8:1:1)) i oversampling"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 56,
"outputs": [],
"source": [
"from sklearn.model_selection import train_test_split\n",
"#Funkcji z sklearn musimy użyć dwukrotnie, bo dzieli tylko na dwa podzbiory\n",
"train, test_and_valid = train_test_split(df, test_size=0.2) #0.8 train, 0.2 test&valid\n",
"\n",
"test, valid = train_test_split(test_and_valid, test_size=0.5) #0.1 test, 0.1 valid"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 57,
"outputs": [
{
"data": {
"text/plain": "HadHeartAttack\nNo 333640\nYes 20032\nName: count, dtype: int64"
},
"execution_count": 57,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"train[\"HadHeartAttack\"].value_counts()"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "markdown",
"source": [
"Zbiór treningowy jest nadal niezbalansowany więc zrobię prosty oversampling przez kopiowanie mniejszej klasy aż będą prawie równe"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 58,
"outputs": [],
"source": [
"def oversample(dataset):\n",
" num_true = len(dataset[dataset[\"HadHeartAttack\"]==\"Yes\"])\n",
" num_false = len(dataset[dataset[\"HadHeartAttack\"]==\"No\"])\n",
" num_oversampling_steps = num_false//num_true\n",
" oversampled = dataset.copy()\n",
" for x in range(num_oversampling_steps):\n",
" oversampled = pd.concat([oversampled, dataset[dataset[\"HadHeartAttack\"]==\"Yes\"]], ignore_index=True)\n",
" return oversampled"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 59,
"outputs": [],
"source": [
"train = oversample(train)"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 60,
"outputs": [
{
"data": {
"text/plain": "<AxesSubplot:ylabel='count'>"
},
"execution_count": 60,
"metadata": {},
"output_type": "execute_result"
},
{
"data": {
"text/plain": "<Figure size 640x480 with 1 Axes>",
"image/png": "\n"
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"train[\"HadHeartAttack\"].value_counts().plot(kind=\"pie\")"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 61,
"outputs": [
{
"data": {
"text/plain": "<AxesSubplot:ylabel='count'>"
},
"execution_count": 61,
"metadata": {},
"output_type": "execute_result"
},
{
"data": {
"text/plain": "<Figure size 640x480 with 1 Axes>",
"image/png": "\n"
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"test[\"HadHeartAttack\"].value_counts().plot(kind=\"pie\")"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 62,
"outputs": [
{
"data": {
"text/plain": "<AxesSubplot:ylabel='count'>"
},
"execution_count": 62,
"metadata": {},
"output_type": "execute_result"
},
{
"data": {
"text/plain": "<Figure size 640x480 with 1 Axes>",
"image/png": "\n"
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"valid[\"HadHeartAttack\"].value_counts().plot(kind=\"pie\")"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "markdown",
"source": [
"Proporcje osób palących / niepalących w pierwotnym zbiorze danych:"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 63,
"outputs": [
{
"data": {
"text/plain": "<AxesSubplot:ylabel='count'>"
},
"execution_count": 63,
"metadata": {},
"output_type": "execute_result"
},
{
"data": {
"text/plain": "<Figure size 640x480 with 1 Axes>",
"image/png": "\n"
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"df[\"SmokerStatus\"].value_counts().plot(kind=\"pie\")"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 64,
"outputs": [
{
"data": {
"text/plain": "<AxesSubplot:ylabel='count'>"
},
"execution_count": 64,
"metadata": {},
"output_type": "execute_result"
},
{
"data": {
"text/plain": "<Figure size 640x480 with 1 Axes>",
"image/png": "\n"
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"df[\"ECigaretteUsage\"].value_counts().plot(kind=\"pie\")"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "markdown",
"source": [
"### Statystyki covidowe"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 65,
"outputs": [
{
"data": {
"text/plain": "<AxesSubplot:ylabel='count'>"
},
"execution_count": 65,
"metadata": {},
"output_type": "execute_result"
},
{
"data": {
"text/plain": "<Figure size 640x480 with 1 Axes>",
"image/png": "\n"
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"df[\"CovidPos\"].value_counts().plot(kind=\"pie\")"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "markdown",
"source": [
"## Normalizacja część 1 - zamiana na kolumny liczbowe i kategoryczne"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "markdown",
"source": [
"Kolumny zawierające stan zdrowia i podobne cechy opisane w sposób \"poor/fair/good/excellent\" etc. starałem się zamienić na liczbowe w sposób sensowny, rosnący względem pozytywnego aspektu tego czynnika zdrowotnego. Podobnie z tym jak często dana osoba paliła.\n",
"Część kolumn zamieniłem na kategoryczne\n",
"Kolumnę płci zamieniłem na numeryczną w celu późniejszego wykorzystania przez model, choć mialem wątpliwości co do robienia tego pod względem poprawności politycznej\n"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 66,
"outputs": [
{
"data": {
"text/plain": "array(['Female', 'Male'], dtype=object)"
},
"execution_count": 66,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df[\"Sex\"].unique()"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 67,
"outputs": [
{
"data": {
"text/plain": "array(['Very good', 'Excellent', 'Fair', 'Poor', 'Good', nan],\n dtype=object)"
},
"execution_count": 67,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df[\"GeneralHealth\"].unique()"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 68,
"outputs": [],
"source": [
"health_map = {\n",
" \"Excellent\": 5,\n",
" \"Very good\": 4,\n",
" \"Good\": 3,\n",
" \"Fair\": 2,\n",
" \"Poor\": 1\n",
"}"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 69,
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"State:\n",
"['Alabama' 'Alaska' 'Arizona' 'Arkansas' 'California' 'Colorado'\n",
" 'Connecticut' 'Delaware' 'District of Columbia' 'Florida' 'Georgia'\n",
" 'Hawaii' 'Idaho' 'Illinois' 'Indiana' 'Iowa' 'Kansas' 'Kentucky'\n",
" 'Louisiana' 'Maine' 'Maryland' 'Massachusetts' 'Michigan' 'Minnesota'\n",
" 'Mississippi' 'Missouri' 'Montana' 'Nebraska' 'Nevada' 'New Hampshire'\n",
" 'New Jersey' 'New Mexico' 'New York' 'North Carolina' 'North Dakota'\n",
" 'Ohio' 'Oklahoma' 'Oregon' 'Pennsylvania' 'Rhode Island' 'South Carolina'\n",
" 'South Dakota' 'Tennessee' 'Texas' 'Utah' 'Vermont' 'Virginia'\n",
" 'Washington' 'West Virginia' 'Wisconsin' 'Wyoming' 'Guam' 'Puerto Rico'\n",
" 'Virgin Islands']\n",
"Sex:\n",
"['Female' 'Male']\n",
"GeneralHealth:\n",
"['Very good' 'Excellent' 'Fair' 'Poor' 'Good' nan]\n",
"PhysicalHealthDays:\n",
"[ 0. 2. 1. 8. 5. 30. 4. 23. 14. nan 15. 3. 10. 7. 25. 6. 21. 20.\n",
" 29. 16. 9. 27. 28. 12. 13. 11. 26. 17. 24. 19. 18. 22.]\n",
"MentalHealthDays:\n",
"[ 0. 3. 9. 5. 15. 20. 14. 10. 18. 1. nan 2. 30. 4. 6. 7. 25. 8.\n",
" 22. 29. 27. 21. 12. 28. 16. 13. 26. 17. 11. 23. 19. 24.]\n",
"LastCheckupTime:\n",
"['Within past year (anytime less than 12 months ago)' nan\n",
" 'Within past 2 years (1 year but less than 2 years ago)'\n",
" 'Within past 5 years (2 years but less than 5 years ago)'\n",
" '5 or more years ago']\n",
"PhysicalActivities:\n",
"['No' 'Yes' nan]\n",
"SleepHours:\n",
"[ 8. 6. 5. 7. 9. 4. 10. 1. 12. nan 18. 3. 2. 11. 16. 15. 13. 14.\n",
" 20. 23. 17. 24. 22. 19. 21.]\n",
"RemovedTeeth:\n",
"[nan 'None of them' '1 to 5' '6 or more, but not all' 'All']\n",
"HadHeartAttack:\n",
"['No' 'Yes' nan]\n",
"HadAngina:\n",
"['No' 'Yes' nan]\n",
"HadStroke:\n",
"['No' 'Yes' nan]\n",
"HadAsthma:\n",
"['No' 'Yes' nan]\n",
"HadSkinCancer:\n",
"['No' 'Yes' nan]\n",
"HadCOPD:\n",
"['No' 'Yes' nan]\n",
"HadDepressiveDisorder:\n",
"['No' 'Yes' nan]\n",
"HadKidneyDisease:\n",
"['No' 'Yes' nan]\n",
"HadArthritis:\n",
"['No' 'Yes' nan]\n",
"HadDiabetes:\n",
"['Yes' 'No' 'No, pre-diabetes or borderline diabetes' nan\n",
" 'Yes, but only during pregnancy (female)']\n",
"DeafOrHardOfHearing:\n",
"['No' nan 'Yes']\n",
"BlindOrVisionDifficulty:\n",
"['No' 'Yes' nan]\n",
"DifficultyConcentrating:\n",
"['No' nan 'Yes']\n",
"DifficultyWalking:\n",
"['No' 'Yes' nan]\n",
"DifficultyDressingBathing:\n",
"['No' nan 'Yes']\n",
"DifficultyErrands:\n",
"['No' 'Yes' nan]\n",
"SmokerStatus:\n",
"['Never smoked' 'Current smoker - now smokes some days' 'Former smoker'\n",
" nan 'Current smoker - now smokes every day']\n",
"ECigaretteUsage:\n",
"['Not at all (right now)' 'Never used e-cigarettes in my entire life' nan\n",
" 'Use them every day' 'Use them some days']\n",
"ChestScan:\n",
"['No' 'Yes' nan]\n",
"RaceEthnicityCategory:\n",
"['White only, Non-Hispanic' 'Black only, Non-Hispanic'\n",
" 'Other race only, Non-Hispanic' 'Multiracial, Non-Hispanic' nan\n",
" 'Hispanic']\n",
"AgeCategory:\n",
"['Age 80 or older' 'Age 55 to 59' nan 'Age 40 to 44' 'Age 75 to 79'\n",
" 'Age 70 to 74' 'Age 65 to 69' 'Age 60 to 64' 'Age 50 to 54'\n",
" 'Age 45 to 49' 'Age 35 to 39' 'Age 25 to 29' 'Age 30 to 34'\n",
" 'Age 18 to 24']\n",
"HeightInMeters:\n",
"[ nan 1.6 1.57 1.65 1.8 1.63 1.7 1.68 1.73 1.55 1.93 1.88 1.78 1.85\n",
" 1.75 1.52 1.83 1.91 1.96 1.5 1.45 1.42 1.24 1.47 1.22 1.98 2.03 2.01\n",
" 1.3 1.4 1.35 1.82 1.67 1.76 2.11 1.37 1.64 1.71 2.16 2.26 0.91 2.06\n",
" 1.14 1.74 1.51 1.53 1.69 1.56 1.84 1.9 1.54 1.72 1.87 1.61 1.49 1.59\n",
" 1.58 1.62 1.79 1.46 1.89 2.13 0.99 2.08 2.21 1.32 2.18 1.77 2.36 1.25\n",
" 1.66 1.86 1.95 1.19 1.05 1.48 1.03 1.18 1.81 1.38 1.44 1.07 1.27 1.2\n",
" 1.17 1.04 2.24 1.1 1.43 1.92 2.05 1.12 2.41 2.34 0.97 1.06 1.15 2.29\n",
" 1.16 1.09 0.92 2.07 1. 1.08 1.02 1.33 2. 2.02 1.94 0.95]\n",
"WeightInKilograms:\n",
"[ nan 68.04 63.5 53.98 84.82 62.6 73.48 81.65 74.84 59.42\n",
" 85.28 106.59 71.21 64.41 61.23 90.72 65.77 66.22 80.29 86.18\n",
" 47.63 107.05 57.15 105.23 77.11 56.7 79.38 113.4 102.06 59.87\n",
" 104.33 53.52 61.69 136.08 34.47 99.79 127.01 78.93 95.25 58.97\n",
" 92.08 72.57 83.91 49.9 117.93 71.67 102.97 62.14 83.46 54.43\n",
" 94.35 60.78 117.03 65.32 76.66 88.45 89.81 74.39 68.95 79.83\n",
" 108.41 90.26 55.79 91.63 47.17 78.02 50.8 91.17 84.37 145.15\n",
" 93.89 122.47 48.99 73.94 88.9 80.74 81.19 158.76 97.52 51.71\n",
" 82.55 76.2 68.49 75.3 70.31 63.05 60.33 115.67 86.64 108.86\n",
" 92.53 124.74 43.09 58.51 63.96 92.99 44.45 128.82 98.88 45.36\n",
" 110.68 46.72 58.06 73.03 95.71 131.09 78.47 69.4 85.73 67.59\n",
" 103.87 120.2 88. 54.88 111.58 52.16 77.56 126.55 94.8 123.83\n",
" 89.36 75.75 69.85 112.49 82.1 106.14 57.61 70.76 148.78 96.16\n",
" 67.13 48.08 163.29 109.77 100.7 142.88 64.86 111.13 121.11 55.34\n",
" 101.6 93.44 117.48 120.66 66.68 44.91 132. 107.5 107.95 36.29\n",
" 103.42 87.09 83.01 56.25 96.62 134.26 97.07 34.93 99.34 72.12\n",
" 49.44 122.02 98.43 129.73 181.44 52.62 121.56 110.22 48.53 140.61\n",
" 156.49 116.57 87.54 44. 114.31 31.75 97.98 101.15 112.04 100.24\n",
" 113.85 154.22 118.39 133.81 149.69 41.73 119.75 138.35 151.95 129.27\n",
" 131.54 104.78 132.45 102.51 116.12 40.37 105.69 136.98 195.04 53.07\n",
" 132.9 124.28 112.94 114.76 45.81 119.29 167.83 51.26 172.37 162.39\n",
" 46.27 127.91 123.38 38.56 130.63 143.34 115.21 166.92 135.17 109.32\n",
" 135.62 204.12 127.46 118.84 139.25 126.1 122.92 151.5 133.36 42.64\n",
" 50.35 80. 190.51 37.19 147.87 35.38 144.24 149.23 37.65 86.\n",
" 147.42 281. 165.56 162.84 155.58 70. 137.89 189.6 206.38 148.32\n",
" 42.18 153.77 38.1 90. 176.9 191.87 249.48 67. 95. 82.\n",
" 170.1 62. 40.82 53. 139.71 130.18 100. 165.11 64. 43.54\n",
" 24. 134.72 141.52 125.19 75. 60. 34.02 164.65 30.84 250.\n",
" 58. 76. 73. 112. 74. 55. 200. 54. 66. 72.\n",
" 152.41 39.46 220. 41.28 168.28 188.24 59. 46. 265. 238.14\n",
" 168.74 145. 190. 93. 159.66 78. 50. 185.07 91. 104.\n",
" 165. 183.7 33.57 161.93 68. 125.65 134. 130. 32.21 143.79\n",
" 69. 179.17 63. 105. 210.92 65. 32. 292.57 280. 85.\n",
" 174.63 56. 128.37 87. 39.92 83. 169.64 156.04 177. 121.\n",
" 151.05 89. 146.96 146.06 98. 166.47 36.74 171.46 227.25 29.48\n",
" 190.06 161.03 35.83 226.8 175.09 138.8 240.4 158.3 170.55 61.\n",
" 137.44 145.6 141.07 155.13 52. 120. 57. 77. 27.22 25.4\n",
" 240. 96. 47. 115. 41. 45. 170. 150.59 272.16 26.31\n",
" 48. 39.01 236. 92. 197.31 156. 84. 94. 29.03 49.\n",
" 79. 157.85 192.78 255. 108. 185. 222.26 229.97 180. 81.\n",
" 24.95 71. 26. 107. 101. 208.65 140. 175. 111. 110.\n",
" 141.97 22.68 284.86 136.53 210. 103. 185.97 140.16 146.51 24.49\n",
" 25.85 150. 102. 229.52 23.59 125. 163. 38. 135. 176.45\n",
" 185.52 152.86 232.69 124. 192.32 186.88 118. 160.12 160. 193.68\n",
" 201.85 144.7 184.16 142.43 169. 166.01 32.66 180.53 196.41 51.\n",
" 40. 171.91 195.95 33.11 153.31 159.21 164.2 219.99 215.46 182.34\n",
" 30. 160.57 173.27 158. 213.19 276.24 199.58 175.99 235.87 217.72\n",
" 200.03 230.88 146. 24.04 178.72 150.14 157.4 163.75 191.42 174.18\n",
" 28.58 97. 256.28 205.48 161.48 178.26 179.62 205.02 254.01 154.68\n",
" 209.56 201.4 234.96 177.81 200.49 231.79 227.7 273.52 189.15 173.73\n",
" 183.25 167.38 211.83 223.62 228.61 30.39 197.77 184.61 250.38 181.89\n",
" 31.3 290.3 285. 113. 242.67 231.33 180.08 202.76 176. 188.69\n",
" 206.84 164. 156.94 114. 122. 222. 137. 166. 180.98 272.\n",
" 172.82 274.42 234.51 199.13 244.94 203.21 23.13 265.35 198.22 263.08\n",
" 216.82 154. 169.19 239.04 177.35 210.47 224.98 117. 37. 126.\n",
" 273.06 203.66 252.2 238.59 194.59 187.33 221.35 162. 224.53 23.\n",
" 223.17 187.79 212.73 152. 233.6 193.23 205. 229.06 230. 247.21\n",
" 99. 28.12 230.42 175.54 205.93 171. 26.76 212.28 217. 280.32\n",
" 281.68 248.57 195. 42. 258.55 215. 116. 28. 123. 186.43\n",
" 228.16 119. 219.09 214.55 278.96 182.8 138. 217.27 246.3 189. ]\n",
"BMI:\n",
"[ nan 26.57 25.61 ... 13.51 28.39 48.63]\n",
"AlcoholDrinkers:\n",
"['No' 'Yes' nan]\n",
"HIVTesting:\n",
"['No' 'Yes' nan]\n",
"FluVaxLast12:\n",
"['Yes' 'No' nan]\n",
"PneumoVaxEver:\n",
"['No' 'Yes' nan]\n",
"TetanusLast10Tdap:\n",
"['Yes, received tetanus shot but not sure what type'\n",
" 'No, did not receive any tetanus shot in the past 10 years' nan\n",
" 'Yes, received Tdap' 'Yes, received tetanus shot, but not Tdap']\n",
"HighRiskLastYear:\n",
"['No' nan 'Yes']\n",
"CovidPos:\n",
"['No' 'Yes' nan\n",
" 'Tested positive using home test without a health professional']\n"
]
}
],
"source": [
"for col in df:\n",
" print(f\"{col}:\")\n",
" print(df[col].unique())"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 70,
"outputs": [],
"source": [
"from collections import defaultdict\n",
"def normalize_dataset(dataset):\n",
" dataset[\"GeneralHealth\"] = dataset[\"GeneralHealth\"].map(defaultdict(lambda: float('NaN'), health_map), na_action='ignore')\n",
" dataset[\"Sex\"] = dataset[\"Sex\"].map({\"Female\":0,\"Male\":1}).astype(float) #Zamiana z kolumn tekstowych na numeryczne\n",
" dataset.rename(columns ={\"Sex\":\"Male\"},inplace=True)\n",
" dataset[\"State\"] = dataset[\"State\"].astype('category')\n",
" dataset[\"PhysicalHealthDays\"].astype(float)\n",
" dataset[\"MentalHealthDays\"].astype(float)\n",
" dataset[\"LastCheckupTime\"] = dataset[\"LastCheckupTime\"].fillna(\"Unknown\").astype('category') # Potem korzystam z fillna-->median ale nie działa to na kolumnach kategorycznych więc wykonuję to przed konwersją\n",
" dataset[\"PhysicalActivities\"]= dataset[\"PhysicalActivities\"].map({\"No\":0,\"Yes\":1})\n",
" dataset[\"SleepHours\"].astype(float)\n",
" dataset[\"RemovedTeeth\"] = dataset[\"RemovedTeeth\"].map(defaultdict(lambda: float('NaN'), {\"None of them\":0,\"1 to 5\":1, \"6 or more, but not all\":2, \"All\":3}), na_action='ignore')\n",
" dataset[\"HadHeartAttack\"]= dataset[\"HadHeartAttack\"].map({\"No\":0,\"Yes\":1})\n",
" dataset[\"HadAngina\"]= dataset[\"HadAngina\"].map({\"No\":0,\"Yes\":1})\n",
" dataset[\"HadStroke\"]= dataset[\"HadStroke\"].map({\"No\":0,\"Yes\":1})\n",
" dataset[\"HadAsthma\"]= dataset[\"HadAsthma\"].map({\"No\":0,\"Yes\":1})\n",
" dataset[\"HadSkinCancer\"]= dataset[\"HadSkinCancer\"].map({\"No\":0,\"Yes\":1})\n",
" dataset[\"HadCOPD\"]= dataset[\"HadCOPD\"].map({\"No\":0,\"Yes\":1})\n",
" dataset[\"HadDepressiveDisorder\"]= dataset[\"HadDepressiveDisorder\"].map({\"No\":0,\"Yes\":1})\n",
" dataset[\"HadKidneyDisease\"]= dataset[\"HadKidneyDisease\"].map({\"No\":0,\"Yes\":1})\n",
" dataset[\"HadArthritis\"]= dataset[\"HadArthritis\"].map({\"No\":0,\"Yes\":1})\n",
" dataset[\"HadDiabetes\"]= dataset[\"HadDiabetes\"].map({\"No\":0,\"Yes, but only during pregnancy (female)\":1,\"No, pre-diabetes or borderline diabetes\":2,\"Yes\":3})\n",
"\n",
" dataset[\"DeafOrHardOfHearing\"]= dataset[\"DeafOrHardOfHearing\"].map({\"No\":0,\"Yes\":1})\n",
" dataset[\"BlindOrVisionDifficulty\"]= dataset[\"BlindOrVisionDifficulty\"].map({\"No\":0,\"Yes\":1})\n",
" dataset[\"DifficultyConcentrating\"]= dataset[\"DifficultyConcentrating\"].map({\"No\":0,\"Yes\":1})\n",
" dataset[\"DifficultyWalking\"]= dataset[\"DifficultyWalking\"].map({\"No\":0,\"Yes\":1})\n",
" dataset[\"DifficultyDressingBathing\"]= dataset[\"DifficultyDressingBathing\"].map({\"No\":0,\"Yes\":1})\n",
" dataset[\"DifficultyErrands\"]= dataset[\"DifficultyErrands\"].map({\"No\":0,\"Yes\":1})\n",
" dataset[\"SmokerStatus\"]= dataset[\"SmokerStatus\"].map({\"Never smoked\":0,\"Current smoker - now smokes some days\":1,\"Former smoker\":2,\"Current smoker - now smokes every day\":3})\n",
" dataset[\"ECigaretteUsage\"]= dataset[\"ECigaretteUsage\"].map({\"Never used e-cigarettes in my entire life\":0,\"Not at all (right now)\":1,\"Use them some days\":2,\"Use them every day\":3})\n",
" dataset[\"ChestScan\"]= dataset[\"ChestScan\"].map({\"No\":0,\"Yes\":1})\n",
" dataset[\"RaceEthnicityCategory\"] = dataset[\"RaceEthnicityCategory\"].fillna(\"Unknown\").astype('category')\n",
" dataset[\"AgeCategory\"] = dataset[\"AgeCategory\"].fillna(\"Unknown\").astype('category')\n",
" dataset[\"HeightInMeters\"] = dataset[\"HeightInMeters\"].astype(float)\n",
" dataset[\"WeightInKilograms\"] = dataset[\"WeightInKilograms\"].astype(float)\n",
" dataset[\"BMI\"] = dataset[\"BMI\"].astype(float)\n",
" dataset[\"AlcoholDrinkers\"]= dataset[\"AlcoholDrinkers\"].map({\"No\":0,\"Yes\":1})\n",
" dataset[\"HIVTesting\"]= dataset[\"HIVTesting\"].map({\"No\":0,\"Yes\":1})\n",
" dataset[\"FluVaxLast12\"]= dataset[\"FluVaxLast12\"].map({\"No\":0,\"Yes\":1})\n",
" dataset[\"PneumoVaxEver\"]= dataset[\"PneumoVaxEver\"].map({\"No\":0,\"Yes\":1})\n",
" dataset[\"TetanusLast10Tdap\"]= dataset[\"TetanusLast10Tdap\"].apply(lambda x: float('NaN') if type(x)!=str else 1.0 if 'Yes,' in x else 1.0 if 'No,' in x else float('NaN'))\n",
" dataset[\"HighRiskLastYear\"]= dataset[\"HighRiskLastYear\"].map({\"No\":0,\"Yes\":1})\n",
" dataset[\"CovidPos\"]= dataset[\"CovidPos\"].map({\"No\":0,\"Yes\":1})"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "markdown",
"source": [
"### Zbiór test przed zmianą typu danych"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 71,
"outputs": [
{
"data": {
"text/plain": " State Sex GeneralHealth PhysicalHealthDays \\\n276058 New York Male Good 2.0 \n189605 Michigan Female Fair 20.0 \n59234 Delaware Female Very good 0.0 \n255322 New Mexico Male Good 0.0 \n226504 Montana Female Very good 6.0 \n\n MentalHealthDays LastCheckupTime \\\n276058 0.0 Within past 2 years (1 year but less than 2 ye... \n189605 15.0 Within past year (anytime less than 12 months ... \n59234 0.0 Within past year (anytime less than 12 months ... \n255322 0.0 5 or more years ago \n226504 0.0 Within past year (anytime less than 12 months ... \n\n PhysicalActivities SleepHours RemovedTeeth HadHeartAttack ... \\\n276058 NaN 7.0 None of them No ... \n189605 Yes 5.0 All No ... \n59234 Yes 6.0 None of them No ... \n255322 Yes 6.0 None of them No ... \n226504 Yes 8.0 None of them No ... \n\n HeightInMeters WeightInKilograms BMI AlcoholDrinkers HIVTesting \\\n276058 1.55 NaN NaN No No \n189605 1.68 70.31 25.02 No NaN \n59234 1.50 64.41 28.68 No No \n255322 NaN NaN NaN NaN NaN \n226504 1.73 90.72 30.41 Yes No \n\n FluVaxLast12 PneumoVaxEver \\\n276058 No NaN \n189605 Yes Yes \n59234 Yes NaN \n255322 NaN NaN \n226504 No Yes \n\n TetanusLast10Tdap HighRiskLastYear \\\n276058 No, did not receive any tetanus shot in the pa... No \n189605 NaN No \n59234 No, did not receive any tetanus shot in the pa... No \n255322 NaN NaN \n226504 NaN No \n\n CovidPos \n276058 No \n189605 No \n59234 No \n255322 NaN \n226504 Yes \n\n[5 rows x 40 columns]",
"text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>State</th>\n <th>Sex</th>\n <th>GeneralHealth</th>\n <th>PhysicalHealthDays</th>\n <th>MentalHealthDays</th>\n <th>LastCheckupTime</th>\n <th>PhysicalActivities</th>\n <th>SleepHours</th>\n <th>RemovedTeeth</th>\n <th>HadHeartAttack</th>\n <th>...</th>\n <th>HeightInMeters</th>\n <th>WeightInKilograms</th>\n <th>BMI</th>\n <th>AlcoholDrinkers</th>\n <th>HIVTesting</th>\n <th>FluVaxLast12</th>\n <th>PneumoVaxEver</th>\n <th>TetanusLast10Tdap</th>\n <th>HighRiskLastYear</th>\n <th>CovidPos</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>276058</th>\n <td>New York</td>\n <td>Male</td>\n <td>Good</td>\n <td>2.0</td>\n <td>0.0</td>\n <td>Within past 2 years (1 year but less than 2 ye...</td>\n <td>NaN</td>\n <td>7.0</td>\n <td>None of them</td>\n <td>No</td>\n <td>...</td>\n <td>1.55</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>No</td>\n <td>No</td>\n <td>No</td>\n <td>NaN</td>\n <td>No, did not receive any tetanus shot in the pa...</td>\n <td>No</td>\n <td>No</td>\n </tr>\n <tr>\n <th>189605</th>\n <td>Michigan</td>\n <td>Female</td>\n <td>Fair</td>\n <td>20.0</td>\n <td>15.0</td>\n <td>Within past year (anytime less than 12 months ...</td>\n <td>Yes</td>\n <td>5.0</td>\n <td>All</td>\n <td>No</td>\n <td>...</td>\n <td>1.68</td>\n <td>70.31</td>\n <td>25.02</td>\n <td>No</td>\n <td>NaN</td>\n <td>Yes</td>\n <td>Yes</td>\n <td>NaN</td>\n <td>No</td>\n <td>No</td>\n </tr>\n <tr>\n <th>59234</th>\n <td>Delaware</td>\n <td>Female</td>\n <td>Very good</td>\n <td>0.0</td>\n <td>0.0</td>\n <td>Within past year (anytime less than 12 months ...</td>\n <td>Yes</td>\n <td>6.0</td>\n <td>None of them</td>\n <td>No</td>\n <td>...</td>\n <td>1.50</td>\n <td>64.41</td>\n <td>28.68</td>\n <td>No</td>\n <td>No</td>\n <td>Yes</td>\n <td>NaN</td>\n <td>No, did not receive any tetanus shot in the pa...</td>\n <td>No</td>\n <td>No</td>\n </tr>\n <tr>\n <th>255322</th>\n <td>New Mexico</td>\n <td>Male</td>\n <td>Good</td>\n <td>0.0</td>\n <td>0.0</td>\n <td>5 or more years ago</td>\n <td>Yes</td>\n <td>6.0</td>\n <td>None of them</td>\n <td>No</td>\n <td>...</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>NaN</td>\n </tr>\n <tr>\n <th>226504</th>\n <td>Montana</td>\n <td>Female</td>\n <td>Very good</td>\n <td>6.0</td>\n <td>0.0</td>\n <td>Within past year (anytime less than 12 months ...</td>\n <td>Yes</td>\n <td>8.0</td>\n <td>None of them</td>\n <td>No</td>\n <td>...</td>\n <td>1.73</td>\n <td>90.72</td>\n <td>30.41</td>\n <td>Yes</td>\n <td>No</td>\n <td>No</td>\n <td>Yes</td>\n <td>NaN</td>\n <td>No</td>\n <td>Yes</td>\n </tr>\n </tbody>\n</table>\n<p>5 rows × 40 columns</p>\n</div>"
},
"execution_count": 71,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"test.head()"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "markdown",
"source": [
"## Zbiór test po zmianie typu danych"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 72,
"outputs": [
{
"data": {
"text/plain": " State Male GeneralHealth PhysicalHealthDays MentalHealthDays \\\n276058 New York 1.0 3.0 2.0 0.0 \n189605 Michigan 0.0 2.0 20.0 15.0 \n59234 Delaware 0.0 4.0 0.0 0.0 \n255322 New Mexico 1.0 3.0 0.0 0.0 \n226504 Montana 0.0 4.0 6.0 0.0 \n\n LastCheckupTime PhysicalActivities \\\n276058 Within past 2 years (1 year but less than 2 ye... NaN \n189605 Within past year (anytime less than 12 months ... 1.0 \n59234 Within past year (anytime less than 12 months ... 1.0 \n255322 5 or more years ago 1.0 \n226504 Within past year (anytime less than 12 months ... 1.0 \n\n SleepHours RemovedTeeth HadHeartAttack ... HeightInMeters \\\n276058 7.0 0.0 0.0 ... 1.55 \n189605 5.0 3.0 0.0 ... 1.68 \n59234 6.0 0.0 0.0 ... 1.50 \n255322 6.0 0.0 0.0 ... NaN \n226504 8.0 0.0 0.0 ... 1.73 \n\n WeightInKilograms BMI AlcoholDrinkers HIVTesting FluVaxLast12 \\\n276058 NaN NaN 0.0 0.0 0.0 \n189605 70.31 25.02 0.0 NaN 1.0 \n59234 64.41 28.68 0.0 0.0 1.0 \n255322 NaN NaN NaN NaN NaN \n226504 90.72 30.41 1.0 0.0 0.0 \n\n PneumoVaxEver TetanusLast10Tdap HighRiskLastYear CovidPos \n276058 NaN 1.0 0.0 0.0 \n189605 1.0 NaN 0.0 0.0 \n59234 NaN 1.0 0.0 0.0 \n255322 NaN NaN NaN NaN \n226504 1.0 NaN 0.0 1.0 \n\n[5 rows x 40 columns]",
"text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>State</th>\n <th>Male</th>\n <th>GeneralHealth</th>\n <th>PhysicalHealthDays</th>\n <th>MentalHealthDays</th>\n <th>LastCheckupTime</th>\n <th>PhysicalActivities</th>\n <th>SleepHours</th>\n <th>RemovedTeeth</th>\n <th>HadHeartAttack</th>\n <th>...</th>\n <th>HeightInMeters</th>\n <th>WeightInKilograms</th>\n <th>BMI</th>\n <th>AlcoholDrinkers</th>\n <th>HIVTesting</th>\n <th>FluVaxLast12</th>\n <th>PneumoVaxEver</th>\n <th>TetanusLast10Tdap</th>\n <th>HighRiskLastYear</th>\n <th>CovidPos</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>276058</th>\n <td>New York</td>\n <td>1.0</td>\n <td>3.0</td>\n <td>2.0</td>\n <td>0.0</td>\n <td>Within past 2 years (1 year but less than 2 ye...</td>\n <td>NaN</td>\n <td>7.0</td>\n <td>0.0</td>\n <td>0.0</td>\n <td>...</td>\n <td>1.55</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>0.0</td>\n <td>0.0</td>\n <td>0.0</td>\n <td>NaN</td>\n <td>1.0</td>\n <td>0.0</td>\n <td>0.0</td>\n </tr>\n <tr>\n <th>189605</th>\n <td>Michigan</td>\n <td>0.0</td>\n <td>2.0</td>\n <td>20.0</td>\n <td>15.0</td>\n <td>Within past year (anytime less than 12 months ...</td>\n <td>1.0</td>\n <td>5.0</td>\n <td>3.0</td>\n <td>0.0</td>\n <td>...</td>\n <td>1.68</td>\n <td>70.31</td>\n <td>25.02</td>\n <td>0.0</td>\n <td>NaN</td>\n <td>1.0</td>\n <td>1.0</td>\n <td>NaN</td>\n <td>0.0</td>\n <td>0.0</td>\n </tr>\n <tr>\n <th>59234</th>\n <td>Delaware</td>\n <td>0.0</td>\n <td>4.0</td>\n <td>0.0</td>\n <td>0.0</td>\n <td>Within past year (anytime less than 12 months ...</td>\n <td>1.0</td>\n <td>6.0</td>\n <td>0.0</td>\n <td>0.0</td>\n <td>...</td>\n <td>1.50</td>\n <td>64.41</td>\n <td>28.68</td>\n <td>0.0</td>\n <td>0.0</td>\n <td>1.0</td>\n <td>NaN</td>\n <td>1.0</td>\n <td>0.0</td>\n <td>0.0</td>\n </tr>\n <tr>\n <th>255322</th>\n <td>New Mexico</td>\n <td>1.0</td>\n <td>3.0</td>\n <td>0.0</td>\n <td>0.0</td>\n <td>5 or more years ago</td>\n <td>1.0</td>\n <td>6.0</td>\n <td>0.0</td>\n <td>0.0</td>\n <td>...</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>NaN</td>\n </tr>\n <tr>\n <th>226504</th>\n <td>Montana</td>\n <td>0.0</td>\n <td>4.0</td>\n <td>6.0</td>\n <td>0.0</td>\n <td>Within past year (anytime less than 12 months ...</td>\n <td>1.0</td>\n <td>8.0</td>\n <td>0.0</td>\n <td>0.0</td>\n <td>...</td>\n <td>1.73</td>\n <td>90.72</td>\n <td>30.41</td>\n <td>1.0</td>\n <td>0.0</td>\n <td>0.0</td>\n <td>1.0</td>\n <td>NaN</td>\n <td>0.0</td>\n <td>1.0</td>\n </tr>\n </tbody>\n</table>\n<p>5 rows × 40 columns</p>\n</div>"
},
"execution_count": 72,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"normalize_dataset(test)\n",
"test.head()"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 73,
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"<class 'pandas.core.frame.DataFrame'>\n",
"Index: 44513 entries, 276058 to 196692\n",
"Data columns (total 40 columns):\n",
" # Column Non-Null Count Dtype \n",
"--- ------ -------------- ----- \n",
" 0 State 44513 non-null category\n",
" 1 Male 44513 non-null float64 \n",
" 2 GeneralHealth 44380 non-null float64 \n",
" 3 PhysicalHealthDays 43374 non-null float64 \n",
" 4 MentalHealthDays 43620 non-null float64 \n",
" 5 LastCheckupTime 44513 non-null category\n",
" 6 PhysicalActivities 44383 non-null float64 \n",
" 7 SleepHours 43982 non-null float64 \n",
" 8 RemovedTeeth 43364 non-null float64 \n",
" 9 HadHeartAttack 44220 non-null float64 \n",
" 10 HadAngina 44117 non-null float64 \n",
" 11 HadStroke 44352 non-null float64 \n",
" 12 HadAsthma 44348 non-null float64 \n",
" 13 HadSkinCancer 44192 non-null float64 \n",
" 14 HadCOPD 44283 non-null float64 \n",
" 15 HadDepressiveDisorder 44197 non-null float64 \n",
" 16 HadKidneyDisease 44342 non-null float64 \n",
" 17 HadArthritis 44231 non-null float64 \n",
" 18 HadDiabetes 44377 non-null float64 \n",
" 19 DeafOrHardOfHearing 42456 non-null float64 \n",
" 20 BlindOrVisionDifficulty 42338 non-null float64 \n",
" 21 DifficultyConcentrating 42066 non-null float64 \n",
" 22 DifficultyWalking 42090 non-null float64 \n",
" 23 DifficultyDressingBathing 42111 non-null float64 \n",
" 24 DifficultyErrands 41923 non-null float64 \n",
" 25 SmokerStatus 40967 non-null float64 \n",
" 26 ECigaretteUsage 40964 non-null float64 \n",
" 27 ChestScan 38930 non-null float64 \n",
" 28 RaceEthnicityCategory 44513 non-null category\n",
" 29 AgeCategory 44513 non-null category\n",
" 30 HeightInMeters 41634 non-null float64 \n",
" 31 WeightInKilograms 40303 non-null float64 \n",
" 32 BMI 39648 non-null float64 \n",
" 33 AlcoholDrinkers 39882 non-null float64 \n",
" 34 HIVTesting 37870 non-null float64 \n",
" 35 FluVaxLast12 39814 non-null float64 \n",
" 36 PneumoVaxEver 36760 non-null float64 \n",
" 37 TetanusLast10Tdap 36287 non-null float64 \n",
" 38 HighRiskLastYear 39445 non-null float64 \n",
" 39 CovidPos 38063 non-null float64 \n",
"dtypes: category(4), float64(36)\n",
"memory usage: 12.7 MB\n"
]
}
],
"source": [
"test.info()"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 74,
"outputs": [],
"source": [
"normalize_dataset(train)\n",
"normalize_dataset(valid)"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "markdown",
"source": [
"## Statystyki dla zbiorów po zamianie na kolumny numeryczne\n",
"\n",
"*50. centyl to mediana*"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 75,
"outputs": [
{
"data": {
"text/plain": " Male GeneralHealth PhysicalHealthDays MentalHealthDays \\\ncount 676617.000000 674189.000000 655653.000000 660103.000000 \nmean 0.539397 3.056503 6.720248 4.819231 \nstd 0.498446 1.138185 10.708463 9.058480 \nmin 0.000000 1.000000 0.000000 0.000000 \n25% 0.000000 2.000000 0.000000 0.000000 \n50% 1.000000 3.000000 0.000000 0.000000 \n75% 1.000000 4.000000 10.000000 5.000000 \nmax 1.000000 5.000000 30.000000 30.000000 \n\n PhysicalActivities SleepHours RemovedTeeth HadHeartAttack \\\ncount 674547.000000 665806.000000 654146.000000 674184.000000 \nmean 0.689765 7.039463 0.978094 0.505120 \nstd 0.462590 1.726591 1.017700 0.499974 \nmin 0.000000 1.000000 0.000000 0.000000 \n25% 0.000000 6.000000 0.000000 0.000000 \n50% 1.000000 7.000000 1.000000 1.000000 \n75% 1.000000 8.000000 2.000000 1.000000 \nmax 1.000000 24.000000 3.000000 1.000000 \n\n HadAngina HadStroke ... HeightInMeters WeightInKilograms \\\ncount 657382.000000 672884.000000 ... 637479.000000 620141.000000 \nmean 0.264342 0.116472 ... 1.707316 84.660193 \nstd 0.440983 0.320790 ... 0.108041 21.748490 \nmin 0.000000 0.000000 ... 0.910000 22.680000 \n25% 0.000000 0.000000 ... 1.630000 69.400000 \n50% 0.000000 0.000000 ... 1.700000 81.650000 \n75% 1.000000 0.000000 ... 1.780000 96.160000 \nmax 1.000000 1.000000 ... 2.410000 292.570000 \n\n BMI AlcoholDrinkers HIVTesting FluVaxLast12 \\\ncount 611530.000000 607591.000000 573999.000000 606624.000000 \nmean 28.918429 0.455838 0.326018 0.571211 \nstd 6.631906 0.498046 0.468754 0.494903 \nmin 12.020000 0.000000 0.000000 0.000000 \n25% 24.410000 0.000000 0.000000 0.000000 \n50% 27.890000 0.000000 0.000000 1.000000 \n75% 32.220000 1.000000 1.000000 1.000000 \nmax 99.640000 1.000000 1.000000 1.000000 \n\n PneumoVaxEver TetanusLast10Tdap HighRiskLastYear CovidPos \ncount 571259.000000 554407.0 601115.000000 585931.000000 \nmean 0.527326 1.0 0.034534 0.273136 \nstd 0.499253 0.0 0.182597 0.445571 \nmin 0.000000 1.0 0.000000 0.000000 \n25% 0.000000 1.0 0.000000 0.000000 \n50% 1.000000 1.0 0.000000 0.000000 \n75% 1.000000 1.0 0.000000 1.000000 \nmax 1.000000 1.0 1.000000 1.000000 \n\n[8 rows x 36 columns]",
"text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>Male</th>\n <th>GeneralHealth</th>\n <th>PhysicalHealthDays</th>\n <th>MentalHealthDays</th>\n <th>PhysicalActivities</th>\n <th>SleepHours</th>\n <th>RemovedTeeth</th>\n <th>HadHeartAttack</th>\n <th>HadAngina</th>\n <th>HadStroke</th>\n <th>...</th>\n <th>HeightInMeters</th>\n <th>WeightInKilograms</th>\n <th>BMI</th>\n <th>AlcoholDrinkers</th>\n <th>HIVTesting</th>\n <th>FluVaxLast12</th>\n <th>PneumoVaxEver</th>\n <th>TetanusLast10Tdap</th>\n <th>HighRiskLastYear</th>\n <th>CovidPos</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>count</th>\n <td>676617.000000</td>\n <td>674189.000000</td>\n <td>655653.000000</td>\n <td>660103.000000</td>\n <td>674547.000000</td>\n <td>665806.000000</td>\n <td>654146.000000</td>\n <td>674184.000000</td>\n <td>657382.000000</td>\n <td>672884.000000</td>\n <td>...</td>\n <td>637479.000000</td>\n <td>620141.000000</td>\n <td>611530.000000</td>\n <td>607591.000000</td>\n <td>573999.000000</td>\n <td>606624.000000</td>\n <td>571259.000000</td>\n <td>554407.0</td>\n <td>601115.000000</td>\n <td>585931.000000</td>\n </tr>\n <tr>\n <th>mean</th>\n <td>0.539397</td>\n <td>3.056503</td>\n <td>6.720248</td>\n <td>4.819231</td>\n <td>0.689765</td>\n <td>7.039463</td>\n <td>0.978094</td>\n <td>0.505120</td>\n <td>0.264342</td>\n <td>0.116472</td>\n <td>...</td>\n <td>1.707316</td>\n <td>84.660193</td>\n <td>28.918429</td>\n <td>0.455838</td>\n <td>0.326018</td>\n <td>0.571211</td>\n <td>0.527326</td>\n <td>1.0</td>\n <td>0.034534</td>\n <td>0.273136</td>\n </tr>\n <tr>\n <th>std</th>\n <td>0.498446</td>\n <td>1.138185</td>\n <td>10.708463</td>\n <td>9.058480</td>\n <td>0.462590</td>\n <td>1.726591</td>\n <td>1.017700</td>\n <td>0.499974</td>\n <td>0.440983</td>\n <td>0.320790</td>\n <td>...</td>\n <td>0.108041</td>\n <td>21.748490</td>\n <td>6.631906</td>\n <td>0.498046</td>\n <td>0.468754</td>\n <td>0.494903</td>\n <td>0.499253</td>\n <td>0.0</td>\n <td>0.182597</td>\n <td>0.445571</td>\n </tr>\n <tr>\n <th>min</th>\n <td>0.000000</td>\n <td>1.000000</td>\n <td>0.000000</td>\n <td>0.000000</td>\n <td>0.000000</td>\n <td>1.000000</td>\n <td>0.000000</td>\n <td>0.000000</td>\n <td>0.000000</td>\n <td>0.000000</td>\n <td>...</td>\n <td>0.910000</td>\n <td>22.680000</td>\n <td>12.020000</td>\n <td>0.000000</td>\n <td>0.000000</td>\n <td>0.000000</td>\n <td>0.000000</td>\n <td>1.0</td>\n <td>0.000000</td>\n <td>0.000000</td>\n </tr>\n <tr>\n <th>25%</th>\n <td>0.000000</td>\n <td>2.000000</td>\n <td>0.000000</td>\n <td>0.000000</td>\n <td>0.000000</td>\n <td>6.000000</td>\n <td>0.000000</td>\n <td>0.000000</td>\n <td>0.000000</td>\n <td>0.000000</td>\n <td>...</td>\n <td>1.630000</td>\n <td>69.400000</td>\n <td>24.410000</td>\n <td>0.000000</td>\n <td>0.000000</td>\n <td>0.000000</td>\n <td>0.000000</td>\n <td>1.0</td>\n <td>0.000000</td>\n <td>0.000000</td>\n </tr>\n <tr>\n <th>50%</th>\n <td>1.000000</td>\n <td>3.000000</td>\n <td>0.000000</td>\n <td>0.000000</td>\n <td>1.000000</td>\n <td>7.000000</td>\n <td>1.000000</td>\n <td>1.000000</td>\n <td>0.000000</td>\n <td>0.000000</td>\n <td>...</td>\n <td>1.700000</td>\n <td>81.650000</td>\n <td>27.890000</td>\n <td>0.000000</td>\n <td>0.000000</td>\n <td>1.000000</td>\n <td>1.000000</td>\n <td>1.0</td>\n <td>0.000000</td>\n <td>0.000000</td>\n </tr>\n <tr>\n <th>75%</th>\n <td>1.000000</td>\n <td>4.000000</td>\n <td>10.000000</td>\n <td>5.000000</td>\n <td>1.000000</td>\n <td>8.000000</td>\n <td>2.000000</td>\n <td>1.000000</td>\n <td>1.000000</td>\n <td>0.000000</td>\n <td>...</td>\n <td>1.780000</td>\n <td>96.160000</td>\n <td>32.220000</td>\n <td>1.000000</td>\n <td>1.000000</td>\n <td>1.000000</td>\n <td>1.000000</td>\n <td>1.0</td>\n <td>0.000000</td>\n <td>1.000000</td>\n </tr>\n <tr>\n <th>max</th>\n <td>1.000000</td>\n <td>5.000000</td>\n <td>30.000000</td>\n <td>30.000000</td>\n <td>1.000000</td>\n <td>24.000000</td>\n <td>3.000000</td>\n <td>1.000000</td>\n <td>1.000000</td>\n <td>1.000000</td>\n <td>...</td>\n <td>2.410000</td>\n <td>292.570000</td>\n <td>99.640000</td>\n <td>1.000000</td>\n <td>1.000000</td>\n <td>1.000000</td>\n <td>1.000000</td>\n <td>1.0</td>\n <td>1.000000</td>\n <td>1.000000</td>\n </tr>\n </tbody>\n</table>\n<p>8 rows × 36 columns</p>\n</div>"
},
"execution_count": 75,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"train.describe()"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 76,
"outputs": [
{
"data": {
"text/plain": " Male GeneralHealth PhysicalHealthDays MentalHealthDays \\\ncount 44513.000000 44380.000000 43374.000000 43620.000000 \nmean 0.467347 3.433551 4.304353 4.470839 \nstd 0.498938 1.049691 8.629763 8.472884 \nmin 0.000000 1.000000 0.000000 0.000000 \n25% 0.000000 3.000000 0.000000 0.000000 \n50% 0.000000 3.000000 0.000000 0.000000 \n75% 1.000000 4.000000 3.000000 5.000000 \nmax 1.000000 5.000000 30.000000 30.000000 \n\n PhysicalActivities SleepHours RemovedTeeth HadHeartAttack \\\ncount 44383.000000 43982.000000 43364.000000 44220.000000 \nmean 0.759119 7.012414 0.687644 0.058684 \nstd 0.427623 1.493726 0.883372 0.235035 \nmin 0.000000 1.000000 0.000000 0.000000 \n25% 1.000000 6.000000 0.000000 0.000000 \n50% 1.000000 7.000000 0.000000 0.000000 \n75% 1.000000 8.000000 1.000000 0.000000 \nmax 1.000000 24.000000 3.000000 1.000000 \n\n HadAngina HadStroke ... HeightInMeters WeightInKilograms \\\ncount 44117.000000 44352.000000 ... 41634.000000 40303.000000 \nmean 0.060816 0.043155 ... 1.701734 82.990520 \nstd 0.238994 0.203208 ... 0.106604 21.462338 \nmin 0.000000 0.000000 ... 0.910000 22.680000 \n25% 0.000000 0.000000 ... 1.630000 68.040000 \n50% 0.000000 0.000000 ... 1.700000 80.740000 \n75% 0.000000 0.000000 ... 1.780000 95.250000 \nmax 1.000000 1.000000 ... 2.260000 276.240000 \n\n BMI AlcoholDrinkers HIVTesting FluVaxLast12 \\\ncount 39648.000000 39882.000000 37870.000000 39814.000000 \nmean 28.545288 0.532621 0.342382 0.526348 \nstd 6.574508 0.498941 0.474513 0.499312 \nmin 12.690000 0.000000 0.000000 0.000000 \n25% 24.130000 0.000000 0.000000 0.000000 \n50% 27.440000 1.000000 0.000000 1.000000 \n75% 31.750000 1.000000 1.000000 1.000000 \nmax 97.650000 1.000000 1.000000 1.000000 \n\n PneumoVaxEver TetanusLast10Tdap HighRiskLastYear CovidPos \ncount 36760.00000 36287.0 39445.000000 38063.000000 \nmean 0.41420 1.0 0.043174 0.293461 \nstd 0.49259 0.0 0.203251 0.455354 \nmin 0.00000 1.0 0.000000 0.000000 \n25% 0.00000 1.0 0.000000 0.000000 \n50% 0.00000 1.0 0.000000 0.000000 \n75% 1.00000 1.0 0.000000 1.000000 \nmax 1.00000 1.0 1.000000 1.000000 \n\n[8 rows x 36 columns]",
"text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>Male</th>\n <th>GeneralHealth</th>\n <th>PhysicalHealthDays</th>\n <th>MentalHealthDays</th>\n <th>PhysicalActivities</th>\n <th>SleepHours</th>\n <th>RemovedTeeth</th>\n <th>HadHeartAttack</th>\n <th>HadAngina</th>\n <th>HadStroke</th>\n <th>...</th>\n <th>HeightInMeters</th>\n <th>WeightInKilograms</th>\n <th>BMI</th>\n <th>AlcoholDrinkers</th>\n <th>HIVTesting</th>\n <th>FluVaxLast12</th>\n <th>PneumoVaxEver</th>\n <th>TetanusLast10Tdap</th>\n <th>HighRiskLastYear</th>\n <th>CovidPos</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>count</th>\n <td>44513.000000</td>\n <td>44380.000000</td>\n <td>43374.000000</td>\n <td>43620.000000</td>\n <td>44383.000000</td>\n <td>43982.000000</td>\n <td>43364.000000</td>\n <td>44220.000000</td>\n <td>44117.000000</td>\n <td>44352.000000</td>\n <td>...</td>\n <td>41634.000000</td>\n <td>40303.000000</td>\n <td>39648.000000</td>\n <td>39882.000000</td>\n <td>37870.000000</td>\n <td>39814.000000</td>\n <td>36760.00000</td>\n <td>36287.0</td>\n <td>39445.000000</td>\n <td>38063.000000</td>\n </tr>\n <tr>\n <th>mean</th>\n <td>0.467347</td>\n <td>3.433551</td>\n <td>4.304353</td>\n <td>4.470839</td>\n <td>0.759119</td>\n <td>7.012414</td>\n <td>0.687644</td>\n <td>0.058684</td>\n <td>0.060816</td>\n <td>0.043155</td>\n <td>...</td>\n <td>1.701734</td>\n <td>82.990520</td>\n <td>28.545288</td>\n <td>0.532621</td>\n <td>0.342382</td>\n <td>0.526348</td>\n <td>0.41420</td>\n <td>1.0</td>\n <td>0.043174</td>\n <td>0.293461</td>\n </tr>\n <tr>\n <th>std</th>\n <td>0.498938</td>\n <td>1.049691</td>\n <td>8.629763</td>\n <td>8.472884</td>\n <td>0.427623</td>\n <td>1.493726</td>\n <td>0.883372</td>\n <td>0.235035</td>\n <td>0.238994</td>\n <td>0.203208</td>\n <td>...</td>\n <td>0.106604</td>\n <td>21.462338</td>\n <td>6.574508</td>\n <td>0.498941</td>\n <td>0.474513</td>\n <td>0.499312</td>\n <td>0.49259</td>\n <td>0.0</td>\n <td>0.203251</td>\n <td>0.455354</td>\n </tr>\n <tr>\n <th>min</th>\n <td>0.000000</td>\n <td>1.000000</td>\n <td>0.000000</td>\n <td>0.000000</td>\n <td>0.000000</td>\n <td>1.000000</td>\n <td>0.000000</td>\n <td>0.000000</td>\n <td>0.000000</td>\n <td>0.000000</td>\n <td>...</td>\n <td>0.910000</td>\n <td>22.680000</td>\n <td>12.690000</td>\n <td>0.000000</td>\n <td>0.000000</td>\n <td>0.000000</td>\n <td>0.00000</td>\n <td>1.0</td>\n <td>0.000000</td>\n <td>0.000000</td>\n </tr>\n <tr>\n <th>25%</th>\n <td>0.000000</td>\n <td>3.000000</td>\n <td>0.000000</td>\n <td>0.000000</td>\n <td>1.000000</td>\n <td>6.000000</td>\n <td>0.000000</td>\n <td>0.000000</td>\n <td>0.000000</td>\n <td>0.000000</td>\n <td>...</td>\n <td>1.630000</td>\n <td>68.040000</td>\n <td>24.130000</td>\n <td>0.000000</td>\n <td>0.000000</td>\n <td>0.000000</td>\n <td>0.00000</td>\n <td>1.0</td>\n <td>0.000000</td>\n <td>0.000000</td>\n </tr>\n <tr>\n <th>50%</th>\n <td>0.000000</td>\n <td>3.000000</td>\n <td>0.000000</td>\n <td>0.000000</td>\n <td>1.000000</td>\n <td>7.000000</td>\n <td>0.000000</td>\n <td>0.000000</td>\n <td>0.000000</td>\n <td>0.000000</td>\n <td>...</td>\n <td>1.700000</td>\n <td>80.740000</td>\n <td>27.440000</td>\n <td>1.000000</td>\n <td>0.000000</td>\n <td>1.000000</td>\n <td>0.00000</td>\n <td>1.0</td>\n <td>0.000000</td>\n <td>0.000000</td>\n </tr>\n <tr>\n <th>75%</th>\n <td>1.000000</td>\n <td>4.000000</td>\n <td>3.000000</td>\n <td>5.000000</td>\n <td>1.000000</td>\n <td>8.000000</td>\n <td>1.000000</td>\n <td>0.000000</td>\n <td>0.000000</td>\n <td>0.000000</td>\n <td>...</td>\n <td>1.780000</td>\n <td>95.250000</td>\n <td>31.750000</td>\n <td>1.000000</td>\n <td>1.000000</td>\n <td>1.000000</td>\n <td>1.00000</td>\n <td>1.0</td>\n <td>0.000000</td>\n <td>1.000000</td>\n </tr>\n <tr>\n <th>max</th>\n <td>1.000000</td>\n <td>5.000000</td>\n <td>30.000000</td>\n <td>30.000000</td>\n <td>1.000000</td>\n <td>24.000000</td>\n <td>3.000000</td>\n <td>1.000000</td>\n <td>1.000000</td>\n <td>1.000000</td>\n <td>...</td>\n <td>2.260000</td>\n <td>276.240000</td>\n <td>97.650000</td>\n <td>1.000000</td>\n <td>1.000000</td>\n <td>1.000000</td>\n <td>1.00000</td>\n <td>1.0</td>\n <td>1.000000</td>\n <td>1.000000</td>\n </tr>\n </tbody>\n</table>\n<p>8 rows × 36 columns</p>\n</div>"
},
"execution_count": 76,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"test.describe()"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 77,
"outputs": [
{
"data": {
"text/plain": " Male GeneralHealth PhysicalHealthDays MentalHealthDays \\\ncount 44514.000000 44405.000000 43450.000000 43622.000000 \nmean 0.466887 3.427835 4.354799 4.398171 \nstd 0.498908 1.056506 8.691768 8.406697 \nmin 0.000000 1.000000 0.000000 0.000000 \n25% 0.000000 3.000000 0.000000 0.000000 \n50% 0.000000 3.000000 0.000000 0.000000 \n75% 1.000000 4.000000 3.000000 5.000000 \nmax 1.000000 5.000000 30.000000 30.000000 \n\n PhysicalActivities SleepHours RemovedTeeth HadHeartAttack \\\ncount 44421.000000 43955.000000 43350.000000 44175.000000 \nmean 0.760271 7.031760 0.684060 0.056163 \nstd 0.426923 1.513703 0.881616 0.230239 \nmin 0.000000 1.000000 0.000000 0.000000 \n25% 1.000000 6.000000 0.000000 0.000000 \n50% 1.000000 7.000000 0.000000 0.000000 \n75% 1.000000 8.000000 1.000000 0.000000 \nmax 1.000000 24.000000 3.000000 1.000000 \n\n HadAngina HadStroke ... HeightInMeters WeightInKilograms \\\ncount 44060.000000 44339.000000 ... 41591.000000 40226.000000 \nmean 0.060236 0.043506 ... 1.702198 83.013436 \nstd 0.237926 0.203995 ... 0.107066 21.464497 \nmin 0.000000 0.000000 ... 0.910000 22.680000 \n25% 0.000000 0.000000 ... 1.630000 68.040000 \n50% 0.000000 0.000000 ... 1.700000 80.740000 \n75% 0.000000 0.000000 ... 1.780000 95.250000 \nmax 1.000000 1.000000 ... 2.360000 284.860000 \n\n BMI AlcoholDrinkers HIVTesting FluVaxLast12 \\\ncount 39516.000000 39789.000000 37856.000000 39749.000000 \nmean 28.522226 0.529945 0.340501 0.522831 \nstd 6.564679 0.499109 0.473884 0.499485 \nmin 12.190000 0.000000 0.000000 0.000000 \n25% 24.130000 0.000000 0.000000 0.000000 \n50% 27.440000 1.000000 0.000000 1.000000 \n75% 31.750000 1.000000 1.000000 1.000000 \nmax 96.200000 1.000000 1.000000 1.000000 \n\n PneumoVaxEver TetanusLast10Tdap HighRiskLastYear CovidPos \ncount 36681.000000 36210.0 39453.000000 38058.000000 \nmean 0.414983 1.0 0.045903 0.290609 \nstd 0.492726 0.0 0.209277 0.454049 \nmin 0.000000 1.0 0.000000 0.000000 \n25% 0.000000 1.0 0.000000 0.000000 \n50% 0.000000 1.0 0.000000 0.000000 \n75% 1.000000 1.0 0.000000 1.000000 \nmax 1.000000 1.0 1.000000 1.000000 \n\n[8 rows x 36 columns]",
"text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>Male</th>\n <th>GeneralHealth</th>\n <th>PhysicalHealthDays</th>\n <th>MentalHealthDays</th>\n <th>PhysicalActivities</th>\n <th>SleepHours</th>\n <th>RemovedTeeth</th>\n <th>HadHeartAttack</th>\n <th>HadAngina</th>\n <th>HadStroke</th>\n <th>...</th>\n <th>HeightInMeters</th>\n <th>WeightInKilograms</th>\n <th>BMI</th>\n <th>AlcoholDrinkers</th>\n <th>HIVTesting</th>\n <th>FluVaxLast12</th>\n <th>PneumoVaxEver</th>\n <th>TetanusLast10Tdap</th>\n <th>HighRiskLastYear</th>\n <th>CovidPos</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>count</th>\n <td>44514.000000</td>\n <td>44405.000000</td>\n <td>43450.000000</td>\n <td>43622.000000</td>\n <td>44421.000000</td>\n <td>43955.000000</td>\n <td>43350.000000</td>\n <td>44175.000000</td>\n <td>44060.000000</td>\n <td>44339.000000</td>\n <td>...</td>\n <td>41591.000000</td>\n <td>40226.000000</td>\n <td>39516.000000</td>\n <td>39789.000000</td>\n <td>37856.000000</td>\n <td>39749.000000</td>\n <td>36681.000000</td>\n <td>36210.0</td>\n <td>39453.000000</td>\n <td>38058.000000</td>\n </tr>\n <tr>\n <th>mean</th>\n <td>0.466887</td>\n <td>3.427835</td>\n <td>4.354799</td>\n <td>4.398171</td>\n <td>0.760271</td>\n <td>7.031760</td>\n <td>0.684060</td>\n <td>0.056163</td>\n <td>0.060236</td>\n <td>0.043506</td>\n <td>...</td>\n <td>1.702198</td>\n <td>83.013436</td>\n <td>28.522226</td>\n <td>0.529945</td>\n <td>0.340501</td>\n <td>0.522831</td>\n <td>0.414983</td>\n <td>1.0</td>\n <td>0.045903</td>\n <td>0.290609</td>\n </tr>\n <tr>\n <th>std</th>\n <td>0.498908</td>\n <td>1.056506</td>\n <td>8.691768</td>\n <td>8.406697</td>\n <td>0.426923</td>\n <td>1.513703</td>\n <td>0.881616</td>\n <td>0.230239</td>\n <td>0.237926</td>\n <td>0.203995</td>\n <td>...</td>\n <td>0.107066</td>\n <td>21.464497</td>\n <td>6.564679</td>\n <td>0.499109</td>\n <td>0.473884</td>\n <td>0.499485</td>\n <td>0.492726</td>\n <td>0.0</td>\n <td>0.209277</td>\n <td>0.454049</td>\n </tr>\n <tr>\n <th>min</th>\n <td>0.000000</td>\n <td>1.000000</td>\n <td>0.000000</td>\n <td>0.000000</td>\n <td>0.000000</td>\n <td>1.000000</td>\n <td>0.000000</td>\n <td>0.000000</td>\n <td>0.000000</td>\n <td>0.000000</td>\n <td>...</td>\n <td>0.910000</td>\n <td>22.680000</td>\n <td>12.190000</td>\n <td>0.000000</td>\n <td>0.000000</td>\n <td>0.000000</td>\n <td>0.000000</td>\n <td>1.0</td>\n <td>0.000000</td>\n <td>0.000000</td>\n </tr>\n <tr>\n <th>25%</th>\n <td>0.000000</td>\n <td>3.000000</td>\n <td>0.000000</td>\n <td>0.000000</td>\n <td>1.000000</td>\n <td>6.000000</td>\n <td>0.000000</td>\n <td>0.000000</td>\n <td>0.000000</td>\n <td>0.000000</td>\n <td>...</td>\n <td>1.630000</td>\n <td>68.040000</td>\n <td>24.130000</td>\n <td>0.000000</td>\n <td>0.000000</td>\n <td>0.000000</td>\n <td>0.000000</td>\n <td>1.0</td>\n <td>0.000000</td>\n <td>0.000000</td>\n </tr>\n <tr>\n <th>50%</th>\n <td>0.000000</td>\n <td>3.000000</td>\n <td>0.000000</td>\n <td>0.000000</td>\n <td>1.000000</td>\n <td>7.000000</td>\n <td>0.000000</td>\n <td>0.000000</td>\n <td>0.000000</td>\n <td>0.000000</td>\n <td>...</td>\n <td>1.700000</td>\n <td>80.740000</td>\n <td>27.440000</td>\n <td>1.000000</td>\n <td>0.000000</td>\n <td>1.000000</td>\n <td>0.000000</td>\n <td>1.0</td>\n <td>0.000000</td>\n <td>0.000000</td>\n </tr>\n <tr>\n <th>75%</th>\n <td>1.000000</td>\n <td>4.000000</td>\n <td>3.000000</td>\n <td>5.000000</td>\n <td>1.000000</td>\n <td>8.000000</td>\n <td>1.000000</td>\n <td>0.000000</td>\n <td>0.000000</td>\n <td>0.000000</td>\n <td>...</td>\n <td>1.780000</td>\n <td>95.250000</td>\n <td>31.750000</td>\n <td>1.000000</td>\n <td>1.000000</td>\n <td>1.000000</td>\n <td>1.000000</td>\n <td>1.0</td>\n <td>0.000000</td>\n <td>1.000000</td>\n </tr>\n <tr>\n <th>max</th>\n <td>1.000000</td>\n <td>5.000000</td>\n <td>30.000000</td>\n <td>30.000000</td>\n <td>1.000000</td>\n <td>24.000000</td>\n <td>3.000000</td>\n <td>1.000000</td>\n <td>1.000000</td>\n <td>1.000000</td>\n <td>...</td>\n <td>2.360000</td>\n <td>284.860000</td>\n <td>96.200000</td>\n <td>1.000000</td>\n <td>1.000000</td>\n <td>1.000000</td>\n <td>1.000000</td>\n <td>1.0</td>\n <td>1.000000</td>\n <td>1.000000</td>\n </tr>\n </tbody>\n</table>\n<p>8 rows × 36 columns</p>\n</div>"
},
"execution_count": 77,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"valid.describe()"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "markdown",
"source": [
"#### Wydaje się być korelacja między masą ciała i zawałem:"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 78,
"outputs": [
{
"data": {
"text/plain": "<Figure size 729.847x600 with 1 Axes>",
"image/png": "\n"
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"import seaborn as sns\n",
"sns.set_theme()\n",
"g = sns.catplot(\n",
" data=train, kind=\"bar\",\n",
" x=\"GeneralHealth\", y=\"WeightInKilograms\", hue=\"HadHeartAttack\",\n",
" errorbar=\"sd\", palette=\"dark\", alpha=.6, height=6\n",
")\n",
"g.despine(left=True)\n",
"g.set_axis_labels(\"General health index\", \"Body mass (kg)\")\n",
"g.legend.set_title(\"Had heart attack\")"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "markdown",
"source": [
"### Osoby palące częsciej miały zawał:"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 79,
"outputs": [
{
"data": {
"text/plain": " SmokerStatus HadHeartAttack\n0 0.0 0.037162\n1 1.0 0.069817\n2 2.0 0.082760\n3 3.0 0.093980",
"text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>SmokerStatus</th>\n <th>HadHeartAttack</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>0</th>\n <td>0.0</td>\n <td>0.037162</td>\n </tr>\n <tr>\n <th>1</th>\n <td>1.0</td>\n <td>0.069817</td>\n </tr>\n <tr>\n <th>2</th>\n <td>2.0</td>\n <td>0.082760</td>\n </tr>\n <tr>\n <th>3</th>\n <td>3.0</td>\n <td>0.093980</td>\n </tr>\n </tbody>\n</table>\n</div>"
},
"execution_count": 79,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"valid.groupby('SmokerStatus', as_index=False)['HadHeartAttack'].mean()"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "markdown",
"source": [
"### Osoby z gorszym wskaźnikiem \"GeneralHealth\" w tym zbiorze danych częściej miały zawał:"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 80,
"outputs": [
{
"data": {
"text/plain": " GeneralHealth HadHeartAttack\n0 1.0 0.219401\n1 2.0 0.118330\n2 3.0 0.056664\n3 4.0 0.028686\n4 5.0 0.014112",
"text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>GeneralHealth</th>\n <th>HadHeartAttack</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>0</th>\n <td>1.0</td>\n <td>0.219401</td>\n </tr>\n <tr>\n <th>1</th>\n <td>2.0</td>\n <td>0.118330</td>\n </tr>\n <tr>\n <th>2</th>\n <td>3.0</td>\n <td>0.056664</td>\n </tr>\n <tr>\n <th>3</th>\n <td>4.0</td>\n <td>0.028686</td>\n </tr>\n <tr>\n <th>4</th>\n <td>5.0</td>\n <td>0.014112</td>\n </tr>\n </tbody>\n</table>\n</div>"
},
"execution_count": 80,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"valid.groupby('GeneralHealth', as_index=False)['HadHeartAttack'].mean()"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 81,
"outputs": [
{
"data": {
"text/plain": "SmokerStatus 0.0 1.0 2.0 3.0\nGeneralHealth \n1.0 0.163180 0.242991 0.259740 0.250000\n2.0 0.085862 0.120438 0.158195 0.146465\n3.0 0.038882 0.059574 0.083070 0.076079\n4.0 0.023638 0.022901 0.039315 0.032688\n5.0 0.011113 0.017544 0.020365 0.025316",
"text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th>SmokerStatus</th>\n <th>0.0</th>\n <th>1.0</th>\n <th>2.0</th>\n <th>3.0</th>\n </tr>\n <tr>\n <th>GeneralHealth</th>\n <th></th>\n <th></th>\n <th></th>\n <th></th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>1.0</th>\n <td>0.163180</td>\n <td>0.242991</td>\n <td>0.259740</td>\n <td>0.250000</td>\n </tr>\n <tr>\n <th>2.0</th>\n <td>0.085862</td>\n <td>0.120438</td>\n <td>0.158195</td>\n <td>0.146465</td>\n </tr>\n <tr>\n <th>3.0</th>\n <td>0.038882</td>\n <td>0.059574</td>\n <td>0.083070</td>\n <td>0.076079</td>\n </tr>\n <tr>\n <th>4.0</th>\n <td>0.023638</td>\n <td>0.022901</td>\n <td>0.039315</td>\n <td>0.032688</td>\n </tr>\n <tr>\n <th>5.0</th>\n <td>0.011113</td>\n <td>0.017544</td>\n <td>0.020365</td>\n <td>0.025316</td>\n </tr>\n </tbody>\n</table>\n</div>"
},
"execution_count": 81,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"valid.pivot_table('HadHeartAttack',index='GeneralHealth', columns='SmokerStatus')"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "markdown",
"source": [
"## Normalizacja część 2 - Skalowanie kolumn numerycznych do 0-1"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 82,
"outputs": [],
"source": [
"from sklearn.preprocessing import MinMaxScaler\n",
"scaler = MinMaxScaler()\n",
"def scale_float_columns(dataset):\n",
" numerical_columns = list(dataset.select_dtypes(include=['float64']).columns)\n",
" dataset[numerical_columns] = scaler.fit_transform(dataset[numerical_columns])"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 83,
"outputs": [
{
"data": {
"text/plain": " State Male GeneralHealth PhysicalHealthDays MentalHealthDays \\\n276058 New York 1.0 3.0 2.0 0.0 \n189605 Michigan 0.0 2.0 20.0 15.0 \n59234 Delaware 0.0 4.0 0.0 0.0 \n255322 New Mexico 1.0 3.0 0.0 0.0 \n226504 Montana 0.0 4.0 6.0 0.0 \n\n LastCheckupTime PhysicalActivities \\\n276058 Within past 2 years (1 year but less than 2 ye... NaN \n189605 Within past year (anytime less than 12 months ... 1.0 \n59234 Within past year (anytime less than 12 months ... 1.0 \n255322 5 or more years ago 1.0 \n226504 Within past year (anytime less than 12 months ... 1.0 \n\n SleepHours RemovedTeeth HadHeartAttack ... HeightInMeters \\\n276058 7.0 0.0 0.0 ... 1.55 \n189605 5.0 3.0 0.0 ... 1.68 \n59234 6.0 0.0 0.0 ... 1.50 \n255322 6.0 0.0 0.0 ... NaN \n226504 8.0 0.0 0.0 ... 1.73 \n\n WeightInKilograms BMI AlcoholDrinkers HIVTesting FluVaxLast12 \\\n276058 NaN NaN 0.0 0.0 0.0 \n189605 70.31 25.02 0.0 NaN 1.0 \n59234 64.41 28.68 0.0 0.0 1.0 \n255322 NaN NaN NaN NaN NaN \n226504 90.72 30.41 1.0 0.0 0.0 \n\n PneumoVaxEver TetanusLast10Tdap HighRiskLastYear CovidPos \n276058 NaN 1.0 0.0 0.0 \n189605 1.0 NaN 0.0 0.0 \n59234 NaN 1.0 0.0 0.0 \n255322 NaN NaN NaN NaN \n226504 1.0 NaN 0.0 1.0 \n\n[5 rows x 40 columns]",
"text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>State</th>\n <th>Male</th>\n <th>GeneralHealth</th>\n <th>PhysicalHealthDays</th>\n <th>MentalHealthDays</th>\n <th>LastCheckupTime</th>\n <th>PhysicalActivities</th>\n <th>SleepHours</th>\n <th>RemovedTeeth</th>\n <th>HadHeartAttack</th>\n <th>...</th>\n <th>HeightInMeters</th>\n <th>WeightInKilograms</th>\n <th>BMI</th>\n <th>AlcoholDrinkers</th>\n <th>HIVTesting</th>\n <th>FluVaxLast12</th>\n <th>PneumoVaxEver</th>\n <th>TetanusLast10Tdap</th>\n <th>HighRiskLastYear</th>\n <th>CovidPos</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>276058</th>\n <td>New York</td>\n <td>1.0</td>\n <td>3.0</td>\n <td>2.0</td>\n <td>0.0</td>\n <td>Within past 2 years (1 year but less than 2 ye...</td>\n <td>NaN</td>\n <td>7.0</td>\n <td>0.0</td>\n <td>0.0</td>\n <td>...</td>\n <td>1.55</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>0.0</td>\n <td>0.0</td>\n <td>0.0</td>\n <td>NaN</td>\n <td>1.0</td>\n <td>0.0</td>\n <td>0.0</td>\n </tr>\n <tr>\n <th>189605</th>\n <td>Michigan</td>\n <td>0.0</td>\n <td>2.0</td>\n <td>20.0</td>\n <td>15.0</td>\n <td>Within past year (anytime less than 12 months ...</td>\n <td>1.0</td>\n <td>5.0</td>\n <td>3.0</td>\n <td>0.0</td>\n <td>...</td>\n <td>1.68</td>\n <td>70.31</td>\n <td>25.02</td>\n <td>0.0</td>\n <td>NaN</td>\n <td>1.0</td>\n <td>1.0</td>\n <td>NaN</td>\n <td>0.0</td>\n <td>0.0</td>\n </tr>\n <tr>\n <th>59234</th>\n <td>Delaware</td>\n <td>0.0</td>\n <td>4.0</td>\n <td>0.0</td>\n <td>0.0</td>\n <td>Within past year (anytime less than 12 months ...</td>\n <td>1.0</td>\n <td>6.0</td>\n <td>0.0</td>\n <td>0.0</td>\n <td>...</td>\n <td>1.50</td>\n <td>64.41</td>\n <td>28.68</td>\n <td>0.0</td>\n <td>0.0</td>\n <td>1.0</td>\n <td>NaN</td>\n <td>1.0</td>\n <td>0.0</td>\n <td>0.0</td>\n </tr>\n <tr>\n <th>255322</th>\n <td>New Mexico</td>\n <td>1.0</td>\n <td>3.0</td>\n <td>0.0</td>\n <td>0.0</td>\n <td>5 or more years ago</td>\n <td>1.0</td>\n <td>6.0</td>\n <td>0.0</td>\n <td>0.0</td>\n <td>...</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>NaN</td>\n </tr>\n <tr>\n <th>226504</th>\n <td>Montana</td>\n <td>0.0</td>\n <td>4.0</td>\n <td>6.0</td>\n <td>0.0</td>\n <td>Within past year (anytime less than 12 months ...</td>\n <td>1.0</td>\n <td>8.0</td>\n <td>0.0</td>\n <td>0.0</td>\n <td>...</td>\n <td>1.73</td>\n <td>90.72</td>\n <td>30.41</td>\n <td>1.0</td>\n <td>0.0</td>\n <td>0.0</td>\n <td>1.0</td>\n <td>NaN</td>\n <td>0.0</td>\n <td>1.0</td>\n </tr>\n </tbody>\n</table>\n<p>5 rows × 40 columns</p>\n</div>"
},
"execution_count": 83,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"test.head()"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 84,
"outputs": [
{
"data": {
"text/plain": " State Male GeneralHealth PhysicalHealthDays MentalHealthDays \\\n276058 New York 1.0 0.50 0.066667 0.0 \n189605 Michigan 0.0 0.25 0.666667 0.5 \n59234 Delaware 0.0 0.75 0.000000 0.0 \n255322 New Mexico 1.0 0.50 0.000000 0.0 \n226504 Montana 0.0 0.75 0.200000 0.0 \n\n LastCheckupTime PhysicalActivities \\\n276058 Within past 2 years (1 year but less than 2 ye... NaN \n189605 Within past year (anytime less than 12 months ... 1.0 \n59234 Within past year (anytime less than 12 months ... 1.0 \n255322 5 or more years ago 1.0 \n226504 Within past year (anytime less than 12 months ... 1.0 \n\n SleepHours RemovedTeeth HadHeartAttack ... HeightInMeters \\\n276058 0.260870 0.0 0.0 ... 0.474074 \n189605 0.173913 1.0 0.0 ... 0.570370 \n59234 0.217391 0.0 0.0 ... 0.437037 \n255322 0.217391 0.0 0.0 ... NaN \n226504 0.304348 0.0 0.0 ... 0.607407 \n\n WeightInKilograms BMI AlcoholDrinkers HIVTesting \\\n276058 NaN NaN 0.0 0.0 \n189605 0.187845 0.145127 0.0 NaN \n59234 0.164576 0.188206 0.0 0.0 \n255322 NaN NaN NaN NaN \n226504 0.268339 0.208569 1.0 0.0 \n\n FluVaxLast12 PneumoVaxEver TetanusLast10Tdap HighRiskLastYear \\\n276058 0.0 NaN 0.0 0.0 \n189605 1.0 1.0 NaN 0.0 \n59234 1.0 NaN 0.0 0.0 \n255322 NaN NaN NaN NaN \n226504 0.0 1.0 NaN 0.0 \n\n CovidPos \n276058 0.0 \n189605 0.0 \n59234 0.0 \n255322 NaN \n226504 1.0 \n\n[5 rows x 40 columns]",
"text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>State</th>\n <th>Male</th>\n <th>GeneralHealth</th>\n <th>PhysicalHealthDays</th>\n <th>MentalHealthDays</th>\n <th>LastCheckupTime</th>\n <th>PhysicalActivities</th>\n <th>SleepHours</th>\n <th>RemovedTeeth</th>\n <th>HadHeartAttack</th>\n <th>...</th>\n <th>HeightInMeters</th>\n <th>WeightInKilograms</th>\n <th>BMI</th>\n <th>AlcoholDrinkers</th>\n <th>HIVTesting</th>\n <th>FluVaxLast12</th>\n <th>PneumoVaxEver</th>\n <th>TetanusLast10Tdap</th>\n <th>HighRiskLastYear</th>\n <th>CovidPos</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>276058</th>\n <td>New York</td>\n <td>1.0</td>\n <td>0.50</td>\n <td>0.066667</td>\n <td>0.0</td>\n <td>Within past 2 years (1 year but less than 2 ye...</td>\n <td>NaN</td>\n <td>0.260870</td>\n <td>0.0</td>\n <td>0.0</td>\n <td>...</td>\n <td>0.474074</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>0.0</td>\n <td>0.0</td>\n <td>0.0</td>\n <td>NaN</td>\n <td>0.0</td>\n <td>0.0</td>\n <td>0.0</td>\n </tr>\n <tr>\n <th>189605</th>\n <td>Michigan</td>\n <td>0.0</td>\n <td>0.25</td>\n <td>0.666667</td>\n <td>0.5</td>\n <td>Within past year (anytime less than 12 months ...</td>\n <td>1.0</td>\n <td>0.173913</td>\n <td>1.0</td>\n <td>0.0</td>\n <td>...</td>\n <td>0.570370</td>\n <td>0.187845</td>\n <td>0.145127</td>\n <td>0.0</td>\n <td>NaN</td>\n <td>1.0</td>\n <td>1.0</td>\n <td>NaN</td>\n <td>0.0</td>\n <td>0.0</td>\n </tr>\n <tr>\n <th>59234</th>\n <td>Delaware</td>\n <td>0.0</td>\n <td>0.75</td>\n <td>0.000000</td>\n <td>0.0</td>\n <td>Within past year (anytime less than 12 months ...</td>\n <td>1.0</td>\n <td>0.217391</td>\n <td>0.0</td>\n <td>0.0</td>\n <td>...</td>\n <td>0.437037</td>\n <td>0.164576</td>\n <td>0.188206</td>\n <td>0.0</td>\n <td>0.0</td>\n <td>1.0</td>\n <td>NaN</td>\n <td>0.0</td>\n <td>0.0</td>\n <td>0.0</td>\n </tr>\n <tr>\n <th>255322</th>\n <td>New Mexico</td>\n <td>1.0</td>\n <td>0.50</td>\n <td>0.000000</td>\n <td>0.0</td>\n <td>5 or more years ago</td>\n <td>1.0</td>\n <td>0.217391</td>\n <td>0.0</td>\n <td>0.0</td>\n <td>...</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>NaN</td>\n </tr>\n <tr>\n <th>226504</th>\n <td>Montana</td>\n <td>0.0</td>\n <td>0.75</td>\n <td>0.200000</td>\n <td>0.0</td>\n <td>Within past year (anytime less than 12 months ...</td>\n <td>1.0</td>\n <td>0.304348</td>\n <td>0.0</td>\n <td>0.0</td>\n <td>...</td>\n <td>0.607407</td>\n <td>0.268339</td>\n <td>0.208569</td>\n <td>1.0</td>\n <td>0.0</td>\n <td>0.0</td>\n <td>1.0</td>\n <td>NaN</td>\n <td>0.0</td>\n <td>1.0</td>\n </tr>\n </tbody>\n</table>\n<p>5 rows × 40 columns</p>\n</div>"
},
"execution_count": 84,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"scale_float_columns(test)\n",
"scale_float_columns(train)\n",
"scale_float_columns(valid)\n",
"test.head()"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "markdown",
"source": [
"## 5. Czyszczenie brakujących pól\n",
"\n",
"Nie możemy użyć .dropna() gdyż większość wierszy ma brakujące wartości:"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 85,
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"445132\n",
"199110\n"
]
}
],
"source": [
"print(df.shape[0])\n",
"print(df.shape[0] - df.dropna().shape[0])"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 86,
"outputs": [
{
"data": {
"text/plain": " State Male GeneralHealth PhysicalHealthDays MentalHealthDays \\\n276058 New York 1.0 0.50 0.066667 0.0 \n189605 Michigan 0.0 0.25 0.666667 0.5 \n59234 Delaware 0.0 0.75 0.000000 0.0 \n255322 New Mexico 1.0 0.50 0.000000 0.0 \n226504 Montana 0.0 0.75 0.200000 0.0 \n\n LastCheckupTime PhysicalActivities \\\n276058 Within past 2 years (1 year but less than 2 ye... NaN \n189605 Within past year (anytime less than 12 months ... 1.0 \n59234 Within past year (anytime less than 12 months ... 1.0 \n255322 5 or more years ago 1.0 \n226504 Within past year (anytime less than 12 months ... 1.0 \n\n SleepHours RemovedTeeth HadHeartAttack ... HeightInMeters \\\n276058 0.260870 0.0 0.0 ... 0.474074 \n189605 0.173913 1.0 0.0 ... 0.570370 \n59234 0.217391 0.0 0.0 ... 0.437037 \n255322 0.217391 0.0 0.0 ... NaN \n226504 0.304348 0.0 0.0 ... 0.607407 \n\n WeightInKilograms BMI AlcoholDrinkers HIVTesting \\\n276058 NaN NaN 0.0 0.0 \n189605 0.187845 0.145127 0.0 NaN \n59234 0.164576 0.188206 0.0 0.0 \n255322 NaN NaN NaN NaN \n226504 0.268339 0.208569 1.0 0.0 \n\n FluVaxLast12 PneumoVaxEver TetanusLast10Tdap HighRiskLastYear \\\n276058 0.0 NaN 0.0 0.0 \n189605 1.0 1.0 NaN 0.0 \n59234 1.0 NaN 0.0 0.0 \n255322 NaN NaN NaN NaN \n226504 0.0 1.0 NaN 0.0 \n\n CovidPos \n276058 0.0 \n189605 0.0 \n59234 0.0 \n255322 NaN \n226504 1.0 \n\n[5 rows x 40 columns]",
"text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>State</th>\n <th>Male</th>\n <th>GeneralHealth</th>\n <th>PhysicalHealthDays</th>\n <th>MentalHealthDays</th>\n <th>LastCheckupTime</th>\n <th>PhysicalActivities</th>\n <th>SleepHours</th>\n <th>RemovedTeeth</th>\n <th>HadHeartAttack</th>\n <th>...</th>\n <th>HeightInMeters</th>\n <th>WeightInKilograms</th>\n <th>BMI</th>\n <th>AlcoholDrinkers</th>\n <th>HIVTesting</th>\n <th>FluVaxLast12</th>\n <th>PneumoVaxEver</th>\n <th>TetanusLast10Tdap</th>\n <th>HighRiskLastYear</th>\n <th>CovidPos</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>276058</th>\n <td>New York</td>\n <td>1.0</td>\n <td>0.50</td>\n <td>0.066667</td>\n <td>0.0</td>\n <td>Within past 2 years (1 year but less than 2 ye...</td>\n <td>NaN</td>\n <td>0.260870</td>\n <td>0.0</td>\n <td>0.0</td>\n <td>...</td>\n <td>0.474074</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>0.0</td>\n <td>0.0</td>\n <td>0.0</td>\n <td>NaN</td>\n <td>0.0</td>\n <td>0.0</td>\n <td>0.0</td>\n </tr>\n <tr>\n <th>189605</th>\n <td>Michigan</td>\n <td>0.0</td>\n <td>0.25</td>\n <td>0.666667</td>\n <td>0.5</td>\n <td>Within past year (anytime less than 12 months ...</td>\n <td>1.0</td>\n <td>0.173913</td>\n <td>1.0</td>\n <td>0.0</td>\n <td>...</td>\n <td>0.570370</td>\n <td>0.187845</td>\n <td>0.145127</td>\n <td>0.0</td>\n <td>NaN</td>\n <td>1.0</td>\n <td>1.0</td>\n <td>NaN</td>\n <td>0.0</td>\n <td>0.0</td>\n </tr>\n <tr>\n <th>59234</th>\n <td>Delaware</td>\n <td>0.0</td>\n <td>0.75</td>\n <td>0.000000</td>\n <td>0.0</td>\n <td>Within past year (anytime less than 12 months ...</td>\n <td>1.0</td>\n <td>0.217391</td>\n <td>0.0</td>\n <td>0.0</td>\n <td>...</td>\n <td>0.437037</td>\n <td>0.164576</td>\n <td>0.188206</td>\n <td>0.0</td>\n <td>0.0</td>\n <td>1.0</td>\n <td>NaN</td>\n <td>0.0</td>\n <td>0.0</td>\n <td>0.0</td>\n </tr>\n <tr>\n <th>255322</th>\n <td>New Mexico</td>\n <td>1.0</td>\n <td>0.50</td>\n <td>0.000000</td>\n <td>0.0</td>\n <td>5 or more years ago</td>\n <td>1.0</td>\n <td>0.217391</td>\n <td>0.0</td>\n <td>0.0</td>\n <td>...</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>NaN</td>\n </tr>\n <tr>\n <th>226504</th>\n <td>Montana</td>\n <td>0.0</td>\n <td>0.75</td>\n <td>0.200000</td>\n <td>0.0</td>\n <td>Within past year (anytime less than 12 months ...</td>\n <td>1.0</td>\n <td>0.304348</td>\n <td>0.0</td>\n <td>0.0</td>\n <td>...</td>\n <td>0.607407</td>\n <td>0.268339</td>\n <td>0.208569</td>\n <td>1.0</td>\n <td>0.0</td>\n <td>0.0</td>\n <td>1.0</td>\n <td>NaN</td>\n <td>0.0</td>\n <td>1.0</td>\n </tr>\n </tbody>\n</table>\n<p>5 rows × 40 columns</p>\n</div>"
},
"execution_count": 86,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"test.head()"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "markdown",
"source": [
"Uzupełniam brakujące wartości medianą:"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 87,
"outputs": [],
"source": [
"numeric_columns = train.select_dtypes(include=['number']).columns\n",
"test[numeric_columns] = test[numeric_columns].fillna(test[numeric_columns].median().iloc[0])\n",
"train[numeric_columns] = train[numeric_columns].fillna(train[numeric_columns].median().iloc[0])\n",
"valid[numeric_columns] = valid[numeric_columns].fillna(valid[numeric_columns].iloc[0])"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 88,
"outputs": [
{
"data": {
"text/plain": " State Male GeneralHealth PhysicalHealthDays MentalHealthDays \\\n276058 New York 1.0 0.50 0.066667 0.0 \n189605 Michigan 0.0 0.25 0.666667 0.5 \n59234 Delaware 0.0 0.75 0.000000 0.0 \n255322 New Mexico 1.0 0.50 0.000000 0.0 \n226504 Montana 0.0 0.75 0.200000 0.0 \n\n LastCheckupTime PhysicalActivities \\\n276058 Within past 2 years (1 year but less than 2 ye... 0.0 \n189605 Within past year (anytime less than 12 months ... 1.0 \n59234 Within past year (anytime less than 12 months ... 1.0 \n255322 5 or more years ago 1.0 \n226504 Within past year (anytime less than 12 months ... 1.0 \n\n SleepHours RemovedTeeth HadHeartAttack ... HeightInMeters \\\n276058 0.260870 0.0 0.0 ... 0.474074 \n189605 0.173913 1.0 0.0 ... 0.570370 \n59234 0.217391 0.0 0.0 ... 0.437037 \n255322 0.217391 0.0 0.0 ... 0.000000 \n226504 0.304348 0.0 0.0 ... 0.607407 \n\n WeightInKilograms BMI AlcoholDrinkers HIVTesting \\\n276058 0.000000 0.000000 0.0 0.0 \n189605 0.187845 0.145127 0.0 0.0 \n59234 0.164576 0.188206 0.0 0.0 \n255322 0.000000 0.000000 0.0 0.0 \n226504 0.268339 0.208569 1.0 0.0 \n\n FluVaxLast12 PneumoVaxEver TetanusLast10Tdap HighRiskLastYear \\\n276058 0.0 0.0 0.0 0.0 \n189605 1.0 1.0 0.0 0.0 \n59234 1.0 0.0 0.0 0.0 \n255322 0.0 0.0 0.0 0.0 \n226504 0.0 1.0 0.0 0.0 \n\n CovidPos \n276058 0.0 \n189605 0.0 \n59234 0.0 \n255322 0.0 \n226504 1.0 \n\n[5 rows x 40 columns]",
"text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>State</th>\n <th>Male</th>\n <th>GeneralHealth</th>\n <th>PhysicalHealthDays</th>\n <th>MentalHealthDays</th>\n <th>LastCheckupTime</th>\n <th>PhysicalActivities</th>\n <th>SleepHours</th>\n <th>RemovedTeeth</th>\n <th>HadHeartAttack</th>\n <th>...</th>\n <th>HeightInMeters</th>\n <th>WeightInKilograms</th>\n <th>BMI</th>\n <th>AlcoholDrinkers</th>\n <th>HIVTesting</th>\n <th>FluVaxLast12</th>\n <th>PneumoVaxEver</th>\n <th>TetanusLast10Tdap</th>\n <th>HighRiskLastYear</th>\n <th>CovidPos</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>276058</th>\n <td>New York</td>\n <td>1.0</td>\n <td>0.50</td>\n <td>0.066667</td>\n <td>0.0</td>\n <td>Within past 2 years (1 year but less than 2 ye...</td>\n <td>0.0</td>\n <td>0.260870</td>\n <td>0.0</td>\n <td>0.0</td>\n <td>...</td>\n <td>0.474074</td>\n <td>0.000000</td>\n <td>0.000000</td>\n <td>0.0</td>\n <td>0.0</td>\n <td>0.0</td>\n <td>0.0</td>\n <td>0.0</td>\n <td>0.0</td>\n <td>0.0</td>\n </tr>\n <tr>\n <th>189605</th>\n <td>Michigan</td>\n <td>0.0</td>\n <td>0.25</td>\n <td>0.666667</td>\n <td>0.5</td>\n <td>Within past year (anytime less than 12 months ...</td>\n <td>1.0</td>\n <td>0.173913</td>\n <td>1.0</td>\n <td>0.0</td>\n <td>...</td>\n <td>0.570370</td>\n <td>0.187845</td>\n <td>0.145127</td>\n <td>0.0</td>\n <td>0.0</td>\n <td>1.0</td>\n <td>1.0</td>\n <td>0.0</td>\n <td>0.0</td>\n <td>0.0</td>\n </tr>\n <tr>\n <th>59234</th>\n <td>Delaware</td>\n <td>0.0</td>\n <td>0.75</td>\n <td>0.000000</td>\n <td>0.0</td>\n <td>Within past year (anytime less than 12 months ...</td>\n <td>1.0</td>\n <td>0.217391</td>\n <td>0.0</td>\n <td>0.0</td>\n <td>...</td>\n <td>0.437037</td>\n <td>0.164576</td>\n <td>0.188206</td>\n <td>0.0</td>\n <td>0.0</td>\n <td>1.0</td>\n <td>0.0</td>\n <td>0.0</td>\n <td>0.0</td>\n <td>0.0</td>\n </tr>\n <tr>\n <th>255322</th>\n <td>New Mexico</td>\n <td>1.0</td>\n <td>0.50</td>\n <td>0.000000</td>\n <td>0.0</td>\n <td>5 or more years ago</td>\n <td>1.0</td>\n <td>0.217391</td>\n <td>0.0</td>\n <td>0.0</td>\n <td>...</td>\n <td>0.000000</td>\n <td>0.000000</td>\n <td>0.000000</td>\n <td>0.0</td>\n <td>0.0</td>\n <td>0.0</td>\n <td>0.0</td>\n <td>0.0</td>\n <td>0.0</td>\n <td>0.0</td>\n </tr>\n <tr>\n <th>226504</th>\n <td>Montana</td>\n <td>0.0</td>\n <td>0.75</td>\n <td>0.200000</td>\n <td>0.0</td>\n <td>Within past year (anytime less than 12 months ...</td>\n <td>1.0</td>\n <td>0.304348</td>\n <td>0.0</td>\n <td>0.0</td>\n <td>...</td>\n <td>0.607407</td>\n <td>0.268339</td>\n <td>0.208569</td>\n <td>1.0</td>\n <td>0.0</td>\n <td>0.0</td>\n <td>1.0</td>\n <td>0.0</td>\n <td>0.0</td>\n <td>1.0</td>\n </tr>\n </tbody>\n</table>\n<p>5 rows × 40 columns</p>\n</div>"
},
"execution_count": 88,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"test.head()"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "markdown",
"source": [
"Kolumny kategoryczne wypełniłem w czasie normalizacji wartościami \"Unknown\" ponieważ fillna-->median nie działa dla tego typu danych\n",
"(https://stackoverflow.com/questions/49127897/python-pandas-fillna-median-not-working)"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 89,
"outputs": [
{
"data": {
"text/plain": "HighRiskLastYear\n0.0 42810\n1.0 1703\nName: count, dtype: int64"
},
"execution_count": 89,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"test[\"HighRiskLastYear\"].value_counts()"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 90,
"outputs": [
{
"data": {
"text/plain": "0"
},
"execution_count": 90,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"test[\"HighRiskLastYear\"].isna().sum()"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "markdown",
"source": [
"### Brak wartości non-null:"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 91,
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"<class 'pandas.core.frame.DataFrame'>\n",
"Index: 44513 entries, 276058 to 196692\n",
"Data columns (total 40 columns):\n",
" # Column Non-Null Count Dtype \n",
"--- ------ -------------- ----- \n",
" 0 State 44513 non-null category\n",
" 1 Male 44513 non-null float64 \n",
" 2 GeneralHealth 44513 non-null float64 \n",
" 3 PhysicalHealthDays 44513 non-null float64 \n",
" 4 MentalHealthDays 44513 non-null float64 \n",
" 5 LastCheckupTime 44513 non-null category\n",
" 6 PhysicalActivities 44513 non-null float64 \n",
" 7 SleepHours 44513 non-null float64 \n",
" 8 RemovedTeeth 44513 non-null float64 \n",
" 9 HadHeartAttack 44513 non-null float64 \n",
" 10 HadAngina 44513 non-null float64 \n",
" 11 HadStroke 44513 non-null float64 \n",
" 12 HadAsthma 44513 non-null float64 \n",
" 13 HadSkinCancer 44513 non-null float64 \n",
" 14 HadCOPD 44513 non-null float64 \n",
" 15 HadDepressiveDisorder 44513 non-null float64 \n",
" 16 HadKidneyDisease 44513 non-null float64 \n",
" 17 HadArthritis 44513 non-null float64 \n",
" 18 HadDiabetes 44513 non-null float64 \n",
" 19 DeafOrHardOfHearing 44513 non-null float64 \n",
" 20 BlindOrVisionDifficulty 44513 non-null float64 \n",
" 21 DifficultyConcentrating 44513 non-null float64 \n",
" 22 DifficultyWalking 44513 non-null float64 \n",
" 23 DifficultyDressingBathing 44513 non-null float64 \n",
" 24 DifficultyErrands 44513 non-null float64 \n",
" 25 SmokerStatus 44513 non-null float64 \n",
" 26 ECigaretteUsage 44513 non-null float64 \n",
" 27 ChestScan 44513 non-null float64 \n",
" 28 RaceEthnicityCategory 44513 non-null category\n",
" 29 AgeCategory 44513 non-null category\n",
" 30 HeightInMeters 44513 non-null float64 \n",
" 31 WeightInKilograms 44513 non-null float64 \n",
" 32 BMI 44513 non-null float64 \n",
" 33 AlcoholDrinkers 44513 non-null float64 \n",
" 34 HIVTesting 44513 non-null float64 \n",
" 35 FluVaxLast12 44513 non-null float64 \n",
" 36 PneumoVaxEver 44513 non-null float64 \n",
" 37 TetanusLast10Tdap 44513 non-null float64 \n",
" 38 HighRiskLastYear 44513 non-null float64 \n",
" 39 CovidPos 44513 non-null float64 \n",
"dtypes: category(4), float64(36)\n",
"memory usage: 12.7 MB\n"
]
}
],
"source": [
"test.info()"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 92,
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"<class 'pandas.core.frame.DataFrame'>\n",
"RangeIndex: 676617 entries, 0 to 676616\n",
"Data columns (total 40 columns):\n",
" # Column Non-Null Count Dtype \n",
"--- ------ -------------- ----- \n",
" 0 State 676617 non-null category\n",
" 1 Male 676617 non-null float64 \n",
" 2 GeneralHealth 676617 non-null float64 \n",
" 3 PhysicalHealthDays 676617 non-null float64 \n",
" 4 MentalHealthDays 676617 non-null float64 \n",
" 5 LastCheckupTime 676617 non-null category\n",
" 6 PhysicalActivities 676617 non-null float64 \n",
" 7 SleepHours 676617 non-null float64 \n",
" 8 RemovedTeeth 676617 non-null float64 \n",
" 9 HadHeartAttack 676617 non-null float64 \n",
" 10 HadAngina 676617 non-null float64 \n",
" 11 HadStroke 676617 non-null float64 \n",
" 12 HadAsthma 676617 non-null float64 \n",
" 13 HadSkinCancer 676617 non-null float64 \n",
" 14 HadCOPD 676617 non-null float64 \n",
" 15 HadDepressiveDisorder 676617 non-null float64 \n",
" 16 HadKidneyDisease 676617 non-null float64 \n",
" 17 HadArthritis 676617 non-null float64 \n",
" 18 HadDiabetes 676617 non-null float64 \n",
" 19 DeafOrHardOfHearing 676617 non-null float64 \n",
" 20 BlindOrVisionDifficulty 676617 non-null float64 \n",
" 21 DifficultyConcentrating 676617 non-null float64 \n",
" 22 DifficultyWalking 676617 non-null float64 \n",
" 23 DifficultyDressingBathing 676617 non-null float64 \n",
" 24 DifficultyErrands 676617 non-null float64 \n",
" 25 SmokerStatus 676617 non-null float64 \n",
" 26 ECigaretteUsage 676617 non-null float64 \n",
" 27 ChestScan 676617 non-null float64 \n",
" 28 RaceEthnicityCategory 676617 non-null category\n",
" 29 AgeCategory 676617 non-null category\n",
" 30 HeightInMeters 676617 non-null float64 \n",
" 31 WeightInKilograms 676617 non-null float64 \n",
" 32 BMI 676617 non-null float64 \n",
" 33 AlcoholDrinkers 676617 non-null float64 \n",
" 34 HIVTesting 676617 non-null float64 \n",
" 35 FluVaxLast12 676617 non-null float64 \n",
" 36 PneumoVaxEver 676617 non-null float64 \n",
" 37 TetanusLast10Tdap 676617 non-null float64 \n",
" 38 HighRiskLastYear 676617 non-null float64 \n",
" 39 CovidPos 676617 non-null float64 \n",
"dtypes: category(4), float64(36)\n",
"memory usage: 188.4 MB\n"
]
}
],
"source": [
"train.info()"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 93,
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"<class 'pandas.core.frame.DataFrame'>\n",
"Index: 44514 entries, 127295 to 418173\n",
"Data columns (total 40 columns):\n",
" # Column Non-Null Count Dtype \n",
"--- ------ -------------- ----- \n",
" 0 State 44514 non-null category\n",
" 1 Male 44514 non-null float64 \n",
" 2 GeneralHealth 44514 non-null float64 \n",
" 3 PhysicalHealthDays 44514 non-null float64 \n",
" 4 MentalHealthDays 44514 non-null float64 \n",
" 5 LastCheckupTime 44514 non-null category\n",
" 6 PhysicalActivities 44514 non-null float64 \n",
" 7 SleepHours 44514 non-null float64 \n",
" 8 RemovedTeeth 44514 non-null float64 \n",
" 9 HadHeartAttack 44514 non-null float64 \n",
" 10 HadAngina 44514 non-null float64 \n",
" 11 HadStroke 44514 non-null float64 \n",
" 12 HadAsthma 44514 non-null float64 \n",
" 13 HadSkinCancer 44514 non-null float64 \n",
" 14 HadCOPD 44514 non-null float64 \n",
" 15 HadDepressiveDisorder 44514 non-null float64 \n",
" 16 HadKidneyDisease 44514 non-null float64 \n",
" 17 HadArthritis 44514 non-null float64 \n",
" 18 HadDiabetes 44514 non-null float64 \n",
" 19 DeafOrHardOfHearing 44514 non-null float64 \n",
" 20 BlindOrVisionDifficulty 44514 non-null float64 \n",
" 21 DifficultyConcentrating 44514 non-null float64 \n",
" 22 DifficultyWalking 44514 non-null float64 \n",
" 23 DifficultyDressingBathing 44514 non-null float64 \n",
" 24 DifficultyErrands 44514 non-null float64 \n",
" 25 SmokerStatus 44514 non-null float64 \n",
" 26 ECigaretteUsage 44514 non-null float64 \n",
" 27 ChestScan 44514 non-null float64 \n",
" 28 RaceEthnicityCategory 44514 non-null category\n",
" 29 AgeCategory 44514 non-null category\n",
" 30 HeightInMeters 44514 non-null float64 \n",
" 31 WeightInKilograms 44514 non-null float64 \n",
" 32 BMI 44514 non-null float64 \n",
" 33 AlcoholDrinkers 44514 non-null float64 \n",
" 34 HIVTesting 44514 non-null float64 \n",
" 35 FluVaxLast12 44514 non-null float64 \n",
" 36 PneumoVaxEver 44514 non-null float64 \n",
" 37 TetanusLast10Tdap 44514 non-null float64 \n",
" 38 HighRiskLastYear 44514 non-null float64 \n",
" 39 CovidPos 44514 non-null float64 \n",
"dtypes: category(4), float64(36)\n",
"memory usage: 12.7 MB\n"
]
}
],
"source": [
"valid.info()"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "markdown",
"source": [
"### Zapisywanie do csv"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 94,
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Index(['State', 'LastCheckupTime', 'RaceEthnicityCategory', 'AgeCategory'], dtype='object')\n"
]
}
],
"source": [
"cat_columns = test.select_dtypes(['category']).columns\n",
"print(cat_columns)"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 95,
"outputs": [],
"source": [
"#test[cat_columns] = test[cat_columns].apply(lambda x: pd.factorize(x)[0])\n",
"#train[cat_columns] = train[cat_columns].apply(lambda x: pd.factorize(x)[0])\n",
"#valid[cat_columns] = valid[cat_columns].apply(lambda x: pd.factorize(x)[0])"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 96,
"outputs": [],
"source": [
"test.to_csv(\"test.csv\")\n",
"train.to_csv(\"train.csv\")\n",
"valid.to_csv(\"valid.csv\")"
],
"metadata": {
"collapsed": false
}
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 2
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython2",
"version": "2.7.6"
}
},
"nbformat": 4,
"nbformat_minor": 0
}