ium_478831/IUM_main.ipynb
JulianZablonski ec907266c8 ex1
2022-03-20 22:02:36 +01:00

781 lines
37 KiB
Plaintext
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Requirement already satisfied: kaggle in c:\\users\\user\\anaconda3\\lib\\site-packages (1.5.12)\n",
"Requirement already satisfied: urllib3 in c:\\users\\user\\anaconda3\\lib\\site-packages (from kaggle) (1.26.7)\n",
"Requirement already satisfied: python-dateutil in c:\\users\\user\\anaconda3\\lib\\site-packages (from kaggle) (2.8.2)\n",
"Requirement already satisfied: python-slugify in c:\\users\\user\\anaconda3\\lib\\site-packages (from kaggle) (5.0.2)\n",
"Requirement already satisfied: requests in c:\\users\\user\\anaconda3\\lib\\site-packages (from kaggle) (2.26.0)\n",
"Requirement already satisfied: six>=1.10 in c:\\users\\user\\anaconda3\\lib\\site-packages (from kaggle) (1.16.0)\n",
"Requirement already satisfied: tqdm in c:\\users\\user\\anaconda3\\lib\\site-packages (from kaggle) (4.62.3)\n",
"Requirement already satisfied: certifi in c:\\users\\user\\anaconda3\\lib\\site-packages (from kaggle) (2021.10.8)\n",
"Requirement already satisfied: text-unidecode>=1.3 in c:\\users\\user\\anaconda3\\lib\\site-packages (from python-slugify->kaggle) (1.3)\n",
"Requirement already satisfied: charset-normalizer~=2.0.0 in c:\\users\\user\\anaconda3\\lib\\site-packages (from requests->kaggle) (2.0.4)\n",
"Requirement already satisfied: idna<4,>=2.5 in c:\\users\\user\\anaconda3\\lib\\site-packages (from requests->kaggle) (3.2)\n",
"Requirement already satisfied: colorama in c:\\users\\user\\anaconda3\\lib\\site-packages (from tqdm->kaggle) (0.4.4)\n",
"Requirement already satisfied: pandas in c:\\users\\user\\anaconda3\\lib\\site-packages (1.3.4)\n",
"Requirement already satisfied: pytz>=2017.3 in c:\\users\\user\\anaconda3\\lib\\site-packages (from pandas) (2021.3)\n",
"Requirement already satisfied: python-dateutil>=2.7.3 in c:\\users\\user\\anaconda3\\lib\\site-packages (from pandas) (2.8.2)\n",
"Requirement already satisfied: numpy>=1.17.3 in c:\\users\\user\\anaconda3\\lib\\site-packages (from pandas) (1.20.3)\n",
"Requirement already satisfied: six>=1.5 in c:\\users\\user\\anaconda3\\lib\\site-packages (from python-dateutil>=2.7.3->pandas) (1.16.0)\n",
"Requirement already satisfied: seaborn in c:\\users\\user\\anaconda3\\lib\\site-packages (0.11.2)\n",
"Requirement already satisfied: numpy>=1.15 in c:\\users\\user\\anaconda3\\lib\\site-packages (from seaborn) (1.20.3)\n",
"Requirement already satisfied: matplotlib>=2.2 in c:\\users\\user\\anaconda3\\lib\\site-packages (from seaborn) (3.4.3)\n",
"Requirement already satisfied: scipy>=1.0 in c:\\users\\user\\anaconda3\\lib\\site-packages (from seaborn) (1.7.1)\n",
"Requirement already satisfied: pandas>=0.23 in c:\\users\\user\\anaconda3\\lib\\site-packages (from seaborn) (1.3.4)\n",
"Requirement already satisfied: cycler>=0.10 in c:\\users\\user\\anaconda3\\lib\\site-packages (from matplotlib>=2.2->seaborn) (0.10.0)\n",
"Requirement already satisfied: pillow>=6.2.0 in c:\\users\\user\\anaconda3\\lib\\site-packages (from matplotlib>=2.2->seaborn) (8.4.0)\n",
"Requirement already satisfied: pyparsing>=2.2.1 in c:\\users\\user\\anaconda3\\lib\\site-packages (from matplotlib>=2.2->seaborn) (3.0.4)\n",
"Requirement already satisfied: kiwisolver>=1.0.1 in c:\\users\\user\\anaconda3\\lib\\site-packages (from matplotlib>=2.2->seaborn) (1.3.1)\n",
"Requirement already satisfied: python-dateutil>=2.7 in c:\\users\\user\\anaconda3\\lib\\site-packages (from matplotlib>=2.2->seaborn) (2.8.2)\n",
"Requirement already satisfied: six in c:\\users\\user\\anaconda3\\lib\\site-packages (from cycler>=0.10->matplotlib>=2.2->seaborn) (1.16.0)\n",
"Requirement already satisfied: pytz>=2017.3 in c:\\users\\user\\anaconda3\\lib\\site-packages (from pandas>=0.23->seaborn) (2021.3)\n"
]
}
],
"source": [
"# Install the dependencies into the environment of the running kernel\n",
"# (%pip targets the kernel's interpreter; !pip may hit a different one).\n",
"# Versions this notebook was last run with: kaggle 1.5.12, pandas 1.3.4, seaborn 0.11.2.\n",
"%pip install kaggle pandas seaborn"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"adult-income-dataset.zip: Skipping, found more recently modified local copy (use --force to force download)\n"
]
}
],
"source": [
"# Fetch the Adult Income archive with the Kaggle CLI.\n",
"# The CLI skips the download when a more recent local copy exists (pass --force to override).\n",
"!kaggle datasets download -d wenruliu/adult-income-dataset"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Extract the archive with the standard library instead of the `unzip`\n",
"# shell tool, which is not available in every shell (it failed on Windows).\n",
"import zipfile\n",
"\n",
"with zipfile.ZipFile('adult-income-dataset.zip') as archive:\n",
"    archive.extractall()  # existing files are overwritten, like `unzip -o`"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>age</th>\n",
" <th>workclass</th>\n",
" <th>fnlwgt</th>\n",
" <th>education</th>\n",
" <th>educational-num</th>\n",
" <th>marital-status</th>\n",
" <th>occupation</th>\n",
" <th>relationship</th>\n",
" <th>race</th>\n",
" <th>gender</th>\n",
" <th>capital-gain</th>\n",
" <th>capital-loss</th>\n",
" <th>hours-per-week</th>\n",
" <th>native-country</th>\n",
" <th>income</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>25</td>\n",
" <td>Private</td>\n",
" <td>226802</td>\n",
" <td>11th</td>\n",
" <td>7</td>\n",
" <td>Never-married</td>\n",
" <td>Machine-op-inspct</td>\n",
" <td>Own-child</td>\n",
" <td>Black</td>\n",
" <td>Male</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>40</td>\n",
" <td>United-States</td>\n",
" <td>&lt;=50K</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>38</td>\n",
" <td>Private</td>\n",
" <td>89814</td>\n",
" <td>HS-grad</td>\n",
" <td>9</td>\n",
" <td>Married-civ-spouse</td>\n",
" <td>Farming-fishing</td>\n",
" <td>Husband</td>\n",
" <td>White</td>\n",
" <td>Male</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>50</td>\n",
" <td>United-States</td>\n",
" <td>&lt;=50K</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>28</td>\n",
" <td>Local-gov</td>\n",
" <td>336951</td>\n",
" <td>Assoc-acdm</td>\n",
" <td>12</td>\n",
" <td>Married-civ-spouse</td>\n",
" <td>Protective-serv</td>\n",
" <td>Husband</td>\n",
" <td>White</td>\n",
" <td>Male</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>40</td>\n",
" <td>United-States</td>\n",
" <td>&gt;50K</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>44</td>\n",
" <td>Private</td>\n",
" <td>160323</td>\n",
" <td>Some-college</td>\n",
" <td>10</td>\n",
" <td>Married-civ-spouse</td>\n",
" <td>Machine-op-inspct</td>\n",
" <td>Husband</td>\n",
" <td>Black</td>\n",
" <td>Male</td>\n",
" <td>7688</td>\n",
" <td>0</td>\n",
" <td>40</td>\n",
" <td>United-States</td>\n",
" <td>&gt;50K</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>18</td>\n",
" <td>?</td>\n",
" <td>103497</td>\n",
" <td>Some-college</td>\n",
" <td>10</td>\n",
" <td>Never-married</td>\n",
" <td>?</td>\n",
" <td>Own-child</td>\n",
" <td>White</td>\n",
" <td>Female</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>30</td>\n",
" <td>United-States</td>\n",
" <td>&lt;=50K</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>48837</th>\n",
" <td>27</td>\n",
" <td>Private</td>\n",
" <td>257302</td>\n",
" <td>Assoc-acdm</td>\n",
" <td>12</td>\n",
" <td>Married-civ-spouse</td>\n",
" <td>Tech-support</td>\n",
" <td>Wife</td>\n",
" <td>White</td>\n",
" <td>Female</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>38</td>\n",
" <td>United-States</td>\n",
" <td>&lt;=50K</td>\n",
" </tr>\n",
" <tr>\n",
" <th>48838</th>\n",
" <td>40</td>\n",
" <td>Private</td>\n",
" <td>154374</td>\n",
" <td>HS-grad</td>\n",
" <td>9</td>\n",
" <td>Married-civ-spouse</td>\n",
" <td>Machine-op-inspct</td>\n",
" <td>Husband</td>\n",
" <td>White</td>\n",
" <td>Male</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>40</td>\n",
" <td>United-States</td>\n",
" <td>&gt;50K</td>\n",
" </tr>\n",
" <tr>\n",
" <th>48839</th>\n",
" <td>58</td>\n",
" <td>Private</td>\n",
" <td>151910</td>\n",
" <td>HS-grad</td>\n",
" <td>9</td>\n",
" <td>Widowed</td>\n",
" <td>Adm-clerical</td>\n",
" <td>Unmarried</td>\n",
" <td>White</td>\n",
" <td>Female</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>40</td>\n",
" <td>United-States</td>\n",
" <td>&lt;=50K</td>\n",
" </tr>\n",
" <tr>\n",
" <th>48840</th>\n",
" <td>22</td>\n",
" <td>Private</td>\n",
" <td>201490</td>\n",
" <td>HS-grad</td>\n",
" <td>9</td>\n",
" <td>Never-married</td>\n",
" <td>Adm-clerical</td>\n",
" <td>Own-child</td>\n",
" <td>White</td>\n",
" <td>Male</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>20</td>\n",
" <td>United-States</td>\n",
" <td>&lt;=50K</td>\n",
" </tr>\n",
" <tr>\n",
" <th>48841</th>\n",
" <td>52</td>\n",
" <td>Self-emp-inc</td>\n",
" <td>287927</td>\n",
" <td>HS-grad</td>\n",
" <td>9</td>\n",
" <td>Married-civ-spouse</td>\n",
" <td>Exec-managerial</td>\n",
" <td>Wife</td>\n",
" <td>White</td>\n",
" <td>Female</td>\n",
" <td>15024</td>\n",
" <td>0</td>\n",
" <td>40</td>\n",
" <td>United-States</td>\n",
" <td>&gt;50K</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>48842 rows × 15 columns</p>\n",
"</div>"
],
"text/plain": [
" age workclass fnlwgt education educational-num \\\n",
"0 25 Private 226802 11th 7 \n",
"1 38 Private 89814 HS-grad 9 \n",
"2 28 Local-gov 336951 Assoc-acdm 12 \n",
"3 44 Private 160323 Some-college 10 \n",
"4 18 ? 103497 Some-college 10 \n",
"... ... ... ... ... ... \n",
"48837 27 Private 257302 Assoc-acdm 12 \n",
"48838 40 Private 154374 HS-grad 9 \n",
"48839 58 Private 151910 HS-grad 9 \n",
"48840 22 Private 201490 HS-grad 9 \n",
"48841 52 Self-emp-inc 287927 HS-grad 9 \n",
"\n",
" marital-status occupation relationship race gender \\\n",
"0 Never-married Machine-op-inspct Own-child Black Male \n",
"1 Married-civ-spouse Farming-fishing Husband White Male \n",
"2 Married-civ-spouse Protective-serv Husband White Male \n",
"3 Married-civ-spouse Machine-op-inspct Husband Black Male \n",
"4 Never-married ? Own-child White Female \n",
"... ... ... ... ... ... \n",
"48837 Married-civ-spouse Tech-support Wife White Female \n",
"48838 Married-civ-spouse Machine-op-inspct Husband White Male \n",
"48839 Widowed Adm-clerical Unmarried White Female \n",
"48840 Never-married Adm-clerical Own-child White Male \n",
"48841 Married-civ-spouse Exec-managerial Wife White Female \n",
"\n",
" capital-gain capital-loss hours-per-week native-country income \n",
"0 0 0 40 United-States <=50K \n",
"1 0 0 50 United-States <=50K \n",
"2 0 0 40 United-States >50K \n",
"3 7688 0 40 United-States >50K \n",
"4 0 0 30 United-States <=50K \n",
"... ... ... ... ... ... \n",
"48837 0 0 38 United-States <=50K \n",
"48838 0 0 40 United-States >50K \n",
"48839 0 0 40 United-States <=50K \n",
"48840 0 0 20 United-States <=50K \n",
"48841 15024 0 40 United-States >50K \n",
"\n",
"[48842 rows x 15 columns]"
]
},
"execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"import pandas as pd\n",
"\n",
"# Read the dataset CSV into a DataFrame and show it as the cell result.\n",
"df = pd.read_csv('adult-income-dataset.csv')\n",
"df"
]
},
{
"cell_type": "code",
"execution_count": 23,
"metadata": {},
"outputs": [],
"source": [
"# Remove incomplete records. Missing values are encoded as '?' and show up in\n",
"# more than one column (e.g. workclass and occupation), so every column is\n",
"# checked rather than workclass alone.\n",
"# Filtering with a boolean mask keeps this cell idempotent when it is re-run.\n",
"df = df[~df.isin(['?']).any(axis=1)]"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"# 80/20 train/test split performed directly in pandas.\n",
"# torch.utils.data.random_split wraps its input in Subset objects that index\n",
"# it by integer position; a DataFrame treats df[int] as a column-label lookup,\n",
"# so those subsets cannot be read row by row. Sampling rows yields DataFrames.\n",
"SPLIT_SEED = 42  # fixed seed so the split is reproducible across runs\n",
"\n",
"train_size = int(0.8 * len(df))\n",
"test_size = len(df) - train_size\n",
"df_train = df.sample(n=train_size, random_state=SPLIT_SEED)\n",
"df_test = df.drop(df_train.index)  # everything that was not sampled\n",
"\n",
"# The two parts must partition the dataset exactly.\n",
"assert len(df_train) + len(df_test) == len(df)"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Wielkosc zbioru: 48842, podzbiór train: 39073, podzbiór test 9769.\n"
]
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>age</th>\n",
" <th>workclass</th>\n",
" <th>fnlwgt</th>\n",
" <th>education</th>\n",
" <th>educational-num</th>\n",
" <th>marital-status</th>\n",
" <th>occupation</th>\n",
" <th>relationship</th>\n",
" <th>race</th>\n",
" <th>gender</th>\n",
" <th>capital-gain</th>\n",
" <th>capital-loss</th>\n",
" <th>hours-per-week</th>\n",
" <th>native-country</th>\n",
" <th>income</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>count</th>\n",
" <td>48842.000000</td>\n",
" <td>48842</td>\n",
" <td>4.884200e+04</td>\n",
" <td>48842</td>\n",
" <td>48842.000000</td>\n",
" <td>48842</td>\n",
" <td>48842</td>\n",
" <td>48842</td>\n",
" <td>48842</td>\n",
" <td>48842</td>\n",
" <td>48842.000000</td>\n",
" <td>48842.000000</td>\n",
" <td>48842.000000</td>\n",
" <td>48842</td>\n",
" <td>48842</td>\n",
" </tr>\n",
" <tr>\n",
" <th>unique</th>\n",
" <td>NaN</td>\n",
" <td>9</td>\n",
" <td>NaN</td>\n",
" <td>16</td>\n",
" <td>NaN</td>\n",
" <td>7</td>\n",
" <td>15</td>\n",
" <td>6</td>\n",
" <td>5</td>\n",
" <td>2</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>42</td>\n",
" <td>2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>top</th>\n",
" <td>NaN</td>\n",
" <td>Private</td>\n",
" <td>NaN</td>\n",
" <td>HS-grad</td>\n",
" <td>NaN</td>\n",
" <td>Married-civ-spouse</td>\n",
" <td>Prof-specialty</td>\n",
" <td>Husband</td>\n",
" <td>White</td>\n",
" <td>Male</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>United-States</td>\n",
" <td>&lt;=50K</td>\n",
" </tr>\n",
" <tr>\n",
" <th>freq</th>\n",
" <td>NaN</td>\n",
" <td>33906</td>\n",
" <td>NaN</td>\n",
" <td>15784</td>\n",
" <td>NaN</td>\n",
" <td>22379</td>\n",
" <td>6172</td>\n",
" <td>19716</td>\n",
" <td>41762</td>\n",
" <td>32650</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>43832</td>\n",
" <td>37155</td>\n",
" </tr>\n",
" <tr>\n",
" <th>mean</th>\n",
" <td>38.643585</td>\n",
" <td>NaN</td>\n",
" <td>1.896641e+05</td>\n",
" <td>NaN</td>\n",
" <td>10.078089</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>1079.067626</td>\n",
" <td>87.502314</td>\n",
" <td>40.422382</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>std</th>\n",
" <td>13.710510</td>\n",
" <td>NaN</td>\n",
" <td>1.056040e+05</td>\n",
" <td>NaN</td>\n",
" <td>2.570973</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>7452.019058</td>\n",
" <td>403.004552</td>\n",
" <td>12.391444</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>min</th>\n",
" <td>17.000000</td>\n",
" <td>NaN</td>\n",
" <td>1.228500e+04</td>\n",
" <td>NaN</td>\n",
" <td>1.000000</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>1.000000</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>25%</th>\n",
" <td>28.000000</td>\n",
" <td>NaN</td>\n",
" <td>1.175505e+05</td>\n",
" <td>NaN</td>\n",
" <td>9.000000</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>40.000000</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>50%</th>\n",
" <td>37.000000</td>\n",
" <td>NaN</td>\n",
" <td>1.781445e+05</td>\n",
" <td>NaN</td>\n",
" <td>10.000000</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>40.000000</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>75%</th>\n",
" <td>48.000000</td>\n",
" <td>NaN</td>\n",
" <td>2.376420e+05</td>\n",
" <td>NaN</td>\n",
" <td>12.000000</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>45.000000</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>max</th>\n",
" <td>90.000000</td>\n",
" <td>NaN</td>\n",
" <td>1.490400e+06</td>\n",
" <td>NaN</td>\n",
" <td>16.000000</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>99999.000000</td>\n",
" <td>4356.000000</td>\n",
" <td>99.000000</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" age workclass fnlwgt education educational-num \\\n",
"count 48842.000000 48842 4.884200e+04 48842 48842.000000 \n",
"unique NaN 9 NaN 16 NaN \n",
"top NaN Private NaN HS-grad NaN \n",
"freq NaN 33906 NaN 15784 NaN \n",
"mean 38.643585 NaN 1.896641e+05 NaN 10.078089 \n",
"std 13.710510 NaN 1.056040e+05 NaN 2.570973 \n",
"min 17.000000 NaN 1.228500e+04 NaN 1.000000 \n",
"25% 28.000000 NaN 1.175505e+05 NaN 9.000000 \n",
"50% 37.000000 NaN 1.781445e+05 NaN 10.000000 \n",
"75% 48.000000 NaN 2.376420e+05 NaN 12.000000 \n",
"max 90.000000 NaN 1.490400e+06 NaN 16.000000 \n",
"\n",
" marital-status occupation relationship race gender \\\n",
"count 48842 48842 48842 48842 48842 \n",
"unique 7 15 6 5 2 \n",
"top Married-civ-spouse Prof-specialty Husband White Male \n",
"freq 22379 6172 19716 41762 32650 \n",
"mean NaN NaN NaN NaN NaN \n",
"std NaN NaN NaN NaN NaN \n",
"min NaN NaN NaN NaN NaN \n",
"25% NaN NaN NaN NaN NaN \n",
"50% NaN NaN NaN NaN NaN \n",
"75% NaN NaN NaN NaN NaN \n",
"max NaN NaN NaN NaN NaN \n",
"\n",
" capital-gain capital-loss hours-per-week native-country income \n",
"count 48842.000000 48842.000000 48842.000000 48842 48842 \n",
"unique NaN NaN NaN 42 2 \n",
"top NaN NaN NaN United-States <=50K \n",
"freq NaN NaN NaN 43832 37155 \n",
"mean 1079.067626 87.502314 40.422382 NaN NaN \n",
"std 7452.019058 403.004552 12.391444 NaN NaN \n",
"min 0.000000 0.000000 1.000000 NaN NaN \n",
"25% 0.000000 0.000000 40.000000 NaN NaN \n",
"50% 0.000000 0.000000 40.000000 NaN NaN \n",
"75% 0.000000 0.000000 45.000000 NaN NaN \n",
"max 99999.000000 4356.000000 99.000000 NaN NaN "
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Report the dataset and split sizes, then show summary statistics for every column.\n",
"size_report = f\"Wielkosc zbioru: {len(df)}, podzbiór train: {train_size}, podzbiór test {test_size}.\"\n",
"print(size_report)\n",
"df.describe(include='all')"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"<AxesSubplot:title={'center':'income'}>"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
},
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAYQAAAEiCAYAAAD5+KUgAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjQuMywgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/MnkTPAAAACXBIWXMAAAsTAAALEwEAmpwYAAAXhElEQVR4nO3df7DddX3n8eeLBDFqQYQrG3OjoZLuGpgalhizq7PVwizRTgt2ob38IdmaTlyKO7bTmS24namd2ezAH4rLWGixUALTNWRQS0aJXTboaqc08eIiEJDlTgGJycJVfhinCzXxvX+cz60nl8O95+bHPYHzfMx853zO+/v9fO/7yyS87vfHOUlVIUnScYNuQJJ0bDAQJEmAgSBJagwESRJgIEiSGgNBkgQYCBJJdiV536D7kAYtfg5BkgSeIUiSGgNBQy/J40nOS/LJJFuS3JJkX7uUtKpru6VJvphkMskPk3y21Y9L8odJnkjydJt/Ulu3LEkl+a0kTyZ5Nsl/SPKuJPcneW5qP10/5yNJHm7b/nWSt83vfxENKwNBOtivAZuBNwJbgan/6S8Avgw8ASwDlrTtAP59W94P/Dzwhql5Xd4NLAd+E/gM8J+B84Azgd9I8kvt51wIfAL4dWAE+Cbw+SN5gNLL8R6Chl6Sx4HfBt4LvLeqzmv1FcC9VbUoyb+iExCLq2r/tPnbgS9U1XXt/T8HHgQWAaPAY8BoVX2/rf8h8DtVdVt7/wXgm1X1mSTbgNur6sa27jjgx8A7quqJo/nfQfIMQTrY/+0a/wPw2iQLgaXAE9PDoHkLnTOHKU8AC4HTumpPdY3/X4/3b2jjtwH/rV1Keg54BgidMxLpqDIQpP48Cby1hcN0e+j8j3zKW4H9HPw//bn8nI9W1Ru7lkVV9beHsC9pTgwEqT87gb3AVUlen+S1Sd7T1n0e+L0kpyd5A/Bfgdte5mxiNn8KXJnkTIAkJyW5+EgcgDQbA0HqQ1UdAH4VOAP4HrCbzg1igJuAW4Fv0Llf8ALwHw/x53wJuBrYnORHdO5FfOCwmpf65E1lSRLgGYIkqTEQJEmAgSBJagwESRLQ+fDMK9Kpp55ay5YtG3QbkvSKcu+99/6gqkZ6rXvFBsKyZcsYHx8fdBuS9IqS5GW/AsVLRpIkwECQJDUGgiQJMBAkSY2BIEkCDARJUmMgSJIAA0GS1BgIkiTgFfxJ5VeKZVd8ZdAtvKo8ftWvDLoF6VXLMwRJEmAgSJIaA0GSBBgIkqTGQJAkAQaCJKkxECRJgIEgSWpmDYQkr02yM8l3kuxK8set/skk309yX1s+2DXnyiQTSR5Jcn5X/ZwkD7R11yZJq5+Q5LZW35Fk2VE4VknSDPo5Q3gR+OWqeiewElibZE1bd01VrWzLnQBJVgBjwJnAWuC6JAva9tcDG4DlbVnb6uuBZ6vqDOAa4OrDPjJJ0pzMGgjV8eP29vi21AxTLgA2V9WLVfUYMAGsTrIYOLGq7qmqAm4BLuyas6mNbwfOnTp7kCTNj77uISRZkOQ+4Gngrqra0VZ9LMn9SW5KcnKrLQGe7Jq+u9WWtPH0+kFzqmo/8DxwSo8+NiQZTzI+OTnZT+uSpD71FQhVdaCqVgKjdH7bP4vO5Z+307mMtBf4VNu812/2NUN9pjnT+7ihqlZV1aqRkZF+Wpck9WlOTxlV1XPA14G1VfVUC4qfAp8DVrfNdgNLu6aNAntafbRH/aA5SRYCJwHPzKU3SdLh6ecpo5Ekb2zjRcB5wHfbPYEpHwIebOOtwFh7cuh0OjePd1bVXmBfkjXt/sClwB1dc9a18UXA3e0+gyRpnvTz7yEsBja1J4WOA7ZU1ZeT3JpkJZ1LO48DHwWoql1JtgAPAfuBy6vqQNvXZcDNwCJgW1sAbgRuTTJB58xg7PAPTZI0F7MGQlXdD5zdo/7hGeZsBDb2qI8DZ/WovwBcPFsvkqSjx08qS5IAA0GS1BgIkiTAQJAkNQ
aCJAkwECRJjYEgSQIMBElSYyBIkgADQZLUGAiSJMBAkCQ1BoIkCTAQJEmNgSBJAgwESVJjIEiSAANBktTMGghJXptkZ5LvJNmV5I9b/U1J7kryaHs9uWvOlUkmkjyS5Pyu+jlJHmjrrk2SVj8hyW2tviPJsqNwrJKkGfRzhvAi8MtV9U5gJbA2yRrgCmB7VS0Htrf3JFkBjAFnAmuB65IsaPu6HtgALG/L2lZfDzxbVWcA1wBXH/6hSZLmYtZAqI4ft7fHt6WAC4BNrb4JuLCNLwA2V9WLVfUYMAGsTrIYOLGq7qmqAm6ZNmdqX7cD506dPUiS5kdf9xCSLEhyH/A0cFdV7QBOq6q9AO31zW3zJcCTXdN3t9qSNp5eP2hOVe0HngdOOYTjkSQdor4CoaoOVNVKYJTOb/tnzbB5r9/sa4b6THMO3nGyIcl4kvHJyclZupYkzcWcnjKqqueAr9O59v9UuwxEe326bbYbWNo1bRTY0+qjPeoHzUmyEDgJeKbHz7+hqlZV1aqRkZG5tC5JmkU/TxmNJHljGy8CzgO+C2wF1rXN1gF3tPFWYKw9OXQ6nZvHO9tlpX1J1rT7A5dOmzO1r4uAu9t9BknSPFnYxzaLgU3tSaHjgC1V9eUk9wBbkqwHvgdcDFBVu5JsAR4C9gOXV9WBtq/LgJuBRcC2tgDcCNyaZILOmcHYkTg4SVL/Zg2EqrofOLtH/YfAuS8zZyOwsUd9HHjJ/YeqeoEWKJKkwfCTypIkwECQJDUGgiQJMBAkSY2BIEkCDARJUmMgSJIAA0GS1BgIkiTAQJAkNQaCJAkwECRJjYEgSQIMBElSYyBIkgADQZLUGAiSJMBAkCQ1BoIkCegjEJIsTfK1JA8n2ZXk463+ySTfT3JfWz7YNefKJBNJHklyflf9nCQPtHXXJkmrn5DktlbfkWTZUThWSdIM+jlD2A/8flW9A1gDXJ5kRVt3TVWtbMudAG3dGHAmsBa4LsmCtv31wAZgeVvWtvp64NmqOgO4Brj68A9NkjQXswZCVe2tqm+38T7gYWDJDFMuADZX1YtV9RgwAaxOshg4saruqaoCbgEu7JqzqY1vB86dOnuQJM2POd1DaJdyzgZ2tNLHktyf5KYkJ7faEuDJrmm7W21JG0+vHzSnqvYDzwOnzKU3SdLh6TsQkrwB+ALwu1X1IzqXf94OrAT2Ap+a2rTH9JqhPtOc6T1sSDKeZHxycrLf1iVJfegrEJIcTycM/rKqvghQVU9V1YGq+inwOWB123w3sLRr+iiwp9VHe9QPmpNkIXAS8Mz0PqrqhqpaVVWrRkZG+jtCSVJf+nnKKMCNwMNV9emu+uKuzT4EPNjGW4Gx9uTQ6XRuHu+sqr3AviRr2j4vBe7omrOujS8C7m73GSRJ82RhH9u8B/gw8ECS+1rtE8AlSVbSubTzOPBRgKralWQL8BCdJ5Qur6oDbd5lwM3AImBbW6ATOLcmmaBzZjB2OAclSZq7WQOhqv6G3tf475xhzkZgY4/6OHBWj/oLwMWz9SJJOnr8pLIkCTAQJEmNgSBJAgwESVJjIEiSAANBktQYCJIkwECQJDUGgiQJMBAkSY2BIEkCDARJUmMgSJIAA0GS1BgIkiTAQJAkNQaCJAkwECRJjYEgSQL6CIQkS5N8LcnDSXYl+XirvynJXUkeba8nd825MslEkkeSnN9VPyfJA23dtUnS6ickua3VdyRZdhSOVZI0g37OEPYDv19V7wDWAJcnWQFcAWyvquXA9vaetm4MOBNYC1yXZEHb1/XABmB5W9a2+nrg2ao6A7gGuPoIHJskaQ5mDYSq2ltV327jfcDDwBLgAmBT22wTcGEbXwBsrqoXq+oxYAJYnWQxcGJV3VNVBdwybc7Uvm4Hzp06e5AkzY853UNol3LOBnYAp1XVXuiEBvDmttkS4MmuabtbbUkbT68fNKeq9gPPA6f0+PkbkownGZ+cnJxL65KkWfQdCEneAHwB+N2q+tFMm/
ao1Qz1meYcXKi6oapWVdWqkZGR2VqWJM1BX4GQ5Hg6YfCXVfXFVn6qXQaivT7d6ruBpV3TR4E9rT7ao37QnCQLgZOAZ+Z6MJKkQ9fPU0YBbgQerqpPd63aCqxr43XAHV31sfbk0Ol0bh7vbJeV9iVZ0/Z56bQ5U/u6CLi73WeQJM2ThX1s8x7gw8ADSe5rtU8AVwFbkqwHvgdcDFBVu5JsAR6i84TS5VV1oM27DLgZWARsawt0AufWJBN0zgzGDu+wJElzNWsgVNXf0PsaP8C5LzNnI7CxR30cOKtH/QVaoEiSBsNPKkuSAANBktQYCJIkwECQJDUGgiQJMBAkSY2BIEkCDARJUmMgSJIAA0GS1BgIkiTAQJAkNQaCJAkwECRJjYEgSQIMBElSYyBIkgADQZLUGAiSJKCPQEhyU5KnkzzYVftkku8nua8tH+xad2WSiSSPJDm/q35OkgfaumuTpNVPSHJbq+9IsuwIH6MkqQ/9nCHcDKztUb+mqla25U6AJCuAMeDMNue6JAva9tcDG4DlbZna53rg2ao6A7gGuPoQj0WSdBhmDYSq+gbwTJ/7uwDYXFUvVtVjwASwOsli4MSquqeqCrgFuLBrzqY2vh04d+rsQZI0fw7nHsLHktzfLimd3GpLgCe7ttndakvaeHr9oDlVtR94Hjil1w9MsiHJeJLxycnJw2hdkjTdoQbC9cDbgZXAXuBTrd7rN/uaoT7TnJcWq26oqlVVtWpkZGRODUuSZnZIgVBVT1XVgar6KfA5YHVbtRtY2rXpKLCn1Ud71A+ak2QhcBL9X6KSJB0hhxQI7Z7AlA8BU08gbQXG2pNDp9O5ebyzqvYC+5KsafcHLgXu6Jqzro0vAu5u9xkkSfNo4WwbJPk88D7g1CS7gT8C3pdkJZ1LO48DHwWoql1JtgAPAfuBy6vqQNvVZXSeWFoEbGsLwI3ArUkm6JwZjB2B45IkzdGsgVBVl/Qo3zjD9huBjT3q48BZPeovABfP1ock6ejyk8qSJMBAkCQ1BoIkCTAQJEmNgSBJAgwESVJjIEiSAANBktQYCJIkwECQJDUGgiQJMBAkSY2BIEkC+vi2U0mvTsuu+MqgW3hVefyqXxl0C4fNMwRJEmAgSJIaA0GSBBgIkqTGQJAkAX0EQpKbkjyd5MGu2puS3JXk0fZ6cte6K5NMJHkkyfld9XOSPNDWXZskrX5CkttafUeSZUf4GCVJfejnDOFmYO202hXA9qpaDmxv70myAhgDzmxzrkuyoM25HtgALG/L1D7XA89W1RnANcDVh3owkqRDN2sgVNU3gGemlS8ANrXxJuDCrvrmqnqxqh4DJoDVSRYDJ1bVPVVVwC3T5kzt63bg3KmzB0nS/DnUewinVdVegPb65lZfAjzZtd3uVlvSxtPrB82pqv3A88ApvX5okg1JxpOMT05OHmLrkqRejvRN5V6/2dcM9ZnmvLRYdUNVraqqVSMjI4fYoiSpl0MNhKfaZSDa69OtvhtY2rXdKLCn1Ud71A+ak2QhcBIvvUQlSTrKDjUQtgLr2ngdcEdXfaw9OXQ6nZvHO9tlpX1J1rT7A5dOmzO1r4uAu9t9BknSPJr1y+2SfB54H3Bqkt3AHwFXAVuSrAe+B1wMUFW7kmwBHgL2A5dX1YG2q8voPLG0CNjWFoAbgVuTTNA5Mxg7IkcmSZqTWQOhqi55mVXnvsz2G4GNPerjwFk96i/QAkWSNDh+UlmSBBgIkqTGQJAkAQaCJKkxECRJgIEgSWoMBEkSYCBIkhoDQZIEGAiSpMZAkCQBBoIkqTEQJEmAgSBJagwESRJgIEiSGgNBkgQYCJKkxkCQJAGHGQhJHk/yQJL7koy32puS3JXk0fZ6ctf2VyaZSPJIkvO76ue0/UwkuTZJDqcvSdLcHYkzhPdX1cqqWtXeXwFsr6rlwPb2niQrgDHgTGAtcF2SBW3O9cAGYHlb1h6BviRJc3A0LhldAGxq403AhV31zVX1YlU9Bk
wAq5MsBk6sqnuqqoBbuuZIkubJ4QZCAf8jyb1JNrTaaVW1F6C9vrnVlwBPds3d3WpL2nh6/SWSbEgynmR8cnLyMFuXJHVbeJjz31NVe5K8GbgryXdn2LbXfYGaof7SYtUNwA0Aq1at6rmNJOnQHNYZQlXtaa9PA18CVgNPtctAtNen2+a7gaVd00eBPa0+2qMuSZpHhxwISV6f5OemxsC/BR4EtgLr2mbrgDvaeCswluSEJKfTuXm8s11W2pdkTXu66NKuOZKkeXI4l4xOA77UnhBdCPz3qvpqkm8BW5KsB74HXAxQVbuSbAEeAvYDl1fVgbavy4CbgUXAtrZIkubRIQdCVf098M4e9R8C577MnI3Axh71ceCsQ+1FknT4/KSyJAkwECRJjYEgSQIMBElSYyBIkgADQZLUGAiSJMBAkCQ1BoIkCTAQJEmNgSBJAgwESVJjIEiSAANBktQYCJIkwECQJDUGgiQJMBAkSY2BIEkCjqFASLI2ySNJJpJcMeh+JGnYHBOBkGQB8CfAB4AVwCVJVgy2K0kaLsdEIACrgYmq+vuq+kdgM3DBgHuSpKGycNANNEuAJ7ve7wbePX2jJBuADe3tj5M8Mg+9DYtTgR8MuonZ5OpBd6AB8M/mkfW2l1txrARCetTqJYWqG4Abjn47wyfJeFWtGnQf0nT+2Zw/x8olo93A0q73o8CeAfUiSUPpWAmEbwHLk5ye5DXAGLB1wD1J0lA5Ji4ZVdX+JB8D/hpYANxUVbsG3Naw8VKcjlX+2ZwnqXrJpXpJ0hA6Vi4ZSZIGzECQJAEGgiSpMRAkHTOSjM6w7lfns5dhZCAMoSTrXqZ+fJLPz3c/UpftSZZNLyb5CPCZee9myBgIw+nj7WtA/kmS1wN3Av8wmJYkAH4PuCvJ8qlCkitb/ZcG1tWQOCY+h6B5dx7w1SSvraprk4zQCYPtVeVXj2tgqurOJC8C25JcCPw28C7g31TVswNtbgj4OYQhleREYBvwTTrfLHt9VV072K6kjiTvBf4K+FvgN6rqhcF2NBwMhCGU5Nfb8OeATwPb6XzlOABV9cVB9CUl2Ufniy0DnAD8BDjQ3ldVnTjA9l71DIQhlOQvZlhdVfWReWtG0jHDQJB0zElyGp1/J6WAPVX11IBbGgoGwpBK8i/o3Dv4p790wNaqenigjWmoJTkbuB44Cfh+K48CzwG/U1XfHlBrQ8FAGEJJ/gC4hM59g92tPErna8c3V9VVg+pNwy3JfcBHq2rHtPoa4M+q6p0DaWxIGAhDKMn/Ac6sqp9Mq78G2FVVy3vPlI6uJI++3J+/JBNVdcZ89zRM/BzCcPop8BbgiWn1xW2dNCjbknwFuIWf/TvrS4FLga8OrKsh4RnCEEqyFvgs8Cg/+0v3VuAM4GNV5V88DUySD/Cz+1uhc1lza1XdOdDGhoCBMKSSHAes5uC/dN+qqgMDbUzSwBgIAiDJm6rqmUH3oeGW5Ber6v42Ph74Azq/uDwI/Jeq8ru2jiK/3G4IJfnDrvGKdpP53iSPJ3n3AFuTbu4aX0XnMuangEXAnw6ioWHiGcIQSvLtqvqXbfwV4LNVtS3JauAzVfWvB9uhhlWS/11VZ7fxfcC7quonSQJ8p6p+caANvsr5lJHeUlXbAKpqZ5JFg25IQ+2kJB+ic/XihKlHo6uqkvjb61FmIAynn0+ylc7N5NEkr+u6Nnv8APuS/hfwa238d0lOq6qnkvwz4AcD7GsoeMloCCWZ/g+N3FtVP27fH3NRVf3JIPqSNFgGgqRjVpJVwP1V9Y+D7mUY+JTRkEvyn7pfpWNFksW0fyBn0L0MCwNBY9NepWPFOmATnX9GU/PAQNCUDLoBaZoPA1cCr0ny9kE3MwwMBEnHnCTvB75bVT8A/gJYP+CWhoKBIOlYtB64sY1vAy5u37+lo8j/wJKOKUneCKwBpj4w+SPg74APDrCtoeAH0/T19vq1QTYhTamq5+h8h1F37cOD6Wa4+DkESRLgJa
OhleR1Sd45rfbWJEsG1ZOkwTIQhtdPgC8meX1X7c/p/DOakoaQgTCk2rdIfgn4TeicHQAjVTU+0MYkDYyBMNz+HPitNr6UzvPekoaUTxkNsar6bhKS/AJwCfDeQfckaXA8Q9CNdM4U7q+qZwfdjKTB8bHTIZfkdcBe4N9V1f8cdD+SBsdAkCQBXjKSJDUGgiQJMBAkSY2BIEkC4P8D0n/9LGBOQzIAAAAASUVORK5CYII=",
"text/plain": [
"<Figure size 432x288 with 1 Axes>"
]
},
"metadata": {
"needs_background": "light"
},
"output_type": "display_data"
}
],
"source": [
"# Distribution of the income column. Axes are labelled so the figure can be\n",
"# read on its own; the trailing ';' hides the Axes repr from the cell output.\n",
"income_counts = df[\"income\"].value_counts()\n",
"ax = income_counts.plot(kind=\"bar\", title=\"income\")\n",
"ax.set(xlabel=\"income\", ylabel=\"count\");"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"interpreter": {
"hash": "2647ea34e536f865ab67ff9ddee7fd78773d956cec0cab53c79b32cd10da5d83"
},
"kernelspec": {
"display_name": "Python 3.9.11 64-bit",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.7"
},
"orig_nbformat": 2
},
"nbformat": 4,
"nbformat_minor": 2
}