{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "# Use %pip (not !pip) so packages are installed into the environment of the running kernel.\n", "%pip install -q kaggle pandas seaborn" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": 
[ { "name": "stdout", "output_type": "stream", "text": [ "adult-income-dataset.zip: Skipping, found more recently modified local copy (use --force to force download)\n" ] } ], "source": [ "!kaggle datasets download -d wenruliu/adult-income-dataset" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "import zipfile\n", "\n", "# The `unzip` CLI is not available in a stock Windows shell, so extract with the\n", "# standard library instead; this works the same on every platform.\n", "with zipfile.ZipFile(\"adult-income-dataset.zip\") as archive:\n", "    archive.extractall()  # overwrites existing files, matching `unzip -o`" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", " | age | \n", "workclass | \n", "fnlwgt | \n", "education | \n", "educational-num | \n", "marital-status | \n", "occupation | \n", "relationship | \n", "race | \n", "gender | \n", "capital-gain | \n", "capital-loss | \n", "hours-per-week | \n", "native-country | \n", "income | \n", "
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | \n", "25 | \n", "Private | \n", "226802 | \n", "11th | \n", "7 | \n", "Never-married | \n", "Machine-op-inspct | \n", "Own-child | \n", "Black | \n", "Male | \n", "0 | \n", "0 | \n", "40 | \n", "United-States | \n", "<=50K | \n", "
1 | \n", "38 | \n", "Private | \n", "89814 | \n", "HS-grad | \n", "9 | \n", "Married-civ-spouse | \n", "Farming-fishing | \n", "Husband | \n", "White | \n", "Male | \n", "0 | \n", "0 | \n", "50 | \n", "United-States | \n", "<=50K | \n", "
2 | \n", "28 | \n", "Local-gov | \n", "336951 | \n", "Assoc-acdm | \n", "12 | \n", "Married-civ-spouse | \n", "Protective-serv | \n", "Husband | \n", "White | \n", "Male | \n", "0 | \n", "0 | \n", "40 | \n", "United-States | \n", ">50K | \n", "
3 | \n", "44 | \n", "Private | \n", "160323 | \n", "Some-college | \n", "10 | \n", "Married-civ-spouse | \n", "Machine-op-inspct | \n", "Husband | \n", "Black | \n", "Male | \n", "7688 | \n", "0 | \n", "40 | \n", "United-States | \n", ">50K | \n", "
4 | \n", "18 | \n", "? | \n", "103497 | \n", "Some-college | \n", "10 | \n", "Never-married | \n", "? | \n", "Own-child | \n", "White | \n", "Female | \n", "0 | \n", "0 | \n", "30 | \n", "United-States | \n", "<=50K | \n", "
... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "
48837 | \n", "27 | \n", "Private | \n", "257302 | \n", "Assoc-acdm | \n", "12 | \n", "Married-civ-spouse | \n", "Tech-support | \n", "Wife | \n", "White | \n", "Female | \n", "0 | \n", "0 | \n", "38 | \n", "United-States | \n", "<=50K | \n", "
48838 | \n", "40 | \n", "Private | \n", "154374 | \n", "HS-grad | \n", "9 | \n", "Married-civ-spouse | \n", "Machine-op-inspct | \n", "Husband | \n", "White | \n", "Male | \n", "0 | \n", "0 | \n", "40 | \n", "United-States | \n", ">50K | \n", "
48839 | \n", "58 | \n", "Private | \n", "151910 | \n", "HS-grad | \n", "9 | \n", "Widowed | \n", "Adm-clerical | \n", "Unmarried | \n", "White | \n", "Female | \n", "0 | \n", "0 | \n", "40 | \n", "United-States | \n", "<=50K | \n", "
48840 | \n", "22 | \n", "Private | \n", "201490 | \n", "HS-grad | \n", "9 | \n", "Never-married | \n", "Adm-clerical | \n", "Own-child | \n", "White | \n", "Male | \n", "0 | \n", "0 | \n", "20 | \n", "United-States | \n", "<=50K | \n", "
48841 | \n", "52 | \n", "Self-emp-inc | \n", "287927 | \n", "HS-grad | \n", "9 | \n", "Married-civ-spouse | \n", "Exec-managerial | \n", "Wife | \n", "White | \n", "Female | \n", "15024 | \n", "0 | \n", "40 | \n", "United-States | \n", ">50K | \n", "
48842 rows × 15 columns
\n", "\n", " | age | \n", "workclass | \n", "fnlwgt | \n", "education | \n", "educational-num | \n", "marital-status | \n", "occupation | \n", "relationship | \n", "race | \n", "gender | \n", "capital-gain | \n", "capital-loss | \n", "hours-per-week | \n", "native-country | \n", "income | \n", "
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
count | \n", "48842.000000 | \n", "48842 | \n", "4.884200e+04 | \n", "48842 | \n", "48842.000000 | \n", "48842 | \n", "48842 | \n", "48842 | \n", "48842 | \n", "48842 | \n", "48842.000000 | \n", "48842.000000 | \n", "48842.000000 | \n", "48842 | \n", "48842 | \n", "
unique | \n", "NaN | \n", "9 | \n", "NaN | \n", "16 | \n", "NaN | \n", "7 | \n", "15 | \n", "6 | \n", "5 | \n", "2 | \n", "NaN | \n", "NaN | \n", "NaN | \n", "42 | \n", "2 | \n", "
top | \n", "NaN | \n", "Private | \n", "NaN | \n", "HS-grad | \n", "NaN | \n", "Married-civ-spouse | \n", "Prof-specialty | \n", "Husband | \n", "White | \n", "Male | \n", "NaN | \n", "NaN | \n", "NaN | \n", "United-States | \n", "<=50K | \n", "
freq | \n", "NaN | \n", "33906 | \n", "NaN | \n", "15784 | \n", "NaN | \n", "22379 | \n", "6172 | \n", "19716 | \n", "41762 | \n", "32650 | \n", "NaN | \n", "NaN | \n", "NaN | \n", "43832 | \n", "37155 | \n", "
mean | \n", "38.643585 | \n", "NaN | \n", "1.896641e+05 | \n", "NaN | \n", "10.078089 | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "1079.067626 | \n", "87.502314 | \n", "40.422382 | \n", "NaN | \n", "NaN | \n", "
std | \n", "13.710510 | \n", "NaN | \n", "1.056040e+05 | \n", "NaN | \n", "2.570973 | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "7452.019058 | \n", "403.004552 | \n", "12.391444 | \n", "NaN | \n", "NaN | \n", "
min | \n", "17.000000 | \n", "NaN | \n", "1.228500e+04 | \n", "NaN | \n", "1.000000 | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "0.000000 | \n", "0.000000 | \n", "1.000000 | \n", "NaN | \n", "NaN | \n", "
25% | \n", "28.000000 | \n", "NaN | \n", "1.175505e+05 | \n", "NaN | \n", "9.000000 | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "0.000000 | \n", "0.000000 | \n", "40.000000 | \n", "NaN | \n", "NaN | \n", "
50% | \n", "37.000000 | \n", "NaN | \n", "1.781445e+05 | \n", "NaN | \n", "10.000000 | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "0.000000 | \n", "0.000000 | \n", "40.000000 | \n", "NaN | \n", "NaN | \n", "
75% | \n", "48.000000 | \n", "NaN | \n", "2.376420e+05 | \n", "NaN | \n", "12.000000 | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "0.000000 | \n", "0.000000 | \n", "45.000000 | \n", "NaN | \n", "NaN | \n", "
max | \n", "90.000000 | \n", "NaN | \n", "1.490400e+06 | \n", "NaN | \n", "16.000000 | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "99999.000000 | \n", "4356.000000 | \n", "99.000000 | \n", "NaN | \n", "NaN | \n", "