ium_470618/dane.ipynb

812 lines
57 KiB
Plaintext
Raw Normal View History

2023-03-21 22:00:29 +01:00
{
"cells": [
{
"cell_type": "code",
"execution_count": 2,
"id": "3473477b",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Downloading titanic.zip to /home/gedin/Studia/InzUczeniaMaszynowego/zadania\n",
"100%|███████████████████████████████████████| 34.1k/34.1k [00:00<00:00, 212kB/s]\n",
"100%|███████████████████████████████████████| 34.1k/34.1k [00:00<00:00, 212kB/s]\n"
]
}
],
"source": [
"!kaggle competitions download -c titanic"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "0c37c704",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Archive: titanic.zip\r\n",
" inflating: gender_submission.csv \r\n",
" inflating: test.csv \r\n",
" inflating: train.csv \r\n"
]
}
],
"source": [
"!unzip titanic.zip"
]
},
{
"cell_type": "markdown",
"id": "b6adfde9",
"metadata": {},
"source": [
"### Dane o pliku"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "a9d9a8ee",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"892 train.csv\n",
"419 test.csv\n"
]
}
],
"source": [
"!wc -l train.csv\n",
"!wc -l test.csv"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "bf08fe16",
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd"
]
},
{
"cell_type": "code",
"execution_count": 9,
"id": "fc59f320",
"metadata": {},
"outputs": [],
"source": [
"df = pd.read_csv(\"train.csv\")"
]
},
{
"cell_type": "code",
"execution_count": 28,
"id": "aa5ea30b",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>PassengerId</th>\n",
" <th>Survived</th>\n",
" <th>Pclass</th>\n",
" <th>Name</th>\n",
" <th>Sex</th>\n",
" <th>Age</th>\n",
" <th>SibSp</th>\n",
" <th>Parch</th>\n",
" <th>Ticket</th>\n",
" <th>Fare</th>\n",
" <th>Cabin</th>\n",
" <th>Embarked</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>3</td>\n",
" <td>Braund, Mr. Owen Harris</td>\n",
" <td>male</td>\n",
" <td>22.0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>A/5 21171</td>\n",
" <td>7.2500</td>\n",
" <td>NaN</td>\n",
" <td>S</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>2</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>Cumings, Mrs. John Bradley (Florence Briggs Th...</td>\n",
" <td>female</td>\n",
" <td>38.0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>PC 17599</td>\n",
" <td>71.2833</td>\n",
" <td>C85</td>\n",
" <td>C</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>3</td>\n",
" <td>1</td>\n",
" <td>3</td>\n",
" <td>Heikkinen, Miss. Laina</td>\n",
" <td>female</td>\n",
" <td>26.0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>STON/O2. 3101282</td>\n",
" <td>7.9250</td>\n",
" <td>NaN</td>\n",
" <td>S</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>4</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>Futrelle, Mrs. Jacques Heath (Lily May Peel)</td>\n",
" <td>female</td>\n",
" <td>35.0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>113803</td>\n",
" <td>53.1000</td>\n",
" <td>C123</td>\n",
" <td>S</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>5</td>\n",
" <td>0</td>\n",
" <td>3</td>\n",
" <td>Allen, Mr. William Henry</td>\n",
" <td>male</td>\n",
" <td>35.0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>373450</td>\n",
" <td>8.0500</td>\n",
" <td>NaN</td>\n",
" <td>S</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" PassengerId Survived Pclass \\\n",
"0 1 0 3 \n",
"1 2 1 1 \n",
"2 3 1 3 \n",
"3 4 1 1 \n",
"4 5 0 3 \n",
"\n",
" Name Sex Age SibSp \\\n",
"0 Braund, Mr. Owen Harris male 22.0 1 \n",
"1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38.0 1 \n",
"2 Heikkinen, Miss. Laina female 26.0 0 \n",
"3 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35.0 1 \n",
"4 Allen, Mr. William Henry male 35.0 0 \n",
"\n",
" Parch Ticket Fare Cabin Embarked \n",
"0 0 A/5 21171 7.2500 NaN S \n",
"1 0 PC 17599 71.2833 C85 C \n",
"2 0 STON/O2. 3101282 7.9250 NaN S \n",
"3 0 113803 53.1000 C123 S \n",
"4 0 373450 8.0500 NaN S "
]
},
"execution_count": 28,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.head(5)"
]
},
{
"cell_type": "code",
"execution_count": 10,
"id": "32d4140c",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>PassengerId</th>\n",
" <th>Survived</th>\n",
" <th>Pclass</th>\n",
" <th>Age</th>\n",
" <th>SibSp</th>\n",
" <th>Parch</th>\n",
" <th>Fare</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>count</th>\n",
" <td>891.000000</td>\n",
" <td>891.000000</td>\n",
" <td>891.000000</td>\n",
" <td>714.000000</td>\n",
" <td>891.000000</td>\n",
" <td>891.000000</td>\n",
" <td>891.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>mean</th>\n",
" <td>446.000000</td>\n",
" <td>0.383838</td>\n",
" <td>2.308642</td>\n",
" <td>29.699118</td>\n",
" <td>0.523008</td>\n",
" <td>0.381594</td>\n",
" <td>32.204208</td>\n",
" </tr>\n",
" <tr>\n",
" <th>std</th>\n",
" <td>257.353842</td>\n",
" <td>0.486592</td>\n",
" <td>0.836071</td>\n",
" <td>14.526497</td>\n",
" <td>1.102743</td>\n",
" <td>0.806057</td>\n",
" <td>49.693429</td>\n",
" </tr>\n",
" <tr>\n",
" <th>min</th>\n",
" <td>1.000000</td>\n",
" <td>0.000000</td>\n",
" <td>1.000000</td>\n",
" <td>0.420000</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>25%</th>\n",
" <td>223.500000</td>\n",
" <td>0.000000</td>\n",
" <td>2.000000</td>\n",
" <td>20.125000</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>7.910400</td>\n",
" </tr>\n",
" <tr>\n",
" <th>50%</th>\n",
" <td>446.000000</td>\n",
" <td>0.000000</td>\n",
" <td>3.000000</td>\n",
" <td>28.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>14.454200</td>\n",
" </tr>\n",
" <tr>\n",
" <th>75%</th>\n",
" <td>668.500000</td>\n",
" <td>1.000000</td>\n",
" <td>3.000000</td>\n",
" <td>38.000000</td>\n",
" <td>1.000000</td>\n",
" <td>0.000000</td>\n",
" <td>31.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>max</th>\n",
" <td>891.000000</td>\n",
" <td>1.000000</td>\n",
" <td>3.000000</td>\n",
" <td>80.000000</td>\n",
" <td>8.000000</td>\n",
" <td>6.000000</td>\n",
" <td>512.329200</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" PassengerId Survived Pclass Age SibSp \\\n",
"count 891.000000 891.000000 891.000000 714.000000 891.000000 \n",
"mean 446.000000 0.383838 2.308642 29.699118 0.523008 \n",
"std 257.353842 0.486592 0.836071 14.526497 1.102743 \n",
"min 1.000000 0.000000 1.000000 0.420000 0.000000 \n",
"25% 223.500000 0.000000 2.000000 20.125000 0.000000 \n",
"50% 446.000000 0.000000 3.000000 28.000000 0.000000 \n",
"75% 668.500000 1.000000 3.000000 38.000000 1.000000 \n",
"max 891.000000 1.000000 3.000000 80.000000 8.000000 \n",
"\n",
" Parch Fare \n",
"count 891.000000 891.000000 \n",
"mean 0.381594 32.204208 \n",
"std 0.806057 49.693429 \n",
"min 0.000000 0.000000 \n",
"25% 0.000000 7.910400 \n",
"50% 0.000000 14.454200 \n",
"75% 0.000000 31.000000 \n",
"max 6.000000 512.329200 "
]
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.describe()"
]
},
{
"cell_type": "code",
"execution_count": 26,
"id": "920ea21b",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array([[<Axes: title={'center': 'Survived'}>,\n",
" <Axes: title={'center': 'Pclass'}>]], dtype=object)"
]
},
"execution_count": 26,
"metadata": {},
"output_type": "execute_result"
},
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAikAAAGzCAYAAADqhoemAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjcuMSwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/bCgiHAAAACXBIWXMAAA9hAAAPYQGoP6dpAAA9uUlEQVR4nO3dfVxUdd7/8TfgMIg6oKagiWS1qZh3acq0tVvGTcZaW1RaXUZl9YvQTdms3HW9rTC3zW1btK7W1bYu17TNbsyS0by5SkijLMRy7WaXXBvYLEUlhxHO7499MFcTIAwOzBd9PR8PHjnf8z3f8/3MgTPvzpwzE2ZZliUAAADDhId6AgAAAA0hpAAAACMRUgAAgJEIKQAAwEiEFAAAYCRCCgAAMBIhBQAAGImQAgAAjERIAQAARiKkoE3deuutOuuss0Ky7bCwMM2ZMyck2wZw8v7xj38oLCxMy5cvD/VU0EYIKae4kpISXXfddUpMTFRUVJTOPPNMpaam6sknnwz11ACcJpYvX66wsDDfT1RUlM477zxNnjxZ5eXloZ4eDNYh1BNA69m2bZsuu+wy9e3bV3feeafi4+P15ZdfqqioSE888YSmTJnS5nN65plnVFtb2+bbBRB68+bNU79+/XTs2DG9/fbbWrJkidatW6ddu3YpOjo61NODgQgpp7CHH35YMTEx2rFjh2JjY/2WVVRUBGUbR48eVadOnZrd32azBWW7ANqfsWPHauTIkZKkO+64Q927d9fjjz+uV155RTfeeGOIZwcT8XbPKeyzzz7ToEGD6gUUSerZs6ekE7/H+8NrOObMmaOwsDDt3r1bN910k7p27aqLL75Yjz32mMLCwvTPf/6z3hgzZsxQZGSkvv32W0n+16R4vV5169ZNt912W731KisrFRUVpfvuu8/X5vF4NHv2bJ177rmy2+1KSEjQ/fffL4/H47eux+PRtGnT1KNHD3Xp0kVXXXWV9u3b19TTBaCNjRkzRpL0xRdfSJIOHjyoadOm6ayzzpLdblefPn10yy236Ouvv250jI8++ki33nqrzj77bEVFRSk+Pl633367Dhw44Nfv8OHDmjp1qm/snj17KjU1Ve+//76vz969e5WZman4+HhFRUWpT58+mjBhgg4dOtQK1aM5OJNyCktMTFRhYaF27dql888/P2jjXn/99frRj36kRx55RJZl6Wc/+5nuv/9+rVq1StOnT/fru2rVKqWlpalr1671xrHZbLrmmmv00ksv6emnn1ZkZKRv2csvvyyPx6MJEyZIkmpra3XVVVfp7bff1l133aWBAweqpKREixYt0t///ne9/PLLvnXvuOMOPf/887rpppt00UUX6a233lJGRkbQ6gcQHJ999pkkqXv37jpy5IguueQSffzxx7r99tt1wQUX6Ouvv9arr76qffv26YwzzmhwDJfLpc8//1y33Xab4uPjVVpaqv/+7/9WaWmpioqKFBYWJkm6++679eKLL2ry5MlKSkrSgQMH9Pbbb+vjjz/WBRdcoOrqaqWnp8vj8WjKlCmKj4/Xv/71L61du1YHDx5UTExMmz0v+B4Lp6yCggIrIiLCioiIsJxOp3X//fdb69evt6qrq319vvjiC0uStWzZsnrrS7Jmz57tezx79mxLknXjjTfW6+t0Oq0RI0b4tW3fvt2SZP3lL3/xtWVlZVmJiYm+x+vXr7ckWa+99prfuldeeaV19tln+x4/99xzVnh4uPW///u/fv2eeuopS5L1zjvvWJZlWTt37rQkWffcc49fv5tuuqlePQDaxrJlyyxJ1oYNG6x///vf1pdffmmtXLnS6t69u9WxY0dr37591qxZsyxJ1ksvvVRv/draWsuyGj5eVVVV1ev/17/+1ZJkbd261dcWExNj5eTkNDrHDz74wJJkrV69+iQqRbDxds8pLDU1VYWFhbrqqqv04YcfauHChUpPT9eZZ56pV199tcXj3n333fXaxo8fr+LiYt//GUnSCy+8ILvdrquvvrrRscaMGaMzzjhDL7zwgq/t22+/lcvl0vjx431tq1ev1sCBAzVgwAB9/fXXvp+608WbNm2SJK1bt06S9Itf/MJvO1OnTg28UABBlZKSoh49eighIUETJkxQ586dtWbNGp155pn629/+pqFDh+qaa66pt17d2ZCGdOzY0ffvY8eO6euvv1ZycrIk+b2VExsbq3fffVf79+9vcJy6MyXr169XVVVVi+pD8BFSTnEXXnihXnrpJX377bfavn27ZsyYocOHD+u6667T7t27WzRmv3796rVdf/31Cg8P94UNy7K0evVqjR07Vg6Ho9GxOnTooMzMTL3yyiu+a0teeukleb1ev5Cyd+9elZaWqkePHn4/5513nqT/uxD4n//8p8LDw3XOOef4bad///4tqhVA8OTn58vlcmnTpk3avXu3Pv/8c6Wnp0v6z1s/LXlb+ptvvtG9996ruLg4dezYUT169PAdo75/LcnChQu1a9cuJSQkaNSoUZozZ44+//xz3/J+/fopNzdXf/rTn3TGGWcoPT1d+fn5XI8SYoSU00RkZKQuvPBCPfLII1qyZIm8Xq9Wr17d6P+h1NTUNDrW9//PpU7v3r11ySWXaNWqVZKkoqIilZWV+QWNxkyYMEGHDx/WG2+8Iek/17EMGDBAQ4cO9fWpra3V4MGD5XK5Gvy55557mtwOgNAaNWqUUlJSdOmll2rgwIEKDz/5l6AbbrhBzzzzjO6++2699NJLKigo0JtvvilJfh93cMMNN+jzzz/Xk08+qd69e+u3v/2tBg0a5DvuSNLvfvc7ffTRR/rVr36l7777Tr/4xS80aNAgLrwPIS6cPQ3V3QL41Vdf+S5oPXjwoF+fhu7Uacr48eN1zz33aM+ePXrhhRcUHR2tcePGNbneT37yE/Xq1UsvvPCCLr74Yr311lv69a9/7dfnnHPO0YcffqjLL7/8hKd+ExMTVVtbq88++8zv7MmePXsCrgdA2znnnHO0a9eugNb59ttvtXHjRs2dO1ezZs3yte/du7fB/r169dI999yje+65RxUVFbrgggv08MMPa+zYsb4+gwcP1uDBgzVz5kxt27ZNP/7xj/XUU0/poYceallhOCmcSTmFbdq0SZZl1Wuvu26jf//+cjgcOuOMM7R161a/PosXLw54e5mZmYqIiNBf//pXrV69Wj/72c+a9Rkq4eHhuu666/Taa6/pueee0/Hjx+udgbnhhhv0r3/9S88880y99b/77jsdPXpUknwHmz/84Q9+fX7/+98HXA+AtpOZmakPP/xQa9asqbesoeOYJEVERDS4/Id/7zU1NfXetunZs6d69+7te5u5srJSx48f9+szePBghYeH1/uYA7QdzqScwqZMmaKqqipdc801GjBggKqrq7Vt2za98MILOuuss3yfT3LHHXdowYIFuuOOOzRy5Eht3bpVf//73wPeXs+ePXXZZZfp8ccf1+HDh5v1Vk+d8ePH68knn9Ts2bM1ePBgDRw40G/5xIkTtWrVKt19993atGmTfvzjH6umpkaffPKJVq1apfXr12vkyJEaNmyYbrzxRi1evFiHDh3SRRddpI0bN+rTTz8NuB4AbWf69Ol68cUXdf311+v222/XiBEj9M033+jVV1/VU0895ff2bx2Hw6Gf/OQnWrhwobxer84880wVFBT4PnelzuHDh9WnTx9dd911Gjp0qDp37qwNGzZox44d+t3vfidJeuuttzR58mRdf/31Ou+883T8+HE999xzioiIUGZmZps8B2hAaG8uQmt64403rNtvv90aMGCA1blzZysyMtI699xzrSlTpljl5eW+flVVVdakSZOsmJgYq0uXLtYNN9xgVVRUNHoL8r///e9Gt/nMM89YkqwuXbpY3333Xb3lP7wFuU5tba2VkJBgSbIeeuihBseurq62Hn30UWvQoEGW3W63unbtao0YMcKaO3eudejQIV+/7777zvrFL35hde/e3erUqZM1btw468svv+QWZCBE6m5B3rFjxwn7HThwwJo8ebJ15plnWpGRkVafPn2srKws6+uvv7Ysq+FbkPft22d
"text/plain": [
"<Figure size 640x480 with 2 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"df.hist([\"Survived\", \"Pclass\"])\n"
]
},
{
"cell_type": "code",
"execution_count": 27,
"id": "be20c939",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"<Axes: xlabel='Embarked'>"
]
},
"execution_count": 27,
"metadata": {},
"output_type": "execute_result"
},
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAigAAAGtCAYAAAA8mI9zAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjcuMSwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/bCgiHAAAACXBIWXMAAA9hAAAPYQGoP6dpAAAlK0lEQVR4nO3de3TU9Z3/8VcuZAIJMzFgZkgNmFbWJApViCVT7VYhECC0tcS1tlmMLQtdmshCVio5IiBWodQtll1utRZoV9aWs0d3wUM0xoIXhluQYwpIxcImNs4EC5kBbCaQfH9/7I/vdgQvQxLmk+T5OOd7DvP9fmbm/cXBPM9kLnGWZVkCAAAwSHysBwAAAPgoAgUAABiHQAEAAMYhUAAAgHEIFAAAYBwCBQAAGIdAAQAAxkmM9QCXo6OjQ01NTRo4cKDi4uJiPQ4AAPgMLMvS6dOnlZmZqfj4T36OpEcGSlNTk7KysmI9BgAAuAyNjY265pprPnFNjwyUgQMHSvrfE3Q6nTGeBgAAfBahUEhZWVn2z/FP0iMD5cKvdZxOJ4ECAEAP81lensGLZAEAgHEIFAAAYBwCBQAAGIdAAQAAxiFQAACAcQgUAABgHAIFAAAYh0ABAADGIVAAAIBxCBQAAGAcAgUAABiHQAEAAMYhUAAAgHEIFAAAYBwCBQAAGCcx1gP0ZtfOfyHWI/Qax5cVx3oEAMAVxDMoAADAOAQKAAAwDoECAACMQ6AAAADjECgAAMA4BAoAADAOgQIAAIxDoAAAAOMQKAAAwDgECgAAMA6BAgAAjEOgAAAA4xAoAADAOAQKAAAwDoECAACMQ6AAAADjECgAAMA4BAoAADAOgQIAAIwTdaD86U9/0t///d9r0KBB6t+/v0aMGKF9+/bZxy3L0sKFCzVkyBD1799fhYWFeueddyJu4+TJkyotLZXT6VRaWpqmT5+uM2fOdP5sAABArxBVoJw6dUq33nqr+vXrp23btunQoUP6l3/5F1111VX2muXLl2vlypVau3atdu/erZSUFBUVFam1tdVeU1paqoMHD6qmpkZbt27Vq6++qpkzZ3bdWQEAgB4tzrIs67Munj9/vt544w299tprlzxuWZYyMzP1z//8z3rggQckScFgUG63Wxs2bNA999yjw4cPKy8vT3v37lV+fr4kqbq6WpMnT9Z7772nzMzMT50jFArJ5XIpGAzK6XR+1vGvuGvnvxDrEXqN48uKYz0CAKCTovn5HdUzKP/93/+t/Px8/d3f/Z0yMjJ0880366mnnrKPHzt2TH6/X4WFhfY+l8ulMWPGyOfzSZJ8Pp/S0tLsOJGkwsJCxcfHa/fu3Ze833A4rFAoFLEBAIDeK6pA+eMf/6g1a9Zo+PDhevHFFzVr1izNnj1bGzdulCT5/X5Jktvtjrie2+22j/n9fmVkZEQcT0xMVHp6ur3mo5YuXSqXy2VvWVlZ0YwNAAB6mKgCpaOjQ6NGjdLjjz+um2++WTNnztSMGTO0du3a7ppPklRVVaVgMGhvjY2N3Xp/AAAgtqIKlCFDhigvLy9iX25urhoaGiRJHo9HkhQIBCLWBAIB+5jH41Fzc3PE8fPnz+vkyZP2mo9yOBxyOp0RGwAA6L2iCpRbb71VR44cidj3hz/8QcOGDZMkZWdny+PxqLa21j4eCoW0e/dueb1eSZLX61VLS4vq6ursNa+88oo6Ojo0ZsyYyz4RAADQeyRGs3ju3Ln68pe/rMcff1x333239uzZo5///Of6+c9/LkmKi4vTnDlz9KMf/UjDhw9Xdna2Hn74YWVmZurOO++U9L/PuEycONH+1dC5c+dUUVGhe+655zO9gwcAAPR+UQXKLbfcoueee05VVVVasmSJsrOz9eSTT6q0tNRe88Mf/lBnz57VzJkz1dLSottuu03V1dVKTk621zzzzDOqqKjQuHHjFB8fr5KSEq1cubLrzgoAAPRoUX0Oiin4HJS+h89BAYCer9s+BwUAAOBKIFAAAIBxCBQAAGAcAgUAABiHQAEAAMYhUAAAgHEIFAAAYBwCBQAAGIdAAQAAxiFQAACAcQgUAABgHAIFAAAYh0ABAADGIVAAAIBxCBQAAGAcAgUAABiHQAEAAMYhUAAAgHEIFAAAYBwCBQAAGIdAAQAAxiFQAACAcQgUAABgHAIFAAAYh0ABAADGIVAAAIBxCBQAAGAcAgUAABiHQAEAAMYhUAAAgHEIFAAAYBwCBQAAGIdAAQAAxiFQAACAcQgUAABgHAIFAAAYh0ABAADGIVAAAIBxCBQAAGAcAgUAABiHQAEAAMYhUAAAgHEIFAAAYBwCBQAAGCeqQFm8eLHi4uIitpycHPt4a2urysvLNWjQIKWmpqqkpESBQCDiNhoaGlRcXKwBAwYoIyND8+bN0/nz57vmbAAAQK+QGO0VbrjhBr388sv/dwOJ/3cTc+fO1QsvvKDNmzfL5XKpoqJCU6dO1RtvvCFJam9vV3FxsTwej3bu3Kn3339f9957r/r166fHH3+8C04HAAD0BlEHSmJiojwez0X7g8Ggnn76aW3atEljx46VJK1fv165ubnatWuXCgoK9NJLL+nQoUN6+eWX5Xa7ddNNN+nRRx/Vgw8+qMWLFyspKanzZwQAAHq8qF+D8s477ygzM1Of//znVVpaqoaGBklSXV2dzp07p8LCQnttTk6Ohg4dKp/PJ0ny+XwaMWKE3G63vaaoqEihUEgHDx782PsMh8MKhUIRGwAA6L2iCpQxY8Zow4YNqq6u1po1a3Ts2DF95Stf0enTp+X3+5WUlKS0tLSI67jdbvn9fkmS3++PiJMLxy8c+zhLly6Vy+Wyt6ysrGjGBgAAPUxUv+KZNGmS/eeRI0dqzJgxGjZsmH7729+qf//+XT7cBVVVVaqsrLQvh0IhIgUAgF6sU28zTktL09/8zd/o6NGj8ng8amtrU0tLS8SaQCBgv2bF4/Fc9K6eC5cv9bqWCxwOh5xOZ8QGAAB6r04FypkzZ/Tuu+9qyJAhGj16tPr166fa2lr7+JEjR9TQ0CCv1ytJ8nq9qq+vV3Nzs72mpqZGTqdTeXl5nRkFAAD0IlH9iueBBx7Q1772NQ0bNkxNTU1atGiREhIS9O1vf1sul0vTp09XZWWl0tPT5XQ6df/998vr9aqgoECSNGHCBOXl5WnatGlavny5/H6/FixYoPLycjkcjm45QQAA0PNEFSjvvfeevv3tb+vPf/6zrr76at12223atWuXrr76aknSihUrFB8fr5KSEoXDYRUVFWn16tX29RMSErR161bNmjVLXq9XKSkpKisr05IlS7r2rAAAQI8WZ1mWFeshohUKheRyuRQMBo1+Pcq181+I9Qi9xvFlxbEeAQDQSdH8/Oa7eAAAgHEIFAAAYBwCBQAAGIdAAQAAxiFQAACAcQgUAABgHAIFAAAYh0ABAADGIVAAAIBxCBQAAGAcAgUAABiHQAEAAMYhUAAAgHEIFAAAYBwCBQAAGIdAAQAAxiFQAACAcQgUAABgHAIFAAAYh0ABAADGIVAAAIBxCBQAAGAcAgUAABiHQAEAAMYhUAAAgHEIFAAAYBwCBQAAGIdAAQAAxiFQAACAcQgUAABgHAIFAAAYh0ABAADGIVAAAIBxCBQAAGAcAgUAABiHQAEAAMYhUAAAgHEIFAAAYBwCBQAAGIdAAQAAxiFQAACAcQgUAABgHAIFAAAYp1OBsmzZMsXFxWnOnDn2vtbWVpWXl2vQoEFKTU1VSUmJAoFAxPUaGhpUXFysAQMGKCMjQ/PmzdP58+c7MwoAAOhFLjtQ9u7dq3Xr1mnkyJER++fOnastW7Zo8+bN2rFjh5qamjR16lT7eHt7u4qLi9XW1qadO3dq48aN2rBhgxYuXHj5ZwEAAHqVywqUM2fOqLS0VE899ZSuuuoqe38wGNTTTz+tn/70pxo7dqxGjx6t9evXa+fOndq1a5ck6aWXXtKhQ4f07//+77rppps0adIkPfroo1q1apXa2tq65qwAAECPdlmBUl5eruLiYhUWFkbsr6ur07lz5yL25+TkaOjQofL5fJIkn8+nESNGyO1222u
"text/plain": [
"<Figure size 640x480 with 1 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"embarked = df.value_counts(\"Embarked\")\n",
"#later will be transformed to one-hot\n",
"embarked.plot.bar()"
]
},
{
"cell_type": "code",
"execution_count": 15,
"id": "8286046e",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>PassengerId</th>\n",
" <th>Survived</th>\n",
" <th>Pclass</th>\n",
" <th>Name</th>\n",
" <th>Sex</th>\n",
" <th>Age</th>\n",
" <th>SibSp</th>\n",
" <th>Parch</th>\n",
" <th>Ticket</th>\n",
" <th>Fare</th>\n",
" <th>Cabin</th>\n",
" <th>Embarked</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>3</td>\n",
" <td>Braund, Mr. Owen Harris</td>\n",
" <td>male</td>\n",
" <td>22.0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>A/5 21171</td>\n",
" <td>7.2500</td>\n",
" <td>NaN</td>\n",
" <td>S</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>2</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>Cumings, Mrs. John Bradley (Florence Briggs Th...</td>\n",
" <td>female</td>\n",
" <td>38.0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>PC 17599</td>\n",
" <td>71.2833</td>\n",
" <td>C85</td>\n",
" <td>C</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>3</td>\n",
" <td>1</td>\n",
" <td>3</td>\n",
" <td>Heikkinen, Miss. Laina</td>\n",
" <td>female</td>\n",
" <td>26.0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>STON/O2. 3101282</td>\n",
" <td>7.9250</td>\n",
" <td>NaN</td>\n",
" <td>S</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>4</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>Futrelle, Mrs. Jacques Heath (Lily May Peel)</td>\n",
" <td>female</td>\n",
" <td>35.0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>113803</td>\n",
" <td>53.1000</td>\n",
" <td>C123</td>\n",
" <td>S</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>5</td>\n",
" <td>0</td>\n",
" <td>3</td>\n",
" <td>Allen, Mr. William Henry</td>\n",
" <td>male</td>\n",
" <td>35.0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>373450</td>\n",
" <td>8.0500</td>\n",
" <td>NaN</td>\n",
" <td>S</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" PassengerId Survived Pclass \\\n",
"0 1 0 3 \n",
"1 2 1 1 \n",
"2 3 1 3 \n",
"3 4 1 1 \n",
"4 5 0 3 \n",
"\n",
" Name Sex Age SibSp \\\n",
"0 Braund, Mr. Owen Harris male 22.0 1 \n",
"1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38.0 1 \n",
"2 Heikkinen, Miss. Laina female 26.0 0 \n",
"3 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35.0 1 \n",
"4 Allen, Mr. William Henry male 35.0 0 \n",
"\n",
" Parch Ticket Fare Cabin Embarked \n",
"0 0 A/5 21171 7.2500 NaN S \n",
"1 0 PC 17599 71.2833 C85 C \n",
"2 0 STON/O2. 3101282 7.9250 NaN S \n",
"3 0 113803 53.1000 C123 S \n",
"4 0 373450 8.0500 NaN S "
]
},
"execution_count": 15,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# df.dropna()\n",
"#df.fillna()"
]
},
{
"cell_type": "code",
"execution_count": 40,
"id": "1ed8c693",
"metadata": {},
"outputs": [],
"source": [
"columns_to_normalize=['Age','Fare']\n",
"for colname in columns_to_normalize:\n",
" df[colname]=(df[colname]-df[colname].min())/(df[colname].max()-df[colname].min())"
]
},
{
"cell_type": "code",
"execution_count": 42,
"id": "d5a0fa72",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>PassengerId</th>\n",
" <th>Survived</th>\n",
" <th>Pclass</th>\n",
" <th>Name</th>\n",
" <th>Sex</th>\n",
" <th>Age</th>\n",
" <th>SibSp</th>\n",
" <th>Parch</th>\n",
" <th>Ticket</th>\n",
" <th>Fare</th>\n",
" <th>Cabin</th>\n",
" <th>Embarked</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>3</td>\n",
" <td>Braund, Mr. Owen Harris</td>\n",
" <td>male</td>\n",
" <td>0.271174</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>A/5 21171</td>\n",
" <td>0.014151</td>\n",
" <td>NaN</td>\n",
" <td>S</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>2</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>Cumings, Mrs. John Bradley (Florence Briggs Th...</td>\n",
" <td>female</td>\n",
" <td>0.472229</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>PC 17599</td>\n",
" <td>0.139136</td>\n",
" <td>C85</td>\n",
" <td>C</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>3</td>\n",
" <td>1</td>\n",
" <td>3</td>\n",
" <td>Heikkinen, Miss. Laina</td>\n",
" <td>female</td>\n",
" <td>0.321438</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>STON/O2. 3101282</td>\n",
" <td>0.015469</td>\n",
" <td>NaN</td>\n",
" <td>S</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>4</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>Futrelle, Mrs. Jacques Heath (Lily May Peel)</td>\n",
" <td>female</td>\n",
" <td>0.434531</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>113803</td>\n",
" <td>0.103644</td>\n",
" <td>C123</td>\n",
" <td>S</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>5</td>\n",
" <td>0</td>\n",
" <td>3</td>\n",
" <td>Allen, Mr. William Henry</td>\n",
" <td>male</td>\n",
" <td>0.434531</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>373450</td>\n",
" <td>0.015713</td>\n",
" <td>NaN</td>\n",
" <td>S</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" PassengerId Survived Pclass \\\n",
"0 1 0 3 \n",
"1 2 1 1 \n",
"2 3 1 3 \n",
"3 4 1 1 \n",
"4 5 0 3 \n",
"\n",
" Name Sex Age SibSp \\\n",
"0 Braund, Mr. Owen Harris male 0.271174 1 \n",
"1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 0.472229 1 \n",
"2 Heikkinen, Miss. Laina female 0.321438 0 \n",
"3 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 0.434531 1 \n",
"4 Allen, Mr. William Henry male 0.434531 0 \n",
"\n",
" Parch Ticket Fare Cabin Embarked \n",
"0 0 A/5 21171 0.014151 NaN S \n",
"1 0 PC 17599 0.139136 C85 C \n",
"2 0 STON/O2. 3101282 0.015469 NaN S \n",
"3 0 113803 0.103644 C123 S \n",
"4 0 373450 0.015713 NaN S "
]
},
"execution_count": 42,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.head(5)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "e6ffda37",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.9"
}
},
"nbformat": 4,
"nbformat_minor": 5
}