540 lines
140 KiB
Plaintext
540 lines
140 KiB
Plaintext
![]() |
{
|
||
|
"cells": [
|
||
|
{
|
||
|
"cell_type": "markdown",
|
||
|
"id": "0ba6ee9e",
|
||
|
"metadata": {},
|
||
|
"source": [
|
||
|
"# Titanic: Machine Learning from Disaster"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "markdown",
|
||
|
"id": "ec6e69b1",
|
||
|
"metadata": {},
|
||
|
"source": [
|
||
|
"# Imports"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": 1,
|
||
|
"id": "ffcae455",
|
||
|
"metadata": {},
|
||
|
"outputs": [],
|
||
|
"source": [
|
||
|
"import numpy as np\n",
|
||
|
"import pandas as pd\n",
|
||
|
"import matplotlib.pyplot as plt\n",
|
||
|
"import seaborn as sns\n",
|
||
|
"import plotly.express as px"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "markdown",
|
||
|
"id": "d1b19cf9",
|
||
|
"metadata": {},
|
||
|
"source": [
|
||
|
"# Data description"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": 2,
|
||
|
"id": "3174342e",
|
||
|
"metadata": {},
|
||
|
"outputs": [],
|
||
|
"source": [
|
||
|
"df_train = pd.read_csv('train.csv')"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": 3,
|
||
|
"id": "4d561fad",
|
||
|
"metadata": {
|
||
|
"scrolled": true
|
||
|
},
|
||
|
"outputs": [
|
||
|
{
|
||
|
"data": {
|
||
|
"text/plain": [
|
||
|
"Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',\n",
|
||
|
" 'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],\n",
|
||
|
" dtype='object')"
|
||
|
]
|
||
|
},
|
||
|
"execution_count": 3,
|
||
|
"metadata": {},
|
||
|
"output_type": "execute_result"
|
||
|
}
|
||
|
],
|
||
|
"source": [
|
||
|
"df_train.columns"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": 4,
|
||
|
"id": "7818fc15",
|
||
|
"metadata": {},
|
||
|
"outputs": [
|
||
|
{
|
||
|
"data": {
|
||
|
"text/html": [
|
||
|
"<div>\n",
|
||
|
"<style scoped>\n",
|
||
|
" .dataframe tbody tr th:only-of-type {\n",
|
||
|
" vertical-align: middle;\n",
|
||
|
" }\n",
|
||
|
"\n",
|
||
|
" .dataframe tbody tr th {\n",
|
||
|
" vertical-align: top;\n",
|
||
|
" }\n",
|
||
|
"\n",
|
||
|
" .dataframe thead th {\n",
|
||
|
" text-align: right;\n",
|
||
|
" }\n",
|
||
|
"</style>\n",
|
||
|
"<table border=\"1\" class=\"dataframe\">\n",
|
||
|
" <thead>\n",
|
||
|
" <tr style=\"text-align: right;\">\n",
|
||
|
" <th></th>\n",
|
||
|
" <th>PassengerId</th>\n",
|
||
|
" <th>Survived</th>\n",
|
||
|
" <th>Pclass</th>\n",
|
||
|
" <th>Age</th>\n",
|
||
|
" <th>SibSp</th>\n",
|
||
|
" <th>Parch</th>\n",
|
||
|
" <th>Fare</th>\n",
|
||
|
" </tr>\n",
|
||
|
" </thead>\n",
|
||
|
" <tbody>\n",
|
||
|
" <tr>\n",
|
||
|
" <th>count</th>\n",
|
||
|
" <td>891.000000</td>\n",
|
||
|
" <td>891.000000</td>\n",
|
||
|
" <td>891.000000</td>\n",
|
||
|
" <td>714.000000</td>\n",
|
||
|
" <td>891.000000</td>\n",
|
||
|
" <td>891.000000</td>\n",
|
||
|
" <td>891.000000</td>\n",
|
||
|
" </tr>\n",
|
||
|
" <tr>\n",
|
||
|
" <th>mean</th>\n",
|
||
|
" <td>446.000000</td>\n",
|
||
|
" <td>0.383838</td>\n",
|
||
|
" <td>2.308642</td>\n",
|
||
|
" <td>29.699118</td>\n",
|
||
|
" <td>0.523008</td>\n",
|
||
|
" <td>0.381594</td>\n",
|
||
|
" <td>32.204208</td>\n",
|
||
|
" </tr>\n",
|
||
|
" <tr>\n",
|
||
|
" <th>std</th>\n",
|
||
|
" <td>257.353842</td>\n",
|
||
|
" <td>0.486592</td>\n",
|
||
|
" <td>0.836071</td>\n",
|
||
|
" <td>14.526497</td>\n",
|
||
|
" <td>1.102743</td>\n",
|
||
|
" <td>0.806057</td>\n",
|
||
|
" <td>49.693429</td>\n",
|
||
|
" </tr>\n",
|
||
|
" <tr>\n",
|
||
|
" <th>min</th>\n",
|
||
|
" <td>1.000000</td>\n",
|
||
|
" <td>0.000000</td>\n",
|
||
|
" <td>1.000000</td>\n",
|
||
|
" <td>0.420000</td>\n",
|
||
|
" <td>0.000000</td>\n",
|
||
|
" <td>0.000000</td>\n",
|
||
|
" <td>0.000000</td>\n",
|
||
|
" </tr>\n",
|
||
|
" <tr>\n",
|
||
|
" <th>25%</th>\n",
|
||
|
" <td>223.500000</td>\n",
|
||
|
" <td>0.000000</td>\n",
|
||
|
" <td>2.000000</td>\n",
|
||
|
" <td>20.125000</td>\n",
|
||
|
" <td>0.000000</td>\n",
|
||
|
" <td>0.000000</td>\n",
|
||
|
" <td>7.910400</td>\n",
|
||
|
" </tr>\n",
|
||
|
" <tr>\n",
|
||
|
" <th>50%</th>\n",
|
||
|
" <td>446.000000</td>\n",
|
||
|
" <td>0.000000</td>\n",
|
||
|
" <td>3.000000</td>\n",
|
||
|
" <td>28.000000</td>\n",
|
||
|
" <td>0.000000</td>\n",
|
||
|
" <td>0.000000</td>\n",
|
||
|
" <td>14.454200</td>\n",
|
||
|
" </tr>\n",
|
||
|
" <tr>\n",
|
||
|
" <th>75%</th>\n",
|
||
|
" <td>668.500000</td>\n",
|
||
|
" <td>1.000000</td>\n",
|
||
|
" <td>3.000000</td>\n",
|
||
|
" <td>38.000000</td>\n",
|
||
|
" <td>1.000000</td>\n",
|
||
|
" <td>0.000000</td>\n",
|
||
|
" <td>31.000000</td>\n",
|
||
|
" </tr>\n",
|
||
|
" <tr>\n",
|
||
|
" <th>max</th>\n",
|
||
|
" <td>891.000000</td>\n",
|
||
|
" <td>1.000000</td>\n",
|
||
|
" <td>3.000000</td>\n",
|
||
|
" <td>80.000000</td>\n",
|
||
|
" <td>8.000000</td>\n",
|
||
|
" <td>6.000000</td>\n",
|
||
|
" <td>512.329200</td>\n",
|
||
|
" </tr>\n",
|
||
|
" </tbody>\n",
|
||
|
"</table>\n",
|
||
|
"</div>"
|
||
|
],
|
||
|
"text/plain": [
|
||
|
" PassengerId Survived Pclass Age SibSp \\\n",
|
||
|
"count 891.000000 891.000000 891.000000 714.000000 891.000000 \n",
|
||
|
"mean 446.000000 0.383838 2.308642 29.699118 0.523008 \n",
|
||
|
"std 257.353842 0.486592 0.836071 14.526497 1.102743 \n",
|
||
|
"min 1.000000 0.000000 1.000000 0.420000 0.000000 \n",
|
||
|
"25% 223.500000 0.000000 2.000000 20.125000 0.000000 \n",
|
||
|
"50% 446.000000 0.000000 3.000000 28.000000 0.000000 \n",
|
||
|
"75% 668.500000 1.000000 3.000000 38.000000 1.000000 \n",
|
||
|
"max 891.000000 1.000000 3.000000 80.000000 8.000000 \n",
|
||
|
"\n",
|
||
|
" Parch Fare \n",
|
||
|
"count 891.000000 891.000000 \n",
|
||
|
"mean 0.381594 32.204208 \n",
|
||
|
"std 0.806057 49.693429 \n",
|
||
|
"min 0.000000 0.000000 \n",
|
||
|
"25% 0.000000 7.910400 \n",
|
||
|
"50% 0.000000 14.454200 \n",
|
||
|
"75% 0.000000 31.000000 \n",
|
||
|
"max 6.000000 512.329200 "
|
||
|
]
|
||
|
},
|
||
|
"execution_count": 4,
|
||
|
"metadata": {},
|
||
|
"output_type": "execute_result"
|
||
|
}
|
||
|
],
|
||
|
"source": [
|
||
|
"df_train.describe()"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "markdown",
|
||
|
"id": "07234316",
|
||
|
"metadata": {},
|
||
|
"source": [
|
||
|
"# Analysis before data cleaning"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": 5,
|
||
|
"id": "1472e369",
|
||
|
"metadata": {},
|
||
|
"outputs": [
|
||
|
{
|
||
|
"data": {
|
||
|
"text/plain": [
|
||
|
"<seaborn.axisgrid.FacetGrid at 0x1f232d8f6d0>"
|
||
|
]
|
||
|
},
|
||
|
"execution_count": 5,
|
||
|
"metadata": {},
|
||
|
"output_type": "execute_result"
|
||
|
},
|
||
|
{
|
||
|
"data": {
|
||
|
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAWAAAAFgCAYAAACFYaNMAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjQuMywgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/MnkTPAAAACXBIWXMAAAsTAAALEwEAmpwYAABHU0lEQVR4nO3deXhU5dnH8e89M5nJvhAS9i3sBAQFEXEHRKxWrbVqa2ttq5aq7avVVruAW0VwAcGlSK2tti61FZeCiIC7EBZlh7AjBIKQANkz6/P+kWARAtlm5sxyf64rF8nMyTm3TvLLM895FjHGoJRSKvxsVheglFLxSgNYKaUsogGslFIW0QBWSimLaAArpZRFHFYXEArjxo0z7777rtVlKKXUEdLQgzHZAi4pKbG6BKWUalRMBrBSSkUDDWCllLKIBrBSSllEA1gppSyiAayUUhbRAFZKKYtoACullEU0gJVSyiIawEopZRENYKWUsogGsFJKWUQDWCmlLBKTq6EppYLr5Zdf5s03ZgNN20MyIcHJHydMpH///qEtLMppACulTqq0tJS//+1vdEysoUuKr0nfs6okkVmznmXatCdCW1yU0wBWSp3Uq6++is/n5db8CtolB5r0Pe/u9vPyylWsXr2awYMHh7jC6KV9wEqpE9q/fz9vvfkmZ7WrbXL4AozqWEumC/763F8wpmndFvFIA1gpdUIzZ/4ZE/DynR41zfo+px2u6F7JmrXr+PDDD0NTXAzQAFZKNWj16tW8//4HXNKlmrZJTW/9HnF+Rzfd0gI88/RT1NbWhqDC6KcBrJQ6jsfjYdrUx8lOgku6Na/1e4RN4Ie9KzhQUsrf//734BYYIzSAlVLHeeGFF9j55S5u6FOOy97y8/TN9HF+x1pe+9e/WL9+ffAKjBEawEqpbygsLOSVl1/mnA61DM72tvp83+9VTVaiYfLDk3C73UGoMHZoACulvlZZWckD999Hpsvwg17VQTlnksPws77l7C7aw5NPPhmUc8YKHQccRzweD2+++eY3bojk5eVx9tlnW1iVihTGGB599BH27dvH708tIyUheMPHBrbxcknXGubMmcOQIUMYM2ZM0M4dzSwNYBF5HrgU2G+MGdjA89cBd9d/WQn8whizOowlxpTFixfzzDPPfOMxR0IC8955h4SEBIuqUpHijTfe4KOPPuaanlX0yWzajLfmuCqvmi3lCTz26CP07t2bbt26Bf0a0cbqLoi/A+NO8vwO4DxjzCnAg8CscBQVqwoLC8Fmp+K066kYdgM1eefh83rZvn271aUpi33xxRc89dRTnNrWw8VdQzNkzG6DW/IrcOLh97+7h/Ly8pBcJ5pYGsDGmI+Bgyd5frEx5lD9lwVA57AUFqM2biwkkNQG7A4QG/6UHKA+mFXc2r17N/dOmECHZB/jB1Rik9Bdq40rwP8NLOOrfcXcO3EiPl/wW9rRxOoWcHP8DJh3oidF5GYRWSEiKw4cOBDGsqKD1+tlw8YN+FJzvn7MuNIQZzJr1661sDJlpbKyMn53z93greKOQWUkOUI/bbh3ho+f9q1g5apVTJs2La6nKkdFAIvIBdQF8N0nOsYYM8sYM8wYMywnJ+dEh8WtwsJCvB4P/rT2/3tQBE9KO75YuTKufwniVU1NDffc/Vu+Kt7LrwaWkduC2W4tdXYHD9/uVs3cuXPjepJGxAewiJwCPAdcbowptbqeaLVy5UoA/Kntv/G4P709B0tL2bNnjxVlKYv4fD7uu/deCjdt4hf5FfQNwU23xlyVV8M5HWp54YUXeOutt8J+/UgQ0QEsIl2B2cCPjDGbra4nmhUULCWQ0haTkPiNx33pnQBYunSpFWUpC/j9fiZPnszSZcu4oU8lw3I8ltQhAj/tW8WpbT088cQ0Fi1aZEkdVrI0gEXkFWAJ0FdEikTkZyIyXkTG1x8yEcgGnhGRVSKywrJio1h5eTkbN27Am3H8PUyTmA5JGRQUFF
hQmQq3QCDA1KlTWbhwId/Lq+KCTtbOTDsyMqJvpo+HHnqITz75xNJ6ws3qURDfN8Z0MMYkGGM6G2P+aoyZaYyZWf/8jcaYLGPMkPqPYVbWG62WLFmCMQZfRpcGn/ekd+aLlSupqqoKc2UqnIwxzJgxg7lz53J592q+3T0yVihz2eGOQWXkpXm5/7774qoxENFdECo4PvroI3ClEkhp2+Dzvqzu+H0+lixZEubKVLgYY3j66ad58803ubhrDVc2c33fUEtywJ2nlNE5xcuECX+Mmy4xDeAYV1lZybJly/Fkdq3rdGuAPzUXcSbzwQcfhLk6FQ5HWr7/+c9/GNu5hmt7Vp/oR8FSKQmG3ww+TMdED3/8w+/joiWsARzjPv74Y3w+L942eSc+SAR3Vg8KCgp0dlKMMcYwffp03njjDcZ1qeG63pEZvkekJRjuHnKYTsleJvzxDyxevNjqkkJKAzjGvffee5CUQSDl5GOjvdk98fv9un1MDAkEAjz++ONfdzt8v1dkh+8RqQmGuwcfpnOyh4kTJsT0jTkN4BhWXFzMqtWrcbfpecLuhyMCydmY5CzeeeedMFWnQsnv9zNlyhTmzJnDt7tVR2y3w4mkJBh+O7iM7qke7r33Xt5//32rSwoJDeAYNm/ePDAGb3avxg8WwZ3dm8LCQl2cJ8r5fHVDuubPn8+VPaq5Kq8mqsL3iCN9wr3SPfzpwQfr3s3FGA3gGOXz+Zg79x18GZ0xrtSmfU92L7DZmDt3boirU6Hi9Xp54IEHeP/99/leXhVX9IjO8D0iyQG/GVxGv0wvDz88KebeoWkAx6iCggJKS0vw5vRp8veYhES8md2Y9+67uottFPJ46t6uf/zxx/ygV1XEjPNtLZcdfn1KGflZXh555JGYmrasARyj3njzTXCl4Mvs2qzv8+b2p7qqSoekRRm3282ECX9k8eLFXN+nknEhWtPXKk473D6onMHZHqZNm8bs2bOtLikoNIBjUFFREZ+vWIE7uw9I815if2o7THIWs994I0TVqWDzeOpGCyxduoyf9K1kTOfY3PjSaYf/G1TB0BwPM2bMiIkQ1gCOQW+99RaIDW9O3+Z/swjutn3ZsnkzGzduDH5xKqg8Hg8TJ05g6bK68LV6bYdQc9jg1vwKhratC+E3oryhoAEcY2pra5n7zjt4s7phnMktOoe3bW/E4Yz6H+5Y5/P5uO++eykoWBoX4XuEwwa3Dqzg1LYepk+fzn//+1+rS2oxDeAYs2jRIqqrqvDm9m/5SewJuNv05P33P+Dw4cNBq00FTyAQYPLkySxevITr+8RP+B7hsMEvB1YwONvL1KmPR+04YQ3gGPPGm29ikrPwp7Zr1Xm8uf3w+by8++67QapMBYsxhieffJKFCxdyVV51zPb5NsZhg9sGltMnw8ekhx6KygV8NIBjSGFhIVu3bMGd06/RmW+NCSRl4U9rzxtvvkkgEL6talTjXnrpJd544w0u7lLDt7tF1qpm4eaywx2nlNMp2cvECX+Mug1mNYBjyNy5cxF7At42PYNyPk9OX77at49Vq1YF5Xyq9T766COee+45zmzn5tooWdsh1JIdhrsGHybN7uUPv7uHaNqUVwM4RtTU1LBg4UI8md3A4QzKOX1Z3RCHS2fGRYhNmzYx6aE/0SvDz8/6VWr4HiXDabhj0GGqKg7z+9/9jpqa6HhnoAEcIz755BNqa2qaNfOtUTYH7jZ5fPTRx1RWVgbvvKrZKioq+OMffk+q3cv/DSrDabe6osjTJdXPLQPK2bptK1OnTrW6nCbRAI4RCxcuhMS0Vt98O5Y3uyc+n5dPP/00qOdVTWeM4bHHHuVgaSm/zD9MhtNYXVLEGtLWy+XdqlmwYEHd70SE0wCOAYcPH2bFihW4M7u3+ubbsQIpOZCYzsI43LE2UsybN4+PPvqY7+ZVkZfut7qciHd59xp6Z/iZ+vhjFBcXW13OSWkAx4CCggICgQC+Nj2Cf3IR3JndWPnFF7
pppwUOHTrEU0/OoH+Wj2/F2PoOoWK3wfgB5RhfLU9Mm2Z1OSelARwDlixZgjiTCSRnh+T8/swu+P1+VqxYEZLzqxN
|
||
|
"text/plain": [
|
||
|
"<Figure size 360x360 with 1 Axes>"
|
||
|
]
|
||
|
},
|
||
|
"metadata": {
|
||
|
"needs_background": "light"
|
||
|
},
|
||
|
"output_type": "display_data"
|
||
|
}
|
||
|
],
|
||
|
"source": [
|
||
|
"# Violin plot of the Survived distribution for each value of Sex.\n",
|
||
|
"sns.catplot(data=df_train, x='Sex', y='Survived', kind='violin')"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": 6,
|
||
|
"id": "538c8c4f",
|
||
|
"metadata": {},
|
||
|
"outputs": [],
|
||
|
"source": [
|
||
|
"# sns.pairplot(data=df_train, hue='Survived')"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "markdown",
|
||
|
"id": "06684f27",
|
||
|
"metadata": {},
|
||
|
"source": [
|
||
|
"# Data cleaning"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": 7,
|
||
|
"id": "095ae1ae",
|
||
|
"metadata": {},
|
||
|
"outputs": [],
|
||
|
"source": [
|
||
|
"# Replace the Sex and Embarked columns with one indicator column per category.\n",
|
||
|
"df_train = pd.get_dummies(df_train, columns=['Sex', 'Embarked'])"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": 8,
|
||
|
"id": "50c17ca4",
|
||
|
"metadata": {},
|
||
|
"outputs": [],
|
||
|
"source": [
|
||
|
"# Drop the columns that are not used further on.\n",
|
||
|
"# Rebinding the result (instead of inplace=True) keeps the cell a plain transformation of its input.\n",
|
||
|
"df_train = df_train.drop(columns=['Sex_male', 'Name', 'Ticket', 'PassengerId'])"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": 9,
|
||
|
"id": "b1949435",
|
||
|
"metadata": {},
|
||
|
"outputs": [],
|
||
|
"source": [
|
||
|
"# Mark missing Cabin values with the placeholder string 'None'.\n",
|
||
|
"df_train = df_train.fillna({'Cabin': 'None'})"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": 10,
|
||
|
"id": "8a75aa3b",
|
||
|
"metadata": {
|
||
|
"scrolled": true
|
||
|
},
|
||
|
"outputs": [
|
||
|
{
|
||
|
"data": {
|
||
|
"text/plain": [
|
||
|
"array(['None', 'C85', 'C123', 'E46', 'G6', 'C103', 'D56', 'A6',\n",
|
||
|
" 'C23 C25 C27', 'B78', 'D33', 'B30', 'C52', 'B28', 'C83', 'F33',\n",
|
||
|
" 'F G73', 'E31', 'A5', 'D10 D12', 'D26', 'C110', 'B58 B60', 'E101',\n",
|
||
|
" 'F E69', 'D47', 'B86', 'F2', 'C2', 'E33', 'B19', 'A7', 'C49', 'F4',\n",
|
||
|
" 'A32', 'B4', 'B80', 'A31', 'D36', 'D15', 'C93', 'C78', 'D35',\n",
|
||
|
" 'C87', 'B77', 'E67', 'B94', 'C125', 'C99', 'C118', 'D7', 'A19',\n",
|
||
|
" 'B49', 'D', 'C22 C26', 'C106', 'C65', 'E36', 'C54',\n",
|
||
|
" 'B57 B59 B63 B66', 'C7', 'E34', 'C32', 'B18', 'C124', 'C91', 'E40',\n",
|
||
|
" 'T', 'C128', 'D37', 'B35', 'E50', 'C82', 'B96 B98', 'E10', 'E44',\n",
|
||
|
" 'A34', 'C104', 'C111', 'C92', 'E38', 'D21', 'E12', 'E63', 'A14',\n",
|
||
|
" 'B37', 'C30', 'D20', 'B79', 'E25', 'D46', 'B73', 'C95', 'B38',\n",
|
||
|
" 'B39', 'B22', 'C86', 'C70', 'A16', 'C101', 'C68', 'A10', 'E68',\n",
|
||
|
" 'B41', 'A20', 'D19', 'D50', 'D9', 'A23', 'B50', 'A26', 'D48',\n",
|
||
|
" 'E58', 'C126', 'B71', 'B51 B53 B55', 'D49', 'B5', 'B20', 'F G63',\n",
|
||
|
" 'C62 C64', 'E24', 'C90', 'C45', 'E8', 'B101', 'D45', 'C46', 'D30',\n",
|
||
|
" 'E121', 'D11', 'E77', 'F38', 'B3', 'D6', 'B82 B84', 'D17', 'A36',\n",
|
||
|
" 'B102', 'B69', 'E49', 'C47', 'D28', 'E17', 'A24', 'C50', 'B42',\n",
|
||
|
" 'C148'], dtype=object)"
|
||
|
]
|
||
|
},
|
||
|
"execution_count": 10,
|
||
|
"metadata": {},
|
||
|
"output_type": "execute_result"
|
||
|
}
|
||
|
],
|
||
|
"source": [
|
||
|
"df_train['Cabin'].unique()"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": 11,
|
||
|
"id": "875d207c",
|
||
|
"metadata": {
|
||
|
"scrolled": false
|
||
|
},
|
||
|
"outputs": [
|
||
|
{
|
||
|
"data": {
|
||
|
"text/plain": [
|
||
|
"None 687\n",
|
||
|
"C23 C25 C27 4\n",
|
||
|
"G6 4\n",
|
||
|
"B96 B98 4\n",
|
||
|
"C22 C26 3\n",
|
||
|
" ... \n",
|
||
|
"E34 1\n",
|
||
|
"C7 1\n",
|
||
|
"C54 1\n",
|
||
|
"E36 1\n",
|
||
|
"C148 1\n",
|
||
|
"Name: Cabin, Length: 148, dtype: int64"
|
||
|
]
|
||
|
},
|
||
|
"execution_count": 11,
|
||
|
"metadata": {},
|
||
|
"output_type": "execute_result"
|
||
|
}
|
||
|
],
|
||
|
"source": [
|
||
|
"df_train['Cabin'].value_counts()"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": 12,
|
||
|
"id": "b10cca7f",
|
||
|
"metadata": {},
|
||
|
"outputs": [],
|
||
|
"source": [
|
||
|
"# Fill missing Age values with the mean of the known ages.\n",
|
||
|
"df_train = df_train.fillna({'Age': df_train['Age'].mean()})"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": 13,
|
||
|
"id": "fdcbf0a5",
|
||
|
"metadata": {},
|
||
|
"outputs": [
|
||
|
{
|
||
|
"name": "stderr",
|
||
|
"output_type": "stream",
|
||
|
"text": [
|
||
|
"C:\\Users\\Maciej\\AppData\\Local\\Temp/ipykernel_8664/4127688350.py:2: SettingWithCopyWarning: \n",
|
||
|
"A value is trying to be set on a copy of a slice from a DataFrame\n",
|
||
|
"\n",
|
||
|
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
|
||
|
" df_train['Cabin'][df_train['Cabin'].isin(counts[counts==1].index)] = 'Alone'\n",
|
||
|
"C:\\Users\\Maciej\\AppData\\Local\\Temp/ipykernel_8664/4127688350.py:3: SettingWithCopyWarning: \n",
|
||
|
"A value is trying to be set on a copy of a slice from a DataFrame\n",
|
||
|
"\n",
|
||
|
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
|
||
|
" df_train['Cabin'][df_train['Cabin'].isin(counts[counts==2].index)] = 'Double room'\n",
|
||
|
"C:\\Users\\Maciej\\AppData\\Local\\Temp/ipykernel_8664/4127688350.py:4: SettingWithCopyWarning: \n",
|
||
|
"A value is trying to be set on a copy of a slice from a DataFrame\n",
|
||
|
"\n",
|
||
|
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
|
||
|
" df_train['Cabin'][df_train['Cabin'].isin(counts[counts==3].index)] = 'Three person room'\n",
|
||
|
"C:\\Users\\Maciej\\AppData\\Local\\Temp/ipykernel_8664/4127688350.py:5: SettingWithCopyWarning: \n",
|
||
|
"A value is trying to be set on a copy of a slice from a DataFrame\n",
|
||
|
"\n",
|
||
|
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
|
||
|
" df_train['Cabin'][df_train['Cabin'].isin(counts[counts==4].index)] = 'Four person room'\n",
|
||
|
"C:\\Users\\Maciej\\AppData\\Local\\Temp/ipykernel_8664/4127688350.py:6: SettingWithCopyWarning: \n",
|
||
|
"A value is trying to be set on a copy of a slice from a DataFrame\n",
|
||
|
"\n",
|
||
|
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
|
||
|
" df_train['Cabin'][df_train['Cabin'].isin(counts[counts>4].index)] = 'Other'\n"
|
||
|
]
|
||
|
}
|
||
|
],
|
||
|
"source": [
|
||
|
"# Relabel each Cabin value by how many rows share it.\n",
|
||
|
"# Counts are taken once, up front, so the relabelling below cannot affect them.\n",
|
||
|
"counts = df_train['Cabin'].value_counts()\n",
|
||
|
"# .loc[mask, 'Cabin'] assigns into df_train itself; the chained form df_train['Cabin'][mask] = ...\n",
|
||
|
"# raises SettingWithCopyWarning and may write to a temporary copy instead.\n",
|
||
|
"df_train.loc[df_train['Cabin'].isin(counts[counts==1].index), 'Cabin'] = 'Alone'\n",
|
||
|
"df_train.loc[df_train['Cabin'].isin(counts[counts==2].index), 'Cabin'] = 'Double room'\n",
|
||
|
"df_train.loc[df_train['Cabin'].isin(counts[counts==3].index), 'Cabin'] = 'Three person room'\n",
|
||
|
"df_train.loc[df_train['Cabin'].isin(counts[counts==4].index), 'Cabin'] = 'Four person room'\n",
|
||
|
"# NOTE: any value occurring more than 4 times lands here, which includes the 'None' placeholder for missing cabins.\n",
|
||
|
"df_train.loc[df_train['Cabin'].isin(counts[counts>4].index), 'Cabin'] = 'Other'"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": 14,
|
||
|
"id": "6d73e794",
|
||
|
"metadata": {},
|
||
|
"outputs": [
|
||
|
{
|
||
|
"data": {
|
||
|
"text/plain": [
|
||
|
"Other 687\n",
|
||
|
"Alone 101\n",
|
||
|
"Double room 76\n",
|
||
|
"Three person room 15\n",
|
||
|
"Four person room 12\n",
|
||
|
"Name: Cabin, dtype: int64"
|
||
|
]
|
||
|
},
|
||
|
"execution_count": 14,
|
||
|
"metadata": {},
|
||
|
"output_type": "execute_result"
|
||
|
}
|
||
|
],
|
||
|
"source": [
|
||
|
"df_train['Cabin'].value_counts()"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": 15,
|
||
|
"id": "ae86ac06",
|
||
|
"metadata": {},
|
||
|
"outputs": [],
|
||
|
"source": [
|
||
|
"# Replace the relabelled Cabin column with one indicator column per label.\n",
|
||
|
"df_train = pd.get_dummies(df_train, columns=['Cabin'])"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "markdown",
|
||
|
"id": "7ed9bfbb",
|
||
|
"metadata": {},
|
||
|
"source": [
|
||
|
"# Analysis"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": 16,
|
||
|
"id": "3c82db70",
|
||
|
"metadata": {},
|
||
|
"outputs": [
|
||
|
{
|
||
|
"data": {
|
||
|
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAdIAAAGWCAYAAADMqQJMAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjQuMywgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/MnkTPAAAACXBIWXMAAAsTAAALEwEAmpwYAAEAAElEQVR4nOyddXhT1/+A3yRN0tTdoVihFNfiPtiGDXf34e4Udx3uUGwMKAwbPmEbFIcxnAItUPdUY78/ElKhQAl82djvvs+TB5rczz3nnnuTc8+9535ekU6n0yEgICAgICBgEuJ/ugICAgICAgKfM0JHKiAgICAg8AEIHamAgICAgMAHIHSkAgICAgICH4DQkQoICAgICHwAQkcqICAgICDwAQgdqYCAgIDAZ8/Tp0+pWrUqz58/f+My+/fvp2HDhpQvX57OnTvz6NGjj1K20JEKCAgICHzW/Pzzz3Tu3JnExMQ3LhMcHMz8+fNZunQply5dokqVKgwdOpSPkUpB6EgFBAQEBD5b1q5dy+LFixk5cuRbl9u3bx9NmzalXLlyyGQyhg0bRmxsLBcuXPjgOph98BoEBAQEBAQ+MklJSSQlJb32vo2NDTY2Nsa/W7duzcCBA3nx4sVb1/fo0SPatm1r/FsikVCoUCHu379PjRo1PqiuQkcq8NmjigkxOTZz/XST4hKOvvk+zLvQqkUmxSkT5CaXqVJJTIpLzZSaXKZGa9oFLw2mtQ9Aook/aaki0y/OpYlNq2+MabsEgPviDJPi1GhNLnPr0wMmx77ifb6r2/ccY9WqVa+9P2TIEIYOHWr829XVNV/rS0lJwdzcPMd75ubmpKWl5btOb0LoSAUEBAQEPg1aTb4X7dGjB61atXrt/eyj0fdBoVCQkZHzBCQ9PR0rKyuT1pcdoSP9j5GYmMiyZcs4d+4cSUlJWFlZUbNmTUaOHImbm9tHLatv375UrlyZgQMHftT1TpgwAYD58+d/1PUKCAj8w+jyPyLOfQn3Q/Hx8SEkJGtErNFoePr0KT4+Ph+8bmGy0X+MkSNHEhcXR1BQEDdu3ODgwYNkZmbSq1cv1Gr1Ry1r06ZNH70TFRAQ+A+j1eb/9ZFp3bo1hw8f5urVq2RmZvLdd99hY2ND5cqVP3jdQkf6H+Pq1as0adIEJycnAJydnZk0aRLlypUjKSmJBg0aEBQUZFz+xx9/pEGDBoB+enjdunUZOXIklStXJjAwEF9fX0JDQ43L37hxg3LlypGcnEy3bt1YuXIlFy5coHz58qSkpBiXO3bsGHXr1kWr1aJUKgkICKBOnTrUqFGD8ePH55imfvbsWZo2bUr58uUZMGAA8fHx/+tmEhAQ+AfQadT5fn0oL1++pEKFCly5cgWAmjVrMmHCBCZNmoS/vz9Xr15l/fr1SKWmzwN4hXBp9z9G06ZNmTZtGsHBwVStWpVy5cpRoECBfF8mjYiIoGTJkixcuJDMzExOnz7Njz/+aLy5f+jQIRo3boy1tbUxplq1ajg5OXHy5Elat25tXK5Vq1aIxWImTpxIamoqhw4dQiaTMWPGDEaNGsXmzZsJCQlh+PDhLFiwgCZNmvDLL78wbNgwWrRo8d7bHvr8JaOmzGH/ttXG9y5cvs7hE2fR6XR0aNWUcqV8mb7gOywtFGSqVEzwsEfWsBO69BS00c9RXz0LgFmZWkhKVUOnTET77C7qv37Xv1++HmI3b8TB67EdOhBtUjLqJ09JOfAjAJatWiD1K4FIoSDtxGnSLwRjO3IoupRURJYWJC5bhcTeAbsRA9EmJaEKeYZynyG2TXNkfiUQKxSk/HSG9PMXcNm8EnVoGACZ3x3AaVgXtInJZDwMJX7XUeN2WtauhF37JrwYOheL6uWw/UZ/cmRZowKhPSeji0/HY0of1AlKMh48I3bHcWOsVd2KOHRoTOi3WceIQ8fGKEoX5enSILwDeqKOV5L2IJTIbScAsKldFue29UAEkYEnSX/8gsILB6
GKTkQVGceL7w4gc3Og0PQeqBOSSb3/nIit+ljb2mVwaV9Pf7xtP0Xy1QcUmdsHbVoGci9nHs/eReFJnVEnKEm5H0b41pMA2NUug2v7uoCI8O0nEZvLcGlXFwD7OmW51X4mGcmZ+E3viiohheT7YTzdehoAp9ql8WpfGxEinm4/TfzVR5Rd3BeNMh2xXErwxG14t62FczVfzBQybi8JQhkSYWyPslM6IZGbIbW24Mq4zWgz1YjMJNTbO4GXF+9i4eqARCHj5rKDJGWLqzS5IxKZFJmNgj/Hb6HM4OZYF3RGam2BxNGaW7vP4eXvi5m5jAvLDxL/JCtWIpdSb3Innl9+wP0jF3EpXYjy3RqSnphCNQcLtBotcoWcIyv2Efkk3BjXZXY/tBotEjMxu6dtpmzDSlT+ujparY57l/5GlaGiRNWSyBRyflzxQ47YbrP7o9NoEZuJ2Tltk2E9EsbsnPa2r17+eY9Lu++Ll5cX9+/fN/7t4eHB9evXcyzTpk0b2rRp89HLFkak/zFmz57NzJkziYmJYdasWTRq1IgvvviCw4cP53sd33zzDVKpFEtLS9q2bcuPP+p/6DMzM/npp59yTCEHEIlEtG7d2rhcdHQ0Fy5coE2bNsTGxnLq1CkmT56Mg4MDVlZWTJw4kd9//53Q0FCOHz9OmTJlaNq0KWZmZjRq1Ij69eu/93bHxMZx4MgJFIqcs/ICvw9i5oThTB8/jC0793H5+i28PNwYP3wA9na2xBavieryKTJPbEdSrDyI9VMpxQVLoEuOB5EIzQt99hNxgRKIHd1AJMLym2ak7DtI4uIVmNeoBhJ9nFapJGHOIhIXLkfxRUPM69RCJJchsrRA/TQUNBosWzcjeW8Q8Qu+w7ymvzFWl6wkftZi4heswKJJAyQuTogU5uhUKlTPwrBpUY/4wMNETF+DVb0qYKaPs/Avi8zbHbGFfttTL9wkfPwykk9dIHb9D2Q+DsOx81fEbD3Cy6lrsW6QFWtZvQxyb3ckllntZlm1FPIiXiAS4dqtMRGbj/F00gbsGlZCZIhz79+ckDFrCBm3Do/BrbD2L0XyxTs8nbQBjTIVm5qlcev+BeGbjhMyYRP2jSoaYz0GNufRqLU8HrsezyHfIFbIcWpZEzMHa3RaHW4d6/Ny03EeTdiEY7Y4r4HNeDBqHQ/HrsdryDck/H6bB8NXE3s8mLCVB0l7+ALv7g15sukEf03YgkujCsbYIgO/5uaoDdwcu5FiQ1rgWMOP1GdR/B2wg4zYJBwr+1CoXW2ujNnEX/P34TuoaVZ7FHRGZqPg+tQdRF+4i1fTqgCUHteW1JdxeDUoz5/jNnN94X5KDfjaGGdV0BmZjQWXAnYQceEe3l9X4eayg/w+cgOZiSkcG7qaUm1rc2r8Zv5YvJ/K2WIBKvf9KkeygKqDmpH0IhZLZ1s8fLzYMXE9hxZ/T+N+WSedMnMZN89cYU/AZrQaLQ4eTkjlUjaN+I6tY1ZT6Ut/arapx7aJ6whavIcv+7V8LXZngL4DdfRwBqDVqI7EvYx985fvfdBq8v/6jBA60v8YYrGYpk2bsmbNGi5evMjx48f5+uuvGTduXL4fPHZ2djb+/8svvyQxMZGrV69y7tw5bGxsqFq16msxbdq04erVq4SHh3P48GEqVapEgQIFjM92tW/fnsqVK1O5cmUaN26MXC4nLCyMyMhI3N3dc6yrYMGC773dTo4OjBzUGwuFIsf7OkAqlWIul5ORmUlMbDxurvrtc3NxQmNhgy4pTr9wegrI9fHqm7+R+dM2Ms/uQdawIyJbJ8xKVEJ15QwAEgcHNJFRAGiTkxFbWQKQdvocIoU5NsMGkhy4GzNPD1SPn5C4eAWyksWReHogcbRHExn9WmzqqZ8RKcyxGzGQ5G170GVkEjdtHvFzlyG2tcHctzCqcH2cJkmJxNoQF3yL+J1Zo1N9BcXYd2lK/K5jAJg526EKj9HHJmbFplz4i9jAY8YwqZcLNk2qExOoX5
/U2Y7Ml6/iUpDYWAD6kyedSo0uPROxXErCuavIvZzxntUHRTEvpM52SJ3tyHgVm/B6rNYQKxLB3Z4LeDRiDRmhkVi
|
||
|
"text/plain": [
|
||
|
"<Figure size 432x288 with 2 Axes>"
|
||
|
]
|
||
|
},
|
||
|
"metadata": {},
|
||
|
"output_type": "display_data"
|
||
|
}
|
||
|
],
|
||
|
"source": [
|
||
|
"# Heatmap of pairwise correlations for the 15 columns with the largest correlation to Survived.\n",
|
||
|
"corrmat = df_train.corr()\n",
|
||
|
"cols = corrmat.nlargest(15, 'Survived')['Survived'].index\n",
|
||
|
"# Reuse the matrix computed above, reordered by rank, instead of recomputing it with np.corrcoef.\n",
|
||
|
"cm = corrmat.loc[cols, cols]\n",
|
||
|
"sns.set(font_scale=1.25)\n",
|
||
|
"hm = sns.heatmap(cm, cbar=True, annot=True, square=True, fmt='.2f', annot_kws={'size': 8}, yticklabels=cols.values, xticklabels=cols.values)"
|
||
|
]
|
||
|
}
|
||
|
],
|
||
|
"metadata": {
|
||
|
"kernelspec": {
|
||
|
"display_name": "Python 3 (ipykernel)",
|
||
|
"language": "python",
|
||
|
"name": "python3"
|
||
|
},
|
||
|
"language_info": {
|
||
|
"codemirror_mode": {
|
||
|
"name": "ipython",
|
||
|
"version": 3
|
||
|
},
|
||
|
"file_extension": ".py",
|
||
|
"mimetype": "text/x-python",
|
||
|
"name": "python",
|
||
|
"nbconvert_exporter": "python",
|
||
|
"pygments_lexer": "ipython3",
|
||
|
"version": "3.9.7"
|
||
|
}
|
||
|
},
|
||
|
"nbformat": 4,
|
||
|
"nbformat_minor": 5
|
||
|
}
|