2023-02-16 13:32:30 +01:00
|
|
|
{
|
|
|
|
"cells": [
|
|
|
|
{
|
|
|
|
"cell_type": "markdown",
|
|
|
|
"id": "0ba6ee9e",
|
|
|
|
"metadata": {},
|
|
|
|
"source": [
|
2023-02-17 14:01:05 +01:00
|
|
|
"# Titanic Machine Learning from Disaster"
|
2023-02-16 13:32:30 +01:00
|
|
|
]
|
|
|
|
},
|
|
|
|
{
|
|
|
|
"cell_type": "markdown",
|
|
|
|
"id": "ec6e69b1",
|
|
|
|
"metadata": {},
|
|
|
|
"source": [
|
2023-02-17 14:01:05 +01:00
|
|
|
"## Imports"
|
2023-02-16 13:32:30 +01:00
|
|
|
]
|
|
|
|
},
|
|
|
|
{
|
|
|
|
"cell_type": "code",
|
|
|
|
"execution_count": 1,
|
|
|
|
"id": "ffcae455",
|
|
|
|
"metadata": {},
|
|
|
|
"outputs": [],
|
|
|
|
"source": [
|
|
|
|
"import numpy as np\n",
|
|
|
|
"import pandas as pd\n",
|
|
|
|
"import matplotlib.pyplot as plt\n",
|
|
|
|
"import seaborn as sns\n",
|
|
|
|
"import plotly.express as px"
|
|
|
|
]
|
|
|
|
},
|
|
|
|
{
|
|
|
|
"cell_type": "markdown",
|
|
|
|
"id": "d1b19cf9",
|
|
|
|
"metadata": {},
|
|
|
|
"source": [
|
2023-02-17 14:01:05 +01:00
|
|
|
"## Data description"
|
2023-02-16 13:32:30 +01:00
|
|
|
]
|
|
|
|
},
|
|
|
|
{
|
|
|
|
"cell_type": "code",
|
|
|
|
"execution_count": 2,
|
|
|
|
"id": "3174342e",
|
|
|
|
"metadata": {},
|
|
|
|
"outputs": [],
|
|
|
|
"source": [
|
2023-02-17 14:01:05 +01:00
|
|
|
"# Loading the data\n",
|
|
|
|
"df_train = pd.read_csv('train.csv')\n",
|
|
|
|
"df_test = pd.read_csv('test.csv')"
|
2023-02-16 13:32:30 +01:00
|
|
|
]
|
|
|
|
},
|
|
|
|
{
|
|
|
|
"cell_type": "code",
|
|
|
|
"execution_count": 3,
|
|
|
|
"id": "4d561fad",
|
|
|
|
"metadata": {
|
|
|
|
"scrolled": true
|
|
|
|
},
|
|
|
|
"outputs": [
|
|
|
|
{
|
|
|
|
"data": {
|
|
|
|
"text/plain": [
|
|
|
|
"Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',\n",
|
|
|
|
" 'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],\n",
|
|
|
|
" dtype='object')"
|
|
|
|
]
|
|
|
|
},
|
|
|
|
"execution_count": 3,
|
|
|
|
"metadata": {},
|
|
|
|
"output_type": "execute_result"
|
|
|
|
}
|
|
|
|
],
|
|
|
|
"source": [
|
|
|
|
"df_train.columns"
|
|
|
|
]
|
|
|
|
},
|
|
|
|
{
|
|
|
|
"cell_type": "code",
|
|
|
|
"execution_count": 4,
|
2023-02-17 14:01:05 +01:00
|
|
|
"id": "b2bfda08",
|
2023-02-16 13:32:30 +01:00
|
|
|
"metadata": {},
|
2023-02-17 14:01:05 +01:00
|
|
|
"outputs": [
|
|
|
|
{
|
|
|
|
"data": {
|
|
|
|
"text/plain": [
|
|
|
|
"Index(['PassengerId', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch',\n",
|
|
|
|
" 'Ticket', 'Fare', 'Cabin', 'Embarked'],\n",
|
|
|
|
" dtype='object')"
|
|
|
|
]
|
|
|
|
},
|
|
|
|
"execution_count": 4,
|
|
|
|
"metadata": {},
|
|
|
|
"output_type": "execute_result"
|
|
|
|
}
|
|
|
|
],
|
|
|
|
"source": [
|
|
|
|
"df_test.columns"
|
|
|
|
]
|
|
|
|
},
|
|
|
|
{
|
|
|
|
"cell_type": "code",
|
|
|
|
"execution_count": 5,
|
|
|
|
"id": "7818fc15",
|
|
|
|
"metadata": {
|
|
|
|
"scrolled": true
|
|
|
|
},
|
2023-02-16 13:32:30 +01:00
|
|
|
"outputs": [
|
|
|
|
{
|
|
|
|
"data": {
|
|
|
|
"text/html": [
|
|
|
|
"<div>\n",
|
|
|
|
"<style scoped>\n",
|
|
|
|
" .dataframe tbody tr th:only-of-type {\n",
|
|
|
|
" vertical-align: middle;\n",
|
|
|
|
" }\n",
|
|
|
|
"\n",
|
|
|
|
" .dataframe tbody tr th {\n",
|
|
|
|
" vertical-align: top;\n",
|
|
|
|
" }\n",
|
|
|
|
"\n",
|
|
|
|
" .dataframe thead th {\n",
|
|
|
|
" text-align: right;\n",
|
|
|
|
" }\n",
|
|
|
|
"</style>\n",
|
|
|
|
"<table border=\"1\" class=\"dataframe\">\n",
|
|
|
|
" <thead>\n",
|
|
|
|
" <tr style=\"text-align: right;\">\n",
|
|
|
|
" <th></th>\n",
|
|
|
|
" <th>PassengerId</th>\n",
|
|
|
|
" <th>Survived</th>\n",
|
|
|
|
" <th>Pclass</th>\n",
|
|
|
|
" <th>Age</th>\n",
|
|
|
|
" <th>SibSp</th>\n",
|
|
|
|
" <th>Parch</th>\n",
|
|
|
|
" <th>Fare</th>\n",
|
|
|
|
" </tr>\n",
|
|
|
|
" </thead>\n",
|
|
|
|
" <tbody>\n",
|
|
|
|
" <tr>\n",
|
|
|
|
" <th>count</th>\n",
|
|
|
|
" <td>891.000000</td>\n",
|
|
|
|
" <td>891.000000</td>\n",
|
|
|
|
" <td>891.000000</td>\n",
|
|
|
|
" <td>714.000000</td>\n",
|
|
|
|
" <td>891.000000</td>\n",
|
|
|
|
" <td>891.000000</td>\n",
|
|
|
|
" <td>891.000000</td>\n",
|
|
|
|
" </tr>\n",
|
|
|
|
" <tr>\n",
|
|
|
|
" <th>mean</th>\n",
|
|
|
|
" <td>446.000000</td>\n",
|
|
|
|
" <td>0.383838</td>\n",
|
|
|
|
" <td>2.308642</td>\n",
|
|
|
|
" <td>29.699118</td>\n",
|
|
|
|
" <td>0.523008</td>\n",
|
|
|
|
" <td>0.381594</td>\n",
|
|
|
|
" <td>32.204208</td>\n",
|
|
|
|
" </tr>\n",
|
|
|
|
" <tr>\n",
|
|
|
|
" <th>std</th>\n",
|
|
|
|
" <td>257.353842</td>\n",
|
|
|
|
" <td>0.486592</td>\n",
|
|
|
|
" <td>0.836071</td>\n",
|
|
|
|
" <td>14.526497</td>\n",
|
|
|
|
" <td>1.102743</td>\n",
|
|
|
|
" <td>0.806057</td>\n",
|
|
|
|
" <td>49.693429</td>\n",
|
|
|
|
" </tr>\n",
|
|
|
|
" <tr>\n",
|
|
|
|
" <th>min</th>\n",
|
|
|
|
" <td>1.000000</td>\n",
|
|
|
|
" <td>0.000000</td>\n",
|
|
|
|
" <td>1.000000</td>\n",
|
|
|
|
" <td>0.420000</td>\n",
|
|
|
|
" <td>0.000000</td>\n",
|
|
|
|
" <td>0.000000</td>\n",
|
|
|
|
" <td>0.000000</td>\n",
|
|
|
|
" </tr>\n",
|
|
|
|
" <tr>\n",
|
|
|
|
" <th>25%</th>\n",
|
|
|
|
" <td>223.500000</td>\n",
|
|
|
|
" <td>0.000000</td>\n",
|
|
|
|
" <td>2.000000</td>\n",
|
|
|
|
" <td>20.125000</td>\n",
|
|
|
|
" <td>0.000000</td>\n",
|
|
|
|
" <td>0.000000</td>\n",
|
|
|
|
" <td>7.910400</td>\n",
|
|
|
|
" </tr>\n",
|
|
|
|
" <tr>\n",
|
|
|
|
" <th>50%</th>\n",
|
|
|
|
" <td>446.000000</td>\n",
|
|
|
|
" <td>0.000000</td>\n",
|
|
|
|
" <td>3.000000</td>\n",
|
|
|
|
" <td>28.000000</td>\n",
|
|
|
|
" <td>0.000000</td>\n",
|
|
|
|
" <td>0.000000</td>\n",
|
|
|
|
" <td>14.454200</td>\n",
|
|
|
|
" </tr>\n",
|
|
|
|
" <tr>\n",
|
|
|
|
" <th>75%</th>\n",
|
|
|
|
" <td>668.500000</td>\n",
|
|
|
|
" <td>1.000000</td>\n",
|
|
|
|
" <td>3.000000</td>\n",
|
|
|
|
" <td>38.000000</td>\n",
|
|
|
|
" <td>1.000000</td>\n",
|
|
|
|
" <td>0.000000</td>\n",
|
|
|
|
" <td>31.000000</td>\n",
|
|
|
|
" </tr>\n",
|
|
|
|
" <tr>\n",
|
|
|
|
" <th>max</th>\n",
|
|
|
|
" <td>891.000000</td>\n",
|
|
|
|
" <td>1.000000</td>\n",
|
|
|
|
" <td>3.000000</td>\n",
|
|
|
|
" <td>80.000000</td>\n",
|
|
|
|
" <td>8.000000</td>\n",
|
|
|
|
" <td>6.000000</td>\n",
|
|
|
|
" <td>512.329200</td>\n",
|
|
|
|
" </tr>\n",
|
|
|
|
" </tbody>\n",
|
|
|
|
"</table>\n",
|
|
|
|
"</div>"
|
|
|
|
],
|
|
|
|
"text/plain": [
|
|
|
|
" PassengerId Survived Pclass Age SibSp \\\n",
|
|
|
|
"count 891.000000 891.000000 891.000000 714.000000 891.000000 \n",
|
|
|
|
"mean 446.000000 0.383838 2.308642 29.699118 0.523008 \n",
|
|
|
|
"std 257.353842 0.486592 0.836071 14.526497 1.102743 \n",
|
|
|
|
"min 1.000000 0.000000 1.000000 0.420000 0.000000 \n",
|
|
|
|
"25% 223.500000 0.000000 2.000000 20.125000 0.000000 \n",
|
|
|
|
"50% 446.000000 0.000000 3.000000 28.000000 0.000000 \n",
|
|
|
|
"75% 668.500000 1.000000 3.000000 38.000000 1.000000 \n",
|
|
|
|
"max 891.000000 1.000000 3.000000 80.000000 8.000000 \n",
|
|
|
|
"\n",
|
|
|
|
" Parch Fare \n",
|
|
|
|
"count 891.000000 891.000000 \n",
|
|
|
|
"mean 0.381594 32.204208 \n",
|
|
|
|
"std 0.806057 49.693429 \n",
|
|
|
|
"min 0.000000 0.000000 \n",
|
|
|
|
"25% 0.000000 7.910400 \n",
|
|
|
|
"50% 0.000000 14.454200 \n",
|
|
|
|
"75% 0.000000 31.000000 \n",
|
|
|
|
"max 6.000000 512.329200 "
|
|
|
|
]
|
|
|
|
},
|
2023-02-17 14:01:05 +01:00
|
|
|
"execution_count": 5,
|
2023-02-16 13:32:30 +01:00
|
|
|
"metadata": {},
|
|
|
|
"output_type": "execute_result"
|
|
|
|
}
|
|
|
|
],
|
|
|
|
"source": [
|
|
|
|
"df_train.describe()"
|
|
|
|
]
|
|
|
|
},
|
|
|
|
{
|
2023-02-17 14:01:05 +01:00
|
|
|
"cell_type": "code",
|
|
|
|
"execution_count": 6,
|
|
|
|
"id": "9c83bffc",
|
|
|
|
"metadata": {
|
|
|
|
"scrolled": false
|
|
|
|
},
|
|
|
|
"outputs": [
|
|
|
|
{
|
|
|
|
"data": {
|
|
|
|
"text/html": [
|
|
|
|
"<div>\n",
|
|
|
|
"<style scoped>\n",
|
|
|
|
" .dataframe tbody tr th:only-of-type {\n",
|
|
|
|
" vertical-align: middle;\n",
|
|
|
|
" }\n",
|
|
|
|
"\n",
|
|
|
|
" .dataframe tbody tr th {\n",
|
|
|
|
" vertical-align: top;\n",
|
|
|
|
" }\n",
|
|
|
|
"\n",
|
|
|
|
" .dataframe thead th {\n",
|
|
|
|
" text-align: right;\n",
|
|
|
|
" }\n",
|
|
|
|
"</style>\n",
|
|
|
|
"<table border=\"1\" class=\"dataframe\">\n",
|
|
|
|
" <thead>\n",
|
|
|
|
" <tr style=\"text-align: right;\">\n",
|
|
|
|
" <th></th>\n",
|
|
|
|
" <th>PassengerId</th>\n",
|
|
|
|
" <th>Pclass</th>\n",
|
|
|
|
" <th>Age</th>\n",
|
|
|
|
" <th>SibSp</th>\n",
|
|
|
|
" <th>Parch</th>\n",
|
|
|
|
" <th>Fare</th>\n",
|
|
|
|
" </tr>\n",
|
|
|
|
" </thead>\n",
|
|
|
|
" <tbody>\n",
|
|
|
|
" <tr>\n",
|
|
|
|
" <th>count</th>\n",
|
|
|
|
" <td>418.000000</td>\n",
|
|
|
|
" <td>418.000000</td>\n",
|
|
|
|
" <td>332.000000</td>\n",
|
|
|
|
" <td>418.000000</td>\n",
|
|
|
|
" <td>418.000000</td>\n",
|
|
|
|
" <td>417.000000</td>\n",
|
|
|
|
" </tr>\n",
|
|
|
|
" <tr>\n",
|
|
|
|
" <th>mean</th>\n",
|
|
|
|
" <td>1100.500000</td>\n",
|
|
|
|
" <td>2.265550</td>\n",
|
|
|
|
" <td>30.272590</td>\n",
|
|
|
|
" <td>0.447368</td>\n",
|
|
|
|
" <td>0.392344</td>\n",
|
|
|
|
" <td>35.627188</td>\n",
|
|
|
|
" </tr>\n",
|
|
|
|
" <tr>\n",
|
|
|
|
" <th>std</th>\n",
|
|
|
|
" <td>120.810458</td>\n",
|
|
|
|
" <td>0.841838</td>\n",
|
|
|
|
" <td>14.181209</td>\n",
|
|
|
|
" <td>0.896760</td>\n",
|
|
|
|
" <td>0.981429</td>\n",
|
|
|
|
" <td>55.907576</td>\n",
|
|
|
|
" </tr>\n",
|
|
|
|
" <tr>\n",
|
|
|
|
" <th>min</th>\n",
|
|
|
|
" <td>892.000000</td>\n",
|
|
|
|
" <td>1.000000</td>\n",
|
|
|
|
" <td>0.170000</td>\n",
|
|
|
|
" <td>0.000000</td>\n",
|
|
|
|
" <td>0.000000</td>\n",
|
|
|
|
" <td>0.000000</td>\n",
|
|
|
|
" </tr>\n",
|
|
|
|
" <tr>\n",
|
|
|
|
" <th>25%</th>\n",
|
|
|
|
" <td>996.250000</td>\n",
|
|
|
|
" <td>1.000000</td>\n",
|
|
|
|
" <td>21.000000</td>\n",
|
|
|
|
" <td>0.000000</td>\n",
|
|
|
|
" <td>0.000000</td>\n",
|
|
|
|
" <td>7.895800</td>\n",
|
|
|
|
" </tr>\n",
|
|
|
|
" <tr>\n",
|
|
|
|
" <th>50%</th>\n",
|
|
|
|
" <td>1100.500000</td>\n",
|
|
|
|
" <td>3.000000</td>\n",
|
|
|
|
" <td>27.000000</td>\n",
|
|
|
|
" <td>0.000000</td>\n",
|
|
|
|
" <td>0.000000</td>\n",
|
|
|
|
" <td>14.454200</td>\n",
|
|
|
|
" </tr>\n",
|
|
|
|
" <tr>\n",
|
|
|
|
" <th>75%</th>\n",
|
|
|
|
" <td>1204.750000</td>\n",
|
|
|
|
" <td>3.000000</td>\n",
|
|
|
|
" <td>39.000000</td>\n",
|
|
|
|
" <td>1.000000</td>\n",
|
|
|
|
" <td>0.000000</td>\n",
|
|
|
|
" <td>31.500000</td>\n",
|
|
|
|
" </tr>\n",
|
|
|
|
" <tr>\n",
|
|
|
|
" <th>max</th>\n",
|
|
|
|
" <td>1309.000000</td>\n",
|
|
|
|
" <td>3.000000</td>\n",
|
|
|
|
" <td>76.000000</td>\n",
|
|
|
|
" <td>8.000000</td>\n",
|
|
|
|
" <td>9.000000</td>\n",
|
|
|
|
" <td>512.329200</td>\n",
|
|
|
|
" </tr>\n",
|
|
|
|
" </tbody>\n",
|
|
|
|
"</table>\n",
|
|
|
|
"</div>"
|
|
|
|
],
|
|
|
|
"text/plain": [
|
|
|
|
" PassengerId Pclass Age SibSp Parch Fare\n",
|
|
|
|
"count 418.000000 418.000000 332.000000 418.000000 418.000000 417.000000\n",
|
|
|
|
"mean 1100.500000 2.265550 30.272590 0.447368 0.392344 35.627188\n",
|
|
|
|
"std 120.810458 0.841838 14.181209 0.896760 0.981429 55.907576\n",
|
|
|
|
"min 892.000000 1.000000 0.170000 0.000000 0.000000 0.000000\n",
|
|
|
|
"25% 996.250000 1.000000 21.000000 0.000000 0.000000 7.895800\n",
|
|
|
|
"50% 1100.500000 3.000000 27.000000 0.000000 0.000000 14.454200\n",
|
|
|
|
"75% 1204.750000 3.000000 39.000000 1.000000 0.000000 31.500000\n",
|
|
|
|
"max 1309.000000 3.000000 76.000000 8.000000 9.000000 512.329200"
|
|
|
|
]
|
|
|
|
},
|
|
|
|
"execution_count": 6,
|
|
|
|
"metadata": {},
|
|
|
|
"output_type": "execute_result"
|
|
|
|
}
|
|
|
|
],
|
2023-02-16 13:32:30 +01:00
|
|
|
"source": [
|
2023-02-17 14:01:05 +01:00
|
|
|
"df_test.describe()"
|
2023-02-16 13:32:30 +01:00
|
|
|
]
|
|
|
|
},
|
|
|
|
{
|
|
|
|
"cell_type": "code",
|
2023-02-17 14:01:05 +01:00
|
|
|
"execution_count": 7,
|
|
|
|
"id": "0b345650",
|
2023-02-16 13:32:30 +01:00
|
|
|
"metadata": {},
|
|
|
|
"outputs": [
|
|
|
|
{
|
|
|
|
"data": {
|
|
|
|
"text/plain": [
|
2023-02-17 14:01:05 +01:00
|
|
|
"PassengerId 0\n",
|
|
|
|
"Survived 0\n",
|
|
|
|
"Pclass 0\n",
|
|
|
|
"Name 0\n",
|
|
|
|
"Sex 0\n",
|
|
|
|
"Age 177\n",
|
|
|
|
"SibSp 0\n",
|
|
|
|
"Parch 0\n",
|
|
|
|
"Ticket 0\n",
|
|
|
|
"Fare 0\n",
|
|
|
|
"Cabin 687\n",
|
|
|
|
"Embarked 2\n",
|
|
|
|
"dtype: int64"
|
2023-02-16 13:32:30 +01:00
|
|
|
]
|
|
|
|
},
|
2023-02-17 14:01:05 +01:00
|
|
|
"execution_count": 7,
|
2023-02-16 13:32:30 +01:00
|
|
|
"metadata": {},
|
|
|
|
"output_type": "execute_result"
|
2023-02-17 14:01:05 +01:00
|
|
|
}
|
|
|
|
],
|
|
|
|
"source": [
|
|
|
|
"df_train.isna().sum()"
|
|
|
|
]
|
|
|
|
},
|
|
|
|
{
|
|
|
|
"cell_type": "code",
|
|
|
|
"execution_count": 8,
|
|
|
|
"id": "af40052a",
|
|
|
|
"metadata": {},
|
|
|
|
"outputs": [
|
2023-02-16 13:32:30 +01:00
|
|
|
{
|
|
|
|
"data": {
|
|
|
|
"text/plain": [
|
2023-02-17 14:01:05 +01:00
|
|
|
"PassengerId 0\n",
|
|
|
|
"Pclass 0\n",
|
|
|
|
"Name 0\n",
|
|
|
|
"Sex 0\n",
|
|
|
|
"Age 86\n",
|
|
|
|
"SibSp 0\n",
|
|
|
|
"Parch 0\n",
|
|
|
|
"Ticket 0\n",
|
|
|
|
"Fare 1\n",
|
|
|
|
"Cabin 327\n",
|
|
|
|
"Embarked 0\n",
|
|
|
|
"dtype: int64"
|
2023-02-16 13:32:30 +01:00
|
|
|
]
|
|
|
|
},
|
2023-02-17 14:01:05 +01:00
|
|
|
"execution_count": 8,
|
|
|
|
"metadata": {},
|
|
|
|
"output_type": "execute_result"
|
2023-02-16 13:32:30 +01:00
|
|
|
}
|
|
|
|
],
|
|
|
|
"source": [
|
2023-02-17 14:01:05 +01:00
|
|
|
"df_test.isna().sum()"
|
2023-02-16 13:32:30 +01:00
|
|
|
]
|
|
|
|
},
|
|
|
|
{
|
|
|
|
"cell_type": "code",
|
2023-02-17 14:01:05 +01:00
|
|
|
"execution_count": 9,
|
|
|
|
"id": "8cec8cda",
|
2023-02-16 13:32:30 +01:00
|
|
|
"metadata": {},
|
|
|
|
"outputs": [],
|
|
|
|
"source": [
|
2023-02-17 14:01:05 +01:00
|
|
|
"df_test['Fare'].fillna(df_test['Fare'].mean(), inplace=True)"
|
2023-02-16 13:32:30 +01:00
|
|
|
]
|
|
|
|
},
|
|
|
|
{
|
|
|
|
"cell_type": "code",
|
2023-02-17 14:01:05 +01:00
|
|
|
"execution_count": 10,
|
|
|
|
"id": "6f612c59",
|
2023-02-16 13:32:30 +01:00
|
|
|
"metadata": {},
|
|
|
|
"outputs": [],
|
|
|
|
"source": [
|
2023-02-17 14:01:05 +01:00
|
|
|
"df_test['Cabin'].fillna('Other', inplace=True)"
|
2023-02-16 13:32:30 +01:00
|
|
|
]
|
|
|
|
},
|
|
|
|
{
|
2023-02-17 14:01:05 +01:00
|
|
|
"cell_type": "markdown",
|
|
|
|
"id": "07234316",
|
2023-02-16 13:32:30 +01:00
|
|
|
"metadata": {},
|
|
|
|
"source": [
|
2023-02-17 14:01:05 +01:00
|
|
|
"## Preexploratory Data Analysis"
|
2023-02-16 13:32:30 +01:00
|
|
|
]
|
|
|
|
},
|
|
|
|
{
|
|
|
|
"cell_type": "code",
|
2023-02-17 14:01:05 +01:00
|
|
|
"execution_count": 11,
|
|
|
|
"id": "2facd3d5",
|
2023-02-16 13:32:30 +01:00
|
|
|
"metadata": {},
|
2023-02-17 14:01:05 +01:00
|
|
|
"outputs": [
|
|
|
|
{
|
|
|
|
"data": {
|
|
|
|
"text/plain": [
|
|
|
|
"(342, 549)"
|
|
|
|
]
|
|
|
|
},
|
|
|
|
"execution_count": 11,
|
|
|
|
"metadata": {},
|
|
|
|
"output_type": "execute_result"
|
|
|
|
}
|
|
|
|
],
|
2023-02-16 13:32:30 +01:00
|
|
|
"source": [
|
2023-02-17 14:01:05 +01:00
|
|
|
"len(df_train[df_train['Survived']==1]), len(df_train[df_train['Survived']==0])"
|
2023-02-16 13:32:30 +01:00
|
|
|
]
|
|
|
|
},
|
|
|
|
{
|
|
|
|
"cell_type": "code",
|
2023-02-17 14:01:05 +01:00
|
|
|
"execution_count": 12,
|
|
|
|
"id": "1472e369",
|
|
|
|
"metadata": {},
|
2023-02-16 13:32:30 +01:00
|
|
|
"outputs": [
|
|
|
|
{
|
|
|
|
"data": {
|
|
|
|
"text/plain": [
|
2023-02-17 14:01:05 +01:00
|
|
|
"<seaborn.axisgrid.FacetGrid at 0x1e194179a00>"
|
2023-02-16 13:32:30 +01:00
|
|
|
]
|
|
|
|
},
|
2023-02-17 14:01:05 +01:00
|
|
|
"execution_count": 12,
|
2023-02-16 13:32:30 +01:00
|
|
|
"metadata": {},
|
|
|
|
"output_type": "execute_result"
|
2023-02-17 14:01:05 +01:00
|
|
|
},
|
|
|
|
{
|
|
|
|
"data": {
|
|
|
|
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAWAAAAFgCAYAAACFYaNMAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjQuMywgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/MnkTPAAAACXBIWXMAAAsTAAALEwEAmpwYAABHU0lEQVR4nO3deXhU5dnH8e89M5nJvhAS9i3sBAQFEXEHRKxWrbVqa2ttq5aq7avVVruAW0VwAcGlSK2tti61FZeCiIC7EBZlh7AjBIKQANkz6/P+kWARAtlm5sxyf64rF8nMyTm3TvLLM895FjHGoJRSKvxsVheglFLxSgNYKaUsogGslFIW0QBWSimLaAArpZRFHFYXEArjxo0z7777rtVlKKXUEdLQgzHZAi4pKbG6BKWUalRMBrBSSkUDDWCllLKIBrBSSllEA1gppSyiAayUUhbRAFZKKYtoACullEU0gJVSyiIawEopZRENYKWUsogGsFJKWUQDWCmlLBKTq6EppYLr5Zdf5s03ZgNN20MyIcHJHydMpH///qEtLMppACulTqq0tJS//+1vdEysoUuKr0nfs6okkVmznmXatCdCW1yU0wBWSp3Uq6++is/n5db8CtolB5r0Pe/u9vPyylWsXr2awYMHh7jC6KV9wEqpE9q/fz9vvfkmZ7WrbXL4AozqWEumC/763F8wpmndFvFIA1gpdUIzZ/4ZE/DynR41zfo+px2u6F7JmrXr+PDDD0NTXAzQAFZKNWj16tW8//4HXNKlmrZJTW/9HnF+Rzfd0gI88/RT1NbWhqDC6KcBrJQ6jsfjYdrUx8lOgku6Na/1e4RN4Ie9KzhQUsrf//734BYYIzSAlVLHeeGFF9j55S5u6FOOy97y8/TN9HF+x1pe+9e/WL9+ffAKjBEawEqpbygsLOSVl1/mnA61DM72tvp83+9VTVaiYfLDk3C73UGoMHZoACulvlZZWckD999Hpsvwg17VQTlnksPws77l7C7aw5NPPhmUc8YKHQccRzweD2+++eY3bojk5eVx9tlnW1iVihTGGB599BH27dvH708tIyUheMPHBrbxcknXGubMmcOQIUMYM2ZM0M4dzSwNYBF5HrgU2G+MGdjA89cBd9d/WQn8whizOowlxpTFixfzzDPPfOMxR0IC8955h4SEBIuqUpHijTfe4KOPPuaanlX0yWzajLfmuCqvmi3lCTz26CP07t2bbt26Bf0a0cbqLoi/A+NO8vwO4DxjzCnAg8CscBQVqwoLC8Fmp+K066kYdgM1eefh83rZvn271aUpi33xxRc89dRTnNrWw8VdQzNkzG6DW/IrcOLh97+7h/Ly8pBcJ5pYGsDGmI+Bgyd5frEx5lD9lwVA57AUFqM2biwkkNQG7A4QG/6UHKA+mFXc2r17N/dOmECHZB/jB1Rik9Bdq40rwP8NLOOrfcXcO3EiPl/wW9rRxOoWcHP8DJh3oidF5GYRWSEiKw4cOBDGsqKD1+tlw8YN+FJzvn7MuNIQZzJr1661sDJlpbKyMn53z93greKOQWUkOUI/bbh3ho+f9q1g5apVTJs2La6nKkdFAIvIBdQF8N0nOsYYM8sYM8wYMywnJ+dEh8WtwsJCvB4P/rT2/3tQBE9KO75YuTKufwniVU1NDffc/Vu+Kt7LrwaWkduC2W4tdXYHD9/uVs3cuXPjepJGxAewiJwCPAdcbowptbqeaLVy5UoA/Kntv/G4P709B0tL2bNnjxVlKYv4fD7uu/deCjdt4hf5FfQNwU23xlyVV8M5HWp54YUXeOutt8J+/UgQ0QEsIl2B2cCPjDGbra4nmhUULCWQ0haTkPiNx33pnQBYunSpFWUpC/j9fiZPnszSZcu4oU8lw3I8ltQhAj/tW8WpbT088cQ0Fi1aZEkdVrI0gEXkFWAJ0FdEikTkZyIyXkTG1x8yEcgGnhGRVSKywrJio1h5eTkbN27Am3H8PUyTmA5JGRQUFF
hQmQq3QCDA1KlTWbhwId/Lq+KCTtbOTDsyMqJvpo+HHnqITz75xNJ6ws3qURDfN8Z0MMYkGGM6G2P+aoyZaYyZWf/8jcaYLGPMkPqPYVbWG62WLFmCMQZfRpcGn/ekd+aLlSupqqoKc2UqnIwxzJgxg7lz53J592q+3T0yVihz2eGOQWXkpXm5/7774qoxENFdECo4PvroI3ClEkhp2+Dzvqzu+H0+lixZEubKVLgYY3j66ad58803ubhrDVc2c33fUEtywJ2nlNE5xcuECX+Mmy4xDeAYV1lZybJly/Fkdq3rdGuAPzUXcSbzwQcfhLk6FQ5HWr7/+c9/GNu5hmt7Vp/oR8FSKQmG3ww+TMdED3/8w+/joiWsARzjPv74Y3w+L942eSc+SAR3Vg8KCgp0dlKMMcYwffp03njjDcZ1qeG63pEZvkekJRjuHnKYTsleJvzxDyxevNjqkkJKAzjGvffee5CUQSDl5GOjvdk98fv9un1MDAkEAjz++ONfdzt8v1dkh+8RqQmGuwcfpnOyh4kTJsT0jTkN4BhWXFzMqtWrcbfpecLuhyMCydmY5CzeeeedMFWnQsnv9zNlyhTmzJnDt7tVR2y3w4mkJBh+O7iM7qke7r33Xt5//32rSwoJDeAYNm/ePDAGb3avxg8WwZ3dm8LCQl2cJ8r5fHVDuubPn8+VPaq5Kq8mqsL3iCN9wr3SPfzpwQfr3s3FGA3gGOXz+Zg79x18GZ0xrtSmfU92L7DZmDt3boirU6Hi9Xp54IEHeP/99/leXhVX9IjO8D0iyQG/GVxGv0wvDz88KebeoWkAx6iCggJKS0vw5vRp8veYhES8md2Y9+67uottFPJ46t6uf/zxx/ygV1XEjPNtLZcdfn1KGflZXh555JGYmrasARyj3njzTXCl4Mvs2qzv8+b2p7qqSoekRRm3282ECX9k8eLFXN+nknEhWtPXKk473D6onMHZHqZNm8bs2bOtLikoNIBjUFFREZ+vWIE7uw9I815if2o7THIWs994I0TVqWDzeOpGCyxduoyf9K1kTOfY3PjSaYf/G1TB0BwPM2bMiIkQ1gCOQW+99RaIDW9O3+Z/swjutn3ZsnkzGzduDH5xKqg8Hg8TJ05g6bK68LV6bYdQc9jg1vwKhratC+E3oryhoAEcY2pra5n7zjt4s7phnMktOoe3bW/E4Yz6H+5Y5/P5uO++eykoWBoX4XuEwwa3Dqzg1LYepk+fzn//+1+rS2oxDeAYs2jRIqqrqvDm9m/5SewJuNv05P33P+Dw4cNBq00FTyAQYPLkySxevITr+8RP+B7hsMEvB1YwONvL1KmPR+04YQ3gGPPGm29ikrPwp7Zr1Xm8uf3w+by8++67QapMBYsxhieffJKFCxdyVV51zPb5NsZhg9sGltMnw8ekhx6KygV8NIBjSGFhIVu3bMGd06/RmW+NCSRl4U9rzxtvvkkgEL6talTjXnrpJd544w0u7lLDt7tF1qpm4eaywx2nlNMp2cvECX+Mug1mNYBjyNy5cxF7At42PYNyPk9OX77at49Vq1YF5Xyq9T766COee+45zmzn5tooWdsh1JIdhrsGHybN7uUPv7uHaNqUVwM4RtTU1LBg4UI8md3A4QzKOX1Z3RCHS2fGRYhNmzYx6aE/0SvDz8/6VWr4HiXDabhj0GGqKg7z+9/9jpqa6HhnoAEcIz755BNqa2qaNfOtUTYH7jZ5fPTRx1RWVgbvvKrZKioq+OMffk+q3cv/DSrDabe6osjTJdXPLQPK2bptK1OnTrW6nCbRAI4RCxcuhMS0Vt98O5Y3uyc+n5dPP/00qOdVTWeM4bHHHuVgaSm/zD9MhtNYXVLEGtLWy+XdqlmwYEHd70SE0wCOAYcPH2bFihW4M7u3+ubbsQIpOZCYzsI43LE2UsybN4+PPvqY7+ZVkZfut7qciHd59xp6Z/iZ+vhjFBcXW13OSWkAx4CCggICgQC+Nj2Cf3IR3JndWPnFF7
pppwUOHTrEU0/OoH+Wj2/F2PoOoWK3wfgB5RhfLU9Mm2Z1OSelARwDlixZgjiTCSRnh+T8/swu+P1+VqxYEZLzqxN
|
|
|
|
"text/plain": [
|
|
|
|
"<Figure size 360x360 with 1 Axes>"
|
|
|
|
]
|
|
|
|
},
|
|
|
|
"metadata": {
|
|
|
|
"needs_background": "light"
|
|
|
|
},
|
|
|
|
"output_type": "display_data"
|
2023-02-16 13:32:30 +01:00
|
|
|
}
|
|
|
|
],
|
|
|
|
"source": [
|
2023-02-17 14:01:05 +01:00
|
|
|
"sns.catplot(x='Sex', y='Survived', data=df_train, kind='violin')"
|
2023-02-16 13:32:30 +01:00
|
|
|
]
|
|
|
|
},
|
|
|
|
{
|
|
|
|
"cell_type": "code",
|
2023-02-17 14:01:05 +01:00
|
|
|
"execution_count": 13,
|
|
|
|
"id": "6b33f748",
|
2023-02-16 13:32:30 +01:00
|
|
|
"metadata": {
|
2023-02-17 14:01:05 +01:00
|
|
|
"scrolled": true
|
2023-02-16 13:32:30 +01:00
|
|
|
},
|
|
|
|
"outputs": [
|
|
|
|
{
|
|
|
|
"data": {
|
|
|
|
"text/plain": [
|
2023-02-17 14:01:05 +01:00
|
|
|
"<AxesSubplot:xlabel='Sex', ylabel='Survived'>"
|
2023-02-16 13:32:30 +01:00
|
|
|
]
|
|
|
|
},
|
2023-02-17 14:01:05 +01:00
|
|
|
"execution_count": 13,
|
2023-02-16 13:32:30 +01:00
|
|
|
"metadata": {},
|
|
|
|
"output_type": "execute_result"
|
2023-02-17 14:01:05 +01:00
|
|
|
},
|
|
|
|
{
|
|
|
|
"data": {
|
|
|
|
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAYIAAAEGCAYAAABo25JHAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjQuMywgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/MnkTPAAAACXBIWXMAAAsTAAALEwEAmpwYAAAUL0lEQVR4nO3df7jedX3f8eeLgxkDEWc5Ky7BkmmUYgcoIdZdtuJaNHTdotOOX9co1jZXNqP7hUjXlW5S1wmbV7WEZplXRrurl6kbTmOXNrW2pQ7LlsPKr4BxZ4mSk5D1RFYF6iUeee+P+4bd3OdOcgP5nEPyfT6u61y5P9/v5/7e78CdvPL9fL/fzydVhSSpu05Y7AIkSYvLIJCkjjMIJKnjDAJJ6jiDQJI67sTFLuDZOv300+uss85a7DIk6Zhy1113HayqyVH7jrkgOOuss5iamlrsMiTpmJLka4fa59CQJHWcQSBJHdc0CJKsTrIryXSS60bsPy3J55Lck2Rnkne3rEeSNF+zIEgyAWwALgHOAS5Pcs5Qt/cCD1TVecBFwL9NsqRVTZKk+VqeEawCpqtqd1U9AWwB1gz1KeDUJAFeDDwCzDWsSZI0pGUQLAX2DrRn+tsG3Qx8P7AfuA/4h1X15PCBkqxNMpVkanZ2tlW9ktRJLYMgI7YNT3X6NuBu4K8A5wM3J3nJvDdVbaqqlVW1cnJy5G2wkqTnqGUQzABnDrSX0fuX/6B3A5+unmlgD3B2w5okSUNaPlC2A1iRZDmwD7gMuGKoz0PAjwBfTPK9wGuA3Q1rknQMuPbaazlw4ABnnHEGN95442KXc9xrFgRVNZdkPbAdmAA2V9XOJOv6+zcCNwC3JrmP3lDSB6vqYKuaJB0bDhw4wL59+xa7jM5oOsVEVW0Dtg1t2zjwej/w1pY1SJIOzyeLJanjDAJJ6jiDQJI6ziCQpI4zCCSp4wwCSeo4g0CSOs4gkKSOMwgkqeOOucXrpePZQx/6a4tdwgvC3CMvA05k7pGv+d8EeMX19zU9vmcEktRxBoEkdZxBIEkdZxBIUscZBJLUcQaBJHWcQSBJHdc0CJKsTrIryXSS60bs/0CSu/s/9yf5bpKXtaxJkvRMzYIgyQSwAbgEOAe4PMk5g32q6qaqOr+qzgd+Fri9qh5pVZMkab6WZwSrgOmq2l1VTwBbgDWH6X858MmG9UiSRmgZBEuBvQPtmf62eZKcDKwGbmtYjyRphJZBkBHb6hB9/xZwx6GGhZKsTTKVZGp2dvaoFShJahsEM8CZA+1lwP5D9L2MwwwLVdWmqlpZVSsnJyePYomSpJZBsANYkWR5kiX0/rLfOtwpyWnAm4HPNqxF0jHk9JOe5Hv/4hynn/TkYpfSCc2moa6quSTrge3ABLC5qnYmWdffv7Hf9R3A71bV461qkXRsuebcP1vsEjql6XoEVbUN2Da0beNQ+1bg1pZ1SJIOzSeLJanjDAJJ6jiDQJI6ziCQpI4zCCSp4wwCSeo4g0CSOs4gkKSOMwgkqeMMAknqOINAkjrOIJCkjjMIJKnjDAJJ6jiDQJI6ziCQpI4zCCSp4wwCSeq4pkGQZHWSXUmmk1x3iD4XJbk7yc4kt7esR5I0X7M1i5NMABuAi4EZYEeSrVX1wECflwK3AKur6qEkf7lVPZKk0VqeEawCpqtqd1U9AWwB1gz1uQL4dFU9BFBVf9qwHknSCC2DYCmwd6A909826NXAX0ryh0nuSnLVqAMlWZtkKsnU7Oxso3IlqZtaBkFGbKuh9onABcDfBN4G/HySV897U9WmqlpZVSsnJyePfqWS1GHNrhHQOwM4c6C9DNg/os/BqnoceDzJHwHnAV9pWJckaUDLM4IdwIoky5MsAS4Dtg71+SzwQ0lOTHIy8AbgwYY1SZKGNDsjqKq5JOuB7cAEsLmqdiZZ19+/saoeTPI7wL3Ak8Anqur+VjVJkuZrOTREVW0Dtg1t2zjUvgm4qWUdkq
RD88liSeo4g0CSOs4gkKSOMwgkqeMMAknqOINAkjrOIJCkjjMIJKnjDAJJ6jiDQJI6ziCQpI4zCCSp4wwCSeo4g0CSOs4gkKSOMwgkqeMMAknquKZBkGR1kl1JppNcN2L/RUm+keTu/s/1LeuRJM3XbKnKJBPABuBiYAbYkWRrVT0w1PWLVfXjreqQJB1eyzOCVcB0Ve2uqieALcCahp8nSXoOWgbBUmDvQHumv23YG5Pck+S3k7x21IGSrE0ylWRqdna2Ra2S1FktgyAjttVQ+38C31dV5wG/Anxm1IGqalNVrayqlZOTk0e3SknquJZBMAOcOdBeBuwf7FBV36yqx/qvtwEvSnJ6w5okSUNaBsEOYEWS5UmWAJcBWwc7JDkjSfqvV/Xr+XrDmiRJQ5rdNVRVc0nWA9uBCWBzVe1Msq6/fyPwLuDvJ5kDvgVcVlXDw0eSpIYOGwRJHmX+uP7Tquolh3t/f7hn29C2jQOvbwZuHqtSSVIThw2CqjoVIMmHgAPAf6R3EfhK4NTm1UmSmhv3GsHbquqWqnq0f4H3V4F3tixMkrQwxg2C7ya5MslEkhOSXAl8t2VhkqSFMW4QXAH8XeD/9H9+or9NknSMG+uuoar6Kk4PIUnHpbHOCJK8OskXktzfb5+b5J+3LU2StBDGHRr698DPAt8BqKp76T0gJkk6xo0bBCdX1f8Y2jZ3tIuRJC28cYPgYJJX0n+4LMm7gIebVSVJWjDjTjHxXmATcHaSfcAeeg+VSZKOceMGwdeq6keTnAKcUFWPtixKkrRwxh0a2pNkE/CDwGMN65EkLbBxg+A1wO/RGyLak+TmJG9qV5YkaaGMFQRV9a2q+lRV/R3gdcBLgNubViZJWhBjL0yT5M1JbqG3vORJ9KackCQd48a6WJxkD3A38CngA1X1eMuiJEkLZ9y7hs6rqm82rUSStCiOtELZtVV1I/DhJPNWKquq9x/h/auBj9FbqvITVfWvD9HvQuBO4NKq+s/jFi9Jev6OdEbwYP/XqWd74CQTwAbgYmAG2JFka1U9MKLfR+itbSxJWmBHWqryc/2X91bVnzzLY68CpqtqN0CSLfSmsn5gqN/7gNuAC5/l8SVJR8G4dw19NMmXk9yQ5LVjvmcpsHegPdPf9rQkS4F3ABs5jCRrk0wlmZqdnR3z4yVJ4xj3OYK3ABcBs8CmJPeNsR5BRh1qqP3LwAer6rDLXlbVpqpaWVUrJycnxylZkjSmsZ8jqKoDVfVxYB29W0mvP8JbZoAzB9rLgP1DfVYCW5J8FXgXcEuSt49bkyTp+Rv3OYLvBy6l95f114EtwD89wtt2ACuSLAf20VvI5hnrHFfV8oHPuBX4rar6zJi1S5KOgnGfI/gPwCeBt1bV8L/qR6qquSTr6d0NNAFsrqqdSdb19x/2uoAkaWEcMQj6t3f+76r62LM9eFVtA7YNbRsZAFV19bM9viTp+TviNYL+hdzvSbJkAeqRJC2wsRemAe5IshV4ep6hqvpok6okSQtm3CDY3/85ATi1XTmSpIU2VhBU1b9sXYgkaXGMe/voHzD/YTCq6m8c9YokSQtq3KGhawZenwS8E5g7+uVIkhbauENDdw1tuiOJS1VK0nFg3KGhlw00T6A3NcQZTSqSJC2ocYeG7uL/XyOYA74KvKdFQZKkhXWkFcouBPY+NSdQkp+kd33gq8xfV0CSdAw60pPF/w54AiDJDwO/BPwa8A1gU9vSJEkL4UhDQxNV9Uj/9aXApqq6Dbgtyd1NK5MkLYgjnRFMJHkqLH4E+P2BfeNeX5AkvYAd6S/zTwK3JzkIfAv4IkCSV9EbHpIkHeOOtHj9h5N8AXg58LtV9dSdQyfQW3ReknSMO+LwTlXdOWLbV9qUI0laaGOvWSxJOj4ZBJLUcU2DIMnqJLuSTCe5bsT+NUnuTXJ3kqkkb2pZjyRpvma3gPbXOt4AXAzMADuSbK2qwSeSvwBsrapKci7wKeDsVjVJkuZreUawCp
iuqt1V9QSwBVgz2KGqHhu4E+kURqx5IElqq2UQLAX2DrRn+tueIck7knwZ+K/AT406UJK1/aGjqdnZ2SbFSlJXtQy
|
|
|
|
"text/plain": [
|
|
|
|
"<Figure size 432x288 with 1 Axes>"
|
|
|
|
]
|
|
|
|
},
|
|
|
|
"metadata": {
|
|
|
|
"needs_background": "light"
|
|
|
|
},
|
|
|
|
"output_type": "display_data"
|
2023-02-16 13:32:30 +01:00
|
|
|
}
|
|
|
|
],
|
|
|
|
"source": [
|
2023-02-17 14:01:05 +01:00
|
|
|
"sns.barplot(x='Sex', y='Survived', data=df_train)"
|
2023-02-16 13:32:30 +01:00
|
|
|
]
|
|
|
|
},
|
|
|
|
{
|
|
|
|
"cell_type": "code",
|
2023-02-17 14:01:05 +01:00
|
|
|
"execution_count": 14,
|
|
|
|
"id": "99710899",
|
|
|
|
"metadata": {
|
|
|
|
"scrolled": true
|
|
|
|
},
|
|
|
|
"outputs": [
|
|
|
|
{
|
|
|
|
"data": {
|
|
|
|
"text/plain": [
|
|
|
|
"<AxesSubplot:xlabel='Age', ylabel='Count'>"
|
|
|
|
]
|
|
|
|
},
|
|
|
|
"execution_count": 14,
|
|
|
|
"metadata": {},
|
|
|
|
"output_type": "execute_result"
|
|
|
|
},
|
|
|
|
{
|
|
|
|
"data": {
|
|
|
|
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAX4AAAEGCAYAAABiq/5QAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjQuMywgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/MnkTPAAAACXBIWXMAAAsTAAALEwEAmpwYAAAY6klEQVR4nO3df5BU5Z3v8feHX6LiL2AgxJlhsDQoqBAdXA0bi+CKxGuhuyu/cmPIyi6piFmybHaDbtVV7i1T3JSxYpncvSHRlawooiYLcreMXBRvrXHVGYM/CctGYWaQwIAbjbFcBL73jz7gBIaZpqdPn545n1dVV/c500+fT8PMd555+jnPUURgZmb50S/rAGZmVlku/GZmOePCb2aWMy78ZmY548JvZpYzA7IOUIzhw4dHQ0ND1jHMzHqV5ubmPRFRc+T+XlH4GxoaaGpqyjqGmVmvIml7Z/s91GNmljMu/GZmOePCb2aWM71ijN/MrNw++ugj2tra+PDDD7OO0mODBw+mtraWgQMHFvV8F34zy6W2tjZOOeUUGhoakJR1nJJFBHv37qWtrY0xY8YU1cZDPWaWSx9++CHDhg3r1UUfQBLDhg07rr9cXPjNLLd6e9E/5Hjfhwu/mVnOuPCbmXVwxx13MH78eC688EImTpzI888/3+PXXLt2LcuWLStDOhgyZEiPX8Mf7trvqasfTVtrS0lta+vqaW3p9ERBs17hueeeY926dbz00kuccMIJ7Nmzh3379hXVdv/+/QwY0HlJnTFjBjNmzChn1B5x4bff09bawl1Pbimp7eJpY8ucxqyydu7cyfDhwznhhBMAGD58OPDxsjHDhw+nqamJb3zjG2zcuJHbb7+dt99+m23btjF8+HB+9atfcd999zF+/HgApkyZwne+8x1effVVmpqauOOOO5gwYQJvvvkm/fr144MPPmDs2LG8+eabtLS0sHDhQtrb2znppJP44Q9/yLnnnstbb73FF77wBfbv38/06dPL8j491GNmlpg2bRqtra186lOf4qabbuKZZ57ptk1zczNr1qzhwQcfZM6cOaxevRoo/BJ5++23ufjiiw8/97TTTmPChAmHX/fxxx/nqquuYuDAgSxYsIB77rmH5uZm7rzzTm666SYAFi1axFe/+lVefPFFPvGJT5TlfaZa+CWdLulRSb+UtFnSZZKGSlovaWtyf0aaGczMijVkyBCam5tZvnw5NTU1zJ49m/vvv7/LNjNmzODEE08EYNasWTzyyCMArF69mpkzZx71/NmzZ/Pwww8DsGrVKmbPns3777/Pz3/+c2bOnMnEiRP5yle+ws6dOwF49tlnmTt3LgA33HBDWd5n2kM9dwNPRMT1kgYBJwG3AhsiYpmkJcAS4Jsp5zAzK0r//v2ZMmUKU6ZM4YILLmDFihUMGDCAgwcPAhw1X/7kk08+/PjMM89k2LBhvPLKKzz88MP84Ac/OOr1Z8yYwS233MI777xDc3MzU6dO5Xe/+x2nn346mzZt6jRTuaedptbjl3QqcDlwL0BE7IuI3wDXAiuSp60Arksrg5nZ8diyZQtbt249vL1p0yZGjx5NQ0MDzc3NADz22GNdvsacOXP49re/zbvvvssFF1xw1NeHDBnCJZdcwqJFi7jmmmvo378/p556KmPGjDn810JE8PLLLwMwefJkVq1aBcDKlSvL8j7THOo5C2gH/kHSLyT9SNLJwMiI2AmQ3I/orLGkBZKaJDW1t7enGNPMrOD9999n3rx5jBs3jgsvvJA33niD22+/ndtuu41Fixbx2c9+lv79+3f5Gtdffz2rVq1i1qxZx3zO7NmzeeCBB5g9e/bhfStXruTee+9lwoQJjB8/njVr1gBw99138/3vf59Jkybx7rvvluV9KiLK8kJHvbDUCPwrMDkinpd0N/Ae8LWIOL3D8/4jIroc529sbAxfiKUyJPVoVk9a309m5bZ582bOO++8rGOUTWfvR1JzRDQe+d
w0e/xtQFtEHDr74VHgImCXpFFJqFHA7hQzmJnZEVIr/BHxa6BV0qHJ3VcAbwBrgXnJvnnAmrQymJnZ0dKe1fM1YGUyo+dN4M8o/LJZLWk+0AIcPd/JzMxSk2rhj4hNwFHjSxR6/2ZmlgGfuWtmljMu/GZmOePCb2Z2DHX1o5FUtltd/ehuj/nEE08wduxYzj777LIt5Xwkr85pZnYMPVmttjPdrWB74MABFi5cyPr166mtrWXSpEnMmDGDcePGlS0DuMdvZlY1XnjhBc4++2zOOussBg0axJw5cw6fwVtOLvxmZlVix44d1NXVHd6ura1lx44dZT+OC7+ZWZXobMmTNC4I78JvZlYlamtraW1tPbzd1tbGJz/5ybIfx4XfzKxKTJo0ia1bt/LWW2+xb98+Vq1alcq1ej2rx8zsGGrr6st6Lenauvouvz5gwAC+973vcdVVV3HgwAFuvPHGw9fvLScXfjOzY2ht2V7xY1599dVcffXVqR7DQz1WPuqX+sktZtZz7vFb+cTBHp3sUs4/qc3s2NzjNzPLGRd+M7OcceE3M8sZF34zs5xx4TczO4aG+tqyLsvcUF/b7TFvvPFGRowYwfnnn5/a+/KsHjOzY9jeuoN46ltlez1NvbXb53z5y1/m5ptv5ktf+lLZjnsk9/jNzKrI5ZdfztChQ1M9hgu/mVnOuPD3QT25XJyZ9X0e4++DenK5OJ89a9b3ucdvZpYz7vGbmR3D6Lozi5qJczyv1525c+eyceNG9uzZQ21tLUuXLmX+/PllywApF35J24DfAgeA/RHRKGko8DDQAGwDZkXEf6SZw8ysFNta2ip+zIceeij1Y1RiqOdzETExIhqT7SXAhog4B9iQbJuZWYVkMcZ/LbAiebwCuC6DDGZmuZV24Q/gSUnNkhYk+0ZGxE6A5H5EZw0lLZDUJKmpvb095ZhmlkcRkXWEsjje95F24Z8cERcBnwcWSrq82IYRsTwiGiOisaamJr2EZpZLgwcPZu/evb2++EcEe/fuZfDgwUW3SfXD3Yh4O7nfLemnwCXALkmjImKnpFHA7jQzmJl1pra2lra2NvrCiMLgwYOpre1+AbhDUiv8kk4G+kXEb5PH04D/DqwF5gHLkvs1aWWwXia5Zm8pauvqM7kwtvVeAwcOZMyYMVnHyESaPf6RwE+TH+QBwIMR8YSkF4HVkuYDLcDMFDNYb9KDa/b6jGOz4qVW+CPiTWBCJ/v3AlekdVwzM+ual2wwM8sZF34zs5xx4TczyxkXfjOznHHhNzPLGRd+M7OcceE3M8sZF34zs5xx4TczyxkXfjOznHHhNzPLGRd+M7OcceE3M8sZF34zs5xx4TczyxkXfjOznHHhNzPLGRd+M7OcceE3M8sZF34zs5xx4TczyxkXfjOznHHhNzPLGRd+M7OcSb3wS+ov6ReS1iXbQyWtl7Q1uT8j7QxmZvaxSvT4FwGbO2wvATZExDnAhmTbzMwqJNXCL6kW+C/AjzrsvhZYkTxeAVyXZgYzM/t9aff4vwv8LXCww76REbETILkf0VlDSQskNUlqam9vTzmmmVl+pFb4JV0D7I6I5lLaR8TyiGiMiMaampoypzMzy68BKb72ZGCGpKuBwcCpkh4AdkkaFRE7JY0CdqeYwczMjpBajz8ibomI2ohoAOYAT0XEF4G1wLzkafOANWllMDOzo2Uxj38ZcKWkrcCVybaZmVVImkM9h0XERmBj8ngvcEUljmtmZkfzmbtmZjnjwm99g/ohqaRbXf3orNObVVRFhnrMUhcHuevJLSU1XTxtbJnDmFU39/jNzHLGhd/MLGdc+M3McsaF38wsZ4oq/JImF7PPzMyqX7Gzeu4BLipin/UBS5cuzTqCmaWoy8Iv6TLgM0CNpMUdvnQq0D/NYJad2+aVdmL14mcfLHMSM0tDdz3+QcCQ5HmndNj/HnB9WqHMzCw9XRb+iHgGeEbS/RGxvUKZLMd6MszkISqz4hQ7xn+CpOVAQ8
c2ETE1jVCWXz0ZZvIQlVlxii38jwD/m8K1cw+kF8fMzNJWbOHfHxF/n2oSMzOriGJP4Hpc0k2SRkkaeuiWajIzM0t
|
|
|
|
"text/plain": [
|
|
|
|
"<Figure size 432x288 with 1 Axes>"
|
|
|
|
]
|
|
|
|
},
|
|
|
|
"metadata": {
|
|
|
|
"needs_background": "light"
|
|
|
|
},
|
|
|
|
"output_type": "display_data"
|
|
|
|
}
|
|
|
|
],
|
|
|
|
"source": [
|
|
|
|
"sns.histplot(x='Age', hue='Survived', data=df_train, bins=20)"
|
|
|
|
]
|
|
|
|
},
|
|
|
|
{
|
|
|
|
"cell_type": "code",
|
|
|
|
"execution_count": 15,
|
|
|
|
"id": "58c6b951",
|
|
|
|
"metadata": {},
|
|
|
|
"outputs": [
|
|
|
|
{
|
|
|
|
"data": {
|
|
|
|
"text/plain": [
|
|
|
|
"<AxesSubplot:xlabel='Pclass', ylabel='Count'>"
|
|
|
|
]
|
|
|
|
},
|
|
|
|
"execution_count": 15,
|
|
|
|
"metadata": {},
|
|
|
|
"output_type": "execute_result"
|
|
|
|
},
|
|
|
|
{
|
|
|
|
"data": {
|
|
|
|
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAYUAAAEGCAYAAACKB4k+AAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjQuMywgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/MnkTPAAAACXBIWXMAAAsTAAALEwEAmpwYAAAa30lEQVR4nO3dfZRU9Z3n8ffHhoAR4gPdKtKNjRFNQKVNGiYTJznEzFGGzYDukadkfViZYCLukHWSM+qejbhnmeNk1CRjHiY4eiQJihh1QdeoRKMmasBuF1AkrAQQWlhpMD4Q4wPtd/+oy6VsqruroW5VQ31e59Spqt+9v1vful77w32o31VEYGZmBnBYpQswM7O+w6FgZmYph4KZmaUcCmZmlnIomJlZql+lCzgQtbW10djYWOkyzMwOKq2trTsioq7QtIM6FBobG2lpaal0GWZmBxVJL3c1zYePzMws5VAwM7OUQ8HMzFIH9TmFQt5//33a2tp45513Kl3KARs4cCD19fX079+/0qWYWZU45EKhra2NwYMH09jYiKRKl7PfIoKdO3fS1tbGiBEjKl2OmVWJQ+7w0TvvvMOQIUMO6kAAkMSQIUMOiT0eMzt4HHKhABz0gbDHofI9zOzgcUiGgpmZ7Z+qCYV58+YxevRozjjjDJqamli+fPkBL3Pp0qVcf/31JagOBg0aVJLlmJkdiEPuRHMhzzzzDA888ADPPfccAwYMYMeOHbz33ntF9d29ezf9+hVeTZMmTWLSpEmlLNXMqkjD8BNp27J5v/rWNwxny+Yuf5i836oiFLZt20ZtbS0DBgwAoLa2Ftg7TEZtbS0tLS1885vf5PHHH2fu3Lls3bqVTZs2UVtbyx/+8Aduu+02Ro8eDcD48eO58cYbef7552lpaWHevHmMGTOGDRs2cNhhh/H2229z6qmnsmHDBjZv3szs2bNpb2/nox/9KLfccguf+MQn2LhxI1/+8pfZvXs3EyZMqNi6MbPKaduymZseWbdffa8859QSV5NTFYePzjnnHLZs2cIpp5zC5ZdfzhNPPNFjn9bWVpYsWcIdd9zB9OnTWbx4MZALmK1bt/LpT386nffII49kzJgx6XLvv/9+zj33XPr378+sWbO4+eabaW1t5YYbbuDyyy8HYM6cOXz961/n2Wef5fjjj8/gW5uZ9V5VhMKgQYNobW1l/vz51NXVMW3aNG6//fZu+0yaNInDDz8cgKlTp3L33XcDsHjxYqZMmbLP/NOmTeOuu+4CYNGiRUybNo1du3bx9NNPM2XKFJqamrjsssvYtm0bAE899RQzZswA4MILLyzVVzUzOyBVcfgIoKamhvHjxzN+/HhOP/10FixYQL9+/fjggw8A9vk9wBFHHJG+HjZsGEOGDGH16tXcdddd/OQnP9ln+ZMmTeLqq6/mtddeo7W1lbPPPps//elPHHXUUaxcubJgTb7k1Mz6mqrYU1i3bh0vvfRS+n7lypWceOKJNDY20traCsA999zT7TKmT5/Od77zHd544w1OP/30faYPGjSIcePGMWfOHL70pS9RU1PDxz72MUaMGJHuZUQEq1atAuCss85i0aJFACxcuLAk39PM7EBVRSjs2rWLiy++mFGjRnHGGWfw4osvMnfuXK699lrmzJnD5z73OWpqarpdxgUXXMCiRYuYOnVql/NMmzaNn//850ybNi1tW7hwIbfeeitjxoxh9OjRLFmyBIDvf//7/PCHP2Ts2LG88cYbpfmiZmYHSBFR6Rr2W3Nzc3S+yc7atWv55Cc/WaGKSu9Q+z5mtpekA7r6aH//fktqjYjmQtOqYk/BzMyKk1koSBooaYWkVZLWSLouaZ8r6RVJK5PHxLw+V0taL2mdpHOzqs3MzArL8uqjd4GzI2KXpP7AbyX9Mpn23Yi4IX9mSaOA6cBo4ATgV5JOiYiODGs0M7M8me0pRM6u5G3/5NHdAbDJwKKIeDciNgLrgXFZ1WdmZvvK9J
yCpBpJK4HtwLKI2DMK3RWSVku6TdLRSdswYEte97akrfMyZ0lqkdTS3t6eZflmZlUn01CIiI6IaALqgXGSTgN+DHwcaAK2ATcmsxf6Jdc+exYRMT8imiOiua6uLpO6zcyqVVmuPoqI14HHgQkR8WoSFh8At7D3EFEb0JDXrR7YmkU9DcNPRFLJHg3DTyzqcx966CFOPfVUTj755JINuW1mVkqZnWiWVAe8HxGvSzoc+GvgnyUNjYhtyWznAy8kr5cCd0i6idyJ5pHAiixqO5CRCQspZrTCjo4OZs+ezbJly6ivr2fs2LFMmjSJUaNGlawOM7MDleXVR0OBBZJqyO2RLI6IByT9TFITuUNDm4DLACJijaTFwIvAbmD2oXTl0YoVKzj55JM56aSTgNywGUuWLHEomFmfklkoRMRq4MwC7V0OCRoR84B5WdVUSa+88goNDXuPjtXX15fk7m9mZqXkXzSXSaGfo3uUVDPraxwKZVJfX8+WLXuvuG1ra+OEE06oYEVmZvtyKJTJ2LFjeemll9i4cSPvvfceixYt8v2dzazPqZqb7OSrbxhe0vub1jcM73Gefv368YMf/IBzzz2Xjo4OLr300vSez2ZmfUVVhsKWzS9X5HMnTpzIxIkTe57RzKxCfPjIzMxSDgUzM0s5FMzMLOVQMDOzlEPBzMxSDgUzM0tVZSg0Dq8v6dDZjcPre/zMSy+9lGOPPZbTTjutDN/QzGz/VOXvFF7e8grx2D+VbHk6+5oe57nkkku44ooruOiii0r2uWZmpVaVewqV8PnPf55jjjmm0mWYmXXLoWBmZimHgpmZpRwKZmaWciiYmVmqKq8+OrFhWFFXDPVmeT2ZMWMGjz/+ODt27KC+vp7rrruOmTNnlqwGM7NSyCwUJA0EngQGJJ/zi4i4VtIxwF1AI7AJmBoRf0z6XA3MBDqAv4+Ih7OobdPmtiwW260777yz7J9pZtZbWR4+ehc4OyLGAE3ABEmfAa4CHo2IkcCjyXskjQKmA6OBCcCPJNVkWJ+ZmXWSWShEzq7kbf/kEcBkYEHSvgA4L3k9GVgUEe9GxEZgPTAuq/rMzGxfmZ5ollQjaSWwHVgWEcuB4yJiG0DyfGwy+zBgS173tqSt8zJnSWqR1NLe3l7wcyOidF+igg6V72FmB49MQyEiOiKiCagHxknqbuAfFVpEgWXOj4jmiGiuq6vbp8PAgQPZuXPnQf8HNSLYuXMnAwcOrHQpZlZFynL1UUS8LulxcucKXpU0NCK2SRpKbi8CcnsGDXnd6oGtvf2s+vp62tra6Gov4mAycOBA6ut7HmzPzKxUsrz6qA54PwmEw4G/Bv4ZWApcDFyfPC9JuiwF7pB0E3ACMBJY0dvP7d+/PyNGjCjBNzAzqz5Z7ikMBRYkVxAdBiyOiAckPQMsljQT2AxMAYiINZIWAy8Cu4HZEdGRYX1mZtZJZqEQEauBMwu07wS+2EWfecC8rGoyM7PueZgLMzNLORTMzCzlUDAzs5RDwczMUg4FMzNLORTMzCzlUDAzs5RDwczMUg4FMzNLORTMzCzlUDAzs5RDwczMUg4FMzNLORTMzCzlUDAzs5RDwczMUg4FMzNLORTMzCyVWShIapD0a0lrJa2RNCdpnyvpFUkrk8fEvD5XS1ovaZ2kc7OqzczMCsvsHs3AbuAfIuI5SYOBVknLkmnfjYgb8meWNAqYDowGTgB+JemUiOjIsEYzM8uT2Z5CRGyLiOeS128Ba4Fh3XSZDCyKiHcjYiOwHhiXVX1mZravspxTkNQInAksT5qukLRa0m2Sjk7ahgFb8rq10X2ImJlZiWUeCpIGAfcA34iIN4EfAx8HmoBtwI17Zi3QPQosb5akFkkt7e3t2RRtZlalMg0FSf3JBcLCiLgXICJejYiOiPgAuIW9h4jagIa87vXA1s7LjIj5EdEcEc11dXVZlm9mVnWyvPpIwK3A2oi4Ka99aN5s5wMvJK+XAtMlDZA0AhgJrMiqPj
Mz21eWVx+dBVwIPC9pZdJ2DTBDUhO5Q0ObgMsAImKNpMXAi+SuXJrtK4/MzMors1CIiN9S+DzBg930mQfMy6omMzP
|
|
|
|
"text/plain": [
|
|
|
|
"<Figure size 432x288 with 1 Axes>"
|
|
|
|
]
|
|
|
|
},
|
|
|
|
"metadata": {
|
|
|
|
"needs_background": "light"
|
|
|
|
},
|
|
|
|
"output_type": "display_data"
|
|
|
|
}
|
|
|
|
],
|
|
|
|
"source": [
|
|
|
|
"sns.histplot(x='Pclass', hue='Survived', data=df_train, bins=20)"
|
|
|
|
]
|
|
|
|
},
|
|
|
|
{
|
|
|
|
"cell_type": "code",
|
|
|
|
"execution_count": 16,
|
|
|
|
"id": "538c8c4f",
|
|
|
|
"metadata": {},
|
|
|
|
"outputs": [],
|
|
|
|
"source": [
|
|
|
|
"# sns.pairplot(data=df_train, hue='Survived')"
|
|
|
|
]
|
|
|
|
},
|
|
|
|
{
|
|
|
|
"cell_type": "markdown",
|
|
|
|
"id": "06684f27",
|
|
|
|
"metadata": {},
|
|
|
|
"source": [
|
|
|
|
"## Data Cleaning"
|
|
|
|
]
|
|
|
|
},
|
|
|
|
{
|
|
|
|
"cell_type": "code",
|
|
|
|
"execution_count": 17,
|
|
|
|
"id": "095ae1ae",
|
|
|
|
"metadata": {},
|
|
|
|
"outputs": [],
|
|
|
|
"source": [
|
|
|
|
"df_train = pd.get_dummies(data=df_train, columns=['Sex', 'Embarked'])\n",
|
|
|
|
"df_test = pd.get_dummies(data=df_test, columns=['Sex', 'Embarked'])"
|
|
|
|
]
|
|
|
|
},
|
|
|
|
{
|
|
|
|
"cell_type": "code",
|
|
|
|
"execution_count": 18,
|
|
|
|
"id": "50c17ca4",
|
|
|
|
"metadata": {},
|
|
|
|
"outputs": [],
|
|
|
|
"source": [
|
|
|
|
"# df_train.drop(['Sex_male', 'Name', 'Ticket', 'PassengerId'], axis=1, inplace=True)\n",
|
|
|
|
"df_train.drop('Sex_male', axis=1, inplace=True)\n",
|
|
|
|
"df_test.drop('Sex_male', axis=1, inplace=True)"
|
|
|
|
]
|
|
|
|
},
|
|
|
|
{
|
|
|
|
"cell_type": "code",
|
|
|
|
"execution_count": 19,
|
|
|
|
"id": "0fdf3229",
|
|
|
|
"metadata": {},
|
|
|
|
"outputs": [],
|
|
|
|
"source": [
|
|
|
|
"df_train['Age'] = df_train['Age'].fillna(df_train['Age'].mean())\n",
|
|
|
|
"df_test['Age'] = df_test['Age'].fillna(df_train['Age'].mean())"
|
|
|
|
]
|
|
|
|
},
|
|
|
|
{
|
|
|
|
"cell_type": "code",
|
|
|
|
"execution_count": 20,
|
|
|
|
"id": "b1949435",
|
|
|
|
"metadata": {},
|
|
|
|
"outputs": [],
|
|
|
|
"source": [
|
|
|
|
"df_train['Cabin'] = df_train['Cabin'].fillna('Other')\n",
"df_test['Cabin'] = df_test['Cabin'].fillna('Other')"
|
|
|
|
]
|
|
|
|
},
|
|
|
|
{
|
|
|
|
"cell_type": "code",
|
|
|
|
"execution_count": 21,
|
|
|
|
"id": "8a75aa3b",
|
|
|
|
"metadata": {
|
|
|
|
"scrolled": true
|
|
|
|
},
|
|
|
|
"outputs": [
|
|
|
|
{
|
|
|
|
"data": {
|
|
|
|
"text/plain": [
|
|
|
|
"array(['Other', 'C85', 'C123', 'E46', 'G6', 'C103', 'D56', 'A6',\n",
|
|
|
|
" 'C23 C25 C27', 'B78', 'D33', 'B30', 'C52', 'B28', 'C83', 'F33',\n",
|
|
|
|
" 'F G73', 'E31', 'A5', 'D10 D12', 'D26', 'C110', 'B58 B60', 'E101',\n",
|
|
|
|
" 'F E69', 'D47', 'B86', 'F2', 'C2', 'E33', 'B19', 'A7', 'C49', 'F4',\n",
|
|
|
|
" 'A32', 'B4', 'B80', 'A31', 'D36', 'D15', 'C93', 'C78', 'D35',\n",
|
|
|
|
" 'C87', 'B77', 'E67', 'B94', 'C125', 'C99', 'C118', 'D7', 'A19',\n",
|
|
|
|
" 'B49', 'D', 'C22 C26', 'C106', 'C65', 'E36', 'C54',\n",
|
|
|
|
" 'B57 B59 B63 B66', 'C7', 'E34', 'C32', 'B18', 'C124', 'C91', 'E40',\n",
|
|
|
|
" 'T', 'C128', 'D37', 'B35', 'E50', 'C82', 'B96 B98', 'E10', 'E44',\n",
|
|
|
|
" 'A34', 'C104', 'C111', 'C92', 'E38', 'D21', 'E12', 'E63', 'A14',\n",
|
|
|
|
" 'B37', 'C30', 'D20', 'B79', 'E25', 'D46', 'B73', 'C95', 'B38',\n",
|
|
|
|
" 'B39', 'B22', 'C86', 'C70', 'A16', 'C101', 'C68', 'A10', 'E68',\n",
|
|
|
|
" 'B41', 'A20', 'D19', 'D50', 'D9', 'A23', 'B50', 'A26', 'D48',\n",
|
|
|
|
" 'E58', 'C126', 'B71', 'B51 B53 B55', 'D49', 'B5', 'B20', 'F G63',\n",
|
|
|
|
" 'C62 C64', 'E24', 'C90', 'C45', 'E8', 'B101', 'D45', 'C46', 'D30',\n",
|
|
|
|
" 'E121', 'D11', 'E77', 'F38', 'B3', 'D6', 'B82 B84', 'D17', 'A36',\n",
|
|
|
|
" 'B102', 'B69', 'E49', 'C47', 'D28', 'E17', 'A24', 'C50', 'B42',\n",
|
|
|
|
" 'C148'], dtype=object)"
|
|
|
|
]
|
|
|
|
},
|
|
|
|
"execution_count": 21,
|
|
|
|
"metadata": {},
|
|
|
|
"output_type": "execute_result"
|
|
|
|
}
|
|
|
|
],
|
|
|
|
"source": [
|
|
|
|
"df_train['Cabin'].unique()"
|
|
|
|
]
|
|
|
|
},
|
|
|
|
{
|
|
|
|
"cell_type": "code",
|
|
|
|
"execution_count": 22,
|
|
|
|
"id": "875d207c",
|
|
|
|
"metadata": {
|
|
|
|
"scrolled": false
|
|
|
|
},
|
|
|
|
"outputs": [
|
|
|
|
{
|
|
|
|
"data": {
|
|
|
|
"text/plain": [
|
|
|
|
"Other 687\n",
|
|
|
|
"C23 C25 C27 4\n",
|
|
|
|
"G6 4\n",
|
|
|
|
"B96 B98 4\n",
|
|
|
|
"C22 C26 3\n",
|
|
|
|
" ... \n",
|
|
|
|
"E34 1\n",
|
|
|
|
"C7 1\n",
|
|
|
|
"C54 1\n",
|
|
|
|
"E36 1\n",
|
|
|
|
"C148 1\n",
|
|
|
|
"Name: Cabin, Length: 148, dtype: int64"
|
|
|
|
]
|
|
|
|
},
|
|
|
|
"execution_count": 22,
|
|
|
|
"metadata": {},
|
|
|
|
"output_type": "execute_result"
|
|
|
|
}
|
|
|
|
],
|
|
|
|
"source": [
|
|
|
|
"df_train['Cabin'].value_counts()"
|
|
|
|
]
|
|
|
|
},
|
|
|
|
{
|
|
|
|
"cell_type": "code",
|
|
|
|
"execution_count": 23,
|
|
|
|
"id": "56c071c5",
|
|
|
|
"metadata": {},
|
|
|
|
"outputs": [],
|
|
|
|
"source": [
|
|
|
|
"# df_train['Cabin'].str.extract('(\\d+)')"
|
|
|
|
]
|
|
|
|
},
|
|
|
|
{
|
|
|
|
"cell_type": "code",
|
|
|
|
"execution_count": 24,
|
|
|
|
"id": "d0163d74",
|
|
|
|
"metadata": {},
|
|
|
|
"outputs": [],
|
|
|
|
"source": [
|
|
|
|
"df_train['Cabin symbol'] = df_train['Cabin'].str.extract('(\\w)')\n",
|
|
|
|
"df_test['Cabin symbol'] = df_test['Cabin'].str.extract('(\\w)')"
|
|
|
|
]
|
|
|
|
},
|
|
|
|
{
|
|
|
|
"cell_type": "code",
|
|
|
|
"execution_count": 25,
|
|
|
|
"id": "b0f96907",
|
|
|
|
"metadata": {},
|
|
|
|
"outputs": [
|
|
|
|
{
|
|
|
|
"data": {
|
|
|
|
"text/plain": [
|
|
|
|
"O 327\n",
|
|
|
|
"C 35\n",
|
|
|
|
"B 18\n",
|
|
|
|
"D 13\n",
|
|
|
|
"E 9\n",
|
|
|
|
"F 8\n",
|
|
|
|
"A 7\n",
|
|
|
|
"G 1\n",
|
|
|
|
"dtype: int64"
|
|
|
|
]
|
|
|
|
},
|
|
|
|
"execution_count": 25,
|
|
|
|
"metadata": {},
|
|
|
|
"output_type": "execute_result"
|
|
|
|
}
|
|
|
|
],
|
|
|
|
"source": [
|
|
|
|
"df_test['Cabin'].str.extract('(\\w)').value_counts()"
|
|
|
|
]
|
|
|
|
},
|
|
|
|
{
|
|
|
|
"cell_type": "code",
|
|
|
|
"execution_count": 26,
|
|
|
|
"id": "48944d81",
|
|
|
|
"metadata": {},
|
|
|
|
"outputs": [],
|
|
|
|
"source": [
|
|
|
|
"symbol_hist = df_train[df_train['Cabin symbol'] != 'O'][['Cabin symbol', 'Survived']]"
|
|
|
|
]
|
|
|
|
},
|
|
|
|
{
|
|
|
|
"cell_type": "code",
|
|
|
|
"execution_count": 27,
|
|
|
|
"id": "7fde7d54",
|
|
|
|
"metadata": {
|
|
|
|
"scrolled": true
|
|
|
|
},
|
|
|
|
"outputs": [
|
|
|
|
{
|
|
|
|
"data": {
|
|
|
|
"text/plain": [
|
|
|
|
"<AxesSubplot:xlabel='Cabin symbol', ylabel='Count'>"
|
|
|
|
]
|
|
|
|
},
|
|
|
|
"execution_count": 27,
|
|
|
|
"metadata": {},
|
|
|
|
"output_type": "execute_result"
|
|
|
|
},
|
|
|
|
{
|
|
|
|
"data": {
|
|
|
|
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAX4AAAEGCAYAAABiq/5QAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjQuMywgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/MnkTPAAAACXBIWXMAAAsTAAALEwEAmpwYAAAYaUlEQVR4nO3de5RU5Z3u8e/DRVHxBjSG2DSNoyECAsaGmdEkQ0hGGVcW6owKmKNkIMEV5RwckplJmD/Ek6XHk6UkWeoyttGRGBTJGMfLyThhVPREidptEEV0mCCXRg43MyhxDAK/80dtSKUvdHVTu3Y1+/ms1atqv7UvvypqPbV56613KyIwM7P86JV1AWZmVlkOfjOznHHwm5nljIPfzCxnHPxmZjnTJ+sCSjFo0KCor6/Pugwzsx6lubl5R0TUtG7vEcFfX19PU1NT1mWYmfUokja01+6uHjOznHHwm5nljIPfzCxnekQfv5lZuX300Ue0tLTw4YcfZl3KYevXrx+1tbX07du3pPUd/GaWSy0tLRx//PHU19cjKetyui0i2LlzJy0tLQwfPrykbdzVY2a59OGHHzJw4MAeHfoAkhg4cGCX/ueSWvBL6ifpJUmvSlot6YakfYGkzZJWJn8XplWDmdmh9PTQP6CrzyPNrp7fAZMiYrekvsAvJP1L8th3I+KWFI9tZmYdSO2MPwp2J4t9kz9P/m9mVe3GG29k1KhRjBkzhnHjxvHiiy8e9j4fe+wxbr755jJUB/379z/sfaT65a6k3kAzcDpwR0S8KOkvgDmSrgKagK9HxG/a2XY2MBugrq6u2zXU19WyYdPmbm+flmFDT2X9xpasy7AjlN/33bNixQqeeOIJXnnlFY4++mh27NjBnj17Stp279699OnTfqROmTKFKVOmlLPUw5Jq8EfEPmCcpJOARySNBu4Evk3h7P/bwK3AzHa2bQQaARoaGrr9P4UNmzYTT9/U3c1To0nzsy7BjmB+33fPli1bGDRoEEcffTQAgwYNAn4/bcygQYNoamriG9/4BsuXL2fBggW88847rF+/nkGDBvHrX/+ae++9l1GjRgEwceJEbr31Vl577TWampq48cYbGTt2LOvWraNXr1588MEHjBgxgnXr1rFx40auvfZatm/fzrHHHsvdd9/NJz/5Sd5++22uuOIK9u7dy+TJk8vyPCsyqici/hNYDkyOiK0RsS8i9gN3AxMqUYOZWWfOP/98Nm3axCc+8QmuueYann322U63aW5u5tFHH+WBBx5g2rRpLF26FCh8iLzzzjucc845B9c98cQTGTt27MH9Pv7441xwwQX07duX2bNnc9ttt9Hc3Mwtt9zCNddcA8DcuXP52te+xssvv8zHPvaxsjzPNEf11CRn+kg6BvgC8KakIUWrXQK8nlYNZmZd0b9/f5qbm2lsbKSmpoapU6dy3333HXKbKVOmcMwxxwBw+eWX85Of/ASApUuXctlll7VZf+rUqTz00EMALFmyhKlTp7J7925eeOEFLrvsMsaNG8fVV1/Nli1bAHj++eeZPn06AFdeeWVZnmeaXT1DgEVJP38vYGlEPCHpfknjKHT1rAeuTrEGM7Mu6d27NxMnTmTixImcddZZLFq0iD59+rB//36ANuPljzvuuIP3Tz31VAYOHMiqVat46KGHuOuuu9rsf8qUKXzrW9/i3Xffpbm5mUmTJvHb3/6Wk046iZUrV7ZbU7mHnaY5qmdVRJwdEWMiYnRE/M+k/cqIOCtpnxIRW9KqwcysK9566y3Wrl17cHnlypUMGzaM+vp6mpubAXj44YcPuY9p06bxne98h127dnHWWWe1ebx///5MmDCBuXPn8sUvfpHevXtzwgknMHz48IP/W4gIXn31VQDOO+88lixZAsDixYvL8jz9y10zs8Tu3buZMWMGI0eOZMyYMbzxxhssWLCA66+/nrlz5/KZz3yG3r17H3Ifl156KUuWLOHyyy/vcJ2pU6
fy4x//mKlTpx5sW7x4Mffccw9jx45l1KhRPProowB8//vf54477mD8+PHs2rWrLM9TEdU/tL6hoSG6eyEWSVU7uqEnvPbWM/l937k1a9Zw5plnZl1G2bT3fCQ1R0RD63V9xm9mljMOfjOznHHwm5nljIPfzCxnHPxmZjnj4DczyxkHv5lZB4bWDUNS2f6G1g3r9JhPPvkkI0aM4PTTTy/bVM6t+Zq7ZmYdaNm0kYU/f6ts+5t3/ohDPr5v3z6uvfZali1bRm1tLePHj2fKlCmMHDmybDWAz/jNzKrGSy+9xOmnn85pp53GUUcdxbRp0w7+grecHPxmZlVi8+bNDB069OBybW0tmzeX/4I6Dn4zsyrR3nQWaVwQ3sFvZlYlamtr2bRp08HllpYWPv7xj5f9OA5+M7MqMX78eNauXcvbb7/Nnj17WLJkSSrX6vWoHjOzDtQOret0JE5X93coffr04fbbb+eCCy5g3759zJw58+D1e8vJwW9m1oFNGzdU/JgXXnghF154YarHcFePmVnOOPjNzHLGwW9mljOpBb+kfpJekvSqpNWSbkjaB0haJmltcntyWjWYmVlbaZ7x/w6YFBFjgXHAZEl/AnwTeCoizgCeSpbNzKxCUgv+KNidLPZN/gK4CFiUtC8CLk6rBjMzayvVPn5JvSWtBLYByyLiReCUiNgCkNwO7mDb2ZKaJDVt3749zTLNzNpVX1db1mmZ6+tqOz3mzJkzGTx4MKNHj07teaU6jj8i9gHjJJ0EPCKp5GcSEY1AI0BDQ0PbCSzMzFK2YdNm4umbyrY/TZrf6Tpf/vKXmTNnDldddVXZjttaRUb1RMR/AsuBycBWSUMAktttlajBzKwn+OxnP8uAAQNSPUaao3pqkjN9JB0DfAF4E3gMmJGsNgMo/2TTZmbWoTS7eoYAiyT1pvABszQinpC0AlgqaRawEbgsxRrMzKyV1II/IlYBZ7fTvhP4fFrHNTOzQ/Mvd83Mcsazc5qZdWDY0FNLGonTlf11Zvr06SxfvpwdO3ZQW1vLDTfcwKxZs8pWAzj4zcw6tH5jS8WP+eCDD6Z+DHf1mJnljIPfzCxnHPxmllsRR8akAF19Hg5+M8ulfv36sXPnzh4f/hHBzp076devX8nb+MtdM8ul2tpaWlpaOBImgezXrx+1tZ1PAHeAg9/Mcqlv374MHz486zIy4a4eM7OccfCbmeWMg9/MLGcc/GZmOePgNzPLGQe/mVnOOPjNzHLGwW9mljMOfjOznHHwm5nljIPfzCxnHPxmZjmTWvBLGirpGUlrJK2WNDdpXyBps6SVyd+FadVgZmZtpTk7517g6xHxiqTjgWZJy5LHvhsRt6R4bDMz60BqwR8RW4Atyf33Ja0BOr/EvJmZpaoiffyS6oGzgReTpjmSVkm6V9LJHWwzW1KTpKYj4UIJdnjq62qRVHV/9XWlX/zCrFqkfiEWSf2Bh4HrIuI9SXcC3wYiub0VmNl6u4hoBBoBGhoaeva10eywbdi0mXj6pqzLaEOT5mddglmXpXrGL6kvhdBfHBE/BYiIrRGxLyL2A3cDE9KswczM/lCao3oE3AOsiYiFRe1Dila7BHg9rRrMzKytNLt6zgOuBF6TtDJpmw9MlzSOQlfPeuDqFGswM7NW0hzV8wtA7Tz0s7SOaWZmnUv9y93MqVd1fgGnXhR6w6rLsKGnsn5jS9ZlmFmKjvzgj/0sbLwn6yramDd7lkepmFkmPFePmVnOOPjNzHLGwW9mljMOfjOznHHwm5nljIPfzCxnHPxmZjnj4DczyxkHv5lZzjj4zcxyxsFvZpYzDn4zs5xx8JuZ5YyD38wsZxz8ZmY54+A3M8sZB7+ZWc44+M3Mcia14Jc0VNIzktZIWi1pbtI+QNIySWuT25PTqsHMzNoqKfglnVdKWyt7ga9HxJnAnwDXShoJfBN4KiLOAJ5Kls3MrEJKPeO/rcS2gyJiS0S8ktx/H1gDnApcBCxKVlsEXFxiDW
ZmVgZ9DvWgpD8FzgVqJM0reugEoHepB5FUD5wNvAicEhFboPDhIGlwB9vMBmYD1NXVlXooMzPrRGdn/EcB/Sl8QBx
|
|
|
|
"text/plain": [
|
|
|
|
"<Figure size 432x288 with 1 Axes>"
|
|
|
|
]
|
|
|
|
},
|
|
|
|
"metadata": {
|
|
|
|
"needs_background": "light"
|
|
|
|
},
|
|
|
|
"output_type": "display_data"
|
|
|
|
}
|
|
|
|
],
|
|
|
|
"source": [
|
|
|
|
"sns.histplot(x='Cabin symbol', hue='Survived', data=symbol_hist, bins=20)"
|
|
|
|
]
|
|
|
|
},
|
|
|
|
{
|
|
|
|
"cell_type": "code",
|
|
|
|
"execution_count": 28,
|
|
|
|
"id": "19d57354",
|
|
|
|
"metadata": {},
|
|
|
|
"outputs": [],
|
|
|
|
"source": [
|
|
|
|
"# Count how many rows (passengers) share each 'Cabin' value, before any relabelling\n",
|
|
|
|
"counts_train = df_train['Cabin'].value_counts().copy(deep=True)\n",
|
|
|
|
"counts_test = df_test['Cabin'].value_counts().copy(deep=True)\n",
|
|
|
|
"\n",
|
|
|
|
"# Replace each 'Cabin' value that occurs exactly n times (per `counts`) with `description`, in place on df\n",
|
|
|
|
"def num_peopl_in_cabin(df, n, description, counts):\n",
|
|
|
|
"    df.loc[df['Cabin'].isin(counts[counts==n].index), 'Cabin'] = description"
|
|
|
|
]
|
|
|
|
},
|
|
|
|
{
|
|
|
|
"cell_type": "code",
|
|
|
|
"execution_count": 29,
|
|
|
|
"id": "fdcbf0a5",
|
|
|
|
"metadata": {
|
|
|
|
"scrolled": true
|
|
|
|
},
|
|
|
|
"outputs": [
|
|
|
|
{
|
|
|
|
"name": "stderr",
|
2023-02-16 13:32:30 +01:00
|
|
|
"output_type": "stream",
|
|
|
|
"text": [
|
2023-02-17 14:01:05 +01:00
|
|
|
"C:\\Users\\Maciej\\AppData\\Local\\Temp/ipykernel_16012/2825624458.py:7: SettingWithCopyWarning: \n",
|
2023-02-16 13:32:30 +01:00
|
|
|
"A value is trying to be set on a copy of a slice from a DataFrame\n",
|
|
|
|
"\n",
|
|
|
|
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
|
2023-02-17 14:01:05 +01:00
|
|
|
" df['Cabin'][df['Cabin'].isin(counts[counts==n].index)] = description\n"
|
2023-02-16 13:32:30 +01:00
|
|
|
]
|
|
|
|
}
|
|
|
|
],
|
|
|
|
"source": [
|
2023-02-17 14:01:05 +01:00
|
|
|
"\n",
|
|
|
|
"num_peopl_in_cabin(df_train, 1, 'Alone', counts_train)\n",
|
|
|
|
"num_peopl_in_cabin(df_train, 2, 'Double room', counts_train)\n",
|
|
|
|
"num_peopl_in_cabin(df_train, 3, 'Three person room', counts_train)\n",
|
|
|
|
"num_peopl_in_cabin(df_train, 4, 'Four person room', counts_train)\n",
|
|
|
|
"\n",
|
|
|
|
"num_peopl_in_cabin(df_test, 1, 'Alone', counts_test)\n",
|
|
|
|
"num_peopl_in_cabin(df_test, 2, 'Double room', counts_test)\n",
|
|
|
|
"num_peopl_in_cabin(df_test, 3, 'Three person room', counts_test)\n",
|
|
|
|
"\n",
|
|
|
|
"\n",
|
|
|
|
"# df_train['Cabin'][df_train['Cabin'].isin(counts[counts>4].index)] = 'Other'\n",
|
|
|
|
"\n",
|
|
|
|
"# df_train['Cabin'][df_train['Cabin'].isin(counts[counts==1].index)] = 'Alone'\n",
|
|
|
|
"# df_train['Cabin'][df_train['Cabin'].isin(counts[counts==2].index)] = 'Double room'\n",
|
|
|
|
"# df_train['Cabin'][df_train['Cabin'].isin(counts[counts==3].index)] = 'Three person room'\n",
|
|
|
|
"# df_train['Cabin'][df_train['Cabin'].isin(counts[counts==4].index)] = 'Four person room'\n",
|
|
|
|
"# df_train['Cabin'][df_train['Cabin'].isin(counts[counts>4].index)] = 'Other'"
|
2023-02-16 13:32:30 +01:00
|
|
|
]
|
|
|
|
},
|
|
|
|
{
|
|
|
|
"cell_type": "code",
|
2023-02-17 14:01:05 +01:00
|
|
|
"execution_count": 30,
|
2023-02-16 13:32:30 +01:00
|
|
|
"id": "6d73e794",
|
2023-02-17 14:01:05 +01:00
|
|
|
"metadata": {
|
|
|
|
"scrolled": true
|
|
|
|
},
|
2023-02-16 13:32:30 +01:00
|
|
|
"outputs": [
|
|
|
|
{
|
|
|
|
"data": {
|
|
|
|
"text/plain": [
|
|
|
|
"Other 687\n",
|
|
|
|
"Alone 101\n",
|
|
|
|
"Double room 76\n",
|
|
|
|
"Three person room 15\n",
|
|
|
|
"Four person room 12\n",
|
|
|
|
"Name: Cabin, dtype: int64"
|
|
|
|
]
|
|
|
|
},
|
2023-02-17 14:01:05 +01:00
|
|
|
"execution_count": 30,
|
2023-02-16 13:32:30 +01:00
|
|
|
"metadata": {},
|
|
|
|
"output_type": "execute_result"
|
|
|
|
}
|
|
|
|
],
|
|
|
|
"source": [
|
|
|
|
"df_train['Cabin'].value_counts()"
|
|
|
|
]
|
|
|
|
},
|
|
|
|
{
|
|
|
|
"cell_type": "code",
|
2023-02-17 14:01:05 +01:00
|
|
|
"execution_count": 31,
|
2023-02-16 13:32:30 +01:00
|
|
|
"id": "ae86ac06",
|
|
|
|
"metadata": {},
|
|
|
|
"outputs": [],
|
|
|
|
"source": [
|
2023-02-17 14:01:05 +01:00
|
|
|
"df_train = pd.get_dummies(data=df_train, columns=['Cabin'])\n",
|
|
|
|
"df_test = pd.get_dummies(data=df_test, columns=['Cabin', 'Pclass', 'Cabin symbol'])"
|
|
|
|
]
|
|
|
|
},
|
|
|
|
{
|
|
|
|
"cell_type": "code",
|
|
|
|
"execution_count": 32,
|
|
|
|
"id": "702cf4b1",
|
|
|
|
"metadata": {},
|
|
|
|
"outputs": [],
|
|
|
|
"source": [
|
|
|
|
"df_train = pd.get_dummies(data=df_train, columns=['Pclass'])"
|
|
|
|
]
|
|
|
|
},
|
|
|
|
{
|
|
|
|
"cell_type": "code",
|
|
|
|
"execution_count": 33,
|
|
|
|
"id": "2c5e0c67",
|
|
|
|
"metadata": {},
|
|
|
|
"outputs": [],
|
|
|
|
"source": [
|
|
|
|
"df_train = pd.get_dummies(data=df_train, columns=['Cabin symbol'])"
|
|
|
|
]
|
|
|
|
},
|
|
|
|
{
|
|
|
|
"cell_type": "code",
|
|
|
|
"execution_count": 34,
|
|
|
|
"id": "76796bf2",
|
|
|
|
"metadata": {},
|
|
|
|
"outputs": [],
|
|
|
|
"source": [
|
|
|
|
"df_train = df_train.drop('Cabin symbol_O', axis=1)\n",
|
|
|
|
"df_test = df_test.drop('Cabin symbol_O', axis=1)"
|
|
|
|
]
|
|
|
|
},
|
|
|
|
{
|
|
|
|
"cell_type": "code",
|
|
|
|
"execution_count": 35,
|
|
|
|
"id": "4be06019",
|
|
|
|
"metadata": {
|
|
|
|
"scrolled": true
|
|
|
|
},
|
|
|
|
"outputs": [
|
|
|
|
{
|
|
|
|
"data": {
|
|
|
|
"text/plain": [
|
|
|
|
"Index(['PassengerId', 'Name', 'Age', 'SibSp', 'Parch', 'Ticket', 'Fare',\n",
|
|
|
|
" 'Sex_female', 'Embarked_C', 'Embarked_Q', 'Embarked_S', 'Cabin_Alone',\n",
|
|
|
|
" 'Cabin_Double room', 'Cabin_Other', 'Cabin_Three person room',\n",
|
|
|
|
" 'Pclass_1', 'Pclass_2', 'Pclass_3', 'Cabin symbol_A', 'Cabin symbol_B',\n",
|
|
|
|
" 'Cabin symbol_C', 'Cabin symbol_D', 'Cabin symbol_E', 'Cabin symbol_F',\n",
|
|
|
|
" 'Cabin symbol_G'],\n",
|
|
|
|
" dtype='object')"
|
|
|
|
]
|
|
|
|
},
|
|
|
|
"execution_count": 35,
|
|
|
|
"metadata": {},
|
|
|
|
"output_type": "execute_result"
|
|
|
|
}
|
|
|
|
],
|
|
|
|
"source": [
|
|
|
|
"df_test.columns"
|
|
|
|
]
|
|
|
|
},
|
|
|
|
{
|
|
|
|
"cell_type": "code",
|
|
|
|
"execution_count": 36,
|
|
|
|
"id": "c9e20ea5",
|
|
|
|
"metadata": {},
|
|
|
|
"outputs": [
|
|
|
|
{
|
|
|
|
"data": {
|
|
|
|
"text/plain": [
|
|
|
|
"Index(['PassengerId', 'Name', 'Age', 'SibSp', 'Parch', 'Ticket', 'Fare',\n",
|
|
|
|
" 'Sex_female', 'Embarked_C', 'Embarked_Q', 'Embarked_S', 'Cabin_Alone',\n",
|
|
|
|
" 'Cabin_Double room', 'Cabin_Other', 'Cabin_Three person room',\n",
|
|
|
|
" 'Pclass_1', 'Pclass_2', 'Pclass_3', 'Cabin symbol_A', 'Cabin symbol_B',\n",
|
|
|
|
" 'Cabin symbol_C', 'Cabin symbol_D', 'Cabin symbol_E', 'Cabin symbol_F',\n",
|
|
|
|
" 'Cabin symbol_G'],\n",
|
|
|
|
" dtype='object')"
|
|
|
|
]
|
|
|
|
},
|
|
|
|
"execution_count": 36,
|
|
|
|
"metadata": {},
|
|
|
|
"output_type": "execute_result"
|
|
|
|
}
|
|
|
|
],
|
|
|
|
"source": [
|
|
|
|
"df_test.columns[df_test.columns.isin(df_train.columns)]"
|
|
|
|
]
|
|
|
|
},
|
|
|
|
{
|
|
|
|
"cell_type": "code",
|
|
|
|
"execution_count": 37,
|
|
|
|
"id": "a6a53564",
|
|
|
|
"metadata": {},
|
|
|
|
"outputs": [
|
|
|
|
{
|
|
|
|
"data": {
|
|
|
|
"text/plain": [
|
|
|
|
"Index(['PassengerId', 'Survived', 'Name', 'Age', 'SibSp', 'Parch', 'Ticket',\n",
|
|
|
|
" 'Fare', 'Sex_female', 'Embarked_C', 'Embarked_Q', 'Embarked_S',\n",
|
|
|
|
" 'Cabin_Alone', 'Cabin_Double room', 'Cabin_Four person room',\n",
|
|
|
|
" 'Cabin_Other', 'Cabin_Three person room', 'Pclass_1', 'Pclass_2',\n",
|
|
|
|
" 'Pclass_3', 'Cabin symbol_A', 'Cabin symbol_B', 'Cabin symbol_C',\n",
|
|
|
|
" 'Cabin symbol_D', 'Cabin symbol_E', 'Cabin symbol_F', 'Cabin symbol_G',\n",
|
|
|
|
" 'Cabin symbol_T'],\n",
|
|
|
|
" dtype='object')"
|
|
|
|
]
|
|
|
|
},
|
|
|
|
"execution_count": 37,
|
|
|
|
"metadata": {},
|
|
|
|
"output_type": "execute_result"
|
|
|
|
}
|
|
|
|
],
|
|
|
|
"source": [
|
|
|
|
"df_train.columns"
|
|
|
|
]
|
|
|
|
},
|
|
|
|
{
|
|
|
|
"cell_type": "code",
|
|
|
|
"execution_count": 38,
|
|
|
|
"id": "b299afb0",
|
|
|
|
"metadata": {},
|
|
|
|
"outputs": [],
|
|
|
|
"source": [
|
|
|
|
"df_test['Cabin symbol_T'] = 0\n",
|
|
|
|
"df_test['Cabin_Four person room'] = 0"
|
|
|
|
]
|
|
|
|
},
|
|
|
|
{
|
|
|
|
"cell_type": "code",
|
|
|
|
"execution_count": 39,
|
|
|
|
"id": "5c4f00b0",
|
|
|
|
"metadata": {},
|
|
|
|
"outputs": [],
|
|
|
|
"source": [
|
|
|
|
"df_train = df_train.reindex(columns=['PassengerId', 'Name', 'Age', 'SibSp', 'Parch', 'Ticket',\n",
|
|
|
|
" 'Fare', 'Sex_female', 'Embarked_C', 'Embarked_Q', 'Embarked_S',\n",
|
|
|
|
" 'Cabin_Alone', 'Cabin_Double room', 'Cabin_Four person room',\n",
|
|
|
|
" 'Cabin_Other', 'Cabin_Three person room', 'Pclass_1', 'Pclass_2',\n",
|
|
|
|
" 'Pclass_3', 'Cabin symbol_A', 'Cabin symbol_B', 'Cabin symbol_C',\n",
|
|
|
|
" 'Cabin symbol_D', 'Cabin symbol_E', 'Cabin symbol_F', 'Cabin symbol_G',\n",
|
|
|
|
" 'Cabin symbol_T', 'Survived'])"
|
|
|
|
]
|
|
|
|
},
|
|
|
|
{
|
|
|
|
"cell_type": "code",
|
|
|
|
"execution_count": 40,
|
|
|
|
"id": "086a50d2",
|
|
|
|
"metadata": {},
|
|
|
|
"outputs": [],
|
|
|
|
"source": [
|
|
|
|
"df_test = df_test.reindex(columns=['PassengerId', 'Name', 'Age', 'SibSp', 'Parch', 'Ticket',\n",
|
|
|
|
" 'Fare', 'Sex_female', 'Embarked_C', 'Embarked_Q', 'Embarked_S',\n",
|
|
|
|
" 'Cabin_Alone', 'Cabin_Double room', 'Cabin_Four person room',\n",
|
|
|
|
" 'Cabin_Other', 'Cabin_Three person room', 'Pclass_1', 'Pclass_2',\n",
|
|
|
|
" 'Pclass_3', 'Cabin symbol_A', 'Cabin symbol_B', 'Cabin symbol_C',\n",
|
|
|
|
" 'Cabin symbol_D', 'Cabin symbol_E', 'Cabin symbol_F', 'Cabin symbol_G',\n",
|
|
|
|
" 'Cabin symbol_T'])"
|
|
|
|
]
|
|
|
|
},
|
|
|
|
{
|
|
|
|
"cell_type": "code",
|
|
|
|
"execution_count": 41,
|
|
|
|
"id": "55271503",
|
|
|
|
"metadata": {},
|
|
|
|
"outputs": [
|
|
|
|
{
|
|
|
|
"data": {
|
|
|
|
"text/plain": [
|
|
|
|
"PassengerId 0\n",
|
|
|
|
"Name 0\n",
|
|
|
|
"Age 0\n",
|
|
|
|
"SibSp 0\n",
|
|
|
|
"Parch 0\n",
|
|
|
|
"Ticket 0\n",
|
|
|
|
"Fare 0\n",
|
|
|
|
"Sex_female 0\n",
|
|
|
|
"Embarked_C 0\n",
|
|
|
|
"Embarked_Q 0\n",
|
|
|
|
"Embarked_S 0\n",
|
|
|
|
"Cabin_Alone 0\n",
|
|
|
|
"Cabin_Double room 0\n",
|
|
|
|
"Cabin_Four person room 0\n",
|
|
|
|
"Cabin_Other 0\n",
|
|
|
|
"Cabin_Three person room 0\n",
|
|
|
|
"Pclass_1 0\n",
|
|
|
|
"Pclass_2 0\n",
|
|
|
|
"Pclass_3 0\n",
|
|
|
|
"Cabin symbol_A 0\n",
|
|
|
|
"Cabin symbol_B 0\n",
|
|
|
|
"Cabin symbol_C 0\n",
|
|
|
|
"Cabin symbol_D 0\n",
|
|
|
|
"Cabin symbol_E 0\n",
|
|
|
|
"Cabin symbol_F 0\n",
|
|
|
|
"Cabin symbol_G 0\n",
|
|
|
|
"Cabin symbol_T 0\n",
|
|
|
|
"dtype: int64"
|
|
|
|
]
|
|
|
|
},
|
|
|
|
"execution_count": 41,
|
|
|
|
"metadata": {},
|
|
|
|
"output_type": "execute_result"
|
|
|
|
}
|
|
|
|
],
|
|
|
|
"source": [
|
|
|
|
"df_test.isna().sum()"
|
2023-02-16 13:32:30 +01:00
|
|
|
]
|
|
|
|
},
|
|
|
|
{
|
|
|
|
"cell_type": "markdown",
|
|
|
|
"id": "7ed9bfbb",
|
|
|
|
"metadata": {},
|
|
|
|
"source": [
|
2023-02-17 14:01:05 +01:00
|
|
|
"## Correlation heatmap (features ordered by correlation with Survived)"
|
2023-02-16 13:32:30 +01:00
|
|
|
]
|
|
|
|
},
|
|
|
|
{
|
|
|
|
"cell_type": "code",
|
2023-02-17 14:01:05 +01:00
|
|
|
"execution_count": 42,
|
2023-02-16 13:32:30 +01:00
|
|
|
"id": "3c82db70",
|
2023-02-17 14:01:05 +01:00
|
|
|
"metadata": {
|
|
|
|
"scrolled": true
|
|
|
|
},
|
2023-02-16 13:32:30 +01:00
|
|
|
"outputs": [
|
|
|
|
{
|
|
|
|
"data": {
|
2023-02-17 14:01:05 +01:00
|
|
|
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAX8AAAFNCAYAAADo9m/BAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjQuMywgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/MnkTPAAAACXBIWXMAAAsTAAALEwEAmpwYAAEAAElEQVR4nOydd5hTZfr+P+llkkym984AQx2KCpYVFaUIiAgqoqK4CjbcFXSx4PpTEERcG7iL7uq6ICoiSFVZOyDSkTIMZWBmmN5L2qT+/kgmkyk5icIq+M19XbkY8rznbefkzcl77vu5RS6Xy0UIIYQQQgj/pyD+rTsQQgghhBDCr4/Q4h9CCCGE8H8QocU/hBBCCOH/IEKLfwghhBDC/0GEFv8QQgghhP+DCC3+IYQQQgj/BxFa/EMIIYQQLgDk5eVx1113tXvvtdde44UXXmDevHk/u77Q4h9CCCGE8BugqamJkpKSTq+mpqZOZc+cOcO3336LRCJp915jYyNPPvkker2egwcP/qz2pWc9ghB+FdhqTgUsY132rGDc8N9iwbi9JfC9gMspHDfUKwPXEUBW6HAI98PlEp11G/YAbdgcEsF4MHAE6GegOIAL4TIWl3A/A5wudz8CtOEM3E1sAepwiITjZnHgRqoDnJLj4hbBeDBz8Xbhx0GUEkYwn1WA9z7YxJIlSzq9/9BDD/Hwww+3ey8lJYUHHniA6dOne9+rrq4mLi4OgNjYWKqqqn5WP0OLfwghhBDCuYTTEVSxqVOncuONN3Z6X6fTBXV8fHw81dXVAFRVVdGzZ8/g+0ho8W+HoqIiXn31VdRqNSaTiccff5yEhISgjn3rrbeYPHkyWq02qPI7d+7k8OHD3HPPPUGVr61v4OUl/+SFubPZte8g+ccLSE6Kp7nZQGlFFbdMmkjkkNE4zhxHpNEjjkvBUZSPszgfxU0P07L6dRSXX44kORnboUNIUlORREdjLypCkpAAEgn2umbEGg3Wg4eRpqUgiYnGdroIsVaDODwcl8UCcgXWnw4jTXfH7aeLkKYmgwtMX32HZnAP5GlJmPcewXLsNAkL/kzZIwuIuGs8Yo0aR2MzIoUC8x53PHHhnymduQD9baNR9e2OvboOe6MZ454jtOQXkrToT5x5aCHR908Cl4umzdtQ9MhAnpGIaXcelvxCkl96hOIHXyTm/ongAkezEXGYCmOHOGIxaX+fQ9H0FwgfOQRlegKG3UcxHy0k7eWZnL5/EVG3DMfRZMRuF6HMTKB511FEMilhfTJoKa5EmZkIEjH2egPS8DC/8dq121D1zUSZkUDzrnx3mb4ZWIoqUabFIdGosDUYESvlNO7MRyx312EpqkQRH4EmtxvWynrszSaaduYjkkvReOKySB3SKC21n+9G1y0VdWY89TuPIZZL0fZNx1xUia5fJhKVAkNhJTKN0m/cWFiJRKOi3tNGeN90TEVVSHVhqFNiKFm9lYRxQ5BqVeQvXAUuF+l3X4syPpKTS9aTMOYSoi/rjbG0liOeeObd16KKj+TYkvVkTbsOsVyKzWJDplVxeOHH4HKRdde1qBIiyF+yAXVSNJfPu5OafSfZ/6I73mPqcNQJkRxf/hUZ4y8l65Y/cOyLPWx90d1GbN90skcM5sfXP+WSh8djN7eQIbKj0qhYt/hDXC4XV95+HVHJsXy7/HOuvms0Kb0zOLX/OJ964sM88W+Wf841d43G1mIL6rMYEIF+Hnug0+mCXuh9YbFYeP3113n88cdRqVTe/f5+/fr9rHpCe/4+2LFjB0OGDGH+/Pn8+c9/ZuPGjXz++ecAPPDAA5SUlDBhwgSee+45Zs6cidlspqSkhIULF3Lq1Cmampp45plnAFiyZAn5+fksWbKEefPm8dhjj1FXV8dHH33EvHnzWLFixc/qW0NDIxlpKQAcPnqcO2+9kTOl5dTWNzJtykTKqmqx/bgZcVI3xJHxuFosYLch7XMZzqoSACRJSZg+/B
BZr15Yf/wR06efIo6ORqRUYt60CeXQizG8/xHy3jlYtu/E8Mk6JDHRyPv0QqRQINaHY1jxEfI+nvhqd1wcGQkyKS6TCXlqInX/XI0ytyfSKD3Wk+6tJok2DGl0BNKoCOreXo0qtyfSaD0tJ9zxhpWbsRaW4WgyUvPWJ6gHtI9Lo/WIZFIcRjPy9ARqlq1BPaAH0mg9luPty0hjIqj2ibfWEXHjMCz5RQAo0hKo/MdawgZ2RxrTVoe9vglEIpQZ8ZS/+Smagd3R5Haj4u2NKNLiEasUVK/8Cv3VAwTjEdcOQpmeQNmb69AMcpcpf2sjyvR4pFo18tgIZDF6SpauQzcoG01uFmVvbUSVHkf5e1swnyrH3mSkZOl6tIO6o83tRulbm1Cmx6Md3B2JUo69wYA6I47CpRvQD+pGeG4mxcs2o06L48y7W6jbdhht90ROe+K63EyKPPHid7dQ64mfWroB/aBs9LlZnF72Geq0OBTROgqWrifh+ouxVNRRtzMfTZb7RkiiUlCyeisRg7vjtDuwNhio6RAvXr2VqMHZlG7chTJWj7minpqdx9BmxXvKyClavY3oi7oTe3lvzFUNVO7MR5fpjktVCk6t3kZ4dhKl3x6kYnseZ348SqQn7rI7aWkykTAwG7HEvYw1VNRxYvdRYjPd/fhxzfdoIrWYGo189e4mzM1GTuw+SpwnvsMnvnbxBzgdwd2xB4TTGdzrF2DZsmUolUoef/xxAGbNmsXTTz/N008//bPrCi3+Ppg0aRIAzz//PO+88w56vb5TmW7duvHMM88wcuRIvv76a9atW8f48eMBUKlUNDc309LSwokTJ5BKpfzwww8olUrkcjk//fQT3377LU8//TQ333zzz+pbVkYaUql701Pqeejjcrm8fztbLyYR2A9tw77zMyTZAxDHpXpeadB6cYtEuCwW1DffjHnjRuyFhahGjMBltbXFWyxoJ0/CuH4TLrMF4+q1SDPS2h2vvW0ShnWbaF7+IcY1G1Bd9QdcnjZEIrCdqcBpNAPgMJioeX0F8rREbz9txRU4Te64NDYSe0092Fs/gCKsxRU4PMfXLPuEupWfET7yUlx2p7cf1uIKnCYLANXL1lC78jMU2Snt4g6jBZFcijInA2WvdJS9M9vNhbWoAoenDnNeoXtu7T5z5fO3+dgZoicNw2W1C8adLba2uYB2f9sNZs689CHK9HifOtrGJI+LwFbd4POeb3/AYbRQ+s7nRI++BKej7Tjfv2WRWtRZCZiKq9va8InLI7WEdYo7Ov0dcUlP98ls7TzgtLWd4/BeqW1hb9w9N2K5DGtdM6qkKMSea7e1kHc8QMI1/QnvloBYKkHkiTvtrfMLAx6fSN3hwnbHVx/13FTIJJTuPkbykJ5IZFLPIe4ymkgd363YQkZuNrc+O40zR4vwHYjWJz70xj9w8Ot9nAu4HPagXr81Qou/D/7zn/9w1VVXMXfuXO68806WLVuGw/MhaGxsBECj0QAwfPhwtm7dSkFBQbu9tmuvvZbXXnuNgQMH4nQ6ycrKYvbs2UyYMIHU1FTEYveU+z61/znY9uMeundL570P15CemoxareS9D9aQmhiH9JKRiMRSxCndkV4yEmdFIdYvV+IoPoazsghHdTXqm28GqRTNAw8AIO/fH5FKBSIRLXv3o5k8CaRSwh++HwBFbj/spWWE3TQe609H0Nw2CZFUSvgjbXH1yOGETRhLy/6fsFfUEDntRpC231EUh6nQTx6N+dAxIqfdiKhDXNm3O+Z9edgqaoi650ZEsvbzox9/FZG3jcK48zC2ihqi/zi+Ux36G4cRddsoTHuPuuOytrjLaqd83r8w7srDcuQU1vIaYu+7oVMdrbCW1xI/fRwimQTT0SLi7xuLpaAUcZgSkVhE47ZDgvGGr/dhLaslcfpYRDIpprwiEqaPxVxQhiRMSdydI2g+cJKkGWMRSSWY8opImj4Gc0EZmtxuNO0+hrW8lqQZYxBLpRh94paiSh
LvHkHDjjxaympJm3E9IpkEQ14xadNHYywoJ2fhNCRhSnC6SPfEm/OKSffEe/nEM2Zcj1gmoSmvmIzpozEWlGE3tpB
|
2023-02-16 13:32:30 +01:00
|
|
|
"text/plain": [
|
|
|
|
"<Figure size 432x288 with 2 Axes>"
|
|
|
|
]
|
|
|
|
},
|
|
|
|
"metadata": {},
|
|
|
|
"output_type": "display_data"
|
|
|
|
}
|
|
|
|
],
|
|
|
|
"source": [
|
|
|
|
"corrmat = df_train.corr() \n",
|
2023-02-17 14:01:05 +01:00
|
|
|
"cols = corrmat.nlargest(df_train.shape[1], 'Survived')['Survived'].index \n",
|
2023-02-16 13:32:30 +01:00
|
|
|
"cm = np.corrcoef(df_train[cols].values.T) \n",
|
2023-02-17 14:01:05 +01:00
|
|
|
"sns.set(font_scale=0.7) \n",
|
|
|
|
"hm = sns.heatmap(cm, cbar=True, annot=True, square=True, fmt='.2f', annot_kws={'size': 5}, yticklabels=cols.values, xticklabels=cols.values)"
|
|
|
|
]
|
|
|
|
},
|
|
|
|
{
|
|
|
|
"cell_type": "markdown",
|
|
|
|
"id": "972b9982",
|
|
|
|
"metadata": {},
|
|
|
|
"source": [
|
|
|
|
"## Model"
|
|
|
|
]
|
|
|
|
},
|
|
|
|
{
|
|
|
|
"cell_type": "code",
|
|
|
|
"execution_count": 43,
|
|
|
|
"id": "f8ff6bbd",
|
|
|
|
"metadata": {},
|
|
|
|
"outputs": [],
|
|
|
|
"source": [
|
|
|
|
"from sklearn.linear_model import LogisticRegression\n",
|
|
|
|
"from sklearn.preprocessing import StandardScaler\n",
|
|
|
|
"from imblearn.over_sampling import RandomOverSampler\n",
|
|
|
|
"from sklearn.model_selection import train_test_split"
|
|
|
|
]
|
|
|
|
},
|
|
|
|
{
|
|
|
|
"cell_type": "code",
|
|
|
|
"execution_count": 44,
|
|
|
|
"id": "ae38bd90",
|
|
|
|
"metadata": {},
|
|
|
|
"outputs": [],
|
|
|
|
"source": [
|
|
|
|
"X_train = df_train.drop(['PassengerId', 'Name', 'Ticket', 'Survived'], axis=1)\n",
|
|
|
|
"y_train = df_train['Survived']\n",
|
|
|
|
"\n",
|
|
|
|
"X_test = df_test.drop(['PassengerId', 'Name', 'Ticket'], axis=1)"
|
|
|
|
]
|
|
|
|
},
|
|
|
|
{
|
|
|
|
"cell_type": "code",
|
|
|
|
"execution_count": 45,
|
|
|
|
"id": "79af2916",
|
|
|
|
"metadata": {},
|
|
|
|
"outputs": [],
|
|
|
|
"source": [
|
|
|
|
"ros = RandomOverSampler()\n",
|
|
|
|
"X_train, y_train = ros.fit_resample(X_train, y_train)"
|
|
|
|
]
|
|
|
|
},
|
|
|
|
{
|
|
|
|
"cell_type": "code",
|
|
|
|
"execution_count": 46,
|
|
|
|
"id": "5ef85114",
|
|
|
|
"metadata": {},
|
|
|
|
"outputs": [],
|
|
|
|
"source": [
|
|
|
|
"X_tr, X_val, y_tr, y_val = train_test_split(X_train, y_train, test_size=0.2)"
|
|
|
|
]
|
|
|
|
},
|
|
|
|
{
|
|
|
|
"cell_type": "code",
|
|
|
|
"execution_count": 47,
|
|
|
|
"id": "f93f38a4",
|
|
|
|
"metadata": {},
|
|
|
|
"outputs": [
|
|
|
|
{
|
|
|
|
"data": {
|
|
|
|
"text/plain": [
|
|
|
|
"(549, 549)"
|
|
|
|
]
|
|
|
|
},
|
|
|
|
"execution_count": 47,
|
|
|
|
"metadata": {},
|
|
|
|
"output_type": "execute_result"
|
|
|
|
}
|
|
|
|
],
|
|
|
|
"source": [
|
|
|
|
"len(y_train[y_train==1]), len(y_train[y_train==0])"
|
|
|
|
]
|
|
|
|
},
|
|
|
|
{
|
|
|
|
"cell_type": "code",
|
|
|
|
"execution_count": 48,
|
|
|
|
"id": "09d2817d",
|
|
|
|
"metadata": {},
|
|
|
|
"outputs": [],
|
|
|
|
"source": [
|
|
|
|
"sc = StandardScaler()\n",
|
|
|
|
"\n",
|
|
|
|
"X_train_std = sc.fit_transform(X_tr)\n",
|
|
|
|
"X_val = sc.transform(X_val)"
|
|
|
|
]
|
|
|
|
},
|
|
|
|
{
|
|
|
|
"cell_type": "code",
|
|
|
|
"execution_count": 49,
|
|
|
|
"id": "96fe96d7",
|
|
|
|
"metadata": {},
|
|
|
|
"outputs": [
|
|
|
|
{
|
|
|
|
"data": {
|
|
|
|
"text/plain": [
|
|
|
|
"0.7818181818181819"
|
|
|
|
]
|
|
|
|
},
|
|
|
|
"execution_count": 49,
|
|
|
|
"metadata": {},
|
|
|
|
"output_type": "execute_result"
|
|
|
|
}
|
|
|
|
],
|
|
|
|
"source": [
|
|
|
|
"clf = LogisticRegression()\n",
|
|
|
|
"\n",
|
|
|
|
"clf.fit(X_train_std, y_tr)\n",
|
|
|
|
"clf.score(X_val, y_val)"
|
|
|
|
]
|
|
|
|
},
|
|
|
|
{
|
|
|
|
"cell_type": "code",
|
|
|
|
"execution_count": 50,
|
|
|
|
"id": "dda1026c",
|
|
|
|
"metadata": {},
|
|
|
|
"outputs": [],
|
|
|
|
"source": [
|
|
|
|
"X_test = sc.transform(X_test)\n",
|
|
|
|
"predictions = clf.predict(X_test)"
|
|
|
|
]
|
|
|
|
},
|
|
|
|
{
|
|
|
|
"cell_type": "code",
|
|
|
|
"execution_count": 53,
|
|
|
|
"id": "ea78bc69",
|
|
|
|
"metadata": {},
|
|
|
|
"outputs": [],
|
|
|
|
"source": [
|
|
|
|
"submissionStacking = pd.DataFrame({ 'PassengerId': df_test[\"PassengerId\"],'Survived': predictions })\n",
|
|
|
|
"submissionStacking.to_csv(\"submission.csv\", index=False)"
|
2023-02-16 13:32:30 +01:00
|
|
|
]
|
|
|
|
}
|
|
|
|
],
|
|
|
|
"metadata": {
|
|
|
|
"kernelspec": {
|
|
|
|
"display_name": "Python 3 (ipykernel)",
|
|
|
|
"language": "python",
|
|
|
|
"name": "python3"
|
|
|
|
},
|
|
|
|
"language_info": {
|
|
|
|
"codemirror_mode": {
|
|
|
|
"name": "ipython",
|
|
|
|
"version": 3
|
|
|
|
},
|
|
|
|
"file_extension": ".py",
|
|
|
|
"mimetype": "text/x-python",
|
|
|
|
"name": "python",
|
|
|
|
"nbconvert_exporter": "python",
|
|
|
|
"pygments_lexer": "ipython3",
|
|
|
|
"version": "3.9.7"
|
|
|
|
}
|
|
|
|
},
|
|
|
|
"nbformat": 4,
|
|
|
|
"nbformat_minor": 5
|
|
|
|
}
|