140 KiB
140 KiB
Titanic_Machine_Learning_from_Disaster
Imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
Data description
# Load the training set from the CSV file in the working directory.
df_train = pd.read_csv('train.csv')
# Display the column labels of the loaded frame.
df_train.columns
Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'], dtype='object')
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
df_train.describe()
| | PassengerId | Survived | Pclass | Age | SibSp | Parch | Fare |
---|---|---|---|---|---|---|---|
count | 891.000000 | 891.000000 | 891.000000 | 714.000000 | 891.000000 | 891.000000 | 891.000000 |
mean | 446.000000 | 0.383838 | 2.308642 | 29.699118 | 0.523008 | 0.381594 | 32.204208 |
std | 257.353842 | 0.486592 | 0.836071 | 14.526497 | 1.102743 | 0.806057 | 49.693429 |
min | 1.000000 | 0.000000 | 1.000000 | 0.420000 | 0.000000 | 0.000000 | 0.000000 |
25% | 223.500000 | 0.000000 | 2.000000 | 20.125000 | 0.000000 | 0.000000 | 7.910400 |
50% | 446.000000 | 0.000000 | 3.000000 | 28.000000 | 0.000000 | 0.000000 | 14.454200 |
75% | 668.500000 | 1.000000 | 3.000000 | 38.000000 | 1.000000 | 0.000000 | 31.000000 |
max | 891.000000 | 1.000000 | 3.000000 | 80.000000 | 8.000000 | 6.000000 | 512.329200 |
Analysis before data cleaning
# Violin plot of the 'Survived' flag for each value of 'Sex' (raw, pre-cleaning data).
sns.catplot(data=df_train, kind='violin', x='Sex', y='Survived')
<seaborn.axisgrid.FacetGrid at 0x1f232d8f6d0>
# sns.pairplot(data=df_train, hue='Survived')
Data Cleaning
# One-hot encode the 'Sex' and 'Embarked' columns.
df_train = pd.get_dummies(df_train, columns=['Sex', 'Embarked'])

# Remove 'Sex_male' (only 'Sex_female' is kept as the sex indicator) together
# with the 'Name', 'Ticket' and 'PassengerId' columns.
df_train = df_train.drop(columns=['Sex_male', 'Name', 'Ticket', 'PassengerId'])

# Replace missing cabin entries with the explicit placeholder label 'None'.
df_train = df_train.fillna({'Cabin': 'None'})

# Display the distinct cabin labels that remain.
df_train['Cabin'].unique()
array(['None', 'C85', 'C123', 'E46', 'G6', 'C103', 'D56', 'A6', 'C23 C25 C27', 'B78', 'D33', 'B30', 'C52', 'B28', 'C83', 'F33', 'F G73', 'E31', 'A5', 'D10 D12', 'D26', 'C110', 'B58 B60', 'E101', 'F E69', 'D47', 'B86', 'F2', 'C2', 'E33', 'B19', 'A7', 'C49', 'F4', 'A32', 'B4', 'B80', 'A31', 'D36', 'D15', 'C93', 'C78', 'D35', 'C87', 'B77', 'E67', 'B94', 'C125', 'C99', 'C118', 'D7', 'A19', 'B49', 'D', 'C22 C26', 'C106', 'C65', 'E36', 'C54', 'B57 B59 B63 B66', 'C7', 'E34', 'C32', 'B18', 'C124', 'C91', 'E40', 'T', 'C128', 'D37', 'B35', 'E50', 'C82', 'B96 B98', 'E10', 'E44', 'A34', 'C104', 'C111', 'C92', 'E38', 'D21', 'E12', 'E63', 'A14', 'B37', 'C30', 'D20', 'B79', 'E25', 'D46', 'B73', 'C95', 'B38', 'B39', 'B22', 'C86', 'C70', 'A16', 'C101', 'C68', 'A10', 'E68', 'B41', 'A20', 'D19', 'D50', 'D9', 'A23', 'B50', 'A26', 'D48', 'E58', 'C126', 'B71', 'B51 B53 B55', 'D49', 'B5', 'B20', 'F G63', 'C62 C64', 'E24', 'C90', 'C45', 'E8', 'B101', 'D45', 'C46', 'D30', 'E121', 'D11', 'E77', 'F38', 'B3', 'D6', 'B82 B84', 'D17', 'A36', 'B102', 'B69', 'E49', 'C47', 'D28', 'E17', 'A24', 'C50', 'B42', 'C148'], dtype=object)
# Number of passengers per cabin label, including the 'None' placeholder.
df_train['Cabin'].value_counts()
None 687 C23 C25 C27 4 G6 4 B96 B98 4 C22 C26 3 ... E34 1 C7 1 C54 1 E36 1 C148 1 Name: Cabin, Length: 148, dtype: int64
# Fill missing ages with the mean of the ages that are present.
mean_age = df_train['Age'].mean()
df_train['Age'] = df_train['Age'].fillna(mean_age)
# Replace every cabin label with a bucket describing how many passengers share
# that label.
#
# The assignments go through a single `.loc[mask, 'Cabin']` call per bucket.
# The chained form `df_train['Cabin'][mask] = ...` assigns into an intermediate
# object, which raises SettingWithCopyWarning and is not guaranteed to modify
# `df_train` itself.
counts = df_train['Cabin'].value_counts()

# Per-row number of passengers sharing the row's cabin label. It is computed
# once, before any relabelling, so the masks below do not depend on the order
# of the assignments.
cabin_occupancy = df_train['Cabin'].map(counts)

df_train.loc[cabin_occupancy == 1, 'Cabin'] = 'Alone'
df_train.loc[cabin_occupancy == 2, 'Cabin'] = 'Double room'
df_train.loc[cabin_occupancy == 3, 'Cabin'] = 'Three person room'
df_train.loc[cabin_occupancy == 4, 'Cabin'] = 'Four person room'
# In this data set the only label shared by more than four passengers is the
# 'None' placeholder (687 rows), so 'Other' effectively means "no cabin recorded".
df_train.loc[cabin_occupancy > 4, 'Cabin'] = 'Other'
C:\Users\Maciej\AppData\Local\Temp/ipykernel_8664/4127688350.py:2: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy df_train['Cabin'][df_train['Cabin'].isin(counts[counts==1].index)] = 'Alone' C:\Users\Maciej\AppData\Local\Temp/ipykernel_8664/4127688350.py:3: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy df_train['Cabin'][df_train['Cabin'].isin(counts[counts==2].index)] = 'Double room' C:\Users\Maciej\AppData\Local\Temp/ipykernel_8664/4127688350.py:4: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy df_train['Cabin'][df_train['Cabin'].isin(counts[counts==3].index)] = 'Three person room' C:\Users\Maciej\AppData\Local\Temp/ipykernel_8664/4127688350.py:5: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy df_train['Cabin'][df_train['Cabin'].isin(counts[counts==4].index)] = 'Four person room' C:\Users\Maciej\AppData\Local\Temp/ipykernel_8664/4127688350.py:6: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy df_train['Cabin'][df_train['Cabin'].isin(counts[counts>4].index)] = 'Other'
# Number of passengers in each cabin bucket after the relabelling.
df_train['Cabin'].value_counts()
Other 687 Alone 101 Double room 76 Three person room 15 Four person room 12 Name: Cabin, dtype: int64
# One-hot encode the 'Cabin' column.
df_train = pd.get_dummies(data=df_train, columns=['Cabin'])
Analysis
# Heatmap of the pairwise correlations between the 15 columns that correlate
# most strongly (largest coefficient) with 'Survived'.
corrmat = df_train.corr()

# Labels of the 15 columns with the largest correlation to 'Survived'
# ('Survived' itself comes first, with a coefficient of 1).
cols = corrmat.nlargest(15, 'Survived')['Survived'].index

# Reuse the matrix computed above instead of recomputing it with np.corrcoef
# on `df_train[cols].values`. The slice holds the same Pearson coefficients
# (the frame is expected to contain no missing values at this point), and it
# does not depend on `.values` producing a purely numeric array, which is not
# the case when the frame mixes boolean indicator columns with float columns.
cm = corrmat.loc[cols, cols].values

sns.set(font_scale=1.25)
hm = sns.heatmap(
    cm,
    cbar=True,
    annot=True,
    square=True,
    fmt='.2f',
    annot_kws={'size': 8},
    yticklabels=cols.values,
    xticklabels=cols.values,
)