{
"cells": [
{
"cell_type": "code",
"execution_count": 254,
"id": "8b45b299",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"
\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" Survived | \n",
" PassengerId | \n",
" Pclass | \n",
" Name | \n",
" Sex | \n",
" Age | \n",
" SibSp | \n",
" Parch | \n",
" Ticket | \n",
" Fare | \n",
" Cabin | \n",
" Embarked | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 0 | \n",
" 530 | \n",
" 2 | \n",
" Hocking\\t Mr. Richard George | \n",
" male | \n",
" 23.0 | \n",
" 2 | \n",
" 1 | \n",
" 29104 | \n",
" 11.5000 | \n",
" NaN | \n",
" S | \n",
"
\n",
" \n",
" 1 | \n",
" 0 | \n",
" 466 | \n",
" 3 | \n",
" Goncalves\\t Mr. Manuel Estanslas | \n",
" male | \n",
" 38.0 | \n",
" 0 | \n",
" 0 | \n",
" SOTON/O.Q. 3101306 | \n",
" 7.0500 | \n",
" NaN | \n",
" S | \n",
"
\n",
" \n",
" 2 | \n",
" 0 | \n",
" 753 | \n",
" 3 | \n",
" Vande Velde\\t Mr. Johannes Joseph | \n",
" male | \n",
" 33.0 | \n",
" 0 | \n",
" 0 | \n",
" 345780 | \n",
" 9.5000 | \n",
" NaN | \n",
" S | \n",
"
\n",
" \n",
" 3 | \n",
" 0 | \n",
" 855 | \n",
" 2 | \n",
" Carter\\t Mrs. Ernest Courtenay (Lilian Hughes) | \n",
" female | \n",
" 44.0 | \n",
" 1 | \n",
" 0 | \n",
" 244252 | \n",
" 26.0000 | \n",
" NaN | \n",
" S | \n",
"
\n",
" \n",
" 4 | \n",
" 0 | \n",
" 333 | \n",
" 1 | \n",
" Graham\\t Mr. George Edward | \n",
" male | \n",
" 38.0 | \n",
" 0 | \n",
" 1 | \n",
" PC 17582 | \n",
" 153.4625 | \n",
" C91 | \n",
" S | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" Survived PassengerId Pclass \\\n",
"0 0 530 2 \n",
"1 0 466 3 \n",
"2 0 753 3 \n",
"3 0 855 2 \n",
"4 0 333 1 \n",
"\n",
" Name Sex Age SibSp Parch \\\n",
"0 Hocking\\t Mr. Richard George male 23.0 2 1 \n",
"1 Goncalves\\t Mr. Manuel Estanslas male 38.0 0 0 \n",
"2 Vande Velde\\t Mr. Johannes Joseph male 33.0 0 0 \n",
"3 Carter\\t Mrs. Ernest Courtenay (Lilian Hughes) female 44.0 1 0 \n",
"4 Graham\\t Mr. George Edward male 38.0 0 1 \n",
"\n",
" Ticket Fare Cabin Embarked \n",
"0 29104 11.5000 NaN S \n",
"1 SOTON/O.Q. 3101306 7.0500 NaN S \n",
"2 345780 9.5000 NaN S \n",
"3 244252 26.0000 NaN S \n",
"4 PC 17582 153.4625 C91 S "
]
},
"execution_count": 254,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"import numpy as np\n",
"import pandas as pd\n",
"from sklearn.impute import SimpleImputer\n",
"from sklearn.feature_extraction.text import TfidfVectorizer\n",
"\n",
"data = pd.read_csv('titanic.tsv',sep='\\t')\n",
"data.head()"
]
},
{
"cell_type": "code",
"execution_count": 255,
"id": "8b3702f6",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" Survived | \n",
" PassengerId | \n",
" Pclass | \n",
" Sex | \n",
" Age | \n",
" SibSp | \n",
" Parch | \n",
" Ticket | \n",
" Fare | \n",
" Cabin | \n",
" Embarked | \n",
" Name_to_num | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 0 | \n",
" 530 | \n",
" 2 | \n",
" 1 | \n",
" 23.0 | \n",
" 2 | \n",
" 1 | \n",
" 29104 | \n",
" 11.5000 | \n",
" NaN | \n",
" S | \n",
" 1 | \n",
"
\n",
" \n",
" 1 | \n",
" 0 | \n",
" 466 | \n",
" 3 | \n",
" 1 | \n",
" 38.0 | \n",
" 0 | \n",
" 0 | \n",
" SOTON/O.Q. 3101306 | \n",
" 7.0500 | \n",
" NaN | \n",
" S | \n",
" 1 | \n",
"
\n",
" \n",
" 2 | \n",
" 0 | \n",
" 753 | \n",
" 3 | \n",
" 1 | \n",
" 33.0 | \n",
" 0 | \n",
" 0 | \n",
" 345780 | \n",
" 9.5000 | \n",
" NaN | \n",
" S | \n",
" 1 | \n",
"
\n",
" \n",
" 3 | \n",
" 0 | \n",
" 855 | \n",
" 2 | \n",
" 0 | \n",
" 44.0 | \n",
" 1 | \n",
" 0 | \n",
" 244252 | \n",
" 26.0000 | \n",
" NaN | \n",
" S | \n",
" 0 | \n",
"
\n",
" \n",
" 4 | \n",
" 0 | \n",
" 333 | \n",
" 1 | \n",
" 1 | \n",
" 38.0 | \n",
" 0 | \n",
" 1 | \n",
" PC 17582 | \n",
" 153.4625 | \n",
" C91 | \n",
" S | \n",
" 1 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" Survived PassengerId Pclass Sex Age SibSp Parch Ticket \\\n",
"0 0 530 2 1 23.0 2 1 29104 \n",
"1 0 466 3 1 38.0 0 0 SOTON/O.Q. 3101306 \n",
"2 0 753 3 1 33.0 0 0 345780 \n",
"3 0 855 2 0 44.0 1 0 244252 \n",
"4 0 333 1 1 38.0 0 1 PC 17582 \n",
"\n",
" Fare Cabin Embarked Name_to_num \n",
"0 11.5000 NaN S 1 \n",
"1 7.0500 NaN S 1 \n",
"2 9.5000 NaN S 1 \n",
"3 26.0000 NaN S 0 \n",
"4 153.4625 C91 S 1 "
]
},
"execution_count": 255,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"data['Sex'] = data['Sex'].apply(lambda x: 1 if x=='male' else 0)\n",
"data['Name_to_num'] = data['Name'].apply(\n",
" lambda x: 1 if 'Mr.' in x else 0\n",
")\n",
"del data['Name']\n",
"data.head()"
]
},
{
"cell_type": "code",
"execution_count": 256,
"id": "9253cb6e",
"metadata": {},
"outputs": [],
"source": [
"data['Cabin'] = data['Cabin'].replace(np.nan,'Undefined')\n",
"\n",
"\n",
"vectorizer = TfidfVectorizer()\n",
"vectorizer.fit(data['Cabin'])\n",
"vector = vectorizer.transform(data['Cabin']).toarray()\n",
"vector_sum = []\n",
"for v in vector:\n",
" vector_sum.append(v.sum())\n",
"data['Cabin']=vector_sum"
]
},
{
"cell_type": "code",
"execution_count": 257,
"id": "0d915dab",
"metadata": {},
"outputs": [],
"source": [
"data['Embarked'] = data['Embarked'].replace(np.nan,'Undefined')\n",
"\n",
"data = pd.get_dummies(data,columns=['Embarked'])"
]
},
{
"cell_type": "code",
"execution_count": 258,
"id": "2f641e28",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"28.00 139\n",
"22.00 20\n",
"21.00 19\n",
"24.00 19\n",
"19.00 17\n",
" ... \n",
"61.00 1\n",
"70.50 1\n",
"0.75 1\n",
"10.00 1\n",
"46.00 1\n",
"Name: Age, Length: 82, dtype: int64"
]
},
"execution_count": 258,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"imputer = SimpleImputer(missing_values=np.nan, strategy='median')\n",
"data[['Age']] = imputer.fit_transform(data[['Age']])\n",
"data['Age'].value_counts()"
]
},
{
"cell_type": "code",
"execution_count": 259,
"id": "536d5bd7",
"metadata": {},
"outputs": [],
"source": [
"data['Ticket'] = data['Ticket'].replace(np.nan,'Undefined')\n",
"\n",
"vectorizer = TfidfVectorizer()\n",
"vectorizer.fit(data['Ticket'])\n",
"vector = vectorizer.transform(data['Ticket']).toarray()\n",
"vector_sum = []\n",
"for v in vector:\n",
" vector_sum.append(v.sum())\n",
"data['Ticket']=vector_sum"
]
},
{
"cell_type": "code",
"execution_count": 260,
"id": "74e47288",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" Survived | \n",
" PassengerId | \n",
" Pclass | \n",
" Sex | \n",
" Age | \n",
" SibSp | \n",
" Parch | \n",
" Ticket | \n",
" Fare | \n",
" Cabin | \n",
" Name_to_num | \n",
" Embarked_C | \n",
" Embarked_Q | \n",
" Embarked_S | \n",
" Embarked_Undefined | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 0 | \n",
" 530 | \n",
" 2 | \n",
" 1 | \n",
" 23.0 | \n",
" 2 | \n",
" 1 | \n",
" 1.000000 | \n",
" 11.5000 | \n",
" 1.0 | \n",
" 1 | \n",
" 0 | \n",
" 0 | \n",
" 1 | \n",
" 0 | \n",
"
\n",
" \n",
" 1 | \n",
" 0 | \n",
" 466 | \n",
" 3 | \n",
" 1 | \n",
" 38.0 | \n",
" 0 | \n",
" 0 | \n",
" 1.391284 | \n",
" 7.0500 | \n",
" 1.0 | \n",
" 1 | \n",
" 0 | \n",
" 0 | \n",
" 1 | \n",
" 0 | \n",
"
\n",
" \n",
" 2 | \n",
" 0 | \n",
" 753 | \n",
" 3 | \n",
" 1 | \n",
" 33.0 | \n",
" 0 | \n",
" 0 | \n",
" 1.000000 | \n",
" 9.5000 | \n",
" 1.0 | \n",
" 1 | \n",
" 0 | \n",
" 0 | \n",
" 1 | \n",
" 0 | \n",
"
\n",
" \n",
" 3 | \n",
" 0 | \n",
" 855 | \n",
" 2 | \n",
" 0 | \n",
" 44.0 | \n",
" 1 | \n",
" 0 | \n",
" 1.000000 | \n",
" 26.0000 | \n",
" 1.0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 1 | \n",
" 0 | \n",
"
\n",
" \n",
" 4 | \n",
" 0 | \n",
" 333 | \n",
" 1 | \n",
" 1 | \n",
" 38.0 | \n",
" 0 | \n",
" 1 | \n",
" 1.365721 | \n",
" 153.4625 | \n",
" 1.0 | \n",
" 1 | \n",
" 0 | \n",
" 0 | \n",
" 1 | \n",
" 0 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" Survived PassengerId Pclass Sex Age SibSp Parch Ticket Fare \\\n",
"0 0 530 2 1 23.0 2 1 1.000000 11.5000 \n",
"1 0 466 3 1 38.0 0 0 1.391284 7.0500 \n",
"2 0 753 3 1 33.0 0 0 1.000000 9.5000 \n",
"3 0 855 2 0 44.0 1 0 1.000000 26.0000 \n",
"4 0 333 1 1 38.0 0 1 1.365721 153.4625 \n",
"\n",
" Cabin Name_to_num Embarked_C Embarked_Q Embarked_S Embarked_Undefined \n",
"0 1.0 1 0 0 1 0 \n",
"1 1.0 1 0 0 1 0 \n",
"2 1.0 1 0 0 1 0 \n",
"3 1.0 0 0 0 1 0 \n",
"4 1.0 1 0 0 1 0 "
]
},
"execution_count": 260,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"data.head()"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.10"
}
},
"nbformat": 4,
"nbformat_minor": 5
}