{ "cells": [ { "cell_type": "code", "execution_count": 254, "id": "8b45b299", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
SurvivedPassengerIdPclassNameSexAgeSibSpParchTicketFareCabinEmbarked
005302Hocking\\t Mr. Richard Georgemale23.0212910411.5000NaNS
104663Goncalves\\t Mr. Manuel Estanslasmale38.000SOTON/O.Q. 31013067.0500NaNS
207533Vande Velde\\t Mr. Johannes Josephmale33.0003457809.5000NaNS
308552Carter\\t Mrs. Ernest Courtenay (Lilian Hughes)female44.01024425226.0000NaNS
403331Graham\\t Mr. George Edwardmale38.001PC 17582153.4625C91S
\n", "
" ], "text/plain": [ " Survived PassengerId Pclass \\\n", "0 0 530 2 \n", "1 0 466 3 \n", "2 0 753 3 \n", "3 0 855 2 \n", "4 0 333 1 \n", "\n", " Name Sex Age SibSp Parch \\\n", "0 Hocking\\t Mr. Richard George male 23.0 2 1 \n", "1 Goncalves\\t Mr. Manuel Estanslas male 38.0 0 0 \n", "2 Vande Velde\\t Mr. Johannes Joseph male 33.0 0 0 \n", "3 Carter\\t Mrs. Ernest Courtenay (Lilian Hughes) female 44.0 1 0 \n", "4 Graham\\t Mr. George Edward male 38.0 0 1 \n", "\n", " Ticket Fare Cabin Embarked \n", "0 29104 11.5000 NaN S \n", "1 SOTON/O.Q. 3101306 7.0500 NaN S \n", "2 345780 9.5000 NaN S \n", "3 244252 26.0000 NaN S \n", "4 PC 17582 153.4625 C91 S " ] }, "execution_count": 254, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Dependencies used throughout the notebook.\n", "import numpy as np\n", "import pandas as pd\n", "from sklearn.feature_extraction.text import TfidfVectorizer\n", "from sklearn.impute import SimpleImputer\n", "\n", "# Read the tab-separated file into a DataFrame and preview its first rows.\n", "data = pd.read_csv('titanic.tsv', delimiter='\\t')\n", "data.head()" ] }, { "cell_type": "code", "execution_count": 255, "id": "8b3702f6", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
SurvivedPassengerIdPclassSexAgeSibSpParchTicketFareCabinEmbarkedName_to_num
005302123.0212910411.5000NaNS1
104663138.000SOTON/O.Q. 31013067.0500NaNS1
207533133.0003457809.5000NaNS1
308552044.01024425226.0000NaNS0
403331138.001PC 17582153.4625C91S1
\n", "
" ], "text/plain": [ " Survived PassengerId Pclass Sex Age SibSp Parch Ticket \\\n", "0 0 530 2 1 23.0 2 1 29104 \n", "1 0 466 3 1 38.0 0 0 SOTON/O.Q. 3101306 \n", "2 0 753 3 1 33.0 0 0 345780 \n", "3 0 855 2 0 44.0 1 0 244252 \n", "4 0 333 1 1 38.0 0 1 PC 17582 \n", "\n", " Fare Cabin Embarked Name_to_num \n", "0 11.5000 NaN S 1 \n", "1 7.0500 NaN S 1 \n", "2 9.5000 NaN S 1 \n", "3 26.0000 NaN S 0 \n", "4 153.4625 C91 S 1 " ] }, "execution_count": 255, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Read Name before touching Sex: on an accidental re-run the missing Name\n", "# column raises KeyError before Sex could be overwritten with zeros.\n", "# regex=False keeps 'Mr.' a literal substring, so 'Mrs.' does not match;\n", "# na=False maps a missing name to 0 instead of raising TypeError.\n", "is_mr = data['Name'].str.contains('Mr.', regex=False, na=False)\n", "\n", "data['Sex'] = data['Sex'].eq('male').astype(int)  # 1 for 'male', else 0\n", "data['Name_to_num'] = is_mr.astype(int)\n", "del data['Name']\n", "data.head()" ] }, { "cell_type": "code", "execution_count": 256, "id": "9253cb6e", "metadata": {}, "outputs": [], "source": [ "# Missing cabins become the literal token 'Undefined' so they can be vectorised.\n", "cabin_text = data['Cabin'].fillna('Undefined')\n", "\n", "# Collapse each cabin string to one number: the sum of its TF-IDF weights.\n", "# Summing the sparse matrix directly avoids a dense rows-by-vocabulary array\n", "# and a Python-level loop; values equal the dense row sums up to float rounding.\n", "vectorizer = TfidfVectorizer()\n", "cabin_tfidf = vectorizer.fit_transform(cabin_text)\n", "data['Cabin'] = np.asarray(cabin_tfidf.sum(axis=1)).ravel()" ] }, { "cell_type": "code", "execution_count": 257, "id": "0d915dab", "metadata": {}, "outputs": [], "source": [ "data['Embarked'] = data['Embarked'].fillna('Undefined')\n", "\n", "# dtype='uint8' pins the 0/1 indicator columns: pandas 2.0 and later would\n", "# otherwise emit booleans, while older releases already defaulted to uint8.\n", "data = pd.get_dummies(data, columns=['Embarked'], dtype='uint8')" ] }, { "cell_type": "code", "execution_count": 258, "id": "2f641e28", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "28.00 139\n", "22.00 20\n", "21.00 19\n", "24.00 19\n", "19.00 17\n", " ... 
\n", "61.00 1\n", "70.50 1\n", "0.75 1\n", "10.00 1\n", "46.00 1\n", "Name: Age, Length: 82, dtype: int64" ] }, "execution_count": 258, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Fill missing ages with the median of the Age column.\n", "imputer = SimpleImputer(missing_values=np.nan, strategy='median')\n", "data[['Age']] = imputer.fit_transform(data[['Age']])\n", "data['Age'].value_counts()" ] }, { "cell_type": "code", "execution_count": 259, "id": "536d5bd7", "metadata": {}, "outputs": [], "source": [ "# Missing tickets become the literal token 'Undefined' so they can be vectorised.\n", "ticket_text = data['Ticket'].fillna('Undefined')\n", "\n", "# Collapse each ticket string to one number: the sum of its TF-IDF weights.\n", "# Summing the sparse matrix directly avoids a dense rows-by-vocabulary array\n", "# and a Python-level loop; values equal the dense row sums up to float rounding.\n", "vectorizer = TfidfVectorizer()\n", "ticket_tfidf = vectorizer.fit_transform(ticket_text)\n", "data['Ticket'] = np.asarray(ticket_tfidf.sum(axis=1)).ravel()" ] }, { "cell_type": "code", "execution_count": 260, "id": "74e47288", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
SurvivedPassengerIdPclassSexAgeSibSpParchTicketFareCabinName_to_numEmbarked_CEmbarked_QEmbarked_SEmbarked_Undefined
005302123.0211.00000011.50001.010010
104663138.0001.3912847.05001.010010
207533133.0001.0000009.50001.010010
308552044.0101.00000026.00001.000010
403331138.0011.365721153.46251.010010
\n", "
" ], "text/plain": [ " Survived PassengerId Pclass Sex Age SibSp Parch Ticket Fare \\\n", "0 0 530 2 1 23.0 2 1 1.000000 11.5000 \n", "1 0 466 3 1 38.0 0 0 1.391284 7.0500 \n", "2 0 753 3 1 33.0 0 0 1.000000 9.5000 \n", "3 0 855 2 0 44.0 1 0 1.000000 26.0000 \n", "4 0 333 1 1 38.0 0 1 1.365721 153.4625 \n", "\n", " Cabin Name_to_num Embarked_C Embarked_Q Embarked_S Embarked_Undefined \n", "0 1.0 1 0 0 1 0 \n", "1 1.0 1 0 0 1 0 \n", "2 1.0 1 0 0 1 0 \n", "3 1.0 0 0 0 1 0 \n", "4 1.0 1 0 0 1 0 " ] }, "execution_count": 260, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Age was imputed and Cabin/Ticket were replaced by numeric sums in earlier\n", "# cells, so none of them may still hold a missing value at this point.\n", "assert data[['Age', 'Cabin', 'Ticket']].notna().all().all(), (\n", "    'missing values left after preprocessing'\n", ")\n", "data.head()" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.8.10" } }, "nbformat": 4, "nbformat_minor": 5 }