583 lines
17 KiB
Plaintext
583 lines
17 KiB
Plaintext
{
|
|
"cells": [
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 254,
|
|
"id": "8b45b299",
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"data": {
|
|
"text/html": [
|
|
"<div>\n",
|
|
"<style scoped>\n",
|
|
" .dataframe tbody tr th:only-of-type {\n",
|
|
" vertical-align: middle;\n",
|
|
" }\n",
|
|
"\n",
|
|
" .dataframe tbody tr th {\n",
|
|
" vertical-align: top;\n",
|
|
" }\n",
|
|
"\n",
|
|
" .dataframe thead th {\n",
|
|
" text-align: right;\n",
|
|
" }\n",
|
|
"</style>\n",
|
|
"<table border=\"1\" class=\"dataframe\">\n",
|
|
" <thead>\n",
|
|
" <tr style=\"text-align: right;\">\n",
|
|
" <th></th>\n",
|
|
" <th>Survived</th>\n",
|
|
" <th>PassengerId</th>\n",
|
|
" <th>Pclass</th>\n",
|
|
" <th>Name</th>\n",
|
|
" <th>Sex</th>\n",
|
|
" <th>Age</th>\n",
|
|
" <th>SibSp</th>\n",
|
|
" <th>Parch</th>\n",
|
|
" <th>Ticket</th>\n",
|
|
" <th>Fare</th>\n",
|
|
" <th>Cabin</th>\n",
|
|
" <th>Embarked</th>\n",
|
|
" </tr>\n",
|
|
" </thead>\n",
|
|
" <tbody>\n",
|
|
" <tr>\n",
|
|
" <th>0</th>\n",
|
|
" <td>0</td>\n",
|
|
" <td>530</td>\n",
|
|
" <td>2</td>\n",
|
|
" <td>Hocking\\t Mr. Richard George</td>\n",
|
|
" <td>male</td>\n",
|
|
" <td>23.0</td>\n",
|
|
" <td>2</td>\n",
|
|
" <td>1</td>\n",
|
|
" <td>29104</td>\n",
|
|
" <td>11.5000</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>S</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>1</th>\n",
|
|
" <td>0</td>\n",
|
|
" <td>466</td>\n",
|
|
" <td>3</td>\n",
|
|
" <td>Goncalves\\t Mr. Manuel Estanslas</td>\n",
|
|
" <td>male</td>\n",
|
|
" <td>38.0</td>\n",
|
|
" <td>0</td>\n",
|
|
" <td>0</td>\n",
|
|
" <td>SOTON/O.Q. 3101306</td>\n",
|
|
" <td>7.0500</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>S</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>2</th>\n",
|
|
" <td>0</td>\n",
|
|
" <td>753</td>\n",
|
|
" <td>3</td>\n",
|
|
" <td>Vande Velde\\t Mr. Johannes Joseph</td>\n",
|
|
" <td>male</td>\n",
|
|
" <td>33.0</td>\n",
|
|
" <td>0</td>\n",
|
|
" <td>0</td>\n",
|
|
" <td>345780</td>\n",
|
|
" <td>9.5000</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>S</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>3</th>\n",
|
|
" <td>0</td>\n",
|
|
" <td>855</td>\n",
|
|
" <td>2</td>\n",
|
|
" <td>Carter\\t Mrs. Ernest Courtenay (Lilian Hughes)</td>\n",
|
|
" <td>female</td>\n",
|
|
" <td>44.0</td>\n",
|
|
" <td>1</td>\n",
|
|
" <td>0</td>\n",
|
|
" <td>244252</td>\n",
|
|
" <td>26.0000</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>S</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>4</th>\n",
|
|
" <td>0</td>\n",
|
|
" <td>333</td>\n",
|
|
" <td>1</td>\n",
|
|
" <td>Graham\\t Mr. George Edward</td>\n",
|
|
" <td>male</td>\n",
|
|
" <td>38.0</td>\n",
|
|
" <td>0</td>\n",
|
|
" <td>1</td>\n",
|
|
" <td>PC 17582</td>\n",
|
|
" <td>153.4625</td>\n",
|
|
" <td>C91</td>\n",
|
|
" <td>S</td>\n",
|
|
" </tr>\n",
|
|
" </tbody>\n",
|
|
"</table>\n",
|
|
"</div>"
|
|
],
|
|
"text/plain": [
|
|
" Survived PassengerId Pclass \\\n",
|
|
"0 0 530 2 \n",
|
|
"1 0 466 3 \n",
|
|
"2 0 753 3 \n",
|
|
"3 0 855 2 \n",
|
|
"4 0 333 1 \n",
|
|
"\n",
|
|
" Name Sex Age SibSp Parch \\\n",
|
|
"0 Hocking\\t Mr. Richard George male 23.0 2 1 \n",
|
|
"1 Goncalves\\t Mr. Manuel Estanslas male 38.0 0 0 \n",
|
|
"2 Vande Velde\\t Mr. Johannes Joseph male 33.0 0 0 \n",
|
|
"3 Carter\\t Mrs. Ernest Courtenay (Lilian Hughes) female 44.0 1 0 \n",
|
|
"4 Graham\\t Mr. George Edward male 38.0 0 1 \n",
|
|
"\n",
|
|
" Ticket Fare Cabin Embarked \n",
|
|
"0 29104 11.5000 NaN S \n",
|
|
"1 SOTON/O.Q. 3101306 7.0500 NaN S \n",
|
|
"2 345780 9.5000 NaN S \n",
|
|
"3 244252 26.0000 NaN S \n",
|
|
"4 PC 17582 153.4625 C91 S "
|
|
]
|
|
},
|
|
"execution_count": 254,
|
|
"metadata": {},
|
|
"output_type": "execute_result"
|
|
}
|
|
],
|
|
"source": [
|
|
"import numpy as np\n",
|
|
"import pandas as pd\n",
|
|
"from sklearn.impute import SimpleImputer\n",
|
|
"from sklearn.feature_extraction.text import TfidfVectorizer\n",
|
|
"\n",
|
"# Read the tab-separated passenger table and preview its first rows.\n",
"data = pd.read_csv(\n",
"    'titanic.tsv',\n",
"    sep='\\t',\n",
")\n",
"data.head()"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 255,
|
|
"id": "8b3702f6",
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"data": {
|
|
"text/html": [
|
|
"<div>\n",
|
|
"<style scoped>\n",
|
|
" .dataframe tbody tr th:only-of-type {\n",
|
|
" vertical-align: middle;\n",
|
|
" }\n",
|
|
"\n",
|
|
" .dataframe tbody tr th {\n",
|
|
" vertical-align: top;\n",
|
|
" }\n",
|
|
"\n",
|
|
" .dataframe thead th {\n",
|
|
" text-align: right;\n",
|
|
" }\n",
|
|
"</style>\n",
|
|
"<table border=\"1\" class=\"dataframe\">\n",
|
|
" <thead>\n",
|
|
" <tr style=\"text-align: right;\">\n",
|
|
" <th></th>\n",
|
|
" <th>Survived</th>\n",
|
|
" <th>PassengerId</th>\n",
|
|
" <th>Pclass</th>\n",
|
|
" <th>Sex</th>\n",
|
|
" <th>Age</th>\n",
|
|
" <th>SibSp</th>\n",
|
|
" <th>Parch</th>\n",
|
|
" <th>Ticket</th>\n",
|
|
" <th>Fare</th>\n",
|
|
" <th>Cabin</th>\n",
|
|
" <th>Embarked</th>\n",
|
|
" <th>Name_to_num</th>\n",
|
|
" </tr>\n",
|
|
" </thead>\n",
|
|
" <tbody>\n",
|
|
" <tr>\n",
|
|
" <th>0</th>\n",
|
|
" <td>0</td>\n",
|
|
" <td>530</td>\n",
|
|
" <td>2</td>\n",
|
|
" <td>1</td>\n",
|
|
" <td>23.0</td>\n",
|
|
" <td>2</td>\n",
|
|
" <td>1</td>\n",
|
|
" <td>29104</td>\n",
|
|
" <td>11.5000</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>S</td>\n",
|
|
" <td>1</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>1</th>\n",
|
|
" <td>0</td>\n",
|
|
" <td>466</td>\n",
|
|
" <td>3</td>\n",
|
|
" <td>1</td>\n",
|
|
" <td>38.0</td>\n",
|
|
" <td>0</td>\n",
|
|
" <td>0</td>\n",
|
|
" <td>SOTON/O.Q. 3101306</td>\n",
|
|
" <td>7.0500</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>S</td>\n",
|
|
" <td>1</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>2</th>\n",
|
|
" <td>0</td>\n",
|
|
" <td>753</td>\n",
|
|
" <td>3</td>\n",
|
|
" <td>1</td>\n",
|
|
" <td>33.0</td>\n",
|
|
" <td>0</td>\n",
|
|
" <td>0</td>\n",
|
|
" <td>345780</td>\n",
|
|
" <td>9.5000</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>S</td>\n",
|
|
" <td>1</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>3</th>\n",
|
|
" <td>0</td>\n",
|
|
" <td>855</td>\n",
|
|
" <td>2</td>\n",
|
|
" <td>0</td>\n",
|
|
" <td>44.0</td>\n",
|
|
" <td>1</td>\n",
|
|
" <td>0</td>\n",
|
|
" <td>244252</td>\n",
|
|
" <td>26.0000</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>S</td>\n",
|
|
" <td>0</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>4</th>\n",
|
|
" <td>0</td>\n",
|
|
" <td>333</td>\n",
|
|
" <td>1</td>\n",
|
|
" <td>1</td>\n",
|
|
" <td>38.0</td>\n",
|
|
" <td>0</td>\n",
|
|
" <td>1</td>\n",
|
|
" <td>PC 17582</td>\n",
|
|
" <td>153.4625</td>\n",
|
|
" <td>C91</td>\n",
|
|
" <td>S</td>\n",
|
|
" <td>1</td>\n",
|
|
" </tr>\n",
|
|
" </tbody>\n",
|
|
"</table>\n",
|
|
"</div>"
|
|
],
|
|
"text/plain": [
|
|
" Survived PassengerId Pclass Sex Age SibSp Parch Ticket \\\n",
|
|
"0 0 530 2 1 23.0 2 1 29104 \n",
|
|
"1 0 466 3 1 38.0 0 0 SOTON/O.Q. 3101306 \n",
|
|
"2 0 753 3 1 33.0 0 0 345780 \n",
|
|
"3 0 855 2 0 44.0 1 0 244252 \n",
|
|
"4 0 333 1 1 38.0 0 1 PC 17582 \n",
|
|
"\n",
|
|
" Fare Cabin Embarked Name_to_num \n",
|
|
"0 11.5000 NaN S 1 \n",
|
|
"1 7.0500 NaN S 1 \n",
|
|
"2 9.5000 NaN S 1 \n",
|
|
"3 26.0000 NaN S 0 \n",
|
|
"4 153.4625 C91 S 1 "
|
|
]
|
|
},
|
|
"execution_count": 255,
|
|
"metadata": {},
|
|
"output_type": "execute_result"
|
|
}
|
|
],
|
|
"source": [
|
"# Encode Sex as 1 for 'male' and 0 for every other value.\n",
"# Guarded because this cell overwrites its own input: on a second run the\n",
"# column already holds 0/1, and re-encoding would silently turn it into all zeros.\n",
"if not pd.api.types.is_numeric_dtype(data['Sex']):\n",
"    data['Sex'] = data['Sex'].eq('male').astype(int)\n",
"\n",
"# Flag names containing the literal title 'Mr.' (regex=False keeps the dot\n",
"# literal; na=False maps a missing name to 0 instead of raising), then drop the\n",
"# raw text column. Skipped once 'Name' is gone so a re-run does not fail.\n",
"if 'Name' in data.columns:\n",
"    data['Name_to_num'] = (\n",
"        data['Name'].str.contains('Mr.', regex=False, na=False).astype(int)\n",
"    )\n",
"    del data['Name']\n",
"data.head()"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 256,
|
|
"id": "9253cb6e",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
"# Collapse the free-text Cabin column into one number per row: the sum of the\n",
"# row's TF-IDF weights. Guarded because this cell overwrites its own input and\n",
"# TfidfVectorizer only accepts text, so a second run on the numeric result fails.\n",
"if not pd.api.types.is_numeric_dtype(data['Cabin']):\n",
"    # Missing cabins get a placeholder token so every row is a string.\n",
"    cabin_text = data['Cabin'].fillna('Undefined')\n",
"\n",
"    # NOTE(review): with the vectorizer's default L2 row normalisation, a value\n",
"    # made of a single token always sums to 1.0, so this feature may carry\n",
"    # little signal -- confirm that is intended.\n",
"    vectorizer = TfidfVectorizer()\n",
"    cabin_tfidf = vectorizer.fit_transform(cabin_text)\n",
"\n",
"    # Sum the sparse matrix along axis 1 instead of densifying it and looping\n",
"    # over rows in Python; ravel() flattens the (n_rows, 1) result.\n",
"    data['Cabin'] = np.asarray(cabin_tfidf.sum(axis=1)).ravel()"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 257,
|
|
"id": "0d915dab",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
"# One-hot encode the port of embarkation. Missing ports are labelled\n",
"# 'Undefined' first so they receive an indicator column of their own.\n",
"data = pd.get_dummies(\n",
"    data.assign(Embarked=data['Embarked'].replace(np.nan, 'Undefined')),\n",
"    columns=['Embarked'],\n",
")"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 258,
|
|
"id": "2f641e28",
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"data": {
|
|
"text/plain": [
|
|
"28.00 139\n",
|
|
"22.00 20\n",
|
|
"21.00 19\n",
|
|
"24.00 19\n",
|
|
"19.00 17\n",
|
|
" ... \n",
|
|
"61.00 1\n",
|
|
"70.50 1\n",
|
|
"0.75 1\n",
|
|
"10.00 1\n",
|
|
"46.00 1\n",
|
|
"Name: Age, Length: 82, dtype: int64"
|
|
]
|
|
},
|
|
"execution_count": 258,
|
|
"metadata": {},
|
|
"output_type": "execute_result"
|
|
}
|
|
],
|
|
"source": [
|
"# Replace missing ages with the median of the observed ages, then show how\n",
"# often each age value occurs.\n",
"imputer = SimpleImputer(strategy='median', missing_values=np.nan)\n",
"age_filled = imputer.fit_transform(data[['Age']])\n",
"data[['Age']] = age_filled\n",
"data['Age'].value_counts()"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 259,
|
|
"id": "536d5bd7",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
"# Collapse the free-text Ticket column into one number per row: the sum of the\n",
"# row's TF-IDF weights. Guarded because this cell overwrites its own input and\n",
"# TfidfVectorizer only accepts text, so a second run on the numeric result fails.\n",
"if not pd.api.types.is_numeric_dtype(data['Ticket']):\n",
"    # Missing tickets get a placeholder token so every row is a string.\n",
"    ticket_text = data['Ticket'].fillna('Undefined')\n",
"\n",
"    # NOTE(review): with the vectorizer's default L2 row normalisation, a value\n",
"    # made of a single token always sums to 1.0, so this feature may carry\n",
"    # little signal -- confirm that is intended.\n",
"    vectorizer = TfidfVectorizer()\n",
"    ticket_tfidf = vectorizer.fit_transform(ticket_text)\n",
"\n",
"    # Sum the sparse matrix along axis 1 instead of densifying it and looping\n",
"    # over rows in Python; ravel() flattens the (n_rows, 1) result.\n",
"    data['Ticket'] = np.asarray(ticket_tfidf.sum(axis=1)).ravel()"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 260,
|
|
"id": "74e47288",
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"data": {
|
|
"text/html": [
|
|
"<div>\n",
|
|
"<style scoped>\n",
|
|
" .dataframe tbody tr th:only-of-type {\n",
|
|
" vertical-align: middle;\n",
|
|
" }\n",
|
|
"\n",
|
|
" .dataframe tbody tr th {\n",
|
|
" vertical-align: top;\n",
|
|
" }\n",
|
|
"\n",
|
|
" .dataframe thead th {\n",
|
|
" text-align: right;\n",
|
|
" }\n",
|
|
"</style>\n",
|
|
"<table border=\"1\" class=\"dataframe\">\n",
|
|
" <thead>\n",
|
|
" <tr style=\"text-align: right;\">\n",
|
|
" <th></th>\n",
|
|
" <th>Survived</th>\n",
|
|
" <th>PassengerId</th>\n",
|
|
" <th>Pclass</th>\n",
|
|
" <th>Sex</th>\n",
|
|
" <th>Age</th>\n",
|
|
" <th>SibSp</th>\n",
|
|
" <th>Parch</th>\n",
|
|
" <th>Ticket</th>\n",
|
|
" <th>Fare</th>\n",
|
|
" <th>Cabin</th>\n",
|
|
" <th>Name_to_num</th>\n",
|
|
" <th>Embarked_C</th>\n",
|
|
" <th>Embarked_Q</th>\n",
|
|
" <th>Embarked_S</th>\n",
|
|
" <th>Embarked_Undefined</th>\n",
|
|
" </tr>\n",
|
|
" </thead>\n",
|
|
" <tbody>\n",
|
|
" <tr>\n",
|
|
" <th>0</th>\n",
|
|
" <td>0</td>\n",
|
|
" <td>530</td>\n",
|
|
" <td>2</td>\n",
|
|
" <td>1</td>\n",
|
|
" <td>23.0</td>\n",
|
|
" <td>2</td>\n",
|
|
" <td>1</td>\n",
|
|
" <td>1.000000</td>\n",
|
|
" <td>11.5000</td>\n",
|
|
" <td>1.0</td>\n",
|
|
" <td>1</td>\n",
|
|
" <td>0</td>\n",
|
|
" <td>0</td>\n",
|
|
" <td>1</td>\n",
|
|
" <td>0</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>1</th>\n",
|
|
" <td>0</td>\n",
|
|
" <td>466</td>\n",
|
|
" <td>3</td>\n",
|
|
" <td>1</td>\n",
|
|
" <td>38.0</td>\n",
|
|
" <td>0</td>\n",
|
|
" <td>0</td>\n",
|
|
" <td>1.391284</td>\n",
|
|
" <td>7.0500</td>\n",
|
|
" <td>1.0</td>\n",
|
|
" <td>1</td>\n",
|
|
" <td>0</td>\n",
|
|
" <td>0</td>\n",
|
|
" <td>1</td>\n",
|
|
" <td>0</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>2</th>\n",
|
|
" <td>0</td>\n",
|
|
" <td>753</td>\n",
|
|
" <td>3</td>\n",
|
|
" <td>1</td>\n",
|
|
" <td>33.0</td>\n",
|
|
" <td>0</td>\n",
|
|
" <td>0</td>\n",
|
|
" <td>1.000000</td>\n",
|
|
" <td>9.5000</td>\n",
|
|
" <td>1.0</td>\n",
|
|
" <td>1</td>\n",
|
|
" <td>0</td>\n",
|
|
" <td>0</td>\n",
|
|
" <td>1</td>\n",
|
|
" <td>0</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>3</th>\n",
|
|
" <td>0</td>\n",
|
|
" <td>855</td>\n",
|
|
" <td>2</td>\n",
|
|
" <td>0</td>\n",
|
|
" <td>44.0</td>\n",
|
|
" <td>1</td>\n",
|
|
" <td>0</td>\n",
|
|
" <td>1.000000</td>\n",
|
|
" <td>26.0000</td>\n",
|
|
" <td>1.0</td>\n",
|
|
" <td>0</td>\n",
|
|
" <td>0</td>\n",
|
|
" <td>0</td>\n",
|
|
" <td>1</td>\n",
|
|
" <td>0</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>4</th>\n",
|
|
" <td>0</td>\n",
|
|
" <td>333</td>\n",
|
|
" <td>1</td>\n",
|
|
" <td>1</td>\n",
|
|
" <td>38.0</td>\n",
|
|
" <td>0</td>\n",
|
|
" <td>1</td>\n",
|
|
" <td>1.365721</td>\n",
|
|
" <td>153.4625</td>\n",
|
|
" <td>1.0</td>\n",
|
|
" <td>1</td>\n",
|
|
" <td>0</td>\n",
|
|
" <td>0</td>\n",
|
|
" <td>1</td>\n",
|
|
" <td>0</td>\n",
|
|
" </tr>\n",
|
|
" </tbody>\n",
|
|
"</table>\n",
|
|
"</div>"
|
|
],
|
|
"text/plain": [
|
|
" Survived PassengerId Pclass Sex Age SibSp Parch Ticket Fare \\\n",
|
|
"0 0 530 2 1 23.0 2 1 1.000000 11.5000 \n",
|
|
"1 0 466 3 1 38.0 0 0 1.391284 7.0500 \n",
|
|
"2 0 753 3 1 33.0 0 0 1.000000 9.5000 \n",
|
|
"3 0 855 2 0 44.0 1 0 1.000000 26.0000 \n",
|
|
"4 0 333 1 1 38.0 0 1 1.365721 153.4625 \n",
|
|
"\n",
|
|
" Cabin Name_to_num Embarked_C Embarked_Q Embarked_S Embarked_Undefined \n",
|
|
"0 1.0 1 0 0 1 0 \n",
|
|
"1 1.0 1 0 0 1 0 \n",
|
|
"2 1.0 1 0 0 1 0 \n",
|
|
"3 1.0 0 0 0 1 0 \n",
|
|
"4 1.0 1 0 0 1 0 "
|
|
]
|
|
},
|
|
"execution_count": 260,
|
|
"metadata": {},
|
|
"output_type": "execute_result"
|
|
}
|
|
],
|
|
"source": [
|
"# Preview the first rows of the encoded table.\n",
"data.head(n=5)"
|
|
]
|
|
}
|
|
],
|
|
"metadata": {
|
|
"kernelspec": {
|
|
"display_name": "Python 3 (ipykernel)",
|
|
"language": "python",
|
|
"name": "python3"
|
|
},
|
|
"language_info": {
|
|
"codemirror_mode": {
|
|
"name": "ipython",
|
|
"version": 3
|
|
},
|
|
"file_extension": ".py",
|
|
"mimetype": "text/x-python",
|
|
"name": "python",
|
|
"nbconvert_exporter": "python",
|
|
"pygments_lexer": "ipython3",
|
|
"version": "3.8.10"
|
|
}
|
|
},
|
|
"nbformat": 4,
|
|
"nbformat_minor": 5
|
|
}
|