uczenie_maszynowe_zadania/cw_4/.ipynb_checkpoints/main-checkpoint.ipynb
2023-07-04 20:42:14 +02:00

583 lines
17 KiB
Plaintext

{
"cells": [
{
"cell_type": "code",
"execution_count": 254,
"id": "8b45b299",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Survived</th>\n",
" <th>PassengerId</th>\n",
" <th>Pclass</th>\n",
" <th>Name</th>\n",
" <th>Sex</th>\n",
" <th>Age</th>\n",
" <th>SibSp</th>\n",
" <th>Parch</th>\n",
" <th>Ticket</th>\n",
" <th>Fare</th>\n",
" <th>Cabin</th>\n",
" <th>Embarked</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>0</td>\n",
" <td>530</td>\n",
" <td>2</td>\n",
" <td>Hocking\\t Mr. Richard George</td>\n",
" <td>male</td>\n",
" <td>23.0</td>\n",
" <td>2</td>\n",
" <td>1</td>\n",
" <td>29104</td>\n",
" <td>11.5000</td>\n",
" <td>NaN</td>\n",
" <td>S</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>0</td>\n",
" <td>466</td>\n",
" <td>3</td>\n",
" <td>Goncalves\\t Mr. Manuel Estanslas</td>\n",
" <td>male</td>\n",
" <td>38.0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>SOTON/O.Q. 3101306</td>\n",
" <td>7.0500</td>\n",
" <td>NaN</td>\n",
" <td>S</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>0</td>\n",
" <td>753</td>\n",
" <td>3</td>\n",
" <td>Vande Velde\\t Mr. Johannes Joseph</td>\n",
" <td>male</td>\n",
" <td>33.0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>345780</td>\n",
" <td>9.5000</td>\n",
" <td>NaN</td>\n",
" <td>S</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>0</td>\n",
" <td>855</td>\n",
" <td>2</td>\n",
" <td>Carter\\t Mrs. Ernest Courtenay (Lilian Hughes)</td>\n",
" <td>female</td>\n",
" <td>44.0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>244252</td>\n",
" <td>26.0000</td>\n",
" <td>NaN</td>\n",
" <td>S</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>0</td>\n",
" <td>333</td>\n",
" <td>1</td>\n",
" <td>Graham\\t Mr. George Edward</td>\n",
" <td>male</td>\n",
" <td>38.0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>PC 17582</td>\n",
" <td>153.4625</td>\n",
" <td>C91</td>\n",
" <td>S</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" Survived PassengerId Pclass \\\n",
"0 0 530 2 \n",
"1 0 466 3 \n",
"2 0 753 3 \n",
"3 0 855 2 \n",
"4 0 333 1 \n",
"\n",
" Name Sex Age SibSp Parch \\\n",
"0 Hocking\\t Mr. Richard George male 23.0 2 1 \n",
"1 Goncalves\\t Mr. Manuel Estanslas male 38.0 0 0 \n",
"2 Vande Velde\\t Mr. Johannes Joseph male 33.0 0 0 \n",
"3 Carter\\t Mrs. Ernest Courtenay (Lilian Hughes) female 44.0 1 0 \n",
"4 Graham\\t Mr. George Edward male 38.0 0 1 \n",
"\n",
" Ticket Fare Cabin Embarked \n",
"0 29104 11.5000 NaN S \n",
"1 SOTON/O.Q. 3101306 7.0500 NaN S \n",
"2 345780 9.5000 NaN S \n",
"3 244252 26.0000 NaN S \n",
"4 PC 17582 153.4625 C91 S "
]
},
"execution_count": 254,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"import numpy as np\n",
"import pandas as pd\n",
"from sklearn.impute import SimpleImputer\n",
"from sklearn.feature_extraction.text import TfidfVectorizer\n",
"\n",
"# Read the tab-separated file into a DataFrame and preview its first rows.\n",
"data = pd.read_csv(\n",
"    'titanic.tsv',\n",
"    sep='\\t',\n",
")\n",
"data.head()"
]
},
{
"cell_type": "code",
"execution_count": 255,
"id": "8b3702f6",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Survived</th>\n",
" <th>PassengerId</th>\n",
" <th>Pclass</th>\n",
" <th>Sex</th>\n",
" <th>Age</th>\n",
" <th>SibSp</th>\n",
" <th>Parch</th>\n",
" <th>Ticket</th>\n",
" <th>Fare</th>\n",
" <th>Cabin</th>\n",
" <th>Embarked</th>\n",
" <th>Name_to_num</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>0</td>\n",
" <td>530</td>\n",
" <td>2</td>\n",
" <td>1</td>\n",
" <td>23.0</td>\n",
" <td>2</td>\n",
" <td>1</td>\n",
" <td>29104</td>\n",
" <td>11.5000</td>\n",
" <td>NaN</td>\n",
" <td>S</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>0</td>\n",
" <td>466</td>\n",
" <td>3</td>\n",
" <td>1</td>\n",
" <td>38.0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>SOTON/O.Q. 3101306</td>\n",
" <td>7.0500</td>\n",
" <td>NaN</td>\n",
" <td>S</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>0</td>\n",
" <td>753</td>\n",
" <td>3</td>\n",
" <td>1</td>\n",
" <td>33.0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>345780</td>\n",
" <td>9.5000</td>\n",
" <td>NaN</td>\n",
" <td>S</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>0</td>\n",
" <td>855</td>\n",
" <td>2</td>\n",
" <td>0</td>\n",
" <td>44.0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>244252</td>\n",
" <td>26.0000</td>\n",
" <td>NaN</td>\n",
" <td>S</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>0</td>\n",
" <td>333</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>38.0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>PC 17582</td>\n",
" <td>153.4625</td>\n",
" <td>C91</td>\n",
" <td>S</td>\n",
" <td>1</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" Survived PassengerId Pclass Sex Age SibSp Parch Ticket \\\n",
"0 0 530 2 1 23.0 2 1 29104 \n",
"1 0 466 3 1 38.0 0 0 SOTON/O.Q. 3101306 \n",
"2 0 753 3 1 33.0 0 0 345780 \n",
"3 0 855 2 0 44.0 1 0 244252 \n",
"4 0 333 1 1 38.0 0 1 PC 17582 \n",
"\n",
" Fare Cabin Embarked Name_to_num \n",
"0 11.5000 NaN S 1 \n",
"1 7.0500 NaN S 1 \n",
"2 9.5000 NaN S 1 \n",
"3 26.0000 NaN S 0 \n",
"4 153.4625 C91 S 1 "
]
},
"execution_count": 255,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Encode Sex as a binary flag: 1 for 'male', 0 for any other value.\n",
"data['Sex'] = data['Sex'].map(\n",
"    lambda sex: 1 if sex == 'male' else 0\n",
")\n",
"\n",
"# Flag names containing the literal substring 'Mr.'; a title such as 'Mrs.'\n",
"# does not contain that substring, so it maps to 0.\n",
"data['Name_to_num'] = data['Name'].map(\n",
"    lambda full_name: int('Mr.' in full_name)\n",
")\n",
"\n",
"# The raw Name text is removed once the flag has been derived from it.\n",
"del data['Name']\n",
"data.head()"
]
},
{
"cell_type": "code",
"execution_count": 256,
"id": "9253cb6e",
"metadata": {},
"outputs": [],
"source": [
"# Missing Cabin values become the placeholder token 'Undefined' so that every\n",
"# row holds text the vectorizer can tokenize.\n",
"data['Cabin'] = data['Cabin'].replace(to_replace=np.nan, value='Undefined')\n",
"\n",
"# Fit TF-IDF on the Cabin strings, densify the result, then collapse each\n",
"# row's weights into a single number by summing them.\n",
"vectorizer = TfidfVectorizer()\n",
"vector = vectorizer.fit(data['Cabin']).transform(data['Cabin']).toarray()\n",
"vector_sum = [row.sum() for row in vector]\n",
"data['Cabin'] = vector_sum"
]
},
{
"cell_type": "code",
"execution_count": 257,
"id": "0d915dab",
"metadata": {},
"outputs": [],
"source": [
"# Missing Embarked values are labelled 'Undefined' so they receive their own\n",
"# indicator column.\n",
"data['Embarked'] = data['Embarked'].replace(to_replace=np.nan, value='Undefined')\n",
"\n",
"# One-hot encode Embarked; the original column is replaced by Embarked_* columns.\n",
"data = pd.get_dummies(data=data, columns=['Embarked'])"
]
},
{
"cell_type": "code",
"execution_count": 258,
"id": "2f641e28",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"28.00 139\n",
"22.00 20\n",
"21.00 19\n",
"24.00 19\n",
"19.00 17\n",
" ... \n",
"61.00 1\n",
"70.50 1\n",
"0.75 1\n",
"10.00 1\n",
"46.00 1\n",
"Name: Age, Length: 82, dtype: int64"
]
},
"execution_count": 258,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Replace missing Age values with the median of the observed ages.\n",
"imputer = SimpleImputer(\n",
"    strategy='median',\n",
"    missing_values=np.nan,\n",
")\n",
"data[['Age']] = imputer.fit_transform(data[['Age']])\n",
"\n",
"# Show how often each age occurs after imputation.\n",
"data['Age'].value_counts()"
]
},
{
"cell_type": "code",
"execution_count": 259,
"id": "536d5bd7",
"metadata": {},
"outputs": [],
"source": [
"# Missing Ticket values become the placeholder token 'Undefined' so that every\n",
"# row holds text the vectorizer can tokenize.\n",
"data['Ticket'] = data['Ticket'].replace(to_replace=np.nan, value='Undefined')\n",
"\n",
"# Fit TF-IDF on the Ticket strings, densify the result, then collapse each\n",
"# row's weights into a single number by summing them.\n",
"vectorizer = TfidfVectorizer()\n",
"vector = vectorizer.fit(data['Ticket']).transform(data['Ticket']).toarray()\n",
"vector_sum = [row.sum() for row in vector]\n",
"data['Ticket'] = vector_sum"
]
},
{
"cell_type": "code",
"execution_count": 260,
"id": "74e47288",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Survived</th>\n",
" <th>PassengerId</th>\n",
" <th>Pclass</th>\n",
" <th>Sex</th>\n",
" <th>Age</th>\n",
" <th>SibSp</th>\n",
" <th>Parch</th>\n",
" <th>Ticket</th>\n",
" <th>Fare</th>\n",
" <th>Cabin</th>\n",
" <th>Name_to_num</th>\n",
" <th>Embarked_C</th>\n",
" <th>Embarked_Q</th>\n",
" <th>Embarked_S</th>\n",
" <th>Embarked_Undefined</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>0</td>\n",
" <td>530</td>\n",
" <td>2</td>\n",
" <td>1</td>\n",
" <td>23.0</td>\n",
" <td>2</td>\n",
" <td>1</td>\n",
" <td>1.000000</td>\n",
" <td>11.5000</td>\n",
" <td>1.0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>0</td>\n",
" <td>466</td>\n",
" <td>3</td>\n",
" <td>1</td>\n",
" <td>38.0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1.391284</td>\n",
" <td>7.0500</td>\n",
" <td>1.0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>0</td>\n",
" <td>753</td>\n",
" <td>3</td>\n",
" <td>1</td>\n",
" <td>33.0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1.000000</td>\n",
" <td>9.5000</td>\n",
" <td>1.0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>0</td>\n",
" <td>855</td>\n",
" <td>2</td>\n",
" <td>0</td>\n",
" <td>44.0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>1.000000</td>\n",
" <td>26.0000</td>\n",
" <td>1.0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>0</td>\n",
" <td>333</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>38.0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>1.365721</td>\n",
" <td>153.4625</td>\n",
" <td>1.0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" Survived PassengerId Pclass Sex Age SibSp Parch Ticket Fare \\\n",
"0 0 530 2 1 23.0 2 1 1.000000 11.5000 \n",
"1 0 466 3 1 38.0 0 0 1.391284 7.0500 \n",
"2 0 753 3 1 33.0 0 0 1.000000 9.5000 \n",
"3 0 855 2 0 44.0 1 0 1.000000 26.0000 \n",
"4 0 333 1 1 38.0 0 1 1.365721 153.4625 \n",
"\n",
" Cabin Name_to_num Embarked_C Embarked_Q Embarked_S Embarked_Undefined \n",
"0 1.0 1 0 0 1 0 \n",
"1 1.0 1 0 0 1 0 \n",
"2 1.0 1 0 0 1 0 \n",
"3 1.0 0 0 0 1 0 \n",
"4 1.0 1 0 0 1 0 "
]
},
"execution_count": 260,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Preview the frame after all encoding steps above have been applied.\n",
"data.head(n=5)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.10"
}
},
"nbformat": 4,
"nbformat_minor": 5
}