317 lines
9.2 KiB
Plaintext
317 lines
9.2 KiB
Plaintext
|
{
|
||
|
"cells": [
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": 139,
|
||
|
"id": "02249c82",
|
||
|
"metadata": {},
|
||
|
"outputs": [],
|
||
|
"source": [
|
||
|
"import numpy as np\n",
|
||
|
"import pandas as pd\n",
|
||
|
"from sklearn.impute import SimpleImputer\n",
|
||
|
"from sklearn.feature_extraction.text import TfidfVectorizer\n",
|
||
|
"\n",
|
||
|
"from sklearn.linear_model import LogisticRegression\n",
|
||
|
"from sklearn.model_selection import train_test_split\n",
|
||
|
"from sklearn.metrics import precision_recall_fscore_support"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": 140,
|
||
|
"id": "4be6437d",
|
||
|
"metadata": {},
|
||
|
"outputs": [],
|
||
|
"source": [
|
||
|
"# Load the tab-separated passenger table.\n",
"data = pd.read_csv('titanic.tsv', sep='\\t')\n",
"\n",
"\n",
"def tfidf_row_sum(values):\n",
"    \"\"\"Return one float per row: the sum of that row's TF-IDF weights.\n",
"\n",
"    Missing entries are first replaced with the placeholder 'Undefined'.\n",
"    A fresh TfidfVectorizer is fitted on the column on every call, so the\n",
"    vocabularies of different columns never mix.\n",
"    \"\"\"\n",
"    filled = values.fillna('Undefined')\n",
"    weights = TfidfVectorizer().fit_transform(filled)\n",
"    # Row sums are taken on the sparse matrix directly, which avoids a dense\n",
"    # (rows x vocabulary) copy and a Python-level loop over the rows.\n",
"    return np.asarray(weights.sum(axis=1)).ravel()\n",
"\n",
"\n",
"# Data formatting\n",
"# Sex becomes 1 for 'male' and 0 for every other value.\n",
"data['Sex'] = data['Sex'].eq('male').astype(int)\n",
"\n",
"# 1 when the name contains the literal substring 'Mr.', else 0; the raw\n",
"# name column is dropped afterwards.\n",
"data['Name_to_num'] = data['Name'].apply(\n",
"    lambda name: 1 if 'Mr.' in name else 0\n",
")\n",
"del data['Name']\n",
"\n",
"data['Cabin'] = tfidf_row_sum(data['Cabin'])\n",
"\n",
"# One-hot encode the port of embarkation; missing ports get their own\n",
"# 'Embarked_Undefined' column.\n",
"data['Embarked'] = data['Embarked'].fillna('Undefined')\n",
"data = pd.get_dummies(data, columns=['Embarked'])\n",
"\n",
"# Replace missing ages with the median age.\n",
"# NOTE(review): the median is fitted on all rows, i.e. before the train/test\n",
"# split, so test rows influence the imputed value - confirm this is acceptable.\n",
"imputer = SimpleImputer(missing_values=np.nan, strategy='median')\n",
"data[['Age']] = imputer.fit_transform(data[['Age']])\n",
"\n",
"data['Ticket'] = tfidf_row_sum(data['Ticket'])"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": 141,
|
||
|
"id": "618e8841",
|
||
|
"metadata": {},
|
||
|
"outputs": [
|
||
|
{
|
||
|
"data": {
|
||
|
"text/html": [
|
||
|
"<div>\n",
|
||
|
"<style scoped>\n",
|
||
|
" .dataframe tbody tr th:only-of-type {\n",
|
||
|
" vertical-align: middle;\n",
|
||
|
" }\n",
|
||
|
"\n",
|
||
|
" .dataframe tbody tr th {\n",
|
||
|
" vertical-align: top;\n",
|
||
|
" }\n",
|
||
|
"\n",
|
||
|
" .dataframe thead th {\n",
|
||
|
" text-align: right;\n",
|
||
|
" }\n",
|
||
|
"</style>\n",
|
||
|
"<table border=\"1\" class=\"dataframe\">\n",
|
||
|
" <thead>\n",
|
||
|
" <tr style=\"text-align: right;\">\n",
|
||
|
" <th></th>\n",
|
||
|
" <th>Survived</th>\n",
|
||
|
" <th>PassengerId</th>\n",
|
||
|
" <th>Pclass</th>\n",
|
||
|
" <th>Sex</th>\n",
|
||
|
" <th>Age</th>\n",
|
||
|
" <th>SibSp</th>\n",
|
||
|
" <th>Parch</th>\n",
|
||
|
" <th>Ticket</th>\n",
|
||
|
" <th>Fare</th>\n",
|
||
|
" <th>Cabin</th>\n",
|
||
|
" <th>Name_to_num</th>\n",
|
||
|
" <th>Embarked_C</th>\n",
|
||
|
" <th>Embarked_Q</th>\n",
|
||
|
" <th>Embarked_S</th>\n",
|
||
|
" <th>Embarked_Undefined</th>\n",
|
||
|
" </tr>\n",
|
||
|
" </thead>\n",
|
||
|
" <tbody>\n",
|
||
|
" <tr>\n",
|
||
|
" <th>0</th>\n",
|
||
|
" <td>0</td>\n",
|
||
|
" <td>530</td>\n",
|
||
|
" <td>2</td>\n",
|
||
|
" <td>1</td>\n",
|
||
|
" <td>23.0</td>\n",
|
||
|
" <td>2</td>\n",
|
||
|
" <td>1</td>\n",
|
||
|
" <td>1.000000</td>\n",
|
||
|
" <td>11.5000</td>\n",
|
||
|
" <td>1.0</td>\n",
|
||
|
" <td>1</td>\n",
|
||
|
" <td>0</td>\n",
|
||
|
" <td>0</td>\n",
|
||
|
" <td>1</td>\n",
|
||
|
" <td>0</td>\n",
|
||
|
" </tr>\n",
|
||
|
" <tr>\n",
|
||
|
" <th>1</th>\n",
|
||
|
" <td>0</td>\n",
|
||
|
" <td>466</td>\n",
|
||
|
" <td>3</td>\n",
|
||
|
" <td>1</td>\n",
|
||
|
" <td>38.0</td>\n",
|
||
|
" <td>0</td>\n",
|
||
|
" <td>0</td>\n",
|
||
|
" <td>1.391284</td>\n",
|
||
|
" <td>7.0500</td>\n",
|
||
|
" <td>1.0</td>\n",
|
||
|
" <td>1</td>\n",
|
||
|
" <td>0</td>\n",
|
||
|
" <td>0</td>\n",
|
||
|
" <td>1</td>\n",
|
||
|
" <td>0</td>\n",
|
||
|
" </tr>\n",
|
||
|
" <tr>\n",
|
||
|
" <th>2</th>\n",
|
||
|
" <td>0</td>\n",
|
||
|
" <td>753</td>\n",
|
||
|
" <td>3</td>\n",
|
||
|
" <td>1</td>\n",
|
||
|
" <td>33.0</td>\n",
|
||
|
" <td>0</td>\n",
|
||
|
" <td>0</td>\n",
|
||
|
" <td>1.000000</td>\n",
|
||
|
" <td>9.5000</td>\n",
|
||
|
" <td>1.0</td>\n",
|
||
|
" <td>1</td>\n",
|
||
|
" <td>0</td>\n",
|
||
|
" <td>0</td>\n",
|
||
|
" <td>1</td>\n",
|
||
|
" <td>0</td>\n",
|
||
|
" </tr>\n",
|
||
|
" <tr>\n",
|
||
|
" <th>3</th>\n",
|
||
|
" <td>0</td>\n",
|
||
|
" <td>855</td>\n",
|
||
|
" <td>2</td>\n",
|
||
|
" <td>0</td>\n",
|
||
|
" <td>44.0</td>\n",
|
||
|
" <td>1</td>\n",
|
||
|
" <td>0</td>\n",
|
||
|
" <td>1.000000</td>\n",
|
||
|
" <td>26.0000</td>\n",
|
||
|
" <td>1.0</td>\n",
|
||
|
" <td>0</td>\n",
|
||
|
" <td>0</td>\n",
|
||
|
" <td>0</td>\n",
|
||
|
" <td>1</td>\n",
|
||
|
" <td>0</td>\n",
|
||
|
" </tr>\n",
|
||
|
" <tr>\n",
|
||
|
" <th>4</th>\n",
|
||
|
" <td>0</td>\n",
|
||
|
" <td>333</td>\n",
|
||
|
" <td>1</td>\n",
|
||
|
" <td>1</td>\n",
|
||
|
" <td>38.0</td>\n",
|
||
|
" <td>0</td>\n",
|
||
|
" <td>1</td>\n",
|
||
|
" <td>1.365721</td>\n",
|
||
|
" <td>153.4625</td>\n",
|
||
|
" <td>1.0</td>\n",
|
||
|
" <td>1</td>\n",
|
||
|
" <td>0</td>\n",
|
||
|
" <td>0</td>\n",
|
||
|
" <td>1</td>\n",
|
||
|
" <td>0</td>\n",
|
||
|
" </tr>\n",
|
||
|
" </tbody>\n",
|
||
|
"</table>\n",
|
||
|
"</div>"
|
||
|
],
|
||
|
"text/plain": [
|
||
|
" Survived PassengerId Pclass Sex Age SibSp Parch Ticket Fare \\\n",
|
||
|
"0 0 530 2 1 23.0 2 1 1.000000 11.5000 \n",
|
||
|
"1 0 466 3 1 38.0 0 0 1.391284 7.0500 \n",
|
||
|
"2 0 753 3 1 33.0 0 0 1.000000 9.5000 \n",
|
||
|
"3 0 855 2 0 44.0 1 0 1.000000 26.0000 \n",
|
||
|
"4 0 333 1 1 38.0 0 1 1.365721 153.4625 \n",
|
||
|
"\n",
|
||
|
" Cabin Name_to_num Embarked_C Embarked_Q Embarked_S Embarked_Undefined \n",
|
||
|
"0 1.0 1 0 0 1 0 \n",
|
||
|
"1 1.0 1 0 0 1 0 \n",
|
||
|
"2 1.0 1 0 0 1 0 \n",
|
||
|
"3 1.0 0 0 0 1 0 \n",
|
||
|
"4 1.0 1 0 0 1 0 "
|
||
|
]
|
||
|
},
|
||
|
"execution_count": 141,
|
||
|
"metadata": {},
|
||
|
"output_type": "execute_result"
|
||
|
}
|
||
|
],
|
||
|
"source": [
|
||
|
"# Preview the first five rows of the preprocessed table.\n",
"data.head(n=5)"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": 142,
|
||
|
"id": "7134ea55",
|
||
|
"metadata": {},
|
||
|
"outputs": [],
|
||
|
"source": [
|
||
|
"# Fixed seed so the train/test split is identical on every run.\n",
"RANDOM_STATE = 42\n",
"\n",
"# Split the data into a training set and a test set (80% / 20%), keeping the\n",
"# proportion of survivors the same in both parts.\n",
"data_train, data_test = train_test_split(\n",
"    data,\n",
"    test_size=0.2,\n",
"    random_state=RANDOM_STATE,\n",
"    stratify=data['Survived'],\n",
")\n",
"\n",
"# Feature columns fed to the model.\n",
"FEATURES = ['Sex', 'Age', 'Embarked_C', 'Embarked_Q', 'Embarked_S']\n",
"x_train = data_train[FEATURES]\n",
"# Kept as a single-column DataFrame; it is flattened to 1-D when fitting.\n",
"y_train = data_train[['Survived']]\n",
"model = LogisticRegression()"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": 143,
|
||
|
"id": "1ff85122",
|
||
|
"metadata": {},
|
||
|
"outputs": [],
|
||
|
"source": [
|
||
|
"# Held-out inputs and their true labels (single-column DataFrame).\n",
"x_test = data_test[FEATURES]\n",
"y_expected = data_test[['Survived']]\n",
"\n",
"# Train the model; the target frame is flattened to a 1-D array first.\n",
"model.fit(x_train, y_train.to_numpy().ravel())\n",
"\n",
"# Predict the outcome for every held-out row.\n",
"y_predicted = model.predict(x_test)"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": 144,
|
||
|
"id": "0dff77b1",
|
||
|
"metadata": {},
|
||
|
"outputs": [
|
||
|
{
|
||
|
"name": "stdout",
|
||
|
"output_type": "stream",
|
||
|
"text": [
|
||
|
"Precision: 0.832\n",
|
||
|
"Recall: 0.832\n",
|
||
|
"F-score: 0.832\n",
|
||
|
"Model score: 0.832\n"
|
||
|
]
|
||
|
}
|
||
|
],
|
||
|
"source": [
|
||
|
"# Evaluate the predictions.\n",
"# average='binary' reports precision / recall / F-score for the positive\n",
"# class (label 1). With average='micro' on a two-class target all three\n",
"# collapse to plain accuracy and merely repeat the model score.\n",
"y_true = y_expected.to_numpy().ravel()\n",
"precision, recall, fscore = precision_recall_fscore_support(\n",
"    y_true, y_predicted, average='binary'\n",
")[:3]\n",
"\n",
"print(f\"Precision: {precision}\")\n",
"print(f\"Recall: {recall}\")\n",
"print(f\"F-score: {fscore}\")\n",
"\n",
"# Mean accuracy of the classifier on the held-out rows.\n",
"score = model.score(x_test, y_true)\n",
"\n",
"print(f\"Model score: {score}\")"
|
||
|
]
|
||
|
}
|
||
|
],
|
||
|
"metadata": {
|
||
|
"kernelspec": {
|
||
|
"display_name": "Python 3 (ipykernel)",
|
||
|
"language": "python",
|
||
|
"name": "python3"
|
||
|
},
|
||
|
"language_info": {
|
||
|
"codemirror_mode": {
|
||
|
"name": "ipython",
|
||
|
"version": 3
|
||
|
},
|
||
|
"file_extension": ".py",
|
||
|
"mimetype": "text/x-python",
|
||
|
"name": "python",
|
||
|
"nbconvert_exporter": "python",
|
||
|
"pygments_lexer": "ipython3",
|
||
|
"version": "3.8.10"
|
||
|
}
|
||
|
},
|
||
|
"nbformat": 4,
|
||
|
"nbformat_minor": 5
|
||
|
}
|