{ "cells": [
 { "cell_type": "code", "execution_count": 139, "id": "02249c82", "metadata": {}, "outputs": [], "source": [
  "import numpy as np\n",
  "import pandas as pd\n",
  "from sklearn.impute import SimpleImputer\n",
  "from sklearn.feature_extraction.text import TfidfVectorizer\n",
  "\n",
  "from sklearn.linear_model import LogisticRegression\n",
  "from sklearn.model_selection import train_test_split\n",
  "from sklearn.metrics import precision_recall_fscore_support"
 ] },
 { "cell_type": "code", "execution_count": 140, "id": "4be6437d", "metadata": {}, "outputs": [], "source": [
  "def tfidf_row_sums(text_column):\n",
  "    \"\"\"Return the per-row sum of TF-IDF weights for a column of strings.\n",
  "\n",
  "    A fresh TfidfVectorizer is fitted on `text_column`; the result is a 1-D\n",
  "    array holding one float per row.\n",
  "    \"\"\"\n",
  "    weights = TfidfVectorizer().fit_transform(text_column)\n",
  "    # Sum the sparse matrix row-wise instead of densifying it and looping in Python.\n",
  "    return np.asarray(weights.sum(axis=1)).ravel()\n",
  "\n",
  "\n",
  "data = pd.read_csv('titanic.tsv', sep='\\t')\n",
  "\n",
  "# Data formatting.\n",
  "# Sex becomes a binary flag: 1 for 'male', 0 for anything else (including missing).\n",
  "data['Sex'] = (data['Sex'] == 'male').astype(int)\n",
  "# 1 when the name contains the literal substring 'Mr.', else 0.\n",
  "# na=False makes a missing name count as 0 instead of raising.\n",
  "data['Name_to_num'] = (\n",
  "    data['Name'].str.contains('Mr.', regex=False, na=False).astype(int)\n",
  ")\n",
  "data = data.drop(columns=['Name'])\n",
  "\n",
  "# Free-text columns: mark missing values explicitly, then replace each value\n",
  "# with the sum of its TF-IDF weights.\n",
  "for text_col in ['Cabin', 'Ticket']:\n",
  "    data[text_col] = tfidf_row_sums(data[text_col].fillna('Undefined'))\n",
  "\n",
  "# One-hot encode Embarked, keeping missing values as their own 'Undefined' category.\n",
  "data['Embarked'] = data['Embarked'].fillna('Undefined')\n",
  "data = pd.get_dummies(data, columns=['Embarked'])\n",
  "\n",
  "# Fill missing ages with the median of the Age column.\n",
  "imputer = SimpleImputer(missing_values=np.nan, strategy='median')\n",
  "data[['Age']] = imputer.fit_transform(data[['Age']])"
 ] },
 { "cell_type": "code", "execution_count": 141, "id": "618e8841", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
SurvivedPassengerIdPclassSexAgeSibSpParchTicketFareCabinName_to_numEmbarked_CEmbarked_QEmbarked_SEmbarked_Undefined
005302123.0211.00000011.50001.010010
104663138.0001.3912847.05001.010010
207533133.0001.0000009.50001.010010
308552044.0101.00000026.00001.000010
403331138.0011.365721153.46251.010010
\n", "
" ], "text/plain": [ " Survived PassengerId Pclass Sex Age SibSp Parch Ticket Fare \\\n", "0 0 530 2 1 23.0 2 1 1.000000 11.5000 \n", "1 0 466 3 1 38.0 0 0 1.391284 7.0500 \n", "2 0 753 3 1 33.0 0 0 1.000000 9.5000 \n", "3 0 855 2 0 44.0 1 0 1.000000 26.0000 \n", "4 0 333 1 1 38.0 0 1 1.365721 153.4625 \n", "\n", " Cabin Name_to_num Embarked_C Embarked_Q Embarked_S Embarked_Undefined \n", "0 1.0 1 0 0 1 0 \n", "1 1.0 1 0 0 1 0 \n", "2 1.0 1 0 0 1 0 \n", "3 1.0 0 0 0 1 0 \n", "4 1.0 1 0 0 1 0 " ] }, "execution_count": 141, "metadata": {}, "output_type": "execute_result" } ], "source": [ "data.head()" ] },
 { "cell_type": "code", "execution_count": 142, "id": "7134ea55", "metadata": {}, "outputs": [], "source": [
  "# Fixed seed so the train/test split, and every metric computed from it,\n",
  "# is the same on each run.\n",
  "RANDOM_STATE = 42\n",
  "\n",
  "# Split the data into a training set and a test set.\n",
  "data_train, data_test = train_test_split(\n",
  "    data, test_size=0.2, random_state=RANDOM_STATE\n",
  ")\n",
  "\n",
  "# Feature columns fed to the model; 'Survived' is the target.\n",
  "FEATURES = ['Sex','Age','Embarked_C','Embarked_Q','Embarked_S']\n",
  "x_train = data_train[FEATURES]\n",
  "y_train = data_train['Survived']\n",
  "model = LogisticRegression()"
 ] },
 { "cell_type": "code", "execution_count": 143, "id": "1ff85122", "metadata": {}, "outputs": [], "source": [
  "# Train the model.\n",
  "model.fit(x_train, y_train)\n",
  "\n",
  "# Predict on the held-out test set.\n",
  "x_test = data_test[FEATURES]\n",
  "y_expected = data_test['Survived']\n",
  "y_predicted = model.predict(x_test)"
 ] },
 { "cell_type": "code", "execution_count": null, "id": "0dff77b1", "metadata": {}, "outputs": [], "source": [
  "# Evaluate the predictions.\n",
  "# average='binary' scores the positive class only (assumes Survived is 0/1).\n",
  "# With average='micro' on a single-label problem, precision, recall and\n",
  "# F-score all collapse to plain accuracy and carry no extra information.\n",
  "precision, recall, fscore, support = precision_recall_fscore_support(\n",
  "    y_expected, y_predicted, average='binary'\n",
  ")\n",
  "\n",
  "print(f\"Precision: {precision}\")\n",
  "print(f\"Recall: {recall}\")\n",
  "print(f\"F-score: {fscore}\")\n",
  "\n",
  "# Mean accuracy of the model on the test set.\n",
  "score = model.score(x_test, y_expected)\n",
  "\n",
  "print(f\"Model score: {score}\")"
 ] }
], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.8.10" } }, "nbformat": 4, "nbformat_minor": 5 }