{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 139,
   "id": "02249c82",
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import pandas as pd\n",
    "from sklearn.impute import SimpleImputer\n",
    "from sklearn.feature_extraction.text import TfidfVectorizer\n",
    "\n",
    "from sklearn.linear_model import LogisticRegression\n",
    "from sklearn.model_selection import train_test_split\n",
    "from sklearn.metrics import precision_recall_fscore_support"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 140,
   "id": "4be6437d",
   "metadata": {},
   "outputs": [],
   "source": [
    "def tfidf_row_sums(text_column):\n",
    "    \"\"\"Fit a TF-IDF model on `text_column` and return each row's summed weights.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    text_column : iterable of str\n",
    "        One document per row; must not contain missing values.\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    numpy.ndarray\n",
    "        1-D array with one summed TF-IDF weight per row of `text_column`.\n",
    "    \"\"\"\n",
    "    # A fresh vectorizer per call, so columns never share a vocabulary.\n",
    "    weights = TfidfVectorizer().fit_transform(text_column).toarray()\n",
    "    # Summing along axis 1 yields one value per input row.\n",
    "    return weights.sum(axis=1)\n",
    "\n",
    "\n",
    "data = pd.read_csv('titanic.tsv', sep='\\t')\n",
    "\n",
    "# Data formatting: encode Sex as 1 for 'male' and 0 for everything else.\n",
    "data['Sex'] = (data['Sex'] == 'male').astype(int)\n",
    "\n",
    "# Flag passengers whose name contains the literal title 'Mr.'\n",
    "# (regex=False so the dot is matched literally, not as a wildcard).\n",
    "data['Name_to_num'] = data['Name'].str.contains('Mr.', regex=False).astype(int)\n",
    "data = data.drop(columns=['Name'])\n",
    "\n",
    "# Collapse each free-text column into a single numeric feature;\n",
    "# missing entries are replaced by the placeholder token 'Undefined' first.\n",
    "for text_col in ('Cabin', 'Ticket'):\n",
    "    data[text_col] = tfidf_row_sums(data[text_col].fillna('Undefined'))\n",
    "\n",
    "# One-hot encode Embarked; missing values get their own 'Undefined' column.\n",
    "data['Embarked'] = data['Embarked'].fillna('Undefined')\n",
    "data = pd.get_dummies(data, columns=['Embarked'])\n",
    "\n",
    "# Fill missing ages with the median of the observed ages.\n",
    "imputer = SimpleImputer(missing_values=np.nan, strategy='median')\n",
    "data[['Age']] = imputer.fit_transform(data[['Age']])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 141,
   "id": "618e8841",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [ "
\n", " | Survived | \n", "PassengerId | \n", "Pclass | \n", "Sex | \n", "Age | \n", "SibSp | \n", "Parch | \n", "Ticket | \n", "Fare | \n", "Cabin | \n", "Name_to_num | \n", "Embarked_C | \n", "Embarked_Q | \n", "Embarked_S | \n", "Embarked_Undefined | \n", "
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | \n", "0 | \n", "530 | \n", "2 | \n", "1 | \n", "23.0 | \n", "2 | \n", "1 | \n", "1.000000 | \n", "11.5000 | \n", "1.0 | \n", "1 | \n", "0 | \n", "0 | \n", "1 | \n", "0 | \n", "
1 | \n", "0 | \n", "466 | \n", "3 | \n", "1 | \n", "38.0 | \n", "0 | \n", "0 | \n", "1.391284 | \n", "7.0500 | \n", "1.0 | \n", "1 | \n", "0 | \n", "0 | \n", "1 | \n", "0 | \n", "
2 | \n", "0 | \n", "753 | \n", "3 | \n", "1 | \n", "33.0 | \n", "0 | \n", "0 | \n", "1.000000 | \n", "9.5000 | \n", "1.0 | \n", "1 | \n", "0 | \n", "0 | \n", "1 | \n", "0 | \n", "
3 | \n", "0 | \n", "855 | \n", "2 | \n", "0 | \n", "44.0 | \n", "1 | \n", "0 | \n", "1.000000 | \n", "26.0000 | \n", "1.0 | \n", "0 | \n", "0 | \n", "0 | \n", "1 | \n", "0 | \n", "
4 | \n", "0 | \n", "333 | \n", "1 | \n", "1 | \n", "38.0 | \n", "0 | \n", "1 | \n", "1.365721 | \n", "153.4625 | \n", "1.0 | \n", "1 | \n", "0 | \n", "0 | \n", "1 | \n", "0 | \n", "