uczenie_maszynowe_zadania/cw_5/main.ipynb
2023-07-04 20:42:14 +02:00

317 lines
9.2 KiB
Plaintext

{
"cells": [
{
"cell_type": "code",
"execution_count": 139,
"id": "02249c82",
"metadata": {},
"outputs": [],
"source": [
"import numpy as np\n",
"import pandas as pd\n",
"from sklearn.impute import SimpleImputer\n",
"from sklearn.feature_extraction.text import TfidfVectorizer\n",
"\n",
"from sklearn.linear_model import LogisticRegression\n",
"from sklearn.model_selection import train_test_split\n",
"from sklearn.metrics import precision_recall_fscore_support"
]
},
{
"cell_type": "code",
"execution_count": 140,
"id": "4be6437d",
"metadata": {},
"outputs": [],
"source": [
"def tfidf_row_sum(texts):\n",
"    \"\"\"Return, for each string in `texts`, the sum of its TF-IDF term weights.\n",
"\n",
"    A fresh TfidfVectorizer is fitted on `texts` itself, so the weights depend\n",
"    on the whole column that is passed in.\n",
"    \"\"\"\n",
"    weights = TfidfVectorizer().fit_transform(texts)\n",
"    # One scalar per row, computed in a single vectorized reduction.\n",
"    return weights.toarray().sum(axis=1)\n",
"\n",
"\n",
"# Load the tab-separated passenger table.\n",
"data = pd.read_csv('titanic.tsv', sep='\\t')\n",
"\n",
"# Data formatting: turn categorical / text columns into numbers.\n",
"data['Sex'] = (data['Sex'] == 'male').astype(int)\n",
"\n",
"# 1 when the name contains the literal substring 'Mr.', otherwise 0.\n",
"data['Name_to_num'] = (\n",
"    data['Name'].str.contains('Mr.', regex=False, na=False).astype(int)\n",
")\n",
"data = data.drop(columns=['Name'])\n",
"\n",
"# Missing cabins become the token 'Undefined' before vectorizing.\n",
"# NOTE(review): TF-IDF is fitted on the full table, before the train/test split.\n",
"data['Cabin'] = tfidf_row_sum(data['Cabin'].fillna('Undefined'))\n",
"\n",
"# One-hot encode the port of embarkation; missing values get their own column.\n",
"# dtype=int keeps 0/1 columns regardless of the installed pandas version.\n",
"data['Embarked'] = data['Embarked'].fillna('Undefined')\n",
"data = pd.get_dummies(data, columns=['Embarked'], dtype=int)\n",
"\n",
"# Fill missing ages with the median age.\n",
"# NOTE(review): the median is computed on the full table, i.e. before the\n",
"# train/test split, so test rows influence the imputed value.\n",
"imputer = SimpleImputer(missing_values=np.nan, strategy='median')\n",
"data[['Age']] = imputer.fit_transform(data[['Age']])\n",
"\n",
"# Same text-to-number reduction for the ticket identifier.\n",
"data['Ticket'] = tfidf_row_sum(data['Ticket'].fillna('Undefined'))"
]
},
{
"cell_type": "code",
"execution_count": 141,
"id": "618e8841",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Survived</th>\n",
" <th>PassengerId</th>\n",
" <th>Pclass</th>\n",
" <th>Sex</th>\n",
" <th>Age</th>\n",
" <th>SibSp</th>\n",
" <th>Parch</th>\n",
" <th>Ticket</th>\n",
" <th>Fare</th>\n",
" <th>Cabin</th>\n",
" <th>Name_to_num</th>\n",
" <th>Embarked_C</th>\n",
" <th>Embarked_Q</th>\n",
" <th>Embarked_S</th>\n",
" <th>Embarked_Undefined</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>0</td>\n",
" <td>530</td>\n",
" <td>2</td>\n",
" <td>1</td>\n",
" <td>23.0</td>\n",
" <td>2</td>\n",
" <td>1</td>\n",
" <td>1.000000</td>\n",
" <td>11.5000</td>\n",
" <td>1.0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>0</td>\n",
" <td>466</td>\n",
" <td>3</td>\n",
" <td>1</td>\n",
" <td>38.0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1.391284</td>\n",
" <td>7.0500</td>\n",
" <td>1.0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>0</td>\n",
" <td>753</td>\n",
" <td>3</td>\n",
" <td>1</td>\n",
" <td>33.0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1.000000</td>\n",
" <td>9.5000</td>\n",
" <td>1.0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>0</td>\n",
" <td>855</td>\n",
" <td>2</td>\n",
" <td>0</td>\n",
" <td>44.0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>1.000000</td>\n",
" <td>26.0000</td>\n",
" <td>1.0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>0</td>\n",
" <td>333</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>38.0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>1.365721</td>\n",
" <td>153.4625</td>\n",
" <td>1.0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" Survived PassengerId Pclass Sex Age SibSp Parch Ticket Fare \\\n",
"0 0 530 2 1 23.0 2 1 1.000000 11.5000 \n",
"1 0 466 3 1 38.0 0 0 1.391284 7.0500 \n",
"2 0 753 3 1 33.0 0 0 1.000000 9.5000 \n",
"3 0 855 2 0 44.0 1 0 1.000000 26.0000 \n",
"4 0 333 1 1 38.0 0 1 1.365721 153.4625 \n",
"\n",
" Cabin Name_to_num Embarked_C Embarked_Q Embarked_S Embarked_Undefined \n",
"0 1.0 1 0 0 1 0 \n",
"1 1.0 1 0 0 1 0 \n",
"2 1.0 1 0 0 1 0 \n",
"3 1.0 0 0 0 1 0 \n",
"4 1.0 1 0 0 1 0 "
]
},
"execution_count": 141,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"data.head()"
]
},
{
"cell_type": "code",
"execution_count": 142,
"id": "7134ea55",
"metadata": {},
"outputs": [],
"source": [
"# Split the data into a training set and a test set.\n",
"# A fixed seed makes the split, and every metric computed from it,\n",
"# reproducible across kernel restarts.\n",
"RANDOM_STATE = 42\n",
"data_train, data_test = train_test_split(\n",
"    data, test_size=0.2, random_state=RANDOM_STATE\n",
")\n",
"\n",
"# Feature definition: the columns fed to the model.\n",
"FEATURES = ['Sex', 'Age', 'Embarked_C', 'Embarked_Q', 'Embarked_S']\n",
"x_train = data_train[FEATURES]\n",
"# Double brackets keep the target as a one-column DataFrame.\n",
"y_train = data_train[['Survived']]\n",
"model = LogisticRegression()"
]
},
{
"cell_type": "code",
"execution_count": 143,
"id": "1ff85122",
"metadata": {},
"outputs": [],
"source": [
"# Train the model; the one-column target is flattened to a 1-D label array.\n",
"model.fit(x_train, y_train.to_numpy().ravel())\n",
"\n",
"# Predict the outcome for the held-out test rows.\n",
"x_test = data_test[FEATURES]\n",
"y_expected = data_test[['Survived']]\n",
"y_predicted = model.predict(x_test)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "0dff77b1",
"metadata": {},
"outputs": [],
"source": [
"# Evaluation of the predictions on the test set.\n",
"# average='binary' scores the positive class (Survived == 1). With\n",
"# average='micro', precision, recall and F-score all collapse to plain\n",
"# accuracy on a single-label problem, so they would carry no extra information.\n",
"y_true = np.ravel(y_expected)  # 1-D labels, whether given a Series or a DataFrame\n",
"precision, recall, fscore, support = precision_recall_fscore_support(\n",
"    y_true, y_predicted, average='binary'\n",
")\n",
"\n",
"print(f\"Precision: {precision}\")\n",
"print(f\"Recall: {recall}\")\n",
"print(f\"F-score: {fscore}\")\n",
"\n",
"# Mean accuracy of the classifier on the test rows.\n",
"score = model.score(x_test, y_true)\n",
"\n",
"print(f\"Model score: {score}\")"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.10"
}
},
"nbformat": 4,
"nbformat_minor": 5
}