{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "eec59090",
   "metadata": {},
   "source": [
    "### Import potrzebnych bibliotek"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "ed6c43d8",
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import pandas as pd\n",
    "import sklearn\n",
    "\n",
    "from sklearn.linear_model import LogisticRegression\n",
    "from sklearn.preprocessing import StandardScaler\n",
    "\n",
    "from sklearn.linear_model import SGDClassifier\n",
    "\n",
    "from sklearn.naive_bayes import GaussianNB\n",
    "from sklearn.feature_extraction.text import TfidfVectorizer\n",
    "\n",
    "from sklearn.metrics import precision_recall_fscore_support\n",
    "from sklearn.metrics import accuracy_score\n",
    "\n",
    "import torch\n",
    "from torch import Tensor"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "fcd702cb",
   "metadata": {},
   "source": [
    "### Preprocessing danych"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "46133040",
   "metadata": {},
   "outputs": [],
   "source": [
    "def preprocess(data):\n",
    "    \"\"\"Rename, encode and clean the raw application records.\n",
    "\n",
    "    The caller's frame is left untouched; a new frame is returned.\n",
    "\n",
    "    Steps: rename the raw columns, encode the gender / car / realty flags as\n",
    "    0/1, coerce the child and family-member counts to numbers and blank out\n",
    "    outliers (more than 5 children, more than 8 family members), one-hot\n",
    "    encode income type and family status, then drop rows with missing values.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    data : pandas.DataFrame\n",
    "        Raw records with the upper-case columns named in the rename map below.\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    pandas.DataFrame\n",
    "        The cleaned, encoded records.\n",
    "    \"\"\"\n",
    "    # rename columns (rename() returns a new frame, so the input is not mutated)\n",
    "    data = data.rename(columns={\n",
    "        'CODE_GENDER': 'if_man', 'FLAG_OWN_CAR': 'if_own_car', 'FLAG_OWN_REALTY': 'if_own_realty',\n",
    "        'CNT_CHILDREN': 'cnt_children', 'AMT_INCOME_TOTAL': 'amt_income',\n",
    "        'NAME_EDUCATION_TYPE': 'name_edu_type', 'CNT_FAM_MEMBERS': 'cnt_fam_members',\n",
    "        'NAME_INCOME_TYPE': 'name_income_type', 'NAME_FAMILY_STATUS': 'name_fam_status'})\n",
    "\n",
    "    # replace data: flags become 0/1 (anything other than 'M' / 'Y' maps to 0)\n",
    "    data['if_man'] = data['if_man'].eq('M').astype('int64')\n",
    "    data['if_own_car'] = data['if_own_car'].eq('Y').astype('int64')\n",
    "    data['if_own_realty'] = data['if_own_realty'].eq('Y').astype('int64')\n",
    "\n",
    "    # counts: non-numeric values and outliers become NaN and are dropped below\n",
    "    # (np.nan rather than np.NaN: the NaN alias was removed in NumPy 2.0)\n",
    "    children = pd.to_numeric(data['cnt_children'], errors='coerce')\n",
    "    fam_members = pd.to_numeric(data['cnt_fam_members'], errors='coerce')\n",
    "    data['cnt_children'] = children.mask(children > 5, np.nan)\n",
    "    data['cnt_fam_members'] = fam_members.mask(fam_members > 8, np.nan)\n",
    "\n",
    "    # get dummies (income-type columns first, then family-status columns)\n",
    "    data = pd.get_dummies(data, columns=['name_income_type', 'name_fam_status'])\n",
    "\n",
    "    # dropna\n",
    "    print(\"Length of dataset before dropna: \" + str(len(data)))\n",
    "    data = data.dropna()\n",
    "    print(\"Length of dataset after dropna: \" + str(len(data)))\n",
    "\n",
    "    return data"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "db8f4662",
   "metadata": {},
   "source": [
    "### Podział na zbiór trenujący i testowy"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "e13ef021",
   "metadata": {},
   "outputs": [],
   "source": [
    "def split(data, train_fraction=0.8):\n",
    "    \"\"\"Split the rows, in their existing order, into a train and a test part.\n",
    "\n",
    "    No shuffling is done: the leading ``train_fraction`` of the rows becomes\n",
    "    the training set and the remaining rows the test set.\n",
    "\n",
    "    Raises\n",
    "    ------\n",
    "    ValueError\n",
    "        If ``train_fraction`` is outside the range [0, 1].\n",
    "    \"\"\"\n",
    "    if not 0 <= train_fraction <= 1:\n",
    "        raise ValueError(\"train_fraction must be between 0 and 1\")\n",
    "    split_point = int(train_fraction * len(data))\n",
    "    data_train = data[:split_point]\n",
    "    data_test = data[split_point:]\n",
    "    print(\"Length of whole dataset: \" + str(len(data)))\n",
    "    print(\"Length of train dataset: \" + str(len(data_train)))\n",
    "    print(\"Length of test dataset: \" + str(len(data_test)))\n",
    "    return data_train, data_test"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "c65e3f22",
   "metadata": {},
   "source": [
    "### Ewaluacja"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "374bc36c",
   "metadata": {},
   "outputs": [],
   "source": [
    "def evaluation(y_expected, y_predicted):\n",
    "    \"\"\"Print accuracy and support-weighted precision, recall and F-score.\"\"\"\n",
    "    # zero_division=0 keeps the score of never-predicted classes at 0.0 (the\n",
    "    # value scikit-learn falls back to anyway) without an UndefinedMetricWarning.\n",
    "    precision, recall, fscore, _ = precision_recall_fscore_support(\n",
    "        y_expected, y_predicted, average=\"weighted\", zero_division=0)\n",
    "    accuracy = accuracy_score(y_expected, y_predicted)\n",
    "    print(f\"Accuracy: {accuracy}\")\n",
    "    print(f\"Precision: {precision}\")\n",
    "    print(f\"Recall: {recall}\")\n",
    "    print(f\"F-score: {fscore}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "6a671fd4",
   "metadata": {},
   "source": [
    "### Wczytanie danych z pliku"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "4a42cd28",
   "metadata": {},
   "outputs": [],
   "source": [
    "alldata = pd.read_csv('application_record.csv', header=0, sep=',',\n",
    "                      usecols=['CODE_GENDER', 'FLAG_OWN_CAR', 'FLAG_OWN_REALTY', 'CNT_CHILDREN', 'AMT_INCOME_TOTAL', 
'NAME_EDUCATION_TYPE', 'CNT_FAM_MEMBERS', 'NAME_INCOME_TYPE', 'NAME_FAMILY_STATUS'])\n",
    "# print(alldata[:5])"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "6a384b9f",
   "metadata": {},
   "source": [
    "### Wybór cech do trenowania"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "c69232b2",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Model inputs: the renamed numeric columns plus the dummy columns that\n",
    "# preprocess() creates for income type and family status.\n",
    "FEATURES = [\n",
    "    'if_man',\n",
    "    'if_own_car',\n",
    "    'if_own_realty',\n",
    "    'cnt_children',\n",
    "    'amt_income',\n",
    "    'cnt_fam_members',\n",
    "    'name_income_type_Commercial associate',\n",
    "    'name_income_type_Pensioner',\n",
    "    'name_income_type_State servant',\n",
    "    'name_income_type_Student',\n",
    "    'name_income_type_Working',\n",
    "    'name_fam_status_Civil marriage',\n",
    "    'name_fam_status_Married',\n",
    "    'name_fam_status_Separated',\n",
    "    'name_fam_status_Single / not married',\n",
    "    'name_fam_status_Widow'\n",
    "]"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "144b9e70",
   "metadata": {},
   "source": [
    "### Przygotowanie danych"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "2ca32941",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Length of dataset before dropna: 438557\n",
      "Length of dataset after dropna: 438531\n",
      "Length of whole dataset: 438531\n",
      "Length of train dataset: 350824\n",
      "Length of test dataset: 87707\n"
     ]
    }
   ],
   "source": [
    "alldata = preprocess(alldata)\n",
    "data_train, data_test = split(alldata)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d146c21a",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Sanity check of the encoded values. This cell has to run after preprocess():\n",
    "# the lower-case column names only exist once the raw columns are renamed.\n",
    "# name_income_type and name_fam_status are dummy-encoded by then, so they are\n",
    "# no longer available as single columns.\n",
    "inspected_columns = ['if_man', 'if_own_car', 'if_own_realty', 'cnt_children',\n",
    "                     'name_edu_type', 'cnt_fam_members']\n",
    "for column in inspected_columns:\n",
    "    print(column, pd.unique(alldata[column]))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "f292f593",
   "metadata": {},
"outputs": [],
   "source": [
    "# Targets stay 1-D Series: scikit-learn estimators expect y of shape\n",
    "# (n_samples,), and a one-column DataFrame triggers DataConversionWarning.\n",
    "y_train = data_train['name_edu_type']\n",
    "y_expected = data_test['name_edu_type']\n",
    "\n",
    "# The scaler is fitted on the training features only and reused for the test\n",
    "# features, so no test-set statistics leak into training.\n",
    "x_train = data_train[FEATURES]\n",
    "scaler = StandardScaler().fit(x_train)\n",
    "x_train = scaler.transform(x_train)\n",
    "x_test = scaler.transform(data_test[FEATURES])"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "3af424e7",
   "metadata": {},
   "source": [
    "## Regresja logistyczna"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "a77aa29f",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "D:\\Programy\\anaconda3\\lib\\site-packages\\sklearn\\utils\\validation.py:63: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().\n",
      " return f(*args, **kwargs)\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Accuracy: 0.7031023749529685\n",
      "Precision: 0.6408866393822401\n",
      "Recall: 0.7031023749529685\n",
      "F-score: 0.6268976358430636\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "D:\\Programy\\anaconda3\\lib\\site-packages\\sklearn\\metrics\\_classification.py:1248: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. 
Use `zero_division` parameter to control this behavior.\n",
      " _warn_prf(average, modifier, msg_start, len(result))\n"
     ]
    }
   ],
   "source": [
    "# Multinomial logistic regression on the standardized features; fit() returns\n",
    "# the estimator itself, so construction and training are chained.\n",
    "model_logreg = LogisticRegression(max_iter=1000).fit(x_train, y_train)\n",
    "y_predicted_logreg = model_logreg.predict(x_test)\n",
    "\n",
    "evaluation(y_expected, y_predicted_logreg)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "e05aa676",
   "metadata": {},
   "source": [
    "## SGD"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "id": "31910f4a",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "D:\\Programy\\anaconda3\\lib\\site-packages\\sklearn\\utils\\validation.py:63: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().\n",
      " return f(*args, **kwargs)\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Accuracy: 0.6921340371920143\n",
      "Precision: 0.7333126153525842\n",
      "Recall: 0.6921340371920143\n",
      "F-score: 0.5666044690008586\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "D:\\Programy\\anaconda3\\lib\\site-packages\\sklearn\\metrics\\_classification.py:1248: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. 
Use `zero_division` parameter to control this behavior.\n",
      " _warn_prf(average, modifier, msg_start, len(result))\n"
     ]
    }
   ],
   "source": [
    "# random_state pins the shuffling done by SGD so the scores are reproducible.\n",
    "model_sgd = SGDClassifier(random_state=42)\n",
    "model_sgd.fit(x_train, y_train)\n",
    "\n",
    "y_predicted_sgd = model_sgd.predict(x_test)\n",
    "\n",
    "evaluation(y_expected, y_predicted_sgd)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "037c6132",
   "metadata": {},
   "source": [
    "## Gaussian Naive Bayes"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "b71d276a",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Disabled experiment, kept for reference. The prediction variable is named\n",
    "# y_predicted_gnb so that enabling the cell does not overwrite the SGD results.\n",
    "# model_gnb = GaussianNB()\n",
    "# model_gnb.fit(x_train, y_train)\n",
    "\n",
    "# y_predicted_gnb = model_gnb.predict(x_test)\n",
    "\n",
    "# evaluation(y_expected, y_predicted_gnb)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "2f41b9f3",
   "metadata": {},
   "source": [
    "## PyTorch"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "a4b83c7c",
   "metadata": {},
   "source": [
    "### Przygotowanie danych"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "id": "a578ed9d",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'Higher education': 0,\n",
       " 'Secondary / secondary special': 1,\n",
       " 'Incomplete higher': 2,\n",
       " 'Lower secondary': 3,\n",
       " 'Academic degree': 4}"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Select the features by name so the column order is guaranteed to match the\n",
    "# order the scaler was fitted on, instead of relying on whatever remains after\n",
    "# dropping the target column.\n",
    "# NOTE(review): the scaler was fitted on the first 80% of the rows, while the\n",
    "# PyTorch split below is random - confirm this mismatch is acceptable.\n",
    "X_numpy = scaler.transform(alldata[FEATURES])\n",
    "\n",
    "# Map every education level to an integer class index (order of appearance).\n",
    "target_map = {\n",
    "    val: index for index, val in enumerate(alldata.name_edu_type.unique())\n",
    "}\n",
    "y_numpy = alldata.name_edu_type.map(target_map).values\n",
    "X = torch.tensor(X_numpy, dtype=torch.float32)\n",
    "y = torch.tensor(y_numpy)\n",
    "\n",
    "target_map"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "72d58f4d",
   "metadata": {},
   "source": [
    "### One hot vectors"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "id": "8521b29d",
   "metadata": {},
   "outputs": [],
   "source": [
    "def one_hot_encode(vector, n_classes=None):\n",
    "    \"\"\"Return a (len(vector), n_classes) int64 one-hot matrix for class indices.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    vector : torch.Tensor\n",
    "        1-D tensor of non-negative class indices.\n",
    "    n_classes : int, optional\n",
    "        Width of the encoding. Defaults to ``max(vector) + 1``, so a batch in\n",
    "        which some class happens to be absent is still encoded correctly\n",
    "        (counting the distinct values would make the matrix too narrow and\n",
    "        scatter would fail with an out-of-range index).\n",
    "    \"\"\"\n",
    "    labels = vector.type(torch.LongTensor)\n",
    "    if n_classes is None:\n",
    "        n_classes = int(labels.max()) + 1 if labels.numel() > 0 else 0\n",
    "    one_hot = torch.zeros((labels.shape[0], n_classes), dtype=torch.long)\n",
    "    # Write a 1 into the column given by each row's label.\n",
    "    return one_hot.scatter(1, labels.unsqueeze(1), 1)\n",
    "\n",
    "y_one_hot = one_hot_encode(y, n_classes=len(target_map))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "id": "2adc5f1d",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "438531\n"
     ]
    }
   ],
   "source": [
    "# Seed the global torch RNG so the shuffled split (and the weight\n",
    "# initialisation that follows) is reproducible across runs.\n",
    "torch.manual_seed(42)\n",
    "random_indices = torch.randperm(X.shape[0])\n",
    "print(X.shape[0])\n",
    "n_train = int(0.8 * X.shape[0])\n",
    "train_indices = random_indices[:n_train]\n",
    "test_indices = random_indices[n_train:]\n",
    "\n",
    "# Note: y_train is rebound here from the scikit-learn targets to a tensor of\n",
    "# class indices for the PyTorch model.\n",
    "X_train = X[train_indices]\n",
    "y_train = y[train_indices]\n",
    "y_train_one_hot = y_one_hot[train_indices]\n",
    "\n",
    "X_test = X[test_indices]\n",
    "y_test = y[test_indices]\n",
    "y_test_one_hot = y_one_hot[test_indices]"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "2992b275",
   "metadata": {},
   "source": [
    "### Model"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "id": "a0820cfa",
   "metadata": {},
   "outputs": [],
   "source": [
    "# A single linear layer producing one logit per class; the layer sizes are\n",
    "# derived from the data instead of being hard-coded.\n",
    "model_pytorch = torch.nn.Sequential(\n",
    "    torch.nn.Linear(X.shape[1], len(target_map))\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "7d177d27",
   "metadata": {},
   "source": [
    "### Optymalizator"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "id": "1858b5d1",
   "metadata": {},
   "outputs": [],
   "source": [
    "learning_rate = 0.1\n",
    "lambda_param = 0.01  # L2 penalty, passed to SGD as weight_decay\n",
    "optimizer = torch.optim.SGD(\n",
    "    model_pytorch.parameters(),\n",
    "    lr=learning_rate,\n",
    "    weight_decay=lambda_param\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "ca95791b",
   "metadata": {},
   "source": [
    "### Funkcja straty"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "id": "16f5c2d1",
   "metadata": {},
   "outputs": [],
   "source": [
    "# CrossEntropyLoss takes raw logits and integer class indices.\n",
    "loss_function = torch.nn.CrossEntropyLoss()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "ebe3feff",
   "metadata": {},
   "source": [
    "### Trenowanie"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "id": "9b2115e9",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Loss at iteration 1: 1.6335363388061523\n",
      "Loss at iteration 100: 0.8125142455101013\n",
"Loss at iteration 200: 0.7701064944267273\n", "Loss at iteration 300: 0.75752854347229\n", "Loss at iteration 400: 0.7520564198493958\n", "Loss at iteration 500: 0.7492005228996277\n", "Loss at iteration 600: 0.7475774884223938\n", "Loss at iteration 700: 0.7465949058532715\n", "Loss at iteration 800: 0.7459684014320374\n", "Loss at iteration 900: 0.7455567717552185\n", "Loss at iteration 1000: 0.7452787160873413\n", "\n", "Final Test Accuracy: 0.7014605447683765\n" ] } ], "source": [ "n_iterations = 1000\n", "for i in range(1, n_iterations + 1):\n", " Z = model_pytorch(X_train) # 1\n", " loss = loss_function(Z, y_train) # 2\n", " optimizer.zero_grad() # 3\n", " loss.backward() # 4\n", " optimizer.step() # 5\n", " \n", " if i == 1 or i % 100 == 0:\n", " print(\"Loss at iteration {}: {}\".format(i, loss))\n", "\n", "test_predictions = torch.argmax(\n", " torch.softmax(model_pytorch(X_test), 1), axis=1 # 6\n", ")\n", "test_accuracy = float(sum(test_predictions == y_test)) / y_test.shape[0]\n", "print(\"\\nFinal Test Accuracy: {}\".format(test_accuracy))" ] }, { "cell_type": "markdown", "id": "c91a0446", "metadata": {}, "source": [ "### Ewaluacja" ] }, { "cell_type": "code", "execution_count": 18, "id": "d394a74e", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Accuracy: 0.7014605447683765\n", "Precision: 0.6413920853293256\n", "Recall: 0.7014605447683765\n", "F-score: 0.6264027678306182\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "D:\\Programy\\anaconda3\\lib\\site-packages\\sklearn\\metrics\\_classification.py:1248: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. 
Use `zero_division` parameter to control this behavior.\n",
      " _warn_prf(average, modifier, msg_start, len(result))\n"
     ]
    }
   ],
   "source": [
    "# Score the PyTorch model with the same report used for the scikit-learn models.\n",
    "evaluation(y_expected=y_test, y_predicted=test_predictions)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.7"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}