uma_s478839/.ipynb_checkpoints/run-checkpoint.ipynb

{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "eec59090",
   "metadata": {},
   "source": [
    "### Import potrzebnych bibliotek"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "ed6c43d8",
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import pandas as pd\n",
    "import sklearn\n",
    "\n",
    "from sklearn.linear_model import LogisticRegression\n",
    "from sklearn.preprocessing import StandardScaler\n",
    "\n",
    "from sklearn.linear_model import SGDClassifier\n",
    "\n",
    "from sklearn.naive_bayes import GaussianNB\n",
    "from sklearn.feature_extraction.text import TfidfVectorizer\n",
    "\n",
    "from sklearn.metrics import precision_recall_fscore_support\n",
    "from sklearn.metrics import accuracy_score\n",
    "\n",
    "import torch\n",
    "from torch import Tensor"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "fcd702cb",
   "metadata": {},
   "source": [
    "### Preprocessing danych"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "46133040",
   "metadata": {},
   "outputs": [],
   "source": [
    "def preprocess(data):\n",
    "    \"\"\"Clean the raw application records and one-hot encode the categorical columns.\n",
    "\n",
    "    The frame passed in is left untouched; all changes are made on a renamed copy.\n",
    "    The row count before and after dropping incomplete rows is printed.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    data : pandas.DataFrame\n",
    "        Raw records with the CODE_GENDER, FLAG_OWN_CAR, FLAG_OWN_REALTY,\n",
    "        CNT_CHILDREN, CNT_FAM_MEMBERS, NAME_INCOME_TYPE and NAME_FAMILY_STATUS\n",
    "        columns; AMT_INCOME_TOTAL and NAME_EDUCATION_TYPE are only renamed.\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    pandas.DataFrame\n",
    "        Renamed and encoded frame without any row that contains a missing value.\n",
    "    \"\"\"\n",
    "    # rename() returns a new frame, so the caller's DataFrame is never modified\n",
    "    data = data.rename(columns={\n",
    "        'CODE_GENDER': 'if_man',\n",
    "        'FLAG_OWN_CAR': 'if_own_car',\n",
    "        'FLAG_OWN_REALTY': 'if_own_realty',\n",
    "        'CNT_CHILDREN': 'cnt_children',\n",
    "        'AMT_INCOME_TOTAL': 'amt_income',\n",
    "        'NAME_EDUCATION_TYPE': 'name_edu_type',\n",
    "        'CNT_FAM_MEMBERS': 'cnt_fam_members',\n",
    "        'NAME_INCOME_TYPE': 'name_income_type',\n",
    "        'NAME_FAMILY_STATUS': 'name_fam_status',\n",
    "    })\n",
    "\n",
    "    # binary flags: 1 for the positive marker ('M' / 'Y'), 0 for anything else\n",
    "    for column, positive in (('if_man', 'M'), ('if_own_car', 'Y'), ('if_own_realty', 'Y')):\n",
    "        data[column] = (data[column] == positive).astype('int64')\n",
    "\n",
    "    # counts: coerce to numbers and blank out values above the accepted maximum,\n",
    "    # so that the dropna() below removes those rows\n",
    "    for column, maximum in (('cnt_children', 5), ('cnt_fam_members', 8)):\n",
    "        counts = pd.to_numeric(data[column], errors='coerce')\n",
    "        data[column] = counts.where(counts <= maximum)\n",
    "\n",
    "    # one-hot encode the categorical columns (income dummies first, then family status)\n",
    "    data = pd.get_dummies(data, columns=['name_income_type', 'name_fam_status'])\n",
    "\n",
    "    # drop every row that still contains a missing value\n",
    "    print(\"Length of dataset before dropna: \" + str(len(data)))\n",
    "    data = data.dropna()\n",
    "    print(\"Length of dataset after dropna: \" + str(len(data)))\n",
    "\n",
    "    return data"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "db8f4662",
   "metadata": {},
   "source": [
    "### Podział na zbiór trenujący i testowy"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "e13ef021",
   "metadata": {},
   "outputs": [],
   "source": [
    "def split(data, train_fraction=0.8):\n",
    "    \"\"\"Split data sequentially into a leading train part and a trailing test part.\n",
    "\n",
    "    Rows are not shuffled: the first train_fraction of the rows form the training\n",
    "    set and the remaining rows form the test set. The three lengths are printed.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    data : sliceable collection (a pandas.DataFrame in this notebook)\n",
    "    train_fraction : float, default 0.8\n",
    "        Share of rows assigned to the training set; must lie in [0, 1].\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    tuple\n",
    "        (data_train, data_test)\n",
    "\n",
    "    Raises\n",
    "    ------\n",
    "    ValueError\n",
    "        If train_fraction is outside [0, 1].\n",
    "    \"\"\"\n",
    "    if not 0 <= train_fraction <= 1:\n",
    "        raise ValueError(\"train_fraction must be between 0 and 1, got {}\".format(train_fraction))\n",
    "\n",
    "    split_point = int(train_fraction * len(data))\n",
    "    data_train, data_test = data[:split_point], data[split_point:]\n",
    "    print(\"Length of whole dataset: \" + str(len(data)))\n",
    "    print(\"Length of train dataset: \" + str(len(data_train)))\n",
    "    print(\"Length of test dataset: \" + str(len(data_test)))\n",
    "    return data_train, data_test"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "c65e3f22",
   "metadata": {},
   "source": [
    "### Ewaluacja"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "374bc36c",
   "metadata": {},
   "outputs": [],
   "source": [
    "def evaluation(y_expected, y_predicted):\n",
    "    \"\"\"Print accuracy and support-weighted precision, recall and F-score.\n",
    "\n",
    "    zero_division=0 keeps the 0.0 score that scikit-learn assigns to labels\n",
    "    without any predicted samples, but without raising UndefinedMetricWarning\n",
    "    on every call.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    y_expected : array-like\n",
    "        Ground-truth labels.\n",
    "    y_predicted : array-like\n",
    "        Labels predicted by a model, in the same order as y_expected.\n",
    "    \"\"\"\n",
    "    precision, recall, fscore, _ = precision_recall_fscore_support(\n",
    "        y_expected, y_predicted, average=\"weighted\", zero_division=0\n",
    "    )\n",
    "    accuracy = accuracy_score(y_expected, y_predicted)\n",
    "    print(f\"Accuracy: {accuracy}\")\n",
    "    print(f\"Precision: {precision}\")\n",
    "    print(f\"Recall: {recall}\")\n",
    "    print(f\"F-score: {fscore}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "6a671fd4",
   "metadata": {},
   "source": [
    "### Wczytanie danych z pliku"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "4a42cd28",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Load only the columns that the preprocessing step and the models use.\n",
    "alldata = pd.read_csv(\n",
    "    'application_record.csv',\n",
    "    header=0,\n",
    "    sep=',',\n",
    "    usecols=[\n",
    "        'CODE_GENDER',\n",
    "        'FLAG_OWN_CAR',\n",
    "        'FLAG_OWN_REALTY',\n",
    "        'CNT_CHILDREN',\n",
    "        'AMT_INCOME_TOTAL',\n",
    "        'NAME_EDUCATION_TYPE',\n",
    "        'CNT_FAM_MEMBERS',\n",
    "        'NAME_INCOME_TYPE',\n",
    "        'NAME_FAMILY_STATUS',\n",
    "    ],\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "6a384b9f",
   "metadata": {},
   "source": [
    "### Wybór cech do trenowania"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "c69232b2",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Model inputs: the binary/numeric columns followed by the one-hot columns that\n",
    "# preprocess() creates for the income-type and family-status categories.\n",
    "FEATURES = [\n",
    "    'if_man', 'if_own_car', 'if_own_realty',\n",
    "    'cnt_children', 'amt_income', 'cnt_fam_members',\n",
    "    *(f'name_income_type_{kind}' for kind in (\n",
    "        'Commercial associate', 'Pensioner', 'State servant', 'Student', 'Working')),\n",
    "    *(f'name_fam_status_{status}' for status in (\n",
    "        'Civil marriage', 'Married', 'Separated', 'Single / not married', 'Widow')),\n",
    "]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d146c21a",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Inspect the distinct raw values of each column. This cell runs before\n",
    "# preprocess() renames the columns, so the original CSV headers are used;\n",
    "# the renamed names (if_man, ...) do not exist yet and would raise KeyError.\n",
    "for column in [\n",
    "    'CODE_GENDER',\n",
    "    'FLAG_OWN_CAR',\n",
    "    'FLAG_OWN_REALTY',\n",
    "    'CNT_CHILDREN',\n",
    "    'NAME_EDUCATION_TYPE',\n",
    "    'CNT_FAM_MEMBERS',\n",
    "]:\n",
    "    print(pd.unique(alldata[column]))"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "144b9e70",
   "metadata": {},
   "source": [
    "### Przygotowanie danych"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "2ca32941",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Length of dataset before dropna: 438557\n",
      "Length of dataset after dropna: 438531\n",
      "Length of whole dataset: 438531\n",
      "Length of train dataset: 350824\n",
      "Length of test dataset: 87707\n"
     ]
    }
   ],
   "source": [
    "# Not idempotent: `alldata` is rebound to the preprocessed frame, so run the\n",
    "# CSV-loading cell again before re-executing this cell.\n",
    "alldata = preprocess(alldata)\n",
    "data_train, data_test = split(alldata)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "f292f593",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Targets stay 1-D Series: wrapping them in a single-column DataFrame makes the\n",
    "# scikit-learn estimators emit DataConversionWarning when fitting.\n",
    "y_train = data_train['name_edu_type']\n",
    "y_expected = data_test['name_edu_type']\n",
    "\n",
    "# Fit the scaler on the training features only, then apply it to both parts.\n",
    "scaler = StandardScaler().fit(data_train[FEATURES])\n",
    "x_train = scaler.transform(data_train[FEATURES])\n",
    "x_test = scaler.transform(data_test[FEATURES])"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "3af424e7",
   "metadata": {},
   "source": [
    "## Regresja logistyczna"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "a77aa29f",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "D:\\Programy\\anaconda3\\lib\\site-packages\\sklearn\\utils\\validation.py:63: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().\n",
      "  return f(*args, **kwargs)\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Accuracy: 0.7031023749529685\n",
      "Precision: 0.6408866393822401\n",
      "Recall: 0.7031023749529685\n",
      "F-score: 0.6268976358430636\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "D:\\Programy\\anaconda3\\lib\\site-packages\\sklearn\\metrics\\_classification.py:1248: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.\n",
      "  _warn_prf(average, modifier, msg_start, len(result))\n"
     ]
    }
   ],
   "source": [
    "# Logistic-regression baseline trained on the standardised features.\n",
    "model_logreg = LogisticRegression(max_iter=1000).fit(x_train, y_train)\n",
    "y_predicted_logreg = model_logreg.predict(x_test)\n",
    "\n",
    "evaluation(y_expected, y_predicted_logreg)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "e05aa676",
   "metadata": {},
   "source": [
    "## SGD"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "id": "31910f4a",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "D:\\Programy\\anaconda3\\lib\\site-packages\\sklearn\\utils\\validation.py:63: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().\n",
      "  return f(*args, **kwargs)\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Accuracy: 0.6921340371920143\n",
      "Precision: 0.7333126153525842\n",
      "Recall: 0.6921340371920143\n",
      "F-score: 0.5666044690008586\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "D:\\Programy\\anaconda3\\lib\\site-packages\\sklearn\\metrics\\_classification.py:1248: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.\n",
      "  _warn_prf(average, modifier, msg_start, len(result))\n"
     ]
    }
   ],
   "source": [
    "# SGDClassifier shuffles the training data between epochs; a fixed random_state\n",
    "# makes the fitted model and the reported metrics reproducible across runs.\n",
    "model_sgd = SGDClassifier(random_state=42)\n",
    "model_sgd.fit(x_train, y_train)\n",
    "\n",
    "y_predicted_sgd = model_sgd.predict(x_test)\n",
    "\n",
    "evaluation(y_expected, y_predicted_sgd)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "037c6132",
   "metadata": {},
   "source": [
    "## Gaussian Naive Bayes"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "b71d276a",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Disabled experiment: Gaussian Naive Bayes baseline (kept for reference, not executed).\n",
    "# When re-enabling, keep the predictions in their own variable so the SGD\n",
    "# predictions are not overwritten.\n",
    "# model_gnb = GaussianNB()\n",
    "# model_gnb.fit(x_train, y_train)\n",
    "\n",
    "# y_predicted_gnb = model_gnb.predict(x_test)\n",
    "\n",
    "# evaluation(y_expected, y_predicted_gnb)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "2f41b9f3",
   "metadata": {},
   "source": [
    "## PyTorch"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "a4b83c7c",
   "metadata": {},
   "source": [
    "### Przygotowanie danych"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "id": "a578ed9d",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'Higher education': 0,\n",
       " 'Secondary / secondary special': 1,\n",
       " 'Incomplete higher': 2,\n",
       " 'Lower secondary': 3,\n",
       " 'Academic degree': 4}"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Select the features by name so their order is guaranteed to match the order\n",
    "# the scaler was fitted on, instead of relying on whatever column order is left\n",
    "# after dropping the target.\n",
    "X_numpy = scaler.transform(alldata[FEATURES])\n",
    "\n",
    "# Map each education label to an integer class id (in order of first appearance).\n",
    "target_map = {\n",
    "    val: index for index, val in enumerate(alldata.name_edu_type.unique())\n",
    "}\n",
    "y_numpy = alldata.name_edu_type.map(target_map).values\n",
    "X = torch.tensor(X_numpy, dtype=torch.float32)\n",
    "y = torch.tensor(y_numpy)\n",
    "\n",
    "target_map"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "72d58f4d",
   "metadata": {},
   "source": [
    "### One hot vectors"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "id": "8521b29d",
   "metadata": {},
   "outputs": [],
   "source": [
    "def one_hot_encode(vector, n_classes=None):\n",
    "    \"\"\"Return the one-hot encoding of a 1-D tensor of integer class ids.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    vector : torch.Tensor\n",
    "        1-D tensor of non-negative class ids.\n",
    "    n_classes : int, optional\n",
    "        Width of the encoding. Defaults to the largest id + 1, so ids that do\n",
    "        not occur in vector (e.g. [0, 2]) still get a column instead of making\n",
    "        scatter() fail with an out-of-range index.\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    torch.Tensor\n",
    "        int64 tensor of shape (len(vector), n_classes) with a single 1 per row.\n",
    "    \"\"\"\n",
    "    labels = vector.type(torch.LongTensor)\n",
    "    if n_classes is None:\n",
    "        # an empty input has no maximum; encode it as zero columns\n",
    "        n_classes = int(labels.max()) + 1 if labels.numel() > 0 else 0\n",
    "    one_hot = torch.zeros((labels.shape[0], n_classes), dtype=torch.long)\n",
    "    return one_hot.scatter(1, labels.unsqueeze(1), 1)\n",
    "\n",
    "y_one_hot = one_hot_encode(y)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "id": "2adc5f1d",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "438531\n"
     ]
    }
   ],
   "source": [
    "# Shuffle with a dedicated, seeded generator so the train/test split is the\n",
    "# same on every run.\n",
    "split_generator = torch.Generator().manual_seed(42)\n",
    "random_indices = torch.randperm(X.shape[0], generator=split_generator)\n",
    "print(X.shape[0])\n",
    "n_train = int(0.8 * X.shape[0])\n",
    "train_indices = random_indices[:n_train]\n",
    "test_indices = random_indices[n_train:]\n",
    "\n",
    "# NOTE: y_train is rebound here to a tensor of class ids; it previously held the\n",
    "# label column used by the scikit-learn models.\n",
    "X_train = X[train_indices]\n",
    "y_train = y[train_indices]\n",
    "y_train_one_hot = y_one_hot[train_indices]\n",
    "\n",
    "X_test = X[test_indices]\n",
    "y_test = y[test_indices]\n",
    "y_test_one_hot = y_one_hot[test_indices]"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "2992b275",
   "metadata": {},
   "source": [
    "### Model"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "id": "a0820cfa",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Seed the global RNG so the initial layer weights are reproducible.\n",
    "torch.manual_seed(42)\n",
    "\n",
    "# Single linear layer sized from the data (number of feature columns in X,\n",
    "# number of classes in target_map) instead of hard-coded dimensions.\n",
    "model_pytorch = torch.nn.Sequential(\n",
    "    torch.nn.Linear(X.shape[1], len(target_map))\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "7d177d27",
   "metadata": {},
   "source": [
    "### Optymalizator"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "id": "1858b5d1",
   "metadata": {},
   "outputs": [],
   "source": [
    "# SGD over all model parameters; weight_decay is set from lambda_param.\n",
    "lambda_param = 0.01\n",
    "learning_rate = 0.1\n",
    "optimizer = torch.optim.SGD(\n",
    "    params=model_pytorch.parameters(),\n",
    "    lr=learning_rate,\n",
    "    weight_decay=lambda_param,\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "ca95791b",
   "metadata": {},
   "source": [
    "### Funkcja straty"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "id": "16f5c2d1",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Loss applied in the training loop to the model outputs Z and the integer\n",
    "# class ids in y_train.\n",
    "loss_function = torch.nn.CrossEntropyLoss()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "ebe3feff",
   "metadata": {},
   "source": [
    "### Trenowanie"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "id": "9b2115e9",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Loss at iteration 1: 1.6335363388061523\n",
      "Loss at iteration 100: 0.8125142455101013\n",
      "Loss at iteration 200: 0.7701064944267273\n",
      "Loss at iteration 300: 0.75752854347229\n",
      "Loss at iteration 400: 0.7520564198493958\n",
      "Loss at iteration 500: 0.7492005228996277\n",
      "Loss at iteration 600: 0.7475774884223938\n",
      "Loss at iteration 700: 0.7465949058532715\n",
      "Loss at iteration 800: 0.7459684014320374\n",
      "Loss at iteration 900: 0.7455567717552185\n",
      "Loss at iteration 1000: 0.7452787160873413\n",
      "\n",
      "Final Test Accuracy: 0.7014605447683765\n"
     ]
    }
   ],
   "source": [
    "n_iterations = 1000\n",
    "for i in range(1, n_iterations + 1):\n",
    "    Z = model_pytorch(X_train)  # forward pass on the whole training set\n",
    "    loss = loss_function(Z, y_train)\n",
    "    optimizer.zero_grad()  # clear the gradients left by the previous step\n",
    "    loss.backward()\n",
    "    optimizer.step()\n",
    "\n",
    "    if i == 1 or i % 100 == 0:\n",
    "        print(\"Loss at iteration {}: {}\".format(i, loss.item()))\n",
    "\n",
    "# Inference needs no gradient tracking. argmax over the raw outputs selects the\n",
    "# same class as argmax over their softmax, so the softmax is skipped.\n",
    "with torch.no_grad():\n",
    "    test_predictions = torch.argmax(model_pytorch(X_test), dim=1)\n",
    "\n",
    "# Tensor reduction instead of the built-in sum(), which iterates over the\n",
    "# tensor one element at a time.\n",
    "test_accuracy = (test_predictions == y_test).sum().item() / y_test.shape[0]\n",
    "print(\"\\nFinal Test Accuracy: {}\".format(test_accuracy))"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "c91a0446",
   "metadata": {},
   "source": [
    "### Ewaluacja"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "id": "d394a74e",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Accuracy: 0.7014605447683765\n",
      "Precision: 0.6413920853293256\n",
      "Recall: 0.7014605447683765\n",
      "F-score: 0.6264027678306182\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "D:\\Programy\\anaconda3\\lib\\site-packages\\sklearn\\metrics\\_classification.py:1248: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.\n",
      "  _warn_prf(average, modifier, msg_start, len(result))\n"
     ]
    }
   ],
   "source": [
    "# Report the same metrics as for the scikit-learn models, computed on the\n",
    "# PyTorch model's own shuffled test split.\n",
    "evaluation(y_test, test_predictions)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.7"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
upload files 2022-06-19 23:35:14 +02:00			`{`
			`"cells": [`
			`{`
			`"cell_type": "markdown",`
			`"id": "eec59090",`
			`"metadata": {},`
			`"source": [`
			`"### Import potrzebnych bibliotek"`
			`]`
			`},`
			`{`
			`"cell_type": "code",`
			`"execution_count": 1,`
			`"id": "ed6c43d8",`
			`"metadata": {},`
			`"outputs": [],`
			`"source": [`
			`"import numpy as np\n",`
			`"import pandas as pd\n",`
			`"import sklearn\n",`
			`"\n",`
			`"from sklearn.linear_model import LogisticRegression\n",`
			`"from sklearn.preprocessing import StandardScaler\n",`
			`"\n",`
			`"from sklearn.linear_model import SGDClassifier\n",`
			`"\n",`
			`"from sklearn.naive_bayes import GaussianNB\n",`
			`"from sklearn.feature_extraction.text import TfidfVectorizer\n",`
			`"\n",`
			`"from sklearn.metrics import precision_recall_fscore_support\n",`
			`"from sklearn.metrics import accuracy_score\n",`
			`"\n",`
			`"import torch\n",`
			`"from torch import Tensor"`
			`]`
			`},`
			`{`
			`"cell_type": "markdown",`
			`"id": "fcd702cb",`
			`"metadata": {},`
			`"source": [`
			`"### Preprocessing danych"`
			`]`
			`},`
			`{`
			`"cell_type": "code",`
			`"execution_count": 2,`
			`"id": "46133040",`
			`"metadata": {},`
			`"outputs": [],`
			`"source": [`
			`"def preprocess(data):\n",`
			`" #rename columns\n",`
			`" data.rename(columns = {'CODE_GENDER':'if_man', 'FLAG_OWN_CAR':'if_own_car', 'FLAG_OWN_REALTY':'if_own_realty', 'CNT_CHILDREN':'cnt_children', \n",`
			`" 'AMT_INCOME_TOTAL':'amt_income', 'NAME_EDUCATION_TYPE':'name_edu_type', 'CNT_FAM_MEMBERS':'cnt_fam_members',\n",`
			`" 'NAME_INCOME_TYPE':'name_income_type', 'NAME_FAMILY_STATUS':'name_fam_status'}, inplace = True)\n",`
			`" \n",`
			`" #replace data\n",`
			`" data['if_man'] = data['if_man'].apply(lambda x: 1 if x=='M' else 0)\n",`
			`" data['if_own_car'] = data['if_own_car'].apply(lambda x: 1 if x=='Y' else 0)\n",`
			`" data['if_own_realty'] = data['if_own_realty'].apply(lambda x: 1 if x=='Y' else 0)\n",`
			`" data['cnt_children'] = data['cnt_children'].apply(pd.to_numeric, errors='coerce')\n",`
			`" data['cnt_fam_members'] = data['cnt_fam_members'].apply(pd.to_numeric, errors='coerce')\n",`
			`" data['cnt_children'] = data['cnt_children'].apply(lambda x: np.NaN if x > 5 else x)\n",`
			`" data['cnt_fam_members'] = data['cnt_fam_members'].apply(lambda x: np.NaN if x > 8 else x)\n",`
			`" \n",`
			`" #get dummies\n",`
			`" data = pd.get_dummies(data, columns=['name_income_type'])\n",`
			`" data = pd.get_dummies(data, columns=['name_fam_status'])\n",`
			`" \n",`
			`" #dropna\n",`
			`" print(\"Length of dataset before dropna: \" + str(len(data)))\n",`
			`" data = data.dropna()\n",`
			`" print(\"Length of dataset after dropna: \" + str(len(data)))\n",`
			`" \n",`
			`" return data\n",`
			`"\n"`
			`]`
			`},`
			`{`
			`"cell_type": "markdown",`
			`"id": "db8f4662",`
			`"metadata": {},`
			`"source": [`
			`"### Podział na zbiór trenujący i testowy"`
			`]`
			`},`
			`{`
			`"cell_type": "code",`
			`"execution_count": 3,`
			`"id": "e13ef021",`
			`"metadata": {},`
			`"outputs": [],`
			`"source": [`
			`"def split(data):\n",`
			`" split_point = int(0.8 * len(data))\n",`
			`" data_train = data[:split_point]\n",`
			`" data_test = data[split_point:]\n",`
			`" print(\"Length of whole dataset: \" + str(len(data)))\n",`
			`" print(\"Length of train dataset: \" + str(len(data_train)))\n",`
			`" print(\"Length of test dataset: \" + str(len(data_test)))\n",`
			`" return data_train, data_test"`
			`]`
			`},`
			`{`
			`"cell_type": "markdown",`
			`"id": "c65e3f22",`
			`"metadata": {},`
			`"source": [`
			`"### Ewaluacja"`
			`]`
			`},`
			`{`
			`"cell_type": "code",`
			`"execution_count": 4,`
			`"id": "374bc36c",`
			`"metadata": {},`
			`"outputs": [],`
			`"source": [`
			`"def evaluation(y_expected, y_predicted):\n",`
			`" precision, recall, fscore, support = precision_recall_fscore_support(y_expected, y_predicted, average=\"weighted\")\n",`
			`" accuracy = accuracy_score(y_expected, y_predicted)\n",`
			`" print(f\"Accuracy: {accuracy}\")\n",`
			`" print(f\"Precision: {precision}\")\n",`
			`" print(f\"Recall: {recall}\")\n",`
			`" print(f\"F-score: {fscore}\")"`
			`]`
			`},`
			`{`
			`"cell_type": "markdown",`
			`"id": "6a671fd4",`
			`"metadata": {},`
			`"source": [`
			`"### Wczytanie danych z pliku"`
			`]`
			`},`
			`{`
			`"cell_type": "code",`
			`"execution_count": 5,`
			`"id": "4a42cd28",`
			`"metadata": {},`
			`"outputs": [],`
			`"source": [`
			`"alldata = pd.read_csv('application_record.csv', header=0, sep=',',\n",`
			`" usecols=['CODE_GENDER', 'FLAG_OWN_CAR', 'FLAG_OWN_REALTY', 'CNT_CHILDREN', 'AMT_INCOME_TOTAL', 'NAME_EDUCATION_TYPE', 'CNT_FAM_MEMBERS', 'NAME_INCOME_TYPE', 'NAME_FAMILY_STATUS'])\n",`
			`"# print(alldata[:5])"`
			`]`
			`},`
			`{`
			`"cell_type": "markdown",`
			`"id": "6a384b9f",`
			`"metadata": {},`
			`"source": [`
			`"### Wybór cech do trenowania"`
			`]`
			`},`
			`{`
			`"cell_type": "code",`
			`"execution_count": 6,`
			`"id": "c69232b2",`
			`"metadata": {},`
			`"outputs": [],`
			`"source": [`
			`"FEATURES = [\n",`
			`" 'if_man', \n",`
			`" 'if_own_car', \n",`
			`" 'if_own_realty', \n",`
			`" 'cnt_children', \n",`
			`" 'amt_income', \n",`
			`" 'cnt_fam_members', \n",`
			`" 'name_income_type_Commercial associate', \n",`
			`" 'name_income_type_Pensioner', \n",`
			`" 'name_income_type_State servant', \n",`
			`" 'name_income_type_Student', \n",`
			`" 'name_income_type_Working',\n",`
			`" 'name_fam_status_Civil marriage',\n",`
			`" 'name_fam_status_Married',\n",`
			`" 'name_fam_status_Separated',\n",`
			`" 'name_fam_status_Single / not married',\n",`
			`" 'name_fam_status_Widow'\n",`
			`"]"`
			`]`
			`},`
			`{`
			`"cell_type": "code",`
			`"execution_count": null,`
			`"id": "d146c21a",`
			`"metadata": {},`
			`"outputs": [],`
			`"source": [`
			`"print(pd.unique(alldata['if_man']))\n",`
			`"print(pd.unique(alldata['if_own_car']))\n",`
			`"print(pd.unique(alldata['if_own_realty']))\n",`
			`"print(pd.unique(alldata['cnt_children']))\n",`
			`"# print(pd.unique(alldata['name_income_type']))\n",`
			`"print(pd.unique(alldata['name_edu_type']))\n",`
			`"# print(pd.unique(alldata['name_fam_status']))\n",`
			`"print(pd.unique(alldata['cnt_fam_members']))"`
			`]`
			`},`
			`{`
			`"cell_type": "markdown",`
			`"id": "144b9e70",`
			`"metadata": {},`
			`"source": [`
			`"### Przygotowanie danych"`
			`]`
			`},`
			`{`
			`"cell_type": "code",`
			`"execution_count": 7,`
			`"id": "2ca32941",`
			`"metadata": {},`
			`"outputs": [`
			`{`
			`"name": "stdout",`
			`"output_type": "stream",`
			`"text": [`
			`"Length of dataset before dropna: 438557\n",`
			`"Length of dataset after dropna: 438531\n",`
			`"Length of whole dataset: 438531\n",`
			`"Length of train dataset: 350824\n",`
			`"Length of test dataset: 87707\n"`
			`]`
			`}`
			`],`
			`"source": [`
			`"alldata = preprocess(alldata)\n",`
			`"data_train, data_test = split(alldata)"`
			`]`
			`},`
			`{`
			`"cell_type": "code",`
			`"execution_count": 8,`
			`"id": "f292f593",`
			`"metadata": {},`
			`"outputs": [],`
			`"source": [`
			`"y_train = pd.DataFrame(data_train['name_edu_type'])\n",`
			`"x_train = pd.DataFrame(data_train[FEATURES])\n",`
			`"scaler = StandardScaler().fit(x_train)\n",`
			`"x_train = scaler.transform(x_train)\n",`
			`"x_test = pd.DataFrame(data_test[FEATURES])\n",`
			`"x_test = scaler.transform(x_test)\n",`
			`"y_expected = pd.DataFrame(data_test['name_edu_type'])"`
			`]`
			`},`
			`{`
			`"cell_type": "markdown",`
			`"id": "3af424e7",`
			`"metadata": {},`
			`"source": [`
			`"## Regresja logistyczna"`
			`]`
			`},`
			`{`
			`"cell_type": "code",`
			`"execution_count": 9,`
			`"id": "a77aa29f",`
			`"metadata": {},`
			`"outputs": [`
			`{`
			`"name": "stderr",`
			`"output_type": "stream",`
			`"text": [`
			`"D:\\Programy\\anaconda3\\lib\\site-packages\\sklearn\\utils\\validation.py:63: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().\n",`
			`" return f(args, *kwargs)\n"`
			`]`
			`},`
			`{`
			`"name": "stdout",`
			`"output_type": "stream",`
			`"text": [`
			`"Accuracy: 0.7031023749529685\n",`
			`"Precision: 0.6408866393822401\n",`
			`"Recall: 0.7031023749529685\n",`
			`"F-score: 0.6268976358430636\n"`
			`]`
			`},`
			`{`
			`"name": "stderr",`
			`"output_type": "stream",`
			`"text": [`
			"D:\\Programy\\anaconda3\\lib\\site-packages\\sklearn\\metrics\\_classification.py:1248: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.\n",
			`" _warn_prf(average, modifier, msg_start, len(result))\n"`
			`]`
			`}`
			`],`
			`"source": [`
			`"model_logreg = LogisticRegression(max_iter=1000) \n",`
			`"model_logreg.fit(x_train, y_train)\n",`
			`"\n",`
			`"y_predicted_logreg = model_logreg.predict(x_test) \n",`
			`"\n",`
			`"evaluation(y_expected, y_predicted_logreg)"`
			`]`
			`},`
			`{`
			`"cell_type": "markdown",`
			`"id": "e05aa676",`
			`"metadata": {},`
			`"source": [`
			`"## SGD"`
			`]`
			`},`
			`{`
			`"cell_type": "code",`
			`"execution_count": 10,`
			`"id": "31910f4a",`
			`"metadata": {},`
			`"outputs": [`
			`{`
			`"name": "stderr",`
			`"output_type": "stream",`
			`"text": [`
			`"D:\\Programy\\anaconda3\\lib\\site-packages\\sklearn\\utils\\validation.py:63: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().\n",`
			`" return f(args, *kwargs)\n"`
			`]`
			`},`
			`{`
			`"name": "stdout",`
			`"output_type": "stream",`
			`"text": [`
			`"Accuracy: 0.6921340371920143\n",`
			`"Precision: 0.7333126153525842\n",`
			`"Recall: 0.6921340371920143\n",`
			`"F-score: 0.5666044690008586\n"`
			`]`
			`},`
			`{`
			`"name": "stderr",`
			`"output_type": "stream",`
			`"text": [`
			"D:\\Programy\\anaconda3\\lib\\site-packages\\sklearn\\metrics\\_classification.py:1248: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.\n",
			`" _warn_prf(average, modifier, msg_start, len(result))\n"`
			`]`
			`}`
			`],`
			`"source": [`
			`"model_sgd = SGDClassifier() \n",`
			`"model_sgd.fit(x_train, y_train)\n",`
			`"\n",`
			`"y_predicted_sgd = model_sgd.predict(x_test) \n",`
			`"\n",`
			`"evaluation(y_expected, y_predicted_sgd)"`
			`]`
			`},`
			`{`
			`"cell_type": "markdown",`
			`"id": "037c6132",`
			`"metadata": {},`
			`"source": [`
			`"## Gaussian Naive Bayes"`
			`]`
			`},`
			`{`
			`"cell_type": "code",`
			`"execution_count": null,`
			`"id": "b71d276a",`
			`"metadata": {},`
			`"outputs": [],`
			`"source": [`
			`"# model_gnb = GaussianNB() \n",`
			`"# model_gnb.fit(x_train, y_train)\n",`
			`"\n",`
			`"# y_predicted_sgd = model_gnb.predict(x_test) \n",`
			`"\n",`
			`"# evaluation(y_expected, y_predicted_sgd)"`
			`]`
			`},`
			`{`
			`"cell_type": "markdown",`
			`"id": "2f41b9f3",`
			`"metadata": {},`
			`"source": [`
			`"## PyTorch"`
			`]`
			`},`
			`{`
			`"cell_type": "markdown",`
			`"id": "a4b83c7c",`
			`"metadata": {},`
			`"source": [`
			`"### Przygotowanie danych"`
			`]`
			`},`
			`{`
			`"cell_type": "code",`
			`"execution_count": 11,`
			`"id": "a578ed9d",`
			`"metadata": {},`
			`"outputs": [`
			`{`
			`"data": {`
			`"text/plain": [`
			`"{'Higher education': 0,\n",`
			`" 'Secondary / secondary special': 1,\n",`
			`" 'Incomplete higher': 2,\n",`
			`" 'Lower secondary': 3,\n",`
			`" 'Academic degree': 4}"`
			`]`
			`},`
			`"execution_count": 11,`
			`"metadata": {},`
			`"output_type": "execute_result"`
			`}`
			`],`
			`"source": [`
			`"X_numpy = alldata.drop(\"name_edu_type\", axis=1).values\n",`
			`"X_numpy = scaler.transform(X_numpy)\n",`
			`"target_map = {\n",`
			`" val: index for index, val in enumerate(alldata.name_edu_type.unique())\n",`
			`"}\n",`
			`"y_numpy = alldata.name_edu_type.map(target_map).values\n",`
			`"X = torch.tensor(X_numpy, dtype=torch.float32)\n",`
			`"y = torch.tensor(y_numpy)\n",`
			`"\n",`
			`"target_map"`
			`]`
			`},`
			`{`
			`"cell_type": "markdown",`
			`"id": "72d58f4d",`
			`"metadata": {},`
			`"source": [`
			`"### One hot vectors"`
			`]`
			`},`
			`{`
			`"cell_type": "code",`
			`"execution_count": 12,`
			`"id": "8521b29d",`
			`"metadata": {},`
			`"outputs": [],`
			`"source": [`
			`"def one_hot_encode(vector):\n",`
			`" n_classes = len(vector.unique())\n",`
			`" one_hot = torch.zeros((vector.shape[0], n_classes))\\\n",`
			`" .type(torch.LongTensor)\n",`
			`" return one_hot\\\n",`
			`" .scatter(1, vector.type(torch.LongTensor).unsqueeze(1), 1)\n",`
			`"\n",`
			`"y_one_hot = one_hot_encode(y)"`
			`]`
			`},`
			`{`
			`"cell_type": "code",`
			`"execution_count": 13,`
			`"id": "2adc5f1d",`
			`"metadata": {},`
			`"outputs": [`
			`{`
			`"name": "stdout",`
			`"output_type": "stream",`
			`"text": [`
			`"438531\n"`
			`]`
			`}`
			`],`
			`"source": [`
			`"random_indices = torch.randperm(X.shape[0])\n",`
			`"print(X.shape[0])\n",`
			`"n_train = int(0.8 * X.shape[0])\n",`
			`"X_train = X[random_indices[:n_train]]\n",`
			`"y_train = y[random_indices[:n_train]]\n",`
			`"y_train_one_hot = y_one_hot[random_indices[:n_train]]\n",`
			`"\n",`
			`"X_test = X[random_indices[n_train:]]\n",`
			`"y_test = y[random_indices[n_train:]]\n",`
			`"y_test_one_hot = y_one_hot[random_indices[n_train:]]"`
			`]`
			`},`
			`{`
			`"cell_type": "markdown",`
			`"id": "2992b275",`
			`"metadata": {},`
			`"source": [`
			`"### Model"`
			`]`
			`},`
			`{`
			`"cell_type": "code",`
			`"execution_count": 14,`
			`"id": "a0820cfa",`
			`"metadata": {},`
			`"outputs": [],`
			`"source": [`
			`"model_pytorch = torch.nn.Sequential(\n",`
			`" torch.nn.Linear(16, 5)\n",`
			`")"`
			`]`
			`},`
			`{`
			`"cell_type": "markdown",`
			`"id": "7d177d27",`
			`"metadata": {},`
			`"source": [`
			`"### Optymalizator"`
			`]`
			`},`
			`{`
			`"cell_type": "code",`
			`"execution_count": 15,`
			`"id": "1858b5d1",`
			`"metadata": {},`
			`"outputs": [],`
			`"source": [`
			`"learning_rate = 0.1\n",`
			`"lambda_param = 0.01\n",`
			`"optimizer = torch.optim.SGD(\n",`
			`" model_pytorch.parameters(), \n",`
			`" lr=learning_rate, \n",`
			`" weight_decay=lambda_param\n",`
			`")"`
			`]`
			`},`
			`{`
			`"cell_type": "markdown",`
			`"id": "ca95791b",`
			`"metadata": {},`
			`"source": [`
			`"### Funkcja straty"`
			`]`
			`},`
			`{`
			`"cell_type": "code",`
			`"execution_count": 16,`
			`"id": "16f5c2d1",`
			`"metadata": {},`
			`"outputs": [],`
			`"source": [`
			`"loss_function = torch.nn.CrossEntropyLoss()"`
			`]`
			`},`
			`{`
			`"cell_type": "markdown",`
			`"id": "ebe3feff",`
			`"metadata": {},`
			`"source": [`
			`"### Trenowanie"`
			`]`
			`},`
			`{`
			`"cell_type": "code",`
			`"execution_count": 17,`
			`"id": "9b2115e9",`
			`"metadata": {},`
			`"outputs": [`
			`{`
			`"name": "stdout",`
			`"output_type": "stream",`
			`"text": [`
			`"Loss at iteration 1: 1.6335363388061523\n",`
			`"Loss at iteration 100: 0.8125142455101013\n",`
			`"Loss at iteration 200: 0.7701064944267273\n",`
			`"Loss at iteration 300: 0.75752854347229\n",`
			`"Loss at iteration 400: 0.7520564198493958\n",`
			`"Loss at iteration 500: 0.7492005228996277\n",`
			`"Loss at iteration 600: 0.7475774884223938\n",`
			`"Loss at iteration 700: 0.7465949058532715\n",`
			`"Loss at iteration 800: 0.7459684014320374\n",`
			`"Loss at iteration 900: 0.7455567717552185\n",`
			`"Loss at iteration 1000: 0.7452787160873413\n",`
			`"\n",`
			`"Final Test Accuracy: 0.7014605447683765\n"`
			`]`
			`}`
			`],`
			`"source": [`
			`"n_iterations = 1000\n",`
			`"for i in range(1, n_iterations + 1):\n",`
			`" Z = model_pytorch(X_train) # 1\n",`
			`" loss = loss_function(Z, y_train) # 2\n",`
			`" optimizer.zero_grad() # 3\n",`
			`" loss.backward() # 4\n",`
			`" optimizer.step() # 5\n",`
			`" \n",`
			`" if i == 1 or i % 100 == 0:\n",`
			`" print(\"Loss at iteration {}: {}\".format(i, loss))\n",`
			`"\n",`
			`"test_predictions = torch.argmax(\n",`
			`" torch.softmax(model_pytorch(X_test), 1), axis=1 # 6\n",`
			`")\n",`
			`"test_accuracy = float(sum(test_predictions == y_test)) / y_test.shape[0]\n",`
			`"print(\"\\nFinal Test Accuracy: {}\".format(test_accuracy))"`
			`]`
			`},`
			`{`
			`"cell_type": "markdown",`
			`"id": "c91a0446",`
			`"metadata": {},`
			`"source": [`
			`"### Ewaluacja"`
			`]`
			`},`
			`{`
			`"cell_type": "code",`
			`"execution_count": 18,`
			`"id": "d394a74e",`
			`"metadata": {},`
			`"outputs": [`
			`{`
			`"name": "stdout",`
			`"output_type": "stream",`
			`"text": [`
			`"Accuracy: 0.7014605447683765\n",`
			`"Precision: 0.6413920853293256\n",`
			`"Recall: 0.7014605447683765\n",`
			`"F-score: 0.6264027678306182\n"`
			`]`
			`},`
			`{`
			`"name": "stderr",`
			`"output_type": "stream",`
			`"text": [`
			"D:\\Programy\\anaconda3\\lib\\site-packages\\sklearn\\metrics\\_classification.py:1248: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.\n",
			`" _warn_prf(average, modifier, msg_start, len(result))\n"`
			`]`
			`}`
			`],`
			`"source": [`
			`"evaluation(y_test, test_predictions)"`
			`]`
			`}`
			`],`
			`"metadata": {`
			`"kernelspec": {`
			`"display_name": "Python 3 (ipykernel)",`
			`"language": "python",`
			`"name": "python3"`
			`},`
			`"language_info": {`
			`"codemirror_mode": {`
			`"name": "ipython",`
			`"version": 3`
			`},`
			`"file_extension": ".py",`
			`"mimetype": "text/x-python",`
			`"name": "python",`
			`"nbconvert_exporter": "python",`
			`"pygments_lexer": "ipython3",`
			`"version": "3.9.7"`
			`}`
			`},`
			`"nbformat": 4,`
			`"nbformat_minor": 5`
			`}`