From ff760e3e40a03ceb33b11126e04d1896e397267f Mon Sep 17 00:00:00 2001
From: "ag.gabka@gmail.com" <ag.gabka@gmail.com>
Date: Tue, 23 Jul 2024 20:07:53 +0200
Subject: [PATCH] Strip comments and clear pip outputs in lung_cancer_data_project.ipynb

---
 lung_cancer_data_project.ipynb | 174 +++++++++------------------------
 1 file changed, 44 insertions(+), 130 deletions(-)

diff --git a/lung_cancer_data_project.ipynb b/lung_cancer_data_project.ipynb
index 6dc5c9c..53c85b2 100644
--- a/lung_cancer_data_project.ipynb
+++ b/lung_cancer_data_project.ipynb
@@ -32,46 +32,10 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 2,
+   "execution_count": null,
    "id": "3b9fd854",
    "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Requirement already satisfied: plotnine in c:\\users\\hp\\anaconda3\\lib\\site-packages (0.12.4)\n",
-      "Requirement already satisfied: matplotlib>=3.6.0 in c:\\users\\hp\\anaconda3\\lib\\site-packages (from plotnine) (3.8.0)\n",
-      "Requirement already satisfied: mizani<0.10.0,>0.9.0 in c:\\users\\hp\\anaconda3\\lib\\site-packages (from plotnine) (0.9.3)\n",
-      "Requirement already satisfied: numpy>=1.23.0 in c:\\users\\hp\\anaconda3\\lib\\site-packages (from plotnine) (1.26.0)\n",
-      "Requirement already satisfied: pandas>=1.5.0 in c:\\users\\hp\\anaconda3\\lib\\site-packages (from plotnine) (2.1.1)\n",
-      "Requirement already satisfied: patsy>=0.5.1 in c:\\users\\hp\\anaconda3\\lib\\site-packages (from plotnine) (0.5.5)\n",
-      "Requirement already satisfied: scipy>=1.5.0 in c:\\users\\hp\\anaconda3\\lib\\site-packages (from plotnine) (1.11.4)\n",
-      "Requirement already satisfied: statsmodels>=0.14.0 in c:\\users\\hp\\anaconda3\\lib\\site-packages (from plotnine) (0.14.0)\n",
-      "Requirement already satisfied: contourpy>=1.0.1 in c:\\users\\hp\\anaconda3\\lib\\site-packages (from matplotlib>=3.6.0->plotnine) (1.2.0)\n",
-      "Requirement already satisfied: cycler>=0.10 in c:\\users\\hp\\anaconda3\\lib\\site-packages (from matplotlib>=3.6.0->plotnine) (0.12.1)\n",
-      "Requirement already satisfied: fonttools>=4.22.0 in c:\\users\\hp\\anaconda3\\lib\\site-packages (from matplotlib>=3.6.0->plotnine) (4.25.0)\n",
-      "Requirement already satisfied: kiwisolver>=1.0.1 in c:\\users\\hp\\anaconda3\\lib\\site-packages (from matplotlib>=3.6.0->plotnine) (1.4.4)\n",
-      "Requirement already satisfied: packaging>=20.0 in c:\\users\\hp\\anaconda3\\lib\\site-packages (from matplotlib>=3.6.0->plotnine) (23.1)\n",
-      "Requirement already satisfied: pillow>=6.2.0 in c:\\users\\hp\\anaconda3\\lib\\site-packages (from matplotlib>=3.6.0->plotnine) (10.0.1)\n",
-      "Requirement already satisfied: pyparsing>=2.3.1 in c:\\users\\hp\\anaconda3\\lib\\site-packages (from matplotlib>=3.6.0->plotnine) (3.0.9)\n",
-      "Requirement already satisfied: python-dateutil>=2.7 in c:\\users\\hp\\anaconda3\\lib\\site-packages (from matplotlib>=3.6.0->plotnine) (2.8.2)\n",
-      "Requirement already satisfied: tzdata in c:\\users\\hp\\anaconda3\\lib\\site-packages (from mizani<0.10.0,>0.9.0->plotnine) (2023.3)\n",
-      "Requirement already satisfied: pytz>=2020.1 in c:\\users\\hp\\anaconda3\\lib\\site-packages (from pandas>=1.5.0->plotnine) (2023.3.post1)\n",
-      "Requirement already satisfied: six in c:\\users\\hp\\anaconda3\\lib\\site-packages (from patsy>=0.5.1->plotnine) (1.16.0)\n",
-      "Note: you may need to restart the kernel to use updated packages.\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "\n",
-      "[notice] A new release of pip is available: 23.3.2 -> 24.0\n",
-      "[notice] To update, run: python.exe -m pip install --upgrade pip\n"
-     ]
-    }
-   ],
+   "outputs": [],
    "source": [
     "pip install plotnine"
    ]
@@ -815,7 +779,7 @@
    ],
    "source": [
     "\n",
-    "#stworzenie 'binow' dla pokazania wieku pacjentów\n",
+    "\n",
     "bins = []\n",
     "for i in range (0, 101, 10):\n",
     "    bins.append(i)\n",
@@ -873,19 +837,19 @@
     "gender_counts = dane['Gender'].value_counts()\n",
     "ax = gender_counts.plot(kind='bar', color=['blue', 'pink'])\n",
     "\n",
-    "# Dodawanie wartości do słupków\n",
+    "\n",
     "for i, value in enumerate(gender_counts):\n",
     "    ax.text(i, value + 0.1, str(value), ha='center', va='bottom')\n",
     "\n",
-    "# Zmiana etykiet osi x\n",
+    "\n",
     "ax.set_xticks([0, 1])\n",
     "ax.set_xticklabels(['Man', 'Woman'])\n",
     "ax.set_title (\"Distribution of patients' gender\")\n",
     "\n",
-    "# Dodanie legendy\n",
+    "\n",
     "plt.legend()\n",
     "\n",
-    "# Wyświetlenie wykresu\n",
+    "\n",
     "plt.show()\n"
    ]
   },
@@ -907,22 +871,22 @@
     }
    ],
    "source": [
-    "# Grupowanie danych\n",
+    "\n",
     "grouped_data = dane.groupby(['Gender', 'Level']).size().unstack()\n",
     "\n",
-    "# Ustawienia kategorii i szerokości słupków\n",
+    "\n",
     "categories = grouped_data.columns\n",
     "bar_width = 0.35\n",
     "bar_positions_man = np.arange(len(categories))\n",
     "bar_positions_woman = [pos + bar_width for pos in bar_positions_man]\n",
     "\n",
-    "# Wygenerowanie wykresu słupkowego\n",
+    "\n",
     "fig, ax = plt.subplots()\n",
     "\n",
     "ax.bar(bar_positions_man, grouped_data.loc[1], width=bar_width, label='Man')\n",
     "ax.bar(bar_positions_woman, grouped_data.loc[2], width=bar_width, label='Woman')\n",
     "\n",
-    "# Dodanie wartości procentowych do słupków\n",
+    "\n",
     "for i, column in enumerate(categories):\n",
     "    for j, value in enumerate(grouped_data.index):\n",
     "        total = grouped_data[column].sum()\n",
@@ -930,19 +894,19 @@
     "        height = grouped_data.loc[value, column]\n",
     "        ax.text(i + j * bar_width, height + 0.2, f'{percent:.0%}', ha='center', va='bottom')  \n",
     "\n",
-    "# Ustawienia etykiet i tytułów\n",
+    "\n",
     "plt.xlabel('Gender')\n",
     "plt.ylabel('Count')\n",
     "plt.title('Distribution of level by gender')\n",
     "\n",
-    "# Dodanie legendy\n",
+    "\n",
     "plt.legend(title='Level')\n",
     "\n",
-    "# Zmiana etykiet osi x\n",
+    "\n",
     "ax.set_xticks([pos + bar_width / 2 for pos in bar_positions_man])\n",
     "ax.set_xticklabels(categories)\n",
     "\n",
-    "# Wyświetlenie wykresu\n",
+    "\n",
     "plt.show()"
    ]
   },
@@ -1005,25 +969,25 @@
     "dane['Gender'] = dane['Gender'].replace({1: 'Man', 2: 'Woman'})\n",
     "smoking_counts = dane.groupby(['Smoking', 'Gender']).size()\n",
     "\n",
-    "# Zamiana liczby na procent\n",
+    "\n",
     "smoking_percentages = smoking_counts / smoking_counts.groupby('Gender').sum() * 100\n",
     "\n",
-    "# Sortowanie danych według stopnia 'Smoking'\n",
+    "\n",
     "smoking_percentages_sorted = smoking_percentages.sort_index(level='Smoking', sort_remaining=False)\n",
     "\n",
     "plt.figure(figsize=(14, 6))\n",
     "\n",
-    "# Tworzenie wykresu słupkowego poziomego\n",
+    "\n",
     "ax = smoking_percentages_sorted.plot(kind='barh')\n",
     "\n",
-    "# Dodawanie wartości procentowych do słupków\n",
+    "\n",
     "for i, value in enumerate(smoking_percentages_sorted):\n",
     "    ax.text(value + 0.1, i, f'{value:.2f}%', ha='left', va='center')\n",
     "\n",
-    "# Dodanie legendy\n",
+    "\n",
     "plt.legend()\n",
     "\n",
-    "# Ustawienia etykiet i tytułów\n",
+    "\n",
     "plt.xlabel('Percentage')\n",
     "plt.ylabel('Smoking, Gender')\n",
     "plt.title('Distribution of smoking by gender (%)')\n",
@@ -1092,25 +1056,25 @@
     "dane['Gender'] = dane['Gender'].replace({1: 'Man', 2: 'Woman'})\n",
     "smoking_counts = dane.groupby(['Passive Smoker', 'Gender']).size()\n",
     "\n",
-    "# Zamiana liczby na procent\n",
+    "\n",
     "smoking_percentages = smoking_counts / smoking_counts.groupby('Gender').sum() * 100\n",
     "\n",
-    "# Sortowanie danych według stopnia 'Passive smoker'\n",
+    "\n",
     "smoking_percentages_sorted = smoking_percentages.sort_index(level='Passive Smoker', sort_remaining=False)\n",
     "\n",
     "plt.figure(figsize=(15, 6))\n",
     "\n",
-    "# Tworzenie wykresu słupkowego poziomego\n",
+    "\n",
     "ax = smoking_percentages_sorted.plot(kind='barh')\n",
     "\n",
-    "# Dodawanie wartości procentowych do słupków\n",
+    "\n",
     "for i, value in enumerate(smoking_percentages_sorted):\n",
     "    ax.text(value + 0.1, i, f'{value:.2f}%', ha='left', va='center')\n",
     "\n",
-    "# Dodanie legendy\n",
+    "\n",
     "plt.legend()\n",
     "\n",
-    "# Ustawienia etykiet i tytułów\n",
+    "\n",
     "plt.xlabel('Percentage')\n",
     "plt.ylabel('Passive Smoker, Gender')\n",
     "plt.title('Distribution of passive smokers by gender (%)')\n",
@@ -1152,22 +1116,22 @@
     "\n",
     "plt.figure(figsize=(15, 6))\n",
     "\n",
-    "# Tworzenie wykresu słupkowego poziomego\n",
+    "\n",
     "ax = Genetic_risk_percentages_sorted.plot(kind='barh')\n",
     "\n",
-    "# Dodawanie wartości procentowych do słupków\n",
+    "\n",
     "for i, value in enumerate(Genetic_risk_percentages_sorted):\n",
     "    ax.text(value + 0.1, i, f'{value:.2f}%', ha='left', va='center')\n",
     "\n",
-    "# Dodanie legendy\n",
+    "\n",
     "plt.legend()\n",
     "\n",
-    "# Ustawienia etykiet i tytułów\n",
+    "\n",
     "plt.xlabel('Percentage')\n",
     "plt.ylabel('Genetic risk, Gender')\n",
     "plt.title('Distribution of genetic risk by gender (%)')\n",
     "\n",
-    "# Wyświetlenie wykresu\n",
+    "\n",
     "plt.show()\n"
    ]
   },
@@ -1192,24 +1156,24 @@
     "\n",
     "Genetic_risk_counts = dane.groupby(['Genetic Risk', 'Level']).size()\n",
     "\n",
-    "# Sortowanie danych według ryzyka genetycznego i liczby w odwrotnej kolejności\n",
+    "\n",
     "Genetic_risk_counts_sorted = Genetic_risk_counts.sort_index(level=['Genetic Risk', 'Level'], key=lambda x: x.map({'High': 1, 'Medium': 2, 'Low': 3}))\n",
     "\n",
     "plt.figure(figsize=(10, 6))\n",
     "\n",
-    "# Tworzenie wykresu słupkowego horyzontalnego\n",
+    "\n",
     "ax = Genetic_risk_counts_sorted.plot(kind='barh')\n",
     "\n",
-    "# Dodawanie wartości do słupków\n",
+    "\n",
     "for i, value in enumerate(Genetic_risk_counts_sorted):\n",
     "    ax.text(value + 0.1, i, str(value), ha='left', va='center')\n",
     "\n",
-    "# Ustawienia etykiet i tytułów\n",
+    "\n",
     "plt.xlabel('Count')\n",
     "plt.ylabel('Genetic Risk, Level')\n",
     "plt.title('Distribution of level by genetic risk')\n",
     "\n",
-    "# Wyświetlenie wykresu\n",
+    "\n",
     "plt.show()"
    ]
   },
@@ -1403,24 +1367,24 @@
    "source": [
     "air_pollution = dane.groupby(['Air Pollution', 'Level']).size()\n",
     "\n",
-    "# Sortowanie danych według ryzyka genetycznego i liczby w odwrotnej kolejności\n",
+    "\n",
     "air_pollution_sorted = air_pollution.sort_index(level=['Air Pollution', 'Level'], key=lambda x: x.map({'High': 1, 'Medium': 2, 'Low': 3}))\n",
     "\n",
     "plt.figure(figsize=(10, 6))\n",
     "\n",
-    "# Tworzenie wykresu słupkowego horyzontalnego\n",
+    "\n",
     "ax = air_pollution_sorted.plot(kind='barh')\n",
     "\n",
-    "# Dodawanie wartości do słupków\n",
+    "\n",
     "for i, value in enumerate(air_pollution_sorted):\n",
     "    ax.text(value + 0.1, i, str(value), ha='left', va='center')\n",
     "\n",
-    "# Ustawienia etykiet i tytułów\n",
+    "\n",
     "plt.xlabel('Count')\n",
     "plt.ylabel('Air Pollution, Level')\n",
     "plt.title('Distribution of level by air pollution')\n",
     "\n",
-    "# Wyświetlenie wykresu\n",
+    "\n",
     "plt.show()"
    ]
   },
@@ -3644,46 +3608,10 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 32,
+   "execution_count": null,
    "id": "322588b1",
    "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Requirement already satisfied: mlxtend in c:\\users\\hp\\anaconda3\\lib\\site-packages (0.23.0)\n",
-      "Requirement already satisfied: scipy>=1.2.1 in c:\\users\\hp\\anaconda3\\lib\\site-packages (from mlxtend) (1.11.4)\n",
-      "Requirement already satisfied: numpy>=1.16.2 in c:\\users\\hp\\anaconda3\\lib\\site-packages (from mlxtend) (1.26.0)\n",
-      "Requirement already satisfied: pandas>=0.24.2 in c:\\users\\hp\\anaconda3\\lib\\site-packages (from mlxtend) (2.1.1)\n",
-      "Requirement already satisfied: scikit-learn>=1.0.2 in c:\\users\\hp\\anaconda3\\lib\\site-packages (from mlxtend) (1.3.2)\n",
-      "Requirement already satisfied: matplotlib>=3.0.0 in c:\\users\\hp\\anaconda3\\lib\\site-packages (from mlxtend) (3.8.0)\n",
-      "Requirement already satisfied: joblib>=0.13.2 in c:\\users\\hp\\anaconda3\\lib\\site-packages (from mlxtend) (1.3.2)\n",
-      "Requirement already satisfied: contourpy>=1.0.1 in c:\\users\\hp\\anaconda3\\lib\\site-packages (from matplotlib>=3.0.0->mlxtend) (1.2.0)\n",
-      "Requirement already satisfied: cycler>=0.10 in c:\\users\\hp\\anaconda3\\lib\\site-packages (from matplotlib>=3.0.0->mlxtend) (0.12.1)\n",
-      "Requirement already satisfied: fonttools>=4.22.0 in c:\\users\\hp\\anaconda3\\lib\\site-packages (from matplotlib>=3.0.0->mlxtend) (4.25.0)\n",
-      "Requirement already satisfied: kiwisolver>=1.0.1 in c:\\users\\hp\\anaconda3\\lib\\site-packages (from matplotlib>=3.0.0->mlxtend) (1.4.4)\n",
-      "Requirement already satisfied: packaging>=20.0 in c:\\users\\hp\\anaconda3\\lib\\site-packages (from matplotlib>=3.0.0->mlxtend) (23.1)\n",
-      "Requirement already satisfied: pillow>=6.2.0 in c:\\users\\hp\\anaconda3\\lib\\site-packages (from matplotlib>=3.0.0->mlxtend) (10.0.1)\n",
-      "Requirement already satisfied: pyparsing>=2.3.1 in c:\\users\\hp\\anaconda3\\lib\\site-packages (from matplotlib>=3.0.0->mlxtend) (3.0.9)\n",
-      "Requirement already satisfied: python-dateutil>=2.7 in c:\\users\\hp\\anaconda3\\lib\\site-packages (from matplotlib>=3.0.0->mlxtend) (2.8.2)\n",
-      "Requirement already satisfied: pytz>=2020.1 in c:\\users\\hp\\anaconda3\\lib\\site-packages (from pandas>=0.24.2->mlxtend) (2023.3.post1)\n",
-      "Requirement already satisfied: tzdata>=2022.1 in c:\\users\\hp\\anaconda3\\lib\\site-packages (from pandas>=0.24.2->mlxtend) (2023.3)\n",
-      "Requirement already satisfied: threadpoolctl>=2.0.0 in c:\\users\\hp\\anaconda3\\lib\\site-packages (from scikit-learn>=1.0.2->mlxtend) (3.2.0)\n",
-      "Requirement already satisfied: six>=1.5 in c:\\users\\hp\\anaconda3\\lib\\site-packages (from python-dateutil>=2.7->matplotlib>=3.0.0->mlxtend) (1.16.0)\n",
-      "Note: you may need to restart the kernel to use updated packages.\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "\n",
-      "[notice] A new release of pip is available: 23.3.2 -> 24.0\n",
-      "[notice] To update, run: python.exe -m pip install --upgrade pip\n"
-     ]
-    }
-   ],
+   "outputs": [],
    "source": [
     "pip install mlxtend"
    ]
@@ -3703,20 +3631,6 @@
     "sns.set()\n"
    ]
   },
-  {
-   "cell_type": "code",
-   "execution_count": 34,
-   "id": "cbb6c719",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "#cm = confusion_matrix(y_test, y_pred)\n",
-    "#plot_confusion_matrix(cm)\n",
-    "\n",
-    "#acc = accuracy_score(y_test, y_pred)\n",
-    "#print('Accuracy',':', acc)"
-   ]
-  },
   {
    "cell_type": "code",
    "execution_count": 35,