Updated lung_cancer_data_project.ipynb

2024-07-23 20:07:53 +02:00 · 2024-07-23 20:07:53 +02:00 · ff760e3e40
commit ff760e3e40
parent 514e7840b3
1 changed files with 44 additions and 130 deletions
--- a/lung_cancer_data_project.ipynb
+++ b/lung_cancer_data_project.ipynb
@ -32,46 +32,10 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 2,
+   "execution_count": null,
   "id": "3b9fd854",
   "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Requirement already satisfied: plotnine in c:\\users\\hp\\anaconda3\\lib\\site-packages (0.12.4)\n",
-      "Requirement already satisfied: matplotlib>=3.6.0 in c:\\users\\hp\\anaconda3\\lib\\site-packages (from plotnine) (3.8.0)\n",
-      "Requirement already satisfied: mizani<0.10.0,>0.9.0 in c:\\users\\hp\\anaconda3\\lib\\site-packages (from plotnine) (0.9.3)\n",
-      "Requirement already satisfied: numpy>=1.23.0 in c:\\users\\hp\\anaconda3\\lib\\site-packages (from plotnine) (1.26.0)\n",
-      "Requirement already satisfied: pandas>=1.5.0 in c:\\users\\hp\\anaconda3\\lib\\site-packages (from plotnine) (2.1.1)\n",
-      "Requirement already satisfied: patsy>=0.5.1 in c:\\users\\hp\\anaconda3\\lib\\site-packages (from plotnine) (0.5.5)\n",
-      "Requirement already satisfied: scipy>=1.5.0 in c:\\users\\hp\\anaconda3\\lib\\site-packages (from plotnine) (1.11.4)\n",
-      "Requirement already satisfied: statsmodels>=0.14.0 in c:\\users\\hp\\anaconda3\\lib\\site-packages (from plotnine) (0.14.0)\n",
-      "Requirement already satisfied: contourpy>=1.0.1 in c:\\users\\hp\\anaconda3\\lib\\site-packages (from matplotlib>=3.6.0->plotnine) (1.2.0)\n",
-      "Requirement already satisfied: cycler>=0.10 in c:\\users\\hp\\anaconda3\\lib\\site-packages (from matplotlib>=3.6.0->plotnine) (0.12.1)\n",
-      "Requirement already satisfied: fonttools>=4.22.0 in c:\\users\\hp\\anaconda3\\lib\\site-packages (from matplotlib>=3.6.0->plotnine) (4.25.0)\n",
-      "Requirement already satisfied: kiwisolver>=1.0.1 in c:\\users\\hp\\anaconda3\\lib\\site-packages (from matplotlib>=3.6.0->plotnine) (1.4.4)\n",
-      "Requirement already satisfied: packaging>=20.0 in c:\\users\\hp\\anaconda3\\lib\\site-packages (from matplotlib>=3.6.0->plotnine) (23.1)\n",
-      "Requirement already satisfied: pillow>=6.2.0 in c:\\users\\hp\\anaconda3\\lib\\site-packages (from matplotlib>=3.6.0->plotnine) (10.0.1)\n",
-      "Requirement already satisfied: pyparsing>=2.3.1 in c:\\users\\hp\\anaconda3\\lib\\site-packages (from matplotlib>=3.6.0->plotnine) (3.0.9)\n",
-      "Requirement already satisfied: python-dateutil>=2.7 in c:\\users\\hp\\anaconda3\\lib\\site-packages (from matplotlib>=3.6.0->plotnine) (2.8.2)\n",
-      "Requirement already satisfied: tzdata in c:\\users\\hp\\anaconda3\\lib\\site-packages (from mizani<0.10.0,>0.9.0->plotnine) (2023.3)\n",
-      "Requirement already satisfied: pytz>=2020.1 in c:\\users\\hp\\anaconda3\\lib\\site-packages (from pandas>=1.5.0->plotnine) (2023.3.post1)\n",
-      "Requirement already satisfied: six in c:\\users\\hp\\anaconda3\\lib\\site-packages (from patsy>=0.5.1->plotnine) (1.16.0)\n",
-      "Note: you may need to restart the kernel to use updated packages.\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "\n",
-      "[notice] A new release of pip is available: 23.3.2 -> 24.0\n",
-      "[notice] To update, run: python.exe -m pip install --upgrade pip\n"
-     ]
-    }
-   ],
+   "outputs": [],
   "source": [
    "pip install plotnine"
   ]
@ -815,7 +779,7 @@
   ],
   "source": [
    "\n",
-    "#stworzenie 'binow' dla pokazania wieku pacjentów\n",
+    "\n",
    "bins = []\n",
    "for i in range (0, 101, 10):\n",
    "    bins.append(i)\n",
@ -873,19 +837,19 @@
    "gender_counts = dane['Gender'].value_counts()\n",
    "ax = gender_counts.plot(kind='bar', color=['blue', 'pink'])\n",
    "\n",
-    "# Dodawanie wartości do słupków\n",
+    "\n",
    "for i, value in enumerate(gender_counts):\n",
    "    ax.text(i, value + 0.1, str(value), ha='center', va='bottom')\n",
    "\n",
-    "# Zmiana etykiet osi x\n",
+    "\n",
    "ax.set_xticks([0, 1])\n",
    "ax.set_xticklabels(['Man', 'Woman'])\n",
    "ax.set_title (\"Distribution of patients' gender\")\n",
    "\n",
-    "# Dodanie legendy\n",
+    "\n",
    "plt.legend()\n",
    "\n",
-    "# Wyświetlenie wykresu\n",
+    "\n",
    "plt.show()\n"
   ]
  },
@ -907,22 +871,22 @@
    }
   ],
   "source": [
-    "# Grupowanie danych\n",
+    "\n",
    "grouped_data = dane.groupby(['Gender', 'Level']).size().unstack()\n",
    "\n",
-    "# Ustawienia kategorii i szerokości słupków\n",
+    "\n",
    "categories = grouped_data.columns\n",
    "bar_width = 0.35\n",
    "bar_positions_man = np.arange(len(categories))\n",
    "bar_positions_woman = [pos + bar_width for pos in bar_positions_man]\n",
    "\n",
-    "# Wygenerowanie wykresu słupkowego\n",
+    "\n",
    "fig, ax = plt.subplots()\n",
    "\n",
    "ax.bar(bar_positions_man, grouped_data.loc[1], width=bar_width, label='Man')\n",
    "ax.bar(bar_positions_woman, grouped_data.loc[2], width=bar_width, label='Woman')\n",
    "\n",
-    "# Dodanie wartości procentowych do słupków\n",
+    "\n",
    "for i, column in enumerate(categories):\n",
    "    for j, value in enumerate(grouped_data.index):\n",
    "        total = grouped_data[column].sum()\n",
@ -930,19 +894,19 @@
    "        height = grouped_data.loc[value, column]\n",
    "        ax.text(i + j * bar_width, height + 0.2, f'{percent:.0%}', ha='center', va='bottom')  \n",
    "\n",
-    "# Ustawienia etykiet i tytułów\n",
+    "\n",
    "plt.xlabel('Gender')\n",
    "plt.ylabel('Count')\n",
    "plt.title('Distribution of level by gender')\n",
    "\n",
-    "# Dodanie legendy\n",
+    "\n",
    "plt.legend(title='Level')\n",
    "\n",
-    "# Zmiana etykiet osi x\n",
+    "\n",
    "ax.set_xticks([pos + bar_width / 2 for pos in bar_positions_man])\n",
    "ax.set_xticklabels(categories)\n",
    "\n",
-    "# Wyświetlenie wykresu\n",
+    "\n",
    "plt.show()"
   ]
  },
@ -1005,25 +969,25 @@
    "dane['Gender'] = dane['Gender'].replace({1: 'Man', 2: 'Woman'})\n",
    "smoking_counts = dane.groupby(['Smoking', 'Gender']).size()\n",
    "\n",
-    "# Zamiana liczby na procent\n",
+    "\n",
    "smoking_percentages = smoking_counts / smoking_counts.groupby('Gender').sum() * 100\n",
    "\n",
-    "# Sortowanie danych według stopnia 'Smoking'\n",
+    "\n",
    "smoking_percentages_sorted = smoking_percentages.sort_index(level='Smoking', sort_remaining=False)\n",
    "\n",
    "plt.figure(figsize=(14, 6))\n",
    "\n",
-    "# Tworzenie wykresu słupkowego poziomego\n",
+    "\n",
    "ax = smoking_percentages_sorted.plot(kind='barh')\n",
    "\n",
-    "# Dodawanie wartości procentowych do słupków\n",
+    "\n",
    "for i, value in enumerate(smoking_percentages_sorted):\n",
    "    ax.text(value + 0.1, i, f'{value:.2f}%', ha='left', va='center')\n",
    "\n",
-    "# Dodanie legendy\n",
+    "\n",
    "plt.legend()\n",
    "\n",
-    "# Ustawienia etykiet i tytułów\n",
+    "\n",
    "plt.xlabel('Percentage')\n",
    "plt.ylabel('Smoking, Gender')\n",
    "plt.title('Distribution of smoking by gender (%)')\n",
@ -1092,25 +1056,25 @@
    "dane['Gender'] = dane['Gender'].replace({1: 'Man', 2: 'Woman'})\n",
    "smoking_counts = dane.groupby(['Passive Smoker', 'Gender']).size()\n",
    "\n",
-    "# Zamiana liczby na procent\n",
+    "\n",
    "smoking_percentages = smoking_counts / smoking_counts.groupby('Gender').sum() * 100\n",
    "\n",
-    "# Sortowanie danych według stopnia 'Passive smoker'\n",
+    "\n",
    "smoking_percentages_sorted = smoking_percentages.sort_index(level='Passive Smoker', sort_remaining=False)\n",
    "\n",
    "plt.figure(figsize=(15, 6))\n",
    "\n",
-    "# Tworzenie wykresu słupkowego poziomego\n",
+    "\n",
    "ax = smoking_percentages_sorted.plot(kind='barh')\n",
    "\n",
-    "# Dodawanie wartości procentowych do słupków\n",
+    "\n",
    "for i, value in enumerate(smoking_percentages_sorted):\n",
    "    ax.text(value + 0.1, i, f'{value:.2f}%', ha='left', va='center')\n",
    "\n",
-    "# Dodanie legendy\n",
+    "\n",
    "plt.legend()\n",
    "\n",
-    "# Ustawienia etykiet i tytułów\n",
+    "\n",
    "plt.xlabel('Percentage')\n",
    "plt.ylabel('Passive Smoker, Gender')\n",
    "plt.title('Distribution of passive smokers by gender (%)')\n",
@ -1152,22 +1116,22 @@
    "\n",
    "plt.figure(figsize=(15, 6))\n",
    "\n",
-    "# Tworzenie wykresu słupkowego poziomego\n",
+    "\n",
    "ax = Genetic_risk_percentages_sorted.plot(kind='barh')\n",
    "\n",
-    "# Dodawanie wartości procentowych do słupków\n",
+    "\n",
    "for i, value in enumerate(Genetic_risk_percentages_sorted):\n",
    "    ax.text(value + 0.1, i, f'{value:.2f}%', ha='left', va='center')\n",
    "\n",
-    "# Dodanie legendy\n",
+    "\n",
    "plt.legend()\n",
    "\n",
-    "# Ustawienia etykiet i tytułów\n",
+    "\n",
    "plt.xlabel('Percentage')\n",
    "plt.ylabel('Genetic risk, Gender')\n",
    "plt.title('Distribution of genetic risk by gender (%)')\n",
    "\n",
-    "# Wyświetlenie wykresu\n",
+    "\n",
    "plt.show()\n"
   ]
  },
@ -1192,24 +1156,24 @@
    "\n",
    "Genetic_risk_counts = dane.groupby(['Genetic Risk', 'Level']).size()\n",
    "\n",
-    "# Sortowanie danych według ryzyka genetycznego i liczby w odwrotnej kolejności\n",
+    "\n",
    "Genetic_risk_counts_sorted = Genetic_risk_counts.sort_index(level=['Genetic Risk', 'Level'], key=lambda x: x.map({'High': 1, 'Medium': 2, 'Low': 3}))\n",
    "\n",
    "plt.figure(figsize=(10, 6))\n",
    "\n",
-    "# Tworzenie wykresu słupkowego horyzontalnego\n",
+    "\n",
    "ax = Genetic_risk_counts_sorted.plot(kind='barh')\n",
    "\n",
-    "# Dodawanie wartości do słupków\n",
+    "\n",
    "for i, value in enumerate(Genetic_risk_counts_sorted):\n",
    "    ax.text(value + 0.1, i, str(value), ha='left', va='center')\n",
    "\n",
-    "# Ustawienia etykiet i tytułów\n",
+    "\n",
    "plt.xlabel('Count')\n",
    "plt.ylabel('Genetic Risk, Level')\n",
    "plt.title('Distribution of level by genetic risk')\n",
    "\n",
-    "# Wyświetlenie wykresu\n",
+    "\n",
    "plt.show()"
   ]
  },
@ -1403,24 +1367,24 @@
   "source": [
    "air_pollution = dane.groupby(['Air Pollution', 'Level']).size()\n",
    "\n",
-    "# Sortowanie danych według ryzyka genetycznego i liczby w odwrotnej kolejności\n",
+    "\n",
    "air_pollution_sorted = air_pollution.sort_index(level=['Air Pollution', 'Level'], key=lambda x: x.map({'High': 1, 'Medium': 2, 'Low': 3}))\n",
    "\n",
    "plt.figure(figsize=(10, 6))\n",
    "\n",
-    "# Tworzenie wykresu słupkowego horyzontalnego\n",
+    "\n",
    "ax = air_pollution_sorted.plot(kind='barh')\n",
    "\n",
-    "# Dodawanie wartości do słupków\n",
+    "\n",
    "for i, value in enumerate(air_pollution_sorted):\n",
    "    ax.text(value + 0.1, i, str(value), ha='left', va='center')\n",
    "\n",
-    "# Ustawienia etykiet i tytułów\n",
+    "\n",
    "plt.xlabel('Count')\n",
    "plt.ylabel('Air Pollution, Level')\n",
    "plt.title('Distribution of level by air pollution')\n",
    "\n",
-    "# Wyświetlenie wykresu\n",
+    "\n",
    "plt.show()"
   ]
  },
@ -3644,46 +3608,10 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 32,
+   "execution_count": null,
   "id": "322588b1",
   "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Requirement already satisfied: mlxtend in c:\\users\\hp\\anaconda3\\lib\\site-packages (0.23.0)\n",
-      "Requirement already satisfied: scipy>=1.2.1 in c:\\users\\hp\\anaconda3\\lib\\site-packages (from mlxtend) (1.11.4)\n",
-      "Requirement already satisfied: numpy>=1.16.2 in c:\\users\\hp\\anaconda3\\lib\\site-packages (from mlxtend) (1.26.0)\n",
-      "Requirement already satisfied: pandas>=0.24.2 in c:\\users\\hp\\anaconda3\\lib\\site-packages (from mlxtend) (2.1.1)\n",
-      "Requirement already satisfied: scikit-learn>=1.0.2 in c:\\users\\hp\\anaconda3\\lib\\site-packages (from mlxtend) (1.3.2)\n",
-      "Requirement already satisfied: matplotlib>=3.0.0 in c:\\users\\hp\\anaconda3\\lib\\site-packages (from mlxtend) (3.8.0)\n",
-      "Requirement already satisfied: joblib>=0.13.2 in c:\\users\\hp\\anaconda3\\lib\\site-packages (from mlxtend) (1.3.2)\n",
-      "Requirement already satisfied: contourpy>=1.0.1 in c:\\users\\hp\\anaconda3\\lib\\site-packages (from matplotlib>=3.0.0->mlxtend) (1.2.0)\n",
-      "Requirement already satisfied: cycler>=0.10 in c:\\users\\hp\\anaconda3\\lib\\site-packages (from matplotlib>=3.0.0->mlxtend) (0.12.1)\n",
-      "Requirement already satisfied: fonttools>=4.22.0 in c:\\users\\hp\\anaconda3\\lib\\site-packages (from matplotlib>=3.0.0->mlxtend) (4.25.0)\n",
-      "Requirement already satisfied: kiwisolver>=1.0.1 in c:\\users\\hp\\anaconda3\\lib\\site-packages (from matplotlib>=3.0.0->mlxtend) (1.4.4)\n",
-      "Requirement already satisfied: packaging>=20.0 in c:\\users\\hp\\anaconda3\\lib\\site-packages (from matplotlib>=3.0.0->mlxtend) (23.1)\n",
-      "Requirement already satisfied: pillow>=6.2.0 in c:\\users\\hp\\anaconda3\\lib\\site-packages (from matplotlib>=3.0.0->mlxtend) (10.0.1)\n",
-      "Requirement already satisfied: pyparsing>=2.3.1 in c:\\users\\hp\\anaconda3\\lib\\site-packages (from matplotlib>=3.0.0->mlxtend) (3.0.9)\n",
-      "Requirement already satisfied: python-dateutil>=2.7 in c:\\users\\hp\\anaconda3\\lib\\site-packages (from matplotlib>=3.0.0->mlxtend) (2.8.2)\n",
-      "Requirement already satisfied: pytz>=2020.1 in c:\\users\\hp\\anaconda3\\lib\\site-packages (from pandas>=0.24.2->mlxtend) (2023.3.post1)\n",
-      "Requirement already satisfied: tzdata>=2022.1 in c:\\users\\hp\\anaconda3\\lib\\site-packages (from pandas>=0.24.2->mlxtend) (2023.3)\n",
-      "Requirement already satisfied: threadpoolctl>=2.0.0 in c:\\users\\hp\\anaconda3\\lib\\site-packages (from scikit-learn>=1.0.2->mlxtend) (3.2.0)\n",
-      "Requirement already satisfied: six>=1.5 in c:\\users\\hp\\anaconda3\\lib\\site-packages (from python-dateutil>=2.7->matplotlib>=3.0.0->mlxtend) (1.16.0)\n",
-      "Note: you may need to restart the kernel to use updated packages.\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "\n",
-      "[notice] A new release of pip is available: 23.3.2 -> 24.0\n",
-      "[notice] To update, run: python.exe -m pip install --upgrade pip\n"
-     ]
-    }
-   ],
+   "outputs": [],
   "source": [
    "pip install mlxtend"
   ]
@ -3703,20 +3631,6 @@
    "sns.set()\n"
   ]
  },
-  {
-   "cell_type": "code",
-   "execution_count": 34,
-   "id": "cbb6c719",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "#cm = confusion_matrix(y_test, y_pred)\n",
-    "#plot_confusion_matrix(cm)\n",
-    "\n",
-    "#acc = accuracy_score(y_test, y_pred)\n",
-    "#print('Accuracy',':', acc)"
-   ]
-  },
  {
   "cell_type": "code",
   "execution_count": 35,