From ff760e3e40a03ceb33b11126e04d1896e397267f Mon Sep 17 00:00:00 2001 From: "ag.gabka@gmail.com" Date: Tue, 23 Jul 2024 20:07:53 +0200 Subject: [PATCH] Updated lung_cancer_data_project.ipynb --- lung_cancer_data_project.ipynb | 174 +++++++++------------------------ 1 file changed, 44 insertions(+), 130 deletions(-) diff --git a/lung_cancer_data_project.ipynb b/lung_cancer_data_project.ipynb index 6dc5c9c..53c85b2 100644 --- a/lung_cancer_data_project.ipynb +++ b/lung_cancer_data_project.ipynb @@ -32,46 +32,10 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": null, "id": "3b9fd854", "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Requirement already satisfied: plotnine in c:\\users\\hp\\anaconda3\\lib\\site-packages (0.12.4)\n", - "Requirement already satisfied: matplotlib>=3.6.0 in c:\\users\\hp\\anaconda3\\lib\\site-packages (from plotnine) (3.8.0)\n", - "Requirement already satisfied: mizani<0.10.0,>0.9.0 in c:\\users\\hp\\anaconda3\\lib\\site-packages (from plotnine) (0.9.3)\n", - "Requirement already satisfied: numpy>=1.23.0 in c:\\users\\hp\\anaconda3\\lib\\site-packages (from plotnine) (1.26.0)\n", - "Requirement already satisfied: pandas>=1.5.0 in c:\\users\\hp\\anaconda3\\lib\\site-packages (from plotnine) (2.1.1)\n", - "Requirement already satisfied: patsy>=0.5.1 in c:\\users\\hp\\anaconda3\\lib\\site-packages (from plotnine) (0.5.5)\n", - "Requirement already satisfied: scipy>=1.5.0 in c:\\users\\hp\\anaconda3\\lib\\site-packages (from plotnine) (1.11.4)\n", - "Requirement already satisfied: statsmodels>=0.14.0 in c:\\users\\hp\\anaconda3\\lib\\site-packages (from plotnine) (0.14.0)\n", - "Requirement already satisfied: contourpy>=1.0.1 in c:\\users\\hp\\anaconda3\\lib\\site-packages (from matplotlib>=3.6.0->plotnine) (1.2.0)\n", - "Requirement already satisfied: cycler>=0.10 in c:\\users\\hp\\anaconda3\\lib\\site-packages (from matplotlib>=3.6.0->plotnine) (0.12.1)\n", - "Requirement already satisfied: fonttools>=4.22.0 in c:\\users\\hp\\anaconda3\\lib\\site-packages (from matplotlib>=3.6.0->plotnine) (4.25.0)\n", - "Requirement already satisfied: kiwisolver>=1.0.1 in c:\\users\\hp\\anaconda3\\lib\\site-packages (from matplotlib>=3.6.0->plotnine) (1.4.4)\n", - "Requirement already satisfied: packaging>=20.0 in c:\\users\\hp\\anaconda3\\lib\\site-packages (from matplotlib>=3.6.0->plotnine) (23.1)\n", - "Requirement already satisfied: pillow>=6.2.0 in c:\\users\\hp\\anaconda3\\lib\\site-packages (from matplotlib>=3.6.0->plotnine) (10.0.1)\n", - "Requirement already satisfied: pyparsing>=2.3.1 in c:\\users\\hp\\anaconda3\\lib\\site-packages (from matplotlib>=3.6.0->plotnine) (3.0.9)\n", - "Requirement already satisfied: python-dateutil>=2.7 in c:\\users\\hp\\anaconda3\\lib\\site-packages (from matplotlib>=3.6.0->plotnine) (2.8.2)\n", - "Requirement already satisfied: tzdata in c:\\users\\hp\\anaconda3\\lib\\site-packages (from mizani<0.10.0,>0.9.0->plotnine) (2023.3)\n", - "Requirement already satisfied: pytz>=2020.1 in c:\\users\\hp\\anaconda3\\lib\\site-packages (from pandas>=1.5.0->plotnine) (2023.3.post1)\n", - "Requirement already satisfied: six in c:\\users\\hp\\anaconda3\\lib\\site-packages (from patsy>=0.5.1->plotnine) (1.16.0)\n", - "Note: you may need to restart the kernel to use updated packages.\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "\n", - "[notice] A new release of pip is available: 23.3.2 -> 24.0\n", - "[notice] To update, run: python.exe -m pip install --upgrade pip\n" - ] - } - ], + "outputs": [], "source": [ "pip install plotnine" ] @@ -815,7 +779,7 @@ ], "source": [ "\n", - "#stworzenie 'binow' dla pokazania wieku pacjentów\n", + "\n", "bins = []\n", "for i in range (0, 101, 10):\n", " bins.append(i)\n", @@ -873,19 +837,19 @@ "gender_counts = dane['Gender'].value_counts()\n", "ax = gender_counts.plot(kind='bar', color=['blue', 'pink'])\n", "\n", - "# Dodawanie wartości do słupków\n", + "\n", "for i, value in enumerate(gender_counts):\n", " ax.text(i, value + 0.1, str(value), ha='center', va='bottom')\n", "\n", - "# Zmiana etykiet osi x\n", + "\n", "ax.set_xticks([0, 1])\n", "ax.set_xticklabels(['Man', 'Woman'])\n", "ax.set_title (\"Distribution of patients' gender\")\n", "\n", - "# Dodanie legendy\n", + "\n", "plt.legend()\n", "\n", - "# Wyświetlenie wykresu\n", + "\n", "plt.show()\n" ] }, @@ -907,22 +871,22 @@ } ], "source": [ - "# Grupowanie danych\n", + "\n", "grouped_data = dane.groupby(['Gender', 'Level']).size().unstack()\n", "\n", - "# Ustawienia kategorii i szerokości słupków\n", + "\n", "categories = grouped_data.columns\n", "bar_width = 0.35\n", "bar_positions_man = np.arange(len(categories))\n", "bar_positions_woman = [pos + bar_width for pos in bar_positions_man]\n", "\n", - "# Wygenerowanie wykresu słupkowego\n", + "\n", "fig, ax = plt.subplots()\n", "\n", "ax.bar(bar_positions_man, grouped_data.loc[1], width=bar_width, label='Man')\n", "ax.bar(bar_positions_woman, grouped_data.loc[2], width=bar_width, label='Woman')\n", "\n", - "# Dodanie wartości procentowych do słupków\n", + "\n", "for i, column in enumerate(categories):\n", " for j, value in enumerate(grouped_data.index):\n", " total = grouped_data[column].sum()\n", @@ -930,19 +894,19 @@ " height = grouped_data.loc[value, column]\n", " ax.text(i + j * bar_width, height + 0.2, f'{percent:.0%}', ha='center', va='bottom') \n", "\n", - "# Ustawienia etykiet i tytułów\n", + "\n", "plt.xlabel('Gender')\n", "plt.ylabel('Count')\n", "plt.title('Distribution of level by gender')\n", "\n", - "# Dodanie legendy\n", + "\n", "plt.legend(title='Level')\n", "\n", - "# Zmiana etykiet osi x\n", + "\n", "ax.set_xticks([pos + bar_width / 2 for pos in bar_positions_man])\n", "ax.set_xticklabels(categories)\n", "\n", - "# Wyświetlenie wykresu\n", + "\n", "plt.show()" ] }, @@ -1005,25 +969,25 @@ "dane['Gender'] = dane['Gender'].replace({1: 'Man', 2: 'Woman'})\n", "smoking_counts = dane.groupby(['Smoking', 'Gender']).size()\n", "\n", - "# Zamiana liczby na procent\n", + "\n", "smoking_percentages = smoking_counts / smoking_counts.groupby('Gender').sum() * 100\n", "\n", - "# Sortowanie danych według stopnia 'Smoking'\n", + "\n", "smoking_percentages_sorted = smoking_percentages.sort_index(level='Smoking', sort_remaining=False)\n", "\n", "plt.figure(figsize=(14, 6))\n", "\n", - "# Tworzenie wykresu słupkowego poziomego\n", + "\n", "ax = smoking_percentages_sorted.plot(kind='barh')\n", "\n", - "# Dodawanie wartości procentowych do słupków\n", + "\n", "for i, value in enumerate(smoking_percentages_sorted):\n", " ax.text(value + 0.1, i, f'{value:.2f}%', ha='left', va='center')\n", "\n", - "# Dodanie legendy\n", + "\n", "plt.legend()\n", "\n", - "# Ustawienia etykiet i tytułów\n", + "\n", "plt.xlabel('Percentage')\n", "plt.ylabel('Smoking, Gender')\n", "plt.title('Distribution of smoking by gender (%)')\n", @@ -1092,25 +1056,25 @@ "dane['Gender'] = dane['Gender'].replace({1: 'Man', 2: 'Woman'})\n", "smoking_counts = dane.groupby(['Passive Smoker', 'Gender']).size()\n", "\n", - "# Zamiana liczby na procent\n", + "\n", "smoking_percentages = smoking_counts / smoking_counts.groupby('Gender').sum() * 100\n", "\n", - "# Sortowanie danych według stopnia 'Passive smoker'\n", + "\n", "smoking_percentages_sorted = smoking_percentages.sort_index(level='Passive Smoker', sort_remaining=False)\n", "\n", "plt.figure(figsize=(15, 6))\n", "\n", - "# Tworzenie wykresu słupkowego poziomego\n", + "\n", "ax = smoking_percentages_sorted.plot(kind='barh')\n", "\n", - "# Dodawanie wartości procentowych do słupków\n", + "\n", "for i, value in enumerate(smoking_percentages_sorted):\n", " ax.text(value + 0.1, i, f'{value:.2f}%', ha='left', va='center')\n", "\n", - "# Dodanie legendy\n", + "\n", "plt.legend()\n", "\n", - "# Ustawienia etykiet i tytułów\n", + "\n", "plt.xlabel('Percentage')\n", "plt.ylabel('Passive Smoker, Gender')\n", "plt.title('Distribution of passive smokers by gender (%)')\n", @@ -1152,22 +1116,22 @@ "\n", "plt.figure(figsize=(15, 6))\n", "\n", - "# Tworzenie wykresu słupkowego poziomego\n", + "\n", "ax = Genetic_risk_percentages_sorted.plot(kind='barh')\n", "\n", - "# Dodawanie wartości procentowych do słupków\n", + "\n", "for i, value in enumerate(Genetic_risk_percentages_sorted):\n", " ax.text(value + 0.1, i, f'{value:.2f}%', ha='left', va='center')\n", "\n", - "# Dodanie legendy\n", + "\n", "plt.legend()\n", "\n", - "# Ustawienia etykiet i tytułów\n", + "\n", "plt.xlabel('Percentage')\n", "plt.ylabel('Genetic risk, Gender')\n", "plt.title('Distribution of genetic risk by gender (%)')\n", "\n", - "# Wyświetlenie wykresu\n", + "\n", "plt.show()\n" ] }, @@ -1192,24 +1156,24 @@ "\n", "Genetic_risk_counts = dane.groupby(['Genetic Risk', 'Level']).size()\n", "\n", - "# Sortowanie danych według ryzyka genetycznego i liczby w odwrotnej kolejności\n", + "\n", "Genetic_risk_counts_sorted = Genetic_risk_counts.sort_index(level=['Genetic Risk', 'Level'], key=lambda x: x.map({'High': 1, 'Medium': 2, 'Low': 3}))\n", "\n", "plt.figure(figsize=(10, 6))\n", "\n", - "# Tworzenie wykresu słupkowego horyzontalnego\n", + "\n", "ax = Genetic_risk_counts_sorted.plot(kind='barh')\n", "\n", - "# Dodawanie wartości do słupków\n", + "\n", "for i, value in enumerate(Genetic_risk_counts_sorted):\n", " ax.text(value + 0.1, i, str(value), ha='left', va='center')\n", "\n", - "# Ustawienia etykiet i tytułów\n", + "\n", "plt.xlabel('Count')\n", "plt.ylabel('Genetic Risk, Level')\n", "plt.title('Distribution of level by genetic risk')\n", "\n", - "# Wyświetlenie wykresu\n", + "\n", "plt.show()" ] }, @@ -1403,24 +1367,24 @@ "source": [ "air_pollution = dane.groupby(['Air Pollution', 'Level']).size()\n", "\n", - "# Sortowanie danych według ryzyka genetycznego i liczby w odwrotnej kolejności\n", + "\n", "air_pollution_sorted = air_pollution.sort_index(level=['Air Pollution', 'Level'], key=lambda x: x.map({'High': 1, 'Medium': 2, 'Low': 3}))\n", "\n", "plt.figure(figsize=(10, 6))\n", "\n", - "# Tworzenie wykresu słupkowego horyzontalnego\n", + "\n", "ax = air_pollution_sorted.plot(kind='barh')\n", "\n", - "# Dodawanie wartości do słupków\n", + "\n", "for i, value in enumerate(air_pollution_sorted):\n", " ax.text(value + 0.1, i, str(value), ha='left', va='center')\n", "\n", - "# Ustawienia etykiet i tytułów\n", + "\n", "plt.xlabel('Count')\n", "plt.ylabel('Air Pollution, Level')\n", "plt.title('Distribution of level by air pollution')\n", "\n", - "# Wyświetlenie wykresu\n", + "\n", "plt.show()" ] }, @@ -3644,46 +3608,10 @@ }, { "cell_type": "code", - "execution_count": 32, + "execution_count": null, "id": "322588b1", "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Requirement already satisfied: mlxtend in c:\\users\\hp\\anaconda3\\lib\\site-packages (0.23.0)\n", - "Requirement already satisfied: scipy>=1.2.1 in c:\\users\\hp\\anaconda3\\lib\\site-packages (from mlxtend) (1.11.4)\n", - "Requirement already satisfied: numpy>=1.16.2 in c:\\users\\hp\\anaconda3\\lib\\site-packages (from mlxtend) (1.26.0)\n", - "Requirement already satisfied: pandas>=0.24.2 in c:\\users\\hp\\anaconda3\\lib\\site-packages (from mlxtend) (2.1.1)\n", - "Requirement already satisfied: scikit-learn>=1.0.2 in c:\\users\\hp\\anaconda3\\lib\\site-packages (from mlxtend) (1.3.2)\n", - "Requirement already satisfied: matplotlib>=3.0.0 in c:\\users\\hp\\anaconda3\\lib\\site-packages (from mlxtend) (3.8.0)\n", - "Requirement already satisfied: joblib>=0.13.2 in c:\\users\\hp\\anaconda3\\lib\\site-packages (from mlxtend) (1.3.2)\n", - "Requirement already satisfied: contourpy>=1.0.1 in c:\\users\\hp\\anaconda3\\lib\\site-packages (from matplotlib>=3.0.0->mlxtend) (1.2.0)\n", - "Requirement already satisfied: cycler>=0.10 in c:\\users\\hp\\anaconda3\\lib\\site-packages (from matplotlib>=3.0.0->mlxtend) (0.12.1)\n", - "Requirement already satisfied: fonttools>=4.22.0 in c:\\users\\hp\\anaconda3\\lib\\site-packages (from matplotlib>=3.0.0->mlxtend) (4.25.0)\n", - "Requirement already satisfied: kiwisolver>=1.0.1 in c:\\users\\hp\\anaconda3\\lib\\site-packages (from matplotlib>=3.0.0->mlxtend) (1.4.4)\n", - "Requirement already satisfied: packaging>=20.0 in c:\\users\\hp\\anaconda3\\lib\\site-packages (from matplotlib>=3.0.0->mlxtend) (23.1)\n", - "Requirement already satisfied: pillow>=6.2.0 in c:\\users\\hp\\anaconda3\\lib\\site-packages (from matplotlib>=3.0.0->mlxtend) (10.0.1)\n", - "Requirement already satisfied: pyparsing>=2.3.1 in c:\\users\\hp\\anaconda3\\lib\\site-packages (from matplotlib>=3.0.0->mlxtend) (3.0.9)\n", - "Requirement already satisfied: python-dateutil>=2.7 in c:\\users\\hp\\anaconda3\\lib\\site-packages (from matplotlib>=3.0.0->mlxtend) (2.8.2)\n", - "Requirement already satisfied: pytz>=2020.1 in c:\\users\\hp\\anaconda3\\lib\\site-packages (from pandas>=0.24.2->mlxtend) (2023.3.post1)\n", - "Requirement already satisfied: tzdata>=2022.1 in c:\\users\\hp\\anaconda3\\lib\\site-packages (from pandas>=0.24.2->mlxtend) (2023.3)\n", - "Requirement already satisfied: threadpoolctl>=2.0.0 in c:\\users\\hp\\anaconda3\\lib\\site-packages (from scikit-learn>=1.0.2->mlxtend) (3.2.0)\n", - "Requirement already satisfied: six>=1.5 in c:\\users\\hp\\anaconda3\\lib\\site-packages (from python-dateutil>=2.7->matplotlib>=3.0.0->mlxtend) (1.16.0)\n", - "Note: you may need to restart the kernel to use updated packages.\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "\n", - "[notice] A new release of pip is available: 23.3.2 -> 24.0\n", - "[notice] To update, run: python.exe -m pip install --upgrade pip\n" - ] - } - ], + "outputs": [], "source": [ "pip install mlxtend" ] @@ -3703,20 +3631,6 @@ "sns.set()\n" ] }, - { - "cell_type": "code", - "execution_count": 34, - "id": "cbb6c719", - "metadata": {}, - "outputs": [], - "source": [ - "#cm = confusion_matrix(y_test, y_pred)\n", - "#plot_confusion_matrix(cm)\n", - "\n", - "#acc = accuracy_score(y_test, y_pred)\n", - "#print('Accuracy',':', acc)" - ] - }, { "cell_type": "code", "execution_count": 35,