Updated lung_cancer_data_project.ipynb

This commit is contained in:
ag.gabka@gmail.com 2024-07-23 20:07:53 +02:00
parent 514e7840b3
commit ff760e3e40

View File

@ -32,46 +32,10 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 2, "execution_count": null,
"id": "3b9fd854", "id": "3b9fd854",
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [],
{
"name": "stdout",
"output_type": "stream",
"text": [
"Requirement already satisfied: plotnine in c:\\users\\hp\\anaconda3\\lib\\site-packages (0.12.4)\n",
"Requirement already satisfied: matplotlib>=3.6.0 in c:\\users\\hp\\anaconda3\\lib\\site-packages (from plotnine) (3.8.0)\n",
"Requirement already satisfied: mizani<0.10.0,>0.9.0 in c:\\users\\hp\\anaconda3\\lib\\site-packages (from plotnine) (0.9.3)\n",
"Requirement already satisfied: numpy>=1.23.0 in c:\\users\\hp\\anaconda3\\lib\\site-packages (from plotnine) (1.26.0)\n",
"Requirement already satisfied: pandas>=1.5.0 in c:\\users\\hp\\anaconda3\\lib\\site-packages (from plotnine) (2.1.1)\n",
"Requirement already satisfied: patsy>=0.5.1 in c:\\users\\hp\\anaconda3\\lib\\site-packages (from plotnine) (0.5.5)\n",
"Requirement already satisfied: scipy>=1.5.0 in c:\\users\\hp\\anaconda3\\lib\\site-packages (from plotnine) (1.11.4)\n",
"Requirement already satisfied: statsmodels>=0.14.0 in c:\\users\\hp\\anaconda3\\lib\\site-packages (from plotnine) (0.14.0)\n",
"Requirement already satisfied: contourpy>=1.0.1 in c:\\users\\hp\\anaconda3\\lib\\site-packages (from matplotlib>=3.6.0->plotnine) (1.2.0)\n",
"Requirement already satisfied: cycler>=0.10 in c:\\users\\hp\\anaconda3\\lib\\site-packages (from matplotlib>=3.6.0->plotnine) (0.12.1)\n",
"Requirement already satisfied: fonttools>=4.22.0 in c:\\users\\hp\\anaconda3\\lib\\site-packages (from matplotlib>=3.6.0->plotnine) (4.25.0)\n",
"Requirement already satisfied: kiwisolver>=1.0.1 in c:\\users\\hp\\anaconda3\\lib\\site-packages (from matplotlib>=3.6.0->plotnine) (1.4.4)\n",
"Requirement already satisfied: packaging>=20.0 in c:\\users\\hp\\anaconda3\\lib\\site-packages (from matplotlib>=3.6.0->plotnine) (23.1)\n",
"Requirement already satisfied: pillow>=6.2.0 in c:\\users\\hp\\anaconda3\\lib\\site-packages (from matplotlib>=3.6.0->plotnine) (10.0.1)\n",
"Requirement already satisfied: pyparsing>=2.3.1 in c:\\users\\hp\\anaconda3\\lib\\site-packages (from matplotlib>=3.6.0->plotnine) (3.0.9)\n",
"Requirement already satisfied: python-dateutil>=2.7 in c:\\users\\hp\\anaconda3\\lib\\site-packages (from matplotlib>=3.6.0->plotnine) (2.8.2)\n",
"Requirement already satisfied: tzdata in c:\\users\\hp\\anaconda3\\lib\\site-packages (from mizani<0.10.0,>0.9.0->plotnine) (2023.3)\n",
"Requirement already satisfied: pytz>=2020.1 in c:\\users\\hp\\anaconda3\\lib\\site-packages (from pandas>=1.5.0->plotnine) (2023.3.post1)\n",
"Requirement already satisfied: six in c:\\users\\hp\\anaconda3\\lib\\site-packages (from patsy>=0.5.1->plotnine) (1.16.0)\n",
"Note: you may need to restart the kernel to use updated packages.\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"\n",
"[notice] A new release of pip is available: 23.3.2 -> 24.0\n",
"[notice] To update, run: python.exe -m pip install --upgrade pip\n"
]
}
],
"source": [ "source": [
"pip install plotnine" "pip install plotnine"
] ]
@ -815,7 +779,7 @@
], ],
"source": [ "source": [
"\n", "\n",
"#stworzenie 'binow' dla pokazania wieku pacjentów\n", "\n",
"bins = []\n", "bins = []\n",
"for i in range (0, 101, 10):\n", "for i in range (0, 101, 10):\n",
" bins.append(i)\n", " bins.append(i)\n",
@ -873,19 +837,19 @@
"gender_counts = dane['Gender'].value_counts()\n", "gender_counts = dane['Gender'].value_counts()\n",
"ax = gender_counts.plot(kind='bar', color=['blue', 'pink'])\n", "ax = gender_counts.plot(kind='bar', color=['blue', 'pink'])\n",
"\n", "\n",
"# Dodawanie wartości do słupków\n", "\n",
"for i, value in enumerate(gender_counts):\n", "for i, value in enumerate(gender_counts):\n",
" ax.text(i, value + 0.1, str(value), ha='center', va='bottom')\n", " ax.text(i, value + 0.1, str(value), ha='center', va='bottom')\n",
"\n", "\n",
"# Zmiana etykiet osi x\n", "\n",
"ax.set_xticks([0, 1])\n", "ax.set_xticks([0, 1])\n",
"ax.set_xticklabels(['Man', 'Woman'])\n", "ax.set_xticklabels(['Man', 'Woman'])\n",
"ax.set_title (\"Distribution of patients' gender\")\n", "ax.set_title (\"Distribution of patients' gender\")\n",
"\n", "\n",
"# Dodanie legendy\n", "\n",
"plt.legend()\n", "plt.legend()\n",
"\n", "\n",
"# Wyświetlenie wykresu\n", "\n",
"plt.show()\n" "plt.show()\n"
] ]
}, },
@ -907,22 +871,22 @@
} }
], ],
"source": [ "source": [
"# Grupowanie danych\n", "\n",
"grouped_data = dane.groupby(['Gender', 'Level']).size().unstack()\n", "grouped_data = dane.groupby(['Gender', 'Level']).size().unstack()\n",
"\n", "\n",
"# Ustawienia kategorii i szerokości słupków\n", "\n",
"categories = grouped_data.columns\n", "categories = grouped_data.columns\n",
"bar_width = 0.35\n", "bar_width = 0.35\n",
"bar_positions_man = np.arange(len(categories))\n", "bar_positions_man = np.arange(len(categories))\n",
"bar_positions_woman = [pos + bar_width for pos in bar_positions_man]\n", "bar_positions_woman = [pos + bar_width for pos in bar_positions_man]\n",
"\n", "\n",
"# Wygenerowanie wykresu słupkowego\n", "\n",
"fig, ax = plt.subplots()\n", "fig, ax = plt.subplots()\n",
"\n", "\n",
"ax.bar(bar_positions_man, grouped_data.loc[1], width=bar_width, label='Man')\n", "ax.bar(bar_positions_man, grouped_data.loc[1], width=bar_width, label='Man')\n",
"ax.bar(bar_positions_woman, grouped_data.loc[2], width=bar_width, label='Woman')\n", "ax.bar(bar_positions_woman, grouped_data.loc[2], width=bar_width, label='Woman')\n",
"\n", "\n",
"# Dodanie wartości procentowych do słupków\n", "\n",
"for i, column in enumerate(categories):\n", "for i, column in enumerate(categories):\n",
" for j, value in enumerate(grouped_data.index):\n", " for j, value in enumerate(grouped_data.index):\n",
" total = grouped_data[column].sum()\n", " total = grouped_data[column].sum()\n",
@ -930,19 +894,19 @@
" height = grouped_data.loc[value, column]\n", " height = grouped_data.loc[value, column]\n",
" ax.text(i + j * bar_width, height + 0.2, f'{percent:.0%}', ha='center', va='bottom') \n", " ax.text(i + j * bar_width, height + 0.2, f'{percent:.0%}', ha='center', va='bottom') \n",
"\n", "\n",
"# Ustawienia etykiet i tytułów\n", "\n",
"plt.xlabel('Gender')\n", "plt.xlabel('Gender')\n",
"plt.ylabel('Count')\n", "plt.ylabel('Count')\n",
"plt.title('Distribution of level by gender')\n", "plt.title('Distribution of level by gender')\n",
"\n", "\n",
"# Dodanie legendy\n", "\n",
"plt.legend(title='Level')\n", "plt.legend(title='Level')\n",
"\n", "\n",
"# Zmiana etykiet osi x\n", "\n",
"ax.set_xticks([pos + bar_width / 2 for pos in bar_positions_man])\n", "ax.set_xticks([pos + bar_width / 2 for pos in bar_positions_man])\n",
"ax.set_xticklabels(categories)\n", "ax.set_xticklabels(categories)\n",
"\n", "\n",
"# Wyświetlenie wykresu\n", "\n",
"plt.show()" "plt.show()"
] ]
}, },
@ -1005,25 +969,25 @@
"dane['Gender'] = dane['Gender'].replace({1: 'Man', 2: 'Woman'})\n", "dane['Gender'] = dane['Gender'].replace({1: 'Man', 2: 'Woman'})\n",
"smoking_counts = dane.groupby(['Smoking', 'Gender']).size()\n", "smoking_counts = dane.groupby(['Smoking', 'Gender']).size()\n",
"\n", "\n",
"# Zamiana liczby na procent\n", "\n",
"smoking_percentages = smoking_counts / smoking_counts.groupby('Gender').sum() * 100\n", "smoking_percentages = smoking_counts / smoking_counts.groupby('Gender').sum() * 100\n",
"\n", "\n",
"# Sortowanie danych według stopnia 'Smoking'\n", "\n",
"smoking_percentages_sorted = smoking_percentages.sort_index(level='Smoking', sort_remaining=False)\n", "smoking_percentages_sorted = smoking_percentages.sort_index(level='Smoking', sort_remaining=False)\n",
"\n", "\n",
"plt.figure(figsize=(14, 6))\n", "plt.figure(figsize=(14, 6))\n",
"\n", "\n",
"# Tworzenie wykresu słupkowego poziomego\n", "\n",
"ax = smoking_percentages_sorted.plot(kind='barh')\n", "ax = smoking_percentages_sorted.plot(kind='barh')\n",
"\n", "\n",
"# Dodawanie wartości procentowych do słupków\n", "\n",
"for i, value in enumerate(smoking_percentages_sorted):\n", "for i, value in enumerate(smoking_percentages_sorted):\n",
" ax.text(value + 0.1, i, f'{value:.2f}%', ha='left', va='center')\n", " ax.text(value + 0.1, i, f'{value:.2f}%', ha='left', va='center')\n",
"\n", "\n",
"# Dodanie legendy\n", "\n",
"plt.legend()\n", "plt.legend()\n",
"\n", "\n",
"# Ustawienia etykiet i tytułów\n", "\n",
"plt.xlabel('Percentage')\n", "plt.xlabel('Percentage')\n",
"plt.ylabel('Smoking, Gender')\n", "plt.ylabel('Smoking, Gender')\n",
"plt.title('Distribution of smoking by gender (%)')\n", "plt.title('Distribution of smoking by gender (%)')\n",
@ -1092,25 +1056,25 @@
"dane['Gender'] = dane['Gender'].replace({1: 'Man', 2: 'Woman'})\n", "dane['Gender'] = dane['Gender'].replace({1: 'Man', 2: 'Woman'})\n",
"smoking_counts = dane.groupby(['Passive Smoker', 'Gender']).size()\n", "smoking_counts = dane.groupby(['Passive Smoker', 'Gender']).size()\n",
"\n", "\n",
"# Zamiana liczby na procent\n", "\n",
"smoking_percentages = smoking_counts / smoking_counts.groupby('Gender').sum() * 100\n", "smoking_percentages = smoking_counts / smoking_counts.groupby('Gender').sum() * 100\n",
"\n", "\n",
"# Sortowanie danych według stopnia 'Passive smoker'\n", "\n",
"smoking_percentages_sorted = smoking_percentages.sort_index(level='Passive Smoker', sort_remaining=False)\n", "smoking_percentages_sorted = smoking_percentages.sort_index(level='Passive Smoker', sort_remaining=False)\n",
"\n", "\n",
"plt.figure(figsize=(15, 6))\n", "plt.figure(figsize=(15, 6))\n",
"\n", "\n",
"# Tworzenie wykresu słupkowego poziomego\n", "\n",
"ax = smoking_percentages_sorted.plot(kind='barh')\n", "ax = smoking_percentages_sorted.plot(kind='barh')\n",
"\n", "\n",
"# Dodawanie wartości procentowych do słupków\n", "\n",
"for i, value in enumerate(smoking_percentages_sorted):\n", "for i, value in enumerate(smoking_percentages_sorted):\n",
" ax.text(value + 0.1, i, f'{value:.2f}%', ha='left', va='center')\n", " ax.text(value + 0.1, i, f'{value:.2f}%', ha='left', va='center')\n",
"\n", "\n",
"# Dodanie legendy\n", "\n",
"plt.legend()\n", "plt.legend()\n",
"\n", "\n",
"# Ustawienia etykiet i tytułów\n", "\n",
"plt.xlabel('Percentage')\n", "plt.xlabel('Percentage')\n",
"plt.ylabel('Passive Smoker, Gender')\n", "plt.ylabel('Passive Smoker, Gender')\n",
"plt.title('Distribution of passive smokers by gender (%)')\n", "plt.title('Distribution of passive smokers by gender (%)')\n",
@ -1152,22 +1116,22 @@
"\n", "\n",
"plt.figure(figsize=(15, 6))\n", "plt.figure(figsize=(15, 6))\n",
"\n", "\n",
"# Tworzenie wykresu słupkowego poziomego\n", "\n",
"ax = Genetic_risk_percentages_sorted.plot(kind='barh')\n", "ax = Genetic_risk_percentages_sorted.plot(kind='barh')\n",
"\n", "\n",
"# Dodawanie wartości procentowych do słupków\n", "\n",
"for i, value in enumerate(Genetic_risk_percentages_sorted):\n", "for i, value in enumerate(Genetic_risk_percentages_sorted):\n",
" ax.text(value + 0.1, i, f'{value:.2f}%', ha='left', va='center')\n", " ax.text(value + 0.1, i, f'{value:.2f}%', ha='left', va='center')\n",
"\n", "\n",
"# Dodanie legendy\n", "\n",
"plt.legend()\n", "plt.legend()\n",
"\n", "\n",
"# Ustawienia etykiet i tytułów\n", "\n",
"plt.xlabel('Percentage')\n", "plt.xlabel('Percentage')\n",
"plt.ylabel('Genetic risk, Gender')\n", "plt.ylabel('Genetic risk, Gender')\n",
"plt.title('Distribution of genetic risk by gender (%)')\n", "plt.title('Distribution of genetic risk by gender (%)')\n",
"\n", "\n",
"# Wyświetlenie wykresu\n", "\n",
"plt.show()\n" "plt.show()\n"
] ]
}, },
@ -1192,24 +1156,24 @@
"\n", "\n",
"Genetic_risk_counts = dane.groupby(['Genetic Risk', 'Level']).size()\n", "Genetic_risk_counts = dane.groupby(['Genetic Risk', 'Level']).size()\n",
"\n", "\n",
"# Sortowanie danych według ryzyka genetycznego i liczby w odwrotnej kolejności\n", "\n",
"Genetic_risk_counts_sorted = Genetic_risk_counts.sort_index(level=['Genetic Risk', 'Level'], key=lambda x: x.map({'High': 1, 'Medium': 2, 'Low': 3}))\n", "Genetic_risk_counts_sorted = Genetic_risk_counts.sort_index(level=['Genetic Risk', 'Level'], key=lambda x: x.map({'High': 1, 'Medium': 2, 'Low': 3}))\n",
"\n", "\n",
"plt.figure(figsize=(10, 6))\n", "plt.figure(figsize=(10, 6))\n",
"\n", "\n",
"# Tworzenie wykresu słupkowego horyzontalnego\n", "\n",
"ax = Genetic_risk_counts_sorted.plot(kind='barh')\n", "ax = Genetic_risk_counts_sorted.plot(kind='barh')\n",
"\n", "\n",
"# Dodawanie wartości do słupków\n", "\n",
"for i, value in enumerate(Genetic_risk_counts_sorted):\n", "for i, value in enumerate(Genetic_risk_counts_sorted):\n",
" ax.text(value + 0.1, i, str(value), ha='left', va='center')\n", " ax.text(value + 0.1, i, str(value), ha='left', va='center')\n",
"\n", "\n",
"# Ustawienia etykiet i tytułów\n", "\n",
"plt.xlabel('Count')\n", "plt.xlabel('Count')\n",
"plt.ylabel('Genetic Risk, Level')\n", "plt.ylabel('Genetic Risk, Level')\n",
"plt.title('Distribution of level by genetic risk')\n", "plt.title('Distribution of level by genetic risk')\n",
"\n", "\n",
"# Wyświetlenie wykresu\n", "\n",
"plt.show()" "plt.show()"
] ]
}, },
@ -1403,24 +1367,24 @@
"source": [ "source": [
"air_pollution = dane.groupby(['Air Pollution', 'Level']).size()\n", "air_pollution = dane.groupby(['Air Pollution', 'Level']).size()\n",
"\n", "\n",
"# Sortowanie danych według ryzyka genetycznego i liczby w odwrotnej kolejności\n", "\n",
"air_pollution_sorted = air_pollution.sort_index(level=['Air Pollution', 'Level'], key=lambda x: x.map({'High': 1, 'Medium': 2, 'Low': 3}))\n", "air_pollution_sorted = air_pollution.sort_index(level=['Air Pollution', 'Level'], key=lambda x: x.map({'High': 1, 'Medium': 2, 'Low': 3}))\n",
"\n", "\n",
"plt.figure(figsize=(10, 6))\n", "plt.figure(figsize=(10, 6))\n",
"\n", "\n",
"# Tworzenie wykresu słupkowego horyzontalnego\n", "\n",
"ax = air_pollution_sorted.plot(kind='barh')\n", "ax = air_pollution_sorted.plot(kind='barh')\n",
"\n", "\n",
"# Dodawanie wartości do słupków\n", "\n",
"for i, value in enumerate(air_pollution_sorted):\n", "for i, value in enumerate(air_pollution_sorted):\n",
" ax.text(value + 0.1, i, str(value), ha='left', va='center')\n", " ax.text(value + 0.1, i, str(value), ha='left', va='center')\n",
"\n", "\n",
"# Ustawienia etykiet i tytułów\n", "\n",
"plt.xlabel('Count')\n", "plt.xlabel('Count')\n",
"plt.ylabel('Air Pollution, Level')\n", "plt.ylabel('Air Pollution, Level')\n",
"plt.title('Distribution of level by air pollution')\n", "plt.title('Distribution of level by air pollution')\n",
"\n", "\n",
"# Wyświetlenie wykresu\n", "\n",
"plt.show()" "plt.show()"
] ]
}, },
@ -3644,46 +3608,10 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 32, "execution_count": null,
"id": "322588b1", "id": "322588b1",
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [],
{
"name": "stdout",
"output_type": "stream",
"text": [
"Requirement already satisfied: mlxtend in c:\\users\\hp\\anaconda3\\lib\\site-packages (0.23.0)\n",
"Requirement already satisfied: scipy>=1.2.1 in c:\\users\\hp\\anaconda3\\lib\\site-packages (from mlxtend) (1.11.4)\n",
"Requirement already satisfied: numpy>=1.16.2 in c:\\users\\hp\\anaconda3\\lib\\site-packages (from mlxtend) (1.26.0)\n",
"Requirement already satisfied: pandas>=0.24.2 in c:\\users\\hp\\anaconda3\\lib\\site-packages (from mlxtend) (2.1.1)\n",
"Requirement already satisfied: scikit-learn>=1.0.2 in c:\\users\\hp\\anaconda3\\lib\\site-packages (from mlxtend) (1.3.2)\n",
"Requirement already satisfied: matplotlib>=3.0.0 in c:\\users\\hp\\anaconda3\\lib\\site-packages (from mlxtend) (3.8.0)\n",
"Requirement already satisfied: joblib>=0.13.2 in c:\\users\\hp\\anaconda3\\lib\\site-packages (from mlxtend) (1.3.2)\n",
"Requirement already satisfied: contourpy>=1.0.1 in c:\\users\\hp\\anaconda3\\lib\\site-packages (from matplotlib>=3.0.0->mlxtend) (1.2.0)\n",
"Requirement already satisfied: cycler>=0.10 in c:\\users\\hp\\anaconda3\\lib\\site-packages (from matplotlib>=3.0.0->mlxtend) (0.12.1)\n",
"Requirement already satisfied: fonttools>=4.22.0 in c:\\users\\hp\\anaconda3\\lib\\site-packages (from matplotlib>=3.0.0->mlxtend) (4.25.0)\n",
"Requirement already satisfied: kiwisolver>=1.0.1 in c:\\users\\hp\\anaconda3\\lib\\site-packages (from matplotlib>=3.0.0->mlxtend) (1.4.4)\n",
"Requirement already satisfied: packaging>=20.0 in c:\\users\\hp\\anaconda3\\lib\\site-packages (from matplotlib>=3.0.0->mlxtend) (23.1)\n",
"Requirement already satisfied: pillow>=6.2.0 in c:\\users\\hp\\anaconda3\\lib\\site-packages (from matplotlib>=3.0.0->mlxtend) (10.0.1)\n",
"Requirement already satisfied: pyparsing>=2.3.1 in c:\\users\\hp\\anaconda3\\lib\\site-packages (from matplotlib>=3.0.0->mlxtend) (3.0.9)\n",
"Requirement already satisfied: python-dateutil>=2.7 in c:\\users\\hp\\anaconda3\\lib\\site-packages (from matplotlib>=3.0.0->mlxtend) (2.8.2)\n",
"Requirement already satisfied: pytz>=2020.1 in c:\\users\\hp\\anaconda3\\lib\\site-packages (from pandas>=0.24.2->mlxtend) (2023.3.post1)\n",
"Requirement already satisfied: tzdata>=2022.1 in c:\\users\\hp\\anaconda3\\lib\\site-packages (from pandas>=0.24.2->mlxtend) (2023.3)\n",
"Requirement already satisfied: threadpoolctl>=2.0.0 in c:\\users\\hp\\anaconda3\\lib\\site-packages (from scikit-learn>=1.0.2->mlxtend) (3.2.0)\n",
"Requirement already satisfied: six>=1.5 in c:\\users\\hp\\anaconda3\\lib\\site-packages (from python-dateutil>=2.7->matplotlib>=3.0.0->mlxtend) (1.16.0)\n",
"Note: you may need to restart the kernel to use updated packages.\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"\n",
"[notice] A new release of pip is available: 23.3.2 -> 24.0\n",
"[notice] To update, run: python.exe -m pip install --upgrade pip\n"
]
}
],
"source": [ "source": [
"pip install mlxtend" "pip install mlxtend"
] ]
@ -3703,20 +3631,6 @@
"sns.set()\n" "sns.set()\n"
] ]
}, },
{
"cell_type": "code",
"execution_count": 34,
"id": "cbb6c719",
"metadata": {},
"outputs": [],
"source": [
"#cm = confusion_matrix(y_test, y_pred)\n",
"#plot_confusion_matrix(cm)\n",
"\n",
"#acc = accuracy_score(y_test, y_pred)\n",
"#print('Accuracy',':', acc)"
]
},
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 35, "execution_count": 35,