121 KiB
121 KiB
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# Load the patient data set; the first CSV column is used as the row index.
# NOTE(review): the path is absolute and machine-specific, so this cell only
# runs on the original author's computer.
dane = pd.read_csv(
    r'C:\Users\HP\Desktop\podyplomówka\cancer_patient_data_sets.csv',
    index_col=0,
)
# Preview the first five rows.
dane.head(n=5)
Patient Id | Age | Gender | Air Pollution | Alcohol use | Dust Allergy | OccuPational Hazards | Genetic Risk | chronic Lung Disease | Balanced Diet | ... | Fatigue | Weight Loss | Shortness of Breath | Wheezing | Swallowing Difficulty | Clubbing of Finger Nails | Frequent Cold | Dry Cough | Snoring | Level | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
index | |||||||||||||||||||||
0 | P1 | 33 | 1 | 2 | 4 | 5 | 4 | 3 | 2 | 2 | ... | 3 | 4 | 2 | 2 | 3 | 1 | 2 | 3 | 4 | Low |
1 | P10 | 17 | 1 | 3 | 1 | 5 | 3 | 4 | 2 | 2 | ... | 1 | 3 | 7 | 8 | 6 | 2 | 1 | 7 | 2 | Medium |
2 | P100 | 35 | 1 | 4 | 5 | 6 | 5 | 5 | 4 | 6 | ... | 8 | 7 | 9 | 2 | 1 | 4 | 6 | 7 | 2 | High |
3 | P1000 | 37 | 1 | 7 | 7 | 7 | 7 | 6 | 7 | 7 | ... | 4 | 2 | 3 | 1 | 4 | 5 | 6 | 7 | 5 | High |
4 | P101 | 46 | 1 | 6 | 8 | 7 | 7 | 7 | 6 | 7 | ... | 3 | 2 | 4 | 1 | 4 | 2 | 4 | 2 | 3 | High |
5 rows × 25 columns
# Print a summary of the frame: index, columns, non-null counts and dtypes.
dane.info()
<class 'pandas.core.frame.DataFrame'> Index: 1000 entries, 0 to 999 Data columns (total 25 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Patient Id 1000 non-null object 1 Age 1000 non-null int64 2 Gender 1000 non-null int64 3 Air Pollution 1000 non-null int64 4 Alcohol use 1000 non-null int64 5 Dust Allergy 1000 non-null int64 6 OccuPational Hazards 1000 non-null int64 7 Genetic Risk 1000 non-null int64 8 chronic Lung Disease 1000 non-null int64 9 Balanced Diet 1000 non-null int64 10 Obesity 1000 non-null int64 11 Smoking 1000 non-null int64 12 Passive Smoker 1000 non-null int64 13 Chest Pain 1000 non-null int64 14 Coughing of Blood 1000 non-null int64 15 Fatigue 1000 non-null int64 16 Weight Loss 1000 non-null int64 17 Shortness of Breath 1000 non-null int64 18 Wheezing 1000 non-null int64 19 Swallowing Difficulty 1000 non-null int64 20 Clubbing of Finger Nails 1000 non-null int64 21 Frequent Cold 1000 non-null int64 22 Dry Cough 1000 non-null int64 23 Snoring 1000 non-null int64 24 Level 1000 non-null object dtypes: int64(23), object(2) memory usage: 203.1+ KB
# Descriptive statistics, transposed so that each column of `dane` is a row.
dane.describe().transpose()
count | mean | std | min | 25% | 50% | 75% | max | |
---|---|---|---|---|---|---|---|---|
Age | 1000.0 | 37.174 | 12.005493 | 14.0 | 27.75 | 36.0 | 45.0 | 73.0 |
Gender | 1000.0 | 1.402 | 0.490547 | 1.0 | 1.00 | 1.0 | 2.0 | 2.0 |
Air Pollution | 1000.0 | 3.840 | 2.030400 | 1.0 | 2.00 | 3.0 | 6.0 | 8.0 |
Alcohol use | 1000.0 | 4.563 | 2.620477 | 1.0 | 2.00 | 5.0 | 7.0 | 8.0 |
Dust Allergy | 1000.0 | 5.165 | 1.980833 | 1.0 | 4.00 | 6.0 | 7.0 | 8.0 |
OccuPational Hazards | 1000.0 | 4.840 | 2.107805 | 1.0 | 3.00 | 5.0 | 7.0 | 8.0 |
Genetic Risk | 1000.0 | 4.580 | 2.126999 | 1.0 | 2.00 | 5.0 | 7.0 | 7.0 |
chronic Lung Disease | 1000.0 | 4.380 | 1.848518 | 1.0 | 3.00 | 4.0 | 6.0 | 7.0 |
Balanced Diet | 1000.0 | 4.491 | 2.135528 | 1.0 | 2.00 | 4.0 | 7.0 | 7.0 |
Obesity | 1000.0 | 4.465 | 2.124921 | 1.0 | 3.00 | 4.0 | 7.0 | 7.0 |
Smoking | 1000.0 | 3.948 | 2.495902 | 1.0 | 2.00 | 3.0 | 7.0 | 8.0 |
Passive Smoker | 1000.0 | 4.195 | 2.311778 | 1.0 | 2.00 | 4.0 | 7.0 | 8.0 |
Chest Pain | 1000.0 | 4.438 | 2.280209 | 1.0 | 2.00 | 4.0 | 7.0 | 9.0 |
Coughing of Blood | 1000.0 | 4.859 | 2.427965 | 1.0 | 3.00 | 4.0 | 7.0 | 9.0 |
Fatigue | 1000.0 | 3.856 | 2.244616 | 1.0 | 2.00 | 3.0 | 5.0 | 9.0 |
Weight Loss | 1000.0 | 3.855 | 2.206546 | 1.0 | 2.00 | 3.0 | 6.0 | 8.0 |
Shortness of Breath | 1000.0 | 4.240 | 2.285087 | 1.0 | 2.00 | 4.0 | 6.0 | 9.0 |
Wheezing | 1000.0 | 3.777 | 2.041921 | 1.0 | 2.00 | 4.0 | 5.0 | 8.0 |
Swallowing Difficulty | 1000.0 | 3.746 | 2.270383 | 1.0 | 2.00 | 4.0 | 5.0 | 8.0 |
Clubbing of Finger Nails | 1000.0 | 3.923 | 2.388048 | 1.0 | 2.00 | 4.0 | 5.0 | 9.0 |
Frequent Cold | 1000.0 | 3.536 | 1.832502 | 1.0 | 2.00 | 3.0 | 5.0 | 7.0 |
Dry Cough | 1000.0 | 3.853 | 2.039007 | 1.0 | 2.00 | 4.0 | 6.0 | 7.0 |
Snoring | 1000.0 | 2.926 | 1.474686 | 1.0 | 2.00 | 3.0 | 4.0 | 7.0 |
# Show the column labels of the loaded frame.
dane.columns
Index(['Patient Id', 'Age', 'Gender', 'Air Pollution', 'Alcohol use', 'Dust Allergy', 'OccuPational Hazards', 'Genetic Risk', 'chronic Lung Disease', 'Balanced Diet', 'Obesity', 'Smoking', 'Passive Smoker', 'Chest Pain', 'Coughing of Blood', 'Fatigue', 'Weight Loss', 'Shortness of Breath', 'Wheezing', 'Swallowing Difficulty', 'Clubbing of Finger Nails', 'Frequent Cold', 'Dry Cough', 'Snoring', 'Level'], dtype='object')
# Number of rows for each distinct 'Gender' value.
dane2 = dane.groupby(by='Gender').size()
dane2
Gender 1 598 2 402 dtype: int64
# Bar chart of how often each 'Gender' value occurs.
(
    dane['Gender']
    .value_counts()
    .plot(kind='bar')
)
<Axes: xlabel='Gender'>
# Number of rows for each distinct 'Smoking' value.
# NOTE(review): the result is wrapped in a one-element list, unlike `dane2`;
# the wrapper looks unintentional - confirm before removing it.
dane3 = [
    dane.groupby(by='Smoking').size(),
]
dane3
[Smoking 1 181 2 222 3 172 4 59 5 10 6 60 7 207 8 89 dtype: int64]
# Pie chart of how often each 'Smoking' value occurs.
(
    dane['Smoking']
    .value_counts()
    .plot(kind='pie')
)
<Axes: ylabel='count'>
# Number of rows for each distinct 'Passive Smoker' value.
# NOTE(review): the result is wrapped in a one-element list, unlike `dane2`;
# the wrapper looks unintentional - confirm before removing it.
dane4 = [
    dane.groupby(by='Passive Smoker').size(),
]
dane4
[Passive Smoker 1 60 2 284 3 140 4 161 5 30 6 30 7 187 8 108 dtype: int64]
# Pie chart of how often each 'Passive Smoker' value occurs.
(
    dane['Passive Smoker']
    .value_counts()
    .plot(kind='pie')
)
<Axes: ylabel='count'>
# Number of rows for every ('Smoking', 'Gender') combination present in the data.
(
    dane
    .groupby(by=['Smoking', 'Gender'])
    .size()
)
Smoking Gender 1 1 102 2 79 2 1 102 2 120 3 1 79 2 93 4 1 49 2 10 5 1 10 6 1 28 2 32 7 1 167 2 40 8 1 61 2 28 dtype: int64
# Group the rows by each ('Smoking', 'Gender') pair.
dane6 = dane.groupby(by=['Smoking', 'Gender'])
# Count the values within each group and draw the counts as a bar chart.
# NOTE(review): the selected columns are the grouping keys themselves, so this
# presumably just reproduces the group sizes; plotting
# `dane.groupby([...]).size()` directly may be simpler - confirm.
(
    dane6[['Smoking', 'Gender']]
    .value_counts()
    .plot(kind='bar')
)
<Axes: xlabel='Smoking,Gender'>
# How often each 'Air Pollution' value occurs.
dane7 = dane['Air Pollution'].value_counts()
# Display the counts in ascending order; the sorted copy is not stored.
dane7.sort_values(ascending=True)
Air Pollution 8 19 5 20 7 30 4 90 1 141 3 173 2 201 6 326 Name: count, dtype: int64
# Bar chart of how often each 'Air Pollution' value occurs.
# NOTE(review): this rebinds `dane7` from the value-counts Series to the Axes
# object returned by `plot`; a separate name would avoid losing the counts.
dane7 = (
    dane['Air Pollution']
    .value_counts()
    .plot(kind='bar')
)
dane7
<Axes: xlabel='Air Pollution'>