
IUM_02

Required dependencies

# Install the required dependencies
!pip install kaggle
!pip install pandas
!pip install scikit-learn
Requirement already satisfied: kaggle in c:\users\broke\pycharmprojects\ium_464863\venv\lib\site-packages (1.6.6)
Requirement already satisfied: python-dateutil in c:\users\broke\pycharmprojects\ium_464863\venv\lib\site-packages (from kaggle) (2.9.0.post0)
Requirement already satisfied: requests in c:\users\broke\pycharmprojects\ium_464863\venv\lib\site-packages (from kaggle) (2.31.0)
Requirement already satisfied: tqdm in c:\users\broke\pycharmprojects\ium_464863\venv\lib\site-packages (from kaggle) (4.66.2)
Requirement already satisfied: certifi in c:\users\broke\pycharmprojects\ium_464863\venv\lib\site-packages (from kaggle) (2024.2.2)
Requirement already satisfied: urllib3 in c:\users\broke\pycharmprojects\ium_464863\venv\lib\site-packages (from kaggle) (2.2.1)
Requirement already satisfied: bleach in c:\users\broke\pycharmprojects\ium_464863\venv\lib\site-packages (from kaggle) (6.1.0)
Requirement already satisfied: six>=1.10 in c:\users\broke\pycharmprojects\ium_464863\venv\lib\site-packages (from kaggle) (1.16.0)
Requirement already satisfied: python-slugify in c:\users\broke\pycharmprojects\ium_464863\venv\lib\site-packages (from kaggle) (8.0.4)
Requirement already satisfied: webencodings in c:\users\broke\pycharmprojects\ium_464863\venv\lib\site-packages (from bleach->kaggle) (0.5.1)
Requirement already satisfied: text-unidecode>=1.3 in c:\users\broke\pycharmprojects\ium_464863\venv\lib\site-packages (from python-slugify->kaggle) (1.3)
Requirement already satisfied: idna<4,>=2.5 in c:\users\broke\pycharmprojects\ium_464863\venv\lib\site-packages (from requests->kaggle) (3.6)
Requirement already satisfied: charset-normalizer<4,>=2 in c:\users\broke\pycharmprojects\ium_464863\venv\lib\site-packages (from requests->kaggle) (3.3.2)
Requirement already satisfied: colorama in c:\users\broke\pycharmprojects\ium_464863\venv\lib\site-packages (from tqdm->kaggle) (0.4.6)
[notice] A new release of pip available: 22.3.1 -> 24.0
[notice] To update, run: python.exe -m pip install --upgrade pip
Requirement already satisfied: pandas in c:\users\broke\pycharmprojects\ium_464863\venv\lib\site-packages (2.2.1)
Requirement already satisfied: python-dateutil>=2.8.2 in c:\users\broke\pycharmprojects\ium_464863\venv\lib\site-packages (from pandas) (2.9.0.post0)
Requirement already satisfied: tzdata>=2022.7 in c:\users\broke\pycharmprojects\ium_464863\venv\lib\site-packages (from pandas) (2024.1)
Requirement already satisfied: pytz>=2020.1 in c:\users\broke\pycharmprojects\ium_464863\venv\lib\site-packages (from pandas) (2024.1)
Requirement already satisfied: numpy<2,>=1.22.4 in c:\users\broke\pycharmprojects\ium_464863\venv\lib\site-packages (from pandas) (1.26.4)
Requirement already satisfied: six>=1.5 in c:\users\broke\pycharmprojects\ium_464863\venv\lib\site-packages (from python-dateutil>=2.8.2->pandas) (1.16.0)
[notice] A new release of pip available: 22.3.1 -> 24.0
[notice] To update, run: python.exe -m pip install --upgrade pip
Requirement already satisfied: scikit-learn in c:\users\broke\pycharmprojects\ium_464863\venv\lib\site-packages (1.4.1.post1)
Requirement already satisfied: joblib>=1.2.0 in c:\users\broke\pycharmprojects\ium_464863\venv\lib\site-packages (from scikit-learn) (1.3.2)
Requirement already satisfied: numpy<2.0,>=1.19.5 in c:\users\broke\pycharmprojects\ium_464863\venv\lib\site-packages (from scikit-learn) (1.26.4)
Requirement already satisfied: threadpoolctl>=2.0.0 in c:\users\broke\pycharmprojects\ium_464863\venv\lib\site-packages (from scikit-learn) (3.3.0)
Requirement already satisfied: scipy>=1.6.0 in c:\users\broke\pycharmprojects\ium_464863\venv\lib\site-packages (from scikit-learn) (1.12.0)
[notice] A new release of pip available: 22.3.1 -> 24.0
[notice] To update, run: python.exe -m pip install --upgrade pip

Importing libraries

# Import libraries
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

1. Downloading the dataset

# Download the dataset
!kaggle datasets download -d uciml/breast-cancer-wisconsin-data

# Extract the archive
!tar -xf breast-cancer-wisconsin-data.zip
breast-cancer-wisconsin-data.zip: Skipping, found more recently modified local copy (use --force to force download)
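The kaggle CLI needs API credentials before this step works: a kaggle.json token placed in ~/.kaggle/, or the KAGGLE_USERNAME and KAGGLE_KEY environment variables. As an alternative to the shell commands above, the same download and extraction can be done from Python with the kaggle package's API; a minimal sketch, assuming credentials are already configured:

# Sketch: programmatic download, assuming Kaggle API credentials are configured
import kaggle

kaggle.api.authenticate()  # reads kaggle.json or the KAGGLE_* environment variables
kaggle.api.dataset_download_files(
    'uciml/breast-cancer-wisconsin-data',
    path='.',     # download into the current directory
    unzip=True,   # extracts data.csv directly, no separate tar step needed
)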

2. Loading the data and initial preprocessing

# Load the data, setting the 'id' column as the index
df = pd.read_csv('data.csv', index_col='id')

# Drop unnecessary/erroneous columns
df = df.drop(columns=['Unnamed: 32'])
# Check for missing (NaN) values
df.isnull().sum()
diagnosis                  0
radius_mean                0
texture_mean               0
perimeter_mean             0
area_mean                  0
smoothness_mean            0
compactness_mean           0
concavity_mean             0
concave points_mean        0
symmetry_mean              0
fractal_dimension_mean     0
radius_se                  0
texture_se                 0
perimeter_se               0
area_se                    0
smoothness_se              0
compactness_se             0
concavity_se               0
concave points_se          0
symmetry_se                0
fractal_dimension_se       0
radius_worst               0
texture_worst              0
perimeter_worst            0
area_worst                 0
smoothness_worst           0
compactness_worst          0
concavity_worst            0
concave points_worst       0
symmetry_worst             0
fractal_dimension_worst    0
dtype: int64
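The 'Unnamed: 32' column dropped above is an artifact of a trailing comma in data.csv and holds only missing values. A slightly more general variant (a sketch, not what this notebook runs) is to drop every fully empty column and fail fast if any other missing values remain:

# Sketch: drop any column that is entirely NaN (this covers 'Unnamed: 32')
# and stop early if unexpected missing values are present.
df = df.dropna(axis=1, how='all')
assert df.isnull().sum().sum() == 0, "unexpected missing values in data.csv"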
# Display the first 5 rows
df.head()
diagnosis radius_mean texture_mean perimeter_mean area_mean smoothness_mean compactness_mean concavity_mean concave points_mean symmetry_mean ... radius_worst texture_worst perimeter_worst area_worst smoothness_worst compactness_worst concavity_worst concave points_worst symmetry_worst fractal_dimension_worst
id
842302 M 17.99 10.38 122.80 1001.0 0.11840 0.27760 0.3001 0.14710 0.2419 ... 25.38 17.33 184.60 2019.0 0.1622 0.6656 0.7119 0.2654 0.4601 0.11890
842517 M 20.57 17.77 132.90 1326.0 0.08474 0.07864 0.0869 0.07017 0.1812 ... 24.99 23.41 158.80 1956.0 0.1238 0.1866 0.2416 0.1860 0.2750 0.08902
84300903 M 19.69 21.25 130.00 1203.0 0.10960 0.15990 0.1974 0.12790 0.2069 ... 23.57 25.53 152.50 1709.0 0.1444 0.4245 0.4504 0.2430 0.3613 0.08758
84348301 M 11.42 20.38 77.58 386.1 0.14250 0.28390 0.2414 0.10520 0.2597 ... 14.91 26.50 98.87 567.7 0.2098 0.8663 0.6869 0.2575 0.6638 0.17300
84358402 M 20.29 14.34 135.10 1297.0 0.10030 0.13280 0.1980 0.10430 0.1809 ... 22.54 16.67 152.20 1575.0 0.1374 0.2050 0.4000 0.1625 0.2364 0.07678

5 rows × 31 columns

# Normalize the features to the [0, 1] range
scaler = MinMaxScaler()

df[df.columns[1:]] = scaler.fit_transform(df[df.columns[1:]])

3. Splitting the data into training, validation, and test sets

# Split the dataset into training, validation, and test sets in an 80/10/10 ratio
df_train, df_val_test = train_test_split(df, test_size=0.2, random_state=1234)
df_val, df_test = train_test_split(df_val_test, test_size=0.5, random_state=1234)
# Dimensions of the full dataset and its subsets
print(f"Full dataset: {df.shape[0]} rows, {df.shape[1]} columns")
print(f"Training set: {df_train.shape[0]} rows, {df_train.shape[1]} columns")
print(f"Validation set: {df_val.shape[0]} rows, {df_val.shape[1]} columns")
print(f"Test set: {df_test.shape[0]} rows, {df_test.shape[1]} columns")
Full dataset: 569 rows, 31 columns
Training set: 455 rows, 31 columns
Validation set: 57 rows, 31 columns
Test set: 57 rows, 31 columns
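Note that the MinMaxScaler above was fitted on the full dataset before splitting, so the minima and maxima of the validation and test rows influence how the training data is scaled. If one wants to avoid that form of leakage, a variant is to split first and fit the scaler on the training subset only; a minimal sketch, assuming unscaled feature columns and the df_train/df_val/df_test frames from the split above:

# Sketch: fit the scaler on the training split only, then apply the same
# transform to the validation and test splits (avoids test-set leakage).
feature_cols = df.columns[1:]  # every column except 'diagnosis'
df_train, df_val, df_test = (d.copy() for d in (df_train, df_val, df_test))

scaler = MinMaxScaler().fit(df_train[feature_cols])
df_train[feature_cols] = scaler.transform(df_train[feature_cols])
df_val[feature_cols] = scaler.transform(df_val[feature_cols])
df_test[feature_cols] = scaler.transform(df_test[feature_cols])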

Statistics for the numeric features (mean, standard deviation, min, max, quantiles)

# Full dataset
df.describe()
radius_mean texture_mean perimeter_mean area_mean smoothness_mean compactness_mean concavity_mean concave points_mean symmetry_mean fractal_dimension_mean ... radius_worst texture_worst perimeter_worst area_worst smoothness_worst compactness_worst concavity_worst concave points_worst symmetry_worst fractal_dimension_worst
count 569.000000 569.000000 569.000000 569.000000 569.000000 569.000000 569.000000 569.000000 569.000000 569.000000 ... 569.000000 569.000000 569.000000 569.000000 569.000000 569.000000 569.000000 569.000000 569.000000 569.000000
mean 0.338222 0.323965 0.332935 0.216920 0.394785 0.260601 0.208058 0.243137 0.379605 0.270379 ... 0.296663 0.363998 0.283138 0.170906 0.404138 0.220212 0.217403 0.393836 0.263307 0.189596
std 0.166787 0.145453 0.167915 0.149274 0.126967 0.161992 0.186785 0.192857 0.138456 0.148702 ... 0.171940 0.163813 0.167352 0.139932 0.150779 0.152649 0.166633 0.225884 0.121954 0.118466
min 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 ... 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
25% 0.223342 0.218465 0.216847 0.117413 0.304595 0.139685 0.069260 0.100944 0.282323 0.163016 ... 0.180719 0.241471 0.167837 0.081130 0.300007 0.116337 0.091454 0.223127 0.185098 0.107700
50% 0.302381 0.308759 0.293345 0.172895 0.390358 0.224679 0.144189 0.166501 0.369697 0.243892 ... 0.250445 0.356876 0.235320 0.123206 0.397081 0.179110 0.181070 0.343402 0.247782 0.163977
75% 0.416442 0.408860 0.416765 0.271135 0.475490 0.340531 0.306232 0.367793 0.453030 0.340354 ... 0.386339 0.471748 0.373475 0.220901 0.494156 0.302520 0.305831 0.554639 0.318155 0.242949
max 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 ... 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000

8 rows × 30 columns

# Training set
df_train.describe()
radius_mean texture_mean perimeter_mean area_mean smoothness_mean compactness_mean concavity_mean concave points_mean symmetry_mean fractal_dimension_mean ... radius_worst texture_worst perimeter_worst area_worst smoothness_worst compactness_worst concavity_worst concave points_worst symmetry_worst fractal_dimension_worst
count 455.000000 455.000000 455.000000 455.000000 455.000000 455.000000 455.000000 455.000000 455.000000 455.000000 ... 455.000000 455.000000 455.000000 455.000000 455.000000 455.000000 455.000000 455.000000 455.000000 455.000000
mean 0.338949 0.326381 0.333644 0.217261 0.395892 0.260580 0.209922 0.243100 0.381150 0.270577 ... 0.297118 0.369023 0.284164 0.171085 0.407890 0.221950 0.221686 0.394672 0.263384 0.191552
std 0.165349 0.145664 0.166246 0.147801 0.126845 0.160494 0.187617 0.192341 0.136984 0.147338 ... 0.170654 0.166060 0.166564 0.138560 0.153644 0.156748 0.172039 0.226663 0.119284 0.122769
min 0.000000 0.022658 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 ... 0.000000 0.012527 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
25% 0.225235 0.219817 0.219128 0.119321 0.304776 0.136096 0.068030 0.100497 0.283081 0.168176 ... 0.182675 0.248801 0.169929 0.081781 0.303308 0.116871 0.092212 0.221753 0.188252 0.107110
50% 0.300961 0.310450 0.295833 0.170859 0.389636 0.230262 0.145150 0.168191 0.369697 0.241786 ... 0.250445 0.358742 0.235370 0.123206 0.396421 0.181244 0.184505 0.347079 0.248571 0.164305
75% 0.415259 0.411397 0.414346 0.271113 0.476393 0.340991 0.310098 0.357952 0.453030 0.340354 ... 0.377090 0.481343 0.369740 0.209964 0.496467 0.299318 0.309265 0.558935 0.317465 0.242785
max 1.000000 0.815015 1.000000 0.999152 0.831182 0.895712 1.000000 1.000000 1.000000 1.000000 ... 0.896478 1.000000 0.890931 0.797975 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000

8 rows × 30 columns

# Validation set
df_val.describe()
radius_mean texture_mean perimeter_mean area_mean smoothness_mean compactness_mean concavity_mean concave points_mean symmetry_mean fractal_dimension_mean ... radius_worst texture_worst perimeter_worst area_worst smoothness_worst compactness_worst concavity_worst concave points_worst symmetry_worst fractal_dimension_worst
count 57.000000 57.000000 57.000000 57.000000 57.000000 57.000000 57.000000 57.000000 57.000000 57.000000 ... 57.000000 57.000000 57.000000 57.000000 57.000000 57.000000 57.000000 57.000000 57.000000 57.000000
mean 0.334850 0.315985 0.331800 0.216332 0.399432 0.279956 0.216035 0.254466 0.387684 0.281045 ... 0.294395 0.357428 0.281028 0.170796 0.391354 0.222054 0.209888 0.405999 0.276614 0.187256
std 0.176956 0.119165 0.181708 0.171264 0.144361 0.184302 0.208557 0.203610 0.160009 0.161084 ... 0.180500 0.130661 0.179389 0.161083 0.140480 0.133225 0.152149 0.214849 0.143775 0.103423
min 0.089782 0.106865 0.089489 0.041357 0.167193 0.046500 0.003622 0.027793 0.078283 0.047810 ... 0.072963 0.116205 0.074008 0.028264 0.085320 0.019239 0.006176 0.095670 0.066233 0.028073
25% 0.225709 0.226581 0.223205 0.117413 0.293220 0.165235 0.076406 0.112326 0.282323 0.169545 ... 0.180719 0.267058 0.162757 0.082653 0.301327 0.126233 0.107827 0.248179 0.198502 0.107700
50% 0.282976 0.290159 0.279110 0.161909 0.396678 0.222195 0.127413 0.154026 0.381818 0.258214 ... 0.225187 0.348348 0.228597 0.109836 0.416232 0.182893 0.164537 0.352234 0.234969 0.155910
75% 0.413129 0.389922 0.405017 0.260912 0.471879 0.359242 0.296626 0.371918 0.464141 0.342249 ... 0.355034 0.463486 0.343593 0.197454 0.484911 0.308632 0.312380 0.517182 0.323674 0.260724
max 0.967343 0.623267 0.988943 1.000000 1.000000 1.000000 0.879569 0.839463 0.932323 0.949031 ... 1.000000 0.619670 1.000000 1.000000 0.786040 0.571557 0.613498 0.902062 0.829687 0.460186

8 rows × 30 columns

# Test set
df_test.describe()
radius_mean texture_mean perimeter_mean area_mean smoothness_mean compactness_mean concavity_mean concave points_mean symmetry_mean fractal_dimension_mean ... radius_worst texture_worst perimeter_worst area_worst smoothness_worst compactness_worst concavity_worst concave points_worst symmetry_worst fractal_dimension_worst
count 57.000000 57.000000 57.000000 57.000000 57.000000 57.000000 57.000000 57.000000 57.000000 57.000000 ... 57.000000 57.000000 57.000000 57.000000 57.000000 57.000000 57.000000 57.000000 57.000000 57.000000
mean 0.335794 0.312663 0.328412 0.214790 0.381297 0.241410 0.185207 0.232105 0.359197 0.258133 ... 0.295299 0.330458 0.277057 0.169590 0.386967 0.204497 0.190725 0.374998 0.249384 0.176326
std 0.170785 0.167714 0.169927 0.139898 0.109590 0.150381 0.156133 0.188722 0.127273 0.148563 ... 0.176543 0.173641 0.164054 0.130427 0.137458 0.138294 0.132668 0.233099 0.120215 0.096130
min 0.075252 0.000000 0.072904 0.033001 0.146249 0.021839 0.002798 0.011948 0.072222 0.021061 ... 0.054891 0.000000 0.047263 0.022046 0.111074 0.015504 0.002860 0.035808 0.081411 0.001115
25% 0.192106 0.199188 0.185751 0.097222 0.308658 0.129471 0.062910 0.095179 0.271717 0.139217 ... 0.146211 0.199893 0.146023 0.064368 0.285478 0.098777 0.086981 0.202131 0.159866 0.113210
50% 0.315159 0.310450 0.301223 0.180445 0.390990 0.203239 0.119845 0.154573 0.361616 0.246420 ... 0.256492 0.325160 0.245530 0.128048 0.396421 0.172318 0.155511 0.292509 0.234772 0.160042
75% 0.486961 0.396348 0.481031 0.329629 0.460143 0.330102 0.261246 0.383996 0.439899 0.339090 ... 0.429740 0.425640 0.410827 0.256046 0.478307 0.280205 0.275240 0.520619 0.304356 0.231011
max 0.692366 1.000000 0.695253 0.535949 0.578406 0.809214 0.658388 0.776342 0.674242 0.839090 ... 0.667022 0.875533 0.627970 0.467902 0.674437 0.709327 0.563339 0.997595 0.622708 0.481175

8 rows × 30 columns

Class distribution in the dataset

# Full dataset
df['diagnosis'].value_counts()
diagnosis
B    357
M    212
Name: count, dtype: int64
# Training set
df_train['diagnosis'].value_counts()
diagnosis
B    288
M    167
Name: count, dtype: int64
# Validation set
df_val['diagnosis'].value_counts()
diagnosis
B    35
M    22
Name: count, dtype: int64
# Test set
df_test['diagnosis'].value_counts()
diagnosis
B    34
M    23
Name: count, dtype: int64