439 KiB
439 KiB
%pip install --user kaggle
%pip install --user pandas
%pip install --user scikit-learn
%pip install --user matplotlib
%pip install --user geopandas
import matplotlib.pyplot as plt
import pandas as pd
!kaggle datasets download -d uciml/forest-cover-type-dataset
!unzip -o meteorite-landings.zip -d data
Zbiór
# Load the dataset and shuffle the rows. The shuffle is seeded so that the
# seeded train/validation/test split that follows selects the same rows on
# every run.
data = pd.read_csv("covtype.csv")
data = data.sample(frac=1, random_state=1)
data.head(10)
Elevation | Aspect | Slope | Horizontal_Distance_To_Hydrology | Vertical_Distance_To_Hydrology | Horizontal_Distance_To_Roadways | Hillshade_9am | Hillshade_Noon | Hillshade_3pm | Horizontal_Distance_To_Fire_Points | ... | Soil_Type32 | Soil_Type33 | Soil_Type34 | Soil_Type35 | Soil_Type36 | Soil_Type37 | Soil_Type38 | Soil_Type39 | Soil_Type40 | Cover_Type | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
318054 | 2517 | 271 | 12 | 272 | 84 | 484 | 189 | 244 | 193 | 162 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 |
30504 | 2959 | 0 | 1 | 180 | 20 | 5960 | 217 | 236 | 156 | 3960 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 |
349520 | 3093 | 54 | 19 | 42 | -3 | 797 | 227 | 196 | 94 | 1318 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
365645 | 2502 | 330 | 17 | 150 | 52 | 738 | 177 | 216 | 178 | 510 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 6 |
131114 | 2962 | 4 | 13 | 95 | 7 | 4270 | 202 | 214 | 148 | 1999 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 |
385769 | 3181 | 119 | 5 | 170 | -1 | 2416 | 228 | 235 | 141 | 999 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
161626 | 2950 | 270 | 4 | 108 | 15 | 2053 | 210 | 241 | 170 | 2037 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 |
394880 | 3051 | 155 | 22 | 390 | 70 | 1871 | 239 | 236 | 114 | 1510 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
389492 | 3024 | 191 | 16 | 785 | 110 | 3000 | 218 | 251 | 162 | 1961 | ... | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 |
52507 | 2714 | 349 | 18 | 67 | 20 | 1599 | 184 | 207 | 160 | 3234 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 |
10 rows × 55 columns
Podział na podzbiory
from sklearn.model_selection import train_test_split
# Hold out 20% of all rows as the test set.
forest_train, forest_test = train_test_split(data, test_size=0.2, random_state=1)
# Take 25% of the remaining 80% (i.e. 20% of all rows) for validation,
# which yields a 60/20/20 train/validation/test split.
forest_train, forest_val = train_test_split(forest_train, test_size=0.25, random_state=1)
Statystyki
Wielkości zbiorów
# Report the (rows, columns) shape of the full dataset and of each split.
for label, frame in (
    ('wielkość zbioru', data),
    ('wielkość zbioru treningowego', forest_train),
    ('wielkość zbioru testującego', forest_test),
    ('wielkość zbioru walidacyjnego', forest_val),
):
    print(f'{label}: {frame.shape}')
wielkość zbioru: (581012, 55) wielkość zbioru treningowego: (348606, 55) wielkość zbioru testującego: (116203, 55) wielkość zbioru walidacyjnego: (116203, 55)
# Print the column names, non-null counts, dtypes and memory usage.
data.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 581012 entries, 0 to 581011 Data columns (total 55 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Elevation 581012 non-null int64 1 Aspect 581012 non-null int64 2 Slope 581012 non-null int64 3 Horizontal_Distance_To_Hydrology 581012 non-null int64 4 Vertical_Distance_To_Hydrology 581012 non-null int64 5 Horizontal_Distance_To_Roadways 581012 non-null int64 6 Hillshade_9am 581012 non-null int64 7 Hillshade_Noon 581012 non-null int64 8 Hillshade_3pm 581012 non-null int64 9 Horizontal_Distance_To_Fire_Points 581012 non-null int64 10 Wilderness_Area1 581012 non-null int64 11 Wilderness_Area2 581012 non-null int64 12 Wilderness_Area3 581012 non-null int64 13 Wilderness_Area4 581012 non-null int64 14 Soil_Type1 581012 non-null int64 15 Soil_Type2 581012 non-null int64 16 Soil_Type3 581012 non-null int64 17 Soil_Type4 581012 non-null int64 18 Soil_Type5 581012 non-null int64 19 Soil_Type6 581012 non-null int64 20 Soil_Type7 581012 non-null int64 21 Soil_Type8 581012 non-null int64 22 Soil_Type9 581012 non-null int64 23 Soil_Type10 581012 non-null int64 24 Soil_Type11 581012 non-null int64 25 Soil_Type12 581012 non-null int64 26 Soil_Type13 581012 non-null int64 27 Soil_Type14 581012 non-null int64 28 Soil_Type15 581012 non-null int64 29 Soil_Type16 581012 non-null int64 30 Soil_Type17 581012 non-null int64 31 Soil_Type18 581012 non-null int64 32 Soil_Type19 581012 non-null int64 33 Soil_Type20 581012 non-null int64 34 Soil_Type21 581012 non-null int64 35 Soil_Type22 581012 non-null int64 36 Soil_Type23 581012 non-null int64 37 Soil_Type24 581012 non-null int64 38 Soil_Type25 581012 non-null int64 39 Soil_Type26 581012 non-null int64 40 Soil_Type27 581012 non-null int64 41 Soil_Type28 581012 non-null int64 42 Soil_Type29 581012 non-null int64 43 Soil_Type30 581012 non-null int64 44 Soil_Type31 581012 non-null int64 45 Soil_Type32 581012 non-null int64 46 Soil_Type33 581012 non-null int64 47 
Soil_Type34 581012 non-null int64 48 Soil_Type35 581012 non-null int64 49 Soil_Type36 581012 non-null int64 50 Soil_Type37 581012 non-null int64 51 Soil_Type38 581012 non-null int64 52 Soil_Type39 581012 non-null int64 53 Soil_Type40 581012 non-null int64 54 Cover_Type 581012 non-null int64 dtypes: int64(55) memory usage: 243.8 MB
Nachylenie
# Summary statistics (mean, maximum, minimum) of the Slope column.
slope = data["Slope"]
print(f'Średnie nachylenie: {slope.mean()}')
print(f'Maksymalne nachylenie: {slope.max()}')
print(f'Minimalne nachylenie: {slope.min()}')
Średnie nachylenie: 14.103703537964792 Maksymalne nachylenie: 66 Minimalne nachylenie: 0
import seaborn as sns

# Draw one box plot per column from Elevation through
# Horizontal_Distance_To_Fire_Points, grouped by Cover_Type, on a 5x2 grid.
features = data.loc[:, 'Elevation':'Horizontal_Distance_To_Fire_Points']
plt.figure(figsize=(30, 50))
for position, feature_name in enumerate(features.columns, start=1):
    plt.subplot(5, 2, position)
    sns.boxplot(x=data['Cover_Type'], y=feature_name, data=data)
    plt.title(feature_name, fontsize=20)
plt.show()
Normalizacja
from sklearn.preprocessing import StandardScaler

# Standardize only the measurement columns. The one-hot indicator columns
# (Wilderness_Area*, Soil_Type*) and the Cover_Type target are left untouched.
columns_to_normalize = [
    column for column in data.columns
    if column != 'Cover_Type'
    and not column.startswith(('Wilderness_Area', 'Soil_Type'))
]

# Fit on the training split only, so that validation/test rows do not
# influence the scaling parameters.
scaler = StandardScaler()
scaler.fit(forest_train[columns_to_normalize])

# The splits were created before this cell, so scaling `data` alone would
# leave them in raw units; transform each frame explicitly. The copies detach
# the splits from `data` so the column assignment below is unambiguous.
forest_train, forest_val, forest_test = (
    split.copy() for split in (forest_train, forest_val, forest_test)
)
for frame in (data, forest_train, forest_val, forest_test):
    frame[columns_to_normalize] = scaler.transform(frame[columns_to_normalize])
data.head(10)
Elevation | Aspect | Slope | Horizontal_Distance_To_Hydrology | Vertical_Distance_To_Hydrology | Horizontal_Distance_To_Roadways | Hillshade_9am | Hillshade_Noon | Hillshade_3pm | Horizontal_Distance_To_Fire_Points | ... | Soil_Type32 | Soil_Type33 | Soil_Type34 | Soil_Type35 | Soil_Type36 | Soil_Type37 | Soil_Type38 | Soil_Type39 | Soil_Type40 | Cover_Type | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
318054 | -1.579964 | 1.030645 | -0.280934 | 0.012100 | 0.644670 | -1.196821 | -0.864631 | 1.046164 | 1.318678 | -1.373130 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 |
30504 | -0.001305 | -1.390866 | -1.749905 | -0.420741 | -0.453191 | 2.315116 | 0.181321 | 0.641484 | 0.351977 | 1.495029 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 |
349520 | 0.477293 | -0.908351 | 0.653865 | -1.070003 | -0.847735 | -0.996083 | 0.554876 | -1.381919 | -1.267901 | -0.500147 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
365645 | -1.633538 | 1.557837 | 0.386780 | -0.561885 | 0.095739 | -1.033922 | -1.312896 | -0.370218 | 0.926772 | -1.110329 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 6 |
131114 | 0.009410 | -1.355124 | -0.147392 | -0.820649 | -0.676194 | 1.231264 | -0.379010 | -0.471388 | 0.142960 | 0.014128 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 |
385769 | 0.791596 | -0.327546 | -1.215734 | -0.467789 | -0.813427 | 0.042234 | 0.592231 | 0.590899 | -0.039929 | -0.741048 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
161626 | -0.033449 | 1.021709 | -1.349277 | -0.759486 | -0.538961 | -0.190570 | -0.080167 | 0.894409 | 0.717756 | 0.042825 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 |
394880 | 0.327285 | -0.005869 | 1.054494 | 0.567265 | 0.404513 | -0.307292 | 1.003141 | 0.641484 | -0.745360 | -0.355153 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
389492 | 0.230851 | 0.315808 | 0.253237 | 2.425659 | 1.090676 | 0.416772 | 0.218677 | 1.400260 | 0.508739 | -0.014568 | ... | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 |
52507 | -0.876353 | 1.727611 | 0.520322 | -0.952383 | -0.453191 | -0.481735 | -1.051408 | -0.825483 | 0.456485 | 0.946771 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 |
10 rows × 55 columns