"""
This module gathers tree-based methods, including decision, regression and
randomized trees. Single and multi-output problems are both handled.
"""

# Authors: Gilles Louppe <g.louppe@gmail.com>
#          Peter Prettenhofer <peter.prettenhofer@gmail.com>
#          Brian Holt <bdholt1@gmail.com>
#          Noel Dawe <noel@dawe.me>
#          Satrajit Ghosh <satrajit.ghosh@gmail.com>
#          Joly Arnaud <arnaud.v.joly@gmail.com>
#          Fares Hedayati <fares.hedayati@gmail.com>
#          Nelson Liu <nelson@nelsonliu.me>
#
# License: BSD 3 clause

import copy
import numbers
from abc import ABCMeta, abstractmethod
from math import ceil
from numbers import Integral, Real

import numpy as np
from scipy.sparse import issparse

from ..base import (
    BaseEstimator,
    ClassifierMixin,
    MultiOutputMixin,
    RegressorMixin,
    _fit_context,
    clone,
    is_classifier,
)
from ..utils import Bunch, check_random_state, compute_sample_weight
from ..utils._param_validation import Hidden, Interval, RealNotInt, StrOptions
from ..utils.multiclass import check_classification_targets
from ..utils.validation import (
    _assert_all_finite_element_wise,
    _check_sample_weight,
    assert_all_finite,
    check_is_fitted,
)
from . import _criterion, _splitter, _tree
from ._criterion import Criterion
from ._splitter import Splitter
from ._tree import (
    BestFirstTreeBuilder,
    DepthFirstTreeBuilder,
    Tree,
    _build_pruned_tree_ccp,
    ccp_pruning_path,
)
from ._utils import _any_isnan_axis0

__all__ = [
    "DecisionTreeClassifier",
    "DecisionTreeRegressor",
    "ExtraTreeClassifier",
    "ExtraTreeRegressor",
]

# =============================================================================
# Types and constants
# =============================================================================

DTYPE = _tree.DTYPE
DOUBLE = _tree.DOUBLE

CRITERIA_CLF = {
    "gini": _criterion.Gini,
    "log_loss": _criterion.Entropy,
    "entropy": _criterion.Entropy,
}
CRITERIA_REG = {
    "squared_error": _criterion.MSE,
    "friedman_mse": _criterion.FriedmanMSE,
    "absolute_error": _criterion.MAE,
    "poisson": _criterion.Poisson,
}

DENSE_SPLITTERS = {"best": _splitter.BestSplitter, "random": _splitter.RandomSplitter}

SPARSE_SPLITTERS = {
    "best": _splitter.BestSparseSplitter,
    "random": _splitter.RandomSparseSplitter,
}

# =============================================================================
# Base decision tree
# =============================================================================


class BaseDecisionTree(MultiOutputMixin, BaseEstimator, metaclass=ABCMeta):
    """Base class for decision trees.

    Warning: This class should not be used directly.
    Use derived classes instead.
    """

    _parameter_constraints: dict = {
        "splitter": [StrOptions({"best", "random"})],
        "max_depth": [Interval(Integral, 1, None, closed="left"), None],
        "min_samples_split": [
            Interval(Integral, 2, None, closed="left"),
            Interval(RealNotInt, 0.0, 1.0, closed="right"),
        ],
        "min_samples_leaf": [
            Interval(Integral, 1, None, closed="left"),
            Interval(RealNotInt, 0.0, 1.0, closed="neither"),
        ],
        "min_weight_fraction_leaf": [Interval(Real, 0.0, 0.5, closed="both")],
        "max_features": [
            Interval(Integral, 1, None, closed="left"),
            Interval(RealNotInt, 0.0, 1.0, closed="right"),
            StrOptions({"sqrt", "log2"}),
            None,
        ],
        "random_state": ["random_state"],
        "max_leaf_nodes": [Interval(Integral, 2, None, closed="left"), None],
        "min_impurity_decrease": [Interval(Real, 0.0, None, closed="left")],
        "ccp_alpha": [Interval(Real, 0.0, None, closed="left")],
        "monotonic_cst": ["array-like", None],
    }

    @abstractmethod
    def __init__(
        self,
        *,
        criterion,
        splitter,
        max_depth,
        min_samples_split,
        min_samples_leaf,
        min_weight_fraction_leaf,
        max_features,
        max_leaf_nodes,
        random_state,
        min_impurity_decrease,
        class_weight=None,
        ccp_alpha=0.0,
        monotonic_cst=None,
    ):
        self.criterion = criterion
        self.splitter = splitter
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.min_samples_leaf = min_samples_leaf
        self.min_weight_fraction_leaf = min_weight_fraction_leaf
        self.max_features = max_features
        self.max_leaf_nodes = max_leaf_nodes
        self.random_state = random_state
        self.min_impurity_decrease = min_impurity_decrease
        self.class_weight = class_weight
        self.ccp_alpha = ccp_alpha
        self.monotonic_cst = monotonic_cst

    def get_depth(self):
        """Return the depth of the decision tree.

        The depth of a tree is the maximum distance between the root
        and any leaf.

        Returns
        -------
        self.tree_.max_depth : int
            The maximum depth of the tree.
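
        Examples
        --------
        A minimal sketch of typical usage (illustrative only; the reported
        depth depends on the training data and parameters):

        >>> from sklearn.datasets import load_iris
        >>> from sklearn.tree import DecisionTreeClassifier
        >>> X, y = load_iris(return_X_y=True)
        >>> clf = DecisionTreeClassifier(max_depth=2, random_state=0).fit(X, y)
        >>> clf.get_depth()  # doctest: +SKIP
        2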
        """
        check_is_fitted(self)
        return self.tree_.max_depth

    def get_n_leaves(self):
        """Return the number of leaves of the decision tree.

        Returns
        -------
        self.tree_.n_leaves : int
            Number of leaves.
        """
        check_is_fitted(self)
        return self.tree_.n_leaves

    def _support_missing_values(self, X):
        return (
            not issparse(X)
            and self._get_tags()["allow_nan"]
            and self.monotonic_cst is None
        )

    def _compute_missing_values_in_feature_mask(self, X, estimator_name=None):
        """Return boolean mask denoting if there are missing values for each feature.

        This method also ensures that X is finite.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features), dtype=DOUBLE
            Input data.

        estimator_name : str or None, default=None
            Name to use when raising an error. Defaults to the class name.

        Returns
        -------
        missing_values_in_feature_mask : ndarray of shape (n_features,), or None
            Missing value mask. If missing values are not supported or there
            are no missing values, return None.
        """
        estimator_name = estimator_name or self.__class__.__name__
        common_kwargs = dict(estimator_name=estimator_name, input_name="X")

        if not self._support_missing_values(X):
            assert_all_finite(X, **common_kwargs)
            return None

        with np.errstate(over="ignore"):
            overall_sum = np.sum(X)

        if not np.isfinite(overall_sum):
            # Raise a ValueError in case of the presence of an infinite element.
            _assert_all_finite_element_wise(X, xp=np, allow_nan=True, **common_kwargs)

        # If the sum is not nan, then there are no missing values
        if not np.isnan(overall_sum):
            return None

        missing_values_in_feature_mask = _any_isnan_axis0(X)
        return missing_values_in_feature_mask

    def _fit(
        self,
        X,
        y,
        sample_weight=None,
        check_input=True,
        missing_values_in_feature_mask=None,
    ):
        random_state = check_random_state(self.random_state)

        if check_input:
            # Need to validate separately here.
            # We can't pass multi_output=True because that would allow y to be
            # csr.

            # _compute_missing_values_in_feature_mask will check for finite values and
            # compute the missing mask if the tree supports missing values
            check_X_params = dict(
                dtype=DTYPE, accept_sparse="csc", force_all_finite=False
            )
            check_y_params = dict(ensure_2d=False, dtype=None)
            X, y = self._validate_data(
                X, y, validate_separately=(check_X_params, check_y_params)
            )

            missing_values_in_feature_mask = (
                self._compute_missing_values_in_feature_mask(X)
            )
            if issparse(X):
                X.sort_indices()

                if X.indices.dtype != np.intc or X.indptr.dtype != np.intc:
                    raise ValueError(
                        "No support for np.int64 index based sparse matrices"
                    )

            if self.criterion == "poisson":
                if np.any(y < 0):
                    raise ValueError(
                        "Some value(s) of y are negative which is"
                        " not allowed for Poisson regression."
                    )
                if np.sum(y) <= 0:
                    raise ValueError(
                        "Sum of y is not positive which is "
                        "necessary for Poisson regression."
                    )

        # Determine output settings
        n_samples, self.n_features_in_ = X.shape
        is_classification = is_classifier(self)

        y = np.atleast_1d(y)
        expanded_class_weight = None

        if y.ndim == 1:
            # reshape is necessary to preserve the data contiguity, which
            # [:, np.newaxis] does not.
            y = np.reshape(y, (-1, 1))

        self.n_outputs_ = y.shape[1]

        if is_classification:
            check_classification_targets(y)
            y = np.copy(y)

            self.classes_ = []
            self.n_classes_ = []

            if self.class_weight is not None:
                y_original = np.copy(y)

            y_encoded = np.zeros(y.shape, dtype=int)
            for k in range(self.n_outputs_):
                classes_k, y_encoded[:, k] = np.unique(y[:, k], return_inverse=True)
                self.classes_.append(classes_k)
                self.n_classes_.append(classes_k.shape[0])
            y = y_encoded

            if self.class_weight is not None:
                expanded_class_weight = compute_sample_weight(
                    self.class_weight, y_original
                )

            self.n_classes_ = np.array(self.n_classes_, dtype=np.intp)

        if getattr(y, "dtype", None) != DOUBLE or not y.flags.contiguous:
            y = np.ascontiguousarray(y, dtype=DOUBLE)

        max_depth = np.iinfo(np.int32).max if self.max_depth is None else self.max_depth

        if isinstance(self.min_samples_leaf, numbers.Integral):
            min_samples_leaf = self.min_samples_leaf
        else:  # float
            min_samples_leaf = int(ceil(self.min_samples_leaf * n_samples))

        if isinstance(self.min_samples_split, numbers.Integral):
            min_samples_split = self.min_samples_split
        else:  # float
            min_samples_split = int(ceil(self.min_samples_split * n_samples))
            min_samples_split = max(2, min_samples_split)

        min_samples_split = max(min_samples_split, 2 * min_samples_leaf)

        if isinstance(self.max_features, str):
            if self.max_features == "sqrt":
                max_features = max(1, int(np.sqrt(self.n_features_in_)))
            elif self.max_features == "log2":
                max_features = max(1, int(np.log2(self.n_features_in_)))
        elif self.max_features is None:
            max_features = self.n_features_in_
        elif isinstance(self.max_features, numbers.Integral):
            max_features = self.max_features
        else:  # float
            if self.max_features > 0.0:
                max_features = max(1, int(self.max_features * self.n_features_in_))
            else:
                max_features = 0

        self.max_features_ = max_features

        max_leaf_nodes = -1 if self.max_leaf_nodes is None else self.max_leaf_nodes

        if len(y) != n_samples:
            raise ValueError(
                "Number of labels=%d does not match number of samples=%d"
                % (len(y), n_samples)
            )

        if sample_weight is not None:
            sample_weight = _check_sample_weight(sample_weight, X, DOUBLE)

        if expanded_class_weight is not None:
            if sample_weight is not None:
                sample_weight = sample_weight * expanded_class_weight
            else:
                sample_weight = expanded_class_weight

        # Set min_weight_leaf from min_weight_fraction_leaf
        if sample_weight is None:
            min_weight_leaf = self.min_weight_fraction_leaf * n_samples
        else:
            min_weight_leaf = self.min_weight_fraction_leaf * np.sum(sample_weight)

        # Build tree
        criterion = self.criterion
        if not isinstance(criterion, Criterion):
            if is_classification:
                criterion = CRITERIA_CLF[self.criterion](
                    self.n_outputs_, self.n_classes_
                )
            else:
                criterion = CRITERIA_REG[self.criterion](self.n_outputs_, n_samples)
        else:
            # Make a deepcopy in case the criterion has mutable attributes that
            # might be shared and modified concurrently during parallel fitting
            criterion = copy.deepcopy(criterion)

        SPLITTERS = SPARSE_SPLITTERS if issparse(X) else DENSE_SPLITTERS

        splitter = self.splitter
        if self.monotonic_cst is None:
            monotonic_cst = None
        else:
            if self.n_outputs_ > 1:
                raise ValueError(
                    "Monotonicity constraints are not supported with multiple outputs."
                )
            # Check that the monotonicity constraints are correctly specified,
            # by applying an element-wise logical conjunction.
            # Note: we do not cast `np.asarray(self.monotonic_cst, dtype=np.int8)`
            # straight away here so as to generate error messages for invalid
            # values using the original values prior to any dtype related conversion.
            monotonic_cst = np.asarray(self.monotonic_cst)
            if monotonic_cst.shape[0] != X.shape[1]:
                raise ValueError(
                    "monotonic_cst has shape {} but the input data "
                    "X has {} features.".format(monotonic_cst.shape[0], X.shape[1])
                )
            valid_constraints = np.isin(monotonic_cst, (-1, 0, 1))
            if not np.all(valid_constraints):
                unique_constraints_value = np.unique(monotonic_cst)
                raise ValueError(
                    "monotonic_cst must be None or an array-like of -1, 0 or 1, but"
                    f" got {unique_constraints_value}"
                )
            monotonic_cst = np.asarray(monotonic_cst, dtype=np.int8)
            if is_classifier(self):
                if self.n_classes_[0] > 2:
                    raise ValueError(
                        "Monotonicity constraints are not supported with multiclass "
                        "classification"
                    )
                # Binary classification trees are built by constraining probabilities
                # of the *negative class* in order to make the implementation similar
                # to regression trees.
                # Since self.monotonic_cst encodes constraints on probabilities of the
                # *positive class*, all signs must be flipped.
                monotonic_cst *= -1

        if not isinstance(self.splitter, Splitter):
            splitter = SPLITTERS[self.splitter](
                criterion,
                self.max_features_,
                min_samples_leaf,
                min_weight_leaf,
                random_state,
                monotonic_cst,
            )

        if is_classifier(self):
            self.tree_ = Tree(self.n_features_in_, self.n_classes_, self.n_outputs_)
        else:
            self.tree_ = Tree(
                self.n_features_in_,
                # TODO: tree shouldn't need this in this case
                np.array([1] * self.n_outputs_, dtype=np.intp),
                self.n_outputs_,
            )

        # Use BestFirst if max_leaf_nodes given; use DepthFirst otherwise
        if max_leaf_nodes < 0:
            builder = DepthFirstTreeBuilder(
                splitter,
                min_samples_split,
                min_samples_leaf,
                min_weight_leaf,
                max_depth,
                self.min_impurity_decrease,
            )
        else:
            builder = BestFirstTreeBuilder(
                splitter,
                min_samples_split,
                min_samples_leaf,
                min_weight_leaf,
                max_depth,
                max_leaf_nodes,
                self.min_impurity_decrease,
            )

        builder.build(self.tree_, X, y, sample_weight, missing_values_in_feature_mask)

        if self.n_outputs_ == 1 and is_classifier(self):
            self.n_classes_ = self.n_classes_[0]
            self.classes_ = self.classes_[0]

        self._prune_tree()

        return self

    def _validate_X_predict(self, X, check_input):
        """Validate the data X at prediction time (predict, apply, probabilities)."""
        if check_input:
            if self._support_missing_values(X):
                force_all_finite = "allow-nan"
            else:
                force_all_finite = True
            X = self._validate_data(
                X,
                dtype=DTYPE,
                accept_sparse="csr",
                reset=False,
                force_all_finite=force_all_finite,
            )
            if issparse(X) and (
                X.indices.dtype != np.intc or X.indptr.dtype != np.intc
            ):
                raise ValueError("No support for np.int64 index based sparse matrices")
        else:
            # The number of features is checked regardless of `check_input`
            self._check_n_features(X, reset=False)
        return X

    def predict(self, X, check_input=True):
        """Predict class or regression value for X.

        For a classification model, the predicted class for each sample in X is
        returned. For a regression model, the predicted value based on X is
        returned.

        Parameters
        ----------
        X : {array-like, sparse matrix} of shape (n_samples, n_features)
            The input samples. Internally, it will be converted to
            ``dtype=np.float32`` and if a sparse matrix is provided
            to a sparse ``csr_matrix``.

        check_input : bool, default=True
            Allow to bypass several input checks.
            Don't use this parameter unless you know what you're doing.

        Returns
        -------
        y : array-like of shape (n_samples,) or (n_samples, n_outputs)
            The predicted classes, or the predicted values.
        """
        check_is_fitted(self)
        X = self._validate_X_predict(X, check_input)
        proba = self.tree_.predict(X)
        n_samples = X.shape[0]

        # Classification
        if is_classifier(self):
            if self.n_outputs_ == 1:
                return self.classes_.take(np.argmax(proba, axis=1), axis=0)

            else:
                class_type = self.classes_[0].dtype
                predictions = np.zeros((n_samples, self.n_outputs_), dtype=class_type)
                for k in range(self.n_outputs_):
                    predictions[:, k] = self.classes_[k].take(
                        np.argmax(proba[:, k], axis=1), axis=0
                    )

                return predictions

        # Regression
        else:
            if self.n_outputs_ == 1:
                return proba[:, 0]

            else:
                return proba[:, :, 0]

    def apply(self, X, check_input=True):
        """Return the index of the leaf that each sample is predicted as.

        .. versionadded:: 0.17

        Parameters
        ----------
        X : {array-like, sparse matrix} of shape (n_samples, n_features)
            The input samples. Internally, it will be converted to
            ``dtype=np.float32`` and if a sparse matrix is provided
            to a sparse ``csr_matrix``.

        check_input : bool, default=True
            Allow to bypass several input checks.
            Don't use this parameter unless you know what you're doing.

        Returns
        -------
        X_leaves : array-like of shape (n_samples,)
            For each datapoint x in X, return the index of the leaf x
            ends up in. Leaves are numbered within
            ``[0; self.tree_.node_count)``, possibly with gaps in the
            numbering.
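
        Examples
        --------
        Illustrative sketch only (the actual leaf indices depend on the
        fitted tree):

        >>> from sklearn.datasets import load_iris
        >>> from sklearn.tree import DecisionTreeClassifier
        >>> X, y = load_iris(return_X_y=True)
        >>> clf = DecisionTreeClassifier(random_state=0).fit(X, y)
        >>> leaf_ids = clf.apply(X)
        >>> leaf_ids.shape == (X.shape[0],)
        True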
        """
        check_is_fitted(self)
        X = self._validate_X_predict(X, check_input)
        return self.tree_.apply(X)

    def decision_path(self, X, check_input=True):
        """Return the decision path in the tree.

        .. versionadded:: 0.18

        Parameters
        ----------
        X : {array-like, sparse matrix} of shape (n_samples, n_features)
            The input samples. Internally, it will be converted to
            ``dtype=np.float32`` and if a sparse matrix is provided
            to a sparse ``csr_matrix``.

        check_input : bool, default=True
            Allow to bypass several input checks.
            Don't use this parameter unless you know what you're doing.

        Returns
        -------
        indicator : sparse matrix of shape (n_samples, n_nodes)
            Return a node indicator CSR matrix where non-zero elements
            indicate that the samples go through the nodes.
        """
        X = self._validate_X_predict(X, check_input)
        return self.tree_.decision_path(X)

    def _prune_tree(self):
        """Prune tree using Minimal Cost-Complexity Pruning."""
        check_is_fitted(self)

        if self.ccp_alpha == 0.0:
            return

        # build pruned tree
        if is_classifier(self):
            n_classes = np.atleast_1d(self.n_classes_)
            pruned_tree = Tree(self.n_features_in_, n_classes, self.n_outputs_)
        else:
            pruned_tree = Tree(
                self.n_features_in_,
                # TODO: the tree shouldn't need this param
                np.array([1] * self.n_outputs_, dtype=np.intp),
                self.n_outputs_,
            )
        _build_pruned_tree_ccp(pruned_tree, self.tree_, self.ccp_alpha)

        self.tree_ = pruned_tree

    def cost_complexity_pruning_path(self, X, y, sample_weight=None):
        """Compute the pruning path during Minimal Cost-Complexity Pruning.

        See :ref:`minimal_cost_complexity_pruning` for details on the pruning
        process.

        Parameters
        ----------
        X : {array-like, sparse matrix} of shape (n_samples, n_features)
            The training input samples. Internally, it will be converted to
            ``dtype=np.float32`` and if a sparse matrix is provided
            to a sparse ``csc_matrix``.

        y : array-like of shape (n_samples,) or (n_samples, n_outputs)
            The target values (class labels) as integers or strings.

        sample_weight : array-like of shape (n_samples,), default=None
            Sample weights. If None, then samples are equally weighted. Splits
            that would create child nodes with net zero or negative weight are
            ignored while searching for a split in each node. Splits are also
            ignored if they would result in any single class carrying a
            negative weight in either child node.

        Returns
        -------
        ccp_path : :class:`~sklearn.utils.Bunch`
            Dictionary-like object, with the following attributes.

            ccp_alphas : ndarray
                Effective alphas of subtree during pruning.

            impurities : ndarray
                Sum of the impurities of the subtree leaves for the
                corresponding alpha value in ``ccp_alphas``.
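
        Examples
        --------
        A minimal sketch of how the path is typically used to pick
        ``ccp_alpha`` (illustrative only; the number of alphas depends on the
        data):

        >>> from sklearn.datasets import load_iris
        >>> from sklearn.tree import DecisionTreeClassifier
        >>> X, y = load_iris(return_X_y=True)
        >>> clf = DecisionTreeClassifier(random_state=0)
        >>> path = clf.cost_complexity_pruning_path(X, y)
        >>> len(path.ccp_alphas) == len(path.impurities)
        True
        >>> pruned = DecisionTreeClassifier(
        ...     random_state=0, ccp_alpha=path.ccp_alphas[-2]
        ... ).fit(X, y)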
        """
        est = clone(self).set_params(ccp_alpha=0.0)
        est.fit(X, y, sample_weight=sample_weight)
        return Bunch(**ccp_pruning_path(est.tree_))

    @property
    def feature_importances_(self):
        """Return the feature importances.

        The importance of a feature is computed as the (normalized) total
        reduction of the criterion brought by that feature.
        It is also known as the Gini importance.

        Warning: impurity-based feature importances can be misleading for
        high cardinality features (many unique values). See
        :func:`sklearn.inspection.permutation_importance` as an alternative.

        Returns
        -------
        feature_importances_ : ndarray of shape (n_features,)
            Normalized total reduction of criteria by feature
            (Gini importance).
        """
        check_is_fitted(self)

        return self.tree_.compute_feature_importances()


# =============================================================================
# Public estimators
# =============================================================================


class DecisionTreeClassifier(ClassifierMixin, BaseDecisionTree):
    """A decision tree classifier.

    Read more in the :ref:`User Guide <tree>`.

    Parameters
    ----------
    criterion : {"gini", "entropy", "log_loss"}, default="gini"
        The function to measure the quality of a split. Supported criteria are
        "gini" for the Gini impurity and "log_loss" and "entropy" both for the
        Shannon information gain, see :ref:`tree_mathematical_formulation`.

    splitter : {"best", "random"}, default="best"
        The strategy used to choose the split at each node. Supported
        strategies are "best" to choose the best split and "random" to choose
        the best random split.

    max_depth : int, default=None
        The maximum depth of the tree. If None, then nodes are expanded until
        all leaves are pure or until all leaves contain less than
        min_samples_split samples.

    min_samples_split : int or float, default=2
        The minimum number of samples required to split an internal node:

        - If int, then consider `min_samples_split` as the minimum number.
        - If float, then `min_samples_split` is a fraction and
          `ceil(min_samples_split * n_samples)` are the minimum
          number of samples for each split.

        .. versionchanged:: 0.18
           Added float values for fractions.

    min_samples_leaf : int or float, default=1
        The minimum number of samples required to be at a leaf node.
        A split point at any depth will only be considered if it leaves at
        least ``min_samples_leaf`` training samples in each of the left and
        right branches. This may have the effect of smoothing the model,
        especially in regression.

        - If int, then consider `min_samples_leaf` as the minimum number.
        - If float, then `min_samples_leaf` is a fraction and
          `ceil(min_samples_leaf * n_samples)` are the minimum
          number of samples for each node.

        .. versionchanged:: 0.18
           Added float values for fractions.

    min_weight_fraction_leaf : float, default=0.0
        The minimum weighted fraction of the sum total of weights (of all
        the input samples) required to be at a leaf node. Samples have
        equal weight when sample_weight is not provided.

    max_features : int, float or {"sqrt", "log2"}, default=None
        The number of features to consider when looking for the best split:

        - If int, then consider `max_features` features at each split.
        - If float, then `max_features` is a fraction and
          `max(1, int(max_features * n_features_in_))` features are considered at
          each split.
        - If "sqrt", then `max_features=sqrt(n_features)`.
        - If "log2", then `max_features=log2(n_features)`.
        - If None, then `max_features=n_features`.

        Note: the search for a split does not stop until at least one
        valid partition of the node samples is found, even if it requires to
        effectively inspect more than ``max_features`` features.

    random_state : int, RandomState instance or None, default=None
        Controls the randomness of the estimator. The features are always
        randomly permuted at each split, even if ``splitter`` is set to
        ``"best"``. When ``max_features < n_features``, the algorithm will
        select ``max_features`` at random at each split before finding the best
        split among them. But the best found split may vary across different
        runs, even if ``max_features=n_features``. That is the case, if the
        improvement of the criterion is identical for several splits and one
        split has to be selected at random. To obtain a deterministic behaviour
        during fitting, ``random_state`` has to be fixed to an integer.
        See :term:`Glossary <random_state>` for details.

    max_leaf_nodes : int, default=None
        Grow a tree with ``max_leaf_nodes`` in best-first fashion.
        Best nodes are defined as relative reduction in impurity.
        If None then unlimited number of leaf nodes.

    min_impurity_decrease : float, default=0.0
        A node will be split if this split induces a decrease of the impurity
        greater than or equal to this value.

        The weighted impurity decrease equation is the following::

            N_t / N * (impurity - N_t_R / N_t * right_impurity
                        - N_t_L / N_t * left_impurity)

        where ``N`` is the total number of samples, ``N_t`` is the number of
        samples at the current node, ``N_t_L`` is the number of samples in the
        left child, and ``N_t_R`` is the number of samples in the right child.

        ``N``, ``N_t``, ``N_t_R`` and ``N_t_L`` all refer to the weighted sum,
        if ``sample_weight`` is passed.
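
        For illustration: with ``N = 100`` samples in total, a node holding
        ``N_t = 50`` samples with impurity ``0.5`` that splits into children
        with ``N_t_L = 30`` (impurity ``0.3``) and ``N_t_R = 20`` (impurity
        ``0.2``) yields a weighted decrease of
        ``50/100 * (0.5 - 20/50 * 0.2 - 30/50 * 0.3) = 0.12``, so the split is
        kept only if ``min_impurity_decrease <= 0.12``.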

        .. versionadded:: 0.19

    class_weight : dict, list of dict or "balanced", default=None
        Weights associated with classes in the form ``{class_label: weight}``.
        If None, all classes are supposed to have weight one. For
        multi-output problems, a list of dicts can be provided in the same
        order as the columns of y.

        Note that for multioutput (including multilabel) weights should be
        defined for each class of every column in its own dict. For example,
        for four-class multilabel classification weights should be
        [{0: 1, 1: 1}, {0: 1, 1: 5}, {0: 1, 1: 1}, {0: 1, 1: 1}] instead of
        [{1:1}, {2:5}, {3:1}, {4:1}].

        The "balanced" mode uses the values of y to automatically adjust
        weights inversely proportional to class frequencies in the input data
        as ``n_samples / (n_classes * np.bincount(y))``

        For multi-output, the weights of each column of y will be multiplied.

        Note that these weights will be multiplied with sample_weight (passed
        through the fit method) if sample_weight is specified.

    ccp_alpha : non-negative float, default=0.0
        Complexity parameter used for Minimal Cost-Complexity Pruning. The
        subtree with the largest cost complexity that is smaller than
        ``ccp_alpha`` will be chosen. By default, no pruning is performed. See
        :ref:`minimal_cost_complexity_pruning` for details.

        .. versionadded:: 0.22

    monotonic_cst : array-like of int of shape (n_features), default=None
        Indicates the monotonicity constraint to enforce on each feature.
          - 1: monotonic increase
          - 0: no constraint
          - -1: monotonic decrease

        If monotonic_cst is None, no constraints are applied.

        Monotonicity constraints are not supported for:
          - multiclass classifications (i.e. when `n_classes > 2`),
          - multioutput classifications (i.e. when `n_outputs_ > 1`),
          - classifications trained on data with missing values.

        The constraints hold over the probability of the positive class.

        Read more in the :ref:`User Guide <monotonic_cst_gbdt>`.

        .. versionadded:: 1.4

    Attributes
    ----------
    classes_ : ndarray of shape (n_classes,) or list of ndarray
        The class labels (single output problem),
        or a list of arrays of class labels (multi-output problem).

    feature_importances_ : ndarray of shape (n_features,)
        The impurity-based feature importances.
        The higher, the more important the feature.
        The importance of a feature is computed as the (normalized)
        total reduction of the criterion brought by that feature. It is also
        known as the Gini importance [4]_.

        Warning: impurity-based feature importances can be misleading for
        high cardinality features (many unique values). See
        :func:`sklearn.inspection.permutation_importance` as an alternative.

    max_features_ : int
        The inferred value of max_features.

    n_classes_ : int or list of int
        The number of classes (for single output problems),
        or a list containing the number of classes for each
        output (for multi-output problems).

    n_features_in_ : int
        Number of features seen during :term:`fit`.

        .. versionadded:: 0.24

    feature_names_in_ : ndarray of shape (`n_features_in_`,)
        Names of features seen during :term:`fit`. Defined only when `X`
        has feature names that are all strings.

        .. versionadded:: 1.0

    n_outputs_ : int
        The number of outputs when ``fit`` is performed.

    tree_ : Tree instance
        The underlying Tree object. Please refer to
        ``help(sklearn.tree._tree.Tree)`` for attributes of Tree object and
        :ref:`sphx_glr_auto_examples_tree_plot_unveil_tree_structure.py`
        for basic usage of these attributes.

    See Also
    --------
    DecisionTreeRegressor : A decision tree regressor.

    Notes
    -----
    The default values for the parameters controlling the size of the trees
    (e.g. ``max_depth``, ``min_samples_leaf``, etc.) lead to fully grown and
    unpruned trees which can potentially be very large on some data sets. To
    reduce memory consumption, the complexity and size of the trees should be
    controlled by setting those parameter values.

    The :meth:`predict` method operates using the :func:`numpy.argmax`
    function on the outputs of :meth:`predict_proba`. This means that in
    case the highest predicted probabilities are tied, the classifier will
    predict the tied class with the lowest index in :term:`classes_`.

    References
    ----------

    .. [1] https://en.wikipedia.org/wiki/Decision_tree_learning

    .. [2] L. Breiman, J. Friedman, R. Olshen, and C. Stone, "Classification
           and Regression Trees", Wadsworth, Belmont, CA, 1984.

    .. [3] T. Hastie, R. Tibshirani and J. Friedman. "Elements of Statistical
           Learning", Springer, 2009.

    .. [4] L. Breiman, and A. Cutler, "Random Forests",
           https://www.stat.berkeley.edu/~breiman/RandomForests/cc_home.htm

    Examples
    --------
    >>> from sklearn.datasets import load_iris
    >>> from sklearn.model_selection import cross_val_score
    >>> from sklearn.tree import DecisionTreeClassifier
    >>> clf = DecisionTreeClassifier(random_state=0)
    >>> iris = load_iris()
    >>> cross_val_score(clf, iris.data, iris.target, cv=10)
    ...                             # doctest: +SKIP
    ...
    array([ 1.     ,  0.93...,  0.86...,  0.93...,  0.93...,
            0.93...,  0.93...,  1.     ,  0.93...,  1.      ])
    """

    _parameter_constraints: dict = {
        **BaseDecisionTree._parameter_constraints,
        "criterion": [StrOptions({"gini", "entropy", "log_loss"}), Hidden(Criterion)],
        "class_weight": [dict, list, StrOptions({"balanced"}), None],
    }

    def __init__(
        self,
        *,
        criterion="gini",
        splitter="best",
        max_depth=None,
        min_samples_split=2,
        min_samples_leaf=1,
        min_weight_fraction_leaf=0.0,
        max_features=None,
        random_state=None,
        max_leaf_nodes=None,
        min_impurity_decrease=0.0,
        class_weight=None,
        ccp_alpha=0.0,
        monotonic_cst=None,
    ):
        super().__init__(
            criterion=criterion,
            splitter=splitter,
            max_depth=max_depth,
            min_samples_split=min_samples_split,
            min_samples_leaf=min_samples_leaf,
            min_weight_fraction_leaf=min_weight_fraction_leaf,
            max_features=max_features,
            max_leaf_nodes=max_leaf_nodes,
            class_weight=class_weight,
            random_state=random_state,
            min_impurity_decrease=min_impurity_decrease,
            monotonic_cst=monotonic_cst,
            ccp_alpha=ccp_alpha,
        )

    @_fit_context(prefer_skip_nested_validation=True)
    def fit(self, X, y, sample_weight=None, check_input=True):
        """Build a decision tree classifier from the training set (X, y).

        Parameters
        ----------
        X : {array-like, sparse matrix} of shape (n_samples, n_features)
            The training input samples. Internally, it will be converted to
            ``dtype=np.float32`` and if a sparse matrix is provided
            to a sparse ``csc_matrix``.

        y : array-like of shape (n_samples,) or (n_samples, n_outputs)
            The target values (class labels) as integers or strings.

        sample_weight : array-like of shape (n_samples,), default=None
            Sample weights. If None, then samples are equally weighted. Splits
            that would create child nodes with net zero or negative weight are
            ignored while searching for a split in each node. Splits are also
            ignored if they would result in any single class carrying a
            negative weight in either child node.

        check_input : bool, default=True
            Allow to bypass several input checks.
            Don't use this parameter unless you know what you're doing.

        Returns
        -------
        self : DecisionTreeClassifier
            Fitted estimator.
        """

        super()._fit(
            X,
            y,
            sample_weight=sample_weight,
            check_input=check_input,
        )
        return self

    def predict_proba(self, X, check_input=True):
        """Predict class probabilities of the input samples X.

        The predicted class probability is the fraction of samples of the same
        class in a leaf.

        Parameters
        ----------
        X : {array-like, sparse matrix} of shape (n_samples, n_features)
            The input samples. Internally, it will be converted to
            ``dtype=np.float32`` and if a sparse matrix is provided
            to a sparse ``csr_matrix``.

        check_input : bool, default=True
            Allow to bypass several input checks.
            Don't use this parameter unless you know what you're doing.

        Returns
        -------
        proba : ndarray of shape (n_samples, n_classes) or list of n_outputs \
            such arrays if n_outputs > 1
            The class probabilities of the input samples. The order of the
            classes corresponds to that in the attribute :term:`classes_`.
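
        Examples
        --------
        A minimal sketch (illustrative only; the probabilities depend on the
        fitted tree):

        >>> from sklearn.datasets import load_iris
        >>> from sklearn.tree import DecisionTreeClassifier
        >>> X, y = load_iris(return_X_y=True)
        >>> clf = DecisionTreeClassifier(max_depth=2, random_state=0).fit(X, y)
        >>> proba = clf.predict_proba(X[:1])
        >>> proba.shape == (1, clf.n_classes_)
        True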
        """
        check_is_fitted(self)
        X = self._validate_X_predict(X, check_input)
        proba = self.tree_.predict(X)

        if self.n_outputs_ == 1:
            return proba[:, : self.n_classes_]

        else:
            all_proba = []
            for k in range(self.n_outputs_):
                proba_k = proba[:, k, : self.n_classes_[k]]
                all_proba.append(proba_k)
            return all_proba

    def predict_log_proba(self, X):
        """Predict class log-probabilities of the input samples X.

        Parameters
        ----------
        X : {array-like, sparse matrix} of shape (n_samples, n_features)
            The input samples. Internally, it will be converted to
            ``dtype=np.float32`` and if a sparse matrix is provided
            to a sparse ``csr_matrix``.

        Returns
        -------
        proba : ndarray of shape (n_samples, n_classes) or list of n_outputs \
            such arrays if n_outputs > 1
            The class log-probabilities of the input samples. The order of the
            classes corresponds to that in the attribute :term:`classes_`.
        """
        proba = self.predict_proba(X)

        if self.n_outputs_ == 1:
            return np.log(proba)

        else:
            for k in range(self.n_outputs_):
                proba[k] = np.log(proba[k])

            return proba

    def _more_tags(self):
        # XXX: nan is only supported for dense arrays, but we set this for the
        # common tests to pass, specifically: check_estimators_nan_inf
        allow_nan = self.splitter == "best" and self.criterion in {
            "gini",
            "log_loss",
            "entropy",
        }
        return {"multilabel": True, "allow_nan": allow_nan}


class DecisionTreeRegressor(RegressorMixin, BaseDecisionTree):
    """A decision tree regressor.

    Read more in the :ref:`User Guide <tree>`.

    Parameters
    ----------
    criterion : {"squared_error", "friedman_mse", "absolute_error", \
            "poisson"}, default="squared_error"
        The function to measure the quality of a split. Supported criteria
        are "squared_error" for the mean squared error, which is equal to
        variance reduction as feature selection criterion and minimizes the L2
        loss using the mean of each terminal node, "friedman_mse", which uses
        mean squared error with Friedman's improvement score for potential
        splits, "absolute_error" for the mean absolute error, which minimizes
        the L1 loss using the median of each terminal node, and "poisson" which
        uses reduction in Poisson deviance to find splits.

        .. versionadded:: 0.18
           Mean Absolute Error (MAE) criterion.

        .. versionadded:: 0.24
            Poisson deviance criterion.

    splitter : {"best", "random"}, default="best"
        The strategy used to choose the split at each node. Supported
        strategies are "best" to choose the best split and "random" to choose
        the best random split.

    max_depth : int, default=None
        The maximum depth of the tree. If None, then nodes are expanded until
        all leaves are pure or until all leaves contain less than
        min_samples_split samples.

    min_samples_split : int or float, default=2
        The minimum number of samples required to split an internal node:

        - If int, then consider `min_samples_split` as the minimum number.
        - If float, then `min_samples_split` is a fraction and
          `ceil(min_samples_split * n_samples)` are the minimum
          number of samples for each split.

        .. versionchanged:: 0.18
           Added float values for fractions.

    min_samples_leaf : int or float, default=1
        The minimum number of samples required to be at a leaf node.
        A split point at any depth will only be considered if it leaves at
        least ``min_samples_leaf`` training samples in each of the left and
        right branches. This may have the effect of smoothing the model,
        especially in regression.

        - If int, then consider `min_samples_leaf` as the minimum number.
        - If float, then `min_samples_leaf` is a fraction and
          `ceil(min_samples_leaf * n_samples)` are the minimum
          number of samples for each node.

        .. versionchanged:: 0.18
           Added float values for fractions.

    min_weight_fraction_leaf : float, default=0.0
        The minimum weighted fraction of the sum total of weights (of all
        the input samples) required to be at a leaf node. Samples have
        equal weight when sample_weight is not provided.

    max_features : int, float or {"sqrt", "log2"}, default=None
        The number of features to consider when looking for the best split:

        - If int, then consider `max_features` features at each split.
        - If float, then `max_features` is a fraction and
          `max(1, int(max_features * n_features_in_))` features are considered at each
          split.
        - If "sqrt", then `max_features=sqrt(n_features)`.
        - If "log2", then `max_features=log2(n_features)`.
        - If None, then `max_features=n_features`.

        Note: the search for a split does not stop until at least one
        valid partition of the node samples is found, even if it requires to
        effectively inspect more than ``max_features`` features.

    random_state : int, RandomState instance or None, default=None
        Controls the randomness of the estimator. The features are always
        randomly permuted at each split, even if ``splitter`` is set to
        ``"best"``. When ``max_features < n_features``, the algorithm will
        select ``max_features`` at random at each split before finding the best
        split among them. But the best found split may vary across different
        runs, even if ``max_features=n_features``. That is the case, if the
        improvement of the criterion is identical for several splits and one
        split has to be selected at random. To obtain a deterministic behaviour
        during fitting, ``random_state`` has to be fixed to an integer.
        See :term:`Glossary <random_state>` for details.

    max_leaf_nodes : int, default=None
        Grow a tree with ``max_leaf_nodes`` in best-first fashion.
        Best nodes are defined as relative reduction in impurity.
        If None then unlimited number of leaf nodes.

    min_impurity_decrease : float, default=0.0
        A node will be split if this split induces a decrease of the impurity
        greater than or equal to this value.

        The weighted impurity decrease equation is the following::

            N_t / N * (impurity - N_t_R / N_t * right_impurity
                        - N_t_L / N_t * left_impurity)

        where ``N`` is the total number of samples, ``N_t`` is the number of
        samples at the current node, ``N_t_L`` is the number of samples in the
        left child, and ``N_t_R`` is the number of samples in the right child.

        ``N``, ``N_t``, ``N_t_R`` and ``N_t_L`` all refer to the weighted sum,
        if ``sample_weight`` is passed.

        .. versionadded:: 0.19

    ccp_alpha : non-negative float, default=0.0
        Complexity parameter used for Minimal Cost-Complexity Pruning. The
        subtree with the largest cost complexity that is smaller than
        ``ccp_alpha`` will be chosen. By default, no pruning is performed. See
        :ref:`minimal_cost_complexity_pruning` for details.

        .. versionadded:: 0.22

    monotonic_cst : array-like of int of shape (n_features), default=None
        Indicates the monotonicity constraint to enforce on each feature.
          - 1: monotonic increase
          - 0: no constraint
          - -1: monotonic decrease

        If monotonic_cst is None, no constraints are applied.

        Monotonicity constraints are not supported for:
          - multioutput regressions (i.e. when `n_outputs_ > 1`),
          - regressions trained on data with missing values.

        Read more in the :ref:`User Guide <monotonic_cst_gbdt>`.

        .. versionadded:: 1.4

    Attributes
    ----------
    feature_importances_ : ndarray of shape (n_features,)
        The feature importances.
        The higher, the more important the feature.
        The importance of a feature is computed as the
        (normalized) total reduction of the criterion brought
        by that feature. It is also known as the Gini importance [4]_.

        Warning: impurity-based feature importances can be misleading for
        high cardinality features (many unique values). See
        :func:`sklearn.inspection.permutation_importance` as an alternative.

    max_features_ : int
        The inferred value of max_features.

    n_features_in_ : int
        Number of features seen during :term:`fit`.

        .. versionadded:: 0.24

    feature_names_in_ : ndarray of shape (`n_features_in_`,)
        Names of features seen during :term:`fit`. Defined only when `X`
        has feature names that are all strings.

        .. versionadded:: 1.0

    n_outputs_ : int
        The number of outputs when ``fit`` is performed.

    tree_ : Tree instance
        The underlying Tree object. Please refer to
        ``help(sklearn.tree._tree.Tree)`` for attributes of Tree object and
        :ref:`sphx_glr_auto_examples_tree_plot_unveil_tree_structure.py`
        for basic usage of these attributes.

    See Also
    --------
    DecisionTreeClassifier : A decision tree classifier.

    Notes
    -----
    The default values for the parameters controlling the size of the trees
    (e.g. ``max_depth``, ``min_samples_leaf``, etc.) lead to fully grown and
    unpruned trees which can potentially be very large on some data sets. To
    reduce memory consumption, the complexity and size of the trees should be
    controlled by setting those parameter values.

    References
    ----------

    .. [1] https://en.wikipedia.org/wiki/Decision_tree_learning

    .. [2] L. Breiman, J. Friedman, R. Olshen, and C. Stone, "Classification
           and Regression Trees", Wadsworth, Belmont, CA, 1984.

    .. [3] T. Hastie, R. Tibshirani and J. Friedman. "Elements of Statistical
           Learning", Springer, 2009.

    .. [4] L. Breiman, and A. Cutler, "Random Forests",
           https://www.stat.berkeley.edu/~breiman/RandomForests/cc_home.htm

    Examples
    --------
    >>> from sklearn.datasets import load_diabetes
    >>> from sklearn.model_selection import cross_val_score
    >>> from sklearn.tree import DecisionTreeRegressor
    >>> X, y = load_diabetes(return_X_y=True)
    >>> regressor = DecisionTreeRegressor(random_state=0)
    >>> cross_val_score(regressor, X, y, cv=10)
    ...                    # doctest: +SKIP
    ...
    array([-0.39..., -0.46...,  0.02...,  0.06..., -0.50...,
           0.16...,  0.11..., -0.73..., -0.30..., -0.00...])
    """

    _parameter_constraints: dict = {
        **BaseDecisionTree._parameter_constraints,
        "criterion": [
            StrOptions({"squared_error", "friedman_mse", "absolute_error", "poisson"}),
            Hidden(Criterion),
        ],
    }

    def __init__(
        self,
        *,
        criterion="squared_error",
        splitter="best",
        max_depth=None,
        min_samples_split=2,
        min_samples_leaf=1,
        min_weight_fraction_leaf=0.0,
        max_features=None,
        random_state=None,
        max_leaf_nodes=None,
        min_impurity_decrease=0.0,
        ccp_alpha=0.0,
        monotonic_cst=None,
    ):
        super().__init__(
            criterion=criterion,
            splitter=splitter,
            max_depth=max_depth,
            min_samples_split=min_samples_split,
            min_samples_leaf=min_samples_leaf,
            min_weight_fraction_leaf=min_weight_fraction_leaf,
            max_features=max_features,
            max_leaf_nodes=max_leaf_nodes,
            random_state=random_state,
            min_impurity_decrease=min_impurity_decrease,
            ccp_alpha=ccp_alpha,
            monotonic_cst=monotonic_cst,
        )

    @_fit_context(prefer_skip_nested_validation=True)
    def fit(self, X, y, sample_weight=None, check_input=True):
        """Build a decision tree regressor from the training set (X, y).

        Parameters
        ----------
        X : {array-like, sparse matrix} of shape (n_samples, n_features)
            The training input samples. Internally, it will be converted to
            ``dtype=np.float32`` and if a sparse matrix is provided
            to a sparse ``csc_matrix``.

        y : array-like of shape (n_samples,) or (n_samples, n_outputs)
            The target values (real numbers). Use ``dtype=np.float64`` and
            ``order='C'`` for maximum efficiency.

        sample_weight : array-like of shape (n_samples,), default=None
            Sample weights. If None, then samples are equally weighted. Splits
            that would create child nodes with net zero or negative weight are
            ignored while searching for a split in each node.

        check_input : bool, default=True
            Allow to bypass several input checks.
            Don't use this parameter unless you know what you're doing.

        Returns
        -------
        self : DecisionTreeRegressor
            Fitted estimator.
        """

        super()._fit(
            X,
            y,
            sample_weight=sample_weight,
            check_input=check_input,
        )
        return self

    def _compute_partial_dependence_recursion(self, grid, target_features):
        """Fast partial dependence computation.

        Parameters
        ----------
        grid : ndarray of shape (n_samples, n_target_features), dtype=np.float32
            The grid points on which the partial dependence should be
            evaluated.

        target_features : ndarray of shape (n_target_features), dtype=np.intp
            The set of target features for which the partial dependence
            should be evaluated.

        Returns
        -------
        averaged_predictions : ndarray of shape (n_samples,), dtype=np.float64
            The value of the partial dependence function on each grid point.
        """
        grid = np.asarray(grid, dtype=DTYPE, order="C")
        averaged_predictions = np.zeros(
            shape=grid.shape[0], dtype=np.float64, order="C"
        )
        target_features = np.asarray(target_features, dtype=np.intp, order="C")

        self.tree_.compute_partial_dependence(
            grid, target_features, averaged_predictions
        )
        return averaged_predictions

    def _more_tags(self):
        # XXX: nan is only supported for dense arrays, but we set this for the
        # common tests to pass, specifically: check_estimators_nan_inf
        allow_nan = self.splitter == "best" and self.criterion in {
            "squared_error",
            "friedman_mse",
            "poisson",
        }
        return {"allow_nan": allow_nan}
|
||
|
|
||
|
|
||
|
class ExtraTreeClassifier(DecisionTreeClassifier):
|
||
|
"""An extremely randomized tree classifier.
|
||
|
|
||
|
Extra-trees differ from classic decision trees in the way they are built.
|
||
|
When looking for the best split to separate the samples of a node into two
|
||
|
groups, random splits are drawn for each of the `max_features` randomly
|
||
|
selected features and the best split among those is chosen. When
|
||
|
`max_features` is set 1, this amounts to building a totally random
|
||
|
decision tree.

    Warning: Extra-trees should only be used within ensemble methods.

    Read more in the :ref:`User Guide <tree>`.

    Parameters
    ----------
    criterion : {"gini", "entropy", "log_loss"}, default="gini"
        The function to measure the quality of a split. Supported criteria are
        "gini" for the Gini impurity and "log_loss" and "entropy" both for the
        Shannon information gain, see :ref:`tree_mathematical_formulation`.

    splitter : {"random", "best"}, default="random"
        The strategy used to choose the split at each node. Supported
        strategies are "best" to choose the best split and "random" to choose
        the best random split.

    max_depth : int, default=None
        The maximum depth of the tree. If None, then nodes are expanded until
        all leaves are pure or until all leaves contain less than
        min_samples_split samples.

    min_samples_split : int or float, default=2
        The minimum number of samples required to split an internal node:

        - If int, then consider `min_samples_split` as the minimum number.
        - If float, then `min_samples_split` is a fraction and
          `ceil(min_samples_split * n_samples)` are the minimum
          number of samples for each split.

        .. versionchanged:: 0.18
           Added float values for fractions.

    min_samples_leaf : int or float, default=1
        The minimum number of samples required to be at a leaf node.
        A split point at any depth will only be considered if it leaves at
        least ``min_samples_leaf`` training samples in each of the left and
        right branches. This may have the effect of smoothing the model,
        especially in regression.

        - If int, then consider `min_samples_leaf` as the minimum number.
        - If float, then `min_samples_leaf` is a fraction and
          `ceil(min_samples_leaf * n_samples)` are the minimum
          number of samples for each node.

        .. versionchanged:: 0.18
           Added float values for fractions.

    min_weight_fraction_leaf : float, default=0.0
        The minimum weighted fraction of the sum total of weights (of all
        the input samples) required to be at a leaf node. Samples have
        equal weight when sample_weight is not provided.

    max_features : int, float, {"sqrt", "log2"} or None, default="sqrt"
        The number of features to consider when looking for the best split:

        - If int, then consider `max_features` features at each split.
        - If float, then `max_features` is a fraction and
          `max(1, int(max_features * n_features_in_))` features are considered at
          each split.
        - If "sqrt", then `max_features=sqrt(n_features)`.
        - If "log2", then `max_features=log2(n_features)`.
        - If None, then `max_features=n_features`.

        .. versionchanged:: 1.1
            The default of `max_features` changed from `"auto"` to `"sqrt"`.

        Note: the search for a split does not stop until at least one
        valid partition of the node samples is found, even if it requires to
        effectively inspect more than ``max_features`` features.

    random_state : int, RandomState instance or None, default=None
        Used to pick randomly the `max_features` used at each split.
        See :term:`Glossary <random_state>` for details.

    max_leaf_nodes : int, default=None
        Grow a tree with ``max_leaf_nodes`` in best-first fashion.
        Best nodes are defined as relative reduction in impurity.
        If None then unlimited number of leaf nodes.

    min_impurity_decrease : float, default=0.0
        A node will be split if this split induces a decrease of the impurity
        greater than or equal to this value.

        The weighted impurity decrease equation is the following::

            N_t / N * (impurity - N_t_R / N_t * right_impurity
                                - N_t_L / N_t * left_impurity)

        where ``N`` is the total number of samples, ``N_t`` is the number of
        samples at the current node, ``N_t_L`` is the number of samples in the
        left child, and ``N_t_R`` is the number of samples in the right child.

        ``N``, ``N_t``, ``N_t_R`` and ``N_t_L`` all refer to the weighted sum,
        if ``sample_weight`` is passed.
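
        For example (illustrative numbers), with ``N = 100`` samples in total,
        splitting a node that holds ``N_t = 40`` samples with impurity ``0.5``
        into children with ``N_t_L = 30`` samples (impurity ``0.3``) and
        ``N_t_R = 10`` samples (impurity ``0.2``) gives a weighted decrease of
        ``40/100 * (0.5 - 10/40 * 0.2 - 30/40 * 0.3) = 0.09``.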

        .. versionadded:: 0.19

    class_weight : dict, list of dict or "balanced", default=None
        Weights associated with classes in the form ``{class_label: weight}``.
        If None, all classes are supposed to have weight one. For
        multi-output problems, a list of dicts can be provided in the same
        order as the columns of y.

        Note that for multioutput (including multilabel) weights should be
        defined for each class of every column in its own dict. For example,
        for four-class multilabel classification weights should be
        [{0: 1, 1: 1}, {0: 1, 1: 5}, {0: 1, 1: 1}, {0: 1, 1: 1}] instead of
        [{1:1}, {2:5}, {3:1}, {4:1}].

        The "balanced" mode uses the values of y to automatically adjust
        weights inversely proportional to class frequencies in the input data
        as ``n_samples / (n_classes * np.bincount(y))``.
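
        For example, for ``y = [0, 0, 0, 1]`` the "balanced" weights are
        ``4 / (2 * np.bincount(y))``, i.e. approximately ``[0.67, 2.0]``, so
        the minority class is up-weighted.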

        For multi-output, the weights of each column of y will be multiplied.

        Note that these weights will be multiplied with sample_weight (passed
        through the fit method) if sample_weight is specified.

    ccp_alpha : non-negative float, default=0.0
        Complexity parameter used for Minimal Cost-Complexity Pruning. The
        subtree with the largest cost complexity that is smaller than
        ``ccp_alpha`` will be chosen. By default, no pruning is performed. See
        :ref:`minimal_cost_complexity_pruning` for details.

        .. versionadded:: 0.22

    monotonic_cst : array-like of int of shape (n_features), default=None
        Indicates the monotonicity constraint to enforce on each feature.
          - 1: monotonic increase
          - 0: no constraint
          - -1: monotonic decrease

        If monotonic_cst is None, no constraints are applied.

        Monotonicity constraints are not supported for:
          - multiclass classifications (i.e. when `n_classes > 2`),
          - multioutput classifications (i.e. when `n_outputs_ > 1`),
          - classifications trained on data with missing values.

        The constraints hold over the probability of the positive class.
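
        For instance, on a hypothetical three-feature dataset,
        ``monotonic_cst=[1, 0, -1]`` would force the predicted probability of
        the positive class to be non-decreasing in the first feature and
        non-increasing in the third, while leaving the second unconstrained.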

        Read more in the :ref:`User Guide <monotonic_cst_gbdt>`.

        .. versionadded:: 1.4

    Attributes
    ----------
    classes_ : ndarray of shape (n_classes,) or list of ndarray
        The classes labels (single output problem),
        or a list of arrays of class labels (multi-output problem).

    max_features_ : int
        The inferred value of max_features.

    n_classes_ : int or list of int
        The number of classes (for single output problems),
        or a list containing the number of classes for each
        output (for multi-output problems).

    feature_importances_ : ndarray of shape (n_features,)
        The impurity-based feature importances.
        The higher, the more important the feature.
        The importance of a feature is computed as the (normalized)
        total reduction of the criterion brought by that feature. It is also
        known as the Gini importance.

        Warning: impurity-based feature importances can be misleading for
        high cardinality features (many unique values). See
        :func:`sklearn.inspection.permutation_importance` as an alternative.

    n_features_in_ : int
        Number of features seen during :term:`fit`.

        .. versionadded:: 0.24

    feature_names_in_ : ndarray of shape (`n_features_in_`,)
        Names of features seen during :term:`fit`. Defined only when `X`
        has feature names that are all strings.

        .. versionadded:: 1.0

    n_outputs_ : int
        The number of outputs when ``fit`` is performed.

    tree_ : Tree instance
        The underlying Tree object. Please refer to
        ``help(sklearn.tree._tree.Tree)`` for attributes of Tree object and
        :ref:`sphx_glr_auto_examples_tree_plot_unveil_tree_structure.py`
        for basic usage of these attributes.

    See Also
    --------
    ExtraTreeRegressor : An extremely randomized tree regressor.
    sklearn.ensemble.ExtraTreesClassifier : An extra-trees classifier.
    sklearn.ensemble.ExtraTreesRegressor : An extra-trees regressor.
    sklearn.ensemble.RandomForestClassifier : A random forest classifier.
    sklearn.ensemble.RandomForestRegressor : A random forest regressor.
    sklearn.ensemble.RandomTreesEmbedding : An ensemble of
        totally random trees.

    Notes
    -----
    The default values for the parameters controlling the size of the trees
    (e.g. ``max_depth``, ``min_samples_leaf``, etc.) lead to fully grown and
    unpruned trees which can potentially be very large on some data sets. To
    reduce memory consumption, the complexity and size of the trees should be
    controlled by setting those parameter values.

    References
    ----------

    .. [1] P. Geurts, D. Ernst., and L. Wehenkel, "Extremely randomized trees",
           Machine Learning, 63(1), 3-42, 2006.

    Examples
    --------
    >>> from sklearn.datasets import load_iris
    >>> from sklearn.model_selection import train_test_split
    >>> from sklearn.ensemble import BaggingClassifier
    >>> from sklearn.tree import ExtraTreeClassifier
    >>> X, y = load_iris(return_X_y=True)
    >>> X_train, X_test, y_train, y_test = train_test_split(
    ...    X, y, random_state=0)
    >>> extra_tree = ExtraTreeClassifier(random_state=0)
    >>> cls = BaggingClassifier(extra_tree, random_state=0).fit(
    ...    X_train, y_train)
    >>> cls.score(X_test, y_test)
    0.8947...
    """

    def __init__(
        self,
        *,
        criterion="gini",
        splitter="random",
        max_depth=None,
        min_samples_split=2,
        min_samples_leaf=1,
        min_weight_fraction_leaf=0.0,
        max_features="sqrt",
        random_state=None,
        max_leaf_nodes=None,
        min_impurity_decrease=0.0,
        class_weight=None,
        ccp_alpha=0.0,
        monotonic_cst=None,
    ):
        # ExtraTreeClassifier only changes the default values
        # (splitter="random", max_features="sqrt"); all parameters are
        # forwarded unchanged to DecisionTreeClassifier.
        super().__init__(
            criterion=criterion,
            splitter=splitter,
            max_depth=max_depth,
            min_samples_split=min_samples_split,
            min_samples_leaf=min_samples_leaf,
            min_weight_fraction_leaf=min_weight_fraction_leaf,
            max_features=max_features,
            max_leaf_nodes=max_leaf_nodes,
            class_weight=class_weight,
            min_impurity_decrease=min_impurity_decrease,
            random_state=random_state,
            ccp_alpha=ccp_alpha,
            monotonic_cst=monotonic_cst,
        )


class ExtraTreeRegressor(DecisionTreeRegressor):
    """An extremely randomized tree regressor.

    Extra-trees differ from classic decision trees in the way they are built.
    When looking for the best split to separate the samples of a node into two
    groups, random splits are drawn for each of the `max_features` randomly
    selected features and the best split among those is chosen. When
    `max_features` is set to 1, this amounts to building a totally random
    decision tree.

    Warning: Extra-trees should only be used within ensemble methods.

    Read more in the :ref:`User Guide <tree>`.

    Parameters
    ----------
    criterion : {"squared_error", "friedman_mse", "absolute_error", "poisson"}, \
            default="squared_error"
        The function to measure the quality of a split. Supported criteria
        are "squared_error" for the mean squared error, which is equal to
        variance reduction as feature selection criterion and minimizes the L2
        loss using the mean of each terminal node, "friedman_mse", which uses
        mean squared error with Friedman's improvement score for potential
        splits, "absolute_error" for the mean absolute error, which minimizes
        the L1 loss using the median of each terminal node, and "poisson" which
        uses reduction in Poisson deviance to find splits.

        .. versionadded:: 0.18
           Mean Absolute Error (MAE) criterion.

        .. versionadded:: 0.24
           Poisson deviance criterion.
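
        For example, ``criterion="poisson"`` is typically used when the target
        represents non-negative counts or frequencies.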

    splitter : {"random", "best"}, default="random"
        The strategy used to choose the split at each node. Supported
        strategies are "best" to choose the best split and "random" to choose
        the best random split.

    max_depth : int, default=None
        The maximum depth of the tree. If None, then nodes are expanded until
        all leaves are pure or until all leaves contain less than
        min_samples_split samples.

    min_samples_split : int or float, default=2
        The minimum number of samples required to split an internal node:

        - If int, then consider `min_samples_split` as the minimum number.
        - If float, then `min_samples_split` is a fraction and
          `ceil(min_samples_split * n_samples)` are the minimum
          number of samples for each split.

        .. versionchanged:: 0.18
           Added float values for fractions.

    min_samples_leaf : int or float, default=1
        The minimum number of samples required to be at a leaf node.
        A split point at any depth will only be considered if it leaves at
        least ``min_samples_leaf`` training samples in each of the left and
        right branches. This may have the effect of smoothing the model,
        especially in regression.

        - If int, then consider `min_samples_leaf` as the minimum number.
        - If float, then `min_samples_leaf` is a fraction and
          `ceil(min_samples_leaf * n_samples)` are the minimum
          number of samples for each node.

        .. versionchanged:: 0.18
           Added float values for fractions.

    min_weight_fraction_leaf : float, default=0.0
        The minimum weighted fraction of the sum total of weights (of all
        the input samples) required to be at a leaf node. Samples have
        equal weight when sample_weight is not provided.

    max_features : int, float, {"sqrt", "log2"} or None, default=1.0
        The number of features to consider when looking for the best split:

        - If int, then consider `max_features` features at each split.
        - If float, then `max_features` is a fraction and
          `max(1, int(max_features * n_features_in_))` features are considered at
          each split.
        - If "sqrt", then `max_features=sqrt(n_features)`.
        - If "log2", then `max_features=log2(n_features)`.
        - If None, then `max_features=n_features`.

        .. versionchanged:: 1.1
            The default of `max_features` changed from `"auto"` to `1.0`.

        Note: the search for a split does not stop until at least one
        valid partition of the node samples is found, even if it requires to
        effectively inspect more than ``max_features`` features.

    random_state : int, RandomState instance or None, default=None
        Used to pick randomly the `max_features` used at each split.
        See :term:`Glossary <random_state>` for details.

    min_impurity_decrease : float, default=0.0
        A node will be split if this split induces a decrease of the impurity
        greater than or equal to this value.

        The weighted impurity decrease equation is the following::

            N_t / N * (impurity - N_t_R / N_t * right_impurity
                                - N_t_L / N_t * left_impurity)

        where ``N`` is the total number of samples, ``N_t`` is the number of
        samples at the current node, ``N_t_L`` is the number of samples in the
        left child, and ``N_t_R`` is the number of samples in the right child.

        ``N``, ``N_t``, ``N_t_R`` and ``N_t_L`` all refer to the weighted sum,
        if ``sample_weight`` is passed.

        .. versionadded:: 0.19

    max_leaf_nodes : int, default=None
        Grow a tree with ``max_leaf_nodes`` in best-first fashion.
        Best nodes are defined as relative reduction in impurity.
        If None then unlimited number of leaf nodes.

    ccp_alpha : non-negative float, default=0.0
        Complexity parameter used for Minimal Cost-Complexity Pruning. The
        subtree with the largest cost complexity that is smaller than
        ``ccp_alpha`` will be chosen. By default, no pruning is performed. See
        :ref:`minimal_cost_complexity_pruning` for details.

        .. versionadded:: 0.22

    monotonic_cst : array-like of int of shape (n_features), default=None
        Indicates the monotonicity constraint to enforce on each feature.
          - 1: monotonic increase
          - 0: no constraint
          - -1: monotonic decrease

        If monotonic_cst is None, no constraints are applied.

        Monotonicity constraints are not supported for:
          - multioutput regressions (i.e. when `n_outputs_ > 1`),
          - regressions trained on data with missing values.

        Read more in the :ref:`User Guide <monotonic_cst_gbdt>`.

        .. versionadded:: 1.4

    Attributes
    ----------
    max_features_ : int
        The inferred value of max_features.

    n_features_in_ : int
        Number of features seen during :term:`fit`.

        .. versionadded:: 0.24

    feature_names_in_ : ndarray of shape (`n_features_in_`,)
        Names of features seen during :term:`fit`. Defined only when `X`
        has feature names that are all strings.

        .. versionadded:: 1.0

    feature_importances_ : ndarray of shape (n_features,)
        Return impurity-based feature importances (the higher, the more
        important the feature).

        Warning: impurity-based feature importances can be misleading for
        high cardinality features (many unique values). See
        :func:`sklearn.inspection.permutation_importance` as an alternative.

    n_outputs_ : int
        The number of outputs when ``fit`` is performed.

    tree_ : Tree instance
        The underlying Tree object. Please refer to
        ``help(sklearn.tree._tree.Tree)`` for attributes of Tree object and
        :ref:`sphx_glr_auto_examples_tree_plot_unveil_tree_structure.py`
        for basic usage of these attributes.

    See Also
    --------
    ExtraTreeClassifier : An extremely randomized tree classifier.
    sklearn.ensemble.ExtraTreesClassifier : An extra-trees classifier.
    sklearn.ensemble.ExtraTreesRegressor : An extra-trees regressor.

    Notes
    -----
    The default values for the parameters controlling the size of the trees
    (e.g. ``max_depth``, ``min_samples_leaf``, etc.) lead to fully grown and
    unpruned trees which can potentially be very large on some data sets. To
    reduce memory consumption, the complexity and size of the trees should be
    controlled by setting those parameter values.

    References
    ----------

    .. [1] P. Geurts, D. Ernst., and L. Wehenkel, "Extremely randomized trees",
           Machine Learning, 63(1), 3-42, 2006.

    Examples
    --------
    >>> from sklearn.datasets import load_diabetes
    >>> from sklearn.model_selection import train_test_split
    >>> from sklearn.ensemble import BaggingRegressor
    >>> from sklearn.tree import ExtraTreeRegressor
    >>> X, y = load_diabetes(return_X_y=True)
    >>> X_train, X_test, y_train, y_test = train_test_split(
    ...    X, y, random_state=0)
    >>> extra_tree = ExtraTreeRegressor(random_state=0)
    >>> reg = BaggingRegressor(extra_tree, random_state=0).fit(
    ...    X_train, y_train)
    >>> reg.score(X_test, y_test)
    0.33...
    """

    def __init__(
        self,
        *,
        criterion="squared_error",
        splitter="random",
        max_depth=None,
        min_samples_split=2,
        min_samples_leaf=1,
        min_weight_fraction_leaf=0.0,
        max_features=1.0,
        random_state=None,
        min_impurity_decrease=0.0,
        max_leaf_nodes=None,
        ccp_alpha=0.0,
        monotonic_cst=None,
    ):
        # ExtraTreeRegressor only changes the default values
        # (splitter="random", max_features=1.0); all parameters are forwarded
        # unchanged to DecisionTreeRegressor.
        super().__init__(
            criterion=criterion,
            splitter=splitter,
            max_depth=max_depth,
            min_samples_split=min_samples_split,
            min_samples_leaf=min_samples_leaf,
            min_weight_fraction_leaf=min_weight_fraction_leaf,
            max_features=max_features,
            max_leaf_nodes=max_leaf_nodes,
            min_impurity_decrease=min_impurity_decrease,
            random_state=random_state,
            ccp_alpha=ccp_alpha,
            monotonic_cst=monotonic_cst,
        )