"""Partial dependence plots for regression and classification models.""" # Authors: Peter Prettenhofer # Trevor Stephens # Nicolas Hug # License: BSD 3 clause from collections.abc import Iterable import warnings import numpy as np from scipy import sparse from scipy.stats.mstats import mquantiles from ..base import is_classifier, is_regressor from ..pipeline import Pipeline from ..utils.extmath import cartesian from ..utils import check_array from ..utils import check_matplotlib_support # noqa from ..utils import _safe_indexing from ..utils import _determine_key_type from ..utils import _get_column_indices from ..utils.validation import check_is_fitted from ..utils import Bunch from ..utils.validation import _deprecate_positional_args from ..tree import DecisionTreeRegressor from ..ensemble import RandomForestRegressor from ..exceptions import NotFittedError from ..ensemble._gb import BaseGradientBoosting from sklearn.ensemble._hist_gradient_boosting.gradient_boosting import ( BaseHistGradientBoosting) __all__ = [ 'partial_dependence', ] def _grid_from_X(X, percentiles, grid_resolution): """Generate a grid of points based on the percentiles of X. The grid is a cartesian product between the columns of ``values``. The ith column of ``values`` consists in ``grid_resolution`` equally-spaced points between the percentiles of the jth column of X. If ``grid_resolution`` is bigger than the number of unique values in the jth column of X, then those unique values will be used instead. Parameters ---------- X : ndarray, shape (n_samples, n_target_features) The data. percentiles : tuple of floats The percentiles which are used to construct the extreme values of the grid. Must be in [0, 1]. grid_resolution : int The number of equally spaced points to be placed on the grid for each feature. Returns ------- grid : ndarray, shape (n_points, n_target_features) A value for each feature at each point in the grid. ``n_points`` is always ``<= grid_resolution ** X.shape[1]``. values : list of 1d ndarrays The values with which the grid has been created. The size of each array ``values[j]`` is either ``grid_resolution``, or the number of unique values in ``X[:, j]``, whichever is smaller. """ if not isinstance(percentiles, Iterable) or len(percentiles) != 2: raise ValueError("'percentiles' must be a sequence of 2 elements.") if not all(0 <= x <= 1 for x in percentiles): raise ValueError("'percentiles' values must be in [0, 1].") if percentiles[0] >= percentiles[1]: raise ValueError('percentiles[0] must be strictly less ' 'than percentiles[1].') if grid_resolution <= 1: raise ValueError("'grid_resolution' must be strictly greater than 1.") values = [] for feature in range(X.shape[1]): uniques = np.unique(_safe_indexing(X, feature, axis=1)) if uniques.shape[0] < grid_resolution: # feature has low resolution use unique vals axis = uniques else: # create axis based on percentiles and grid resolution emp_percentiles = mquantiles( _safe_indexing(X, feature, axis=1), prob=percentiles, axis=0 ) if np.allclose(emp_percentiles[0], emp_percentiles[1]): raise ValueError( 'percentiles are too close to each other, ' 'unable to build the grid. Please choose percentiles ' 'that are further apart.') axis = np.linspace(emp_percentiles[0], emp_percentiles[1], num=grid_resolution, endpoint=True) values.append(axis) return cartesian(values), values def _partial_dependence_recursion(est, grid, features): averaged_predictions = est._compute_partial_dependence_recursion(grid, features) if averaged_predictions.ndim == 1: # reshape to (1, n_points) for consistency with # _partial_dependence_brute averaged_predictions = averaged_predictions.reshape(1, -1) return averaged_predictions def _partial_dependence_brute(est, grid, features, X, response_method): predictions = [] averaged_predictions = [] # define the prediction_method (predict, predict_proba, decision_function). if is_regressor(est): prediction_method = est.predict else: predict_proba = getattr(est, 'predict_proba', None) decision_function = getattr(est, 'decision_function', None) if response_method == 'auto': # try predict_proba, then decision_function if it doesn't exist prediction_method = predict_proba or decision_function else: prediction_method = (predict_proba if response_method == 'predict_proba' else decision_function) if prediction_method is None: if response_method == 'auto': raise ValueError( 'The estimator has no predict_proba and no ' 'decision_function method.' ) elif response_method == 'predict_proba': raise ValueError('The estimator has no predict_proba method.') else: raise ValueError( 'The estimator has no decision_function method.') for new_values in grid: X_eval = X.copy() for i, variable in enumerate(features): if hasattr(X_eval, 'iloc'): X_eval.iloc[:, variable] = new_values[i] else: X_eval[:, variable] = new_values[i] try: # Note: predictions is of shape # (n_points,) for non-multioutput regressors # (n_points, n_tasks) for multioutput regressors # (n_points, 1) for the regressors in cross_decomposition (I think) # (n_points, 2) for binary classification # (n_points, n_classes) for multiclass classification pred = prediction_method(X_eval) predictions.append(pred) # average over samples averaged_predictions.append(np.mean(pred, axis=0)) except NotFittedError as e: raise ValueError( "'estimator' parameter must be a fitted estimator") from e n_samples = X.shape[0] # reshape to (n_targets, n_instances, n_points) where n_targets is: # - 1 for non-multioutput regression and binary classification (shape is # already correct in those cases) # - n_tasks for multi-output regression # - n_classes for multiclass classification. predictions = np.array(predictions).T if is_regressor(est) and predictions.ndim == 2: # non-multioutput regression, shape is (n_instances, n_points,) predictions = predictions.reshape(n_samples, -1) elif is_classifier(est) and predictions.shape[0] == 2: # Binary classification, shape is (2, n_instances, n_points). # we output the effect of **positive** class predictions = predictions[1] predictions = predictions.reshape(n_samples, -1) # reshape averaged_predictions to (n_targets, n_points) where n_targets is: # - 1 for non-multioutput regression and binary classification (shape is # already correct in those cases) # - n_tasks for multi-output regression # - n_classes for multiclass classification. averaged_predictions = np.array(averaged_predictions).T if is_regressor(est) and averaged_predictions.ndim == 1: # non-multioutput regression, shape is (n_points,) averaged_predictions = averaged_predictions.reshape(1, -1) elif is_classifier(est) and averaged_predictions.shape[0] == 2: # Binary classification, shape is (2, n_points). # we output the effect of **positive** class averaged_predictions = averaged_predictions[1] averaged_predictions = averaged_predictions.reshape(1, -1) return averaged_predictions, predictions @_deprecate_positional_args def partial_dependence(estimator, X, features, *, response_method='auto', percentiles=(0.05, 0.95), grid_resolution=100, method='auto', kind='legacy'): """Partial dependence of ``features``. Partial dependence of a feature (or a set of features) corresponds to the average response of an estimator for each possible value of the feature. Read more in the :ref:`User Guide `. .. warning:: For :class:`~sklearn.ensemble.GradientBoostingClassifier` and :class:`~sklearn.ensemble.GradientBoostingRegressor`, the `'recursion'` method (used by default) will not account for the `init` predictor of the boosting process. In practice, this will produce the same values as `'brute'` up to a constant offset in the target response, provided that `init` is a constant estimator (which is the default). However, if `init` is not a constant estimator, the partial dependence values are incorrect for `'recursion'` because the offset will be sample-dependent. It is preferable to use the `'brute'` method. Note that this only applies to :class:`~sklearn.ensemble.GradientBoostingClassifier` and :class:`~sklearn.ensemble.GradientBoostingRegressor`, not to :class:`~sklearn.ensemble.HistGradientBoostingClassifier` and :class:`~sklearn.ensemble.HistGradientBoostingRegressor`. Parameters ---------- estimator : BaseEstimator A fitted estimator object implementing :term:`predict`, :term:`predict_proba`, or :term:`decision_function`. Multioutput-multiclass classifiers are not supported. X : {array-like or dataframe} of shape (n_samples, n_features) ``X`` is used to generate a grid of values for the target ``features`` (where the partial dependence will be evaluated), and also to generate values for the complement features when the `method` is 'brute'. features : array-like of {int, str} The feature (e.g. `[0]`) or pair of interacting features (e.g. `[(0, 1)]`) for which the partial dependency should be computed. response_method : {'auto', 'predict_proba', 'decision_function'}, \ default='auto' Specifies whether to use :term:`predict_proba` or :term:`decision_function` as the target response. For regressors this parameter is ignored and the response is always the output of :term:`predict`. By default, :term:`predict_proba` is tried first and we revert to :term:`decision_function` if it doesn't exist. If ``method`` is 'recursion', the response is always the output of :term:`decision_function`. percentiles : tuple of float, default=(0.05, 0.95) The lower and upper percentile used to create the extreme values for the grid. Must be in [0, 1]. grid_resolution : int, default=100 The number of equally spaced points on the grid, for each target feature. method : {'auto', 'recursion', 'brute'}, default='auto' The method used to calculate the averaged predictions: - `'recursion'` is only supported for some tree-based estimators (namely :class:`~sklearn.ensemble.GradientBoostingClassifier`, :class:`~sklearn.ensemble.GradientBoostingRegressor`, :class:`~sklearn.ensemble.HistGradientBoostingClassifier`, :class:`~sklearn.ensemble.HistGradientBoostingRegressor`, :class:`~sklearn.tree.DecisionTreeRegressor`, :class:`~sklearn.ensemble.RandomForestRegressor`, ) when `kind='average'`. This is more efficient in terms of speed. With this method, the target response of a classifier is always the decision function, not the predicted probabilities. Since the `'recursion'` method implicitely computes the average of the Individual Conditional Expectation (ICE) by design, it is not compatible with ICE and thus `kind` must be `'average'`. - `'brute'` is supported for any estimator, but is more computationally intensive. - `'auto'`: the `'recursion'` is used for estimators that support it, and `'brute'` is used otherwise. Please see :ref:`this note ` for differences between the `'brute'` and `'recursion'` method. kind : {'legacy', 'average', 'individual', 'both'}, default='legacy' Whether to return the partial dependence averaged across all the samples in the dataset or one line per sample or both. See Returns below. Note that the fast `method='recursion'` option is only available for `kind='average'`. Plotting individual dependencies requires using the slower `method='brute'` option. .. versionadded:: 0.24 .. deprecated:: 0.24 `kind='legacy'` is deprecated and will be removed in version 1.1. `kind='average'` will be the new default. It is intended to migrate from the ndarray output to :class:`~sklearn.utils.Bunch` output. Returns ------- predictions : ndarray or :class:`~sklearn.utils.Bunch` - if `kind='legacy'`, return value is ndarray of shape (n_outputs, \ len(values[0]), len(values[1]), ...) The predictions for all the points in the grid, averaged over all samples in X (or over the training data if ``method`` is 'recursion'). - if `kind='individual'`, `'average'` or `'both'`, return value is \ :class:`~sklearn.utils.Bunch` Dictionary-like object, with the following attributes. individual : ndarray of shape (n_outputs, n_instances, \ len(values[0]), len(values[1]), ...) The predictions for all the points in the grid for all samples in X. This is also known as Individual Conditional Expectation (ICE) average : ndarray of shape (n_outputs, len(values[0]), \ len(values[1]), ...) The predictions for all the points in the grid, averaged over all samples in X (or over the training data if ``method`` is 'recursion'). Only available when kind='both'. values : seq of 1d ndarrays The values with which the grid has been created. The generated grid is a cartesian product of the arrays in ``values``. ``len(values) == len(features)``. The size of each array ``values[j]`` is either ``grid_resolution``, or the number of unique values in ``X[:, j]``, whichever is smaller. ``n_outputs`` corresponds to the number of classes in a multi-class setting, or to the number of tasks for multi-output regression. For classical regression and binary classification ``n_outputs==1``. ``n_values_feature_j`` corresponds to the size ``values[j]``. values : seq of 1d ndarrays The values with which the grid has been created. The generated grid is a cartesian product of the arrays in ``values``. ``len(values) == len(features)``. The size of each array ``values[j]`` is either ``grid_resolution``, or the number of unique values in ``X[:, j]``, whichever is smaller. Only available when `kind="legacy"`. See Also -------- plot_partial_dependence : Plot Partial Dependence. PartialDependenceDisplay : Partial Dependence visualization. Examples -------- >>> X = [[0, 0, 2], [1, 0, 0]] >>> y = [0, 1] >>> from sklearn.ensemble import GradientBoostingClassifier >>> gb = GradientBoostingClassifier(random_state=0).fit(X, y) >>> partial_dependence(gb, features=[0], X=X, percentiles=(0, 1), ... grid_resolution=2) # doctest: +SKIP (array([[-4.52..., 4.52...]]), [array([ 0., 1.])]) """ if not (is_classifier(estimator) or is_regressor(estimator)): raise ValueError( "'estimator' must be a fitted regressor or classifier." ) if isinstance(estimator, Pipeline): # TODO: to be removed if/when pipeline get a `steps_` attributes # assuming Pipeline is the only estimator that does not store a new # attribute for est in estimator: # FIXME: remove the None option when it will be deprecated if est not in (None, 'drop'): check_is_fitted(est) else: check_is_fitted(estimator) if (is_classifier(estimator) and isinstance(estimator.classes_[0], np.ndarray)): raise ValueError( 'Multiclass-multioutput estimators are not supported' ) # Use check_array only on lists and other non-array-likes / sparse. Do not # convert DataFrame into a NumPy array. if not(hasattr(X, '__array__') or sparse.issparse(X)): X = check_array(X, force_all_finite='allow-nan', dtype=object) accepted_responses = ('auto', 'predict_proba', 'decision_function') if response_method not in accepted_responses: raise ValueError( 'response_method {} is invalid. Accepted response_method names ' 'are {}.'.format(response_method, ', '.join(accepted_responses))) if is_regressor(estimator) and response_method != 'auto': raise ValueError( "The response_method parameter is ignored for regressors and " "must be 'auto'." ) accepted_methods = ('brute', 'recursion', 'auto') if method not in accepted_methods: raise ValueError( 'method {} is invalid. Accepted method names are {}.'.format( method, ', '.join(accepted_methods))) if kind != 'average' and kind != 'legacy': if method == 'recursion': raise ValueError( "The 'recursion' method only applies when 'kind' is set " "to 'average'" ) method = 'brute' if method == 'auto': if (isinstance(estimator, BaseGradientBoosting) and estimator.init is None): method = 'recursion' elif isinstance(estimator, (BaseHistGradientBoosting, DecisionTreeRegressor, RandomForestRegressor)): method = 'recursion' else: method = 'brute' if method == 'recursion': if not isinstance(estimator, (BaseGradientBoosting, BaseHistGradientBoosting, DecisionTreeRegressor, RandomForestRegressor)): supported_classes_recursion = ( 'GradientBoostingClassifier', 'GradientBoostingRegressor', 'HistGradientBoostingClassifier', 'HistGradientBoostingRegressor', 'HistGradientBoostingRegressor', 'DecisionTreeRegressor', 'RandomForestRegressor', ) raise ValueError( "Only the following estimators support the 'recursion' " "method: {}. Try using method='brute'." .format(', '.join(supported_classes_recursion))) if response_method == 'auto': response_method = 'decision_function' if response_method != 'decision_function': raise ValueError( "With the 'recursion' method, the response_method must be " "'decision_function'. Got {}.".format(response_method) ) if _determine_key_type(features, accept_slice=False) == 'int': # _get_column_indices() supports negative indexing. Here, we limit # the indexing to be positive. The upper bound will be checked # by _get_column_indices() if np.any(np.less(features, 0)): raise ValueError( 'all features must be in [0, {}]'.format(X.shape[1] - 1) ) features_indices = np.asarray( _get_column_indices(X, features), dtype=np.int32, order='C' ).ravel() grid, values = _grid_from_X( _safe_indexing(X, features_indices, axis=1), percentiles, grid_resolution ) if method == 'brute': averaged_predictions, predictions = _partial_dependence_brute( estimator, grid, features_indices, X, response_method ) # reshape predictions to # (n_outputs, n_instances, n_values_feature_0, n_values_feature_1, ...) predictions = predictions.reshape( -1, X.shape[0], *[val.shape[0] for val in values] ) else: averaged_predictions = _partial_dependence_recursion( estimator, grid, features_indices ) # reshape averaged_predictions to # (n_outputs, n_values_feature_0, n_values_feature_1, ...) averaged_predictions = averaged_predictions.reshape( -1, *[val.shape[0] for val in values]) if kind == 'legacy': warnings.warn( "A Bunch will be returned in place of 'predictions' from version" " 1.1 (renaming of 0.26) with partial dependence results " "accessible via the 'average' key. In the meantime, pass " "kind='average' to get the future behaviour.", FutureWarning ) # TODO 1.1: Remove kind == 'legacy' section return averaged_predictions, values elif kind == 'average': return Bunch(average=averaged_predictions, values=values) elif kind == 'individual': return Bunch(individual=predictions, values=values) else: # kind='both' return Bunch( average=averaged_predictions, individual=predictions, values=values, )