"""
Sequential feature selection
"""

from numbers import Integral, Real

import numpy as np

from ..base import BaseEstimator, MetaEstimatorMixin, _fit_context, clone, is_classifier
from ..metrics import get_scorer_names
from ..model_selection import check_cv, cross_val_score
from ..utils._param_validation import HasMethods, Interval, RealNotInt, StrOptions
from ..utils._tags import _safe_tags
from ..utils.metadata_routing import _RoutingNotSupportedMixin
from ..utils.validation import check_is_fitted
from ._base import SelectorMixin


class SequentialFeatureSelector(
    _RoutingNotSupportedMixin, SelectorMixin, MetaEstimatorMixin, BaseEstimator
):
    """Transformer that performs Sequential Feature Selection.

    This Sequential Feature Selector adds (forward selection) or
    removes (backward selection) features to form a feature subset in a
    greedy fashion. At each stage, this estimator chooses the best feature to
    add or remove based on the cross-validation score of an estimator. In
    the case of unsupervised learning, this Sequential Feature Selector
    looks only at the features (X), not the desired outputs (y).

    Read more in the :ref:`User Guide <sequential_feature_selection>`.

    .. versionadded:: 0.24

    Parameters
    ----------
    estimator : estimator instance
        An unfitted estimator.

    n_features_to_select : "auto", int or float, default="auto"
        If `"auto"`, the behaviour depends on the `tol` parameter:

        - if `tol` is not `None`, then features are selected until the score
          improvement between two consecutive steps is smaller than `tol`.
        - otherwise, half of the features are selected.

        If integer, the parameter is the absolute number of features to select.
        If float between 0 and 1, it is the fraction of features to select.

        .. versionadded:: 1.1
           The option `"auto"` was added in version 1.1.

        .. versionchanged:: 1.3
           The default changed from `"warn"` to `"auto"` in 1.3.

    tol : float, default=None
        If the score is not incremented by at least `tol` between two
        consecutive feature additions or removals, stop adding or removing.

        `tol` can be negative when removing features using `direction="backward"`.
        It can be useful to reduce the number of features at the cost of a small
        decrease in the score.

        `tol` is enabled only when `n_features_to_select` is `"auto"`.

        .. versionadded:: 1.1

    direction : {'forward', 'backward'}, default='forward'
        Whether to perform forward selection or backward selection.

    scoring : str or callable, default=None
        A single str (see :ref:`scoring_parameter`) or a callable
        (see :ref:`scoring`) to evaluate the predictions on the test set.

        NOTE that when using a custom scorer, it should return a single
        value.

        If None, the estimator's score method is used.

    cv : int, cross-validation generator or an iterable, default=None
        Determines the cross-validation splitting strategy.
        Possible inputs for cv are:

        - None, to use the default 5-fold cross validation,
        - integer, to specify the number of folds in a `(Stratified)KFold`,
        - :term:`CV splitter`,
        - An iterable yielding (train, test) splits as arrays of indices.

        For integer/None inputs, if the estimator is a classifier and ``y`` is
        either binary or multiclass,
        :class:`~sklearn.model_selection.StratifiedKFold` is used. In all other
        cases, :class:`~sklearn.model_selection.KFold` is used. These splitters
        are instantiated with `shuffle=False` so the splits will be the same
        across calls.

        Refer to the :ref:`User Guide <cross_validation>` for the various
        cross-validation strategies that can be used here.

    n_jobs : int, default=None
        Number of jobs to run in parallel. When evaluating a new feature to
        add or remove, the cross-validation procedure is parallel over the
        folds.
        ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.
        ``-1`` means using all processors. See :term:`Glossary <n_jobs>`
        for more details.

    Attributes
    ----------
    n_features_in_ : int
        Number of features seen during :term:`fit`. Only defined if the
        underlying estimator exposes such an attribute when fit.

        .. versionadded:: 0.24

    feature_names_in_ : ndarray of shape (`n_features_in_`,)
        Names of features seen during :term:`fit`. Defined only when `X`
        has feature names that are all strings.

        .. versionadded:: 1.0

    n_features_to_select_ : int
        The number of features that were selected.

    support_ : ndarray of shape (n_features,), dtype=bool
        The mask of selected features.

    See Also
    --------
    GenericUnivariateSelect : Univariate feature selector with configurable
        strategy.
    RFE : Recursive feature elimination based on importance weights.
    RFECV : Recursive feature elimination based on importance weights, with
        automatic selection of the number of features.
    SelectFromModel : Feature selection based on thresholds of importance
        weights.

    Examples
    --------
    >>> from sklearn.feature_selection import SequentialFeatureSelector
    >>> from sklearn.neighbors import KNeighborsClassifier
    >>> from sklearn.datasets import load_iris
    >>> X, y = load_iris(return_X_y=True)
    >>> knn = KNeighborsClassifier(n_neighbors=3)
    >>> sfs = SequentialFeatureSelector(knn, n_features_to_select=3)
    >>> sfs.fit(X, y)
    SequentialFeatureSelector(estimator=KNeighborsClassifier(n_neighbors=3),
                              n_features_to_select=3)
    >>> sfs.get_support()
    array([ True, False,  True,  True])
    >>> sfs.transform(X).shape
    (150, 3)
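
    A stopping criterion can be used instead of a fixed number of features.
    The following lines are only an illustrative sketch: the `tol` value of
    0.05 is an arbitrary choice, not a recommendation.

    >>> sfs_auto = SequentialFeatureSelector(
    ...     knn, n_features_to_select="auto", tol=0.05
    ... )
    >>> _ = sfs_auto.fit(X, y)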
"""
|
||
|
|
||
|
_parameter_constraints: dict = {
|
||
|
"estimator": [HasMethods(["fit"])],
|
||
|
"n_features_to_select": [
|
||
|
StrOptions({"auto"}),
|
||
|
Interval(RealNotInt, 0, 1, closed="right"),
|
||
|
Interval(Integral, 0, None, closed="neither"),
|
||
|
],
|
||
|
"tol": [None, Interval(Real, None, None, closed="neither")],
|
||
|
"direction": [StrOptions({"forward", "backward"})],
|
||
|
"scoring": [None, StrOptions(set(get_scorer_names())), callable],
|
||
|
"cv": ["cv_object"],
|
||
|
"n_jobs": [None, Integral],
|
||
|
}
|
||
|
|
||
|
def __init__(
|
||
|
self,
|
||
|
estimator,
|
||
|
*,
|
||
|
n_features_to_select="auto",
|
||
|
tol=None,
|
||
|
direction="forward",
|
||
|
scoring=None,
|
||
|
cv=5,
|
||
|
n_jobs=None,
|
||
|
):
|
||
|
self.estimator = estimator
|
||
|
self.n_features_to_select = n_features_to_select
|
||
|
self.tol = tol
|
||
|
self.direction = direction
|
||
|
self.scoring = scoring
|
||
|
self.cv = cv
|
||
|
self.n_jobs = n_jobs
|
||
|
|
||
|
@_fit_context(
|
||
|
# SequentialFeatureSelector.estimator is not validated yet
|
||
|
prefer_skip_nested_validation=False
|
||
|
)
|
||
|
def fit(self, X, y=None):
|
||
|
"""Learn the features to select from X.
|
||
|
|
||
|
Parameters
|
||
|
----------
|
||
|
X : array-like of shape (n_samples, n_features)
|
||
|
Training vectors, where `n_samples` is the number of samples and
|
||
|
`n_features` is the number of predictors.
|
||
|
|
||
|
y : array-like of shape (n_samples,), default=None
|
||
|
Target values. This parameter may be ignored for
|
||
|
unsupervised learning.
|
||
|
|
||
|
Returns
|
||
|
-------
|
||
|
self : object
|
||
|
Returns the instance itself.
|
||
|
"""
|
||
|
tags = self._get_tags()
|
||
|
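        # Let NaNs pass through to the wrapped estimator only when its tags
        # declare `allow_nan`; otherwise reject non-finite values here.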
        X = self._validate_data(
            X,
            accept_sparse="csc",
            ensure_min_features=2,
            force_all_finite=not tags.get("allow_nan", True),
        )
        n_features = X.shape[1]

        if self.n_features_to_select == "auto":
            if self.tol is not None:
                # With auto feature selection, `n_features_to_select_` will be updated
                # to `support_.sum()` after features are selected.
                self.n_features_to_select_ = n_features - 1
            else:
                self.n_features_to_select_ = n_features // 2
        elif isinstance(self.n_features_to_select, Integral):
            if self.n_features_to_select >= n_features:
                raise ValueError("n_features_to_select must be < n_features.")
            self.n_features_to_select_ = self.n_features_to_select
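        # For a float in (0, 1], the requested fraction is truncated towards
        # zero, e.g. 0.5 with 5 features keeps 2 of them.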
        elif isinstance(self.n_features_to_select, Real):
            self.n_features_to_select_ = int(n_features * self.n_features_to_select)

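        # A negative `tol` only makes sense for backward selection, where a
        # bounded drop in score can be an acceptable price for removing features.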
        if self.tol is not None and self.tol < 0 and self.direction == "forward":
            raise ValueError("tol must be positive when doing forward selection")

        cv = check_cv(self.cv, y, classifier=is_classifier(self.estimator))

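        # Work on a clone so that the estimator instance passed by the user is
        # never modified by the selection procedure.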
        cloned_estimator = clone(self.estimator)

        # the current mask corresponds to the set of features:
        # - that we have already *selected* if we do forward selection
        # - that we have already *excluded* if we do backward selection
        current_mask = np.zeros(shape=n_features, dtype=bool)
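        # Number of greedy steps. Each iteration adds (forward) or removes
        # (backward) exactly one feature; in "auto" mode the loop below may
        # also stop early based on `tol`.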
        n_iterations = (
            self.n_features_to_select_
            if self.n_features_to_select == "auto" or self.direction == "forward"
            else n_features - self.n_features_to_select_
        )

        old_score = -np.inf
        is_auto_select = self.tol is not None and self.n_features_to_select == "auto"
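        # Greedy search: at every step, flip the single mask entry that yields
        # the best cross-validated score. In "auto" mode with `tol`, stop as
        # soon as the best achievable improvement drops below `tol`.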
        for _ in range(n_iterations):
            new_feature_idx, new_score = self._get_best_new_feature_score(
                cloned_estimator, X, y, cv, current_mask
            )
            if is_auto_select and ((new_score - old_score) < self.tol):
                break

            old_score = new_score
            current_mask[new_feature_idx] = True

        if self.direction == "backward":
            current_mask = ~current_mask

        self.support_ = current_mask
        self.n_features_to_select_ = self.support_.sum()

        return self

    def _get_best_new_feature_score(self, estimator, X, y, cv, current_mask):
        # Return the best new feature and its score to add to the current_mask,
        # i.e. return the best new feature and its score to add (resp. remove)
        # when doing forward selection (resp. backward selection).
        # When `n_features_to_select` is "auto", the caller in `fit` only keeps
        # going while each step improves the score by at least `tol`.
        candidate_feature_indices = np.flatnonzero(~current_mask)
        scores = {}
        for feature_idx in candidate_feature_indices:
            candidate_mask = current_mask.copy()
            candidate_mask[feature_idx] = True
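            # For backward selection, `current_mask` tracks *excluded* features,
            # so the columns actually scored are the complement of the mask.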
            if self.direction == "backward":
                candidate_mask = ~candidate_mask
            X_new = X[:, candidate_mask]
            scores[feature_idx] = cross_val_score(
                estimator,
                X_new,
                y,
                cv=cv,
                scoring=self.scoring,
                n_jobs=self.n_jobs,
            ).mean()
        new_feature_idx = max(scores, key=lambda feature_idx: scores[feature_idx])
        return new_feature_idx, scores[new_feature_idx]

    def _get_support_mask(self):
        check_is_fitted(self)
        return self.support_

    def _more_tags(self):
        return {
            "allow_nan": _safe_tags(self.estimator, key="allow_nan"),
        }