Inzynierka/Lib/site-packages/sklearn/feature_selection/_sequential.py
2023-06-02 12:51:02 +02:00

326 lines
12 KiB
Python

"""
Sequential feature selection
"""
from numbers import Integral, Real
import numpy as np
import warnings
from ._base import SelectorMixin
from ..base import BaseEstimator, MetaEstimatorMixin, clone
from ..utils._param_validation import HasMethods, Hidden, Interval, StrOptions
from ..utils._tags import _safe_tags
from ..utils.validation import check_is_fitted
from ..model_selection import cross_val_score
from ..metrics import get_scorer_names
class SequentialFeatureSelector(SelectorMixin, MetaEstimatorMixin, BaseEstimator):
"""Transformer that performs Sequential Feature Selection.
This Sequential Feature Selector adds (forward selection) or
removes (backward selection) features to form a feature subset in a
greedy fashion. At each stage, this estimator chooses the best feature to
add or remove based on the cross-validation score of an estimator. In
the case of unsupervised learning, this Sequential Feature Selector
looks only at the features (X), not the desired outputs (y).
Read more in the :ref:`User Guide <sequential_feature_selection>`.
.. versionadded:: 0.24
Parameters
----------
estimator : estimator instance
An unfitted estimator.
n_features_to_select : "auto", int or float, default='warn'
If `"auto"`, the behaviour depends on the `tol` parameter:
- if `tol` is not `None`, then features are selected until the score
improvement does not exceed `tol`.
- otherwise, half of the features are selected.
If integer, the parameter is the absolute number of features to select.
If float between 0 and 1, it is the fraction of features to select.
.. versionadded:: 1.1
The option `"auto"` was added in version 1.1.
.. deprecated:: 1.1
The default changed from `None` to `"warn"` in 1.1 and will become
`"auto"` in 1.3. `None` and `'warn'` will be removed in 1.3.
To keep the same behaviour as `None`, set
`n_features_to_select="auto" and `tol=None`.
tol : float, default=None
If the score is not incremented by at least `tol` between two
consecutive feature additions or removals, stop adding or removing.
`tol` can be negative when removing features using `direction="backward"`.
It can be useful to reduce the number of features at the cost of a small
decrease in the score.
`tol` is enabled only when `n_features_to_select` is `"auto"`.
.. versionadded:: 1.1
direction : {'forward', 'backward'}, default='forward'
Whether to perform forward selection or backward selection.
scoring : str or callable, default=None
A single str (see :ref:`scoring_parameter`) or a callable
(see :ref:`scoring`) to evaluate the predictions on the test set.
NOTE that when using a custom scorer, it should return a single
value.
If None, the estimator's score method is used.
cv : int, cross-validation generator or an iterable, default=None
Determines the cross-validation splitting strategy.
Possible inputs for cv are:
- None, to use the default 5-fold cross validation,
- integer, to specify the number of folds in a `(Stratified)KFold`,
- :term:`CV splitter`,
- An iterable yielding (train, test) splits as arrays of indices.
For integer/None inputs, if the estimator is a classifier and ``y`` is
either binary or multiclass, :class:`StratifiedKFold` is used. In all
other cases, :class:`KFold` is used. These splitters are instantiated
with `shuffle=False` so the splits will be the same across calls.
Refer :ref:`User Guide <cross_validation>` for the various
cross-validation strategies that can be used here.
n_jobs : int, default=None
Number of jobs to run in parallel. When evaluating a new feature to
add or remove, the cross-validation procedure is parallel over the
folds.
``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.
``-1`` means using all processors. See :term:`Glossary <n_jobs>`
for more details.
Attributes
----------
n_features_in_ : int
Number of features seen during :term:`fit`. Only defined if the
underlying estimator exposes such an attribute when fit.
.. versionadded:: 0.24
feature_names_in_ : ndarray of shape (`n_features_in_`,)
Names of features seen during :term:`fit`. Defined only when `X`
has feature names that are all strings.
.. versionadded:: 1.0
n_features_to_select_ : int
The number of features that were selected.
support_ : ndarray of shape (n_features,), dtype=bool
The mask of selected features.
See Also
--------
GenericUnivariateSelect : Univariate feature selector with configurable
strategy.
RFE : Recursive feature elimination based on importance weights.
RFECV : Recursive feature elimination based on importance weights, with
automatic selection of the number of features.
SelectFromModel : Feature selection based on thresholds of importance
weights.
Examples
--------
>>> from sklearn.feature_selection import SequentialFeatureSelector
>>> from sklearn.neighbors import KNeighborsClassifier
>>> from sklearn.datasets import load_iris
>>> X, y = load_iris(return_X_y=True)
>>> knn = KNeighborsClassifier(n_neighbors=3)
>>> sfs = SequentialFeatureSelector(knn, n_features_to_select=3)
>>> sfs.fit(X, y)
SequentialFeatureSelector(estimator=KNeighborsClassifier(n_neighbors=3),
n_features_to_select=3)
>>> sfs.get_support()
array([ True, False, True, True])
>>> sfs.transform(X).shape
(150, 3)
"""
_parameter_constraints: dict = {
"estimator": [HasMethods(["fit"])],
"n_features_to_select": [
StrOptions({"auto", "warn"}, deprecated={"warn"}),
Interval(Real, 0, 1, closed="right"),
Interval(Integral, 0, None, closed="neither"),
Hidden(None),
],
"tol": [None, Interval(Real, None, None, closed="neither")],
"direction": [StrOptions({"forward", "backward"})],
"scoring": [None, StrOptions(set(get_scorer_names())), callable],
"cv": ["cv_object"],
"n_jobs": [None, Integral],
}
def __init__(
self,
estimator,
*,
n_features_to_select="warn",
tol=None,
direction="forward",
scoring=None,
cv=5,
n_jobs=None,
):
self.estimator = estimator
self.n_features_to_select = n_features_to_select
self.tol = tol
self.direction = direction
self.scoring = scoring
self.cv = cv
self.n_jobs = n_jobs
def fit(self, X, y=None):
"""Learn the features to select from X.
Parameters
----------
X : array-like of shape (n_samples, n_features)
Training vectors, where `n_samples` is the number of samples and
`n_features` is the number of predictors.
y : array-like of shape (n_samples,), default=None
Target values. This parameter may be ignored for
unsupervised learning.
Returns
-------
self : object
Returns the instance itself.
"""
self._validate_params()
# FIXME: to be removed in 1.3
if self.n_features_to_select in ("warn", None):
# for backwards compatibility
warnings.warn(
"Leaving `n_features_to_select` to "
"None is deprecated in 1.0 and will become 'auto' "
"in 1.3. To keep the same behaviour as with None "
"(i.e. select half of the features) and avoid "
"this warning, you should manually set "
"`n_features_to_select='auto'` and set tol=None "
"when creating an instance.",
FutureWarning,
)
tags = self._get_tags()
X = self._validate_data(
X,
accept_sparse="csc",
ensure_min_features=2,
force_all_finite=not tags.get("allow_nan", True),
)
n_features = X.shape[1]
# FIXME: to be fixed in 1.3
error_msg = (
"n_features_to_select must be either 'auto', 'warn', "
"None, an integer in [1, n_features - 1] "
"representing the absolute "
"number of features, or a float in (0, 1] "
"representing a percentage of features to "
f"select. Got {self.n_features_to_select}"
)
if self.n_features_to_select in ("warn", None):
if self.tol is not None:
raise ValueError("tol is only enabled if `n_features_to_select='auto'`")
self.n_features_to_select_ = n_features // 2
elif self.n_features_to_select == "auto":
if self.tol is not None:
# With auto feature selection, `n_features_to_select_` will be updated
# to `support_.sum()` after features are selected.
self.n_features_to_select_ = n_features - 1
else:
self.n_features_to_select_ = n_features // 2
elif isinstance(self.n_features_to_select, Integral):
if not 0 < self.n_features_to_select < n_features:
raise ValueError(error_msg)
self.n_features_to_select_ = self.n_features_to_select
elif isinstance(self.n_features_to_select, Real):
self.n_features_to_select_ = int(n_features * self.n_features_to_select)
if self.tol is not None and self.tol < 0 and self.direction == "forward":
raise ValueError("tol must be positive when doing forward selection")
cloned_estimator = clone(self.estimator)
# the current mask corresponds to the set of features:
# - that we have already *selected* if we do forward selection
# - that we have already *excluded* if we do backward selection
current_mask = np.zeros(shape=n_features, dtype=bool)
n_iterations = (
self.n_features_to_select_
if self.n_features_to_select == "auto" or self.direction == "forward"
else n_features - self.n_features_to_select_
)
old_score = -np.inf
is_auto_select = self.tol is not None and self.n_features_to_select == "auto"
for _ in range(n_iterations):
new_feature_idx, new_score = self._get_best_new_feature_score(
cloned_estimator, X, y, current_mask
)
if is_auto_select and ((new_score - old_score) < self.tol):
break
old_score = new_score
current_mask[new_feature_idx] = True
if self.direction == "backward":
current_mask = ~current_mask
self.support_ = current_mask
self.n_features_to_select_ = self.support_.sum()
return self
def _get_best_new_feature_score(self, estimator, X, y, current_mask):
# Return the best new feature and its score to add to the current_mask,
# i.e. return the best new feature and its score to add (resp. remove)
# when doing forward selection (resp. backward selection).
# Feature will be added if the current score and past score are greater
# than tol when n_feature is auto,
candidate_feature_indices = np.flatnonzero(~current_mask)
scores = {}
for feature_idx in candidate_feature_indices:
candidate_mask = current_mask.copy()
candidate_mask[feature_idx] = True
if self.direction == "backward":
candidate_mask = ~candidate_mask
X_new = X[:, candidate_mask]
scores[feature_idx] = cross_val_score(
estimator,
X_new,
y,
cv=self.cv,
scoring=self.scoring,
n_jobs=self.n_jobs,
).mean()
new_feature_idx = max(scores, key=lambda feature_idx: scores[feature_idx])
return new_feature_idx, scores[new_feature_idx]
def _get_support_mask(self):
check_is_fitted(self)
return self.support_
def _more_tags(self):
return {
"allow_nan": _safe_tags(self.estimator, key="allow_nan"),
}