# Sebastian Raschka 2014-2020
# mlxtend Machine Learning Library Extensions
#
# Algorithm for sequential feature selection.
# Author: Sebastian Raschka <sebastianraschka.com>
#
# License: BSD 3 clause
import datetime
import types
import numpy as np
import scipy as sp
import scipy.stats
import sys
from copy import deepcopy
from itertools import combinations
from sklearn.metrics import get_scorer
from sklearn.base import clone
from sklearn.base import MetaEstimatorMixin
from ..externals.name_estimators import _name_estimators
from ..utils.base_compostion import _BaseXComposition
from sklearn.model_selection import cross_val_score
from joblib import Parallel, delayed
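# Helper used by SequentialFeatureSelector: scores one candidate feature
# subset. If `selector.cv` is set, the estimator is evaluated with
# cross_val_score; otherwise it is fit once on the data and scored with the
# selector's scorer. The indices are returned together with the score array so
# that results can be matched up after parallel execution.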
def _calc_score(selector, X, y, indices, groups=None, **fit_params):
if selector.cv:
scores = cross_val_score(selector.est_,
X, y,
groups=groups,
cv=selector.cv,
scoring=selector.scorer,
n_jobs=1,
pre_dispatch=selector.pre_dispatch,
fit_params=fit_params)
else:
selector.est_.fit(X, y, **fit_params)
scores = np.array([selector.scorer(selector.est_, X, y)])
return indices, scores
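# Helper that maps the integer feature indices stored in the subsets dict to
# human-readable names: custom names if provided, DataFrame column names if X
# is a pandas object, and plain string indices otherwise.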
def _get_featurenames(subsets_dict, feature_idx, custom_feature_names, X):
feature_names = None
if feature_idx is not None:
if custom_feature_names is not None:
feature_names = tuple((custom_feature_names[i]
for i in feature_idx))
elif hasattr(X, 'loc'):
feature_names = tuple((X.columns[i] for i in feature_idx))
else:
feature_names = tuple(str(i) for i in feature_idx)
subsets_dict_ = deepcopy(subsets_dict)
for key in subsets_dict_:
if custom_feature_names is not None:
new_tuple = tuple((custom_feature_names[i]
for i in subsets_dict[key]['feature_idx']))
elif hasattr(X, 'loc'):
new_tuple = tuple((X.columns[i]
for i in subsets_dict[key]['feature_idx']))
else:
new_tuple = tuple(str(i) for i in subsets_dict[key]['feature_idx'])
subsets_dict_[key]['feature_names'] = new_tuple
return subsets_dict_, feature_names
class SequentialFeatureSelector(_BaseXComposition, MetaEstimatorMixin):
"""Sequential Feature Selection for Classification and Regression.
Parameters
----------
estimator : scikit-learn classifier or regressor
k_features : int or tuple or str (default: 1)
Number of features to select,
where k_features < the full feature set.
New in 0.4.2: A tuple containing a min and max value can be provided,
and the SFS will return the feature combination between
min and max that scored highest in cross-validation. For example,
the tuple (1, 4) will return any combination from
1 up to 4 features instead of a fixed number of features k.
New in 0.8.0: A string argument "best" or "parsimonious".
If "best" is provided, the feature selector will return the
feature subset with the best cross-validation performance.
If "parsimonious" is provided as an argument, the smallest
feature subset that is within one standard error of the
cross-validation performance will be selected.
forward : bool (default: True)
Forward selection if True,
backward selection otherwise
floating : bool (default: False)
Adds a conditional exclusion/inclusion if True.
verbose : int (default: 0), level of verbosity to use in logging.
If 0, no output;
if 1, the number of features in the current set; if 2, detailed logging
including timestamp and cv scores at each step.
scoring : str, callable, or None (default: None)
If None (default), uses 'accuracy' for sklearn classifiers
and 'r2' for sklearn regressors.
If str, uses a sklearn scoring metric string identifier, for example
{accuracy, f1, precision, recall, roc_auc} for classifiers,
{'mean_absolute_error', 'mean_squared_error'/'neg_mean_squared_error',
'median_absolute_error', 'r2'} for regressors.
If a callable object or function is provided, it has to conform to
sklearn's signature ``scorer(estimator, X, y)``; see
http://scikit-learn.org/stable/modules/generated/sklearn.metrics.make_scorer.html
for more information.
cv : int (default: 5)
Integer or iterable yielding train, test splits. If cv is an integer
and `estimator` is a classifier (or y consists of integer class
labels), stratified k-fold cross-validation is performed. Otherwise
regular k-fold cross-validation is used. No cross-validation if cv is
None, False, or 0.
n_jobs : int (default: 1)
The number of CPUs to use for evaluating different feature subsets
in parallel. -1 means 'all CPUs'.
pre_dispatch : int, or string (default: '2*n_jobs')
Controls the number of jobs that get dispatched
during parallel execution if `n_jobs > 1` or `n_jobs=-1`.
Reducing this number can be useful to avoid an explosion of
memory consumption when more jobs get dispatched than CPUs can process.
This parameter can be:
None, in which case all the jobs are immediately created and spawned.
Use this for lightweight and fast-running jobs,
to avoid delays due to on-demand spawning of the jobs
An int, giving the exact number of total jobs that are spawned
A string, giving an expression as a function
of n_jobs, as in `2*n_jobs`
clone_estimator : bool (default: True)
Clones estimator if True; works with the original estimator instance
if False. Set to False if the estimator doesn't
implement scikit-learn's set_params and get_params methods.
In addition, it is required to set cv=0, and n_jobs=1.
fixed_features : tuple (default: None)
If not `None`, the feature indices provided as a tuple will be
regarded as fixed by the feature selector. For example, if
`fixed_features=(1, 3, 7)`, the 2nd, 4th, and 8th feature are
guaranteed to be present in the solution. Note that if
`fixed_features` is not `None`, make sure that the number of
features to be selected is greater than `len(fixed_features)`.
In other words, ensure that `k_features > len(fixed_features)`.
New in mlxtend v. 0.18.0.
Attributes
----------
k_feature_idx_ : array-like, shape = [n_predictions]
Feature indices of the selected feature subset.
k_feature_names_ : array-like, shape = [n_predictions]
Feature names of the selected feature subsets. If pandas
DataFrames are used in the `fit` method, the feature
names correspond to the column names. Otherwise, the
feature names are string representations of the feature
array indices. New in v 0.13.0.
k_score_ : float
Cross validation average score of the selected subset.
subsets_ : dict
A dictionary of selected feature subsets during the
sequential selection, where the dictionary keys are
the lengths k of these feature subsets. The dictionary
values are dictionaries themselves with the following
keys: 'feature_idx' (tuple of indices of the feature subset)
'feature_names' (tuple of feature names of the feat. subset)
'cv_scores' (list of individual cross-validation scores)
'avg_score' (average cross-validation score)
Note that if pandas
DataFrames are used in the `fit` method, the 'feature_names'
correspond to the column names. Otherwise, the
feature names are string representations of the feature
array indices. The 'feature_names' is new in v 0.13.0.
Examples
--------
For usage examples, please see
http://rasbt.github.io/mlxtend/user_guide/feature_selection/SequentialFeatureSelector/
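A minimal illustrative sketch (the estimator, settings, and the selected
indices shown below are placeholders; actual results depend on your data):

>>> from sklearn.datasets import load_iris
>>> from sklearn.neighbors import KNeighborsClassifier
>>> X, y = load_iris(return_X_y=True)
>>> knn = KNeighborsClassifier(n_neighbors=3)
>>> sfs = SequentialFeatureSelector(knn, k_features=3,
...                                 forward=True, floating=False,
...                                 scoring='accuracy', cv=5)
>>> sfs = sfs.fit(X, y)
>>> sfs.k_feature_idx_  # doctest: +SKIP
(0, 2, 3)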
"""
def __init__(self, estimator, k_features=1,
forward=True, floating=False,
verbose=0, scoring=None,
cv=5, n_jobs=1,
pre_dispatch='2*n_jobs',
clone_estimator=True,
fixed_features=None):
self.estimator = estimator
self.k_features = k_features
self.forward = forward
self.floating = floating
self.pre_dispatch = pre_dispatch
# Want to raise meaningful error message if a
# cross-validation generator is inputted
if isinstance(cv, types.GeneratorType):
err_msg = ('Input cv is a generator object, which is not '
'supported. Instead please input an iterable yielding '
'train, test splits. This can usually be done by '
'passing a cross-validation generator to the '
'built-in list function. I.e. cv=list(<cv-generator>)')
raise TypeError(err_msg)
self.cv = cv
self.n_jobs = n_jobs
self.verbose = verbose
self.clone_estimator = clone_estimator
if fixed_features is not None:
if isinstance(self.k_features, int) and \
self.k_features <= len(fixed_features):
raise ValueError('Number of features to be selected must'
' be larger than the number of'
' features specified via `fixed_features`.'
' Got `k_features=%d` and'
' `fixed_features=%d`' %
(k_features, len(fixed_features)))
elif isinstance(self.k_features, tuple) and \
self.k_features[0] <= len(fixed_features):
raise ValueError('The minimum number of features to'
' be selected must'
' be larger than the number of'
' features specified via `fixed_features`.'
' Got `k_features=%s` and '
'`len(fixed_features)=%d`' %
(k_features, len(fixed_features)))
self.fixed_features = fixed_features
if self.clone_estimator:
self.est_ = clone(self.estimator)
else:
self.est_ = self.estimator
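# Resolve the scoring argument below: default to 'accuracy' for classifiers
# and 'r2' for regressors, look up string metric names via sklearn's
# get_scorer, and use callable scorers as given.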
self.scoring = scoring
if scoring is None:
if self.est_._estimator_type == 'classifier':
scoring = 'accuracy'
elif self.est_._estimator_type == 'regressor':
scoring = 'r2'
else:
raise AttributeError('Estimator must '
'be a Classifier or Regressor.')
if isinstance(scoring, str):
self.scorer = get_scorer(scoring)
else:
self.scorer = scoring
self.fitted = False
self.subsets_ = {}
self.interrupted_ = False
# don't mess with this unless testing
self._TESTING_INTERRUPT_MODE = False
@property
def named_estimators(self):
"""
Returns
-------
List of named estimator tuples, like [('svc', SVC(...))]
"""
return _name_estimators([self.estimator])
def get_params(self, deep=True):
#
# Return estimator parameter names for GridSearch support.
#
return self._get_params('named_estimators', deep=deep)
def set_params(self, **params):
"""Set the parameters of this estimator.
Valid parameter keys can be listed with ``get_params()``.
Returns
-------
self
"""
self._set_params('estimator', 'named_estimators', **params)
return self
def fit(self, X, y, custom_feature_names=None, groups=None, **fit_params):
"""Perform feature selection and learn model from training data.
Parameters
----------
X : {array-like, sparse matrix}, shape = [n_samples, n_features]
Training vectors, where n_samples is the number of samples and
n_features is the number of features.
New in v 0.13.0: pandas DataFrames are now also accepted as
argument for X.
y : array-like, shape = [n_samples]
Target values.
New in v 0.13.0: pandas DataFrames are now also accepted as
argument for y.
custom_feature_names : None or tuple (default: None)
Custom feature names for `self.k_feature_names` and
`self.subsets_[i]['feature_names']`.
(new in v 0.13.0)
groups : array-like, with shape (n_samples,), optional
Group labels for the samples used while splitting the dataset into
train/test set. Passed to the fit method of the cross-validator.
fit_params : various, optional
Additional parameters that are being passed to the estimator.
For example, `sample_weight=weights`.
Returns
-------
self : object
"""
# reset from a potential previous fit run
self.subsets_ = {}
self.fitted = False
self.interrupted_ = False
self.k_feature_idx_ = None
self.k_feature_names_ = None
self.k_score_ = None
self.fixed_features_ = self.fixed_features
self.fixed_features_set_ = set()
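# If X is a pandas DataFrame, work on the underlying values and translate
# fixed features given as column names into positional indices.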
if hasattr(X, 'loc'):
X_ = X.values
if self.fixed_features is not None:
self.fixed_features_ = tuple(X.columns.get_loc(c)
if isinstance(c, str) else c
for c in self.fixed_features
)
else:
X_ = X
if self.fixed_features is not None:
self.fixed_features_set_ = set(self.fixed_features_)
if (custom_feature_names is not None
and len(custom_feature_names) != X.shape[1]):
raise ValueError('If custom_feature_names is not None, '
'the number of elements in custom_feature_names '
'must equal the number of columns in X.')
if not isinstance(self.k_features, int) and\
not isinstance(self.k_features, tuple)\
and not isinstance(self.k_features, str):
raise AttributeError('k_features must be a positive integer'
', tuple, or string')
if (isinstance(self.k_features, int) and (
self.k_features < 1 or self.k_features > X_.shape[1])):
raise AttributeError('k_features must be a positive integer'
' between 1 and X.shape[1], got %s'
% (self.k_features, ))
if isinstance(self.k_features, tuple):
if len(self.k_features) != 2:
raise AttributeError('k_features tuple must consist of 2'
' elements, a min and a max value.')
if self.k_features[0] not in range(1, X_.shape[1] + 1):
raise AttributeError('k_features tuple min value must be in'
' range(1, X.shape[1]+1).')
if self.k_features[1] not in range(1, X_.shape[1] + 1):
raise AttributeError('k_features tuple max value must be in'
' range(1, X.shape[1]+1).')
if self.k_features[0] > self.k_features[1]:
raise AttributeError('The min k_features value must be smaller'
' than the max k_features value.')
if isinstance(self.k_features, tuple) or\
isinstance(self.k_features, str):
select_in_range = True
if isinstance(self.k_features, str):
if self.k_features not in {'best', 'parsimonious'}:
raise AttributeError('If a string argument is provided, '
'it must be "best" or "parsimonious"')
else:
min_k = 1
max_k = X_.shape[1]
else:
min_k = self.k_features[0]
max_k = self.k_features[1]
else:
select_in_range = False
k_to_select = self.k_features
orig_set = set(range(X_.shape[1]))
n_features = X_.shape[1]
if self.forward and self.fixed_features is not None:
orig_set = set(range(X_.shape[1])) - self.fixed_features_set_
n_features = len(orig_set)
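# Initialize the starting subset: forward selection starts from the fixed
# features (or an empty set), backward elimination starts from the full
# feature set; non-empty starting subsets are scored and recorded in
# self.subsets_.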
if self.forward:
if select_in_range:
k_to_select = max_k
if self.fixed_features is not None:
k_idx = self.fixed_features_
k = len(k_idx)
k_idx, k_score = _calc_score(self, X_[:, k_idx], y, k_idx,
groups=groups, **fit_params)
self.subsets_[k] = {
'feature_idx': k_idx,
'cv_scores': k_score,
'avg_score': np.nanmean(k_score)
}
else:
k_idx = ()
k = 0
else:
if select_in_range:
k_to_select = min_k
k_idx = tuple(orig_set)
k = len(k_idx)
k_idx, k_score = _calc_score(self, X_[:, k_idx], y, k_idx,
groups=groups, **fit_params)
self.subsets_[k] = {
'feature_idx': k_idx,
'cv_scores': k_score,
'avg_score': np.nanmean(k_score)
}
best_subset = None
k_score = 0
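# Main search loop: each pass adds (forward) or removes (backward) the single
# feature whose inclusion/exclusion yields the best average CV score,
# optionally followed by a floating step. A KeyboardInterrupt stops the search
# early but keeps the results collected so far.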
try:
while k != k_to_select:
prev_subset = set(k_idx)
if self.forward:
k_idx, k_score, cv_scores = self._inclusion(
orig_set=orig_set,
subset=prev_subset,
X=X_,
y=y,
groups=groups,
**fit_params
)
else:
k_idx, k_score, cv_scores = self._exclusion(
feature_set=prev_subset,
X=X_,
y=y,
groups=groups,
fixed_feature=self.fixed_features_set_,
**fit_params
)
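# Floating step (SFFS/SFBS): conditionally reverse earlier decisions, i.e. try
# removing a previously added feature (forward) or re-adding a previously
# removed one (backward), and keep the change only if it beats the best score
# already recorded for that subset size.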
if self.floating:
if self.forward:
continuation_cond_1 = len(k_idx)
else:
continuation_cond_1 = n_features - len(k_idx)
continuation_cond_2 = True
ran_step_1 = True
new_feature = None
while continuation_cond_1 >= 2 and continuation_cond_2:
k_score_c = None
if ran_step_1:
(new_feature,) = set(k_idx) ^ prev_subset
if self.forward:
fixed_features_ok = True
if self.fixed_features is not None and \
len(self.fixed_features) - len(k_idx) <= 1:
fixed_features_ok = False
if fixed_features_ok:
k_idx_c, k_score_c, cv_scores_c = \
self._exclusion(
feature_set=k_idx,
fixed_feature=(
{new_feature} |
self.fixed_features_set_),
X=X_,
y=y,
groups=groups,
**fit_params
)
else:
k_idx_c, k_score_c, cv_scores_c = self._inclusion(
orig_set=orig_set - {new_feature},
subset=set(k_idx),
X=X_,
y=y,
groups=groups,
**fit_params
)
if k_score_c is not None and k_score_c > k_score:
if len(k_idx_c) in self.subsets_:
cached_score = self.subsets_[len(
k_idx_c)]['avg_score']
else:
cached_score = None
if cached_score is None or \
k_score_c > cached_score:
prev_subset = set(k_idx)
k_idx, k_score, cv_scores = \
k_idx_c, k_score_c, cv_scores_c
continuation_cond_1 = len(k_idx)
ran_step_1 = False
else:
continuation_cond_2 = False
else:
continuation_cond_2 = False
k = len(k_idx)
# floating can lead to multiple same-sized subsets
if k not in self.subsets_ or (k_score >
self.subsets_[k]['avg_score']):
k_idx = tuple(sorted(k_idx))
self.subsets_[k] = {
'feature_idx': k_idx,
'cv_scores': cv_scores,
'avg_score': k_score
}
if self.verbose == 1:
sys.stderr.write('\rFeatures: %d/%s' % (
len(k_idx),
k_to_select
))
sys.stderr.flush()
elif self.verbose > 1:
sys.stderr.write('\n[%s] Features: %d/%s -- score: %s' % (
datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
len(k_idx),
k_to_select,
k_score
))
if self._TESTING_INTERRUPT_MODE:
self.subsets_, self.k_feature_names_ = \
_get_featurenames(self.subsets_,
self.k_feature_idx_,
custom_feature_names,
X)
raise KeyboardInterrupt
except KeyboardInterrupt:
self.interrupted_ = True
sys.stderr.write('\nSTOPPING EARLY DUE TO KEYBOARD INTERRUPT...')
if select_in_range:
max_score = float('-inf')
for k in self.subsets_:
if k < min_k or k > max_k:
continue
if self.subsets_[k]['avg_score'] > max_score:
max_score = self.subsets_[k]['avg_score']
best_subset = k
k_score = max_score
k_idx = self.subsets_[best_subset]['feature_idx']
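# 'parsimonious': move to the smallest subset whose average score is still
# within the standard-error margin (as computed below) of the best score.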
if self.k_features == 'parsimonious':
for k in self.subsets_:
if k >= best_subset:
continue
if self.subsets_[k]['avg_score'] >= (
max_score - np.std(self.subsets_[k]['cv_scores']) /
self.subsets_[k]['cv_scores'].shape[0]):
max_score = self.subsets_[k]['avg_score']
best_subset = k
k_score = max_score
k_idx = self.subsets_[best_subset]['feature_idx']
self.k_feature_idx_ = k_idx
self.k_score_ = k_score
self.fitted = True
self.subsets_, self.k_feature_names_ = \
_get_featurenames(self.subsets_,
self.k_feature_idx_,
custom_feature_names,
X)
return self
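# Try adding each remaining feature (in parallel) to the current subset and
# return the best resulting (indices, average score, cv scores).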
def _inclusion(self, orig_set, subset, X, y, ignore_feature=None,
groups=None, **fit_params):
all_avg_scores = []
all_cv_scores = []
all_subsets = []
res = (None, None, None)
remaining = orig_set - subset
if remaining:
features = len(remaining)
n_jobs = min(self.n_jobs, features)
parallel = Parallel(n_jobs=n_jobs, verbose=self.verbose,
pre_dispatch=self.pre_dispatch)
work = parallel(delayed(_calc_score)
(self, X[:, tuple(subset | {feature})], y,
tuple(subset | {feature}),
groups=groups, **fit_params)
for feature in remaining
if feature != ignore_feature)
for new_subset, cv_scores in work:
all_avg_scores.append(np.nanmean(cv_scores))
all_cv_scores.append(cv_scores)
all_subsets.append(new_subset)
best = np.argmax(all_avg_scores)
res = (all_subsets[best],
all_avg_scores[best],
all_cv_scores[best])
return res
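# Try removing each feature (in parallel) from the current subset, skipping
# candidates that would drop a fixed feature, and return the best resulting
# (indices, average score, cv scores).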
def _exclusion(self, feature_set, X, y, fixed_feature=None,
groups=None, **fit_params):
n = len(feature_set)
res = (None, None, None)
if n > 1:
all_avg_scores = []
all_cv_scores = []
all_subsets = []
features = n
n_jobs = min(self.n_jobs, features)
parallel = Parallel(n_jobs=n_jobs, verbose=self.verbose,
pre_dispatch=self.pre_dispatch)
work = parallel(delayed(_calc_score)(self, X[:, p], y, p,
groups=groups, **fit_params)
for p in combinations(feature_set, r=n - 1)
if not fixed_feature or
fixed_feature.issubset(set(p)))
for p, cv_scores in work:
all_avg_scores.append(np.nanmean(cv_scores))
all_cv_scores.append(cv_scores)
all_subsets.append(p)
best = np.argmax(all_avg_scores)
res = (all_subsets[best],
all_avg_scores[best],
all_cv_scores[best])
return res
def transform(self, X):
"""Reduce X to its most important features.
Parameters
----------
X : {array-like, sparse matrix}, shape = [n_samples, n_features]
Training vectors, where n_samples is the number of samples and
n_features is the number of features.
New in v 0.13.0: pandas DataFrames are now also accepted as
argument for X.
Returns
-------
Reduced feature subset of X, shape={n_samples, k_features}
"""
self._check_fitted()
if hasattr(X, 'loc'):
X_ = X.values
else:
X_ = X
return X_[:, self.k_feature_idx_]
def fit_transform(self, X, y, groups=None, **fit_params):
"""Fit to training data then reduce X to its most important features.
Parameters
----------
X : {array-like, sparse matrix}, shape = [n_samples, n_features]
Training vectors, where n_samples is the number of samples and
n_features is the number of features.
New in v 0.13.0: pandas DataFrames are now also accepted as
argument for X.
y : array-like, shape = [n_samples]
Target values.
New in v 0.13.0: a pandas Series is now also accepted as
argument for y.
groups : array-like, with shape (n_samples,), optional
Group labels for the samples used while splitting the dataset into
train/test set. Passed to the fit method of the cross-validator.
fit_params : various, optional
Additional parameters that are being passed to the estimator.
For example, `sample_weight=weights`.
Returns
-------
Reduced feature subset of X, shape={n_samples, k_features}
"""
self.fit(X, y, groups=groups, **fit_params)
return self.transform(X)
def get_metric_dict(self, confidence_interval=0.95):
"""Return metric dictionary
Parameters
----------
confidence_interval : float (default: 0.95)
A positive float between 0.0 and 1.0 to compute the confidence
interval bounds of the CV score averages.
Returns
-------
Dictionary, where each entry corresponds to one feature subset
recorded during the search (the dictionary keys are the subset
sizes k). Each value is itself a dictionary with the following keys:
'feature_idx': tuple of the indices of the feature subset
'cv_scores': list with individual CV scores
'avg_score': average of CV scores
'std_dev': standard deviation of the CV score average
'std_err': standard error of the CV score average
'ci_bound': confidence interval bound of the CV score average
"""
self._check_fitted()
fdict = deepcopy(self.subsets_)
for k in fdict:
std_dev = np.std(self.subsets_[k]['cv_scores'])
bound, std_err = self._calc_confidence(
self.subsets_[k]['cv_scores'],
confidence=confidence_interval)
fdict[k]['ci_bound'] = bound
fdict[k]['std_dev'] = std_dev
fdict[k]['std_err'] = std_err
return fdict
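# The confidence bound is the standard error of the mean multiplied by the
# two-sided Student-t quantile at (1 + confidence) / 2.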
def _calc_confidence(self, ary, confidence=0.95):
std_err = scipy.stats.sem(ary)
bound = std_err * sp.stats.t._ppf((1 + confidence) / 2.0, len(ary))
return bound, std_err
def _check_fitted(self):
if not self.fitted:
raise AttributeError('SequentialFeatureSelector has not been'
' fitted, yet.')