# Sebastian Raschka 2014-2020
# mlxtend Machine Learning Library Extensions
#
# Algorithm for sequential feature selection.
# Author: Sebastian Raschka <sebastianraschka.com>
#
# License: BSD 3 clause

import datetime
import sys
import types
from copy import deepcopy
from itertools import combinations

import numpy as np
import scipy.stats
from joblib import Parallel, delayed
from sklearn.base import MetaEstimatorMixin, clone
from sklearn.metrics import get_scorer
from sklearn.model_selection import cross_val_score

from ..externals.name_estimators import _name_estimators
from ..utils.base_compostion import _BaseXComposition


def _calc_score(selector, X, y, indices, groups=None, **fit_params):
    """Fit the estimator on the given feature subset and return its score(s)."""
    if selector.cv:
        scores = cross_val_score(selector.est_,
                                 X, y,
                                 groups=groups,
                                 cv=selector.cv,
                                 scoring=selector.scorer,
                                 n_jobs=1,
                                 pre_dispatch=selector.pre_dispatch,
                                 fit_params=fit_params)
    else:
        # No cross-validation: fit once and score on the training data.
        selector.est_.fit(X, y, **fit_params)
        scores = np.array([selector.scorer(selector.est_, X, y)])
    return indices, scores


def _get_featurenames(subsets_dict, feature_idx, custom_feature_names, X):
    """Map feature indices to feature names for all recorded subsets."""
    feature_names = None
    if feature_idx is not None:
        if custom_feature_names is not None:
            feature_names = tuple(custom_feature_names[i]
                                  for i in feature_idx)
        elif hasattr(X, 'loc'):
            # pandas DataFrame input: use the column labels as names.
            feature_names = tuple(X.columns[i] for i in feature_idx)
        else:
            feature_names = tuple(str(i) for i in feature_idx)

    subsets_dict_ = deepcopy(subsets_dict)
    for key in subsets_dict_:
        if custom_feature_names is not None:
            new_tuple = tuple(custom_feature_names[i]
                              for i in subsets_dict[key]['feature_idx'])
        elif hasattr(X, 'loc'):
            new_tuple = tuple(X.columns[i]
                              for i in subsets_dict[key]['feature_idx'])
        else:
            new_tuple = tuple(str(i)
                              for i in subsets_dict[key]['feature_idx'])
        subsets_dict_[key]['feature_names'] = new_tuple

    return subsets_dict_, feature_names


class SequentialFeatureSelector(_BaseXComposition, MetaEstimatorMixin):

    """Sequential Feature Selection for Classification and Regression.

    Parameters
    ----------
    estimator : scikit-learn classifier or regressor
    k_features : int or tuple or str (default: 1)
        Number of features to select,
        where k_features < the full feature set.
        New in 0.4.2: A tuple containing a min and max value can be provided,
        and the SFS will return any feature combination between
        min and max that scored highest in cross-validation. For example,
        the tuple (1, 4) will return any combination from
        1 up to 4 features instead of a fixed number of features k.
        New in 0.8.0: A string argument "best" or "parsimonious".
        If "best" is provided, the feature selector will return the
        feature subset with the best cross-validation performance.
        If "parsimonious" is provided as an argument, the smallest
        feature subset that is within one standard error of the
        cross-validation performance will be selected.
    forward : bool (default: True)
        Forward selection if True,
        backward selection otherwise.
    floating : bool (default: False)
        Adds a conditional exclusion/inclusion if True.
    verbose : int (default: 0), level of verbosity to use in logging.
        If 0, no output; if 1, the number of features in the current set;
        if 2, detailed logging including the timestamp and cv scores at
        each step.
    scoring : str, callable, or None (default: None)
        If None (default), uses 'accuracy' for sklearn classifiers
        and 'r2' for sklearn regressors.
        If str, uses a sklearn scoring metric string identifier, for example
        {accuracy, f1, precision, recall, roc_auc} for classifiers,
        {'mean_absolute_error', 'mean_squared_error'/'neg_mean_squared_error',
        'median_absolute_error', 'r2'} for regressors.
        If a callable object or function is provided, it has to conform to
        sklearn's signature ``scorer(estimator, X, y)``; see
        http://scikit-learn.org/stable/modules/generated/sklearn.metrics.make_scorer.html
        for more information.
    cv : int (default: 5)
        Integer or iterable yielding train, test splits. If cv is an integer
        and `estimator` is a classifier (or y consists of integer class
        labels), stratified k-fold is performed. Otherwise regular k-fold
        cross-validation is performed. No cross-validation if cv is None,
        False, or 0.
    n_jobs : int (default: 1)
        The number of CPUs to use for evaluating different feature subsets
        in parallel. -1 means 'all CPUs'.
    pre_dispatch : int, or string (default: '2*n_jobs')
        Controls the number of jobs that get dispatched
        during parallel execution if `n_jobs > 1` or `n_jobs=-1`.
        Reducing this number can be useful to avoid an explosion of
        memory consumption when more jobs get dispatched than CPUs can
        process. This parameter can be:
        None, in which case all the jobs are immediately created and spawned.
        Use this for lightweight and fast-running jobs,
        to avoid delays due to on-demand spawning of the jobs.
        An int, giving the exact number of total jobs that are spawned.
        A string, giving an expression as a function
        of n_jobs, as in `2*n_jobs`.
    clone_estimator : bool (default: True)
        Clones estimator if True; works with the original estimator instance
        if False. Set to False if the estimator doesn't
        implement scikit-learn's set_params and get_params methods.
        In addition, it is required to set cv=0 and n_jobs=1.
    fixed_features : tuple (default: None)
        If not `None`, the feature indices provided as a tuple will be
        regarded as fixed by the feature selector. For example, if
        `fixed_features=(1, 3, 7)`, the 2nd, 4th, and 8th feature are
        guaranteed to be present in the solution. Note that if
        `fixed_features` is not `None`, make sure that the number of
        features to be selected is greater than `len(fixed_features)`.
        In other words, ensure that `k_features > len(fixed_features)`.
        New in mlxtend v. 0.18.0.

    Attributes
    ----------
    k_feature_idx_ : array-like, shape = [n_predictions]
        Feature indices of the selected feature subsets.
    k_feature_names_ : array-like, shape = [n_predictions]
        Feature names of the selected feature subsets. If pandas
        DataFrames are used in the `fit` method, the feature
        names correspond to the column names. Otherwise, the
        feature names are string representations of the feature
        array indices. New in v 0.13.0.
    k_score_ : float
        Cross validation average score of the selected subset.
    subsets_ : dict
        A dictionary of selected feature subsets during the
        sequential selection, where the dictionary keys are
        the lengths k of these feature subsets. The dictionary
        values are dictionaries themselves with the following
        keys: 'feature_idx' (tuple of indices of the feature subset),
        'feature_names' (tuple of feature names of the feat. subset),
        'cv_scores' (list of individual cross-validation scores),
        'avg_score' (average cross-validation score).
        Note that if pandas
        DataFrames are used in the `fit` method, the 'feature_names'
        correspond to the column names. Otherwise, the
        feature names are string representations of the feature
        array indices. The 'feature_names' is new in v 0.13.0.

    Examples
    --------
    For usage examples, please see
    http://rasbt.github.io/mlxtend/user_guide/feature_selection/SequentialFeatureSelector/
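
    A minimal usage sketch (the estimator and data below, scikit-learn's
    KNeighborsClassifier on the iris dataset, are illustrative assumptions;
    any scikit-learn classifier or regressor works):

    >>> from sklearn.datasets import load_iris
    >>> from sklearn.neighbors import KNeighborsClassifier
    >>> X, y = load_iris(return_X_y=True)
    >>> knn = KNeighborsClassifier(n_neighbors=3)
    >>> sfs = SequentialFeatureSelector(knn, k_features=2,
    ...                                 forward=True, cv=3)
    >>> sfs = sfs.fit(X, y)
    >>> sfs.k_feature_idx_  # indices of the selected features  # doctest: +SKIP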

    """

    def __init__(self, estimator, k_features=1,
                 forward=True, floating=False,
                 verbose=0, scoring=None,
                 cv=5, n_jobs=1,
                 pre_dispatch='2*n_jobs',
                 clone_estimator=True,
                 fixed_features=None):

        self.estimator = estimator
        self.k_features = k_features
        self.forward = forward
        self.floating = floating
        self.pre_dispatch = pre_dispatch
        # Raise a meaningful error message if a
        # cross-validation generator is passed in.
        if isinstance(cv, types.GeneratorType):
            err_msg = ('Input cv is a generator object, which is not '
                       'supported. Instead please input an iterable yielding '
                       'train, test splits. This can usually be done by '
                       'passing a cross-validation generator to the '
                       'built-in list function. I.e. cv=list(<cv-generator>)')
            raise TypeError(err_msg)
        self.cv = cv
        self.n_jobs = n_jobs
        self.verbose = verbose
        self.clone_estimator = clone_estimator

        if fixed_features is not None:
            if isinstance(self.k_features, int) and \
                    self.k_features <= len(fixed_features):
                raise ValueError('Number of features to be selected must'
                                 ' be larger than the number of'
                                 ' features specified via `fixed_features`.'
                                 ' Got `k_features=%d` and'
                                 ' `len(fixed_features)=%d`' %
                                 (k_features, len(fixed_features)))

            elif isinstance(self.k_features, tuple) and \
                    self.k_features[0] <= len(fixed_features):
                raise ValueError('The minimum number of features to'
                                 ' be selected must'
                                 ' be larger than the number of'
                                 ' features specified via `fixed_features`.'
                                 ' Got `k_features=%s` and '
                                 '`len(fixed_features)=%d`' %
                                 (k_features, len(fixed_features)))

        self.fixed_features = fixed_features

        if self.clone_estimator:
            self.est_ = clone(self.estimator)
        else:
            self.est_ = self.estimator
        self.scoring = scoring

        if scoring is None:
            if self.est_._estimator_type == 'classifier':
                scoring = 'accuracy'
            elif self.est_._estimator_type == 'regressor':
                scoring = 'r2'
            else:
                raise AttributeError('Estimator must '
                                     'be a Classifier or Regressor.')
        if isinstance(scoring, str):
            self.scorer = get_scorer(scoring)
        else:
            self.scorer = scoring

        self.fitted = False
        self.subsets_ = {}
        self.interrupted_ = False

        # don't mess with this unless testing
        self._TESTING_INTERRUPT_MODE = False

    @property
    def named_estimators(self):
        """
        Returns
        -------
        List of named estimator tuples, like [('svc', SVC(...))]
        """
        return _name_estimators([self.estimator])

    def get_params(self, deep=True):
        """Return estimator parameter names for GridSearch support."""
        return self._get_params('named_estimators', deep=deep)

    def set_params(self, **params):
        """Set the parameters of this estimator.
        Valid parameter keys can be listed with ``get_params()``.

        Returns
        -------
        self
        """
        self._set_params('estimator', 'named_estimators', **params)
        return self

    def fit(self, X, y, custom_feature_names=None, groups=None, **fit_params):
        """Perform feature selection and learn model from training data.

        Parameters
        ----------
        X : {array-like, sparse matrix}, shape = [n_samples, n_features]
            Training vectors, where n_samples is the number of samples and
            n_features is the number of features.
            New in v 0.13.0: pandas DataFrames are now also accepted as
            argument for X.
        y : array-like, shape = [n_samples]
            Target values.
            New in v 0.13.0: pandas DataFrames are now also accepted as
            argument for y.
        custom_feature_names : None or tuple (default: None)
            Custom feature names for `self.k_feature_names` and
            `self.subsets_[i]['feature_names']`.
            (new in v 0.13.0)
        groups : array-like, with shape (n_samples,), optional
            Group labels for the samples used while splitting the dataset into
            train/test set. Passed to the fit method of the cross-validator.
        fit_params : various, optional
            Additional parameters that are being passed to the estimator.
            For example, `sample_weights=weights`.

        Returns
        -------
        self : object

        """

        # reset from a potential previous fit run
        self.subsets_ = {}
        self.fitted = False
        self.interrupted_ = False
        self.k_feature_idx_ = None
        self.k_feature_names_ = None
        self.k_score_ = None

        self.fixed_features_ = self.fixed_features
        self.fixed_features_set_ = set()

        if hasattr(X, 'loc'):
            X_ = X.values
            if self.fixed_features is not None:
                # Translate column labels into positional indices.
                self.fixed_features_ = tuple(X.columns.get_loc(c)
                                             if isinstance(c, str) else c
                                             for c in self.fixed_features
                                             )
        else:
            X_ = X

        if self.fixed_features is not None:
            self.fixed_features_set_ = set(self.fixed_features_)

        if (custom_feature_names is not None
                and len(custom_feature_names) != X.shape[1]):
            raise ValueError('If custom_feature_names is not None, '
                             'the number of elements in custom_feature_names '
                             'must equal the number of columns in X.')

        if not isinstance(self.k_features, (int, tuple, str)):
            raise AttributeError('k_features must be a positive integer'
                                 ', tuple, or string')

        if (isinstance(self.k_features, int) and (
                self.k_features < 1 or self.k_features > X_.shape[1])):
            raise AttributeError('k_features must be a positive integer'
                                 ' between 1 and X.shape[1], got %s'
                                 % (self.k_features, ))

        if isinstance(self.k_features, tuple):
            if len(self.k_features) != 2:
                raise AttributeError('k_features tuple must consist of 2'
                                     ' elements, a min and a max value.')

            if self.k_features[0] not in range(1, X_.shape[1] + 1):
                raise AttributeError('k_features tuple min value must be in'
                                     ' range(1, X.shape[1]+1).')

            if self.k_features[1] not in range(1, X_.shape[1] + 1):
                raise AttributeError('k_features tuple max value must be in'
                                     ' range(1, X.shape[1]+1).')

            if self.k_features[0] > self.k_features[1]:
                raise AttributeError('The min k_features value must be'
                                     ' smaller than the max k_features value.')

        if isinstance(self.k_features, tuple) or \
                isinstance(self.k_features, str):

            select_in_range = True

            if isinstance(self.k_features, str):
                if self.k_features not in {'best', 'parsimonious'}:
                    raise AttributeError('If a string argument is provided, '
                                         'it must be "best" or "parsimonious"')
                else:
                    min_k = 1
                    max_k = X_.shape[1]
            else:
                min_k = self.k_features[0]
                max_k = self.k_features[1]

        else:
            select_in_range = False
            k_to_select = self.k_features

        orig_set = set(range(X_.shape[1]))
        n_features = X_.shape[1]

        if self.forward and self.fixed_features is not None:
            # Fixed features are always part of the subset, so only the
            # remaining features are candidates for inclusion.
            orig_set = set(range(X_.shape[1])) - self.fixed_features_set_
            n_features = len(orig_set)

        if self.forward:
            if select_in_range:
                k_to_select = max_k

            if self.fixed_features is not None:
                k_idx = self.fixed_features_
                k = len(k_idx)
                k_idx, k_score = _calc_score(self, X_[:, k_idx], y, k_idx,
                                             groups=groups, **fit_params)
                self.subsets_[k] = {
                    'feature_idx': k_idx,
                    'cv_scores': k_score,
                    'avg_score': np.nanmean(k_score)
                }

            else:
                k_idx = ()
                k = 0
        else:
            if select_in_range:
                k_to_select = min_k
            k_idx = tuple(orig_set)
            k = len(k_idx)
            k_idx, k_score = _calc_score(self, X_[:, k_idx], y, k_idx,
                                         groups=groups, **fit_params)
            self.subsets_[k] = {
                'feature_idx': k_idx,
                'cv_scores': k_score,
                'avg_score': np.nanmean(k_score)
            }
        best_subset = None
        k_score = 0
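
        # Main stepwise search: each iteration adds (forward) or removes
        # (backward) the single feature that yields the best-scoring
        # candidate subset, until k_to_select features are reached.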
        try:
            while k != k_to_select:
                prev_subset = set(k_idx)

                if self.forward:
                    k_idx, k_score, cv_scores = self._inclusion(
                        orig_set=orig_set,
                        subset=prev_subset,
                        X=X_,
                        y=y,
                        groups=groups,
                        **fit_params
                    )
                else:
                    k_idx, k_score, cv_scores = self._exclusion(
                        feature_set=prev_subset,
                        X=X_,
                        y=y,
                        groups=groups,
                        fixed_feature=self.fixed_features_set_,
                        **fit_params
                    )
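
                # Floating step (SFFS/SBFS): after an inclusion (exclusion),
                # conditionally exclude (include) features again as long as
                # the resulting subset beats the best score recorded so far
                # for that subset size.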
                if self.floating:

                    if self.forward:
                        continuation_cond_1 = len(k_idx)
                    else:
                        continuation_cond_1 = n_features - len(k_idx)

                    continuation_cond_2 = True
                    ran_step_1 = True
                    new_feature = None

                    while continuation_cond_1 >= 2 and continuation_cond_2:
                        k_score_c = None

                        if ran_step_1:
                            (new_feature,) = set(k_idx) ^ prev_subset

                        if self.forward:

                            fixed_features_ok = True
                            if self.fixed_features is not None and \
                                    len(self.fixed_features) - len(k_idx) <= 1:
                                fixed_features_ok = False
                            if fixed_features_ok:
                                k_idx_c, k_score_c, cv_scores_c = \
                                    self._exclusion(
                                        feature_set=k_idx,
                                        fixed_feature=(
                                            {new_feature} |
                                            self.fixed_features_set_),
                                        X=X_,
                                        y=y,
                                        groups=groups,
                                        **fit_params
                                    )

                        else:
                            k_idx_c, k_score_c, cv_scores_c = self._inclusion(
                                orig_set=orig_set - {new_feature},
                                subset=set(k_idx),
                                X=X_,
                                y=y,
                                groups=groups,
                                **fit_params
                            )

                        if k_score_c is not None and k_score_c > k_score:

                            if len(k_idx_c) in self.subsets_:
                                cached_score = self.subsets_[len(
                                    k_idx_c)]['avg_score']
                            else:
                                cached_score = None

                            if cached_score is None or \
                                    k_score_c > cached_score:
                                prev_subset = set(k_idx)
                                k_idx, k_score, cv_scores = \
                                    k_idx_c, k_score_c, cv_scores_c
                                continuation_cond_1 = len(k_idx)
                                ran_step_1 = False

                            else:
                                continuation_cond_2 = False

                        else:
                            continuation_cond_2 = False

                k = len(k_idx)
                # floating can lead to multiple same-sized subsets;
                # only keep the best subset for each size k
                if k not in self.subsets_ or (k_score >
                                              self.subsets_[k]['avg_score']):

                    k_idx = tuple(sorted(k_idx))
                    self.subsets_[k] = {
                        'feature_idx': k_idx,
                        'cv_scores': cv_scores,
                        'avg_score': k_score
                    }

                if self.verbose == 1:
                    sys.stderr.write('\rFeatures: %d/%s' % (
                        len(k_idx),
                        k_to_select
                    ))
                    sys.stderr.flush()
                elif self.verbose > 1:
                    sys.stderr.write('\n[%s] Features: %d/%s -- score: %s' % (
                        datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
                        len(k_idx),
                        k_to_select,
                        k_score
                    ))

                if self._TESTING_INTERRUPT_MODE:
                    self.subsets_, self.k_feature_names_ = \
                        _get_featurenames(self.subsets_,
                                          self.k_feature_idx_,
                                          custom_feature_names,
                                          X)
                    raise KeyboardInterrupt

        except KeyboardInterrupt:
            self.interrupted_ = True
            sys.stderr.write('\nSTOPPING EARLY DUE TO KEYBOARD INTERRUPT...')

        if select_in_range:
            # Pick the best-scoring subset within [min_k, max_k].
            max_score = float('-inf')
            for k in self.subsets_:
                if k < min_k or k > max_k:
                    continue
                if self.subsets_[k]['avg_score'] > max_score:
                    max_score = self.subsets_[k]['avg_score']
                    best_subset = k
            k_score = max_score
            k_idx = self.subsets_[best_subset]['feature_idx']

            if self.k_features == 'parsimonious':
                # Prefer the smallest subset whose score lies within one
                # standard error (std / sqrt(n)) of the best score.
                for k in self.subsets_:
                    if k >= best_subset:
                        continue
                    if self.subsets_[k]['avg_score'] >= (
                            max_score -
                            np.std(self.subsets_[k]['cv_scores']) /
                            np.sqrt(self.subsets_[k]['cv_scores'].shape[0])):
                        max_score = self.subsets_[k]['avg_score']
                        best_subset = k
                k_score = max_score
                k_idx = self.subsets_[best_subset]['feature_idx']

        self.k_feature_idx_ = k_idx
        self.k_score_ = k_score
        self.fitted = True
        self.subsets_, self.k_feature_names_ = \
            _get_featurenames(self.subsets_,
                              self.k_feature_idx_,
                              custom_feature_names,
                              X)
        return self

    def _inclusion(self, orig_set, subset, X, y, ignore_feature=None,
                   groups=None, **fit_params):
        """Evaluate adding each remaining feature; return the best candidate."""
        all_avg_scores = []
        all_cv_scores = []
        all_subsets = []
        res = (None, None, None)
        remaining = orig_set - subset
        if remaining:
            features = len(remaining)
            n_jobs = min(self.n_jobs, features)
            parallel = Parallel(n_jobs=n_jobs, verbose=self.verbose,
                                pre_dispatch=self.pre_dispatch)
            work = parallel(delayed(_calc_score)
                            (self, X[:, tuple(subset | {feature})], y,
                             tuple(subset | {feature}),
                             groups=groups, **fit_params)
                            for feature in remaining
                            if feature != ignore_feature)

            for new_subset, cv_scores in work:
                all_avg_scores.append(np.nanmean(cv_scores))
                all_cv_scores.append(cv_scores)
                all_subsets.append(new_subset)

            best = np.argmax(all_avg_scores)
            res = (all_subsets[best],
                   all_avg_scores[best],
                   all_cv_scores[best])
        return res

    def _exclusion(self, feature_set, X, y, fixed_feature=None,
                   groups=None, **fit_params):
        """Evaluate removing each removable feature; return the best candidate."""
        n = len(feature_set)
        res = (None, None, None)
        if n > 1:
            all_avg_scores = []
            all_cv_scores = []
            all_subsets = []
            features = n
            n_jobs = min(self.n_jobs, features)
            parallel = Parallel(n_jobs=n_jobs, verbose=self.verbose,
                                pre_dispatch=self.pre_dispatch)
            work = parallel(delayed(_calc_score)(self, X[:, p], y, p,
                                                 groups=groups, **fit_params)
                            for p in combinations(feature_set, r=n - 1)
                            if not fixed_feature or
                            fixed_feature.issubset(set(p)))

            for p, cv_scores in work:
                all_avg_scores.append(np.nanmean(cv_scores))
                all_cv_scores.append(cv_scores)
                all_subsets.append(p)

            best = np.argmax(all_avg_scores)
            res = (all_subsets[best],
                   all_avg_scores[best],
                   all_cv_scores[best])
        return res

    def transform(self, X):
        """Reduce X to its most important features.

        Parameters
        ----------
        X : {array-like, sparse matrix}, shape = [n_samples, n_features]
            Training vectors, where n_samples is the number of samples and
            n_features is the number of features.
            New in v 0.13.0: pandas DataFrames are now also accepted as
            argument for X.

        Returns
        -------
        Reduced feature subset of X, shape={n_samples, k_features}
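
        A short sketch, continuing the hypothetical fitted `sfs` from the
        class docstring example (keeps only the selected columns):

        >>> X_selected = sfs.transform(X)  # doctest: +SKIP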

        """
        self._check_fitted()
        if hasattr(X, 'loc'):
            X_ = X.values
        else:
            X_ = X
        return X_[:, self.k_feature_idx_]

    def fit_transform(self, X, y, groups=None, **fit_params):
        """Fit to training data then reduce X to its most important features.

        Parameters
        ----------
        X : {array-like, sparse matrix}, shape = [n_samples, n_features]
            Training vectors, where n_samples is the number of samples and
            n_features is the number of features.
            New in v 0.13.0: pandas DataFrames are now also accepted as
            argument for X.
        y : array-like, shape = [n_samples]
            Target values.
            New in v 0.13.0: a pandas Series is now also accepted as
            argument for y.
        groups : array-like, with shape (n_samples,), optional
            Group labels for the samples used while splitting the dataset into
            train/test set. Passed to the fit method of the cross-validator.
        fit_params : various, optional
            Additional parameters that are being passed to the estimator.
            For example, `sample_weights=weights`.

        Returns
        -------
        Reduced feature subset of X, shape={n_samples, k_features}

        """
        self.fit(X, y, groups=groups, **fit_params)
        return self.transform(X)

    def get_metric_dict(self, confidence_interval=0.95):
        """Return metric dictionary

        Parameters
        ----------
        confidence_interval : float (default: 0.95)
            A positive float between 0.0 and 1.0 to compute the confidence
            interval bounds of the CV score averages.

        Returns
        -------
        Dictionary with items where each dictionary value is a list
        with the number of iterations (number of feature subsets) as
        its length. The dictionary keys corresponding to these lists
        are as follows:
        'feature_idx': tuple of the indices of the feature subset
        'cv_scores': list with individual CV scores
        'avg_score': average of the CV scores
        'std_dev': standard deviation of the CV score average
        'std_err': standard error of the CV score average
        'ci_bound': confidence interval bound of the CV score average
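
        A short sketch of one common use, assuming a fitted selector `sfs`
        and pandas imported as `pd` (both hypothetical here); this yields
        one row per subset size:

        >>> metrics = sfs.get_metric_dict()  # doctest: +SKIP
        >>> df = pd.DataFrame.from_dict(metrics).T  # doctest: +SKIP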

        """
        self._check_fitted()
        fdict = deepcopy(self.subsets_)
        for k in fdict:
            std_dev = np.std(self.subsets_[k]['cv_scores'])
            bound, std_err = self._calc_confidence(
                self.subsets_[k]['cv_scores'],
                confidence=confidence_interval)
            fdict[k]['ci_bound'] = bound
            fdict[k]['std_dev'] = std_dev
            fdict[k]['std_err'] = std_err
        return fdict

    def _calc_confidence(self, ary, confidence=0.95):
        # Two-sided t-based confidence bound around the mean CV score:
        # half-width = SE * t_{(1+confidence)/2} with n-1 degrees of freedom.
        std_err = scipy.stats.sem(ary)
        bound = std_err * scipy.stats.t.ppf((1 + confidence) / 2.0,
                                            len(ary) - 1)
        return bound, std_err

    def _check_fitted(self):
        if not self.fitted:
            raise AttributeError('SequentialFeatureSelector has not been'
                                 ' fitted, yet.')