# Sebastian Raschka 2014-2020
# mlxtend Machine Learning Library Extensions
#
# Algorithm for sequential feature selection.
# Author: Sebastian Raschka <sebastianraschka.com>
#
# License: BSD 3 clause
import datetime
import types
import numpy as np
import scipy as sp
import scipy.stats
import sys
from copy import deepcopy
from itertools import combinations
from sklearn.metrics import get_scorer
from sklearn.base import clone
from sklearn.base import MetaEstimatorMixin
from ..externals.name_estimators import _name_estimators
from ..utils.base_compostion import _BaseXComposition
from sklearn.model_selection import cross_val_score
from joblib import Parallel, delayed
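# Helper used by SequentialFeatureSelector: scores one candidate feature
# subset. If `selector.cv` is set, the estimator is evaluated with
# cross_val_score; otherwise it is fit once on the data and scored with the
# selector's scorer. The indices are returned together with the score array so
# that results can be matched up after parallel execution.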
def _calc_score(selector, X, y, indices, groups=None, **fit_params):
if selector.cv:
scores = cross_val_score(selector.est_,
X, y,
groups=groups,
cv=selector.cv,
scoring=selector.scorer,
n_jobs=1,
pre_dispatch=selector.pre_dispatch,
fit_params=fit_params)
else:
selector.est_.fit(X, y, **fit_params)
scores = np.array([selector.scorer(selector.est_, X, y)])
return indices, scores
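# Helper that maps the integer feature indices stored in the subsets dict to
# human-readable names: custom names if provided, DataFrame column names if X
# is a pandas object, and plain string indices otherwise.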
def _get_featurenames(subsets_dict, feature_idx, custom_feature_names, X):
feature_names = None
if feature_idx is not None:
if custom_feature_names is not None:
feature_names = tuple((custom_feature_names[i]
for i in feature_idx))
elif hasattr(X, 'loc'):
feature_names = tuple((X.columns[i] for i in feature_idx))
else:
feature_names = tuple(str(i) for i in feature_idx)
subsets_dict_ = deepcopy(subsets_dict)
for key in subsets_dict_:
if custom_feature_names is not None:
new_tuple = tuple((custom_feature_names[i]
for i in subsets_dict[key]['feature_idx']))
elif hasattr(X, 'loc'):
new_tuple = tuple((X.columns[i]
for i in subsets_dict[key]['feature_idx']))
else:
new_tuple = tuple(str(i) for i in subsets_dict[key]['feature_idx'])
subsets_dict_[key]['feature_names'] = new_tuple
return subsets_dict_, feature_names
class SequentialFeatureSelector(_BaseXComposition, MetaEstimatorMixin):
"""Sequential Feature Selection for Classification and Regression.
Parameters
----------
estimator : scikit-learn classifier or regressor
k_features : int or tuple or str (default: 1)
Number of features to select,
where k_features < the full feature set.
New in 0.4.2: A tuple containing a min and max value can be provided,
and the SFS will return the feature combination between
min and max that scored highest in cross-validation. For example,
the tuple (1, 4) will return any combination from
1 up to 4 features instead of a fixed number of features k.
New in 0.8.0: A string argument "best" or "parsimonious".
If "best" is provided, the feature selector will return the
feature subset with the best cross-validation performance.
If "parsimonious" is provided as an argument, the smallest
feature subset that is within one standard error of the
cross-validation performance will be selected.
forward : bool (default: True)
Forward selection if True,
backward selection otherwise
floating : bool (default: False)
Adds a conditional exclusion/inclusion if True.
verbose : int (default: 0), level of verbosity to use in logging.
If 0, no output;
if 1, the number of features in the current set; if 2, detailed logging
including timestamp and cv scores at each step.
scoring : str, callable, or None (default: None)
If None (default), uses 'accuracy' for sklearn classifiers
and 'r2' for sklearn regressors.
If str, uses a sklearn scoring metric string identifier, for example
{accuracy, f1, precision, recall, roc_auc} for classifiers,
{'mean_absolute_error', 'mean_squared_error'/'neg_mean_squared_error',
'median_absolute_error', 'r2'} for regressors.
If a callable object or function is provided, it has to conform to
sklearn's signature ``scorer(estimator, X, y)``; see
http://scikit-learn.org/stable/modules/generated/sklearn.metrics.make_scorer.html
for more information.
cv : int (default: 5)
Integer or iterable yielding train, test splits. If cv is an integer
and `estimator` is a classifier (or y consists of integer class
labels), stratified k-fold cross-validation is performed. Otherwise
regular k-fold cross-validation is used. No cross-validation if cv is
None, False, or 0.
n_jobs : int (default: 1)
The number of CPUs to use for evaluating different feature subsets
in parallel. -1 means 'all CPUs'.
pre_dispatch : int, or string (default: '2*n_jobs')
Controls the number of jobs that get dispatched
during parallel execution if `n_jobs > 1` or `n_jobs=-1`.
Reducing this number can be useful to avoid an explosion of
memory consumption when more jobs get dispatched than CPUs can process.
This parameter can be:
None, in which case all the jobs are immediately created and spawned.
Use this for lightweight and fast-running jobs,
to avoid delays due to on-demand spawning of the jobs
An int, giving the exact number of total jobs that are spawned
A string, giving an expression as a function
of n_jobs, as in `2*n_jobs`
clone_estimator : bool (default: True)
Clones estimator if True; works with the original estimator instance
if False. Set to False if the estimator doesn't
implement scikit-learn's set_params and get_params methods.
In addition, it is required to set cv=0, and n_jobs=1.
fixed_features : tuple (default: None)
If not `None`, the feature indices provided as a tuple will be
regarded as fixed by the feature selector. For example, if
`fixed_features=(1, 3, 7)`, the 2nd, 4th, and 8th feature are
guaranteed to be present in the solution. Note that if
`fixed_features` is not `None`, make sure that the number of
features to be selected is greater than `len(fixed_features)`.
In other words, ensure that `k_features > len(fixed_features)`.
New in mlxtend v. 0.18.0.
Attributes
----------
k_feature_idx_ : array-like, shape = [n_predictions]
Feature indices of the selected feature subset.
k_feature_names_ : array-like, shape = [n_predictions]
Feature names of the selected feature subsets. If pandas
DataFrames are used in the `fit` method, the feature
names correspond to the column names. Otherwise, the
feature names are string representations of the feature
array indices. New in v 0.13.0.
k_score_ : float
Cross validation average score of the selected subset.
subsets_ : dict
A dictionary of selected feature subsets during the
sequential selection, where the dictionary keys are
the lengths k of these feature subsets. The dictionary
values are dictionaries themselves with the following
keys: 'feature_idx' (tuple of indices of the feature subset)
'feature_names' (tuple of feature names of the feat. subset)
'cv_scores' (list of individual cross-validation scores)
'avg_score' (average cross-validation score)
Note that if pandas
DataFrames are used in the `fit` method, the 'feature_names'
correspond to the column names. Otherwise, the
feature names are string representations of the feature
array indices. The 'feature_names' is new in v 0.13.0.
Examples
--------
For usage examples, please see
http://rasbt.github.io/mlxtend/user_guide/feature_selection/SequentialFeatureSelector/
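A minimal illustrative sketch (the estimator, settings, and the selected
indices shown below are placeholders; actual results depend on your data):

>>> from sklearn.datasets import load_iris
>>> from sklearn.neighbors import KNeighborsClassifier
>>> X, y = load_iris(return_X_y=True)
>>> knn = KNeighborsClassifier(n_neighbors=3)
>>> sfs = SequentialFeatureSelector(knn, k_features=3,
...                                 forward=True, floating=False,
...                                 scoring='accuracy', cv=5)
>>> sfs = sfs.fit(X, y)
>>> sfs.k_feature_idx_  # doctest: +SKIP
(0, 2, 3)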
"""
def __init__(self, estimator, k_features=1,
forward=True, floating=False,
verbose=0, scoring=None,
cv=5, n_jobs=1,
pre_dispatch='2*n_jobs',
clone_estimator=True,
fixed_features=None):
self.estimator = estimator
self.k_features = k_features
self.forward = forward
self.floating = floating
self.pre_dispatch = pre_dispatch
# Want to raise meaningful error message if a
# cross-validation generator is inputted
if isinstance(cv, types.GeneratorType):
err_msg = ('Input cv is a generator object, which is not '
'supported. Instead please input an iterable yielding '
'train, test splits. This can usually be done by '
'passing a cross-validation generator to the '
'built-in list function. I.e. cv=list(<cv-generator>)')
raise TypeError(err_msg)
self.cv = cv
self.n_jobs = n_jobs
self.verbose = verbose
self.clone_estimator = clone_estimator
if fixed_features is not None:
if isinstance(self.k_features, int) and \
self.k_features <= len(fixed_features):
raise ValueError('Number of features to be selected must'
' be larger than the number of'
' features specified via `fixed_features`.'
' Got `k_features=%d` and'
' `fixed_features=%d`' %
(k_features, len(fixed_features)))
elif isinstance(self.k_features, tuple) and \
self.k_features[0] <= len(fixed_features):
raise ValueError('The minimum number of features to'
' be selected must'
' be larger than the number of'
' features specified via `fixed_features`.'
' Got `k_features=%s` and '
'`len(fixed_features)=%d`' %
(k_features, len(fixed_features)))
self.fixed_features = fixed_features
if self.clone_estimator:
self.est_ = clone(self.estimator)
else:
self.est_ = self.estimator
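# Resolve the scoring argument below: default to 'accuracy' for classifiers
# and 'r2' for regressors, look up string metric names via sklearn's
# get_scorer, and use callable scorers as given.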
self.scoring = scoring
if scoring is None:
if self.est_._estimator_type == 'classifier':
scoring = 'accuracy'
elif self.est_._estimator_type == 'regressor':
scoring = 'r2'
else:
raise AttributeError('Estimator must '
'be a Classifier or Regressor.')
if isinstance(scoring, str):
self.scorer = get_scorer(scoring)
else:
self.scorer = scoring
self.fitted = False
self.subsets_ = {}
self.interrupted_ = False
# don't mess with this unless testing
self._TESTING_INTERRUPT_MODE = False
@property
def named_estimators(self):
"""
Returns
-------
List of named estimator tuples, like [('svc', SVC(...))]
"""
return _name_estimators([self.estimator])
def get_params(self, deep=True):
#
# Return estimator parameter names for GridSearch support.
#
return self._get_params('named_estimators', deep=deep)
def set_params(self, **params):
"""Set the parameters of this estimator.
Valid parameter keys can be listed with ``get_params()``.
Returns
-------
self
"""
self._set_params('estimator', 'named_estimators', **params)
return self
def fit(self, X, y, custom_feature_names=None, groups=None, **fit_params):
"""Perform feature selection and learn model from training data.
Parameters
----------
X : {array-like, sparse matrix}, shape = [n_samples, n_features]
Training vectors, where n_samples is the number of samples and
n_features is the number of features.
New in v 0.13.0: pandas DataFrames are now also accepted as
argument for X.
y : array-like, shape = [n_samples]
Target values.
New in v 0.13.0: pandas DataFrames are now also accepted as
argument for y.
custom_feature_names : None or tuple (default: None)
Custom feature names for `self.k_feature_names` and
`self.subsets_[i]['feature_names']`.
(new in v 0.13.0)
groups : array-like, with shape (n_samples,), optional
Group labels for the samples used while splitting the dataset into
train/test set. Passed to the fit method of the cross-validator.
fit_params : various, optional
Additional parameters that are being passed to the estimator.
For example, `sample_weight=weights`.
Returns
-------
self : object
"""
# reset from a potential previous fit run
self.subsets_ = {}
self.fitted = False
self.interrupted_ = False
self.k_feature_idx_ = None
self.k_feature_names_ = None
self.k_score_ = None
self.fixed_features_ = self.fixed_features
self.fixed_features_set_ = set()
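# If X is a pandas DataFrame, work on the underlying values and translate
# fixed features given as column names into positional indices.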
if hasattr(X, 'loc'):
X_ = X.values
if self.fixed_features is not None:
self.fixed_features_ = tuple(X.columns.get_loc(c)
if isinstance(c, str) else c
for c in self.fixed_features
)
else:
X_ = X
if self.fixed_features is not None:
self.fixed_features_set_ = set(self.fixed_features_)
if (custom_feature_names is not None
and len(custom_feature_names) != X.shape[1]):
raise ValueError('If custom_feature_names is not None, '
'the number of elements in custom_feature_names '
'must equal the number of columns in X.')
if not isinstance(self.k_features, int) and\
not isinstance(self.k_features, tuple)\
and not isinstance(self.k_features, str):
raise AttributeError('k_features must be a positive integer'
', tuple, or string')
if (isinstance(self.k_features, int) and (
self.k_features < 1 or self.k_features > X_.shape[1])):
raise AttributeError('k_features must be a positive integer'
' between 1 and X.shape[1], got %s'
% (self.k_features, ))
if isinstance(self.k_features, tuple):
if len(self.k_features) != 2:
raise AttributeError('k_features tuple must consist of 2'
' elements, a min and a max value.')
if self.k_features[0] not in range(1, X_.shape[1] + 1):
raise AttributeError('k_features tuple min value must be in'
' range(1, X.shape[1]+1).')
if self.k_features[1] not in range(1, X_.shape[1] + 1):
raise AttributeError('k_features tuple max value must be in'
' range(1, X.shape[1]+1).')
if self.k_features[0] > self.k_features[1]:
raise AttributeError('The min k_features value must be smaller'
' than the max k_features value.')
if isinstance(self.k_features, tuple) or\
isinstance(self.k_features, str):
select_in_range = True
if isinstance(self.k_features, str):
if self.k_features not in {'best', 'parsimonious'}:
raise AttributeError('If a string argument is provided, '
'it must be "best" or "parsimonious"')
else:
min_k = 1
max_k = X_.shape[1]
else:
min_k = self.k_features[0]
max_k = self.k_features[1]
else:
select_in_range = False
k_to_select = self.k_features
orig_set = set(range(X_.shape[1]))
n_features = X_.shape[1]
if self.forward and self.fixed_features is not None:
orig_set = set(range(X_.shape[1])) - self.fixed_features_set_
n_features = len(orig_set)
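# Initialize the starting subset: forward selection starts from the fixed
# features (or an empty set), backward elimination starts from the full
# feature set; non-empty starting subsets are scored and recorded in
# self.subsets_.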
if self.forward:
if select_in_range:
k_to_select = max_k
if self.fixed_features is not None:
k_idx = self.fixed_features_
k = len(k_idx)
k_idx, k_score = _calc_score(self, X_[:, k_idx], y, k_idx,
groups=groups, **fit_params)
self.subsets_[k] = {
'feature_idx': k_idx,
'cv_scores': k_score,
'avg_score': np.nanmean(k_score)
}
else:
k_idx = ()
k = 0
else:
if select_in_range:
k_to_select = min_k
k_idx = tuple(orig_set)
k = len(k_idx)
k_idx, k_score = _calc_score(self, X_[:, k_idx], y, k_idx,
groups=groups, **fit_params)
self.subsets_[k] = {
'feature_idx': k_idx,
'cv_scores': k_score,
'avg_score': np.nanmean(k_score)
}
best_subset = None
k_score = 0
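# Main search loop: each pass adds (forward) or removes (backward) the single
# feature whose inclusion/exclusion yields the best average CV score,
# optionally followed by a floating step. A KeyboardInterrupt stops the search
# early but keeps the results collected so far.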
try:
while k != k_to_select:
prev_subset = set(k_idx)
if self.forward:
k_idx, k_score, cv_scores = self._inclusion(
orig_set=orig_set,
subset=prev_subset,
X=X_,
y=y,
groups=groups,
**fit_params
)
else:
k_idx, k_score, cv_scores = self._exclusion(
feature_set=prev_subset,
X=X_,
y=y,
groups=groups,
fixed_feature=self.fixed_features_set_,
**fit_params
)
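# Floating step (SFFS/SFBS): conditionally reverse earlier decisions, i.e. try
# removing a previously added feature (forward) or re-adding a previously
# removed one (backward), and keep the change only if it beats the best score
# already recorded for that subset size.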
if self.floating:
if self.forward:
continuation_cond_1 = len(k_idx)
else:
continuation_cond_1 = n_features - len(k_idx)
continuation_cond_2 = True
ran_step_1 = True
new_feature = None
while continuation_cond_1 >= 2 and continuation_cond_2:
k_score_c = None
if ran_step_1:
(new_feature,) = set(k_idx) ^ prev_subset
if self.forward:
fixed_features_ok = True
if self.fixed_features is not None and \
len(self.fixed_features) - len(k_idx) <= 1:
fixed_features_ok = False
if fixed_features_ok:
k_idx_c, k_score_c, cv_scores_c = \
self._exclusion(
feature_set=k_idx,
fixed_feature=(
{new_feature} |
self.fixed_features_set_),
X=X_,
y=y,
groups=groups,
**fit_params
)
else:
k_idx_c, k_score_c, cv_scores_c = self._inclusion(
orig_set=orig_set - {new_feature},
subset=set(k_idx),
X=X_,
y=y,
groups=groups,
**fit_params
)
if k_score_c is not None and k_score_c > k_score:
if len(k_idx_c) in self.subsets_:
cached_score = self.subsets_[len(
k_idx_c)]['avg_score']
else:
cached_score = None
if cached_score is None or \
k_score_c > cached_score:
prev_subset = set(k_idx)
k_idx, k_score, cv_scores = \
k_idx_c, k_score_c, cv_scores_c
continuation_cond_1 = len(k_idx)
ran_step_1 = False
else:
continuation_cond_2 = False
else:
continuation_cond_2 = False
k = len(k_idx)
# floating can lead to multiple same-sized subsets
if k not in self.subsets_ or (k_score >
self.subsets_[k]['avg_score']):
k_idx = tuple(sorted(k_idx))
self.subsets_[k] = {
'feature_idx': k_idx,
'cv_scores': cv_scores,
'avg_score': k_score
}
if self.verbose == 1:
sys.stderr.write('\rFeatures: %d/%s' % (
len(k_idx),
k_to_select
))
sys.stderr.flush()
elif self.verbose > 1:
sys.stderr.write('\n[%s] Features: %d/%s -- score: %s' % (
datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
len(k_idx),
k_to_select,
k_score
))
if self._TESTING_INTERRUPT_MODE:
self.subsets_, self.k_feature_names_ = \
_get_featurenames(self.subsets_,
self.k_feature_idx_,
custom_feature_names,
X)
raise KeyboardInterrupt
except KeyboardInterrupt:
self.interrupted_ = True
sys.stderr.write('\nSTOPPING EARLY DUE TO KEYBOARD INTERRUPT...')
if select_in_range:
max_score = float('-inf')
for k in self.subsets_:
if k < min_k or k > max_k:
continue
if self.subsets_[k]['avg_score'] > max_score:
max_score = self.subsets_[k]['avg_score']
best_subset = k
k_score = max_score
k_idx = self.subsets_[best_subset]['feature_idx']
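# 'parsimonious': move to the smallest subset whose average score is still
# within the standard-error margin (as computed below) of the best score.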
if self.k_features == 'parsimonious':
for k in self.subsets_:
if k >= best_subset:
continue
if self.subsets_[k]['avg_score'] >= (
max_score - np.std(self.subsets_[k]['cv_scores']) /
self.subsets_[k]['cv_scores'].shape[0]):
max_score = self.subsets_[k]['avg_score']
best_subset = k
k_score = max_score
k_idx = self.subsets_[best_subset]['feature_idx']
self.k_feature_idx_ = k_idx
self.k_score_ = k_score
self.fitted = True
self.subsets_, self.k_feature_names_ = \
_get_featurenames(self.subsets_,
self.k_feature_idx_,
custom_feature_names,
X)
return self
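# Try adding each remaining feature (in parallel) to the current subset and
# return the best resulting (indices, average score, cv scores).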
def _inclusion(self, orig_set, subset, X, y, ignore_feature=None,
groups=None, **fit_params):
all_avg_scores = []
all_cv_scores = []
all_subsets = []
res = (None, None, None)
remaining = orig_set - subset
if remaining:
features = len(remaining)
n_jobs = min(self.n_jobs, features)
parallel = Parallel(n_jobs=n_jobs, verbose=self.verbose,
pre_dispatch=self.pre_dispatch)
work = parallel(delayed(_calc_score)
(self, X[:, tuple(subset | {feature})], y,
tuple(subset | {feature}),
groups=groups, **fit_params)
for feature in remaining
if feature != ignore_feature)
for new_subset, cv_scores in work:
all_avg_scores.append(np.nanmean(cv_scores))
all_cv_scores.append(cv_scores)
all_subsets.append(new_subset)
best = np.argmax(all_avg_scores)
res = (all_subsets[best],
all_avg_scores[best],
all_cv_scores[best])
return res
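# Try removing each feature (in parallel) from the current subset, skipping
# candidates that would drop a fixed feature, and return the best resulting
# (indices, average score, cv scores).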
def _exclusion(self, feature_set, X, y, fixed_feature=None,
groups=None, **fit_params):
n = len(feature_set)
res = (None, None, None)
if n > 1:
all_avg_scores = []
all_cv_scores = []
all_subsets = []
features = n
n_jobs = min(self.n_jobs, features)
parallel = Parallel(n_jobs=n_jobs, verbose=self.verbose,
pre_dispatch=self.pre_dispatch)
work = parallel(delayed(_calc_score)(self, X[:, p], y, p,
groups=groups, **fit_params)
for p in combinations(feature_set, r=n - 1)
if not fixed_feature or
fixed_feature.issubset(set(p)))
for p, cv_scores in work:
all_avg_scores.append(np.nanmean(cv_scores))
all_cv_scores.append(cv_scores)
all_subsets.append(p)
best = np.argmax(all_avg_scores)
res = (all_subsets[best],
all_avg_scores[best],
all_cv_scores[best])
return res
def transform(self, X):
"""Reduce X to its most important features.
Parameters
----------
X : {array-like, sparse matrix}, shape = [n_samples, n_features]
Training vectors, where n_samples is the number of samples and
n_features is the number of features.
New in v 0.13.0: pandas DataFrames are now also accepted as
argument for X.
Returns
-------
Reduced feature subset of X, shape={n_samples, k_features}
"""
self._check_fitted()
if hasattr(X, 'loc'):
X_ = X.values
else:
X_ = X
return X_[:, self.k_feature_idx_]
def fit_transform(self, X, y, groups=None, **fit_params):
"""Fit to training data then reduce X to its most important features.
Parameters
----------
X : {array-like, sparse matrix}, shape = [n_samples, n_features]
Training vectors, where n_samples is the number of samples and
n_features is the number of features.
New in v 0.13.0: pandas DataFrames are now also accepted as
argument for X.
y : array-like, shape = [n_samples]
Target values.
New in v 0.13.0: a pandas Series is now also accepted as
argument for y.
groups : array-like, with shape (n_samples,), optional
Group labels for the samples used while splitting the dataset into
train/test set. Passed to the fit method of the cross-validator.
fit_params : various, optional
Additional parameters that are being passed to the estimator.
For example, `sample_weight=weights`.
Returns
-------
Reduced feature subset of X, shape={n_samples, k_features}
"""
self.fit(X, y, groups=groups, **fit_params)
return self.transform(X)
def get_metric_dict(self, confidence_interval=0.95):
"""Return metric dictionary
Parameters
----------
confidence_interval : float (default: 0.95)
A positive float between 0.0 and 1.0 to compute the confidence
interval bounds of the CV score averages.
Returns
-------
Dictionary, where each entry corresponds to one feature subset
recorded during the search (the dictionary keys are the subset
sizes k). Each value is itself a dictionary with the following keys:
'feature_idx': tuple of the indices of the feature subset
'cv_scores': list with individual CV scores
'avg_score': average of CV scores
'std_dev': standard deviation of the CV score average
'std_err': standard error of the CV score average
'ci_bound': confidence interval bound of the CV score average
"""
self._check_fitted()
fdict = deepcopy(self.subsets_)
for k in fdict:
std_dev = np.std(self.subsets_[k]['cv_scores'])
bound, std_err = self._calc_confidence(
self.subsets_[k]['cv_scores'],
confidence=confidence_interval)
fdict[k]['ci_bound'] = bound
fdict[k]['std_dev'] = std_dev
fdict[k]['std_err'] = std_err
return fdict
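# The confidence bound is the standard error of the mean multiplied by the
# two-sided Student-t quantile at (1 + confidence) / 2.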
def _calc_confidence(self, ary, confidence=0.95):
std_err = scipy.stats.sem(ary)
bound = std_err * sp.stats.t._ppf((1 + confidence) / 2.0, len(ary))
return bound, std_err
def _check_fitted(self):
if not self.fitted:
raise AttributeError('SequentialFeatureSelector has not been'
' fitted, yet.')