projektAI/venv/Lib/site-packages/mlxtend/feature_selection/exhaustive_feature_selector.py

# Sebastian Raschka 2014-2020
# mlxtend Machine Learning Library Extensions
#
# Algorithm for exhaustive feature selection.
# Author: Sebastian Raschka <sebastianraschka.com>
#
# License: BSD 3 clause


import numpy as np
import scipy as sp
import scipy.stats
import sys
import operator as op
from copy import deepcopy
from itertools import combinations
from itertools import chain
from functools import reduce
from sklearn.metrics import get_scorer
from sklearn.base import clone
from sklearn.base import BaseEstimator
from sklearn.base import MetaEstimatorMixin
from ..externals.name_estimators import _name_estimators
from sklearn.model_selection import cross_val_score
from joblib import Parallel, delayed


def _calc_score(selector, X, y, indices, groups=None, **fit_params):
    if selector.cv:
        scores = cross_val_score(selector.est_,
                                 X[:, indices], y,
                                 groups=groups,
                                 cv=selector.cv,
                                 scoring=selector.scorer,
                                 n_jobs=1,
                                 pre_dispatch=selector.pre_dispatch,
                                 fit_params=fit_params)
    else:
        selector.est_.fit(X[:, indices], y, **fit_params)
        scores = np.array([selector.scorer(selector.est_, X[:, indices], y)])
    return indices, scores


def _get_featurenames(subsets_dict, feature_idx, custom_feature_names, X):
    feature_names = None
    if feature_idx is not None:
        if custom_feature_names is not None:
            feature_names = tuple((custom_feature_names[i]
                                   for i in feature_idx))
        elif hasattr(X, 'loc'):
            feature_names = tuple((X.columns[i] for i in feature_idx))
        else:
            feature_names = tuple(str(i) for i in feature_idx)

    subsets_dict_ = deepcopy(subsets_dict)
    for key in subsets_dict_:
        if custom_feature_names is not None:
            new_tuple = tuple((custom_feature_names[i]
                               for i in subsets_dict[key]['feature_idx']))
        elif hasattr(X, 'loc'):
            new_tuple = tuple((X.columns[i]
                               for i in subsets_dict[key]['feature_idx']))
        else:
            new_tuple = tuple(str(i) for i in subsets_dict[key]['feature_idx'])
        subsets_dict_[key]['feature_names'] = new_tuple

    return subsets_dict_, feature_names


class ExhaustiveFeatureSelector(BaseEstimator, MetaEstimatorMixin):

    """Exhaustive Feature Selection for Classification and Regression.
       (new in v0.4.3)

    Parameters
    ----------
    estimator : scikit-learn classifier or regressor
    min_features : int (default: 1)
        Minumum number of features to select
    max_features : int (default: 1)
        Maximum number of features to select
    print_progress : bool (default: True)
        Prints progress as the number of epochs
        to stderr.
    scoring : str, (default='accuracy')
        Scoring metric in {accuracy, f1, precision, recall, roc_auc}
        for classifiers,
        {'mean_absolute_error', 'mean_squared_error',
        'median_absolute_error', 'r2'} for regressors,
        or a callable object or function with
        signature ``scorer(estimator, X, y)``.
    cv : int (default: 5)
        Scikit-learn cross-validation generator or `int`.
        If estimator is a classifier (or y consists of integer class labels),
        stratified k-fold is performed, and regular k-fold cross-validation
        otherwise.
        No cross-validation if cv is None, False, or 0.
    n_jobs : int (default: 1)
        The number of CPUs to use for evaluating different feature subsets
        in parallel. -1 means 'all CPUs'.
    pre_dispatch : int, or string (default: '2*n_jobs')
        Controls the number of jobs that get dispatched
        during parallel execution if `n_jobs > 1` or `n_jobs=-1`.
        Reducing this number can be useful to avoid an explosion of
        memory consumption when more jobs get dispatched than CPUs can process.
        This parameter can be:
        None, in which case all the jobs are immediately created and spawned.
            Use this for lightweight and fast-running jobs,
            to avoid delays due to on-demand spawning of the jobs
        An int, giving the exact number of total jobs that are spawned
        A string, giving an expression as a function
            of n_jobs, as in `2*n_jobs`
    clone_estimator : bool (default: True)
        Clones estimator if True; works with the original estimator instance
        if False. Set to False if the estimator doesn't
        implement scikit-learn's set_params and get_params methods.
        In addition, it is required to set cv=0, and n_jobs=1.

    Attributes
    ----------
    best_idx_ : array-like, shape = [n_predictions]
        Feature Indices of the selected feature subsets.
    best_feature_names_ : array-like, shape = [n_predictions]
        Feature names of the selected feature subsets. If pandas
        DataFrames are used in the `fit` method, the feature
        names correspond to the column names. Otherwise, the
        feature names are string representation of the feature
        array indices. New in v 0.13.0.
    best_score_ : float
        Cross validation average score of the selected subset.
    subsets_ : dict
        A dictionary of selected feature subsets during the
        exhaustive selection, where the dictionary keys are
        the lengths k of these feature subsets. The dictionary
        values are dictionaries themselves with the following
        keys: 'feature_idx' (tuple of indices of the feature subset)
              'feature_names' (tuple of feature names of the feat. subset)
              'cv_scores' (list individual cross-validation scores)
              'avg_score' (average cross-validation score)
        Note that if pandas
        DataFrames are used in the `fit` method, the 'feature_names'
        correspond to the column names. Otherwise, the
        feature names are string representation of the feature
        array indices. The 'feature_names' is new in v 0.13.0.

    Examples
    -----------
    For usage examples, please see
    http://rasbt.github.io/mlxtend/user_guide/feature_selection/ExhaustiveFeatureSelector/

    """
    def __init__(self, estimator, min_features=1, max_features=1,
                 print_progress=True, scoring='accuracy',
                 cv=5, n_jobs=1,
                 pre_dispatch='2*n_jobs',
                 clone_estimator=True):
        self.estimator = estimator
        self.min_features = min_features
        self.max_features = max_features
        self.pre_dispatch = pre_dispatch
        self.scoring = scoring
        self.scorer = get_scorer(scoring)
        self.cv = cv
        self.print_progress = print_progress
        self.n_jobs = n_jobs
        self.named_est = {key: value for key, value in
                          _name_estimators([self.estimator])}
        self.clone_estimator = clone_estimator
        if self.clone_estimator:
            self.est_ = clone(self.estimator)
        else:
            self.est_ = self.estimator
        self.fitted = False
        self.interrupted_ = False

        # don't mess with this unless testing
        self._TESTING_INTERRUPT_MODE = False

    def fit(self, X, y, custom_feature_names=None, groups=None, **fit_params):
        """Perform feature selection and learn model from training data.

        Parameters
        ----------
        X : {array-like, sparse matrix}, shape = [n_samples, n_features]
            Training vectors, where n_samples is the number of samples and
            n_features is the number of features.
            New in v 0.13.0: pandas DataFrames are now also accepted as
            argument for X.
        y : array-like, shape = [n_samples]
            Target values.
        custom_feature_names : None or tuple (default: tuple)
            Custom feature names for `self.k_feature_names` and
            `self.subsets_[i]['feature_names']`.
            (new in v 0.13.0)
        groups : array-like, with shape (n_samples,), optional
            Group labels for the samples used while splitting the dataset into
            train/test set. Passed to the fit method of the cross-validator.
        fit_params : dict of string -> object, optional
            Parameters to pass to to the fit method of classifier.

        Returns
        -------
        self : object

        """

        # reset from a potential previous fit run
        self.subsets_ = {}
        self.fitted = False
        self.interrupted_ = False
        self.best_idx_ = None
        self.best_feature_names_ = None
        self.best_score_ = None

        if hasattr(X, 'loc'):
            X_ = X.values
        else:
            X_ = X

        if (custom_feature_names is not None
                and len(custom_feature_names) != X.shape[1]):
            raise ValueError('If custom_feature_names is not None, '
                             'the number of elements in custom_feature_names '
                             'must equal the number of columns in X.')

        if (not isinstance(self.max_features, int) or
                (self.max_features > X.shape[1] or self.max_features < 1)):
            raise AttributeError('max_features must be'
                                 ' smaller than %d and larger than 0' %
                                 (X.shape[1] + 1))

        if (not isinstance(self.min_features, int) or
                (self.min_features > X.shape[1] or self.min_features < 1)):
            raise AttributeError('min_features must be'
                                 ' smaller than %d and larger than 0'
                                 % (X.shape[1] + 1))

        if self.max_features < self.min_features:
            raise AttributeError('min_features must be <= max_features')

        candidates = chain.from_iterable(
            combinations(range(X_.shape[1]), r=i) for i in
            range(self.min_features, self.max_features + 1)
        )

        def ncr(n, r):
            """Return the number of combinations of length r from n items.

            Parameters
            ----------
            n : {integer}
            Total number of items
            r : {integer}
            Number of items to select from n

            Returns
            -------
            Number of combinations, integer

            """

            r = min(r, n-r)
            if r == 0:
                return 1
            numer = reduce(op.mul, range(n, n-r, -1))
            denom = reduce(op.mul, range(1, r+1))
            return numer//denom

        all_comb = np.sum([ncr(n=X_.shape[1], r=i)
                           for i in range(self.min_features,
                                          self.max_features + 1)])

        n_jobs = min(self.n_jobs, all_comb)
        parallel = Parallel(n_jobs=n_jobs, pre_dispatch=self.pre_dispatch)
        work = enumerate(parallel(delayed(_calc_score)
                                  (self, X_, y, c, groups=groups, **fit_params)
                                  for c in candidates))

        try:
            for iteration, (c, cv_scores) in work:

                self.subsets_[iteration] = {'feature_idx': c,
                                            'cv_scores': cv_scores,
                                            'avg_score': np.mean(cv_scores)}

                if self.print_progress:
                    sys.stderr.write('\rFeatures: %d/%d' % (
                        iteration + 1, all_comb))
                    sys.stderr.flush()

                if self._TESTING_INTERRUPT_MODE:
                    self.subsets_, self.best_feature_names_ = \
                        _get_featurenames(self.subsets_,
                                          self.best_idx_,
                                          custom_feature_names,
                                          X)
                    raise KeyboardInterrupt

        except KeyboardInterrupt as e:
            self.interrupted_ = True
            sys.stderr.write('\nSTOPPING EARLY DUE TO KEYBOARD INTERRUPT...')

        max_score = float('-inf')
        for c in self.subsets_:
            if self.subsets_[c]['avg_score'] > max_score:
                max_score = self.subsets_[c]['avg_score']
                best_subset = c
        score = max_score
        idx = self.subsets_[best_subset]['feature_idx']

        self.best_idx_ = idx
        self.best_score_ = score
        self.fitted = True
        self.subsets_, self.best_feature_names_ = \
            _get_featurenames(self.subsets_,
                              self.best_idx_,
                              custom_feature_names,
                              X)
        return self

    def transform(self, X):
        """Return the best selected features from X.

        Parameters
        ----------
        X : {array-like, sparse matrix}, shape = [n_samples, n_features]
            Training vectors, where n_samples is the number of samples and
            n_features is the number of features.
            New in v 0.13.0: pandas DataFrames are now also accepted as
            argument for X.

        Returns
        -------
        Feature subset of X, shape={n_samples, k_features}

        """
        self._check_fitted()
        if hasattr(X, 'loc'):
            X_ = X.values
        else:
            X_ = X
        return X_[:, self.best_idx_]

    def fit_transform(self, X, y, groups=None, **fit_params):
        """Fit to training data and return the best selected features from X.

        Parameters
        ----------
        X : {array-like, sparse matrix}, shape = [n_samples, n_features]
            Training vectors, where n_samples is the number of samples and
            n_features is the number of features.
            New in v 0.13.0: pandas DataFrames are now also accepted as
            argument for X.
        y : array-like, shape = [n_samples]
            Target values.
        groups : array-like, with shape (n_samples,), optional
            Group labels for the samples used while splitting the dataset into
            train/test set. Passed to the fit method of the cross-validator.
        fit_params : dict of string -> object, optional
            Parameters to pass to to the fit method of classifier.

        Returns
        -------
        Feature subset of X, shape={n_samples, k_features}

        """
        self.fit(X, y, groups=groups, **fit_params)
        return self.transform(X)

    def get_metric_dict(self, confidence_interval=0.95):
        """Return metric dictionary

        Parameters
        ----------
        confidence_interval : float (default: 0.95)
            A positive float between 0.0 and 1.0 to compute the confidence
            interval bounds of the CV score averages.

        Returns
        ----------
        Dictionary with items where each dictionary value is a list
        with the number of iterations (number of feature subsets) as
        its length. The dictionary keys corresponding to these lists
        are as follows:
            'feature_idx': tuple of the indices of the feature subset
            'cv_scores': list with individual CV scores
            'avg_score': of CV average scores
            'std_dev': standard deviation of the CV score average
            'std_err': standard error of the CV score average
            'ci_bound': confidence interval bound of the CV score average

        """
        self._check_fitted()
        fdict = deepcopy(self.subsets_)
        for k in fdict:
            std_dev = np.std(self.subsets_[k]['cv_scores'])
            bound, std_err = self._calc_confidence(
                self.subsets_[k]['cv_scores'],
                confidence=confidence_interval)
            fdict[k]['ci_bound'] = bound
            fdict[k]['std_dev'] = std_dev
            fdict[k]['std_err'] = std_err
        return fdict

    def _calc_confidence(self, ary, confidence=0.95):
        std_err = scipy.stats.sem(ary)
        bound = std_err * sp.stats.t._ppf((1 + confidence) / 2.0, len(ary))
        return bound, std_err

    def _check_fitted(self):
        if not self.fitted:
            raise AttributeError('ExhaustiveFeatureSelector has not been'
                                 ' fitted, yet.')