# Stacking CV classifier

# Sebastian Raschka 2014-2020
# mlxtend Machine Learning Library Extensions
#
# An ensemble-learning meta-classifier for stacking
# Authors: Reiichiro Nakano <github.com/reiinakano>
#          Sebastian Raschka <sebastianraschka.com>
#
# License: BSD 3 clause

import numpy as np
from scipy import sparse
from sklearn.base import TransformerMixin, clone
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection._split import check_cv

from ..externals.estimator_checks import check_is_fitted
from ..externals.name_estimators import _name_estimators
from ..utils.base_compostion import _BaseXComposition
from ._base_classification import _BaseStackingClassifier

# from sklearn.utils import check_X_y


class StackingCVClassifier(_BaseXComposition, _BaseStackingClassifier,
                           TransformerMixin):
    """A 'Stacking Cross-Validation' classifier for scikit-learn estimators.

    New in mlxtend v0.4.3

    Parameters
    ----------
    classifiers : array-like, shape = [n_classifiers]
        A list of classifiers.
        Invoking the `fit` method on the `StackingCVClassifier` will fit
        clones of these original classifiers that will
        be stored in the class attribute `self.clfs_` if `use_clones=True`.
    meta_classifier : object
        The meta-classifier to be fitted on the ensemble of
        classifiers
    use_probas : bool (default: False)
        If True, trains the meta-classifier on predicted probabilities
        instead of class labels.
    drop_proba_col : string (default: None)
        Drops one "probability" column in the feature set, because it is
        redundant:
        p(y_c) = 1 - p(y_1) - p(y_2) - ... - p(y_{c-1}).
        This can be useful for meta-classifiers that are sensitive to
        perfectly collinear features.
        If 'last', drops the last probability column.
        If 'first', drops the first probability column.
        Only relevant if `use_probas=True`.
    cv : int, cross-validation generator or an iterable, optional (default: 2)
        Determines the cross-validation splitting strategy.
        Possible inputs for cv are:
        - None, to use the default 2-fold cross validation,
        - integer, to specify the number of folds in a `(Stratified)KFold`,
        - An object to be used as a cross-validation generator.
        - An iterable yielding train, test splits.
        For integer/None inputs, it will use either a `KFold` or
        `StratifiedKFold` cross validation depending on the value of the
        `stratify` argument.
    shuffle : bool (default: True)
        If True, and the `cv` argument is an integer, the training data will
        be shuffled at fitting stage prior to cross-validation. If the `cv`
        argument is a specific cross validation technique, this argument is
        ignored.
    random_state : int, RandomState instance or None, optional (default: None)
        Controls the randomness of the cv splitter. Used when `cv` is
        an integer and `shuffle=True`. New in v0.16.0.
    stratify : bool (default: True)
        If True, and the `cv` argument is an integer, it will follow a
        stratified K-Fold cross validation technique. If the `cv` argument
        is a specific cross validation technique, this argument is ignored.
    verbose : int, optional (default=0)
        Controls the verbosity of the building process.
        - `verbose=0` (default): Prints nothing
        - `verbose=1`: Prints the number & name of the classifier being
          fitted and which fold is currently being used for fitting
        - `verbose=2`: Prints info about the parameters of the
          classifier being fitted
        - `verbose>2`: Changes the `verbose` param of the underlying
          classifier to self.verbose - 2
    use_features_in_secondary : bool (default: False)
        If True, the meta-classifier will be trained both on the predictions
        of the original classifiers and the original dataset.
        If False, the meta-classifier will be trained only on the predictions
        of the original classifiers.
    store_train_meta_features : bool (default: False)
        If True, the meta-features computed from the training data used
        for fitting the meta-classifier are stored in the
        `self.train_meta_features_` array, which can be
        accessed after calling `fit`.
    use_clones : bool (default: True)
        Clones the classifiers for stacking classification if True (default)
        or else uses the original ones, which will be refitted on the dataset
        upon calling the `fit` method. Hence, if use_clones=True, the original
        input classifiers will remain unmodified upon using the
        StackingCVClassifier's `fit` method.
        Setting `use_clones=False` is
        recommended if you are working with estimators that support
        the scikit-learn fit/predict API interface but are not compatible
        with scikit-learn's `clone` function.
    n_jobs : int or None, optional (default=None)
        The number of CPUs to use to do the computation.
        ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.
        ``-1`` means using all processors. See :term:`Glossary <n_jobs>`
        for more details. New in v0.16.0.
    pre_dispatch : int, or string, optional
        Controls the number of jobs that get dispatched during parallel
        execution. Reducing this number can be useful to avoid an
        explosion of memory consumption when more jobs get dispatched
        than CPUs can process. This parameter can be:
        - None, in which case all the jobs are immediately
          created and spawned. Use this for lightweight and
          fast-running jobs, to avoid delays due to on-demand
          spawning of the jobs
        - An int, giving the exact number of total jobs that are
          spawned
        - A string, giving an expression as a function of n_jobs,
          as in '2*n_jobs'
        New in v0.16.0.

    Attributes
    ----------
    clfs_ : list, shape=[n_classifiers]
        Fitted classifiers (clones of the original classifiers)
    meta_clf_ : estimator
        Fitted meta-classifier (clone of the original meta-estimator)
    train_meta_features_ : numpy array, shape = [n_samples, n_classifiers]
        meta-features for training data, where n_samples is the
        number of samples
        in training data and n_classifiers is the number of classifiers.

    Examples
    --------
    For usage examples, please see
    http://rasbt.github.io/mlxtend/user_guide/classifier/StackingCVClassifier/
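
    A minimal usage sketch (illustrative only; the toy data and the choice
    of base- and meta-estimators below are assumptions, not prescribed by
    this class):

    >>> from sklearn.datasets import load_iris
    >>> from sklearn.linear_model import LogisticRegression
    >>> from sklearn.naive_bayes import GaussianNB
    >>> from sklearn.neighbors import KNeighborsClassifier
    >>> from mlxtend.classifier import StackingCVClassifier
    >>> X, y = load_iris(return_X_y=True)
    >>> sclf = StackingCVClassifier(
    ...     classifiers=[KNeighborsClassifier(), GaussianNB()],
    ...     meta_classifier=LogisticRegression(),
    ...     use_probas=True, cv=2, random_state=42)
    >>> sclf.fit(X, y).predict(X[:3]).shape
    (3,)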

    """

    def __init__(self, classifiers, meta_classifier,
                 use_probas=False, drop_proba_col=None,
                 cv=2, shuffle=True,
                 random_state=None, stratify=True, verbose=0,
                 use_features_in_secondary=False,
                 store_train_meta_features=False,
                 use_clones=True, n_jobs=None,
                 pre_dispatch='2*n_jobs'):

        self.classifiers = classifiers
        self.meta_classifier = meta_classifier
        self.use_probas = use_probas

        allowed = {None, 'first', 'last'}
        if drop_proba_col not in allowed:
            raise ValueError('`drop_proba_col` must be in %s. Got %s'
                             % (allowed, drop_proba_col))

        self.drop_proba_col = drop_proba_col
        self.cv = cv
        self.shuffle = shuffle
        self.random_state = random_state
        self.stratify = stratify
        self.verbose = verbose
        self.use_features_in_secondary = use_features_in_secondary
        self.store_train_meta_features = store_train_meta_features
        self.use_clones = use_clones
        self.n_jobs = n_jobs
        self.pre_dispatch = pre_dispatch

    @property
    def named_classifiers(self):
        return _name_estimators(self.classifiers)

    def fit(self, X, y, groups=None, sample_weight=None):
        """ Fit ensemble classifiers and the meta-classifier.

        Parameters
        ----------
        X : numpy array, shape = [n_samples, n_features]
            Training vectors, where n_samples is the number of samples and
            n_features is the number of features.

        y : numpy array, shape = [n_samples]
            Target values.

        groups : numpy array/None, shape = [n_samples]
            The group that each sample belongs to. This is used by specific
            folding strategies such as GroupKFold()

        sample_weight : array-like, shape = [n_samples], optional
            Sample weights passed as `sample_weight` to each classifier
            in the classifiers list as well as the meta_classifier.
            Raises an error if some classifier does not support
            sample_weight in the fit() method.

        Returns
        -------
        self : object

        """
        if self.use_clones:
            self.clfs_ = clone(self.classifiers)
            self.meta_clf_ = clone(self.meta_classifier)
        else:
            self.clfs_ = self.classifiers
            self.meta_clf_ = self.meta_classifier
        if self.verbose > 0:
            print("Fitting %d classifiers..." % (len(self.classifiers)))

        final_cv = check_cv(self.cv, y, classifier=self.stratify)
        if isinstance(self.cv, int):
            # Override shuffle parameter in case of a self-generated
            # cross-validation strategy
            final_cv.shuffle = self.shuffle
            final_cv.random_state = self.random_state

        # Disable global input validation, because it causes issues when
        # pipelines are used that perform preprocessing on X. I.e., X may
        # not be directly passed to the classifiers, which is why this code
        # would raise unnecessary errors at this point.
        # X, y = check_X_y(X, y, accept_sparse=['csc', 'csr'], dtype=None)

        if sample_weight is None:
            fit_params = None
        else:
            fit_params = dict(sample_weight=sample_weight)

        meta_features = None

        for n, model in enumerate(self.clfs_):

            if self.verbose > 0:
                i = n + 1
                print("Fitting classifier%d: %s (%d/%d)" %
                      (i, _name_estimators((model,))[0][0],
                       i, len(self.clfs_)))

            if self.verbose > 2:
                if hasattr(model, 'verbose'):
                    model.set_params(verbose=self.verbose - 2)

            if self.verbose > 1:
                print(_name_estimators((model,))[0][1])

            prediction = cross_val_predict(
                model, X, y, groups=groups, cv=final_cv,
                n_jobs=self.n_jobs, fit_params=fit_params,
                verbose=self.verbose, pre_dispatch=self.pre_dispatch,
                method='predict_proba' if self.use_probas else 'predict')
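
            # When stacking plain labels, add a column axis so the
            # meta-feature matrix stays two-dimensional; when stacking
            # probabilities, optionally drop one column, which is redundant
            # (the class probabilities sum to 1) and would otherwise make
            # the meta-features perfectly collinear.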
            if not self.use_probas:
                prediction = prediction[:, np.newaxis]
            elif self.drop_proba_col == 'last':
                prediction = prediction[:, :-1]
            elif self.drop_proba_col == 'first':
                prediction = prediction[:, 1:]

            if meta_features is None:
                meta_features = prediction
            else:
                meta_features = np.column_stack((meta_features, prediction))

        if self.store_train_meta_features:
            self.train_meta_features_ = meta_features

        # Fit the base models correctly this time using ALL the training set
        for model in self.clfs_:
            if sample_weight is None:
                model.fit(X, y)
            else:
                model.fit(X, y, sample_weight=sample_weight)

        # Fit the secondary model
        if self.use_features_in_secondary:
            meta_features = self._stack_first_level_features(
                X,
                meta_features
            )

        if sample_weight is None:
            self.meta_clf_.fit(meta_features, y)
        else:
            self.meta_clf_.fit(meta_features, y,
                               sample_weight=sample_weight)

        return self

    def get_params(self, deep=True):
        """Return estimator parameter names for GridSearch support."""
        return self._get_params('named_classifiers', deep=deep)

    def set_params(self, **params):
        """Set the parameters of this estimator.

        Valid parameter keys can be listed with ``get_params()``.
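
        For example (illustrative; the exact keys depend on the estimators
        used), a nested parameter of the meta-classifier can be set via
        ``meta_classifier__C``, and a first-level classifier's parameter
        via its lowercased class name, e.g.
        ``kneighborsclassifier__n_neighbors``.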

        Returns
        -------
        self
        """
        self._set_params('classifiers', 'named_classifiers', **params)
        return self

    def predict_meta_features(self, X):
        """ Get meta-features of test-data.

        Parameters
        ----------
        X : numpy array, shape = [n_samples, n_features]
            Test vectors, where n_samples is the number of samples and
            n_features is the number of features.

        Returns
        -------
        meta-features : numpy array, shape = [n_samples, n_classifiers]
            Returns the meta-features for test data. If `use_probas=True`,
            the number of columns is the number of classifiers times the
            number of (retained) probability columns per classifier.

        """
        check_is_fitted(self, ['clfs_', 'meta_clf_'])
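
        # Reproduce, at prediction time, the same per-classifier column
        # layout (labels vs. probabilities, with an optional dropped
        # probability column) that was used to build the training
        # meta-features in `fit`.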
        per_model_preds = []

        for model in self.clfs_:
            if not self.use_probas:
                prediction = model.predict(X)[:, np.newaxis]
            else:
                if self.drop_proba_col == 'last':
                    prediction = model.predict_proba(X)[:, :-1]
                elif self.drop_proba_col == 'first':
                    prediction = model.predict_proba(X)[:, 1:]
                else:
                    prediction = model.predict_proba(X)

            per_model_preds.append(prediction)

        return np.hstack(per_model_preds)

    def _stack_first_level_features(self, X, meta_features):
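        # Append the original input features next to the meta-features;
        # scipy.sparse.hstack is used when X is sparse so that sparsity
        # is preserved instead of densifying the matrix.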
        if sparse.issparse(X):
            stack_fn = sparse.hstack
        else:
            stack_fn = np.hstack

        return stack_fn((X, meta_features))