
# Stacking CV classifier
# Sebastian Raschka 2014-2020
# mlxtend Machine Learning Library Extensions
#
# An ensemble-learning meta-classifier for stacking
# Authors: Reiichiro Nakano <github.com/reiinakano>
# Sebastian Raschka <sebastianraschka.com>
#
# License: BSD 3 clause
import numpy as np
from scipy import sparse
from sklearn.base import TransformerMixin, clone
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection._split import check_cv
from ..externals.estimator_checks import check_is_fitted
from ..externals.name_estimators import _name_estimators
from ..utils.base_compostion import _BaseXComposition
from ._base_classification import _BaseStackingClassifier
# from sklearn.utils import check_X_y
class StackingCVClassifier(_BaseXComposition, _BaseStackingClassifier,
TransformerMixin):
"""A 'Stacking Cross-Validation' classifier for scikit-learn estimators.
New in mlxtend v0.4.3
Parameters
----------
classifiers : array-like, shape = [n_classifiers]
A list of classifiers.
Invoking the `fit` method on the `StackingCVClassifier` will fit clones
of these original classifiers that will
be stored in the class attribute `self.clfs_` if `use_clones=True`.
meta_classifier : object
The meta-classifier to be fitted on the ensemble of
classifiers
use_probas : bool (default: False)
If True, trains meta-classifier based on predicted probabilities
instead of class labels.
drop_proba_col : string (default: None)
Drops extra "probability" column in the feature set, because it is
redundant:
p(y_c) = 1 - p(y_1) - p(y_2) - ... - p(y_{c-1}).
This can be useful for meta-classifiers that are sensitive to perfectly
collinear features.
If 'last', drops last probability column.
If 'first', drops first probability column.
Only relevant if `use_probas=True`.
cv : int, cross-validation generator or an iterable, optional (default: 2)
Determines the cross-validation splitting strategy.
Possible inputs for cv are:
- None, to use the default 2-fold cross validation,
- integer, to specify the number of folds in a `(Stratified)KFold`,
- an object to be used as a cross-validation generator,
- an iterable yielding train/test splits.
For integer/None inputs, it will use either a `KFold` or
`StratifiedKFold` cross validation depending on the value of the
`stratify` argument.
shuffle : bool (default: True)
If True, and the `cv` argument is integer, the training data will be
shuffled at fitting stage prior to cross-validation. If the `cv`
argument is a specific cross validation technique, this argument is
ignored.
random_state : int, RandomState instance or None, optional (default: None)
Controls the randomness of the cv splitter. Used when `cv` is
integer and `shuffle=True`. New in v0.16.0.
stratify : bool (default: True)
If True, and the `cv` argument is an integer, it will follow a
stratified K-Fold cross validation technique. If the `cv` argument is
a specific cross validation technique, this argument is ignored.
verbose : int, optional (default=0)
Controls the verbosity of the building process.
- `verbose=0` (default): Prints nothing
- `verbose=1`: Prints the number & name of the classifier being fitted
and which fold is currently being used for fitting
- `verbose=2`: Prints info about the parameters of the
classifier being fitted
- `verbose>2`: Changes the `verbose` param of the underlying
classifier to self.verbose - 2
use_features_in_secondary : bool (default: False)
If True, the meta-classifier will be trained both on the predictions
of the original classifiers and the original dataset.
If False, the meta-classifier will be trained only on the predictions
of the original classifiers.
store_train_meta_features : bool (default: False)
If True, the meta-features computed from the training data used
for fitting the meta-classifier are stored in the
`self.train_meta_features_` array, which can be
accessed after calling `fit`.
use_clones : bool (default: True)
Clones the classifiers for stacking classification if True (default)
or else uses the original ones, which will be refitted on the dataset
upon calling the `fit` method. Hence, if use_clones=True, the original
input classifiers will remain unmodified upon using the
StackingCVClassifier's `fit` method.
Setting `use_clones=False` is
recommended if you are working with estimators that support the
scikit-learn fit/predict API interface but are not compatible with
scikit-learn's `clone` function.
n_jobs : int or None, optional (default=None)
The number of CPUs to use to do the computation.
``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.
``-1`` means using all processors. See :term:`Glossary <n_jobs>`
for more details. New in v0.16.0.
pre_dispatch : int or string, optional
Controls the number of jobs that get dispatched during parallel
execution. Reducing this number can be useful to avoid an
explosion of memory consumption when more jobs get dispatched
than CPUs can process. This parameter can be:
- None, in which case all the jobs are immediately
created and spawned. Use this for lightweight and
fast-running jobs, to avoid delays due to on-demand
spawning of the jobs
- An int, giving the exact number of total jobs that are
spawned
- A string, giving an expression as a function of n_jobs,
as in '2*n_jobs'
New in v0.16.0.
Attributes
----------
clfs_ : list, shape=[n_classifiers]
Fitted classifiers (clones of the original classifiers)
meta_clf_ : estimator
Fitted meta-classifier (clone of the original meta-estimator)
train_meta_features_ : numpy array, shape = [n_samples, n_classifiers]
meta-features for training data, where n_samples is the
number of samples in training data and n_classifiers is
the number of classifiers.
Examples
-----------
For usage examples, please see
http://rasbt.github.io/mlxtend/user_guide/classifier/StackingCVClassifier/
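A minimal usage sketch (the estimator choices here are illustrative
assumptions, not prescribed by the library):

>>> from sklearn.linear_model import LogisticRegression
>>> from sklearn.naive_bayes import GaussianNB
>>> from sklearn.neighbors import KNeighborsClassifier
>>> from mlxtend.classifier import StackingCVClassifier
>>> sclf = StackingCVClassifier(
...     classifiers=[KNeighborsClassifier(), GaussianNB()],
...     meta_classifier=LogisticRegression(),
...     use_probas=True, cv=5, random_state=42)
>>> # sclf.fit(X_train, y_train); sclf.predict(X_test)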
"""
def __init__(self, classifiers, meta_classifier,
use_probas=False, drop_proba_col=None,
cv=2, shuffle=True,
random_state=None, stratify=True, verbose=0,
use_features_in_secondary=False,
store_train_meta_features=False,
use_clones=True, n_jobs=None,
pre_dispatch='2*n_jobs'):
self.classifiers = classifiers
self.meta_classifier = meta_classifier
self.use_probas = use_probas
allowed = {None, 'first', 'last'}
if drop_proba_col not in allowed:
raise ValueError('`drop_proba_col` must be in %s. Got %s'
% (allowed, drop_proba_col))
self.drop_proba_col = drop_proba_col
self.cv = cv
self.shuffle = shuffle
self.random_state = random_state
self.stratify = stratify
self.verbose = verbose
self.use_features_in_secondary = use_features_in_secondary
self.store_train_meta_features = store_train_meta_features
self.use_clones = use_clones
self.n_jobs = n_jobs
self.pre_dispatch = pre_dispatch
@property
def named_classifiers(self):
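"""Return a list of (name, estimator) tuples for the classifiers."""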
return _name_estimators(self.classifiers)
def fit(self, X, y, groups=None, sample_weight=None):
""" Fit ensemble classifers and the meta-classifier.
Parameters
----------
X : numpy array, shape = [n_samples, n_features]
Training vectors, where n_samples is the number of samples and
n_features is the number of features.
y : numpy array, shape = [n_samples]
Target values.
groups : numpy array/None, shape = [n_samples]
The group that each sample belongs to. This is used by specific
folding strategies such as GroupKFold()
sample_weight : array-like, shape = [n_samples], optional
Sample weights passed as `sample_weight` to each classifier
in the classifiers list as well as the meta_classifier.
Raises an error if some classifier does not support
sample_weight in the fit() method.
Returns
-------
self : object
"""
if self.use_clones:
self.clfs_ = clone(self.classifiers)
self.meta_clf_ = clone(self.meta_classifier)
else:
self.clfs_ = self.classifiers
self.meta_clf_ = self.meta_classifier
if self.verbose > 0:
print("Fitting %d classifiers..." % (len(self.classifiers)))
final_cv = check_cv(self.cv, y, classifier=self.stratify)
if isinstance(self.cv, int):
# Override shuffle parameter in case of self generated
# cross-validation strategy
final_cv.shuffle = self.shuffle
final_cv.random_state = self.random_state
# Disable global input validation, because it causes issues when
# pipelines are used that perform preprocessing on X. I.e., X may
# not be directly passed to the classifiers, which is why this code
# would raise unnecessary errors at this point.
# X, y = check_X_y(X, y, accept_sparse=['csc', 'csr'], dtype=None)
if sample_weight is None:
fit_params = None
else:
fit_params = dict(sample_weight=sample_weight)
meta_features = None
for n, model in enumerate(self.clfs_):
if self.verbose > 0:
i = n + 1  # enumerate index; list.index() misnumbers duplicates
print("Fitting classifier%d: %s (%d/%d)" %
(i, _name_estimators((model,))[0][0],
i, len(self.clfs_)))
if self.verbose > 2:
if hasattr(model, 'verbose'):
model.set_params(verbose=self.verbose - 2)
if self.verbose > 1:
print(_name_estimators((model,))[0][1])
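# Out-of-fold predictions: each training sample's meta-feature comes
# from a model fitted on folds that exclude that sample, so the
# meta-classifier never trains on leaked first-level predictions.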
prediction = cross_val_predict(
model, X, y, groups=groups, cv=final_cv,
n_jobs=self.n_jobs, fit_params=fit_params,
verbose=self.verbose, pre_dispatch=self.pre_dispatch,
method='predict_proba' if self.use_probas else 'predict')
if not self.use_probas:
prediction = prediction[:, np.newaxis]
elif self.drop_proba_col == 'last':
prediction = prediction[:, :-1]
elif self.drop_proba_col == 'first':
prediction = prediction[:, 1:]
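# Accumulate predictions column-wise: one column per classifier for
# class labels, or one column per (kept) class per classifier when
# `use_probas=True`.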
if meta_features is None:
meta_features = prediction
else:
meta_features = np.column_stack((meta_features, prediction))
if self.store_train_meta_features:
self.train_meta_features_ = meta_features
# Fit the base models correctly this time using ALL the training set
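# (the fits made inside cross_val_predict are discarded; they only
# served to generate the leakage-free meta-features above)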
for model in self.clfs_:
if sample_weight is None:
model.fit(X, y)
else:
model.fit(X, y, sample_weight=sample_weight)
# Fit the secondary model
if self.use_features_in_secondary:
meta_features = self._stack_first_level_features(
X,
meta_features
)
if sample_weight is None:
self.meta_clf_.fit(meta_features, y)
else:
self.meta_clf_.fit(meta_features, y,
sample_weight=sample_weight)
return self
def get_params(self, deep=True):
"""Return estimator parameter names for GridSearch support."""
return self._get_params('named_classifiers', deep=deep)
def set_params(self, **params):
"""Set the parameters of this estimator.
Valid parameter keys can be listed with ``get_params()``.
Returns
-------
self
"""
self._set_params('classifiers', 'named_classifiers', **params)
return self
def predict_meta_features(self, X):
""" Get meta-features of test-data.
Parameters
----------
X : numpy array, shape = [n_samples, n_features]
Test vectors, where n_samples is the number of samples and
n_features is the number of features.
Returns
-------
meta-features : numpy array, shape = [n_samples, n_classifiers]
Returns the meta-features for test data. If `use_probas=True`,
the shape is [n_samples, n_classifiers * n_classes], minus one
column per classifier if `drop_proba_col` is set.
"""
check_is_fitted(self, ['clfs_', 'meta_clf_'])
per_model_preds = []
for model in self.clfs_:
if not self.use_probas:
prediction = model.predict(X)[:, np.newaxis]
else:
if self.drop_proba_col == 'last':
prediction = model.predict_proba(X)[:, :-1]
elif self.drop_proba_col == 'first':
prediction = model.predict_proba(X)[:, 1:]
else:
prediction = model.predict_proba(X)
per_model_preds.append(prediction)
return np.hstack(per_model_preds)
def _stack_first_level_features(self, X, meta_features):
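# Keep sparse inputs sparse: use scipy.sparse.hstack for sparse X
# and np.hstack otherwise.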
if sparse.issparse(X):
stack_fn = sparse.hstack
else:
stack_fn = np.hstack
return stack_fn((X, meta_features))
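

if __name__ == '__main__':
    # Minimal smoke-test sketch, not part of the library API. Assumes
    # scikit-learn is installed and uses the iris dataset purely for
    # illustration. Run with
    # `python -m mlxtend.classifier.stacking_cv_classification`
    # so that the relative imports above resolve.
    from sklearn.datasets import load_iris
    from sklearn.linear_model import LogisticRegression
    from sklearn.naive_bayes import GaussianNB
    from sklearn.neighbors import KNeighborsClassifier

    X, y = load_iris(return_X_y=True)
    sclf = StackingCVClassifier(
        classifiers=[KNeighborsClassifier(), GaussianNB()],
        meta_classifier=LogisticRegression(),
        use_probas=True, cv=3, random_state=0)
    sclf.fit(X, y)
    # With use_probas=True and no dropped column, meta-features have
    # n_classes columns per classifier: 2 classifiers * 3 classes = 6.
    print(sclf.predict_meta_features(X).shape)  # (150, 6)
    print(sclf.predict(X)[:5])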