# Out-of-fold stacking regressor
#
# For explanation of approach, see:
# dnc1994.com/2016/05/rank-10-percent-in-first-kaggle-competition-en/#Stacking
#
# Sebastian Raschka 2014-2020
# mlxtend Machine Learning Library Extensions
#
# An ensemble-learning meta-regressor for out-of-fold stacking regression
# Authors:
# Eike Dehling <e.e.dehling@gmail.com>
# Sebastian Raschka <https://sebastianraschka.com>
#
# License: BSD 3 clause
from ..externals.estimator_checks import check_is_fitted
from ..externals.name_estimators import _name_estimators
from ..utils.base_compostion import _BaseXComposition
from scipy import sparse
from sklearn.base import RegressorMixin
from sklearn.base import TransformerMixin
from sklearn.base import clone
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection._split import check_cv
from sklearn.utils import check_X_y
import numpy as np


class StackingCVRegressor(_BaseXComposition, RegressorMixin, TransformerMixin):
"""A 'Stacking Cross-Validation' regressor for scikit-learn estimators.
New in mlxtend v0.7.0
Parameters
----------
regressors : array-like, shape = [n_regressors]
A list of regressors.
Invoking the `fit` method on the `StackingCVRegressor` will fit clones
of these original regressors that will
be stored in the class attribute `self.regr_`.
    meta_regressor : object
        The meta-regressor to be fitted on the ensemble of
        regressors.
cv : int, cross-validation generator or iterable, optional (default: 5)
Determines the cross-validation splitting strategy.
Possible inputs for cv are:
- None, to use the default 5-fold cross validation,
- integer, to specify the number of folds in a `KFold`,
- An object to be used as a cross-validation generator.
- An iterable yielding train, test splits.
For integer/None inputs, it will use `KFold` cross-validation
    shuffle : bool (default: True)
        If True, and the `cv` argument is an integer, the training data will
        be shuffled at the fitting stage prior to cross-validation. If the
        `cv` argument is a specific cross-validation technique, this argument
        is ignored.
    random_state : int, RandomState instance or None, optional (default: None)
        Controls the randomness of the cv splitter. Used when `cv` is
        an integer and `shuffle=True`. New in v0.16.0.
verbose : int, optional (default=0)
Controls the verbosity of the building process. New in v0.16.0
    refit : bool (default: True)
        Clones the regressors for stacking regression if True (default)
        or else uses the original ones, which will be refitted on the dataset
        upon calling the `fit` method. Setting refit=False is
        recommended if you are working with estimators that support
        the scikit-learn fit/predict API interface but are not compatible
        with scikit-learn's `clone` function.
use_features_in_secondary : bool (default: False)
If True, the meta-regressor will be trained both on
the predictions of the original regressors and the
original dataset.
If False, the meta-regressor will be trained only on
the predictions of the original regressors.
    store_train_meta_features : bool (default: False)
        If True, the meta-features computed from the training data
        used for fitting the meta-regressor are stored in the
        `self.train_meta_features_` array, which can be accessed
        after calling `fit`.
n_jobs : int or None, optional (default=None)
The number of CPUs to use to do the computation.
``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.
``-1`` means using all processors. See :term:`Glossary <n_jobs>`
for more details. New in v0.16.0.
pre_dispatch : int, or string, optional
Controls the number of jobs that get dispatched during parallel
execution. Reducing this number can be useful to avoid an
explosion of memory consumption when more jobs get dispatched
than CPUs can process. This parameter can be:
- None, in which case all the jobs are immediately
created and spawned. Use this for lightweight and
fast-running jobs, to avoid delays due to on-demand
spawning of the jobs
- An int, giving the exact number of total jobs that are
spawned
- A string, giving an expression as a function of n_jobs,
as in '2*n_jobs'
        New in v0.16.0.

    Attributes
    ----------
    train_meta_features_ : numpy array, shape = [n_samples, n_regressors]
        Meta-features for training data, where n_samples is the
        number of samples in the training data and n_regressors is the
        number of regressors.

    Examples
    --------
    For usage examples, please see
    http://rasbt.github.io/mlxtend/user_guide/regressor/StackingCVRegressor/

    """

def __init__(self, regressors, meta_regressor, cv=5,
shuffle=True, random_state=None, verbose=0,
refit=True, use_features_in_secondary=False,
store_train_meta_features=False, n_jobs=None,
pre_dispatch='2*n_jobs'):
self.regressors = regressors
self.meta_regressor = meta_regressor
self.cv = cv
self.shuffle = shuffle
self.random_state = random_state
self.verbose = verbose
self.refit = refit
self.use_features_in_secondary = use_features_in_secondary
self.store_train_meta_features = store_train_meta_features
self.n_jobs = n_jobs
        self.pre_dispatch = pre_dispatch

    def fit(self, X, y, groups=None, sample_weight=None):
""" Fit ensemble regressors and the meta-regressor.
Parameters
----------
X : numpy array, shape = [n_samples, n_features]
Training vectors, where n_samples is the number of samples and
n_features is the number of features.
y : numpy array, shape = [n_samples]
Target values.
groups : numpy array/None, shape = [n_samples]
The group that each sample belongs to. This is used by specific
folding strategies such as GroupKFold()
        sample_weight : array-like, shape = [n_samples], optional
            Sample weights passed as `sample_weight` to each regressor
            in the regressors list as well as to the meta_regressor.
            Raises an error if some regressor does not support
            sample_weight in the fit() method.

        Returns
        -------
        self : object
        """
if self.refit:
self.regr_ = [clone(clf) for clf in self.regressors]
self.meta_regr_ = clone(self.meta_regressor)
else:
self.regr_ = self.regressors
self.meta_regr_ = self.meta_regressor
X, y = check_X_y(X, y, accept_sparse=['csc', 'csr'], dtype=None)
kfold = check_cv(self.cv, y)
        if isinstance(self.cv, int):
            # Override the shuffle parameter only when the cross-validation
            # splitter was generated internally from an integer `cv`.
            kfold.shuffle = self.shuffle
            kfold.random_state = self.random_state
        #
        # The meta_features are a collection of the prediction data,
        # with shape [n_samples, len(self.regressors)]. Each column
        # corresponds to the result of `cross_val_predict` for one of
        # the base regressors.
        # The advantage of this approach is that the data points being
        # predicted have not been used to train the base regressors, so
        # the meta-regressor is less susceptible to overfitting.
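        # For example, with 3 base regressors and 100 training samples,
        # meta_features has shape (100, 3): one out-of-fold prediction
        # column per base regressor.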
if sample_weight is None:
fit_params = None
else:
fit_params = dict(sample_weight=sample_weight)
meta_features = np.column_stack([cross_val_predict(
regr, X, y, groups=groups, cv=kfold,
verbose=self.verbose, n_jobs=self.n_jobs,
fit_params=fit_params, pre_dispatch=self.pre_dispatch)
for regr in self.regr_])
# save meta-features for training data
if self.store_train_meta_features:
self.train_meta_features_ = meta_features
# Train meta-model on the out-of-fold predictions
if not self.use_features_in_secondary:
pass
elif sparse.issparse(X):
meta_features = sparse.hstack((X, meta_features))
else:
meta_features = np.hstack((X, meta_features))
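        # When use_features_in_secondary=True, the meta-regressor is trained on
        # the original features stacked with the out-of-fold predictions,
        # i.e. an array of shape (n_samples, n_features + n_regressors).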
if sample_weight is None:
self.meta_regr_.fit(meta_features, y)
else:
self.meta_regr_.fit(meta_features, y, sample_weight=sample_weight)
# Retrain base models on all data
for regr in self.regr_:
if sample_weight is None:
regr.fit(X, y)
else:
regr.fit(X, y, sample_weight=sample_weight)
        return self

    def predict(self, X):
""" Predict target values for X.
Parameters
----------
        X : {array-like, sparse matrix}, shape = [n_samples, n_features]
            Samples to predict, where n_samples is the number of samples and
            n_features is the number of features.

        Returns
        -------
        y_target : array-like, shape = [n_samples] or [n_samples, n_targets]
            Predicted target values.
"""
#
# First we make predictions with the base-models then we predict with
# the meta-model from that info.
#
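        # meta_features has shape (n_samples, n_regressors): one prediction
        # column per base regressor, each now refitted on the full training set.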
check_is_fitted(self, 'regr_')
meta_features = np.column_stack([
regr.predict(X) for regr in self.regr_
])
if not self.use_features_in_secondary:
return self.meta_regr_.predict(meta_features)
elif sparse.issparse(X):
return self.meta_regr_.predict(sparse.hstack((X, meta_features)))
else:
            return self.meta_regr_.predict(np.hstack((X, meta_features)))

    def predict_meta_features(self, X):
""" Get meta-features of test-data.
Parameters
----------
X : numpy array, shape = [n_samples, n_features]
Test vectors, where n_samples is the number of samples and
n_features is the number of features.
Returns
-------
meta-features : numpy array, shape = [n_samples, len(self.regressors)]
meta-features for test data, where n_samples is the number of
samples in test data and len(self.regressors) is the number
of regressors.
"""
check_is_fitted(self, 'regr_')
        return np.column_stack([regr.predict(X) for regr in self.regr_])

    @property
    def named_regressors(self):
"""
Returns
-------
        List of named estimator tuples, like [('svr', SVR(...))]
"""
        return _name_estimators(self.regressors)

    def get_params(self, deep=True):
#
# Return estimator parameter names for GridSearch support.
#
        return self._get_params('named_regressors', deep=deep)

    def set_params(self, **params):
"""Set the parameters of this estimator.
Valid parameter keys can be listed with ``get_params()``.
Returns
-------
self
"""
self._set_params('regressors', 'named_regressors', **params)
return self
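

if __name__ == "__main__":
    # Illustrative usage sketch, not part of the original module: a minimal
    # example of how StackingCVRegressor is typically combined with
    # scikit-learn estimators. The estimators, hyperparameters, and synthetic
    # data below are arbitrary choices for demonstration only. In practice,
    # import the class via `from mlxtend.regressor import StackingCVRegressor`
    # rather than running this file directly (the relative imports above
    # require the mlxtend package context).
    from sklearn.datasets import make_regression
    from sklearn.ensemble import RandomForestRegressor
    from sklearn.linear_model import Lasso, Ridge

    X_demo, y_demo = make_regression(n_samples=200, n_features=10,
                                     noise=0.1, random_state=42)

    stack = StackingCVRegressor(
        regressors=[Lasso(alpha=0.1), RandomForestRegressor(random_state=1)],
        meta_regressor=Ridge(),
        cv=5,
        shuffle=True,
        random_state=42,
        store_train_meta_features=True)

    stack.fit(X_demo, y_demo)

    # Out-of-fold meta-features collected during fitting: one column per
    # base regressor, so the expected shape here is (200, 2).
    print(stack.train_meta_features_.shape)
    print(stack.predict(X_demo[:5]))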