# 306 lines, 10 KiB, Python
"""Common tests for metaestimators"""
|
|
import functools
|
|
from inspect import signature
|
|
|
|
import numpy as np
|
|
import pytest
|
|
|
|
from sklearn.base import BaseEstimator
|
|
from sklearn.base import is_regressor
|
|
from sklearn.datasets import make_classification
|
|
from sklearn.utils import all_estimators
|
|
from sklearn.utils.estimator_checks import _enforce_estimator_tags_X
|
|
from sklearn.utils.estimator_checks import _enforce_estimator_tags_y
|
|
from sklearn.utils.validation import check_is_fitted
|
|
from sklearn.utils._testing import set_random_state
|
|
from sklearn.pipeline import Pipeline, make_pipeline
|
|
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
|
|
from sklearn.feature_extraction.text import TfidfVectorizer
|
|
from sklearn.feature_selection import RFE, RFECV
|
|
from sklearn.ensemble import BaggingClassifier
|
|
from sklearn.exceptions import NotFittedError
|
|
from sklearn.semi_supervised import SelfTrainingClassifier
|
|
from sklearn.linear_model import Ridge, LogisticRegression
|
|
from sklearn.preprocessing import StandardScaler, MaxAbsScaler
|
|
|
|
|
|
class DelegatorData:
    """Describe one meta-estimator exercised by the delegation test.

    Parameters
    ----------
    name : str
        Label identifying the meta-estimator in assertion messages.
    construct : callable
        Called with a sub-estimator; returns the meta-estimator wrapping it.
    skip_methods : sequence of str, default=()
        Names of methods that the delegation test does not check for this
        meta-estimator.
    fit_args : tuple, default=make_classification(random_state=0)
        Positional arguments unpacked into ``fit``. The default is evaluated
        once, when the class body is executed, so every instance relying on
        it shares the same objects.
    """

    def __init__(
        self,
        name,
        construct,
        skip_methods=(),
        fit_args=make_classification(random_state=0),
    ):
        # Plain record: the arguments are stored as given, without validation.
        self.name, self.construct = name, construct
        self.skip_methods = skip_methods
        self.fit_args = fit_args
|
|
|
|
|
|
# Meta-estimators covered by test_metaestimator_delegation. Each entry pairs a
# display name with a factory that wraps a sub-estimator, plus the names of
# the methods that the test leaves unchecked for that meta-estimator.
DELEGATING_METAESTIMATORS = [
    DelegatorData(
        name="Pipeline",
        construct=lambda sub_est: Pipeline([("est", sub_est)]),
    ),
    DelegatorData(
        name="GridSearchCV",
        construct=lambda sub_est: GridSearchCV(
            sub_est, param_grid={"param": [5]}, cv=2
        ),
        skip_methods=["score"],
    ),
    DelegatorData(
        name="RandomizedSearchCV",
        construct=lambda sub_est: RandomizedSearchCV(
            sub_est, param_distributions={"param": [5]}, cv=2, n_iter=1
        ),
        skip_methods=["score"],
    ),
    # The classes below take the sub-estimator as their first positional
    # argument, so the class itself serves as the factory.
    DelegatorData(
        name="RFE",
        construct=RFE,
        skip_methods=["transform", "inverse_transform"],
    ),
    DelegatorData(
        name="RFECV",
        construct=RFECV,
        skip_methods=["transform", "inverse_transform"],
    ),
    DelegatorData(
        name="BaggingClassifier",
        construct=BaggingClassifier,
        skip_methods=[
            "transform",
            "inverse_transform",
            "score",
            "predict_proba",
            "predict_log_proba",
            "predict",
        ],
    ),
    DelegatorData(
        name="SelfTrainingClassifier",
        construct=SelfTrainingClassifier,
        skip_methods=["transform", "inverse_transform", "predict_proba"],
    ),
]
|
|
|
|
|
|
def test_metaestimator_delegation():
    """Check that meta-estimators expose a method iff their sub-estimator does.

    For every entry of ``DELEGATING_METAESTIMATORS`` and every public,
    non-``fit`` method of the dummy sub-estimator that is not listed in the
    entry's ``skip_methods``, the test verifies that:

    * the meta-estimator has the method when the sub-estimator has it;
    * calling it before ``fit`` raises ``NotFittedError``;
    * calling it after ``fit`` runs without error;
    * the meta-estimator lacks the method when the sub-estimator hides it.
    """

    def hides(method):
        # Expose ``method`` through a property so that the attribute lookup
        # itself raises AttributeError on instances whose ``hidden_method``
        # names it; ``hasattr`` then reports the method as missing.
        def getter(obj):
            if obj.hidden_method == method.__name__:
                raise AttributeError("%r is hidden" % obj.hidden_method)
            return functools.partial(method, obj)

        return property(getter)

    class SubEstimator(BaseEstimator):
        # Dummy estimator: ``fit`` only sets fitted attributes, and every
        # other public method checks fitted state before returning a
        # placeholder value.
        def __init__(self, param=1, hidden_method=None):
            self.param = param
            self.hidden_method = hidden_method

        def fit(self, X, y=None, *args, **kwargs):
            self.coef_ = np.arange(X.shape[1])
            self.classes_ = []
            return True

        @hides
        def inverse_transform(self, X, *args, **kwargs):
            check_is_fitted(self)
            return X

        @hides
        def transform(self, X, *args, **kwargs):
            check_is_fitted(self)
            return X

        @hides
        def predict(self, X, *args, **kwargs):
            check_is_fitted(self)
            return np.ones(X.shape[0])

        @hides
        def predict_proba(self, X, *args, **kwargs):
            check_is_fitted(self)
            return np.ones(X.shape[0])

        @hides
        def predict_log_proba(self, X, *args, **kwargs):
            check_is_fitted(self)
            return np.ones(X.shape[0])

        @hides
        def decision_function(self, X, *args, **kwargs):
            check_is_fitted(self)
            return np.ones(X.shape[0])

        @hides
        def score(self, X, y, *args, **kwargs):
            check_is_fitted(self)
            return 1.0

    def call_method(estimator, method_name, fit_args):
        # ``score`` also needs the targets; every other method takes X only.
        bound = getattr(estimator, method_name)
        if method_name == "score":
            return bound(fit_args[0], fit_args[1])
        return bound(fit_args[0])

    # Public methods declared on SubEstimator itself, except fit-like ones.
    public_methods = sorted(
        attr for attr in vars(SubEstimator) if not attr.startswith(("_", "fit"))
    )

    for spec in DELEGATING_METAESTIMATORS:
        checked = [name for name in public_methods if name not in spec.skip_methods]

        sub_estimator = SubEstimator()
        meta_estimator = spec.construct(sub_estimator)
        for method in checked:
            assert hasattr(sub_estimator, method)
            assert hasattr(meta_estimator, method), (
                "%s does not have method %r when its delegate does"
                % (spec.name, method)
            )
            # delegation before fit raises a NotFittedError
            with pytest.raises(NotFittedError):
                call_method(meta_estimator, method, spec.fit_args)

        meta_estimator.fit(*spec.fit_args)
        for method in checked:
            # smoke test delegation
            call_method(meta_estimator, method, spec.fit_args)

        for method in checked:
            # A fresh pair whose sub-estimator hides exactly this method.
            hiding_estimator = SubEstimator(hidden_method=method)
            wrapping_estimator = spec.construct(hiding_estimator)
            assert not hasattr(hiding_estimator, method)
            assert not hasattr(wrapping_estimator, method), (
                "%s has method %r when its delegate does not"
                % (spec.name, method)
            )
|
|
|
|
|
|
def _generate_meta_estimator_instances_with_pipeline():
    """Yield meta-estimator instances built around text pipelines.

    Every class returned by ``all_estimators()`` is visited in sorted order
    and instantiated according to the parameters found in its constructor
    signature:

    * "estimator", "base_estimator" or "regressor": a single pipeline is
      passed as first argument; classes that also accept "param_grid" or
      "param_distributions" additionally receive a parameter grid (and
      ``n_iter=2`` when they accept "n_iter");
    * "transformer_list": a list of two named transformer pipelines;
    * "estimators": a list of two named predictor pipelines.

    Classes matching none of these are skipped. All pipelines start with a
    ``TfidfVectorizer``.
    """
    single_estimator_params = {"estimator", "base_estimator", "regressor"}

    for _, Estimator in sorted(all_estimators()):
        params = set(signature(Estimator).parameters)

        if params & single_estimator_params:
            if is_regressor(Estimator):
                inner = make_pipeline(TfidfVectorizer(), Ridge())
                grid = {"ridge__alpha": [0.1, 1.0]}
            else:
                inner = make_pipeline(TfidfVectorizer(), LogisticRegression())
                grid = {"logisticregression__C": [0.1, 1.0]}

            if "param_grid" not in params and "param_distributions" not in params:
                yield Estimator(inner)
            else:
                # SearchCV estimators
                search_kwargs = {}
                if "n_iter" in params:
                    search_kwargs["n_iter"] = 2
                yield Estimator(inner, grid, **search_kwargs)

        elif "transformer_list" in params:
            # FeatureUnion
            scalers = [
                ("trans1", MaxAbsScaler()),
                ("trans2", StandardScaler(with_mean=False)),
            ]
            yield Estimator(
                [
                    (label, make_pipeline(TfidfVectorizer(), scaler))
                    for label, scaler in scalers
                ]
            )

        elif "estimators" in params:
            # stacking, voting
            if is_regressor(Estimator):
                predictors = [
                    ("est1", Ridge(alpha=0.1)),
                    ("est2", Ridge(alpha=1)),
                ]
            else:
                predictors = [
                    ("est1", LogisticRegression(C=0.1)),
                    ("est2", LogisticRegression(C=1)),
                ]
            yield Estimator(
                [
                    (label, make_pipeline(TfidfVectorizer(), predictor))
                    for label, predictor in predictors
                ]
            )
|
|
|
|
|
|
# TODO: remove data validation for the following estimators
# Meta-estimators should accept any data and leave its validation to their
# inner estimator(s). The classes named here are excluded from the data
# validation delegation test below.
DATA_VALIDATION_META_ESTIMATORS_TO_IGNORE = [
    "AdaBoostClassifier",
    "AdaBoostRegressor",
    "BaggingClassifier",
    "BaggingRegressor",
    "ClassifierChain",  # data validation is necessary
    "IterativeImputer",
    "OneVsOneClassifier",  # input validation can't be avoided
    "RANSACRegressor",
    "RFE",
    "RFECV",
    "RegressorChain",  # data validation is necessary
    "SelfTrainingClassifier",
    "SequentialFeatureSelector",  # not applicable (2D data mandatory)
]

# Generated instances whose class name is not in the ignore list; built once,
# when the module is imported.
DATA_VALIDATION_META_ESTIMATORS = [
    inst
    for inst in _generate_meta_estimator_instances_with_pipeline()
    if inst.__class__.__name__ not in DATA_VALIDATION_META_ESTIMATORS_TO_IGNORE
]
|
|
|
|
|
|
def _get_meta_estimator_id(estimator):
|
|
return estimator.__class__.__name__
|
|
|
|
|
|
@pytest.mark.parametrize(
    "estimator", DATA_VALIDATION_META_ESTIMATORS, ids=_get_meta_estimator_id
)
def test_meta_estimators_delegate_data_validation(estimator):
    """Check that meta-estimators leave data validation to inner estimators.

    The meta-estimator is fitted on a list of short strings, which only the
    inner pipeline can interpret. ``fit`` must succeed and must not set
    ``n_features_in_``.
    """
    n_samples = 30
    rng = np.random.RandomState(0)
    set_random_state(estimator)

    # Draw X before y so that the random stream is consumed in a fixed order.
    vocabulary = np.array(["aa", "bb", "cc"], dtype=object)
    X = rng.choice(vocabulary, size=n_samples)
    y = (
        rng.normal(size=n_samples)
        if is_regressor(estimator)
        else rng.randint(3, size=n_samples)
    )

    # We convert to lists to make sure it works on array-like
    X = _enforce_estimator_tags_X(estimator, X).tolist()
    y = _enforce_estimator_tags_y(estimator, y).tolist()

    # fit should not raise any data validation exception: X is a valid input
    # for the first step of the pipeline wrapped by the meta-estimator.
    estimator.fit(X, y)

    # n_features_in_ should not be defined since data is not tabular data.
    assert not hasattr(estimator, "n_features_in_")
|