# 2024-05-26 19:49:15 +02:00
# Author: Gael Varoquaux
# License: BSD 3 clause
import pickle
import re
import warnings

import numpy as np
import pytest
import scipy.sparse as sp
from numpy.testing import assert_allclose

import sklearn
from sklearn import config_context, datasets
from sklearn.base import (
    BaseEstimator,
    OutlierMixin,
    TransformerMixin,
    clone,
    is_classifier,
)
from sklearn.decomposition import PCA
from sklearn.exceptions import InconsistentVersionWarning
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.utils._mocking import MockDataFrame
from sklearn.utils._set_output import _get_output_config
from sklearn.utils._testing import (
    _convert_container,
    assert_array_equal,
    assert_no_warnings,
)
# A few test classes
class MyEstimator(BaseEstimator):
    """Minimal estimator with two constructor parameters, used as a fixture."""

    def __init__(self, l1=0, empty=None):
        # Parameters are stored unchanged under their own names.
        self.empty = empty
        self.l1 = l1
class K(BaseEstimator):
    """Fixture estimator with parameters ``c`` and ``d``."""

    def __init__(self, c=None, d=None):
        # Parameters are stored unchanged under their own names.
        self.d = d
        self.c = c
class T(BaseEstimator):
    """Fixture estimator with parameters ``a`` and ``b``."""

    def __init__(self, a=None, b=None):
        # Parameters are stored unchanged under their own names.
        self.b = b
        self.a = a
class NaNTag(BaseEstimator):
    """Fixture estimator whose extra tags set ``allow_nan`` to True."""

    def _more_tags(self):
        extra_tags = {"allow_nan": True}
        return extra_tags
class NoNaNTag(BaseEstimator):
    """Fixture estimator whose extra tags set ``allow_nan`` to False."""

    def _more_tags(self):
        extra_tags = {"allow_nan": False}
        return extra_tags
class OverrideTag(NaNTag):
    """Subclass of ``NaNTag`` that redefines ``allow_nan`` as False."""

    def _more_tags(self):
        extra_tags = {"allow_nan": False}
        return extra_tags
class DiamondOverwriteTag(NaNTag, NoNaNTag):
    """Inherits from both tag fixtures and contributes no extra tags itself."""

    def _more_tags(self):
        # Empty on purpose: the tag values come from the parent classes.
        return {}
class InheritDiamondOverwriteTag(DiamondOverwriteTag):
    """Plain subclass of ``DiamondOverwriteTag`` that adds nothing of its own."""

    # The class statement had no body, which is a syntax error.
    pass
class ModifyInitParams(BaseEstimator):
    """Deprecated behavior.

    Equal parameters but with a type cast.
    Doesn't fulfill a is a
    """

    # The array default and the copy in ``__init__`` are deliberate: the stored
    # attribute is equal to, but not the same object as, the constructor
    # argument. ``test_clone_buggy`` expects ``clone`` to reject this.
    def __init__(self, a=np.array([0])):
        self.a = a.copy()
class Buggy(BaseEstimator):
    """A buggy estimator that does not set its parameters right."""

    def __init__(self, a=None):
        # Deliberately ignores the ``a`` argument and stores a constant.
        self.a = 1
class NoEstimator:
    """Object with ``fit``/``predict`` that does not derive from BaseEstimator."""

    def __init__(self):
        # No state to initialise; the missing body was a syntax error.
        pass

    def fit(self, X=None, y=None):
        """Ignore the inputs and return the instance itself."""
        return self

    def predict(self, X=None):
        """Ignore the input and return ``None``."""
        return None
class VargEstimator(BaseEstimator):
    """scikit-learn estimators shouldn't have vargs."""

    def __init__(self, *vargs):
        # Nothing is stored; only the ``*vargs`` signature matters to the tests.
        pass
# The tests
def test_clone():
    """Check that ``clone`` returns a new estimator with equal parameters.

    An estimator is built, cloned, and the clone is checked to be a distinct
    object whose ``get_params()`` equals the original's.
    """
    from sklearn.feature_selection import SelectFpr, f_classif

    original = SelectFpr(f_classif, alpha=0.1)
    duplicate = clone(original)
    assert original is not duplicate
    assert original.get_params() == duplicate.get_params()

    # With an array-valued parameter only object identity is checked.
    original = SelectFpr(f_classif, alpha=np.zeros((10, 2)))
    duplicate = clone(original)
    assert original is not duplicate
def test_clone_2():
    """Check that ``clone`` doesn't copy everything.

    An attribute added by hand after construction must be absent from the
    clone.
    """
    from sklearn.feature_selection import SelectFpr, f_classif

    original = SelectFpr(f_classif, alpha=0.1)
    original.own_attribute = "test"
    duplicate = clone(original)
    assert not hasattr(duplicate, "own_attribute")
def test_clone_buggy():
    """Check that ``clone`` raises an error on buggy estimators."""
    # Each ``pytest.raises`` block had lost its body; the ``clone`` call on the
    # object prepared just above it is restored.
    buggy = Buggy()
    buggy.a = 2
    with pytest.raises(RuntimeError):
        clone(buggy)

    no_estimator = NoEstimator()
    with pytest.raises(TypeError):
        clone(no_estimator)

    varg_est = VargEstimator()
    with pytest.raises(RuntimeError):
        clone(varg_est)

    est = ModifyInitParams()
    with pytest.raises(RuntimeError):
        clone(est)
def test_clone_empty_array():
    """Regression test for cloning estimators with empty arrays."""
    source = MyEstimator(empty=np.array([]))
    copy_ = clone(source)
    assert_array_equal(source.empty, copy_.empty)

    # Same check with a sparse parameter, comparing the stored data.
    source = MyEstimator(empty=sp.csr_matrix(np.array([[0]])))
    copy_ = clone(source)
    assert_array_equal(source.empty.data, copy_.empty.data)
def test_clone_nan():
    """Regression test for cloning estimators with a ``np.nan`` parameter."""
    source = MyEstimator(empty=np.nan)
    copy_ = clone(source)

    # The clone must hold the very same object, not merely an equal one.
    assert source.empty is copy_.empty
def test_clone_dict():
    """Check that ``clone`` applied to a dict clones the estimators in it."""
    estimators = {"a": MyEstimator()}
    duplicated = clone(estimators)
    assert estimators["a"] is not duplicated["a"]
def test_clone_sparse_matrices():
    """Check that cloning preserves sparse-matrix parameters and their class."""
    # The comprehension had lost its element expression and closing bracket.
    # Collect every ``scipy.sparse`` attribute named ``*_matrix`` that is a
    # plain class.
    sparse_matrix_classes = [
        cls
        for name in dir(sp)
        if name.endswith("_matrix") and type(cls := getattr(sp, name)) is type
    ]

    for cls in sparse_matrix_classes:
        sparse_matrix = cls(np.eye(5))
        clf = MyEstimator(empty=sparse_matrix)
        clf_cloned = clone(clf)
        assert clf.empty.__class__ is clf_cloned.empty.__class__
        assert_array_equal(clf.empty.toarray(), clf_cloned.empty.toarray())
def test_clone_estimator_types():
    """Check that ``clone`` works for parameters that are types, not instances."""
    source = MyEstimator(empty=MyEstimator)
    copy_ = clone(source)

    # The class object itself must be carried over by identity.
    assert source.empty is copy_.empty
def test_clone_class_rather_than_instance():
    """Check the error raised when cloning a class rather than an instance."""
    msg = "You should provide an instance of scikit-learn estimator"
    with pytest.raises(TypeError, match=msg):
        # The body was missing; pass the class object itself to ``clone``.
        clone(MyEstimator)
def test_repr():
    """Smoke test the repr of the base estimator."""
    my_estimator = MyEstimator()
    # The smoke call was missing, leaving ``my_estimator`` unused.
    repr(my_estimator)

    test = T(K(), K())
    assert repr(test) == "T(a=K(), b=K())"

    # A very long parameter value yields a repr of fixed, bounded length.
    some_est = T(a=["long_params"] * 1000)
    assert len(repr(some_est)) == 485
def test_str():
    """Smoke test the str of the base estimator."""
    my_estimator = MyEstimator()
    # The smoke call was missing, leaving ``my_estimator`` unused.
    str(my_estimator)
def test_get_params():
    """Check deep/shallow ``get_params`` and nested ``set_params``."""
    # ``b`` is given the class ``K`` itself, not an instance.
    test = T(K(), K)

    assert "a__d" in test.get_params(deep=True)
    assert "a__d" not in test.get_params(deep=False)

    # Restored: without this call ``test.a.d`` keeps its default of ``None``.
    test.set_params(a__d=2)
    assert test.a.d == 2

    # Restored: ``K`` has no parameter ``a``, so this must be rejected.
    with pytest.raises(ValueError):
        test.set_params(a__a=2)
def test_is_classifier():
    """Check ``is_classifier`` on an SVC, alone and wrapped in meta-estimators."""
    svc = SVC()
    grid = {"C": [0.1, 1]}

    candidates = (
        svc,
        GridSearchCV(svc, grid),
        Pipeline([("svc", svc)]),
        Pipeline([("svc_cv", GridSearchCV(svc, grid))]),
    )
    for candidate in candidates:
        assert is_classifier(candidate)
def test_set_params():
    """Check that nested ``set_params`` rejects unknown parameters."""
    clf = Pipeline([("svc", SVC())])

    # non-existing parameter in svc
    with pytest.raises(ValueError):
        clf.set_params(svc__stupid_param=True)

    # non-existing parameter of pipeline ("svm" is not a step name)
    with pytest.raises(ValueError):
        clf.set_params(svm__stupid_param=True)

    # we don't currently catch if the things in pipeline are estimators
    # bad_pipeline = Pipeline([("bad", NoEstimator())])
    # assert_raises(AttributeError, bad_pipeline.set_params,
    #               bad__stupid_param=True)
def test_set_params_passes_all_parameters():
    """Check that all nested parameters reach ``set_params`` in a single call.

    Regression test for #9944.
    """

    class TestDecisionTree(DecisionTreeClassifier):
        def set_params(self, **kwargs):
            # Delegate so the override still applies the parameters.
            super().set_params(**kwargs)
            # expected_kwargs is in test scope
            assert kwargs == expected_kwargs
            return self

    expected_kwargs = {"max_depth": 5, "min_samples_leaf": 2}
    # The list literal had lost its closing ``]:``.
    for est in [
        Pipeline([("estimator", TestDecisionTree())]),
        GridSearchCV(TestDecisionTree(), {}),
    ]:
        est.set_params(estimator__max_depth=5, estimator__min_samples_leaf=2)
def test_set_params_updates_valid_params():
    """Check that ``set_params`` sets ``C`` on the newly supplied SVC.

    The estimator and its nested parameter are given in the same call, so
    ``C`` must go to ``SVC()`` and not to the replaced
    ``DecisionTreeClassifier()``.
    """
    search = GridSearchCV(DecisionTreeClassifier(), {})
    search.set_params(estimator=SVC(), estimator__C=42.0)
    assert search.estimator.C == 42.0
# Only the two estimator lines of this decorator had survived; the decorator
# and a matching dataset per estimator are reconstructed.
@pytest.mark.parametrize(
    "tree,dataset",
    [
        (
            DecisionTreeClassifier(max_depth=2, random_state=0),
            datasets.make_classification(random_state=0),
        ),
        (
            DecisionTreeRegressor(max_depth=2, random_state=0),
            datasets.make_regression(random_state=0),
        ),
    ],
)
def test_score_sample_weight(tree, dataset):
    """Check that ``score`` differs with and without sample weights."""
    rng = np.random.RandomState(0)
    X, y = dataset

    tree.fit(X, y)
    # generate random sample weights
    sample_weight = rng.randint(1, 10, size=len(y))
    score_unweighted = tree.score(X, y)
    score_weighted = tree.score(X, y, sample_weight=sample_weight)
    msg = "Unweighted and weighted scores are unexpectedly equal"
    assert score_unweighted != score_weighted, msg
def test_clone_pandas_dataframe():
    """Check that an estimator holding a dataframe-like parameter can be cloned."""

    class DummyEstimator(TransformerMixin, BaseEstimator):
        """This is a dummy class for generating numerical features

        This feature extractor extracts numerical features from pandas data
        frame.

        Parameters
        ----------
        df: pandas data frame
            The pandas data frame parameter.
        scalar_param : object, default=1
            Stored unchanged on the instance.
        """

        def __init__(self, df=None, scalar_param=1):
            self.df = df
            self.scalar_param = scalar_param

        def fit(self, X, y=None):
            # Never called by this test; the missing body was a syntax error.
            pass

        def transform(self, X):
            # Never called by this test; the missing body was a syntax error.
            pass

    # build and clone estimator
    d = np.arange(10)
    df = MockDataFrame(d)
    e = DummyEstimator(df, scalar_param=1)
    cloned_e = clone(e)

    # the test
    assert (e.df == cloned_e.df).values.all()
    assert e.scalar_param == cloned_e.scalar_param
def test_clone_protocol():
    """Checks that clone works with `__sklearn_clone__` protocol."""

    class FrozenEstimator(BaseEstimator):
        """Wrapper that forwards attribute access to an already fitted estimator."""

        def __init__(self, fitted_estimator):
            self.fitted_estimator = fitted_estimator

        def __getattr__(self, name):
            # Only reached for names not found on the wrapper itself.
            return getattr(self.fitted_estimator, name)

        def __sklearn_clone__(self):
            return self

        def fit(self, *args, **kwargs):
            # Fitting is a no-op; the wrapped estimator is left untouched.
            return self

        def fit_transform(self, *args, **kwargs):
            return self.fitted_estimator.transform(*args, **kwargs)

    X = np.array([[-1, -1], [-2, -1], [-3, -2]])
    pca = PCA().fit(X)
    components = pca.components_

    frozen_pca = FrozenEstimator(pca)
    assert_allclose(frozen_pca.components_, components)

    # Calling PCA methods such as `get_feature_names_out` still works
    assert_array_equal(frozen_pca.get_feature_names_out(), pca.get_feature_names_out())

    # Fitting on a new data does not alter `components_`
    # (the `fit` call itself was missing and is restored)
    X_new = np.asarray([[-1, 2], [3, 4], [1, 2]])
    frozen_pca.fit(X_new)
    assert_allclose(frozen_pca.components_, components)

    # `fit_transform` does not alter state
    # (the `fit_transform` call itself was missing and is restored)
    frozen_pca.fit_transform(X_new)
    assert_allclose(frozen_pca.components_, components)

    # Cloning estimator is a no-op
    clone_frozen_pca = clone(frozen_pca)
    assert clone_frozen_pca is frozen_pca
    assert_allclose(clone_frozen_pca.components_, components)
def test_pickle_version_warning_is_not_raised_with_matching_version():
    """Check that unpickling a tree pickled in this session emits no warning."""
    iris = datasets.load_iris()
    fitted = DecisionTreeClassifier().fit(iris.data, iris.target)

    payload = pickle.dumps(fitted)
    assert b"_sklearn_version" in payload
    restored = assert_no_warnings(pickle.loads, payload)

    # The restored classifier must score exactly like the original.
    assert fitted.score(iris.data, iris.target) == restored.score(
        iris.data, iris.target
    )
class TreeBadVersion(DecisionTreeClassifier):
    """Decision tree whose pickled state carries a bogus version string."""

    def __getstate__(self):
        # Copy the instance dict, forcing ``_sklearn_version`` to "something".
        return {**self.__dict__, "_sklearn_version": "something"}
# Template for the version-mismatch warning text; the pickle tests fill in
# ``estimator``, ``old_version`` and ``current_version`` with ``str.format``.
# The closing parenthesis was missing.
pickle_error_message = (
    "Trying to unpickle estimator {estimator} from "
    "version {old_version} when using version "
    "{current_version}. This might "
    "lead to breaking code or invalid results. "
    "Use at your own risk."
)
def test_pickle_version_warning_is_issued_upon_different_version():
    """Check the warning raised when the pickled version string differs."""
    iris = datasets.load_iris()
    tree = TreeBadVersion().fit(iris.data, iris.target)
    tree_pickle_other = pickle.dumps(tree)
    # The ``format`` arguments were missing; these values agree with the
    # attribute assertions at the end of the test.
    message = pickle_error_message.format(
        estimator="TreeBadVersion",
        old_version="something",
        current_version=sklearn.__version__,
    )
    with pytest.warns(UserWarning, match=message) as warning_record:
        # Restored: unpickling is what triggers the warning.
        pickle.loads(tree_pickle_other)

    message = warning_record.list[0].message
    assert isinstance(message, InconsistentVersionWarning)
    assert message.estimator_name == "TreeBadVersion"
    assert message.original_sklearn_version == "something"
    assert message.current_sklearn_version == sklearn.__version__
class TreeNoVersion(DecisionTreeClassifier):
    """Decision tree whose pickled state is the raw instance ``__dict__``."""

    def __getstate__(self):
        instance_state = self.__dict__
        return instance_state
def test_pickle_version_warning_is_issued_when_no_version_info_in_pickle():
    """Check the warning raised when the pickle carries no version info."""
    iris = datasets.load_iris()
    # TreeNoVersion has no getstate, like pre-0.18
    tree = TreeNoVersion().fit(iris.data, iris.target)

    tree_pickle_noversion = pickle.dumps(tree)
    assert b"_sklearn_version" not in tree_pickle_noversion
    # The ``format`` arguments were missing and are restored.
    message = pickle_error_message.format(
        estimator="TreeNoVersion",
        old_version="pre-0.18",
        current_version=sklearn.__version__,
    )
    # check we got the warning about using pre-0.18 pickle
    with pytest.warns(UserWarning, match=message):
        pickle.loads(tree_pickle_noversion)
def test_pickle_version_no_warning_is_issued_with_non_sklearn_estimator():
    """Check that no warning is emitted for a class outside the sklearn module."""
    iris = datasets.load_iris()
    tree = TreeNoVersion().fit(iris.data, iris.target)
    tree_pickle_noversion = pickle.dumps(tree)

    module_backup = TreeNoVersion.__module__
    try:
        # Pretend the class lives outside scikit-learn while unpickling.
        TreeNoVersion.__module__ = "notsklearn"
        assert_no_warnings(pickle.loads, tree_pickle_noversion)
    finally:
        # Always undo the patch so a failure here cannot leak into other tests.
        TreeNoVersion.__module__ = module_backup
class DontPickleAttributeMixin:
    """Mixin with custom pickling that blanks ``_attribute_not_pickled``."""

    def __getstate__(self):
        """Return a copy of the instance dict with the private attribute set to None."""
        data = self.__dict__.copy()
        data["_attribute_not_pickled"] = None
        return data

    def __setstate__(self, state):
        """Mark ``state`` as restored and load it into the instance."""
        state["_restored"] = True
        # This step was missing: without it nothing is written back to the
        # instance, and neither the pickled attributes nor ``_restored``
        # exist after unpickling.
        self.__dict__.update(state)
class MultiInheritanceEstimator(DontPickleAttributeMixin, BaseEstimator):
    """Estimator that gets its pickling hooks from ``DontPickleAttributeMixin``."""

    def __init__(self, attribute_pickled=5):
        # The private attribute starts as None; the tests overwrite it.
        self._attribute_not_pickled = None
        self.attribute_pickled = attribute_pickled
def test_pickling_when_getstate_is_overwritten_by_mixin():
    """Check a pickle round-trip when a mixin supplies the pickling hooks."""
    original = MultiInheritanceEstimator()
    original._attribute_not_pickled = "this attribute should not be pickled"

    restored = pickle.loads(pickle.dumps(original))

    assert restored.attribute_pickled == 5
    assert restored._attribute_not_pickled is None
    assert restored._restored
def test_pickling_when_getstate_is_overwritten_by_mixin_outside_of_sklearn():
    """Check the mixin's pickling hooks when the class claims another module."""
    estimator = MultiInheritanceEstimator()
    text = "this attribute should not be pickled"
    estimator._attribute_not_pickled = text

    old_mod = type(estimator).__module__
    try:
        # Pretend the class is defined outside scikit-learn.
        type(estimator).__module__ = "notsklearn"

        serialized = estimator.__getstate__()
        assert serialized == {"_attribute_not_pickled": None, "attribute_pickled": 5}

        serialized["attribute_pickled"] = 4
        # Restored: without this call the modified state never reaches the
        # estimator and the two assertions below cannot hold.
        estimator.__setstate__(serialized)
        assert estimator.attribute_pickled == 4
        assert estimator._restored
    finally:
        # Always undo the patch so a failure here cannot leak into other tests.
        type(estimator).__module__ = old_mod
class SingleInheritanceEstimator(BaseEstimator):
    """Estimator that overrides ``__getstate__`` directly in the class."""

    def __init__(self, attribute_pickled=5):
        self._attribute_not_pickled = None
        self.attribute_pickled = attribute_pickled

    def __getstate__(self):
        # Copy the instance dict, blanking the private attribute.
        return {**self.__dict__, "_attribute_not_pickled": None}
def test_pickling_works_when_getstate_is_overwritten_in_the_child_class():
    """Check a pickle round-trip when the class overrides ``__getstate__``."""
    original = SingleInheritanceEstimator()
    original._attribute_not_pickled = "this attribute should not be pickled"

    restored = pickle.loads(pickle.dumps(original))

    assert restored.attribute_pickled == 5
    assert restored._attribute_not_pickled is None
def test_tag_inheritance():
    """Check the ``allow_nan`` tag across the tag fixture class hierarchy."""
    # test that changing tags by inheritance is not allowed
    allowing_nan = (NaNTag, DiamondOverwriteTag, InheritDiamondOverwriteTag)
    rejecting_nan = (NoNaNTag, OverrideTag)

    for estimator_cls in allowing_nan:
        assert estimator_cls()._get_tags()["allow_nan"]

    for estimator_cls in rejecting_nan:
        assert not estimator_cls()._get_tags()["allow_nan"]
def test_raises_on_get_params_non_attribute():
    """Check ``get_params`` fails when ``__init__`` does not store a parameter."""

    class MyEstimator(BaseEstimator):
        def __init__(self, param=5):
            # Deliberately does not set ``self.param``.
            pass

        def fit(self, X, y=None):
            return self

    est = MyEstimator()
    msg = "'MyEstimator' object has no attribute 'param'"

    with pytest.raises(AttributeError, match=msg):
        # Restored: the lookup of the unset attribute happens here.
        est.get_params()
def test_repr_mimebundle_():
    """Check that the display configuration flag controls the mime bundle."""
    estimator = DecisionTreeClassifier()

    bundle = estimator._repr_mimebundle_()
    assert "text/plain" in bundle
    assert "text/html" in bundle

    # With display="text" the HTML entry is left out.
    with config_context(display="text"):
        bundle = estimator._repr_mimebundle_()
        assert "text/plain" in bundle
        assert "text/html" not in bundle
def test_repr_html_wraps():
    """Check that the display configuration flag controls the html output."""
    estimator = DecisionTreeClassifier()

    html = estimator._repr_html_()
    assert "<style>" in html

    # With display="text" the accessor itself raises.
    with config_context(display="text"):
        expected_error = "_repr_html_ is only defined when"
        with pytest.raises(AttributeError, match=expected_error):
            html = estimator._repr_html_()
def test_n_features_in_validation():
    """Check that `_check_n_features` validates data when reset=False"""
    estimator = MyEstimator()
    training_rows = [[1, 2, 3], [4, 5, 6]]

    estimator._check_n_features(training_rows, reset=True)
    assert estimator.n_features_in_ == 3

    expected_error = (
        "X does not contain any features, but MyEstimator is expecting 3 features"
    )
    with pytest.raises(ValueError, match=expected_error):
        estimator._check_n_features("invalid X", reset=False)
def test_n_features_in_no_validation():
    """Check that `_check_n_features` does not validate data when
    n_features_in_ is not defined."""
    estimator = MyEstimator()

    estimator._check_n_features("invalid X", reset=True)
    assert not hasattr(estimator, "n_features_in_")

    # With no recorded feature count, reset=False must not raise.
    estimator._check_n_features("invalid X", reset=False)
def test_feature_names_in():
    """Check that `feature_names_in_` is recorded by `_validate_data`"""
    pd = pytest.importorskip("pandas")
    iris = datasets.load_iris()
    X_np = iris.data
    df = pd.DataFrame(X_np, columns=iris.feature_names)

    class NoOpTransformer(TransformerMixin, BaseEstimator):
        def fit(self, X, y=None):
            # Restored: fit must validate X for feature names to be recorded.
            self._validate_data(X)
            return self

        def transform(self, X):
            self._validate_data(X, reset=False)
            return X

    # fit on dataframe saves the feature names
    trans = NoOpTransformer().fit(df)
    assert_array_equal(trans.feature_names_in_, df.columns)

    # fit again but on ndarray does not keep the previous feature names (see #21383)
    trans.fit(X_np)
    assert not hasattr(trans, "feature_names_in_")

    # refit on the dataframe, then transform with reordered column names
    trans.fit(df)
    msg = "The feature names should match those that were passed"
    df_bad = pd.DataFrame(X_np, columns=iris.feature_names[::-1])
    with pytest.raises(ValueError, match=msg):
        trans.transform(df_bad)

    # warns when fitted on dataframe and transforming a ndarray
    msg = (
        "X does not have valid feature names, but NoOpTransformer was "
        "fitted with feature names"
    )
    with pytest.warns(UserWarning, match=msg):
        trans.transform(X_np)

    # warns when fitted on a ndarray and transforming dataframe
    msg = "X has feature names, but NoOpTransformer was fitted without feature names"
    trans = NoOpTransformer().fit(X_np)
    with pytest.warns(UserWarning, match=msg):
        trans.transform(df)

    # fit on dataframe with all integer feature names works without warning
    df_int_names = pd.DataFrame(X_np)
    trans = NoOpTransformer()
    with warnings.catch_warnings():
        warnings.simplefilter("error", UserWarning)
        trans.fit(df_int_names)

    # fit on dataframe with no feature names or all integer feature names
    # -> do not warn on transform
    Xs = [X_np, df_int_names]
    for X in Xs:
        with warnings.catch_warnings():
            warnings.simplefilter("error", UserWarning)
            trans.transform(X)

    # fit on dataframe with feature names that are mixed raises an error:
    df_mixed = pd.DataFrame(X_np, columns=["a", "b", 1, 2])
    trans = NoOpTransformer()
    msg = re.escape(
        "Feature names are only supported if all input features have string names, "
        "but your input has ['int', 'str'] as feature name / column name types. "
        "If you want feature names to be stored and validated, you must convert "
        "them all to strings, by using X.columns = X.columns.astype(str) for "
        "example. Otherwise you can remove feature / column names from your input "
        "data, or convert them all to a non-string data type."
    )
    with pytest.raises(TypeError, match=msg):
        trans.fit(df_mixed)

    # transform on feature names that are mixed also raises:
    with pytest.raises(TypeError, match=msg):
        trans.transform(df_mixed)
def test_validate_data_cast_to_ndarray():
    """Check cast_to_ndarray option of _validate_data."""
    pd = pytest.importorskip("pandas")
    iris = datasets.load_iris()
    df = pd.DataFrame(iris.data, columns=iris.feature_names)
    y = pd.Series(iris.target)

    class NoOpTransformer(TransformerMixin, BaseEstimator):
        # Only the inherited `_validate_data` is exercised.
        pass

    no_op = NoOpTransformer()

    # X only
    X_np_out = no_op._validate_data(df, cast_to_ndarray=True)
    assert isinstance(X_np_out, np.ndarray)
    assert_allclose(X_np_out, df.to_numpy())

    X_df_out = no_op._validate_data(df, cast_to_ndarray=False)
    assert X_df_out is df

    # y only
    y_np_out = no_op._validate_data(y=y, cast_to_ndarray=True)
    assert isinstance(y_np_out, np.ndarray)
    assert_allclose(y_np_out, y.to_numpy())

    y_series_out = no_op._validate_data(y=y, cast_to_ndarray=False)
    assert y_series_out is y

    # X and y together
    X_np_out, y_np_out = no_op._validate_data(df, y, cast_to_ndarray=True)
    assert isinstance(X_np_out, np.ndarray)
    assert_allclose(X_np_out, df.to_numpy())
    assert isinstance(y_np_out, np.ndarray)
    assert_allclose(y_np_out, y.to_numpy())

    X_df_out, y_series_out = no_op._validate_data(df, y, cast_to_ndarray=False)
    assert X_df_out is df
    assert y_series_out is y

    msg = "Validation should be done on X, y or both."
    with pytest.raises(ValueError, match=msg):
        # Restored: calling with neither X nor y is what must be rejected.
        no_op._validate_data()
def test_clone_keeps_output_config():
    """Check that clone keeps the set_output config."""
    scaler = StandardScaler().set_output(transform="pandas")
    scaler_copy = clone(scaler)

    original_config = _get_output_config("transform", scaler)
    cloned_config = _get_output_config("transform", scaler_copy)
    assert original_config == cloned_config
class _Empty:
    """Empty base class, used only as a parent of ``EmptyEstimator``."""

    # The class statement had no body, which is a syntax error.
    pass


class EmptyEstimator(_Empty, BaseEstimator):
    """Estimator with no parameters and no attributes of its own."""

    # The class statement had no body, which is a syntax error.
    pass
@pytest.mark.parametrize("estimator", [BaseEstimator(), EmptyEstimator()])
def test_estimator_empty_instance_dict(estimator):
    """Check that ``__getstate__`` returns an empty ``dict`` with an empty
    instance.

    Python 3.11+ changed behaviour by returning ``None`` instead of raising an
    ``AttributeError``. Non-regression test for gh-25188.
    """
    state = estimator.__getstate__()
    expected = {"_sklearn_version": sklearn.__version__}
    assert state == expected

    # this should not raise
    # (the round-trip call was missing; it is restored on the parametrized
    # estimator so both cases are covered)
    pickle.loads(pickle.dumps(estimator))
def test_estimator_getstate_using_slots_error_message():
    """Using a `BaseEstimator` with `__slots__` is not supported."""

    class WithSlots:
        """A class using `__slots__`."""

        __slots__ = ("x",)

    class Estimator(BaseEstimator, WithSlots):
        pass

    # The message tail and closing parenthesis were missing.
    msg = (
        "You cannot use `__slots__` in objects inheriting from "
        "`sklearn.base.BaseEstimator`"
    )

    # Both the direct call and pickling are expected to raise.
    with pytest.raises(TypeError, match=msg):
        Estimator().__getstate__()

    with pytest.raises(TypeError, match=msg):
        pickle.dumps(Estimator())
"constructor_name, minversion",
("dataframe", "1.5.0"),
("pyarrow", "12.0.0"),
("polars", "0.20.23"),
def test_dataframe_protocol(constructor_name, minversion):
"""Uses the dataframe exchange protocol to get feature names."""
data = [[1, 4, 2], [3, 3, 6]]
columns = ["col_0", "col_1", "col_2"]
df = _convert_container(
data, constructor_name, columns_name=columns, minversion=minversion
class NoOpTransformer(TransformerMixin, BaseEstimator):
def fit(self, X, y=None):
return self
def transform(self, X):
return self._validate_data(X, reset=False)
no_op = NoOpTransformer()
assert_array_equal(no_op.feature_names_in_, columns)
X_out = no_op.transform(df)
if constructor_name != "pyarrow":
# pyarrow does not work with `np.asarray`
# https://github.com/apache/arrow/issues/34886
assert_allclose(df, X_out)
bad_names = ["a", "b", "c"]
df_bad = _convert_container(data, constructor_name, columns_name=bad_names)
with pytest.raises(ValueError, match="The feature names should match"):
def test_transformer_fit_transform_with_metadata_in_transform():
    """Test that having a transformer with metadata for transform raises a
    warning when calling fit_transform."""

    class CustomTransformer(BaseEstimator, TransformerMixin):
        def fit(self, X, y=None, prop=None):
            return self

        def transform(self, X, prop=None):
            return X

    # `set_transform_request` needs metadata routing; enable it for this test
    # only. NOTE(review): if the suite already provides a fixture for this,
    # prefer that fixture.
    with config_context(enable_metadata_routing=True):
        # passing the metadata to `fit_transform` should raise a warning since it
        # could potentially be consumed by `transform`
        # (the call expression had been truncated to its argument tail)
        msg = "`transform` method which consumes metadata"
        with pytest.warns(UserWarning, match=msg):
            CustomTransformer().set_transform_request(prop=True).fit_transform(
                [[1]], [1], prop=1
            )

        # not passing a metadata which can potentially be consumed by `transform`
        # should not raise a warning
        with warnings.catch_warnings(record=True) as record:
            CustomTransformer().set_transform_request(prop=True).fit_transform(
                [[1]], [1]
            )
            assert len(record) == 0
def test_outlier_mixin_fit_predict_with_metadata_in_predict():
    """Test that having an OutlierMixin with metadata for predict raises a
    warning when calling fit_predict."""

    class CustomOutlierDetector(BaseEstimator, OutlierMixin):
        def fit(self, X, y=None, prop=None):
            return self

        def predict(self, X, prop=None):
            return X

    # `set_predict_request` needs metadata routing; enable it for this test
    # only. NOTE(review): if the suite already provides a fixture for this,
    # prefer that fixture.
    with config_context(enable_metadata_routing=True):
        # passing the metadata to `fit_predict` should raise a warning since it
        # could potentially be consumed by `predict`
        # (the call expression had been truncated to its argument tail)
        msg = "`predict` method which consumes metadata"
        with pytest.warns(UserWarning, match=msg):
            CustomOutlierDetector().set_predict_request(prop=True).fit_predict(
                [[1]], [1], prop=1
            )

        # not passing a metadata which can potentially be consumed by `predict`
        # should not raise a warning
        with warnings.catch_warnings(record=True) as record:
            CustomOutlierDetector().set_predict_request(prop=True).fit_predict(
                [[1]], [1]
            )
            assert len(record) == 0