# We cannot use pytest here, because we run
# build_tools/azure/test_pytest_soft_dependency.sh on these
# tests to make sure estimator_checks works without pytest.

import unittest
import sys
import warnings
from numbers import Integral, Real

import numpy as np
import scipy.sparse as sp
import joblib

from sklearn.base import BaseEstimator, ClassifierMixin, OutlierMixin
from sklearn.datasets import make_multilabel_classification
from sklearn.utils import deprecated
from sklearn.utils._testing import (
    raises,
    ignore_warnings,
    MinimalClassifier,
    MinimalRegressor,
    MinimalTransformer,
    SkipTest,
)

from sklearn.utils.validation import check_is_fitted, check_X_y
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.linear_model import LinearRegression, SGDClassifier
from sklearn.mixture import GaussianMixture
from sklearn.cluster import MiniBatchKMeans
from sklearn.decomposition import PCA
from sklearn.linear_model import MultiTaskElasticNet, LogisticRegression
from sklearn.svm import SVC, NuSVC
from sklearn.neighbors import KNeighborsRegressor
from sklearn.utils.validation import check_array
from sklearn.utils import all_estimators
from sklearn.exceptions import SkipTestWarning
from sklearn.utils.metaestimators import available_if
from sklearn.utils.estimator_checks import check_decision_proba_consistency
from sklearn.utils._param_validation import Interval, StrOptions

from sklearn.utils.estimator_checks import (
    _NotAnArray,
    _set_checking_parameters,
    check_class_weight_balanced_linear_classifier,
    check_classifier_data_not_an_array,
    check_classifiers_multilabel_output_format_decision_function,
    check_classifiers_multilabel_output_format_predict,
    check_classifiers_multilabel_output_format_predict_proba,
    check_dataframe_column_names_consistency,
    check_estimator,
    check_estimator_get_tags_default_keys,
    check_estimators_unfitted,
    check_fit_score_takes_y,
    check_no_attributes_set_in_init,
    check_regressor_data_not_an_array,
    check_requires_y_none,
    check_outlier_corruption,
    check_outlier_contamination,
    set_random_state,
    check_fit_check_is_fitted,
    check_methods_sample_order_invariance,
    check_methods_subset_invariance,
    _yield_all_checks,
)


class CorrectNotFittedError(ValueError):
    """Exception class to raise if estimator is used before fitting.

    Like NotFittedError, it inherits from ValueError, but not from
    AttributeError. Used for testing only.
    """


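# A minimal misbehaving classifier: `fit` does no input validation and
# `predict` returns constant output, so several estimator checks fail on it.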
class BaseBadClassifier(ClassifierMixin, BaseEstimator):
    def fit(self, X, y):
        return self

    def predict(self, X):
        return np.ones(X.shape[0])


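# Mutates a public attribute (`key`) inside `predict`, so its `__dict__`
# changes after `fit`; check_estimator should flag this.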
class ChangesDict(BaseEstimator):
    def __init__(self, key=0):
        self.key = key

    def fit(self, X, y=None):
        X, y = self._validate_data(X, y)
        return self

    def predict(self, X):
        X = check_array(X)
        self.key = 1000
        return np.ones(X.shape[0])


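# Adds a new public attribute (`wrong_attribute`) during `fit`; estimators
# may only add attributes that start or end with an underscore.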
class SetsWrongAttribute(BaseEstimator):
    def __init__(self, acceptable_key=0):
        self.acceptable_key = acceptable_key

    def fit(self, X, y=None):
        self.wrong_attribute = 0
        X, y = self._validate_data(X, y)
        return self


class ChangesWrongAttribute(BaseEstimator):
    def __init__(self, wrong_attribute=0):
        self.wrong_attribute = wrong_attribute

    def fit(self, X, y=None):
        self.wrong_attribute = 1
        X, y = self._validate_data(X, y)
        return self


class ChangesUnderscoreAttribute(BaseEstimator):
    def fit(self, X, y=None):
        self._good_attribute = 1
        X, y = self._validate_data(X, y)
        return self


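# `set_params` raises on an invalid value instead of deferring validation to
# `fit`; the checks are expected to only warn about this behavior
# (asserted in test_check_estimator below).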
class RaisesErrorInSetParams(BaseEstimator):
    def __init__(self, p=0):
        self.p = p

    def set_params(self, **kwargs):
        if "p" in kwargs:
            p = kwargs.pop("p")
            if p < 0:
                raise ValueError("p can't be less than 0")
            self.p = p
        return super().set_params(**kwargs)

    def fit(self, X, y=None):
        X, y = self._validate_data(X, y)
        return self


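# Uses a shared mutable instance (`object()`) as a parameter default, which
# the "default_constructible" check rejects.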
class HasMutableParameters(BaseEstimator):
    def __init__(self, p=object()):
        self.p = p

    def fit(self, X, y=None):
        X, y = self._validate_data(X, y)
        return self


class HasImmutableParameters(BaseEstimator):
    # Note that object is an uninitialized class, thus immutable.
    def __init__(self, p=42, q=np.int32(42), r=object):
        self.p = p
        self.q = q
        self.r = r

    def fit(self, X, y=None):
        X, y = self._validate_data(X, y)
        return self


class ModifiesValueInsteadOfRaisingError(BaseEstimator):
    def __init__(self, p=0):
        self.p = p

    def set_params(self, **kwargs):
        if "p" in kwargs:
            p = kwargs.pop("p")
            if p < 0:
                p = 0
            self.p = p
        return super().set_params(**kwargs)

    def fit(self, X, y=None):
        X, y = self._validate_data(X, y)
        return self


class ModifiesAnotherValue(BaseEstimator):
    def __init__(self, a=0, b="method1"):
        self.a = a
        self.b = b

    def set_params(self, **kwargs):
        if "a" in kwargs:
            a = kwargs.pop("a")
            self.a = a
            if a is None:
                kwargs.pop("b")
                self.b = "method2"
        return super().set_params(**kwargs)

    def fit(self, X, y=None):
        X, y = self._validate_data(X, y)
        return self


class NoCheckinPredict(BaseBadClassifier):
    def fit(self, X, y):
        X, y = self._validate_data(X, y)
        return self


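# Claims to accept sparse input during validation but then fails with an
# unhelpful message; the sparse-data check expects a graceful failure.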
class NoSparseClassifier(BaseBadClassifier):
    def fit(self, X, y):
        X, y = self._validate_data(X, y, accept_sparse=["csr", "csc"])
        if sp.issparse(X):
            raise ValueError("Nonsensical Error")
        return self

    def predict(self, X):
        X = check_array(X)
        return np.ones(X.shape[0])


class CorrectNotFittedErrorClassifier(BaseBadClassifier):
    def fit(self, X, y):
        X, y = self._validate_data(X, y)
        self.coef_ = np.ones(X.shape[1])
        return self

    def predict(self, X):
        check_is_fitted(self)
        X = check_array(X)
        return np.ones(X.shape[0])


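# Rejects `sample_weight` passed as a pandas.Series; check_estimator should
# report that the estimator does not accept 'sample_weight' of this type.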
class NoSampleWeightPandasSeriesType(BaseEstimator):
    def fit(self, X, y, sample_weight=None):
        # Convert data
        X, y = self._validate_data(
            X, y, accept_sparse=("csr", "csc"), multi_output=True, y_numeric=True
        )
        # Function is only called after we verify that pandas is installed
        from pandas import Series

        if isinstance(sample_weight, Series):
            raise ValueError(
                "Estimator does not accept 'sample_weight' of type pandas.Series"
            )
        return self

    def predict(self, X):
        X = check_array(X)
        return np.ones(X.shape[0])


class BadBalancedWeightsClassifier(BaseBadClassifier):
    def __init__(self, class_weight=None):
        self.class_weight = class_weight

    def fit(self, X, y):
        from sklearn.preprocessing import LabelEncoder
        from sklearn.utils import compute_class_weight

        label_encoder = LabelEncoder().fit(y)
        classes = label_encoder.classes_
        class_weight = compute_class_weight(self.class_weight, classes=classes, y=y)

        # Intentionally modify the balanced class_weight
        # to simulate a bug and raise an exception
        if self.class_weight == "balanced":
            class_weight += 1.0

        # Simply assigning coef_ to the class_weight
        self.coef_ = class_weight
        return self


class BadTransformerWithoutMixin(BaseEstimator):
    def fit(self, X, y=None):
        X = self._validate_data(X)
        return self

    def transform(self, X):
        X = check_array(X)
        return X


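# `predict` output depends on how many samples are passed at once, which
# violates the method-subset-invariance check.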
class NotInvariantPredict(BaseEstimator):
    def fit(self, X, y):
        # Convert data
        X, y = self._validate_data(
            X, y, accept_sparse=("csr", "csc"), multi_output=True, y_numeric=True
        )
        return self

    def predict(self, X):
        # return ones if X has more than one sample, otherwise return zeros
        X = check_array(X)
        if X.shape[0] > 1:
            return np.ones(X.shape[0])
        return np.zeros(X.shape[0])


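# `predict` output depends on the ordering of the input samples, which
# violates the sample-order-invariance check.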
class NotInvariantSampleOrder(BaseEstimator):
    def fit(self, X, y):
        X, y = self._validate_data(
            X, y, accept_sparse=("csr", "csc"), multi_output=True, y_numeric=True
        )
        # store the original X to check for sample order later
        self._X = X
        return self

    def predict(self, X):
        X = check_array(X)
        # if the input contains the same elements but in a different sample
        # order, then just return zeros.
        if (
            np.array_equiv(np.sort(X, axis=0), np.sort(self._X, axis=0))
            and (X != self._X).any()
        ):
            return np.zeros(X.shape[0])
        return X[:, 0]


class OneClassSampleErrorClassifier(BaseBadClassifier):
    """Classifier that triggers different behaviors when `sample_weight`
    reduces the number of classes to 1."""

    def __init__(self, raise_when_single_class=False):
        self.raise_when_single_class = raise_when_single_class

    def fit(self, X, y, sample_weight=None):
        X, y = check_X_y(
            X, y, accept_sparse=("csr", "csc"), multi_output=True, y_numeric=True
        )

        self.has_single_class_ = False
        self.classes_, y = np.unique(y, return_inverse=True)
        n_classes_ = self.classes_.shape[0]
        if n_classes_ < 2 and self.raise_when_single_class:
            self.has_single_class_ = True
            raise ValueError("normal class error")

        # find the number of classes left after trimming by sample_weight
        if sample_weight is not None:
            if isinstance(sample_weight, np.ndarray) and len(sample_weight) > 0:
                n_classes_ = np.count_nonzero(np.bincount(y, sample_weight))
            if n_classes_ < 2:
                self.has_single_class_ = True
                raise ValueError("Nonsensical Error")

        return self

    def predict(self, X):
        check_is_fitted(self)
        X = check_array(X)
        if self.has_single_class_:
            return np.zeros(X.shape[0])
        return np.ones(X.shape[0])


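# Accepts large sparse input during validation but then rejects 64-bit
# indices, so the large-sparse check should report a non-graceful failure.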
class LargeSparseNotSupportedClassifier(BaseEstimator):
    def fit(self, X, y):
        X, y = self._validate_data(
            X,
            y,
            accept_sparse=("csr", "csc", "coo"),
            accept_large_sparse=True,
            multi_output=True,
            y_numeric=True,
        )
        if sp.issparse(X):
            if X.getformat() == "coo":
                if X.row.dtype == "int64" or X.col.dtype == "int64":
                    raise ValueError("Estimator doesn't support 64-bit indices")
            elif X.getformat() in ["csc", "csr"]:
                assert "int64" not in (
                    X.indices.dtype,
                    X.indptr.dtype,
                ), "Estimator doesn't support 64-bit indices"

        return self


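# A transformer that returns sparse (CSR) output from dense input; used as a
# non-regression case in test_check_estimator below.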
class SparseTransformer(BaseEstimator):
    def fit(self, X, y=None):
        self.X_shape_ = self._validate_data(X).shape
        return self

    def fit_transform(self, X, y=None):
        return self.fit(X, y).transform(X)

    def transform(self, X):
        X = check_array(X)
        if X.shape[1] != self.X_shape_[1]:
            raise ValueError("Bad number of features")
        return sp.csr_matrix(X)


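# Fits a different value depending on whether X is a pandas DataFrame or a
# plain array, so the *_data_not_an_array checks detect the inconsistency.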
class EstimatorInconsistentForPandas(BaseEstimator):
    def fit(self, X, y):
        try:
            from pandas import DataFrame

            if isinstance(X, DataFrame):
                self.value_ = X.iloc[0, 0]
            else:
                X = check_array(X)
                self.value_ = X[1, 0]
            return self

        except ImportError:
            X = check_array(X)
            self.value_ = X[1, 0]
            return self

    def predict(self, X):
        X = check_array(X)
        return np.array([self.value_] * X.shape[0])


class UntaggedBinaryClassifier(SGDClassifier):
    # Toy classifier that only supports binary classification; will fail tests.
    def fit(self, X, y, coef_init=None, intercept_init=None, sample_weight=None):
        super().fit(X, y, coef_init, intercept_init, sample_weight)
        if len(self.classes_) > 2:
            raise ValueError("Only 2 classes are supported")
        return self

    def partial_fit(self, X, y, classes=None, sample_weight=None):
        super().partial_fit(X=X, y=y, classes=classes, sample_weight=sample_weight)
        if len(self.classes_) > 2:
            raise ValueError("Only 2 classes are supported")
        return self


class TaggedBinaryClassifier(UntaggedBinaryClassifier):
    # Toy classifier that only supports binary classification.
    def _more_tags(self):
        return {"binary_only": True}


class EstimatorMissingDefaultTags(BaseEstimator):
    def _get_tags(self):
        tags = super()._get_tags().copy()
        del tags["allow_nan"]
        return tags


class RequiresPositiveXRegressor(LinearRegression):
    def fit(self, X, y):
        X, y = self._validate_data(X, y, multi_output=True)
        if (X < 0).any():
            raise ValueError("negative X values not supported!")
        return super().fit(X, y)

    def _more_tags(self):
        return {"requires_positive_X": True}


class RequiresPositiveYRegressor(LinearRegression):
    def fit(self, X, y):
        X, y = self._validate_data(X, y, multi_output=True)
        if (y <= 0).any():
            raise ValueError("negative y values not supported!")
        return super().fit(X, y)

    def _more_tags(self):
        return {"requires_positive_y": True}


class PoorScoreLogisticRegression(LogisticRegression):
    def decision_function(self, X):
        return super().decision_function(X) + 1

    def _more_tags(self):
        return {"poor_score": True}


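# Only resets the validation of feature names on the first call to
# `partial_fit`, as expected by check_dataframe_column_names_consistency.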
class PartialFitChecksName(BaseEstimator):
    def fit(self, X, y):
        self._validate_data(X, y)
        return self

    def partial_fit(self, X, y):
        reset = not hasattr(self, "_fitted")
        self._validate_data(X, y, reset=reset)
        self._fitted = True
        return self


def test_not_an_array_array_function():
    not_array = _NotAnArray(np.ones(10))
    msg = "Don't want to call array_function sum!"
    with raises(TypeError, match=msg):
        np.sum(not_array)
    # always returns True
    assert np.may_share_memory(not_array, None)


def test_check_fit_score_takes_y_works_on_deprecated_fit():
    # Tests that check_fit_score_takes_y works on a class with
    # a deprecated fit method

    class TestEstimatorWithDeprecatedFitMethod(BaseEstimator):
        @deprecated("Deprecated for the purpose of testing check_fit_score_takes_y")
        def fit(self, X, y):
            return self

    check_fit_score_takes_y("test", TestEstimatorWithDeprecatedFitMethod())


def test_check_estimator():
    # tests that check_estimator() actually fails on "bad" estimators.
    # not a complete test of all checks, which are very extensive.

    # check that we have a set_params and can clone
    msg = "Passing a class was deprecated"
    with raises(TypeError, match=msg):
        check_estimator(object)
    msg = (
        "Parameter 'p' of estimator 'HasMutableParameters' is of type "
        "object which is not allowed"
    )
    # check that the "default_constructible" test checks for mutable parameters
    check_estimator(HasImmutableParameters())  # should pass
    with raises(AssertionError, match=msg):
        check_estimator(HasMutableParameters())
    # check that values returned by get_params match set_params
    msg = "get_params result does not match what was passed to set_params"
    with raises(AssertionError, match=msg):
        check_estimator(ModifiesValueInsteadOfRaisingError())
    with warnings.catch_warnings(record=True) as records:
        check_estimator(RaisesErrorInSetParams())
    assert UserWarning in [rec.category for rec in records]

    with raises(AssertionError, match=msg):
        check_estimator(ModifiesAnotherValue())
    # check that we have a fit method
    msg = "object has no attribute 'fit'"
    with raises(AttributeError, match=msg):
        check_estimator(BaseEstimator())
    # check that fit does input validation
    msg = "Did not raise"
    with raises(AssertionError, match=msg):
        check_estimator(BaseBadClassifier())
    # check that sample_weights in fit accepts pandas.Series type
    try:
        from pandas import Series  # noqa

        msg = (
            "Estimator NoSampleWeightPandasSeriesType raises error if "
            "'sample_weight' parameter is of type pandas.Series"
        )
        with raises(ValueError, match=msg):
            check_estimator(NoSampleWeightPandasSeriesType())
    except ImportError:
        pass
    # check that predict does input validation (doesn't accept dicts in input)
    msg = "Estimator NoCheckinPredict doesn't check for NaN and inf in predict"
    with raises(AssertionError, match=msg):
        check_estimator(NoCheckinPredict())
    # check that estimator state does not change
    # at transform/predict/predict_proba time
    msg = "Estimator changes __dict__ during predict"
    with raises(AssertionError, match=msg):
        check_estimator(ChangesDict())
    # check that `fit` only changes attributes that
    # are private (start with an _ or end with a _).
    msg = (
        "Estimator ChangesWrongAttribute should not change or mutate "
        "the parameter wrong_attribute from 0 to 1 during fit."
    )
    with raises(AssertionError, match=msg):
        check_estimator(ChangesWrongAttribute())
    check_estimator(ChangesUnderscoreAttribute())
    # check that `fit` doesn't add any public attribute
    msg = (
        r"Estimator adds public attribute\(s\) during the fit method."
        " Estimators are only allowed to add private attributes"
        " either started with _ or ended"
        " with _ but wrong_attribute added"
    )
    with raises(AssertionError, match=msg):
        check_estimator(SetsWrongAttribute())
    # check for sample order invariance
    name = NotInvariantSampleOrder.__name__
    method = "predict"
    msg = (
        "{method} of {name} is not invariant when applied to a dataset"
        "with different sample order."
    ).format(method=method, name=name)
    with raises(AssertionError, match=msg):
        check_estimator(NotInvariantSampleOrder())
    # check for invariant method
    name = NotInvariantPredict.__name__
    method = "predict"
    msg = ("{method} of {name} is not invariant when applied to a subset.").format(
        method=method, name=name
    )
    with raises(AssertionError, match=msg):
        check_estimator(NotInvariantPredict())
    # check for sparse matrix input handling
    name = NoSparseClassifier.__name__
    msg = "Estimator %s doesn't seem to fail gracefully on sparse data" % name
    with raises(AssertionError, match=msg):
        check_estimator(NoSparseClassifier())

    # check for classifiers reducing to less than two classes via sample weights
    name = OneClassSampleErrorClassifier.__name__
    msg = (
        f"{name} failed when fitted on one label after sample_weight "
        "trimming. Error message is not explicit, it should have "
        "'class'."
    )
    with raises(AssertionError, match=msg):
        check_estimator(OneClassSampleErrorClassifier())

    # Large indices test on bad estimator
    msg = (
        "Estimator LargeSparseNotSupportedClassifier doesn't seem to "
        r"support \S{3}_64 matrix, and is not failing gracefully.*"
    )
    with raises(AssertionError, match=msg):
        check_estimator(LargeSparseNotSupportedClassifier())

    # does error on binary_only untagged estimator
    msg = "Only 2 classes are supported"
    with raises(ValueError, match=msg):
        check_estimator(UntaggedBinaryClassifier())

    # non-regression test for estimators transforming to sparse data
    check_estimator(SparseTransformer())

    # doesn't error on actual estimator
    check_estimator(LogisticRegression())
    check_estimator(LogisticRegression(C=0.01))
    check_estimator(MultiTaskElasticNet())

    # doesn't error on binary_only tagged estimator
    check_estimator(TaggedBinaryClassifier())
    check_estimator(RequiresPositiveXRegressor())

    # Check regressor with requires_positive_y estimator tag
    msg = "negative y values not supported!"
    with raises(ValueError, match=msg):
        check_estimator(RequiresPositiveYRegressor())

    # Does not raise error on classifier with poor_score tag
    check_estimator(PoorScoreLogisticRegression())


def test_check_outlier_corruption():
    # should raise AssertionError
    decision = np.array([0.0, 1.0, 1.5, 2.0])
    with raises(AssertionError):
        check_outlier_corruption(1, 2, decision)
    # should pass
    decision = np.array([0.0, 1.0, 1.0, 2.0])
    check_outlier_corruption(1, 2, decision)


def test_check_estimator_transformer_no_mixin():
    # check that TransformerMixin is not required for transformer tests to run
    with raises(AttributeError, ".*fit_transform.*"):
        check_estimator(BadTransformerWithoutMixin())


def test_check_estimator_clones():
    # check that check_estimator doesn't modify the estimator it receives
    from sklearn.datasets import load_iris

    iris = load_iris()

    for Estimator in [
        GaussianMixture,
        LinearRegression,
        SGDClassifier,
        PCA,
        ExtraTreesClassifier,
        MiniBatchKMeans,
    ]:
        with ignore_warnings(category=FutureWarning):
            # when 'est = SGDClassifier()'
            est = Estimator()
            _set_checking_parameters(est)
            set_random_state(est)
            # without fitting
            old_hash = joblib.hash(est)
            check_estimator(est)
        assert old_hash == joblib.hash(est)

        with ignore_warnings(category=FutureWarning):
            # when 'est = SGDClassifier()'
            est = Estimator()
            _set_checking_parameters(est)
            set_random_state(est)
            # with fitting
            est.fit(iris.data + 10, iris.target)
            old_hash = joblib.hash(est)
            check_estimator(est)
        assert old_hash == joblib.hash(est)


def test_check_estimators_unfitted():
    # check that a ValueError/AttributeError is raised when calling predict
    # on an unfitted estimator
    msg = "Did not raise"
    with raises(AssertionError, match=msg):
        check_estimators_unfitted("estimator", NoSparseClassifier())

    # check that CorrectNotFittedError inherits from either ValueError
    # or AttributeError
    check_estimators_unfitted("estimator", CorrectNotFittedErrorClassifier())


def test_check_no_attributes_set_in_init():
    class NonConformantEstimatorPrivateSet(BaseEstimator):
        def __init__(self):
            self.you_should_not_set_this_ = None

    class NonConformantEstimatorNoParamSet(BaseEstimator):
        def __init__(self, you_should_set_this_=None):
            pass

    msg = (
        "Estimator estimator_name should not set any"
        " attribute apart from parameters during init."
        r" Found attributes \['you_should_not_set_this_'\]."
    )
    with raises(AssertionError, match=msg):
        check_no_attributes_set_in_init(
            "estimator_name", NonConformantEstimatorPrivateSet()
        )

    msg = (
        "Estimator estimator_name should store all parameters as an attribute"
        " during init"
    )
    with raises(AttributeError, match=msg):
        check_no_attributes_set_in_init(
            "estimator_name", NonConformantEstimatorNoParamSet()
        )


def test_check_estimator_pairwise():
    # check that check_estimator() works on estimators with a _pairwise
    # kernel or precomputed metric

    # test precomputed kernel
    est = SVC(kernel="precomputed")
    check_estimator(est)

    # test precomputed metric
    est = KNeighborsRegressor(metric="precomputed")
    check_estimator(est)


def test_check_classifier_data_not_an_array():
    with raises(AssertionError, match="Not equal to tolerance"):
        check_classifier_data_not_an_array(
            "estimator_name", EstimatorInconsistentForPandas()
        )


def test_check_regressor_data_not_an_array():
    with raises(AssertionError, match="Not equal to tolerance"):
        check_regressor_data_not_an_array(
            "estimator_name", EstimatorInconsistentForPandas()
        )


def test_check_estimator_get_tags_default_keys():
    estimator = EstimatorMissingDefaultTags()
    err_msg = (
        r"EstimatorMissingDefaultTags._get_tags\(\) is missing entries"
        r" for the following default tags: {'allow_nan'}"
    )
    with raises(AssertionError, match=err_msg):
        check_estimator_get_tags_default_keys(estimator.__class__.__name__, estimator)

    # no-op check when _get_tags is not available
    estimator = MinimalTransformer()
    check_estimator_get_tags_default_keys(estimator.__class__.__name__, estimator)


def test_check_dataframe_column_names_consistency():
    err_msg = "Estimator does not have a feature_names_in_"
    with raises(ValueError, match=err_msg):
        check_dataframe_column_names_consistency("estimator_name", BaseBadClassifier())
    check_dataframe_column_names_consistency("estimator_name", PartialFitChecksName())

    lr = LogisticRegression()
    check_dataframe_column_names_consistency(lr.__class__.__name__, lr)
    lr.__doc__ = "Docstring that does not document the estimator's attributes"
    err_msg = (
        "Estimator LogisticRegression does not document its feature_names_in_ attribute"
    )
    with raises(ValueError, match=err_msg):
        check_dataframe_column_names_consistency(lr.__class__.__name__, lr)


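# Base mock for the multilabel output-format checks below: each subclass
# returns `response_output` verbatim from the method under test.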
class _BaseMultiLabelClassifierMock(ClassifierMixin, BaseEstimator):
    def __init__(self, response_output):
        self.response_output = response_output

    def fit(self, X, y):
        return self

    def _more_tags(self):
        return {"multilabel": True}


def test_check_classifiers_multilabel_output_format_predict():
    n_samples, test_size, n_outputs = 100, 25, 5
    _, y = make_multilabel_classification(
        n_samples=n_samples,
        n_features=2,
        n_classes=n_outputs,
        n_labels=3,
        length=50,
        allow_unlabeled=True,
        random_state=0,
    )
    y_test = y[-test_size:]

    class MultiLabelClassifierPredict(_BaseMultiLabelClassifierMock):
        def predict(self, X):
            return self.response_output

    # 1. inconsistent array type
    clf = MultiLabelClassifierPredict(response_output=y_test.tolist())
    err_msg = (
        r"MultiLabelClassifierPredict.predict is expected to output a "
        r"NumPy array. Got <class 'list'> instead."
    )
    with raises(AssertionError, match=err_msg):
        check_classifiers_multilabel_output_format_predict(clf.__class__.__name__, clf)
    # 2. inconsistent shape
    clf = MultiLabelClassifierPredict(response_output=y_test[:, :-1])
    err_msg = (
        r"MultiLabelClassifierPredict.predict outputs a NumPy array of "
        r"shape \(25, 4\) instead of \(25, 5\)."
    )
    with raises(AssertionError, match=err_msg):
        check_classifiers_multilabel_output_format_predict(clf.__class__.__name__, clf)
    # 3. inconsistent dtype
    clf = MultiLabelClassifierPredict(response_output=y_test.astype(np.float64))
    err_msg = (
        r"MultiLabelClassifierPredict.predict does not output the same "
        r"dtype than the targets."
    )
    with raises(AssertionError, match=err_msg):
        check_classifiers_multilabel_output_format_predict(clf.__class__.__name__, clf)


def test_check_classifiers_multilabel_output_format_predict_proba():
    n_samples, test_size, n_outputs = 100, 25, 5
    _, y = make_multilabel_classification(
        n_samples=n_samples,
        n_features=2,
        n_classes=n_outputs,
        n_labels=3,
        length=50,
        allow_unlabeled=True,
        random_state=0,
    )
    y_test = y[-test_size:]

    class MultiLabelClassifierPredictProba(_BaseMultiLabelClassifierMock):
        def predict_proba(self, X):
            return self.response_output

    # 1. unknown output type
    clf = MultiLabelClassifierPredictProba(response_output=sp.csr_matrix(y_test))
    err_msg = (
        "Unknown returned type .*csr_matrix.* by "
        r"MultiLabelClassifierPredictProba.predict_proba. A list or a Numpy "
        r"array is expected."
    )
    with raises(ValueError, match=err_msg):
        check_classifiers_multilabel_output_format_predict_proba(
            clf.__class__.__name__,
            clf,
        )
    # 2. for list output
    # 2.1. inconsistent length
    clf = MultiLabelClassifierPredictProba(response_output=y_test.tolist())
    err_msg = (
        "When MultiLabelClassifierPredictProba.predict_proba returns a list, "
        "the list should be of length n_outputs and contain NumPy arrays. Got "
        f"length of {test_size} instead of {n_outputs}."
    )
    with raises(AssertionError, match=err_msg):
        check_classifiers_multilabel_output_format_predict_proba(
            clf.__class__.__name__,
            clf,
        )
    # 2.2. array of inconsistent shape
    response_output = [np.ones_like(y_test) for _ in range(n_outputs)]
    clf = MultiLabelClassifierPredictProba(response_output=response_output)
    err_msg = (
        r"When MultiLabelClassifierPredictProba.predict_proba returns a list, "
        r"this list should contain NumPy arrays of shape \(n_samples, 2\). Got "
        r"NumPy arrays of shape \(25, 5\) instead of \(25, 2\)."
    )
    with raises(AssertionError, match=err_msg):
        check_classifiers_multilabel_output_format_predict_proba(
            clf.__class__.__name__,
            clf,
        )
    # 2.3. array of inconsistent dtype
    response_output = [
        np.ones(shape=(y_test.shape[0], 2), dtype=np.int64) for _ in range(n_outputs)
    ]
    clf = MultiLabelClassifierPredictProba(response_output=response_output)
    err_msg = (
        "When MultiLabelClassifierPredictProba.predict_proba returns a list, "
        "it should contain NumPy arrays with floating dtype."
    )
    with raises(AssertionError, match=err_msg):
        check_classifiers_multilabel_output_format_predict_proba(
            clf.__class__.__name__,
            clf,
        )
    # 2.4. array does not contain probabilities (each row should sum to 1)
    response_output = [
        np.ones(shape=(y_test.shape[0], 2), dtype=np.float64) for _ in range(n_outputs)
    ]
    clf = MultiLabelClassifierPredictProba(response_output=response_output)
    err_msg = (
        r"When MultiLabelClassifierPredictProba.predict_proba returns a list, "
        r"each NumPy array should contain probabilities for each class and "
        r"thus each row should sum to 1"
    )
    with raises(AssertionError, match=err_msg):
        check_classifiers_multilabel_output_format_predict_proba(
            clf.__class__.__name__,
            clf,
        )
    # 3. for array output
    # 3.1. array of inconsistent shape
    clf = MultiLabelClassifierPredictProba(response_output=y_test[:, :-1])
    err_msg = (
        r"When MultiLabelClassifierPredictProba.predict_proba returns a NumPy "
        r"array, the expected shape is \(n_samples, n_outputs\). Got \(25, 4\)"
        r" instead of \(25, 5\)."
    )
    with raises(AssertionError, match=err_msg):
        check_classifiers_multilabel_output_format_predict_proba(
            clf.__class__.__name__,
            clf,
        )
    # 3.2. array of inconsistent dtype
    response_output = np.zeros_like(y_test, dtype=np.int64)
    clf = MultiLabelClassifierPredictProba(response_output=response_output)
    err_msg = (
        r"When MultiLabelClassifierPredictProba.predict_proba returns a NumPy "
        r"array, the expected data type is floating."
    )
    with raises(AssertionError, match=err_msg):
        check_classifiers_multilabel_output_format_predict_proba(
            clf.__class__.__name__,
            clf,
        )
    # 4. array does not contain probabilities
    clf = MultiLabelClassifierPredictProba(response_output=y_test * 2.0)
    err_msg = (
        r"When MultiLabelClassifierPredictProba.predict_proba returns a NumPy "
        r"array, this array is expected to provide probabilities of the "
        r"positive class and should therefore contain values between 0 and 1."
    )
    with raises(AssertionError, match=err_msg):
        check_classifiers_multilabel_output_format_predict_proba(
            clf.__class__.__name__,
            clf,
        )


def test_check_classifiers_multilabel_output_format_decision_function():
    n_samples, test_size, n_outputs = 100, 25, 5
    _, y = make_multilabel_classification(
        n_samples=n_samples,
        n_features=2,
        n_classes=n_outputs,
        n_labels=3,
        length=50,
        allow_unlabeled=True,
        random_state=0,
    )
    y_test = y[-test_size:]

    class MultiLabelClassifierDecisionFunction(_BaseMultiLabelClassifierMock):
        def decision_function(self, X):
            return self.response_output

    # 1. inconsistent array type
    clf = MultiLabelClassifierDecisionFunction(response_output=y_test.tolist())
    err_msg = (
        r"MultiLabelClassifierDecisionFunction.decision_function is expected "
        r"to output a NumPy array. Got <class 'list'> instead."
    )
    with raises(AssertionError, match=err_msg):
        check_classifiers_multilabel_output_format_decision_function(
            clf.__class__.__name__,
            clf,
        )
    # 2. inconsistent shape
    clf = MultiLabelClassifierDecisionFunction(response_output=y_test[:, :-1])
    err_msg = (
        r"MultiLabelClassifierDecisionFunction.decision_function is expected "
        r"to provide a NumPy array of shape \(n_samples, n_outputs\). Got "
        r"\(25, 4\) instead of \(25, 5\)"
    )
    with raises(AssertionError, match=err_msg):
        check_classifiers_multilabel_output_format_decision_function(
            clf.__class__.__name__,
            clf,
        )
    # 3. inconsistent dtype
    clf = MultiLabelClassifierDecisionFunction(response_output=y_test)
    err_msg = (
        r"MultiLabelClassifierDecisionFunction.decision_function is expected "
        r"to output a floating dtype."
    )
    with raises(AssertionError, match=err_msg):
        check_classifiers_multilabel_output_format_decision_function(
            clf.__class__.__name__,
            clf,
        )


def run_tests_without_pytest():
    """Runs the tests in this file without using pytest."""
    main_module = sys.modules["__main__"]
    test_functions = [
        getattr(main_module, name)
        for name in dir(main_module)
        if name.startswith("test_")
    ]
    test_cases = [unittest.FunctionTestCase(fn) for fn in test_functions]
    suite = unittest.TestSuite()
    suite.addTests(test_cases)
    runner = unittest.TextTestRunner()
    runner.run(suite)


def test_check_class_weight_balanced_linear_classifier():
    # check that ill-computed balanced weights raise an exception
    msg = "Classifier estimator_name is not computing class_weight=balanced properly"
    with raises(AssertionError, match=msg):
        check_class_weight_balanced_linear_classifier(
            "estimator_name", BadBalancedWeightsClassifier
        )


def test_all_estimators_all_public():
    # all_estimators should not fail when pytest is not installed and should
    # return only public estimators
    with warnings.catch_warnings(record=True) as record:
        estimators = all_estimators()
    # no warnings are raised
    assert not record
    # all_estimators returns (name, class) pairs; public names never start
    # with an underscore
    for name, _ in estimators:
        assert not name.startswith("_")


if __name__ == "__main__":
    # This module is run as a script to check that we have no dependency on
    # pytest for estimator checks.
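    # Note that only the test functions defined above this block are
    # collected by run_tests_without_pytest, since the definitions below
    # have not yet been executed when this point is reached during a
    # script run.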
    run_tests_without_pytest()


def test_xfail_ignored_in_check_estimator():
    # Make sure checks marked as xfail are just ignored and not run by
    # check_estimator(), but still raise a warning.
    with warnings.catch_warnings(record=True) as records:
        check_estimator(NuSVC())
    assert SkipTestWarning in [rec.category for rec in records]


# FIXME: this test should be enabled once the checks are granular enough.
# In 0.24, these tests fail due to low estimator performance.
def test_minimal_class_implementation_checks():
    # Check that a third-party library can run tests without inheriting from
    # BaseEstimator.
    # FIXME
    raise SkipTest
    minimal_estimators = [MinimalTransformer(), MinimalRegressor(), MinimalClassifier()]
    for estimator in minimal_estimators:
        check_estimator(estimator)


def test_check_fit_check_is_fitted():
    class Estimator(BaseEstimator):
        def __init__(self, behavior="attribute"):
            self.behavior = behavior

        def fit(self, X, y, **kwargs):
            if self.behavior == "attribute":
                self.is_fitted_ = True
            elif self.behavior == "method":
                self._is_fitted = True
            return self

        @available_if(lambda self: self.behavior in {"method", "always-true"})
        def __sklearn_is_fitted__(self):
            if self.behavior == "always-true":
                return True
            return hasattr(self, "_is_fitted")

    with raises(Exception, match="passes check_is_fitted before being fit"):
        check_fit_check_is_fitted("estimator", Estimator(behavior="always-true"))

    check_fit_check_is_fitted("estimator", Estimator(behavior="method"))
    check_fit_check_is_fitted("estimator", Estimator(behavior="attribute"))


def test_check_requires_y_none():
    class Estimator(BaseEstimator):
        def fit(self, X, y):
            X, y = check_X_y(X, y)

    with warnings.catch_warnings(record=True) as record:
        check_requires_y_none("estimator", Estimator())

    # no warnings are raised
    assert not [r.message for r in record]


# TODO: Remove in 1.3 when Estimator is removed
def test_deprecated_Estimator_check_estimator():
    err_msg = "'Estimator' was deprecated in favor of"
    with warnings.catch_warnings():
        warnings.simplefilter("error", FutureWarning)
        with raises(FutureWarning, match=err_msg, may_pass=True):
            check_estimator(Estimator=NuSVC())

    err_msg = "Either estimator or Estimator should be passed"
    with raises(ValueError, match=err_msg, may_pass=False):
        check_estimator()


def test_non_deterministic_estimator_skip_tests():
    # check that estimators with the non_deterministic tag set to True will
    # skip certain tests; refer to issue #22313 for details
    for est in [MinimalTransformer, MinimalRegressor, MinimalClassifier]:
        all_tests = list(_yield_all_checks(est()))
        assert check_methods_sample_order_invariance in all_tests
        assert check_methods_subset_invariance in all_tests

        class Estimator(est):
            def _more_tags(self):
                return {"non_deterministic": True}

        all_tests = list(_yield_all_checks(Estimator()))
        assert check_methods_sample_order_invariance not in all_tests
        assert check_methods_subset_invariance not in all_tests


def test_check_outlier_contamination():
    """Check the test for the contamination parameter in the outlier detectors."""

    # Without any parameter constraints, the estimator will early exit the test by
    # returning None.
    class OutlierDetectorWithoutConstraint(OutlierMixin, BaseEstimator):
        """Outlier detector without parameter validation."""

        def __init__(self, contamination=0.1):
            self.contamination = contamination

        def fit(self, X, y=None, sample_weight=None):
            return self  # pragma: no cover

        def predict(self, X, y=None):
            return np.ones(X.shape[0])

    detector = OutlierDetectorWithoutConstraint()
    assert check_outlier_contamination(detector.__class__.__name__, detector) is None

    # Now, check that with parameter constraints the test is only valid if a
    # Real Interval constraint bounded within (0, 0.5] is provided.
    class OutlierDetectorWithConstraint(OutlierDetectorWithoutConstraint):
        _parameter_constraints = {"contamination": [StrOptions({"auto"})]}

    detector = OutlierDetectorWithConstraint()
    err_msg = "contamination constraints should contain a Real Interval constraint."
    with raises(AssertionError, match=err_msg):
        check_outlier_contamination(detector.__class__.__name__, detector)

    # Add a correct interval constraint and check that the test passes.
    OutlierDetectorWithConstraint._parameter_constraints["contamination"] = [
        Interval(Real, 0, 0.5, closed="right")
    ]
    detector = OutlierDetectorWithConstraint()
    check_outlier_contamination(detector.__class__.__name__, detector)

    incorrect_intervals = [
        Interval(Integral, 0, 1, closed="right"),  # an Integral interval, not Real
        Interval(Real, -1, 1, closed="right"),  # lower bound is negative
        Interval(Real, 0, 2, closed="right"),  # upper bound is greater than 0.5
        Interval(Real, 0, 0.5, closed="left"),  # closed="left" includes the bound 0
    ]

    err_msg = r"contamination constraint should be an interval in \(0, 0.5\]"
    for interval in incorrect_intervals:
        OutlierDetectorWithConstraint._parameter_constraints["contamination"] = [
            interval
        ]
        detector = OutlierDetectorWithConstraint()
        with raises(AssertionError, match=err_msg):
            check_outlier_contamination(detector.__class__.__name__, detector)


def test_decision_proba_tie_ranking():
    """Check that in case of ties in the probabilities, we relax the
    ranking comparison with the decision function.
    Non-regression test for:
    https://github.com/scikit-learn/scikit-learn/issues/24025
    """
    estimator = SGDClassifier(loss="log_loss")
    check_decision_proba_consistency("SGDClassifier", estimator)