import numpy as np from ..base import BaseEstimator, ClassifierMixin from ..utils._metadata_requests import RequestMethod from .metaestimators import available_if from .validation import ( _check_sample_weight, _num_samples, check_array, check_is_fitted, check_random_state, ) class ArraySlicingWrapper: """ Parameters ---------- array """ def __init__(self, array): self.array = array def __getitem__(self, aslice): return MockDataFrame(self.array[aslice]) class MockDataFrame: """ Parameters ---------- array """ # have shape and length but don't support indexing. def __init__(self, array): self.array = array self.values = array self.shape = array.shape self.ndim = array.ndim # ugly hack to make iloc work. self.iloc = ArraySlicingWrapper(array) def __len__(self): return len(self.array) def __array__(self, dtype=None): # Pandas data frames also are array-like: we want to make sure that # input validation in cross-validation does not try to call that # method. return self.array def __eq__(self, other): return MockDataFrame(self.array == other.array) def __ne__(self, other): return not self == other def take(self, indices, axis=0): return MockDataFrame(self.array.take(indices, axis=axis)) class CheckingClassifier(ClassifierMixin, BaseEstimator): """Dummy classifier to test pipelining and meta-estimators. Checks some property of `X` and `y`in fit / predict. This allows testing whether pipelines / cross-validation or metaestimators changed the input. Can also be used to check if `fit_params` are passed correctly, and to force a certain score to be returned. Parameters ---------- check_y, check_X : callable, default=None The callable used to validate `X` and `y`. These callable should return a bool where `False` will trigger an `AssertionError`. If `None`, the data is not validated. Default is `None`. check_y_params, check_X_params : dict, default=None The optional parameters to pass to `check_X` and `check_y`. If `None`, then no parameters are passed in. methods_to_check : "all" or list of str, default="all" The methods in which the checks should be applied. By default, all checks will be done on all methods (`fit`, `predict`, `predict_proba`, `decision_function` and `score`). foo_param : int, default=0 A `foo` param. When `foo > 1`, the output of :meth:`score` will be 1 otherwise it is 0. expected_sample_weight : bool, default=False Whether to check if a valid `sample_weight` was passed to `fit`. expected_fit_params : list of str, default=None A list of the expected parameters given when calling `fit`. Attributes ---------- classes_ : int The classes seen during `fit`. n_features_in_ : int The number of features seen during `fit`. Examples -------- >>> from sklearn.utils._mocking import CheckingClassifier This helper allow to assert to specificities regarding `X` or `y`. In this case we expect `check_X` or `check_y` to return a boolean. >>> from sklearn.datasets import load_iris >>> X, y = load_iris(return_X_y=True) >>> clf = CheckingClassifier(check_X=lambda x: x.shape == (150, 4)) >>> clf.fit(X, y) CheckingClassifier(...) We can also provide a check which might raise an error. In this case, we expect `check_X` to return `X` and `check_y` to return `y`. >>> from sklearn.utils import check_array >>> clf = CheckingClassifier(check_X=check_array) >>> clf.fit(X, y) CheckingClassifier(...) """ def __init__( self, *, check_y=None, check_y_params=None, check_X=None, check_X_params=None, methods_to_check="all", foo_param=0, expected_sample_weight=None, expected_fit_params=None, random_state=None, ): self.check_y = check_y self.check_y_params = check_y_params self.check_X = check_X self.check_X_params = check_X_params self.methods_to_check = methods_to_check self.foo_param = foo_param self.expected_sample_weight = expected_sample_weight self.expected_fit_params = expected_fit_params self.random_state = random_state def _check_X_y(self, X, y=None, should_be_fitted=True): """Validate X and y and make extra check. Parameters ---------- X : array-like of shape (n_samples, n_features) The data set. `X` is checked only if `check_X` is not `None` (default is None). y : array-like of shape (n_samples), default=None The corresponding target, by default `None`. `y` is checked only if `check_y` is not `None` (default is None). should_be_fitted : bool, default=True Whether or not the classifier should be already fitted. By default True. Returns ------- X, y """ if should_be_fitted: check_is_fitted(self) if self.check_X is not None: params = {} if self.check_X_params is None else self.check_X_params checked_X = self.check_X(X, **params) if isinstance(checked_X, (bool, np.bool_)): assert checked_X else: X = checked_X if y is not None and self.check_y is not None: params = {} if self.check_y_params is None else self.check_y_params checked_y = self.check_y(y, **params) if isinstance(checked_y, (bool, np.bool_)): assert checked_y else: y = checked_y return X, y def fit(self, X, y, sample_weight=None, **fit_params): """Fit classifier. Parameters ---------- X : array-like of shape (n_samples, n_features) Training vector, where `n_samples` is the number of samples and `n_features` is the number of features. y : array-like of shape (n_samples, n_outputs) or (n_samples,), \ default=None Target relative to X for classification or regression; None for unsupervised learning. sample_weight : array-like of shape (n_samples,), default=None Sample weights. If None, then samples are equally weighted. **fit_params : dict of string -> object Parameters passed to the ``fit`` method of the estimator Returns ------- self """ assert _num_samples(X) == _num_samples(y) if self.methods_to_check == "all" or "fit" in self.methods_to_check: X, y = self._check_X_y(X, y, should_be_fitted=False) self.n_features_in_ = np.shape(X)[1] self.classes_ = np.unique(check_array(y, ensure_2d=False, allow_nd=True)) if self.expected_fit_params: missing = set(self.expected_fit_params) - set(fit_params) if missing: raise AssertionError( f"Expected fit parameter(s) {list(missing)} not seen." ) for key, value in fit_params.items(): if _num_samples(value) != _num_samples(X): raise AssertionError( f"Fit parameter {key} has length {_num_samples(value)}" f"; expected {_num_samples(X)}." ) if self.expected_sample_weight: if sample_weight is None: raise AssertionError("Expected sample_weight to be passed") _check_sample_weight(sample_weight, X) return self def predict(self, X): """Predict the first class seen in `classes_`. Parameters ---------- X : array-like of shape (n_samples, n_features) The input data. Returns ------- preds : ndarray of shape (n_samples,) Predictions of the first class seens in `classes_`. """ if self.methods_to_check == "all" or "predict" in self.methods_to_check: X, y = self._check_X_y(X) rng = check_random_state(self.random_state) return rng.choice(self.classes_, size=_num_samples(X)) def predict_proba(self, X): """Predict probabilities for each class. Here, the dummy classifier will provide a probability of 1 for the first class of `classes_` and 0 otherwise. Parameters ---------- X : array-like of shape (n_samples, n_features) The input data. Returns ------- proba : ndarray of shape (n_samples, n_classes) The probabilities for each sample and class. """ if self.methods_to_check == "all" or "predict_proba" in self.methods_to_check: X, y = self._check_X_y(X) rng = check_random_state(self.random_state) proba = rng.randn(_num_samples(X), len(self.classes_)) proba = np.abs(proba, out=proba) proba /= np.sum(proba, axis=1)[:, np.newaxis] return proba def decision_function(self, X): """Confidence score. Parameters ---------- X : array-like of shape (n_samples, n_features) The input data. Returns ------- decision : ndarray of shape (n_samples,) if n_classes == 2\ else (n_samples, n_classes) Confidence score. """ if ( self.methods_to_check == "all" or "decision_function" in self.methods_to_check ): X, y = self._check_X_y(X) rng = check_random_state(self.random_state) if len(self.classes_) == 2: # for binary classifier, the confidence score is related to # classes_[1] and therefore should be null. return rng.randn(_num_samples(X)) else: return rng.randn(_num_samples(X), len(self.classes_)) def score(self, X=None, Y=None): """Fake score. Parameters ---------- X : array-like of shape (n_samples, n_features) Input data, where `n_samples` is the number of samples and `n_features` is the number of features. Y : array-like of shape (n_samples, n_output) or (n_samples,) Target relative to X for classification or regression; None for unsupervised learning. Returns ------- score : float Either 0 or 1 depending of `foo_param` (i.e. `foo_param > 1 => score=1` otherwise `score=0`). """ if self.methods_to_check == "all" or "score" in self.methods_to_check: self._check_X_y(X, Y) if self.foo_param > 1: score = 1.0 else: score = 0.0 return score def _more_tags(self): return {"_skip_test": True, "X_types": ["1dlabel"]} # Deactivate key validation for CheckingClassifier because we want to be able to # call fit with arbitrary fit_params and record them. Without this change, we # would get an error because those arbitrary params are not expected. CheckingClassifier.set_fit_request = RequestMethod( # type: ignore name="fit", keys=[], validate_keys=False ) class NoSampleWeightWrapper(BaseEstimator): """Wrap estimator which will not expose `sample_weight`. Parameters ---------- est : estimator, default=None The estimator to wrap. """ def __init__(self, est=None): self.est = est def fit(self, X, y): return self.est.fit(X, y) def predict(self, X): return self.est.predict(X) def predict_proba(self, X): return self.est.predict_proba(X) def _more_tags(self): return {"_skip_test": True} def _check_response(method): def check(self): return self.response_methods is not None and method in self.response_methods return check class _MockEstimatorOnOffPrediction(BaseEstimator): """Estimator for which we can turn on/off the prediction methods. Parameters ---------- response_methods: list of \ {"predict", "predict_proba", "decision_function"}, default=None List containing the response implemented by the estimator. When, the response is in the list, it will return the name of the response method when called. Otherwise, an `AttributeError` is raised. It allows to use `getattr` as any conventional estimator. By default, no response methods are mocked. """ def __init__(self, response_methods=None): self.response_methods = response_methods def fit(self, X, y): self.classes_ = np.unique(y) return self @available_if(_check_response("predict")) def predict(self, X): return "predict" @available_if(_check_response("predict_proba")) def predict_proba(self, X): return "predict_proba" @available_if(_check_response("decision_function")) def decision_function(self, X): return "decision_function"