1069 lines
38 KiB
Python
1069 lines
38 KiB
Python
|
"""Base classes for all estimators."""
|
||
|
|
||
|
# Author: Gael Varoquaux <gael.varoquaux@normalesup.org>
|
||
|
# License: BSD 3 clause
|
||
|
|
||
|
import copy
|
||
|
import warnings
|
||
|
from collections import defaultdict
|
||
|
import platform
|
||
|
import inspect
|
||
|
import re
|
||
|
|
||
|
import numpy as np
|
||
|
|
||
|
from . import __version__
|
||
|
from ._config import get_config
|
||
|
from .utils import _IS_32BIT
|
||
|
from .utils._set_output import _SetOutputMixin
|
||
|
from .utils._tags import (
|
||
|
_DEFAULT_TAGS,
|
||
|
)
|
||
|
from .utils.validation import check_X_y
|
||
|
from .utils.validation import check_array
|
||
|
from .utils.validation import _check_y
|
||
|
from .utils.validation import _num_features
|
||
|
from .utils.validation import _check_feature_names_in
|
||
|
from .utils.validation import _generate_get_feature_names_out
|
||
|
from .utils.validation import check_is_fitted
|
||
|
from .utils.validation import _get_feature_names
|
||
|
from .utils._estimator_html_repr import estimator_html_repr
|
||
|
from .utils._param_validation import validate_parameter_constraints
|
||
|
|
||
|
|
||
|
def clone(estimator, *, safe=True):
    """Construct a new unfitted estimator with the same parameters.

    The estimator is rebuilt by calling its class with (recursively cloned)
    constructor parameters, so no fitted state attached to the original is
    carried over.

    Parameters
    ----------
    estimator : {list, tuple, set, frozenset} of estimator instance or a \
            single estimator instance
        The estimator or group of estimators to be cloned.
    safe : bool, default=True
        If safe is False, clone will fall back to a deep copy on objects
        that are not estimators.

    Returns
    -------
    estimator : object
        The deep copy of the input, an estimator if input is an estimator.

    Raises
    ------
    TypeError
        If `safe` is True and `estimator` is a class or has no
        ``get_params`` attribute.
    RuntimeError
        If the constructor of the estimator does not store a parameter
        as-is (the value read back through ``get_params`` is not the very
        object that was passed in).

    Notes
    -----
    If the estimator's `random_state` parameter is an integer (or if the
    estimator doesn't have a `random_state` parameter), an *exact clone* is
    returned: the clone and the original estimator will give the exact same
    results. Otherwise, *statistical clone* is returned: the clone might
    return different results from the original estimator. More details can be
    found in :ref:`randomness`.
    """
    container_type = type(estimator)
    # XXX: not handling dictionaries
    if container_type in (list, tuple, set, frozenset):
        return container_type([clone(item, safe=safe) for item in estimator])

    if not hasattr(estimator, "get_params") or isinstance(estimator, type):
        # Not an estimator instance: either copy it blindly or refuse.
        if not safe:
            return copy.deepcopy(estimator)
        if isinstance(estimator, type):
            raise TypeError(
                "Cannot clone object. "
                "You should provide an instance of "
                "scikit-learn estimator instead of a class."
            )
        raise TypeError(
            "Cannot clone object '%s' (type %s): "
            "it does not seem to be a scikit-learn "
            "estimator as it does not implement a "
            "'get_params' method." % (repr(estimator), type(estimator))
        )

    # Clone every constructor parameter in place, then rebuild the object.
    init_params = estimator.get_params(deep=False)
    for param_name in init_params:
        init_params[param_name] = clone(init_params[param_name], safe=False)
    duplicate = estimator.__class__(**init_params)
    stored_params = duplicate.get_params(deep=False)

    # quick sanity check of the parameters of the clone: the constructor must
    # have stored exactly the objects it was given.
    for param_name, requested in init_params.items():
        stored = stored_params[param_name]
        if requested is not stored:
            raise RuntimeError(
                "Cannot clone object %s, as the constructor "
                "either does not set or modifies parameter %s"
                % (estimator, param_name)
            )

    # _sklearn_output_config is used by `set_output` to configure the output
    # container of an estimator.
    if hasattr(estimator, "_sklearn_output_config"):
        config_copy = copy.deepcopy(estimator._sklearn_output_config)
        duplicate._sklearn_output_config = config_copy
    return duplicate
|
||
|
|
||
|
|
||
|
class BaseEstimator:
    """Base class for all estimators in scikit-learn.

    Notes
    -----
    All estimators should specify all the parameters that can be set
    at the class level in their ``__init__`` as explicit keyword
    arguments (no ``*args`` or ``**kwargs``).
    """

    @classmethod
    def _get_param_names(cls):
        """Get the sorted parameter names of the estimator's constructor.

        Returns
        -------
        names : list of str
            Names of the ``__init__`` parameters, excluding ``self`` and any
            ``**kwargs`` catch-all. Empty if the class defines no
            constructor of its own.

        Raises
        ------
        RuntimeError
            If the constructor accepts ``*args``.
        """
        # fetch the constructor or the original constructor before
        # deprecation wrapping if any
        constructor = getattr(cls.__init__, "deprecated_original", cls.__init__)
        if constructor is object.__init__:
            # No explicit constructor to introspect
            return []

        # introspect the constructor arguments to find the model parameters
        # to represent
        signature = inspect.signature(constructor)
        names = []
        for param in signature.parameters.values():
            # 'self' and a ``**kwargs`` catch-all are not model parameters.
            if param.name == "self" or param.kind == param.VAR_KEYWORD:
                continue
            if param.kind == param.VAR_POSITIONAL:
                raise RuntimeError(
                    "scikit-learn estimators should always "
                    "specify their parameters in the signature"
                    " of their __init__ (no varargs)."
                    " %s with constructor %s doesn't "
                    " follow this convention." % (cls, signature)
                )
            names.append(param.name)
        return sorted(names)

    def get_params(self, deep=True):
        """
        Get parameters for this estimator.

        Parameters
        ----------
        deep : bool, default=True
            If True, will return the parameters for this estimator and
            contained subobjects that are estimators.

        Returns
        -------
        params : dict
            Parameter names mapped to their values. Parameters of nested
            objects appear under ``<component>__<parameter>`` keys.
        """
        params = {}
        for name in self._get_param_names():
            value = getattr(self, name)
            # Only instances exposing `get_params` are expanded; classes
            # passed as parameter values are left alone.
            expand = (
                deep and hasattr(value, "get_params") and not isinstance(value, type)
            )
            if expand:
                for sub_name, sub_value in value.get_params().items():
                    params[name + "__" + sub_name] = sub_value
            params[name] = value
        return params

    def set_params(self, **params):
        """Set the parameters of this estimator.

        The method works on simple estimators as well as on nested objects
        (such as :class:`~sklearn.pipeline.Pipeline`). The latter have
        parameters of the form ``<component>__<parameter>`` so that it's
        possible to update each component of a nested object.

        Parameters
        ----------
        **params : dict
            Estimator parameters.

        Returns
        -------
        self : estimator instance
            Estimator instance.

        Raises
        ------
        ValueError
            If a parameter name (or the component prefix of a nested name)
            is not a valid parameter of this estimator.
        """
        if not params:
            # Simple optimization to gain speed (inspect is slow)
            return self

        known_params = self.get_params(deep=True)

        # Sub-parameters grouped by the component (prefix) they target.
        grouped = {}
        for full_name, value in params.items():
            prefix, delim, sub_name = full_name.partition("__")
            if prefix not in known_params:
                local_valid_params = self._get_param_names()
                raise ValueError(
                    f"Invalid parameter {prefix!r} for estimator {self}. "
                    f"Valid parameters are: {local_valid_params!r}."
                )

            if delim:
                grouped.setdefault(prefix, {})[sub_name] = value
            else:
                setattr(self, prefix, value)
                known_params[prefix] = value

        for prefix, sub_params in grouped.items():
            # TODO(1.4): remove specific handling of "base_estimator".
            # The "base_estimator" key is special. It was deprecated and
            # renamed to "estimator" for several estimators. This means we
            # need to translate it here and set sub-parameters on "estimator",
            # but only if the user did not explicitly set a value for
            # "base_estimator".
            redirect_to_estimator = (
                prefix == "base_estimator"
                and known_params[prefix] == "deprecated"
                and self.__module__.startswith("sklearn.")
            )
            if redirect_to_estimator:
                warnings.warn(
                    f"Parameter 'base_estimator' of {self.__class__.__name__} is"
                    " deprecated in favor of 'estimator'. See"
                    f" {self.__class__.__name__}'s docstring for more details.",
                    FutureWarning,
                    stacklevel=2,
                )
                prefix = "estimator"
            known_params[prefix].set_params(**sub_params)

        return self

    def __repr__(self, N_CHAR_MAX=700):
        # N_CHAR_MAX is the (approximate) maximum number of non-blank
        # characters to render. We pass it as an optional parameter to ease
        # the tests.

        from .utils._pprint import _EstimatorPrettyPrinter

        N_MAX_ELEMENTS_TO_SHOW = 30  # number of elements to show in sequences

        # use ellipsis for sequences with a lot of elements
        printer = _EstimatorPrettyPrinter(
            compact=True,
            indent=1,
            indent_at_name=True,
            n_max_elements_to_show=N_MAX_ELEMENTS_TO_SHOW,
        )
        text = printer.pformat(self)

        # Use bruteforce ellipsis when there are a lot of non-blank characters
        n_nonblank = len("".join(text.split()))
        if not (n_nonblank > N_CHAR_MAX):
            return text

        keep = N_CHAR_MAX // 2  # apprx number of chars to keep on both ends
        pattern = r"^(\s*\S){%d}" % keep
        # The regex '^(\s*\S){%d}' % n
        # matches from the start of the string until the nth non-blank
        # character:
        # - ^ matches the start of string
        # - (pattern){n} matches n repetitions of pattern
        # - \s*\S matches a non-blank char following zero or more blanks
        backwards = text[::-1]
        left_lim = re.match(pattern, text).end()
        right_lim = re.match(pattern, backwards).end()

        if "\n" in text[left_lim:-right_lim]:
            # The left side and right side aren't on the same line.
            # To avoid weird cuts, e.g.:
            # categoric...ore',
            # we need to start the right side with an appropriate newline
            # character so that it renders properly as:
            # categoric...
            # handle_unknown='ignore',
            # so we add [^\n]*\n which matches until the next \n
            pattern += r"[^\n]*\n"
            right_lim = re.match(pattern, backwards).end()

        ellipsis = "..."
        if left_lim + len(ellipsis) < len(text) - right_lim:
            # Only add ellipsis if it results in a shorter repr
            text = text[:left_lim] + "..." + text[-right_lim:]

        return text

    def __getstate__(self):
        """Return the pickling state, tagged with the library version.

        The version tag is only added for classes whose module name starts
        with ``"sklearn."``.

        Raises
        ------
        TypeError
            If the object defines a non-empty ``__slots__``.
        """
        if getattr(self, "__slots__", None):
            raise TypeError(
                "You cannot use `__slots__` in objects inheriting from "
                "`sklearn.base.BaseEstimator`."
            )

        try:
            state = super().__getstate__()
            if state is None:
                # For Python 3.11+, empty instance (no `__slots__`,
                # and `__dict__`) will return a state equal to `None`.
                state = self.__dict__.copy()
        except AttributeError:
            # Python < 3.11
            state = self.__dict__.copy()

        if not type(self).__module__.startswith("sklearn."):
            return state
        return dict(state.items(), _sklearn_version=__version__)

    def __setstate__(self, state):
        """Restore the pickled state, warning on a library version mismatch.

        The version check only applies to classes whose module name starts
        with ``"sklearn."``; for those, the ``_sklearn_version`` entry is
        removed from `state` before it is applied.
        """
        if type(self).__module__.startswith("sklearn."):
            pickle_version = state.pop("_sklearn_version", "pre-0.18")
            if pickle_version != __version__:
                message = (
                    "Trying to unpickle estimator {0} from version {1} when "
                    "using version {2}. This might lead to breaking code or "
                    "invalid results. Use at your own risk. "
                    "For more info please refer to:\n"
                    "https://scikit-learn.org/stable/model_persistence.html"
                    "#security-maintainability-limitations".format(
                        self.__class__.__name__, pickle_version, __version__
                    )
                )
                warnings.warn(message, UserWarning)
        try:
            super().__setstate__(state)
        except AttributeError:
            # No parent implementation available: update the instance dict.
            self.__dict__.update(state)

    def _more_tags(self):
        """Return the default estimator tags."""
        return _DEFAULT_TAGS

    def _get_tags(self):
        """Collect the estimator tags contributed by every class in the MRO.

        Returns
        -------
        tags : dict
            Union of the dicts returned by each ``_more_tags``; the MRO is
            walked in reverse so more specific classes override their bases.
        """
        tags = {}
        for klass in reversed(inspect.getmro(self.__class__)):
            # need the check because mixins might not have _more_tags
            # but might do redundant work in estimators
            # (i.e. calling more tags on BaseEstimator multiple times)
            if not hasattr(klass, "_more_tags"):
                continue
            tags.update(klass._more_tags(self))
        return tags

    def _check_n_features(self, X, reset):
        """Set the `n_features_in_` attribute, or check against it.

        Parameters
        ----------
        X : {ndarray, sparse matrix} of shape (n_samples, n_features)
            The input samples.
        reset : bool
            If True, the `n_features_in_` attribute is set to `X.shape[1]`.
            If False and the attribute exists, then check that it is equal to
            `X.shape[1]`. If False and the attribute does *not* exist, then
            the check is skipped.

            .. note::
               It is recommended to call reset=True in `fit` and in the first
               call to `partial_fit`. All other methods that validate `X`
               should set `reset=False`.

        Raises
        ------
        ValueError
            If `reset` is False and the number of features of `X` is
            undefined or differs from `n_features_in_`.
        """
        try:
            n_features = _num_features(X)
        except TypeError as exc:
            if not reset and hasattr(self, "n_features_in_"):
                raise ValueError(
                    "X does not contain any features, but "
                    f"{self.__class__.__name__} is expecting "
                    f"{self.n_features_in_} features"
                ) from exc
            # If the number of features is not defined and reset=True,
            # then we skip this check
            return

        if reset:
            self.n_features_in_ = n_features
            return

        if not hasattr(self, "n_features_in_"):
            # Skip this check if the expected number of expected input features
            # was not recorded by calling fit first. This is typically the case
            # for stateless transformers.
            return

        if n_features != self.n_features_in_:
            raise ValueError(
                f"X has {n_features} features, but {self.__class__.__name__} "
                f"is expecting {self.n_features_in_} features as input."
            )

    def _check_feature_names(self, X, *, reset):
        """Set or check the `feature_names_in_` attribute.

        .. versionadded:: 1.0

        Parameters
        ----------
        X : {ndarray, dataframe} of shape (n_samples, n_features)
            The input samples.

        reset : bool
            Whether to reset the `feature_names_in_` attribute.
            If False, the input will be checked for consistency with
            feature names of data provided when reset was last True.

            .. note::
               It is recommended to call `reset=True` in `fit` and in the first
               call to `partial_fit`. All other methods that validate `X`
               should set `reset=False`.

        Raises
        ------
        ValueError
            If `reset` is False and both `X` and the fitted estimator carry
            feature names that do not match (content or order).
        """
        if reset:
            names_in = _get_feature_names(X)
            if names_in is not None:
                self.feature_names_in_ = names_in
            elif hasattr(self, "feature_names_in_"):
                # Delete the attribute when the estimator is fitted on a new dataset
                # that has no feature names.
                delattr(self, "feature_names_in_")
            return

        fitted_names = getattr(self, "feature_names_in_", None)
        X_names = _get_feature_names(X)

        if fitted_names is None and X_names is None:
            # no feature names seen in fit and in X
            return

        if X_names is not None and fitted_names is None:
            warnings.warn(
                f"X has feature names, but {self.__class__.__name__} was fitted without"
                " feature names"
            )
            return

        if X_names is None and fitted_names is not None:
            warnings.warn(
                "X does not have valid feature names, but"
                f" {self.__class__.__name__} was fitted with feature names"
            )
            return

        # validate the feature names against the `feature_names_in_` attribute
        names_agree = len(fitted_names) == len(X_names) and not np.any(
            fitted_names != X_names
        )
        if names_agree:
            return

        max_n_names = 5

        def bullet_list(names):
            # One "- name" line per entry, truncated after `max_n_names`.
            lines = [f"- {name}\n" for name in names[:max_n_names]]
            if len(names) > max_n_names:
                lines.append("- ...\n")
            return "".join(lines)

        fitted_set = set(fitted_names)
        X_set = set(X_names)
        unexpected_names = sorted(X_set - fitted_set)
        missing_names = sorted(fitted_set - X_set)

        message = "The feature names should match those that were passed during fit.\n"

        if unexpected_names:
            message += "Feature names unseen at fit time:\n"
            message += bullet_list(unexpected_names)

        if missing_names:
            message += "Feature names seen at fit time, yet now missing:\n"
            message += bullet_list(missing_names)

        if not missing_names and not unexpected_names:
            message += "Feature names must be in the same order as they were in fit.\n"

        raise ValueError(message)

    def _validate_data(
        self,
        X="no_validation",
        y="no_validation",
        reset=True,
        validate_separately=False,
        **check_params,
    ):
        """Validate input data and set or check the `n_features_in_` attribute.

        Parameters
        ----------
        X : {array-like, sparse matrix, dataframe} of shape \
                (n_samples, n_features), default='no_validation'
            The input samples.
            If `'no_validation'`, no validation is performed on `X`. This is
            useful for meta-estimator which can delegate input validation to
            their underlying estimator(s). In that case `y` must be passed and
            the only accepted `check_params` are `multi_output` and
            `y_numeric`.

        y : array-like of shape (n_samples,), default='no_validation'
            The targets.

            - If `None`, `check_array` is called on `X`. If the estimator's
              requires_y tag is True, then an error will be raised.
            - If `'no_validation'`, `check_array` is called on `X` and the
              estimator's requires_y tag is ignored. This is a default
              placeholder and is never meant to be explicitly set. In that case
              `X` must be passed.
            - Otherwise, only `y` with `_check_y` or both `X` and `y` are
              checked with either `check_array` or `check_X_y` depending on
              `validate_separately`.

        reset : bool, default=True
            Whether to reset the `n_features_in_` attribute.
            If False, the input will be checked for consistency with data
            provided when reset was last True.

            .. note::
               It is recommended to call reset=True in `fit` and in the first
               call to `partial_fit`. All other methods that validate `X`
               should set `reset=False`.

        validate_separately : False or tuple of dicts, default=False
            Only used if y is not None.
            If False, call check_X_y(). Else, it must be a tuple of kwargs
            to be used for calling check_array() on X and y respectively.

            `estimator=self` is automatically added to these dicts to generate
            more informative error message in case of invalid input data.

        **check_params : kwargs
            Parameters passed to :func:`sklearn.utils.check_array` or
            :func:`sklearn.utils.check_X_y`. Ignored if validate_separately
            is not False.

            `estimator=self` is automatically added to these params to generate
            more informative error message in case of invalid input data.

        Returns
        -------
        out : {ndarray, sparse matrix} or tuple of these
            The validated input. A tuple is returned if both `X` and `y` are
            validated.

        Raises
        ------
        ValueError
            If `y` is None while the ``requires_y`` tag is set, or if neither
            `X` nor `y` is to be validated.
        """
        self._check_feature_names(X, reset=reset)

        if y is None and self._get_tags()["requires_y"]:
            raise ValueError(
                f"This {self.__class__.__name__} estimator "
                "requires y to be passed, but the target y is None."
            )

        skip_X = isinstance(X, str) and X == "no_validation"
        skip_y = y is None or isinstance(y, str) and y == "no_validation"

        default_check_params = {"estimator": self}
        check_params = {**default_check_params, **check_params}

        if skip_X and skip_y:
            raise ValueError("Validation should be done on X, y or both.")

        if skip_y:
            # Only X is validated.
            X = check_array(X, input_name="X", **check_params)
            out = X
        elif skip_X:
            # Only y is validated.
            y = _check_y(y, **check_params)
            out = y
        else:
            if validate_separately:
                # We need this because some estimators validate X and y
                # separately, and in general, separately calling check_array()
                # on X and y isn't equivalent to just calling check_X_y()
                # :(
                check_X_params, check_y_params = validate_separately
                if "estimator" not in check_X_params:
                    check_X_params = {**default_check_params, **check_X_params}
                X = check_array(X, input_name="X", **check_X_params)
                if "estimator" not in check_y_params:
                    check_y_params = {**default_check_params, **check_y_params}
                y = check_array(y, input_name="y", **check_y_params)
            else:
                X, y = check_X_y(X, y, **check_params)
            out = X, y

        if not skip_X and check_params.get("ensure_2d", True):
            self._check_n_features(X, reset=reset)

        return out

    def _validate_params(self):
        """Validate types and values of constructor parameters

        The expected type and values must be defined in the `_parameter_constraints`
        class attribute, which is a dictionary `param_name: list of constraints`. See
        the docstring of `validate_parameter_constraints` for a description of the
        accepted constraints.
        """
        current_params = self.get_params(deep=False)
        validate_parameter_constraints(
            self._parameter_constraints,
            current_params,
            caller_name=self.__class__.__name__,
        )

    @property
    def _repr_html_(self):
        """HTML representation of estimator.

        This is redundant with the logic of `_repr_mimebundle_`. The latter
        should be favored in the long term, `_repr_html_` is only
        implemented for consumers who do not interpret `_repr_mimebundle_`.
        """
        if get_config()["display"] == "diagram":
            return self._repr_html_inner
        raise AttributeError(
            "_repr_html_ is only defined when the "
            "'display' configuration option is set to "
            "'diagram'"
        )

    def _repr_html_inner(self):
        """This function is returned by the @property `_repr_html_` to make
        `hasattr(estimator, "_repr_html_")` return `True` or `False` depending
        on `get_config()["display"]`.
        """
        return estimator_html_repr(self)

    def _repr_mimebundle_(self, **kwargs):
        """Mime bundle used by jupyter kernels to display estimator"""
        bundle = {"text/plain": repr(self)}
        if get_config()["display"] == "diagram":
            bundle["text/html"] = estimator_html_repr(self)
        return bundle
|
||
|
|
||
|
|
||
|
class ClassifierMixin:
    """Mixin class for all classifiers in scikit-learn."""

    # Marker identifying the estimator kind.
    _estimator_type = "classifier"

    def score(self, X, y, sample_weight=None):
        """
        Return the mean accuracy on the given test data and labels.

        In multi-label classification, this is the subset accuracy
        which is a harsh metric since you require for each sample that
        each label set be correctly predicted.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Test samples.

        y : array-like of shape (n_samples,) or (n_samples, n_outputs)
            True labels for `X`.

        sample_weight : array-like of shape (n_samples,), default=None
            Sample weights.

        Returns
        -------
        score : float
            Mean accuracy of ``self.predict(X)`` w.r.t. `y`.
        """
        # Imported at call time rather than at module import time.
        from .metrics import accuracy_score

        y_pred = self.predict(X)
        return accuracy_score(y, y_pred, sample_weight=sample_weight)

    def _more_tags(self):
        """Return the tags contributed by this mixin."""
        return {"requires_y": True}
|
||
|
|
||
|
|
||
|
class RegressorMixin:
    """Mixin class for all regression estimators in scikit-learn."""

    # Marker identifying the estimator kind.
    _estimator_type = "regressor"

    def score(self, X, y, sample_weight=None):
        """Return the coefficient of determination of the prediction.

        The coefficient of determination :math:`R^2` is defined as
        :math:`(1 - \\frac{u}{v})`, where :math:`u` is the residual
        sum of squares ``((y_true - y_pred)** 2).sum()`` and :math:`v`
        is the total sum of squares ``((y_true - y_true.mean()) ** 2).sum()``.
        The best possible score is 1.0 and it can be negative (because the
        model can be arbitrarily worse). A constant model that always predicts
        the expected value of `y`, disregarding the input features, would get
        a :math:`R^2` score of 0.0.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Test samples. For some estimators this may be a precomputed
            kernel matrix or a list of generic objects instead with shape
            ``(n_samples, n_samples_fitted)``, where ``n_samples_fitted``
            is the number of samples used in the fitting for the estimator.

        y : array-like of shape (n_samples,) or (n_samples, n_outputs)
            True values for `X`.

        sample_weight : array-like of shape (n_samples,), default=None
            Sample weights.

        Returns
        -------
        score : float
            :math:`R^2` of ``self.predict(X)`` w.r.t. `y`.

        Notes
        -----
        The :math:`R^2` score used when calling ``score`` on a regressor uses
        ``multioutput='uniform_average'`` from version 0.23 to keep consistent
        with default value of :func:`~sklearn.metrics.r2_score`.
        This influences the ``score`` method of all the multioutput
        regressors (except for
        :class:`~sklearn.multioutput.MultiOutputRegressor`).
        """
        # Imported at call time rather than at module import time.
        from .metrics import r2_score

        predictions = self.predict(X)
        return r2_score(y, predictions, sample_weight=sample_weight)

    def _more_tags(self):
        """Return the tags contributed by this mixin."""
        return {"requires_y": True}
|
||
|
|
||
|
|
||
|
class ClusterMixin:
    """Mixin class for all cluster estimators in scikit-learn."""

    # Marker identifying the estimator kind.
    _estimator_type = "clusterer"

    def fit_predict(self, X, y=None):
        """
        Perform clustering on `X` and returns cluster labels.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Input data.

        y : Ignored
            Not used, present for API consistency by convention.

        Returns
        -------
        labels : ndarray of shape (n_samples,), dtype=np.int64
            Cluster labels.
        """
        # Generic fallback: fit, then read back the `labels_` attribute set
        # on the estimator. Override when a better method is possible for a
        # given clustering algorithm.
        self.fit(X)
        labels = self.labels_
        return labels

    def _more_tags(self):
        """Return the tags contributed by this mixin."""
        return {"preserves_dtype": []}
|
||
|
|
||
|
|
||
|
class BiclusterMixin:
    """Mixin class for all bicluster estimators in scikit-learn."""

    @property
    def biclusters_(self):
        """Convenient way to get row and column indicators together.

        Returns the ``rows_`` and ``columns_`` members.
        """
        return self.rows_, self.columns_

    def get_indices(self, i):
        """Row and column indices of the `i`'th bicluster.

        Only works if ``rows_`` and ``columns_`` attributes exist.

        Parameters
        ----------
        i : int
            The index of the cluster.

        Returns
        -------
        row_ind : ndarray, dtype=np.intp
            Indices of rows in the dataset that belong to the bicluster.
        col_ind : ndarray, dtype=np.intp
            Indices of columns in the dataset that belong to the bicluster.
        """
        row_indicator = self.rows_[i]
        column_indicator = self.columns_[i]
        # `np.nonzero` returns one index array per dimension; keep the first.
        row_ind = np.nonzero(row_indicator)[0]
        col_ind = np.nonzero(column_indicator)[0]
        return row_ind, col_ind

    def get_shape(self, i):
        """Shape of the `i`'th bicluster.

        Parameters
        ----------
        i : int
            The index of the cluster.

        Returns
        -------
        n_rows : int
            Number of rows in the bicluster.

        n_cols : int
            Number of columns in the bicluster.
        """
        return tuple(len(axis_indices) for axis_indices in self.get_indices(i))

    def get_submatrix(self, i, data):
        """Return the submatrix corresponding to bicluster `i`.

        Parameters
        ----------
        i : int
            The index of the cluster.
        data : array-like of shape (n_samples, n_features)
            The data.

        Returns
        -------
        submatrix : ndarray of shape (n_rows, n_cols)
            The submatrix corresponding to bicluster `i`.

        Notes
        -----
        Works with sparse matrices. Only works if ``rows_`` and
        ``columns_`` attributes exist.
        """
        from .utils.validation import check_array

        data = check_array(data, accept_sparse="csr")
        row_ind, col_ind = self.get_indices(i)
        # A column vector of row indices paired with the column indices
        # selects the full row-by-column block.
        row_selector = row_ind[:, np.newaxis]
        return data[row_selector, col_ind]
|
||
|
|
||
|
|
||
|
class TransformerMixin(_SetOutputMixin):
    """Mixin class for all transformers in scikit-learn.

    If :term:`get_feature_names_out` is defined, then `BaseEstimator` will
    automatically wrap `transform` and `fit_transform` to follow the `set_output`
    API. See the :ref:`developer_api_set_output` for details.

    :class:`base.OneToOneFeatureMixin` and
    :class:`base.ClassNamePrefixFeaturesOutMixin` are helpful mixins for
    defining :term:`get_feature_names_out`.
    """

    def fit_transform(self, X, y=None, **fit_params):
        """
        Fit to data, then transform it.

        Fits transformer to `X` and `y` with optional parameters `fit_params`
        and returns a transformed version of `X`.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Input samples.

        y :  array-like of shape (n_samples,) or (n_samples, n_outputs), \
                default=None
            Target values (None for unsupervised transformations).

        **fit_params : dict
            Additional fit parameters.

        Returns
        -------
        X_new : ndarray array of shape (n_samples, n_features_new)
            Transformed array.
        """
        # non-optimized default implementation; override when a better
        # method is possible for a given transformer
        if y is None:
            # fit method of arity 1 (unsupervised transformation)
            fitted = self.fit(X, **fit_params)
        else:
            # fit method of arity 2 (supervised transformation)
            fitted = self.fit(X, y, **fit_params)
        return fitted.transform(X)
|
||
|
|
||
|
|
||
|
class OneToOneFeatureMixin:
    """Provides `get_feature_names_out` for simple transformers.

    The mixin is meant for transformers whose output features correspond
    one-to-one to their input features, such as
    :class:`~preprocessing.StandardScaler`.
    """

    def get_feature_names_out(self, input_features=None):
        """Get output feature names for transformation.

        Parameters
        ----------
        input_features : array-like of str or None, default=None
            Input features.

            - If `input_features` is `None`, then `feature_names_in_` is
              used as feature names in. If `feature_names_in_` is not defined,
              then the following input feature names are generated:
              `["x0", "x1", ..., "x(n_features_in_ - 1)"]`.
            - If `input_features` is an array-like, then `input_features` must
              match `feature_names_in_` if `feature_names_in_` is defined.

        Returns
        -------
        feature_names_out : ndarray of str objects
            Same as input features.
        """
        # Output names are exactly the input names returned by the helper.
        names_out = _check_feature_names_in(self, input_features)
        return names_out
|
||
|
|
||
|
|
||
|
class ClassNamePrefixFeaturesOutMixin:
    """Mixin class for transformers that generate their own names by prefixing.

    This mixin is useful when the transformer needs to generate its own feature
    names out, such as :class:`~decomposition.PCA`. For example, if
    :class:`~decomposition.PCA` outputs 3 features, then the generated feature
    names out are: `["pca0", "pca1", "pca2"]`.

    This mixin assumes that a `_n_features_out` attribute is defined when the
    transformer is fitted. `_n_features_out` is the number of output features
    that the transformer will return in `transform` or `fit_transform`.
    """

    def get_feature_names_out(self, input_features=None):
        """Get output feature names for transformation.

        The feature names out will be prefixed by the lowercased class name.
        For example, if the transformer outputs 3 features, then the feature
        names out are: `["class_name0", "class_name1", "class_name2"]`.

        Parameters
        ----------
        input_features : array-like of str or None, default=None
            Only used to validate feature names with the names seen in
            :meth:`fit`.

        Returns
        -------
        feature_names_out : ndarray of str objects
            Transformed feature names.
        """
        # `_n_features_out` is the attribute checked to decide whether the
        # estimator has been fitted.
        check_is_fitted(self, "_n_features_out")
        n_out = self._n_features_out
        return _generate_get_feature_names_out(
            self, n_out, input_features=input_features
        )
|
||
|
|
||
|
|
||
|
class DensityMixin:
    """Mixin class for all density estimators in scikit-learn."""

    _estimator_type = "DensityEstimator"

    def score(self, X, y=None):
        """Return the score of the model on the data `X`.

        This default implementation is a placeholder: it performs no
        computation and returns None.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Test samples.

        y : Ignored
            Not used, present for API consistency by convention.

        Returns
        -------
        score : float
        """
        # Intentionally empty default implementation.
        return None
|
||
|
|
||
|
|
||
|
class OutlierMixin:
    """Mixin class for all outlier detection estimators in scikit-learn."""

    _estimator_type = "outlier_detector"

    def fit_predict(self, X, y=None):
        """Perform fit on X and returns labels for X.

        Returns -1 for outliers and 1 for inliers.

        Parameters
        ----------
        X : {array-like, sparse matrix} of shape (n_samples, n_features)
            The input samples.

        y : Ignored
            Not used, present for API consistency by convention.

        Returns
        -------
        y : ndarray of shape (n_samples,)
            1 for inliers, -1 for outliers.
        """
        # Generic default: fit on X, then predict on the same X. Transductive
        # outlier detectors such as LocalOutlierFactor should override this.
        fitted = self.fit(X)
        return fitted.predict(X)
|
||
|
|
||
|
|
||
|
class MetaEstimatorMixin:
    """Mixin class for all meta estimators in scikit-learn."""

    # The docstring must be the first statement of the class body; placed
    # after this assignment it would be a discarded expression and
    # `MetaEstimatorMixin.__doc__` would be None.
    _required_parameters = ["estimator"]
|
||
|
|
||
|
|
||
|
class MultiOutputMixin:
    """Mixin to mark estimators that support multioutput."""

    def _more_tags(self):
        """Return the extra tag dictionary flagging multioutput support."""
        return dict(multioutput=True)
|
||
|
|
||
|
|
||
|
class _UnstableArchMixin:
    """Mark estimators that are non-deterministic on 32bit or PowerPC."""

    def _more_tags(self):
        """Return the `non_deterministic` tag for the current platform."""
        # `or` short-circuits: the machine name is only queried when the
        # 32-bit flag is falsy.
        unstable = _IS_32BIT or platform.machine().startswith(("ppc", "powerpc"))
        return {"non_deterministic": unstable}
|
||
|
|
||
|
|
||
|
def is_classifier(estimator):
    """Return True if the given estimator is (probably) a classifier.

    The check compares the object's `_estimator_type` attribute (None when
    the attribute is missing) with the string ``"classifier"``.

    Parameters
    ----------
    estimator : object
        Estimator object to test.

    Returns
    -------
    out : bool
        True if estimator is a classifier and False otherwise.
    """
    estimator_kind = getattr(estimator, "_estimator_type", None)
    return estimator_kind == "classifier"
|
||
|
|
||
|
|
||
|
def is_regressor(estimator):
    """Return True if the given estimator is (probably) a regressor.

    The check compares the object's `_estimator_type` attribute (None when
    the attribute is missing) with the string ``"regressor"``.

    Parameters
    ----------
    estimator : estimator instance
        Estimator object to test.

    Returns
    -------
    out : bool
        True if estimator is a regressor and False otherwise.
    """
    estimator_kind = getattr(estimator, "_estimator_type", None)
    return estimator_kind == "regressor"
|
||
|
|
||
|
|
||
|
def is_outlier_detector(estimator):
    """Return True if the given estimator is (probably) an outlier detector.

    The check compares the object's `_estimator_type` attribute (None when
    the attribute is missing) with the string ``"outlier_detector"``.

    Parameters
    ----------
    estimator : estimator instance
        Estimator object to test.

    Returns
    -------
    out : bool
        True if estimator is an outlier detector and False otherwise.
    """
    estimator_kind = getattr(estimator, "_estimator_type", None)
    return estimator_kind == "outlier_detector"
|