2518 lines
89 KiB
Python
2518 lines
89 KiB
Python
"""
|
|
The :mod:`sklearn.utils.validation` module includes functions to validate
|
|
input and parameters within scikit-learn estimators.
|
|
"""
|
|
|
|
# Authors: Olivier Grisel
|
|
# Gael Varoquaux
|
|
# Andreas Mueller
|
|
# Lars Buitinck
|
|
# Alexandre Gramfort
|
|
# Nicolas Tresegnie
|
|
# Sylvain Marie
|
|
# License: BSD 3 clause
|
|
|
|
import numbers
|
|
import operator
|
|
import sys
|
|
import warnings
|
|
from contextlib import suppress
|
|
from functools import reduce, wraps
|
|
from inspect import Parameter, isclass, signature
|
|
|
|
import joblib
|
|
import numpy as np
|
|
import scipy.sparse as sp
|
|
|
|
from .. import get_config as _get_config
|
|
from ..exceptions import DataConversionWarning, NotFittedError, PositiveSpectrumWarning
|
|
from ..utils._array_api import _asarray_with_order, _is_numpy_namespace, get_namespace
|
|
from ..utils.fixes import ComplexWarning, _preserve_dia_indices_dtype
|
|
from ._isfinite import FiniteStatus, cy_isfinite
|
|
from .fixes import _object_dtype_isnan
|
|
|
|
FLOAT_DTYPES = (np.float64, np.float32, np.float16)
|
|
|
|
|
|
# This function is not used anymore at this moment in the code base but we keep it in
|
|
# case that we merge a new public function without kwarg only by mistake, which would
|
|
# require a deprecation cycle to fix.
|
|
def _deprecate_positional_args(func=None, *, version="1.3"):
|
|
"""Decorator for methods that issues warnings for positional arguments.
|
|
|
|
Using the keyword-only argument syntax in pep 3102, arguments after the
|
|
* will issue a warning when passed as a positional argument.
|
|
|
|
Parameters
|
|
----------
|
|
func : callable, default=None
|
|
Function to check arguments on.
|
|
version : callable, default="1.3"
|
|
The version when positional arguments will result in error.
|
|
"""
|
|
|
|
def _inner_deprecate_positional_args(f):
|
|
sig = signature(f)
|
|
kwonly_args = []
|
|
all_args = []
|
|
|
|
for name, param in sig.parameters.items():
|
|
if param.kind == Parameter.POSITIONAL_OR_KEYWORD:
|
|
all_args.append(name)
|
|
elif param.kind == Parameter.KEYWORD_ONLY:
|
|
kwonly_args.append(name)
|
|
|
|
@wraps(f)
|
|
def inner_f(*args, **kwargs):
|
|
extra_args = len(args) - len(all_args)
|
|
if extra_args <= 0:
|
|
return f(*args, **kwargs)
|
|
|
|
# extra_args > 0
|
|
args_msg = [
|
|
"{}={}".format(name, arg)
|
|
for name, arg in zip(kwonly_args[:extra_args], args[-extra_args:])
|
|
]
|
|
args_msg = ", ".join(args_msg)
|
|
warnings.warn(
|
|
(
|
|
f"Pass {args_msg} as keyword args. From version "
|
|
f"{version} passing these as positional arguments "
|
|
"will result in an error"
|
|
),
|
|
FutureWarning,
|
|
)
|
|
kwargs.update(zip(sig.parameters, args))
|
|
return f(**kwargs)
|
|
|
|
return inner_f
|
|
|
|
if func is not None:
|
|
return _inner_deprecate_positional_args(func)
|
|
|
|
return _inner_deprecate_positional_args
|
|
|
|
|
|
def _assert_all_finite(
    X, allow_nan=False, msg_dtype=None, estimator_name=None, input_name=""
):
    """Like assert_all_finite, but only for ndarray.

    Returns immediately when the global ``assume_finite`` configuration is
    set. Object-dtype input is only checked for NaN; non-floating dtypes are
    accepted as is. For floating data a cheap sum-based check is tried first
    and the element-wise check is only run when that sum is not finite.

    Parameters
    ----------
    X : array
        Data to check.
    allow_nan : bool, default=False
        If True, NaN values do not trigger an error.
    msg_dtype : dtype, default=None
        Forwarded to the element-wise check for the error message.
    estimator_name : str, default=None
        Forwarded to the element-wise check for the error message.
    input_name : str, default=""
        Forwarded to the element-wise check for the error message.

    Raises
    ------
    ValueError
        If object-dtype `X` contains NaN while `allow_nan` is False, or if
        the element-wise check finds a forbidden value.
    """
    xp, _ = get_namespace(X)

    if _get_config()["assume_finite"]:
        return

    X = xp.asarray(X)

    # for object dtype data, we only check for NaNs (GH-13254)
    is_object = X.dtype == np.dtype("object")
    if is_object and not allow_nan and _object_dtype_isnan(X).any():
        raise ValueError("Input contains NaN")

    # We need only consider float arrays, hence can early return for all else.
    floating_kinds = ("real floating", "complex floating")
    if not xp.isdtype(X.dtype, floating_kinds):
        return

    # First try an O(n) time, O(1) space solution for the common case that
    # everything is finite; fall back to O(n) space `np.isinf/isnan` or custom
    # Cython implementation to prevent false positives and provide a detailed
    # error message.
    with np.errstate(over="ignore"):
        sum_is_finite = xp.isfinite(xp.sum(X))

    if not sum_is_finite:
        _assert_all_finite_element_wise(
            X,
            xp=xp,
            allow_nan=allow_nan,
            msg_dtype=msg_dtype,
            estimator_name=estimator_name,
            input_name=input_name,
        )
|
|
|
|
|
|
def _assert_all_finite_element_wise(
|
|
X, *, xp, allow_nan, msg_dtype=None, estimator_name=None, input_name=""
|
|
):
|
|
# Cython implementation doesn't support FP16 or complex numbers
|
|
use_cython = (
|
|
xp is np and X.data.contiguous and X.dtype.type in {np.float32, np.float64}
|
|
)
|
|
if use_cython:
|
|
out = cy_isfinite(X.reshape(-1), allow_nan=allow_nan)
|
|
has_nan_error = False if allow_nan else out == FiniteStatus.has_nan
|
|
has_inf = out == FiniteStatus.has_infinite
|
|
else:
|
|
has_inf = xp.any(xp.isinf(X))
|
|
has_nan_error = False if allow_nan else xp.any(xp.isnan(X))
|
|
if has_inf or has_nan_error:
|
|
if has_nan_error:
|
|
type_err = "NaN"
|
|
else:
|
|
msg_dtype = msg_dtype if msg_dtype is not None else X.dtype
|
|
type_err = f"infinity or a value too large for {msg_dtype!r}"
|
|
padded_input_name = input_name + " " if input_name else ""
|
|
msg_err = f"Input {padded_input_name}contains {type_err}."
|
|
if estimator_name and input_name == "X" and has_nan_error:
|
|
# Improve the error message on how to handle missing values in
|
|
# scikit-learn.
|
|
msg_err += (
|
|
f"\n{estimator_name} does not accept missing values"
|
|
" encoded as NaN natively. For supervised learning, you might want"
|
|
" to consider sklearn.ensemble.HistGradientBoostingClassifier and"
|
|
" Regressor which accept missing values encoded as NaNs natively."
|
|
" Alternatively, it is possible to preprocess the data, for"
|
|
" instance by using an imputer transformer in a pipeline or drop"
|
|
" samples with missing values. See"
|
|
" https://scikit-learn.org/stable/modules/impute.html"
|
|
" You can find a list of all estimators that handle NaN values"
|
|
" at the following page:"
|
|
" https://scikit-learn.org/stable/modules/impute.html"
|
|
"#estimators-that-handle-nan-values"
|
|
)
|
|
raise ValueError(msg_err)
|
|
|
|
|
|
def assert_all_finite(
    X,
    *,
    allow_nan=False,
    estimator_name=None,
    input_name="",
):
    """Throw a ValueError if X contains NaN or infinity.

    Parameters
    ----------
    X : {ndarray, sparse matrix}
        The input data. For sparse input only the stored values (``X.data``)
        are checked.

    allow_nan : bool, default=False
        If True, do not throw error when `X` contains NaN.

    estimator_name : str, default=None
        The estimator name, used to construct the error message.

    input_name : str, default=""
        The data name used to construct the error message. In particular
        if `input_name` is "X" and the data has NaN values and
        allow_nan is False, the error message will link to the imputer
        documentation.

    Examples
    --------
    >>> from sklearn.utils import assert_all_finite
    >>> import numpy as np
    >>> array = np.array([1, np.inf, np.nan, 4])
    >>> try:
    ...     assert_all_finite(array)
    ...     print("Test passed: Array contains only finite values.")
    ... except ValueError:
    ...     print("Test failed: Array contains non-finite values.")
    Test failed: Array contains non-finite values.
    """
    values = X.data if sp.issparse(X) else X
    _assert_all_finite(
        values,
        allow_nan=allow_nan,
        estimator_name=estimator_name,
        input_name=input_name,
    )
|
|
|
|
|
|
def as_float_array(X, *, copy=True, force_all_finite=True):
    """Convert an array-like to an array of floats.

    The new dtype will be np.float32 or np.float64, depending on the original
    type. The function can create a copy or modify the argument depending
    on the argument copy.

    Parameters
    ----------
    X : {array-like, sparse matrix}
        The input data.

    copy : bool, default=True
        If True, a copy of X will be created. If False, a copy may still be
        returned if X's dtype is not a floating point type.

    force_all_finite : bool or 'allow-nan', default=True
        Whether to raise an error on np.inf, np.nan, pd.NA in X. The
        possibilities are:

        - True: Force all values of X to be finite.
        - False: accepts np.inf, np.nan, pd.NA in X.
        - 'allow-nan': accepts only np.nan and pd.NA values in X. Values cannot
          be infinite.

        .. versionadded:: 0.20
           ``force_all_finite`` accepts the string ``'allow-nan'``.

        .. versionchanged:: 0.23
           Accepts `pd.NA` and converts it into `np.nan`

    Returns
    -------
    XT : {ndarray, sparse matrix}
        An array of type float.

    Examples
    --------
    >>> from sklearn.utils import as_float_array
    >>> import numpy as np
    >>> array = np.array([0, 0, 1, 2, 2], dtype=np.int64)
    >>> as_float_array(array)
    array([0., 0., 1., 2., 2.])
    """
    is_sparse = sp.issparse(X)

    # np.matrix and anything that is neither an ndarray nor sparse goes
    # through the generic validation/conversion routine.
    if isinstance(X, np.matrix) or not (isinstance(X, np.ndarray) or is_sparse):
        return check_array(
            X,
            accept_sparse=["csr", "csc", "coo"],
            dtype=np.float64,
            copy=copy,
            force_all_finite=force_all_finite,
            ensure_2d=False,
        )

    if X.dtype in [np.float32, np.float64]:
        # Already float: hand back the input, or a copy of it.
        if not copy:
            return X
        if is_sparse:
            return X.copy()
        # Dense array: keep Fortran layout when the input has it.
        layout = "F" if X.flags["F_CONTIGUOUS"] else "C"
        return X.copy(layout)

    # Unsigned/signed integers and booleans of at most 4 bytes go to float32;
    # everything else goes to float64.
    is_small_int_like = X.dtype.kind in "uib" and X.dtype.itemsize <= 4
    target_dtype = np.float32 if is_small_int_like else np.float64
    return X.astype(target_dtype)
|
|
|
|
|
|
def _is_arraylike(x):
|
|
"""Returns whether the input is array-like."""
|
|
if sp.issparse(x):
|
|
return False
|
|
|
|
return hasattr(x, "__len__") or hasattr(x, "shape") or hasattr(x, "__array__")
|
|
|
|
|
|
def _is_arraylike_not_scalar(array):
    """Return True if array is array-like and not a scalar"""
    if not _is_arraylike(array):
        return False
    return not np.isscalar(array)
|
|
|
|
|
|
def _use_interchange_protocol(X):
    """Use interchange protocol for non-pandas dataframes that follow the protocol.

    Note: at this point we chose not to use the interchange API on pandas dataframe
    to ensure strict behavioral backward compatibility with older versions of
    scikit-learn.
    """
    if _is_pandas_df(X):
        return False
    return hasattr(X, "__dataframe__")
|
|
|
|
|
|
def _num_features(X):
|
|
"""Return the number of features in an array-like X.
|
|
|
|
This helper function tries hard to avoid to materialize an array version
|
|
of X unless necessary. For instance, if X is a list of lists,
|
|
this function will return the length of the first element, assuming
|
|
that subsequent elements are all lists of the same length without
|
|
checking.
|
|
Parameters
|
|
----------
|
|
X : array-like
|
|
array-like to get the number of features.
|
|
|
|
Returns
|
|
-------
|
|
features : int
|
|
Number of features
|
|
"""
|
|
type_ = type(X)
|
|
if type_.__module__ == "builtins":
|
|
type_name = type_.__qualname__
|
|
else:
|
|
type_name = f"{type_.__module__}.{type_.__qualname__}"
|
|
message = f"Unable to find the number of features from X of type {type_name}"
|
|
if not hasattr(X, "__len__") and not hasattr(X, "shape"):
|
|
if not hasattr(X, "__array__"):
|
|
raise TypeError(message)
|
|
# Only convert X to a numpy array if there is no cheaper, heuristic
|
|
# option.
|
|
X = np.asarray(X)
|
|
|
|
if hasattr(X, "shape"):
|
|
if not hasattr(X.shape, "__len__") or len(X.shape) <= 1:
|
|
message += f" with shape {X.shape}"
|
|
raise TypeError(message)
|
|
return X.shape[1]
|
|
|
|
first_sample = X[0]
|
|
|
|
# Do not consider an array-like of strings or dicts to be a 2D array
|
|
if isinstance(first_sample, (str, bytes, dict)):
|
|
message += f" where the samples are of type {type(first_sample).__qualname__}"
|
|
raise TypeError(message)
|
|
|
|
try:
|
|
# If X is a list of lists, for instance, we assume that all nested
|
|
# lists have the same length without checking or converting to
|
|
# a numpy array to keep this function call as cheap as possible.
|
|
return len(first_sample)
|
|
except Exception as err:
|
|
raise TypeError(message) from err
|
|
|
|
|
|
def _num_samples(x):
|
|
"""Return number of samples in array-like x."""
|
|
message = "Expected sequence or array-like, got %s" % type(x)
|
|
if hasattr(x, "fit") and callable(x.fit):
|
|
# Don't get num_samples from an ensembles length!
|
|
raise TypeError(message)
|
|
|
|
if _use_interchange_protocol(x):
|
|
return x.__dataframe__().num_rows()
|
|
|
|
if not hasattr(x, "__len__") and not hasattr(x, "shape"):
|
|
if hasattr(x, "__array__"):
|
|
x = np.asarray(x)
|
|
else:
|
|
raise TypeError(message)
|
|
|
|
if hasattr(x, "shape") and x.shape is not None:
|
|
if len(x.shape) == 0:
|
|
raise TypeError(
|
|
"Singleton array %r cannot be considered a valid collection." % x
|
|
)
|
|
# Check that shape is returning an integer or default to len
|
|
# Dask dataframes may not return numeric shape[0] value
|
|
if isinstance(x.shape[0], numbers.Integral):
|
|
return x.shape[0]
|
|
|
|
try:
|
|
return len(x)
|
|
except TypeError as type_error:
|
|
raise TypeError(message) from type_error
|
|
|
|
|
|
def check_memory(memory):
    """Check that ``memory`` is joblib.Memory-like.

    joblib.Memory-like means that ``memory`` can be converted into a
    joblib.Memory instance (typically a str denoting the ``location``)
    or has the same interface (has a ``cache`` method).

    Parameters
    ----------
    memory : None, str or object with the joblib.Memory interface
        - If string, the location where to create the `joblib.Memory` interface.
        - If None, no caching is done and the Memory object is completely transparent.

    Returns
    -------
    memory : object with the joblib.Memory interface
        A correct joblib.Memory object.

    Raises
    ------
    ValueError
        If ``memory`` is not joblib.Memory-like.

    Examples
    --------
    >>> from sklearn.utils.validation import check_memory
    >>> check_memory("caching_dir")
    Memory(location=caching_dir/joblib)
    """
    if memory is None or isinstance(memory, str):
        return joblib.Memory(location=memory, verbose=0)

    if hasattr(memory, "cache"):
        # Already exposes the interface we need; pass it through untouched.
        return memory

    raise ValueError(
        "'memory' should be None, a string or have the same"
        " interface as joblib.Memory."
        f" Got memory='{memory}' instead."
    )
|
|
|
|
|
|
def check_consistent_length(*arrays):
    """Check that all arrays have consistent first dimensions.

    Checks whether all objects in arrays have the same shape or length.
    ``None`` entries are ignored.

    Parameters
    ----------
    *arrays : list or tuple of input objects.
        Objects that will be checked for consistent length.

    Raises
    ------
    ValueError
        If the inputs do not all have the same number of samples.

    Examples
    --------
    >>> from sklearn.utils.validation import check_consistent_length
    >>> a = [1, 2, 3]
    >>> b = [2, 3, 4]
    >>> check_consistent_length(a, b)
    """
    sample_counts = [_num_samples(arr) for arr in arrays if arr is not None]
    distinct_counts = np.unique(sample_counts)
    if len(distinct_counts) <= 1:
        return
    raise ValueError(
        "Found input variables with inconsistent numbers of samples: %r"
        % [int(count) for count in sample_counts]
    )
|
|
|
|
|
|
def _make_indexable(iterable):
|
|
"""Ensure iterable supports indexing or convert to an indexable variant.
|
|
|
|
Convert sparse matrices to csr and other non-indexable iterable to arrays.
|
|
Let `None` and indexable objects (e.g. pandas dataframes) pass unchanged.
|
|
|
|
Parameters
|
|
----------
|
|
iterable : {list, dataframe, ndarray, sparse matrix} or None
|
|
Object to be converted to an indexable iterable.
|
|
"""
|
|
if sp.issparse(iterable):
|
|
return iterable.tocsr()
|
|
elif hasattr(iterable, "__getitem__") or hasattr(iterable, "iloc"):
|
|
return iterable
|
|
elif iterable is None:
|
|
return iterable
|
|
return np.array(iterable)
|
|
|
|
|
|
def indexable(*iterables):
    """Make arrays indexable for cross-validation.

    Checks consistent length, passes through None, and ensures that everything
    can be indexed by converting sparse matrices to csr and converting
    non-iterable objects to arrays.

    Parameters
    ----------
    *iterables : {lists, dataframes, ndarrays, sparse matrices}
        List of objects to ensure sliceability.

    Returns
    -------
    result : list of {ndarray, sparse matrix, dataframe} or None
        Returns a list containing indexable arrays (i.e. NumPy array,
        sparse matrix, or dataframe) or `None`.

    Examples
    --------
    >>> from sklearn.utils import indexable
    >>> from scipy.sparse import csr_matrix
    >>> import numpy as np
    >>> iterables = [
    ...     [1, 2, 3], np.array([2, 3, 4]), None, csr_matrix([[5], [6], [7]])
    ... ]
    >>> indexable(*iterables)
    [[1, 2, 3], array([2, 3, 4]), None, <3x1 sparse matrix ...>]
    """
    converted = []
    for item in iterables:
        converted.append(_make_indexable(item))
    # Length consistency is verified on the converted objects.
    check_consistent_length(*converted)
    return converted
|
|
|
|
|
|
def _ensure_sparse_format(
    sparse_container,
    accept_sparse,
    dtype,
    copy,
    force_all_finite,
    accept_large_sparse,
    estimator_name=None,
    input_name="",
):
    """Convert a sparse container to a given format.

    Checks the sparse format of `sparse_container` and converts if necessary.

    Parameters
    ----------
    sparse_container : sparse matrix or array
        Input to validate and convert.

    accept_sparse : str, bool or list/tuple of str
        String[s] representing allowed sparse matrix formats ('csc',
        'csr', 'coo', 'dok', 'bsr', 'lil', 'dia'). If the input is sparse but
        not in the allowed format, it will be converted to the first listed
        format. True allows the input to be any format. False means
        that a sparse matrix input will raise an error.

    dtype : str, type or None
        Data type of result. If None, the dtype of the input is preserved.

    copy : bool
        Whether a forced copy will be triggered. If copy=False, a copy might
        be triggered by a conversion.

    force_all_finite : bool or 'allow-nan'
        Whether to raise an error on np.inf, np.nan, pd.NA in X. The
        possibilities are:

        - True: Force all values of X to be finite.
        - False: accepts np.inf, np.nan, pd.NA in X.
        - 'allow-nan': accepts only np.nan and pd.NA values in X. Values cannot
          be infinite.

        .. versionadded:: 0.20
           ``force_all_finite`` accepts the string ``'allow-nan'``.

        .. versionchanged:: 0.23
           Accepts `pd.NA` and converts it into `np.nan`

    accept_large_sparse : bool
        Forwarded to the indices dtype validation.

    estimator_name : str, default=None
        The estimator name, used to construct the error message.

    input_name : str, default=""
        The data name used to construct the error message. In particular
        if `input_name` is "X" and the data has NaN values and
        allow_nan is False, the error message will link to the imputer
        documentation.

    Returns
    -------
    sparse_container_converted : sparse matrix or array
        Sparse container (matrix/array) that is ensured to have an allowed type.

    Raises
    ------
    TypeError
        If `accept_sparse` is False.
    ValueError
        If `accept_sparse` is an empty list/tuple or of an unsupported type.
    """
    if dtype is None:
        dtype = sparse_container.dtype

    # Remember the incoming container type before any format conversion.
    original_type_name = type(sparse_container).__name__
    format_converted = False

    if isinstance(accept_sparse, str):
        accept_sparse = [accept_sparse]

    # Indices dtype validation
    _check_large_sparse(sparse_container, accept_large_sparse)

    if accept_sparse is False:
        padded_input = " for " + input_name if input_name else ""
        raise TypeError(
            f"Sparse data was passed{padded_input}, but dense data is required. "
            "Use '.toarray()' to convert to a dense numpy array."
        )

    if isinstance(accept_sparse, (list, tuple)):
        if not accept_sparse:
            raise ValueError(
                "When providing 'accept_sparse' as a tuple or list, it must contain at "
                "least one string value."
            )
        # ensure correct sparse format
        if sparse_container.format not in accept_sparse:
            # create new with correct sparse
            sparse_container = sparse_container.asformat(accept_sparse[0])
            format_converted = True
    elif accept_sparse is not True:
        # any other type
        raise ValueError(
            "Parameter 'accept_sparse' should be a string, boolean or list of strings."
            f" You provided 'accept_sparse={accept_sparse}'."
        )

    if dtype != sparse_container.dtype:
        # convert dtype
        sparse_container = sparse_container.astype(dtype)
    elif copy and not format_converted:
        # force copy
        sparse_container = sparse_container.copy()

    if force_all_finite:
        if hasattr(sparse_container, "data"):
            _assert_all_finite(
                sparse_container.data,
                allow_nan=force_all_finite == "allow-nan",
                estimator_name=estimator_name,
                input_name=input_name,
            )
        else:
            warnings.warn(
                f"Can't check {sparse_container.format} sparse matrix for nan or inf.",
                stacklevel=2,
            )

    # TODO: Remove when the minimum version of SciPy supported is 1.12
    # With SciPy sparse arrays, conversion from DIA format to COO, CSR, or BSR
    # triggers the use of `np.int64` indices even if the data is such that it could
    # be more efficiently represented with `np.int32` indices.
    # https://github.com/scipy/scipy/issues/19245 Since not all scikit-learn
    # algorithms support large indices, the following code downcasts to `np.int32`
    # indices when it's safe to do so.
    if format_converted:
        # accept_sparse is specified to a specific format and a conversion occurred
        _preserve_dia_indices_dtype(
            sparse_container, original_type_name, accept_sparse[0]
        )

    return sparse_container
|
|
|
|
|
|
def _ensure_no_complex_data(array):
|
|
if (
|
|
hasattr(array, "dtype")
|
|
and array.dtype is not None
|
|
and hasattr(array.dtype, "kind")
|
|
and array.dtype.kind == "c"
|
|
):
|
|
raise ValueError("Complex data not supported\n{}\n".format(array))
|
|
|
|
|
|
def _check_estimator_name(estimator):
|
|
if estimator is not None:
|
|
if isinstance(estimator, str):
|
|
return estimator
|
|
else:
|
|
return estimator.__class__.__name__
|
|
return None
|
|
|
|
|
|
def _pandas_dtype_needs_early_conversion(pd_dtype):
|
|
"""Return True if pandas extension pd_dtype need to be converted early."""
|
|
# Check these early for pandas versions without extension dtypes
|
|
from pandas import SparseDtype
|
|
from pandas.api.types import (
|
|
is_bool_dtype,
|
|
is_float_dtype,
|
|
is_integer_dtype,
|
|
)
|
|
|
|
if is_bool_dtype(pd_dtype):
|
|
# bool and extension booleans need early conversion because __array__
|
|
# converts mixed dtype dataframes into object dtypes
|
|
return True
|
|
|
|
if isinstance(pd_dtype, SparseDtype):
|
|
# Sparse arrays will be converted later in `check_array`
|
|
return False
|
|
|
|
try:
|
|
from pandas.api.types import is_extension_array_dtype
|
|
except ImportError:
|
|
return False
|
|
|
|
if isinstance(pd_dtype, SparseDtype) or not is_extension_array_dtype(pd_dtype):
|
|
# Sparse arrays will be converted later in `check_array`
|
|
# Only handle extension arrays for integer and floats
|
|
return False
|
|
elif is_float_dtype(pd_dtype):
|
|
# Float ndarrays can normally support nans. They need to be converted
|
|
# first to map pd.NA to np.nan
|
|
return True
|
|
elif is_integer_dtype(pd_dtype):
|
|
# XXX: Warn when converting from a high integer to a float
|
|
return True
|
|
|
|
return False
|
|
|
|
|
|
def _is_extension_array_dtype(array):
|
|
# Pandas extension arrays have a dtype with an na_value
|
|
return hasattr(array, "dtype") and hasattr(array.dtype, "na_value")
|
|
|
|
|
|
def check_array(
|
|
array,
|
|
accept_sparse=False,
|
|
*,
|
|
accept_large_sparse=True,
|
|
dtype="numeric",
|
|
order=None,
|
|
copy=False,
|
|
force_all_finite=True,
|
|
ensure_2d=True,
|
|
allow_nd=False,
|
|
ensure_min_samples=1,
|
|
ensure_min_features=1,
|
|
estimator=None,
|
|
input_name="",
|
|
):
|
|
"""Input validation on an array, list, sparse matrix or similar.
|
|
|
|
By default, the input is checked to be a non-empty 2D array containing
|
|
only finite values. If the dtype of the array is object, attempt
|
|
converting to float, raising on failure.
|
|
|
|
Parameters
|
|
----------
|
|
array : object
|
|
Input object to check / convert.
|
|
|
|
accept_sparse : str, bool or list/tuple of str, default=False
|
|
String[s] representing allowed sparse matrix formats, such as 'csc',
|
|
'csr', etc. If the input is sparse but not in the allowed format,
|
|
it will be converted to the first listed format. True allows the input
|
|
to be any format. False means that a sparse matrix input will
|
|
raise an error.
|
|
|
|
accept_large_sparse : bool, default=True
|
|
If a CSR, CSC, COO or BSR sparse matrix is supplied and accepted by
|
|
accept_sparse, accept_large_sparse=False will cause it to be accepted
|
|
only if its indices are stored with a 32-bit dtype.
|
|
|
|
.. versionadded:: 0.20
|
|
|
|
dtype : 'numeric', type, list of type or None, default='numeric'
|
|
Data type of result. If None, the dtype of the input is preserved.
|
|
If "numeric", dtype is preserved unless array.dtype is object.
|
|
If dtype is a list of types, conversion on the first type is only
|
|
performed if the dtype of the input is not in the list.
|
|
|
|
order : {'F', 'C'} or None, default=None
|
|
Whether an array will be forced to be fortran or c-style.
|
|
When order is None (default), then if copy=False, nothing is ensured
|
|
about the memory layout of the output array; otherwise (copy=True)
|
|
the memory layout of the returned array is kept as close as possible
|
|
to the original array.
|
|
|
|
copy : bool, default=False
|
|
Whether a forced copy will be triggered. If copy=False, a copy might
|
|
be triggered by a conversion.
|
|
|
|
force_all_finite : bool or 'allow-nan', default=True
|
|
Whether to raise an error on np.inf, np.nan, pd.NA in array. The
|
|
possibilities are:
|
|
|
|
- True: Force all values of array to be finite.
|
|
- False: accepts np.inf, np.nan, pd.NA in array.
|
|
- 'allow-nan': accepts only np.nan and pd.NA values in array. Values
|
|
cannot be infinite.
|
|
|
|
.. versionadded:: 0.20
|
|
``force_all_finite`` accepts the string ``'allow-nan'``.
|
|
|
|
.. versionchanged:: 0.23
|
|
Accepts `pd.NA` and converts it into `np.nan`
|
|
|
|
ensure_2d : bool, default=True
|
|
Whether to raise a value error if array is not 2D.
|
|
|
|
allow_nd : bool, default=False
|
|
Whether to allow array.ndim > 2.
|
|
|
|
ensure_min_samples : int, default=1
|
|
Make sure that the array has a minimum number of samples in its first
|
|
axis (rows for a 2D array). Setting to 0 disables this check.
|
|
|
|
ensure_min_features : int, default=1
|
|
Make sure that the 2D array has some minimum number of features
|
|
(columns). The default value of 1 rejects empty datasets.
|
|
This check is only enforced when the input data has effectively 2
|
|
dimensions or is originally 1D and ``ensure_2d`` is True. Setting to 0
|
|
disables this check.
|
|
|
|
estimator : str or estimator instance, default=None
|
|
If passed, include the name of the estimator in warning messages.
|
|
|
|
input_name : str, default=""
|
|
The data name used to construct the error message. In particular
|
|
if `input_name` is "X" and the data has NaN values and
|
|
allow_nan is False, the error message will link to the imputer
|
|
documentation.
|
|
|
|
.. versionadded:: 1.1.0
|
|
|
|
Returns
|
|
-------
|
|
array_converted : object
|
|
The converted and validated array.
|
|
|
|
Examples
|
|
--------
|
|
>>> from sklearn.utils.validation import check_array
|
|
>>> X = [[1, 2, 3], [4, 5, 6]]
|
|
>>> X_checked = check_array(X)
|
|
>>> X_checked
|
|
array([[1, 2, 3], [4, 5, 6]])
|
|
"""
|
|
if isinstance(array, np.matrix):
|
|
raise TypeError(
|
|
"np.matrix is not supported. Please convert to a numpy array with "
|
|
"np.asarray. For more information see: "
|
|
"https://numpy.org/doc/stable/reference/generated/numpy.matrix.html"
|
|
)
|
|
|
|
xp, is_array_api_compliant = get_namespace(array)
|
|
|
|
# store reference to original array to check if copy is needed when
|
|
# function returns
|
|
array_orig = array
|
|
|
|
# store whether originally we wanted numeric dtype
|
|
dtype_numeric = isinstance(dtype, str) and dtype == "numeric"
|
|
|
|
dtype_orig = getattr(array, "dtype", None)
|
|
if not is_array_api_compliant and not hasattr(dtype_orig, "kind"):
|
|
# not a data type (e.g. a column named dtype in a pandas DataFrame)
|
|
dtype_orig = None
|
|
|
|
# check if the object contains several dtypes (typically a pandas
|
|
# DataFrame), and store them. If not, store None.
|
|
dtypes_orig = None
|
|
pandas_requires_conversion = False
|
|
# track if we have a Series-like object to raise a better error message
|
|
type_if_series = None
|
|
if hasattr(array, "dtypes") and hasattr(array.dtypes, "__array__"):
|
|
# throw warning if columns are sparse. If all columns are sparse, then
|
|
# array.sparse exists and sparsity will be preserved (later).
|
|
with suppress(ImportError):
|
|
from pandas import SparseDtype
|
|
|
|
def is_sparse(dtype):
|
|
return isinstance(dtype, SparseDtype)
|
|
|
|
if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
|
|
warnings.warn(
|
|
"pandas.DataFrame with sparse columns found."
|
|
"It will be converted to a dense numpy array."
|
|
)
|
|
|
|
dtypes_orig = list(array.dtypes)
|
|
pandas_requires_conversion = any(
|
|
_pandas_dtype_needs_early_conversion(i) for i in dtypes_orig
|
|
)
|
|
if all(isinstance(dtype_iter, np.dtype) for dtype_iter in dtypes_orig):
|
|
dtype_orig = np.result_type(*dtypes_orig)
|
|
elif pandas_requires_conversion and any(d == object for d in dtypes_orig):
|
|
# Force object if any of the dtypes is an object
|
|
dtype_orig = object
|
|
|
|
elif (_is_extension_array_dtype(array) or hasattr(array, "iloc")) and hasattr(
|
|
array, "dtype"
|
|
):
|
|
# array is a pandas series
|
|
type_if_series = type(array)
|
|
pandas_requires_conversion = _pandas_dtype_needs_early_conversion(array.dtype)
|
|
if isinstance(array.dtype, np.dtype):
|
|
dtype_orig = array.dtype
|
|
else:
|
|
# Set to None to let array.astype work out the best dtype
|
|
dtype_orig = None
|
|
|
|
if dtype_numeric:
|
|
if (
|
|
dtype_orig is not None
|
|
and hasattr(dtype_orig, "kind")
|
|
and dtype_orig.kind == "O"
|
|
):
|
|
# if input is object, convert to float.
|
|
dtype = xp.float64
|
|
else:
|
|
dtype = None
|
|
|
|
if isinstance(dtype, (list, tuple)):
|
|
if dtype_orig is not None and dtype_orig in dtype:
|
|
# no dtype conversion required
|
|
dtype = None
|
|
else:
|
|
# dtype conversion required. Let's select the first element of the
|
|
# list of accepted types.
|
|
dtype = dtype[0]
|
|
|
|
if pandas_requires_conversion:
|
|
# pandas dataframe requires conversion earlier to handle extension dtypes with
|
|
# nans
|
|
# Use the original dtype for conversion if dtype is None
|
|
new_dtype = dtype_orig if dtype is None else dtype
|
|
array = array.astype(new_dtype)
|
|
# Since we converted here, we do not need to convert again later
|
|
dtype = None
|
|
|
|
if force_all_finite not in (True, False, "allow-nan"):
|
|
raise ValueError(
|
|
'force_all_finite should be a bool or "allow-nan". Got {!r} instead'.format(
|
|
force_all_finite
|
|
)
|
|
)
|
|
|
|
if dtype is not None and _is_numpy_namespace(xp):
|
|
# convert to dtype object to conform to Array API to be use `xp.isdtype` later
|
|
dtype = np.dtype(dtype)
|
|
|
|
estimator_name = _check_estimator_name(estimator)
|
|
context = " by %s" % estimator_name if estimator is not None else ""
|
|
|
|
# When all dataframe columns are sparse, convert to a sparse array
|
|
if hasattr(array, "sparse") and array.ndim > 1:
|
|
with suppress(ImportError):
|
|
from pandas import SparseDtype # noqa: F811
|
|
|
|
def is_sparse(dtype):
|
|
return isinstance(dtype, SparseDtype)
|
|
|
|
if array.dtypes.apply(is_sparse).all():
|
|
# DataFrame.sparse only supports `to_coo`
|
|
array = array.sparse.to_coo()
|
|
if array.dtype == np.dtype("object"):
|
|
unique_dtypes = set([dt.subtype.name for dt in array_orig.dtypes])
|
|
if len(unique_dtypes) > 1:
|
|
raise ValueError(
|
|
"Pandas DataFrame with mixed sparse extension arrays "
|
|
"generated a sparse matrix with object dtype which "
|
|
"can not be converted to a scipy sparse matrix."
|
|
"Sparse extension arrays should all have the same "
|
|
"numeric type."
|
|
)
|
|
|
|
if sp.issparse(array):
|
|
_ensure_no_complex_data(array)
|
|
array = _ensure_sparse_format(
|
|
array,
|
|
accept_sparse=accept_sparse,
|
|
dtype=dtype,
|
|
copy=copy,
|
|
force_all_finite=force_all_finite,
|
|
accept_large_sparse=accept_large_sparse,
|
|
estimator_name=estimator_name,
|
|
input_name=input_name,
|
|
)
|
|
if ensure_2d and array.ndim < 2:
|
|
raise ValueError(
|
|
f"Expected 2D input, got input with shape {array.shape}.\n"
|
|
"Reshape your data either using array.reshape(-1, 1) if "
|
|
"your data has a single feature or array.reshape(1, -1) "
|
|
"if it contains a single sample."
|
|
)
|
|
else:
|
|
# If np.array(..) gives ComplexWarning, then we convert the warning
|
|
# to an error. This is needed because specifying a non complex
|
|
# dtype to the function converts complex to real dtype,
|
|
# thereby passing the test made in the lines following the scope
|
|
# of warnings context manager.
|
|
with warnings.catch_warnings():
|
|
try:
|
|
warnings.simplefilter("error", ComplexWarning)
|
|
if dtype is not None and xp.isdtype(dtype, "integral"):
|
|
# Conversion float -> int should not contain NaN or
|
|
# inf (numpy#14412). We cannot use casting='safe' because
|
|
# then conversion float -> int would be disallowed.
|
|
array = _asarray_with_order(array, order=order, xp=xp)
|
|
if xp.isdtype(array.dtype, ("real floating", "complex floating")):
|
|
_assert_all_finite(
|
|
array,
|
|
allow_nan=False,
|
|
msg_dtype=dtype,
|
|
estimator_name=estimator_name,
|
|
input_name=input_name,
|
|
)
|
|
array = xp.astype(array, dtype, copy=False)
|
|
else:
|
|
array = _asarray_with_order(array, order=order, dtype=dtype, xp=xp)
|
|
except ComplexWarning as complex_warning:
|
|
raise ValueError(
|
|
"Complex data not supported\n{}\n".format(array)
|
|
) from complex_warning
|
|
|
|
# It is possible that the np.array(..) gave no warning. This happens
|
|
# when no dtype conversion happened, for example dtype = None. The
|
|
# result is that np.array(..) produces an array of complex dtype
|
|
# and we need to catch and raise exception for such cases.
|
|
_ensure_no_complex_data(array)
|
|
|
|
if ensure_2d:
|
|
# If input is scalar raise error
|
|
if array.ndim == 0:
|
|
raise ValueError(
|
|
"Expected 2D array, got scalar array instead:\narray={}.\n"
|
|
"Reshape your data either using array.reshape(-1, 1) if "
|
|
"your data has a single feature or array.reshape(1, -1) "
|
|
"if it contains a single sample.".format(array)
|
|
)
|
|
# If input is 1D raise error
|
|
if array.ndim == 1:
|
|
# If input is a Series-like object (eg. pandas Series or polars Series)
|
|
if type_if_series is not None:
|
|
msg = (
|
|
f"Expected a 2-dimensional container but got {type_if_series} "
|
|
"instead. Pass a DataFrame containing a single row (i.e. "
|
|
"single sample) or a single column (i.e. single feature) "
|
|
"instead."
|
|
)
|
|
else:
|
|
msg = (
|
|
f"Expected 2D array, got 1D array instead:\narray={array}.\n"
|
|
"Reshape your data either using array.reshape(-1, 1) if "
|
|
"your data has a single feature or array.reshape(1, -1) "
|
|
"if it contains a single sample."
|
|
)
|
|
raise ValueError(msg)
|
|
|
|
if dtype_numeric and hasattr(array.dtype, "kind") and array.dtype.kind in "USV":
|
|
raise ValueError(
|
|
"dtype='numeric' is not compatible with arrays of bytes/strings."
|
|
"Convert your data to numeric values explicitly instead."
|
|
)
|
|
if not allow_nd and array.ndim >= 3:
|
|
raise ValueError(
|
|
"Found array with dim %d. %s expected <= 2."
|
|
% (array.ndim, estimator_name)
|
|
)
|
|
|
|
if force_all_finite:
|
|
_assert_all_finite(
|
|
array,
|
|
input_name=input_name,
|
|
estimator_name=estimator_name,
|
|
allow_nan=force_all_finite == "allow-nan",
|
|
)
|
|
|
|
if copy:
|
|
if _is_numpy_namespace(xp):
|
|
# only make a copy if `array` and `array_orig` may share memory
|
|
if np.may_share_memory(array, array_orig):
|
|
array = _asarray_with_order(
|
|
array, dtype=dtype, order=order, copy=True, xp=xp
|
|
)
|
|
else:
|
|
# always make a copy for non-numpy arrays
|
|
array = _asarray_with_order(
|
|
array, dtype=dtype, order=order, copy=True, xp=xp
|
|
)
|
|
|
|
if ensure_min_samples > 0:
|
|
n_samples = _num_samples(array)
|
|
if n_samples < ensure_min_samples:
|
|
raise ValueError(
|
|
"Found array with %d sample(s) (shape=%s) while a"
|
|
" minimum of %d is required%s."
|
|
% (n_samples, array.shape, ensure_min_samples, context)
|
|
)
|
|
|
|
if ensure_min_features > 0 and array.ndim == 2:
|
|
n_features = array.shape[1]
|
|
if n_features < ensure_min_features:
|
|
raise ValueError(
|
|
"Found array with %d feature(s) (shape=%s) while"
|
|
" a minimum of %d is required%s."
|
|
% (n_features, array.shape, ensure_min_features, context)
|
|
)
|
|
|
|
# With an input pandas dataframe or series, we know we can always make the
|
|
# resulting array writeable:
|
|
# - if copy=True, we have already made a copy so it is fine to make the
|
|
# array writeable
|
|
# - if copy=False, the caller is telling us explicitly that we can do
|
|
# in-place modifications
|
|
# See https://pandas.pydata.org/docs/dev/user_guide/copy_on_write.html#read-only-numpy-arrays
|
|
# for more details about pandas copy-on-write mechanism, that is enabled by
|
|
# default in pandas 3.0.0.dev.
|
|
if _is_pandas_df_or_series(array_orig) and hasattr(array, "flags"):
|
|
array.flags.writeable = True
|
|
|
|
return array
|
|
|
|
|
|
def _check_large_sparse(X, accept_large_sparse=False):
    """Reject sparse input whose index arrays are not stored as int32.

    Nothing is checked when `accept_large_sparse` is truthy, nor when
    ``X.format`` is not one of "coo", "csr", "csc" or "bsr".

    Parameters
    ----------
    X : sparse matrix
        Sparse container exposing a ``format`` attribute and the index
        arrays matching that format (``col``/``row`` for "coo",
        ``indices``/``indptr`` for "csr", "csc" and "bsr").

    accept_large_sparse : bool, default=False
        If True, the check is skipped entirely.

    Raises
    ------
    ValueError
        If one of the inspected index arrays has a dtype other than int32.
    """
    if accept_large_sparse:
        return

    # Names of the index arrays to inspect, in inspection order.
    sparse_format = X.format
    if sparse_format == "coo":
        index_attributes = ("col", "row")
    elif sparse_format in ("csr", "csc", "bsr"):
        index_attributes = ("indices", "indptr")
    else:
        # Other formats are not inspected.
        return

    accepted_index_dtypes = ("int32",)
    for attribute in index_attributes:
        index_dtype = getattr(X, attribute).dtype
        if index_dtype not in accepted_index_dtypes:
            raise ValueError(
                "Only sparse matrices with 32-bit integer indices are accepted."
                f" Got {index_dtype} indices. Please do report a minimal"
                " reproducer on scikit-learn issue tracker so that support for"
                " your use-case can be studied by maintainers. See:"
                " https://scikit-learn.org/dev/developers/minimal_reproducer.html"
            )
|
|
|
|
|
|
def check_X_y(
    X,
    y,
    accept_sparse=False,
    *,
    accept_large_sparse=True,
    dtype="numeric",
    order=None,
    copy=False,
    force_all_finite=True,
    ensure_2d=True,
    allow_nd=False,
    multi_output=False,
    ensure_min_samples=1,
    ensure_min_features=1,
    y_numeric=False,
    estimator=None,
):
    """Input validation for standard estimators.

    Validates X with :func:`check_array` and y with a dedicated routine, then
    checks that X and y have consistent lengths. By default X is enforced to
    be 2D, non-empty and to contain only finite values, while y is enforced
    to be a finite 1D vector. For multi-label y, set ``multi_output=True`` to
    allow 2D and sparse y. If the dtype of X is object, a conversion to float
    is attempted, raising on failure.

    Parameters
    ----------
    X : {ndarray, list, sparse matrix}
        Input data.

    y : {ndarray, list, sparse matrix}
        Labels.

    accept_sparse : str, bool or list of str, default=False
        String[s] representing allowed sparse matrix formats, such as 'csc',
        'csr', etc. If the input is sparse but not in the allowed format,
        it will be converted to the first listed format. True allows the input
        to be any format. False means that a sparse matrix input will
        raise an error.

    accept_large_sparse : bool, default=True
        If a CSR, CSC, COO or BSR sparse matrix is supplied and accepted by
        accept_sparse, ``accept_large_sparse=False`` will cause it to be
        accepted only if its indices are stored with a 32-bit dtype.

        .. versionadded:: 0.20

    dtype : 'numeric', type, list of type or None, default='numeric'
        Data type of result. If None, the dtype of the input is preserved.
        If "numeric", dtype is preserved unless array.dtype is object.
        If dtype is a list of types, conversion on the first type is only
        performed if the dtype of the input is not in the list.

    order : {'F', 'C'}, default=None
        Whether an array will be forced to be fortran or c-style. If
        `None`, then the input data's order is preserved when possible.

    copy : bool, default=False
        Whether a forced copy will be triggered. If copy=False, a copy might
        be triggered by a conversion.

    force_all_finite : bool or 'allow-nan', default=True
        Whether to raise an error on np.inf, np.nan, pd.NA in X. This parameter
        does not influence whether y can have np.inf, np.nan, pd.NA values.
        The possibilities are:

        - True: Force all values of X to be finite.
        - False: accepts np.inf, np.nan, pd.NA in X.
        - 'allow-nan': accepts only np.nan or pd.NA values in X. Values cannot
          be infinite.

        .. versionadded:: 0.20
           ``force_all_finite`` accepts the string ``'allow-nan'``.

        .. versionchanged:: 0.23
           Accepts `pd.NA` and converts it into `np.nan`

    ensure_2d : bool, default=True
        Whether to raise a value error if X is not 2D.

    allow_nd : bool, default=False
        Whether to allow X.ndim > 2.

    multi_output : bool, default=False
        Whether to allow 2D y (array or sparse matrix). If false, y will be
        validated as a vector. y cannot have np.nan or np.inf values if
        multi_output=True.

    ensure_min_samples : int, default=1
        Make sure that X has a minimum number of samples in its first
        axis (rows for a 2D array).

    ensure_min_features : int, default=1
        Make sure that the 2D array has some minimum number of features
        (columns). The default value of 1 rejects empty datasets.
        This check is only enforced when X has effectively 2 dimensions or
        is originally 1D and ``ensure_2d`` is True. Setting to 0 disables
        this check.

    y_numeric : bool, default=False
        Whether to ensure that y has a numeric type. If dtype of y is object,
        it is converted to float64. Should only be used for regression
        algorithms.

    estimator : str or estimator instance, default=None
        If passed, include the name of the estimator in warning messages.

    Returns
    -------
    X_converted : object
        The converted and validated X.

    y_converted : object
        The converted and validated y.

    Raises
    ------
    ValueError
        If `y` is None, or if one of the validation steps rejects X or y.

    Examples
    --------
    >>> from sklearn.utils.validation import check_X_y
    >>> X = [[1, 2], [3, 4], [5, 6]]
    >>> y = [1, 2, 3]
    >>> X, y = check_X_y(X, y)
    >>> X
    array([[1, 2],
           [3, 4],
           [5, 6]])
    >>> y
    array([1, 2, 3])
    """
    if y is None:
        # Fall back to a generic name when no estimator was given.
        estimator_name = (
            "estimator" if estimator is None else _check_estimator_name(estimator)
        )
        raise ValueError(
            f"{estimator_name} requires y to be passed, but the target y is None"
        )

    # Every option that concerns X is forwarded unchanged to check_array.
    x_validation_options = dict(
        accept_sparse=accept_sparse,
        accept_large_sparse=accept_large_sparse,
        dtype=dtype,
        order=order,
        copy=copy,
        force_all_finite=force_all_finite,
        ensure_2d=ensure_2d,
        allow_nd=allow_nd,
        ensure_min_samples=ensure_min_samples,
        ensure_min_features=ensure_min_features,
        estimator=estimator,
    )
    X = check_array(X, input_name="X", **x_validation_options)

    y = _check_y(y, multi_output=multi_output, y_numeric=y_numeric, estimator=estimator)

    check_consistent_length(X, y)

    return X, y
|
|
|
|
|
|
def _check_y(y, multi_output=False, y_numeric=False, estimator=None):
    """Validate the target `y`; isolated part of `check_X_y`.

    Parameters
    ----------
    y : array-like or sparse matrix
        Target to validate.

    multi_output : bool, default=False
        If True, `y` is validated with `check_array` (CSR sparse input
        accepted, no 2D requirement, dtype preserved, finite values
        required). If False, `y` is converted with `column_or_1d` and then
        checked for non-finite values and complex data.

    y_numeric : bool, default=False
        If True and the validated `y` has an object dtype, it is cast to
        float64.

    estimator : str or estimator instance, default=None
        Forwarded to the validation helpers for use in their messages.

    Returns
    -------
    y : object
        The validated (and possibly converted) target.
    """
    if not multi_output:
        estimator_name = _check_estimator_name(estimator)
        y = column_or_1d(y, warn=True)
        _assert_all_finite(y, input_name="y", estimator_name=estimator_name)
        _ensure_no_complex_data(y)
    else:
        y = check_array(
            y,
            accept_sparse="csr",
            force_all_finite=True,
            ensure_2d=False,
            dtype=None,
            input_name="y",
            estimator=estimator,
        )

    # Dtypes without a `kind` attribute are never treated as object dtype.
    if y_numeric and getattr(y.dtype, "kind", None) == "O":
        y = y.astype(np.float64)

    return y
|
|
|
|
|
|
def column_or_1d(y, *, dtype=None, warn=False):
    """Ravel column or 1d numpy array, else raises an error.

    Parameters
    ----------
    y : array-like
       Input data.

    dtype : data-type, default=None
        Data type for `y`.

        .. versionadded:: 1.2

    warn : bool, default=False
       Whether to emit a `DataConversionWarning` when `y` is given as a
       column vector, i.e. with shape (n_samples, 1).

    Returns
    -------
    y : ndarray
       Output data, flattened to one dimension.

    Raises
    ------
    ValueError
        If `y` is neither a 1D array nor a 2D array with a single column.

    Examples
    --------
    >>> from sklearn.utils.validation import column_or_1d
    >>> column_or_1d([1, 1])
    array([1, 1])
    """
    xp, _ = get_namespace(y)
    y = check_array(
        y,
        ensure_2d=False,
        dtype=dtype,
        input_name="y",
        force_all_finite=False,
        ensure_min_samples=0,
    )

    shape = y.shape
    n_dims = len(shape)
    is_column_vector = n_dims == 2 and shape[1] == 1

    if n_dims != 1 and not is_column_vector:
        raise ValueError(
            "y should be a 1d array, got an array of shape {} instead.".format(shape)
        )

    if is_column_vector and warn:
        warnings.warn(
            (
                "A column-vector y was passed when a 1d array was"
                " expected. Please change the shape of y to "
                "(n_samples, ), for example using ravel()."
            ),
            DataConversionWarning,
            stacklevel=2,
        )

    # Both accepted layouts are flattened the same way.
    flattened = xp.reshape(y, (-1,))
    return _asarray_with_order(flattened, order="C", xp=xp)
|
|
|
|
|
|
def check_random_state(seed):
    """Turn seed into a np.random.RandomState instance.

    Parameters
    ----------
    seed : None, int or instance of RandomState
        If seed is None (or the `np.random` module itself), return the
        RandomState singleton used by np.random.
        If seed is an int, return a new RandomState instance seeded with seed.
        If seed is already a RandomState instance, return it.
        Otherwise raise ValueError.

    Returns
    -------
    :class:`numpy:numpy.random.RandomState`
        The random state object based on `seed` parameter.

    Raises
    ------
    ValueError
        If `seed` is none of the supported kinds of input.

    Examples
    --------
    >>> from sklearn.utils.validation import check_random_state
    >>> check_random_state(42)
    RandomState(MT19937) at 0x...
    """
    if seed is None or seed is np.random:
        return np.random.mtrand._rand
    if isinstance(seed, numbers.Integral):
        return np.random.RandomState(seed)
    if isinstance(seed, np.random.RandomState):
        return seed
    # An f-string is used instead of ``"%r ..." % seed``: the latter unpacks a
    # tuple seed, raising TypeError (or misreporting a 1-tuple) instead of the
    # documented ValueError.
    raise ValueError(
        f"{seed!r} cannot be used to seed a numpy.random.RandomState instance"
    )
|
|
|
|
|
|
def has_fit_parameter(estimator, parameter):
    """Check whether the estimator's fit method supports the given parameter.

    Parameters
    ----------
    estimator : object
        An estimator to inspect.

    parameter : str
        The searched parameter.

    Returns
    -------
    is_parameter : bool
        Whether the parameter was found to be a named parameter of the
        estimator's fit method.

    Examples
    --------
    >>> from sklearn.svm import SVC
    >>> from sklearn.utils.validation import has_fit_parameter
    >>> has_fit_parameter(SVC(), "sample_weight")
    True
    """
    # Membership is tested against the names in the signature of `fit`.
    fit_parameters = signature(estimator.fit).parameters
    return parameter in fit_parameters
|
|
|
|
|
|
def check_symmetric(array, *, tol=1e-10, raise_warning=True, raise_exception=False):
    """Make sure that array is 2D, square and symmetric.

    If the array is not symmetric, then a symmetrized version is returned.
    Optionally, a warning or exception is raised if the matrix is not
    symmetric.

    Parameters
    ----------
    array : {ndarray, sparse matrix}
        Input object to check / convert. Must be two-dimensional and square,
        otherwise a ValueError will be raised.

    tol : float, default=1e-10
        Absolute tolerance for equivalence of arrays. For dense input it is
        passed as ``atol`` to :func:`numpy.allclose`; for sparse input every
        stored entry of ``array - array.T`` must be smaller than `tol` in
        absolute value.

    raise_warning : bool, default=True
        If True then raise a warning if conversion is required.

    raise_exception : bool, default=False
        If True then raise an exception if array is not symmetric.

    Returns
    -------
    array_sym : {ndarray, sparse matrix}
        The input itself when it is already symmetric; otherwise the
        symmetrized version, i.e. the average of array and array.transpose().
        A sparse result is returned in the format of the input.

    Raises
    ------
    ValueError
        If `array` is not 2D and square, or if it is not symmetric and
        `raise_exception` is True.

    Examples
    --------
    >>> import numpy as np
    >>> from sklearn.utils.validation import check_symmetric
    >>> symmetric_array = np.array([[0, 1, 2], [1, 0, 1], [2, 1, 0]])
    >>> check_symmetric(symmetric_array)
    array([[0, 1, 2],
           [1, 0, 1],
           [2, 1, 0]])
    >>> from scipy.sparse import csr_matrix
    >>> sparse_symmetric_array = csr_matrix(symmetric_array)
    >>> check_symmetric(sparse_symmetric_array)
    <3x3 sparse matrix of type '<class 'numpy.int64'>'
        with 6 stored elements in Compressed Sparse Row format>
    """
    if array.ndim != 2 or array.shape[0] != array.shape[1]:
        raise ValueError(
            "array must be 2-dimensional and square. shape = {0}".format(array.shape)
        )

    input_is_sparse = sp.issparse(array)
    if input_is_sparse:
        asymmetry = array - array.T
        # only csr, csc, and coo have `data` attribute
        if asymmetry.format not in ("csr", "csc", "coo"):
            asymmetry = asymmetry.tocsr()
        symmetric = np.all(abs(asymmetry.data) < tol)
    else:
        symmetric = np.allclose(array, array.T, atol=tol)

    if symmetric:
        return array

    if raise_exception:
        raise ValueError("Array must be symmetric")
    if raise_warning:
        warnings.warn(
            (
                "Array is not symmetric, and will be converted "
                "to symmetric by average with its transpose."
            ),
            stacklevel=2,
        )

    averaged = 0.5 * (array + array.T)
    if not input_is_sparse:
        return averaged

    # Convert the averaged matrix back to the sparse format of the input.
    to_input_format = getattr(averaged, "to" + array.format)
    return to_input_format()
|
|
|
|
|
|
def _is_fitted(estimator, attributes=None, all_or_any=all):
    """Determine if an estimator is fitted

    Parameters
    ----------
    estimator : estimator instance
        Estimator instance for which the check is performed.

    attributes : str, list or tuple of str, default=None
        Attribute name(s) given as string or a list/tuple of strings
        Eg.: ``["coef_", "estimator_", ...], "coef_"``

        If `None`, the result of ``estimator.__sklearn_is_fitted__()`` is
        returned when that method exists; otherwise `estimator` is
        considered fitted if there exist an attribute that ends with a
        underscore and does not start with double underscore.

    all_or_any : callable, {all, any}, default=all
        Specify whether all or any of the given attributes must exist.

    Returns
    -------
    fitted : bool
        Whether the estimator is fitted.
    """
    if attributes is None:
        if hasattr(estimator, "__sklearn_is_fitted__"):
            return estimator.__sklearn_is_fitted__()

        # Look for an instance attribute such as ``coef_``; names starting
        # with a double underscore do not count.
        return any(
            name.endswith("_") and not name.startswith("__")
            for name in vars(estimator)
        )

    # A single attribute name is treated as a one-element list.
    if not isinstance(attributes, (list, tuple)):
        attributes = [attributes]
    return all_or_any([hasattr(estimator, attribute) for attribute in attributes])
|
|
|
|
|
|
def check_is_fitted(estimator, attributes=None, *, msg=None, all_or_any=all):
    """Perform is_fitted validation for estimator.

    Checks if the estimator is fitted by verifying the presence of
    fitted attributes (ending with a trailing underscore) and otherwise
    raises a NotFittedError with the given message.

    If an estimator does not set any attributes with a trailing underscore, it
    can define a ``__sklearn_is_fitted__`` method returning a boolean to
    specify if the estimator is fitted or not. See
    :ref:`sphx_glr_auto_examples_developing_estimators_sklearn_is_fitted.py`
    for an example on how to use the API.

    Parameters
    ----------
    estimator : estimator instance
        Estimator instance for which the check is performed.

    attributes : str, list or tuple of str, default=None
        Attribute name(s) given as string or a list/tuple of strings
        Eg.: ``["coef_", "estimator_", ...], "coef_"``

        If `None`, `estimator` is considered fitted if there exist an
        attribute that ends with a underscore and does not start with double
        underscore.

    msg : str, default=None
        The default error message is, "This %(name)s instance is not fitted
        yet. Call 'fit' with appropriate arguments before using this
        estimator."

        For custom messages if "%(name)s" is present in the message string,
        it is substituted for the estimator name.

        Eg. : "Estimator, %(name)s, must be fitted before sparsifying".

    all_or_any : callable, {all, any}, default=all
        Specify whether all or any of the given attributes must exist.

    Raises
    ------
    TypeError
        If the estimator is a class or not an estimator instance

    NotFittedError
        If the attributes are not found.

    Examples
    --------
    >>> from sklearn.linear_model import LogisticRegression
    >>> from sklearn.utils.validation import check_is_fitted
    >>> from sklearn.exceptions import NotFittedError
    >>> lr = LogisticRegression()
    >>> try:
    ...     check_is_fitted(lr)
    ... except NotFittedError as exc:
    ...     print("Model is not fitted yet.")
    Model is not fitted yet.
    >>> lr.fit([[1, 2], [1, 3]], [1, 0])
    LogisticRegression()
    >>> check_is_fitted(lr)
    """
    if isclass(estimator):
        raise TypeError("{} is a class, not an instance.".format(estimator))
    if msg is None:
        msg = (
            "This %(name)s instance is not fitted yet. Call 'fit' with "
            "appropriate arguments before using this estimator."
        )

    if not hasattr(estimator, "fit"):
        # Wrap the operand in a 1-tuple: a bare ``% estimator`` would unpack a
        # tuple argument and either fail inside the string formatting or
        # report only its single element.
        raise TypeError("%s is not an estimator instance." % (estimator,))

    if not _is_fitted(estimator, attributes, all_or_any):
        raise NotFittedError(msg % {"name": type(estimator).__name__})
|
|
|
|
|
|
def check_non_negative(X, whom):
    """
    Check if there is any negative value in an array.

    Parameters
    ----------
    X : {array-like, sparse matrix}
        Input data.

    whom : str
        Who passed X to this function.

    Raises
    ------
    ValueError
        If the smallest value found in `X` is negative.
    """
    xp, _ = get_namespace(X)

    if not sp.issparse(X):
        smallest = xp.min(X)
    else:
        # avoid X.min() on sparse matrix since it also sorts the indices
        if X.format in ("lil", "dok"):
            X = X.tocsr()
        stored_values = X.data
        # With no stored value there is nothing negative to find.
        smallest = stored_values.min() if stored_values.size != 0 else 0

    if smallest < 0:
        raise ValueError("Negative values in data passed to %s" % whom)
|
|
|
|
|
|
def check_scalar(
    x,
    name,
    target_type,
    *,
    min_val=None,
    max_val=None,
    include_boundaries="both",
):
    """Validate scalar parameters type and value.

    Parameters
    ----------
    x : object
        The scalar parameter to validate.

    name : str
        The name of the parameter to be printed in error messages.

    target_type : type or tuple
        Acceptable data types for the parameter.

    min_val : float or int, default=None
        The minimum valid value the parameter can take. If None (default) it
        is implied that the parameter does not have a lower bound.

    max_val : float or int, default=None
        The maximum valid value the parameter can take. If None (default) it
        is implied that the parameter does not have an upper bound.

    include_boundaries : {"left", "right", "both", "neither"}, default="both"
        Whether the interval defined by `min_val` and `max_val` should include
        the boundaries. Possible choices are:

        - `"left"`: only `min_val` is included in the valid interval.
          It is equivalent to the interval `[ min_val, max_val )`.
        - `"right"`: only `max_val` is included in the valid interval.
          It is equivalent to the interval `( min_val, max_val ]`.
        - `"both"`: `min_val` and `max_val` are included in the valid interval.
          It is equivalent to the interval `[ min_val, max_val ]`.
        - `"neither"`: neither `min_val` nor `max_val` are included in the
          valid interval. It is equivalent to the interval `( min_val, max_val )`.

    Returns
    -------
    x : numbers.Number
        The validated number.

    Raises
    ------
    TypeError
        If the parameter's type does not match the desired type.

    ValueError
        If the parameter's value violates the given bounds.
        If `min_val`, `max_val` and `include_boundaries` are inconsistent.

    Examples
    --------
    >>> from sklearn.utils.validation import check_scalar
    >>> check_scalar(10, "x", int, min_val=1, max_val=20)
    10
    """

    def describe_type(type_):
        """Convert type into human readable string."""
        module_name = type_.__module__
        qualified_name = type_.__qualname__
        if module_name == "builtins":
            return qualified_name
        if type_ == numbers.Real:
            return "float"
        if type_ == numbers.Integral:
            return "int"
        return f"{module_name}.{qualified_name}"

    if not isinstance(x, target_type):
        if isinstance(target_type, tuple):
            # Several accepted types are rendered as "{type1, type2}".
            expected_types = "{" + ", ".join(map(describe_type, target_type)) + "}"
        else:
            expected_types = describe_type(target_type)

        raise TypeError(
            f"{name} must be an instance of {expected_types}, not"
            f" {type(x).__qualname__}."
        )

    valid_boundaries = ("left", "right", "both", "neither")
    if include_boundaries not in valid_boundaries:
        raise ValueError(
            f"Unknown value for `include_boundaries`: {repr(include_boundaries)}. "
            f"Possible values are: {valid_boundaries}."
        )

    # A one-sided inclusive interval needs the matching bound to be given.
    if include_boundaries == "right" and max_val is None:
        raise ValueError(
            "`include_boundaries`='right' without specifying explicitly `max_val` "
            "is inconsistent."
        )

    if include_boundaries == "left" and min_val is None:
        raise ValueError(
            "`include_boundaries`='left' without specifying explicitly `min_val` "
            "is inconsistent."
        )

    lower_inclusive = include_boundaries in ("left", "both")
    upper_inclusive = include_boundaries in ("right", "both")

    if min_val is not None:
        below_range = x < min_val if lower_inclusive else x <= min_val
        if below_range:
            lower_symbol = ">=" if lower_inclusive else ">"
            raise ValueError(f"{name} == {x}, must be {lower_symbol} {min_val}.")

    if max_val is not None:
        above_range = x > max_val if upper_inclusive else x >= max_val
        if above_range:
            upper_symbol = "<=" if upper_inclusive else "<"
            raise ValueError(f"{name} == {x}, must be {upper_symbol} {max_val}.")

    return x
|
|
|
|
|
|
def _check_psd_eigenvalues(lambdas, enable_warnings=False):
    """Check the eigenvalues of a positive semidefinite (PSD) matrix.

    Checks the provided array of PSD matrix eigenvalues for numerical or
    conditioning issues and returns a fixed validated version. This method
    should typically be used if the PSD matrix is user-provided (e.g. a
    Gram matrix) or computed using a user-provided dissimilarity metric
    (e.g. kernel function), or if the decomposition process uses approximation
    methods (randomized SVD, etc.).

    The thresholds below are given for float64 input; the values in
    parentheses apply to input of any other dtype.

    It checks for three things:

    - that there are no significant imaginary parts in eigenvalues (more than
      1e-5 times the maximum real part). If this check fails, it raises a
      ``ValueError``. Otherwise all non-significant imaginary parts that may
      remain are set to zero. This operation is traced with a
      ``PositiveSpectrumWarning`` when ``enable_warnings=True``.

    - that eigenvalues are not all negative. If this check fails, it raises a
      ``ValueError``

    - that there are no significant negative eigenvalues with absolute value
      more than 1e-10 (1e-6) and more than 1e-5 (5e-3) times the largest
      positive eigenvalue. If this check fails, it raises a ``ValueError``.
      Otherwise all negative eigenvalues that may remain are set to zero.
      This operation is traced with a ``PositiveSpectrumWarning`` when
      ``enable_warnings=True``.

    Finally, all the positive eigenvalues that are too small (with a value
    smaller than the maximum eigenvalue multiplied by 1e-12 (2e-7)) are set to
    zero. This operation is traced with a ``PositiveSpectrumWarning`` when
    ``enable_warnings=True``.

    Parameters
    ----------
    lambdas : array-like of shape (n_eigenvalues,)
        Array of eigenvalues to check / fix.

    enable_warnings : bool, default=False
        When this is set to ``True``, a ``PositiveSpectrumWarning`` will be
        raised when there are imaginary parts, negative eigenvalues, or
        extremely small non-zero eigenvalues. Otherwise no warning will be
        raised. In both cases, imaginary parts, negative eigenvalues, and
        extremely small non-zero eigenvalues will be set to zero.

    Returns
    -------
    lambdas_fixed : ndarray of shape (n_eigenvalues,)
        A fixed validated copy of the array of eigenvalues.

    Examples
    --------
    >>> from sklearn.utils.validation import _check_psd_eigenvalues
    >>> _check_psd_eigenvalues([1, 2])      # nominal case
    array([1, 2])
    >>> _check_psd_eigenvalues([5, 5j])     # significant imag part
    Traceback (most recent call last):
        ...
    ValueError: There are significant imaginary parts in eigenvalues (1
        of the maximum real part). Either the matrix is not PSD, or there was
        an issue while computing the eigendecomposition of the matrix.
    >>> _check_psd_eigenvalues([5, 5e-5j])  # insignificant imag part
    array([5., 0.])
    >>> _check_psd_eigenvalues([-5, -1])    # all negative
    Traceback (most recent call last):
        ...
    ValueError: All eigenvalues are negative (maximum is -1). Either the
        matrix is not PSD, or there was an issue while computing the
        eigendecomposition of the matrix.
    >>> _check_psd_eigenvalues([5, -1])     # significant negative
    Traceback (most recent call last):
        ...
    ValueError: There are significant negative eigenvalues (0.2 of the
        maximum positive). Either the matrix is not PSD, or there was an issue
        while computing the eigendecomposition of the matrix.
    >>> _check_psd_eigenvalues([5, -5e-5])  # insignificant negative
    array([5., 0.])
    >>> _check_psd_eigenvalues([5, 4e-12])  # bad conditioning (too small)
    array([5., 0.])

    """
    # np.array copies, so the fixes below never touch the caller's data.
    lambdas = np.array(lambdas)

    # note: the minimum value available is
    #  - single-precision: np.finfo('float32').eps = 1.2e-07
    #  - double-precision: np.finfo('float64').eps = 2.2e-16

    # Validation thresholds; all but the imaginary ratio depend on whether
    # the input is float64.
    significant_imag_ratio = 1e-5
    if lambdas.dtype == np.float64:
        significant_neg_ratio, significant_neg_value, small_pos_ratio = (
            1e-5,
            1e-10,
            1e-12,
        )
    else:
        significant_neg_ratio, significant_neg_value, small_pos_ratio = (
            5e-3,
            1e-6,
            2e-7,
        )

    # Step 1: imaginary parts must be negligible compared to the real parts.
    if not np.isreal(lambdas).all():
        max_imag_abs = np.abs(np.imag(lambdas)).max()
        max_real_abs = np.abs(np.real(lambdas)).max()
        if max_imag_abs > significant_imag_ratio * max_real_abs:
            significant_imag_template = (
                "There are significant imaginary parts in eigenvalues (%g "
                "of the maximum real part). Either the matrix is not PSD, or "
                "there was an issue while computing the eigendecomposition "
                "of the matrix."
            )
            raise ValueError(
                significant_imag_template % (max_imag_abs / max_real_abs)
            )

        # warn about imaginary parts being removed
        if enable_warnings:
            dropped_imag_template = (
                "There are imaginary parts in eigenvalues (%g "
                "of the maximum real part). Either the matrix is not"
                " PSD, or there was an issue while computing the "
                "eigendecomposition of the matrix. Only the real "
                "parts will be kept."
            )
            warnings.warn(
                dropped_imag_template % (max_imag_abs / max_real_abs),
                PositiveSpectrumWarning,
            )

    # Remove all imaginary parts (even if zero)
    lambdas = np.real(lambdas)

    # Step 2: the spectrum must not be entirely negative.
    max_eig = lambdas.max()
    if max_eig < 0:
        raise ValueError(
            "All eigenvalues are negative (maximum is %g). "
            "Either the matrix is not PSD, or there was an "
            "issue while computing the eigendecomposition of "
            "the matrix." % max_eig
        )

    # Step 3: negative eigenvalues are only tolerated when they are small
    # both relative to the largest eigenvalue and in absolute value.
    min_eig = lambdas.min()
    if (
        min_eig < -significant_neg_ratio * max_eig
        and min_eig < -significant_neg_value
    ):
        raise ValueError(
            "There are significant negative eigenvalues (%g"
            " of the maximum positive). Either the matrix is "
            "not PSD, or there was an issue while computing "
            "the eigendecomposition of the matrix." % (-min_eig / max_eig)
        )
    if min_eig < 0:
        # Remove all negative values and warn about it
        if enable_warnings:
            clipped_negative_template = (
                "There are negative eigenvalues (%g of the "
                "maximum positive). Either the matrix is not "
                "PSD, or there was an issue while computing the"
                " eigendecomposition of the matrix. Negative "
                "eigenvalues will be replaced with 0."
            )
            warnings.warn(
                clipped_negative_template % (-min_eig / max_eig),
                PositiveSpectrumWarning,
            )
        lambdas[lambdas < 0] = 0

    # Step 4: conditioning, i.e. positive values that are tiny compared to
    # the largest eigenvalue.
    too_small_lambdas = (0 < lambdas) & (lambdas < small_pos_ratio * max_eig)
    if too_small_lambdas.any():
        if enable_warnings:
            warnings.warn(
                "Badly conditioned PSD matrix spectrum: the largest "
                "eigenvalue is more than %g times the smallest. "
                "Small eigenvalues will be replaced with 0."
                "" % (1 / small_pos_ratio),
                PositiveSpectrumWarning,
            )
        lambdas[too_small_lambdas] = 0

    return lambdas
|
|
|
|
|
|
def _check_sample_weight(
    sample_weight, X, dtype=None, copy=False, only_non_negative=False
):
    """Validate sample weights and return them as a 1D array.

    Passing ``sample_weight=None`` yields an array of ones, so callers that
    need to distinguish "no weights" from "uniform weights" should guard the
    call themselves:

        if sample_weight is not None:
            sample_weight = _check_sample_weight(...)

    Parameters
    ----------
    sample_weight : {ndarray, Number or None}, shape (n_samples,)
        Input sample weights. A scalar is broadcast to every sample.

    X : {ndarray, list, sparse matrix}
        Input data; only used to determine the number of samples.

    dtype : dtype, default=None
        dtype of the validated `sample_weight`.
        If None, and the input `sample_weight` is an array, the dtype of the
        input is preserved; otherwise an array with the default numpy dtype
        is allocated. If `dtype` is not one of `float32`, `float64`,
        `None`, the output will be of dtype `float64`.

    copy : bool, default=False
        If True, a copy of sample_weight will be created.

    only_non_negative : bool, default=False
        Whether or not the weights are expected to be non-negative.

        .. versionadded:: 1.0

    Returns
    -------
    sample_weight : ndarray of shape (n_samples,)
        Validated sample weight. It is guaranteed to be "C" contiguous.

    Raises
    ------
    ValueError
        If array-like weights are not 1D or do not have exactly
        `n_samples` entries.
    """
    n_samples = _num_samples(X)

    # Any explicitly requested dtype outside float32/float64 falls back to
    # float64.
    if not (dtype is None or dtype in (np.float32, np.float64)):
        dtype = np.float64

    if sample_weight is None:
        weights = np.ones(n_samples, dtype=dtype)
    elif isinstance(sample_weight, numbers.Number):
        weights = np.full(n_samples, sample_weight, dtype=dtype)
    else:
        # With no explicit dtype, keep float64/float32 inputs untouched and
        # convert everything else to float64 (first entry of the list).
        accepted_dtype = [np.float64, np.float32] if dtype is None else dtype
        weights = check_array(
            sample_weight,
            accept_sparse=False,
            ensure_2d=False,
            dtype=accepted_dtype,
            order="C",
            copy=copy,
            input_name="sample_weight",
        )
        if weights.ndim != 1:
            raise ValueError("Sample weights must be 1D array or scalar")

        expected_shape = (n_samples,)
        if weights.shape != expected_shape:
            raise ValueError(
                f"sample_weight.shape == {weights.shape}, expected {expected_shape}!"
            )

    if only_non_negative:
        check_non_negative(weights, "`sample_weight`")

    return weights
|
|
|
|
|
|
def _allclose_dense_sparse(x, y, rtol=1e-7, atol=1e-9):
|
|
"""Check allclose for sparse and dense data.
|
|
|
|
Both x and y need to be either sparse or dense, they
|
|
can't be mixed.
|
|
|
|
Parameters
|
|
----------
|
|
x : {array-like, sparse matrix}
|
|
First array to compare.
|
|
|
|
y : {array-like, sparse matrix}
|
|
Second array to compare.
|
|
|
|
rtol : float, default=1e-7
|
|
Relative tolerance; see numpy.allclose.
|
|
|
|
atol : float, default=1e-9
|
|
absolute tolerance; see numpy.allclose. Note that the default here is
|
|
more tolerant than the default for numpy.testing.assert_allclose, where
|
|
atol=0.
|
|
"""
|
|
if sp.issparse(x) and sp.issparse(y):
|
|
x = x.tocsr()
|
|
y = y.tocsr()
|
|
x.sum_duplicates()
|
|
y.sum_duplicates()
|
|
return (
|
|
np.array_equal(x.indices, y.indices)
|
|
and np.array_equal(x.indptr, y.indptr)
|
|
and np.allclose(x.data, y.data, rtol=rtol, atol=atol)
|
|
)
|
|
elif not sp.issparse(x) and not sp.issparse(y):
|
|
return np.allclose(x, y, rtol=rtol, atol=atol)
|
|
raise ValueError(
|
|
"Can only compare two sparse matrices, not a sparse matrix and an array"
|
|
)
|
|
|
|
|
|
def _check_response_method(estimator, response_method):
|
|
"""Check if `response_method` is available in estimator and return it.
|
|
|
|
.. versionadded:: 1.3
|
|
|
|
Parameters
|
|
----------
|
|
estimator : estimator instance
|
|
Classifier or regressor to check.
|
|
|
|
response_method : {"predict_proba", "predict_log_proba", "decision_function",
|
|
"predict"} or list of such str
|
|
Specifies the response method to use get prediction from an estimator
|
|
(i.e. :term:`predict_proba`, :term:`predict_log_proba`,
|
|
:term:`decision_function` or :term:`predict`). Possible choices are:
|
|
- if `str`, it corresponds to the name to the method to return;
|
|
- if a list of `str`, it provides the method names in order of
|
|
preference. The method returned corresponds to the first method in
|
|
the list and which is implemented by `estimator`.
|
|
|
|
Returns
|
|
-------
|
|
prediction_method : callable
|
|
Prediction method of estimator.
|
|
|
|
Raises
|
|
------
|
|
AttributeError
|
|
If `response_method` is not available in `estimator`.
|
|
"""
|
|
if isinstance(response_method, str):
|
|
list_methods = [response_method]
|
|
else:
|
|
list_methods = response_method
|
|
|
|
prediction_method = [getattr(estimator, method, None) for method in list_methods]
|
|
prediction_method = reduce(lambda x, y: x or y, prediction_method)
|
|
if prediction_method is None:
|
|
raise AttributeError(
|
|
f"{estimator.__class__.__name__} has none of the following attributes: "
|
|
f"{', '.join(list_methods)}."
|
|
)
|
|
|
|
return prediction_method
|
|
|
|
|
|
def _check_method_params(X, params, indices=None):
    """Validate the extra parameters passed to a method such as `fit`.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        Data array.

    params : dict
        Dictionary containing the parameters passed to the method.

    indices : array-like of shape (n_samples,), default=None
        Indices to be selected if the parameter has the same size as `X`.

    Returns
    -------
    method_params_validated : dict
        Validated parameters. Values that are array-like or sparse and have
        as many samples as `X` are made indexable and subset with `indices`;
        every other value is passed through unchanged.
    """
    from . import _safe_indexing

    validated = {}
    for name, value in params.items():
        sample_aligned = (
            _is_arraylike(value) or sp.issparse(value)
        ) and _num_samples(value) == _num_samples(X)

        if sample_aligned:
            # Sample-aligned values must support indexing
            # (e.g. for cross-validation).
            validated[name] = _safe_indexing(_make_indexable(value), indices)
        else:
            # Non-indexable pass-through (for now for backward-compatibility).
            # https://github.com/scikit-learn/scikit-learn/issues/15805
            validated[name] = value

    return validated
|
|
|
|
|
|
def _is_pandas_df_or_series(X):
|
|
"""Return True if the X is a pandas dataframe or series."""
|
|
try:
|
|
pd = sys.modules["pandas"]
|
|
except KeyError:
|
|
return False
|
|
return isinstance(X, (pd.DataFrame, pd.Series))
|
|
|
|
|
|
def _is_pandas_df(X):
|
|
"""Return True if the X is a pandas dataframe."""
|
|
try:
|
|
pd = sys.modules["pandas"]
|
|
except KeyError:
|
|
return False
|
|
return isinstance(X, pd.DataFrame)
|
|
|
|
|
|
def _is_polars_df_or_series(X):
|
|
"""Return True if the X is a polars dataframe or series."""
|
|
try:
|
|
pl = sys.modules["polars"]
|
|
except KeyError:
|
|
return False
|
|
return isinstance(X, (pl.DataFrame, pl.Series))
|
|
|
|
|
|
def _is_polars_df(X):
|
|
"""Return True if the X is a polars dataframe."""
|
|
try:
|
|
pl = sys.modules["polars"]
|
|
except KeyError:
|
|
return False
|
|
return isinstance(X, pl.DataFrame)
|
|
|
|
|
|
def _get_feature_names(X):
    """Extract feature names from an array container.

    Support for other array containers should place its implementation here.

    Parameters
    ----------
    X : {ndarray, dataframe} of shape (n_samples, n_features)
        Array container to extract feature names.

        - pandas dataframe : The columns will be considered to be feature
          names. If the dataframe contains non-string feature names, `None` is
          returned.
        - Objects exposing `__dataframe__` : the column names reported by the
          dataframe interchange protocol are used, with the same rule.
        - All other array containers will return `None`.

    Returns
    -------
    names : ndarray or None
        Feature names of `X`. Unrecognized array containers will return `None`.

    Raises
    ------
    TypeError
        If the column names mix strings with other types.
    """
    if _is_pandas_df(X):
        # Make sure we can inspect columns names from pandas, even with
        # versions too old to expose a working implementation of
        # __dataframe__.column_names() and avoid introducing any
        # additional copy.
        # TODO: remove the pandas-specific branch once the minimum supported
        # version of pandas has a working implementation of
        # __dataframe__.column_names() that is guaranteed to not introduce any
        # additional copy of the data without having to impose allow_copy=False
        # that could fail with other libraries. Note: in the longer term, we
        # could decide to instead rely on the __dataframe_namespace__ API once
        # adopted by our minimally supported pandas version.
        names = np.asarray(X.columns, dtype=object)
    elif hasattr(X, "__dataframe__"):
        column_names = X.__dataframe__().column_names()
        names = np.asarray(list(column_names), dtype=object)
    else:
        # Not a supported container: there are no names to extract.
        return None

    if len(names) == 0:
        return None

    observed_types = {type(name) for name in names}
    type_names = sorted(t.__qualname__ for t in observed_types)
    contains_str = "str" in type_names

    # mixed type of string and non-string is not supported
    if contains_str and len(type_names) > 1:
        raise TypeError(
            "Feature names are only supported if all input features have string names, "
            f"but your input has {type_names} as feature name / column name types. "
            "If you want feature names to be stored and validated, you must convert "
            "them all to strings, by using X.columns = X.columns.astype(str) for "
            "example. Otherwise you can remove feature / column names from your input "
            "data, or convert them all to a non-string data type."
        )

    # Only feature names of all strings are supported
    if type_names == ["str"]:
        return names
    return None
|
|
|
|
|
|
def _check_feature_names_in(estimator, input_features=None, *, generate_names=True):
|
|
"""Check `input_features` and generate names if needed.
|
|
|
|
Commonly used in :term:`get_feature_names_out`.
|
|
|
|
Parameters
|
|
----------
|
|
input_features : array-like of str or None, default=None
|
|
Input features.
|
|
|
|
- If `input_features` is `None`, then `feature_names_in_` is
|
|
used as feature names in. If `feature_names_in_` is not defined,
|
|
then the following input feature names are generated:
|
|
`["x0", "x1", ..., "x(n_features_in_ - 1)"]`.
|
|
- If `input_features` is an array-like, then `input_features` must
|
|
match `feature_names_in_` if `feature_names_in_` is defined.
|
|
|
|
generate_names : bool, default=True
|
|
Whether to generate names when `input_features` is `None` and
|
|
`estimator.feature_names_in_` is not defined. This is useful for transformers
|
|
that validates `input_features` but do not require them in
|
|
:term:`get_feature_names_out` e.g. `PCA`.
|
|
|
|
Returns
|
|
-------
|
|
feature_names_in : ndarray of str or `None`
|
|
Feature names in.
|
|
"""
|
|
|
|
feature_names_in_ = getattr(estimator, "feature_names_in_", None)
|
|
n_features_in_ = getattr(estimator, "n_features_in_", None)
|
|
|
|
if input_features is not None:
|
|
input_features = np.asarray(input_features, dtype=object)
|
|
if feature_names_in_ is not None and not np.array_equal(
|
|
feature_names_in_, input_features
|
|
):
|
|
raise ValueError("input_features is not equal to feature_names_in_")
|
|
|
|
if n_features_in_ is not None and len(input_features) != n_features_in_:
|
|
raise ValueError(
|
|
"input_features should have length equal to number of "
|
|
f"features ({n_features_in_}), got {len(input_features)}"
|
|
)
|
|
return input_features
|
|
|
|
if feature_names_in_ is not None:
|
|
return feature_names_in_
|
|
|
|
if not generate_names:
|
|
return
|
|
|
|
# Generates feature names if `n_features_in_` is defined
|
|
if n_features_in_ is None:
|
|
raise ValueError("Unable to generate feature names without n_features_in_")
|
|
|
|
return np.asarray([f"x{i}" for i in range(n_features_in_)], dtype=object)
|
|
|
|
|
|
def _generate_get_feature_names_out(estimator, n_features_out, input_features=None):
    """Generate output feature names prefixed with the lowercased estimator name.

    The input feature names are validated but not used. This function is useful
    for estimators that generate their own names based on `n_features_out`,
    e.g. PCA.

    Parameters
    ----------
    estimator : estimator instance
        Estimator producing output feature names.

    n_features_out : int
        Number of feature names out.

    input_features : array-like of str or None, default=None
        Only used to validate feature names with `estimator.feature_names_in_`.

    Returns
    -------
    feature_names_out : ndarray of str
        Generated names of the form `"<estimator class name in lowercase><i>"`
        for `i` in `range(n_features_out)`.
    """
    # Validation only: the returned input names are deliberately discarded.
    _check_feature_names_in(estimator, input_features, generate_names=False)

    prefix = estimator.__class__.__name__.lower()
    names_out = [f"{prefix}{index}" for index in range(n_features_out)]
    return np.asarray(names_out, dtype=object)
|
|
|
|
|
|
def _check_monotonic_cst(estimator, monotonic_cst=None):
|
|
"""Check the monotonic constraints and return the corresponding array.
|
|
|
|
This helper function should be used in the `fit` method of an estimator
|
|
that supports monotonic constraints and called after the estimator has
|
|
introspected input data to set the `n_features_in_` and optionally the
|
|
`feature_names_in_` attributes.
|
|
|
|
.. versionadded:: 1.2
|
|
|
|
Parameters
|
|
----------
|
|
estimator : estimator instance
|
|
|
|
monotonic_cst : array-like of int, dict of str or None, default=None
|
|
Monotonic constraints for the features.
|
|
|
|
- If array-like, then it should contain only -1, 0 or 1. Each value
|
|
will be checked to be in [-1, 0, 1]. If a value is -1, then the
|
|
corresponding feature is required to be monotonically decreasing.
|
|
- If dict, then it the keys should be the feature names occurring in
|
|
`estimator.feature_names_in_` and the values should be -1, 0 or 1.
|
|
- If None, then an array of 0s will be allocated.
|
|
|
|
Returns
|
|
-------
|
|
monotonic_cst : ndarray of int
|
|
Monotonic constraints for each feature.
|
|
"""
|
|
original_monotonic_cst = monotonic_cst
|
|
if monotonic_cst is None or isinstance(monotonic_cst, dict):
|
|
monotonic_cst = np.full(
|
|
shape=estimator.n_features_in_,
|
|
fill_value=0,
|
|
dtype=np.int8,
|
|
)
|
|
if isinstance(original_monotonic_cst, dict):
|
|
if not hasattr(estimator, "feature_names_in_"):
|
|
raise ValueError(
|
|
f"{estimator.__class__.__name__} was not fitted on data "
|
|
"with feature names. Pass monotonic_cst as an integer "
|
|
"array instead."
|
|
)
|
|
unexpected_feature_names = list(
|
|
set(original_monotonic_cst) - set(estimator.feature_names_in_)
|
|
)
|
|
unexpected_feature_names.sort() # deterministic error message
|
|
n_unexpeced = len(unexpected_feature_names)
|
|
if unexpected_feature_names:
|
|
if len(unexpected_feature_names) > 5:
|
|
unexpected_feature_names = unexpected_feature_names[:5]
|
|
unexpected_feature_names.append("...")
|
|
raise ValueError(
|
|
f"monotonic_cst contains {n_unexpeced} unexpected feature "
|
|
f"names: {unexpected_feature_names}."
|
|
)
|
|
for feature_idx, feature_name in enumerate(estimator.feature_names_in_):
|
|
if feature_name in original_monotonic_cst:
|
|
cst = original_monotonic_cst[feature_name]
|
|
if cst not in [-1, 0, 1]:
|
|
raise ValueError(
|
|
f"monotonic_cst['{feature_name}'] must be either "
|
|
f"-1, 0 or 1. Got {cst!r}."
|
|
)
|
|
monotonic_cst[feature_idx] = cst
|
|
else:
|
|
unexpected_cst = np.setdiff1d(monotonic_cst, [-1, 0, 1])
|
|
if unexpected_cst.shape[0]:
|
|
raise ValueError(
|
|
"monotonic_cst must be an array-like of -1, 0 or 1. Observed "
|
|
f"values: {unexpected_cst.tolist()}."
|
|
)
|
|
|
|
monotonic_cst = np.asarray(monotonic_cst, dtype=np.int8)
|
|
if monotonic_cst.shape[0] != estimator.n_features_in_:
|
|
raise ValueError(
|
|
f"monotonic_cst has shape {monotonic_cst.shape} but the input data "
|
|
f"X has {estimator.n_features_in_} features."
|
|
)
|
|
return monotonic_cst
|
|
|
|
|
|
def _check_pos_label_consistency(pos_label, y_true):
|
|
"""Check if `pos_label` need to be specified or not.
|
|
|
|
In binary classification, we fix `pos_label=1` if the labels are in the set
|
|
{-1, 1} or {0, 1}. Otherwise, we raise an error asking to specify the
|
|
`pos_label` parameters.
|
|
|
|
Parameters
|
|
----------
|
|
pos_label : int, float, bool, str or None
|
|
The positive label.
|
|
y_true : ndarray of shape (n_samples,)
|
|
The target vector.
|
|
|
|
Returns
|
|
-------
|
|
pos_label : int, float, bool or str
|
|
If `pos_label` can be inferred, it will be returned.
|
|
|
|
Raises
|
|
------
|
|
ValueError
|
|
In the case that `y_true` does not have label in {-1, 1} or {0, 1},
|
|
it will raise a `ValueError`.
|
|
"""
|
|
# ensure binary classification if pos_label is not specified
|
|
# classes.dtype.kind in ('O', 'U', 'S') is required to avoid
|
|
# triggering a FutureWarning by calling np.array_equal(a, b)
|
|
# when elements in the two arrays are not comparable.
|
|
if pos_label is None:
|
|
# Compute classes only if pos_label is not specified:
|
|
classes = np.unique(y_true)
|
|
if classes.dtype.kind in "OUS" or not (
|
|
np.array_equal(classes, [0, 1])
|
|
or np.array_equal(classes, [-1, 1])
|
|
or np.array_equal(classes, [0])
|
|
or np.array_equal(classes, [-1])
|
|
or np.array_equal(classes, [1])
|
|
):
|
|
classes_repr = ", ".join([repr(c) for c in classes.tolist()])
|
|
raise ValueError(
|
|
f"y_true takes value in {{{classes_repr}}} and pos_label is not "
|
|
"specified: either make y_true take value in {0, 1} or "
|
|
"{-1, 1} or pass pos_label explicitly."
|
|
)
|
|
pos_label = 1
|
|
|
|
return pos_label
|
|
|
|
|
|
def _to_object_array(sequence):
|
|
"""Convert sequence to a 1-D NumPy array of object dtype.
|
|
|
|
numpy.array constructor has a similar use but it's output
|
|
is ambiguous. It can be 1-D NumPy array of object dtype if
|
|
the input is a ragged array, but if the input is a list of
|
|
equal length arrays, then the output is a 2D numpy.array.
|
|
_to_object_array solves this ambiguity by guarantying that
|
|
the output is a 1-D NumPy array of objects for any input.
|
|
|
|
Parameters
|
|
----------
|
|
sequence : array-like of shape (n_elements,)
|
|
The sequence to be converted.
|
|
|
|
Returns
|
|
-------
|
|
out : ndarray of shape (n_elements,), dtype=object
|
|
The converted sequence into a 1-D NumPy array of object dtype.
|
|
|
|
Examples
|
|
--------
|
|
>>> import numpy as np
|
|
>>> from sklearn.utils.validation import _to_object_array
|
|
>>> _to_object_array([np.array([0]), np.array([1])])
|
|
array([array([0]), array([1])], dtype=object)
|
|
>>> _to_object_array([np.array([0]), np.array([1, 2])])
|
|
array([array([0]), array([1, 2])], dtype=object)
|
|
>>> _to_object_array([np.array([0]), np.array([1, 2])])
|
|
array([array([0]), array([1, 2])], dtype=object)
|
|
"""
|
|
out = np.empty(len(sequence), dtype=object)
|
|
out[:] = sequence
|
|
return out
|