from contextlib import suppress import numpy as np from scipy import sparse as sp from ._missing import is_scalar_nan from ._param_validation import validate_params from .fixes import _object_dtype_isnan def _get_dense_mask(X, value_to_mask): with suppress(ImportError, AttributeError): # We also suppress `AttributeError` because older versions of pandas do # not have `NA`. import pandas if value_to_mask is pandas.NA: return pandas.isna(X) if is_scalar_nan(value_to_mask): if X.dtype.kind == "f": Xt = np.isnan(X) elif X.dtype.kind in ("i", "u"): # can't have NaNs in integer array. Xt = np.zeros(X.shape, dtype=bool) else: # np.isnan does not work on object dtypes. Xt = _object_dtype_isnan(X) else: Xt = X == value_to_mask return Xt def _get_mask(X, value_to_mask): """Compute the boolean mask X == value_to_mask. Parameters ---------- X : {ndarray, sparse matrix} of shape (n_samples, n_features) Input data, where ``n_samples`` is the number of samples and ``n_features`` is the number of features. value_to_mask : {int, float} The value which is to be masked in X. Returns ------- X_mask : {ndarray, sparse matrix} of shape (n_samples, n_features) Missing mask. """ if not sp.issparse(X): # For all cases apart of a sparse input where we need to reconstruct # a sparse output return _get_dense_mask(X, value_to_mask) Xt = _get_dense_mask(X.data, value_to_mask) sparse_constructor = sp.csr_matrix if X.format == "csr" else sp.csc_matrix Xt_sparse = sparse_constructor( (Xt, X.indices.copy(), X.indptr.copy()), shape=X.shape, dtype=bool ) return Xt_sparse @validate_params( { "X": ["array-like", "sparse matrix"], "mask": ["array-like"], }, prefer_skip_nested_validation=True, ) def safe_mask(X, mask): """Return a mask which is safe to use on X. Parameters ---------- X : {array-like, sparse matrix} Data on which to apply mask. mask : array-like Mask to be used on X. Returns ------- mask : ndarray Array that is safe to use on X. Examples -------- >>> from sklearn.utils import safe_mask >>> from scipy.sparse import csr_matrix >>> data = csr_matrix([[1], [2], [3], [4], [5]]) >>> condition = [False, True, True, False, True] >>> mask = safe_mask(data, condition) >>> data[mask].toarray() array([[2], [3], [5]]) """ mask = np.asarray(mask) if np.issubdtype(mask.dtype, np.signedinteger): return mask if hasattr(X, "toarray"): ind = np.arange(mask.shape[0]) mask = ind[mask] return mask def axis0_safe_slice(X, mask, len_mask): """Return a mask which is safer to use on X than safe_mask. This mask is safer than safe_mask since it returns an empty array, when a sparse matrix is sliced with a boolean mask with all False, instead of raising an unhelpful error in older versions of SciPy. See: https://github.com/scipy/scipy/issues/5361 Also note that we can avoid doing the dot product by checking if the len_mask is not zero in _huber_loss_and_gradient but this is not going to be the bottleneck, since the number of outliers and non_outliers are typically non-zero and it makes the code tougher to follow. Parameters ---------- X : {array-like, sparse matrix} Data on which to apply mask. mask : ndarray Mask to be used on X. len_mask : int The length of the mask. Returns ------- mask : ndarray Array that is safe to use on X. """ if len_mask != 0: return X[safe_mask(X, mask), :] return np.zeros(shape=(0, X.shape[1])) def indices_to_mask(indices, mask_length): """Convert list of indices to boolean mask. Parameters ---------- indices : list-like List of integers treated as indices. mask_length : int Length of boolean mask to be generated. This parameter must be greater than max(indices). Returns ------- mask : 1d boolean nd-array Boolean array that is True where indices are present, else False. Examples -------- >>> from sklearn.utils._mask import indices_to_mask >>> indices = [1, 2 , 3, 4] >>> indices_to_mask(indices, 5) array([False, True, True, True, True]) """ if mask_length <= np.max(indices): raise ValueError("mask_length must be greater than max(indices)") mask = np.zeros(mask_length, dtype=bool) mask[indices] = True return mask