import warnings import numpy as np import pytest from sklearn.base import clone from sklearn.datasets import load_iris from sklearn.model_selection import train_test_split from sklearn.preprocessing import ( MaxAbsScaler, MinMaxScaler, PowerTransformer, QuantileTransformer, RobustScaler, StandardScaler, maxabs_scale, minmax_scale, power_transform, quantile_transform, robust_scale, scale, ) from sklearn.utils._testing import assert_allclose, assert_array_equal from sklearn.utils.fixes import ( BSR_CONTAINERS, COO_CONTAINERS, CSC_CONTAINERS, CSR_CONTAINERS, DIA_CONTAINERS, DOK_CONTAINERS, LIL_CONTAINERS, ) iris = load_iris() def _get_valid_samples_by_column(X, col): """Get non NaN samples in column of X""" return X[:, [col]][~np.isnan(X[:, col])] @pytest.mark.parametrize( "est, func, support_sparse, strictly_positive, omit_kwargs", [ (MaxAbsScaler(), maxabs_scale, True, False, []), (MinMaxScaler(), minmax_scale, False, False, ["clip"]), (StandardScaler(), scale, False, False, []), (StandardScaler(with_mean=False), scale, True, False, []), (PowerTransformer("yeo-johnson"), power_transform, False, False, []), (PowerTransformer("box-cox"), power_transform, False, True, []), (QuantileTransformer(n_quantiles=10), quantile_transform, True, False, []), (RobustScaler(), robust_scale, False, False, []), (RobustScaler(with_centering=False), robust_scale, True, False, []), ], ) def test_missing_value_handling( est, func, support_sparse, strictly_positive, omit_kwargs ): # check that the preprocessing method let pass nan rng = np.random.RandomState(42) X = iris.data.copy() n_missing = 50 X[ rng.randint(X.shape[0], size=n_missing), rng.randint(X.shape[1], size=n_missing) ] = np.nan if strictly_positive: X += np.nanmin(X) + 0.1 X_train, X_test = train_test_split(X, random_state=1) # sanity check assert not np.all(np.isnan(X_train), axis=0).any() assert np.any(np.isnan(X_train), axis=0).all() assert np.any(np.isnan(X_test), axis=0).all() X_test[:, 0] = np.nan # make sure this boundary case is tested with warnings.catch_warnings(): warnings.simplefilter("error", RuntimeWarning) Xt = est.fit(X_train).transform(X_test) # ensure no warnings are raised # missing values should still be missing, and only them assert_array_equal(np.isnan(Xt), np.isnan(X_test)) # check that the function leads to the same results as the class with warnings.catch_warnings(): warnings.simplefilter("error", RuntimeWarning) Xt_class = est.transform(X_train) kwargs = est.get_params() # remove the parameters which should be omitted because they # are not defined in the counterpart function of the preprocessing class for kwarg in omit_kwargs: _ = kwargs.pop(kwarg) Xt_func = func(X_train, **kwargs) assert_array_equal(np.isnan(Xt_func), np.isnan(Xt_class)) assert_allclose(Xt_func[~np.isnan(Xt_func)], Xt_class[~np.isnan(Xt_class)]) # check that the inverse transform keep NaN Xt_inv = est.inverse_transform(Xt) assert_array_equal(np.isnan(Xt_inv), np.isnan(X_test)) # FIXME: we can introduce equal_nan=True in recent version of numpy. # For the moment which just check that non-NaN values are almost equal. assert_allclose(Xt_inv[~np.isnan(Xt_inv)], X_test[~np.isnan(X_test)]) for i in range(X.shape[1]): # train only on non-NaN est.fit(_get_valid_samples_by_column(X_train, i)) # check transforming with NaN works even when training without NaN with warnings.catch_warnings(): warnings.simplefilter("error", RuntimeWarning) Xt_col = est.transform(X_test[:, [i]]) assert_allclose(Xt_col, Xt[:, [i]]) # check non-NaN is handled as before - the 1st column is all nan if not np.isnan(X_test[:, i]).all(): Xt_col_nonan = est.transform(_get_valid_samples_by_column(X_test, i)) assert_array_equal(Xt_col_nonan, Xt_col[~np.isnan(Xt_col.squeeze())]) if support_sparse: est_dense = clone(est) est_sparse = clone(est) with warnings.catch_warnings(): warnings.simplefilter("error", RuntimeWarning) Xt_dense = est_dense.fit(X_train).transform(X_test) Xt_inv_dense = est_dense.inverse_transform(Xt_dense) for sparse_container in ( BSR_CONTAINERS + COO_CONTAINERS + CSC_CONTAINERS + CSR_CONTAINERS + DIA_CONTAINERS + DOK_CONTAINERS + LIL_CONTAINERS ): # check that the dense and sparse inputs lead to the same results # precompute the matrix to avoid catching side warnings X_train_sp = sparse_container(X_train) X_test_sp = sparse_container(X_test) with warnings.catch_warnings(): warnings.simplefilter("ignore", PendingDeprecationWarning) warnings.simplefilter("error", RuntimeWarning) Xt_sp = est_sparse.fit(X_train_sp).transform(X_test_sp) assert_allclose(Xt_sp.toarray(), Xt_dense) with warnings.catch_warnings(): warnings.simplefilter("ignore", PendingDeprecationWarning) warnings.simplefilter("error", RuntimeWarning) Xt_inv_sp = est_sparse.inverse_transform(Xt_sp) assert_allclose(Xt_inv_sp.toarray(), Xt_inv_dense) @pytest.mark.parametrize( "est, func", [ (MaxAbsScaler(), maxabs_scale), (MinMaxScaler(), minmax_scale), (StandardScaler(), scale), (StandardScaler(with_mean=False), scale), (PowerTransformer("yeo-johnson"), power_transform), ( PowerTransformer("box-cox"), power_transform, ), (QuantileTransformer(n_quantiles=3), quantile_transform), (RobustScaler(), robust_scale), (RobustScaler(with_centering=False), robust_scale), ], ) def test_missing_value_pandas_na_support(est, func): # Test pandas IntegerArray with pd.NA pd = pytest.importorskip("pandas") X = np.array( [ [1, 2, 3, np.nan, np.nan, 4, 5, 1], [np.nan, np.nan, 8, 4, 6, np.nan, np.nan, 8], [1, 2, 3, 4, 5, 6, 7, 8], ] ).T # Creates dataframe with IntegerArrays with pd.NA X_df = pd.DataFrame(X, dtype="Int16", columns=["a", "b", "c"]) X_df["c"] = X_df["c"].astype("int") X_trans = est.fit_transform(X) X_df_trans = est.fit_transform(X_df) assert_allclose(X_trans, X_df_trans)