# Authors: # # Giorgio Patrini # # License: BSD 3 clause import warnings import itertools import numpy as np import numpy.linalg as la from scipy import sparse, stats from scipy.sparse import random as sparse_random import pytest from sklearn.utils import gen_batches from sklearn.utils._testing import assert_almost_equal from sklearn.utils._testing import assert_array_almost_equal from sklearn.utils._testing import assert_array_equal from sklearn.utils._testing import assert_array_less from sklearn.utils._testing import assert_warns_message from sklearn.utils._testing import assert_no_warnings from sklearn.utils._testing import assert_allclose from sklearn.utils._testing import assert_allclose_dense_sparse from sklearn.utils._testing import skip_if_32bit from sklearn.utils._testing import _convert_container from sklearn.utils.sparsefuncs import mean_variance_axis from sklearn.preprocessing._data import _handle_zeros_in_scale from sklearn.preprocessing._data import Binarizer from sklearn.preprocessing._data import KernelCenterer from sklearn.preprocessing._data import Normalizer from sklearn.preprocessing._data import normalize from sklearn.preprocessing._data import StandardScaler from sklearn.preprocessing._data import scale from sklearn.preprocessing._data import MinMaxScaler from sklearn.preprocessing._data import minmax_scale from sklearn.preprocessing._data import QuantileTransformer from sklearn.preprocessing._data import quantile_transform from sklearn.preprocessing._data import MaxAbsScaler from sklearn.preprocessing._data import maxabs_scale from sklearn.preprocessing._data import RobustScaler from sklearn.preprocessing._data import robust_scale from sklearn.preprocessing._data import add_dummy_feature from sklearn.preprocessing._data import PolynomialFeatures from sklearn.preprocessing._data import PowerTransformer from sklearn.preprocessing._data import power_transform from sklearn.preprocessing._data import BOUNDS_THRESHOLD from sklearn.exceptions import NotFittedError from sklearn.base import clone from sklearn.pipeline import Pipeline from sklearn.model_selection import cross_val_predict from sklearn.svm import SVR from sklearn.utils import shuffle from sklearn import datasets iris = datasets.load_iris() # Make some data to be used many times rng = np.random.RandomState(0) n_features = 30 n_samples = 1000 offsets = rng.uniform(-1, 1, size=n_features) scales = rng.uniform(1, 10, size=n_features) X_2d = rng.randn(n_samples, n_features) * scales + offsets X_1row = X_2d[0, :].reshape(1, n_features) X_1col = X_2d[:, 0].reshape(n_samples, 1) X_list_1row = X_1row.tolist() X_list_1col = X_1col.tolist() def toarray(a): if hasattr(a, "toarray"): a = a.toarray() return a def _check_dim_1axis(a): return np.asarray(a).shape[0] def assert_correct_incr(i, batch_start, batch_stop, n, chunk_size, n_samples_seen): if batch_stop != n: assert (i + 1) * chunk_size == n_samples_seen else: assert (i * chunk_size + (batch_stop - batch_start) == n_samples_seen) def test_polynomial_features(): # Test Polynomial Features X1 = np.arange(6)[:, np.newaxis] P1 = np.hstack([np.ones_like(X1), X1, X1 ** 2, X1 ** 3]) deg1 = 3 X2 = np.arange(6).reshape((3, 2)) x1 = X2[:, :1] x2 = X2[:, 1:] P2 = np.hstack([x1 ** 0 * x2 ** 0, x1 ** 1 * x2 ** 0, x1 ** 0 * x2 ** 1, x1 ** 2 * x2 ** 0, x1 ** 1 * x2 ** 1, x1 ** 0 * x2 ** 2]) deg2 = 2 for (deg, X, P) in [(deg1, X1, P1), (deg2, X2, P2)]: P_test = PolynomialFeatures(deg, include_bias=True).fit_transform(X) assert_array_almost_equal(P_test, P) P_test = PolynomialFeatures(deg, include_bias=False).fit_transform(X) assert_array_almost_equal(P_test, P[:, 1:]) interact = PolynomialFeatures(2, interaction_only=True, include_bias=True) X_poly = interact.fit_transform(X) assert_array_almost_equal(X_poly, P2[:, [0, 1, 2, 4]]) assert interact.powers_.shape == (interact.n_output_features_, interact.n_input_features_) def test_polynomial_feature_names(): X = np.arange(30).reshape(10, 3) poly = PolynomialFeatures(degree=2, include_bias=True).fit(X) feature_names = poly.get_feature_names() assert_array_equal(['1', 'x0', 'x1', 'x2', 'x0^2', 'x0 x1', 'x0 x2', 'x1^2', 'x1 x2', 'x2^2'], feature_names) poly = PolynomialFeatures(degree=3, include_bias=False).fit(X) feature_names = poly.get_feature_names(["a", "b", "c"]) assert_array_equal(['a', 'b', 'c', 'a^2', 'a b', 'a c', 'b^2', 'b c', 'c^2', 'a^3', 'a^2 b', 'a^2 c', 'a b^2', 'a b c', 'a c^2', 'b^3', 'b^2 c', 'b c^2', 'c^3'], feature_names) # test some unicode poly = PolynomialFeatures(degree=1, include_bias=True).fit(X) feature_names = poly.get_feature_names( ["\u0001F40D", "\u262E", "\u05D0"]) assert_array_equal(["1", "\u0001F40D", "\u262E", "\u05D0"], feature_names) def test_polynomial_feature_array_order(): X = np.arange(10).reshape(5, 2) def is_c_contiguous(a): return np.isfortran(a.T) assert is_c_contiguous(PolynomialFeatures().fit_transform(X)) assert is_c_contiguous(PolynomialFeatures(order='C').fit_transform(X)) assert np.isfortran(PolynomialFeatures(order='F').fit_transform(X)) @pytest.mark.parametrize(['deg', 'include_bias', 'interaction_only', 'dtype'], [(1, True, False, int), (2, True, False, int), (2, True, False, np.float32), (2, True, False, np.float64), (3, False, False, np.float64), (3, False, True, np.float64), (4, False, False, np.float64), (4, False, True, np.float64)]) def test_polynomial_features_csc_X(deg, include_bias, interaction_only, dtype): rng = np.random.RandomState(0) X = rng.randint(0, 2, (100, 2)) X_csc = sparse.csc_matrix(X) est = PolynomialFeatures(deg, include_bias=include_bias, interaction_only=interaction_only) Xt_csc = est.fit_transform(X_csc.astype(dtype)) Xt_dense = est.fit_transform(X.astype(dtype)) assert isinstance(Xt_csc, sparse.csc_matrix) assert Xt_csc.dtype == Xt_dense.dtype assert_array_almost_equal(Xt_csc.A, Xt_dense) @pytest.mark.parametrize(['deg', 'include_bias', 'interaction_only', 'dtype'], [(1, True, False, int), (2, True, False, int), (2, True, False, np.float32), (2, True, False, np.float64), (3, False, False, np.float64), (3, False, True, np.float64)]) def test_polynomial_features_csr_X(deg, include_bias, interaction_only, dtype): rng = np.random.RandomState(0) X = rng.randint(0, 2, (100, 2)) X_csr = sparse.csr_matrix(X) est = PolynomialFeatures(deg, include_bias=include_bias, interaction_only=interaction_only) Xt_csr = est.fit_transform(X_csr.astype(dtype)) Xt_dense = est.fit_transform(X.astype(dtype, copy=False)) assert isinstance(Xt_csr, sparse.csr_matrix) assert Xt_csr.dtype == Xt_dense.dtype assert_array_almost_equal(Xt_csr.A, Xt_dense) @pytest.mark.parametrize(['deg', 'include_bias', 'interaction_only', 'dtype'], [(2, True, False, np.float32), (2, True, False, np.float64), (3, False, False, np.float64), (3, False, True, np.float64)]) def test_polynomial_features_csr_X_floats(deg, include_bias, interaction_only, dtype): X_csr = sparse_random(1000, 10, 0.5, random_state=0).tocsr() X = X_csr.toarray() est = PolynomialFeatures(deg, include_bias=include_bias, interaction_only=interaction_only) Xt_csr = est.fit_transform(X_csr.astype(dtype)) Xt_dense = est.fit_transform(X.astype(dtype)) assert isinstance(Xt_csr, sparse.csr_matrix) assert Xt_csr.dtype == Xt_dense.dtype assert_array_almost_equal(Xt_csr.A, Xt_dense) @pytest.mark.parametrize(['zero_row_index', 'deg', 'interaction_only'], [(0, 2, True), (1, 2, True), (2, 2, True), (0, 3, True), (1, 3, True), (2, 3, True), (0, 2, False), (1, 2, False), (2, 2, False), (0, 3, False), (1, 3, False), (2, 3, False)]) def test_polynomial_features_csr_X_zero_row(zero_row_index, deg, interaction_only): X_csr = sparse_random(3, 10, 1.0, random_state=0).tocsr() X_csr[zero_row_index, :] = 0.0 X = X_csr.toarray() est = PolynomialFeatures(deg, include_bias=False, interaction_only=interaction_only) Xt_csr = est.fit_transform(X_csr) Xt_dense = est.fit_transform(X) assert isinstance(Xt_csr, sparse.csr_matrix) assert Xt_csr.dtype == Xt_dense.dtype assert_array_almost_equal(Xt_csr.A, Xt_dense) # This degree should always be one more than the highest degree supported by # _csr_expansion. @pytest.mark.parametrize(['include_bias', 'interaction_only'], [(True, True), (True, False), (False, True), (False, False)]) def test_polynomial_features_csr_X_degree_4(include_bias, interaction_only): X_csr = sparse_random(1000, 10, 0.5, random_state=0).tocsr() X = X_csr.toarray() est = PolynomialFeatures(4, include_bias=include_bias, interaction_only=interaction_only) Xt_csr = est.fit_transform(X_csr) Xt_dense = est.fit_transform(X) assert isinstance(Xt_csr, sparse.csr_matrix) assert Xt_csr.dtype == Xt_dense.dtype assert_array_almost_equal(Xt_csr.A, Xt_dense) @pytest.mark.parametrize(['deg', 'dim', 'interaction_only'], [(2, 1, True), (2, 2, True), (3, 1, True), (3, 2, True), (3, 3, True), (2, 1, False), (2, 2, False), (3, 1, False), (3, 2, False), (3, 3, False)]) def test_polynomial_features_csr_X_dim_edges(deg, dim, interaction_only): X_csr = sparse_random(1000, dim, 0.5, random_state=0).tocsr() X = X_csr.toarray() est = PolynomialFeatures(deg, interaction_only=interaction_only) Xt_csr = est.fit_transform(X_csr) Xt_dense = est.fit_transform(X) assert isinstance(Xt_csr, sparse.csr_matrix) assert Xt_csr.dtype == Xt_dense.dtype assert_array_almost_equal(Xt_csr.A, Xt_dense) def test_raises_value_error_if_sample_weights_greater_than_1d(): # Sample weights must be either scalar or 1D n_sampless = [2, 3] n_featuress = [3, 2] for n_samples, n_features in zip(n_sampless, n_featuress): X = rng.randn(n_samples, n_features) y = rng.randn(n_samples) scaler = StandardScaler() # make sure Error is raised the sample weights greater than 1d sample_weight_notOK = rng.randn(n_samples, 1) ** 2 with pytest.raises(ValueError): scaler.fit(X, y, sample_weight=sample_weight_notOK) @pytest.mark.parametrize(['Xw', 'X', 'sample_weight'], [([[1, 2, 3], [4, 5, 6]], [[1, 2, 3], [1, 2, 3], [4, 5, 6]], [2., 1.]), ([[1, 0, 1], [0, 0, 1]], [[1, 0, 1], [0, 0, 1], [0, 0, 1], [0, 0, 1]], np.array([1, 3])), ([[1, np.nan, 1], [np.nan, np.nan, 1]], [[1, np.nan, 1], [np.nan, np.nan, 1], [np.nan, np.nan, 1], [np.nan, np.nan, 1]], np.array([1, 3])), ]) @pytest.mark.parametrize( "array_constructor", ["array", "sparse_csr", "sparse_csc"] ) def test_standard_scaler_sample_weight( Xw, X, sample_weight, array_constructor): with_mean = not array_constructor.startswith("sparse") X = _convert_container(X, array_constructor) Xw = _convert_container(Xw, array_constructor) # weighted StandardScaler yw = np.ones(Xw.shape[0]) scaler_w = StandardScaler(with_mean=with_mean) scaler_w.fit(Xw, yw, sample_weight=sample_weight) # unweighted, but with repeated samples y = np.ones(X.shape[0]) scaler = StandardScaler(with_mean=with_mean) scaler.fit(X, y) X_test = [[1.5, 2.5, 3.5], [3.5, 4.5, 5.5]] assert_almost_equal(scaler.mean_, scaler_w.mean_) assert_almost_equal(scaler.var_, scaler_w.var_) assert_almost_equal(scaler.transform(X_test), scaler_w.transform(X_test)) def test_standard_scaler_1d(): # Test scaling of dataset along single axis for X in [X_1row, X_1col, X_list_1row, X_list_1row]: scaler = StandardScaler() X_scaled = scaler.fit(X).transform(X, copy=True) if isinstance(X, list): X = np.array(X) # cast only after scaling done if _check_dim_1axis(X) == 1: assert_almost_equal(scaler.mean_, X.ravel()) assert_almost_equal(scaler.scale_, np.ones(n_features)) assert_array_almost_equal(X_scaled.mean(axis=0), np.zeros_like(n_features)) assert_array_almost_equal(X_scaled.std(axis=0), np.zeros_like(n_features)) else: assert_almost_equal(scaler.mean_, X.mean()) assert_almost_equal(scaler.scale_, X.std()) assert_array_almost_equal(X_scaled.mean(axis=0), np.zeros_like(n_features)) assert_array_almost_equal(X_scaled.mean(axis=0), .0) assert_array_almost_equal(X_scaled.std(axis=0), 1.) assert scaler.n_samples_seen_ == X.shape[0] # check inverse transform X_scaled_back = scaler.inverse_transform(X_scaled) assert_array_almost_equal(X_scaled_back, X) # Constant feature X = np.ones((5, 1)) scaler = StandardScaler() X_scaled = scaler.fit(X).transform(X, copy=True) assert_almost_equal(scaler.mean_, 1.) assert_almost_equal(scaler.scale_, 1.) assert_array_almost_equal(X_scaled.mean(axis=0), .0) assert_array_almost_equal(X_scaled.std(axis=0), .0) assert scaler.n_samples_seen_ == X.shape[0] @pytest.mark.parametrize("sparse_constructor", [None, sparse.csc_matrix, sparse.csr_matrix]) @pytest.mark.parametrize("add_sample_weight", [False, True]) def test_standard_scaler_dtype(add_sample_weight, sparse_constructor): # Ensure scaling does not affect dtype rng = np.random.RandomState(0) n_samples = 10 n_features = 3 if add_sample_weight: sample_weight = np.ones(n_samples) else: sample_weight = None with_mean = True for dtype in [np.float16, np.float32, np.float64]: X = rng.randn(n_samples, n_features).astype(dtype) if sparse_constructor is not None: X = sparse_constructor(X) with_mean = False scaler = StandardScaler(with_mean=with_mean) X_scaled = scaler.fit(X, sample_weight=sample_weight).transform(X) assert X.dtype == X_scaled.dtype assert scaler.mean_.dtype == np.float64 assert scaler.scale_.dtype == np.float64 def test_scale_1d(): # 1-d inputs X_list = [1., 3., 5., 0.] X_arr = np.array(X_list) for X in [X_list, X_arr]: X_scaled = scale(X) assert_array_almost_equal(X_scaled.mean(), 0.0) assert_array_almost_equal(X_scaled.std(), 1.0) assert_array_equal(scale(X, with_mean=False, with_std=False), X) @skip_if_32bit def test_standard_scaler_numerical_stability(): # Test numerical stability of scaling # np.log(1e-5) is taken because of its floating point representation # was empirically found to cause numerical problems with np.mean & np.std. x = np.full(8, np.log(1e-5), dtype=np.float64) # This does not raise a warning as the number of samples is too low # to trigger the problem in recent numpy x_scaled = assert_no_warnings(scale, x) assert_array_almost_equal(scale(x), np.zeros(8)) # with 2 more samples, the std computation run into numerical issues: x = np.full(10, np.log(1e-5), dtype=np.float64) w = "standard deviation of the data is probably very close to 0" x_scaled = assert_warns_message(UserWarning, w, scale, x) assert_array_almost_equal(x_scaled, np.zeros(10)) x = np.full(10, 1e-100, dtype=np.float64) x_small_scaled = assert_no_warnings(scale, x) assert_array_almost_equal(x_small_scaled, np.zeros(10)) # Large values can cause (often recoverable) numerical stability issues: x_big = np.full(10, 1e100, dtype=np.float64) w = "Dataset may contain too large values" x_big_scaled = assert_warns_message(UserWarning, w, scale, x_big) assert_array_almost_equal(x_big_scaled, np.zeros(10)) assert_array_almost_equal(x_big_scaled, x_small_scaled) x_big_centered = assert_warns_message(UserWarning, w, scale, x_big, with_std=False) assert_array_almost_equal(x_big_centered, np.zeros(10)) assert_array_almost_equal(x_big_centered, x_small_scaled) def test_scaler_2d_arrays(): # Test scaling of 2d array along first axis rng = np.random.RandomState(0) n_features = 5 n_samples = 4 X = rng.randn(n_samples, n_features) X[:, 0] = 0.0 # first feature is always of zero scaler = StandardScaler() X_scaled = scaler.fit(X).transform(X, copy=True) assert not np.any(np.isnan(X_scaled)) assert scaler.n_samples_seen_ == n_samples assert_array_almost_equal(X_scaled.mean(axis=0), n_features * [0.0]) assert_array_almost_equal(X_scaled.std(axis=0), [0., 1., 1., 1., 1.]) # Check that X has been copied assert X_scaled is not X # check inverse transform X_scaled_back = scaler.inverse_transform(X_scaled) assert X_scaled_back is not X assert X_scaled_back is not X_scaled assert_array_almost_equal(X_scaled_back, X) X_scaled = scale(X, axis=1, with_std=False) assert not np.any(np.isnan(X_scaled)) assert_array_almost_equal(X_scaled.mean(axis=1), n_samples * [0.0]) X_scaled = scale(X, axis=1, with_std=True) assert not np.any(np.isnan(X_scaled)) assert_array_almost_equal(X_scaled.mean(axis=1), n_samples * [0.0]) assert_array_almost_equal(X_scaled.std(axis=1), n_samples * [1.0]) # Check that the data hasn't been modified assert X_scaled is not X X_scaled = scaler.fit(X).transform(X, copy=False) assert not np.any(np.isnan(X_scaled)) assert_array_almost_equal(X_scaled.mean(axis=0), n_features * [0.0]) assert_array_almost_equal(X_scaled.std(axis=0), [0., 1., 1., 1., 1.]) # Check that X has not been copied assert X_scaled is X X = rng.randn(4, 5) X[:, 0] = 1.0 # first feature is a constant, non zero feature scaler = StandardScaler() X_scaled = scaler.fit(X).transform(X, copy=True) assert not np.any(np.isnan(X_scaled)) assert_array_almost_equal(X_scaled.mean(axis=0), n_features * [0.0]) assert_array_almost_equal(X_scaled.std(axis=0), [0., 1., 1., 1., 1.]) # Check that X has not been copied assert X_scaled is not X def test_scaler_float16_overflow(): # Test if the scaler will not overflow on float16 numpy arrays rng = np.random.RandomState(0) # float16 has a maximum of 65500.0. On the worst case 5 * 200000 is 100000 # which is enough to overflow the data type X = rng.uniform(5, 10, [200000, 1]).astype(np.float16) with np.errstate(over='raise'): scaler = StandardScaler().fit(X) X_scaled = scaler.transform(X) # Calculate the float64 equivalent to verify result X_scaled_f64 = StandardScaler().fit_transform(X.astype(np.float64)) # Overflow calculations may cause -inf, inf, or nan. Since there is no nan # input, all of the outputs should be finite. This may be redundant since a # FloatingPointError exception will be thrown on overflow above. assert np.all(np.isfinite(X_scaled)) # The normal distribution is very unlikely to go above 4. At 4.0-8.0 the # float16 precision is 2^-8 which is around 0.004. Thus only 2 decimals are # checked to account for precision differences. assert_array_almost_equal(X_scaled, X_scaled_f64, decimal=2) def test_handle_zeros_in_scale(): s1 = np.array([0, 1, 2, 3]) s2 = _handle_zeros_in_scale(s1, copy=True) assert not s1[0] == s2[0] assert_array_equal(s1, np.array([0, 1, 2, 3])) assert_array_equal(s2, np.array([1, 1, 2, 3])) def test_minmax_scaler_partial_fit(): # Test if partial_fit run over many batches of size 1 and 50 # gives the same results as fit X = X_2d n = X.shape[0] for chunk_size in [1, 2, 50, n, n + 42]: # Test mean at the end of the process scaler_batch = MinMaxScaler().fit(X) scaler_incr = MinMaxScaler() for batch in gen_batches(n_samples, chunk_size): scaler_incr = scaler_incr.partial_fit(X[batch]) assert_array_almost_equal(scaler_batch.data_min_, scaler_incr.data_min_) assert_array_almost_equal(scaler_batch.data_max_, scaler_incr.data_max_) assert scaler_batch.n_samples_seen_ == scaler_incr.n_samples_seen_ assert_array_almost_equal(scaler_batch.data_range_, scaler_incr.data_range_) assert_array_almost_equal(scaler_batch.scale_, scaler_incr.scale_) assert_array_almost_equal(scaler_batch.min_, scaler_incr.min_) # Test std after 1 step batch0 = slice(0, chunk_size) scaler_batch = MinMaxScaler().fit(X[batch0]) scaler_incr = MinMaxScaler().partial_fit(X[batch0]) assert_array_almost_equal(scaler_batch.data_min_, scaler_incr.data_min_) assert_array_almost_equal(scaler_batch.data_max_, scaler_incr.data_max_) assert scaler_batch.n_samples_seen_ == scaler_incr.n_samples_seen_ assert_array_almost_equal(scaler_batch.data_range_, scaler_incr.data_range_) assert_array_almost_equal(scaler_batch.scale_, scaler_incr.scale_) assert_array_almost_equal(scaler_batch.min_, scaler_incr.min_) # Test std until the end of partial fits, and scaler_batch = MinMaxScaler().fit(X) scaler_incr = MinMaxScaler() # Clean estimator for i, batch in enumerate(gen_batches(n_samples, chunk_size)): scaler_incr = scaler_incr.partial_fit(X[batch]) assert_correct_incr(i, batch_start=batch.start, batch_stop=batch.stop, n=n, chunk_size=chunk_size, n_samples_seen=scaler_incr.n_samples_seen_) def test_standard_scaler_partial_fit(): # Test if partial_fit run over many batches of size 1 and 50 # gives the same results as fit X = X_2d n = X.shape[0] for chunk_size in [1, 2, 50, n, n + 42]: # Test mean at the end of the process scaler_batch = StandardScaler(with_std=False).fit(X) scaler_incr = StandardScaler(with_std=False) for batch in gen_batches(n_samples, chunk_size): scaler_incr = scaler_incr.partial_fit(X[batch]) assert_array_almost_equal(scaler_batch.mean_, scaler_incr.mean_) assert scaler_batch.var_ == scaler_incr.var_ # Nones assert scaler_batch.n_samples_seen_ == scaler_incr.n_samples_seen_ # Test std after 1 step batch0 = slice(0, chunk_size) scaler_incr = StandardScaler().partial_fit(X[batch0]) if chunk_size == 1: assert_array_almost_equal(np.zeros(n_features, dtype=np.float64), scaler_incr.var_) assert_array_almost_equal(np.ones(n_features, dtype=np.float64), scaler_incr.scale_) else: assert_array_almost_equal(np.var(X[batch0], axis=0), scaler_incr.var_) assert_array_almost_equal(np.std(X[batch0], axis=0), scaler_incr.scale_) # no constants # Test std until the end of partial fits, and scaler_batch = StandardScaler().fit(X) scaler_incr = StandardScaler() # Clean estimator for i, batch in enumerate(gen_batches(n_samples, chunk_size)): scaler_incr = scaler_incr.partial_fit(X[batch]) assert_correct_incr(i, batch_start=batch.start, batch_stop=batch.stop, n=n, chunk_size=chunk_size, n_samples_seen=scaler_incr.n_samples_seen_) assert_array_almost_equal(scaler_batch.var_, scaler_incr.var_) assert scaler_batch.n_samples_seen_ == scaler_incr.n_samples_seen_ def test_standard_scaler_partial_fit_numerical_stability(): # Test if the incremental computation introduces significative errors # for large datasets with values of large magniture rng = np.random.RandomState(0) n_features = 2 n_samples = 100 offsets = rng.uniform(-1e15, 1e15, size=n_features) scales = rng.uniform(1e3, 1e6, size=n_features) X = rng.randn(n_samples, n_features) * scales + offsets scaler_batch = StandardScaler().fit(X) scaler_incr = StandardScaler() for chunk in X: scaler_incr = scaler_incr.partial_fit(chunk.reshape(1, n_features)) # Regardless of abs values, they must not be more diff 6 significant digits tol = 10 ** (-6) assert_allclose(scaler_incr.mean_, scaler_batch.mean_, rtol=tol) assert_allclose(scaler_incr.var_, scaler_batch.var_, rtol=tol) assert_allclose(scaler_incr.scale_, scaler_batch.scale_, rtol=tol) # NOTE Be aware that for much larger offsets std is very unstable (last # assert) while mean is OK. # Sparse input size = (100, 3) scale = 1e20 X = rng.randint(0, 2, size).astype(np.float64) * scale X_csr = sparse.csr_matrix(X) X_csc = sparse.csc_matrix(X) for X in [X_csr, X_csc]: # with_mean=False is required with sparse input scaler = StandardScaler(with_mean=False).fit(X) scaler_incr = StandardScaler(with_mean=False) for chunk in X: # chunk = sparse.csr_matrix(data_chunks) scaler_incr = scaler_incr.partial_fit(chunk) # Regardless of magnitude, they must not differ more than of 6 digits tol = 10 ** (-6) assert scaler.mean_ is not None assert_allclose(scaler_incr.var_, scaler.var_, rtol=tol) assert_allclose(scaler_incr.scale_, scaler.scale_, rtol=tol) @pytest.mark.parametrize("sample_weight", [True, None]) def test_partial_fit_sparse_input(sample_weight): # Check that sparsity is not destroyed X = np.array([[1.], [0.], [0.], [5.]]) X_csr = sparse.csr_matrix(X) X_csc = sparse.csc_matrix(X) if sample_weight: sample_weight = rng.rand(X_csc.shape[0]) null_transform = StandardScaler(with_mean=False, with_std=False, copy=True) for X in [X_csr, X_csc]: X_null = null_transform.partial_fit( X, sample_weight=sample_weight).transform(X) assert_array_equal(X_null.toarray(), X.toarray()) X_orig = null_transform.inverse_transform(X_null) assert_array_equal(X_orig.toarray(), X_null.toarray()) assert_array_equal(X_orig.toarray(), X.toarray()) @pytest.mark.parametrize("sample_weight", [True, None]) def test_standard_scaler_trasform_with_partial_fit(sample_weight): # Check some postconditions after applying partial_fit and transform X = X_2d[:100, :] if sample_weight: sample_weight = rng.rand(X.shape[0]) scaler_incr = StandardScaler() for i, batch in enumerate(gen_batches(X.shape[0], 1)): X_sofar = X[:(i + 1), :] chunks_copy = X_sofar.copy() if sample_weight is None: scaled_batch = StandardScaler().fit_transform(X_sofar) scaler_incr = scaler_incr.partial_fit(X[batch]) else: scaled_batch = StandardScaler().fit_transform( X_sofar, sample_weight=sample_weight[:i + 1]) scaler_incr = scaler_incr.partial_fit( X[batch], sample_weight=sample_weight[batch]) scaled_incr = scaler_incr.transform(X_sofar) assert_array_almost_equal(scaled_batch, scaled_incr) assert_array_almost_equal(X_sofar, chunks_copy) # No change right_input = scaler_incr.inverse_transform(scaled_incr) assert_array_almost_equal(X_sofar, right_input) zero = np.zeros(X.shape[1]) epsilon = np.finfo(float).eps assert_array_less(zero, scaler_incr.var_ + epsilon) # as less or equal assert_array_less(zero, scaler_incr.scale_ + epsilon) if sample_weight is None: # (i+1) because the Scaler has been already fitted assert (i + 1) == scaler_incr.n_samples_seen_ else: assert ( np.sum(sample_weight[:i + 1]) == pytest.approx(scaler_incr.n_samples_seen_) ) def test_min_max_scaler_iris(): X = iris.data scaler = MinMaxScaler() # default params X_trans = scaler.fit_transform(X) assert_array_almost_equal(X_trans.min(axis=0), 0) assert_array_almost_equal(X_trans.max(axis=0), 1) X_trans_inv = scaler.inverse_transform(X_trans) assert_array_almost_equal(X, X_trans_inv) # not default params: min=1, max=2 scaler = MinMaxScaler(feature_range=(1, 2)) X_trans = scaler.fit_transform(X) assert_array_almost_equal(X_trans.min(axis=0), 1) assert_array_almost_equal(X_trans.max(axis=0), 2) X_trans_inv = scaler.inverse_transform(X_trans) assert_array_almost_equal(X, X_trans_inv) # min=-.5, max=.6 scaler = MinMaxScaler(feature_range=(-.5, .6)) X_trans = scaler.fit_transform(X) assert_array_almost_equal(X_trans.min(axis=0), -.5) assert_array_almost_equal(X_trans.max(axis=0), .6) X_trans_inv = scaler.inverse_transform(X_trans) assert_array_almost_equal(X, X_trans_inv) # raises on invalid range scaler = MinMaxScaler(feature_range=(2, 1)) with pytest.raises(ValueError): scaler.fit(X) def test_min_max_scaler_zero_variance_features(): # Check min max scaler on toy data with zero variance features X = [[0., 1., +0.5], [0., 1., -0.1], [0., 1., +1.1]] X_new = [[+0., 2., 0.5], [-1., 1., 0.0], [+0., 1., 1.5]] # default params scaler = MinMaxScaler() X_trans = scaler.fit_transform(X) X_expected_0_1 = [[0., 0., 0.5], [0., 0., 0.0], [0., 0., 1.0]] assert_array_almost_equal(X_trans, X_expected_0_1) X_trans_inv = scaler.inverse_transform(X_trans) assert_array_almost_equal(X, X_trans_inv) X_trans_new = scaler.transform(X_new) X_expected_0_1_new = [[+0., 1., 0.500], [-1., 0., 0.083], [+0., 0., 1.333]] assert_array_almost_equal(X_trans_new, X_expected_0_1_new, decimal=2) # not default params scaler = MinMaxScaler(feature_range=(1, 2)) X_trans = scaler.fit_transform(X) X_expected_1_2 = [[1., 1., 1.5], [1., 1., 1.0], [1., 1., 2.0]] assert_array_almost_equal(X_trans, X_expected_1_2) # function interface X_trans = minmax_scale(X) assert_array_almost_equal(X_trans, X_expected_0_1) X_trans = minmax_scale(X, feature_range=(1, 2)) assert_array_almost_equal(X_trans, X_expected_1_2) def test_minmax_scale_axis1(): X = iris.data X_trans = minmax_scale(X, axis=1) assert_array_almost_equal(np.min(X_trans, axis=1), 0) assert_array_almost_equal(np.max(X_trans, axis=1), 1) def test_min_max_scaler_1d(): # Test scaling of dataset along single axis for X in [X_1row, X_1col, X_list_1row, X_list_1row]: scaler = MinMaxScaler(copy=True) X_scaled = scaler.fit(X).transform(X) if isinstance(X, list): X = np.array(X) # cast only after scaling done if _check_dim_1axis(X) == 1: assert_array_almost_equal(X_scaled.min(axis=0), np.zeros(n_features)) assert_array_almost_equal(X_scaled.max(axis=0), np.zeros(n_features)) else: assert_array_almost_equal(X_scaled.min(axis=0), .0) assert_array_almost_equal(X_scaled.max(axis=0), 1.) assert scaler.n_samples_seen_ == X.shape[0] # check inverse transform X_scaled_back = scaler.inverse_transform(X_scaled) assert_array_almost_equal(X_scaled_back, X) # Constant feature X = np.ones((5, 1)) scaler = MinMaxScaler() X_scaled = scaler.fit(X).transform(X) assert X_scaled.min() >= 0. assert X_scaled.max() <= 1. assert scaler.n_samples_seen_ == X.shape[0] # Function interface X_1d = X_1row.ravel() min_ = X_1d.min() max_ = X_1d.max() assert_array_almost_equal((X_1d - min_) / (max_ - min_), minmax_scale(X_1d, copy=True)) @pytest.mark.parametrize("sample_weight", [True, None]) def test_scaler_without_centering(sample_weight): rng = np.random.RandomState(42) X = rng.randn(4, 5) X[:, 0] = 0.0 # first feature is always of zero X_csr = sparse.csr_matrix(X) X_csc = sparse.csc_matrix(X) if sample_weight: sample_weight = rng.rand(X.shape[0]) with pytest.raises(ValueError): StandardScaler().fit(X_csr) with pytest.raises(ValueError): StandardScaler().fit(X_csc) null_transform = StandardScaler(with_mean=False, with_std=False, copy=True) X_null = null_transform.fit_transform(X_csr) assert_array_equal(X_null.data, X_csr.data) X_orig = null_transform.inverse_transform(X_null) assert_array_equal(X_orig.data, X_csr.data) scaler = StandardScaler(with_mean=False).fit( X, sample_weight=sample_weight) X_scaled = scaler.transform(X, copy=True) assert not np.any(np.isnan(X_scaled)) scaler_csr = StandardScaler(with_mean=False).fit( X_csr, sample_weight=sample_weight) X_csr_scaled = scaler_csr.transform(X_csr, copy=True) assert not np.any(np.isnan(X_csr_scaled.data)) scaler_csc = StandardScaler(with_mean=False).fit( X_csc, sample_weight=sample_weight) X_csc_scaled = scaler_csc.transform(X_csc, copy=True) assert not np.any(np.isnan(X_csc_scaled.data)) assert_array_almost_equal(scaler.mean_, scaler_csr.mean_) assert_array_almost_equal(scaler.var_, scaler_csr.var_) assert_array_almost_equal(scaler.scale_, scaler_csr.scale_) assert_array_almost_equal(scaler.n_samples_seen_, scaler_csr.n_samples_seen_) assert_array_almost_equal(scaler.mean_, scaler_csc.mean_) assert_array_almost_equal(scaler.var_, scaler_csc.var_) assert_array_almost_equal(scaler.scale_, scaler_csc.scale_) assert_array_almost_equal(scaler.n_samples_seen_, scaler_csc.n_samples_seen_) if sample_weight is None: assert_array_almost_equal( X_scaled.mean(axis=0), [0., -0.01, 2.24, -0.35, -0.78], 2) assert_array_almost_equal(X_scaled.std(axis=0), [0., 1., 1., 1., 1.]) X_csr_scaled_mean, X_csr_scaled_var = \ mean_variance_axis(X_csr_scaled, 0) assert_array_almost_equal(X_csr_scaled_mean, X_scaled.mean(axis=0)) assert_array_almost_equal(X_csr_scaled_var, X_scaled.var(axis=0)) # Check that X has not been modified (copy) assert X_scaled is not X assert X_csr_scaled is not X_csr X_scaled_back = scaler.inverse_transform(X_scaled) assert X_scaled_back is not X assert X_scaled_back is not X_scaled assert_array_almost_equal(X_scaled_back, X) X_csr_scaled_back = scaler_csr.inverse_transform(X_csr_scaled) assert X_csr_scaled_back is not X_csr assert X_csr_scaled_back is not X_csr_scaled assert_array_almost_equal(X_csr_scaled_back.toarray(), X) X_csc_scaled_back = scaler_csr.inverse_transform(X_csc_scaled.tocsc()) assert X_csc_scaled_back is not X_csc assert X_csc_scaled_back is not X_csc_scaled assert_array_almost_equal(X_csc_scaled_back.toarray(), X) @pytest.mark.parametrize("with_mean", [True, False]) @pytest.mark.parametrize("with_std", [True, False]) @pytest.mark.parametrize("array_constructor", [np.asarray, sparse.csc_matrix, sparse.csr_matrix]) def test_scaler_n_samples_seen_with_nan(with_mean, with_std, array_constructor): X = np.array([[0, 1, 3], [np.nan, 6, 10], [5, 4, np.nan], [8, 0, np.nan]], dtype=np.float64) X = array_constructor(X) if sparse.issparse(X) and with_mean: pytest.skip("'with_mean=True' cannot be used with sparse matrix.") transformer = StandardScaler(with_mean=with_mean, with_std=with_std) transformer.fit(X) assert_array_equal(transformer.n_samples_seen_, np.array([3, 4, 2])) def _check_identity_scalers_attributes(scaler_1, scaler_2): assert scaler_1.mean_ is scaler_2.mean_ is None assert scaler_1.var_ is scaler_2.var_ is None assert scaler_1.scale_ is scaler_2.scale_ is None assert scaler_1.n_samples_seen_ == scaler_2.n_samples_seen_ def test_scaler_return_identity(): # test that the scaler return identity when with_mean and with_std are # False X_dense = np.array([[0, 1, 3], [5, 6, 0], [8, 0, 10]], dtype=np.float64) X_csr = sparse.csr_matrix(X_dense) X_csc = X_csr.tocsc() transformer_dense = StandardScaler(with_mean=False, with_std=False) X_trans_dense = transformer_dense.fit_transform(X_dense) transformer_csr = clone(transformer_dense) X_trans_csr = transformer_csr.fit_transform(X_csr) transformer_csc = clone(transformer_dense) X_trans_csc = transformer_csc.fit_transform(X_csc) assert_allclose_dense_sparse(X_trans_csr, X_csr) assert_allclose_dense_sparse(X_trans_csc, X_csc) assert_allclose(X_trans_dense, X_dense) for trans_1, trans_2 in itertools.combinations([transformer_dense, transformer_csr, transformer_csc], 2): _check_identity_scalers_attributes(trans_1, trans_2) transformer_dense.partial_fit(X_dense) transformer_csr.partial_fit(X_csr) transformer_csc.partial_fit(X_csc) for trans_1, trans_2 in itertools.combinations([transformer_dense, transformer_csr, transformer_csc], 2): _check_identity_scalers_attributes(trans_1, trans_2) transformer_dense.fit(X_dense) transformer_csr.fit(X_csr) transformer_csc.fit(X_csc) for trans_1, trans_2 in itertools.combinations([transformer_dense, transformer_csr, transformer_csc], 2): _check_identity_scalers_attributes(trans_1, trans_2) def test_scaler_int(): # test that scaler converts integer input to floating # for both sparse and dense matrices rng = np.random.RandomState(42) X = rng.randint(20, size=(4, 5)) X[:, 0] = 0 # first feature is always of zero X_csr = sparse.csr_matrix(X) X_csc = sparse.csc_matrix(X) null_transform = StandardScaler(with_mean=False, with_std=False, copy=True) with warnings.catch_warnings(record=True): X_null = null_transform.fit_transform(X_csr) assert_array_equal(X_null.data, X_csr.data) X_orig = null_transform.inverse_transform(X_null) assert_array_equal(X_orig.data, X_csr.data) with warnings.catch_warnings(record=True): scaler = StandardScaler(with_mean=False).fit(X) X_scaled = scaler.transform(X, copy=True) assert not np.any(np.isnan(X_scaled)) with warnings.catch_warnings(record=True): scaler_csr = StandardScaler(with_mean=False).fit(X_csr) X_csr_scaled = scaler_csr.transform(X_csr, copy=True) assert not np.any(np.isnan(X_csr_scaled.data)) with warnings.catch_warnings(record=True): scaler_csc = StandardScaler(with_mean=False).fit(X_csc) X_csc_scaled = scaler_csc.transform(X_csc, copy=True) assert not np.any(np.isnan(X_csc_scaled.data)) assert_array_almost_equal(scaler.mean_, scaler_csr.mean_) assert_array_almost_equal(scaler.var_, scaler_csr.var_) assert_array_almost_equal(scaler.scale_, scaler_csr.scale_) assert_array_almost_equal(scaler.mean_, scaler_csc.mean_) assert_array_almost_equal(scaler.var_, scaler_csc.var_) assert_array_almost_equal(scaler.scale_, scaler_csc.scale_) assert_array_almost_equal( X_scaled.mean(axis=0), [0., 1.109, 1.856, 21., 1.559], 2) assert_array_almost_equal(X_scaled.std(axis=0), [0., 1., 1., 1., 1.]) X_csr_scaled_mean, X_csr_scaled_std = mean_variance_axis( X_csr_scaled.astype(float), 0) assert_array_almost_equal(X_csr_scaled_mean, X_scaled.mean(axis=0)) assert_array_almost_equal(X_csr_scaled_std, X_scaled.std(axis=0)) # Check that X has not been modified (copy) assert X_scaled is not X assert X_csr_scaled is not X_csr X_scaled_back = scaler.inverse_transform(X_scaled) assert X_scaled_back is not X assert X_scaled_back is not X_scaled assert_array_almost_equal(X_scaled_back, X) X_csr_scaled_back = scaler_csr.inverse_transform(X_csr_scaled) assert X_csr_scaled_back is not X_csr assert X_csr_scaled_back is not X_csr_scaled assert_array_almost_equal(X_csr_scaled_back.toarray(), X) X_csc_scaled_back = scaler_csr.inverse_transform(X_csc_scaled.tocsc()) assert X_csc_scaled_back is not X_csc assert X_csc_scaled_back is not X_csc_scaled assert_array_almost_equal(X_csc_scaled_back.toarray(), X) def test_scaler_without_copy(): # Check that StandardScaler.fit does not change input rng = np.random.RandomState(42) X = rng.randn(4, 5) X[:, 0] = 0.0 # first feature is always of zero X_csr = sparse.csr_matrix(X) X_csc = sparse.csc_matrix(X) X_copy = X.copy() StandardScaler(copy=False).fit(X) assert_array_equal(X, X_copy) X_csr_copy = X_csr.copy() StandardScaler(with_mean=False, copy=False).fit(X_csr) assert_array_equal(X_csr.toarray(), X_csr_copy.toarray()) X_csc_copy = X_csc.copy() StandardScaler(with_mean=False, copy=False).fit(X_csc) assert_array_equal(X_csc.toarray(), X_csc_copy.toarray()) def test_scale_sparse_with_mean_raise_exception(): rng = np.random.RandomState(42) X = rng.randn(4, 5) X_csr = sparse.csr_matrix(X) X_csc = sparse.csc_matrix(X) # check scaling and fit with direct calls on sparse data with pytest.raises(ValueError): scale(X_csr, with_mean=True) with pytest.raises(ValueError): StandardScaler(with_mean=True).fit(X_csr) with pytest.raises(ValueError): scale(X_csc, with_mean=True) with pytest.raises(ValueError): StandardScaler(with_mean=True).fit(X_csc) # check transform and inverse_transform after a fit on a dense array scaler = StandardScaler(with_mean=True).fit(X) with pytest.raises(ValueError): scaler.transform(X_csr) with pytest.raises(ValueError): scaler.transform(X_csc) X_transformed_csr = sparse.csr_matrix(scaler.transform(X)) with pytest.raises(ValueError): scaler.inverse_transform(X_transformed_csr) X_transformed_csc = sparse.csc_matrix(scaler.transform(X)) with pytest.raises(ValueError): scaler.inverse_transform(X_transformed_csc) def test_scale_input_finiteness_validation(): # Check if non finite inputs raise ValueError X = [[np.inf, 5, 6, 7, 8]] with pytest.raises(ValueError, match="Input contains infinity " "or a value too large"): scale(X) def test_robust_scaler_error_sparse(): X_sparse = sparse.rand(1000, 10) scaler = RobustScaler(with_centering=True) err_msg = "Cannot center sparse matrices" with pytest.raises(ValueError, match=err_msg): scaler.fit(X_sparse) @pytest.mark.parametrize("with_centering", [True, False]) @pytest.mark.parametrize("with_scaling", [True, False]) @pytest.mark.parametrize("X", [np.random.randn(10, 3), sparse.rand(10, 3, density=0.5)]) def test_robust_scaler_attributes(X, with_centering, with_scaling): # check consistent type of attributes if with_centering and sparse.issparse(X): pytest.skip("RobustScaler cannot center sparse matrix") scaler = RobustScaler(with_centering=with_centering, with_scaling=with_scaling) scaler.fit(X) if with_centering: assert isinstance(scaler.center_, np.ndarray) else: assert scaler.center_ is None if with_scaling: assert isinstance(scaler.scale_, np.ndarray) else: assert scaler.scale_ is None def test_robust_scaler_col_zero_sparse(): # check that the scaler is working when there is not data materialized in a # column of a sparse matrix X = np.random.randn(10, 5) X[:, 0] = 0 X = sparse.csr_matrix(X) scaler = RobustScaler(with_centering=False) scaler.fit(X) assert scaler.scale_[0] == pytest.approx(1) X_trans = scaler.transform(X) assert_allclose(X[:, 0].toarray(), X_trans[:, 0].toarray()) def test_robust_scaler_2d_arrays(): # Test robust scaling of 2d array along first axis rng = np.random.RandomState(0) X = rng.randn(4, 5) X[:, 0] = 0.0 # first feature is always of zero scaler = RobustScaler() X_scaled = scaler.fit(X).transform(X) assert_array_almost_equal(np.median(X_scaled, axis=0), 5 * [0.0]) assert_array_almost_equal(X_scaled.std(axis=0)[0], 0) @pytest.mark.parametrize("density", [0, 0.05, 0.1, 0.5, 1]) @pytest.mark.parametrize("strictly_signed", ['positive', 'negative', 'zeros', None]) def test_robust_scaler_equivalence_dense_sparse(density, strictly_signed): # Check the equivalence of the fitting with dense and sparse matrices X_sparse = sparse.rand(1000, 5, density=density).tocsc() if strictly_signed == 'positive': X_sparse.data = np.abs(X_sparse.data) elif strictly_signed == 'negative': X_sparse.data = - np.abs(X_sparse.data) elif strictly_signed == 'zeros': X_sparse.data = np.zeros(X_sparse.data.shape, dtype=np.float64) X_dense = X_sparse.toarray() scaler_sparse = RobustScaler(with_centering=False) scaler_dense = RobustScaler(with_centering=False) scaler_sparse.fit(X_sparse) scaler_dense.fit(X_dense) assert_allclose(scaler_sparse.scale_, scaler_dense.scale_) def test_robust_scaler_transform_one_row_csr(): # Check RobustScaler on transforming csr matrix with one row rng = np.random.RandomState(0) X = rng.randn(4, 5) single_row = np.array([[0.1, 1., 2., 0., -1.]]) scaler = RobustScaler(with_centering=False) scaler = scaler.fit(X) row_trans = scaler.transform(sparse.csr_matrix(single_row)) row_expected = single_row / scaler.scale_ assert_array_almost_equal(row_trans.toarray(), row_expected) row_scaled_back = scaler.inverse_transform(row_trans) assert_array_almost_equal(single_row, row_scaled_back.toarray()) def test_robust_scaler_iris(): X = iris.data scaler = RobustScaler() X_trans = scaler.fit_transform(X) assert_array_almost_equal(np.median(X_trans, axis=0), 0) X_trans_inv = scaler.inverse_transform(X_trans) assert_array_almost_equal(X, X_trans_inv) q = np.percentile(X_trans, q=(25, 75), axis=0) iqr = q[1] - q[0] assert_array_almost_equal(iqr, 1) def test_robust_scaler_iris_quantiles(): X = iris.data scaler = RobustScaler(quantile_range=(10, 90)) X_trans = scaler.fit_transform(X) assert_array_almost_equal(np.median(X_trans, axis=0), 0) X_trans_inv = scaler.inverse_transform(X_trans) assert_array_almost_equal(X, X_trans_inv) q = np.percentile(X_trans, q=(10, 90), axis=0) q_range = q[1] - q[0] assert_array_almost_equal(q_range, 1) def test_quantile_transform_iris(): X = iris.data # uniform output distribution transformer = QuantileTransformer(n_quantiles=30) X_trans = transformer.fit_transform(X) X_trans_inv = transformer.inverse_transform(X_trans) assert_array_almost_equal(X, X_trans_inv) # normal output distribution transformer = QuantileTransformer(n_quantiles=30, output_distribution='normal') X_trans = transformer.fit_transform(X) X_trans_inv = transformer.inverse_transform(X_trans) assert_array_almost_equal(X, X_trans_inv) # make sure it is possible to take the inverse of a sparse matrix # which contain negative value; this is the case in the iris dataset X_sparse = sparse.csc_matrix(X) X_sparse_tran = transformer.fit_transform(X_sparse) X_sparse_tran_inv = transformer.inverse_transform(X_sparse_tran) assert_array_almost_equal(X_sparse.A, X_sparse_tran_inv.A) def test_quantile_transform_check_error(): X = np.transpose([[0, 25, 50, 0, 0, 0, 75, 0, 0, 100], [2, 4, 0, 0, 6, 8, 0, 10, 0, 0], [0, 0, 2.6, 4.1, 0, 0, 2.3, 0, 9.5, 0.1]]) X = sparse.csc_matrix(X) X_neg = np.transpose([[0, 25, 50, 0, 0, 0, 75, 0, 0, 100], [-2, 4, 0, 0, 6, 8, 0, 10, 0, 0], [0, 0, 2.6, 4.1, 0, 0, 2.3, 0, 9.5, 0.1]]) X_neg = sparse.csc_matrix(X_neg) err_msg = "Invalid value for 'n_quantiles': 0." with pytest.raises(ValueError, match=err_msg): QuantileTransformer(n_quantiles=0).fit(X) err_msg = "Invalid value for 'subsample': 0." with pytest.raises(ValueError, match=err_msg): QuantileTransformer(subsample=0).fit(X) err_msg = ("The number of quantiles cannot be greater than " "the number of samples used. Got 1000 quantiles " "and 10 samples.") with pytest.raises(ValueError, match=err_msg): QuantileTransformer(subsample=10).fit(X) transformer = QuantileTransformer(n_quantiles=10) err_msg = "QuantileTransformer only accepts non-negative sparse matrices." with pytest.raises(ValueError, match=err_msg): transformer.fit(X_neg) transformer.fit(X) err_msg = "QuantileTransformer only accepts non-negative sparse matrices." with pytest.raises(ValueError, match=err_msg): transformer.transform(X_neg) X_bad_feat = np.transpose([[0, 25, 50, 0, 0, 0, 75, 0, 0, 100], [0, 0, 2.6, 4.1, 0, 0, 2.3, 0, 9.5, 0.1]]) err_msg = ("X has 2 features, but QuantileTransformer is expecting " "3 features as input.") with pytest.raises(ValueError, match=err_msg): transformer.inverse_transform(X_bad_feat) transformer = QuantileTransformer(n_quantiles=10, output_distribution='rnd') # check that an error is raised at fit time err_msg = ("'output_distribution' has to be either 'normal' or " "'uniform'. Got 'rnd' instead.") with pytest.raises(ValueError, match=err_msg): transformer.fit(X) # check that an error is raised at transform time transformer.output_distribution = 'uniform' transformer.fit(X) X_tran = transformer.transform(X) transformer.output_distribution = 'rnd' err_msg = ("'output_distribution' has to be either 'normal' or 'uniform'." " Got 'rnd' instead.") with pytest.raises(ValueError, match=err_msg): transformer.transform(X) # check that an error is raised at inverse_transform time err_msg = ("'output_distribution' has to be either 'normal' or 'uniform'." " Got 'rnd' instead.") with pytest.raises(ValueError, match=err_msg): transformer.inverse_transform(X_tran) # check that an error is raised if input is scalar with pytest.raises(ValueError, match='Expected 2D array, got scalar array instead'): transformer.transform(10) # check that a warning is raised is n_quantiles > n_samples transformer = QuantileTransformer(n_quantiles=100) warn_msg = "n_quantiles is set to n_samples" with pytest.warns(UserWarning, match=warn_msg) as record: transformer.fit(X) assert len(record) == 1 assert transformer.n_quantiles_ == X.shape[0] def test_quantile_transform_sparse_ignore_zeros(): X = np.array([[0, 1], [0, 0], [0, 2], [0, 2], [0, 1]]) X_sparse = sparse.csc_matrix(X) transformer = QuantileTransformer(ignore_implicit_zeros=True, n_quantiles=5) # dense case -> warning raise assert_warns_message(UserWarning, "'ignore_implicit_zeros' takes effect" " only with sparse matrix. This parameter has no" " effect.", transformer.fit, X) X_expected = np.array([[0, 0], [0, 0], [0, 1], [0, 1], [0, 0]]) X_trans = transformer.fit_transform(X_sparse) assert_almost_equal(X_expected, X_trans.A) # consider the case where sparse entries are missing values and user-given # zeros are to be considered X_data = np.array([0, 0, 1, 0, 2, 2, 1, 0, 1, 2, 0]) X_col = np.array([0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1]) X_row = np.array([0, 4, 0, 1, 2, 3, 4, 5, 6, 7, 8]) X_sparse = sparse.csc_matrix((X_data, (X_row, X_col))) X_trans = transformer.fit_transform(X_sparse) X_expected = np.array([[0., 0.5], [0., 0.], [0., 1.], [0., 1.], [0., 0.5], [0., 0.], [0., 0.5], [0., 1.], [0., 0.]]) assert_almost_equal(X_expected, X_trans.A) transformer = QuantileTransformer(ignore_implicit_zeros=True, n_quantiles=5) X_data = np.array([-1, -1, 1, 0, 0, 0, 1, -1, 1]) X_col = np.array([0, 0, 1, 1, 1, 1, 1, 1, 1]) X_row = np.array([0, 4, 0, 1, 2, 3, 4, 5, 6]) X_sparse = sparse.csc_matrix((X_data, (X_row, X_col))) X_trans = transformer.fit_transform(X_sparse) X_expected = np.array([[0, 1], [0, 0.375], [0, 0.375], [0, 0.375], [0, 1], [0, 0], [0, 1]]) assert_almost_equal(X_expected, X_trans.A) assert_almost_equal(X_sparse.A, transformer.inverse_transform(X_trans).A) # check in conjunction with subsampling transformer = QuantileTransformer(ignore_implicit_zeros=True, n_quantiles=5, subsample=8, random_state=0) X_trans = transformer.fit_transform(X_sparse) assert_almost_equal(X_expected, X_trans.A) assert_almost_equal(X_sparse.A, transformer.inverse_transform(X_trans).A) def test_quantile_transform_dense_toy(): X = np.array([[0, 2, 2.6], [25, 4, 4.1], [50, 6, 2.3], [75, 8, 9.5], [100, 10, 0.1]]) transformer = QuantileTransformer(n_quantiles=5) transformer.fit(X) # using the a uniform output, each entry of X should be map between 0 and 1 # and equally spaced X_trans = transformer.fit_transform(X) X_expected = np.tile(np.linspace(0, 1, num=5), (3, 1)).T assert_almost_equal(np.sort(X_trans, axis=0), X_expected) X_test = np.array([ [-1, 1, 0], [101, 11, 10], ]) X_expected = np.array([ [0, 0, 0], [1, 1, 1], ]) assert_array_almost_equal(transformer.transform(X_test), X_expected) X_trans_inv = transformer.inverse_transform(X_trans) assert_array_almost_equal(X, X_trans_inv) def test_quantile_transform_subsampling(): # Test that subsampling the input yield to a consistent results We check # that the computed quantiles are almost mapped to a [0, 1] vector where # values are equally spaced. The infinite norm is checked to be smaller # than a given threshold. This is repeated 5 times. # dense support n_samples = 1000000 n_quantiles = 1000 X = np.sort(np.random.sample((n_samples, 1)), axis=0) ROUND = 5 inf_norm_arr = [] for random_state in range(ROUND): transformer = QuantileTransformer(random_state=random_state, n_quantiles=n_quantiles, subsample=n_samples // 10) transformer.fit(X) diff = (np.linspace(0, 1, n_quantiles) - np.ravel(transformer.quantiles_)) inf_norm = np.max(np.abs(diff)) assert inf_norm < 1e-2 inf_norm_arr.append(inf_norm) # each random subsampling yield a unique approximation to the expected # linspace CDF assert len(np.unique(inf_norm_arr)) == len(inf_norm_arr) # sparse support X = sparse.rand(n_samples, 1, density=.99, format='csc', random_state=0) inf_norm_arr = [] for random_state in range(ROUND): transformer = QuantileTransformer(random_state=random_state, n_quantiles=n_quantiles, subsample=n_samples // 10) transformer.fit(X) diff = (np.linspace(0, 1, n_quantiles) - np.ravel(transformer.quantiles_)) inf_norm = np.max(np.abs(diff)) assert inf_norm < 1e-1 inf_norm_arr.append(inf_norm) # each random subsampling yield a unique approximation to the expected # linspace CDF assert len(np.unique(inf_norm_arr)) == len(inf_norm_arr) def test_quantile_transform_sparse_toy(): X = np.array([[0., 2., 0.], [25., 4., 0.], [50., 0., 2.6], [0., 0., 4.1], [0., 6., 0.], [0., 8., 0.], [75., 0., 2.3], [0., 10., 0.], [0., 0., 9.5], [100., 0., 0.1]]) X = sparse.csc_matrix(X) transformer = QuantileTransformer(n_quantiles=10) transformer.fit(X) X_trans = transformer.fit_transform(X) assert_array_almost_equal(np.min(X_trans.toarray(), axis=0), 0.) assert_array_almost_equal(np.max(X_trans.toarray(), axis=0), 1.) X_trans_inv = transformer.inverse_transform(X_trans) assert_array_almost_equal(X.toarray(), X_trans_inv.toarray()) transformer_dense = QuantileTransformer(n_quantiles=10).fit( X.toarray()) X_trans = transformer_dense.transform(X) assert_array_almost_equal(np.min(X_trans.toarray(), axis=0), 0.) assert_array_almost_equal(np.max(X_trans.toarray(), axis=0), 1.) X_trans_inv = transformer_dense.inverse_transform(X_trans) assert_array_almost_equal(X.toarray(), X_trans_inv.toarray()) def test_quantile_transform_axis1(): X = np.array([[0, 25, 50, 75, 100], [2, 4, 6, 8, 10], [2.6, 4.1, 2.3, 9.5, 0.1]]) X_trans_a0 = quantile_transform(X.T, axis=0, n_quantiles=5) X_trans_a1 = quantile_transform(X, axis=1, n_quantiles=5) assert_array_almost_equal(X_trans_a0, X_trans_a1.T) def test_quantile_transform_bounds(): # Lower and upper bounds are manually mapped. We checked that in the case # of a constant feature and binary feature, the bounds are properly mapped. X_dense = np.array([[0, 0], [0, 0], [1, 0]]) X_sparse = sparse.csc_matrix(X_dense) # check sparse and dense are consistent X_trans = QuantileTransformer(n_quantiles=3, random_state=0).fit_transform(X_dense) assert_array_almost_equal(X_trans, X_dense) X_trans_sp = QuantileTransformer(n_quantiles=3, random_state=0).fit_transform(X_sparse) assert_array_almost_equal(X_trans_sp.A, X_dense) assert_array_almost_equal(X_trans, X_trans_sp.A) # check the consistency of the bounds by learning on 1 matrix # and transforming another X = np.array([[0, 1], [0, 0.5], [1, 0]]) X1 = np.array([[0, 0.1], [0, 0.5], [1, 0.1]]) transformer = QuantileTransformer(n_quantiles=3).fit(X) X_trans = transformer.transform(X1) assert_array_almost_equal(X_trans, X1) # check that values outside of the range learned will be mapped properly. X = np.random.random((1000, 1)) transformer = QuantileTransformer() transformer.fit(X) assert (transformer.transform([[-10]]) == transformer.transform([[np.min(X)]])) assert (transformer.transform([[10]]) == transformer.transform([[np.max(X)]])) assert (transformer.inverse_transform([[-10]]) == transformer.inverse_transform( [[np.min(transformer.references_)]])) assert (transformer.inverse_transform([[10]]) == transformer.inverse_transform( [[np.max(transformer.references_)]])) def test_quantile_transform_and_inverse(): X_1 = iris.data X_2 = np.array([[0.], [BOUNDS_THRESHOLD / 10], [1.5], [2], [3], [3], [4]]) for X in [X_1, X_2]: transformer = QuantileTransformer(n_quantiles=1000, random_state=0) X_trans = transformer.fit_transform(X) X_trans_inv = transformer.inverse_transform(X_trans) assert_array_almost_equal(X, X_trans_inv, decimal=9) def test_quantile_transform_nan(): X = np.array([[np.nan, 0, 0, 1], [np.nan, np.nan, 0, 0.5], [np.nan, 1, 1, 0]]) transformer = QuantileTransformer(n_quantiles=10, random_state=42) transformer.fit_transform(X) # check that the quantile of the first column is all NaN assert np.isnan(transformer.quantiles_[:, 0]).all() # all other column should not contain NaN assert not np.isnan(transformer.quantiles_[:, 1:]).any() @pytest.mark.parametrize("array_type", ['array', 'sparse']) def test_quantile_transformer_sorted_quantiles(array_type): # Non-regression test for: # https://github.com/scikit-learn/scikit-learn/issues/15733 # Taken from upstream bug report: # https://github.com/numpy/numpy/issues/14685 X = np.array([0, 1, 1, 2, 2, 3, 3, 4, 5, 5, 1, 1, 9, 9, 9, 8, 8, 7] * 10) X = 0.1 * X.reshape(-1, 1) X = _convert_container(X, array_type) n_quantiles = 100 qt = QuantileTransformer(n_quantiles=n_quantiles).fit(X) # Check that the estimated quantile threasholds are monotically # increasing: quantiles = qt.quantiles_[:, 0] assert len(quantiles) == 100 assert all(np.diff(quantiles) >= 0) def test_robust_scaler_invalid_range(): for range_ in [ (-1, 90), (-2, -3), (10, 101), (100.5, 101), (90, 50), ]: scaler = RobustScaler(quantile_range=range_) with pytest.raises(ValueError, match=r'Invalid quantile range: \('): scaler.fit(iris.data) def test_scale_function_without_centering(): rng = np.random.RandomState(42) X = rng.randn(4, 5) X[:, 0] = 0.0 # first feature is always of zero X_csr = sparse.csr_matrix(X) X_scaled = scale(X, with_mean=False) assert not np.any(np.isnan(X_scaled)) X_csr_scaled = scale(X_csr, with_mean=False) assert not np.any(np.isnan(X_csr_scaled.data)) # test csc has same outcome X_csc_scaled = scale(X_csr.tocsc(), with_mean=False) assert_array_almost_equal(X_scaled, X_csc_scaled.toarray()) # raises value error on axis != 0 with pytest.raises(ValueError): scale(X_csr, with_mean=False, axis=1) assert_array_almost_equal(X_scaled.mean(axis=0), [0., -0.01, 2.24, -0.35, -0.78], 2) assert_array_almost_equal(X_scaled.std(axis=0), [0., 1., 1., 1., 1.]) # Check that X has not been copied assert X_scaled is not X X_csr_scaled_mean, X_csr_scaled_std = mean_variance_axis(X_csr_scaled, 0) assert_array_almost_equal(X_csr_scaled_mean, X_scaled.mean(axis=0)) assert_array_almost_equal(X_csr_scaled_std, X_scaled.std(axis=0)) # null scale X_csr_scaled = scale(X_csr, with_mean=False, with_std=False, copy=True) assert_array_almost_equal(X_csr.toarray(), X_csr_scaled.toarray()) def test_robust_scale_axis1(): X = iris.data X_trans = robust_scale(X, axis=1) assert_array_almost_equal(np.median(X_trans, axis=1), 0) q = np.percentile(X_trans, q=(25, 75), axis=1) iqr = q[1] - q[0] assert_array_almost_equal(iqr, 1) def test_robust_scale_1d_array(): X = iris.data[:, 1] X_trans = robust_scale(X) assert_array_almost_equal(np.median(X_trans), 0) q = np.percentile(X_trans, q=(25, 75)) iqr = q[1] - q[0] assert_array_almost_equal(iqr, 1) def test_robust_scaler_zero_variance_features(): # Check RobustScaler on toy data with zero variance features X = [[0., 1., +0.5], [0., 1., -0.1], [0., 1., +1.1]] scaler = RobustScaler() X_trans = scaler.fit_transform(X) # NOTE: for such a small sample size, what we expect in the third column # depends HEAVILY on the method used to calculate quantiles. The values # here were calculated to fit the quantiles produces by np.percentile # using numpy 1.9 Calculating quantiles with # scipy.stats.mstats.scoreatquantile or scipy.stats.mstats.mquantiles # would yield very different results! X_expected = [[0., 0., +0.0], [0., 0., -1.0], [0., 0., +1.0]] assert_array_almost_equal(X_trans, X_expected) X_trans_inv = scaler.inverse_transform(X_trans) assert_array_almost_equal(X, X_trans_inv) # make sure new data gets transformed correctly X_new = [[+0., 2., 0.5], [-1., 1., 0.0], [+0., 1., 1.5]] X_trans_new = scaler.transform(X_new) X_expected_new = [[+0., 1., +0.], [-1., 0., -0.83333], [+0., 0., +1.66667]] assert_array_almost_equal(X_trans_new, X_expected_new, decimal=3) def test_robust_scaler_unit_variance(): # Check RobustScaler with unit_variance=True on standard normal data with # outliers rng = np.random.RandomState(42) X = rng.randn(1000000, 1) X_with_outliers = np.vstack( [X, np.ones((100, 1)) * 100, np.ones((100, 1)) * -100] ) quantile_range = (1, 99) robust_scaler = RobustScaler( quantile_range=quantile_range, unit_variance=True ).fit(X_with_outliers) X_trans = robust_scaler.transform(X) assert robust_scaler.center_ == pytest.approx(0, abs=1e-3) assert robust_scaler.scale_ == pytest.approx(1, abs=1e-2) assert X_trans.std() == pytest.approx(1, abs=1e-2) def test_maxabs_scaler_zero_variance_features(): # Check MaxAbsScaler on toy data with zero variance features X = [[0., 1., +0.5], [0., 1., -0.3], [0., 1., +1.5], [0., 0., +0.0]] scaler = MaxAbsScaler() X_trans = scaler.fit_transform(X) X_expected = [[0., 1., 1.0 / 3.0], [0., 1., -0.2], [0., 1., 1.0], [0., 0., 0.0]] assert_array_almost_equal(X_trans, X_expected) X_trans_inv = scaler.inverse_transform(X_trans) assert_array_almost_equal(X, X_trans_inv) # make sure new data gets transformed correctly X_new = [[+0., 2., 0.5], [-1., 1., 0.0], [+0., 1., 1.5]] X_trans_new = scaler.transform(X_new) X_expected_new = [[+0., 2.0, 1.0 / 3.0], [-1., 1.0, 0.0], [+0., 1.0, 1.0]] assert_array_almost_equal(X_trans_new, X_expected_new, decimal=2) # function interface X_trans = maxabs_scale(X) assert_array_almost_equal(X_trans, X_expected) # sparse data X_csr = sparse.csr_matrix(X) X_csc = sparse.csc_matrix(X) X_trans_csr = scaler.fit_transform(X_csr) X_trans_csc = scaler.fit_transform(X_csc) X_expected = [[0., 1., 1.0 / 3.0], [0., 1., -0.2], [0., 1., 1.0], [0., 0., 0.0]] assert_array_almost_equal(X_trans_csr.A, X_expected) assert_array_almost_equal(X_trans_csc.A, X_expected) X_trans_csr_inv = scaler.inverse_transform(X_trans_csr) X_trans_csc_inv = scaler.inverse_transform(X_trans_csc) assert_array_almost_equal(X, X_trans_csr_inv.A) assert_array_almost_equal(X, X_trans_csc_inv.A) def test_maxabs_scaler_large_negative_value(): # Check MaxAbsScaler on toy data with a large negative value X = [[0., 1., +0.5, -1.0], [0., 1., -0.3, -0.5], [0., 1., -100.0, 0.0], [0., 0., +0.0, -2.0]] scaler = MaxAbsScaler() X_trans = scaler.fit_transform(X) X_expected = [[0., 1., 0.005, -0.5], [0., 1., -0.003, -0.25], [0., 1., -1.0, 0.0], [0., 0., 0.0, -1.0]] assert_array_almost_equal(X_trans, X_expected) def test_maxabs_scaler_transform_one_row_csr(): # Check MaxAbsScaler on transforming csr matrix with one row X = sparse.csr_matrix([[0.5, 1., 1.]]) scaler = MaxAbsScaler() scaler = scaler.fit(X) X_trans = scaler.transform(X) X_expected = sparse.csr_matrix([[1., 1., 1.]]) assert_array_almost_equal(X_trans.toarray(), X_expected.toarray()) X_scaled_back = scaler.inverse_transform(X_trans) assert_array_almost_equal(X.toarray(), X_scaled_back.toarray()) def test_maxabs_scaler_1d(): # Test scaling of dataset along single axis for X in [X_1row, X_1col, X_list_1row, X_list_1row]: scaler = MaxAbsScaler(copy=True) X_scaled = scaler.fit(X).transform(X) if isinstance(X, list): X = np.array(X) # cast only after scaling done if _check_dim_1axis(X) == 1: assert_array_almost_equal(np.abs(X_scaled.max(axis=0)), np.ones(n_features)) else: assert_array_almost_equal(np.abs(X_scaled.max(axis=0)), 1.) assert scaler.n_samples_seen_ == X.shape[0] # check inverse transform X_scaled_back = scaler.inverse_transform(X_scaled) assert_array_almost_equal(X_scaled_back, X) # Constant feature X = np.ones((5, 1)) scaler = MaxAbsScaler() X_scaled = scaler.fit(X).transform(X) assert_array_almost_equal(np.abs(X_scaled.max(axis=0)), 1.) assert scaler.n_samples_seen_ == X.shape[0] # function interface X_1d = X_1row.ravel() max_abs = np.abs(X_1d).max() assert_array_almost_equal(X_1d / max_abs, maxabs_scale(X_1d, copy=True)) def test_maxabs_scaler_partial_fit(): # Test if partial_fit run over many batches of size 1 and 50 # gives the same results as fit X = X_2d[:100, :] n = X.shape[0] for chunk_size in [1, 2, 50, n, n + 42]: # Test mean at the end of the process scaler_batch = MaxAbsScaler().fit(X) scaler_incr = MaxAbsScaler() scaler_incr_csr = MaxAbsScaler() scaler_incr_csc = MaxAbsScaler() for batch in gen_batches(n, chunk_size): scaler_incr = scaler_incr.partial_fit(X[batch]) X_csr = sparse.csr_matrix(X[batch]) scaler_incr_csr = scaler_incr_csr.partial_fit(X_csr) X_csc = sparse.csc_matrix(X[batch]) scaler_incr_csc = scaler_incr_csc.partial_fit(X_csc) assert_array_almost_equal(scaler_batch.max_abs_, scaler_incr.max_abs_) assert_array_almost_equal(scaler_batch.max_abs_, scaler_incr_csr.max_abs_) assert_array_almost_equal(scaler_batch.max_abs_, scaler_incr_csc.max_abs_) assert scaler_batch.n_samples_seen_ == scaler_incr.n_samples_seen_ assert (scaler_batch.n_samples_seen_ == scaler_incr_csr.n_samples_seen_) assert (scaler_batch.n_samples_seen_ == scaler_incr_csc.n_samples_seen_) assert_array_almost_equal(scaler_batch.scale_, scaler_incr.scale_) assert_array_almost_equal(scaler_batch.scale_, scaler_incr_csr.scale_) assert_array_almost_equal(scaler_batch.scale_, scaler_incr_csc.scale_) assert_array_almost_equal(scaler_batch.transform(X), scaler_incr.transform(X)) # Test std after 1 step batch0 = slice(0, chunk_size) scaler_batch = MaxAbsScaler().fit(X[batch0]) scaler_incr = MaxAbsScaler().partial_fit(X[batch0]) assert_array_almost_equal(scaler_batch.max_abs_, scaler_incr.max_abs_) assert scaler_batch.n_samples_seen_ == scaler_incr.n_samples_seen_ assert_array_almost_equal(scaler_batch.scale_, scaler_incr.scale_) assert_array_almost_equal(scaler_batch.transform(X), scaler_incr.transform(X)) # Test std until the end of partial fits, and scaler_batch = MaxAbsScaler().fit(X) scaler_incr = MaxAbsScaler() # Clean estimator for i, batch in enumerate(gen_batches(n, chunk_size)): scaler_incr = scaler_incr.partial_fit(X[batch]) assert_correct_incr(i, batch_start=batch.start, batch_stop=batch.stop, n=n, chunk_size=chunk_size, n_samples_seen=scaler_incr.n_samples_seen_) def test_normalizer_l1(): rng = np.random.RandomState(0) X_dense = rng.randn(4, 5) X_sparse_unpruned = sparse.csr_matrix(X_dense) # set the row number 3 to zero X_dense[3, :] = 0.0 # set the row number 3 to zero without pruning (can happen in real life) indptr_3 = X_sparse_unpruned.indptr[3] indptr_4 = X_sparse_unpruned.indptr[4] X_sparse_unpruned.data[indptr_3:indptr_4] = 0.0 # build the pruned variant using the regular constructor X_sparse_pruned = sparse.csr_matrix(X_dense) # check inputs that support the no-copy optim for X in (X_dense, X_sparse_pruned, X_sparse_unpruned): normalizer = Normalizer(norm='l1', copy=True) X_norm = normalizer.transform(X) assert X_norm is not X X_norm1 = toarray(X_norm) normalizer = Normalizer(norm='l1', copy=False) X_norm = normalizer.transform(X) assert X_norm is X X_norm2 = toarray(X_norm) for X_norm in (X_norm1, X_norm2): row_sums = np.abs(X_norm).sum(axis=1) for i in range(3): assert_almost_equal(row_sums[i], 1.0) assert_almost_equal(row_sums[3], 0.0) # check input for which copy=False won't prevent a copy for init in (sparse.coo_matrix, sparse.csc_matrix, sparse.lil_matrix): X = init(X_dense) X_norm = normalizer = Normalizer(norm='l2', copy=False).transform(X) assert X_norm is not X assert isinstance(X_norm, sparse.csr_matrix) X_norm = toarray(X_norm) for i in range(3): assert_almost_equal(row_sums[i], 1.0) assert_almost_equal(la.norm(X_norm[3]), 0.0) def test_normalizer_l2(): rng = np.random.RandomState(0) X_dense = rng.randn(4, 5) X_sparse_unpruned = sparse.csr_matrix(X_dense) # set the row number 3 to zero X_dense[3, :] = 0.0 # set the row number 3 to zero without pruning (can happen in real life) indptr_3 = X_sparse_unpruned.indptr[3] indptr_4 = X_sparse_unpruned.indptr[4] X_sparse_unpruned.data[indptr_3:indptr_4] = 0.0 # build the pruned variant using the regular constructor X_sparse_pruned = sparse.csr_matrix(X_dense) # check inputs that support the no-copy optim for X in (X_dense, X_sparse_pruned, X_sparse_unpruned): normalizer = Normalizer(norm='l2', copy=True) X_norm1 = normalizer.transform(X) assert X_norm1 is not X X_norm1 = toarray(X_norm1) normalizer = Normalizer(norm='l2', copy=False) X_norm2 = normalizer.transform(X) assert X_norm2 is X X_norm2 = toarray(X_norm2) for X_norm in (X_norm1, X_norm2): for i in range(3): assert_almost_equal(la.norm(X_norm[i]), 1.0) assert_almost_equal(la.norm(X_norm[3]), 0.0) # check input for which copy=False won't prevent a copy for init in (sparse.coo_matrix, sparse.csc_matrix, sparse.lil_matrix): X = init(X_dense) X_norm = normalizer = Normalizer(norm='l2', copy=False).transform(X) assert X_norm is not X assert isinstance(X_norm, sparse.csr_matrix) X_norm = toarray(X_norm) for i in range(3): assert_almost_equal(la.norm(X_norm[i]), 1.0) assert_almost_equal(la.norm(X_norm[3]), 0.0) def test_normalizer_max(): rng = np.random.RandomState(0) X_dense = rng.randn(4, 5) X_sparse_unpruned = sparse.csr_matrix(X_dense) # set the row number 3 to zero X_dense[3, :] = 0.0 # set the row number 3 to zero without pruning (can happen in real life) indptr_3 = X_sparse_unpruned.indptr[3] indptr_4 = X_sparse_unpruned.indptr[4] X_sparse_unpruned.data[indptr_3:indptr_4] = 0.0 # build the pruned variant using the regular constructor X_sparse_pruned = sparse.csr_matrix(X_dense) # check inputs that support the no-copy optim for X in (X_dense, X_sparse_pruned, X_sparse_unpruned): normalizer = Normalizer(norm='max', copy=True) X_norm1 = normalizer.transform(X) assert X_norm1 is not X X_norm1 = toarray(X_norm1) normalizer = Normalizer(norm='max', copy=False) X_norm2 = normalizer.transform(X) assert X_norm2 is X X_norm2 = toarray(X_norm2) for X_norm in (X_norm1, X_norm2): row_maxs = abs(X_norm).max(axis=1) for i in range(3): assert_almost_equal(row_maxs[i], 1.0) assert_almost_equal(row_maxs[3], 0.0) # check input for which copy=False won't prevent a copy for init in (sparse.coo_matrix, sparse.csc_matrix, sparse.lil_matrix): X = init(X_dense) X_norm = normalizer = Normalizer(norm='l2', copy=False).transform(X) assert X_norm is not X assert isinstance(X_norm, sparse.csr_matrix) X_norm = toarray(X_norm) for i in range(3): assert_almost_equal(row_maxs[i], 1.0) assert_almost_equal(la.norm(X_norm[3]), 0.0) def test_normalizer_max_sign(): # check that we normalize by a positive number even for negative data rng = np.random.RandomState(0) X_dense = rng.randn(4, 5) # set the row number 3 to zero X_dense[3, :] = 0.0 # check for mixed data where the value with # largest magnitude is negative X_dense[2, abs(X_dense[2, :]).argmax()] *= -1 X_all_neg = -np.abs(X_dense) X_all_neg_sparse = sparse.csr_matrix(X_all_neg) for X in (X_dense, X_all_neg, X_all_neg_sparse): normalizer = Normalizer(norm='max') X_norm = normalizer.transform(X) assert X_norm is not X X_norm = toarray(X_norm) assert_array_equal( np.sign(X_norm), np.sign(toarray(X))) def test_normalize(): # Test normalize function # Only tests functionality not used by the tests for Normalizer. X = np.random.RandomState(37).randn(3, 2) assert_array_equal(normalize(X, copy=False), normalize(X.T, axis=0, copy=False).T) with pytest.raises(ValueError): normalize([[0]], axis=2) with pytest.raises(ValueError): normalize([[0]], norm='l3') rs = np.random.RandomState(0) X_dense = rs.randn(10, 5) X_sparse = sparse.csr_matrix(X_dense) ones = np.ones((10)) for X in (X_dense, X_sparse): for dtype in (np.float32, np.float64): for norm in ('l1', 'l2'): X = X.astype(dtype) X_norm = normalize(X, norm=norm) assert X_norm.dtype == dtype X_norm = toarray(X_norm) if norm == 'l1': row_sums = np.abs(X_norm).sum(axis=1) else: X_norm_squared = X_norm**2 row_sums = X_norm_squared.sum(axis=1) assert_array_almost_equal(row_sums, ones) # Test return_norm X_dense = np.array([[3.0, 0, 4.0], [1.0, 0.0, 0.0], [2.0, 3.0, 0.0]]) for norm in ('l1', 'l2', 'max'): _, norms = normalize(X_dense, norm=norm, return_norm=True) if norm == 'l1': assert_array_almost_equal(norms, np.array([7.0, 1.0, 5.0])) elif norm == 'l2': assert_array_almost_equal(norms, np.array([5.0, 1.0, 3.60555127])) else: assert_array_almost_equal(norms, np.array([4.0, 1.0, 3.0])) X_sparse = sparse.csr_matrix(X_dense) for norm in ('l1', 'l2'): with pytest.raises(NotImplementedError): normalize(X_sparse, norm=norm, return_norm=True) _, norms = normalize(X_sparse, norm='max', return_norm=True) assert_array_almost_equal(norms, np.array([4.0, 1.0, 3.0])) def test_binarizer(): X_ = np.array([[1, 0, 5], [2, 3, -1]]) for init in (np.array, list, sparse.csr_matrix, sparse.csc_matrix): X = init(X_.copy()) binarizer = Binarizer(threshold=2.0, copy=True) X_bin = toarray(binarizer.transform(X)) assert np.sum(X_bin == 0) == 4 assert np.sum(X_bin == 1) == 2 X_bin = binarizer.transform(X) assert sparse.issparse(X) == sparse.issparse(X_bin) binarizer = Binarizer(copy=True).fit(X) X_bin = toarray(binarizer.transform(X)) assert X_bin is not X assert np.sum(X_bin == 0) == 2 assert np.sum(X_bin == 1) == 4 binarizer = Binarizer(copy=True) X_bin = binarizer.transform(X) assert X_bin is not X X_bin = toarray(X_bin) assert np.sum(X_bin == 0) == 2 assert np.sum(X_bin == 1) == 4 binarizer = Binarizer(copy=False) X_bin = binarizer.transform(X) if init is not list: assert X_bin is X binarizer = Binarizer(copy=False) X_float = np.array([[1, 0, 5], [2, 3, -1]], dtype=np.float64) X_bin = binarizer.transform(X_float) if init is not list: assert X_bin is X_float X_bin = toarray(X_bin) assert np.sum(X_bin == 0) == 2 assert np.sum(X_bin == 1) == 4 binarizer = Binarizer(threshold=-0.5, copy=True) for init in (np.array, list): X = init(X_.copy()) X_bin = toarray(binarizer.transform(X)) assert np.sum(X_bin == 0) == 1 assert np.sum(X_bin == 1) == 5 X_bin = binarizer.transform(X) # Cannot use threshold < 0 for sparse with pytest.raises(ValueError): binarizer.transform(sparse.csc_matrix(X)) def test_center_kernel(): # Test that KernelCenterer is equivalent to StandardScaler # in feature space rng = np.random.RandomState(0) X_fit = rng.random_sample((5, 4)) scaler = StandardScaler(with_std=False) scaler.fit(X_fit) X_fit_centered = scaler.transform(X_fit) K_fit = np.dot(X_fit, X_fit.T) # center fit time matrix centerer = KernelCenterer() K_fit_centered = np.dot(X_fit_centered, X_fit_centered.T) K_fit_centered2 = centerer.fit_transform(K_fit) assert_array_almost_equal(K_fit_centered, K_fit_centered2) # center predict time matrix X_pred = rng.random_sample((2, 4)) K_pred = np.dot(X_pred, X_fit.T) X_pred_centered = scaler.transform(X_pred) K_pred_centered = np.dot(X_pred_centered, X_fit_centered.T) K_pred_centered2 = centerer.transform(K_pred) assert_array_almost_equal(K_pred_centered, K_pred_centered2) def test_cv_pipeline_precomputed(): # Cross-validate a regression on four coplanar points with the same # value. Use precomputed kernel to ensure Pipeline with KernelCenterer # is treated as a pairwise operation. X = np.array([[3, 0, 0], [0, 3, 0], [0, 0, 3], [1, 1, 1]]) y_true = np.ones((4,)) K = X.dot(X.T) kcent = KernelCenterer() pipeline = Pipeline([("kernel_centerer", kcent), ("svr", SVR())]) # did the pipeline set the pairwise attribute? assert pipeline._get_tags()['pairwise'] # TODO: Remove in 1.1 msg = r"Attribute _pairwise was deprecated in version 0\.24" with pytest.warns(FutureWarning, match=msg): assert pipeline._pairwise # test cross-validation, score should be almost perfect # NB: this test is pretty vacuous -- it's mainly to test integration # of Pipeline and KernelCenterer y_pred = cross_val_predict(pipeline, K, y_true, cv=2) assert_array_almost_equal(y_true, y_pred) # TODO: Remove in 1.1 def test_pairwise_deprecated(): kcent = KernelCenterer() msg = r"Attribute _pairwise was deprecated in version 0\.24" with pytest.warns(FutureWarning, match=msg): kcent._pairwise def test_fit_transform(): rng = np.random.RandomState(0) X = rng.random_sample((5, 4)) for obj in ((StandardScaler(), Normalizer(), Binarizer())): X_transformed = obj.fit(X).transform(X) X_transformed2 = obj.fit_transform(X) assert_array_equal(X_transformed, X_transformed2) def test_add_dummy_feature(): X = [[1, 0], [0, 1], [0, 1]] X = add_dummy_feature(X) assert_array_equal(X, [[1, 1, 0], [1, 0, 1], [1, 0, 1]]) def test_add_dummy_feature_coo(): X = sparse.coo_matrix([[1, 0], [0, 1], [0, 1]]) X = add_dummy_feature(X) assert sparse.isspmatrix_coo(X), X assert_array_equal(X.toarray(), [[1, 1, 0], [1, 0, 1], [1, 0, 1]]) def test_add_dummy_feature_csc(): X = sparse.csc_matrix([[1, 0], [0, 1], [0, 1]]) X = add_dummy_feature(X) assert sparse.isspmatrix_csc(X), X assert_array_equal(X.toarray(), [[1, 1, 0], [1, 0, 1], [1, 0, 1]]) def test_add_dummy_feature_csr(): X = sparse.csr_matrix([[1, 0], [0, 1], [0, 1]]) X = add_dummy_feature(X) assert sparse.isspmatrix_csr(X), X assert_array_equal(X.toarray(), [[1, 1, 0], [1, 0, 1], [1, 0, 1]]) def test_fit_cold_start(): X = iris.data X_2d = X[:, :2] # Scalers that have a partial_fit method scalers = [StandardScaler(with_mean=False, with_std=False), MinMaxScaler(), MaxAbsScaler()] for scaler in scalers: scaler.fit_transform(X) # with a different shape, this may break the scaler unless the internal # state is reset scaler.fit_transform(X_2d) def test_quantile_transform_valid_axis(): X = np.array([[0, 25, 50, 75, 100], [2, 4, 6, 8, 10], [2.6, 4.1, 2.3, 9.5, 0.1]]) with pytest.raises(ValueError, match="axis should be either equal " "to 0 or 1. Got axis=2"): quantile_transform(X.T, axis=2) @pytest.mark.parametrize("method", ['box-cox', 'yeo-johnson']) def test_power_transformer_notfitted(method): pt = PowerTransformer(method=method) X = np.abs(X_1col) with pytest.raises(NotFittedError): pt.transform(X) with pytest.raises(NotFittedError): pt.inverse_transform(X) @pytest.mark.parametrize('method', ['box-cox', 'yeo-johnson']) @pytest.mark.parametrize('standardize', [True, False]) @pytest.mark.parametrize('X', [X_1col, X_2d]) def test_power_transformer_inverse(method, standardize, X): # Make sure we get the original input when applying transform and then # inverse transform X = np.abs(X) if method == 'box-cox' else X pt = PowerTransformer(method=method, standardize=standardize) X_trans = pt.fit_transform(X) assert_almost_equal(X, pt.inverse_transform(X_trans)) def test_power_transformer_1d(): X = np.abs(X_1col) for standardize in [True, False]: pt = PowerTransformer(method='box-cox', standardize=standardize) X_trans = pt.fit_transform(X) X_trans_func = power_transform( X, method='box-cox', standardize=standardize ) X_expected, lambda_expected = stats.boxcox(X.flatten()) if standardize: X_expected = scale(X_expected) assert_almost_equal(X_expected.reshape(-1, 1), X_trans) assert_almost_equal(X_expected.reshape(-1, 1), X_trans_func) assert_almost_equal(X, pt.inverse_transform(X_trans)) assert_almost_equal(lambda_expected, pt.lambdas_[0]) assert len(pt.lambdas_) == X.shape[1] assert isinstance(pt.lambdas_, np.ndarray) def test_power_transformer_2d(): X = np.abs(X_2d) for standardize in [True, False]: pt = PowerTransformer(method='box-cox', standardize=standardize) X_trans_class = pt.fit_transform(X) X_trans_func = power_transform( X, method='box-cox', standardize=standardize ) for X_trans in [X_trans_class, X_trans_func]: for j in range(X_trans.shape[1]): X_expected, lmbda = stats.boxcox(X[:, j].flatten()) if standardize: X_expected = scale(X_expected) assert_almost_equal(X_trans[:, j], X_expected) assert_almost_equal(lmbda, pt.lambdas_[j]) # Test inverse transformation X_inv = pt.inverse_transform(X_trans) assert_array_almost_equal(X_inv, X) assert len(pt.lambdas_) == X.shape[1] assert isinstance(pt.lambdas_, np.ndarray) def test_power_transformer_boxcox_strictly_positive_exception(): # Exceptions should be raised for negative arrays and zero arrays when # method is boxcox pt = PowerTransformer(method='box-cox') pt.fit(np.abs(X_2d)) X_with_negatives = X_2d not_positive_message = 'strictly positive' with pytest.raises(ValueError, match=not_positive_message): pt.transform(X_with_negatives) with pytest.raises(ValueError, match=not_positive_message): pt.fit(X_with_negatives) with pytest.raises(ValueError, match=not_positive_message): power_transform(X_with_negatives, method='box-cox') with pytest.raises(ValueError, match=not_positive_message): pt.transform(np.zeros(X_2d.shape)) with pytest.raises(ValueError, match=not_positive_message): pt.fit(np.zeros(X_2d.shape)) with pytest.raises(ValueError, match=not_positive_message): power_transform(np.zeros(X_2d.shape), method='box-cox') @pytest.mark.parametrize('X', [X_2d, np.abs(X_2d), -np.abs(X_2d), np.zeros(X_2d.shape)]) def test_power_transformer_yeojohnson_any_input(X): # Yeo-Johnson method should support any kind of input power_transform(X, method='yeo-johnson') @pytest.mark.parametrize("method", ['box-cox', 'yeo-johnson']) def test_power_transformer_shape_exception(method): pt = PowerTransformer(method=method) X = np.abs(X_2d) pt.fit(X) # Exceptions should be raised for arrays with different num_columns # than during fitting wrong_shape_message = (r"X has \d+ features, but PowerTransformer is " r"expecting \d+ features") with pytest.raises(ValueError, match=wrong_shape_message): pt.transform(X[:, 0:1]) with pytest.raises(ValueError, match=wrong_shape_message): pt.inverse_transform(X[:, 0:1]) def test_power_transformer_method_exception(): pt = PowerTransformer(method='monty-python') X = np.abs(X_2d) # An exception should be raised if PowerTransformer.method isn't valid bad_method_message = "'method' must be one of" with pytest.raises(ValueError, match=bad_method_message): pt.fit(X) def test_power_transformer_lambda_zero(): pt = PowerTransformer(method='box-cox', standardize=False) X = np.abs(X_2d)[:, 0:1] # Test the lambda = 0 case pt.lambdas_ = np.array([0]) X_trans = pt.transform(X) assert_array_almost_equal(pt.inverse_transform(X_trans), X) def test_power_transformer_lambda_one(): # Make sure lambda = 1 corresponds to the identity for yeo-johnson pt = PowerTransformer(method='yeo-johnson', standardize=False) X = np.abs(X_2d)[:, 0:1] pt.lambdas_ = np.array([1]) X_trans = pt.transform(X) assert_array_almost_equal(X_trans, X) @pytest.mark.parametrize("method, lmbda", [('box-cox', .1), ('box-cox', .5), ('yeo-johnson', .1), ('yeo-johnson', .5), ('yeo-johnson', 1.), ]) def test_optimization_power_transformer(method, lmbda): # Test the optimization procedure: # - set a predefined value for lambda # - apply inverse_transform to a normal dist (we get X_inv) # - apply fit_transform to X_inv (we get X_inv_trans) # - check that X_inv_trans is roughly equal to X rng = np.random.RandomState(0) n_samples = 20000 X = rng.normal(loc=0, scale=1, size=(n_samples, 1)) pt = PowerTransformer(method=method, standardize=False) pt.lambdas_ = [lmbda] X_inv = pt.inverse_transform(X) pt = PowerTransformer(method=method, standardize=False) X_inv_trans = pt.fit_transform(X_inv) assert_almost_equal(0, np.linalg.norm(X - X_inv_trans) / n_samples, decimal=2) assert_almost_equal(0, X_inv_trans.mean(), decimal=1) assert_almost_equal(1, X_inv_trans.std(), decimal=1) def test_yeo_johnson_darwin_example(): # test from original paper "A new family of power transformations to # improve normality or symmetry" by Yeo and Johnson. X = [6.1, -8.4, 1.0, 2.0, 0.7, 2.9, 3.5, 5.1, 1.8, 3.6, 7.0, 3.0, 9.3, 7.5, -6.0] X = np.array(X).reshape(-1, 1) lmbda = PowerTransformer(method='yeo-johnson').fit(X).lambdas_ assert np.allclose(lmbda, 1.305, atol=1e-3) @pytest.mark.parametrize('method', ['box-cox', 'yeo-johnson']) def test_power_transformer_nans(method): # Make sure lambda estimation is not influenced by NaN values # and that transform() supports NaN silently X = np.abs(X_1col) pt = PowerTransformer(method=method) pt.fit(X) lmbda_no_nans = pt.lambdas_[0] # concat nans at the end and check lambda stays the same X = np.concatenate([X, np.full_like(X, np.nan)]) X = shuffle(X, random_state=0) pt.fit(X) lmbda_nans = pt.lambdas_[0] assert_almost_equal(lmbda_no_nans, lmbda_nans, decimal=5) X_trans = pt.transform(X) assert_array_equal(np.isnan(X_trans), np.isnan(X)) @pytest.mark.parametrize('method', ['box-cox', 'yeo-johnson']) @pytest.mark.parametrize('standardize', [True, False]) def test_power_transformer_fit_transform(method, standardize): # check that fit_transform() and fit().transform() return the same values X = X_1col if method == 'box-cox': X = np.abs(X) pt = PowerTransformer(method, standardize=standardize) assert_array_almost_equal(pt.fit(X).transform(X), pt.fit_transform(X)) @pytest.mark.parametrize('method', ['box-cox', 'yeo-johnson']) @pytest.mark.parametrize('standardize', [True, False]) def test_power_transformer_copy_True(method, standardize): # Check that neither fit, transform, fit_transform nor inverse_transform # modify X inplace when copy=True X = X_1col if method == 'box-cox': X = np.abs(X) X_original = X.copy() assert X is not X_original # sanity checks assert_array_almost_equal(X, X_original) pt = PowerTransformer(method, standardize=standardize, copy=True) pt.fit(X) assert_array_almost_equal(X, X_original) X_trans = pt.transform(X) assert X_trans is not X X_trans = pt.fit_transform(X) assert_array_almost_equal(X, X_original) assert X_trans is not X X_inv_trans = pt.inverse_transform(X_trans) assert X_trans is not X_inv_trans @pytest.mark.parametrize('method', ['box-cox', 'yeo-johnson']) @pytest.mark.parametrize('standardize', [True, False]) def test_power_transformer_copy_False(method, standardize): # check that when copy=False fit doesn't change X inplace but transform, # fit_transform and inverse_transform do. X = X_1col if method == 'box-cox': X = np.abs(X) X_original = X.copy() assert X is not X_original # sanity checks assert_array_almost_equal(X, X_original) pt = PowerTransformer(method, standardize=standardize, copy=False) pt.fit(X) assert_array_almost_equal(X, X_original) # fit didn't change X X_trans = pt.transform(X) assert X_trans is X if method == 'box-cox': X = np.abs(X) X_trans = pt.fit_transform(X) assert X_trans is X X_inv_trans = pt.inverse_transform(X_trans) assert X_trans is X_inv_trans @pytest.mark.parametrize( "X_2", [sparse.random(10, 1, density=0.8, random_state=0), sparse.csr_matrix(np.full((10, 1), fill_value=np.nan))] ) def test_standard_scaler_sparse_partial_fit_finite_variance(X_2): # non-regression test for: # https://github.com/scikit-learn/scikit-learn/issues/16448 X_1 = sparse.random(5, 1, density=0.8) scaler = StandardScaler(with_mean=False) scaler.fit(X_1).partial_fit(X_2) assert np.isfinite(scaler.var_[0]) @pytest.mark.parametrize( "feature_range", [(0, 1), (-10, 10)] ) def test_minmax_scaler_clip(feature_range): # test behaviour of the paramter 'clip' in MinMaxScaler X = iris.data scaler = MinMaxScaler(feature_range=feature_range, clip=True).fit(X) X_min, X_max = np.min(X, axis=0), np.max(X, axis=0) X_test = [np.r_[X_min[:2] - 10, X_max[2:] + 10]] X_transformed = scaler.transform(X_test) assert_allclose( X_transformed, [[feature_range[0], feature_range[0], feature_range[1], feature_range[1]]])