2611 lines
93 KiB
Python
2611 lines
93 KiB
Python
|
# Authors:
|
||
|
#
|
||
|
# Giorgio Patrini
|
||
|
#
|
||
|
# License: BSD 3 clause
|
||
|
|
||
|
import re
|
||
|
import warnings
|
||
|
|
||
|
import numpy as np
|
||
|
import numpy.linalg as la
|
||
|
import pytest
|
||
|
from scipy import sparse, stats
|
||
|
|
||
|
from sklearn import datasets
|
||
|
from sklearn.base import clone
|
||
|
from sklearn.exceptions import NotFittedError
|
||
|
from sklearn.metrics.pairwise import linear_kernel
|
||
|
from sklearn.model_selection import cross_val_predict
|
||
|
from sklearn.pipeline import Pipeline
|
||
|
from sklearn.preprocessing import (
|
||
|
Binarizer,
|
||
|
KernelCenterer,
|
||
|
MaxAbsScaler,
|
||
|
MinMaxScaler,
|
||
|
Normalizer,
|
||
|
PowerTransformer,
|
||
|
QuantileTransformer,
|
||
|
RobustScaler,
|
||
|
StandardScaler,
|
||
|
add_dummy_feature,
|
||
|
maxabs_scale,
|
||
|
minmax_scale,
|
||
|
normalize,
|
||
|
power_transform,
|
||
|
quantile_transform,
|
||
|
robust_scale,
|
||
|
scale,
|
||
|
)
|
||
|
from sklearn.preprocessing._data import BOUNDS_THRESHOLD, _handle_zeros_in_scale
|
||
|
from sklearn.svm import SVR
|
||
|
from sklearn.utils import gen_batches, shuffle
|
||
|
from sklearn.utils._array_api import (
|
||
|
yield_namespace_device_dtype_combinations,
|
||
|
)
|
||
|
from sklearn.utils._testing import (
|
||
|
_convert_container,
|
||
|
assert_allclose,
|
||
|
assert_allclose_dense_sparse,
|
||
|
assert_almost_equal,
|
||
|
assert_array_almost_equal,
|
||
|
assert_array_equal,
|
||
|
assert_array_less,
|
||
|
skip_if_32bit,
|
||
|
)
|
||
|
from sklearn.utils.estimator_checks import (
|
||
|
_get_check_estimator_ids,
|
||
|
check_array_api_input_and_values,
|
||
|
)
|
||
|
from sklearn.utils.fixes import (
|
||
|
COO_CONTAINERS,
|
||
|
CSC_CONTAINERS,
|
||
|
CSR_CONTAINERS,
|
||
|
LIL_CONTAINERS,
|
||
|
)
|
||
|
from sklearn.utils.sparsefuncs import mean_variance_axis
|
||
|
|
||
|
# Iris dataset, loaded once at import time; the MinMaxScaler tests read `iris.data`.
iris = datasets.load_iris()
|
||
|
|
||
|
# Shared fixtures reused by many tests: a seeded random design matrix with
# per-feature offsets and scales, plus its first row / first column exposed
# both as 2D arrays and as nested Python lists.
rng = np.random.RandomState(0)
n_features = 30
n_samples = 1000
# NOTE: the order of the draws below fixes the generated values.
offsets = rng.uniform(low=-1, high=1, size=n_features)
scales = rng.uniform(low=1, high=10, size=n_features)
X_2d = rng.randn(n_samples, n_features) * scales + offsets
X_1row = X_2d[0, :].reshape(1, -1)
X_1col = X_2d[:, 0].reshape(-1, 1)
X_list_1row = X_1row.tolist()
X_list_1col = X_1col.tolist()
|
||
|
|
||
|
|
||
|
def toarray(a):
    """Return ``a`` densified through its ``toarray()`` method when it has one.

    Objects that do not expose a ``toarray`` attribute are returned unchanged.
    """
    return a.toarray() if hasattr(a, "toarray") else a
|
||
|
|
||
|
|
||
|
def _check_dim_1axis(a):
    """Return the length of ``a`` along its first axis after array conversion."""
    shape = np.asarray(a).shape
    return shape[0]
|
||
|
|
||
|
|
||
|
def assert_correct_incr(i, batch_start, batch_stop, n, chunk_size, n_samples_seen):
    """Assert that ``n_samples_seen`` matches the batches consumed so far.

    ``i`` is the zero-based index of the batch just processed. When the batch
    ends at ``n`` it is treated as the last (possibly shorter) one, so its
    actual length ``batch_stop - batch_start`` is counted instead of a full
    ``chunk_size``.
    """
    if batch_stop == n:
        expected_seen = i * chunk_size + (batch_stop - batch_start)
    else:
        expected_seen = (i + 1) * chunk_size
    assert expected_seen == n_samples_seen
|
||
|
|
||
|
|
||
|
def test_raises_value_error_if_sample_weights_greater_than_1d():
    # StandardScaler.fit must reject sample weights that are not scalar or 1D.
    for n_rows, n_cols in [(2, 3), (3, 2)]:
        X = rng.randn(n_rows, n_cols)
        y = rng.randn(n_rows)

        # A column vector of shape (n_rows, 1) is 2D and therefore invalid.
        weights_2d = rng.randn(n_rows, 1) ** 2

        scaler = StandardScaler()
        with pytest.raises(ValueError):
            scaler.fit(X, y, sample_weight=weights_2d)
|
||
|
|
||
|
|
||
|
@pytest.mark.parametrize(
    ["Xw", "X", "sample_weight"],
    [
        ([[1, 2, 3], [4, 5, 6]], [[1, 2, 3], [1, 2, 3], [4, 5, 6]], [2.0, 1.0]),
        (
            [[1, 0, 1], [0, 0, 1]],
            [[1, 0, 1], [0, 0, 1], [0, 0, 1], [0, 0, 1]],
            np.array([1, 3]),
        ),
        (
            [[1, np.nan, 1], [np.nan, np.nan, 1]],
            [
                [1, np.nan, 1],
                [np.nan, np.nan, 1],
                [np.nan, np.nan, 1],
                [np.nan, np.nan, 1],
            ],
            np.array([1, 3]),
        ),
    ],
)
@pytest.mark.parametrize("array_constructor", ["array", "sparse_csr", "sparse_csc"])
def test_standard_scaler_sample_weight(Xw, X, sample_weight, array_constructor):
    # Fitting on `Xw` with integer-like sample weights must give the same
    # statistics as fitting on `X`, where each row of `Xw` is repeated as many
    # times as its weight.

    # Centering is disabled for the sparse constructors.
    center = not array_constructor.startswith("sparse")
    X, Xw = (_convert_container(data, array_constructor) for data in (X, Xw))

    # Weighted fit on the compact data.
    weighted = StandardScaler(with_mean=center).fit(
        Xw, np.ones(Xw.shape[0]), sample_weight=sample_weight
    )

    # Unweighted fit on the data with explicitly repeated rows.
    repeated = StandardScaler(with_mean=center).fit(X, np.ones(X.shape[0]))

    X_test = [[1.5, 2.5, 3.5], [3.5, 4.5, 5.5]]

    for attr in ("mean_", "var_"):
        assert_almost_equal(getattr(repeated, attr), getattr(weighted, attr))
    assert_almost_equal(repeated.transform(X_test), weighted.transform(X_test))
|
||
|
|
||
|
|
||
|
def test_standard_scaler_1d():
    # Test StandardScaler on data that has a single row or a single column,
    # given either as arrays or as nested lists.
    #
    # The last entry used to repeat `X_list_1row`, which left the
    # single-column list input untested.
    for X in [X_1row, X_1col, X_list_1row, X_list_1col]:
        scaler = StandardScaler()
        X_scaled = scaler.fit(X).transform(X, copy=True)

        if isinstance(X, list):
            X = np.array(X)  # cast only after scaling done

        if _check_dim_1axis(X) == 1:
            # Single sample: every feature is constant, so the mean is the row
            # itself, the scale falls back to 1 and the output is all zeros.
            n_cols = X.shape[1]
            assert_almost_equal(scaler.mean_, X.ravel())
            assert_almost_equal(scaler.scale_, np.ones(n_cols))
            assert_array_almost_equal(X_scaled.mean(axis=0), np.zeros(n_cols))
            assert_array_almost_equal(X_scaled.std(axis=0), np.zeros(n_cols))
        else:
            # Single feature with many samples: regular standardization.
            assert_almost_equal(scaler.mean_, X.mean())
            assert_almost_equal(scaler.scale_, X.std())
            assert_array_almost_equal(X_scaled.mean(axis=0), 0.0)
            assert_array_almost_equal(X_scaled.std(axis=0), 1.0)
        assert scaler.n_samples_seen_ == X.shape[0]

        # check inverse transform
        X_scaled_back = scaler.inverse_transform(X_scaled)
        assert_array_almost_equal(X_scaled_back, X)

    # Constant feature
    X = np.ones((5, 1))
    scaler = StandardScaler()
    X_scaled = scaler.fit(X).transform(X, copy=True)
    assert_almost_equal(scaler.mean_, 1.0)
    assert_almost_equal(scaler.scale_, 1.0)
    assert_array_almost_equal(X_scaled.mean(axis=0), 0.0)
    assert_array_almost_equal(X_scaled.std(axis=0), 0.0)
    assert scaler.n_samples_seen_ == X.shape[0]
|
||
|
|
||
|
|
||
|
@pytest.mark.parametrize("sparse_container", [None] + CSC_CONTAINERS + CSR_CONTAINERS)
@pytest.mark.parametrize("add_sample_weight", [False, True])
def test_standard_scaler_dtype(add_sample_weight, sparse_container):
    # The transformed data must keep the input dtype, while the fitted
    # statistics are always stored as float64.
    rng = np.random.RandomState(0)
    n_samples, n_features = 10, 3
    sample_weight = np.ones(n_samples) if add_sample_weight else None

    is_dense = sparse_container is None
    dtypes = [np.float64, np.float32]
    if is_dense:
        # scipy sparse containers do not support float16, see
        # https://github.com/scipy/scipy/issues/7408 for more details.
        dtypes.append(np.float16)

    for dtype in dtypes:
        X = rng.randn(n_samples, n_features).astype(dtype)
        if not is_dense:
            X = sparse_container(X)

        # Centering is only requested for dense input.
        scaler = StandardScaler(with_mean=is_dense)
        X_scaled = scaler.fit(X, sample_weight=sample_weight).transform(X)
        assert X.dtype == X_scaled.dtype
        assert scaler.mean_.dtype == np.float64
        assert scaler.scale_.dtype == np.float64
|
||
|
|
||
|
|
||
|
@pytest.mark.parametrize(
    "scaler",
    [
        StandardScaler(with_mean=False),
        RobustScaler(with_centering=False),
    ],
)
@pytest.mark.parametrize("sparse_container", [None] + CSC_CONTAINERS + CSR_CONTAINERS)
@pytest.mark.parametrize("add_sample_weight", [False, True])
@pytest.mark.parametrize("dtype", [np.float32, np.float64])
@pytest.mark.parametrize("constant", [0, 1.0, 100.0])
def test_standard_scaler_constant_features(
    scaler, add_sample_weight, sparse_container, dtype, constant
):
    # A constant feature must be left untouched by non-centering scalers:
    # its scale is 1 and the transformed data equals the input.
    is_standard = isinstance(scaler, StandardScaler)
    if isinstance(scaler, RobustScaler) and add_sample_weight:
        pytest.skip(f"{scaler.__class__.__name__} does not yet support sample_weight")

    rng = np.random.RandomState(0)
    n_samples, n_features = 100, 1

    fit_params = {}
    if add_sample_weight:
        fit_params["sample_weight"] = rng.uniform(size=n_samples) * 2

    X_array = np.full(shape=(n_samples, n_features), fill_value=constant, dtype=dtype)
    if sparse_container is None:
        X = X_array
    else:
        X = sparse_container(X_array)
    X_scaled = scaler.fit(X, **fit_params).transform(X)

    if is_standard:
        # The variance info should be close to zero for constant features.
        assert_allclose(scaler.var_, np.zeros(X.shape[1]), atol=1e-7)

    # Constant features should not be scaled (scale of 1.):
    assert_allclose(scaler.scale_, np.ones(X.shape[1]))

    # The result must be a new object holding the same values.
    assert X_scaled is not X
    assert_allclose_dense_sparse(X_scaled, X)

    if is_standard and not add_sample_weight:
        # Also check consistency with the standard scale function.
        X_scaled_2 = scale(X, with_mean=scaler.with_mean)
        assert X_scaled_2 is not X  # the function interface copies as well
        assert_allclose_dense_sparse(X_scaled_2, X)
|
||
|
|
||
|
|
||
|
@pytest.mark.parametrize("n_samples", [10, 100, 10_000])
@pytest.mark.parametrize("average", [1e-10, 1, 1e10])
@pytest.mark.parametrize("dtype", [np.float32, np.float64])
@pytest.mark.parametrize("sparse_container", [None] + CSC_CONTAINERS + CSR_CONTAINERS)
def test_standard_scaler_near_constant_features(
    n_samples, sparse_container, average, dtype
):
    # Check that when the variance is too small (var << mean**2) the feature
    # is considered constant and not scaled.

    lowest_exp, highest_exp = -30, 19
    scales = np.array(
        [10**i for i in range(lowest_exp, highest_exp + 1)], dtype=dtype
    )

    # Build a dataset of known var = scales**2 and mean = average: the first
    # half of the rows sits at average + scales, the rest at average - scales.
    half = n_samples // 2
    X = np.empty((n_samples, scales.shape[0]), dtype=dtype)
    X[:half, :] = average + scales
    X[half:, :] = average - scales
    X_array = X if sparse_container is None else sparse_container(X)

    scaler = StandardScaler(with_mean=False).fit(X_array)

    # StandardScaler uses float64 accumulators even if the data has a float32
    # dtype.
    eps = np.finfo(np.float64).eps

    # if var < bound = N.eps.var + N².eps².mean², the feature is considered
    # constant and the scale_ attribute is set to 1.
    bounds = n_samples * eps * scales**2 + n_samples**2 * eps**2 * average**2
    within_bounds = scales**2 <= bounds

    # The smallest exponent must be low enough for some scales to fall below
    # the bound and therefore be detected as constant:
    assert np.any(within_bounds)

    # Such features must actually be treated as constant by the scaler:
    assert (scaler.var_[within_bounds] <= bounds[within_bounds]).all()
    assert_allclose(scaler.scale_[within_bounds], 1.0)

    # Depending on the dtype of X, some features might not actually be
    # representable as non constant for small scales (even if above the
    # precision bound of the float64 variance estimate). Such features should
    # be correctly detected as constants with 0 variance by StandardScaler.
    representable_diff = X[0, :] - X[-1, :] != 0
    collapsed = ~representable_diff
    assert_allclose(scaler.var_[collapsed], 0)
    assert_allclose(scaler.scale_[collapsed], 1)

    # The other features are scaled and scale_ is equal to sqrt(var_) assuming
    # that scales are large enough for average + scale and average - scale to
    # be distinct in X (depending on X's dtype).
    common_mask = (scales**2 > bounds) & representable_diff
    assert_allclose(scaler.scale_[common_mask], np.sqrt(scaler.var_)[common_mask])
|
||
|
|
||
|
|
||
|
def test_scale_1d():
    # `scale` must accept 1-d inputs, given either as a list or as an array.
    values = [1.0, 3.0, 5.0, 0.0]

    for X in (values, np.array(values)):
        standardized = scale(X)
        assert_array_almost_equal(standardized.mean(), 0.0)
        assert_array_almost_equal(standardized.std(), 1.0)
        # With both centering and scaling disabled the data is returned as is.
        untouched = scale(X, with_mean=False, with_std=False)
        assert_array_equal(untouched, X)
|
||
|
|
||
|
|
||
|
@skip_if_32bit
def test_standard_scaler_numerical_stability():
    # Test numerical stability of scaling.
    # np.log(1e-5) is used because its floating point representation was
    # empirically found to cause numerical problems with np.mean & np.std.
    tricky_value = np.log(1e-5)

    # With only 8 samples no warning is raised: that is too few to trigger
    # the problem in recent numpy.
    x = np.full(8, tricky_value, dtype=np.float64)
    with warnings.catch_warnings():
        warnings.simplefilter("error", UserWarning)
        scale(x)
    assert_array_almost_equal(scale(x), np.zeros(8))

    # With 2 more samples, the std computation runs into numerical issues:
    x = np.full(10, tricky_value, dtype=np.float64)
    warning_message = "standard deviation of the data is probably very close to 0"
    with pytest.warns(UserWarning, match=warning_message):
        x_scaled = scale(x)
    assert_array_almost_equal(x_scaled, np.zeros(10))

    # Tiny values must be scaled without any warning.
    x = np.full(10, 1e-100, dtype=np.float64)
    with warnings.catch_warnings():
        warnings.simplefilter("error", UserWarning)
        x_small_scaled = scale(x)
    assert_array_almost_equal(x_small_scaled, np.zeros(10))

    # Large values can cause (often recoverable) numerical stability issues:
    x_big = np.full(10, 1e100, dtype=np.float64)
    warning_message = "Dataset may contain too large values"
    with pytest.warns(UserWarning, match=warning_message):
        x_big_scaled = scale(x_big)
    assert_array_almost_equal(x_big_scaled, np.zeros(10))
    assert_array_almost_equal(x_big_scaled, x_small_scaled)

    # Centering alone must warn and recover in the same way.
    with pytest.warns(UserWarning, match=warning_message):
        x_big_centered = scale(x_big, with_std=False)
    assert_array_almost_equal(x_big_centered, np.zeros(10))
    assert_array_almost_equal(x_big_centered, x_small_scaled)
|
||
|
|
||
|
|
||
|
def test_scaler_2d_arrays():
    # Test scaling of a 2d array along the first axis, plus the copy semantics
    # of transform / inverse_transform / scale.
    rng = np.random.RandomState(0)
    n_samples, n_features = 4, 5
    X = rng.randn(n_samples, n_features)
    X[:, 0] = 0.0  # first feature is always zero

    zero_mean = [0.0] * n_features
    # The constant first feature keeps a std of 0; the others are unit std.
    expected_std = [0.0, 1.0, 1.0, 1.0, 1.0]

    scaler = StandardScaler()
    X_scaled = scaler.fit(X).transform(X, copy=True)
    assert not np.isnan(X_scaled).any()
    assert scaler.n_samples_seen_ == n_samples

    assert_array_almost_equal(X_scaled.mean(axis=0), zero_mean)
    assert_array_almost_equal(X_scaled.std(axis=0), expected_std)
    # Check that X has been copied
    assert X_scaled is not X

    # The inverse transform returns a new array equal to the input.
    X_scaled_back = scaler.inverse_transform(X_scaled)
    assert X_scaled_back is not X
    assert X_scaled_back is not X_scaled
    assert_array_almost_equal(X_scaled_back, X)

    # Function interface along axis=1, centering only.
    X_scaled = scale(X, axis=1, with_std=False)
    assert not np.isnan(X_scaled).any()
    assert_array_almost_equal(X_scaled.mean(axis=1), [0.0] * n_samples)

    # Function interface along axis=1, centering and scaling.
    X_scaled = scale(X, axis=1, with_std=True)
    assert not np.isnan(X_scaled).any()
    assert_array_almost_equal(X_scaled.mean(axis=1), [0.0] * n_samples)
    assert_array_almost_equal(X_scaled.std(axis=1), [1.0] * n_samples)
    # Check that the function returned a new object rather than X itself
    assert X_scaled is not X

    # copy=False must operate in place and return X itself.
    X_scaled = scaler.fit(X).transform(X, copy=False)
    assert not np.isnan(X_scaled).any()
    assert_array_almost_equal(X_scaled.mean(axis=0), zero_mean)
    assert_array_almost_equal(X_scaled.std(axis=0), expected_std)
    # Check that X has not been copied
    assert X_scaled is X

    X = rng.randn(n_samples, n_features)
    X[:, 0] = 1.0  # first feature is a constant, non zero feature
    scaler = StandardScaler()
    X_scaled = scaler.fit(X).transform(X, copy=True)
    assert not np.isnan(X_scaled).any()
    assert_array_almost_equal(X_scaled.mean(axis=0), zero_mean)
    assert_array_almost_equal(X_scaled.std(axis=0), expected_std)
    # Check that X has been copied
    assert X_scaled is not X
|
||
|
|
||
|
|
||
|
def test_scaler_float16_overflow():
    # Test that the scaler does not overflow on float16 numpy arrays.
    rng = np.random.RandomState(0)
    # float16 cannot represent values much above 65500. Every sample is at
    # least 5, so a sum over 200000 samples is at least 1,000,000, which would
    # overflow if it were accumulated in float16.
    X = rng.uniform(5, 10, [200000, 1]).astype(np.float16)

    # Turn any floating point overflow into a FloatingPointError.
    with np.errstate(over="raise"):
        scaler = StandardScaler().fit(X)
        X_scaled = scaler.transform(X)

    # Reference result computed from the same data in float64.
    reference = StandardScaler().fit_transform(X.astype(np.float64))

    # Overflow calculations may cause -inf, inf, or nan. Since there is no nan
    # input, all of the outputs should be finite. This may be redundant since a
    # FloatingPointError exception will be thrown on overflow above.
    assert np.isfinite(X_scaled).all()

    # float16 spacing between 4.0 and 8.0 is 2^-8 (about 0.004), so only
    # 2 decimals are compared to account for precision differences.
    assert_array_almost_equal(X_scaled, reference, decimal=2)
|
||
|
|
||
|
|
||
|
def test_handle_zeros_in_scale():
    # With copy=True the helper must leave its input untouched and return a
    # new array in which zero and near-zero scales are replaced by 1.
    raw = [0, 1e-16, 1, 2, 3]
    s1 = np.array(raw)
    s2 = _handle_zeros_in_scale(s1, copy=True)

    assert_allclose(s1, np.array(raw))
    assert_allclose(s2, np.array([1, 1, 1, 2, 3]))
|
||
|
|
||
|
|
||
|
def test_minmax_scaler_partial_fit():
    # Test that partial_fit run over many batches of various sizes gives the
    # same results as a single call to fit.
    X = X_2d
    n = X.shape[0]

    def assert_same_fit(scaler_batch, scaler_incr):
        # Compare every fitted attribute of the two scalers.
        assert_array_almost_equal(scaler_batch.data_min_, scaler_incr.data_min_)
        assert_array_almost_equal(scaler_batch.data_max_, scaler_incr.data_max_)
        assert scaler_batch.n_samples_seen_ == scaler_incr.n_samples_seen_
        assert_array_almost_equal(scaler_batch.data_range_, scaler_incr.data_range_)
        assert_array_almost_equal(scaler_batch.scale_, scaler_incr.scale_)
        assert_array_almost_equal(scaler_batch.min_, scaler_incr.min_)

    for chunk_size in [1, 2, 50, n, n + 42]:
        # Test the fitted attributes at the end of the process.
        # Batches are generated from `n`, the number of rows of the local X,
        # rather than from the unrelated module-level `n_samples`.
        scaler_batch = MinMaxScaler().fit(X)

        scaler_incr = MinMaxScaler()
        for batch in gen_batches(n, chunk_size):
            scaler_incr = scaler_incr.partial_fit(X[batch])

        assert_same_fit(scaler_batch, scaler_incr)

        # Test the fitted attributes after a single step
        batch0 = slice(0, chunk_size)
        scaler_batch = MinMaxScaler().fit(X[batch0])
        scaler_incr = MinMaxScaler().partial_fit(X[batch0])

        assert_same_fit(scaler_batch, scaler_incr)

        # Test the sample count after every partial fit, until the end
        scaler_incr = MinMaxScaler()  # Clean estimator
        for i, batch in enumerate(gen_batches(n, chunk_size)):
            scaler_incr = scaler_incr.partial_fit(X[batch])
            assert_correct_incr(
                i,
                batch_start=batch.start,
                batch_stop=batch.stop,
                n=n,
                chunk_size=chunk_size,
                n_samples_seen=scaler_incr.n_samples_seen_,
            )
|
||
|
|
||
|
|
||
|
def test_standard_scaler_partial_fit():
    # Test that partial_fit run over many batches of various sizes gives the
    # same results as a single call to fit.
    X = X_2d
    # Sizes are taken from the local X rather than from the module-level
    # `n_samples` / `n_features`, so the test only depends on its own data.
    n, n_cols = X.shape

    for chunk_size in [1, 2, 50, n, n + 42]:
        # Test mean at the end of the process
        scaler_batch = StandardScaler(with_std=False).fit(X)

        scaler_incr = StandardScaler(with_std=False)
        for batch in gen_batches(n, chunk_size):
            scaler_incr = scaler_incr.partial_fit(X[batch])
        assert_array_almost_equal(scaler_batch.mean_, scaler_incr.mean_)
        assert scaler_batch.var_ == scaler_incr.var_  # Nones
        assert scaler_batch.n_samples_seen_ == scaler_incr.n_samples_seen_

        # Test std after 1 step
        batch0 = slice(0, chunk_size)
        scaler_incr = StandardScaler().partial_fit(X[batch0])
        if chunk_size == 1:
            # A single sample has zero variance, so the scale falls back to 1.
            assert_array_almost_equal(
                np.zeros(n_cols, dtype=np.float64), scaler_incr.var_
            )
            assert_array_almost_equal(
                np.ones(n_cols, dtype=np.float64), scaler_incr.scale_
            )
        else:
            assert_array_almost_equal(np.var(X[batch0], axis=0), scaler_incr.var_)
            assert_array_almost_equal(
                np.std(X[batch0], axis=0), scaler_incr.scale_
            )  # no constants

        # Test the sample count after every partial fit, then the variance
        # once all batches have been consumed.
        scaler_batch = StandardScaler().fit(X)
        scaler_incr = StandardScaler()  # Clean estimator
        for i, batch in enumerate(gen_batches(n, chunk_size)):
            scaler_incr = scaler_incr.partial_fit(X[batch])
            assert_correct_incr(
                i,
                batch_start=batch.start,
                batch_stop=batch.stop,
                n=n,
                chunk_size=chunk_size,
                n_samples_seen=scaler_incr.n_samples_seen_,
            )

        assert_array_almost_equal(scaler_batch.var_, scaler_incr.var_)
        assert scaler_batch.n_samples_seen_ == scaler_incr.n_samples_seen_
|
||
|
|
||
|
|
||
|
@pytest.mark.parametrize("sparse_container", CSC_CONTAINERS + CSR_CONTAINERS)
def test_standard_scaler_partial_fit_numerical_stability(sparse_container):
    # Test whether the incremental computation introduces significant errors
    # for datasets with values of large magnitude.
    rng = np.random.RandomState(0)
    n_features = 2
    n_samples = 100
    offsets = rng.uniform(-1e15, 1e15, size=n_features)
    scales = rng.uniform(1e3, 1e6, size=n_features)
    X = rng.randn(n_samples, n_features) * scales + offsets

    # Dense input: one fit on everything versus one partial_fit per row.
    scaler_batch = StandardScaler().fit(X)
    scaler_incr = StandardScaler()
    for row in X:
        scaler_incr.partial_fit(row.reshape(1, n_features))

    # Regardless of absolute values, results must agree to 6 significant digits
    tol = 10 ** (-6)
    for attr in ("mean_", "var_", "scale_"):
        assert_allclose(
            getattr(scaler_incr, attr), getattr(scaler_batch, attr), rtol=tol
        )
    # NOTE Be aware that for much larger offsets std is very unstable (last
    # assert) while mean is OK.

    # Sparse input
    size = (100, 3)
    magnitude = 1e20
    X = sparse_container(rng.randint(0, 2, size).astype(np.float64) * magnitude)

    # with_mean=False is required with sparse input
    scaler = StandardScaler(with_mean=False).fit(X)
    scaler_incr = StandardScaler(with_mean=False)

    for row in X:
        if row.ndim == 1:
            # Sparse arrays can be 1D (in scipy 1.14 and later) while old
            # sparse matrix instances are always 2D.
            row = row.reshape(1, -1)
        scaler_incr.partial_fit(row)

    # Regardless of magnitude, results must agree to 6 significant digits
    tol = 10 ** (-6)
    assert scaler.mean_ is not None
    for attr in ("var_", "scale_"):
        assert_allclose(getattr(scaler_incr, attr), getattr(scaler, attr), rtol=tol)
|
||
|
|
||
|
|
||
|
@pytest.mark.parametrize("sample_weight", [True, None])
@pytest.mark.parametrize("sparse_container", CSC_CONTAINERS + CSR_CONTAINERS)
def test_partial_fit_sparse_input(sample_weight, sparse_container):
    # Check that a scaler with centering and scaling both disabled leaves
    # sparse data unchanged through transform and inverse_transform.
    X = sparse_container(np.array([[1.0], [0.0], [0.0], [5.0]]))
    expected = X.toarray()

    # `True` requests random weights, drawn from the module-level rng.
    weights = rng.rand(X.shape[0]) if sample_weight else sample_weight

    null_transform = StandardScaler(with_mean=False, with_std=False, copy=True)
    null_transform.partial_fit(X, sample_weight=weights)

    X_null = null_transform.transform(X)
    assert_array_equal(X_null.toarray(), expected)

    X_orig = null_transform.inverse_transform(X_null)
    assert_array_equal(X_orig.toarray(), X_null.toarray())
    assert_array_equal(X_orig.toarray(), expected)
|
||
|
|
||
|
|
||
|
@pytest.mark.parametrize("sample_weight", [True, None])
def test_standard_scaler_trasform_with_partial_fit(sample_weight):
    # Check some postconditions after applying partial_fit and transform,
    # feeding the scaler one sample at a time.
    X = X_2d[:100, :]

    # `True` requests random weights, drawn from the module-level rng.
    weights = rng.rand(X.shape[0]) if sample_weight else None

    zero = np.zeros(X.shape[1])
    epsilon = np.finfo(float).eps

    scaler_incr = StandardScaler()
    for i, batch in enumerate(gen_batches(X.shape[0], 1)):
        n_seen = i + 1
        X_sofar = X[:n_seen, :]
        chunks_copy = X_sofar.copy()

        if weights is None:
            full_fit_kwargs, step_kwargs = {}, {}
        else:
            full_fit_kwargs = {"sample_weight": weights[:n_seen]}
            step_kwargs = {"sample_weight": weights[batch]}

        # Reference: a fresh scaler fitted on everything seen so far.
        scaled_batch = StandardScaler().fit_transform(X_sofar, **full_fit_kwargs)
        scaler_incr = scaler_incr.partial_fit(X[batch], **step_kwargs)
        scaled_incr = scaler_incr.transform(X_sofar)

        assert_array_almost_equal(scaled_batch, scaled_incr)
        assert_array_almost_equal(X_sofar, chunks_copy)  # No change
        right_input = scaler_incr.inverse_transform(scaled_incr)
        assert_array_almost_equal(X_sofar, right_input)

        # Adding epsilon turns the strict comparison into "less or equal".
        assert_array_less(zero, scaler_incr.var_ + epsilon)
        assert_array_less(zero, scaler_incr.scale_ + epsilon)

        if weights is None:
            # (i+1) because the Scaler has been already fitted
            assert n_seen == scaler_incr.n_samples_seen_
        else:
            # With weights, the count is compared to the sum of the weights.
            assert np.sum(weights[:n_seen]) == pytest.approx(
                scaler_incr.n_samples_seen_
            )
|
||
|
|
||
|
|
||
|
def test_standard_check_array_of_inverse_transform():
    # Check that StandardScaler.inverse_transform accepts an integer array,
    # i.e. that it converts its input to float.
    rows = [
        [1, 1, 1, 0, 1, 0],
        [1, 1, 1, 0, 1, 0],
        [0, 8, 0, 1, 0, 0],
        [1, 4, 1, 1, 0, 0],
        [0, 1, 0, 0, 1, 0],
        [0, 4, 0, 1, 0, 1],
    ]
    x = np.array(rows, dtype=np.int32)

    scaler = StandardScaler().fit(x)

    # The input of inverse_transform should be converted to a float array.
    # If not, X *= self.scale_ will fail.
    scaler.inverse_transform(x)
|
||
|
|
||
|
|
||
|
@pytest.mark.parametrize(
    "array_namespace, device, dtype_name", yield_namespace_device_dtype_combinations()
)
@pytest.mark.parametrize(
    "check",
    [check_array_api_input_and_values],
    ids=_get_check_estimator_ids,
)
@pytest.mark.parametrize(
    "estimator",
    [
        MaxAbsScaler(),
        MinMaxScaler(),
        KernelCenterer(),
        Normalizer(norm="l1"),
        Normalizer(norm="l2"),
        Normalizer(norm="max"),
    ],
    ids=_get_check_estimator_ids,
)
def test_scaler_array_api_compliance(
    estimator, check, array_namespace, device, dtype_name
):
    # Run the array API check on each listed estimator, for every
    # namespace / device / dtype combination. The check receives the
    # estimator's class name as its first argument.
    check(
        estimator.__class__.__name__,
        estimator,
        array_namespace,
        device=device,
        dtype_name=dtype_name,
    )
|
||
|
|
||
|
|
||
|
def test_min_max_scaler_iris():
    # Check MinMaxScaler on the iris data for several feature ranges.
    X = iris.data

    def check_range(scaler, low, high):
        # Every feature must span exactly [low, high] after scaling, and the
        # inverse transform must recover the original data.
        X_trans = scaler.fit_transform(X)
        assert_array_almost_equal(X_trans.min(axis=0), low)
        assert_array_almost_equal(X_trans.max(axis=0), high)
        X_trans_inv = scaler.inverse_transform(X_trans)
        assert_array_almost_equal(X, X_trans_inv)

    # default params
    check_range(MinMaxScaler(), 0, 1)

    # not default params: min=1, max=2
    check_range(MinMaxScaler(feature_range=(1, 2)), 1, 2)

    # min=-.5, max=.6
    check_range(MinMaxScaler(feature_range=(-0.5, 0.6)), -0.5, 0.6)

    # raises on invalid range
    invalid = MinMaxScaler(feature_range=(2, 1))
    with pytest.raises(ValueError):
        invalid.fit(X)
|
||
|
|
||
|
|
||
|
def test_min_max_scaler_zero_variance_features():
    # Check min max scaler on toy data with zero variance features: the first
    # two columns of X are constant.
    X = [[0.0, 1.0, +0.5], [0.0, 1.0, -0.1], [0.0, 1.0, +1.1]]
    X_new = [[+0.0, 2.0, 0.5], [-1.0, 1.0, 0.0], [+0.0, 1.0, 1.5]]

    X_expected_0_1 = [[0.0, 0.0, 0.5], [0.0, 0.0, 0.0], [0.0, 0.0, 1.0]]
    X_expected_1_2 = [[1.0, 1.0, 1.5], [1.0, 1.0, 1.0], [1.0, 1.0, 2.0]]

    # default params
    default_scaler = MinMaxScaler()
    X_trans = default_scaler.fit_transform(X)
    assert_array_almost_equal(X_trans, X_expected_0_1)
    assert_array_almost_equal(X, default_scaler.inverse_transform(X_trans))

    # unseen data transformed with the already fitted scaler
    X_expected_0_1_new = [[+0.0, 1.0, 0.500], [-1.0, 0.0, 0.083], [+0.0, 0.0, 1.333]]
    assert_array_almost_equal(
        default_scaler.transform(X_new), X_expected_0_1_new, decimal=2
    )

    # not default params
    shifted_scaler = MinMaxScaler(feature_range=(1, 2))
    assert_array_almost_equal(shifted_scaler.fit_transform(X), X_expected_1_2)

    # function interface
    assert_array_almost_equal(minmax_scale(X), X_expected_0_1)
    assert_array_almost_equal(minmax_scale(X, feature_range=(1, 2)), X_expected_1_2)
|
||
|
|
||
|
|
||
|
def test_minmax_scale_axis1():
    # With axis=1 every row (sample) must be rescaled to span [0, 1].
    X_trans = minmax_scale(iris.data, axis=1)
    row_min = np.min(X_trans, axis=1)
    row_max = np.max(X_trans, axis=1)
    assert_array_almost_equal(row_min, 0)
    assert_array_almost_equal(row_max, 1)
|
||
|
|
||
|
|
||
|
def test_min_max_scaler_1d():
    """Check MinMaxScaler on datasets that span a single row or column.

    Each dataset is exercised both as an array and as a nested list.
    """
    # The list variant of the single-column data is derived from ``X_1col``
    # so that all four combinations (row/column x array/list) are covered;
    # listing ``X_list_1row`` twice would leave the column-as-list case
    # untested.
    for X in [X_1row, X_1col, X_list_1row, X_1col.tolist()]:
        scaler = MinMaxScaler(copy=True)
        X_scaled = scaler.fit(X).transform(X)

        if isinstance(X, list):
            X = np.array(X)  # cast only after scaling done

        if _check_dim_1axis(X) == 1:
            # A single sample: both extrema are expected to be zero.
            assert_array_almost_equal(X_scaled.min(axis=0), np.zeros(n_features))
            assert_array_almost_equal(X_scaled.max(axis=0), np.zeros(n_features))
        else:
            assert_array_almost_equal(X_scaled.min(axis=0), 0.0)
            assert_array_almost_equal(X_scaled.max(axis=0), 1.0)
        assert scaler.n_samples_seen_ == X.shape[0]

        # check inverse transform
        X_scaled_back = scaler.inverse_transform(X_scaled)
        assert_array_almost_equal(X_scaled_back, X)

    # Constant feature
    X = np.ones((5, 1))
    scaler = MinMaxScaler()
    X_scaled = scaler.fit(X).transform(X)
    assert X_scaled.min() >= 0.0
    assert X_scaled.max() <= 1.0
    assert scaler.n_samples_seen_ == X.shape[0]

    # Function interface on a genuinely 1d array, compared with the
    # closed-form min-max formula.
    X_1d = X_1row.ravel()
    min_ = X_1d.min()
    max_ = X_1d.max()
    assert_array_almost_equal(
        (X_1d - min_) / (max_ - min_), minmax_scale(X_1d, copy=True)
    )
|
||
|
|
||
|
|
||
|
@pytest.mark.parametrize("sample_weight", [True, None])
@pytest.mark.parametrize("sparse_container", CSC_CONTAINERS + CSR_CONTAINERS)
def test_scaler_without_centering(sample_weight, sparse_container):
    """Check StandardScaler(with_mean=False) on matching dense/sparse data."""
    rng = np.random.RandomState(42)
    X_dense = rng.randn(4, 5)
    X_dense[:, 0] = 0.0  # the first feature is constant and equal to zero
    X_sp = sparse_container(X_dense)

    # ``True`` is only a flag: replace it with actual random weights.
    if sample_weight:
        sample_weight = rng.rand(X_dense.shape[0])

    # With default parameters, fitting on sparse input must fail.
    with pytest.raises(ValueError):
        StandardScaler().fit(X_sp)

    dense_scaler = StandardScaler(with_mean=False)
    dense_scaler.fit(X_dense, sample_weight=sample_weight)
    dense_scaled = dense_scaler.transform(X_dense, copy=True)
    assert not np.any(np.isnan(dense_scaled))

    sparse_scaler = StandardScaler(with_mean=False)
    sparse_scaler.fit(X_sp, sample_weight=sample_weight)
    sparse_scaled = sparse_scaler.transform(X_sp, copy=True)
    assert not np.any(np.isnan(sparse_scaled.data))

    # Both scalers must have learned the same statistics.
    assert_array_almost_equal(dense_scaler.mean_, sparse_scaler.mean_)
    assert_array_almost_equal(dense_scaler.var_, sparse_scaler.var_)
    assert_array_almost_equal(dense_scaler.scale_, sparse_scaler.scale_)
    assert_array_almost_equal(
        dense_scaler.n_samples_seen_, sparse_scaler.n_samples_seen_
    )

    # The hard-coded statistics only apply to the unweighted fit.
    if sample_weight is None:
        assert_array_almost_equal(
            dense_scaled.mean(axis=0), [0.0, -0.01, 2.24, -0.35, -0.78], 2
        )
        assert_array_almost_equal(
            dense_scaled.std(axis=0), [0.0, 1.0, 1.0, 1.0, 1.0]
        )

    sparse_scaled_mean, sparse_scaled_var = mean_variance_axis(sparse_scaled, 0)
    assert_array_almost_equal(sparse_scaled_mean, dense_scaled.mean(axis=0))
    assert_array_almost_equal(sparse_scaled_var, dense_scaled.var(axis=0))

    # transform(copy=True) must hand back new objects
    assert dense_scaled is not X_dense
    assert sparse_scaled is not X_sp

    dense_restored = dense_scaler.inverse_transform(dense_scaled)
    assert dense_restored is not X_dense
    assert dense_restored is not dense_scaled
    assert_array_almost_equal(dense_restored, X_dense)

    sparse_restored = sparse_scaler.inverse_transform(sparse_scaled)
    assert sparse_restored is not X_sp
    assert sparse_restored is not sparse_scaled
    assert_array_almost_equal(sparse_restored.toarray(), X_dense)

    if sparse_container in CSR_CONTAINERS:
        # With both centering and scaling disabled the data is left untouched.
        identity = StandardScaler(with_mean=False, with_std=False, copy=True)
        X_identity = identity.fit_transform(X_sp)
        assert_array_equal(X_identity.data, X_sp.data)
        X_roundtrip = identity.inverse_transform(X_identity)
        assert_array_equal(X_roundtrip.data, X_sp.data)
|
||
|
|
||
|
|
||
|
@pytest.mark.parametrize("with_mean", [True, False])
@pytest.mark.parametrize("with_std", [True, False])
@pytest.mark.parametrize("sparse_container", [None] + CSC_CONTAINERS + CSR_CONTAINERS)
def test_scaler_n_samples_seen_with_nan(with_mean, with_std, sparse_container):
    """Check the per-feature n_samples_seen_ when the input contains NaN."""
    data = np.array(
        [[0, 1, 3], [np.nan, 6, 10], [5, 4, np.nan], [8, 0, np.nan]],
        dtype=np.float64,
    )
    X = data if sparse_container is None else sparse_container(data)

    if with_mean and sparse.issparse(X):
        pytest.skip("'with_mean=True' cannot be used with sparse matrix.")

    scaler = StandardScaler(with_mean=with_mean, with_std=with_std)
    scaler.fit(X)

    # One count per column: the number of non-NaN entries in that column.
    expected_counts = np.array([3, 4, 2])
    assert_array_equal(scaler.n_samples_seen_, expected_counts)
|
||
|
|
||
|
|
||
|
def _check_identity_scalers_attributes(scaler_1, scaler_2):
    """Assert that both scalers hold no statistics and saw as many samples.

    ``mean_``, ``var_`` and ``scale_`` must be ``None`` on both scalers, and
    their ``n_samples_seen_`` values must compare equal.
    """
    for attribute in ("mean_", "var_", "scale_"):
        assert getattr(scaler_1, attribute) is None
        assert getattr(scaler_2, attribute) is None
    assert scaler_1.n_samples_seen_ == scaler_2.n_samples_seen_
|
||
|
|
||
|
|
||
|
@pytest.mark.parametrize("sparse_container", CSC_CONTAINERS + CSR_CONTAINERS)
def test_scaler_return_identity(sparse_container):
    """Check StandardScaler is the identity if with_mean and with_std are False."""
    X_dense = np.array([[0, 1, 3], [5, 6, 0], [8, 0, 10]], dtype=np.float64)
    X_sparse = sparse_container(X_dense)

    transformer_dense = StandardScaler(with_mean=False, with_std=False)
    assert_allclose(transformer_dense.fit_transform(X_dense), X_dense)

    transformer_sparse = clone(transformer_dense)
    assert_allclose_dense_sparse(
        transformer_sparse.fit_transform(X_sparse), X_sparse
    )

    _check_identity_scalers_attributes(transformer_dense, transformer_sparse)

    # The attributes must stay consistent after an incremental fit and after
    # a full refit.
    for method_name in ("partial_fit", "fit"):
        getattr(transformer_dense, method_name)(X_dense)
        getattr(transformer_sparse, method_name)(X_sparse)
        _check_identity_scalers_attributes(transformer_dense, transformer_sparse)
|
||
|
|
||
|
|
||
|
@pytest.mark.parametrize("sparse_container", CSC_CONTAINERS + CSR_CONTAINERS)
def test_scaler_int(sparse_container):
    """Check StandardScaler(with_mean=False) on integer dense and sparse input.

    The scaled output is checked for NaN values, for consistency between the
    dense and sparse code paths, and for a correct inverse transform.
    """
    rng = np.random.RandomState(42)
    X = rng.randint(20, size=(4, 5))
    X[:, 0] = 0  # first feature is always of zero
    X_sparse = sparse_container(X)

    # Any warning emitted while fitting/transforming integer data is recorded
    # rather than shown.
    with warnings.catch_warnings(record=True):
        scaler = StandardScaler(with_mean=False).fit(X)
        X_scaled = scaler.transform(X, copy=True)
    assert not np.any(np.isnan(X_scaled))

    with warnings.catch_warnings(record=True):
        scaler_sparse = StandardScaler(with_mean=False).fit(X_sparse)
        X_sparse_scaled = scaler_sparse.transform(X_sparse, copy=True)
    assert not np.any(np.isnan(X_sparse_scaled.data))

    assert_array_almost_equal(scaler.mean_, scaler_sparse.mean_)
    assert_array_almost_equal(scaler.var_, scaler_sparse.var_)
    assert_array_almost_equal(scaler.scale_, scaler_sparse.scale_)

    assert_array_almost_equal(
        X_scaled.mean(axis=0), [0.0, 1.109, 1.856, 21.0, 1.559], 2
    )
    assert_array_almost_equal(X_scaled.std(axis=0), [0.0, 1.0, 1.0, 1.0, 1.0])

    # mean_variance_axis yields the variance (not the standard deviation), so
    # compare it with the dense variance.
    X_sparse_scaled_mean, X_sparse_scaled_var = mean_variance_axis(
        X_sparse_scaled.astype(float), 0
    )
    assert_array_almost_equal(X_sparse_scaled_mean, X_scaled.mean(axis=0))
    assert_array_almost_equal(X_sparse_scaled_var, X_scaled.var(axis=0))

    # Check that X has not been modified (copy)
    assert X_scaled is not X
    assert X_sparse_scaled is not X_sparse

    X_scaled_back = scaler.inverse_transform(X_scaled)
    assert X_scaled_back is not X
    assert X_scaled_back is not X_scaled
    assert_array_almost_equal(X_scaled_back, X)

    X_sparse_scaled_back = scaler_sparse.inverse_transform(X_sparse_scaled)
    assert X_sparse_scaled_back is not X_sparse
    assert X_sparse_scaled_back is not X_sparse_scaled
    assert_array_almost_equal(X_sparse_scaled_back.toarray(), X)

    if sparse_container in CSR_CONTAINERS:
        # With both centering and scaling disabled the data is left untouched.
        null_transform = StandardScaler(with_mean=False, with_std=False, copy=True)
        with warnings.catch_warnings(record=True):
            X_null = null_transform.fit_transform(X_sparse)
        assert_array_equal(X_null.data, X_sparse.data)
        X_orig = null_transform.inverse_transform(X_null)
        assert_array_equal(X_orig.data, X_sparse.data)
|
||
|
|
||
|
|
||
|
@pytest.mark.parametrize("sparse_container", CSR_CONTAINERS + CSC_CONTAINERS)
def test_scaler_without_copy(sparse_container):
    """Check that StandardScaler.fit leaves its input unchanged with copy=False."""
    rng = np.random.RandomState(42)
    X_dense = rng.randn(4, 5)
    X_dense[:, 0] = 0.0  # the first feature is constant and equal to zero
    X_sp = sparse_container(X_dense)

    # Dense input: snapshot, fit, compare.
    dense_snapshot = X_dense.copy()
    StandardScaler(copy=False).fit(X_dense)
    assert_array_equal(X_dense, dense_snapshot)

    # Sparse input: same procedure, without centering.
    sparse_snapshot = X_sp.copy()
    StandardScaler(with_mean=False, copy=False).fit(X_sp)
    assert_array_equal(X_sp.toarray(), sparse_snapshot.toarray())
|
||
|
|
||
|
|
||
|
@pytest.mark.parametrize("sparse_container", CSR_CONTAINERS + CSC_CONTAINERS)
def test_scale_sparse_with_mean_raise_exception(sparse_container):
    """Check that centering sparse data raises a ValueError on every entry point."""
    X_dense = np.random.RandomState(42).randn(4, 5)
    X_sp = sparse_container(X_dense)

    # Direct calls on sparse data: function interface, then estimator fit.
    with pytest.raises(ValueError):
        scale(X_sp, with_mean=True)
    with pytest.raises(ValueError):
        StandardScaler(with_mean=True).fit(X_sp)

    # A scaler fitted on a dense array must also reject sparse input in
    # transform ...
    fitted = StandardScaler(with_mean=True).fit(X_dense)
    with pytest.raises(ValueError):
        fitted.transform(X_sp)

    # ... and in inverse_transform.
    transformed_sp = sparse_container(fitted.transform(X_dense))
    with pytest.raises(ValueError):
        fitted.inverse_transform(transformed_sp)
|
||
|
|
||
|
|
||
|
def test_scale_input_finiteness_validation():
    """Check that scale raises a ValueError on non-finite input."""
    X_with_inf = [[np.inf, 5, 6, 7, 8]]
    expected_message = "Input contains infinity or a value too large"
    with pytest.raises(ValueError, match=expected_message):
        scale(X_with_inf)
|
||
|
|
||
|
|
||
|
def test_robust_scaler_error_sparse():
    """Check that RobustScaler with centering refuses to fit sparse data."""
    X_sparse = sparse.rand(1000, 10)
    centering_scaler = RobustScaler(with_centering=True)
    with pytest.raises(ValueError, match="Cannot center sparse matrices"):
        centering_scaler.fit(X_sparse)
|
||
|
|
||
|
|
||
|
@pytest.mark.parametrize("with_centering", [True, False])
@pytest.mark.parametrize("with_scaling", [True, False])
@pytest.mark.parametrize("X", [np.random.randn(10, 3), sparse.rand(10, 3, density=0.5)])
def test_robust_scaler_attributes(X, with_centering, with_scaling):
    """Check the type of the fitted center_ and scale_ attributes.

    An attribute is expected to be an ndarray when the matching option is
    enabled and ``None`` otherwise.
    """
    if with_centering and sparse.issparse(X):
        pytest.skip("RobustScaler cannot center sparse matrix")

    scaler = RobustScaler(with_centering=with_centering, with_scaling=with_scaling)
    scaler.fit(X)

    fitted_by_option = (
        (with_centering, scaler.center_),
        (with_scaling, scaler.scale_),
    )
    for enabled, fitted_value in fitted_by_option:
        if enabled:
            assert isinstance(fitted_value, np.ndarray)
        else:
            assert fitted_value is None
|
||
|
|
||
|
|
||
|
@pytest.mark.parametrize("csr_container", CSR_CONTAINERS)
def test_robust_scaler_col_zero_sparse(csr_container):
    """Check RobustScaler when a sparse column holds no materialized data.

    The scale learned for the all-zero column must be 1, so that the column
    comes out of ``transform`` unchanged.
    """
    # Seeded generator (instead of the global numpy state) so that the test
    # data is reproducible from run to run.
    rng = np.random.RandomState(0)
    X = rng.randn(10, 5)
    X[:, 0] = 0
    X = csr_container(X)

    scaler = RobustScaler(with_centering=False)
    scaler.fit(X)
    assert scaler.scale_[0] == pytest.approx(1)

    X_trans = scaler.transform(X)
    assert_allclose(X[:, [0]].toarray(), X_trans[:, [0]].toarray())
|
||
|
|
||
|
|
||
|
def test_robust_scaler_2d_arrays():
    """Check robust scaling of a 2d array along the first axis."""
    data = np.random.RandomState(0).randn(4, 5)
    data[:, 0] = 0.0  # the first feature is constant and equal to zero

    robust = RobustScaler()
    scaled = robust.fit(data).transform(data)

    # Every column is centred on its median; the constant column keeps a
    # zero spread.
    column_medians = np.median(scaled, axis=0)
    assert_array_almost_equal(column_medians, 5 * [0.0])
    assert_array_almost_equal(scaled.std(axis=0)[0], 0)
|
||
|
|
||
|
|
||
|
@pytest.mark.parametrize("density", [0, 0.05, 0.1, 0.5, 1])
@pytest.mark.parametrize("strictly_signed", ["positive", "negative", "zeros", None])
def test_robust_scaler_equivalence_dense_sparse(density, strictly_signed):
    """Check that RobustScaler learns the same scale from dense and sparse data.

    ``strictly_signed`` selects how the stored sparse values are altered
    before fitting: forced positive, forced negative, replaced by explicit
    zeros, or left as generated (``None``).
    """
    # Seeded so that every run exercises the same matrices.
    X_sparse = sparse.rand(1000, 5, density=density, random_state=0).tocsc()
    if strictly_signed == "positive":
        X_sparse.data = np.abs(X_sparse.data)
    elif strictly_signed == "negative":
        X_sparse.data = -np.abs(X_sparse.data)
    elif strictly_signed == "zeros":
        X_sparse.data = np.zeros(X_sparse.data.shape, dtype=np.float64)
    X_dense = X_sparse.toarray()

    scaler_sparse = RobustScaler(with_centering=False)
    scaler_dense = RobustScaler(with_centering=False)

    scaler_sparse.fit(X_sparse)
    scaler_dense.fit(X_dense)

    assert_allclose(scaler_sparse.scale_, scaler_dense.scale_)
|
||
|
|
||
|
|
||
|
@pytest.mark.parametrize("csr_container", CSR_CONTAINERS)
def test_robust_scaler_transform_one_row_csr(csr_container):
    """Check RobustScaler when transforming a CSR matrix made of one row."""
    X_fit = np.random.RandomState(0).randn(4, 5)
    one_row = np.array([[0.1, 1.0, 2.0, 0.0, -1.0]])

    robust = RobustScaler(with_centering=False).fit(X_fit)

    # Without centering, transforming amounts to dividing by the scale.
    transformed = robust.transform(csr_container(one_row))
    expected = one_row / robust.scale_
    assert_array_almost_equal(transformed.toarray(), expected)

    restored = robust.inverse_transform(transformed)
    assert_array_almost_equal(one_row, restored.toarray())
|
||
|
|
||
|
|
||
|
def test_robust_scaler_iris():
    """Check RobustScaler with default parameters on the iris dataset."""
    data = iris.data
    robust = RobustScaler()
    scaled = robust.fit_transform(data)

    # Each feature ends up with a zero median ...
    assert_array_almost_equal(np.median(scaled, axis=0), 0)
    assert_array_almost_equal(data, robust.inverse_transform(scaled))

    # ... and a unit interquartile range.
    lower, upper = np.percentile(scaled, q=(25, 75), axis=0)
    assert_array_almost_equal(upper - lower, 1)
|
||
|
|
||
|
|
||
|
def test_robust_scaler_iris_quantiles():
    """Check RobustScaler with quantile_range=(10, 90) on the iris dataset."""
    data = iris.data
    robust = RobustScaler(quantile_range=(10, 90))
    scaled = robust.fit_transform(data)

    assert_array_almost_equal(np.median(scaled, axis=0), 0)
    assert_array_almost_equal(data, robust.inverse_transform(scaled))

    # The spread between the requested percentiles is normalised to one.
    lower, upper = np.percentile(scaled, q=(10, 90), axis=0)
    assert_array_almost_equal(upper - lower, 1)
|
||
|
|
||
|
|
||
|
@pytest.mark.parametrize("csc_container", CSC_CONTAINERS)
def test_quantile_transform_iris(csc_container):
    """Check the QuantileTransformer round trip on the iris dataset."""
    data = iris.data

    # uniform output distribution
    qt = QuantileTransformer(n_quantiles=30)
    mapped = qt.fit_transform(data)
    assert_array_almost_equal(data, qt.inverse_transform(mapped))

    # normal output distribution
    qt = QuantileTransformer(n_quantiles=30, output_distribution="normal")
    mapped = qt.fit_transform(data)
    assert_array_almost_equal(data, qt.inverse_transform(mapped))

    # The inverse must also work on a sparse matrix holding negative values,
    # which is what the transformed iris data looks like here.
    data_sparse = csc_container(data)
    mapped_sparse = qt.fit_transform(data_sparse)
    restored_sparse = qt.inverse_transform(mapped_sparse)
    assert_array_almost_equal(data_sparse.toarray(), restored_sparse.toarray())
|
||
|
|
||
|
|
||
|
@pytest.mark.parametrize("csc_container", CSC_CONTAINERS)
def test_quantile_transform_check_error(csc_container):
    """Check the errors and warnings raised by QuantileTransformer."""
    X_dense = np.transpose(
        [
            [0, 25, 50, 0, 0, 0, 75, 0, 0, 100],
            [2, 4, 0, 0, 6, 8, 0, 10, 0, 0],
            [0, 0, 2.6, 4.1, 0, 0, 2.3, 0, 9.5, 0.1],
        ]
    )
    # Same data, except for one negative entry in the second feature.
    X_neg_dense = X_dense.copy()
    X_neg_dense[0, 1] = -2

    X = csc_container(X_dense)
    X_neg = csc_container(X_neg_dense)

    # Default n_quantiles together with a smaller subsample.
    too_many_quantiles_msg = (
        "The number of quantiles cannot be greater than "
        "the number of samples used. Got 1000 quantiles "
        "and 10 samples."
    )
    with pytest.raises(ValueError, match=too_many_quantiles_msg):
        QuantileTransformer(subsample=10).fit(X)

    # Negative sparse input is rejected by fit and by transform.
    negative_msg = "QuantileTransformer only accepts non-negative sparse matrices."
    transformer = QuantileTransformer(n_quantiles=10)
    with pytest.raises(ValueError, match=negative_msg):
        transformer.fit(X_neg)
    transformer.fit(X)
    with pytest.raises(ValueError, match=negative_msg):
        transformer.transform(X_neg)

    # The number of features must match what was seen during fit.
    X_bad_feat = np.transpose(
        [[0, 25, 50, 0, 0, 0, 75, 0, 0, 100], [0, 0, 2.6, 4.1, 0, 0, 2.3, 0, 9.5, 0.1]]
    )
    n_features_msg = (
        "X has 2 features, but QuantileTransformer is expecting 3 features as input."
    )
    with pytest.raises(ValueError, match=n_features_msg):
        transformer.inverse_transform(X_bad_feat)

    # A scalar input must raise an error.
    transformer = QuantileTransformer(n_quantiles=10).fit(X)
    with pytest.raises(ValueError, match="Expected 2D array, got scalar array instead"):
        transformer.transform(10)

    # A single warning is raised if n_quantiles > n_samples, and n_quantiles_
    # then equals the number of samples.
    transformer = QuantileTransformer(n_quantiles=100)
    with pytest.warns(UserWarning, match="n_quantiles is set to n_samples") as record:
        transformer.fit(X)
    assert len(record) == 1
    assert transformer.n_quantiles_ == X.shape[0]
|
||
|
|
||
|
|
||
|
@pytest.mark.parametrize("csc_container", CSC_CONTAINERS)
def test_quantile_transform_sparse_ignore_zeros(csc_container):
    """Check QuantileTransformer with ignore_implicit_zeros=True."""
    dense = np.array([[0, 1], [0, 0], [0, 2], [0, 2], [0, 1]])
    sparse_input = csc_container(dense)
    qt = QuantileTransformer(ignore_implicit_zeros=True, n_quantiles=5)

    # Fitting dense data with this option must raise a warning.
    warning_message = (
        "'ignore_implicit_zeros' takes effect"
        " only with sparse matrix. This parameter has no"
        " effect."
    )
    with pytest.warns(UserWarning, match=warning_message):
        qt.fit(dense)

    expected = np.array([[0, 0], [0, 0], [0, 1], [0, 1], [0, 0]])
    mapped = qt.fit_transform(sparse_input)
    assert_almost_equal(expected, mapped.toarray())

    # consider the case where sparse entries are missing values and user-given
    # zeros are to be considered
    values = np.array([0, 0, 1, 0, 2, 2, 1, 0, 1, 2, 0])
    cols = np.array([0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1])
    rows = np.array([0, 4, 0, 1, 2, 3, 4, 5, 6, 7, 8])
    sparse_input = csc_container((values, (rows, cols)))
    mapped = qt.fit_transform(sparse_input)
    expected = np.array(
        [
            [0.0, 0.5],
            [0.0, 0.0],
            [0.0, 1.0],
            [0.0, 1.0],
            [0.0, 0.5],
            [0.0, 0.0],
            [0.0, 0.5],
            [0.0, 1.0],
            [0.0, 0.0],
        ]
    )
    assert_almost_equal(expected, mapped.toarray())

    # Data with explicit negative values and explicit zeros.
    values = np.array([-1, -1, 1, 0, 0, 0, 1, -1, 1])
    cols = np.array([0, 0, 1, 1, 1, 1, 1, 1, 1])
    rows = np.array([0, 4, 0, 1, 2, 3, 4, 5, 6])
    sparse_input = csc_container((values, (rows, cols)))
    expected = np.array(
        [[0, 1], [0, 0.375], [0, 0.375], [0, 0.375], [0, 1], [0, 0], [0, 1]]
    )

    # The same expectations hold for a fresh transformer and for one used in
    # conjunction with subsampling.
    candidates = (
        QuantileTransformer(ignore_implicit_zeros=True, n_quantiles=5),
        QuantileTransformer(
            ignore_implicit_zeros=True, n_quantiles=5, subsample=8, random_state=0
        ),
    )
    for candidate in candidates:
        mapped = candidate.fit_transform(sparse_input)
        assert_almost_equal(expected, mapped.toarray())
        assert_almost_equal(
            sparse_input.toarray(), candidate.inverse_transform(mapped).toarray()
        )
|
||
|
|
||
|
|
||
|
def test_quantile_transform_dense_toy():
    """Check QuantileTransformer on a small dense dataset."""
    data = np.array(
        [[0, 2, 2.6], [25, 4, 4.1], [50, 6, 2.3], [75, 8, 9.5], [100, 10, 0.1]]
    )

    qt = QuantileTransformer(n_quantiles=5)
    qt.fit(data)

    # With a uniform output, the entries of each column are mapped to equally
    # spaced values between 0 and 1.
    mapped = qt.fit_transform(data)
    evenly_spaced = np.linspace(0, 1, num=5)
    expected_sorted = np.tile(evenly_spaced, (3, 1)).T
    assert_almost_equal(np.sort(mapped, axis=0), expected_sorted)

    # Values outside the fitted range are mapped to the bounds 0 and 1.
    out_of_range = np.array(
        [
            [-1, 1, 0],
            [101, 11, 10],
        ]
    )
    expected_bounds = np.array(
        [
            [0, 0, 0],
            [1, 1, 1],
        ]
    )
    assert_array_almost_equal(qt.transform(out_of_range), expected_bounds)

    restored = qt.inverse_transform(mapped)
    assert_array_almost_equal(data, restored)
|
||
|
|
||
|
|
||
|
def test_quantile_transform_subsampling():
    """Check that subsampling the input yields consistent results.

    The computed quantiles must be almost mapped to a [0, 1] vector whose
    values are equally spaced: the infinity norm of the difference is checked
    to be smaller than a given threshold. This is repeated for 5 different
    random states, on dense and then on sparse input.
    """
    n_samples = 1000000
    n_quantiles = 1000
    ROUND = 5

    # dense support
    # The data comes from a seeded generator so that the test is reproducible,
    # like the sparse data below which is generated with random_state=0.
    rng = np.random.RandomState(0)
    X = np.sort(rng.random_sample((n_samples, 1)), axis=0)
    inf_norm_arr = []
    for random_state in range(ROUND):
        transformer = QuantileTransformer(
            random_state=random_state,
            n_quantiles=n_quantiles,
            subsample=n_samples // 10,
        )
        transformer.fit(X)
        diff = np.linspace(0, 1, n_quantiles) - np.ravel(transformer.quantiles_)
        inf_norm = np.max(np.abs(diff))
        assert inf_norm < 1e-2
        inf_norm_arr.append(inf_norm)
    # each random subsampling yield a unique approximation to the expected
    # linspace CDF
    assert len(np.unique(inf_norm_arr)) == len(inf_norm_arr)

    # sparse support
    X = sparse.rand(n_samples, 1, density=0.99, format="csc", random_state=0)
    inf_norm_arr = []
    for random_state in range(ROUND):
        transformer = QuantileTransformer(
            random_state=random_state,
            n_quantiles=n_quantiles,
            subsample=n_samples // 10,
        )
        transformer.fit(X)
        diff = np.linspace(0, 1, n_quantiles) - np.ravel(transformer.quantiles_)
        inf_norm = np.max(np.abs(diff))
        # looser threshold than in the dense case
        assert inf_norm < 1e-1
        inf_norm_arr.append(inf_norm)
    # each random subsampling yield a unique approximation to the expected
    # linspace CDF
    assert len(np.unique(inf_norm_arr)) == len(inf_norm_arr)
|
||
|
|
||
|
|
||
|
def test_quantile_transform_subsampling_disabled():
    """Check the behaviour of `QuantileTransformer` when `subsample=None`."""
    n_quantiles = 5
    data = np.random.RandomState(0).normal(size=(200, 1))

    qt = QuantileTransformer(n_quantiles=n_quantiles, subsample=None)
    qt.fit(data)

    # The references are equally spaced in [0, 1] and the quantiles are the
    # ones computed on the full data.
    references = np.linspace(0, 1, n_quantiles)
    assert_allclose(qt.references_, references)
    full_data_quantiles = np.quantile(data.ravel(), references)
    assert_allclose(qt.quantiles_.ravel(), full_data_quantiles)
|
||
|
|
||
|
|
||
|
@pytest.mark.parametrize("csc_container", CSC_CONTAINERS)
def test_quantile_transform_sparse_toy(csc_container):
    """Check QuantileTransformer on a small sparse dataset."""
    dense = np.array(
        [
            [0.0, 2.0, 0.0],
            [25.0, 4.0, 0.0],
            [50.0, 0.0, 2.6],
            [0.0, 0.0, 4.1],
            [0.0, 6.0, 0.0],
            [0.0, 8.0, 0.0],
            [75.0, 0.0, 2.3],
            [0.0, 10.0, 0.0],
            [0.0, 0.0, 9.5],
            [100.0, 0.0, 0.1],
        ]
    )
    X_sp = csc_container(dense)

    # Transformer fitted on the sparse matrix itself.
    qt_sparse = QuantileTransformer(n_quantiles=10)
    qt_sparse.fit(X_sp)

    mapped = qt_sparse.fit_transform(X_sp)
    assert_array_almost_equal(np.min(mapped.toarray(), axis=0), 0.0)
    assert_array_almost_equal(np.max(mapped.toarray(), axis=0), 1.0)

    restored = qt_sparse.inverse_transform(mapped)
    assert_array_almost_equal(X_sp.toarray(), restored.toarray())

    # Transformer fitted on the dense version, then applied to the sparse one.
    qt_dense = QuantileTransformer(n_quantiles=10).fit(X_sp.toarray())

    mapped = qt_dense.transform(X_sp)
    assert_array_almost_equal(np.min(mapped.toarray(), axis=0), 0.0)
    assert_array_almost_equal(np.max(mapped.toarray(), axis=0), 1.0)

    restored = qt_dense.inverse_transform(mapped)
    assert_array_almost_equal(X_sp.toarray(), restored.toarray())
|
||
|
|
||
|
|
||
|
def test_quantile_transform_axis1():
    """Check that axis=1 gives the transpose of axis=0 on transposed data."""
    data = np.array(
        [[0, 25, 50, 75, 100], [2, 4, 6, 8, 10], [2.6, 4.1, 2.3, 9.5, 0.1]]
    )

    along_columns = quantile_transform(data.T, axis=0, n_quantiles=5)
    along_rows = quantile_transform(data, axis=1, n_quantiles=5)
    assert_array_almost_equal(along_columns, along_rows.T)
|
||
|
|
||
|
|
||
|
@pytest.mark.parametrize("csc_container", CSC_CONTAINERS)
def test_quantile_transform_bounds(csc_container):
    """Check that the lower and upper bounds are mapped properly.

    Covers a constant feature and a binary feature (dense and sparse), a
    transformer applied to data other than the data it was fitted on, and
    values lying outside of the range learned during fit.
    """
    X_dense = np.array([[0, 0], [0, 0], [1, 0]])
    X_sparse = csc_container(X_dense)

    # check sparse and dense are consistent
    X_trans = QuantileTransformer(n_quantiles=3, random_state=0).fit_transform(X_dense)
    assert_array_almost_equal(X_trans, X_dense)
    X_trans_sp = QuantileTransformer(n_quantiles=3, random_state=0).fit_transform(
        X_sparse
    )
    assert_array_almost_equal(X_trans_sp.toarray(), X_dense)
    assert_array_almost_equal(X_trans, X_trans_sp.toarray())

    # check the consistency of the bounds by learning on 1 matrix
    # and transforming another
    X = np.array([[0, 1], [0, 0.5], [1, 0]])
    X1 = np.array([[0, 0.1], [0, 0.5], [1, 0.1]])
    transformer = QuantileTransformer(n_quantiles=3).fit(X)
    X_trans = transformer.transform(X1)
    assert_array_almost_equal(X_trans, X1)

    # check that values outside of the range learned will be mapped properly.
    # The data comes from a seeded generator (instead of the global numpy
    # state) so that the test is reproducible.
    rng = np.random.RandomState(0)
    X = rng.random_sample((1000, 1))
    transformer = QuantileTransformer()
    transformer.fit(X)
    assert transformer.transform([[-10]]) == transformer.transform([[np.min(X)]])
    assert transformer.transform([[10]]) == transformer.transform([[np.max(X)]])
    assert transformer.inverse_transform([[-10]]) == transformer.inverse_transform(
        [[np.min(transformer.references_)]]
    )
    assert transformer.inverse_transform([[10]]) == transformer.inverse_transform(
        [[np.max(transformer.references_)]]
    )
|
||
|
|
||
|
|
||
|
def test_quantile_transform_and_inverse():
    """Check that inverse_transform recovers the input to 9 decimals."""
    datasets_to_check = (
        iris.data,
        np.array([[0.0], [BOUNDS_THRESHOLD / 10], [1.5], [2], [3], [3], [4]]),
    )
    for data in datasets_to_check:
        qt = QuantileTransformer(n_quantiles=1000, random_state=0)
        mapped = qt.fit_transform(data)
        restored = qt.inverse_transform(mapped)
        assert_array_almost_equal(data, restored, decimal=9)
|
||
|
|
||
|
|
||
|
def test_quantile_transform_nan():
    """Check the fitted quantiles when the input contains NaN values."""
    data = np.array(
        [[np.nan, 0, 0, 1], [np.nan, np.nan, 0, 0.5], [np.nan, 1, 1, 0]]
    )

    qt = QuantileTransformer(n_quantiles=10, random_state=42)
    qt.fit_transform(data)

    first_column_quantiles = qt.quantiles_[:, 0]
    other_columns_quantiles = qt.quantiles_[:, 1:]
    # The first column only holds NaN, hence all its quantiles are NaN ...
    assert np.isnan(first_column_quantiles).all()
    # ... whereas no other column should have a NaN quantile.
    assert not np.isnan(other_columns_quantiles).any()
|
||
|
|
||
|
|
||
|
@pytest.mark.parametrize("array_type", ["array", "sparse"])
def test_quantile_transformer_sorted_quantiles(array_type):
    """Check that the fitted quantiles are sorted in increasing order.

    Non-regression test for:
    https://github.com/scikit-learn/scikit-learn/issues/15733
    The data is taken from the upstream bug report:
    https://github.com/numpy/numpy/issues/14685
    """
    values = np.array([0, 1, 1, 2, 2, 3, 3, 4, 5, 5, 1, 1, 9, 9, 9, 8, 8, 7] * 10)
    column = 0.1 * values.reshape(-1, 1)
    X = _convert_container(column, array_type)

    n_quantiles = 100
    qt = QuantileTransformer(n_quantiles=n_quantiles).fit(X)

    # The estimated quantile thresholds must be monotonically increasing.
    thresholds = qt.quantiles_[:, 0]
    assert len(thresholds) == 100
    steps = np.diff(thresholds)
    assert all(steps >= 0)
|
||
|
|
||
|
|
||
|
def test_robust_scaler_invalid_range():
    """Check that RobustScaler.fit rejects invalid quantile ranges."""
    invalid_ranges = (
        (-1, 90),
        (-2, -3),
        (10, 101),
        (100.5, 101),
        (90, 50),
    )
    for quantile_range in invalid_ranges:
        robust = RobustScaler(quantile_range=quantile_range)

        # The error is only raised when fitting, not at construction.
        with pytest.raises(ValueError, match=r"Invalid quantile range: \("):
            robust.fit(iris.data)
|
||
|
|
||
|
|
||
|
@pytest.mark.parametrize("csr_container", CSR_CONTAINERS)
def test_scale_function_without_centering(csr_container):
    """Check the `scale` function with with_mean=False on dense and sparse data.

    Dense, CSR and CSC inputs must give consistent results, and scaling a
    sparse matrix along axis=1 must raise a ValueError.
    """
    rng = np.random.RandomState(42)
    X = rng.randn(4, 5)
    X[:, 0] = 0.0  # first feature is always of zero
    X_csr = csr_container(X)

    X_scaled = scale(X, with_mean=False)
    assert not np.any(np.isnan(X_scaled))

    X_csr_scaled = scale(X_csr, with_mean=False)
    assert not np.any(np.isnan(X_csr_scaled.data))

    # test csc has same outcome
    X_csc_scaled = scale(X_csr.tocsc(), with_mean=False)
    assert_array_almost_equal(X_scaled, X_csc_scaled.toarray())

    # raises value error on axis != 0
    with pytest.raises(ValueError):
        scale(X_csr, with_mean=False, axis=1)

    assert_array_almost_equal(
        X_scaled.mean(axis=0), [0.0, -0.01, 2.24, -0.35, -0.78], 2
    )
    assert_array_almost_equal(X_scaled.std(axis=0), [0.0, 1.0, 1.0, 1.0, 1.0])
    # Check that the result is a new object, i.e. X was not scaled in place
    assert X_scaled is not X

    # mean_variance_axis yields the variance (not the standard deviation), so
    # compare it with the dense variance.
    X_csr_scaled_mean, X_csr_scaled_var = mean_variance_axis(X_csr_scaled, 0)
    assert_array_almost_equal(X_csr_scaled_mean, X_scaled.mean(axis=0))
    assert_array_almost_equal(X_csr_scaled_var, X_scaled.var(axis=0))

    # null scale
    X_csr_scaled = scale(X_csr, with_mean=False, with_std=False, copy=True)
    assert_array_almost_equal(X_csr.toarray(), X_csr_scaled.toarray())
|
||
|
|
||
|
|
||
|
def test_robust_scale_axis1():
    """Row-wise ``robust_scale`` gives each sample median 0 and IQR 1."""
    scaled_rows = robust_scale(iris.data, axis=1)
    assert_array_almost_equal(np.median(scaled_rows, axis=1), 0)

    lower_q, upper_q = np.percentile(scaled_rows, q=(25, 75), axis=1)
    assert_array_almost_equal(upper_q - lower_q, 1)
|
||
|
|
||
|
|
||
|
def test_robust_scale_1d_array():
    """``robust_scale`` on a 1-D array yields median 0 and IQR 1."""
    one_feature = iris.data[:, 1]
    scaled = robust_scale(one_feature)
    assert_array_almost_equal(np.median(scaled), 0)

    lower_q, upper_q = np.percentile(scaled, q=(25, 75))
    assert_array_almost_equal(upper_q - lower_q, 1)
|
||
|
|
||
|
|
||
|
def test_robust_scaler_zero_variance_features():
    """Check RobustScaler on toy data that contains constant features."""
    X_fit = [[0.0, 1.0, +0.5], [0.0, 1.0, -0.1], [0.0, 1.0, +1.1]]

    robust = RobustScaler()
    fitted_out = robust.fit_transform(X_fit)

    # NOTE: with so few samples, the expected third column depends heavily on
    # how quantiles are computed. The values below match the quantiles
    # produced by np.percentile (as of numpy 1.9). Computing quantiles with
    # scipy.stats.mstats.scoreatquantile or scipy.stats.mstats.mquantiles
    # would yield very different results.
    expected_fit = [[0.0, 0.0, +0.0], [0.0, 0.0, -1.0], [0.0, 0.0, +1.0]]
    assert_array_almost_equal(fitted_out, expected_fit)

    round_trip = robust.inverse_transform(fitted_out)
    assert_array_almost_equal(X_fit, round_trip)

    # Unseen data must be transformed with the statistics learned above.
    unseen = [[+0.0, 2.0, 0.5], [-1.0, 1.0, 0.0], [+0.0, 1.0, 1.5]]
    expected_unseen = [
        [+0.0, 1.0, +0.0],
        [-1.0, 0.0, -0.83333],
        [+0.0, 0.0, +1.66667],
    ]
    assert_array_almost_equal(robust.transform(unseen), expected_unseen, decimal=3)
|
||
|
|
||
|
|
||
|
def test_robust_scaler_unit_variance():
    """RobustScaler(unit_variance=True) on standard normal data plus outliers."""
    inliers = np.random.RandomState(42).randn(1000000, 1)
    high_outliers = np.ones((100, 1)) * 100
    low_outliers = np.ones((100, 1)) * -100
    contaminated = np.vstack([inliers, high_outliers, low_outliers])

    fitted = RobustScaler(quantile_range=(1, 99), unit_variance=True).fit(
        contaminated
    )
    inliers_scaled = fitted.transform(inliers)

    # Statistics are learned on the contaminated sample but should still
    # describe the underlying standard normal data.
    assert fitted.center_ == pytest.approx(0, abs=1e-3)
    assert fitted.scale_ == pytest.approx(1, abs=1e-2)
    assert inliers_scaled.std() == pytest.approx(1, abs=1e-2)
|
||
|
|
||
|
|
||
|
@pytest.mark.parametrize("sparse_container", CSC_CONTAINERS + CSR_CONTAINERS)
def test_maxabs_scaler_zero_variance_features(sparse_container):
    """Check MaxAbsScaler on toy data that contains constant features."""
    X_fit = [
        [0.0, 1.0, +0.5],
        [0.0, 1.0, -0.3],
        [0.0, 1.0, +1.5],
        [0.0, 0.0, +0.0],
    ]
    expected_fit = [
        [0.0, 1.0, 1.0 / 3.0],
        [0.0, 1.0, -0.2],
        [0.0, 1.0, 1.0],
        [0.0, 0.0, 0.0],
    ]

    maxabs = MaxAbsScaler()
    dense_out = maxabs.fit_transform(X_fit)
    assert_array_almost_equal(dense_out, expected_fit)
    assert_array_almost_equal(X_fit, maxabs.inverse_transform(dense_out))

    # Unseen data is scaled with the maxima learned during fit.
    unseen = [[+0.0, 2.0, 0.5], [-1.0, 1.0, 0.0], [+0.0, 1.0, 1.5]]
    expected_unseen = [[+0.0, 2.0, 1.0 / 3.0], [-1.0, 1.0, 0.0], [+0.0, 1.0, 1.0]]
    assert_array_almost_equal(maxabs.transform(unseen), expected_unseen, decimal=2)

    # The function interface must agree with the estimator.
    assert_array_almost_equal(maxabs_scale(X_fit), expected_fit)

    # Sparse input: same expected values, same round trip.
    sparse_out = maxabs.fit_transform(sparse_container(X_fit))
    assert_array_almost_equal(sparse_out.toarray(), expected_fit)
    sparse_back = maxabs.inverse_transform(sparse_out)
    assert_array_almost_equal(X_fit, sparse_back.toarray())
|
||
|
|
||
|
|
||
|
def test_maxabs_scaler_large_negative_value():
    """Check MaxAbsScaler on toy data whose largest magnitude is negative."""
    samples = [
        [0.0, 1.0, +0.5, -1.0],
        [0.0, 1.0, -0.3, -0.5],
        [0.0, 1.0, -100.0, 0.0],
        [0.0, 0.0, +0.0, -2.0],
    ]
    expected = [
        [0.0, 1.0, 0.005, -0.5],
        [0.0, 1.0, -0.003, -0.25],
        [0.0, 1.0, -1.0, 0.0],
        [0.0, 0.0, 0.0, -1.0],
    ]

    scaled = MaxAbsScaler().fit_transform(samples)
    assert_array_almost_equal(scaled, expected)
|
||
|
|
||
|
|
||
|
@pytest.mark.parametrize("csr_container", CSR_CONTAINERS)
def test_maxabs_scaler_transform_one_row_csr(csr_container):
    """MaxAbsScaler must handle a CSR input that has a single row."""
    single_row = csr_container([[0.5, 1.0, 1.0]])
    fitted = MaxAbsScaler().fit(single_row)

    scaled = fitted.transform(single_row)
    all_ones = csr_container([[1.0, 1.0, 1.0]])
    assert_array_almost_equal(scaled.toarray(), all_ones.toarray())

    restored = fitted.inverse_transform(scaled)
    assert_array_almost_equal(single_row.toarray(), restored.toarray())
|
||
|
|
||
|
|
||
|
def test_maxabs_scaler_1d():
    """Test MaxAbsScaler on data with a single row or a single column.

    Covers array and list inputs, a constant feature, and the
    ``maxabs_scale`` function interface on a 1-D array.
    """
    # The last entry used to repeat X_list_1row, which left the list-valued
    # single-column input untested.
    for X in [X_1row, X_1col, X_list_1row, X_list_1col]:
        scaler = MaxAbsScaler(copy=True)
        X_scaled = scaler.fit(X).transform(X)

        if isinstance(X, list):
            X = np.array(X)  # cast only after scaling done

        # Take the absolute value *before* the maximum: the property under
        # test is that the largest magnitude is 1, which also holds when the
        # extreme value of a feature is negative.
        if _check_dim_1axis(X) == 1:
            assert_array_almost_equal(
                np.abs(X_scaled).max(axis=0), np.ones(n_features)
            )
        else:
            assert_array_almost_equal(np.abs(X_scaled).max(axis=0), 1.0)
        assert scaler.n_samples_seen_ == X.shape[0]

        # check inverse transform
        X_scaled_back = scaler.inverse_transform(X_scaled)
        assert_array_almost_equal(X_scaled_back, X)

    # Constant feature
    X = np.ones((5, 1))
    scaler = MaxAbsScaler()
    X_scaled = scaler.fit(X).transform(X)
    assert_array_almost_equal(np.abs(X_scaled).max(axis=0), 1.0)
    assert scaler.n_samples_seen_ == X.shape[0]

    # function interface
    X_1d = X_1row.ravel()
    max_abs = np.abs(X_1d).max()
    assert_array_almost_equal(X_1d / max_abs, maxabs_scale(X_1d, copy=True))
|
||
|
|
||
|
|
||
|
@pytest.mark.parametrize("csr_container", CSR_CONTAINERS)
def test_maxabs_scaler_partial_fit(csr_container):
    """Check that ``partial_fit`` over batches gives the same result as ``fit``.

    Batches of several sizes (including sizes larger than the data) are fed
    as dense, CSR and CSC input and compared with a single ``fit`` call.
    """
    X = X_2d[:100, :]
    n = X.shape[0]

    for chunk_size in [1, 2, 50, n, n + 42]:
        # Compare the fitted attributes once every batch has been consumed.
        scaler_batch = MaxAbsScaler().fit(X)

        scaler_incr = MaxAbsScaler()
        scaler_incr_csr = MaxAbsScaler()
        scaler_incr_csc = MaxAbsScaler()
        for batch in gen_batches(n, chunk_size):
            scaler_incr = scaler_incr.partial_fit(X[batch])
            X_csr = csr_container(X[batch])
            scaler_incr_csr = scaler_incr_csr.partial_fit(X_csr)
            # Convert to CSC explicitly; building this with csr_container
            # fed CSR data to the "csc" scaler as well.
            X_csc = X_csr.tocsc()
            scaler_incr_csc = scaler_incr_csc.partial_fit(X_csc)

        assert_array_almost_equal(scaler_batch.max_abs_, scaler_incr.max_abs_)
        assert_array_almost_equal(scaler_batch.max_abs_, scaler_incr_csr.max_abs_)
        assert_array_almost_equal(scaler_batch.max_abs_, scaler_incr_csc.max_abs_)
        assert scaler_batch.n_samples_seen_ == scaler_incr.n_samples_seen_
        assert scaler_batch.n_samples_seen_ == scaler_incr_csr.n_samples_seen_
        assert scaler_batch.n_samples_seen_ == scaler_incr_csc.n_samples_seen_
        assert_array_almost_equal(scaler_batch.scale_, scaler_incr.scale_)
        assert_array_almost_equal(scaler_batch.scale_, scaler_incr_csr.scale_)
        assert_array_almost_equal(scaler_batch.scale_, scaler_incr_csc.scale_)
        assert_array_almost_equal(scaler_batch.transform(X), scaler_incr.transform(X))

        # Compare the fitted attributes after a single partial_fit step.
        batch0 = slice(0, chunk_size)
        scaler_batch = MaxAbsScaler().fit(X[batch0])
        scaler_incr = MaxAbsScaler().partial_fit(X[batch0])

        assert_array_almost_equal(scaler_batch.max_abs_, scaler_incr.max_abs_)
        assert scaler_batch.n_samples_seen_ == scaler_incr.n_samples_seen_
        assert_array_almost_equal(scaler_batch.scale_, scaler_incr.scale_)
        assert_array_almost_equal(scaler_batch.transform(X), scaler_incr.transform(X))

        # Check the running sample count after every partial_fit call.
        scaler_batch = MaxAbsScaler().fit(X)
        scaler_incr = MaxAbsScaler()  # Clean estimator
        for i, batch in enumerate(gen_batches(n, chunk_size)):
            scaler_incr = scaler_incr.partial_fit(X[batch])
            assert_correct_incr(
                i,
                batch_start=batch.start,
                batch_stop=batch.stop,
                n=n,
                chunk_size=chunk_size,
                n_samples_seen=scaler_incr.n_samples_seen_,
            )
|
||
|
|
||
|
|
||
|
def check_normalizer(norm, X_norm):
    """
    Shared assertion helper for `test_normalizer_l1_l2_max` and
    `test_normalizer_l1_l2_max_non_csr`.

    Asserts that rows 0-2 of `X_norm` have unit norm and that row 3 has
    norm zero, for `norm` in {"l1", "l2", "max"}. Any other `norm` value is
    ignored.
    """
    if norm == "l1":
        row_norms = np.abs(X_norm).sum(axis=1)
    elif norm == "l2":
        # Only the first four rows are ever inspected.
        row_norms = [la.norm(row) for row in X_norm[:4]]
    elif norm == "max":
        row_norms = abs(X_norm).max(axis=1)
    else:
        return

    for row_idx in range(3):
        assert_almost_equal(row_norms[row_idx], 1.0)
    assert_almost_equal(row_norms[3], 0.0)
|
||
|
|
||
|
|
||
|
@pytest.mark.parametrize("norm", ["l1", "l2", "max"])
@pytest.mark.parametrize("csr_container", CSR_CONTAINERS)
def test_normalizer_l1_l2_max(norm, csr_container):
    """Check Normalizer on dense and CSR input, with and without copying."""
    dense = np.random.RandomState(0).randn(4, 5)
    csr_explicit_zeros = csr_container(dense)

    # Zero out row 3 of the dense data.
    dense[3, :] = 0.0

    # Zero out row 3 of the CSR data while keeping the stored entries, i.e.
    # without pruning (can happen in real life).
    row_start, row_stop = csr_explicit_zeros.indptr[3:5]
    csr_explicit_zeros.data[row_start:row_stop] = 0.0

    # The regular constructor gives the pruned variant.
    csr_pruned = csr_container(dense)

    # All of these inputs support the no-copy optimisation.
    for X in (dense, csr_pruned, csr_explicit_zeros):
        # copy=True first, so that X is still untouched for this call.
        copied = Normalizer(norm=norm, copy=True).transform(X)
        assert copied is not X
        copied = toarray(copied)

        in_place = Normalizer(norm=norm, copy=False).transform(X)
        assert in_place is X
        in_place = toarray(in_place)

        for result in (copied, in_place):
            check_normalizer(norm, result)
|
||
|
|
||
|
|
||
|
@pytest.mark.parametrize("norm", ["l1", "l2", "max"])
@pytest.mark.parametrize(
    "sparse_container", COO_CONTAINERS + CSC_CONTAINERS + LIL_CONTAINERS
)
def test_normalizer_l1_l2_max_non_csr(norm, sparse_container):
    """Non-CSR sparse input is returned as a new CSR object, even with copy=False."""
    dense = np.random.RandomState(0).randn(4, 5)
    dense[3, :] = 0.0  # row 3 is entirely zero

    X = sparse_container(dense)
    result = Normalizer(norm=norm, copy=False).transform(X)

    assert result is not X
    assert sparse.issparse(result) and result.format == "csr"

    check_normalizer(norm, toarray(result))
|
||
|
|
||
|
|
||
|
@pytest.mark.parametrize("csr_container", CSR_CONTAINERS)
def test_normalizer_max_sign(csr_container):
    """Max-norm normalisation divides by a positive number, so signs survive."""
    mixed = np.random.RandomState(0).randn(4, 5)
    mixed[3, :] = 0.0  # row 3 is entirely zero
    # Flip the sign of the largest-magnitude entry of row 2 (mixed-sign case).
    largest_col = abs(mixed[2, :]).argmax()
    mixed[2, largest_col] *= -1

    all_negative = -np.abs(mixed)
    all_negative_sparse = csr_container(all_negative)

    for X in (mixed, all_negative, all_negative_sparse):
        result = Normalizer(norm="max").transform(X)
        assert result is not X
        assert_array_equal(np.sign(toarray(result)), np.sign(toarray(X)))
|
||
|
|
||
|
|
||
|
@pytest.mark.parametrize("csr_container", CSR_CONTAINERS)
def test_normalize(csr_container):
    """Test the ``normalize`` function.

    Only covers functionality that the Normalizer tests do not use: the
    ``axis`` argument, dtype preservation and ``return_norm``.
    """
    small = np.random.RandomState(37).randn(3, 2)
    assert_array_equal(
        normalize(small, copy=False), normalize(small.T, axis=0, copy=False).T
    )

    dense = np.random.RandomState(0).randn(10, 5)
    unit_rows = np.ones((10))
    for data in (dense, csr_container(dense)):
        for dtype in (np.float32, np.float64):
            for norm in ("l1", "l2"):
                # The cast result is kept, so later iterations start from it.
                data = data.astype(dtype)
                normalized = normalize(data, norm=norm)
                assert normalized.dtype == dtype

                normalized = toarray(normalized)
                if norm == "l1":
                    row_totals = np.abs(normalized).sum(axis=1)
                else:
                    row_totals = (normalized**2).sum(axis=1)

                assert_array_almost_equal(row_totals, unit_rows)

    # return_norm on dense input.
    dense = np.array([[3.0, 0, 4.0], [1.0, 0.0, 0.0], [2.0, 3.0, 0.0]])
    for norm in ("l1", "l2", "max"):
        _, norms = normalize(dense, norm=norm, return_norm=True)
        if norm == "l1":
            expected_norms = np.array([7.0, 1.0, 5.0])
        elif norm == "l2":
            expected_norms = np.array([5.0, 1.0, 3.60555127])
        else:
            expected_norms = np.array([4.0, 1.0, 3.0])
        assert_array_almost_equal(norms, expected_norms)

    # return_norm on sparse input: only "max" is expected to work.
    as_sparse = csr_container(dense)
    for norm in ("l1", "l2"):
        with pytest.raises(NotImplementedError):
            normalize(as_sparse, norm=norm, return_norm=True)
    _, norms = normalize(as_sparse, norm="max", return_norm=True)
    assert_array_almost_equal(norms, np.array([4.0, 1.0, 3.0]))
|
||
|
|
||
|
|
||
|
@pytest.mark.parametrize(
    "constructor", [np.array, list] + CSC_CONTAINERS + CSR_CONTAINERS
)
def test_binarizer(constructor):
    """Check Binarizer for array, list and sparse inputs.

    Exercises a positive threshold, the default threshold, ``copy=True`` and
    ``copy=False``, and a negative threshold (allowed for dense input,
    rejected for sparse input).
    """
    X_ = np.array([[1, 0, 5], [2, 3, -1]])
    X = constructor(X_.copy())

    binarizer = Binarizer(threshold=2.0, copy=True)
    X_bin = toarray(binarizer.transform(X))
    assert np.sum(X_bin == 0) == 4
    assert np.sum(X_bin == 1) == 2
    X_bin = binarizer.transform(X)
    assert sparse.issparse(X) == sparse.issparse(X_bin)

    binarizer = Binarizer(copy=True).fit(X)
    X_bin = toarray(binarizer.transform(X))
    assert X_bin is not X
    assert np.sum(X_bin == 0) == 2
    assert np.sum(X_bin == 1) == 4

    binarizer = Binarizer(copy=True)
    X_bin = binarizer.transform(X)
    assert X_bin is not X
    X_bin = toarray(X_bin)
    assert np.sum(X_bin == 0) == 2
    assert np.sum(X_bin == 1) == 4

    # With copy=False the input object itself is returned, except for lists,
    # which have to be converted first.
    binarizer = Binarizer(copy=False)
    X_bin = binarizer.transform(X)
    if constructor is not list:
        assert X_bin is X

    binarizer = Binarizer(copy=False)
    X_float = np.array([[1, 0, 5], [2, 3, -1]], dtype=np.float64)
    X_bin = binarizer.transform(X_float)
    if constructor is not list:
        assert X_bin is X_float

    X_bin = toarray(X_bin)
    assert np.sum(X_bin == 0) == 2
    assert np.sum(X_bin == 1) == 4

    binarizer = Binarizer(threshold=-0.5, copy=True)
    if constructor in (np.array, list):
        # Rebuild X: the copy=False call above may have modified it in place.
        X = constructor(X_.copy())

        X_bin = toarray(binarizer.transform(X))
        assert np.sum(X_bin == 0) == 1
        assert np.sum(X_bin == 1) == 5
        X_bin = binarizer.transform(X)

    # Cannot use threshold < 0 for sparse input. This is checked for every
    # sparse constructor in the parametrization; previously the CSR cases
    # skipped the check.
    if sparse.issparse(X):
        with pytest.raises(ValueError):
            binarizer.transform(constructor(X))
|
||
|
|
||
|
|
||
|
def test_center_kernel():
    """KernelCenterer must match centering the features with StandardScaler.

    The linear kernel of mean-centered features is compared with the output
    of KernelCenterer on the uncentered kernel, at fit and predict time.
    """
    rng = np.random.RandomState(0)
    train = rng.random_sample((5, 4))
    feature_centerer = StandardScaler(with_std=False)
    feature_centerer.fit(train)
    train_centered = feature_centerer.transform(train)
    gram_train = np.dot(train, train.T)

    # Fit-time kernel matrix.
    kernel_centerer = KernelCenterer()
    gram_train_ref = np.dot(train_centered, train_centered.T)
    gram_train_out = kernel_centerer.fit_transform(gram_train)
    assert_array_almost_equal(gram_train_ref, gram_train_out)

    # Predict-time kernel matrix.
    new = rng.random_sample((2, 4))
    gram_new = np.dot(new, train.T)
    new_centered = feature_centerer.transform(new)
    gram_new_ref = np.dot(new_centered, train_centered.T)
    gram_new_out = kernel_centerer.transform(gram_new)
    assert_array_almost_equal(gram_new_ref, gram_new_out)

    # Check coherence with the method proposed in:
    # B. Schölkopf, A. Smola, and K.R. Müller,
    # "Nonlinear component analysis as a kernel eigenvalue problem"
    # equation (B.3)

    # K_centered = (I - 1_M) K (I - 1_M)
    #            = K - 1_M K - K 1_M + 1_M K 1_M
    n_train = gram_train.shape[0]
    ones_M = np.ones_like(gram_train) / n_train
    gram_train_formula = (
        gram_train
        - ones_M @ gram_train
        - gram_train @ ones_M
        + ones_M @ gram_train @ ones_M
    )
    assert_allclose(gram_train_ref, gram_train_formula)

    # K_test_centered = (K_test - 1'_M K)(I - 1_M)
    #                 = K_test - 1'_M K - K_test 1_M + 1'_M K 1_M
    ones_prime_M = np.ones_like(gram_new) / n_train
    gram_new_formula = (
        gram_new
        - ones_prime_M @ gram_train
        - gram_new @ ones_M
        + ones_prime_M @ gram_train @ ones_M
    )
    assert_allclose(gram_new_ref, gram_new_formula)
|
||
|
|
||
|
|
||
|
def test_kernelcenterer_non_linear_kernel():
    """Check kernel centering for a kernel built from an explicit mapping."""
    rng = np.random.RandomState(0)
    X, X_test = rng.randn(100, 50), rng.randn(20, 50)

    def phi(X):
        """Stack the positive part of X on top of the negated negative part."""
        positive_part = np.clip(X, a_min=0, a_max=None)
        negative_part = -np.clip(X, a_min=None, a_max=0)
        return np.vstack([positive_part, negative_part])

    mapped = phi(X)
    mapped_test = phi(X_test)

    # Center the mapped data explicitly.
    centerer = StandardScaler(with_std=False)
    mapped_centered = centerer.fit_transform(mapped)
    mapped_test_centered = centerer.transform(mapped_test)

    # Kernels of the raw and of the centered mapped data.
    K = mapped @ mapped.T
    K_test = mapped_test @ mapped.T
    K_ref = mapped_centered @ mapped_centered.T
    K_test_ref = mapped_test_centered @ mapped_centered.T

    kernel_centerer = KernelCenterer()
    kernel_centerer.fit(K)

    assert_allclose(kernel_centerer.transform(K), K_ref)
    assert_allclose(kernel_centerer.transform(K_test), K_test_ref)

    # Check coherence with the method proposed in:
    # B. Schölkopf, A. Smola, and K.R. Müller,
    # "Nonlinear component analysis as a kernel eigenvalue problem"
    # equation (B.3)

    # K_centered = (I - 1_M) K (I - 1_M)
    #            = K - 1_M K - K 1_M + 1_M K 1_M
    n_fit = K.shape[0]
    ones_M = np.ones_like(K) / n_fit
    K_formula = K - ones_M @ K - K @ ones_M + ones_M @ K @ ones_M
    assert_allclose(kernel_centerer.transform(K), K_formula)

    # K_test_centered = (K_test - 1'_M K)(I - 1_M)
    #                 = K_test - 1'_M K - K_test 1_M + 1'_M K 1_M
    ones_prime_M = np.ones_like(K_test) / n_fit
    K_test_formula = (
        K_test - ones_prime_M @ K - K_test @ ones_M + ones_prime_M @ K @ ones_M
    )
    assert_allclose(kernel_centerer.transform(K_test), K_test_formula)
|
||
|
|
||
|
|
||
|
def test_cv_pipeline_precomputed():
    """A Pipeline containing KernelCenterer must be treated as pairwise.

    Cross-validates a regression on four coplanar points that share the same
    target value, using a precomputed kernel.
    """
    points = np.array([[3, 0, 0], [0, 3, 0], [0, 0, 3], [1, 1, 1]])
    targets = np.ones((4,))
    gram = points.dot(points.T)
    steps = [("kernel_centerer", KernelCenterer()), ("svr", SVR())]
    pipeline = Pipeline(steps)

    # The pipeline has to report the pairwise tag.
    assert pipeline._get_tags()["pairwise"]

    # Cross-validated predictions should be almost perfect.
    # NB: this check is fairly vacuous; it mainly exercises the integration
    # of Pipeline and KernelCenterer.
    predictions = cross_val_predict(pipeline, gram, targets, cv=2)
    assert_array_almost_equal(targets, predictions)
|
||
|
|
||
|
|
||
|
def test_fit_transform():
    """``fit_transform`` must equal ``fit`` followed by ``transform``."""
    data = np.random.RandomState(0).random_sample((5, 4))
    for transformer in (StandardScaler(), Normalizer(), Binarizer()):
        two_step = transformer.fit(data).transform(data)
        one_step = transformer.fit_transform(data)
        assert_array_equal(two_step, one_step)
|
||
|
|
||
|
|
||
|
def test_add_dummy_feature():
    """``add_dummy_feature`` prepends a column of ones to dense input."""
    augmented = add_dummy_feature([[1, 0], [0, 1], [0, 1]])
    assert_array_equal(augmented, [[1, 1, 0], [1, 0, 1], [1, 0, 1]])
|
||
|
|
||
|
|
||
|
@pytest.mark.parametrize(
    "sparse_container", COO_CONTAINERS + CSC_CONTAINERS + CSR_CONTAINERS
)
def test_add_dummy_feature_sparse(sparse_container):
    """``add_dummy_feature`` keeps the sparse format of its input."""
    original = sparse_container([[1, 0], [0, 1], [0, 1]])
    input_format = original.format

    augmented = add_dummy_feature(original)

    assert sparse.issparse(augmented) and augmented.format == input_format, augmented
    assert_array_equal(augmented.toarray(), [[1, 1, 0], [1, 0, 1], [1, 0, 1]])
|
||
|
|
||
|
|
||
|
def test_fit_cold_start():
    """Refitting a scaler on data of a different width must not fail."""
    all_features = iris.data
    first_two_features = all_features[:, :2]

    # Scalers that have a partial_fit method
    incremental_scalers = (
        StandardScaler(with_mean=False, with_std=False),
        MinMaxScaler(),
        MaxAbsScaler(),
    )

    for estimator in incremental_scalers:
        estimator.fit_transform(all_features)
        # A second fit with a different number of columns may break the
        # scaler unless its internal state is reset.
        estimator.fit_transform(first_two_features)
|
||
|
|
||
|
|
||
|
@pytest.mark.parametrize("method", ["box-cox", "yeo-johnson"])
def test_power_transformer_notfitted(method):
    """An unfitted PowerTransformer raises NotFittedError in both directions."""
    unfitted = PowerTransformer(method=method)
    positive = np.abs(X_1col)

    with pytest.raises(NotFittedError):
        unfitted.transform(positive)

    with pytest.raises(NotFittedError):
        unfitted.inverse_transform(positive)
|
||
|
|
||
|
|
||
|
@pytest.mark.parametrize("method", ["box-cox", "yeo-johnson"])
@pytest.mark.parametrize("standardize", [True, False])
@pytest.mark.parametrize("X", [X_1col, X_2d])
def test_power_transformer_inverse(method, standardize, X):
    """``inverse_transform(transform(X))`` must give back the original X."""
    if method == "box-cox":
        # The box-cox cases are run on absolute values.
        X = np.abs(X)

    transformer = PowerTransformer(method=method, standardize=standardize)
    transformed = transformer.fit_transform(X)
    assert_almost_equal(X, transformer.inverse_transform(transformed))
|
||
|
|
||
|
|
||
|
def test_power_transformer_1d():
    """Box-Cox on a single column must agree with ``scipy.stats.boxcox``."""
    X = np.abs(X_1col)

    for standardize in (True, False):
        transformer = PowerTransformer(method="box-cox", standardize=standardize)

        from_class = transformer.fit_transform(X)
        from_func = power_transform(X, method="box-cox", standardize=standardize)

        # Reference values computed directly with scipy.
        reference, reference_lambda = stats.boxcox(X.flatten())
        if standardize:
            reference = scale(reference)
        reference_col = reference.reshape(-1, 1)

        assert_almost_equal(reference_col, from_class)
        assert_almost_equal(reference_col, from_func)

        assert_almost_equal(X, transformer.inverse_transform(from_class))
        assert_almost_equal(reference_lambda, transformer.lambdas_[0])

        assert len(transformer.lambdas_) == X.shape[1]
        assert isinstance(transformer.lambdas_, np.ndarray)
|
||
|
|
||
|
|
||
|
def test_power_transformer_2d():
    """Box-Cox on several columns must agree column-wise with scipy."""
    X = np.abs(X_2d)

    for standardize in (True, False):
        transformer = PowerTransformer(method="box-cox", standardize=standardize)

        from_class = transformer.fit_transform(X)
        from_func = power_transform(X, method="box-cox", standardize=standardize)

        for transformed in (from_class, from_func):
            for col in range(transformed.shape[1]):
                reference, reference_lambda = stats.boxcox(X[:, col].flatten())
                if standardize:
                    reference = scale(reference)

                assert_almost_equal(transformed[:, col], reference)
                assert_almost_equal(reference_lambda, transformer.lambdas_[col])

            # The inverse transformation must recover the input.
            recovered = transformer.inverse_transform(transformed)
            assert_array_almost_equal(recovered, X)

        assert len(transformer.lambdas_) == X.shape[1]
        assert isinstance(transformer.lambdas_, np.ndarray)
|
||
|
|
||
|
|
||
|
def test_power_transformer_boxcox_strictly_positive_exception():
    """Box-Cox must reject arrays containing negative values or zeros."""
    transformer = PowerTransformer(method="box-cox")
    transformer.fit(np.abs(X_2d))
    expected_msg = "strictly positive"

    # First data containing negative values, then an all-zero array; each is
    # checked against transform, fit and the function interface, in that
    # order.
    for invalid in (X_2d, np.zeros(X_2d.shape)):
        with pytest.raises(ValueError, match=expected_msg):
            transformer.transform(invalid)

        with pytest.raises(ValueError, match=expected_msg):
            transformer.fit(invalid)

        with pytest.raises(ValueError, match=expected_msg):
            power_transform(invalid, method="box-cox")
|
||
|
|
||
|
|
||
|
@pytest.mark.parametrize("X", [X_2d, np.abs(X_2d), -np.abs(X_2d), np.zeros(X_2d.shape)])
def test_power_transformer_yeojohnson_any_input(X):
    """Yeo-Johnson accepts mixed-sign, positive, negative and all-zero input."""
    # Smoke test: the call only has to complete without raising.
    power_transform(X, method="yeo-johnson")
|
||
|
|
||
|
|
||
|
@pytest.mark.parametrize("method", ["box-cox", "yeo-johnson"])
def test_power_transformer_shape_exception(method):
    """A feature-count mismatch after fit must raise ValueError."""
    fitted = PowerTransformer(method=method)
    full = np.abs(X_2d)
    fitted.fit(full)

    # Input with a different number of columns than seen during fit.
    one_column = full[:, 0:1]
    expected_msg = r"X has \d+ features, but PowerTransformer is expecting \d+ features"

    with pytest.raises(ValueError, match=expected_msg):
        fitted.transform(one_column)

    with pytest.raises(ValueError, match=expected_msg):
        fitted.inverse_transform(one_column)
|
||
|
|
||
|
|
||
|
def test_power_transformer_lambda_zero():
    """Box-Cox with lambda = 0 must round-trip through inverse_transform."""
    transformer = PowerTransformer(method="box-cox", standardize=False)
    first_col = np.abs(X_2d)[:, 0:1]

    # Set lambda by hand instead of fitting, to hit the lambda = 0 case.
    transformer.lambdas_ = np.array([0])
    transformed = transformer.transform(first_col)
    assert_array_almost_equal(transformer.inverse_transform(transformed), first_col)
|
||
|
|
||
|
|
||
|
def test_power_transformer_lambda_one():
    """Yeo-Johnson with lambda = 1 must be the identity."""
    transformer = PowerTransformer(method="yeo-johnson", standardize=False)
    first_col = np.abs(X_2d)[:, 0:1]

    # Set lambda by hand instead of fitting.
    transformer.lambdas_ = np.array([1])
    assert_array_almost_equal(transformer.transform(first_col), first_col)
|
||
|
|
||
|
|
||
|
@pytest.mark.parametrize(
    "method, lmbda",
    [
        ("box-cox", 0.1),
        ("box-cox", 0.5),
        ("yeo-johnson", 0.1),
        ("yeo-johnson", 0.5),
        ("yeo-johnson", 1.0),
    ],
)
def test_optimization_power_transformer(method, lmbda):
    """Check the lambda optimisation by undoing a known transformation.

    Steps:
    - fix lambda to a predefined value,
    - apply ``inverse_transform`` to normally distributed data,
    - apply ``fit_transform`` to that result,
    - check that the output is roughly the original normal data.
    """
    n_samples = 20000
    gaussian = np.random.RandomState(0).normal(loc=0, scale=1, size=(n_samples, 1))

    generator = PowerTransformer(method=method, standardize=False)
    generator.lambdas_ = [lmbda]
    skewed = generator.inverse_transform(gaussian)

    # A fresh estimator has to estimate lambda on its own.
    estimator = PowerTransformer(method=method, standardize=False)
    recovered = estimator.fit_transform(skewed)

    mean_error = np.linalg.norm(gaussian - recovered) / n_samples
    assert_almost_equal(0, mean_error, decimal=2)
    assert_almost_equal(0, recovered.mean(), decimal=1)
    assert_almost_equal(1, recovered.std(), decimal=1)
|
||
|
|
||
|
|
||
|
def test_yeo_johnson_darwin_example():
    """Reproduce the lambda from the example in the Yeo-Johnson paper.

    Data from "A new family of power transformations to improve normality or
    symmetry" by Yeo and Johnson.
    """
    observations = [
        6.1, -8.4, 1.0, 2.0, 0.7, 2.9, 3.5, 5.1, 1.8, 3.6, 7.0, 3.0, 9.3, 7.5, -6.0
    ]
    column = np.array(observations).reshape(-1, 1)
    fitted = PowerTransformer(method="yeo-johnson").fit(column)
    assert np.allclose(fitted.lambdas_, 1.305, atol=1e-3)
|
||
|
|
||
|
|
||
|
@pytest.mark.parametrize("method", ["box-cox", "yeo-johnson"])
def test_power_transformer_nans(method):
    """NaNs must not affect the estimated lambda and must pass through transform."""
    clean = np.abs(X_1col)
    transformer = PowerTransformer(method=method)
    transformer.fit(clean)
    lambda_clean = transformer.lambdas_[0]

    # Append as many NaNs as there are samples, then shuffle the rows.
    with_nans = np.concatenate([clean, np.full_like(clean, np.nan)])
    with_nans = shuffle(with_nans, random_state=0)

    transformer.fit(with_nans)
    lambda_with_nans = transformer.lambdas_[0]

    assert_almost_equal(lambda_clean, lambda_with_nans, decimal=5)

    # NaNs stay exactly where they were in the input.
    transformed = transformer.transform(with_nans)
    assert_array_equal(np.isnan(transformed), np.isnan(with_nans))
|
||
|
|
||
|
|
||
|
@pytest.mark.parametrize("method", ["box-cox", "yeo-johnson"])
@pytest.mark.parametrize("standardize", [True, False])
def test_power_transformer_fit_transform(method, standardize):
    """``fit_transform()`` and ``fit().transform()`` must return the same values."""
    data = np.abs(X_1col) if method == "box-cox" else X_1col

    transformer = PowerTransformer(method, standardize=standardize)
    two_step = transformer.fit(data).transform(data)
    one_step = transformer.fit_transform(data)
    assert_array_almost_equal(two_step, one_step)
|
||
|
|
||
|
|
||
|
@pytest.mark.parametrize("method", ["box-cox", "yeo-johnson"])
@pytest.mark.parametrize("standardize", [True, False])
def test_power_transformer_copy_True(method, standardize):
    """With ``copy=True`` no method may modify X in place.

    Covers fit, transform, fit_transform and inverse_transform.
    """
    data = X_1col
    if method == "box-cox":
        data = np.abs(data)

    snapshot = data.copy()
    assert data is not snapshot  # sanity checks
    assert_array_almost_equal(data, snapshot)

    transformer = PowerTransformer(method, standardize=standardize, copy=True)

    transformer.fit(data)
    assert_array_almost_equal(data, snapshot)
    transformed = transformer.transform(data)
    assert transformed is not data

    transformed = transformer.fit_transform(data)
    assert_array_almost_equal(data, snapshot)
    assert transformed is not data

    inverted = transformer.inverse_transform(transformed)
    assert transformed is not inverted
|
||
|
|
||
|
|
||
|
@pytest.mark.parametrize("method", ["box-cox", "yeo-johnson"])
@pytest.mark.parametrize("standardize", [True, False])
def test_power_transformer_copy_False(method, standardize):
    """With ``copy=False``, fit leaves X untouched but the transforms work in place.

    transform, fit_transform and inverse_transform are each expected to
    return the very object they were given.
    """
    # Always work on a private array. This test transforms X in place, and
    # for yeo-johnson X used to be the shared module-level X_1col object
    # itself, so the in-place transforms overwrote data used by other tests.
    # (np.abs already returns a new array for the box-cox case.)
    X = np.abs(X_1col) if method == "box-cox" else X_1col.copy()

    X_original = X.copy()
    assert X is not X_original  # sanity checks
    assert_array_almost_equal(X, X_original)

    pt = PowerTransformer(method, standardize=standardize, copy=False)

    pt.fit(X)
    assert_array_almost_equal(X, X_original)  # fit didn't change X

    X_trans = pt.transform(X)
    assert X_trans is X

    if method == "box-cox":
        # X now holds transformed values; take absolute values again before
        # refitting box-cox.
        X = np.abs(X)
    X_trans = pt.fit_transform(X)
    assert X_trans is X

    X_inv_trans = pt.inverse_transform(X_trans)
    assert X_trans is X_inv_trans
|
||
|
|
||
|
|
||
|
def test_power_transformer_box_cox_raise_all_nans_col():
    """Check that box-cox raises an informative error on an all-nan column.

    Non-regression test for gh-26303
    """
    data = rng.random_sample((4, 5))
    data[:, 0] = np.nan  # the first column holds nothing but nans

    transformer = PowerTransformer(method="box-cox")
    with pytest.raises(ValueError, match="Column must not be all nan."):
        transformer.fit_transform(data)
|
||
|
|
||
|
|
||
|
@pytest.mark.parametrize(
    "X_2",
    [sparse.random(10, 1, density=0.8, random_state=0)]
    + [
        csr_container(np.full((10, 1), fill_value=np.nan))
        for csr_container in CSR_CONTAINERS
    ],
)
def test_standard_scaler_sparse_partial_fit_finite_variance(X_2):
    """Check that `var_` stays finite after `fit` followed by `partial_fit`.

    The second batch `X_2` is either a random sparse column or a sparse
    column filled with nans.
    """
    # non-regression test for:
    # https://github.com/scikit-learn/scikit-learn/issues/16448
    # seeded, like `X_2`, so that the first batch is identical on every run
    X_1 = sparse.random(5, 1, density=0.8, random_state=0)
    scaler = StandardScaler(with_mean=False)
    scaler.fit(X_1).partial_fit(X_2)
    assert np.isfinite(scaler.var_[0])
|
||
|
|
||
|
|
||
|
@pytest.mark.parametrize("feature_range", [(0, 1), (-10, 10)])
def test_minmax_scaler_clip(feature_range):
    """Check that `clip=True` bounds transformed values to `feature_range`."""
    data = iris.data
    clipping_scaler = MinMaxScaler(feature_range=feature_range, clip=True)
    clipping_scaler.fit(data)

    col_min = data.min(axis=0)
    col_max = data.max(axis=0)
    # one sample whose first two features lie below the fitted minimum and
    # whose remaining features lie above the fitted maximum
    sample = np.r_[col_min[:2] - 10, col_max[2:] + 10]

    lower, upper = feature_range
    assert_allclose(
        clipping_scaler.transform([sample]),
        [[lower, lower, upper, upper]],
    )
|
||
|
|
||
|
|
||
|
def test_standard_scaler_raise_error_for_1d_input():
    """Check that `StandardScaler.inverse_transform` rejects a 1D array.

    Non-regression test for:
    https://github.com/scikit-learn/scikit-learn/issues/19518
    """
    fitted_scaler = StandardScaler().fit(X_2d)
    first_column = X_2d[:, 0]  # 1D slice of the 2D data used for fitting

    expected_msg = "Expected 2D array, got 1D array instead"
    with pytest.raises(ValueError, match=expected_msg):
        fitted_scaler.inverse_transform(first_column)
|
||
|
|
||
|
|
||
|
def test_power_transformer_significantly_non_gaussian():
    """Check that significantly non-Gaussian data is transformed correctly.

    For some explored lambdas, the transformed data may be constant and will
    be rejected. Non-regression test for
    https://github.com/scikit-learn/scikit-learn/issues/14959
    """
    raw_values = [0.6, 2.0, 3.0, 4.0] * 4 + [11, 12, 12, 16, 17, 20, 85, 90]
    X_non_gaussian = 1e6 * np.array(raw_values, dtype=np.float64).reshape(-1, 1)
    transformer = PowerTransformer()

    # promote RuntimeWarning to an error so that any such warning emitted
    # while fitting/transforming fails the test
    with warnings.catch_warnings():
        warnings.simplefilter("error", RuntimeWarning)
        transformed = transformer.fit_transform(X_non_gaussian)

    assert not np.isnan(transformed).any()
    assert transformed.mean() == pytest.approx(0.0)
    assert transformed.std() == pytest.approx(1.0)
    assert -2 < transformed.min()
    assert transformed.max() < 2
|
||
|
|
||
|
|
||
|
@pytest.mark.parametrize(
    "Transformer",
    [
        MinMaxScaler,
        MaxAbsScaler,
        RobustScaler,
        StandardScaler,
        QuantileTransformer,
        PowerTransformer,
    ],
)
def test_one_to_one_features(Transformer):
    """Check that one-to-one transformers return the given feature names."""
    expected_names = iris.feature_names
    fitted = Transformer().fit(iris.data)
    assert_array_equal(fitted.get_feature_names_out(expected_names), expected_names)
|
||
|
|
||
|
|
||
|
@pytest.mark.parametrize(
    "Transformer",
    [
        MinMaxScaler,
        MaxAbsScaler,
        RobustScaler,
        StandardScaler,
        QuantileTransformer,
        PowerTransformer,
        Normalizer,
        Binarizer,
    ],
)
def test_one_to_one_features_pandas(Transformer):
    """Check feature names out of one-to-one transformers fitted on a dataframe."""
    pd = pytest.importorskip("pandas")

    feature_names = iris.feature_names
    frame = pd.DataFrame(iris.data, columns=feature_names)
    fitted = Transformer().fit(frame)

    # without an argument, the dataframe column names are returned
    assert_array_equal(fitted.get_feature_names_out(), feature_names)

    # passing the very names seen during fit is accepted
    assert_array_equal(fitted.get_feature_names_out(feature_names), feature_names)

    # names differing from the ones seen during fit are rejected
    wrong_names = list("abcd")
    expected_msg = re.escape("input_features is not equal to feature_names_in_")
    with pytest.raises(ValueError, match=expected_msg):
        fitted.get_feature_names_out(wrong_names)
|
||
|
|
||
|
|
||
|
def test_kernel_centerer_feature_names_out():
    """Check the names returned by `KernelCenterer.get_feature_names_out`."""
    random_state = np.random.RandomState(0)
    kernel = linear_kernel(random_state.random_sample((6, 4)))
    fitted = KernelCenterer().fit(kernel)

    # one generated name per column of the kernel matrix
    n_outputs = kernel.shape[1]
    expected = [f"kernelcenterer{idx}" for idx in range(n_outputs)]
    assert_array_equal(fitted.get_feature_names_out(), expected)
|
||
|
|
||
|
|
||
|
@pytest.mark.parametrize("standardize", [True, False])
def test_power_transformer_constant_feature(standardize):
    """Check that PowerTransformer leaves constant features unchanged."""
    X = [[-2, 0, 2], [-2, 0, 2], [-2, 0, 2]]

    transformer = PowerTransformer(method="yeo-johnson", standardize=standardize)
    transformer.fit(X)
    assert_allclose(transformer.lambdas_, [1, 1, 1])

    # with standardization the expected output is all zeros, otherwise the
    # input itself
    expected = np.zeros_like(X) if standardize else X
    for result in (transformer.fit_transform(X), transformer.transform(X)):
        assert_allclose(result, expected)
|