import sys

import numpy as np
import pytest
from numpy.testing import assert_allclose, assert_array_equal
from scipy import sparse
from scipy.interpolate import BSpline
from scipy.sparse import random as sparse_random

from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import (
    KBinsDiscretizer,
    PolynomialFeatures,
    SplineTransformer,
)
from sklearn.preprocessing._csr_polynomial_expansion import (
    _calc_expanded_nnz,
    _calc_total_nnz,
    _get_sizeof_LARGEST_INT_t,
)
from sklearn.utils._testing import assert_array_almost_equal
from sklearn.utils.fixes import (
    CSC_CONTAINERS,
    CSR_CONTAINERS,
    parse_version,
    sp_version,
)


@pytest.mark.parametrize("est", (PolynomialFeatures, SplineTransformer))
|
||
|
def test_polynomial_and_spline_array_order(est):
|
||
|
"""Test that output array has the given order."""
|
||
|
X = np.arange(10).reshape(5, 2)
|
||
|
|
||
|
def is_c_contiguous(a):
|
||
|
return np.isfortran(a.T)
|
||
|
|
||
|
assert is_c_contiguous(est().fit_transform(X))
|
||
|
assert is_c_contiguous(est(order="C").fit_transform(X))
|
||
|
assert np.isfortran(est(order="F").fit_transform(X))
|
||
|
|
||
|
|
||
|
@pytest.mark.parametrize(
    "params, err_msg",
    [
        ({"knots": [[1]]}, r"Number of knots, knots.shape\[0\], must be >= 2."),
        ({"knots": [[1, 1], [2, 2]]}, r"knots.shape\[1\] == n_features is violated"),
        ({"knots": [[1], [0]]}, "knots must be sorted without duplicates."),
    ],
)
def test_spline_transformer_input_validation(params, err_msg):
    """Test that we raise errors for invalid input in SplineTransformer."""
    X = [[1], [2]]

    with pytest.raises(ValueError, match=err_msg):
        SplineTransformer(**params).fit(X)


@pytest.mark.parametrize("extrapolation", ["continue", "periodic"])
|
||
|
def test_spline_transformer_integer_knots(extrapolation):
|
||
|
"""Test that SplineTransformer accepts integer value knot positions."""
|
||
|
X = np.arange(20).reshape(10, 2)
|
||
|
knots = [[0, 1], [1, 2], [5, 5], [11, 10], [12, 11]]
|
||
|
_ = SplineTransformer(
|
||
|
degree=3, knots=knots, extrapolation=extrapolation
|
||
|
).fit_transform(X)
|
||
|
|
||
|
|
||
|
def test_spline_transformer_feature_names():
    """Test that SplineTransformer generates correct feature names."""
    X = np.arange(20).reshape(10, 2)
    splt = SplineTransformer(n_knots=3, degree=3, include_bias=True).fit(X)
    feature_names = splt.get_feature_names_out()
    assert_array_equal(
        feature_names,
        [
            "x0_sp_0",
            "x0_sp_1",
            "x0_sp_2",
            "x0_sp_3",
            "x0_sp_4",
            "x1_sp_0",
            "x1_sp_1",
            "x1_sp_2",
            "x1_sp_3",
            "x1_sp_4",
        ],
    )

    splt = SplineTransformer(n_knots=3, degree=3, include_bias=False).fit(X)
    feature_names = splt.get_feature_names_out(["a", "b"])
    assert_array_equal(
        feature_names,
        [
            "a_sp_0",
            "a_sp_1",
            "a_sp_2",
            "a_sp_3",
            "b_sp_0",
            "b_sp_1",
            "b_sp_2",
            "b_sp_3",
        ],
    )


@pytest.mark.parametrize(
    "extrapolation",
    ["constant", "linear", "continue", "periodic"],
)
@pytest.mark.parametrize("degree", [2, 3])
def test_split_transform_feature_names_extrapolation_degree(extrapolation, degree):
    """Test that feature names are correct for different extrapolations and degrees.

    Non-regression test for gh-25292.
    """
    X = np.arange(20).reshape(10, 2)
    splt = SplineTransformer(degree=degree, extrapolation=extrapolation).fit(X)
    feature_names = splt.get_feature_names_out(["a", "b"])
    assert len(feature_names) == splt.n_features_out_

    X_trans = splt.transform(X)
    assert X_trans.shape[1] == len(feature_names)


@pytest.mark.parametrize("degree", range(1, 5))
|
||
|
@pytest.mark.parametrize("n_knots", range(3, 5))
|
||
|
@pytest.mark.parametrize("knots", ["uniform", "quantile"])
|
||
|
@pytest.mark.parametrize("extrapolation", ["constant", "periodic"])
|
||
|
def test_spline_transformer_unity_decomposition(degree, n_knots, knots, extrapolation):
|
||
|
"""Test that B-splines are indeed a decomposition of unity.
|
||
|
|
||
|
Splines basis functions must sum up to 1 per row, if we stay in between boundaries.
|
||
|
"""
|
||
|
X = np.linspace(0, 1, 100)[:, None]
|
||
|
# make the boundaries 0 and 1 part of X_train, for sure.
|
||
|
X_train = np.r_[[[0]], X[::2, :], [[1]]]
|
||
|
X_test = X[1::2, :]
|
||
|
|
||
|
if extrapolation == "periodic":
|
||
|
n_knots = n_knots + degree # periodic splines require degree < n_knots
|
||
|
|
||
|
splt = SplineTransformer(
|
||
|
n_knots=n_knots,
|
||
|
degree=degree,
|
||
|
knots=knots,
|
||
|
include_bias=True,
|
||
|
extrapolation=extrapolation,
|
||
|
)
|
||
|
splt.fit(X_train)
|
||
|
for X in [X_train, X_test]:
|
||
|
assert_allclose(np.sum(splt.transform(X), axis=1), 1)
|
||
|
|
||
|
|
||
|
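# A minimal standalone sketch (added for illustration, not part of the original
# suite) of the same partition-of-unity property, using scipy's BSpline
# directly. It assumes scipy >= 1.8 for `BSpline.design_matrix`; the padded
# knot vector mimics what SplineTransformer builds internally.
def _sketch_partition_of_unity():
    degree = 3
    base_knots = np.linspace(0, 1, 7)
    # Repeat the boundary knots `degree` times so the basis covers [0, 1].
    t = np.r_[[base_knots[0]] * degree, base_knots, [base_knots[-1]] * degree]
    x = np.linspace(0, 1, 50)
    design = BSpline.design_matrix(x, t, degree).toarray()
    # Within the base interval, the B-spline basis sums to exactly 1 per row.
    assert_allclose(design.sum(axis=1), 1.0)

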
@pytest.mark.parametrize(["bias", "intercept"], [(True, False), (False, True)])
|
||
|
def test_spline_transformer_linear_regression(bias, intercept):
|
||
|
"""Test that B-splines fit a sinusodial curve pretty well."""
|
||
|
X = np.linspace(0, 10, 100)[:, None]
|
||
|
y = np.sin(X[:, 0]) + 2 # +2 to avoid the value 0 in assert_allclose
|
||
|
pipe = Pipeline(
|
||
|
steps=[
|
||
|
(
|
||
|
"spline",
|
||
|
SplineTransformer(
|
||
|
n_knots=15,
|
||
|
degree=3,
|
||
|
include_bias=bias,
|
||
|
extrapolation="constant",
|
||
|
),
|
||
|
),
|
||
|
("ols", LinearRegression(fit_intercept=intercept)),
|
||
|
]
|
||
|
)
|
||
|
pipe.fit(X, y)
|
||
|
assert_allclose(pipe.predict(X), y, rtol=1e-3)
|
||
|
|
||
|
|
||
|
@pytest.mark.parametrize(
    ["knots", "n_knots", "sample_weight", "expected_knots"],
    [
        ("uniform", 3, None, np.array([[0, 2], [3, 8], [6, 14]])),
        (
            "uniform",
            3,
            np.array([0, 0, 1, 1, 0, 3, 1]),
            np.array([[2, 2], [4, 8], [6, 14]]),
        ),
        ("uniform", 4, None, np.array([[0, 2], [2, 6], [4, 10], [6, 14]])),
        ("quantile", 3, None, np.array([[0, 2], [3, 3], [6, 14]])),
        (
            "quantile",
            3,
            np.array([0, 0, 1, 1, 0, 3, 1]),
            np.array([[2, 2], [5, 8], [6, 14]]),
        ),
    ],
)
def test_spline_transformer_get_base_knot_positions(
    knots, n_knots, sample_weight, expected_knots
):
    """Check the computation of knot positions with and without sample_weight."""
    X = np.array([[0, 2], [0, 2], [2, 2], [3, 3], [4, 6], [5, 8], [6, 14]])
    base_knots = SplineTransformer._get_base_knot_positions(
        X=X, knots=knots, n_knots=n_knots, sample_weight=sample_weight
    )
    assert_allclose(base_knots, expected_knots)


@pytest.mark.parametrize(["bias", "intercept"], [(True, False), (False, True)])
|
||
|
def test_spline_transformer_periodic_linear_regression(bias, intercept):
|
||
|
"""Test that B-splines fit a periodic curve pretty well."""
|
||
|
|
||
|
# "+ 3" to avoid the value 0 in assert_allclose
|
||
|
def f(x):
|
||
|
return np.sin(2 * np.pi * x) - np.sin(8 * np.pi * x) + 3
|
||
|
|
||
|
X = np.linspace(0, 1, 101)[:, None]
|
||
|
pipe = Pipeline(
|
||
|
steps=[
|
||
|
(
|
||
|
"spline",
|
||
|
SplineTransformer(
|
||
|
n_knots=20,
|
||
|
degree=3,
|
||
|
include_bias=bias,
|
||
|
extrapolation="periodic",
|
||
|
),
|
||
|
),
|
||
|
("ols", LinearRegression(fit_intercept=intercept)),
|
||
|
]
|
||
|
)
|
||
|
pipe.fit(X, f(X[:, 0]))
|
||
|
|
||
|
# Generate larger array to check periodic extrapolation
|
||
|
X_ = np.linspace(-1, 2, 301)[:, None]
|
||
|
predictions = pipe.predict(X_)
|
||
|
assert_allclose(predictions, f(X_[:, 0]), atol=0.01, rtol=0.01)
|
||
|
assert_allclose(predictions[0:100], predictions[100:200], rtol=1e-3)
|
||
|
|
||
|
|
||
|
def test_spline_transformer_periodic_spline_backport():
    """Test that the backport of extrapolate="periodic" works correctly."""
    X = np.linspace(-2, 3.5, 10)[:, None]
    degree = 2

    # Use periodic extrapolation backport in SplineTransformer
    transformer = SplineTransformer(
        degree=degree, extrapolation="periodic", knots=[[-1.0], [0.0], [1.0]]
    )
    Xt = transformer.fit_transform(X)

    # Use periodic extrapolation in BSpline
    coef = np.array([[1.0, 0.0], [0.0, 1.0], [1.0, 0.0], [0.0, 1.0]])
    spl = BSpline(np.arange(-3, 4), coef, degree, "periodic")
    Xspl = spl(X[:, 0])
    assert_allclose(Xt, Xspl)


def test_spline_transformer_periodic_splines_periodicity():
    """Test if shifted knots result in the same transformation up to permutation."""
    X = np.linspace(0, 10, 101)[:, None]

    transformer_1 = SplineTransformer(
        degree=3,
        extrapolation="periodic",
        knots=[[0.0], [1.0], [3.0], [4.0], [5.0], [8.0]],
    )

    transformer_2 = SplineTransformer(
        degree=3,
        extrapolation="periodic",
        knots=[[1.0], [3.0], [4.0], [5.0], [8.0], [9.0]],
    )

    Xt_1 = transformer_1.fit_transform(X)
    Xt_2 = transformer_2.fit_transform(X)
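
    # (Added note) Both knot vectors span one period of length 8 (8 - 0 and
    # 9 - 1, respectively) and contain the same knot positions modulo that
    # period, shifted by one slot; a cyclic shift of periodic knots rotates
    # the basis columns, hence the column permutation in the assertion below.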
    assert_allclose(Xt_1, Xt_2[:, [4, 0, 1, 2, 3]])


@pytest.mark.parametrize("degree", [3, 5])
|
||
|
def test_spline_transformer_periodic_splines_smoothness(degree):
|
||
|
"""Test that spline transformation is smooth at first / last knot."""
|
||
|
X = np.linspace(-2, 10, 10_000)[:, None]
|
||
|
|
||
|
transformer = SplineTransformer(
|
||
|
degree=degree,
|
||
|
extrapolation="periodic",
|
||
|
knots=[[0.0], [1.0], [3.0], [4.0], [5.0], [8.0]],
|
||
|
)
|
||
|
Xt = transformer.fit_transform(X)
|
||
|
|
||
|
delta = (X.max() - X.min()) / len(X)
|
||
|
tol = 10 * delta
|
||
|
|
||
|
dXt = Xt
|
||
|
# We expect splines of degree `degree` to be (`degree`-1) times
|
||
|
# continuously differentiable. I.e. for d = 0, ..., `degree` - 1 the d-th
|
||
|
# derivative should be continuous. This is the case if the (d+1)-th
|
||
|
# numerical derivative is reasonably small (smaller than `tol` in absolute
|
||
|
# value). We thus compute d-th numeric derivatives for d = 1, ..., `degree`
|
||
|
# and compare them to `tol`.
|
||
|
#
|
||
|
# Note that the 0-th derivative is the function itself, such that we are
|
||
|
# also checking its continuity.
|
||
|
for d in range(1, degree + 1):
|
||
|
# Check continuity of the (d-1)-th derivative
|
||
|
diff = np.diff(dXt, axis=0)
|
||
|
assert np.abs(diff).max() < tol
|
||
|
# Compute d-th numeric derivative
|
||
|
dXt = diff / delta
|
||
|
|
||
|
# As degree `degree` splines are not `degree` times continuously
|
||
|
# differentiable at the knots, the `degree + 1`-th numeric derivative
|
||
|
# should have spikes at the knots.
|
||
|
diff = np.diff(dXt, axis=0)
|
||
|
assert np.abs(diff).max() > 1
|
||
|
|
||
|
|
||
|
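# A quick illustration (added, not part of the original suite) of the
# finite-difference scheme used above, applied to a function with a known
# kink: |x| is continuous, so its consecutive differences are O(delta), while
# its first numeric derivative jumps from -1 to +1 at x = 0, producing an O(1)
# spike in the second difference.
def _sketch_numeric_derivative_spikes():
    x = np.linspace(-1, 1, 1001)
    delta = x[1] - x[0]
    fx = np.abs(x)
    # The function itself is continuous: consecutive differences are O(delta).
    assert np.abs(np.diff(fx)).max() <= delta + 1e-12
    # Its first numeric derivative jumps at x = 0, so the second difference
    # has a spike of order 1 there.
    dfx = np.diff(fx) / delta
    assert np.abs(np.diff(dfx)).max() > 1

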
@pytest.mark.parametrize(["bias", "intercept"], [(True, False), (False, True)])
|
||
|
@pytest.mark.parametrize("degree", [1, 2, 3, 4, 5])
|
||
|
def test_spline_transformer_extrapolation(bias, intercept, degree):
|
||
|
"""Test that B-spline extrapolation works correctly."""
|
||
|
# we use a straight line for that
|
||
|
X = np.linspace(-1, 1, 100)[:, None]
|
||
|
y = X.squeeze()
|
||
|
|
||
|
# 'constant'
|
||
|
pipe = Pipeline(
|
||
|
[
|
||
|
[
|
||
|
"spline",
|
||
|
SplineTransformer(
|
||
|
n_knots=4,
|
||
|
degree=degree,
|
||
|
include_bias=bias,
|
||
|
extrapolation="constant",
|
||
|
),
|
||
|
],
|
||
|
["ols", LinearRegression(fit_intercept=intercept)],
|
||
|
]
|
||
|
)
|
||
|
pipe.fit(X, y)
|
||
|
assert_allclose(pipe.predict([[-10], [5]]), [-1, 1])
|
||
|
|
||
|
# 'linear'
|
||
|
pipe = Pipeline(
|
||
|
[
|
||
|
[
|
||
|
"spline",
|
||
|
SplineTransformer(
|
||
|
n_knots=4,
|
||
|
degree=degree,
|
||
|
include_bias=bias,
|
||
|
extrapolation="linear",
|
||
|
),
|
||
|
],
|
||
|
["ols", LinearRegression(fit_intercept=intercept)],
|
||
|
]
|
||
|
)
|
||
|
pipe.fit(X, y)
|
||
|
assert_allclose(pipe.predict([[-10], [5]]), [-10, 5])
|
||
|
|
||
|
# 'error'
|
||
|
splt = SplineTransformer(
|
||
|
n_knots=4, degree=degree, include_bias=bias, extrapolation="error"
|
||
|
)
|
||
|
splt.fit(X)
|
||
|
msg = "X contains values beyond the limits of the knots"
|
||
|
with pytest.raises(ValueError, match=msg):
|
||
|
splt.transform([[-10]])
|
||
|
with pytest.raises(ValueError, match=msg):
|
||
|
splt.transform([[5]])
|
||
|
|
||
|
|
||
|
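# Illustrative sketch (added, not part of the original suite): with
# extrapolation="constant", values outside the training range are transformed
# as if they sat on the nearest boundary, which is why the prediction above
# saturates at the edge values -1 and 1.
def _sketch_constant_extrapolation_clamps():
    X = np.linspace(-1, 1, 100)[:, None]
    splt = SplineTransformer(n_knots=4, degree=3, extrapolation="constant").fit(X)
    # Points far outside the range get the same basis values as the boundaries.
    assert_allclose(splt.transform([[-10]]), splt.transform([[-1]]))
    assert_allclose(splt.transform([[5]]), splt.transform([[1]]))

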
def test_spline_transformer_kbindiscretizer():
    """Test that a B-spline of degree=0 is equivalent to KBinsDiscretizer."""
    rng = np.random.RandomState(97531)
    X = rng.randn(200).reshape(200, 1)
    n_bins = 5
    n_knots = n_bins + 1

    splt = SplineTransformer(
        n_knots=n_knots, degree=0, knots="quantile", include_bias=True
    )
    splines = splt.fit_transform(X)

    kbd = KBinsDiscretizer(n_bins=n_bins, encode="onehot-dense", strategy="quantile")
    kbins = kbd.fit_transform(X)

    # Though they should be exactly equal, we test approximately with high
    # accuracy.
    assert_allclose(splines, kbins, rtol=1e-13)


@pytest.mark.skipif(
    sp_version < parse_version("1.8.0"),
    reason="The option `sparse_output` is available as of scipy 1.8.0",
)
@pytest.mark.parametrize("degree", range(1, 3))
@pytest.mark.parametrize("knots", ["uniform", "quantile"])
@pytest.mark.parametrize(
    "extrapolation", ["error", "constant", "linear", "continue", "periodic"]
)
@pytest.mark.parametrize("include_bias", [False, True])
def test_spline_transformer_sparse_output(
    degree, knots, extrapolation, include_bias, global_random_seed
):
    rng = np.random.RandomState(global_random_seed)
    X = rng.randn(200).reshape(40, 5)

    splt_dense = SplineTransformer(
        degree=degree,
        knots=knots,
        extrapolation=extrapolation,
        include_bias=include_bias,
        sparse_output=False,
    )
    splt_sparse = SplineTransformer(
        degree=degree,
        knots=knots,
        extrapolation=extrapolation,
        include_bias=include_bias,
        sparse_output=True,
    )

    splt_dense.fit(X)
    splt_sparse.fit(X)

    X_trans_sparse = splt_sparse.transform(X)
    X_trans_dense = splt_dense.transform(X)
    assert sparse.issparse(X_trans_sparse) and X_trans_sparse.format == "csr"
    assert_allclose(X_trans_dense, X_trans_sparse.toarray())

    # extrapolation regime
    X_min = np.amin(X, axis=0)
    X_max = np.amax(X, axis=0)
    X_extra = np.r_[
        np.linspace(X_min - 5, X_min, 10), np.linspace(X_max, X_max + 5, 10)
    ]
    if extrapolation == "error":
        msg = "X contains values beyond the limits of the knots"
        with pytest.raises(ValueError, match=msg):
            splt_dense.transform(X_extra)
        msg = "Out of bounds"
        with pytest.raises(ValueError, match=msg):
            splt_sparse.transform(X_extra)
    else:
        assert_allclose(
            splt_dense.transform(X_extra), splt_sparse.transform(X_extra).toarray()
        )


@pytest.mark.skipif(
    sp_version >= parse_version("1.8.0"),
    reason="The option `sparse_output` is available as of scipy 1.8.0",
)
def test_spline_transformer_sparse_output_raise_error_for_old_scipy():
    """Test that SplineTransformer with sparse_output=True raises for scipy<1.8.0."""
    X = [[1], [2]]
    with pytest.raises(ValueError, match="scipy>=1.8.0"):
        SplineTransformer(sparse_output=True).fit(X)


@pytest.mark.parametrize("n_knots", [5, 10])
|
||
|
@pytest.mark.parametrize("include_bias", [True, False])
|
||
|
@pytest.mark.parametrize("degree", [3, 4])
|
||
|
@pytest.mark.parametrize(
|
||
|
"extrapolation", ["error", "constant", "linear", "continue", "periodic"]
|
||
|
)
|
||
|
@pytest.mark.parametrize("sparse_output", [False, True])
|
||
|
def test_spline_transformer_n_features_out(
|
||
|
n_knots, include_bias, degree, extrapolation, sparse_output
|
||
|
):
|
||
|
"""Test that transform results in n_features_out_ features."""
|
||
|
if sparse_output and sp_version < parse_version("1.8.0"):
|
||
|
pytest.skip("The option `sparse_output` is available as of scipy 1.8.0")
|
||
|
|
||
|
splt = SplineTransformer(
|
||
|
n_knots=n_knots,
|
||
|
degree=degree,
|
||
|
include_bias=include_bias,
|
||
|
extrapolation=extrapolation,
|
||
|
sparse_output=sparse_output,
|
||
|
)
|
||
|
X = np.linspace(0, 1, 10)[:, None]
|
||
|
splt.fit(X)
|
||
|
|
||
|
assert splt.transform(X).shape[1] == splt.n_features_out_
|
||
|
|
||
|
|
||
|
@pytest.mark.parametrize(
    "params, err_msg",
    [
        ({"degree": (-1, 2)}, r"degree=\(min_degree, max_degree\) must"),
        ({"degree": (0, 1.5)}, r"degree=\(min_degree, max_degree\) must"),
        ({"degree": (3, 2)}, r"degree=\(min_degree, max_degree\) must"),
        ({"degree": (1, 2, 3)}, r"int or tuple \(min_degree, max_degree\)"),
    ],
)
def test_polynomial_features_input_validation(params, err_msg):
    """Test that we raise errors for invalid input in PolynomialFeatures."""
    X = [[1], [2]]

    with pytest.raises(ValueError, match=err_msg):
        PolynomialFeatures(**params).fit(X)


@pytest.fixture()
def single_feature_degree3():
    X = np.arange(6)[:, np.newaxis]
    P = np.hstack([np.ones_like(X), X, X**2, X**3])
    return X, P


@pytest.mark.parametrize(
    "degree, include_bias, interaction_only, indices",
    [
        (3, True, False, slice(None, None)),
        (3, False, False, slice(1, None)),
        (3, True, True, [0, 1]),
        (3, False, True, [1]),
        ((2, 3), True, False, [0, 2, 3]),
        ((2, 3), False, False, [2, 3]),
        ((2, 3), True, True, [0]),
        ((2, 3), False, True, []),
    ],
)
@pytest.mark.parametrize("X_container", [None] + CSR_CONTAINERS + CSC_CONTAINERS)
def test_polynomial_features_one_feature(
    single_feature_degree3,
    degree,
    include_bias,
    interaction_only,
    indices,
    X_container,
):
    """Test PolynomialFeatures on a single feature up to degree 3."""
    X, P = single_feature_degree3
    if X_container is not None:
        X = X_container(X)
    tf = PolynomialFeatures(
        degree=degree, include_bias=include_bias, interaction_only=interaction_only
    ).fit(X)
    out = tf.transform(X)
    if X_container is not None:
        out = out.toarray()
    assert_allclose(out, P[:, indices])
    if tf.n_output_features_ > 0:
        assert tf.powers_.shape == (tf.n_output_features_, tf.n_features_in_)


@pytest.fixture()
def two_features_degree3():
    X = np.arange(6).reshape((3, 2))
    x1 = X[:, :1]
    x2 = X[:, 1:]
    P = np.hstack(
        [
            x1**0 * x2**0,  # 0
            x1**1 * x2**0,  # 1
            x1**0 * x2**1,  # 2
            x1**2 * x2**0,  # 3
            x1**1 * x2**1,  # 4
            x1**0 * x2**2,  # 5
            x1**3 * x2**0,  # 6
            x1**2 * x2**1,  # 7
            x1**1 * x2**2,  # 8
            x1**0 * x2**3,  # 9
        ]
    )
    return X, P


@pytest.mark.parametrize(
    "degree, include_bias, interaction_only, indices",
    [
        (2, True, False, slice(0, 6)),
        (2, False, False, slice(1, 6)),
        (2, True, True, [0, 1, 2, 4]),
        (2, False, True, [1, 2, 4]),
        ((2, 2), True, False, [0, 3, 4, 5]),
        ((2, 2), False, False, [3, 4, 5]),
        ((2, 2), True, True, [0, 4]),
        ((2, 2), False, True, [4]),
        (3, True, False, slice(None, None)),
        (3, False, False, slice(1, None)),
        (3, True, True, [0, 1, 2, 4]),
        (3, False, True, [1, 2, 4]),
        ((2, 3), True, False, [0, 3, 4, 5, 6, 7, 8, 9]),
        ((2, 3), False, False, slice(3, None)),
        ((2, 3), True, True, [0, 4]),
        ((2, 3), False, True, [4]),
        ((3, 3), True, False, [0, 6, 7, 8, 9]),
        ((3, 3), False, False, [6, 7, 8, 9]),
        ((3, 3), True, True, [0]),
        ((3, 3), False, True, []),  # would need 3 input features
    ],
)
@pytest.mark.parametrize("X_container", [None] + CSR_CONTAINERS + CSC_CONTAINERS)
def test_polynomial_features_two_features(
    two_features_degree3,
    degree,
    include_bias,
    interaction_only,
    indices,
    X_container,
):
    """Test PolynomialFeatures on 2 features up to degree 3."""
    X, P = two_features_degree3
    if X_container is not None:
        X = X_container(X)
    tf = PolynomialFeatures(
        degree=degree, include_bias=include_bias, interaction_only=interaction_only
    ).fit(X)
    out = tf.transform(X)
    if X_container is not None:
        out = out.toarray()
    assert_allclose(out, P[:, indices])
    if tf.n_output_features_ > 0:
        assert tf.powers_.shape == (tf.n_output_features_, tf.n_features_in_)


def test_polynomial_feature_names():
    X = np.arange(30).reshape(10, 3)
    poly = PolynomialFeatures(degree=2, include_bias=True).fit(X)
    feature_names = poly.get_feature_names_out()
    assert_array_equal(
        ["1", "x0", "x1", "x2", "x0^2", "x0 x1", "x0 x2", "x1^2", "x1 x2", "x2^2"],
        feature_names,
    )
    assert len(feature_names) == poly.transform(X).shape[1]

    poly = PolynomialFeatures(degree=3, include_bias=False).fit(X)
    feature_names = poly.get_feature_names_out(["a", "b", "c"])
    assert_array_equal(
        [
            "a",
            "b",
            "c",
            "a^2",
            "a b",
            "a c",
            "b^2",
            "b c",
            "c^2",
            "a^3",
            "a^2 b",
            "a^2 c",
            "a b^2",
            "a b c",
            "a c^2",
            "b^3",
            "b^2 c",
            "b c^2",
            "c^3",
        ],
        feature_names,
    )
    assert len(feature_names) == poly.transform(X).shape[1]

    poly = PolynomialFeatures(degree=(2, 3), include_bias=False).fit(X)
    feature_names = poly.get_feature_names_out(["a", "b", "c"])
    assert_array_equal(
        [
            "a^2",
            "a b",
            "a c",
            "b^2",
            "b c",
            "c^2",
            "a^3",
            "a^2 b",
            "a^2 c",
            "a b^2",
            "a b c",
            "a c^2",
            "b^3",
            "b^2 c",
            "b c^2",
            "c^3",
        ],
        feature_names,
    )
    assert len(feature_names) == poly.transform(X).shape[1]

    poly = PolynomialFeatures(
        degree=(3, 3), include_bias=True, interaction_only=True
    ).fit(X)
    feature_names = poly.get_feature_names_out(["a", "b", "c"])
    assert_array_equal(["1", "a b c"], feature_names)
    assert len(feature_names) == poly.transform(X).shape[1]

    # test some unicode
    poly = PolynomialFeatures(degree=1, include_bias=True).fit(X)
    feature_names = poly.get_feature_names_out(["\u0001F40D", "\u262e", "\u05d0"])
    assert_array_equal(["1", "\u0001F40D", "\u262e", "\u05d0"], feature_names)


@pytest.mark.parametrize(
    ["deg", "include_bias", "interaction_only", "dtype"],
    [
        (1, True, False, int),
        (2, True, False, int),
        (2, True, False, np.float32),
        (2, True, False, np.float64),
        (3, False, False, np.float64),
        (3, False, True, np.float64),
        (4, False, False, np.float64),
        (4, False, True, np.float64),
    ],
)
@pytest.mark.parametrize("csc_container", CSC_CONTAINERS)
def test_polynomial_features_csc_X(
    deg, include_bias, interaction_only, dtype, csc_container
):
    rng = np.random.RandomState(0)
    X = rng.randint(0, 2, (100, 2))
    X_csc = csc_container(X)

    est = PolynomialFeatures(
        deg, include_bias=include_bias, interaction_only=interaction_only
    )
    Xt_csc = est.fit_transform(X_csc.astype(dtype))
    Xt_dense = est.fit_transform(X.astype(dtype))

    assert sparse.issparse(Xt_csc) and Xt_csc.format == "csc"
    assert Xt_csc.dtype == Xt_dense.dtype
    assert_array_almost_equal(Xt_csc.toarray(), Xt_dense)


@pytest.mark.parametrize(
    ["deg", "include_bias", "interaction_only", "dtype"],
    [
        (1, True, False, int),
        (2, True, False, int),
        (2, True, False, np.float32),
        (2, True, False, np.float64),
        (3, False, False, np.float64),
        (3, False, True, np.float64),
    ],
)
@pytest.mark.parametrize("csr_container", CSR_CONTAINERS)
def test_polynomial_features_csr_X(
    deg, include_bias, interaction_only, dtype, csr_container
):
    rng = np.random.RandomState(0)
    X = rng.randint(0, 2, (100, 2))
    X_csr = csr_container(X)

    est = PolynomialFeatures(
        deg, include_bias=include_bias, interaction_only=interaction_only
    )
    Xt_csr = est.fit_transform(X_csr.astype(dtype))
    Xt_dense = est.fit_transform(X.astype(dtype, copy=False))

    assert sparse.issparse(Xt_csr) and Xt_csr.format == "csr"
    assert Xt_csr.dtype == Xt_dense.dtype
    assert_array_almost_equal(Xt_csr.toarray(), Xt_dense)


@pytest.mark.parametrize("n_features", [1, 4, 5])
|
||
|
@pytest.mark.parametrize(
|
||
|
"min_degree, max_degree", [(0, 1), (0, 2), (1, 3), (0, 4), (3, 4)]
|
||
|
)
|
||
|
@pytest.mark.parametrize("interaction_only", [True, False])
|
||
|
@pytest.mark.parametrize("include_bias", [True, False])
|
||
|
@pytest.mark.parametrize("csr_container", CSR_CONTAINERS)
|
||
|
def test_num_combinations(
|
||
|
n_features, min_degree, max_degree, interaction_only, include_bias, csr_container
|
||
|
):
|
||
|
"""
|
||
|
Test that n_output_features_ is calculated correctly.
|
||
|
"""
|
||
|
x = csr_container(([1], ([0], [n_features - 1])))
|
||
|
est = PolynomialFeatures(
|
||
|
degree=max_degree,
|
||
|
interaction_only=interaction_only,
|
||
|
include_bias=include_bias,
|
||
|
)
|
||
|
est.fit(x)
|
||
|
num_combos = est.n_output_features_
|
||
|
|
||
|
combos = PolynomialFeatures._combinations(
|
||
|
n_features=n_features,
|
||
|
min_degree=0,
|
||
|
max_degree=max_degree,
|
||
|
interaction_only=interaction_only,
|
||
|
include_bias=include_bias,
|
||
|
)
|
||
|
assert num_combos == sum([1 for _ in combos])
|
||
|
|
||
|
|
||
|
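# A closed-form reference for the count checked above (an illustrative sketch
# added here, not part of the original suite): without `interaction_only`,
# there are C(n + d - 1, d) monomials of degree exactly d in n features
# (combinations with replacement); with `interaction_only`, there are C(n, d)
# products of d distinct features. For example, n=3 features with degree <= 2
# and a bias yields 1 + 3 + 6 = 10 output columns, matching the first
# assertion in `test_polynomial_feature_names` above.
def _sketch_combination_count(n_features, max_degree, interaction_only, include_bias):
    from math import comb

    start = 0 if include_bias else 1
    if interaction_only:
        return sum(comb(n_features, d) for d in range(start, max_degree + 1))
    return sum(comb(n_features + d - 1, d) for d in range(start, max_degree + 1))

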
@pytest.mark.parametrize(
    ["deg", "include_bias", "interaction_only", "dtype"],
    [
        (2, True, False, np.float32),
        (2, True, False, np.float64),
        (3, False, False, np.float64),
        (3, False, True, np.float64),
    ],
)
@pytest.mark.parametrize("csr_container", CSR_CONTAINERS)
def test_polynomial_features_csr_X_floats(
    deg, include_bias, interaction_only, dtype, csr_container
):
    X_csr = csr_container(sparse_random(1000, 10, 0.5, random_state=0))
    X = X_csr.toarray()

    est = PolynomialFeatures(
        deg, include_bias=include_bias, interaction_only=interaction_only
    )
    Xt_csr = est.fit_transform(X_csr.astype(dtype))
    Xt_dense = est.fit_transform(X.astype(dtype))

    assert sparse.issparse(Xt_csr) and Xt_csr.format == "csr"
    assert Xt_csr.dtype == Xt_dense.dtype
    assert_array_almost_equal(Xt_csr.toarray(), Xt_dense)


@pytest.mark.parametrize(
    ["zero_row_index", "deg", "interaction_only"],
    [
        (0, 2, True),
        (1, 2, True),
        (2, 2, True),
        (0, 3, True),
        (1, 3, True),
        (2, 3, True),
        (0, 2, False),
        (1, 2, False),
        (2, 2, False),
        (0, 3, False),
        (1, 3, False),
        (2, 3, False),
    ],
)
@pytest.mark.parametrize("csr_container", CSR_CONTAINERS)
def test_polynomial_features_csr_X_zero_row(
    zero_row_index, deg, interaction_only, csr_container
):
    X_csr = csr_container(sparse_random(3, 10, 1.0, random_state=0))
    X_csr[zero_row_index, :] = 0.0
    X = X_csr.toarray()

    est = PolynomialFeatures(
        deg, include_bias=False, interaction_only=interaction_only
    )
    Xt_csr = est.fit_transform(X_csr)
    Xt_dense = est.fit_transform(X)

    assert sparse.issparse(Xt_csr) and Xt_csr.format == "csr"
    assert Xt_csr.dtype == Xt_dense.dtype
    assert_array_almost_equal(Xt_csr.toarray(), Xt_dense)


# This degree should always be one more than the highest degree supported by
# _csr_expansion.
@pytest.mark.parametrize(
    ["include_bias", "interaction_only"],
    [(True, True), (True, False), (False, True), (False, False)],
)
@pytest.mark.parametrize("csr_container", CSR_CONTAINERS)
def test_polynomial_features_csr_X_degree_4(
    include_bias, interaction_only, csr_container
):
    X_csr = csr_container(sparse_random(1000, 10, 0.5, random_state=0))
    X = X_csr.toarray()

    est = PolynomialFeatures(
        4, include_bias=include_bias, interaction_only=interaction_only
    )
    Xt_csr = est.fit_transform(X_csr)
    Xt_dense = est.fit_transform(X)

    assert sparse.issparse(Xt_csr) and Xt_csr.format == "csr"
    assert Xt_csr.dtype == Xt_dense.dtype
    assert_array_almost_equal(Xt_csr.toarray(), Xt_dense)


@pytest.mark.parametrize(
    ["deg", "dim", "interaction_only"],
    [
        (2, 1, True),
        (2, 2, True),
        (3, 1, True),
        (3, 2, True),
        (3, 3, True),
        (2, 1, False),
        (2, 2, False),
        (3, 1, False),
        (3, 2, False),
        (3, 3, False),
    ],
)
@pytest.mark.parametrize("csr_container", CSR_CONTAINERS)
def test_polynomial_features_csr_X_dim_edges(
    deg, dim, interaction_only, csr_container
):
    X_csr = csr_container(sparse_random(1000, dim, 0.5, random_state=0))
    X = X_csr.toarray()

    est = PolynomialFeatures(deg, interaction_only=interaction_only)
    Xt_csr = est.fit_transform(X_csr)
    Xt_dense = est.fit_transform(X)

    assert sparse.issparse(Xt_csr) and Xt_csr.format == "csr"
    assert Xt_csr.dtype == Xt_dense.dtype
    assert_array_almost_equal(Xt_csr.toarray(), Xt_dense)


@pytest.mark.parametrize("interaction_only", [True, False])
|
||
|
@pytest.mark.parametrize("include_bias", [True, False])
|
||
|
@pytest.mark.parametrize("csr_container", CSR_CONTAINERS)
|
||
|
def test_csr_polynomial_expansion_index_overflow_non_regression(
|
||
|
interaction_only, include_bias, csr_container
|
||
|
):
|
||
|
"""Check the automatic index dtype promotion to `np.int64` when needed.
|
||
|
|
||
|
This ensures that sufficiently large input configurations get
|
||
|
properly promoted to use `np.int64` for index and indptr representation
|
||
|
while preserving data integrity. Non-regression test for gh-16803.
|
||
|
|
||
|
Note that this is only possible for Python runtimes with a 64 bit address
|
||
|
space. On 32 bit platforms, a `ValueError` is raised instead.
|
||
|
"""
|
||
|
|
||
|
def degree_2_calc(d, i, j):
|
||
|
if interaction_only:
|
||
|
return d * i - (i**2 + 3 * i) // 2 - 1 + j
|
||
|
else:
|
||
|
return d * i - (i**2 + i) // 2 + j
|
||
|
|
||
|
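
    # Worked example (added for clarity, not from the original test): with d=3
    # features and interaction_only=False, the degree-2 block enumerates the
    # pairs (0,0), (0,1), (0,2), (1,1), (1,2), (2,2) in that order, so
    # (i=1, j=2) maps to 3*1 - (1 + 1)//2 + 2 = 4, the fifth degree-2 column.
    # With interaction_only=True only (0,1), (0,2), (1,2) remain, and
    # (i=1, j=2) maps to 3*1 - (1 + 3)//2 - 1 + 2 = 2.
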
    n_samples = 13
    n_features = 120001
    data_dtype = np.float32
    data = np.arange(1, 5, dtype=np.int64)
    row = np.array([n_samples - 2, n_samples - 2, n_samples - 1, n_samples - 1])
    # An int64 dtype is required to avoid overflow error on Windows within the
    # `degree_2_calc` function.
    col = np.array(
        [n_features - 2, n_features - 1, n_features - 2, n_features - 1], dtype=np.int64
    )
    X = csr_container(
        (data, (row, col)),
        shape=(n_samples, n_features),
        dtype=data_dtype,
    )
    pf = PolynomialFeatures(
        interaction_only=interaction_only, include_bias=include_bias, degree=2
    )

    # Calculate the number of combinations a-priori, and if needed check for
    # the correct ValueError and terminate the test early.
    num_combinations = pf._num_combinations(
        n_features=n_features,
        min_degree=0,
        max_degree=2,
        interaction_only=pf.interaction_only,
        include_bias=pf.include_bias,
    )
    if num_combinations > np.iinfo(np.intp).max:
        msg = (
            r"The output that would result from the current configuration would have"
            r" \d* features which is too large to be indexed"
        )
        with pytest.raises(ValueError, match=msg):
            pf.fit(X)
        return
    X_trans = pf.fit_transform(X)
    row_nonzero, col_nonzero = X_trans.nonzero()
    n_degree_1_features_out = n_features + include_bias
    max_degree_2_idx = (
        degree_2_calc(n_features, col[int(not interaction_only)], col[1])
        + n_degree_1_features_out
    )

    # Account for the bias of all samples except the last two, which are
    # handled separately since they contain distinct data values.
    data_target = [1] * (n_samples - 2) if include_bias else []
    col_nonzero_target = [0] * (n_samples - 2) if include_bias else []

    for i in range(2):
        x = data[2 * i]
        y = data[2 * i + 1]
        x_idx = col[2 * i]
        y_idx = col[2 * i + 1]
        if include_bias:
            data_target.append(1)
            col_nonzero_target.append(0)
        data_target.extend([x, y])
        col_nonzero_target.extend(
            [x_idx + int(include_bias), y_idx + int(include_bias)]
        )
        if not interaction_only:
            data_target.extend([x * x, x * y, y * y])
            col_nonzero_target.extend(
                [
                    degree_2_calc(n_features, x_idx, x_idx) + n_degree_1_features_out,
                    degree_2_calc(n_features, x_idx, y_idx) + n_degree_1_features_out,
                    degree_2_calc(n_features, y_idx, y_idx) + n_degree_1_features_out,
                ]
            )
        else:
            data_target.extend([x * y])
            col_nonzero_target.append(
                degree_2_calc(n_features, x_idx, y_idx) + n_degree_1_features_out
            )

    nnz_per_row = int(include_bias) + 3 + 2 * int(not interaction_only)

    assert pf.n_output_features_ == max_degree_2_idx + 1
    assert X_trans.dtype == data_dtype
    assert X_trans.shape == (n_samples, max_degree_2_idx + 1)
    assert X_trans.indptr.dtype == X_trans.indices.dtype == np.int64
    # Ensure that dtype promotion was actually required:
    assert X_trans.indices.max() > np.iinfo(np.int32).max

    row_nonzero_target = list(range(n_samples - 2)) if include_bias else []
    row_nonzero_target.extend(
        [n_samples - 2] * nnz_per_row + [n_samples - 1] * nnz_per_row
    )

    assert_allclose(X_trans.data, data_target)
    assert_array_equal(row_nonzero, row_nonzero_target)
    assert_array_equal(col_nonzero, col_nonzero_target)


@pytest.mark.parametrize(
    "degree, n_features",
    [
        # Needs promotion to int64 when interaction_only=False
        (2, 65535),
        (3, 2344),
        # This guarantees that the intermediate operation when calculating
        # output columns would overflow a C-long, hence checks that python-
        # longs are being used.
        (2, int(np.sqrt(np.iinfo(np.int64).max) + 1)),
        (3, 65535),
        # This case tests the second clause of the overflow check which
        # takes into account the value of `n_features` itself.
        (2, int(np.sqrt(np.iinfo(np.int64).max))),
    ],
)
@pytest.mark.parametrize("interaction_only", [True, False])
@pytest.mark.parametrize("include_bias", [True, False])
@pytest.mark.parametrize("csr_container", CSR_CONTAINERS)
def test_csr_polynomial_expansion_index_overflow(
    degree, n_features, interaction_only, include_bias, csr_container
):
    """Tests known edge cases of the dtype promotion strategy and of the
    custom Cython code, including a current bug in the upstream
    `scipy.sparse.hstack`.
    """
    data = [1.0]
    row = [0]
    col = [n_features - 1]

    # First degree index
    expected_indices = [
        n_features - 1 + int(include_bias),
    ]
    # Second degree index
    expected_indices.append(n_features * (n_features + 1) // 2 + expected_indices[0])
    # Third degree index
    expected_indices.append(
        n_features * (n_features + 1) * (n_features + 2) // 6 + expected_indices[1]
    )
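
    # (Added note) These offsets are the sizes of the preceding degree blocks:
    # with combinations with replacement there are C(n + 1, 2) = n*(n + 1)/2
    # degree-2 columns and C(n + 2, 3) = n*(n + 1)*(n + 2)/6 degree-3 columns
    # for n features, so the single nonzero entry in the last input column
    # always maps to the last column of each degree block.
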
    X = csr_container((data, (row, col)))
    pf = PolynomialFeatures(
        interaction_only=interaction_only, include_bias=include_bias, degree=degree
    )

    # Calculate the number of combinations a-priori, and if needed check for
    # the correct ValueError and terminate the test early.
    num_combinations = pf._num_combinations(
        n_features=n_features,
        min_degree=0,
        max_degree=degree,
        interaction_only=pf.interaction_only,
        include_bias=pf.include_bias,
    )
    if num_combinations > np.iinfo(np.intp).max:
        msg = (
            r"The output that would result from the current configuration would have"
            r" \d* features which is too large to be indexed"
        )
        with pytest.raises(ValueError, match=msg):
            pf.fit(X)
        return

    # In SciPy < 1.8, a bug occurs when an intermediate matrix in `to_stack`
    # in `hstack` fits within int32, but int64 would be required once it is
    # combined with all previous matrices in `to_stack`.
    if sp_version < parse_version("1.8.0"):
        has_bug = False
        max_int32 = np.iinfo(np.int32).max
        cumulative_size = n_features + include_bias
        for deg in range(2, degree + 1):
            max_indptr = _calc_total_nnz(X.indptr, interaction_only, deg)
            max_indices = _calc_expanded_nnz(n_features, interaction_only, deg) - 1
            cumulative_size += max_indices + 1
            needs_int64 = max(max_indices, max_indptr) > max_int32
            has_bug |= not needs_int64 and cumulative_size > max_int32
        if has_bug:
            msg = r"In scipy versions `<1.8.0`, the function `scipy.sparse.hstack`"
            with pytest.raises(ValueError, match=msg):
                X_trans = pf.fit_transform(X)
            return

    # When `n_features>=65535`, `scipy.sparse.hstack` may not use the right
    # dtype for representing indices and indptr if `n_features` is still
    # small enough so that each block matrix's indices and indptr arrays
    # can be represented with `np.int32`. We test `n_features==65535`
    # since it is guaranteed to run into this bug.
    if (
        sp_version < parse_version("1.9.2")
        and n_features == 65535
        and degree == 2
        and not interaction_only
    ):  # pragma: no cover
        msg = r"In scipy versions `<1.9.2`, the function `scipy.sparse.hstack`"
        with pytest.raises(ValueError, match=msg):
            X_trans = pf.fit_transform(X)
        return
    X_trans = pf.fit_transform(X)

    expected_dtype = np.int64 if num_combinations > np.iinfo(np.int32).max else np.int32
    # Terms higher than first degree
    non_bias_terms = 1 + (degree - 1) * int(not interaction_only)
    expected_nnz = int(include_bias) + non_bias_terms
    assert X_trans.dtype == X.dtype
    assert X_trans.shape == (1, pf.n_output_features_)
    assert X_trans.indptr.dtype == X_trans.indices.dtype == expected_dtype
    assert X_trans.nnz == expected_nnz

    if include_bias:
        assert X_trans[0, 0] == pytest.approx(1.0)
    for idx in range(non_bias_terms):
        assert X_trans[0, expected_indices[idx]] == pytest.approx(1.0)

    offset = interaction_only * n_features
    if degree == 3:
        offset *= 1 + n_features
    assert pf.n_output_features_ == expected_indices[degree - 1] + 1 - offset


@pytest.mark.parametrize("interaction_only", [True, False])
|
||
|
@pytest.mark.parametrize("include_bias", [True, False])
|
||
|
@pytest.mark.parametrize("csr_container", CSR_CONTAINERS)
|
||
|
def test_csr_polynomial_expansion_too_large_to_index(
|
||
|
interaction_only, include_bias, csr_container
|
||
|
):
|
||
|
n_features = np.iinfo(np.int64).max // 2
|
||
|
data = [1.0]
|
||
|
row = [0]
|
||
|
col = [n_features - 1]
|
||
|
X = csr_container((data, (row, col)))
|
||
|
pf = PolynomialFeatures(
|
||
|
interaction_only=interaction_only, include_bias=include_bias, degree=(2, 2)
|
||
|
)
|
||
|
msg = (
|
||
|
r"The output that would result from the current configuration would have \d*"
|
||
|
r" features which is too large to be indexed"
|
||
|
)
|
||
|
with pytest.raises(ValueError, match=msg):
|
||
|
pf.fit(X)
|
||
|
with pytest.raises(ValueError, match=msg):
|
||
|
pf.fit_transform(X)
|
||
|
|
||
|
|
||
|
@pytest.mark.parametrize("sparse_container", CSR_CONTAINERS + CSC_CONTAINERS)
|
||
|
def test_polynomial_features_behaviour_on_zero_degree(sparse_container):
|
||
|
"""Check that PolynomialFeatures raises error when degree=0 and include_bias=False,
|
||
|
and output a single constant column when include_bias=True
|
||
|
"""
|
||
|
X = np.ones((10, 2))
|
||
|
poly = PolynomialFeatures(degree=0, include_bias=False)
|
||
|
err_msg = (
|
||
|
"Setting degree to zero and include_bias to False would result in"
|
||
|
" an empty output array."
|
||
|
)
|
||
|
with pytest.raises(ValueError, match=err_msg):
|
||
|
poly.fit_transform(X)
|
||
|
|
||
|
poly = PolynomialFeatures(degree=(0, 0), include_bias=False)
|
||
|
err_msg = (
|
||
|
"Setting both min_degree and max_degree to zero and include_bias to"
|
||
|
" False would result in an empty output array."
|
||
|
)
|
||
|
with pytest.raises(ValueError, match=err_msg):
|
||
|
poly.fit_transform(X)
|
||
|
|
||
|
for _X in [X, sparse_container(X)]:
|
||
|
poly = PolynomialFeatures(degree=0, include_bias=True)
|
||
|
output = poly.fit_transform(_X)
|
||
|
# convert to dense array if needed
|
||
|
if sparse.issparse(output):
|
||
|
output = output.toarray()
|
||
|
assert_array_equal(output, np.ones((X.shape[0], 1)))
|
||
|
|
||
|
|
||
|
def test_sizeof_LARGEST_INT_t():
    # On Windows, scikit-learn is typically compiled with MSVC that
    # does not support int128 arithmetic (at the time of writing):
    # https://stackoverflow.com/a/6761962/163740
    if sys.platform == "win32" or (
        sys.maxsize <= 2**32 and sys.platform != "emscripten"
    ):
        expected_size = 8
    else:
        expected_size = 16

    assert _get_sizeof_LARGEST_INT_t() == expected_size


@pytest.mark.xfail(
    sys.platform == "win32",
    reason=(
        "On Windows, scikit-learn is typically compiled with MSVC that does not support"
        " int128 arithmetic (at the time of writing)"
    ),
    run=True,
)
@pytest.mark.parametrize("csr_container", CSR_CONTAINERS)
def test_csr_polynomial_expansion_windows_fail(csr_container):
    # Minimum needed to ensure integer overflow occurs while guaranteeing an
    # int64-indexable output.
    n_features = int(np.iinfo(np.int64).max ** (1 / 3) + 3)
    data = [1.0]
    row = [0]
    col = [n_features - 1]

    # First degree index
    expected_indices = [
        n_features - 1,
    ]
    # Second degree index
    expected_indices.append(
        int(n_features * (n_features + 1) // 2 + expected_indices[0])
    )
    # Third degree index
    expected_indices.append(
        int(n_features * (n_features + 1) * (n_features + 2) // 6 + expected_indices[1])
    )

    X = csr_container((data, (row, col)))
    pf = PolynomialFeatures(interaction_only=False, include_bias=False, degree=3)
    if sys.maxsize <= 2**32:
        msg = (
            r"The output that would result from the current configuration would"
            r" have \d*"
            r" features which is too large to be indexed"
        )
        with pytest.raises(ValueError, match=msg):
            pf.fit_transform(X)
    else:
        X_trans = pf.fit_transform(X)
        for idx in range(3):
            assert X_trans[0, expected_indices[idx]] == pytest.approx(1.0)