# Author: Alexandre Gramfort <alexandre.gramfort@inria.fr>
#         Fabian Pedregosa <fabian.pedregosa@inria.fr>
#         Maria Telenczuk <https://github.com/maikia>
#
# License: BSD 3 clause

import warnings

import numpy as np
import pytest
from scipy import linalg, sparse

from sklearn.datasets import load_iris, make_regression, make_sparse_uncorrelated
from sklearn.linear_model import LinearRegression
from sklearn.linear_model._base import (
    _preprocess_data,
    _rescale_data,
    make_dataset,
)
from sklearn.preprocessing import add_dummy_feature
from sklearn.utils._testing import (
    assert_allclose,
    assert_array_almost_equal,
    assert_array_equal,
)
from sklearn.utils.fixes import (
    COO_CONTAINERS,
    CSC_CONTAINERS,
    CSR_CONTAINERS,
    LIL_CONTAINERS,
)

rtol = 1e-6


def test_linear_regression():
    # Test LinearRegression on a simple two-point dataset.
    X = [[1], [2]]
    Y = [1, 2]

    reg = LinearRegression()
    reg.fit(X, Y)

    assert_array_almost_equal(reg.coef_, [1])
    assert_array_almost_equal(reg.intercept_, [0])
    assert_array_almost_equal(reg.predict(X), [1, 2])

    # test it also for degenerate input
    X = [[1]]
    Y = [0]

    reg = LinearRegression()
    reg.fit(X, Y)
    assert_array_almost_equal(reg.coef_, [0])
    assert_array_almost_equal(reg.intercept_, [0])
    assert_array_almost_equal(reg.predict(X), [0])
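

# A hedged cross-check sketch: for the two-point dataset above, the exact
# least-squares fit is y = 1 * x + 0, which an independent solver
# (np.polyfit) recovers as well. The test below is an illustrative addition,
# not part of the upstream suite.
def test_linear_regression_polyfit_cross_check_sketch():
    slope, intercept = np.polyfit([1, 2], [1, 2], deg=1)
    assert_allclose([slope, intercept], [1.0, 0.0], atol=1e-12)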


@pytest.mark.parametrize("sparse_container", [None] + CSR_CONTAINERS)
@pytest.mark.parametrize("fit_intercept", [True, False])
def test_linear_regression_sample_weights(
    sparse_container, fit_intercept, global_random_seed
):
    rng = np.random.RandomState(global_random_seed)

    # The closed-form check below would not work with under-determined
    # systems, so keep n_samples > n_features.
    n_samples, n_features = 6, 5

    X = rng.normal(size=(n_samples, n_features))
    if sparse_container is not None:
        X = sparse_container(X)
    y = rng.normal(size=n_samples)

    sample_weight = 1.0 + rng.uniform(size=n_samples)

    # LinearRegression with explicit sample_weight
    reg = LinearRegression(fit_intercept=fit_intercept)
    reg.fit(X, y, sample_weight=sample_weight)
    coefs1 = reg.coef_
    inter1 = reg.intercept_

    assert reg.coef_.shape == (X.shape[1],)  # sanity checks

    # Closed-form solution of the weighted least-squares problem:
    # theta = (X^T W X)^(-1) X^T W y, with the intercept handled by
    # augmenting X with a constant dummy feature.
    W = np.diag(sample_weight)
    X_aug = X if not fit_intercept else add_dummy_feature(X)

    Xw = X_aug.T @ W @ X_aug
    yw = X_aug.T @ W @ y
    coefs2 = linalg.solve(Xw, yw)

    if not fit_intercept:
        assert_allclose(coefs1, coefs2)
    else:
        assert_allclose(coefs1, coefs2[1:])
        assert_allclose(inter1, coefs2[0])
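

# A hedged companion sketch: the same weighted problem can be reduced to an
# unweighted one by rescaling rows with sqrt(sample_weight), since
# ||sqrt(W)(y - X theta)||^2 = sum_i w_i (y_i - x_i theta)^2. The test name
# and the fit_intercept=False restriction are illustrative choices here.
def test_weighted_ols_equals_rescaled_ols_sketch():
    rng = np.random.RandomState(0)
    X = rng.normal(size=(20, 3))
    y = rng.normal(size=20)
    sample_weight = 1.0 + rng.uniform(size=20)

    reg_weighted = LinearRegression(fit_intercept=False).fit(
        X, y, sample_weight=sample_weight
    )

    # Fit on rows rescaled by sqrt(w); no weights needed anymore.
    sqrt_sw = np.sqrt(sample_weight)
    reg_rescaled = LinearRegression(fit_intercept=False).fit(
        X * sqrt_sw[:, np.newaxis], y * sqrt_sw
    )
    assert_allclose(reg_weighted.coef_, reg_rescaled.coef_)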


def test_raises_value_error_if_positive_and_sparse():
    error_msg = "Sparse data was passed for X, but dense data is required."
    # X must not be sparse if positive == True
    X = sparse.eye(10)
    y = np.ones(10)

    reg = LinearRegression(positive=True)

    with pytest.raises(TypeError, match=error_msg):
        reg.fit(X, y)
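

# A hedged usage sketch: positive=True relies on a dense non-negative
# least-squares solve, so the supported workaround is to densify small
# inputs first. The test name below is illustrative, not from the upstream
# suite.
def test_positive_with_densified_input_sketch():
    X = sparse.eye(10)
    y = np.ones(10)
    reg = LinearRegression(positive=True)
    # Converting to a dense array satisfies the dense-input requirement.
    reg.fit(X.toarray(), y)
    assert np.all(reg.coef_ >= 0.0)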


@pytest.mark.parametrize("n_samples, n_features", [(2, 3), (3, 2)])
def test_raises_value_error_if_sample_weights_greater_than_1d(n_samples, n_features):
    # Sample weights must be either scalar or 1D
    rng = np.random.RandomState(0)
    X = rng.randn(n_samples, n_features)
    y = rng.randn(n_samples)
    sample_weights_OK = rng.randn(n_samples) ** 2 + 1
    sample_weights_OK_1 = 1.0
    sample_weights_OK_2 = 2.0

    reg = LinearRegression()

    # make sure the "OK" sample weights actually work
    reg.fit(X, y, sample_weights_OK)
    reg.fit(X, y, sample_weights_OK_1)
    reg.fit(X, y, sample_weights_OK_2)


def test_fit_intercept():
    # Test assertions on betas shape.
    X2 = np.array([[0.38349978, 0.61650022], [0.58853682, 0.41146318]])
    X3 = np.array(
        [[0.27677969, 0.70693172, 0.01628859], [0.08385139, 0.20692515, 0.70922346]]
    )
    y = np.array([1, 1])

    lr2_without_intercept = LinearRegression(fit_intercept=False).fit(X2, y)
    lr2_with_intercept = LinearRegression().fit(X2, y)

    lr3_without_intercept = LinearRegression(fit_intercept=False).fit(X3, y)
    lr3_with_intercept = LinearRegression().fit(X3, y)

    assert lr2_with_intercept.coef_.shape == lr2_without_intercept.coef_.shape
    assert lr3_with_intercept.coef_.shape == lr3_without_intercept.coef_.shape
    assert lr2_without_intercept.coef_.ndim == lr3_without_intercept.coef_.ndim


def test_linear_regression_sparse(global_random_seed):
    # Test that linear regression also works with sparse data
    rng = np.random.RandomState(global_random_seed)
    n = 100
    X = sparse.eye(n, n)
    beta = rng.rand(n)
    y = X @ beta

    ols = LinearRegression()
    ols.fit(X, y.ravel())
    # X is the identity, so each prediction is coef_[i] + intercept_ and the
    # exact interpolation must recover beta.
    assert_array_almost_equal(beta, ols.coef_ + ols.intercept_)

    assert_array_almost_equal(ols.predict(X) - y.ravel(), 0)


@pytest.mark.parametrize("fit_intercept", [True, False])
@pytest.mark.parametrize("csr_container", CSR_CONTAINERS)
def test_linear_regression_sparse_equal_dense(fit_intercept, csr_container):
    # Test that linear regression agrees between sparse and dense
    rng = np.random.RandomState(0)
    n_samples = 200
    n_features = 2
    X = rng.randn(n_samples, n_features)
    X[X < 0.1] = 0.0
    Xcsr = csr_container(X)
    y = rng.rand(n_samples)
    params = dict(fit_intercept=fit_intercept)
    clf_dense = LinearRegression(**params)
    clf_sparse = LinearRegression(**params)
    clf_dense.fit(X, y)
    clf_sparse.fit(Xcsr, y)
    assert clf_dense.intercept_ == pytest.approx(clf_sparse.intercept_)
    assert_allclose(clf_dense.coef_, clf_sparse.coef_)
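

# A hedged note on why this seemingly trivial agreement check matters: dense
# and sparse inputs have historically taken different solver paths inside
# LinearRegression (a direct least-squares solve vs. an iterative sparse
# solver), so numerical agreement between the two is worth asserting.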


def test_linear_regression_multiple_outcome():
    # Test multiple-outcome linear regressions
    rng = np.random.RandomState(0)
    X, y = make_regression(random_state=rng)

    Y = np.vstack((y, y)).T
    n_features = X.shape[1]

    reg = LinearRegression()
    reg.fit(X, Y)
    assert reg.coef_.shape == (2, n_features)
    Y_pred = reg.predict(X)
    reg.fit(X, y)
    y_pred = reg.predict(X)
    assert_array_almost_equal(np.vstack((y_pred, y_pred)).T, Y_pred, decimal=3)
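

# A hedged sketch of the property the multiple-outcome checks around here
# lean on: ordinary least squares decouples across output columns, so a
# multi-output fit matches per-column single-output fits. The test name
# below is illustrative.
def test_multiple_outcome_equals_per_column_fits_sketch():
    rng = np.random.RandomState(0)
    X = rng.rand(30, 4)
    Y = rng.rand(30, 2)

    multi = LinearRegression().fit(X, Y)
    for k in range(Y.shape[1]):
        single = LinearRegression().fit(X, Y[:, k])
        assert_allclose(multi.coef_[k], single.coef_)
        assert_allclose(multi.intercept_[k], single.intercept_)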


@pytest.mark.parametrize("coo_container", COO_CONTAINERS)
def test_linear_regression_sparse_multiple_outcome(global_random_seed, coo_container):
    # Test multiple-outcome linear regressions with sparse data
    rng = np.random.RandomState(global_random_seed)
    X, y = make_sparse_uncorrelated(random_state=rng)
    X = coo_container(X)
    Y = np.vstack((y, y)).T
    n_features = X.shape[1]

    ols = LinearRegression()
    ols.fit(X, Y)
    assert ols.coef_.shape == (2, n_features)
    Y_pred = ols.predict(X)
    ols.fit(X, y.ravel())
    y_pred = ols.predict(X)
    assert_array_almost_equal(np.vstack((y_pred, y_pred)).T, Y_pred, decimal=3)


def test_linear_regression_positive():
    # Test nonnegative LinearRegression on a simple dataset.
    X = [[1], [2]]
    y = [1, 2]

    reg = LinearRegression(positive=True)
    reg.fit(X, y)

    assert_array_almost_equal(reg.coef_, [1])
    assert_array_almost_equal(reg.intercept_, [0])
    assert_array_almost_equal(reg.predict(X), [1, 2])

    # test it also for degenerate input
    X = [[1]]
    y = [0]

    reg = LinearRegression(positive=True)
    reg.fit(X, y)
    assert_allclose(reg.coef_, [0])
    assert_allclose(reg.intercept_, [0])
    assert_allclose(reg.predict(X), [0])


def test_linear_regression_positive_multiple_outcome(global_random_seed):
    # Test multiple-outcome nonnegative linear regressions
    rng = np.random.RandomState(global_random_seed)
    X, y = make_sparse_uncorrelated(random_state=rng)
    Y = np.vstack((y, y)).T
    n_features = X.shape[1]

    ols = LinearRegression(positive=True)
    ols.fit(X, Y)
    assert ols.coef_.shape == (2, n_features)
    assert np.all(ols.coef_ >= 0.0)
    Y_pred = ols.predict(X)
    ols.fit(X, y.ravel())
    y_pred = ols.predict(X)
    assert_allclose(np.vstack((y_pred, y_pred)).T, Y_pred)


def test_linear_regression_positive_vs_nonpositive(global_random_seed):
    # The positive=True and positive=False fits should differ noticeably on a
    # problem whose unconstrained solution has negative coefficients.
    rng = np.random.RandomState(global_random_seed)
    X, y = make_sparse_uncorrelated(random_state=rng)

    reg = LinearRegression(positive=True)
    reg.fit(X, y)
    regn = LinearRegression(positive=False)
    regn.fit(X, y)

    assert np.mean((reg.coef_ - regn.coef_) ** 2) > 1e-3


def test_linear_regression_positive_vs_nonpositive_when_positive(global_random_seed):
    # When the true coefficients are all positive, the constrained and
    # unconstrained fits should agree.
    rng = np.random.RandomState(global_random_seed)
    n_samples = 200
    n_features = 4
    X = rng.rand(n_samples, n_features)
    y = X[:, 0] + 2 * X[:, 1] + 3 * X[:, 2] + 1.5 * X[:, 3]

    reg = LinearRegression(positive=True)
    reg.fit(X, y)
    regn = LinearRegression(positive=False)
    regn.fit(X, y)

    assert np.mean((reg.coef_ - regn.coef_) ** 2) < 1e-6
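

# A hedged sketch of the underlying mechanism (assumption: positive=True is
# backed by scipy.optimize.nnls for dense input, as in current scikit-learn):
# when the unconstrained least-squares optimum is already nonnegative, the
# NNLS constraint is inactive and both solvers return the same coefficients.
def test_nnls_matches_lstsq_when_optimum_nonnegative_sketch():
    from scipy.optimize import nnls

    rng = np.random.RandomState(0)
    X = rng.rand(100, 3)
    y = X @ np.array([1.0, 2.0, 3.0])

    coef_nnls, _ = nnls(X, y)
    coef_lstsq, *_ = np.linalg.lstsq(X, y, rcond=None)
    assert_allclose(coef_nnls, coef_lstsq, atol=1e-8)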


@pytest.mark.parametrize("sparse_container", [None] + CSR_CONTAINERS)
@pytest.mark.parametrize("use_sw", [True, False])
def test_inplace_data_preprocessing(sparse_container, use_sw, global_random_seed):
    # Check that the data is not modified inplace by the linear regression
    # estimator.
    rng = np.random.RandomState(global_random_seed)
    original_X_data = rng.randn(10, 12)
    original_y_data = rng.randn(10, 2)
    original_sw_data = rng.rand(10)

    if sparse_container is not None:
        X = sparse_container(original_X_data)
    else:
        X = original_X_data.copy()
    y = original_y_data.copy()
    # XXX: Note that y_sparse is not supported (broken?) in the current
    # implementation of LinearRegression.

    if use_sw:
        sample_weight = original_sw_data.copy()
    else:
        sample_weight = None

    # Do not allow inplace preprocessing of X and y:
    reg = LinearRegression()
    reg.fit(X, y, sample_weight=sample_weight)
    if sparse_container is not None:
        assert_allclose(X.toarray(), original_X_data)
    else:
        assert_allclose(X, original_X_data)
    assert_allclose(y, original_y_data)

    if use_sw:
        assert_allclose(sample_weight, original_sw_data)

    # Allow inplace preprocessing of X and y
    reg = LinearRegression(copy_X=False)
    reg.fit(X, y, sample_weight=sample_weight)
    if sparse_container is not None:
        # No optimization relying on the inplace modification of sparse input
        # data has been implemented at this time.
        assert_allclose(X.toarray(), original_X_data)
    else:
        # X has been offset (and optionally rescaled by sample weights)
        # inplace. The 0.42 threshold is arbitrary and has been found to be
        # robust to any random seed in the admissible range.
        assert np.linalg.norm(X - original_X_data) > 0.42

    # y should not have been modified inplace by LinearRegression.fit.
    assert_allclose(y, original_y_data)

    if use_sw:
        # Sample weights have no reason to ever be modified inplace.
        assert_allclose(sample_weight, original_sw_data)
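

# A hedged usage note on the design choice exercised above: copy_X=False is
# an opt-in memory optimization, and the checks show it only ever mutates
# dense X (never y, sample_weight, or sparse inputs), so callers opting in
# should not reuse X after fit.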


def test_linear_regression_pd_sparse_dataframe_warning():
    pd = pytest.importorskip("pandas")

    # Warning is raised only when some of the columns are sparse
    df = pd.DataFrame({"0": np.random.randn(10)})
    for col in range(1, 4):
        arr = np.random.randn(10)
        arr[:8] = 0
        # all columns but the first are sparse
        if col != 0:
            arr = pd.arrays.SparseArray(arr, fill_value=0)
        df[str(col)] = arr

    msg = "pandas.DataFrame with sparse columns found."

    reg = LinearRegression()
    with pytest.warns(UserWarning, match=msg):
        reg.fit(df.iloc[:, 0:2], df.iloc[:, 3])

    # does not warn when the whole dataframe is sparse
    df["0"] = pd.arrays.SparseArray(df["0"], fill_value=0)
    assert hasattr(df, "sparse")

    with warnings.catch_warnings():
        warnings.simplefilter("error", UserWarning)
        reg.fit(df.iloc[:, 0:2], df.iloc[:, 3])


def test_preprocess_data(global_random_seed):
    rng = np.random.RandomState(global_random_seed)
    n_samples = 200
    n_features = 2
    X = rng.rand(n_samples, n_features)
    y = rng.rand(n_samples)
    expected_X_mean = np.mean(X, axis=0)
    expected_y_mean = np.mean(y, axis=0)

    Xt, yt, X_mean, y_mean, X_scale = _preprocess_data(X, y, fit_intercept=False)
    assert_array_almost_equal(X_mean, np.zeros(n_features))
    assert_array_almost_equal(y_mean, 0)
    assert_array_almost_equal(X_scale, np.ones(n_features))
    assert_array_almost_equal(Xt, X)
    assert_array_almost_equal(yt, y)

    Xt, yt, X_mean, y_mean, X_scale = _preprocess_data(X, y, fit_intercept=True)
    assert_array_almost_equal(X_mean, expected_X_mean)
    assert_array_almost_equal(y_mean, expected_y_mean)
    assert_array_almost_equal(X_scale, np.ones(n_features))
    assert_array_almost_equal(Xt, X - expected_X_mean)
    assert_array_almost_equal(yt, y - expected_y_mean)
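

# A hedged sketch of what the centering buys: fitting the centered data
# without an intercept reproduces the intercept fit, with the intercept
# recovered as y_mean - X_mean @ coef. The test name below is illustrative.
def test_preprocess_data_intercept_recovery_sketch():
    rng = np.random.RandomState(0)
    X = rng.rand(50, 3)
    y = rng.rand(50)

    Xt, yt, X_mean, y_mean, _ = _preprocess_data(X, y, fit_intercept=True)
    coef, *_ = np.linalg.lstsq(Xt, yt, rcond=None)

    reg = LinearRegression().fit(X, y)
    assert_allclose(coef, reg.coef_)
    assert_allclose(y_mean - X_mean @ coef, reg.intercept_)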


@pytest.mark.parametrize("sparse_container", [None] + CSC_CONTAINERS)
def test_preprocess_data_multioutput(global_random_seed, sparse_container):
    rng = np.random.RandomState(global_random_seed)
    n_samples = 200
    n_features = 3
    n_outputs = 2
    X = rng.rand(n_samples, n_features)
    y = rng.rand(n_samples, n_outputs)
    expected_y_mean = np.mean(y, axis=0)

    if sparse_container is not None:
        X = sparse_container(X)

    _, yt, _, y_mean, _ = _preprocess_data(X, y, fit_intercept=False)
    assert_array_almost_equal(y_mean, np.zeros(n_outputs))
    assert_array_almost_equal(yt, y)

    _, yt, _, y_mean, _ = _preprocess_data(X, y, fit_intercept=True)
    assert_array_almost_equal(y_mean, expected_y_mean)
    assert_array_almost_equal(yt, y - y_mean)


@pytest.mark.parametrize("sparse_container", [None] + CSR_CONTAINERS)
def test_preprocess_data_weighted(sparse_container, global_random_seed):
    rng = np.random.RandomState(global_random_seed)
    n_samples = 200
    n_features = 4
    # Generate random data with 50% of zero values to make sure
    # that the sparse variant of this test is actually sparse. This also
    # shifts the mean value for each column of X further away from zero.
    X = rng.rand(n_samples, n_features)
    X[X < 0.5] = 0.0

    # Scale the first feature of X to be 10 times larger than the others to
    # better check the impact of feature scaling.
    X[:, 0] *= 10

    # Constant non-zero feature.
    X[:, 2] = 1.0

    # Constant zero feature (non-materialized in the sparse case)
    X[:, 3] = 0.0
    y = rng.rand(n_samples)

    sample_weight = rng.rand(n_samples)
    expected_X_mean = np.average(X, axis=0, weights=sample_weight)
    expected_y_mean = np.average(y, axis=0, weights=sample_weight)

    X_sample_weight_avg = np.average(X, weights=sample_weight, axis=0)
    X_sample_weight_var = np.average(
        (X - X_sample_weight_avg) ** 2, weights=sample_weight, axis=0
    )
    constant_mask = X_sample_weight_var < 10 * np.finfo(X.dtype).eps
    assert_array_equal(constant_mask, [0, 0, 1, 1])
    expected_X_scale = np.sqrt(X_sample_weight_var) * np.sqrt(sample_weight.sum())

    # near constant features should not be scaled
    expected_X_scale[constant_mask] = 1

    if sparse_container is not None:
        X = sparse_container(X)

    Xt, yt, X_mean, y_mean, X_scale = _preprocess_data(
        X,
        y,
        fit_intercept=True,
        sample_weight=sample_weight,
    )
    assert_array_almost_equal(X_mean, expected_X_mean)
    assert_array_almost_equal(y_mean, expected_y_mean)
    assert_array_almost_equal(X_scale, np.ones(n_features))
    if sparse_container is not None:
        assert_array_almost_equal(Xt.toarray(), X.toarray())
    else:
        assert_array_almost_equal(Xt, X - expected_X_mean)
    assert_array_almost_equal(yt, y - expected_y_mean)


@pytest.mark.parametrize("lil_container", LIL_CONTAINERS)
def test_sparse_preprocess_data_offsets(global_random_seed, lil_container):
    rng = np.random.RandomState(global_random_seed)
    n_samples = 200
    n_features = 2
    X = sparse.rand(n_samples, n_features, density=0.5, random_state=rng)
    X = lil_container(X)
    y = rng.rand(n_samples)
    XA = X.toarray()

    Xt, yt, X_mean, y_mean, X_scale = _preprocess_data(X, y, fit_intercept=False)
    assert_array_almost_equal(X_mean, np.zeros(n_features))
    assert_array_almost_equal(y_mean, 0)
    assert_array_almost_equal(X_scale, np.ones(n_features))
    assert_array_almost_equal(Xt.toarray(), XA)
    assert_array_almost_equal(yt, y)

    Xt, yt, X_mean, y_mean, X_scale = _preprocess_data(X, y, fit_intercept=True)
    assert_array_almost_equal(X_mean, np.mean(XA, axis=0))
    assert_array_almost_equal(y_mean, np.mean(y, axis=0))
    assert_array_almost_equal(X_scale, np.ones(n_features))
    assert_array_almost_equal(Xt.toarray(), XA)
    assert_array_almost_equal(yt, y - np.mean(y, axis=0))
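

# A hedged sketch of the design choice checked above: sparse X is
# deliberately not centered (subtracting the mean would densify it); the
# column means are returned instead so solvers can apply the offset
# implicitly. Applying the offset densely recovers the centered data. The
# test name below is illustrative.
def test_sparse_offsets_match_dense_centering_sketch():
    rng = np.random.RandomState(0)
    X = sparse.rand(50, 3, density=0.5, random_state=rng).tocsr()
    y = rng.rand(50)

    Xt, _, X_mean, _, _ = _preprocess_data(X, y, fit_intercept=True)
    assert_allclose(Xt.toarray() - X_mean, X.toarray() - X.toarray().mean(axis=0))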


@pytest.mark.parametrize("csr_container", CSR_CONTAINERS)
def test_csr_preprocess_data(csr_container):
    # Test output format of _preprocess_data, when input is csr
    X, y = make_regression()
    X[X < 2.5] = 0.0
    csr = csr_container(X)
    csr_, y, _, _, _ = _preprocess_data(csr, y, fit_intercept=True)
    assert csr_.format == "csr"


@pytest.mark.parametrize("sparse_container", [None] + CSR_CONTAINERS)
@pytest.mark.parametrize("to_copy", (True, False))
def test_preprocess_copy_data_no_checks(sparse_container, to_copy):
    X, y = make_regression()
    X[X < 2.5] = 0.0

    if sparse_container is not None:
        X = sparse_container(X)

    X_, y_, _, _, _ = _preprocess_data(
        X, y, fit_intercept=True, copy=to_copy, check_input=False
    )

    if to_copy and sparse_container is not None:
        assert not np.may_share_memory(X_.data, X.data)
    elif to_copy:
        assert not np.may_share_memory(X_, X)
    elif sparse_container is not None:
        assert np.may_share_memory(X_.data, X.data)
    else:
        assert np.may_share_memory(X_, X)


def test_dtype_preprocess_data(global_random_seed):
    rng = np.random.RandomState(global_random_seed)
    n_samples = 200
    n_features = 2
    X = rng.rand(n_samples, n_features)
    y = rng.rand(n_samples)

    X_32 = np.asarray(X, dtype=np.float32)
    y_32 = np.asarray(y, dtype=np.float32)
    X_64 = np.asarray(X, dtype=np.float64)
    y_64 = np.asarray(y, dtype=np.float64)

    for fit_intercept in [True, False]:
        Xt_32, yt_32, X_mean_32, y_mean_32, X_scale_32 = _preprocess_data(
            X_32,
            y_32,
            fit_intercept=fit_intercept,
        )

        Xt_64, yt_64, X_mean_64, y_mean_64, X_scale_64 = _preprocess_data(
            X_64,
            y_64,
            fit_intercept=fit_intercept,
        )

        Xt_3264, yt_3264, X_mean_3264, y_mean_3264, X_scale_3264 = _preprocess_data(
            X_32,
            y_64,
            fit_intercept=fit_intercept,
        )

        Xt_6432, yt_6432, X_mean_6432, y_mean_6432, X_scale_6432 = _preprocess_data(
            X_64,
            y_32,
            fit_intercept=fit_intercept,
        )

        # The output dtype must follow X's dtype, not y's.
        assert Xt_32.dtype == np.float32
        assert yt_32.dtype == np.float32
        assert X_mean_32.dtype == np.float32
        assert y_mean_32.dtype == np.float32
        assert X_scale_32.dtype == np.float32

        assert Xt_64.dtype == np.float64
        assert yt_64.dtype == np.float64
        assert X_mean_64.dtype == np.float64
        assert y_mean_64.dtype == np.float64
        assert X_scale_64.dtype == np.float64

        assert Xt_3264.dtype == np.float32
        assert yt_3264.dtype == np.float32
        assert X_mean_3264.dtype == np.float32
        assert y_mean_3264.dtype == np.float32
        assert X_scale_3264.dtype == np.float32

        assert Xt_6432.dtype == np.float64
        assert yt_6432.dtype == np.float64
        assert X_mean_6432.dtype == np.float64
        assert y_mean_6432.dtype == np.float64
        assert X_scale_6432.dtype == np.float64

        # The inputs must not have been modified or upcast.
        assert X_32.dtype == np.float32
        assert y_32.dtype == np.float32
        assert X_64.dtype == np.float64
        assert y_64.dtype == np.float64

        assert_array_almost_equal(Xt_32, Xt_64)
        assert_array_almost_equal(yt_32, yt_64)
        assert_array_almost_equal(X_mean_32, X_mean_64)
        assert_array_almost_equal(y_mean_32, y_mean_64)
        assert_array_almost_equal(X_scale_32, X_scale_64)


@pytest.mark.parametrize("n_targets", [None, 2])
@pytest.mark.parametrize("sparse_container", [None] + CSR_CONTAINERS)
def test_rescale_data(n_targets, sparse_container, global_random_seed):
    rng = np.random.RandomState(global_random_seed)
    n_samples = 200
    n_features = 2

    sample_weight = 1.0 + rng.rand(n_samples)
    X = rng.rand(n_samples, n_features)
    if n_targets is None:
        y = rng.rand(n_samples)
    else:
        y = rng.rand(n_samples, n_targets)

    expected_sqrt_sw = np.sqrt(sample_weight)
    expected_rescaled_X = X * expected_sqrt_sw[:, np.newaxis]

    if n_targets is None:
        expected_rescaled_y = y * expected_sqrt_sw
    else:
        expected_rescaled_y = y * expected_sqrt_sw[:, np.newaxis]

    if sparse_container is not None:
        X = sparse_container(X)
        if n_targets is None:
            y = sparse_container(y.reshape(-1, 1))
        else:
            y = sparse_container(y)

    rescaled_X, rescaled_y, sqrt_sw = _rescale_data(X, y, sample_weight)

    assert_allclose(sqrt_sw, expected_sqrt_sw)

    if sparse_container is not None:
        rescaled_X = rescaled_X.toarray()
        rescaled_y = rescaled_y.toarray()
        if n_targets is None:
            rescaled_y = rescaled_y.ravel()

    assert_allclose(rescaled_X, expected_rescaled_X)
    assert_allclose(rescaled_y, expected_rescaled_y)
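

# A hedged note: the sqrt(sample_weight) row scaling verified here is the
# identity that reduces weighted least squares to ordinary least squares,
# the same reduction cross-checked by the sketch following
# test_linear_regression_sample_weights above.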


@pytest.mark.parametrize("csr_container", CSR_CONTAINERS)
def test_fused_types_make_dataset(csr_container):
    iris = load_iris()

    X_32 = iris.data.astype(np.float32)
    y_32 = iris.target.astype(np.float32)
    X_csr_32 = csr_container(X_32)
    sample_weight_32 = np.arange(y_32.size, dtype=np.float32)

    X_64 = iris.data.astype(np.float64)
    y_64 = iris.target.astype(np.float64)
    X_csr_64 = csr_container(X_64)
    sample_weight_64 = np.arange(y_64.size, dtype=np.float64)

    # array
    dataset_32, _ = make_dataset(X_32, y_32, sample_weight_32)
    dataset_64, _ = make_dataset(X_64, y_64, sample_weight_64)
    xi_32, yi_32, _, _ = dataset_32._next_py()
    xi_64, yi_64, _, _ = dataset_64._next_py()
    xi_data_32, _, _ = xi_32
    xi_data_64, _, _ = xi_64

    assert xi_data_32.dtype == np.float32
    assert xi_data_64.dtype == np.float64
    assert_allclose(yi_64, yi_32, rtol=rtol)

    # csr
    datasetcsr_32, _ = make_dataset(X_csr_32, y_32, sample_weight_32)
    datasetcsr_64, _ = make_dataset(X_csr_64, y_64, sample_weight_64)
    xicsr_32, yicsr_32, _, _ = datasetcsr_32._next_py()
    xicsr_64, yicsr_64, _, _ = datasetcsr_64._next_py()
    xicsr_data_32, _, _ = xicsr_32
    xicsr_data_64, _, _ = xicsr_64

    assert xicsr_data_32.dtype == np.float32
    assert xicsr_data_64.dtype == np.float64

    assert_allclose(xicsr_data_64, xicsr_data_32, rtol=rtol)
    assert_allclose(yicsr_64, yicsr_32, rtol=rtol)

    assert_array_equal(xi_data_32, xicsr_data_32)
    assert_array_equal(xi_data_64, xicsr_data_64)
    assert_array_equal(yi_32, yicsr_32)
    assert_array_equal(yi_64, yicsr_64)


@pytest.mark.parametrize("sparse_container", [None] + CSR_CONTAINERS)
@pytest.mark.parametrize("fit_intercept", [False, True])
def test_linear_regression_sample_weight_consistency(
    sparse_container, fit_intercept, global_random_seed
):
    """Test that the impact of sample_weight is consistent.

    Note that this test is stricter than the common test
    check_sample_weights_invariance alone and also tests sparse X.
    It is very similar to test_enet_sample_weight_consistency.
    """
    rng = np.random.RandomState(global_random_seed)
    n_samples, n_features = 10, 5

    X = rng.rand(n_samples, n_features)
    y = rng.rand(n_samples)
    if sparse_container is not None:
        X = sparse_container(X)
    params = dict(fit_intercept=fit_intercept)

    reg = LinearRegression(**params).fit(X, y, sample_weight=None)
    coef = reg.coef_.copy()
    if fit_intercept:
        intercept = reg.intercept_

    # 1) sample_weight=np.ones(..) must be equivalent to sample_weight=None;
    # this is the same check as check_sample_weights_invariance(name, reg,
    # kind="ones"), but we also test with sparse input.
    sample_weight = np.ones_like(y)
    reg.fit(X, y, sample_weight=sample_weight)
    assert_allclose(reg.coef_, coef, rtol=1e-6)
    if fit_intercept:
        assert_allclose(reg.intercept_, intercept)

    # 2) sample_weight=None must be equivalent to any constant sample_weight
    sample_weight = 123.0
    reg.fit(X, y, sample_weight=sample_weight)
    assert_allclose(reg.coef_, coef, rtol=1e-6)
    if fit_intercept:
        assert_allclose(reg.intercept_, intercept)

    # 3) scaling of sample_weight should have no effect, cf. np.average()
    sample_weight = rng.uniform(low=0.01, high=2, size=X.shape[0])
    reg = reg.fit(X, y, sample_weight=sample_weight)
    coef = reg.coef_.copy()
    if fit_intercept:
        intercept = reg.intercept_

    reg.fit(X, y, sample_weight=np.pi * sample_weight)
    assert_allclose(reg.coef_, coef, rtol=1e-6 if sparse_container is None else 1e-5)
    if fit_intercept:
        assert_allclose(reg.intercept_, intercept)

    # 4) setting elements of sample_weight to 0 is equivalent to removing these samples
    sample_weight_0 = sample_weight.copy()
    sample_weight_0[-5:] = 0
    y[-5:] *= 1000  # to make excluding those samples important
    reg.fit(X, y, sample_weight=sample_weight_0)
    coef_0 = reg.coef_.copy()
    if fit_intercept:
        intercept_0 = reg.intercept_
    reg.fit(X[:-5], y[:-5], sample_weight=sample_weight[:-5])
    if fit_intercept and sparse_container is None:
        # FIXME: https://github.com/scikit-learn/scikit-learn/issues/26164
        # This often fails, e.g. when calling
        # SKLEARN_TESTS_GLOBAL_RANDOM_SEED="all" pytest \
        # sklearn/linear_model/tests/test_base.py\
        # ::test_linear_regression_sample_weight_consistency
        pass
    else:
        assert_allclose(reg.coef_, coef_0, rtol=1e-5)
        if fit_intercept:
            assert_allclose(reg.intercept_, intercept_0)

    # 5) check that multiplying sample_weight by 2 is equivalent to repeating
    # the corresponding samples twice
    if sparse_container is not None:
        X2 = sparse.vstack([X, X[: n_samples // 2]], format="csc")
    else:
        X2 = np.concatenate([X, X[: n_samples // 2]], axis=0)
    y2 = np.concatenate([y, y[: n_samples // 2]])
    sample_weight_1 = sample_weight.copy()
    sample_weight_1[: n_samples // 2] *= 2
    sample_weight_2 = np.concatenate(
        [sample_weight, sample_weight[: n_samples // 2]], axis=0
    )

    reg1 = LinearRegression(**params).fit(X, y, sample_weight=sample_weight_1)
    reg2 = LinearRegression(**params).fit(X2, y2, sample_weight=sample_weight_2)
    assert_allclose(reg1.coef_, reg2.coef_, rtol=1e-6)
    if fit_intercept:
        assert_allclose(reg1.intercept_, reg2.intercept_)