import re
import warnings

import numpy as np
import pytest
import scipy as sp
from numpy.testing import assert_array_equal

from sklearn import config_context, datasets
from sklearn.base import clone
from sklearn.datasets import load_iris, make_classification, make_low_rank_matrix
from sklearn.decomposition import PCA
from sklearn.decomposition._pca import _assess_dimension, _infer_dimension
from sklearn.utils._array_api import (
    _atol_for_type,
    _convert_to_numpy,
    yield_namespace_device_dtype_combinations,
)
from sklearn.utils._array_api import device as array_device
from sklearn.utils._testing import _array_api_for_tests, assert_allclose
from sklearn.utils.estimator_checks import (
    _get_check_estimator_ids,
    check_array_api_input_and_values,
)
from sklearn.utils.fixes import CSC_CONTAINERS, CSR_CONTAINERS

iris = datasets.load_iris()
PCA_SOLVERS = ["full", "covariance_eigh", "arpack", "randomized", "auto"]

# `SPARSE_M` and `SPARSE_N` could be larger, but be aware:
# * SciPy's generation of random sparse matrix can be costly
# * A (SPARSE_M, SPARSE_N) dense array is allocated to compare against
SPARSE_M, SPARSE_N = 1000, 300  # arbitrary
SPARSE_MAX_COMPONENTS = min(SPARSE_M, SPARSE_N)


def _check_fitted_pca_close(pca1, pca2, rtol=1e-7, atol=1e-12):
    """Assert that two fitted PCA estimators hold matching fitted attributes.

    Array-valued attributes are compared with `assert_allclose` using the
    given `rtol` and `atol`; the integer attributes must be exactly equal.
    """
    assert_allclose(pca1.components_, pca2.components_, rtol=rtol, atol=atol)
    assert_allclose(
        pca1.explained_variance_, pca2.explained_variance_, rtol=rtol, atol=atol
    )
    assert_allclose(pca1.singular_values_, pca2.singular_values_, rtol=rtol, atol=atol)
    assert_allclose(pca1.mean_, pca2.mean_, rtol=rtol, atol=atol)
    assert_allclose(pca1.noise_variance_, pca2.noise_variance_, rtol=rtol, atol=atol)

    assert pca1.n_components_ == pca2.n_components_
    assert pca1.n_samples_ == pca2.n_samples_
    assert pca1.n_features_in_ == pca2.n_features_in_


@pytest.mark.parametrize("svd_solver", PCA_SOLVERS)
@pytest.mark.parametrize("n_components", range(1, iris.data.shape[1]))
def test_pca(svd_solver, n_components):
    """Check output shape, fit/transform consistency and covariance/precision."""
    X = iris.data
    pca = PCA(n_components=n_components, svd_solver=svd_solver)

    # check the shape of fit.transform
    X_r = pca.fit(X).transform(X)
    assert X_r.shape[1] == n_components

    # check the equivalence of fit.transform and fit_transform
    X_r2 = pca.fit_transform(X)
    assert_allclose(X_r, X_r2)
    X_r = pca.transform(X)
    assert_allclose(X_r, X_r2)

    # Test get_covariance and get_precision
    cov = pca.get_covariance()
    precision = pca.get_precision()
    assert_allclose(np.dot(cov, precision), np.eye(X.shape[1]), atol=1e-12)


@pytest.mark.parametrize("density", [0.01, 0.1, 0.30])
@pytest.mark.parametrize("n_components", [1, 2, 10])
@pytest.mark.parametrize("sparse_container", CSR_CONTAINERS + CSC_CONTAINERS)
@pytest.mark.parametrize("svd_solver", ["arpack", "covariance_eigh"])
@pytest.mark.parametrize("scale", [1, 10, 100])
def test_pca_sparse(
    global_random_seed, svd_solver, sparse_container, n_components, density, scale
):
    """Check that the results are the same for sparse and dense input."""

    # Set atol in addition of the default rtol to account for the very wide range of
    # result values (1e-8 to 1e0).
    atol = 1e-12
    transform_atol = 1e-10

    random_state = np.random.default_rng(global_random_seed)
    X = sparse_container(
        sp.sparse.random(
            SPARSE_M,
            SPARSE_N,
            random_state=random_state,
            density=density,
        )
    )
    # Scale the data + vary the column means
    scale_vector = random_state.random(X.shape[1]) * scale
    X = X.multiply(scale_vector)

    pca = PCA(
        n_components=n_components,
        svd_solver=svd_solver,
        random_state=global_random_seed,
    )
    pca.fit(X)

    Xd = X.toarray()
    pcad = PCA(
        n_components=n_components,
        svd_solver=svd_solver,
        random_state=global_random_seed,
    )
    pcad.fit(Xd)

    # Fitted attributes equality
    _check_fitted_pca_close(pca, pcad, atol=atol)

    # Test transform
    X2 = sparse_container(
        sp.sparse.random(
            SPARSE_M,
            SPARSE_N,
            random_state=random_state,
            density=density,
        )
    )
    X2d = X2.toarray()

    assert_allclose(pca.transform(X2), pca.transform(X2d), atol=transform_atol)
    assert_allclose(pca.transform(X2), pcad.transform(X2d), atol=transform_atol)


@pytest.mark.parametrize("sparse_container", CSR_CONTAINERS + CSC_CONTAINERS)
def test_pca_sparse_fit_transform(global_random_seed, sparse_container):
    """Check that `fit` and `fit_transform` agree on sparse input (arpack)."""
    random_state = np.random.default_rng(global_random_seed)
    X = sparse_container(
        sp.sparse.random(
            SPARSE_M,
            SPARSE_N,
            random_state=random_state,
            density=0.01,
        )
    )
    X2 = sparse_container(
        sp.sparse.random(
            SPARSE_M,
            SPARSE_N,
            random_state=random_state,
            density=0.01,
        )
    )

    pca_fit = PCA(n_components=10, svd_solver="arpack", random_state=global_random_seed)
    pca_fit_transform = PCA(
        n_components=10, svd_solver="arpack", random_state=global_random_seed
    )

    pca_fit.fit(X)
    transformed_X = pca_fit_transform.fit_transform(X)

    _check_fitted_pca_close(pca_fit, pca_fit_transform)
    assert_allclose(transformed_X, pca_fit_transform.transform(X))
    assert_allclose(transformed_X, pca_fit.transform(X))
    assert_allclose(pca_fit.transform(X2), pca_fit_transform.transform(X2))


@pytest.mark.parametrize("svd_solver", ["randomized", "full"])
@pytest.mark.parametrize("sparse_container", CSR_CONTAINERS + CSC_CONTAINERS)
def test_sparse_pca_solver_error(global_random_seed, svd_solver, sparse_container):
    """Check that fitting sparse input with an unsupported solver raises TypeError."""
    random_state = np.random.RandomState(global_random_seed)
    X = sparse_container(
        sp.sparse.random(
            SPARSE_M,
            SPARSE_N,
            random_state=random_state,
        )
    )
    pca = PCA(n_components=30, svd_solver=svd_solver)
    error_msg_pattern = (
        'PCA only support sparse inputs with the "arpack" and "covariance_eigh"'
        f' solvers, while "{svd_solver}" was passed'
    )
    with pytest.raises(TypeError, match=error_msg_pattern):
        pca.fit(X)


@pytest.mark.parametrize("sparse_container", CSR_CONTAINERS + CSC_CONTAINERS)
def test_sparse_pca_auto_arpack_singluar_values_consistency(
    global_random_seed, sparse_container
):
    """Check that "auto" and "arpack" solvers are equivalent for sparse inputs."""
    random_state = np.random.RandomState(global_random_seed)
    X = sparse_container(
        sp.sparse.random(
            SPARSE_M,
            SPARSE_N,
            random_state=random_state,
        )
    )
    pca_arpack = PCA(n_components=10, svd_solver="arpack").fit(X)
    pca_auto = PCA(n_components=10, svd_solver="auto").fit(X)
    assert_allclose(pca_arpack.singular_values_, pca_auto.singular_values_, rtol=5e-3)


def test_no_empty_slice_warning():
    """Check that fitting does not emit a RuntimeWarning (turned into an error)."""
    # test if we avoid numpy warnings for computing over empty arrays
    n_components = 10
    n_features = n_components + 2  # anything > n_comps triggered it in 0.16
    # Use a seeded generator rather than the global numpy state so that the
    # test data is reproducible from run to run.
    rng = np.random.RandomState(0)
    X = rng.uniform(-1, 1, size=(n_components, n_features))
    pca = PCA(n_components=n_components)
    with warnings.catch_warnings():
        warnings.simplefilter("error", RuntimeWarning)
        pca.fit(X)


@pytest.mark.parametrize("copy", [True, False])
@pytest.mark.parametrize("solver", PCA_SOLVERS)
def test_whitening(solver, copy):
    """Check whitened output statistics and contrast with the unwhitened case."""
    # Check that PCA output has unit-variance
    rng = np.random.RandomState(0)
    n_samples = 100
    n_features = 80
    n_components = 30
    rank = 50

    # some low rank data with correlated features
    X = np.dot(
        rng.randn(n_samples, rank),
        np.dot(np.diag(np.linspace(10.0, 1.0, rank)), rng.randn(rank, n_features)),
    )
    # the component-wise variance of the first 50 features is 3 times the
    # mean component-wise variance of the remaining 30 features
    X[:, :50] *= 3

    assert X.shape == (n_samples, n_features)

    # the component-wise variance is thus highly varying:
    assert X.std(axis=0).std() > 43.8

    # whiten the data while projecting to the lower dim subspace
    X_ = X.copy()  # make sure we keep an original across iterations.
    pca = PCA(
        n_components=n_components,
        whiten=True,
        copy=copy,
        svd_solver=solver,
        random_state=0,
        iterated_power=7,
    )
    # test fit_transform
    X_whitened = pca.fit_transform(X_.copy())
    assert X_whitened.shape == (n_samples, n_components)
    X_whitened2 = pca.transform(X_)
    assert_allclose(X_whitened, X_whitened2, rtol=5e-4)

    assert_allclose(X_whitened.std(ddof=1, axis=0), np.ones(n_components))
    assert_allclose(X_whitened.mean(axis=0), np.zeros(n_components), atol=1e-12)

    X_ = X.copy()
    pca = PCA(
        n_components=n_components, whiten=False, copy=copy, svd_solver=solver
    ).fit(X_.copy())
    X_unwhitened = pca.transform(X_)
    assert X_unwhitened.shape == (n_samples, n_components)

    # in that case the output components still have varying variances
    assert X_unwhitened.std(axis=0).std() == pytest.approx(74.1, rel=1e-1)
    # we always center, so no test for non-centering.


@pytest.mark.parametrize(
    "other_svd_solver", sorted(set(PCA_SOLVERS) - {"full", "auto"})
)
@pytest.mark.parametrize("data_shape", ["tall", "wide"])
@pytest.mark.parametrize("rank_deficient", [False, True])
@pytest.mark.parametrize("whiten", [False, True])
def test_pca_solver_equivalence(
    other_svd_solver,
    data_shape,
    rank_deficient,
    whiten,
    global_random_seed,
    global_dtype,
):
    """Check that `other_svd_solver` matches the "full" solver.

    Fitted variances and components, transformed train/test data and
    inverse-transform reconstructions are compared within dtype-dependent
    tolerances.
    """
    if data_shape == "tall":
        n_samples, n_features = 100, 30
    else:
        n_samples, n_features = 30, 100
    n_samples_test = 10

    if rank_deficient:
        rng = np.random.default_rng(global_random_seed)
        rank = min(n_samples, n_features) // 2
        X = rng.standard_normal(
            size=(n_samples + n_samples_test, rank)
        ) @ rng.standard_normal(size=(rank, n_features))
    else:
        X = make_low_rank_matrix(
            n_samples=n_samples + n_samples_test,
            n_features=n_features,
            tail_strength=0.5,
            random_state=global_random_seed,
        )
        # With a non-zero tail strength, the data is actually full-rank.
        rank = min(n_samples, n_features)

    X = X.astype(global_dtype, copy=False)
    X_train, X_test = X[:n_samples], X[n_samples:]

    if global_dtype == np.float32:
        tols = dict(atol=3e-2, rtol=1e-5)
        variance_threshold = 1e-5
    else:
        tols = dict(atol=1e-10, rtol=1e-12)
        variance_threshold = 1e-12

    extra_other_kwargs = {}
    if other_svd_solver == "randomized":
        # Only check for a truncated result with a large number of iterations
        # to make sure that we can recover precise results.
        n_components = 10
        extra_other_kwargs = {"iterated_power": 50}
    elif other_svd_solver == "arpack":
        # Test all components except the last one which cannot be estimated by
        # arpack.
        n_components = np.minimum(n_samples, n_features) - 1
    else:
        # Test all components to high precision.
        n_components = None

    pca_full = PCA(n_components=n_components, svd_solver="full", whiten=whiten)
    pca_other = PCA(
        n_components=n_components,
        svd_solver=other_svd_solver,
        whiten=whiten,
        random_state=global_random_seed,
        **extra_other_kwargs,
    )
    X_trans_full_train = pca_full.fit_transform(X_train)
    assert np.isfinite(X_trans_full_train).all()
    assert X_trans_full_train.dtype == global_dtype
    X_trans_other_train = pca_other.fit_transform(X_train)
    assert np.isfinite(X_trans_other_train).all()
    assert X_trans_other_train.dtype == global_dtype

    assert (pca_full.explained_variance_ >= 0).all()
    assert_allclose(pca_full.explained_variance_, pca_other.explained_variance_, **tols)
    assert_allclose(
        pca_full.explained_variance_ratio_,
        pca_other.explained_variance_ratio_,
        **tols,
    )
    reference_components = pca_full.components_
    assert np.isfinite(reference_components).all()
    other_components = pca_other.components_
    assert np.isfinite(other_components).all()

    # For some choice of n_components and data distribution, some components
    # might be pure noise, let's ignore them in the comparison:
    stable = pca_full.explained_variance_ > variance_threshold
    assert stable.sum() > 1
    assert_allclose(reference_components[stable], other_components[stable], **tols)

    # As a result the output of fit_transform should be the same:
    assert_allclose(
        X_trans_other_train[:, stable], X_trans_full_train[:, stable], **tols
    )

    # And similarly for the output of transform on new data (except for the
    # last component that can be underdetermined):
    X_trans_full_test = pca_full.transform(X_test)
    assert np.isfinite(X_trans_full_test).all()
    assert X_trans_full_test.dtype == global_dtype
    X_trans_other_test = pca_other.transform(X_test)
    assert np.isfinite(X_trans_other_test).all()
    assert X_trans_other_test.dtype == global_dtype
    assert_allclose(X_trans_other_test[:, stable], X_trans_full_test[:, stable], **tols)

    # Check that inverse transform reconstructions for both solvers are
    # compatible.
    X_recons_full_test = pca_full.inverse_transform(X_trans_full_test)
    assert np.isfinite(X_recons_full_test).all()
    assert X_recons_full_test.dtype == global_dtype
    X_recons_other_test = pca_other.inverse_transform(X_trans_other_test)
    assert np.isfinite(X_recons_other_test).all()
    assert X_recons_other_test.dtype == global_dtype

    if pca_full.components_.shape[0] == pca_full.components_.shape[1]:
        # In this case, the models should have learned the same invertible
        # transform. They should therefore both be able to reconstruct the test
        # data.
        assert_allclose(X_recons_full_test, X_test, **tols)
        assert_allclose(X_recons_other_test, X_test, **tols)
    elif pca_full.components_.shape[0] < rank:
        # In the absence of noisy components, both models should be able to
        # reconstruct the same low-rank approximation of the original data.
        assert pca_full.explained_variance_.min() > variance_threshold
        assert_allclose(X_recons_full_test, X_recons_other_test, **tols)
    else:
        # When n_features > n_samples and n_components is larger than the rank
        # of the training set, the output of the `inverse_transform` function
        # is ill-defined. We can only check that we reach the same fixed point
        # after another round of transform:
        assert_allclose(
            pca_full.transform(X_recons_full_test)[:, stable],
            pca_other.transform(X_recons_other_test)[:, stable],
            **tols,
        )


@pytest.mark.parametrize(
    "X",
    [
        np.random.RandomState(0).randn(100, 80),
        datasets.make_classification(100, 80, n_informative=78, random_state=0)[0],
        np.random.RandomState(0).randn(10, 100),
    ],
    ids=["random-tall", "correlated-tall", "random-wide"],
)
@pytest.mark.parametrize("svd_solver", PCA_SOLVERS)
def test_pca_explained_variance_empirical(X, svd_solver):
    """Check `explained_variance_` against empirical variances and eigenvalues."""
    pca = PCA(n_components=2, svd_solver=svd_solver, random_state=0)
    X_pca = pca.fit_transform(X)
    assert_allclose(pca.explained_variance_, np.var(X_pca, ddof=1, axis=0))

    expected_result = np.linalg.eig(np.cov(X, rowvar=False))[0]
    expected_result = sorted(expected_result, reverse=True)[:2]
    assert_allclose(pca.explained_variance_, expected_result, rtol=5e-3)


@pytest.mark.parametrize("svd_solver", ["arpack", "randomized"])
def test_pca_singular_values_consistency(svd_solver):
    """Check that singular values match those of the "full" solver."""
    rng = np.random.RandomState(0)
    n_samples, n_features = 100, 80
    X = rng.randn(n_samples, n_features)

    pca_full = PCA(n_components=2, svd_solver="full", random_state=rng)
    pca_other = PCA(n_components=2, svd_solver=svd_solver, random_state=rng)

    pca_full.fit(X)
    pca_other.fit(X)

    assert_allclose(pca_full.singular_values_, pca_other.singular_values_, rtol=5e-3)


@pytest.mark.parametrize("svd_solver", PCA_SOLVERS)
def test_pca_singular_values(svd_solver):
    """Check `singular_values_` against norms of the transformed data."""
    rng = np.random.RandomState(0)
    n_samples, n_features = 100, 80
    X = rng.randn(n_samples, n_features)

    pca = PCA(n_components=2, svd_solver=svd_solver, random_state=rng)
    X_trans = pca.fit_transform(X)

    # compare to the Frobenius norm
    assert_allclose(
        np.sum(pca.singular_values_**2), np.linalg.norm(X_trans, "fro") ** 2
    )
    # Compare to the 2-norms of the score vectors
    assert_allclose(pca.singular_values_, np.sqrt(np.sum(X_trans**2, axis=0)))

    # set the singular values and see what we get back
    n_samples, n_features = 100, 110
    X = rng.randn(n_samples, n_features)

    pca = PCA(n_components=3, svd_solver=svd_solver, random_state=rng)
    X_trans = pca.fit_transform(X)
    X_trans /= np.sqrt(np.sum(X_trans**2, axis=0))
    X_trans[:, 0] *= 3.142
    X_trans[:, 1] *= 2.718
    X_hat = np.dot(X_trans, pca.components_)
    pca.fit(X_hat)
    assert_allclose(pca.singular_values_, [3.142, 2.718, 1.0])


@pytest.mark.parametrize("svd_solver", PCA_SOLVERS)
def test_pca_check_projection(svd_solver):
    """Check that a shifted test point projects mostly on the first component."""
    # Test that the projection of data is correct
    rng = np.random.RandomState(0)
    n, p = 100, 3
    X = rng.randn(n, p) * 0.1
    X[:10] += np.array([3, 4, 5])
    Xt = 0.1 * rng.randn(1, p) + np.array([3, 4, 5])

    Yt = PCA(n_components=2, svd_solver=svd_solver).fit(X).transform(Xt)
    Yt /= np.sqrt((Yt**2).sum())

    assert_allclose(np.abs(Yt[0][0]), 1.0, rtol=5e-3)


@pytest.mark.parametrize("svd_solver", PCA_SOLVERS)
def test_pca_check_projection_list(svd_solver):
    """Check the projection obtained when the input is a list of lists."""
    # Test that the projection of data is correct
    X = [[1.0, 0.0], [0.0, 1.0]]
    pca = PCA(n_components=1, svd_solver=svd_solver, random_state=0)
    X_trans = pca.fit_transform(X)
    # NOTE: this used to read `assert X_trans.shape, (2, 1)`, which treats the
    # tuple as the assertion message and therefore never fails.
    assert X_trans.shape == (2, 1)
    assert_allclose(X_trans.mean(), 0.00, atol=1e-12)
    assert_allclose(X_trans.std(), 0.71, rtol=5e-3)


@pytest.mark.parametrize("svd_solver", ["full", "arpack", "randomized"])
@pytest.mark.parametrize("whiten", [False, True])
def test_pca_inverse(svd_solver, whiten):
    """Check that `inverse_transform` recovers nearly rank-2 data."""
    # Test that the projection of data can be inverted
    rng = np.random.RandomState(0)
    n, p = 50, 3
    X = rng.randn(n, p)  # spherical data
    X[:, 1] *= 0.00001  # make middle component relatively small
    X += [5, 4, 3]  # make a large mean

    # same check that we can find the original data from the transformed
    # signal (since the data is almost of rank n_components)
    pca = PCA(n_components=2, svd_solver=svd_solver, whiten=whiten).fit(X)
    Y = pca.transform(X)
    Y_inverse = pca.inverse_transform(Y)
    assert_allclose(X, Y_inverse, rtol=5e-6)


@pytest.mark.parametrize(
    "data", [np.array([[0, 1, 0], [1, 0, 0]]), np.array([[0, 1, 0], [1, 0, 0]]).T]
)
@pytest.mark.parametrize(
    "svd_solver, n_components, err_msg",
    [
        ("arpack", 0, r"must be between 1 and min\(n_samples, n_features\)"),
        ("randomized", 0, r"must be between 1 and min\(n_samples, n_features\)"),
        ("arpack", 2, r"must be strictly less than min"),
        (
            "auto",
            3,
            (
                r"n_components=3 must be between 0 and min\(n_samples, "
                r"n_features\)=2 with svd_solver='full'"
            ),
        ),
    ],
)
def test_pca_validation(svd_solver, data, n_components, err_msg):
    """Check the ValueError raised for out-of-range `n_components` values."""
    # Ensures that solver-specific extreme inputs for the n_components
    # parameter raise errors
    smallest_d = 2  # The smallest dimension
    pca_fitted = PCA(n_components, svd_solver=svd_solver)

    with pytest.raises(ValueError, match=err_msg):
        pca_fitted.fit(data)

    # Additional case for arpack
    if svd_solver == "arpack":
        n_components = smallest_d

        err_msg = (
            "n_components={}L? must be strictly less than "
            r"min\(n_samples, n_features\)={}L? with "
            "svd_solver='arpack'".format(n_components, smallest_d)
        )
        with pytest.raises(ValueError, match=err_msg):
            PCA(n_components, svd_solver=svd_solver).fit(data)


@pytest.mark.parametrize(
    "solver, n_components_",
    [
        ("full", min(iris.data.shape)),
        ("arpack", min(iris.data.shape) - 1),
        ("randomized", min(iris.data.shape)),
    ],
)
@pytest.mark.parametrize("data", [iris.data, iris.data.T])
def test_n_components_none(data, solver, n_components_):
    """Check the fitted `n_components_` when `n_components` is left unset."""
    pca = PCA(svd_solver=solver)
    pca.fit(data)
    assert pca.n_components_ == n_components_


@pytest.mark.parametrize("svd_solver", ["auto", "full"])
def test_n_components_mle(svd_solver):
    """Check that `n_components="mle"` fits with the auto and full solvers."""
    # Ensure that n_components == 'mle' doesn't raise error for auto/full
    rng = np.random.RandomState(0)
    n_samples, n_features = 600, 10
    X = rng.randn(n_samples, n_features)
    pca = PCA(n_components="mle", svd_solver=svd_solver)
    pca.fit(X)
    assert pca.n_components_ == 1


@pytest.mark.parametrize("svd_solver", ["arpack", "randomized"])
def test_n_components_mle_error(svd_solver):
    """Check that `n_components="mle"` raises for unsupported solvers."""
    # Ensure that n_components == 'mle' will raise an error for unsupported
    # solvers
    rng = np.random.RandomState(0)
    n_samples, n_features = 600, 10
    X = rng.randn(n_samples, n_features)
    pca = PCA(n_components="mle", svd_solver=svd_solver)
    err_msg = f"n_components='mle' cannot be a string with svd_solver='{svd_solver}'"
    with pytest.raises(ValueError, match=err_msg):
        pca.fit(X)


def test_pca_dim():
    """Check that "mle" keeps the parameter and infers one component here."""
    # Check automated dimensionality setting
    rng = np.random.RandomState(0)
    n, p = 100, 5
    X = rng.randn(n, p) * 0.1
    X[:10] += np.array([3, 4, 5, 1, 2])
    pca = PCA(n_components="mle", svd_solver="full").fit(X)
    assert pca.n_components == "mle"
    assert pca.n_components_ == 1


def test_infer_dim_1():
    """Check that the `_assess_dimension` value at index 1 is close to the max."""
    # TODO: explain what this is testing
    # Or at least use explicit variable names...
    n, p = 1000, 5
    rng = np.random.RandomState(0)
    X = (
        rng.randn(n, p) * 0.1
        + rng.randn(n, 1) * np.array([3, 4, 5, 1, 2])
        + np.array([1, 0, 7, 4, 6])
    )
    pca = PCA(n_components=p, svd_solver="full")
    pca.fit(X)
    spect = pca.explained_variance_
    ll = np.array([_assess_dimension(spect, k, n) for k in range(1, p)])
    assert ll[1] > ll.max() - 0.01 * n


def test_infer_dim_2():
    """Check that `_infer_dimension` returns more than 1 for two shifted groups."""
    # TODO: explain what this is testing
    # Or at least use explicit variable names...
    n, p = 1000, 5
    rng = np.random.RandomState(0)
    X = rng.randn(n, p) * 0.1
    X[:10] += np.array([3, 4, 5, 1, 2])
    X[10:20] += np.array([6, 0, 7, 2, -1])
    pca = PCA(n_components=p, svd_solver="full")
    pca.fit(X)
    spect = pca.explained_variance_
    assert _infer_dimension(spect, n) > 1


def test_infer_dim_3():
    """Check that `_infer_dimension` returns more than 2 for three shifted groups."""
    n, p = 100, 5
    rng = np.random.RandomState(0)
    X = rng.randn(n, p) * 0.1
    X[:10] += np.array([3, 4, 5, 1, 2])
    X[10:20] += np.array([6, 0, 7, 2, -1])
    X[30:40] += 2 * np.array([-1, 1, -1, 1, -1])
    pca = PCA(n_components=p, svd_solver="full")
    pca.fit(X)
    spect = pca.explained_variance_
    assert _infer_dimension(spect, n) > 2


@pytest.mark.parametrize(
    "X, n_components, n_components_validated",
    [
        (iris.data, 0.95, 2),  # row > col
        (iris.data, 0.01, 1),  # row > col
        (np.random.RandomState(0).rand(5, 20), 0.5, 2),  # row < col
    ],
)
def test_infer_dim_by_explained_variance(X, n_components, n_components_validated):
    """Check `n_components_` when `n_components` is a float in (0, 1)."""
    pca = PCA(n_components=n_components, svd_solver="full")
    pca.fit(X)
    assert pca.n_components == pytest.approx(n_components)
    assert pca.n_components_ == n_components_validated


@pytest.mark.parametrize("svd_solver", PCA_SOLVERS)
def test_pca_score(svd_solver):
    """Check that `score` is close to the expected entropy-based value."""
    # Test that probabilistic PCA scoring yields a reasonable score
    n, p = 1000, 3
    rng = np.random.RandomState(0)
    X = rng.randn(n, p) * 0.1 + np.array([3, 4, 5])
    pca = PCA(n_components=2, svd_solver=svd_solver)
    pca.fit(X)

    ll1 = pca.score(X)
    h = -0.5 * np.log(2 * np.pi * np.exp(1) * 0.1**2) * p
    assert_allclose(ll1 / h, 1, rtol=5e-2)

    ll2 = pca.score(rng.randn(n, p) * 0.2 + np.array([3, 4, 5]))
    assert ll1 > ll2

    pca = PCA(n_components=2, whiten=True, svd_solver=svd_solver)
    pca.fit(X)
    ll2 = pca.score(X)
    assert ll1 > ll2


def test_pca_score3():
    """Check that the held-out score is maximal for one component."""
    # Check that probabilistic PCA selects the right model
    n, p = 200, 3
    rng = np.random.RandomState(0)
    Xl = rng.randn(n, p) + rng.randn(n, 1) * np.array([3, 4, 5]) + np.array([1, 0, 7])
    Xt = rng.randn(n, p) + rng.randn(n, 1) * np.array([3, 4, 5]) + np.array([1, 0, 7])
    ll = np.zeros(p)
    for k in range(p):
        pca = PCA(n_components=k, svd_solver="full")
        pca.fit(Xl)
        ll[k] = pca.score(Xt)

    assert ll.argmax() == 1


@pytest.mark.parametrize("svd_solver", PCA_SOLVERS)
def test_pca_sanity_noise_variance(svd_solver):
    """Check that no explained variance is smaller than `noise_variance_`."""
    # Sanity check for the noise_variance_. For more details see
    # https://github.com/scikit-learn/scikit-learn/issues/7568
    # https://github.com/scikit-learn/scikit-learn/issues/8541
    # https://github.com/scikit-learn/scikit-learn/issues/8544
    X, _ = datasets.load_digits(return_X_y=True)
    pca = PCA(n_components=30, svd_solver=svd_solver, random_state=0)
    pca.fit(X)
    assert np.all((pca.explained_variance_ - pca.noise_variance_) >= 0)


@pytest.mark.parametrize("svd_solver", ["arpack", "randomized"])
def test_pca_score_consistency_solvers(svd_solver):
    """Check that `score` agrees with the "full" solver on the digits data."""
    # Check the consistency of score between solvers
    X, _ = datasets.load_digits(return_X_y=True)
    pca_full = PCA(n_components=30, svd_solver="full", random_state=0)
    pca_other = PCA(n_components=30, svd_solver=svd_solver, random_state=0)
    pca_full.fit(X)
    pca_other.fit(X)
    assert_allclose(pca_full.score(X), pca_other.score(X), rtol=5e-6)


# arpack raises ValueError for n_components == min(n_samples, n_features)
@pytest.mark.parametrize("svd_solver", ["full", "randomized"])
def test_pca_zero_noise_variance_edge_cases(svd_solver):
    """Check `noise_variance_ == 0` and that `score` runs in the edge cases."""
    # ensure that noise_variance_ is 0 in edge cases
    # when n_components == min(n_samples, n_features)
    n, p = 100, 3
    rng = np.random.RandomState(0)
    X = rng.randn(n, p) * 0.1 + np.array([3, 4, 5])

    pca = PCA(n_components=p, svd_solver=svd_solver)
    pca.fit(X)
    assert pca.noise_variance_ == 0
    # Non-regression test for gh-12489
    # ensure no divide-by-zero error for n_components == n_features < n_samples
    pca.score(X)

    pca.fit(X.T)
    assert pca.noise_variance_ == 0
    # Non-regression test for gh-12489
    # ensure no divide-by-zero error for n_components == n_samples < n_features
    pca.score(X.T)


@pytest.mark.parametrize(
    "n_samples, n_features, n_components, expected_solver",
    [
        # case: n_samples < 10 * n_features and max(X.shape) <= 500 => 'full'
        (10, 50, 5, "full"),
        # case: n_samples > 10 * n_features and n_features < 500 => 'covariance_eigh'
        (1000, 50, 50, "covariance_eigh"),
        # case: n_components >= .8 * min(X.shape) => 'full'
        (1000, 500, 400, "full"),
        # n_components >= 1 and n_components < .8*min(X.shape) => 'randomized'
        (1000, 500, 10, "randomized"),
        # case: n_components in (0,1) => 'full'
        (1000, 500, 0.5, "full"),
    ],
)
def test_pca_svd_solver_auto(n_samples, n_features, n_components, expected_solver):
    """Check which solver the default `svd_solver` selects and its components."""
    data = np.random.RandomState(0).uniform(size=(n_samples, n_features))
    pca_auto = PCA(n_components=n_components, random_state=0)
    pca_test = PCA(
        n_components=n_components, svd_solver=expected_solver, random_state=0
    )
    pca_auto.fit(data)
    assert pca_auto._fit_svd_solver == expected_solver
    pca_test.fit(data)
    assert_allclose(pca_auto.components_, pca_test.components_)


@pytest.mark.parametrize("svd_solver", PCA_SOLVERS)
def test_pca_deterministic_output(svd_solver):
    """Check that 20 successive fits give the same first transformed row."""
    rng = np.random.RandomState(0)
    X = rng.rand(10, 10)

    transformed_X = np.zeros((20, 2))
    for i in range(20):
        pca = PCA(n_components=2, svd_solver=svd_solver, random_state=rng)
        transformed_X[i, :] = pca.fit_transform(X)[0]
    assert_allclose(transformed_X, np.tile(transformed_X[0, :], 20).reshape(20, 2))


@pytest.mark.parametrize("svd_solver", PCA_SOLVERS)
def test_pca_dtype_preservation(svd_solver, global_random_seed):
    """Run the float dtype-preservation and int upcast checks for a solver."""
    check_pca_float_dtype_preservation(svd_solver, global_random_seed)
    check_pca_int_dtype_upcast_to_double(svd_solver)


def check_pca_float_dtype_preservation(svd_solver, seed):
    """Check that float32/float64 inputs keep their dtype through fit/transform."""
    # Ensure that PCA does not upscale the dtype when input is float32
    X = np.random.RandomState(seed).rand(1000, 4)
    X_float64 = X.astype(np.float64, copy=False)
    X_float32 = X.astype(np.float32)

    pca_64 = PCA(n_components=3, svd_solver=svd_solver, random_state=seed).fit(
        X_float64
    )
    pca_32 = PCA(n_components=3, svd_solver=svd_solver, random_state=seed).fit(
        X_float32
    )

    assert pca_64.components_.dtype == np.float64
    assert pca_32.components_.dtype == np.float32
    assert pca_64.transform(X_float64).dtype == np.float64
    assert pca_32.transform(X_float32).dtype == np.float32

    # The atol and rtol are set such that the test passes for all random seeds
    # on all supported platforms on our CI and conda-forge with the default
    # random seed.
    assert_allclose(pca_64.components_, pca_32.components_, rtol=1e-3, atol=1e-3)


def check_pca_int_dtype_upcast_to_double(svd_solver):
    """Check that int32/int64 inputs yield float64 components and transforms."""
    # Ensure that all int types will be upcast to float64
    X_i64 = np.random.RandomState(0).randint(0, 1000, (1000, 4))
    X_i64 = X_i64.astype(np.int64, copy=False)
    X_i32 = X_i64.astype(np.int32, copy=False)

    pca_64 = PCA(n_components=3, svd_solver=svd_solver, random_state=0).fit(X_i64)
    pca_32 = PCA(n_components=3, svd_solver=svd_solver, random_state=0).fit(X_i32)

    assert pca_64.components_.dtype == np.float64
    assert pca_32.components_.dtype == np.float64
    assert pca_64.transform(X_i64).dtype == np.float64
    assert pca_32.transform(X_i32).dtype == np.float64

    assert_allclose(pca_64.components_, pca_32.components_, rtol=1e-4)


def test_pca_n_components_mostly_explained_variance_ratio():
    """Check `n_components_` for a ratio equal to the second-to-last cumsum."""
    # when n_components is the second highest cumulative sum of the
    # explained_variance_ratio_, then n_components_ should equal the
    # number of features in the dataset #15669
    X, y = load_iris(return_X_y=True)
    pca1 = PCA().fit(X, y)
    n_components = pca1.explained_variance_ratio_.cumsum()[-2]
    pca2 = PCA(n_components=n_components).fit(X, y)
    assert pca2.n_components_ == X.shape[1]


def test_assess_dimension_bad_rank():
    """Check that `_assess_dimension` rejects ranks 0 and 5 for 4 features."""
    # Test error when tested rank not in [1, n_features - 1]
    spectrum = np.array([1, 1e-30, 1e-30, 1e-30])
    n_samples = 10
    for rank in (0, 5):
        with pytest.raises(ValueError, match=r"should be in \[1, n_features - 1\]"):
            _assess_dimension(spectrum, rank, n_samples)


def test_small_eigenvalues_mle():
    """Check `_assess_dimension`/`_infer_dimension` with tiny eigenvalues."""
    # Test rank associated with tiny eigenvalues are given a log-likelihood of
    # -inf. The inferred rank will be 1
    spectrum = np.array([1, 1e-30, 1e-30, 1e-30])

    assert _assess_dimension(spectrum, rank=1, n_samples=10) > -np.inf

    for rank in (2, 3):
        assert _assess_dimension(spectrum, rank, 10) == -np.inf

    assert _infer_dimension(spectrum, 10) == 1


def test_mle_redundant_data():
    """Check that "mle" infers a single component on highly redundant data."""
    # Test 'mle' with pathological X: only one relevant feature should give a
    # rank of 1
    X, _ = datasets.make_classification(
        n_features=20,
        n_informative=1,
        n_repeated=18,
        n_redundant=1,
        n_clusters_per_class=1,
        random_state=42,
    )
    pca = PCA(n_components="mle").fit(X)
    assert pca.n_components_ == 1


def test_fit_mle_too_few_samples():
    """Check the error raised by an "mle" fit when n_samples < n_features."""
    # Tests that an error is raised when the number of samples is smaller
    # than the number of features during an mle fit
    X, _ = datasets.make_classification(n_samples=20, n_features=21, random_state=42)

    pca = PCA(n_components="mle", svd_solver="full")
    with pytest.raises(
        ValueError,
        match="n_components='mle' is only supported if n_samples >= n_features",
    ):
        pca.fit(X)


def test_mle_simple_case():
    """Check that "mle" infers `n_dim - 1` components when one column is a mean."""
    # non-regression test for issue
    # https://github.com/scikit-learn/scikit-learn/issues/16730
    n_samples, n_dim = 1000, 10
    X = np.random.RandomState(0).randn(n_samples, n_dim)
    X[:, -1] = np.mean(X[:, :-1], axis=-1)  # true X dim is ndim - 1
    pca_skl = PCA("mle", svd_solver="full")
    pca_skl.fit(X)
    assert pca_skl.n_components_ == n_dim - 1


def test_assess_dimesion_rank_one():
    """Check `_assess_dimension` on the singular values of an all-ones matrix."""
    # Make sure assess_dimension works properly on a matrix of rank 1
    n_samples, n_features = 9, 6
    X = np.ones((n_samples, n_features))  # rank 1 matrix
    _, s, _ = np.linalg.svd(X, full_matrices=True)
    # except for rank 1, all eigenvalues are 0 resp. close to 0 (FP)
    assert_allclose(s[1:], np.zeros(n_features - 1), atol=1e-12)

    assert np.isfinite(_assess_dimension(s, rank=1, n_samples=n_samples))
    for rank in range(2, n_features):
        assert _assess_dimension(s, rank, n_samples) == -np.inf


def test_pca_randomized_svd_n_oversamples():
    """Check that exposing and setting `n_oversamples` will provide accurate results
    even when `X` has a large number of features.

    Non-regression test for:
    https://github.com/scikit-learn/scikit-learn/issues/20589
    """
    rng = np.random.RandomState(0)
    n_features = 100
    X = rng.randn(1_000, n_features)

    # The default value of `n_oversamples` will lead to inaccurate results
    # We force it to the number of features.
    pca_randomized = PCA(
        n_components=1,
        svd_solver="randomized",
        n_oversamples=n_features,
        random_state=0,
    ).fit(X)
    pca_full = PCA(n_components=1, svd_solver="full").fit(X)
    pca_arpack = PCA(n_components=1, svd_solver="arpack", random_state=0).fit(X)

    assert_allclose(np.abs(pca_full.components_), np.abs(pca_arpack.components_))
    assert_allclose(np.abs(pca_randomized.components_), np.abs(pca_arpack.components_))


def test_feature_names_out():
    """Check feature names out for PCA."""
    pca = PCA(n_components=2).fit(iris.data)

    names = pca.get_feature_names_out()
    assert_array_equal([f"pca{i}" for i in range(2)], names)


@pytest.mark.parametrize("copy", [True, False])
def test_variance_correctness(copy):
    """Check the accuracy of PCA's internal variance calculation"""
    rng = np.random.RandomState(0)
    X = rng.randn(1000, 200)
    # Forward the parametrized `copy` flag: it was previously ignored, so both
    # parametrizations exercised the exact same code path.
    pca = PCA(copy=copy).fit(X)
    pca_var = pca.explained_variance_ / pca.explained_variance_ratio_
    # The per-feature variance does not depend on the feature means, so this
    # reference stays valid even if fitting with copy=False centered X in place.
    true_var = np.var(X, ddof=1, axis=0).sum()
    np.testing.assert_allclose(pca_var, true_var)


def check_array_api_get_precision(name, estimator, array_namespace, device, dtype_name):
    """Compare `get_precision`/`get_covariance` between NumPy and array API fits.

    The estimator is fitted on the iris data as a NumPy array, a clone is
    fitted on the same data converted with `xp.asarray` under
    `array_api_dispatch=True`, and shapes, dtypes and values are compared.
    `name` is accepted but not used in the body.
    """
    xp = _array_api_for_tests(array_namespace, device)
    iris_np = iris.data.astype(dtype_name)
    iris_xp = xp.asarray(iris_np, device=device)

    estimator.fit(iris_np)
    precision_np = estimator.get_precision()
    covariance_np = estimator.get_covariance()

    rtol = 2e-4 if iris_np.dtype == "float32" else 2e-7
    with config_context(array_api_dispatch=True):
        estimator_xp = clone(estimator).fit(iris_xp)
        precision_xp = estimator_xp.get_precision()
        assert precision_xp.shape == (4, 4)
        assert precision_xp.dtype == iris_xp.dtype

        assert_allclose(
            _convert_to_numpy(precision_xp, xp=xp),
            precision_np,
            rtol=rtol,
            atol=_atol_for_type(dtype_name),
        )
        covariance_xp = estimator_xp.get_covariance()
        assert covariance_xp.shape == (4, 4)
        assert covariance_xp.dtype == iris_xp.dtype

        assert_allclose(
            _convert_to_numpy(covariance_xp, xp=xp),
            covariance_np,
            rtol=rtol,
            atol=_atol_for_type(dtype_name),
        )


@pytest.mark.parametrize(
    "array_namespace, device, dtype_name", yield_namespace_device_dtype_combinations()
)
@pytest.mark.parametrize(
    "check",
    [check_array_api_input_and_values, check_array_api_get_precision],
    ids=_get_check_estimator_ids,
)
@pytest.mark.parametrize(
    "estimator",
    [
        PCA(n_components=2, svd_solver="full"),
        PCA(n_components=2, svd_solver="full", whiten=True),
        PCA(n_components=0.1, svd_solver="full", whiten=True),
        PCA(n_components=2, svd_solver="covariance_eigh"),
        PCA(n_components=2, svd_solver="covariance_eigh", whiten=True),
        PCA(
            n_components=2,
            svd_solver="randomized",
            power_iteration_normalizer="QR",
            random_state=0,  # how to use global_random_seed here?
        ),
    ],
    ids=_get_check_estimator_ids,
)
def test_pca_array_api_compliance(
    estimator, check, array_namespace, device, dtype_name
):
    """Run each array API check on each listed PCA configuration."""
    name = estimator.__class__.__name__
    check(name, estimator, array_namespace, device=device, dtype_name=dtype_name)


@pytest.mark.parametrize(
    "array_namespace, device, dtype_name", yield_namespace_device_dtype_combinations()
)
@pytest.mark.parametrize(
    "check",
    [check_array_api_get_precision],
    ids=_get_check_estimator_ids,
)
@pytest.mark.parametrize(
    "estimator",
    [
        # PCA with mle cannot use check_array_api_input_and_values because of
        # rounding errors in the noisy (low variance) components. Even checking
        # the shape of the `components_` is problematic because the number of
        # components depends on trimming threshold of the mle algorithm which
        # can depend on device-specific rounding errors.
        PCA(n_components="mle", svd_solver="full"),
    ],
    ids=_get_check_estimator_ids,
)
def test_pca_mle_array_api_compliance(
    estimator, check, array_namespace, device, dtype_name
):
    """Check array API support for PCA with `n_components="mle"`.

    Runs the given check, then compares NumPy and array API fits while
    tolerating a different number of retained components.
    """
    name = estimator.__class__.__name__
    check(name, estimator, array_namespace, device=device, dtype_name=dtype_name)

    # Simpler variant of the generic check_array_api_input checker tailored for
    # the specific case of PCA with mle-trimmed components.
    xp = _array_api_for_tests(array_namespace, device)

    X, y = make_classification(random_state=42)
    X = X.astype(dtype_name, copy=False)
    atol = _atol_for_type(X.dtype)

    est = clone(estimator)

    X_xp = xp.asarray(X, device=device)
    y_xp = xp.asarray(y, device=device)

    est.fit(X, y)

    components_np = est.components_
    explained_variance_np = est.explained_variance_

    est_xp = clone(est)
    with config_context(array_api_dispatch=True):
        est_xp.fit(X_xp, y_xp)
        components_xp = est_xp.components_
        assert array_device(components_xp) == array_device(X_xp)
        components_xp_np = _convert_to_numpy(components_xp, xp=xp)

        explained_variance_xp = est_xp.explained_variance_
        assert array_device(explained_variance_xp) == array_device(X_xp)
        explained_variance_xp_np = _convert_to_numpy(explained_variance_xp, xp=xp)

    assert components_xp_np.dtype == components_np.dtype
    assert components_xp_np.shape[1] == components_np.shape[1]
    assert explained_variance_xp_np.dtype == explained_variance_np.dtype

    # Check that the explained variance values match for the
    # common components:
    min_components = min(components_xp_np.shape[0], components_np.shape[0])
    assert_allclose(
        explained_variance_xp_np[:min_components],
        explained_variance_np[:min_components],
        atol=atol,
    )

    # If the number of components differ, check that the explained variance of
    # the trimmed components is very small.
    if components_xp_np.shape[0] != components_np.shape[0]:
        reference_variance = explained_variance_np[-1]
        extra_variance_np = explained_variance_np[min_components:]
        extra_variance_xp_np = explained_variance_xp_np[min_components:]
        assert all(np.abs(extra_variance_np - reference_variance) < atol)
        assert all(np.abs(extra_variance_xp_np - reference_variance) < atol)


def test_array_api_error_and_warnings_on_unsupported_params():
    """Check errors/warnings for parameters unsupported with array API inputs."""
    pytest.importorskip("array_api_compat")
    xp = pytest.importorskip("array_api_strict")
    iris_xp = xp.asarray(iris.data)

    pca = PCA(n_components=2, svd_solver="arpack", random_state=0)
    expected_msg = re.escape(
        "PCA with svd_solver='arpack' is not supported for Array API inputs."
    )
    with pytest.raises(ValueError, match=expected_msg):
        with config_context(array_api_dispatch=True):
            pca.fit(iris_xp)

    pca.set_params(svd_solver="randomized", power_iteration_normalizer="LU")
    expected_msg = re.escape(
        "Array API does not support LU factorization. Set"
        " `power_iteration_normalizer='QR'` instead."
    )
    with pytest.raises(ValueError, match=expected_msg):
        with config_context(array_api_dispatch=True):
            pca.fit(iris_xp)

    pca.set_params(svd_solver="randomized", power_iteration_normalizer="auto")
    expected_msg = re.escape(
        "Array API does not support LU factorization, falling back to QR instead. Set"
        " `power_iteration_normalizer='QR'` explicitly to silence this warning."
    )
    with pytest.warns(UserWarning, match=expected_msg):
        with config_context(array_api_dispatch=True):
            pca.fit(iris_xp)