Inzynierka/Lib/site-packages/sklearn/decomposition/tests/test_kernel_pca.py

526 lines
19 KiB
Python
Raw Normal View History

2023-06-02 12:51:02 +02:00
import numpy as np
import scipy.sparse as sp
import pytest
import warnings
from sklearn.utils._testing import (
assert_array_almost_equal,
assert_array_equal,
assert_allclose,
)
from sklearn.decomposition import PCA, KernelPCA
from sklearn.datasets import make_circles
from sklearn.datasets import make_blobs
from sklearn.exceptions import NotFittedError
from sklearn.linear_model import Perceptron
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.metrics.pairwise import rbf_kernel
from sklearn.utils.validation import _check_psd_eigenvalues
def test_kernel_pca():
"""Nominal test for all solvers and all known kernels + a custom one
It tests
- that fit_transform is equivalent to fit+transform
- that the shapes of transforms and inverse transforms are correct
"""
rng = np.random.RandomState(0)
X_fit = rng.random_sample((5, 4))
X_pred = rng.random_sample((2, 4))
def histogram(x, y, **kwargs):
# Histogram kernel implemented as a callable.
assert kwargs == {} # no kernel_params that we didn't ask for
return np.minimum(x, y).sum()
for eigen_solver in ("auto", "dense", "arpack", "randomized"):
for kernel in ("linear", "rbf", "poly", histogram):
# histogram kernel produces singular matrix inside linalg.solve
# XXX use a least-squares approximation?
inv = not callable(kernel)
# transform fit data
kpca = KernelPCA(
4, kernel=kernel, eigen_solver=eigen_solver, fit_inverse_transform=inv
)
X_fit_transformed = kpca.fit_transform(X_fit)
X_fit_transformed2 = kpca.fit(X_fit).transform(X_fit)
assert_array_almost_equal(
np.abs(X_fit_transformed), np.abs(X_fit_transformed2)
)
# non-regression test: previously, gamma would be 0 by default,
# forcing all eigenvalues to 0 under the poly kernel
assert X_fit_transformed.size != 0
# transform new data
X_pred_transformed = kpca.transform(X_pred)
assert X_pred_transformed.shape[1] == X_fit_transformed.shape[1]
# inverse transform
if inv:
X_pred2 = kpca.inverse_transform(X_pred_transformed)
assert X_pred2.shape == X_pred.shape
def test_kernel_pca_invalid_parameters():
"""Check that kPCA raises an error if the parameters are invalid
Tests fitting inverse transform with a precomputed kernel raises a
ValueError.
"""
estimator = KernelPCA(
n_components=10, fit_inverse_transform=True, kernel="precomputed"
)
err_ms = "Cannot fit_inverse_transform with a precomputed kernel"
with pytest.raises(ValueError, match=err_ms):
estimator.fit(np.random.randn(10, 10))
def test_kernel_pca_consistent_transform():
"""Check robustness to mutations in the original training array
Test that after fitting a kPCA model, it stays independent of any
mutation of the values of the original data object by relying on an
internal copy.
"""
# X_fit_ needs to retain the old, unmodified copy of X
state = np.random.RandomState(0)
X = state.rand(10, 10)
kpca = KernelPCA(random_state=state).fit(X)
transformed1 = kpca.transform(X)
X_copy = X.copy()
X[:, 0] = 666
transformed2 = kpca.transform(X_copy)
assert_array_almost_equal(transformed1, transformed2)
def test_kernel_pca_deterministic_output():
"""Test that Kernel PCA produces deterministic output
Tests that the same inputs and random state produce the same output.
"""
rng = np.random.RandomState(0)
X = rng.rand(10, 10)
eigen_solver = ("arpack", "dense")
for solver in eigen_solver:
transformed_X = np.zeros((20, 2))
for i in range(20):
kpca = KernelPCA(n_components=2, eigen_solver=solver, random_state=rng)
transformed_X[i, :] = kpca.fit_transform(X)[0]
assert_allclose(transformed_X, np.tile(transformed_X[0, :], 20).reshape(20, 2))
def test_kernel_pca_sparse():
"""Test that kPCA works on a sparse data input.
Same test as ``test_kernel_pca except inverse_transform`` since it's not
implemented for sparse matrices.
"""
rng = np.random.RandomState(0)
X_fit = sp.csr_matrix(rng.random_sample((5, 4)))
X_pred = sp.csr_matrix(rng.random_sample((2, 4)))
for eigen_solver in ("auto", "arpack", "randomized"):
for kernel in ("linear", "rbf", "poly"):
# transform fit data
kpca = KernelPCA(
4,
kernel=kernel,
eigen_solver=eigen_solver,
fit_inverse_transform=False,
random_state=0,
)
X_fit_transformed = kpca.fit_transform(X_fit)
X_fit_transformed2 = kpca.fit(X_fit).transform(X_fit)
assert_array_almost_equal(
np.abs(X_fit_transformed), np.abs(X_fit_transformed2)
)
# transform new data
X_pred_transformed = kpca.transform(X_pred)
assert X_pred_transformed.shape[1] == X_fit_transformed.shape[1]
# inverse transform: not available for sparse matrices
# XXX: should we raise another exception type here? For instance:
# NotImplementedError.
with pytest.raises(NotFittedError):
kpca.inverse_transform(X_pred_transformed)
@pytest.mark.parametrize("solver", ["auto", "dense", "arpack", "randomized"])
@pytest.mark.parametrize("n_features", [4, 10])
def test_kernel_pca_linear_kernel(solver, n_features):
"""Test that kPCA with linear kernel is equivalent to PCA for all solvers.
KernelPCA with linear kernel should produce the same output as PCA.
"""
rng = np.random.RandomState(0)
X_fit = rng.random_sample((5, n_features))
X_pred = rng.random_sample((2, n_features))
# for a linear kernel, kernel PCA should find the same projection as PCA
# modulo the sign (direction)
# fit only the first four components: fifth is near zero eigenvalue, so
# can be trimmed due to roundoff error
n_comps = 3 if solver == "arpack" else 4
assert_array_almost_equal(
np.abs(KernelPCA(n_comps, eigen_solver=solver).fit(X_fit).transform(X_pred)),
np.abs(
PCA(n_comps, svd_solver=solver if solver != "dense" else "full")
.fit(X_fit)
.transform(X_pred)
),
)
def test_kernel_pca_n_components():
"""Test that `n_components` is correctly taken into account for projections
For all solvers this tests that the output has the correct shape depending
on the selected number of components.
"""
rng = np.random.RandomState(0)
X_fit = rng.random_sample((5, 4))
X_pred = rng.random_sample((2, 4))
for eigen_solver in ("dense", "arpack", "randomized"):
for c in [1, 2, 4]:
kpca = KernelPCA(n_components=c, eigen_solver=eigen_solver)
shape = kpca.fit(X_fit).transform(X_pred).shape
assert shape == (2, c)
def test_remove_zero_eig():
"""Check that the ``remove_zero_eig`` parameter works correctly.
Tests that the null-space (Zero) eigenvalues are removed when
remove_zero_eig=True, whereas they are not by default.
"""
X = np.array([[1 - 1e-30, 1], [1, 1], [1, 1 - 1e-20]])
# n_components=None (default) => remove_zero_eig is True
kpca = KernelPCA()
Xt = kpca.fit_transform(X)
assert Xt.shape == (3, 0)
kpca = KernelPCA(n_components=2)
Xt = kpca.fit_transform(X)
assert Xt.shape == (3, 2)
kpca = KernelPCA(n_components=2, remove_zero_eig=True)
Xt = kpca.fit_transform(X)
assert Xt.shape == (3, 0)
def test_leave_zero_eig():
"""Non-regression test for issue #12141 (PR #12143)
This test checks that fit().transform() returns the same result as
fit_transform() in case of non-removed zero eigenvalue.
"""
X_fit = np.array([[1, 1], [0, 0]])
# Assert that even with all np warnings on, there is no div by zero warning
with warnings.catch_warnings():
# There might be warnings about the kernel being badly conditioned,
# but there should not be warnings about division by zero.
# (Numpy division by zero warning can have many message variants, but
# at least we know that it is a RuntimeWarning so lets check only this)
warnings.simplefilter("error", RuntimeWarning)
with np.errstate(all="warn"):
k = KernelPCA(n_components=2, remove_zero_eig=False, eigen_solver="dense")
# Fit, then transform
A = k.fit(X_fit).transform(X_fit)
# Do both at once
B = k.fit_transform(X_fit)
# Compare
assert_array_almost_equal(np.abs(A), np.abs(B))
def test_kernel_pca_precomputed():
"""Test that kPCA works with a precomputed kernel, for all solvers"""
rng = np.random.RandomState(0)
X_fit = rng.random_sample((5, 4))
X_pred = rng.random_sample((2, 4))
for eigen_solver in ("dense", "arpack", "randomized"):
X_kpca = (
KernelPCA(4, eigen_solver=eigen_solver, random_state=0)
.fit(X_fit)
.transform(X_pred)
)
X_kpca2 = (
KernelPCA(
4, eigen_solver=eigen_solver, kernel="precomputed", random_state=0
)
.fit(np.dot(X_fit, X_fit.T))
.transform(np.dot(X_pred, X_fit.T))
)
X_kpca_train = KernelPCA(
4, eigen_solver=eigen_solver, kernel="precomputed", random_state=0
).fit_transform(np.dot(X_fit, X_fit.T))
X_kpca_train2 = (
KernelPCA(
4, eigen_solver=eigen_solver, kernel="precomputed", random_state=0
)
.fit(np.dot(X_fit, X_fit.T))
.transform(np.dot(X_fit, X_fit.T))
)
assert_array_almost_equal(np.abs(X_kpca), np.abs(X_kpca2))
assert_array_almost_equal(np.abs(X_kpca_train), np.abs(X_kpca_train2))
@pytest.mark.parametrize("solver", ["auto", "dense", "arpack", "randomized"])
def test_kernel_pca_precomputed_non_symmetric(solver):
"""Check that the kernel centerer works.
Tests that a non symmetric precomputed kernel is actually accepted
because the kernel centerer does its job correctly.
"""
# a non symmetric gram matrix
K = [[1, 2], [3, 40]]
kpca = KernelPCA(
kernel="precomputed", eigen_solver=solver, n_components=1, random_state=0
)
kpca.fit(K) # no error
# same test with centered kernel
Kc = [[9, -9], [-9, 9]]
kpca_c = KernelPCA(
kernel="precomputed", eigen_solver=solver, n_components=1, random_state=0
)
kpca_c.fit(Kc)
# comparison between the non-centered and centered versions
assert_array_equal(kpca.eigenvectors_, kpca_c.eigenvectors_)
assert_array_equal(kpca.eigenvalues_, kpca_c.eigenvalues_)
def test_gridsearch_pipeline():
"""Check that kPCA works as expected in a grid search pipeline
Test if we can do a grid-search to find parameters to separate
circles with a perceptron model.
"""
X, y = make_circles(n_samples=400, factor=0.3, noise=0.05, random_state=0)
kpca = KernelPCA(kernel="rbf", n_components=2)
pipeline = Pipeline([("kernel_pca", kpca), ("Perceptron", Perceptron(max_iter=5))])
param_grid = dict(kernel_pca__gamma=2.0 ** np.arange(-2, 2))
grid_search = GridSearchCV(pipeline, cv=3, param_grid=param_grid)
grid_search.fit(X, y)
assert grid_search.best_score_ == 1
def test_gridsearch_pipeline_precomputed():
"""Check that kPCA works as expected in a grid search pipeline (2)
Test if we can do a grid-search to find parameters to separate
circles with a perceptron model. This test uses a precomputed kernel.
"""
X, y = make_circles(n_samples=400, factor=0.3, noise=0.05, random_state=0)
kpca = KernelPCA(kernel="precomputed", n_components=2)
pipeline = Pipeline([("kernel_pca", kpca), ("Perceptron", Perceptron(max_iter=5))])
param_grid = dict(Perceptron__max_iter=np.arange(1, 5))
grid_search = GridSearchCV(pipeline, cv=3, param_grid=param_grid)
X_kernel = rbf_kernel(X, gamma=2.0)
grid_search.fit(X_kernel, y)
assert grid_search.best_score_ == 1
def test_nested_circles():
"""Check that kPCA projects in a space where nested circles are separable
Tests that 2D nested circles become separable with a perceptron when
projected in the first 2 kPCA using an RBF kernel, while raw samples
are not directly separable in the original space.
"""
X, y = make_circles(n_samples=400, factor=0.3, noise=0.05, random_state=0)
# 2D nested circles are not linearly separable
train_score = Perceptron(max_iter=5).fit(X, y).score(X, y)
assert train_score < 0.8
# Project the circles data into the first 2 components of a RBF Kernel
# PCA model.
# Note that the gamma value is data dependent. If this test breaks
# and the gamma value has to be updated, the Kernel PCA example will
# have to be updated too.
kpca = KernelPCA(
kernel="rbf", n_components=2, fit_inverse_transform=True, gamma=2.0
)
X_kpca = kpca.fit_transform(X)
# The data is perfectly linearly separable in that space
train_score = Perceptron(max_iter=5).fit(X_kpca, y).score(X_kpca, y)
assert train_score == 1.0
def test_kernel_conditioning():
"""Check that ``_check_psd_eigenvalues`` is correctly called in kPCA
Non-regression test for issue #12140 (PR #12145).
"""
# create a pathological X leading to small non-zero eigenvalue
X = [[5, 1], [5 + 1e-8, 1e-8], [5 + 1e-8, 0]]
kpca = KernelPCA(kernel="linear", n_components=2, fit_inverse_transform=True)
kpca.fit(X)
# check that the small non-zero eigenvalue was correctly set to zero
assert kpca.eigenvalues_.min() == 0
assert np.all(kpca.eigenvalues_ == _check_psd_eigenvalues(kpca.eigenvalues_))
@pytest.mark.parametrize("solver", ["auto", "dense", "arpack", "randomized"])
def test_precomputed_kernel_not_psd(solver):
"""Check how KernelPCA works with non-PSD kernels depending on n_components
Tests for all methods what happens with a non PSD gram matrix (this
can happen in an isomap scenario, or with custom kernel functions, or
maybe with ill-posed datasets).
When ``n_component`` is large enough to capture a negative eigenvalue, an
error should be raised. Otherwise, KernelPCA should run without error
since the negative eigenvalues are not selected.
"""
# a non PSD kernel with large eigenvalues, already centered
# it was captured from an isomap call and multiplied by 100 for compacity
K = [
[4.48, -1.0, 8.07, 2.33, 2.33, 2.33, -5.76, -12.78],
[-1.0, -6.48, 4.5, -1.24, -1.24, -1.24, -0.81, 7.49],
[8.07, 4.5, 15.48, 2.09, 2.09, 2.09, -11.1, -23.23],
[2.33, -1.24, 2.09, 4.0, -3.65, -3.65, 1.02, -0.9],
[2.33, -1.24, 2.09, -3.65, 4.0, -3.65, 1.02, -0.9],
[2.33, -1.24, 2.09, -3.65, -3.65, 4.0, 1.02, -0.9],
[-5.76, -0.81, -11.1, 1.02, 1.02, 1.02, 4.86, 9.75],
[-12.78, 7.49, -23.23, -0.9, -0.9, -0.9, 9.75, 21.46],
]
# this gram matrix has 5 positive eigenvalues and 3 negative ones
# [ 52.72, 7.65, 7.65, 5.02, 0. , -0. , -6.13, -15.11]
# 1. ask for enough components to get a significant negative one
kpca = KernelPCA(kernel="precomputed", eigen_solver=solver, n_components=7)
# make sure that the appropriate error is raised
with pytest.raises(ValueError, match="There are significant negative eigenvalues"):
kpca.fit(K)
# 2. ask for a small enough n_components to get only positive ones
kpca = KernelPCA(kernel="precomputed", eigen_solver=solver, n_components=2)
if solver == "randomized":
# the randomized method is still inconsistent with the others on this
# since it selects the eigenvalues based on the largest 2 modules, not
# on the largest 2 values.
#
# At least we can ensure that we return an error instead of returning
# the wrong eigenvalues
with pytest.raises(
ValueError, match="There are significant negative eigenvalues"
):
kpca.fit(K)
else:
# general case: make sure that it works
kpca.fit(K)
@pytest.mark.parametrize("n_components", [4, 10, 20])
def test_kernel_pca_solvers_equivalence(n_components):
"""Check that 'dense' 'arpack' & 'randomized' solvers give similar results"""
# Generate random data
n_train, n_test = 1_000, 100
X, _ = make_circles(
n_samples=(n_train + n_test), factor=0.3, noise=0.05, random_state=0
)
X_fit, X_pred = X[:n_train, :], X[n_train:, :]
# reference (full)
ref_pred = (
KernelPCA(n_components, eigen_solver="dense", random_state=0)
.fit(X_fit)
.transform(X_pred)
)
# arpack
a_pred = (
KernelPCA(n_components, eigen_solver="arpack", random_state=0)
.fit(X_fit)
.transform(X_pred)
)
# check that the result is still correct despite the approx
assert_array_almost_equal(np.abs(a_pred), np.abs(ref_pred))
# randomized
r_pred = (
KernelPCA(n_components, eigen_solver="randomized", random_state=0)
.fit(X_fit)
.transform(X_pred)
)
# check that the result is still correct despite the approximation
assert_array_almost_equal(np.abs(r_pred), np.abs(ref_pred))
def test_kernel_pca_inverse_transform_reconstruction():
"""Test if the reconstruction is a good approximation.
Note that in general it is not possible to get an arbitrarily good
reconstruction because of kernel centering that does not
preserve all the information of the original data.
"""
X, *_ = make_blobs(n_samples=100, n_features=4, random_state=0)
kpca = KernelPCA(
n_components=20, kernel="rbf", fit_inverse_transform=True, alpha=1e-3
)
X_trans = kpca.fit_transform(X)
X_reconst = kpca.inverse_transform(X_trans)
assert np.linalg.norm(X - X_reconst) / np.linalg.norm(X) < 1e-1
def test_kernel_pca_raise_not_fitted_error():
X = np.random.randn(15).reshape(5, 3)
kpca = KernelPCA()
kpca.fit(X)
with pytest.raises(NotFittedError):
kpca.inverse_transform(X)
def test_32_64_decomposition_shape():
"""Test that the decomposition is similar for 32 and 64 bits data
Non regression test for
https://github.com/scikit-learn/scikit-learn/issues/18146
"""
X, y = make_blobs(
n_samples=30, centers=[[0, 0, 0], [1, 1, 1]], random_state=0, cluster_std=0.1
)
X = StandardScaler().fit_transform(X)
X -= X.min()
# Compare the shapes (corresponds to the number of non-zero eigenvalues)
kpca = KernelPCA()
assert kpca.fit_transform(X).shape == kpca.fit_transform(X.astype(np.float32)).shape
def test_kernel_pca_feature_names_out():
"""Check feature names out for KernelPCA."""
X, *_ = make_blobs(n_samples=100, n_features=4, random_state=0)
kpca = KernelPCA(n_components=2).fit(X)
names = kpca.get_feature_names_out()
assert_array_equal([f"kernelpca{i}" for i in range(2)], names)