Inzynierka/Lib/site-packages/sklearn/manifold/tests/test_spectral_embedding.py

from unittest.mock import Mock
import pytest

import numpy as np

from scipy import sparse
from scipy.sparse import csgraph
from scipy.linalg import eigh
from scipy.sparse.linalg import eigsh

from sklearn.manifold import SpectralEmbedding, _spectral_embedding
from sklearn.manifold._spectral_embedding import _graph_is_connected
from sklearn.manifold._spectral_embedding import _graph_connected_component
from sklearn.manifold import spectral_embedding
from sklearn.metrics.pairwise import rbf_kernel
from sklearn.metrics import normalized_mutual_info_score, pairwise_distances
from sklearn.neighbors import NearestNeighbors
from sklearn.cluster import KMeans
from sklearn.datasets import make_blobs
from sklearn.utils.extmath import _deterministic_vector_sign_flip
from sklearn.utils._testing import assert_array_almost_equal
from sklearn.utils._testing import assert_array_equal
from sklearn.utils.fixes import lobpcg

try:
    from pyamg import smoothed_aggregation_solver  # noqa

    pyamg_available = True
except ImportError:
    pyamg_available = False
skip_if_no_pyamg = pytest.mark.skipif(
    not pyamg_available, reason="PyAMG is required for the tests in this function."
)

# non centered, sparse centers to check the
centers = np.array(
    [
        [0.0, 5.0, 0.0, 0.0, 0.0],
        [0.0, 0.0, 4.0, 0.0, 0.0],
        [1.0, 0.0, 0.0, 5.0, 1.0],
    ]
)
n_samples = 1000
n_clusters, n_features = centers.shape
S, true_labels = make_blobs(
    n_samples=n_samples, centers=centers, cluster_std=1.0, random_state=42
)


def _assert_equal_with_sign_flipping(A, B, tol=0.0):
    """Check array A and B are equal with possible sign flipping on
    each columns"""
    tol_squared = tol**2
    for A_col, B_col in zip(A.T, B.T):
        assert (
            np.max((A_col - B_col) ** 2) <= tol_squared
            or np.max((A_col + B_col) ** 2) <= tol_squared
        )


def test_sparse_graph_connected_component():
    rng = np.random.RandomState(42)
    n_samples = 300
    boundaries = [0, 42, 121, 200, n_samples]
    p = rng.permutation(n_samples)
    connections = []

    for start, stop in zip(boundaries[:-1], boundaries[1:]):
        group = p[start:stop]
        # Connect all elements within the group at least once via an
        # arbitrary path that spans the group.
        for i in range(len(group) - 1):
            connections.append((group[i], group[i + 1]))

        # Add some more random connections within the group
        min_idx, max_idx = 0, len(group) - 1
        n_random_connections = 1000
        source = rng.randint(min_idx, max_idx, size=n_random_connections)
        target = rng.randint(min_idx, max_idx, size=n_random_connections)
        connections.extend(zip(group[source], group[target]))

    # Build a symmetric affinity matrix
    row_idx, column_idx = tuple(np.array(connections).T)
    data = rng.uniform(0.1, 42, size=len(connections))
    affinity = sparse.coo_matrix((data, (row_idx, column_idx)))
    affinity = 0.5 * (affinity + affinity.T)

    for start, stop in zip(boundaries[:-1], boundaries[1:]):
        component_1 = _graph_connected_component(affinity, p[start])
        component_size = stop - start
        assert component_1.sum() == component_size

        # We should retrieve the same component mask by starting by both ends
        # of the group
        component_2 = _graph_connected_component(affinity, p[stop - 1])
        assert component_2.sum() == component_size
        assert_array_equal(component_1, component_2)


# TODO: investigate why this test is seed-sensitive on 32-bit Python
# runtimes. Is this revealing a numerical stability problem ? Or is it
# expected from the test numerical design ? In the latter case the test
# should be made less seed-sensitive instead.
@pytest.mark.parametrize(
    "eigen_solver",
    [
        "arpack",
        "lobpcg",
        pytest.param("amg", marks=skip_if_no_pyamg),
    ],
)
@pytest.mark.parametrize("dtype", [np.float32, np.float64])
def test_spectral_embedding_two_components(eigen_solver, dtype, seed=0):
    # Test spectral embedding with two components
    random_state = np.random.RandomState(seed)
    n_sample = 100
    affinity = np.zeros(shape=[n_sample * 2, n_sample * 2])
    # first component
    affinity[0:n_sample, 0:n_sample] = (
        np.abs(random_state.randn(n_sample, n_sample)) + 2
    )
    # second component
    affinity[n_sample::, n_sample::] = (
        np.abs(random_state.randn(n_sample, n_sample)) + 2
    )

    # Test of internal _graph_connected_component before connection
    component = _graph_connected_component(affinity, 0)
    assert component[:n_sample].all()
    assert not component[n_sample:].any()
    component = _graph_connected_component(affinity, -1)
    assert not component[:n_sample].any()
    assert component[n_sample:].all()

    # connection
    affinity[0, n_sample + 1] = 1
    affinity[n_sample + 1, 0] = 1
    affinity.flat[:: 2 * n_sample + 1] = 0
    affinity = 0.5 * (affinity + affinity.T)

    true_label = np.zeros(shape=2 * n_sample)
    true_label[0:n_sample] = 1

    se_precomp = SpectralEmbedding(
        n_components=1,
        affinity="precomputed",
        random_state=np.random.RandomState(seed),
        eigen_solver=eigen_solver,
    )

    embedded_coordinate = se_precomp.fit_transform(affinity.astype(dtype))
    # thresholding on the first components using 0.
    label_ = np.array(embedded_coordinate.ravel() < 0, dtype=np.int64)
    assert normalized_mutual_info_score(true_label, label_) == pytest.approx(1.0)


@pytest.mark.parametrize("X", [S, sparse.csr_matrix(S)], ids=["dense", "sparse"])
@pytest.mark.parametrize(
    "eigen_solver",
    [
        "arpack",
        "lobpcg",
        pytest.param("amg", marks=skip_if_no_pyamg),
    ],
)
@pytest.mark.parametrize("dtype", (np.float32, np.float64))
def test_spectral_embedding_precomputed_affinity(X, eigen_solver, dtype, seed=36):
    # Test spectral embedding with precomputed kernel
    gamma = 1.0
    se_precomp = SpectralEmbedding(
        n_components=2,
        affinity="precomputed",
        random_state=np.random.RandomState(seed),
        eigen_solver=eigen_solver,
    )
    se_rbf = SpectralEmbedding(
        n_components=2,
        affinity="rbf",
        gamma=gamma,
        random_state=np.random.RandomState(seed),
        eigen_solver=eigen_solver,
    )
    embed_precomp = se_precomp.fit_transform(rbf_kernel(X.astype(dtype), gamma=gamma))
    embed_rbf = se_rbf.fit_transform(X.astype(dtype))
    assert_array_almost_equal(se_precomp.affinity_matrix_, se_rbf.affinity_matrix_)
    _assert_equal_with_sign_flipping(embed_precomp, embed_rbf, 0.05)


def test_precomputed_nearest_neighbors_filtering():
    # Test precomputed graph filtering when containing too many neighbors
    n_neighbors = 2
    results = []
    for additional_neighbors in [0, 10]:
        nn = NearestNeighbors(n_neighbors=n_neighbors + additional_neighbors).fit(S)
        graph = nn.kneighbors_graph(S, mode="connectivity")
        embedding = (
            SpectralEmbedding(
                random_state=0,
                n_components=2,
                affinity="precomputed_nearest_neighbors",
                n_neighbors=n_neighbors,
            )
            .fit(graph)
            .embedding_
        )
        results.append(embedding)

    assert_array_equal(results[0], results[1])


@pytest.mark.parametrize("X", [S, sparse.csr_matrix(S)], ids=["dense", "sparse"])
def test_spectral_embedding_callable_affinity(X, seed=36):
    # Test spectral embedding with callable affinity
    gamma = 0.9
    kern = rbf_kernel(S, gamma=gamma)
    se_callable = SpectralEmbedding(
        n_components=2,
        affinity=(lambda x: rbf_kernel(x, gamma=gamma)),
        gamma=gamma,
        random_state=np.random.RandomState(seed),
    )
    se_rbf = SpectralEmbedding(
        n_components=2,
        affinity="rbf",
        gamma=gamma,
        random_state=np.random.RandomState(seed),
    )
    embed_rbf = se_rbf.fit_transform(X)
    embed_callable = se_callable.fit_transform(X)
    assert_array_almost_equal(se_callable.affinity_matrix_, se_rbf.affinity_matrix_)
    assert_array_almost_equal(kern, se_rbf.affinity_matrix_)
    _assert_equal_with_sign_flipping(embed_rbf, embed_callable, 0.05)


# TODO: Remove when pyamg does replaces sp.rand call with np.random.rand
# https://github.com/scikit-learn/scikit-learn/issues/15913
@pytest.mark.filterwarnings(
    "ignore:scipy.rand is deprecated:DeprecationWarning:pyamg.*"
)
# TODO: Remove when pyamg removes the use of np.float
@pytest.mark.filterwarnings(
    "ignore:`np.float` is a deprecated alias:DeprecationWarning:pyamg.*"
)
# TODO: Remove when pyamg removes the use of pinv2
@pytest.mark.filterwarnings(
    "ignore:scipy.linalg.pinv2 is deprecated:DeprecationWarning:pyamg.*"
)
@pytest.mark.skipif(
    not pyamg_available, reason="PyAMG is required for the tests in this function."
)
@pytest.mark.parametrize("dtype", (np.float32, np.float64))
def test_spectral_embedding_amg_solver(dtype, seed=36):
    se_amg = SpectralEmbedding(
        n_components=2,
        affinity="nearest_neighbors",
        eigen_solver="amg",
        n_neighbors=5,
        random_state=np.random.RandomState(seed),
    )
    se_arpack = SpectralEmbedding(
        n_components=2,
        affinity="nearest_neighbors",
        eigen_solver="arpack",
        n_neighbors=5,
        random_state=np.random.RandomState(seed),
    )
    embed_amg = se_amg.fit_transform(S.astype(dtype))
    embed_arpack = se_arpack.fit_transform(S.astype(dtype))
    _assert_equal_with_sign_flipping(embed_amg, embed_arpack, 1e-5)

    # same with special case in which amg is not actually used
    # regression test for #10715
    # affinity between nodes
    row = [0, 0, 1, 2, 3, 3, 4]
    col = [1, 2, 2, 3, 4, 5, 5]
    val = [100, 100, 100, 1, 100, 100, 100]

    affinity = sparse.coo_matrix(
        (val + val, (row + col, col + row)), shape=(6, 6)
    ).toarray()
    se_amg.affinity = "precomputed"
    se_arpack.affinity = "precomputed"
    embed_amg = se_amg.fit_transform(affinity.astype(dtype))
    embed_arpack = se_arpack.fit_transform(affinity.astype(dtype))
    _assert_equal_with_sign_flipping(embed_amg, embed_arpack, 1e-5)


# TODO: Remove filterwarnings when pyamg does replaces sp.rand call with
# np.random.rand:
# https://github.com/scikit-learn/scikit-learn/issues/15913
@pytest.mark.filterwarnings(
    "ignore:scipy.rand is deprecated:DeprecationWarning:pyamg.*"
)
# TODO: Remove when pyamg removes the use of np.float
@pytest.mark.filterwarnings(
    "ignore:`np.float` is a deprecated alias:DeprecationWarning:pyamg.*"
)
# TODO: Remove when pyamg removes the use of pinv2
@pytest.mark.filterwarnings(
    "ignore:scipy.linalg.pinv2 is deprecated:DeprecationWarning:pyamg.*"
)
@pytest.mark.skipif(
    not pyamg_available, reason="PyAMG is required for the tests in this function."
)
@pytest.mark.parametrize("dtype", (np.float32, np.float64))
def test_spectral_embedding_amg_solver_failure(dtype, seed=36):
    # Non-regression test for amg solver failure (issue #13393 on github)
    num_nodes = 100
    X = sparse.rand(num_nodes, num_nodes, density=0.1, random_state=seed)
    X = X.astype(dtype)
    upper = sparse.triu(X) - sparse.diags(X.diagonal())
    sym_matrix = upper + upper.T
    embedding = spectral_embedding(
        sym_matrix, n_components=10, eigen_solver="amg", random_state=0
    )

    # Check that the learned embedding is stable w.r.t. random solver init:
    for i in range(3):
        new_embedding = spectral_embedding(
            sym_matrix, n_components=10, eigen_solver="amg", random_state=i + 1
        )
        _assert_equal_with_sign_flipping(embedding, new_embedding, tol=0.05)


@pytest.mark.filterwarnings("ignore:the behavior of nmi will change in version 0.22")
def test_pipeline_spectral_clustering(seed=36):
    # Test using pipeline to do spectral clustering
    random_state = np.random.RandomState(seed)
    se_rbf = SpectralEmbedding(
        n_components=n_clusters, affinity="rbf", random_state=random_state
    )
    se_knn = SpectralEmbedding(
        n_components=n_clusters,
        affinity="nearest_neighbors",
        n_neighbors=5,
        random_state=random_state,
    )
    for se in [se_rbf, se_knn]:
        km = KMeans(n_clusters=n_clusters, random_state=random_state, n_init="auto")
        km.fit(se.fit_transform(S))
        assert_array_almost_equal(
            normalized_mutual_info_score(km.labels_, true_labels), 1.0, 2
        )


def test_connectivity(seed=36):
    # Test that graph connectivity test works as expected
    graph = np.array(
        [
            [1, 0, 0, 0, 0],
            [0, 1, 1, 0, 0],
            [0, 1, 1, 1, 0],
            [0, 0, 1, 1, 1],
            [0, 0, 0, 1, 1],
        ]
    )
    assert not _graph_is_connected(graph)
    assert not _graph_is_connected(sparse.csr_matrix(graph))
    assert not _graph_is_connected(sparse.csc_matrix(graph))
    graph = np.array(
        [
            [1, 1, 0, 0, 0],
            [1, 1, 1, 0, 0],
            [0, 1, 1, 1, 0],
            [0, 0, 1, 1, 1],
            [0, 0, 0, 1, 1],
        ]
    )
    assert _graph_is_connected(graph)
    assert _graph_is_connected(sparse.csr_matrix(graph))
    assert _graph_is_connected(sparse.csc_matrix(graph))


def test_spectral_embedding_deterministic():
    # Test that Spectral Embedding is deterministic
    random_state = np.random.RandomState(36)
    data = random_state.randn(10, 30)
    sims = rbf_kernel(data)
    embedding_1 = spectral_embedding(sims)
    embedding_2 = spectral_embedding(sims)
    assert_array_almost_equal(embedding_1, embedding_2)


def test_spectral_embedding_unnormalized():
    # Test that spectral_embedding is also processing unnormalized laplacian
    # correctly
    random_state = np.random.RandomState(36)
    data = random_state.randn(10, 30)
    sims = rbf_kernel(data)
    n_components = 8
    embedding_1 = spectral_embedding(
        sims, norm_laplacian=False, n_components=n_components, drop_first=False
    )

    # Verify using manual computation with dense eigh
    laplacian, dd = csgraph.laplacian(sims, normed=False, return_diag=True)
    _, diffusion_map = eigh(laplacian)
    embedding_2 = diffusion_map.T[:n_components]
    embedding_2 = _deterministic_vector_sign_flip(embedding_2).T

    assert_array_almost_equal(embedding_1, embedding_2)


def test_spectral_embedding_first_eigen_vector():
    # Test that the first eigenvector of spectral_embedding
    # is constant and that the second is not (for a connected graph)
    random_state = np.random.RandomState(36)
    data = random_state.randn(10, 30)
    sims = rbf_kernel(data)
    n_components = 2

    for seed in range(10):
        embedding = spectral_embedding(
            sims,
            norm_laplacian=False,
            n_components=n_components,
            drop_first=False,
            random_state=seed,
        )

        assert np.std(embedding[:, 0]) == pytest.approx(0)
        assert np.std(embedding[:, 1]) > 1e-3


@pytest.mark.parametrize(
    "eigen_solver",
    [
        "arpack",
        "lobpcg",
        pytest.param("amg", marks=skip_if_no_pyamg),
    ],
)
@pytest.mark.parametrize("dtype", [np.float32, np.float64])
def test_spectral_embedding_preserves_dtype(eigen_solver, dtype):
    """Check that `SpectralEmbedding is preserving the dtype of the fitted
    attribute and transformed data.

    Ideally, this test should be covered by the common test
    `check_transformer_preserve_dtypes`. However, this test only run
    with transformers implementing `transform` while `SpectralEmbedding`
    implements only `fit_transform`.
    """
    X = S.astype(dtype)
    se = SpectralEmbedding(
        n_components=2, affinity="rbf", eigen_solver=eigen_solver, random_state=0
    )
    X_trans = se.fit_transform(X)

    assert X_trans.dtype == dtype
    assert se.embedding_.dtype == dtype
    assert se.affinity_matrix_.dtype == dtype


@pytest.mark.skipif(
    pyamg_available,
    reason="PyAMG is installed and we should not test for an error.",
)
def test_error_pyamg_not_available():
    se_precomp = SpectralEmbedding(
        n_components=2,
        affinity="rbf",
        eigen_solver="amg",
    )
    err_msg = "The eigen_solver was set to 'amg', but pyamg is not available."
    with pytest.raises(ValueError, match=err_msg):
        se_precomp.fit_transform(S)


@pytest.mark.parametrize("solver", ["arpack", "amg", "lobpcg"])
def test_spectral_eigen_tol_auto(monkeypatch, solver):
    """Test that `eigen_tol="auto"` is resolved correctly"""
    if solver == "amg" and not pyamg_available:
        pytest.skip("PyAMG is not available.")
    X, _ = make_blobs(
        n_samples=200, random_state=0, centers=[[1, 1], [-1, -1]], cluster_std=0.01
    )
    D = pairwise_distances(X)  # Distance matrix
    S = np.max(D) - D  # Similarity matrix

    solver_func = eigsh if solver == "arpack" else lobpcg
    default_value = 0 if solver == "arpack" else None
    if solver == "amg":
        S = sparse.csr_matrix(S)

    mocked_solver = Mock(side_effect=solver_func)

    monkeypatch.setattr(_spectral_embedding, solver_func.__qualname__, mocked_solver)

    spectral_embedding(S, random_state=42, eigen_solver=solver, eigen_tol="auto")
    mocked_solver.assert_called()

    _, kwargs = mocked_solver.call_args
    assert kwargs["tol"] == default_value