331 lines
11 KiB
Python
331 lines
11 KiB
Python
"""Testing for Spectral Clustering methods"""
|
|
import re
|
|
|
|
import numpy as np
|
|
from scipy import sparse
|
|
from scipy.linalg import LinAlgError
|
|
|
|
import pytest
|
|
|
|
import pickle
|
|
|
|
from sklearn.utils import check_random_state
|
|
from sklearn.utils._testing import assert_array_equal
|
|
|
|
from sklearn.cluster import SpectralClustering, spectral_clustering
|
|
from sklearn.cluster._spectral import discretize, cluster_qr
|
|
from sklearn.feature_extraction import img_to_graph
|
|
from sklearn.metrics import adjusted_rand_score
|
|
from sklearn.metrics.pairwise import kernel_metrics, rbf_kernel
|
|
from sklearn.neighbors import NearestNeighbors
|
|
from sklearn.datasets import make_blobs
|
|
|
|
# Probe for the optional pyamg dependency; tests that need the "amg" eigen
# solver branch on this flag.
try:
    from pyamg import smoothed_aggregation_solver  # noqa
except ImportError:
    amg_loaded = False
else:
    amg_loaded = True
|
|
|
|
# Module-level toy dataset: 60 two-dimensional samples drawn around three
# centres that are offset by 10 on both axes.
centers = 10 + np.array([[1, 1], [-1, -1], [1, -1]])
X, _ = make_blobs(
    centers=centers,
    n_features=2,
    n_samples=60,
    cluster_std=0.4,
    random_state=0,
    shuffle=True,
)
|
|
|
|
|
|
@pytest.mark.parametrize("eigen_solver", ("arpack", "lobpcg"))
@pytest.mark.parametrize("assign_labels", ("kmeans", "discretize", "cluster_qr"))
def test_spectral_clustering(eigen_solver, assign_labels):
    """Split a precomputed two-block affinity and round-trip the model.

    The affinity has two dense groups (samples 0-2 and samples 3-6) connected
    only through the 0.2 entries involving sample 3. It is supplied both as a
    dense array and as a CSR matrix; in each case the fitted estimator must
    recover the two groups and keep its state through pickling.
    """
    affinity = np.array(
        [
            [1.0, 1.0, 1.0, 0.2, 0.0, 0.0, 0.0],
            [1.0, 1.0, 1.0, 0.2, 0.0, 0.0, 0.0],
            [1.0, 1.0, 1.0, 0.2, 0.0, 0.0, 0.0],
            [0.2, 0.2, 0.2, 1.0, 1.0, 1.0, 1.0],
            [0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0],
            [0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0],
            [0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0],
        ]
    )
    expected = [1, 1, 1, 0, 0, 0, 0]

    for precomputed in (affinity, sparse.csr_matrix(affinity)):
        clusterer = SpectralClustering(
            n_clusters=2,
            affinity="precomputed",
            eigen_solver=eigen_solver,
            assign_labels=assign_labels,
            random_state=0,
        )
        clusterer.fit(precomputed)

        # Label numbering is arbitrary: flip it so that sample 0 carries 1.
        found = clusterer.labels_
        found = 1 - found if found[0] == 0 else found
        assert adjusted_rand_score(found, expected) == 1

        # The estimator must survive a pickle round trip unchanged.
        restored = pickle.loads(pickle.dumps(clusterer))
        assert restored.n_clusters == clusterer.n_clusters
        assert restored.eigen_solver == clusterer.eigen_solver
        assert_array_equal(restored.labels_, clusterer.labels_)
|
|
|
|
|
|
@pytest.mark.parametrize("assign_labels", ("kmeans", "discretize", "cluster_qr"))
def test_spectral_clustering_sparse(assign_labels):
    """Cluster two tight blobs from a sparse (COO) precomputed affinity."""
    X, y = make_blobs(
        n_samples=20, random_state=0, centers=[[1, 1], [-1, -1]], cluster_std=0.01
    )

    # Shift the RBF similarities down by 1e-4 and clip at zero before
    # converting the result to COO format.
    kernel = rbf_kernel(X, gamma=1)
    affinity = sparse.coo_matrix(np.maximum(kernel - 1e-4, 0))

    clusterer = SpectralClustering(
        n_clusters=2,
        affinity="precomputed",
        assign_labels=assign_labels,
        random_state=0,
    )
    clusterer.fit(affinity)

    assert adjusted_rand_score(y, clusterer.labels_) == 1
|
|
|
|
|
|
def test_precomputed_nearest_neighbors_filtering():
    """Test precomputed graph filtering when containing too many neighbors.

    A connectivity graph built with exactly ``n_neighbors`` neighbors and one
    built with 10 additional neighbors must produce the same labels.
    """
    X, _ = make_blobs(
        n_samples=200, random_state=0, centers=[[1, 1], [-1, -1]], cluster_std=0.01
    )
    n_neighbors = 2

    def labels_with_extra_neighbors(additional_neighbors):
        # Build the k-NN graph with the requested surplus, then cluster it.
        knn = NearestNeighbors(n_neighbors=n_neighbors + additional_neighbors)
        graph = knn.fit(X).kneighbors_graph(X, mode="connectivity")
        clusterer = SpectralClustering(
            n_clusters=2,
            affinity="precomputed_nearest_neighbors",
            n_neighbors=n_neighbors,
            random_state=0,
        )
        return clusterer.fit(graph).labels_

    exact, surplus = (labels_with_extra_neighbors(extra) for extra in (0, 10))
    assert_array_equal(exact, surplus)
|
|
|
|
|
|
def test_affinities():
    """Exercise nearest-neighbors, named-kernel and callable affinities."""
    # Note: in the following, random_state has been selected to have
    # a dataset that yields a stable eigen decomposition both when built
    # on OSX and Linux
    X, y = make_blobs(
        n_samples=20, random_state=0, centers=[[1, 1], [-1, -1]], cluster_std=0.01
    )

    # nearest neighbors affinity: fitting is expected to emit a warning
    knn_model = SpectralClustering(
        n_clusters=2, affinity="nearest_neighbors", random_state=0
    )
    with pytest.warns(UserWarning, match="not fully connected"):
        knn_model.fit(X)
    assert adjusted_rand_score(y, knn_model.labels_) == 1

    # affinity left at its default, with an explicit gamma
    gamma_model = SpectralClustering(n_clusters=2, gamma=2, random_state=0)
    gamma_model.fit(X)
    assert adjusted_rand_score(y, gamma_model.labels_) == 1

    # From here on only the shape of the labels is checked, on random data.
    X = check_random_state(10).rand(10, 5) * 10

    def assert_one_label_per_sample(affinity):
        model = SpectralClustering(n_clusters=2, affinity=affinity, random_state=0)
        assert model.fit(X).labels_.shape == (X.shape[0],)

    for kernel_name in kernel_metrics():
        # Additive chi^2 gives a negative similarity matrix which
        # doesn't make sense for spectral clustering
        if kernel_name == "additive_chi2":
            continue
        assert_one_label_per_sample(kernel_name)

    # A callable returning a constant similarity.
    assert_one_label_per_sample(lambda x, y: 1)

    def histogram(x, y, **kwargs):
        # Histogram kernel implemented as a callable.
        assert kwargs == {}  # no kernel_params that we didn't ask for
        return np.minimum(x, y).sum()

    assert_one_label_per_sample(histogram)
|
|
|
|
|
|
def test_cluster_qr():
    """cluster_qr must give the same labels for float64 and float32 input.

    cluster_qr by itself should not be used for clustering generic data
    other than the rows of the eigenvectors within spectral clustering,
    but it must still preserve the labels for different dtypes of the
    generic fixed input even if the labels may be meaningless.
    """
    n_samples, n_components = 10, 5
    rng = np.random.RandomState(seed=8)
    data = rng.randn(n_samples, n_components)

    double_labels = cluster_qr(data.astype(np.float64))
    # Each sample is assigned a cluster identifier
    assert double_labels.shape == (n_samples,)
    # All components should be covered by the assignment
    covered = np.unique(double_labels)
    assert np.array_equal(covered, np.arange(n_components))

    # Single precision data should yield the same cluster assignments
    single_labels = cluster_qr(data.astype(np.float32))
    assert np.array_equal(double_labels, single_labels)
|
|
|
|
|
|
def test_cluster_qr_permutation_invariance():
    """cluster_qr must be invariant to sample permutation.

    Permuting the labels of the original data must give the same result as
    labelling the permuted data.
    """
    n_samples, n_components = 100, 5
    rng = np.random.RandomState(seed=8)
    data = rng.randn(n_samples, n_components)
    perm = rng.permutation(n_samples)

    labels_then_permuted = cluster_qr(data)[perm]
    permuted_then_labels = cluster_qr(data[perm])
    assert np.array_equal(labels_then_permuted, permuted_then_labels)
|
|
|
|
|
|
@pytest.mark.parametrize("n_samples", [50, 100, 150, 500])
def test_discretize(n_samples):
    """Test discretize using a noisy class assignment matrix.

    For each class count, random labels are turned into an indicator matrix,
    perturbed with Gaussian noise scaled by 0.1, and passed to discretize.
    The recovered labels must reach an adjusted Rand score above 0.8.
    """
    rng = np.random.RandomState(seed=8)
    for n_class in range(2, 10):
        n_labels = n_class + 1
        # random class labels, converted to float
        y_true = rng.randint(0, n_labels, n_samples).astype(float)
        # indicator matrix with a single one per row, at column y_true
        y_indicator = sparse.coo_matrix(
            (np.ones(n_samples), (np.arange(n_samples), y_true)),
            shape=(n_samples, n_labels),
        )
        # noisy class assignment matrix
        noise = 0.1 * rng.randn(n_samples, n_labels)
        y_true_noisy = y_indicator.toarray() + noise
        y_pred = discretize(y_true_noisy, random_state=rng)
        assert adjusted_rand_score(y_true, y_pred) > 0.8
|
|
|
|
|
|
# TODO: Remove when pyamg replaces the sp.rand call with np.random.rand
# https://github.com/scikit-learn/scikit-learn/issues/15913
@pytest.mark.filterwarnings(
    "ignore:scipy.rand is deprecated:DeprecationWarning:pyamg.*"
)
# TODO: Remove when pyamg removes the use of np.float
@pytest.mark.filterwarnings(
    "ignore:`np.float` is a deprecated alias:DeprecationWarning:pyamg.*"
)
# TODO: Remove when pyamg removes the use of pinv2
@pytest.mark.filterwarnings(
    "ignore:scipy.linalg.pinv2 is deprecated:DeprecationWarning:pyamg.*"
)
def test_spectral_clustering_with_arpack_amg_solvers():
    """Test that spectral_clustering is the same for arpack and amg solver.

    Based on toy example from plot_segmentation_toy.py. When pyamg is not
    available, requesting the "amg" solver must raise a ValueError instead.
    """
    # a small two coin image
    rows, cols = np.indices((40, 40))

    def disk(center, radius):
        # Boolean mask of the pixels strictly inside the given circle.
        return (rows - center[0]) ** 2 + (cols - center[1]) ** 2 < radius**2

    circles = disk((14, 12), 8) | disk((20, 25), 7)
    mask = circles.copy()
    img = circles.astype(float)

    graph = img_to_graph(img, mask=mask)
    graph.data = np.exp(-graph.data / graph.data.std())

    def cluster_with(eigen_solver):
        return spectral_clustering(
            graph, n_clusters=2, eigen_solver=eigen_solver, random_state=0
        )

    labels_arpack = cluster_with("arpack")
    assert len(np.unique(labels_arpack)) == 2

    if not amg_loaded:
        with pytest.raises(ValueError):
            cluster_with("amg")
        return

    labels_amg = cluster_with("amg")
    assert adjusted_rand_score(labels_arpack, labels_amg) == 1
|
|
|
|
|
|
def test_n_components():
    """Test that n_components = n_clusters by default and that changing
    n_components changes the result."""
    X, _ = make_blobs(
        n_samples=20, random_state=0, centers=[[1, 1], [-1, -1]], cluster_std=0.01
    )

    def fit_labels(**params):
        # Fit with a fixed random_state plus the given estimator parameters.
        return SpectralClustering(random_state=0, **params).fit(X).labels_

    reference = fit_labels(n_clusters=2)

    # set n_components = n_cluster and test if result is the same,
    # i.e. that n_components=n_clusters by default
    same_ncomp = fit_labels(n_clusters=2, n_components=2)
    assert_array_equal(reference, same_ncomp)

    # test that n_components affect result
    # n_clusters=8 by default, and set n_components=2
    diff_ncomp = fit_labels(n_components=2)
    assert not np.array_equal(reference, diff_ncomp)
|
|
|
|
|
|
@pytest.mark.parametrize("assign_labels", ("kmeans", "discretize", "cluster_qr"))
def test_verbose(assign_labels, capsys):
    """Check the verbose output of SpectralClustering for each assign_labels.

    Parameters
    ----------
    assign_labels : str
        Label assignment strategy under test, forwarded to the estimator.
    capsys : pytest fixture
        Used to capture what the estimator prints to stdout.
    """
    X, y = make_blobs(
        n_samples=20, random_state=0, centers=[[1, 1], [-1, -1]], cluster_std=0.01
    )

    # Forward the parametrized strategy: without it every parametrization
    # would silently run with the estimator's default assign_labels.
    SpectralClustering(
        n_clusters=2, assign_labels=assign_labels, random_state=42, verbose=1
    ).fit(X)

    captured = capsys.readouterr()

    # Expected for every strategy.
    assert re.search(r"Computing label assignment using", captured.out)

    # The KMeans progress messages are only expected for the "kmeans" strategy.
    if assign_labels == "kmeans":
        assert re.search(r"Initialization complete", captured.out)
        assert re.search(r"Iteration [0-9]+, inertia", captured.out)
|
|
|
|
|
|
def test_spectral_clustering_np_matrix_raises():
    """spectral_clustering must reject an np.matrix affinity with a TypeError.

    Non-regression test for #10993.
    """
    matrix_affinity = np.matrix([[0.0, 2.0], [2.0, 0.0]])
    regex = r"spectral_clustering does not support passing in affinity as an np\.matrix"

    with pytest.raises(TypeError, match=regex):
        spectral_clustering(matrix_affinity)
|
|
|
|
|
|
def test_spectral_clustering_not_infinite_loop(capsys, monkeypatch):
    """discretize must raise LinAlgError when the SVD never converges.

    np.linalg.svd is patched to fail on every call.

    Non-regression test for #21380
    """

    def always_failing_svd(*args, **kwargs):
        # Stand-in for np.linalg.svd that raises regardless of its input.
        raise LinAlgError()

    monkeypatch.setattr(np.linalg, "svd", always_failing_svd)

    with pytest.raises(LinAlgError, match="SVD did not converge"):
        discretize(np.ones((10, 4)))
|