""" Several basic tests for hierarchical clustering procedures """ # Authors: Vincent Michel, 2010, Gael Varoquaux 2012, # Matteo Visconti di Oleggio Castello 2014 # License: BSD 3 clause from tempfile import mkdtemp import shutil import pytest from functools import partial import numpy as np from scipy import sparse from scipy.cluster import hierarchy from sklearn.metrics.cluster import adjusted_rand_score from sklearn.utils._testing import assert_almost_equal from sklearn.utils._testing import assert_array_almost_equal from sklearn.utils._testing import assert_raise_message from sklearn.utils._testing import ignore_warnings from sklearn.cluster import ward_tree from sklearn.cluster import AgglomerativeClustering, FeatureAgglomeration from sklearn.cluster._agglomerative import (_hc_cut, _TREE_BUILDERS, linkage_tree, _fix_connectivity) from sklearn.feature_extraction.image import grid_to_graph from sklearn.metrics.pairwise import PAIRED_DISTANCES, cosine_distances,\ manhattan_distances, pairwise_distances from sklearn.metrics.cluster import normalized_mutual_info_score from sklearn.neighbors import kneighbors_graph from sklearn.cluster._hierarchical_fast import average_merge, max_merge from sklearn.utils._fast_dict import IntFloatDict from sklearn.utils._testing import assert_array_equal from sklearn.utils._testing import assert_warns from sklearn.datasets import make_moons, make_circles def test_linkage_misc(): # Misc tests on linkage rng = np.random.RandomState(42) X = rng.normal(size=(5, 5)) with pytest.raises(ValueError): AgglomerativeClustering(linkage='foo').fit(X) with pytest.raises(ValueError): linkage_tree(X, linkage='foo') with pytest.raises(ValueError): linkage_tree(X, connectivity=np.ones((4, 4))) # Smoke test FeatureAgglomeration FeatureAgglomeration().fit(X) # test hierarchical clustering on a precomputed distances matrix dis = cosine_distances(X) res = linkage_tree(dis, affinity="precomputed") assert_array_equal(res[0], linkage_tree(X, affinity="cosine")[0]) # test hierarchical clustering on a precomputed distances matrix res = linkage_tree(X, affinity=manhattan_distances) assert_array_equal(res[0], linkage_tree(X, affinity="manhattan")[0]) def test_structured_linkage_tree(): # Check that we obtain the correct solution for structured linkage trees. rng = np.random.RandomState(0) mask = np.ones([10, 10], dtype=bool) # Avoiding a mask with only 'True' entries mask[4:7, 4:7] = 0 X = rng.randn(50, 100) connectivity = grid_to_graph(*mask.shape) for tree_builder in _TREE_BUILDERS.values(): children, n_components, n_leaves, parent = \ tree_builder(X.T, connectivity=connectivity) n_nodes = 2 * X.shape[1] - 1 assert len(children) + n_leaves == n_nodes # Check that ward_tree raises a ValueError with a connectivity matrix # of the wrong shape with pytest.raises(ValueError): tree_builder(X.T, connectivity=np.ones((4, 4))) # Check that fitting with no samples raises an error with pytest.raises(ValueError): tree_builder(X.T[:0], connectivity=connectivity) def test_unstructured_linkage_tree(): # Check that we obtain the correct solution for unstructured linkage trees. rng = np.random.RandomState(0) X = rng.randn(50, 100) for this_X in (X, X[0]): # With specified a number of clusters just for the sake of # raising a warning and testing the warning code with ignore_warnings(): children, n_nodes, n_leaves, parent = assert_warns( UserWarning, ward_tree, this_X.T, n_clusters=10) n_nodes = 2 * X.shape[1] - 1 assert len(children) + n_leaves == n_nodes for tree_builder in _TREE_BUILDERS.values(): for this_X in (X, X[0]): with ignore_warnings(): children, n_nodes, n_leaves, parent = assert_warns( UserWarning, tree_builder, this_X.T, n_clusters=10) n_nodes = 2 * X.shape[1] - 1 assert len(children) + n_leaves == n_nodes def test_height_linkage_tree(): # Check that the height of the results of linkage tree is sorted. rng = np.random.RandomState(0) mask = np.ones([10, 10], dtype=bool) X = rng.randn(50, 100) connectivity = grid_to_graph(*mask.shape) for linkage_func in _TREE_BUILDERS.values(): children, n_nodes, n_leaves, parent = linkage_func( X.T, connectivity=connectivity) n_nodes = 2 * X.shape[1] - 1 assert len(children) + n_leaves == n_nodes def test_agglomerative_clustering_wrong_arg_memory(): # Test either if an error is raised when memory is not # either a str or a joblib.Memory instance rng = np.random.RandomState(0) n_samples = 100 X = rng.randn(n_samples, 50) memory = 5 clustering = AgglomerativeClustering(memory=memory) with pytest.raises(ValueError): clustering.fit(X) def test_zero_cosine_linkage_tree(): # Check that zero vectors in X produce an error when # 'cosine' affinity is used X = np.array([[0, 1], [0, 0]]) msg = 'Cosine affinity cannot be used when X contains zero vectors' assert_raise_message(ValueError, msg, linkage_tree, X, affinity='cosine') @pytest.mark.parametrize('n_clusters, distance_threshold', [(None, 0.5), (10, None)]) @pytest.mark.parametrize('compute_distances', [True, False]) @pytest.mark.parametrize('linkage', ["ward", "complete", "average", "single"]) def test_agglomerative_clustering_distances(n_clusters, compute_distances, distance_threshold, linkage): # Check that when `compute_distances` is True or `distance_threshold` is # given, the fitted model has an attribute `distances_`. rng = np.random.RandomState(0) mask = np.ones([10, 10], dtype=bool) n_samples = 100 X = rng.randn(n_samples, 50) connectivity = grid_to_graph(*mask.shape) clustering = AgglomerativeClustering(n_clusters=n_clusters, connectivity=connectivity, linkage=linkage, distance_threshold=distance_threshold, compute_distances=compute_distances) clustering.fit(X) if compute_distances or (distance_threshold is not None): assert hasattr(clustering, 'distances_') n_children = clustering.children_.shape[0] n_nodes = n_children + 1 assert clustering.distances_.shape == (n_nodes-1, ) else: assert not hasattr(clustering, 'distances_') def test_agglomerative_clustering(): # Check that we obtain the correct number of clusters with # agglomerative clustering. rng = np.random.RandomState(0) mask = np.ones([10, 10], dtype=bool) n_samples = 100 X = rng.randn(n_samples, 50) connectivity = grid_to_graph(*mask.shape) for linkage in ("ward", "complete", "average", "single"): clustering = AgglomerativeClustering(n_clusters=10, connectivity=connectivity, linkage=linkage) clustering.fit(X) # test caching try: tempdir = mkdtemp() clustering = AgglomerativeClustering( n_clusters=10, connectivity=connectivity, memory=tempdir, linkage=linkage) clustering.fit(X) labels = clustering.labels_ assert np.size(np.unique(labels)) == 10 finally: shutil.rmtree(tempdir) # Turn caching off now clustering = AgglomerativeClustering( n_clusters=10, connectivity=connectivity, linkage=linkage) # Check that we obtain the same solution with early-stopping of the # tree building clustering.compute_full_tree = False clustering.fit(X) assert_almost_equal(normalized_mutual_info_score(clustering.labels_, labels), 1) clustering.connectivity = None clustering.fit(X) assert np.size(np.unique(clustering.labels_)) == 10 # Check that we raise a TypeError on dense matrices clustering = AgglomerativeClustering( n_clusters=10, connectivity=sparse.lil_matrix( connectivity.toarray()[:10, :10]), linkage=linkage) with pytest.raises(ValueError): clustering.fit(X) # Test that using ward with another metric than euclidean raises an # exception clustering = AgglomerativeClustering( n_clusters=10, connectivity=connectivity.toarray(), affinity="manhattan", linkage="ward") with pytest.raises(ValueError): clustering.fit(X) # Test using another metric than euclidean works with linkage complete for affinity in PAIRED_DISTANCES.keys(): # Compare our (structured) implementation to scipy clustering = AgglomerativeClustering( n_clusters=10, connectivity=np.ones((n_samples, n_samples)), affinity=affinity, linkage="complete") clustering.fit(X) clustering2 = AgglomerativeClustering( n_clusters=10, connectivity=None, affinity=affinity, linkage="complete") clustering2.fit(X) assert_almost_equal(normalized_mutual_info_score(clustering2.labels_, clustering.labels_), 1) # Test that using a distance matrix (affinity = 'precomputed') has same # results (with connectivity constraints) clustering = AgglomerativeClustering(n_clusters=10, connectivity=connectivity, linkage="complete") clustering.fit(X) X_dist = pairwise_distances(X) clustering2 = AgglomerativeClustering(n_clusters=10, connectivity=connectivity, affinity='precomputed', linkage="complete") clustering2.fit(X_dist) assert_array_equal(clustering.labels_, clustering2.labels_) def test_ward_agglomeration(): # Check that we obtain the correct solution in a simplistic case rng = np.random.RandomState(0) mask = np.ones([10, 10], dtype=bool) X = rng.randn(50, 100) connectivity = grid_to_graph(*mask.shape) agglo = FeatureAgglomeration(n_clusters=5, connectivity=connectivity) agglo.fit(X) assert np.size(np.unique(agglo.labels_)) == 5 X_red = agglo.transform(X) assert X_red.shape[1] == 5 X_full = agglo.inverse_transform(X_red) assert np.unique(X_full[0]).size == 5 assert_array_almost_equal(agglo.transform(X_full), X_red) # Check that fitting with no samples raises a ValueError with pytest.raises(ValueError): agglo.fit(X[:0]) def test_single_linkage_clustering(): # Check that we get the correct result in two emblematic cases moons, moon_labels = make_moons(noise=0.05, random_state=42) clustering = AgglomerativeClustering(n_clusters=2, linkage='single') clustering.fit(moons) assert_almost_equal(normalized_mutual_info_score(clustering.labels_, moon_labels), 1) circles, circle_labels = make_circles(factor=0.5, noise=0.025, random_state=42) clustering = AgglomerativeClustering(n_clusters=2, linkage='single') clustering.fit(circles) assert_almost_equal(normalized_mutual_info_score(clustering.labels_, circle_labels), 1) def assess_same_labelling(cut1, cut2): """Util for comparison with scipy""" co_clust = [] for cut in [cut1, cut2]: n = len(cut) k = cut.max() + 1 ecut = np.zeros((n, k)) ecut[np.arange(n), cut] = 1 co_clust.append(np.dot(ecut, ecut.T)) assert (co_clust[0] == co_clust[1]).all() def test_sparse_scikit_vs_scipy(): # Test scikit linkage with full connectivity (i.e. unstructured) vs scipy n, p, k = 10, 5, 3 rng = np.random.RandomState(0) # Not using a lil_matrix here, just to check that non sparse # matrices are well handled connectivity = np.ones((n, n)) for linkage in _TREE_BUILDERS.keys(): for i in range(5): X = .1 * rng.normal(size=(n, p)) X -= 4. * np.arange(n)[:, np.newaxis] X -= X.mean(axis=1)[:, np.newaxis] out = hierarchy.linkage(X, method=linkage) children_ = out[:, :2].astype(int, copy=False) children, _, n_leaves, _ = _TREE_BUILDERS[linkage]( X, connectivity=connectivity) # Sort the order of child nodes per row for consistency children.sort(axis=1) assert_array_equal(children, children_, 'linkage tree differs' ' from scipy impl for' ' linkage: ' + linkage) cut = _hc_cut(k, children, n_leaves) cut_ = _hc_cut(k, children_, n_leaves) assess_same_labelling(cut, cut_) # Test error management in _hc_cut with pytest.raises(ValueError): _hc_cut(n_leaves + 1, children, n_leaves) # Make sure our custom mst_linkage_core gives # the same results as scipy's builtin @pytest.mark.parametrize('seed', range(5)) def test_vector_scikit_single_vs_scipy_single(seed): n_samples, n_features, n_clusters = 10, 5, 3 rng = np.random.RandomState(seed) X = .1 * rng.normal(size=(n_samples, n_features)) X -= 4. * np.arange(n_samples)[:, np.newaxis] X -= X.mean(axis=1)[:, np.newaxis] out = hierarchy.linkage(X, method='single') children_scipy = out[:, :2].astype(int) children, _, n_leaves, _ = _TREE_BUILDERS['single'](X) # Sort the order of child nodes per row for consistency children.sort(axis=1) assert_array_equal(children, children_scipy, 'linkage tree differs' ' from scipy impl for' ' single linkage.') cut = _hc_cut(n_clusters, children, n_leaves) cut_scipy = _hc_cut(n_clusters, children_scipy, n_leaves) assess_same_labelling(cut, cut_scipy) def test_identical_points(): # Ensure identical points are handled correctly when using mst with # a sparse connectivity matrix X = np.array([[0, 0, 0], [0, 0, 0], [1, 1, 1], [1, 1, 1], [2, 2, 2], [2, 2, 2]]) true_labels = np.array([0, 0, 1, 1, 2, 2]) connectivity = kneighbors_graph(X, n_neighbors=3, include_self=False) connectivity = 0.5 * (connectivity + connectivity.T) connectivity, n_components = _fix_connectivity(X, connectivity, 'euclidean') for linkage in ('single', 'average', 'average', 'ward'): clustering = AgglomerativeClustering(n_clusters=3, linkage=linkage, connectivity=connectivity) clustering.fit(X) assert_almost_equal(normalized_mutual_info_score(clustering.labels_, true_labels), 1) def test_connectivity_propagation(): # Check that connectivity in the ward tree is propagated correctly during # merging. X = np.array([(.014, .120), (.014, .099), (.014, .097), (.017, .153), (.017, .153), (.018, .153), (.018, .153), (.018, .153), (.018, .153), (.018, .153), (.018, .153), (.018, .153), (.018, .152), (.018, .149), (.018, .144)]) connectivity = kneighbors_graph(X, 10, include_self=False) ward = AgglomerativeClustering( n_clusters=4, connectivity=connectivity, linkage='ward') # If changes are not propagated correctly, fit crashes with an # IndexError ward.fit(X) def test_ward_tree_children_order(): # Check that children are ordered in the same way for both structured and # unstructured versions of ward_tree. # test on five random datasets n, p = 10, 5 rng = np.random.RandomState(0) connectivity = np.ones((n, n)) for i in range(5): X = .1 * rng.normal(size=(n, p)) X -= 4. * np.arange(n)[:, np.newaxis] X -= X.mean(axis=1)[:, np.newaxis] out_unstructured = ward_tree(X) out_structured = ward_tree(X, connectivity=connectivity) assert_array_equal(out_unstructured[0], out_structured[0]) def test_ward_linkage_tree_return_distance(): # Test return_distance option on linkage and ward trees # test that return_distance when set true, gives same # output on both structured and unstructured clustering. n, p = 10, 5 rng = np.random.RandomState(0) connectivity = np.ones((n, n)) for i in range(5): X = .1 * rng.normal(size=(n, p)) X -= 4. * np.arange(n)[:, np.newaxis] X -= X.mean(axis=1)[:, np.newaxis] out_unstructured = ward_tree(X, return_distance=True) out_structured = ward_tree(X, connectivity=connectivity, return_distance=True) # get children children_unstructured = out_unstructured[0] children_structured = out_structured[0] # check if we got the same clusters assert_array_equal(children_unstructured, children_structured) # check if the distances are the same dist_unstructured = out_unstructured[-1] dist_structured = out_structured[-1] assert_array_almost_equal(dist_unstructured, dist_structured) for linkage in ['average', 'complete', 'single']: structured_items = linkage_tree( X, connectivity=connectivity, linkage=linkage, return_distance=True)[-1] unstructured_items = linkage_tree( X, linkage=linkage, return_distance=True)[-1] structured_dist = structured_items[-1] unstructured_dist = unstructured_items[-1] structured_children = structured_items[0] unstructured_children = unstructured_items[0] assert_array_almost_equal(structured_dist, unstructured_dist) assert_array_almost_equal( structured_children, unstructured_children) # test on the following dataset where we know the truth # taken from scipy/cluster/tests/hierarchy_test_data.py X = np.array([[1.43054825, -7.5693489], [6.95887839, 6.82293382], [2.87137846, -9.68248579], [7.87974764, -6.05485803], [8.24018364, -6.09495602], [7.39020262, 8.54004355]]) # truth linkage_X_ward = np.array([[3., 4., 0.36265956, 2.], [1., 5., 1.77045373, 2.], [0., 2., 2.55760419, 2.], [6., 8., 9.10208346, 4.], [7., 9., 24.7784379, 6.]]) linkage_X_complete = np.array( [[3., 4., 0.36265956, 2.], [1., 5., 1.77045373, 2.], [0., 2., 2.55760419, 2.], [6., 8., 6.96742194, 4.], [7., 9., 18.77445997, 6.]]) linkage_X_average = np.array( [[3., 4., 0.36265956, 2.], [1., 5., 1.77045373, 2.], [0., 2., 2.55760419, 2.], [6., 8., 6.55832839, 4.], [7., 9., 15.44089605, 6.]]) n_samples, n_features = np.shape(X) connectivity_X = np.ones((n_samples, n_samples)) out_X_unstructured = ward_tree(X, return_distance=True) out_X_structured = ward_tree(X, connectivity=connectivity_X, return_distance=True) # check that the labels are the same assert_array_equal(linkage_X_ward[:, :2], out_X_unstructured[0]) assert_array_equal(linkage_X_ward[:, :2], out_X_structured[0]) # check that the distances are correct assert_array_almost_equal(linkage_X_ward[:, 2], out_X_unstructured[4]) assert_array_almost_equal(linkage_X_ward[:, 2], out_X_structured[4]) linkage_options = ['complete', 'average', 'single'] X_linkage_truth = [linkage_X_complete, linkage_X_average] for (linkage, X_truth) in zip(linkage_options, X_linkage_truth): out_X_unstructured = linkage_tree( X, return_distance=True, linkage=linkage) out_X_structured = linkage_tree( X, connectivity=connectivity_X, linkage=linkage, return_distance=True) # check that the labels are the same assert_array_equal(X_truth[:, :2], out_X_unstructured[0]) assert_array_equal(X_truth[:, :2], out_X_structured[0]) # check that the distances are correct assert_array_almost_equal(X_truth[:, 2], out_X_unstructured[4]) assert_array_almost_equal(X_truth[:, 2], out_X_structured[4]) def test_connectivity_fixing_non_lil(): # Check non regression of a bug if a non item assignable connectivity is # provided with more than one component. # create dummy data x = np.array([[0, 0], [1, 1]]) # create a mask with several components to force connectivity fixing m = np.array([[True, False], [False, True]]) c = grid_to_graph(n_x=2, n_y=2, mask=m) w = AgglomerativeClustering(connectivity=c, linkage='ward') assert_warns(UserWarning, w.fit, x) def test_int_float_dict(): rng = np.random.RandomState(0) keys = np.unique(rng.randint(100, size=10).astype(np.intp, copy=False)) values = rng.rand(len(keys)) d = IntFloatDict(keys, values) for key, value in zip(keys, values): assert d[key] == value other_keys = np.arange(50, dtype=np.intp)[::2] other_values = np.full(50, 0.5)[::2] other = IntFloatDict(other_keys, other_values) # Complete smoke test max_merge(d, other, mask=np.ones(100, dtype=np.intp), n_a=1, n_b=1) average_merge(d, other, mask=np.ones(100, dtype=np.intp), n_a=1, n_b=1) def test_connectivity_callable(): rng = np.random.RandomState(0) X = rng.rand(20, 5) connectivity = kneighbors_graph(X, 3, include_self=False) aglc1 = AgglomerativeClustering(connectivity=connectivity) aglc2 = AgglomerativeClustering( connectivity=partial(kneighbors_graph, n_neighbors=3, include_self=False)) aglc1.fit(X) aglc2.fit(X) assert_array_equal(aglc1.labels_, aglc2.labels_) def test_connectivity_ignores_diagonal(): rng = np.random.RandomState(0) X = rng.rand(20, 5) connectivity = kneighbors_graph(X, 3, include_self=False) connectivity_include_self = kneighbors_graph(X, 3, include_self=True) aglc1 = AgglomerativeClustering(connectivity=connectivity) aglc2 = AgglomerativeClustering(connectivity=connectivity_include_self) aglc1.fit(X) aglc2.fit(X) assert_array_equal(aglc1.labels_, aglc2.labels_) def test_compute_full_tree(): # Test that the full tree is computed if n_clusters is small rng = np.random.RandomState(0) X = rng.randn(10, 2) connectivity = kneighbors_graph(X, 5, include_self=False) # When n_clusters is less, the full tree should be built # that is the number of merges should be n_samples - 1 agc = AgglomerativeClustering(n_clusters=2, connectivity=connectivity) agc.fit(X) n_samples = X.shape[0] n_nodes = agc.children_.shape[0] assert n_nodes == n_samples - 1 # When n_clusters is large, greater than max of 100 and 0.02 * n_samples. # we should stop when there are n_clusters. n_clusters = 101 X = rng.randn(200, 2) connectivity = kneighbors_graph(X, 10, include_self=False) agc = AgglomerativeClustering(n_clusters=n_clusters, connectivity=connectivity) agc.fit(X) n_samples = X.shape[0] n_nodes = agc.children_.shape[0] assert n_nodes == n_samples - n_clusters def test_n_components(): # Test n_components returned by linkage, average and ward tree rng = np.random.RandomState(0) X = rng.rand(5, 5) # Connectivity matrix having five components. connectivity = np.eye(5) for linkage_func in _TREE_BUILDERS.values(): assert ignore_warnings(linkage_func)(X, connectivity)[1] == 5 def test_agg_n_clusters(): # Test that an error is raised when n_clusters <= 0 rng = np.random.RandomState(0) X = rng.rand(20, 10) for n_clus in [-1, 0]: agc = AgglomerativeClustering(n_clusters=n_clus) msg = ("n_clusters should be an integer greater than 0." " %s was provided." % str(agc.n_clusters)) assert_raise_message(ValueError, msg, agc.fit, X) def test_affinity_passed_to_fix_connectivity(): # Test that the affinity parameter is actually passed to the pairwise # function size = 2 rng = np.random.RandomState(0) X = rng.randn(size, size) mask = np.array([True, False, False, True]) connectivity = grid_to_graph(n_x=size, n_y=size, mask=mask, return_as=np.ndarray) class FakeAffinity: def __init__(self): self.counter = 0 def increment(self, *args, **kwargs): self.counter += 1 return self.counter fa = FakeAffinity() linkage_tree(X, connectivity=connectivity, affinity=fa.increment) assert fa.counter == 3 @pytest.mark.parametrize('linkage', ['ward', 'complete', 'average']) def test_agglomerative_clustering_with_distance_threshold(linkage): # Check that we obtain the correct number of clusters with # agglomerative clustering with distance_threshold. rng = np.random.RandomState(0) mask = np.ones([10, 10], dtype=bool) n_samples = 100 X = rng.randn(n_samples, 50) connectivity = grid_to_graph(*mask.shape) # test when distance threshold is set to 10 distance_threshold = 10 for conn in [None, connectivity]: clustering = AgglomerativeClustering( n_clusters=None, distance_threshold=distance_threshold, connectivity=conn, linkage=linkage) clustering.fit(X) clusters_produced = clustering.labels_ num_clusters_produced = len(np.unique(clustering.labels_)) # test if the clusters produced match the point in the linkage tree # where the distance exceeds the threshold tree_builder = _TREE_BUILDERS[linkage] children, n_components, n_leaves, parent, distances = \ tree_builder(X, connectivity=conn, n_clusters=None, return_distance=True) num_clusters_at_threshold = np.count_nonzero( distances >= distance_threshold) + 1 # test number of clusters produced assert num_clusters_at_threshold == num_clusters_produced # test clusters produced clusters_at_threshold = _hc_cut(n_clusters=num_clusters_produced, children=children, n_leaves=n_leaves) assert np.array_equiv(clusters_produced, clusters_at_threshold) def test_small_distance_threshold(): rng = np.random.RandomState(0) n_samples = 10 X = rng.randint(-300, 300, size=(n_samples, 3)) # this should result in all data in their own clusters, given that # their pairwise distances are bigger than .1 (which may not be the case # with a different random seed). clustering = AgglomerativeClustering( n_clusters=None, distance_threshold=1., linkage="single").fit(X) # check that the pairwise distances are indeed all larger than .1 all_distances = pairwise_distances(X, metric='minkowski', p=2) np.fill_diagonal(all_distances, np.inf) assert np.all(all_distances > .1) assert clustering.n_clusters_ == n_samples def test_cluster_distances_with_distance_threshold(): rng = np.random.RandomState(0) n_samples = 100 X = rng.randint(-10, 10, size=(n_samples, 3)) # check the distances within the clusters and with other clusters distance_threshold = 4 clustering = AgglomerativeClustering( n_clusters=None, distance_threshold=distance_threshold, linkage="single").fit(X) labels = clustering.labels_ D = pairwise_distances(X, metric="minkowski", p=2) # to avoid taking the 0 diagonal in min() np.fill_diagonal(D, np.inf) for label in np.unique(labels): in_cluster_mask = labels == label max_in_cluster_distance = (D[in_cluster_mask][:, in_cluster_mask] .min(axis=0).max()) min_out_cluster_distance = (D[in_cluster_mask][:, ~in_cluster_mask] .min(axis=0).min()) # single data point clusters only have that inf diagonal here if in_cluster_mask.sum() > 1: assert max_in_cluster_distance < distance_threshold assert min_out_cluster_distance >= distance_threshold @pytest.mark.parametrize('linkage', ['ward', 'complete', 'average']) @pytest.mark.parametrize(('threshold', 'y_true'), [(0.5, [1, 0]), (1.0, [1, 0]), (1.5, [0, 0])]) def test_agglomerative_clustering_with_distance_threshold_edge_case( linkage, threshold, y_true): # test boundary case of distance_threshold matching the distance X = [[0], [1]] clusterer = AgglomerativeClustering( n_clusters=None, distance_threshold=threshold, linkage=linkage) y_pred = clusterer.fit_predict(X) assert adjusted_rand_score(y_true, y_pred) == 1 def test_dist_threshold_invalid_parameters(): X = [[0], [1]] with pytest.raises(ValueError, match="Exactly one of "): AgglomerativeClustering(n_clusters=None, distance_threshold=None).fit(X) with pytest.raises(ValueError, match="Exactly one of "): AgglomerativeClustering(n_clusters=2, distance_threshold=1).fit(X) X = [[0], [1]] with pytest.raises(ValueError, match="compute_full_tree must be True if"): AgglomerativeClustering(n_clusters=None, distance_threshold=1, compute_full_tree=False).fit(X) def test_invalid_shape_precomputed_dist_matrix(): # Check that an error is raised when affinity='precomputed' # and a non square matrix is passed (PR #16257). rng = np.random.RandomState(0) X = rng.rand(5, 3) with pytest.raises(ValueError, match="Distance matrix should be square, "): AgglomerativeClustering(affinity='precomputed', linkage='complete').fit(X)