"""Testing for K-means""" import re import sys from io import StringIO import numpy as np import pytest from scipy import sparse as sp from sklearn import _threadpool_controller from sklearn.base import clone from sklearn.cluster import KMeans, MiniBatchKMeans, k_means, kmeans_plusplus from sklearn.cluster._k_means_common import ( _euclidean_dense_dense_wrapper, _euclidean_sparse_dense_wrapper, _inertia_dense, _inertia_sparse, _is_same_clustering, _relocate_empty_clusters_dense, _relocate_empty_clusters_sparse, ) from sklearn.cluster._kmeans import _labels_inertia, _mini_batch_step from sklearn.datasets import make_blobs from sklearn.exceptions import ConvergenceWarning from sklearn.metrics import pairwise_distances, pairwise_distances_argmin from sklearn.metrics.cluster import v_measure_score from sklearn.metrics.pairwise import euclidean_distances from sklearn.utils._testing import ( assert_allclose, assert_array_equal, create_memmap_backed_data, ) from sklearn.utils.extmath import row_norms from sklearn.utils.fixes import CSR_CONTAINERS # non centered, sparse centers to check the centers = np.array( [ [0.0, 5.0, 0.0, 0.0, 0.0], [1.0, 1.0, 4.0, 0.0, 0.0], [1.0, 0.0, 0.0, 5.0, 1.0], ] ) n_samples = 100 n_clusters, n_features = centers.shape X, true_labels = make_blobs( n_samples=n_samples, centers=centers, cluster_std=1.0, random_state=42 ) X_as_any_csr = [container(X) for container in CSR_CONTAINERS] data_containers = [np.array] + CSR_CONTAINERS data_containers_ids = ( ["dense", "sparse_matrix", "sparse_array"] if len(X_as_any_csr) == 2 else ["dense", "sparse_matrix"] ) @pytest.mark.parametrize("array_constr", data_containers, ids=data_containers_ids) @pytest.mark.parametrize("algo", ["lloyd", "elkan"]) @pytest.mark.parametrize("dtype", [np.float32, np.float64]) def test_kmeans_results(array_constr, algo, dtype): # Checks that KMeans works as intended on toy dataset by comparing with # expected results computed by hand. X = array_constr([[0, 0], [0.5, 0], [0.5, 1], [1, 1]], dtype=dtype) sample_weight = [3, 1, 1, 3] init_centers = np.array([[0, 0], [1, 1]], dtype=dtype) expected_labels = [0, 0, 1, 1] expected_inertia = 0.375 expected_centers = np.array([[0.125, 0], [0.875, 1]], dtype=dtype) expected_n_iter = 2 kmeans = KMeans(n_clusters=2, n_init=1, init=init_centers, algorithm=algo) kmeans.fit(X, sample_weight=sample_weight) assert_array_equal(kmeans.labels_, expected_labels) assert_allclose(kmeans.inertia_, expected_inertia) assert_allclose(kmeans.cluster_centers_, expected_centers) assert kmeans.n_iter_ == expected_n_iter @pytest.mark.parametrize("array_constr", data_containers, ids=data_containers_ids) @pytest.mark.parametrize("algo", ["lloyd", "elkan"]) def test_kmeans_relocated_clusters(array_constr, algo): # check that empty clusters are relocated as expected X = array_constr([[0, 0], [0.5, 0], [0.5, 1], [1, 1]]) # second center too far from others points will be empty at first iter init_centers = np.array([[0.5, 0.5], [3, 3]]) kmeans = KMeans(n_clusters=2, n_init=1, init=init_centers, algorithm=algo) kmeans.fit(X) expected_n_iter = 3 expected_inertia = 0.25 assert_allclose(kmeans.inertia_, expected_inertia) assert kmeans.n_iter_ == expected_n_iter # There are two acceptable ways of relocating clusters in this example, the output # depends on how the argpartition strategy breaks ties. We accept both outputs. try: expected_labels = [0, 0, 1, 1] expected_centers = [[0.25, 0], [0.75, 1]] assert_array_equal(kmeans.labels_, expected_labels) assert_allclose(kmeans.cluster_centers_, expected_centers) except AssertionError: expected_labels = [1, 1, 0, 0] expected_centers = [[0.75, 1.0], [0.25, 0.0]] assert_array_equal(kmeans.labels_, expected_labels) assert_allclose(kmeans.cluster_centers_, expected_centers) @pytest.mark.parametrize("array_constr", data_containers, ids=data_containers_ids) def test_relocate_empty_clusters(array_constr): # test for the _relocate_empty_clusters_(dense/sparse) helpers # Synthetic dataset with 3 obvious clusters of different sizes X = np.array([-10.0, -9.5, -9, -8.5, -8, -1, 1, 9, 9.5, 10]).reshape(-1, 1) X = array_constr(X) sample_weight = np.ones(10) # centers all initialized to the first point of X centers_old = np.array([-10.0, -10, -10]).reshape(-1, 1) # With this initialization, all points will be assigned to the first center # At this point a center in centers_new is the weighted sum of the points # it contains if it's not empty, otherwise it is the same as before. centers_new = np.array([-16.5, -10, -10]).reshape(-1, 1) weight_in_clusters = np.array([10.0, 0, 0]) labels = np.zeros(10, dtype=np.int32) if array_constr is np.array: _relocate_empty_clusters_dense( X, sample_weight, centers_old, centers_new, weight_in_clusters, labels ) else: _relocate_empty_clusters_sparse( X.data, X.indices, X.indptr, sample_weight, centers_old, centers_new, weight_in_clusters, labels, ) # The relocation scheme will take the 2 points farthest from the center and # assign them to the 2 empty clusters, i.e. points at 10 and at 9.9. The # first center will be updated to contain the other 8 points. assert_array_equal(weight_in_clusters, [8, 1, 1]) assert_allclose(centers_new, [[-36], [10], [9.5]]) @pytest.mark.parametrize("distribution", ["normal", "blobs"]) @pytest.mark.parametrize("array_constr", data_containers, ids=data_containers_ids) @pytest.mark.parametrize("tol", [1e-2, 1e-8, 1e-100, 0]) def test_kmeans_elkan_results(distribution, array_constr, tol, global_random_seed): # Check that results are identical between lloyd and elkan algorithms rnd = np.random.RandomState(global_random_seed) if distribution == "normal": X = rnd.normal(size=(5000, 10)) else: X, _ = make_blobs(random_state=rnd) X[X < 0] = 0 X = array_constr(X) km_lloyd = KMeans(n_clusters=5, random_state=global_random_seed, n_init=1, tol=tol) km_elkan = KMeans( algorithm="elkan", n_clusters=5, random_state=global_random_seed, n_init=1, tol=tol, ) km_lloyd.fit(X) km_elkan.fit(X) assert_allclose(km_elkan.cluster_centers_, km_lloyd.cluster_centers_) assert_array_equal(km_elkan.labels_, km_lloyd.labels_) assert km_elkan.n_iter_ == km_lloyd.n_iter_ assert km_elkan.inertia_ == pytest.approx(km_lloyd.inertia_, rel=1e-6) @pytest.mark.parametrize("algorithm", ["lloyd", "elkan"]) def test_kmeans_convergence(algorithm, global_random_seed): # Check that KMeans stops when convergence is reached when tol=0. (#16075) rnd = np.random.RandomState(global_random_seed) X = rnd.normal(size=(5000, 10)) max_iter = 300 km = KMeans( algorithm=algorithm, n_clusters=5, random_state=global_random_seed, n_init=1, tol=0, max_iter=max_iter, ).fit(X) assert km.n_iter_ < max_iter @pytest.mark.parametrize("X_csr", X_as_any_csr) def test_minibatch_update_consistency(X_csr, global_random_seed): # Check that dense and sparse minibatch update give the same results rng = np.random.RandomState(global_random_seed) centers_old = centers + rng.normal(size=centers.shape) centers_old_csr = centers_old.copy() centers_new = np.zeros_like(centers_old) centers_new_csr = np.zeros_like(centers_old_csr) weight_sums = np.zeros(centers_old.shape[0], dtype=X.dtype) weight_sums_csr = np.zeros(centers_old.shape[0], dtype=X.dtype) sample_weight = np.ones(X.shape[0], dtype=X.dtype) # extract a small minibatch X_mb = X[:10] X_mb_csr = X_csr[:10] sample_weight_mb = sample_weight[:10] # step 1: compute the dense minibatch update old_inertia = _mini_batch_step( X_mb, sample_weight_mb, centers_old, centers_new, weight_sums, np.random.RandomState(global_random_seed), random_reassign=False, ) assert old_inertia > 0.0 # compute the new inertia on the same batch to check that it decreased labels, new_inertia = _labels_inertia(X_mb, sample_weight_mb, centers_new) assert new_inertia > 0.0 assert new_inertia < old_inertia # step 2: compute the sparse minibatch update old_inertia_csr = _mini_batch_step( X_mb_csr, sample_weight_mb, centers_old_csr, centers_new_csr, weight_sums_csr, np.random.RandomState(global_random_seed), random_reassign=False, ) assert old_inertia_csr > 0.0 # compute the new inertia on the same batch to check that it decreased labels_csr, new_inertia_csr = _labels_inertia( X_mb_csr, sample_weight_mb, centers_new_csr ) assert new_inertia_csr > 0.0 assert new_inertia_csr < old_inertia_csr # step 3: check that sparse and dense updates lead to the same results assert_array_equal(labels, labels_csr) assert_allclose(centers_new, centers_new_csr) assert_allclose(old_inertia, old_inertia_csr) assert_allclose(new_inertia, new_inertia_csr) def _check_fitted_model(km): # check that the number of clusters centers and distinct labels match # the expectation centers = km.cluster_centers_ assert centers.shape == (n_clusters, n_features) labels = km.labels_ assert np.unique(labels).shape[0] == n_clusters # check that the labels assignment are perfect (up to a permutation) assert_allclose(v_measure_score(true_labels, labels), 1.0) assert km.inertia_ > 0.0 @pytest.mark.parametrize( "input_data", [X] + X_as_any_csr, ids=data_containers_ids, ) @pytest.mark.parametrize( "init", ["random", "k-means++", centers, lambda X, k, random_state: centers], ids=["random", "k-means++", "ndarray", "callable"], ) @pytest.mark.parametrize("Estimator", [KMeans, MiniBatchKMeans]) def test_all_init(Estimator, input_data, init): # Check KMeans and MiniBatchKMeans with all possible init. n_init = 10 if isinstance(init, str) else 1 km = Estimator( init=init, n_clusters=n_clusters, random_state=42, n_init=n_init ).fit(input_data) _check_fitted_model(km) @pytest.mark.parametrize( "init", ["random", "k-means++", centers, lambda X, k, random_state: centers], ids=["random", "k-means++", "ndarray", "callable"], ) def test_minibatch_kmeans_partial_fit_init(init): # Check MiniBatchKMeans init with partial_fit n_init = 10 if isinstance(init, str) else 1 km = MiniBatchKMeans( init=init, n_clusters=n_clusters, random_state=0, n_init=n_init ) for i in range(100): # "random" init requires many batches to recover the true labels. km.partial_fit(X) _check_fitted_model(km) @pytest.mark.parametrize( "init, expected_n_init", [ ("k-means++", 1), ("random", "default"), ( lambda X, n_clusters, random_state: random_state.uniform( size=(n_clusters, X.shape[1]) ), "default", ), ("array-like", 1), ], ) @pytest.mark.parametrize("Estimator", [KMeans, MiniBatchKMeans]) def test_kmeans_init_auto_with_initial_centroids(Estimator, init, expected_n_init): """Check that `n_init="auto"` chooses the right number of initializations. Non-regression test for #26657: https://github.com/scikit-learn/scikit-learn/pull/26657 """ n_sample, n_features, n_clusters = 100, 10, 5 X = np.random.randn(n_sample, n_features) if init == "array-like": init = np.random.randn(n_clusters, n_features) if expected_n_init == "default": expected_n_init = 3 if Estimator is MiniBatchKMeans else 10 kmeans = Estimator(n_clusters=n_clusters, init=init, n_init="auto").fit(X) assert kmeans._n_init == expected_n_init @pytest.mark.parametrize("Estimator", [KMeans, MiniBatchKMeans]) def test_fortran_aligned_data(Estimator, global_random_seed): # Check that KMeans works with fortran-aligned data. X_fortran = np.asfortranarray(X) centers_fortran = np.asfortranarray(centers) km_c = Estimator( n_clusters=n_clusters, init=centers, n_init=1, random_state=global_random_seed ).fit(X) km_f = Estimator( n_clusters=n_clusters, init=centers_fortran, n_init=1, random_state=global_random_seed, ).fit(X_fortran) assert_allclose(km_c.cluster_centers_, km_f.cluster_centers_) assert_array_equal(km_c.labels_, km_f.labels_) def test_minibatch_kmeans_verbose(): # Check verbose mode of MiniBatchKMeans for better coverage. km = MiniBatchKMeans(n_clusters=n_clusters, random_state=42, verbose=1) old_stdout = sys.stdout sys.stdout = StringIO() try: km.fit(X) finally: sys.stdout = old_stdout @pytest.mark.parametrize("algorithm", ["lloyd", "elkan"]) @pytest.mark.parametrize("tol", [1e-2, 0]) def test_kmeans_verbose(algorithm, tol, capsys): # Check verbose mode of KMeans for better coverage. X = np.random.RandomState(0).normal(size=(5000, 10)) KMeans( algorithm=algorithm, n_clusters=n_clusters, random_state=42, init="random", n_init=1, tol=tol, verbose=1, ).fit(X) captured = capsys.readouterr() assert re.search(r"Initialization complete", captured.out) assert re.search(r"Iteration [0-9]+, inertia", captured.out) if tol == 0: assert re.search(r"strict convergence", captured.out) else: assert re.search(r"center shift .* within tolerance", captured.out) def test_minibatch_kmeans_warning_init_size(): # Check that a warning is raised when init_size is smaller than n_clusters with pytest.warns( RuntimeWarning, match=r"init_size.* should be larger than n_clusters" ): MiniBatchKMeans(init_size=10, n_clusters=20).fit(X) @pytest.mark.parametrize("Estimator", [KMeans, MiniBatchKMeans]) def test_warning_n_init_precomputed_centers(Estimator): # Check that a warning is raised when n_init > 1 and an array is passed for # the init parameter. with pytest.warns( RuntimeWarning, match="Explicit initial center position passed: performing only one init", ): Estimator(init=centers, n_clusters=n_clusters, n_init=10).fit(X) def test_minibatch_sensible_reassign(global_random_seed): # check that identical initial clusters are reassigned # also a regression test for when there are more desired reassignments than # samples. zeroed_X, true_labels = make_blobs( n_samples=100, centers=5, random_state=global_random_seed ) zeroed_X[::2, :] = 0 km = MiniBatchKMeans( n_clusters=20, batch_size=10, random_state=global_random_seed, init="random" ).fit(zeroed_X) # there should not be too many exact zero cluster centers assert km.cluster_centers_.any(axis=1).sum() > 10 # do the same with batch-size > X.shape[0] (regression test) km = MiniBatchKMeans( n_clusters=20, batch_size=200, random_state=global_random_seed, init="random" ).fit(zeroed_X) # there should not be too many exact zero cluster centers assert km.cluster_centers_.any(axis=1).sum() > 10 # do the same with partial_fit API km = MiniBatchKMeans(n_clusters=20, random_state=global_random_seed, init="random") for i in range(100): km.partial_fit(zeroed_X) # there should not be too many exact zero cluster centers assert km.cluster_centers_.any(axis=1).sum() > 10 @pytest.mark.parametrize( "input_data", [X] + X_as_any_csr, ids=data_containers_ids, ) def test_minibatch_reassign(input_data, global_random_seed): # Check the reassignment part of the minibatch step with very high or very # low reassignment ratio. perfect_centers = np.empty((n_clusters, n_features)) for i in range(n_clusters): perfect_centers[i] = X[true_labels == i].mean(axis=0) sample_weight = np.ones(n_samples) centers_new = np.empty_like(perfect_centers) # Give a perfect initialization, but a large reassignment_ratio, as a # result many centers should be reassigned and the model should no longer # be good score_before = -_labels_inertia(input_data, sample_weight, perfect_centers, 1)[1] _mini_batch_step( input_data, sample_weight, perfect_centers, centers_new, np.zeros(n_clusters), np.random.RandomState(global_random_seed), random_reassign=True, reassignment_ratio=1, ) score_after = -_labels_inertia(input_data, sample_weight, centers_new, 1)[1] assert score_before > score_after # Give a perfect initialization, with a small reassignment_ratio, # no center should be reassigned. _mini_batch_step( input_data, sample_weight, perfect_centers, centers_new, np.zeros(n_clusters), np.random.RandomState(global_random_seed), random_reassign=True, reassignment_ratio=1e-15, ) assert_allclose(centers_new, perfect_centers) def test_minibatch_with_many_reassignments(): # Test for the case that the number of clusters to reassign is bigger # than the batch_size. Run the test with 100 clusters and a batch_size of # 10 because it turned out that these values ensure that the number of # clusters to reassign is always bigger than the batch_size. MiniBatchKMeans( n_clusters=100, batch_size=10, init_size=n_samples, random_state=42, verbose=True, ).fit(X) def test_minibatch_kmeans_init_size(): # Check the internal _init_size attribute of MiniBatchKMeans # default init size should be 3 * batch_size km = MiniBatchKMeans(n_clusters=10, batch_size=5, n_init=1).fit(X) assert km._init_size == 15 # if 3 * batch size < n_clusters, it should then be 3 * n_clusters km = MiniBatchKMeans(n_clusters=10, batch_size=1, n_init=1).fit(X) assert km._init_size == 30 # it should not be larger than n_samples km = MiniBatchKMeans( n_clusters=10, batch_size=5, n_init=1, init_size=n_samples + 1 ).fit(X) assert km._init_size == n_samples @pytest.mark.parametrize("tol, max_no_improvement", [(1e-4, None), (0, 10)]) def test_minibatch_declared_convergence(capsys, tol, max_no_improvement): # Check convergence detection based on ewa batch inertia or on # small center change. X, _, centers = make_blobs(centers=3, random_state=0, return_centers=True) km = MiniBatchKMeans( n_clusters=3, init=centers, batch_size=20, tol=tol, random_state=0, max_iter=10, n_init=1, verbose=1, max_no_improvement=max_no_improvement, ) km.fit(X) assert 1 < km.n_iter_ < 10 captured = capsys.readouterr() if max_no_improvement is None: assert "Converged (small centers change)" in captured.out if tol == 0: assert "Converged (lack of improvement in inertia)" in captured.out def test_minibatch_iter_steps(): # Check consistency of n_iter_ and n_steps_ attributes. batch_size = 30 n_samples = X.shape[0] km = MiniBatchKMeans(n_clusters=3, batch_size=batch_size, random_state=0).fit(X) # n_iter_ is the number of started epochs assert km.n_iter_ == np.ceil((km.n_steps_ * batch_size) / n_samples) assert isinstance(km.n_iter_, int) # without stopping condition, max_iter should be reached km = MiniBatchKMeans( n_clusters=3, batch_size=batch_size, random_state=0, tol=0, max_no_improvement=None, max_iter=10, ).fit(X) assert km.n_iter_ == 10 assert km.n_steps_ == (10 * n_samples) // batch_size assert isinstance(km.n_steps_, int) def test_kmeans_copyx(): # Check that copy_x=False returns nearly equal X after de-centering. my_X = X.copy() km = KMeans(copy_x=False, n_clusters=n_clusters, random_state=42) km.fit(my_X) _check_fitted_model(km) # check that my_X is de-centered assert_allclose(my_X, X) @pytest.mark.parametrize("Estimator", [KMeans, MiniBatchKMeans]) def test_score_max_iter(Estimator, global_random_seed): # Check that fitting KMeans or MiniBatchKMeans with more iterations gives # better score X = np.random.RandomState(global_random_seed).randn(100, 10) km1 = Estimator(n_init=1, random_state=global_random_seed, max_iter=1) s1 = km1.fit(X).score(X) km2 = Estimator(n_init=1, random_state=global_random_seed, max_iter=10) s2 = km2.fit(X).score(X) assert s2 > s1 @pytest.mark.parametrize("array_constr", data_containers, ids=data_containers_ids) @pytest.mark.parametrize( "Estimator, algorithm", [(KMeans, "lloyd"), (KMeans, "elkan"), (MiniBatchKMeans, None)], ) @pytest.mark.parametrize("max_iter", [2, 100]) def test_kmeans_predict( Estimator, algorithm, array_constr, max_iter, global_dtype, global_random_seed ): # Check the predict method and the equivalence between fit.predict and # fit_predict. X, _ = make_blobs( n_samples=200, n_features=10, centers=10, random_state=global_random_seed ) X = array_constr(X, dtype=global_dtype) km = Estimator( n_clusters=10, init="random", n_init=10, max_iter=max_iter, random_state=global_random_seed, ) if algorithm is not None: km.set_params(algorithm=algorithm) km.fit(X) labels = km.labels_ # re-predict labels for training set using predict pred = km.predict(X) assert_array_equal(pred, labels) # re-predict labels for training set using fit_predict pred = km.fit_predict(X) assert_array_equal(pred, labels) # predict centroid labels pred = km.predict(km.cluster_centers_) assert_array_equal(pred, np.arange(10)) @pytest.mark.parametrize("X_csr", X_as_any_csr) @pytest.mark.parametrize("Estimator", [KMeans, MiniBatchKMeans]) def test_dense_sparse(Estimator, X_csr, global_random_seed): # Check that the results are the same for dense and sparse input. sample_weight = np.random.RandomState(global_random_seed).random_sample( (n_samples,) ) km_dense = Estimator( n_clusters=n_clusters, random_state=global_random_seed, n_init=1 ) km_dense.fit(X, sample_weight=sample_weight) km_sparse = Estimator( n_clusters=n_clusters, random_state=global_random_seed, n_init=1 ) km_sparse.fit(X_csr, sample_weight=sample_weight) assert_array_equal(km_dense.labels_, km_sparse.labels_) assert_allclose(km_dense.cluster_centers_, km_sparse.cluster_centers_) @pytest.mark.parametrize("X_csr", X_as_any_csr) @pytest.mark.parametrize( "init", ["random", "k-means++", centers], ids=["random", "k-means++", "ndarray"] ) @pytest.mark.parametrize("Estimator", [KMeans, MiniBatchKMeans]) def test_predict_dense_sparse(Estimator, init, X_csr): # check that models trained on sparse input also works for dense input at # predict time and vice versa. n_init = 10 if isinstance(init, str) else 1 km = Estimator(n_clusters=n_clusters, init=init, n_init=n_init, random_state=0) km.fit(X_csr) assert_array_equal(km.predict(X), km.labels_) km.fit(X) assert_array_equal(km.predict(X_csr), km.labels_) @pytest.mark.parametrize("array_constr", data_containers, ids=data_containers_ids) @pytest.mark.parametrize("dtype", [np.int32, np.int64]) @pytest.mark.parametrize("init", ["k-means++", "ndarray"]) @pytest.mark.parametrize("Estimator", [KMeans, MiniBatchKMeans]) def test_integer_input(Estimator, array_constr, dtype, init, global_random_seed): # Check that KMeans and MiniBatchKMeans work with integer input. X_dense = np.array([[0, 0], [10, 10], [12, 9], [-1, 1], [2, 0], [8, 10]]) X = array_constr(X_dense, dtype=dtype) n_init = 1 if init == "ndarray" else 10 init = X_dense[:2] if init == "ndarray" else init km = Estimator( n_clusters=2, init=init, n_init=n_init, random_state=global_random_seed ) if Estimator is MiniBatchKMeans: km.set_params(batch_size=2) km.fit(X) # Internally integer input should be converted to float64 assert km.cluster_centers_.dtype == np.float64 expected_labels = [0, 1, 1, 0, 0, 1] assert_allclose(v_measure_score(km.labels_, expected_labels), 1.0) # Same with partial_fit (#14314) if Estimator is MiniBatchKMeans: km = clone(km).partial_fit(X) assert km.cluster_centers_.dtype == np.float64 @pytest.mark.parametrize("Estimator", [KMeans, MiniBatchKMeans]) def test_transform(Estimator, global_random_seed): # Check the transform method km = Estimator(n_clusters=n_clusters, random_state=global_random_seed).fit(X) # Transorfming cluster_centers_ should return the pairwise distances # between centers Xt = km.transform(km.cluster_centers_) assert_allclose(Xt, pairwise_distances(km.cluster_centers_)) # In particular, diagonal must be 0 assert_array_equal(Xt.diagonal(), np.zeros(n_clusters)) # Transorfming X should return the pairwise distances between X and the # centers Xt = km.transform(X) assert_allclose(Xt, pairwise_distances(X, km.cluster_centers_)) @pytest.mark.parametrize("Estimator", [KMeans, MiniBatchKMeans]) def test_fit_transform(Estimator, global_random_seed): # Check equivalence between fit.transform and fit_transform X1 = Estimator(random_state=global_random_seed, n_init=1).fit(X).transform(X) X2 = Estimator(random_state=global_random_seed, n_init=1).fit_transform(X) assert_allclose(X1, X2) def test_n_init(global_random_seed): # Check that increasing the number of init increases the quality previous_inertia = np.inf for n_init in [1, 5, 10]: # set max_iter=1 to avoid finding the global minimum and get the same # inertia each time km = KMeans( n_clusters=n_clusters, init="random", n_init=n_init, random_state=global_random_seed, max_iter=1, ).fit(X) assert km.inertia_ <= previous_inertia def test_k_means_function(global_random_seed): # test calling the k_means function directly cluster_centers, labels, inertia = k_means( X, n_clusters=n_clusters, sample_weight=None, random_state=global_random_seed ) assert cluster_centers.shape == (n_clusters, n_features) assert np.unique(labels).shape[0] == n_clusters # check that the labels assignment are perfect (up to a permutation) assert_allclose(v_measure_score(true_labels, labels), 1.0) assert inertia > 0.0 @pytest.mark.parametrize( "input_data", [X] + X_as_any_csr, ids=data_containers_ids, ) @pytest.mark.parametrize("Estimator", [KMeans, MiniBatchKMeans]) def test_float_precision(Estimator, input_data, global_random_seed): # Check that the results are the same for single and double precision. km = Estimator(n_init=1, random_state=global_random_seed) inertia = {} Xt = {} centers = {} labels = {} for dtype in [np.float64, np.float32]: X = input_data.astype(dtype, copy=False) km.fit(X) inertia[dtype] = km.inertia_ Xt[dtype] = km.transform(X) centers[dtype] = km.cluster_centers_ labels[dtype] = km.labels_ # dtype of cluster centers has to be the dtype of the input data assert km.cluster_centers_.dtype == dtype # same with partial_fit if Estimator is MiniBatchKMeans: km.partial_fit(X[0:3]) assert km.cluster_centers_.dtype == dtype # compare arrays with low precision since the difference between 32 and # 64 bit comes from an accumulation of rounding errors. assert_allclose(inertia[np.float32], inertia[np.float64], rtol=1e-4) assert_allclose(Xt[np.float32], Xt[np.float64], atol=Xt[np.float64].max() * 1e-4) assert_allclose( centers[np.float32], centers[np.float64], atol=centers[np.float64].max() * 1e-4 ) assert_array_equal(labels[np.float32], labels[np.float64]) @pytest.mark.parametrize("dtype", [np.int32, np.int64, np.float32, np.float64]) @pytest.mark.parametrize("Estimator", [KMeans, MiniBatchKMeans]) def test_centers_not_mutated(Estimator, dtype): # Check that KMeans and MiniBatchKMeans won't mutate the user provided # init centers silently even if input data and init centers have the same # type. X_new_type = X.astype(dtype, copy=False) centers_new_type = centers.astype(dtype, copy=False) km = Estimator(init=centers_new_type, n_clusters=n_clusters, n_init=1) km.fit(X_new_type) assert not np.may_share_memory(km.cluster_centers_, centers_new_type) @pytest.mark.parametrize( "input_data", [X] + X_as_any_csr, ids=data_containers_ids, ) def test_kmeans_init_fitted_centers(input_data): # Check that starting fitting from a local optimum shouldn't change the # solution km1 = KMeans(n_clusters=n_clusters).fit(input_data) km2 = KMeans(n_clusters=n_clusters, init=km1.cluster_centers_, n_init=1).fit( input_data ) assert_allclose(km1.cluster_centers_, km2.cluster_centers_) def test_kmeans_warns_less_centers_than_unique_points(global_random_seed): # Check KMeans when the number of found clusters is smaller than expected X = np.asarray([[0, 0], [0, 1], [1, 0], [1, 0]]) # last point is duplicated km = KMeans(n_clusters=4, random_state=global_random_seed) # KMeans should warn that fewer labels than cluster centers have been used msg = ( r"Number of distinct clusters \(3\) found smaller than " r"n_clusters \(4\). Possibly due to duplicate points in X." ) with pytest.warns(ConvergenceWarning, match=msg): km.fit(X) # only three distinct points, so only three clusters # can have points assigned to them assert set(km.labels_) == set(range(3)) def _sort_centers(centers): return np.sort(centers, axis=0) def test_weighted_vs_repeated(global_random_seed): # Check that a sample weight of N should yield the same result as an N-fold # repetition of the sample. Valid only if init is precomputed, otherwise # rng produces different results. Not valid for MinibatchKMeans due to rng # to extract minibatches. sample_weight = np.random.RandomState(global_random_seed).randint( 1, 5, size=n_samples ) X_repeat = np.repeat(X, sample_weight, axis=0) km = KMeans( init=centers, n_init=1, n_clusters=n_clusters, random_state=global_random_seed ) km_weighted = clone(km).fit(X, sample_weight=sample_weight) repeated_labels = np.repeat(km_weighted.labels_, sample_weight) km_repeated = clone(km).fit(X_repeat) assert_array_equal(km_repeated.labels_, repeated_labels) assert_allclose(km_weighted.inertia_, km_repeated.inertia_) assert_allclose( _sort_centers(km_weighted.cluster_centers_), _sort_centers(km_repeated.cluster_centers_), ) @pytest.mark.parametrize( "input_data", [X] + X_as_any_csr, ids=data_containers_ids, ) @pytest.mark.parametrize("Estimator", [KMeans, MiniBatchKMeans]) def test_unit_weights_vs_no_weights(Estimator, input_data, global_random_seed): # Check that not passing sample weights should be equivalent to passing # sample weights all equal to one. sample_weight = np.ones(n_samples) km = Estimator(n_clusters=n_clusters, random_state=global_random_seed, n_init=1) km_none = clone(km).fit(input_data, sample_weight=None) km_ones = clone(km).fit(input_data, sample_weight=sample_weight) assert_array_equal(km_none.labels_, km_ones.labels_) assert_allclose(km_none.cluster_centers_, km_ones.cluster_centers_) @pytest.mark.parametrize( "input_data", [X] + X_as_any_csr, ids=data_containers_ids, ) @pytest.mark.parametrize("Estimator", [KMeans, MiniBatchKMeans]) def test_scaled_weights(Estimator, input_data, global_random_seed): # Check that scaling all sample weights by a common factor # shouldn't change the result sample_weight = np.random.RandomState(global_random_seed).uniform(size=n_samples) km = Estimator(n_clusters=n_clusters, random_state=global_random_seed, n_init=1) km_orig = clone(km).fit(input_data, sample_weight=sample_weight) km_scaled = clone(km).fit(input_data, sample_weight=0.5 * sample_weight) assert_array_equal(km_orig.labels_, km_scaled.labels_) assert_allclose(km_orig.cluster_centers_, km_scaled.cluster_centers_) def test_kmeans_elkan_iter_attribute(): # Regression test on bad n_iter_ value. Previous bug n_iter_ was one off # it's right value (#11340). km = KMeans(algorithm="elkan", max_iter=1).fit(X) assert km.n_iter_ == 1 @pytest.mark.parametrize("array_constr", data_containers, ids=data_containers_ids) def test_kmeans_empty_cluster_relocated(array_constr): # check that empty clusters are correctly relocated when using sample # weights (#13486) X = array_constr([[-1], [1]]) sample_weight = [1.9, 0.1] init = np.array([[-1], [10]]) km = KMeans(n_clusters=2, init=init, n_init=1) km.fit(X, sample_weight=sample_weight) assert len(set(km.labels_)) == 2 assert_allclose(km.cluster_centers_, [[-1], [1]]) @pytest.mark.parametrize("Estimator", [KMeans, MiniBatchKMeans]) def test_result_equal_in_diff_n_threads(Estimator, global_random_seed): # Check that KMeans/MiniBatchKMeans give the same results in parallel mode # than in sequential mode. rnd = np.random.RandomState(global_random_seed) X = rnd.normal(size=(50, 10)) with _threadpool_controller.limit(limits=1, user_api="openmp"): result_1 = ( Estimator(n_clusters=n_clusters, random_state=global_random_seed) .fit(X) .labels_ ) with _threadpool_controller.limit(limits=2, user_api="openmp"): result_2 = ( Estimator(n_clusters=n_clusters, random_state=global_random_seed) .fit(X) .labels_ ) assert_array_equal(result_1, result_2) def test_warning_elkan_1_cluster(): # Check warning messages specific to KMeans with pytest.warns( RuntimeWarning, match="algorithm='elkan' doesn't make sense for a single cluster", ): KMeans(n_clusters=1, algorithm="elkan").fit(X) @pytest.mark.parametrize("array_constr", data_containers, ids=data_containers_ids) @pytest.mark.parametrize("algo", ["lloyd", "elkan"]) def test_k_means_1_iteration(array_constr, algo, global_random_seed): # check the results after a single iteration (E-step M-step E-step) by # comparing against a pure python implementation. X = np.random.RandomState(global_random_seed).uniform(size=(100, 5)) init_centers = X[:5] X = array_constr(X) def py_kmeans(X, init): new_centers = init.copy() labels = pairwise_distances_argmin(X, init) for label in range(init.shape[0]): new_centers[label] = X[labels == label].mean(axis=0) labels = pairwise_distances_argmin(X, new_centers) return labels, new_centers py_labels, py_centers = py_kmeans(X, init_centers) cy_kmeans = KMeans( n_clusters=5, n_init=1, init=init_centers, algorithm=algo, max_iter=1 ).fit(X) cy_labels = cy_kmeans.labels_ cy_centers = cy_kmeans.cluster_centers_ assert_array_equal(py_labels, cy_labels) assert_allclose(py_centers, cy_centers) @pytest.mark.parametrize("dtype", [np.float32, np.float64]) @pytest.mark.parametrize("squared", [True, False]) def test_euclidean_distance(dtype, squared, global_random_seed): # Check that the _euclidean_(dense/sparse)_dense helpers produce correct # results rng = np.random.RandomState(global_random_seed) a_sparse = sp.random( 1, 100, density=0.5, format="csr", random_state=rng, dtype=dtype ) a_dense = a_sparse.toarray().reshape(-1) b = rng.randn(100).astype(dtype, copy=False) b_squared_norm = (b**2).sum() expected = ((a_dense - b) ** 2).sum() expected = expected if squared else np.sqrt(expected) distance_dense_dense = _euclidean_dense_dense_wrapper(a_dense, b, squared) distance_sparse_dense = _euclidean_sparse_dense_wrapper( a_sparse.data, a_sparse.indices, b, b_squared_norm, squared ) rtol = 1e-4 if dtype == np.float32 else 1e-7 assert_allclose(distance_dense_dense, distance_sparse_dense, rtol=rtol) assert_allclose(distance_dense_dense, expected, rtol=rtol) assert_allclose(distance_sparse_dense, expected, rtol=rtol) @pytest.mark.parametrize("dtype", [np.float32, np.float64]) def test_inertia(dtype, global_random_seed): # Check that the _inertia_(dense/sparse) helpers produce correct results. rng = np.random.RandomState(global_random_seed) X_sparse = sp.random( 100, 10, density=0.5, format="csr", random_state=rng, dtype=dtype ) X_dense = X_sparse.toarray() sample_weight = rng.randn(100).astype(dtype, copy=False) centers = rng.randn(5, 10).astype(dtype, copy=False) labels = rng.randint(5, size=100, dtype=np.int32) distances = ((X_dense - centers[labels]) ** 2).sum(axis=1) expected = np.sum(distances * sample_weight) inertia_dense = _inertia_dense(X_dense, sample_weight, centers, labels, n_threads=1) inertia_sparse = _inertia_sparse( X_sparse, sample_weight, centers, labels, n_threads=1 ) rtol = 1e-4 if dtype == np.float32 else 1e-6 assert_allclose(inertia_dense, inertia_sparse, rtol=rtol) assert_allclose(inertia_dense, expected, rtol=rtol) assert_allclose(inertia_sparse, expected, rtol=rtol) # Check the single_label parameter. label = 1 mask = labels == label distances = ((X_dense[mask] - centers[label]) ** 2).sum(axis=1) expected = np.sum(distances * sample_weight[mask]) inertia_dense = _inertia_dense( X_dense, sample_weight, centers, labels, n_threads=1, single_label=label ) inertia_sparse = _inertia_sparse( X_sparse, sample_weight, centers, labels, n_threads=1, single_label=label ) assert_allclose(inertia_dense, inertia_sparse, rtol=rtol) assert_allclose(inertia_dense, expected, rtol=rtol) assert_allclose(inertia_sparse, expected, rtol=rtol) @pytest.mark.parametrize("Klass, default_n_init", [(KMeans, 10), (MiniBatchKMeans, 3)]) def test_n_init_auto(Klass, default_n_init): est = Klass(n_init="auto", init="k-means++") est.fit(X) assert est._n_init == 1 est = Klass(n_init="auto", init="random") est.fit(X) assert est._n_init == 10 if Klass.__name__ == "KMeans" else 3 @pytest.mark.parametrize("Estimator", [KMeans, MiniBatchKMeans]) def test_sample_weight_unchanged(Estimator): # Check that sample_weight is not modified in place by KMeans (#17204) X = np.array([[1], [2], [4]]) sample_weight = np.array([0.5, 0.2, 0.3]) Estimator(n_clusters=2, random_state=0).fit(X, sample_weight=sample_weight) assert_array_equal(sample_weight, np.array([0.5, 0.2, 0.3])) @pytest.mark.parametrize("Estimator", [KMeans, MiniBatchKMeans]) @pytest.mark.parametrize( "param, match", [ ({"n_clusters": n_samples + 1}, r"n_samples.* should be >= n_clusters"), ( {"init": X[:2]}, r"The shape of the initial centers .* does not match " r"the number of clusters", ), ( {"init": lambda X_, k, random_state: X_[:2]}, r"The shape of the initial centers .* does not match " r"the number of clusters", ), ( {"init": X[:8, :2]}, r"The shape of the initial centers .* does not match " r"the number of features of the data", ), ( {"init": lambda X_, k, random_state: X_[:8, :2]}, r"The shape of the initial centers .* does not match " r"the number of features of the data", ), ], ) def test_wrong_params(Estimator, param, match): # Check that error are raised with clear error message when wrong values # are passed for the parameters # Set n_init=1 by default to avoid warning with precomputed init km = Estimator(n_init=1) with pytest.raises(ValueError, match=match): km.set_params(**param).fit(X) @pytest.mark.parametrize( "param, match", [ ( {"x_squared_norms": X[:2]}, r"The length of x_squared_norms .* should " r"be equal to the length of n_samples", ), ], ) def test_kmeans_plusplus_wrong_params(param, match): with pytest.raises(ValueError, match=match): kmeans_plusplus(X, n_clusters, **param) @pytest.mark.parametrize( "input_data", [X] + X_as_any_csr, ) @pytest.mark.parametrize("dtype", [np.float64, np.float32]) def test_kmeans_plusplus_output(input_data, dtype, global_random_seed): # Check for the correct number of seeds and all positive values data = input_data.astype(dtype) centers, indices = kmeans_plusplus( data, n_clusters, random_state=global_random_seed ) # Check there are the correct number of indices and that all indices are # positive and within the number of samples assert indices.shape[0] == n_clusters assert (indices >= 0).all() assert (indices <= data.shape[0]).all() # Check for the correct number of seeds and that they are bound by the data assert centers.shape[0] == n_clusters assert (centers.max(axis=0) <= data.max(axis=0)).all() assert (centers.min(axis=0) >= data.min(axis=0)).all() # Check that indices correspond to reported centers # Use X for comparison rather than data, test still works against centers # calculated with sparse data. assert_allclose(X[indices].astype(dtype), centers) @pytest.mark.parametrize("x_squared_norms", [row_norms(X, squared=True), None]) def test_kmeans_plusplus_norms(x_squared_norms): # Check that defining x_squared_norms returns the same as default=None. centers, indices = kmeans_plusplus(X, n_clusters, x_squared_norms=x_squared_norms) assert_allclose(X[indices], centers) def test_kmeans_plusplus_dataorder(global_random_seed): # Check that memory layout does not effect result centers_c, _ = kmeans_plusplus(X, n_clusters, random_state=global_random_seed) X_fortran = np.asfortranarray(X) centers_fortran, _ = kmeans_plusplus( X_fortran, n_clusters, random_state=global_random_seed ) assert_allclose(centers_c, centers_fortran) def test_is_same_clustering(): # Sanity check for the _is_same_clustering utility function labels1 = np.array([1, 0, 0, 1, 2, 0, 2, 1], dtype=np.int32) assert _is_same_clustering(labels1, labels1, 3) # these other labels represent the same clustering since we can retrieve the first # labels by simply renaming the labels: 0 -> 1, 1 -> 2, 2 -> 0. labels2 = np.array([0, 2, 2, 0, 1, 2, 1, 0], dtype=np.int32) assert _is_same_clustering(labels1, labels2, 3) # these other labels do not represent the same clustering since not all ones are # mapped to a same value labels3 = np.array([1, 0, 0, 2, 2, 0, 2, 1], dtype=np.int32) assert not _is_same_clustering(labels1, labels3, 3) @pytest.mark.parametrize( "kwargs", ({"init": np.str_("k-means++")}, {"init": [[0, 0], [1, 1]], "n_init": 1}) ) def test_kmeans_with_array_like_or_np_scalar_init(kwargs): """Check that init works with numpy scalar strings. Non-regression test for #21964. """ X = np.asarray([[0, 0], [0.5, 0], [0.5, 1], [1, 1]], dtype=np.float64) clustering = KMeans(n_clusters=2, **kwargs) # Does not raise clustering.fit(X) @pytest.mark.parametrize( "Klass, method", [(KMeans, "fit"), (MiniBatchKMeans, "fit"), (MiniBatchKMeans, "partial_fit")], ) def test_feature_names_out(Klass, method): """Check `feature_names_out` for `KMeans` and `MiniBatchKMeans`.""" class_name = Klass.__name__.lower() kmeans = Klass() getattr(kmeans, method)(X) n_clusters = kmeans.cluster_centers_.shape[0] names_out = kmeans.get_feature_names_out() assert_array_equal([f"{class_name}{i}" for i in range(n_clusters)], names_out) @pytest.mark.parametrize("csr_container", CSR_CONTAINERS + [None]) def test_predict_does_not_change_cluster_centers(csr_container): """Check that predict does not change cluster centers. Non-regression test for gh-24253. """ X, _ = make_blobs(n_samples=200, n_features=10, centers=10, random_state=0) if csr_container is not None: X = csr_container(X) kmeans = KMeans() y_pred1 = kmeans.fit_predict(X) # Make cluster_centers readonly kmeans.cluster_centers_ = create_memmap_backed_data(kmeans.cluster_centers_) kmeans.labels_ = create_memmap_backed_data(kmeans.labels_) y_pred2 = kmeans.predict(X) assert_array_equal(y_pred1, y_pred2) @pytest.mark.parametrize("init", ["k-means++", "random"]) def test_sample_weight_init(init, global_random_seed): """Check that sample weight is used during init. `_init_centroids` is shared across all classes inheriting from _BaseKMeans so it's enough to check for KMeans. """ rng = np.random.RandomState(global_random_seed) X, _ = make_blobs( n_samples=200, n_features=10, centers=10, random_state=global_random_seed ) x_squared_norms = row_norms(X, squared=True) kmeans = KMeans() clusters_weighted = kmeans._init_centroids( X=X, x_squared_norms=x_squared_norms, init=init, sample_weight=rng.uniform(size=X.shape[0]), n_centroids=5, random_state=np.random.RandomState(global_random_seed), ) clusters = kmeans._init_centroids( X=X, x_squared_norms=x_squared_norms, init=init, sample_weight=np.ones(X.shape[0]), n_centroids=5, random_state=np.random.RandomState(global_random_seed), ) with pytest.raises(AssertionError): assert_allclose(clusters_weighted, clusters) @pytest.mark.parametrize("init", ["k-means++", "random"]) def test_sample_weight_zero(init, global_random_seed): """Check that if sample weight is 0, this sample won't be chosen. `_init_centroids` is shared across all classes inheriting from _BaseKMeans so it's enough to check for KMeans. """ rng = np.random.RandomState(global_random_seed) X, _ = make_blobs( n_samples=100, n_features=5, centers=5, random_state=global_random_seed ) sample_weight = rng.uniform(size=X.shape[0]) sample_weight[::2] = 0 x_squared_norms = row_norms(X, squared=True) kmeans = KMeans() clusters_weighted = kmeans._init_centroids( X=X, x_squared_norms=x_squared_norms, init=init, sample_weight=sample_weight, n_centroids=10, random_state=np.random.RandomState(global_random_seed), ) # No center should be one of the 0 sample weight point # (i.e. be at a distance=0 from it) d = euclidean_distances(X[::2], clusters_weighted) assert not np.any(np.isclose(d, 0)) @pytest.mark.parametrize("array_constr", data_containers, ids=data_containers_ids) @pytest.mark.parametrize("algorithm", ["lloyd", "elkan"]) def test_relocating_with_duplicates(algorithm, array_constr): """Check that kmeans stops when there are more centers than non-duplicate samples Non-regression test for issue: https://github.com/scikit-learn/scikit-learn/issues/28055 """ X = np.array([[0, 0], [1, 1], [1, 1], [1, 0], [0, 1]]) km = KMeans(n_clusters=5, init=X, algorithm=algorithm) msg = r"Number of distinct clusters \(4\) found smaller than n_clusters \(5\)" with pytest.warns(ConvergenceWarning, match=msg): km.fit(array_constr(X)) assert km.n_iter_ == 1