# Sebastian Raschka 2014-2020 # mlxtend Machine Learning Library Extensions # # Estimator for Linear Regression # Author: Sebastian Raschka # # License: BSD 3 clause import numpy as np from .._base import _Cluster from .._base import _BaseModel from .._base import _IterativeModel # from scipy.spatial.distance import euclidean class Kmeans(_BaseModel, _Cluster, _IterativeModel): """ K-means clustering class. Added in 0.4.1dev Parameters ------------ k : int Number of clusters max_iter : int (default: 10) Number of iterations during cluster assignment. Cluster re-assignment stops automatically when the algorithm converged. convergence_tolerance : float (default: 1e-05) Compares current centroids with centroids of the previous iteration using the given tolerance (a small positive float)to determine if the algorithm converged early. random_seed : int (default: None) Set random state for the initial centroid assignment. print_progress : int (default: 0) Prints progress in fitting to stderr. 0: No output 1: Iterations elapsed 2: 1 plus time elapsed 3: 2 plus estimated time until completion Attributes ----------- centroids_ : 2d-array, shape={k, n_features} Feature values of the k cluster centroids. custers_ : dictionary The cluster assignments stored as a Python dictionary; the dictionary keys denote the cluster indeces and the items are Python lists of the sample indices that were assigned to each cluster. iterations_ : int Number of iterations until convergence. Examples ----------- For usage examples, please see http://rasbt.github.io/mlxtend/user_guide/classifier/Kmeans/ """ def __init__(self, k, max_iter=10, convergence_tolerance=1e-05, random_seed=None, print_progress=0): _BaseModel.__init__(self) _Cluster.__init__(self) _IterativeModel.__init__(self) self.k = k self.max_iter = max_iter self.convergence_tolerance = convergence_tolerance self.random_seed = random_seed self.print_progress = print_progress self._is_fitted = False def _fit(self, X, init_params=True): """Learn cluster centroids from training data. Called in self.fit """ n_samples = X.shape[0] if init_params: self.iterations_ = 0 # initialize centroids rgen = np.random.RandomState(self.random_seed) idx = rgen.choice(n_samples, self.k, replace=False) self.centroids_ = X[idx] for _ in range(self.max_iter): # assign samples to cluster centroids self.clusters_ = {i: [] for i in range(self.k)} for sample_idx, cluster_idx in enumerate( self._get_cluster_idx(X=X, centroids=self.centroids_)): self.clusters_[cluster_idx].append(sample_idx) # recompute centroids new_centroids = np.array([np.mean(X[self.clusters_[k]], axis=0) for k in sorted(self.clusters_.keys())]) # stop if cluster assignment doesn't change if np.allclose(self.centroids_, new_centroids, rtol=self.convergence_tolerance, atol=1e-08, equal_nan=False): break else: self.centroids_ = new_centroids self.iterations_ += 1 if self.print_progress: self._print_progress(iteration=self.iterations_, n_iter=self.max_iter) return self def _get_cluster_idx(self, X, centroids): for sample_idx, sample in enumerate(X): # dist = [euclidean(sample, c) for c in self.centroids_] dist = np.sqrt(np.sum(np.square(sample - self.centroids_), axis=1)) yield np.argmin(dist) def _predict(self, X): """Predict cluster labels of X. Called in self.predict """ pred = np.array([idx for idx in self._get_cluster_idx(X=X, centroids=self.centroids_)]) return pred