projektAI/venv/Lib/site-packages/mlxtend/cluster/kmeans.py
# Sebastian Raschka 2014-2020
# mlxtend Machine Learning Library Extensions
#
# Estimator for K-means clustering
# Author: Sebastian Raschka <sebastianraschka.com>
#
# License: BSD 3 clause

import numpy as np

from .._base import _Cluster
from .._base import _BaseModel
from .._base import _IterativeModel

# from scipy.spatial.distance import euclidean


class Kmeans(_BaseModel, _Cluster, _IterativeModel):
""" K-means clustering class.
Added in 0.4.1dev
Parameters
------------
k : int
Number of clusters
max_iter : int (default: 10)
Number of iterations during cluster assignment.
Cluster re-assignment stops automatically when the algorithm
converged.
convergence_tolerance : float (default: 1e-05)
Compares current centroids with centroids of the previous iteration
using the given tolerance (a small positive float)to determine
if the algorithm converged early.
random_seed : int (default: None)
Set random state for the initial centroid assignment.
print_progress : int (default: 0)
Prints progress in fitting to stderr.
0: No output
1: Iterations elapsed
2: 1 plus time elapsed
3: 2 plus estimated time until completion
Attributes
-----------
centroids_ : 2d-array, shape={k, n_features}
Feature values of the k cluster centroids.
custers_ : dictionary
The cluster assignments stored as a Python dictionary;
the dictionary keys denote the cluster indeces and the items are
Python lists of the sample indices that were assigned to each
cluster.
iterations_ : int
Number of iterations until convergence.
Examples
-----------
For usage examples, please see
http://rasbt.github.io/mlxtend/user_guide/classifier/Kmeans/
"""
    def __init__(self, k, max_iter=10,
                 convergence_tolerance=1e-05,
                 random_seed=None, print_progress=0):
        _BaseModel.__init__(self)
        _Cluster.__init__(self)
        _IterativeModel.__init__(self)
        self.k = k
        self.max_iter = max_iter
        self.convergence_tolerance = convergence_tolerance
        self.random_seed = random_seed
        self.print_progress = print_progress
        self._is_fitted = False
    def _fit(self, X, init_params=True):
        """Learn cluster centroids from training data.

        Called in self.fit

        """
        n_samples = X.shape[0]

        if init_params:
            self.iterations_ = 0
            # initialize centroids by sampling k distinct training points
            rgen = np.random.RandomState(self.random_seed)
            idx = rgen.choice(n_samples, self.k, replace=False)
            self.centroids_ = X[idx]

        for _ in range(self.max_iter):

            # assign samples to cluster centroids
            self.clusters_ = {i: [] for i in range(self.k)}
            for sample_idx, cluster_idx in enumerate(
                    self._get_cluster_idx(X=X, centroids=self.centroids_)):
                self.clusters_[cluster_idx].append(sample_idx)

            # recompute centroids as the mean of the assigned samples
            new_centroids = np.array([np.mean(X[self.clusters_[k]], axis=0)
                                      for k in sorted(self.clusters_.keys())])

            # stop early if the centroids barely moved, i.e., if
            # |new - old| <= atol + rtol * |new| holds element-wise
            if np.allclose(self.centroids_, new_centroids,
                           rtol=self.convergence_tolerance,
                           atol=1e-08, equal_nan=False):
                break
            else:
                self.centroids_ = new_centroids

            self.iterations_ += 1
            if self.print_progress:
                self._print_progress(iteration=self.iterations_,
                                     n_iter=self.max_iter)
        return self
    def _get_cluster_idx(self, X, centroids):
        for sample_idx, sample in enumerate(X):
            # dist = [euclidean(sample, c) for c in self.centroids_]
            # Euclidean distance from this sample to every centroid
            dist = np.sqrt(np.sum(np.square(sample - centroids), axis=1))
            yield np.argmin(dist)
    def _predict(self, X):
        """Predict cluster labels of X.

        Called in self.predict

        """
        pred = np.array([idx for idx in self._get_cluster_idx(
            X=X, centroids=self.centroids_)])
        return pred
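

# The sketch below is an editorial illustration, not part of the mlxtend API:
# it condenses one Lloyd iteration (the body of the loop in Kmeans._fit) into
# a vectorized NumPy form so the assignment and centroid-update steps are easy
# to compare with the loop-based code above. The name _kmeans_iteration_sketch
# is hypothetical.
def _kmeans_iteration_sketch(X, centroids):
    """Return (cluster assignments, updated centroids) for one iteration."""
    # Euclidean distance from every sample to every centroid via broadcasting:
    # (n_samples, 1, n_features) - (k, n_features) -> distances of shape
    # (n_samples, k)
    dist = np.sqrt(np.sum(np.square(X[:, np.newaxis, :] - centroids), axis=2))
    # Nearest centroid per sample, shape (n_samples,)
    assignments = np.argmin(dist, axis=1)
    # Each new centroid is the mean of its assigned samples; like Kmeans._fit,
    # this assumes every cluster receives at least one sample.
    new_centroids = np.array([X[assignments == i].mean(axis=0)
                              for i in range(centroids.shape[0])])
    return assignments, new_centroids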
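

# Minimal usage sketch. It assumes the fit()/predict() wrappers inherited from
# the _base classes dispatch to _fit()/_predict() as the docstrings above
# state; the toy data and all parameter values are illustrative only. Because
# this module uses relative imports, run it as a package module, e.g.
# `python -m mlxtend.cluster.kmeans`, rather than as a standalone script.
if __name__ == "__main__":
    rng = np.random.RandomState(0)
    # Three well-separated 2-D blobs of 50 samples each
    X_demo = np.vstack([rng.randn(50, 2) + offset
                        for offset in ([0.0, 0.0], [6.0, 6.0], [0.0, 6.0])])

    km = Kmeans(k=3, max_iter=50, random_seed=1)
    km.fit(X_demo)
    labels = km.predict(X_demo)

    print("centroids:\n", km.centroids_)
    print("iterations until convergence:", km.iterations_)
    print("first ten cluster labels:", labels[:10])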