307 lines
9.3 KiB
Python
307 lines
9.3 KiB
Python
|
"""
|
||
|
cmeans.py : Fuzzy C-means clustering algorithm.
|
||
|
"""
|
||
|
import numpy as np
|
||
|
from scipy.spatial.distance import cdist
|
||
|
from .normalize_columns import normalize_columns, normalize_power_columns
|
||
|
|
||
|
|
||
|
def _cmeans0(data, u_old, c, m, metric):
    """
    Perform one update step of the generic fuzzy c-means algorithm.

    Adapted from Ross, Fuzzy Logic w/Engineering Applications (2010),
    pages 352-353, equations 10.28 - 10.35.

    Parameters are inherited from cmeans(). `c` is accepted to keep the
    signature stable but is not used in the computation.

    Returns
    -------
    cntr : 2d array
        Membership-weighted cluster centers, one row per cluster.
    u : 2d array
        Updated membership matrix.
    jm : scalar
        Objective function value for this step.
    d : 2d array
        Distances between centers and points, floored at machine epsilon.
    """
    tiny = np.finfo(np.float64).eps

    # Re-normalize the incoming memberships, then keep them strictly
    # positive so that no entry is exactly zero.
    memberships = np.fmax(normalize_columns(u_old), tiny)
    weights = memberships ** m

    # Each center is the weighted mean of all samples.
    samples = data.T
    weight_totals = np.atleast_2d(weights.sum(axis=1)).T
    cntr = weights.dot(samples) / weight_totals

    # Floor the distances so the negative power below never sees a zero.
    d = np.fmax(_distance(samples, cntr, metric), tiny)

    jm = (weights * d ** 2).sum()
    u = normalize_power_columns(d, - 2. / (m - 1))

    return cntr, u, jm, d
|
||
|
|
||
|
|
||
|
def _distance(data, centers, metric='euclidean'):
    """
    Distance between every cluster center and every data point.

    Parameters
    ----------
    data : 2d array (N x Q)
        Data to be analyzed. There are N data points.
    centers : 2d array (C x Q)
        Cluster centers. There are C clusters, with Q features.
    metric: string
        By default is set to euclidean. Passes any option accepted by
        ``scipy.spatial.distance.cdist``.

    Returns
    -------
    dist : 2d array (C x N)
        Distance, under `metric`, from each point to each cluster center.

    See Also
    --------
    scipy.spatial.distance.cdist
    """
    # cdist yields one row per data point; callers want one row per center.
    point_to_center = cdist(data, centers, metric=metric)
    return point_to_center.T
|
||
|
|
||
|
|
||
|
def _fp_coeff(u):
    """
    Compute the fuzzy partition coefficient `fpc` of the fuzzy
    c-partitioned matrix `u`, a measure of 'fuzziness' in partitioned
    clustering.

    Parameters
    ----------
    u : 2d array (C, N)
        Fuzzy c-partitioned matrix; N = number of data points and C = number
        of clusters.

    Returns
    -------
    fpc : float
        Fuzzy partition coefficient.

    """
    n_points = u.shape[1]

    # The trace of u * u^T is the sum of all squared memberships.
    gram = u.dot(u.T)
    return np.trace(gram) / float(n_points)
|
||
|
|
||
|
|
||
|
def cmeans(data, c, m, error, maxiter, metric='euclidean', init=None, seed=None):
    """
    Fuzzy c-means clustering algorithm [1].

    Parameters
    ----------
    data : 2d array, size (S, N)
        Data to be clustered. N is the number of data sets; S is the number
        of features within each sample vector.
    c : int
        Desired number of clusters or classes.
    m : float
        Array exponentiation applied to the membership function u_old at each
        iteration, where U_new = u_old ** m. The membership update uses the
        exponent ``-2 / (m - 1)``, so `m` must not equal 1.
    error : float
        Stopping criterion; stop early if the norm of (u[p] - u[p-1]) < error.
    maxiter : int
        Maximum number of iterations allowed. Must be at least 1.
    metric: string
        By default is set to euclidean. Passes any option accepted by
        ``scipy.spatial.distance.cdist``.
    init : 2d array, size (c, N)
        Initial fuzzy c-partitioned matrix. If none provided, algorithm is
        randomly initialized.
    seed : int
        If provided, sets random seed of init (via ``np.random.seed``, which
        affects NumPy's global random state). No effect if init is
        provided. Mainly for debug/testing purposes.

    Returns
    -------
    cntr : 2d array, size (c, S)
        Cluster centers. Data for each center along each feature provided
        for every cluster (of the `c` requested clusters).
    u : 2d array, (c, N)
        Final fuzzy c-partitioned matrix.
    u0 : 2d array, (c, N)
        Initial guess at fuzzy c-partitioned matrix (either provided init or
        random guess used if init was not provided).
    d : 2d array, (c, N)
        Final distance matrix, computed with `metric`.
    jm : 1d array, length P
        Objective function history.
    p : int
        Number of iterations run.
    fpc : float
        Final fuzzy partition coefficient.

    Raises
    ------
    ValueError
        If `maxiter` is smaller than 1, since at least one iteration is
        required to produce cluster centers.

    Notes
    -----
    The algorithm implemented is from Ross et al. [1]_.

    Fuzzy C-Means has a known problem with high dimensionality datasets, where
    the majority of cluster centers are pulled into the overall center of
    gravity. If you are clustering data with very high dimensionality and
    encounter this issue, another clustering method may be required. For more
    information and the theory behind this, see Winkler et al. [2]_.

    References
    ----------
    .. [1] Ross, Timothy J. Fuzzy Logic With Engineering Applications, 3rd ed.
           Wiley. 2010. ISBN 978-0-470-74376-8 pp 352-353, eq 10.28 - 10.35.

    .. [2] Winkler, R., Klawonn, F., & Kruse, R. Fuzzy c-means in high
           dimensional spaces. 2012. Contemporary Theory and Pragmatic
           Approaches in Fuzzy Computing Utilization, 1.
    """
    # Without at least one iteration there are no centers or distances to
    # return; fail early with a clear message instead of an unbound local.
    if maxiter < 1:
        raise ValueError(
            "maxiter must be at least 1; got {!r}".format(maxiter))

    # Setup u0
    if init is None:
        if seed is not None:
            np.random.seed(seed=seed)
        n = data.shape[1]
        u0 = normalize_columns(np.random.rand(c, n))
    else:
        u0 = init
    u = np.fmax(u0, np.finfo(np.float64).eps)

    # Collect the objective history in a list; converting once at the end
    # avoids re-copying the whole history on every iteration.
    jm_history = []
    p = 0

    # Main cmeans loop: run up to `maxiter` iterations.
    while p < maxiter:
        u2 = u.copy()
        cntr, u, step_jm, d = _cmeans0(data, u2, c, m, metric)
        jm_history.append(step_jm)
        p += 1

        # Stopping rule
        if np.linalg.norm(u - u2) < error:
            break

    # Final calculations
    jm = np.array(jm_history, dtype=np.float64)
    fpc = _fp_coeff(u)

    return cntr, u, u0, d, jm, p, fpc
|
||
|
|
||
|
|
||
|
def cmeans_predict(test_data, cntr_trained, m, error, maxiter, metric='euclidean', init=None,
                   seed=None):
    """
    Prediction of new data in given a trained fuzzy c-means framework [1].

    Parameters
    ----------
    test_data : 2d array, size (S, N)
        New, independent data set to be predicted based on trained c-means
        from ``cmeans``. N is the number of data sets; S is the number of
        features within each sample vector.
    cntr_trained : 2d array, size (c, S)
        Location of trained centers from prior training c-means. The number
        of clusters `c` is taken from its first dimension.
    m : float
        Array exponentiation applied to the membership function u_old at each
        iteration, where U_new = u_old ** m.
    error : float
        Stopping criterion; stop early if the norm of (u[p] - u[p-1]) < error.
    maxiter : int
        Maximum number of iterations allowed. Must be at least 1.
    metric: string
        By default is set to euclidean. Passes any option accepted by
        ``scipy.spatial.distance.cdist``.
    init : 2d array, size (c, N)
        Initial fuzzy c-partitioned matrix. If none provided, algorithm is
        randomly initialized.
    seed : int
        If provided, sets random seed of init (via ``np.random.seed``, which
        affects NumPy's global random state). No effect if init is
        provided. Mainly for debug/testing purposes.

    Returns
    -------
    u : 2d array, (c, N)
        Final fuzzy c-partitioned matrix.
    u0 : 2d array, (c, N)
        Initial guess at fuzzy c-partitioned matrix (either provided init or
        random guess used if init was not provided).
    d : 2d array, (c, N)
        Final distance matrix, computed with `metric`.
    jm : 1d array, length P
        Objective function history.
    p : int
        Number of iterations run.
    fpc : float
        Final fuzzy partition coefficient.

    Raises
    ------
    ValueError
        If `maxiter` is smaller than 1, since at least one iteration is
        required to produce memberships and distances.

    Notes
    -----
    Ross et al. [1]_ did not include a prediction algorithm to go along with
    fuzzy c-means. This prediction algorithm works by repeating the clustering
    with fixed centers, then efficiently finds the fuzzy membership at all
    points.

    References
    ----------
    .. [1] Ross, Timothy J. Fuzzy Logic With Engineering Applications, 3rd ed.
           Wiley. 2010. ISBN 978-0-470-74376-8 pp 352-353, eq 10.28 - 10.35.
    """
    # Without at least one iteration there is no distance matrix to return;
    # fail early with a clear message instead of an unbound local.
    if maxiter < 1:
        raise ValueError(
            "maxiter must be at least 1; got {!r}".format(maxiter))

    c = cntr_trained.shape[0]

    # Setup u0
    if init is None:
        if seed is not None:
            np.random.seed(seed=seed)
        n = test_data.shape[1]
        u0 = normalize_columns(np.random.rand(c, n))
    else:
        u0 = init
    u = np.fmax(u0, np.finfo(np.float64).eps)

    # Collect the objective history in a list; converting once at the end
    # avoids re-copying the whole history on every iteration.
    jm_history = []
    p = 0

    # Main prediction loop: run up to `maxiter` iterations.
    while p < maxiter:
        u2 = u.copy()
        u, step_jm, d = _cmeans_predict0(test_data, cntr_trained, u2, c, m, metric)
        jm_history.append(step_jm)
        p += 1

        # Stopping rule
        if np.linalg.norm(u - u2) < error:
            break

    # Final calculations
    jm = np.array(jm_history, dtype=np.float64)
    fpc = _fp_coeff(u)

    return u, u0, d, jm, p, fpc
|
||
|
|
||
|
|
||
|
def _cmeans_predict0(test_data, cntr, u_old, c, m, metric):
    """
    Perform one step of the fuzzy c-means prediction algorithm.

    The clustering algorithm is modified from Ross, Fuzzy Logic
    w/Engineering Applications (2010) p.352-353, equations 10.28 - 10.35,
    but this method to generate fuzzy predictions was independently derived
    by Josh Warner.

    Parameters are inherited from cmeans(). `c` is accepted to keep the
    signature stable but is not used in the computation.

    Very similar to initial clustering, except `cntr` is not updated, thus
    the new test data are forced into known (trained) clusters.

    Returns
    -------
    u : 2d array
        Updated membership matrix.
    jm : scalar
        Objective function value for this step.
    d : 2d array
        Distances between the fixed centers and the test points, floored at
        machine epsilon.
    """
    tiny = np.finfo(np.float64).eps

    # Re-normalize the incoming memberships, then keep them strictly
    # positive so that no entry is exactly zero.
    memberships = np.fmax(normalize_columns(u_old), tiny)
    weights = memberships ** m

    # The centers are deliberately NOT recomputed here: the test data must
    # conform to the clustering learned earlier.
    samples = test_data.T
    d = np.fmax(_distance(samples, cntr, metric), tiny)

    jm = (weights * d ** 2).sum()
    u = normalize_power_columns(d, - 2. / (m - 1))

    return u, jm, d
|