332 lines
10 KiB
Cython
332 lines
10 KiB
Cython
# Author: Peter Prettenhofer <peter.prettenhofer@gmail.com>
|
|
# Olivier Grisel <olivier.grisel@ensta.org>
|
|
# Lars Buitinck
|
|
#
|
|
# License: BSD 3 clause
|
|
|
|
import numpy as np
|
|
from cython cimport floating
|
|
from cython.parallel cimport prange
|
|
from libc.math cimport sqrt
|
|
|
|
from ..utils.extmath import row_norms
|
|
|
|
|
|
# Number of samples per data chunk, defined as a module-level constant.
# NOTE(review): not referenced by the code visible in this file; presumably
# read by other modules that process the data chunk by chunk -- confirm.
CHUNK_SIZE = 256
|
|
|
|
|
|
cdef floating _euclidean_dense_dense(
        const floating* a,  # IN
        const floating* b,  # IN
        int n_features,
        bint squared
) noexcept nogil:
    """Distance between two dense vectors given as raw pointers.

    Reads the first `n_features` values behind `a` and `b`, accumulates the
    squared differences and returns that sum when `squared` is true, or its
    square root otherwise.
    """
    cdef:
        int block, k
        int n_blocks = n_features // 4
        int n_tail = n_features % 4
        int offset = 0
        floating d0, d1, d2, d3, diff
        floating acc = 0

    # Main part: the loop is manually unrolled, four features per iteration.
    for block in range(n_blocks):
        d0 = a[offset] - b[offset]
        d1 = a[offset + 1] - b[offset + 1]
        d2 = a[offset + 2] - b[offset + 2]
        d3 = a[offset + 3] - b[offset + 3]
        acc += d0 * d0 + d1 * d1 + d2 * d2 + d3 * d3
        offset += 4

    # Tail: the (at most three) features left over after the unrolled part.
    for k in range(n_tail):
        diff = a[offset + k] - b[offset + k]
        acc += diff * diff

    if squared:
        return acc
    return sqrt(acc)
|
|
|
|
|
|
def _euclidean_dense_dense_wrapper(
    const floating[::1] a,
    const floating[::1] b,
    bint squared
):
    """Wrapper of _euclidean_dense_dense for testing purpose

    Parameters
    ----------
    a, b : contiguous 1-d buffers of the same floating type and same length
    squared : if true return the squared distance, else the distance

    Raises
    ------
    ValueError
        If `a` and `b` do not have the same length. The underlying routine
        reads `a.shape[0]` values from both buffers, so a shorter `b` would
        otherwise be read out of bounds.
    """
    cdef int n_features = a.shape[0]

    if b.shape[0] != n_features:
        raise ValueError(
            f"a and b must have the same length, got {a.shape[0]} and {b.shape[0]}"
        )

    if n_features == 0:
        # There is no first element to take the address of; the sum over zero
        # features is 0 whether or not the square root is applied.
        return 0.0

    return _euclidean_dense_dense(&a[0], &b[0], n_features, squared)
|
|
|
|
|
|
cdef floating _euclidean_sparse_dense(
        const floating[::1] a_data,  # IN
        const int[::1] a_indices,    # IN
        const floating[::1] b,       # IN
        floating b_squared_norm,
        bint squared
) noexcept nogil:
    """Distance between a sparse vector a and a dense vector b.

    `a` is given by its stored values `a_data` and their positions
    `a_indices`. Only those positions are visited: for each one the term
    (a_i - b_i)^2 - b_i^2 is accumulated, and `b_squared_norm` is added once
    at the end to account for the positions of `b` that were not visited.
    Returns the squared value when `squared` is true, its square root
    otherwise.
    """
    cdef:
        int n_stored = a_indices.shape[0]
        int pos
        floating diff, b_val
        floating acc = 0.0

    for pos in range(n_stored):
        b_val = b[a_indices[pos]]
        diff = a_data[pos] - b_val
        acc += diff * diff - b_val * b_val

    acc += b_squared_norm

    # A squared distance cannot be negative; clamp whatever the subtraction
    # above may have left below zero before taking the square root.
    if acc < 0:
        acc = 0.0

    if squared:
        return acc
    return sqrt(acc)
|
|
|
|
|
|
def _euclidean_sparse_dense_wrapper(
    const floating[::1] a_data,
    const int[::1] a_indices,
    const floating[::1] b,
    floating b_squared_norm,
    bint squared
):
    """Python-callable entry point to _euclidean_sparse_dense, for tests.

    Forwards all arguments unchanged and returns the computed distance.
    """
    cdef floating distance = _euclidean_sparse_dense(
        a_data, a_indices, b, b_squared_norm, squared
    )
    return distance
|
|
|
|
|
|
cpdef floating _inertia_dense(
        const floating[:, ::1] X,           # IN
        const floating[::1] sample_weight,  # IN
        const floating[:, ::1] centers,     # IN
        const int[::1] labels,              # IN
        int n_threads,
        int single_label=-1,
):
    """Compute inertia for dense input data

    Sum of squared distance between each sample and its assigned center,
    each term being multiplied by the sample's weight.

    If single_label is >= 0, the inertia is computed only for that label.

    Parameters
    ----------
    X : rows are samples, columns are features.
    sample_weight : one weight per row of X.
    centers : row j is the center for label j; read over X.shape[1] columns.
    labels : labels[i] is the row of `centers` assigned to sample i.
    n_threads : number of threads handed to `prange`.
    single_label : when >= 0, only samples carrying this label contribute.
    """
    cdef:
        int n_samples = X.shape[0]
        int n_features = X.shape[1]
        int i, j

        floating sq_dist = 0.0
        floating inertia = 0.0

    # `inertia` is only ever updated in place inside the prange body, which is
    # what lets Cython treat it as a reduction across threads.
    for i in prange(n_samples, nogil=True, num_threads=n_threads,
                    schedule='static'):
        j = labels[i]
        if single_label < 0 or single_label == j:
            # Pass the address of the first element of each row.
            sq_dist = _euclidean_dense_dense(&X[i, 0], &centers[j, 0],
                                             n_features, True)
            inertia += sq_dist * sample_weight[i]

    return inertia
|
|
|
|
|
|
cpdef floating _inertia_sparse(
        X,                                  # IN
        const floating[::1] sample_weight,  # IN
        const floating[:, ::1] centers,     # IN
        const int[::1] labels,              # IN
        int n_threads,
        int single_label=-1,
):
    """Weighted inertia for sparse input data.

    Adds up, over the samples, the squared distance between the sample and
    the center selected by its label, multiplied by the sample's weight.
    `X` must expose `data`, `indices`, `indptr` and `shape`; row i is read
    from positions indptr[i] to indptr[i + 1].

    When single_label is >= 0, only samples with that label are counted.
    """
    cdef:
        floating[::1] X_data = X.data
        int[::1] X_indices = X.indices
        int[::1] X_indptr = X.indptr

        int n_samples = X.shape[0]
        int sample_idx, label, row_start, row_end

        floating sq_dist = 0.0
        floating inertia = 0.0

        floating[::1] centers_squared_norms = row_norms(centers, squared=True)

    for sample_idx in prange(n_samples, nogil=True, num_threads=n_threads,
                             schedule='static'):
        label = labels[sample_idx]
        if single_label < 0 or single_label == label:
            row_start = X_indptr[sample_idx]
            row_end = X_indptr[sample_idx + 1]
            sq_dist = _euclidean_sparse_dense(
                X_data[row_start:row_end],
                X_indices[row_start:row_end],
                centers[label], centers_squared_norms[label], True)
            inertia += sq_dist * sample_weight[sample_idx]

    return inertia
|
|
|
|
|
|
cpdef void _relocate_empty_clusters_dense(
        const floating[:, ::1] X,            # IN
        const floating[::1] sample_weight,   # IN
        const floating[:, ::1] centers_old,  # IN
        floating[:, ::1] centers_new,        # INOUT
        floating[::1] weight_in_clusters,    # INOUT
        const int[::1] labels                # IN
):
    """Relocate centers which have no sample assigned to them.

    A cluster counts as empty when its entry in `weight_in_clusters` equals 0.
    Each empty cluster is paired with one of the samples lying farthest from
    its current center in `centers_old`. The sample's weighted coordinates are
    subtracted from its former cluster's row of `centers_new` and written to
    the empty cluster's row; `weight_in_clusters` is adjusted the same way.
    Both arrays are modified in place.

    Nothing is changed when there is no empty cluster, or when every sample
    sits exactly on its center.
    """
    cdef:
        int[::1] empty_clusters = np.where(np.equal(weight_in_clusters, 0))[0].astype(np.int32)
        int n_empty = empty_clusters.shape[0]

    if n_empty == 0:
        return

    cdef:
        int n_features = X.shape[1]

        # Squared distance of every sample to the center it is assigned to.
        floating[::1] distances = ((np.asarray(X) - np.asarray(centers_old)[labels])**2).sum(axis=1)

    if np.max(distances) == 0:
        # Happens when there are more clusters than non-duplicate samples. Relocating
        # is pointless in this case.
        return

    cdef:
        # Computed only after the early return above, so the partition is not
        # done for nothing. argpartition puts the n_empty largest distances in
        # the last n_empty slots; the reversed slice picks exactly those.
        int[::1] far_from_centers = np.argpartition(distances, -n_empty)[:-n_empty-1:-1].astype(np.int32)

        int new_cluster_id, old_cluster_id, far_idx, idx, k
        floating weight

    for idx in range(n_empty):

        new_cluster_id = empty_clusters[idx]

        far_idx = far_from_centers[idx]
        weight = sample_weight[far_idx]

        old_cluster_id = labels[far_idx]

        for k in range(n_features):
            centers_new[old_cluster_id, k] -= X[far_idx, k] * weight
            centers_new[new_cluster_id, k] = X[far_idx, k] * weight

        weight_in_clusters[new_cluster_id] = weight
        weight_in_clusters[old_cluster_id] -= weight
|
|
|
|
|
|
cpdef void _relocate_empty_clusters_sparse(
        const floating[::1] X_data,          # IN
        const int[::1] X_indices,            # IN
        const int[::1] X_indptr,             # IN
        const floating[::1] sample_weight,   # IN
        const floating[:, ::1] centers_old,  # IN
        floating[:, ::1] centers_new,        # INOUT
        floating[::1] weight_in_clusters,    # INOUT
        const int[::1] labels                # IN
):
    """Move centers of empty clusters onto far-away samples (sparse input).

    The samples are described by `X_data`, `X_indices` and `X_indptr`; sample
    i occupies positions X_indptr[i] to X_indptr[i + 1]. A cluster is empty
    when its entry in `weight_in_clusters` equals 0. Each empty cluster takes
    one of the samples farthest from its center in `centers_old`: for the
    sample's stored columns, the weighted values are subtracted from the
    former cluster's row of `centers_new` and written to the empty cluster's
    row. `weight_in_clusters` is updated accordingly. Both are modified in
    place.
    """
    cdef:
        int[::1] empty_clusters = np.where(np.equal(weight_in_clusters, 0))[0].astype(np.int32)
        int n_empty = empty_clusters.shape[0]

    if n_empty == 0:
        return

    cdef:
        int n_samples = X_indptr.shape[0] - 1
        int sample_idx, label, row_start, row_end

        floating[::1] distances = np.zeros(n_samples, dtype=X_data.base.dtype)
        floating[::1] centers_squared_norms = row_norms(centers_old, squared=True)

    # Squared distance of every sample to the center it is assigned to.
    for sample_idx in range(n_samples):
        label = labels[sample_idx]
        row_start = X_indptr[sample_idx]
        row_end = X_indptr[sample_idx + 1]
        distances[sample_idx] = _euclidean_sparse_dense(
            X_data[row_start:row_end],
            X_indices[row_start:row_end],
            centers_old[label], centers_squared_norms[label], True)

    if np.max(distances) == 0:
        # Happens when there are more clusters than non-duplicate samples. Relocating
        # is pointless in this case.
        return

    cdef:
        # Indices of the n_empty samples with the largest distances.
        int[::1] far_from_centers = np.argpartition(distances, -n_empty)[:-n_empty-1:-1].astype(np.int32)

        int new_cluster_id, old_cluster_id, far_idx, idx, nz, col
        floating weight, contribution

    for idx in range(n_empty):
        new_cluster_id = empty_clusters[idx]
        far_idx = far_from_centers[idx]
        weight = sample_weight[far_idx]
        old_cluster_id = labels[far_idx]

        # Only the stored entries of the chosen sample are touched.
        for nz in range(X_indptr[far_idx], X_indptr[far_idx + 1]):
            col = X_indices[nz]
            contribution = X_data[nz] * weight
            centers_new[old_cluster_id, col] -= contribution
            centers_new[new_cluster_id, col] = contribution

        weight_in_clusters[new_cluster_id] = weight
        weight_in_clusters[old_cluster_id] -= weight
|
|
|
|
|
|
cdef void _average_centers(
        floating[:, ::1] centers,               # INOUT
        const floating[::1] weight_in_clusters  # IN
):
    """Average new centers wrt weights.

    On entry each row of `centers` is divided in place by the matching entry
    of `weight_in_clusters` when that entry is > 0. Rows whose weight is not
    > 0 are overwritten with the (already averaged) row of the cluster with
    the largest weight.
    """
    cdef:
        int n_clusters = centers.shape[0]
        int n_features = centers.shape[1]
        int j, k
        floating alpha
        int argmax_weight = np.argmax(weight_in_clusters)

    # First pass: average every non-empty cluster. This is done for all rows
    # before any copy below, so that the row of the biggest cluster is already
    # a center (and not a weighted sum) whatever its position in the array.
    for j in range(n_clusters):
        if weight_in_clusters[j] > 0:
            alpha = 1.0 / weight_in_clusters[j]
            for k in range(n_features):
                centers[j, k] *= alpha

    # Second pass: for convenience, we avoid setting empty clusters at the
    # origin but place them at the location of the biggest cluster.
    for j in range(n_clusters):
        if not (weight_in_clusters[j] > 0):
            for k in range(n_features):
                centers[j, k] = centers[argmax_weight, k]
|
|
|
|
|
|
cdef void _center_shift(
        const floating[:, ::1] centers_old,  # IN
        const floating[:, ::1] centers_new,  # IN
        floating[::1] center_shift           # OUT
):
    """Compute shift between old and new centers.

    For every row j of `centers_old`, writes into center_shift[j] the
    (non-squared) Euclidean distance between row j of `centers_new` and
    row j of `centers_old`. Both arrays are read over the number of columns
    of `centers_old`.
    """
    cdef:
        int n_clusters = centers_old.shape[0]
        int n_features = centers_old.shape[1]
        int j

    for j in range(n_clusters):
        # Pass the address of the first element of each row.
        center_shift[j] = _euclidean_dense_dense(
            &centers_new[j, 0], &centers_old[j, 0], n_features, False)
|
|
|
|
|
|
def _is_same_clustering(
    const int[::1] labels1,
    const int[::1] labels2,
    n_clusters
):
    """Check if two arrays of labels are the same up to a permutation of the labels

    Returns True when there is a one-to-one renaming of the labels that turns
    `labels1` into `labels2`, False otherwise. Arrays of different lengths are
    never the same clustering.

    Both mappings (labels1 -> labels2 and labels2 -> labels1) are tracked, so
    two distinct labels of `labels1` cannot be sent to the same label of
    `labels2`, and the result does not depend on the order of the arguments.

    NOTE(review): labels are used as indices into arrays of size `n_clusters`,
    so they are assumed to lie in [0, n_clusters) -- confirm against callers.
    """
    cdef int[::1] mapping = np.full(fill_value=-1, shape=(n_clusters,), dtype=np.int32)
    cdef int[::1] inverse = np.full(fill_value=-1, shape=(n_clusters,), dtype=np.int32)
    cdef int i, label1, label2

    if labels1.shape[0] != labels2.shape[0]:
        return False

    for i in range(labels1.shape[0]):
        label1 = labels1[i]
        label2 = labels2[i]
        if mapping[label1] == -1 and inverse[label2] == -1:
            # First time either label is seen: bind them to each other.
            mapping[label1] = label2
            inverse[label2] = label1
        elif mapping[label1] != label2:
            # Either label1 is already bound to another label, or label2 is
            # already taken by another label of labels1.
            return False
    return True
|