# sklearn/cluster/_k_means_minibatch.pyx
from cython cimport floating
from cython.parallel cimport parallel, prange
from libc.stdlib cimport malloc, free
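
# Both updaters below implement the same incremental weighted-mean rule. For a
# center c with accumulated weight W, and a minibatch contributing a weighted
# sum s = sum_i w_i * x_i over the samples assigned to that center (total
# batch weight w), the new center is
#
#     c_new = (c_old * W + s) / (W + w)
#
# after which the accumulated weight becomes W + w. Centers that receive no
# samples in the batch are left unchanged.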


def _minibatch_update_dense(
        const floating[:, ::1] X,            # IN
        const floating[::1] sample_weight,   # IN
        const floating[:, ::1] centers_old,  # IN
        floating[:, ::1] centers_new,        # OUT
        floating[::1] weight_sums,           # INOUT
        const int[::1] labels,               # IN
        int n_threads):
"""Update of the centers for dense MiniBatchKMeans.
Parameters
----------
X : ndarray of shape (n_samples, n_features), dtype=floating
The observations to cluster.
sample_weight : ndarray of shape (n_samples,), dtype=floating
The weights for each observation in X.
centers_old : ndarray of shape (n_clusters, n_features), dtype=floating
Centers before previous iteration, placeholder for the centers after
previous iteration.
centers_new : ndarray of shape (n_clusters, n_features), dtype=floating
Centers after previous iteration, placeholder for the new centers
computed during this iteration.
weight_sums : ndarray of shape (n_clusters,), dtype=floating
Current sums of the accumulated weights for each center.
labels : ndarray of shape (n_samples,), dtype=int
labels assignment.
n_threads : int
The number of threads to be used by openmp.
"""
    cdef:
        int n_samples = X.shape[0]
        int n_clusters = centers_old.shape[0]
        int cluster_idx

        int *indices
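
    # Each OpenMP thread allocates its own scratch buffer, large enough to
    # hold the indices of every sample in the batch; the buffer is reused for
    # all clusters handled by that thread and freed when the parallel region
    # ends.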
    with nogil, parallel(num_threads=n_threads):
        indices = <int*> malloc(n_samples * sizeof(int))

        for cluster_idx in prange(n_clusters, schedule="static"):
            update_center_dense(cluster_idx, X, sample_weight,
                                centers_old, centers_new, weight_sums, labels,
                                indices)

        free(indices)
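
# A minimal driver sketch for the dense updater, assuming this module is
# compiled and importable as sklearn.cluster._k_means_minibatch (it is a
# private extension module, so the import path is an assumption and may
# change between scikit-learn versions):
#
#     import numpy as np
#     from sklearn.cluster._k_means_minibatch import _minibatch_update_dense
#
#     X = np.random.rand(100, 5)               # minibatch, C-contiguous float64
#     sample_weight = np.ones(100)             # uniform weights
#     centers_old = np.random.rand(8, 5)       # current centers
#     centers_new = np.empty_like(centers_old) # filled by the call
#     weight_sums = np.ones(8)                 # accumulated weights so far
#     labels = np.random.randint(8, size=100).astype(np.int32)
#
#     _minibatch_update_dense(X, sample_weight, centers_old, centers_new,
#                             weight_sums, labels, n_threads=1)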


cdef void update_center_dense(
        int cluster_idx,
        const floating[:, ::1] X,            # IN
        const floating[::1] sample_weight,   # IN
        const floating[:, ::1] centers_old,  # IN
        floating[:, ::1] centers_new,        # OUT
        floating[::1] weight_sums,           # INOUT
        const int[::1] labels,               # IN
        int *indices) noexcept nogil:        # TMP
    """Update of a single center for dense MiniBatchKMeans."""
    cdef:
        int n_samples = sample_weight.shape[0]
        int n_features = centers_old.shape[1]
        floating alpha
        int n_indices
        int k, sample_idx, feature_idx

        floating wsum = 0

    # indices = np.where(labels == cluster_idx)[0]
    k = 0
    for sample_idx in range(n_samples):
        if labels[sample_idx] == cluster_idx:
            indices[k] = sample_idx
            wsum += sample_weight[sample_idx]
            k += 1
    n_indices = k

    if wsum > 0:
        # Undo the previous count-based scaling for this cluster center
        for feature_idx in range(n_features):
            centers_new[cluster_idx, feature_idx] = centers_old[cluster_idx, feature_idx] * weight_sums[cluster_idx]

        # Update cluster with new point members
        for k in range(n_indices):
            sample_idx = indices[k]
            for feature_idx in range(n_features):
                centers_new[cluster_idx, feature_idx] += X[sample_idx, feature_idx] * sample_weight[sample_idx]

        # Update the count statistics for this center
        weight_sums[cluster_idx] += wsum

        # Rescale to compute mean of all points (old and new)
        alpha = 1 / weight_sums[cluster_idx]
        for feature_idx in range(n_features):
            centers_new[cluster_idx, feature_idx] *= alpha
    else:
        # No sample was assigned to this cluster in this batch of data
        for feature_idx in range(n_features):
            centers_new[cluster_idx, feature_idx] = centers_old[cluster_idx, feature_idx]
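
# A worked one-feature example of the update above (illustrative numbers, not
# from the library): suppose a center sits at c_old = 1.0 with accumulated
# weight W = 2.0, and the batch assigns it one sample x = 4.0 with weight 1.0.
# Undoing the scaling gives 1.0 * 2.0 = 2.0, accumulating adds 4.0 for a sum
# of 6.0, W becomes 3.0, and rescaling yields c_new = 6.0 / 3.0 = 2.0, i.e.
# the weighted mean of all points the center has ever absorbed.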


def _minibatch_update_sparse(
        X,                                   # IN
        const floating[::1] sample_weight,   # IN
        const floating[:, ::1] centers_old,  # IN
        floating[:, ::1] centers_new,        # OUT
        floating[::1] weight_sums,           # INOUT
        const int[::1] labels,               # IN
        int n_threads):
"""Update of the centers for sparse MiniBatchKMeans.
Parameters
----------
X : sparse matrix of shape (n_samples, n_features), dtype=floating
The observations to cluster. Must be in CSR format.
sample_weight : ndarray of shape (n_samples,), dtype=floating
The weights for each observation in X.
centers_old : ndarray of shape (n_clusters, n_features), dtype=floating
Centers before previous iteration, placeholder for the centers after
previous iteration.
centers_new : ndarray of shape (n_clusters, n_features), dtype=floating
Centers after previous iteration, placeholder for the new centers
computed during this iteration.
weight_sums : ndarray of shape (n_clusters,), dtype=floating
Current sums of the accumulated weights for each center.
labels : ndarray of shape (n_samples,), dtype=int
labels assignment.
n_threads : int
The number of threads to be used by openmp.
"""
    cdef:
        floating[::1] X_data = X.data
        int[::1] X_indices = X.indices
        int[::1] X_indptr = X.indptr
        int n_samples = X.shape[0]
        int n_clusters = centers_old.shape[0]
        int cluster_idx

        int *indices

    with nogil, parallel(num_threads=n_threads):
        indices = <int*> malloc(n_samples * sizeof(int))

        for cluster_idx in prange(n_clusters, schedule="static"):
            update_center_sparse(cluster_idx, X_data, X_indices, X_indptr,
                                 sample_weight, centers_old, centers_new,
                                 weight_sums, labels, indices)

        free(indices)
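
# A minimal driver sketch for the sparse updater (same caveat as above: the
# private import path is an assumption). The int[::1] typed views require the
# CSR index arrays to be int32, which is scipy's default for small matrices:
#
#     import numpy as np
#     import scipy.sparse as sp
#     from sklearn.cluster._k_means_minibatch import _minibatch_update_sparse
#
#     X = sp.random(100, 5, density=0.3, format="csr", dtype=np.float64)
#     sample_weight = np.ones(100)
#     centers_old = np.random.rand(8, 5)
#     centers_new = np.empty_like(centers_old)
#     weight_sums = np.ones(8)
#     labels = np.random.randint(8, size=100).astype(np.int32)
#
#     _minibatch_update_sparse(X, sample_weight, centers_old, centers_new,
#                              weight_sums, labels, n_threads=1)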


cdef void update_center_sparse(
        int cluster_idx,
        const floating[::1] X_data,          # IN
        const int[::1] X_indices,            # IN
        const int[::1] X_indptr,             # IN
        const floating[::1] sample_weight,   # IN
        const floating[:, ::1] centers_old,  # IN
        floating[:, ::1] centers_new,        # OUT
        floating[::1] weight_sums,           # INOUT
        const int[::1] labels,               # IN
        int *indices) noexcept nogil:        # TMP
    """Update of a single center for sparse MiniBatchKMeans."""
    cdef:
        int n_samples = sample_weight.shape[0]
        int n_features = centers_old.shape[1]
        floating alpha
        int n_indices
        int k, sample_idx, feature_idx

        floating wsum = 0

    # indices = np.where(labels == cluster_idx)[0]
    k = 0
    for sample_idx in range(n_samples):
        if labels[sample_idx] == cluster_idx:
            indices[k] = sample_idx
            wsum += sample_weight[sample_idx]
            k += 1
    n_indices = k

    if wsum > 0:
        # Undo the previous count-based scaling for this cluster center:
        for feature_idx in range(n_features):
            centers_new[cluster_idx, feature_idx] = centers_old[cluster_idx, feature_idx] * weight_sums[cluster_idx]

        # Update cluster with new point members
        for k in range(n_indices):
            sample_idx = indices[k]
            for feature_idx in range(X_indptr[sample_idx], X_indptr[sample_idx + 1]):
                centers_new[cluster_idx, X_indices[feature_idx]] += X_data[feature_idx] * sample_weight[sample_idx]

        # Update the count statistics for this center
        weight_sums[cluster_idx] += wsum

        # Rescale to compute mean of all points (old and new)
        alpha = 1 / weight_sums[cluster_idx]
        for feature_idx in range(n_features):
            centers_new[cluster_idx, feature_idx] *= alpha
    else:
        # No sample was assigned to this cluster in this batch of data
        for feature_idx in range(n_features):
            centers_new[cluster_idx, feature_idx] = centers_old[cluster_idx, feature_idx]
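
# Note on the sparse path: only the nonzero features of the assigned samples
# are accumulated, but the unscale and rescale loops still run over all
# n_features, because the old center itself is dense. Zero entries in the
# batch therefore still pull the center toward the weighted mean correctly.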