3RNN/Lib/site-packages/sklearn/manifold/_utils.pyx

import numpy as np

from libc cimport math
from libc.math cimport INFINITY

from ..utils._typedefs cimport float32_t, float64_t


cdef float EPSILON_DBL = 1e-8
cdef float PERPLEXITY_TOLERANCE = 1e-5


# TODO: have this function support float32 and float64 and preserve inputs' dtypes.
def _binary_search_perplexity(
        const float32_t[:, :] sqdistances,
        float desired_perplexity,
        int verbose):
    """Binary search for sigmas of conditional Gaussians.

    This approximation reduces the computational complexity from O(N^2) to
    O(uN).

    Parameters
    ----------
    sqdistances : ndarray of shape (n_samples, n_neighbors), dtype=np.float32
        Distances between training samples and their k nearest neighbors.
        When using the exact method, this is a square (n_samples, n_samples)
        distance matrix. The TSNE default metric is "euclidean" which is
        interpreted as squared euclidean distance.

    desired_perplexity : float
        Desired perplexity (2^entropy) of the conditional Gaussians.

    verbose : int
        Verbosity level.

    Returns
    -------
    P : ndarray of shape (n_samples, n_samples), dtype=np.float64
        Probabilities of conditional Gaussian distributions p_i|j.
    """
    # Maximum number of binary search steps
    cdef long n_steps = 100

    cdef long n_samples = sqdistances.shape[0]
    cdef long n_neighbors = sqdistances.shape[1]
    cdef int using_neighbors = n_neighbors < n_samples
    # Precisions of conditional Gaussian distributions
    cdef double beta
    cdef double beta_min
    cdef double beta_max
    cdef double beta_sum = 0.0

    # Use log scale
    cdef double desired_entropy = math.log(desired_perplexity)
    cdef double entropy_diff

    cdef double entropy
    cdef double sum_Pi
    cdef double sum_disti_Pi
    cdef long i, j, l

    # This array is later used as a 32bit array. It has multiple intermediate
    # floating point additions that benefit from the extra precision
    cdef float64_t[:, :] P = np.zeros(
        (n_samples, n_neighbors), dtype=np.float64)

    for i in range(n_samples):
        beta_min = -INFINITY
        beta_max = INFINITY
        beta = 1.0

        # Binary search of precision for i-th conditional distribution
        for l in range(n_steps):
            # Compute current entropy and corresponding probabilities
            # computed just over the nearest neighbors or over all data
            # if we're not using neighbors
            sum_Pi = 0.0
            for j in range(n_neighbors):
                if j != i or using_neighbors:
                    P[i, j] = math.exp(-sqdistances[i, j] * beta)
                    sum_Pi += P[i, j]

            if sum_Pi == 0.0:
                sum_Pi = EPSILON_DBL
            sum_disti_Pi = 0.0

            for j in range(n_neighbors):
                P[i, j] /= sum_Pi
                sum_disti_Pi += sqdistances[i, j] * P[i, j]

            entropy = math.log(sum_Pi) + beta * sum_disti_Pi
            entropy_diff = entropy - desired_entropy

            if math.fabs(entropy_diff) <= PERPLEXITY_TOLERANCE:
                break

            if entropy_diff > 0.0:
                beta_min = beta
                if beta_max == INFINITY:
                    beta *= 2.0
                else:
                    beta = (beta + beta_max) / 2.0
            else:
                beta_max = beta
                if beta_min == -INFINITY:
                    beta /= 2.0
                else:
                    beta = (beta + beta_min) / 2.0

        beta_sum += beta

        if verbose and ((i + 1) % 1000 == 0 or i + 1 == n_samples):
            print("[t-SNE] Computed conditional probabilities for sample "
                  "%d / %d" % (i + 1, n_samples))

    if verbose:
        print("[t-SNE] Mean sigma: %f"
              % np.mean(math.sqrt(n_samples / beta_sum)))
    return np.asarray(P)
1.0 2024-05-26 19:49:15 +02:00			`import numpy as np`

			`from libc cimport math`
			`from libc.math cimport INFINITY`

			`from ..utils._typedefs cimport float32_t, float64_t`


			`cdef float EPSILON_DBL = 1e-8`
			`cdef float PERPLEXITY_TOLERANCE = 1e-5`


			`# TODO: have this function support float32 and float64 and preserve inputs' dtypes.`
			`def _binary_search_perplexity(`
			`const float32_t[:, :] sqdistances,`
			`float desired_perplexity,`
			`int verbose):`
			`"""Binary search for sigmas of conditional Gaussians.`

			`This approximation reduces the computational complexity from O(N^2) to`
			`O(uN).`

			`Parameters`
			`----------`
			`sqdistances : ndarray of shape (n_samples, n_neighbors), dtype=np.float32`
			`Distances between training samples and their k nearest neighbors.`
			`When using the exact method, this is a square (n_samples, n_samples)`
			`distance matrix. The TSNE default metric is "euclidean" which is`
			`interpreted as squared euclidean distance.`

			`desired_perplexity : float`
			`Desired perplexity (2^entropy) of the conditional Gaussians.`

			`verbose : int`
			`Verbosity level.`

			`Returns`
			`-------`
			`P : ndarray of shape (n_samples, n_samples), dtype=np.float64`
			`Probabilities of conditional Gaussian distributions p_i\|j.`
			`"""`
			`# Maximum number of binary search steps`
			`cdef long n_steps = 100`

			`cdef long n_samples = sqdistances.shape[0]`
			`cdef long n_neighbors = sqdistances.shape[1]`
			`cdef int using_neighbors = n_neighbors < n_samples`
			`# Precisions of conditional Gaussian distributions`
			`cdef double beta`
			`cdef double beta_min`
			`cdef double beta_max`
			`cdef double beta_sum = 0.0`

			`# Use log scale`
			`cdef double desired_entropy = math.log(desired_perplexity)`
			`cdef double entropy_diff`

			`cdef double entropy`
			`cdef double sum_Pi`
			`cdef double sum_disti_Pi`
			`cdef long i, j, l`

			`# This array is later used as a 32bit array. It has multiple intermediate`
			`# floating point additions that benefit from the extra precision`
			`cdef float64_t[:, :] P = np.zeros(`
			`(n_samples, n_neighbors), dtype=np.float64)`

			`for i in range(n_samples):`
			`beta_min = -INFINITY`
			`beta_max = INFINITY`
			`beta = 1.0`

			`# Binary search of precision for i-th conditional distribution`
			`for l in range(n_steps):`
			`# Compute current entropy and corresponding probabilities`
			`# computed just over the nearest neighbors or over all data`
			`# if we're not using neighbors`
			`sum_Pi = 0.0`
			`for j in range(n_neighbors):`
			`if j != i or using_neighbors:`
			`P[i, j] = math.exp(-sqdistances[i, j] * beta)`
			`sum_Pi += P[i, j]`

			`if sum_Pi == 0.0:`
			`sum_Pi = EPSILON_DBL`
			`sum_disti_Pi = 0.0`

			`for j in range(n_neighbors):`
			`P[i, j] /= sum_Pi`
			`sum_disti_Pi += sqdistances[i, j] * P[i, j]`

			`entropy = math.log(sum_Pi) + beta * sum_disti_Pi`
			`entropy_diff = entropy - desired_entropy`

			`if math.fabs(entropy_diff) <= PERPLEXITY_TOLERANCE:`
			`break`

			`if entropy_diff > 0.0:`
			`beta_min = beta`
			`if beta_max == INFINITY:`
			`beta *= 2.0`
			`else:`
			`beta = (beta + beta_max) / 2.0`
			`else:`
			`beta_max = beta`
			`if beta_min == -INFINITY:`
			`beta /= 2.0`
			`else:`
			`beta = (beta + beta_min) / 2.0`

			`beta_sum += beta`

			`if verbose and ((i + 1) % 1000 == 0 or i + 1 == n_samples):`
			`print("[t-SNE] Computed conditional probabilities for sample "`
			`"%d / %d" % (i + 1, n_samples))`

			`if verbose:`
			`print("[t-SNE] Mean sigma: %f"`
			`% np.mean(math.sqrt(n_samples / beta_sum)))`
			`return np.asarray(P)`