3RNN/Lib/site-packages/sklearn/manifold/_utils.pyx
2024-05-26 19:49:15 +02:00

121 lines
3.8 KiB
Cython

import numpy as np
from libc cimport math
from libc.math cimport INFINITY
from ..utils._typedefs cimport float32_t, float64_t
cdef float EPSILON_DBL = 1e-8
cdef float PERPLEXITY_TOLERANCE = 1e-5
# TODO: have this function support float32 and float64 and preserve inputs' dtypes.
def _binary_search_perplexity(
const float32_t[:, :] sqdistances,
float desired_perplexity,
int verbose):
"""Binary search for sigmas of conditional Gaussians.
This approximation reduces the computational complexity from O(N^2) to
O(uN).
Parameters
----------
sqdistances : ndarray of shape (n_samples, n_neighbors), dtype=np.float32
Distances between training samples and their k nearest neighbors.
When using the exact method, this is a square (n_samples, n_samples)
distance matrix. The TSNE default metric is "euclidean" which is
interpreted as squared euclidean distance.
desired_perplexity : float
Desired perplexity (2^entropy) of the conditional Gaussians.
verbose : int
Verbosity level.
Returns
-------
P : ndarray of shape (n_samples, n_samples), dtype=np.float64
Probabilities of conditional Gaussian distributions p_i|j.
"""
# Maximum number of binary search steps
cdef long n_steps = 100
cdef long n_samples = sqdistances.shape[0]
cdef long n_neighbors = sqdistances.shape[1]
cdef int using_neighbors = n_neighbors < n_samples
# Precisions of conditional Gaussian distributions
cdef double beta
cdef double beta_min
cdef double beta_max
cdef double beta_sum = 0.0
# Use log scale
cdef double desired_entropy = math.log(desired_perplexity)
cdef double entropy_diff
cdef double entropy
cdef double sum_Pi
cdef double sum_disti_Pi
cdef long i, j, l
# This array is later used as a 32bit array. It has multiple intermediate
# floating point additions that benefit from the extra precision
cdef float64_t[:, :] P = np.zeros(
(n_samples, n_neighbors), dtype=np.float64)
for i in range(n_samples):
beta_min = -INFINITY
beta_max = INFINITY
beta = 1.0
# Binary search of precision for i-th conditional distribution
for l in range(n_steps):
# Compute current entropy and corresponding probabilities
# computed just over the nearest neighbors or over all data
# if we're not using neighbors
sum_Pi = 0.0
for j in range(n_neighbors):
if j != i or using_neighbors:
P[i, j] = math.exp(-sqdistances[i, j] * beta)
sum_Pi += P[i, j]
if sum_Pi == 0.0:
sum_Pi = EPSILON_DBL
sum_disti_Pi = 0.0
for j in range(n_neighbors):
P[i, j] /= sum_Pi
sum_disti_Pi += sqdistances[i, j] * P[i, j]
entropy = math.log(sum_Pi) + beta * sum_disti_Pi
entropy_diff = entropy - desired_entropy
if math.fabs(entropy_diff) <= PERPLEXITY_TOLERANCE:
break
if entropy_diff > 0.0:
beta_min = beta
if beta_max == INFINITY:
beta *= 2.0
else:
beta = (beta + beta_max) / 2.0
else:
beta_max = beta
if beta_min == -INFINITY:
beta /= 2.0
else:
beta = (beta + beta_min) / 2.0
beta_sum += beta
if verbose and ((i + 1) % 1000 == 0 or i + 1 == n_samples):
print("[t-SNE] Computed conditional probabilities for sample "
"%d / %d" % (i + 1, n_samples))
if verbose:
print("[t-SNE] Mean sigma: %f"
% np.mean(math.sqrt(n_samples / beta_sum)))
return np.asarray(P)