import numpy as np

from libc cimport math
from libc.math cimport INFINITY

from ..utils._typedefs cimport float32_t, float64_t


cdef float EPSILON_DBL = 1e-8
cdef float PERPLEXITY_TOLERANCE = 1e-5


# TODO: have this function support float32 and float64 and preserve inputs'
# dtypes.
def _binary_search_perplexity(
        const float32_t[:, :] sqdistances,
        float desired_perplexity,
        int verbose):
    """Binary search for sigmas of conditional Gaussians.

    This approximation reduces the computational complexity from O(N^2) to
    O(uN), where u is the number of nearest neighbors considered per sample.

    Parameters
    ----------
    sqdistances : ndarray of shape (n_samples, n_neighbors), dtype=np.float32
        Distances between training samples and their k nearest neighbors.
        When using the exact method, this is a square (n_samples, n_samples)
        distance matrix. The TSNE default metric is "euclidean", which is
        interpreted as the squared euclidean distance.

    desired_perplexity : float
        Desired perplexity (2^entropy) of the conditional Gaussians.

    verbose : int
        Verbosity level.

    Returns
    -------
    P : ndarray of shape (n_samples, n_neighbors), dtype=np.float64
        Probabilities of conditional Gaussian distributions p_j|i.
    """
    # Maximum number of binary search steps
    cdef long n_steps = 100

    cdef long n_samples = sqdistances.shape[0]
    cdef long n_neighbors = sqdistances.shape[1]
    cdef int using_neighbors = n_neighbors < n_samples
    # Precisions of conditional Gaussian distributions
    cdef double beta
    cdef double beta_min
    cdef double beta_max
    cdef double beta_sum = 0.0

    # Use log scale
    cdef double desired_entropy = math.log(desired_perplexity)
    cdef double entropy_diff

    cdef double entropy
    cdef double sum_Pi
    cdef double sum_disti_Pi
    cdef long i, j, l

    # This array is later used as a 32-bit array. It accumulates many
    # intermediate floating point additions that benefit from the extra
    # 64-bit precision here.
    cdef float64_t[:, :] P = np.zeros(
        (n_samples, n_neighbors), dtype=np.float64)

    for i in range(n_samples):
        beta_min = -INFINITY
        beta_max = INFINITY
        beta = 1.0

        # Binary search of precision for i-th conditional distribution
        for l in range(n_steps):
            # Compute the current entropy and the corresponding
            # probabilities, summing only over the nearest neighbors, or
            # over all data when neighbors are not used. The diagonal term
            # (j == i) is excluded in the exact, square-matrix case.
            sum_Pi = 0.0
            for j in range(n_neighbors):
                if j != i or using_neighbors:
                    P[i, j] = math.exp(-sqdistances[i, j] * beta)
                    sum_Pi += P[i, j]

            if sum_Pi == 0.0:
                sum_Pi = EPSILON_DBL
            sum_disti_Pi = 0.0

            for j in range(n_neighbors):
                P[i, j] /= sum_Pi
                sum_disti_Pi += sqdistances[i, j] * P[i, j]

            # Shannon entropy (in nats) of the conditional distribution:
            # H = log(sum_Pi) + beta * sum_j d_ij * p_j|i
            entropy = math.log(sum_Pi) + beta * sum_disti_Pi
            entropy_diff = entropy - desired_entropy

            if math.fabs(entropy_diff) <= PERPLEXITY_TOLERANCE:
                break

            if entropy_diff > 0.0:
                # Entropy too high: the distribution is too flat, so
                # increase the precision beta (shrink sigma).
                beta_min = beta
                if beta_max == INFINITY:
                    beta *= 2.0
                else:
                    beta = (beta + beta_max) / 2.0
            else:
                # Entropy too low: decrease the precision beta.
                beta_max = beta
                if beta_min == -INFINITY:
                    beta /= 2.0
                else:
                    beta = (beta + beta_min) / 2.0

        beta_sum += beta

        if verbose and ((i + 1) % 1000 == 0 or i + 1 == n_samples):
            print("[t-SNE] Computed conditional probabilities for sample "
                  "%d / %d" % (i + 1, n_samples))

    if verbose:
        # sigma_i = 1 / sqrt(beta_i); report an aggregate estimate of the
        # mean sigma.
        print("[t-SNE] Mean sigma: %f"
              % np.mean(math.sqrt(n_samples / beta_sum)))

    return np.asarray(P)
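
# ---------------------------------------------------------------------------
# Usage sketch (not part of the library): a minimal way to call this private
# helper from Python, kept commented out so the module stays import-safe. It
# assumes scikit-learn has been built so that this module is importable as
# sklearn.manifold._utils; the module path and function name are internal
# details and may change between releases.
#
#   import numpy as np
#   from sklearn.manifold._utils import _binary_search_perplexity
#
#   rng = np.random.RandomState(0)
#   X = rng.randn(5, 3).astype(np.float32)
#   # Squared euclidean distances, as the default metric expects.
#   sqdist = ((X[:, None, :] - X[None, :, :]) ** 2).sum(-1).astype(np.float32)
#   P = _binary_search_perplexity(sqdist, 2.0, 0)
#   # Each row of P is a conditional distribution p_j|i and sums to ~1.
#   print(P.sum(axis=1))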