121 lines
3.8 KiB
Cython
121 lines
3.8 KiB
Cython
import numpy as np
|
|
|
|
from libc cimport math
|
|
from libc.math cimport INFINITY
|
|
|
|
from ..utils._typedefs cimport float32_t, float64_t
|
|
|
|
|
|
# Floor substituted for sum(P_i) when all kernel values underflow to zero,
# preventing a division by zero / log(0) in the entropy computation.
cdef float EPSILON_DBL = 1e-8

# Convergence threshold on |entropy - desired_entropy| for the per-sample
# binary search over the Gaussian precision beta.
cdef float PERPLEXITY_TOLERANCE = 1e-5
|
|
|
|
|
|
# TODO: have this function support float32 and float64 and preserve inputs' dtypes.
|
|
def _binary_search_perplexity(
        const float32_t[:, :] sqdistances,
        float desired_perplexity,
        int verbose):
    """Binary search for sigmas of conditional Gaussians.

    This approximation reduces the computational complexity from O(N^2) to
    O(uN).

    Parameters
    ----------
    sqdistances : ndarray of shape (n_samples, n_neighbors), dtype=np.float32
        Distances between training samples and their k nearest neighbors.
        When using the exact method, this is a square (n_samples, n_samples)
        distance matrix. The TSNE default metric is "euclidean" which is
        interpreted as squared euclidean distance.

    desired_perplexity : float
        Desired perplexity (2^entropy) of the conditional Gaussians.

    verbose : int
        Verbosity level.

    Returns
    -------
    P : ndarray of shape (n_samples, n_neighbors), dtype=np.float64
        Probabilities of conditional Gaussian distributions p_i|j. Square
        (n_samples, n_samples) only when ``sqdistances`` is square, i.e.
        when the exact (all-pairs) method is used.
    """
    # Maximum number of binary search steps
    cdef long n_steps = 100

    cdef long n_samples = sqdistances.shape[0]
    cdef long n_neighbors = sqdistances.shape[1]
    # Square input means all pairwise distances are present; otherwise each
    # row only covers the k nearest neighbors and the diagonal is absent.
    cdef int using_neighbors = n_neighbors < n_samples
    # Precisions of conditional Gaussian distributions
    cdef double beta
    cdef double beta_min
    cdef double beta_max
    cdef double beta_sum = 0.0

    # Use log scale: compare entropy (log perplexity) rather than perplexity.
    cdef double desired_entropy = math.log(desired_perplexity)
    cdef double entropy_diff

    cdef double entropy
    cdef double sum_Pi
    cdef double sum_disti_Pi
    cdef long i, j, l

    # This array is later used as a 32bit array. It has multiple intermediate
    # floating point additions that benefit from the extra precision
    cdef float64_t[:, :] P = np.zeros(
        (n_samples, n_neighbors), dtype=np.float64)

    for i in range(n_samples):
        beta_min = -INFINITY
        beta_max = INFINITY
        beta = 1.0

        # Binary search of precision for i-th conditional distribution
        for l in range(n_steps):
            # Compute current entropy and corresponding probabilities
            # computed just over the nearest neighbors or over all data
            # if we're not using neighbors
            sum_Pi = 0.0
            for j in range(n_neighbors):
                # Skip the self-pair only in the square (exact) case; in the
                # neighbors case the diagonal entry is not part of the row.
                if j != i or using_neighbors:
                    P[i, j] = math.exp(-sqdistances[i, j] * beta)
                    sum_Pi += P[i, j]

            if sum_Pi == 0.0:
                # All terms underflowed (beta too large); avoid division by
                # zero and log(0) below.
                sum_Pi = EPSILON_DBL
            sum_disti_Pi = 0.0

            for j in range(n_neighbors):
                P[i, j] /= sum_Pi
                sum_disti_Pi += sqdistances[i, j] * P[i, j]

            # Shannon entropy of the normalized row: H = log(sum_Pi)
            # + beta * E[d^2], using the standard Gibbs identity.
            entropy = math.log(sum_Pi) + beta * sum_disti_Pi
            entropy_diff = entropy - desired_entropy

            if math.fabs(entropy_diff) <= PERPLEXITY_TOLERANCE:
                break

            if entropy_diff > 0.0:
                # Entropy too high -> distribution too flat -> raise beta.
                beta_min = beta
                if beta_max == INFINITY:
                    # No upper bound yet: grow geometrically.
                    beta *= 2.0
                else:
                    beta = (beta + beta_max) / 2.0
            else:
                # Entropy too low -> distribution too peaked -> lower beta.
                beta_max = beta
                if beta_min == -INFINITY:
                    # No lower bound yet: shrink geometrically.
                    beta /= 2.0
                else:
                    beta = (beta + beta_min) / 2.0

        beta_sum += beta

        if verbose and ((i + 1) % 1000 == 0 or i + 1 == n_samples):
            print("[t-SNE] Computed conditional probabilities for sample "
                  "%d / %d" % (i + 1, n_samples))

    if verbose:
        # sqrt of the reciprocal mean precision, reported as a summary sigma.
        # (np.mean on this scalar was a no-op and has been removed.)
        print("[t-SNE] Mean sigma: %f"
              % math.sqrt(n_samples / beta_sum))
    return np.asarray(P)
|