""" Kernel Density Estimation ------------------------- """ # Author: Jake Vanderplas import itertools from numbers import Integral, Real import numpy as np from scipy.special import gammainc from ..base import BaseEstimator from ..neighbors._base import VALID_METRICS from ..utils import check_random_state from ..utils.validation import _check_sample_weight, check_is_fitted from ..utils._param_validation import Interval, StrOptions from ..utils.extmath import row_norms from ._ball_tree import BallTree, DTYPE from ._kd_tree import KDTree VALID_KERNELS = [ "gaussian", "tophat", "epanechnikov", "exponential", "linear", "cosine", ] TREE_DICT = {"ball_tree": BallTree, "kd_tree": KDTree} # TODO: implement a brute force version for testing purposes # TODO: create a density estimation base class? class KernelDensity(BaseEstimator): """Kernel Density Estimation. Read more in the :ref:`User Guide `. Parameters ---------- bandwidth : float or {"scott", "silverman"}, default=1.0 The bandwidth of the kernel. If bandwidth is a float, it defines the bandwidth of the kernel. If bandwidth is a string, one of the estimation methods is implemented. algorithm : {'kd_tree', 'ball_tree', 'auto'}, default='auto' The tree algorithm to use. kernel : {'gaussian', 'tophat', 'epanechnikov', 'exponential', 'linear', \ 'cosine'}, default='gaussian' The kernel to use. metric : str, default='euclidean' Metric to use for distance computation. See the documentation of `scipy.spatial.distance `_ and the metrics listed in :class:`~sklearn.metrics.pairwise.distance_metrics` for valid metric values. Not all metrics are valid with all algorithms: refer to the documentation of :class:`BallTree` and :class:`KDTree`. Note that the normalization of the density output is correct only for the Euclidean distance metric. atol : float, default=0 The desired absolute tolerance of the result. A larger tolerance will generally lead to faster execution. rtol : float, default=0 The desired relative tolerance of the result. A larger tolerance will generally lead to faster execution. breadth_first : bool, default=True If true (default), use a breadth-first approach to the problem. Otherwise use a depth-first approach. leaf_size : int, default=40 Specify the leaf size of the underlying tree. See :class:`BallTree` or :class:`KDTree` for details. metric_params : dict, default=None Additional parameters to be passed to the tree for use with the metric. For more information, see the documentation of :class:`BallTree` or :class:`KDTree`. Attributes ---------- n_features_in_ : int Number of features seen during :term:`fit`. .. versionadded:: 0.24 tree_ : ``BinaryTree`` instance The tree algorithm for fast generalized N-point problems. feature_names_in_ : ndarray of shape (`n_features_in_`,) Names of features seen during :term:`fit`. Defined only when `X` has feature names that are all strings. bandwidth_ : float Value of the bandwidth, given directly by the bandwidth parameter or estimated using the 'scott' or 'silverman' method. .. versionadded:: 1.0 See Also -------- sklearn.neighbors.KDTree : K-dimensional tree for fast generalized N-point problems. sklearn.neighbors.BallTree : Ball tree for fast generalized N-point problems. Examples -------- Compute a gaussian kernel density estimate with a fixed bandwidth. >>> from sklearn.neighbors import KernelDensity >>> import numpy as np >>> rng = np.random.RandomState(42) >>> X = rng.random_sample((100, 3)) >>> kde = KernelDensity(kernel='gaussian', bandwidth=0.5).fit(X) >>> log_density = kde.score_samples(X[:3]) >>> log_density array([-1.52955942, -1.51462041, -1.60244657]) """ _parameter_constraints: dict = { "bandwidth": [ Interval(Real, 0, None, closed="neither"), StrOptions({"scott", "silverman"}), ], "algorithm": [StrOptions(set(TREE_DICT.keys()) | {"auto"})], "kernel": [StrOptions(set(VALID_KERNELS))], "metric": [ StrOptions( set(itertools.chain(*[VALID_METRICS[alg] for alg in TREE_DICT.keys()])) ) ], "atol": [Interval(Real, 0, None, closed="left")], "rtol": [Interval(Real, 0, None, closed="left")], "breadth_first": ["boolean"], "leaf_size": [Interval(Integral, 1, None, closed="left")], "metric_params": [None, dict], } def __init__( self, *, bandwidth=1.0, algorithm="auto", kernel="gaussian", metric="euclidean", atol=0, rtol=0, breadth_first=True, leaf_size=40, metric_params=None, ): self.algorithm = algorithm self.bandwidth = bandwidth self.kernel = kernel self.metric = metric self.atol = atol self.rtol = rtol self.breadth_first = breadth_first self.leaf_size = leaf_size self.metric_params = metric_params def _choose_algorithm(self, algorithm, metric): # given the algorithm string + metric string, choose the optimal # algorithm to compute the result. if algorithm == "auto": # use KD Tree if possible if metric in KDTree.valid_metrics: return "kd_tree" elif metric in BallTree.valid_metrics: return "ball_tree" else: # kd_tree or ball_tree if metric not in TREE_DICT[algorithm].valid_metrics: raise ValueError( "invalid metric for {0}: '{1}'".format(TREE_DICT[algorithm], metric) ) return algorithm def fit(self, X, y=None, sample_weight=None): """Fit the Kernel Density model on the data. Parameters ---------- X : array-like of shape (n_samples, n_features) List of n_features-dimensional data points. Each row corresponds to a single data point. y : None Ignored. This parameter exists only for compatibility with :class:`~sklearn.pipeline.Pipeline`. sample_weight : array-like of shape (n_samples,), default=None List of sample weights attached to the data X. .. versionadded:: 0.20 Returns ------- self : object Returns the instance itself. """ self._validate_params() algorithm = self._choose_algorithm(self.algorithm, self.metric) if isinstance(self.bandwidth, str): if self.bandwidth == "scott": self.bandwidth_ = X.shape[0] ** (-1 / (X.shape[1] + 4)) elif self.bandwidth == "silverman": self.bandwidth_ = (X.shape[0] * (X.shape[1] + 2) / 4) ** ( -1 / (X.shape[1] + 4) ) else: self.bandwidth_ = self.bandwidth X = self._validate_data(X, order="C", dtype=DTYPE) if sample_weight is not None: sample_weight = _check_sample_weight( sample_weight, X, DTYPE, only_non_negative=True ) kwargs = self.metric_params if kwargs is None: kwargs = {} self.tree_ = TREE_DICT[algorithm]( X, metric=self.metric, leaf_size=self.leaf_size, sample_weight=sample_weight, **kwargs, ) return self def score_samples(self, X): """Compute the log-likelihood of each sample under the model. Parameters ---------- X : array-like of shape (n_samples, n_features) An array of points to query. Last dimension should match dimension of training data (n_features). Returns ------- density : ndarray of shape (n_samples,) Log-likelihood of each sample in `X`. These are normalized to be probability densities, so values will be low for high-dimensional data. """ check_is_fitted(self) # The returned density is normalized to the number of points. # For it to be a probability, we must scale it. For this reason # we'll also scale atol. X = self._validate_data(X, order="C", dtype=DTYPE, reset=False) if self.tree_.sample_weight is None: N = self.tree_.data.shape[0] else: N = self.tree_.sum_weight atol_N = self.atol * N log_density = self.tree_.kernel_density( X, h=self.bandwidth_, kernel=self.kernel, atol=atol_N, rtol=self.rtol, breadth_first=self.breadth_first, return_log=True, ) log_density -= np.log(N) return log_density def score(self, X, y=None): """Compute the total log-likelihood under the model. Parameters ---------- X : array-like of shape (n_samples, n_features) List of n_features-dimensional data points. Each row corresponds to a single data point. y : None Ignored. This parameter exists only for compatibility with :class:`~sklearn.pipeline.Pipeline`. Returns ------- logprob : float Total log-likelihood of the data in X. This is normalized to be a probability density, so the value will be low for high-dimensional data. """ return np.sum(self.score_samples(X)) def sample(self, n_samples=1, random_state=None): """Generate random samples from the model. Currently, this is implemented only for gaussian and tophat kernels. Parameters ---------- n_samples : int, default=1 Number of samples to generate. random_state : int, RandomState instance or None, default=None Determines random number generation used to generate random samples. Pass an int for reproducible results across multiple function calls. See :term:`Glossary `. Returns ------- X : array-like of shape (n_samples, n_features) List of samples. """ check_is_fitted(self) # TODO: implement sampling for other valid kernel shapes if self.kernel not in ["gaussian", "tophat"]: raise NotImplementedError() data = np.asarray(self.tree_.data) rng = check_random_state(random_state) u = rng.uniform(0, 1, size=n_samples) if self.tree_.sample_weight is None: i = (u * data.shape[0]).astype(np.int64) else: cumsum_weight = np.cumsum(np.asarray(self.tree_.sample_weight)) sum_weight = cumsum_weight[-1] i = np.searchsorted(cumsum_weight, u * sum_weight) if self.kernel == "gaussian": return np.atleast_2d(rng.normal(data[i], self.bandwidth_)) elif self.kernel == "tophat": # we first draw points from a d-dimensional normal distribution, # then use an incomplete gamma function to map them to a uniform # d-dimensional tophat distribution. dim = data.shape[1] X = rng.normal(size=(n_samples, dim)) s_sq = row_norms(X, squared=True) correction = ( gammainc(0.5 * dim, 0.5 * s_sq) ** (1.0 / dim) * self.bandwidth_ / np.sqrt(s_sq) ) return data[i] + X * correction[:, np.newaxis] def _more_tags(self): return { "_xfail_checks": { "check_sample_weights_invariance": ( "sample_weight must have positive values" ), } }