"""Nearest Neighbor Classification"""
|
||
|
|
||
|
# Authors: Jake Vanderplas <vanderplas@astro.washington.edu>
|
||
|
# Fabian Pedregosa <fabian.pedregosa@inria.fr>
|
||
|
# Alexandre Gramfort <alexandre.gramfort@inria.fr>
|
||
|
# Sparseness support by Lars Buitinck
|
||
|
# Multi-output support by Arnaud Joly <a.joly@ulg.ac.be>
|
||
|
#
|
||
|
# License: BSD 3 clause (C) INRIA, University of Amsterdam
|
||
|
import warnings
|
||
|
from numbers import Integral
|
||
|
|
||
|
import numpy as np
|
||
|
|
||
|
from sklearn.neighbors._base import _check_precomputed
|
||
|
|
||
|
from ..base import ClassifierMixin, _fit_context
|
||
|
from ..metrics._pairwise_distances_reduction import (
|
||
|
ArgKminClassMode,
|
||
|
RadiusNeighborsClassMode,
|
||
|
)
|
||
|
from ..utils._param_validation import StrOptions
|
||
|
from ..utils.arrayfuncs import _all_with_any_reduction_axis_1
|
||
|
from ..utils.extmath import weighted_mode
|
||
|
from ..utils.fixes import _mode
|
||
|
from ..utils.validation import _is_arraylike, _num_samples, check_is_fitted
|
||
|
from ._base import KNeighborsMixin, NeighborsBase, RadiusNeighborsMixin, _get_weights
|
||
|
|
||
|
|
||
|
def _adjusted_metric(metric, metric_kwargs, p=None):
|
||
|
metric_kwargs = metric_kwargs or {}
|
||
|
if metric == "minkowski":
|
||
|
metric_kwargs["p"] = p
|
||
|
if p == 2:
|
||
|
metric = "euclidean"
|
||
|
return metric, metric_kwargs
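# Illustrative sketch of the helper above: with the default Minkowski
# settings, the metric resolves to the specialized Euclidean implementation
# while `p` is kept in the metric kwargs:
#
#     _adjusted_metric("minkowski", None, p=2)   # -> ("euclidean", {"p": 2})
#     _adjusted_metric("minkowski", None, p=1)   # -> ("minkowski", {"p": 1})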


class KNeighborsClassifier(KNeighborsMixin, ClassifierMixin, NeighborsBase):
    """Classifier implementing the k-nearest neighbors vote.

    Read more in the :ref:`User Guide <classification>`.

    Parameters
    ----------
    n_neighbors : int, default=5
        Number of neighbors to use by default for :meth:`kneighbors` queries.

    weights : {'uniform', 'distance'}, callable or None, default='uniform'
        Weight function used in prediction. Possible values:

        - 'uniform' : uniform weights. All points in each neighborhood
          are weighted equally.
        - 'distance' : weight points by the inverse of their distance.
          In this case, closer neighbors of a query point will have a
          greater influence than neighbors which are further away.
        - [callable] : a user-defined function which accepts an
          array of distances, and returns an array of the same shape
          containing the weights.

        Refer to the example entitled
        :ref:`sphx_glr_auto_examples_neighbors_plot_classification.py`
        showing the impact of the `weights` parameter on the decision
        boundary.

    algorithm : {'auto', 'ball_tree', 'kd_tree', 'brute'}, default='auto'
        Algorithm used to compute the nearest neighbors:

        - 'ball_tree' will use :class:`BallTree`
        - 'kd_tree' will use :class:`KDTree`
        - 'brute' will use a brute-force search.
        - 'auto' will attempt to decide the most appropriate algorithm
          based on the values passed to the :meth:`fit` method.

        Note: fitting on sparse input will override the setting of
        this parameter, using brute force.

    leaf_size : int, default=30
        Leaf size passed to BallTree or KDTree. This can affect the
        speed of the construction and query, as well as the memory
        required to store the tree. The optimal value depends on the
        nature of the problem.

    p : float, default=2
        Power parameter for the Minkowski metric. When p = 1, this is equivalent
        to using manhattan_distance (l1), and euclidean_distance (l2) for p = 2.
        For arbitrary p, minkowski_distance (l_p) is used. This parameter is expected
        to be positive.

    metric : str or callable, default='minkowski'
        Metric to use for distance computation. Default is "minkowski", which
        results in the standard Euclidean distance when p = 2. See the
        documentation of `scipy.spatial.distance
        <https://docs.scipy.org/doc/scipy/reference/spatial.distance.html>`_ and
        the metrics listed in
        :class:`~sklearn.metrics.pairwise.distance_metrics` for valid metric
        values.

        If metric is "precomputed", X is assumed to be a distance matrix and
        must be square during fit. X may be a :term:`sparse graph`, in which
        case only "nonzero" elements may be considered neighbors.

        If metric is a callable function, it takes two arrays representing 1D
        vectors as inputs and must return one value indicating the distance
        between those vectors. This works for Scipy's metrics, but is less
        efficient than passing the metric name as a string.

    metric_params : dict, default=None
        Additional keyword arguments for the metric function.

    n_jobs : int, default=None
        The number of parallel jobs to run for neighbors search.
        ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.
        ``-1`` means using all processors. See :term:`Glossary <n_jobs>`
        for more details.
        Doesn't affect the :meth:`fit` method.

    Attributes
    ----------
    classes_ : array of shape (n_classes,)
        Class labels known to the classifier.

    effective_metric_ : str or callable
        The distance metric used. It will be the same as the `metric` parameter
        or a synonym of it, e.g. 'euclidean' if the `metric` parameter is set to
        'minkowski' and the `p` parameter to 2.

    effective_metric_params_ : dict
        Additional keyword arguments for the metric function. For most metrics
        it will be the same as the `metric_params` parameter, but it may also
        contain the `p` parameter value if the `effective_metric_` attribute is
        set to 'minkowski'.

    n_features_in_ : int
        Number of features seen during :term:`fit`.

        .. versionadded:: 0.24

    feature_names_in_ : ndarray of shape (`n_features_in_`,)
        Names of features seen during :term:`fit`. Defined only when `X`
        has feature names that are all strings.

        .. versionadded:: 1.0

    n_samples_fit_ : int
        Number of samples in the fitted data.

    outputs_2d_ : bool
        False when `y`'s shape is (n_samples,) or (n_samples, 1) during fit,
        otherwise True.

    See Also
    --------
    RadiusNeighborsClassifier : Classifier based on neighbors within a fixed radius.
    KNeighborsRegressor : Regression based on k-nearest neighbors.
    RadiusNeighborsRegressor : Regression based on neighbors within a fixed radius.
    NearestNeighbors : Unsupervised learner for implementing neighbor searches.

    Notes
    -----
    See :ref:`Nearest Neighbors <neighbors>` in the online documentation
    for a discussion of the choice of ``algorithm`` and ``leaf_size``.

    .. warning::

       Regarding the Nearest Neighbors algorithms, if it is found that two
       neighbors, neighbor `k+1` and `k`, have identical distances
       but different labels, the results will depend on the ordering of the
       training data.

    https://en.wikipedia.org/wiki/K-nearest_neighbor_algorithm

    Examples
    --------
    >>> X = [[0], [1], [2], [3]]
    >>> y = [0, 0, 1, 1]
    >>> from sklearn.neighbors import KNeighborsClassifier
    >>> neigh = KNeighborsClassifier(n_neighbors=3)
    >>> neigh.fit(X, y)
    KNeighborsClassifier(...)
    >>> print(neigh.predict([[1.1]]))
    [0]
    >>> print(neigh.predict_proba([[0.9]]))
    [[0.666... 0.333...]]
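
    As an illustrative sketch with the same toy data, distance weighting can
    be used in the same way; the closest neighbor then dominates the vote:

    >>> neigh_w = KNeighborsClassifier(n_neighbors=3, weights='distance')
    >>> neigh_w.fit(X, y)
    KNeighborsClassifier(...)
    >>> print(neigh_w.predict([[1.1]]))
    [0]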
    """

    _parameter_constraints: dict = {**NeighborsBase._parameter_constraints}
    _parameter_constraints.pop("radius")
    _parameter_constraints.update(
        {"weights": [StrOptions({"uniform", "distance"}), callable, None]}
    )

    def __init__(
        self,
        n_neighbors=5,
        *,
        weights="uniform",
        algorithm="auto",
        leaf_size=30,
        p=2,
        metric="minkowski",
        metric_params=None,
        n_jobs=None,
    ):
        super().__init__(
            n_neighbors=n_neighbors,
            algorithm=algorithm,
            leaf_size=leaf_size,
            metric=metric,
            p=p,
            metric_params=metric_params,
            n_jobs=n_jobs,
        )
        self.weights = weights

    @_fit_context(
        # KNeighborsClassifier.metric is not validated yet
        prefer_skip_nested_validation=False
    )
    def fit(self, X, y):
        """Fit the k-nearest neighbors classifier from the training dataset.

        Parameters
        ----------
        X : {array-like, sparse matrix} of shape (n_samples, n_features) or \
                (n_samples, n_samples) if metric='precomputed'
            Training data.

        y : {array-like, sparse matrix} of shape (n_samples,) or \
                (n_samples, n_outputs)
            Target values.

        Returns
        -------
        self : KNeighborsClassifier
            The fitted k-nearest neighbors classifier.
        """
        return self._fit(X, y)

    def predict(self, X):
        """Predict the class labels for the provided data.

        Parameters
        ----------
        X : {array-like, sparse matrix} of shape (n_queries, n_features), \
                or (n_queries, n_indexed) if metric == 'precomputed'
            Test samples.

        Returns
        -------
        y : ndarray of shape (n_queries,) or (n_queries, n_outputs)
            Class labels for each data sample.
        """
        check_is_fitted(self, "_fit_method")
        if self.weights == "uniform":
            if self._fit_method == "brute" and ArgKminClassMode.is_usable_for(
                X, self._fit_X, self.metric
            ):
                probabilities = self.predict_proba(X)
                if self.outputs_2d_:
                    return np.stack(
                        [
                            self.classes_[idx][np.argmax(probas, axis=1)]
                            for idx, probas in enumerate(probabilities)
                        ],
                        axis=1,
                    )
                return self.classes_[np.argmax(probabilities, axis=1)]
            # In this case, we do not need the distances to perform
            # the weighting, so we do not compute them.
            neigh_ind = self.kneighbors(X, return_distance=False)
            neigh_dist = None
        else:
            neigh_dist, neigh_ind = self.kneighbors(X)

        classes_ = self.classes_
        _y = self._y
        if not self.outputs_2d_:
            _y = self._y.reshape((-1, 1))
            classes_ = [self.classes_]

        n_outputs = len(classes_)
        n_queries = _num_samples(X)
        weights = _get_weights(neigh_dist, self.weights)
        if weights is not None and _all_with_any_reduction_axis_1(weights, value=0):
            raise ValueError(
                "All neighbors of some sample are getting zero weights. "
                "Please modify 'weights' to avoid this case if you are "
                "using a user-defined function."
            )

        y_pred = np.empty((n_queries, n_outputs), dtype=classes_[0].dtype)
        for k, classes_k in enumerate(classes_):
            if weights is None:
                mode, _ = _mode(_y[neigh_ind, k], axis=1)
            else:
                mode, _ = weighted_mode(_y[neigh_ind, k], weights, axis=1)

            mode = np.asarray(mode.ravel(), dtype=np.intp)
            y_pred[:, k] = classes_k.take(mode)

        if not self.outputs_2d_:
            y_pred = y_pred.ravel()

        return y_pred
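
    # Illustrative sketch of the weighted vote above (hypothetical values):
    # with encoded neighbor labels _y[neigh_ind, k] == [[0, 1, 1]] and
    # weights == [[10., 1., 1.]], weighted_mode(..., axis=1) picks label 0
    # (total weight 10 vs. 2), i.e. the closest neighbor dominates under
    # 'distance' weighting, whereas _mode with uniform weights would pick 1.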

    def predict_proba(self, X):
        """Return probability estimates for the test data X.

        Parameters
        ----------
        X : {array-like, sparse matrix} of shape (n_queries, n_features), \
                or (n_queries, n_indexed) if metric == 'precomputed'
            Test samples.

        Returns
        -------
        p : ndarray of shape (n_queries, n_classes), or a list of n_outputs \
                of such arrays if n_outputs > 1.
            The class probabilities of the input samples. Classes are ordered
            by lexicographic order.
        """
        check_is_fitted(self, "_fit_method")
        if self.weights == "uniform":
            # TODO: systematize this mapping of metric for
            # PairwiseDistancesReductions.
            metric, metric_kwargs = _adjusted_metric(
                metric=self.metric, metric_kwargs=self.metric_params, p=self.p
            )
            if (
                self._fit_method == "brute"
                and ArgKminClassMode.is_usable_for(X, self._fit_X, metric)
                # TODO: Implement efficient multi-output solution
                and not self.outputs_2d_
            ):
                if self.metric == "precomputed":
                    X = _check_precomputed(X)
                else:
                    X = self._validate_data(
                        X, accept_sparse="csr", reset=False, order="C"
                    )

                probabilities = ArgKminClassMode.compute(
                    X,
                    self._fit_X,
                    k=self.n_neighbors,
                    weights=self.weights,
                    Y_labels=self._y,
                    unique_Y_labels=self.classes_,
                    metric=metric,
                    metric_kwargs=metric_kwargs,
                    # `strategy="parallel_on_X"` has in practice been shown
                    # to be more efficient than `strategy="parallel_on_Y"`
                    # on many combinations of datasets.
                    # Hence, we choose to enforce it here.
                    # For more information, see:
                    # https://github.com/scikit-learn/scikit-learn/pull/24076#issuecomment-1445258342 # noqa
                    # TODO: adapt the heuristic for `strategy="auto"` for
                    # `ArgKminClassMode` and use `strategy="auto"`.
                    strategy="parallel_on_X",
                )
                return probabilities

            # In this case, we do not need the distances to perform
            # the weighting, so we do not compute them.
            neigh_ind = self.kneighbors(X, return_distance=False)
            neigh_dist = None
        else:
            neigh_dist, neigh_ind = self.kneighbors(X)

        classes_ = self.classes_
        _y = self._y
        if not self.outputs_2d_:
            _y = self._y.reshape((-1, 1))
            classes_ = [self.classes_]

        n_queries = _num_samples(X)

        weights = _get_weights(neigh_dist, self.weights)
        if weights is None:
            weights = np.ones_like(neigh_ind)
        elif _all_with_any_reduction_axis_1(weights, value=0):
            raise ValueError(
                "All neighbors of some sample are getting zero weights. "
                "Please modify 'weights' to avoid this case if you are "
                "using a user-defined function."
            )

        all_rows = np.arange(n_queries)
        probabilities = []
        for k, classes_k in enumerate(classes_):
            pred_labels = _y[:, k][neigh_ind]
            proba_k = np.zeros((n_queries, classes_k.size))

            # a simple ':' index doesn't work right
            for i, idx in enumerate(pred_labels.T):  # loop is O(n_neighbors)
                proba_k[all_rows, idx] += weights[:, i]

            # normalize 'votes' into real [0,1] probabilities
            normalizer = proba_k.sum(axis=1)[:, np.newaxis]
            proba_k /= normalizer

            probabilities.append(proba_k)

        if not self.outputs_2d_:
            probabilities = probabilities[0]

        return probabilities

    def _more_tags(self):
        return {"multilabel": True}


class RadiusNeighborsClassifier(RadiusNeighborsMixin, ClassifierMixin, NeighborsBase):
    """Classifier implementing a vote among neighbors within a given radius.

    Read more in the :ref:`User Guide <classification>`.

    Parameters
    ----------
    radius : float, default=1.0
        Range of parameter space to use by default for :meth:`radius_neighbors`
        queries.

    weights : {'uniform', 'distance'}, callable or None, default='uniform'
        Weight function used in prediction. Possible values:

        - 'uniform' : uniform weights. All points in each neighborhood
          are weighted equally.
        - 'distance' : weight points by the inverse of their distance.
          In this case, closer neighbors of a query point will have a
          greater influence than neighbors which are further away.
        - [callable] : a user-defined function which accepts an
          array of distances, and returns an array of the same shape
          containing the weights.

        Uniform weights are used by default.

    algorithm : {'auto', 'ball_tree', 'kd_tree', 'brute'}, default='auto'
        Algorithm used to compute the nearest neighbors:

        - 'ball_tree' will use :class:`BallTree`
        - 'kd_tree' will use :class:`KDTree`
        - 'brute' will use a brute-force search.
        - 'auto' will attempt to decide the most appropriate algorithm
          based on the values passed to the :meth:`fit` method.

        Note: fitting on sparse input will override the setting of
        this parameter, using brute force.

    leaf_size : int, default=30
        Leaf size passed to BallTree or KDTree. This can affect the
        speed of the construction and query, as well as the memory
        required to store the tree. The optimal value depends on the
        nature of the problem.

    p : float, default=2
        Power parameter for the Minkowski metric. When p = 1, this is
        equivalent to using manhattan_distance (l1), and euclidean_distance
        (l2) for p = 2. For arbitrary p, minkowski_distance (l_p) is used.
        This parameter is expected to be positive.

    metric : str or callable, default='minkowski'
        Metric to use for distance computation. Default is "minkowski", which
        results in the standard Euclidean distance when p = 2. See the
        documentation of `scipy.spatial.distance
        <https://docs.scipy.org/doc/scipy/reference/spatial.distance.html>`_ and
        the metrics listed in
        :class:`~sklearn.metrics.pairwise.distance_metrics` for valid metric
        values.

        If metric is "precomputed", X is assumed to be a distance matrix and
        must be square during fit. X may be a :term:`sparse graph`, in which
        case only "nonzero" elements may be considered neighbors.

        If metric is a callable function, it takes two arrays representing 1D
        vectors as inputs and must return one value indicating the distance
        between those vectors. This works for Scipy's metrics, but is less
        efficient than passing the metric name as a string.

    outlier_label : {manual label, 'most_frequent'}, default=None
        Label for outlier samples (samples with no neighbors within the given
        radius).

        - manual label: str or int label (should be the same type as y)
          or list of manual labels if multi-output is used.
        - 'most_frequent' : assign the most frequent label of y to outliers.
        - None : when any outlier is detected, ValueError will be raised.

        The outlier label should be selected from among the unique 'Y' labels.
        If it is specified with a different value, a warning will be raised and
        all class probabilities of outliers will be assigned to be 0.

    metric_params : dict, default=None
        Additional keyword arguments for the metric function.

    n_jobs : int, default=None
        The number of parallel jobs to run for neighbors search.
        ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.
        ``-1`` means using all processors. See :term:`Glossary <n_jobs>`
        for more details.

    Attributes
    ----------
    classes_ : ndarray of shape (n_classes,)
        Class labels known to the classifier.

    effective_metric_ : str or callable
        The distance metric used. It will be the same as the `metric` parameter
        or a synonym of it, e.g. 'euclidean' if the `metric` parameter is set to
        'minkowski' and the `p` parameter to 2.

    effective_metric_params_ : dict
        Additional keyword arguments for the metric function. For most metrics
        it will be the same as the `metric_params` parameter, but it may also
        contain the `p` parameter value if the `effective_metric_` attribute is
        set to 'minkowski'.

    n_features_in_ : int
        Number of features seen during :term:`fit`.

        .. versionadded:: 0.24

    feature_names_in_ : ndarray of shape (`n_features_in_`,)
        Names of features seen during :term:`fit`. Defined only when `X`
        has feature names that are all strings.

        .. versionadded:: 1.0

    n_samples_fit_ : int
        Number of samples in the fitted data.

    outlier_label_ : int or array-like of shape (n_class,)
        Label which is given for outlier samples (samples with no neighbors
        within the given radius).

    outputs_2d_ : bool
        False when `y`'s shape is (n_samples,) or (n_samples, 1) during fit,
        otherwise True.

    See Also
    --------
    KNeighborsClassifier : Classifier implementing the k-nearest neighbors
        vote.
    RadiusNeighborsRegressor : Regression based on neighbors within a
        fixed radius.
    KNeighborsRegressor : Regression based on k-nearest neighbors.
    NearestNeighbors : Unsupervised learner for implementing neighbor
        searches.

    Notes
    -----
    See :ref:`Nearest Neighbors <neighbors>` in the online documentation
    for a discussion of the choice of ``algorithm`` and ``leaf_size``.

    https://en.wikipedia.org/wiki/K-nearest_neighbor_algorithm

    Examples
    --------
    >>> X = [[0], [1], [2], [3]]
    >>> y = [0, 0, 1, 1]
    >>> from sklearn.neighbors import RadiusNeighborsClassifier
    >>> neigh = RadiusNeighborsClassifier(radius=1.0)
    >>> neigh.fit(X, y)
    RadiusNeighborsClassifier(...)
    >>> print(neigh.predict([[1.5]]))
    [0]
    >>> print(neigh.predict_proba([[1.0]]))
    [[0.66666667 0.33333333]]
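
    As an illustrative sketch with the same toy data, queries with no
    neighbor within the radius can be assigned an explicit outlier label
    instead of raising an error:

    >>> neigh_out = RadiusNeighborsClassifier(radius=1.0, outlier_label='most_frequent')
    >>> neigh_out.fit(X, y)
    RadiusNeighborsClassifier(...)
    >>> print(neigh_out.predict([[10.0]]))
    [0]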
    """

    _parameter_constraints: dict = {
        **NeighborsBase._parameter_constraints,
        "weights": [StrOptions({"uniform", "distance"}), callable, None],
        "outlier_label": [Integral, str, "array-like", None],
    }
    _parameter_constraints.pop("n_neighbors")

    def __init__(
        self,
        radius=1.0,
        *,
        weights="uniform",
        algorithm="auto",
        leaf_size=30,
        p=2,
        metric="minkowski",
        outlier_label=None,
        metric_params=None,
        n_jobs=None,
    ):
        super().__init__(
            radius=radius,
            algorithm=algorithm,
            leaf_size=leaf_size,
            metric=metric,
            p=p,
            metric_params=metric_params,
            n_jobs=n_jobs,
        )
        self.weights = weights
        self.outlier_label = outlier_label

    @_fit_context(
        # RadiusNeighborsClassifier.metric is not validated yet
        prefer_skip_nested_validation=False
    )
    def fit(self, X, y):
        """Fit the radius neighbors classifier from the training dataset.

        Parameters
        ----------
        X : {array-like, sparse matrix} of shape (n_samples, n_features) or \
                (n_samples, n_samples) if metric='precomputed'
            Training data.

        y : {array-like, sparse matrix} of shape (n_samples,) or \
                (n_samples, n_outputs)
            Target values.

        Returns
        -------
        self : RadiusNeighborsClassifier
            The fitted radius neighbors classifier.
        """
        self._fit(X, y)

        classes_ = self.classes_
        _y = self._y
        if not self.outputs_2d_:
            _y = self._y.reshape((-1, 1))
            classes_ = [self.classes_]

        if self.outlier_label is None:
            outlier_label_ = None

        elif self.outlier_label == "most_frequent":
            outlier_label_ = []
            # iterate over multi-output, get the most frequent label for each
            # output.
            for k, classes_k in enumerate(classes_):
                label_count = np.bincount(_y[:, k])
                outlier_label_.append(classes_k[label_count.argmax()])

        else:
            if _is_arraylike(self.outlier_label) and not isinstance(
                self.outlier_label, str
            ):
                if len(self.outlier_label) != len(classes_):
                    raise ValueError(
                        "The length of outlier_label: {} is "
                        "inconsistent with the output "
                        "length: {}".format(self.outlier_label, len(classes_))
                    )
                outlier_label_ = self.outlier_label
            else:
                outlier_label_ = [self.outlier_label] * len(classes_)

            for classes, label in zip(classes_, outlier_label_):
                if _is_arraylike(label) and not isinstance(label, str):
                    # ensure the outlier label for each output is a scalar.
                    raise TypeError(
                        "The outlier_label of classes {} is "
                        "supposed to be a scalar, got "
                        "{}.".format(classes, label)
                    )
                if np.append(classes, label).dtype != classes.dtype:
                    # ensure the dtype of outlier label is consistent with y.
                    raise TypeError(
                        "The dtype of outlier_label {} is "
                        "inconsistent with classes {} in "
                        "y.".format(label, classes)
                    )

        self.outlier_label_ = outlier_label_

        return self

    def predict(self, X):
        """Predict the class labels for the provided data.

        Parameters
        ----------
        X : {array-like, sparse matrix} of shape (n_queries, n_features), \
                or (n_queries, n_indexed) if metric == 'precomputed'
            Test samples.

        Returns
        -------
        y : ndarray of shape (n_queries,) or (n_queries, n_outputs)
            Class labels for each data sample.
        """

        probs = self.predict_proba(X)
        classes_ = self.classes_

        if not self.outputs_2d_:
            probs = [probs]
            classes_ = [self.classes_]

        n_outputs = len(classes_)
        n_queries = probs[0].shape[0]
        y_pred = np.empty((n_queries, n_outputs), dtype=classes_[0].dtype)

        for k, prob in enumerate(probs):
            # iterate over multi-output, assign labels based on probabilities
            # of each output.
            max_prob_index = prob.argmax(axis=1)
            y_pred[:, k] = classes_[k].take(max_prob_index)

            outlier_zero_probs = (prob == 0).all(axis=1)
            if outlier_zero_probs.any():
                zero_prob_index = np.flatnonzero(outlier_zero_probs)
                y_pred[zero_prob_index, k] = self.outlier_label_[k]

        if not self.outputs_2d_:
            y_pred = y_pred.ravel()

        return y_pred

    def predict_proba(self, X):
        """Return probability estimates for the test data X.

        Parameters
        ----------
        X : {array-like, sparse matrix} of shape (n_queries, n_features), \
                or (n_queries, n_indexed) if metric == 'precomputed'
            Test samples.

        Returns
        -------
        p : ndarray of shape (n_queries, n_classes), or a list of \
                n_outputs of such arrays if n_outputs > 1.
            The class probabilities of the input samples. Classes are ordered
            by lexicographic order.
        """
        check_is_fitted(self, "_fit_method")
        n_queries = _num_samples(X)

        metric, metric_kwargs = _adjusted_metric(
            metric=self.metric, metric_kwargs=self.metric_params, p=self.p
        )

        if (
            self.weights == "uniform"
            and self._fit_method == "brute"
            and not self.outputs_2d_
            and RadiusNeighborsClassMode.is_usable_for(X, self._fit_X, metric)
        ):
            probabilities = RadiusNeighborsClassMode.compute(
                X=X,
                Y=self._fit_X,
                radius=self.radius,
                weights=self.weights,
                Y_labels=self._y,
                unique_Y_labels=self.classes_,
                outlier_label=self.outlier_label,
                metric=metric,
                metric_kwargs=metric_kwargs,
                strategy="parallel_on_X",
                # `strategy="parallel_on_X"` has in practice been shown
                # to be more efficient than `strategy="parallel_on_Y"`
                # on many combinations of datasets.
                # Hence, we choose to enforce it here.
                # For more information, see:
                # https://github.com/scikit-learn/scikit-learn/pull/26828/files#r1282398471 # noqa
            )
            return probabilities

        neigh_dist, neigh_ind = self.radius_neighbors(X)
        outlier_mask = np.zeros(n_queries, dtype=bool)
        outlier_mask[:] = [len(nind) == 0 for nind in neigh_ind]
        outliers = np.flatnonzero(outlier_mask)
        inliers = np.flatnonzero(~outlier_mask)

        classes_ = self.classes_
        _y = self._y
        if not self.outputs_2d_:
            _y = self._y.reshape((-1, 1))
            classes_ = [self.classes_]

        if self.outlier_label_ is None and outliers.size > 0:
            raise ValueError(
                "No neighbors found for test samples %r, "
                "you can try using a larger radius, "
                "giving a label for outliers, "
                "or removing them from your dataset." % outliers
            )

        weights = _get_weights(neigh_dist, self.weights)
        if weights is not None:
            weights = weights[inliers]

        probabilities = []
        # iterate over multi-output, measure probabilities of the k-th output.
        for k, classes_k in enumerate(classes_):
            pred_labels = np.zeros(len(neigh_ind), dtype=object)
            pred_labels[:] = [_y[ind, k] for ind in neigh_ind]

            proba_k = np.zeros((n_queries, classes_k.size))
            proba_inl = np.zeros((len(inliers), classes_k.size))

            # samples may have different numbers of neighbors within the same radius
            if weights is None:
                for i, idx in enumerate(pred_labels[inliers]):
                    proba_inl[i, :] = np.bincount(idx, minlength=classes_k.size)
            else:
                for i, idx in enumerate(pred_labels[inliers]):
                    proba_inl[i, :] = np.bincount(
                        idx, weights[i], minlength=classes_k.size
                    )
            proba_k[inliers, :] = proba_inl

            if outliers.size > 0:
                _outlier_label = self.outlier_label_[k]
                label_index = np.flatnonzero(classes_k == _outlier_label)
                if label_index.size == 1:
                    proba_k[outliers, label_index[0]] = 1.0
                else:
                    warnings.warn(
                        "Outlier label {} is not in training "
                        "classes. All class probabilities of "
                        "outliers will be assigned 0."
                        "".format(self.outlier_label_[k])
                    )

            # normalize 'votes' into real [0,1] probabilities
            normalizer = proba_k.sum(axis=1)[:, np.newaxis]
            normalizer[normalizer == 0.0] = 1.0
            proba_k /= normalizer

            probabilities.append(proba_k)

        if not self.outputs_2d_:
            probabilities = probabilities[0]

        return probabilities
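
    # Illustrative sketch of the per-query vote above (hypothetical values):
    # for an inlier query with neighbor labels idx == [0, 0, 1] and
    # classes_k.size == 2, np.bincount(idx, minlength=2) yields the vote
    # counts [2, 1], which the normalization step turns into the
    # probabilities [2/3, 1/3].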

    def _more_tags(self):
        return {"multilabel": True}