2556 lines
85 KiB
Python
2556 lines
85 KiB
Python
|
# Authors: Alexandre Gramfort <alexandre.gramfort@inria.fr>
|
|||
|
# Mathieu Blondel <mathieu@mblondel.org>
|
|||
|
# Robert Layton <robertlayton@gmail.com>
|
|||
|
# Andreas Mueller <amueller@ais.uni-bonn.de>
|
|||
|
# Philippe Gervais <philippe.gervais@inria.fr>
|
|||
|
# Lars Buitinck
|
|||
|
# Joel Nothman <joel.nothman@gmail.com>
|
|||
|
# License: BSD 3 clause
|
|||
|
|
|||
|
import itertools
|
|||
|
import warnings
|
|||
|
from functools import partial
|
|||
|
from numbers import Integral, Real
|
|||
|
|
|||
|
import numpy as np
|
|||
|
from joblib import effective_n_jobs
|
|||
|
from scipy.sparse import csr_matrix, issparse
|
|||
|
from scipy.spatial import distance
|
|||
|
|
|||
|
from .. import config_context
|
|||
|
from ..exceptions import DataConversionWarning
|
|||
|
from ..preprocessing import normalize
|
|||
|
from ..utils import (
|
|||
|
check_array,
|
|||
|
gen_batches,
|
|||
|
gen_even_slices,
|
|||
|
)
|
|||
|
from ..utils._chunking import get_chunk_n_rows
|
|||
|
from ..utils._mask import _get_mask
|
|||
|
from ..utils._missing import is_scalar_nan
|
|||
|
from ..utils._param_validation import (
|
|||
|
Hidden,
|
|||
|
Interval,
|
|||
|
MissingValues,
|
|||
|
Options,
|
|||
|
StrOptions,
|
|||
|
validate_params,
|
|||
|
)
|
|||
|
from ..utils.extmath import row_norms, safe_sparse_dot
|
|||
|
from ..utils.fixes import parse_version, sp_base_version
|
|||
|
from ..utils.parallel import Parallel, delayed
|
|||
|
from ..utils.validation import _num_samples, check_non_negative
|
|||
|
from ._pairwise_distances_reduction import ArgKmin
|
|||
|
from ._pairwise_fast import _chi2_kernel_fast, _sparse_manhattan
|
|||
|
|
|||
|
|
|||
|
# Utility Functions
|
|||
|
def _return_float_dtype(X, Y):
|
|||
|
"""
|
|||
|
1. If dtype of X and Y is float32, then dtype float32 is returned.
|
|||
|
2. Else dtype float is returned.
|
|||
|
"""
|
|||
|
if not issparse(X) and not isinstance(X, np.ndarray):
|
|||
|
X = np.asarray(X)
|
|||
|
|
|||
|
if Y is None:
|
|||
|
Y_dtype = X.dtype
|
|||
|
elif not issparse(Y) and not isinstance(Y, np.ndarray):
|
|||
|
Y = np.asarray(Y)
|
|||
|
Y_dtype = Y.dtype
|
|||
|
else:
|
|||
|
Y_dtype = Y.dtype
|
|||
|
|
|||
|
if X.dtype == Y_dtype == np.float32:
|
|||
|
dtype = np.float32
|
|||
|
else:
|
|||
|
dtype = float
|
|||
|
|
|||
|
return X, Y, dtype
|
|||
|
|
|||
|
|
|||
|
def check_pairwise_arrays(
|
|||
|
X,
|
|||
|
Y,
|
|||
|
*,
|
|||
|
precomputed=False,
|
|||
|
dtype="infer_float",
|
|||
|
accept_sparse="csr",
|
|||
|
force_all_finite=True,
|
|||
|
ensure_2d=True,
|
|||
|
copy=False,
|
|||
|
):
|
|||
|
"""Set X and Y appropriately and checks inputs.
|
|||
|
|
|||
|
If Y is None, it is set as a pointer to X (i.e. not a copy).
|
|||
|
If Y is given, this does not happen.
|
|||
|
All distance metrics should use this function first to assert that the
|
|||
|
given parameters are correct and safe to use.
|
|||
|
|
|||
|
Specifically, this function first ensures that both X and Y are arrays,
|
|||
|
then checks that they are at least two dimensional while ensuring that
|
|||
|
their elements are floats (or dtype if provided). Finally, the function
|
|||
|
checks that the size of the second dimension of the two arrays is equal, or
|
|||
|
the equivalent check for a precomputed distance matrix.
|
|||
|
|
|||
|
Parameters
|
|||
|
----------
|
|||
|
X : {array-like, sparse matrix} of shape (n_samples_X, n_features)
|
|||
|
|
|||
|
Y : {array-like, sparse matrix} of shape (n_samples_Y, n_features)
|
|||
|
|
|||
|
precomputed : bool, default=False
|
|||
|
True if X is to be treated as precomputed distances to the samples in
|
|||
|
Y.
|
|||
|
|
|||
|
dtype : str, type, list of type or None default="infer_float"
|
|||
|
Data type required for X and Y. If "infer_float", the dtype will be an
|
|||
|
appropriate float type selected by _return_float_dtype. If None, the
|
|||
|
dtype of the input is preserved.
|
|||
|
|
|||
|
.. versionadded:: 0.18
|
|||
|
|
|||
|
accept_sparse : str, bool or list/tuple of str, default='csr'
|
|||
|
String[s] representing allowed sparse matrix formats, such as 'csc',
|
|||
|
'csr', etc. If the input is sparse but not in the allowed format,
|
|||
|
it will be converted to the first listed format. True allows the input
|
|||
|
to be any format. False means that a sparse matrix input will
|
|||
|
raise an error.
|
|||
|
|
|||
|
force_all_finite : bool or 'allow-nan', default=True
|
|||
|
Whether to raise an error on np.inf, np.nan, pd.NA in array. The
|
|||
|
possibilities are:
|
|||
|
|
|||
|
- True: Force all values of array to be finite.
|
|||
|
- False: accepts np.inf, np.nan, pd.NA in array.
|
|||
|
- 'allow-nan': accepts only np.nan and pd.NA values in array. Values
|
|||
|
cannot be infinite.
|
|||
|
|
|||
|
.. versionadded:: 0.22
|
|||
|
``force_all_finite`` accepts the string ``'allow-nan'``.
|
|||
|
|
|||
|
.. versionchanged:: 0.23
|
|||
|
Accepts `pd.NA` and converts it into `np.nan`.
|
|||
|
|
|||
|
ensure_2d : bool, default=True
|
|||
|
Whether to raise an error when the input arrays are not 2-dimensional. Setting
|
|||
|
this to `False` is necessary when using a custom metric with certain
|
|||
|
non-numerical inputs (e.g. a list of strings).
|
|||
|
|
|||
|
.. versionadded:: 1.5
|
|||
|
|
|||
|
copy : bool, default=False
|
|||
|
Whether a forced copy will be triggered. If copy=False, a copy might
|
|||
|
be triggered by a conversion.
|
|||
|
|
|||
|
.. versionadded:: 0.22
|
|||
|
|
|||
|
Returns
|
|||
|
-------
|
|||
|
safe_X : {array-like, sparse matrix} of shape (n_samples_X, n_features)
|
|||
|
An array equal to X, guaranteed to be a numpy array.
|
|||
|
|
|||
|
safe_Y : {array-like, sparse matrix} of shape (n_samples_Y, n_features)
|
|||
|
An array equal to Y if Y was not None, guaranteed to be a numpy array.
|
|||
|
If Y was None, safe_Y will be a pointer to X.
|
|||
|
"""
|
|||
|
X, Y, dtype_float = _return_float_dtype(X, Y)
|
|||
|
|
|||
|
estimator = "check_pairwise_arrays"
|
|||
|
if dtype == "infer_float":
|
|||
|
dtype = dtype_float
|
|||
|
|
|||
|
if Y is X or Y is None:
|
|||
|
X = Y = check_array(
|
|||
|
X,
|
|||
|
accept_sparse=accept_sparse,
|
|||
|
dtype=dtype,
|
|||
|
copy=copy,
|
|||
|
force_all_finite=force_all_finite,
|
|||
|
estimator=estimator,
|
|||
|
ensure_2d=ensure_2d,
|
|||
|
)
|
|||
|
else:
|
|||
|
X = check_array(
|
|||
|
X,
|
|||
|
accept_sparse=accept_sparse,
|
|||
|
dtype=dtype,
|
|||
|
copy=copy,
|
|||
|
force_all_finite=force_all_finite,
|
|||
|
estimator=estimator,
|
|||
|
ensure_2d=ensure_2d,
|
|||
|
)
|
|||
|
Y = check_array(
|
|||
|
Y,
|
|||
|
accept_sparse=accept_sparse,
|
|||
|
dtype=dtype,
|
|||
|
copy=copy,
|
|||
|
force_all_finite=force_all_finite,
|
|||
|
estimator=estimator,
|
|||
|
ensure_2d=ensure_2d,
|
|||
|
)
|
|||
|
|
|||
|
if precomputed:
|
|||
|
if X.shape[1] != Y.shape[0]:
|
|||
|
raise ValueError(
|
|||
|
"Precomputed metric requires shape "
|
|||
|
"(n_queries, n_indexed). Got (%d, %d) "
|
|||
|
"for %d indexed." % (X.shape[0], X.shape[1], Y.shape[0])
|
|||
|
)
|
|||
|
elif ensure_2d and X.shape[1] != Y.shape[1]:
|
|||
|
# Only check the number of features if 2d arrays are enforced. Otherwise,
|
|||
|
# validation is left to the user for custom metrics.
|
|||
|
raise ValueError(
|
|||
|
"Incompatible dimension for X and Y matrices: "
|
|||
|
"X.shape[1] == %d while Y.shape[1] == %d" % (X.shape[1], Y.shape[1])
|
|||
|
)
|
|||
|
|
|||
|
return X, Y
|
|||
|
|
|||
|
|
|||
|
def check_paired_arrays(X, Y):
|
|||
|
"""Set X and Y appropriately and checks inputs for paired distances.
|
|||
|
|
|||
|
All paired distance metrics should use this function first to assert that
|
|||
|
the given parameters are correct and safe to use.
|
|||
|
|
|||
|
Specifically, this function first ensures that both X and Y are arrays,
|
|||
|
then checks that they are at least two dimensional while ensuring that
|
|||
|
their elements are floats. Finally, the function checks that the size
|
|||
|
of the dimensions of the two arrays are equal.
|
|||
|
|
|||
|
Parameters
|
|||
|
----------
|
|||
|
X : {array-like, sparse matrix} of shape (n_samples_X, n_features)
|
|||
|
|
|||
|
Y : {array-like, sparse matrix} of shape (n_samples_Y, n_features)
|
|||
|
|
|||
|
Returns
|
|||
|
-------
|
|||
|
safe_X : {array-like, sparse matrix} of shape (n_samples_X, n_features)
|
|||
|
An array equal to X, guaranteed to be a numpy array.
|
|||
|
|
|||
|
safe_Y : {array-like, sparse matrix} of shape (n_samples_Y, n_features)
|
|||
|
An array equal to Y if Y was not None, guaranteed to be a numpy array.
|
|||
|
If Y was None, safe_Y will be a pointer to X.
|
|||
|
"""
|
|||
|
X, Y = check_pairwise_arrays(X, Y)
|
|||
|
if X.shape != Y.shape:
|
|||
|
raise ValueError(
|
|||
|
"X and Y should be of same shape. They were respectively %r and %r long."
|
|||
|
% (X.shape, Y.shape)
|
|||
|
)
|
|||
|
return X, Y
|
|||
|
|
|||
|
|
|||
|
# Pairwise distances
|
|||
|
@validate_params(
|
|||
|
{
|
|||
|
"X": ["array-like", "sparse matrix"],
|
|||
|
"Y": ["array-like", "sparse matrix", None],
|
|||
|
"Y_norm_squared": ["array-like", None],
|
|||
|
"squared": ["boolean"],
|
|||
|
"X_norm_squared": ["array-like", None],
|
|||
|
},
|
|||
|
prefer_skip_nested_validation=True,
|
|||
|
)
|
|||
|
def euclidean_distances(
|
|||
|
X, Y=None, *, Y_norm_squared=None, squared=False, X_norm_squared=None
|
|||
|
):
|
|||
|
"""
|
|||
|
Compute the distance matrix between each pair from a vector array X and Y.
|
|||
|
|
|||
|
For efficiency reasons, the euclidean distance between a pair of row
|
|||
|
vector x and y is computed as::
|
|||
|
|
|||
|
dist(x, y) = sqrt(dot(x, x) - 2 * dot(x, y) + dot(y, y))
|
|||
|
|
|||
|
This formulation has two advantages over other ways of computing distances.
|
|||
|
First, it is computationally efficient when dealing with sparse data.
|
|||
|
Second, if one argument varies but the other remains unchanged, then
|
|||
|
`dot(x, x)` and/or `dot(y, y)` can be pre-computed.
|
|||
|
|
|||
|
However, this is not the most precise way of doing this computation,
|
|||
|
because this equation potentially suffers from "catastrophic cancellation".
|
|||
|
Also, the distance matrix returned by this function may not be exactly
|
|||
|
symmetric as required by, e.g., ``scipy.spatial.distance`` functions.
|
|||
|
|
|||
|
Read more in the :ref:`User Guide <metrics>`.
|
|||
|
|
|||
|
Parameters
|
|||
|
----------
|
|||
|
X : {array-like, sparse matrix} of shape (n_samples_X, n_features)
|
|||
|
An array where each row is a sample and each column is a feature.
|
|||
|
|
|||
|
Y : {array-like, sparse matrix} of shape (n_samples_Y, n_features), \
|
|||
|
default=None
|
|||
|
An array where each row is a sample and each column is a feature.
|
|||
|
If `None`, method uses `Y=X`.
|
|||
|
|
|||
|
Y_norm_squared : array-like of shape (n_samples_Y,) or (n_samples_Y, 1) \
|
|||
|
or (1, n_samples_Y), default=None
|
|||
|
Pre-computed dot-products of vectors in Y (e.g.,
|
|||
|
``(Y**2).sum(axis=1)``)
|
|||
|
May be ignored in some cases, see the note below.
|
|||
|
|
|||
|
squared : bool, default=False
|
|||
|
Return squared Euclidean distances.
|
|||
|
|
|||
|
X_norm_squared : array-like of shape (n_samples_X,) or (n_samples_X, 1) \
|
|||
|
or (1, n_samples_X), default=None
|
|||
|
Pre-computed dot-products of vectors in X (e.g.,
|
|||
|
``(X**2).sum(axis=1)``)
|
|||
|
May be ignored in some cases, see the note below.
|
|||
|
|
|||
|
Returns
|
|||
|
-------
|
|||
|
distances : ndarray of shape (n_samples_X, n_samples_Y)
|
|||
|
Returns the distances between the row vectors of `X`
|
|||
|
and the row vectors of `Y`.
|
|||
|
|
|||
|
See Also
|
|||
|
--------
|
|||
|
paired_distances : Distances between pairs of elements of X and Y.
|
|||
|
|
|||
|
Notes
|
|||
|
-----
|
|||
|
To achieve a better accuracy, `X_norm_squared` and `Y_norm_squared` may be
|
|||
|
unused if they are passed as `np.float32`.
|
|||
|
|
|||
|
Examples
|
|||
|
--------
|
|||
|
>>> from sklearn.metrics.pairwise import euclidean_distances
|
|||
|
>>> X = [[0, 1], [1, 1]]
|
|||
|
>>> # distance between rows of X
|
|||
|
>>> euclidean_distances(X, X)
|
|||
|
array([[0., 1.],
|
|||
|
[1., 0.]])
|
|||
|
>>> # get distance to origin
|
|||
|
>>> euclidean_distances(X, [[0, 0]])
|
|||
|
array([[1. ],
|
|||
|
[1.41421356]])
|
|||
|
"""
|
|||
|
X, Y = check_pairwise_arrays(X, Y)
|
|||
|
|
|||
|
if X_norm_squared is not None:
|
|||
|
X_norm_squared = check_array(X_norm_squared, ensure_2d=False)
|
|||
|
original_shape = X_norm_squared.shape
|
|||
|
if X_norm_squared.shape == (X.shape[0],):
|
|||
|
X_norm_squared = X_norm_squared.reshape(-1, 1)
|
|||
|
if X_norm_squared.shape == (1, X.shape[0]):
|
|||
|
X_norm_squared = X_norm_squared.T
|
|||
|
if X_norm_squared.shape != (X.shape[0], 1):
|
|||
|
raise ValueError(
|
|||
|
f"Incompatible dimensions for X of shape {X.shape} and "
|
|||
|
f"X_norm_squared of shape {original_shape}."
|
|||
|
)
|
|||
|
|
|||
|
if Y_norm_squared is not None:
|
|||
|
Y_norm_squared = check_array(Y_norm_squared, ensure_2d=False)
|
|||
|
original_shape = Y_norm_squared.shape
|
|||
|
if Y_norm_squared.shape == (Y.shape[0],):
|
|||
|
Y_norm_squared = Y_norm_squared.reshape(1, -1)
|
|||
|
if Y_norm_squared.shape == (Y.shape[0], 1):
|
|||
|
Y_norm_squared = Y_norm_squared.T
|
|||
|
if Y_norm_squared.shape != (1, Y.shape[0]):
|
|||
|
raise ValueError(
|
|||
|
f"Incompatible dimensions for Y of shape {Y.shape} and "
|
|||
|
f"Y_norm_squared of shape {original_shape}."
|
|||
|
)
|
|||
|
|
|||
|
return _euclidean_distances(X, Y, X_norm_squared, Y_norm_squared, squared)
|
|||
|
|
|||
|
|
|||
|
def _euclidean_distances(X, Y, X_norm_squared=None, Y_norm_squared=None, squared=False):
|
|||
|
"""Computational part of euclidean_distances
|
|||
|
|
|||
|
Assumes inputs are already checked.
|
|||
|
|
|||
|
If norms are passed as float32, they are unused. If arrays are passed as
|
|||
|
float32, norms needs to be recomputed on upcast chunks.
|
|||
|
TODO: use a float64 accumulator in row_norms to avoid the latter.
|
|||
|
"""
|
|||
|
if X_norm_squared is not None and X_norm_squared.dtype != np.float32:
|
|||
|
XX = X_norm_squared.reshape(-1, 1)
|
|||
|
elif X.dtype != np.float32:
|
|||
|
XX = row_norms(X, squared=True)[:, np.newaxis]
|
|||
|
else:
|
|||
|
XX = None
|
|||
|
|
|||
|
if Y is X:
|
|||
|
YY = None if XX is None else XX.T
|
|||
|
else:
|
|||
|
if Y_norm_squared is not None and Y_norm_squared.dtype != np.float32:
|
|||
|
YY = Y_norm_squared.reshape(1, -1)
|
|||
|
elif Y.dtype != np.float32:
|
|||
|
YY = row_norms(Y, squared=True)[np.newaxis, :]
|
|||
|
else:
|
|||
|
YY = None
|
|||
|
|
|||
|
if X.dtype == np.float32 or Y.dtype == np.float32:
|
|||
|
# To minimize precision issues with float32, we compute the distance
|
|||
|
# matrix on chunks of X and Y upcast to float64
|
|||
|
distances = _euclidean_distances_upcast(X, XX, Y, YY)
|
|||
|
else:
|
|||
|
# if dtype is already float64, no need to chunk and upcast
|
|||
|
distances = -2 * safe_sparse_dot(X, Y.T, dense_output=True)
|
|||
|
distances += XX
|
|||
|
distances += YY
|
|||
|
np.maximum(distances, 0, out=distances)
|
|||
|
|
|||
|
# Ensure that distances between vectors and themselves are set to 0.0.
|
|||
|
# This may not be the case due to floating point rounding errors.
|
|||
|
if X is Y:
|
|||
|
np.fill_diagonal(distances, 0)
|
|||
|
|
|||
|
return distances if squared else np.sqrt(distances, out=distances)
|
|||
|
|
|||
|
|
|||
|
@validate_params(
|
|||
|
{
|
|||
|
"X": ["array-like"],
|
|||
|
"Y": ["array-like", None],
|
|||
|
"squared": ["boolean"],
|
|||
|
"missing_values": [MissingValues(numeric_only=True)],
|
|||
|
"copy": ["boolean"],
|
|||
|
},
|
|||
|
prefer_skip_nested_validation=True,
|
|||
|
)
|
|||
|
def nan_euclidean_distances(
|
|||
|
X, Y=None, *, squared=False, missing_values=np.nan, copy=True
|
|||
|
):
|
|||
|
"""Calculate the euclidean distances in the presence of missing values.
|
|||
|
|
|||
|
Compute the euclidean distance between each pair of samples in X and Y,
|
|||
|
where Y=X is assumed if Y=None. When calculating the distance between a
|
|||
|
pair of samples, this formulation ignores feature coordinates with a
|
|||
|
missing value in either sample and scales up the weight of the remaining
|
|||
|
coordinates:
|
|||
|
|
|||
|
dist(x,y) = sqrt(weight * sq. distance from present coordinates)
|
|||
|
where,
|
|||
|
weight = Total # of coordinates / # of present coordinates
|
|||
|
|
|||
|
For example, the distance between ``[3, na, na, 6]`` and ``[1, na, 4, 5]``
|
|||
|
is:
|
|||
|
|
|||
|
.. math::
|
|||
|
\\sqrt{\\frac{4}{2}((3-1)^2 + (6-5)^2)}
|
|||
|
|
|||
|
If all the coordinates are missing or if there are no common present
|
|||
|
coordinates then NaN is returned for that pair.
|
|||
|
|
|||
|
Read more in the :ref:`User Guide <metrics>`.
|
|||
|
|
|||
|
.. versionadded:: 0.22
|
|||
|
|
|||
|
Parameters
|
|||
|
----------
|
|||
|
X : array-like of shape (n_samples_X, n_features)
|
|||
|
An array where each row is a sample and each column is a feature.
|
|||
|
|
|||
|
Y : array-like of shape (n_samples_Y, n_features), default=None
|
|||
|
An array where each row is a sample and each column is a feature.
|
|||
|
If `None`, method uses `Y=X`.
|
|||
|
|
|||
|
squared : bool, default=False
|
|||
|
Return squared Euclidean distances.
|
|||
|
|
|||
|
missing_values : np.nan, float or int, default=np.nan
|
|||
|
Representation of missing value.
|
|||
|
|
|||
|
copy : bool, default=True
|
|||
|
Make and use a deep copy of X and Y (if Y exists).
|
|||
|
|
|||
|
Returns
|
|||
|
-------
|
|||
|
distances : ndarray of shape (n_samples_X, n_samples_Y)
|
|||
|
Returns the distances between the row vectors of `X`
|
|||
|
and the row vectors of `Y`.
|
|||
|
|
|||
|
See Also
|
|||
|
--------
|
|||
|
paired_distances : Distances between pairs of elements of X and Y.
|
|||
|
|
|||
|
References
|
|||
|
----------
|
|||
|
* John K. Dixon, "Pattern Recognition with Partly Missing Data",
|
|||
|
IEEE Transactions on Systems, Man, and Cybernetics, Volume: 9, Issue:
|
|||
|
10, pp. 617 - 621, Oct. 1979.
|
|||
|
http://ieeexplore.ieee.org/abstract/document/4310090/
|
|||
|
|
|||
|
Examples
|
|||
|
--------
|
|||
|
>>> from sklearn.metrics.pairwise import nan_euclidean_distances
|
|||
|
>>> nan = float("NaN")
|
|||
|
>>> X = [[0, 1], [1, nan]]
|
|||
|
>>> nan_euclidean_distances(X, X) # distance between rows of X
|
|||
|
array([[0. , 1.41421356],
|
|||
|
[1.41421356, 0. ]])
|
|||
|
|
|||
|
>>> # get distance to origin
|
|||
|
>>> nan_euclidean_distances(X, [[0, 0]])
|
|||
|
array([[1. ],
|
|||
|
[1.41421356]])
|
|||
|
"""
|
|||
|
|
|||
|
force_all_finite = "allow-nan" if is_scalar_nan(missing_values) else True
|
|||
|
X, Y = check_pairwise_arrays(
|
|||
|
X, Y, accept_sparse=False, force_all_finite=force_all_finite, copy=copy
|
|||
|
)
|
|||
|
# Get missing mask for X
|
|||
|
missing_X = _get_mask(X, missing_values)
|
|||
|
|
|||
|
# Get missing mask for Y
|
|||
|
missing_Y = missing_X if Y is X else _get_mask(Y, missing_values)
|
|||
|
|
|||
|
# set missing values to zero
|
|||
|
X[missing_X] = 0
|
|||
|
Y[missing_Y] = 0
|
|||
|
|
|||
|
distances = euclidean_distances(X, Y, squared=True)
|
|||
|
|
|||
|
# Adjust distances for missing values
|
|||
|
XX = X * X
|
|||
|
YY = Y * Y
|
|||
|
distances -= np.dot(XX, missing_Y.T)
|
|||
|
distances -= np.dot(missing_X, YY.T)
|
|||
|
|
|||
|
np.clip(distances, 0, None, out=distances)
|
|||
|
|
|||
|
if X is Y:
|
|||
|
# Ensure that distances between vectors and themselves are set to 0.0.
|
|||
|
# This may not be the case due to floating point rounding errors.
|
|||
|
np.fill_diagonal(distances, 0.0)
|
|||
|
|
|||
|
present_X = 1 - missing_X
|
|||
|
present_Y = present_X if Y is X else ~missing_Y
|
|||
|
present_count = np.dot(present_X, present_Y.T)
|
|||
|
distances[present_count == 0] = np.nan
|
|||
|
# avoid divide by zero
|
|||
|
np.maximum(1, present_count, out=present_count)
|
|||
|
distances /= present_count
|
|||
|
distances *= X.shape[1]
|
|||
|
|
|||
|
if not squared:
|
|||
|
np.sqrt(distances, out=distances)
|
|||
|
|
|||
|
return distances
|
|||
|
|
|||
|
|
|||
|
def _euclidean_distances_upcast(X, XX=None, Y=None, YY=None, batch_size=None):
|
|||
|
"""Euclidean distances between X and Y.
|
|||
|
|
|||
|
Assumes X and Y have float32 dtype.
|
|||
|
Assumes XX and YY have float64 dtype or are None.
|
|||
|
|
|||
|
X and Y are upcast to float64 by chunks, which size is chosen to limit
|
|||
|
memory increase by approximately 10% (at least 10MiB).
|
|||
|
"""
|
|||
|
n_samples_X = X.shape[0]
|
|||
|
n_samples_Y = Y.shape[0]
|
|||
|
n_features = X.shape[1]
|
|||
|
|
|||
|
distances = np.empty((n_samples_X, n_samples_Y), dtype=np.float32)
|
|||
|
|
|||
|
if batch_size is None:
|
|||
|
x_density = X.nnz / np.prod(X.shape) if issparse(X) else 1
|
|||
|
y_density = Y.nnz / np.prod(Y.shape) if issparse(Y) else 1
|
|||
|
|
|||
|
# Allow 10% more memory than X, Y and the distance matrix take (at
|
|||
|
# least 10MiB)
|
|||
|
maxmem = max(
|
|||
|
(
|
|||
|
(x_density * n_samples_X + y_density * n_samples_Y) * n_features
|
|||
|
+ (x_density * n_samples_X * y_density * n_samples_Y)
|
|||
|
)
|
|||
|
/ 10,
|
|||
|
10 * 2**17,
|
|||
|
)
|
|||
|
|
|||
|
# The increase amount of memory in 8-byte blocks is:
|
|||
|
# - x_density * batch_size * n_features (copy of chunk of X)
|
|||
|
# - y_density * batch_size * n_features (copy of chunk of Y)
|
|||
|
# - batch_size * batch_size (chunk of distance matrix)
|
|||
|
# Hence x² + (xd+yd)kx = M, where x=batch_size, k=n_features, M=maxmem
|
|||
|
# xd=x_density and yd=y_density
|
|||
|
tmp = (x_density + y_density) * n_features
|
|||
|
batch_size = (-tmp + np.sqrt(tmp**2 + 4 * maxmem)) / 2
|
|||
|
batch_size = max(int(batch_size), 1)
|
|||
|
|
|||
|
x_batches = gen_batches(n_samples_X, batch_size)
|
|||
|
|
|||
|
for i, x_slice in enumerate(x_batches):
|
|||
|
X_chunk = X[x_slice].astype(np.float64)
|
|||
|
if XX is None:
|
|||
|
XX_chunk = row_norms(X_chunk, squared=True)[:, np.newaxis]
|
|||
|
else:
|
|||
|
XX_chunk = XX[x_slice]
|
|||
|
|
|||
|
y_batches = gen_batches(n_samples_Y, batch_size)
|
|||
|
|
|||
|
for j, y_slice in enumerate(y_batches):
|
|||
|
if X is Y and j < i:
|
|||
|
# when X is Y the distance matrix is symmetric so we only need
|
|||
|
# to compute half of it.
|
|||
|
d = distances[y_slice, x_slice].T
|
|||
|
|
|||
|
else:
|
|||
|
Y_chunk = Y[y_slice].astype(np.float64)
|
|||
|
if YY is None:
|
|||
|
YY_chunk = row_norms(Y_chunk, squared=True)[np.newaxis, :]
|
|||
|
else:
|
|||
|
YY_chunk = YY[:, y_slice]
|
|||
|
|
|||
|
d = -2 * safe_sparse_dot(X_chunk, Y_chunk.T, dense_output=True)
|
|||
|
d += XX_chunk
|
|||
|
d += YY_chunk
|
|||
|
|
|||
|
distances[x_slice, y_slice] = d.astype(np.float32, copy=False)
|
|||
|
|
|||
|
return distances
|
|||
|
|
|||
|
|
|||
|
def _argmin_min_reduce(dist, start):
|
|||
|
# `start` is specified in the signature but not used. This is because the higher
|
|||
|
# order `pairwise_distances_chunked` function needs reduction functions that are
|
|||
|
# passed as argument to have a two arguments signature.
|
|||
|
indices = dist.argmin(axis=1)
|
|||
|
values = dist[np.arange(dist.shape[0]), indices]
|
|||
|
return indices, values
|
|||
|
|
|||
|
|
|||
|
def _argmin_reduce(dist, start):
|
|||
|
# `start` is specified in the signature but not used. This is because the higher
|
|||
|
# order `pairwise_distances_chunked` function needs reduction functions that are
|
|||
|
# passed as argument to have a two arguments signature.
|
|||
|
return dist.argmin(axis=1)
|
|||
|
|
|||
|
|
|||
|
_VALID_METRICS = [
|
|||
|
"euclidean",
|
|||
|
"l2",
|
|||
|
"l1",
|
|||
|
"manhattan",
|
|||
|
"cityblock",
|
|||
|
"braycurtis",
|
|||
|
"canberra",
|
|||
|
"chebyshev",
|
|||
|
"correlation",
|
|||
|
"cosine",
|
|||
|
"dice",
|
|||
|
"hamming",
|
|||
|
"jaccard",
|
|||
|
"mahalanobis",
|
|||
|
"matching",
|
|||
|
"minkowski",
|
|||
|
"rogerstanimoto",
|
|||
|
"russellrao",
|
|||
|
"seuclidean",
|
|||
|
"sokalmichener",
|
|||
|
"sokalsneath",
|
|||
|
"sqeuclidean",
|
|||
|
"yule",
|
|||
|
"wminkowski",
|
|||
|
"nan_euclidean",
|
|||
|
"haversine",
|
|||
|
]
|
|||
|
if sp_base_version < parse_version("1.11"): # pragma: no cover
|
|||
|
# Deprecated in SciPy 1.9 and removed in SciPy 1.11
|
|||
|
_VALID_METRICS += ["kulsinski"]
|
|||
|
if sp_base_version < parse_version("1.9"):
|
|||
|
# Deprecated in SciPy 1.0 and removed in SciPy 1.9
|
|||
|
_VALID_METRICS += ["matching"]
|
|||
|
|
|||
|
_NAN_METRICS = ["nan_euclidean"]
|
|||
|
|
|||
|
|
|||
|
@validate_params(
|
|||
|
{
|
|||
|
"X": ["array-like", "sparse matrix"],
|
|||
|
"Y": ["array-like", "sparse matrix"],
|
|||
|
"axis": [Options(Integral, {0, 1})],
|
|||
|
"metric": [
|
|||
|
StrOptions(set(_VALID_METRICS).union(ArgKmin.valid_metrics())),
|
|||
|
callable,
|
|||
|
],
|
|||
|
"metric_kwargs": [dict, None],
|
|||
|
},
|
|||
|
prefer_skip_nested_validation=False, # metric is not validated yet
|
|||
|
)
|
|||
|
def pairwise_distances_argmin_min(
|
|||
|
X, Y, *, axis=1, metric="euclidean", metric_kwargs=None
|
|||
|
):
|
|||
|
"""Compute minimum distances between one point and a set of points.
|
|||
|
|
|||
|
This function computes for each row in X, the index of the row of Y which
|
|||
|
is closest (according to the specified distance). The minimal distances are
|
|||
|
also returned.
|
|||
|
|
|||
|
This is mostly equivalent to calling:
|
|||
|
|
|||
|
(pairwise_distances(X, Y=Y, metric=metric).argmin(axis=axis),
|
|||
|
pairwise_distances(X, Y=Y, metric=metric).min(axis=axis))
|
|||
|
|
|||
|
but uses much less memory, and is faster for large arrays.
|
|||
|
|
|||
|
Parameters
|
|||
|
----------
|
|||
|
X : {array-like, sparse matrix} of shape (n_samples_X, n_features)
|
|||
|
Array containing points.
|
|||
|
|
|||
|
Y : {array-like, sparse matrix} of shape (n_samples_Y, n_features)
|
|||
|
Array containing points.
|
|||
|
|
|||
|
axis : int, default=1
|
|||
|
Axis along which the argmin and distances are to be computed.
|
|||
|
|
|||
|
metric : str or callable, default='euclidean'
|
|||
|
Metric to use for distance computation. Any metric from scikit-learn
|
|||
|
or scipy.spatial.distance can be used.
|
|||
|
|
|||
|
If metric is a callable function, it is called on each
|
|||
|
pair of instances (rows) and the resulting value recorded. The callable
|
|||
|
should take two arrays as input and return one value indicating the
|
|||
|
distance between them. This works for Scipy's metrics, but is less
|
|||
|
efficient than passing the metric name as a string.
|
|||
|
|
|||
|
Distance matrices are not supported.
|
|||
|
|
|||
|
Valid values for metric are:
|
|||
|
|
|||
|
- from scikit-learn: ['cityblock', 'cosine', 'euclidean', 'l1', 'l2',
|
|||
|
'manhattan']
|
|||
|
|
|||
|
- from scipy.spatial.distance: ['braycurtis', 'canberra', 'chebyshev',
|
|||
|
'correlation', 'dice', 'hamming', 'jaccard', 'kulsinski',
|
|||
|
'mahalanobis', 'minkowski', 'rogerstanimoto', 'russellrao',
|
|||
|
'seuclidean', 'sokalmichener', 'sokalsneath', 'sqeuclidean',
|
|||
|
'yule']
|
|||
|
|
|||
|
See the documentation for scipy.spatial.distance for details on these
|
|||
|
metrics.
|
|||
|
|
|||
|
.. note::
|
|||
|
`'kulsinski'` is deprecated from SciPy 1.9 and will be removed in SciPy 1.11.
|
|||
|
|
|||
|
.. note::
|
|||
|
`'matching'` has been removed in SciPy 1.9 (use `'hamming'` instead).
|
|||
|
|
|||
|
metric_kwargs : dict, default=None
|
|||
|
Keyword arguments to pass to specified metric function.
|
|||
|
|
|||
|
Returns
|
|||
|
-------
|
|||
|
argmin : ndarray
|
|||
|
Y[argmin[i], :] is the row in Y that is closest to X[i, :].
|
|||
|
|
|||
|
distances : ndarray
|
|||
|
The array of minimum distances. `distances[i]` is the distance between
|
|||
|
the i-th row in X and the argmin[i]-th row in Y.
|
|||
|
|
|||
|
See Also
|
|||
|
--------
|
|||
|
pairwise_distances : Distances between every pair of samples of X and Y.
|
|||
|
pairwise_distances_argmin : Same as `pairwise_distances_argmin_min` but only
|
|||
|
returns the argmins.
|
|||
|
|
|||
|
Examples
|
|||
|
--------
|
|||
|
>>> from sklearn.metrics.pairwise import pairwise_distances_argmin_min
|
|||
|
>>> X = [[0, 0, 0], [1, 1, 1]]
|
|||
|
>>> Y = [[1, 0, 0], [1, 1, 0]]
|
|||
|
>>> argmin, distances = pairwise_distances_argmin_min(X, Y)
|
|||
|
>>> argmin
|
|||
|
array([0, 1])
|
|||
|
>>> distances
|
|||
|
array([1., 1.])
|
|||
|
"""
|
|||
|
X, Y = check_pairwise_arrays(X, Y)
|
|||
|
|
|||
|
if axis == 0:
|
|||
|
X, Y = Y, X
|
|||
|
|
|||
|
if metric_kwargs is None:
|
|||
|
metric_kwargs = {}
|
|||
|
|
|||
|
if ArgKmin.is_usable_for(X, Y, metric):
|
|||
|
# This is an adaptor for one "sqeuclidean" specification.
|
|||
|
# For this backend, we can directly use "sqeuclidean".
|
|||
|
if metric_kwargs.get("squared", False) and metric == "euclidean":
|
|||
|
metric = "sqeuclidean"
|
|||
|
metric_kwargs = {}
|
|||
|
|
|||
|
values, indices = ArgKmin.compute(
|
|||
|
X=X,
|
|||
|
Y=Y,
|
|||
|
k=1,
|
|||
|
metric=metric,
|
|||
|
metric_kwargs=metric_kwargs,
|
|||
|
strategy="auto",
|
|||
|
return_distance=True,
|
|||
|
)
|
|||
|
values = values.flatten()
|
|||
|
indices = indices.flatten()
|
|||
|
else:
|
|||
|
# Joblib-based backend, which is used when user-defined callable
|
|||
|
# are passed for metric.
|
|||
|
|
|||
|
# This won't be used in the future once PairwiseDistancesReductions support:
|
|||
|
# - DistanceMetrics which work on supposedly binary data
|
|||
|
# - CSR-dense and dense-CSR case if 'euclidean' in metric.
|
|||
|
|
|||
|
# Turn off check for finiteness because this is costly and because arrays
|
|||
|
# have already been validated.
|
|||
|
with config_context(assume_finite=True):
|
|||
|
indices, values = zip(
|
|||
|
*pairwise_distances_chunked(
|
|||
|
X, Y, reduce_func=_argmin_min_reduce, metric=metric, **metric_kwargs
|
|||
|
)
|
|||
|
)
|
|||
|
indices = np.concatenate(indices)
|
|||
|
values = np.concatenate(values)
|
|||
|
|
|||
|
return indices, values
|
|||
|
|
|||
|
|
|||
|
@validate_params(
|
|||
|
{
|
|||
|
"X": ["array-like", "sparse matrix"],
|
|||
|
"Y": ["array-like", "sparse matrix"],
|
|||
|
"axis": [Options(Integral, {0, 1})],
|
|||
|
"metric": [
|
|||
|
StrOptions(set(_VALID_METRICS).union(ArgKmin.valid_metrics())),
|
|||
|
callable,
|
|||
|
],
|
|||
|
"metric_kwargs": [dict, None],
|
|||
|
},
|
|||
|
prefer_skip_nested_validation=False, # metric is not validated yet
|
|||
|
)
|
|||
|
def pairwise_distances_argmin(X, Y, *, axis=1, metric="euclidean", metric_kwargs=None):
|
|||
|
"""Compute minimum distances between one point and a set of points.
|
|||
|
|
|||
|
This function computes for each row in X, the index of the row of Y which
|
|||
|
is closest (according to the specified distance).
|
|||
|
|
|||
|
This is mostly equivalent to calling:
|
|||
|
|
|||
|
pairwise_distances(X, Y=Y, metric=metric).argmin(axis=axis)
|
|||
|
|
|||
|
but uses much less memory, and is faster for large arrays.
|
|||
|
|
|||
|
This function works with dense 2D arrays only.
|
|||
|
|
|||
|
Parameters
|
|||
|
----------
|
|||
|
X : {array-like, sparse matrix} of shape (n_samples_X, n_features)
|
|||
|
Array containing points.
|
|||
|
|
|||
|
Y : {array-like, sparse matrix} of shape (n_samples_Y, n_features)
|
|||
|
Arrays containing points.
|
|||
|
|
|||
|
axis : int, default=1
|
|||
|
Axis along which the argmin and distances are to be computed.
|
|||
|
|
|||
|
metric : str or callable, default="euclidean"
|
|||
|
Metric to use for distance computation. Any metric from scikit-learn
|
|||
|
or scipy.spatial.distance can be used.
|
|||
|
|
|||
|
If metric is a callable function, it is called on each
|
|||
|
pair of instances (rows) and the resulting value recorded. The callable
|
|||
|
should take two arrays as input and return one value indicating the
|
|||
|
distance between them. This works for Scipy's metrics, but is less
|
|||
|
efficient than passing the metric name as a string.
|
|||
|
|
|||
|
Distance matrices are not supported.
|
|||
|
|
|||
|
Valid values for metric are:
|
|||
|
|
|||
|
- from scikit-learn: ['cityblock', 'cosine', 'euclidean', 'l1', 'l2',
|
|||
|
'manhattan']
|
|||
|
|
|||
|
- from scipy.spatial.distance: ['braycurtis', 'canberra', 'chebyshev',
|
|||
|
'correlation', 'dice', 'hamming', 'jaccard', 'kulsinski',
|
|||
|
'mahalanobis', 'minkowski', 'rogerstanimoto', 'russellrao',
|
|||
|
'seuclidean', 'sokalmichener', 'sokalsneath', 'sqeuclidean',
|
|||
|
'yule']
|
|||
|
|
|||
|
See the documentation for scipy.spatial.distance for details on these
|
|||
|
metrics.
|
|||
|
|
|||
|
.. note::
|
|||
|
`'kulsinski'` is deprecated from SciPy 1.9 and will be removed in SciPy 1.11.
|
|||
|
|
|||
|
.. note::
|
|||
|
`'matching'` has been removed in SciPy 1.9 (use `'hamming'` instead).
|
|||
|
|
|||
|
metric_kwargs : dict, default=None
|
|||
|
Keyword arguments to pass to specified metric function.
|
|||
|
|
|||
|
Returns
|
|||
|
-------
|
|||
|
argmin : numpy.ndarray
|
|||
|
Y[argmin[i], :] is the row in Y that is closest to X[i, :].
|
|||
|
|
|||
|
See Also
|
|||
|
--------
|
|||
|
pairwise_distances : Distances between every pair of samples of X and Y.
|
|||
|
pairwise_distances_argmin_min : Same as `pairwise_distances_argmin` but also
|
|||
|
returns the distances.
|
|||
|
|
|||
|
Examples
|
|||
|
--------
|
|||
|
>>> from sklearn.metrics.pairwise import pairwise_distances_argmin
|
|||
|
>>> X = [[0, 0, 0], [1, 1, 1]]
|
|||
|
>>> Y = [[1, 0, 0], [1, 1, 0]]
|
|||
|
>>> pairwise_distances_argmin(X, Y)
|
|||
|
array([0, 1])
|
|||
|
"""
|
|||
|
X, Y = check_pairwise_arrays(X, Y)
|
|||
|
|
|||
|
if axis == 0:
|
|||
|
X, Y = Y, X
|
|||
|
|
|||
|
if metric_kwargs is None:
|
|||
|
metric_kwargs = {}
|
|||
|
|
|||
|
if ArgKmin.is_usable_for(X, Y, metric):
|
|||
|
# This is an adaptor for one "sqeuclidean" specification.
|
|||
|
# For this backend, we can directly use "sqeuclidean".
|
|||
|
if metric_kwargs.get("squared", False) and metric == "euclidean":
|
|||
|
metric = "sqeuclidean"
|
|||
|
metric_kwargs = {}
|
|||
|
|
|||
|
indices = ArgKmin.compute(
|
|||
|
X=X,
|
|||
|
Y=Y,
|
|||
|
k=1,
|
|||
|
metric=metric,
|
|||
|
metric_kwargs=metric_kwargs,
|
|||
|
strategy="auto",
|
|||
|
return_distance=False,
|
|||
|
)
|
|||
|
indices = indices.flatten()
|
|||
|
else:
|
|||
|
# Joblib-based backend, which is used when user-defined callable
|
|||
|
# are passed for metric.
|
|||
|
|
|||
|
# This won't be used in the future once PairwiseDistancesReductions support:
|
|||
|
# - DistanceMetrics which work on supposedly binary data
|
|||
|
# - CSR-dense and dense-CSR case if 'euclidean' in metric.
|
|||
|
|
|||
|
# Turn off check for finiteness because this is costly and because arrays
|
|||
|
# have already been validated.
|
|||
|
with config_context(assume_finite=True):
|
|||
|
indices = np.concatenate(
|
|||
|
list(
|
|||
|
# This returns a np.ndarray generator whose arrays we need
|
|||
|
# to flatten into one.
|
|||
|
pairwise_distances_chunked(
|
|||
|
X, Y, reduce_func=_argmin_reduce, metric=metric, **metric_kwargs
|
|||
|
)
|
|||
|
)
|
|||
|
)
|
|||
|
|
|||
|
return indices
|
|||
|
|
|||
|
|
|||
|
@validate_params(
|
|||
|
{"X": ["array-like", "sparse matrix"], "Y": ["array-like", "sparse matrix", None]},
|
|||
|
prefer_skip_nested_validation=True,
|
|||
|
)
|
|||
|
def haversine_distances(X, Y=None):
|
|||
|
"""Compute the Haversine distance between samples in X and Y.
|
|||
|
|
|||
|
The Haversine (or great circle) distance is the angular distance between
|
|||
|
two points on the surface of a sphere. The first coordinate of each point
|
|||
|
is assumed to be the latitude, the second is the longitude, given
|
|||
|
in radians. The dimension of the data must be 2.
|
|||
|
|
|||
|
.. math::
|
|||
|
D(x, y) = 2\\arcsin[\\sqrt{\\sin^2((x_{lat} - y_{lat}) / 2)
|
|||
|
+ \\cos(x_{lat})\\cos(y_{lat})\\
|
|||
|
sin^2((x_{lon} - y_{lon}) / 2)}]
|
|||
|
|
|||
|
Parameters
|
|||
|
----------
|
|||
|
X : {array-like, sparse matrix} of shape (n_samples_X, 2)
|
|||
|
A feature array.
|
|||
|
|
|||
|
Y : {array-like, sparse matrix} of shape (n_samples_Y, 2), default=None
|
|||
|
An optional second feature array. If `None`, uses `Y=X`.
|
|||
|
|
|||
|
Returns
|
|||
|
-------
|
|||
|
distances : ndarray of shape (n_samples_X, n_samples_Y)
|
|||
|
The distance matrix.
|
|||
|
|
|||
|
Notes
|
|||
|
-----
|
|||
|
As the Earth is nearly spherical, the haversine formula provides a good
|
|||
|
approximation of the distance between two points of the Earth surface, with
|
|||
|
a less than 1% error on average.
|
|||
|
|
|||
|
Examples
|
|||
|
--------
|
|||
|
We want to calculate the distance between the Ezeiza Airport
|
|||
|
(Buenos Aires, Argentina) and the Charles de Gaulle Airport (Paris,
|
|||
|
France).
|
|||
|
|
|||
|
>>> from sklearn.metrics.pairwise import haversine_distances
|
|||
|
>>> from math import radians
|
|||
|
>>> bsas = [-34.83333, -58.5166646]
|
|||
|
>>> paris = [49.0083899664, 2.53844117956]
|
|||
|
>>> bsas_in_radians = [radians(_) for _ in bsas]
|
|||
|
>>> paris_in_radians = [radians(_) for _ in paris]
|
|||
|
>>> result = haversine_distances([bsas_in_radians, paris_in_radians])
|
|||
|
>>> result * 6371000/1000 # multiply by Earth radius to get kilometers
|
|||
|
array([[ 0. , 11099.54035582],
|
|||
|
[11099.54035582, 0. ]])
|
|||
|
"""
|
|||
|
from ..metrics import DistanceMetric
|
|||
|
|
|||
|
return DistanceMetric.get_metric("haversine").pairwise(X, Y)
|
|||
|
|
|||
|
|
|||
|
@validate_params(
|
|||
|
{
|
|||
|
"X": ["array-like", "sparse matrix"],
|
|||
|
"Y": ["array-like", "sparse matrix", None],
|
|||
|
},
|
|||
|
prefer_skip_nested_validation=True,
|
|||
|
)
|
|||
|
def manhattan_distances(X, Y=None):
|
|||
|
"""Compute the L1 distances between the vectors in X and Y.
|
|||
|
|
|||
|
Read more in the :ref:`User Guide <metrics>`.
|
|||
|
|
|||
|
Parameters
|
|||
|
----------
|
|||
|
X : {array-like, sparse matrix} of shape (n_samples_X, n_features)
|
|||
|
An array where each row is a sample and each column is a feature.
|
|||
|
|
|||
|
Y : {array-like, sparse matrix} of shape (n_samples_Y, n_features), default=None
|
|||
|
An array where each row is a sample and each column is a feature.
|
|||
|
If `None`, method uses `Y=X`.
|
|||
|
|
|||
|
Returns
|
|||
|
-------
|
|||
|
distances : ndarray of shape (n_samples_X, n_samples_Y)
|
|||
|
Pairwise L1 distances.
|
|||
|
|
|||
|
Notes
|
|||
|
-----
|
|||
|
When X and/or Y are CSR sparse matrices and they are not already
|
|||
|
in canonical format, this function modifies them in-place to
|
|||
|
make them canonical.
|
|||
|
|
|||
|
Examples
|
|||
|
--------
|
|||
|
>>> from sklearn.metrics.pairwise import manhattan_distances
|
|||
|
>>> manhattan_distances([[3]], [[3]])
|
|||
|
array([[0.]])
|
|||
|
>>> manhattan_distances([[3]], [[2]])
|
|||
|
array([[1.]])
|
|||
|
>>> manhattan_distances([[2]], [[3]])
|
|||
|
array([[1.]])
|
|||
|
>>> manhattan_distances([[1, 2], [3, 4]],\
|
|||
|
[[1, 2], [0, 3]])
|
|||
|
array([[0., 2.],
|
|||
|
[4., 4.]])
|
|||
|
"""
|
|||
|
X, Y = check_pairwise_arrays(X, Y)
|
|||
|
|
|||
|
if issparse(X) or issparse(Y):
|
|||
|
X = csr_matrix(X, copy=False)
|
|||
|
Y = csr_matrix(Y, copy=False)
|
|||
|
X.sum_duplicates() # this also sorts indices in-place
|
|||
|
Y.sum_duplicates()
|
|||
|
D = np.zeros((X.shape[0], Y.shape[0]))
|
|||
|
_sparse_manhattan(X.data, X.indices, X.indptr, Y.data, Y.indices, Y.indptr, D)
|
|||
|
return D
|
|||
|
|
|||
|
return distance.cdist(X, Y, "cityblock")
|
|||
|
|
|||
|
|
|||
|
@validate_params(
|
|||
|
{
|
|||
|
"X": ["array-like", "sparse matrix"],
|
|||
|
"Y": ["array-like", "sparse matrix", None],
|
|||
|
},
|
|||
|
prefer_skip_nested_validation=True,
|
|||
|
)
|
|||
|
def cosine_distances(X, Y=None):
|
|||
|
"""Compute cosine distance between samples in X and Y.
|
|||
|
|
|||
|
Cosine distance is defined as 1.0 minus the cosine similarity.
|
|||
|
|
|||
|
Read more in the :ref:`User Guide <metrics>`.
|
|||
|
|
|||
|
Parameters
|
|||
|
----------
|
|||
|
X : {array-like, sparse matrix} of shape (n_samples_X, n_features)
|
|||
|
Matrix `X`.
|
|||
|
|
|||
|
Y : {array-like, sparse matrix} of shape (n_samples_Y, n_features), \
|
|||
|
default=None
|
|||
|
Matrix `Y`.
|
|||
|
|
|||
|
Returns
|
|||
|
-------
|
|||
|
distances : ndarray of shape (n_samples_X, n_samples_Y)
|
|||
|
Returns the cosine distance between samples in X and Y.
|
|||
|
|
|||
|
See Also
|
|||
|
--------
|
|||
|
cosine_similarity : Compute cosine similarity between samples in X and Y.
|
|||
|
scipy.spatial.distance.cosine : Dense matrices only.
|
|||
|
|
|||
|
Examples
|
|||
|
--------
|
|||
|
>>> from sklearn.metrics.pairwise import cosine_distances
|
|||
|
>>> X = [[0, 0, 0], [1, 1, 1]]
|
|||
|
>>> Y = [[1, 0, 0], [1, 1, 0]]
|
|||
|
>>> cosine_distances(X, Y)
|
|||
|
array([[1. , 1. ],
|
|||
|
[0.42..., 0.18...]])
|
|||
|
"""
|
|||
|
# 1.0 - cosine_similarity(X, Y) without copy
|
|||
|
S = cosine_similarity(X, Y)
|
|||
|
S *= -1
|
|||
|
S += 1
|
|||
|
np.clip(S, 0, 2, out=S)
|
|||
|
if X is Y or Y is None:
|
|||
|
# Ensure that distances between vectors and themselves are set to 0.0.
|
|||
|
# This may not be the case due to floating point rounding errors.
|
|||
|
np.fill_diagonal(S, 0.0)
|
|||
|
return S
|
|||
|
|
|||
|
|
|||
|
# Paired distances
|
|||
|
@validate_params(
|
|||
|
{"X": ["array-like", "sparse matrix"], "Y": ["array-like", "sparse matrix"]},
|
|||
|
prefer_skip_nested_validation=True,
|
|||
|
)
|
|||
|
def paired_euclidean_distances(X, Y):
|
|||
|
"""Compute the paired euclidean distances between X and Y.
|
|||
|
|
|||
|
Read more in the :ref:`User Guide <metrics>`.
|
|||
|
|
|||
|
Parameters
|
|||
|
----------
|
|||
|
X : {array-like, sparse matrix} of shape (n_samples, n_features)
|
|||
|
Input array/matrix X.
|
|||
|
|
|||
|
Y : {array-like, sparse matrix} of shape (n_samples, n_features)
|
|||
|
Input array/matrix Y.
|
|||
|
|
|||
|
Returns
|
|||
|
-------
|
|||
|
distances : ndarray of shape (n_samples,)
|
|||
|
Output array/matrix containing the calculated paired euclidean
|
|||
|
distances.
|
|||
|
|
|||
|
Examples
|
|||
|
--------
|
|||
|
>>> from sklearn.metrics.pairwise import paired_euclidean_distances
|
|||
|
>>> X = [[0, 0, 0], [1, 1, 1]]
|
|||
|
>>> Y = [[1, 0, 0], [1, 1, 0]]
|
|||
|
>>> paired_euclidean_distances(X, Y)
|
|||
|
array([1., 1.])
|
|||
|
"""
|
|||
|
X, Y = check_paired_arrays(X, Y)
|
|||
|
return row_norms(X - Y)
|
|||
|
|
|||
|
|
|||
|
@validate_params(
|
|||
|
{"X": ["array-like", "sparse matrix"], "Y": ["array-like", "sparse matrix"]},
|
|||
|
prefer_skip_nested_validation=True,
|
|||
|
)
|
|||
|
def paired_manhattan_distances(X, Y):
|
|||
|
"""Compute the paired L1 distances between X and Y.
|
|||
|
|
|||
|
Distances are calculated between (X[0], Y[0]), (X[1], Y[1]), ...,
|
|||
|
(X[n_samples], Y[n_samples]).
|
|||
|
|
|||
|
Read more in the :ref:`User Guide <metrics>`.
|
|||
|
|
|||
|
Parameters
|
|||
|
----------
|
|||
|
X : {array-like, sparse matrix} of shape (n_samples, n_features)
|
|||
|
An array-like where each row is a sample and each column is a feature.
|
|||
|
|
|||
|
Y : {array-like, sparse matrix} of shape (n_samples, n_features)
|
|||
|
An array-like where each row is a sample and each column is a feature.
|
|||
|
|
|||
|
Returns
|
|||
|
-------
|
|||
|
distances : ndarray of shape (n_samples,)
|
|||
|
L1 paired distances between the row vectors of `X`
|
|||
|
and the row vectors of `Y`.
|
|||
|
|
|||
|
Examples
|
|||
|
--------
|
|||
|
>>> from sklearn.metrics.pairwise import paired_manhattan_distances
|
|||
|
>>> import numpy as np
|
|||
|
>>> X = np.array([[1, 1, 0], [0, 1, 0], [0, 0, 1]])
|
|||
|
>>> Y = np.array([[0, 1, 0], [0, 0, 1], [0, 0, 0]])
|
|||
|
>>> paired_manhattan_distances(X, Y)
|
|||
|
array([1., 2., 1.])
|
|||
|
"""
|
|||
|
X, Y = check_paired_arrays(X, Y)
|
|||
|
diff = X - Y
|
|||
|
if issparse(diff):
|
|||
|
diff.data = np.abs(diff.data)
|
|||
|
return np.squeeze(np.array(diff.sum(axis=1)))
|
|||
|
else:
|
|||
|
return np.abs(diff).sum(axis=-1)
|
|||
|
|
|||
|
|
|||
|
@validate_params(
|
|||
|
{"X": ["array-like", "sparse matrix"], "Y": ["array-like", "sparse matrix"]},
|
|||
|
prefer_skip_nested_validation=True,
|
|||
|
)
|
|||
|
def paired_cosine_distances(X, Y):
|
|||
|
"""
|
|||
|
Compute the paired cosine distances between X and Y.
|
|||
|
|
|||
|
Read more in the :ref:`User Guide <metrics>`.
|
|||
|
|
|||
|
Parameters
|
|||
|
----------
|
|||
|
X : {array-like, sparse matrix} of shape (n_samples, n_features)
|
|||
|
An array where each row is a sample and each column is a feature.
|
|||
|
|
|||
|
Y : {array-like, sparse matrix} of shape (n_samples, n_features)
|
|||
|
An array where each row is a sample and each column is a feature.
|
|||
|
|
|||
|
Returns
|
|||
|
-------
|
|||
|
distances : ndarray of shape (n_samples,)
|
|||
|
Returns the distances between the row vectors of `X`
|
|||
|
and the row vectors of `Y`, where `distances[i]` is the
|
|||
|
distance between `X[i]` and `Y[i]`.
|
|||
|
|
|||
|
Notes
|
|||
|
-----
|
|||
|
The cosine distance is equivalent to the half the squared
|
|||
|
euclidean distance if each sample is normalized to unit norm.
|
|||
|
|
|||
|
Examples
|
|||
|
--------
|
|||
|
>>> from sklearn.metrics.pairwise import paired_cosine_distances
|
|||
|
>>> X = [[0, 0, 0], [1, 1, 1]]
|
|||
|
>>> Y = [[1, 0, 0], [1, 1, 0]]
|
|||
|
>>> paired_cosine_distances(X, Y)
|
|||
|
array([0.5 , 0.18...])
|
|||
|
"""
|
|||
|
X, Y = check_paired_arrays(X, Y)
|
|||
|
return 0.5 * row_norms(normalize(X) - normalize(Y), squared=True)
|
|||
|
|
|||
|
|
|||
|
PAIRED_DISTANCES = {
|
|||
|
"cosine": paired_cosine_distances,
|
|||
|
"euclidean": paired_euclidean_distances,
|
|||
|
"l2": paired_euclidean_distances,
|
|||
|
"l1": paired_manhattan_distances,
|
|||
|
"manhattan": paired_manhattan_distances,
|
|||
|
"cityblock": paired_manhattan_distances,
|
|||
|
}
|
|||
|
|
|||
|
|
|||
|
@validate_params(
|
|||
|
{
|
|||
|
"X": ["array-like"],
|
|||
|
"Y": ["array-like"],
|
|||
|
"metric": [StrOptions(set(PAIRED_DISTANCES)), callable],
|
|||
|
},
|
|||
|
prefer_skip_nested_validation=True,
|
|||
|
)
|
|||
|
def paired_distances(X, Y, *, metric="euclidean", **kwds):
|
|||
|
"""
|
|||
|
Compute the paired distances between X and Y.
|
|||
|
|
|||
|
Compute the distances between (X[0], Y[0]), (X[1], Y[1]), etc...
|
|||
|
|
|||
|
Read more in the :ref:`User Guide <metrics>`.
|
|||
|
|
|||
|
Parameters
|
|||
|
----------
|
|||
|
X : ndarray of shape (n_samples, n_features)
|
|||
|
Array 1 for distance computation.
|
|||
|
|
|||
|
Y : ndarray of shape (n_samples, n_features)
|
|||
|
Array 2 for distance computation.
|
|||
|
|
|||
|
metric : str or callable, default="euclidean"
|
|||
|
The metric to use when calculating distance between instances in a
|
|||
|
feature array. If metric is a string, it must be one of the options
|
|||
|
specified in PAIRED_DISTANCES, including "euclidean",
|
|||
|
"manhattan", or "cosine".
|
|||
|
Alternatively, if metric is a callable function, it is called on each
|
|||
|
pair of instances (rows) and the resulting value recorded. The callable
|
|||
|
should take two arrays from `X` as input and return a value indicating
|
|||
|
the distance between them.
|
|||
|
|
|||
|
**kwds : dict
|
|||
|
Unused parameters.
|
|||
|
|
|||
|
Returns
|
|||
|
-------
|
|||
|
distances : ndarray of shape (n_samples,)
|
|||
|
Returns the distances between the row vectors of `X`
|
|||
|
and the row vectors of `Y`.
|
|||
|
|
|||
|
See Also
|
|||
|
--------
|
|||
|
sklearn.metrics.pairwise_distances : Computes the distance between every pair of
|
|||
|
samples.
|
|||
|
|
|||
|
Examples
|
|||
|
--------
|
|||
|
>>> from sklearn.metrics.pairwise import paired_distances
|
|||
|
>>> X = [[0, 1], [1, 1]]
|
|||
|
>>> Y = [[0, 1], [2, 1]]
|
|||
|
>>> paired_distances(X, Y)
|
|||
|
array([0., 1.])
|
|||
|
"""
|
|||
|
|
|||
|
if metric in PAIRED_DISTANCES:
|
|||
|
func = PAIRED_DISTANCES[metric]
|
|||
|
return func(X, Y)
|
|||
|
elif callable(metric):
|
|||
|
# Check the matrix first (it is usually done by the metric)
|
|||
|
X, Y = check_paired_arrays(X, Y)
|
|||
|
distances = np.zeros(len(X))
|
|||
|
for i in range(len(X)):
|
|||
|
distances[i] = metric(X[i], Y[i])
|
|||
|
return distances
|
|||
|
|
|||
|
|
|||
|
# Kernels
|
|||
|
@validate_params(
|
|||
|
{
|
|||
|
"X": ["array-like", "sparse matrix"],
|
|||
|
"Y": ["array-like", "sparse matrix", None],
|
|||
|
"dense_output": ["boolean"],
|
|||
|
},
|
|||
|
prefer_skip_nested_validation=True,
|
|||
|
)
|
|||
|
def linear_kernel(X, Y=None, dense_output=True):
|
|||
|
"""
|
|||
|
Compute the linear kernel between X and Y.
|
|||
|
|
|||
|
Read more in the :ref:`User Guide <linear_kernel>`.
|
|||
|
|
|||
|
Parameters
|
|||
|
----------
|
|||
|
X : {array-like, sparse matrix} of shape (n_samples_X, n_features)
|
|||
|
A feature array.
|
|||
|
|
|||
|
Y : {array-like, sparse matrix} of shape (n_samples_Y, n_features), default=None
|
|||
|
An optional second feature array. If `None`, uses `Y=X`.
|
|||
|
|
|||
|
dense_output : bool, default=True
|
|||
|
Whether to return dense output even when the input is sparse. If
|
|||
|
``False``, the output is sparse if both input arrays are sparse.
|
|||
|
|
|||
|
.. versionadded:: 0.20
|
|||
|
|
|||
|
Returns
|
|||
|
-------
|
|||
|
kernel : ndarray of shape (n_samples_X, n_samples_Y)
|
|||
|
The Gram matrix of the linear kernel, i.e. `X @ Y.T`.
|
|||
|
|
|||
|
Examples
|
|||
|
--------
|
|||
|
>>> from sklearn.metrics.pairwise import linear_kernel
|
|||
|
>>> X = [[0, 0, 0], [1, 1, 1]]
|
|||
|
>>> Y = [[1, 0, 0], [1, 1, 0]]
|
|||
|
>>> linear_kernel(X, Y)
|
|||
|
array([[0., 0.],
|
|||
|
[1., 2.]])
|
|||
|
"""
|
|||
|
X, Y = check_pairwise_arrays(X, Y)
|
|||
|
return safe_sparse_dot(X, Y.T, dense_output=dense_output)
|
|||
|
|
|||
|
|
|||
|
@validate_params(
|
|||
|
{
|
|||
|
"X": ["array-like", "sparse matrix"],
|
|||
|
"Y": ["array-like", "sparse matrix", None],
|
|||
|
"degree": [Interval(Real, 1, None, closed="left")],
|
|||
|
"gamma": [
|
|||
|
Interval(Real, 0, None, closed="left"),
|
|||
|
None,
|
|||
|
Hidden(np.ndarray),
|
|||
|
],
|
|||
|
"coef0": [Interval(Real, None, None, closed="neither")],
|
|||
|
},
|
|||
|
prefer_skip_nested_validation=True,
|
|||
|
)
|
|||
|
def polynomial_kernel(X, Y=None, degree=3, gamma=None, coef0=1):
|
|||
|
"""
|
|||
|
Compute the polynomial kernel between X and Y.
|
|||
|
|
|||
|
K(X, Y) = (gamma <X, Y> + coef0) ^ degree
|
|||
|
|
|||
|
Read more in the :ref:`User Guide <polynomial_kernel>`.
|
|||
|
|
|||
|
Parameters
|
|||
|
----------
|
|||
|
X : {array-like, sparse matrix} of shape (n_samples_X, n_features)
|
|||
|
A feature array.
|
|||
|
|
|||
|
Y : {array-like, sparse matrix} of shape (n_samples_Y, n_features), default=None
|
|||
|
An optional second feature array. If `None`, uses `Y=X`.
|
|||
|
|
|||
|
degree : float, default=3
|
|||
|
Kernel degree.
|
|||
|
|
|||
|
gamma : float, default=None
|
|||
|
Coefficient of the vector inner product. If None, defaults to 1.0 / n_features.
|
|||
|
|
|||
|
coef0 : float, default=1
|
|||
|
Constant offset added to scaled inner product.
|
|||
|
|
|||
|
Returns
|
|||
|
-------
|
|||
|
kernel : ndarray of shape (n_samples_X, n_samples_Y)
|
|||
|
The polynomial kernel.
|
|||
|
|
|||
|
Examples
|
|||
|
--------
|
|||
|
>>> from sklearn.metrics.pairwise import polynomial_kernel
|
|||
|
>>> X = [[0, 0, 0], [1, 1, 1]]
|
|||
|
>>> Y = [[1, 0, 0], [1, 1, 0]]
|
|||
|
>>> polynomial_kernel(X, Y, degree=2)
|
|||
|
array([[1. , 1. ],
|
|||
|
[1.77..., 2.77...]])
|
|||
|
"""
|
|||
|
X, Y = check_pairwise_arrays(X, Y)
|
|||
|
if gamma is None:
|
|||
|
gamma = 1.0 / X.shape[1]
|
|||
|
|
|||
|
K = safe_sparse_dot(X, Y.T, dense_output=True)
|
|||
|
K *= gamma
|
|||
|
K += coef0
|
|||
|
K **= degree
|
|||
|
return K
|
|||
|
|
|||
|
|
|||
|
@validate_params(
|
|||
|
{
|
|||
|
"X": ["array-like", "sparse matrix"],
|
|||
|
"Y": ["array-like", "sparse matrix", None],
|
|||
|
"gamma": [
|
|||
|
Interval(Real, 0, None, closed="left"),
|
|||
|
None,
|
|||
|
Hidden(np.ndarray),
|
|||
|
],
|
|||
|
"coef0": [Interval(Real, None, None, closed="neither")],
|
|||
|
},
|
|||
|
prefer_skip_nested_validation=True,
|
|||
|
)
|
|||
|
def sigmoid_kernel(X, Y=None, gamma=None, coef0=1):
|
|||
|
"""Compute the sigmoid kernel between X and Y.
|
|||
|
|
|||
|
K(X, Y) = tanh(gamma <X, Y> + coef0)
|
|||
|
|
|||
|
Read more in the :ref:`User Guide <sigmoid_kernel>`.
|
|||
|
|
|||
|
Parameters
|
|||
|
----------
|
|||
|
X : {array-like, sparse matrix} of shape (n_samples_X, n_features)
|
|||
|
A feature array.
|
|||
|
|
|||
|
Y : {array-like, sparse matrix} of shape (n_samples_Y, n_features), default=None
|
|||
|
An optional second feature array. If `None`, uses `Y=X`.
|
|||
|
|
|||
|
gamma : float, default=None
|
|||
|
Coefficient of the vector inner product. If None, defaults to 1.0 / n_features.
|
|||
|
|
|||
|
coef0 : float, default=1
|
|||
|
Constant offset added to scaled inner product.
|
|||
|
|
|||
|
Returns
|
|||
|
-------
|
|||
|
kernel : ndarray of shape (n_samples_X, n_samples_Y)
|
|||
|
Sigmoid kernel between two arrays.
|
|||
|
|
|||
|
Examples
|
|||
|
--------
|
|||
|
>>> from sklearn.metrics.pairwise import sigmoid_kernel
|
|||
|
>>> X = [[0, 0, 0], [1, 1, 1]]
|
|||
|
>>> Y = [[1, 0, 0], [1, 1, 0]]
|
|||
|
>>> sigmoid_kernel(X, Y)
|
|||
|
array([[0.76..., 0.76...],
|
|||
|
[0.87..., 0.93...]])
|
|||
|
"""
|
|||
|
X, Y = check_pairwise_arrays(X, Y)
|
|||
|
if gamma is None:
|
|||
|
gamma = 1.0 / X.shape[1]
|
|||
|
|
|||
|
K = safe_sparse_dot(X, Y.T, dense_output=True)
|
|||
|
K *= gamma
|
|||
|
K += coef0
|
|||
|
np.tanh(K, K) # compute tanh in-place
|
|||
|
return K
|
|||
|
|
|||
|
|
|||
|
@validate_params(
|
|||
|
{
|
|||
|
"X": ["array-like", "sparse matrix"],
|
|||
|
"Y": ["array-like", "sparse matrix", None],
|
|||
|
"gamma": [
|
|||
|
Interval(Real, 0, None, closed="left"),
|
|||
|
None,
|
|||
|
Hidden(np.ndarray),
|
|||
|
],
|
|||
|
},
|
|||
|
prefer_skip_nested_validation=True,
|
|||
|
)
|
|||
|
def rbf_kernel(X, Y=None, gamma=None):
|
|||
|
"""Compute the rbf (gaussian) kernel between X and Y.
|
|||
|
|
|||
|
K(x, y) = exp(-gamma ||x-y||^2)
|
|||
|
|
|||
|
for each pair of rows x in X and y in Y.
|
|||
|
|
|||
|
Read more in the :ref:`User Guide <rbf_kernel>`.
|
|||
|
|
|||
|
Parameters
|
|||
|
----------
|
|||
|
X : {array-like, sparse matrix} of shape (n_samples_X, n_features)
|
|||
|
A feature array.
|
|||
|
|
|||
|
Y : {array-like, sparse matrix} of shape (n_samples_Y, n_features), default=None
|
|||
|
An optional second feature array. If `None`, uses `Y=X`.
|
|||
|
|
|||
|
gamma : float, default=None
|
|||
|
If None, defaults to 1.0 / n_features.
|
|||
|
|
|||
|
Returns
|
|||
|
-------
|
|||
|
kernel : ndarray of shape (n_samples_X, n_samples_Y)
|
|||
|
The RBF kernel.
|
|||
|
|
|||
|
Examples
|
|||
|
--------
|
|||
|
>>> from sklearn.metrics.pairwise import rbf_kernel
|
|||
|
>>> X = [[0, 0, 0], [1, 1, 1]]
|
|||
|
>>> Y = [[1, 0, 0], [1, 1, 0]]
|
|||
|
>>> rbf_kernel(X, Y)
|
|||
|
array([[0.71..., 0.51...],
|
|||
|
[0.51..., 0.71...]])
|
|||
|
"""
|
|||
|
X, Y = check_pairwise_arrays(X, Y)
|
|||
|
if gamma is None:
|
|||
|
gamma = 1.0 / X.shape[1]
|
|||
|
|
|||
|
K = euclidean_distances(X, Y, squared=True)
|
|||
|
K *= -gamma
|
|||
|
np.exp(K, K) # exponentiate K in-place
|
|||
|
return K
|
|||
|
|
|||
|
|
|||
|
@validate_params(
|
|||
|
{
|
|||
|
"X": ["array-like", "sparse matrix"],
|
|||
|
"Y": ["array-like", "sparse matrix", None],
|
|||
|
"gamma": [
|
|||
|
Interval(Real, 0, None, closed="neither"),
|
|||
|
Hidden(np.ndarray),
|
|||
|
None,
|
|||
|
],
|
|||
|
},
|
|||
|
prefer_skip_nested_validation=True,
|
|||
|
)
|
|||
|
def laplacian_kernel(X, Y=None, gamma=None):
|
|||
|
"""Compute the laplacian kernel between X and Y.
|
|||
|
|
|||
|
The laplacian kernel is defined as::
|
|||
|
|
|||
|
K(x, y) = exp(-gamma ||x-y||_1)
|
|||
|
|
|||
|
for each pair of rows x in X and y in Y.
|
|||
|
Read more in the :ref:`User Guide <laplacian_kernel>`.
|
|||
|
|
|||
|
.. versionadded:: 0.17
|
|||
|
|
|||
|
Parameters
|
|||
|
----------
|
|||
|
X : {array-like, sparse matrix} of shape (n_samples_X, n_features)
|
|||
|
A feature array.
|
|||
|
|
|||
|
Y : {array-like, sparse matrix} of shape (n_samples_Y, n_features), default=None
|
|||
|
An optional second feature array. If `None`, uses `Y=X`.
|
|||
|
|
|||
|
gamma : float, default=None
|
|||
|
If None, defaults to 1.0 / n_features. Otherwise it should be strictly positive.
|
|||
|
|
|||
|
Returns
|
|||
|
-------
|
|||
|
kernel : ndarray of shape (n_samples_X, n_samples_Y)
|
|||
|
The kernel matrix.
|
|||
|
|
|||
|
Examples
|
|||
|
--------
|
|||
|
>>> from sklearn.metrics.pairwise import laplacian_kernel
|
|||
|
>>> X = [[0, 0, 0], [1, 1, 1]]
|
|||
|
>>> Y = [[1, 0, 0], [1, 1, 0]]
|
|||
|
>>> laplacian_kernel(X, Y)
|
|||
|
array([[0.71..., 0.51...],
|
|||
|
[0.51..., 0.71...]])
|
|||
|
"""
|
|||
|
X, Y = check_pairwise_arrays(X, Y)
|
|||
|
if gamma is None:
|
|||
|
gamma = 1.0 / X.shape[1]
|
|||
|
|
|||
|
K = -gamma * manhattan_distances(X, Y)
|
|||
|
np.exp(K, K) # exponentiate K in-place
|
|||
|
return K
|
|||
|
|
|||
|
|
|||
|
@validate_params(
|
|||
|
{
|
|||
|
"X": ["array-like", "sparse matrix"],
|
|||
|
"Y": ["array-like", "sparse matrix", None],
|
|||
|
"dense_output": ["boolean"],
|
|||
|
},
|
|||
|
prefer_skip_nested_validation=True,
|
|||
|
)
|
|||
|
def cosine_similarity(X, Y=None, dense_output=True):
|
|||
|
"""Compute cosine similarity between samples in X and Y.
|
|||
|
|
|||
|
Cosine similarity, or the cosine kernel, computes similarity as the
|
|||
|
normalized dot product of X and Y:
|
|||
|
|
|||
|
K(X, Y) = <X, Y> / (||X||*||Y||)
|
|||
|
|
|||
|
On L2-normalized data, this function is equivalent to linear_kernel.
|
|||
|
|
|||
|
Read more in the :ref:`User Guide <cosine_similarity>`.
|
|||
|
|
|||
|
Parameters
|
|||
|
----------
|
|||
|
X : {array-like, sparse matrix} of shape (n_samples_X, n_features)
|
|||
|
Input data.
|
|||
|
|
|||
|
Y : {array-like, sparse matrix} of shape (n_samples_Y, n_features), \
|
|||
|
default=None
|
|||
|
Input data. If ``None``, the output will be the pairwise
|
|||
|
similarities between all samples in ``X``.
|
|||
|
|
|||
|
dense_output : bool, default=True
|
|||
|
Whether to return dense output even when the input is sparse. If
|
|||
|
``False``, the output is sparse if both input arrays are sparse.
|
|||
|
|
|||
|
.. versionadded:: 0.17
|
|||
|
parameter ``dense_output`` for dense output.
|
|||
|
|
|||
|
Returns
|
|||
|
-------
|
|||
|
similarities : ndarray or sparse matrix of shape (n_samples_X, n_samples_Y)
|
|||
|
Returns the cosine similarity between samples in X and Y.
|
|||
|
|
|||
|
Examples
|
|||
|
--------
|
|||
|
>>> from sklearn.metrics.pairwise import cosine_similarity
|
|||
|
>>> X = [[0, 0, 0], [1, 1, 1]]
|
|||
|
>>> Y = [[1, 0, 0], [1, 1, 0]]
|
|||
|
>>> cosine_similarity(X, Y)
|
|||
|
array([[0. , 0. ],
|
|||
|
[0.57..., 0.81...]])
|
|||
|
"""
|
|||
|
# to avoid recursive import
|
|||
|
|
|||
|
X, Y = check_pairwise_arrays(X, Y)
|
|||
|
|
|||
|
X_normalized = normalize(X, copy=True)
|
|||
|
if X is Y:
|
|||
|
Y_normalized = X_normalized
|
|||
|
else:
|
|||
|
Y_normalized = normalize(Y, copy=True)
|
|||
|
|
|||
|
K = safe_sparse_dot(X_normalized, Y_normalized.T, dense_output=dense_output)
|
|||
|
|
|||
|
return K
|
|||
|
|
|||
|
|
|||
|
@validate_params(
|
|||
|
{"X": ["array-like"], "Y": ["array-like", None]},
|
|||
|
prefer_skip_nested_validation=True,
|
|||
|
)
|
|||
|
def additive_chi2_kernel(X, Y=None):
|
|||
|
"""Compute the additive chi-squared kernel between observations in X and Y.
|
|||
|
|
|||
|
The chi-squared kernel is computed between each pair of rows in X and Y. X
|
|||
|
and Y have to be non-negative. This kernel is most commonly applied to
|
|||
|
histograms.
|
|||
|
|
|||
|
The chi-squared kernel is given by::
|
|||
|
|
|||
|
k(x, y) = -Sum [(x - y)^2 / (x + y)]
|
|||
|
|
|||
|
It can be interpreted as a weighted difference per entry.
|
|||
|
|
|||
|
Read more in the :ref:`User Guide <chi2_kernel>`.
|
|||
|
|
|||
|
Parameters
|
|||
|
----------
|
|||
|
X : array-like of shape (n_samples_X, n_features)
|
|||
|
A feature array.
|
|||
|
|
|||
|
Y : array-like of shape (n_samples_Y, n_features), default=None
|
|||
|
An optional second feature array. If `None`, uses `Y=X`.
|
|||
|
|
|||
|
Returns
|
|||
|
-------
|
|||
|
kernel : ndarray of shape (n_samples_X, n_samples_Y)
|
|||
|
The kernel matrix.
|
|||
|
|
|||
|
See Also
|
|||
|
--------
|
|||
|
chi2_kernel : The exponentiated version of the kernel, which is usually
|
|||
|
preferable.
|
|||
|
sklearn.kernel_approximation.AdditiveChi2Sampler : A Fourier approximation
|
|||
|
to this kernel.
|
|||
|
|
|||
|
Notes
|
|||
|
-----
|
|||
|
As the negative of a distance, this kernel is only conditionally positive
|
|||
|
definite.
|
|||
|
|
|||
|
References
|
|||
|
----------
|
|||
|
* Zhang, J. and Marszalek, M. and Lazebnik, S. and Schmid, C.
|
|||
|
Local features and kernels for classification of texture and object
|
|||
|
categories: A comprehensive study
|
|||
|
International Journal of Computer Vision 2007
|
|||
|
https://hal.archives-ouvertes.fr/hal-00171412/document
|
|||
|
|
|||
|
Examples
|
|||
|
--------
|
|||
|
>>> from sklearn.metrics.pairwise import additive_chi2_kernel
|
|||
|
>>> X = [[0, 0, 0], [1, 1, 1]]
|
|||
|
>>> Y = [[1, 0, 0], [1, 1, 0]]
|
|||
|
>>> additive_chi2_kernel(X, Y)
|
|||
|
array([[-1., -2.],
|
|||
|
[-2., -1.]])
|
|||
|
"""
|
|||
|
X, Y = check_pairwise_arrays(X, Y, accept_sparse=False)
|
|||
|
if (X < 0).any():
|
|||
|
raise ValueError("X contains negative values.")
|
|||
|
if Y is not X and (Y < 0).any():
|
|||
|
raise ValueError("Y contains negative values.")
|
|||
|
|
|||
|
result = np.zeros((X.shape[0], Y.shape[0]), dtype=X.dtype)
|
|||
|
_chi2_kernel_fast(X, Y, result)
|
|||
|
return result
|
|||
|
|
|||
|
|
|||
|
@validate_params(
|
|||
|
{
|
|||
|
"X": ["array-like"],
|
|||
|
"Y": ["array-like", None],
|
|||
|
"gamma": [Interval(Real, 0, None, closed="neither"), Hidden(np.ndarray)],
|
|||
|
},
|
|||
|
prefer_skip_nested_validation=True,
|
|||
|
)
|
|||
|
def chi2_kernel(X, Y=None, gamma=1.0):
|
|||
|
"""Compute the exponential chi-squared kernel between X and Y.
|
|||
|
|
|||
|
The chi-squared kernel is computed between each pair of rows in X and Y. X
|
|||
|
and Y have to be non-negative. This kernel is most commonly applied to
|
|||
|
histograms.
|
|||
|
|
|||
|
The chi-squared kernel is given by::
|
|||
|
|
|||
|
k(x, y) = exp(-gamma Sum [(x - y)^2 / (x + y)])
|
|||
|
|
|||
|
It can be interpreted as a weighted difference per entry.
|
|||
|
|
|||
|
Read more in the :ref:`User Guide <chi2_kernel>`.
|
|||
|
|
|||
|
Parameters
|
|||
|
----------
|
|||
|
X : array-like of shape (n_samples_X, n_features)
|
|||
|
A feature array.
|
|||
|
|
|||
|
Y : array-like of shape (n_samples_Y, n_features), default=None
|
|||
|
An optional second feature array. If `None`, uses `Y=X`.
|
|||
|
|
|||
|
gamma : float, default=1
|
|||
|
Scaling parameter of the chi2 kernel.
|
|||
|
|
|||
|
Returns
|
|||
|
-------
|
|||
|
kernel : ndarray of shape (n_samples_X, n_samples_Y)
|
|||
|
The kernel matrix.
|
|||
|
|
|||
|
See Also
|
|||
|
--------
|
|||
|
additive_chi2_kernel : The additive version of this kernel.
|
|||
|
sklearn.kernel_approximation.AdditiveChi2Sampler : A Fourier approximation
|
|||
|
to the additive version of this kernel.
|
|||
|
|
|||
|
References
|
|||
|
----------
|
|||
|
* Zhang, J. and Marszalek, M. and Lazebnik, S. and Schmid, C.
|
|||
|
Local features and kernels for classification of texture and object
|
|||
|
categories: A comprehensive study
|
|||
|
International Journal of Computer Vision 2007
|
|||
|
https://hal.archives-ouvertes.fr/hal-00171412/document
|
|||
|
|
|||
|
Examples
|
|||
|
--------
|
|||
|
>>> from sklearn.metrics.pairwise import chi2_kernel
|
|||
|
>>> X = [[0, 0, 0], [1, 1, 1]]
|
|||
|
>>> Y = [[1, 0, 0], [1, 1, 0]]
|
|||
|
>>> chi2_kernel(X, Y)
|
|||
|
array([[0.36..., 0.13...],
|
|||
|
[0.13..., 0.36...]])
|
|||
|
"""
|
|||
|
K = additive_chi2_kernel(X, Y)
|
|||
|
K *= gamma
|
|||
|
return np.exp(K, K)
|
|||
|
|
|||
|
|
|||
|
# Helper functions - distance
|
|||
|
PAIRWISE_DISTANCE_FUNCTIONS = {
|
|||
|
# If updating this dictionary, update the doc in both distance_metrics()
|
|||
|
# and also in pairwise_distances()!
|
|||
|
"cityblock": manhattan_distances,
|
|||
|
"cosine": cosine_distances,
|
|||
|
"euclidean": euclidean_distances,
|
|||
|
"haversine": haversine_distances,
|
|||
|
"l2": euclidean_distances,
|
|||
|
"l1": manhattan_distances,
|
|||
|
"manhattan": manhattan_distances,
|
|||
|
"precomputed": None, # HACK: precomputed is always allowed, never called
|
|||
|
"nan_euclidean": nan_euclidean_distances,
|
|||
|
}
|
|||
|
|
|||
|
|
|||
|
def distance_metrics():
|
|||
|
"""Valid metrics for pairwise_distances.
|
|||
|
|
|||
|
This function simply returns the valid pairwise distance metrics.
|
|||
|
It exists to allow for a description of the mapping for
|
|||
|
each of the valid strings.
|
|||
|
|
|||
|
The valid distance metrics, and the function they map to, are:
|
|||
|
|
|||
|
=============== ========================================
|
|||
|
metric Function
|
|||
|
=============== ========================================
|
|||
|
'cityblock' metrics.pairwise.manhattan_distances
|
|||
|
'cosine' metrics.pairwise.cosine_distances
|
|||
|
'euclidean' metrics.pairwise.euclidean_distances
|
|||
|
'haversine' metrics.pairwise.haversine_distances
|
|||
|
'l1' metrics.pairwise.manhattan_distances
|
|||
|
'l2' metrics.pairwise.euclidean_distances
|
|||
|
'manhattan' metrics.pairwise.manhattan_distances
|
|||
|
'nan_euclidean' metrics.pairwise.nan_euclidean_distances
|
|||
|
=============== ========================================
|
|||
|
|
|||
|
Read more in the :ref:`User Guide <metrics>`.
|
|||
|
|
|||
|
Returns
|
|||
|
-------
|
|||
|
distance_metrics : dict
|
|||
|
Returns valid metrics for pairwise_distances.
|
|||
|
"""
|
|||
|
return PAIRWISE_DISTANCE_FUNCTIONS
|
|||
|
|
|||
|
|
|||
|
def _dist_wrapper(dist_func, dist_matrix, slice_, *args, **kwargs):
|
|||
|
"""Write in-place to a slice of a distance matrix."""
|
|||
|
dist_matrix[:, slice_] = dist_func(*args, **kwargs)
|
|||
|
|
|||
|
|
|||
|
def _parallel_pairwise(X, Y, func, n_jobs, **kwds):
|
|||
|
"""Break the pairwise matrix in n_jobs even slices
|
|||
|
and compute them in parallel."""
|
|||
|
|
|||
|
if Y is None:
|
|||
|
Y = X
|
|||
|
X, Y, dtype = _return_float_dtype(X, Y)
|
|||
|
|
|||
|
if effective_n_jobs(n_jobs) == 1:
|
|||
|
return func(X, Y, **kwds)
|
|||
|
|
|||
|
# enforce a threading backend to prevent data communication overhead
|
|||
|
fd = delayed(_dist_wrapper)
|
|||
|
ret = np.empty((X.shape[0], Y.shape[0]), dtype=dtype, order="F")
|
|||
|
Parallel(backend="threading", n_jobs=n_jobs)(
|
|||
|
fd(func, ret, s, X, Y[s], **kwds)
|
|||
|
for s in gen_even_slices(_num_samples(Y), effective_n_jobs(n_jobs))
|
|||
|
)
|
|||
|
|
|||
|
if (X is Y or Y is None) and func is euclidean_distances:
|
|||
|
# zeroing diagonal for euclidean norm.
|
|||
|
# TODO: do it also for other norms.
|
|||
|
np.fill_diagonal(ret, 0)
|
|||
|
|
|||
|
return ret
|
|||
|
|
|||
|
|
|||
|
def _pairwise_callable(X, Y, metric, force_all_finite=True, **kwds):
|
|||
|
"""Handle the callable case for pairwise_{distances,kernels}."""
|
|||
|
X, Y = check_pairwise_arrays(
|
|||
|
X,
|
|||
|
Y,
|
|||
|
dtype=None,
|
|||
|
force_all_finite=force_all_finite,
|
|||
|
ensure_2d=False,
|
|||
|
)
|
|||
|
|
|||
|
if X is Y:
|
|||
|
# Only calculate metric for upper triangle
|
|||
|
out = np.zeros((X.shape[0], Y.shape[0]), dtype="float")
|
|||
|
iterator = itertools.combinations(range(X.shape[0]), 2)
|
|||
|
for i, j in iterator:
|
|||
|
# scipy has not yet implemented 1D sparse slices; once implemented this can
|
|||
|
# be removed and `arr[ind]` can be simply used.
|
|||
|
x = X[[i], :] if issparse(X) else X[i]
|
|||
|
y = Y[[j], :] if issparse(Y) else Y[j]
|
|||
|
out[i, j] = metric(x, y, **kwds)
|
|||
|
|
|||
|
# Make symmetric
|
|||
|
# NB: out += out.T will produce incorrect results
|
|||
|
out = out + out.T
|
|||
|
|
|||
|
# Calculate diagonal
|
|||
|
# NB: nonzero diagonals are allowed for both metrics and kernels
|
|||
|
for i in range(X.shape[0]):
|
|||
|
# scipy has not yet implemented 1D sparse slices; once implemented this can
|
|||
|
# be removed and `arr[ind]` can be simply used.
|
|||
|
x = X[[i], :] if issparse(X) else X[i]
|
|||
|
out[i, i] = metric(x, x, **kwds)
|
|||
|
|
|||
|
else:
|
|||
|
# Calculate all cells
|
|||
|
out = np.empty((X.shape[0], Y.shape[0]), dtype="float")
|
|||
|
iterator = itertools.product(range(X.shape[0]), range(Y.shape[0]))
|
|||
|
for i, j in iterator:
|
|||
|
# scipy has not yet implemented 1D sparse slices; once implemented this can
|
|||
|
# be removed and `arr[ind]` can be simply used.
|
|||
|
x = X[[i], :] if issparse(X) else X[i]
|
|||
|
y = Y[[j], :] if issparse(Y) else Y[j]
|
|||
|
out[i, j] = metric(x, y, **kwds)
|
|||
|
|
|||
|
return out
|
|||
|
|
|||
|
|
|||
|
def _check_chunk_size(reduced, chunk_size):
|
|||
|
"""Checks chunk is a sequence of expected size or a tuple of same."""
|
|||
|
if reduced is None:
|
|||
|
return
|
|||
|
is_tuple = isinstance(reduced, tuple)
|
|||
|
if not is_tuple:
|
|||
|
reduced = (reduced,)
|
|||
|
if any(isinstance(r, tuple) or not hasattr(r, "__iter__") for r in reduced):
|
|||
|
raise TypeError(
|
|||
|
"reduce_func returned %r. Expected sequence(s) of length %d."
|
|||
|
% (reduced if is_tuple else reduced[0], chunk_size)
|
|||
|
)
|
|||
|
if any(_num_samples(r) != chunk_size for r in reduced):
|
|||
|
actual_size = tuple(_num_samples(r) for r in reduced)
|
|||
|
raise ValueError(
|
|||
|
"reduce_func returned object of length %s. "
|
|||
|
"Expected same length as input: %d."
|
|||
|
% (actual_size if is_tuple else actual_size[0], chunk_size)
|
|||
|
)
|
|||
|
|
|||
|
|
|||
|
def _precompute_metric_params(X, Y, metric=None, **kwds):
|
|||
|
"""Precompute data-derived metric parameters if not provided."""
|
|||
|
if metric == "seuclidean" and "V" not in kwds:
|
|||
|
if X is Y:
|
|||
|
V = np.var(X, axis=0, ddof=1)
|
|||
|
else:
|
|||
|
raise ValueError(
|
|||
|
"The 'V' parameter is required for the seuclidean metric "
|
|||
|
"when Y is passed."
|
|||
|
)
|
|||
|
return {"V": V}
|
|||
|
if metric == "mahalanobis" and "VI" not in kwds:
|
|||
|
if X is Y:
|
|||
|
VI = np.linalg.inv(np.cov(X.T)).T
|
|||
|
else:
|
|||
|
raise ValueError(
|
|||
|
"The 'VI' parameter is required for the mahalanobis metric "
|
|||
|
"when Y is passed."
|
|||
|
)
|
|||
|
return {"VI": VI}
|
|||
|
return {}
|
|||
|
|
|||
|
|
|||
|
@validate_params(
|
|||
|
{
|
|||
|
"X": ["array-like", "sparse matrix"],
|
|||
|
"Y": ["array-like", "sparse matrix", None],
|
|||
|
"reduce_func": [callable, None],
|
|||
|
"metric": [StrOptions({"precomputed"}.union(_VALID_METRICS)), callable],
|
|||
|
"n_jobs": [Integral, None],
|
|||
|
"working_memory": [Interval(Real, 0, None, closed="left"), None],
|
|||
|
},
|
|||
|
prefer_skip_nested_validation=False, # metric is not validated yet
|
|||
|
)
|
|||
|
def pairwise_distances_chunked(
|
|||
|
X,
|
|||
|
Y=None,
|
|||
|
*,
|
|||
|
reduce_func=None,
|
|||
|
metric="euclidean",
|
|||
|
n_jobs=None,
|
|||
|
working_memory=None,
|
|||
|
**kwds,
|
|||
|
):
|
|||
|
"""Generate a distance matrix chunk by chunk with optional reduction.
|
|||
|
|
|||
|
In cases where not all of a pairwise distance matrix needs to be
|
|||
|
stored at once, this is used to calculate pairwise distances in
|
|||
|
``working_memory``-sized chunks. If ``reduce_func`` is given, it is
|
|||
|
run on each chunk and its return values are concatenated into lists,
|
|||
|
arrays or sparse matrices.
|
|||
|
|
|||
|
Parameters
|
|||
|
----------
|
|||
|
X : {array-like, sparse matrix} of shape (n_samples_X, n_samples_X) or \
|
|||
|
(n_samples_X, n_features)
|
|||
|
Array of pairwise distances between samples, or a feature array.
|
|||
|
The shape the array should be (n_samples_X, n_samples_X) if
|
|||
|
metric='precomputed' and (n_samples_X, n_features) otherwise.
|
|||
|
|
|||
|
Y : {array-like, sparse matrix} of shape (n_samples_Y, n_features), default=None
|
|||
|
An optional second feature array. Only allowed if
|
|||
|
metric != "precomputed".
|
|||
|
|
|||
|
reduce_func : callable, default=None
|
|||
|
The function which is applied on each chunk of the distance matrix,
|
|||
|
reducing it to needed values. ``reduce_func(D_chunk, start)``
|
|||
|
is called repeatedly, where ``D_chunk`` is a contiguous vertical
|
|||
|
slice of the pairwise distance matrix, starting at row ``start``.
|
|||
|
It should return one of: None; an array, a list, or a sparse matrix
|
|||
|
of length ``D_chunk.shape[0]``; or a tuple of such objects.
|
|||
|
Returning None is useful for in-place operations, rather than
|
|||
|
reductions.
|
|||
|
|
|||
|
If None, pairwise_distances_chunked returns a generator of vertical
|
|||
|
chunks of the distance matrix.
|
|||
|
|
|||
|
metric : str or callable, default='euclidean'
|
|||
|
The metric to use when calculating distance between instances in a
|
|||
|
feature array. If metric is a string, it must be one of the options
|
|||
|
allowed by scipy.spatial.distance.pdist for its metric parameter,
|
|||
|
or a metric listed in pairwise.PAIRWISE_DISTANCE_FUNCTIONS.
|
|||
|
If metric is "precomputed", X is assumed to be a distance matrix.
|
|||
|
Alternatively, if metric is a callable function, it is called on
|
|||
|
each pair of instances (rows) and the resulting value recorded.
|
|||
|
The callable should take two arrays from X as input and return a
|
|||
|
value indicating the distance between them.
|
|||
|
|
|||
|
n_jobs : int, default=None
|
|||
|
The number of jobs to use for the computation. This works by
|
|||
|
breaking down the pairwise matrix into n_jobs even slices and
|
|||
|
computing them in parallel.
|
|||
|
|
|||
|
``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.
|
|||
|
``-1`` means using all processors. See :term:`Glossary <n_jobs>`
|
|||
|
for more details.
|
|||
|
|
|||
|
working_memory : float, default=None
|
|||
|
The sought maximum memory for temporary distance matrix chunks.
|
|||
|
When None (default), the value of
|
|||
|
``sklearn.get_config()['working_memory']`` is used.
|
|||
|
|
|||
|
**kwds : optional keyword parameters
|
|||
|
Any further parameters are passed directly to the distance function.
|
|||
|
If using a scipy.spatial.distance metric, the parameters are still
|
|||
|
metric dependent. See the scipy docs for usage examples.
|
|||
|
|
|||
|
Yields
|
|||
|
------
|
|||
|
D_chunk : {ndarray, sparse matrix}
|
|||
|
A contiguous slice of distance matrix, optionally processed by
|
|||
|
``reduce_func``.
|
|||
|
|
|||
|
Examples
|
|||
|
--------
|
|||
|
Without reduce_func:
|
|||
|
|
|||
|
>>> import numpy as np
|
|||
|
>>> from sklearn.metrics import pairwise_distances_chunked
|
|||
|
>>> X = np.random.RandomState(0).rand(5, 3)
|
|||
|
>>> D_chunk = next(pairwise_distances_chunked(X))
|
|||
|
>>> D_chunk
|
|||
|
array([[0. ..., 0.29..., 0.41..., 0.19..., 0.57...],
|
|||
|
[0.29..., 0. ..., 0.57..., 0.41..., 0.76...],
|
|||
|
[0.41..., 0.57..., 0. ..., 0.44..., 0.90...],
|
|||
|
[0.19..., 0.41..., 0.44..., 0. ..., 0.51...],
|
|||
|
[0.57..., 0.76..., 0.90..., 0.51..., 0. ...]])
|
|||
|
|
|||
|
Retrieve all neighbors and average distance within radius r:
|
|||
|
|
|||
|
>>> r = .2
|
|||
|
>>> def reduce_func(D_chunk, start):
|
|||
|
... neigh = [np.flatnonzero(d < r) for d in D_chunk]
|
|||
|
... avg_dist = (D_chunk * (D_chunk < r)).mean(axis=1)
|
|||
|
... return neigh, avg_dist
|
|||
|
>>> gen = pairwise_distances_chunked(X, reduce_func=reduce_func)
|
|||
|
>>> neigh, avg_dist = next(gen)
|
|||
|
>>> neigh
|
|||
|
[array([0, 3]), array([1]), array([2]), array([0, 3]), array([4])]
|
|||
|
>>> avg_dist
|
|||
|
array([0.039..., 0. , 0. , 0.039..., 0. ])
|
|||
|
|
|||
|
Where r is defined per sample, we need to make use of ``start``:
|
|||
|
|
|||
|
>>> r = [.2, .4, .4, .3, .1]
|
|||
|
>>> def reduce_func(D_chunk, start):
|
|||
|
... neigh = [np.flatnonzero(d < r[i])
|
|||
|
... for i, d in enumerate(D_chunk, start)]
|
|||
|
... return neigh
|
|||
|
>>> neigh = next(pairwise_distances_chunked(X, reduce_func=reduce_func))
|
|||
|
>>> neigh
|
|||
|
[array([0, 3]), array([0, 1]), array([2]), array([0, 3]), array([4])]
|
|||
|
|
|||
|
Force row-by-row generation by reducing ``working_memory``:
|
|||
|
|
|||
|
>>> gen = pairwise_distances_chunked(X, reduce_func=reduce_func,
|
|||
|
... working_memory=0)
|
|||
|
>>> next(gen)
|
|||
|
[array([0, 3])]
|
|||
|
>>> next(gen)
|
|||
|
[array([0, 1])]
|
|||
|
"""
|
|||
|
n_samples_X = _num_samples(X)
|
|||
|
if metric == "precomputed":
|
|||
|
slices = (slice(0, n_samples_X),)
|
|||
|
else:
|
|||
|
if Y is None:
|
|||
|
Y = X
|
|||
|
# We get as many rows as possible within our working_memory budget to
|
|||
|
# store len(Y) distances in each row of output.
|
|||
|
#
|
|||
|
# Note:
|
|||
|
# - this will get at least 1 row, even if 1 row of distances will
|
|||
|
# exceed working_memory.
|
|||
|
# - this does not account for any temporary memory usage while
|
|||
|
# calculating distances (e.g. difference of vectors in manhattan
|
|||
|
# distance.
|
|||
|
chunk_n_rows = get_chunk_n_rows(
|
|||
|
row_bytes=8 * _num_samples(Y),
|
|||
|
max_n_rows=n_samples_X,
|
|||
|
working_memory=working_memory,
|
|||
|
)
|
|||
|
slices = gen_batches(n_samples_X, chunk_n_rows)
|
|||
|
|
|||
|
# precompute data-derived metric params
|
|||
|
params = _precompute_metric_params(X, Y, metric=metric, **kwds)
|
|||
|
kwds.update(**params)
|
|||
|
|
|||
|
for sl in slices:
|
|||
|
if sl.start == 0 and sl.stop == n_samples_X:
|
|||
|
X_chunk = X # enable optimised paths for X is Y
|
|||
|
else:
|
|||
|
X_chunk = X[sl]
|
|||
|
D_chunk = pairwise_distances(X_chunk, Y, metric=metric, n_jobs=n_jobs, **kwds)
|
|||
|
if (X is Y or Y is None) and PAIRWISE_DISTANCE_FUNCTIONS.get(
|
|||
|
metric, None
|
|||
|
) is euclidean_distances:
|
|||
|
# zeroing diagonal, taking care of aliases of "euclidean",
|
|||
|
# i.e. "l2"
|
|||
|
D_chunk.flat[sl.start :: _num_samples(X) + 1] = 0
|
|||
|
if reduce_func is not None:
|
|||
|
chunk_size = D_chunk.shape[0]
|
|||
|
D_chunk = reduce_func(D_chunk, sl.start)
|
|||
|
_check_chunk_size(D_chunk, chunk_size)
|
|||
|
yield D_chunk
|
|||
|
|
|||
|
|
|||
|
@validate_params(
|
|||
|
{
|
|||
|
"X": ["array-like", "sparse matrix"],
|
|||
|
"Y": ["array-like", "sparse matrix", None],
|
|||
|
"metric": [StrOptions(set(_VALID_METRICS) | {"precomputed"}), callable],
|
|||
|
"n_jobs": [Integral, None],
|
|||
|
"force_all_finite": ["boolean", StrOptions({"allow-nan"})],
|
|||
|
},
|
|||
|
prefer_skip_nested_validation=True,
|
|||
|
)
|
|||
|
def pairwise_distances(
|
|||
|
X,
|
|||
|
Y=None,
|
|||
|
metric="euclidean",
|
|||
|
*,
|
|||
|
n_jobs=None,
|
|||
|
force_all_finite=True,
|
|||
|
**kwds,
|
|||
|
):
|
|||
|
"""Compute the distance matrix from a vector array X and optional Y.
|
|||
|
|
|||
|
This method takes either a vector array or a distance matrix, and returns
|
|||
|
a distance matrix.
|
|||
|
If the input is a vector array, the distances are computed.
|
|||
|
If the input is a distances matrix, it is returned instead.
|
|||
|
If the input is a collection of non-numeric data (e.g. a list of strings or a
|
|||
|
boolean array), a custom metric must be passed.
|
|||
|
|
|||
|
This method provides a safe way to take a distance matrix as input, while
|
|||
|
preserving compatibility with many other algorithms that take a vector
|
|||
|
array.
|
|||
|
|
|||
|
If Y is given (default is None), then the returned matrix is the pairwise
|
|||
|
distance between the arrays from both X and Y.
|
|||
|
|
|||
|
Valid values for metric are:
|
|||
|
|
|||
|
- From scikit-learn: ['cityblock', 'cosine', 'euclidean', 'l1', 'l2',
|
|||
|
'manhattan']. These metrics support sparse matrix
|
|||
|
inputs.
|
|||
|
['nan_euclidean'] but it does not yet support sparse matrices.
|
|||
|
|
|||
|
- From scipy.spatial.distance: ['braycurtis', 'canberra', 'chebyshev',
|
|||
|
'correlation', 'dice', 'hamming', 'jaccard', 'kulsinski', 'mahalanobis',
|
|||
|
'minkowski', 'rogerstanimoto', 'russellrao', 'seuclidean',
|
|||
|
'sokalmichener', 'sokalsneath', 'sqeuclidean', 'yule']
|
|||
|
See the documentation for scipy.spatial.distance for details on these
|
|||
|
metrics. These metrics do not support sparse matrix inputs.
|
|||
|
|
|||
|
.. note::
|
|||
|
`'kulsinski'` is deprecated from SciPy 1.9 and will be removed in SciPy 1.11.
|
|||
|
|
|||
|
.. note::
|
|||
|
`'matching'` has been removed in SciPy 1.9 (use `'hamming'` instead).
|
|||
|
|
|||
|
Note that in the case of 'cityblock', 'cosine' and 'euclidean' (which are
|
|||
|
valid scipy.spatial.distance metrics), the scikit-learn implementation
|
|||
|
will be used, which is faster and has support for sparse matrices (except
|
|||
|
for 'cityblock'). For a verbose description of the metrics from
|
|||
|
scikit-learn, see :func:`sklearn.metrics.pairwise.distance_metrics`
|
|||
|
function.
|
|||
|
|
|||
|
Read more in the :ref:`User Guide <metrics>`.
|
|||
|
|
|||
|
Parameters
|
|||
|
----------
|
|||
|
X : {array-like, sparse matrix} of shape (n_samples_X, n_samples_X) or \
|
|||
|
(n_samples_X, n_features)
|
|||
|
Array of pairwise distances between samples, or a feature array.
|
|||
|
The shape of the array should be (n_samples_X, n_samples_X) if
|
|||
|
metric == "precomputed" and (n_samples_X, n_features) otherwise.
|
|||
|
|
|||
|
Y : {array-like, sparse matrix} of shape (n_samples_Y, n_features), default=None
|
|||
|
An optional second feature array. Only allowed if
|
|||
|
metric != "precomputed".
|
|||
|
|
|||
|
metric : str or callable, default='euclidean'
|
|||
|
The metric to use when calculating distance between instances in a
|
|||
|
feature array. If metric is a string, it must be one of the options
|
|||
|
allowed by scipy.spatial.distance.pdist for its metric parameter, or
|
|||
|
a metric listed in ``pairwise.PAIRWISE_DISTANCE_FUNCTIONS``.
|
|||
|
If metric is "precomputed", X is assumed to be a distance matrix.
|
|||
|
Alternatively, if metric is a callable function, it is called on each
|
|||
|
pair of instances (rows) and the resulting value recorded. The callable
|
|||
|
should take two arrays from X as input and return a value indicating
|
|||
|
the distance between them.
|
|||
|
|
|||
|
n_jobs : int, default=None
|
|||
|
The number of jobs to use for the computation. This works by breaking
|
|||
|
down the pairwise matrix into n_jobs even slices and computing them in
|
|||
|
parallel.
|
|||
|
|
|||
|
``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.
|
|||
|
``-1`` means using all processors. See :term:`Glossary <n_jobs>`
|
|||
|
for more details.
|
|||
|
|
|||
|
The "euclidean" and "cosine" metrics rely heavily on BLAS which is already
|
|||
|
multithreaded. So, increasing `n_jobs` would likely cause oversubscription
|
|||
|
and quickly degrade performance.
|
|||
|
|
|||
|
force_all_finite : bool or 'allow-nan', default=True
|
|||
|
Whether to raise an error on np.inf, np.nan, pd.NA in array. Ignored
|
|||
|
for a metric listed in ``pairwise.PAIRWISE_DISTANCE_FUNCTIONS``. The
|
|||
|
possibilities are:
|
|||
|
|
|||
|
- True: Force all values of array to be finite.
|
|||
|
- False: accepts np.inf, np.nan, pd.NA in array.
|
|||
|
- 'allow-nan': accepts only np.nan and pd.NA values in array. Values
|
|||
|
cannot be infinite.
|
|||
|
|
|||
|
.. versionadded:: 0.22
|
|||
|
``force_all_finite`` accepts the string ``'allow-nan'``.
|
|||
|
|
|||
|
.. versionchanged:: 0.23
|
|||
|
Accepts `pd.NA` and converts it into `np.nan`.
|
|||
|
|
|||
|
**kwds : optional keyword parameters
|
|||
|
Any further parameters are passed directly to the distance function.
|
|||
|
If using a scipy.spatial.distance metric, the parameters are still
|
|||
|
metric dependent. See the scipy docs for usage examples.
|
|||
|
|
|||
|
Returns
|
|||
|
-------
|
|||
|
D : ndarray of shape (n_samples_X, n_samples_X) or \
|
|||
|
(n_samples_X, n_samples_Y)
|
|||
|
A distance matrix D such that D_{i, j} is the distance between the
|
|||
|
ith and jth vectors of the given matrix X, if Y is None.
|
|||
|
If Y is not None, then D_{i, j} is the distance between the ith array
|
|||
|
from X and the jth array from Y.
|
|||
|
|
|||
|
See Also
|
|||
|
--------
|
|||
|
pairwise_distances_chunked : Performs the same calculation as this
|
|||
|
function, but returns a generator of chunks of the distance matrix, in
|
|||
|
order to limit memory usage.
|
|||
|
sklearn.metrics.pairwise.paired_distances : Computes the distances between
|
|||
|
corresponding elements of two arrays.
|
|||
|
|
|||
|
Examples
|
|||
|
--------
|
|||
|
>>> from sklearn.metrics.pairwise import pairwise_distances
|
|||
|
>>> X = [[0, 0, 0], [1, 1, 1]]
|
|||
|
>>> Y = [[1, 0, 0], [1, 1, 0]]
|
|||
|
>>> pairwise_distances(X, Y, metric='sqeuclidean')
|
|||
|
array([[1., 2.],
|
|||
|
[2., 1.]])
|
|||
|
"""
|
|||
|
if metric == "precomputed":
|
|||
|
X, _ = check_pairwise_arrays(
|
|||
|
X, Y, precomputed=True, force_all_finite=force_all_finite
|
|||
|
)
|
|||
|
|
|||
|
whom = (
|
|||
|
"`pairwise_distances`. Precomputed distance "
|
|||
|
" need to have non-negative values."
|
|||
|
)
|
|||
|
check_non_negative(X, whom=whom)
|
|||
|
return X
|
|||
|
elif metric in PAIRWISE_DISTANCE_FUNCTIONS:
|
|||
|
func = PAIRWISE_DISTANCE_FUNCTIONS[metric]
|
|||
|
elif callable(metric):
|
|||
|
func = partial(
|
|||
|
_pairwise_callable,
|
|||
|
metric=metric,
|
|||
|
force_all_finite=force_all_finite,
|
|||
|
**kwds,
|
|||
|
)
|
|||
|
else:
|
|||
|
if issparse(X) or issparse(Y):
|
|||
|
raise TypeError("scipy distance metrics do not support sparse matrices.")
|
|||
|
|
|||
|
dtype = bool if metric in PAIRWISE_BOOLEAN_FUNCTIONS else "infer_float"
|
|||
|
|
|||
|
if dtype == bool and (X.dtype != bool or (Y is not None and Y.dtype != bool)):
|
|||
|
msg = "Data was converted to boolean for metric %s" % metric
|
|||
|
warnings.warn(msg, DataConversionWarning)
|
|||
|
|
|||
|
X, Y = check_pairwise_arrays(
|
|||
|
X, Y, dtype=dtype, force_all_finite=force_all_finite
|
|||
|
)
|
|||
|
|
|||
|
# precompute data-derived metric params
|
|||
|
params = _precompute_metric_params(X, Y, metric=metric, **kwds)
|
|||
|
kwds.update(**params)
|
|||
|
|
|||
|
if effective_n_jobs(n_jobs) == 1 and X is Y:
|
|||
|
return distance.squareform(distance.pdist(X, metric=metric, **kwds))
|
|||
|
func = partial(distance.cdist, metric=metric, **kwds)
|
|||
|
|
|||
|
return _parallel_pairwise(X, Y, func, n_jobs, **kwds)
|
|||
|
|
|||
|
|
|||
|
# These distances require boolean arrays, when using scipy.spatial.distance
|
|||
|
PAIRWISE_BOOLEAN_FUNCTIONS = [
|
|||
|
"dice",
|
|||
|
"jaccard",
|
|||
|
"rogerstanimoto",
|
|||
|
"russellrao",
|
|||
|
"sokalmichener",
|
|||
|
"sokalsneath",
|
|||
|
"yule",
|
|||
|
]
|
|||
|
if sp_base_version < parse_version("1.11"):
|
|||
|
# Deprecated in SciPy 1.9 and removed in SciPy 1.11
|
|||
|
PAIRWISE_BOOLEAN_FUNCTIONS += ["kulsinski"]
|
|||
|
if sp_base_version < parse_version("1.9"):
|
|||
|
# Deprecated in SciPy 1.0 and removed in SciPy 1.9
|
|||
|
PAIRWISE_BOOLEAN_FUNCTIONS += ["matching"]
|
|||
|
|
|||
|
# Helper functions - distance
|
|||
|
PAIRWISE_KERNEL_FUNCTIONS = {
|
|||
|
# If updating this dictionary, update the doc in both distance_metrics()
|
|||
|
# and also in pairwise_distances()!
|
|||
|
"additive_chi2": additive_chi2_kernel,
|
|||
|
"chi2": chi2_kernel,
|
|||
|
"linear": linear_kernel,
|
|||
|
"polynomial": polynomial_kernel,
|
|||
|
"poly": polynomial_kernel,
|
|||
|
"rbf": rbf_kernel,
|
|||
|
"laplacian": laplacian_kernel,
|
|||
|
"sigmoid": sigmoid_kernel,
|
|||
|
"cosine": cosine_similarity,
|
|||
|
}
|
|||
|
|
|||
|
|
|||
|
def kernel_metrics():
|
|||
|
"""Valid metrics for pairwise_kernels.
|
|||
|
|
|||
|
This function simply returns the valid pairwise distance metrics.
|
|||
|
It exists, however, to allow for a verbose description of the mapping for
|
|||
|
each of the valid strings.
|
|||
|
|
|||
|
The valid distance metrics, and the function they map to, are:
|
|||
|
=============== ========================================
|
|||
|
metric Function
|
|||
|
=============== ========================================
|
|||
|
'additive_chi2' sklearn.pairwise.additive_chi2_kernel
|
|||
|
'chi2' sklearn.pairwise.chi2_kernel
|
|||
|
'linear' sklearn.pairwise.linear_kernel
|
|||
|
'poly' sklearn.pairwise.polynomial_kernel
|
|||
|
'polynomial' sklearn.pairwise.polynomial_kernel
|
|||
|
'rbf' sklearn.pairwise.rbf_kernel
|
|||
|
'laplacian' sklearn.pairwise.laplacian_kernel
|
|||
|
'sigmoid' sklearn.pairwise.sigmoid_kernel
|
|||
|
'cosine' sklearn.pairwise.cosine_similarity
|
|||
|
=============== ========================================
|
|||
|
|
|||
|
Read more in the :ref:`User Guide <metrics>`.
|
|||
|
|
|||
|
Returns
|
|||
|
-------
|
|||
|
kernel_metrics : dict
|
|||
|
Returns valid metrics for pairwise_kernels.
|
|||
|
"""
|
|||
|
return PAIRWISE_KERNEL_FUNCTIONS
|
|||
|
|
|||
|
|
|||
|
KERNEL_PARAMS = {
|
|||
|
"additive_chi2": (),
|
|||
|
"chi2": frozenset(["gamma"]),
|
|||
|
"cosine": (),
|
|||
|
"linear": (),
|
|||
|
"poly": frozenset(["gamma", "degree", "coef0"]),
|
|||
|
"polynomial": frozenset(["gamma", "degree", "coef0"]),
|
|||
|
"rbf": frozenset(["gamma"]),
|
|||
|
"laplacian": frozenset(["gamma"]),
|
|||
|
"sigmoid": frozenset(["gamma", "coef0"]),
|
|||
|
}
|
|||
|
|
|||
|
|
|||
|
@validate_params(
|
|||
|
{
|
|||
|
"X": ["array-like", "sparse matrix"],
|
|||
|
"Y": ["array-like", "sparse matrix", None],
|
|||
|
"metric": [
|
|||
|
StrOptions(set(PAIRWISE_KERNEL_FUNCTIONS) | {"precomputed"}),
|
|||
|
callable,
|
|||
|
],
|
|||
|
"filter_params": ["boolean"],
|
|||
|
"n_jobs": [Integral, None],
|
|||
|
},
|
|||
|
prefer_skip_nested_validation=True,
|
|||
|
)
|
|||
|
def pairwise_kernels(
|
|||
|
X, Y=None, metric="linear", *, filter_params=False, n_jobs=None, **kwds
|
|||
|
):
|
|||
|
"""Compute the kernel between arrays X and optional array Y.
|
|||
|
|
|||
|
This method takes either a vector array or a kernel matrix, and returns
|
|||
|
a kernel matrix. If the input is a vector array, the kernels are
|
|||
|
computed. If the input is a kernel matrix, it is returned instead.
|
|||
|
|
|||
|
This method provides a safe way to take a kernel matrix as input, while
|
|||
|
preserving compatibility with many other algorithms that take a vector
|
|||
|
array.
|
|||
|
|
|||
|
If Y is given (default is None), then the returned matrix is the pairwise
|
|||
|
kernel between the arrays from both X and Y.
|
|||
|
|
|||
|
Valid values for metric are:
|
|||
|
['additive_chi2', 'chi2', 'linear', 'poly', 'polynomial', 'rbf',
|
|||
|
'laplacian', 'sigmoid', 'cosine']
|
|||
|
|
|||
|
Read more in the :ref:`User Guide <metrics>`.
|
|||
|
|
|||
|
Parameters
|
|||
|
----------
|
|||
|
X : {array-like, sparse matrix} of shape (n_samples_X, n_samples_X) or \
|
|||
|
(n_samples_X, n_features)
|
|||
|
Array of pairwise kernels between samples, or a feature array.
|
|||
|
The shape of the array should be (n_samples_X, n_samples_X) if
|
|||
|
metric == "precomputed" and (n_samples_X, n_features) otherwise.
|
|||
|
|
|||
|
Y : {array-like, sparse matrix} of shape (n_samples_Y, n_features), default=None
|
|||
|
A second feature array only if X has shape (n_samples_X, n_features).
|
|||
|
|
|||
|
metric : str or callable, default="linear"
|
|||
|
The metric to use when calculating kernel between instances in a
|
|||
|
feature array. If metric is a string, it must be one of the metrics
|
|||
|
in ``pairwise.PAIRWISE_KERNEL_FUNCTIONS``.
|
|||
|
If metric is "precomputed", X is assumed to be a kernel matrix.
|
|||
|
Alternatively, if metric is a callable function, it is called on each
|
|||
|
pair of instances (rows) and the resulting value recorded. The callable
|
|||
|
should take two rows from X as input and return the corresponding
|
|||
|
kernel value as a single number. This means that callables from
|
|||
|
:mod:`sklearn.metrics.pairwise` are not allowed, as they operate on
|
|||
|
matrices, not single samples. Use the string identifying the kernel
|
|||
|
instead.
|
|||
|
|
|||
|
filter_params : bool, default=False
|
|||
|
Whether to filter invalid parameters or not.
|
|||
|
|
|||
|
n_jobs : int, default=None
|
|||
|
The number of jobs to use for the computation. This works by breaking
|
|||
|
down the pairwise matrix into n_jobs even slices and computing them in
|
|||
|
parallel.
|
|||
|
|
|||
|
``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.
|
|||
|
``-1`` means using all processors. See :term:`Glossary <n_jobs>`
|
|||
|
for more details.
|
|||
|
|
|||
|
**kwds : optional keyword parameters
|
|||
|
Any further parameters are passed directly to the kernel function.
|
|||
|
|
|||
|
Returns
|
|||
|
-------
|
|||
|
K : ndarray of shape (n_samples_X, n_samples_X) or (n_samples_X, n_samples_Y)
|
|||
|
A kernel matrix K such that K_{i, j} is the kernel between the
|
|||
|
ith and jth vectors of the given matrix X, if Y is None.
|
|||
|
If Y is not None, then K_{i, j} is the kernel between the ith array
|
|||
|
from X and the jth array from Y.
|
|||
|
|
|||
|
Notes
|
|||
|
-----
|
|||
|
If metric is 'precomputed', Y is ignored and X is returned.
|
|||
|
|
|||
|
Examples
|
|||
|
--------
|
|||
|
>>> from sklearn.metrics.pairwise import pairwise_kernels
|
|||
|
>>> X = [[0, 0, 0], [1, 1, 1]]
|
|||
|
>>> Y = [[1, 0, 0], [1, 1, 0]]
|
|||
|
>>> pairwise_kernels(X, Y, metric='linear')
|
|||
|
array([[0., 0.],
|
|||
|
[1., 2.]])
|
|||
|
"""
|
|||
|
# import GPKernel locally to prevent circular imports
|
|||
|
from ..gaussian_process.kernels import Kernel as GPKernel
|
|||
|
|
|||
|
if metric == "precomputed":
|
|||
|
X, _ = check_pairwise_arrays(X, Y, precomputed=True)
|
|||
|
return X
|
|||
|
elif isinstance(metric, GPKernel):
|
|||
|
func = metric.__call__
|
|||
|
elif metric in PAIRWISE_KERNEL_FUNCTIONS:
|
|||
|
if filter_params:
|
|||
|
kwds = {k: kwds[k] for k in kwds if k in KERNEL_PARAMS[metric]}
|
|||
|
func = PAIRWISE_KERNEL_FUNCTIONS[metric]
|
|||
|
elif callable(metric):
|
|||
|
func = partial(_pairwise_callable, metric=metric, **kwds)
|
|||
|
|
|||
|
return _parallel_pairwise(X, Y, func, n_jobs, **kwds)
|