2206 lines
74 KiB
Python
2206 lines
74 KiB
Python
# Authors: Alexandre Gramfort <alexandre.gramfort@inria.fr>
|
||
# Mathieu Blondel <mathieu@mblondel.org>
|
||
# Robert Layton <robertlayton@gmail.com>
|
||
# Andreas Mueller <amueller@ais.uni-bonn.de>
|
||
# Philippe Gervais <philippe.gervais@inria.fr>
|
||
# Lars Buitinck
|
||
# Joel Nothman <joel.nothman@gmail.com>
|
||
# License: BSD 3 clause
|
||
|
||
import itertools
|
||
from functools import partial
|
||
import warnings
|
||
|
||
import numpy as np
|
||
from scipy.spatial import distance
|
||
from scipy.sparse import csr_matrix
|
||
from scipy.sparse import issparse
|
||
from joblib import effective_n_jobs
|
||
|
||
from .. import config_context
|
||
from ..utils.validation import _num_samples
|
||
from ..utils.validation import check_non_negative
|
||
from ..utils import check_array
|
||
from ..utils import gen_even_slices
|
||
from ..utils import gen_batches, get_chunk_n_rows
|
||
from ..utils import is_scalar_nan
|
||
from ..utils.extmath import row_norms, safe_sparse_dot
|
||
from ..preprocessing import normalize
|
||
from ..utils._mask import _get_mask
|
||
from ..utils.parallel import delayed, Parallel
|
||
from ..utils.fixes import sp_version, parse_version
|
||
|
||
from ._pairwise_distances_reduction import ArgKmin
|
||
from ._pairwise_fast import _chi2_kernel_fast, _sparse_manhattan
|
||
from ..exceptions import DataConversionWarning
|
||
|
||
|
||
# Utility Functions
|
||
def _return_float_dtype(X, Y):
|
||
"""
|
||
1. If dtype of X and Y is float32, then dtype float32 is returned.
|
||
2. Else dtype float is returned.
|
||
"""
|
||
if not issparse(X) and not isinstance(X, np.ndarray):
|
||
X = np.asarray(X)
|
||
|
||
if Y is None:
|
||
Y_dtype = X.dtype
|
||
elif not issparse(Y) and not isinstance(Y, np.ndarray):
|
||
Y = np.asarray(Y)
|
||
Y_dtype = Y.dtype
|
||
else:
|
||
Y_dtype = Y.dtype
|
||
|
||
if X.dtype == Y_dtype == np.float32:
|
||
dtype = np.float32
|
||
else:
|
||
dtype = float
|
||
|
||
return X, Y, dtype
|
||
|
||
|
||
def check_pairwise_arrays(
|
||
X,
|
||
Y,
|
||
*,
|
||
precomputed=False,
|
||
dtype=None,
|
||
accept_sparse="csr",
|
||
force_all_finite=True,
|
||
copy=False,
|
||
):
|
||
"""Set X and Y appropriately and checks inputs.
|
||
|
||
If Y is None, it is set as a pointer to X (i.e. not a copy).
|
||
If Y is given, this does not happen.
|
||
All distance metrics should use this function first to assert that the
|
||
given parameters are correct and safe to use.
|
||
|
||
Specifically, this function first ensures that both X and Y are arrays,
|
||
then checks that they are at least two dimensional while ensuring that
|
||
their elements are floats (or dtype if provided). Finally, the function
|
||
checks that the size of the second dimension of the two arrays is equal, or
|
||
the equivalent check for a precomputed distance matrix.
|
||
|
||
Parameters
|
||
----------
|
||
X : {array-like, sparse matrix} of shape (n_samples_X, n_features)
|
||
|
||
Y : {array-like, sparse matrix} of shape (n_samples_Y, n_features)
|
||
|
||
precomputed : bool, default=False
|
||
True if X is to be treated as precomputed distances to the samples in
|
||
Y.
|
||
|
||
dtype : str, type, list of type, default=None
|
||
Data type required for X and Y. If None, the dtype will be an
|
||
appropriate float type selected by _return_float_dtype.
|
||
|
||
.. versionadded:: 0.18
|
||
|
||
accept_sparse : str, bool or list/tuple of str, default='csr'
|
||
String[s] representing allowed sparse matrix formats, such as 'csc',
|
||
'csr', etc. If the input is sparse but not in the allowed format,
|
||
it will be converted to the first listed format. True allows the input
|
||
to be any format. False means that a sparse matrix input will
|
||
raise an error.
|
||
|
||
force_all_finite : bool or 'allow-nan', default=True
|
||
Whether to raise an error on np.inf, np.nan, pd.NA in array. The
|
||
possibilities are:
|
||
|
||
- True: Force all values of array to be finite.
|
||
- False: accepts np.inf, np.nan, pd.NA in array.
|
||
- 'allow-nan': accepts only np.nan and pd.NA values in array. Values
|
||
cannot be infinite.
|
||
|
||
.. versionadded:: 0.22
|
||
``force_all_finite`` accepts the string ``'allow-nan'``.
|
||
|
||
.. versionchanged:: 0.23
|
||
Accepts `pd.NA` and converts it into `np.nan`.
|
||
|
||
copy : bool, default=False
|
||
Whether a forced copy will be triggered. If copy=False, a copy might
|
||
be triggered by a conversion.
|
||
|
||
.. versionadded:: 0.22
|
||
|
||
Returns
|
||
-------
|
||
safe_X : {array-like, sparse matrix} of shape (n_samples_X, n_features)
|
||
An array equal to X, guaranteed to be a numpy array.
|
||
|
||
safe_Y : {array-like, sparse matrix} of shape (n_samples_Y, n_features)
|
||
An array equal to Y if Y was not None, guaranteed to be a numpy array.
|
||
If Y was None, safe_Y will be a pointer to X.
|
||
"""
|
||
X, Y, dtype_float = _return_float_dtype(X, Y)
|
||
|
||
estimator = "check_pairwise_arrays"
|
||
if dtype is None:
|
||
dtype = dtype_float
|
||
|
||
if Y is X or Y is None:
|
||
X = Y = check_array(
|
||
X,
|
||
accept_sparse=accept_sparse,
|
||
dtype=dtype,
|
||
copy=copy,
|
||
force_all_finite=force_all_finite,
|
||
estimator=estimator,
|
||
)
|
||
else:
|
||
X = check_array(
|
||
X,
|
||
accept_sparse=accept_sparse,
|
||
dtype=dtype,
|
||
copy=copy,
|
||
force_all_finite=force_all_finite,
|
||
estimator=estimator,
|
||
)
|
||
Y = check_array(
|
||
Y,
|
||
accept_sparse=accept_sparse,
|
||
dtype=dtype,
|
||
copy=copy,
|
||
force_all_finite=force_all_finite,
|
||
estimator=estimator,
|
||
)
|
||
|
||
if precomputed:
|
||
if X.shape[1] != Y.shape[0]:
|
||
raise ValueError(
|
||
"Precomputed metric requires shape "
|
||
"(n_queries, n_indexed). Got (%d, %d) "
|
||
"for %d indexed." % (X.shape[0], X.shape[1], Y.shape[0])
|
||
)
|
||
elif X.shape[1] != Y.shape[1]:
|
||
raise ValueError(
|
||
"Incompatible dimension for X and Y matrices: "
|
||
"X.shape[1] == %d while Y.shape[1] == %d" % (X.shape[1], Y.shape[1])
|
||
)
|
||
|
||
return X, Y
|
||
|
||
|
||
def check_paired_arrays(X, Y):
|
||
"""Set X and Y appropriately and checks inputs for paired distances.
|
||
|
||
All paired distance metrics should use this function first to assert that
|
||
the given parameters are correct and safe to use.
|
||
|
||
Specifically, this function first ensures that both X and Y are arrays,
|
||
then checks that they are at least two dimensional while ensuring that
|
||
their elements are floats. Finally, the function checks that the size
|
||
of the dimensions of the two arrays are equal.
|
||
|
||
Parameters
|
||
----------
|
||
X : {array-like, sparse matrix} of shape (n_samples_X, n_features)
|
||
|
||
Y : {array-like, sparse matrix} of shape (n_samples_Y, n_features)
|
||
|
||
Returns
|
||
-------
|
||
safe_X : {array-like, sparse matrix} of shape (n_samples_X, n_features)
|
||
An array equal to X, guaranteed to be a numpy array.
|
||
|
||
safe_Y : {array-like, sparse matrix} of shape (n_samples_Y, n_features)
|
||
An array equal to Y if Y was not None, guaranteed to be a numpy array.
|
||
If Y was None, safe_Y will be a pointer to X.
|
||
"""
|
||
X, Y = check_pairwise_arrays(X, Y)
|
||
if X.shape != Y.shape:
|
||
raise ValueError(
|
||
"X and Y should be of same shape. They were respectively %r and %r long."
|
||
% (X.shape, Y.shape)
|
||
)
|
||
return X, Y
|
||
|
||
|
||
# Pairwise distances
|
||
def euclidean_distances(
|
||
X, Y=None, *, Y_norm_squared=None, squared=False, X_norm_squared=None
|
||
):
|
||
"""
|
||
Compute the distance matrix between each pair from a vector array X and Y.
|
||
|
||
For efficiency reasons, the euclidean distance between a pair of row
|
||
vector x and y is computed as::
|
||
|
||
dist(x, y) = sqrt(dot(x, x) - 2 * dot(x, y) + dot(y, y))
|
||
|
||
This formulation has two advantages over other ways of computing distances.
|
||
First, it is computationally efficient when dealing with sparse data.
|
||
Second, if one argument varies but the other remains unchanged, then
|
||
`dot(x, x)` and/or `dot(y, y)` can be pre-computed.
|
||
|
||
However, this is not the most precise way of doing this computation,
|
||
because this equation potentially suffers from "catastrophic cancellation".
|
||
Also, the distance matrix returned by this function may not be exactly
|
||
symmetric as required by, e.g., ``scipy.spatial.distance`` functions.
|
||
|
||
Read more in the :ref:`User Guide <metrics>`.
|
||
|
||
Parameters
|
||
----------
|
||
X : {array-like, sparse matrix} of shape (n_samples_X, n_features)
|
||
An array where each row is a sample and each column is a feature.
|
||
|
||
Y : {array-like, sparse matrix} of shape (n_samples_Y, n_features), \
|
||
default=None
|
||
An array where each row is a sample and each column is a feature.
|
||
If `None`, method uses `Y=X`.
|
||
|
||
Y_norm_squared : array-like of shape (n_samples_Y,) or (n_samples_Y, 1) \
|
||
or (1, n_samples_Y), default=None
|
||
Pre-computed dot-products of vectors in Y (e.g.,
|
||
``(Y**2).sum(axis=1)``)
|
||
May be ignored in some cases, see the note below.
|
||
|
||
squared : bool, default=False
|
||
Return squared Euclidean distances.
|
||
|
||
X_norm_squared : array-like of shape (n_samples_X,) or (n_samples_X, 1) \
|
||
or (1, n_samples_X), default=None
|
||
Pre-computed dot-products of vectors in X (e.g.,
|
||
``(X**2).sum(axis=1)``)
|
||
May be ignored in some cases, see the note below.
|
||
|
||
Returns
|
||
-------
|
||
distances : ndarray of shape (n_samples_X, n_samples_Y)
|
||
Returns the distances between the row vectors of `X`
|
||
and the row vectors of `Y`.
|
||
|
||
See Also
|
||
--------
|
||
paired_distances : Distances between pairs of elements of X and Y.
|
||
|
||
Notes
|
||
-----
|
||
To achieve a better accuracy, `X_norm_squared` and `Y_norm_squared` may be
|
||
unused if they are passed as `np.float32`.
|
||
|
||
Examples
|
||
--------
|
||
>>> from sklearn.metrics.pairwise import euclidean_distances
|
||
>>> X = [[0, 1], [1, 1]]
|
||
>>> # distance between rows of X
|
||
>>> euclidean_distances(X, X)
|
||
array([[0., 1.],
|
||
[1., 0.]])
|
||
>>> # get distance to origin
|
||
>>> euclidean_distances(X, [[0, 0]])
|
||
array([[1. ],
|
||
[1.41421356]])
|
||
"""
|
||
X, Y = check_pairwise_arrays(X, Y)
|
||
|
||
if X_norm_squared is not None:
|
||
X_norm_squared = check_array(X_norm_squared, ensure_2d=False)
|
||
original_shape = X_norm_squared.shape
|
||
if X_norm_squared.shape == (X.shape[0],):
|
||
X_norm_squared = X_norm_squared.reshape(-1, 1)
|
||
if X_norm_squared.shape == (1, X.shape[0]):
|
||
X_norm_squared = X_norm_squared.T
|
||
if X_norm_squared.shape != (X.shape[0], 1):
|
||
raise ValueError(
|
||
f"Incompatible dimensions for X of shape {X.shape} and "
|
||
f"X_norm_squared of shape {original_shape}."
|
||
)
|
||
|
||
if Y_norm_squared is not None:
|
||
Y_norm_squared = check_array(Y_norm_squared, ensure_2d=False)
|
||
original_shape = Y_norm_squared.shape
|
||
if Y_norm_squared.shape == (Y.shape[0],):
|
||
Y_norm_squared = Y_norm_squared.reshape(1, -1)
|
||
if Y_norm_squared.shape == (Y.shape[0], 1):
|
||
Y_norm_squared = Y_norm_squared.T
|
||
if Y_norm_squared.shape != (1, Y.shape[0]):
|
||
raise ValueError(
|
||
f"Incompatible dimensions for Y of shape {Y.shape} and "
|
||
f"Y_norm_squared of shape {original_shape}."
|
||
)
|
||
|
||
return _euclidean_distances(X, Y, X_norm_squared, Y_norm_squared, squared)
|
||
|
||
|
||
def _euclidean_distances(X, Y, X_norm_squared=None, Y_norm_squared=None, squared=False):
|
||
"""Computational part of euclidean_distances
|
||
|
||
Assumes inputs are already checked.
|
||
|
||
If norms are passed as float32, they are unused. If arrays are passed as
|
||
float32, norms needs to be recomputed on upcast chunks.
|
||
TODO: use a float64 accumulator in row_norms to avoid the latter.
|
||
"""
|
||
if X_norm_squared is not None:
|
||
if X_norm_squared.dtype == np.float32:
|
||
XX = None
|
||
else:
|
||
XX = X_norm_squared.reshape(-1, 1)
|
||
elif X.dtype == np.float32:
|
||
XX = None
|
||
else:
|
||
XX = row_norms(X, squared=True)[:, np.newaxis]
|
||
|
||
if Y is X:
|
||
YY = None if XX is None else XX.T
|
||
else:
|
||
if Y_norm_squared is not None:
|
||
if Y_norm_squared.dtype == np.float32:
|
||
YY = None
|
||
else:
|
||
YY = Y_norm_squared.reshape(1, -1)
|
||
elif Y.dtype == np.float32:
|
||
YY = None
|
||
else:
|
||
YY = row_norms(Y, squared=True)[np.newaxis, :]
|
||
|
||
if X.dtype == np.float32:
|
||
# To minimize precision issues with float32, we compute the distance
|
||
# matrix on chunks of X and Y upcast to float64
|
||
distances = _euclidean_distances_upcast(X, XX, Y, YY)
|
||
else:
|
||
# if dtype is already float64, no need to chunk and upcast
|
||
distances = -2 * safe_sparse_dot(X, Y.T, dense_output=True)
|
||
distances += XX
|
||
distances += YY
|
||
np.maximum(distances, 0, out=distances)
|
||
|
||
# Ensure that distances between vectors and themselves are set to 0.0.
|
||
# This may not be the case due to floating point rounding errors.
|
||
if X is Y:
|
||
np.fill_diagonal(distances, 0)
|
||
|
||
return distances if squared else np.sqrt(distances, out=distances)
|
||
|
||
|
||
def nan_euclidean_distances(
|
||
X, Y=None, *, squared=False, missing_values=np.nan, copy=True
|
||
):
|
||
"""Calculate the euclidean distances in the presence of missing values.
|
||
|
||
Compute the euclidean distance between each pair of samples in X and Y,
|
||
where Y=X is assumed if Y=None. When calculating the distance between a
|
||
pair of samples, this formulation ignores feature coordinates with a
|
||
missing value in either sample and scales up the weight of the remaining
|
||
coordinates:
|
||
|
||
dist(x,y) = sqrt(weight * sq. distance from present coordinates)
|
||
where,
|
||
weight = Total # of coordinates / # of present coordinates
|
||
|
||
For example, the distance between ``[3, na, na, 6]`` and ``[1, na, 4, 5]``
|
||
is:
|
||
|
||
.. math::
|
||
\\sqrt{\\frac{4}{2}((3-1)^2 + (6-5)^2)}
|
||
|
||
If all the coordinates are missing or if there are no common present
|
||
coordinates then NaN is returned for that pair.
|
||
|
||
Read more in the :ref:`User Guide <metrics>`.
|
||
|
||
.. versionadded:: 0.22
|
||
|
||
Parameters
|
||
----------
|
||
X : array-like of shape (n_samples_X, n_features)
|
||
An array where each row is a sample and each column is a feature.
|
||
|
||
Y : array-like of shape (n_samples_Y, n_features), default=None
|
||
An array where each row is a sample and each column is a feature.
|
||
If `None`, method uses `Y=X`.
|
||
|
||
squared : bool, default=False
|
||
Return squared Euclidean distances.
|
||
|
||
missing_values : np.nan or int, default=np.nan
|
||
Representation of missing value.
|
||
|
||
copy : bool, default=True
|
||
Make and use a deep copy of X and Y (if Y exists).
|
||
|
||
Returns
|
||
-------
|
||
distances : ndarray of shape (n_samples_X, n_samples_Y)
|
||
Returns the distances between the row vectors of `X`
|
||
and the row vectors of `Y`.
|
||
|
||
See Also
|
||
--------
|
||
paired_distances : Distances between pairs of elements of X and Y.
|
||
|
||
References
|
||
----------
|
||
* John K. Dixon, "Pattern Recognition with Partly Missing Data",
|
||
IEEE Transactions on Systems, Man, and Cybernetics, Volume: 9, Issue:
|
||
10, pp. 617 - 621, Oct. 1979.
|
||
http://ieeexplore.ieee.org/abstract/document/4310090/
|
||
|
||
Examples
|
||
--------
|
||
>>> from sklearn.metrics.pairwise import nan_euclidean_distances
|
||
>>> nan = float("NaN")
|
||
>>> X = [[0, 1], [1, nan]]
|
||
>>> nan_euclidean_distances(X, X) # distance between rows of X
|
||
array([[0. , 1.41421356],
|
||
[1.41421356, 0. ]])
|
||
|
||
>>> # get distance to origin
|
||
>>> nan_euclidean_distances(X, [[0, 0]])
|
||
array([[1. ],
|
||
[1.41421356]])
|
||
"""
|
||
|
||
force_all_finite = "allow-nan" if is_scalar_nan(missing_values) else True
|
||
X, Y = check_pairwise_arrays(
|
||
X, Y, accept_sparse=False, force_all_finite=force_all_finite, copy=copy
|
||
)
|
||
# Get missing mask for X
|
||
missing_X = _get_mask(X, missing_values)
|
||
|
||
# Get missing mask for Y
|
||
missing_Y = missing_X if Y is X else _get_mask(Y, missing_values)
|
||
|
||
# set missing values to zero
|
||
X[missing_X] = 0
|
||
Y[missing_Y] = 0
|
||
|
||
distances = euclidean_distances(X, Y, squared=True)
|
||
|
||
# Adjust distances for missing values
|
||
XX = X * X
|
||
YY = Y * Y
|
||
distances -= np.dot(XX, missing_Y.T)
|
||
distances -= np.dot(missing_X, YY.T)
|
||
|
||
np.clip(distances, 0, None, out=distances)
|
||
|
||
if X is Y:
|
||
# Ensure that distances between vectors and themselves are set to 0.0.
|
||
# This may not be the case due to floating point rounding errors.
|
||
np.fill_diagonal(distances, 0.0)
|
||
|
||
present_X = 1 - missing_X
|
||
present_Y = present_X if Y is X else ~missing_Y
|
||
present_count = np.dot(present_X, present_Y.T)
|
||
distances[present_count == 0] = np.nan
|
||
# avoid divide by zero
|
||
np.maximum(1, present_count, out=present_count)
|
||
distances /= present_count
|
||
distances *= X.shape[1]
|
||
|
||
if not squared:
|
||
np.sqrt(distances, out=distances)
|
||
|
||
return distances
|
||
|
||
|
||
def _euclidean_distances_upcast(X, XX=None, Y=None, YY=None, batch_size=None):
|
||
"""Euclidean distances between X and Y.
|
||
|
||
Assumes X and Y have float32 dtype.
|
||
Assumes XX and YY have float64 dtype or are None.
|
||
|
||
X and Y are upcast to float64 by chunks, which size is chosen to limit
|
||
memory increase by approximately 10% (at least 10MiB).
|
||
"""
|
||
n_samples_X = X.shape[0]
|
||
n_samples_Y = Y.shape[0]
|
||
n_features = X.shape[1]
|
||
|
||
distances = np.empty((n_samples_X, n_samples_Y), dtype=np.float32)
|
||
|
||
if batch_size is None:
|
||
x_density = X.nnz / np.prod(X.shape) if issparse(X) else 1
|
||
y_density = Y.nnz / np.prod(Y.shape) if issparse(Y) else 1
|
||
|
||
# Allow 10% more memory than X, Y and the distance matrix take (at
|
||
# least 10MiB)
|
||
maxmem = max(
|
||
(
|
||
(x_density * n_samples_X + y_density * n_samples_Y) * n_features
|
||
+ (x_density * n_samples_X * y_density * n_samples_Y)
|
||
)
|
||
/ 10,
|
||
10 * 2**17,
|
||
)
|
||
|
||
# The increase amount of memory in 8-byte blocks is:
|
||
# - x_density * batch_size * n_features (copy of chunk of X)
|
||
# - y_density * batch_size * n_features (copy of chunk of Y)
|
||
# - batch_size * batch_size (chunk of distance matrix)
|
||
# Hence x² + (xd+yd)kx = M, where x=batch_size, k=n_features, M=maxmem
|
||
# xd=x_density and yd=y_density
|
||
tmp = (x_density + y_density) * n_features
|
||
batch_size = (-tmp + np.sqrt(tmp**2 + 4 * maxmem)) / 2
|
||
batch_size = max(int(batch_size), 1)
|
||
|
||
x_batches = gen_batches(n_samples_X, batch_size)
|
||
|
||
for i, x_slice in enumerate(x_batches):
|
||
X_chunk = X[x_slice].astype(np.float64)
|
||
if XX is None:
|
||
XX_chunk = row_norms(X_chunk, squared=True)[:, np.newaxis]
|
||
else:
|
||
XX_chunk = XX[x_slice]
|
||
|
||
y_batches = gen_batches(n_samples_Y, batch_size)
|
||
|
||
for j, y_slice in enumerate(y_batches):
|
||
if X is Y and j < i:
|
||
# when X is Y the distance matrix is symmetric so we only need
|
||
# to compute half of it.
|
||
d = distances[y_slice, x_slice].T
|
||
|
||
else:
|
||
Y_chunk = Y[y_slice].astype(np.float64)
|
||
if YY is None:
|
||
YY_chunk = row_norms(Y_chunk, squared=True)[np.newaxis, :]
|
||
else:
|
||
YY_chunk = YY[:, y_slice]
|
||
|
||
d = -2 * safe_sparse_dot(X_chunk, Y_chunk.T, dense_output=True)
|
||
d += XX_chunk
|
||
d += YY_chunk
|
||
|
||
distances[x_slice, y_slice] = d.astype(np.float32, copy=False)
|
||
|
||
return distances
|
||
|
||
|
||
def _argmin_min_reduce(dist, start):
|
||
# `start` is specified in the signature but not used. This is because the higher
|
||
# order `pairwise_distances_chunked` function needs reduction functions that are
|
||
# passed as argument to have a two arguments signature.
|
||
indices = dist.argmin(axis=1)
|
||
values = dist[np.arange(dist.shape[0]), indices]
|
||
return indices, values
|
||
|
||
|
||
def _argmin_reduce(dist, start):
|
||
# `start` is specified in the signature but not used. This is because the higher
|
||
# order `pairwise_distances_chunked` function needs reduction functions that are
|
||
# passed as argument to have a two arguments signature.
|
||
return dist.argmin(axis=1)
|
||
|
||
|
||
def pairwise_distances_argmin_min(
|
||
X, Y, *, axis=1, metric="euclidean", metric_kwargs=None
|
||
):
|
||
"""Compute minimum distances between one point and a set of points.
|
||
|
||
This function computes for each row in X, the index of the row of Y which
|
||
is closest (according to the specified distance). The minimal distances are
|
||
also returned.
|
||
|
||
This is mostly equivalent to calling:
|
||
|
||
(pairwise_distances(X, Y=Y, metric=metric).argmin(axis=axis),
|
||
pairwise_distances(X, Y=Y, metric=metric).min(axis=axis))
|
||
|
||
but uses much less memory, and is faster for large arrays.
|
||
|
||
Parameters
|
||
----------
|
||
X : {array-like, sparse matrix} of shape (n_samples_X, n_features)
|
||
Array containing points.
|
||
|
||
Y : {array-like, sparse matrix} of shape (n_samples_Y, n_features)
|
||
Array containing points.
|
||
|
||
axis : int, default=1
|
||
Axis along which the argmin and distances are to be computed.
|
||
|
||
metric : str or callable, default='euclidean'
|
||
Metric to use for distance computation. Any metric from scikit-learn
|
||
or scipy.spatial.distance can be used.
|
||
|
||
If metric is a callable function, it is called on each
|
||
pair of instances (rows) and the resulting value recorded. The callable
|
||
should take two arrays as input and return one value indicating the
|
||
distance between them. This works for Scipy's metrics, but is less
|
||
efficient than passing the metric name as a string.
|
||
|
||
Distance matrices are not supported.
|
||
|
||
Valid values for metric are:
|
||
|
||
- from scikit-learn: ['cityblock', 'cosine', 'euclidean', 'l1', 'l2',
|
||
'manhattan']
|
||
|
||
- from scipy.spatial.distance: ['braycurtis', 'canberra', 'chebyshev',
|
||
'correlation', 'dice', 'hamming', 'jaccard', 'kulsinski',
|
||
'mahalanobis', 'minkowski', 'rogerstanimoto', 'russellrao',
|
||
'seuclidean', 'sokalmichener', 'sokalsneath', 'sqeuclidean',
|
||
'yule']
|
||
|
||
See the documentation for scipy.spatial.distance for details on these
|
||
metrics.
|
||
|
||
metric_kwargs : dict, default=None
|
||
Keyword arguments to pass to specified metric function.
|
||
|
||
Returns
|
||
-------
|
||
argmin : ndarray
|
||
Y[argmin[i], :] is the row in Y that is closest to X[i, :].
|
||
|
||
distances : ndarray
|
||
The array of minimum distances. `distances[i]` is the distance between
|
||
the i-th row in X and the argmin[i]-th row in Y.
|
||
|
||
See Also
|
||
--------
|
||
pairwise_distances : Distances between every pair of samples of X and Y.
|
||
pairwise_distances_argmin : Same as `pairwise_distances_argmin_min` but only
|
||
returns the argmins.
|
||
"""
|
||
X, Y = check_pairwise_arrays(X, Y)
|
||
|
||
if axis == 0:
|
||
X, Y = Y, X
|
||
|
||
if metric_kwargs is None:
|
||
metric_kwargs = {}
|
||
|
||
if ArgKmin.is_usable_for(X, Y, metric):
|
||
# This is an adaptor for one "sqeuclidean" specification.
|
||
# For this backend, we can directly use "sqeuclidean".
|
||
if metric_kwargs.get("squared", False) and metric == "euclidean":
|
||
metric = "sqeuclidean"
|
||
metric_kwargs = {}
|
||
|
||
values, indices = ArgKmin.compute(
|
||
X=X,
|
||
Y=Y,
|
||
k=1,
|
||
metric=metric,
|
||
metric_kwargs=metric_kwargs,
|
||
strategy="auto",
|
||
return_distance=True,
|
||
)
|
||
values = values.flatten()
|
||
indices = indices.flatten()
|
||
else:
|
||
# Joblib-based backend, which is used when user-defined callable
|
||
# are passed for metric.
|
||
|
||
# This won't be used in the future once PairwiseDistancesReductions support:
|
||
# - DistanceMetrics which work on supposedly binary data
|
||
# - CSR-dense and dense-CSR case if 'euclidean' in metric.
|
||
|
||
# Turn off check for finiteness because this is costly and because arrays
|
||
# have already been validated.
|
||
with config_context(assume_finite=True):
|
||
indices, values = zip(
|
||
*pairwise_distances_chunked(
|
||
X, Y, reduce_func=_argmin_min_reduce, metric=metric, **metric_kwargs
|
||
)
|
||
)
|
||
indices = np.concatenate(indices)
|
||
values = np.concatenate(values)
|
||
|
||
return indices, values
|
||
|
||
|
||
def pairwise_distances_argmin(X, Y, *, axis=1, metric="euclidean", metric_kwargs=None):
|
||
"""Compute minimum distances between one point and a set of points.
|
||
|
||
This function computes for each row in X, the index of the row of Y which
|
||
is closest (according to the specified distance).
|
||
|
||
This is mostly equivalent to calling:
|
||
|
||
pairwise_distances(X, Y=Y, metric=metric).argmin(axis=axis)
|
||
|
||
but uses much less memory, and is faster for large arrays.
|
||
|
||
This function works with dense 2D arrays only.
|
||
|
||
Parameters
|
||
----------
|
||
X : {array-like, sparse matrix} of shape (n_samples_X, n_features)
|
||
Array containing points.
|
||
|
||
Y : {array-like, sparse matrix} of shape (n_samples_Y, n_features)
|
||
Arrays containing points.
|
||
|
||
axis : int, default=1
|
||
Axis along which the argmin and distances are to be computed.
|
||
|
||
metric : str or callable, default="euclidean"
|
||
Metric to use for distance computation. Any metric from scikit-learn
|
||
or scipy.spatial.distance can be used.
|
||
|
||
If metric is a callable function, it is called on each
|
||
pair of instances (rows) and the resulting value recorded. The callable
|
||
should take two arrays as input and return one value indicating the
|
||
distance between them. This works for Scipy's metrics, but is less
|
||
efficient than passing the metric name as a string.
|
||
|
||
Distance matrices are not supported.
|
||
|
||
Valid values for metric are:
|
||
|
||
- from scikit-learn: ['cityblock', 'cosine', 'euclidean', 'l1', 'l2',
|
||
'manhattan']
|
||
|
||
- from scipy.spatial.distance: ['braycurtis', 'canberra', 'chebyshev',
|
||
'correlation', 'dice', 'hamming', 'jaccard', 'kulsinski',
|
||
'mahalanobis', 'minkowski', 'rogerstanimoto', 'russellrao',
|
||
'seuclidean', 'sokalmichener', 'sokalsneath', 'sqeuclidean',
|
||
'yule']
|
||
|
||
See the documentation for scipy.spatial.distance for details on these
|
||
metrics.
|
||
|
||
metric_kwargs : dict, default=None
|
||
Keyword arguments to pass to specified metric function.
|
||
|
||
Returns
|
||
-------
|
||
argmin : numpy.ndarray
|
||
Y[argmin[i], :] is the row in Y that is closest to X[i, :].
|
||
|
||
See Also
|
||
--------
|
||
pairwise_distances : Distances between every pair of samples of X and Y.
|
||
pairwise_distances_argmin_min : Same as `pairwise_distances_argmin` but also
|
||
returns the distances.
|
||
"""
|
||
if metric_kwargs is None:
|
||
metric_kwargs = {}
|
||
|
||
X, Y = check_pairwise_arrays(X, Y)
|
||
|
||
if axis == 0:
|
||
X, Y = Y, X
|
||
|
||
if metric_kwargs is None:
|
||
metric_kwargs = {}
|
||
|
||
if ArgKmin.is_usable_for(X, Y, metric):
|
||
# This is an adaptor for one "sqeuclidean" specification.
|
||
# For this backend, we can directly use "sqeuclidean".
|
||
if metric_kwargs.get("squared", False) and metric == "euclidean":
|
||
metric = "sqeuclidean"
|
||
metric_kwargs = {}
|
||
|
||
indices = ArgKmin.compute(
|
||
X=X,
|
||
Y=Y,
|
||
k=1,
|
||
metric=metric,
|
||
metric_kwargs=metric_kwargs,
|
||
strategy="auto",
|
||
return_distance=False,
|
||
)
|
||
indices = indices.flatten()
|
||
else:
|
||
# Joblib-based backend, which is used when user-defined callable
|
||
# are passed for metric.
|
||
|
||
# This won't be used in the future once PairwiseDistancesReductions support:
|
||
# - DistanceMetrics which work on supposedly binary data
|
||
# - CSR-dense and dense-CSR case if 'euclidean' in metric.
|
||
|
||
# Turn off check for finiteness because this is costly and because arrays
|
||
# have already been validated.
|
||
with config_context(assume_finite=True):
|
||
indices = np.concatenate(
|
||
list(
|
||
# This returns a np.ndarray generator whose arrays we need
|
||
# to flatten into one.
|
||
pairwise_distances_chunked(
|
||
X, Y, reduce_func=_argmin_reduce, metric=metric, **metric_kwargs
|
||
)
|
||
)
|
||
)
|
||
|
||
return indices
|
||
|
||
|
||
def haversine_distances(X, Y=None):
|
||
"""Compute the Haversine distance between samples in X and Y.
|
||
|
||
The Haversine (or great circle) distance is the angular distance between
|
||
two points on the surface of a sphere. The first coordinate of each point
|
||
is assumed to be the latitude, the second is the longitude, given
|
||
in radians. The dimension of the data must be 2.
|
||
|
||
.. math::
|
||
D(x, y) = 2\\arcsin[\\sqrt{\\sin^2((x1 - y1) / 2)
|
||
+ \\cos(x1)\\cos(y1)\\sin^2((x2 - y2) / 2)}]
|
||
|
||
Parameters
|
||
----------
|
||
X : array-like of shape (n_samples_X, 2)
|
||
A feature array.
|
||
|
||
Y : array-like of shape (n_samples_Y, 2), default=None
|
||
An optional second feature array. If `None`, uses `Y=X`.
|
||
|
||
Returns
|
||
-------
|
||
distance : ndarray of shape (n_samples_X, n_samples_Y)
|
||
The distance matrix.
|
||
|
||
Notes
|
||
-----
|
||
As the Earth is nearly spherical, the haversine formula provides a good
|
||
approximation of the distance between two points of the Earth surface, with
|
||
a less than 1% error on average.
|
||
|
||
Examples
|
||
--------
|
||
We want to calculate the distance between the Ezeiza Airport
|
||
(Buenos Aires, Argentina) and the Charles de Gaulle Airport (Paris,
|
||
France).
|
||
|
||
>>> from sklearn.metrics.pairwise import haversine_distances
|
||
>>> from math import radians
|
||
>>> bsas = [-34.83333, -58.5166646]
|
||
>>> paris = [49.0083899664, 2.53844117956]
|
||
>>> bsas_in_radians = [radians(_) for _ in bsas]
|
||
>>> paris_in_radians = [radians(_) for _ in paris]
|
||
>>> result = haversine_distances([bsas_in_radians, paris_in_radians])
|
||
>>> result * 6371000/1000 # multiply by Earth radius to get kilometers
|
||
array([[ 0. , 11099.54035582],
|
||
[11099.54035582, 0. ]])
|
||
"""
|
||
from ..metrics import DistanceMetric
|
||
|
||
return DistanceMetric.get_metric("haversine").pairwise(X, Y)
|
||
|
||
|
||
def manhattan_distances(X, Y=None, *, sum_over_features="deprecated"):
|
||
"""Compute the L1 distances between the vectors in X and Y.
|
||
|
||
With sum_over_features equal to False it returns the componentwise
|
||
distances.
|
||
|
||
Read more in the :ref:`User Guide <metrics>`.
|
||
|
||
Parameters
|
||
----------
|
||
X : array-like of shape (n_samples_X, n_features)
|
||
An array where each row is a sample and each column is a feature.
|
||
|
||
Y : array-like of shape (n_samples_Y, n_features), default=None
|
||
An array where each row is a sample and each column is a feature.
|
||
If `None`, method uses `Y=X`.
|
||
|
||
sum_over_features : bool, default=True
|
||
If True the function returns the pairwise distance matrix
|
||
else it returns the componentwise L1 pairwise-distances.
|
||
Not supported for sparse matrix inputs.
|
||
|
||
.. deprecated:: 1.2
|
||
``sum_over_features`` was deprecated in version 1.2 and will be removed in
|
||
1.4.
|
||
|
||
Returns
|
||
-------
|
||
D : ndarray of shape (n_samples_X * n_samples_Y, n_features) or \
|
||
(n_samples_X, n_samples_Y)
|
||
If sum_over_features is False shape is
|
||
(n_samples_X * n_samples_Y, n_features) and D contains the
|
||
componentwise L1 pairwise-distances (ie. absolute difference),
|
||
else shape is (n_samples_X, n_samples_Y) and D contains
|
||
the pairwise L1 distances.
|
||
|
||
Notes
|
||
-----
|
||
When X and/or Y are CSR sparse matrices and they are not already
|
||
in canonical format, this function modifies them in-place to
|
||
make them canonical.
|
||
|
||
Examples
|
||
--------
|
||
>>> from sklearn.metrics.pairwise import manhattan_distances
|
||
>>> manhattan_distances([[3]], [[3]])
|
||
array([[0.]])
|
||
>>> manhattan_distances([[3]], [[2]])
|
||
array([[1.]])
|
||
>>> manhattan_distances([[2]], [[3]])
|
||
array([[1.]])
|
||
>>> manhattan_distances([[1, 2], [3, 4]],\
|
||
[[1, 2], [0, 3]])
|
||
array([[0., 2.],
|
||
[4., 4.]])
|
||
"""
|
||
# TODO(1.4): remove sum_over_features
|
||
if sum_over_features != "deprecated":
|
||
warnings.warn(
|
||
"`sum_over_features` is deprecated in version 1.2 and will be"
|
||
" removed in version 1.4.",
|
||
FutureWarning,
|
||
)
|
||
else:
|
||
sum_over_features = True
|
||
|
||
X, Y = check_pairwise_arrays(X, Y)
|
||
|
||
if issparse(X) or issparse(Y):
|
||
if not sum_over_features:
|
||
raise TypeError(
|
||
"sum_over_features=%r not supported for sparse matrices"
|
||
% sum_over_features
|
||
)
|
||
|
||
X = csr_matrix(X, copy=False)
|
||
Y = csr_matrix(Y, copy=False)
|
||
X.sum_duplicates() # this also sorts indices in-place
|
||
Y.sum_duplicates()
|
||
D = np.zeros((X.shape[0], Y.shape[0]))
|
||
_sparse_manhattan(X.data, X.indices, X.indptr, Y.data, Y.indices, Y.indptr, D)
|
||
return D
|
||
|
||
if sum_over_features:
|
||
return distance.cdist(X, Y, "cityblock")
|
||
|
||
D = X[:, np.newaxis, :] - Y[np.newaxis, :, :]
|
||
D = np.abs(D, D)
|
||
return D.reshape((-1, X.shape[1]))
|
||
|
||
|
||
def cosine_distances(X, Y=None):
|
||
"""Compute cosine distance between samples in X and Y.
|
||
|
||
Cosine distance is defined as 1.0 minus the cosine similarity.
|
||
|
||
Read more in the :ref:`User Guide <metrics>`.
|
||
|
||
Parameters
|
||
----------
|
||
X : {array-like, sparse matrix} of shape (n_samples_X, n_features)
|
||
Matrix `X`.
|
||
|
||
Y : {array-like, sparse matrix} of shape (n_samples_Y, n_features), \
|
||
default=None
|
||
Matrix `Y`.
|
||
|
||
Returns
|
||
-------
|
||
distance matrix : ndarray of shape (n_samples_X, n_samples_Y)
|
||
Returns the cosine distance between samples in X and Y.
|
||
|
||
See Also
|
||
--------
|
||
cosine_similarity : Compute cosine similarity between samples in X and Y.
|
||
scipy.spatial.distance.cosine : Dense matrices only.
|
||
"""
|
||
# 1.0 - cosine_similarity(X, Y) without copy
|
||
S = cosine_similarity(X, Y)
|
||
S *= -1
|
||
S += 1
|
||
np.clip(S, 0, 2, out=S)
|
||
if X is Y or Y is None:
|
||
# Ensure that distances between vectors and themselves are set to 0.0.
|
||
# This may not be the case due to floating point rounding errors.
|
||
S[np.diag_indices_from(S)] = 0.0
|
||
return S
|
||
|
||
|
||
# Paired distances
|
||
def paired_euclidean_distances(X, Y):
|
||
"""Compute the paired euclidean distances between X and Y.
|
||
|
||
Read more in the :ref:`User Guide <metrics>`.
|
||
|
||
Parameters
|
||
----------
|
||
X : array-like of shape (n_samples, n_features)
|
||
Input array/matrix X.
|
||
|
||
Y : array-like of shape (n_samples, n_features)
|
||
Input array/matrix Y.
|
||
|
||
Returns
|
||
-------
|
||
distances : ndarray of shape (n_samples,)
|
||
Output array/matrix containing the calculated paired euclidean
|
||
distances.
|
||
"""
|
||
X, Y = check_paired_arrays(X, Y)
|
||
return row_norms(X - Y)
|
||
|
||
|
||
def paired_manhattan_distances(X, Y):
|
||
"""Compute the paired L1 distances between X and Y.
|
||
|
||
Distances are calculated between (X[0], Y[0]), (X[1], Y[1]), ...,
|
||
(X[n_samples], Y[n_samples]).
|
||
|
||
Read more in the :ref:`User Guide <metrics>`.
|
||
|
||
Parameters
|
||
----------
|
||
X : array-like of shape (n_samples, n_features)
|
||
An array-like where each row is a sample and each column is a feature.
|
||
|
||
Y : array-like of shape (n_samples, n_features)
|
||
An array-like where each row is a sample and each column is a feature.
|
||
|
||
Returns
|
||
-------
|
||
distances : ndarray of shape (n_samples,)
|
||
L1 paired distances between the row vectors of `X`
|
||
and the row vectors of `Y`.
|
||
|
||
Examples
|
||
--------
|
||
>>> from sklearn.metrics.pairwise import paired_manhattan_distances
|
||
>>> import numpy as np
|
||
>>> X = np.array([[1, 1, 0], [0, 1, 0], [0, 0, 1]])
|
||
>>> Y = np.array([[0, 1, 0], [0, 0, 1], [0, 0, 0]])
|
||
>>> paired_manhattan_distances(X, Y)
|
||
array([1., 2., 1.])
|
||
"""
|
||
X, Y = check_paired_arrays(X, Y)
|
||
diff = X - Y
|
||
if issparse(diff):
|
||
diff.data = np.abs(diff.data)
|
||
return np.squeeze(np.array(diff.sum(axis=1)))
|
||
else:
|
||
return np.abs(diff).sum(axis=-1)
|
||
|
||
|
||
def paired_cosine_distances(X, Y):
|
||
"""
|
||
Compute the paired cosine distances between X and Y.
|
||
|
||
Read more in the :ref:`User Guide <metrics>`.
|
||
|
||
Parameters
|
||
----------
|
||
X : array-like of shape (n_samples, n_features)
|
||
An array where each row is a sample and each column is a feature.
|
||
|
||
Y : array-like of shape (n_samples, n_features)
|
||
An array where each row is a sample and each column is a feature.
|
||
|
||
Returns
|
||
-------
|
||
distances : ndarray of shape (n_samples,)
|
||
Returns the distances between the row vectors of `X`
|
||
and the row vectors of `Y`, where `distances[i]` is the
|
||
distance between `X[i]` and `Y[i]`.
|
||
|
||
Notes
|
||
-----
|
||
The cosine distance is equivalent to the half the squared
|
||
euclidean distance if each sample is normalized to unit norm.
|
||
"""
|
||
X, Y = check_paired_arrays(X, Y)
|
||
return 0.5 * row_norms(normalize(X) - normalize(Y), squared=True)
|
||
|
||
|
||
PAIRED_DISTANCES = {
|
||
"cosine": paired_cosine_distances,
|
||
"euclidean": paired_euclidean_distances,
|
||
"l2": paired_euclidean_distances,
|
||
"l1": paired_manhattan_distances,
|
||
"manhattan": paired_manhattan_distances,
|
||
"cityblock": paired_manhattan_distances,
|
||
}
|
||
|
||
|
||
def paired_distances(X, Y, *, metric="euclidean", **kwds):
|
||
"""
|
||
Compute the paired distances between X and Y.
|
||
|
||
Compute the distances between (X[0], Y[0]), (X[1], Y[1]), etc...
|
||
|
||
Read more in the :ref:`User Guide <metrics>`.
|
||
|
||
Parameters
|
||
----------
|
||
X : ndarray of shape (n_samples, n_features)
|
||
Array 1 for distance computation.
|
||
|
||
Y : ndarray of shape (n_samples, n_features)
|
||
Array 2 for distance computation.
|
||
|
||
metric : str or callable, default="euclidean"
|
||
The metric to use when calculating distance between instances in a
|
||
feature array. If metric is a string, it must be one of the options
|
||
specified in PAIRED_DISTANCES, including "euclidean",
|
||
"manhattan", or "cosine".
|
||
Alternatively, if metric is a callable function, it is called on each
|
||
pair of instances (rows) and the resulting value recorded. The callable
|
||
should take two arrays from `X` as input and return a value indicating
|
||
the distance between them.
|
||
|
||
**kwds : dict
|
||
Unused parameters.
|
||
|
||
Returns
|
||
-------
|
||
distances : ndarray of shape (n_samples,)
|
||
Returns the distances between the row vectors of `X`
|
||
and the row vectors of `Y`.
|
||
|
||
See Also
|
||
--------
|
||
pairwise_distances : Computes the distance between every pair of samples.
|
||
|
||
Examples
|
||
--------
|
||
>>> from sklearn.metrics.pairwise import paired_distances
|
||
>>> X = [[0, 1], [1, 1]]
|
||
>>> Y = [[0, 1], [2, 1]]
|
||
>>> paired_distances(X, Y)
|
||
array([0., 1.])
|
||
"""
|
||
|
||
if metric in PAIRED_DISTANCES:
|
||
func = PAIRED_DISTANCES[metric]
|
||
return func(X, Y)
|
||
elif callable(metric):
|
||
# Check the matrix first (it is usually done by the metric)
|
||
X, Y = check_paired_arrays(X, Y)
|
||
distances = np.zeros(len(X))
|
||
for i in range(len(X)):
|
||
distances[i] = metric(X[i], Y[i])
|
||
return distances
|
||
else:
|
||
raise ValueError("Unknown distance %s" % metric)
|
||
|
||
|
||
# Kernels
|
||
def linear_kernel(X, Y=None, dense_output=True):
|
||
"""
|
||
Compute the linear kernel between X and Y.
|
||
|
||
Read more in the :ref:`User Guide <linear_kernel>`.
|
||
|
||
Parameters
|
||
----------
|
||
X : ndarray of shape (n_samples_X, n_features)
|
||
A feature array.
|
||
|
||
Y : ndarray of shape (n_samples_Y, n_features), default=None
|
||
An optional second feature array. If `None`, uses `Y=X`.
|
||
|
||
dense_output : bool, default=True
|
||
Whether to return dense output even when the input is sparse. If
|
||
``False``, the output is sparse if both input arrays are sparse.
|
||
|
||
.. versionadded:: 0.20
|
||
|
||
Returns
|
||
-------
|
||
Gram matrix : ndarray of shape (n_samples_X, n_samples_Y)
|
||
The Gram matrix of the linear kernel, i.e. `X @ Y.T`.
|
||
"""
|
||
X, Y = check_pairwise_arrays(X, Y)
|
||
return safe_sparse_dot(X, Y.T, dense_output=dense_output)
|
||
|
||
|
||
def polynomial_kernel(X, Y=None, degree=3, gamma=None, coef0=1):
|
||
"""
|
||
Compute the polynomial kernel between X and Y.
|
||
|
||
:math:`K(X, Y) = (gamma <X, Y> + coef0)^{degree}`
|
||
|
||
Read more in the :ref:`User Guide <polynomial_kernel>`.
|
||
|
||
Parameters
|
||
----------
|
||
X : ndarray of shape (n_samples_X, n_features)
|
||
A feature array.
|
||
|
||
Y : ndarray of shape (n_samples_Y, n_features), default=None
|
||
An optional second feature array. If `None`, uses `Y=X`.
|
||
|
||
degree : int, default=3
|
||
Kernel degree.
|
||
|
||
gamma : float, default=None
|
||
Coefficient of the vector inner product. If None, defaults to 1.0 / n_features.
|
||
|
||
coef0 : float, default=1
|
||
Constant offset added to scaled inner product.
|
||
|
||
Returns
|
||
-------
|
||
Gram matrix : ndarray of shape (n_samples_X, n_samples_Y)
|
||
The polynomial kernel.
|
||
"""
|
||
X, Y = check_pairwise_arrays(X, Y)
|
||
if gamma is None:
|
||
gamma = 1.0 / X.shape[1]
|
||
|
||
K = safe_sparse_dot(X, Y.T, dense_output=True)
|
||
K *= gamma
|
||
K += coef0
|
||
K **= degree
|
||
return K
|
||
|
||
|
||
def sigmoid_kernel(X, Y=None, gamma=None, coef0=1):
|
||
"""Compute the sigmoid kernel between X and Y.
|
||
|
||
K(X, Y) = tanh(gamma <X, Y> + coef0)
|
||
|
||
Read more in the :ref:`User Guide <sigmoid_kernel>`.
|
||
|
||
Parameters
|
||
----------
|
||
X : ndarray of shape (n_samples_X, n_features)
|
||
A feature array.
|
||
|
||
Y : ndarray of shape (n_samples_Y, n_features), default=None
|
||
An optional second feature array. If `None`, uses `Y=X`.
|
||
|
||
gamma : float, default=None
|
||
Coefficient of the vector inner product. If None, defaults to 1.0 / n_features.
|
||
|
||
coef0 : float, default=1
|
||
Constant offset added to scaled inner product.
|
||
|
||
Returns
|
||
-------
|
||
Gram matrix : ndarray of shape (n_samples_X, n_samples_Y)
|
||
Sigmoid kernel between two arrays.
|
||
"""
|
||
X, Y = check_pairwise_arrays(X, Y)
|
||
if gamma is None:
|
||
gamma = 1.0 / X.shape[1]
|
||
|
||
K = safe_sparse_dot(X, Y.T, dense_output=True)
|
||
K *= gamma
|
||
K += coef0
|
||
np.tanh(K, K) # compute tanh in-place
|
||
return K
|
||
|
||
|
||
def rbf_kernel(X, Y=None, gamma=None):
|
||
"""Compute the rbf (gaussian) kernel between X and Y.
|
||
|
||
K(x, y) = exp(-gamma ||x-y||^2)
|
||
|
||
for each pair of rows x in X and y in Y.
|
||
|
||
Read more in the :ref:`User Guide <rbf_kernel>`.
|
||
|
||
Parameters
|
||
----------
|
||
X : ndarray of shape (n_samples_X, n_features)
|
||
A feature array.
|
||
|
||
Y : ndarray of shape (n_samples_Y, n_features), default=None
|
||
An optional second feature array. If `None`, uses `Y=X`.
|
||
|
||
gamma : float, default=None
|
||
If None, defaults to 1.0 / n_features.
|
||
|
||
Returns
|
||
-------
|
||
kernel_matrix : ndarray of shape (n_samples_X, n_samples_Y)
|
||
The RBF kernel.
|
||
"""
|
||
X, Y = check_pairwise_arrays(X, Y)
|
||
if gamma is None:
|
||
gamma = 1.0 / X.shape[1]
|
||
|
||
K = euclidean_distances(X, Y, squared=True)
|
||
K *= -gamma
|
||
np.exp(K, K) # exponentiate K in-place
|
||
return K
|
||
|
||
|
||
def laplacian_kernel(X, Y=None, gamma=None):
|
||
"""Compute the laplacian kernel between X and Y.
|
||
|
||
The laplacian kernel is defined as::
|
||
|
||
K(x, y) = exp(-gamma ||x-y||_1)
|
||
|
||
for each pair of rows x in X and y in Y.
|
||
Read more in the :ref:`User Guide <laplacian_kernel>`.
|
||
|
||
.. versionadded:: 0.17
|
||
|
||
Parameters
|
||
----------
|
||
X : ndarray of shape (n_samples_X, n_features)
|
||
A feature array.
|
||
|
||
Y : ndarray of shape (n_samples_Y, n_features), default=None
|
||
An optional second feature array. If `None`, uses `Y=X`.
|
||
|
||
gamma : float, default=None
|
||
If None, defaults to 1.0 / n_features.
|
||
|
||
Returns
|
||
-------
|
||
kernel_matrix : ndarray of shape (n_samples_X, n_samples_Y)
|
||
The kernel matrix.
|
||
"""
|
||
X, Y = check_pairwise_arrays(X, Y)
|
||
if gamma is None:
|
||
gamma = 1.0 / X.shape[1]
|
||
|
||
K = -gamma * manhattan_distances(X, Y)
|
||
np.exp(K, K) # exponentiate K in-place
|
||
return K
|
||
|
||
|
||
def cosine_similarity(X, Y=None, dense_output=True):
|
||
"""Compute cosine similarity between samples in X and Y.
|
||
|
||
Cosine similarity, or the cosine kernel, computes similarity as the
|
||
normalized dot product of X and Y:
|
||
|
||
K(X, Y) = <X, Y> / (||X||*||Y||)
|
||
|
||
On L2-normalized data, this function is equivalent to linear_kernel.
|
||
|
||
Read more in the :ref:`User Guide <cosine_similarity>`.
|
||
|
||
Parameters
|
||
----------
|
||
X : {ndarray, sparse matrix} of shape (n_samples_X, n_features)
|
||
Input data.
|
||
|
||
Y : {ndarray, sparse matrix} of shape (n_samples_Y, n_features), \
|
||
default=None
|
||
Input data. If ``None``, the output will be the pairwise
|
||
similarities between all samples in ``X``.
|
||
|
||
dense_output : bool, default=True
|
||
Whether to return dense output even when the input is sparse. If
|
||
``False``, the output is sparse if both input arrays are sparse.
|
||
|
||
.. versionadded:: 0.17
|
||
parameter ``dense_output`` for dense output.
|
||
|
||
Returns
|
||
-------
|
||
kernel matrix : ndarray of shape (n_samples_X, n_samples_Y)
|
||
Returns the cosine similarity between samples in X and Y.
|
||
"""
|
||
# to avoid recursive import
|
||
|
||
X, Y = check_pairwise_arrays(X, Y)
|
||
|
||
X_normalized = normalize(X, copy=True)
|
||
if X is Y:
|
||
Y_normalized = X_normalized
|
||
else:
|
||
Y_normalized = normalize(Y, copy=True)
|
||
|
||
K = safe_sparse_dot(X_normalized, Y_normalized.T, dense_output=dense_output)
|
||
|
||
return K
|
||
|
||
|
||
def additive_chi2_kernel(X, Y=None):
|
||
"""Compute the additive chi-squared kernel between observations in X and Y.
|
||
|
||
The chi-squared kernel is computed between each pair of rows in X and Y. X
|
||
and Y have to be non-negative. This kernel is most commonly applied to
|
||
histograms.
|
||
|
||
The chi-squared kernel is given by::
|
||
|
||
k(x, y) = -Sum [(x - y)^2 / (x + y)]
|
||
|
||
It can be interpreted as a weighted difference per entry.
|
||
|
||
Read more in the :ref:`User Guide <chi2_kernel>`.
|
||
|
||
Parameters
|
||
----------
|
||
X : array-like of shape (n_samples_X, n_features)
|
||
A feature array.
|
||
|
||
Y : ndarray of shape (n_samples_Y, n_features), default=None
|
||
An optional second feature array. If `None`, uses `Y=X`.
|
||
|
||
Returns
|
||
-------
|
||
kernel_matrix : ndarray of shape (n_samples_X, n_samples_Y)
|
||
The kernel matrix.
|
||
|
||
See Also
|
||
--------
|
||
chi2_kernel : The exponentiated version of the kernel, which is usually
|
||
preferable.
|
||
sklearn.kernel_approximation.AdditiveChi2Sampler : A Fourier approximation
|
||
to this kernel.
|
||
|
||
Notes
|
||
-----
|
||
As the negative of a distance, this kernel is only conditionally positive
|
||
definite.
|
||
|
||
References
|
||
----------
|
||
* Zhang, J. and Marszalek, M. and Lazebnik, S. and Schmid, C.
|
||
Local features and kernels for classification of texture and object
|
||
categories: A comprehensive study
|
||
International Journal of Computer Vision 2007
|
||
https://hal.archives-ouvertes.fr/hal-00171412/document
|
||
"""
|
||
if issparse(X) or issparse(Y):
|
||
raise ValueError("additive_chi2 does not support sparse matrices.")
|
||
X, Y = check_pairwise_arrays(X, Y)
|
||
if (X < 0).any():
|
||
raise ValueError("X contains negative values.")
|
||
if Y is not X and (Y < 0).any():
|
||
raise ValueError("Y contains negative values.")
|
||
|
||
result = np.zeros((X.shape[0], Y.shape[0]), dtype=X.dtype)
|
||
_chi2_kernel_fast(X, Y, result)
|
||
return result
|
||
|
||
|
||
def chi2_kernel(X, Y=None, gamma=1.0):
|
||
"""Compute the exponential chi-squared kernel between X and Y.
|
||
|
||
The chi-squared kernel is computed between each pair of rows in X and Y. X
|
||
and Y have to be non-negative. This kernel is most commonly applied to
|
||
histograms.
|
||
|
||
The chi-squared kernel is given by::
|
||
|
||
k(x, y) = exp(-gamma Sum [(x - y)^2 / (x + y)])
|
||
|
||
It can be interpreted as a weighted difference per entry.
|
||
|
||
Read more in the :ref:`User Guide <chi2_kernel>`.
|
||
|
||
Parameters
|
||
----------
|
||
X : array-like of shape (n_samples_X, n_features)
|
||
A feature array.
|
||
|
||
Y : ndarray of shape (n_samples_Y, n_features), default=None
|
||
An optional second feature array. If `None`, uses `Y=X`.
|
||
|
||
gamma : float, default=1
|
||
Scaling parameter of the chi2 kernel.
|
||
|
||
Returns
|
||
-------
|
||
kernel_matrix : ndarray of shape (n_samples_X, n_samples_Y)
|
||
The kernel matrix.
|
||
|
||
See Also
|
||
--------
|
||
additive_chi2_kernel : The additive version of this kernel.
|
||
sklearn.kernel_approximation.AdditiveChi2Sampler : A Fourier approximation
|
||
to the additive version of this kernel.
|
||
|
||
References
|
||
----------
|
||
* Zhang, J. and Marszalek, M. and Lazebnik, S. and Schmid, C.
|
||
Local features and kernels for classification of texture and object
|
||
categories: A comprehensive study
|
||
International Journal of Computer Vision 2007
|
||
https://hal.archives-ouvertes.fr/hal-00171412/document
|
||
"""
|
||
K = additive_chi2_kernel(X, Y)
|
||
K *= gamma
|
||
return np.exp(K, K)
|
||
|
||
|
||
# Helper functions - distance
|
||
PAIRWISE_DISTANCE_FUNCTIONS = {
|
||
# If updating this dictionary, update the doc in both distance_metrics()
|
||
# and also in pairwise_distances()!
|
||
"cityblock": manhattan_distances,
|
||
"cosine": cosine_distances,
|
||
"euclidean": euclidean_distances,
|
||
"haversine": haversine_distances,
|
||
"l2": euclidean_distances,
|
||
"l1": manhattan_distances,
|
||
"manhattan": manhattan_distances,
|
||
"precomputed": None, # HACK: precomputed is always allowed, never called
|
||
"nan_euclidean": nan_euclidean_distances,
|
||
}
|
||
|
||
|
||
def distance_metrics():
|
||
"""Valid metrics for pairwise_distances.
|
||
|
||
This function simply returns the valid pairwise distance metrics.
|
||
It exists to allow for a description of the mapping for
|
||
each of the valid strings.
|
||
|
||
The valid distance metrics, and the function they map to, are:
|
||
|
||
=============== ========================================
|
||
metric Function
|
||
=============== ========================================
|
||
'cityblock' metrics.pairwise.manhattan_distances
|
||
'cosine' metrics.pairwise.cosine_distances
|
||
'euclidean' metrics.pairwise.euclidean_distances
|
||
'haversine' metrics.pairwise.haversine_distances
|
||
'l1' metrics.pairwise.manhattan_distances
|
||
'l2' metrics.pairwise.euclidean_distances
|
||
'manhattan' metrics.pairwise.manhattan_distances
|
||
'nan_euclidean' metrics.pairwise.nan_euclidean_distances
|
||
=============== ========================================
|
||
|
||
Read more in the :ref:`User Guide <metrics>`.
|
||
|
||
Returns
|
||
-------
|
||
distance_metrics : dict
|
||
Returns valid metrics for pairwise_distances.
|
||
"""
|
||
return PAIRWISE_DISTANCE_FUNCTIONS
|
||
|
||
|
||
def _dist_wrapper(dist_func, dist_matrix, slice_, *args, **kwargs):
|
||
"""Write in-place to a slice of a distance matrix."""
|
||
dist_matrix[:, slice_] = dist_func(*args, **kwargs)
|
||
|
||
|
||
def _parallel_pairwise(X, Y, func, n_jobs, **kwds):
|
||
"""Break the pairwise matrix in n_jobs even slices
|
||
and compute them in parallel."""
|
||
|
||
if Y is None:
|
||
Y = X
|
||
X, Y, dtype = _return_float_dtype(X, Y)
|
||
|
||
if effective_n_jobs(n_jobs) == 1:
|
||
return func(X, Y, **kwds)
|
||
|
||
# enforce a threading backend to prevent data communication overhead
|
||
fd = delayed(_dist_wrapper)
|
||
ret = np.empty((X.shape[0], Y.shape[0]), dtype=dtype, order="F")
|
||
Parallel(backend="threading", n_jobs=n_jobs)(
|
||
fd(func, ret, s, X, Y[s], **kwds)
|
||
for s in gen_even_slices(_num_samples(Y), effective_n_jobs(n_jobs))
|
||
)
|
||
|
||
if (X is Y or Y is None) and func is euclidean_distances:
|
||
# zeroing diagonal for euclidean norm.
|
||
# TODO: do it also for other norms.
|
||
np.fill_diagonal(ret, 0)
|
||
|
||
return ret
|
||
|
||
|
||
def _pairwise_callable(X, Y, metric, force_all_finite=True, **kwds):
|
||
"""Handle the callable case for pairwise_{distances,kernels}."""
|
||
X, Y = check_pairwise_arrays(X, Y, force_all_finite=force_all_finite)
|
||
|
||
if X is Y:
|
||
# Only calculate metric for upper triangle
|
||
out = np.zeros((X.shape[0], Y.shape[0]), dtype="float")
|
||
iterator = itertools.combinations(range(X.shape[0]), 2)
|
||
for i, j in iterator:
|
||
out[i, j] = metric(X[i], Y[j], **kwds)
|
||
|
||
# Make symmetric
|
||
# NB: out += out.T will produce incorrect results
|
||
out = out + out.T
|
||
|
||
# Calculate diagonal
|
||
# NB: nonzero diagonals are allowed for both metrics and kernels
|
||
for i in range(X.shape[0]):
|
||
x = X[i]
|
||
out[i, i] = metric(x, x, **kwds)
|
||
|
||
else:
|
||
# Calculate all cells
|
||
out = np.empty((X.shape[0], Y.shape[0]), dtype="float")
|
||
iterator = itertools.product(range(X.shape[0]), range(Y.shape[0]))
|
||
for i, j in iterator:
|
||
out[i, j] = metric(X[i], Y[j], **kwds)
|
||
|
||
return out
|
||
|
||
|
||
_VALID_METRICS = [
|
||
"euclidean",
|
||
"l2",
|
||
"l1",
|
||
"manhattan",
|
||
"cityblock",
|
||
"braycurtis",
|
||
"canberra",
|
||
"chebyshev",
|
||
"correlation",
|
||
"cosine",
|
||
"dice",
|
||
"hamming",
|
||
"jaccard",
|
||
"kulsinski",
|
||
"mahalanobis",
|
||
"matching",
|
||
"minkowski",
|
||
"rogerstanimoto",
|
||
"russellrao",
|
||
"seuclidean",
|
||
"sokalmichener",
|
||
"sokalsneath",
|
||
"sqeuclidean",
|
||
"yule",
|
||
"wminkowski",
|
||
"nan_euclidean",
|
||
"haversine",
|
||
]
|
||
|
||
_NAN_METRICS = ["nan_euclidean"]
|
||
|
||
|
||
def _check_chunk_size(reduced, chunk_size):
|
||
"""Checks chunk is a sequence of expected size or a tuple of same."""
|
||
if reduced is None:
|
||
return
|
||
is_tuple = isinstance(reduced, tuple)
|
||
if not is_tuple:
|
||
reduced = (reduced,)
|
||
if any(isinstance(r, tuple) or not hasattr(r, "__iter__") for r in reduced):
|
||
raise TypeError(
|
||
"reduce_func returned %r. Expected sequence(s) of length %d."
|
||
% (reduced if is_tuple else reduced[0], chunk_size)
|
||
)
|
||
if any(_num_samples(r) != chunk_size for r in reduced):
|
||
actual_size = tuple(_num_samples(r) for r in reduced)
|
||
raise ValueError(
|
||
"reduce_func returned object of length %s. "
|
||
"Expected same length as input: %d."
|
||
% (actual_size if is_tuple else actual_size[0], chunk_size)
|
||
)
|
||
|
||
|
||
def _precompute_metric_params(X, Y, metric=None, **kwds):
|
||
"""Precompute data-derived metric parameters if not provided."""
|
||
if metric == "seuclidean" and "V" not in kwds:
|
||
# There is a bug in scipy < 1.5 that will cause a crash if
|
||
# X.dtype != np.double (float64). See PR #15730
|
||
dtype = np.float64 if sp_version < parse_version("1.5") else None
|
||
if X is Y:
|
||
V = np.var(X, axis=0, ddof=1, dtype=dtype)
|
||
else:
|
||
raise ValueError(
|
||
"The 'V' parameter is required for the seuclidean metric "
|
||
"when Y is passed."
|
||
)
|
||
return {"V": V}
|
||
if metric == "mahalanobis" and "VI" not in kwds:
|
||
if X is Y:
|
||
VI = np.linalg.inv(np.cov(X.T)).T
|
||
else:
|
||
raise ValueError(
|
||
"The 'VI' parameter is required for the mahalanobis metric "
|
||
"when Y is passed."
|
||
)
|
||
return {"VI": VI}
|
||
return {}
|
||
|
||
|
||
def pairwise_distances_chunked(
|
||
X,
|
||
Y=None,
|
||
*,
|
||
reduce_func=None,
|
||
metric="euclidean",
|
||
n_jobs=None,
|
||
working_memory=None,
|
||
**kwds,
|
||
):
|
||
"""Generate a distance matrix chunk by chunk with optional reduction.
|
||
|
||
In cases where not all of a pairwise distance matrix needs to be
|
||
stored at once, this is used to calculate pairwise distances in
|
||
``working_memory``-sized chunks. If ``reduce_func`` is given, it is
|
||
run on each chunk and its return values are concatenated into lists,
|
||
arrays or sparse matrices.
|
||
|
||
Parameters
|
||
----------
|
||
X : ndarray of shape (n_samples_X, n_samples_X) or \
|
||
(n_samples_X, n_features)
|
||
Array of pairwise distances between samples, or a feature array.
|
||
The shape the array should be (n_samples_X, n_samples_X) if
|
||
metric='precomputed' and (n_samples_X, n_features) otherwise.
|
||
|
||
Y : ndarray of shape (n_samples_Y, n_features), default=None
|
||
An optional second feature array. Only allowed if
|
||
metric != "precomputed".
|
||
|
||
reduce_func : callable, default=None
|
||
The function which is applied on each chunk of the distance matrix,
|
||
reducing it to needed values. ``reduce_func(D_chunk, start)``
|
||
is called repeatedly, where ``D_chunk`` is a contiguous vertical
|
||
slice of the pairwise distance matrix, starting at row ``start``.
|
||
It should return one of: None; an array, a list, or a sparse matrix
|
||
of length ``D_chunk.shape[0]``; or a tuple of such objects.
|
||
Returning None is useful for in-place operations, rather than
|
||
reductions.
|
||
|
||
If None, pairwise_distances_chunked returns a generator of vertical
|
||
chunks of the distance matrix.
|
||
|
||
metric : str or callable, default='euclidean'
|
||
The metric to use when calculating distance between instances in a
|
||
feature array. If metric is a string, it must be one of the options
|
||
allowed by scipy.spatial.distance.pdist for its metric parameter,
|
||
or a metric listed in pairwise.PAIRWISE_DISTANCE_FUNCTIONS.
|
||
If metric is "precomputed", X is assumed to be a distance matrix.
|
||
Alternatively, if metric is a callable function, it is called on
|
||
each pair of instances (rows) and the resulting value recorded.
|
||
The callable should take two arrays from X as input and return a
|
||
value indicating the distance between them.
|
||
|
||
n_jobs : int, default=None
|
||
The number of jobs to use for the computation. This works by
|
||
breaking down the pairwise matrix into n_jobs even slices and
|
||
computing them in parallel.
|
||
|
||
``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.
|
||
``-1`` means using all processors. See :term:`Glossary <n_jobs>`
|
||
for more details.
|
||
|
||
working_memory : int, default=None
|
||
The sought maximum memory for temporary distance matrix chunks.
|
||
When None (default), the value of
|
||
``sklearn.get_config()['working_memory']`` is used.
|
||
|
||
**kwds : optional keyword parameters
|
||
Any further parameters are passed directly to the distance function.
|
||
If using a scipy.spatial.distance metric, the parameters are still
|
||
metric dependent. See the scipy docs for usage examples.
|
||
|
||
Yields
|
||
------
|
||
D_chunk : {ndarray, sparse matrix}
|
||
A contiguous slice of distance matrix, optionally processed by
|
||
``reduce_func``.
|
||
|
||
Examples
|
||
--------
|
||
Without reduce_func:
|
||
|
||
>>> import numpy as np
|
||
>>> from sklearn.metrics import pairwise_distances_chunked
|
||
>>> X = np.random.RandomState(0).rand(5, 3)
|
||
>>> D_chunk = next(pairwise_distances_chunked(X))
|
||
>>> D_chunk
|
||
array([[0. ..., 0.29..., 0.41..., 0.19..., 0.57...],
|
||
[0.29..., 0. ..., 0.57..., 0.41..., 0.76...],
|
||
[0.41..., 0.57..., 0. ..., 0.44..., 0.90...],
|
||
[0.19..., 0.41..., 0.44..., 0. ..., 0.51...],
|
||
[0.57..., 0.76..., 0.90..., 0.51..., 0. ...]])
|
||
|
||
Retrieve all neighbors and average distance within radius r:
|
||
|
||
>>> r = .2
|
||
>>> def reduce_func(D_chunk, start):
|
||
... neigh = [np.flatnonzero(d < r) for d in D_chunk]
|
||
... avg_dist = (D_chunk * (D_chunk < r)).mean(axis=1)
|
||
... return neigh, avg_dist
|
||
>>> gen = pairwise_distances_chunked(X, reduce_func=reduce_func)
|
||
>>> neigh, avg_dist = next(gen)
|
||
>>> neigh
|
||
[array([0, 3]), array([1]), array([2]), array([0, 3]), array([4])]
|
||
>>> avg_dist
|
||
array([0.039..., 0. , 0. , 0.039..., 0. ])
|
||
|
||
Where r is defined per sample, we need to make use of ``start``:
|
||
|
||
>>> r = [.2, .4, .4, .3, .1]
|
||
>>> def reduce_func(D_chunk, start):
|
||
... neigh = [np.flatnonzero(d < r[i])
|
||
... for i, d in enumerate(D_chunk, start)]
|
||
... return neigh
|
||
>>> neigh = next(pairwise_distances_chunked(X, reduce_func=reduce_func))
|
||
>>> neigh
|
||
[array([0, 3]), array([0, 1]), array([2]), array([0, 3]), array([4])]
|
||
|
||
Force row-by-row generation by reducing ``working_memory``:
|
||
|
||
>>> gen = pairwise_distances_chunked(X, reduce_func=reduce_func,
|
||
... working_memory=0)
|
||
>>> next(gen)
|
||
[array([0, 3])]
|
||
>>> next(gen)
|
||
[array([0, 1])]
|
||
"""
|
||
n_samples_X = _num_samples(X)
|
||
if metric == "precomputed":
|
||
slices = (slice(0, n_samples_X),)
|
||
else:
|
||
if Y is None:
|
||
Y = X
|
||
# We get as many rows as possible within our working_memory budget to
|
||
# store len(Y) distances in each row of output.
|
||
#
|
||
# Note:
|
||
# - this will get at least 1 row, even if 1 row of distances will
|
||
# exceed working_memory.
|
||
# - this does not account for any temporary memory usage while
|
||
# calculating distances (e.g. difference of vectors in manhattan
|
||
# distance.
|
||
chunk_n_rows = get_chunk_n_rows(
|
||
row_bytes=8 * _num_samples(Y),
|
||
max_n_rows=n_samples_X,
|
||
working_memory=working_memory,
|
||
)
|
||
slices = gen_batches(n_samples_X, chunk_n_rows)
|
||
|
||
# precompute data-derived metric params
|
||
params = _precompute_metric_params(X, Y, metric=metric, **kwds)
|
||
kwds.update(**params)
|
||
|
||
for sl in slices:
|
||
if sl.start == 0 and sl.stop == n_samples_X:
|
||
X_chunk = X # enable optimised paths for X is Y
|
||
else:
|
||
X_chunk = X[sl]
|
||
D_chunk = pairwise_distances(X_chunk, Y, metric=metric, n_jobs=n_jobs, **kwds)
|
||
if (X is Y or Y is None) and PAIRWISE_DISTANCE_FUNCTIONS.get(
|
||
metric, None
|
||
) is euclidean_distances:
|
||
# zeroing diagonal, taking care of aliases of "euclidean",
|
||
# i.e. "l2"
|
||
D_chunk.flat[sl.start :: _num_samples(X) + 1] = 0
|
||
if reduce_func is not None:
|
||
chunk_size = D_chunk.shape[0]
|
||
D_chunk = reduce_func(D_chunk, sl.start)
|
||
_check_chunk_size(D_chunk, chunk_size)
|
||
yield D_chunk
|
||
|
||
|
||
def pairwise_distances(
|
||
X, Y=None, metric="euclidean", *, n_jobs=None, force_all_finite=True, **kwds
|
||
):
|
||
"""Compute the distance matrix from a vector array X and optional Y.
|
||
|
||
This method takes either a vector array or a distance matrix, and returns
|
||
a distance matrix. If the input is a vector array, the distances are
|
||
computed. If the input is a distances matrix, it is returned instead.
|
||
|
||
This method provides a safe way to take a distance matrix as input, while
|
||
preserving compatibility with many other algorithms that take a vector
|
||
array.
|
||
|
||
If Y is given (default is None), then the returned matrix is the pairwise
|
||
distance between the arrays from both X and Y.
|
||
|
||
Valid values for metric are:
|
||
|
||
- From scikit-learn: ['cityblock', 'cosine', 'euclidean', 'l1', 'l2',
|
||
'manhattan']. These metrics support sparse matrix
|
||
inputs.
|
||
['nan_euclidean'] but it does not yet support sparse matrices.
|
||
|
||
- From scipy.spatial.distance: ['braycurtis', 'canberra', 'chebyshev',
|
||
'correlation', 'dice', 'hamming', 'jaccard', 'kulsinski', 'mahalanobis',
|
||
'minkowski', 'rogerstanimoto', 'russellrao', 'seuclidean',
|
||
'sokalmichener', 'sokalsneath', 'sqeuclidean', 'yule']
|
||
See the documentation for scipy.spatial.distance for details on these
|
||
metrics. These metrics do not support sparse matrix inputs.
|
||
|
||
Note that in the case of 'cityblock', 'cosine' and 'euclidean' (which are
|
||
valid scipy.spatial.distance metrics), the scikit-learn implementation
|
||
will be used, which is faster and has support for sparse matrices (except
|
||
for 'cityblock'). For a verbose description of the metrics from
|
||
scikit-learn, see :func:`sklearn.metrics.pairwise.distance_metrics`
|
||
function.
|
||
|
||
Read more in the :ref:`User Guide <metrics>`.
|
||
|
||
Parameters
|
||
----------
|
||
X : ndarray of shape (n_samples_X, n_samples_X) or \
|
||
(n_samples_X, n_features)
|
||
Array of pairwise distances between samples, or a feature array.
|
||
The shape of the array should be (n_samples_X, n_samples_X) if
|
||
metric == "precomputed" and (n_samples_X, n_features) otherwise.
|
||
|
||
Y : ndarray of shape (n_samples_Y, n_features), default=None
|
||
An optional second feature array. Only allowed if
|
||
metric != "precomputed".
|
||
|
||
metric : str or callable, default='euclidean'
|
||
The metric to use when calculating distance between instances in a
|
||
feature array. If metric is a string, it must be one of the options
|
||
allowed by scipy.spatial.distance.pdist for its metric parameter, or
|
||
a metric listed in ``pairwise.PAIRWISE_DISTANCE_FUNCTIONS``.
|
||
If metric is "precomputed", X is assumed to be a distance matrix.
|
||
Alternatively, if metric is a callable function, it is called on each
|
||
pair of instances (rows) and the resulting value recorded. The callable
|
||
should take two arrays from X as input and return a value indicating
|
||
the distance between them.
|
||
|
||
n_jobs : int, default=None
|
||
The number of jobs to use for the computation. This works by breaking
|
||
down the pairwise matrix into n_jobs even slices and computing them in
|
||
parallel.
|
||
|
||
``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.
|
||
``-1`` means using all processors. See :term:`Glossary <n_jobs>`
|
||
for more details.
|
||
|
||
force_all_finite : bool or 'allow-nan', default=True
|
||
Whether to raise an error on np.inf, np.nan, pd.NA in array. Ignored
|
||
for a metric listed in ``pairwise.PAIRWISE_DISTANCE_FUNCTIONS``. The
|
||
possibilities are:
|
||
|
||
- True: Force all values of array to be finite.
|
||
- False: accepts np.inf, np.nan, pd.NA in array.
|
||
- 'allow-nan': accepts only np.nan and pd.NA values in array. Values
|
||
cannot be infinite.
|
||
|
||
.. versionadded:: 0.22
|
||
``force_all_finite`` accepts the string ``'allow-nan'``.
|
||
|
||
.. versionchanged:: 0.23
|
||
Accepts `pd.NA` and converts it into `np.nan`.
|
||
|
||
**kwds : optional keyword parameters
|
||
Any further parameters are passed directly to the distance function.
|
||
If using a scipy.spatial.distance metric, the parameters are still
|
||
metric dependent. See the scipy docs for usage examples.
|
||
|
||
Returns
|
||
-------
|
||
D : ndarray of shape (n_samples_X, n_samples_X) or \
|
||
(n_samples_X, n_samples_Y)
|
||
A distance matrix D such that D_{i, j} is the distance between the
|
||
ith and jth vectors of the given matrix X, if Y is None.
|
||
If Y is not None, then D_{i, j} is the distance between the ith array
|
||
from X and the jth array from Y.
|
||
|
||
See Also
|
||
--------
|
||
pairwise_distances_chunked : Performs the same calculation as this
|
||
function, but returns a generator of chunks of the distance matrix, in
|
||
order to limit memory usage.
|
||
paired_distances : Computes the distances between corresponding elements
|
||
of two arrays.
|
||
"""
|
||
if (
|
||
metric not in _VALID_METRICS
|
||
and not callable(metric)
|
||
and metric != "precomputed"
|
||
):
|
||
raise ValueError(
|
||
"Unknown metric %s. Valid metrics are %s, or 'precomputed', or a callable"
|
||
% (metric, _VALID_METRICS)
|
||
)
|
||
|
||
if metric == "precomputed":
|
||
X, _ = check_pairwise_arrays(
|
||
X, Y, precomputed=True, force_all_finite=force_all_finite
|
||
)
|
||
|
||
whom = (
|
||
"`pairwise_distances`. Precomputed distance "
|
||
" need to have non-negative values."
|
||
)
|
||
check_non_negative(X, whom=whom)
|
||
return X
|
||
elif metric in PAIRWISE_DISTANCE_FUNCTIONS:
|
||
func = PAIRWISE_DISTANCE_FUNCTIONS[metric]
|
||
elif callable(metric):
|
||
func = partial(
|
||
_pairwise_callable, metric=metric, force_all_finite=force_all_finite, **kwds
|
||
)
|
||
else:
|
||
if issparse(X) or issparse(Y):
|
||
raise TypeError("scipy distance metrics do not support sparse matrices.")
|
||
|
||
dtype = bool if metric in PAIRWISE_BOOLEAN_FUNCTIONS else None
|
||
|
||
if dtype == bool and (X.dtype != bool or (Y is not None and Y.dtype != bool)):
|
||
msg = "Data was converted to boolean for metric %s" % metric
|
||
warnings.warn(msg, DataConversionWarning)
|
||
|
||
X, Y = check_pairwise_arrays(
|
||
X, Y, dtype=dtype, force_all_finite=force_all_finite
|
||
)
|
||
|
||
# precompute data-derived metric params
|
||
params = _precompute_metric_params(X, Y, metric=metric, **kwds)
|
||
kwds.update(**params)
|
||
|
||
if effective_n_jobs(n_jobs) == 1 and X is Y:
|
||
return distance.squareform(distance.pdist(X, metric=metric, **kwds))
|
||
func = partial(distance.cdist, metric=metric, **kwds)
|
||
|
||
return _parallel_pairwise(X, Y, func, n_jobs, **kwds)
|
||
|
||
|
||
# These distances require boolean arrays, when using scipy.spatial.distance
|
||
PAIRWISE_BOOLEAN_FUNCTIONS = [
|
||
"dice",
|
||
"jaccard",
|
||
"kulsinski",
|
||
"matching",
|
||
"rogerstanimoto",
|
||
"russellrao",
|
||
"sokalmichener",
|
||
"sokalsneath",
|
||
"yule",
|
||
]
|
||
|
||
# Helper functions - distance
|
||
PAIRWISE_KERNEL_FUNCTIONS = {
|
||
# If updating this dictionary, update the doc in both distance_metrics()
|
||
# and also in pairwise_distances()!
|
||
"additive_chi2": additive_chi2_kernel,
|
||
"chi2": chi2_kernel,
|
||
"linear": linear_kernel,
|
||
"polynomial": polynomial_kernel,
|
||
"poly": polynomial_kernel,
|
||
"rbf": rbf_kernel,
|
||
"laplacian": laplacian_kernel,
|
||
"sigmoid": sigmoid_kernel,
|
||
"cosine": cosine_similarity,
|
||
}
|
||
|
||
|
||
def kernel_metrics():
|
||
"""Valid metrics for pairwise_kernels.
|
||
|
||
This function simply returns the valid pairwise distance metrics.
|
||
It exists, however, to allow for a verbose description of the mapping for
|
||
each of the valid strings.
|
||
|
||
The valid distance metrics, and the function they map to, are:
|
||
=============== ========================================
|
||
metric Function
|
||
=============== ========================================
|
||
'additive_chi2' sklearn.pairwise.additive_chi2_kernel
|
||
'chi2' sklearn.pairwise.chi2_kernel
|
||
'linear' sklearn.pairwise.linear_kernel
|
||
'poly' sklearn.pairwise.polynomial_kernel
|
||
'polynomial' sklearn.pairwise.polynomial_kernel
|
||
'rbf' sklearn.pairwise.rbf_kernel
|
||
'laplacian' sklearn.pairwise.laplacian_kernel
|
||
'sigmoid' sklearn.pairwise.sigmoid_kernel
|
||
'cosine' sklearn.pairwise.cosine_similarity
|
||
=============== ========================================
|
||
|
||
Read more in the :ref:`User Guide <metrics>`.
|
||
|
||
Returns
|
||
-------
|
||
kernal_metrics : dict
|
||
Returns valid metrics for pairwise_kernels.
|
||
"""
|
||
return PAIRWISE_KERNEL_FUNCTIONS
|
||
|
||
|
||
KERNEL_PARAMS = {
|
||
"additive_chi2": (),
|
||
"chi2": frozenset(["gamma"]),
|
||
"cosine": (),
|
||
"linear": (),
|
||
"poly": frozenset(["gamma", "degree", "coef0"]),
|
||
"polynomial": frozenset(["gamma", "degree", "coef0"]),
|
||
"rbf": frozenset(["gamma"]),
|
||
"laplacian": frozenset(["gamma"]),
|
||
"sigmoid": frozenset(["gamma", "coef0"]),
|
||
}
|
||
|
||
|
||
def pairwise_kernels(
|
||
X, Y=None, metric="linear", *, filter_params=False, n_jobs=None, **kwds
|
||
):
|
||
"""Compute the kernel between arrays X and optional array Y.
|
||
|
||
This method takes either a vector array or a kernel matrix, and returns
|
||
a kernel matrix. If the input is a vector array, the kernels are
|
||
computed. If the input is a kernel matrix, it is returned instead.
|
||
|
||
This method provides a safe way to take a kernel matrix as input, while
|
||
preserving compatibility with many other algorithms that take a vector
|
||
array.
|
||
|
||
If Y is given (default is None), then the returned matrix is the pairwise
|
||
kernel between the arrays from both X and Y.
|
||
|
||
Valid values for metric are:
|
||
['additive_chi2', 'chi2', 'linear', 'poly', 'polynomial', 'rbf',
|
||
'laplacian', 'sigmoid', 'cosine']
|
||
|
||
Read more in the :ref:`User Guide <metrics>`.
|
||
|
||
Parameters
|
||
----------
|
||
X : ndarray of shape (n_samples_X, n_samples_X) or (n_samples_X, n_features)
|
||
Array of pairwise kernels between samples, or a feature array.
|
||
The shape of the array should be (n_samples_X, n_samples_X) if
|
||
metric == "precomputed" and (n_samples_X, n_features) otherwise.
|
||
|
||
Y : ndarray of shape (n_samples_Y, n_features), default=None
|
||
A second feature array only if X has shape (n_samples_X, n_features).
|
||
|
||
metric : str or callable, default="linear"
|
||
The metric to use when calculating kernel between instances in a
|
||
feature array. If metric is a string, it must be one of the metrics
|
||
in pairwise.PAIRWISE_KERNEL_FUNCTIONS.
|
||
If metric is "precomputed", X is assumed to be a kernel matrix.
|
||
Alternatively, if metric is a callable function, it is called on each
|
||
pair of instances (rows) and the resulting value recorded. The callable
|
||
should take two rows from X as input and return the corresponding
|
||
kernel value as a single number. This means that callables from
|
||
:mod:`sklearn.metrics.pairwise` are not allowed, as they operate on
|
||
matrices, not single samples. Use the string identifying the kernel
|
||
instead.
|
||
|
||
filter_params : bool, default=False
|
||
Whether to filter invalid parameters or not.
|
||
|
||
n_jobs : int, default=None
|
||
The number of jobs to use for the computation. This works by breaking
|
||
down the pairwise matrix into n_jobs even slices and computing them in
|
||
parallel.
|
||
|
||
``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.
|
||
``-1`` means using all processors. See :term:`Glossary <n_jobs>`
|
||
for more details.
|
||
|
||
**kwds : optional keyword parameters
|
||
Any further parameters are passed directly to the kernel function.
|
||
|
||
Returns
|
||
-------
|
||
K : ndarray of shape (n_samples_X, n_samples_X) or (n_samples_X, n_samples_Y)
|
||
A kernel matrix K such that K_{i, j} is the kernel between the
|
||
ith and jth vectors of the given matrix X, if Y is None.
|
||
If Y is not None, then K_{i, j} is the kernel between the ith array
|
||
from X and the jth array from Y.
|
||
|
||
Notes
|
||
-----
|
||
If metric is 'precomputed', Y is ignored and X is returned.
|
||
"""
|
||
# import GPKernel locally to prevent circular imports
|
||
from ..gaussian_process.kernels import Kernel as GPKernel
|
||
|
||
if metric == "precomputed":
|
||
X, _ = check_pairwise_arrays(X, Y, precomputed=True)
|
||
return X
|
||
elif isinstance(metric, GPKernel):
|
||
func = metric.__call__
|
||
elif metric in PAIRWISE_KERNEL_FUNCTIONS:
|
||
if filter_params:
|
||
kwds = {k: kwds[k] for k in kwds if k in KERNEL_PARAMS[metric]}
|
||
func = PAIRWISE_KERNEL_FUNCTIONS[metric]
|
||
elif callable(metric):
|
||
func = partial(_pairwise_callable, metric=metric, **kwds)
|
||
else:
|
||
raise ValueError("Unknown kernel %r" % metric)
|
||
|
||
return _parallel_pairwise(X, Y, func, n_jobs, **kwds)
|