Inzynierka/Lib/site-packages/sklearn/metrics/tests/test_pairwise_distances_reduction.py
2023-06-02 12:51:02 +02:00

1229 lines
40 KiB
Python

import itertools
import re
import warnings
from collections import defaultdict
import numpy as np
import pytest
import threadpoolctl
from math import log10, floor
from scipy.sparse import csr_matrix
from scipy.spatial.distance import cdist
from sklearn.metrics._pairwise_distances_reduction import (
BaseDistancesReductionDispatcher,
ArgKmin,
RadiusNeighbors,
sqeuclidean_row_norms,
)
from sklearn.metrics import euclidean_distances
from sklearn.utils.fixes import sp_version, parse_version
from sklearn.utils._testing import (
assert_array_equal,
assert_allclose,
create_memmap_backed_data,
)
# Common supported metric between scipy.spatial.distance.cdist
# and BaseDistanceReductionDispatcher.
# This allows constructing tests to check consistency of results
# of concrete BaseDistanceReductionDispatcher on some metrics using APIs
# from scipy and numpy.
# NOTE: each entry must be accepted verbatim by both scipy's cdist and the
# sklearn dispatchers; metric-specific kwargs come from
# _get_metric_params_list below.
CDIST_PAIRWISE_DISTANCES_REDUCTION_COMMON_METRICS = [
    "braycurtis",
    "canberra",
    "chebyshev",
    "cityblock",
    "euclidean",
    "minkowski",
    "seuclidean",
]
def _get_metric_params_list(metric: str, n_features: int, seed: int = 1):
"""Return list of dummy DistanceMetric kwargs for tests."""
# Distinguishing on cases not to compute unneeded datastructures.
rng = np.random.RandomState(seed)
if metric == "minkowski":
minkowski_kwargs = [dict(p=1.5), dict(p=2), dict(p=3), dict(p=np.inf)]
if sp_version >= parse_version("1.8.0.dev0"):
# TODO: remove the test once we no longer support scipy < 1.8.0.
# Recent scipy versions accept weights in the Minkowski metric directly:
# type: ignore
minkowski_kwargs.append(dict(p=3, w=rng.rand(n_features)))
return minkowski_kwargs
# TODO: remove this case for "wminkowski" once we no longer support scipy < 1.8.0.
if metric == "wminkowski":
weights = rng.random_sample(n_features)
weights /= weights.sum()
wminkowski_kwargs = [dict(p=1.5, w=weights)]
if sp_version < parse_version("1.8.0.dev0"):
# wminkowski was removed in scipy 1.8.0 but should work for previous
# versions.
wminkowski_kwargs.append(dict(p=3, w=rng.rand(n_features)))
return wminkowski_kwargs
if metric == "seuclidean":
return [dict(V=rng.rand(n_features))]
# Case of: "euclidean", "manhattan", "chebyshev", "haversine" or any other metric.
# In those cases, no kwargs is needed.
return [{}]
def assert_argkmin_results_equality(ref_dist, dist, ref_indices, indices, rtol=1e-7):
    """Assert two argkmin results match: identical indices, allclose distances."""
    indices_err = "Query vectors have different neighbors' indices"
    distances_err = "Query vectors have different neighbors' distances"
    # Indices must be strictly identical; distances may differ within rtol.
    assert_array_equal(ref_indices, indices, err_msg=indices_err)
    assert_allclose(ref_dist, dist, err_msg=distances_err, rtol=rtol)
def relative_rounding(scalar, n_significant_digits):
    """Round a scalar to a number of significant digits relatively to its value."""
    if scalar == 0:
        return 0.0
    # Digits of |scalar| before the decimal point (negative if |scalar| < 1):
    # used to translate "significant digits" into round()'s ndigits argument.
    magnitude = int(floor(log10(abs(scalar)))) + 1
    ndigits = n_significant_digits - magnitude
    return round(scalar, ndigits)
def test_relative_rounding():
    """Check relative_rounding on zero, large integers and floats."""
    # (scalar, n_significant_digits, expected) triplets.
    cases = [
        (0, 1, 0.0),
        (0, 10, 0.0),
        (0, 123456, 0.0),
        (123456789, 0, 0),
        (123456789, 2, 120000000),
        (123456789, 3, 123000000),
        (123456789, 10, 123456789),
        (123456789, 20, 123456789),
        (1.23456789, 2, 1.2),
        (1.23456789, 3, 1.23),
        (1.23456789, 10, 1.23456789),
        (123.456789, 3, 123.0),
        (123.456789, 9, 123.456789),
        (123.456789, 10, 123.456789),
    ]
    for scalar, n_digits, expected in cases:
        assert relative_rounding(scalar, n_digits) == expected
def assert_argkmin_results_quasi_equality(
    ref_dist,
    dist,
    ref_indices,
    indices,
    rtol=1e-4,
):
    """Assert that argkmin results are valid up to:
      - relative tolerance on computed distance values
      - permutations of indices for distances values that differ up to
        a precision level

    To be used for testing neighbors queries on float32 datasets: we
    accept neighbors rank swaps only if they are caused by small
    rounding errors on the distance computations.
    """

    def _is_sorted(a):
        return np.all(a[:-1] <= a[1:])

    # Two distances agreeing on this many significant digits are "tied".
    n_significant_digits = -(int(floor(log10(abs(rtol)))) + 1)
    assert (
        ref_dist.shape == dist.shape == ref_indices.shape == indices.shape
    ), "Arrays of results have various shapes."
    n_queries = ref_dist.shape[0]
    # Asserting equality results one row at a time
    for query_idx in range(n_queries):
        ref_dist_row = ref_dist[query_idx]
        dist_row = dist[query_idx]
        assert _is_sorted(
            ref_dist_row
        ), f"Reference distances aren't sorted on row {query_idx}"
        assert _is_sorted(dist_row), f"Distances aren't sorted on row {query_idx}"
        assert_allclose(ref_dist_row, dist_row, rtol=rtol)
        # Grouping indices by distances using sets on a rounded distances up
        # to a given number of decimals of significant digits derived from rtol:
        # indices that share a rounded reference distance are interchangeable.
        reference_neighbors_groups = defaultdict(set)
        effective_neighbors_groups = defaultdict(set)
        for ref_d, ref_i, eff_i in zip(
            ref_dist_row, ref_indices[query_idx], indices[query_idx]
        ):
            rounded_dist = relative_rounding(
                ref_d, n_significant_digits=n_significant_digits
            )
            reference_neighbors_groups[rounded_dist].add(ref_i)
            effective_neighbors_groups[rounded_dist].add(eff_i)
        # Asserting equality of groups (sets) for each distance
        msg = (
            f"Neighbors indices for query {query_idx} are not matching "
            f"when rounding distances at {n_significant_digits} significant digits "
            f"derived from rtol={rtol:.1e}"
        )
        for rounded_distance in reference_neighbors_groups.keys():
            assert (
                reference_neighbors_groups[rounded_distance]
                == effective_neighbors_groups[rounded_distance]
            ), msg
def assert_radius_neighbors_results_equality(
    ref_dist, dist, ref_indices, indices, radius
):
    """Assert exact equality of radius-neighbors results, one query at a time.

    Results are ragged (arrays of arrays), hence the per-query comparison.
    """
    n_queries = ref_dist.shape[0]
    for i in range(n_queries):
        # Every reference distance must honor the radius threshold.
        assert (ref_dist[i] <= radius).all()
        assert_array_equal(
            ref_indices[i],
            indices[i],
            err_msg=f"Query vector #{i} has different neighbors' indices",
        )
        assert_allclose(
            ref_dist[i],
            dist[i],
            err_msg=f"Query vector #{i} has different neighbors' distances",
            rtol=1e-7,
        )
def assert_radius_neighbors_results_quasi_equality(
    ref_dist,
    dist,
    ref_indices,
    indices,
    radius,
    rtol=1e-4,
):
    """Assert that radius neighborhood results are valid up to:
      - relative tolerance on computed distance values
      - permutations of indices for distances values that differ up to
        a precision level
      - missing or extra last elements if their distance is
        close to the radius

    To be used for testing neighbors queries on float32 datasets: we
    accept neighbors rank swaps only if they are caused by small
    rounding errors on the distance computations.

    Input arrays must be sorted w.r.t distances.
    """
    is_sorted = lambda a: np.all(a[:-1] <= a[1:])
    # Two distances agreeing on this many significant digits are "tied".
    n_significant_digits = -(int(floor(log10(abs(rtol)))) + 1)
    assert (
        len(ref_dist) == len(dist) == len(ref_indices) == len(indices)
    ), "Arrays of results have various lengths."
    n_queries = len(ref_dist)
    # Asserting equality of results one vector at a time
    for query_idx in range(n_queries):
        ref_dist_row = ref_dist[query_idx]
        dist_row = dist[query_idx]
        assert is_sorted(
            ref_dist_row
        ), f"Reference distances aren't sorted on row {query_idx}"
        assert is_sorted(dist_row), f"Distances aren't sorted on row {query_idx}"
        # Vectors' lengths might be different due to small
        # numerical differences of distance w.r.t the `radius` threshold.
        largest_row = ref_dist_row if len(ref_dist_row) > len(dist_row) else dist_row
        # For the longest distances vector, we check that last extra elements
        # that aren't present in the other vector are all in: [radius ± rtol]
        min_length = min(len(ref_dist_row), len(dist_row))
        last_extra_elements = largest_row[min_length:]
        if last_extra_elements.size > 0:
            # Bug fix: the previous chained comparison
            # `radius - rtol <= last_extra_elements <= radius + rtol` calls
            # bool() on an intermediate boolean ndarray and raises
            # "ValueError: The truth value of an array with more than one
            # element is ambiguous" whenever there is more than one extra
            # element. Both bounds must be checked with explicit np.all calls.
            assert np.all(radius - rtol <= last_extra_elements) and np.all(
                last_extra_elements <= radius + rtol
            ), (
                f"The last extra elements ({last_extra_elements}) aren't in [radius ±"
                f" rtol]=[{radius} ± {rtol}]"
            )
        # We truncate the neighbors results list on the smallest length to
        # be able to compare them, ignoring the elements checked above.
        ref_dist_row = ref_dist_row[:min_length]
        dist_row = dist_row[:min_length]
        assert_allclose(ref_dist_row, dist_row, rtol=rtol)
        ref_indices_row = ref_indices[query_idx]
        indices_row = indices[query_idx]
        # Grouping indices by distances using sets on a rounded distances up
        # to a given number of significant digits derived from rtol.
        reference_neighbors_groups = defaultdict(set)
        effective_neighbors_groups = defaultdict(set)
        for neighbor_rank in range(min_length):
            rounded_dist = relative_rounding(
                ref_dist_row[neighbor_rank],
                n_significant_digits=n_significant_digits,
            )
            reference_neighbors_groups[rounded_dist].add(ref_indices_row[neighbor_rank])
            effective_neighbors_groups[rounded_dist].add(indices_row[neighbor_rank])
        # Asserting equality of groups (sets) for each distance
        msg = (
            f"Neighbors indices for query {query_idx} are not matching "
            f"when rounding distances at {n_significant_digits} significant digits "
            f"derived from rtol={rtol:.1e}"
        )
        for rounded_distance in reference_neighbors_groups.keys():
            assert (
                reference_neighbors_groups[rounded_distance]
                == effective_neighbors_groups[rounded_distance]
            ), msg
# Mapping (Dispatcher class, dtype) -> assertion helper used by the tests
# below to compare a pair of (distances, indices) results.
ASSERT_RESULT = {
    # In the case of 64bit, we test for exact equality of the results rankings
    # and standard tolerance levels for the computed distance values.
    #
    # XXX: Note that in the future we might be interested in using quasi equality
    # checks also for float64 data (with a larger number of significant digits)
    # as the tests could be unstable because of numerically tied distances on
    # some datasets (e.g. uniform grids).
    (ArgKmin, np.float64): assert_argkmin_results_equality,
    (
        RadiusNeighbors,
        np.float64,
    ): assert_radius_neighbors_results_equality,
    # In the case of 32bit, indices can be permuted due to small difference
    # in the computations of their associated distances, hence we test equality of
    # results up to valid permutations.
    (ArgKmin, np.float32): assert_argkmin_results_quasi_equality,
    (
        RadiusNeighbors,
        np.float32,
    ): assert_radius_neighbors_results_quasi_equality,
}
def test_assert_argkmin_results_quasi_equality():
    """Meta-test: check that assert_argkmin_results_quasi_equality accepts
    valid permutations of quasi-tied neighbors and rejects invalid ones."""
    rtol = 1e-7
    eps = 1e-7
    # Values one eps below/above 1.0 and 6.1, used to build quasi-ties.
    _1m = 1.0 - eps
    _1p = 1.0 + eps
    _6_1m = 6.1 - eps
    _6_1p = 6.1 + eps
    ref_dist = np.array(
        [
            [1.2, 2.5, _6_1m, 6.1, _6_1p],
            [_1m, _1m, 1, _1p, _1p],
        ]
    )
    ref_indices = np.array(
        [
            [1, 2, 3, 4, 5],
            [6, 7, 8, 9, 10],
        ]
    )
    # Sanity check: compare the reference results to themselves.
    assert_argkmin_results_quasi_equality(
        ref_dist, ref_dist, ref_indices, ref_indices, rtol
    )
    # Apply valid permutation on indices: the last 3 points are
    # all very close to one another so we accept any permutation
    # on their rankings.
    assert_argkmin_results_quasi_equality(
        np.array([[1.2, 2.5, _6_1m, 6.1, _6_1p]]),
        np.array([[1.2, 2.5, 6.1, 6.1, 6.1]]),
        np.array([[1, 2, 3, 4, 5]]),
        np.array([[1, 2, 4, 5, 3]]),
        rtol=rtol,
    )
    # All points are have close distances so any ranking permutation
    # is valid for this query result.
    assert_argkmin_results_quasi_equality(
        np.array([[_1m, _1m, 1, _1p, _1p]]),
        np.array([[_1m, _1m, 1, _1p, _1p]]),
        np.array([[6, 7, 8, 9, 10]]),
        np.array([[6, 9, 7, 8, 10]]),
        rtol=rtol,
    )
    # Apply invalid permutation on indices: permuting the ranks
    # of the 2 nearest neighbors is invalid because the distance
    # values are too different.
    msg = "Neighbors indices for query 0 are not matching"
    with pytest.raises(AssertionError, match=msg):
        assert_argkmin_results_quasi_equality(
            np.array([[1.2, 2.5, _6_1m, 6.1, _6_1p]]),
            np.array([[1.2, 2.5, _6_1m, 6.1, _6_1p]]),
            np.array([[1, 2, 3, 4, 5]]),
            np.array([[2, 1, 3, 4, 5]]),
            rtol=rtol,
        )
    # Indices aren't properly sorted w.r.t their distances
    msg = "Neighbors indices for query 0 are not matching"
    with pytest.raises(AssertionError, match=msg):
        assert_argkmin_results_quasi_equality(
            np.array([[1.2, 2.5, _6_1m, 6.1, _6_1p]]),
            np.array([[1.2, 2.5, _6_1m, 6.1, _6_1p]]),
            np.array([[1, 2, 3, 4, 5]]),
            np.array([[2, 1, 4, 5, 3]]),
            rtol=rtol,
        )
    # Distances aren't properly sorted
    msg = "Distances aren't sorted on row 0"
    with pytest.raises(AssertionError, match=msg):
        assert_argkmin_results_quasi_equality(
            np.array([[1.2, 2.5, _6_1m, 6.1, _6_1p]]),
            np.array([[2.5, 1.2, _6_1m, 6.1, _6_1p]]),
            np.array([[1, 2, 3, 4, 5]]),
            np.array([[2, 1, 4, 5, 3]]),
            rtol=rtol,
        )
def test_assert_radius_neighbors_results_quasi_equality():
    """Meta-test: check that assert_radius_neighbors_results_quasi_equality
    accepts valid permutations / near-radius truncations and rejects the
    invalid cases."""
    rtol = 1e-7
    eps = 1e-7
    # Values one eps below/above 1.0 and 6.1, used to build quasi-ties.
    _1m = 1.0 - eps
    _1p = 1.0 + eps
    _6_1m = 6.1 - eps
    _6_1p = 6.1 + eps
    # Ragged reference results: one array per query vector.
    ref_dist = [
        np.array([1.2, 2.5, _6_1m, 6.1, _6_1p]),
        np.array([_1m, 1, _1p, _1p]),
    ]
    ref_indices = [
        np.array([1, 2, 3, 4, 5]),
        np.array([6, 7, 8, 9]),
    ]
    # Sanity check: compare the reference results to themselves.
    assert_radius_neighbors_results_quasi_equality(
        ref_dist,
        ref_dist,
        ref_indices,
        ref_indices,
        radius=6.1,
        rtol=rtol,
    )
    # Apply valid permutation on indices
    assert_radius_neighbors_results_quasi_equality(
        np.array([np.array([1.2, 2.5, _6_1m, 6.1, _6_1p])]),
        np.array([np.array([1.2, 2.5, _6_1m, 6.1, _6_1p])]),
        np.array([np.array([1, 2, 3, 4, 5])]),
        np.array([np.array([1, 2, 4, 5, 3])]),
        radius=6.1,
        rtol=rtol,
    )
    assert_radius_neighbors_results_quasi_equality(
        np.array([np.array([_1m, _1m, 1, _1p, _1p])]),
        np.array([np.array([_1m, _1m, 1, _1p, _1p])]),
        np.array([np.array([6, 7, 8, 9, 10])]),
        np.array([np.array([6, 9, 7, 8, 10])]),
        radius=6.1,
        rtol=rtol,
    )
    # Apply invalid permutation on indices
    msg = "Neighbors indices for query 0 are not matching"
    with pytest.raises(AssertionError, match=msg):
        assert_radius_neighbors_results_quasi_equality(
            np.array([np.array([1.2, 2.5, _6_1m, 6.1, _6_1p])]),
            np.array([np.array([1.2, 2.5, _6_1m, 6.1, _6_1p])]),
            np.array([np.array([1, 2, 3, 4, 5])]),
            np.array([np.array([2, 1, 3, 4, 5])]),
            radius=6.1,
            rtol=rtol,
        )
    # Having extra last elements is valid if they are in: [radius ± rtol]
    assert_radius_neighbors_results_quasi_equality(
        np.array([np.array([1.2, 2.5, _6_1m, 6.1, _6_1p])]),
        np.array([np.array([1.2, 2.5, _6_1m, 6.1])]),
        np.array([np.array([1, 2, 3, 4, 5])]),
        np.array([np.array([1, 2, 3, 4])]),
        radius=6.1,
        rtol=rtol,
    )
    # Having extra last elements is invalid if they are lesser than radius - rtol
    msg = re.escape(
        "The last extra elements ([6.]) aren't in [radius ± rtol]=[6.1 ± 1e-07]"
    )
    with pytest.raises(AssertionError, match=msg):
        assert_radius_neighbors_results_quasi_equality(
            np.array([np.array([1.2, 2.5, 6])]),
            np.array([np.array([1.2, 2.5])]),
            np.array([np.array([1, 2, 3])]),
            np.array([np.array([1, 2])]),
            radius=6.1,
            rtol=rtol,
        )
    # Indices aren't properly sorted w.r.t their distances
    msg = "Neighbors indices for query 0 are not matching"
    with pytest.raises(AssertionError, match=msg):
        assert_radius_neighbors_results_quasi_equality(
            np.array([np.array([1.2, 2.5, _6_1m, 6.1, _6_1p])]),
            np.array([np.array([1.2, 2.5, _6_1m, 6.1, _6_1p])]),
            np.array([np.array([1, 2, 3, 4, 5])]),
            np.array([np.array([2, 1, 4, 5, 3])]),
            radius=6.1,
            rtol=rtol,
        )
    # Distances aren't properly sorted
    msg = "Distances aren't sorted on row 0"
    with pytest.raises(AssertionError, match=msg):
        assert_radius_neighbors_results_quasi_equality(
            np.array([np.array([1.2, 2.5, _6_1m, 6.1, _6_1p])]),
            np.array([np.array([2.5, 1.2, _6_1m, 6.1, _6_1p])]),
            np.array([np.array([1, 2, 3, 4, 5])]),
            np.array([np.array([2, 1, 4, 5, 3])]),
            radius=6.1,
            rtol=rtol,
        )
def test_pairwise_distances_reduction_is_usable_for():
    """Check the is_usable_for predicate over dtypes, layouts and formats."""
    rng = np.random.RandomState(0)
    X = rng.rand(100, 10)
    Y = rng.rand(100, 10)
    X_csr = csr_matrix(X)
    Y_csr = csr_matrix(Y)
    metric = "manhattan"
    # Must be usable for all possible pair of {dense, sparse} datasets
    assert BaseDistancesReductionDispatcher.is_usable_for(X, Y, metric)
    assert BaseDistancesReductionDispatcher.is_usable_for(X_csr, Y_csr, metric)
    assert BaseDistancesReductionDispatcher.is_usable_for(X_csr, Y, metric)
    assert BaseDistancesReductionDispatcher.is_usable_for(X, Y_csr, metric)
    assert BaseDistancesReductionDispatcher.is_usable_for(
        X.astype(np.float64), Y.astype(np.float64), metric
    )
    assert BaseDistancesReductionDispatcher.is_usable_for(
        X.astype(np.float32), Y.astype(np.float32), metric
    )
    # Integer dtypes, callable metrics and mixed-dtype pairs are rejected.
    assert not BaseDistancesReductionDispatcher.is_usable_for(
        X.astype(np.int64), Y.astype(np.int64), metric
    )
    assert not BaseDistancesReductionDispatcher.is_usable_for(X, Y, metric="pyfunc")
    assert not BaseDistancesReductionDispatcher.is_usable_for(
        X.astype(np.float32), Y, metric
    )
    assert not BaseDistancesReductionDispatcher.is_usable_for(
        X, Y.astype(np.int32), metric
    )
    # F-ordered arrays are not supported
    assert not BaseDistancesReductionDispatcher.is_usable_for(
        np.asfortranarray(X), Y, metric
    )
    # We prefer not to use those implementations for fused sparse-dense when
    # metric="(sq)euclidean" because it's not yet the most efficient one on
    # all configurations of datasets.
    # See: https://github.com/scikit-learn/scikit-learn/pull/23585#issuecomment-1247996669 # noqa
    # TODO: implement specialisation for (sq)euclidean on fused sparse-dense
    # using sparse-dense routines for matrix-vector multiplications.
    assert not BaseDistancesReductionDispatcher.is_usable_for(
        X_csr, Y, metric="euclidean"
    )
    assert BaseDistancesReductionDispatcher.is_usable_for(
        X_csr, Y_csr, metric="sqeuclidean"
    )
    assert BaseDistancesReductionDispatcher.is_usable_for(
        X_csr, Y_csr, metric="euclidean"
    )
    # CSR matrices without non-zeros elements aren't currently supported
    # TODO: support CSR matrices without non-zeros elements
    X_csr_0_nnz = csr_matrix(X * 0)
    assert not BaseDistancesReductionDispatcher.is_usable_for(X_csr_0_nnz, Y, metric)
    # CSR matrices with int64 indices and indptr (e.g. large nnz, or large n_features)
    # aren't supported as of now.
    # See: https://github.com/scikit-learn/scikit-learn/issues/23653
    # TODO: support CSR matrices with int64 indices and indptr
    X_csr_int64 = csr_matrix(X)
    X_csr_int64.indices = X_csr_int64.indices.astype(np.int64)
    assert not BaseDistancesReductionDispatcher.is_usable_for(X_csr_int64, Y, metric)
def test_argkmin_factory_method_wrong_usages():
    """Check input validation errors and warnings raised by ArgKmin.compute."""
    rng = np.random.RandomState(1)
    X = rng.rand(100, 10)
    Y = rng.rand(100, 10)
    k = 5
    metric = "euclidean"
    # Mixed dtype pairs are rejected with an explicit message.
    msg = (
        "Only float64 or float32 datasets pairs are supported at this time, "
        "got: X.dtype=float32 and Y.dtype=float64"
    )
    with pytest.raises(ValueError, match=msg):
        ArgKmin.compute(X=X.astype(np.float32), Y=Y, k=k, metric=metric)
    msg = (
        "Only float64 or float32 datasets pairs are supported at this time, "
        "got: X.dtype=float64 and Y.dtype=int32"
    )
    with pytest.raises(ValueError, match=msg):
        ArgKmin.compute(X=X, Y=Y.astype(np.int32), k=k, metric=metric)
    # k must be a strictly positive integer.
    with pytest.raises(ValueError, match="k == -1, must be >= 1."):
        ArgKmin.compute(X=X, Y=Y, k=-1, metric=metric)
    with pytest.raises(ValueError, match="k == 0, must be >= 1."):
        ArgKmin.compute(X=X, Y=Y, k=0, metric=metric)
    with pytest.raises(ValueError, match="Unrecognized metric"):
        ArgKmin.compute(X=X, Y=Y, k=k, metric="wrong metric")
    # Inputs must be 2D and C-contiguous.
    with pytest.raises(
        ValueError, match=r"Buffer has wrong number of dimensions \(expected 2, got 1\)"
    ):
        ArgKmin.compute(X=np.array([1.0, 2.0]), Y=Y, k=k, metric=metric)
    with pytest.raises(ValueError, match="ndarray is not C-contiguous"):
        ArgKmin.compute(X=np.asfortranarray(X), Y=Y, k=k, metric=metric)
    # A UserWarning must be raised in this case.
    unused_metric_kwargs = {"p": 3}
    message = r"Some metric_kwargs have been passed \({'p': 3}\) but"
    with pytest.warns(UserWarning, match=message):
        ArgKmin.compute(
            X=X, Y=Y, k=k, metric=metric, metric_kwargs=unused_metric_kwargs
        )
    # A UserWarning must be raised in this case.
    metric_kwargs = {
        "p": 3,  # unused
        "Y_norm_squared": sqeuclidean_row_norms(Y, num_threads=2),
    }
    message = r"Some metric_kwargs have been passed \({'p': 3, 'Y_norm_squared'"
    with pytest.warns(UserWarning, match=message):
        ArgKmin.compute(X=X, Y=Y, k=k, metric=metric, metric_kwargs=metric_kwargs)
    # No user warning must be raised in this case.
    metric_kwargs = {
        "X_norm_squared": sqeuclidean_row_norms(X, num_threads=2),
    }
    with warnings.catch_warnings():
        # Turn any UserWarning into a test failure.
        warnings.simplefilter("error", category=UserWarning)
        ArgKmin.compute(X=X, Y=Y, k=k, metric=metric, metric_kwargs=metric_kwargs)
    # No user warning must be raised in this case.
    metric_kwargs = {
        "X_norm_squared": sqeuclidean_row_norms(X, num_threads=2),
        "Y_norm_squared": sqeuclidean_row_norms(Y, num_threads=2),
    }
    with warnings.catch_warnings():
        warnings.simplefilter("error", category=UserWarning)
        ArgKmin.compute(X=X, Y=Y, k=k, metric=metric, metric_kwargs=metric_kwargs)
def test_radius_neighbors_factory_method_wrong_usages():
    """Check input validation errors and warnings raised by
    RadiusNeighbors.compute."""
    rng = np.random.RandomState(1)
    X = rng.rand(100, 10)
    Y = rng.rand(100, 10)
    radius = 5
    metric = "euclidean"
    # Mixed dtype pairs are rejected with an explicit message.
    msg = (
        "Only float64 or float32 datasets pairs are supported at this time, "
        "got: X.dtype=float32 and Y.dtype=float64"
    )
    with pytest.raises(
        ValueError,
        match=msg,
    ):
        RadiusNeighbors.compute(
            X=X.astype(np.float32), Y=Y, radius=radius, metric=metric
        )
    msg = (
        "Only float64 or float32 datasets pairs are supported at this time, "
        "got: X.dtype=float64 and Y.dtype=int32"
    )
    with pytest.raises(
        ValueError,
        match=msg,
    ):
        RadiusNeighbors.compute(X=X, Y=Y.astype(np.int32), radius=radius, metric=metric)
    # The radius must be non-negative and the metric recognized.
    with pytest.raises(ValueError, match="radius == -1.0, must be >= 0."):
        RadiusNeighbors.compute(X=X, Y=Y, radius=-1, metric=metric)
    with pytest.raises(ValueError, match="Unrecognized metric"):
        RadiusNeighbors.compute(X=X, Y=Y, radius=radius, metric="wrong metric")
    # Inputs must be 2D and C-contiguous.
    with pytest.raises(
        ValueError, match=r"Buffer has wrong number of dimensions \(expected 2, got 1\)"
    ):
        RadiusNeighbors.compute(
            X=np.array([1.0, 2.0]), Y=Y, radius=radius, metric=metric
        )
    with pytest.raises(ValueError, match="ndarray is not C-contiguous"):
        RadiusNeighbors.compute(
            X=np.asfortranarray(X), Y=Y, radius=radius, metric=metric
        )
    unused_metric_kwargs = {"p": 3}
    # A UserWarning must be raised in this case.
    message = r"Some metric_kwargs have been passed \({'p': 3}\) but"
    with pytest.warns(UserWarning, match=message):
        RadiusNeighbors.compute(
            X=X, Y=Y, radius=radius, metric=metric, metric_kwargs=unused_metric_kwargs
        )
    # A UserWarning must be raised in this case.
    metric_kwargs = {
        "p": 3,  # unused
        "Y_norm_squared": sqeuclidean_row_norms(Y, num_threads=2),
    }
    message = r"Some metric_kwargs have been passed \({'p': 3, 'Y_norm_squared'"
    with pytest.warns(UserWarning, match=message):
        RadiusNeighbors.compute(
            X=X, Y=Y, radius=radius, metric=metric, metric_kwargs=metric_kwargs
        )
    # No user warning must be raised in this case.
    metric_kwargs = {
        "X_norm_squared": sqeuclidean_row_norms(X, num_threads=2),
        "Y_norm_squared": sqeuclidean_row_norms(Y, num_threads=2),
    }
    with warnings.catch_warnings():
        # Turn any UserWarning into a test failure.
        warnings.simplefilter("error", category=UserWarning)
        RadiusNeighbors.compute(
            X=X, Y=Y, radius=radius, metric=metric, metric_kwargs=metric_kwargs
        )
    # No user warning must be raised in this case.
    metric_kwargs = {
        "X_norm_squared": sqeuclidean_row_norms(X, num_threads=2),
    }
    with warnings.catch_warnings():
        warnings.simplefilter("error", category=UserWarning)
        RadiusNeighbors.compute(
            X=X, Y=Y, radius=radius, metric=metric, metric_kwargs=metric_kwargs
        )
@pytest.mark.parametrize(
    "n_samples_X, n_samples_Y", [(100, 100), (500, 100), (100, 500)]
)
@pytest.mark.parametrize("Dispatcher", [ArgKmin, RadiusNeighbors])
@pytest.mark.parametrize("dtype", [np.float64, np.float32])
def test_chunk_size_agnosticism(
    global_random_seed,
    Dispatcher,
    n_samples_X,
    n_samples_Y,
    dtype,
    n_features=100,
):
    """Check that results do not depend on the chunk size."""
    rng = np.random.RandomState(global_random_seed)
    spread = 100
    X = rng.rand(n_samples_X, n_features).astype(dtype) * spread
    Y = rng.rand(n_samples_Y, n_features).astype(dtype) * spread
    if Dispatcher is ArgKmin:
        parameter = 10
        check_parameters = {}
        compute_parameters = {}
    else:
        # Scaling the radius slightly with the numbers of dimensions
        radius = 10 ** np.log(n_features)
        parameter = radius
        check_parameters = {"radius": radius}
        compute_parameters = {"sort_results": True}
    # Reference computation with the default chunk size.
    ref_dist, ref_indices = Dispatcher.compute(
        X,
        Y,
        parameter,
        chunk_size=256,  # default
        metric="manhattan",
        return_distance=True,
        **compute_parameters,
    )
    # Same computation with an unusual, small chunk size.
    dist, indices = Dispatcher.compute(
        X,
        Y,
        parameter,
        chunk_size=41,
        metric="manhattan",
        return_distance=True,
        **compute_parameters,
    )
    # Exact equality for float64, quasi-equality for float32.
    ASSERT_RESULT[(Dispatcher, dtype)](
        ref_dist, dist, ref_indices, indices, **check_parameters
    )
@pytest.mark.parametrize(
    "n_samples_X, n_samples_Y", [(100, 100), (500, 100), (100, 500)]
)
@pytest.mark.parametrize("Dispatcher", [ArgKmin, RadiusNeighbors])
@pytest.mark.parametrize("dtype", [np.float64, np.float32])
def test_n_threads_agnosticism(
    global_random_seed,
    Dispatcher,
    n_samples_X,
    n_samples_Y,
    dtype,
    n_features=100,
):
    """Check that results do not depend on the number of threads."""
    rng = np.random.RandomState(global_random_seed)
    spread = 100
    X = rng.rand(n_samples_X, n_features).astype(dtype) * spread
    Y = rng.rand(n_samples_Y, n_features).astype(dtype) * spread
    if Dispatcher is ArgKmin:
        parameter = 10
        check_parameters = {}
        compute_parameters = {}
    else:
        # Scaling the radius slightly with the numbers of dimensions
        radius = 10 ** np.log(n_features)
        parameter = radius
        check_parameters = {"radius": radius}
        compute_parameters = {"sort_results": True}
    # Reference computation, potentially multi-threaded.
    ref_dist, ref_indices = Dispatcher.compute(
        X,
        Y,
        parameter,
        chunk_size=25,  # make sure we use multiple threads
        return_distance=True,
        **compute_parameters,
    )
    # Same computation constrained to a single OpenMP thread.
    with threadpoolctl.threadpool_limits(limits=1, user_api="openmp"):
        dist, indices = Dispatcher.compute(
            X,
            Y,
            parameter,
            chunk_size=25,
            return_distance=True,
            **compute_parameters,
        )
    # Exact equality for float64, quasi-equality for float32.
    ASSERT_RESULT[(Dispatcher, dtype)](
        ref_dist, dist, ref_indices, indices, **check_parameters
    )
@pytest.mark.parametrize(
    "Dispatcher, dtype",
    [
        (ArgKmin, np.float64),
        (RadiusNeighbors, np.float32),
        (ArgKmin, np.float32),
        (RadiusNeighbors, np.float64),
    ],
)
def test_format_agnosticism(
    global_random_seed,
    Dispatcher,
    dtype,
):
    """Check that results do not depend on the format (dense, sparse) of the input."""
    rng = np.random.RandomState(global_random_seed)
    spread = 100
    n_samples, n_features = 100, 100
    X = rng.rand(n_samples, n_features).astype(dtype) * spread
    Y = rng.rand(n_samples, n_features).astype(dtype) * spread
    X_csr = csr_matrix(X)
    Y_csr = csr_matrix(Y)
    if Dispatcher is ArgKmin:
        parameter = 10
        check_parameters = {}
        compute_parameters = {}
    else:
        # Scaling the radius slightly with the numbers of dimensions
        radius = 10 ** np.log(n_features)
        parameter = radius
        check_parameters = {"radius": radius}
        compute_parameters = {"sort_results": True}
    # Dense-dense results serve as the reference.
    dist_dense, indices_dense = Dispatcher.compute(
        X,
        Y,
        parameter,
        chunk_size=50,
        return_distance=True,
        **compute_parameters,
    )
    # Compare every other {dense, sparse} combination to the reference.
    for _X, _Y in itertools.product((X, X_csr), (Y, Y_csr)):
        if _X is X and _Y is Y:
            # Skip the dense-dense reference combination itself.
            continue
        dist, indices = Dispatcher.compute(
            _X,
            _Y,
            parameter,
            chunk_size=50,
            return_distance=True,
            **compute_parameters,
        )
        ASSERT_RESULT[(Dispatcher, dtype)](
            dist_dense,
            dist,
            indices_dense,
            indices,
            **check_parameters,
        )
@pytest.mark.parametrize(
    "n_samples_X, n_samples_Y", [(100, 100), (100, 500), (500, 100)]
)
@pytest.mark.parametrize(
    "metric",
    ["euclidean", "minkowski", "manhattan", "infinity", "seuclidean", "haversine"],
)
@pytest.mark.parametrize("Dispatcher", [ArgKmin, RadiusNeighbors])
@pytest.mark.parametrize("dtype", [np.float64, np.float32])
def test_strategies_consistency(
    global_random_seed,
    Dispatcher,
    metric,
    n_samples_X,
    n_samples_Y,
    dtype,
    n_features=10,
):
    """Check that the results do not depend on the strategy used."""
    rng = np.random.RandomState(global_random_seed)
    spread = 100
    X = rng.rand(n_samples_X, n_features).astype(dtype) * spread
    Y = rng.rand(n_samples_Y, n_features).astype(dtype) * spread
    # Haversine distance only accepts 2D data
    if metric == "haversine":
        X = np.ascontiguousarray(X[:, :2])
        Y = np.ascontiguousarray(Y[:, :2])
    if Dispatcher is ArgKmin:
        parameter = 10
        check_parameters = {}
        compute_parameters = {}
    else:
        # Scaling the radius slightly with the numbers of dimensions
        radius = 10 ** np.log(n_features)
        parameter = radius
        check_parameters = {"radius": radius}
        compute_parameters = {"sort_results": True}
    # Same computation with work parallelized over X chunks...
    dist_par_X, indices_par_X = Dispatcher.compute(
        X,
        Y,
        parameter,
        metric=metric,
        # Taking the first
        metric_kwargs=_get_metric_params_list(
            metric, n_features, seed=global_random_seed
        )[0],
        # To be sure to use parallelization
        chunk_size=n_samples_X // 4,
        strategy="parallel_on_X",
        return_distance=True,
        **compute_parameters,
    )
    # ... and over Y chunks.
    dist_par_Y, indices_par_Y = Dispatcher.compute(
        X,
        Y,
        parameter,
        metric=metric,
        # Taking the first
        metric_kwargs=_get_metric_params_list(
            metric, n_features, seed=global_random_seed
        )[0],
        # To be sure to use parallelization
        chunk_size=n_samples_Y // 4,
        strategy="parallel_on_Y",
        return_distance=True,
        **compute_parameters,
    )
    # Exact equality for float64, quasi-equality for float32.
    ASSERT_RESULT[(Dispatcher, dtype)](
        dist_par_X, dist_par_Y, indices_par_X, indices_par_Y, **check_parameters
    )
# "Concrete Dispatchers"-specific tests
# TODO: Remove filterwarnings in 1.3 when wminkowski is removed
@pytest.mark.filterwarnings("ignore:WMinkowskiDistance:FutureWarning:sklearn")
@pytest.mark.parametrize("n_features", [50, 500])
@pytest.mark.parametrize("translation", [0, 1e6])
@pytest.mark.parametrize("metric", CDIST_PAIRWISE_DISTANCES_REDUCTION_COMMON_METRICS)
@pytest.mark.parametrize("strategy", ("parallel_on_X", "parallel_on_Y"))
@pytest.mark.parametrize("dtype", [np.float64, np.float32])
def test_pairwise_distances_argkmin(
global_random_seed,
n_features,
translation,
metric,
strategy,
dtype,
n_samples=100,
k=10,
):
# TODO: can we easily fix this discrepancy?
edge_cases = [
(np.float32, "chebyshev", 1000000.0),
(np.float32, "cityblock", 1000000.0),
]
if (dtype, metric, translation) in edge_cases:
pytest.xfail("Numerical differences lead to small differences in results.")
rng = np.random.RandomState(global_random_seed)
spread = 1000
X = translation + rng.rand(n_samples, n_features).astype(dtype) * spread
Y = translation + rng.rand(n_samples, n_features).astype(dtype) * spread
X_csr = csr_matrix(X)
Y_csr = csr_matrix(Y)
# Haversine distance only accepts 2D data
if metric == "haversine":
X = np.ascontiguousarray(X[:, :2])
Y = np.ascontiguousarray(Y[:, :2])
metric_kwargs = _get_metric_params_list(metric, n_features)[0]
# Reference for argkmin results
if metric == "euclidean":
# Compare to scikit-learn GEMM optimized implementation
dist_matrix = euclidean_distances(X, Y)
else:
dist_matrix = cdist(X, Y, metric=metric, **metric_kwargs)
# Taking argkmin (indices of the k smallest values)
argkmin_indices_ref = np.argsort(dist_matrix, axis=1)[:, :k]
# Getting the associated distances
argkmin_distances_ref = np.zeros(argkmin_indices_ref.shape, dtype=np.float64)
for row_idx in range(argkmin_indices_ref.shape[0]):
argkmin_distances_ref[row_idx] = dist_matrix[
row_idx, argkmin_indices_ref[row_idx]
]
for _X, _Y in [(X, Y), (X_csr, Y_csr)]:
argkmin_distances, argkmin_indices = ArgKmin.compute(
_X,
_Y,
k,
metric=metric,
metric_kwargs=metric_kwargs,
return_distance=True,
# So as to have more than a chunk, forcing parallelism.
chunk_size=n_samples // 4,
strategy=strategy,
)
ASSERT_RESULT[(ArgKmin, dtype)](
argkmin_distances,
argkmin_distances_ref,
argkmin_indices,
argkmin_indices_ref,
)
# TODO: Remove filterwarnings in 1.3 when wminkowski is removed
@pytest.mark.filterwarnings("ignore:WMinkowskiDistance:FutureWarning:sklearn")
@pytest.mark.parametrize("n_features", [50, 500])
@pytest.mark.parametrize("translation", [0, 1e6])
@pytest.mark.parametrize("metric", CDIST_PAIRWISE_DISTANCES_REDUCTION_COMMON_METRICS)
@pytest.mark.parametrize("strategy", ("parallel_on_X", "parallel_on_Y"))
@pytest.mark.parametrize("dtype", [np.float64, np.float32])
def test_pairwise_distances_radius_neighbors(
    global_random_seed,
    n_features,
    translation,
    metric,
    strategy,
    dtype,
    n_samples=100,
):
    """Check RadiusNeighbors results against a dense cdist-based reference."""
    rng = np.random.RandomState(global_random_seed)
    spread = 1000
    radius = spread * np.log(n_features)
    X = translation + rng.rand(n_samples, n_features).astype(dtype) * spread
    Y = translation + rng.rand(n_samples, n_features).astype(dtype) * spread
    metric_kwargs = _get_metric_params_list(
        metric, n_features, seed=global_random_seed
    )[0]
    # Reference for argkmin results
    if metric == "euclidean":
        # Compare to scikit-learn GEMM optimized implementation
        dist_matrix = euclidean_distances(X, Y)
    else:
        dist_matrix = cdist(X, Y, metric=metric, **metric_kwargs)
    # Getting the neighbors for a given radius
    neigh_indices_ref = []
    neigh_distances_ref = []
    for row in dist_matrix:
        # Select in-radius neighbors and sort them by distance.
        ind = np.arange(row.shape[0])[row <= radius]
        dist = row[ind]
        sort = np.argsort(dist)
        ind, dist = ind[sort], dist[sort]
        neigh_indices_ref.append(ind)
        neigh_distances_ref.append(dist)
    neigh_distances, neigh_indices = RadiusNeighbors.compute(
        X,
        Y,
        radius,
        metric=metric,
        metric_kwargs=metric_kwargs,
        return_distance=True,
        # So as to have more than a chunk, forcing parallelism.
        chunk_size=n_samples // 4,
        strategy=strategy,
        sort_results=True,
    )
    ASSERT_RESULT[(RadiusNeighbors, dtype)](
        neigh_distances, neigh_distances_ref, neigh_indices, neigh_indices_ref, radius
    )
@pytest.mark.parametrize("Dispatcher", [ArgKmin, RadiusNeighbors])
@pytest.mark.parametrize("metric", ["manhattan", "euclidean"])
@pytest.mark.parametrize("dtype", [np.float64, np.float32])
def test_memmap_backed_data(
    metric,
    Dispatcher,
    dtype,
):
    """Check that the results do not depend on the datasets writability."""
    rng = np.random.RandomState(0)
    spread = 100
    n_samples, n_features = 128, 10
    X = rng.rand(n_samples, n_features).astype(dtype) * spread
    Y = rng.rand(n_samples, n_features).astype(dtype) * spread
    # Create read only datasets
    X_mm, Y_mm = create_memmap_backed_data([X, Y])
    if Dispatcher is ArgKmin:
        parameter = 10
        check_parameters = {}
        compute_parameters = {}
    else:
        # Scaling the radius slightly with the numbers of dimensions
        radius = 10 ** np.log(n_features)
        parameter = radius
        check_parameters = {"radius": radius}
        compute_parameters = {"sort_results": True}
    # Reference computation on writable in-memory arrays.
    ref_dist, ref_indices = Dispatcher.compute(
        X,
        Y,
        parameter,
        metric=metric,
        return_distance=True,
        **compute_parameters,
    )
    # Same computation on the read-only memmap-backed copies.
    dist_mm, indices_mm = Dispatcher.compute(
        X_mm,
        Y_mm,
        parameter,
        metric=metric,
        return_distance=True,
        **compute_parameters,
    )
    # Exact equality for float64, quasi-equality for float32.
    ASSERT_RESULT[(Dispatcher, dtype)](
        ref_dist, dist_mm, ref_indices, indices_mm, **check_parameters
    )
@pytest.mark.parametrize("n_samples", [100, 1000])
@pytest.mark.parametrize("n_features", [5, 10, 100])
@pytest.mark.parametrize("num_threads", [1, 2, 8])
@pytest.mark.parametrize("dtype", [np.float64, np.float32])
def test_sqeuclidean_row_norms(
    global_random_seed,
    n_samples,
    n_features,
    num_threads,
    dtype,
):
    """Check sqeuclidean_row_norms on dense and CSR inputs against numpy,
    and that F-ordered input is rejected."""
    rng = np.random.RandomState(global_random_seed)
    spread = 100
    X = rng.rand(n_samples, n_features).astype(dtype) * spread
    X_csr = csr_matrix(X)
    # Reference: squared L2 norm of each row computed with numpy.
    sq_row_norm_reference = np.linalg.norm(X, axis=1) ** 2
    sq_row_norm = sqeuclidean_row_norms(X, num_threads=num_threads)
    sq_row_norm_csr = sqeuclidean_row_norms(X_csr, num_threads=num_threads)
    assert_allclose(sq_row_norm_reference, sq_row_norm)
    assert_allclose(sq_row_norm_reference, sq_row_norm_csr)
    # F-ordered arrays are not supported.
    with pytest.raises(ValueError):
        X = np.asfortranarray(X)
        sqeuclidean_row_norms(X, num_threads=num_threads)