1229 lines
40 KiB
Python
1229 lines
40 KiB
Python
|
import itertools
|
||
|
import re
|
||
|
import warnings
|
||
|
from collections import defaultdict
|
||
|
|
||
|
import numpy as np
|
||
|
import pytest
|
||
|
import threadpoolctl
|
||
|
from math import log10, floor
|
||
|
from scipy.sparse import csr_matrix
|
||
|
from scipy.spatial.distance import cdist
|
||
|
|
||
|
from sklearn.metrics._pairwise_distances_reduction import (
|
||
|
BaseDistancesReductionDispatcher,
|
||
|
ArgKmin,
|
||
|
RadiusNeighbors,
|
||
|
sqeuclidean_row_norms,
|
||
|
)
|
||
|
|
||
|
from sklearn.metrics import euclidean_distances
|
||
|
from sklearn.utils.fixes import sp_version, parse_version
|
||
|
from sklearn.utils._testing import (
|
||
|
assert_array_equal,
|
||
|
assert_allclose,
|
||
|
create_memmap_backed_data,
|
||
|
)
|
||
|
|
||
|
# Common supported metric between scipy.spatial.distance.cdist
# and BaseDistanceReductionDispatcher.
# This allows constructing tests to check consistency of results
# of concrete BaseDistanceReductionDispatcher on some metrics using APIs
# from scipy and numpy.
CDIST_PAIRWISE_DISTANCES_REDUCTION_COMMON_METRICS = [
    "braycurtis",
    "canberra",
    "chebyshev",
    "cityblock",
    "euclidean",
    "minkowski",
    "seuclidean",
]
|
||
|
|
||
|
|
||
|
def _get_metric_params_list(metric: str, n_features: int, seed: int = 1):
    """Return list of dummy DistanceMetric kwargs for tests.

    Only the data structures required by the requested metric are generated,
    so unneeded random draws are avoided.
    """
    rng = np.random.RandomState(seed)

    if metric == "minkowski":
        kwargs_list = [dict(p=1.5), dict(p=2), dict(p=3), dict(p=np.inf)]
        if sp_version >= parse_version("1.8.0.dev0"):
            # TODO: remove the test once we no longer support scipy < 1.8.0.
            # Recent scipy versions accept weights in the Minkowski metric directly:
            # type: ignore
            kwargs_list.append(dict(p=3, w=rng.rand(n_features)))
        return kwargs_list

    # TODO: remove this case for "wminkowski" once we no longer support scipy < 1.8.0.
    if metric == "wminkowski":
        normalized_weights = rng.random_sample(n_features)
        normalized_weights /= normalized_weights.sum()
        kwargs_list = [dict(p=1.5, w=normalized_weights)]
        if sp_version < parse_version("1.8.0.dev0"):
            # wminkowski was removed in scipy 1.8.0 but should work for previous
            # versions.
            kwargs_list.append(dict(p=3, w=rng.rand(n_features)))
        return kwargs_list

    if metric == "seuclidean":
        return [dict(V=rng.rand(n_features))]

    # Case of: "euclidean", "manhattan", "chebyshev", "haversine" or any other metric.
    # In those cases, no kwargs is needed.
    return [{}]
|
||
|
|
||
|
|
||
|
def assert_argkmin_results_equality(ref_dist, dist, ref_indices, indices, rtol=1e-7):
    """Check that two argkmin results agree exactly on indices and closely on distances."""
    # Indices must match exactly: any deviation means a different neighbor was found.
    assert_array_equal(
        ref_indices, indices, err_msg="Query vectors have different neighbors' indices"
    )
    # Distance values only need to agree within the relative tolerance.
    assert_allclose(
        ref_dist,
        dist,
        rtol=rtol,
        err_msg="Query vectors have different neighbors' distances",
    )
|
||
|
|
||
|
|
||
|
def relative_rounding(scalar, n_significant_digits):
    """Round a scalar to a number of significant digits relatively to its value."""
    if scalar == 0:
        return 0.0
    # Exponent of the most significant digit: round() then keeps exactly
    # n_significant_digits digits starting from that position.
    exponent = int(floor(log10(abs(scalar))))
    return round(scalar, n_significant_digits - exponent - 1)
|
||
|
|
||
|
|
||
|
def test_relative_rounding():
    """Unit-test the relative_rounding helper on hand-picked values."""
    # Zero is returned unchanged whatever the requested precision.
    for n_digits in (1, 10, 123456):
        assert relative_rounding(0, n_digits) == 0.0

    # Large integers: low precision truncates trailing digits, high
    # precision leaves the value untouched.
    assert relative_rounding(123456789, 0) == 0
    assert relative_rounding(123456789, 2) == 120000000
    assert relative_rounding(123456789, 3) == 123000000
    assert relative_rounding(123456789, 10) == 123456789
    assert relative_rounding(123456789, 20) == 123456789

    # Floats close to 1.
    assert relative_rounding(1.23456789, 2) == 1.2
    assert relative_rounding(1.23456789, 3) == 1.23
    assert relative_rounding(1.23456789, 10) == 1.23456789

    # Floats with an integer part spanning several digits.
    assert relative_rounding(123.456789, 3) == 123.0
    assert relative_rounding(123.456789, 9) == 123.456789
    assert relative_rounding(123.456789, 10) == 123.456789
|
||
|
|
||
|
|
||
|
def assert_argkmin_results_quasi_equality(
    ref_dist,
    dist,
    ref_indices,
    indices,
    rtol=1e-4,
):
    """Assert that argkmin results are valid up to:
      - relative tolerance on computed distance values
      - permutations of indices for distances values that differ up to
        a precision level

    To be used for testing neighbors queries on float32 datasets: we
    accept neighbors rank swaps only if they are caused by small
    rounding errors on the distance computations.
    """

    def _is_nondecreasing(arr):
        return np.all(arr[:-1] <= arr[1:])

    # Number of significant digits used when grouping tied distances,
    # derived from the relative tolerance.
    n_significant_digits = -(int(floor(log10(abs(rtol)))) + 1)

    assert (
        ref_dist.shape == dist.shape == ref_indices.shape == indices.shape
    ), "Arrays of results have various shapes."

    n_queries, n_neighbors = ref_dist.shape

    # Asserting equality results one row at a time
    for query_idx in range(n_queries):
        ref_dist_row = ref_dist[query_idx]
        dist_row = dist[query_idx]

        assert _is_nondecreasing(
            ref_dist_row
        ), f"Reference distances aren't sorted on row {query_idx}"
        assert _is_nondecreasing(
            dist_row
        ), f"Distances aren't sorted on row {query_idx}"

        assert_allclose(ref_dist_row, dist_row, rtol=rtol)

        # Group neighbor indices by their rounded distance: neighbors whose
        # distances agree up to n_significant_digits may be permuted freely.
        expected_groups = defaultdict(set)
        actual_groups = defaultdict(set)

        for rank in range(n_neighbors):
            group_key = relative_rounding(
                ref_dist_row[rank],
                n_significant_digits=n_significant_digits,
            )
            expected_groups[group_key].add(ref_indices[query_idx][rank])
            actual_groups[group_key].add(indices[query_idx][rank])

        # Each group of quasi-tied distances must contain the same indices.
        msg = (
            f"Neighbors indices for query {query_idx} are not matching "
            f"when rounding distances at {n_significant_digits} significant digits "
            f"derived from rtol={rtol:.1e}"
        )
        for group_key in expected_groups.keys():
            assert expected_groups[group_key] == actual_groups[group_key], msg
|
||
|
|
||
|
|
||
|
def assert_radius_neighbors_results_equality(
    ref_dist, dist, ref_indices, indices, radius
):
    """Check exact agreement of radius-neighbors results, one query at a time.

    Results are ragged (one variable-length array per query vector), so each
    pair of rows is compared individually.
    """
    for query_idx in range(ref_dist.shape[0]):
        # Every reference distance must lie within the query radius.
        assert (ref_dist[query_idx] <= radius).all()
        assert_array_equal(
            ref_indices[query_idx],
            indices[query_idx],
            err_msg=f"Query vector #{query_idx} has different neighbors' indices",
        )
        assert_allclose(
            ref_dist[query_idx],
            dist[query_idx],
            rtol=1e-7,
            err_msg=f"Query vector #{query_idx} has different neighbors' distances",
        )
|
||
|
|
||
|
|
||
|
def assert_radius_neighbors_results_quasi_equality(
    ref_dist,
    dist,
    ref_indices,
    indices,
    radius,
    rtol=1e-4,
):
    """Assert that radius neighborhood results are valid up to:
      - relative tolerance on computed distance values
      - permutations of indices for distances values that differ up to
        a precision level
      - missing or extra last elements if their distance is
        close to the radius

    To be used for testing neighbors queries on float32 datasets: we
    accept neighbors rank swaps only if they are caused by small
    rounding errors on the distance computations.

    Input arrays must be sorted w.r.t distances.
    """
    is_sorted = lambda a: np.all(a[:-1] <= a[1:])

    # Number of significant digits used when grouping tied distances,
    # derived from the relative tolerance.
    n_significant_digits = -(int(floor(log10(abs(rtol)))) + 1)

    assert (
        len(ref_dist) == len(dist) == len(ref_indices) == len(indices)
    ), "Arrays of results have various lengths."

    n_queries = len(ref_dist)

    # Asserting equality of results one vector at a time
    for query_idx in range(n_queries):

        ref_dist_row = ref_dist[query_idx]
        dist_row = dist[query_idx]

        assert is_sorted(
            ref_dist_row
        ), f"Reference distances aren't sorted on row {query_idx}"
        assert is_sorted(dist_row), f"Distances aren't sorted on row {query_idx}"

        # Vectors' lengths might be different due to small
        # numerical differences of distance w.r.t the `radius` threshold.
        largest_row = ref_dist_row if len(ref_dist_row) > len(dist_row) else dist_row

        # For the longest distances vector, we check that last extra elements
        # that aren't present in the other vector are all in: [radius ± rtol]
        min_length = min(len(ref_dist_row), len(dist_row))
        last_extra_elements = largest_row[min_length:]
        if last_extra_elements.size > 0:
            # BUGFIX: the previous chained comparison
            # `radius - rtol <= last_extra_elements <= radius + rtol`
            # implicitly calls bool() on an intermediate numpy array and
            # raises ValueError whenever more than one extra element exists.
            # Use an explicit element-wise conjunction instead.
            assert np.all(
                (radius - rtol <= last_extra_elements)
                & (last_extra_elements <= radius + rtol)
            ), (
                f"The last extra elements ({last_extra_elements}) aren't in [radius ±"
                f" rtol]=[{radius} ± {rtol}]"
            )

        # We truncate the neighbors results list on the smallest length to
        # be able to compare them, ignoring the elements checked above.
        ref_dist_row = ref_dist_row[:min_length]
        dist_row = dist_row[:min_length]

        assert_allclose(ref_dist_row, dist_row, rtol=rtol)

        ref_indices_row = ref_indices[query_idx]
        indices_row = indices[query_idx]

        # Grouping indices by distances using sets on a rounded distances up
        # to a given number of significant digits derived from rtol.
        reference_neighbors_groups = defaultdict(set)
        effective_neighbors_groups = defaultdict(set)

        for neighbor_rank in range(min_length):
            rounded_dist = relative_rounding(
                ref_dist_row[neighbor_rank],
                n_significant_digits=n_significant_digits,
            )
            reference_neighbors_groups[rounded_dist].add(ref_indices_row[neighbor_rank])
            effective_neighbors_groups[rounded_dist].add(indices_row[neighbor_rank])

        # Asserting equality of groups (sets) for each distance
        msg = (
            f"Neighbors indices for query {query_idx} are not matching "
            f"when rounding distances at {n_significant_digits} significant digits "
            f"derived from rtol={rtol:.1e}"
        )
        for rounded_distance in reference_neighbors_groups.keys():
            assert (
                reference_neighbors_groups[rounded_distance]
                == effective_neighbors_groups[rounded_distance]
            ), msg
|
||
|
|
||
|
|
||
|
# Mapping from (Dispatcher, dtype) to the assertion helper used to compare a
# reference result against an effective result in the tests below.
ASSERT_RESULT = {
    # In the case of 64bit, we test for exact equality of the results rankings
    # and standard tolerance levels for the computed distance values.
    #
    # XXX: Note that in the future we might be interested in using quasi equality
    # checks also for float64 data (with a larger number of significant digits)
    # as the tests could be unstable because of numerically tied distances on
    # some datasets (e.g. uniform grids).
    (ArgKmin, np.float64): assert_argkmin_results_equality,
    (
        RadiusNeighbors,
        np.float64,
    ): assert_radius_neighbors_results_equality,
    # In the case of 32bit, indices can be permuted due to small difference
    # in the computations of their associated distances, hence we test equality of
    # results up to valid permutations.
    (ArgKmin, np.float32): assert_argkmin_results_quasi_equality,
    (
        RadiusNeighbors,
        np.float32,
    ): assert_radius_neighbors_results_quasi_equality,
}
|
||
|
|
||
|
|
||
|
def test_assert_argkmin_results_quasi_equality():
    """Meta-test: check the argkmin quasi-equality helper on crafted inputs."""

    rtol = 1e-7
    eps = 1e-7
    # Values an eps away from 1.0 and 6.1: close enough to be considered
    # tied at the precision derived from rtol.
    _1m = 1.0 - eps
    _1p = 1.0 + eps

    _6_1m = 6.1 - eps
    _6_1p = 6.1 + eps

    ref_dist = np.array(
        [
            [1.2, 2.5, _6_1m, 6.1, _6_1p],
            [_1m, _1m, 1, _1p, _1p],
        ]
    )
    ref_indices = np.array(
        [
            [1, 2, 3, 4, 5],
            [6, 7, 8, 9, 10],
        ]
    )

    # Sanity check: compare the reference results to themselves.
    assert_argkmin_results_quasi_equality(
        ref_dist, ref_dist, ref_indices, ref_indices, rtol
    )

    # Apply valid permutation on indices: the last 3 points are
    # all very close to one another so we accept any permutation
    # on their rankings.
    assert_argkmin_results_quasi_equality(
        np.array([[1.2, 2.5, _6_1m, 6.1, _6_1p]]),
        np.array([[1.2, 2.5, 6.1, 6.1, 6.1]]),
        np.array([[1, 2, 3, 4, 5]]),
        np.array([[1, 2, 4, 5, 3]]),
        rtol=rtol,
    )
    # All points have close distances so any ranking permutation
    # is valid for this query result.
    assert_argkmin_results_quasi_equality(
        np.array([[_1m, _1m, 1, _1p, _1p]]),
        np.array([[_1m, _1m, 1, _1p, _1p]]),
        np.array([[6, 7, 8, 9, 10]]),
        np.array([[6, 9, 7, 8, 10]]),
        rtol=rtol,
    )

    # Apply invalid permutation on indices: permuting the ranks
    # of the 2 nearest neighbors is invalid because the distance
    # values are too different.
    msg = "Neighbors indices for query 0 are not matching"
    with pytest.raises(AssertionError, match=msg):
        assert_argkmin_results_quasi_equality(
            np.array([[1.2, 2.5, _6_1m, 6.1, _6_1p]]),
            np.array([[1.2, 2.5, _6_1m, 6.1, _6_1p]]),
            np.array([[1, 2, 3, 4, 5]]),
            np.array([[2, 1, 3, 4, 5]]),
            rtol=rtol,
        )

    # Indices aren't properly sorted w.r.t their distances
    msg = "Neighbors indices for query 0 are not matching"
    with pytest.raises(AssertionError, match=msg):
        assert_argkmin_results_quasi_equality(
            np.array([[1.2, 2.5, _6_1m, 6.1, _6_1p]]),
            np.array([[1.2, 2.5, _6_1m, 6.1, _6_1p]]),
            np.array([[1, 2, 3, 4, 5]]),
            np.array([[2, 1, 4, 5, 3]]),
            rtol=rtol,
        )

    # Distances aren't properly sorted
    msg = "Distances aren't sorted on row 0"
    with pytest.raises(AssertionError, match=msg):
        assert_argkmin_results_quasi_equality(
            np.array([[1.2, 2.5, _6_1m, 6.1, _6_1p]]),
            np.array([[2.5, 1.2, _6_1m, 6.1, _6_1p]]),
            np.array([[1, 2, 3, 4, 5]]),
            np.array([[2, 1, 4, 5, 3]]),
            rtol=rtol,
        )
|
||
|
|
||
|
|
||
|
def test_assert_radius_neighbors_results_quasi_equality():
    """Meta-test: check the radius-neighbors quasi-equality helper."""

    rtol = 1e-7
    eps = 1e-7
    # Values an eps away from 1.0 and 6.1: close enough to be considered
    # tied at the precision derived from rtol.
    _1m = 1.0 - eps
    _1p = 1.0 + eps

    _6_1m = 6.1 - eps
    _6_1p = 6.1 + eps

    # Ragged reference results: one variable-length row per query vector.
    ref_dist = [
        np.array([1.2, 2.5, _6_1m, 6.1, _6_1p]),
        np.array([_1m, 1, _1p, _1p]),
    ]

    ref_indices = [
        np.array([1, 2, 3, 4, 5]),
        np.array([6, 7, 8, 9]),
    ]

    # Sanity check: compare the reference results to themselves.
    assert_radius_neighbors_results_quasi_equality(
        ref_dist,
        ref_dist,
        ref_indices,
        ref_indices,
        radius=6.1,
        rtol=rtol,
    )

    # Apply valid permutation on indices
    assert_radius_neighbors_results_quasi_equality(
        np.array([np.array([1.2, 2.5, _6_1m, 6.1, _6_1p])]),
        np.array([np.array([1.2, 2.5, _6_1m, 6.1, _6_1p])]),
        np.array([np.array([1, 2, 3, 4, 5])]),
        np.array([np.array([1, 2, 4, 5, 3])]),
        radius=6.1,
        rtol=rtol,
    )
    assert_radius_neighbors_results_quasi_equality(
        np.array([np.array([_1m, _1m, 1, _1p, _1p])]),
        np.array([np.array([_1m, _1m, 1, _1p, _1p])]),
        np.array([np.array([6, 7, 8, 9, 10])]),
        np.array([np.array([6, 9, 7, 8, 10])]),
        radius=6.1,
        rtol=rtol,
    )

    # Apply invalid permutation on indices
    msg = "Neighbors indices for query 0 are not matching"
    with pytest.raises(AssertionError, match=msg):
        assert_radius_neighbors_results_quasi_equality(
            np.array([np.array([1.2, 2.5, _6_1m, 6.1, _6_1p])]),
            np.array([np.array([1.2, 2.5, _6_1m, 6.1, _6_1p])]),
            np.array([np.array([1, 2, 3, 4, 5])]),
            np.array([np.array([2, 1, 3, 4, 5])]),
            radius=6.1,
            rtol=rtol,
        )

    # Having extra last elements is valid if they are in: [radius ± rtol]
    assert_radius_neighbors_results_quasi_equality(
        np.array([np.array([1.2, 2.5, _6_1m, 6.1, _6_1p])]),
        np.array([np.array([1.2, 2.5, _6_1m, 6.1])]),
        np.array([np.array([1, 2, 3, 4, 5])]),
        np.array([np.array([1, 2, 3, 4])]),
        radius=6.1,
        rtol=rtol,
    )

    # Having extra last elements is invalid if they are lesser than radius - rtol
    msg = re.escape(
        "The last extra elements ([6.]) aren't in [radius ± rtol]=[6.1 ± 1e-07]"
    )
    with pytest.raises(AssertionError, match=msg):
        assert_radius_neighbors_results_quasi_equality(
            np.array([np.array([1.2, 2.5, 6])]),
            np.array([np.array([1.2, 2.5])]),
            np.array([np.array([1, 2, 3])]),
            np.array([np.array([1, 2])]),
            radius=6.1,
            rtol=rtol,
        )

    # Indices aren't properly sorted w.r.t their distances
    msg = "Neighbors indices for query 0 are not matching"
    with pytest.raises(AssertionError, match=msg):
        assert_radius_neighbors_results_quasi_equality(
            np.array([np.array([1.2, 2.5, _6_1m, 6.1, _6_1p])]),
            np.array([np.array([1.2, 2.5, _6_1m, 6.1, _6_1p])]),
            np.array([np.array([1, 2, 3, 4, 5])]),
            np.array([np.array([2, 1, 4, 5, 3])]),
            radius=6.1,
            rtol=rtol,
        )

    # Distances aren't properly sorted
    msg = "Distances aren't sorted on row 0"
    with pytest.raises(AssertionError, match=msg):
        assert_radius_neighbors_results_quasi_equality(
            np.array([np.array([1.2, 2.5, _6_1m, 6.1, _6_1p])]),
            np.array([np.array([2.5, 1.2, _6_1m, 6.1, _6_1p])]),
            np.array([np.array([1, 2, 3, 4, 5])]),
            np.array([np.array([2, 1, 4, 5, 3])]),
            radius=6.1,
            rtol=rtol,
        )
|
||
|
|
||
|
|
||
|
def test_pairwise_distances_reduction_is_usable_for():
    """Check which (X, Y, metric) combinations the dispatcher accepts."""
    rng = np.random.RandomState(0)
    X = rng.rand(100, 10)
    Y = rng.rand(100, 10)
    X_csr = csr_matrix(X)
    Y_csr = csr_matrix(Y)
    metric = "manhattan"

    # Must be usable for all possible pair of {dense, sparse} datasets
    assert BaseDistancesReductionDispatcher.is_usable_for(X, Y, metric)
    assert BaseDistancesReductionDispatcher.is_usable_for(X_csr, Y_csr, metric)
    assert BaseDistancesReductionDispatcher.is_usable_for(X_csr, Y, metric)
    assert BaseDistancesReductionDispatcher.is_usable_for(X, Y_csr, metric)

    # Both supported floating point dtypes are accepted.
    assert BaseDistancesReductionDispatcher.is_usable_for(
        X.astype(np.float64), Y.astype(np.float64), metric
    )

    assert BaseDistancesReductionDispatcher.is_usable_for(
        X.astype(np.float32), Y.astype(np.float32), metric
    )

    # Integer dtypes are rejected.
    assert not BaseDistancesReductionDispatcher.is_usable_for(
        X.astype(np.int64), Y.astype(np.int64), metric
    )

    # Unsupported metric, mixed dtypes between X and Y are rejected.
    assert not BaseDistancesReductionDispatcher.is_usable_for(X, Y, metric="pyfunc")
    assert not BaseDistancesReductionDispatcher.is_usable_for(
        X.astype(np.float32), Y, metric
    )
    assert not BaseDistancesReductionDispatcher.is_usable_for(
        X, Y.astype(np.int32), metric
    )

    # F-ordered arrays are not supported
    assert not BaseDistancesReductionDispatcher.is_usable_for(
        np.asfortranarray(X), Y, metric
    )

    # We prefer not to use those implementations for fused sparse-dense when
    # metric="(sq)euclidean" because it's not yet the most efficient one on
    # all configurations of datasets.
    # See: https://github.com/scikit-learn/scikit-learn/pull/23585#issuecomment-1247996669  # noqa
    # TODO: implement specialisation for (sq)euclidean on fused sparse-dense
    # using sparse-dense routines for matrix-vector multiplications.
    assert not BaseDistancesReductionDispatcher.is_usable_for(
        X_csr, Y, metric="euclidean"
    )
    assert BaseDistancesReductionDispatcher.is_usable_for(
        X_csr, Y_csr, metric="sqeuclidean"
    )
    assert BaseDistancesReductionDispatcher.is_usable_for(
        X_csr, Y_csr, metric="euclidean"
    )

    # CSR matrices without non-zeros elements aren't currently supported
    # TODO: support CSR matrices without non-zeros elements
    X_csr_0_nnz = csr_matrix(X * 0)
    assert not BaseDistancesReductionDispatcher.is_usable_for(X_csr_0_nnz, Y, metric)

    # CSR matrices with int64 indices and indptr (e.g. large nnz, or large n_features)
    # aren't supported as of now.
    # See: https://github.com/scikit-learn/scikit-learn/issues/23653
    # TODO: support CSR matrices with int64 indices and indptr
    X_csr_int64 = csr_matrix(X)
    X_csr_int64.indices = X_csr_int64.indices.astype(np.int64)
    assert not BaseDistancesReductionDispatcher.is_usable_for(X_csr_int64, Y, metric)
|
||
|
|
||
|
|
||
|
def test_argkmin_factory_method_wrong_usages():
    """Check error messages and warnings raised by ArgKmin.compute misuse."""
    rng = np.random.RandomState(1)
    X = rng.rand(100, 10)
    Y = rng.rand(100, 10)
    k = 5
    metric = "euclidean"

    # Mixed dtypes between X and Y must be rejected with an explicit message.
    msg = (
        "Only float64 or float32 datasets pairs are supported at this time, "
        "got: X.dtype=float32 and Y.dtype=float64"
    )
    with pytest.raises(ValueError, match=msg):
        ArgKmin.compute(X=X.astype(np.float32), Y=Y, k=k, metric=metric)

    msg = (
        "Only float64 or float32 datasets pairs are supported at this time, "
        "got: X.dtype=float64 and Y.dtype=int32"
    )
    with pytest.raises(ValueError, match=msg):
        ArgKmin.compute(X=X, Y=Y.astype(np.int32), k=k, metric=metric)

    # k must be a strictly positive integer.
    with pytest.raises(ValueError, match="k == -1, must be >= 1."):
        ArgKmin.compute(X=X, Y=Y, k=-1, metric=metric)

    with pytest.raises(ValueError, match="k == 0, must be >= 1."):
        ArgKmin.compute(X=X, Y=Y, k=0, metric=metric)

    with pytest.raises(ValueError, match="Unrecognized metric"):
        ArgKmin.compute(X=X, Y=Y, k=k, metric="wrong metric")

    # Inputs must be 2D and C-contiguous.
    with pytest.raises(
        ValueError, match=r"Buffer has wrong number of dimensions \(expected 2, got 1\)"
    ):
        ArgKmin.compute(X=np.array([1.0, 2.0]), Y=Y, k=k, metric=metric)

    with pytest.raises(ValueError, match="ndarray is not C-contiguous"):
        ArgKmin.compute(X=np.asfortranarray(X), Y=Y, k=k, metric=metric)

    # A UserWarning must be raised in this case.
    unused_metric_kwargs = {"p": 3}

    message = r"Some metric_kwargs have been passed \({'p': 3}\) but"

    with pytest.warns(UserWarning, match=message):
        ArgKmin.compute(
            X=X, Y=Y, k=k, metric=metric, metric_kwargs=unused_metric_kwargs
        )

    # A UserWarning must be raised in this case.
    metric_kwargs = {
        "p": 3,  # unused
        "Y_norm_squared": sqeuclidean_row_norms(Y, num_threads=2),
    }

    message = r"Some metric_kwargs have been passed \({'p': 3, 'Y_norm_squared'"

    with pytest.warns(UserWarning, match=message):
        ArgKmin.compute(X=X, Y=Y, k=k, metric=metric, metric_kwargs=metric_kwargs)

    # No user warning must be raised in this case.
    metric_kwargs = {
        "X_norm_squared": sqeuclidean_row_norms(X, num_threads=2),
    }
    # Turn UserWarning into an error so any unexpected warning fails the test.
    with warnings.catch_warnings():
        warnings.simplefilter("error", category=UserWarning)
        ArgKmin.compute(X=X, Y=Y, k=k, metric=metric, metric_kwargs=metric_kwargs)

    # No user warning must be raised in this case.
    metric_kwargs = {
        "X_norm_squared": sqeuclidean_row_norms(X, num_threads=2),
        "Y_norm_squared": sqeuclidean_row_norms(Y, num_threads=2),
    }
    with warnings.catch_warnings():
        warnings.simplefilter("error", category=UserWarning)
        ArgKmin.compute(X=X, Y=Y, k=k, metric=metric, metric_kwargs=metric_kwargs)
|
||
|
|
||
|
|
||
|
def test_radius_neighbors_factory_method_wrong_usages():
    """Check error messages and warnings raised by RadiusNeighbors.compute misuse."""
    rng = np.random.RandomState(1)
    X = rng.rand(100, 10)
    Y = rng.rand(100, 10)
    radius = 5
    metric = "euclidean"

    # Mixed dtypes between X and Y must be rejected with an explicit message.
    msg = (
        "Only float64 or float32 datasets pairs are supported at this time, "
        "got: X.dtype=float32 and Y.dtype=float64"
    )
    with pytest.raises(
        ValueError,
        match=msg,
    ):
        RadiusNeighbors.compute(
            X=X.astype(np.float32), Y=Y, radius=radius, metric=metric
        )

    msg = (
        "Only float64 or float32 datasets pairs are supported at this time, "
        "got: X.dtype=float64 and Y.dtype=int32"
    )
    with pytest.raises(
        ValueError,
        match=msg,
    ):
        RadiusNeighbors.compute(X=X, Y=Y.astype(np.int32), radius=radius, metric=metric)

    # The radius must be non-negative.
    with pytest.raises(ValueError, match="radius == -1.0, must be >= 0."):
        RadiusNeighbors.compute(X=X, Y=Y, radius=-1, metric=metric)

    with pytest.raises(ValueError, match="Unrecognized metric"):
        RadiusNeighbors.compute(X=X, Y=Y, radius=radius, metric="wrong metric")

    # Inputs must be 2D and C-contiguous.
    with pytest.raises(
        ValueError, match=r"Buffer has wrong number of dimensions \(expected 2, got 1\)"
    ):
        RadiusNeighbors.compute(
            X=np.array([1.0, 2.0]), Y=Y, radius=radius, metric=metric
        )

    with pytest.raises(ValueError, match="ndarray is not C-contiguous"):
        RadiusNeighbors.compute(
            X=np.asfortranarray(X), Y=Y, radius=radius, metric=metric
        )

    unused_metric_kwargs = {"p": 3}

    # A UserWarning must be raised in this case.
    message = r"Some metric_kwargs have been passed \({'p': 3}\) but"

    with pytest.warns(UserWarning, match=message):
        RadiusNeighbors.compute(
            X=X, Y=Y, radius=radius, metric=metric, metric_kwargs=unused_metric_kwargs
        )

    # A UserWarning must be raised in this case.
    metric_kwargs = {
        "p": 3,  # unused
        "Y_norm_squared": sqeuclidean_row_norms(Y, num_threads=2),
    }

    message = r"Some metric_kwargs have been passed \({'p': 3, 'Y_norm_squared'"

    with pytest.warns(UserWarning, match=message):
        RadiusNeighbors.compute(
            X=X, Y=Y, radius=radius, metric=metric, metric_kwargs=metric_kwargs
        )

    # No user warning must be raised in this case.
    metric_kwargs = {
        "X_norm_squared": sqeuclidean_row_norms(X, num_threads=2),
        "Y_norm_squared": sqeuclidean_row_norms(Y, num_threads=2),
    }
    # Turn UserWarning into an error so any unexpected warning fails the test.
    with warnings.catch_warnings():
        warnings.simplefilter("error", category=UserWarning)
        RadiusNeighbors.compute(
            X=X, Y=Y, radius=radius, metric=metric, metric_kwargs=metric_kwargs
        )

    # No user warning must be raised in this case.
    metric_kwargs = {
        "X_norm_squared": sqeuclidean_row_norms(X, num_threads=2),
    }
    with warnings.catch_warnings():
        warnings.simplefilter("error", category=UserWarning)
        RadiusNeighbors.compute(
            X=X, Y=Y, radius=radius, metric=metric, metric_kwargs=metric_kwargs
        )
|
||
|
|
||
|
|
||
|
@pytest.mark.parametrize(
    "n_samples_X, n_samples_Y", [(100, 100), (500, 100), (100, 500)]
)
@pytest.mark.parametrize("Dispatcher", [ArgKmin, RadiusNeighbors])
@pytest.mark.parametrize("dtype", [np.float64, np.float32])
def test_chunk_size_agnosticism(
    global_random_seed,
    Dispatcher,
    n_samples_X,
    n_samples_Y,
    dtype,
    n_features=100,
):
    """Check that results do not depend on the chunk size."""
    rng = np.random.RandomState(global_random_seed)
    spread = 100
    X = rng.rand(n_samples_X, n_features).astype(dtype) * spread
    Y = rng.rand(n_samples_Y, n_features).astype(dtype) * spread

    if Dispatcher is ArgKmin:
        # For ArgKmin the reduction parameter is the number of neighbors k.
        parameter = 10
        check_parameters = {}
        compute_parameters = {}
    else:
        # Scaling the radius slightly with the numbers of dimensions
        radius = 10 ** np.log(n_features)
        parameter = radius
        check_parameters = {"radius": radius}
        compute_parameters = {"sort_results": True}

    # Reference result computed with the default chunk size.
    ref_dist, ref_indices = Dispatcher.compute(
        X,
        Y,
        parameter,
        chunk_size=256,  # default
        metric="manhattan",
        return_distance=True,
        **compute_parameters,
    )

    # Same computation with an unusual chunk size must give the same result.
    dist, indices = Dispatcher.compute(
        X,
        Y,
        parameter,
        chunk_size=41,
        metric="manhattan",
        return_distance=True,
        **compute_parameters,
    )

    ASSERT_RESULT[(Dispatcher, dtype)](
        ref_dist, dist, ref_indices, indices, **check_parameters
    )
|
||
|
|
||
|
|
||
|
@pytest.mark.parametrize(
    "n_samples_X, n_samples_Y", [(100, 100), (500, 100), (100, 500)]
)
@pytest.mark.parametrize("Dispatcher", [ArgKmin, RadiusNeighbors])
@pytest.mark.parametrize("dtype", [np.float64, np.float32])
def test_n_threads_agnosticism(
    global_random_seed,
    Dispatcher,
    n_samples_X,
    n_samples_Y,
    dtype,
    n_features=100,
):
    """Check that results do not depend on the number of threads."""
    rng = np.random.RandomState(global_random_seed)
    spread = 100
    X = rng.rand(n_samples_X, n_features).astype(dtype) * spread
    Y = rng.rand(n_samples_Y, n_features).astype(dtype) * spread

    if Dispatcher is ArgKmin:
        # For ArgKmin the reduction parameter is the number of neighbors k.
        parameter = 10
        check_parameters = {}
        compute_parameters = {}
    else:
        # Scaling the radius slightly with the numbers of dimensions
        radius = 10 ** np.log(n_features)
        parameter = radius
        check_parameters = {"radius": radius}
        compute_parameters = {"sort_results": True}

    ref_dist, ref_indices = Dispatcher.compute(
        X,
        Y,
        parameter,
        chunk_size=25,  # make sure we use multiple threads
        return_distance=True,
        **compute_parameters,
    )

    # Re-run with OpenMP limited to a single thread: results must match.
    with threadpoolctl.threadpool_limits(limits=1, user_api="openmp"):
        dist, indices = Dispatcher.compute(
            X,
            Y,
            parameter,
            chunk_size=25,
            return_distance=True,
            **compute_parameters,
        )

    ASSERT_RESULT[(Dispatcher, dtype)](
        ref_dist, dist, ref_indices, indices, **check_parameters
    )
|
||
|
|
||
|
|
||
|
@pytest.mark.parametrize(
    "Dispatcher, dtype",
    [
        (ArgKmin, np.float64),
        (RadiusNeighbors, np.float32),
        (ArgKmin, np.float32),
        (RadiusNeighbors, np.float64),
    ],
)
def test_format_agnosticism(
    global_random_seed,
    Dispatcher,
    dtype,
):
    """Check that results do not depend on the format (dense, sparse) of the input."""
    rng = np.random.RandomState(global_random_seed)
    spread = 100
    n_samples, n_features = 100, 100

    X = rng.rand(n_samples, n_features).astype(dtype) * spread
    Y = rng.rand(n_samples, n_features).astype(dtype) * spread

    X_csr = csr_matrix(X)
    Y_csr = csr_matrix(Y)

    if Dispatcher is ArgKmin:
        # For ArgKmin the reduction parameter is the number of neighbors k.
        parameter = 10
        check_parameters = {}
        compute_parameters = {}
    else:
        # Scaling the radius slightly with the numbers of dimensions
        radius = 10 ** np.log(n_features)
        parameter = radius
        check_parameters = {"radius": radius}
        compute_parameters = {"sort_results": True}

    # Dense-dense result serves as the reference.
    dist_dense, indices_dense = Dispatcher.compute(
        X,
        Y,
        parameter,
        chunk_size=50,
        return_distance=True,
        **compute_parameters,
    )

    # Every other {dense, sparse} combination must match the reference.
    for _X, _Y in itertools.product((X, X_csr), (Y, Y_csr)):
        if _X is X and _Y is Y:
            continue
        dist, indices = Dispatcher.compute(
            _X,
            _Y,
            parameter,
            chunk_size=50,
            return_distance=True,
            **compute_parameters,
        )
        ASSERT_RESULT[(Dispatcher, dtype)](
            dist_dense,
            dist,
            indices_dense,
            indices,
            **check_parameters,
        )
|
||
|
|
||
|
|
||
|
@pytest.mark.parametrize(
    "n_samples_X, n_samples_Y", [(100, 100), (100, 500), (500, 100)]
)
@pytest.mark.parametrize(
    "metric",
    ["euclidean", "minkowski", "manhattan", "infinity", "seuclidean", "haversine"],
)
@pytest.mark.parametrize("Dispatcher", [ArgKmin, RadiusNeighbors])
@pytest.mark.parametrize("dtype", [np.float64, np.float32])
def test_strategies_consistency(
    global_random_seed,
    Dispatcher,
    metric,
    n_samples_X,
    n_samples_Y,
    dtype,
    n_features=10,
):
    """Check that the results do not depend on the strategy used."""
    rand_state = np.random.RandomState(global_random_seed)
    scale = 100
    X = rand_state.rand(n_samples_X, n_features).astype(dtype) * scale
    Y = rand_state.rand(n_samples_Y, n_features).astype(dtype) * scale

    if metric == "haversine":
        # Haversine distance only accepts 2D data.
        X = np.ascontiguousarray(X[:, :2])
        Y = np.ascontiguousarray(Y[:, :2])

    if Dispatcher is ArgKmin:
        reduction_param = 10
        assert_kwargs = {}
        extra_compute_kwargs = {}
    else:
        # RadiusNeighbors: grow the radius slowly with the dimensionality.
        radius = 10 ** np.log(n_features)
        reduction_param = radius
        assert_kwargs = {"radius": radius}
        extra_compute_kwargs = {"sort_results": True}

    # Take the first set of dummy metric parameters; hoisted out of the loop
    # because _get_metric_params_list is deterministic for a given seed.
    metric_kwargs = _get_metric_params_list(
        metric, n_features, seed=global_random_seed
    )[0]

    # Run the same reduction once per strategy.
    results = {}
    for strategy, n_samples_for_chunk in (
        ("parallel_on_X", n_samples_X),
        ("parallel_on_Y", n_samples_Y),
    ):
        results[strategy] = Dispatcher.compute(
            X,
            Y,
            reduction_param,
            metric=metric,
            metric_kwargs=metric_kwargs,
            # A small chunk size guarantees that parallelism kicks in.
            chunk_size=n_samples_for_chunk // 4,
            strategy=strategy,
            return_distance=True,
            **extra_compute_kwargs,
        )

    dist_par_X, indices_par_X = results["parallel_on_X"]
    dist_par_Y, indices_par_Y = results["parallel_on_Y"]

    ASSERT_RESULT[(Dispatcher, dtype)](
        dist_par_X, dist_par_Y, indices_par_X, indices_par_Y, **assert_kwargs
    )
|
||
|
|
||
|
|
||
|
# "Concrete Dispatchers"-specific tests
|
||
|
|
||
|
# TODO: Remove filterwarnings in 1.3 when wminkowski is removed
@pytest.mark.filterwarnings("ignore:WMinkowskiDistance:FutureWarning:sklearn")
@pytest.mark.parametrize("n_features", [50, 500])
@pytest.mark.parametrize("translation", [0, 1e6])
@pytest.mark.parametrize("metric", CDIST_PAIRWISE_DISTANCES_REDUCTION_COMMON_METRICS)
@pytest.mark.parametrize("strategy", ("parallel_on_X", "parallel_on_Y"))
@pytest.mark.parametrize("dtype", [np.float64, np.float32])
def test_pairwise_distances_argkmin(
    global_random_seed,
    n_features,
    translation,
    metric,
    strategy,
    dtype,
    n_samples=100,
    k=10,
):
    """Check ArgKmin results against a scipy/numpy brute-force reference."""
    # TODO: can we easily fix this discrepancy?
    edge_cases = [
        (np.float32, "chebyshev", 1000000.0),
        (np.float32, "cityblock", 1000000.0),
    ]
    if (dtype, metric, translation) in edge_cases:
        pytest.xfail("Numerical differences lead to small differences in results.")

    rng = np.random.RandomState(global_random_seed)
    spread = 1000
    X = translation + rng.rand(n_samples, n_features).astype(dtype) * spread
    Y = translation + rng.rand(n_samples, n_features).astype(dtype) * spread

    # Haversine distance only accepts 2D data
    if metric == "haversine":
        X = np.ascontiguousarray(X[:, :2])
        Y = np.ascontiguousarray(Y[:, :2])

    # Build the sparse versions only AFTER the potential haversine truncation
    # so that the dense and CSR inputs always hold identical data.
    X_csr = csr_matrix(X)
    Y_csr = csr_matrix(Y)

    # Seed the dummy metric parameters, consistently with
    # test_pairwise_distances_radius_neighbors.
    metric_kwargs = _get_metric_params_list(
        metric, n_features, seed=global_random_seed
    )[0]

    # Reference for argkmin results
    if metric == "euclidean":
        # Compare to scikit-learn GEMM optimized implementation
        dist_matrix = euclidean_distances(X, Y)
    else:
        dist_matrix = cdist(X, Y, metric=metric, **metric_kwargs)
    # Taking argkmin (indices of the k smallest values)
    argkmin_indices_ref = np.argsort(dist_matrix, axis=1)[:, :k]
    # Getting the associated distances
    argkmin_distances_ref = np.zeros(argkmin_indices_ref.shape, dtype=np.float64)
    for row_idx in range(argkmin_indices_ref.shape[0]):
        argkmin_distances_ref[row_idx] = dist_matrix[
            row_idx, argkmin_indices_ref[row_idx]
        ]

    # Check both the dense and the sparse input paths against the reference.
    for _X, _Y in [(X, Y), (X_csr, Y_csr)]:
        argkmin_distances, argkmin_indices = ArgKmin.compute(
            _X,
            _Y,
            k,
            metric=metric,
            metric_kwargs=metric_kwargs,
            return_distance=True,
            # So as to have more than a chunk, forcing parallelism.
            chunk_size=n_samples // 4,
            strategy=strategy,
        )

        ASSERT_RESULT[(ArgKmin, dtype)](
            argkmin_distances,
            argkmin_distances_ref,
            argkmin_indices,
            argkmin_indices_ref,
        )
|
||
|
|
||
|
|
||
|
# TODO: Remove filterwarnings in 1.3 when wminkowski is removed
@pytest.mark.filterwarnings("ignore:WMinkowskiDistance:FutureWarning:sklearn")
@pytest.mark.parametrize("n_features", [50, 500])
@pytest.mark.parametrize("translation", [0, 1e6])
@pytest.mark.parametrize("metric", CDIST_PAIRWISE_DISTANCES_REDUCTION_COMMON_METRICS)
@pytest.mark.parametrize("strategy", ("parallel_on_X", "parallel_on_Y"))
@pytest.mark.parametrize("dtype", [np.float64, np.float32])
def test_pairwise_distances_radius_neighbors(
    global_random_seed,
    n_features,
    translation,
    metric,
    strategy,
    dtype,
    n_samples=100,
):
    """Check RadiusNeighbors results against a scipy/numpy brute-force reference."""
    rand_state = np.random.RandomState(global_random_seed)
    scale = 1000
    radius = scale * np.log(n_features)
    X = translation + rand_state.rand(n_samples, n_features).astype(dtype) * scale
    Y = translation + rand_state.rand(n_samples, n_features).astype(dtype) * scale

    metric_kwargs = _get_metric_params_list(
        metric, n_features, seed=global_random_seed
    )[0]

    # Reference distances: GEMM-optimized path for euclidean, cdist otherwise.
    if metric == "euclidean":
        dist_matrix = euclidean_distances(X, Y)
    else:
        dist_matrix = cdist(X, Y, metric=metric, **metric_kwargs)

    # Brute-force reference: for each query row, keep the neighbors that fall
    # within the radius, ordered by increasing distance.
    neigh_indices_ref = []
    neigh_distances_ref = []
    for row in dist_matrix:
        in_range = np.flatnonzero(row <= radius)
        order = np.argsort(row[in_range])
        neigh_indices_ref.append(in_range[order])
        neigh_distances_ref.append(row[in_range][order])

    neigh_distances, neigh_indices = RadiusNeighbors.compute(
        X,
        Y,
        radius,
        metric=metric,
        metric_kwargs=metric_kwargs,
        return_distance=True,
        # So as to have more than a chunk, forcing parallelism.
        chunk_size=n_samples // 4,
        strategy=strategy,
        sort_results=True,
    )

    ASSERT_RESULT[(RadiusNeighbors, dtype)](
        neigh_distances, neigh_distances_ref, neigh_indices, neigh_indices_ref, radius
    )
|
||
|
|
||
|
|
||
|
@pytest.mark.parametrize("Dispatcher", [ArgKmin, RadiusNeighbors])
@pytest.mark.parametrize("metric", ["manhattan", "euclidean"])
@pytest.mark.parametrize("dtype", [np.float64, np.float32])
def test_memmap_backed_data(
    metric,
    Dispatcher,
    dtype,
):
    """Check that the results do not depend on the datasets writability."""
    rand_state = np.random.RandomState(0)
    scale = 100
    n_samples, n_features = 128, 10
    X = rand_state.rand(n_samples, n_features).astype(dtype) * scale
    Y = rand_state.rand(n_samples, n_features).astype(dtype) * scale

    # Read-only, memory-mapped copies of the same data.
    X_mm, Y_mm = create_memmap_backed_data([X, Y])

    if Dispatcher is ArgKmin:
        reduction_param = 10
        assert_kwargs = {}
        extra_compute_kwargs = {}
    else:
        # RadiusNeighbors: grow the radius slowly with the dimensionality.
        radius = 10 ** np.log(n_features)
        reduction_param = radius
        assert_kwargs = {"radius": radius}
        extra_compute_kwargs = {"sort_results": True}

    # Reference run on the writable in-memory arrays.
    ref_dist, ref_indices = Dispatcher.compute(
        X,
        Y,
        reduction_param,
        metric=metric,
        return_distance=True,
        **extra_compute_kwargs,
    )

    # Same computation on the read-only memmap-backed arrays.
    dist_mm, indices_mm = Dispatcher.compute(
        X_mm,
        Y_mm,
        reduction_param,
        metric=metric,
        return_distance=True,
        **extra_compute_kwargs,
    )

    ASSERT_RESULT[(Dispatcher, dtype)](
        ref_dist, dist_mm, ref_indices, indices_mm, **assert_kwargs
    )
|
||
|
|
||
|
|
||
|
@pytest.mark.parametrize("n_samples", [100, 1000])
@pytest.mark.parametrize("n_features", [5, 10, 100])
@pytest.mark.parametrize("num_threads", [1, 2, 8])
@pytest.mark.parametrize("dtype", [np.float64, np.float32])
def test_sqeuclidean_row_norms(
    global_random_seed,
    n_samples,
    n_features,
    num_threads,
    dtype,
):
    """Check sqeuclidean_row_norms on dense and sparse inputs against numpy."""
    rand_state = np.random.RandomState(global_random_seed)
    scale = 100
    X = rand_state.rand(n_samples, n_features).astype(dtype) * scale
    X_csr = csr_matrix(X)

    # numpy reference: squared L2 norm of each row.
    expected = np.linalg.norm(X, axis=1) ** 2

    assert_allclose(expected, sqeuclidean_row_norms(X, num_threads=num_threads))
    assert_allclose(expected, sqeuclidean_row_norms(X_csr, num_threads=num_threads))

    # Fortran-ordered (non C-contiguous) dense input must be rejected.
    with pytest.raises(ValueError):
        sqeuclidean_row_norms(np.asfortranarray(X), num_threads=num_threads)