3RNN/Lib/site-packages/sklearn/metrics/tests/test_pairwise_distances_reduction.py

import itertools
import re
import warnings
from functools import partial
import numpy as np
import pytest
from scipy.spatial.distance import cdist
from sklearn import _threadpool_controller
from sklearn.metrics import euclidean_distances, pairwise_distances
from sklearn.metrics._pairwise_distances_reduction import (
ArgKmin,
ArgKminClassMode,
BaseDistancesReductionDispatcher,
RadiusNeighbors,
RadiusNeighborsClassMode,
sqeuclidean_row_norms,
)
from sklearn.utils._testing import (
assert_allclose,
assert_array_equal,
create_memmap_backed_data,
)
from sklearn.utils.fixes import CSR_CONTAINERS
# Common metrics supported by both scipy.spatial.distance.cdist
# and BaseDistancesReductionDispatcher.
# This allows constructing tests that check the consistency of the results
# of concrete BaseDistancesReductionDispatcher subclasses on some metrics
# against reference implementations using APIs from scipy and numpy
# (see the sketch after the list below).
CDIST_PAIRWISE_DISTANCES_REDUCTION_COMMON_METRICS = [
"braycurtis",
"canberra",
"chebyshev",
"cityblock",
"euclidean",
"minkowski",
"seuclidean",
]
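# A minimal sketch of the consistency-check pattern used throughout this module
# (illustrative only, with made-up small inputs; the actual tests below handle
# dtypes, sparse formats, tolerances and parametrization):
#
#     rng = np.random.RandomState(0)
#     X, Y, k = rng.rand(5, 10), rng.rand(20, 10), 3
#     # Reference computed with scipy/numpy APIs.
#     dist_matrix = cdist(X, Y, metric="cityblock")
#     ref_indices = np.argsort(dist_matrix, axis=1)[:, :k]
#     ref_dist = np.take_along_axis(dist_matrix, ref_indices, axis=1)
#     # Result computed by the dispatcher under test, compared with the
#     # tolerant checkers defined below.
#     dist, indices = ArgKmin.compute(
#         X, Y, k, metric="manhattan", return_distance=True
#     )
#     assert_compatible_argkmin_results(ref_dist, dist, ref_indices, indices)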
def _get_metric_params_list(metric: str, n_features: int, seed: int = 1):
"""Return list of dummy DistanceMetric kwargs for tests."""
# Distinguishing on cases not to compute unneeded datastructures.
rng = np.random.RandomState(seed)
if metric == "minkowski":
minkowski_kwargs = [
dict(p=1.5),
dict(p=2),
dict(p=3),
dict(p=np.inf),
dict(p=3, w=rng.rand(n_features)),
]
return minkowski_kwargs
if metric == "seuclidean":
return [dict(V=rng.rand(n_features))]
    # Case of "euclidean", "manhattan", "chebyshev", "haversine" or any other
    # metric: in those cases, no kwargs are needed.
return [{}]
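# Illustrative usage (a sketch, not executed here): the returned kwargs are
# meant to be splatted into distance computations, e.g.:
#
#     rng = np.random.RandomState(0)
#     X, Y = rng.rand(4, 3), rng.rand(6, 3)
#     for metric_kwargs in _get_metric_params_list("minkowski", n_features=3):
#         # e.g. {"p": 1.5}, {"p": 2}, {"p": 3}, {"p": inf}, {"p": 3, "w": ...}
#         cdist(X, Y, metric="minkowski", **metric_kwargs)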
def assert_same_distances_for_common_neighbors(
query_idx,
dist_row_a,
dist_row_b,
indices_row_a,
indices_row_b,
rtol,
atol,
):
"""Check that the distances of common neighbors are equal up to tolerance.
This does not check if there are missing neighbors in either result set.
Missingness is handled by assert_no_missing_neighbors.
"""
# Compute a mapping from indices to distances for each result set and
# check that the computed neighbors with matching indices are within
# the expected distance tolerance.
indices_to_dist_a = dict(zip(indices_row_a, dist_row_a))
indices_to_dist_b = dict(zip(indices_row_b, dist_row_b))
common_indices = set(indices_row_a).intersection(set(indices_row_b))
for idx in common_indices:
dist_a = indices_to_dist_a[idx]
dist_b = indices_to_dist_b[idx]
try:
assert_allclose(dist_a, dist_b, rtol=rtol, atol=atol)
except AssertionError as e:
# Wrap exception to provide more context while also including
# the original exception with the computed absolute and
# relative differences.
raise AssertionError(
f"Query vector with index {query_idx} lead to different distances"
f" for common neighbor with index {idx}:"
f" dist_a={dist_a} vs dist_b={dist_b} (with atol={atol} and"
f" rtol={rtol})"
) from e
def assert_no_missing_neighbors(
query_idx,
dist_row_a,
dist_row_b,
indices_row_a,
indices_row_b,
threshold,
):
"""Compare the indices of neighbors in two results sets.
Any neighbor index with a distance below the precision threshold should
match one in the other result set. We ignore the last few neighbors beyond
the threshold as those can typically be missing due to rounding errors.
For radius queries, the threshold is just the radius minus the expected
precision level.
For k-NN queries, it is the maximum distance to the k-th neighbor minus the
expected precision level.
"""
mask_a = dist_row_a < threshold
mask_b = dist_row_b < threshold
missing_from_b = np.setdiff1d(indices_row_a[mask_a], indices_row_b)
missing_from_a = np.setdiff1d(indices_row_b[mask_b], indices_row_a)
if len(missing_from_a) > 0 or len(missing_from_b) > 0:
raise AssertionError(
f"Query vector with index {query_idx} lead to mismatched result indices:\n"
f"neighbors in b missing from a: {missing_from_a}\n"
f"neighbors in a missing from b: {missing_from_b}\n"
f"dist_row_a={dist_row_a}\n"
f"dist_row_b={dist_row_b}\n"
f"indices_row_a={indices_row_a}\n"
f"indices_row_b={indices_row_b}\n"
)
def assert_compatible_argkmin_results(
neighbors_dists_a,
neighbors_dists_b,
neighbors_indices_a,
neighbors_indices_b,
rtol=1e-5,
atol=1e-6,
):
"""Assert that argkmin results are valid up to rounding errors.
This function asserts that the results of argkmin queries are valid up to:
- rounding error tolerance on distance values;
    - permutations of indices for distance values that differ up to the
      expected precision level.
Furthermore, the distances must be sorted.
To be used for testing neighbors queries on float32 datasets: we accept
neighbors rank swaps only if they are caused by small rounding errors on
the distance computations.
"""
is_sorted = lambda a: np.all(a[:-1] <= a[1:])
assert (
neighbors_dists_a.shape
== neighbors_dists_b.shape
== neighbors_indices_a.shape
== neighbors_indices_b.shape
), "Arrays of results have incompatible shapes."
n_queries, _ = neighbors_dists_a.shape
    # Asserting equality of results one row at a time
for query_idx in range(n_queries):
dist_row_a = neighbors_dists_a[query_idx]
dist_row_b = neighbors_dists_b[query_idx]
indices_row_a = neighbors_indices_a[query_idx]
indices_row_b = neighbors_indices_b[query_idx]
assert is_sorted(dist_row_a), f"Distances aren't sorted on row {query_idx}"
assert is_sorted(dist_row_b), f"Distances aren't sorted on row {query_idx}"
assert_same_distances_for_common_neighbors(
query_idx,
dist_row_a,
dist_row_b,
indices_row_a,
indices_row_b,
rtol,
atol,
)
        # Check that neighbors with distances below the rounding error
        # threshold have matching indices. The threshold is the distance to the
        # k-th neighbor minus the expected precision level:
#
# (1 - rtol) * dist_k - atol
#
# Where dist_k is defined as the maximum distance to the kth-neighbor
# among the two result sets. This way of defining the threshold is
# stricter than taking the minimum of the two.
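        # For instance, with the default rtol=1e-5 and atol=1e-6 and a k-th
        # neighbor distance of 6.1, the threshold is
        # (1 - 1e-5) * 6.1 - 1e-6, which is about 6.099938: only neighbors
        # strictly closer than that are required to appear in both result sets.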
threshold = (1 - rtol) * np.maximum(
np.max(dist_row_a), np.max(dist_row_b)
) - atol
assert_no_missing_neighbors(
query_idx,
dist_row_a,
dist_row_b,
indices_row_a,
indices_row_b,
threshold,
)
def _non_trivial_radius(
*,
X=None,
Y=None,
metric=None,
precomputed_dists=None,
expected_n_neighbors=10,
n_subsampled_queries=10,
**metric_kwargs,
):
# Find a non-trivial radius using a small subsample of the pairwise
# distances between X and Y: we want to return around expected_n_neighbors
# on average. Yielding too many results would make the test slow (because
# checking the results is expensive for large result sets), yielding 0 most
# of the time would make the test useless.
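    # For instance (made-up numbers): if, after sorting each row of the sampled
    # distance matrix, the value in column `expected_n_neighbors` is around 3.2
    # for most rows, the returned radius is about 3.2, so a radius query is
    # expected to return roughly `expected_n_neighbors` neighbors per query on
    # average.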
assert (
precomputed_dists is not None or metric is not None
), "Either metric or precomputed_dists must be provided."
if precomputed_dists is None:
assert X is not None
assert Y is not None
sampled_dists = pairwise_distances(X, Y, metric=metric, **metric_kwargs)
else:
sampled_dists = precomputed_dists[:n_subsampled_queries].copy()
sampled_dists.sort(axis=1)
return sampled_dists[:, expected_n_neighbors].mean()
def assert_compatible_radius_results(
neighbors_dists_a,
neighbors_dists_b,
neighbors_indices_a,
neighbors_indices_b,
radius,
check_sorted=True,
rtol=1e-5,
atol=1e-6,
):
"""Assert that radius neighborhood results are valid up to:
- relative and absolute tolerance on computed distance values
    - permutations of indices for distance values that differ up to
a precision level
- missing or extra last elements if their distance is
close to the radius
To be used for testing neighbors queries on float32 datasets: we
accept neighbors rank swaps only if they are caused by small
rounding errors on the distance computations.
    Input arrays must be sorted w.r.t. distances.
"""
is_sorted = lambda a: np.all(a[:-1] <= a[1:])
assert (
len(neighbors_dists_a)
== len(neighbors_dists_b)
== len(neighbors_indices_a)
== len(neighbors_indices_b)
)
n_queries = len(neighbors_dists_a)
# Asserting equality of results one vector at a time
for query_idx in range(n_queries):
dist_row_a = neighbors_dists_a[query_idx]
dist_row_b = neighbors_dists_b[query_idx]
indices_row_a = neighbors_indices_a[query_idx]
indices_row_b = neighbors_indices_b[query_idx]
if check_sorted:
assert is_sorted(dist_row_a), f"Distances aren't sorted on row {query_idx}"
assert is_sorted(dist_row_b), f"Distances aren't sorted on row {query_idx}"
assert len(dist_row_a) == len(indices_row_a)
assert len(dist_row_b) == len(indices_row_b)
# Check that all distances are within the requested radius
if len(dist_row_a) > 0:
max_dist_a = np.max(dist_row_a)
assert max_dist_a <= radius, (
f"Largest returned distance {max_dist_a} not within requested"
f" radius {radius} on row {query_idx}"
)
if len(dist_row_b) > 0:
max_dist_b = np.max(dist_row_b)
assert max_dist_b <= radius, (
f"Largest returned distance {max_dist_b} not within requested"
f" radius {radius} on row {query_idx}"
)
assert_same_distances_for_common_neighbors(
query_idx,
dist_row_a,
dist_row_b,
indices_row_a,
indices_row_b,
rtol,
atol,
)
threshold = (1 - rtol) * radius - atol
assert_no_missing_neighbors(
query_idx,
dist_row_a,
dist_row_b,
indices_row_a,
indices_row_b,
threshold,
)
FLOAT32_TOLS = {
"atol": 1e-7,
"rtol": 1e-5,
}
FLOAT64_TOLS = {
"atol": 1e-9,
"rtol": 1e-7,
}
ASSERT_RESULT = {
(ArgKmin, np.float64): partial(assert_compatible_argkmin_results, **FLOAT64_TOLS),
(ArgKmin, np.float32): partial(assert_compatible_argkmin_results, **FLOAT32_TOLS),
(
RadiusNeighbors,
np.float64,
): partial(assert_compatible_radius_results, **FLOAT64_TOLS),
(
RadiusNeighbors,
np.float32,
): partial(assert_compatible_radius_results, **FLOAT32_TOLS),
}
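# ASSERT_RESULT maps a (Dispatcher, dtype) pair to the tolerant result checker
# to use. Illustrative usage (a sketch of the pattern used by the tests below):
#
#     ASSERT_RESULT[(ArgKmin, np.float64)](ref_dist, dist, ref_indices, indices)
#     ASSERT_RESULT[(RadiusNeighbors, np.float32)](
#         ref_dist, dist, ref_indices, indices, radius=radius
#     )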
def test_assert_compatible_argkmin_results():
atol = 1e-7
rtol = 0.0
tols = dict(atol=atol, rtol=rtol)
eps = atol / 3
_1m = 1.0 - eps
_1p = 1.0 + eps
_6_1m = 6.1 - eps
_6_1p = 6.1 + eps
ref_dist = np.array(
[
[1.2, 2.5, _6_1m, 6.1, _6_1p],
[_1m, _1m, 1, _1p, _1p],
]
)
ref_indices = np.array(
[
[1, 2, 3, 4, 5],
[6, 7, 8, 9, 10],
]
)
# Sanity check: compare the reference results to themselves.
assert_compatible_argkmin_results(
ref_dist, ref_dist, ref_indices, ref_indices, rtol
)
# Apply valid permutation on indices: the last 3 points are all very close
# to one another so we accept any permutation on their rankings.
assert_compatible_argkmin_results(
np.array([[1.2, 2.5, _6_1m, 6.1, _6_1p]]),
np.array([[1.2, 2.5, _6_1m, 6.1, _6_1p]]),
np.array([[1, 2, 3, 4, 5]]),
np.array([[1, 2, 5, 4, 3]]),
**tols,
)
# The last few indices do not necessarily have to match because of the rounding
# errors on the distances: there could be tied results at the boundary.
assert_compatible_argkmin_results(
np.array([[1.2, 2.5, 3.0, 6.1, _6_1p]]),
np.array([[1.2, 2.5, 3.0, _6_1m, 6.1]]),
np.array([[1, 2, 3, 4, 5]]),
np.array([[1, 2, 3, 6, 7]]),
**tols,
)
# All points have close distances so any ranking permutation
# is valid for this query result.
assert_compatible_argkmin_results(
np.array([[_1m, 1, _1p, _1p, _1p]]),
np.array([[1, 1, 1, 1, _1p]]),
np.array([[7, 6, 8, 10, 9]]),
np.array([[6, 9, 7, 8, 10]]),
**tols,
)
    # They could also be near-truncations of very large, nearly tied result
    # sets, hence all indices can also be distinct in this case:
assert_compatible_argkmin_results(
np.array([[_1m, 1, _1p, _1p, _1p]]),
np.array([[_1m, 1, 1, 1, _1p]]),
np.array([[34, 30, 8, 12, 24]]),
np.array([[42, 1, 21, 13, 3]]),
**tols,
)
# Apply invalid permutation on indices: permuting the ranks of the 2
# nearest neighbors is invalid because the distance values are too
# different.
msg = re.escape(
"Query vector with index 0 lead to different distances for common neighbor with"
" index 1: dist_a=1.2 vs dist_b=2.5"
)
with pytest.raises(AssertionError, match=msg):
assert_compatible_argkmin_results(
np.array([[1.2, 2.5, _6_1m, 6.1, _6_1p]]),
np.array([[1.2, 2.5, _6_1m, 6.1, _6_1p]]),
np.array([[1, 2, 3, 4, 5]]),
np.array([[2, 1, 3, 4, 5]]),
**tols,
)
# Detect missing indices within the expected precision level, even when the
# distances match exactly.
msg = re.escape(
"neighbors in b missing from a: [12]\nneighbors in a missing from b: [1]"
)
with pytest.raises(AssertionError, match=msg):
assert_compatible_argkmin_results(
np.array([[1.2, 2.5, _6_1m, 6.1, _6_1p]]),
np.array([[1.2, 2.5, _6_1m, 6.1, _6_1p]]),
np.array([[1, 2, 3, 4, 5]]),
np.array([[12, 2, 4, 11, 3]]),
**tols,
)
# Detect missing indices outside the expected precision level.
msg = re.escape(
"neighbors in b missing from a: []\nneighbors in a missing from b: [3]"
)
with pytest.raises(AssertionError, match=msg):
assert_compatible_argkmin_results(
np.array([[_1m, 1.0, _6_1m, 6.1, _6_1p]]),
np.array([[1.0, 1.0, _6_1m, 6.1, 7]]),
np.array([[1, 2, 3, 4, 5]]),
np.array([[2, 1, 4, 5, 12]]),
**tols,
)
# Detect missing indices outside the expected precision level, in the other
# direction:
msg = re.escape(
"neighbors in b missing from a: [5]\nneighbors in a missing from b: []"
)
with pytest.raises(AssertionError, match=msg):
assert_compatible_argkmin_results(
np.array([[_1m, 1.0, _6_1m, 6.1, 7]]),
np.array([[1.0, 1.0, _6_1m, 6.1, _6_1p]]),
np.array([[1, 2, 3, 4, 12]]),
np.array([[2, 1, 5, 3, 4]]),
**tols,
)
# Distances aren't properly sorted
msg = "Distances aren't sorted on row 0"
with pytest.raises(AssertionError, match=msg):
assert_compatible_argkmin_results(
np.array([[1.2, 2.5, _6_1m, 6.1, _6_1p]]),
np.array([[2.5, 1.2, _6_1m, 6.1, _6_1p]]),
np.array([[1, 2, 3, 4, 5]]),
np.array([[2, 1, 4, 5, 3]]),
**tols,
)
@pytest.mark.parametrize("check_sorted", [True, False])
def test_assert_compatible_radius_results(check_sorted):
atol = 1e-7
rtol = 0.0
tols = dict(atol=atol, rtol=rtol)
eps = atol / 3
_1m = 1.0 - eps
_1p = 1.0 + eps
_6_1m = 6.1 - eps
_6_1p = 6.1 + eps
ref_dist = [
np.array([1.2, 2.5, _6_1m, 6.1, _6_1p]),
np.array([_1m, 1, _1p, _1p]),
]
ref_indices = [
np.array([1, 2, 3, 4, 5]),
np.array([6, 7, 8, 9]),
]
# Sanity check: compare the reference results to themselves.
assert_compatible_radius_results(
ref_dist,
ref_dist,
ref_indices,
ref_indices,
radius=7.0,
check_sorted=check_sorted,
**tols,
)
# Apply valid permutation on indices
assert_compatible_radius_results(
np.array([np.array([1.2, 2.5, _6_1m, 6.1, _6_1p])]),
np.array([np.array([1.2, 2.5, _6_1m, 6.1, _6_1p])]),
np.array([np.array([1, 2, 3, 4, 5])]),
np.array([np.array([1, 2, 4, 5, 3])]),
radius=7.0,
check_sorted=check_sorted,
**tols,
)
assert_compatible_radius_results(
np.array([np.array([_1m, _1m, 1, _1p, _1p])]),
np.array([np.array([_1m, _1m, 1, _1p, _1p])]),
np.array([np.array([6, 7, 8, 9, 10])]),
np.array([np.array([6, 9, 7, 8, 10])]),
radius=7.0,
check_sorted=check_sorted,
**tols,
)
# Apply invalid permutation on indices
msg = re.escape(
"Query vector with index 0 lead to different distances for common neighbor with"
" index 1: dist_a=1.2 vs dist_b=2.5"
)
with pytest.raises(AssertionError, match=msg):
assert_compatible_radius_results(
np.array([np.array([1.2, 2.5, _6_1m, 6.1, _6_1p])]),
np.array([np.array([1.2, 2.5, _6_1m, 6.1, _6_1p])]),
np.array([np.array([1, 2, 3, 4, 5])]),
np.array([np.array([2, 1, 3, 4, 5])]),
radius=7.0,
check_sorted=check_sorted,
**tols,
)
    # Having extra or missing trailing elements is valid if they fall within the
    # tolerated rounding error range: [(1 - rtol) * radius - atol, radius]
assert_compatible_radius_results(
np.array([np.array([1.2, 2.5, _6_1m, 6.1, _6_1p, _6_1p])]),
np.array([np.array([1.2, 2.5, _6_1m, 6.1])]),
np.array([np.array([1, 2, 3, 4, 5, 7])]),
np.array([np.array([1, 2, 3, 6])]),
radius=_6_1p,
check_sorted=check_sorted,
**tols,
)
# Any discrepancy outside the tolerated rounding error range is invalid and
# indicates a missing neighbor in one of the result sets.
msg = re.escape(
"Query vector with index 0 lead to mismatched result indices:\nneighbors in b"
" missing from a: []\nneighbors in a missing from b: [3]"
)
with pytest.raises(AssertionError, match=msg):
assert_compatible_radius_results(
np.array([np.array([1.2, 2.5, 6])]),
np.array([np.array([1.2, 2.5])]),
np.array([np.array([1, 2, 3])]),
np.array([np.array([1, 2])]),
radius=6.1,
check_sorted=check_sorted,
**tols,
)
msg = re.escape(
"Query vector with index 0 lead to mismatched result indices:\nneighbors in b"
" missing from a: [4]\nneighbors in a missing from b: [2]"
)
with pytest.raises(AssertionError, match=msg):
assert_compatible_radius_results(
np.array([np.array([1.2, 2.1, 2.5])]),
np.array([np.array([1.2, 2, 2.5])]),
np.array([np.array([1, 2, 3])]),
np.array([np.array([1, 4, 3])]),
radius=6.1,
check_sorted=check_sorted,
**tols,
)
# Radius upper bound is strictly checked
msg = re.escape(
"Largest returned distance 6.100000033333333 not within requested radius 6.1 on"
" row 0"
)
with pytest.raises(AssertionError, match=msg):
assert_compatible_radius_results(
np.array([np.array([1.2, 2.5, _6_1m, 6.1, _6_1p])]),
np.array([np.array([1.2, 2.5, _6_1m, 6.1, 6.1])]),
np.array([np.array([1, 2, 3, 4, 5])]),
np.array([np.array([2, 1, 4, 5, 3])]),
radius=6.1,
check_sorted=check_sorted,
**tols,
)
with pytest.raises(AssertionError, match=msg):
assert_compatible_radius_results(
np.array([np.array([1.2, 2.5, _6_1m, 6.1, 6.1])]),
np.array([np.array([1.2, 2.5, _6_1m, 6.1, _6_1p])]),
np.array([np.array([1, 2, 3, 4, 5])]),
np.array([np.array([2, 1, 4, 5, 3])]),
radius=6.1,
check_sorted=check_sorted,
**tols,
)
if check_sorted:
# Distances aren't properly sorted
msg = "Distances aren't sorted on row 0"
with pytest.raises(AssertionError, match=msg):
assert_compatible_radius_results(
np.array([np.array([1.2, 2.5, _6_1m, 6.1, _6_1p])]),
np.array([np.array([2.5, 1.2, _6_1m, 6.1, _6_1p])]),
np.array([np.array([1, 2, 3, 4, 5])]),
np.array([np.array([2, 1, 4, 5, 3])]),
radius=_6_1p,
check_sorted=True,
**tols,
)
else:
assert_compatible_radius_results(
np.array([np.array([1.2, 2.5, _6_1m, 6.1, _6_1p])]),
np.array([np.array([2.5, 1.2, _6_1m, 6.1, _6_1p])]),
np.array([np.array([1, 2, 3, 4, 5])]),
np.array([np.array([2, 1, 4, 5, 3])]),
radius=_6_1p,
check_sorted=False,
**tols,
)
@pytest.mark.parametrize("csr_container", CSR_CONTAINERS)
def test_pairwise_distances_reduction_is_usable_for(csr_container):
rng = np.random.RandomState(0)
X = rng.rand(100, 10)
Y = rng.rand(100, 10)
X_csr = csr_container(X)
Y_csr = csr_container(Y)
metric = "manhattan"
    # Must be usable for all possible pairs of {dense, sparse} datasets
assert BaseDistancesReductionDispatcher.is_usable_for(X, Y, metric)
assert BaseDistancesReductionDispatcher.is_usable_for(X_csr, Y_csr, metric)
assert BaseDistancesReductionDispatcher.is_usable_for(X_csr, Y, metric)
assert BaseDistancesReductionDispatcher.is_usable_for(X, Y_csr, metric)
assert BaseDistancesReductionDispatcher.is_usable_for(
X.astype(np.float64), Y.astype(np.float64), metric
)
assert BaseDistancesReductionDispatcher.is_usable_for(
X.astype(np.float32), Y.astype(np.float32), metric
)
assert not BaseDistancesReductionDispatcher.is_usable_for(
X.astype(np.int64), Y.astype(np.int64), metric
)
assert not BaseDistancesReductionDispatcher.is_usable_for(X, Y, metric="pyfunc")
assert not BaseDistancesReductionDispatcher.is_usable_for(
X.astype(np.float32), Y, metric
)
assert not BaseDistancesReductionDispatcher.is_usable_for(
X, Y.astype(np.int32), metric
)
# F-ordered arrays are not supported
assert not BaseDistancesReductionDispatcher.is_usable_for(
np.asfortranarray(X), Y, metric
)
assert BaseDistancesReductionDispatcher.is_usable_for(X_csr, Y, metric="euclidean")
assert BaseDistancesReductionDispatcher.is_usable_for(
X, Y_csr, metric="sqeuclidean"
)
    # FIXME: the current Cython implementation is too slow for a large number of
    # features. We temporarily disable it to fall back on SciPy's implementation.
    # See: https://github.com/scikit-learn/scikit-learn/issues/28191
assert not BaseDistancesReductionDispatcher.is_usable_for(
X_csr, Y_csr, metric="sqeuclidean"
)
assert not BaseDistancesReductionDispatcher.is_usable_for(
X_csr, Y_csr, metric="euclidean"
)
    # CSR matrices without non-zero elements aren't currently supported
    # TODO: support CSR matrices without non-zero elements
X_csr_0_nnz = csr_container(X * 0)
assert not BaseDistancesReductionDispatcher.is_usable_for(X_csr_0_nnz, Y, metric)
# CSR matrices with int64 indices and indptr (e.g. large nnz, or large n_features)
# aren't supported as of now.
# See: https://github.com/scikit-learn/scikit-learn/issues/23653
# TODO: support CSR matrices with int64 indices and indptr
X_csr_int64 = csr_container(X)
X_csr_int64.indices = X_csr_int64.indices.astype(np.int64)
assert not BaseDistancesReductionDispatcher.is_usable_for(X_csr_int64, Y, metric)
def test_argkmin_factory_method_wrong_usages():
rng = np.random.RandomState(1)
X = rng.rand(100, 10)
Y = rng.rand(100, 10)
k = 5
metric = "euclidean"
msg = (
"Only float64 or float32 datasets pairs are supported at this time, "
"got: X.dtype=float32 and Y.dtype=float64"
)
with pytest.raises(ValueError, match=msg):
ArgKmin.compute(X=X.astype(np.float32), Y=Y, k=k, metric=metric)
msg = (
"Only float64 or float32 datasets pairs are supported at this time, "
"got: X.dtype=float64 and Y.dtype=int32"
)
with pytest.raises(ValueError, match=msg):
ArgKmin.compute(X=X, Y=Y.astype(np.int32), k=k, metric=metric)
with pytest.raises(ValueError, match="k == -1, must be >= 1."):
ArgKmin.compute(X=X, Y=Y, k=-1, metric=metric)
with pytest.raises(ValueError, match="k == 0, must be >= 1."):
ArgKmin.compute(X=X, Y=Y, k=0, metric=metric)
with pytest.raises(ValueError, match="Unrecognized metric"):
ArgKmin.compute(X=X, Y=Y, k=k, metric="wrong metric")
with pytest.raises(
ValueError, match=r"Buffer has wrong number of dimensions \(expected 2, got 1\)"
):
ArgKmin.compute(X=np.array([1.0, 2.0]), Y=Y, k=k, metric=metric)
with pytest.raises(ValueError, match="ndarray is not C-contiguous"):
ArgKmin.compute(X=np.asfortranarray(X), Y=Y, k=k, metric=metric)
# A UserWarning must be raised in this case.
unused_metric_kwargs = {"p": 3}
message = r"Some metric_kwargs have been passed \({'p': 3}\) but"
with pytest.warns(UserWarning, match=message):
ArgKmin.compute(
X=X, Y=Y, k=k, metric=metric, metric_kwargs=unused_metric_kwargs
)
# A UserWarning must be raised in this case.
metric_kwargs = {
"p": 3, # unused
"Y_norm_squared": sqeuclidean_row_norms(Y, num_threads=2),
}
message = r"Some metric_kwargs have been passed \({'p': 3, 'Y_norm_squared'"
with pytest.warns(UserWarning, match=message):
ArgKmin.compute(X=X, Y=Y, k=k, metric=metric, metric_kwargs=metric_kwargs)
# No user warning must be raised in this case.
metric_kwargs = {
"X_norm_squared": sqeuclidean_row_norms(X, num_threads=2),
}
with warnings.catch_warnings():
warnings.simplefilter("error", category=UserWarning)
ArgKmin.compute(X=X, Y=Y, k=k, metric=metric, metric_kwargs=metric_kwargs)
# No user warning must be raised in this case.
metric_kwargs = {
"X_norm_squared": sqeuclidean_row_norms(X, num_threads=2),
"Y_norm_squared": sqeuclidean_row_norms(Y, num_threads=2),
}
with warnings.catch_warnings():
warnings.simplefilter("error", category=UserWarning)
ArgKmin.compute(X=X, Y=Y, k=k, metric=metric, metric_kwargs=metric_kwargs)
def test_argkmin_classmode_factory_method_wrong_usages():
rng = np.random.RandomState(1)
X = rng.rand(100, 10)
Y = rng.rand(100, 10)
k = 5
metric = "manhattan"
weights = "uniform"
Y_labels = rng.randint(low=0, high=10, size=100)
unique_Y_labels = np.unique(Y_labels)
msg = (
"Only float64 or float32 datasets pairs are supported at this time, "
"got: X.dtype=float32 and Y.dtype=float64"
)
with pytest.raises(ValueError, match=msg):
ArgKminClassMode.compute(
X=X.astype(np.float32),
Y=Y,
k=k,
metric=metric,
weights=weights,
Y_labels=Y_labels,
unique_Y_labels=unique_Y_labels,
)
msg = (
"Only float64 or float32 datasets pairs are supported at this time, "
"got: X.dtype=float64 and Y.dtype=int32"
)
with pytest.raises(ValueError, match=msg):
ArgKminClassMode.compute(
X=X,
Y=Y.astype(np.int32),
k=k,
metric=metric,
weights=weights,
Y_labels=Y_labels,
unique_Y_labels=unique_Y_labels,
)
with pytest.raises(ValueError, match="k == -1, must be >= 1."):
ArgKminClassMode.compute(
X=X,
Y=Y,
k=-1,
metric=metric,
weights=weights,
Y_labels=Y_labels,
unique_Y_labels=unique_Y_labels,
)
with pytest.raises(ValueError, match="k == 0, must be >= 1."):
ArgKminClassMode.compute(
X=X,
Y=Y,
k=0,
metric=metric,
weights=weights,
Y_labels=Y_labels,
unique_Y_labels=unique_Y_labels,
)
with pytest.raises(ValueError, match="Unrecognized metric"):
ArgKminClassMode.compute(
X=X,
Y=Y,
k=k,
metric="wrong metric",
weights=weights,
Y_labels=Y_labels,
unique_Y_labels=unique_Y_labels,
)
with pytest.raises(
ValueError, match=r"Buffer has wrong number of dimensions \(expected 2, got 1\)"
):
ArgKminClassMode.compute(
X=np.array([1.0, 2.0]),
Y=Y,
k=k,
metric=metric,
weights=weights,
Y_labels=Y_labels,
unique_Y_labels=unique_Y_labels,
)
with pytest.raises(ValueError, match="ndarray is not C-contiguous"):
ArgKminClassMode.compute(
X=np.asfortranarray(X),
Y=Y,
k=k,
metric=metric,
weights=weights,
Y_labels=Y_labels,
unique_Y_labels=unique_Y_labels,
)
non_existent_weights_strategy = "non_existent_weights_strategy"
message = (
"Only the 'uniform' or 'distance' weights options are supported at this time. "
f"Got: weights='{non_existent_weights_strategy}'."
)
with pytest.raises(ValueError, match=message):
ArgKminClassMode.compute(
X=X,
Y=Y,
k=k,
metric=metric,
weights=non_existent_weights_strategy,
Y_labels=Y_labels,
unique_Y_labels=unique_Y_labels,
)
# TODO: introduce assertions on UserWarnings once the Euclidean specialisation
# of ArgKminClassMode is supported.
def test_radius_neighbors_factory_method_wrong_usages():
rng = np.random.RandomState(1)
X = rng.rand(100, 10)
Y = rng.rand(100, 10)
radius = 5
metric = "euclidean"
msg = (
"Only float64 or float32 datasets pairs are supported at this time, "
"got: X.dtype=float32 and Y.dtype=float64"
)
with pytest.raises(
ValueError,
match=msg,
):
RadiusNeighbors.compute(
X=X.astype(np.float32), Y=Y, radius=radius, metric=metric
)
msg = (
"Only float64 or float32 datasets pairs are supported at this time, "
"got: X.dtype=float64 and Y.dtype=int32"
)
with pytest.raises(
ValueError,
match=msg,
):
RadiusNeighbors.compute(X=X, Y=Y.astype(np.int32), radius=radius, metric=metric)
with pytest.raises(ValueError, match="radius == -1.0, must be >= 0."):
RadiusNeighbors.compute(X=X, Y=Y, radius=-1, metric=metric)
with pytest.raises(ValueError, match="Unrecognized metric"):
RadiusNeighbors.compute(X=X, Y=Y, radius=radius, metric="wrong metric")
with pytest.raises(
ValueError, match=r"Buffer has wrong number of dimensions \(expected 2, got 1\)"
):
RadiusNeighbors.compute(
X=np.array([1.0, 2.0]), Y=Y, radius=radius, metric=metric
)
with pytest.raises(ValueError, match="ndarray is not C-contiguous"):
RadiusNeighbors.compute(
X=np.asfortranarray(X), Y=Y, radius=radius, metric=metric
)
unused_metric_kwargs = {"p": 3}
# A UserWarning must be raised in this case.
message = r"Some metric_kwargs have been passed \({'p': 3}\) but"
with pytest.warns(UserWarning, match=message):
RadiusNeighbors.compute(
X=X, Y=Y, radius=radius, metric=metric, metric_kwargs=unused_metric_kwargs
)
# A UserWarning must be raised in this case.
metric_kwargs = {
"p": 3, # unused
"Y_norm_squared": sqeuclidean_row_norms(Y, num_threads=2),
}
message = r"Some metric_kwargs have been passed \({'p': 3, 'Y_norm_squared'"
with pytest.warns(UserWarning, match=message):
RadiusNeighbors.compute(
X=X, Y=Y, radius=radius, metric=metric, metric_kwargs=metric_kwargs
)
# No user warning must be raised in this case.
metric_kwargs = {
"X_norm_squared": sqeuclidean_row_norms(X, num_threads=2),
"Y_norm_squared": sqeuclidean_row_norms(Y, num_threads=2),
}
with warnings.catch_warnings():
warnings.simplefilter("error", category=UserWarning)
RadiusNeighbors.compute(
X=X, Y=Y, radius=radius, metric=metric, metric_kwargs=metric_kwargs
)
# No user warning must be raised in this case.
metric_kwargs = {
"X_norm_squared": sqeuclidean_row_norms(X, num_threads=2),
}
with warnings.catch_warnings():
warnings.simplefilter("error", category=UserWarning)
RadiusNeighbors.compute(
X=X, Y=Y, radius=radius, metric=metric, metric_kwargs=metric_kwargs
)
def test_radius_neighbors_classmode_factory_method_wrong_usages():
rng = np.random.RandomState(1)
X = rng.rand(100, 10)
Y = rng.rand(100, 10)
radius = 5
metric = "manhattan"
weights = "uniform"
Y_labels = rng.randint(low=0, high=10, size=100)
unique_Y_labels = np.unique(Y_labels)
msg = (
"Only float64 or float32 datasets pairs are supported at this time, "
"got: X.dtype=float32 and Y.dtype=float64"
)
with pytest.raises(ValueError, match=msg):
RadiusNeighborsClassMode.compute(
X=X.astype(np.float32),
Y=Y,
radius=radius,
metric=metric,
weights=weights,
Y_labels=Y_labels,
unique_Y_labels=unique_Y_labels,
outlier_label=None,
)
msg = (
"Only float64 or float32 datasets pairs are supported at this time, "
"got: X.dtype=float64 and Y.dtype=int32"
)
with pytest.raises(ValueError, match=msg):
RadiusNeighborsClassMode.compute(
X=X,
Y=Y.astype(np.int32),
radius=radius,
metric=metric,
weights=weights,
Y_labels=Y_labels,
unique_Y_labels=unique_Y_labels,
outlier_label=None,
)
with pytest.raises(ValueError, match="radius == -1.0, must be >= 0."):
RadiusNeighborsClassMode.compute(
X=X,
Y=Y,
radius=-1,
metric=metric,
weights=weights,
Y_labels=Y_labels,
unique_Y_labels=unique_Y_labels,
outlier_label=None,
)
with pytest.raises(ValueError, match="Unrecognized metric"):
RadiusNeighborsClassMode.compute(
X=X,
Y=Y,
radius=-1,
metric="wrong_metric",
weights=weights,
Y_labels=Y_labels,
unique_Y_labels=unique_Y_labels,
outlier_label=None,
)
with pytest.raises(
ValueError, match=r"Buffer has wrong number of dimensions \(expected 2, got 1\)"
):
RadiusNeighborsClassMode.compute(
X=np.array([1.0, 2.0]),
Y=Y,
radius=radius,
metric=metric,
weights=weights,
Y_labels=Y_labels,
unique_Y_labels=unique_Y_labels,
outlier_label=None,
)
with pytest.raises(ValueError, match="ndarray is not C-contiguous"):
RadiusNeighborsClassMode.compute(
X=np.asfortranarray(X),
Y=Y,
radius=radius,
metric=metric,
weights=weights,
Y_labels=Y_labels,
unique_Y_labels=unique_Y_labels,
outlier_label=None,
)
non_existent_weights_strategy = "non_existent_weights_strategy"
msg = (
"Only the 'uniform' or 'distance' weights options are supported at this time. "
f"Got: weights='{non_existent_weights_strategy}'."
)
with pytest.raises(ValueError, match=msg):
RadiusNeighborsClassMode.compute(
X=X,
Y=Y,
radius=radius,
metric="wrong_metric",
weights=non_existent_weights_strategy,
Y_labels=Y_labels,
unique_Y_labels=unique_Y_labels,
outlier_label=None,
)
@pytest.mark.parametrize("Dispatcher", [ArgKmin, RadiusNeighbors])
@pytest.mark.parametrize("dtype", [np.float64, np.float32])
def test_chunk_size_agnosticism(
global_random_seed,
Dispatcher,
dtype,
n_features=100,
):
"""Check that results do not depend on the chunk size."""
rng = np.random.RandomState(global_random_seed)
spread = 100
n_samples_X, n_samples_Y = rng.choice([97, 100, 101, 500], size=2, replace=False)
X = rng.rand(n_samples_X, n_features).astype(dtype) * spread
Y = rng.rand(n_samples_Y, n_features).astype(dtype) * spread
if Dispatcher is ArgKmin:
parameter = 10
check_parameters = {}
compute_parameters = {}
else:
radius = _non_trivial_radius(X=X, Y=Y, metric="euclidean")
parameter = radius
check_parameters = {"radius": radius}
compute_parameters = {"sort_results": True}
ref_dist, ref_indices = Dispatcher.compute(
X,
Y,
parameter,
chunk_size=256, # default
metric="manhattan",
return_distance=True,
**compute_parameters,
)
dist, indices = Dispatcher.compute(
X,
Y,
parameter,
chunk_size=41,
metric="manhattan",
return_distance=True,
**compute_parameters,
)
ASSERT_RESULT[(Dispatcher, dtype)](
ref_dist, dist, ref_indices, indices, **check_parameters
)
@pytest.mark.parametrize("Dispatcher", [ArgKmin, RadiusNeighbors])
@pytest.mark.parametrize("dtype", [np.float64, np.float32])
def test_n_threads_agnosticism(
global_random_seed,
Dispatcher,
dtype,
n_features=100,
):
"""Check that results do not depend on the number of threads."""
rng = np.random.RandomState(global_random_seed)
n_samples_X, n_samples_Y = rng.choice([97, 100, 101, 500], size=2, replace=False)
spread = 100
X = rng.rand(n_samples_X, n_features).astype(dtype) * spread
Y = rng.rand(n_samples_Y, n_features).astype(dtype) * spread
if Dispatcher is ArgKmin:
parameter = 10
check_parameters = {}
compute_parameters = {}
else:
radius = _non_trivial_radius(X=X, Y=Y, metric="euclidean")
parameter = radius
check_parameters = {"radius": radius}
compute_parameters = {"sort_results": True}
ref_dist, ref_indices = Dispatcher.compute(
X,
Y,
parameter,
chunk_size=25, # make sure we use multiple threads
return_distance=True,
**compute_parameters,
)
with _threadpool_controller.limit(limits=1, user_api="openmp"):
dist, indices = Dispatcher.compute(
X,
Y,
parameter,
chunk_size=25,
return_distance=True,
**compute_parameters,
)
ASSERT_RESULT[(Dispatcher, dtype)](
ref_dist, dist, ref_indices, indices, **check_parameters
)
@pytest.mark.parametrize(
"Dispatcher, dtype",
[
(ArgKmin, np.float64),
(RadiusNeighbors, np.float32),
(ArgKmin, np.float32),
(RadiusNeighbors, np.float64),
],
)
@pytest.mark.parametrize("csr_container", CSR_CONTAINERS)
def test_format_agnosticism(
global_random_seed,
Dispatcher,
dtype,
csr_container,
):
"""Check that results do not depend on the format (dense, sparse) of the input."""
rng = np.random.RandomState(global_random_seed)
spread = 100
n_samples, n_features = 100, 100
X = rng.rand(n_samples, n_features).astype(dtype) * spread
Y = rng.rand(n_samples, n_features).astype(dtype) * spread
X_csr = csr_container(X)
Y_csr = csr_container(Y)
if Dispatcher is ArgKmin:
parameter = 10
check_parameters = {}
compute_parameters = {}
else:
        # Adjusting the radius to ensure that the expected result set is neither
        # trivially empty nor too large.
radius = _non_trivial_radius(X=X, Y=Y, metric="euclidean")
parameter = radius
check_parameters = {"radius": radius}
compute_parameters = {"sort_results": True}
dist_dense, indices_dense = Dispatcher.compute(
X,
Y,
parameter,
chunk_size=50,
return_distance=True,
**compute_parameters,
)
for _X, _Y in itertools.product((X, X_csr), (Y, Y_csr)):
if _X is X and _Y is Y:
continue
dist, indices = Dispatcher.compute(
_X,
_Y,
parameter,
chunk_size=50,
return_distance=True,
**compute_parameters,
)
ASSERT_RESULT[(Dispatcher, dtype)](
dist_dense,
dist,
indices_dense,
indices,
**check_parameters,
)
@pytest.mark.parametrize("Dispatcher", [ArgKmin, RadiusNeighbors])
def test_strategies_consistency(
global_random_seed,
global_dtype,
Dispatcher,
n_features=10,
):
"""Check that the results do not depend on the strategy used."""
rng = np.random.RandomState(global_random_seed)
metric = rng.choice(
np.array(
[
"euclidean",
"minkowski",
"manhattan",
"haversine",
],
dtype=object,
)
)
n_samples_X, n_samples_Y = rng.choice([97, 100, 101, 500], size=2, replace=False)
spread = 100
X = rng.rand(n_samples_X, n_features).astype(global_dtype) * spread
Y = rng.rand(n_samples_Y, n_features).astype(global_dtype) * spread
# Haversine distance only accepts 2D data
if metric == "haversine":
X = np.ascontiguousarray(X[:, :2])
Y = np.ascontiguousarray(Y[:, :2])
if Dispatcher is ArgKmin:
parameter = 10
check_parameters = {}
compute_parameters = {}
else:
radius = _non_trivial_radius(X=X, Y=Y, metric=metric)
parameter = radius
check_parameters = {"radius": radius}
compute_parameters = {"sort_results": True}
dist_par_X, indices_par_X = Dispatcher.compute(
X,
Y,
parameter,
metric=metric,
        # Taking the first set of metric kwargs
metric_kwargs=_get_metric_params_list(
metric, n_features, seed=global_random_seed
)[0],
# To be sure to use parallelization
chunk_size=n_samples_X // 4,
strategy="parallel_on_X",
return_distance=True,
**compute_parameters,
)
dist_par_Y, indices_par_Y = Dispatcher.compute(
X,
Y,
parameter,
metric=metric,
        # Taking the first set of metric kwargs
metric_kwargs=_get_metric_params_list(
metric, n_features, seed=global_random_seed
)[0],
# To be sure to use parallelization
chunk_size=n_samples_Y // 4,
strategy="parallel_on_Y",
return_distance=True,
**compute_parameters,
)
ASSERT_RESULT[(Dispatcher, global_dtype)](
dist_par_X, dist_par_Y, indices_par_X, indices_par_Y, **check_parameters
)
# "Concrete Dispatchers"-specific tests
@pytest.mark.parametrize("metric", CDIST_PAIRWISE_DISTANCES_REDUCTION_COMMON_METRICS)
@pytest.mark.parametrize("strategy", ("parallel_on_X", "parallel_on_Y"))
@pytest.mark.parametrize("dtype", [np.float64, np.float32])
@pytest.mark.parametrize("csr_container", CSR_CONTAINERS)
def test_pairwise_distances_argkmin(
global_random_seed,
metric,
strategy,
dtype,
csr_container,
n_queries=5,
n_samples=100,
k=10,
):
rng = np.random.RandomState(global_random_seed)
n_features = rng.choice([50, 500])
translation = rng.choice([0, 1e6])
spread = 1000
X = translation + rng.rand(n_queries, n_features).astype(dtype) * spread
Y = translation + rng.rand(n_samples, n_features).astype(dtype) * spread
X_csr = csr_container(X)
Y_csr = csr_container(Y)
# Haversine distance only accepts 2D data
if metric == "haversine":
X = np.ascontiguousarray(X[:, :2])
Y = np.ascontiguousarray(Y[:, :2])
metric_kwargs = _get_metric_params_list(metric, n_features)[0]
# Reference for argkmin results
if metric == "euclidean":
# Compare to scikit-learn GEMM optimized implementation
dist_matrix = euclidean_distances(X, Y)
else:
dist_matrix = cdist(X, Y, metric=metric, **metric_kwargs)
# Taking argkmin (indices of the k smallest values)
argkmin_indices_ref = np.argsort(dist_matrix, axis=1)[:, :k]
# Getting the associated distances
argkmin_distances_ref = np.zeros(argkmin_indices_ref.shape, dtype=np.float64)
for row_idx in range(argkmin_indices_ref.shape[0]):
argkmin_distances_ref[row_idx] = dist_matrix[
row_idx, argkmin_indices_ref[row_idx]
]
for _X, _Y in itertools.product((X, X_csr), (Y, Y_csr)):
argkmin_distances, argkmin_indices = ArgKmin.compute(
_X,
_Y,
k,
metric=metric,
metric_kwargs=metric_kwargs,
return_distance=True,
            # So as to have more than one chunk, forcing parallelism.
chunk_size=n_samples // 4,
strategy=strategy,
)
ASSERT_RESULT[(ArgKmin, dtype)](
argkmin_distances,
argkmin_distances_ref,
argkmin_indices,
argkmin_indices_ref,
)
@pytest.mark.parametrize("metric", CDIST_PAIRWISE_DISTANCES_REDUCTION_COMMON_METRICS)
@pytest.mark.parametrize("strategy", ("parallel_on_X", "parallel_on_Y"))
@pytest.mark.parametrize("dtype", [np.float64, np.float32])
def test_pairwise_distances_radius_neighbors(
global_random_seed,
metric,
strategy,
dtype,
n_queries=5,
n_samples=100,
):
rng = np.random.RandomState(global_random_seed)
n_features = rng.choice([50, 500])
translation = rng.choice([0, 1e6])
spread = 1000
X = translation + rng.rand(n_queries, n_features).astype(dtype) * spread
Y = translation + rng.rand(n_samples, n_features).astype(dtype) * spread
metric_kwargs = _get_metric_params_list(
metric, n_features, seed=global_random_seed
)[0]
    # Reference for radius-neighbors results
if metric == "euclidean":
# Compare to scikit-learn GEMM optimized implementation
dist_matrix = euclidean_distances(X, Y)
else:
dist_matrix = cdist(X, Y, metric=metric, **metric_kwargs)
radius = _non_trivial_radius(precomputed_dists=dist_matrix)
# Getting the neighbors for a given radius
neigh_indices_ref = []
neigh_distances_ref = []
for row in dist_matrix:
ind = np.arange(row.shape[0])[row <= radius]
dist = row[ind]
sort = np.argsort(dist)
ind, dist = ind[sort], dist[sort]
neigh_indices_ref.append(ind)
neigh_distances_ref.append(dist)
neigh_distances, neigh_indices = RadiusNeighbors.compute(
X,
Y,
radius,
metric=metric,
metric_kwargs=metric_kwargs,
return_distance=True,
        # So as to have more than one chunk, forcing parallelism.
chunk_size=n_samples // 4,
strategy=strategy,
sort_results=True,
)
ASSERT_RESULT[(RadiusNeighbors, dtype)](
neigh_distances, neigh_distances_ref, neigh_indices, neigh_indices_ref, radius
)
@pytest.mark.parametrize("Dispatcher", [ArgKmin, RadiusNeighbors])
@pytest.mark.parametrize("metric", ["manhattan", "euclidean"])
@pytest.mark.parametrize("dtype", [np.float64, np.float32])
def test_memmap_backed_data(
metric,
Dispatcher,
dtype,
):
"""Check that the results do not depend on the datasets writability."""
rng = np.random.RandomState(0)
spread = 100
n_samples, n_features = 128, 10
X = rng.rand(n_samples, n_features).astype(dtype) * spread
Y = rng.rand(n_samples, n_features).astype(dtype) * spread
    # Create read-only datasets
X_mm, Y_mm = create_memmap_backed_data([X, Y])
if Dispatcher is ArgKmin:
parameter = 10
check_parameters = {}
compute_parameters = {}
else:
        # Scaling the radius slightly with the number of dimensions
radius = 10 ** np.log(n_features)
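        # For example, with n_features=10 this gives a radius of
        # 10 ** log(10) ~= 200.7 (np.log is the natural logarithm).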
parameter = radius
check_parameters = {"radius": radius}
compute_parameters = {"sort_results": True}
ref_dist, ref_indices = Dispatcher.compute(
X,
Y,
parameter,
metric=metric,
return_distance=True,
**compute_parameters,
)
dist_mm, indices_mm = Dispatcher.compute(
X_mm,
Y_mm,
parameter,
metric=metric,
return_distance=True,
**compute_parameters,
)
ASSERT_RESULT[(Dispatcher, dtype)](
ref_dist, dist_mm, ref_indices, indices_mm, **check_parameters
)
@pytest.mark.parametrize("dtype", [np.float64, np.float32])
@pytest.mark.parametrize("csr_container", CSR_CONTAINERS)
def test_sqeuclidean_row_norms(
global_random_seed,
dtype,
csr_container,
):
rng = np.random.RandomState(global_random_seed)
spread = 100
n_samples = rng.choice([97, 100, 101, 1000])
n_features = rng.choice([5, 10, 100])
num_threads = rng.choice([1, 2, 8])
X = rng.rand(n_samples, n_features).astype(dtype) * spread
X_csr = csr_container(X)
sq_row_norm_reference = np.linalg.norm(X, axis=1) ** 2
sq_row_norm = sqeuclidean_row_norms(X, num_threads=num_threads)
sq_row_norm_csr = sqeuclidean_row_norms(X_csr, num_threads=num_threads)
assert_allclose(sq_row_norm_reference, sq_row_norm)
assert_allclose(sq_row_norm_reference, sq_row_norm_csr)
with pytest.raises(ValueError):
X = np.asfortranarray(X)
sqeuclidean_row_norms(X, num_threads=num_threads)
def test_argkmin_classmode_strategy_consistent():
rng = np.random.RandomState(1)
X = rng.rand(100, 10)
Y = rng.rand(100, 10)
k = 5
metric = "manhattan"
weights = "uniform"
Y_labels = rng.randint(low=0, high=10, size=100)
unique_Y_labels = np.unique(Y_labels)
results_X = ArgKminClassMode.compute(
X=X,
Y=Y,
k=k,
metric=metric,
weights=weights,
Y_labels=Y_labels,
unique_Y_labels=unique_Y_labels,
strategy="parallel_on_X",
)
results_Y = ArgKminClassMode.compute(
X=X,
Y=Y,
k=k,
metric=metric,
weights=weights,
Y_labels=Y_labels,
unique_Y_labels=unique_Y_labels,
strategy="parallel_on_Y",
)
assert_array_equal(results_X, results_Y)
@pytest.mark.parametrize("outlier_label", [None, 0, 3, 6, 9])
def test_radius_neighbors_classmode_strategy_consistent(outlier_label):
rng = np.random.RandomState(1)
X = rng.rand(100, 10)
Y = rng.rand(100, 10)
radius = 5
metric = "manhattan"
weights = "uniform"
Y_labels = rng.randint(low=0, high=10, size=100)
unique_Y_labels = np.unique(Y_labels)
results_X = RadiusNeighborsClassMode.compute(
X=X,
Y=Y,
radius=radius,
metric=metric,
weights=weights,
Y_labels=Y_labels,
unique_Y_labels=unique_Y_labels,
outlier_label=outlier_label,
strategy="parallel_on_X",
)
results_Y = RadiusNeighborsClassMode.compute(
X=X,
Y=Y,
radius=radius,
metric=metric,
weights=weights,
Y_labels=Y_labels,
unique_Y_labels=unique_Y_labels,
outlier_label=outlier_label,
strategy="parallel_on_Y",
)
assert_allclose(results_X, results_Y)